diff options
author | JJfutbol <JJfutbol@d4e419ec-0920-11de-bbfd-a7c1bc4c261e> | 2009-07-11 02:13:40 +0000 |
---|---|---|
committer | JJfutbol <JJfutbol@d4e419ec-0920-11de-bbfd-a7c1bc4c261e> | 2009-07-11 02:13:40 +0000 |
commit | 94a1e8525e666ff5685b58795386da72b8e873d2 (patch) | |
tree | b4b472570720962ec86d67d878e5f542a3d9efd7 /js | |
parent | 90cf706688f8899b381e98008327b77a15b754ca (diff) | |
download | readability-simple-94a1e8525e666ff5685b58795386da72b8e873d2.tar.bz2 readability-simple-94a1e8525e666ff5685b58795386da72b8e873d2.zip |
worked on the algorithm for removeNonContentElement, considering another way to determine whether to remove element
git-svn-id: http://arc90labs-readability.googlecode.com/svn/trunk@63 d4e419ec-0920-11de-bbfd-a7c1bc4c261e
Diffstat (limited to 'js')
-rwxr-xr-x | js/readability.js | 50 |
1 files changed, 40 insertions, 10 deletions
diff --git a/js/readability.js b/js/readability.js
index be254df..93a3e80 100755
--- a/js/readability.js
+++ b/js/readability.js
@@ -82,7 +82,7 @@ function determineContentScore(score, parent, element)
// TODO: should set as a global var since badKeywords are used elsewhere
var goodKeywords = ["article", "body", "content", "entry", "hentry", "post", "story", "text"];
var semiGoodKeywords = ["area", "container", "inner", "main"];
- var badKeywords = ["ad", "captcha", "classified", "clear", "comment", "footer", "footnote", "leftcolumn", "listing", "menu", "meta", "module", "nav", "navbar", "rightcolumn", "sidebar", "sponsor", "tab", "toolbar", "tools", "trackback", "widget"];
+ var badKeywords = ["ad", "captcha", "classified", "comment", "footer", "footnote", "leftcolumn", "listing", "menu", "meta", "module", "nav", "navbar", "rightcolumn", "sidebar", "sponsor", "tab", "toolbar", "tools", "trackback", "widget"];
// we'll be doing a case insensitive compare
var className = parent.className.toLowerCase();
@@ -469,18 +469,48 @@ function removeNonContentElement(element, tagName)
var a = descendant.getElementsByTagName("a").length;
var embed = descendant.getElementsByTagName("embed").length;
- var badKeywords = ["ad", "captcha", "classified", "clear", "comment", "footer", "footnote", "leftcolumn", "listing", "menu", "meta", "module", "nav", "navbar", "rightcolumn", "sidebar", "sponsor", "tab", "toolbar", "tools", "trackback", "widget"];
-
- // should improve this but for if the element has a single bad keyword remove it
- for (var j = 0; j < badKeywords.length; j++)
+ /*
+ // no basic elements were found at all
+ if (a == 0 && embed == 0 & img == 0 && li == 0 && p == 0)
{
- if (descendant.id.toLowerCase().indexOf(badKeywords[j]) >= 0 || descendant.className.toLowerCase().indexOf(badKeywords[j]) >= 0)
+ // retrieve all children to see if it contains any elements
+ var children = descendant.getElementsByTagName("*");
+ var containsOnlyText = true;
+
+ for (var j = 0; j < children.length; j++)
{
- descendant.parentNode.removeChild(descendant);
- descendant = null;
- break;
+ var child = children[j];
+
+ // element type found so we don't have an element (e.g. DIV) with just text
+ if (child.nodeType == 1)
+ {
+ containsOnlyText = false;
+ break;
+ }
}
- }
+
+ //
+ if (!containsOnlyText)
+ {
+ descendant.parentNode.removeChild(descendant);
+ }
+ continue;
+ }
+ else
+ {*/
+ var badKeywords = ["ad", "captcha", "classified", "clear", "comment", "crumbs", "footer", "footnote", "leftcolumn", "listing", "menu", "meta", "module", "nav", "navbar", "rightcolumn", "sidebar", "sponsor", "tab", "tag", "toolbar", "tools", "trackback", "widget"];
+
+ // should improve this but for if the element has a single bad keyword remove it
+ for (var j = 0; j < badKeywords.length; j++)
+ {
+ if (descendant.id.toLowerCase().indexOf(badKeywords[j]) >= 0 || descendant.className.toLowerCase().indexOf(badKeywords[j]) >= 0)
+ {
+ descendant.parentNode.removeChild(descendant);
+ descendant = null;
+ break;
+ }
+ }
+ /*}*/
// found a bad keyword so the element has been removed, continue to the next one
if (!descendant)
|