From 94a1e8525e666ff5685b58795386da72b8e873d2 Mon Sep 17 00:00:00 2001 From: JJfutbol Date: Sat, 11 Jul 2009 02:13:40 +0000 Subject: worked on the algorithm for removeNonContentElement, considering another way to determine whether to remove element git-svn-id: http://arc90labs-readability.googlecode.com/svn/trunk@63 d4e419ec-0920-11de-bbfd-a7c1bc4c261e --- js/readability.js | 50 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 10 deletions(-) (limited to 'js') diff --git a/js/readability.js b/js/readability.js index be254df..93a3e80 100755 --- a/js/readability.js +++ b/js/readability.js @@ -82,7 +82,7 @@ function determineContentScore(score, parent, element) // TODO: should set as a global var since badKeywords are used elsewhere var goodKeywords = ["article", "body", "content", "entry", "hentry", "post", "story", "text"]; var semiGoodKeywords = ["area", "container", "inner", "main"]; - var badKeywords = ["ad", "captcha", "classified", "clear", "comment", "footer", "footnote", "leftcolumn", "listing", "menu", "meta", "module", "nav", "navbar", "rightcolumn", "sidebar", "sponsor", "tab", "toolbar", "tools", "trackback", "widget"]; + var badKeywords = ["ad", "captcha", "classified", "comment", "footer", "footnote", "leftcolumn", "listing", "menu", "meta", "module", "nav", "navbar", "rightcolumn", "sidebar", "sponsor", "tab", "toolbar", "tools", "trackback", "widget"]; // we'll be doing a case insensitive compare var className = parent.className.toLowerCase(); @@ -469,18 +469,48 @@ function removeNonContentElement(element, tagName) var a = descendant.getElementsByTagName("a").length; var embed = descendant.getElementsByTagName("embed").length; - var badKeywords = ["ad", "captcha", "classified", "clear", "comment", "footer", "footnote", "leftcolumn", "listing", "menu", "meta", "module", "nav", "navbar", "rightcolumn", "sidebar", "sponsor", "tab", "toolbar", "tools", "trackback", "widget"]; - - // should improve this but for if the element has a single bad keyword remove it - for (var j = 0; j < badKeywords.length; j++) + /* + // no basic elements were found at all + if (a == 0 && embed == 0 & img == 0 && li == 0 && p == 0) { - if (descendant.id.toLowerCase().indexOf(badKeywords[j]) >= 0 || descendant.className.toLowerCase().indexOf(badKeywords[j]) >= 0) + // retrieve all children to see if it contains any elements + var children = descendant.getElementsByTagName("*"); + var containsOnlyText = true; + + for (var j = 0; j < children.length; j++) { - descendant.parentNode.removeChild(descendant); - descendant = null; - break; + var child = children[j]; + + // element type found so we don't have an element (e.g. DIV) with just text + if (child.nodeType == 1) + { + containsOnlyText = false; + break; + } } - } + + // + if (!containsOnlyText) + { + descendant.parentNode.removeChild(descendant); + } + continue; + } + else + {*/ + var badKeywords = ["ad", "captcha", "classified", "clear", "comment", "crumbs", "footer", "footnote", "leftcolumn", "listing", "menu", "meta", "module", "nav", "navbar", "rightcolumn", "sidebar", "sponsor", "tab", "tag", "toolbar", "tools", "trackback", "widget"]; + + // should improve this but for if the element has a single bad keyword remove it + for (var j = 0; j < badKeywords.length; j++) + { + if (descendant.id.toLowerCase().indexOf(badKeywords[j]) >= 0 || descendant.className.toLowerCase().indexOf(badKeywords[j]) >= 0) + { + descendant.parentNode.removeChild(descendant); + descendant = null; + break; + } + } + /*}*/ // found a bad keyword so the element has been removed, continue to the next one if (!descendant) -- cgit v1.2.3