From af01903012c701583f73f5512855c0a27774f8a4 Mon Sep 17 00:00:00 2001 From: umbrae Date: Sat, 7 Mar 2009 14:46:01 +0000 Subject: Kill extra break tags in the filtered HTML. #8 git-svn-id: http://arc90labs-readability.googlecode.com/svn/trunk@13 d4e419ec-0920-11de-bbfd-a7c1bc4c261e --- js/readability-0.1.js | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/js/readability-0.1.js b/js/readability-0.1.js index 6c255e6..aade9b2 100755 --- a/js/readability-0.1.js +++ b/js/readability-0.1.js @@ -81,15 +81,6 @@ function grabArticle() { /* Add points for any commas within this paragraph */ parentNode.readability.contentScore += getCharCount(allParagraphs[j]); - - /* The old way of determining fitness: - var tempParas = allParagraphs[j].parentNode.getElementsByTagName("p"); - - if ( tempParas.length > topDivCount && getCharCount(allParagraphs[j].parentNode) >= tempParas.length ) { - topDivCount = tempParas.length; - topDiv = allParagraphs[j].parentNode; - } - */ } /* Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 */ @@ -122,7 +113,8 @@ function grabArticle() { cleanStyles(topDiv); // Removes all style attributes topDiv = killDivs(topDiv); // Goes in and removes DIV's that have more non

<p> stuff than <p>

stuff - + topDiv = killBreaks(topDiv); // Removes any consecutive
<br />'s into just one <br />
+ // Cleans out junk from the topDiv just in case: topDiv = clean(topDiv, "form"); topDiv = clean(topDiv, "object"); @@ -174,8 +166,9 @@ function killDivs ( e ) { var divsList = e.getElementsByTagName( "div" ); var curDivLength = divsList.length; - // Gather counts for other typical elements embedded within : - for (var i=0; i < curDivLength; i ++) { + // Gather counts for other typical elements embedded within. + // Traverse backwards so we can remove nodes at the same time without effecting the traversal. + for (var i=curDivLength-1; i >= 0; i--) { var p = divsList[i].getElementsByTagName("p").length; var img = divsList[i].getElementsByTagName("img").length; var li = divsList[i].getElementsByTagName("li").length; @@ -187,13 +180,18 @@ function killDivs ( e ) { // And the number of non-paragraph elements is more than paragraphs // or other ominous signs : if ( img > p || li > p || a > p || p == 0 || embed > 0) { - divsList[i].style.display = "none"; + divsList[i].parentNode.removeChild(divsList[i]); } } } return e; } +function killBreaks ( e ) { + e.innerHTML = e.innerHTML.replace(/(\s*){1,}/,'
<br />'); + return e; +} + function clean(e, tags, minWords) { var targetList = e.getElementsByTagName( tags ); minWords = minWords || 1000000; -- cgit v1.2.3