Kill extra break tags in the filtered HTML. #8

git-svn-id: http://arc90labs-readability.googlecode.com/svn/trunk@13 d4e419ec-0920-11de-bbfd-a7c1bc4c261e
author: umbrae <umbrae@d4e419ec-0920-11de-bbfd-a7c1bc4c261e> 2009-03-07 14:46:01 +0000
committer: umbrae <umbrae@d4e419ec-0920-11de-bbfd-a7c1bc4c261e> 2009-03-07 14:46:01 +0000
commit: af01903012c701583f73f5512855c0a27774f8a4 (patch)
tree: 01123a63f52f273ae391d39df213395ba9818ecb
parent: 999fe5ef1559a19a0ef21dfc34729581196e3ebc (diff)
download: readability-simple-af01903012c701583f73f5512855c0a27774f8a4.tar.bz2
readability-simple-af01903012c701583f73f5512855c0a27774f8a4.zip
1 files changed, 11 insertions, 13 deletions
diff --git a/js/readability-0.1.js b/js/readability-0.1.js
index 6c255e6..aade9b2 100755
--- a/js/readability-0.1.js
+++ b/js/readability-0.1.js
@@ -81,15 +81,6 @@ function grabArticle() {
 
 		/* Add points for any commas within this paragraph */
 		parentNode.readability.contentScore += getCharCount(allParagraphs[j]);
-		
-		/* The old way of determining fitness:
-		var tempParas = allParagraphs[j].parentNode.getElementsByTagName("p");
-	
-		if ( tempParas.length > topDivCount && getCharCount(allParagraphs[j].parentNode) >= tempParas.length ) {
-			topDivCount = tempParas.length;
-			topDiv = allParagraphs[j].parentNode;
-		}
-		*/
 	}
 
 	/* Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 */
@@ -122,7 +113,8 @@ function grabArticle() {
 
 	cleanStyles(topDiv);					// Removes all style attributes
 	topDiv = killDivs(topDiv);				// Goes in and removes DIV's that have more non <p> stuff than <p> stuff
-	
+	topDiv = killBreaks(topDiv);            // Removes any consecutive <br />'s into just one <br /> 
+
 	// Cleans out junk from the topDiv just in case:
 	topDiv = clean(topDiv, "form");
 	topDiv = clean(topDiv, "object");
@@ -174,8 +166,9 @@ function killDivs ( e ) {
 	var divsList = e.getElementsByTagName( "div" );
 	var curDivLength = divsList.length;
 	
-	// Gather counts for other typical elements embedded within :
-	for (var i=0; i < curDivLength; i ++) {
+	// Gather counts for other typical elements embedded within.
+	// Traverse backwards so we can remove nodes at the same time without affecting the traversal.
+	for (var i=curDivLength-1; i >= 0; i--) {
 		var p = divsList[i].getElementsByTagName("p").length;
 		var img = divsList[i].getElementsByTagName("img").length;
 		var li = divsList[i].getElementsByTagName("li").length;
@@ -187,13 +180,18 @@ function killDivs ( e ) {
 			// And the number of non-paragraph elements is more than paragraphs 
 			// or other ominous signs :
 			if ( img > p || li > p || a > p || p == 0 || embed > 0) {
-				divsList[i].style.display = "none";
+				divsList[i].parentNode.removeChild(divsList[i]);
 			}
 		}
 	}
 	return e;
 }
 
+function killBreaks ( e ) { // Collapses every run of <br> tags in e.innerHTML into a single <br />; returns e.
+	e.innerHTML = e.innerHTML.replace(/(<br\s*\/?>\s*)+/gi,'<br />'); // g: every run, not just the first; i: also matches <BR>
+	return e;
+}
+
 function clean(e, tags, minWords) {
 	var targetList = e.getElementsByTagName( tags );
 	minWords = minWords || 1000000;
author	umbrae <umbrae@d4e419ec-0920-11de-bbfd-a7c1bc4c261e>	2009-03-07 14:46:01 +0000
committer	umbrae <umbrae@d4e419ec-0920-11de-bbfd-a7c1bc4c261e>	2009-03-07 14:46:01 +0000
commit	af01903012c701583f73f5512855c0a27774f8a4 (patch)
tree	01123a63f52f273ae391d39df213395ba9818ecb
parent	999fe5ef1559a19a0ef21dfc34729581196e3ebc (diff)
download	readability-simple-af01903012c701583f73f5512855c0a27774f8a4.tar.bz2 readability-simple-af01903012c701583f73f5512855c0a27774f8a4.zip