Minor tweaks: only count a paragraph toward the content score when it has more than 10 characters of text (so empty paragraphs are ignored), and allow tables to remain if they have decent content in them

git-svn-id: http://arc90labs-readability.googlecode.com/svn/trunk@15 d4e419ec-0920-11de-bbfd-a7c1bc4c261e
author: umbrae <umbrae@d4e419ec-0920-11de-bbfd-a7c1bc4c261e> 2009-03-09 15:32:02 +0000
committer: umbrae <umbrae@d4e419ec-0920-11de-bbfd-a7c1bc4c261e> 2009-03-09 15:32:02 +0000
commit: e1b7d7c642cc583a47af8738f2a14f87354e911b (patch)
tree: 96d489dec314e0b8e501ab4f542e77bfc9016faa
parent: 384cb8c5fae2a33b8d621e4846a37e2a23670497 (diff)
download: readability-simple-e1b7d7c642cc583a47af8738f2a14f87354e911b.tar.bz2
readability-simple-e1b7d7c642cc583a47af8738f2a14f87354e911b.zip
1 files changed, 19 insertions, 13 deletions
diff --git a/js/readability-0.1.js b/js/readability-0.1.js
index 3cbc6bf..7250433 100755
--- a/js/readability-0.1.js
+++ b/js/readability-0.1.js
@@ -67,19 +67,21 @@ function grabArticle() {
 			if(parentNode.className.match(/(comment|meta)/))
 				parentNode.readability.contentScore -= 50;
 			else if(parentNode.className.match(/(hentry|entry[-]?(content|text|body)|article[-]?(content|text|body))/))
-				parentNode.readability.contentScore += 50;
+				parentNode.readability.contentScore += 25;
 
 			// Look for a special ID
-			if(parentNode.className.match(/(comment|meta)/))
+			if(parentNode.id.match(/(comment|meta)/))
 				parentNode.readability.contentScore -= 50;
-			else if(parentNode.className.match(/(hentry|entry[-]?(content|text)|article[-]?(text|content))/))
-				parentNode.readability.contentScore += 50;
+			else if(parentNode.id.match(/(hentry|entry[-]?(content|text)|article[-]?(text|content))/))
+				parentNode.readability.contentScore += 25;
 		}
 
 		/* Add a point for the paragraph found */
-		parentNode.readability.contentScore++;
+		if(getInnerText(allParagraphs[j]).length > 10)
+			parentNode.readability.contentScore++;
 
 		/* Add points for any commas within this paragraph */
+		dbg("Current paragraph has " + getCharCount(allParagraphs[j]) + " commas.");
 		parentNode.readability.contentScore += getCharCount(allParagraphs[j]);
 	}
 
@@ -91,7 +93,7 @@ function grabArticle() {
 			dbg('Found a node with a content score of ' + node.readability.contentScore);
 			if(topDiv == null || node.readability.contentScore > topDiv.readability.contentScore)
 			{
-				dbg('Found a more fit node. Setting topDiv');				
+				dbg('Found a more fit node. Setting topDiv.' + node.className);				
 				topDiv = node;
 			}
 		}
@@ -118,7 +120,7 @@ function grabArticle() {
 	// Cleans out junk from the topDiv just in case:
 	topDiv = clean(topDiv, "form");
 	topDiv = clean(topDiv, "object");
-	topDiv = clean(topDiv, "table");
+	topDiv = clean(topDiv, "table", 250);
 	topDiv = clean(topDiv, "h1");
 	topDiv = clean(topDiv, "h2");
 	topDiv = clean(topDiv, "iframe");
@@ -133,15 +135,19 @@ function grabArticle() {
 	return articleContent;
 }
 
+// Get the inner text of a node - cross browser compatibly.
+function getInnerText(e)
+{
+	if (navigator.appName == "Microsoft Internet Explorer")
+		return e.innerText;
+	else
+		return e.textContent;
+}
+
 // Get character count
 function getCharCount ( e,s ) {
     s = s || ",";
-	if (navigator.appName == "Microsoft Internet Explorer") {
-		return parentContent = e.innerText.split(',').length;
-	}
-	else {
-		return parentContent = e.textContent.split(',').length;
-	}
+	return getInnerText(e).split(s).length;
 }
 
 function cleanStyles( e ) {
author	umbrae <umbrae@d4e419ec-0920-11de-bbfd-a7c1bc4c261e>	2009-03-09 15:32:02 +0000
committer	umbrae <umbrae@d4e419ec-0920-11de-bbfd-a7c1bc4c261e>	2009-03-09 15:32:02 +0000
commit	e1b7d7c642cc583a47af8738f2a14f87354e911b (patch)
tree	96d489dec314e0b8e501ab4f542e77bfc9016faa
parent	384cb8c5fae2a33b8d621e4846a37e2a23670497 (diff)
download	readability-simple-e1b7d7c642cc583a47af8738f2a14f87354e911b.tar.bz2 readability-simple-e1b7d7c642cc583a47af8738f2a14f87354e911b.zip