Changing the fitness algorithm a bit to be a little smarter - taking into account classnames and ids, and a bit more from commas and paragraphs.

Also takes into account the hAtom microformat for fitness. (Issue 1) The following sites, which were said to not work, should now work. I don't want to get into the habit of fixing things just for small sites, but it's a good bellwether of how readability works in general. http://www.paulgraham.com/13sentences.html (Issue 3) http://news.bbc.co.uk/2/hi/south_asia/7921430.stm (Issue 2) http://www.macalope.com/2009/03/03/pinch-the-macalope/ http://www.macworld.com/article/139208/2009/03/target_disk_mode_to_the_rescue.html http://psychclassics.yorku.ca/Maslow/motivation.htm http://www.politico.com/news/stories/0309/19693.html git-svn-id: http://arc90labs-readability.googlecode.com/svn/trunk@7 d4e419ec-0920-11de-bbfd-a7c1bc4c261e
author: umbrae <umbrae@d4e419ec-0920-11de-bbfd-a7c1bc4c261e> 2009-03-07 01:27:26 +0000
committer: umbrae <umbrae@d4e419ec-0920-11de-bbfd-a7c1bc4c261e> 2009-03-07 01:27:26 +0000
commit: 6aee20bdf2b9fa23320e900ea2f2ea8bd5ab94e5 (patch)
tree: 9deea89605c758818d6b9ef7cefa285846f788e9 /js
parent: 2d9a06c6b02ef0f9386d43cbdb722ce868cb6205 (diff)
download: readability-simple-6aee20bdf2b9fa23320e900ea2f2ea8bd5ab94e5.tar.bz2
readability-simple-6aee20bdf2b9fa23320e900ea2f2ea8bd5ab94e5.zip
1 files changed, 63 insertions, 4 deletions
diff --git a/js/readability-0.1.js b/js/readability-0.1.js
index d1a2ccb..d0e9a56 100755
--- a/js/readability-0.1.js
+++ b/js/readability-0.1.js
@@ -12,6 +12,13 @@
 	
 	objinnerDiv.appendChild(grabArticle());		// Get the article and place it inside the inner Div
 	objOverlay.appendChild(objinnerDiv);		// Insert the inner div into the overlay
+
+	// For totally hosed HTML, add body node that can't be found because of bad HTML or something.
+	if(document.body == null)
+	{
+		body = document.createElement("body");
+		document.body = body;
+	}
 	
 	// This removes everything else on the page. Requires a page refresh to undo it.
 	// I tried the damn overlay on top - but had rendering issues:
@@ -21,32 +28,84 @@
 	document.body.insertBefore(objOverlay, document.body.firstChild);
 })()
 
+/* Remove this and any dbg calls before release to bring down file size. */
+function dbg(text)
+{
+	if(typeof console != 'undefined')
+		console.log(text);
+}
+
 function grabArticle() {
 	var allParagraphs = document.getElementsByTagName("p");
 	var topDivCount = 0;
-	var topDiv;
+	var topDiv = null;
 	var topDivParas;
 	
 	var articleContent = document.createElement("DIV");
 	var articleTitle = document.createElement("H1");
 	var articleFooter = document.createElement("DIV");
 	
-	// Replace all doubled-up <BR> tags with <P> tags :
+	// Replace all doubled-up <BR> tags with <P> tags, and remove fonts.
 	var pattern =  new RegExp ("<br/?>[ \r\n\s]*<br/?>", "g");
-	document.body.innerHTML = document.body.innerHTML.replace(pattern, "</p><p>");
+	document.body.innerHTML = document.body.innerHTML.replace(pattern, "</p><p>").replace(/<\/?font[^>]*>/, '');
 	
 	// Grab the title from the <title> tag and inject it as the title.
 	articleTitle.innerHTML = document.title;
 	articleContent.appendChild(articleTitle);
 	
-	// Study all the paragraphs and find the chunk that has the most <p>'s and keep it:
+	// Study all the paragraphs and find the chunk that has the best score.
+	// A score is determined by things like: Number of <p>'s, commas, special classes, etc.
 	for (var j=0; j	< allParagraphs.length; j++) {
+		parentNode = allParagraphs[j].parentNode;
+
+		/* Initialize readability data */
+		if(typeof parentNode.readability == 'undefined')
+		{
+			parentNode.readability = {"contentScore": 0};			
+
+			// Look for a special classname
+			if(parentNode.className.match(/(comment|meta)/))
+				parentNode.readability.contentScore -= 50;
+			else if(parentNode.className.match(/(hentry|entry[-]?(content|text|body)|article[-]?(content|text|body))/))
+				parentNode.readability.contentScore += 50;
+
+			// Look for a special ID
+			if(parentNode.className.match(/(comment|meta)/))
+				parentNode.readability.contentScore -= 50;
+			else if(parentNode.className.match(/(hentry|entry[-]?(content|text)|article[-]?(text|content))/))
+				parentNode.readability.contentScore += 50;
+		}
+
+		/* Add a point for the paragraph found */
+		parentNode.readability.contentScore++;
+
+		/* Add points for any commas within this paragraph */
+		parentNode.readability.contentScore += getCharCount(allParagraphs[j]);
+		
+		/* The old way of determining fitness:
 		var tempParas = allParagraphs[j].parentNode.getElementsByTagName("p");
 	
 		if ( tempParas.length > topDivCount && getCharCount(allParagraphs[j].parentNode) >= tempParas.length ) {
 			topDivCount = tempParas.length;
 			topDiv = allParagraphs[j].parentNode;
 		}
+		*/
+	}
+
+	allNodes = document.getElementsByTagName("*");
+	for(nodeIndex = 0; nodeIndex < allNodes.length; nodeIndex++)
+	{
+		node = allNodes[nodeIndex];
+		
+		if(typeof node.readability != 'undefined')
+		{
+			dbg('Found a node with a content score of ' + node.readability.contentScore);
+			if(topDiv == null || node.readability.contentScore > topDiv.readability.contentScore)
+			{
+				dbg('Found a more fit node. Setting topDiv');				
+				topDiv = node;
+			}
+		}
 	}
 	
 	// REMOVES ALL STYLESHEETS ...
author	umbrae <umbrae@d4e419ec-0920-11de-bbfd-a7c1bc4c261e>	2009-03-07 01:27:26 +0000
committer	umbrae <umbrae@d4e419ec-0920-11de-bbfd-a7c1bc4c261e>	2009-03-07 01:27:26 +0000
commit	6aee20bdf2b9fa23320e900ea2f2ea8bd5ab94e5 (patch)
tree	9deea89605c758818d6b9ef7cefa285846f788e9 /js
parent	2d9a06c6b02ef0f9386d43cbdb722ce868cb6205 (diff)
download	readability-simple-6aee20bdf2b9fa23320e900ea2f2ea8bd5ab94e5.tar.bz2 readability-simple-6aee20bdf2b9fa23320e900ea2f2ea8bd5ab94e5.zip