From 6aee20bdf2b9fa23320e900ea2f2ea8bd5ab94e5 Mon Sep 17 00:00:00 2001 From: umbrae Date: Sat, 7 Mar 2009 01:27:26 +0000 Subject: Changing the fitness algorithm a bit to be a little smarter - taking into account classnames and ids, and a bit more from commas and paragraphs. Also takes into account the hAtom microformat for fitness. (Issue 1) The following sites, which were said to not work, should now work. I don't want to get into the habit of fixing things just for small sites, but it's a good bellwether of how readability works in general. http://www.paulgraham.com/13sentences.html (Issue 3) http://news.bbc.co.uk/2/hi/south_asia/7921430.stm (Issue 2) http://www.macalope.com/2009/03/03/pinch-the-macalope/ http://www.macworld.com/article/139208/2009/03/target_disk_mode_to_the_rescue.html http://psychclassics.yorku.ca/Maslow/motivation.htm http://www.politico.com/news/stories/0309/19693.html git-svn-id: http://arc90labs-readability.googlecode.com/svn/trunk@7 d4e419ec-0920-11de-bbfd-a7c1bc4c261e --- js/readability-0.1.js | 67 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 4 deletions(-) diff --git a/js/readability-0.1.js b/js/readability-0.1.js index d1a2ccb..d0e9a56 100755 --- a/js/readability-0.1.js +++ b/js/readability-0.1.js @@ -12,6 +12,13 @@ objinnerDiv.appendChild(grabArticle()); // Get the article and place it inside the inner Div objOverlay.appendChild(objinnerDiv); // Insert the inner div into the overlay + + // For totally hosed HTML, add body node that can't be found because of bad HTML or something. + if(document.body == null) + { + body = document.createElement("body"); + document.body = body; + } // This removes everything else on the page. Requires a page refresh to undo it. // I tried the damn overlay on top - but had rendering issues: @@ -21,32 +28,84 @@ document.body.insertBefore(objOverlay, document.body.firstChild); })() +/* Remove this and any dbg calls before release to bring down file size. 
*/ +function dbg(text) +{ + if(typeof console != 'undefined') + console.log(text); +} + function grabArticle() { var allParagraphs = document.getElementsByTagName("p"); var topDivCount = 0; - var topDiv; + var topDiv = null; var topDivParas; var articleContent = document.createElement("DIV"); var articleTitle = document.createElement("H1"); var articleFooter = document.createElement("DIV"); - // Replace all doubled-up
tags with

tags : + // Replace all doubled-up
tags with

tags, and remove fonts. var pattern = new RegExp ("
[ \r\n\s]*
", "g"); - document.body.innerHTML = document.body.innerHTML.replace(pattern, "

"); + document.body.innerHTML = document.body.innerHTML.replace(pattern, "

").replace(/<\/?font[^>]*>/, ''); // Grab the title from the tag and inject it as the title. articleTitle.innerHTML = document.title; articleContent.appendChild(articleTitle); - // Study all the paragraphs and find the chunk that has the most <p>'s and keep it: + // Study all the paragraphs and find the chunk that has the best score. + // A score is determined by things like: Number of <p>'s, commas, special classes, etc. for (var j=0; j < allParagraphs.length; j++) { + parentNode = allParagraphs[j].parentNode; + + /* Initialize readability data */ + if(typeof parentNode.readability == 'undefined') + { + parentNode.readability = {"contentScore": 0}; + + // Look for a special classname + if(parentNode.className.match(/(comment|meta)/)) + parentNode.readability.contentScore -= 50; + else if(parentNode.className.match(/(hentry|entry[-]?(content|text|body)|article[-]?(content|text|body))/)) + parentNode.readability.contentScore += 50; + + // Look for a special ID + if(parentNode.className.match(/(comment|meta)/)) + parentNode.readability.contentScore -= 50; + else if(parentNode.className.match(/(hentry|entry[-]?(content|text)|article[-]?(text|content))/)) + parentNode.readability.contentScore += 50; + } + + /* Add a point for the paragraph found */ + parentNode.readability.contentScore++; + + /* Add points for any commas within this paragraph */ + parentNode.readability.contentScore += getCharCount(allParagraphs[j]); + + /* The old way of determining fitness: var tempParas = allParagraphs[j].parentNode.getElementsByTagName("p"); if ( tempParas.length > topDivCount && getCharCount(allParagraphs[j].parentNode) >= tempParas.length ) { topDivCount = tempParas.length; topDiv = allParagraphs[j].parentNode; } + */ + } + + allNodes = document.getElementsByTagName("*"); + for(nodeIndex = 0; nodeIndex < allNodes.length; nodeIndex++) + { + node = allNodes[nodeIndex]; + + if(typeof node.readability != 'undefined') + { + dbg('Found a node with a content score of ' + 
node.readability.contentScore); + if(topDiv == null || node.readability.contentScore > topDiv.readability.contentScore) + { + dbg('Found a more fit node. Setting topDiv'); + topDiv = node; + } + } } // REMOVES ALL STYLESHEETS ... -- cgit v1.2.3