From e1b7d7c642cc583a47af8738f2a14f87354e911b Mon Sep 17 00:00:00 2001
From: umbrae
Date: Mon, 9 Mar 2009 15:32:02 +0000
Subject: Some minor tweaks to not count empty paragraphs, and to allow tables if they have decent content in them

git-svn-id: http://arc90labs-readability.googlecode.com/svn/trunk@15 d4e419ec-0920-11de-bbfd-a7c1bc4c261e
---
 js/readability-0.1.js | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/js/readability-0.1.js b/js/readability-0.1.js
index 3cbc6bf..7250433 100755
--- a/js/readability-0.1.js
+++ b/js/readability-0.1.js
@@ -67,19 +67,21 @@ function grabArticle() {
             if(parentNode.className.match(/(comment|meta)/))
                 parentNode.readability.contentScore -= 50;
             else if(parentNode.className.match(/(hentry|entry[-]?(content|text|body)|article[-]?(content|text|body))/))
-                parentNode.readability.contentScore += 50;
+                parentNode.readability.contentScore += 25;
 
             // Look for a special ID
-            if(parentNode.className.match(/(comment|meta)/))
+            if(parentNode.id.match(/(comment|meta)/))
                 parentNode.readability.contentScore -= 50;
-            else if(parentNode.className.match(/(hentry|entry[-]?(content|text)|article[-]?(text|content))/))
-                parentNode.readability.contentScore += 50;
+            else if(parentNode.id.match(/(hentry|entry[-]?(content|text)|article[-]?(text|content))/))
+                parentNode.readability.contentScore += 25;
         }
 
         /* Add a point for the paragraph found */
-        parentNode.readability.contentScore++;
+        if(getInnerText(allParagraphs[j]).length > 10)
+            parentNode.readability.contentScore++;
 
         /* Add points for any commas within this paragraph */
+        dbg("Current paragraph has " + getCharCount(allParagraphs[j]) + " commas.");
         parentNode.readability.contentScore += getCharCount(allParagraphs[j]);
     }
 
@@ -91,7 +93,7 @@ function grabArticle() {
             dbg('Found a node with a content score of ' + node.readability.contentScore);
             if(topDiv == null || node.readability.contentScore > topDiv.readability.contentScore)
             {
-                dbg('Found a more fit node. Setting topDiv');
+                dbg('Found a more fit node. Setting topDiv.' + node.className);
                 topDiv = node;
             }
         }
@@ -118,7 +120,7 @@ function grabArticle() {
     // Cleans out junk from the topDiv just in case:
     topDiv = clean(topDiv, "form");
     topDiv = clean(topDiv, "object");
-    topDiv = clean(topDiv, "table");
+    topDiv = clean(topDiv, "table", 250);
     topDiv = clean(topDiv, "h1");
     topDiv = clean(topDiv, "h2");
     topDiv = clean(topDiv, "iframe");
@@ -133,15 +135,19 @@ function grabArticle() {
     return articleContent;
 }
 
+// Get the inner text of a node - cross browser compatibly.
+function getInnerText(e)
+{
+    if (navigator.appName == "Microsoft Internet Explorer")
+        return e.innerText;
+    else
+        return e.textContent;
+}
+
 // Get character count
 function getCharCount ( e,s ) {
     s = s || ",";
-    if (navigator.appName == "Microsoft Internet Explorer") {
-        return parentContent = e.innerText.split(',').length;
-    }
-    else {
-        return parentContent = e.textContent.split(',').length;
-    }
+    return getInnerText(e).split(s).length;
 }
 
 function cleanStyles( e ) {
-- 
cgit v1.2.3