summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: umbrae <umbrae@d4e419ec-0920-11de-bbfd-a7c1bc4c261e> 2009-03-09 15:32:02 +0000
committer: umbrae <umbrae@d4e419ec-0920-11de-bbfd-a7c1bc4c261e> 2009-03-09 15:32:02 +0000
commit: e1b7d7c642cc583a47af8738f2a14f87354e911b (patch)
tree: 96d489dec314e0b8e501ab4f542e77bfc9016faa
parent: 384cb8c5fae2a33b8d621e4846a37e2a23670497 (diff)
download: readability-simple-e1b7d7c642cc583a47af8738f2a14f87354e911b.tar.bz2
download: readability-simple-e1b7d7c642cc583a47af8738f2a14f87354e911b.zip
Some minor tweaks to not count empty paragraphs, and to allow tables if they have decent content in them
git-svn-id: http://arc90labs-readability.googlecode.com/svn/trunk@15 d4e419ec-0920-11de-bbfd-a7c1bc4c261e
-rwxr-xr-x js/readability-0.1.js 32
1 files changed, 19 insertions, 13 deletions
diff --git a/js/readability-0.1.js b/js/readability-0.1.js
index 3cbc6bf..7250433 100755
--- a/js/readability-0.1.js
+++ b/js/readability-0.1.js
@@ -67,19 +67,21 @@ function grabArticle() {
if(parentNode.className.match(/(comment|meta)/))
parentNode.readability.contentScore -= 50;
else if(parentNode.className.match(/(hentry|entry[-]?(content|text|body)|article[-]?(content|text|body))/))
- parentNode.readability.contentScore += 50;
+ parentNode.readability.contentScore += 25;
// Look for a special ID
- if(parentNode.className.match(/(comment|meta)/))
+ if(parentNode.id.match(/(comment|meta)/))
parentNode.readability.contentScore -= 50;
- else if(parentNode.className.match(/(hentry|entry[-]?(content|text)|article[-]?(text|content))/))
- parentNode.readability.contentScore += 50;
+ else if(parentNode.id.match(/(hentry|entry[-]?(content|text)|article[-]?(text|content))/))
+ parentNode.readability.contentScore += 25;
}
/* Add a point for the paragraph found */
- parentNode.readability.contentScore++;
+ if(getInnerText(allParagraphs[j]).length > 10)
+ parentNode.readability.contentScore++;
/* Add points for any commas within this paragraph */
+ dbg("Current paragraph has " + getCharCount(allParagraphs[j]) + " commas.");
parentNode.readability.contentScore += getCharCount(allParagraphs[j]);
}
@@ -91,7 +93,7 @@ function grabArticle() {
dbg('Found a node with a content score of ' + node.readability.contentScore);
if(topDiv == null || node.readability.contentScore > topDiv.readability.contentScore)
{
- dbg('Found a more fit node. Setting topDiv');
+ dbg('Found a more fit node. Setting topDiv.' + node.className);
topDiv = node;
}
}
@@ -118,7 +120,7 @@ function grabArticle() {
// Cleans out junk from the topDiv just in case:
topDiv = clean(topDiv, "form");
topDiv = clean(topDiv, "object");
- topDiv = clean(topDiv, "table");
+ topDiv = clean(topDiv, "table", 250);
topDiv = clean(topDiv, "h1");
topDiv = clean(topDiv, "h2");
topDiv = clean(topDiv, "iframe");
@@ -133,15 +135,19 @@ function grabArticle() {
return articleContent;
}
+// Get the inner text of a node - cross browser compatibly.
+function getInnerText(e)
+{
+ if (navigator.appName == "Microsoft Internet Explorer")
+ return e.innerText;
+ else
+ return e.textContent;
+}
+
// Get character count
function getCharCount ( e,s ) {
s = s || ",";
- if (navigator.appName == "Microsoft Internet Explorer") {
- return parentContent = e.innerText.split(',').length;
- }
- else {
- return parentContent = e.textContent.split(',').length;
- }
+ return getInnerText(e).split(s).length;
}
function cleanStyles( e ) {