summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author JJfutbol <JJfutbol@d4e419ec-0920-11de-bbfd-a7c1bc4c261e> 2009-07-07 02:02:48 +0000
committer JJfutbol <JJfutbol@d4e419ec-0920-11de-bbfd-a7c1bc4c261e> 2009-07-07 02:02:48 +0000
commit b260a47d808ed044f5d12ec8920e946c581096e9 (patch)
tree e7ef3bb44a441c8f1f97cd120a39056f8c79eb1c
parent 57bf81102a6177794a5aa48dba75070e3f4c3485 (diff)
download readability-simple-b260a47d808ed044f5d12ec8920e946c581096e9.tar.bz2
readability-simple-b260a47d808ed044f5d12ec8920e946c581096e9.zip
- updated cancel email link to use lab.arc90.com domain, not davidh's site
- complete reworking of readability, initial commit of what will be 1.0.0.1 git-svn-id: http://arc90labs-readability.googlecode.com/svn/trunk@58 d4e419ec-0920-11de-bbfd-a7c1bc4c261e
-rwxr-xr-x css/readability.css 162
-rw-r--r-- email.php 12
-rwxr-xr-x js/readability.js 864
3 files changed, 763 insertions, 275 deletions
diff --git a/css/readability.css b/css/readability.css
index 49c4c13..0d3367e 100755
--- a/css/readability.css
+++ b/css/readability.css
@@ -1,105 +1,114 @@
@charset "utf-8";
/* CSS Document */
-
-/*
-#readOverlay {
- background-image: none;
- background: #eee;
- font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;
-
-}
-*/
#readOverlay {
display: block;
+ left: 0;
position: absolute;
top: 0;
- left: 0;
width: 100%;
}
#readInner {
- text-align: left;
line-height: 1.4em;
margin: 1em auto;
max-width: 800px;
+ text-align: left;
+ width: 800px;
}
#readInner a {
color: blue;
text-decoration: underline;
}
+#readInner pre {
+ white-space: pre-wrap;
+}
+#readOverlay #readInner pre.normalPre {
+ font-size: 14px;
+ line-height: 1.4em;
+ overflow: auto;
+ white-space: pre-wrap;
+}
+/* custom - PRE content (will revisit)
+#readInner code, #readInner pre {
+ font-family: "Courier New", Courier, monospace;
+}
+#readInner pre {
+ background-color: #FFF;
+ border: #000 1px inset;
+ font-size: 110%;
+ max-height: 250px;
+ overflow: auto;
+ margin: 0;
+ padding: 6px 0;
+ overflow-x: auto;
+ width: 100%;
+}
+*/
#readInner * {
- margin-bottom: 16px;
- border: none;
background: none;
-}
-#readInner img {
- float: left;
- margin-right: 12px;
- margin-bottom: 12px;
+ border: none;
+ margin-bottom: 16px;
+ text-align: left;
}
#readInner h1 {
- display: block;
- width: 100%;
border-bottom: 1px solid #333;
+ display: block;
font-size: 1.2em;
+ width: 100%;
}
#readInner blockquote {
margin-left: 3em;
margin-right: 3em;
}
#readFooter {
- display: block;
border-top: 1px solid #333;
- text-align: center;
- clear: both;
}
-div.footer-right {
- float: right;
- line-height: 1;
- text-align: right;
- font-size: .75em;
- margin-top: 18px
+#readFooter, #readFooter div, #readFooter a {
+ margin: 0;
+ padding: 0;
+ text-align: center;
}
-span.version {
- display: none;
+#readability-version {
+ font-size: 12px;
+ font-weight: bold;
}
/* Article Tools */
#readTools {
- width: 34px;
height: 150px;
+ left: 10px;
position: fixed;
- z-index: 100;
top: 10px;
- left: 10px;
+ width: 34px;
+ z-index: 100;
}
#readTools a {
- overflow: hidden;
- margin-bottom: 8px;
display: block;
- opacity: .4;
- filter:alpha(opacity=40);
- text-indent: -99999px;
- height: 34px;
+ filter: alpha(opacity=40);
+ height: 34px;
+ margin-bottom: 8px;
+ opacity: .4;
+ overflow: hidden;
+ text-indent: -99999px;
}
-#email-page{
- background: url(http://lab.arc90.com/experiments/readability/images/read-email.png) no-repeat left top;
+#email-page {
+ background: url(http://lab.arc90.com/experiments/readability/images/read-email.png) no-repeat left top;
}
-#reload-page{
- background: url(http://lab.arc90.com/experiments/readability/images/read-refresh.png) no-repeat left top;
+#reload-page {
+ background: url(http://lab.arc90.com/experiments/readability/images/read-refresh.png) no-repeat left top;
}
-#print-page{
- background: url(http://lab.arc90.com/experiments/readability/images/read-print.png) no-repeat left top;
+#print-page {
+ background: url(http://lab.arc90.com/experiments/readability/images/read-print.png) no-repeat left top;
}
-#readTools a:hover{
- opacity: 1;
- filter:alpha(opacity=100);
+#readTools a:hover {
+ filter: alpha(opacity=100);
+ opacity: 1;
}
-/* ---------------- USER-CONFIGURABLE STYLING --------------------- */
+
/* ---------------- USER-CONFIGURABLE STYLING --------------------- */
-/* Size options */
+/* ------ Size Options ------- */
.size-small {
font-size: 12px;
@@ -113,37 +122,39 @@ span.version {
.size-x-large {
font-size: 34px;
}
-/* Style options */
+
+/* ------ Style Options ------- */
.style-novel {
- font-family:"Palatino Linotype", "Book Antiqua", Palatino, serif;
background: #F4F3DB;
color: #222;
+ font-family: "Palatino Linotype", "Book Antiqua", Palatino, serif;
}
.style-ebook {
- font-family:Arial, Helvetica, sans-serif;
- background: #eee;
+ background: #EEE;
color: #333;
+ font-family: Arial, Helvetica, sans-serif;
}
.style-ebook h1 {
font-family: "Arial Black", Gadget, sans-serif;
font-weight: normal;
}
.style-newspaper {
- font-family:"Times New Roman", Times, serif;
background: #FFF;
color: #222;
+ font-family: "Times New Roman", Times, serif;
}
.style-newspaper h1 {
- text-transform:capitalize;
font-family: Georgia, "Times New Roman", Times, serif;
+ text-transform: capitalize;
}
.style-terminal {
- font-family: "Lucida Console", Monaco, monospace;
background: #1D4E2C;
color: #C6FFC6;
+ font-family: "Lucida Console", Monaco, monospace;
}
-/* Margin Options */
+
+/* ------ Margin Options ------- */
.margin-x-wide {
width: 35%;
@@ -157,9 +168,11 @@ span.version {
.margin-narrow {
width: 95%;
}
-/* ---------------- USER-CONFIGURABLE STYLING --------------------- */
-/* ------ DEBUG ------- */
+/* ---------------- END USER-CONFIGURABLE STYLING --------------------- */
+
+
+/* ---------------- DEBUG --------------------- */
.bug-green {
background: #BBF9B0;
@@ -175,18 +188,19 @@ span.version {
background: #BFDFFF;
}
+
/* ---------------- EMAIL POP UP --------------------- */
-#email-container{
- position: fixed;
- top: 60px;
- left: 50%;
- margin: 0 0 0 -240px;
- padding: 0;
- width: 500px;
- height: 490px;
- border: solid 3px #666;
- background-color: #fff;
- z-index: 100 !important;
- overflow: hidden;
-}
+#email-container {
+ background-color: #fff;
+ border: solid 3px #666;
+ height: 490px;
+ left: 50%;
+ margin: 0 0 0 -240px;
+ overflow: hidden;
+ padding: 0;
+ position: fixed;
+ top: 60px;
+ width: 500px;
+ z-index: 100 !important;
+} \ No newline at end of file
diff --git a/email.php b/email.php
index 9f7404a..4fcd045 100644
--- a/email.php
+++ b/email.php
@@ -262,7 +262,7 @@
<script type="text/javascript" charset="utf-8">
window.onload = function(){
document.getElementById('cancel-email').onclick = function(){
- window.location = 'http://davehauenstein.com/readability/close.html';
+ window.location = 'http://lab.arc90.com/experiments/readability/close.html';
return false;
};
document.getElementById('send-email').onclick = function(){
@@ -271,9 +271,13 @@
};
};
<?php if($page == "complete"){ ?>
- timer = setTimeout(function(){
- window.location = 'close.html';
- }, 3000);
+ var timer = setTimeout(redirectToClosingPage, 3000);
+
+ function redirectToClosingPage()
+ {
+ clearTimeout(timer);
+ window.location = 'close.html';
+ }
<?php } ?>
</script>
<style type="text/css" media="screen">
diff --git a/js/readability.js b/js/readability.js
index 9f59bcb..6b53a15 100755
--- a/js/readability.js
+++ b/js/readability.js
@@ -1,254 +1,724 @@
-var readabilityVersion = "0.4";
-var emailSrc = 'http://davehauenstein.com/readability/email.php';
-var iframeLoads = 0;
+var readabilityVersion = "v1.0.0.1";
+var emailSrc = "http://proto1.arc90.com/readability/email.php";
+var highestScore = -1;
+var malformedContent = false;
(function(){
- var objOverlay = document.createElement("div");
- var objinnerDiv = document.createElement("div");
- var articleTools = document.createElement("DIV");
+ // some sites use plugins (jCarousel) that when Readability removes scripts
+ // or does something funky it causes an alert to appear every few seconds,
+ // to avoid this we'll override the alert and timer methods, we won't need
+ // them, yet consider a better approach
+ window.alert = function(message) {};
+ window.setInterval = function(method, timeout) {};
+ window.setTimeout = function(method, timeout) {};
- objOverlay.id = "readOverlay";
- objinnerDiv.id = "readInner";
+ var overlayContainer = document.createElement("DIV");
+ var articleTitle = document.createElement("H1");
+ var contentContainer = document.createElement("DIV");
+ var articleFooter = document.createElement("DIV");
+ var toolBar = document.createElement("DIV");
+
+ overlayContainer.id = "readOverlay";
+ contentContainer.id = "readInner";
- // Apply user-selected styling:
+ // apply user-selected styling
document.body.className = readStyle;
- objOverlay.className = readStyle;
- objinnerDiv.className = readMargin + " " + readSize;
+ overlayContainer.className = readStyle;
+ contentContainer.className = readMargin + " " + readSize;
+
+ // set up the toolbar widget
+ toolBar.id = "readTools";
+ toolBar.innerHTML = '<a href="#" onclick="return window.location.reload();" title="Reload original page" id="reload-page">Reload Original Page</a>' +
+ '<a href="#" onclick="javascript:window.print();" title="Print page" id="print-page">Print Page</a>' +
+ '<a href="#" onclick="emailBox();return false;" title="Email page" id="email-page">Email Page</a>';
+
+ // we'll use the page title as our title, unfortunately not all sites use
+ // this well, so we might want to consider say stripping an H1 tag
+ articleTitle.innerHTML = document.title;
+ contentContainer.appendChild(articleTitle);
+
+ // parse the article content and add it to the new content container
+ contentContainer.appendChild(parseContent());
+
+ // FIXME: footer image has both arc90 and readability logos, they should
+ // each have their own unique link (issue 59)
+ // http://code.google.com/p/arc90labs-readability/issues/detail?id=59
+ //
+ // add the footer and contents
+ articleFooter.id = "readFooter";
+ articleFooter.innerHTML = '<div><a href="http://www.arc90.com"><img src="http://lab.arc90.com/experiments/readability/images/footer.png" width="308" height="66" /></a></div>' +
+ '<div id="readability-version">' + readabilityVersion + '</div>';
+ contentContainer.appendChild(articleFooter);
+
+ // add the toolbar and then the content container to our body
+ overlayContainer.appendChild(toolBar);
+ overlayContainer.appendChild(contentContainer);
+
+ // for totally hosed HTML, add body node that can't be found because of bad HTML or something
+ if (!document.body)
+ document.body = document.createElement("body");
- // Set up tools widget
+ document.body.id = "";
+ document.body.innerHTML = "";
- // NOTE THE IMAGE URL'S HERE !!!!!!!!!!!!!!!!!
- // NOTE THE IMAGE URL'S HERE !!!!!!!!!!!!!!!!!
- // NOTE THE IMAGE URL'S HERE !!!!!!!!!!!!!!!!!
- articleTools.id = "readTools";
- articleTools.innerHTML = "\
- <a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>\
- <a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>\
- <a href='#' onclick='emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>\
- ";
+ // with all previous body content removed, add our new overlay/main container
+ document.body.insertBefore(overlayContainer, document.body.firstChild);
+})();
- objinnerDiv.appendChild(grabArticle()); // Get the article and place it inside the inner Div
- objOverlay.appendChild(articleTools);
- objOverlay.appendChild(objinnerDiv); // Insert the inner div into the overlay
- // For totally hosed HTML, add body node that can't be found because of bad HTML or something.
- if(document.body == null)
+function determineContentScore(score, parent, element)
+{
+ // TODO: should set as a global var since badKeywords are used elsewhere
+ var goodKeywords = ["article", "body", "content", "entry", "hentry", "post", "story", "text"];
+ var semiGoodKeywords = ["area", "container", "inner", "main"];
+ var badKeywords = ["ad", "captcha", "classified", "clear", "comment", "footer", "footnote", "leftcolumn", "listing", "menu", "meta", "module", "nav", "navbar", "rightcolumn", "sidebar", "sponsor", "toolbar", "tools", "trackback", "widget"];
+
+ // we'll be doing a case insensitive compare
+ var className = parent.className.toLowerCase();
+ var id = parent.id.toLowerCase();
+
+ // increment the score if the content might be what we are looking for
+ for (var i = 0; i < goodKeywords.length; i++)
{
- body = document.createElement("body");
- document.body = body;
+ if (className.indexOf(goodKeywords[i]) >= 0)
+ score++;
+
+ if (id.indexOf(goodKeywords[i]) >= 0)
+ score++;
}
-
- document.body.innerHTML = "";
- // Inserts the new content :
- document.body.insertBefore(objOverlay, document.body.firstChild);
-})()
+ // TODO: would like to improve the content scoring algorithm here
+ // to not have to use so many for loops
+
+ // at least a single good keyword was found, indicating we may have found our
+ // content container; other keywords don't necessarily have to do with content,
+ // but when they appear in conjunction with the good keywords we still want
+ // to increment our score
+ if (score >= 1)
+ {
+ // increment the score if the content might be what we are looking for
+ for (var i = 0; i < semiGoodKeywords.length; i++)
+ {
+ if (className.indexOf(semiGoodKeywords[i]) >= 0)
+ score++;
+
+ if (id.indexOf(semiGoodKeywords[i]) >= 0)
+ score++;
+ }
+ }
+
+ // decrement the score if the content is not what we are looking for
+ for (var j = 0; j < badKeywords.length; j++)
+ {
+ if (className.indexOf(badKeywords[j]) >= 0)
+ score = score - 15;
+
+ if (id.indexOf(badKeywords[j]) >= 0)
+ score = score - 15;
+ }
+
+ // TODO: verify that 20 seems an acceptable minimum, consider 15
+ //
+ // Add a point for the paragraph found
+ if (element.tagName.toLowerCase() == "p" && getWordCount(element) > 20) //|| (score == 0 && getText(element).length > 10))
+ score++;
+
+ // FIXME: not sure yet if this will be included, this would break
+ // pages that use multiple containers for content, or we could tweak
+ // the acceptable minimum... but that would have to be set quite
+ // high, for now we'll leave it out
+ //
+ // Add points for any words within this paragraph
+ //if (score > 0 && malformedContent)
+ // score += getWordCount(element);
+
+ // keep track of the highest score we've come across
+ if (score > highestScore)
+ highestScore = score;
+
+ return score;
+}
+
-function grabArticle() {
- var allParagraphs = document.getElementsByTagName("p");
- var topDivCount = 0;
- var topDiv = null;
- var topDivParas;
+function parseContent() {
+ // replace all doubled-up <BR> tags with <P> tags, and remove fonts
+ //var pattern = new RegExp("<br/?>[ \r\n\s]*<br/?>", "gi");
+ //document.body.innerHTML = document.body.innerHTML.replace(pattern, "</p><p>").replace(/<\/?font[^>]*>/gi, "");
+ document.body.innerHTML = document.body.innerHTML.replace(/<br\/?>\s*<br\/?>/gi, "<p />").replace(/<\/?font[^>]*>/gi, "");
+
+ /*
+
+ // was part of the PRE based content parsing but tweaking below
+ // could resolve the bad regex above replacing double br tags
+ // with an empty paragraph
+
+ var html = document.body.innerHTML;
+ var firstTime = true;
+
+ while (html.indexOf('\n\n') >= 0)
+ {
+ if (firstTime)
+ {
+ html = html.replace('\n\n', '<p>'); // first item
+ firstTime = false;
+ }
+
+ if (html.indexOf('\n\n') == html.lastIndexOf('\n\n'))
+ html = html.replace('\n\n', '</p>'); // last item
+ else
+ html = html.replace('\n\n', '</p><p>'); // every item in between
+ }
+
+ document.body.innerHTML = html;
+ */
var articleContent = document.createElement("DIV");
- var articleTitle = document.createElement("H1");
- var articleFooter = document.createElement("DIV");
+ var paragraphs = document.getElementsByTagName("P");
+ var contentBlocks = [];
- // Replace all doubled-up <BR> tags with <P> tags, and remove fonts.
- var pattern = new RegExp ("<br/?>[ \r\n\s]*<br/?>", "g");
- document.body.innerHTML = document.body.innerHTML.replace(pattern, "</p><p>").replace(/<\/?font[^>]*>/g, '');
- // Grab the title from the <title> tag and inject it as the title.
- articleTitle.innerHTML = document.title;
- articleContent.appendChild(articleTitle);
+ // DEBUG
+ console.log(paragraphs.length + " Paragraphs found");
- // Study all the paragraphs and find the chunk that has the best score.
- // A score is determined by things like: Number of <p>'s, commas, special classes, etc.
- for (var j=0; j < allParagraphs.length; j++) {
- parentNode = allParagraphs[j].parentNode;
-
- // Initialize readability data
- if(typeof parentNode.readability == 'undefined')
+
+ /*
+ // PRE based content parsing only!
+ // this was only an EXPERIMENT, need to be revisited
+
+ var pres = document.getElementsByTagName("PRE");
+ for (var i = 0; i < pres.length; i++)
+ {
+ var pre = pres[i];
+
+ var content = document.createElement("DIV");
+
+ var text = pre.textContent;
+ var firstTime = true;
+
+ while (text.indexOf('\n\n') >= 0)
{
- parentNode.readability = {"contentScore": 0};
-
- // Look for a special classname
- if(parentNode.className.match(/(comment|meta|footer|footnote)/))
- parentNode.readability.contentScore -= 50;
- else if(parentNode.className.match(/((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))/))
- parentNode.readability.contentScore += 25;
-
- // Look for a special ID
- if(parentNode.id.match(/(comment|meta|footer|footnote)/))
- parentNode.readability.contentScore -= 50;
- else if(parentNode.id.match(/^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/))
- parentNode.readability.contentScore += 25;
+ if (firstTime)
+ {
+ text = text.replace('\n\n', '<p>'); // first item
+ firstTime = false;
+ }
+ else
+ {
+ if (text.indexOf('\n\n') == text.lastIndexOf('\n\n'))
+ text = text.replace('\n\n', '</p>'); // last item
+ else
+ text = text.replace('\n\n', '</p><p>'); // every item in between
+ }
+ }
+
+ content.innerHTML = text.replace(/={10,}/g, "====================");
+
+ paragraphs = content.getElementsByTagName("P");
+
+ var preElements = [];
+ for (var j = 0; j < paragraphs.length; j++)
+ {
+ p = paragraphs[j];
+
+ breaks = p.getElementsByTagName("BR");
+
+ if (p.innerHTML.indexOf("\t") == -1 && p.innerHTML.indexOf(" ") == -1 && breaks.length >= 1)
+ {
+ p.innerHTML = p.innerHTML.replace(/<br\/?>/gi, " ");
+ }
+
+ console.log("tabs: " + p.innerHTML.split("\t").length + " -- " + p.innerHTML.split(/\s{2,}/g).length + " -- " + p.innerHTML.substr(0, 35))
+
+ numTabs = p.innerHTML.split("\t").length + p.innerHTML.split(/ {3,}/g).length;
+
+ if (numTabs > 3)
+ {
+ preElements.push(p);
+ }
+ }
+
+ for (var k = 0; k < preElements.length; k++)
+ {
+ var p = preElements[k];
+
+ var newPre = document.createElement("PRE");
+ newPre.innerHTML = p.innerHTML.replace(/<br\/>/gi, "\n");
+ newPre.className = "normalPre";
+
+ p.parentNode.replaceChild(newPre, p);
+ }
+
+ content.innerHTML = content.innerHTML.replace(/<p>[ \r\n\s]*<p>/gi, "<p>");
+
+ contentBlocks.push(content);
+ }
+ */
+
+ // wow.. talk about a bad site, no paragraphs found so we'll attempt to
+ // parse content from div's and set our malformedContent flag
+ if (paragraphs.length == 0)
+ {
+ paragraphs = document.getElementsByTagName("DIV");
+
+ malformedContent = true;
+ }
+
+ for (var i = 0; i < paragraphs.length; i++)
+ {
+ var parentNode = paragraphs[i].parentNode;
+
+ // if the parent happens to be a form element, accessing properties
+ // such as id or className doesn't work, or rather it attempts to access
+ // children, so we need to make sure we only deal with string values;
+ // also, if the parent element is the body then it's ignored
+ if (parentNode.tagName.toLowerCase() == "body" || typeof parentNode.id != "string" || typeof parentNode.className != "string")
+ continue;
+
+ // initialize readability score data
+ if (typeof parentNode.readability == "undefined")
+ parentNode.readability = {"contentScore": 0};
+
+ parentNode.readability.contentScore = determineContentScore(parentNode.readability.contentScore, parentNode, paragraphs[i]);
+
+ // looks like we have possible content candidates, add it
+ if (parentNode.readability.contentScore > 0)
+ {
+ // DEBUG
+ console.log(parentNode.tagName + " id: " + parentNode.id + " -- class: " + parentNode.className + " -- score: " + parentNode.readability.contentScore);
+
+ // careful, only add parent element once!
+ if (contentBlocks.indexOf(parentNode) == -1)
+ contentBlocks.push(parentNode);
+ }
+ }
+
+ /*
+ // TODO: need to revisit parsing strictly tables/divs content only
+ if (contentBlocks.length == 0)
+ {
+ var paragraphs = document.getElementsByTagName("tbody");
+
+ for (var i = 0; i < paragraphs.length; i++)
+ {
+ var parentNode = paragraphs[i].parentNode;
+
+ // Initialize readability data
+ if (typeof parentNode.readability == "undefined")
+ {
+ parentNode.readability = {"contentScore": determineContentScore(parentNode, paragraphs[i])};
+
+ if (parentNode.readability.contentScore > 0)
+ {
+ console.log(parentNode.tagName + " id: " + parentNode.id + " -- class: " + parentNode.className + " -- score: " + parentNode.readability.contentScore);
+
+ if (contentBlocks.indexOf(parentNode) == -1)
+ contentBlocks.push(parentNode);
+ }
+ }
+ }
+ }
+ */
+
+ removeScripts();
+ removeStylesheets();
+ removeStyles();
+
+
+ // DEBUG
+ console.log("ContentBlocks: " + contentBlocks.length + " -- HighestScore: " + highestScore);
+
+
+ // remove all content elements that aren't of the highest score
+ var numContentBlocks = contentBlocks.length - 1;
+ for (var m = numContentBlocks; m >= 0; m--)
+ {
+ var contentElement = contentBlocks[m];
+
+
+ // DEBUG
+ //console.log("id: " + contentElement.id + " -- class: " + contentElement.className + " -- result: " + ((highestScore < 20 && contentElement.readability.contentScore < highestScore) || (contentElement.readability.contentScore < 20)).toString().toUpperCase());
+
+
+ // FIXME: had trouble writing the if/else if as a single if or statement
+ // FIXME: not sure the minimum score is correct, need to test against wide
+ // range of content, particularly content divided in 2+ containers
+
+ // sometimes our content won't reach such a high score so here we look for an
+ // acceptable minimum, if our highest score didn't go above twenty remove all
+ // but the highest
+ if (highestScore < 20 && contentElement.readability.contentScore < highestScore)
+ {
+ contentBlocks.splice(m, 1);
+ } //otherwise we only remove content blocks that have scored less than that minimum
+ else if (highestScore > 20 && contentElement.readability.contentScore < 20)
+ {
+ contentBlocks.splice(m, 1);
+ }
+ }
+
+
+ // with many content containers we need to verify that some
+ // aren't descendants of others otherwise we'll get multiple output
+ if (contentBlocks.length > 1)
+ {
+ // remove all content elements that contain another content element as a descendant
+ var numContentBlocks = contentBlocks.length - 1;
+ for (var m = numContentBlocks; m >= 0; m--)
+ {
+ var contentElement = contentBlocks[m];
+
+ /**
+ * hasAnyAncestor should work better overall, but some sites
+ * have so many divs up the hierarchy with lots of good keywords
+ * that it's hard to keep those out; for those sites
+ * (http://www.azstarnet.com/news/290815) hasAnyDescendant works
+ * best, so we will need to consider changing and QA heavily.
+ */
+ if (hasAnyDescendant(contentElement, contentBlocks))
+ contentBlocks.splice(m, 1);
}
+ }
+
+
+ // DEBUG
+ console.log("ContentBlocks: " + contentBlocks.length);
+
+
+ for (var m = 0; m < contentBlocks.length; m++)
+ {
+ var contentElement = contentBlocks[m];
+
+ removeElementStyles(contentElement);
+
+ // remove any consecutive <br />'s into just one <br />
+ removeBreaks(contentElement);
+
+ // this cleanup should only happen if paragraphs were found since
+ // malformed content suggests div's are used to maintain content
+ if (!malformedContent)
+ {
+ // goes in and removes DIV's that have more non <p> stuff than <p> stuff
+ removeNonContentElement(contentElement, "div");
+ }
+
+ //removeNonContentElement(contentElement, "ul");
+
+ // clean out anymore possible junk
+ removeElementByMinWords(contentElement, "form");
+ removeElementByMinWords(contentElement, "object");
+ removeElementByMinWords(contentElement, "table", 250);
+ removeElementByMinWords(contentElement, "h1");
+ removeElementByMinWords(contentElement, "h2");
+ removeElementByMinWords(contentElement, "iframe");
+
+ articleContent.appendChild(contentElement);
+ }
+
+ // Readability has failed you.. show msg that content was not found
+ if (contentBlocks.length == 0)
+ {
+ articleContent = document.createElement("DIV");
+ articleContent.innerHTML = 'Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please <a href="http://code.google.com/p/arc90labs-readability/issues/entry">let us know by submitting an issue.</a>';
+ }
+
+ return articleContent;
+}
- // Add a point for the paragraph found
- if(getInnerText(allParagraphs[j]).length > 10)
- parentNode.readability.contentScore++;
- // Add points for any commas within this paragraph
- parentNode.readability.contentScore += getCharCount(allParagraphs[j]);
- }
- // Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
- for(nodeIndex = 0; (node = document.getElementsByTagName('*')[nodeIndex]); nodeIndex++)
- if(typeof node.readability != 'undefined' && (topDiv == null || node.readability.contentScore > topDiv.readability.contentScore))
- topDiv = node;
+//--------------------------------------------------------------------------
+//
+// ContentParserUtils
+//
+//--------------------------------------------------------------------------
- if(topDiv == null)
+/**
+ * Removes any elements of the provided tag name from the specified element
+ * if it doesn't contain the minimum amount of words.
+ *
+ * @param element The element.
+ * @param tagName The tag name of the elements to be retrieved from within
+ * the provided element.
+ * @param minWords The minimum number of words.
+ */
+function removeElementByMinWords(element, tagName, minWords)
+{
+ // default minimum if none is provided
+ minWords = minWords || 1000000; // FIXME: not sure why such a higher number!
+
+ var elements = element.getElementsByTagName(tagName);
+ var numElements = elements.length - 1;
+
+ for (var i = numElements; i >= 0; i--)
{
- topDiv = document.createElement('div');
- topDiv.innerHTML = 'Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please <a href="http://code.google.com/p/arc90labs-readability/issues/entry">let us know by submitting an issue.</a>';
+ var target = elements[i];
+
+ // the text content doesn't meet our requirements so remove it
+ if (getWordCount(target) < minWords)
+ {
+ target.parentNode.removeChild(target);
+ }
}
+}
+
+/**
+ * Removes any instances of the provided non-content element from the
+ * specified root element if its id or class name contains a bad keyword,
+ * or if it has fewer than 25 words and mostly non-paragraph content.
+ *
+ * @param element The element.
+ * @param tagName The tag name of the elements to be retrieved from within
+ * the provided element.
+ */
+function removeNonContentElement(element, tagName)
+{
+ var elements = element.getElementsByTagName(tagName);
+ var numElements = elements.length - 1;
- // REMOVES ALL STYLESHEETS ...
- for (var k=0;k < document.styleSheets.length; k++) {
- if (document.styleSheets[k].href != null && document.styleSheets[k].href.lastIndexOf("readability") == -1) {
- document.styleSheets[k].disabled = true;
+ // gather counts for other typical elements embedded within and then traverse
+ // backwards so we can remove elements at the same time without affecting the traversal
+ for (var i = numElements; i >= 0; i--)
+ {
+ var descendant = elements[i];
+ var p = descendant.getElementsByTagName("p").length;
+ var img = descendant.getElementsByTagName("img").length;
+ var li = descendant.getElementsByTagName("li").length;
+ var a = descendant.getElementsByTagName("a").length;
+ var embed = descendant.getElementsByTagName("embed").length;
+
+ var badKeywords = ["ad", "captcha", "classified", "clear", "comment", "footer", "footnote", "leftcolumn", "listing", "menu", "meta", "module", "nav", "navbar", "rightcolumn", "sidebar", "sponsor", "toolbar", "tools", "trackback", "widget"];
+
+ // should improve this, but for now if the element has a single bad keyword, remove it
+ for (var j = 0; j < badKeywords.length; j++)
+ {
+ if (descendant.id.toLowerCase().indexOf(badKeywords[j]) >= 0 || descendant.className.toLowerCase().indexOf(badKeywords[j]) >= 0)
+ {
+ descendant.parentNode.removeChild(descendant);
+ descendant = null;
+ break;
+ }
+ }
+
+ // found a bad keyword so the element has been removed, continue to the next one
+ if (!descendant)
+ continue;
+
+ // we have fewer than 25 words.. bad sign..
+ if (getWordCount(descendant) < 25)
+ {
+ // the number of non-paragraph elements is more than actual
+ // paragraphs or other ominous signs (:) and elements
+ if (img > p || li >= p || a >= p || p == 0 || embed > 0)
+ {
+ descendant.parentNode.removeChild(descendant);
+ }
}
}
+}
- // Remove all style tags in head (not doing this on IE) :
- var styleTags = document.getElementsByTagName("style");
- for (var j=0;j < styleTags.length; j++)
- if (navigator.appName != "Microsoft Internet Explorer")
- styleTags[j].textContent = "";
+//--------------------------------------------------------------------------
+//
+// ElementUtils
+//
+//--------------------------------------------------------------------------
+
+/**
+ * Returns the word count for the specified element.
+ *
+ * @param element The element.
+ *
+ * @returns A count indicating the number of words
+ */
+function getWordCount(element)
+{
+ // normalize replaces consecutive spacing with a single space,
+ // and by then trimming, we can safely split on a space for a count
+ return trim(normalize(getText(element))).split(" ").length;
+}
- cleanStyles(topDiv); // Removes all style attributes
- topDiv = killDivs(topDiv); // Goes in and removes DIV's that have more non <p> stuff than <p> stuff
- topDiv = killBreaks(topDiv); // Removes any consecutive <br />'s into just one <br />
+/**
+ * Returns the text content of the specified element.
+ *
+ * @param element The element from which to retrieve its text content.
+ *
+ * @return The string content of the specified element.
+ */
+function getText(element)
+{
+ return (typeof element.textContent != "undefined")
+ ? element.textContent
+ : element.innerText;
+}
- // Cleans out junk from the topDiv just in case:
- topDiv = clean(topDiv, "form");
- topDiv = clean(topDiv, "object");
- topDiv = clean(topDiv, "table", 250);
- topDiv = clean(topDiv, "h1");
- topDiv = clean(topDiv, "h2");
- topDiv = clean(topDiv, "iframe");
+/**
+ * Walks up the parentNode chain of the specified element and reports
+ * whether any node on that chain is contained in the provided array.
+ *
+ * @param element The element whose ancestors are inspected.
+ * @param ancestors An array of candidate ancestor nodes (must provide indexOf).
+ *
+ * @returns True as soon as a parent found in the array is reached,
+ * false once the chain ends without a match.
+ */
+function hasAnyAncestor(element, ancestors)
+{
+ var parent = element.parentNode;
-
- // Add the footer and contents:
- articleFooter.id = "readFooter";
- articleFooter.innerHTML = "\
- <a href='http://www.arc90.com'><img src='http://lab.arc90.com/experiments/readability/images/footer.png'></a>\
- <div class='footer-right' >\
- <span class='version'>Readability version " + readabilityVersion + "</span>\
- </div>\
- ";
-
- articleContent.appendChild(topDiv);
- articleContent.appendChild(articleFooter);
+ while (parent != null)
+ {
+ // ancestor found! NOTE(review): Array indexOf is absent in IE8 and older - confirm target browsers
+ if (ancestors.indexOf(parent) >= 0)
+ return true;
+
+ parent = parent.parentNode;
+ }
- return articleContent;
+ return false;
}
-// Get the inner text of a node - cross browser compatibly.
-function getInnerText(e) {
- if (navigator.appName == "Microsoft Internet Explorer")
- return e.innerText;
- else
- return e.textContent;
+/**
+ * Checks whether any element nested inside the specified element is one of
+ * the provided candidate nodes.
+ *
+ * @param element The element whose subtree is searched.
+ * @param descendants An array of candidate descendant nodes.
+ *
+ * @returns True when at least one nested element is found in the array,
+ * false otherwise.
+ */
+function hasAnyDescendant(element, descendants)
+{
+	var nested = element.getElementsByTagName("*");
+	var found = false;
+
+	// stop scanning at the first nested element present in the candidates
+	for (var n = 0; n < nested.length && !found; n++)
+	{
+		found = descendants.indexOf(nested[n]) >= 0;
+	}
+
+	return found;
}
-// Get character count
-function getCharCount ( e,s ) {
- s = s || ",";
- return getInnerText(e).split(s).length;
+/**
+ * Collapses each run of 2+ whitespace characters into one space ("" if falsy).
+ */
+function normalize(text)
+{
+	return (text ? text : "").replace(/\s{2,}/g, " ");
}
-function cleanStyles( e ) {
- e = e || document;
- var cur = e.firstChild;
+/**
+ * Collapses each run of br tags (and trailing whitespace/&nbsp;) into one "<br />".
+ * @param element The element whose innerHTML is rewritten in place.
+ */
+function removeBreaks(element)
+{
+	var markup = element.innerHTML;
+	element.innerHTML = markup.replace(/(<br[^>]*\/?>(\s|&nbsp;?)*){1,}/gi, "<br />");
+}
- // If we had a bad node, there's not much we can do.
- if(!e)
+/**
+ * Strips the style attribute from the specified element and, recursively,
+ * from every element node beneath it.
+ * @param element The root node to clean; a falsy value is ignored.
+ */
+function removeElementStyles(element)
+{
+ // bad node, there's not much we can do
+ if (!element)
return;
-
- // Remove any root styles, if we're able.
- if(typeof e.removeAttribute == 'function')
- e.removeAttribute('style');
-
- // Go until there are no more child nodes
- while ( cur != null ) {
- if ( cur.nodeType == 1 ) {
- // Remove style attribute(s) :
- cur.removeAttribute("style");
- cleanStyles( cur );
+
+ // remove the root's own style, but only when it exposes removeAttribute
+ if (typeof element.removeAttribute == "function")
+ element.removeAttribute("style");
+
+ // walk the child nodes in order; only element nodes (nodeType 1) are touched
+ var childElement = element.firstChild;
+
+ while (childElement)
+ {
+ if (childElement.nodeType == 1)
+ {
+ childElement.removeAttribute("style");
+
+ // descend into the child (the call clears its style attribute once more)
+ removeElementStyles(childElement);
}
- cur = cur.nextSibling;
+
+ childElement = childElement.nextSibling;
}
}
-function killDivs ( e ) {
- var divsList = e.getElementsByTagName( "div" );
- var curDivLength = divsList.length;
-
- // Gather counts for other typical elements embedded within.
- // Traverse backwards so we can remove nodes at the same time without effecting the traversal.
- for (var i=curDivLength-1; i >= 0; i--) {
- var p = divsList[i].getElementsByTagName("p").length;
- var img = divsList[i].getElementsByTagName("img").length;
- var li = divsList[i].getElementsByTagName("li").length;
- var a = divsList[i].getElementsByTagName("a").length;
- var embed = divsList[i].getElementsByTagName("embed").length;
-
- // If the number of commas is less than 10 (bad sign) ...
- if ( getCharCount(divsList[i]) < 10) {
- // And the number of non-paragraph elements is more than paragraphs
- // or other ominous signs :
- if ( img > p || li > p || a > p || p == 0 || embed > 0) {
- divsList[i].parentNode.removeChild(divsList[i]);
- }
+/**
+ * Removes every SCRIPT element from the document, inline or external, unless its src contains "readability".
+ */
+function removeScripts()
+{
+ var scripts = document.getElementsByTagName("SCRIPT");
+ var numScripts = scripts.length - 1;
+
+ for (var n = numScripts; n >= 0; n--)
+ {
+ var script = scripts[n];
+
+ // iterating from the last index down, drop scripts with no src or a src lacking "readability"
+ if (!script.src || (script.src && script.src.indexOf("readability") == -1))
+ {
+ script.parentNode.removeChild(scripts[n]);
}
}
- return e;
-}
-
-function killBreaks ( e ) {
- e.innerHTML = e.innerHTML.replace(/(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,'<br />');
- return e;
}
-function clean(e, tags, minWords) {
- var targetList = e.getElementsByTagName( tags );
- minWords = minWords || 1000000;
-
- for (var y=0; y < targetList.length; y++) {
- // If the text content isn't laden with words, remove the child:
- if (getCharCount(targetList[y], " ") < minWords) {
- targetList[y].parentNode.removeChild(targetList[y]);
+/**
+ * Empties the text of every STYLE element in the document (the elements themselves stay in place).
+ */
+function removeStyles()
+{
+ var styleTags = document.getElementsByTagName("STYLE");
+
+ for (var j = 0; j < styleTags.length; j++)
+ {
+ var style = styleTags[j];
+
+ // TODO: need to verify that clearing out innerText works in IE
+ // might want to consider removing from parent (innerText is used when textContent is falsy)
+ if (style.textContent)
+ {
+ style.textContent = "";
+ }
+ else
+ {
+ style.innerText = "";
}
}
- return e;
}
-function emailBox() {
- var emailContainer = document.getElementById('email-container');
- if(null != emailContainer)
- {
- return;
- }
-
- var emailContainer = document.createElement('div');
- emailContainer.setAttribute('id', 'email-container');
- emailContainer.innerHTML = '<iframe src="'+emailSrc + '?pageUrl='+escape(window.location)+'&pageTitle='+escape(document.title)+'" scrolling="no" onload="removeFrame()" style="width:500px; height: 490px; border: 0;"></iframe>';
-
- document.body.appendChild(emailContainer);
+/**
+ * Disables each linked stylesheet whose href does not contain "readability".
+ */
+function removeStylesheets()
+{
+	// TODO: need to do more research, not sure if disabling is enough
+	// for cross browser compatibility, might consider removal via parent
+	// just as done in the removeScripts method
+	for (var s = 0; s < document.styleSheets.length; s++)
+	{
+		var sheet = document.styleSheets[s];
+		if (sheet.href == null || sheet.href.lastIndexOf("readability") != -1)
+			continue;
+		sheet.disabled = true;
+	}
}
-function removeFrame()
+/**
+ * Removes whitespace from the front and the end of the specified string.
+ *
+ * @param text The String to trim; a falsy value is treated as the empty string.
+ *
+ * @returns A String with whitespace removed from the beginning and end
+ */
+function trim(text)
{
- ++iframeLoads;
- if(iframeLoads >= 6)
- {
- var emailContainer = document.getElementById('email-container');
- if(null != emailContainer) {
- emailContainer.parentNode.removeChild(emailContainer);
- }
- // reset the count
- iframeLoads = 0;
- }
-}
+ return (text || "").replace(/^\s+|\s+$/g, "");
+} \ No newline at end of file