summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-x index.html (renamed from index.htm) 0
-rwxr-xr-x js/readability.js 144
-rw-r--r-- themes.html 43
3 files changed, 130 insertions, 57 deletions
diff --git a/index.htm b/index.html
index 95aed0b..95aed0b 100755
--- a/index.htm
+++ b/index.html
diff --git a/js/readability.js b/js/readability.js
index 869fcad..ba2b18f 100755
--- a/js/readability.js
+++ b/js/readability.js
@@ -1,5 +1,3 @@
-var readabilityVersion = "1.0.0.1";
-var emailSrc = "http://proto1.arc90.com/readability/email.php";
var highestScore = -1;
var malformedContent = false;
@@ -22,9 +20,13 @@ if (typeof console == 'undefined')
window.setInterval = function(method, timeout) {};
window.setTimeout = function(method, timeout) {};
- var overlayContainer = document.createElement("DIV");
- var articleTitle = document.createElement("H1");
- var contentContainer = document.createElement("DIV");
+ var overlayContainer = document.createElement("DIV"),
+ articleTitle = document.createElement("H1"),
+ contentContainer = document.createElement("DIV"),
+ articleFooter = document.createElement("DIV"),
+ toolBar = document.createElement("DIV"),
+ readabilityVersion = "1.0.0.3",
+ emailSrc = "http://proto1.arc90.com/readability/email.php";
overlayContainer.id = "readOverlay";
contentContainer.id = "readInner";
@@ -54,16 +56,17 @@ if (typeof console == 'undefined')
function determineContentScore(score, parent, element)
{
// TODO: should set as a global var since badKeywords are used elsewhere
- var goodKeywords = ["article", "body", "content", "entry", "hentry", "post", "story", "text"];
- var semiGoodKeywords = ["area", "container", "inner", "main"];
- var badKeywords = ["ad", "captcha", "classified", "comment", "footer", "footnote", "leftcolumn", "listing", "menu", "meta", "module", "nav", "navbar", "rightcolumn", "sidebar", "sponsor", "tab", "toolbar", "tools", "trackback", "widget"];
-
- // we'll be doing a case insensitive compare
- var className = parent.className.toLowerCase();
- var id = parent.id.toLowerCase();
+ var goodKeywords = ["article", "body", "content", "entry", "hentry", "post", "story", "text"],
+ semiGoodKeywords = ["area", "container", "inner", "main"],
+ badKeywords = ["ad", "captcha", "classified", "comment", "footer", "footnote", "leftcolumn", "listing", "menu", "meta", "module", "nav", "navbar", "rightcolumn", "sidebar", "sponsor", "tab", "toolbar", "tools", "trackback", "widget"],
+ className = parent.className.toLowerCase(), // we'll be doing a case insensitive compare
+ id = parent.id.toLowerCase(), // we'll be doing a case insensitive compare
+ i = goodKeywords.length,
+ j = semiGoodKeywords.length,
+ k = badKeywords.length;
// increment the score if the content might be what we are looking for
- for (var i = 0; i < goodKeywords.length; i++)
+ while (i--)
{
if (className.indexOf(goodKeywords[i]) >= 0)
score++;
@@ -82,7 +85,7 @@ function determineContentScore(score, parent, element)
if (score >= 1)
{
// increment the score if the content might be what we are looking for
- for (var i = 0; i < semiGoodKeywords.length; i++)
+ // j is this loop's counter; i was already counted down to -1 by the goodKeywords loop,
+ // so indexing with i would look up undefined on every pass.
+ // NOTE(review): any other semiGoodKeywords[i] lookup in this loop outside this hunk needs the same change.
+ while (j--)
{
- if (className.indexOf(semiGoodKeywords[i]) >= 0)
+ if (className.indexOf(semiGoodKeywords[j]) >= 0)
score++;
@@ -93,7 +96,7 @@ function determineContentScore(score, parent, element)
}
// decrement the score if the content is not what we are looking for
- for (var j = 0; j < badKeywords.length; j++)
+ // k is this loop's counter; j belongs to the semiGoodKeywords loop and no longer walks
+ // badKeywords, so indexing with j would test the same (or no) keyword on every pass.
+ // NOTE(review): any other badKeywords[j] lookup in this loop outside this hunk needs the same change.
+ while (k--)
{
- if (className.indexOf(badKeywords[j]) >= 0)
+ if (className.indexOf(badKeywords[k]) >= 0)
score = score - 15;
@@ -108,6 +111,7 @@ function determineContentScore(score, parent, element)
if (element.tagName.toLowerCase() == "p" && getWordCount(element) > 20) //|| (score == 0 && getText(element).length > 10))
score++;
+ // DEBUG
console.log(element.tagName.toLowerCase() + " " + getWordCount(element));
//if (getWordCount(element) > 30)
@@ -134,9 +138,9 @@ function parseContent() {
// replace all doubled-up <BR> tags with <P> tags, and remove inline fonts
document.body.innerHTML = document.body.innerHTML.replace(/<br[^>]*>\s|&nbsp;*<br[^>]*>/gi, "<p />").replace(/<\/?font[^>]*>/gi, "");
- var articleContent = document.createElement("DIV");
- var paragraphs = document.getElementsByTagName("P");
- var contentBlocks = [];
+ var articleContent = document.createElement("DIV"),
+ paragraphs = document.getElementsByTagName("P"),
+ contentBlocks = [];
// DEBUG
@@ -216,8 +220,8 @@ function parseContent() {
}
*/
- // wow.. talk about a bad site, no paragraphs found so we'll attempt to
- // parse content from div's and set our malformedContent flag
+ // no paragraphs found so we'll attempt to parse content from
+ // div's and set our malformedContent flag
if (paragraphs.length == 0)
{
paragraphs = document.getElementsByTagName("DIV");
@@ -225,7 +229,9 @@ function parseContent() {
malformedContent = true;
}
- for (var i = 0; i < paragraphs.length; i++)
+ var i = paragraphs.length;
+
+ while (i--)
{
var parentNode = paragraphs[i].parentNode;
@@ -292,9 +298,10 @@ function parseContent() {
console.log("ContentBlocks: " + contentBlocks.length + " -- HighestScore: " + highestScore);
+ var m = contentBlocks.length;
+
// remove all content elements that aren't of the highest score
- var numContentBlocks = contentBlocks.length - 1;
- for (var m = numContentBlocks; m >= 0; m--)
+ while (m--)
{
var contentElement = contentBlocks[m];
@@ -310,11 +317,11 @@ function parseContent() {
// sometimes our content won't reach such a high score so here we look for an
// acceptable minimum, if our highest score didn't go above twenty remove all
// but the highest
- if (highestScore < 20 && contentElement.readability.contentScore < highestScore)
+ if (highestScore < 20 && contentElement.readability && contentElement.readability.contentScore < highestScore)
{
contentBlocks.splice(m, 1);
} //otherwise we only remove content blocks that have scored less than that minimum
- else if (highestScore > 20 && contentElement.readability.contentScore < 20)
+ else if (highestScore > 20 && contentElement.readability && contentElement.readability.contentScore < 20)
{
contentBlocks.splice(m, 1);
}
@@ -325,11 +332,12 @@ function parseContent() {
// aren't descendants of others otherwise we'll get multiple output
if (contentBlocks.length > 1)
{
+ var n = contentBlocks.length;
+
// remove all content elements that are descandants of another
- var numContentBlocks = contentBlocks.length - 1;
- for (var m = numContentBlocks; m >= 0; m--)
+ while (n--)
{
- var contentElement = contentBlocks[m];
+ var contentElement = contentBlocks[n];
/**
* hasAnyAncestor should work better overall but some sites
@@ -339,7 +347,7 @@ function parseContent() {
* best so will need to consider changing and QA heavily.
*/
if (hasAnyDescendant(contentElement, contentBlocks))
- contentBlocks.splice(m, 1);
+ contentBlocks.splice(n, 1);
}
}
@@ -348,9 +356,11 @@ function parseContent() {
console.log("ContentBlocks: " + contentBlocks.length);
- for (var m = 0; m < contentBlocks.length; m++)
+ var p = contentBlocks.length;
+
+ while (p--)
{
- var contentElement = contentBlocks[m];
+ var contentElement = contentBlocks[p];
removeElementStyles(contentElement);
@@ -410,10 +420,10 @@ function removeElementByMinWords(element, tagName, minWords)
// default minimum if none is provided
minWords = minWords || 1000000; // FIXME: not sure why such a higher number!
- var elements = element.getElementsByTagName(tagName);
- var numElements = elements.length - 1;
+ var elements = element.getElementsByTagName(tagName),
+ i = elements.length;
- for (var i = numElements; i >= 0; i--)
+ while (i--)
{
var target = elements[i];
@@ -436,19 +446,23 @@ function removeElementByMinWords(element, tagName, minWords)
*/
function removeNonContentElement(element, tagName)
{
- var elements = element.getElementsByTagName(tagName);
- var numElements = elements.length - 1;
+ var elements = element.getElementsByTagName(tagName),
+ i = elements.length;
// gather counts for other typical elements embedded within and then traverse
// backwards so we can remove elements at the same time without effecting the traversal
- for (var i = numElements; i >= 0; i--)
+ while (i--)
{
- var descendant = elements[i];
- var p = descendant.getElementsByTagName("p").length;
- var img = descendant.getElementsByTagName("img").length;
- var li = descendant.getElementsByTagName("li").length;
- var a = descendant.getElementsByTagName("a").length;
- var embed = descendant.getElementsByTagName("embed").length;
+ var badKeywords = ["ad", "captcha", "classified", "clear", "comment", "crumbs", "footer", "footnote", "leftcolumn", "listing", "menu", "meta", "module", "nav", "navbar", "rightcolumn", "sidebar", "sponsor", "tab", "tag", "toolbar", "tools", "trackback", "tweetback", "widget"],
+ descendant = elements[i],
+ descendantId = descendant.id.toLowerCase(),
+ descendantClassName = descendant.className.toLowerCase(),
+ p = descendant.getElementsByTagName("p").length,
+ img = descendant.getElementsByTagName("img").length,
+ li = descendant.getElementsByTagName("li").length,
+ a = descendant.getElementsByTagName("a").length,
+ embed = descendant.getElementsByTagName("embed").length;
+
/*
// no basic elements were found at all
@@ -480,12 +494,12 @@ function removeNonContentElement(element, tagName)
}
else
{*/
- var badKeywords = ["ad", "captcha", "classified", "clear", "comment", "crumbs", "footer", "footnote", "leftcolumn", "listing", "menu", "meta", "module", "nav", "navbar", "rightcolumn", "sidebar", "sponsor", "tab", "tag", "toolbar", "tools", "trackback", "tweetback", "widget"];
+ var j = badKeywords.length;
// should improve this but for if the element has a single bad keyword remove it
- for (var j = 0; j < badKeywords.length; j++)
+ while (j--)
{
- if (descendant.id.toLowerCase().indexOf(badKeywords[j]) >= 0 || descendant.className.toLowerCase().indexOf(badKeywords[j]) >= 0)
+ if (descendantId.indexOf(badKeywords[j]) >= 0 || descendantClassName.indexOf(badKeywords[j]) >= 0)
{
descendant.parentNode.removeChild(descendant);
descendant = null;
@@ -583,9 +597,10 @@ function hasAnyAncestor(element, ancestors)
*/
function hasAnyDescendant(element, descendants)
{
- var elements = element.getElementsByTagName("*");
+ var elements = element.getElementsByTagName("*"),
+ i = elements.length;
- for (var i = 0; i < elements.length; i++)
+ while (i--)
{
// descendant found!
if (descendants.indexOf(elements[i]) >= 0)
@@ -596,6 +611,19 @@ function hasAnyDescendant(element, descendants)
}
/**
+ * Returns true if the value given is defined. Otherwise returns false.
+ * A value counts as defined when it is not strictly equal to undefined,
+ * so null, 0, "" and false are all reported as defined.
+ *
+ * @param value The value to determine if defined.
+ *
+ * @return True if the value given is defined, false if it is not.
+ */
+function isDefined(value)
+{
+ var undefined; // declared but never assigned, so it holds the real undefined value regardless of what the global "undefined" identifier is set to
+ return value !== undefined;
+}
+
+/**
* Replaces consecutive spaces with a single space.
*/
function normalize(text)
@@ -657,10 +685,10 @@ function removeElementStyles(element)
*/
function removeScripts()
{
- var scripts = document.getElementsByTagName("SCRIPT");
- var numScripts = scripts.length - 1;
+ var scripts = document.getElementsByTagName("SCRIPT"),
+ i = scripts.length;
- for (var i = numScripts; i >= 0; i--)
+ while (i--)
{
var script = scripts[i];
@@ -677,10 +705,10 @@ function removeScripts()
*/
function removeStyles()
{
- var styles = document.getElementsByTagName("STYLE");
- var startIndex = styles.length - 1;
+ var styles = document.getElementsByTagName("STYLE"),
+ i = styles.length;
- for (var i = startIndex; i >= 0; i--)
+ while (i--)
{
var style = styles[i];
@@ -714,16 +742,18 @@ function removeStyles()
*/
function removeStylesheets()
{
+ var i = document.styleSheets.length;
+
// TODO: need to do more research, not sure if disabling is enough
// for cross browser compatibility, might consider removal via parent
// just as done in the removeScripts method, but will need to retrieve
// all LINK tags and make sure rel attr is "stylesheet" or that its
// type attr is "text/css"
- for (var k = 0; k < document.styleSheets.length; k++)
+ while (i--)
{
- var styleSheet = document.styleSheets[k];
+ var styleSheet = document.styleSheets[i];
- if (styleSheet.href != null && styleSheet.href.lastIndexOf("readability") == -1)
+ if (styleSheet.href && styleSheet.href.lastIndexOf("readability") == -1)
{
styleSheet.disabled = true;
}
diff --git a/themes.html b/themes.html
new file mode 100644
index 0000000..9d5a253
--- /dev/null
+++ b/themes.html
@@ -0,0 +1,43 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<title>Readability - An Arc90 Lab Experiment</title>
+<link rel="stylesheet" href="css/readability.css" type="text/css" media="screen" />
+<link rel="stylesheet" href="css/readability-print.css" type="text/css" media="print" />
+</head>
+<body>
+
+<div id="readOverlay" class="style-newspaper">
+
+ <!-- Toolbar actions. Every handler ends with "return false" so the placeholder "#" href is never followed. -->
+ <div id="readTools">
+ <a href="#" onclick="window.location.reload();return false;" title="Reload original page" id="reload-page">Reload Original Page</a>
+ <a href="#" onclick="window.print();return false;" title="Print page" id="print-page">Print Page</a>
+ <a href="#" onclick="emailBox();return false;" title="Email page" id="email-page">Email Page</a>
+ </div>
+
+ <div id="readInner" class="margin-wide size-large">
+
+ <div>
+ <h1>Compelling Content Worth Reading</h1>
+ <hr/>
+ <p>One morning, when <a href="http://www.arc90.com/">Gregor Samsa</a> woke from troubled dreams, he found himself transformed in his bed into a horrible vermin. He lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightly domed and divided by arches into stiff sections.</p>
+ <p> The bedding was hardly able to cover it and seemed ready to slide off any moment. His many legs, pitifully thin compared with the size of the rest of him, waved about helplessly as he looked. "What's happened to me?" he thought. It wasn't a dream.</p>
+ <p> His room, a proper human room although a little too small, lay peacefully between its four familiar walls. A <a href="http://www.arc90.com/">collection of textile samples</a> lay spread out on the table - Samsa was a travelling salesman - and above it there hung a picture that he had recently cut out of an illustrated magazine and housed in a nice, gilded frame.</p>
+ <p> It showed a lady fitted out with a fur hat and fur boa who sat upright, raising a heavy fur muff that covered the whole of her lower arm towards the viewer. Gregor then turned to look out the window at the dull weather. Drops of rain could be heard hitting the pane, which made him feel quite sad.</p>
+ <p> "How about if I sleep a little bit longer and forget all this nonsense", he thought, but that was something he was unable to do because he was used to sleeping on his right, and in his present state couldn't get into that position. However hard he threw himself onto his right, he always rolled back to where he was.</p>
+ <p> He must have tried it a hundred times, shut his eyes so that he wouldn't have to look at the floundering legs, and only stopped when he began to feel a mild, dull pain there that he had never felt before. "Oh, God", he thought, "what a strenuous career it is that I've chosen! Travelling day in and day out. Doing business like this takes much more effort than doing your own business at home, and on top of that there's the curse of travelling, worries about making train connections, bad and irregular food, contact with different people all the time so that you can never get to know anyone or become friendly with them. It can all go to Hell!" He felt a slight itch.</p>
+ </div>
+
+ <div id="readFooter">
+ <div>
+ <a href="http://www.arc90.com"><img src="http://lab.arc90.com/experiments/readability/images/footer.png" width="308" height="66" /></a>
+ </div>
+ <div id="readability-version">1.0.0.1</div>
+ </div>
+
+ </div>
+
+</div>
+
+</body>
+</html> \ No newline at end of file