diff options
author | Nick White <git@njw.me.uk> | 2009-12-01 22:33:43 +0000 |
---|---|---|
committer | Nick White <git@njw.me.uk> | 2009-12-01 22:33:43 +0000 |
commit | 76442801921820fd73afa2ab1e751b6b34489cfc (patch) | |
tree | b78eb2af0a62bd3561fb2604dec17def05708d51 /js | |
parent | 7229fb4ad7a548b59320b85ba18aa0cf000f54ad (diff) | |
parent | 5c5767acbe6dc81567c7c7b6273441cbac373b8d (diff) | |
download | readability-simple-njw.tar.bz2 readability-simple-njw.zip |
Merged with latest upstream (refs: 1.0.0.3-njw, njw)
Diffstat (limited to 'js')
-rwxr-xr-x | js/readability.js | 144 |
1 files changed, 87 insertions, 57 deletions
diff --git a/js/readability.js b/js/readability.js index 869fcad..ba2b18f 100755 --- a/js/readability.js +++ b/js/readability.js @@ -1,5 +1,3 @@ -var readabilityVersion = "1.0.0.1";
-var emailSrc = "http://proto1.arc90.com/readability/email.php";
var highestScore = -1;
var malformedContent = false;
@@ -22,9 +20,13 @@ if (typeof console == 'undefined') window.setInterval = function(method, timeout) {};
window.setTimeout = function(method, timeout) {};
- var overlayContainer = document.createElement("DIV");
- var articleTitle = document.createElement("H1");
- var contentContainer = document.createElement("DIV");
+ var overlayContainer = document.createElement("DIV"),
+ articleTitle = document.createElement("H1"),
+ contentContainer = document.createElement("DIV"),
+ articleFooter = document.createElement("DIV"),
+ toolBar = document.createElement("DIV"),
+ readabilityVersion = "1.0.0.3",
+ emailSrc = "http://proto1.arc90.com/readability/email.php";
overlayContainer.id = "readOverlay";
contentContainer.id = "readInner";
@@ -54,16 +56,17 @@ if (typeof console == 'undefined') function determineContentScore(score, parent, element)
{
// TODO: should set as a global var since badKeywords are used elsewhere
- var goodKeywords = ["article", "body", "content", "entry", "hentry", "post", "story", "text"];
- var semiGoodKeywords = ["area", "container", "inner", "main"];
- var badKeywords = ["ad", "captcha", "classified", "comment", "footer", "footnote", "leftcolumn", "listing", "menu", "meta", "module", "nav", "navbar", "rightcolumn", "sidebar", "sponsor", "tab", "toolbar", "tools", "trackback", "widget"];
-
- // we'll be doing a case insensitive compare
- var className = parent.className.toLowerCase();
- var id = parent.id.toLowerCase();
+ var goodKeywords = ["article", "body", "content", "entry", "hentry", "post", "story", "text"],
+ semiGoodKeywords = ["area", "container", "inner", "main"],
+ badKeywords = ["ad", "captcha", "classified", "comment", "footer", "footnote", "leftcolumn", "listing", "menu", "meta", "module", "nav", "navbar", "rightcolumn", "sidebar", "sponsor", "tab", "toolbar", "tools", "trackback", "widget"],
+ className = parent.className.toLowerCase(), // we'll be doing a case insensitive compare
+ id = parent.id.toLowerCase(), // we'll be doing a case insensitive compare
+ i = goodKeywords.length,
+ j = semiGoodKeywords.length,
+ k = badKeywords.length;
// increment the score if the content might be what we are looking for
- for (var i = 0; i < goodKeywords.length; i++)
+ while (i--)
{
if (className.indexOf(goodKeywords[i]) >= 0)
score++;
@@ -82,7 +85,7 @@ function determineContentScore(score, parent, element) if (score >= 1)
{
// increment the score if the content might be what we are looking for
- for (var i = 0; i < semiGoodKeywords.length; i++)
+ while (j--)
{
if (className.indexOf(semiGoodKeywords[i]) >= 0)
score++;
@@ -93,7 +96,7 @@ function determineContentScore(score, parent, element) }
// decrement the score if the content is not what we are looking for
- for (var j = 0; j < badKeywords.length; j++)
+ while (k--)
{
if (className.indexOf(badKeywords[j]) >= 0)
score = score - 15;
@@ -108,6 +111,7 @@ function determineContentScore(score, parent, element) if (element.tagName.toLowerCase() == "p" && getWordCount(element) > 20) //|| (score == 0 && getText(element).length > 10))
score++;
+ // DEBUG
console.log(element.tagName.toLowerCase() + " " + getWordCount(element));
//if (getWordCount(element) > 30)
@@ -134,9 +138,9 @@ function parseContent() { // replace all doubled-up <BR> tags with <P> tags, and remove inline fonts
document.body.innerHTML = document.body.innerHTML.replace(/<br[^>]*>\s| *<br[^>]*>/gi, "<p />").replace(/<\/?font[^>]*>/gi, "");
- var articleContent = document.createElement("DIV");
- var paragraphs = document.getElementsByTagName("P");
- var contentBlocks = [];
+ var articleContent = document.createElement("DIV"),
+ paragraphs = document.getElementsByTagName("P"),
+ contentBlocks = [];
// DEBUG
@@ -216,8 +220,8 @@ function parseContent() { }
*/
- // wow.. talk about a bad site, no paragraphs found so we'll attempt to
- // parse content from div's and set our malformedContent flag
+ // no paragraphs found so we'll attempt to parse content from
+ // div's and set our malformedContent flag
if (paragraphs.length == 0)
{
paragraphs = document.getElementsByTagName("DIV");
@@ -225,7 +229,9 @@ function parseContent() { malformedContent = true;
}
- for (var i = 0; i < paragraphs.length; i++)
+ var i = paragraphs.length;
+
+ while (i--)
{
var parentNode = paragraphs[i].parentNode;
@@ -292,9 +298,10 @@ function parseContent() { console.log("ContentBlocks: " + contentBlocks.length + " -- HighestScore: " + highestScore);
+ var m = contentBlocks.length;
+
// remove all content elements that aren't of the highest score
- var numContentBlocks = contentBlocks.length - 1;
- for (var m = numContentBlocks; m >= 0; m--)
+ while (m--)
{
var contentElement = contentBlocks[m];
@@ -310,11 +317,11 @@ function parseContent() { // sometimes our content won't reach such a high score so here we look for an
// acceptable minimum, if our highest score didn't go above twenty remove all
// but the highest
- if (highestScore < 20 && contentElement.readability.contentScore < highestScore)
+ if (highestScore < 20 && contentElement.readability && contentElement.readability.contentScore < highestScore)
{
contentBlocks.splice(m, 1);
} //otherwise we only remove content blocks that have scored less than that minimum
- else if (highestScore > 20 && contentElement.readability.contentScore < 20)
+ else if (highestScore > 20 && contentElement.readability && contentElement.readability.contentScore < 20)
{
contentBlocks.splice(m, 1);
}
@@ -325,11 +332,12 @@ function parseContent() { // aren't descendants of others otherwise we'll get multiple output
if (contentBlocks.length > 1)
{
+ var n = contentBlocks.length;
+
// remove all content elements that are descendants of another
- var numContentBlocks = contentBlocks.length - 1;
- for (var m = numContentBlocks; m >= 0; m--)
+ while (n--)
{
- var contentElement = contentBlocks[m];
+ var contentElement = contentBlocks[n];
/**
* hasAnyAncestor should work better overall but some sites
@@ -339,7 +347,7 @@ function parseContent() { * best so will need to consider changing and QA heavily.
*/
if (hasAnyDescendant(contentElement, contentBlocks))
- contentBlocks.splice(m, 1);
+ contentBlocks.splice(n, 1);
}
}
@@ -348,9 +356,11 @@ function parseContent() { console.log("ContentBlocks: " + contentBlocks.length);
- for (var m = 0; m < contentBlocks.length; m++)
+ var p = contentBlocks.length;
+
+ while (p--)
{
- var contentElement = contentBlocks[m];
+ var contentElement = contentBlocks[p];
removeElementStyles(contentElement);
@@ -410,10 +420,10 @@ function removeElementByMinWords(element, tagName, minWords) // default minimum if none is provided
minWords = minWords || 1000000; // FIXME: not sure why such a higher number!
- var elements = element.getElementsByTagName(tagName);
- var numElements = elements.length - 1;
+ var elements = element.getElementsByTagName(tagName),
+ i = elements.length;
- for (var i = numElements; i >= 0; i--)
+ while (i--)
{
var target = elements[i];
@@ -436,19 +446,23 @@ function removeElementByMinWords(element, tagName, minWords) */
function removeNonContentElement(element, tagName)
{
- var elements = element.getElementsByTagName(tagName);
- var numElements = elements.length - 1;
+ var elements = element.getElementsByTagName(tagName),
+ i = elements.length;
// gather counts for other typical elements embedded within and then traverse
// backwards so we can remove elements at the same time without affecting the traversal
- for (var i = numElements; i >= 0; i--)
+ while (i--)
{
- var descendant = elements[i];
- var p = descendant.getElementsByTagName("p").length;
- var img = descendant.getElementsByTagName("img").length;
- var li = descendant.getElementsByTagName("li").length;
- var a = descendant.getElementsByTagName("a").length;
- var embed = descendant.getElementsByTagName("embed").length;
+ var badKeywords = ["ad", "captcha", "classified", "clear", "comment", "crumbs", "footer", "footnote", "leftcolumn", "listing", "menu", "meta", "module", "nav", "navbar", "rightcolumn", "sidebar", "sponsor", "tab", "tag", "toolbar", "tools", "trackback", "tweetback", "widget"],
+ descendant = elements[i],
+ descendantId = descendant.id.toLowerCase(),
+ descendantClassName = descendant.className.toLowerCase(),
+ p = descendant.getElementsByTagName("p").length,
+ img = descendant.getElementsByTagName("img").length,
+ li = descendant.getElementsByTagName("li").length,
+ a = descendant.getElementsByTagName("a").length,
+ embed = descendant.getElementsByTagName("embed").length;
+
/*
// no basic elements were found at all
@@ -480,12 +494,12 @@ function removeNonContentElement(element, tagName) }
else
{*/
- var badKeywords = ["ad", "captcha", "classified", "clear", "comment", "crumbs", "footer", "footnote", "leftcolumn", "listing", "menu", "meta", "module", "nav", "navbar", "rightcolumn", "sidebar", "sponsor", "tab", "tag", "toolbar", "tools", "trackback", "tweetback", "widget"];
+ var j = badKeywords.length;
// should improve this, but for now if the element has a single bad keyword remove it
- for (var j = 0; j < badKeywords.length; j++)
+ while (j--)
{
- if (descendant.id.toLowerCase().indexOf(badKeywords[j]) >= 0 || descendant.className.toLowerCase().indexOf(badKeywords[j]) >= 0)
+ if (descendantId.indexOf(badKeywords[j]) >= 0 || descendantClassName.indexOf(badKeywords[j]) >= 0)
{
descendant.parentNode.removeChild(descendant);
descendant = null;
@@ -583,9 +597,10 @@ function hasAnyAncestor(element, ancestors) */
function hasAnyDescendant(element, descendants)
{
- var elements = element.getElementsByTagName("*");
+ var elements = element.getElementsByTagName("*"),
+ i = elements.length;
- for (var i = 0; i < elements.length; i++)
+ while (i--)
{
// descendant found!
if (descendants.indexOf(elements[i]) >= 0)
@@ -596,6 +611,19 @@ function hasAnyDescendant(element, descendants) }
/**
+ * Returns true if the value given is defined. Otherwise returns false.
+ *
+ * @param value The value to determine if defined.
+ *
+ * @return True if the value given is defined, false if it is not.
+ */
+function isDefined(value)
+{
+ var undefined;
+ return value !== undefined;
+}
+
+/**
* Replaces consecutive spaces with a single space.
*/
function normalize(text)
@@ -657,10 +685,10 @@ function removeElementStyles(element) */
function removeScripts()
{
- var scripts = document.getElementsByTagName("SCRIPT");
- var numScripts = scripts.length - 1;
+ var scripts = document.getElementsByTagName("SCRIPT"),
+ i = scripts.length;
- for (var i = numScripts; i >= 0; i--)
+ while (i--)
{
var script = scripts[i];
@@ -677,10 +705,10 @@ function removeScripts() */
function removeStyles()
{
- var styles = document.getElementsByTagName("STYLE");
- var startIndex = styles.length - 1;
+ var styles = document.getElementsByTagName("STYLE"),
+ i = styles.length;
- for (var i = startIndex; i >= 0; i--)
+ while (i--)
{
var style = styles[i];
@@ -714,16 +742,18 @@ function removeStyles() */
function removeStylesheets()
{
+ var i = document.styleSheets.length;
+
// TODO: need to do more research, not sure if disabling is enough
// for cross browser compatibility, might consider removal via parent
// just as done in the removeScripts method, but will need to retrieve
// all LINK tags and make sure rel attr is "stylesheet" or that its
// type attr is "text/css"
- for (var k = 0; k < document.styleSheets.length; k++)
+ while (i--)
{
- var styleSheet = document.styleSheets[k];
+ var styleSheet = document.styleSheets[i];
- if (styleSheet.href != null && styleSheet.href.lastIndexOf("readability") == -1)
+ if (styleSheet.href && styleSheet.href.lastIndexOf("readability") == -1)
{
styleSheet.disabled = true;
}
|