// Highest content score seen so far; updated by determineContentScore().
var highestScore = -1;
// Set by parseContent() when the page has no <p> elements and <div>s are
// scored instead.
var malformedContent = false;

// for now we want to hold on to our debugging, but if a browser doesn't
// support it, we'll install a console.log() method that does nothing.
// The stub is attached to window instead of being declared with `var`: a
// hoisted `var console` would shadow the real console (and therefore always
// be undefined) if this script is ever wrapped in a function.
if (typeof window.console == "undefined")
{
    window.console = {
        log: function(msg) {
            return;
        }
    };
}

// some sites use plugins (jCarousel) that when Readability removes scripts
// or does something funky it causes an alert to appear every few seconds,
// to avoid this we'll override the alert and timer methods, we won't need
// them, yet consider a better approach
window.alert = function(message) {};
window.setInterval = function(method, timeout) {};
window.setTimeout = function(method, timeout) {};

var overlayContainer = document.createElement("DIV"),
    articleTitle = document.createElement("H1"),
    contentContainer = document.createElement("DIV"),
    articleFooter = document.createElement("DIV"),
    toolBar = document.createElement("DIV"),
    readabilityVersion = "1.0.0.3",
    emailSrc = "http://proto1.arc90.com/readability/email.php";

overlayContainer.id = "readOverlay";
contentContainer.id = "readInner";

// we'll use the page title as our title, unfortunately not all sites use
// this well, so we might want to consider say stripping an H1 tag.
// The title is inserted as a text node: document.title is plain text, and
// assigning it to innerHTML would parse characters such as "<" or "&" as markup.
articleTitle.appendChild(document.createTextNode(document.title));
contentContainer.appendChild(articleTitle);

// for totally hosed HTML, add body node that can't be found because of bad
// HTML or something. This has to happen before parseContent() runs because
// parseContent() reads document.body.
if (!document.body)
    document.body = document.createElement("body");

// parse the article content and add it to the new content container
contentContainer.appendChild(parseContent());

// add the content container to our overlay
overlayContainer.appendChild(contentContainer);

// wipe whatever is left of the original page
document.body.id = "";
document.body.innerHTML = "";

// with all previous body content removed, add our new overlay/main container
document.body.insertBefore(overlayContainer, document.body.firstChild);
/**
 * Computes an updated content score for a candidate container, based on the
 * keywords found in the container's class name and id and on the element
 * that was found inside it.
 *
 * Also raises the global highestScore when the new score exceeds it.
 *
 * @param score The score accumulated so far for the parent.
 * @param parent The candidate container; its className and id must be strings.
 * @param element The element found inside the parent (a paragraph, or a div
 * when the page has no paragraphs).
 *
 * @returns The updated score.
 */
function determineContentScore(score, parent, element)
{
    // TODO: should set as a global var since badKeywords are used elsewhere
    var goodKeywords = ["article", "body", "content", "entry", "hentry", "post", "story", "text"],
        semiGoodKeywords = ["area", "container", "inner", "main"],
        badKeywords = ["ad", "captcha", "classified", "comment", "footer", "footnote", "leftcolumn", "listing", "menu", "meta", "module", "nav", "navbar", "rightcolumn", "sidebar", "sponsor", "tab", "toolbar", "tools", "trackback", "widget"],
        className = parent.className.toLowerCase(), // we'll be doing a case insensitive compare
        id = parent.id.toLowerCase(), // we'll be doing a case insensitive compare
        tagName = element.tagName.toLowerCase(),
        wordCount = getWordCount(element),
        i = goodKeywords.length,
        j = semiGoodKeywords.length,
        k = badKeywords.length;

    // increment the score if the content might be what we are looking for
    while (i--)
    {
        if (className.indexOf(goodKeywords[i]) >= 0)
            score++;
        if (id.indexOf(goodKeywords[i]) >= 0)
            score++;
    }

    // at least a single good keyword was found indicating we may have found
    // our content container; keywords that don't necessarily have to do with
    // content only count when used in conjunction with the good keywords
    if (score >= 1)
    {
        // each loop must index with its own counter: the previous counter
        // has already run down to -1 and would look up `undefined`
        while (j--)
        {
            if (className.indexOf(semiGoodKeywords[j]) >= 0)
                score++;
            if (id.indexOf(semiGoodKeywords[j]) >= 0)
                score++;
        }
    }

    // decrement the score if the content is not what we are looking for
    while (k--)
    {
        if (className.indexOf(badKeywords[k]) >= 0)
            score = score - 15;
        if (id.indexOf(badKeywords[k]) >= 0)
            score = score - 15;
    }

    // TODO: verify that 20 seems an acceptable minimum, consider 15
    //
    // Add a point for the paragraph found
    if (tagName == "p" && wordCount > 20)
        score++;

    // DEBUG
    console.log(tagName + " " + wordCount);

    // keep track of the highest score we've come across
    if (score > highestScore)
        highestScore = score;

    return score;
}
/**
 * Finds the container(s) most likely to hold the page's main content, cleans
 * them up and returns them wrapped in a single new DIV.
 *
 * Side effects: rewrites document.body.innerHTML, removes scripts and styles
 * from the document, attaches a `readability` object to every scored parent,
 * may set the global malformedContent flag, and moves the selected content
 * elements out of the page into the returned container.
 *
 * NOTE(review): the beginning of this function was garbled in the file
 * (literal HTML tags had been stripped from the regexes, strings and
 * comments, and the local declarations were lost). The declarations and the
 * first statement below are reconstructed from the surviving comments and
 * from how the names are used further down -- confirm against the upstream
 * source. A commented-out experiment that converted tab-heavy paragraphs to
 * PRE elements was unrecoverable and has been dropped.
 *
 * @returns A DIV element containing the extracted content, or a DIV holding
 * an apology message when no content could be found.
 */
function parseContent()
{
    var articleContent = document.createElement("DIV"),
        contentBlocks = [],
        paragraphs,
        parentNode,
        contentElement,
        minimumScore,
        i,
        m,
        n,
        p;

    // replace all doubled-up <br> tags with <p> tags, and remove inline fonts
    // NOTE(review): patterns and replacement reconstructed from the comment
    // above; the original literals were lost.
    if (document.body)
    {
        document.body.innerHTML = document.body.innerHTML
            .replace(/(<br[^>]*>\s*){2,}/gi, "<p>")
            .replace(/<\/?font[^>]*>/gi, "");
    }

    paragraphs = document.getElementsByTagName("P");

    // no paragraphs found so we'll attempt to parse content from
    // div's and set our malformedContent flag
    if (paragraphs.length == 0)
    {
        paragraphs = document.getElementsByTagName("DIV");
        malformedContent = true;
    }

    i = paragraphs.length;
    while (i--)
    {
        parentNode = paragraphs[i].parentNode;

        // TODO: originally the if/continue statement below checked if the parent
        // was the body tag and if it was continued on.. why?
        // if the parent happens to be a form element, accessing properties
        // such as id or className don't work, or rather it attempts to access
        // children so we need to make sure we only deal with string values
        if (typeof parentNode.id != "string" || typeof parentNode.className != "string")
            continue;

        // initialize readability score data
        if (typeof parentNode.readability == "undefined")
            parentNode.readability = {"contentScore": 0};

        parentNode.readability.contentScore = determineContentScore(parentNode.readability.contentScore, parentNode, paragraphs[i]);

        // looks like we have possible content candidates, add it
        if (parentNode.readability.contentScore > 0)
        {
            // DEBUG
            console.log(parentNode.tagName + " id: " + parentNode.id + " -- class: " + parentNode.className + " -- score: " + parentNode.readability.contentScore);

            // careful, only add parent element once!
            if (contentBlocks.indexOf(parentNode) == -1)
                contentBlocks.push(parentNode);
        }
    }

    // TODO: need to revisit parsing strictly tables/divs content only
    // (scoring the parents of TBODY elements when no content block is found)

    removeScripts();
    removeStylesheets();
    removeStyles();

    // DEBUG
    console.log("ContentBlocks: " + contentBlocks.length + " -- HighestScore: " + highestScore);

    // FIXME: not sure the minimum score is correct, need to test against wide
    // range of content, particularly content divided in 2+ containers
    // sometimes our content won't reach such a high score so here we look for an
    // acceptable minimum: if the highest score didn't go above twenty we remove
    // all but the highest, otherwise we only remove content blocks that scored
    // less than twenty. Taking the smaller of the two as the cut-off expresses
    // both rules at once and also covers a highest score of exactly twenty,
    // which previously matched neither rule.
    minimumScore = Math.min(highestScore, 20);
    m = contentBlocks.length;
    while (m--)
    {
        contentElement = contentBlocks[m];
        if (contentElement.readability && contentElement.readability.contentScore < minimumScore)
            contentBlocks.splice(m, 1);
    }

    // with many content containers we need to verify that some
    // aren't descendants of others otherwise we'll get multiple output
    if (contentBlocks.length > 1)
    {
        n = contentBlocks.length;
        while (n--)
        {
            contentElement = contentBlocks[n];
            /**
             * hasAnyAncestor should work better overall but some sites
             * have so many div's up the hierarchy with lots of good keywords
             * its hard to keep those out, for those sites
             * (http://www.azstarnet.com/news/290815) hasAnyDescendant works
             * best so will need to consider changing and QA heavily.
             */
            if (hasAnyDescendant(contentElement, contentBlocks))
                contentBlocks.splice(n, 1);
        }
    }

    // DEBUG
    console.log("ContentBlocks: " + contentBlocks.length);

    // contentBlocks was filled while walking the paragraphs backwards, so
    // walking it backwards here appends the blocks in document order
    p = contentBlocks.length;
    while (p--)
    {
        contentElement = contentBlocks[p];
        removeElementStyles(contentElement);

        // collapse any consecutive <br />'s into just one <br />
        removeBreaks(contentElement);

        // this cleanup should only happen if paragraphs were found since
        // malformed content suggests div's are used to maintain content
        if (!malformedContent)
        {
            // goes in and removes DIV's that have more non-content stuff
            // than content stuff
            removeNonContentElement(contentElement, "div");
        }

        // clean out anymore possible junk
        removeElementByMinWords(contentElement, "form");
        removeElementByMinWords(contentElement, "object");
        removeElementByMinWords(contentElement, "table", 250);
        removeElementByMinWords(contentElement, "h1");
        removeElementByMinWords(contentElement, "h2");
        removeElementByMinWords(contentElement, "iframe");

        articleContent.appendChild(contentElement);
    }

    // Readability has failed you.. show msg that content was not found
    if (contentBlocks.length == 0)
    {
        articleContent = document.createElement("DIV");
        articleContent.innerHTML = 'Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please let us know by submitting an issue.';
    }

    return articleContent;
}
//--------------------------------------------------------------------------
//
// ContentParserUtils
//
//--------------------------------------------------------------------------
/**
 * Strips out every descendant of the given tag name whose word count falls
 * short of a threshold.
 *
 * @param element The root element to clean.
 * @param tagName The tag name of the descendants to inspect.
 * @param minWords The word count a descendant must reach to be kept. When
 * omitted (or falsy) a threshold of 1000000 is used, which in practice
 * removes every matching descendant.
 */
function removeElementByMinWords(element, tagName, minWords)
{
    // FIXME: not sure why such a higher number!
    var threshold = minWords ? minWords : 1000000;
    var candidates = element.getElementsByTagName(tagName);

    // walk from the end so removals don't disturb the indexes still to visit
    for (var index = candidates.length - 1; index >= 0; index--)
    {
        var candidate = candidates[index];

        if (getWordCount(candidate) < threshold)
            candidate.parentNode.removeChild(candidate);
    }
}
/**
 * Prunes descendants of the given tag name that look like page furniture
 * rather than content. A descendant is removed when its id or class name
 * contains any bad keyword, or when it holds fewer than 25 words and shows
 * another warning sign: no paragraphs at all, more images, list items or
 * links than paragraphs, or any embed.
 *
 * @param element The root element to clean.
 * @param tagName The tag name of the descendants to inspect.
 */
function removeNonContentElement(element, tagName)
{
    var badKeywords = ["ad", "captcha", "classified", "clear", "comment", "crumbs", "footer", "footnote", "leftcolumn", "listing", "menu", "meta", "module", "nav", "navbar", "rightcolumn", "sidebar", "sponsor", "tab", "tag", "toolbar", "tools", "trackback", "tweetback", "widget"];
    var candidates = element.getElementsByTagName(tagName);

    // visit the candidates last-to-first so removing one never shifts the
    // position of those still waiting to be checked
    for (var index = candidates.length - 1; index >= 0; index--)
    {
        var node = candidates[index];
        var nodeId = node.id.toLowerCase();
        var nodeClass = node.className.toLowerCase();
        var flagged = false;

        // a single bad keyword in the id or class is enough to condemn it
        for (var k = badKeywords.length - 1; k >= 0 && !flagged; k--)
        {
            flagged = nodeId.indexOf(badKeywords[k]) >= 0 || nodeClass.indexOf(badKeywords[k]) >= 0;
        }

        if (flagged)
        {
            node.parentNode.removeChild(node);
            continue;
        }

        // enough words to be taken seriously, leave it alone
        if (getWordCount(node) >= 25)
            continue;

        var paragraphCount = node.getElementsByTagName("p").length;
        var imageCount = node.getElementsByTagName("img").length;
        var listItemCount = node.getElementsByTagName("li").length;
        var linkCount = node.getElementsByTagName("a").length;
        var embedCount = node.getElementsByTagName("embed").length;

        // few words and the non-paragraph elements outnumber the actual
        // paragraphs, or other ominous signs
        var looksLikeJunk = imageCount > paragraphCount
            || listItemCount > paragraphCount
            || linkCount > paragraphCount
            || paragraphCount == 0
            || embedCount > 0;

        if (looksLikeJunk)
            node.parentNode.removeChild(node);
    }
}
//--------------------------------------------------------------------------
//
// ElementUtils
//
//--------------------------------------------------------------------------
/**
 * Reads the text of an element, preferring textContent and falling back to
 * innerText when textContent is not available.
 *
 * @param element The element whose text is wanted.
 *
 * @return The element's text.
 */
function getText(element)
{
    if (typeof element.textContent == "undefined")
        return element.innerText;

    return element.textContent;
}
/**
 * Returns the word count for the specified element.
 *
 * Words are runs of non-whitespace characters, so words separated by a
 * single newline or tab are counted individually, and an element with no
 * text (or only whitespace) has a count of 0.
 *
 * @param element The element.
 *
 * @returns A count indicating the number of words
 */
function getWordCount(element)
{
    var text = trim(getText(element));

    // splitting an empty string would still yield one (empty) entry
    if (text === "")
        return 0;

    return text.split(/\s+/).length;
}
/**
 * Checks whether any node on the element's parent chain appears in the
 * given list. The element itself is not considered.
 *
 * @param element The element whose parent chain is walked.
 * @param ancestors An array of possible ancestors.
 *
 * @returns True as soon as a parent is found in the list, false when the
 * top of the chain is reached without a match.
 */
function hasAnyAncestor(element, ancestors)
{
    for (var node = element.parentNode; node != null; node = node.parentNode)
    {
        if (ancestors.indexOf(node) >= 0)
            return true;
    }

    return false;
}
/**
 * Checks whether any element nested inside the given element appears in the
 * given list. The element itself is not considered.
 *
 * @param element The element whose descendants are inspected.
 * @param descendants An array of possible descendants.
 *
 * @returns True as soon as a nested element is found in the list, false
 * when none of them is.
 */
function hasAnyDescendant(element, descendants)
{
    var nested = element.getElementsByTagName("*");

    for (var index = nested.length - 1; index >= 0; index--)
    {
        var found = descendants.indexOf(nested[index]) >= 0;
        if (found)
            return true;
    }

    return false;
}
/**
 * Tells whether a value is anything other than undefined.
 *
 * @param value The value to check.
 *
 * @return False when the value is undefined, true otherwise (including for
 * null, 0 and the empty string).
 */
function isDefined(value)
{
    // `void 0` always evaluates to the real undefined value
    return value !== void 0;
}
/**
 * Collapses every run of two or more whitespace characters into one space.
 * A falsy argument is treated as the empty string.
 *
 * @param text The text to normalize.
 *
 * @returns The text with whitespace runs collapsed.
 */
function normalize(text)
{
    var source = text || "";

    return source.replace(/\s{2,}/g, " ");
}
/**
 * Replaces consecutive br tags with a single br tag from the specified element.
 *
 * A run is two or more br tags (in any spelling, e.g. <br>, <br/>, <BR />)
 * separated only by whitespace; a lone br tag is left as it is.
 *
 * NOTE(review): the original pattern and replacement string were garbled in
 * the file (the literal tags had been stripped); "<br />" is assumed as the
 * replacement -- confirm against the upstream source.
 *
 * @param element The element containing consecutive br tags.
 */
function removeBreaks(element)
{
    // matching "two or more" in one go (rather than pairs of tags) collapses
    // runs of any length, odd or even, down to a single tag
    element.innerHTML = element.innerHTML.replace(/(<br[^>]*>\s*){2,}/gi, "<br />");
}
/**
 * Removes the inline style attribute from the specified element and from
 * every element nested inside it.
 *
 * @param element The element containing the styles to be removed. Nothing
 * happens when it is null or undefined.
 */
function removeElementStyles(element)
{
    // bad node, there's not much we can do
    if (!element)
        return;

    // remove any root styles, if we're able
    if (typeof element.removeAttribute == "function")
        element.removeAttribute("style");

    // TODO: do not use firstChild and nextSibling, use childNodes array instead
    var childElement = element.firstChild;
    while (childElement)
    {
        // only element nodes (nodeType 1) can carry a style attribute; the
        // recursive call strips the child's own style (after checking that
        // the child itself supports removeAttribute) and then its subtree
        if (childElement.nodeType == 1)
            removeElementStyles(childElement);

        childElement = childElement.nextSibling;
    }
}
/**
 * Removes every SCRIPT element from the document, inline or external,
 * except external ones whose src contains "readability".
 */
function removeScripts()
{
    var scriptTags = document.getElementsByTagName("SCRIPT");

    // last-to-first, so removals never shift the entries still to visit
    for (var index = scriptTags.length - 1; index >= 0; index--)
    {
        var tag = scriptTags[index];
        var source = tag.src;

        // Readability's own scripts are the only ones allowed to stay
        var isReadabilityScript = source && source.indexOf("readability") != -1;
        if (isReadabilityScript)
            continue;

        tag.parentNode.removeChild(tag);
    }
}
/**
 * Gets rid of every STYLE element in the document. An element that has a
 * parent is removed outright; one without a parent has its CSS text cleared
 * instead.
 */
function removeStyles()
{
    var styleTags = document.getElementsByTagName("STYLE");

    for (var index = styleTags.length - 1; index >= 0; index--)
    {
        var tag = styleTags[index];

        // removing the tag completely is the preferred outcome
        if (tag.parentNode)
        {
            tag.parentNode.removeChild(tag);
            continue;
        }

        // can't remove it, so blank it out where textContent is in use
        if (tag.textContent)
        {
            tag.textContent = "";
            continue;
        }

        // most browsers support textContent but IE has its own way but it
        // seems that Firefox supports both, check link for last example
        // http://www.phpied.com/the-star-hack-in-ie8-and-dynamic-stylesheets/
        // note that if the style tag contains no text content, then
        // no styleSheet object is defined either
        if (tag.styleSheet)
            tag.styleSheet.cssText = "";
    }
}
/**
 * Disables every stylesheet of the document that has an href, unless that
 * href contains "readability". Stylesheets without an href are left alone.
 */
function removeStylesheets()
{
    // TODO: need to do more research, not sure if disabling is enough
    // for cross browser compatibility, might consider removal via parent
    // just as done in the removeScripts method, but will need to retrieve
    // all LINK tags and make sure rel attr is "stylesheet" or that its
    // type attr is "text/css"
    for (var index = document.styleSheets.length - 1; index >= 0; index--)
    {
        var sheet = document.styleSheets[index];

        if (!sheet.href)
            continue;

        if (sheet.href.lastIndexOf("readability") == -1)
            sheet.disabled = true;
    }
}
/**
 * Strips leading and trailing whitespace from a string. A falsy argument is
 * treated as the empty string.
 *
 * @param text The String to strip.
 *
 * @returns The String without its leading and trailing whitespace.
 */
function trim(text)
{
    var source = text || "";

    return source.replace(/^\s+|\s+$/g, "");
}