From b260a47d808ed044f5d12ec8920e946c581096e9 Mon Sep 17 00:00:00 2001
From: JJfutbol tags, and remove fonts
+ //var pattern = new RegExp(" ").replace(/<\/?font[^>]*>/gi, "");
+ document.body.innerHTML = document.body.innerHTML.replace(/ '); // first item
+ firstTime = false;
+ }
+
+ if (html.indexOf('\n\n') == html.lastIndexOf('\n\n'))
+ html = html.replace('\n\n', '
tags with
[ \r\n\s]*
", "gi");
+ //document.body.innerHTML = document.body.innerHTML.replace(pattern, "
\s*
/gi, "
'); // every item in between
+ }
+
+ document.body.innerHTML = html;
+ */
var articleContent = document.createElement("DIV");
- var articleTitle = document.createElement("H1");
- var articleFooter = document.createElement("DIV");
+ var paragraphs = document.getElementsByTagName("P");
+ var contentBlocks = [];
- // Replace all doubled-up
tags with
tags, and remove fonts.
- var pattern = new RegExp ("
[ \r\n\s]*
", "g");
- document.body.innerHTML = document.body.innerHTML.replace(pattern, "
").replace(/<\/?font[^>]*>/g, ''); - // Grab the title from the
's, commas, special classes, etc. - for (var j=0; j < allParagraphs.length; j++) { - parentNode = allParagraphs[j].parentNode; - - // Initialize readability data - if(typeof parentNode.readability == 'undefined') + + /* + // PRE based content parsing only! + // this was only an EXPERIMENT, need to be revisited + + var pres = document.getElementsByTagName("PRE"); + for (var i = 0; i < pres.length; i++) + { + var pre = pres[i]; + + var content = document.createElement("DIV"); + + var text = pre.textContent; + var firstTime = true; + + while (text.indexOf('\n\n') >= 0) { - parentNode.readability = {"contentScore": 0}; - - // Look for a special classname - if(parentNode.className.match(/(comment|meta|footer|footnote)/)) - parentNode.readability.contentScore -= 50; - else if(parentNode.className.match(/((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))/)) - parentNode.readability.contentScore += 25; - - // Look for a special ID - if(parentNode.id.match(/(comment|meta|footer|footnote)/)) - parentNode.readability.contentScore -= 50; - else if(parentNode.id.match(/^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/)) - parentNode.readability.contentScore += 25; + if (firstTime) + { + text = text.replace('\n\n', '
'); // first item + firstTime = false; + } + else + { + if (text.indexOf('\n\n') == text.lastIndexOf('\n\n')) + text = text.replace('\n\n', '
'); // last item + else + text = text.replace('\n\n', ''); // every item in between
+ }
+ }
+
+ content.innerHTML = text.replace(/={10,}/g, "====================");
+
+ paragraphs = content.getElementsByTagName("P");
+
+ var preElements = [];
+ for (var j = 0; j < paragraphs.length; j++)
+ {
+ p = paragraphs[j];
+
+ breaks = p.getElementsByTagName("BR");
+
+ if (p.innerHTML.indexOf("\t") == -1 && p.innerHTML.indexOf(" ") == -1 && breaks.length >= 1)
+ {
+ p.innerHTML = p.innerHTML.replace(/
/gi, " ");
+ }
+
+ console.log("tabs: " + p.innerHTML.split("\t").length + " -- " + p.innerHTML.split(/\s{2,}/g).length + " -- " + p.innerHTML.substr(0, 35))
+
+ numTabs = p.innerHTML.split("\t").length + p.innerHTML.split(/ {3,}/g).length;
+
+ if (numTabs > 3)
+ {
+ preElements.push(p);
+ }
+ }
+
+ for (var k = 0; k < preElements.length; k++)
+ {
+ var p = preElements[k];
+
+ var newPre = document.createElement("PRE");
+ newPre.innerHTML = p.innerHTML.replace(/
/gi, "\n");
+ newPre.className = "normalPre";
+
+ p.parentNode.replaceChild(newPre, p);
+ }
+
+ content.innerHTML = content.innerHTML.replace(/
[ \r\n\s]*
/gi, "
");
+
+ contentBlocks.push(content);
+ }
+ */
+
+ // No paragraphs were found (a badly structured page), so we fall back to
+ // parsing content from DIVs and set our malformedContent flag.
+ if (paragraphs.length == 0)
+ {
+ paragraphs = document.getElementsByTagName("DIV");
+
+ malformedContent = true;
+ }
+
+ for (var i = 0; i < paragraphs.length; i++)
+ {
+ var parentNode = paragraphs[i].parentNode;
+
+ // If the parent happens to be a form element, accessing properties such
+ // as id or className doesn't work (it attempts to access children instead),
+ // so we need to make sure we only deal with string values. If the parent
+ // element is the body, it is ignored as well.
+ if (parentNode.tagName.toLowerCase() == "body" || typeof parentNode.id != "string" || typeof parentNode.className != "string")
+ continue;
+
+ // initialize readability score data
+ if (typeof parentNode.readability == "undefined")
+ parentNode.readability = {"contentScore": 0};
+
+ parentNode.readability.contentScore = determineContentScore(parentNode.readability.contentScore, parentNode, paragraphs[i]);
+
+ // looks like we have possible content candidates, add it
+ if (parentNode.readability.contentScore > 0)
+ {
+ // DEBUG
+ console.log(parentNode.tagName + " id: " + parentNode.id + " -- class: " + parentNode.className + " -- score: " + parentNode.readability.contentScore);
+
+ // careful, only add parent element once!
+ if (contentBlocks.indexOf(parentNode) == -1)
+ contentBlocks.push(parentNode);
+ }
+ }
+
+ /*
+ // TODO: need to revisit parsing strictly tables/divs content only
+ if (contentBlocks.length == 0)
+ {
+ var paragraphs = document.getElementsByTagName("tbody");
+
+ for (var i = 0; i < paragraphs.length; i++)
+ {
+ var parentNode = paragraphs[i].parentNode;
+
+ // Initialize readability data
+ if (typeof parentNode.readability == "undefined")
+ {
+ parentNode.readability = {"contentScore": determineContentScore(parentNode, paragraphs[i])};
+
+ if (parentNode.readability.contentScore > 0)
+ {
+ console.log(parentNode.tagName + " id: " + parentNode.id + " -- class: " + parentNode.className + " -- score: " + parentNode.readability.contentScore);
+
+ if (contentBlocks.indexOf(parentNode) == -1)
+ contentBlocks.push(parentNode);
+ }
+ }
+ }
+ }
+ */
+
+ removeScripts();
+ removeStylesheets();
+ removeStyles();
+
+
+ // DEBUG
+ console.log("ContentBlocks: " + contentBlocks.length + " -- HighestScore: " + highestScore);
+
+
+ // remove all content elements that aren't of the highest score
+ var numContentBlocks = contentBlocks.length - 1;
+ for (var m = numContentBlocks; m >= 0; m--)
+ {
+ var contentElement = contentBlocks[m];
+
+
+ // DEBUG
+ //console.log("id: " + contentElement.id + " -- class: " + contentElement.className + " -- result: " + ((highestScore < 20 && contentElement.readability.contentScore < highestScore) || (contentElement.readability.contentScore < 20)).toString().toUpperCase());
+
+
+ // FIXME: had trouble writing the if/else if as a single if or statement
+ // FIXME: not sure the minimum score is correct, need to test against wide
+ // range of content, particularly content divided in 2+ containers
+
+ // sometimes our content won't reach such a high score, so here we look for an
+ // acceptable minimum: if our highest score didn't reach twenty, remove all
+ // but the highest
+ if (highestScore < 20 && contentElement.readability.contentScore < highestScore)
+ {
+ contentBlocks.splice(m, 1);
+ } //otherwise we only remove content blocks that have scored less than that minimum
+ else if (highestScore > 20 && contentElement.readability.contentScore < 20)
+ {
+ contentBlocks.splice(m, 1);
+ }
+ }
+
+
+ // with many content containers we need to verify that some aren't
+ // descendants of others, otherwise we'll output the same content more than once
+ if (contentBlocks.length > 1)
+ {
+ // remove all content elements that are descendants of another
+ var numContentBlocks = contentBlocks.length - 1;
+ for (var m = numContentBlocks; m >= 0; m--)
+ {
+ var contentElement = contentBlocks[m];
+
+ /**
+ * hasAnyAncestor should work better overall, but some sites have
+ * so many DIVs up the hierarchy with lots of good keywords that
+ * it's hard to keep those out. For those sites
+ * (http://www.azstarnet.com/news/290815) hasAnyDescendant works
+ * best, so we will need to consider changing this and QA heavily.
+ */
+ if (hasAnyDescendant(contentElement, contentBlocks))
+ contentBlocks.splice(m, 1);
}
+ }
+
+
+ // DEBUG
+ console.log("ContentBlocks: " + contentBlocks.length);
+
+
+ for (var m = 0; m < contentBlocks.length; m++)
+ {
+ var contentElement = contentBlocks[m];
+
+ removeElementStyles(contentElement);
+
+ // collapse any consecutive <br />'s into just one
+ removeBreaks(contentElement);
+
+ // this cleanup should only happen if paragraphs were found since
+ // malformed content suggests div's are used to maintain content
+ if (!malformedContent)
+ {
+ // goes in and removes DIV's that have more non
stuff than
stuff + removeNonContentElement(contentElement, "div"); + } + + //removeNonContentElement(contentElement, "ul"); + + // clean out anymore possible junk + removeElementByMinWords(contentElement, "form"); + removeElementByMinWords(contentElement, "object"); + removeElementByMinWords(contentElement, "table", 250); + removeElementByMinWords(contentElement, "h1"); + removeElementByMinWords(contentElement, "h2"); + removeElementByMinWords(contentElement, "iframe"); + + articleContent.appendChild(contentElement); + } + + // Readability has failed you.. show msg that content was not found + if (contentBlocks.length == 0) + { + articleContent = document.createElement("DIV"); + articleContent.innerHTML = 'Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please let us know by submitting an issue.'; + } + + return articleContent; +} - // Add a point for the paragraph found - if(getInnerText(allParagraphs[j]).length > 10) - parentNode.readability.contentScore++; - // Add points for any commas within this paragraph - parentNode.readability.contentScore += getCharCount(allParagraphs[j]); - } - // Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 - for(nodeIndex = 0; (node = document.getElementsByTagName('*')[nodeIndex]); nodeIndex++) - if(typeof node.readability != 'undefined' && (topDiv == null || node.readability.contentScore > topDiv.readability.contentScore)) - topDiv = node; +//-------------------------------------------------------------------------- +// +// ContentParserUtils +// +//-------------------------------------------------------------------------- - if(topDiv == null) +/** + * Removes any elements of the provided tag name from the specified element + * if it doesn't contain the minimum amount of words. + * + * @param element The element. + * @param tagName The tag name of the elements to be retrieved from within + * the provided element. 
+ * @param minWords The minimum number of words. + */ +function removeElementByMinWords(element, tagName, minWords) +{ + // default minimum if none is provided + minWords = minWords || 1000000; // FIXME: not sure why such a higher number! + + var elements = element.getElementsByTagName(tagName); + var numElements = elements.length - 1; + + for (var i = numElements; i >= 0; i--) { - topDiv = document.createElement('div'); - topDiv.innerHTML = 'Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please let us know by submitting an issue.'; + var target = elements[i]; + + // the text content doesn't meet our requirements so remove it + if (getWordCount(target) < minWords) + { + target.parentNode.removeChild(target); + } } +} + +/** + * Removes any instances of the provided non-content element from the + * specified root element if it passes a few tests. First, if a single + * bad keyword is found or second less than 25 words exist within. + * + * @param element The element. + * @param tagName The tag name of the elements to be retrieved from within + * the provided element. + */ +function removeNonContentElement(element, tagName) +{ + var elements = element.getElementsByTagName(tagName); + var numElements = elements.length - 1; - // REMOVES ALL STYLESHEETS ... 
- for (var k=0;k < document.styleSheets.length; k++) { - if (document.styleSheets[k].href != null && document.styleSheets[k].href.lastIndexOf("readability") == -1) { - document.styleSheets[k].disabled = true; + // gather counts for other typical elements embedded within and then traverse + // backwards so we can remove elements at the same time without effecting the traversal + for (var i = numElements; i >= 0; i--) + { + var descendant = elements[i]; + var p = descendant.getElementsByTagName("p").length; + var img = descendant.getElementsByTagName("img").length; + var li = descendant.getElementsByTagName("li").length; + var a = descendant.getElementsByTagName("a").length; + var embed = descendant.getElementsByTagName("embed").length; + + var badKeywords = ["ad", "captcha", "classified", "clear", "comment", "footer", "footnote", "leftcolumn", "listing", "menu", "meta", "module", "nav", "navbar", "rightcolumn", "sidebar", "sponsor", "toolbar", "tools", "trackback", "widget"]; + + // should improve this but for if the element has a single bad keyword remove it + for (var j = 0; j < badKeywords.length; j++) + { + if (descendant.id.toLowerCase().indexOf(badKeywords[j]) >= 0 || descendant.className.toLowerCase().indexOf(badKeywords[j]) >= 0) + { + descendant.parentNode.removeChild(descendant); + descendant = null; + break; + } + } + + // found a bad keyword so the element has been removed, continue to the next one + if (!descendant) + continue; + + // we have fewer than 25 words.. bad sign.. 
+ if (getWordCount(descendant) < 25) + { + // the number of non-paragraph elements is more than actual + // paragraphs or other ominous signs (:) and elements + if (img > p || li >= p || a >= p || p == 0 || embed > 0) + { + descendant.parentNode.removeChild(descendant); + } } } +} - // Remove all style tags in head (not doing this on IE) : - var styleTags = document.getElementsByTagName("style"); - for (var j=0;j < styleTags.length; j++) - if (navigator.appName != "Microsoft Internet Explorer") - styleTags[j].textContent = ""; +//-------------------------------------------------------------------------- +// +// ElementUtils +// +//-------------------------------------------------------------------------- + +/** + * Returns the word count for the specified element. + * + * @param element The element. + * + * @returns A count indicating the number of words + */ +function getWordCount(element) +{ + // normalize replaces consecutive spacing with a single space, + // by then triming, we can safely split on a space for a count + return trim(normalize(getText(element))).split(" ").length; +} - cleanStyles(topDiv); // Removes all style attributes - topDiv = killDivs(topDiv); // Goes in and removes DIV's that have more non
stuff than
stuff
- topDiv = killBreaks(topDiv); // Removes any consecutive
's into just one
+/**
+ * Returns the text of the specified element: textContent when defined, otherwise innerText.
+ *
+ * @param element The element from which to retrieve its text content.
+ *
+ * @returns The value of element.textContent, or element.innerText when textContent is undefined.
+ */
+function getText(element)
+{
+ return (typeof element.textContent != "undefined")
+ ? element.textContent
+ : element.innerText;
+}
- // Cleans out junk from the topDiv just in case:
- topDiv = clean(topDiv, "form");
- topDiv = clean(topDiv, "object");
- topDiv = clean(topDiv, "table", 250);
- topDiv = clean(topDiv, "h1");
- topDiv = clean(topDiv, "h2");
- topDiv = clean(topDiv, "iframe");
+/**
+ * Determines if the specified element has one of the provided array of
+ * ancestors and if so returns true.
+ *
+ * @param element The element.
+ * @param ancestors An array of possible ancestors.
+ *
+ * @returns True if the element has one of the provided ancestors,
+ * false if it does not.
+ */
+function hasAnyAncestor(element, ancestors)
+{
+ var parent = element.parentNode;
-
- // Add the footer and contents:
- articleFooter.id = "readFooter";
- articleFooter.innerHTML = "\
- \
-