From d0743842804a6a8b853d45d81082f3c50f4d8fef Mon Sep 17 00:00:00 2001 From: umbrae Date: Mon, 9 Mar 2009 15:47:58 +0000 Subject: Removing debug messages, formatting comments to standard, moving to readability.js git-svn-id: http://arc90labs-readability.googlecode.com/svn/trunk@17 d4e419ec-0920-11de-bbfd-a7c1bc4c261e --- js/readability.js | 193 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100755 js/readability.js (limited to 'js/readability.js') diff --git a/js/readability.js b/js/readability.js new file mode 100755 index 0000000..1c92e3f --- /dev/null +++ b/js/readability.js @@ -0,0 +1,193 @@ +(function(){ + var objOverlay = document.createElement("div"); + var objinnerDiv = document.createElement("div"); + + objOverlay.id = "readOverlay"; + objinnerDiv.id = "readInner"; + + // Apply user-selected styling: + document.body.className = readStyle; + objOverlay.className = readStyle; + objinnerDiv.className = readMargin + " " + readSize; + + objinnerDiv.appendChild(grabArticle()); // Get the article and place it inside the inner Div + objOverlay.appendChild(objinnerDiv); // Insert the inner div into the overlay + + // For totally hosed HTML, add body node that can't be found because of bad HTML or something. + if(document.body == null) + { + body = document.createElement("body"); + document.body = body; + } + + // This removes everything else on the page. Requires a page refresh to undo it. + // I tried the damn overlay on top - but had rendering issues: + document.body.innerHTML = ""; + + // Inserts the new content : + document.body.insertBefore(objOverlay, document.body.firstChild); +})() + +function grabArticle() { + var allParagraphs = document.getElementsByTagName("p"); + var topDivCount = 0; + var topDiv = null; + var topDivParas; + + var articleContent = document.createElement("DIV"); + var articleTitle = document.createElement("H1"); + var articleFooter = document.createElement("DIV"); + + // Replace all doubled-up
tags with

tags, and remove fonts. + var pattern = new RegExp ("
[ \r\n\s]*
", "g"); + document.body.innerHTML = document.body.innerHTML.replace(pattern, "

").replace(/<\/?font[^>]*>/g, ''); + + // Grab the title from the tag and inject it as the title. + articleTitle.innerHTML = document.title; + articleContent.appendChild(articleTitle); + + // Study all the paragraphs and find the chunk that has the best score. + // A score is determined by things like: Number of <p>'s, commas, special classes, etc. + for (var j=0; j < allParagraphs.length; j++) { + parentNode = allParagraphs[j].parentNode; + + // Initialize readability data + if(typeof parentNode.readability == 'undefined') + { + parentNode.readability = {"contentScore": 0}; + + // Look for a special classname + if(parentNode.className.match(/(comment|meta)/)) + parentNode.readability.contentScore -= 50; + else if(parentNode.className.match(/(hentry|entry[-]?(content|text|body)|article[-]?(content|text|body))/)) + parentNode.readability.contentScore += 25; + + // Look for a special ID + if(parentNode.id.match(/(comment|meta)/)) + parentNode.readability.contentScore -= 50; + else if(parentNode.id.match(/(hentry|entry[-]?(content|text)|article[-]?(text|content))/)) + parentNode.readability.contentScore += 25; + } + + // Add a point for the paragraph found + if(getInnerText(allParagraphs[j]).length > 10) + parentNode.readability.contentScore++; + + // Add points for any commas within this paragraph + parentNode.readability.contentScore += getCharCount(allParagraphs[j]); + } + + // Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 + for(nodeIndex = 0; (node = document.getElementsByTagName('*')[nodeIndex]); nodeIndex++) + if(typeof node.readability != 'undefined' && (topDiv == null || node.readability.contentScore > topDiv.readability.contentScore)) + topDiv = node; + + // REMOVES ALL STYLESHEETS ... + for (var k=0;k < document.styleSheets.length; k++) + if (document.styleSheets[k].href != null && document.styleSheets[k].href.lastIndexOf("readability") == -1) + document.styleSheets[k].disabled = true; + + // Remove all style tags in head (not doing this on IE) : + var styleTags = document.getElementsByTagName("style"); + for (var j=0;j < styleTags.length; j++) + if (navigator.appName != "Microsoft Internet Explorer") + styleTags[j].textContent = ""; + + cleanStyles(topDiv); // Removes all style attributes + topDiv = killDivs(topDiv); // Goes in and removes DIV's that have more non <p> stuff than <p> stuff + topDiv = killBreaks(topDiv); // Removes any consecutive <br />'s into just one <br /> + + // Cleans out junk from the topDiv just in case: + topDiv = clean(topDiv, "form"); + topDiv = clean(topDiv, "object"); + topDiv = clean(topDiv, "table", 250); + topDiv = clean(topDiv, "h1"); + topDiv = clean(topDiv, "h2"); + topDiv = clean(topDiv, "iframe"); + + // Add the footer and contents: + articleFooter.id = "readFooter"; + articleFooter.innerHTML = "<a href='http://www.arc90.com'><img src='http://lab.arc90.com/experiments/readability/images/footer.png'></a>"; + + articleContent.appendChild(topDiv); + articleContent.appendChild(articleFooter); + + return articleContent; +} + +// Get the inner text of a node - cross browser compatibly. +function getInnerText(e) +{ + if (navigator.appName == "Microsoft Internet Explorer") + return e.innerText; + else + return e.textContent; +} + +// Get character count +function getCharCount ( e,s ) { + s = s || ","; + return getInnerText(e).split(s).length; +} + +function cleanStyles( e ) { + e = e || document; + var cur = e.firstChild; + + // Remove any root styles + e.removeAttribute('style'); + + // Go until there are no more child nodes + while ( cur != null ) { + if ( cur.nodeType == 1 ) { + // Remove style attribute(s) : + cur.removeAttribute("style"); + cleanStyles( cur ); + } + cur = cur.nextSibling; + } +} + +function killDivs ( e ) { + var divsList = e.getElementsByTagName( "div" ); + var curDivLength = divsList.length; + + // Gather counts for other typical elements embedded within. + // Traverse backwards so we can remove nodes at the same time without effecting the traversal. + for (var i=curDivLength-1; i >= 0; i--) { + var p = divsList[i].getElementsByTagName("p").length; + var img = divsList[i].getElementsByTagName("img").length; + var li = divsList[i].getElementsByTagName("li").length; + var a = divsList[i].getElementsByTagName("a").length; + var embed = divsList[i].getElementsByTagName("embed").length; + + // If the number of commas is less than 10 (bad sign) ... + if ( getCharCount(divsList[i]) < 10) { + // And the number of non-paragraph elements is more than paragraphs + // or other ominous signs : + if ( img > p || li > p || a > p || p == 0 || embed > 0) { + divsList[i].parentNode.removeChild(divsList[i]); + } + } + } + return e; +} + +function killBreaks ( e ) { + e.innerHTML = e.innerHTML.replace(/(<br\s*\/?>(\s| ?)*){1,}/g,'<br />'); + return e; +} + +function clean(e, tags, minWords) { + var targetList = e.getElementsByTagName( tags ); + minWords = minWords || 1000000; + + for (var y=0; y < targetList.length; y++) { + // If the text content isn't laden with words, remove the child: + if (getCharCount(targetList[y], " ") < minWords) { + targetList[y].parentNode.removeChild(targetList[y]); + } + } + return e; +} + -- cgit v1.2.3