From d0743842804a6a8b853d45d81082f3c50f4d8fef Mon Sep 17 00:00:00 2001 From: umbrae Date: Mon, 9 Mar 2009 15:47:58 +0000 Subject: Removing debug messages, formatting comments to standard, moving to readability.js git-svn-id: http://arc90labs-readability.googlecode.com/svn/trunk@17 d4e419ec-0920-11de-bbfd-a7c1bc4c261e --- js/readability-0.1.js | 213 ------------------------------------------------ js/readability-start.js | 2 +- js/readability.js | 193 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 194 insertions(+), 214 deletions(-) delete mode 100755 js/readability-0.1.js create mode 100755 js/readability.js diff --git a/js/readability-0.1.js b/js/readability-0.1.js deleted file mode 100755 index 7250433..0000000 --- a/js/readability-0.1.js +++ /dev/null @@ -1,213 +0,0 @@ -(function(){ - var objOverlay = document.createElement("div"); - var objinnerDiv = document.createElement("div"); - - objOverlay.id = "readOverlay"; - objinnerDiv.id = "readInner"; - - // Apply user-selected styling: - document.body.className = readStyle; - objOverlay.className = readStyle; - objinnerDiv.className = readMargin + " " + readSize; - - objinnerDiv.appendChild(grabArticle()); // Get the article and place it inside the inner Div - objOverlay.appendChild(objinnerDiv); // Insert the inner div into the overlay - - // For totally hosed HTML, add body node that can't be found because of bad HTML or something. - if(document.body == null) - { - body = document.createElement("body"); - document.body = body; - } - - // This removes everything else on the page. Requires a page refresh to undo it. - // I tried the damn overlay on top - but had rendering issues: - document.body.innerHTML = ""; - - // Inserts the new content : - document.body.insertBefore(objOverlay, document.body.firstChild); -})() - -/* Remove this and any dbg calls before release to bring down file size. */ -function dbg(text) -{ - if(typeof console != 'undefined') - console.log(text); -} - -function grabArticle() { - var allParagraphs = document.getElementsByTagName("p"); - var topDivCount = 0; - var topDiv = null; - var topDivParas; - - var articleContent = document.createElement("DIV"); - var articleTitle = document.createElement("H1"); - var articleFooter = document.createElement("DIV"); - - // Replace all doubled-up
tags with

tags, and remove fonts. - var pattern = new RegExp ("
[ \r\n\s]*
", "g"); - document.body.innerHTML = document.body.innerHTML.replace(pattern, "

").replace(/<\/?font[^>]*>/g, ''); - - // Grab the title from the tag and inject it as the title. - articleTitle.innerHTML = document.title; - articleContent.appendChild(articleTitle); - - // Study all the paragraphs and find the chunk that has the best score. - // A score is determined by things like: Number of <p>'s, commas, special classes, etc. - for (var j=0; j < allParagraphs.length; j++) { - parentNode = allParagraphs[j].parentNode; - - /* Initialize readability data */ - if(typeof parentNode.readability == 'undefined') - { - parentNode.readability = {"contentScore": 0}; - - // Look for a special classname - if(parentNode.className.match(/(comment|meta)/)) - parentNode.readability.contentScore -= 50; - else if(parentNode.className.match(/(hentry|entry[-]?(content|text|body)|article[-]?(content|text|body))/)) - parentNode.readability.contentScore += 25; - - // Look for a special ID - if(parentNode.id.match(/(comment|meta)/)) - parentNode.readability.contentScore -= 50; - else if(parentNode.id.match(/(hentry|entry[-]?(content|text)|article[-]?(text|content))/)) - parentNode.readability.contentScore += 25; - } - - /* Add a point for the paragraph found */ - if(getInnerText(allParagraphs[j]).length > 10) - parentNode.readability.contentScore++; - - /* Add points for any commas within this paragraph */ - dbg("Current paragraph has " + getCharCount(allParagraphs[j]) + " commas."); - parentNode.readability.contentScore += getCharCount(allParagraphs[j]); - } - - /* Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 */ - for(nodeIndex = 0; (node = document.getElementsByTagName('*')[nodeIndex]); nodeIndex++) - { - if(typeof node.readability != 'undefined') - { - dbg('Found a node with a content score of ' + node.readability.contentScore); - if(topDiv == null || node.readability.contentScore > topDiv.readability.contentScore) - { - dbg('Found a more fit node. Setting topDiv.' + node.className); - topDiv = node; - } - } - } - - // REMOVES ALL STYLESHEETS ... - for (var k=0;k < document.styleSheets.length; k++) { - if (document.styleSheets[k].href != null && document.styleSheets[k].href.lastIndexOf("readability") == -1) { - document.styleSheets[k].disabled = true; - } - } - // Remove all style tags in head (not doing this on IE) : - var styleTags = document.getElementsByTagName("style"); - for (var j=0;j < styleTags.length; j++) { - if (navigator.appName != "Microsoft Internet Explorer") { - styleTags[j].textContent = ""; - } - } - - cleanStyles(topDiv); // Removes all style attributes - topDiv = killDivs(topDiv); // Goes in and removes DIV's that have more non <p> stuff than <p> stuff - topDiv = killBreaks(topDiv); // Removes any consecutive <br />'s into just one <br /> - - // Cleans out junk from the topDiv just in case: - topDiv = clean(topDiv, "form"); - topDiv = clean(topDiv, "object"); - topDiv = clean(topDiv, "table", 250); - topDiv = clean(topDiv, "h1"); - topDiv = clean(topDiv, "h2"); - topDiv = clean(topDiv, "iframe"); - - // Add the footer and contents: - articleFooter.id = "readFooter"; - articleFooter.innerHTML = "<a href='http://www.arc90.com'><img src='http://lab.arc90.com/experiments/readability/images/footer.png'></a>"; - - articleContent.appendChild(topDiv); - articleContent.appendChild(articleFooter); - - return articleContent; -} - -// Get the inner text of a node - cross browser compatibly. -function getInnerText(e) -{ - if (navigator.appName == "Microsoft Internet Explorer") - return e.innerText; - else - return e.textContent; -} - -// Get character count -function getCharCount ( e,s ) { - s = s || ","; - return getInnerText(e).split(s).length; -} - -function cleanStyles( e ) { - e = e || document; - var cur = e.firstChild; - - // Remove any root styles - e.removeAttribute('style'); - - // Go until there are no more child nodes - while ( cur != null ) { - if ( cur.nodeType == 1 ) { - // Remove style attribute(s) : - cur.removeAttribute("style"); - cleanStyles( cur ); - } - cur = cur.nextSibling; - } -} - -function killDivs ( e ) { - var divsList = e.getElementsByTagName( "div" ); - var curDivLength = divsList.length; - - // Gather counts for other typical elements embedded within. - // Traverse backwards so we can remove nodes at the same time without effecting the traversal. - for (var i=curDivLength-1; i >= 0; i--) { - var p = divsList[i].getElementsByTagName("p").length; - var img = divsList[i].getElementsByTagName("img").length; - var li = divsList[i].getElementsByTagName("li").length; - var a = divsList[i].getElementsByTagName("a").length; - var embed = divsList[i].getElementsByTagName("embed").length; - - // If the number of commas is less than 10 (bad sign) ... - if ( getCharCount(divsList[i]) < 10) { - // And the number of non-paragraph elements is more than paragraphs - // or other ominous signs : - if ( img > p || li > p || a > p || p == 0 || embed > 0) { - divsList[i].parentNode.removeChild(divsList[i]); - } - } - } - return e; -} - -function killBreaks ( e ) { - e.innerHTML = e.innerHTML.replace(/(<br\s*\/?>(\s| ?)*){1,}/g,'<br />'); - return e; -} - -function clean(e, tags, minWords) { - var targetList = e.getElementsByTagName( tags ); - minWords = minWords || 1000000; - - for (var y=0; y < targetList.length; y++) { - // If the text content isn't laden with words, remove the child: - if (getCharCount(targetList[y], " ") < minWords) { - targetList[y].parentNode.removeChild(targetList[y]); - } - } - return e; -} - diff --git a/js/readability-start.js b/js/readability-start.js index 834984e..ae90443 100755 --- a/js/readability-start.js +++ b/js/readability-start.js @@ -6,7 +6,7 @@ var baseHref = window.location.toString().match(/.*\//); var linkStringStart = "javascript:(function(){"; -var linkStringEnd = "';_readability_script=document.createElement('SCRIPT');_readability_script.type='text/javascript';_readability_script.src='" + baseHref + "js/readability-0.1.js?x='+(Math.random());document.getElementsByTagName('head')[0].appendChild(_readability_script);_readability_css=document.createElement('LINK');_readability_css.rel='stylesheet';_readability_css.href='" + baseHref + "css/readability.css';_readability_css.type='text/css';_readability_css.media='screen';document.getElementsByTagName('head')[0].appendChild(_readability_css);_readability_print_css=document.createElement('LINK');_readability_print_css.rel='stylesheet';_readability_print_css.href='" + baseHref + "css/readability-print.css';_readability_print_css.media='print';_readability_print_css.type='text/css';document.getElementsByTagName('head')[0].appendChild(_readability_print_css);})();"; +var linkStringEnd = "';_readability_script=document.createElement('SCRIPT');_readability_script.type='text/javascript';_readability_script.src='" + baseHref + "js/readability.js?x='+(Math.random());document.getElementsByTagName('head')[0].appendChild(_readability_script);_readability_css=document.createElement('LINK');_readability_css.rel='stylesheet';_readability_css.href='" + baseHref + "css/readability.css';_readability_css.type='text/css';_readability_css.media='screen';document.getElementsByTagName('head')[0].appendChild(_readability_css);_readability_print_css=document.createElement('LINK');_readability_print_css.rel='stylesheet';_readability_print_css.href='" + baseHref + "css/readability-print.css';_readability_print_css.media='print';_readability_print_css.type='text/css';document.getElementsByTagName('head')[0].appendChild(_readability_print_css);})();"; $(document).ready(function() { diff --git a/js/readability.js b/js/readability.js new file mode 100755 index 0000000..1c92e3f --- /dev/null +++ b/js/readability.js @@ -0,0 +1,193 @@ +(function(){ + var objOverlay = document.createElement("div"); + var objinnerDiv = document.createElement("div"); + + objOverlay.id = "readOverlay"; + objinnerDiv.id = "readInner"; + + // Apply user-selected styling: + document.body.className = readStyle; + objOverlay.className = readStyle; + objinnerDiv.className = readMargin + " " + readSize; + + objinnerDiv.appendChild(grabArticle()); // Get the article and place it inside the inner Div + objOverlay.appendChild(objinnerDiv); // Insert the inner div into the overlay + + // For totally hosed HTML, add body node that can't be found because of bad HTML or something. + if(document.body == null) + { + body = document.createElement("body"); + document.body = body; + } + + // This removes everything else on the page. Requires a page refresh to undo it. + // I tried the damn overlay on top - but had rendering issues: + document.body.innerHTML = ""; + + // Inserts the new content : + document.body.insertBefore(objOverlay, document.body.firstChild); +})() + +function grabArticle() { + var allParagraphs = document.getElementsByTagName("p"); + var topDivCount = 0; + var topDiv = null; + var topDivParas; + + var articleContent = document.createElement("DIV"); + var articleTitle = document.createElement("H1"); + var articleFooter = document.createElement("DIV"); + + // Replace all doubled-up <BR> tags with <P> tags, and remove fonts. + var pattern = new RegExp ("<br/?>[ \r\n\s]*<br/?>", "g"); + document.body.innerHTML = document.body.innerHTML.replace(pattern, "</p><p>").replace(/<\/?font[^>]*>/g, ''); + + // Grab the title from the <title> tag and inject it as the title. + articleTitle.innerHTML = document.title; + articleContent.appendChild(articleTitle); + + // Study all the paragraphs and find the chunk that has the best score. + // A score is determined by things like: Number of <p>'s, commas, special classes, etc. + for (var j=0; j < allParagraphs.length; j++) { + parentNode = allParagraphs[j].parentNode; + + // Initialize readability data + if(typeof parentNode.readability == 'undefined') + { + parentNode.readability = {"contentScore": 0}; + + // Look for a special classname + if(parentNode.className.match(/(comment|meta)/)) + parentNode.readability.contentScore -= 50; + else if(parentNode.className.match(/(hentry|entry[-]?(content|text|body)|article[-]?(content|text|body))/)) + parentNode.readability.contentScore += 25; + + // Look for a special ID + if(parentNode.id.match(/(comment|meta)/)) + parentNode.readability.contentScore -= 50; + else if(parentNode.id.match(/(hentry|entry[-]?(content|text)|article[-]?(text|content))/)) + parentNode.readability.contentScore += 25; + } + + // Add a point for the paragraph found + if(getInnerText(allParagraphs[j]).length > 10) + parentNode.readability.contentScore++; + + // Add points for any commas within this paragraph + parentNode.readability.contentScore += getCharCount(allParagraphs[j]); + } + + // Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 + for(nodeIndex = 0; (node = document.getElementsByTagName('*')[nodeIndex]); nodeIndex++) + if(typeof node.readability != 'undefined' && (topDiv == null || node.readability.contentScore > topDiv.readability.contentScore)) + topDiv = node; + + // REMOVES ALL STYLESHEETS ... + for (var k=0;k < document.styleSheets.length; k++) + if (document.styleSheets[k].href != null && document.styleSheets[k].href.lastIndexOf("readability") == -1) + document.styleSheets[k].disabled = true; + + // Remove all style tags in head (not doing this on IE) : + var styleTags = document.getElementsByTagName("style"); + for (var j=0;j < styleTags.length; j++) + if (navigator.appName != "Microsoft Internet Explorer") + styleTags[j].textContent = ""; + + cleanStyles(topDiv); // Removes all style attributes + topDiv = killDivs(topDiv); // Goes in and removes DIV's that have more non <p> stuff than <p> stuff + topDiv = killBreaks(topDiv); // Removes any consecutive <br />'s into just one <br /> + + // Cleans out junk from the topDiv just in case: + topDiv = clean(topDiv, "form"); + topDiv = clean(topDiv, "object"); + topDiv = clean(topDiv, "table", 250); + topDiv = clean(topDiv, "h1"); + topDiv = clean(topDiv, "h2"); + topDiv = clean(topDiv, "iframe"); + + // Add the footer and contents: + articleFooter.id = "readFooter"; + articleFooter.innerHTML = "<a href='http://www.arc90.com'><img src='http://lab.arc90.com/experiments/readability/images/footer.png'></a>"; + + articleContent.appendChild(topDiv); + articleContent.appendChild(articleFooter); + + return articleContent; +} + +// Get the inner text of a node - cross browser compatibly. +function getInnerText(e) +{ + if (navigator.appName == "Microsoft Internet Explorer") + return e.innerText; + else + return e.textContent; +} + +// Get character count +function getCharCount ( e,s ) { + s = s || ","; + return getInnerText(e).split(s).length; +} + +function cleanStyles( e ) { + e = e || document; + var cur = e.firstChild; + + // Remove any root styles + e.removeAttribute('style'); + + // Go until there are no more child nodes + while ( cur != null ) { + if ( cur.nodeType == 1 ) { + // Remove style attribute(s) : + cur.removeAttribute("style"); + cleanStyles( cur ); + } + cur = cur.nextSibling; + } +} + +function killDivs ( e ) { + var divsList = e.getElementsByTagName( "div" ); + var curDivLength = divsList.length; + + // Gather counts for other typical elements embedded within. + // Traverse backwards so we can remove nodes at the same time without effecting the traversal. + for (var i=curDivLength-1; i >= 0; i--) { + var p = divsList[i].getElementsByTagName("p").length; + var img = divsList[i].getElementsByTagName("img").length; + var li = divsList[i].getElementsByTagName("li").length; + var a = divsList[i].getElementsByTagName("a").length; + var embed = divsList[i].getElementsByTagName("embed").length; + + // If the number of commas is less than 10 (bad sign) ... + if ( getCharCount(divsList[i]) < 10) { + // And the number of non-paragraph elements is more than paragraphs + // or other ominous signs : + if ( img > p || li > p || a > p || p == 0 || embed > 0) { + divsList[i].parentNode.removeChild(divsList[i]); + } + } + } + return e; +} + +function killBreaks ( e ) { + e.innerHTML = e.innerHTML.replace(/(<br\s*\/?>(\s| ?)*){1,}/g,'<br />'); + return e; +} + +function clean(e, tags, minWords) { + var targetList = e.getElementsByTagName( tags ); + minWords = minWords || 1000000; + + for (var y=0; y < targetList.length; y++) { + // If the text content isn't laden with words, remove the child: + if (getCharCount(targetList[y], " ") < minWords) { + targetList[y].parentNode.removeChild(targetList[y]); + } + } + return e; +} + -- cgit v1.2.3