From 1c6c114ee9c716f3dba8e31ec281e556ca99a08a Mon Sep 17 00:00:00 2001 From: umbrae Date: Thu, 5 Mar 2009 15:45:51 +0000 Subject: Adding the 0.1 source to trunk git-svn-id: http://arc90labs-readability.googlecode.com/svn/trunk@2 d4e419ec-0920-11de-bbfd-a7c1bc4c261e --- js/readability-0.1.js | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100755 js/readability-0.1.js (limited to 'js/readability-0.1.js') diff --git a/js/readability-0.1.js b/js/readability-0.1.js new file mode 100755 index 0000000..5b153df --- /dev/null +++ b/js/readability-0.1.js @@ -0,0 +1,149 @@ +(function(){ + var objOverlay = document.createElement("div"); + var objinnerDiv = document.createElement("div"); + + objOverlay.id = "readOverlay"; + objinnerDiv.id = "readInner"; + + // Apply user-selected styling: + document.body.className = readStyle; + objOverlay.className = readStyle; + objinnerDiv.className = readMargin + " " + readSize; + + objinnerDiv.appendChild(grabArticle()); // Get the article and place it inside the inner Div + objOverlay.appendChild(objinnerDiv); // Insert the inner div into the overlay + + // This removes everything else on the page. Requires a page refresh to undo it. + // I tried the damn overlay on top - but had rendering issues: + document.body.innerHTML = ""; + + // Inserts the new content : + document.body.insertBefore(objOverlay, document.body.firstChild); +})() + +function grabArticle() { + var allParagraphs = document.getElementsByTagName("p"); + var topDivCount = 0; + var topDiv; + var topDivParas; + + var articleContent = document.createElement("DIV"); + var articleTitle = document.createElement("H1"); + var articleFooter = document.createElement("DIV"); + + // Replace all doubled-up
tags with

tags : + var pattern = new RegExp ("
[ \r\n\s]*
", "g"); + document.body.innerHTML = document.body.innerHTML.replace(pattern, "

"); + + // Grab the title from the tag and inject it as the title. + articleTitle.innerHTML = document.title; + articleContent.appendChild(articleTitle); + + // Study all the paragraphs and find the chunk that has the most <p>'s and keep it: + for (var j=0; j < allParagraphs.length; j++) { + var tempParas = allParagraphs[j].parentNode.getElementsByTagName("p"); + + if ( tempParas.length > topDivCount && getCharCount(allParagraphs[j].parentNode) >= tempParas.length ) { + topDivCount = tempParas.length; + topDiv = allParagraphs[j].parentNode; + } + } + + // REMOVES ALL STYLESHEETS ... + for (var k=0;k < document.styleSheets.length; k++) { + if (document.styleSheets[k].href != null && document.styleSheets[k].href.lastIndexOf("arc90.com/experiments/readability") == -1) { + document.styleSheets[k].disabled = true; + } + } + // Remove all style tags in head (not doing this on IE) : + var styleTags = document.getElementsByTagName("style"); + for (var j=0;j < styleTags.length; j++) { + if (navigator.appName != "Microsoft Internet Explorer") { + styleTags[j].textContent = ""; + } + } + + cleanStyles(topDiv); // Removes all style attributes + topDiv = killDivs(topDiv); // Goes in and removes DIV's that have more non <p> stuff than <p> stuff + + // Cleans out junk from the topDiv just in case: + topDiv = clean(topDiv, "form"); + topDiv = clean(topDiv, "object"); + topDiv = clean(topDiv, "table"); + topDiv = clean(topDiv, "h1"); + topDiv = clean(topDiv, "h2"); + topDiv = clean(topDiv, "iframe"); + + // Add the footer and contents: + articleFooter.id = "readFooter"; + articleFooter.innerHTML = "<a href='http://www.arc90.com'><img src='http://lab.arc90.com/experiments/readability/images/footer.png'></a>"; + + articleContent.appendChild(topDiv); + articleContent.appendChild(articleFooter); + + return articleContent; +} + +// Get character count +function getCharCount ( e,s ) { + s = s || ","; + if (navigator.appName == "Microsoft Internet Explorer") { + return parentContent = e.innerText.split(',').length; + } + else { + return parentContent = e.textContent.split(',').length; + } +} + +function cleanStyles( e ) { + e = e || document; + var cur = e.firstChild; + + // Go until there are no more child nodes + while ( cur != null ) { + if ( cur.nodeType == 1 ) { + // Remove style attribute(s) : + cur.removeAttribute("style"); + cleanStyles( cur ); + } + cur = cur.nextSibling; + } +} + +function killDivs ( e ) { + var divsList = e.getElementsByTagName( "div" ); + var curDivLength = divsList.length; + + // Gather counts for other typical elements embedded within : + for (var i=0; i < curDivLength; i ++) { + var p = divsList[i].getElementsByTagName("p").length; + var img = divsList[i].getElementsByTagName("img").length; + var li = divsList[i].getElementsByTagName("li").length; + var a = divsList[i].getElementsByTagName("a").length; + var embed = divsList[i].getElementsByTagName("embed").length; + + // If the number of commas is less than 10 (bad sign) ... + if ( getCharCount(divsList[i]) < 10) { + // And the number of non-paragraph elements is more than paragraphs + // or other ominous signs : + if ( img > p || li > p || a > p || p == 0 || embed > 0) { + divsList[i].style.display = "none"; + } + } + } + return e; +} + +function clean(e, tags, minWords) { + var targetList = e.getElementsByTagName( tags ); + minWords = minWords || 1000000; + + for (var y=0; y < targetList.length; y++) { + // If the text content isn't laden with words, remove the child: + if (getCharCount(targetList[y], " ") < minWords) { + targetList[y].parentNode.removeChild(targetList[y]); + } + } + return e; +} + -- cgit v1.2.3