summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: umbrae <umbrae@d4e419ec-0920-11de-bbfd-a7c1bc4c261e> 2009-03-07 14:46:01 +0000
committer: umbrae <umbrae@d4e419ec-0920-11de-bbfd-a7c1bc4c261e> 2009-03-07 14:46:01 +0000
commit: af01903012c701583f73f5512855c0a27774f8a4 (patch)
tree: 01123a63f52f273ae391d39df213395ba9818ecb
parent: 999fe5ef1559a19a0ef21dfc34729581196e3ebc (diff)
downloadreadability-simple-af01903012c701583f73f5512855c0a27774f8a4.tar.bz2
readability-simple-af01903012c701583f73f5512855c0a27774f8a4.zip
Kill extra break tags in the filtered HTML. #8
git-svn-id: http://arc90labs-readability.googlecode.com/svn/trunk@13 d4e419ec-0920-11de-bbfd-a7c1bc4c261e
-rwxr-xr-xjs/readability-0.1.js24
1 files changed, 11 insertions, 13 deletions
diff --git a/js/readability-0.1.js b/js/readability-0.1.js
index 6c255e6..aade9b2 100755
--- a/js/readability-0.1.js
+++ b/js/readability-0.1.js
@@ -81,15 +81,6 @@ function grabArticle() {
/* Add points for any commas within this paragraph */
parentNode.readability.contentScore += getCharCount(allParagraphs[j]);
-
- /* The old way of determining fitness:
- var tempParas = allParagraphs[j].parentNode.getElementsByTagName("p");
-
- if ( tempParas.length > topDivCount && getCharCount(allParagraphs[j].parentNode) >= tempParas.length ) {
- topDivCount = tempParas.length;
- topDiv = allParagraphs[j].parentNode;
- }
- */
}
/* Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 */
@@ -122,7 +113,8 @@ function grabArticle() {
cleanStyles(topDiv); // Removes all style attributes
topDiv = killDivs(topDiv); // Goes in and removes DIV's that have more non <p> stuff than <p> stuff
-
+ topDiv = killBreaks(topDiv); // Removes any consecutive <br />'s into just one <br />
+
// Cleans out junk from the topDiv just in case:
topDiv = clean(topDiv, "form");
topDiv = clean(topDiv, "object");
@@ -174,8 +166,9 @@ function killDivs ( e ) {
var divsList = e.getElementsByTagName( "div" );
var curDivLength = divsList.length;
- // Gather counts for other typical elements embedded within :
- for (var i=0; i < curDivLength; i ++) {
+ // Gather counts for other typical elements embedded within.
+ // Traverse backwards so we can remove nodes at the same time without affecting the traversal.
+ for (var i=curDivLength-1; i >= 0; i--) {
var p = divsList[i].getElementsByTagName("p").length;
var img = divsList[i].getElementsByTagName("img").length;
var li = divsList[i].getElementsByTagName("li").length;
@@ -187,13 +180,18 @@ function killDivs ( e ) {
// And the number of non-paragraph elements is more than paragraphs
// or other ominous signs :
if ( img > p || li > p || a > p || p == 0 || embed > 0) {
- divsList[i].style.display = "none";
+ divsList[i].parentNode.removeChild(divsList[i]);
}
}
}
return e;
}
+/**
+ * Collapse every run of consecutive <br> tags (optionally separated by
+ * whitespace) in the element's markup into a single '<br />'.
+ * A lone <br> is also rewritten to '<br />', together with any whitespace after it.
+ *
+ * @param e object whose innerHTML string is read and overwritten in place
+ * @return the same object that was passed in
+ **/
+function killBreaks ( e ) {
+ // "g" is required: without it replace() only rewrites the first run it finds.
+ // "i" additionally matches upper-case <BR>, in case innerHTML serializes tags that way.
+ e.innerHTML = e.innerHTML.replace(/(<br\s*\/?>\s*)+/gi,'<br />');
+ return e;
+}
+
function clean(e, tags, minWords) {
var targetList = e.getElementsByTagName( tags );
minWords = minWords || 1000000;