diff options
author | umbrae <umbrae@d4e419ec-0920-11de-bbfd-a7c1bc4c261e> | 2009-03-07 14:46:01 +0000 |
---|---|---|
committer | umbrae <umbrae@d4e419ec-0920-11de-bbfd-a7c1bc4c261e> | 2009-03-07 14:46:01 +0000 |
commit | af01903012c701583f73f5512855c0a27774f8a4 (patch) | |
tree | 01123a63f52f273ae391d39df213395ba9818ecb /js | |
parent | 999fe5ef1559a19a0ef21dfc34729581196e3ebc (diff) | |
download | readability-simple-af01903012c701583f73f5512855c0a27774f8a4.tar.bz2 readability-simple-af01903012c701583f73f5512855c0a27774f8a4.zip |
Kill extra break tags in the filtered HTML. #8
git-svn-id: http://arc90labs-readability.googlecode.com/svn/trunk@13 d4e419ec-0920-11de-bbfd-a7c1bc4c261e
Diffstat (limited to 'js')
-rwxr-xr-x | js/readability-0.1.js | 24 |
1 files changed, 11 insertions, 13 deletions
diff --git a/js/readability-0.1.js b/js/readability-0.1.js index 6c255e6..aade9b2 100755 --- a/js/readability-0.1.js +++ b/js/readability-0.1.js @@ -81,15 +81,6 @@ function grabArticle() { /* Add points for any commas within this paragraph */
parentNode.readability.contentScore += getCharCount(allParagraphs[j]);
-
- /* The old way of determining fitness:
- var tempParas = allParagraphs[j].parentNode.getElementsByTagName("p");
-
- if ( tempParas.length > topDivCount && getCharCount(allParagraphs[j].parentNode) >= tempParas.length ) {
- topDivCount = tempParas.length;
- topDiv = allParagraphs[j].parentNode;
- }
- */
}
/* Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 */
@@ -122,7 +113,8 @@ function grabArticle() { cleanStyles(topDiv); // Removes all style attributes
topDiv = killDivs(topDiv); // Goes in and removes DIV's that have more non <p> stuff than <p> stuff
-
+ topDiv = killBreaks(topDiv); // Removes any consecutive <br />'s into just one <br />
+
// Cleans out junk from the topDiv just in case:
topDiv = clean(topDiv, "form");
topDiv = clean(topDiv, "object");
@@ -174,8 +166,9 @@ function killDivs ( e ) { var divsList = e.getElementsByTagName( "div" );
var curDivLength = divsList.length;
- // Gather counts for other typical elements embedded within :
- for (var i=0; i < curDivLength; i ++) {
+ // Gather counts for other typical elements embedded within.
+ // Traverse backwards so we can remove nodes at the same time without affecting the traversal.
+ for (var i=curDivLength-1; i >= 0; i--) {
var p = divsList[i].getElementsByTagName("p").length;
var img = divsList[i].getElementsByTagName("img").length;
var li = divsList[i].getElementsByTagName("li").length;
@@ -187,13 +180,18 @@ function killDivs ( e ) { // And the number of non-paragraph elements is more than paragraphs
// or other ominous signs :
if ( img > p || li > p || a > p || p == 0 || embed > 0) {
- divsList[i].style.display = "none";
+ divsList[i].parentNode.removeChild(divsList[i]);
}
}
}
return e;
}
+function killBreaks ( e ) {
+ e.innerHTML = e.innerHTML.replace(/(<br\s*\/?>\s*){1,}/,'<br />');
+ return e;
+}
+
function clean(e, tags, minWords) {
var targetList = e.getElementsByTagName( tags );
minWords = minWords || 1000000;
|