diff options
author | JJfutbol <JJfutbol@d4e419ec-0920-11de-bbfd-a7c1bc4c261e> | 2009-07-21 22:33:23 +0000 |
---|---|---|
committer | JJfutbol <JJfutbol@d4e419ec-0920-11de-bbfd-a7c1bc4c261e> | 2009-07-21 22:33:23 +0000 |
commit | 658a366744f1deabe625988a46ae108e116b62a7 (patch) | |
tree | 9548fdcf89afd91eda6e83b63197c23ec7316c48 /js | |
parent | a92b5ba9239c324ec4f286ed8bdd931d4ba30ff7 (diff) | |
download | readability-simple-658a366744f1deabe625988a46ae108e116b62a7.tar.bz2 readability-simple-658a366744f1deabe625988a46ae108e116b62a7.zip |
Added a score increment for elements whose word count is greater than 30 when parsing content. It is commented out for now; testing will continue against various sites.
git-svn-id: http://arc90labs-readability.googlecode.com/svn/trunk@71 d4e419ec-0920-11de-bbfd-a7c1bc4c261e
Diffstat (limited to 'js')
-rwxr-xr-x | js/readability.js | 7 |
1 files changed, 5 insertions, 2 deletions
diff --git a/js/readability.js b/js/readability.js
index 57eb240..cde729d 100755
--- a/js/readability.js
+++ b/js/readability.js
@@ -129,6 +129,9 @@ function determineContentScore(score, parent, element)
if (element.tagName.toLowerCase() == "p" && getWordCount(element) > 20) //|| (score == 0 && getText(element).length > 10))
score++;
+ //if (getWordCount(element) > 30)
+ // score++;
+
// FIXME: not sure yet if this will be included, this would break
// pages that use multiple containers for content, or we could tweak
// the acceptable minimum... but that would have to be set quite
@@ -148,8 +151,8 @@ function determineContentScore(score, parent, element)
function parseContent() {
// replace all doubled-up <BR> tags with <P> tags, and remove inline fonts
- document.body.innerHTML = document.body.innerHTML.replace(/<br[^>]*>\s*<br[^>]*>/gi, "<p />").replace(/<\/?font[^>]*>/gi, "");
-
+ document.body.innerHTML = document.body.innerHTML.replace(/<br[^>]*>\s| *<br[^>]*>/gi, "<p />").replace(/<\/?font[^>]*>/gi, "");
+ console.log(document.body.innerHTML.substring(0, 600));
var articleContent = document.createElement("DIV");
var paragraphs = document.getElementsByTagName("P");
var contentBlocks = [];
|