mirror of
https://github.com/laurent22/joplin.git
synced 2024-12-24 10:27:10 +02:00
Clipper: Improved: Updated Readability library to improve Simplified Page clipping
This commit is contained in:
parent
68268cb35d
commit
356f8e580b
@ -1,3 +1,5 @@
|
||||
// https://github.com/mozilla/readability/tree/814f0a3884350b6f1adfdebb79ca3599e9806605
|
||||
|
||||
/*eslint-env es6:false*/
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
@ -496,17 +498,9 @@
|
||||
},
|
||||
setValue: function(newValue) {
|
||||
this._value = newValue;
|
||||
delete this._decodedValue;
|
||||
},
|
||||
setDecodedValue: function(newValue) {
|
||||
this._value = encodeHTML(newValue);
|
||||
this._decodedValue = newValue;
|
||||
},
|
||||
getDecodedValue: function() {
|
||||
if (typeof this._decodedValue === "undefined") {
|
||||
this._decodedValue = (this._value && decodeHTML(this._value)) || "";
|
||||
}
|
||||
return this._decodedValue;
|
||||
getEncodedValue: function() {
|
||||
return encodeHTML(this._value);
|
||||
},
|
||||
};
|
||||
|
||||
@ -611,6 +605,13 @@
|
||||
};
|
||||
|
||||
var Element = function (tag) {
|
||||
// We use this to find the closing tag.
|
||||
this._matchingTag = tag;
|
||||
// We're explicitly a non-namespace aware parser, we just pretend it's all HTML.
|
||||
var lastColonIndex = tag.lastIndexOf(":");
|
||||
if (lastColonIndex != -1) {
|
||||
tag = tag.substring(lastColonIndex + 1);
|
||||
}
|
||||
this.attributes = [];
|
||||
this.childNodes = [];
|
||||
this.children = [];
|
||||
@ -659,6 +660,14 @@
|
||||
this.setAttribute("src", str);
|
||||
},
|
||||
|
||||
get srcset() {
|
||||
return this.getAttribute("srcset") || "";
|
||||
},
|
||||
|
||||
set srcset(str) {
|
||||
this.setAttribute("srcset", str);
|
||||
},
|
||||
|
||||
get nodeName() {
|
||||
return this.tagName;
|
||||
},
|
||||
@ -675,9 +684,9 @@
|
||||
for (var j = 0; j < child.attributes.length; j++) {
|
||||
var attr = child.attributes[j];
|
||||
// the attribute value will be HTML escaped.
|
||||
var val = attr.value;
|
||||
var val = attr.getEncodedValue();
|
||||
var quote = (val.indexOf('"') === -1 ? '"' : "'");
|
||||
arr.push(" " + attr.name + '=' + quote + val + quote);
|
||||
arr.push(" " + attr.name + "=" + quote + val + quote);
|
||||
}
|
||||
|
||||
if (child.localName in voidElems && !child.childNodes.length) {
|
||||
@ -753,8 +762,9 @@
|
||||
getAttribute: function (name) {
|
||||
for (var i = this.attributes.length; --i >= 0;) {
|
||||
var attr = this.attributes[i];
|
||||
if (attr.name === name)
|
||||
return attr.getDecodedValue();
|
||||
if (attr.name === name) {
|
||||
return attr.value;
|
||||
}
|
||||
}
|
||||
return undefined;
|
||||
},
|
||||
@ -763,11 +773,11 @@
|
||||
for (var i = this.attributes.length; --i >= 0;) {
|
||||
var attr = this.attributes[i];
|
||||
if (attr.name === name) {
|
||||
attr.setDecodedValue(value);
|
||||
attr.setValue(value);
|
||||
return;
|
||||
}
|
||||
}
|
||||
this.attributes.push(new Attribute(name, encodeHTML(value)));
|
||||
this.attributes.push(new Attribute(name, value));
|
||||
},
|
||||
|
||||
removeAttribute: function (name) {
|
||||
@ -778,7 +788,13 @@
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
hasAttribute: function (name) {
|
||||
return this.attributes.some(function (attr) {
|
||||
return attr.name == name;
|
||||
});
|
||||
},
|
||||
};
|
||||
|
||||
var Style = function (node) {
|
||||
@ -925,7 +941,7 @@
|
||||
// Read the attribute value (and consume the matching quote)
|
||||
var value = this.readString(c);
|
||||
|
||||
node.attributes.push(new Attribute(name, value));
|
||||
node.attributes.push(new Attribute(name, decodeHTML(value)));
|
||||
|
||||
return;
|
||||
},
|
||||
@ -950,7 +966,7 @@
|
||||
strBuf.push(c);
|
||||
c = this.nextChar();
|
||||
}
|
||||
var tag = strBuf.join('');
|
||||
var tag = strBuf.join("");
|
||||
|
||||
if (!tag)
|
||||
return false;
|
||||
@ -961,7 +977,9 @@
|
||||
while (c !== "/" && c !== ">") {
|
||||
if (c === undefined)
|
||||
return false;
|
||||
while (whitespace.indexOf(this.html[this.currentChar++]) != -1);
|
||||
while (whitespace.indexOf(this.html[this.currentChar++]) != -1) {
|
||||
// Advance cursor to first non-whitespace char.
|
||||
}
|
||||
this.currentChar--;
|
||||
c = this.nextChar();
|
||||
if (c !== "/" && c !== ">") {
|
||||
@ -1055,9 +1073,10 @@
|
||||
return null;
|
||||
|
||||
// Read any text as Text node
|
||||
var textNode;
|
||||
if (c !== "<") {
|
||||
--this.currentChar;
|
||||
var textNode = new Text();
|
||||
textNode = new Text();
|
||||
var n = this.html.indexOf("<", this.currentChar);
|
||||
if (n === -1) {
|
||||
textNode.innerHTML = this.html.substring(this.currentChar, this.html.length);
|
||||
@ -1069,6 +1088,18 @@
|
||||
return textNode;
|
||||
}
|
||||
|
||||
if (this.match("![CDATA[")) {
|
||||
var endChar = this.html.indexOf("]]>", this.currentChar);
|
||||
if (endChar === -1) {
|
||||
this.error("unclosed CDATA section");
|
||||
return null;
|
||||
}
|
||||
textNode = new Text();
|
||||
textNode.textContent = this.html.substring(this.currentChar, endChar);
|
||||
this.currentChar = endChar + ("]]>").length;
|
||||
return textNode;
|
||||
}
|
||||
|
||||
c = this.peekNext();
|
||||
|
||||
// Read Comment node. Normally, Comment nodes know their inner
|
||||
@ -1100,7 +1131,7 @@
|
||||
// If this isn't a void Element, read its child nodes
|
||||
if (!closed) {
|
||||
this.readChildren(node);
|
||||
var closingTag = "</" + localName + ">";
|
||||
var closingTag = "</" + node._matchingTag + ">";
|
||||
if (!this.match(closingTag)) {
|
||||
this.error("expected '" + closingTag + "' and got " + this.html.substr(this.currentChar, closingTag.length));
|
||||
return null;
|
||||
|
@ -1,3 +1,5 @@
|
||||
// https://github.com/mozilla/readability/tree/814f0a3884350b6f1adfdebb79ca3599e9806605
|
||||
|
||||
/* eslint-env es6:false */
|
||||
/* globals exports */
|
||||
/*
|
||||
|
@ -1,3 +1,5 @@
|
||||
// https://github.com/mozilla/readability/tree/814f0a3884350b6f1adfdebb79ca3599e9806605
|
||||
|
||||
/*eslint-env es6:false*/
|
||||
/*
|
||||
* Copyright (c) 2010 Arc90 Inc
|
||||
@ -39,6 +41,7 @@ function Readability(doc, options) {
|
||||
this._articleTitle = null;
|
||||
this._articleByline = null;
|
||||
this._articleDir = null;
|
||||
this._articleSiteName = null;
|
||||
this._attempts = [];
|
||||
|
||||
// Configurable options
|
||||
@ -111,15 +114,18 @@ Readability.prototype = {
|
||||
// All of the regular expressions in use within readability.
|
||||
// Defined up here so we don't instantiate them repeatedly in loops.
|
||||
REGEXPS: {
|
||||
unlikelyCandidates: /banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
|
||||
// NOTE: These two regular expressions are duplicated in
|
||||
// Readability-readerable.js. Please keep both copies in sync.
|
||||
unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
|
||||
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
|
||||
|
||||
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
|
||||
negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
|
||||
negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
|
||||
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
|
||||
byline: /byline|author|dateline|writtenby|p-author/i,
|
||||
replaceFonts: /<(\/?)font[^>]*>/gi,
|
||||
normalize: /\s{2,}/g,
|
||||
videos: /\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
|
||||
videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
|
||||
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
|
||||
prevLink: /(prev|earl|old|new|<|«)/i,
|
||||
whitespace: /^\s*$/,
|
||||
@ -260,7 +266,7 @@ Readability.prototype = {
|
||||
|
||||
_getAllNodesWithTag: function(node, tagNames) {
|
||||
if (node.querySelectorAll) {
|
||||
return node.querySelectorAll(tagNames.join(','));
|
||||
return node.querySelectorAll(tagNames.join(","));
|
||||
}
|
||||
return [].concat.apply([], tagNames.map(function(tag) {
|
||||
var collection = node.getElementsByTagName(tag);
|
||||
@ -320,7 +326,7 @@ Readability.prototype = {
|
||||
return uri;
|
||||
}
|
||||
|
||||
var links = articleContent.getElementsByTagName("a");
|
||||
var links = this._getAllNodesWithTag(articleContent, ["a"]);
|
||||
this._forEachNode(links, function(link) {
|
||||
var href = link.getAttribute("href");
|
||||
if (href) {
|
||||
@ -335,7 +341,7 @@ Readability.prototype = {
|
||||
}
|
||||
});
|
||||
|
||||
var imgs = articleContent.getElementsByTagName("img");
|
||||
var imgs = this._getAllNodesWithTag(articleContent, ["img"]);
|
||||
this._forEachNode(imgs, function(img) {
|
||||
var src = img.getAttribute("src");
|
||||
if (src) {
|
||||
@ -355,11 +361,11 @@ Readability.prototype = {
|
||||
var origTitle = "";
|
||||
|
||||
try {
|
||||
curTitle = origTitle = doc.title;
|
||||
curTitle = origTitle = doc.title.trim();
|
||||
|
||||
// If they had an element with id "title" in their HTML
|
||||
if (typeof curTitle !== "string")
|
||||
curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]);
|
||||
curTitle = origTitle = this._getInnerText(doc.getElementsByTagName("title")[0]);
|
||||
} catch (e) {/* ignore exceptions setting the title. */}
|
||||
|
||||
var titleHadHierarchicalSeparators = false;
|
||||
@ -370,44 +376,45 @@ Readability.prototype = {
|
||||
// If there's a separator in the title, first remove the final part
|
||||
if ((/ [\|\-\\\/>»] /).test(curTitle)) {
|
||||
titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle);
|
||||
curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, '$1');
|
||||
curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, "$1");
|
||||
|
||||
// If the resulting title is too short (3 words or fewer), remove
|
||||
// the first part instead:
|
||||
if (wordCount(curTitle) < 3)
|
||||
curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, '$1');
|
||||
} else if (curTitle.indexOf(': ') !== -1) {
|
||||
curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, "$1");
|
||||
} else if (curTitle.indexOf(": ") !== -1) {
|
||||
// Check if we have an heading containing this exact string, so we
|
||||
// could assume it's the full title.
|
||||
var headings = this._concatNodeLists(
|
||||
doc.getElementsByTagName('h1'),
|
||||
doc.getElementsByTagName('h2')
|
||||
doc.getElementsByTagName("h1"),
|
||||
doc.getElementsByTagName("h2")
|
||||
);
|
||||
var trimmedTitle = curTitle.trim();
|
||||
var match = this._someNode(headings, function(heading) {
|
||||
return heading.textContent === curTitle;
|
||||
return heading.textContent.trim() === trimmedTitle;
|
||||
});
|
||||
|
||||
// If we don't, let's extract the title out of the original title string.
|
||||
if (!match) {
|
||||
curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1);
|
||||
curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1);
|
||||
|
||||
// If the title is now too short, try the first colon instead:
|
||||
if (wordCount(curTitle) < 3) {
|
||||
curTitle = origTitle.substring(origTitle.indexOf(':') + 1);
|
||||
curTitle = origTitle.substring(origTitle.indexOf(":") + 1);
|
||||
// But if we have too many words before the colon there's something weird
|
||||
// with the titles and the H tags so let's just use the original title instead
|
||||
} else if (wordCount(origTitle.substr(0, origTitle.indexOf(':'))) > 5) {
|
||||
} else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) {
|
||||
curTitle = origTitle;
|
||||
}
|
||||
}
|
||||
} else if (curTitle.length > 150 || curTitle.length < 15) {
|
||||
var hOnes = doc.getElementsByTagName('h1');
|
||||
var hOnes = doc.getElementsByTagName("h1");
|
||||
|
||||
if (hOnes.length === 1)
|
||||
curTitle = this._getInnerText(hOnes[0]);
|
||||
}
|
||||
|
||||
curTitle = curTitle.trim();
|
||||
curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " ");
|
||||
// If we now have 4 words or fewer as our title, and either no
|
||||
// 'hierarchical' separators (\, /, > or ») were found in the original
|
||||
// title or we decreased the number of words by more than 1 word, use
|
||||
@ -497,7 +504,8 @@ Readability.prototype = {
|
||||
break;
|
||||
}
|
||||
|
||||
if (!this._isPhrasingContent(next)) break;
|
||||
if (!this._isPhrasingContent(next))
|
||||
break;
|
||||
|
||||
// Otherwise, make this node a child of the new <p>.
|
||||
var sibling = next.nextSibling;
|
||||
@ -505,7 +513,12 @@ Readability.prototype = {
|
||||
next = sibling;
|
||||
}
|
||||
|
||||
while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild);
|
||||
while (p.lastChild && this._isWhitespace(p.lastChild)) {
|
||||
p.removeChild(p.lastChild);
|
||||
}
|
||||
|
||||
if (p.parentNode.tagName === "P")
|
||||
this._setNodeTag(p.parentNode, "DIV");
|
||||
}
|
||||
});
|
||||
},
|
||||
@ -527,7 +540,16 @@ Readability.prototype = {
|
||||
replacement.readability = node.readability;
|
||||
|
||||
for (var i = 0; i < node.attributes.length; i++) {
|
||||
replacement.setAttribute(node.attributes[i].name, node.attributes[i].value);
|
||||
try {
|
||||
replacement.setAttribute(node.attributes[i].name, node.attributes[i].value);
|
||||
} catch (ex) {
|
||||
/* it's possible for setAttribute() to throw if the attribute name
|
||||
* isn't a valid XML Name. Such attributes can however be parsed from
|
||||
* source in HTML docs, see https://github.com/whatwg/html/issues/4275,
|
||||
* so we can hit them here and then throw. We don't care about such
|
||||
* attributes so we ignore them.
|
||||
*/
|
||||
}
|
||||
}
|
||||
return replacement;
|
||||
},
|
||||
@ -547,6 +569,8 @@ Readability.prototype = {
|
||||
// visually linked to other content-ful elements (text, images, etc.).
|
||||
this._markDataTables(articleContent);
|
||||
|
||||
this._fixLazyImages(articleContent);
|
||||
|
||||
// Clean out junk from the article content
|
||||
this._cleanConditionally(articleContent, "form");
|
||||
this._cleanConditionally(articleContent, "fieldset");
|
||||
@ -557,16 +581,21 @@ Readability.prototype = {
|
||||
this._clean(articleContent, "link");
|
||||
this._clean(articleContent, "aside");
|
||||
|
||||
// Clean out elements have "share" in their id/class combinations from final top candidates,
|
||||
// Clean out elements with little content that have "share" in their id/class combinations from final top candidates,
|
||||
// which means we don't remove the top candidates even they have "share".
|
||||
this._forEachNode(articleContent.children, function(topCandidate) {
|
||||
this._cleanMatchedNodes(topCandidate, /share/);
|
||||
|
||||
var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD;
|
||||
|
||||
this._forEachNode(articleContent.children, function (topCandidate) {
|
||||
this._cleanMatchedNodes(topCandidate, function (node, matchString) {
|
||||
return /share/.test(matchString) && node.textContent.length < shareElementThreshold;
|
||||
});
|
||||
});
|
||||
|
||||
// If there is only one h2 and its text content substantially equals article title,
|
||||
// they are probably using it as a header and not a subheader,
|
||||
// so remove it since we already extract the title separately.
|
||||
var h2 = articleContent.getElementsByTagName('h2');
|
||||
var h2 = articleContent.getElementsByTagName("h2");
|
||||
if (h2.length === 1) {
|
||||
var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) / this._articleTitle.length;
|
||||
if (Math.abs(lengthSimilarRate) < 0.5) {
|
||||
@ -596,12 +625,12 @@ Readability.prototype = {
|
||||
this._cleanConditionally(articleContent, "div");
|
||||
|
||||
// Remove extra paragraphs
|
||||
this._removeNodes(articleContent.getElementsByTagName('p'), function (paragraph) {
|
||||
var imgCount = paragraph.getElementsByTagName('img').length;
|
||||
var embedCount = paragraph.getElementsByTagName('embed').length;
|
||||
var objectCount = paragraph.getElementsByTagName('object').length;
|
||||
this._removeNodes(articleContent.getElementsByTagName("p"), function (paragraph) {
|
||||
var imgCount = paragraph.getElementsByTagName("img").length;
|
||||
var embedCount = paragraph.getElementsByTagName("embed").length;
|
||||
var objectCount = paragraph.getElementsByTagName("object").length;
|
||||
// At this point, nasty iframes have been removed, only remain embedded video ones.
|
||||
var iframeCount = paragraph.getElementsByTagName('iframe').length;
|
||||
var iframeCount = paragraph.getElementsByTagName("iframe").length;
|
||||
var totalCount = imgCount + embedCount + objectCount + iframeCount;
|
||||
|
||||
return totalCount === 0 && !this._getInnerText(paragraph, false);
|
||||
@ -612,6 +641,19 @@ Readability.prototype = {
|
||||
if (next && next.tagName == "P")
|
||||
br.parentNode.removeChild(br);
|
||||
});
|
||||
|
||||
// Remove single-cell tables
|
||||
this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) {
|
||||
var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table;
|
||||
if (this._hasSingleTagInsideElement(tbody, "TR")) {
|
||||
var row = tbody.firstElementChild;
|
||||
if (this._hasSingleTagInsideElement(row, "TD")) {
|
||||
var cell = row.firstElementChild;
|
||||
cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV");
|
||||
table.parentNode.replaceChild(cell, table);
|
||||
}
|
||||
}
|
||||
});
|
||||
},
|
||||
|
||||
/**
|
||||
@ -625,34 +667,34 @@ Readability.prototype = {
|
||||
node.readability = {"contentScore": 0};
|
||||
|
||||
switch (node.tagName) {
|
||||
case 'DIV':
|
||||
case "DIV":
|
||||
node.readability.contentScore += 5;
|
||||
break;
|
||||
|
||||
case 'PRE':
|
||||
case 'TD':
|
||||
case 'BLOCKQUOTE':
|
||||
case "PRE":
|
||||
case "TD":
|
||||
case "BLOCKQUOTE":
|
||||
node.readability.contentScore += 3;
|
||||
break;
|
||||
|
||||
case 'ADDRESS':
|
||||
case 'OL':
|
||||
case 'UL':
|
||||
case 'DL':
|
||||
case 'DD':
|
||||
case 'DT':
|
||||
case 'LI':
|
||||
case 'FORM':
|
||||
case "ADDRESS":
|
||||
case "OL":
|
||||
case "UL":
|
||||
case "DL":
|
||||
case "DD":
|
||||
case "DT":
|
||||
case "LI":
|
||||
case "FORM":
|
||||
node.readability.contentScore -= 3;
|
||||
break;
|
||||
|
||||
case 'H1':
|
||||
case 'H2':
|
||||
case 'H3':
|
||||
case 'H4':
|
||||
case 'H5':
|
||||
case 'H6':
|
||||
case 'TH':
|
||||
case "H1":
|
||||
case "H2":
|
||||
case "H3":
|
||||
case "H4":
|
||||
case "H5":
|
||||
case "H6":
|
||||
case "TH":
|
||||
node.readability.contentScore -= 5;
|
||||
break;
|
||||
}
|
||||
@ -691,37 +733,6 @@ Readability.prototype = {
|
||||
return node && node.nextElementSibling;
|
||||
},
|
||||
|
||||
/**
|
||||
* Like _getNextNode, but for DOM implementations with no
|
||||
* firstElementChild/nextElementSibling functionality...
|
||||
*/
|
||||
_getNextNodeNoElementProperties: function(node, ignoreSelfAndKids) {
|
||||
function nextSiblingEl(n) {
|
||||
do {
|
||||
n = n.nextSibling;
|
||||
} while (n && n.nodeType !== n.ELEMENT_NODE);
|
||||
return n;
|
||||
}
|
||||
// First check for kids if those aren't being ignored
|
||||
if (!ignoreSelfAndKids && node.children[0]) {
|
||||
return node.children[0];
|
||||
}
|
||||
// Then for siblings...
|
||||
var next = nextSiblingEl(node);
|
||||
if (next) {
|
||||
return next;
|
||||
}
|
||||
// And finally, move up the parent chain *and* find a sibling
|
||||
// (because this is depth-first traversal, we will have already
|
||||
// seen the parent nodes themselves).
|
||||
do {
|
||||
node = node.parentNode;
|
||||
if (node)
|
||||
next = nextSiblingEl(node);
|
||||
} while (node && !next);
|
||||
return node && next;
|
||||
},
|
||||
|
||||
_checkByline: function(node, matchString) {
|
||||
if (this._articleByline) {
|
||||
return false;
|
||||
@ -729,9 +740,10 @@ Readability.prototype = {
|
||||
|
||||
if (node.getAttribute !== undefined) {
|
||||
var rel = node.getAttribute("rel");
|
||||
var itemprop = node.getAttribute("itemprop");
|
||||
}
|
||||
|
||||
if ((rel === "author" || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) {
|
||||
if ((rel === "author" || (itemprop && itemprop.indexOf("author") !== -1) || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) {
|
||||
this._articleByline = node.textContent.trim();
|
||||
return true;
|
||||
}
|
||||
@ -784,6 +796,12 @@ Readability.prototype = {
|
||||
while (node) {
|
||||
var matchString = node.className + " " + node.id;
|
||||
|
||||
if (!this._isProbablyVisible(node)) {
|
||||
this.log("Removing hidden node - " + matchString);
|
||||
node = this._removeAndGetNext(node);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check to see if this node is a byline, and remove it if it is.
|
||||
if (this._checkByline(node, matchString)) {
|
||||
node = this._removeAndGetNext(node);
|
||||
@ -794,6 +812,7 @@ Readability.prototype = {
|
||||
if (stripUnlikelyCandidates) {
|
||||
if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
|
||||
!this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
|
||||
!this._hasAncestorTag(node, "table") &&
|
||||
node.tagName !== "BODY" &&
|
||||
node.tagName !== "A") {
|
||||
this.log("Removing unlikely candidate - " + matchString);
|
||||
@ -826,12 +845,14 @@ Readability.prototype = {
|
||||
if (p !== null) {
|
||||
p.appendChild(childNode);
|
||||
} else if (!this._isWhitespace(childNode)) {
|
||||
p = doc.createElement('p');
|
||||
p = doc.createElement("p");
|
||||
node.replaceChild(p, childNode);
|
||||
p.appendChild(childNode);
|
||||
}
|
||||
} else if (p !== null) {
|
||||
while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild);
|
||||
while (p.lastChild && this._isWhitespace(p.lastChild)) {
|
||||
p.removeChild(p.lastChild);
|
||||
}
|
||||
p = null;
|
||||
}
|
||||
childNode = nextSibling;
|
||||
@ -841,7 +862,7 @@ Readability.prototype = {
|
||||
// element. DIVs with only a P element inside and no text content can be
|
||||
// safely converted into plain P elements to avoid confusing the scoring
|
||||
// algorithm with DIVs with are, in practice, paragraphs.
|
||||
if (this._hasSinglePInsideElement(node) && this._getLinkDensity(node) < 0.25) {
|
||||
if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) {
|
||||
var newNode = node.children[0];
|
||||
node.parentNode.replaceChild(newNode, node);
|
||||
node = newNode;
|
||||
@ -862,7 +883,7 @@ Readability.prototype = {
|
||||
**/
|
||||
var candidates = [];
|
||||
this._forEachNode(elementsToScore, function(elementToScore) {
|
||||
if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === 'undefined')
|
||||
if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined")
|
||||
return;
|
||||
|
||||
// If this paragraph is less than 25 characters, don't even count it.
|
||||
@ -881,17 +902,17 @@ Readability.prototype = {
|
||||
contentScore += 1;
|
||||
|
||||
// Add points for any commas within this paragraph.
|
||||
contentScore += innerText.split(',').length;
|
||||
contentScore += innerText.split(",").length;
|
||||
|
||||
// For every 100 characters in this paragraph, add another point. Up to 3 points.
|
||||
contentScore += Math.min(Math.floor(innerText.length / 100), 3);
|
||||
|
||||
// Initialize and score ancestors.
|
||||
this._forEachNode(ancestors, function(ancestor, level) {
|
||||
if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === 'undefined')
|
||||
if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === "undefined")
|
||||
return;
|
||||
|
||||
if (typeof(ancestor.readability) === 'undefined') {
|
||||
if (typeof(ancestor.readability) === "undefined") {
|
||||
this._initializeNode(ancestor);
|
||||
candidates.push(ancestor);
|
||||
}
|
||||
@ -922,7 +943,7 @@ Readability.prototype = {
|
||||
var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate));
|
||||
candidate.readability.contentScore = candidateScore;
|
||||
|
||||
this.log('Candidate:', candidate, "with score " + candidateScore);
|
||||
this.log("Candidate:", candidate, "with score " + candidateScore);
|
||||
|
||||
for (var t = 0; t < this._nbTopCandidates; t++) {
|
||||
var aTopCandidate = topCandidates[t];
|
||||
@ -1041,8 +1062,8 @@ Readability.prototype = {
|
||||
var sibling = siblings[s];
|
||||
var append = false;
|
||||
|
||||
this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : '');
|
||||
this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : 'Unknown');
|
||||
this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : "");
|
||||
this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : "Unknown");
|
||||
|
||||
if (sibling === topCandidate) {
|
||||
append = true;
|
||||
@ -1076,7 +1097,7 @@ Readability.prototype = {
|
||||
if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) {
|
||||
// We have a node that isn't a common block level element, like a form or td tag.
|
||||
// Turn it into a div so it doesn't get filtered out later by accident.
|
||||
this.log("Altering sibling:", sibling, 'to div.');
|
||||
this.log("Altering sibling:", sibling, "to div.");
|
||||
|
||||
sibling = this._setNodeTag(sibling, "DIV");
|
||||
}
|
||||
@ -1144,7 +1165,7 @@ Readability.prototype = {
|
||||
this._attempts.push({articleContent: articleContent, textLength: textLength});
|
||||
// No luck after removing flags, just return the longest text we found during the different loops
|
||||
this._attempts.sort(function (a, b) {
|
||||
return a.textLength < b.textLength;
|
||||
return b.textLength - a.textLength;
|
||||
});
|
||||
|
||||
// But first check if we actually have something
|
||||
@ -1184,7 +1205,7 @@ Readability.prototype = {
|
||||
* @return Boolean - whether the input string is a byline.
|
||||
*/
|
||||
_isValidByline: function(byline) {
|
||||
if (typeof byline == 'string' || byline instanceof String) {
|
||||
if (typeof byline == "string" || byline instanceof String) {
|
||||
byline = byline.trim();
|
||||
return (byline.length > 0) && (byline.length < 100);
|
||||
}
|
||||
@ -1201,61 +1222,75 @@ Readability.prototype = {
|
||||
var values = {};
|
||||
var metaElements = this._doc.getElementsByTagName("meta");
|
||||
|
||||
// Match "description", or Twitter's "twitter:description" (Cards)
|
||||
// in name attribute.
|
||||
var namePattern = /^\s*((twitter)\s*:\s*)?(description|title)\s*$/gi;
|
||||
// property is a space-separated list of values
|
||||
var propertyPattern = /\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*/gi;
|
||||
|
||||
// Match Facebook's Open Graph title & description properties.
|
||||
var propertyPattern = /^\s*og\s*:\s*(description|title)\s*$/gi;
|
||||
// name is a single value
|
||||
var namePattern = /^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$/i;
|
||||
|
||||
// Find description tags.
|
||||
this._forEachNode(metaElements, function(element) {
|
||||
var elementName = element.getAttribute("name");
|
||||
var elementProperty = element.getAttribute("property");
|
||||
|
||||
if ([elementName, elementProperty].indexOf("author") !== -1) {
|
||||
metadata.byline = element.getAttribute("content");
|
||||
var content = element.getAttribute("content");
|
||||
if (!content) {
|
||||
return;
|
||||
}
|
||||
|
||||
var matches = null;
|
||||
var name = null;
|
||||
if (namePattern.test(elementName)) {
|
||||
name = elementName;
|
||||
} else if (propertyPattern.test(elementProperty)) {
|
||||
name = elementProperty;
|
||||
}
|
||||
|
||||
if (name) {
|
||||
var content = element.getAttribute("content");
|
||||
if (elementProperty) {
|
||||
matches = elementProperty.match(propertyPattern);
|
||||
if (matches) {
|
||||
for (var i = matches.length - 1; i >= 0; i--) {
|
||||
// Convert to lowercase, and remove any whitespace
|
||||
// so we can match below.
|
||||
name = matches[i].toLowerCase().replace(/\s/g, "");
|
||||
// multiple authors
|
||||
values[name] = content.trim();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!matches && elementName && namePattern.test(elementName)) {
|
||||
name = elementName;
|
||||
if (content) {
|
||||
// Convert to lowercase and remove any whitespace
|
||||
// so we can match below.
|
||||
name = name.toLowerCase().replace(/\s/g, '');
|
||||
// Convert to lowercase, remove any whitespace, and convert dots
|
||||
// to colons so we can match below.
|
||||
name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":");
|
||||
values[name] = content.trim();
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
if ("description" in values) {
|
||||
metadata.excerpt = values["description"];
|
||||
} else if ("og:description" in values) {
|
||||
// Use facebook open graph description.
|
||||
metadata.excerpt = values["og:description"];
|
||||
} else if ("twitter:description" in values) {
|
||||
// Use twitter cards description.
|
||||
metadata.excerpt = values["twitter:description"];
|
||||
// get title
|
||||
metadata.title = values["dc:title"] ||
|
||||
values["dcterm:title"] ||
|
||||
values["og:title"] ||
|
||||
values["weibo:article:title"] ||
|
||||
values["weibo:webpage:title"] ||
|
||||
values["title"] ||
|
||||
values["twitter:title"];
|
||||
|
||||
if (!metadata.title) {
|
||||
metadata.title = this._getArticleTitle();
|
||||
}
|
||||
|
||||
metadata.title = this._getArticleTitle();
|
||||
if (!metadata.title) {
|
||||
if ("og:title" in values) {
|
||||
// Use facebook open graph title.
|
||||
metadata.title = values["og:title"];
|
||||
} else if ("twitter:title" in values) {
|
||||
// Use twitter cards title.
|
||||
metadata.title = values["twitter:title"];
|
||||
}
|
||||
}
|
||||
// get author
|
||||
metadata.byline = values["dc:creator"] ||
|
||||
values["dcterm:creator"] ||
|
||||
values["author"];
|
||||
|
||||
// get description
|
||||
metadata.excerpt = values["dc:description"] ||
|
||||
values["dcterm:description"] ||
|
||||
values["og:description"] ||
|
||||
values["weibo:article:description"] ||
|
||||
values["weibo:webpage:description"] ||
|
||||
values["description"] ||
|
||||
values["twitter:description"];
|
||||
|
||||
// get site name
|
||||
metadata.siteName = values["og:site_name"];
|
||||
|
||||
return metadata;
|
||||
},
|
||||
@ -1266,24 +1301,25 @@ Readability.prototype = {
|
||||
* @param Element
|
||||
**/
|
||||
_removeScripts: function(doc) {
|
||||
this._removeNodes(doc.getElementsByTagName('script'), function(scriptNode) {
|
||||
this._removeNodes(doc.getElementsByTagName("script"), function(scriptNode) {
|
||||
scriptNode.nodeValue = "";
|
||||
scriptNode.removeAttribute('src');
|
||||
scriptNode.removeAttribute("src");
|
||||
return true;
|
||||
});
|
||||
this._removeNodes(doc.getElementsByTagName('noscript'));
|
||||
this._removeNodes(doc.getElementsByTagName("noscript"));
|
||||
},
|
||||
|
||||
/**
|
||||
* Check if this node has only whitespace and a single P element
|
||||
* Check if this node has only whitespace and a single element with given tag
|
||||
* Returns false if the DIV node contains non-empty text nodes
|
||||
* or if it contains no P or more than 1 element.
|
||||
* or if it contains no element with given tag or more than 1 element.
|
||||
*
|
||||
* @param Element
|
||||
* @param string tag of child element
|
||||
**/
|
||||
_hasSinglePInsideElement: function(element) {
|
||||
// There should be exactly 1 element child which is a P:
|
||||
if (element.children.length != 1 || element.children[0].tagName !== "P") {
|
||||
_hasSingleTagInsideElement: function(element, tag) {
|
||||
// There should be exactly 1 element child with given tag
|
||||
if (element.children.length != 1 || element.children[0].tagName !== tag) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -1337,7 +1373,7 @@ Readability.prototype = {
|
||||
* @return string
|
||||
**/
|
||||
_getInnerText: function(e, normalizeSpaces) {
|
||||
normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
|
||||
normalizeSpaces = (typeof normalizeSpaces === "undefined") ? true : normalizeSpaces;
|
||||
var textContent = e.textContent.trim();
|
||||
|
||||
if (normalizeSpaces) {
|
||||
@ -1366,7 +1402,7 @@ Readability.prototype = {
|
||||
* @return void
|
||||
**/
|
||||
_cleanStyles: function(e) {
|
||||
if (!e || e.tagName.toLowerCase() === 'svg')
|
||||
if (!e || e.tagName.toLowerCase() === "svg")
|
||||
return;
|
||||
|
||||
// Remove `style` and deprecated presentational attributes
|
||||
@ -1375,8 +1411,8 @@ Readability.prototype = {
|
||||
}
|
||||
|
||||
if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
|
||||
e.removeAttribute('width');
|
||||
e.removeAttribute('height');
|
||||
e.removeAttribute("width");
|
||||
e.removeAttribute("height");
|
||||
}
|
||||
|
||||
var cur = e.firstElementChild;
|
||||
@ -1422,7 +1458,7 @@ Readability.prototype = {
|
||||
var weight = 0;
|
||||
|
||||
// Look for a special classname
|
||||
if (typeof(e.className) === 'string' && e.className !== '') {
|
||||
if (typeof(e.className) === "string" && e.className !== "") {
|
||||
if (this.REGEXPS.negative.test(e.className))
|
||||
weight -= 25;
|
||||
|
||||
@ -1431,7 +1467,7 @@ Readability.prototype = {
|
||||
}
|
||||
|
||||
// Look for a special ID
|
||||
if (typeof(e.id) === 'string' && e.id !== '') {
|
||||
if (typeof(e.id) === "string" && e.id !== "") {
|
||||
if (this.REGEXPS.negative.test(e.id))
|
||||
weight -= 25;
|
||||
|
||||
@ -1456,17 +1492,17 @@ Readability.prototype = {
|
||||
this._removeNodes(e.getElementsByTagName(tag), function(element) {
|
||||
// Allow youtube and vimeo videos through as people usually want to see those.
|
||||
if (isEmbed) {
|
||||
var attributeValues = [].map.call(element.attributes, function(attr) {
|
||||
return attr.value;
|
||||
}).join("|");
|
||||
|
||||
// First, check the elements attributes to see if any of them contain youtube or vimeo
|
||||
if (this.REGEXPS.videos.test(attributeValues))
|
||||
return false;
|
||||
for (var i = 0; i < element.attributes.length; i++) {
|
||||
if (this.REGEXPS.videos.test(element.attributes[i].value)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Then check the elements inside this element for the same.
|
||||
if (this.REGEXPS.videos.test(element.innerHTML))
|
||||
// For embed with <object> tag, check inner HTML as well.
|
||||
if (element.tagName === "object" && this.REGEXPS.videos.test(element.innerHTML)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
@ -1584,6 +1620,39 @@ Readability.prototype = {
|
||||
}
|
||||
},
|
||||
|
||||
/* convert images and figures that have properties like data-src into images that can be loaded without JS */
|
||||
_fixLazyImages: function (root) {
|
||||
this._forEachNode(this._getAllNodesWithTag(root, ["img", "picture", "figure"]), function (elem) {
|
||||
// also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
|
||||
if ((!elem.src && (!elem.srcset || elem.srcset == "null")) || elem.className.toLowerCase().indexOf("lazy") !== -1) {
|
||||
for (var i = 0; i < elem.attributes.length; i++) {
|
||||
var attr = elem.attributes[i];
|
||||
if (attr.name === "src" || attr.name === "srcset") {
|
||||
continue;
|
||||
}
|
||||
var copyTo = null;
|
||||
if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) {
|
||||
copyTo = "srcset";
|
||||
} else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) {
|
||||
copyTo = "src";
|
||||
}
|
||||
if (copyTo) {
|
||||
//if this is an img or picture, set the attribute directly
|
||||
if (elem.tagName === "IMG" || elem.tagName === "PICTURE") {
|
||||
elem.setAttribute(copyTo, attr.value);
|
||||
} else if (elem.tagName === "FIGURE" && !this._getAllNodesWithTag(elem, ["img", "picture"]).length) {
|
||||
//if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
|
||||
//see the nytimes-3 testcase for an example
|
||||
var img = this._doc.createElement("img");
|
||||
img.setAttribute(copyTo, attr.value);
|
||||
elem.appendChild(img);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
},
|
||||
|
||||
/**
|
||||
* Clean an element of all tags of type "tag" if they look fishy.
|
||||
* "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
|
||||
@ -1602,11 +1671,16 @@ Readability.prototype = {
|
||||
//
|
||||
// TODO: Consider taking into account original contentScore here.
|
||||
this._removeNodes(e.getElementsByTagName(tag), function(node) {
|
||||
// First check if we're in a data table, in which case don't remove us.
|
||||
// First check if this node IS data table, in which case don't remove it.
|
||||
var isDataTable = function(t) {
|
||||
return t._readabilityDataTable;
|
||||
};
|
||||
|
||||
if (tag === "table" && isDataTable(node)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Next check if we're inside a data table, in which case don't remove it as well.
|
||||
if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
|
||||
return false;
|
||||
}
|
||||
@ -1620,7 +1694,7 @@ Readability.prototype = {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (this._getCharCount(node, ',') < 10) {
|
||||
if (this._getCharCount(node, ",") < 10) {
|
||||
// If there are not very many commas, and the number of
|
||||
// non-paragraph elements is more than paragraphs or other
|
||||
// ominous signs, remove the element.
|
||||
@ -1630,10 +1704,25 @@ Readability.prototype = {
|
||||
var input = node.getElementsByTagName("input").length;
|
||||
|
||||
var embedCount = 0;
|
||||
var embeds = node.getElementsByTagName("embed");
|
||||
for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
|
||||
if (!this.REGEXPS.videos.test(embeds[ei].src))
|
||||
embedCount += 1;
|
||||
var embeds = this._concatNodeLists(
|
||||
node.getElementsByTagName("object"),
|
||||
node.getElementsByTagName("embed"),
|
||||
node.getElementsByTagName("iframe"));
|
||||
|
||||
for (var i = 0; i < embeds.length; i++) {
|
||||
// If this embed has attribute that matches video regex, don't delete it.
|
||||
for (var j = 0; j < embeds[i].attributes.length; j++) {
|
||||
if (this.REGEXPS.videos.test(embeds[i].attributes[j].value)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// For embed with <object> tag, check inner HTML as well.
|
||||
if (embeds[i].tagName === "object" && this.REGEXPS.videos.test(embeds[i].innerHTML)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
embedCount++;
|
||||
}
|
||||
|
||||
var linkDensity = this._getLinkDensity(node);
|
||||
@ -1654,17 +1743,17 @@ Readability.prototype = {
|
||||
},
|
||||
|
||||
/**
|
||||
* Clean out elements whose id/class combinations match specific string.
|
||||
* Clean out elements that match the specified conditions
|
||||
*
|
||||
* @param Element
|
||||
* @param RegExp match id/class combination.
|
||||
* @param Function determines whether a node should be removed
|
||||
* @return void
|
||||
**/
|
||||
_cleanMatchedNodes: function(e, regex) {
|
||||
_cleanMatchedNodes: function(e, filter) {
|
||||
var endOfSearchMarkerNode = this._getNextNode(e, true);
|
||||
var next = this._getNextNode(e);
|
||||
while (next && next != endOfSearchMarkerNode) {
|
||||
if (regex.test(next.className + " " + next.id)) {
|
||||
if (filter(next, next.className + " " + next.id)) {
|
||||
next = this._removeAndGetNext(next);
|
||||
} else {
|
||||
next = this._getNextNode(next);
|
||||
@ -1680,7 +1769,7 @@ Readability.prototype = {
|
||||
**/
|
||||
_cleanHeaders: function(e) {
|
||||
for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
|
||||
this._removeNodes(e.getElementsByTagName('h' + headerIndex), function (header) {
|
||||
this._removeNodes(e.getElementsByTagName("h" + headerIndex), function (header) {
|
||||
return this._getClassWeight(header) < 0;
|
||||
});
|
||||
}
|
||||
@ -1694,63 +1783,8 @@ Readability.prototype = {
|
||||
this._flags = this._flags & ~flag;
|
||||
},
|
||||
|
||||
/**
|
||||
* Decides whether or not the document is reader-able without parsing the whole thing.
|
||||
*
|
||||
* @return boolean Whether or not we suspect parse() will suceeed at returning an article object.
|
||||
*/
|
||||
isProbablyReaderable: function(helperIsVisible) {
|
||||
var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]);
|
||||
|
||||
// Get <div> nodes which have <br> node(s) and append them into the `nodes` variable.
|
||||
// Some articles' DOM structures might look like
|
||||
// <div>
|
||||
// Sentences<br>
|
||||
// <br>
|
||||
// Sentences<br>
|
||||
// </div>
|
||||
var brNodes = this._getAllNodesWithTag(this._doc, ["div > br"]);
|
||||
if (brNodes.length) {
|
||||
var set = new Set();
|
||||
[].forEach.call(brNodes, function(node) {
|
||||
set.add(node.parentNode);
|
||||
});
|
||||
nodes = [].concat.apply(Array.from(set), nodes);
|
||||
}
|
||||
|
||||
// FIXME we should have a fallback for helperIsVisible, but this is
|
||||
// problematic because of jsdom's elem.style handling - see
|
||||
// https://github.com/mozilla/readability/pull/186 for context.
|
||||
|
||||
var score = 0;
|
||||
// This is a little cheeky, we use the accumulator 'score' to decide what to return from
|
||||
// this callback:
|
||||
return this._someNode(nodes, function(node) {
|
||||
if (helperIsVisible && !helperIsVisible(node))
|
||||
return false;
|
||||
var matchString = node.className + " " + node.id;
|
||||
|
||||
if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
|
||||
!this.REGEXPS.okMaybeItsACandidate.test(matchString)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (node.matches && node.matches("li p")) {
|
||||
return false;
|
||||
}
|
||||
|
||||
var textContentLength = node.textContent.trim().length;
|
||||
if (textContentLength < 140) {
|
||||
return false;
|
||||
}
|
||||
|
||||
score += Math.sqrt(textContentLength - 140);
|
||||
|
||||
if (score > 20) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
_isProbablyVisible: function(node) {
|
||||
return (!node.style || node.style.display != "none") && !node.hasAttribute("hidden");
|
||||
},
|
||||
|
||||
/**
|
||||
@ -1774,9 +1808,6 @@ Readability.prototype = {
|
||||
}
|
||||
}
|
||||
|
||||
if (typeof this._doc.documentElement.firstElementChild === "undefined") {
|
||||
this._getNextNode = this._getNextNodeNoElementProperties;
|
||||
}
|
||||
// Remove script tags from the document.
|
||||
this._removeScripts(this._doc);
|
||||
|
||||
@ -1812,6 +1843,7 @@ Readability.prototype = {
|
||||
textContent: textContent,
|
||||
length: textContent.length,
|
||||
excerpt: metadata.excerpt,
|
||||
siteName: metadata.siteName || this._articleSiteName
|
||||
};
|
||||
}
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user