You've already forked joplin
							
							
				mirror of
				https://github.com/laurent22/joplin.git
				synced 2025-10-31 00:07:48 +02:00 
			
		
		
		
	Clipper: Improved: Updated Readability library to improve Simplified Page clipping
This commit is contained in:
		| @@ -1,3 +1,5 @@ | ||||
| // https://github.com/mozilla/readability/tree/814f0a3884350b6f1adfdebb79ca3599e9806605 | ||||
|  | ||||
| /*eslint-env es6:false*/ | ||||
| /* This Source Code Form is subject to the terms of the Mozilla Public | ||||
|  * License, v. 2.0. If a copy of the MPL was not distributed with this file, | ||||
| @@ -496,17 +498,9 @@ | ||||
|     }, | ||||
|     setValue: function(newValue) { | ||||
|       this._value = newValue; | ||||
|       delete this._decodedValue; | ||||
|     }, | ||||
|     setDecodedValue: function(newValue) { | ||||
|       this._value = encodeHTML(newValue); | ||||
|       this._decodedValue = newValue; | ||||
|     }, | ||||
|     getDecodedValue: function() { | ||||
|       if (typeof this._decodedValue === "undefined") { | ||||
|         this._decodedValue = (this._value && decodeHTML(this._value)) || ""; | ||||
|       } | ||||
|       return this._decodedValue; | ||||
|     getEncodedValue: function() { | ||||
|       return encodeHTML(this._value); | ||||
|     }, | ||||
|   }; | ||||
|  | ||||
| @@ -611,6 +605,13 @@ | ||||
|   }; | ||||
|  | ||||
|   var Element = function (tag) { | ||||
|     // We use this to find the closing tag. | ||||
|     this._matchingTag = tag; | ||||
|     // We're explicitly a non-namespace aware parser, we just pretend it's all HTML. | ||||
|     var lastColonIndex = tag.lastIndexOf(":"); | ||||
|     if (lastColonIndex != -1) { | ||||
|       tag = tag.substring(lastColonIndex + 1); | ||||
|     } | ||||
|     this.attributes = []; | ||||
|     this.childNodes = []; | ||||
|     this.children = []; | ||||
| @@ -659,6 +660,14 @@ | ||||
|       this.setAttribute("src", str); | ||||
|     }, | ||||
|  | ||||
|     get srcset() { | ||||
|       return this.getAttribute("srcset") || ""; | ||||
|     }, | ||||
|  | ||||
|     set srcset(str) { | ||||
|       this.setAttribute("srcset", str); | ||||
|     }, | ||||
|  | ||||
|     get nodeName() { | ||||
|       return this.tagName; | ||||
|     }, | ||||
| @@ -675,9 +684,9 @@ | ||||
|             for (var j = 0; j < child.attributes.length; j++) { | ||||
|               var attr = child.attributes[j]; | ||||
|               // the attribute value will be HTML escaped. | ||||
|               var val = attr.value; | ||||
|               var val = attr.getEncodedValue(); | ||||
|               var quote = (val.indexOf('"') === -1 ? '"' : "'"); | ||||
|               arr.push(" " + attr.name + '=' + quote + val + quote); | ||||
|               arr.push(" " + attr.name + "=" + quote + val + quote); | ||||
|             } | ||||
|  | ||||
|             if (child.localName in voidElems && !child.childNodes.length) { | ||||
| @@ -753,8 +762,9 @@ | ||||
|     getAttribute: function (name) { | ||||
|       for (var i = this.attributes.length; --i >= 0;) { | ||||
|         var attr = this.attributes[i]; | ||||
|         if (attr.name === name) | ||||
|           return attr.getDecodedValue(); | ||||
|         if (attr.name === name) { | ||||
|           return attr.value; | ||||
|         } | ||||
|       } | ||||
|       return undefined; | ||||
|     }, | ||||
| @@ -763,11 +773,11 @@ | ||||
|       for (var i = this.attributes.length; --i >= 0;) { | ||||
|         var attr = this.attributes[i]; | ||||
|         if (attr.name === name) { | ||||
|           attr.setDecodedValue(value); | ||||
|           attr.setValue(value); | ||||
|           return; | ||||
|         } | ||||
|       } | ||||
|       this.attributes.push(new Attribute(name, encodeHTML(value))); | ||||
|       this.attributes.push(new Attribute(name, value)); | ||||
|     }, | ||||
|  | ||||
|     removeAttribute: function (name) { | ||||
| @@ -778,7 +788,13 @@ | ||||
|           break; | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|     }, | ||||
|  | ||||
|     hasAttribute: function (name) { | ||||
|       return this.attributes.some(function (attr) { | ||||
|         return attr.name == name; | ||||
|       }); | ||||
|     }, | ||||
|   }; | ||||
|  | ||||
|   var Style = function (node) { | ||||
| @@ -925,7 +941,7 @@ | ||||
|       // Read the attribute value (and consume the matching quote) | ||||
|       var value = this.readString(c); | ||||
|  | ||||
|       node.attributes.push(new Attribute(name, value)); | ||||
|       node.attributes.push(new Attribute(name, decodeHTML(value))); | ||||
|  | ||||
|       return; | ||||
|     }, | ||||
| @@ -950,7 +966,7 @@ | ||||
|         strBuf.push(c); | ||||
|         c = this.nextChar(); | ||||
|       } | ||||
|       var tag = strBuf.join(''); | ||||
|       var tag = strBuf.join(""); | ||||
|  | ||||
|       if (!tag) | ||||
|         return false; | ||||
| @@ -961,7 +977,9 @@ | ||||
|       while (c !== "/" && c !== ">") { | ||||
|         if (c === undefined) | ||||
|           return false; | ||||
|         while (whitespace.indexOf(this.html[this.currentChar++]) != -1); | ||||
|         while (whitespace.indexOf(this.html[this.currentChar++]) != -1) { | ||||
|           // Advance cursor to first non-whitespace char. | ||||
|         } | ||||
|         this.currentChar--; | ||||
|         c = this.nextChar(); | ||||
|         if (c !== "/" && c !== ">") { | ||||
| @@ -1055,9 +1073,10 @@ | ||||
|         return null; | ||||
|  | ||||
|       // Read any text as Text node | ||||
|       var textNode; | ||||
|       if (c !== "<") { | ||||
|         --this.currentChar; | ||||
|         var textNode = new Text(); | ||||
|         textNode = new Text(); | ||||
|         var n = this.html.indexOf("<", this.currentChar); | ||||
|         if (n === -1) { | ||||
|           textNode.innerHTML = this.html.substring(this.currentChar, this.html.length); | ||||
| @@ -1069,6 +1088,18 @@ | ||||
|         return textNode; | ||||
|       } | ||||
|  | ||||
|       if (this.match("![CDATA[")) { | ||||
|         var endChar = this.html.indexOf("]]>", this.currentChar); | ||||
|         if (endChar === -1) { | ||||
|           this.error("unclosed CDATA section"); | ||||
|           return null; | ||||
|         } | ||||
|         textNode = new Text(); | ||||
|         textNode.textContent = this.html.substring(this.currentChar, endChar); | ||||
|         this.currentChar = endChar + ("]]>").length; | ||||
|         return textNode; | ||||
|       } | ||||
|  | ||||
|       c = this.peekNext(); | ||||
|  | ||||
|       // Read Comment node. Normally, Comment nodes know their inner | ||||
| @@ -1100,7 +1131,7 @@ | ||||
|       // If this isn't a void Element, read its child nodes | ||||
|       if (!closed) { | ||||
|         this.readChildren(node); | ||||
|         var closingTag = "</" + localName + ">"; | ||||
|         var closingTag = "</" + node._matchingTag + ">"; | ||||
|         if (!this.match(closingTag)) { | ||||
|           this.error("expected '" + closingTag + "' and got " + this.html.substr(this.currentChar, closingTag.length)); | ||||
|           return null; | ||||
|   | ||||
| @@ -1,3 +1,5 @@ | ||||
| // https://github.com/mozilla/readability/tree/814f0a3884350b6f1adfdebb79ca3599e9806605 | ||||
|  | ||||
| /* eslint-env es6:false */ | ||||
| /* globals exports */ | ||||
| /* | ||||
|   | ||||
| @@ -1,3 +1,5 @@ | ||||
| // https://github.com/mozilla/readability/tree/814f0a3884350b6f1adfdebb79ca3599e9806605 | ||||
|  | ||||
| /*eslint-env es6:false*/ | ||||
| /* | ||||
|  * Copyright (c) 2010 Arc90 Inc | ||||
| @@ -39,6 +41,7 @@ function Readability(doc, options) { | ||||
|   this._articleTitle = null; | ||||
|   this._articleByline = null; | ||||
|   this._articleDir = null; | ||||
|   this._articleSiteName = null; | ||||
|   this._attempts = []; | ||||
|  | ||||
|   // Configurable options | ||||
| @@ -111,15 +114,18 @@ Readability.prototype = { | ||||
|   // All of the regular expressions in use within readability. | ||||
|   // Defined up here so we don't instantiate them repeatedly in loops. | ||||
|   REGEXPS: { | ||||
|     unlikelyCandidates: /banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, | ||||
|     // NOTE: These two regular expressions are duplicated in | ||||
|     // Readability-readerable.js. Please keep both copies in sync. | ||||
|     unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, | ||||
|     okMaybeItsACandidate: /and|article|body|column|main|shadow/i, | ||||
|  | ||||
|     positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, | ||||
|     negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, | ||||
|     negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, | ||||
|     extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, | ||||
|     byline: /byline|author|dateline|writtenby|p-author/i, | ||||
|     replaceFonts: /<(\/?)font[^>]*>/gi, | ||||
|     normalize: /\s{2,}/g, | ||||
|     videos: /\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i, | ||||
|     videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i, | ||||
|     nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, | ||||
|     prevLink: /(prev|earl|old|new|<|«)/i, | ||||
|     whitespace: /^\s*$/, | ||||
| @@ -260,7 +266,7 @@ Readability.prototype = { | ||||
|  | ||||
|   _getAllNodesWithTag: function(node, tagNames) { | ||||
|     if (node.querySelectorAll) { | ||||
|       return node.querySelectorAll(tagNames.join(',')); | ||||
|       return node.querySelectorAll(tagNames.join(",")); | ||||
|     } | ||||
|     return [].concat.apply([], tagNames.map(function(tag) { | ||||
|       var collection = node.getElementsByTagName(tag); | ||||
| @@ -320,7 +326,7 @@ Readability.prototype = { | ||||
|       return uri; | ||||
|     } | ||||
|  | ||||
|     var links = articleContent.getElementsByTagName("a"); | ||||
|     var links = this._getAllNodesWithTag(articleContent, ["a"]); | ||||
|     this._forEachNode(links, function(link) { | ||||
|       var href = link.getAttribute("href"); | ||||
|       if (href) { | ||||
| @@ -335,7 +341,7 @@ Readability.prototype = { | ||||
|       } | ||||
|     }); | ||||
|  | ||||
|     var imgs = articleContent.getElementsByTagName("img"); | ||||
|     var imgs = this._getAllNodesWithTag(articleContent, ["img"]); | ||||
|     this._forEachNode(imgs, function(img) { | ||||
|       var src = img.getAttribute("src"); | ||||
|       if (src) { | ||||
| @@ -355,11 +361,11 @@ Readability.prototype = { | ||||
|     var origTitle = ""; | ||||
|  | ||||
|     try { | ||||
|       curTitle = origTitle = doc.title; | ||||
|       curTitle = origTitle = doc.title.trim(); | ||||
|  | ||||
|       // If they had an element with id "title" in their HTML | ||||
|       if (typeof curTitle !== "string") | ||||
|         curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]); | ||||
|         curTitle = origTitle = this._getInnerText(doc.getElementsByTagName("title")[0]); | ||||
|     } catch (e) {/* ignore exceptions setting the title. */} | ||||
|  | ||||
|     var titleHadHierarchicalSeparators = false; | ||||
| @@ -370,44 +376,45 @@ Readability.prototype = { | ||||
|     // If there's a separator in the title, first remove the final part | ||||
|     if ((/ [\|\-\\\/>»] /).test(curTitle)) { | ||||
|       titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle); | ||||
|       curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, '$1'); | ||||
|       curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, "$1"); | ||||
|  | ||||
|       // If the resulting title is too short (3 words or fewer), remove | ||||
|       // the first part instead: | ||||
|       if (wordCount(curTitle) < 3) | ||||
|         curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, '$1'); | ||||
|     } else if (curTitle.indexOf(': ') !== -1) { | ||||
|         curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, "$1"); | ||||
|     } else if (curTitle.indexOf(": ") !== -1) { | ||||
|       // Check if we have an heading containing this exact string, so we | ||||
|       // could assume it's the full title. | ||||
|       var headings = this._concatNodeLists( | ||||
|         doc.getElementsByTagName('h1'), | ||||
|         doc.getElementsByTagName('h2') | ||||
|         doc.getElementsByTagName("h1"), | ||||
|         doc.getElementsByTagName("h2") | ||||
|       ); | ||||
|       var trimmedTitle = curTitle.trim(); | ||||
|       var match = this._someNode(headings, function(heading) { | ||||
|         return heading.textContent === curTitle; | ||||
|         return heading.textContent.trim() === trimmedTitle; | ||||
|       }); | ||||
|  | ||||
|       // If we don't, let's extract the title out of the original title string. | ||||
|       if (!match) { | ||||
|         curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1); | ||||
|         curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1); | ||||
|  | ||||
|         // If the title is now too short, try the first colon instead: | ||||
|         if (wordCount(curTitle) < 3) { | ||||
|           curTitle = origTitle.substring(origTitle.indexOf(':') + 1); | ||||
|           curTitle = origTitle.substring(origTitle.indexOf(":") + 1); | ||||
|           // But if we have too many words before the colon there's something weird | ||||
|           // with the titles and the H tags so let's just use the original title instead | ||||
|         } else if (wordCount(origTitle.substr(0, origTitle.indexOf(':'))) > 5) { | ||||
|         } else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) { | ||||
|           curTitle = origTitle; | ||||
|         } | ||||
|       } | ||||
|     } else if (curTitle.length > 150 || curTitle.length < 15) { | ||||
|       var hOnes = doc.getElementsByTagName('h1'); | ||||
|       var hOnes = doc.getElementsByTagName("h1"); | ||||
|  | ||||
|       if (hOnes.length === 1) | ||||
|         curTitle = this._getInnerText(hOnes[0]); | ||||
|     } | ||||
|  | ||||
|     curTitle = curTitle.trim(); | ||||
|     curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " "); | ||||
|     // If we now have 4 words or fewer as our title, and either no | ||||
|     // 'hierarchical' separators (\, /, > or ») were found in the original | ||||
|     // title or we decreased the number of words by more than 1 word, use | ||||
| @@ -497,7 +504,8 @@ Readability.prototype = { | ||||
|               break; | ||||
|           } | ||||
|  | ||||
|           if (!this._isPhrasingContent(next)) break; | ||||
|           if (!this._isPhrasingContent(next)) | ||||
|             break; | ||||
|  | ||||
|           // Otherwise, make this node a child of the new <p>. | ||||
|           var sibling = next.nextSibling; | ||||
| @@ -505,7 +513,12 @@ Readability.prototype = { | ||||
|           next = sibling; | ||||
|         } | ||||
|  | ||||
|         while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild); | ||||
|         while (p.lastChild && this._isWhitespace(p.lastChild)) { | ||||
|           p.removeChild(p.lastChild); | ||||
|         } | ||||
|  | ||||
|         if (p.parentNode.tagName === "P") | ||||
|           this._setNodeTag(p.parentNode, "DIV"); | ||||
|       } | ||||
|     }); | ||||
|   }, | ||||
| @@ -527,7 +540,16 @@ Readability.prototype = { | ||||
|       replacement.readability = node.readability; | ||||
|  | ||||
|     for (var i = 0; i < node.attributes.length; i++) { | ||||
|       replacement.setAttribute(node.attributes[i].name, node.attributes[i].value); | ||||
|       try { | ||||
|         replacement.setAttribute(node.attributes[i].name, node.attributes[i].value); | ||||
|       } catch (ex) { | ||||
|         /* it's possible for setAttribute() to throw if the attribute name | ||||
|          * isn't a valid XML Name. Such attributes can however be parsed from | ||||
|          * source in HTML docs, see https://github.com/whatwg/html/issues/4275, | ||||
|          * so we can hit them here and then throw. We don't care about such | ||||
|          * attributes so we ignore them. | ||||
|          */ | ||||
|       } | ||||
|     } | ||||
|     return replacement; | ||||
|   }, | ||||
| @@ -547,6 +569,8 @@ Readability.prototype = { | ||||
|     // visually linked to other content-ful elements (text, images, etc.). | ||||
|     this._markDataTables(articleContent); | ||||
|  | ||||
|     this._fixLazyImages(articleContent); | ||||
|  | ||||
|     // Clean out junk from the article content | ||||
|     this._cleanConditionally(articleContent, "form"); | ||||
|     this._cleanConditionally(articleContent, "fieldset"); | ||||
| @@ -557,16 +581,21 @@ Readability.prototype = { | ||||
|     this._clean(articleContent, "link"); | ||||
|     this._clean(articleContent, "aside"); | ||||
|  | ||||
|     // Clean out elements have "share" in their id/class combinations from final top candidates, | ||||
|     // Clean out elements with little content that have "share" in their id/class combinations from final top candidates, | ||||
|     // which means we don't remove the top candidates even they have "share". | ||||
|     this._forEachNode(articleContent.children, function(topCandidate) { | ||||
|       this._cleanMatchedNodes(topCandidate, /share/); | ||||
|  | ||||
|     var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD; | ||||
|  | ||||
|     this._forEachNode(articleContent.children, function (topCandidate) { | ||||
|       this._cleanMatchedNodes(topCandidate, function (node, matchString) { | ||||
|         return /share/.test(matchString) && node.textContent.length < shareElementThreshold; | ||||
|       }); | ||||
|     }); | ||||
|  | ||||
|     // If there is only one h2 and its text content substantially equals article title, | ||||
|     // they are probably using it as a header and not a subheader, | ||||
|     // so remove it since we already extract the title separately. | ||||
|     var h2 = articleContent.getElementsByTagName('h2'); | ||||
|     var h2 = articleContent.getElementsByTagName("h2"); | ||||
|     if (h2.length === 1) { | ||||
|       var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) / this._articleTitle.length; | ||||
|       if (Math.abs(lengthSimilarRate) < 0.5) { | ||||
| @@ -596,12 +625,12 @@ Readability.prototype = { | ||||
|     this._cleanConditionally(articleContent, "div"); | ||||
|  | ||||
|     // Remove extra paragraphs | ||||
|     this._removeNodes(articleContent.getElementsByTagName('p'), function (paragraph) { | ||||
|       var imgCount = paragraph.getElementsByTagName('img').length; | ||||
|       var embedCount = paragraph.getElementsByTagName('embed').length; | ||||
|       var objectCount = paragraph.getElementsByTagName('object').length; | ||||
|     this._removeNodes(articleContent.getElementsByTagName("p"), function (paragraph) { | ||||
|       var imgCount = paragraph.getElementsByTagName("img").length; | ||||
|       var embedCount = paragraph.getElementsByTagName("embed").length; | ||||
|       var objectCount = paragraph.getElementsByTagName("object").length; | ||||
|       // At this point, nasty iframes have been removed, only remain embedded video ones. | ||||
|       var iframeCount = paragraph.getElementsByTagName('iframe').length; | ||||
|       var iframeCount = paragraph.getElementsByTagName("iframe").length; | ||||
|       var totalCount = imgCount + embedCount + objectCount + iframeCount; | ||||
|  | ||||
|       return totalCount === 0 && !this._getInnerText(paragraph, false); | ||||
| @@ -612,6 +641,19 @@ Readability.prototype = { | ||||
|       if (next && next.tagName == "P") | ||||
|         br.parentNode.removeChild(br); | ||||
|     }); | ||||
|  | ||||
|     // Remove single-cell tables | ||||
|     this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) { | ||||
|       var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table; | ||||
|       if (this._hasSingleTagInsideElement(tbody, "TR")) { | ||||
|         var row = tbody.firstElementChild; | ||||
|         if (this._hasSingleTagInsideElement(row, "TD")) { | ||||
|           var cell = row.firstElementChild; | ||||
|           cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV"); | ||||
|           table.parentNode.replaceChild(cell, table); | ||||
|         } | ||||
|       } | ||||
|     }); | ||||
|   }, | ||||
|  | ||||
|   /** | ||||
| @@ -625,34 +667,34 @@ Readability.prototype = { | ||||
|     node.readability = {"contentScore": 0}; | ||||
|  | ||||
|     switch (node.tagName) { | ||||
|       case 'DIV': | ||||
|       case "DIV": | ||||
|         node.readability.contentScore += 5; | ||||
|         break; | ||||
|  | ||||
|       case 'PRE': | ||||
|       case 'TD': | ||||
|       case 'BLOCKQUOTE': | ||||
|       case "PRE": | ||||
|       case "TD": | ||||
|       case "BLOCKQUOTE": | ||||
|         node.readability.contentScore += 3; | ||||
|         break; | ||||
|  | ||||
|       case 'ADDRESS': | ||||
|       case 'OL': | ||||
|       case 'UL': | ||||
|       case 'DL': | ||||
|       case 'DD': | ||||
|       case 'DT': | ||||
|       case 'LI': | ||||
|       case 'FORM': | ||||
|       case "ADDRESS": | ||||
|       case "OL": | ||||
|       case "UL": | ||||
|       case "DL": | ||||
|       case "DD": | ||||
|       case "DT": | ||||
|       case "LI": | ||||
|       case "FORM": | ||||
|         node.readability.contentScore -= 3; | ||||
|         break; | ||||
|  | ||||
|       case 'H1': | ||||
|       case 'H2': | ||||
|       case 'H3': | ||||
|       case 'H4': | ||||
|       case 'H5': | ||||
|       case 'H6': | ||||
|       case 'TH': | ||||
|       case "H1": | ||||
|       case "H2": | ||||
|       case "H3": | ||||
|       case "H4": | ||||
|       case "H5": | ||||
|       case "H6": | ||||
|       case "TH": | ||||
|         node.readability.contentScore -= 5; | ||||
|         break; | ||||
|     } | ||||
| @@ -691,37 +733,6 @@ Readability.prototype = { | ||||
|     return node && node.nextElementSibling; | ||||
|   }, | ||||
|  | ||||
|   /** | ||||
|    * Like _getNextNode, but for DOM implementations with no | ||||
|    * firstElementChild/nextElementSibling functionality... | ||||
|    */ | ||||
|   _getNextNodeNoElementProperties: function(node, ignoreSelfAndKids) { | ||||
|     function nextSiblingEl(n) { | ||||
|       do { | ||||
|         n = n.nextSibling; | ||||
|       } while (n && n.nodeType !== n.ELEMENT_NODE); | ||||
|       return n; | ||||
|     } | ||||
|     // First check for kids if those aren't being ignored | ||||
|     if (!ignoreSelfAndKids && node.children[0]) { | ||||
|       return node.children[0]; | ||||
|     } | ||||
|     // Then for siblings... | ||||
|     var next = nextSiblingEl(node); | ||||
|     if (next) { | ||||
|       return next; | ||||
|     } | ||||
|     // And finally, move up the parent chain *and* find a sibling | ||||
|     // (because this is depth-first traversal, we will have already | ||||
|     // seen the parent nodes themselves). | ||||
|     do { | ||||
|       node = node.parentNode; | ||||
|       if (node) | ||||
|         next = nextSiblingEl(node); | ||||
|     } while (node && !next); | ||||
|     return node && next; | ||||
|   }, | ||||
|  | ||||
|   _checkByline: function(node, matchString) { | ||||
|     if (this._articleByline) { | ||||
|       return false; | ||||
| @@ -729,9 +740,10 @@ Readability.prototype = { | ||||
|  | ||||
|     if (node.getAttribute !== undefined) { | ||||
|       var rel = node.getAttribute("rel"); | ||||
|       var itemprop = node.getAttribute("itemprop"); | ||||
|     } | ||||
|  | ||||
|     if ((rel === "author" || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) { | ||||
|     if ((rel === "author" || (itemprop && itemprop.indexOf("author") !== -1) || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) { | ||||
|       this._articleByline = node.textContent.trim(); | ||||
|       return true; | ||||
|     } | ||||
| @@ -784,6 +796,12 @@ Readability.prototype = { | ||||
|       while (node) { | ||||
|         var matchString = node.className + " " + node.id; | ||||
|  | ||||
|         if (!this._isProbablyVisible(node)) { | ||||
|           this.log("Removing hidden node - " + matchString); | ||||
|           node = this._removeAndGetNext(node); | ||||
|           continue; | ||||
|         } | ||||
|  | ||||
|         // Check to see if this node is a byline, and remove it if it is. | ||||
|         if (this._checkByline(node, matchString)) { | ||||
|           node = this._removeAndGetNext(node); | ||||
| @@ -794,6 +812,7 @@ Readability.prototype = { | ||||
|         if (stripUnlikelyCandidates) { | ||||
|           if (this.REGEXPS.unlikelyCandidates.test(matchString) && | ||||
|               !this.REGEXPS.okMaybeItsACandidate.test(matchString) && | ||||
|               !this._hasAncestorTag(node, "table") && | ||||
|               node.tagName !== "BODY" && | ||||
|               node.tagName !== "A") { | ||||
|             this.log("Removing unlikely candidate - " + matchString); | ||||
| @@ -826,12 +845,14 @@ Readability.prototype = { | ||||
|               if (p !== null) { | ||||
|                 p.appendChild(childNode); | ||||
|               } else if (!this._isWhitespace(childNode)) { | ||||
|                 p = doc.createElement('p'); | ||||
|                 p = doc.createElement("p"); | ||||
|                 node.replaceChild(p, childNode); | ||||
|                 p.appendChild(childNode); | ||||
|               } | ||||
|             } else if (p !== null) { | ||||
|               while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild); | ||||
|               while (p.lastChild && this._isWhitespace(p.lastChild)) { | ||||
|                 p.removeChild(p.lastChild); | ||||
|               } | ||||
|               p = null; | ||||
|             } | ||||
|             childNode = nextSibling; | ||||
| @@ -841,7 +862,7 @@ Readability.prototype = { | ||||
|           // element. DIVs with only a P element inside and no text content can be | ||||
|           // safely converted into plain P elements to avoid confusing the scoring | ||||
|           // algorithm with DIVs with are, in practice, paragraphs. | ||||
|           if (this._hasSinglePInsideElement(node) && this._getLinkDensity(node) < 0.25) { | ||||
|           if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) { | ||||
|             var newNode = node.children[0]; | ||||
|             node.parentNode.replaceChild(newNode, node); | ||||
|             node = newNode; | ||||
| @@ -862,7 +883,7 @@ Readability.prototype = { | ||||
|       **/ | ||||
|       var candidates = []; | ||||
|       this._forEachNode(elementsToScore, function(elementToScore) { | ||||
|         if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === 'undefined') | ||||
|         if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined") | ||||
|           return; | ||||
|  | ||||
|         // If this paragraph is less than 25 characters, don't even count it. | ||||
| @@ -881,17 +902,17 @@ Readability.prototype = { | ||||
|         contentScore += 1; | ||||
|  | ||||
|         // Add points for any commas within this paragraph. | ||||
|         contentScore += innerText.split(',').length; | ||||
|         contentScore += innerText.split(",").length; | ||||
|  | ||||
|         // For every 100 characters in this paragraph, add another point. Up to 3 points. | ||||
|         contentScore += Math.min(Math.floor(innerText.length / 100), 3); | ||||
|  | ||||
|         // Initialize and score ancestors. | ||||
|         this._forEachNode(ancestors, function(ancestor, level) { | ||||
|           if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === 'undefined') | ||||
|           if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === "undefined") | ||||
|             return; | ||||
|  | ||||
|           if (typeof(ancestor.readability) === 'undefined') { | ||||
|           if (typeof(ancestor.readability) === "undefined") { | ||||
|             this._initializeNode(ancestor); | ||||
|             candidates.push(ancestor); | ||||
|           } | ||||
| @@ -922,7 +943,7 @@ Readability.prototype = { | ||||
|         var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate)); | ||||
|         candidate.readability.contentScore = candidateScore; | ||||
|  | ||||
|         this.log('Candidate:', candidate, "with score " + candidateScore); | ||||
|         this.log("Candidate:", candidate, "with score " + candidateScore); | ||||
|  | ||||
|         for (var t = 0; t < this._nbTopCandidates; t++) { | ||||
|           var aTopCandidate = topCandidates[t]; | ||||
| @@ -1041,8 +1062,8 @@ Readability.prototype = { | ||||
|         var sibling = siblings[s]; | ||||
|         var append = false; | ||||
|  | ||||
|         this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : ''); | ||||
|         this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : 'Unknown'); | ||||
|         this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : ""); | ||||
|         this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : "Unknown"); | ||||
|  | ||||
|         if (sibling === topCandidate) { | ||||
|           append = true; | ||||
| @@ -1076,7 +1097,7 @@ Readability.prototype = { | ||||
|           if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) { | ||||
|             // We have a node that isn't a common block level element, like a form or td tag. | ||||
|             // Turn it into a div so it doesn't get filtered out later by accident. | ||||
|             this.log("Altering sibling:", sibling, 'to div.'); | ||||
|             this.log("Altering sibling:", sibling, "to div."); | ||||
|  | ||||
|             sibling = this._setNodeTag(sibling, "DIV"); | ||||
|           } | ||||
| @@ -1144,7 +1165,7 @@ Readability.prototype = { | ||||
|           this._attempts.push({articleContent: articleContent, textLength: textLength}); | ||||
|           // No luck after removing flags, just return the longest text we found during the different loops | ||||
|           this._attempts.sort(function (a, b) { | ||||
|             return a.textLength < b.textLength; | ||||
|             return b.textLength - a.textLength; | ||||
|           }); | ||||
|  | ||||
|           // But first check if we actually have something | ||||
| @@ -1184,7 +1205,7 @@ Readability.prototype = { | ||||
|    * @return Boolean - whether the input string is a byline. | ||||
|    */ | ||||
|   _isValidByline: function(byline) { | ||||
|     if (typeof byline == 'string' || byline instanceof String) { | ||||
|     if (typeof byline == "string" || byline instanceof String) { | ||||
|       byline = byline.trim(); | ||||
|       return (byline.length > 0) && (byline.length < 100); | ||||
|     } | ||||
| @@ -1201,61 +1222,75 @@ Readability.prototype = { | ||||
|     var values = {}; | ||||
|     var metaElements = this._doc.getElementsByTagName("meta"); | ||||
|  | ||||
|     // Match "description", or Twitter's "twitter:description" (Cards) | ||||
|     // in name attribute. | ||||
|     var namePattern = /^\s*((twitter)\s*:\s*)?(description|title)\s*$/gi; | ||||
|     // property is a space-separated list of values | ||||
|     var propertyPattern = /\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*/gi; | ||||
|  | ||||
|     // Match Facebook's Open Graph title & description properties. | ||||
|     var propertyPattern = /^\s*og\s*:\s*(description|title)\s*$/gi; | ||||
|     // name is a single value | ||||
|     var namePattern = /^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$/i; | ||||
|  | ||||
|     // Find description tags. | ||||
|     this._forEachNode(metaElements, function(element) { | ||||
|       var elementName = element.getAttribute("name"); | ||||
|       var elementProperty = element.getAttribute("property"); | ||||
|  | ||||
|       if ([elementName, elementProperty].indexOf("author") !== -1) { | ||||
|         metadata.byline = element.getAttribute("content"); | ||||
|       var content = element.getAttribute("content"); | ||||
|       if (!content) { | ||||
|         return; | ||||
|       } | ||||
|  | ||||
|       var matches = null; | ||||
|       var name = null; | ||||
|       if (namePattern.test(elementName)) { | ||||
|         name = elementName; | ||||
|       } else if (propertyPattern.test(elementProperty)) { | ||||
|         name = elementProperty; | ||||
|       } | ||||
|  | ||||
|       if (name) { | ||||
|         var content = element.getAttribute("content"); | ||||
|       if (elementProperty) { | ||||
|         matches = elementProperty.match(propertyPattern); | ||||
|         if (matches) { | ||||
|           for (var i = matches.length - 1; i >= 0; i--) { | ||||
|             // Convert to lowercase, and remove any whitespace | ||||
|             // so we can match below. | ||||
|             name = matches[i].toLowerCase().replace(/\s/g, ""); | ||||
|             // multiple authors | ||||
|             values[name] = content.trim(); | ||||
|           } | ||||
|         } | ||||
|       } | ||||
|       if (!matches && elementName && namePattern.test(elementName)) { | ||||
|         name = elementName; | ||||
|         if (content) { | ||||
|           // Convert to lowercase and remove any whitespace | ||||
|           // so we can match below. | ||||
|           name = name.toLowerCase().replace(/\s/g, ''); | ||||
|           // Convert to lowercase, remove any whitespace, and convert dots | ||||
|           // to colons so we can match below. | ||||
|           name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":"); | ||||
|           values[name] = content.trim(); | ||||
|         } | ||||
|       } | ||||
|     }); | ||||
|  | ||||
|     if ("description" in values) { | ||||
|       metadata.excerpt = values["description"]; | ||||
|     } else if ("og:description" in values) { | ||||
|       // Use facebook open graph description. | ||||
|       metadata.excerpt = values["og:description"]; | ||||
|     } else if ("twitter:description" in values) { | ||||
|       // Use twitter cards description. | ||||
|       metadata.excerpt = values["twitter:description"]; | ||||
|     // get title | ||||
|     metadata.title = values["dc:title"] || | ||||
|                      values["dcterm:title"] || | ||||
|                      values["og:title"] || | ||||
|                      values["weibo:article:title"] || | ||||
|                      values["weibo:webpage:title"] || | ||||
|                      values["title"] || | ||||
|                      values["twitter:title"]; | ||||
|  | ||||
|     if (!metadata.title) { | ||||
|       metadata.title = this._getArticleTitle(); | ||||
|     } | ||||
|  | ||||
|     metadata.title = this._getArticleTitle(); | ||||
|     if (!metadata.title) { | ||||
|       if ("og:title" in values) { | ||||
|         // Use facebook open graph title. | ||||
|         metadata.title = values["og:title"]; | ||||
|       } else if ("twitter:title" in values) { | ||||
|         // Use twitter cards title. | ||||
|         metadata.title = values["twitter:title"]; | ||||
|       } | ||||
|     } | ||||
|     // get author | ||||
|     metadata.byline = values["dc:creator"] || | ||||
|                       values["dcterm:creator"] || | ||||
|                       values["author"]; | ||||
|  | ||||
|     // get description | ||||
|     metadata.excerpt = values["dc:description"] || | ||||
|                        values["dcterm:description"] || | ||||
|                        values["og:description"] || | ||||
|                        values["weibo:article:description"] || | ||||
|                        values["weibo:webpage:description"] || | ||||
|                        values["description"] || | ||||
|                        values["twitter:description"]; | ||||
|  | ||||
|     // get site name | ||||
|     metadata.siteName = values["og:site_name"]; | ||||
|  | ||||
|     return metadata; | ||||
|   }, | ||||
| @@ -1266,24 +1301,25 @@ Readability.prototype = { | ||||
|    * @param Element | ||||
|   **/ | ||||
|   _removeScripts: function(doc) { | ||||
|     this._removeNodes(doc.getElementsByTagName('script'), function(scriptNode) { | ||||
|     this._removeNodes(doc.getElementsByTagName("script"), function(scriptNode) { | ||||
|       scriptNode.nodeValue = ""; | ||||
|       scriptNode.removeAttribute('src'); | ||||
|       scriptNode.removeAttribute("src"); | ||||
|       return true; | ||||
|     }); | ||||
|     this._removeNodes(doc.getElementsByTagName('noscript')); | ||||
|     this._removeNodes(doc.getElementsByTagName("noscript")); | ||||
|   }, | ||||
|  | ||||
|   /** | ||||
|    * Check if this node has only whitespace and a single P element | ||||
|    * Check if this node has only whitespace and a single element with given tag | ||||
|    * Returns false if the DIV node contains non-empty text nodes | ||||
|    * or if it contains no P or more than 1 element. | ||||
|    * or if it contains no element with given tag or more than 1 element. | ||||
|    * | ||||
|    * @param Element | ||||
|    * @param string tag of child element | ||||
|   **/ | ||||
|   _hasSinglePInsideElement: function(element) { | ||||
|     // There should be exactly 1 element child which is a P: | ||||
|     if (element.children.length != 1 || element.children[0].tagName !== "P") { | ||||
|   _hasSingleTagInsideElement: function(element, tag) { | ||||
|     // There should be exactly 1 element child with given tag | ||||
|     if (element.children.length != 1 || element.children[0].tagName !== tag) { | ||||
|       return false; | ||||
|     } | ||||
|  | ||||
| @@ -1337,7 +1373,7 @@ Readability.prototype = { | ||||
|    * @return string | ||||
|   **/ | ||||
|   _getInnerText: function(e, normalizeSpaces) { | ||||
|     normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces; | ||||
|     normalizeSpaces = (typeof normalizeSpaces === "undefined") ? true : normalizeSpaces; | ||||
|     var textContent = e.textContent.trim(); | ||||
|  | ||||
|     if (normalizeSpaces) { | ||||
| @@ -1366,7 +1402,7 @@ Readability.prototype = { | ||||
|    * @return void | ||||
|   **/ | ||||
|   _cleanStyles: function(e) { | ||||
|     if (!e || e.tagName.toLowerCase() === 'svg') | ||||
|     if (!e || e.tagName.toLowerCase() === "svg") | ||||
|       return; | ||||
|  | ||||
|     // Remove `style` and deprecated presentational attributes | ||||
| @@ -1375,8 +1411,8 @@ Readability.prototype = { | ||||
|     } | ||||
|  | ||||
|     if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) { | ||||
|       e.removeAttribute('width'); | ||||
|       e.removeAttribute('height'); | ||||
|       e.removeAttribute("width"); | ||||
|       e.removeAttribute("height"); | ||||
|     } | ||||
|  | ||||
|     var cur = e.firstElementChild; | ||||
| @@ -1422,7 +1458,7 @@ Readability.prototype = { | ||||
|     var weight = 0; | ||||
|  | ||||
|     // Look for a special classname | ||||
|     if (typeof(e.className) === 'string' && e.className !== '') { | ||||
|     if (typeof(e.className) === "string" && e.className !== "") { | ||||
|       if (this.REGEXPS.negative.test(e.className)) | ||||
|         weight -= 25; | ||||
|  | ||||
| @@ -1431,7 +1467,7 @@ Readability.prototype = { | ||||
|     } | ||||
|  | ||||
|     // Look for a special ID | ||||
|     if (typeof(e.id) === 'string' && e.id !== '') { | ||||
|     if (typeof(e.id) === "string" && e.id !== "") { | ||||
|       if (this.REGEXPS.negative.test(e.id)) | ||||
|         weight -= 25; | ||||
|  | ||||
| @@ -1456,17 +1492,17 @@ Readability.prototype = { | ||||
|     this._removeNodes(e.getElementsByTagName(tag), function(element) { | ||||
|       // Allow youtube and vimeo videos through as people usually want to see those. | ||||
|       if (isEmbed) { | ||||
|         var attributeValues = [].map.call(element.attributes, function(attr) { | ||||
|           return attr.value; | ||||
|         }).join("|"); | ||||
|  | ||||
|         // First, check the elements attributes to see if any of them contain youtube or vimeo | ||||
|         if (this.REGEXPS.videos.test(attributeValues)) | ||||
|           return false; | ||||
|         for (var i = 0; i < element.attributes.length; i++) { | ||||
|           if (this.REGEXPS.videos.test(element.attributes[i].value)) { | ||||
|             return false; | ||||
|           } | ||||
|         } | ||||
|  | ||||
|         // Then check the elements inside this element for the same. | ||||
|         if (this.REGEXPS.videos.test(element.innerHTML)) | ||||
|         // For embed with <object> tag, check inner HTML as well. | ||||
|         if (element.tagName === "object" && this.REGEXPS.videos.test(element.innerHTML)) { | ||||
|           return false; | ||||
|         } | ||||
|       } | ||||
|  | ||||
|       return true; | ||||
| @@ -1584,6 +1620,39 @@ Readability.prototype = { | ||||
|     } | ||||
|   }, | ||||
|  | ||||
|   /* convert images and figures that have properties like data-src into images that can be loaded without JS */ | ||||
|   _fixLazyImages: function (root) { | ||||
|     this._forEachNode(this._getAllNodesWithTag(root, ["img", "picture", "figure"]), function (elem) { | ||||
|       // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580 | ||||
|       if ((!elem.src && (!elem.srcset || elem.srcset == "null")) || elem.className.toLowerCase().indexOf("lazy") !== -1) { | ||||
|         for (var i = 0; i < elem.attributes.length; i++) { | ||||
|           var attr = elem.attributes[i]; | ||||
|           if (attr.name === "src" || attr.name === "srcset") { | ||||
|             continue; | ||||
|           } | ||||
|           var copyTo = null; | ||||
|           if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) { | ||||
|             copyTo = "srcset"; | ||||
|           } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) { | ||||
|             copyTo = "src"; | ||||
|           } | ||||
|           if (copyTo) { | ||||
|             //if this is an img or picture, set the attribute directly | ||||
|             if (elem.tagName === "IMG" || elem.tagName === "PICTURE") { | ||||
|               elem.setAttribute(copyTo, attr.value); | ||||
|             } else if (elem.tagName === "FIGURE" && !this._getAllNodesWithTag(elem, ["img", "picture"]).length) { | ||||
|               //if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure | ||||
|               //see the nytimes-3 testcase for an example | ||||
|               var img = this._doc.createElement("img"); | ||||
|               img.setAttribute(copyTo, attr.value); | ||||
|               elem.appendChild(img); | ||||
|             } | ||||
|           } | ||||
|         } | ||||
|       } | ||||
|     }); | ||||
|   }, | ||||
|  | ||||
|   /** | ||||
|    * Clean an element of all tags of type "tag" if they look fishy. | ||||
|    * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. | ||||
| @@ -1602,11 +1671,16 @@ Readability.prototype = { | ||||
|     // | ||||
|     // TODO: Consider taking into account original contentScore here. | ||||
|     this._removeNodes(e.getElementsByTagName(tag), function(node) { | ||||
|       // First check if we're in a data table, in which case don't remove us. | ||||
|       // First check if this node IS data table, in which case don't remove it. | ||||
|       var isDataTable = function(t) { | ||||
|         return t._readabilityDataTable; | ||||
|       }; | ||||
|  | ||||
|       if (tag === "table" && isDataTable(node)) { | ||||
|         return false; | ||||
|       } | ||||
|  | ||||
|       // Next check if we're inside a data table, in which case don't remove it as well. | ||||
|       if (this._hasAncestorTag(node, "table", -1, isDataTable)) { | ||||
|         return false; | ||||
|       } | ||||
| @@ -1620,7 +1694,7 @@ Readability.prototype = { | ||||
|         return true; | ||||
|       } | ||||
|  | ||||
|       if (this._getCharCount(node, ',') < 10) { | ||||
|       if (this._getCharCount(node, ",") < 10) { | ||||
|         // If there are not very many commas, and the number of | ||||
|         // non-paragraph elements is more than paragraphs or other | ||||
|         // ominous signs, remove the element. | ||||
| @@ -1630,10 +1704,25 @@ Readability.prototype = { | ||||
|         var input = node.getElementsByTagName("input").length; | ||||
|  | ||||
|         var embedCount = 0; | ||||
|         var embeds = node.getElementsByTagName("embed"); | ||||
|         for (var ei = 0, il = embeds.length; ei < il; ei += 1) { | ||||
|           if (!this.REGEXPS.videos.test(embeds[ei].src)) | ||||
|             embedCount += 1; | ||||
|         var embeds = this._concatNodeLists( | ||||
|           node.getElementsByTagName("object"), | ||||
|           node.getElementsByTagName("embed"), | ||||
|           node.getElementsByTagName("iframe")); | ||||
|  | ||||
|         for (var i = 0; i < embeds.length; i++) { | ||||
|           // If this embed has attribute that matches video regex, don't delete it. | ||||
|           for (var j = 0; j < embeds[i].attributes.length; j++) { | ||||
|             if (this.REGEXPS.videos.test(embeds[i].attributes[j].value)) { | ||||
|               return false; | ||||
|             } | ||||
|           } | ||||
|  | ||||
|           // For embed with <object> tag, check inner HTML as well. | ||||
|           if (embeds[i].tagName === "object" && this.REGEXPS.videos.test(embeds[i].innerHTML)) { | ||||
|             return false; | ||||
|           } | ||||
|  | ||||
|           embedCount++; | ||||
|         } | ||||
|  | ||||
|         var linkDensity = this._getLinkDensity(node); | ||||
| @@ -1654,17 +1743,17 @@ Readability.prototype = { | ||||
|   }, | ||||
|  | ||||
|   /** | ||||
|    * Clean out elements whose id/class combinations match specific string. | ||||
|    * Clean out elements that match the specified conditions | ||||
|    * | ||||
|    * @param Element | ||||
|    * @param RegExp match id/class combination. | ||||
|    * @param Function determines whether a node should be removed | ||||
|    * @return void | ||||
|    **/ | ||||
|   _cleanMatchedNodes: function(e, regex) { | ||||
|   _cleanMatchedNodes: function(e, filter) { | ||||
|     var endOfSearchMarkerNode = this._getNextNode(e, true); | ||||
|     var next = this._getNextNode(e); | ||||
|     while (next && next != endOfSearchMarkerNode) { | ||||
|       if (regex.test(next.className + " " + next.id)) { | ||||
|       if (filter(next, next.className + " " + next.id)) { | ||||
|         next = this._removeAndGetNext(next); | ||||
|       } else { | ||||
|         next = this._getNextNode(next); | ||||
| @@ -1680,7 +1769,7 @@ Readability.prototype = { | ||||
|   **/ | ||||
|   _cleanHeaders: function(e) { | ||||
|     for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) { | ||||
|       this._removeNodes(e.getElementsByTagName('h' + headerIndex), function (header) { | ||||
|       this._removeNodes(e.getElementsByTagName("h" + headerIndex), function (header) { | ||||
|         return this._getClassWeight(header) < 0; | ||||
|       }); | ||||
|     } | ||||
| @@ -1694,63 +1783,8 @@ Readability.prototype = { | ||||
|     this._flags = this._flags & ~flag; | ||||
|   }, | ||||
|  | ||||
|   /** | ||||
|    * Decides whether or not the document is reader-able without parsing the whole thing. | ||||
|    * | ||||
|    * @return boolean Whether or not we suspect parse() will suceeed at returning an article object. | ||||
|    */ | ||||
|   isProbablyReaderable: function(helperIsVisible) { | ||||
|     var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]); | ||||
|  | ||||
|     // Get <div> nodes which have <br> node(s) and append them into the `nodes` variable. | ||||
|     // Some articles' DOM structures might look like | ||||
|     // <div> | ||||
|     //   Sentences<br> | ||||
|     //   <br> | ||||
|     //   Sentences<br> | ||||
|     // </div> | ||||
|     var brNodes = this._getAllNodesWithTag(this._doc, ["div > br"]); | ||||
|     if (brNodes.length) { | ||||
|       var set = new Set(); | ||||
|       [].forEach.call(brNodes, function(node) { | ||||
|         set.add(node.parentNode); | ||||
|       }); | ||||
|       nodes = [].concat.apply(Array.from(set), nodes); | ||||
|     } | ||||
|  | ||||
|     // FIXME we should have a fallback for helperIsVisible, but this is | ||||
|     // problematic because of jsdom's elem.style handling - see | ||||
|     // https://github.com/mozilla/readability/pull/186 for context. | ||||
|  | ||||
|     var score = 0; | ||||
|     // This is a little cheeky, we use the accumulator 'score' to decide what to return from | ||||
|     // this callback: | ||||
|     return this._someNode(nodes, function(node) { | ||||
|       if (helperIsVisible && !helperIsVisible(node)) | ||||
|         return false; | ||||
|       var matchString = node.className + " " + node.id; | ||||
|  | ||||
|       if (this.REGEXPS.unlikelyCandidates.test(matchString) && | ||||
|           !this.REGEXPS.okMaybeItsACandidate.test(matchString)) { | ||||
|         return false; | ||||
|       } | ||||
|  | ||||
|       if (node.matches && node.matches("li p")) { | ||||
|         return false; | ||||
|       } | ||||
|  | ||||
|       var textContentLength = node.textContent.trim().length; | ||||
|       if (textContentLength < 140) { | ||||
|         return false; | ||||
|       } | ||||
|  | ||||
|       score += Math.sqrt(textContentLength - 140); | ||||
|  | ||||
|       if (score > 20) { | ||||
|         return true; | ||||
|       } | ||||
|       return false; | ||||
|     }); | ||||
|   _isProbablyVisible: function(node) { | ||||
|     return (!node.style || node.style.display != "none") && !node.hasAttribute("hidden"); | ||||
|   }, | ||||
|  | ||||
|   /** | ||||
| @@ -1774,9 +1808,6 @@ Readability.prototype = { | ||||
|       } | ||||
|     } | ||||
|  | ||||
|     if (typeof this._doc.documentElement.firstElementChild === "undefined") { | ||||
|       this._getNextNode = this._getNextNodeNoElementProperties; | ||||
|     } | ||||
|     // Remove script tags from the document. | ||||
|     this._removeScripts(this._doc); | ||||
|  | ||||
| @@ -1812,6 +1843,7 @@ Readability.prototype = { | ||||
|       textContent: textContent, | ||||
|       length: textContent.length, | ||||
|       excerpt: metadata.excerpt, | ||||
|       siteName: metadata.siteName || this._articleSiteName | ||||
|     }; | ||||
|   } | ||||
| }; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user