Clipper: Improved: Updated Readability library to improve Simplified Page clipping

2025-10-31 00:07:48 +02:00 · 2019-05-10 00:05:23 +01:00
parent 68268cb35d
commit 356f8e580b
3 changed files with 312 additions and 247 deletions
--- a/Clipper/joplin-webclipper/content_scripts/JSDOMParser.js
+++ b/Clipper/joplin-webclipper/content_scripts/JSDOMParser.js
@@ -1,3 +1,5 @@
+// https://github.com/mozilla/readability/tree/814f0a3884350b6f1adfdebb79ca3599e9806605
+
 /*eslint-env es6:false*/
 /* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
@@ -496,17 +498,9 @@
    },
    setValue: function(newValue) {
      this._value = newValue;
-      delete this._decodedValue;
    },
-    setDecodedValue: function(newValue) {
-      this._value = encodeHTML(newValue);
-      this._decodedValue = newValue;
-    },
-    getDecodedValue: function() {
-      if (typeof this._decodedValue === "undefined") {
-        this._decodedValue = (this._value && decodeHTML(this._value)) || "";
-      }
-      return this._decodedValue;
+    getEncodedValue: function() {
+      return encodeHTML(this._value);
    },
  };

@@ -611,6 +605,13 @@
  };

  var Element = function (tag) {
+    // We use this to find the closing tag.
+    this._matchingTag = tag;
+    // We're explicitly a non-namespace aware parser, we just pretend it's all HTML.
+    var lastColonIndex = tag.lastIndexOf(":");
+    if (lastColonIndex != -1) {
+      tag = tag.substring(lastColonIndex + 1);
+    }
    this.attributes = [];
    this.childNodes = [];
    this.children = [];
@@ -659,6 +660,14 @@
      this.setAttribute("src", str);
    },

+    get srcset() {
+      return this.getAttribute("srcset") || "";
+    },
+
+    set srcset(str) {
+      this.setAttribute("srcset", str);
+    },
+
    get nodeName() {
      return this.tagName;
    },
@@ -675,9 +684,9 @@
            for (var j = 0; j < child.attributes.length; j++) {
              var attr = child.attributes[j];
              // the attribute value will be HTML escaped.
-              var val = attr.value;
+              var val = attr.getEncodedValue();
              var quote = (val.indexOf('"') === -1 ? '"' : "'");
-              arr.push(" " + attr.name + '=' + quote + val + quote);
+              arr.push(" " + attr.name + "=" + quote + val + quote);
            }

            if (child.localName in voidElems && !child.childNodes.length) {
@@ -753,8 +762,9 @@
    getAttribute: function (name) {
      for (var i = this.attributes.length; --i >= 0;) {
        var attr = this.attributes[i];
-        if (attr.name === name)
-          return attr.getDecodedValue();
+        if (attr.name === name) {
+          return attr.value;
+        }
      }
      return undefined;
    },
@@ -763,11 +773,11 @@
      for (var i = this.attributes.length; --i >= 0;) {
        var attr = this.attributes[i];
        if (attr.name === name) {
-          attr.setDecodedValue(value);
+          attr.setValue(value);
          return;
        }
      }
-      this.attributes.push(new Attribute(name, encodeHTML(value)));
+      this.attributes.push(new Attribute(name, value));
    },

    removeAttribute: function (name) {
@@ -778,7 +788,13 @@
          break;
        }
      }
-    }
+    },
+
+    hasAttribute: function (name) {
+      return this.attributes.some(function (attr) {
+        return attr.name == name;
+      });
+    },
  };

  var Style = function (node) {
@@ -925,7 +941,7 @@
      // Read the attribute value (and consume the matching quote)
      var value = this.readString(c);

-      node.attributes.push(new Attribute(name, value));
+      node.attributes.push(new Attribute(name, decodeHTML(value)));

      return;
    },
@@ -950,7 +966,7 @@
        strBuf.push(c);
        c = this.nextChar();
      }
-      var tag = strBuf.join('');
+      var tag = strBuf.join("");

      if (!tag)
        return false;
@@ -961,7 +977,9 @@
      while (c !== "/" && c !== ">") {
        if (c === undefined)
          return false;
-        while (whitespace.indexOf(this.html[this.currentChar++]) != -1);
+        while (whitespace.indexOf(this.html[this.currentChar++]) != -1) {
+          // Advance cursor to first non-whitespace char.
+        }
        this.currentChar--;
        c = this.nextChar();
        if (c !== "/" && c !== ">") {
@@ -1055,9 +1073,10 @@
        return null;

      // Read any text as Text node
+      var textNode;
      if (c !== "<") {
        --this.currentChar;
-        var textNode = new Text();
+        textNode = new Text();
        var n = this.html.indexOf("<", this.currentChar);
        if (n === -1) {
          textNode.innerHTML = this.html.substring(this.currentChar, this.html.length);
@@ -1069,6 +1088,18 @@
        return textNode;
      }

+      if (this.match("![CDATA[")) {
+        var endChar = this.html.indexOf("]]>", this.currentChar);
+        if (endChar === -1) {
+          this.error("unclosed CDATA section");
+          return null;
+        }
+        textNode = new Text();
+        textNode.textContent = this.html.substring(this.currentChar, endChar);
+        this.currentChar = endChar + ("]]>").length;
+        return textNode;
+      }
+
      c = this.peekNext();

      // Read Comment node. Normally, Comment nodes know their inner
@@ -1100,7 +1131,7 @@
      // If this isn't a void Element, read its child nodes
      if (!closed) {
        this.readChildren(node);
-        var closingTag = "</" + localName + ">";
+        var closingTag = "</" + node._matchingTag + ">";
        if (!this.match(closingTag)) {
          this.error("expected '" + closingTag + "' and got " + this.html.substr(this.currentChar, closingTag.length));
          return null;
--- a/Clipper/joplin-webclipper/content_scripts/Readability-readerable.js
+++ b/Clipper/joplin-webclipper/content_scripts/Readability-readerable.js
@@ -1,3 +1,5 @@
+// https://github.com/mozilla/readability/tree/814f0a3884350b6f1adfdebb79ca3599e9806605
+
 /* eslint-env es6:false */
 /* globals exports */
 /*
--- a/Clipper/joplin-webclipper/content_scripts/Readability.js
+++ b/Clipper/joplin-webclipper/content_scripts/Readability.js
@@ -1,3 +1,5 @@
+// https://github.com/mozilla/readability/tree/814f0a3884350b6f1adfdebb79ca3599e9806605
+
 /*eslint-env es6:false*/
 /*
 * Copyright (c) 2010 Arc90 Inc
@@ -39,6 +41,7 @@ function Readability(doc, options) {
  this._articleTitle = null;
  this._articleByline = null;
  this._articleDir = null;
+  this._articleSiteName = null;
  this._attempts = [];

  // Configurable options
@@ -111,15 +114,18 @@ Readability.prototype = {
  // All of the regular expressions in use within readability.
  // Defined up here so we don't instantiate them repeatedly in loops.
  REGEXPS: {
-    unlikelyCandidates: /banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
+    // NOTE: These two regular expressions are duplicated in
+    // Readability-readerable.js. Please keep both copies in sync.
+    unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
    okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
+
    positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
-    negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
+    negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
    extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
    byline: /byline|author|dateline|writtenby|p-author/i,
    replaceFonts: /<(\/?)font[^>]*>/gi,
    normalize: /\s{2,}/g,
-    videos: /\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
+    videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
    nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
    prevLink: /(prev|earl|old|new|<|«)/i,
    whitespace: /^\s*$/,
@@ -260,7 +266,7 @@ Readability.prototype = {

  _getAllNodesWithTag: function(node, tagNames) {
    if (node.querySelectorAll) {
-      return node.querySelectorAll(tagNames.join(','));
+      return node.querySelectorAll(tagNames.join(","));
    }
    return [].concat.apply([], tagNames.map(function(tag) {
      var collection = node.getElementsByTagName(tag);
@@ -320,7 +326,7 @@ Readability.prototype = {
      return uri;
    }

-    var links = articleContent.getElementsByTagName("a");
+    var links = this._getAllNodesWithTag(articleContent, ["a"]);
    this._forEachNode(links, function(link) {
      var href = link.getAttribute("href");
      if (href) {
@@ -335,7 +341,7 @@ Readability.prototype = {
      }
    });

-    var imgs = articleContent.getElementsByTagName("img");
+    var imgs = this._getAllNodesWithTag(articleContent, ["img"]);
    this._forEachNode(imgs, function(img) {
      var src = img.getAttribute("src");
      if (src) {
@@ -355,11 +361,11 @@ Readability.prototype = {
    var origTitle = "";

    try {
-      curTitle = origTitle = doc.title;
+      curTitle = origTitle = doc.title.trim();

      // If they had an element with id "title" in their HTML
      if (typeof curTitle !== "string")
-        curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]);
+        curTitle = origTitle = this._getInnerText(doc.getElementsByTagName("title")[0]);
    } catch (e) {/* ignore exceptions setting the title. */}

    var titleHadHierarchicalSeparators = false;
@@ -370,44 +376,45 @@ Readability.prototype = {
    // If there's a separator in the title, first remove the final part
    if ((/ [\|\-\\\/>»] /).test(curTitle)) {
      titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle);
-      curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, '$1');
+      curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, "$1");

      // If the resulting title is too short (3 words or fewer), remove
      // the first part instead:
      if (wordCount(curTitle) < 3)
-        curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, '$1');
-    } else if (curTitle.indexOf(': ') !== -1) {
+        curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, "$1");
+    } else if (curTitle.indexOf(": ") !== -1) {
      // Check if we have an heading containing this exact string, so we
      // could assume it's the full title.
      var headings = this._concatNodeLists(
-        doc.getElementsByTagName('h1'),
-        doc.getElementsByTagName('h2')
+        doc.getElementsByTagName("h1"),
+        doc.getElementsByTagName("h2")
      );
+      var trimmedTitle = curTitle.trim();
      var match = this._someNode(headings, function(heading) {
-        return heading.textContent === curTitle;
+        return heading.textContent.trim() === trimmedTitle;
      });

      // If we don't, let's extract the title out of the original title string.
      if (!match) {
-        curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1);
+        curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1);

        // If the title is now too short, try the first colon instead:
        if (wordCount(curTitle) < 3) {
-          curTitle = origTitle.substring(origTitle.indexOf(':') + 1);
+          curTitle = origTitle.substring(origTitle.indexOf(":") + 1);
          // But if we have too many words before the colon there's something weird
          // with the titles and the H tags so let's just use the original title instead
-        } else if (wordCount(origTitle.substr(0, origTitle.indexOf(':'))) > 5) {
+        } else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) {
          curTitle = origTitle;
        }
      }
    } else if (curTitle.length > 150 || curTitle.length < 15) {
-      var hOnes = doc.getElementsByTagName('h1');
+      var hOnes = doc.getElementsByTagName("h1");

      if (hOnes.length === 1)
        curTitle = this._getInnerText(hOnes[0]);
    }

-    curTitle = curTitle.trim();
+    curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " ");
    // If we now have 4 words or fewer as our title, and either no
    // 'hierarchical' separators (\, /, > or ») were found in the original
    // title or we decreased the number of words by more than 1 word, use
@@ -497,7 +504,8 @@ Readability.prototype = {
              break;
          }

-          if (!this._isPhrasingContent(next)) break;
+          if (!this._isPhrasingContent(next))
+            break;

          // Otherwise, make this node a child of the new <p>.
          var sibling = next.nextSibling;
@@ -505,7 +513,12 @@ Readability.prototype = {
          next = sibling;
        }

-        while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild);
+        while (p.lastChild && this._isWhitespace(p.lastChild)) {
+          p.removeChild(p.lastChild);
+        }
+
+        if (p.parentNode.tagName === "P")
+          this._setNodeTag(p.parentNode, "DIV");
      }
    });
  },
@@ -527,7 +540,16 @@ Readability.prototype = {
      replacement.readability = node.readability;

    for (var i = 0; i < node.attributes.length; i++) {
-      replacement.setAttribute(node.attributes[i].name, node.attributes[i].value);
+      try {
+        replacement.setAttribute(node.attributes[i].name, node.attributes[i].value);
+      } catch (ex) {
+        /* it's possible for setAttribute() to throw if the attribute name
+         * isn't a valid XML Name. Such attributes can however be parsed from
+         * source in HTML docs, see https://github.com/whatwg/html/issues/4275,
+         * so we can hit them here and then throw. We don't care about such
+         * attributes so we ignore them.
+         */
+      }
    }
    return replacement;
  },
@@ -547,6 +569,8 @@ Readability.prototype = {
    // visually linked to other content-ful elements (text, images, etc.).
    this._markDataTables(articleContent);

+    this._fixLazyImages(articleContent);
+
    // Clean out junk from the article content
    this._cleanConditionally(articleContent, "form");
    this._cleanConditionally(articleContent, "fieldset");
@@ -557,16 +581,21 @@ Readability.prototype = {
    this._clean(articleContent, "link");
    this._clean(articleContent, "aside");

-    // Clean out elements have "share" in their id/class combinations from final top candidates,
+    // Clean out elements with little content that have "share" in their id/class combinations from final top candidates,
    // which means we don't remove the top candidates even they have "share".
-    this._forEachNode(articleContent.children, function(topCandidate) {
-      this._cleanMatchedNodes(topCandidate, /share/);
+
+    var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD;
+
+    this._forEachNode(articleContent.children, function (topCandidate) {
+      this._cleanMatchedNodes(topCandidate, function (node, matchString) {
+        return /share/.test(matchString) && node.textContent.length < shareElementThreshold;
+      });
    });

    // If there is only one h2 and its text content substantially equals article title,
    // they are probably using it as a header and not a subheader,
    // so remove it since we already extract the title separately.
-    var h2 = articleContent.getElementsByTagName('h2');
+    var h2 = articleContent.getElementsByTagName("h2");
    if (h2.length === 1) {
      var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) / this._articleTitle.length;
      if (Math.abs(lengthSimilarRate) < 0.5) {
@@ -596,12 +625,12 @@ Readability.prototype = {
    this._cleanConditionally(articleContent, "div");

    // Remove extra paragraphs
-    this._removeNodes(articleContent.getElementsByTagName('p'), function (paragraph) {
-      var imgCount = paragraph.getElementsByTagName('img').length;
-      var embedCount = paragraph.getElementsByTagName('embed').length;
-      var objectCount = paragraph.getElementsByTagName('object').length;
+    this._removeNodes(articleContent.getElementsByTagName("p"), function (paragraph) {
+      var imgCount = paragraph.getElementsByTagName("img").length;
+      var embedCount = paragraph.getElementsByTagName("embed").length;
+      var objectCount = paragraph.getElementsByTagName("object").length;
      // At this point, nasty iframes have been removed, only remain embedded video ones.
-      var iframeCount = paragraph.getElementsByTagName('iframe').length;
+      var iframeCount = paragraph.getElementsByTagName("iframe").length;
      var totalCount = imgCount + embedCount + objectCount + iframeCount;

      return totalCount === 0 && !this._getInnerText(paragraph, false);
@@ -612,6 +641,19 @@ Readability.prototype = {
      if (next && next.tagName == "P")
        br.parentNode.removeChild(br);
    });
+
+    // Remove single-cell tables
+    this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) {
+      var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table;
+      if (this._hasSingleTagInsideElement(tbody, "TR")) {
+        var row = tbody.firstElementChild;
+        if (this._hasSingleTagInsideElement(row, "TD")) {
+          var cell = row.firstElementChild;
+          cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV");
+          table.parentNode.replaceChild(cell, table);
+        }
+      }
+    });
  },

  /**
@@ -625,34 +667,34 @@ Readability.prototype = {
    node.readability = {"contentScore": 0};

    switch (node.tagName) {
-      case 'DIV':
+      case "DIV":
        node.readability.contentScore += 5;
        break;

-      case 'PRE':
-      case 'TD':
-      case 'BLOCKQUOTE':
+      case "PRE":
+      case "TD":
+      case "BLOCKQUOTE":
        node.readability.contentScore += 3;
        break;

-      case 'ADDRESS':
-      case 'OL':
-      case 'UL':
-      case 'DL':
-      case 'DD':
-      case 'DT':
-      case 'LI':
-      case 'FORM':
+      case "ADDRESS":
+      case "OL":
+      case "UL":
+      case "DL":
+      case "DD":
+      case "DT":
+      case "LI":
+      case "FORM":
        node.readability.contentScore -= 3;
        break;

-      case 'H1':
-      case 'H2':
-      case 'H3':
-      case 'H4':
-      case 'H5':
-      case 'H6':
-      case 'TH':
+      case "H1":
+      case "H2":
+      case "H3":
+      case "H4":
+      case "H5":
+      case "H6":
+      case "TH":
        node.readability.contentScore -= 5;
        break;
    }
@@ -691,37 +733,6 @@ Readability.prototype = {
    return node && node.nextElementSibling;
  },

-  /**
-   * Like _getNextNode, but for DOM implementations with no
-   * firstElementChild/nextElementSibling functionality...
-   */
-  _getNextNodeNoElementProperties: function(node, ignoreSelfAndKids) {
-    function nextSiblingEl(n) {
-      do {
-        n = n.nextSibling;
-      } while (n && n.nodeType !== n.ELEMENT_NODE);
-      return n;
-    }
-    // First check for kids if those aren't being ignored
-    if (!ignoreSelfAndKids && node.children[0]) {
-      return node.children[0];
-    }
-    // Then for siblings...
-    var next = nextSiblingEl(node);
-    if (next) {
-      return next;
-    }
-    // And finally, move up the parent chain *and* find a sibling
-    // (because this is depth-first traversal, we will have already
-    // seen the parent nodes themselves).
-    do {
-      node = node.parentNode;
-      if (node)
-        next = nextSiblingEl(node);
-    } while (node && !next);
-    return node && next;
-  },
-
  _checkByline: function(node, matchString) {
    if (this._articleByline) {
      return false;
@@ -729,9 +740,10 @@ Readability.prototype = {

    if (node.getAttribute !== undefined) {
      var rel = node.getAttribute("rel");
+      var itemprop = node.getAttribute("itemprop");
    }

-    if ((rel === "author" || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) {
+    if ((rel === "author" || (itemprop && itemprop.indexOf("author") !== -1) || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) {
      this._articleByline = node.textContent.trim();
      return true;
    }
@@ -784,6 +796,12 @@ Readability.prototype = {
      while (node) {
        var matchString = node.className + " " + node.id;

+        if (!this._isProbablyVisible(node)) {
+          this.log("Removing hidden node - " + matchString);
+          node = this._removeAndGetNext(node);
+          continue;
+        }
+
        // Check to see if this node is a byline, and remove it if it is.
        if (this._checkByline(node, matchString)) {
          node = this._removeAndGetNext(node);
@@ -794,6 +812,7 @@ Readability.prototype = {
        if (stripUnlikelyCandidates) {
          if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
              !this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
+              !this._hasAncestorTag(node, "table") &&
              node.tagName !== "BODY" &&
              node.tagName !== "A") {
            this.log("Removing unlikely candidate - " + matchString);
@@ -826,12 +845,14 @@ Readability.prototype = {
              if (p !== null) {
                p.appendChild(childNode);
              } else if (!this._isWhitespace(childNode)) {
-                p = doc.createElement('p');
+                p = doc.createElement("p");
                node.replaceChild(p, childNode);
                p.appendChild(childNode);
              }
            } else if (p !== null) {
-              while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild);
+              while (p.lastChild && this._isWhitespace(p.lastChild)) {
+                p.removeChild(p.lastChild);
+              }
              p = null;
            }
            childNode = nextSibling;
@@ -841,7 +862,7 @@ Readability.prototype = {
          // element. DIVs with only a P element inside and no text content can be
          // safely converted into plain P elements to avoid confusing the scoring
          // algorithm with DIVs with are, in practice, paragraphs.
-          if (this._hasSinglePInsideElement(node) && this._getLinkDensity(node) < 0.25) {
+          if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) {
            var newNode = node.children[0];
            node.parentNode.replaceChild(newNode, node);
            node = newNode;
@@ -862,7 +883,7 @@ Readability.prototype = {
      **/
      var candidates = [];
      this._forEachNode(elementsToScore, function(elementToScore) {
-        if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === 'undefined')
+        if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined")
          return;

        // If this paragraph is less than 25 characters, don't even count it.
@@ -881,17 +902,17 @@ Readability.prototype = {
        contentScore += 1;

        // Add points for any commas within this paragraph.
-        contentScore += innerText.split(',').length;
+        contentScore += innerText.split(",").length;

        // For every 100 characters in this paragraph, add another point. Up to 3 points.
        contentScore += Math.min(Math.floor(innerText.length / 100), 3);

        // Initialize and score ancestors.
        this._forEachNode(ancestors, function(ancestor, level) {
-          if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === 'undefined')
+          if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === "undefined")
            return;

-          if (typeof(ancestor.readability) === 'undefined') {
+          if (typeof(ancestor.readability) === "undefined") {
            this._initializeNode(ancestor);
            candidates.push(ancestor);
          }
@@ -922,7 +943,7 @@ Readability.prototype = {
        var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate));
        candidate.readability.contentScore = candidateScore;

-        this.log('Candidate:', candidate, "with score " + candidateScore);
+        this.log("Candidate:", candidate, "with score " + candidateScore);

        for (var t = 0; t < this._nbTopCandidates; t++) {
          var aTopCandidate = topCandidates[t];
@@ -1041,8 +1062,8 @@ Readability.prototype = {
        var sibling = siblings[s];
        var append = false;

-        this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : '');
-        this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : 'Unknown');
+        this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : "");
+        this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : "Unknown");

        if (sibling === topCandidate) {
          append = true;
@@ -1076,7 +1097,7 @@ Readability.prototype = {
          if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) {
            // We have a node that isn't a common block level element, like a form or td tag.
            // Turn it into a div so it doesn't get filtered out later by accident.
-            this.log("Altering sibling:", sibling, 'to div.');
+            this.log("Altering sibling:", sibling, "to div.");

            sibling = this._setNodeTag(sibling, "DIV");
          }
@@ -1144,7 +1165,7 @@ Readability.prototype = {
          this._attempts.push({articleContent: articleContent, textLength: textLength});
          // No luck after removing flags, just return the longest text we found during the different loops
          this._attempts.sort(function (a, b) {
-            return a.textLength < b.textLength;
+            return b.textLength - a.textLength;
          });

          // But first check if we actually have something
@@ -1184,7 +1205,7 @@ Readability.prototype = {
   * @return Boolean - whether the input string is a byline.
   */
  _isValidByline: function(byline) {
-    if (typeof byline == 'string' || byline instanceof String) {
+    if (typeof byline == "string" || byline instanceof String) {
      byline = byline.trim();
      return (byline.length > 0) && (byline.length < 100);
    }
@@ -1201,61 +1222,75 @@ Readability.prototype = {
    var values = {};
    var metaElements = this._doc.getElementsByTagName("meta");

-    // Match "description", or Twitter's "twitter:description" (Cards)
-    // in name attribute.
-    var namePattern = /^\s*((twitter)\s*:\s*)?(description|title)\s*$/gi;
+    // property is a space-separated list of values
+    var propertyPattern = /\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*/gi;

-    // Match Facebook's Open Graph title & description properties.
-    var propertyPattern = /^\s*og\s*:\s*(description|title)\s*$/gi;
+    // name is a single value
+    var namePattern = /^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$/i;

    // Find description tags.
    this._forEachNode(metaElements, function(element) {
      var elementName = element.getAttribute("name");
      var elementProperty = element.getAttribute("property");
-
-      if ([elementName, elementProperty].indexOf("author") !== -1) {
-        metadata.byline = element.getAttribute("content");
+      var content = element.getAttribute("content");
+      if (!content) {
        return;
      }
-
+      var matches = null;
      var name = null;
-      if (namePattern.test(elementName)) {
-        name = elementName;
-      } else if (propertyPattern.test(elementProperty)) {
-        name = elementProperty;
-      }

-      if (name) {
-        var content = element.getAttribute("content");
+      if (elementProperty) {
+        matches = elementProperty.match(propertyPattern);
+        if (matches) {
+          for (var i = matches.length - 1; i >= 0; i--) {
+            // Convert to lowercase, and remove any whitespace
+            // so we can match below.
+            name = matches[i].toLowerCase().replace(/\s/g, "");
+            // multiple authors
+            values[name] = content.trim();
+          }
+        }
+      }
+      if (!matches && elementName && namePattern.test(elementName)) {
+        name = elementName;
        if (content) {
-          // Convert to lowercase and remove any whitespace
-          // so we can match below.
-          name = name.toLowerCase().replace(/\s/g, '');
+          // Convert to lowercase, remove any whitespace, and convert dots
+          // to colons so we can match below.
+          name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":");
          values[name] = content.trim();
        }
      }
    });

-    if ("description" in values) {
-      metadata.excerpt = values["description"];
-    } else if ("og:description" in values) {
-      // Use facebook open graph description.
-      metadata.excerpt = values["og:description"];
-    } else if ("twitter:description" in values) {
-      // Use twitter cards description.
-      metadata.excerpt = values["twitter:description"];
+    // get title
+    metadata.title = values["dc:title"] ||
+                     values["dcterm:title"] ||
+                     values["og:title"] ||
+                     values["weibo:article:title"] ||
+                     values["weibo:webpage:title"] ||
+                     values["title"] ||
+                     values["twitter:title"];
+
+    if (!metadata.title) {
+      metadata.title = this._getArticleTitle();
    }

-    metadata.title = this._getArticleTitle();
-    if (!metadata.title) {
-      if ("og:title" in values) {
-        // Use facebook open graph title.
-        metadata.title = values["og:title"];
-      } else if ("twitter:title" in values) {
-        // Use twitter cards title.
-        metadata.title = values["twitter:title"];
-      }
-    }
+    // get author
+    metadata.byline = values["dc:creator"] ||
+                      values["dcterm:creator"] ||
+                      values["author"];
+
+    // get description
+    metadata.excerpt = values["dc:description"] ||
+                       values["dcterm:description"] ||
+                       values["og:description"] ||
+                       values["weibo:article:description"] ||
+                       values["weibo:webpage:description"] ||
+                       values["description"] ||
+                       values["twitter:description"];
+
+    // get site name
+    metadata.siteName = values["og:site_name"];

    return metadata;
  },
@@ -1266,24 +1301,25 @@ Readability.prototype = {
   * @param Element
  **/
  _removeScripts: function(doc) {
-    this._removeNodes(doc.getElementsByTagName('script'), function(scriptNode) {
+    this._removeNodes(doc.getElementsByTagName("script"), function(scriptNode) {
      scriptNode.nodeValue = "";
-      scriptNode.removeAttribute('src');
+      scriptNode.removeAttribute("src");
      return true;
    });
-    this._removeNodes(doc.getElementsByTagName('noscript'));
+    this._removeNodes(doc.getElementsByTagName("noscript"));
  },

  /**
-   * Check if this node has only whitespace and a single P element
+   * Check if this node has only whitespace and a single element with given tag
   * Returns false if the DIV node contains non-empty text nodes
-   * or if it contains no P or more than 1 element.
+   * or if it contains no element with given tag or more than 1 element.
   *
   * @param Element
+   * @param string tag of child element
  **/
-  _hasSinglePInsideElement: function(element) {
-    // There should be exactly 1 element child which is a P:
-    if (element.children.length != 1 || element.children[0].tagName !== "P") {
+  _hasSingleTagInsideElement: function(element, tag) {
+    // There should be exactly 1 element child with given tag
+    if (element.children.length != 1 || element.children[0].tagName !== tag) {
      return false;
    }

@@ -1337,7 +1373,7 @@ Readability.prototype = {
   * @return string
  **/
  _getInnerText: function(e, normalizeSpaces) {
-    normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
+    normalizeSpaces = (typeof normalizeSpaces === "undefined") ? true : normalizeSpaces;
    var textContent = e.textContent.trim();

    if (normalizeSpaces) {
@@ -1366,7 +1402,7 @@ Readability.prototype = {
   * @return void
  **/
  _cleanStyles: function(e) {
-    if (!e || e.tagName.toLowerCase() === 'svg')
+    if (!e || e.tagName.toLowerCase() === "svg")
      return;

    // Remove `style` and deprecated presentational attributes
@@ -1375,8 +1411,8 @@ Readability.prototype = {
    }

    if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
-      e.removeAttribute('width');
-      e.removeAttribute('height');
+      e.removeAttribute("width");
+      e.removeAttribute("height");
    }

    var cur = e.firstElementChild;
@@ -1422,7 +1458,7 @@ Readability.prototype = {
    var weight = 0;

    // Look for a special classname
-    if (typeof(e.className) === 'string' && e.className !== '') {
+    if (typeof(e.className) === "string" && e.className !== "") {
      if (this.REGEXPS.negative.test(e.className))
        weight -= 25;

@@ -1431,7 +1467,7 @@ Readability.prototype = {
    }

    // Look for a special ID
-    if (typeof(e.id) === 'string' && e.id !== '') {
+    if (typeof(e.id) === "string" && e.id !== "") {
      if (this.REGEXPS.negative.test(e.id))
        weight -= 25;

@@ -1456,17 +1492,17 @@ Readability.prototype = {
    this._removeNodes(e.getElementsByTagName(tag), function(element) {
      // Allow youtube and vimeo videos through as people usually want to see those.
      if (isEmbed) {
-        var attributeValues = [].map.call(element.attributes, function(attr) {
-          return attr.value;
-        }).join("|");
-
        // First, check the elements attributes to see if any of them contain youtube or vimeo
-        if (this.REGEXPS.videos.test(attributeValues))
-          return false;
+        for (var i = 0; i < element.attributes.length; i++) {
+          if (this.REGEXPS.videos.test(element.attributes[i].value)) {
+            return false;
+          }
+        }

-        // Then check the elements inside this element for the same.
-        if (this.REGEXPS.videos.test(element.innerHTML))
+        // For embed with <object> tag, check inner HTML as well.
+        if (element.tagName === "object" && this.REGEXPS.videos.test(element.innerHTML)) {
          return false;
+        }
      }

      return true;
@@ -1584,6 +1620,39 @@ Readability.prototype = {
    }
  },

+  /* convert images and figures that have properties like data-src into images that can be loaded without JS */
+  _fixLazyImages: function (root) {
+    this._forEachNode(this._getAllNodesWithTag(root, ["img", "picture", "figure"]), function (elem) {
+      // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
+      if ((!elem.src && (!elem.srcset || elem.srcset == "null")) || elem.className.toLowerCase().indexOf("lazy") !== -1) {
+        for (var i = 0; i < elem.attributes.length; i++) {
+          var attr = elem.attributes[i];
+          if (attr.name === "src" || attr.name === "srcset") {
+            continue;
+          }
+          var copyTo = null;
+          if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) {
+            copyTo = "srcset";
+          } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) {
+            copyTo = "src";
+          }
+          if (copyTo) {
+            //if this is an img or picture, set the attribute directly
+            if (elem.tagName === "IMG" || elem.tagName === "PICTURE") {
+              elem.setAttribute(copyTo, attr.value);
+            } else if (elem.tagName === "FIGURE" && !this._getAllNodesWithTag(elem, ["img", "picture"]).length) {
+              //if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
+              //see the nytimes-3 testcase for an example
+              var img = this._doc.createElement("img");
+              img.setAttribute(copyTo, attr.value);
+              elem.appendChild(img);
+            }
+          }
+        }
+      }
+    });
+  },
+
  /**
   * Clean an element of all tags of type "tag" if they look fishy.
   * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
@@ -1602,11 +1671,16 @@ Readability.prototype = {
    //
    // TODO: Consider taking into account original contentScore here.
    this._removeNodes(e.getElementsByTagName(tag), function(node) {
-      // First check if we're in a data table, in which case don't remove us.
+      // First check if this node IS data table, in which case don't remove it.
      var isDataTable = function(t) {
        return t._readabilityDataTable;
      };

+      if (tag === "table" && isDataTable(node)) {
+        return false;
+      }
+
+      // Next check if we're inside a data table, in which case don't remove it as well.
      if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
        return false;
      }
@@ -1620,7 +1694,7 @@ Readability.prototype = {
        return true;
      }

-      if (this._getCharCount(node, ',') < 10) {
+      if (this._getCharCount(node, ",") < 10) {
        // If there are not very many commas, and the number of
        // non-paragraph elements is more than paragraphs or other
        // ominous signs, remove the element.
@@ -1630,10 +1704,25 @@ Readability.prototype = {
        var input = node.getElementsByTagName("input").length;

        var embedCount = 0;
-        var embeds = node.getElementsByTagName("embed");
-        for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
-          if (!this.REGEXPS.videos.test(embeds[ei].src))
-            embedCount += 1;
+        var embeds = this._concatNodeLists(
+          node.getElementsByTagName("object"),
+          node.getElementsByTagName("embed"),
+          node.getElementsByTagName("iframe"));
+
+        for (var i = 0; i < embeds.length; i++) {
+          // If this embed has attribute that matches video regex, don't delete it.
+          for (var j = 0; j < embeds[i].attributes.length; j++) {
+            if (this.REGEXPS.videos.test(embeds[i].attributes[j].value)) {
+              return false;
+            }
+          }
+
+          // For embed with <object> tag, check inner HTML as well.
+          if (embeds[i].tagName === "object" && this.REGEXPS.videos.test(embeds[i].innerHTML)) {
+            return false;
+          }
+
+          embedCount++;
        }

        var linkDensity = this._getLinkDensity(node);
@@ -1654,17 +1743,17 @@ Readability.prototype = {
  },

  /**
-   * Clean out elements whose id/class combinations match specific string.
+   * Clean out elements that match the specified conditions
   *
   * @param Element
-   * @param RegExp match id/class combination.
+   * @param Function determines whether a node should be removed
   * @return void
   **/
-  _cleanMatchedNodes: function(e, regex) {
+  _cleanMatchedNodes: function(e, filter) {
    var endOfSearchMarkerNode = this._getNextNode(e, true);
    var next = this._getNextNode(e);
    while (next && next != endOfSearchMarkerNode) {
-      if (regex.test(next.className + " " + next.id)) {
+      if (filter(next, next.className + " " + next.id)) {
        next = this._removeAndGetNext(next);
      } else {
        next = this._getNextNode(next);
@@ -1680,7 +1769,7 @@ Readability.prototype = {
  **/
  _cleanHeaders: function(e) {
    for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
-      this._removeNodes(e.getElementsByTagName('h' + headerIndex), function (header) {
+      this._removeNodes(e.getElementsByTagName("h" + headerIndex), function (header) {
        return this._getClassWeight(header) < 0;
      });
    }
@@ -1694,63 +1783,8 @@ Readability.prototype = {
    this._flags = this._flags & ~flag;
  },

-  /**
-   * Decides whether or not the document is reader-able without parsing the whole thing.
-   *
-   * @return boolean Whether or not we suspect parse() will suceeed at returning an article object.
-   */
-  isProbablyReaderable: function(helperIsVisible) {
-    var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]);
-
-    // Get <div> nodes which have <br> node(s) and append them into the `nodes` variable.
-    // Some articles' DOM structures might look like
-    // <div>
-    //   Sentences<br>
-    //   <br>
-    //   Sentences<br>
-    // </div>
-    var brNodes = this._getAllNodesWithTag(this._doc, ["div > br"]);
-    if (brNodes.length) {
-      var set = new Set();
-      [].forEach.call(brNodes, function(node) {
-        set.add(node.parentNode);
-      });
-      nodes = [].concat.apply(Array.from(set), nodes);
-    }
-
-    // FIXME we should have a fallback for helperIsVisible, but this is
-    // problematic because of jsdom's elem.style handling - see
-    // https://github.com/mozilla/readability/pull/186 for context.
-
-    var score = 0;
-    // This is a little cheeky, we use the accumulator 'score' to decide what to return from
-    // this callback:
-    return this._someNode(nodes, function(node) {
-      if (helperIsVisible && !helperIsVisible(node))
-        return false;
-      var matchString = node.className + " " + node.id;
-
-      if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
-          !this.REGEXPS.okMaybeItsACandidate.test(matchString)) {
-        return false;
-      }
-
-      if (node.matches && node.matches("li p")) {
-        return false;
-      }
-
-      var textContentLength = node.textContent.trim().length;
-      if (textContentLength < 140) {
-        return false;
-      }
-
-      score += Math.sqrt(textContentLength - 140);
-
-      if (score > 20) {
-        return true;
-      }
-      return false;
-    });
+  _isProbablyVisible: function(node) {
+    return (!node.style || node.style.display != "none") && !node.hasAttribute("hidden");
  },

  /**
@@ -1774,9 +1808,6 @@ Readability.prototype = {
      }
    }

-    if (typeof this._doc.documentElement.firstElementChild === "undefined") {
-      this._getNextNode = this._getNextNodeNoElementProperties;
-    }
    // Remove script tags from the document.
    this._removeScripts(this._doc);

@@ -1812,6 +1843,7 @@ Readability.prototype = {
      textContent: textContent,
      length: textContent.length,
      excerpt: metadata.excerpt,
+      siteName: metadata.siteName || this._articleSiteName
    };
  }
 };