// https://github.com/mozilla/readability/tree/814f0a3884350b6f1adfdebb79ca3599e9806605
/*eslint-env es6:false*/
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */

/**
 * This is a relatively lightweight DOMParser that is safe to use in a web
 * worker. This is far from a complete DOM implementation; however, it should
 * contain the minimal set of functionality necessary for Readability.js.
 *
 * Aside from not implementing the full DOM API, there are other quirks to be
 * aware of when using the JSDOMParser:
 *
 *   1) Properly formed HTML/XML must be used. This means you should be extra
 *      careful when using this parser on anything received directly from an
 *      XMLHttpRequest. Providing a serialized string from an XMLSerializer,
 *      however, should be safe (since the browser's XMLSerializer should
 *      generate valid HTML/XML). Therefore, if parsing a document from an XHR,
 *      the recommended approach is to do the XHR in the main thread, use
 *      XMLSerializer.serializeToString() on the responseXML, and pass the
 *      resulting string to the worker.
 *
 *   2) Live NodeLists are not supported. DOM methods and properties such as
 *      getElementsByTagName() and childNodes return standard arrays. If you
 *      want these lists to be updated when nodes are removed or added to the
 *      document, you must take care to manually update them yourself.
 */
(function (global) {

  // XML only defines these and the numeric ones:
  var entityTable = {
    'lt': '<',
    'gt': '>',
    'amp': '&',
    'quot': '"',
    'apos': '\'',
  };

  // Inverse of entityTable: raw character -> escaped entity reference.
  var reverseEntityTable = {
    '<': '&lt;',
    '>': '&gt;',
    '&': '&amp;',
    '"': '&quot;',
    '\'': '&apos;',
  };

  /**
   * Escapes the characters that are significant inside element text content
   * (&, < and >). Quotes are left alone.
   */
  function encodeTextContentHTML(s) {
    return s.replace(/[&<>]/g, function (x) {
      return reverseEntityTable[x];
    });
  }

  /**
   * Escapes the characters that are significant inside text or a quoted
   * attribute value (&, <, >, ' and ").
   */
  function encodeHTML(s) {
    return s.replace(/[&<>'"]/g, function (x) {
      return reverseEntityTable[x];
    });
  }

  /**
   * Converts a Unicode code point to a string, emitting a surrogate pair for
   * code points above the Basic Multilingual Plane (ES5-safe stand-in for
   * String.fromCodePoint).
   */
  function codePointToString(codePoint) {
    if (codePoint <= 0xFFFF)
      return String.fromCharCode(codePoint);
    var offset = codePoint - 0x10000;
    return String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
  }

  /**
   * Decodes the five XML named entities plus decimal/hexadecimal numeric
   * character references. Decoding happens in a single pass so that the
   * output of one replacement is never re-interpreted as another reference
   * (e.g. "&amp;#65;" yields "&#65;", not "A").
   */
  function decodeHTML(str) {
    return str.replace(/&(?:(quot|amp|apos|lt|gt)|#[xX]([0-9a-fA-F]+)|#([0-9]+));/g, function (match, tag, hex, numStr) {
      if (tag)
        return entityTable[tag];
      var num = parseInt(hex || numStr, hex ? 16 : 10);
      // NUL, surrogates and out-of-range values are replaced with U+FFFD,
      // which is what a conforming HTML parser does.
      if (num === 0 || num > 0x10FFFF || (num >= 0xD800 && num <= 0xDFFF))
        num = 0xFFFD;
      return codePointToString(num);
    });
  }

  // When a style is set in JS, map it to the corresponding CSS attribute
  var styleMap = {
    'alignmentBaseline': 'alignment-baseline',
    'background': 'background',
    'backgroundAttachment': 'background-attachment',
    'backgroundClip': 'background-clip',
    'backgroundColor': 'background-color',
    'backgroundImage': 'background-image',
    'backgroundOrigin': 'background-origin',
    'backgroundPosition': 'background-position',
    'backgroundPositionX': 'background-position-x',
    'backgroundPositionY': 'background-position-y',
    'backgroundRepeat': 'background-repeat',
    'backgroundRepeatX': 'background-repeat-x',
    'backgroundRepeatY': 'background-repeat-y',
    'backgroundSize': 'background-size',
    'baselineShift': 'baseline-shift',
    'border': 'border',
    'borderBottom': 'border-bottom',
    'borderBottomColor': 'border-bottom-color',
    'borderBottomLeftRadius': 'border-bottom-left-radius',
    'borderBottomRightRadius': 'border-bottom-right-radius',
    'borderBottomStyle': 'border-bottom-style',
    'borderBottomWidth': 'border-bottom-width',
    'borderCollapse': 'border-collapse',
    'borderColor': 'border-color',
    'borderImage': 'border-image',
    'borderImageOutset': 'border-image-outset',
    'borderImageRepeat': 'border-image-repeat',
    'borderImageSlice': 'border-image-slice',
    'borderImageSource': 'border-image-source',
    'borderImageWidth': 'border-image-width',
    'borderLeft': 'border-left',
    'borderLeftColor': 'border-left-color',
    'borderLeftStyle': 'border-left-style',
    'borderLeftWidth': 'border-left-width',
    'borderRadius': 'border-radius',
    'borderRight': 'border-right',
    'borderRightColor': 'border-right-color',
    'borderRightStyle': 'border-right-style',
    'borderRightWidth': 'border-right-width',
    'borderSpacing': 'border-spacing',
    'borderStyle': 'border-style',
    'borderTop': 'border-top',
    'borderTopColor': 'border-top-color',
    'borderTopLeftRadius': 'border-top-left-radius',
    'borderTopRightRadius': 'border-top-right-radius',
    'borderTopStyle': 'border-top-style',
    'borderTopWidth': 'border-top-width',
    'borderWidth': 'border-width',
    'bottom': 'bottom',
    'boxShadow': 'box-shadow',
    'boxSizing': 'box-sizing',
    'captionSide': 'caption-side',
    'clear': 'clear',
    'clip': 'clip',
    'clipPath': 'clip-path',
    'clipRule': 'clip-rule',
    'color': 'color',
    'colorInterpolation': 'color-interpolation',
    'colorInterpolationFilters': 'color-interpolation-filters',
    'colorProfile': 'color-profile',
    'colorRendering': 'color-rendering',
    'content': 'content',
    'counterIncrement': 'counter-increment',
    'counterReset': 'counter-reset',
    'cursor': 'cursor',
    'direction': 'direction',
    'display': 'display',
    'dominantBaseline': 'dominant-baseline',
    'emptyCells': 'empty-cells',
    'enableBackground': 'enable-background',
    'fill': 'fill',
    'fillOpacity': 'fill-opacity',
    'fillRule': 'fill-rule',
    'filter': 'filter',
    'cssFloat': 'float',
    'floodColor': 'flood-color',
    'floodOpacity': 'flood-opacity',
    'font': 'font',
    'fontFamily': 'font-family',
    'fontSize': 'font-size',
    'fontStretch': 'font-stretch',
    'fontStyle': 'font-style',
    'fontVariant': 'font-variant',
    'fontWeight': 'font-weight',
    'glyphOrientationHorizontal': 'glyph-orientation-horizontal',
    'glyphOrientationVertical': 'glyph-orientation-vertical',
    'height': 'height',
    'imageRendering': 'image-rendering',
    'kerning': 'kerning',
    'left': 'left',
    'letterSpacing': 'letter-spacing',
    'lightingColor': 'lighting-color',
    'lineHeight': 'line-height',
    'listStyle': 'list-style',
    'listStyleImage': 'list-style-image',
    'listStylePosition': 'list-style-position',
    'listStyleType': 'list-style-type',
    'margin': 'margin',
    'marginBottom': 'margin-bottom',
    'marginLeft': 'margin-left',
    'marginRight': 'margin-right',
    'marginTop': 'margin-top',
    'marker': 'marker',
    'markerEnd': 'marker-end',
    'markerMid': 'marker-mid',
    'markerStart': 'marker-start',
    'mask': 'mask',
    'maxHeight': 'max-height',
    'maxWidth': 'max-width',
    'minHeight': 'min-height',
    'minWidth': 'min-width',
    'opacity': 'opacity',
    'orphans': 'orphans',
    'outline': 'outline',
    'outlineColor': 'outline-color',
    'outlineOffset': 'outline-offset',
    'outlineStyle': 'outline-style',
    'outlineWidth': 'outline-width',
    'overflow': 'overflow',
    'overflowX': 'overflow-x',
    'overflowY': 'overflow-y',
    'padding': 'padding',
    'paddingBottom': 'padding-bottom',
    'paddingLeft': 'padding-left',
    'paddingRight': 'padding-right',
    'paddingTop': 'padding-top',
    'page': 'page',
    'pageBreakAfter': 'page-break-after',
    'pageBreakBefore': 'page-break-before',
    'pageBreakInside': 'page-break-inside',
    'pointerEvents': 'pointer-events',
    'position': 'position',
    'quotes': 'quotes',
    'resize': 'resize',
    'right': 'right',
    'shapeRendering': 'shape-rendering',
    'size': 'size',
    'speak': 'speak',
    'src': 'src',
    'stopColor': 'stop-color',
    'stopOpacity': 'stop-opacity',
    'stroke': 'stroke',
    'strokeDasharray': 'stroke-dasharray',
    'strokeDashoffset': 'stroke-dashoffset',
    'strokeLinecap': 'stroke-linecap',
    'strokeLinejoin': 'stroke-linejoin',
    'strokeMiterlimit': 'stroke-miterlimit',
    'strokeOpacity': 'stroke-opacity',
    'strokeWidth': 'stroke-width',
    'tableLayout': 'table-layout',
    'textAlign': 'text-align',
    'textAnchor': 'text-anchor',
    'textDecoration': 'text-decoration',
    'textIndent': 'text-indent',
    'textLineThrough': 'text-line-through',
    'textLineThroughColor': 'text-line-through-color',
    'textLineThroughMode': 'text-line-through-mode',
    'textLineThroughStyle': 'text-line-through-style',
    'textLineThroughWidth': 'text-line-through-width',
    'textOverflow': 'text-overflow',
    'textOverline': 'text-overline',
    'textOverlineColor': 'text-overline-color',
    'textOverlineMode': 'text-overline-mode',
    'textOverlineStyle': 'text-overline-style',
    'textOverlineWidth': 'text-overline-width',
    'textRendering': 'text-rendering',
    'textShadow': 'text-shadow',
    'textTransform': 'text-transform',
    'textUnderline': 'text-underline',
    'textUnderlineColor': 'text-underline-color',
    'textUnderlineMode': 'text-underline-mode',
    'textUnderlineStyle': 'text-underline-style',
    'textUnderlineWidth': 'text-underline-width',
    'top': 'top',
    'unicodeBidi': 'unicode-bidi',
    'unicodeRange': 'unicode-range',
    'vectorEffect': 'vector-effect',
    'verticalAlign': 'vertical-align',
    'visibility': 'visibility',
    'whiteSpace': 'white-space',
    'widows': 'widows',
    'width': 'width',
    'wordBreak': 'word-break',
    'wordSpacing': 'word-spacing',
    'wordWrap': 'word-wrap',
    'writingMode': 'writing-mode',
    'zIndex': 'z-index',
    'zoom': 'zoom',
  };

  // Elements that can be self-closing
  var voidElems = {
    'area': true,
    'base': true,
    'br': true,
    'col': true,
    'command': true,
    'embed': true,
    'hr': true,
    'img': true,
    'input': true,
    'link': true,
    'meta': true,
    'param': true,
    'source': true,
    'wbr': true,
  };

  var whitespace = [' ', '\t', '\n', '\r'];

  // See http://www.w3schools.com/dom/dom_nodetype.asp
  var nodeTypes = {
    ELEMENT_NODE: 1,
    ATTRIBUTE_NODE: 2,
    TEXT_NODE: 3,
    CDATA_SECTION_NODE: 4,
    ENTITY_REFERENCE_NODE: 5,
    ENTITY_NODE: 6,
    PROCESSING_INSTRUCTION_NODE: 7,
    COMMENT_NODE: 8,
    DOCUMENT_NODE: 9,
    DOCUMENT_TYPE_NODE: 10,
    DOCUMENT_FRAGMENT_NODE: 11,
    NOTATION_NODE: 12,
  };

  /**
   * Shared implementation of getElementsByTagName() for Document and Element.
   * Walks the `children` arrays depth-first and returns a plain (non-live)
   * array of descendant elements whose tagName matches; '*' matches all.
   */
  function getElementsByTagName(tag) {
    tag = tag.toUpperCase();
    var elems = [];
    var allTags = (tag === '*');
    function getElems(node) {
      var length = node.children.length;
      for (var i = 0; i < length; i++) {
        var child = node.children[i];
        if (allTags || (child.tagName === tag))
          elems.push(child);
        getElems(child);
      }
    }
    getElems(this);
    return elems;
  }

  /**
   * Base type for every node. Concrete node types set up `childNodes` (and
   * `children` where applicable) in their constructors; the tree-mutation
   * methods below keep the sibling pointers and both arrays in sync by hand.
   */
  var Node = function () {};

  Node.prototype = {
    attributes: null,
    childNodes: null,
    localName: null,
    nodeName: null,
    parentNode: null,
    textContent: null,
    nextSibling: null,
    previousSibling: null,

    get firstChild() {
      return this.childNodes[0] || null;
    },

    get firstElementChild() {
      return this.children[0] || null;
    },

    get lastChild() {
      return this.childNodes[this.childNodes.length - 1] || null;
    },

    get lastElementChild() {
      return this.children[this.children.length - 1] || null;
    },

    /**
     * Appends `child` as the last child of this node, detaching it from its
     * current parent first if it has one.
     */
    appendChild: function (child) {
      if (child.parentNode) {
        child.parentNode.removeChild(child);
      }

      var last = this.lastChild;
      if (last)
        last.nextSibling = child;
      child.previousSibling = last;

      if (child.nodeType === Node.ELEMENT_NODE) {
        child.previousElementSibling = this.children[this.children.length - 1] || null;
        this.children.push(child);
        child.previousElementSibling && (child.previousElementSibling.nextElementSibling = child);
      }
      this.childNodes.push(child);
      child.parentNode = this;
    },

    /**
     * Removes `child` from this node and returns it.
     * Throws the string 'removeChild: node not found' if `child` is not one
     * of this node's childNodes.
     */
    removeChild: function (child) {
      var childNodes = this.childNodes;
      var childIndex = childNodes.indexOf(child);
      if (childIndex === -1) {
        throw 'removeChild: node not found';
      } else {
        child.parentNode = null;
        var prev = child.previousSibling;
        var next = child.nextSibling;
        if (prev)
          prev.nextSibling = next;
        if (next)
          next.previousSibling = prev;

        if (child.nodeType === Node.ELEMENT_NODE) {
          prev = child.previousElementSibling;
          next = child.nextElementSibling;
          if (prev)
            prev.nextElementSibling = next;
          if (next)
            next.previousElementSibling = prev;
          this.children.splice(this.children.indexOf(child), 1);
        }

        child.previousSibling = child.nextSibling = null;
        child.previousElementSibling = child.nextElementSibling = null;

        return childNodes.splice(childIndex, 1)[0];
      }
    },

    /**
     * Replaces `oldNode` (a child of this node) with `newNode` and returns
     * `oldNode`. Throws the string 'replaceChild: node not found' if
     * `oldNode` is not one of this node's childNodes.
     */
    replaceChild: function (newNode, oldNode) {
      var childNodes = this.childNodes;
      var childIndex = childNodes.indexOf(oldNode);
      if (childIndex === -1) {
        throw 'replaceChild: node not found';
      } else {
        // This will take care of updating the new node if it was somewhere else before:
        if (newNode.parentNode)
          newNode.parentNode.removeChild(newNode);

        childNodes[childIndex] = newNode;

        // update the new node's sibling properties, and its new siblings' sibling properties
        newNode.nextSibling = oldNode.nextSibling;
        newNode.previousSibling = oldNode.previousSibling;
        if (newNode.nextSibling)
          newNode.nextSibling.previousSibling = newNode;
        if (newNode.previousSibling)
          newNode.previousSibling.nextSibling = newNode;

        newNode.parentNode = this;

        // Now deal with elements before we clear out those values for the old node,
        // because it can help us take shortcuts here:
        if (newNode.nodeType === Node.ELEMENT_NODE) {
          if (oldNode.nodeType === Node.ELEMENT_NODE) {
            // Both were elements, which makes this easier, we just swap things out:
            newNode.previousElementSibling = oldNode.previousElementSibling;
            newNode.nextElementSibling = oldNode.nextElementSibling;
            if (newNode.previousElementSibling)
              newNode.previousElementSibling.nextElementSibling = newNode;
            if (newNode.nextElementSibling)
              newNode.nextElementSibling.previousElementSibling = newNode;
            this.children[this.children.indexOf(oldNode)] = newNode;
          } else {
            // Hard way:
            newNode.previousElementSibling = (function () {
              for (var i = childIndex - 1; i >= 0; i--) {
                if (childNodes[i].nodeType === Node.ELEMENT_NODE)
                  return childNodes[i];
              }
              return null;
            })();
            if (newNode.previousElementSibling) {
              newNode.nextElementSibling = newNode.previousElementSibling.nextElementSibling;
            } else {
              newNode.nextElementSibling = (function () {
                for (var i = childIndex + 1; i < childNodes.length; i++) {
                  if (childNodes[i].nodeType === Node.ELEMENT_NODE)
                    return childNodes[i];
                }
                return null;
              })();
            }
            if (newNode.previousElementSibling)
              newNode.previousElementSibling.nextElementSibling = newNode;
            if (newNode.nextElementSibling)
              newNode.nextElementSibling.previousElementSibling = newNode;

            if (newNode.nextElementSibling)
              this.children.splice(this.children.indexOf(newNode.nextElementSibling), 0, newNode);
            else
              this.children.push(newNode);
          }
        } else if (oldNode.nodeType === Node.ELEMENT_NODE) {
          // new node is not an element node.
          // if the old one was, update its element siblings:
          if (oldNode.previousElementSibling)
            oldNode.previousElementSibling.nextElementSibling = oldNode.nextElementSibling;
          if (oldNode.nextElementSibling)
            oldNode.nextElementSibling.previousElementSibling = oldNode.previousElementSibling;
          this.children.splice(this.children.indexOf(oldNode), 1);

          // If the old node wasn't an element, neither the new nor the old node was an element,
          // and the children array and its members shouldn't need any updating.
        }

        oldNode.parentNode = null;
        oldNode.previousSibling = null;
        oldNode.nextSibling = null;
        if (oldNode.nodeType === Node.ELEMENT_NODE) {
          oldNode.previousElementSibling = null;
          oldNode.nextElementSibling = null;
        }
        return oldNode;
      }
    },

    __JSDOMParser__: true,
  };

  // Expose the node type constants both on the constructor (Node.TEXT_NODE)
  // and on every instance (node.TEXT_NODE).
  for (var nodeType in nodeTypes) {
    Node[nodeType] = Node.prototype[nodeType] = nodeTypes[nodeType];
  }

  /**
   * A single name/value attribute pair. The value is stored decoded;
   * getEncodedValue() returns it escaped for serialization.
   */
  var Attribute = function (name, value) {
    this.name = name;
    this._value = value;
  };

  Attribute.prototype = {
    get value() {
      return this._value;
    },
    setValue: function (newValue) {
      this._value = newValue;
    },
    getEncodedValue: function () {
      return encodeHTML(this._value);
    },
  };

  var Comment = function () {
    this.childNodes = [];
  };

  Comment.prototype = {
    __proto__: Node.prototype,

    nodeName: '#comment',
    nodeType: Node.COMMENT_NODE,
  };

  /**
   * Text node. Holds either the raw markup (`_innerHTML`) or the decoded
   * text (`_textContent`); whichever one is missing is computed lazily from
   * the other, and setting one invalidates the other.
   */
  var Text = function () {
    this.childNodes = [];
  };

  Text.prototype = {
    __proto__: Node.prototype,

    nodeName: '#text',
    nodeType: Node.TEXT_NODE,
    get textContent() {
      if (typeof this._textContent === 'undefined') {
        this._textContent = decodeHTML(this._innerHTML || '');
      }
      return this._textContent;
    },
    get innerHTML() {
      if (typeof this._innerHTML === 'undefined') {
        this._innerHTML = encodeTextContentHTML(this._textContent || '');
      }
      return this._innerHTML;
    },

    set innerHTML(newHTML) {
      this._innerHTML = newHTML;
      delete this._textContent;
    },
    set textContent(newText) {
      this._textContent = newText;
      delete this._innerHTML;
    },
  };

  var Document = function (url) {
    this.documentURI = url;
    this.styleSheets = [];
    this.childNodes = [];
    this.children = [];
  };

  Document.prototype = {
    __proto__: Node.prototype,

    nodeName: '#document',
    nodeType: Node.DOCUMENT_NODE,
    title: '',

    getElementsByTagName: getElementsByTagName,

    /**
     * Depth-first search of the element tree for the first node whose `id`
     * equals `id`; returns null when there is none.
     */
    getElementById: function (id) {
      function getElem(node) {
        var length = node.children.length;
        if (node.id === id)
          return node;
        for (var i = 0; i < length; i++) {
          var el = getElem(node.children[i]);
          if (el)
            return el;
        }
        return null;
      }
      return getElem(this);
    },

    createElement: function (tag) {
      var node = new Element(tag);
      return node;
    },

    createTextNode: function (text) {
      var node = new Text();
      node.textContent = text;
      return node;
    },

    /**
     * The document's base URI: documentURI, resolved against the href of the
     * first <base> element when there is one. Computed once and cached.
     */
    get baseURI() {
      if (!this.hasOwnProperty('_baseURI')) {
        this._baseURI = this.documentURI;
        var baseElements = this.getElementsByTagName('base');
        var href = baseElements[0] && baseElements[0].getAttribute('href');
        if (href) {
          try {
            this._baseURI = (new URL(href, this._baseURI)).href;
          } catch (ex) {/* Just fall back to documentURI */}
        }
      }
      return this._baseURI;
    },
  };

  var Element = function (tag) {
    // We use this to find the closing tag.
    this._matchingTag = tag;
    // We're explicitly a non-namespace aware parser, we just pretend it's all HTML.
    var lastColonIndex = tag.lastIndexOf(':');
    if (lastColonIndex != -1) {
      tag = tag.substring(lastColonIndex + 1);
    }
    this.attributes = [];
    this.childNodes = [];
    this.children = [];
    this.nextElementSibling = this.previousElementSibling = null;
    this.localName = tag.toLowerCase();
    this.tagName = tag.toUpperCase();
    this.style = new Style(this);
  };

  Element.prototype = {
    __proto__: Node.prototype,

    nodeType: Node.ELEMENT_NODE,

    getElementsByTagName: getElementsByTagName,

    get className() {
      return this.getAttribute('class') || '';
    },

    set className(str) {
      this.setAttribute('class', str);
    },

    get id() {
      return this.getAttribute('id') || '';
    },

    set id(str) {
      this.setAttribute('id', str);
    },

    get href() {
      return this.getAttribute('href') || '';
    },

    set href(str) {
      this.setAttribute('href', str);
    },

    get src() {
      return this.getAttribute('src') || '';
    },

    set src(str) {
      this.setAttribute('src', str);
    },

    get srcset() {
      return this.getAttribute('srcset') || '';
    },

    set srcset(str) {
      this.setAttribute('srcset', str);
    },

    get nodeName() {
      return this.tagName;
    },

    /**
     * Serializes this element's descendants back to markup.
     */
    get innerHTML() {
      function getHTML(node) {
        var i = 0;
        for (i = 0; i < node.childNodes.length; i++) {
          var child = node.childNodes[i];
          if (child.localName) {
            arr.push('<' + child.localName);

            // serialize attribute list
            for (var j = 0; j < child.attributes.length; j++) {
              var attr = child.attributes[j];
              // the attribute value will be HTML escaped.
              var val = attr.getEncodedValue();
              var quote = (val.indexOf('"') === -1 ? '"' : '\'');
              arr.push(' ' + attr.name + '=' + quote + val + quote);
            }

            if (child.localName in voidElems && !child.childNodes.length) {
              // if this is a self-closing element, end it here
              arr.push('/>');
            } else {
              // otherwise, add its children and the matching closing tag
              arr.push('>');
              getHTML(child);
              arr.push('</' + child.localName + '>');
            }
          } else {
            // This is a text node, so asking for innerHTML won't recurse.
            arr.push(child.innerHTML);
          }
        }
      }

      // Using Array.join() avoids the overhead from lazy string concatenation.
      // See http://blog.cdleary.com/2012/01/string-representation-in-spidermonkey/#ropes
      var arr = [];
      getHTML(this);
      return arr.join('');
    },

    /**
     * Replaces this element's children with the nodes parsed from `html`.
     */
    set innerHTML(html) {
      var parser = new JSDOMParser();
      var node = parser.parse(html);
      var i;
      for (i = this.childNodes.length; --i >= 0;) {
        this.childNodes[i].parentNode = null;
      }
      this.childNodes = node.childNodes;
      this.children = node.children;
      for (i = this.childNodes.length; --i >= 0;) {
        this.childNodes[i].parentNode = this;
      }
    },

    /**
     * Replaces this element's children with a single Text node.
     */
    set textContent(text) {
      // clear parentNodes for existing children
      for (var i = this.childNodes.length; --i >= 0;) {
        this.childNodes[i].parentNode = null;
      }

      var node = new Text();
      this.childNodes = [ node ];
      this.children = [];
      node.textContent = text;
      node.parentNode = this;
    },

    /**
     * Concatenation of the text of every descendant Text node, in document
     * order.
     */
    get textContent() {
      function getText(node) {
        var nodes = node.childNodes;
        for (var i = 0; i < nodes.length; i++) {
          var child = nodes[i];
          if (child.nodeType === 3) {
            text.push(child.textContent);
          } else {
            getText(child);
          }
        }
      }

      // Using Array.join() avoids the overhead from lazy string concatenation.
      // See http://blog.cdleary.com/2012/01/string-representation-in-spidermonkey/#ropes
      var text = [];
      getText(this);
      return text.join('');
    },

    /**
     * Returns the value of the named attribute, or undefined (not null, as
     * the DOM would) when it is absent.
     */
    getAttribute: function (name) {
      for (var i = this.attributes.length; --i >= 0;) {
        var attr = this.attributes[i];
        if (attr.name === name) {
          return attr.value;
        }
      }
      return undefined;
    },

    setAttribute: function (name, value) {
      for (var i = this.attributes.length; --i >= 0;) {
        var attr = this.attributes[i];
        if (attr.name === name) {
          attr.setValue(value);
          return;
        }
      }
      this.attributes.push(new Attribute(name, value));
    },

    removeAttribute: function (name) {
      for (var i = this.attributes.length; --i >= 0;) {
        var attr = this.attributes[i];
        if (attr.name === name) {
          this.attributes.splice(i, 1);
          break;
        }
      }
    },

    hasAttribute: function (name) {
      return this.attributes.some(function (attr) {
        return attr.name == name;
      });
    },
  };

  var Style = function (node) {
    this.node = node;
  };

  // getStyle() and setStyle() use the style attribute string directly. This
  // won't be very efficient if there are a lot of style manipulations, but
  // it's the easiest way to make sure the style attribute string and the JS
  // style property stay in sync. Readability.js doesn't do many style
  // manipulations, so this should be okay.
  Style.prototype = {
    /**
     * Returns the trimmed value of the CSS property `styleName` from the
     * node's style attribute, or undefined when it is not declared.
     */
    getStyle: function (styleName) {
      var attr = this.node.getAttribute('style');
      if (!attr)
        return undefined;

      var styles = attr.split(';');
      for (var i = 0; i < styles.length; i++) {
        var declaration = styles[i];
        // Split on the FIRST colon only, so values that contain colons
        // (e.g. "url(http://...)") survive, and skip fragments that are not
        // "name: value" declarations instead of throwing on them.
        var colon = declaration.indexOf(':');
        if (colon === -1)
          continue;
        if (declaration.substring(0, colon).trim() === styleName)
          return declaration.substring(colon + 1).trim();
      }

      return undefined;
    },

    /**
     * Sets the CSS property `styleName` to `styleValue` in the node's style
     * attribute: any existing declaration of that property is removed and
     * the new declaration is appended at the end.
     */
    setStyle: function (styleName, styleValue) {
      var value = this.node.getAttribute('style') || '';
      var index = 0;
      do {
        // `next` is 0 when there is no further ';', which ends the loop.
        var next = value.indexOf(';', index) + 1;
        var length = next - index - 1;
        var style = (length > 0 ? value.substr(index, length) : value.substr(index));
        if (style.substr(0, style.indexOf(':')).trim() === styleName) {
          value = value.substr(0, index).trim() + (next ? ' ' + value.substr(next).trim() : '');
          break;
        }
        index = next;
      } while (index);

      value += ' ' + styleName + ': ' + styleValue + ';';
      this.node.setAttribute('style', value.trim());
    },
  };

  // For each item in styleMap, define a getter and setter on the style
  // property.
  Object.keys(styleMap).forEach(function (jsName) {
    var cssName = styleMap[jsName];
    Object.defineProperty(Style.prototype, jsName, {
      configurable: true,
      enumerable: true,
      get: function () {
        return this.getStyle(cssName);
      },
      set: function (value) {
        this.setStyle(cssName, value);
      },
    });
  });

  var JSDOMParser = function () {
    this.currentChar = 0;

    // In makeElementNode() we build up many strings one char at a time. Using
    // += for this results in lots of short-lived intermediate strings. It's
    // better to build an array of single-char strings and then join() them
    // together at the end. And reusing a single array (i.e. |this.strBuf|)
    // over and over for this purpose uses less memory than using a new array
    // for each string.
    this.strBuf = [];

    // Similarly, we reuse this array to return the two arguments from
    // makeElementNode(), which saves us from having to allocate a new array
    // every time.
    this.retPair = [];

    this.errorState = '';
  };

  JSDOMParser.prototype = {
    /**
     * Records a parse error in `errorState` and reports it through dump()
     * (Gecko) or console.log(), whichever the host provides.
     */
    error: function (m) {
      var message = 'JSDOMParser error: ' + m + '\n';
      if (typeof dump === 'function') {
        dump(message);
      } else if (typeof console !== 'undefined') {
        console.log(message);
      }
      this.errorState += m + '\n';
    },

    /**
     * Look at the next character without advancing the index.
     */
    peekNext: function () {
      return this.html[this.currentChar];
    },

    /**
     * Get the next character and advance the index.
     */
    nextChar: function () {
      return this.html[this.currentChar++];
    },

    /**
     * Called after a quote character is read. This finds the next quote
     * character and returns the text string in between, or null (with the
     * cursor moved to the end of input) when there is no closing quote.
     */
    readString: function (quote) {
      var str;
      var n = this.html.indexOf(quote, this.currentChar);
      if (n === -1) {
        this.currentChar = this.html.length;
        str = null;
      } else {
        str = this.html.substring(this.currentChar, n);
        this.currentChar = n + 1;
      }

      return str;
    },

    /**
     * Called when parsing a node. This finds the next name/value attribute
     * pair and adds the result to the attributes list.
     */
    readAttribute: function (node) {
      var name = '';

      var n = this.html.indexOf('=', this.currentChar);
      if (n === -1) {
        this.currentChar = this.html.length;
      } else {
        // Read until a '=' character is hit; this will be the attribute key
        name = this.html.substring(this.currentChar, n);
        this.currentChar = n + 1;
      }

      if (!name)
        return;

      // After a '=', we should see a '"' for the attribute value
      var c = this.nextChar();
      if (c !== '"' && c !== '\'') {
        this.error('Error reading attribute ' + name + ', expecting \'"\'');
        return;
      }

      // Read the attribute value (and consume the matching quote)
      var value = this.readString(c);
      if (value === null) {
        // No closing quote before end of input: report it rather than
        // crashing in decodeHTML(null).
        this.error('Error reading attribute ' + name + ', unterminated value');
        return;
      }

      node.attributes.push(new Attribute(name, decodeHTML(value)));

      return;
    },

    /**
     * Parses an Element node. This is called after a '<' has been read.
     *
     * @param retPair an array that receives the result: index 0 is the parsed
     *                node, index 1 is a boolean indicating whether the tag
     *                was self-closed ('/>')
     * @returns true on success, false if no element could be read
     */
    makeElementNode: function (retPair) {
      var c = this.nextChar();

      // Read the Element tag name
      var strBuf = this.strBuf;
      strBuf.length = 0;
      while (whitespace.indexOf(c) == -1 && c !== '>' && c !== '/') {
        if (c === undefined)
          return false;
        strBuf.push(c);
        c = this.nextChar();
      }
      var tag = strBuf.join('');

      if (!tag)
        return false;

      var node = new Element(tag);

      // Read Element attributes
      while (c !== '/' && c !== '>') {
        if (c === undefined)
          return false;
        while (whitespace.indexOf(this.html[this.currentChar++]) != -1) {
          // Advance cursor to first non-whitespace char.
        }
        this.currentChar--;
        c = this.nextChar();
        if (c !== '/' && c !== '>') {
          --this.currentChar;
          this.readAttribute(node);
        }
      }

      // If this is a self-closing tag, read '/>'
      var closed = false;
      if (c === '/') {
        closed = true;
        c = this.nextChar();
        if (c !== '>') {
          this.error('expected \'>\' to close ' + tag);
          return false;
        }
      }

      retPair[0] = node;
      retPair[1] = closed;
      return true;
    },

    /**
     * If the current input matches this string, advance the input index;
     * otherwise, do nothing. The comparison is case-insensitive.
     *
     * @returns whether input matched string
     */
    match: function (str) {
      var strlen = str.length;
      if (this.html.substr(this.currentChar, strlen).toLowerCase() === str.toLowerCase()) {
        this.currentChar += strlen;
        return true;
      }
      return false;
    },

    /**
     * Searches the input until a string is found and discards all input up to
     * and including the matched string. If the string is never found, the
     * rest of the input is discarded.
     */
    discardTo: function (str) {
      var index = this.html.indexOf(str, this.currentChar);
      // Test for "not found" BEFORE adding the needle length; otherwise the
      // cursor would jump backwards and the parser could loop forever.
      this.currentChar = (index === -1 ? this.html.length : index + str.length);
    },

    /**
     * Reads child nodes for the given node.
     */
    readChildren: function (node) {
      var child;
      while ((child = this.readNode())) {
        // Don't keep Comment nodes
        if (child.nodeType !== 8) {
          node.appendChild(child);
        }
      }
    },

    /**
     * Skips a comment, doctype or processing instruction. Called with the
     * cursor just past the '!' or '?' that follows '<'.
     *
     * @returns an empty Comment node, or null if the input ended first
     */
    discardNextComment: function () {
      if (this.match('--')) {
        this.discardTo('-->');
      } else {
        var c = this.nextChar();
        while (c !== '>') {
          if (c === undefined)
            return null;
          if (c === '"' || c === '\'')
            this.readString(c);
          c = this.nextChar();
        }
      }
      return new Comment();
    },

    /**
     * Reads the next child node from the input. If we're reading a closing
     * tag, or if we've reached the end of input, return null.
     *
     * @returns the node
     */
    readNode: function () {
      var c = this.nextChar();

      if (c === undefined)
        return null;

      // Read any text as Text node
      var textNode;
      if (c !== '<') {
        --this.currentChar;
        textNode = new Text();
        var n = this.html.indexOf('<', this.currentChar);
        if (n === -1) {
          textNode.innerHTML = this.html.substring(this.currentChar, this.html.length);
          this.currentChar = this.html.length;
        } else {
          textNode.innerHTML = this.html.substring(this.currentChar, n);
          this.currentChar = n;
        }
        return textNode;
      }

      if (this.match('![CDATA[')) {
        var endChar = this.html.indexOf(']]>', this.currentChar);
        if (endChar === -1) {
          this.error('unclosed CDATA section');
          return null;
        }
        textNode = new Text();
        textNode.textContent = this.html.substring(this.currentChar, endChar);
        this.currentChar = endChar + (']]>').length;
        return textNode;
      }

      c = this.peekNext();

      // Read Comment node. Normally, Comment nodes know their inner
      // textContent, but we don't really care about Comment nodes (we throw
      // them away in readChildren()). So just returning an empty Comment node
      // here is sufficient.
      if (c === '!' || c === '?') {
        // We're still before the ! or ? that is starting this comment:
        this.currentChar++;
        return this.discardNextComment();
      }

      // If we're reading a closing tag, return null. This means we've reached
      // the end of this set of child nodes.
      if (c === '/') {
        --this.currentChar;
        return null;
      }

      // Otherwise, we're looking at an Element node
      var result = this.makeElementNode(this.retPair);
      if (!result)
        return null;

      var node = this.retPair[0];
      var closed = this.retPair[1];
      var localName = node.localName;

      // If this isn't a void Element, read its child nodes
      if (!closed) {
        this.readChildren(node);
        // Use the tag exactly as written (including any namespace prefix) to
        // find the closing tag.
        var closingTag = '</' + node._matchingTag + '>';
        if (!this.match(closingTag)) {
          this.error('expected \'' + closingTag + '\' and got ' + this.html.substr(this.currentChar, closingTag.length));
          return null;
        }
      }

      // Only use the first title, because SVG might have other
      // title elements which we don't care about (medium.com
      // does this, at least).
      if (localName === 'title' && !this.doc.title) {
        this.doc.title = node.textContent.trim();
      } else if (localName === 'head') {
        this.doc.head = node;
      } else if (localName === 'body') {
        this.doc.body = node;
      } else if (localName === 'html') {
        this.doc.documentElement = node;
      }

      return node;
    },

    /**
     * Parses an HTML string and returns a JS implementation of the Document.
     *
     * @param html the markup to parse
     * @param url  stored as the document's documentURI
     */
    parse: function (html, url) {
      this.html = html;
      var doc = this.doc = new Document(url);
      this.readChildren(doc);

      // If this is an HTML document, remove root-level children except for the
      // <html> node
      if (doc.documentElement) {
        for (var i = doc.childNodes.length; --i >= 0;) {
          var child = doc.childNodes[i];
          if (child !== doc.documentElement) {
            doc.removeChild(child);
          }
        }
      }

      return doc;
    },
  };

  // Attach the standard DOM types to the global scope
  global.Node = Node;
  global.Comment = Comment;
  global.Document = Document;
  global.Element = Element;
  global.Text = Text;

  // Attach JSDOMParser to the global scope
  global.JSDOMParser = JSDOMParser;

// `this` is the global object in classic scripts and workers (and
// module.exports under CommonJS); it is undefined inside an ES module, so fall
// back to the real global object there.
})(this || (typeof globalThis !== 'undefined' ? globalThis : self));