1
0
mirror of https://github.com/laurent22/joplin.git synced 2024-12-03 08:35:29 +02:00
joplin/Clipper/content_scripts/JSDOMParser.js
2020-02-23 20:51:04 +00:00

1191 lines
32 KiB
JavaScript

// https://github.com/mozilla/readability/tree/814f0a3884350b6f1adfdebb79ca3599e9806605
/*eslint-env es6:false*/
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
/**
* This is a relatively lightweight DOMParser that is safe to use in a web
* worker. This is far from a complete DOM implementation; however, it should
* contain the minimal set of functionality necessary for Readability.js.
*
* Aside from not implementing the full DOM API, there are other quirks to be
* aware of when using the JSDOMParser:
*
* 1) Properly formed HTML/XML must be used. This means you should be extra
* careful when using this parser on anything received directly from an
* XMLHttpRequest. Providing a serialized string from an XMLSerializer,
* however, should be safe (since the browser's XMLSerializer should
* generate valid HTML/XML). Therefore, if parsing a document from an XHR,
* the recommended approach is to do the XHR in the main thread, use
* XMLSerializer.serializeToString() on the responseXML, and pass the
* resulting string to the worker.
*
* 2) Live NodeLists are not supported. DOM methods and properties such as
* getElementsByTagName() and childNodes return standard arrays. If you
* want these lists to be updated when nodes are removed or added to the
* document, you must take care to manually update them yourself.
*/
(function (global) {
// XML only defines these and the numeric ones:
var entityTable = {
  'lt': '<',
  'gt': '>',
  'amp': '&',
  'quot': '"',
  'apos': '\'',
};
// Inverse of entityTable: special character -> entity reference.
var reverseEntityTable = {
  '<': '&lt;',
  '>': '&gt;',
  '&': '&amp;',
  '"': '&quot;',
  '\'': '&apos;',
};

/**
 * Escapes the characters that are unsafe inside text content (&, < and >).
 * Quotes are left untouched.
 *
 * @param {string} s raw text
 * @returns {string} text with &, < and > replaced by entity references
 */
function encodeTextContentHTML(s) {
  return s.replace(/[&<>]/g, function(x) {
    return reverseEntityTable[x];
  });
}

/**
 * Escapes the characters that are unsafe inside an attribute value
 * (&, <, >, single and double quotes).
 *
 * @param {string} s raw text
 * @returns {string} text with all five characters replaced by entity references
 */
function encodeHTML(s) {
  return s.replace(/[&<>'"]/g, function(x) {
    return reverseEntityTable[x];
  });
}

/**
 * Converts a Unicode code point into a string without relying on
 * String.fromCodePoint (this file targets ES5). Code points above U+FFFF are
 * emitted as a UTF-16 surrogate pair.
 *
 * @param {number} codePoint a valid Unicode scalar value
 * @returns {string}
 */
function codePointToString(codePoint) {
  if (codePoint <= 0xFFFF)
    return String.fromCharCode(codePoint);
  var offset = codePoint - 0x10000;
  return String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
}

/**
 * Decodes the five XML named entities and decimal / hexadecimal numeric
 * character references.
 *
 * Everything is decoded in a single pass so that already-decoded output is
 * never decoded a second time (e.g. "&amp;#38;" yields "&#38;", not "&").
 * Numeric references that denote NUL, a surrogate, or a value beyond
 * U+10FFFF produce U+FFFD, as a conforming HTML parser would. Anything that
 * is not a well-formed reference is left untouched.
 *
 * @param {string} str text possibly containing entity references
 * @returns {string} decoded text
 */
function decodeHTML(str) {
  // Named entities stay case-sensitive; the "x" marker and hex digits accept
  // either case.
  var reference = /&(?:(quot|amp|apos|lt|gt)|#(?:[xX]([0-9a-fA-F]+)|([0-9]+)));/g;
  return str.replace(reference, function(match, named, hex, numStr) {
    if (named)
      return entityTable[named];
    var num = parseInt(hex || numStr, hex ? 16 : 10);
    if (num === 0 || num > 0x10FFFF || (num >= 0xD800 && num <= 0xDFFF))
      num = 0xFFFD;
    return codePointToString(num);
  });
}
// When a style is set in JS, map it to the corresponding CSS attribute.
//
// Keys are the camelCase names used on an element's `style` object; values
// are the hyphenated property names written into the `style` attribute
// string. Further down in this file, every entry is turned into a
// getter/setter pair on Style.prototype that forwards to getStyle()/setStyle()
// with the CSS name.
var styleMap = {
'alignmentBaseline': 'alignment-baseline',
'background': 'background',
'backgroundAttachment': 'background-attachment',
'backgroundClip': 'background-clip',
'backgroundColor': 'background-color',
'backgroundImage': 'background-image',
'backgroundOrigin': 'background-origin',
'backgroundPosition': 'background-position',
'backgroundPositionX': 'background-position-x',
'backgroundPositionY': 'background-position-y',
'backgroundRepeat': 'background-repeat',
'backgroundRepeatX': 'background-repeat-x',
'backgroundRepeatY': 'background-repeat-y',
'backgroundSize': 'background-size',
'baselineShift': 'baseline-shift',
'border': 'border',
'borderBottom': 'border-bottom',
'borderBottomColor': 'border-bottom-color',
'borderBottomLeftRadius': 'border-bottom-left-radius',
'borderBottomRightRadius': 'border-bottom-right-radius',
'borderBottomStyle': 'border-bottom-style',
'borderBottomWidth': 'border-bottom-width',
'borderCollapse': 'border-collapse',
'borderColor': 'border-color',
'borderImage': 'border-image',
'borderImageOutset': 'border-image-outset',
'borderImageRepeat': 'border-image-repeat',
'borderImageSlice': 'border-image-slice',
'borderImageSource': 'border-image-source',
'borderImageWidth': 'border-image-width',
'borderLeft': 'border-left',
'borderLeftColor': 'border-left-color',
'borderLeftStyle': 'border-left-style',
'borderLeftWidth': 'border-left-width',
'borderRadius': 'border-radius',
'borderRight': 'border-right',
'borderRightColor': 'border-right-color',
'borderRightStyle': 'border-right-style',
'borderRightWidth': 'border-right-width',
'borderSpacing': 'border-spacing',
'borderStyle': 'border-style',
'borderTop': 'border-top',
'borderTopColor': 'border-top-color',
'borderTopLeftRadius': 'border-top-left-radius',
'borderTopRightRadius': 'border-top-right-radius',
'borderTopStyle': 'border-top-style',
'borderTopWidth': 'border-top-width',
'borderWidth': 'border-width',
'bottom': 'bottom',
'boxShadow': 'box-shadow',
'boxSizing': 'box-sizing',
'captionSide': 'caption-side',
'clear': 'clear',
'clip': 'clip',
'clipPath': 'clip-path',
'clipRule': 'clip-rule',
'color': 'color',
'colorInterpolation': 'color-interpolation',
'colorInterpolationFilters': 'color-interpolation-filters',
'colorProfile': 'color-profile',
'colorRendering': 'color-rendering',
'content': 'content',
'counterIncrement': 'counter-increment',
'counterReset': 'counter-reset',
'cursor': 'cursor',
'direction': 'direction',
'display': 'display',
'dominantBaseline': 'dominant-baseline',
'emptyCells': 'empty-cells',
'enableBackground': 'enable-background',
'fill': 'fill',
'fillOpacity': 'fill-opacity',
'fillRule': 'fill-rule',
'filter': 'filter',
// The only entry whose JS name is not the camelCase form of the CSS name:
// the CSS `float` property is exposed as `cssFloat`.
'cssFloat': 'float',
'floodColor': 'flood-color',
'floodOpacity': 'flood-opacity',
'font': 'font',
'fontFamily': 'font-family',
'fontSize': 'font-size',
'fontStretch': 'font-stretch',
'fontStyle': 'font-style',
'fontVariant': 'font-variant',
'fontWeight': 'font-weight',
'glyphOrientationHorizontal': 'glyph-orientation-horizontal',
'glyphOrientationVertical': 'glyph-orientation-vertical',
'height': 'height',
'imageRendering': 'image-rendering',
'kerning': 'kerning',
'left': 'left',
'letterSpacing': 'letter-spacing',
'lightingColor': 'lighting-color',
'lineHeight': 'line-height',
'listStyle': 'list-style',
'listStyleImage': 'list-style-image',
'listStylePosition': 'list-style-position',
'listStyleType': 'list-style-type',
'margin': 'margin',
'marginBottom': 'margin-bottom',
'marginLeft': 'margin-left',
'marginRight': 'margin-right',
'marginTop': 'margin-top',
'marker': 'marker',
'markerEnd': 'marker-end',
'markerMid': 'marker-mid',
'markerStart': 'marker-start',
'mask': 'mask',
'maxHeight': 'max-height',
'maxWidth': 'max-width',
'minHeight': 'min-height',
'minWidth': 'min-width',
'opacity': 'opacity',
'orphans': 'orphans',
'outline': 'outline',
'outlineColor': 'outline-color',
'outlineOffset': 'outline-offset',
'outlineStyle': 'outline-style',
'outlineWidth': 'outline-width',
'overflow': 'overflow',
'overflowX': 'overflow-x',
'overflowY': 'overflow-y',
'padding': 'padding',
'paddingBottom': 'padding-bottom',
'paddingLeft': 'padding-left',
'paddingRight': 'padding-right',
'paddingTop': 'padding-top',
'page': 'page',
'pageBreakAfter': 'page-break-after',
'pageBreakBefore': 'page-break-before',
'pageBreakInside': 'page-break-inside',
'pointerEvents': 'pointer-events',
'position': 'position',
'quotes': 'quotes',
'resize': 'resize',
'right': 'right',
'shapeRendering': 'shape-rendering',
'size': 'size',
'speak': 'speak',
'src': 'src',
'stopColor': 'stop-color',
'stopOpacity': 'stop-opacity',
'stroke': 'stroke',
'strokeDasharray': 'stroke-dasharray',
'strokeDashoffset': 'stroke-dashoffset',
'strokeLinecap': 'stroke-linecap',
'strokeLinejoin': 'stroke-linejoin',
'strokeMiterlimit': 'stroke-miterlimit',
'strokeOpacity': 'stroke-opacity',
'strokeWidth': 'stroke-width',
'tableLayout': 'table-layout',
'textAlign': 'text-align',
'textAnchor': 'text-anchor',
'textDecoration': 'text-decoration',
'textIndent': 'text-indent',
'textLineThrough': 'text-line-through',
'textLineThroughColor': 'text-line-through-color',
'textLineThroughMode': 'text-line-through-mode',
'textLineThroughStyle': 'text-line-through-style',
'textLineThroughWidth': 'text-line-through-width',
'textOverflow': 'text-overflow',
'textOverline': 'text-overline',
'textOverlineColor': 'text-overline-color',
'textOverlineMode': 'text-overline-mode',
'textOverlineStyle': 'text-overline-style',
'textOverlineWidth': 'text-overline-width',
'textRendering': 'text-rendering',
'textShadow': 'text-shadow',
'textTransform': 'text-transform',
'textUnderline': 'text-underline',
'textUnderlineColor': 'text-underline-color',
'textUnderlineMode': 'text-underline-mode',
'textUnderlineStyle': 'text-underline-style',
'textUnderlineWidth': 'text-underline-width',
'top': 'top',
'unicodeBidi': 'unicode-bidi',
'unicodeRange': 'unicode-range',
'vectorEffect': 'vector-effect',
'verticalAlign': 'vertical-align',
'visibility': 'visibility',
'whiteSpace': 'white-space',
'widows': 'widows',
'width': 'width',
'wordBreak': 'word-break',
'wordSpacing': 'word-spacing',
'wordWrap': 'word-wrap',
'writingMode': 'writing-mode',
'zIndex': 'z-index',
'zoom': 'zoom',
};
// Elements that can be self-closing. Built as a lookup table
// (localName -> true) so membership can be tested with the `in` operator.
var voidElems = {};
[
  'area', 'base', 'br', 'col', 'command', 'embed', 'hr',
  'img', 'input', 'link', 'meta', 'param', 'source', 'wbr',
].forEach(function (voidName) {
  voidElems[voidName] = true;
});

// Characters the tokenizer treats as whitespace: space, tab, LF, CR.
var whitespace = ' \t\n\r'.split('');

// See http://www.w3schools.com/dom/dom_nodetype.asp
// The numeric code of each node type is its 1-based position in this list.
var nodeTypes = {};
[
  'ELEMENT_NODE',
  'ATTRIBUTE_NODE',
  'TEXT_NODE',
  'CDATA_SECTION_NODE',
  'ENTITY_REFERENCE_NODE',
  'ENTITY_NODE',
  'PROCESSING_INSTRUCTION_NODE',
  'COMMENT_NODE',
  'DOCUMENT_NODE',
  'DOCUMENT_TYPE_NODE',
  'DOCUMENT_FRAGMENT_NODE',
  'NOTATION_NODE',
].forEach(function (typeLabel, position) {
  nodeTypes[typeLabel] = position + 1;
});
/**
 * Shared implementation of getElementsByTagName(); it is installed as a
 * method and therefore reads the search root from `this`.
 *
 * Walks the `children` arrays below the root depth-first and collects, in
 * document order, every descendant whose `tagName` equals the upper-cased
 * `tag`. Passing '*' collects every descendant element. The root itself is
 * never included. The result is a plain array, not a live list.
 *
 * @param {string} tag tag name to look for (case-insensitive), or '*'
 * @returns {Array} matching descendant elements
 */
function getElementsByTagName(tag) {
  var wanted = tag.toUpperCase();
  var matchEverything = wanted === '*';
  var found = [];
  (function collect(parent) {
    var kids = parent.children;
    for (var idx = 0, count = kids.length; idx < count; idx++) {
      var candidate = kids[idx];
      if (matchEverything || candidate.tagName === wanted) {
        found.push(candidate);
      }
      collect(candidate);
    }
  })(this);
  return found;
}
/**
 * Base type for every node produced by the parser. Concrete node types
 * supply their own `childNodes` array (and `children` for nodes that can
 * hold elements); the prototype below only provides defaults and the tree
 * manipulation methods.
 */
var Node = function () {};
Node.prototype = {
  attributes: null,
  childNodes: null,
  localName: null,
  nodeName: null,
  parentNode: null,
  textContent: null,
  nextSibling: null,
  previousSibling: null,

  get firstChild() {
    return this.childNodes[0] || null;
  },

  get firstElementChild() {
    return this.children[0] || null;
  },

  get lastChild() {
    return this.childNodes[this.childNodes.length - 1] || null;
  },

  get lastElementChild() {
    return this.children[this.children.length - 1] || null;
  },

  /**
   * Appends `child` as the last child of this node, detaching it from its
   * current parent first, and keeps the sibling / element-sibling links and
   * the `children` array in sync.
   *
   * @param {Node} child node to append
   * @returns {Node} the appended child
   */
  appendChild: function (child) {
    if (child.parentNode) {
      child.parentNode.removeChild(child);
    }

    var last = this.lastChild;
    if (last)
      last.nextSibling = child;
    child.previousSibling = last;

    if (child.nodeType === Node.ELEMENT_NODE) {
      child.previousElementSibling = this.children[this.children.length - 1] || null;
      this.children.push(child);
      child.previousElementSibling && (child.previousElementSibling.nextElementSibling = child);
    }
    this.childNodes.push(child);
    child.parentNode = this;
    return child;
  },

  /**
   * Removes `child` from this node and clears all of its parent and sibling
   * links.
   *
   * @param {Node} child node to remove
   * @returns {Node} the removed child
   * @throws {string} 'removeChild: node not found' when `child` is not in
   *         this node's childNodes (a string is thrown, not an Error)
   */
  removeChild: function (child) {
    var childNodes = this.childNodes;
    var childIndex = childNodes.indexOf(child);
    if (childIndex === -1) {
      throw 'removeChild: node not found';
    } else {
      child.parentNode = null;
      var prev = child.previousSibling;
      var next = child.nextSibling;
      if (prev)
        prev.nextSibling = next;
      if (next)
        next.previousSibling = prev;

      if (child.nodeType === Node.ELEMENT_NODE) {
        prev = child.previousElementSibling;
        next = child.nextElementSibling;
        if (prev)
          prev.nextElementSibling = next;
        if (next)
          next.previousElementSibling = prev;
        this.children.splice(this.children.indexOf(child), 1);
      }

      child.previousSibling = child.nextSibling = null;
      child.previousElementSibling = child.nextElementSibling = null;

      return childNodes.splice(childIndex, 1)[0];
    }
  },

  /**
   * Puts `newNode` in the position currently occupied by `oldNode`.
   *
   * @param {Node} newNode node to insert; it is detached from its current
   *        parent first
   * @param {Node} oldNode existing child to replace
   * @returns {Node} `oldNode`, with its parent and sibling links cleared
   *          (unchanged when `newNode === oldNode`)
   * @throws {string} 'replaceChild: node not found' when `oldNode` is not in
   *         this node's childNodes (a string is thrown, not an Error)
   */
  replaceChild: function (newNode, oldNode) {
    var childNodes = this.childNodes;
    var childIndex = childNodes.indexOf(oldNode);
    if (childIndex === -1) {
      throw 'replaceChild: node not found';
    } else {
      // Replacing a node with itself is a no-op. Going through the removal
      // below would drop the node and then overwrite its following sibling.
      if (newNode === oldNode)
        return oldNode;

      // This will take care of updating the new node if it was somewhere else before:
      if (newNode.parentNode) {
        newNode.parentNode.removeChild(newNode);
        // If newNode was one of our own children, removing it shifted the
        // entries after it, so the index computed above may be stale.
        childIndex = childNodes.indexOf(oldNode);
      }

      childNodes[childIndex] = newNode;

      // update the new node's sibling properties, and its new siblings' sibling properties
      newNode.nextSibling = oldNode.nextSibling;
      newNode.previousSibling = oldNode.previousSibling;
      if (newNode.nextSibling)
        newNode.nextSibling.previousSibling = newNode;
      if (newNode.previousSibling)
        newNode.previousSibling.nextSibling = newNode;

      newNode.parentNode = this;

      // Now deal with elements before we clear out those values for the old node,
      // because it can help us take shortcuts here:
      if (newNode.nodeType === Node.ELEMENT_NODE) {
        if (oldNode.nodeType === Node.ELEMENT_NODE) {
          // Both were elements, which makes this easier, we just swap things out:
          newNode.previousElementSibling = oldNode.previousElementSibling;
          newNode.nextElementSibling = oldNode.nextElementSibling;
          if (newNode.previousElementSibling)
            newNode.previousElementSibling.nextElementSibling = newNode;
          if (newNode.nextElementSibling)
            newNode.nextElementSibling.previousElementSibling = newNode;
          this.children[this.children.indexOf(oldNode)] = newNode;
        } else {
          // Hard way: locate the neighbouring elements by scanning childNodes.
          newNode.previousElementSibling = (function() {
            for (var i = childIndex - 1; i >= 0; i--) {
              if (childNodes[i].nodeType === Node.ELEMENT_NODE)
                return childNodes[i];
            }
            return null;
          })();
          if (newNode.previousElementSibling) {
            newNode.nextElementSibling = newNode.previousElementSibling.nextElementSibling;
          } else {
            newNode.nextElementSibling = (function() {
              for (var i = childIndex + 1; i < childNodes.length; i++) {
                if (childNodes[i].nodeType === Node.ELEMENT_NODE)
                  return childNodes[i];
              }
              return null;
            })();
          }
          if (newNode.previousElementSibling)
            newNode.previousElementSibling.nextElementSibling = newNode;
          if (newNode.nextElementSibling)
            newNode.nextElementSibling.previousElementSibling = newNode;

          if (newNode.nextElementSibling)
            this.children.splice(this.children.indexOf(newNode.nextElementSibling), 0, newNode);
          else
            this.children.push(newNode);
        }
      } else if (oldNode.nodeType === Node.ELEMENT_NODE) {
        // new node is not an element node.
        // if the old one was, update its element siblings:
        if (oldNode.previousElementSibling)
          oldNode.previousElementSibling.nextElementSibling = oldNode.nextElementSibling;
        if (oldNode.nextElementSibling)
          oldNode.nextElementSibling.previousElementSibling = oldNode.previousElementSibling;
        this.children.splice(this.children.indexOf(oldNode), 1);

        // If the old node wasn't an element, neither the new nor the old node was an element,
        // and the children array and its members shouldn't need any updating.
      }

      oldNode.parentNode = null;
      oldNode.previousSibling = null;
      oldNode.nextSibling = null;
      if (oldNode.nodeType === Node.ELEMENT_NODE) {
        oldNode.previousElementSibling = null;
        oldNode.nextElementSibling = null;
      }
      return oldNode;
    }
  },

  // Marker that lets callers recognise nodes created by this parser.
  __JSDOMParser__: true,
};
// Expose every node-type constant both as a static (Node.TEXT_NODE) and on
// the prototype, so it is also reachable from any node instance.
for (var typeLabel in nodeTypes) {
  Node.prototype[typeLabel] = nodeTypes[typeLabel];
  Node[typeLabel] = nodeTypes[typeLabel];
}
/**
 * A single name/value pair attached to an Element.
 *
 * @param {string} name attribute name
 * @param {string} value attribute value, stored unescaped in `_value`
 */
var Attribute = function (name, value) {
  this.name = name;
  this._value = value;
};

// `value` is read-only from the outside; writes go through setValue().
Attribute.prototype = Object.defineProperty({
  /** Replaces the stored value. */
  setValue: function (updated) {
    this._value = updated;
  },

  /** Returns the stored value escaped for use inside a quoted attribute. */
  getEncodedValue: function () {
    return encodeHTML(this._value);
  },
}, 'value', {
  configurable: true,
  enumerable: true,
  get: function () {
    return this._value;
  },
});
/**
 * Placeholder node for comments and other `<!...>` / `<?...>` constructs.
 * It carries no content of its own; it only has an empty childNodes list.
 */
var Comment = function () {
  this.childNodes = [];
};

Comment.prototype = Object.create(Node.prototype);
Comment.prototype.nodeName = '#comment';
Comment.prototype.nodeType = Node.COMMENT_NODE;
/**
 * A text node. The content is kept in two lazily derived forms:
 * `_innerHTML` (entity-encoded) and `_textContent` (decoded). Assigning one
 * form deletes the cached copy of the other, which is then recomputed on
 * the next read.
 */
var Text = function () {
  this.childNodes = [];
};

Text.prototype = Object.create(Node.prototype, {
  nodeName: {
    value: '#text',
    writable: true,
    enumerable: true,
    configurable: true,
  },

  nodeType: {
    value: Node.TEXT_NODE,
    writable: true,
    enumerable: true,
    configurable: true,
  },

  textContent: {
    enumerable: true,
    configurable: true,
    // Decoded text; derived from the encoded form when not cached.
    get: function () {
      if (this._textContent === undefined) {
        this._textContent = decodeHTML(this._innerHTML || '');
      }
      return this._textContent;
    },
    set: function (newText) {
      this._textContent = newText;
      delete this._innerHTML;
    },
  },

  innerHTML: {
    enumerable: true,
    configurable: true,
    // Encoded text; derived from the decoded form when not cached.
    get: function () {
      if (this._innerHTML === undefined) {
        this._innerHTML = encodeTextContentHTML(this._textContent || '');
      }
      return this._innerHTML;
    },
    set: function (newHTML) {
      this._innerHTML = newHTML;
      delete this._textContent;
    },
  },
});
/**
 * Root node of a parsed document.
 *
 * @param {string} url stored as `documentURI`; also the fallback for baseURI
 */
var Document = function (url) {
  this.documentURI = url;
  this.styleSheets = [];
  this.childNodes = [];
  this.children = [];
};

Document.prototype = {
  __proto__: Node.prototype,

  nodeName: '#document',
  nodeType: Node.DOCUMENT_NODE,
  title: '',

  getElementsByTagName: getElementsByTagName,

  /**
   * Depth-first search (starting with this node) for the first node whose
   * `id` property equals `id`.
   *
   * @returns the matching node, or null when there is none
   */
  getElementById: function (id) {
    function search(candidate) {
      var kids = candidate.children;
      var count = kids.length;
      if (candidate.id === id) {
        return candidate;
      }
      for (var k = 0; k < count; k++) {
        var hit = search(kids[k]);
        if (hit) {
          return hit;
        }
      }
      return null;
    }
    return search(this);
  },

  /** Creates a detached Element with the given tag name. */
  createElement: function (tag) {
    return new Element(tag);
  },

  /** Creates a detached Text node holding `text`. */
  createTextNode: function (text) {
    var textNode = new Text();
    textNode.textContent = text;
    return textNode;
  },

  /**
   * Base URI of the document, computed once and cached in `_baseURI`.
   * Starts from documentURI; when the first <base> element has an href, that
   * href is resolved against documentURI and used instead. If resolution
   * throws, documentURI is kept.
   */
  get baseURI() {
    if (this.hasOwnProperty('_baseURI')) {
      return this._baseURI;
    }
    this._baseURI = this.documentURI;
    var firstBase = this.getElementsByTagName('base')[0];
    var href = firstBase && firstBase.getAttribute('href');
    if (href) {
      try {
        this._baseURI = (new URL(href, this._baseURI)).href;
      } catch (ex) {/* Just fall back to documentURI */}
    }
    return this._baseURI;
  },
};
/**
 * Clears every tree link (parent, siblings, element siblings) on the current
 * children of `parent`. Used before a setter replaces the child lists
 * wholesale, so that detached nodes do not keep pointing into the old tree
 * (appendChild() does not reset `nextSibling`, so a stale link would survive
 * re-insertion elsewhere).
 */
function detachChildren(parent) {
  for (var i = parent.childNodes.length; --i >= 0;) {
    var orphan = parent.childNodes[i];
    orphan.parentNode = null;
    orphan.previousSibling = orphan.nextSibling = null;
    if (orphan.nodeType === Node.ELEMENT_NODE) {
      orphan.previousElementSibling = orphan.nextElementSibling = null;
    }
  }
}

/**
 * An element node.
 *
 * @param {string} tag tag name exactly as written in the source. Any
 *        namespace prefix (everything up to the last ':') is dropped for
 *        localName / tagName, but the full name is kept in `_matchingTag`.
 */
var Element = function (tag) {
  // We use this to find the closing tag.
  this._matchingTag = tag;
  // We're explicitly a non-namespace aware parser, we just pretend it's all HTML.
  var lastColonIndex = tag.lastIndexOf(':');
  if (lastColonIndex !== -1) {
    tag = tag.substring(lastColonIndex + 1);
  }
  this.attributes = [];
  this.childNodes = [];
  this.children = [];
  this.nextElementSibling = this.previousElementSibling = null;
  this.localName = tag.toLowerCase();
  this.tagName = tag.toUpperCase();
  this.style = new Style(this);
};

Element.prototype = {
  __proto__: Node.prototype,

  nodeType: Node.ELEMENT_NODE,

  getElementsByTagName: getElementsByTagName,

  // The accessors below reflect the attribute of the same name and read as
  // '' when the attribute is absent.
  get className() {
    return this.getAttribute('class') || '';
  },

  set className(str) {
    this.setAttribute('class', str);
  },

  get id() {
    return this.getAttribute('id') || '';
  },

  set id(str) {
    this.setAttribute('id', str);
  },

  get href() {
    return this.getAttribute('href') || '';
  },

  set href(str) {
    this.setAttribute('href', str);
  },

  get src() {
    return this.getAttribute('src') || '';
  },

  set src(str) {
    this.setAttribute('src', str);
  },

  get srcset() {
    return this.getAttribute('srcset') || '';
  },

  set srcset(str) {
    this.setAttribute('srcset', str);
  },

  get nodeName() {
    return this.tagName;
  },

  /**
   * Serializes the descendants of this element (not the element itself).
   * Attribute values are entity-encoded; void elements without children are
   * written in self-closing form.
   */
  get innerHTML() {
    // Using Array.join() avoids the overhead from lazy string concatenation.
    // See http://blog.cdleary.com/2012/01/string-representation-in-spidermonkey/#ropes
    var arr = [];

    function getHTML(node) {
      var i = 0;
      for (i = 0; i < node.childNodes.length; i++) {
        var child = node.childNodes[i];
        if (child.localName) {
          arr.push('<' + child.localName);

          // serialize attribute list
          for (var j = 0; j < child.attributes.length; j++) {
            var attr = child.attributes[j];
            // the attribute value will be HTML escaped.
            var val = attr.getEncodedValue();
            var quote = (val.indexOf('"') === -1 ? '"' : '\'');
            arr.push(' ' + attr.name + '=' + quote + val + quote);
          }

          if (child.localName in voidElems && !child.childNodes.length) {
            // if this is a self-closing element, end it here
            arr.push('/>');
          } else {
            // otherwise, add its children
            arr.push('>');
            getHTML(child);
            arr.push('</' + child.localName + '>');
          }
        } else {
          // This is a text node, so asking for innerHTML won't recurse.
          arr.push(child.innerHTML);
        }
      }
    }

    getHTML(this);
    return arr.join('');
  },

  /**
   * Parses `html` with a fresh JSDOMParser and adopts the resulting child
   * lists. The previous children are fully detached first.
   */
  set innerHTML(html) {
    var parser = new JSDOMParser();
    var node = parser.parse(html);
    detachChildren(this);
    this.childNodes = node.childNodes;
    this.children = node.children;
    for (var i = this.childNodes.length; --i >= 0;) {
      this.childNodes[i].parentNode = this;
    }
  },

  /**
   * Replaces all children with a single Text node holding `text`. The
   * previous children are fully detached first.
   */
  set textContent(text) {
    detachChildren(this);

    var node = new Text();
    this.childNodes = [ node ];
    this.children = [];
    node.textContent = text;
    node.parentNode = this;
  },

  /** Concatenation of the text of every descendant Text node. */
  get textContent() {
    // Using Array.join() avoids the overhead from lazy string concatenation.
    // See http://blog.cdleary.com/2012/01/string-representation-in-spidermonkey/#ropes
    var text = [];

    function getText(node) {
      var nodes = node.childNodes;
      for (var i = 0; i < nodes.length; i++) {
        var child = nodes[i];
        if (child.nodeType === 3) {
          text.push(child.textContent);
        } else {
          getText(child);
        }
      }
    }

    getText(this);
    return text.join('');
  },

  /**
   * @returns the value of the named attribute, or undefined when absent
   *          (note: undefined, not null as in the browser DOM)
   */
  getAttribute: function (name) {
    for (var i = this.attributes.length; --i >= 0;) {
      var attr = this.attributes[i];
      if (attr.name === name) {
        return attr.value;
      }
    }
    return undefined;
  },

  /** Updates the named attribute in place, or appends it when absent. */
  setAttribute: function (name, value) {
    for (var i = this.attributes.length; --i >= 0;) {
      var attr = this.attributes[i];
      if (attr.name === name) {
        attr.setValue(value);
        return;
      }
    }
    this.attributes.push(new Attribute(name, value));
  },

  /** Removes the named attribute; does nothing when it is absent. */
  removeAttribute: function (name) {
    for (var i = this.attributes.length; --i >= 0;) {
      var attr = this.attributes[i];
      if (attr.name === name) {
        this.attributes.splice(i, 1);
        break;
      }
    }
  },

  /** @returns {boolean} whether an attribute with this exact name exists */
  hasAttribute: function (name) {
    return this.attributes.some(function (attr) {
      return attr.name === name;
    });
  },
};
/**
 * Minimal stand-in for an element's `style` object.
 *
 * @param node the owning element; only its getAttribute() and setAttribute()
 *        methods are used
 */
var Style = function (node) {
  this.node = node;
};

// getStyle() and setStyle() use the style attribute string directly. This
// won't be very efficient if there are a lot of style manipulations, but
// it's the easiest way to make sure the style attribute string and the JS
// style property stay in sync. Readability.js doesn't do many style
// manipulations, so this should be okay.
Style.prototype = {
  /**
   * Looks up a declaration in the element's style attribute.
   *
   * The attribute is split on ';' and each declaration on its FIRST ':', so
   * values that themselves contain a colon (e.g. `url(http://...)`) are
   * returned in full. Declarations without a colon are ignored. Values
   * containing ';' are still not supported.
   *
   * @param {string} styleName CSS property name, e.g. 'background-color'
   * @returns {string|undefined} the trimmed value of the first matching
   *          declaration, or undefined when there is none
   */
  getStyle: function (styleName) {
    var attr = this.node.getAttribute('style');
    if (!attr)
      return undefined;

    var styles = attr.split(';');
    for (var i = 0; i < styles.length; i++) {
      var declaration = styles[i];
      var colon = declaration.indexOf(':');
      if (colon === -1)
        continue;
      if (declaration.substr(0, colon).trim() === styleName)
        return declaration.substr(colon + 1).trim();
    }

    return undefined;
  },

  /**
   * Sets a declaration in the element's style attribute. The first existing
   * declaration for `styleName` is cut out, and `styleName: styleValue;` is
   * appended at the end of the attribute.
   *
   * @param {string} styleName CSS property name
   * @param {string} styleValue value to write
   */
  setStyle: function (styleName, styleValue) {
    var value = this.node.getAttribute('style') || '';
    var index = 0;
    do {
      // `next` is the offset just past the next ';', or 0 when there is no
      // further ';' (indexOf returned -1), which also ends the loop.
      var next = value.indexOf(';', index) + 1;
      var length = next - index - 1;
      var style = (length > 0 ? value.substr(index, length) : value.substr(index));
      if (style.substr(0, style.indexOf(':')).trim() === styleName) {
        value = value.substr(0, index).trim() + (next ? ' ' + value.substr(next).trim() : '');
        break;
      }
      index = next;
    } while (index);

    value += ' ' + styleName + ': ' + styleValue + ';';
    this.node.setAttribute('style', value.trim());
  },
};
// For each item in styleMap, define a getter and setter on the style
// property. The JS name (e.g. `backgroundColor`) becomes an accessor on
// Style.prototype that forwards to getStyle()/setStyle() with the matching
// CSS name (e.g. `background-color`). Object.defineProperty is used instead
// of the deprecated __defineGetter__/__defineSetter__; the resulting
// property is enumerable and configurable, as before.
for (var jsName in styleMap) {
  (function (propName, cssName) {
    Object.defineProperty(Style.prototype, propName, {
      configurable: true,
      enumerable: true,
      get: function () {
        return this.getStyle(cssName);
      },
      set: function (value) {
        this.setStyle(cssName, value);
      },
    });
  })(jsName, styleMap[jsName]);
}
/**
 * Hand-written parser that turns a well-formed HTML/XML string into the
 * lightweight node types defined in this file. One instance keeps its cursor
 * (`currentChar`) and accumulated error messages (`errorState`).
 */
var JSDOMParser = function () {
  this.currentChar = 0;

  // In makeElementNode() we build up many strings one char at a time. Using
  // += for this results in lots of short-lived intermediate strings. It's
  // better to build an array of single-char strings and then join() them
  // together at the end. And reusing a single array (i.e. |this.strBuf|)
  // over and over for this purpose uses less memory than using a new array
  // for each string.
  this.strBuf = [];

  // Similarly, we reuse this array to return the two arguments from
  // makeElementNode(), which saves us from having to allocate a new array
  // every time.
  this.retPair = [];

  this.errorState = '';
};

JSDOMParser.prototype = {
  /**
   * Records a parse error in `errorState` and logs it. `dump` is only
   * defined in Mozilla environments, so fall back to console.log (and to
   * not logging at all) instead of throwing a ReferenceError elsewhere.
   */
  error: function(m) {
    if (typeof dump !== 'undefined') {
      dump('JSDOMParser error: ' + m + '\n');
    } else if (typeof console !== 'undefined') {
      console.log('JSDOMParser error: ' + m + '\n');
    }
    this.errorState += m + '\n';
  },

  /**
   * Look at the next character without advancing the index.
   */
  peekNext: function () {
    return this.html[this.currentChar];
  },

  /**
   * Get the next character and advance the index.
   */
  nextChar: function () {
    return this.html[this.currentChar++];
  },

  /**
   * Called after a quote character is read. This finds the next quote
   * character and returns the text string in between.
   *
   * @returns the text before the closing quote, or null when there is no
   *          closing quote (the cursor is then moved to the end of input)
   */
  readString: function (quote) {
    var str;
    var n = this.html.indexOf(quote, this.currentChar);
    if (n === -1) {
      this.currentChar = this.html.length;
      str = null;
    } else {
      str = this.html.substring(this.currentChar, n);
      this.currentChar = n + 1;
    }

    return str;
  },

  /**
   * Called when parsing a node. This finds the next name/value attribute
   * pair and adds the result to the attributes list.
   *
   * Only `name="value"` / `name='value'` is understood; on malformed input
   * an error is recorded and no attribute is added.
   */
  readAttribute: function (node) {
    var name = '';

    var n = this.html.indexOf('=', this.currentChar);
    if (n === -1) {
      this.currentChar = this.html.length;
    } else {
      // Read until a '=' character is hit; this will be the attribute key
      name = this.html.substring(this.currentChar, n);
      this.currentChar = n + 1;
    }

    if (!name)
      return;

    // After a '=', we should see a '"' for the attribute value
    var c = this.nextChar();
    if (c !== '"' && c !== '\'') {
      this.error('Error reading attribute ' + name + ', expecting \'"\'');
      return;
    }

    // Read the attribute value (and consume the matching quote)
    var value = this.readString(c);
    if (value === null) {
      // Unterminated quote: readString() already moved the cursor to the end
      // of input. Report it instead of passing null to decodeHTML().
      this.error('Error reading attribute ' + name + ', missing closing quote');
      return;
    }

    node.attributes.push(new Attribute(name, decodeHTML(value)));

    return;
  },

  /**
   * Parses an Element start tag. This is called after a '<' has been read.
   *
   * @param {Array} retPair output array: on success index 0 receives the
   *        parsed Element and index 1 a boolean telling whether the tag was
   *        written in self-closing form
   * @returns {boolean} true on success, false when the input ended early or
   *          the tag was malformed
   */
  makeElementNode: function (retPair) {
    var c = this.nextChar();

    // Read the Element tag name
    var strBuf = this.strBuf;
    strBuf.length = 0;
    while (whitespace.indexOf(c) === -1 && c !== '>' && c !== '/') {
      if (c === undefined)
        return false;
      strBuf.push(c);
      c = this.nextChar();
    }
    var tag = strBuf.join('');

    if (!tag)
      return false;

    var node = new Element(tag);

    // Read Element attributes
    while (c !== '/' && c !== '>') {
      if (c === undefined)
        return false;
      while (whitespace.indexOf(this.html[this.currentChar++]) !== -1) {
        // Advance cursor to first non-whitespace char.
      }
      this.currentChar--;
      c = this.nextChar();
      if (c !== '/' && c !== '>') {
        --this.currentChar;
        this.readAttribute(node);
      }
    }

    // If this is a self-closing tag, read '/>'
    var closed = false;
    if (c === '/') {
      closed = true;
      c = this.nextChar();
      if (c !== '>') {
        this.error('expected \'>\' to close ' + tag);
        return false;
      }
    }

    retPair[0] = node;
    retPair[1] = closed;
    return true;
  },

  /**
   * If the current input matches this string, advance the input index;
   * otherwise, do nothing. The comparison is case-insensitive.
   *
   * @returns whether input matched string
   */
  match: function (str) {
    var strlen = str.length;
    if (this.html.substr(this.currentChar, strlen).toLowerCase() === str.toLowerCase()) {
      this.currentChar += strlen;
      return true;
    }
    return false;
  },

  /**
   * Searches the input until a string is found and discards all input up to
   * and including the matched string. When the string does not occur, the
   * rest of the input is discarded.
   */
  discardTo: function (str) {
    var found = this.html.indexOf(str, this.currentChar);
    this.currentChar = (found === -1) ? this.html.length : found + str.length;
  },

  /**
   * Reads child nodes for the given node.
   */
  readChildren: function (node) {
    var child;
    while ((child = this.readNode())) {
      // Don't keep Comment nodes
      if (child.nodeType !== 8) {
        node.appendChild(child);
      }
    }
  },

  /**
   * Skips a `<!-- ... -->` comment or any other `<! ... >` / `<? ... >`
   * construct; quoted strings inside the latter are skipped as a unit.
   *
   * @returns an empty Comment node, or null when the input ends first
   */
  discardNextComment: function() {
    if (this.match('--')) {
      this.discardTo('-->');
    } else {
      var c = this.nextChar();
      while (c !== '>') {
        if (c === undefined)
          return null;
        if (c === '"' || c === '\'')
          this.readString(c);
        c = this.nextChar();
      }
    }
    return new Comment();
  },

  /**
   * Reads the next child node from the input. If we're reading a closing
   * tag, or if we've reached the end of input, return null.
   *
   * @returns the node
   */
  readNode: function () {
    var c = this.nextChar();

    if (c === undefined)
      return null;

    // Read any text as Text node
    var textNode;
    if (c !== '<') {
      --this.currentChar;
      textNode = new Text();
      var n = this.html.indexOf('<', this.currentChar);
      if (n === -1) {
        textNode.innerHTML = this.html.substring(this.currentChar, this.html.length);
        this.currentChar = this.html.length;
      } else {
        textNode.innerHTML = this.html.substring(this.currentChar, n);
        this.currentChar = n;
      }
      return textNode;
    }

    if (this.match('![CDATA[')) {
      var endChar = this.html.indexOf(']]>', this.currentChar);
      if (endChar === -1) {
        this.error('unclosed CDATA section');
        return null;
      }
      textNode = new Text();
      textNode.textContent = this.html.substring(this.currentChar, endChar);
      this.currentChar = endChar + (']]>').length;
      return textNode;
    }

    c = this.peekNext();

    // Read Comment node. Normally, Comment nodes know their inner
    // textContent, but we don't really care about Comment nodes (we throw
    // them away in readChildren()). So just returning an empty Comment node
    // here is sufficient.
    if (c === '!' || c === '?') {
      // We're still before the ! or ? that is starting this comment:
      this.currentChar++;
      return this.discardNextComment();
    }

    // If we're reading a closing tag, return null. This means we've reached
    // the end of this set of child nodes.
    if (c === '/') {
      --this.currentChar;
      return null;
    }

    // Otherwise, we're looking at an Element node
    var result = this.makeElementNode(this.retPair);
    if (!result)
      return null;

    var node = this.retPair[0];
    var closed = this.retPair[1];
    var localName = node.localName;

    // If this isn't a void Element, read its child nodes
    if (!closed) {
      this.readChildren(node);
      var closingTag = '</' + node._matchingTag + '>';
      if (!this.match(closingTag)) {
        this.error('expected \'' + closingTag + '\' and got ' + this.html.substr(this.currentChar, closingTag.length));
        return null;
      }
    }

    // Only use the first title, because SVG might have other
    // title elements which we don't care about (medium.com
    // does this, at least).
    if (localName === 'title' && !this.doc.title) {
      this.doc.title = node.textContent.trim();
    } else if (localName === 'head') {
      this.doc.head = node;
    } else if (localName === 'body') {
      this.doc.body = node;
    } else if (localName === 'html') {
      this.doc.documentElement = node;
    }

    return node;
  },

  /**
   * Parses an HTML string and returns a JS implementation of the Document.
   *
   * @param {string} html markup to parse
   * @param {string} url passed to the Document constructor
   */
  parse: function (html, url) {
    this.html = html;
    var doc = this.doc = new Document(url);
    this.readChildren(doc);

    // If this is an HTML document, remove root-level children except for the
    // <html> node
    if (doc.documentElement) {
      for (var i = doc.childNodes.length; --i >= 0;) {
        var child = doc.childNodes[i];
        if (child !== doc.documentElement) {
          doc.removeChild(child);
        }
      }
    }

    return doc;
  },
};
// Attach the standard DOM types, followed by JSDOMParser itself, to the
// global scope that was handed to the enclosing function.
[
  ['Node', Node],
  ['Comment', Comment],
  ['Document', Document],
  ['Element', Element],
  ['Text', Text],
  ['JSDOMParser', JSDOMParser],
].forEach(function (exported) {
  global[exported[0]] = exported[1];
});
})(this);