1
0
mirror of https://github.com/laurent22/joplin.git synced 2025-07-16 00:14:34 +02:00

All: Use Lerna to manage monorepo

This commit is contained in:
Laurent Cozic
2020-11-05 16:58:23 +00:00
parent 122f20905c
commit cc07016b07
2839 changed files with 54217 additions and 16111 deletions

View File

@ -0,0 +1,38 @@
import MultiplexHandler from "./MultiplexHandler";
import { Handler } from "./Parser";
/**
 * Handler that records every event it receives and forwards it to an
 * optional set of callbacks. The recorded events can be replayed against
 * those callbacks with `restart()`.
 */
export class CollectingHandler extends MultiplexHandler {
    /** Callbacks that events are forwarded to. */
    _cbs: Partial<Handler>;
    /** Recorded events, each stored as `[eventName, ...arguments]`. */
    events: [keyof Handler, ...unknown[]][];

    /**
     * @param cbs Callbacks to forward events to. Defaults to no callbacks.
     */
    constructor(cbs: Partial<Handler> = {}) {
        super((eventName, ...eventArgs) => {
            // Record first, then forward to the matching callback (if any).
            this.events.push([eventName, ...eventArgs]);
            // @ts-ignore
            if (this._cbs[eventName]) this._cbs[eventName](...eventArgs);
        });
        this._cbs = cbs;
        this.events = [];
    }

    /** Drops all recorded events and forwards the reset to the callbacks. */
    onreset() {
        this.events = [];
        if (this._cbs.onreset) this._cbs.onreset();
    }

    /**
     * Resets the callbacks and then replays every recorded event, in the
     * order it was recorded, to the callbacks that handle it.
     */
    restart() {
        if (this._cbs.onreset) this._cbs.onreset();
        let position = 0;
        // `this.events` is re-read on every pass, exactly like an index loop.
        while (position < this.events.length) {
            const [eventName, ...eventArgs] = this.events[position];
            position++;
            if (this._cbs[eventName]) {
                // @ts-ignore
                this._cbs[eventName](...eventArgs);
            }
        }
    }
}

View File

@ -0,0 +1,25 @@
//Runs tests for feeds
import * as helper from "./__fixtures__/test-helper";
import { FeedHandler, parseFeed } from "./FeedHandler";
import fs from "fs";
import path from "path";
// Directory holding the fixture documents used by the tests in this file.
const documents = path.join(__dirname, "__fixtures__", "Documents");
// For every "Feeds" fixture: read the referenced document, feed it to a
// FeedHandler in XML mode and report the resulting feed through the callback.
helper.createSuite("Feeds", (fixture, done) => {
    const fixturePath = path.join(documents, fixture.file);
    const xml = fs.readFileSync(fixturePath, "utf8");
    const handler: FeedHandler = new FeedHandler(error =>
        done(error, handler.feed)
    );
    helper.writeToParser(handler, { xmlMode: true }, xml);
});
// Snapshot test for the string-based `parseFeed` helper.
describe("parseFeed", () => {
    test("(rssFeed)", async () => {
        const rssPath = path.join(documents, "RSS_Example.xml");
        const xml = await fs.promises.readFile(rssPath, "utf8");
        expect(parseFeed(xml)).toMatchSnapshot();
    });
});

View File

@ -0,0 +1,205 @@
import DomHandler, { DomHandlerOptions, Node, Element } from "domhandler";
import * as DomUtils from "domutils";
import { Parser, ParserOptions } from "./Parser";
/** A single entry of a feed, as extracted by `FeedHandler.onend`. */
interface FeedItem {
// Text of `<id>` (Atom) or `<guid>` (RSS/RDF).
id?: string;
// Text of `<title>`.
title?: string;
// `href` attribute of `<link>` (Atom) or text of `<link>` (RSS/RDF).
link?: string;
// Text of `<summary>`, falling back to `<content>` (Atom), or of `<description>` (RSS/RDF).
description?: string;
// Date built from the text of `<updated>` (Atom) or `<pubDate>` (RSS/RDF).
pubDate?: Date;
}
/** Feed-level metadata and items, as extracted by `FeedHandler.onend`. */
interface Feed {
// "atom" for a `<feed>` root, otherwise the first three characters of the root tag name.
type?: string;
// Text of `<id>` (Atom); set to the empty string for RSS/RDF feeds.
id?: string;
// Text of `<title>`.
title?: string;
// `href` attribute of `<link>` (Atom) or text of `<link>` (RSS/RDF).
link?: string;
// Text of `<subtitle>` (Atom) or `<description>` (RSS/RDF).
description?: string;
// Date built from the text of `<updated>` (Atom) or `<lastBuildDate>` (RSS/RDF).
updated?: Date;
// Text of the first nested `<email>` (Atom) or `<managingEditor>` (RSS/RDF).
author?: string;
// One entry per `<entry>` (Atom) or `<item>` (RSS/RDF) element.
items?: FeedItem[];
}
//TODO: Consume data as it is coming in
/**
 * DOM handler that, when parsing ends, extracts feed metadata from the parsed
 * document and stores it in `feed`.
 *
 * A `<feed>` root is read as Atom. Any other accepted root (`<rss>`,
 * `<rdf:RDF>`) is read through its `<channel>` element.
 */
export class FeedHandler extends DomHandler {
    /** The extracted feed. Assigned by `onend`. */
    feed?: Feed;

    /**
     * @param callback Completion callback forwarded to `DomHandler`. An
     *     options object may be passed in this position instead.
     * @param options Options forwarded to `DomHandler`.
     */
    constructor(
        callback?: ((error: Error | null) => void) | DomHandlerOptions,
        options?: DomHandlerOptions
    ) {
        if (typeof callback === "object" && callback !== null) {
            // Options were passed as the first argument. Copy them BEFORE
            // clearing `callback`; the reverse order would always leave
            // `options` undefined and silently drop the caller's options.
            options = callback;
            callback = undefined;
        }
        super(callback, options);
    }

    /**
     * Builds `this.feed` from the parsed DOM and reports completion through
     * `handleCallback`: `null` when a feed root was found, otherwise an
     * `Error`.
     */
    onend() {
        const feed: Feed = {};
        const feedRoot = getOneElement(isValidFeed, this.dom);
        if (feedRoot) {
            if (feedRoot.name === "feed") {
                // Atom
                const childs = feedRoot.children;
                feed.type = "atom";
                addConditionally(feed, "id", "id", childs);
                addConditionally(feed, "title", "title", childs);
                const href = getAttribute(
                    "href",
                    getOneElement("link", childs)
                );
                if (href) {
                    feed.link = href;
                }
                addConditionally(feed, "description", "subtitle", childs);
                const updated = fetch("updated", childs);
                if (updated) {
                    feed.updated = new Date(updated);
                }
                addConditionally(feed, "author", "email", childs, true);
                feed.items = getElements("entry", childs).map(item => {
                    const entry: FeedItem = {};
                    const { children } = item;
                    addConditionally(entry, "id", "id", children);
                    addConditionally(entry, "title", "title", children);
                    const href = getAttribute(
                        "href",
                        getOneElement("link", children)
                    );
                    if (href) {
                        entry.link = href;
                    }
                    const description =
                        fetch("summary", children) ||
                        fetch("content", children);
                    if (description) {
                        entry.description = description;
                    }
                    const pubDate = fetch("updated", children);
                    if (pubDate) {
                        entry.pubDate = new Date(pubDate);
                    }
                    return entry;
                });
            } else {
                // RSS / RDF. A document may lack <channel>; treat that as an
                // empty channel instead of throwing on `.children`.
                const channel = getOneElement("channel", feedRoot.children);
                const childs = channel ? channel.children : [];
                feed.type = feedRoot.name.substr(0, 3);
                feed.id = "";
                addConditionally(feed, "title", "title", childs);
                addConditionally(feed, "link", "link", childs);
                addConditionally(feed, "description", "description", childs);
                const updated = fetch("lastBuildDate", childs);
                if (updated) {
                    feed.updated = new Date(updated);
                }
                addConditionally(
                    feed,
                    "author",
                    "managingEditor",
                    childs,
                    true
                );
                // Items are searched from the root, not only inside <channel>.
                feed.items = getElements("item", feedRoot.children).map(
                    (item: Element) => {
                        const entry: FeedItem = {};
                        const { children } = item;
                        addConditionally(entry, "id", "guid", children);
                        addConditionally(entry, "title", "title", children);
                        addConditionally(entry, "link", "link", children);
                        addConditionally(
                            entry,
                            "description",
                            "description",
                            children
                        );
                        const pubDate = fetch("pubDate", children);
                        if (pubDate) entry.pubDate = new Date(pubDate);
                        return entry;
                    }
                );
            }
        }
        this.feed = feed;
        this.handleCallback(
            feedRoot ? null : Error("couldn't find root of feed")
        );
    }
}
/**
 * Collects all elements named `what` from `where`, searching recursively.
 *
 * @param what Tag name to look for.
 * @param where Node or nodes to search.
 */
function getElements(what: string, where: Node | Node[]) {
    const matches = DomUtils.getElementsByTagName(what, where, true);
    return matches;
}
/**
 * Returns the first element matching `what` in `where` (recursive search
 * limited to one result). The result is `undefined` when nothing matches.
 *
 * @param what Tag name, or a predicate applied to tag names.
 * @param where Node or nodes to search.
 */
function getOneElement(
    what: string | ((name: string) => boolean),
    where: Node | Node[]
) {
    const [first] = DomUtils.getElementsByTagName(what, where, true, 1);
    return first;
}
/**
 * Returns the trimmed text of the first element named `what` in `where`.
 *
 * @param what Tag name to look for.
 * @param where Node or nodes to search.
 * @param recurse Whether to search nested elements as well.
 */
function fetch(what: string, where: Node | Node[], recurse = false): string {
    const firstMatch = DomUtils.getElementsByTagName(what, where, recurse, 1);
    const text = DomUtils.getText(firstMatch);
    return text.trim();
}
/**
 * Reads attribute `name` from `elem`.
 *
 * @param name Attribute name.
 * @param elem Element to read from; a falsy value yields `null`.
 * @returns The value stored under `name` in `elem.attribs`, or `null` when no element was given.
 */
function getAttribute(name: string, elem: Element | null): string | null {
    return elem ? elem.attribs[name] : null;
}
/**
 * Copies the text of element `what` onto `obj[prop]`, but only when that text
 * is non-empty.
 *
 * @param obj Object to assign to.
 * @param prop Property of `obj` that receives the text.
 * @param what Tag name whose text is looked up via `fetch`.
 * @param where Node or nodes to search.
 * @param recurse Whether to search nested elements as well.
 */
function addConditionally<T>(
    obj: T,
    prop: keyof T,
    what: string,
    where: Node | Node[],
    recurse = false
) {
    const text = fetch(what, where, recurse);
    if (!text) return;
    // @ts-ignore
    obj[prop] = text;
}
/**
 * Tells whether `value` is one of the tag names accepted as a feed root.
 *
 * @param value Tag name to check (compared case-sensitively).
 */
function isValidFeed(value: string) {
    switch (value) {
        case "rss":
        case "feed":
        case "rdf:RDF":
            return true;
        default:
            return false;
    }
}
/** Options used by `parseFeed` when the caller supplies none. */
const defaultOptions = { xmlMode: true };
/**
 * Parse a feed.
 *
 * The same `options` object is handed to both the `FeedHandler` constructor
 * and the `Parser`.
 *
 * @param feed The feed that should be parsed, as a string.
 * @param options Optionally, options for parsing. When using this option, you probably want to set `xmlMode` to `true`.
 * @returns The handler's `feed` after the whole string has been parsed.
 */
export function parseFeed(
    feed: string,
    options: ParserOptions & DomHandlerOptions = defaultOptions
): Feed | undefined {
    const feedHandler = new FeedHandler(options);
    const parser = new Parser(feedHandler, options);
    parser.end(feed);
    return feedHandler.feed;
}

View File

@ -0,0 +1,58 @@
import { Handler } from "./Parser";
/**
 * Calls a specific handler function for all events that are encountered.
 *
 * Every `on*` method forwards to that function, passing its own event name
 * first and then the arguments it received, unchanged.
 */
export default class MultiplexHandler implements Handler {
    /** The function all events are multiplexed to. */
    _func: (event: keyof Handler, ...args: unknown[]) => void;

    /**
     * @param func The function to multiplex all events to.
     */
    constructor(func: (event: keyof Handler, ...args: unknown[]) => void) {
        this._func = func;
    }

    // Lifecycle events
    onparserinit(parser: {}) {
        this._func("onparserinit", parser);
    }
    onreset() {
        this._func("onreset");
    }
    onend() {
        this._func("onend");
    }
    onerror(error: Error) {
        this._func("onerror", error);
    }

    // Tag and attribute events
    onopentagname(name: string) {
        this._func("onopentagname", name);
    }
    onattribute(name: string, value: string) {
        this._func("onattribute", name, value);
    }
    onopentag(name: string, attribs: { [key: string]: string }) {
        this._func("onopentag", name, attribs);
    }
    onclosetag(name: string) {
        this._func("onclosetag", name);
    }

    // Content events
    ontext(text: string) {
        this._func("ontext", text);
    }
    oncomment(comment: string) {
        this._func("oncomment", comment);
    }
    oncommentend() {
        this._func("oncommentend");
    }
    oncdatastart() {
        this._func("oncdatastart");
    }
    oncdataend() {
        this._func("oncdataend");
    }
    onprocessinginstruction(name: string, value: string) {
        this._func("onprocessinginstruction", name, value);
    }
}

View File

@ -0,0 +1,98 @@
import { Parser, Tokenizer } from ".";
// Tests for the public Parser API: callback-less use, position tracking and
// custom tokenizers.
describe("API", () => {
// Drives a parser that was created without any callbacks through writes after
// end, reset, callback removal and pause/resume. The steps depend on each
// other, so their order matters.
test("should work without callbacks", () => {
const p = new Parser(null, {
xmlMode: true,
lowerCaseAttributeNames: true
});
p.end("<a foo><bar></a><!-- --><![CDATA[]]]><?foo?><!bar><boo/>boohay");
p.write("foo");
// Check that writing to / ending an already ended parser reports an error
p.end();
let err = false;
p._cbs.onerror = () => (err = true);
p.write("foo");
expect(err).toBeTruthy();
err = false;
p.end();
expect(err).toBeTruthy();
p.reset();
// Remove a callback while a tag is still being parsed
p._cbs.onopentag = () => {};
p.write("<a foo");
delete p._cbs.onopentag;
p.write(">");
// Pause/resume: text written while paused must only be delivered on resume
let processed = false;
p._cbs.ontext = t => {
expect(t).toBe("foo");
processed = true;
};
p.pause();
p.write("foo");
expect(processed).toBeFalsy();
p.resume();
expect(processed).toBeTruthy();
processed = false;
// Pausing and resuming with nothing pending must not emit anything
p.pause();
expect(processed).toBeFalsy();
p.resume();
expect(processed).toBeFalsy();
// Ending while paused defers the remaining text until resume
p.pause();
p.end("foo");
expect(processed).toBeFalsy();
p.resume();
expect(processed).toBeTruthy();
});
// startIndex/endIndex are expected to track the most recent token.
test("should update the position", () => {
const p = new Parser(null);
p.write("foo");
expect(p.startIndex).toBe(0);
expect(p.endIndex).toBe(2);
p.write("<bar>");
expect(p.startIndex).toBe(3);
expect(p.endIndex).toBe(7);
});
test("should update the position when a single tag is spread across multiple chunks", () => {
const p = new Parser(null);
p.write("<div ");
p.write("foo=bar>");
expect(p.startIndex).toBe(0);
expect(p.endIndex).toBe(12);
});
test("should parse <__proto__>", () => {
const p = new Parser(null);
// Should not throw (see #387)
p.write("<__proto__>");
});
// The `Tokenizer` option must be used to construct the parser's tokenizer.
test("should support custom tokenizer", () => {
class CustomTokenizer extends Tokenizer {}
const p = new Parser(
{
onparserinit(parser: Parser) {
expect(parser._tokenizer).toBeInstanceOf(CustomTokenizer);
}
},
{ Tokenizer: CustomTokenizer }
);
p.done();
});
});

View File

@ -0,0 +1,473 @@
import Tokenizer from "./Tokenizer";
import { EventEmitter } from "events";
// Shared set used as the `openImpliesClose` value for form-related tags.
const formTags = new Set([
"input",
"option",
"optgroup",
"select",
"button",
"datalist",
"textarea"
]);
// Shared set used as the `openImpliesClose` value for tags that close an open <p>.
const pTag = new Set(["p"]);
// Maps a tag name to the set of tag names that are implicitly closed when that
// tag is opened. `Parser.onopentagname` consults it (outside of xmlMode) and
// closes elements from the top of the stack for as long as the top element is
// in the set.
const openImpliesClose = {
tr: new Set(["tr", "th", "td"]),
th: new Set(["th"]),
td: new Set(["thead", "th", "td"]),
body: new Set(["head", "link", "script"]),
li: new Set(["li"]),
p: pTag,
h1: pTag,
h2: pTag,
h3: pTag,
h4: pTag,
h5: pTag,
h6: pTag,
select: formTags,
input: formTags,
output: formTags,
button: formTags,
datalist: formTags,
textarea: formTags,
option: new Set(["option"]),
optgroup: new Set(["optgroup", "option"]),
dd: new Set(["dt", "dd"]),
dt: new Set(["dt", "dd"]),
address: pTag,
article: pTag,
aside: pTag,
blockquote: pTag,
details: pTag,
div: pTag,
dl: pTag,
fieldset: pTag,
figcaption: pTag,
figure: pTag,
footer: pTag,
form: pTag,
header: pTag,
hr: pTag,
main: pTag,
nav: pTag,
ol: pTag,
pre: pTag,
section: pTag,
table: pTag,
ul: pTag,
rt: new Set(["rt", "rp"]),
rp: new Set(["rt", "rp"]),
tbody: new Set(["thead", "tbody"]),
tfoot: new Set(["thead", "tbody"])
};
// Tags that are never pushed onto the parser's element stack outside of
// xmlMode; `Parser.onopentagend` emits their close event right after they open.
const voidElements = new Set([
"area",
"base",
"basefont",
"br",
"col",
"command",
"embed",
"frame",
"hr",
"img",
"input",
"isindex",
"keygen",
"link",
"meta",
"param",
"source",
"track",
"wbr"
]);
// Opening one of these pushes `true` onto `Parser._foreignContext`; while
// `true` is on top, self-closing tags are honoured (see `onselfclosingtag`).
const foreignContextElements = new Set(["math", "svg"]);
// Opening one of these pushes `false` onto `Parser._foreignContext`.
const htmlIntegrationElements = new Set([
"mi",
"mo",
"mn",
"ms",
"mtext",
"annotation-xml",
"foreignObject",
"desc",
"title"
]);
/** Options accepted by `Parser` (and passed on to its tokenizer). */
export interface ParserOptions {
/**
 * Indicates whether special tags (<script> and <style>) should get special treatment
 * and if "empty" tags (eg. <br>) can have children. If `false`, the content of special tags
 * will be text only. For feeds and other XML content (documents that don't consist of HTML),
 * set this to `true`. Default: `false`.
 */
xmlMode?: boolean;
/**
 * If set to true, entities within the document will be decoded. Defaults to `false`.
 */
decodeEntities?: boolean;
/**
 * If set to true, all tags will be lowercased. If xmlMode is disabled, this defaults to `true`.
 */
lowerCaseTags?: boolean;
/**
 * If set to `true`, all attribute names will be lowercased. This has noticeable impact on speed.
 * If xmlMode is disabled, this defaults to `true`; in xmlMode it defaults to `false`.
 */
lowerCaseAttributeNames?: boolean;
/**
 * If set to true, CDATA sections will be recognized as text even if the xmlMode option is not enabled.
 * NOTE: If xmlMode is set to `true` then CDATA sections will always be recognized as text.
 */
recognizeCDATA?: boolean;
/**
 * If set to `true`, self-closing tags will trigger the onclosetag event even if xmlMode is not set to `true`.
 * NOTE: If xmlMode is set to `true` then self-closing tags will always be recognized.
 */
recognizeSelfClosing?: boolean;
/**
 * Allows the default tokenizer to be overwritten.
 */
Tokenizer?: typeof Tokenizer;
}
/**
 * Callbacks a `Parser` invokes while parsing. The parser accepts a partial
 * implementation and skips callbacks that are not provided.
 */
export interface Handler {
/** Called with the parser at the end of its constructor and again at the end of `reset()`. */
onparserinit(parser: Parser): void;
/**
 * Resets the handler back to starting state
 */
onreset(): void;
/**
 * Signals the handler that parsing is done
 */
onend(): void;
/** Receives errors forwarded by the parser. */
onerror(error: Error): void;
/** Called when an element is closed. */
onclosetag(name: string): void;
/** Called as soon as the name of an opening tag is known, before its attributes. */
onopentagname(name: string): void;
/** Called once for each attribute when it ends. */
onattribute(name: string, value: string): void;
/** Called when an opening tag ends, with the attributes collected for it. */
onopentag(name: string, attribs: { [s: string]: string }): void;
/** Called with text content (including the content of recognized CDATA sections). */
ontext(data: string): void;
/** Called with the content of a comment. */
oncomment(data: string): void;
/** Called before the text of a recognized CDATA section is delivered. */
oncdatastart(): void;
/** Called after the text of a recognized CDATA section was delivered. */
oncdataend(): void;
/** Called immediately after `oncomment`. */
oncommentend(): void;
/**
 * Called for declarations and processing instructions. `name` and `data` are
 * prefixed with `!` (declarations) or `?` (processing instructions).
 */
onprocessinginstruction(name: string, data: string): void;
}
/** Matches the first whitespace or `/`, i.e. the end of a declaration / instruction name. */
const reNameEnd = /\s|\//;
/**
 * Event-driven parser. It receives low-level events from a tokenizer,
 * maintains the stack of open elements and forwards higher-level events to
 * the (possibly partial) `Handler` given to the constructor.
 *
 * This copy intentionally differs from a regular HTML parser in two ways
 * (see `onclosetag` and `onend`): closing tags without a matching opening
 * tag can still be emitted, and open elements are not auto-closed at the end,
 * because the input may be an HTML fragment.
 */
export class Parser extends EventEmitter {
    /** Name of the tag whose opening is currently being processed. */
    _tagname = "";
    /** Name of the attribute currently being read. */
    _attribname = "";
    /** Value accumulated so far for the attribute currently being read. */
    _attribvalue = "";
    /** Attributes of the current opening tag; only allocated when `onopentag` is handled. */
    _attribs: null | { [key: string]: string } = null;
    /** Names of the currently open elements, innermost last. */
    _stack: string[] = [];
    /** `true` entries are pushed for math/svg, `false` for HTML integration elements. */
    _foreignContext: boolean[] = [];
    /** Start position of the most recent token. */
    startIndex = 0;
    /** End position of the most recent token, or `null` before the first token. */
    endIndex: number | null = null;
    _cbs: Partial<Handler>;
    _options: ParserOptions;
    _lowerCaseTagNames: boolean;
    _lowerCaseAttributeNames: boolean;
    _tokenizer: Tokenizer;

    /**
     * @param cbs Callbacks to invoke; `null` means no callbacks.
     * @param options Parser options; also handed to the tokenizer.
     */
    constructor(cbs: Partial<Handler> | null, options?: ParserOptions) {
        super();
        this._options = options || {};
        this._cbs = cbs || {};
        // Both lower-casing switches default to "on" unless xmlMode is set.
        this._lowerCaseTagNames =
            "lowerCaseTags" in this._options
                ? !!this._options.lowerCaseTags
                : !this._options.xmlMode;
        this._lowerCaseAttributeNames =
            "lowerCaseAttributeNames" in this._options
                ? !!this._options.lowerCaseAttributeNames
                : !this._options.xmlMode;
        this._tokenizer = new (this._options.Tokenizer || Tokenizer)(
            this._options,
            this
        );
        if (this._cbs.onparserinit) this._cbs.onparserinit(this);
    }

    /**
     * Updates `startIndex` / `endIndex` for the token that was just read.
     *
     * @param initialOffset Offset subtracted from the tokenizer's section
     *     start when computing the very first start index.
     */
    _updatePosition(initialOffset: number) {
        if (this.endIndex === null) {
            if (this._tokenizer._sectionStart <= initialOffset) {
                this.startIndex = 0;
            } else {
                this.startIndex = this._tokenizer._sectionStart - initialOffset;
            }
        } else this.startIndex = this.endIndex + 1;
        this.endIndex = this._tokenizer.getAbsoluteIndex();
    }

    //Tokenizer event handlers

    /** Forwards text to `ontext`; the end index is moved back by one. */
    ontext(data: string) {
        this._updatePosition(1);
        // @ts-ignore
        this.endIndex--;
        if (this._cbs.ontext) this._cbs.ontext(data);
    }

    /**
     * Handles the name of an opening tag: closes elements implied closed by
     * it (HTML mode only), pushes it on the stack unless it is a void
     * element, and emits `onopentagname`.
     */
    onopentagname(name: string) {
        if (this._lowerCaseTagNames) {
            name = name.toLowerCase();
        }
        this._tagname = name;
        if (
            !this._options.xmlMode &&
            Object.prototype.hasOwnProperty.call(openImpliesClose, name)
        ) {
            // Keep closing the innermost element for as long as opening
            // `name` implies that it is closed.
            for (
                let el;
                // @ts-ignore
                openImpliesClose[name].has(
                    (el = this._stack[this._stack.length - 1])
                );
                this.onclosetag(el)
            );
        }
        if (this._options.xmlMode || !voidElements.has(name)) {
            this._stack.push(name);
            if (foreignContextElements.has(name)) {
                this._foreignContext.push(true);
            } else if (htmlIntegrationElements.has(name)) {
                this._foreignContext.push(false);
            }
        }
        if (this._cbs.onopentagname) this._cbs.onopentagname(name);
        // Attributes are only collected when somebody listens for them.
        if (this._cbs.onopentag) this._attribs = {};
    }

    /**
     * Handles the end of an opening tag: emits `onopentag` with the collected
     * attributes and, in HTML mode, immediately closes void elements.
     */
    onopentagend() {
        this._updatePosition(1);
        if (this._attribs) {
            if (this._cbs.onopentag) {
                this._cbs.onopentag(this._tagname, this._attribs);
            }
            this._attribs = null;
        }
        if (
            !this._options.xmlMode &&
            this._cbs.onclosetag &&
            voidElements.has(this._tagname)
        ) {
            this._cbs.onclosetag(this._tagname);
        }
        this._tagname = "";
    }

    /** Handles a closing tag named `name`. */
    onclosetag(name: string) {
        // When this is true, the onclosetag event will always be emitted
        // for closing tags (eg </div>) even if that tag was not previously
        // open. This is needed because we reconstruct the HTML based on
        // fragments that don't necessarily contain the opening tag.
        // Without this patch, onopentagname would not be emitted, and
        // so the closing tag would disappear from the output.
        const alwaysClose = true;
        this._updatePosition(1);
        if (this._lowerCaseTagNames) {
            name = name.toLowerCase();
        }
        if (
            foreignContextElements.has(name) ||
            htmlIntegrationElements.has(name)
        ) {
            this._foreignContext.pop();
        }
        if (
            this._stack.length &&
            (this._options.xmlMode || !voidElements.has(name))
        ) {
            let pos = this._stack.lastIndexOf(name);
            if (pos !== -1) {
                if (this._cbs.onclosetag) {
                    // Close every element from the innermost one down to
                    // (and including) the matching one.
                    pos = this._stack.length - pos;
                    // @ts-ignore
                    while (pos--) this._cbs.onclosetag(this._stack.pop());
                } else this._stack.length = pos;
            } else if (name === "p" && !this._options.xmlMode) {
                // A stray </p> is treated as an empty <p></p>.
                this.onopentagname(name);
                this._closeCurrentTag();
            } else if (!this._stack.length && alwaysClose) {
                // NOTE(review): this branch cannot run, because the enclosing
                // `if` requires a non-empty stack. Unmatched closing tags are
                // therefore only emitted when the stack is empty (last branch
                // below). Confirm whether that is the intended behavior.
                if (this._cbs.onclosetag) this._cbs.onclosetag(name);
            }
        } else if (!this._options.xmlMode && (name === "br" || name === "p")) {
            this.onopentagname(name);
            this._closeCurrentTag();
        } else if (!this._stack.length && alwaysClose && this._cbs.onclosetag) {
            this._cbs.onclosetag(name);
        }
    }

    /**
     * Handles `/>`. The tag is closed right away in xmlMode, when
     * `recognizeSelfClosing` is set, or inside a foreign context; otherwise
     * it is treated like a normal opening tag.
     */
    onselfclosingtag() {
        if (
            this._options.xmlMode ||
            this._options.recognizeSelfClosing ||
            this._foreignContext[this._foreignContext.length - 1]
        ) {
            this._closeCurrentTag();
        } else {
            this.onopentagend();
        }
    }

    /** Finishes the current opening tag and closes it if it is on top of the stack. */
    _closeCurrentTag() {
        const name = this._tagname;
        this.onopentagend();
        //self-closing tags will be on the top of the stack
        //(cheaper check than in onclosetag)
        if (this._stack[this._stack.length - 1] === name) {
            if (this._cbs.onclosetag) {
                this._cbs.onclosetag(name);
            }
            this._stack.pop();
        }
    }

    /** Stores the (optionally lower-cased) name of the attribute being read. */
    onattribname(name: string) {
        if (this._lowerCaseAttributeNames) {
            name = name.toLowerCase();
        }
        this._attribname = name;
    }

    /** Appends a chunk to the value of the attribute being read. */
    onattribdata(value: string) {
        this._attribvalue += value;
    }

    /**
     * Emits `onattribute` and stores the attribute for `onopentag`. When an
     * attribute name repeats, the first value is kept.
     */
    onattribend() {
        if (this._cbs.onattribute)
            this._cbs.onattribute(this._attribname, this._attribvalue);
        if (
            this._attribs &&
            !Object.prototype.hasOwnProperty.call(
                this._attribs,
                this._attribname
            )
        ) {
            this._attribs[this._attribname] = this._attribvalue;
        }
        this._attribname = "";
        this._attribvalue = "";
    }

    /** Extracts the (optionally lower-cased) name from a declaration / instruction body. */
    _getInstructionName(value: string) {
        const idx = value.search(reNameEnd);
        let name = idx < 0 ? value : value.substr(0, idx);
        if (this._lowerCaseTagNames) {
            name = name.toLowerCase();
        }
        return name;
    }

    /** Forwards a `<!…>` declaration as a processing instruction prefixed with `!`. */
    ondeclaration(value: string) {
        if (this._cbs.onprocessinginstruction) {
            const name = this._getInstructionName(value);
            this._cbs.onprocessinginstruction(`!${name}`, `!${value}`);
        }
    }

    /** Forwards a `<?…>` processing instruction prefixed with `?`. */
    onprocessinginstruction(value: string) {
        if (this._cbs.onprocessinginstruction) {
            const name = this._getInstructionName(value);
            this._cbs.onprocessinginstruction(`?${name}`, `?${value}`);
        }
    }

    /** Emits `oncomment` followed by `oncommentend`. */
    oncomment(value: string) {
        this._updatePosition(4);
        if (this._cbs.oncomment) this._cbs.oncomment(value);
        if (this._cbs.oncommentend) this._cbs.oncommentend();
    }

    /**
     * Emits a CDATA section as text (wrapped in `oncdatastart` /
     * `oncdataend`) when xmlMode or `recognizeCDATA` is set; otherwise the
     * section is reported as a comment.
     */
    oncdata(value: string) {
        this._updatePosition(1);
        if (this._options.xmlMode || this._options.recognizeCDATA) {
            if (this._cbs.oncdatastart) this._cbs.oncdatastart();
            if (this._cbs.ontext) this._cbs.ontext(value);
            if (this._cbs.oncdataend) this._cbs.oncdataend();
        } else {
            this.oncomment(`[CDATA[${value}]]`);
        }
    }

    /** Forwards an error to `onerror`. */
    onerror(err: Error) {
        if (this._cbs.onerror) this._cbs.onerror(err);
    }

    /** Signals the end of the input to the handler. */
    onend() {
        // Open elements are deliberately NOT auto-closed here. Since we deal
        // with fragments that maybe contain the opening tag but not the
        // closing one, we don't want that closing tag to be auto-added.
        if (this._cbs.onend) this._cbs.onend();
    }

    //Resets the parser to a blank state, ready to parse a new HTML document
    reset() {
        if (this._cbs.onreset) this._cbs.onreset();
        this._tokenizer.reset();
        this._tagname = "";
        this._attribname = "";
        // Restore everything the constructor initializes, so that no
        // attribute value, foreign context or position leaks into the next
        // document.
        this._attribvalue = "";
        this._attribs = null;
        this._stack = [];
        this._foreignContext = [];
        this.startIndex = 0;
        this.endIndex = null;
        if (this._cbs.onparserinit) this._cbs.onparserinit(this);
    }

    //Parses a complete HTML document and pushes it to the handler
    parseComplete(data: string) {
        this.reset();
        this.end(data);
    }

    /** Feeds a chunk of input to the tokenizer. */
    write(chunk: string) {
        this._tokenizer.write(chunk);
    }

    /** Ends the input, optionally feeding a final chunk first. */
    end(chunk?: string) {
        this._tokenizer.end(chunk);
    }

    /** Pauses the tokenizer. */
    pause() {
        this._tokenizer.pause();
    }

    /** Resumes the tokenizer. */
    resume() {
        this._tokenizer.resume();
    }

    // Aliases for backwards compatibility
    parseChunk = Parser.prototype.write;
    done = Parser.prototype.end;
}

View File

@ -0,0 +1,906 @@
import decodeCodePoint from "entities/lib/decode_codepoint";
import entityMap from "entities/lib/maps/entities.json";
import legacyMap from "entities/lib/maps/legacy.json";
import xmlMap from "entities/lib/maps/xml.json";
/**
 * All the states the tokenizer can be in.
 *
 * Values are assigned sequentially starting at 1, so the order of the members
 * matters. Trailing comments name the character associated with a state.
 */
const enum State {
Text = 1,
BeforeTagName, //after <
InTagName,
InSelfClosingTag,
BeforeClosingTagName,
InClosingTagName,
AfterClosingTagName,
//attributes
BeforeAttributeName,
InAttributeName,
AfterAttributeName,
BeforeAttributeValue,
InAttributeValueDq, // "
InAttributeValueSq, // '
InAttributeValueNq,
//declarations
BeforeDeclaration, // !
InDeclaration,
//processing instructions
InProcessingInstruction, // ?
//comments
BeforeComment,
InComment,
AfterComment1,
AfterComment2,
//cdata
BeforeCdata1, // [
BeforeCdata2, // C
BeforeCdata3, // D
BeforeCdata4, // A
BeforeCdata5, // T
BeforeCdata6, // A
InCdata, // [
AfterCdata1, // ]
AfterCdata2, // ]
//special tags (<script> and <style>)
BeforeSpecial, //S
BeforeSpecialEnd, //S
BeforeScript1, //C
BeforeScript2, //R
BeforeScript3, //I
BeforeScript4, //P
BeforeScript5, //T
AfterScript1, //C
AfterScript2, //R
AfterScript3, //I
AfterScript4, //P
AfterScript5, //T
BeforeStyle1, //T
BeforeStyle2, //Y
BeforeStyle3, //L
BeforeStyle4, //E
AfterStyle1, //T
AfterStyle2, //Y
AfterStyle3, //L
AfterStyle4, //E
//entities
BeforeEntity, //&
BeforeNumericEntity, //#
InNamedEntity,
InNumericEntity,
InHexEntity //X
}
/**
 * Which special tag, if any, the tokenizer is currently inside of. The
 * tokenizer sets `Script` / `Style` once a `script` / `style` tag name has
 * been read and goes back to `None` at the matching closing tag.
 */
const enum Special {
None = 1,
Script,
Style
}
/**
 * Tells whether `c` is one of the characters the tokenizer treats as
 * whitespace: space, line feed, tab, form feed or carriage return.
 *
 * @param c The character to test.
 */
function whitespace(c: string): boolean {
    switch (c) {
        case " ":
        case "\n":
        case "\t":
        case "\f":
        case "\r":
            return true;
        default:
            return false;
    }
}
/** Events the tokenizer reports to its owner (the `Parser`). */
interface Callbacks {
// A chunk of the current attribute's value.
onattribdata(value: string): void; //TODO implement the new event
// The current attribute is complete.
onattribend(): void;
// The name of a new attribute.
onattribname(name: string): void;
// The content of a CDATA section.
oncdata(data: string): void;
// The name of a closing tag.
onclosetag(name: string): void;
// The content of a comment.
oncomment(data: string): void;
// The content of a `<!…>` declaration.
ondeclaration(content: string): void;
onend(): void;
onerror(error: Error, state?: State): void;
// The `>` that ends an opening tag was read.
onopentagend(): void;
// The name of an opening tag.
onopentagname(name: string): void;
// The content of a `<?…>` processing instruction.
onprocessinginstruction(instruction: string): void;
// The `/>` that ends a self-closing tag was read.
onselfclosingtag(): void;
// A section of text.
ontext(value: string): void;
}
/**
 * Builds a state handler that expects one specific character.
 *
 * The returned handler switches the tokenizer to `SUCCESS` when the character
 * equals `upper` or its lower-case form. Otherwise it switches to `FAILURE`
 * and steps `_index` back by one so the character is consumed again.
 *
 * @param upper The expected character (upper-case form for letters).
 * @param SUCCESS State to enter when the character matches.
 * @param FAILURE State to enter when it does not match.
 */
function ifElseState(upper: string, SUCCESS: State, FAILURE: State) {
    const lower = upper.toLowerCase();
    return (t: Tokenizer, c: string) => {
        const matched = c === lower || c === upper;
        t._state = matched ? SUCCESS : FAILURE;
        if (!matched) {
            t._index--;
        }
    };
}
/**
 * Builds a state handler for one character of a special tag name
 * (`script` / `style`).
 *
 * The returned handler advances to `NEXT_STATE` when the character equals
 * `upper` or its lower-case form. Otherwise it falls back to
 * `State.InTagName` and steps `_index` back by one so the character is
 * consumed again.
 *
 * @param upper The expected character, in upper case.
 * @param NEXT_STATE State to enter when the character matches.
 */
function consumeSpecialNameChar(upper: string, NEXT_STATE: State) {
    const lower = upper.toLowerCase();
    return (t: Tokenizer, c: string) => {
        const matched = c === lower || c === upper;
        if (matched) {
            t._state = NEXT_STATE;
            return;
        }
        t._state = State.InTagName;
        t._index--; //consume the token again
    };
}
// Pre-built single-character state handlers.
//
// CDATA prefix: each handler matches the next letter of "CDATA" (either case)
// and falls back to InDeclaration otherwise.
const stateBeforeCdata1 = ifElseState(
"C",
State.BeforeCdata2,
State.InDeclaration
);
const stateBeforeCdata2 = ifElseState(
"D",
State.BeforeCdata3,
State.InDeclaration
);
const stateBeforeCdata3 = ifElseState(
"A",
State.BeforeCdata4,
State.InDeclaration
);
const stateBeforeCdata4 = ifElseState(
"T",
State.BeforeCdata5,
State.InDeclaration
);
const stateBeforeCdata5 = ifElseState(
"A",
State.BeforeCdata6,
State.InDeclaration
);
// Opening <script>: a mismatch falls back to a regular tag name.
const stateBeforeScript1 = consumeSpecialNameChar("R", State.BeforeScript2);
const stateBeforeScript2 = consumeSpecialNameChar("I", State.BeforeScript3);
const stateBeforeScript3 = consumeSpecialNameChar("P", State.BeforeScript4);
const stateBeforeScript4 = consumeSpecialNameChar("T", State.BeforeScript5);
// Closing </script>: a mismatch falls back to text.
const stateAfterScript1 = ifElseState("R", State.AfterScript2, State.Text);
const stateAfterScript2 = ifElseState("I", State.AfterScript3, State.Text);
const stateAfterScript3 = ifElseState("P", State.AfterScript4, State.Text);
const stateAfterScript4 = ifElseState("T", State.AfterScript5, State.Text);
// Opening <style>: a mismatch falls back to a regular tag name.
const stateBeforeStyle1 = consumeSpecialNameChar("Y", State.BeforeStyle2);
const stateBeforeStyle2 = consumeSpecialNameChar("L", State.BeforeStyle3);
const stateBeforeStyle3 = consumeSpecialNameChar("E", State.BeforeStyle4);
// Closing </style>: a mismatch falls back to text.
const stateAfterStyle1 = ifElseState("Y", State.AfterStyle2, State.Text);
const stateAfterStyle2 = ifElseState("L", State.AfterStyle3, State.Text);
const stateAfterStyle3 = ifElseState("E", State.AfterStyle4, State.Text);
// Entities: "#" starts a numeric entity, anything else a named entity.
const stateBeforeEntity = ifElseState(
"#",
State.BeforeNumericEntity,
State.InNamedEntity
);
// Numeric entities: "x"/"X" starts a hexadecimal entity.
const stateBeforeNumericEntity = ifElseState(
"X",
State.InHexEntity,
State.InNumericEntity
);
export default class Tokenizer {
/** The current state the tokenizer is in. */
_state = State.Text;
/** The read buffer. */
_buffer = "";
/** The beginning of the section that is currently being read. */
_sectionStart = 0;
/** The index within the buffer that we are currently looking at. */
_index = 0;
/**
* Data that has already been processed will be removed from the buffer occasionally.
* `_bufferOffset` keeps track of how many characters have been removed, to make sure position information is accurate.
*/
_bufferOffset = 0;
/** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */
_baseState = State.Text;
/** For special parsing behavior inside of script and style tags. */
_special = Special.None;
/** Indicates whether the tokenizer has been paused. */
_running = true;
/** Indicates whether the tokenizer has finished running / `.end` has been called. */
_ended = false;
_cbs: Callbacks;
_xmlMode: boolean;
_decodeEntities: boolean;
/**
 * @param options Tokenizer options; `null` means all options are off.
 * @param cbs Receiver of the tokenizer's events.
 */
constructor(
    options: { xmlMode?: boolean; decodeEntities?: boolean } | null,
    cbs: Callbacks
) {
    this._cbs = cbs;
    // Both flags are coerced to booleans; a missing options object disables them.
    this._xmlMode = Boolean(options && options.xmlMode);
    this._decodeEntities = Boolean(options && options.decodeEntities);
}
/** Puts the tokenizer back into its initial state, with an empty buffer. */
reset() {
    // State machine
    this._state = State.Text;
    this._baseState = State.Text;
    this._special = Special.None;
    // Buffer and positions
    this._buffer = "";
    this._sectionStart = 0;
    this._index = 0;
    this._bufferOffset = 0;
    // Run flags
    this._running = true;
    this._ended = false;
}
/**
 * Text state. A `<` starts a tag; an `&` starts an entity when entity
 * decoding is enabled and no special tag is open. In both cases the text read
 * so far (if any) is emitted first and a new section starts at the current
 * index.
 */
_stateText(c: string) {
    const startsTag = c === "<";
    const startsEntity =
        !startsTag &&
        this._decodeEntities &&
        this._special === Special.None &&
        c === "&";
    if (!startsTag && !startsEntity) {
        return;
    }
    if (this._index > this._sectionStart) {
        this._cbs.ontext(this._getSection());
    }
    if (startsTag) {
        this._state = State.BeforeTagName;
    } else {
        this._baseState = State.Text;
        this._state = State.BeforeEntity;
    }
    this._sectionStart = this._index;
}
/** Handles the character that follows a `<`. */
_stateBeforeTagName(c: string) {
    if (c === "/") {
        this._state = State.BeforeClosingTagName;
        return;
    }
    if (c === "<") {
        // Another "<": what was read so far is text; restart here.
        this._cbs.ontext(this._getSection());
        this._sectionStart = this._index;
        return;
    }
    if (c === ">" || this._special !== Special.None || whitespace(c)) {
        // Not a tag after all.
        this._state = State.Text;
        return;
    }
    if (c === "!") {
        this._state = State.BeforeDeclaration;
        this._sectionStart = this._index + 1;
        return;
    }
    if (c === "?") {
        this._state = State.InProcessingInstruction;
        this._sectionStart = this._index + 1;
        return;
    }
    // Outside xmlMode a leading "s" may start <script> or <style>.
    const maybeSpecial = !this._xmlMode && (c === "s" || c === "S");
    this._state = maybeSpecial ? State.BeforeSpecial : State.InTagName;
    this._sectionStart = this._index;
}
/**
 * Reads a tag name. The name ends at `/`, `>` or whitespace; that character
 * is then consumed again in the BeforeAttributeName state.
 */
_stateInTagName(c: string) {
    const endsName = c === "/" || c === ">" || whitespace(c);
    if (!endsName) {
        return;
    }
    this._emitToken("onopentagname");
    this._state = State.BeforeAttributeName;
    this._index--;
}
/** Handles the character that follows `</`; whitespace is skipped. */
_stateBeforeClosingTagName(c: string) {
    if (whitespace(c)) {
        return; // ignore
    }
    if (c === ">") {
        this._state = State.Text;
        return;
    }
    if (this._special === Special.None) {
        this._state = State.InClosingTagName;
        this._sectionStart = this._index;
        return;
    }
    // Inside a special tag only the matching closing tag is recognized.
    if (c === "s" || c === "S") {
        this._state = State.BeforeSpecialEnd;
    } else {
        this._state = State.Text;
        this._index--;
    }
}
/**
 * Reads the name of a closing tag. The name ends at `>` or whitespace; that
 * character is then consumed again in the AfterClosingTagName state.
 */
_stateInClosingTagName(c: string) {
    const endsName = c === ">" || whitespace(c);
    if (!endsName) {
        return;
    }
    this._emitToken("onclosetag");
    this._state = State.AfterClosingTagName;
    this._index--;
}
/** Skips everything after a closing tag's name until its `>`. */
_stateAfterClosingTagName(c: string) {
    if (c !== ">") {
        return;
    }
    this._state = State.Text;
    this._sectionStart = this._index + 1;
}
/**
 * Inside an opening tag, between attributes. `>` ends the tag, `/` may start
 * a self-closing end, any other non-whitespace character starts an attribute
 * name.
 */
_stateBeforeAttributeName(c: string) {
    switch (c) {
        case ">":
            this._cbs.onopentagend();
            this._state = State.Text;
            this._sectionStart = this._index + 1;
            break;
        case "/":
            this._state = State.InSelfClosingTag;
            break;
        default:
            if (!whitespace(c)) {
                this._state = State.InAttributeName;
                this._sectionStart = this._index;
            }
    }
}
/**
 * Handles the character after a `/` inside an opening tag. `>` completes the
 * self-closing tag; any other non-whitespace character is consumed again as
 * the start of an attribute.
 */
_stateInSelfClosingTag(c: string) {
    if (c === ">") {
        this._cbs.onselfclosingtag();
        this._state = State.Text;
        this._sectionStart = this._index + 1;
        return;
    }
    if (whitespace(c)) {
        return;
    }
    this._state = State.BeforeAttributeName;
    this._index--;
}
/**
 * Reads an attribute name. The name ends at `=`, `/`, `>` or whitespace; that
 * character is then consumed again in the AfterAttributeName state.
 */
_stateInAttributeName(c: string) {
    const endsName = c === "=" || c === "/" || c === ">" || whitespace(c);
    if (!endsName) {
        return;
    }
    this._cbs.onattribname(this._getSection());
    this._sectionStart = -1;
    this._state = State.AfterAttributeName;
    this._index--;
}
/**
 * Handles what follows an attribute name. `=` introduces a value and
 * whitespace is skipped. Anything else means the attribute has no value, so
 * it is ended before the character is processed further.
 */
_stateAfterAttributeName(c: string) {
    if (c === "=") {
        this._state = State.BeforeAttributeValue;
        return;
    }
    if (whitespace(c)) {
        return;
    }
    this._cbs.onattribend();
    if (c === "/" || c === ">") {
        // End of the tag: let BeforeAttributeName consume the character again.
        this._state = State.BeforeAttributeName;
        this._index--;
    } else {
        // Start of the next attribute name.
        this._state = State.InAttributeName;
        this._sectionStart = this._index;
    }
}
/**
 * Handles the character after `=`. A quote starts a quoted value (the section
 * starts after the quote); any other non-whitespace character starts an
 * unquoted value and is consumed again.
 */
_stateBeforeAttributeValue(c: string) {
    switch (c) {
        case '"':
            this._state = State.InAttributeValueDq;
            this._sectionStart = this._index + 1;
            break;
        case "'":
            this._state = State.InAttributeValueSq;
            this._sectionStart = this._index + 1;
            break;
        default:
            if (!whitespace(c)) {
                this._state = State.InAttributeValueNq;
                this._sectionStart = this._index;
                this._index--; //reconsume token
            }
    }
}
/**
 * Reads a double-quoted attribute value. The closing quote ends the
 * attribute; an `&` starts an entity when entity decoding is enabled.
 */
_stateInAttributeValueDoubleQuotes(c: string) {
    if (c === '"') {
        this._emitToken("onattribdata");
        this._cbs.onattribend();
        this._state = State.BeforeAttributeName;
        return;
    }
    if (this._decodeEntities && c === "&") {
        this._emitToken("onattribdata");
        // Remember where to come back to once the entity is done.
        this._baseState = this._state;
        this._state = State.BeforeEntity;
        this._sectionStart = this._index;
    }
}
/**
 * Reads a single-quoted attribute value. The closing quote ends the
 * attribute; an `&` starts an entity when entity decoding is enabled.
 */
_stateInAttributeValueSingleQuotes(c: string) {
    if (c === "'") {
        this._emitToken("onattribdata");
        this._cbs.onattribend();
        this._state = State.BeforeAttributeName;
        return;
    }
    if (this._decodeEntities && c === "&") {
        this._emitToken("onattribdata");
        // Remember where to come back to once the entity is done.
        this._baseState = this._state;
        this._state = State.BeforeEntity;
        this._sectionStart = this._index;
    }
}
/**
 * Reads an unquoted attribute value. Whitespace or `>` ends the attribute and
 * is consumed again; an `&` starts an entity when entity decoding is enabled.
 */
_stateInAttributeValueNoQuotes(c: string) {
    if (whitespace(c) || c === ">") {
        this._emitToken("onattribdata");
        this._cbs.onattribend();
        this._state = State.BeforeAttributeName;
        this._index--;
        return;
    }
    if (this._decodeEntities && c === "&") {
        this._emitToken("onattribdata");
        // Remember where to come back to once the entity is done.
        this._baseState = this._state;
        this._state = State.BeforeEntity;
        this._sectionStart = this._index;
    }
}
/**
 * Handles the character after `<!`: `[` may start CDATA, `-` may start a
 * comment, anything else is a declaration.
 */
_stateBeforeDeclaration(c: string) {
    switch (c) {
        case "[":
            this._state = State.BeforeCdata1;
            break;
        case "-":
            this._state = State.BeforeComment;
            break;
        default:
            this._state = State.InDeclaration;
    }
}
/** Reads a declaration up to its `>` and emits it. */
_stateInDeclaration(c: string) {
    if (c !== ">") {
        return;
    }
    this._cbs.ondeclaration(this._getSection());
    this._state = State.Text;
    this._sectionStart = this._index + 1;
}
/** Reads a processing instruction up to its `>` and emits it. */
_stateInProcessingInstruction(c: string) {
    if (c !== ">") {
        return;
    }
    this._cbs.onprocessinginstruction(this._getSection());
    this._state = State.Text;
    this._sectionStart = this._index + 1;
}
/**
 * Handles the character after `<!-`. A second `-` starts a comment whose
 * content begins after it; anything else makes this a declaration.
 */
_stateBeforeComment(c: string) {
    if (c !== "-") {
        this._state = State.InDeclaration;
        return;
    }
    this._state = State.InComment;
    this._sectionStart = this._index + 1;
}
/** Inside a comment: a `-` may be the start of the closing `-->`. */
_stateInComment(c: string) {
    if (c === "-") {
        this._state = State.AfterComment1;
    }
}
/** After one `-` inside a comment: a second `-` continues the closing sequence. */
_stateAfterComment1(c: string) {
    this._state = c === "-" ? State.AfterComment2 : State.InComment;
}
/**
 * After `--` inside a comment. `>` ends the comment, a further `-` keeps
 * waiting (`--->`), anything else goes back to the comment body.
 */
_stateAfterComment2(c: string) {
    if (c === "-") {
        return; // stay in AFTER_COMMENT_2 (`--->`)
    }
    if (c !== ">") {
        this._state = State.InComment;
        return;
    }
    //remove 2 trailing chars
    const content = this._buffer.substring(
        this._sectionStart,
        this._index - 2
    );
    this._cbs.oncomment(content);
    this._state = State.Text;
    this._sectionStart = this._index + 1;
}
/**
 * Handles the character after `<![CDATA`. A `[` starts the CDATA content;
 * anything else is consumed again as part of a declaration.
 */
_stateBeforeCdata6(c: string) {
    if (c !== "[") {
        this._state = State.InDeclaration;
        this._index--;
        return;
    }
    this._state = State.InCdata;
    this._sectionStart = this._index + 1;
}
/** Inside CDATA: a `]` may be the start of the closing `]]>`. */
_stateInCdata(c: string) {
    if (c === "]") {
        this._state = State.AfterCdata1;
    }
}
/** After one `]` inside CDATA: a second `]` continues the closing sequence. */
_stateAfterCdata1(c: string) {
    this._state = c === "]" ? State.AfterCdata2 : State.InCdata;
}
/**
 * After `]]` inside CDATA. `>` ends the section, a further `]` keeps waiting
 * (`]]]>`), anything else goes back to the CDATA body.
 */
_stateAfterCdata2(c: string) {
    if (c === "]") {
        return; // stay in AFTER_CDATA_2 (`]]]>`)
    }
    if (c !== ">") {
        this._state = State.InCdata;
        return;
    }
    //remove 2 trailing chars
    const content = this._buffer.substring(
        this._sectionStart,
        this._index - 2
    );
    this._cbs.oncdata(content);
    this._state = State.Text;
    this._sectionStart = this._index + 1;
}
/**
 * Chooses which special-tag prefix check to start: `c`/`C` begins the
 * script check, `t`/`T` begins the style check. Any other character means
 * an ordinary tag name, so the character is processed again in `InTagName`.
 *
 * @param c The character at the current index.
 */
_stateBeforeSpecial(c: string) {
    switch (c) {
        case "c":
        case "C":
            this._state = State.BeforeScript1;
            break;
        case "t":
        case "T":
            this._state = State.BeforeStyle1;
            break;
        default:
            this._state = State.InTagName;
            this._index--; // consume the token again
    }
}
/**
 * Starts matching the closing tag of the current special element: `c`/`C`
 * while in script mode, or `t`/`T` while in style mode, continues the
 * match; everything else is plain text.
 *
 * @param c The character at the current index.
 */
_stateBeforeSpecialEnd(c: string) {
    const inScript = this._special === Special.Script;
    const inStyle = this._special === Special.Style;
    if (inScript && (c === "c" || c === "C")) {
        this._state = State.AfterScript1;
        return;
    }
    if (inStyle && (c === "t" || c === "T")) {
        this._state = State.AfterStyle1;
        return;
    }
    this._state = State.Text;
}
/**
 * Final step of the script prefix check. If the name ends here (`/`, `>`
 * or whitespace) the tokenizer switches to script mode. In every case the
 * character is processed again as part of a regular tag name.
 *
 * @param c The character at the current index.
 */
_stateBeforeScript5(c: string) {
    const nameEnds = c === "/" || c === ">" || whitespace(c);
    if (nameEnds) {
        this._special = Special.Script;
    }
    this._state = State.InTagName;
    this._index--; // consume the token again
}
/**
 * Final step of matching a closing script tag. If the name ends here (`>`
 * or whitespace) script mode is left and the tag is handed to
 * `InClosingTagName`; otherwise the input is ordinary text.
 *
 * @param c The character at the current index.
 */
_stateAfterScript5(c: string) {
    const nameEnds = c === ">" || whitespace(c);
    if (!nameEnds) {
        this._state = State.Text;
        return;
    }
    this._special = Special.None;
    this._state = State.InClosingTagName;
    // Rewind the section start by 6 characters (the length of "script").
    this._sectionStart = this._index - 6;
    this._index--; // reconsume the token
}
/**
 * Final step of the style prefix check. If the name ends here (`/`, `>` or
 * whitespace) the tokenizer switches to style mode. In every case the
 * character is processed again as part of a regular tag name.
 *
 * @param c The character at the current index.
 */
_stateBeforeStyle4(c: string) {
    const nameEnds = c === "/" || c === ">" || whitespace(c);
    if (nameEnds) {
        this._special = Special.Style;
    }
    this._state = State.InTagName;
    this._index--; // consume the token again
}
/**
 * Final step of matching a closing style tag. If the name ends here (`>`
 * or whitespace) style mode is left and the tag is handed to
 * `InClosingTagName`; otherwise the input is ordinary text.
 *
 * @param c The character at the current index.
 */
_stateAfterStyle4(c: string) {
    const nameEnds = c === ">" || whitespace(c);
    if (!nameEnds) {
        this._state = State.Text;
        return;
    }
    this._special = Special.None;
    this._state = State.InClosingTagName;
    // Rewind the section start by 5 characters (the length of "style").
    this._sectionStart = this._index - 5;
    this._index--; // reconsume the token
}
/**
 * Decodes a named entity that was terminated with a semicolon.
 *
 * The name is the text between `_sectionStart + 1` and the current index.
 * It is looked up in `xmlMap` (xml mode) or `entityMap`; on a hit the
 * replacement is emitted and the section restarts after the current index.
 * Unknown or empty names leave the tokenizer state untouched.
 */
_parseNamedEntityStrict() {
    const nameStart = this._sectionStart + 1; // offset = 1
    if (nameStart >= this._index) return;

    const entity = this._buffer.substring(nameStart, this._index);
    const map = this._xmlMode ? xmlMap : entityMap;
    if (!Object.prototype.hasOwnProperty.call(map, entity)) return;

    // @ts-ignore
    this._emitPartial(map[entity]);
    this._sectionStart = this._index + 1;
}
/**
 * Decodes a legacy entity (one written without a trailing semicolon).
 *
 * Tries the longest candidate first, from at most 6 characters down to 2,
 * and emits the replacement for the first name found in `legacyMap`,
 * advancing `_sectionStart` past the `&` and the matched name.
 */
_parseLegacyEntity() {
    const start = this._sectionStart + 1;
    // The max length of legacy entities is 6
    const longest = Math.min(this._index - start, 6);
    // The min length of legacy entities is 2
    for (let length = longest; length >= 2; length--) {
        const entity = this._buffer.substr(start, length);
        if (Object.prototype.hasOwnProperty.call(legacyMap, entity)) {
            // @ts-ignore
            this._emitPartial(legacyMap[entity]);
            this._sectionStart += length + 1;
            return;
        }
    }
}
/**
 * Handles one character while reading the name of a named entity.
 *
 * - `;` ends the entity: a strict lookup is tried first and, outside xml
 *   mode, a legacy lookup if the strict one did not consume the section.
 * - ASCII letters and digits extend the name.
 * - Any other character ends the name without a semicolon. Outside xml
 *   mode, and only if at least one name character was read, text content
 *   gets a legacy lookup while attribute values get a strict lookup unless
 *   the terminating character is `=`. The character is then processed
 *   again in the base state.
 *
 * @param c The character at the current index.
 */
_stateInNamedEntity(c: string) {
    if (c === ";") {
        this._parseNamedEntityStrict();
        const stillPending = this._sectionStart + 1 < this._index;
        if (stillPending && !this._xmlMode) {
            this._parseLegacyEntity();
        }
        this._state = this._baseState;
        return;
    }

    const isAlphanumeric =
        (c >= "a" && c <= "z") ||
        (c >= "A" && c <= "Z") ||
        (c >= "0" && c <= "9");
    if (isAlphanumeric) return;

    const hasName = this._sectionStart + 1 !== this._index;
    if (!this._xmlMode && hasName) {
        if (this._baseState === State.Text) {
            this._parseLegacyEntity();
        } else if (c !== "=") {
            this._parseNamedEntityStrict();
        }
    }
    this._state = this._baseState;
    // Step back so the base state sees the terminating character.
    this._index--;
}
/**
 * Decodes the numeric entity in the current section and returns to the
 * base state.
 *
 * @param offset Number of characters between `_sectionStart` and the first
 *   digit (callers in this class pass 2 for decimal and 3 for hex).
 * @param base Radix handed to `parseInt` (10 or 16 from the callers here).
 *
 * If there are no digits at all, nothing is emitted and `_sectionStart` is
 * moved back by one instead.
 */
_decodeNumericEntity(offset: number, base: number) {
    const digitsStart = this._sectionStart + offset;
    if (digitsStart === this._index) {
        this._sectionStart--;
    } else {
        //parse entity
        const digits = this._buffer.substring(digitsStart, this._index);
        const codePoint = parseInt(digits, base);
        this._emitPartial(decodeCodePoint(codePoint));
        this._sectionStart = this._index;
    }
    this._state = this._baseState;
}
/**
 * Handles one character while reading a decimal numeric entity.
 *
 * `;` decodes the entity and skips the semicolon. Digits extend the number.
 * Any other character ends it: outside xml mode the digits read so far are
 * decoded, in xml mode the entity is abandoned; either way the character is
 * processed again in the base state.
 *
 * @param c The character at the current index.
 */
_stateInNumericEntity(c: string) {
    if (c === ";") {
        this._decodeNumericEntity(2, 10);
        this._sectionStart++;
        return;
    }

    const isDigit = c >= "0" && c <= "9";
    if (isDigit) return;

    if (this._xmlMode) {
        this._state = this._baseState;
    } else {
        this._decodeNumericEntity(2, 10);
    }
    this._index--;
}
/**
 * Handles one character while reading a hexadecimal numeric entity.
 *
 * `;` decodes the entity and skips the semicolon. Hex digits (`0-9`, `a-f`,
 * `A-F`) extend the number. Any other character ends it: outside xml mode
 * the digits read so far are decoded, in xml mode the entity is abandoned;
 * either way the character is processed again in the base state.
 *
 * @param c The character at the current index.
 */
_stateInHexEntity(c: string) {
    if (c === ";") {
        this._decodeNumericEntity(3, 16);
        this._sectionStart++;
        return;
    }

    const isHexDigit =
        (c >= "a" && c <= "f") ||
        (c >= "A" && c <= "F") ||
        (c >= "0" && c <= "9");
    if (isHexDigit) return;

    if (this._xmlMode) {
        this._state = this._baseState;
    } else {
        this._decodeNumericEntity(3, 16);
    }
    this._index--;
}
/**
 * Trims already-processed input from the buffer after a parse pass.
 *
 * - No open section (`_sectionStart < 0`): the whole buffer is dropped.
 * - Otherwise, only while running:
 *   - in the text state, pending text is emitted and the buffer dropped;
 *   - if the section starts at the current index, the buffer is dropped;
 *   - else only the part before the open section is removed.
 *   In all three cases the section then starts at position 0.
 *
 * `_bufferOffset` is advanced by the number of characters removed.
 */
_cleanup() {
    // Throws away the entire buffer, crediting it to the absolute offset.
    const discardBuffer = () => {
        this._buffer = "";
        this._bufferOffset += this._index;
        this._index = 0;
    };

    if (this._sectionStart < 0) {
        discardBuffer();
        return;
    }
    if (!this._running) return;

    if (this._state === State.Text) {
        if (this._sectionStart !== this._index) {
            this._cbs.ontext(this._buffer.slice(this._sectionStart));
        }
        discardBuffer();
    } else if (this._sectionStart === this._index) {
        //the section just started
        discardBuffer();
    } else {
        //remove everything unnecessary
        const consumed = this._sectionStart;
        this._buffer = this._buffer.slice(consumed);
        this._index -= consumed;
        this._bufferOffset += consumed;
    }
    this._sectionStart = 0;
}
//TODO make events conditional
/**
 * Appends `chunk` to the internal buffer and tokenizes as much of it as
 * possible.
 *
 * Writing after the tokenizer has been ended is reported through `onerror`
 * and the chunk is discarded, so no further events are emitted once the
 * input has been declared complete.
 *
 * @param chunk The next piece of input.
 */
write(chunk: string) {
    if (this._ended) {
        this._cbs.onerror(Error(".write() after done!"));
        // Do not parse data that arrives after the end of input.
        return;
    }
    this._buffer += chunk;
    this._parse();
}
// Iterates through the buffer, calling the function corresponding to the current state.
// States that are more likely to be hit are higher up, as a performance improvement.
//
// The loop runs until the buffer is exhausted or the tokenizer is paused
// (`_running` becomes false). Each iteration dispatches exactly one character
// to the handler for `_state`; handlers that need the same character to be
// processed again in a new state decrement `_index` before returning, which
// cancels the increment at the bottom of the loop. An unrecognised state is
// reported through `onerror`. `_cleanup` is always called once the loop exits.
_parse() {
while (this._index < this._buffer.length && this._running) {
const c = this._buffer.charAt(this._index);
if (this._state === State.Text) {
this._stateText(c);
} else if (this._state === State.InAttributeValueDq) {
this._stateInAttributeValueDoubleQuotes(c);
} else if (this._state === State.InAttributeName) {
this._stateInAttributeName(c);
} else if (this._state === State.InComment) {
this._stateInComment(c);
} else if (this._state === State.BeforeAttributeName) {
this._stateBeforeAttributeName(c);
} else if (this._state === State.InTagName) {
this._stateInTagName(c);
} else if (this._state === State.InClosingTagName) {
this._stateInClosingTagName(c);
} else if (this._state === State.BeforeTagName) {
this._stateBeforeTagName(c);
} else if (this._state === State.AfterAttributeName) {
this._stateAfterAttributeName(c);
} else if (this._state === State.InAttributeValueSq) {
this._stateInAttributeValueSingleQuotes(c);
} else if (this._state === State.BeforeAttributeValue) {
this._stateBeforeAttributeValue(c);
} else if (this._state === State.BeforeClosingTagName) {
this._stateBeforeClosingTagName(c);
} else if (this._state === State.AfterClosingTagName) {
this._stateAfterClosingTagName(c);
} else if (this._state === State.BeforeSpecial) {
this._stateBeforeSpecial(c);
} else if (this._state === State.AfterComment1) {
this._stateAfterComment1(c);
} else if (this._state === State.InAttributeValueNq) {
this._stateInAttributeValueNoQuotes(c);
} else if (this._state === State.InSelfClosingTag) {
this._stateInSelfClosingTag(c);
} else if (this._state === State.InDeclaration) {
this._stateInDeclaration(c);
} else if (this._state === State.BeforeDeclaration) {
this._stateBeforeDeclaration(c);
} else if (this._state === State.AfterComment2) {
this._stateAfterComment2(c);
} else if (this._state === State.BeforeComment) {
this._stateBeforeComment(c);
} else if (this._state === State.BeforeSpecialEnd) {
this._stateBeforeSpecialEnd(c);
} else if (this._state === State.AfterScript1) {
stateAfterScript1(this, c);
} else if (this._state === State.AfterScript2) {
stateAfterScript2(this, c);
} else if (this._state === State.AfterScript3) {
stateAfterScript3(this, c);
} else if (this._state === State.BeforeScript1) {
stateBeforeScript1(this, c);
} else if (this._state === State.BeforeScript2) {
stateBeforeScript2(this, c);
} else if (this._state === State.BeforeScript3) {
stateBeforeScript3(this, c);
} else if (this._state === State.BeforeScript4) {
stateBeforeScript4(this, c);
} else if (this._state === State.BeforeScript5) {
this._stateBeforeScript5(c);
} else if (this._state === State.AfterScript4) {
stateAfterScript4(this, c);
} else if (this._state === State.AfterScript5) {
this._stateAfterScript5(c);
} else if (this._state === State.BeforeStyle1) {
stateBeforeStyle1(this, c);
} else if (this._state === State.InCdata) {
this._stateInCdata(c);
} else if (this._state === State.BeforeStyle2) {
stateBeforeStyle2(this, c);
} else if (this._state === State.BeforeStyle3) {
stateBeforeStyle3(this, c);
} else if (this._state === State.BeforeStyle4) {
this._stateBeforeStyle4(c);
} else if (this._state === State.AfterStyle1) {
stateAfterStyle1(this, c);
} else if (this._state === State.AfterStyle2) {
stateAfterStyle2(this, c);
} else if (this._state === State.AfterStyle3) {
stateAfterStyle3(this, c);
} else if (this._state === State.AfterStyle4) {
this._stateAfterStyle4(c);
} else if (this._state === State.InProcessingInstruction) {
this._stateInProcessingInstruction(c);
} else if (this._state === State.InNamedEntity) {
this._stateInNamedEntity(c);
} else if (this._state === State.BeforeCdata1) {
stateBeforeCdata1(this, c);
} else if (this._state === State.BeforeEntity) {
stateBeforeEntity(this, c);
} else if (this._state === State.BeforeCdata2) {
stateBeforeCdata2(this, c);
} else if (this._state === State.BeforeCdata3) {
stateBeforeCdata3(this, c);
} else if (this._state === State.AfterCdata1) {
this._stateAfterCdata1(c);
} else if (this._state === State.AfterCdata2) {
this._stateAfterCdata2(c);
} else if (this._state === State.BeforeCdata4) {
stateBeforeCdata4(this, c);
} else if (this._state === State.BeforeCdata5) {
stateBeforeCdata5(this, c);
} else if (this._state === State.BeforeCdata6) {
this._stateBeforeCdata6(c);
} else if (this._state === State.InHexEntity) {
this._stateInHexEntity(c);
} else if (this._state === State.InNumericEntity) {
this._stateInNumericEntity(c);
} else if (this._state === State.BeforeNumericEntity) {
stateBeforeNumericEntity(this, c);
} else {
// No handler matches the current state value: report it and keep going.
this._cbs.onerror(Error("unknown _state"), this._state);
}
// Advance to the next character (or re-read this one if a handler stepped back).
this._index++;
}
// Drop input that is no longer needed before returning to the caller.
this._cleanup();
}
/**
 * Suspends tokenizing: clears `_running`, which makes the parse loop stop
 * before the next character.
 */
pause() {
    this._running = false;
}
/**
 * Resumes tokenizing after `pause()`: parses any input still waiting in the
 * buffer and, if `end()` was already called, finishes the document.
 */
resume() {
    this._running = true;

    const hasUnparsedInput = this._index < this._buffer.length;
    if (hasUnparsedInput) {
        this._parse();
    }

    if (this._ended) {
        this._finish();
    }
}
/**
 * Marks the end of the input, optionally writing one last chunk first.
 *
 * If the tokenizer is running, remaining data is flushed and `onend` is
 * fired immediately; if it is paused, that happens on `resume()`.
 *
 * Calling `end()` again after the input has already ended is reported
 * through `onerror` and otherwise ignored, so `onend` is emitted only once
 * and the extra chunk is not parsed.
 *
 * @param chunk Optional final piece of input.
 */
end(chunk?: string) {
    if (this._ended) {
        this._cbs.onerror(Error(".end() after done!"));
        // Already finished (or finishing on resume): do not finish twice.
        return;
    }
    if (chunk) this.write(chunk);
    this._ended = true;
    if (this._running) this._finish();
}
/**
 * Completes tokenizing: if a section is still open before the current
 * index, its data is emitted in a reasonable way, then `onend` is fired.
 */
_finish() {
    const hasTrailingData = this._sectionStart < this._index;
    if (hasTrailingData) {
        this._handleTrailingData();
    }
    this._cbs.onend();
}
/**
 * Emits whatever is left in the open section when the input ends.
 *
 * - Unterminated CDATA is reported through `oncdata`, unterminated
 *   comments through `oncomment`.
 * - Outside xml mode, an unfinished entity is decoded first; if data
 *   remains afterwards it is handled again in the base state.
 * - Data left inside a tag (tag name, attribute name/value, closing tag
 *   name) is dropped.
 * - Everything else is reported through `ontext`.
 */
_handleTrailingData() {
    const data = this._buffer.slice(this._sectionStart);
    const state = this._state;

    if (
        state === State.InCdata ||
        state === State.AfterCdata1 ||
        state === State.AfterCdata2
    ) {
        this._cbs.oncdata(data);
        return;
    }

    if (
        state === State.InComment ||
        state === State.AfterComment1 ||
        state === State.AfterComment2
    ) {
        this._cbs.oncomment(data);
        return;
    }

    const inEntity =
        state === State.InNamedEntity ||
        state === State.InNumericEntity ||
        state === State.InHexEntity;
    if (inEntity && !this._xmlMode) {
        if (state === State.InNamedEntity) {
            this._parseLegacyEntity();
        } else if (state === State.InNumericEntity) {
            this._decodeNumericEntity(2, 10);
        } else {
            this._decodeNumericEntity(3, 16);
        }
        // Anything the entity did not consume is handled as base-state data.
        if (this._sectionStart < this._index) {
            this._state = this._baseState;
            this._handleTrailingData();
        }
        return;
    }

    switch (state) {
        case State.InTagName:
        case State.BeforeAttributeName:
        case State.BeforeAttributeValue:
        case State.AfterAttributeName:
        case State.InAttributeName:
        case State.InAttributeValueSq:
        case State.InAttributeValueDq:
        case State.InAttributeValueNq:
        case State.InClosingTagName:
            //ignore remaining data
            //TODO add a way to remove current tag
            return;
        default:
            this._cbs.ontext(data);
    }
}
/**
 * Returns the current position as the sum of `_bufferOffset` (advanced by
 * `_cleanup` whenever input is dropped from the buffer) and the index
 * within the current buffer.
 */
getAbsoluteIndex(): number {
    const { _bufferOffset: dropped, _index: local } = this;
    return dropped + local;
}
/**
 * Returns the text of the current section: the buffer contents from
 * `_sectionStart` up to (not including) the current index.
 */
_getSection(): string {
    const from = this._sectionStart;
    const to = this._index;
    return this._buffer.substring(from, to);
}
/**
 * Passes the current section to the named callback and then closes the
 * section by setting `_sectionStart` to -1.
 *
 * @param name The callback to invoke with the section text.
 */
_emitToken(name: "onopentagname" | "onclosetag" | "onattribdata") {
    const section = this._getSection();
    this._cbs[name](section);
    this._sectionStart = -1;
}
/**
 * Emits a decoded fragment to the callback that matches the base state:
 * `ontext` when the base state is text, `onattribdata` otherwise.
 *
 * @param value The fragment to emit.
 */
_emitPartial(value: string) {
    if (this._baseState === State.Text) {
        this._cbs.ontext(value);
        return;
    }
    this._cbs.onattribdata(value); //TODO implement the new event
}
}

View File

@ -0,0 +1,14 @@
import { WritableStream } from "./WritableStream";
describe("WritableStream", () => {
    test("should decode fragmented unicode characters", () => {
        const ontext = jest.fn();
        const stream = new WritableStream({ ontext });

        // The three bytes E2 82 AC are the UTF-8 encoding of "€"; they are
        // delivered in two separate chunks to exercise the stream's decoder.
        const fragments = [Buffer.from([0xe2, 0x82]), Buffer.from([0xac])];
        for (const fragment of fragments) {
            stream.write(fragment);
        }
        stream.end();

        expect(ontext).toBeCalledWith("€");
    });
});

View File

@ -0,0 +1,34 @@
import { Parser, Handler, ParserOptions } from "./Parser";
import { Writable } from "stream";
import { StringDecoder } from "string_decoder";
// Following the example in https://nodejs.org/api/stream.html#stream_decoding_buffers_in_a_writable_stream
/**
 * Type guard that tells whether a chunk handed to `_write` is a `Buffer`.
 *
 * The decision is based solely on the `encoding` argument being `"buffer"`;
 * the chunk itself is not inspected.
 *
 * @param _chunk The chunk received by the stream (unused at runtime).
 * @param encoding The encoding reported alongside the chunk.
 * @returns `true` when `encoding` is exactly `"buffer"`.
 */
function isBuffer(
    _chunk: string | Buffer,
    encoding: string
): _chunk is Buffer {
    return encoding === "buffer";
}
/**
 * WritableStream makes the `Parser` interface available as a NodeJS stream.
 *
 * Chunks written to the stream are forwarded to an internal `Parser`.
 * `Buffer` chunks are first run through a `StringDecoder`, so multi-byte
 * characters that are split across chunks are decoded correctly.
 *
 * @see Parser
 */
export class WritableStream extends Writable {
    /** Parser that receives all decoded input. */
    _parser: Parser;
    /** Decoder that carries incomplete byte sequences between chunks. */
    _decoder = new StringDecoder();

    /**
     * @param cbs Handler callbacks passed to the `Parser`.
     * @param options Parser options passed to the `Parser`.
     */
    constructor(cbs: Partial<Handler>, options?: ParserOptions) {
        // Keep strings as strings; only real Buffers go through the decoder.
        super({ decodeStrings: false });
        this._parser = new Parser(cbs, options);
    }

    /** Decodes the chunk if necessary and feeds it to the parser. */
    _write(chunk: string | Buffer, encoding: string, cb: () => void) {
        const text = isBuffer(chunk, encoding)
            ? this._decoder.write(chunk)
            : chunk;
        this._parser.write(text);
        cb();
    }

    /** Flushes whatever the decoder still holds and ends the parser. */
    _final(cb: () => void) {
        const remainder = this._decoder.end();
        this._parser.end(remainder);
        cb();
    }
}

View File

@ -0,0 +1,25 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- http://en.wikipedia.org/wiki/Atom_%28standard%29 -->
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Feed</title>
<subtitle>A subtitle.</subtitle>
<link href="http://example.org/feed/" rel="self" />
<link href="http://example.org/" />
<id>urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6</id>
<updated>2003-12-13T18:30:02Z</updated>
<author>
<name>John Doe</name>
<email>johndoe@example.com</email>
</author>
<entry>
<title>Atom-Powered Robots Run Amok</title>
<link href="http://example.org/2003/12/13/atom03" />
<link rel="alternate" type="text/html" href="http://example.org/2003/12/13/atom03.html"/>
<link rel="edit" href="http://example.org/2003/12/13/atom03/edit"/>
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
<updated>2003-12-13T18:30:02Z</updated>
<content type="html"><p>Some content.</p></content>
</entry>
</feed>

View File

@ -0,0 +1,16 @@
<!doctype html>
<html>
<head>
<title>Attributes test</title>
</head>
<body>
<!-- Normal attributes -->
<button id="test0" class="value0" title="value1">class="value0" title="value1"</button>
<!-- Attributes with no quotes or value -->
<button id="test1" class=value2 disabled>class=value2 disabled</button>
<!-- Attributes with no space between them. No valid, but accepted by the browser -->
<button id="test2" class="value4"title="value5">class="value4"title="value5"</button>
</body>
</html>

View File

@ -0,0 +1 @@
<!DOCTYPE html><html><title>The Title</title><body>Hello world</body></html>

View File

@ -0,0 +1,63 @@
<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:ev="http://purl.org/rss/1.0/modules/event/" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:syn="http://purl.org/rss/1.0/modules/syndication/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:admin="http://webns.net/mvcb/">
<channel rdf:about="https://github.com/fb55/htmlparser2/">
<title>A title to parse and remember</title>
<link>https://github.com/fb55/htmlparser2/</link>
<description/>
<dc:language>en-us</dc:language>
<dc:rights>Copyright 2015 the authors</dc:rights>
<dc:publisher>webmaster@thisisafakedoma.in</dc:publisher>
<dc:creator>webmaster@thisisafakedoma.in</dc:creator>
<dc:source>https://github.com/fb55/htmlparser2/</dc:source>
<dc:title>A title to parse and remember</dc:title>
<dc:type>Collection</dc:type>
<syn:updateBase>2011-11-04T09:39:10-07:00</syn:updateBase>
<syn:updateFrequency>4</syn:updateFrequency>
<syn:updatePeriod>hourly</syn:updatePeriod>
<items>
<rdf:Seq>
<rdf:li rdf:resource="http://somefakesite/path/to/something.html"/>
</rdf:Seq>
</items>
</channel>
<item rdf:about="http://somefakesite/path/to/something.html">
<title><![CDATA[ Fast HTML Parsing ]]></title>
<link>
http://somefakesite/path/to/something.html
</link>
<description><![CDATA[
Great test content<br>A link: <a href="http://github.com">Github</a>
]]></description>
<dc:date>2011-11-04T09:35:17-07:00</dc:date>
<dc:language>en-us</dc:language>
<dc:rights>Copyright 2015 the authors</dc:rights>
<dc:source>
http://somefakesite/path/to/something.html
</dc:source>
<dc:title><![CDATA[ Fast HTML Parsing ]]></dc:title>
<dc:type>text</dc:type>
<dcterms:issued>2011-11-04T09:35:17-07:00</dcterms:issued>
</item>
<item rdf:about="http://somefakesite/path/to/something-else.html">
<title><![CDATA[
This space intentionally left blank
]]></title>
<link>
http://somefakesite/path/to/something-else.html
</link>
<description><![CDATA[
The early bird gets the worm
]]></description>
<dc:date>2011-11-04T09:34:54-07:00</dc:date>
<dc:language>en-us</dc:language>
<dc:rights>Copyright 2015 the authors</dc:rights>
<dc:source>
http://somefakesite/path/to/something-else.html
</dc:source>
<dc:title><![CDATA[
This space intentionally left blank
]]></dc:title>
<dc:type>text</dc:type>
<dcterms:issued>2011-11-04T09:34:54-07:00</dcterms:issued>
</item>
</rdf:RDF>

View File

@ -0,0 +1,48 @@
<?xml version="1.0"?>
<!-- http://cyber.law.harvard.edu/rss/examples/rss2sample.xml -->
<rss version="2.0">
<channel>
<title>Liftoff News</title>
<link>http://liftoff.msfc.nasa.gov/</link>
<description>Liftoff to Space Exploration.</description>
<language>en-us</language>
<pubDate>Tue, 10 Jun 2003 04:00:00 GMT</pubDate>
<lastBuildDate>Tue, 10 Jun 2003 09:41:01 GMT</lastBuildDate>
<docs>http://blogs.law.harvard.edu/tech/rss</docs>
<generator>Weblog Editor 2.0</generator>
<managingEditor>editor@example.com</managingEditor>
<webMaster>webmaster@example.com</webMaster>
<item>
<title>Star City</title>
<link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link>
<description>How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's &lt;a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm"&gt;Star City&lt;/a&gt;.</description>
<pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
<guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid>
</item>
<item>
<description>Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a &lt;a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm"&gt;partial eclipse of the Sun&lt;/a&gt; on Saturday, May 31st.</description>
<pubDate>Fri, 30 May 2003 11:06:42 GMT</pubDate>
<guid>http://liftoff.msfc.nasa.gov/2003/05/30.html#item572</guid>
</item>
<item>
<title>The Engine That Does More</title>
<link>http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp</link>
<description>Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that.</description>
<pubDate>Tue, 27 May 2003 08:37:32 GMT</pubDate>
<guid>http://liftoff.msfc.nasa.gov/2003/05/27.html#item571</guid>
</item>
<item>
<title>Astronauts' Dirty Laundry</title>
<link>http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp</link>
<description>Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options.</description>
<pubDate>Tue, 20 May 2003 08:56:02 GMT</pubDate>
<guid>http://liftoff.msfc.nasa.gov/2003/05/20.html#item570</guid>
</item>
</channel>
</rss>

View File

@ -0,0 +1,19 @@
<!doctype html>
<html>
<head>
<title>SVG test</title>
</head>
<body>
<svg version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<title>Test</title>
<animate />
<polygon />
<g>
<path>
<title>x</title>
<animate />
</path>
</g>
</svg>
</body>
</html>

View File

@ -0,0 +1,35 @@
{
"name": "simple",
"options": {
"handler": {},
"parser": {}
},
"html": "<h1 class=test>adsf</h1>",
"expected": [
{
"event": "opentagname",
"data": ["h1"]
},
{
"event": "attribute",
"data": ["class", "test"]
},
{
"event": "opentag",
"data": [
"h1",
{
"class": "test"
}
]
},
{
"event": "text",
"data": ["adsf"]
},
{
"event": "closetag",
"data": ["h1"]
}
]
}

View File

@ -0,0 +1,47 @@
{
"name": "Template script tags",
"options": {
"handler": {},
"parser": {}
},
"html": "<p><script type=\"text/template\"><h1>Heading1</h1></script></p>",
"expected": [
{
"event": "opentagname",
"data": ["p"]
},
{
"event": "opentag",
"data": ["p", {}]
},
{
"event": "opentagname",
"data": ["script"]
},
{
"event": "attribute",
"data": ["type", "text/template"]
},
{
"event": "opentag",
"data": [
"script",
{
"type": "text/template"
}
]
},
{
"event": "text",
"data": ["<h1>Heading1</h1>"]
},
{
"event": "closetag",
"data": ["script"]
},
{
"event": "closetag",
"data": ["p"]
}
]
}

View File

@ -0,0 +1,37 @@
{
"name": "Lowercase tags",
"options": {
"handler": {},
"parser": {
"lowerCaseTags": true
}
},
"html": "<H1 class=test>adsf</H1>",
"expected": [
{
"event": "opentagname",
"data": ["h1"]
},
{
"event": "attribute",
"data": ["class", "test"]
},
{
"event": "opentag",
"data": [
"h1",
{
"class": "test"
}
]
},
{
"event": "text",
"data": ["adsf"]
},
{
"event": "closetag",
"data": ["h1"]
}
]
}

View File

@ -0,0 +1,38 @@
{
"name": "CDATA",
"options": {
"handler": {},
"parser": { "xmlMode": true }
},
"html": "<tag><![CDATA[ asdf ><asdf></adsf><> fo]]></tag><![CD>",
"expected": [
{
"event": "opentagname",
"data": ["tag"]
},
{
"event": "opentag",
"data": ["tag", {}]
},
{
"event": "cdatastart",
"data": []
},
{
"event": "text",
"data": [" asdf ><asdf></adsf><> fo"]
},
{
"event": "cdataend",
"data": []
},
{
"event": "closetag",
"data": ["tag"]
},
{
"event": "processinginstruction",
"data": ["![CD", "![CD"]
}
]
}

View File

@ -0,0 +1,26 @@
{
"name": "CDATA (inside special)",
"options": {
"handler": {},
"parser": {}
},
"html": "<script>/*<![CDATA[*/ asdf ><asdf></adsf><> fo/*]]>*/</script>",
"expected": [
{
"event": "opentagname",
"data": ["script"]
},
{
"event": "opentag",
"data": ["script", {}]
},
{
"event": "text",
"data": ["/*<![CDATA[*/ asdf ><asdf></adsf><> fo/*]]>*/"]
},
{
"event": "closetag",
"data": ["script"]
}
]
}

View File

@ -0,0 +1,14 @@
{
"name": "leading lt",
"options": {
"handler": {},
"parser": {}
},
"html": ">a>",
"expected": [
{
"event": "text",
"data": [">a>"]
}
]
}

View File

@ -0,0 +1,47 @@
{
"name": "Self-closing tags",
"options": {
"handler": {},
"parser": {}
},
"html": "<a href=http://test.com/>Foo</a><hr / >",
"expected": [
{
"event": "opentagname",
"data": ["a"]
},
{
"event": "attribute",
"data": ["href", "http://test.com/"]
},
{
"event": "opentag",
"data": [
"a",
{
"href": "http://test.com/"
}
]
},
{
"event": "text",
"data": ["Foo"]
},
{
"event": "closetag",
"data": ["a"]
},
{
"event": "opentagname",
"data": ["hr"]
},
{
"event": "opentag",
"data": ["hr", {}]
},
{
"event": "closetag",
"data": ["hr"]
}
]
}

View File

@ -0,0 +1,85 @@
{
"name": "Implicit close tags",
"options": {},
"html": "<ol><li class=test><div><table style=width:100%><tr><th>TH<td colspan=2><h3>Heading</h3><tr><td><div>Div</div><td><div>Div2</div></table></div><li><div><h3>Heading 2</h3></div></li></ol><p>Para<h4>Heading 4</h4><p><ul><li>Hi<li>bye</ul>",
"expected": [
{ "event": "opentagname", "data": ["ol"] },
{ "event": "opentag", "data": ["ol", {}] },
{ "event": "opentagname", "data": ["li"] },
{ "event": "attribute", "data": ["class", "test"] },
{ "event": "opentag", "data": ["li", { "class": "test" }] },
{ "event": "opentagname", "data": ["div"] },
{ "event": "opentag", "data": ["div", {}] },
{ "event": "opentagname", "data": ["table"] },
{ "event": "attribute", "data": ["style", "width:100%"] },
{ "event": "opentag", "data": ["table", { "style": "width:100%" }] },
{ "event": "opentagname", "data": ["tr"] },
{ "event": "opentag", "data": ["tr", {}] },
{ "event": "opentagname", "data": ["th"] },
{ "event": "opentag", "data": ["th", {}] },
{ "event": "text", "data": ["TH"] },
{ "event": "closetag", "data": ["th"] },
{ "event": "opentagname", "data": ["td"] },
{ "event": "attribute", "data": ["colspan", "2"] },
{ "event": "opentag", "data": ["td", { "colspan": "2" }] },
{ "event": "opentagname", "data": ["h3"] },
{ "event": "opentag", "data": ["h3", {}] },
{ "event": "text", "data": ["Heading"] },
{ "event": "closetag", "data": ["h3"] },
{ "event": "closetag", "data": ["td"] },
{ "event": "closetag", "data": ["tr"] },
{ "event": "opentagname", "data": ["tr"] },
{ "event": "opentag", "data": ["tr", {}] },
{ "event": "opentagname", "data": ["td"] },
{ "event": "opentag", "data": ["td", {}] },
{ "event": "opentagname", "data": ["div"] },
{ "event": "opentag", "data": ["div", {}] },
{ "event": "text", "data": ["Div"] },
{ "event": "closetag", "data": ["div"] },
{ "event": "closetag", "data": ["td"] },
{ "event": "opentagname", "data": ["td"] },
{ "event": "opentag", "data": ["td", {}] },
{ "event": "opentagname", "data": ["div"] },
{ "event": "opentag", "data": ["div", {}] },
{ "event": "text", "data": ["Div2"] },
{ "event": "closetag", "data": ["div"] },
{ "event": "closetag", "data": ["td"] },
{ "event": "closetag", "data": ["tr"] },
{ "event": "closetag", "data": ["table"] },
{ "event": "closetag", "data": ["div"] },
{ "event": "closetag", "data": ["li"] },
{ "event": "opentagname", "data": ["li"] },
{ "event": "opentag", "data": ["li", {}] },
{ "event": "opentagname", "data": ["div"] },
{ "event": "opentag", "data": ["div", {}] },
{ "event": "opentagname", "data": ["h3"] },
{ "event": "opentag", "data": ["h3", {}] },
{ "event": "text", "data": ["Heading 2"] },
{ "event": "closetag", "data": ["h3"] },
{ "event": "closetag", "data": ["div"] },
{ "event": "closetag", "data": ["li"] },
{ "event": "closetag", "data": ["ol"] },
{ "event": "opentagname", "data": ["p"] },
{ "event": "opentag", "data": ["p", {}] },
{ "event": "text", "data": ["Para"] },
{ "event": "closetag", "data": ["p"] },
{ "event": "opentagname", "data": ["h4"] },
{ "event": "opentag", "data": ["h4", {}] },
{ "event": "text", "data": ["Heading 4"] },
{ "event": "closetag", "data": ["h4"] },
{ "event": "opentagname", "data": ["p"] },
{ "event": "opentag", "data": ["p", {}] },
{ "event": "closetag", "data": ["p"] },
{ "event": "opentagname", "data": ["ul"] },
{ "event": "opentag", "data": ["ul", {}] },
{ "event": "opentagname", "data": ["li"] },
{ "event": "opentag", "data": ["li", {}] },
{ "event": "text", "data": ["Hi"] },
{ "event": "closetag", "data": ["li"] },
{ "event": "opentagname", "data": ["li"] },
{ "event": "opentag", "data": ["li", {}] },
{ "event": "text", "data": ["bye"] },
{ "event": "closetag", "data": ["li"] },
{ "event": "closetag", "data": ["ul"] }
]
}

View File

@ -0,0 +1,50 @@
{
"name": "attributes (no white space, no value, no quotes)",
"options": {
"handler": {},
"parser": {}
},
"html": "<button class=\"test0\"title=\"test1\" disabled value=test2>adsf</button>",
"expected": [
{
"event": "opentagname",
"data": ["button"]
},
{
"event": "attribute",
"data": ["class", "test0"]
},
{
"event": "attribute",
"data": ["title", "test1"]
},
{
"event": "attribute",
"data": ["disabled", ""]
},
{
"event": "attribute",
"data": ["value", "test2"]
},
{
"event": "opentag",
"data": [
"button",
{
"class": "test0",
"title": "test1",
"disabled": "",
"value": "test2"
}
]
},
{
"event": "text",
"data": ["adsf"]
},
{
"event": "closetag",
"data": ["button"]
}
]
}

View File

@ -0,0 +1,40 @@
{
"name": "crazy attribute",
"options": {
"handler": {},
"parser": {}
},
"html": "<p < = '' FAIL>stuff</p><a",
"expected": [
{
"event": "opentagname",
"data": ["p"]
},
{
"event": "attribute",
"data": ["<", ""]
},
{
"event": "attribute",
"data": ["fail", ""]
},
{
"event": "opentag",
"data": [
"p",
{
"<": "",
"fail": ""
}
]
},
{
"event": "text",
"data": ["stuff"]
},
{
"event": "closetag",
"data": ["p"]
}
]
}

View File

@ -0,0 +1,38 @@
{
"name": "Scripts creating other scripts",
"options": {
"handler": {},
"parser": {}
},
"html": "<p><script>var str = '<script></'+'script>';</script></p>",
"expected": [
{
"event": "opentagname",
"data": ["p"]
},
{
"event": "opentag",
"data": ["p", {}]
},
{
"event": "opentagname",
"data": ["script"]
},
{
"event": "opentag",
"data": ["script", {}]
},
{
"event": "text",
"data": ["var str = '<script></'+'script>';"]
},
{
"event": "closetag",
"data": ["script"]
},
{
"event": "closetag",
"data": ["p"]
}
]
}

View File

@ -0,0 +1,20 @@
{
"name": "Long comment ending",
"options": {
"handler": {},
"parser": {}
},
"html": "<meta id='before'><!-- text ---><meta id='after'>",
"expected": [
{ "event": "opentagname", "data": ["meta"] },
{ "event": "attribute", "data": ["id", "before"] },
{ "event": "opentag", "data": ["meta", { "id": "before" }] },
{ "event": "closetag", "data": ["meta"] },
{ "event": "comment", "data": [" text -"] },
{ "event": "commentend", "data": [] },
{ "event": "opentagname", "data": ["meta"] },
{ "event": "attribute", "data": ["id", "after"] },
{ "event": "opentag", "data": ["meta", { "id": "after" }] },
{ "event": "closetag", "data": ["meta"] }
]
}

View File

@ -0,0 +1,22 @@
{
"name": "Long CDATA ending",
"options": {
"handler": {},
"parser": { "xmlMode": true }
},
"html": "<before /><tag><![CDATA[ text ]]]></tag><after />",
"expected": [
{ "event": "opentagname", "data": ["before"] },
{ "event": "opentag", "data": ["before", {}] },
{ "event": "closetag", "data": ["before"] },
{ "event": "opentagname", "data": ["tag"] },
{ "event": "opentag", "data": ["tag", {}] },
{ "event": "cdatastart", "data": [] },
{ "event": "text", "data": [" text ]"] },
{ "event": "cdataend", "data": [] },
{ "event": "closetag", "data": ["tag"] },
{ "event": "opentagname", "data": ["after"] },
{ "event": "opentag", "data": ["after", {}] },
{ "event": "closetag", "data": ["after"] }
]
}

View File

@ -0,0 +1,27 @@
{
"name": "Implicit open p and br tags",
"options": {
"handler": {},
"parser": {}
},
"html": "<div>Hallo</p>World</br></ignore></div></p></br>",
"expected": [
{ "event": "opentagname", "data": ["div"] },
{ "event": "opentag", "data": ["div", {}] },
{ "event": "text", "data": ["Hallo"] },
{ "event": "opentagname", "data": ["p"] },
{ "event": "opentag", "data": ["p", {}] },
{ "event": "closetag", "data": ["p"] },
{ "event": "text", "data": ["World"] },
{ "event": "opentagname", "data": ["br"] },
{ "event": "opentag", "data": ["br", {}] },
{ "event": "closetag", "data": ["br"] },
{ "event": "closetag", "data": ["div"] },
{ "event": "opentagname", "data": ["p"] },
{ "event": "opentag", "data": ["p", {}] },
{ "event": "closetag", "data": ["p"] },
{ "event": "opentagname", "data": ["br"] },
{ "event": "opentag", "data": ["br", {}] },
{ "event": "closetag", "data": ["br"] }
]
}

View File

@ -0,0 +1,14 @@
{
"name": "lt followed by whitespace",
"options": {
"handler": {},
"parser": {}
},
"html": "a < b",
"expected": [
{
"event": "text",
"data": ["a < b"]
}
]
}

View File

@ -0,0 +1,35 @@
{
"name": "double attribute",
"options": {
"handler": {},
"parser": {}
},
"html": "<h1 class=test class=boo></h1>",
"expected": [
{
"event": "opentagname",
"data": ["h1"]
},
{
"event": "attribute",
"data": ["class", "test"]
},
{
"event": "attribute",
"data": ["class", "boo"]
},
{
"event": "opentag",
"data": [
"h1",
{
"class": "test"
}
]
},
{
"event": "closetag",
"data": ["h1"]
}
]
}

View File

@ -0,0 +1,14 @@
{
"name": "numeric entities",
"options": {
"handler": {},
"parser": { "decodeEntities": true }
},
"html": "&#x61;&#x62&#99;&#100&#x66g&#x;&#x68",
"expected": [
{
"event": "text",
"data": ["abcdfg&#x;h"]
}
]
}

View File

@ -0,0 +1,14 @@
{
"name": "legacy entities",
"options": {
"handler": {},
"parser": { "decodeEntities": true }
},
"html": "&AMPel&iacutee&ampeer;s&lter",
"expected": [
{
"event": "text",
"data": ["&el\u00EDe&eer;s<er"]
}
]
}

View File

@ -0,0 +1,14 @@
{
"name": "named entities",
"options": {
"handler": {},
"parser": { "decodeEntities": true }
},
"html": "&amp;el&lt;er&CounterClockwiseContourIntegral;foo&bar",
"expected": [
{
"event": "text",
"data": ["&el<er\u2233foo&bar"]
}
]
}

View File

@ -0,0 +1,14 @@
{
"name": "xml entities",
"options": {
"handler": {},
"parser": { "decodeEntities": true, "xmlMode": true }
},
"html": "&amp;&gt;&amp&lt;&uuml;&#x61;&#x62&#99;&#100&#101",
"expected": [
{
"event": "text",
"data": ["&>&amp<&uuml;a&#x62c&#100&#101"]
}
]
}

View File

@ -0,0 +1,34 @@
{
"name": "entity in attribute",
"options": {
"handler": {},
"parser": { "decodeEntities": true }
},
"html": "<a href='http://example.com/p&#x61;ge?param=value&param2&param3=&lt;val&; & &'>",
"expected": [
{
"event": "opentagname",
"data": ["a"]
},
{
"event": "attribute",
"data": [
"href",
"http://example.com/page?param=value&param2&param3=<val&; & &"
]
},
{
"event": "opentag",
"data": [
"a",
{
"href": "http://example.com/page?param=value&param2&param3=<val&; & &"
}
]
},
{
"event": "closetag",
"data": ["a"]
}
]
}

View File

@ -0,0 +1,30 @@
{
"name": "double brackets",
"options": {
"handler": {},
"parser": {}
},
"html": "<<princess-purpose>>testing</princess-purpose>",
"expected": [
{
"event": "text",
"data": ["<"]
},
{
"event": "opentagname",
"data": ["princess-purpose"]
},
{
"event": "opentag",
"data": ["princess-purpose", {}]
},
{
"event": "text",
"data": [">testing"]
},
{
"event": "closetag",
"data": ["princess-purpose"]
}
]
}

View File

@ -0,0 +1,14 @@
{
"name": "legacy entities",
"options": {
"handler": {},
"parser": { "decodeEntities": true }
},
"html": "M&M",
"expected": [
{
"event": "text",
"data": ["M&M"]
}
]
}

View File

@ -0,0 +1,87 @@
{
"name": "Special special tags",
"options": {},
"html": "<sCriPT></scripter</soo</sCript><STyLE></styler</STylE><sCiPt><stylee><scriptee><soo>",
"expected": [
{
"event": "opentagname",
"data": ["script"]
},
{
"event": "opentag",
"data": ["script", {}]
},
{
"event": "text",
"data": ["</scripter</soo"]
},
{
"event": "closetag",
"data": ["script"]
},
{
"event": "opentagname",
"data": ["style"]
},
{
"event": "opentag",
"data": ["style", {}]
},
{
"event": "text",
"data": ["</styler"]
},
{
"event": "closetag",
"data": ["style"]
},
{
"event": "opentagname",
"data": ["scipt"]
},
{
"event": "opentag",
"data": ["scipt", {}]
},
{
"event": "opentagname",
"data": ["stylee"]
},
{
"event": "opentag",
"data": ["stylee", {}]
},
{
"event": "opentagname",
"data": ["scriptee"]
},
{
"event": "opentag",
"data": ["scriptee", {}]
},
{
"event": "opentagname",
"data": ["soo"]
},
{
"event": "opentag",
"data": ["soo", {}]
},
{
"event": "closetag",
"data": ["soo"]
},
{
"event": "closetag",
"data": ["scriptee"]
},
{
"event": "closetag",
"data": ["stylee"]
},
{
"event": "closetag",
"data": ["scipt"]
}
]
}

View File

@ -0,0 +1,11 @@
{
"name": "Empty tag name",
"options": {},
"html": "< ></ >",
"expected": [
{
"event": "text",
"data": ["< ></ >"]
}
]
}

View File

@ -0,0 +1,28 @@
{
"name": "Not quite closed",
"options": {},
"html": "<foo /bar></foo bar>",
"expected": [
{
"event": "opentagname",
"data": ["foo"]
},
{
"event": "attribute",
"data": ["bar", ""]
},
{
"event": "opentag",
"data": [
"foo",
{
"bar": ""
}
]
},
{
"event": "closetag",
"data": ["foo"]
}
]
}

View File

@ -0,0 +1,46 @@
{
"name": "Entities in attributes",
"options": {
"handler": {},
"parser": { "decodeEntities": true }
},
"html": "<foo bar=&amp; baz=\"&amp;\" boo='&amp;' noo=>",
"expected": [
{
"event": "opentagname",
"data": ["foo"]
},
{
"event": "attribute",
"data": ["bar", "&"]
},
{
"event": "attribute",
"data": ["baz", "&"]
},
{
"event": "attribute",
"data": ["boo", "&"]
},
{
"event": "attribute",
"data": ["noo", ""]
},
{
"event": "opentag",
"data": [
"foo",
{
"bar": "&",
"baz": "&",
"boo": "&",
"noo": ""
}
]
},
{
"event": "closetag",
"data": ["foo"]
}
]
}

View File

@ -0,0 +1,9 @@
{
"name": "CDATA in HTML",
"options": {},
"html": "<![CDATA[ foo ]]>",
"expected": [
{ "event": "comment", "data": ["[CDATA[ foo ]]"] },
{ "event": "commentend", "data": [] }
]
}

View File

@ -0,0 +1,15 @@
{
"name": "Comment edge-cases",
"options": {},
"html": "<!-foo><!-- --- --><!--foo",
"expected": [
{
"event": "processinginstruction",
"data": ["!-foo", "!-foo"]
},
{ "event": "comment", "data": [" --- "] },
{ "event": "commentend", "data": [] },
{ "event": "comment", "data": ["foo"] },
{ "event": "commentend", "data": [] }
]
}

View File

@ -0,0 +1,19 @@
{
"name": "CDATA edge-cases",
"options": {
"parser": { "recognizeCDATA": true }
},
"html": "<![CDATA><![CDATA[[]]sdaf]]><![CDATA[foo",
"expected": [
{
"event": "processinginstruction",
"data": ["![cdata", "![CDATA"]
},
{ "event": "cdatastart", "data": [] },
{ "event": "text", "data": ["[]]sdaf"] },
{ "event": "cdataend", "data": [] },
{ "event": "cdatastart", "data": [] },
{ "event": "text", "data": ["foo"] },
{ "event": "cdataend", "data": [] }
]
}

View File

@ -0,0 +1,9 @@
{
"name": "Comment false ending",
"options": {},
"html": "<!-- a-b-> -->",
"expected": [
{ "event": "comment", "data": [" a-b-> "] },
{ "event": "commentend", "data": [] }
]
}

View File

@ -0,0 +1,26 @@
{
"name": "Scripts ending with <",
"options": {
"handler": {},
"parser": {}
},
"html": "<script><</script>",
"expected": [
{
"event": "opentagname",
"data": ["script"]
},
{
"event": "opentag",
"data": ["script", {}]
},
{
"event": "text",
"data": ["<"]
},
{
"event": "closetag",
"data": ["script"]
}
]
}

View File

@ -0,0 +1,12 @@
{
"name": "CDATA more edge-cases",
"options": {
"parser": { "recognizeCDATA": true }
},
"html": "<![CDATA[foo]bar]>baz]]>",
"expected": [
{ "event": "cdatastart", "data": [] },
{ "event": "text", "data": ["foo]bar]>baz"] },
{ "event": "cdataend", "data": [] }
]
}

View File

@ -0,0 +1,5 @@
{
"name": "RSS (2.0)",
"file": "RSS_Example.xml",
"useSnapshot": true
}

View File

@ -0,0 +1,5 @@
{
"name": "Atom (1.0)",
"file": "Atom_Example.xml",
"useSnapshot": true
}

View File

@ -0,0 +1,5 @@
{
"name": "RDF test",
"file": "RDF_Example.xml",
"useSnapshot": true
}

View File

@ -0,0 +1,55 @@
{
"name": "Basic html",
"options": {},
"file": "Basic.html",
"expected": [
{
"event": "processinginstruction",
"data": ["!doctype", "!DOCTYPE html"]
},
{
"event": "opentagname",
"data": ["html"]
},
{
"event": "opentag",
"data": ["html", {}]
},
{
"event": "opentagname",
"data": ["title"]
},
{
"event": "opentag",
"data": ["title", {}]
},
{
"event": "text",
"data": ["The Title"]
},
{
"event": "closetag",
"data": ["title"]
},
{
"event": "opentagname",
"data": ["body"]
},
{
"event": "opentag",
"data": ["body", {}]
},
{
"event": "text",
"data": ["Hello world"]
},
{
"event": "closetag",
"data": ["body"]
},
{
"event": "closetag",
"data": ["html"]
}
]
}

View File

@ -0,0 +1,722 @@
{
"name": "RSS feed",
"options": { "xmlMode": true },
"file": "RSS_Example.xml",
"expected": [
{
"event": "processinginstruction",
"data": ["?xml", "?xml version=\"1.0\"?"]
},
{
"event": "text",
"data": ["\n"]
},
{
"event": "comment",
"data": [
" http://cyber.law.harvard.edu/rss/examples/rss2sample.xml "
]
},
{
"event": "commentend",
"data": []
},
{
"event": "text",
"data": ["\n"]
},
{
"event": "opentagname",
"data": ["rss"]
},
{
"event": "attribute",
"data": ["version", "2.0"]
},
{
"event": "opentag",
"data": [
"rss",
{
"version": "2.0"
}
]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["channel"]
},
{
"event": "opentag",
"data": ["channel", {}]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["title"]
},
{
"event": "opentag",
"data": ["title", {}]
},
{
"event": "text",
"data": ["Liftoff News"]
},
{
"event": "closetag",
"data": ["title"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["link"]
},
{
"event": "opentag",
"data": ["link", {}]
},
{
"event": "text",
"data": ["http://liftoff.msfc.nasa.gov/"]
},
{
"event": "closetag",
"data": ["link"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["description"]
},
{
"event": "opentag",
"data": ["description", {}]
},
{
"event": "text",
"data": ["Liftoff to Space Exploration."]
},
{
"event": "closetag",
"data": ["description"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["language"]
},
{
"event": "opentag",
"data": ["language", {}]
},
{
"event": "text",
"data": ["en-us"]
},
{
"event": "closetag",
"data": ["language"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["pubDate"]
},
{
"event": "opentag",
"data": ["pubDate", {}]
},
{
"event": "text",
"data": ["Tue, 10 Jun 2003 04:00:00 GMT"]
},
{
"event": "closetag",
"data": ["pubDate"]
},
{
"event": "text",
"data": ["\n\n "]
},
{
"event": "opentagname",
"data": ["lastBuildDate"]
},
{
"event": "opentag",
"data": ["lastBuildDate", {}]
},
{
"event": "text",
"data": ["Tue, 10 Jun 2003 09:41:01 GMT"]
},
{
"event": "closetag",
"data": ["lastBuildDate"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["docs"]
},
{
"event": "opentag",
"data": ["docs", {}]
},
{
"event": "text",
"data": ["http://blogs.law.harvard.edu/tech/rss"]
},
{
"event": "closetag",
"data": ["docs"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["generator"]
},
{
"event": "opentag",
"data": ["generator", {}]
},
{
"event": "text",
"data": ["Weblog Editor 2.0"]
},
{
"event": "closetag",
"data": ["generator"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["managingEditor"]
},
{
"event": "opentag",
"data": ["managingEditor", {}]
},
{
"event": "text",
"data": ["editor@example.com"]
},
{
"event": "closetag",
"data": ["managingEditor"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["webMaster"]
},
{
"event": "opentag",
"data": ["webMaster", {}]
},
{
"event": "text",
"data": ["webmaster@example.com"]
},
{
"event": "closetag",
"data": ["webMaster"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["item"]
},
{
"event": "opentag",
"data": ["item", {}]
},
{
"event": "text",
"data": ["\n\n "]
},
{
"event": "opentagname",
"data": ["title"]
},
{
"event": "opentag",
"data": ["title", {}]
},
{
"event": "text",
"data": ["Star City"]
},
{
"event": "closetag",
"data": ["title"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["link"]
},
{
"event": "opentag",
"data": ["link", {}]
},
{
"event": "text",
"data": ["http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp"]
},
{
"event": "closetag",
"data": ["link"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["description"]
},
{
"event": "opentag",
"data": ["description", {}]
},
{
"event": "text",
"data": [
"How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's &lt;a href=\"http://howe.iki.rssi.ru/GCTC/gctc_e.htm\"&gt;Star City&lt;/a&gt;."
]
},
{
"event": "closetag",
"data": ["description"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["pubDate"]
},
{
"event": "opentag",
"data": ["pubDate", {}]
},
{
"event": "text",
"data": ["Tue, 03 Jun 2003 09:39:21 GMT"]
},
{
"event": "closetag",
"data": ["pubDate"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["guid"]
},
{
"event": "opentag",
"data": ["guid", {}]
},
{
"event": "text",
"data": ["http://liftoff.msfc.nasa.gov/2003/06/03.html#item573"]
},
{
"event": "closetag",
"data": ["guid"]
},
{
"event": "text",
"data": ["\n\n "]
},
{
"event": "closetag",
"data": ["item"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["item"]
},
{
"event": "opentag",
"data": ["item", {}]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["description"]
},
{
"event": "opentag",
"data": ["description", {}]
},
{
"event": "text",
"data": [
"Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a &lt;a href=\"http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm\"&gt;partial eclipse of the Sun&lt;/a&gt; on Saturday, May 31st."
]
},
{
"event": "closetag",
"data": ["description"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["pubDate"]
},
{
"event": "opentag",
"data": ["pubDate", {}]
},
{
"event": "text",
"data": ["Fri, 30 May 2003 11:06:42 GMT"]
},
{
"event": "closetag",
"data": ["pubDate"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["guid"]
},
{
"event": "opentag",
"data": ["guid", {}]
},
{
"event": "text",
"data": ["http://liftoff.msfc.nasa.gov/2003/05/30.html#item572"]
},
{
"event": "closetag",
"data": ["guid"]
},
{
"event": "text",
"data": ["\n\n "]
},
{
"event": "closetag",
"data": ["item"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["item"]
},
{
"event": "opentag",
"data": ["item", {}]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["title"]
},
{
"event": "opentag",
"data": ["title", {}]
},
{
"event": "text",
"data": ["The Engine That Does More"]
},
{
"event": "closetag",
"data": ["title"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["link"]
},
{
"event": "opentag",
"data": ["link", {}]
},
{
"event": "text",
"data": ["http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp"]
},
{
"event": "closetag",
"data": ["link"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["description"]
},
{
"event": "opentag",
"data": ["description", {}]
},
{
"event": "text",
"data": [
"Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that."
]
},
{
"event": "closetag",
"data": ["description"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["pubDate"]
},
{
"event": "opentag",
"data": ["pubDate", {}]
},
{
"event": "text",
"data": ["Tue, 27 May 2003 08:37:32 GMT"]
},
{
"event": "closetag",
"data": ["pubDate"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["guid"]
},
{
"event": "opentag",
"data": ["guid", {}]
},
{
"event": "text",
"data": ["http://liftoff.msfc.nasa.gov/2003/05/27.html#item571"]
},
{
"event": "closetag",
"data": ["guid"]
},
{
"event": "text",
"data": ["\n\n "]
},
{
"event": "closetag",
"data": ["item"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["item"]
},
{
"event": "opentag",
"data": ["item", {}]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["title"]
},
{
"event": "opentag",
"data": ["title", {}]
},
{
"event": "text",
"data": ["Astronauts' Dirty Laundry"]
},
{
"event": "closetag",
"data": ["title"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["link"]
},
{
"event": "opentag",
"data": ["link", {}]
},
{
"event": "text",
"data": ["http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp"]
},
{
"event": "closetag",
"data": ["link"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["description"]
},
{
"event": "opentag",
"data": ["description", {}]
},
{
"event": "text",
"data": [
"Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options."
]
},
{
"event": "closetag",
"data": ["description"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["pubDate"]
},
{
"event": "opentag",
"data": ["pubDate", {}]
},
{
"event": "text",
"data": ["Tue, 20 May 2003 08:56:02 GMT"]
},
{
"event": "closetag",
"data": ["pubDate"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "opentagname",
"data": ["guid"]
},
{
"event": "opentag",
"data": ["guid", {}]
},
{
"event": "text",
"data": ["http://liftoff.msfc.nasa.gov/2003/05/20.html#item570"]
},
{
"event": "closetag",
"data": ["guid"]
},
{
"event": "text",
"data": ["\n\n "]
},
{
"event": "closetag",
"data": ["item"]
},
{
"event": "text",
"data": ["\n "]
},
{
"event": "closetag",
"data": ["channel"]
},
{
"event": "text",
"data": ["\n"]
},
{
"event": "closetag",
"data": ["rss"]
}
]
}

View File

@ -0,0 +1,462 @@
{
"name": "Atom feed",
"options": { "xmlMode": true },
"file": "Atom_Example.xml",
"expected": [
{
"event": "processinginstruction",
"data": ["?xml", "?xml version=\"1.0\" encoding=\"utf-8\"?"]
},
{
"event": "text",
"data": ["\n"]
},
{
"event": "comment",
"data": [" http://en.wikipedia.org/wiki/Atom_%28standard%29 "]
},
{
"event": "commentend",
"data": []
},
{
"event": "text",
"data": ["\n"]
},
{
"event": "opentagname",
"data": ["feed"]
},
{
"event": "attribute",
"data": ["xmlns", "http://www.w3.org/2005/Atom"]
},
{
"event": "opentag",
"data": [
"feed",
{
"xmlns": "http://www.w3.org/2005/Atom"
}
]
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "opentagname",
"data": ["title"]
},
{
"event": "opentag",
"data": ["title", {}]
},
{
"event": "text",
"data": ["Example Feed"]
},
{
"event": "closetag",
"data": ["title"]
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "opentagname",
"data": ["subtitle"]
},
{
"event": "opentag",
"data": ["subtitle", {}]
},
{
"event": "text",
"data": ["A subtitle."]
},
{
"event": "closetag",
"data": ["subtitle"]
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "opentagname",
"data": ["link"]
},
{
"event": "attribute",
"data": ["href", "http://example.org/feed/"]
},
{
"event": "attribute",
"data": ["rel", "self"]
},
{
"event": "opentag",
"data": [
"link",
{
"href": "http://example.org/feed/",
"rel": "self"
}
]
},
{
"event": "closetag",
"data": ["link"]
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "opentagname",
"data": ["link"]
},
{
"event": "attribute",
"data": ["href", "http://example.org/"]
},
{
"event": "opentag",
"data": [
"link",
{
"href": "http://example.org/"
}
]
},
{
"event": "closetag",
"data": ["link"]
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "opentagname",
"data": ["id"]
},
{
"event": "opentag",
"data": ["id", {}]
},
{
"event": "text",
"data": ["urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6"]
},
{
"event": "closetag",
"data": ["id"]
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "opentagname",
"data": ["updated"]
},
{
"event": "opentag",
"data": ["updated", {}]
},
{
"event": "text",
"data": ["2003-12-13T18:30:02Z"]
},
{
"event": "closetag",
"data": ["updated"]
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "opentagname",
"data": ["author"]
},
{
"event": "opentag",
"data": ["author", {}]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["name"]
},
{
"event": "opentag",
"data": ["name", {}]
},
{
"event": "text",
"data": ["John Doe"]
},
{
"event": "closetag",
"data": ["name"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["email"]
},
{
"event": "opentag",
"data": ["email", {}]
},
{
"event": "text",
"data": ["johndoe@example.com"]
},
{
"event": "closetag",
"data": ["email"]
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "closetag",
"data": ["author"]
},
{
"event": "text",
"data": ["\n\n\t"]
},
{
"event": "opentagname",
"data": ["entry"]
},
{
"event": "opentag",
"data": ["entry", {}]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["title"]
},
{
"event": "opentag",
"data": ["title", {}]
},
{
"event": "text",
"data": ["Atom-Powered Robots Run Amok"]
},
{
"event": "closetag",
"data": ["title"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["link"]
},
{
"event": "attribute",
"data": ["href", "http://example.org/2003/12/13/atom03"]
},
{
"event": "opentag",
"data": [
"link",
{
"href": "http://example.org/2003/12/13/atom03"
}
]
},
{
"event": "closetag",
"data": ["link"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["link"]
},
{
"event": "attribute",
"data": ["rel", "alternate"]
},
{
"event": "attribute",
"data": ["type", "text/html"]
},
{
"event": "attribute",
"data": ["href", "http://example.org/2003/12/13/atom03.html"]
},
{
"event": "opentag",
"data": [
"link",
{
"rel": "alternate",
"type": "text/html",
"href": "http://example.org/2003/12/13/atom03.html"
}
]
},
{
"event": "closetag",
"data": ["link"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["link"]
},
{
"event": "attribute",
"data": ["rel", "edit"]
},
{
"event": "attribute",
"data": ["href", "http://example.org/2003/12/13/atom03/edit"]
},
{
"event": "opentag",
"data": [
"link",
{
"rel": "edit",
"href": "http://example.org/2003/12/13/atom03/edit"
}
]
},
{
"event": "closetag",
"data": ["link"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["id"]
},
{
"event": "opentag",
"data": ["id", {}]
},
{
"event": "text",
"data": ["urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a"]
},
{
"event": "closetag",
"data": ["id"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["updated"]
},
{
"event": "opentag",
"data": ["updated", {}]
},
{
"event": "text",
"data": ["2003-12-13T18:30:02Z"]
},
{
"event": "closetag",
"data": ["updated"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["content"]
},
{
"event": "attribute",
"data": ["type", "html"]
},
{
"event": "opentag",
"data": [
"content",
{
"type": "html"
}
]
},
{
"event": "opentagname",
"data": ["p"]
},
{
"event": "opentag",
"data": ["p", {}]
},
{
"event": "text",
"data": ["Some content."]
},
{
"event": "closetag",
"data": ["p"]
},
{
"event": "closetag",
"data": ["content"]
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "closetag",
"data": ["entry"]
},
{
"event": "text",
"data": ["\n\n"]
},
{
"event": "closetag",
"data": ["feed"]
},
{
"event": "text",
"data": ["\n"]
}
]
}

View File

@ -0,0 +1,950 @@
{
"name": "RDF feed",
"options": { "xmlMode": true },
"file": "RDF_Example.xml",
"expected": [
{
"event": "processinginstruction",
"data": ["?xml", "?xml version=\"1.0\" encoding=\"UTF-8\"?"]
},
{
"event": "text",
"data": ["\n"]
},
{
"event": "opentagname",
"data": ["rdf:RDF"]
},
{
"event": "attribute",
"data": ["xmlns:rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"]
},
{
"event": "attribute",
"data": ["xmlns", "http://purl.org/rss/1.0/"]
},
{
"event": "attribute",
"data": ["xmlns:ev", "http://purl.org/rss/1.0/modules/event/"]
},
{
"event": "attribute",
"data": [
"xmlns:content",
"http://purl.org/rss/1.0/modules/content/"
]
},
{
"event": "attribute",
"data": ["xmlns:taxo", "http://purl.org/rss/1.0/modules/taxonomy/"]
},
{
"event": "attribute",
"data": ["xmlns:dc", "http://purl.org/dc/elements/1.1/"]
},
{
"event": "attribute",
"data": [
"xmlns:syn",
"http://purl.org/rss/1.0/modules/syndication/"
]
},
{
"event": "attribute",
"data": ["xmlns:dcterms", "http://purl.org/dc/terms/"]
},
{
"event": "attribute",
"data": ["xmlns:admin", "http://webns.net/mvcb/"]
},
{
"event": "opentag",
"data": [
"rdf:RDF",
{
"xmlns:rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
"xmlns": "http://purl.org/rss/1.0/",
"xmlns:ev": "http://purl.org/rss/1.0/modules/event/",
"xmlns:content": "http://purl.org/rss/1.0/modules/content/",
"xmlns:taxo": "http://purl.org/rss/1.0/modules/taxonomy/",
"xmlns:dc": "http://purl.org/dc/elements/1.1/",
"xmlns:syn": "http://purl.org/rss/1.0/modules/syndication/",
"xmlns:dcterms": "http://purl.org/dc/terms/",
"xmlns:admin": "http://webns.net/mvcb/"
}
]
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "opentagname",
"data": ["channel"]
},
{
"event": "attribute",
"data": ["rdf:about", "https://github.com/fb55/htmlparser2/"]
},
{
"event": "opentag",
"data": [
"channel",
{
"rdf:about": "https://github.com/fb55/htmlparser2/"
}
]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["title"]
},
{
"event": "opentag",
"data": ["title", {}]
},
{
"event": "text",
"data": ["A title to parse and remember"]
},
{
"event": "closetag",
"data": ["title"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["link"]
},
{
"event": "opentag",
"data": ["link", {}]
},
{
"event": "text",
"data": ["https://github.com/fb55/htmlparser2/"]
},
{
"event": "closetag",
"data": ["link"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["description"]
},
{
"event": "opentag",
"data": ["description", {}]
},
{
"event": "closetag",
"data": ["description"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["dc:language"]
},
{
"event": "opentag",
"data": ["dc:language", {}]
},
{
"event": "text",
"data": ["en-us"]
},
{
"event": "closetag",
"data": ["dc:language"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["dc:rights"]
},
{
"event": "opentag",
"data": ["dc:rights", {}]
},
{
"event": "text",
"data": ["Copyright 2015 the authors"]
},
{
"event": "closetag",
"data": ["dc:rights"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["dc:publisher"]
},
{
"event": "opentag",
"data": ["dc:publisher", {}]
},
{
"event": "text",
"data": ["webmaster@thisisafakedoma.in"]
},
{
"event": "closetag",
"data": ["dc:publisher"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["dc:creator"]
},
{
"event": "opentag",
"data": ["dc:creator", {}]
},
{
"event": "text",
"data": ["webmaster@thisisafakedoma.in"]
},
{
"event": "closetag",
"data": ["dc:creator"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["dc:source"]
},
{
"event": "opentag",
"data": ["dc:source", {}]
},
{
"event": "text",
"data": ["https://github.com/fb55/htmlparser2/"]
},
{
"event": "closetag",
"data": ["dc:source"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["dc:title"]
},
{
"event": "opentag",
"data": ["dc:title", {}]
},
{
"event": "text",
"data": ["A title to parse and remember"]
},
{
"event": "closetag",
"data": ["dc:title"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["dc:type"]
},
{
"event": "opentag",
"data": ["dc:type", {}]
},
{
"event": "text",
"data": ["Collection"]
},
{
"event": "closetag",
"data": ["dc:type"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["syn:updateBase"]
},
{
"event": "opentag",
"data": ["syn:updateBase", {}]
},
{
"event": "text",
"data": ["2011-11-04T09:39:10-07:00"]
},
{
"event": "closetag",
"data": ["syn:updateBase"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["syn:updateFrequency"]
},
{
"event": "opentag",
"data": ["syn:updateFrequency", {}]
},
{
"event": "text",
"data": ["4"]
},
{
"event": "closetag",
"data": ["syn:updateFrequency"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["syn:updatePeriod"]
},
{
"event": "opentag",
"data": ["syn:updatePeriod", {}]
},
{
"event": "text",
"data": ["hourly"]
},
{
"event": "closetag",
"data": ["syn:updatePeriod"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["items"]
},
{
"event": "opentag",
"data": ["items", {}]
},
{
"event": "text",
"data": ["\n\t\t\t"]
},
{
"event": "opentagname",
"data": ["rdf:Seq"]
},
{
"event": "opentag",
"data": ["rdf:Seq", {}]
},
{
"event": "text",
"data": ["\n\t\t\t\t"]
},
{
"event": "opentagname",
"data": ["rdf:li"]
},
{
"event": "attribute",
"data": [
"rdf:resource",
"http://somefakesite/path/to/something.html"
]
},
{
"event": "opentag",
"data": [
"rdf:li",
{
"rdf:resource": "http://somefakesite/path/to/something.html"
}
]
},
{
"event": "closetag",
"data": ["rdf:li"]
},
{
"event": "text",
"data": ["\n\t\t\t"]
},
{
"event": "closetag",
"data": ["rdf:Seq"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "closetag",
"data": ["items"]
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "closetag",
"data": ["channel"]
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "opentagname",
"data": ["item"]
},
{
"event": "attribute",
"data": ["rdf:about", "http://somefakesite/path/to/something.html"]
},
{
"event": "opentag",
"data": [
"item",
{
"rdf:about": "http://somefakesite/path/to/something.html"
}
]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["title"]
},
{
"event": "opentag",
"data": ["title", {}]
},
{
"event": "cdatastart",
"data": []
},
{
"event": "text",
"data": [" Fast HTML Parsing "]
},
{
"event": "cdataend",
"data": []
},
{
"event": "closetag",
"data": ["title"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["link"]
},
{
"event": "opentag",
"data": ["link", {}]
},
{
"event": "text",
"data": ["\nhttp://somefakesite/path/to/something.html\n"]
},
{
"event": "closetag",
"data": ["link"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["description"]
},
{
"event": "opentag",
"data": ["description", {}]
},
{
"event": "cdatastart",
"data": []
},
{
"event": "text",
"data": [
"\nGreat test content<br>A link: <a href=\"http://github.com\">Github</a>\n"
]
},
{
"event": "cdataend",
"data": []
},
{
"event": "closetag",
"data": ["description"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["dc:date"]
},
{
"event": "opentag",
"data": ["dc:date", {}]
},
{
"event": "text",
"data": ["2011-11-04T09:35:17-07:00"]
},
{
"event": "closetag",
"data": ["dc:date"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["dc:language"]
},
{
"event": "opentag",
"data": ["dc:language", {}]
},
{
"event": "text",
"data": ["en-us"]
},
{
"event": "closetag",
"data": ["dc:language"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["dc:rights"]
},
{
"event": "opentag",
"data": ["dc:rights", {}]
},
{
"event": "text",
"data": ["Copyright 2015 the authors"]
},
{
"event": "closetag",
"data": ["dc:rights"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["dc:source"]
},
{
"event": "opentag",
"data": ["dc:source", {}]
},
{
"event": "text",
"data": ["\nhttp://somefakesite/path/to/something.html\n"]
},
{
"event": "closetag",
"data": ["dc:source"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["dc:title"]
},
{
"event": "opentag",
"data": ["dc:title", {}]
},
{
"event": "cdatastart",
"data": []
},
{
"event": "text",
"data": [" Fast HTML Parsing "]
},
{
"event": "cdataend",
"data": []
},
{
"event": "closetag",
"data": ["dc:title"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["dc:type"]
},
{
"event": "opentag",
"data": ["dc:type", {}]
},
{
"event": "text",
"data": ["text"]
},
{
"event": "closetag",
"data": ["dc:type"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["dcterms:issued"]
},
{
"event": "opentag",
"data": ["dcterms:issued", {}]
},
{
"event": "text",
"data": ["2011-11-04T09:35:17-07:00"]
},
{
"event": "closetag",
"data": ["dcterms:issued"]
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "closetag",
"data": ["item"]
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "opentagname",
"data": ["item"]
},
{
"event": "attribute",
"data": [
"rdf:about",
"http://somefakesite/path/to/something-else.html"
]
},
{
"event": "opentag",
"data": [
"item",
{
"rdf:about": "http://somefakesite/path/to/something-else.html"
}
]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["title"]
},
{
"event": "opentag",
"data": ["title", {}]
},
{
"event": "cdatastart",
"data": []
},
{
"event": "text",
"data": ["\nThis space intentionally left blank\n"]
},
{
"event": "cdataend",
"data": []
},
{
"event": "closetag",
"data": ["title"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["link"]
},
{
"event": "opentag",
"data": ["link", {}]
},
{
"event": "text",
"data": ["\nhttp://somefakesite/path/to/something-else.html\n"]
},
{
"event": "closetag",
"data": ["link"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["description"]
},
{
"event": "opentag",
"data": ["description", {}]
},
{
"event": "cdatastart",
"data": []
},
{
"event": "text",
"data": ["\nThe early bird gets the worm\n"]
},
{
"event": "cdataend",
"data": []
},
{
"event": "closetag",
"data": ["description"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["dc:date"]
},
{
"event": "opentag",
"data": ["dc:date", {}]
},
{
"event": "text",
"data": ["2011-11-04T09:34:54-07:00"]
},
{
"event": "closetag",
"data": ["dc:date"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["dc:language"]
},
{
"event": "opentag",
"data": ["dc:language", {}]
},
{
"event": "text",
"data": ["en-us"]
},
{
"event": "closetag",
"data": ["dc:language"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["dc:rights"]
},
{
"event": "opentag",
"data": ["dc:rights", {}]
},
{
"event": "text",
"data": ["Copyright 2015 the authors"]
},
{
"event": "closetag",
"data": ["dc:rights"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["dc:source"]
},
{
"event": "opentag",
"data": ["dc:source", {}]
},
{
"event": "text",
"data": ["\nhttp://somefakesite/path/to/something-else.html\n"]
},
{
"event": "closetag",
"data": ["dc:source"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["dc:title"]
},
{
"event": "opentag",
"data": ["dc:title", {}]
},
{
"event": "cdatastart",
"data": []
},
{
"event": "text",
"data": ["\nThis space intentionally left blank\n"]
},
{
"event": "cdataend",
"data": []
},
{
"event": "closetag",
"data": ["dc:title"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["dc:type"]
},
{
"event": "opentag",
"data": ["dc:type", {}]
},
{
"event": "text",
"data": ["text"]
},
{
"event": "closetag",
"data": ["dc:type"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["dcterms:issued"]
},
{
"event": "opentag",
"data": ["dcterms:issued", {}]
},
{
"event": "text",
"data": ["2011-11-04T09:34:54-07:00"]
},
{
"event": "closetag",
"data": ["dcterms:issued"]
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "closetag",
"data": ["item"]
},
{
"event": "text",
"data": ["\n"]
},
{
"event": "closetag",
"data": ["rdf:RDF"]
}
]
}

View File

@ -0,0 +1,246 @@
{
"name": "Attributes",
"options": {},
"file": "Attributes.html",
"expected": [
{
"event": "processinginstruction",
"data": ["!doctype", "!doctype html"]
},
{
"event": "text",
"data": ["\n"]
},
{
"event": "opentagname",
"data": ["html"]
},
{
"event": "opentag",
"data": ["html", {}]
},
{
"event": "text",
"data": ["\n"]
},
{
"event": "opentagname",
"data": ["head"]
},
{
"event": "opentag",
"data": ["head", {}]
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "opentagname",
"data": ["title"]
},
{
"event": "opentag",
"data": ["title", {}]
},
{
"event": "text",
"data": ["Attributes test"]
},
{
"event": "closetag",
"data": ["title"]
},
{
"event": "text",
"data": ["\n"]
},
{
"event": "closetag",
"data": ["head"]
},
{
"event": "text",
"data": ["\n"]
},
{
"event": "opentagname",
"data": ["body"]
},
{
"event": "opentag",
"data": ["body", {}]
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "comment",
"data": [" Normal attributes "]
},
{
"event": "commentend",
"data": []
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "opentagname",
"data": ["button"]
},
{
"event": "attribute",
"data": ["id", "test0"]
},
{
"event": "attribute",
"data": ["class", "value0"]
},
{
"event": "attribute",
"data": ["title", "value1"]
},
{
"event": "opentag",
"data": [
"button",
{
"id": "test0",
"class": "value0",
"title": "value1"
}
]
},
{
"event": "text",
"data": ["class=\"value0\" title=\"value1\""]
},
{
"event": "closetag",
"data": ["button"]
},
{
"event": "text",
"data": ["\n\n\t"]
},
{
"event": "comment",
"data": [" Attributes with no quotes or value "]
},
{
"event": "commentend",
"data": []
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "opentagname",
"data": ["button"]
},
{
"event": "attribute",
"data": ["id", "test1"]
},
{
"event": "attribute",
"data": ["class", "value2"]
},
{
"event": "attribute",
"data": ["disabled", ""]
},
{
"event": "opentag",
"data": [
"button",
{
"id": "test1",
"class": "value2",
"disabled": ""
}
]
},
{
"event": "text",
"data": ["class=value2 disabled"]
},
{
"event": "closetag",
"data": ["button"]
},
{
"event": "text",
"data": ["\n\n\t"]
},
{
"event": "comment",
"data": [
" Attributes with no space between them. No valid, but accepted by the browser "
]
},
{
"event": "commentend",
"data": []
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "opentagname",
"data": ["button"]
},
{
"event": "attribute",
"data": ["id", "test2"]
},
{
"event": "attribute",
"data": ["class", "value4"]
},
{
"event": "attribute",
"data": ["title", "value5"]
},
{
"event": "opentag",
"data": [
"button",
{
"id": "test2",
"class": "value4",
"title": "value5"
}
]
},
{
"event": "text",
"data": ["class=\"value4\"title=\"value5\""]
},
{
"event": "closetag",
"data": ["button"]
},
{
"event": "text",
"data": ["\n"]
},
{
"event": "closetag",
"data": ["body"]
},
{
"event": "text",
"data": ["\n"]
},
{
"event": "closetag",
"data": ["html"]
}
]
}

View File

@ -0,0 +1,261 @@
{
"name": "SVG",
"file": "Svg.html",
"expected": [
{
"event": "processinginstruction",
"data": ["!doctype", "!doctype html"]
},
{
"event": "text",
"data": ["\n"]
},
{
"event": "opentagname",
"data": ["html"]
},
{
"event": "opentag",
"data": ["html", {}]
},
{
"event": "text",
"data": ["\n"]
},
{
"event": "opentagname",
"data": ["head"]
},
{
"event": "opentag",
"data": ["head", {}]
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "opentagname",
"data": ["title"]
},
{
"event": "opentag",
"data": ["title", {}]
},
{
"event": "text",
"data": ["SVG test"]
},
{
"event": "closetag",
"data": ["title"]
},
{
"event": "text",
"data": ["\n"]
},
{
"event": "closetag",
"data": ["head"]
},
{
"event": "text",
"data": ["\n"]
},
{
"event": "opentagname",
"data": ["body"]
},
{
"event": "opentag",
"data": ["body", {}]
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "opentagname",
"data": ["svg"]
},
{
"event": "attribute",
"data": ["version", "1.1"]
},
{
"event": "attribute",
"data": ["xmlns", "http://www.w3.org/2000/svg"]
},
{
"event": "attribute",
"data": ["xmlns:xlink", "http://www.w3.org/1999/xlink"]
},
{
"event": "opentag",
"data": [
"svg",
{
"version": "1.1",
"xmlns": "http://www.w3.org/2000/svg",
"xmlns:xlink": "http://www.w3.org/1999/xlink"
}
]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["title"]
},
{
"event": "opentag",
"data": ["title", {}]
},
{
"event": "text",
"data": ["Test"]
},
{
"event": "closetag",
"data": ["title"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["animate"]
},
{
"event": "opentag",
"data": ["animate", {}]
},
{
"event": "closetag",
"data": ["animate"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["polygon"]
},
{
"event": "opentag",
"data": ["polygon", {}]
},
{
"event": "closetag",
"data": ["polygon"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "opentagname",
"data": ["g"]
},
{
"event": "opentag",
"data": ["g", {}]
},
{
"event": "text",
"data": ["\n\t\t\t"]
},
{
"event": "opentagname",
"data": ["path"]
},
{
"event": "opentag",
"data": ["path", {}]
},
{
"event": "text",
"data": ["\n\t\t\t\t"]
},
{
"event": "opentagname",
"data": ["title"]
},
{
"event": "opentag",
"data": ["title", {}]
},
{
"event": "text",
"data": ["x"]
},
{
"event": "closetag",
"data": ["title"]
},
{
"event": "text",
"data": ["\n\t\t\t\t"]
},
{
"event": "opentagname",
"data": ["animate"]
},
{
"event": "opentag",
"data": ["animate", {}]
},
{
"event": "closetag",
"data": ["animate"]
},
{
"event": "text",
"data": ["\n\t\t\t"]
},
{
"event": "closetag",
"data": ["path"]
},
{
"event": "text",
"data": ["\n\t\t"]
},
{
"event": "closetag",
"data": ["g"]
},
{
"event": "text",
"data": ["\n\t"]
},
{
"event": "closetag",
"data": ["svg"]
},
{
"event": "text",
"data": ["\n"]
},
{
"event": "closetag",
"data": ["body"]
},
{
"event": "text",
"data": ["\n"]
},
{
"event": "closetag",
"data": ["html"]
},
{
"event": "text",
"data": ["\n"]
}
]
}

View File

@ -0,0 +1,116 @@
import { Parser, Handler, ParserOptions } from "../Parser";
import { CollectingHandler } from "../CollectingHandler";
import { DomHandlerOptions } from "..";
import fs from "fs";
import path from "path";
/**
 * Runs `data` through a fresh `Parser` twice using the same handler.
 *
 * The first pass writes the input one UTF-16 code unit per `write()` call and
 * then ends the parser; the second pass hands the whole string to
 * `parseComplete()` in a single call.
 *
 * @param handler Callbacks that receive the parser events.
 * @param options Parser options, forwarded unchanged to the `Parser` constructor.
 * @param data The markup to parse.
 */
export function writeToParser(
    handler: Partial<Handler>,
    options: ParserOptions | undefined,
    data: string
) {
    const parser = new Parser(handler, options);

    // Pass 1: smallest possible chunks (one code unit per write).
    data.split("").forEach(unit => {
        parser.write(unit);
    });
    parser.end();

    // Pass 2: the complete document in one go.
    parser.parseComplete(data);
}
/**
 * One recorded parser event, in the `{ event, data }` shape that
 * `eventReducer` produces for comparison with fixture files.
 */
interface Event {
// Handler callback name with its first two characters ("on") removed.
event: string;
// The remaining entries of the recorded call, i.e. the callback arguments in order.
data: unknown[];
}
/**
 * Creates a `CollectingHandler` that reports the events it recorded.
 *
 * Errors are forwarded straight to `cb`. When the handler's `onend` fires,
 * the recorded calls are folded through `eventReducer` into a flat list of
 * `{ event, data }` records and passed to `cb` with a `null` error.
 *
 * @param cb Receives either an error, or `null` plus the reduced event list.
 * @returns The handler instance, ready to be passed to a parser.
 */
export function getEventCollector(
    cb: (error: Error | null, events?: Event[]) => void
) {
    const collector = new CollectingHandler({
        onerror: cb,
        onend: () => {
            cb(null, collector.events.reduce(eventReducer, []));
        }
    });

    return collector;
}
/**
 * Reducer step that folds one recorded handler call into the running list of
 * `{ event, data }` records.
 *
 * - `onerror`, `onend` and `onparserinit` calls are dropped.
 * - An `ontext` call that directly follows a `text` record is appended to
 *   that record's first data entry instead of creating a new record.
 * - Every other call is appended with the leading "on" stripped from its name.
 *
 * @param events Accumulator; it is mutated and returned.
 * @param arr Recorded call as `[callbackName, ...args]`.
 * @returns The same `events` array that was passed in.
 */
function eventReducer(events: Event[], arr: [string, ...unknown[]]): Event[] {
    const [callbackName, ...callbackArgs] = arr;

    switch (callbackName) {
        case "onerror":
        case "onend":
        case "onparserinit":
            // Lifecycle notifications are not part of the event list.
            return events;
    }

    const lastIndex = events.length - 1;
    if (
        callbackName === "ontext" &&
        lastIndex >= 0 &&
        events[lastIndex].event === "text"
    ) {
        // Combine text nodes
        // @ts-ignore
        events[lastIndex].data[0] += callbackArgs[0];
        return events;
    }

    events.push({
        event: callbackName.slice(2),
        data: callbackArgs
    });
    return events;
}
/**
 * Builds the result callback handed to a suite's `getResult` function.
 *
 * Each invocation asserts that no error was reported and that `actual`
 * matches either the stored snapshot (when `file.useSnapshot` is set) or
 * `file.expected`. The first invocation only records that it happened;
 * `done` is called from the second invocation onward.
 *
 * @param file Fixture describing the expected outcome.
 * @param done Jest completion callback for the test.
 */
function getCallback(file: TestFile, done: (err?: Error | null) => void) {
    let calledBefore = false;

    return (err: null | Error, actual?: {} | {}[]) => {
        expect(err).toBeNull();

        const outcome = expect(actual);
        if (file.useSnapshot) {
            outcome.toMatchSnapshot();
        } else {
            outcome.toEqual(file.expected);
        }

        if (!calledBefore) {
            calledBefore = true;
            return;
        }
        done();
    };
}
/**
 * Shape of a fixture module loaded by `createSuite`.
 *
 * NOTE(review): `options` and `html` are declared as required, but some
 * fixture files that only carry `file` appear to omit them — confirm whether
 * they should be optional.
 */
interface TestFile {
// Title passed to `test()` when the fixture is registered.
name: string;
// Options for the run; suites pass either `options.parser` or the whole object on.
options: {
parser?: ParserOptions;
handler?: DomHandlerOptions;
} & Partial<ParserOptions>;
// Inline markup to parse (used by suites that feed a string directly).
html: string;
// File name of a document that suites resolve inside the fixtures' "Documents" directory.
file: string;
// When truthy, results are compared with `toMatchSnapshot()` instead of `expected`.
useSnapshot?: boolean;
// Expected result, compared with `toEqual()` when `useSnapshot` is not set.
expected?: {} | {}[];
}
/**
 * Registers a Jest `describe` block containing one test per fixture file.
 *
 * Fixtures are read from the directory named `name` next to this module.
 * Entries whose names start with "." or "_" are skipped; every other entry
 * is loaded with `require` and registered as a test titled with its `name`.
 *
 * @param name Suite title and fixture directory name.
 * @param getResult Runs a single fixture and reports the outcome via `done`.
 */
export function createSuite(
    name: string,
    getResult: (
        file: TestFile,
        done: (error: Error | null, actual?: {} | {}[]) => void
    ) => void
) {
    const registerTest = (file: TestFile) => {
        test(file.name, done => getResult(file, getCallback(file, done)));
    };

    describe(name, () => {
        const suiteDir = path.join(__dirname, name);
        const fixtureNames = fs
            .readdirSync(suiteDir)
            .filter(entry => !(entry.startsWith(".") || entry.startsWith("_")));

        // Load every fixture first, then register the tests.
        const fixtures: TestFile[] = fixtureNames.map(entry =>
            require(path.join(suiteDir, entry))
        );
        fixtures.forEach(fixture => registerTest(fixture));
    });
}

View File

@ -0,0 +1,205 @@
// Jest Snapshot v1, https://goo.gl/fbAQLP
exports[`Feeds Atom (1.0) 1`] = `
Object {
"author": "johndoe@example.com",
"description": "A subtitle.",
"id": "urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6",
"items": Array [
Object {
"description": "Some content.",
"id": "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a",
"link": "http://example.org/2003/12/13/atom03",
"pubDate": 2003-12-13T18:30:02.000Z,
"title": "Atom-Powered Robots Run Amok",
},
],
"link": "http://example.org/feed/",
"title": "Example Feed",
"type": "atom",
"updated": 2003-12-13T18:30:02.000Z,
}
`;
exports[`Feeds Atom (1.0) 2`] = `
Object {
"author": "johndoe@example.com",
"description": "A subtitle.",
"id": "urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6",
"items": Array [
Object {
"description": "Some content.",
"id": "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a",
"link": "http://example.org/2003/12/13/atom03",
"pubDate": 2003-12-13T18:30:02.000Z,
"title": "Atom-Powered Robots Run Amok",
},
],
"link": "http://example.org/feed/",
"title": "Example Feed",
"type": "atom",
"updated": 2003-12-13T18:30:02.000Z,
}
`;
exports[`Feeds RDF test 1`] = `
Object {
"id": "",
"items": Array [
Object {
"description": "Great test content<br>A link: <a href=\\"http://github.com\\">Github</a>",
"link": "http://somefakesite/path/to/something.html",
"title": "Fast HTML Parsing",
},
Object {
"description": "The early bird gets the worm",
"link": "http://somefakesite/path/to/something-else.html",
"title": "This space intentionally left blank",
},
],
"link": "https://github.com/fb55/htmlparser2/",
"title": "A title to parse and remember",
"type": "rdf",
}
`;
exports[`Feeds RDF test 2`] = `
Object {
"id": "",
"items": Array [
Object {
"description": "Great test content<br>A link: <a href=\\"http://github.com\\">Github</a>",
"link": "http://somefakesite/path/to/something.html",
"title": "Fast HTML Parsing",
},
Object {
"description": "The early bird gets the worm",
"link": "http://somefakesite/path/to/something-else.html",
"title": "This space intentionally left blank",
},
],
"link": "https://github.com/fb55/htmlparser2/",
"title": "A title to parse and remember",
"type": "rdf",
}
`;
exports[`Feeds RSS (2.0) 1`] = `
Object {
"author": "editor@example.com",
"description": "Liftoff to Space Exploration.",
"id": "",
"items": Array [
Object {
"description": "How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's &lt;a href=\\"http://howe.iki.rssi.ru/GCTC/gctc_e.htm\\"&gt;Star City&lt;/a&gt;.",
"id": "http://liftoff.msfc.nasa.gov/2003/06/03.html#item573",
"link": "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp",
"pubDate": 2003-06-03T09:39:21.000Z,
"title": "Star City",
},
Object {
"description": "Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a &lt;a href=\\"http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm\\"&gt;partial eclipse of the Sun&lt;/a&gt; on Saturday, May 31st.",
"id": "http://liftoff.msfc.nasa.gov/2003/05/30.html#item572",
"pubDate": 2003-05-30T11:06:42.000Z,
},
Object {
"description": "Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that.",
"id": "http://liftoff.msfc.nasa.gov/2003/05/27.html#item571",
"link": "http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp",
"pubDate": 2003-05-27T08:37:32.000Z,
"title": "The Engine That Does More",
},
Object {
"description": "Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options.",
"id": "http://liftoff.msfc.nasa.gov/2003/05/20.html#item570",
"link": "http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp",
"pubDate": 2003-05-20T08:56:02.000Z,
"title": "Astronauts' Dirty Laundry",
},
],
"link": "http://liftoff.msfc.nasa.gov/",
"title": "Liftoff News",
"type": "rss",
"updated": 2003-06-10T09:41:01.000Z,
}
`;
exports[`Feeds RSS (2.0) 2`] = `
Object {
"author": "editor@example.com",
"description": "Liftoff to Space Exploration.",
"id": "",
"items": Array [
Object {
"description": "How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's &lt;a href=\\"http://howe.iki.rssi.ru/GCTC/gctc_e.htm\\"&gt;Star City&lt;/a&gt;.",
"id": "http://liftoff.msfc.nasa.gov/2003/06/03.html#item573",
"link": "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp",
"pubDate": 2003-06-03T09:39:21.000Z,
"title": "Star City",
},
Object {
"description": "Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a &lt;a href=\\"http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm\\"&gt;partial eclipse of the Sun&lt;/a&gt; on Saturday, May 31st.",
"id": "http://liftoff.msfc.nasa.gov/2003/05/30.html#item572",
"pubDate": 2003-05-30T11:06:42.000Z,
},
Object {
"description": "Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that.",
"id": "http://liftoff.msfc.nasa.gov/2003/05/27.html#item571",
"link": "http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp",
"pubDate": 2003-05-27T08:37:32.000Z,
"title": "The Engine That Does More",
},
Object {
"description": "Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options.",
"id": "http://liftoff.msfc.nasa.gov/2003/05/20.html#item570",
"link": "http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp",
"pubDate": 2003-05-20T08:56:02.000Z,
"title": "Astronauts' Dirty Laundry",
},
],
"link": "http://liftoff.msfc.nasa.gov/",
"title": "Liftoff News",
"type": "rss",
"updated": 2003-06-10T09:41:01.000Z,
}
`;
exports[`parseFeed (rssFeed) 1`] = `
Object {
"author": "editor@example.com",
"description": "Liftoff to Space Exploration.",
"id": "",
"items": Array [
Object {
"description": "How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's &lt;a href=\\"http://howe.iki.rssi.ru/GCTC/gctc_e.htm\\"&gt;Star City&lt;/a&gt;.",
"id": "http://liftoff.msfc.nasa.gov/2003/06/03.html#item573",
"link": "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp",
"pubDate": 2003-06-03T09:39:21.000Z,
"title": "Star City",
},
Object {
"description": "Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a &lt;a href=\\"http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm\\"&gt;partial eclipse of the Sun&lt;/a&gt; on Saturday, May 31st.",
"id": "http://liftoff.msfc.nasa.gov/2003/05/30.html#item572",
"pubDate": 2003-05-30T11:06:42.000Z,
},
Object {
"description": "Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that.",
"id": "http://liftoff.msfc.nasa.gov/2003/05/27.html#item571",
"link": "http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp",
"pubDate": 2003-05-27T08:37:32.000Z,
"title": "The Engine That Does More",
},
Object {
"description": "Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options.",
"id": "http://liftoff.msfc.nasa.gov/2003/05/20.html#item570",
"link": "http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp",
"pubDate": 2003-05-20T08:56:02.000Z,
"title": "Astronauts' Dirty Laundry",
},
],
"link": "http://liftoff.msfc.nasa.gov/",
"title": "Liftoff News",
"type": "rss",
"updated": 2003-06-10T09:41:01.000Z,
}
`;

View File

@ -0,0 +1,125 @@
// Jest Snapshot v1, https://goo.gl/fbAQLP
exports[`Index createDomStream 1`] = `
Array [
DataNode {
"data": "&amp;This is text",
"endIndex": null,
"next": DataNode {
"data": " and comments ",
"endIndex": null,
"next": <tags />,
"parent": null,
"prev": [Circular],
"startIndex": null,
"type": "comment",
},
"parent": null,
"prev": null,
"startIndex": null,
"type": "text",
},
DataNode {
"data": " and comments ",
"endIndex": null,
"next": <tags />,
"parent": null,
"prev": DataNode {
"data": "&amp;This is text",
"endIndex": null,
"next": [Circular],
"parent": null,
"prev": null,
"startIndex": null,
"type": "text",
},
"startIndex": null,
"type": "comment",
},
<tags />,
]
`;
exports[`Index parseDOM 1`] = `
Array [
<a
foo=""
>
<b>
<c>
ProcessingInstruction {
"data": "?foo",
"endIndex": null,
"name": "?foo",
"next": DataNode {
"data": "Yay!",
"endIndex": null,
"next": null,
"parent": <c>
[Circular]
[Circular]
</c>,
"prev": [Circular],
"startIndex": null,
"type": "text",
},
"parent": <c>
[Circular]
DataNode {
"data": "Yay!",
"endIndex": null,
"next": null,
"parent": <c>
[Circular]
[Circular]
</c>,
"prev": [Circular],
"startIndex": null,
"type": "text",
}
</c>,
"prev": null,
"startIndex": null,
"type": "directive",
}
DataNode {
"data": "Yay!",
"endIndex": null,
"next": null,
"parent": <c>
ProcessingInstruction {
"data": "?foo",
"endIndex": null,
"name": "?foo",
"next": [Circular],
"parent": <c>
[Circular]
[Circular]
</c>,
"prev": null,
"startIndex": null,
"type": "directive",
}
[Circular]
</c>,
"prev": ProcessingInstruction {
"data": "?foo",
"endIndex": null,
"name": "?foo",
"next": [Circular],
"parent": <c>
[Circular]
[Circular]
</c>,
"prev": null,
"startIndex": null,
"type": "directive",
},
"startIndex": null,
"type": "text",
}
</c>
</b>
</a>,
]
`;

View File

@ -0,0 +1,9 @@
import * as helper from "../__fixtures__/test-helper";
// Runs every "Events" fixture: the inline `html` is parsed with the
// fixture's parser options and the collected events are reported to `cb`.
helper.createSuite("Events", (test, cb) => {
    const collector = helper.getEventCollector(cb);
    helper.writeToParser(collector, test.options.parser, test.html);
});

View File

@ -0,0 +1,33 @@
import * as helper from "../__fixtures__/test-helper";
import { WritableStream } from "../WritableStream";
import fs from "fs";
import path from "path";
// Runs every "Stream" fixture through `WritableStream` twice: first by piping
// a read stream of the document into it, then by reading the whole file and
// passing it to `end()` in one call. Both passes report to `cb`.
helper.createSuite("Stream", (test, cb) => {
    const filePath = path.join(
        __dirname,
        "..",
        "__fixtures__",
        "Documents",
        test.file
    );

    // Second pass: a fresh collector and stream, fed the complete file at once.
    const runWholeFilePass = () => {
        const handler = helper.getEventCollector(cb);
        const stream = new WritableStream(handler, test.options);
        fs.readFile(filePath, (readError, contents) => {
            if (readError) throw readError;
            stream.end(contents);
        });
    };

    const source = fs.createReadStream(filePath);

    // First pass: report the piped result, then kick off the second pass.
    const pipedCollector = helper.getEventCollector((err, events) => {
        cb(err, events);
        runWholeFilePass();
    });

    source.pipe(new WritableStream(pipedCollector, test.options)).on("error", cb);
});

View File

@ -0,0 +1,36 @@
import { parseDOM, createDomStream } from ".";
import { Element } from "domhandler";
// Jest needs an `attributes` list to render DOM nodes in snapshots, so expose
// one on `Element` for now. It is derived from `attribs` on every access and
// yields `{ name, value }` pairs.
Object.defineProperty(Element.prototype, "attributes", {
    configurable: true,
    enumerable: false,
    get() {
        return Object.entries(this.attribs).map(([name, value]) => ({
            name,
            value
        }));
    }
});
// Snapshot tests for the two convenience helpers exported from the index.
describe("Index", () => {
    test("parseDOM", () => {
        expect(parseDOM("<a foo><b><c><?foo>Yay!")).toMatchSnapshot();
    });

    test("createDomStream", done => {
        const input = "&amp;This is text<!-- and comments --><tags>";

        const domStream = createDomStream((err, dom) => {
            expect(err).toBeNull();
            expect(dom).toMatchSnapshot();
            done();
        });

        // Feed the input one character per write before ending the stream.
        Array.from(input).forEach(chunk => {
            domStream.write(chunk);
        });
        domStream.end();
    });
});

View File

@ -0,0 +1,77 @@
import { Parser, ParserOptions } from "./Parser";
export { Parser, ParserOptions };
import { DomHandler, DomHandlerOptions, Node, Element } from "domhandler";
export { DomHandler, DomHandlerOptions };
// Single options object accepted by the helper functions in this module; it is
// passed to both the `DomHandler` and the `Parser` they create.
type Options = ParserOptions & DomHandlerOptions;
// Helper methods
/**
 * Parses a complete document synchronously and returns the DOM built for it.
 *
 * A `DomHandler` without a completion callback is attached to a new `Parser`,
 * which is ended immediately with `data`.
 *
 * @param data The data that should be parsed.
 * @param options Optional options, passed to both the parser and the DOM handler.
 * @returns The handler's `dom` array after parsing.
 */
export function parseDOM(data: string, options?: Options): Node[] {
    const domBuilder = new DomHandler(undefined, options);
    const parser = new Parser(domBuilder, options);
    parser.end(data);
    return domBuilder.dom;
}
/**
 * Builds a `Parser` whose events feed a newly created `DomHandler`.
 *
 * @param cb Callback given to the DOM handler; called with an error or the resulting DOM once parsing has been completed.
 * @param options Optional options, passed to both the parser and the DOM handler.
 * @param elementCb Optional callback given to the DOM handler; called every time a tag has been completed inside of the DOM.
 * @returns The parser instance to write data to.
 */
export function createDomStream(
    cb: (error: Error | null, dom: Node[]) => void,
    options?: Options,
    elementCb?: (element: Element) => void
) {
    return new Parser(new DomHandler(cb, options, elementCb), options);
}
export { default as Tokenizer } from "./Tokenizer";
import * as ElementType from "domelementtype";
export { ElementType };
/**
 * List of all events that the parser emits.
 *
 * Format: eventname: number of arguments.
 *
 * Keys are written without an "on" prefix.
 * NOTE(review): presumably the matching handler callback is "on" + key
 * (e.g. `onopentag`) — confirm against the `Handler` interface in Parser.
 */
export const EVENTS = {
attribute: 2,
cdatastart: 0,
cdataend: 0,
text: 1,
processinginstruction: 2,
comment: 1,
commentend: 0,
closetag: 1,
opentag: 2,
opentagname: 1,
error: 1,
end: 0
};
/*
All of the following exports exist for backwards-compatibility.
They should probably be removed eventually.
*/
export * from "./FeedHandler";
export * from "./WritableStream";
export * from "./CollectingHandler";
import * as DomUtils from "domutils";
export { DomUtils };
// Old names for Dom- & FeedHandler
export { DomHandler as DefaultHandler };
export { FeedHandler as RssHandler } from "./FeedHandler";