joplin/packages/lib/htmlUtils.ts

const urlUtils = require('./urlUtils.js');
const Entities = require('html-entities').AllHtmlEntities;
const htmlentities = new Entities().encode;
const htmlparser2 = require('@joplin/fork-htmlparser2');
const { escapeHtml } = require('./string-utils.js');

// Patterns matching an opening <img ... src="..."> / <a ... href="..."> tag.
// Capture groups, in both patterns:
//   1 - everything between the tag name and the src/href attribute
//   2 - the attribute value (the URL), without its quotes
//   3 - everything after the closing quote up to the next ">"
//
// [\s\S] instead of . for multiline matching
// https://stackoverflow.com/a/16119722/561309
//
// Both patterns carry the "g" flag, so they keep their position in
// `lastIndex` between exec() calls. Nothing is required between the tag name
// and group 1, so any tag whose name merely starts with "img" / "a" also
// matches.
const imageRegex = /<img([\s\S]*?)src=["']([\s\S]*?)["']([\s\S]*?)>/gi;
const anchorRegex = /<a([\s\S]*?)href=["']([\s\S]*?)["']([\s\S]*?)>/gi;

// Lower-case names of the elements treated as self-closing (they never get a
// separate closing tag). Kept in alphabetical order.
const selfClosingElements = [
	'area', 'base', 'basefont', 'br', 'col',
	'command', 'embed', 'frame', 'hr', 'img',
	'input', 'isindex', 'keygen', 'link', 'meta',
	'param', 'source', 'track', 'wbr',
];

/**
 * Helpers for inspecting and rewriting HTML held in plain strings.
 *
 * The URL related methods work with the module-level `imageRegex` and
 * `anchorRegex` patterns rather than with a DOM.
 */
class HtmlUtils {

	/**
	 * Returns the inner HTML of `doc.head` and `doc.body`, joined by a newline.
	 * Whichever of the two is missing is skipped.
	 */
	public headAndBodyHtml(doc: any) {
		const output = [];
		if (doc.head) output.push(doc.head.innerHTML);
		if (doc.body) output.push(doc.body.innerHTML);
		return output.join('\n');
	}

	/**
	 * Tells whether `tagName` is listed in `selfClosingElements`. The
	 * comparison is case-insensitive.
	 */
	public isSelfClosingTag(tagName: string) {
		return selfClosingElements.includes(tagName.toLowerCase());
	}

	/**
	 * Collects capture group 2 (the URL) of every match of `regex` in `html`.
	 * Empty values are dropped.
	 *
	 * Returns the **encoded** URLs, so to be useful they should be decoded
	 * again before use.
	 *
	 * @param regex A regular expression with the "g" flag whose second capture
	 * group is the URL.
	 * @param html The HTML to scan. A falsy value yields an empty array.
	 */
	private extractUrls(regex: RegExp, html: string) {
		if (!html) return [];

		// The patterns passed in are shared, stateful "g" regexes: exec()
		// resumes from lastIndex. Start from the beginning explicitly so a
		// leftover position can never make us skip the start of `html`.
		regex.lastIndex = 0;

		const output = [];
		let matches;
		while ((matches = regex.exec(html))) {
			output.push(matches[2]);
		}

		return output.filter(url => !!url);
	}

	/**
	 * Returns the `src` value of every tag matched by `imageRegex`.
	 *
	 * Returns the **encoded** URLs, so to be useful they should be decoded
	 * again before use.
	 */
	public extractImageUrls(html: string) {
		return this.extractUrls(imageRegex, html);
	}

	/**
	 * Returns the `href` value of every tag matched by `anchorRegex`.
	 *
	 * Returns the **encoded** URLs, so to be useful they should be decoded
	 * again before use.
	 */
	public extractAnchorUrls(html: string) {
		return this.extractUrls(anchorRegex, html);
	}

	/**
	 * Returns all image URLs followed by all anchor URLs found in `html`.
	 *
	 * Returns the **encoded** URLs, so to be useful they should be decoded
	 * again before use.
	 */
	public extractFileUrls(html: string) {
		return this.extractImageUrls(html).concat(this.extractAnchorUrls(html));
	}

	/**
	 * Replaces every `src` or `href` attribute value that matches
	 * `urlToReplace` with `:/<id>`.
	 *
	 * @param html The HTML to rewrite.
	 * @param urlToReplace Inserted **verbatim** into a regular expression, so
	 * any character that is special in a regex must already be escaped by the
	 * caller.
	 * @param id Inserted literally after the `:/` prefix.
	 */
	public replaceResourceUrl(html: string, urlToReplace: string, id: string) {
		const htmlLinkRegex = `(?<=(?:src|href)=["'])${urlToReplace}(?=["'])`;
		const htmlReg = new RegExp(htmlLinkRegex, 'g');
		// A replacer function is used instead of a replacement string so that
		// "$" sequences in `id` (e.g. "$&") are not expanded by replace().
		return html.replace(htmlReg, () => `:/${id}`);
	}

	/**
	 * Rewrites the `src` of every image tag with the value returned by
	 * `callback(src)`.
	 */
	public replaceImageUrls(html: string, callback: Function) {
		return this.processImageTags(html, (data: any) => {
			const newSrc = callback(data.src);
			return {
				type: 'replaceSource',
				src: newSrc,
			};
		});
	}

	/**
	 * Calls `callback({ src })` for every tag matched by `imageRegex` and
	 * rewrites the tag according to the returned action:
	 *
	 * - falsy value: the tag is kept, with its `src` re-emitted in double quotes
	 * - `{ type: 'replaceElement', html }`: the whole tag is replaced by `html`
	 * - `{ type: 'replaceSource', src }`: only the `src` value is replaced
	 * - `{ type: 'setAttributes', attrs }`: the `src` attribute is replaced by
	 *   the attributes rendered by `attributesHtml(attrs)`
	 *
	 * Note that the URLs provided by this function are URL-encoded, which is
	 * usually what you want for web URLs. But if they are file:// URLs and the
	 * file path is going to be used, it will need to be unescaped first. The
	 * transformed SRC, must also be escaped before being sent back to this
	 * function.
	 *
	 * @returns The rewritten HTML, or an empty string when `html` is falsy.
	 * @throws Error when the action has any other `type`.
	 */
	public processImageTags(html: string, callback: Function) {
		if (!html) return '';

		return html.replace(imageRegex, (_v: string, before: string, src: string, after: string) => {
			const action = callback({ src: src });

			if (!action) return `<img${before}src="${src}"${after}>`;

			if (action.type === 'replaceElement') {
				return action.html;
			}

			if (action.type === 'replaceSource') {
				return `<img${before}src="${action.src}"${after}>`;
			}

			if (action.type === 'setAttributes') {
				const attrHtml = this.attributesHtml(action.attrs);
				return `<img${before}${attrHtml}${after}>`;
			}

			throw new Error(`Invalid action: ${action.type}`);
		});
	}

	/**
	 * Passes the `href` of every tag matched by `anchorRegex` through
	 * `urlUtils.prependBaseUrl(href, baseUrl)` and writes the result back,
	 * in double quotes.
	 *
	 * @returns The rewritten HTML, or an empty string when `html` is falsy.
	 */
	public prependBaseUrl(html: string, baseUrl: string) {
		if (!html) return '';

		return html.replace(anchorRegex, (_v: string, before: string, href: string, after: string) => {
			const newHref = urlUtils.prependBaseUrl(href, baseUrl);
			return `<a${before}href="${newHref}"${after}>`;
		});
	}

	/**
	 * Renders the own enumerable properties of `attr` as a space-separated
	 * list of `name="value"` pairs. Values are encoded with `htmlentities`;
	 * names are emitted as-is.
	 */
	public attributesHtml(attr: any) {
		const output = [];

		for (const n in attr) {
			// Go through Object.prototype so that objects created without a
			// prototype, or that define their own "hasOwnProperty" key, are
			// handled too.
			if (!Object.prototype.hasOwnProperty.call(attr, n)) continue;
			output.push(`${n}="${htmlentities(attr[n])}"`);
		}

		return output.join(' ');
	}

	/**
	 * Returns the text content of `html` with all markup removed, entities
	 * decoded and every run of whitespace collapsed to a single space.
	 *
	 * Text whose innermost open tag is one of `disallowedTags` is dropped.
	 * Only the innermost tag is checked, so text inside an element nested in
	 * a disallowed tag is still emitted.
	 *
	 * @returns The stripped text, or an empty string when `html` is falsy.
	 */
	public stripHtml(html: string) {
		// Same guard as processImageTags() and prependBaseUrl(): nothing to
		// parse, and it avoids handing a null/undefined value to the parser.
		if (!html) return '';

		const output: string[] = [];

		// Names of the currently open tags, innermost last.
		const tagStack: any[] = [];

		const currentTag = () => {
			if (!tagStack.length) return '';
			return tagStack[tagStack.length - 1];
		};

		const disallowedTags = ['script', 'style', 'head', 'iframe', 'frameset', 'frame', 'object', 'base'];

		const parser = new htmlparser2.Parser({

			onopentag: (name: string) => {
				tagStack.push(name.toLowerCase());
			},

			ontext: (decodedText: string) => {
				if (disallowedTags.includes(currentTag())) return;
				output.push(decodedText);
			},

			onclosetag: (name: string) => {
				// A closing tag that does not match the innermost open tag is
				// ignored rather than unwinding the stack.
				if (currentTag() === name.toLowerCase()) tagStack.pop();
			},

		}, { decodeEntities: true });

		parser.write(html);
		parser.end();

		return output.join('').replace(/\s+/g, ' ');
	}
}

// The default export is a single HtmlUtils instance created when the module is
// loaded; every importer receives that same object.
export default new HtmlUtils();

/**
 * Converts plain text to HTML.
 *
 * A single line is returned HTML-escaped with no wrapping element. When the
 * text has several lines, each line (including empty ones) is escaped and
 * wrapped in its own `<p>` element, and the paragraphs are concatenated
 * without a separator.
 *
 * `\r\n`, `\r` and `\n` are each treated as ONE line break, so text with
 * Windows line endings does not get an extra empty paragraph between lines.
 *
 * @param plainText The text to convert.
 * @returns The HTML representation of `plainText`.
 */
export function plainTextToHtml(plainText: string): string {
	// "\r\n" must come first in the alternation so that it is consumed as a
	// single separator rather than as "\r" followed by "\n".
	const lines = plainText.split(/\r\n|\r|\n/);

	if (lines.length === 1) return escapeHtml(lines[0]);

	return lines
		.map(line => `<p>${escapeHtml(line)}</p>`)
		.join('');
}
All: Use Lerna to manage monorepo 2020-11-05 18:58:23 +02:00			`const urlUtils = require('./urlUtils.js');`
Clipper: Refactored image rules to re-use more code 2019-07-15 22:43:28 +02:00			`const Entities = require('html-entities').AllHtmlEntities;`
First pass at linting lib dir 2019-07-29 15:43:53 +02:00			`const htmlentities = new Entities().encode;`
Tools: Renamed package namespace from @joplinapp to @joplin 2020-11-07 17:59:37 +02:00			`const htmlparser2 = require('@joplin/fork-htmlparser2');`
Desktop: Fixed pasting HTML in Rich Text editor, and improved pasting plain text 2021-05-20 18:08:59 +02:00			`const { escapeHtml } = require('./string-utils.js');`
Clipper: Fixed issue with relative links when importing HTML 2019-07-15 01:44:45 +02:00
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`// [\s\S] instead of . for multiline matching`
			`// https://stackoverflow.com/a/16119722/561309`
First pass at linting lib dir 2019-07-29 15:43:53 +02:00			`const imageRegex = /<img([\s\S]?)src=["']([\s\S]?)["']([\s\S]*?)>/gi;`
			`const anchorRegex = /<a([\s\S]?)href=["']([\s\S]?)["']([\s\S]*?)>/gi;`
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00
Desktop, Cli: Fixed various bugs related to the import of ENEX files as HTML 2020-06-15 18:10:51 +02:00			`const selfClosingElements = [`
			`'area',`
			`'base',`
			`'basefont',`
			`'br',`
			`'col',`
			`'command',`
			`'embed',`
			`'frame',`
			`'hr',`
			`'img',`
			`'input',`
			`'isindex',`
			`'keygen',`
			`'link',`
			`'meta',`
			`'param',`
			`'source',`
			`'track',`
			`'wbr',`
			`];`

Clipper: Refactored image rules to re-use more code 2019-07-15 22:43:28 +02:00			`class HtmlUtils {`
Converted htmlUtils to TypeScript 2021-01-30 14:19:43 +02:00
			`public headAndBodyHtml(doc: any) {`
Clipper: Refactored image rules to re-use more code 2019-07-15 22:43:28 +02:00			`const output = [];`
			`if (doc.head) output.push(doc.head.innerHTML);`
			`if (doc.body) output.push(doc.body.innerHTML);`
			`return output.join('\n');`
			`}`
Clipper: Adding support for clipping page as HTML 2019-07-14 17:00:02 +02:00
Converted htmlUtils to TypeScript 2021-01-30 14:19:43 +02:00			`public isSelfClosingTag(tagName: string) {`
Desktop, Cli: Fixed various bugs related to the import of ENEX files as HTML 2020-06-15 18:10:51 +02:00			`return selfClosingElements.includes(tagName.toLowerCase());`
			`}`

Clipper: Fixes #3984: Images from some website were not being downloaded 2020-10-29 12:16:31 +02:00			`// Returns the encoded URLs, so to be useful they should be decoded again before use.`
Desktop: Various improvements to Markdown import and export (#5290) In preparation for #5224 2021-08-23 01:35:45 +02:00			`private extractUrls(regex: RegExp, html: string) {`
Clipper: Adding support for clipping page as HTML 2019-07-14 17:00:02 +02:00			`if (!html) return [];`
Minor changes 2019-07-19 19:18:05 +02:00
Clipper: Adding support for clipping page as HTML 2019-07-14 17:00:02 +02:00			`const output = [];`
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`let matches;`
Desktop: Various improvements to Markdown import and export (#5290) In preparation for #5224 2021-08-23 01:35:45 +02:00			`while ((matches = regex.exec(html))) {`
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`output.push(matches[2]);`
Clipper: Adding support for clipping page as HTML 2019-07-14 17:00:02 +02:00			`}`

Revert "Tools: Added eslint rule arrow-parens" This reverts commit 0b6f5581f0c908f88636171004561da5dbe138cd. It causes too many conflicts with pull requests. 2020-05-21 10:14:33 +02:00			`return output.filter(url => !!url);`
Clipper: Refactored image rules to re-use more code 2019-07-15 22:43:28 +02:00			`}`
Clipper: Adding support for clipping page as HTML 2019-07-14 17:00:02 +02:00
Desktop: Various improvements to Markdown import and export (#5290) In preparation for #5224 2021-08-23 01:35:45 +02:00			`// Returns the encoded URLs, so to be useful they should be decoded again before use.`
			`public extractImageUrls(html: string) {`
			`return this.extractUrls(imageRegex, html);`
			`}`

			`// Returns the encoded URLs, so to be useful they should be decoded again before use.`
			`public extractAnchorUrls(html: string) {`
			`return this.extractUrls(anchorRegex, html);`
			`}`

			`// Returns the encoded URLs, so to be useful they should be decoded again before use.`
			`public extractFileUrls(html: string) {`
			`return this.extractImageUrls(html).concat(this.extractAnchorUrls(html));`
			`}`

			`public replaceResourceUrl(html: string, urlToReplace: string, id: string) {`
			const htmlLinkRegex = `(?<=(?:src\|href)=["'])${urlToReplace}(?=["'])`;
			`const htmlReg = new RegExp(htmlLinkRegex, 'g');`
			return html.replace(htmlReg, `:/${id}`);
			`}`

Converted htmlUtils to TypeScript 2021-01-30 14:19:43 +02:00			`public replaceImageUrls(html: string, callback: Function) {`
Desktop: Fixes #4441: Fixed copying text from Rich Text editor 2021-01-30 13:08:11 +02:00			`return this.processImageTags(html, (data: any) => {`
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`const newSrc = callback(data.src);`
			`return {`
			`type: 'replaceSource',`
			`src: newSrc,`
			`};`
			`});`
			`}`

Desktop: Fixes #4916: Ensure that image paths that contain spaces are pasted correctly in the Rich Text editor 2021-05-04 17:44:30 +02:00			`// Note that the URLs provided by this function are URL-encoded, which is`
			`// usually what you want for web URLs. But if they are file:// URLs and the`
			`// file path is going to be used, it will need to be unescaped first. The`
			`// transformed SRC, must also be escaped before being sent back to this`
			`// function.`
Converted htmlUtils to TypeScript 2021-01-30 14:19:43 +02:00			`public processImageTags(html: string, callback: Function) {`
Clipper: Adding support for clipping page as HTML 2019-07-14 17:00:02 +02:00			`if (!html) return '';`

Desktop: Fixes #4441: Fixed copying text from Rich Text editor 2021-01-30 13:08:11 +02:00			`return html.replace(imageRegex, (_v: string, before: string, src: string, after: string) => {`
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`const action = callback({ src: src });`
Clipper: Adding support for clipping page as HTML 2019-07-14 17:00:02 +02:00
Chore: Apply eslint rules 2019-09-19 23:51:18 +02:00			if (!action) return `<img${before}src="${src}"${after}>`;
Clipper: Adding support for clipping page as HTML 2019-07-14 17:00:02 +02:00
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`if (action.type === 'replaceElement') {`
			`return action.html;`
			`}`

			`if (action.type === 'replaceSource') {`
Chore: Apply eslint rules 2019-09-19 23:51:18 +02:00			return `<img${before}src="${action.src}"${after}>`;
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`}`

			`if (action.type === 'setAttributes') {`
			`const attrHtml = this.attributesHtml(action.attrs);`
Chore: Apply eslint rules 2019-09-19 23:51:18 +02:00			return `<img${before}${attrHtml}${after}>`;
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`}`

Chore: Apply eslint rules 2019-09-19 23:51:18 +02:00			throw new Error(`Invalid action: ${action.type}`);
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`});`
Clipper: Refactored image rules to re-use more code 2019-07-15 22:43:28 +02:00			`}`
Clipper: Fixed issue with relative links when importing HTML 2019-07-15 01:44:45 +02:00
Converted htmlUtils to TypeScript 2021-01-30 14:19:43 +02:00			`public prependBaseUrl(html: string, baseUrl: string) {`
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`if (!html) return '';`
Clipper: Fixed issue with relative links when importing HTML 2019-07-15 01:44:45 +02:00
Desktop: Fixes #4441: Fixed copying text from Rich Text editor 2021-01-30 13:08:11 +02:00			`return html.replace(anchorRegex, (_v: string, before: string, href: string, after: string) => {`
Clipper: Fixed issue with relative links when importing HTML 2019-07-15 01:44:45 +02:00			`const newHref = urlUtils.prependBaseUrl(href, baseUrl);`
Chore: Apply eslint rules 2019-09-19 23:51:18 +02:00			return `<a${before}href="${newHref}"${after}>`;
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`});`
Clipper: Refactored image rules to re-use more code 2019-07-15 22:43:28 +02:00			`}`

Converted htmlUtils to TypeScript 2021-01-30 14:19:43 +02:00			`public attributesHtml(attr: any) {`
Clipper: Refactored image rules to re-use more code 2019-07-15 22:43:28 +02:00			`const output = [];`

			`for (const n in attr) {`
			`if (!attr.hasOwnProperty(n)) continue;`
Chore: Apply eslint rules 2019-09-19 23:51:18 +02:00			output.push(`${n}="${htmlentities(attr[n])}"`);
Clipper: Refactored image rules to re-use more code 2019-07-15 22:43:28 +02:00			`}`

			`return output.join(' ');`
			`}`
Desktop: Improved GotoAnything speed and made it safer Previously we'd use the remove-markdown package to create the note preview however this function would freeze on certain notes, and was probably unsafe as it used regex to parse Markdown. Replaced this in favour of Markdown-it along with htmlparser2 to strip all markup from a note. 2020-07-15 00:27:12 +02:00
Converted htmlUtils to TypeScript 2021-01-30 14:19:43 +02:00			`public stripHtml(html: string) {`
Desktop: Fixes #4441: Fixed copying text from Rich Text editor 2021-01-30 13:08:11 +02:00			`const output: string[] = [];`
Desktop: Improved GotoAnything speed and made it safer Previously we'd use the remove-markdown package to create the note preview however this function would freeze on certain notes, and was probably unsafe as it used regex to parse Markdown. Replaced this in favour of Markdown-it along with htmlparser2 to strip all markup from a note. 2020-07-15 00:27:12 +02:00
Desktop: Fixes #4441: Fixed copying text from Rich Text editor 2021-01-30 13:08:11 +02:00			`const tagStack: any[] = [];`
Desktop: Improved GotoAnything speed and made it safer Previously we'd use the remove-markdown package to create the note preview however this function would freeze on certain notes, and was probably unsafe as it used regex to parse Markdown. Replaced this in favour of Markdown-it along with htmlparser2 to strip all markup from a note. 2020-07-15 00:27:12 +02:00
			`const currentTag = () => {`
			`if (!tagStack.length) return '';`
			`return tagStack[tagStack.length - 1];`
			`};`

			`const disallowedTags = ['script', 'style', 'head', 'iframe', 'frameset', 'frame', 'object', 'base'];`

			`const parser = new htmlparser2.Parser({`

Desktop: Fixes #4441: Fixed copying text from Rich Text editor 2021-01-30 13:08:11 +02:00			`onopentag: (name: string) => {`
Desktop: Improved GotoAnything speed and made it safer Previously we'd use the remove-markdown package to create the note preview however this function would freeze on certain notes, and was probably unsafe as it used regex to parse Markdown. Replaced this in favour of Markdown-it along with htmlparser2 to strip all markup from a note. 2020-07-15 00:27:12 +02:00			`tagStack.push(name.toLowerCase());`
			`},`

Desktop: Fixes #4441: Fixed copying text from Rich Text editor 2021-01-30 13:08:11 +02:00			`ontext: (decodedText: string) => {`
Desktop: Improved GotoAnything speed and made it safer Previously we'd use the remove-markdown package to create the note preview however this function would freeze on certain notes, and was probably unsafe as it used regex to parse Markdown. Replaced this in favour of Markdown-it along with htmlparser2 to strip all markup from a note. 2020-07-15 00:27:12 +02:00			`if (disallowedTags.includes(currentTag())) return;`
			`output.push(decodedText);`
			`},`

Desktop: Fixes #4441: Fixed copying text from Rich Text editor 2021-01-30 13:08:11 +02:00			`onclosetag: (name: string) => {`
Desktop: Improved GotoAnything speed and made it safer Previously we'd use the remove-markdown package to create the note preview however this function would freeze on certain notes, and was probably unsafe as it used regex to parse Markdown. Replaced this in favour of Markdown-it along with htmlparser2 to strip all markup from a note. 2020-07-15 00:27:12 +02:00			`if (currentTag() === name.toLowerCase()) tagStack.pop();`
			`},`

			`}, { decodeEntities: true });`

			`parser.write(html);`
			`parser.end();`

			`return output.join('').replace(/\s+/g, ' ');`
			`}`
Clipper: Refactored image rules to re-use more code 2019-07-15 22:43:28 +02:00			`}`
Clipper: Adding support for clipping page as HTML 2019-07-14 17:00:02 +02:00
Desktop: Fixes #4441: Fixed copying text from Rich Text editor 2021-01-30 13:08:11 +02:00			`export default new HtmlUtils();`
Desktop: Fixed pasting HTML in Rich Text editor, and improved pasting plain text 2021-05-20 18:08:59 +02:00
			`export function plainTextToHtml(plainText: string): string {`
			`const lines = plainText`
			`.replace(/[\n\r]/g, '\n')`
			`.split('\n');`

			`const lineOpenTag = lines.length > 1 ? '<p>' : '';`
			`const lineCloseTag = lines.length > 1 ? '</p>' : '';`

			`return lines`
			`.map(line => lineOpenTag + escapeHtml(line) + lineCloseTag)`
			`.join('');`
			`}`