joplin/packages/lib/htmlUtils.ts

const urlUtils = require('./urlUtils.js');
const Entities = require('html-entities').AllHtmlEntities;
const htmlentities = new Entities().encode;
const { escapeHtml } = require('./string-utils.js');

// Tag-matching regexes used throughout this module.
//
// [\s\S] is used instead of . so that a match can span several lines.
// https://stackoverflow.com/a/16119722/561309
//
// Each tag regex has three capture groups:
//   1. the text between the tag name and the URL attribute,
//   2. the attribute value (the URL), without its surrounding quotes,
//   3. the rest of the tag up to the first following `>`.
//
// All of them carry the `g` flag, so the RegExp objects keep a `lastIndex`
// between `exec()` calls. The tag name is not followed by a word boundary and
// the attribute name is not preceded by one, so these are loose matches
// (e.g. `<a` is also the start of `<abbr`, and `src=` is the end of
// `data-src=`).
const imageRegex = /<img([\s\S]*?)src=["']([\s\S]*?)["']([\s\S]*?)>/gi;
const anchorRegex = /<a([\s\S]*?)href=["']([\s\S]*?)["']([\s\S]*?)>/gi;
const embedRegex = /<embed([\s\S]*?)src=["']([\s\S]*?)["']([\s\S]*?)>/gi;
const objectRegex = /<object([\s\S]*?)data=["']([\s\S]*?)["']([\s\S]*?)>/gi;
// Matches any string that ends with ".pdf" (case-insensitive).
const pdfUrlRegex = /[\s\S]*?\.pdf$/i;

// Lowercase tag names that `isSelfClosingTag()` reports as self-closing.
// Kept in alphabetical order.
const selfClosingElements = [
	'area', 'base', 'basefont', 'br', 'col',
	'command', 'embed', 'frame', 'hr', 'img',
	'input', 'isindex', 'keygen', 'link', 'meta',
	'param', 'source', 'track', 'wbr',
];

// Helpers for inspecting and rewriting URLs inside HTML strings. Tags are
// located with the module-level regexes (imageRegex, anchorRegex, embedRegex,
// objectRegex), so matching is purely textual.
class HtmlUtils {

	// Returns the inner HTML of `doc.head` and `doc.body` (whichever of the two
	// are present), joined by a newline.
	// eslint-disable-next-line @typescript-eslint/no-explicit-any -- Old code before rule was applied
	public headAndBodyHtml(doc: any) {
		const output = [];
		if (doc.head) output.push(doc.head.innerHTML);
		if (doc.body) output.push(doc.body.innerHTML);
		return output.join('\n');
	}

	// Tells whether `tagName` (compared case-insensitively) is listed in
	// `selfClosingElements`.
	public isSelfClosingTag(tagName: string) {
		return selfClosingElements.includes(tagName.toLowerCase());
	}

	// Collects the second capture group (the URL) of every match of `regex` in
	// `html`, dropping empty values. `regex` is expected to be one of the
	// module-level global regexes.
	//
	// Returns the **encoded** URLs, so to be useful they should be decoded again before use.
	private extractUrls(regex: RegExp, html: string) {
		if (!html) return [];

		// The regexes are shared, global (`g`) objects: `exec()` resumes from
		// `lastIndex`. Start from the beginning so that a stale position left
		// on the object can never make us skip matches.
		regex.lastIndex = 0;

		const output = [];
		let matches;
		while ((matches = regex.exec(html))) {
			output.push(matches[2]);
		}

		return output.filter(url => !!url);
	}

	// Extracts the `src` of every <img> tag found in `html`.
	//
	// Returns the **encoded** URLs, so to be useful they should be decoded again before use.
	public extractImageUrls(html: string) {
		return this.extractUrls(imageRegex, html);
	}

	// Extracts the `src` of <embed> tags and the `data` of <object> tags,
	// keeping only the URLs that end with ".pdf" (case-insensitive). <embed>
	// results come first, followed by <object> results.
	//
	// Returns the **encoded** URLs, so to be useful they should be decoded again before use.
	public extractPdfUrls(html: string) {
		return [...this.extractUrls(embedRegex, html), ...this.extractUrls(objectRegex, html)].filter(url => pdfUrlRegex.test(url));
	}

	// Extracts the `href` of every <a> tag found in `html`.
	//
	// Returns the **encoded** URLs, so to be useful they should be decoded again before use.
	public extractAnchorUrls(html: string) {
		return this.extractUrls(anchorRegex, html);
	}

	// Extracts image URLs followed by anchor URLs.
	//
	// Returns the **encoded** URLs, so to be useful they should be decoded again before use.
	public extractFileUrls(html: string) {
		return this.extractImageUrls(html).concat(this.extractAnchorUrls(html));
	}

	// Replaces every occurrence of `urlToReplace` that appears as the quoted
	// value of a `src=` or `href=` attribute with `:/<id>`.
	//
	// `urlToReplace` is inserted verbatim into a regular expression, so it is
	// interpreted as a regex pattern: callers that want a literal match must
	// escape regex special characters before calling this method.
	public replaceResourceUrl(html: string, urlToReplace: string, id: string) {
		const htmlLinkRegex = `(?<=(?:src|href)=["'])${urlToReplace}(?=["'])`;
		const htmlReg = new RegExp(htmlLinkRegex, 'g');
		const replacement = `:/${id}`;
		// A replacer function is used so that `$`-sequences in `id` (e.g. `$&`)
		// are inserted literally instead of being expanded by String.replace.
		return html.replace(htmlReg, () => replacement);
	}

	// Rewrites the `src` of every <img> tag with the value returned by
	// `callback(src)`.
	// eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied
	public replaceImageUrls(html: string, callback: Function) {
		// eslint-disable-next-line @typescript-eslint/no-explicit-any -- Old code before rule was applied
		return this.processImageTags(html, (data: any) => {
			const newSrc = callback(data.src);
			return {
				type: 'replaceSource',
				src: newSrc,
			};
		});
	}

	// Replaces each matched <embed ...> / <object ...> opening tag with an <a>
	// element whose `href` is `callback(url)` and whose text is the escaped
	// original URL. The value returned by `callback` is inserted as-is.
	// Returns '' when `html` is empty.
	// eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied
	public replaceEmbedUrls(html: string, callback: Function) {
		if (!html) return '';
		// We are adding the link as <a> since joplin disabled <embed>, <object> tags due to security reasons.
		// See: CVE-2020-15930
		html = html.replace(embedRegex, (_v: string, _before: string, src: string, _after: string) => {
			const link = callback(src);
			return `<a href="${link}">${escapeHtml(src)}</a>`;
		});
		html = html.replace(objectRegex, (_v: string, _before: string, src: string, _after: string) => {
			const link = callback(src);
			return `<a href="${link}">${escapeHtml(src)}</a>`;
		});
		return html;
	}

	// Applies `replaceImageUrls` and then `replaceEmbedUrls` with the same
	// callback.
	// eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied
	public replaceMediaUrls(html: string, callback: Function) {
		html = this.replaceImageUrls(html, callback);
		html = this.replaceEmbedUrls(html, callback);
		return html;
	}

	// Calls `callback({ src })` for every <img> tag and rewrites the tag
	// according to the returned action:
	//
	// - falsy: the tag is rebuilt with the same `src` (always double-quoted);
	// - `{ type: 'replaceElement', html }`: the whole tag is replaced by `html`;
	// - `{ type: 'replaceSource', src }`: only the `src` value is replaced;
	// - `{ type: 'setAttributes', attrs }`: the `src="..."` attribute is
	//   replaced by the attributes rendered from `attrs` (so `src` is dropped
	//   unless `attrs` contains it).
	//
	// Any other action type throws an Error. Returns '' when `html` is empty.
	//
	// Note that the URLs provided by this function are URL-encoded, which is
	// usually what you want for web URLs. But if they are file:// URLs and the
	// file path is going to be used, it will need to be unescaped first. The
	// transformed SRC, must also be escaped before being sent back to this
	// function.
	// eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied
	public processImageTags(html: string, callback: Function) {
		if (!html) return '';

		return html.replace(imageRegex, (_v: string, before: string, src: string, after: string) => {
			const action = callback({ src: src });

			if (!action) return `<img${before}src="${src}"${after}>`;

			if (action.type === 'replaceElement') {
				return action.html;
			}

			if (action.type === 'replaceSource') {
				return `<img${before}src="${action.src}"${after}>`;
			}

			if (action.type === 'setAttributes') {
				const attrHtml = this.attributesHtml(action.attrs);
				return `<img${before}${attrHtml}${after}>`;
			}

			throw new Error(`Invalid action: ${action.type}`);
		});
	}

	// Rewrites the `href` of every <a> tag with the result of
	// `urlUtils.prependBaseUrl(href, baseUrl)`. Returns '' when `html` is
	// empty.
	public prependBaseUrl(html: string, baseUrl: string) {
		if (!html) return '';

		return html.replace(anchorRegex, (_v: string, before: string, href: string, after: string) => {
			const newHref = urlUtils.prependBaseUrl(href, baseUrl);
			return `<a${before}href="${newHref}"${after}>`;
		});
	}

	// Renders the own enumerable properties of `attr` as a space-separated
	// list of `name="value"` pairs. Values are entity-encoded; names are
	// written as-is.
	// eslint-disable-next-line @typescript-eslint/no-explicit-any -- Old code before rule was applied
	public attributesHtml(attr: any) {
		const output = [];

		for (const n in attr) {
			// Go through Object.prototype so this also works for objects
			// without a prototype, or with their own `hasOwnProperty` key.
			if (!Object.prototype.hasOwnProperty.call(attr, n)) continue;
			output.push(`${n}="${htmlentities(attr[n])}"`);
		}

		return output.join(' ');
	}

}

// The module's default export is a single, shared HtmlUtils instance.
export default new HtmlUtils();

// Converts plain text to HTML.
//
// - Text without any line break is returned escaped, with no wrapping element.
// - Otherwise, each run of adjacent non-empty lines becomes one paragraph,
//   its lines separated by '<br/>': 'one\ntwo' gives '<p>one<br/>two</p>'.
// - A run of `n` successive empty lines produces `n-1` '<br/>' tags, so a
//   single empty line only separates two paragraphs.
//
// '\r\n' is treated as '\n'. On the multi-line path, trailing whitespace is
// removed from every line (leading whitespace is kept) and a line containing
// only whitespace counts as empty.
export function plainTextToHtml(plainText: string): string {
	const rawLines = plainText.replace(/\r\n/g, '\n').split('\n');

	if (rawLines.length === 1) return escapeHtml(rawLines[0]);

	const html: string[] = [];
	let paragraph = '';
	let emptyRun = 0;

	// Emits the paragraph accumulated so far, if any. Emitting a paragraph
	// also ends the current run of empty lines.
	const flushParagraph = () => {
		if (!paragraph) return;
		html.push(`<p>${paragraph}</p>`);
		paragraph = '';
		emptyRun = 0;
	};

	for (const rawLine of rawLines) {
		const text = rawLine.trimEnd();

		if (text) {
			const escaped = escapeHtml(text);
			paragraph = paragraph ? `${paragraph}<br/>${escaped}` : escaped;
			continue;
		}

		// Empty line: close the open paragraph, then emit a <br/> for every
		// empty line of the run except the first one.
		flushParagraph();
		emptyRun += 1;
		if (emptyRun >= 2) html.push('<br/>');
	}

	flushParagraph();

	return html.join('');
}
All: Use Lerna to manage monorepo 2020-11-05 18:58:23 +02:00			`const urlUtils = require('./urlUtils.js');`
Clipper: Refactored image rules to re-use more code 2019-07-15 22:43:28 +02:00			`const Entities = require('html-entities').AllHtmlEntities;`
First pass at linting lib dir 2019-07-29 15:43:53 +02:00			`const htmlentities = new Entities().encode;`
Desktop: Fixed pasting HTML in Rich Text editor, and improved pasting plain text 2021-05-20 18:08:59 +02:00			`const { escapeHtml } = require('./string-utils.js');`
Clipper: Fixed issue with relative links when importing HTML 2019-07-15 01:44:45 +02:00
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`// [\s\S] instead of . for multiline matching`
			`// https://stackoverflow.com/a/16119722/561309`
First pass at linting lib dir 2019-07-29 15:43:53 +02:00			`const imageRegex = /<img([\s\S]?)src=["']([\s\S]?)["']([\s\S]*?)>/gi;`
			`const anchorRegex = /<a([\s\S]?)href=["']([\s\S]?)["']([\s\S]*?)>/gi;`
Clipper: Resolves #6247: Clipper unable to pull and store PDFs (#6384) 2022-06-20 14:56:54 +02:00			`const embedRegex = /<embed([\s\S]?)src=["']([\s\S]?)["']([\s\S]*?)>/gi;`
			`const objectRegex = /<object([\s\S]?)data=["']([\s\S]?)["']([\s\S]*?)>/gi;`
			`const pdfUrlRegex = /[\s\S]*?\.pdf$/i;`
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00
Desktop, Cli: Fixed various bugs related to the import of ENEX files as HTML 2020-06-15 18:10:51 +02:00			`const selfClosingElements = [`
			`'area',`
			`'base',`
			`'basefont',`
			`'br',`
			`'col',`
			`'command',`
			`'embed',`
			`'frame',`
			`'hr',`
			`'img',`
			`'input',`
			`'isindex',`
			`'keygen',`
			`'link',`
			`'meta',`
			`'param',`
			`'source',`
			`'track',`
			`'wbr',`
			`];`

Clipper: Refactored image rules to re-use more code 2019-07-15 22:43:28 +02:00			`class HtmlUtils {`
Converted htmlUtils to TypeScript 2021-01-30 14:19:43 +02:00
Tools: Implement @typescript-eslint/no-explicit-any rule 2024-04-05 13:16:49 +02:00			`// eslint-disable-next-line @typescript-eslint/no-explicit-any -- Old code before rule was applied`
Converted htmlUtils to TypeScript 2021-01-30 14:19:43 +02:00			`public headAndBodyHtml(doc: any) {`
Clipper: Refactored image rules to re-use more code 2019-07-15 22:43:28 +02:00			`const output = [];`
			`if (doc.head) output.push(doc.head.innerHTML);`
			`if (doc.body) output.push(doc.body.innerHTML);`
			`return output.join('\n');`
			`}`
Clipper: Adding support for clipping page as HTML 2019-07-14 17:00:02 +02:00
Converted htmlUtils to TypeScript 2021-01-30 14:19:43 +02:00			`public isSelfClosingTag(tagName: string) {`
Desktop, Cli: Fixed various bugs related to the import of ENEX files as HTML 2020-06-15 18:10:51 +02:00			`return selfClosingElements.includes(tagName.toLowerCase());`
			`}`

Clipper: Fixes #3984: Images from some website were not being downloaded 2020-10-29 12:16:31 +02:00			`// Returns the encoded URLs, so to be useful they should be decoded again before use.`
Desktop: Various improvements to Markdown import and export (#5290) In preparation for #5224 2021-08-23 01:35:45 +02:00			`private extractUrls(regex: RegExp, html: string) {`
Clipper: Adding support for clipping page as HTML 2019-07-14 17:00:02 +02:00			`if (!html) return [];`
Minor changes 2019-07-19 19:18:05 +02:00
Clipper: Adding support for clipping page as HTML 2019-07-14 17:00:02 +02:00			`const output = [];`
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`let matches;`
Desktop: Various improvements to Markdown import and export (#5290) In preparation for #5224 2021-08-23 01:35:45 +02:00			`while ((matches = regex.exec(html))) {`
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`output.push(matches[2]);`
Clipper: Adding support for clipping page as HTML 2019-07-14 17:00:02 +02:00			`}`

Revert "Tools: Added eslint rule arrow-parens" This reverts commit 0b6f5581f0c908f88636171004561da5dbe138cd. It causes too many conflicts with pull requests. 2020-05-21 10:14:33 +02:00			`return output.filter(url => !!url);`
Clipper: Refactored image rules to re-use more code 2019-07-15 22:43:28 +02:00			`}`
Clipper: Adding support for clipping page as HTML 2019-07-14 17:00:02 +02:00
Desktop: Various improvements to Markdown import and export (#5290) In preparation for #5224 2021-08-23 01:35:45 +02:00			`// Returns the encoded URLs, so to be useful they should be decoded again before use.`
			`public extractImageUrls(html: string) {`
			`return this.extractUrls(imageRegex, html);`
			`}`

Clipper: Resolves #6247: Clipper unable to pull and store PDFs (#6384) 2022-06-20 14:56:54 +02:00			`// Returns the encoded URLs, so to be useful they should be decoded again before use.`
			`public extractPdfUrls(html: string) {`
			`return [...this.extractUrls(embedRegex, html), ...this.extractUrls(objectRegex, html)].filter(url => pdfUrlRegex.test(url));`
			`}`

Desktop: Various improvements to Markdown import and export (#5290) In preparation for #5224 2021-08-23 01:35:45 +02:00			`// Returns the encoded URLs, so to be useful they should be decoded again before use.`
			`public extractAnchorUrls(html: string) {`
			`return this.extractUrls(anchorRegex, html);`
			`}`

			`// Returns the encoded URLs, so to be useful they should be decoded again before use.`
			`public extractFileUrls(html: string) {`
			`return this.extractImageUrls(html).concat(this.extractAnchorUrls(html));`
			`}`

			`public replaceResourceUrl(html: string, urlToReplace: string, id: string) {`
			const htmlLinkRegex = `(?<=(?:src\|href)=["'])${urlToReplace}(?=["'])`;
			`const htmlReg = new RegExp(htmlLinkRegex, 'g');`
			return html.replace(htmlReg, `:/${id}`);
			`}`

Tools: Apply rule @typescript-eslint/ban-types 2023-06-30 11:30:29 +02:00			`// eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied`
Converted htmlUtils to TypeScript 2021-01-30 14:19:43 +02:00			`public replaceImageUrls(html: string, callback: Function) {`
Tools: Implement @typescript-eslint/no-explicit-any rule 2024-04-05 13:16:49 +02:00			`// eslint-disable-next-line @typescript-eslint/no-explicit-any -- Old code before rule was applied`
Desktop: Fixes #4441: Fixed copying text from Rich Text editor 2021-01-30 13:08:11 +02:00			`return this.processImageTags(html, (data: any) => {`
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`const newSrc = callback(data.src);`
			`return {`
			`type: 'replaceSource',`
			`src: newSrc,`
			`};`
			`});`
			`}`

Tools: Apply rule @typescript-eslint/ban-types 2023-06-30 11:30:29 +02:00			`// eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied`
Clipper: Resolves #6247: Clipper unable to pull and store PDFs (#6384) 2022-06-20 14:56:54 +02:00			`public replaceEmbedUrls(html: string, callback: Function) {`
			`if (!html) return '';`
			`// We are adding the link as <a> since joplin disabled <embed>, <object> tags due to security reasons.`
			`// See: CVE-2020-15930`
			`html = html.replace(embedRegex, (_v: string, _before: string, src: string, _after: string) => {`
			`const link = callback(src);`
			return `<a href="${link}">${escapeHtml(src)}</a>`;
			`});`
			`html = html.replace(objectRegex, (_v: string, _before: string, src: string, _after: string) => {`
			`const link = callback(src);`
			return `<a href="${link}">${escapeHtml(src)}</a>`;
			`});`
			`return html;`
			`}`

Tools: Apply rule @typescript-eslint/ban-types 2023-06-30 11:30:29 +02:00			`// eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied`
Clipper: Resolves #6247: Clipper unable to pull and store PDFs (#6384) 2022-06-20 14:56:54 +02:00			`public replaceMediaUrls(html: string, callback: Function) {`
			`html = this.replaceImageUrls(html, callback);`
			`html = this.replaceEmbedUrls(html, callback);`
			`return html;`
			`}`

Desktop: Fixes #4916: Ensure that image paths that contain spaces are pasted correctly in the Rich Text editor 2021-05-04 17:44:30 +02:00			`// Note that the URLs provided by this function are URL-encoded, which is`
			`// usually what you want for web URLs. But if they are file:// URLs and the`
			`// file path is going to be used, it will need to be unescaped first. The`
			`// transformed SRC, must also be escaped before being sent back to this`
			`// function.`
Tools: Apply rule @typescript-eslint/ban-types 2023-06-30 11:30:29 +02:00			`// eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied`
Converted htmlUtils to TypeScript 2021-01-30 14:19:43 +02:00			`public processImageTags(html: string, callback: Function) {`
Clipper: Adding support for clipping page as HTML 2019-07-14 17:00:02 +02:00			`if (!html) return '';`

Desktop: Fixes #4441: Fixed copying text from Rich Text editor 2021-01-30 13:08:11 +02:00			`return html.replace(imageRegex, (_v: string, before: string, src: string, after: string) => {`
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`const action = callback({ src: src });`
Clipper: Adding support for clipping page as HTML 2019-07-14 17:00:02 +02:00
Chore: Apply eslint rules 2019-09-19 23:51:18 +02:00			if (!action) return `<img${before}src="${src}"${after}>`;
Clipper: Adding support for clipping page as HTML 2019-07-14 17:00:02 +02:00
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`if (action.type === 'replaceElement') {`
			`return action.html;`
			`}`

			`if (action.type === 'replaceSource') {`
Chore: Apply eslint rules 2019-09-19 23:51:18 +02:00			return `<img${before}src="${action.src}"${after}>`;
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`}`

			`if (action.type === 'setAttributes') {`
			`const attrHtml = this.attributesHtml(action.attrs);`
Chore: Apply eslint rules 2019-09-19 23:51:18 +02:00			return `<img${before}${attrHtml}${after}>`;
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`}`

Chore: Apply eslint rules 2019-09-19 23:51:18 +02:00			throw new Error(`Invalid action: ${action.type}`);
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`});`
Clipper: Refactored image rules to re-use more code 2019-07-15 22:43:28 +02:00			`}`
Clipper: Fixed issue with relative links when importing HTML 2019-07-15 01:44:45 +02:00
Converted htmlUtils to TypeScript 2021-01-30 14:19:43 +02:00			`public prependBaseUrl(html: string, baseUrl: string) {`
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`if (!html) return '';`
Clipper: Fixed issue with relative links when importing HTML 2019-07-15 01:44:45 +02:00
Desktop: Fixes #4441: Fixed copying text from Rich Text editor 2021-01-30 13:08:11 +02:00			`return html.replace(anchorRegex, (_v: string, before: string, href: string, after: string) => {`
Clipper: Fixed issue with relative links when importing HTML 2019-07-15 01:44:45 +02:00			`const newHref = urlUtils.prependBaseUrl(href, baseUrl);`
Chore: Apply eslint rules 2019-09-19 23:51:18 +02:00			return `<a${before}href="${newHref}"${after}>`;
Use regex instead of jsdom for compability with mobile app 2019-07-21 01:18:51 +02:00			`});`
Clipper: Refactored image rules to re-use more code 2019-07-15 22:43:28 +02:00			`}`

Tools: Implement @typescript-eslint/no-explicit-any rule 2024-04-05 13:16:49 +02:00			`// eslint-disable-next-line @typescript-eslint/no-explicit-any -- Old code before rule was applied`
Converted htmlUtils to TypeScript 2021-01-30 14:19:43 +02:00			`public attributesHtml(attr: any) {`
Clipper: Refactored image rules to re-use more code 2019-07-15 22:43:28 +02:00			`const output = [];`

			`for (const n in attr) {`
			`if (!attr.hasOwnProperty(n)) continue;`
Chore: Apply eslint rules 2019-09-19 23:51:18 +02:00			output.push(`${n}="${htmlentities(attr[n])}"`);
Clipper: Refactored image rules to re-use more code 2019-07-15 22:43:28 +02:00			`}`

			`return output.join(' ');`
			`}`
Desktop: Improved GotoAnything speed and made it safer Previously we'd use the remove-markdown package to create the note preview however this function would freeze on certain notes, and was probably unsafe as it used regex to parse Markdown. Replaced this in favour of Markdown-it along with htmlparser2 to strip all markup from a note. 2020-07-15 00:27:12 +02:00
Clipper: Refactored image rules to re-use more code 2019-07-15 22:43:28 +02:00			`}`
Clipper: Adding support for clipping page as HTML 2019-07-14 17:00:02 +02:00
Desktop: Fixes #4441: Fixed copying text from Rich Text editor 2021-01-30 13:08:11 +02:00			`export default new HtmlUtils();`
Desktop: Fixed pasting HTML in Rich Text editor, and improved pasting plain text 2021-05-20 18:08:59 +02:00
			`export function plainTextToHtml(plainText: string): string {`
			`const lines = plainText`
Desktop: Fixes #8476: Text that is pasted in Rich Text editor had extra new lines 2023-07-27 15:48:41 +02:00			`.replace(/\r\n/g, '\n')`
Desktop: Fixed pasting HTML in Rich Text editor, and improved pasting plain text 2021-05-20 18:08:59 +02:00			`.split('\n');`

Desktop: Fixes #8476: Text that is pasted in Rich Text editor had extra new lines 2023-07-27 15:48:41 +02:00			`if (lines.length === 1) return escapeHtml(lines[0]);`
Desktop: Fixed pasting HTML in Rich Text editor, and improved pasting plain text 2021-05-20 18:08:59 +02:00
Desktop: Fixes #8476: Text that is pasted in Rich Text editor had extra new lines 2023-07-27 15:48:41 +02:00			`// Step 1: Merge adjacent lines into paragraphs, with each line separated by`
			`// '<br/>'. So 'one\ntwo' will become '<p>one</br>two</p>'`

			`const step1: string[] = [];`
			`let currentLine = '';`

			`for (let line of lines) {`
Desktop: Fixes #9264: Preserve indentation from plain text when pasting on Rich Text Editor (#9828) 2024-03-02 17:43:38 +02:00			`line = line.trimEnd();`
Desktop: Fixes #8476: Text that is pasted in Rich Text editor had extra new lines 2023-07-27 15:48:41 +02:00			`if (!line) {`
			`if (currentLine) {`
			step1.push(`<p>${currentLine}</p>`);
			`currentLine = '';`
			`}`
			`step1.push(line);`
			`} else {`
			`if (currentLine) {`
			currentLine += `<br/>${escapeHtml(line)}`;
			`} else {`
			`currentLine = escapeHtml(line);`
			`}`
			`}`
			`}`

			if (currentLine) step1.push(`<p>${currentLine}</p>`);

			// Step 2: Convert the remaining empty lines to <br/> tags. Note that `n`
			// successive empty lines should produced `n-1` <br/> tags. This makes more
			`// sense when looking at the tests.`

			`const step2: string[] = [];`
			`let newLineCount = 0;`
			`for (let i = 0; i < step1.length; i++) {`
			`const line = step1[i];`

			`if (!line) {`
			`newLineCount++;`
			`if (newLineCount >= 2) step2.push('');`
			`} else {`
			`newLineCount = 0;`
			`step2.push(line);`
			`}`
			`}`

			`// Step 3: Actually convert the empty lines to <br/> tags`

			`const step3: string[] = [];`
			`for (const line of step2) {`
			`step3.push(line ? line : '<br/>');`
			`}`

			`return step3.join('');`
Desktop: Fixed pasting HTML in Rich Text editor, and improved pasting plain text 2021-05-20 18:08:59 +02:00			`}`