joplin/packages/lib/HtmlToMd.ts

const TurndownService = require('@joplin/turndown');
const turndownPluginGfm = require('@joplin/turndown-plugin-gfm').gfm;
import markdownUtils from './markdownUtils';

// Matches strings that end in ".pdf" (case-insensitive). There is deliberately
// no leading wildcard: an unanchored test() already searches the whole string,
// and a lazy `[\s\S]*?` prefix only adds rescanning work on long inputs that
// do not match (for example large data: URIs).
const pdfUrlRegex = /\.pdf$/i;

// Options accepted by HtmlToMd.parse(). Every field is optional.
export interface ParseOptions {
	// Anchor names handed to Turndown; each entry is trimmed and lower-cased
	// first. Defaults to an empty list.
	anchorNames?: string[];
	// Forwarded to Turndown as a boolean. Presumably keeps <img> tags that
	// carry a size as raw HTML - confirm in @joplin/turndown.
	preserveImageTagsWithSize?: boolean;
	// Forwarded to Turndown as a boolean. Presumably keeps nested tables as
	// HTML - confirm in @joplin/turndown.
	preserveNestedTables?: boolean;
	// When set, the generated Markdown is passed through
	// markdownUtils.prependBaseUrl() with this value.
	baseUrl?: string;
	// Forwarded to Turndown; treated as false when not provided.
	disableEscapeContent?: boolean;
	// When true, <embed src="..."> and <object data="..."> elements whose URL
	// ends in ".pdf" are converted to links named "embedded_pdf".
	convertEmbeddedPdfsToLinks?: boolean;
}

export default class HtmlToMd {

	/**
	 * Converts an HTML string to Markdown.
	 *
	 * A new Turndown service is configured on every call (ATX headings, fenced
	 * code blocks, `-` bullets, GFM plugin). `<script>` and `<style>` elements
	 * are removed from the output.
	 *
	 * @param html - The HTML to convert.
	 * @param options - Conversion options; all fields are optional.
	 * @returns The Markdown produced by Turndown, passed through
	 * `markdownUtils.prependBaseUrl` when `options.baseUrl` is set.
	 */
	public parse(html: string, options: ParseOptions = {}) {
		// Converts <embed>/<object> elements that point at a PDF into a link. It
		// is declared first because it is used both as a regular rule and from
		// the blank-node handler below.
		const pdfRule = {
			filter: ['embed', 'object'],
			replacement: function(_content: string, node: any, _options: any) {
				// <embed> carries its URL in "src", <object> in "data".
				let url: string | null = null;
				if (node.matches('embed')) {
					url = node.getAttribute('src');
				} else if (node.matches('object')) {
					url = node.getAttribute('data');
				}

				// We are setting embedded_pdf as name so that we can later distinguish them from normal links and create resources for them.
				if (url && pdfUrlRegex.test(url)) return `[embedded_pdf](${url})`;
				return '';
			},
		};

		const turndownOpts: any = {
			headingStyle: 'atx',
			anchorNames: options.anchorNames ? options.anchorNames.map(n => n.trim().toLowerCase()) : [],
			codeBlockStyle: 'fenced',
			preserveImageTagsWithSize: !!options.preserveImageTagsWithSize,
			preserveNestedTables: !!options.preserveNestedTables,
			bulletListMarker: '-',
			emDelimiter: '*',
			strongDelimiter: '**',

			// If soft-breaks are enabled, lines need to end with two or more spaces for
			// trailing <br/>s to render. See
			// https://github.com/laurent22/joplin/issues/8430
			br: '  ',

			disableEscapeContent: !!options.disableEscapeContent,
		};

		if (options.convertEmbeddedPdfsToLinks) {
			// Turndown ignores empty <object> tags, so we need to handle this case separately
			// https://github.com/mixmark-io/turndown/issues/293#issuecomment-588984202
			turndownOpts.blankReplacement = (content: string, node: any) => {
				if (node.matches('object')) {
					return pdfRule.replacement(content, node, {});
				}

				// For every other blank node keep Turndown's default behaviour:
				// only block-level elements produce a paragraph break. Returning
				// '\n\n' unconditionally would split the text around blank
				// inline elements such as an empty <span>.
				return node.isBlock ? '\n\n' : '';
			};
		}

		const turndown = new TurndownService(turndownOpts);
		turndown.use(turndownPluginGfm);
		turndown.remove('script');
		turndown.remove('style');

		// Added after the GFM plugin so the registration order of the rules is
		// the same as before.
		if (options.convertEmbeddedPdfsToLinks) {
			turndown.addRule('pdf', pdfRule);
		}

		let md = turndown.turndown(html);
		if (options.baseUrl) md = markdownUtils.prependBaseUrl(md, options.baseUrl);
		return md;
	}

}
Clipper: Fixes #4105: Handle certain types of code block 2020-12-02 17:43:44 +02:00			`const TurndownService = require('@joplin/turndown');`
			`const turndownPluginGfm = require('@joplin/turndown-plugin-gfm').gfm;`
Desktop: Fixes #4669: Copying code block from Rich Text editor results in two copies of the text Also improved copying plain text from Rich Text editor - in that case the HTML is converted to Markdown 2021-04-11 19:01:06 +02:00			`import markdownUtils from './markdownUtils';`
Clipper: Added first files 2018-05-16 15:16:14 +02:00
Clipper: Resolves #6247: Clipper unable to pull and store PDFs (#6384) 2022-06-20 14:56:54 +02:00			`const pdfUrlRegex = /[\s\S]*?\.pdf$/i;`

Desktop: Fixes #4669: Copying code block from Rich Text editor results in two copies of the text Also improved copying plain text from Rich Text editor - in that case the HTML is converted to Markdown 2021-04-11 19:01:06 +02:00			`export interface ParseOptions {`
			`anchorNames?: string[];`
			`preserveImageTagsWithSize?: boolean;`
Desktop: Resolves #9293: Preserve nested tables in RTE 2023-11-13 16:34:30 +02:00			`preserveNestedTables?: boolean;`
Desktop: Fixes #4669: Copying code block from Rich Text editor results in two copies of the text Also improved copying plain text from Rich Text editor - in that case the HTML is converted to Markdown 2021-04-11 19:01:06 +02:00			`baseUrl?: string;`
Desktop: Resolves #5440: Do not escape content when copying from Rich Text editor 2021-09-19 14:00:06 +02:00			`disableEscapeContent?: boolean;`
Clipper: Resolves #6247: Clipper unable to pull and store PDFs (#6384) 2022-06-20 14:56:54 +02:00			`convertEmbeddedPdfsToLinks?: boolean;`
Desktop: Fixes #4669: Copying code block from Rich Text editor results in two copies of the text Also improved copying plain text from Rich Text editor - in that case the HTML is converted to Markdown 2021-04-11 19:01:06 +02:00			`}`

			`export default class HtmlToMd {`

			`public parse(html: string, options: ParseOptions = {}) {`
Clipper: Resolves #6247: Clipper unable to pull and store PDFs (#6384) 2022-06-20 14:56:54 +02:00			`const turndownOpts: any = {`
Clipper: Improved Html To Md conversion 2018-05-22 01:54:23 +02:00			`headingStyle: 'atx',`
Revert "Tools: Added eslint rule arrow-parens" This reverts commit 0b6f5581f0c908f88636171004561da5dbe138cd. It causes too many conflicts with pull requests. 2020-05-21 10:14:33 +02:00			`anchorNames: options.anchorNames ? options.anchorNames.map(n => n.trim().toLowerCase()) : [],`
Clipper: Resolves #1669: Handle special case of code block used on Microsoft website 2019-06-22 19:57:41 +02:00			`codeBlockStyle: 'fenced',`
Desktop: Resolves #176: Added experimental WYSIWYG editor (#2556) * Trying to get TuiEditor to work * Tests with TinyMCE * Fixed build * Improved asset loading * Added support for Joplin source blocks * Added support for Joplin source blocks * Better integration * Make sure noteDidUpdate event is always dispatched at the right time * Minor tweaks * Fixed tests * Add support for checkboxes * Minor refactoring * Added support for file attachments * Add support for fenced code blocks * Fix new line issue on code block * Added support for Fountain scripts * Refactoring * Better handling of saving and loading notes * Fix saving and loading ntoes * Handle multi-note selection and fixed new note creation issue * Fixed newline issue in test * Fixed newline issue in test * Improve saving and loading * Improve saving and loading note * Removed undeeded prop * Fixed issue when new note being saved is incorrectly reloaded * Refactoring and improve saving of note when unmounting component * Fixed TypeScript error * Small changes * Improved further handling of saving and loading notes * Handle provisional notes and fixed various saving and loading bugs * Adding back support for HTML notes * Added support for HTML notes * Better handling of editable nodes * Preserve image HTML tag when the size is set * Handle switching between editor when the note has note finished saving * Handle templates * Handle templates * Handle loading note that is being saved * Handle note being reloaded via sync * Clean up * Clean up and improved logging * Fixed TS error * Fixed a few issues * Fixed test * Logging * Various improvements * Add blockquote support * Moved CWD operation to shim * Removed deleted files * Added support for Joplin commands 2020-03-10 01:24:57 +02:00			`preserveImageTagsWithSize: !!options.preserveImageTagsWithSize,`
Desktop: Resolves #9293: Preserve nested tables in RTE 2023-11-13 16:34:30 +02:00			`preserveNestedTables: !!options.preserveNestedTables,`
Desktop: Added support for checkboxes and fixed various issues with WYSIWYG editor 2020-03-23 02:47:25 +02:00			`bulletListMarker: '-',`
			`emDelimiter: '*',`
			`strongDelimiter: '**',`
Desktop: Fixes #8430: Make HTML <br/> tags convert to markdown compatible with the softbreaks setting (#8469) 2023-07-18 15:48:26 +02:00
			`// If soft-breaks are enabled, lines need to end with two or more spaces for`
			`// trailing <br/>s to render. See`
			`// https://github.com/laurent22/joplin/issues/8430`
			`br: ' ',`

Desktop: Resolves #5440: Do not escape content when copying from Rich Text editor 2021-09-19 14:00:06 +02:00			`disableEscapeContent: 'disableEscapeContent' in options ? options.disableEscapeContent : false,`
Clipper: Resolves #6247: Clipper unable to pull and store PDFs (#6384) 2022-06-20 14:56:54 +02:00			`};`
			`if (options.convertEmbeddedPdfsToLinks) {`
			`// Turndown ignores empty <object> tags, so we need to handle this case separately`
			`// https://github.com/mixmark-io/turndown/issues/293#issuecomment-588984202`
			`turndownOpts.blankReplacement = (content: string, node: any) => {`
			`if (node.matches('object')) {`
			`return pdfRule.replacement(content, node, {});`
			`}`
			`return '\n\n';`
			`};`
			`}`
			`const turndown = new TurndownService(turndownOpts);`
First pass at linting lib dir 2019-07-29 15:43:53 +02:00			`turndown.use(turndownPluginGfm);`
Clipper: Improved UI and integration with main app 2018-05-20 11:19:59 +02:00			`turndown.remove('script');`
Clipper: Skip style section when importing HTML 2018-05-24 14:32:43 +02:00			`turndown.remove('style');`
Clipper: Resolves #6247: Clipper unable to pull and store PDFs (#6384) 2022-06-20 14:56:54 +02:00			`const pdfRule = {`
			`filter: ['embed', 'object'],`
			`replacement: function(_content: string, node: any, _options: any) {`
			`// We are setting embedded_pdf as name so that we can later distinguish them from normal links and create resources for them.`
			`if (node.matches('embed') && node.getAttribute('src') && pdfUrlRegex.test(node.getAttribute('src'))) {`
			return `[embedded_pdf](${node.getAttribute('src')})`;
			`} else if (node.matches('object') && node.getAttribute('data') && pdfUrlRegex.test(node.getAttribute('data'))) {`
			return `[embedded_pdf](${node.getAttribute('data')})`;
			`}`
			`return '';`
			`},`
			`};`
			`if (options.convertEmbeddedPdfsToLinks) {`
			`turndown.addRule('pdf', pdfRule);`
			`}`
First pass at linting lib dir 2019-07-29 15:43:53 +02:00			`let md = turndown.turndown(html);`
Clipper: Download images and convert them to resources 2018-05-23 13:14:38 +02:00			`if (options.baseUrl) md = markdownUtils.prependBaseUrl(md, options.baseUrl);`
			`return md;`
Clipper: Added first files 2018-05-16 15:16:14 +02:00			`}`

Desktop: Fixes #4669: Copying code block from Rich Text editor results in two copies of the text Also improved copying plain text from Rich Text editor - in that case the HTML is converted to Markdown 2021-04-11 19:01:06 +02:00			`}`