2020-12-02 17:43:44 +02:00
|
|
|
const TurndownService = require('@joplin/turndown');
|
|
|
|
const turndownPluginGfm = require('@joplin/turndown-plugin-gfm').gfm;
|
2021-04-11 19:01:06 +02:00
|
|
|
import markdownUtils from './markdownUtils';
|
2018-05-16 15:16:14 +02:00
|
|
|
|
2022-06-20 14:56:54 +02:00
|
|
|
// Matches strings that end in ".pdf" (case-insensitive). Only the suffix is
// inspected: an unanchored lazy prefix such as `[\s\S]*?` would not change the
// result of `.test()`, but it makes the engine retry from every start offset,
// which is quadratic on long inputs that do not match (e.g. large data: URIs).
const pdfUrlRegex = /\.pdf$/i;
|
|
|
|
|
2021-04-11 19:01:06 +02:00
|
|
|
/**
 * Options accepted by `HtmlToMd.parse`. Every field is optional.
 */
export interface ParseOptions {
	/**
	 * Anchor names handed to Turndown. Each name is trimmed and lower-cased
	 * before being forwarded; an empty list is used when omitted.
	 */
	anchorNames?: string[];

	/** Forwarded to Turndown as a boolean (`false` when omitted). */
	preserveImageTagsWithSize?: boolean;

	/** Forwarded to Turndown as a boolean (`false` when omitted). */
	preserveNestedTables?: boolean;

	/**
	 * When set, the generated Markdown is passed through
	 * `markdownUtils.prependBaseUrl` with this value.
	 */
	baseUrl?: string;

	/** Forwarded to Turndown; treated as `false` when omitted. */
	disableEscapeContent?: boolean;

	/**
	 * When true, `<embed src>` and `<object data>` elements whose URL ends in
	 * ".pdf" are emitted as `[embedded_pdf](url)` links.
	 */
	convertEmbeddedPdfsToLinks?: boolean;
}
|
|
|
|
|
|
|
|
/**
 * Converts HTML to Markdown using the Joplin Turndown build together with
 * its GFM plugin.
 */
export default class HtmlToMd {

	/**
	 * Converts an HTML string to Markdown.
	 *
	 * `<script>` and `<style>` elements are removed from the output. When
	 * `options.convertEmbeddedPdfsToLinks` is set, `<embed>`/`<object>` elements
	 * pointing at a URL ending in ".pdf" become `[embedded_pdf](url)` links.
	 *
	 * @param html - The HTML markup to convert.
	 * @param options - Conversion options; all fields are optional.
	 * @returns The Markdown produced by Turndown, passed through
	 * `markdownUtils.prependBaseUrl` when `options.baseUrl` is set.
	 */
	public parse(html: string, options: ParseOptions = {}) {
		// Minimal shape of the node object this method relies on in rule callbacks.
		type RuleNode = {
			matches(selector: string): boolean;
			getAttribute(name: string): string | null;
		};

		// Returns the value of `attribute` when `node` matches `selector` and the
		// value ends in ".pdf"; otherwise null.
		const pdfUrlOf = (node: RuleNode, selector: string, attribute: string): string | null => {
			if (!node.matches(selector)) return null;
			const url = node.getAttribute(attribute);
			return url && pdfUrlRegex.test(url) ? url : null;
		};

		// Declared before anything that refers to it, so the closures below never
		// depend on call ordering to avoid reading an uninitialised binding.
		const pdfRule = {
			filter: ['embed', 'object'],
			replacement: (_content: string, node: RuleNode, _options: unknown) => {
				// We are setting embedded_pdf as name so that we can later distinguish
				// them from normal links and create resources for them.
				const url = pdfUrlOf(node, 'embed', 'src') || pdfUrlOf(node, 'object', 'data');
				return url ? `[embedded_pdf](${url})` : '';
			},
		};

		const turndownOpts: Record<string, unknown> = {
			headingStyle: 'atx',
			anchorNames: options.anchorNames ? options.anchorNames.map(n => n.trim().toLowerCase()) : [],
			codeBlockStyle: 'fenced',
			preserveImageTagsWithSize: !!options.preserveImageTagsWithSize,
			preserveNestedTables: !!options.preserveNestedTables,
			bulletListMarker: '-',
			emDelimiter: '*',
			strongDelimiter: '**',

			// If soft-breaks are enabled, lines need to end with two or more spaces for
			// trailing <br/>s to render. See
			// https://github.com/laurent22/joplin/issues/8430
			br: '  ',

			// Coerced like the other flags so an explicit `undefined` is never forwarded.
			disableEscapeContent: !!options.disableEscapeContent,
		};

		if (options.convertEmbeddedPdfsToLinks) {
			// Turndown ignores empty <object> tags, so we need to handle this case separately
			// https://github.com/mixmark-io/turndown/issues/293#issuecomment-588984202
			turndownOpts.blankReplacement = (content: string, node: RuleNode) => {
				if (node.matches('object')) {
					return pdfRule.replacement(content, node, {});
				}
				return '\n\n';
			};
		}

		const turndown = new TurndownService(turndownOpts);
		turndown.use(turndownPluginGfm);
		turndown.remove('script');
		turndown.remove('style');

		if (options.convertEmbeddedPdfsToLinks) {
			turndown.addRule('pdf', pdfRule);
		}

		let md = turndown.turndown(html);
		if (options.baseUrl) md = markdownUtils.prependBaseUrl(md, options.baseUrl);
		return md;
	}

}
|