const urlUtils = require('./urlUtils.js'); const Entities = require('html-entities').AllHtmlEntities; const htmlentities = new Entities().encode; const { escapeHtml } = require('./string-utils.js'); // [\s\S] instead of . for multiline matching // https://stackoverflow.com/a/16119722/561309 const imageRegex = //gi; const anchorRegex = //gi; const embedRegex = //gi; const objectRegex = //gi; const pdfUrlRegex = /[\s\S]*?\.pdf$/i; const selfClosingElements = [ 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame', 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr', ]; class HtmlUtils { public headAndBodyHtml(doc: any) { const output = []; if (doc.head) output.push(doc.head.innerHTML); if (doc.body) output.push(doc.body.innerHTML); return output.join('\n'); } public isSelfClosingTag(tagName: string) { return selfClosingElements.includes(tagName.toLowerCase()); } // Returns the **encoded** URLs, so to be useful they should be decoded again before use. private extractUrls(regex: RegExp, html: string) { if (!html) return []; const output = []; let matches; while ((matches = regex.exec(html))) { output.push(matches[2]); } return output.filter(url => !!url); } // Returns the **encoded** URLs, so to be useful they should be decoded again before use. public extractImageUrls(html: string) { return this.extractUrls(imageRegex, html); } // Returns the **encoded** URLs, so to be useful they should be decoded again before use. public extractPdfUrls(html: string) { return [...this.extractUrls(embedRegex, html), ...this.extractUrls(objectRegex, html)].filter(url => pdfUrlRegex.test(url)); } // Returns the **encoded** URLs, so to be useful they should be decoded again before use. public extractAnchorUrls(html: string) { return this.extractUrls(anchorRegex, html); } // Returns the **encoded** URLs, so to be useful they should be decoded again before use. public extractFileUrls(html: string) { return this.extractImageUrls(html).concat(this.extractAnchorUrls(html)); } public replaceResourceUrl(html: string, urlToReplace: string, id: string) { const htmlLinkRegex = `(?<=(?:src|href)=["'])${urlToReplace}(?=["'])`; const htmlReg = new RegExp(htmlLinkRegex, 'g'); return html.replace(htmlReg, `:/${id}`); } // eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied public replaceImageUrls(html: string, callback: Function) { return this.processImageTags(html, (data: any) => { const newSrc = callback(data.src); return { type: 'replaceSource', src: newSrc, }; }); } // eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied public replaceEmbedUrls(html: string, callback: Function) { if (!html) return ''; // We are adding the link as since joplin disabled , tags due to security reasons. // See: CVE-2020-15930 html = html.replace(embedRegex, (_v: string, _before: string, src: string, _after: string) => { const link = callback(src); return `${escapeHtml(src)}`; }); html = html.replace(objectRegex, (_v: string, _before: string, src: string, _after: string) => { const link = callback(src); return `${escapeHtml(src)}`; }); return html; } // eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied public replaceMediaUrls(html: string, callback: Function) { html = this.replaceImageUrls(html, callback); html = this.replaceEmbedUrls(html, callback); return html; } // Note that the URLs provided by this function are URL-encoded, which is // usually what you want for web URLs. But if they are file:// URLs and the // file path is going to be used, it will need to be unescaped first. The // transformed SRC, must also be escaped before being sent back to this // function. // eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied public processImageTags(html: string, callback: Function) { if (!html) return ''; return html.replace(imageRegex, (_v: string, before: string, src: string, after: string) => { const action = callback({ src: src }); if (!action) return ``; if (action.type === 'replaceElement') { return action.html; } if (action.type === 'replaceSource') { return ``; } if (action.type === 'setAttributes') { const attrHtml = this.attributesHtml(action.attrs); return ``; } throw new Error(`Invalid action: ${action.type}`); }); } public prependBaseUrl(html: string, baseUrl: string) { if (!html) return ''; return html.replace(anchorRegex, (_v: string, before: string, href: string, after: string) => { const newHref = urlUtils.prependBaseUrl(href, baseUrl); return ``; }); } public attributesHtml(attr: any) { const output = []; for (const n in attr) { if (!attr.hasOwnProperty(n)) continue; output.push(`${n}="${htmlentities(attr[n])}"`); } return output.join(' '); } } export default new HtmlUtils(); export function plainTextToHtml(plainText: string): string { const lines = plainText .replace(/\r\n/g, '\n') .split('\n'); if (lines.length === 1) return escapeHtml(lines[0]); // Step 1: Merge adjacent lines into paragraphs, with each line separated by // '
'. So 'one\ntwo' will become '

one
two

' const step1: string[] = []; let currentLine = ''; for (let line of lines) { line = line.trimEnd(); if (!line) { if (currentLine) { step1.push(`

${currentLine}

`); currentLine = ''; } step1.push(line); } else { if (currentLine) { currentLine += `
${escapeHtml(line)}`; } else { currentLine = escapeHtml(line); } } } if (currentLine) step1.push(`

${currentLine}

`); // Step 2: Convert the remaining empty lines to
tags. Note that `n` // successive empty lines should produced `n-1`
tags. This makes more // sense when looking at the tests. const step2: string[] = []; let newLineCount = 0; for (let i = 0; i < step1.length; i++) { const line = step1[i]; if (!line) { newLineCount++; if (newLineCount >= 2) step2.push(''); } else { newLineCount = 0; step2.push(line); } } // Step 3: Actually convert the empty lines to
tags const step3: string[] = []; for (const line of step2) { step3.push(line ? line : '
'); } return step3.join(''); }