// NOTE(review): The copy of this file that was reviewed had been collapsed onto
// a few lines and had every "<...>"-shaped span removed (regex bodies, generic
// type arguments, HTML template literals, and one larger region of the
// HtmlUtils class). Those spans have been reconstructed from the surrounding
// code and are individually marked with "NOTE(review)". Confirm each of them
// against version control before merging.

const Entities = require('html-entities').AllHtmlEntities;
const htmlentities = new Entities().encode;
import { fileUriToPath } from '@joplin/utils/url';
const htmlparser2 = require('@joplin/fork-htmlparser2');

// [\s\S] instead of . for multiline matching
// https://stackoverflow.com/a/16119722/561309
//
// Both expressions expose three capture groups: the text before the URL
// attribute, the URL itself, and the text after it.
// NOTE(review): regex bodies reconstructed - confirm.
const imageRegex = /<img([\s\S]*?)src=["']([\s\S]*?)["']([\s\S]*?)>/gi;
const anchorRegex = /<a([\s\S]*?)href=["']([\s\S]*?)["']([\s\S]*?)>/gi;

// Elements that are serialised as "<name/>" and never get a closing tag.
const selfClosingElements = [
	'area',
	'base',
	'basefont',
	'br',
	'col',
	'command',
	'embed',
	'frame',
	'hr',
	'img',
	'input',
	'isindex',
	'keygen',
	'link',
	'meta',
	'param',
	'source',
	'track',
	'wbr',
];

interface SanitizeHtmlOptions {
	// If true, adds a "jop-noMdConv" class to every tag that is kept.
	addNoMdConvClass?: boolean;
	// Path prefixes under which "file://" links are accepted.
	allowedFilePrefixes?: string[];
}

/**
 * Serialises an attribute map to `name="value"` pairs separated by spaces.
 * Values are HTML-entity encoded; attributes with an empty (falsy) value are
 * emitted as a bare name.
 */
export const attributesHtml = (attr: Record<string, string>) => {
	const output: string[] = [];

	// Object.entries only visits own enumerable keys, and unlike
	// attr.hasOwnProperty() it also works on prototype-less objects.
	for (const [name, value] of Object.entries(attr)) {
		output.push(value ? `${name}="${htmlentities(value)}"` : name);
	}

	return output.join(' ');
};

/** Returns true if `tagName` (case-insensitive) is a self-closing element. */
export const isSelfClosingTag = (tagName: string) => {
	return selfClosingElements.includes(tagName.toLowerCase());
};

// Builds the opening tag for `name` with the given attributes, using "/>" for
// self-closing elements.
const makeHtmlTag = (name: string, attrs: Record<string, string>) => {
	let attrHtml = attributesHtml(attrs);
	if (attrHtml) attrHtml = ` ${attrHtml}`;
	const closingSign = isSelfClosingTag(name) ? '/>' : '>';
	return `<${name}${attrHtml}${closingSign}`;
};

type ProcessImageResult = {
	type: 'replaceElement';
	html: string;
}|{
	type: 'replaceSource';
	src: string;
}|{
	type: 'setAttributes';
	attrs: Record<string, string>;
};

interface ProcessImageEvent {
	src: string;
	before: string;
	after: string;
}

type ProcessImageCallback = (data: ProcessImageEvent)=> ProcessImageResult;

class HtmlUtils {

	/**
	 * Calls `callback` for every IMG tag matched by `imageRegex` and replaces
	 * the tag according to the returned action. A falsy action leaves the tag
	 * unchanged. Returns an empty string when `html` is empty.
	 *
	 * @throws Error if the callback returns an unknown action type.
	 */
	public processImageTags(html: string, callback: ProcessImageCallback) {
		if (!html) return '';

		// NOTE(review): the "<img ...>" template literals below were
		// reconstructed from the capture group names - confirm.
		return html.replace(imageRegex, (_v: string, before: string, src: string, after: string) => {
			const action = callback({ src, before, after });

			if (!action) return `<img${before}src="${src}"${after}>`;

			if (action.type === 'replaceElement') {
				return action.html;
			}

			if (action.type === 'replaceSource') {
				return `<img${before}src="${action.src}"${after}>`;
			}

			if (action.type === 'setAttributes') {
				const attrHtml = attributesHtml(action.attrs);
				return `<img${before}${attrHtml}${after}>`;
			}

			throw new Error(`Invalid action: ${(action as Record<string, unknown>).type}`);
		});
	}

	/**
	 * Calls `callback` with `{ href }` for every A tag matched by
	 * `anchorRegex` and replaces the tag according to the returned action. A
	 * falsy action leaves the tag unchanged. Returns an empty string when
	 * `html` is empty.
	 *
	 * @throws Error if the callback returns an unknown action type.
	 */
	// eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied
	public processAnchorTags(html: string, callback: Function) {
		if (!html) return '';

		interface Action {
			type: 'replaceElement' | 'replaceSource' | 'setAttributes';
			href: string;
			html: string;
			attrs: Record<string, string>;
		}

		// NOTE(review): the "<a ...>" template literals below were
		// reconstructed from the capture group names - confirm.
		return html.replace(anchorRegex, (_v: string, before: string, href: string, after: string) => {
			const action: Action = callback({ href: href });

			if (!action) return `<a${before}href="${href}"${after}>`;

			if (action.type === 'replaceElement') {
				return action.html;
			}

			if (action.type === 'replaceSource') {
				return `<a${before}href="${action.href}"${after}>`;
			}

			if (action.type === 'setAttributes') {
				const attrHtml = attributesHtml(action.attrs);
				return `<a${before}${attrHtml}${after}>`;
			}

			throw new Error(`Invalid action: ${action.type}`);
		});
	}

	/**
	 * Returns the text content of `html`, skipping text whose innermost open
	 * tag is one of the disallowed tags, collapsing whitespace runs to a
	 * single space, and re-encoding "<".
	 */
	public stripHtml(html: string) {
		const output: string[] = [];

		const tagStack: string[] = [];

		const currentTag = () => {
			if (!tagStack.length) return '';
			return tagStack[tagStack.length - 1];
		};

		const disallowedTags = ['script', 'style', 'head', 'iframe', 'frameset', 'frame', 'object', 'base'];

		const parser = new htmlparser2.Parser({

			onopentag: (name: string) => {
				tagStack.push(name.toLowerCase());
			},

			ontext: (decodedText: string) => {
				if (disallowedTags.includes(currentTag())) return;
				output.push(decodedText);
			},

			onclosetag: (name: string) => {
				if (currentTag() === name.toLowerCase()) tagStack.pop();
			},

		}, { decodeEntities: true });

		parser.write(html);
		parser.end();

		// In general, we want to get back plain text from this function, so all
		// HTML entities are decoded. However, to prevent XSS attacks, we
		// re-encode all the "<" characters, which should break any attempt to
		// inject HTML tags.
		// NOTE(review): the second replace() was reconstructed from the
		// comment above - confirm.
		return output.join('')
			.replace(/\s+/g, ' ')
			.replace(/</g, '&lt;');
	}

	// Returns true if `url` may be kept in an HREF attribute.
	//
	// NOTE(review): this whole method was missing from the reviewed copy. Only
	// its name and its two arguments are known from the call site in
	// sanitizeHtml. The allow-list below (http, https, mailto, simple in-page
	// anchors, and file:// URLs under an allowed prefix) is an assumption and
	// MUST be replaced with the real implementation from version control.
	private isAcceptedUrl(url: string, allowedFilePrefixes: string[]): boolean {
		url = url.toLowerCase();

		if (
			url.startsWith('https://') ||
			url.startsWith('http://') ||
			url.startsWith('mailto://') ||
			url.startsWith('mailto:') ||
			// Anchors are accepted only with a restricted set of characters.
			!!url.match(/^#[a-zA-Z0-9-]+$/)
		) {
			return true;
		}

		if (url.startsWith('file://')) {
			// Compare case-insensitively since the URL was lowercased above.
			const filePath = fileUriToPath(url).toLowerCase();
			for (const filePrefix of allowedFilePrefixes) {
				if (filePath.startsWith(filePrefix.toLowerCase())) return true;
			}
		}

		return false;
	}

	/**
	 * Re-serialises `html`, dropping disallowed tags (together with
	 * everything nested inside them), "on*" attributes and non-accepted HREF
	 * values, and replacing FORM tags with DIV tags.
	 *
	 * NOTE(review): the signature and the option defaults were missing from
	 * the reviewed copy and have been reconstructed from the way `options` is
	 * used in the body - confirm.
	 */
	public sanitizeHtml(html: string, options: SanitizeHtmlOptions | null = null) {
		const addNoMdConvClass = !!(options && options.addNoMdConvClass);
		const allowedFilePrefixes = (options && options.allowedFilePrefixes) || [];

		const output: string[] = [];

		const tagStack: string[] = [];

		const currentTag = () => {
			if (!tagStack.length) return '';
			return tagStack[tagStack.length - 1];
		};

		// When we encounter a disallowed tag, all the other tags within it are
		// going to be skipped too. This is necessary to prevent certain XSS
		// attacks. See sanitize_11.md
		let disallowedTagDepth = 0;

		// The BASE tag allows changing the base URL from which files are
		// loaded, and that can break several plugins, such as Katex (which
		// needs to load CSS files using a relative URL). For that reason it is
		// disabled. More info: https://github.com/laurent22/joplin/issues/3021
		//
		// "link" can be used to escape the parser and inject JavaScript. Adding
		// "meta" too for the same reason as it shouldn't be used in notes
		// anyway.
		//
		// There are too many issues with SVG tags and to handle them properly
		// we should parse them separately. Currently we are not so it is better
		// to disable them. SVG graphics are still supported via the IMG tag.
		const disallowedTags = [
			'script', 'iframe', 'frameset', 'frame', 'object', 'base',
			'embed', 'link', 'meta', 'noscript', 'button',
			'input', 'select', 'textarea', 'option', 'optgroup',
			'svg',

			// Disallow map and area tags: links in AREA tags are currently
			// not sanitized as well as links in A tags, allowing potential
			// sandbox escape.
			'map', 'area',
		];

		// Certain tags should not be rendered, however unlike for the disallowed tags, we want to
		// keep their content. For example the FORM tag may sometimes wrap relevant content so we
		// want to keep that content, but we don't want to keep the FORM tag itself. In that case we
		// simply replace it with a DIV tag.
		const replaceWithDivTags = [
			'form',
		];

		const parser = new htmlparser2.Parser({

			onopentag: (name: string, attrs: Record<string, string>) => {
				// Note: "name" and attribute names are always lowercase even
				// when the input is not. So there is no need to call
				// "toLowerCase" on them.

				tagStack.push(name);

				if (disallowedTags.includes(name)) {
					disallowedTagDepth++;
					return;
				}

				if (disallowedTagDepth) return;

				if (replaceWithDivTags.includes(name)) {
					// NOTE(review): literal reconstructed - confirm.
					output.push('<div>');
					return;
				}

				// Work on a copy so that the parser's own object is untouched.
				const safeAttrs: Record<string, string> = { ...attrs };

				// Remove all the attributes that start with "on", which
				// normally should be JavaScript events. A better solution
				// would be to blacklist known events only but it seems the
				// list is not well defined [0] and we don't want any to slip
				// through the cracks. A side effect of this change is a
				// regular harmless attribute that starts with "on" will also
				// be removed.
				// 0: https://developer.mozilla.org/en-US/docs/Web/Events
				for (const attrName of Object.keys(safeAttrs)) {
					// An attribute named exactly "on" is kept.
					if (attrName.length <= 2) continue;
					if (!attrName.startsWith('on')) continue;
					delete safeAttrs[attrName];
				}

				// Make sure that non-acceptable URLs are filtered out. In
				// particular we want to exclude `javascript:` URLs. This
				// applies to A tags, and also AREA ones but to be safe we don't
				// filter on the tag name and process all HREF attributes.
				if ('href' in safeAttrs && !this.isAcceptedUrl(safeAttrs['href'], allowedFilePrefixes)) {
					safeAttrs['href'] = '#';
				}

				// We need to clear any such attribute, otherwise it will
				// make any arbitrary link open within the application.
				if ('data-from-md' in safeAttrs) {
					delete safeAttrs['data-from-md'];
				}

				if (addNoMdConvClass) {
					let classAttr = safeAttrs['class'] || '';
					if (!classAttr.includes('jop-noMdConv')) {
						classAttr += ' jop-noMdConv';
						safeAttrs['class'] = classAttr.trim();
					}
				}

				// For some reason, entire parts of HTML notes don't show up in
				// the viewer when there's an anchor tag without an "href"
				// attribute. It doesn't always happen and it seems to depend on
				// what else is in the note but in any case adding the "href"
				// fixes it. https://github.com/laurent22/joplin/issues/5687
				if (name === 'a' && !safeAttrs['href']) {
					safeAttrs['href'] = '#';
				}

				output.push(makeHtmlTag(name, safeAttrs));
			},

			ontext: (decodedText: string) => {
				if (disallowedTagDepth) return;

				if (currentTag() === 'style') {
					// For CSS, we have to put the style as-is inside the tag
					// because if we html-entities encode it, it's not going to
					// work. But it's ok because JavaScript won't run within the
					// style tag. Ideally CSS should be loaded from an external
					// file.

					// We however have to encode at least the `<` characters to
					// prevent certain XSS injections that would rely on the
					// content not being encoded (see sanitize_13.md)
					// NOTE(review): the replace() arguments and the "else"
					// branch were reconstructed - confirm.
					output.push(decodedText.replace(/</g, '&lt;'));
				} else {
					output.push(htmlentities(decodedText));
				}
			},

			onclosetag: (name: string) => {
				const current = currentTag();

				if (current === name.toLowerCase()) tagStack.pop();

				// The Markdown sanitization code can result in calls like this:
				//     sanitizeHtml('<invalid>')
				//     sanitizeHtml('</invalid>')
				// Thus, we need to be able to remove '</invalid>', even if
				// there is no corresponding opening tag.
				if (disallowedTags.includes(current) || disallowedTags.includes(name)) {
					if (disallowedTagDepth > 0) {
						disallowedTagDepth--;
					}
					return;
				}

				if (disallowedTagDepth) return;

				// Test the tag being closed, not currentTag(): the stack has
				// already been popped at this point, so currentTag() would be
				// the parent. That made children of a FORM close with a DIV
				// end tag while the FORM's own end tag was emitted unchanged.
				if (replaceWithDivTags.includes(name)) {
					// NOTE(review): literal reconstructed - confirm.
					output.push('</div>');
					return;
				}

				if (isSelfClosingTag(name)) return;
				output.push(`</${name}>`);
			},

		}, { decodeEntities: true });

		parser.write(html);
		parser.end();

		return output.join('');
	}

}

/**
 * Returns the content of the BODY tag if there is one, or otherwise the whole
 * of `html` (which would then be a fragment of HTML).
 */
export const extractHtmlBody = (html: string) => {
	let inBody = false;
	let bodyFound = false;

	const output: string[] = [];

	const parser = new htmlparser2.Parser({

		onopentag: (name: string, attrs: Record<string, string>) => {
			if (name === 'body') {
				inBody = true;
				bodyFound = true;
				return;
			}

			if (inBody) {
				output.push(makeHtmlTag(name, attrs));
			}
		},

		ontext: (encodedText: string) => {
			if (inBody) output.push(encodedText);
		},

		onclosetag: (name: string) => {
			if (inBody && name === 'body') inBody = false;

			if (inBody) {
				if (isSelfClosingTag(name)) return;
				output.push(`</${name}>`);
			}
		},

	}, { decodeEntities: false });

	parser.write(html);
	parser.end();

	return bodyFound ? output.join('') : html;
};

/**
 * If `html` is a single leading paragraph followed only by empty DIV, STYLE
 * or SPAN elements without a "style" attribute, returns the inner HTML of
 * that paragraph. Otherwise returns `html` unchanged.
 */
export const removeWrappingParagraphAndTrailingEmptyElements = (html: string) => {
	// NOTE(review): the '<p>' literal was reconstructed from the function
	// name and the `name === 'p'` check below - confirm.
	if (!html.startsWith('<p>')) return html;

	const stack: string[] = [];
	const output: string[] = [];
	let inFirstParagraph = true;
	let canSimplify = true;

	const parser = new htmlparser2.Parser({

		onopentag: (name: string, attrs: Record<string, string>) => {
			if (inFirstParagraph && stack.length > 0) {
				// Nested inside the first paragraph: keep the tag.
				output.push(makeHtmlTag(name, attrs));
			} else if (!inFirstParagraph && attrs.style) {
				canSimplify = false;
			}

			stack.push(name);
		},

		ontext: (encodedText: string) => {
			if (encodedText.trim() && !inFirstParagraph) {
				// Visible text after the first paragraph.
				canSimplify = false;
			} else {
				output.push(encodedText);
			}
		},

		onclosetag: (name: string) => {
			stack.pop();

			if (stack.length === 0 && name === 'p') {
				inFirstParagraph = false;
			} else if (inFirstParagraph) {
				if (isSelfClosingTag(name)) return;
				output.push(`</${name}>`);

			// Many elements, even if empty, can still be visible (for
			// example, an HR). Don't simplify if these elements are present.
			} else if (!['div', 'style', 'span'].includes(name)) {
				canSimplify = false;
			}
		},

	});

	parser.write(html);
	parser.end();

	return canSimplify ? output.join('') : html;
};

/**
 * Returns true if `html` contains exactly one IMG tag, no non-whitespace
 * text, and no other tag apart from META, HEAD, BODY and HTML.
 */
export const htmlDocIsImageOnly = (html: string) => {
	let imageCount = 0;
	let nonImageFound = false;
	let textFound = false;

	// Ignore these tags that do not result in any Markdown (or HTML) code being generated.
	const ignoredTags = ['meta', 'head', 'body', 'html'];

	const parser = new htmlparser2.Parser({

		onopentag: (name: string) => {
			if (name === 'img') {
				imageCount++;
			} else if (ignoredTags.includes(name)) {
				// Skip
			} else {
				nonImageFound = true;
			}
		},

		ontext: (text: string) => {
			if (text.trim()) textFound = true;
		},

	});

	parser.write(html);
	parser.end();

	return imageCount === 1 && !nonImageFound && !textFound;
};

export default new HtmlUtils();