const Entities = require('html-entities').AllHtmlEntities; const htmlentities = new Entities().encode; import { fileUriToPath } from '@joplin/utils/url'; const htmlparser2 = require('@joplin/fork-htmlparser2'); // [\s\S] instead of . for multiline matching // https://stackoverflow.com/a/16119722/561309 const imageRegex = //gi; const anchorRegex = //gi; const selfClosingElements = [ 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame', 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr', ]; interface SanitizeHtmlOptions { addNoMdConvClass?: boolean; allowedFilePrefixes?: string[]; } export const attributesHtml = (attr: Record) => { const output = []; for (const n in attr) { if (!attr.hasOwnProperty(n)) continue; if (!attr[n]) { output.push(n); } else { output.push(`${n}="${htmlentities(attr[n])}"`); } } return output.join(' '); }; export const isSelfClosingTag = (tagName: string) => { return selfClosingElements.includes(tagName.toLowerCase()); }; class HtmlUtils { // eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied public processImageTags(html: string, callback: Function) { if (!html) return ''; return html.replace(imageRegex, (_v, before, src, after) => { const action = callback({ src: src }); if (!action) return ``; if (action.type === 'replaceElement') { return action.html; } if (action.type === 'replaceSource') { return ``; } if (action.type === 'setAttributes') { const attrHtml = attributesHtml(action.attrs); return ``; } throw new Error(`Invalid action: ${action.type}`); }); } // eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied public processAnchorTags(html: string, callback: Function) { if (!html) return ''; interface Action { type: 'replaceElement' | 'replaceSource' | 'setAttributes'; href: string; html: string; attrs: Record; } return html.replace(anchorRegex, (_v, before, href, after) => { const action: Action = callback({ href: href }); if (!action) return ``; if (action.type === 'replaceElement') { return action.html; } if (action.type === 'replaceSource') { return ``; } if (action.type === 'setAttributes') { const attrHtml = attributesHtml(action.attrs); return ``; } throw new Error(`Invalid action: ${action.type}`); }); } public stripHtml(html: string) { const output: string[] = []; const tagStack: string[] = []; const currentTag = () => { if (!tagStack.length) return ''; return tagStack[tagStack.length - 1]; }; const disallowedTags = ['script', 'style', 'head', 'iframe', 'frameset', 'frame', 'object', 'base']; const parser = new htmlparser2.Parser({ onopentag: (name: string) => { tagStack.push(name.toLowerCase()); }, ontext: (decodedText: string) => { if (disallowedTags.includes(currentTag())) return; output.push(decodedText); }, onclosetag: (name: string) => { if (currentTag() === name.toLowerCase()) tagStack.pop(); }, }, { decodeEntities: true }); parser.write(html); parser.end(); // In general, we want to get back plain text from this function, so all // HTML entities are decoded. Howver, to prevent XSS attacks, we // re-encode all the "<" characters, which should break any attempt to // inject HTML tags. return output.join('') .replace(/\s+/g, ' ') .replace(/ { if (!tagStack.length) return ''; return tagStack[tagStack.length - 1]; }; // When we encounter a disallowed tag, all the other tags within it are // going to be skipped too. This is necessary to prevent certain XSS // attacks. See sanitize_11.md let disallowedTagDepth = 0; // The BASE tag allows changing the base URL from which files are // loaded, and that can break several plugins, such as Katex (which // needs to load CSS files using a relative URL). For that reason it is // disabled. More info: https://github.com/laurent22/joplin/issues/3021 // // "link" can be used to escape the parser and inject JavaScript. Adding // "meta" too for the same reason as it shouldn't be used in notes // anyway. // // There are too many issues with SVG tags and to handle them properly // we should parse them separately. Currently we are not so it is better // to disable them. SVG graphics are still supported via the IMG tag. const disallowedTags = [ 'script', 'iframe', 'frameset', 'frame', 'object', 'base', 'embed', 'link', 'meta', 'noscript', 'button', 'form', 'input', 'select', 'textarea', 'option', 'optgroup', 'svg', // Disallow map and area tags: links are currently not // sanitized as well as links, allowing potential sandbox // escape. 'map', 'area', ]; const parser = new htmlparser2.Parser({ onopentag: (name: string, attrs: Record) => { // Note: "name" and attribute names are always lowercase even // when the input is not. So there is no need to call // "toLowerCase" on them. tagStack.push(name); if (disallowedTags.includes(currentTag())) { disallowedTagDepth++; return; } if (disallowedTagDepth) return; attrs = { ...attrs }; // Remove all the attributes that start with "on", which // normally should be JavaScript events. A better solution // would be to blacklist known events only but it seems the // list is not well defined [0] and we don't want any to slip // throught the cracks. A side effect of this change is a // regular harmless attribute that starts with "on" will also // be removed. // 0: https://developer.mozilla.org/en-US/docs/Web/Events for (const attrName in attrs) { if (!attrs.hasOwnProperty(attrName)) continue; if (attrName.length <= 2) continue; if (attrName.substr(0, 2) !== 'on') continue; delete attrs[attrName]; } // Make sure that only non-acceptable URLs are filtered out. In // particular we want to exclude `javascript:` URLs. This // applies to A tags, and also AREA ones but to be safe we don't // filter on the tag name and process all HREF attributes. if ('href' in attrs && !this.isAcceptedUrl(attrs['href'], options.allowedFilePrefixes)) { attrs['href'] = '#'; } // We need to clear any such attribute, otherwise it will // make any arbitrary link open within the application. if ('data-from-md' in attrs) { delete attrs['data-from-md']; } if (options.addNoMdConvClass) { let classAttr = attrs['class'] || ''; if (!classAttr.includes('jop-noMdConv')) { classAttr += ' jop-noMdConv'; attrs['class'] = classAttr.trim(); } } // For some reason, entire parts of HTML notes don't show up in // the viewer when there's an anchor tag without an "href" // attribute. It doesn't always happen and it seems to depend on // what else is in the note but in any case adding the "href" // fixes it. https://github.com/laurent22/joplin/issues/5687 if (name === 'a' && !attrs['href']) { attrs['href'] = '#'; } let attrHtml = attributesHtml(attrs); if (attrHtml) attrHtml = ` ${attrHtml}`; const closingSign = isSelfClosingTag(name) ? '/>' : '>'; output.push(`<${name}${attrHtml}${closingSign}`); }, ontext: (decodedText: string) => { if (disallowedTagDepth) return; if (currentTag() === 'style') { // For CSS, we have to put the style as-is inside the tag // because if we html-entities encode it, it's not going to // work. But it's ok because JavaScript won't run within the // style tag. Ideally CSS should be loaded from an external // file. // We however have to encode at least the `<` characters to // prevent certain XSS injections that would rely on the // content not being encoded (see sanitize_13.md) output.push(decodedText.replace(/ { const current = currentTag(); if (current === name.toLowerCase()) tagStack.pop(); // The Markdown sanitization code can result in calls like this: // sanitizeHtml('') // sanitizeHtml('') // Thus, we need to be able to remove '', even if there is no // corresponding opening tag. if (disallowedTags.includes(current) || disallowedTags.includes(name)) { if (disallowedTagDepth > 0) { disallowedTagDepth--; } return; } if (disallowedTagDepth) return; if (isSelfClosingTag(name)) return; output.push(``); }, }, { decodeEntities: true }); parser.write(html); parser.end(); return output.join(''); } } const makeHtmlTag = (name: string, attrs: Record) => { let attrHtml = attributesHtml(attrs); if (attrHtml) attrHtml = ` ${attrHtml}`; const closingSign = isSelfClosingTag(name) ? '/>' : '>'; return `<${name}${attrHtml}${closingSign}`; }; // Will return either the content of the tag if it exists, or the whole // HTML (which would be a fragment of HTML) export const extractHtmlBody = (html: string) => { let inBody = false; let bodyFound = false; const output: string[] = []; const parser = new htmlparser2.Parser({ onopentag: (name: string, attrs: Record) => { if (name === 'body') { inBody = true; bodyFound = true; return; } if (inBody) { output.push(makeHtmlTag(name, attrs)); } }, ontext: (encodedText: string) => { if (inBody) output.push(encodedText); }, onclosetag: (name: string) => { if (inBody && name === 'body') inBody = false; if (inBody) { if (isSelfClosingTag(name)) return; output.push(``); } }, }, { decodeEntities: false }); parser.write(html); parser.end(); return bodyFound ? output.join('') : html; }; export const htmlDocIsImageOnly = (html: string) => { let imageCount = 0; let nonImageFound = false; let textFound = false; const parser = new htmlparser2.Parser({ onopentag: (name: string) => { if (name === 'img') { imageCount++; } else if (['meta'].includes(name)) { // We allow these tags since they don't print anything } else { nonImageFound = true; } }, ontext: (text: string) => { if (text.trim()) textFound = true; }, }); parser.write(html); parser.end(); return imageCount === 1 && !nonImageFound && !textFound; }; export default new HtmlUtils();