mirror of https://github.com/laurent22/joplin.git synced 2025-03-20 20:55:18 +02:00

237 lines
6.1 KiB
Raw Normal View History

const Entities = require('html-entities').AllHtmlEntities;
const htmlentities = new Entities().encode;
const htmlparser2 = require('@joplin/fork-htmlparser2');
// Patterns used to locate <img ... src="..."> and <a ... href="..."> opening
// tags in raw HTML strings. Both are global and case-insensitive.
//
// [\s\S] is used instead of . so that a tag spread over several lines still
// matches: https://stackoverflow.com/a/16119722/561309
//
// Capture groups, in order:
//   1. everything between the tag name and the src/href attribute
//   2. the attribute value (without the surrounding quotes)
//   3. everything between the closing quote and the final ">"
//
// NOTE(review): the lazy [\s\S]*? groups are not limited to a single tag and
// the tag name is not followed by a word boundary, so "<a" can also be the
// start of a longer tag name (e.g. "<area href=...>"). Confirm callers only
// rely on these for well-formed input.
const imageRegex = /<img([\s\S]*?)src=["']([\s\S]*?)["']([\s\S]*?)>/gi;
const anchorRegex = /<a([\s\S]*?)href=["']([\s\S]*?)["']([\s\S]*?)>/gi;
const selfClosingElements = [
class HtmlUtils {
/**
 * Serialises an attribute map into a single string, with the individual
 * entries separated by one space.
 *
 * Only the own enumerable properties of `attr` are considered; inherited
 * properties are skipped.
 *
 * NOTE(review): the statement that appends each entry to `output` is not
 * visible in this view. Presumably it pushes an entity-encoded
 * `name="value"` pair - confirm against the complete file.
 *
 * @param attr Object whose keys are attribute names.
 * @returns The joined attribute string (empty when nothing was collected).
 */
attributesHtml(attr: any) {
const output = [];
for (const n in attr) {
// Ignore anything coming from the prototype chain.
if (!attr.hasOwnProperty(n)) continue;
return output.join(' ');
/**
 * Runs `callback` for every `<img ... src="...">` tag matched by
 * `imageRegex` and rewrites the tag according to the action it returns.
 *
 * The callback receives `{ src }` and may return:
 * - a falsy value: the tag is re-emitted with its original source;
 * - `{ type: 'replaceElement', html }`: the whole tag is replaced by `html`;
 * - `{ type: 'replaceSource', src }`: only the `src` value is replaced;
 * - `{ type: 'setAttributes', attrs }`: the `src` attribute is replaced by
 *   the attributes serialised from `attrs`.
 *
 * @param html HTML to process. A falsy value yields an empty string.
 * @param callback Decides what to do with each image.
 * @returns The rewritten HTML.
 * @throws Error when the callback returns an action with an unknown type.
 */
processImageTags(html: string, callback: Function) {
	if (!html) return '';

	const rewriteTag = (_match: string, leading: string, source: string, trailing: string) => {
		const action = callback({ src: source });

		// No action requested: emit the tag again with its source untouched.
		if (!action) return `<img${leading}src="${source}"${trailing}>`;

		switch (action.type) {
		case 'replaceElement':
			return action.html;
		case 'replaceSource':
			return `<img${leading}src="${action.src}"${trailing}>`;
		case 'setAttributes':
			return `<img${leading}${this.attributesHtml(action.attrs)}${trailing}>`;
		default:
			throw new Error(`Invalid action: ${action.type}`);
		}
	};

	return html.replace(imageRegex, rewriteTag);
}
/**
 * Runs `callback` for every `<a ... href="...">` tag matched by
 * `anchorRegex` and rewrites the tag according to the action it returns.
 *
 * The callback receives `{ href }` and may return:
 * - a falsy value: the tag is re-emitted with its original href;
 * - `{ type: 'replaceElement', html }`: the whole tag is replaced by `html`;
 * - `{ type: 'replaceSource', href }`: only the `href` value is replaced;
 * - `{ type: 'setAttributes', attrs }`: the `href` attribute is replaced by
 *   the attributes serialised from `attrs`.
 *
 * In every case except `replaceElement` the result is still an `<a>` tag.
 *
 * @param html HTML to process. A falsy value yields an empty string.
 * @param callback Decides what to do with each anchor.
 * @returns The rewritten HTML.
 * @throws Error when the callback returns an action with an unknown type.
 */
processAnchorTags(html: string, callback: Function) {
	if (!html) return '';

	return html.replace(anchorRegex, (_v, before, href, after) => {
		const action = callback({ href: href });

		if (!action) return `<a${before}href="${href}"${after}>`;

		if (action.type === 'replaceElement') {
			return action.html;
		}

		if (action.type === 'replaceSource') {
			// The element must stay an anchor; only its target changes.
			return `<a${before}href="${action.href}"${after}>`;
		}

		if (action.type === 'setAttributes') {
			const attrHtml = this.attributesHtml(action.attrs);
			// Same here: emit <a>, not <img>, so the matching </a> stays valid.
			return `<a${before}${attrHtml}${after}>`;
		}

		throw new Error(`Invalid action: ${action.type}`);
	});
}
/**
 * Tells whether `tagName` is listed in `selfClosingElements`.
 * The comparison is done on the lower-cased tag name.
 *
 * @param tagName Tag name in any letter case.
 * @returns `true` when the tag is in the list, `false` otherwise.
 */
isSelfClosingTag(tagName: string) {
	const normalizedName = tagName.toLowerCase();
	return selfClosingElements.indexOf(normalizedName) !== -1;
}
// TODO: copied from @joplin/lib
/**
 * Extracts the text content of an HTML string using the htmlparser2
 * streaming parser, then collapses every run of whitespace into one space.
 *
 * Text whose innermost open tag is one of `disallowedTags` is skipped.
 *
 * NOTE(review): in this view the handler bodies that push onto `tagStack`
 * and `output`, and the calls that feed `html` to the parser, are not
 * visible. The bare timestamp lines look like residue from a blame view
 * rather than code. Confirm both against the complete file.
 *
 * @param html HTML to strip.
 * @returns The collected text with whitespace normalised.
 */
stripHtml(html: string) {
// Text fragments collected so far; joined at the end.
const output: string[] = [];
2020-11-05 16:58:23 +00:00
// Names of the tags currently considered open, innermost last.
const tagStack: string[] = [];
2020-11-05 16:58:23 +00:00
// Returns the innermost open tag, or '' when no tag is open.
const currentTag = () => {
if (!tagStack.length) return '';
return tagStack[tagStack.length - 1];
// Text directly inside these tags is not kept.
const disallowedTags = ['script', 'style', 'head', 'iframe', 'frameset', 'frame', 'object', 'base'];
const parser = new htmlparser2.Parser({
onopentag: (name: string) => {
2020-11-05 16:58:23 +00:00
ontext: (decodedText: string) => {
2020-11-05 16:58:23 +00:00
// Drop text that belongs to a disallowed tag.
if (disallowedTags.includes(currentTag())) return;
onclosetag: (name: string) => {
2020-11-05 16:58:23 +00:00
// Only pop when the closing tag matches the innermost open tag, so a
// stray closing tag does not unbalance the stack.
if (currentTag() === name.toLowerCase()) tagStack.pop();
}, { decodeEntities: true });
// Collapse every whitespace run (including newlines) into a single space.
return output.join('').replace(/\s+/g, ' ');
/**
 * Re-serialises an HTML string through the htmlparser2 streaming parser
 * while dropping content that could execute script: tags listed in
 * `disallowedTags` and every attribute whose name starts with "on".
 *
 * NOTE(review): in this view the statements that push onto `tagStack` and
 * `output`, and the calls that feed `html` to the parser, are not visible.
 * Confirm the exact output format against the complete file.
 *
 * @param html HTML to sanitise.
 * @param options Optional settings; `addNoMdConvClass` (default `false`)
 *   adds the "jop-noMdConv" class to the emitted tags.
 * @returns The fragments collected in `output`, concatenated.
 */
sanitizeHtml(html: string, options: any = null) {
// Merge the caller's options over the defaults without mutating the
// object that was passed in.
options = Object.assign({}, {
// If true, adds a "jop-noMdConv" class to all the tags.
// It can be used afterwards to restore HTML tags in Markdown.
addNoMdConvClass: false,
}, options);
// HTML fragments collected so far; joined at the end.
const output: string[] = [];
// Names of the tags currently considered open, innermost last.
const tagStack: string[] = [];
// Returns the innermost open tag, or '' when no tag is open.
const currentTag = () => {
if (!tagStack.length) return '';
return tagStack[tagStack.length - 1];
// The BASE tag allows changing the base URL from which files are
// loaded, and that can break several plugins, such as Katex (which
// needs to load CSS files using a relative URL). For that reason
// it is disabled. More info:
// https://github.com/laurent22/joplin/issues/3021
// "link" can be used to escape the parser and inject JavaScript.
// Adding "meta" too for the same reason as it shouldn't be used in
// notes anyway.
const disallowedTags = ['script', 'iframe', 'frameset', 'frame', 'object', 'base', 'embed', 'link', 'meta'];
const parser = new htmlparser2.Parser({
onopentag: (name: string, attrs: any) => {
// Nothing is emitted for a tag when the innermost open tag is disallowed.
if (disallowedTags.includes(currentTag())) return;
// Work on a shallow copy so the parser's own attribute object is not
// modified.
attrs = Object.assign({}, attrs);
// Remove all the attributes that start with "on", which
// normally should be JavaScript events. A better solution
// would be to blacklist known events only but it seems the
// list is not well defined [0] and we don't want any to slip
// through the cracks. A side effect of this change is a
// regular harmless attribute that starts with "on" will also
// be removed.
// 0: https://developer.mozilla.org/en-US/docs/Web/Events
// Note: the loop variable `name` shadows the tag name parameter inside
// the loop.
for (const name in attrs) {
if (!attrs.hasOwnProperty(name)) continue;
// Names of one or two characters (including a bare "on") are kept.
if (name.length <= 2) continue;
if (name.toLowerCase().substr(0, 2) !== 'on') continue;
delete attrs[name];
if (options.addNoMdConvClass) {
// Append the marker class once, preserving any existing classes.
let classAttr = attrs['class'] || '';
if (!classAttr.includes('jop-noMdConv')) {
classAttr += ' jop-noMdConv';
attrs['class'] = classAttr.trim();
// Prefix the serialised attributes with a space only when there are any.
let attrHtml = this.attributesHtml(attrs);
if (attrHtml) attrHtml = ` ${attrHtml}`;
const closingSign = this.isSelfClosingTag(name) ? '/>' : '>';
ontext: (decodedText: string) => {
// Drop text that belongs to a disallowed tag.
if (disallowedTags.includes(currentTag())) return;
if (currentTag() === 'style') {
// For CSS, we have to put the style as-is inside the tag because if we html-entities encode
// it, it's not going to work. But it's ok because JavaScript won't run within the style tag.
// Ideally CSS should be loaded from an external file.
} else {
onclosetag: (name: string) => {
// Capture the innermost tag before popping: the checks below must use
// the tag that is being closed, not its parent.
const current = currentTag();
if (current === name.toLowerCase()) tagStack.pop();
// No closing tag is emitted for disallowed or self-closing elements.
if (disallowedTags.includes(current)) return;
if (this.isSelfClosingTag(name)) return;
}, { decodeEntities: true });
return output.join('');
// The module's default export is a single HtmlUtils instance created at load time.
export default new HtmlUtils();