) => {
// Note: "name" and attribute names are always lowercase even
// when the input is not. So there is no need to call
// "toLowerCase" on them.
tagStack.push(name);
if (disallowedTags.includes(currentTag())) {
disallowedTagDepth++;
return;
}
if (disallowedTagDepth) return;
if (replaceWithDivTags.includes(currentTag())) {
output.push('');
return;
}
attrs = { ...attrs };
// Remove all the attributes that start with "on", which
// normally should be JavaScript events. A better solution
// would be to blacklist known events only but it seems the
// list is not well defined [0] and we don't want any to slip
// through the cracks. A side effect of this change is a
// regular harmless attribute that starts with "on" will also
// be removed.
// 0: https://developer.mozilla.org/en-US/docs/Web/Events
for (const attrName in attrs) {
if (!attrs.hasOwnProperty(attrName)) continue;
if (attrName.length <= 2) continue;
if (attrName.substr(0, 2) !== 'on') continue;
delete attrs[attrName];
}
// Make sure that only non-acceptable URLs are filtered out. In
// particular we want to exclude `javascript:` URLs. This
// applies to A tags, and also AREA ones but to be safe we don't
// filter on the tag name and process all HREF attributes.
if ('href' in attrs && !this.isAcceptedUrl(attrs['href'], options.allowedFilePrefixes)) {
attrs['href'] = '#';
}
// We need to clear any such attribute, otherwise it will
// make any arbitrary link open within the application.
if ('data-from-md' in attrs) {
delete attrs['data-from-md'];
}
if (options.addNoMdConvClass) {
let classAttr = attrs['class'] || '';
if (!classAttr.includes('jop-noMdConv')) {
classAttr += ' jop-noMdConv';
attrs['class'] = classAttr.trim();
}
}
// For some reason, entire parts of HTML notes don't show up in
// the viewer when there's an anchor tag without an "href"
// attribute. It doesn't always happen and it seems to depend on
// what else is in the note but in any case adding the "href"
// fixes it. https://github.com/laurent22/joplin/issues/5687
if (name === 'a' && !attrs['href']) {
attrs['href'] = '#';
}
let attrHtml = attributesHtml(attrs);
if (attrHtml) attrHtml = ` ${attrHtml}`;
const closingSign = isSelfClosingTag(name) ? '/>' : '>';
output.push(`<${name}${attrHtml}${closingSign}`);
},
ontext: (decodedText: string) => {
if (disallowedTagDepth) return;
if (currentTag() === 'style') {
// For CSS, we have to put the style as-is inside the tag
// because if we html-entities encode it, it's not going to
// work. But it's ok because JavaScript won't run within the
// style tag. Ideally CSS should be loaded from an external
// file.
// We however have to encode at least the `<` characters to
// prevent certain XSS injections that would rely on the
// content not being encoded (see sanitize_13.md)
output.push(decodedText.replace(/ {
const current = currentTag();
if (current === name.toLowerCase()) tagStack.pop();
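// The Markdown sanitization code can result in the opening and the
// closing tag of a disallowed element being passed in separate calls:
// sanitizeHtml('<disallowedTag>')
// sanitizeHtml('</disallowedTag>')
// Thus, we need to be able to remove the closing tag, even if there is no
// corresponding opening tag.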
if (disallowedTags.includes(current) || disallowedTags.includes(name)) {
if (disallowedTagDepth > 0) {
disallowedTagDepth--;
}
return;
}
if (disallowedTagDepth) return;
if (replaceWithDivTags.includes(currentTag())) {
output.push('
');
return;
}
if (isSelfClosingTag(name)) return;
output.push(`${name}>`);
},
}, { decodeEntities: true });
parser.write(html);
parser.end();
return output.join('');
}
}
/**
 * Serialises an opening tag (or a complete self-closing tag) to HTML.
 *
 * @param name - Tag name, written to the output as-is.
 * @param attrs - Attribute name/value pairs; they are rendered by
 * `attributesHtml` and separated from the tag name by a single space.
 * @returns `<name attrs>` for regular tags, or `<name attrs/>` when
 * `isSelfClosingTag(name)` is true. No space is emitted when there are no
 * attributes to render.
 */
const makeHtmlTag = (name: string, attrs: Record<string, string>): string => {
	const attrHtml = attributesHtml(attrs);
	// Only add the separating space when there is something to separate.
	const attrPart = attrHtml ? ` ${attrHtml}` : '';
	const closingSign = isSelfClosingTag(name) ? '/>' : '>';
	return `<${name}${attrPart}${closingSign}`;
};
/**
 * Returns the content of the `<body>` tag if the document has one, or the
 * whole input otherwise (in which case the input is taken to be an HTML
 * fragment).
 *
 * The body content is rebuilt from parser events rather than sliced out of
 * the input, so tags are re-serialised via `makeHtmlTag`.
 *
 * @param html - HTML document or fragment.
 * @returns The markup found between `<body>` and `</body>`, or `html`
 * unchanged when no `<body>` tag was encountered.
 */
export const extractHtmlBody = (html: string): string => {
	let inBody = false;
	let bodyFound = false;
	const output: string[] = [];

	const parser = new htmlparser2.Parser({
		onopentag: (name: string, attrs: Record<string, string>) => {
			if (name === 'body') {
				// The <body> tag itself is not part of the result.
				inBody = true;
				bodyFound = true;
				return;
			}

			if (inBody) {
				output.push(makeHtmlTag(name, attrs));
			}
		},

		ontext: (encodedText: string) => {
			// Entity decoding is disabled below, so the text is pushed
			// through exactly as the parser reports it.
			if (inBody) output.push(encodedText);
		},

		onclosetag: (name: string) => {
			if (inBody && name === 'body') inBody = false;

			if (inBody) {
				// Self-closing tags were already fully written by
				// makeHtmlTag ("<br/>"), so they get no closing tag.
				if (isSelfClosingTag(name)) return;
				output.push(`</${name}>`);
			}
		},
	}, { decodeEntities: false });

	parser.write(html);
	parser.end();

	return bodyFound ? output.join('') : html;
};
/**
 * Tells whether an HTML document consists of exactly one image and nothing
 * else.
 *
 * @param html - HTML document or fragment to inspect.
 * @returns `true` when exactly one `img` tag was seen, no other tag apart
 * from `meta`, `head`, `body` and `html` was opened, and no text other than
 * whitespace was found.
 */
export const htmlDocIsImageOnly = (html: string) => {
	// Tags that do not result in any Markdown (or HTML) code being generated,
	// and therefore do not count as "something other than an image".
	const structuralTags = ['meta', 'head', 'body', 'html'];

	let imgTotal = 0;
	let sawOtherTag = false;
	let sawText = false;

	const parser = new htmlparser2.Parser({
		onopentag: (name: string) => {
			if (name === 'img') {
				imgTotal += 1;
				return;
			}

			if (!structuralTags.includes(name)) sawOtherTag = true;
		},

		ontext: (text: string) => {
			// Whitespace-only text does not count as content.
			if (text.trim()) sawText = true;
		},
	});

	parser.write(html);
	parser.end();

	return !(imgTotal !== 1 || sawOtherTag || sawText);
};
// Default export: an HtmlUtils instance created once, when this module is
// first evaluated.
export default new HtmlUtils();