2020-11-05 18:58:23 +02:00
|
|
|
const urlUtils = require('./urlUtils.js');
|
2019-07-15 22:43:28 +02:00
|
|
|
const Entities = require('html-entities').AllHtmlEntities;
|
2019-07-29 15:43:53 +02:00
|
|
|
const htmlentities = new Entities().encode;
|
2020-11-07 17:59:37 +02:00
|
|
|
const htmlparser2 = require('@joplin/fork-htmlparser2');
|
2019-07-15 01:44:45 +02:00
|
|
|
|
2019-07-21 01:18:51 +02:00
|
|
|
// Patterns used to locate <img src="..."> and <a href="..."> tags in raw HTML.
//
// [\s\S] instead of . for multiline matching
// https://stackoverflow.com/a/16119722/561309
//
// Capture groups, for both patterns:
//   1 - everything between the tag name and the src/href attribute
//   2 - the attribute value (without its surrounding quotes)
//   3 - everything between the closing quote and the final ">"
//
// Both patterns carry the "g" flag, so they keep state in `lastIndex` when
// used with exec()/test(). Callers must either run an exec() loop to
// completion or reset `lastIndex` before reusing them.
const imageRegex = /<img([\s\S]*?)src=["']([\s\S]*?)["']([\s\S]*?)>/gi;
const anchorRegex = /<a([\s\S]*?)href=["']([\s\S]*?)["']([\s\S]*?)>/gi;
|
2019-07-21 01:18:51 +02:00
|
|
|
|
2020-06-15 18:10:51 +02:00
|
|
|
// Lower-case names of the HTML elements treated as self-closing
// by HtmlUtils.isSelfClosingTag().
const selfClosingElements = [
	'area',
	'base',
	'basefont',
	'br',
	'col',
	'command',
	'embed',
	'frame',
	'hr',
	'img',
	'input',
	'isindex',
	'keygen',
	'link',
	'meta',
	'param',
	'source',
	'track',
	'wbr',
];
|
|
|
|
|
2019-07-15 22:43:28 +02:00
|
|
|
// Helpers for inspecting and rewriting HTML strings. Image and anchor tags
// are handled with the module-level regular expressions; text extraction
// uses the htmlparser2 streaming parser.
class HtmlUtils {

	// Returns the innerHTML of doc.head and doc.body (whichever are present),
	// joined by a newline. `doc` must be an object; a missing head or body is
	// simply skipped.
	headAndBodyHtml(doc) {
		const output = [];
		if (doc.head) output.push(doc.head.innerHTML);
		if (doc.body) output.push(doc.body.innerHTML);
		return output.join('\n');
	}

	// Returns true if `tagName` (compared case-insensitively) is listed in
	// selfClosingElements.
	isSelfClosingTag(tagName) {
		return selfClosingElements.includes(tagName.toLowerCase());
	}

	// Returns the `src` value of every <img> tag found in `html`, in document
	// order. Empty values are dropped. An empty/missing `html` yields [].
	//
	// Returns the **encoded** URLs, so to be useful they should be decoded again before use.
	extractImageUrls(html) {
		if (!html) return [];

		const output = [];
		let matches;

		// imageRegex is a shared global ("g") regex, so exec() resumes from its
		// lastIndex. Reset it so that a previous interrupted scan can never
		// make this one skip the beginning of the string.
		imageRegex.lastIndex = 0;

		while ((matches = imageRegex.exec(html))) {
			output.push(matches[2]);
		}

		return output.filter(url => !!url);
	}

	// Replaces the `src` of every <img> tag in `html` with the value returned
	// by callback(src). Returns the rewritten HTML.
	replaceImageUrls(html, callback) {
		return this.processImageTags(html, data => {
			const newSrc = callback(data.src);
			return {
				type: 'replaceSource',
				src: newSrc,
			};
		});
	}

	// Calls callback({ src }) for every <img> tag in `html` and rewrites the
	// tag according to the returned action:
	//
	//   falsy                               - tag is re-emitted with its original src
	//   { type: 'replaceElement', html }    - whole tag is replaced by `html`
	//   { type: 'replaceSource', src }      - only the src value is replaced
	//   { type: 'setAttributes', attrs }    - the src attribute is replaced by the
	//                                         attributes in `attrs` (see attributesHtml)
	//
	// Any other action type throws an Error. The src attribute is always
	// re-emitted with double quotes. An empty/missing `html` yields ''.
	processImageTags(html, callback) {
		if (!html) return '';

		return html.replace(imageRegex, (v, before, src, after) => {
			const action = callback({ src: src });

			if (!action) return `<img${before}src="${src}"${after}>`;

			if (action.type === 'replaceElement') {
				return action.html;
			}

			if (action.type === 'replaceSource') {
				return `<img${before}src="${action.src}"${after}>`;
			}

			if (action.type === 'setAttributes') {
				const attrHtml = this.attributesHtml(action.attrs);
				return `<img${before}${attrHtml}${after}>`;
			}

			throw new Error(`Invalid action: ${action.type}`);
		});
	}

	// Rewrites the `href` of every <a> tag in `html` with the result of
	// urlUtils.prependBaseUrl(href, baseUrl). An empty/missing `html` yields ''.
	prependBaseUrl(html, baseUrl) {
		if (!html) return '';

		return html.replace(anchorRegex, (v, before, href, after) => {
			const newHref = urlUtils.prependBaseUrl(href, baseUrl);
			return `<a${before}href="${newHref}"${after}>`;
		});
	}

	// Serialises the own enumerable properties of `attr` as a space-separated
	// list of name="value" pairs. Values are passed through htmlentities();
	// names are emitted as-is.
	attributesHtml(attr) {
		const output = [];

		for (const n in attr) {
			// Go through Object.prototype so that objects without a prototype
			// (Object.create(null)) or with their own "hasOwnProperty" key work.
			if (!Object.prototype.hasOwnProperty.call(attr, n)) continue;
			output.push(`${n}="${htmlentities(attr[n])}"`);
		}

		return output.join(' ');
	}

	// Returns the text content of `html` with all markup removed and every run
	// of whitespace collapsed to a single space. Text whose innermost open tag
	// is one of the disallowed tags below is discarded. An empty/missing
	// `html` yields '' (same convention as the other methods of this class).
	stripHtml(html) {
		// Without this guard null/undefined would be handed to the parser.
		if (!html) return '';

		const output = [];

		// Names (lower-case) of the tags currently open, innermost last.
		const tagStack = [];

		const currentTag = () => {
			if (!tagStack.length) return '';
			return tagStack[tagStack.length - 1];
		};

		const disallowedTags = ['script', 'style', 'head', 'iframe', 'frameset', 'frame', 'object', 'base'];

		const parser = new htmlparser2.Parser({

			onopentag: (name) => {
				tagStack.push(name.toLowerCase());
			},

			ontext: (decodedText) => {
				// NOTE(review): only the innermost tag is checked, so text inside
				// an allowed tag nested in a disallowed one (e.g. <head><title>)
				// is kept - confirm this is intended.
				if (disallowedTags.includes(currentTag())) return;
				output.push(decodedText);
			},

			onclosetag: (name) => {
				if (currentTag() === name.toLowerCase()) tagStack.pop();
			},

		}, { decodeEntities: true });

		parser.write(html);
		parser.end();

		return output.join('').replace(/\s+/g, ' ');
	}

}
|
2019-07-14 17:00:02 +02:00
|
|
|
|
2019-07-15 22:43:28 +02:00
|
|
|
// The module exports a single shared HtmlUtils instance rather than the class.
const htmlUtils = new HtmlUtils();

module.exports = htmlUtils;
|