1
0
mirror of https://github.com/laurent22/joplin.git synced 2024-12-24 10:27:10 +02:00
joplin/packages/lib/htmlUtils.ts

191 lines
4.9 KiB
TypeScript
Raw Normal View History

2020-11-05 18:58:23 +02:00
const urlUtils = require('./urlUtils.js');
const Entities = require('html-entities').AllHtmlEntities;
2019-07-29 15:43:53 +02:00
const htmlentities = new Entities().encode;
const htmlparser2 = require('@joplin/fork-htmlparser2');
const { escapeHtml } = require('./string-utils.js');
// [\s\S] instead of . for multiline matching
// https://stackoverflow.com/a/16119722/561309
2019-07-29 15:43:53 +02:00
// Both patterns expose three capture groups:
//   1. whatever sits between the tag name and the URL attribute,
//   2. the raw (still encoded) attribute value,
//   3. the remainder of the tag, up to but excluding the closing ">".
// They carry the "g" flag, so they keep a `lastIndex` between `exec()` calls.
const imageRegex = /<img([\s\S]*?)src=["']([\s\S]*?)["']([\s\S]*?)>/gi;
const anchorRegex = /<a([\s\S]*?)href=["']([\s\S]*?)["']([\s\S]*?)>/gi;

// Lower-case names of the elements regarded as self-closing (no end tag).
const selfClosingElements = [
	'area', 'base', 'basefont', 'br', 'col',
	'command', 'embed', 'frame', 'hr', 'img',
	'input', 'isindex', 'keygen', 'link', 'meta',
	'param', 'source', 'track', 'wbr',
];
/**
 * Regex- and parser-based helpers for inspecting and rewriting HTML strings:
 * extracting image/anchor URLs, rewriting `<img>` and `<a>` tags, rendering
 * attribute maps and reducing HTML to plain text.
 */
class HtmlUtils {

	/**
	 * Joins the inner HTML of a document's head and body.
	 *
	 * @param doc Object that may expose `head` and/or `body`, each with an
	 *   `innerHTML` property.
	 * @returns The head markup followed by the body markup, separated by a
	 *   newline. A missing part is skipped.
	 */
	public headAndBodyHtml(doc: any) {
		const output = [];
		if (doc.head) output.push(doc.head.innerHTML);
		if (doc.body) output.push(doc.body.innerHTML);
		return output.join('\n');
	}

	/**
	 * Tells whether `tagName` is one of the known self-closing elements.
	 * The comparison is case-insensitive.
	 */
	public isSelfClosingTag(tagName: string) {
		return selfClosingElements.includes(tagName.toLowerCase());
	}

	/**
	 * Collects the second capture group of every match of `regex` in `html`.
	 *
	 * Returns the **encoded** URLs, so to be useful they should be decoded
	 * again before use.
	 *
	 * @param regex Pattern with the `g` flag (without it the `exec()` loop
	 *   would never advance) whose second group is the URL.
	 * @param html Markup to scan; a falsy value yields an empty list.
	 * @returns Non-empty URLs in document order.
	 */
	private extractUrls(regex: RegExp, html: string) {
		if (!html) return [];

		// The regexes are shared module-level objects with the "g" flag. Start
		// from a known position so that a `lastIndex` left behind by any other
		// user of the same object cannot make us skip the beginning of `html`.
		regex.lastIndex = 0;

		const output: string[] = [];
		let matches: RegExpExecArray | null;
		while ((matches = regex.exec(html))) {
			output.push(matches[2]);
		}

		return output.filter(url => !!url);
	}

	// Returns the **encoded** URLs, so to be useful they should be decoded again before use.
	public extractImageUrls(html: string) {
		return this.extractUrls(imageRegex, html);
	}

	// Returns the **encoded** URLs, so to be useful they should be decoded again before use.
	public extractAnchorUrls(html: string) {
		return this.extractUrls(anchorRegex, html);
	}

	// Returns the **encoded** URLs, so to be useful they should be decoded again before use.
	// Image URLs come first, followed by anchor URLs.
	public extractFileUrls(html: string) {
		return this.extractImageUrls(html).concat(this.extractAnchorUrls(html));
	}

	/**
	 * Replaces every `src`/`href` attribute value matching `urlToReplace`
	 * with `:/<id>`.
	 *
	 * `urlToReplace` is inserted verbatim into a regular expression, so regex
	 * metacharacters in it are interpreted as such.
	 * NOTE(review): callers presumably pass an already regex-escaped pattern -
	 * confirm before adding escaping here, as that would double-escape.
	 *
	 * @param html Markup to rewrite.
	 * @param urlToReplace Regex source matching the URL to replace.
	 * @param id Identifier placed after the `:/` prefix.
	 */
	public replaceResourceUrl(html: string, urlToReplace: string, id: string) {
		// Look-behind/look-ahead keep the attribute name and quotes untouched.
		const htmlLinkRegex = `(?<=(?:src|href)=["'])${urlToReplace}(?=["'])`;
		const htmlReg = new RegExp(htmlLinkRegex, 'g');
		return html.replace(htmlReg, `:/${id}`);
	}

	/**
	 * Rewrites the `src` of every `<img>` tag with the value returned by
	 * `callback(src)`.
	 */
	public replaceImageUrls(html: string, callback: Function) {
		return this.processImageTags(html, (data: any) => {
			const newSrc = callback(data.src);
			return {
				type: 'replaceSource',
				src: newSrc,
			};
		});
	}

	// Note that the URLs provided by this function are URL-encoded, which is
	// usually what you want for web URLs. But if they are file:// URLs and the
	// file path is going to be used, it will need to be unescaped first. The
	// transformed SRC, must also be escaped before being sent back to this
	// function.
	//
	// `callback` receives `{ src }` for each <img> tag and returns either a
	// falsy value (the tag is re-emitted with its current src) or an action:
	//   - { type: 'replaceElement', html }  : the whole tag becomes `html`
	//   - { type: 'replaceSource', src }    : only the src value changes
	//   - { type: 'setAttributes', attrs }  : the src attribute is replaced by
	//                                         the rendered `attrs`
	// Any other action type throws. Re-emitted src values are always wrapped
	// in double quotes. A falsy `html` yields an empty string.
	public processImageTags(html: string, callback: Function) {
		if (!html) return '';

		return html.replace(imageRegex, (_v: string, before: string, src: string, after: string) => {
			const action = callback({ src: src });

			if (!action) return `<img${before}src="${src}"${after}>`;

			if (action.type === 'replaceElement') {
				return action.html;
			}

			if (action.type === 'replaceSource') {
				return `<img${before}src="${action.src}"${after}>`;
			}

			if (action.type === 'setAttributes') {
				const attrHtml = this.attributesHtml(action.attrs);
				return `<img${before}${attrHtml}${after}>`;
			}

			throw new Error(`Invalid action: ${action.type}`);
		});
	}

	/**
	 * Passes the `href` of every `<a>` tag through `urlUtils.prependBaseUrl`
	 * together with `baseUrl` and writes the result back (double-quoted).
	 * A falsy `html` yields an empty string.
	 */
	public prependBaseUrl(html: string, baseUrl: string) {
		if (!html) return '';

		return html.replace(anchorRegex, (_v: string, before: string, href: string, after: string) => {
			const newHref = urlUtils.prependBaseUrl(href, baseUrl);
			return `<a${before}href="${newHref}"${after}>`;
		});
	}

	/**
	 * Renders an attribute map as `name="value"` pairs separated by spaces.
	 * Values are passed through `htmlentities`; names are emitted as-is.
	 * Only own enumerable properties are rendered.
	 */
	public attributesHtml(attr: any) {
		const output = [];

		for (const n in attr) {
			// Call through Object.prototype so that objects without a prototype
			// (or with their own "hasOwnProperty" key) are handled too.
			if (!Object.prototype.hasOwnProperty.call(attr, n)) continue;
			output.push(`${n}="${htmlentities(attr[n])}"`);
		}

		return output.join(' ');
	}

	/**
	 * Reduces HTML to its text content.
	 *
	 * Text is dropped when the innermost open tag is one of the disallowed
	 * tags (script, style, head, ...). Entities are decoded by the parser and
	 * every run of whitespace in the result is collapsed to a single space.
	 *
	 * @param html Markup to strip; a falsy value yields an empty string, as
	 *   in the other methods of this class.
	 */
	public stripHtml(html: string) {
		// Never hand a missing value to the parser.
		if (!html) return '';

		const output: string[] = [];
		const tagStack: string[] = [];

		const currentTag = () => {
			if (!tagStack.length) return '';
			return tagStack[tagStack.length - 1];
		};

		const disallowedTags = ['script', 'style', 'head', 'iframe', 'frameset', 'frame', 'object', 'base'];

		const parser = new htmlparser2.Parser({

			onopentag: (name: string) => {
				tagStack.push(name.toLowerCase());
			},

			ontext: (decodedText: string) => {
				if (disallowedTags.includes(currentTag())) return;
				output.push(decodedText);
			},

			onclosetag: (name: string) => {
				// Only pop when the closing tag matches the innermost open one.
				if (currentTag() === name.toLowerCase()) tagStack.pop();
			},

		}, { decodeEntities: true });

		parser.write(html);
		parser.end();

		return output.join('').replace(/\s+/g, ' ');
	}

}
// The module's default export is a single shared HtmlUtils instance.
const htmlUtils = new HtmlUtils();
export default htmlUtils;
/**
 * Converts plain text to HTML.
 *
 * A single line is returned HTML-escaped, without any wrapper. Text made of
 * several lines gets each line escaped and wrapped in its own `<p>` element.
 *
 * @param plainText Text to convert. `\n`, `\r` and `\r\n` are each treated as
 *   ONE line break.
 * @returns The HTML markup.
 */
export function plainTextToHtml(plainText: string): string {
	// "\r\n" must be tried first: handling "\r" and "\n" independently would
	// turn every Windows line ending into two breaks and emit an empty
	// paragraph between each pair of lines.
	const lines = plainText.split(/\r\n|\r|\n/);

	const multiline = lines.length > 1;
	const lineOpenTag = multiline ? '<p>' : '';
	const lineCloseTag = multiline ? '</p>' : '';

	return lines
		.map(line => lineOpenTag + escapeHtml(line) + lineCloseTag)
		.join('');
}