1
0
mirror of https://github.com/laurent22/joplin.git synced 2025-08-10 22:11:50 +02:00

Chore: Refactor htmlpack for mobile compatibility (#12174)

This commit is contained in:
Henry Heino
2025-05-19 15:02:26 -07:00
committed by GitHub
parent a4dacd65e6
commit d5830dd3a1
17 changed files with 450 additions and 263 deletions

View File

@@ -1021,7 +1021,10 @@ packages/generator-joplin/generators/app/templates/api/types.js
packages/generator-joplin/generators/app/templates/api_index.js
packages/generator-joplin/generators/app/templates/src/index.js
packages/generator-joplin/tools/updateCategories.js
packages/htmlpack/src/index.js
packages/htmlpack/index.test.js
packages/htmlpack/index.js
packages/htmlpack/packToString.js
packages/htmlpack/utils/parseHtmlAsync.js
packages/lib/ArrayUtils.js
packages/lib/AsyncActionQueue.test.js
packages/lib/AsyncActionQueue.js

5
.gitignore vendored
View File

@@ -995,7 +995,10 @@ packages/generator-joplin/generators/app/templates/api/types.js
packages/generator-joplin/generators/app/templates/api_index.js
packages/generator-joplin/generators/app/templates/src/index.js
packages/generator-joplin/tools/updateCategories.js
packages/htmlpack/src/index.js
packages/htmlpack/index.test.js
packages/htmlpack/index.js
packages/htmlpack/packToString.js
packages/htmlpack/utils/parseHtmlAsync.js
packages/lib/ArrayUtils.js
packages/lib/AsyncActionQueue.test.js
packages/lib/AsyncActionQueue.js

View File

@@ -27,6 +27,7 @@ const localPackages = {
'@joplin/react-native-saf-x': path.resolve(__dirname, '../react-native-saf-x/'),
'@joplin/react-native-alarm-notification': path.resolve(__dirname, '../react-native-alarm-notification/'),
'@joplin/fork-sax': path.resolve(__dirname, '../fork-sax/'),
'@joplin/htmlpack': path.resolve(__dirname, '../htmlpack/'),
};
// cSpell:disable

View File

@@ -1 +1,2 @@
dist/*
dist/*
test-output/

View File

@@ -0,0 +1,2 @@
test-data/
test-output/

View File

@@ -0,0 +1,35 @@
import { exists, mkdir, readFile, remove } from 'fs-extra';
import { join } from 'path';
import htmlpack from '.';
// Scratch directory that receives the packed file; it is recreated before
// every test. The path is relative to the current working directory.
const outputDirectory = './test-output';
describe('htmlpack/index', () => {
// Start each test from an empty output directory so that files left over
// from a previous run can't influence the assertions.
beforeEach(async () => {
if (await exists(outputDirectory)) {
await remove(outputDirectory);
}
await mkdir(outputDirectory);
});
test('should convert HTML into a single file', async () => {
const outputFile = join(outputDirectory, 'output.html');
// Pack the test-data/index.html fixture into the scratch directory.
await htmlpack(join('test-data', 'index.html'), outputFile);
const outputContent = await readFile(outputFile, 'utf8');
// The comparison is exact, so whitespace inside the template literal below
// is significant. In the expected output the stylesheet appears as an inline
// <style> tag and the anchor carries a data: URI plus a download attribute.
expect(outputContent).toBe(`
<html>
<head>
<style>* {
color: red;
}</style>
</head>
<body>
<h1>Test</h1>
<a href="data:text/plain;base64,UmVzb3VyY2Uu" download="resource.txt">Test link.</a>
<img src="" alt="test image"/>
<p>Test paragraph</p>
</body>
</html>`);
});
});

View File

@@ -0,0 +1,28 @@
import * as fs from 'fs-extra';
const Datauri = require('datauri/sync');
import { dirname } from 'path';
import packToString from './packToString';
// Encodes the file at `filePath` with Datauri and returns the `content`
// field of the result. Errors thrown by Datauri propagate to the caller.
const dataUriEncode = (filePath: string): string => Datauri(filePath).content;
// Reads the HTML file at `inputFile`, packs it with packToString (resolving
// resources relative to the directory containing `inputFile`) and writes the
// resulting single-file HTML to `outputFile` as UTF-8.
export default async function htmlpack(inputFile: string, outputFile: string): Promise<void> {
	// File access used by packToString, backed here by fs-extra and Datauri.
	const nodeFileApi = {
		exists: (path: string) => fs.exists(path),
		readFileText: (path: string) => fs.readFile(path, 'utf8'),
		readFileDataUri: async (path: string) => dataUriEncode(path),
	};

	const html = await fs.readFile(inputFile, 'utf8');
	const packedHtml = await packToString(dirname(inputFile), html, nodeFileApi);
	await fs.writeFile(outputFile, packedHtml, 'utf8');
}

View File

@@ -0,0 +1,4 @@
// Jest configuration for this package.
module.exports = {
// Only files ending in .test.js are picked up as tests.
testMatch: ['**/*.test.js'],
// Never look for tests inside installed dependencies.
testPathIgnorePatterns: ['<rootDir>/node_modules/'],
};

View File

@@ -0,0 +1,254 @@
const Entities = require('html-entities').AllHtmlEntities;
import { CssTypes, parse as cssParse, stringify as cssStringify } from '@adobe/css-tools';
import { dirname, basename } from 'path';
import parseHtmlAsync, { HtmlAttrs } from './utils/parseHtmlAsync';
// Lower-case names of the tags that are serialised as `<name .../>` and for
// which no separate closing tag is emitted.
const selfClosingElements = [
	'area', 'base', 'basefont', 'br', 'col',
	'command', 'embed', 'frame', 'hr', 'img',
	'input', 'isindex', 'keygen', 'link', 'meta',
	'param', 'source', 'track', 'wbr',
];
// HTML-encodes `s` with the html-entities encoder, then turns any `&Tab;`
// entity (matched case-insensitively) back into a literal tab character.
const htmlentities = (s: string): string => {
	const encoder = new Entities();
	const encoded = encoder.encode(s);
	return encoded.replace(/&Tab;/ig, '\t');
};
// Serialises an attribute map as space-separated `name="value"` pairs.
// Values are HTML-encoded; inherited properties are skipped.
const attributesHtml = (attrs: HtmlAttrs) => {
	const pairs: string[] = [];
	for (const key in attrs) {
		if (attrs.hasOwnProperty(key)) {
			pairs.push(`${key}="${htmlentities(attrs[key])}"`);
		}
	}
	return pairs.join(' ');
};
// Returns the value of attribute `name`, or an empty string when the
// attribute is missing or has a falsy value.
const attrValue = (attrs: HtmlAttrs, name: string): string => attrs[name] || '';
// True when `tagName` (compared case-insensitively) is listed in
// selfClosingElements.
const isSelfClosingTag = (tagName: string) => {
	const normalisedName = tagName.toLowerCase();
	return selfClosingElements.includes(normalisedName);
};
// File access operations that packToString needs. Callers supply the
// implementation, so packToString itself does not import a file system module.
export type FileApi = {
// Resolves to true if a file exists at `path`.
exists(path: string): Promise<boolean>;
// Resolves to the content of the file at `path` as text.
readFileText(path: string): Promise<string>;
// Resolves to the file at `path` encoded as a data URI.
readFileDataUri(path: string): Promise<string>;
};
// packToString should be able to run in React Native -- don't use fs-extra.
//
// Converts an HTML document into a single self-contained HTML string. The
// input is parsed and re-serialised tag by tag. Along the way:
//
// - <link href="..."> whose target file exists is replaced with an inline
//   <style> tag (url(...) entries of @font-face `src` declarations that point
//   to existing files are replaced with data URIs).
// - <script src="..."> whose target file exists is replaced with an inline
//   <script> tag.
// - <img src="..."> and <a href="..."> whose target file exists get a data
//   URI; anchors additionally get a `download` attribute.
//
// Tags whose target can't be found are written out unchanged.
//
// baseDir: directory that resource paths in the document are resolved against.
// inputFileText: the HTML to pack.
// fs: file access implementation (see FileApi).
// Returns the packed HTML.
const packToString = async (baseDir: string, inputFileText: string, fs: FileApi) => {
	const readFileDataUriSafe = async (path: string) => {
		try {
			return await fs.readFileDataUri(path);
		} catch (error) {
			// If the file path is invalid, the data URI encoder (e.g. Datauri)
			// will throw an exception. Since that particular file can simply be
			// ignored, fall back to an empty string instead of failing.
			// Fixes https://github.com/laurent22/joplin/issues/8305
			return '';
		}
	};

	// Parses `content` as CSS and inlines the url(...) entries of @font-face
	// `src` declarations that point to existing files (relative to cssBaseDir).
	const processCssContent = async (cssBaseDir: string, content: string) => {
		const o = cssParse(content, {
			silent: false,
		});

		for (const rule of o.stylesheet.rules) {
			if (rule.type === 'font-face') {
				for (const declaration of rule.declarations) {
					if (declaration.type === CssTypes.comment) {
						continue;
					}

					if (declaration.property === 'src') {
						// String.replace can't wait for async work, so this is done in two
						// passes: the first pass only collects the URLs and starts one
						// lookup per distinct URL, the second pass (after all lookups have
						// finished) applies the replacements.
						const replacements = new Map<string, string>();
						const replacementTasks: Promise<void>[] = [];
						declaration.value.replace(/url\((.*?)\)/g, (match: string, url: string) => {
							if (replacements.has(url)) return match;
							// Placeholder, so that the same URL is only looked up once.
							replacements.set(url, match);

							replacementTasks.push((async () => {
								const cssFilePath = `${cssBaseDir}/${url}`;
								let replacement;
								if (await fs.exists(cssFilePath)) {
									replacement = `url(${await readFileDataUriSafe(cssFilePath)})`;
								} else {
									replacement = `url(${url})`;
								}
								replacements.set(url, replacement);
							})());
							return match;
						});

						await Promise.all(replacementTasks);
						declaration.value = declaration.value.replace(/url\((.*?)\)/g, (_match: string, url: string) => {
							return replacements.get(url);
						});
					}
				}
			}
		}

		return cssStringify(o);
	};

	// Each process*Tag function returns the replacement HTML for the opening
	// tag, or null when the tag should be written out unchanged.

	const processLinkTag = async (_name: string, attrs: HtmlAttrs) => {
		const href = attrValue(attrs, 'href');
		if (!href) return null;

		const filePath = `${baseDir}/${href}`;
		if (!await fs.exists(filePath)) return null;
		const content = await fs.readFileText(filePath);
		return `<style>${await processCssContent(dirname(filePath), content)}</style>`;
	};

	const processScriptTag = async (_name: string, attrs: HtmlAttrs) => {
		const src = attrValue(attrs, 'src');
		if (!src) return null;

		const scriptFilePath = `${baseDir}/${src}`;
		// A script that can't be found under baseDir (for example a remote URL)
		// is left as is, consistent with how <link>, <img> and <a> are handled,
		// rather than making the whole conversion fail.
		if (!await fs.exists(scriptFilePath)) return null;
		let content = await fs.readFileText(scriptFilePath);

		// There's no simple way to insert arbitrary content in <script> tags.
		// Encoding HTML entities doesn't work because the JS parser will not decode
		// them before parsing. We also can't put the code verbatim since it may
		// contain strings such as `</script>` or `<!--` which would break the HTML
		// file.
		//
		// So it seems the only way is to escape these specific sequences with a
		// backslash. It shouldn't break the JS code and should allow the HTML
		// parser to work as expected.
		//
		// https://stackoverflow.com/a/41302266/561309

		content = content.replace(/<script>/g, '<\\script>');
		content = content.replace(/<\/script>/g, '<\\/script>');
		content = content.replace(/<!--/g, '<\\!--');

		// NOTE(review): onclosetag below also writes `</script>` for this tag, so
		// the closing tag ends up in the output twice -- confirm whether that is
		// intended.
		return `<script>${content}</script>`;
	};

	const processImgTag = async (_name: string, attrs: HtmlAttrs) => {
		const src = attrValue(attrs, 'src');
		if (!src) return null;

		const filePath = `${baseDir}/${src}`;
		if (!await fs.exists(filePath)) return null;

		const modAttrs = { ...attrs };
		delete modAttrs.src;
		return `<img src="${await readFileDataUriSafe(filePath)}" ${attributesHtml(modAttrs)}/>`;
	};

	const processAnchorTag = async (_name: string, attrs: HtmlAttrs) => {
		const href = attrValue(attrs, 'href');
		if (!href) return null;

		const filePath = `${baseDir}/${href}`;
		if (!await fs.exists(filePath)) return null;

		const modAttrs = { ...attrs };
		modAttrs.href = await readFileDataUriSafe(filePath);
		modAttrs.download = basename(href);
		return `<a ${attributesHtml(modAttrs)}>`;
	};

	// Fragments of the packed document, in output order.
	const output: string[] = [];

	interface Tag {
		name: string;
	}

	// Currently open tags; only used to find out which tag a piece of text
	// belongs to.
	const tagStack: Tag[] = [];

	const currentTag = (): Tag => {
		if (!tagStack.length) return { name: '' };
		return tagStack[tagStack.length - 1];
	};

	await parseHtmlAsync(inputFileText, {
		onopentag: async (name: string, attrs: HtmlAttrs) => {
			name = name.toLowerCase();

			let processedResult: string | null = '';

			if (name === 'link') {
				processedResult = await processLinkTag(name, attrs);
			}

			if (name === 'script') {
				processedResult = await processScriptTag(name, attrs);
			}

			if (name === 'img') {
				processedResult = await processImgTag(name, attrs);
			}

			if (name === 'a') {
				processedResult = await processAnchorTag(name, attrs);
			}

			tagStack.push({ name });

			if (processedResult) {
				output.push(processedResult);
			} else {
				// Not a packable tag (or its resource wasn't found): write the
				// opening tag back out with its original attributes.
				let attrHtml = attributesHtml(attrs);
				if (attrHtml) attrHtml = ` ${attrHtml}`;
				const closingSign = isSelfClosingTag(name) ? '/>' : '>';
				output.push(`<${name}${attrHtml}${closingSign}`);
			}
		},

		ontext: async (decodedText: string) => {
			if (currentTag().name === 'style') {
				// For CSS, we have to put the style as-is inside the tag because if we html-entities encode
				// it, it's not going to work. But it's ok because JavaScript won't run within the style tag.
				// Ideally CSS should be loaded from an external file.
				output.push(decodedText);
			} else {
				output.push(htmlentities(decodedText));
			}
		},

		onclosetag: async (name: string) => {
			const current = currentTag();

			if (current.name === name.toLowerCase()) tagStack.pop();

			// Self-closing tags were already closed when their opening tag was written.
			if (isSelfClosingTag(name)) return;
			output.push(`</${name}>`);
		},
	});

	return output.join('');
};
export default packToString;

View File

@@ -3,13 +3,24 @@
"version": "3.3.1",
"description": "Pack an HTML file and all its linked resources into a single HTML file",
"main": "dist/index.js",
"types": "src/index.ts",
"types": "index.ts",
"exports": {
".": {
"default": "./dist/index.js",
"types": "./index.ts"
},
"./packToString": {
"default": "./dist/packToString.js",
"types": "./packToString.ts"
}
},
"publishConfig": {
"access": "public"
},
"scripts": {
"tsc": "tsc --project tsconfig.json",
"watch": "tsc --watch --preserveWatchOutput --project tsconfig.json"
"watch": "tsc --watch --preserveWatchOutput --project tsconfig.json",
"test": "jest"
},
"author": "Laurent Cozic",
"license": "MIT",
@@ -21,7 +32,10 @@
"html-entities": "1.4.0"
},
"devDependencies": {
"@types/fs-extra": "11.0.4"
"@types/fs-extra": "11.0.4",
"@types/jest": "29.5.12",
"jest": "29.7.0",
"typescript": "5.4.5"
},
"gitHead": "05a29b450962bf05a8642bbd39446a1f679a96ba"
}

View File

@@ -1,257 +0,0 @@
import * as fs from 'fs-extra';
import { pathExistsSync } from 'fs-extra';
const Entities = require('html-entities').AllHtmlEntities;
const htmlparser2 = require('@joplin/fork-htmlparser2');
const Datauri = require('datauri/sync');
import { CssTypes, parse as cssParse, stringify as cssStringify } from '@adobe/css-tools';
// Lower-case names of the tags that are written as `<name .../>` and that
// never get a separate closing tag in the output.
const selfClosingElements = [
	'area', 'base', 'basefont', 'br', 'col',
	'command', 'embed', 'frame', 'hr', 'img',
	'input', 'isindex', 'keygen', 'link', 'meta',
	'param', 'source', 'track', 'wbr',
];
// HTML-encodes `s`, then restores any `&Tab;` entity (matched
// case-insensitively) to a literal tab character.
const htmlentities = (s: string): string => {
	const encoder = new Entities();
	const encoded = encoder.encode(s);
	return encoded.replace(/&Tab;/ig, '\t');
};
// Returns the file at `filePath` encoded as a data URI, or an empty string
// when the encoding fails.
const dataUriEncode = (filePath: string): string => {
	let encoded = '';
	try {
		encoded = Datauri(filePath).content;
	} catch (error) {
		// If the file path is invalid, Datauri will throw an exception.
		// Since that particular file can simply be ignored, keep the empty
		// string instead of failing.
		// Fixes https://github.com/laurent22/joplin/issues/8305
	}
	return encoded;
};
// Serialises an attribute map as space-separated `name="value"` pairs.
// Values are HTML-encoded; inherited properties are skipped.
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- Old code before rule was applied
const attributesHtml = (attr: any) => {
	const pairs = [];
	for (const key in attr) {
		if (attr.hasOwnProperty(key)) {
			pairs.push(`${key}="${htmlentities(attr[key])}"`);
		}
	}
	return pairs.join(' ');
};
// Returns the value of attribute `name`, or an empty string when the
// attribute is missing or has a falsy value.
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- Old code before rule was applied
const attrValue = (attrs: any, name: string): string => {
	return attrs[name] ? attrs[name] : '';
};
// True when `tagName` (compared case-insensitively) is listed in
// selfClosingElements.
const isSelfClosingTag = (tagName: string) => {
	const normalisedName = tagName.toLowerCase();
	return selfClosingElements.includes(normalisedName);
};
// Parses `content` as CSS and, in the `src` declarations of @font-face rules,
// replaces each url(...) that points to an existing file (relative to
// cssBaseDir) with a data URI. Returns the re-serialised stylesheet.
const processCssContent = (cssBaseDir: string, content: string): string => {
	const stylesheetAst = cssParse(content, { silent: false });

	for (const rule of stylesheetAst.stylesheet.rules) {
		if (rule.type !== 'font-face') continue;

		for (const declaration of rule.declarations) {
			if (declaration.type === CssTypes.comment) continue;
			if (declaration.property !== 'src') continue;

			// eslint-disable-next-line @typescript-eslint/no-explicit-any -- Old code before rule was applied
			declaration.value = declaration.value.replace(/url\((.*?)\)/g, (_v: any, url: string) => {
				const cssFilePath = `${cssBaseDir}/${url}`;
				return fs.existsSync(cssFilePath) ? `url(${dataUriEncode(cssFilePath)})` : `url(${url})`;
			});
		}
	}

	return cssStringify(stylesheetAst);
};
// Replaces a <link> whose href points to an existing file under baseDir with
// an inline <style> tag. Returns null when the tag should be left unchanged.
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- Old code before rule was applied
const processLinkTag = (baseDir: string, _name: string, attrs: any): string => {
	const href = attrValue(attrs, 'href');
	const filePath = `${baseDir}/${href}`;
	if (!href || !pathExistsSync(filePath)) return null;

	const css = fs.readFileSync(filePath, 'utf8');
	return `<style>${processCssContent(dirname(filePath), css)}</style>`;
};
// Replaces a <script> that has a src attribute with an inline <script> tag
// holding the content of that file. Returns null when there is no src.
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- Old code before rule was applied
const processScriptTag = (baseDir: string, _name: string, attrs: any): string => {
	const src = attrValue(attrs, 'src');
	if (!src) return null;

	// Arbitrary content can't simply be inserted in <script> tags: HTML
	// entities are not decoded by the JS parser, and verbatim code may contain
	// sequences such as `</script>` or `<!--` which would break the HTML file.
	// These specific sequences are therefore escaped with a backslash, which
	// shouldn't break the JS code and lets the HTML parser work as expected.
	//
	// https://stackoverflow.com/a/41302266/561309
	const escapedContent = fs.readFileSync(`${baseDir}/${src}`, 'utf8')
		.replace(/<script>/g, '<\\script>')
		.replace(/<\/script>/g, '<\\/script>')
		.replace(/<!--/g, '<\\!--');

	return `<script>${escapedContent}</script>`;
};
// Replaces the src of an <img> that points to an existing file under baseDir
// with a data URI. Returns null when the tag should be left unchanged.
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- Old code before rule was applied
const processImgTag = (baseDir: string, _name: string, attrs: any): string => {
	const src = attrValue(attrs, 'src');
	const filePath = `${baseDir}/${src}`;
	if (!src || !fs.existsSync(filePath)) return null;

	// The data URI is written first, followed by all the other attributes.
	const otherAttrs = { ...attrs };
	delete otherAttrs.src;
	const dataUri = dataUriEncode(filePath);
	return `<img src="${dataUri}" ${attributesHtml(otherAttrs)}/>`;
};
// Turns an <a> whose href points to an existing file under baseDir into a
// download link with a data URI. Returns null when the tag should be left
// unchanged.
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- Old code before rule was applied
const processAnchorTag = (baseDir: string, _name: string, attrs: any): string => {
	const href = attrValue(attrs, 'href');
	const filePath = `${baseDir}/${href}`;
	if (!href || !fs.existsSync(filePath)) return null;

	const linkAttrs = {
		...attrs,
		href: dataUriEncode(filePath),
		download: basename(filePath),
	};
	return `<a ${attributesHtml(linkAttrs)}>`;
};
// Returns the part of `path` after the last `/` or `\` separator.
// Throws if `path` is empty.
function basename(path: string) {
	if (!path) {
		throw new Error('Path is empty');
	}
	const [lastSegment] = path.split(/\/|\\/).slice(-1);
	return lastSegment;
}
// Returns `path` without its last segment. Both `/` and `\` are accepted as
// separators; the result is always joined with `/`. Throws if `path` is empty.
function dirname(path: string) {
	if (!path) {
		throw new Error('Path is empty');
	}
	const segments = path.split(/\/|\\/);
	return segments.slice(0, -1).join('/');
}
// Reads the HTML file at `inputFile`, inlines the resources it references
// (resolved relative to the directory of `inputFile`) and writes the result to
// `outputFile` as UTF-8.
export default async function htmlpack(inputFile: string, outputFile: string): Promise<void> {
	const inputHtml = await fs.readFile(inputFile, 'utf8');
	const baseDir = dirname(inputFile);

	// Fragments of the packed document, in output order.
	const fragments: string[] = [];

	interface Tag {
		name: string;
	}

	// Currently open tags; only used to find out which tag a piece of text
	// belongs to.
	const openTags: Tag[] = [];

	const currentTag = () => {
		if (openTags.length) return openTags[openTags.length - 1];
		return { name: '', processed: false };
	};

	const parser = new htmlparser2.Parser({

		// eslint-disable-next-line @typescript-eslint/no-explicit-any -- Old code before rule was applied
		onopentag: (name: string, attrs: any) => {
			name = name.toLowerCase();

			let processedResult = '';

			switch (name) {
			case 'link':
				processedResult = processLinkTag(baseDir, name, attrs);
				break;
			case 'script':
				processedResult = processScriptTag(baseDir, name, attrs);
				break;
			case 'img':
				processedResult = processImgTag(baseDir, name, attrs);
				break;
			case 'a':
				processedResult = processAnchorTag(baseDir, name, attrs);
				break;
			}

			openTags.push({ name });

			if (processedResult) {
				fragments.push(processedResult);
				return;
			}

			// Not a packable tag (or nothing to inline): write the opening tag
			// back out with its original attributes.
			const attrHtml = attributesHtml(attrs);
			const attrSuffix = attrHtml ? ` ${attrHtml}` : attrHtml;
			const closingSign = isSelfClosingTag(name) ? '/>' : '>';
			fragments.push(`<${name}${attrSuffix}${closingSign}`);
		},

		ontext: (decodedText: string) => {
			// For CSS, we have to put the style as-is inside the tag because if we html-entities encode
			// it, it's not going to work. But it's ok because JavaScript won't run within the style tag.
			// Ideally CSS should be loaded from an external file.
			const insideStyleTag = currentTag().name === 'style';
			fragments.push(insideStyleTag ? decodedText : htmlentities(decodedText));
		},

		onclosetag: (name: string) => {
			if (currentTag().name === name.toLowerCase()) openTags.pop();

			// Self-closing tags were already closed when their opening tag was written.
			if (!isSelfClosingTag(name)) {
				fragments.push(`</${name}>`);
			}
		},

	}, { decodeEntities: true });

	parser.write(inputHtml);
	parser.end();

	await fs.writeFile(outputFile, fragments.join(''), 'utf8');
}

View File

@@ -0,0 +1 @@
<svg viewBox="-95 -96 208 208" width="208" height="208" version="1.1" baseProfile="full" xmlns="http://www.w3.org/2000/svg"><text style="font-size: 64px; fill: red;">Test</text></svg>

After

Width:  |  Height:  |  Size: 183 B

View File

@@ -0,0 +1,12 @@
<!DOCTYPE html>
<html>
<head>
<link href="./style.css" rel="stylesheet"/>
</head>
<body>
<h1>Test</h1>
<a href="./resource.txt">Test link.</a>
<img src="./image.svg" alt="test image"/>
<p>Test paragraph</p>
</body>
</html>

View File

@@ -0,0 +1 @@
Resource.

View File

@@ -0,0 +1,3 @@
* {
color: red;
}

View File

@@ -0,0 +1,79 @@
const htmlparser2 = require('@joplin/fork-htmlparser2');
// Attributes of an HTML tag, as a map from attribute name to value.
export type HtmlAttrs = Record<string, string>;
// Async handlers invoked by parseHtmlAsync. Each handler is awaited before
// the next event is dispatched.
interface Callbacks {
// Called for each opening tag with its name and attributes.
onopentag: (name: string, attrs: HtmlAttrs)=> Promise<void>;
// Called for each piece of text reported by the parser.
ontext: (text: string)=> Promise<void>;
// Called for each closing tag with its name.
onclosetag: (name: string)=> Promise<void>;
}
// Discriminator for the events recorded while parsing.
enum EventTypes {
OpenTag,
Text,
CloseTag,
}
// An opening tag reported by the parser.
interface OpenTagEvent {
type: EventTypes.OpenTag;
name: string;
attrs: HtmlAttrs;
}
// A piece of text reported by the parser.
interface TextEvent {
type: EventTypes.Text;
decodedText: string;
}
// A closing tag reported by the parser.
interface CloseTagEvent {
type: EventTypes.CloseTag;
name: string;
}
// Any event recorded during parsing; `type` identifies which one it is.
type ParserEvent = OpenTagEvent|TextEvent|CloseTagEvent;
// Parses `html` and reports open-tag, text and close-tag events to async
// callbacks. The whole document is parsed synchronously first, with the events
// recorded in a queue; the queue is then replayed in order, awaiting each
// callback before moving on to the next event.
const parseHtmlAsync = async (html: string, callbacks: Callbacks) => {
	const queuedEvents: ParserEvent[] = [];

	const recorder = {
		onopentag: (name: string, attrs: HtmlAttrs) => {
			queuedEvents.push({ type: EventTypes.OpenTag, name, attrs });
		},
		ontext: (decodedText: string) => {
			queuedEvents.push({ type: EventTypes.Text, decodedText });
		},
		onclosetag: (name: string) => {
			queuedEvents.push({ type: EventTypes.CloseTag, name });
		},
	};

	const parser = new htmlparser2.Parser(recorder, { decodeEntities: true });
	parser.write(html);
	parser.end();

	for (const event of queuedEvents) {
		switch (event.type) {
		case EventTypes.OpenTag:
			await callbacks.onopentag(event.name, event.attrs);
			break;
		case EventTypes.CloseTag:
			await callbacks.onclosetag(event.name);
			break;
		case EventTypes.Text:
			await callbacks.ontext(event.decodedText);
			break;
		default: {
			// Compile-time check that every event type is handled above.
			const exhaustivenessCheck: never = event;
			throw new Error(`Unknown event type: ${exhaustivenessCheck}`);
		}
		}
	}
};
export default parseHtmlAsync;

View File

@@ -9017,9 +9017,12 @@ __metadata:
"@adobe/css-tools": 4.4.2
"@joplin/fork-htmlparser2": ^4.1.58
"@types/fs-extra": 11.0.4
"@types/jest": 29.5.12
datauri: 4.1.0
fs-extra: 11.2.0
html-entities: 1.4.0
jest: 29.7.0
typescript: 5.4.5
languageName: unknown
linkType: soft