1
0
mirror of https://github.com/laurent22/joplin.git synced 2024-11-24 08:12:24 +02:00

HtmlToMd: Fixed handling of inline tags

This commit is contained in:
Laurent Cozic 2018-05-15 13:26:53 +01:00
parent 8cce2f17d5
commit a2b1181f7c
5 changed files with 156 additions and 43 deletions

View File

@ -122,6 +122,15 @@
"concat-map": "0.0.1"
}
},
"camel-case": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/camel-case/-/camel-case-3.0.0.tgz",
"integrity": "sha1-yjw2iKTpzzpM2nd9xNy8cTJJz3M=",
"requires": {
"no-case": "^2.2.0",
"upper-case": "^1.1.1"
}
},
"camelcase": {
"version": "4.1.0",
"resolved": "https://registry.npmjs.org/camelcase/-/camelcase-4.1.0.tgz",
@ -163,6 +172,14 @@
"resolved": "https://registry.npmjs.org/chownr/-/chownr-1.0.1.tgz",
"integrity": "sha1-4qdQQqlVGQi+vSW4Uj1fl2nXkYE="
},
"clean-css": {
"version": "4.1.11",
"resolved": "https://registry.npmjs.org/clean-css/-/clean-css-4.1.11.tgz",
"integrity": "sha1-Ls3xRaujj1R0DybO/Q/z4D4SXWo=",
"requires": {
"source-map": "0.5.x"
}
},
"co": {
"version": "4.6.0",
"resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz",
@ -207,6 +224,11 @@
"delayed-stream": "~1.0.0"
}
},
"commander": {
"version": "2.15.1",
"resolved": "https://registry.npmjs.org/commander/-/commander-2.15.1.tgz",
"integrity": "sha512-VlfT9F3V0v+jr4yxPc5gg9s62/fIVWsd2Bk2iD435um1NlGMYdVCq+MjcXnhYq2icNOizHr1kK+5TI6H0Hy0ag=="
},
"compare-version": {
"version": "0.1.2",
"resolved": "https://registry.npmjs.org/compare-version/-/compare-version-0.1.2.tgz",
@ -573,6 +595,11 @@
"sntp": "2.x.x"
}
},
"he": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/he/-/he-1.1.1.tgz",
"integrity": "sha1-k0EP0hsAlzUVH4howvJx80J+I/0="
},
"highlight.js": {
"version": "9.12.0",
"resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-9.12.0.tgz",
@ -588,6 +615,20 @@
"resolved": "https://registry.npmjs.org/html-entities/-/html-entities-1.2.1.tgz",
"integrity": "sha1-DfKTUfByEWNRXfueVUPl9u7VFi8="
},
"html-minifier": {
"version": "3.5.15",
"resolved": "https://registry.npmjs.org/html-minifier/-/html-minifier-3.5.15.tgz",
"integrity": "sha512-OZa4rfb6tZOZ3Z8Xf0jKxXkiDcFWldQePGYFDcgKqES2sXeWaEv9y6QQvWUtX3ySI3feApQi5uCsHLINQ6NoAw==",
"requires": {
"camel-case": "3.0.x",
"clean-css": "4.1.x",
"commander": "2.15.x",
"he": "1.1.x",
"param-case": "2.1.x",
"relateurl": "0.2.x",
"uglify-js": "3.3.x"
}
},
"http-signature": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/http-signature/-/http-signature-1.2.0.tgz",
@ -815,6 +856,11 @@
"js-tokens": "^3.0.0"
}
},
"lower-case": {
"version": "1.1.4",
"resolved": "https://registry.npmjs.org/lower-case/-/lower-case-1.1.4.tgz",
"integrity": "sha1-miyr0bno4K6ZOkv31YdcOcQujqw="
},
"lowlight": {
"version": "1.9.2",
"resolved": "https://registry.npmjs.org/lowlight/-/lowlight-1.9.2.tgz",
@ -933,6 +979,14 @@
"resolved": "https://registry.npmjs.org/nextgen-events/-/nextgen-events-0.11.3.tgz",
"integrity": "sha512-dC4v/dOF6m8/M05eU712KXjRJ0e/187rx5CMS/fTnulv2QGPps1U/c/J1D3wtegEhK+EE7LuJc3jly3pyfV46g=="
},
"no-case": {
"version": "2.3.2",
"resolved": "https://registry.npmjs.org/no-case/-/no-case-2.3.2.tgz",
"integrity": "sha512-rmTZ9kz+f3rCvK2TD1Ue/oZlns7OGoIWP4fc3llxxRXlOkHKoWPPWJOfFYpITabSow43QJbRIoHQXtt10VldyQ==",
"requires": {
"lower-case": "^1.1.1"
}
},
"node-bitmap": {
"version": "0.0.1",
"resolved": "https://registry.npmjs.org/node-bitmap/-/node-bitmap-0.0.1.tgz",
@ -997,6 +1051,14 @@
"wrappy": "1"
}
},
"param-case": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/param-case/-/param-case-2.1.1.tgz",
"integrity": "sha1-35T9jPZTHs915r75oIWPvHK+Ikc=",
"requires": {
"no-case": "^2.2.0"
}
},
"parse-data-uri": {
"version": "0.2.0",
"resolved": "https://registry.npmjs.org/parse-data-uri/-/parse-data-uri-0.2.0.tgz",
@ -1107,6 +1169,11 @@
"symbol-observable": "^1.0.3"
}
},
"relateurl": {
"version": "0.2.7",
"resolved": "https://registry.npmjs.org/relateurl/-/relateurl-0.2.7.tgz",
"integrity": "sha1-VNvzd+UUQKypCkzSdGANP/LYiKk="
},
"request": {
"version": "2.85.0",
"resolved": "https://registry.npmjs.org/request/-/request-2.85.0.tgz",
@ -1233,6 +1300,11 @@
"hoek": "4.x.x"
}
},
"source-map": {
"version": "0.5.7",
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.5.7.tgz",
"integrity": "sha1-igOdLRAh0i0eoUyA2OpGi6LvP8w="
},
"sprintf-js": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.1.1.tgz",
@ -2138,6 +2210,22 @@
"integrity": "sha1-WuaBd/GS1EViadEIr6k/+HQ/T2Q=",
"optional": true
},
"uglify-js": {
"version": "3.3.25",
"resolved": "https://registry.npmjs.org/uglify-js/-/uglify-js-3.3.25.tgz",
"integrity": "sha512-hobogryjDV36VrLK3Y69ou4REyrTApzUblVFmdQOYRe8cYaSmFJXMb4dR9McdvYDSbeNdzUgYr2YVukJaErJcA==",
"requires": {
"commander": "~2.15.0",
"source-map": "~0.6.1"
},
"dependencies": {
"source-map": {
"version": "0.6.1",
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
"integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g=="
}
}
},
"unc-path-regex": {
"version": "0.1.2",
"resolved": "https://registry.npmjs.org/unc-path-regex/-/unc-path-regex-0.1.2.tgz",
@ -2153,6 +2241,11 @@
"resolved": "https://registry.npmjs.org/universalify/-/universalify-0.1.1.tgz",
"integrity": "sha1-+nG63UQ3r0wUiEHjs7Fl+enlkLc="
},
"upper-case": {
"version": "1.1.3",
"resolved": "https://registry.npmjs.org/upper-case/-/upper-case-1.1.3.tgz",
"integrity": "sha1-9rRQHC7EzdJrp4vnIilh3ndiFZg="
},
"url-parse": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/url-parse/-/url-parse-1.2.0.tgz",

View File

@ -35,6 +35,7 @@
"form-data": "^2.1.4",
"fs-extra": "^5.0.0",
"html-entities": "^1.2.1",
"html-minifier": "^3.5.15",
"jssha": "^2.3.0",
"levenshtein": "^1.0.5",
"lodash": "^4.17.4",

View File

@ -34,7 +34,7 @@ describe('HtmlToMd', function() {
const htmlPath = basePath + '/' + htmlFilename;
const mdPath = basePath + '/' + filename(htmlFilename) + '.md';
// if (htmlFilename !== 'code2.html') continue;
// if (htmlFilename !== 'text2.html') continue;
const html = await shim.fsDriver().readFile(htmlPath);
const expectedMd = await shim.fsDriver().readFile(mdPath);

View File

@ -1,5 +1,5 @@
| |
| --- |
| $ sudo ethtool --set-priv-flags p2p1 mlx4_rss_xor_hash_function on<br># Three empty lines follow<br> |
| $ sudo ethtool --set-priv-flags p2p1 mlx4_rss_xor_hash_function on<br># Three empty lines follow |
Some text

View File

@ -337,6 +337,9 @@ function processMdArrayNewLines(md, isTable = false) {
}
let lines = output.replace(/\\r/g, '').split('\n');
// console.info(lines);
lines = formatMdLayout(lines)
// lines = convertSingleLineCodeBlocksToInline(lines)
lines = mergeMultipleNewLines(lines);
@ -530,6 +533,8 @@ function simplifyString(s) {
}
function collapseWhiteSpaceAndAppend(lines, state, text) {
// console.info([text]);
if (state.inCode.length) {
lines.push(text);
@ -556,6 +561,11 @@ function collapseWhiteSpaceAndAppend(lines, state, text) {
// console.info(lines);
if (!!text.match(/^\n+$/)) {
lines.push(' ');
return lines;
}
// Remove all \n and \r from the left and right of the text
while (text.length && (text[0] == "\n" || text[0] == "\r")) text = text.substr(1);
while (text.length && (text[text.length - 1] == "\n" || text[text.length - 1] == "\r")) text = text.substr(0, text.length - 1);
@ -1316,7 +1326,10 @@ function drawTable(table) {
// A cell in a Markdown table cannot have actual new lines so replace
// them with <br>, which are supported by the markdown renderers.
let cellText = processMdArrayNewLines(td.lines, true).replace(/\n+/g, "<br>");
let cellText = processMdArrayNewLines(td.lines, true)
let lines = cellText.split('\n');
lines = postProcessMarkdown(lines);
cellText = lines.join('\n').replace(/\n+/g, "<br>");
// Inside tables cells, "|" needs to be escaped
cellText = cellText.replace(/\|/g, "\\|");
@ -1397,44 +1410,7 @@ function minifyHtml(html) {
return output;
}
async function enexXmlToMd(xmlString, resources, options = {}) {
// This allows simplifying the HTML, which results in better Markdown. In particular, it removes all
// non-significant newlines and convert them to spaces.
// xmlString = minifyHtml(xmlString);
// console.info([xmlString]);
const stream = stringToStream(xmlString);
let result = await enexXmlToMdArray(stream, resources, options);
let mdLines = [];
for (let i = 0; i < result.content.lines.length; i++) {
let line = result.content.lines[i];
if (typeof line === 'object' && line.type === 'table') { // A table
const table = line;
const tableLines = drawTable(table);
mdLines = mdLines.concat(tableLines);
} else if (typeof line === 'object' && line.type === 'code') {
mdLines = mdLines.concat(line.lines);
} else if (typeof line === 'object') {
console.warn('Unhandled object type:', line);
mdLines = mdLines.concat(line.lines);
} else { // an actual line
mdLines.push(line);
}
}
let firstAttachment = true;
for (let i = 0; i < result.resources.length; i++) {
let r = result.resources[i];
if (firstAttachment) mdLines.push(NEWLINE);
mdLines.push(NEWLINE);
mdLines = addResourceTag(mdLines, r, r.filename);
firstAttachment = false;
}
let output = processMdArrayNewLines(mdLines).split('\n')
function postProcessMarkdown(lines) {
// After importing HTML, the resulting Markdown often has empty lines at the beginning and end due to
// block start/end or elements that were ignored, etc. If these white spaces were intended it's not really
// possible to detect it, so simply trim them all so that the result is more deterministic and can be
@ -1490,8 +1466,51 @@ async function enexXmlToMd(xmlString, resources, options = {}) {
return output;
}
output = trimEmptyLines(output)
output = cleanUpSpaces(output)
lines = trimEmptyLines(lines)
lines = cleanUpSpaces(lines)
return lines;
}
async function enexXmlToMd(xmlString, resources, options = {}) {
// This allows simplifying the HTML, which results in better Markdown. In particular, it removes all
// non-significant newlines and convert them to spaces.
// xmlString = minifyHtml(xmlString);
// console.info([xmlString]);
const stream = stringToStream(xmlString);
let result = await enexXmlToMdArray(stream, resources, options);
let mdLines = [];
for (let i = 0; i < result.content.lines.length; i++) {
let line = result.content.lines[i];
if (typeof line === 'object' && line.type === 'table') { // A table
const table = line;
const tableLines = drawTable(table);
mdLines = mdLines.concat(tableLines);
} else if (typeof line === 'object' && line.type === 'code') {
mdLines = mdLines.concat(line.lines);
} else if (typeof line === 'object') {
console.warn('Unhandled object type:', line);
mdLines = mdLines.concat(line.lines);
} else { // an actual line
mdLines.push(line);
}
}
let firstAttachment = true;
for (let i = 0; i < result.resources.length; i++) {
let r = result.resources[i];
if (firstAttachment) mdLines.push(NEWLINE);
mdLines.push(NEWLINE);
mdLines = addResourceTag(mdLines, r, r.filename);
firstAttachment = false;
}
let output = processMdArrayNewLines(mdLines).split('\n')
output = postProcessMarkdown(output);
return output.join('\n');
}