mirror of
https://github.com/laurent22/joplin.git
synced 2025-03-29 21:21:15 +02:00
HtmlToMd: Added more test cases and fixed some issues
This commit is contained in:
parent
ba9598682c
commit
ba2874173d
@ -8,7 +8,6 @@ const Note = require('lib/models/Note.js');
|
|||||||
const BaseModel = require('lib/BaseModel.js');
|
const BaseModel = require('lib/BaseModel.js');
|
||||||
const { shim } = require('lib/shim');
|
const { shim } = require('lib/shim');
|
||||||
const { enexXmlToMd } = require('lib/import-enex-md-gen.js');
|
const { enexXmlToMd } = require('lib/import-enex-md-gen.js');
|
||||||
const stringToStream = require('string-to-stream')
|
|
||||||
|
|
||||||
jasmine.DEFAULT_TIMEOUT_INTERVAL = 60 * 60 * 1000; // Can run for a while since everything is in the same test unit
|
jasmine.DEFAULT_TIMEOUT_INTERVAL = 60 * 60 * 1000; // Can run for a while since everything is in the same test unit
|
||||||
|
|
||||||
@ -35,13 +34,12 @@ describe('HtmlToMd', function() {
|
|||||||
const htmlPath = basePath + '/' + htmlFilename;
|
const htmlPath = basePath + '/' + htmlFilename;
|
||||||
const mdPath = basePath + '/' + filename(htmlFilename) + '.md';
|
const mdPath = basePath + '/' + filename(htmlFilename) + '.md';
|
||||||
|
|
||||||
// if (htmlFilename !== 'list.html') continue;
|
// if (htmlFilename !== 'list2.html') continue;
|
||||||
|
|
||||||
const html = await shim.fsDriver().readFile(htmlPath);
|
const html = await shim.fsDriver().readFile(htmlPath);
|
||||||
const expectedMd = await shim.fsDriver().readFile(mdPath);
|
const expectedMd = await shim.fsDriver().readFile(mdPath);
|
||||||
|
|
||||||
const contentStream = stringToStream('<div>' + html + '</div>');
|
const actualMd = await enexXmlToMd('<div>' + html + '</div>', []);
|
||||||
const actualMd = await enexXmlToMd(contentStream, []);
|
|
||||||
|
|
||||||
if (actualMd !== expectedMd) {
|
if (actualMd !== expectedMd) {
|
||||||
console.info('');
|
console.info('');
|
||||||
|
4
CliClient/tests/html_to_md/list3.html
Normal file
4
CliClient/tests/html_to_md/list3.html
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
<ul class="find-me-on">
|
||||||
|
<li>Github</li>
|
||||||
|
<li>Twitter</li>
|
||||||
|
</ul>
|
2
CliClient/tests/html_to_md/list3.md
Normal file
2
CliClient/tests/html_to_md/list3.md
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
- Github
|
||||||
|
- Twitter
|
11
CliClient/tests/html_to_md/list4.html
Normal file
11
CliClient/tests/html_to_md/list4.html
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
<ul>
|
||||||
|
<li>
|
||||||
|
<figure style="height:551px;">
|
||||||
|
<div class="image" style="background-image:url('https://cdn.arstechnica.net/wp-content/uploads/2018/05/2018050719551800-9A1517382E5F6AE0CEFC2F883445C47F-980x551.jpg'); background-color:#000"></div>
|
||||||
|
<figcaption id="caption-1308165">
|
||||||
|
<span class="icon caption-arrow icon-drop-indicator"></span>
|
||||||
|
<div class="caption">The Kit won't work without a Joy-Con infrared camera looking in.</div>
|
||||||
|
</figcaption>
|
||||||
|
</figure>
|
||||||
|
</li>
|
||||||
|
</ul>
|
1
CliClient/tests/html_to_md/list4.md
Normal file
1
CliClient/tests/html_to_md/list4.md
Normal file
@ -0,0 +1 @@
|
|||||||
|
- The Kit won't work without a Joy-Con infrared camera looking in.
|
@ -1,13 +1,11 @@
|
|||||||
const { enexXmlToMd } = require('lib/import-enex-md-gen.js');
|
const { enexXmlToMd } = require('lib/import-enex-md-gen.js');
|
||||||
const stringToStream = require('string-to-stream')
|
|
||||||
|
|
||||||
class HtmlToMarkdownParser {
|
class HtmlToMarkdownParser {
|
||||||
|
|
||||||
async parse(html, options = {}) {
|
async parse(html, options = {}) {
|
||||||
if (!options.baseUrl) options.baseUrl = '';
|
if (!options.baseUrl) options.baseUrl = '';
|
||||||
|
|
||||||
const contentStream = stringToStream(html);
|
const markdown = await enexXmlToMd(html, [], options);
|
||||||
const markdown = await enexXmlToMd(contentStream, [], options);
|
|
||||||
return markdown;
|
return markdown;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
const stringPadding = require('string-padding');
|
const stringPadding = require('string-padding');
|
||||||
|
const stringToStream = require('string-to-stream')
|
||||||
|
|
||||||
const BLOCK_OPEN = "[[BLOCK_OPEN]]";
|
const BLOCK_OPEN = "[[BLOCK_OPEN]]";
|
||||||
const BLOCK_CLOSE = "[[BLOCK_CLOSE]]";
|
const BLOCK_CLOSE = "[[BLOCK_CLOSE]]";
|
||||||
@ -542,6 +543,8 @@ function collapseWhiteSpaceAndAppend(lines, state, text) {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
|
// console.info(lines);
|
||||||
|
|
||||||
// Remove all \n and \r from the left and right of the text
|
// Remove all \n and \r from the left and right of the text
|
||||||
while (text.length && (text[0] == "\n" || text[0] == "\r")) text = text.substr(1);
|
while (text.length && (text[0] == "\n" || text[0] == "\r")) text = text.substr(1);
|
||||||
while (text.length && (text[text.length - 1] == "\n" || text[text.length - 1] == "\r")) text = text.substr(0, text.length - 1);
|
while (text.length && (text[text.length - 1] == "\n" || text[text.length - 1] == "\r")) text = text.substr(0, text.length - 1);
|
||||||
@ -602,7 +605,7 @@ function addResourceTag(lines, resource, alt = "") {
|
|||||||
|
|
||||||
|
|
||||||
function isBlockTag(n) {
|
function isBlockTag(n) {
|
||||||
return ["div", "p", "dl", "dd", 'dt', "center", 'address', 'form', 'input', 'section', 'nav', 'header', 'article', 'textarea', 'footer', 'fieldset'].indexOf(n) >= 0;
|
return ["div", "p", "dl", "dd", 'dt', "center", 'address', 'form', 'input', 'section', 'nav', 'header', 'article', 'textarea', 'footer', 'fieldset', 'summary', 'details'].indexOf(n) >= 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
function isStrongTag(n) {
|
function isStrongTag(n) {
|
||||||
@ -622,7 +625,7 @@ function isAnchor(n) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function isIgnoredEndTag(n) {
|
function isIgnoredEndTag(n) {
|
||||||
return ["en-note", "en-todo", "span", "body", "html", "font", "br", 'hr', 'tbody', 'sup', 'img', 'abbr', 'cite', 'thead', 'small', 'tt', 'sub', 'colgroup', 'col', 'ins', 'caption', 'var', 'map', 'area', 'label', 'legend'].indexOf(n) >= 0;
|
return ["en-note", "en-todo", "span", "body", "html", "font", "br", 'hr', 'tbody', 'sup', 'img', 'abbr', 'cite', 'thead', 'small', 'tt', 'sub', 'colgroup', 'col', 'ins', 'caption', 'var', 'map', 'area', 'label', 'legend', 'time-ago', 'relative-time'].indexOf(n) >= 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
function isListTag(n) {
|
function isListTag(n) {
|
||||||
@ -631,12 +634,12 @@ function isListTag(n) {
|
|||||||
|
|
||||||
// Elements that don't require any special treatment besides adding a newline character
|
// Elements that don't require any special treatment besides adding a newline character
|
||||||
function isNewLineOnlyEndTag(n) {
|
function isNewLineOnlyEndTag(n) {
|
||||||
return ["div", "p", "li", "h1", "h2", "h3", "h4", "h5", 'h6', "dl", "dd", 'dt', "center", 'address', 'form', 'input', 'section', 'nav', 'header', 'article', 'textarea', 'footer', 'fieldset'].indexOf(n) >= 0;
|
return ["div", "p", "h1", "h2", "h3", "h4", "h5", 'h6', "dl", "dd", 'dt', "center", 'address', 'form', 'input', 'section', 'nav', 'header', 'article', 'textarea', 'footer', 'fieldset', 'summary', 'details'].indexOf(n) >= 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Tags that must be ignored - both the tag and its content.
|
// Tags that must be ignored - both the tag and its content.
|
||||||
function isIgnoredContentTag(n) {
|
function isIgnoredContentTag(n) {
|
||||||
return ['script', 'style', 'iframe', 'select', 'option', 'button', 'video', 'source'].indexOf(n) >= 0
|
return ['script', 'style', 'iframe', 'select', 'option', 'button', 'video', 'source', 'svg', 'path'].indexOf(n) >= 0
|
||||||
}
|
}
|
||||||
|
|
||||||
function isCodeTag(n) {
|
function isCodeTag(n) {
|
||||||
@ -734,8 +737,14 @@ function enexXmlToMdArray(stream, resources, options = {}) {
|
|||||||
lists: [],
|
lists: [],
|
||||||
anchorAttributes: [],
|
anchorAttributes: [],
|
||||||
ignoreContents: [],
|
ignoreContents: [],
|
||||||
|
ignoreWhiteSpace: [],
|
||||||
|
warningsTags: [],
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const ignoreWhiteSpace = () => {
|
||||||
|
return state.ignoreWhiteSpace.length ? state.ignoreWhiteSpace[state.ignoreWhiteSpace.length-1] : false;
|
||||||
|
}
|
||||||
|
|
||||||
let saxStreamOptions = {};
|
let saxStreamOptions = {};
|
||||||
let strict = false;
|
let strict = false;
|
||||||
var saxStream = require('sax').createStream(strict, saxStreamOptions)
|
var saxStream = require('sax').createStream(strict, saxStreamOptions)
|
||||||
@ -754,6 +763,7 @@ function enexXmlToMdArray(stream, resources, options = {}) {
|
|||||||
saxStream.on('text', function(text) {
|
saxStream.on('text', function(text) {
|
||||||
if (state.ignoreContents.length) return;
|
if (state.ignoreContents.length) return;
|
||||||
if (['table', 'tr', 'tbody'].indexOf(section.type) >= 0) return;
|
if (['table', 'tr', 'tbody'].indexOf(section.type) >= 0) return;
|
||||||
|
if ((!text || !text.trim()) && ignoreWhiteSpace()) return;
|
||||||
section.lines = collapseWhiteSpaceAndAppend(section.lines, state, text);
|
section.lines = collapseWhiteSpaceAndAppend(section.lines, state, text);
|
||||||
})
|
})
|
||||||
|
|
||||||
@ -824,7 +834,9 @@ function enexXmlToMdArray(stream, resources, options = {}) {
|
|||||||
} else if (isListTag(n)) {
|
} else if (isListTag(n)) {
|
||||||
section.lines.push(BLOCK_OPEN);
|
section.lines.push(BLOCK_OPEN);
|
||||||
state.lists.push({ tag: n, counter: 1 });
|
state.lists.push({ tag: n, counter: 1 });
|
||||||
|
state.ignoreWhiteSpace.push(true);
|
||||||
} else if (n == 'li') {
|
} else if (n == 'li') {
|
||||||
|
state.ignoreWhiteSpace.push(false);
|
||||||
section.lines.push(BLOCK_OPEN);
|
section.lines.push(BLOCK_OPEN);
|
||||||
if (!state.lists.length) {
|
if (!state.lists.length) {
|
||||||
console.warn("Found <li> tag without being inside a list");
|
console.warn("Found <li> tag without being inside a list");
|
||||||
@ -984,10 +996,13 @@ function enexXmlToMdArray(stream, resources, options = {}) {
|
|||||||
// section.lines.push(MONOSPACE_OPEN);
|
// section.lines.push(MONOSPACE_OPEN);
|
||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
} else if (["span", "font", 'sup', 'cite', 'abbr', 'small', 'tt', 'sub', 'colgroup', 'col', 'ins', 'caption', 'var', 'map', 'area', 'label', 'legend'].indexOf(n) >= 0) {
|
} else if (["span", "font", 'sup', 'cite', 'abbr', 'small', 'tt', 'sub', 'colgroup', 'col', 'ins', 'caption', 'var', 'map', 'area', 'label', 'legend', 'time-ago', 'relative-time'].indexOf(n) >= 0) {
|
||||||
// Inline tags that can be ignored in Markdown
|
// Inline tags that can be ignored in Markdown
|
||||||
} else {
|
} else {
|
||||||
console.warn("Unsupported start tag: " + n);
|
if (state.warningsTags.indexOf(n) < 0) {
|
||||||
|
console.warn("Unsupported start tag: " + n);
|
||||||
|
state.warningsTags.push(n);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
@ -1025,8 +1040,12 @@ function enexXmlToMdArray(stream, resources, options = {}) {
|
|||||||
} else if (isIgnoredEndTag(n)) {
|
} else if (isIgnoredEndTag(n)) {
|
||||||
// Skip
|
// Skip
|
||||||
} else if (isListTag(n)) {
|
} else if (isListTag(n)) {
|
||||||
|
state.ignoreWhiteSpace.pop();
|
||||||
section.lines.push(BLOCK_CLOSE);
|
section.lines.push(BLOCK_CLOSE);
|
||||||
state.lists.pop();
|
state.lists.pop();
|
||||||
|
} else if (n === 'li') {
|
||||||
|
state.ignoreWhiteSpace.pop();
|
||||||
|
section.lines.push(BLOCK_CLOSE);
|
||||||
} else if (isStrongTag(n)) {
|
} else if (isStrongTag(n)) {
|
||||||
section.lines.push("**");
|
section.lines.push("**");
|
||||||
} else if (isStrikeTag(n)) {
|
} else if (isStrikeTag(n)) {
|
||||||
@ -1170,9 +1189,11 @@ function enexXmlToMdArray(stream, resources, options = {}) {
|
|||||||
} else if (isIgnoredEndTag(n)) {
|
} else if (isIgnoredEndTag(n)) {
|
||||||
// Skip
|
// Skip
|
||||||
} else {
|
} else {
|
||||||
console.warn("Unsupported end tag: " + n);
|
if (state.warningsTags.indexOf(n) < 0) {
|
||||||
|
console.warn("Unsupported end tag: " + n);
|
||||||
|
state.warningsTags.push(n);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
})
|
})
|
||||||
|
|
||||||
saxStream.on('attribute', function(attr) {
|
saxStream.on('attribute', function(attr) {
|
||||||
@ -1309,7 +1330,8 @@ function drawTable(table) {
|
|||||||
return flatRender ? lines : lines.join('<<<<:D>>>>' + NEWLINE + '<<<<:D>>>>').split('<<<<:D>>>>');
|
return flatRender ? lines : lines.join('<<<<:D>>>>' + NEWLINE + '<<<<:D>>>>').split('<<<<:D>>>>');
|
||||||
}
|
}
|
||||||
|
|
||||||
async function enexXmlToMd(stream, resources, options = {}) {
|
async function enexXmlToMd(xmlString, resources, options = {}) {
|
||||||
|
const stream = stringToStream(xmlString);
|
||||||
let result = await enexXmlToMdArray(stream, resources, options);
|
let result = await enexXmlToMdArray(stream, resources, options);
|
||||||
|
|
||||||
let mdLines = [];
|
let mdLines = [];
|
||||||
@ -1334,14 +1356,13 @@ async function enexXmlToMd(stream, resources, options = {}) {
|
|||||||
firstAttachment = false;
|
firstAttachment = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
let output = processMdArrayNewLines(mdLines);
|
let output = processMdArrayNewLines(mdLines).split('\n')
|
||||||
|
|
||||||
// After importing HTML, the resulting Markdown often has empty lines at the beginning and end due to
|
// After importing HTML, the resulting Markdown often has empty lines at the beginning and end due to
|
||||||
// block start/end or elements that were ignored, etc. If these white spaces were intended it's not really
|
// block start/end or elements that were ignored, etc. If these white spaces were intended it's not really
|
||||||
// possible to detect it, so simply trim them all so that the result is more deterministic and can be
|
// possible to detect it, so simply trim them all so that the result is more deterministic and can be
|
||||||
// easily unit tested.
|
// easily unit tested.
|
||||||
const trimEmptyLines = function(text) {
|
const trimEmptyLines = function(lines) {
|
||||||
const lines = text.split('\n');
|
|
||||||
while (lines.length) {
|
while (lines.length) {
|
||||||
if (!lines[0].trim()) {
|
if (!lines[0].trim()) {
|
||||||
lines.splice(0, 1);
|
lines.splice(0, 1);
|
||||||
@ -1358,10 +1379,32 @@ async function enexXmlToMd(stream, resources, options = {}) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return lines.join('\n');
|
return lines;
|
||||||
}
|
}
|
||||||
|
|
||||||
return trimEmptyLines(output);
|
function cleanUpSpaces(lines) {
|
||||||
|
const output = [];
|
||||||
|
|
||||||
|
for (let i = 0; i < lines.length; i++) {
|
||||||
|
let line = lines[i];
|
||||||
|
|
||||||
|
// eg. "    -    Some list item" => "    - Some list item"
|
||||||
|
// Note that spaces before the "-" are preserved
|
||||||
|
line = line.replace(/^(\s+|)-\s+/, '$1- ')
|
||||||
|
|
||||||
|
// eg "Some text " => "Some text"
|
||||||
|
line = line.replace(/^(.*?)\s+$/, '$1')
|
||||||
|
|
||||||
|
output.push(line);
|
||||||
|
}
|
||||||
|
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
output = trimEmptyLines(output)
|
||||||
|
output = cleanUpSpaces(output)
|
||||||
|
|
||||||
|
return output.join('\n');
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = { enexXmlToMd, processMdArrayNewLines, NEWLINE, addResourceTag };
|
module.exports = { enexXmlToMd, processMdArrayNewLines, NEWLINE, addResourceTag };
|
@ -15,7 +15,6 @@ const md5 = require('md5');
|
|||||||
|
|
||||||
//const Promise = require('promise');
|
//const Promise = require('promise');
|
||||||
const fs = require('fs-extra');
|
const fs = require('fs-extra');
|
||||||
const stringToStream = require('string-to-stream')
|
|
||||||
|
|
||||||
function dateToTimestamp(s, zeroIfInvalid = false) {
|
function dateToTimestamp(s, zeroIfInvalid = false) {
|
||||||
let m = moment(s, 'YYYYMMDDTHHmmssZ');
|
let m = moment(s, 'YYYYMMDDTHHmmssZ');
|
||||||
@ -219,8 +218,7 @@ function importEnex(parentFolderId, filePath, importOptions = null) {
|
|||||||
|
|
||||||
while (notes.length) {
|
while (notes.length) {
|
||||||
let note = notes.shift();
|
let note = notes.shift();
|
||||||
const contentStream = stringToStream(note.bodyXml);
|
const body = await enexXmlToMd(note.bodyXml, note.resources);
|
||||||
const body = await enexXmlToMd(contentStream, note.resources);
|
|
||||||
delete note.bodyXml;
|
delete note.bodyXml;
|
||||||
|
|
||||||
// console.info('*************************************************************************');
|
// console.info('*************************************************************************');
|
||||||
|
Loading…
x
Reference in New Issue
Block a user