From 394f2df66455ac17546023a9310ab5ee5850b7f7 Mon Sep 17 00:00:00 2001 From: Laurent Cozic Date: Sat, 12 May 2018 11:48:39 +0100 Subject: [PATCH] All: More robust HTML to MD conversion and started adding test units for it --- CliClient/tests/HtmlToMd.js | 64 +++++ CliClient/tests/html_to_md/code1.html | 16 ++ CliClient/tests/html_to_md/code1.md | 14 + CliClient/tests/html_to_md/heading.html | 9 + CliClient/tests/html_to_md/heading.md | 5 + CliClient/tests/html_to_md/inlineCode.html | 3 + CliClient/tests/html_to_md/inlineCode.md | 1 + .../tests/html_to_md/inlineCodeWithLink.html | 3 + .../tests/html_to_md/inlineCodeWithLink.md | 1 + CliClient/tests/html_to_md/list.html | 17 ++ CliClient/tests/html_to_md/list.md | 9 + CliClient/tests/html_to_md/paragraph.html | 5 + CliClient/tests/html_to_md/paragraph.md | 5 + ReactNativeClient/lib/import-enex-md-gen.js | 257 ++++++++++++++---- 14 files changed, 361 insertions(+), 48 deletions(-) create mode 100644 CliClient/tests/HtmlToMd.js create mode 100644 CliClient/tests/html_to_md/code1.html create mode 100644 CliClient/tests/html_to_md/code1.md create mode 100644 CliClient/tests/html_to_md/heading.html create mode 100644 CliClient/tests/html_to_md/heading.md create mode 100644 CliClient/tests/html_to_md/inlineCode.html create mode 100644 CliClient/tests/html_to_md/inlineCode.md create mode 100644 CliClient/tests/html_to_md/inlineCodeWithLink.html create mode 100644 CliClient/tests/html_to_md/inlineCodeWithLink.md create mode 100644 CliClient/tests/html_to_md/list.html create mode 100644 CliClient/tests/html_to_md/list.md create mode 100644 CliClient/tests/html_to_md/paragraph.html create mode 100644 CliClient/tests/html_to_md/paragraph.md diff --git a/CliClient/tests/HtmlToMd.js b/CliClient/tests/HtmlToMd.js new file mode 100644 index 0000000000..2c5f226923 --- /dev/null +++ b/CliClient/tests/HtmlToMd.js @@ -0,0 +1,64 @@ +require('app-module-path').addPath(__dirname); + +const { time } = require('lib/time-utils.js'); +const { filename } = require('lib/path-utils.js'); +const { asyncTest, fileContentEqual, setupDatabase, setupDatabaseAndSynchronizer, db, synchronizer, fileApi, sleep, clearDatabase, switchClient, syncTargetId, objectsEqual, checkThrowAsync } = require('test-utils.js'); +const Folder = require('lib/models/Folder.js'); +const Note = require('lib/models/Note.js'); +const BaseModel = require('lib/BaseModel.js'); +const { shim } = require('lib/shim'); +const { enexXmlToMd } = require('lib/import-enex-md-gen.js'); +const stringToStream = require('string-to-stream') + +jasmine.DEFAULT_TIMEOUT_INTERVAL = 60 * 60 * 1000; // Can run for a while since everything is in the same test unit + +process.on('unhandledRejection', (reason, p) => { + console.log('Unhandled Rejection at: Promise', p, 'reason:', reason); +}); + +describe('HtmlToMd', function() { + + beforeEach(async (done) => { + await setupDatabaseAndSynchronizer(1); + await switchClient(1); + done(); + }); + + it('should convert from HTML to Markdown', asyncTest(async () => { + const basePath = __dirname + '/html_to_md'; + const files = await shim.fsDriver().readDirStats(basePath); + + for (let i = 0; i < files.length; i++) { + const htmlFilename = files[i].path; + if (htmlFilename.indexOf('.html') < 0) continue; + + const htmlPath = basePath + '/' + htmlFilename; + const mdPath = basePath + '/' + filename(htmlFilename) + '.md'; + + // if (htmlFilename !== 'inlineCodeWithLink.html') continue; + + const html = await shim.fsDriver().readFile(htmlPath); + const expectedMd = await shim.fsDriver().readFile(mdPath); + + const contentStream = stringToStream(html); + const actualMd = await enexXmlToMd(contentStream, []); + + if (actualMd !== expectedMd) { + console.info(''); + console.info('Error converting file: ' + htmlFilename); + console.info('--------------------------------- Got:'); + console.info(actualMd); + console.info('--------------------------------- Expected:'); + console.info(expectedMd); + console.info('--------------------------------------------'); + console.info(''); + + expect(false).toBe(true); + process.exit(1); + } else { + expect(true).toBe(true) + } + } + })); + +}); \ No newline at end of file diff --git a/CliClient/tests/html_to_md/code1.html b/CliClient/tests/html_to_md/code1.html new file mode 100644 index 0000000000..3c32632a1a --- /dev/null +++ b/CliClient/tests/html_to_md/code1.html @@ -0,0 +1,16 @@ +
+

For example, consider a web page like this:

+ +
<!DOCTYPE html>
+<html>
+  <head>
+    <meta http-equiv="content-type" content="text/html; charset=utf-8" />
+  </head>
+
+  <body>
+    <script src="page-scripts/page-script.js"></script>
+  </body>
+</html>
+ +

The script "page-script.js" does this:

+
\ No newline at end of file diff --git a/CliClient/tests/html_to_md/code1.md b/CliClient/tests/html_to_md/code1.md new file mode 100644 index 0000000000..f8c09478d9 --- /dev/null +++ b/CliClient/tests/html_to_md/code1.md @@ -0,0 +1,14 @@ +For example, consider a web page like this: + + + + + + + + + + + + +The script "page-script.js" does this: \ No newline at end of file diff --git a/CliClient/tests/html_to_md/heading.html b/CliClient/tests/html_to_md/heading.html new file mode 100644 index 0000000000..8dba219523 --- /dev/null +++ b/CliClient/tests/html_to_md/heading.html @@ -0,0 +1,9 @@ +
+
+

Values added to the global scope of a content script with

+
+ +

Loading content scripts

+ +

You can load a content script into a web page in one of three ways:

+
\ No newline at end of file diff --git a/CliClient/tests/html_to_md/heading.md b/CliClient/tests/html_to_md/heading.md new file mode 100644 index 0000000000..e3576cd508 --- /dev/null +++ b/CliClient/tests/html_to_md/heading.md @@ -0,0 +1,5 @@ +Values added to the global scope of a content script with + +## Loading content scripts + +You can load a content script into a web page in one of three ways: \ No newline at end of file diff --git a/CliClient/tests/html_to_md/inlineCode.html b/CliClient/tests/html_to_md/inlineCode.html new file mode 100644 index 0000000000..56f8b86e12 --- /dev/null +++ b/CliClient/tests/html_to_md/inlineCode.html @@ -0,0 +1,3 @@ +
+

Similarly, I need another regex to match double newlines (\n\n) that are not part of a longer run of newline characters like \n\n\n or \n\n\n\n\n\n etc.

+
\ No newline at end of file diff --git a/CliClient/tests/html_to_md/inlineCode.md b/CliClient/tests/html_to_md/inlineCode.md new file mode 100644 index 0000000000..0adac62c59 --- /dev/null +++ b/CliClient/tests/html_to_md/inlineCode.md @@ -0,0 +1 @@ +Similarly, I need another regex to match double newlines (`\n\n`) that are not part of a longer run of newline characters like `\n\n\n` or `\n\n\n\n\n\n` etc. \ No newline at end of file diff --git a/CliClient/tests/html_to_md/inlineCodeWithLink.html b/CliClient/tests/html_to_md/inlineCodeWithLink.html new file mode 100644 index 0000000000..7a468ec26b --- /dev/null +++ b/CliClient/tests/html_to_md/inlineCodeWithLink.html @@ -0,0 +1,3 @@ +
+

the runtime.onConnect listener gets passed its own runtime.Port object.

+
\ No newline at end of file diff --git a/CliClient/tests/html_to_md/inlineCodeWithLink.md b/CliClient/tests/html_to_md/inlineCodeWithLink.md new file mode 100644 index 0000000000..ef3931a55f --- /dev/null +++ b/CliClient/tests/html_to_md/inlineCodeWithLink.md @@ -0,0 +1 @@ +theĀ `[runtime.onConnect](/en-US/docs/Mozilla/Add-ons/WebExtensions/API/runtime/onConnect)` listener gets passed its own `[runtime.Port](/en-US/docs/Mozilla/Add-ons/WebExtensions/API/runtime/Port)` object. \ No newline at end of file diff --git a/CliClient/tests/html_to_md/list.html b/CliClient/tests/html_to_md/list.html new file mode 100644 index 0000000000..33a964ddc0 --- /dev/null +++ b/CliClient/tests/html_to_md/list.html @@ -0,0 +1,17 @@ +
+

Liste de courses

+ +
+
Pizzas
+
Pain
+
Jambon
+
+ +

+ +
+
On its own
+
+ +

End

+
\ No newline at end of file diff --git a/CliClient/tests/html_to_md/list.md b/CliClient/tests/html_to_md/list.md new file mode 100644 index 0000000000..66d74f8a67 --- /dev/null +++ b/CliClient/tests/html_to_md/list.md @@ -0,0 +1,9 @@ +Liste de courses + +- [X] Pizzas +- [X] Pain +- [X] Jambon + +- [X] On its own + +End \ No newline at end of file diff --git a/CliClient/tests/html_to_md/paragraph.html b/CliClient/tests/html_to_md/paragraph.html new file mode 100644 index 0000000000..0c04d07f55 --- /dev/null +++ b/CliClient/tests/html_to_md/paragraph.html @@ -0,0 +1,5 @@ +
+

Something something

+

Blablbla blabla lbla

+

Last line

+
\ No newline at end of file diff --git a/CliClient/tests/html_to_md/paragraph.md b/CliClient/tests/html_to_md/paragraph.md new file mode 100644 index 0000000000..895af9060a --- /dev/null +++ b/CliClient/tests/html_to_md/paragraph.md @@ -0,0 +1,5 @@ +Something something + +Blablbla blabla lbla + +Last line \ No newline at end of file diff --git a/ReactNativeClient/lib/import-enex-md-gen.js b/ReactNativeClient/lib/import-enex-md-gen.js index 861a0f1478..7336d9b6e4 100644 --- a/ReactNativeClient/lib/import-enex-md-gen.js +++ b/ReactNativeClient/lib/import-enex-md-gen.js @@ -213,8 +213,10 @@ function mergeMonospaceSectionsWrapper(md, ignoreMonospace = false) { } function processMdArrayNewLines(md, isTable = false) { + // console.info(md); + // Try to merge MONOSPACE sections, works good when when not parsing a table - md = mergeMonospaceSectionsWrapper(md, isTable); + // md = mergeMonospaceSectionsWrapper(md, isTable); while (md.length && md[0] == BLOCK_OPEN) { md.shift(); @@ -289,6 +291,8 @@ function processMdArrayNewLines(md, isTable = false) { } } + // console.info(md); + let output = ''; let previous = ''; let start = true; @@ -312,7 +316,148 @@ function processMdArrayNewLines(md, isTable = false) { if (!output.trim().length) return ''; - return output; + let lines = output.replace(/\\r/g, '').split('\n'); + return convertSingleLineCodeBlocksToInline(formatMdLayout(lines)).join('\n'); +} + +// While the processMdArrayNewLines() function adds newlines in a way that's technically correct, the resulting Markdown can look messy. +// This is because while a "block" element should be surrounded by newlines, in practice, some should be surrounded by TWO new lines, while +// others by only ONE. +// +// For instance, this: +// +//
  • one
  • +//
  • two
  • +//
  • three
  • +// +// should result in this: +// +// - one +// - two +// - three +// +// While this: +// +//

    Some long paragraph

    And another one

    And the last paragraph

    +// +// should result in this: +// +// Some long paragraph +// +// And another one +// +// And the last paragraph +// +// So in one case, one newline between tags, and in another two newlines. In HTML this would be done via CSS, but in Markdown we need +// to add new lines. It's also important to get these newlines right because two blocks of text next to each others might be renderered +// differently than if there's a newlines between them. So the function below parses the almost final MD and add new lines depending +// on various rules. + + const isHeading = function(line) { + return !!line.match(/#+\s/); + } + + const isListItem = function(line) { + return line && line.trim().indexOf('- ') === 0; + } + + const isCodeLine = function(line) { + return line && line.indexOf('\t') === 0; + } + + const isPlainParagraph = function(line) { + if (!line || !line.length) return false; + + if (isListItem(line)) return false; + if (isHeading(line)) return false; + if (isCodeLine(line)) return false; + + return true; + } + +function formatMdLayout(lines) { + let previous = ''; + let newLines = []; + for (let i = 0; i < lines.length; i++) { + const line = lines[i]; + + // Add a new line at the end of a list of items + if (isListItem(previous) && line && !isListItem(line)) { + newLines.push(''); + + // Add a new line at the beginning of a list of items + } else if (isListItem(line) && previous && !isListItem(previous)) { + newLines.push(''); + + // Add a new line before a heading + } else if (isHeading(line) && previous) { + newLines.push(''); + + // Add a new line after a heading + } else if (isHeading(previous) && line) { + newLines.push(''); + + // Add a new line at beginning of paragraph + } else if (isPlainParagraph(line) && previous) { + newLines.push(''); + + // Add a new line at end of paragraph + } else if (isPlainParagraph(previous) && line) { + newLines.push(''); + } + + newLines.push(line); + previous = newLines[newLines.length - 1]; + } + + return newLines; +} + +function lineStartsWithDelimiter(line) { + if (!line || !line.length) return false; + return ' ,.;:)]}'.indexOf(line[0]) >= 0; +} + +function convertSingleLineCodeBlocksToInline(lines) { + let newLines = []; + let currentCodeLines = []; + let codeLineCount = 0; + + + const processCurrentCodeLines = (line) => { + if (codeLineCount === 1) { + const inlineCode = currentCodeLines.join('').trim(); + newLines[newLines.length - 1] += '`' + inlineCode + '`'; + if (line) newLines[newLines.length - 1] += (lineStartsWithDelimiter(line) ? '' : ' ') + line; + } else { + newLines = newLines.concat(currentCodeLines); + newLines.push(line); + } + + currentCodeLines = []; + codeLineCount = 0; + } + + for (let i = 0; i < lines.length; i++) { + const line = lines[i]; + + if (isCodeLine(line)) { + currentCodeLines.push(line); + codeLineCount++; + } else if (!line.trim()) { + currentCodeLines.push(line); + } else { + if (currentCodeLines.length) { + processCurrentCodeLines(line); + } else { + newLines.push(line); + } + } + } + + if (currentCodeLines.length) processCurrentCodeLines(''); + + return newLines; } function isWhiteSpace(c) { @@ -343,8 +488,22 @@ function simplifyString(s) { function collapseWhiteSpaceAndAppend(lines, state, text) { if (state.inCode) { - text = "\t" + text; - lines.push(text); + let previous = lines.length ? lines[lines.length - 1] : ''; + + // If the preceding item is a block limit, then the current line should start with a TAB + if ([BLOCK_OPEN, BLOCK_CLOSE, NEWLINE, NEWLINE_MERGED, MONOSPACE_OPEN, MONOSPACE_CLOSE].indexOf(previous) >= 0 || !previous) { + //text = "\t" + text; + lines.push('\t'); + lines.push(text); + } else { + // If the current text contains one or more \n, then the last one should be immediately followed by a TAB + const idx = text.lastIndexOf('\n'); + if (idx >= 0) { + text = text.substr(0, idx+1) + '\t' + text.substr(idx+1); + } + + lines.push(text); + } } else { // Remove all \n and \r from the left and right of the text while (text.length && (text[0] == "\n" || text[0] == "\r")) text = text.substr(1); @@ -563,17 +722,17 @@ function enexXmlToMdArray(stream, resources, options = {}) { let n = node.name.toLowerCase(); - if (n == "div") { - // div tags are recursive, in order to find the end we have to count the depth - if (state.inCodeblock > 0) { - state.inCodeblock++; - } else if (nodeAttributes && nodeAttributes.style && nodeAttributes.style.indexOf("box-sizing: border-box") >= 0) { - // Evernote code block start - state.inCodeblock = 1; - section.lines.push("```"); - return; // skip further processing - } - } + // if (n == "div") { + // // div tags are recursive, in order to find the end we have to count the depth + // if (state.inCodeblock > 0) { + // state.inCodeblock++; + // } else if (nodeAttributes && nodeAttributes.style && nodeAttributes.style.indexOf("box-sizing: border-box") >= 0) { + // // Evernote code block start + // state.inCodeblock = 1; + // section.lines.push("```"); + // return; // skip further processing + // } + // } if (n == 'en-note') { // Start of note @@ -656,7 +815,9 @@ function enexXmlToMdArray(stream, resources, options = {}) { } } else if (isAnchor(n)) { state.anchorAttributes.push(nodeAttributes); - section.lines.push('['); + // Need to add the '[' via this function to make sure that links within code blocks + // are handled correctly. + collapseWhiteSpaceAndAppend(section.lines, state, '['); } else if (isEmTag(n)) { section.lines.push("*"); } else if (n == "en-todo") { @@ -763,26 +924,26 @@ function enexXmlToMdArray(stream, resources, options = {}) { if (resource && !!resource.id) { section.lines = addResourceTag(section.lines, resource, nodeAttributes.alt); } - } else if (n == "span" || n == "font") { - // Check for monospace font. It can come from being specified in either from - // or . - // Monospace sections are already in monospace for Evernote code blocks - if (state.inCodeblock == 0 && nodeAttributes) { - let style = null; + // } else if (n == "span" || n == "font") { + // // Check for monospace font. It can come from being specified in either from + // // or . + // // Monospace sections are already in monospace for Evernote code blocks + // if (state.inCodeblock == 0 && nodeAttributes) { + // let style = null; - if (nodeAttributes.style) { - style = nodeAttributes.style.toLowerCase(); - } else if (nodeAttributes.face) { - style = nodeAttributes.face.toLowerCase(); - } + // if (nodeAttributes.style) { + // style = nodeAttributes.style.toLowerCase(); + // } else if (nodeAttributes.face) { + // style = nodeAttributes.face.toLowerCase(); + // } - monospace = style.match(/monospace|courier|menlo|monaco/) != null; + // monospace = style ? style.match(/monospace|courier|menlo|monaco/) != null : false; - if (monospace) { - state.inMonospaceFont = true; - section.lines.push(MONOSPACE_OPEN); - } - } + // if (monospace) { + // state.inMonospaceFont = true; + // section.lines.push(MONOSPACE_OPEN); + // } + // } } else if (["span", "font", 'sup', 'cite', 'abbr', 'small', 'tt', 'sub', 'colgroup', 'col', 'ins', 'caption', 'var', 'map', 'area', 'label', 'legend'].indexOf(n) >= 0) { // Inline tags that can be ignored in Markdown } else { @@ -793,16 +954,16 @@ function enexXmlToMdArray(stream, resources, options = {}) { saxStream.on('closetag', function(n) { n = n ? n.toLowerCase() : n; - if (n == "div") { - if (state.inCodeblock >= 1) { - state.inCodeblock--; - if (state.inCodeblock == 0) { - // Evernote code block end - section.lines.push("```"); - return; // skip further processing - } - } - } + // if (n == "div") { + // if (state.inCodeblock >= 1) { + // state.inCodeblock--; + // if (state.inCodeblock == 0) { + // // Evernote code block end + // section.lines.push("```"); + // return; // skip further processing + // } + // } + // } if (n == 'en-note') { // End of note @@ -816,11 +977,11 @@ function enexXmlToMdArray(stream, resources, options = {}) { if (section && section.parent) section = section.parent; } else if (n == 'table') { if (section && section.parent) section = section.parent; - } else if (n == "span" || n == "font") { - if (state.inCodeblock == 0 && state.inMonospaceFont) { - state.inMonospaceFont = false; - section.lines.push(MONOSPACE_CLOSE); - } + // } else if (n == "span" || n == "font") { + // if (state.inCodeblock == 0 && state.inMonospaceFont) { + // state.inMonospaceFont = false; + // section.lines.push(MONOSPACE_CLOSE); + // } } else if (isIgnoredEndTag(n)) { // Skip } else if (isListTag(n)) {