From ba9598682c6bba44a832aa293a054ad0759c7da7 Mon Sep 17 00:00:00 2001 From: Laurent Cozic Date: Mon, 14 May 2018 18:46:04 +0100 Subject: [PATCH] HtmlToMd: Fixed various tests --- CliClient/tests/HtmlToMd.js | 6 +- CliClient/tests/html_to_md/link1.html | 4 + CliClient/tests/html_to_md/link1.md | 1 + CliClient/tests/html_to_md/list2.html | 11 ++ CliClient/tests/html_to_md/list2.md | 3 + CliClient/tests/html_to_md/paragraph.html | 6 ++ CliClient/tests/html_to_md/paragraph.md | 7 +- CliClient/tests/html_to_md/text1.html | 2 + CliClient/tests/html_to_md/text1.md | 1 + ReactNativeClient/lib/import-enex-md-gen.js | 108 +++++++++++++++++++- ReactNativeClient/lib/path-utils.js | 3 +- 11 files changed, 141 insertions(+), 11 deletions(-) create mode 100644 CliClient/tests/html_to_md/link1.html create mode 100644 CliClient/tests/html_to_md/link1.md create mode 100644 CliClient/tests/html_to_md/list2.html create mode 100644 CliClient/tests/html_to_md/list2.md create mode 100644 CliClient/tests/html_to_md/text1.html create mode 100644 CliClient/tests/html_to_md/text1.md diff --git a/CliClient/tests/HtmlToMd.js b/CliClient/tests/HtmlToMd.js index b8db791b8..13ca41f63 100644 --- a/CliClient/tests/HtmlToMd.js +++ b/CliClient/tests/HtmlToMd.js @@ -35,12 +35,12 @@ describe('HtmlToMd', function() { const htmlPath = basePath + '/' + htmlFilename; const mdPath = basePath + '/' + filename(htmlFilename) + '.md'; - // if (htmlFilename !== 'tableWithNewLines.html') continue; + // if (htmlFilename !== 'list.html') continue; const html = await shim.fsDriver().readFile(htmlPath); const expectedMd = await shim.fsDriver().readFile(mdPath); - const contentStream = stringToStream(html); + const contentStream = stringToStream('
' + html + '
'); const actualMd = await enexXmlToMd(contentStream, []); if (actualMd !== expectedMd) { @@ -54,7 +54,7 @@ describe('HtmlToMd', function() { console.info(''); expect(false).toBe(true); - return; + // return; } else { expect(true).toBe(true) } diff --git a/CliClient/tests/html_to_md/link1.html b/CliClient/tests/html_to_md/link1.html new file mode 100644 index 000000000..a28599932 --- /dev/null +++ b/CliClient/tests/html_to_md/link1.html @@ -0,0 +1,4 @@ + + Sign in + + \ No newline at end of file diff --git a/CliClient/tests/html_to_md/link1.md b/CliClient/tests/html_to_md/link1.md new file mode 100644 index 000000000..d8ada44d8 --- /dev/null +++ b/CliClient/tests/html_to_md/link1.md @@ -0,0 +1 @@ +[Sign in](https://arstechnica.com/civis/ucp.php?mode=login&return_to=%2Ftech-policy%2F2018%2F05%2Fjails-are-replacing-in-person-visits-with-video-calling-services-theyre-awful%2F) \ No newline at end of file diff --git a/CliClient/tests/html_to_md/list2.html b/CliClient/tests/html_to_md/list2.html new file mode 100644 index 000000000..c1e3b8449 --- /dev/null +++ b/CliClient/tests/html_to_md/list2.html @@ -0,0 +1,11 @@ + \ No newline at end of file diff --git a/CliClient/tests/html_to_md/list2.md b/CliClient/tests/html_to_md/list2.md new file mode 100644 index 000000000..7b9c3f790 --- /dev/null +++ b/CliClient/tests/html_to_md/list2.md @@ -0,0 +1,3 @@ +- [Github](https://github.com/zetter) +- [Twitter](https://twitter.com/czetter) +- [Lanyrd](http://lanyrd.com/profile/czetter/) \ No newline at end of file diff --git a/CliClient/tests/html_to_md/paragraph.html b/CliClient/tests/html_to_md/paragraph.html index 0c04d07f5..e740f00e1 100644 --- a/CliClient/tests/html_to_md/paragraph.html +++ b/CliClient/tests/html_to_md/paragraph.html @@ -1,5 +1,11 @@
+

Short paragraphs are merged together:

Something something

Blablbla blabla lbla

Last line

+
+ +
+

Longer ones are separated by new lines. In 1894 Joplin arrived in Sedalia, Missouri. At first, Joplin stayed with the family of Arthur Marshall, at the time a 13-year-old boy but later one of Joplin's students and a rag-time composer in his own right.[26] There is no record of Joplin having a permanent residence in the town until 1904, as Joplin was making a living as a touring musician.

+

There is little precise evidence known about Joplin's activities at this time, although he performed as a solo musician at dances and at the major black clubs in Sedalia, the Black 400 club and the Maple Leaf Club. He performed in the Queen City Cornet Band, and his own six-piece dance orchestra.

\ No newline at end of file diff --git a/CliClient/tests/html_to_md/paragraph.md b/CliClient/tests/html_to_md/paragraph.md index 895af9060..ea2f6ada2 100644 --- a/CliClient/tests/html_to_md/paragraph.md +++ b/CliClient/tests/html_to_md/paragraph.md @@ -1,5 +1,8 @@ +Short paragraphs are merged together: Something something - Blablbla blabla lbla +Last line -Last line \ No newline at end of file +Longer ones are separated by new lines. In 1894 Joplin arrived in Sedalia, Missouri. At first, Joplin stayed with the family of Arthur Marshall, at the time a 13-year-old boy but later one of Joplin's students and a rag-time composer in his own right.[26] There is no record of Joplin having a permanent residence in the town until 1904, as Joplin was making a living as a touring musician. + +There is little precise evidence known about Joplin's activities at this time, although he performed as a solo musician at dances and at the major black clubs in Sedalia, the Black 400 club and the Maple Leaf Club. He performed in the Queen City Cornet Band, and his own six-piece dance orchestra. \ No newline at end of file diff --git a/CliClient/tests/html_to_md/text1.html b/CliClient/tests/html_to_md/text1.html new file mode 100644 index 000000000..578dba884 --- /dev/null +++ b/CliClient/tests/html_to_md/text1.html @@ -0,0 +1,2 @@ +Kurt Gödel published a very short +but profound paper titled \ No newline at end of file diff --git a/CliClient/tests/html_to_md/text1.md b/CliClient/tests/html_to_md/text1.md new file mode 100644 index 000000000..9079625c3 --- /dev/null +++ b/CliClient/tests/html_to_md/text1.md @@ -0,0 +1 @@ +Kurt Gödel published a very short but profound paper titled \ No newline at end of file diff --git a/ReactNativeClient/lib/import-enex-md-gen.js b/ReactNativeClient/lib/import-enex-md-gen.js index d597bc2cd..58cca60b5 100644 --- a/ReactNativeClient/lib/import-enex-md-gen.js +++ b/ReactNativeClient/lib/import-enex-md-gen.js @@ -316,8 +316,30 @@ function processMdArrayNewLines(md, isTable = false) { if (!output.trim().length) return ''; + // To simplify the result, we only allow up to one empty line between blocks of text + const mergeMultipleNewLines = function(lines) { + let output = []; + let newlineCount = 0; + for (let i = 0; i < lines.length; i++) { + const line = lines[i]; + if (!line.trim()) { + newlineCount++; + } else { + newlineCount = 0; + } + + if (newlineCount >= 2) continue; + + output.push(line); + } + return output; + } + let lines = output.replace(/\\r/g, '').split('\n'); - return convertSingleLineCodeBlocksToInline(formatMdLayout(lines)).join('\n'); + lines = formatMdLayout(lines) + lines = convertSingleLineCodeBlocksToInline(lines) + lines = mergeMultipleNewLines(lines); + return lines.join('\n'); } // While the processMdArrayNewLines() function adds newlines in a way that's technically correct, the resulting Markdown can look messy. @@ -370,7 +392,10 @@ function processMdArrayNewLines(md, isTable = false) { } const isPlainParagraph = function(line) { - if (!line || !line.length) return false; + // Note: if a line is no longer than 80 characters, we don't consider it's a paragraph, which + // means no newlines will be added before or after. This is to handle text that has been + // written with "hard" new lines. + if (!line || line.length < 80) return false; if (isListItem(line)) return false; if (isHeading(line)) return false; @@ -401,6 +426,12 @@ function formatMdLayout(lines) { // Add a new line after a heading } else if (isHeading(previous) && line) { newLines.push(''); + + } else if (isCodeLine(line) && !isCodeLine(previous)) { + newLines.push(''); + + } else if (!isCodeLine(line) && isCodeLine(previous)) { + newLines.push(''); // Add a new line at beginning of paragraph } else if (isPlainParagraph(line) && previous) { @@ -510,10 +541,14 @@ function collapseWhiteSpaceAndAppend(lines, state, text) { lines.push(text); } } else { + // Remove all \n and \r from the left and right of the text while (text.length && (text[0] == "\n" || text[0] == "\r")) text = text.substr(1); while (text.length && (text[text.length - 1] == "\n" || text[text.length - 1] == "\r")) text = text.substr(0, text.length - 1); + // Replace the inner \n with a space + text = text.replace(/[\n\r]+/g, ' '); + // Collapse all white spaces to just one. If there are spaces to the left and right of the string // also collapse them to just one space. let spaceLeft = text.length && text[0] == ' '; @@ -831,7 +866,7 @@ function enexXmlToMdArray(stream, resources, options = {}) { } else if (n == "hr") { // Needs to be surrounded by new lines so that it's properly rendered as a line when converting to HTML section.lines.push(NEWLINE); - section.lines.push('----------------------------------------'); + section.lines.push('* * *'); section.lines.push(NEWLINE); section.lines.push(NEWLINE); } else if (n == "h1") { @@ -1024,7 +1059,7 @@ function enexXmlToMdArray(stream, resources, options = {}) { let previous = null; for (let i = section.lines.length - 1; i >= 0; i--) { previous = section.lines[i]; - if ([BLOCK_OPEN, BLOCK_CLOSE, NEWLINE, NEWLINE_MERGED, SPACE].indexOf(previous) >= 0) { + if ([BLOCK_OPEN, BLOCK_CLOSE, NEWLINE, NEWLINE_MERGED, SPACE].indexOf(previous) >= 0 || !previous) { continue; } else { break; @@ -1088,6 +1123,42 @@ function enexXmlToMdArray(stream, resources, options = {}) { } section.lines.push(url); } else { + + // Eg. converts: + // [ Sign in ](https://example.com) + // to: + // [Sign in](https://example.com) + const trimTextStartAndEndSpaces = function(lines) { + let firstBracketIndex = 0; + let foundFirstNonWhite = false; + for (let i = lines.length - 1; i >= 0; i--) { + const l = lines[i]; + if (!foundFirstNonWhite && (l === SPACE || l === ' ' || !l)) { + lines.pop(); + } else { + foundFirstNonWhite = true; + } + + if (l === '[') { + firstBracketIndex = i; + break; + } + } + + for (let i = firstBracketIndex + 1; i < lines.length; i++) { + const l = lines[i]; + if (l === SPACE || l === ' ' ||!l) { + lines.splice(i, 1); + } else { + break; + } + } + + return lines; + } + + section.lines = trimTextStartAndEndSpaces(section.lines); + section.lines.push('](' + url + ')'); } } @@ -1263,7 +1334,34 @@ async function enexXmlToMd(stream, resources, options = {}) { firstAttachment = false; } - return processMdArrayNewLines(mdLines); + let output = processMdArrayNewLines(mdLines); + + // After importing HTML, the resulting Markdown often has empty lines at the beginning and end due to + // block start/end or elements that were ignored, etc. If these white spaces were intended it's not really + // possible to detect it, so simply trim them all so that the result is more deterministic and can be + // easily unit tested. + const trimEmptyLines = function(text) { + const lines = text.split('\n'); + while (lines.length) { + if (!lines[0].trim()) { + lines.splice(0, 1); + } else { + break; + } + } + + while (lines.length) { + if (!lines[lines.length - 1].trim()) { + lines.pop(); + } else { + break; + } + } + + return lines.join('\n'); + } + + return trimEmptyLines(output); } module.exports = { enexXmlToMd, processMdArrayNewLines, NEWLINE, addResourceTag }; \ No newline at end of file diff --git a/ReactNativeClient/lib/path-utils.js b/ReactNativeClient/lib/path-utils.js index 13f2053a6..526a9b277 100644 --- a/ReactNativeClient/lib/path-utils.js +++ b/ReactNativeClient/lib/path-utils.js @@ -40,7 +40,8 @@ function safeFileExtension(e) { return e.replace(/[^a-zA-Z0-9]/g, '') } -function toSystemSlashes(path, os) { +function toSystemSlashes(path, os = null) { + if (os === null) os = process.platform; if (os === 'win32') return path.replace(/\//g, "\\"); return path.replace(/\\/g, "/"); }