All: More robust HTML to MD conversion and started adding test units for it

2024-12-24 10:27:10 +02:00 · 2018-05-12 11:48:39 +01:00 · 2018-05-12 11:48:39 +01:00 · 394f2df664
commit 394f2df664
parent 2a04378a0d
14 changed files with 361 additions and 48 deletions
--- a/CliClient/tests/HtmlToMd.js
+++ b/CliClient/tests/HtmlToMd.js
@ -0,0 +1,64 @@
+require('app-module-path').addPath(__dirname);
+
+const { time } = require('lib/time-utils.js');
+const { filename } = require('lib/path-utils.js');
+const { asyncTest, fileContentEqual, setupDatabase, setupDatabaseAndSynchronizer, db, synchronizer, fileApi, sleep, clearDatabase, switchClient, syncTargetId, objectsEqual, checkThrowAsync } = require('test-utils.js');
+const Folder = require('lib/models/Folder.js');
+const Note = require('lib/models/Note.js');
+const BaseModel = require('lib/BaseModel.js');
+const { shim } = require('lib/shim');
+const { enexXmlToMd } = require('lib/import-enex-md-gen.js');
+const stringToStream = require('string-to-stream')
+
+jasmine.DEFAULT_TIMEOUT_INTERVAL = 60 * 60 * 1000; // Can run for a while since everything is in the same test unit
+
+// Log promise rejections that nothing handled, so they are not lost silently.
+process.on('unhandledRejection', (reason, p) => {
+	console.log('Unhandled Rejection at: Promise', p, 'reason:', reason);
+});
+
+describe('HtmlToMd', function() {
+
+	beforeEach(async (done) => {
+		await setupDatabaseAndSynchronizer(1);
+		await switchClient(1);
+		done();
+	});
+
+	// Converts every "*.html" fixture found in tests/html_to_md and compares the
+	// result with the content of the matching ".md" file in the same directory.
+	it('should convert from HTML to Markdown', asyncTest(async () => {
+		const basePath = __dirname + '/html_to_md';
+		const files = await shim.fsDriver().readDirStats(basePath);
+
+		for (let i = 0; i < files.length; i++) {
+			const htmlFilename = files[i].path;
+
+			// Only files whose name ends with ".html" are inputs; a name that merely
+			// contains ".html" (for example an editor backup) is not a fixture.
+			if (!htmlFilename.endsWith('.html')) continue;
+
+			const htmlPath = basePath + '/' + htmlFilename;
+			const mdPath = basePath + '/' + filename(htmlFilename) + '.md';
+
+			const html = await shim.fsDriver().readFile(htmlPath);
+			const expectedMd = await shim.fsDriver().readFile(mdPath);
+
+			const contentStream = stringToStream(html);
+			const actualMd = await enexXmlToMd(contentStream, []);
+
+			if (actualMd !== expectedMd) {
+				console.info('');
+				console.info('Error converting file: ' + htmlFilename);
+				console.info('--------------------------------- Got:');
+				console.info(actualMd);
+				console.info('--------------------------------- Expected:');
+				console.info(expectedMd);
+				console.info('--------------------------------------------');
+				console.info('');
+			}
+
+			// Let Jasmine record the mismatch instead of killing the process: exiting
+			// here would stop the runner before it can report this or any other spec.
+			expect(actualMd).toBe(expectedMd);
+		}
+	}));
+
+});
--- a/CliClient/tests/html_to_md/code1.html
+++ b/CliClient/tests/html_to_md/code1.html
@ -0,0 +1,16 @@
+<div>
+	<p>For example, consider a web page like this:</p>
+
+<pre class="brush: html line-numbers  language-html"><code class=" language-html"><span class="token doctype">&lt;!DOCTYPE html&gt;</span>
+<span class="token tag"><span class="token tag"><span class="token punctuation">&lt;</span>html</span><span class="token punctuation">&gt;</span></span>
+  <span class="token tag"><span class="token tag"><span class="token punctuation">&lt;</span>head</span><span class="token punctuation">&gt;</span></span>
+    <span class="token tag"><span class="token tag"><span class="token punctuation">&lt;</span>meta</span> <span class="token attr-name">http-equiv</span><span class="token attr-value"><span class="token punctuation">=</span><span class="token punctuation">"</span>content-type<span class="token punctuation">"</span></span> <span class="token attr-name">content</span><span class="token attr-value"><span class="token punctuation">=</span><span class="token punctuation">"</span>text/html; charset<span class="token punctuation">=</span>utf-8<span class="token punctuation">"</span></span> <span class="token punctuation">/&gt;</span></span>
+  <span class="token tag"><span class="token tag"><span class="token punctuation">&lt;/</span>head</span><span class="token punctuation">&gt;</span></span>
+
+  <span class="token tag"><span class="token tag"><span class="token punctuation">&lt;</span>body</span><span class="token punctuation">&gt;</span></span>
+    <span class="token tag"><span class="token tag"><span class="token punctuation">&lt;</span>script</span> <span class="token attr-name">src</span><span class="token attr-value"><span class="token punctuation">=</span><span class="token punctuation">"</span>page-scripts/page-script.js<span class="token punctuation">"</span></span><span class="token punctuation">&gt;</span></span><span class="token script language-javascript"></span><span class="token tag"><span class="token tag"><span class="token punctuation">&lt;/</span>script</span><span class="token punctuation">&gt;</span></span>
+  <span class="token tag"><span class="token tag"><span class="token punctuation">&lt;/</span>body</span><span class="token punctuation">&gt;</span></span>
+<span class="token tag"><span class="token tag"><span class="token punctuation">&lt;/</span>html</span><span class="token punctuation">&gt;</span></span><span class="line-numbers-rows"><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span></span></code></pre>
+
+	<p>The script "page-script.js" does this:</p>
+</div>
--- a/CliClient/tests/html_to_md/code1.md
+++ b/CliClient/tests/html_to_md/code1.md
@ -0,0 +1,14 @@
+For example, consider a web page like this:
+
+	<!DOCTYPE html>
+	<html>
+	  <head>
+	    <meta http-equiv="content-type" content="text/html; charset=utf-8" />
+	  </head>
+
+	  <body>
+	    <script src="page-scripts/page-script.js"></script>
+	  </body>
+	</html>
+
+The script "page-script.js" does this:
--- a/CliClient/tests/html_to_md/heading.html
+++ b/CliClient/tests/html_to_md/heading.html
@ -0,0 +1,9 @@
+<div>
+	<div class="note">
+		<p>Values added to the global scope of a content script with</p>
+	</div>
+
+	<h2 id="Loading_content_scripts">Loading content scripts</h2>
+
+	<p>You can load a content script into a web page in one of three ways:</p>
+</div>
--- a/CliClient/tests/html_to_md/heading.md
+++ b/CliClient/tests/html_to_md/heading.md
@ -0,0 +1,5 @@
+Values added to the global scope of a content script with
+
+## Loading content scripts
+
+You can load a content script into a web page in one of three ways:
--- a/CliClient/tests/html_to_md/inlineCode.html
+++ b/CliClient/tests/html_to_md/inlineCode.html
@ -0,0 +1,3 @@
+<div>
+	<p>Similarly, I need another regex to match double newlines (<code>\n\n</code>) that are not part of a longer run of newline characters like <code>\n\n\n</code> or <code>\n\n\n\n\n\n</code> etc.</p>
+</div>
--- a/CliClient/tests/html_to_md/inlineCode.md
+++ b/CliClient/tests/html_to_md/inlineCode.md
@ -0,0 +1 @@
+Similarly, I need another regex to match double newlines (`\n\n`) that are not part of a longer run of newline characters like `\n\n\n` or `\n\n\n\n\n\n` etc.
--- a/CliClient/tests/html_to_md/inlineCodeWithLink.html
+++ b/CliClient/tests/html_to_md/inlineCodeWithLink.html
@ -0,0 +1,3 @@
+<div>
+<p>the&nbsp;<code><a href="/en-US/docs/Mozilla/Add-ons/WebExtensions/API/runtime/onConnect">runtime.onConnect</a></code> listener gets passed its own <code><a href="/en-US/docs/Mozilla/Add-ons/WebExtensions/API/runtime/Port">runtime.Port</a></code> object.</p>
+</div>
--- a/CliClient/tests/html_to_md/inlineCodeWithLink.md
+++ b/CliClient/tests/html_to_md/inlineCodeWithLink.md
@ -0,0 +1 @@
+the `[runtime.onConnect](/en-US/docs/Mozilla/Add-ons/WebExtensions/API/runtime/onConnect)` listener gets passed its own `[runtime.Port](/en-US/docs/Mozilla/Add-ons/WebExtensions/API/runtime/Port)` object.
--- a/CliClient/tests/html_to_md/list.html
+++ b/CliClient/tests/html_to_md/list.html
@ -0,0 +1,17 @@
+<div>
+	<p>Liste de courses</p>
+
+	<div>
+		<div><en-todo checked="true"/>Pizzas</div>
+		<div><en-todo checked="true"/>Pain</div>
+		<div><en-todo checked="true"/>Jambon</div>
+	</div>
+
+	<div><br/></div>
+
+	<div>
+		<div><en-todo checked="true"/>On its own</div>
+	</div>
+
+	<p>End</p>
+</div>
--- a/CliClient/tests/html_to_md/list.md
+++ b/CliClient/tests/html_to_md/list.md
@ -0,0 +1,9 @@
+Liste de courses
+
+- [X] Pizzas
+- [X] Pain
+- [X] Jambon
+
+- [X] On its own
+
+End
--- a/CliClient/tests/html_to_md/paragraph.html
+++ b/CliClient/tests/html_to_md/paragraph.html
@ -0,0 +1,5 @@
+<div>
+	<p>Something something</p>
+	<p>Blablbla blabla lbla</p>
+	<p>Last line</p>
+</div>
--- a/CliClient/tests/html_to_md/paragraph.md
+++ b/CliClient/tests/html_to_md/paragraph.md
@ -0,0 +1,5 @@
+Something something
+
+Blablbla blabla lbla
+
+Last line
--- a/ReactNativeClient/lib/import-enex-md-gen.js
+++ b/ReactNativeClient/lib/import-enex-md-gen.js
@ -213,8 +213,10 @@ function mergeMonospaceSectionsWrapper(md, ignoreMonospace = false) {
 }

 function processMdArrayNewLines(md, isTable = false) {
+	// console.info(md);
+
 	// Try to merge MONOSPACE sections, works good when when not parsing a table
-	md = mergeMonospaceSectionsWrapper(md, isTable);
+	// md = mergeMonospaceSectionsWrapper(md, isTable);

 	while (md.length && md[0] == BLOCK_OPEN) {
 		md.shift();
@ -289,6 +291,8 @@ function processMdArrayNewLines(md, isTable = false) {
 		}
 	}

+	// console.info(md);
+
 	let output = '';
 	let previous = '';
 	let start = true;
@ -312,7 +316,148 @@ function processMdArrayNewLines(md, isTable = false) {

 	if (!output.trim().length) return '';

-	return output;
+	let lines = output.replace(/\\r/g, '').split('\n');
+	return convertSingleLineCodeBlocksToInline(formatMdLayout(lines)).join('\n');
+}
+
+// While the processMdArrayNewLines() function adds newlines in a way that's technically correct, the resulting Markdown can look messy.
+// This is because while a "block" element should be surrounded by newlines, in practice, some should be surrounded by TWO new lines, while
+// others by only ONE.
+//
+// For instance, this:
+//
+//     <li>one</li>
+//     <li>two</li>
+//     <li>three</li>
+//
+// should result in this:
+// 
+//     - one
+//     - two
+//     - three
+//
+// While this:
+//
+//     <p>Some long paragraph</p><p>And another one</p><p>And the last paragraph</p>
+//
+// should result in this:
+//
+//     Some long paragraph
+//     
+//     And another one
+//    
+//     And the last paragraph
+//
+// So in one case there is one newline between tags, and in the other two newlines. In HTML this would be done via CSS, but in Markdown we need
+// to add new lines. It's also important to get these newlines right because two blocks of text next to each other might be rendered
+// differently than if there's a newline between them. So the function below parses the almost final MD and adds new lines depending
+// on various rules.
+
+	// Line classifiers for the generated Markdown. Each one takes a single line
+	// (a string; empty, null or undefined is accepted) and returns a strict boolean.
+
+	// True when the line STARTS with one or more "#" followed by whitespace.
+	// The match is anchored so that a "# " appearing later in the line (inside a
+	// list item, a sentence or a TAB-indented code line) is not taken for a heading.
+	const isHeading = function(line) {
+		return !!line && /^#+\s/.test(line);
+	};
+
+	// True when the line, ignoring surrounding whitespace, starts with "- ".
+	const isListItem = function(line) {
+		return !!line && line.trim().indexOf('- ') === 0;
+	};
+
+	// True when the line starts with a TAB character.
+	const isCodeLine = function(line) {
+		return !!line && line.indexOf('\t') === 0;
+	};
+
+	// True for a non-empty line that is neither a list item, a heading nor a code line.
+	const isPlainParagraph = function(line) {
+		if (!line || !line.length) return false;
+
+		if (isListItem(line)) return false;
+		if (isHeading(line)) return false;
+		if (isCodeLine(line)) return false;
+
+		return true;
+	};
+
+// Walks the given Markdown lines and returns a new array in which an empty line
+// has been inserted wherever one of the layout rules below applies to the pair
+// (previous line, current line). The input array is not modified.
+function formatMdLayout(lines) {
+	const formatted = [];
+	let previousLine = '';
+
+	for (const currentLine of lines) {
+		// The rules are checked in order and evaluation stops at the first match.
+		const needsBlankLine =
+			// End of a list of items
+			(isListItem(previousLine) && currentLine && !isListItem(currentLine)) ||
+			// Beginning of a list of items
+			(isListItem(currentLine) && previousLine && !isListItem(previousLine)) ||
+			// Before a heading
+			(isHeading(currentLine) && previousLine) ||
+			// After a heading
+			(isHeading(previousLine) && currentLine) ||
+			// Beginning of a paragraph
+			(isPlainParagraph(currentLine) && previousLine) ||
+			// End of a paragraph
+			(isPlainParagraph(previousLine) && currentLine);
+
+		if (needsBlankLine) formatted.push('');
+
+		formatted.push(currentLine);
+		previousLine = currentLine;
+	}
+
+	return formatted;
+}
+
+// Tells whether the first character of the line is one of: space , . ; : ) ] }
+// Returns false when the line is missing or empty.
+function lineStartsWithDelimiter(line) {
+	const firstChar = line && line.length ? line[0] : null;
+	if (firstChar === null) return false;
+	return ' ,.;:)]}'.includes(firstChar);
+}
+
+// Rewrites code blocks made of exactly one code line as inline code.
+//
+// Code lines (as reported by isCodeLine()) and blank lines are collected until the
+// next line that is neither. If exactly one code line was collected, it is trimmed,
+// wrapped in backticks and appended to the last output line, followed by the next
+// line (separated by a space unless that line starts with a delimiter). Otherwise
+// the collected lines and the next line are copied to the output unchanged.
+//
+// Takes an array of strings and returns a new array; the input is not modified.
+function convertSingleLineCodeBlocksToInline(lines) {
+	let newLines = [];
+	let pendingLines = []; // Code lines and blank lines seen since the last text line
+	let codeLineCount = 0; // How many entries of pendingLines are code lines
+
+	const flushPendingLines = (nextLine) => {
+		if (codeLineCount === 1) {
+			const inlineCode = pendingLines.join('').trim();
+
+			// When the code line comes before any text there is no output line to
+			// append to yet. Start one, otherwise the code (and nextLine) would be
+			// written to index -1 and lost.
+			if (!newLines.length) newLines.push('');
+
+			const lastIndex = newLines.length - 1;
+			newLines[lastIndex] += '`' + inlineCode + '`';
+			if (nextLine) newLines[lastIndex] += (lineStartsWithDelimiter(nextLine) ? '' : ' ') + nextLine;
+		} else {
+			newLines = newLines.concat(pendingLines);
+			newLines.push(nextLine);
+		}
+
+		pendingLines = [];
+		codeLineCount = 0;
+	};
+
+	for (const line of lines) {
+		if (isCodeLine(line)) {
+			pendingLines.push(line);
+			codeLineCount++;
+		} else if (!line.trim()) {
+			// Blank lines are held back: they are dropped if the block becomes inline
+			pendingLines.push(line);
+		} else if (pendingLines.length) {
+			flushPendingLines(line);
+		} else {
+			newLines.push(line);
+		}
+	}
+
+	if (pendingLines.length) flushPendingLines('');
+
+	return newLines;
+}

 function isWhiteSpace(c) {
@ -343,8 +488,22 @@ function simplifyString(s) {

 function collapseWhiteSpaceAndAppend(lines, state, text) {
 	if (state.inCode) {
-		text = "\t" + text;
-		lines.push(text);
+		let previous = lines.length ? lines[lines.length - 1] : '';
+
+		// If the preceding item is a block limit, then the current line should start with a TAB
+		if ([BLOCK_OPEN, BLOCK_CLOSE, NEWLINE, NEWLINE_MERGED, MONOSPACE_OPEN, MONOSPACE_CLOSE].indexOf(previous) >= 0 || !previous) {
+			//text = "\t" + text;
+			lines.push('\t');
+			lines.push(text);
+		} else {
+			// If the current text contains one or more \n, then the last one should be immediately followed by a TAB
+			const idx = text.lastIndexOf('\n');
+			if (idx >= 0) {
+				text = text.substr(0, idx+1) + '\t' + text.substr(idx+1);
+			}
+
+			lines.push(text);
+		}
 	} else {
 		// Remove all \n and \r from the left and right of the text
 		while (text.length && (text[0] == "\n" || text[0] == "\r")) text = text.substr(1);
@ -563,17 +722,17 @@ function enexXmlToMdArray(stream, resources, options = {}) {

 			let n = node.name.toLowerCase();

-			if (n == "div") {
-				// div tags are recursive, in order to find the end we have to count the depth
-				if (state.inCodeblock > 0) {
-					state.inCodeblock++;
-				} else if (nodeAttributes && nodeAttributes.style && nodeAttributes.style.indexOf("box-sizing: border-box") >= 0) {
-					// Evernote code block start
-					state.inCodeblock = 1;
-					section.lines.push("```");
-					return; // skip further processing
-				}
-			}
+			// if (n == "div") {
+			// 	// div tags are recursive, in order to find the end we have to count the depth
+			// 	if (state.inCodeblock > 0) {
+			// 		state.inCodeblock++;
+			// 	} else if (nodeAttributes && nodeAttributes.style && nodeAttributes.style.indexOf("box-sizing: border-box") >= 0) {
+			// 		// Evernote code block start
+			// 		state.inCodeblock = 1;
+			// 		section.lines.push("```");
+			// 		return; // skip further processing
+			// 	}
+			// }

 			if (n == 'en-note') {
 				// Start of note
@ -656,7 +815,9 @@ function enexXmlToMdArray(stream, resources, options = {}) {
 				}
 			} else if (isAnchor(n)) {
 				state.anchorAttributes.push(nodeAttributes);
-				section.lines.push('[');
+				// Need to add the '[' via this function to make sure that links within code blocks
+				// are handled correctly.
+				collapseWhiteSpaceAndAppend(section.lines, state, '[');
 			} else if (isEmTag(n)) {
 				section.lines.push("*");
 			} else if (n == "en-todo") {
@ -763,26 +924,26 @@ function enexXmlToMdArray(stream, resources, options = {}) {
 				if (resource && !!resource.id) {
 					section.lines = addResourceTag(section.lines, resource, nodeAttributes.alt);
 				}
-		 	} else if (n == "span" || n == "font") {
-				// Check for monospace font. It can come from being specified in either from
-				// <span style="..."> or <font face="...">.
-				// Monospace sections are already in monospace for Evernote code blocks
-				if (state.inCodeblock == 0 && nodeAttributes) {
-					let style = null;
+		 	// } else if (n == "span" || n == "font") {
+				// // Check for monospace font. It can come from being specified in either from
+				// // <span style="..."> or <font face="...">.
+				// // Monospace sections are already in monospace for Evernote code blocks
+				// if (state.inCodeblock == 0 && nodeAttributes) {
+				// 	let style = null;

-					if (nodeAttributes.style) {
-						style = nodeAttributes.style.toLowerCase();
-					} else if (nodeAttributes.face) {
-						style = nodeAttributes.face.toLowerCase();
-					}
+				// 	if (nodeAttributes.style) {
+				// 		style = nodeAttributes.style.toLowerCase();
+				// 	} else if (nodeAttributes.face) {
+				// 		style = nodeAttributes.face.toLowerCase();
+				// 	}
 				
-					monospace = style.match(/monospace|courier|menlo|monaco/) != null;
+				// 	monospace = style ? style.match(/monospace|courier|menlo|monaco/) != null : false;

-					if (monospace) {
-						state.inMonospaceFont = true;
-						section.lines.push(MONOSPACE_OPEN);
-					}
-				} 
+				// 	if (monospace) {
+				// 		state.inMonospaceFont = true;
+				// 		section.lines.push(MONOSPACE_OPEN);
+				// 	}
+				// } 
 			} else if (["span", "font", 'sup', 'cite', 'abbr', 'small', 'tt', 'sub', 'colgroup', 'col', 'ins', 'caption', 'var', 'map', 'area', 'label', 'legend'].indexOf(n) >= 0) {
 				// Inline tags that can be ignored in Markdown
 			} else {
@ -793,16 +954,16 @@ function enexXmlToMdArray(stream, resources, options = {}) {
 		saxStream.on('closetag', function(n) {
 			n = n ? n.toLowerCase() : n;

-			if (n == "div") {
-				if (state.inCodeblock >= 1) {
-					state.inCodeblock--;
-					if (state.inCodeblock == 0) {
-						// Evernote code block end
-						section.lines.push("```");
-						return; // skip further processing
-					}
-				}
-			}
+			// if (n == "div") {
+			// 	if (state.inCodeblock >= 1) {
+			// 		state.inCodeblock--;
+			// 		if (state.inCodeblock == 0) {
+			// 			// Evernote code block end
+			// 			section.lines.push("```");
+			// 			return; // skip further processing
+			// 		}
+			// 	}
+			// }

 			if (n == 'en-note') {
 				// End of note
@ -816,11 +977,11 @@ function enexXmlToMdArray(stream, resources, options = {}) {
 				if (section && section.parent) section = section.parent;
 			} else if (n == 'table') {
 				if (section && section.parent) section = section.parent;
-			} else if (n == "span" || n == "font") {
-				if (state.inCodeblock == 0 && state.inMonospaceFont) {
-					state.inMonospaceFont = false;
-					section.lines.push(MONOSPACE_CLOSE);
-				}
+			// } else if (n == "span" || n == "font") {
+			// 	if (state.inCodeblock == 0 && state.inMonospaceFont) {
+			// 		state.inMonospaceFont = false;
+			// 		section.lines.push(MONOSPACE_CLOSE);
+			// 	}
 			} else if (isIgnoredEndTag(n)) {
 				// Skip
 			} else if (isListTag(n)) {
				`@ -0,0 +1 @@`
				Similarly, I need another regex to match double newlines (`\n\n`) that are not part of a longer run of newline characters like `\n\n\n` or `\n\n\n\n\n\n` etc.
				`@ -0,0 +1 @@`
				the `[runtime.onConnect](/en-US/docs/Mozilla/Add-ons/WebExtensions/API/runtime/onConnect)` listener gets passed its own `[runtime.Port](/en-US/docs/Mozilla/Add-ons/WebExtensions/API/runtime/Port)` object.