Merge monospace text lines when importing from Evernote

2024-12-24 10:27:10 +02:00 · 2018-04-10 22:10:33 +02:00 · 2018-04-10 22:10:33 +02:00 · 16554b22c7
commit 16554b22c7
parent d5574098f0
1 changed files with 266 additions and 4 deletions
--- a/ReactNativeClient/lib/import-enex-md-gen.js
+++ b/ReactNativeClient/lib/import-enex-md-gen.js
@ -5,8 +5,235 @@ const BLOCK_CLOSE = "[[BLOCK_CLOSE]]";
 const NEWLINE = "[[NEWLINE]]";
 const NEWLINE_MERGED = "[[MERGED]]";
 const SPACE = "[[SPACE]]";
+// Marker tokens pushed into the md lines when a <span> or <font> element that uses a
+// monospace font (Courier, Menlo, Monaco) is opened and closed. They are replaced
+// later with the appropriate markdown (` or ```).
+const MONOSPACE_OPEN = "[[MONOSPACE_OPEN]]";
+const MONOSPACE_CLOSE = "[[MONOSPACE_CLOSE]]";
+
+// Set to true to print debugging output of the monospace merging to the console
+const DEBUG_MONOSPACE_MERGE = false;
+
+
+// Debugging helper. Does nothing unless DEBUG_MONOSPACE_MERGE is enabled; otherwise it
+// prints every item of md together with its index, framed by a START and a STOP line
+// that both carry the given text as a label.
+function debugMD(text, md) {
+	if (!DEBUG_MONOSPACE_MERGE) return;
+
+	console.log("< " + text + " START>");
+
+	let index = 0;
+	while (index < md.length) {
+		console.log("%i: \"%s\"", index, md[index]);
+		index += 1;
+	}
+
+	console.log("< " + text + " STOP>");
+}
+
+
+// This function will return a list of all monospace sections with a flag saying whether they can be merged or not
+// Scans md for MONOSPACE_OPEN / MONOSPACE_CLOSE markers and returns an object with:
+//   md:                a copy of the input in which an empty-string placeholder follows every
+//                      open marker and precedes every close marker; the markers themselves are
+//                      left out of the copy when ignoreMonospace is true
+//   monospaceSections: one descriptor per closed section with the positions of its markers in
+//                      the returned md (they stay null when ignoreMonospace is true) and the
+//                      flags mergeAllowed and mergeWithPrevious
+// Throws an Error when the markers are not properly paired.
+function findMonospaceSections(md, ignoreMonospace = false) {
+	const output = [];
+	const found = [];
+	let current = null;
+
+	// Stays true as long as only NEWLINE / BLOCK_OPEN / BLOCK_CLOSE items were seen since
+	// the previous section was closed (or since the beginning of md)
+	let separatedByNewlinesOnly = true;
+
+	for (let pos = 0; pos < md.length; pos++) {
+		const item = md[pos];
+		const previous = pos > 0 ? md[pos - 1] : "";
+
+		if (item != MONOSPACE_OPEN && item != MONOSPACE_CLOSE) {
+			// Regular item: copy it over. Two sections can be merged only if nothing
+			// but newlines and block boundaries stands between them.
+			if (!(item == NEWLINE || item == BLOCK_OPEN || item == BLOCK_CLOSE)) {
+				separatedByNewlinesOnly = false;
+			}
+			output.push(item);
+			continue;
+		}
+
+		if (item == MONOSPACE_OPEN) {
+			// Sanity check, but normally not possible
+			if (current != null) throw new Error('Monospace open tag detected while the previous was not closed');
+
+			current = {
+				openIndex: null,
+				closeIndex: null,
+				// A section that does not start right after BLOCK_OPEN is inline code and cannot be merged
+				mergeAllowed: previous == BLOCK_OPEN,
+				mergeWithPrevious: separatedByNewlinesOnly,
+			};
+
+			// Remember where the section begins, the marker is later replaced with ` or ```
+			if (!ignoreMonospace) {
+				current.openIndex = output.length;
+				output.push(item);
+			}
+			// Placeholder that can later be turned into a newline if necessary
+			output.push("");
+
+			if (DEBUG_MONOSPACE_MERGE) {
+				console.log("> MONOSPACE_OPEN, openIndex: %o, closeIndex: %o, mergeAllowed: %o, mergeWithPrevious: %o", 
+					current.openIndex, current.closeIndex, current.mergeAllowed, current.mergeWithPrevious);
+			}
+			continue;
+		}
+
+		// From here on the item is MONOSPACE_CLOSE
+
+		// Sanity checks, but normally not possible
+		if (current == null) throw new Error('Monospace tag was closed without being open before');
+		if (current.closeIndex != null) throw new Error('Monospace tag is closed for the second time');
+
+		// Placeholder that can later be turned into a newline if necessary
+		output.push("");
+
+		// Remember where the section ends, the marker is later replaced with ` or ```
+		if (!ignoreMonospace) {
+			current.closeIndex = output.length;
+			output.push(item);
+		}
+
+		// A section that is not immediately followed by BLOCK_CLOSE is inline code and cannot be merged
+		if (md[pos + 1] != BLOCK_CLOSE) {
+			current.mergeAllowed = false;
+		}
+
+		if (DEBUG_MONOSPACE_MERGE) {
+			console.log("> \"" + md[pos - 1] + "\"");
+			console.log("> MONOSPACE_CLOSE, openIndex: %o, closeIndex: %o, mergeAllowed: %o, mergeWithPrevious: %o", 
+				current.openIndex, current.closeIndex, current.mergeAllowed, current.mergeWithPrevious);
+		}
+
+		found.push(current);
+
+		// Start over for the next section
+		current = null;
+		separatedByNewlinesOnly = true;
+	}
+
+	return {
+		md: output,
+		monospaceSections: found,
+	};
+}
+
+
+// This function is looping over monospace sections and collapsing what it can merge
+// Walks over the monospace sections and rewrites their markers inside md, in place:
+//   - a run of consecutive sections that are all mergeable becomes one code block: the open
+//     marker of the first section turns into "```" followed by NEWLINE, the close marker of
+//     the last one turns into "```" (preceded by a NEWLINE unless there already is one) and
+//     every marker in between turns into an empty string
+//   - every other section becomes inline code, both of its markers turn into "`"
+//
+// Parameters:
+//   md              - array of md items, modified in place
+//   sections        - section descriptors (openIndex, closeIndex, mergeAllowed,
+//                     mergeWithPrevious); this array is consumed, it is empty on return
+//   ignoreMonospace - kept for interface compatibility; sections without marker positions
+//                     are skipped regardless of this flag
+function mergeMonospaceSections(md, sections, ignoreMonospace = false) {
+
+	const USE_BLOCK_TAG = 1;
+	const USE_INLINE_TAG = 2;
+	const USE_EMPTY_TAG = 3;
+
+	// Replaces the open and close markers of one section according to startTag and endTag
+	const toMonospace = (md, section, startTag, endTag, dbg = "") => {
+		if (DEBUG_MONOSPACE_MERGE) {
+			console.log("> TO_MONOSPACE, openIndex: %o, closeIndex: %o, startTag: %o, endTag: %o, DBG: %o",
+				section.openIndex, section.closeIndex, startTag, endTag, dbg);
+		}
+
+		// A section without marker positions has nothing to replace. Without this guard
+		// "null + 1" evaluates to 1, so md[1] would be overwritten with NEWLINE and stray
+		// "null" / "-1" properties would be added to the array.
+		if (section.openIndex == null || section.closeIndex == null) return;
+
+		switch (startTag) {
+			case USE_BLOCK_TAG:
+				md[section.openIndex] = "```";
+				// The item right after the open marker is the placeholder reserved for this newline
+				md[section.openIndex + 1] = NEWLINE;
+				break;
+			case USE_INLINE_TAG:
+				md[section.openIndex] = "`";
+				break;
+			case USE_EMPTY_TAG:
+				md[section.openIndex] = "";
+				break;
+		}
+		switch (endTag) {
+			case USE_BLOCK_TAG:
+				// We don't add a NEWLINE if there already is a NEWLINE
+				if (md[section.closeIndex - 2] == NEWLINE) {
+					md[section.closeIndex - 1] = "";
+				} else {
+					md[section.closeIndex - 1] = NEWLINE;
+				}
+				md[section.closeIndex] = "```";
+				break;
+			case USE_INLINE_TAG:
+				md[section.closeIndex] = "`";
+				break;
+			case USE_EMPTY_TAG:
+				md[section.closeIndex] = "";
+				break;
+		}
+	};
+
+	// Takes the next section off the queue, undefined when none is left
+	const getSection = () => {
+		return sections.shift();
+	};
+
+	// Returns the next section that is allowed to be merged, or null when none is left.
+	// Sections skipped on the way are converted to inline code. When first is given it is
+	// put back to the front of the queue before searching.
+	const getMergeableSection = (first = null) => {
+		if (first) {
+			sections.unshift(first);
+		}
+		while (sections.length) {
+			// Declared locally; a bare assignment here would create an implicit global
+			const s = sections.shift();
+			if (s.mergeAllowed) {
+				return s;
+			}
+			// If cannot merge then convert onto inline code
+			toMonospace(md, s, USE_INLINE_TAG, USE_INLINE_TAG, "getCollapsibleSection");
+		}
+		return null;
+	};
+
+	let left = getMergeableSection();
+	let right = null;
+
+	while (left) {
+		let isFirst = true;
+
+		right = getSection();
+		while (right && right.mergeAllowed && right.mergeWithPrevious) {
+			// We can merge left and right
+			if (isFirst) {
+				isFirst = false;
+				toMonospace(md, left, USE_BLOCK_TAG, USE_EMPTY_TAG, "First section");
+			} else {
+				toMonospace(md, left, USE_EMPTY_TAG, USE_EMPTY_TAG, "Middle section");
+			}
+			left = right;
+			right = getSection();
+		}
+
+		if (isFirst) {
+			// Could not merge, convert to inline code
+			toMonospace(md, left, USE_INLINE_TAG, USE_INLINE_TAG, "Left inline section");
+		} else {
+			// Was merged, add block end tag
+			toMonospace(md, left, USE_EMPTY_TAG, USE_BLOCK_TAG, "Final section");
+		}
+
+		// right was taken off the queue but not consumed by the merge, so hand it back
+		left = getMergeableSection(right);
+	}
+}
+
+
+// This function will try to merge monospace sections
+// It works in two phases:
+//   1) It will find all monospace sections
+//   2) It will merge all monospace sections where merge is allowed
+// Entry point of the monospace merging. Locates the monospace sections of md, rewrites
+// their markers, and returns a new array from which the empty items have been removed.
+function mergeMonospaceSectionsWrapper(md, ignoreMonospace = false) {
+	const { md: workingMd, monospaceSections } = findMonospaceSections(md, ignoreMonospace);
+
+	mergeMonospaceSections(workingMd, monospaceSections, ignoreMonospace);
+
+	// Empty items have to go, the newline merging that happens outside this function
+	// does not work correctly with them
+	const compacted = workingMd.filter((item) => item != "");
+
+	debugMD("DEBUG: after merging monospace sections", compacted);
+
+	return compacted;
+}
+
+
+function processMdArrayNewLines(md, isTable = false) {
+	// Try to merge MONOSPACE sections; this works well when not parsing a table
+	md = mergeMonospaceSectionsWrapper(md, isTable);

-function processMdArrayNewLines(md) {
 	while (md.length && md[0] == BLOCK_OPEN) {
 		md.shift();
 	}
@ -271,7 +498,11 @@ function attributeToLowerCase(node) {
 	return output;
 }

-function enexXmlToMdArray(stream, resources) {
+function enexXmlToMdArray(stream, resources, importOptions = null) {
+	// TODO: Receive importOptions from upstream
+	if (!importOptions) importOptions = {};
+	if (!('mergeMonospaceSections' in importOptions)) importOptions.mergeMonospaceSections = true;
+
 	let remainingResources = resources.slice();

 	const removeRemainingResource = (id) => {
@ -287,6 +518,7 @@ function enexXmlToMdArray(stream, resources) {
 		let state = {
 			inCode: false,
 			inQuote: false,
+			inMonospaceFont: false,
 			lists: [],
 			anchorAttributes: [],
 		};
@ -502,6 +734,26 @@ function enexXmlToMdArray(stream, resources) {
 				if (resource && !!resource.id) {
 					section.lines = addResourceTag(section.lines, resource, nodeAttributes.alt);
 				}
+			} else if (n == "span" || n == "font") {
+				// Check for monospace font. It can be specified either in
+				// <span style="..."> or in <font face="...">.
+				if (importOptions.mergeMonospaceSections && nodeAttributes) {
+					let style = null;
+
+					if (nodeAttributes.style) {
+						style = nodeAttributes.style.toLowerCase();
+					} else if (nodeAttributes.face) {
+						style = nodeAttributes.face.toLowerCase();
+					}
+				
+					monospace = style.match(/monospace|courier|menlo|monaco/) != null;
+
+					if (monospace) {
+						state.inMonospaceFont = true;
+						section.lines.push(MONOSPACE_OPEN);
+						//console.log("OPEN:  tag: %s, style: ", n, style);
+					}
+				} 
 			} else if (["span", "font", 'sup', 'cite', 'abbr', 'small', 'tt', 'sub', 'colgroup', 'col', 'ins', 'caption', 'var', 'map', 'area'].indexOf(n) >= 0) {
 				// Inline tags that can be ignored in Markdown
 			} else {
@ -522,6 +774,13 @@ function enexXmlToMdArray(stream, resources) {
 				if (section && section.parent) section = section.parent;
 			} else if (n == 'table') {
 				if (section && section.parent) section = section.parent;
+
+			} else if (n == "span" || n == "font") {
+				if (importOptions.mergeMonospaceSections && state.inMonospaceFont) {
+					state.inMonospaceFont = false;
+					section.lines.push(MONOSPACE_CLOSE);
+					//console.log("CLOSE: tag: %s, lines[n-1]: '%s', lines[n]: '%s'", n, section.lines[section.lines.length - 2], section.lines[section.lines.length - 1]);
+				}
 			} else if (isIgnoredEndTag(n)) {
 				// Skip
 			} else if (isListTag(n)) {
@ -662,7 +921,7 @@ function drawTable(table) {

 				const renderCurrentCells = () => {
 					if (!currentCells.length) return;
-					const cellText = processMdArrayNewLines(currentCells);
+					const cellText = processMdArrayNewLines(currentCells, true);
 					line.push(cellText);
 					currentCells = [];
 				}
@ -685,7 +944,7 @@ function drawTable(table) {

 				// A cell in a Markdown table cannot have actual new lines so replace
 				// them with <br>, which are supported by the markdown renderers.
-				let cellText = processMdArrayNewLines(td.lines).replace(/\n+/g, "<br>");
+				let cellText = processMdArrayNewLines(td.lines, true).replace(/\n+/g, "<br>");

 				// Inside tables cells, "|" needs to be escaped
 				cellText = cellText.replace(/\|/g, "\\|");
@ -760,6 +1019,9 @@ async function enexXmlToMd(stream, resources) {
 		firstAttachment = false;
 	}

+	//console.log(mdLines);
+	debugMD("DEBUG: raw MdLines", mdLines);
+
 	return processMdArrayNewLines(mdLines);
 }