1
0
mirror of https://github.com/laurent22/joplin.git synced 2024-12-21 09:38:01 +02:00
joplin/ReactNativeClient/lib/import-enex-md-gen.js

1145 lines
33 KiB
JavaScript
Raw Normal View History

const stringPadding = require('string-padding');
2019-07-29 15:43:53 +02:00
const stringToStream = require('string-to-stream');
const resourceUtils = require('lib/resourceUtils.js');
2017-07-17 20:59:40 +02:00
2019-07-29 15:43:53 +02:00
// Layout markers. While the ENEX content is parsed, these placeholder strings are pushed
// into the array of Markdown fragments; processMdArrayNewLines() later turns them into
// actual newlines and spaces.

// Pushed when a block-level element is opened / closed.
const BLOCK_OPEN = '[[BLOCK_OPEN]]';
const BLOCK_CLOSE = '[[BLOCK_CLOSE]]';
// Explicit line break (pushed for <br> and around <hr>).
const NEWLINE = '[[NEWLINE]]';
// Replaces a BLOCK_CLOSE that is immediately followed by a BLOCK_OPEN.
const NEWLINE_MERGED = '[[MERGED]]';
// Collapsed white space found at the start or end of a text node.
const SPACE = '[[SPACE]]';
2017-06-25 01:19:11 +02:00
// Converts an array of Markdown fragments and layout markers (BLOCK_OPEN, BLOCK_CLOSE,
// NEWLINE, NEWLINE_MERGED, SPACE) into the final Markdown text.
//
// - md: array of text fragments and markers. Note that leading BLOCK_OPEN and trailing
//   BLOCK_CLOSE markers are removed from this array in place.
// - Returns the Markdown string, or '' when the result contains only white space.
function processMdArrayNewLines(md) {
	while (md.length && md[0] == BLOCK_OPEN) {
		md.shift();
	}

	while (md.length && md[md.length - 1] == BLOCK_CLOSE) {
		md.pop();
	}

	// Returns a copy of `items` without the elements for which shouldSkip(last, current) is
	// true. `last` is the element that precedes `current` in `items` (whether or not it was
	// kept), or '' for the first element.
	const skipWhen = (items, shouldSkip) => {
		const kept = [];
		let last = '';
		for (let i = 0; i < items.length; i++) {
			const v = items[i];
			if (!shouldSkip(last, v)) kept.push(v);
			last = v;
		}
		return kept;
	};

	// Pass 1: collapse runs of identical block markers (OPEN OPEN => OPEN, CLOSE CLOSE => CLOSE)
	md = skipWhen(md, (last, v) => isNewLineBlock(last) && isNewLineBlock(v) && last == v);

	// Pass 2: a block closing immediately followed by a block opening becomes a single
	// NEWLINE_MERGED marker.
	let temp = [];
	let last = '';
	for (let i = 0; i < md.length; i++) {
		const v = md[i];
		if (last == BLOCK_CLOSE && v == BLOCK_OPEN) {
			temp.pop();
			temp.push(NEWLINE_MERGED);
		} else {
			temp.push(v);
		}
		last = v;
	}
	md = temp;

	// Pass 3 and 4: an explicit NEWLINE already provides the line break, so drop the
	// block marker that directly follows it.
	md = skipWhen(md, (last, v) => last == NEWLINE && (v == NEWLINE_MERGED || v == BLOCK_CLOSE));
	md = skipWhen(md, (last, v) => last == NEWLINE && (v == NEWLINE_MERGED || v == BLOCK_OPEN));

	if (md.length > 2) {
		if (md[md.length - 2] == NEWLINE_MERGED && md[md.length - 1] == NEWLINE) {
			md.pop();
		}
	}

	let output = '';
	let previous = '';
	let start = true;
	for (let i = 0; i < md.length; i++) {
		const v = md[i];
		let add = '';
		if (v == BLOCK_CLOSE || v == BLOCK_OPEN || v == NEWLINE || v == NEWLINE_MERGED) {
			add = '\n';
		} else if (v == SPACE) {
			// NOTE(review): `previous` holds the text that was last appended (' ' for a SPACE
			// marker), so `previous == SPACE` can never be true and consecutive SPACE markers
			// are not collapsed. Left unchanged since it affects the generated Markdown.
			if (previous == SPACE || previous == '\n' || start) {
				continue; // skip
			} else {
				add = ' ';
			}
		} else {
			add = v;
		}
		start = false;
		output += add;
		previous = add;
	}

	if (!output.trim().length) return '';

	// To simplify the result, we only allow up to one empty line between blocks of text
	const mergeMultipleNewLines = function(lines) {
		const output = [];
		let newlineCount = 0;
		for (let i = 0; i < lines.length; i++) {
			const line = lines[i];
			if (!line.trim()) {
				newlineCount++;
			} else {
				newlineCount = 0;
			}
			if (newlineCount >= 2) continue;
			output.push(line);
		}
		return output;
	};

	// Remove carriage returns. The pattern used to be /\\r/g, which matches a literal
	// backslash followed by "r" (corrupting text such as "C:\repo") and never matched an
	// actual carriage return.
	let lines = output.replace(/\r/g, '').split('\n');
	lines = formatMdLayout(lines);
	lines = mergeMultipleNewLines(lines);
	return lines.join('\n');
}
// While the processMdArrayNewLines() function adds newlines in a way that's technically correct, the resulting Markdown can look messy.
// This is because while a "block" element should be surrounded by newlines, in practice, some should be surrounded by TWO new lines, while
// others by only ONE.
//
// For instance, this:
//
// <li>one</li>
// <li>two</li>
// <li>three</li>
//
// should result in this:
2019-07-29 15:43:53 +02:00
//
// - one
// - two
// - three
//
// While this:
//
// <p>Some long paragraph</p><p>And another one</p><p>And the last paragraph</p>
//
// should result in this:
//
// Some long paragraph
2019-07-29 15:43:53 +02:00
//
// And another one
2019-07-29 15:43:53 +02:00
//
// And the last paragraph
//
// So in one case, one newline between tags, and in another two newlines. In HTML this would be done via CSS, but in Markdown we need
// to add new lines. It's also important to get these newlines right because two blocks of text next to each other might be rendered
// differently than if there's a newline between them. So the function below parses the almost final MD and adds new lines depending
// on various rules.
2019-07-29 15:43:53 +02:00
// Tells whether the line is an ATX heading, ie. one or more "#" at the very start of
// the line, followed by a white space character.
const isHeading = function(line) {
	const headingPrefix = /^#+\s/;
	return line.search(headingPrefix) !== -1;
};
2019-07-29 15:43:53 +02:00
// Tells whether the line, once trimmed, starts with the "- " bullet.
// A falsy `line` is returned as is (callers only rely on the truthiness of the result).
const isListItem = function(line) {
	if (!line) return line;
	return line.trim().startsWith('- ');
};
2019-07-29 15:43:53 +02:00
// Tells whether the line starts with a tab character, which is how the lines of a
// multi-line code block are prefixed by this module.
// A falsy `line` is returned as is (callers only rely on the truthiness of the result).
const isCodeLine = function(line) {
	if (!line) return line;
	return line.startsWith('\t');
};
2019-07-29 15:43:53 +02:00
// Tells whether the line starts with "| ", ie. whether it looks like a Markdown table row.
const isTableLine = function(line) {
	return line.startsWith('| ');
};
2019-07-29 15:43:53 +02:00
// Tells whether the line is a regular paragraph of text, as opposed to a list item, a
// heading, a code line or a table row.
const isPlainParagraph = function(line) {
	// Note: a line shorter than 80 characters is not considered to be a paragraph, which
	// means no newlines will be added before or after it. This is to handle text that has
	// been written with "hard" new lines.
	if (!line || line.length < 80) return false;

	const isSpecialLine = isListItem(line) || isHeading(line) || isCodeLine(line) || isTableLine(line);
	return !isSpecialLine;
};
2019-07-29 15:43:53 +02:00
// Inserts empty lines into the given Markdown lines wherever the kind of content changes.
// Returns a new array; the input array is not modified.
function formatMdLayout(lines) {
	const formatted = [];
	let previous = '';

	for (const line of lines) {
		const needsEmptyLine =
			// At the end of a list of items
			(isListItem(previous) && line && !isListItem(line)) ||
			// At the beginning of a list of items
			(isListItem(line) && previous && !isListItem(previous)) ||
			// Before a heading
			(isHeading(line) && previous) ||
			// After a heading
			(isHeading(previous) && line) ||
			// Before and after a code block
			(isCodeLine(line) && !isCodeLine(previous)) ||
			(!isCodeLine(line) && isCodeLine(previous)) ||
			// Before and after a table
			(isTableLine(line) && !isTableLine(previous)) ||
			(!isTableLine(line) && isTableLine(previous)) ||
			// At the beginning of a paragraph
			(isPlainParagraph(line) && previous) ||
			// At the end of a paragraph
			(isPlainParagraph(previous) && line);

		if (needsEmptyLine) formatted.push('');
		formatted.push(line);
		previous = line;
	}

	return formatted;
}
2017-06-25 01:19:11 +02:00
// Tells whether `c` is one of the ASCII white space characters: line feed, carriage
// return, vertical tab, form feed, tab or space. Non-breaking spaces are not matched.
function isWhiteSpace(c) {
	const whiteSpaceChars = ['\n', '\r', '\v', '\f', '\t', ' '];
	// Loose equality is used on purpose, to keep the original comparison semantics
	return whiteSpaceChars.some(ws => c == ws);
}
// Like QString::simplified(), except that it preserves non-breaking spaces (which
// Evernote uses for indentation, etc.)
//
// Every run of white space characters (as defined by isWhiteSpace()) is reduced to its
// first character, then leading and trailing white space is removed.
function simplifyString(s) {
	const kept = [];
	let previousWhite = false;
	for (let i = 0; i < s.length; i++) {
		const white = isWhiteSpace(s[i]);
		if (!(previousWhite && white)) kept.push(s[i]);
		previousWhite = white;
	}

	let begin = 0;
	let end = kept.length;
	while (begin < end && isWhiteSpace(kept[begin])) begin++;
	while (end > begin && isWhiteSpace(kept[end - 1])) end--;

	return kept.slice(begin, end).join('');
}
// Appends `text` to `lines`, after having normalised its white space, and returns `lines`.
//
// - lines: array of Markdown fragments, modified in place.
// - state: parser state. Only `state.inCode` (array) and `state.inQuote` are read here.
// - text: the text to append.
function collapseWhiteSpaceAndAppend(lines, state, text) {
	// Inside a code element the text must be preserved as is
	if (state.inCode.length) {
		lines.push(text);
		return lines;
	}

	// Remove all \n and \r from the left and right of the text
	const isLineBreak = c => c == '\n' || c == '\r';
	let begin = 0;
	let end = text.length;
	while (begin < end && isLineBreak(text[begin])) begin++;
	while (end > begin && isLineBreak(text[end - 1])) end--;
	text = text.substring(begin, end);

	// Collapse all white spaces to just one. Spaces to the left and right of the string are
	// collapsed too, and represented by a SPACE marker.
	const hasSpaceLeft = text.length > 0 && text[0] == ' ';
	const hasSpaceRight = text.length > 0 && text[text.length - 1] == ' ';
	text = simplifyString(text);

	if (text == '' && !hasSpaceLeft && !hasSpaceRight) return lines;

	if (state.inQuote) {
		// Add a ">" at the beginning of the block then at the beginning of each line. So it turns this:
		// "my quote\nsecond line" into this => "> my quote\n> second line"
		lines.push('> ');
		// NOTE(review): this looks for a '\r' element in the `lines` array, not for a carriage
		// return inside `text` - possibly not what was intended. Behaviour kept as is.
		const useCarriageReturns = lines.indexOf('\r') >= 0;
		text = useCarriageReturns ? text.replace(/\n\r/g, '\n\r> ') : text.replace(/\n/g, '\n> ');
	}

	if (hasSpaceLeft) lines.push(SPACE);
	lines.push(text);
	if (hasSpaceRight) lines.push(SPACE);

	return lines;
}
// Makes the value of an HTML attribute safe to be used as the text of a Markdown link or
// image. Returns '' when the attribute is empty or missing.
function tagAttributeToMdText(attr) {
	if (!attr) return '';

	return attr
		// HTML attributes may contain newlines so replace them with a space.
		// https://github.com/laurent22/joplin/issues/1583
		.replace(/[\n\r]+/g, ' ')
		// A closing bracket would end the link text prematurely
		.replace(/\]/g, '\\]');
}
2019-07-29 15:43:53 +02:00
// Appends to `lines` the Markdown tag that references the given resource: an image tag
// when the resource MIME type is an image, a regular link otherwise. Returns `lines`.
//
// The text of the tag is `alt` or, if empty, the resource title, or else its filename.
function addResourceTag(lines, resource, alt = '') {
	// Note: refactor to use Resource.markdownTag
	const label = tagAttributeToMdText(alt || resource.title || resource.filename || '');
	const opening = resourceUtils.isImageMimeType(resource.mime) ? '![' : '[';

	lines.push(opening, label, `](:/${resource.id})`);
	return lines;
}
// Tells whether `n` (lowercase tag name) is a block-level element that is simply
// surrounded by new lines in the generated Markdown.
function isBlockTag(n) {
	switch (n) {
	case 'div':
	case 'p':
	case 'dl':
	case 'dd':
	case 'dt':
	case 'center':
	case 'address':
		return true;
	default:
		return false;
	}
}
// Tells whether `n` (lowercase tag name) is rendered as bold text.
function isStrongTag(n) {
	return ['strong', 'b', 'big'].some(tag => n == tag);
}
// Tells whether `n` (lowercase tag name) marks struck-through text.
function isStrikeTag(n) {
	return ['strike', 's', 'del'].some(tag => n == tag);
}
// Tells whether `n` (lowercase tag name) is rendered as emphasised text. Note that
// underlined text (<u>) is handled as emphasis too.
function isEmTag(n) {
	return ['em', 'i', 'u'].some(tag => n == tag);
}
// Tells whether `n` (lowercase tag name) is an anchor, ie. a link.
function isAnchor(n) {
	const anchorTag = 'a';
	return anchorTag == n;
}
// Tells whether nothing needs to be done when the closing tag `n` (lowercase tag name)
// is found.
function isIgnoredEndTag(n) {
	const ignoredTags = new Set([
		'en-note', 'en-todo', 'body', 'html', 'font', 'br', 'hr', 'tbody',
		'sup', 'img', 'abbr', 'cite', 'thead', 'small', 'tt', 'sub',
		'colgroup', 'col', 'ins', 'caption', 'var', 'map', 'area',
	]);
	return ignoredTags.has(n);
}
// Tells whether `n` (lowercase tag name) is a list container: ordered or unordered.
function isListTag(n) {
	return ['ol', 'ul'].some(tag => n == tag);
}
// Elements that don't require any special treatment beside adding a newline character
// when their closing tag is found. `n` is the lowercase tag name.
function isNewLineOnlyEndTag(n) {
	const blockTags = ['div', 'p', 'li', 'dl', 'dd', 'dt', 'center', 'address'];
	const headingTags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'];
	return blockTags.includes(n) || headingTags.includes(n);
}
// Tells whether `n` (lowercase tag name) is rendered as inline code.
function isInlineCodeTag(n) {
	switch (n) {
	case 'samp':
	case 'kbd':
		return true;
	default:
		return false;
	}
}
2017-06-25 01:19:11 +02:00
// Tells whether `s` is one of the two block markers: BLOCK_OPEN or BLOCK_CLOSE.
function isNewLineBlock(s) {
	return [BLOCK_OPEN, BLOCK_CLOSE].some(marker => s == marker);
}
// Returns a copy of the attributes of the given node, with all the attribute names
// converted to lowercase. Returns an empty object when the node has no attributes.
function attributeToLowerCase(node) {
	if (!node.attributes) return {};

	const output = {};
	// Object.keys() only lists own enumerable properties. It is used instead of calling
	// node.attributes.hasOwnProperty(), which fails when the object has no prototype or
	// when an attribute happens to be named "hasOwnProperty".
	for (const name of Object.keys(node.attributes)) {
		output[name.toLowerCase()] = node.attributes[name];
	}
	return output;
}
// Tells whether the given attributes include a "style" attribute.
// Returns undefined (not false) when `attributes` is null or undefined.
function isSpanWithStyle(attributes) {
	if (attributes == undefined) return undefined;
	return 'style' in attributes;
}
// Tells whether the "style" attribute of a span makes its text bold.
//
// - attributes: attributes of the span. `attributes.style` must be a string.
// - Returns true if the style contains a "font-weight: bold" declaration, or a
//   font-family that includes a ",Bold" variant.
function isSpanStyleBold(attributes) {
	const style = attributes.style;

	// Like in isSpanStyleItalic(), white space and letter case are ignored so that
	// "font-weight:bold", "FONT-WEIGHT: BOLD;", etc. are all detected. The declaration may
	// also be the last one of the style, in which case it doesn't end with a semicolon.
	const normalized = style.replace(/\s+/g, '').toLowerCase();
	if (/font-weight:bold(?:;|$)/.test(normalized)) return true;

	return style.search(/font-family:.*,Bold.*;/) != -1;
}
// Tells whether the "style" attribute of a span makes its text italic.
//
// - attributes: attributes of the span. `attributes.style` must be a string.
// - Returns true if the style contains a "font-style: italic" declaration. White space
//   and letter case are ignored.
function isSpanStyleItalic(attributes) {
	const normalized = attributes.style.replace(/\s+/g, '').toLowerCase();
	// The semicolon is optional when the declaration is the last one of the style
	return /font-style:italic(?:;|$)/.test(normalized);
}
// Parses the ENML content of a note and converts it to a tree of Markdown fragments.
//
// - stream: readable stream of the note content. It is piped into a non-strict sax parser.
// - resources: array of the resources attached to the note. It is not modified.
// - Returns a promise that resolves, when the parser emits "end", to an object with:
//   - content: the root section `{ type: 'text', lines: [...], parent: null }`. `lines`
//     contains strings (text and layout markers such as BLOCK_OPEN) and nested sections
//     (of type "table", "tr", "td" or "code").
//   - resources: the resources that have not been referenced by any <en-media> tag.
//
// Parsing errors are logged with console.warn() and do not reject the promise.
function enexXmlToMdArray(stream, resources) {
	const remainingResources = resources.slice();

	// Iterates backward so that removing an element doesn't make the loop skip the next one
	const removeRemainingResource = id => {
		for (let i = remainingResources.length - 1; i >= 0; i--) {
			if (remainingResources[i].id === id) remainingResources.splice(i, 1);
		}
	};

	return new Promise((resolve) => {
		const state = {
			inCode: [],
			inPre: false,
			inQuote: false,
			lists: [],
			anchorAttributes: [],
			spanAttributes: [],
		};

		const options = {};
		const strict = false;
		const saxStream = require('sax').createStream(strict, options);

		// The section that currently receives the Markdown fragments
		let section = {
			type: 'text',
			lines: [],
			parent: null,
		};

		saxStream.on('error', function(e) {
			console.warn(e);
		});

		// Joins consecutive non-empty lines with a space. Empty lines become a newline.
		const unwrapInnerText = text => {
			const lines = text.split('\n');
			let output = '';
			for (let i = 0; i < lines.length; i++) {
				const line = lines[i];
				const nextLine = i < lines.length - 1 ? lines[i + 1] : '';
				if (!line) {
					output += '\n';
					continue;
				}
				if (nextLine) {
					output += `${line} `;
				} else {
					output += line;
				}
			}
			return output;
		};

		saxStream.on('text', function(text) {
			// Text directly inside a table or a row (ie. outside of a cell) is ignored
			if (['table', 'tr', 'tbody'].indexOf(section.type) >= 0) return;

			text = !state.inPre ? unwrapInnerText(text) : text;
			section.lines = collapseWhiteSpaceAndAppend(section.lines, state, text);
		});

		saxStream.on('opentag', function(node) {
			const nodeAttributes = attributeToLowerCase(node);
			const n = node.name.toLowerCase();

			const currentList = state.lists && state.lists.length ? state.lists[state.lists.length - 1] : null;

			// Kind of a hack: If we are inside a list, at the beginning of an item (when a "- " or "1. " has been added
			// but no other text yet), if the current tag is eg. a <div> or any other block tag, we skip it, so that a new line
			// does not get created. It is to handle list4.html test case.
			// https://github.com/laurent22/joplin/issues/832
			if (currentList) {
				if (!currentList.startedText && isBlockTag(n)) return;
				currentList.startedText = true;
			}

			if (n == 'en-note') {
				// Start of note
			} else if (isBlockTag(n)) {
				section.lines.push(BLOCK_OPEN);
			} else if (n == 'table') {
				const newSection = {
					type: 'table',
					lines: [],
					parent: section,
				};
				section.lines.push(newSection);
				section = newSection;
			} else if (n == 'tbody' || n == 'thead') {
				// Ignore it
			} else if (n == 'tr') {
				if (section.type != 'table') {
					console.warn('Found a <tr> tag outside of a table');
					return;
				}

				const newSection = {
					type: 'tr',
					lines: [],
					parent: section,
					isHeader: false,
				};

				section.lines.push(newSection);
				section = newSection;
			} else if (n == 'td' || n == 'th') {
				if (section.type != 'tr') {
					console.warn('Found a <td> tag outside of a <tr>');
					return;
				}

				if (n == 'th') section.isHeader = true;

				const newSection = {
					type: 'td',
					lines: [],
					parent: section,
				};

				section.lines.push(newSection);
				section = newSection;
			} else if (isListTag(n)) {
				section.lines.push(BLOCK_OPEN);
				state.lists.push({ tag: n, counter: 1, startedText: false });
			} else if (n == 'li') {
				section.lines.push(BLOCK_OPEN);
				if (!state.lists.length) {
					console.warn('Found <li> tag without being inside a list');
					return;
				}

				const container = state.lists[state.lists.length - 1];
				container.startedText = false;

				// Four spaces per nesting level: a nested item must be indented at least up to
				// the content of its parent item ("- " or "1. ") to be rendered as a sub-list.
				const indent = '    '.repeat(state.lists.length - 1);

				if (container.tag == 'ul') {
					section.lines.push(`${indent}- `);
				} else {
					section.lines.push(`${indent + container.counter}. `);
					container.counter++;
				}
			} else if (isStrongTag(n)) {
				section.lines.push('**');
			} else if (isStrikeTag(n)) {
				section.lines.push('(');
			} else if (isInlineCodeTag(n)) {
				section.lines.push('`');
			} else if (n == 'q') {
				section.lines.push('"');
			} else if (n == 'img') {
				if (nodeAttributes.src) {
					// Many (most?) img tags don't have any source associated, especially when they were imported from HTML
					let s = '![';
					if (nodeAttributes.alt) s += tagAttributeToMdText(nodeAttributes.alt);
					s += `](${nodeAttributes.src})`;
					section.lines.push(s);
				}
			} else if (isAnchor(n)) {
				state.anchorAttributes.push(nodeAttributes);
				// Need to add the '[' via this function to make sure that links within code blocks
				// are handled correctly.
				collapseWhiteSpaceAndAppend(section.lines, state, '[');
			} else if (isEmTag(n)) {
				section.lines.push('*');
			} else if (n == 'en-todo') {
				const x = nodeAttributes && nodeAttributes.checked && nodeAttributes.checked.toLowerCase() == 'true' ? 'X' : ' ';
				section.lines.push(`- [${x}] `);
			} else if (n == 'hr') {
				// Needs to be surrounded by new lines so that it's properly rendered as a line when converting to HTML
				section.lines.push(NEWLINE);
				section.lines.push('* * *');
				section.lines.push(NEWLINE);
				section.lines.push(NEWLINE);
			} else if (/^h[1-6]$/.test(n)) {
				// <h1> => "# ", <h2> => "## ", etc.
				section.lines.push(BLOCK_OPEN);
				section.lines.push(`${'#'.repeat(Number(n.charAt(1)))} `);
			} else if (n == 'blockquote') {
				section.lines.push(BLOCK_OPEN);
				state.inQuote = true;
			} else if (n === 'code') {
				state.inCode.push(true);
				const newSection = {
					type: 'code',
					lines: [],
					parent: section,
				};
				section.lines.push(newSection);
				section = newSection;
			} else if (n === 'pre') {
				section.lines.push(BLOCK_OPEN);
				state.inPre = true;
			} else if (n == 'br') {
				section.lines.push(NEWLINE);
			} else if (n == 'en-media') {
				const hash = nodeAttributes.hash;

				let resource = null;
				for (let i = 0; i < resources.length; i++) {
					const r = resources[i];
					if (r.id == hash) {
						resource = r;
						removeRemainingResource(r.id);
						break;
					}
				}

				if (!resource) {
					// This is a bit of a hack. Notes sometime have resources attached to it, but those <resource> tags don't contain
					// an "objID" tag, making it impossible to reference the resource. However, in this case the content of the note
					// will contain a corresponding <en-media/> tag, which has the ID in the "hash" attribute. All this information
					// has been collected above so we now set the resource ID to the hash attribute of the en-media tags. Here's an
					// example of note that shows this problem:

					//	<?xml version="1.0" encoding="UTF-8"?>
					//	<!DOCTYPE en-export SYSTEM "http://xml.evernote.com/pub/evernote-export2.dtd">
					//	<en-export export-date="20161221T203133Z" application="Evernote/Windows" version="6.x">
					//		<note>
					//			<title>Commande</title>
					//			<content>
					//				<![CDATA[
					//					<?xml version="1.0" encoding="UTF-8"?>
					//					<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd">
					//					<en-note>
					//						<en-media alt="your QR code" hash="216a16a1bbe007fba4ccf60b118b4ccc" type="image/png"></en-media>
					//					</en-note>
					//				]]>
					//			</content>
					//			<created>20160921T203424Z</created>
					//			<updated>20160921T203438Z</updated>
					//			<note-attributes>
					//				<reminder-order>20160902T140445Z</reminder-order>
					//				<reminder-done-time>20160924T101120Z</reminder-done-time>
					//			</note-attributes>
					//			<resource>
					//				<data encoding="base64">........</data>
					//				<mime>image/png</mime>
					//				<width>150</width>
					//				<height>150</height>
					//			</resource>
					//		</note>
					//	</en-export>

					// Note that there's also the case of resources with no ID where the ID is actually the MD5 of the content.
					// This is handled in import-enex.js

					let found = false;
					for (let i = 0; i < remainingResources.length; i++) {
						const r = remainingResources[i];
						if (!r.id) {
							resource = Object.assign({}, r);
							resource.id = hash;
							remainingResources.splice(i, 1);
							found = true;
							break;
						}
					}

					if (!found) {
						console.warn(`Hash with no associated resource: ${hash}`);
					}
				}

				// If the resource does not appear among the note's resources, it
				// means it's an attachment. It will be appended along with the
				// other remaining resources at the bottom of the markdown text.
				if (resource && !!resource.id) {
					section.lines = addResourceTag(section.lines, resource, nodeAttributes.alt);
				}
			} else if (n == 'span') {
				// The attributes are pushed for every span, styled or not, because the closing
				// tag handler pops one entry per </span>. Pushing only the styled ones would make
				// a nested plain span consume (and close) the style of its parent.
				state.spanAttributes.push(nodeAttributes);
				if (isSpanWithStyle(nodeAttributes)) {
					if (isSpanStyleBold(nodeAttributes)) {
						section.lines.push('**');
					}
					if (isSpanStyleItalic(nodeAttributes)) {
						section.lines.push('*');
					}
				}
			} else if (['font', 'sup', 'cite', 'abbr', 'small', 'tt', 'sub', 'colgroup', 'col', 'ins', 'caption', 'var', 'map', 'area'].indexOf(n) >= 0) {
				// Inline tags that can be ignored in Markdown
			} else {
				console.warn(`Unsupported start tag: ${n}`);
			}
		});

		saxStream.on('closetag', function(n) {
			n = n ? n.toLowerCase() : n;

			if (n == 'en-note') {
				// End of note
			} else if (isNewLineOnlyEndTag(n)) {
				section.lines.push(BLOCK_CLOSE);
			} else if (n == 'td' || n == 'th' || n == 'tr' || n == 'table') {
				// Go back to the parent section
				if (section && section.parent) section = section.parent;
			} else if (isIgnoredEndTag(n)) {
				// Skip
			} else if (isListTag(n)) {
				section.lines.push(BLOCK_CLOSE);
				state.lists.pop();
			} else if (isStrongTag(n)) {
				section.lines.push('**');
			} else if (isStrikeTag(n)) {
				section.lines.push(')');
			} else if (isInlineCodeTag(n)) {
				section.lines.push('`');
			} else if (isEmTag(n)) {
				section.lines.push('*');
			} else if (n == 'q') {
				section.lines.push('"');
			} else if (n == 'blockquote') {
				// NOTE(review): this pushes BLOCK_OPEN while the other closing tags push
				// BLOCK_CLOSE - confirm whether it is intended. Behaviour kept as is.
				section.lines.push(BLOCK_OPEN);
				state.inQuote = false;
			} else if (n === 'code') {
				state.inCode.pop();

				if (!state.inCode.length) {
					// Multi-line code is rendered as a tab-indented block, and single-line code
					// as inline code.
					const codeLines = section.lines.join('').split('\n');
					section.lines = [];
					if (codeLines.length > 1) {
						for (let i = 0; i < codeLines.length; i++) {
							if (i > 0) section.lines.push('\n');
							section.lines.push(`\t${codeLines[i]}`);
						}
					} else {
						section.lines.push(`\`${codeLines.join('')}\``);
					}

					if (section && section.parent) section = section.parent;
				}
			} else if (n === 'pre') {
				state.inPre = false;
				section.lines.push(BLOCK_CLOSE);
			} else if (isAnchor(n)) {
				const attributes = state.anchorAttributes.pop();
				const url = attributes && attributes.href ? attributes.href : '';

				if (section.lines.length < 1) throw new Error('Invalid anchor tag closing'); // Sanity check, but normally not possible

				// When closing the anchor tag, check if there's is any text content. If not
				// put the URL as is (don't wrap it in [](url)). The markdown parser, using
				// GitHub flavour, will turn this URL into a link. This is to generate slightly
				// cleaner markdown.

				// Need to loop on the previous tags so as to skip the special ones, which are not relevant for the below algorithm.
				let previous = null;
				for (let i = section.lines.length - 1; i >= 0; i--) {
					previous = section.lines[i];
					if ([BLOCK_OPEN, BLOCK_CLOSE, NEWLINE, NEWLINE_MERGED, SPACE].indexOf(previous) >= 0 || !previous) {
						continue;
					} else {
						break;
					}
				}

				if (previous == '[') {
					// We have a link that had some content but, after parsing, nothing is left. The content was most likely
					// something that shows up via CSS and which we cannot support. For example:
					//
					// <a onclick="return vote()" href="vote?id=17045576">
					//    <div class="votearrow" title="upvote"></div>
					// </a>
					//
					// In the case above the arrow is displayed via CSS.
					// It is useless to display the full URL since often it is not relevant for a note (for example
					// it's interactive bits) and it's not user-generated content such as a URL that would appear in a comment.
					// So in this case, we still want to preserve the information but display it in a discreet way as a simple [L].

					// Need to pop everything inside the current [] because it can only be special chars that we don't want (they would create unnecessary newlines)
					for (let i = section.lines.length - 1; i >= 0; i--) {
						if (section.lines[i] !== '[') {
							section.lines.pop();
						} else {
							break;
						}
					}

					if (!url) {
						// If there's no URL and no content, pop the [ and don't save any content.
						section.lines.pop();
					} else {
						section.lines.push('(L)');
						section.lines.push(`](${url})`);
					}
				} else if (!previous || previous == url) {
					section.lines.pop();
					section.lines.pop();
					section.lines.push(url);
				} else {
					// Need to remove any new line character between the current ']' and the previous '['
					// otherwise it won't render properly.
					let allSpaces = true;
					for (let i = section.lines.length - 1; i >= 0; i--) {
						const c = section.lines[i];
						if (c === '[') {
							break;
						} else {
							if (c === BLOCK_CLOSE || c === BLOCK_OPEN || c === NEWLINE) {
								section.lines[i] = SPACE;
							} else {
								if (!isWhiteSpace(c)) allSpaces = false;
							}
						}
					}

					if (allSpaces) {
						for (let i = section.lines.length - 1; i >= 0; i--) {
							const c = section.lines.pop();
							if (c === '[') break;
						}
						section.lines.push(url);
					} else {
						// Eg. converts:
						//     [ Sign in ](https://example.com)
						// to:
						//     [Sign in](https://example.com)
						const trimTextStartAndEndSpaces = function(lines) {
							const isBlank = l => l === SPACE || l === ' ' || !l;

							let firstBracketIndex = 0;
							let foundFirstNonWhite = false;
							for (let i = lines.length - 1; i >= 0; i--) {
								const l = lines[i];
								if (!foundFirstNonWhite && isBlank(l)) {
									lines.pop();
								} else {
									foundFirstNonWhite = true;
								}

								if (l === '[') {
									firstBracketIndex = i;
									break;
								}
							}

							// Remove the blank items that directly follow the bracket. The index is
							// not incremented after a removal, since the next item then takes the
							// place of the removed one (incrementing it would skip that item and could
							// remove a space located inside the link text).
							const firstTextIndex = firstBracketIndex + 1;
							while (firstTextIndex < lines.length && isBlank(lines[firstTextIndex])) {
								lines.splice(firstTextIndex, 1);
							}

							return lines;
						};

						section.lines = trimTextStartAndEndSpaces(section.lines);

						section.lines.push(`](${url})`);
					}
				}
			} else if (n == 'en-media') {
				// Skip
			} else if (n == 'span') {
				const attributes = state.spanAttributes.pop();
				if (isSpanWithStyle(attributes)) {
					if (isSpanStyleBold(attributes)) {
						section.lines.push('**');
					}
					if (isSpanStyleItalic(attributes)) {
						section.lines.push('*');
					}
				}
			} else {
				console.warn(`Unsupported end tag: ${n}`);
			}
		});

		saxStream.on('attribute', function() {});

		saxStream.on('end', function() {
			resolve({
				content: section,
				resources: remainingResources,
			});
		});

		stream.pipe(saxStream);
	});
}
// Tells whether any cell of the given table section contains a nested section (an
// object), such as another table.
//
// - table: section whose `lines` are the rows, each row having the cells in its `lines`.
// - Rows and cells that are not sections (eg. a layout marker string that ended up
//   directly inside the table or a row) are ignored.
function tableHasSubTables(table) {
	for (let trIndex = 0; trIndex < table.lines.length; trIndex++) {
		const tr = table.lines[trIndex];
		if (!tr || !tr.lines) continue;

		for (let tdIndex = 0; tdIndex < tr.lines.length; tdIndex++) {
			const td = tr.lines[tdIndex];
			// Same guard as for the rows: accessing `td.lines.length` would otherwise throw
			if (!td || !td.lines) continue;

			for (let i = 0; i < td.lines.length; i++) {
				if (typeof td.lines[i] === 'object') return true;
			}
		}
	}
	return false;
}
2017-07-13 21:29:10 +02:00
// Markdown tables don't support tables within tables, which is common in notes that are complete web pages, for example when imported
// via Web Clipper. So to handle this, we render all the outer tables as regular text (as if replacing all the <table>, <tr> and <td>
// elements by <div>) and only the inner ones, those that don't contain any other tables, are rendered as actual tables. This is generally
// the required behaviour since the outer tables are usually for layout and the inner ones are the content.
// Renders a table section as an array of Markdown fragments and layout markers.
//
// - table: section whose `lines` are the rows ("tr" sections), each row having its cells
//   ("td" sections) in its `lines`. Rows and cells that are not sections (eg. a layout
//   marker string that ended up directly inside the table or a row) are ignored.
// - Returns an array of strings, which starts with BLOCK_OPEN and ends with BLOCK_CLOSE.
function drawTable(table) {
	// | First Header  | Second Header |
	// | ------------- | ------------- |
	// | Content Cell  | Content Cell  |
	// | Content Cell  | Content Cell  |

	// There must be at least 3 dashes separating each header cell.
	// https://gist.github.com/IanWang/28965e13cdafdef4e11dc91f578d160d#tables

	const flatRender = tableHasSubTables(table); // Render the table as regular text
	let lines = [];
	lines.push(BLOCK_OPEN);
	let headerDone = false;

	for (let trIndex = 0; trIndex < table.lines.length; trIndex++) {
		const tr = table.lines[trIndex];
		// Same guard as in tableHasSubTables(): skip what is not a row section
		if (!tr || !tr.lines) continue;

		const isHeader = tr.isHeader;
		const line = [];
		const headerLine = [];
		let emptyHeader = null;

		for (let tdIndex = 0; tdIndex < tr.lines.length; tdIndex++) {
			const td = tr.lines[tdIndex];
			if (!td || !td.lines) continue;

			if (flatRender) {
				line.push(BLOCK_OPEN);

				let currentCells = [];

				const renderCurrentCells = () => {
					if (!currentCells.length) return;

					const cellText = processMdArrayNewLines(currentCells);
					line.push(cellText);
					currentCells = [];
				};

				// In here, recursively render the tables
				for (let i = 0; i < td.lines.length; i++) {
					const c = td.lines[i];
					if (typeof c === 'object' && ['table', 'td', 'tr', 'th'].indexOf(c.type) >= 0) {
						// This is a table
						renderCurrentCells();
						currentCells = currentCells.concat(drawTable(c));
					} else {
						// This is plain text
						currentCells.push(c);
					}
				}

				renderCurrentCells();

				line.push(BLOCK_CLOSE);
			} else {
				// Regular table rendering

				// A cell in a Markdown table cannot have actual new lines so replace
				// them with <br>, which are supported by the markdown renderers.
				let cellText = processMdArrayNewLines(td.lines);
				let cellLines = cellText.split('\n');
				cellLines = postProcessMarkdown(cellLines);
				cellText = cellLines.join('\n').replace(/\n+/g, '<br>');

				// Inside tables cells, "|" needs to be escaped
				cellText = cellText.replace(/\|/g, '\\|');

				// Previously the width of the cell was as big as the content since it looks nicer, however that often doesn't work
				// since the content can be very long, resulting in unreadable markdown. So no solution is perfect but making it a
				// width of 3 is a bit better. Note that 3 is the minimum width of a cell - below this, it won't be rendered by
				// markdown parsers.
				const width = 3;
				line.push(stringPadding(cellText, width, ' ', stringPadding.RIGHT));

				if (!headerDone) {
					if (!isHeader) {
						// The first row is not a header, so an empty header row will be added
						if (!emptyHeader) emptyHeader = [];
						const h = stringPadding(' ', width, ' ', stringPadding.RIGHT);
						emptyHeader.push(h);
					}
					headerLine.push('-'.repeat(width));
				}
			}
		}

		if (flatRender) {
			headerDone = true;
			lines.push(BLOCK_OPEN);
			lines = lines.concat(line);
			lines.push(BLOCK_CLOSE);
		} else {
			if (emptyHeader) {
				lines.push(`| ${emptyHeader.join(' | ')} |`);
				lines.push(`| ${headerLine.join(' | ')} |`);
				headerDone = true;
			}

			lines.push(`| ${line.join(' | ')} |`);

			if (!headerDone) {
				lines.push(`| ${headerLine.join(' | ')} |`);
				headerDone = true;
			}
		}
	}

	lines.push(BLOCK_CLOSE);

	// For a regular table, a NEWLINE marker is inserted between each line
	return flatRender ? lines : lines.join(`<<<<:D>>>>${NEWLINE}<<<<:D>>>>`).split('<<<<:D>>>>');
}
/**
 * Tidies up an array of Markdown lines: blank lines at the start and end are
 * dropped, list markers are normalised to "- " and trailing whitespace is
 * removed from each line.
 *
 * The input array is left untouched; a new array is returned.
 *
 * @param {string[]} lines Markdown lines.
 * @returns {string[]} The cleaned-up lines.
 */
function postProcessMarkdown(lines) {
	// After importing HTML, the resulting Markdown often has empty lines at the beginning and end due to
	// block start/end or elements that were ignored, etc. If these white spaces were intended it's not really
	// possible to detect it, so simply trim them all so that the result is more deterministic and can be
	// easily unit tested.
	const trimEmptyLines = (input) => {
		let start = 0;
		let end = input.length;
		while (start < end && !input[start].trim()) start++;
		while (end > start && !input[end - 1].trim()) end--;
		// slice() rather than splice()/pop() so that the caller's array is not modified
		return input.slice(start, end);
	};

	const cleanUpSpaces = (input) => {
		return input.map((line) => {
			if (!line.length) return line;

			// eg. "    -     Some list item" => "    - Some list item"
			// Note that spaces before the "-" are preserved
			const normalised = line.replace(/^(\s+|)-\s+/, '$1- ');

			// eg "Some text     " => "Some text"
			return normalised.replace(/^(.*?)\s+$/, '$1');
		});
	};

	return cleanUpSpaces(trimEmptyLines(lines));
}
/**
 * Converts the XML content of an ENEX note to Markdown.
 *
 * The XML is parsed by `enexXmlToMdArray`, tables found in the result are
 * rendered with `drawTable`, a tag is appended for each resource of the parsed
 * result (via `addResourceTag`), and the whole array is finally turned into a
 * string and cleaned up.
 *
 * @param {string} xmlString XML to convert.
 * @param {Array} resources Passed through to `enexXmlToMdArray`.
 * @param {Object} [options={}] Passed through to `enexXmlToMdArray`.
 * @returns {Promise<string>} The resulting Markdown.
 */
async function enexXmlToMd(xmlString, resources, options = {}) {
	const stream = stringToStream(xmlString);
	const result = await enexXmlToMdArray(stream, resources, options);

	let mdLines = [];

	for (let i = 0; i < result.content.lines.length; i++) {
		const line = result.content.lines[i];

		if (typeof line !== 'object') {
			// an actual line
			mdLines.push(line);
		} else if (line.type === 'table') {
			// A table
			mdLines = mdLines.concat(drawTable(line));
		} else {
			// Code blocks, and any other object, contribute their lines as is.
			if (line.type !== 'code') console.warn('Unhandled object type:', line);
			mdLines = mdLines.concat(line.lines);
		}
	}

	for (let i = 0; i < result.resources.length; i++) {
		const r = result.resources[i];
		// One extra NEWLINE before the first resource tag only
		if (i === 0) mdLines.push(NEWLINE);
		mdLines.push(NEWLINE);
		mdLines = addResourceTag(mdLines, r, r.filename);
	}

	const output = postProcessMarkdown(processMdArrayNewLines(mdLines).split('\n'));
	return output.join('\n');
}
2019-07-29 15:43:53 +02:00
// Names made available to other modules.
module.exports = { enexXmlToMd, processMdArrayNewLines, NEWLINE, addResourceTag };