import markdownUtils from './markdownUtils';
import { ResourceEntity } from './services/database/types';
import { htmlentities } from '@joplin/utils/html';
const stringPadding = require('string-padding');
const stringToStream = require('string-to-stream');
const resourceUtils = require('./resourceUtils.js');
const cssParser = require('css');

// Layout markers that are pushed into the intermediate Markdown array and resolved
// to actual new lines / spaces by processMdArrayNewLines().
const BLOCK_OPEN = '[[BLOCK_OPEN]]';
const BLOCK_CLOSE = '[[BLOCK_CLOSE]]';
const NEWLINE = '[[NEWLINE]]';
const NEWLINE_MERGED = '[[MERGED]]';
const SPACE = '[[SPACE]]';

enum SectionType {
	Text = 'text',
	Tr = 'tr',
	Td = 'td',
	Table = 'table',
	Caption = 'caption',
	Hidden = 'hidden',
	Code = 'code',
}

interface Section {
	type: SectionType;
	parent: Section;
	lines: any[];
	isHeader?: boolean;
}

interface ParserStateTag {
	name: string;
	visible: boolean;
	isCodeBlock: boolean;
	isHighlight: boolean;
}

enum ListTag {
	Ul = 'ul',
	Ol = 'ol',
	CheckboxList = 'checkboxList',
	TaskList = 'taskList',
}

interface ParserStateList {
	tag: ListTag;
	counter: number;
	startedText: boolean;
}

interface ParserState {
	inCode: boolean[];
	inPre: boolean;
	inQuote: boolean;
	lists: ParserStateList[];
	anchorAttributes: any[];
	spanAttributes: string[];
	tags: ParserStateTag[];
	currentCode?: string;
}

interface ExtractedTask {
	title: string;
	completed: boolean;
	groupId: string;
}

interface EnexXmlToMdArrayResult {
	content: Section;
	resources: ResourceEntity[];
}

/**
 * Resolves an array of Markdown fragments and layout markers (BLOCK_OPEN, BLOCK_CLOSE,
 * NEWLINE, NEWLINE_MERGED, SPACE) into the final Markdown text.
 *
 * The markers are first simplified in several passes (duplicates dropped, a
 * BLOCK_CLOSE/BLOCK_OPEN pair merged into a single NEWLINE_MERGED, markers that directly
 * follow a NEWLINE dropped), then converted to "\n" or " ". The result is finally run
 * through formatMdLayout() and limited to one empty line between blocks of text.
 *
 * The input array is not modified.
 *
 * @param md Markdown fragments mixed with layout markers.
 * @returns The Markdown text, or an empty string if it only contains white space.
 */
function processMdArrayNewLines(md: string[]): string {
	// Returns the items for which shouldDrop(previousItem, item) is false. "previousItem" is
	// always the preceding item of the input, whether or not that item was itself dropped.
	const dropWhere = (items: string[], shouldDrop: (previous: string, current: string)=> boolean): string[] => {
		const kept: string[] = [];
		let previous = '';
		for (const current of items) {
			if (!shouldDrop(previous, current)) kept.push(current);
			previous = current;
		}
		return kept;
	};

	// Ignore the BLOCK_OPEN markers at the beginning and the BLOCK_CLOSE markers at the
	// end. This is done on a copy so that the array of the caller is left untouched.
	let first = 0;
	while (first < md.length && md[first] === BLOCK_OPEN) first++;
	let end = md.length;
	while (end > first && md[end - 1] === BLOCK_CLOSE) end--;
	let items = md.slice(first, end);

	// Drop a newline block that is identical to the one right before it
	items = dropWhere(items, (previous, current) => {
		return isNewLineBlock(previous) && isNewLineBlock(current) && previous === current;
	});

	// A block that closes right before another one opens is a single line break
	const merged: string[] = [];
	let last = '';
	for (const v of items) {
		if (last === BLOCK_CLOSE && v === BLOCK_OPEN) {
			merged.pop();
			merged.push(NEWLINE_MERGED);
		} else {
			merged.push(v);
		}
		last = v;
	}
	items = merged;

	// Drop the merged/closing markers that directly follow a NEWLINE...
	items = dropWhere(items, (previous, current) => {
		return previous === NEWLINE && (current === NEWLINE_MERGED || current === BLOCK_CLOSE);
	});

	// ... and the merged/opening markers that directly follow a NEWLINE
	items = dropWhere(items, (previous, current) => {
		return previous === NEWLINE && (current === NEWLINE_MERGED || current === BLOCK_OPEN);
	});

	if (items.length > 2) {
		if (items[items.length - 2] === NEWLINE_MERGED && items[items.length - 1] === NEWLINE) {
			items.pop();
		}
	}

	// Convert the remaining markers to actual text
	let output = '';
	let previous = '';
	let start = true;
	for (const v of items) {
		let add = '';
		if (v === BLOCK_CLOSE || v === BLOCK_OPEN || v === NEWLINE || v === NEWLINE_MERGED) {
			add = '\n';
		} else if (v === SPACE) {
			// No space at the very beginning or at the beginning of a line.
			// NOTE(review): "previous" holds the text that was emitted (' '), never the
			// SPACE marker itself, so the "previous === SPACE" test cannot match as
			// written - confirm whether consecutive SPACE markers should be collapsed.
			if (previous === SPACE || previous === '\n' || start) {
				continue; // skip
			} else {
				add = ' ';
			}
		} else {
			add = v;
		}
		start = false;
		output += add;
		previous = add;
	}

	if (!output.trim().length) return '';

	// To simplify the result, we only allow up to one empty line between blocks of text
	const mergeMultipleNewLines = function(lines: string[]) {
		const output = [];
		let newlineCount = 0;
		for (const line of lines) {
			if (!line.trim()) {
				newlineCount++;
			} else {
				newlineCount = 0;
			}

			if (newlineCount >= 2) continue;

			output.push(line);
		}
		return output;
	};

	// Remove the carriage returns. This must be /\r/ and not /\\r/, which matches a literal
	// backslash followed by "r" and would corrupt text such as "C:\reports".
	let lines = output.replace(/\r/g, '').split('\n');
	lines = formatMdLayout(lines);
	lines = mergeMultipleNewLines(lines);
	return lines.join('\n');
}

// While the processMdArrayNewLines() function adds newlines in a way that's technically correct, the resulting Markdown can look messy.
// This is because while a "block" element should be surrounded by newlines, in practice, some should be surrounded by TWO new lines, while
// others by only ONE.
//
// For instance, this:
//
//     <p>Some long paragraph</p>
//     <p>And another one</p>
//     <p>And the last paragraph</p>
//
// should result in this:
//
//     Some long paragraph
//
//     And another one
//
//     And the last paragraph
//
// So in one case, one newline between tags, and in another two newlines. In HTML this would be done via CSS, but in Markdown we need
// to add new lines. It's also important to get these newlines right because two blocks of text next to each other might be rendered
// differently than if there's a newline between them. So the function below parses the almost final MD and adds new lines depending
// on various rules.

// A heading is one or more "#" at the very beginning of the line, followed by a white space
const isHeading = (line: string) => line.match(/^#+\s/) !== null;

// A list item starts with "- ", possibly indented. An empty line is returned as is (falsy).
const isListItem = (line: string) => line && line.trim().startsWith('- ');

// A code line starts with a tab. An empty line is returned as is (falsy).
const isCodeLine = (line: string) => line && line.startsWith('\t');

// A table line starts with "| "
const isTableLine = (line: string) => line.startsWith('| ');

const isPlainParagraph = (line: string) => {
	// Note: a line shorter than 80 characters is not considered to be a paragraph, which
	// means no newlines will be added before or after. This is to handle text that has been
	// written with "hard" new lines.
	if (!line || line.length < 80) return false;

	// Anything long enough that is not one of the known constructs is a paragraph
	return !(isListItem(line) || isHeading(line) || isCodeLine(line) || isTableLine(line));
};

// Inserts an empty line wherever the kind of content changes: around lists, headings,
// code lines, table lines and plain paragraphs. Returns a new array.
function formatMdLayout(lines: string[]) {
	// Tells if an empty line must be inserted between "previous" and "line"
	const needsBlankLine = (previous: string, line: string): boolean => {
		// End of a list of items
		if (isListItem(previous) && line && !isListItem(line)) return true;

		// Beginning of a list of items
		if (isListItem(line) && previous && !isListItem(previous)) return true;

		// Before, then after a heading
		if (isHeading(line) && previous) return true;
		if (isHeading(previous) && line) return true;

		// Entering or leaving code
		if (Boolean(isCodeLine(line)) !== Boolean(isCodeLine(previous))) return true;

		// Entering or leaving a table
		if (isTableLine(line) !== isTableLine(previous)) return true;

		// Beginning, then end of a paragraph
		if (isPlainParagraph(line) && previous) return true;
		return Boolean(isPlainParagraph(previous) && line);
	};

	const formatted: string[] = [];
	let previous = '';

	for (const line of lines) {
		if (needsBlankLine(previous, line)) formatted.push('');
		formatted.push(line);
		previous = line;
	}

	return formatted;
}

// True if c is exactly one of the white space characters handled by simplifyString().
// The non-breaking space is deliberately not part of them.
function isWhiteSpace(c: string): boolean {
	switch (c) {
	case '\n':
	case '\r':
	case '\v':
	case '\f':
	case '\t':
	case ' ':
		return true;
	default:
		return false;
	}
}

// Like QString::simplified(), except that it preserves non-breaking spaces (which
// Evernote uses for indentation, etc.)
// Collapses every run of white space (as defined by isWhiteSpace()) to its first character,
// then removes the white space at the beginning and at the end of the string.
function simplifyString(s: string): string {
	const kept: string[] = [];
	let lastWasWhite = false;

	for (const c of s) {
		const white = isWhiteSpace(c);
		if (!(white && lastWasWhite)) kept.push(c);
		lastWasWhite = white;
	}

	let start = 0;
	let end = kept.length;
	while (start < end && isWhiteSpace(kept[start])) start++;
	while (end > start && isWhiteSpace(kept[end - 1])) end--;

	return kept.slice(start, end).join('');
}

// Appends "text" to "lines" and returns "lines". Inside code the text is appended as is.
// Otherwise its white space is collapsed, a leading/trailing space is kept as a SPACE marker,
// and a quote prefix is added when state.inQuote is set.
function collapseWhiteSpaceAndAppend(lines: string[], state: any, text: string) {
	// White space is significant inside code
	if (state.inCode.length) {
		lines.push(text);
		return lines;
	}

	// Remove all \n and \r from the left and right of the text
	text = text.replace(/^[\n\r]+/, '').replace(/[\n\r]+$/, '');

	// Collapse all white spaces to just one. If there are spaces to the left and right of the
	// string, remember them so that they can be re-added as a single SPACE marker each.
	const hasSpaceLeft = text.startsWith(' ');
	const hasSpaceRight = text.endsWith(' ');
	text = simplifyString(text);

	// Nothing left at all: don't append anything
	if (text === '' && !hasSpaceLeft && !hasSpaceRight) return lines;

	if (state.inQuote) {
		// Add a ">" at the beginning of the block then at the beginning of each line. So it
		// turns "my quote\nsecond line" into "> my quote\n> second line".
		lines.push('> ');

		// NOTE(review): this looks for an element equal to '\r' in the "lines" array, not for
		// a carriage return inside "text" - confirm that this is the intent.
		text = lines.indexOf('\r') >= 0 ? text.replace(/\n\r/g, '\n\r> ') : text.replace(/\n/g, '\n> ');
	}

	if (hasSpaceLeft) lines.push(SPACE);
	lines.push(text);
	if (hasSpaceRight) lines.push(SPACE);

	return lines;
}

// Makes the value of an HTML attribute usable inside the square brackets of a Markdown
// link or image.
function tagAttributeToMdText(attr: string): string {
	// HTML attributes may contain newlines so replace each run of them with a space, and
	// escape the closing square brackets.
	// https://github.com/laurent22/joplin/issues/1583
	return attr ? attr.replace(/[\n\r]+/g, ' ').replace(/\]/g, '\\]') : '';
}

interface AddResourceOptions {
	alt?: string;
	width?: number;
	height?: number;
}

// NOTE(review): the remainder of addResourceTag is missing from the text available for this
// review. The fragment below is reproduced as found and must be completed from the full source.
const addResourceTag = (lines: string[], src: string, mime: string, options: AddResourceOptions): string[] => {
	const alt = options.alt ? tagAttributeToMdText(options.alt) : '';
	if (resourceUtils.isImageMimeType(mime)) {
		if (!!options.width || !!options.height) {
			const attrs: Record
// Tables that contain other tables are rendered as regular text, and only the inner ones, those that don't contain any other tables,
// are rendered as actual tables. This is generally the required behaviour since the outer tables are usually for layout and the
// inner ones are the content.
/**
 * Renders a table section as an array of Markdown fragments and layout markers.
 *
 * A table that contains sub-tables is rendered "flat", as regular text in which every
 * row and cell is wrapped in BLOCK_OPEN / BLOCK_CLOSE and sub-tables are rendered
 * recursively. Any other table is rendered as a Markdown table, one row per fragment,
 * the rows being separated by NEWLINE markers. The lines of the caption, if any, are
 * appended after the table.
 */
function drawTable(table: Section) {
	// | First Header | Second Header |
	// | ------------- | ------------- |
	// | Content Cell | Content Cell |
	// | Content Cell | Content Cell |

	// There must be at least 3 dashes separating each header cell.
	// https://gist.github.com/IanWang/28965e13cdafdef4e11dc91f578d160d#tables

	const flatRender = tableHasSubTables(table); // Render the table as regular text
	let lines: any[] = [];
	lines.push(BLOCK_OPEN);
	let headerDone = false;
	let caption = null;

	for (const tr of table.lines) {
		// The caption is rendered after the table
		if (tr.type === 'caption') {
			caption = tr;
			continue;
		}

		const isHeader = tr.isHeader;
		const line = [];
		const headerLine = [];
		let emptyHeader = null;

		for (const td of tr.lines) {
			if (flatRender) {
				line.push(BLOCK_OPEN);

				let currentCells: any[] = [];

				// Converts the fragments accumulated so far to text and adds it to the row
				const renderCurrentCells = () => {
					if (!currentCells.length) return;
					const cellText = processMdArrayNewLines(currentCells);
					line.push(cellText);
					currentCells = [];
				};

				// In here, recursively render the tables
				for (const c of td.lines) {
					if (typeof c === 'object' && ['table', 'td', 'tr', 'th', 'caption'].indexOf(c.type) >= 0) {
						// This is a table
						renderCurrentCells();
						currentCells = currentCells.concat(drawTable(c));
					} else {
						// This is plain text
						currentCells = currentCells.concat(renderLine(c));
					}
				}

				renderCurrentCells();

				line.push(BLOCK_CLOSE);
			} else {
				// Regular table rendering

				// A cell in a Markdown table cannot have actual new lines, so each run of
				// new lines is replaced.
				// NOTE(review): the comment that was here lost a word ("replace them with ,
				// which are supported by the markdown renderers"). It probably named an HTML
				// tag such as <br> - confirm whether the replacement string below should be
				// that tag rather than a space.
				let cellText = processMdArrayNewLines(td.lines);
				const cellLines = postProcessMarkdown(cellText.split('\n'));
				cellText = cellLines.join('\n').replace(/\n+/g, ' ');

				// Inside tables cells, "|" needs to be escaped
				cellText = cellText.replace(/\|/g, '\\|');

				// Previously the width of the cell was as big as the content since it looks nicer, however that often doesn't work
				// since the content can be very long, resulting in unreadable markdown. So no solution is perfect but making it a
				// width of 3 is a bit better. Note that 3 is the minimum width of a cell - below this, it won't be rendered by
				// markdown parsers.
				const width = 3;
				line.push(stringPadding(cellText, width, ' ', stringPadding.RIGHT));

				if (!headerDone) {
					// When the first row is not a header, an empty header is added above it
					if (!isHeader) {
						if (!emptyHeader) emptyHeader = [];
						const h = stringPadding(' ', width, ' ', stringPadding.RIGHT);
						emptyHeader.push(h);
					}
					headerLine.push('-'.repeat(width));
				}
			}
		}

		if (flatRender) {
			headerDone = true;
			lines.push(BLOCK_OPEN);
			lines = lines.concat(line);
			lines.push(BLOCK_CLOSE);
		} else {
			if (emptyHeader) {
				lines.push(`| ${emptyHeader.join(' | ')} |`);
				lines.push(`| ${headerLine.join(' | ')} |`);
				headerDone = true;
			}

			lines.push(`| ${line.join(' | ')} |`);

			// The row that was just added was the header: add the dashes below it
			if (!headerDone) {
				lines.push(`| ${headerLine.join(' | ')} |`);
				headerDone = true;
			}
		}
	}

	lines.push(BLOCK_CLOSE);

	if (caption) {
		const captionLines: any[] = renderLines(caption.lines);
		lines = lines.concat(captionLines);
	}

	// For a Markdown table, insert a NEWLINE marker between each row
	return flatRender ? lines : lines.join(`<<<<:D>>>>${NEWLINE}<<<<:D>>>>`).split('<<<<:D>>>>');
}

/**
 * Cleans up the lines of the final Markdown: removes the empty lines at the beginning and at
 * the end, normalises the space after the "-" of a list item and removes trailing white space.
 *
 * Note that the leading/trailing empty lines are removed from the given array itself.
 */
function postProcessMarkdown(lines: string[]) {
	// After importing HTML, the resulting Markdown often has empty lines at the beginning and end due to
	// block start/end or elements that were ignored, etc. If these white spaces were intended it's not really
	// possible to detect it, so simply trim them all so that the result is more deterministic and can be
	// easily unit tested.
	const trimEmptyLines = function(lines: string[]) {
		while (lines.length) {
			if (!lines[0].trim()) {
				lines.splice(0, 1);
			} else {
				break;
			}
		}

		while (lines.length) {
			if (!lines[lines.length - 1].trim()) {
				lines.pop();
			} else {
				break;
			}
		}

		return lines;
	};

	function cleanUpSpaces(lines: string[]) {
		const output = [];

		for (let line of lines) {
			if (line.length) {
				// Leave exactly one space after the "-" of a list item. The white space
				// before the "-" is preserved.
				line = line.replace(/^(\s+|)-\s+/, '$1- ');

				// Remove the white space at the end of the line
				line = line.replace(/^(.*?)\s+$/, '$1');
			}

			output.push(line);
		}

		return output;
	}

	lines = trimEmptyLines(lines);
	lines = cleanUpSpaces(lines);

	return lines;
}

// A "line" can be some Markdown text, or it can be a section, like a table,
// etc. so this function returns an array of strings.
function renderLine(line: any) {
	if (typeof line === 'object' && line.type === 'table') {
		// A table
		return drawTable(line);
	} else if (typeof line === 'object' && line.type === 'code') {
		return line.lines;
	} else if (typeof line === 'object' && line.type === 'hidden') {
		// ENEX notes sometimes have hidden tags. We could strip off these
		// sections but in the spirit of preserving all data we wrap them in
		// a hidden tag too.
		//
		// NOTE(review): the three statements below are a reconstruction. In the text that
		// was reviewed, this part was reduced to "let hiddenLines = [' ');", which is not
		// valid code: the opening markup, the closing markup and what was between them
		// were lost. Confirm the exact markup against the full source.
		let hiddenLines = ['<div style="display: none;">'];
		hiddenLines = hiddenLines.concat(renderLines(line.lines));
		hiddenLines.push('</div>');

		// We need to add two new lines after the HTML block, or the Markdown
		// after that will not render.
		// https://github.com/markdown-it/markdown-it/issues/746
		hiddenLines.push(NEWLINE);
		hiddenLines.push(NEWLINE);
		return hiddenLines;
	} else if (typeof line === 'object') {
		console.warn('Unhandled object type:', line);
		return line.lines;
	} else {
		// an actual line
		return [line];
	}
}

// Renders each line with renderLine() and returns all the results as a single array
function renderLines(lines: any[]) {
	let mdLines: string[] = [];
	for (const line of lines) {
		mdLines = mdLines.concat(renderLine(line));
	}
	return mdLines;
}

/**
 * Converts the XML content of an ENEX note to Markdown.
 *
 * The content is rendered first, then one resource tag is appended for each resource
 * returned by enexXmlToMdArray(), and the result is cleaned up with postProcessMarkdown().
 */
async function enexXmlToMd(xmlString: string, resources: ResourceEntity[], tasks: ExtractedTask[]) {
	const stream = stringToStream(xmlString);
	const result = await enexXmlToMdArray(stream, resources, tasks);

	let mdLines = renderLines(result.content.lines);

	let firstAttachment = true;
	for (const r of result.resources) {
		// Two new lines before the first attachment, one before each of the others
		if (firstAttachment) mdLines.push(NEWLINE);
		mdLines.push(NEWLINE);
		mdLines = addResourceTag(mdLines, `:/${r.id}`, r.mime, {
			alt: altFromResource(r),
		});
		firstAttachment = false;
	}

	let output = processMdArrayNewLines(mdLines).split('\n');
	output = postProcessMarkdown(output);

	return output.join('\n');
}

export { enexXmlToMd, processMdArrayNewLines, NEWLINE, cssValue };