Commande

or any other block tag, we skip it, so that a new line // does not get created. It is to handle list4.html test case. // https://github.com/laurent22/joplin/issues/832 if (currentList) { if (!currentList.startedText && isBlockTag(n)) return; currentList.startedText = true; } // Note that the order of if/else blocks is important. In // particular table-related blocks should always be on top and // take priority over, in particular, hidden blocks. This is so // that a block that is both table-related and hidden is simply // handled as table-related. This is to ensure that the table // structure is valid. if (n == 'en-note') { // Start of note } else if (n == 'table') { const newSection: Section = { type: SectionType.Table, lines: [], parent: section, }; section.lines.push(newSection); section = newSection; } else if (n == 'tbody' || n == 'thead') { // Ignore it } else if (n == 'tr') { // Note: Even if we encounter tags in the wrong place, we // create the sections anyway so that the data is imported. // Invalid HTML like would most likely be from clipped // pages which would look like a mess in Evernote. So it // will look like a mess in Joplin too but at least the // data will be there. // // Also if we simply skip the section, it will cause an // error in drawTable() later on. 
// // https://discourse.joplinapp.org/t/not-all-notes-imported-from-evernote/13056/12?u=laurent if (section.type != 'table') { displaySaxWarning(this, 'Found a tag outside of a table'); // return; } const newSection: Section = { type: SectionType.Tr, lines: [], parent: section, isHeader: false, }; section.lines.push(newSection); section = newSection; } else if (n == 'td' || n == 'th') { if (section.type != 'tr') { displaySaxWarning(this, 'Found a tag outside of a '); // return; } if (n == 'th') section.isHeader = true; const newSection: Section = { type: SectionType.Td, lines: [], parent: section, }; section.lines.push(newSection); section = newSection; } else if (n == 'caption') { if (section.type != 'table') { displaySaxWarning(this, 'Found a tag outside of a '); // return; } const newSection: Section = { type: SectionType.Caption, lines: [], parent: section, }; section.lines.push(newSection); section = newSection; } else if (!isVisible) { const newSection: Section = { type: SectionType.Hidden, lines: [], parent: section, }; section.lines.push(newSection); section = newSection; } else if (isBlockTag(n)) { section.lines.push(BLOCK_OPEN); } else if (isListTag(n)) { section.lines.push(BLOCK_OPEN); state.lists.push({ tag: n, counter: 1, startedText: false }); } else if (n == 'li') { section.lines.push(BLOCK_OPEN); if (!state.lists.length) { displaySaxWarning(this, 'Found

tag without being inside a list'); return; } const container = state.lists[state.lists.length - 1]; container.startedText = false; const indent = ' '.repeat(state.lists.length - 1); if (container.tag == 'ul') { section.lines.push(`${indent}- `); } else { section.lines.push(`${indent + container.counter}. `); container.counter++; } } else if (isStrongTag(n)) { section.lines.push('**'); } else if (isStrikeTag(n)) { section.lines.push(''); } else if (isInlineCodeTag(n)) { section.lines.push('`'); } else if (n == 'q') { section.lines.push('"'); } else if (n == 'img') { if (nodeAttributes.src) { // Many (most?) img tags don't have no source associated, especially when they were imported from HTML let s = '!['; if (nodeAttributes.alt) s += tagAttributeToMdText(nodeAttributes.alt); s += `](${markdownUtils.escapeLinkUrl(nodeAttributes.src)})`; section.lines.push(s); } } else if (isAnchor(n)) { state.anchorAttributes.push(nodeAttributes); // Need to add the '[' via this function to make sure that links within code blocks // are handled correctly. collapseWhiteSpaceAndAppend(section.lines, state, '['); } else if (isEmTag(n)) { section.lines.push('*'); } else if (n == 'en-todo') { const x = nodeAttributes && nodeAttributes.checked && nodeAttributes.checked.toLowerCase() == 'true' ? 
'X' : ' '; section.lines.push(`- [${x}] `); } else if (n == 'hr') { // Needs to be surrounded by new lines so that it's properly rendered as a line when converting to HTML section.lines.push(NEWLINE); section.lines.push('* * *'); section.lines.push(NEWLINE); section.lines.push(NEWLINE); } else if (n == 'h1') { section.lines.push(BLOCK_OPEN); section.lines.push('# '); } else if (n == 'h2') { section.lines.push(BLOCK_OPEN); section.lines.push('## '); } else if (n == 'h3') { section.lines.push(BLOCK_OPEN); section.lines.push('### '); } else if (n == 'h4') { section.lines.push(BLOCK_OPEN); section.lines.push('#### '); } else if (n == 'h5') { section.lines.push(BLOCK_OPEN); section.lines.push('##### '); } else if (n == 'h6') { section.lines.push(BLOCK_OPEN); section.lines.push('###### '); } else if (n == 'blockquote') { section.lines.push(BLOCK_OPEN); state.inQuote = true; } else if (n === 'code') { state.inCode.push(true); state.currentCode = ''; const newSection: Section = { type: SectionType.Code, lines: [], parent: section, }; section.lines.push(newSection); section = newSection; } else if (n === 'pre') { section.lines.push(BLOCK_OPEN); state.inPre = true; } else if (n == 'br') { section.lines.push(NEWLINE); } else if (n == 'en-media') { const hash = nodeAttributes.hash; let resource = null; for (let i = 0; i < resources.length; i++) { const r = resources[i]; if (r.id == hash) { resource = r; removeRemainingResource(r.id); break; } } if (!resource) { // This is a bit of a hack. Notes sometime have resources attached to it, but those tags don't contain // an "objID" tag, making it impossible to reference the resource. However, in this case the content of the note // will contain a corresponding tag, which has the ID in the "hash" attribute. All this information // has been collected above so we now set the resource ID to the hash attribute of the en-media tags. 
Here's an // example of note that shows this problem: // // // // // Commande // // // // // // // ]]> // // 20160921T203424Z // 20160921T203438Z // // 20160902T140445Z // 20160924T101120Z // // // ........ // image/png // 150 // 150 // // // // Note that there's also the case of resources with no ID where the ID is actually the MD5 of the content. // This is handled in import-enex.js let found = false; for (let i = 0; i < remainingResources.length; i++) { const r = remainingResources[i]; if (!r.id) { resource = Object.assign({}, r); resource.id = hash; remainingResources.splice(i, 1); found = true; break; } } if (!found) { // console.warn(`Hash with no associated resource: ${hash}`); } } // If the resource does not appear among the note's resources, it // means it's an attachement. It will be appended along with the // other remaining resources at the bottom of the markdown text. if (resource && !!resource.id) { section.lines = addResourceTag(section.lines, resource, nodeAttributes.alt); } } else if (n == 'span') { if (isSpanWithStyle(nodeAttributes)) { // Found style(s) in span tag state.spanAttributes.push(nodeAttributes); if (isSpanStyleBold(nodeAttributes)) { // Applying style found in span tag: bold' section.lines.push('**'); } if (isSpanStyleItalic(nodeAttributes)) { // Applying style found in span tag: italic' section.lines.push('*'); } } } else if (['font', 'sup', 'cite', 'abbr', 'small', 'tt', 'sub', 'colgroup', 'col', 'ins', 'caption', 'var', 'map', 'area'].indexOf(n) >= 0) { // Inline tags that can be ignored in Markdown } else { console.warn(`Unsupported start tag: ${n}`); } }); saxStream.on('closetag', function(n: string) { n = n ? 
n.toLowerCase() : n; const poppedTag = state.tags.pop(); if (n == 'en-note') { // End of note } else if (!poppedTag.visible) { if (section && section.parent) section = section.parent; } else if (isNewLineOnlyEndTag(n)) { section.lines.push(BLOCK_CLOSE); } else if (n == 'td' || n == 'th') { if (section && section.parent) section = section.parent; } else if (n == 'tr' || n == 'caption') { if (section && section.parent) section = section.parent; } else if (n == 'table') { if (section && section.parent) section = section.parent; } else if (isIgnoredEndTag(n)) { // Skip } else if (isListTag(n)) { section.lines.push(BLOCK_CLOSE); state.lists.pop(); } else if (isStrongTag(n)) { section.lines.push('**'); } else if (isStrikeTag(n)) { section.lines.push(''); } else if (isInlineCodeTag(n)) { section.lines.push('`'); } else if (isEmTag(n)) { section.lines.push('*'); } else if (n == 'q') { section.lines.push('"'); } else if (n == 'blockquote') { section.lines.push(BLOCK_OPEN); state.inQuote = false; } else if (n === 'code') { state.inCode.pop(); if (!state.inCode.length) { const codeLines = processMdArrayNewLines(section.lines).split('\n'); section.lines = []; if (codeLines.length > 1) { for (let i = 0; i < codeLines.length; i++) { if (i > 0) section.lines.push('\n'); section.lines.push(`\t${codeLines[i]}`); } } else { section.lines.push(`\`${codeLines.join('')}\``); } if (section && section.parent) section = section.parent; } } else if (n === 'pre') { state.inPre = false; section.lines.push(BLOCK_CLOSE); } else if (isAnchor(n)) { const attributes = state.anchorAttributes.pop(); const url = attributes && attributes.href ? attributes.href : ''; if (section.lines.length < 1) throw new Error('Invalid anchor tag closing'); // Sanity check, but normally not possible // When closing the anchor tag, check if there's is any text content. If not // put the URL as is (don't wrap it in [](url)). The markdown parser, using // GitHub flavour, will turn this URL into a link. 
This is to generate slightly // cleaner markdown. // Need to loop on the previous tags so as to skip the special ones, which are not relevant for the below algorithm. let previous = null; for (let i = section.lines.length - 1; i >= 0; i--) { previous = section.lines[i]; if ([BLOCK_OPEN, BLOCK_CLOSE, NEWLINE, NEWLINE_MERGED, SPACE].indexOf(previous) >= 0 || !previous) { continue; } else { break; } } if (previous == '[') { // We have a link that had some content but, after parsing, nothing is left. The content was most likely // something that shows up via CSS and which we cannot support. For example: // // //

// // // In the case above the arrow is displayed via CSS. // It is useless to display the full URL since often it is not relevant for a note (for example // it's interactive bits) and it's not user-generated content such as a URL that would appear in a comment. // So in this case, we still want to preserve the information but display it in a discreet way as a simple [L]. // Need to pop everything inside the current [] because it can only be special chars that we don't want (they would create uncessary newlines) for (let i = section.lines.length - 1; i >= 0; i--) { if (section.lines[i] !== '[') { section.lines.pop(); } else { break; } } if (!url) { // If there's no URL and no content, pop the [ and don't save any content. section.lines.pop(); } else { section.lines.push('(L)'); section.lines.push(`](${url})`); } } else if (!previous || previous == url) { section.lines.pop(); section.lines.pop(); section.lines.push(url); } else { // Need to remove any new line character between the current ']' and the previous '[' // otherwise it won't render properly. let allSpaces = true; for (let i = section.lines.length - 1; i >= 0; i--) { const c = section.lines[i]; if (c === '[') { break; } else { if (c === BLOCK_CLOSE || c === BLOCK_OPEN || c === NEWLINE) { section.lines[i] = SPACE; } else { if (!isWhiteSpace(c)) allSpaces = false; } } } if (allSpaces) { for (let i = section.lines.length - 1; i >= 0; i--) { const c = section.lines.pop(); if (c === '[') break; } section.lines.push(url); } else { // Eg. 
// converts:
						//     [ Sign in ](https://example.com)
						// to:
						//     [Sign in](https://example.com)
						const trimTextStartAndEndSpaces = function(lines: string[]) {
							let firstBracketIndex = 0;
							let foundFirstNonWhite = false;
							for (let i = lines.length - 1; i >= 0; i--) {
								const l = lines[i];
								if (!foundFirstNonWhite && (l === SPACE || l === ' ' || !l)) {
									lines.pop();
								} else {
									foundFirstNonWhite = true;
								}

								if (l === '[') {
									firstBracketIndex = i;
									break;
								}
							}

							for (let i = firstBracketIndex + 1; i < lines.length; i++) {
								const l = lines[i];
								if (l === SPACE || l === ' ' || !l) {
									lines.splice(i, 1);
								} else {
									break;
								}
							}

							return lines;
						};

						section.lines = trimTextStartAndEndSpaces(section.lines);

						section.lines.push(`](${url})`);
					}
				}
			} else if (n == 'en-media') {
				// Skip
			} else if (n == 'span') {
				const attributes = state.spanAttributes.pop();
				if (isSpanWithStyle(attributes)) {
					if (isSpanStyleBold(attributes)) {
						// Applying style found in span tag (closing): bold'
						section.lines.push('**');
					}
					if (isSpanStyleItalic(attributes)) {
						// Applying style found in span tag (closing): italic'
						section.lines.push('*');
					}
				}
			} else {
				console.warn(`Unsupported end tag: ${n}`);
			}
		});

		saxStream.on('attribute', function() {});

		saxStream.on('end', function() {
			resolve({
				content: section,
				resources: remainingResources,
			} as EnexXmlToMdArrayResult);
		});

		stream.pipe(saxStream);
	});
}

// Returns true if any cell of the given table section contains a nested
// section (i.e. an object rather than a plain string), false otherwise.
// Rows that are missing or have no `lines` property are skipped.
function tableHasSubTables(table: Section) {
	for (let trIndex = 0; trIndex < table.lines.length; trIndex++) {
		const tr = table.lines[trIndex];
		if (!tr || !tr.lines) continue;

		for (let tdIndex = 0; tdIndex < tr.lines.length; tdIndex++) {
			const td = tr.lines[tdIndex];

			// We are inside a CAPTION, not a TD
			if (typeof td === 'string') continue;

			for (let i = 0; i < td.lines.length; i++) {
				if (typeof td.lines[i] === 'object') return true;
			}
		}
	}
	return false;
}

// Markdown tables don't support tables within tables, which is common in notes that are complete web pages, for example when imported
// via Web Clipper. So to handle this, we render all the outer tables as regular text (as if replacing all the <table>, <tr> and <td>
// elements by <div>) and only the inner ones, those that don't contain any other tables, are rendered as actual tables. This is generally
// the required behaviour since the outer tables are usually for layout and the inner ones are the content.
//
// Returns an array of Markdown fragments. When the table is rendered as an
// actual Markdown table, a NEWLINE marker is inserted between consecutive
// entries of the result.
function drawTable(table: Section) {
	// | First Header  | Second Header |
	// | ------------- | ------------- |
	// | Content Cell  | Content Cell  |
	// | Content Cell  | Content Cell  |

	// There must be at least 3 dashes separating each header cell.
	// https://gist.github.com/IanWang/28965e13cdafdef4e11dc91f578d160d#tables

	const flatRender = tableHasSubTables(table); // Render the table as regular text
	let lines = [];
	lines.push(BLOCK_OPEN);
	let headerDone = false;
	let caption = null;
	for (let trIndex = 0; trIndex < table.lines.length; trIndex++) {
		const tr = table.lines[trIndex];

		// The caption is kept aside and rendered after the table body.
		if (tr.type === 'caption') {
			caption = tr;
			continue;
		}

		const isHeader = tr.isHeader;
		const line = [];
		const headerLine = [];
		let emptyHeader = null;
		for (let tdIndex = 0; tdIndex < tr.lines.length; tdIndex++) {
			const td = tr.lines[tdIndex];

			if (flatRender) {
				line.push(BLOCK_OPEN);

				let currentCells: any[] = [];

				// Flushes the fragments accumulated so far into `line` as a
				// single piece of text.
				const renderCurrentCells = () => {
					if (!currentCells.length) return;
					const cellText = processMdArrayNewLines(currentCells);
					line.push(cellText);
					currentCells = [];
				};

				// In here, recursively render the tables
				for (let i = 0; i < td.lines.length; i++) {
					const c = td.lines[i];
					if (typeof c === 'object' && ['table', 'td', 'tr', 'th', 'caption'].indexOf(c.type) >= 0) {
						// This is a table
						renderCurrentCells();
						currentCells = currentCells.concat(drawTable(c));
					} else {
						// This is plain text
						// currentCells.push(c);
						currentCells = currentCells.concat(renderLine(c));
					}
				}

				renderCurrentCells();

				line.push(BLOCK_CLOSE);
			} else {
				// Regular table rendering

				// A cell in a Markdown table cannot have actual new lines so replace
				// them with <br>, which are supported by the markdown renderers.
				let cellText = processMdArrayNewLines(td.lines);
				// Named `cellLines` so that it does not shadow the outer `lines` accumulator.
				let cellLines = cellText.split('\n');
				cellLines = postProcessMarkdown(cellLines);
				cellText = cellLines.join('\n').replace(/\n+/g, '<br>');

				// Inside tables cells, "|" needs to be escaped
				cellText = cellText.replace(/\|/g, '\\|');

				// Previously the width of the cell was as big as the content since it looks nicer, however that often doesn't work
				// since the content can be very long, resulting in unreadable markdown. So no solution is perfect but making it a
				// width of 3 is a bit better. Note that 3 is the minimum width of a cell - below this, it won't be rendered by
				// markdown parsers.
				const width = 3;
				line.push(stringPadding(cellText, width, ' ', stringPadding.RIGHT));

				if (!headerDone) {
					if (!isHeader) {
						if (!emptyHeader) emptyHeader = [];
						const h = stringPadding(' ', width, ' ', stringPadding.RIGHT);
						emptyHeader.push(h);
					}
					headerLine.push('-'.repeat(width));
				}
			}
		}

		if (flatRender) {
			headerDone = true;
			lines.push(BLOCK_OPEN);
			lines = lines.concat(line);
			lines.push(BLOCK_CLOSE);
		} else {
			// If the first row is not a header row, emit an empty header
			// followed by the dash separator before the row itself.
			if (emptyHeader) {
				lines.push(`| ${emptyHeader.join(' | ')} |`);
				lines.push(`| ${headerLine.join(' | ')} |`);
				headerDone = true;
			}

			lines.push(`| ${line.join(' | ')} |`);

			// The first row was a header row: the separator goes right after it.
			if (!headerDone) {
				lines.push(`| ${headerLine.join(' | ')} |`);
				headerDone = true;
			}
		}
	}

	lines.push(BLOCK_CLOSE);

	if (caption) {
		const captionLines: any[] = renderLines(caption.lines);
		lines = lines.concat(captionLines);
	}

	// For a real Markdown table, interleave a NEWLINE marker between entries
	// by joining on a temporary delimiter and splitting again.
	return flatRender ? lines : lines.join(`<<<<:D>>>>${NEWLINE}<<<<:D>>>>`).split('<<<<:D>>>>');
}

// Trims blank lines at the start and end of the given lines, normalises the
// spacing after a leading "-" and strips trailing white space on each line.
function postProcessMarkdown(lines: string[]) {
	// After importing HTML, the resulting Markdown often has empty lines at the beginning and end due to
	// block start/end or elements that were ignored, etc. If these white spaces were intended it's not really
	// possible to detect it, so simply trim them all so that the result is more deterministic and can be
	// easily unit tested.
	const trimEmptyLines = function(lines: string[]) {
		while (lines.length) {
			if (!lines[0].trim()) {
				lines.splice(0, 1);
			} else {
				break;
			}
		}

		while (lines.length) {
			if (!lines[lines.length - 1].trim()) {
				lines.pop();
			} else {
				break;
			}
		}

		return lines;
	};

	function cleanUpSpaces(lines: string[]) {
		const output = [];

		for (let i = 0; i < lines.length; i++) {
			let line = lines[i];

			if (line.length) {
				// eg. "  -   Some list item" => "  - Some list item"
				// Note that spaces before the "-" are preserved
				line = line.replace(/^(\s+|)-\s+/, '$1- ');

				// eg "Some text   " => "Some text"
				line = line.replace(/^(.*?)\s+$/, '$1');
			}

			output.push(line);
		}

		return output;
	}

	lines = trimEmptyLines(lines);
	lines = cleanUpSpaces(lines);

	return lines;
}

// A "line" can be some Markdown text, or it can be a section, like a table,
// etc. so this function returns an array of strings.
function renderLine(line: any) {
	if (typeof line === 'object' && line.type === 'table') {
		// A table
		const table = line;
		return drawTable(table);
	} else if (typeof line === 'object' && line.type === 'code') {
		return line.lines;
	} else if (typeof line === 'object' && line.type === 'hidden') {
		// ENEX notes sometimes have hidden tags. We could strip off these
		// sections but in the spirit of preserving all data we wrap them in
		// a hidden tag too.
		//
		// NOTE(review): the wrapper markup below was reconstructed after the
		// original literals were lost from this file - confirm against the
		// project history.
		let hiddenLines = ['<div style="display: none;">'];
		hiddenLines = hiddenLines.concat(renderLines(line.lines));
		hiddenLines.push('</div>');

		// We need to add two new lines after the HTML block, or the Markdown
		// after that will not render.
		// https://github.com/markdown-it/markdown-it/issues/746
		hiddenLines.push(NEWLINE);
		hiddenLines.push(NEWLINE);
		return hiddenLines;
	} else if (typeof line === 'object') {
		console.warn('Unhandled object type:', line);
		return line.lines;
	} else { // an actual line
		return [line];
	}
}

// Renders each entry with renderLine() and concatenates the results, in
// order, into a single array.
function renderLines(lines: any[]) {
	let mdLines: string[] = [];
	for (let i = 0; i < lines.length; i++) {
		const renderedLines = renderLine(lines[i]);
		mdLines = mdLines.concat(renderedLines);
	}
	return mdLines;
}

// Converts the given ENEX XML string to Markdown. Resources returned by
// enexXmlToMdArray() as remaining are appended at the end of the text, each
// one via addResourceTag().
async function enexXmlToMd(xmlString: string, resources: ResourceEntity[]) {
	const stream = stringToStream(xmlString);
	const result = await enexXmlToMdArray(stream, resources);

	let mdLines = renderLines(result.content.lines);

	let firstAttachment = true;
	for (let i = 0; i < result.resources.length; i++) {
		const r = result.resources[i];
		// One extra NEWLINE separates the note body from the first attachment.
		if (firstAttachment) mdLines.push(NEWLINE);
		mdLines.push(NEWLINE);
		mdLines = addResourceTag(mdLines, r, r.filename);
		firstAttachment = false;
	}

	let output = processMdArrayNewLines(mdLines).split('\n');

	output = postProcessMarkdown(output);

	return output.join('\n');
}

export { enexXmlToMd, processMdArrayNewLines, NEWLINE, addResourceTag };