From 507e7e6014d3c15816e91c6a73092fd93dc2940c Mon Sep 17 00:00:00 2001 From: Laurent Cozic Date: Wed, 6 Dec 2017 19:29:58 +0000 Subject: [PATCH] All: Improved ENEX import for web pages that have been saved as notes --- CliClient/app/app.js | 10 +- CliClient/package-lock.json | 2 +- CliClient/package.json | 2 +- ReactNativeClient/lib/import-enex-md-gen.js | 128 +++++++++++++++----- ReactNativeClient/lib/import-enex.js | 123 +++++++++++++------ 5 files changed, 193 insertions(+), 72 deletions(-) diff --git a/CliClient/app/app.js b/CliClient/app/app.js index f2863c49e..ee8bfc2bf 100644 --- a/CliClient/app/app.js +++ b/CliClient/app/app.js @@ -246,9 +246,13 @@ class Application extends BaseApplication { try { CommandClass = require(__dirname + '/command-' + name + '.js'); } catch (error) { - let e = new Error('No such command: ' + name); - e.type = 'notFound'; - throw e; + if (error.message && error.message.indexOf('Cannot find module') >= 0) { + let e = new Error(_('No such command: %s', name)); + e.type = 'notFound'; + throw e; + } else { + throw error; + } } let cmd = new CommandClass(); diff --git a/CliClient/package-lock.json b/CliClient/package-lock.json index 00cd5d87c..3b119bda4 100644 --- a/CliClient/package-lock.json +++ b/CliClient/package-lock.json @@ -1,6 +1,6 @@ { "name": "joplin", - "version": "0.10.77", + "version": "0.10.78", "lockfileVersion": 1, "requires": true, "dependencies": { diff --git a/CliClient/package.json b/CliClient/package.json index cd1fd2b24..5d686b0af 100644 --- a/CliClient/package.json +++ b/CliClient/package.json @@ -18,7 +18,7 @@ ], "owner": "Laurent Cozic" }, - "version": "0.10.77", + "version": "0.10.78", "bin": { "joplin": "./main.js" }, diff --git a/ReactNativeClient/lib/import-enex-md-gen.js b/ReactNativeClient/lib/import-enex-md-gen.js index 3c0a464ad..e11afa5a6 100644 --- a/ReactNativeClient/lib/import-enex-md-gen.js +++ b/ReactNativeClient/lib/import-enex-md-gen.js @@ -194,11 +194,15 @@ function addResourceTag(lines, resource, alt = "") { function isBlockTag(n) { - return n=="div" || n=="p" || n=="dl" || n=="dd" || n=="center"; + return n=="div" || n=="p" || n=="dl" || n=="dd" || n == 'dt' || n=="center"; } function isStrongTag(n) { - return n == "strong" || n == "b"; + return n == "strong" || n == "b" || n == 'big'; +} + +function isStrikeTag(n) { + return n == "strike" || n == "s" || n == 'del'; } function isEmTag(n) { @@ -210,7 +214,7 @@ function isAnchor(n) { } function isIgnoredEndTag(n) { - return n=="en-note" || n=="en-todo" || n=="span" || n=="body" || n=="html" || n=="font" || n=="br" || n=='hr' || n=='s' || n == 'tbody' || n == 'sup'; + return n=="en-note" || n=="en-todo" || n=="span" || n=="body" || n=="html" || n=="font" || n=="br" || n=='hr' || n=='s' || n == 'tbody' || n == 'sup' || n == 'img' || n == 'abbr' || n == 'cite' || n == 'thead' || n == 'small' || n == 'tt' || n == 'sub'; } function isListTag(n) { @@ -219,7 +223,7 @@ function isListTag(n) { // Elements that don't require any special treatment beside adding a newline character function isNewLineOnlyEndTag(n) { - return n=="div" || n=="p" || n=="li" || n=="h1" || n=="h2" || n=="h3" || n=="h4" || n=="h5" || n=="dl" || n=="dd" || n=="center"; + return n=="div" || n=="p" || n=="li" || n=="h1" || n=="h2" || n=="h3" || n=="h4" || n=="h5" || n=='h6' || n=="dl" || n=="dd" || n == 'dt' || n=="center"; } function isCodeTag(n) { @@ -253,8 +257,27 @@ function xmlNodeText(xmlNode) { return xmlNode[0]; } +function attributeToLowerCase(node) { + if (!node.attributes) return {}; + let output = {}; + for (let n in node.attributes) { + if (!node.attributes.hasOwnProperty(n)) continue; + output[n.toLowerCase()] = node.attributes[n]; + } + return output; +} + function enexXmlToMdArray(stream, resources) { - resources = resources.slice(); + let remainingResources = resources.slice(); + + const removeRemainingResource = (id) => { + for (let i = 0; i < remainingResources.length; i++) { + const r = remainingResources[i]; + if (r.id === id) { + remainingResources.splice(i, 1); + } + } + } return new Promise((resolve, reject) => { let state = { @@ -265,7 +288,7 @@ function enexXmlToMdArray(stream, resources) { }; let options = {}; - let strict = true; + let strict = false; var saxStream = require('sax').createStream(strict, options) let section = { @@ -275,14 +298,18 @@ function enexXmlToMdArray(stream, resources) { }; saxStream.on('error', function(e) { - reject(e); + console.warn(e); + //reject(e); }) saxStream.on('text', function(text) { + if (['table', 'tr', 'tbody'].indexOf(section.type) >= 0) return; section.lines = collapseWhiteSpaceAndAppend(section.lines, state, text); }) saxStream.on('opentag', function(node) { + const nodeAttributes = attributeToLowerCase(node); + let n = node.name.toLowerCase(); if (n == 'en-note') { // Start of note @@ -293,25 +320,51 @@ function enexXmlToMdArray(stream, resources) { type: 'table', lines: [], parent: section, + toString: function() { + let output = []; + output.push(BLOCK_OPEN); + for (let i = 0; i < this.lines.length; i++) { + output = output.concat(this.lines[i].toMdLines()); + } + output.push(BLOCK_CLOSE); + return processMdArrayNewLines(output); + }, }; section.lines.push(newSection); section = newSection; - } else if (n == 'tbody') { + } else if (n == 'tbody' || n == 'thead') { // Ignore it } else if (n == 'tr') { - if (section.type != 'table') throw new Error('Found a tag outside of a table'); + if (section.type != 'table') { + console.warn('Found a tag outside of a table'); + return; + } let newSection = { type: 'tr', lines: [], parent: section, isHeader: false, + // Normally tables are rendered properly as markdown, but for table within table within table... we cannot + // handle this in Markdown so simply render it as one cell per line. + toMdLines: function() { + let output = []; + output.push(BLOCK_OPEN); + for (let i = 0; i < this.lines.length; i++) { + output.push(this.lines[i].toString()); + } + output.push(BLOCK_CLOSE); + return output; + }, } section.lines.push(newSection); section = newSection; } else if (n == 'td' || n == 'th') { - if (section.type != 'tr') throw new Error('Found a tag outside of a '); + if (section.type != 'tr') { + console.warn('Found a tag outside of a '); + return; + } if (n == 'th') section.isHeader = true; @@ -319,6 +372,9 @@ function enexXmlToMdArray(stream, resources) { type: 'td', lines: [], parent: section, + toString: function() { + return processMdArrayNewLines(this.lines); + }, }; section.lines.push(newSection); @@ -342,17 +398,27 @@ function enexXmlToMdArray(stream, resources) { } } else if (isStrongTag(n)) { section.lines.push("**"); - } else if (n == 's') { - // Not supported + } else if (isStrikeTag(n)) { + section.lines.push('('); + } else if (n == 'samp') { + section.lines.push('`'); } else if (n == 'q') { section.lines.push('"'); + } else if (n == 'img') { + // TODO: TEST IMAGE + if (nodeAttributes.src) { // Many (most?) img tags don't have no source associated, especially when they were imported from HTML + let s = '!['; + if (nodeAttributes.alt) s += nodeAttributes.alt; + s += '](' + nodeAttributes.src + ')'; + section.lines.push(s); + } } else if (isAnchor(n)) { - state.anchorAttributes.push(node.attributes); + state.anchorAttributes.push(nodeAttributes); section.lines.push('['); } else if (isEmTag(n)) { section.lines.push("*"); } else if (n == "en-todo") { - let x = node.attributes && node.attributes.checked && node.attributes.checked.toLowerCase() == 'true' ? 'X' : ' '; + let x = nodeAttributes && nodeAttributes.checked && nodeAttributes.checked.toLowerCase() == 'true' ? 'X' : ' '; section.lines.push('- [' + x + '] '); } else if (n == "hr") { // Needs to be surrounded by new lines so that it's properly rendered as a line when converting to HTML @@ -375,20 +441,20 @@ function enexXmlToMdArray(stream, resources) { } else if (n == 'blockquote') { section.lines.push(BLOCK_OPEN); state.inQuote = true; - } else if (isCodeTag(n, node.attributes)) { + } else if (isCodeTag(n, nodeAttributes)) { section.lines.push(BLOCK_OPEN); state.inCode = true; } else if (n == "br") { section.lines.push(NEWLINE); } else if (n == "en-media") { - const hash = node.attributes.hash; + const hash = nodeAttributes.hash; let resource = null; for (let i = 0; i < resources.length; i++) { let r = resources[i]; if (r.id == hash) { resource = r; - resources.splice(i, 1); + removeRemainingResource(r.id); break; } } @@ -430,11 +496,11 @@ function enexXmlToMdArray(stream, resources) { // let found = false; - for (let i = 0; i < resources.length; i++) { - let r = resources[i]; + for (let i = 0; i < remainingResources.length; i++) { + let r = remainingResources[i]; if (!r.id) { r.id = hash; - resources[i] = r; + remainingResources[i] = r; found = true; break; } @@ -448,27 +514,29 @@ function enexXmlToMdArray(stream, resources) { // means it's an attachement. It will be appended along with the // other remaining resources at the bottom of the markdown text. if (!!resource.id) { - section.lines = addResourceTag(section.lines, resource, node.attributes.alt); + section.lines = addResourceTag(section.lines, resource, nodeAttributes.alt); } } - } else if (n == "span" || n == "font" || n == 'sup') { - // Ignore + } else if (n == "span" || n == "font" || n == 'sup' || n == 'cite' || n == 'abbr' || n == 'small' || n == 'tt' || n == 'sub') { + // Inline tags that can be ignored in Markdown } else { console.warn("Unsupported start tag: " + n); } }) saxStream.on('closetag', function(n) { + n = n ? n.toLowerCase() : n; + if (n == 'en-note') { // End of note } else if (isNewLineOnlyEndTag(n)) { section.lines.push(BLOCK_CLOSE); } else if (n == 'td' || n == 'th') { - section = section.parent; + if (section && section.parent) section = section.parent; } else if (n == 'tr') { - section = section.parent; + if (section && section.parent) section = section.parent; } else if (n == 'table') { - section = section.parent; + if (section && section.parent) section = section.parent; } else if (isIgnoredEndTag(n)) { // Skip } else if (isListTag(n)) { @@ -476,6 +544,10 @@ function enexXmlToMdArray(stream, resources) { state.lists.pop(); } else if (isStrongTag(n)) { section.lines.push("**"); + } else if (isStrikeTag(n)) { + section.lines.push(')'); + } else if (n == 'samp') { + section.lines.push('`'); } else if (isEmTag(n)) { section.lines.push("*"); } else if (n == 'q') { @@ -527,7 +599,7 @@ function enexXmlToMdArray(stream, resources) { saxStream.on('end', function() { resolve({ content: section, - resources: resources, + resources: remainingResources, }); }) @@ -570,7 +642,7 @@ function colWidths(table) { const tr = table.lines[trIndex]; for (let tdIndex = 0; tdIndex < tr.lines.length; tdIndex++) { const td = tr.lines[tdIndex]; - const w = cellWidth(td.content); + const w = Math.min(cellWidth(td.content), 20); // Have to set a max width otherwise it can be extremely long for notes that import entire web pages (eg. Hacker News comment pages) if (output.length <= tdIndex) output.push(0); if (w > output[tdIndex]) output[tdIndex] = w; } diff --git a/ReactNativeClient/lib/import-enex.js b/ReactNativeClient/lib/import-enex.js index 475baff78..40fbf0617 100644 --- a/ReactNativeClient/lib/import-enex.js +++ b/ReactNativeClient/lib/import-enex.js @@ -212,51 +212,92 @@ function importEnex(parentFolderId, filePath, importOptions = null) { async function processNotes() { if (processingNotes) return false; - processingNotes = true; - stream.pause(); + try { + processingNotes = true; + stream.pause(); - let chain = []; - while (notes.length) { - let note = notes.shift(); - const contentStream = stringToStream(note.bodyXml); - chain.push(() => { - return enexXmlToMd(contentStream, note.resources).then((body) => { - delete note.bodyXml; + while (notes.length) { + let note = notes.shift(); + const contentStream = stringToStream(note.bodyXml); + const body = await enexXmlToMd(contentStream, note.resources); + delete note.bodyXml; - // console.info('-----------------------------------------------------------'); - // console.info(body); - // console.info('-----------------------------------------------------------'); + // console.info('-----------------------------------------------------------'); + // console.info(body); + // console.info('-----------------------------------------------------------'); - note.id = uuid.create(); - note.parent_id = parentFolderId; - note.body = body; + note.id = uuid.create(); + note.parent_id = parentFolderId; + note.body = body; - // Notes in enex files always have a created timestamp but not always an - // updated timestamp (it the note has never been modified). For sync - // we require an updated_time property, so set it to create_time in that case - if (!note.updated_time) note.updated_time = note.created_time; + // Notes in enex files always have a created timestamp but not always an + // updated timestamp (it the note has never been modified). For sync + // we require an updated_time property, so set it to create_time in that case + if (!note.updated_time) note.updated_time = note.created_time; - return saveNoteToStorage(note, importOptions.fuzzyMatching); - }).then((result) => { - if (result.noteUpdated) { - progressState.updated++; - } else if (result.noteCreated) { - progressState.created++; - } else if (result.noteSkipped) { - progressState.skipped++; - } - progressState.resourcesCreated += result.resourcesCreated; - progressState.notesTagged += result.notesTagged; - importOptions.onProgress(progressState); - }); - }); + const result = await saveNoteToStorage(note, importOptions.fuzzyMatching); + + if (result.noteUpdated) { + progressState.updated++; + } else if (result.noteCreated) { + progressState.created++; + } else if (result.noteSkipped) { + progressState.skipped++; + } + progressState.resourcesCreated += result.resourcesCreated; + progressState.notesTagged += result.notesTagged; + importOptions.onProgress(progressState); + } + } catch(error) { + console.error(error); } - return promiseChain(chain).then(() => { - stream.resume(); - processingNotes = false; - return true; - }); + stream.resume(); + processingNotes = false; + return true; + + // let chain = []; + // while (notes.length) { + // let note = notes.shift(); + // const contentStream = stringToStream(note.bodyXml); + // chain.push(() => { + // return enexXmlToMd(contentStream, note.resources).then((body) => { + // delete note.bodyXml; + + // // console.info('-----------------------------------------------------------'); + // // console.info(body); + // // console.info('-----------------------------------------------------------'); + + // note.id = uuid.create(); + // note.parent_id = parentFolderId; + // note.body = body; + + // // Notes in enex files always have a created timestamp but not always an + // // updated timestamp (it the note has never been modified). For sync + // // we require an updated_time property, so set it to create_time in that case + // if (!note.updated_time) note.updated_time = note.created_time; + + // return saveNoteToStorage(note, importOptions.fuzzyMatching); + // }).then((result) => { + // if (result.noteUpdated) { + // progressState.updated++; + // } else if (result.noteCreated) { + // progressState.created++; + // } else if (result.noteSkipped) { + // progressState.skipped++; + // } + // progressState.resourcesCreated += result.resourcesCreated; + // progressState.notesTagged += result.notesTagged; + // importOptions.onProgress(progressState); + // }); + // }); + // } + + // return promiseChain(chain).then(() => { + // stream.resume(); + // processingNotes = false; + // return true; + // }); } saxStream.on('error', (error) => { @@ -323,7 +364,11 @@ function importEnex(parentFolderId, filePath, importOptions = null) { noteResourceRecognition.objID = extractRecognitionObjId(data); } else if (note) { if (n == 'content') { - note.bodyXml = data; + if ('bodyXml' in note) { + note.bodyXml += data; + } else { + note.bodyXml = data; + } } } });