1
0
mirror of https://github.com/laurent22/joplin.git synced 2024-12-24 10:27:10 +02:00

All: Improved ENEX import for web pages that have been saved as notes

This commit is contained in:
Laurent Cozic 2017-12-06 19:29:58 +00:00
parent f42908b11c
commit 507e7e6014
5 changed files with 193 additions and 72 deletions

View File

@ -246,9 +246,13 @@ class Application extends BaseApplication {
try { try {
CommandClass = require(__dirname + '/command-' + name + '.js'); CommandClass = require(__dirname + '/command-' + name + '.js');
} catch (error) { } catch (error) {
let e = new Error('No such command: ' + name); if (error.message && error.message.indexOf('Cannot find module') >= 0) {
e.type = 'notFound'; let e = new Error(_('No such command: %s', name));
throw e; e.type = 'notFound';
throw e;
} else {
throw error;
}
} }
let cmd = new CommandClass(); let cmd = new CommandClass();

View File

@ -1,6 +1,6 @@
{ {
"name": "joplin", "name": "joplin",
"version": "0.10.77", "version": "0.10.78",
"lockfileVersion": 1, "lockfileVersion": 1,
"requires": true, "requires": true,
"dependencies": { "dependencies": {

View File

@ -18,7 +18,7 @@
], ],
"owner": "Laurent Cozic" "owner": "Laurent Cozic"
}, },
"version": "0.10.77", "version": "0.10.78",
"bin": { "bin": {
"joplin": "./main.js" "joplin": "./main.js"
}, },

View File

@ -194,11 +194,15 @@ function addResourceTag(lines, resource, alt = "") {
function isBlockTag(n) { function isBlockTag(n) {
return n=="div" || n=="p" || n=="dl" || n=="dd" || n=="center"; return n=="div" || n=="p" || n=="dl" || n=="dd" || n == 'dt' || n=="center";
} }
function isStrongTag(n) { function isStrongTag(n) {
return n == "strong" || n == "b"; return n == "strong" || n == "b" || n == 'big';
}
function isStrikeTag(n) {
return n == "strike" || n == "s" || n == 'del';
} }
function isEmTag(n) { function isEmTag(n) {
@ -210,7 +214,7 @@ function isAnchor(n) {
} }
function isIgnoredEndTag(n) { function isIgnoredEndTag(n) {
return n=="en-note" || n=="en-todo" || n=="span" || n=="body" || n=="html" || n=="font" || n=="br" || n=='hr' || n=='s' || n == 'tbody' || n == 'sup'; return n=="en-note" || n=="en-todo" || n=="span" || n=="body" || n=="html" || n=="font" || n=="br" || n=='hr' || n=='s' || n == 'tbody' || n == 'sup' || n == 'img' || n == 'abbr' || n == 'cite' || n == 'thead' || n == 'small' || n == 'tt' || n == 'sub';
} }
function isListTag(n) { function isListTag(n) {
@ -219,7 +223,7 @@ function isListTag(n) {
// Elements that don't require any special treatment beside adding a newline character // Elements that don't require any special treatment beside adding a newline character
function isNewLineOnlyEndTag(n) { function isNewLineOnlyEndTag(n) {
return n=="div" || n=="p" || n=="li" || n=="h1" || n=="h2" || n=="h3" || n=="h4" || n=="h5" || n=="dl" || n=="dd" || n=="center"; return n=="div" || n=="p" || n=="li" || n=="h1" || n=="h2" || n=="h3" || n=="h4" || n=="h5" || n=='h6' || n=="dl" || n=="dd" || n == 'dt' || n=="center";
} }
function isCodeTag(n) { function isCodeTag(n) {
@ -253,8 +257,27 @@ function xmlNodeText(xmlNode) {
return xmlNode[0]; return xmlNode[0];
} }
/**
 * Returns a copy of a parsed node's attributes with every attribute name lower-cased.
 *
 * @param {Object} node - Node object as received by the 'opentag' handler; only `node.attributes` is read.
 * @returns {Object} A new plain object mapping lower-cased attribute names to their original values.
 *   Empty when the node has no attributes. If two names differ only by case, the one
 *   enumerated last wins.
 */
function attributeToLowerCase(node) {
	if (!node.attributes) return {};

	let output = {};
	// Object.keys() yields own enumerable keys only, which is what the hasOwnProperty filter
	// was for, but without calling a method on the attributes object itself. That object's
	// keys come from the parsed markup, so an attribute literally named "hasOwnProperty"
	// (or a prototype-less attributes object) must not be able to break the lookup.
	for (const name of Object.keys(node.attributes)) {
		output[name.toLowerCase()] = node.attributes[name];
	}
	return output;
}
function enexXmlToMdArray(stream, resources) { function enexXmlToMdArray(stream, resources) {
resources = resources.slice(); let remainingResources = resources.slice();
// Removes, in place, every entry of remainingResources whose id is strictly equal to `id`.
const removeRemainingResource = (id) => {
	// Walk backwards: splice() shifts the following entries down by one, so a forward loop
	// that keeps incrementing its index would skip the entry right after each removal
	// and leave adjacent duplicates behind.
	for (let i = remainingResources.length - 1; i >= 0; i--) {
		if (remainingResources[i].id === id) remainingResources.splice(i, 1);
	}
};
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
let state = { let state = {
@ -265,7 +288,7 @@ function enexXmlToMdArray(stream, resources) {
}; };
let options = {}; let options = {};
let strict = true; let strict = false;
var saxStream = require('sax').createStream(strict, options) var saxStream = require('sax').createStream(strict, options)
let section = { let section = {
@ -275,14 +298,18 @@ function enexXmlToMdArray(stream, resources) {
}; };
saxStream.on('error', function(e) { saxStream.on('error', function(e) {
reject(e); console.warn(e);
//reject(e);
}) })
saxStream.on('text', function(text) { saxStream.on('text', function(text) {
if (['table', 'tr', 'tbody'].indexOf(section.type) >= 0) return;
section.lines = collapseWhiteSpaceAndAppend(section.lines, state, text); section.lines = collapseWhiteSpaceAndAppend(section.lines, state, text);
}) })
saxStream.on('opentag', function(node) { saxStream.on('opentag', function(node) {
const nodeAttributes = attributeToLowerCase(node);
let n = node.name.toLowerCase(); let n = node.name.toLowerCase();
if (n == 'en-note') { if (n == 'en-note') {
// Start of note // Start of note
@ -293,25 +320,51 @@ function enexXmlToMdArray(stream, resources) {
type: 'table', type: 'table',
lines: [], lines: [],
parent: section, parent: section,
toString: function() {
let output = [];
output.push(BLOCK_OPEN);
for (let i = 0; i < this.lines.length; i++) {
output = output.concat(this.lines[i].toMdLines());
}
output.push(BLOCK_CLOSE);
return processMdArrayNewLines(output);
},
}; };
section.lines.push(newSection); section.lines.push(newSection);
section = newSection; section = newSection;
} else if (n == 'tbody') { } else if (n == 'tbody' || n == 'thead') {
// Ignore it // Ignore it
} else if (n == 'tr') { } else if (n == 'tr') {
if (section.type != 'table') throw new Error('Found a <tr> tag outside of a table'); if (section.type != 'table') {
console.warn('Found a <tr> tag outside of a table');
return;
}
let newSection = { let newSection = {
type: 'tr', type: 'tr',
lines: [], lines: [],
parent: section, parent: section,
isHeader: false, isHeader: false,
// Normally tables are rendered properly as markdown, but for table within table within table... we cannot
// handle this in Markdown so simply render it as one cell per line.
toMdLines: function() {
let output = [];
output.push(BLOCK_OPEN);
for (let i = 0; i < this.lines.length; i++) {
output.push(this.lines[i].toString());
}
output.push(BLOCK_CLOSE);
return output;
},
} }
section.lines.push(newSection); section.lines.push(newSection);
section = newSection; section = newSection;
} else if (n == 'td' || n == 'th') { } else if (n == 'td' || n == 'th') {
if (section.type != 'tr') throw new Error('Found a <td> tag outside of a <tr>'); if (section.type != 'tr') {
console.warn('Found a <td> tag outside of a <tr>');
return;
}
if (n == 'th') section.isHeader = true; if (n == 'th') section.isHeader = true;
@ -319,6 +372,9 @@ function enexXmlToMdArray(stream, resources) {
type: 'td', type: 'td',
lines: [], lines: [],
parent: section, parent: section,
toString: function() {
return processMdArrayNewLines(this.lines);
},
}; };
section.lines.push(newSection); section.lines.push(newSection);
@ -342,17 +398,27 @@ function enexXmlToMdArray(stream, resources) {
} }
} else if (isStrongTag(n)) { } else if (isStrongTag(n)) {
section.lines.push("**"); section.lines.push("**");
} else if (n == 's') { } else if (isStrikeTag(n)) {
// Not supported section.lines.push('(');
} else if (n == 'samp') {
section.lines.push('`');
} else if (n == 'q') { } else if (n == 'q') {
section.lines.push('"'); section.lines.push('"');
} else if (n == 'img') {
// TODO: TEST IMAGE
if (nodeAttributes.src) { // Many (most?) img tags don't have no source associated, especially when they were imported from HTML
let s = '![';
if (nodeAttributes.alt) s += nodeAttributes.alt;
s += '](' + nodeAttributes.src + ')';
section.lines.push(s);
}
} else if (isAnchor(n)) { } else if (isAnchor(n)) {
state.anchorAttributes.push(node.attributes); state.anchorAttributes.push(nodeAttributes);
section.lines.push('['); section.lines.push('[');
} else if (isEmTag(n)) { } else if (isEmTag(n)) {
section.lines.push("*"); section.lines.push("*");
} else if (n == "en-todo") { } else if (n == "en-todo") {
let x = node.attributes && node.attributes.checked && node.attributes.checked.toLowerCase() == 'true' ? 'X' : ' '; let x = nodeAttributes && nodeAttributes.checked && nodeAttributes.checked.toLowerCase() == 'true' ? 'X' : ' ';
section.lines.push('- [' + x + '] '); section.lines.push('- [' + x + '] ');
} else if (n == "hr") { } else if (n == "hr") {
// Needs to be surrounded by new lines so that it's properly rendered as a line when converting to HTML // Needs to be surrounded by new lines so that it's properly rendered as a line when converting to HTML
@ -375,20 +441,20 @@ function enexXmlToMdArray(stream, resources) {
} else if (n == 'blockquote') { } else if (n == 'blockquote') {
section.lines.push(BLOCK_OPEN); section.lines.push(BLOCK_OPEN);
state.inQuote = true; state.inQuote = true;
} else if (isCodeTag(n, node.attributes)) { } else if (isCodeTag(n, nodeAttributes)) {
section.lines.push(BLOCK_OPEN); section.lines.push(BLOCK_OPEN);
state.inCode = true; state.inCode = true;
} else if (n == "br") { } else if (n == "br") {
section.lines.push(NEWLINE); section.lines.push(NEWLINE);
} else if (n == "en-media") { } else if (n == "en-media") {
const hash = node.attributes.hash; const hash = nodeAttributes.hash;
let resource = null; let resource = null;
for (let i = 0; i < resources.length; i++) { for (let i = 0; i < resources.length; i++) {
let r = resources[i]; let r = resources[i];
if (r.id == hash) { if (r.id == hash) {
resource = r; resource = r;
resources.splice(i, 1); removeRemainingResource(r.id);
break; break;
} }
} }
@ -430,11 +496,11 @@ function enexXmlToMdArray(stream, resources) {
// </en-export> // </en-export>
let found = false; let found = false;
for (let i = 0; i < resources.length; i++) { for (let i = 0; i < remainingResources.length; i++) {
let r = resources[i]; let r = remainingResources[i];
if (!r.id) { if (!r.id) {
r.id = hash; r.id = hash;
resources[i] = r; remainingResources[i] = r;
found = true; found = true;
break; break;
} }
@ -448,27 +514,29 @@ function enexXmlToMdArray(stream, resources) {
// means it's an attachement. It will be appended along with the // means it's an attachement. It will be appended along with the
// other remaining resources at the bottom of the markdown text. // other remaining resources at the bottom of the markdown text.
if (!!resource.id) { if (!!resource.id) {
section.lines = addResourceTag(section.lines, resource, node.attributes.alt); section.lines = addResourceTag(section.lines, resource, nodeAttributes.alt);
} }
} }
} else if (n == "span" || n == "font" || n == 'sup') { } else if (n == "span" || n == "font" || n == 'sup' || n == 'cite' || n == 'abbr' || n == 'small' || n == 'tt' || n == 'sub') {
// Ignore // Inline tags that can be ignored in Markdown
} else { } else {
console.warn("Unsupported start tag: " + n); console.warn("Unsupported start tag: " + n);
} }
}) })
saxStream.on('closetag', function(n) { saxStream.on('closetag', function(n) {
n = n ? n.toLowerCase() : n;
if (n == 'en-note') { if (n == 'en-note') {
// End of note // End of note
} else if (isNewLineOnlyEndTag(n)) { } else if (isNewLineOnlyEndTag(n)) {
section.lines.push(BLOCK_CLOSE); section.lines.push(BLOCK_CLOSE);
} else if (n == 'td' || n == 'th') { } else if (n == 'td' || n == 'th') {
section = section.parent; if (section && section.parent) section = section.parent;
} else if (n == 'tr') { } else if (n == 'tr') {
section = section.parent; if (section && section.parent) section = section.parent;
} else if (n == 'table') { } else if (n == 'table') {
section = section.parent; if (section && section.parent) section = section.parent;
} else if (isIgnoredEndTag(n)) { } else if (isIgnoredEndTag(n)) {
// Skip // Skip
} else if (isListTag(n)) { } else if (isListTag(n)) {
@ -476,6 +544,10 @@ function enexXmlToMdArray(stream, resources) {
state.lists.pop(); state.lists.pop();
} else if (isStrongTag(n)) { } else if (isStrongTag(n)) {
section.lines.push("**"); section.lines.push("**");
} else if (isStrikeTag(n)) {
section.lines.push(')');
} else if (n == 'samp') {
section.lines.push('`');
} else if (isEmTag(n)) { } else if (isEmTag(n)) {
section.lines.push("*"); section.lines.push("*");
} else if (n == 'q') { } else if (n == 'q') {
@ -527,7 +599,7 @@ function enexXmlToMdArray(stream, resources) {
saxStream.on('end', function() { saxStream.on('end', function() {
resolve({ resolve({
content: section, content: section,
resources: resources, resources: remainingResources,
}); });
}) })
@ -570,7 +642,7 @@ function colWidths(table) {
const tr = table.lines[trIndex]; const tr = table.lines[trIndex];
for (let tdIndex = 0; tdIndex < tr.lines.length; tdIndex++) { for (let tdIndex = 0; tdIndex < tr.lines.length; tdIndex++) {
const td = tr.lines[tdIndex]; const td = tr.lines[tdIndex];
const w = cellWidth(td.content); const w = Math.min(cellWidth(td.content), 20); // Have to set a max width otherwise it can be extremely long for notes that import entire web pages (eg. Hacker News comment pages)
if (output.length <= tdIndex) output.push(0); if (output.length <= tdIndex) output.push(0);
if (w > output[tdIndex]) output[tdIndex] = w; if (w > output[tdIndex]) output[tdIndex] = w;
} }

View File

@ -212,51 +212,92 @@ function importEnex(parentFolderId, filePath, importOptions = null) {
async function processNotes() { async function processNotes() {
if (processingNotes) return false; if (processingNotes) return false;
processingNotes = true; try {
stream.pause(); processingNotes = true;
stream.pause();
let chain = []; while (notes.length) {
while (notes.length) { let note = notes.shift();
let note = notes.shift(); const contentStream = stringToStream(note.bodyXml);
const contentStream = stringToStream(note.bodyXml); const body = await enexXmlToMd(contentStream, note.resources);
chain.push(() => { delete note.bodyXml;
return enexXmlToMd(contentStream, note.resources).then((body) => {
delete note.bodyXml;
// console.info('-----------------------------------------------------------'); // console.info('-----------------------------------------------------------');
// console.info(body); // console.info(body);
// console.info('-----------------------------------------------------------'); // console.info('-----------------------------------------------------------');
note.id = uuid.create(); note.id = uuid.create();
note.parent_id = parentFolderId; note.parent_id = parentFolderId;
note.body = body; note.body = body;
// Notes in enex files always have a created timestamp but not always an // Notes in enex files always have a created timestamp but not always an
// updated timestamp (it the note has never been modified). For sync // updated timestamp (it the note has never been modified). For sync
// we require an updated_time property, so set it to create_time in that case // we require an updated_time property, so set it to create_time in that case
if (!note.updated_time) note.updated_time = note.created_time; if (!note.updated_time) note.updated_time = note.created_time;
return saveNoteToStorage(note, importOptions.fuzzyMatching); const result = await saveNoteToStorage(note, importOptions.fuzzyMatching);
}).then((result) => {
if (result.noteUpdated) { if (result.noteUpdated) {
progressState.updated++; progressState.updated++;
} else if (result.noteCreated) { } else if (result.noteCreated) {
progressState.created++; progressState.created++;
} else if (result.noteSkipped) { } else if (result.noteSkipped) {
progressState.skipped++; progressState.skipped++;
} }
progressState.resourcesCreated += result.resourcesCreated; progressState.resourcesCreated += result.resourcesCreated;
progressState.notesTagged += result.notesTagged; progressState.notesTagged += result.notesTagged;
importOptions.onProgress(progressState); importOptions.onProgress(progressState);
}); }
}); } catch(error) {
console.error(error);
} }
return promiseChain(chain).then(() => { stream.resume();
stream.resume(); processingNotes = false;
processingNotes = false; return true;
return true;
}); // let chain = [];
// while (notes.length) {
// let note = notes.shift();
// const contentStream = stringToStream(note.bodyXml);
// chain.push(() => {
// return enexXmlToMd(contentStream, note.resources).then((body) => {
// delete note.bodyXml;
// // console.info('-----------------------------------------------------------');
// // console.info(body);
// // console.info('-----------------------------------------------------------');
// note.id = uuid.create();
// note.parent_id = parentFolderId;
// note.body = body;
// // Notes in enex files always have a created timestamp but not always an
// // updated timestamp (it the note has never been modified). For sync
// // we require an updated_time property, so set it to create_time in that case
// if (!note.updated_time) note.updated_time = note.created_time;
// return saveNoteToStorage(note, importOptions.fuzzyMatching);
// }).then((result) => {
// if (result.noteUpdated) {
// progressState.updated++;
// } else if (result.noteCreated) {
// progressState.created++;
// } else if (result.noteSkipped) {
// progressState.skipped++;
// }
// progressState.resourcesCreated += result.resourcesCreated;
// progressState.notesTagged += result.notesTagged;
// importOptions.onProgress(progressState);
// });
// });
// }
// return promiseChain(chain).then(() => {
// stream.resume();
// processingNotes = false;
// return true;
// });
} }
saxStream.on('error', (error) => { saxStream.on('error', (error) => {
@ -323,7 +364,11 @@ function importEnex(parentFolderId, filePath, importOptions = null) {
noteResourceRecognition.objID = extractRecognitionObjId(data); noteResourceRecognition.objID = extractRecognitionObjId(data);
} else if (note) { } else if (note) {
if (n == 'content') { if (n == 'content') {
note.bodyXml = data; if ('bodyXml' in note) {
note.bodyXml += data;
} else {
note.bodyXml = data;
}
} }
} }
}); });