1
0
mirror of https://github.com/laurent22/joplin.git synced 2024-12-24 10:27:10 +02:00

All: Improved ENEX import for web pages that have been saved as notes

This commit is contained in:
Laurent Cozic 2017-12-06 19:29:58 +00:00
parent f42908b11c
commit 507e7e6014
5 changed files with 193 additions and 72 deletions

View File

@ -246,9 +246,13 @@ class Application extends BaseApplication {
try {
CommandClass = require(__dirname + '/command-' + name + '.js');
} catch (error) {
let e = new Error('No such command: ' + name);
e.type = 'notFound';
throw e;
if (error.message && error.message.indexOf('Cannot find module') >= 0) {
let e = new Error(_('No such command: %s', name));
e.type = 'notFound';
throw e;
} else {
throw error;
}
}
let cmd = new CommandClass();

View File

@ -1,6 +1,6 @@
{
"name": "joplin",
"version": "0.10.77",
"version": "0.10.78",
"lockfileVersion": 1,
"requires": true,
"dependencies": {

View File

@ -18,7 +18,7 @@
],
"owner": "Laurent Cozic"
},
"version": "0.10.77",
"version": "0.10.78",
"bin": {
"joplin": "./main.js"
},

View File

@ -194,11 +194,15 @@ function addResourceTag(lines, resource, alt = "") {
// Tells whether the tag name `n` is one of the names this converter treats as a block tag.
// NOTE(review): two consecutive return statements appear below (the second one also accepts 'dt').
// As written only the first can execute; this looks like the before/after lines of a diff shown
// together - confirm which line is meant to be live.
function isBlockTag(n) {
return n=="div" || n=="p" || n=="dl" || n=="dd" || n=="center";
return n=="div" || n=="p" || n=="dl" || n=="dd" || n == 'dt' || n=="center";
}
// Tells whether the tag name `n` is rendered as bold text; the open and close tag handlers both
// push "**" for tags that match.
// NOTE(review): two consecutive return statements appear below (the second one also accepts 'big').
// As written only the first can execute; this looks like the before/after lines of a diff shown
// together - confirm which line is meant to be live.
function isStrongTag(n) {
return n == "strong" || n == "b";
return n == "strong" || n == "b" || n == 'big';
}
// Tells whether the tag name `n` is one of the strike-through elements: <strike>, <s> or <del>.
// Comparison is loose (==), exactly like the other tag predicates in this file.
function isStrikeTag(n) {
	if (n == 'strike') return true;
	if (n == 's') return true;
	return n == 'del';
}
function isEmTag(n) {
@ -210,7 +214,7 @@ function isAnchor(n) {
}
// Tells whether the closing tag `n` produces no output at all; the close tag handler simply skips
// tags that match.
// NOTE(review): two consecutive return statements appear below (the second one extends the list
// with 'img', 'abbr', 'cite', 'thead', 'small', 'tt' and 'sub'). As written only the first can
// execute; this looks like the before/after lines of a diff shown together - confirm which line
// is meant to be live.
function isIgnoredEndTag(n) {
return n=="en-note" || n=="en-todo" || n=="span" || n=="body" || n=="html" || n=="font" || n=="br" || n=='hr' || n=='s' || n == 'tbody' || n == 'sup';
return n=="en-note" || n=="en-todo" || n=="span" || n=="body" || n=="html" || n=="font" || n=="br" || n=='hr' || n=='s' || n == 'tbody' || n == 'sup' || n == 'img' || n == 'abbr' || n == 'cite' || n == 'thead' || n == 'small' || n == 'tt' || n == 'sub';
}
function isListTag(n) {
@ -219,7 +223,7 @@ function isListTag(n) {
// Elements that don't require any special treatment beside adding a newline character
// (the close tag handler pushes BLOCK_CLOSE for tags that match).
// NOTE(review): two consecutive return statements appear below (the second one also accepts 'h6'
// and 'dt'). As written only the first can execute; this looks like the before/after lines of a
// diff shown together - confirm which line is meant to be live.
function isNewLineOnlyEndTag(n) {
return n=="div" || n=="p" || n=="li" || n=="h1" || n=="h2" || n=="h3" || n=="h4" || n=="h5" || n=="dl" || n=="dd" || n=="center";
return n=="div" || n=="p" || n=="li" || n=="h1" || n=="h2" || n=="h3" || n=="h4" || n=="h5" || n=='h6' || n=="dl" || n=="dd" || n == 'dt' || n=="center";
}
function isCodeTag(n) {
@ -253,8 +257,27 @@ function xmlNodeText(xmlNode) {
return xmlNode[0];
}
/**
 * Builds a copy of a parsed node's attribute map in which every attribute name is lower-cased,
 * so that callers can look attributes up (`src`, `alt`, `hash`, `checked`...) without caring
 * about the case used in the original markup.
 *
 * @param {Object} node - Parsed tag object carrying an optional `attributes` map.
 * @returns {Object} A new plain object keyed by lower-cased attribute name. Empty when the node
 *   is missing or has no attributes. The input map is never modified.
 */
function attributeToLowerCase(node) {
	if (!node || !node.attributes) return {};

	const attributes = node.attributes;
	const output = {};

	// Object.keys() only lists own enumerable keys, so no hasOwnProperty() filter is needed.
	// Calling attributes.hasOwnProperty() directly is unsafe here: the map comes from parsed
	// markup, so it may contain an attribute literally named "hasOwnProperty" (shadowing the
	// method) or may have been created without a prototype.
	const names = Object.keys(attributes);
	for (let i = 0; i < names.length; i++) {
		const name = names[i];
		output[name.toLowerCase()] = attributes[name];
	}

	return output;
}
function enexXmlToMdArray(stream, resources) {
resources = resources.slice();
let remainingResources = resources.slice();
// Removes from `remainingResources` every entry whose `id` strictly equals the given id.
// The array is modified in place; nothing is returned.
const removeRemainingResource = (id) => {
	// Walk the array backwards: splice() shifts the following entries down by one, so a forward
	// scan would skip the entry located right after each removal and leave adjacent duplicates
	// behind.
	for (let i = remainingResources.length - 1; i >= 0; i--) {
		if (remainingResources[i].id === id) remainingResources.splice(i, 1);
	}
};
return new Promise((resolve, reject) => {
let state = {
@ -265,7 +288,7 @@ function enexXmlToMdArray(stream, resources) {
};
let options = {};
let strict = true;
let strict = false;
var saxStream = require('sax').createStream(strict, options)
let section = {
@ -275,14 +298,18 @@ function enexXmlToMdArray(stream, resources) {
};
saxStream.on('error', function(e) {
reject(e);
console.warn(e);
//reject(e);
})
saxStream.on('text', function(text) {
if (['table', 'tr', 'tbody'].indexOf(section.type) >= 0) return;
section.lines = collapseWhiteSpaceAndAppend(section.lines, state, text);
})
saxStream.on('opentag', function(node) {
const nodeAttributes = attributeToLowerCase(node);
let n = node.name.toLowerCase();
if (n == 'en-note') {
// Start of note
@ -293,25 +320,51 @@ function enexXmlToMdArray(stream, resources) {
type: 'table',
lines: [],
parent: section,
toString: function() {
let output = [];
output.push(BLOCK_OPEN);
for (let i = 0; i < this.lines.length; i++) {
output = output.concat(this.lines[i].toMdLines());
}
output.push(BLOCK_CLOSE);
return processMdArrayNewLines(output);
},
};
section.lines.push(newSection);
section = newSection;
} else if (n == 'tbody') {
} else if (n == 'tbody' || n == 'thead') {
// Ignore it
} else if (n == 'tr') {
if (section.type != 'table') throw new Error('Found a <tr> tag outside of a table');
if (section.type != 'table') {
console.warn('Found a <tr> tag outside of a table');
return;
}
let newSection = {
type: 'tr',
lines: [],
parent: section,
isHeader: false,
// Normally tables are rendered properly as markdown, but for table within table within table... we cannot
// handle this in Markdown so simply render it as one cell per line.
toMdLines: function() {
let output = [];
output.push(BLOCK_OPEN);
for (let i = 0; i < this.lines.length; i++) {
output.push(this.lines[i].toString());
}
output.push(BLOCK_CLOSE);
return output;
},
}
section.lines.push(newSection);
section = newSection;
} else if (n == 'td' || n == 'th') {
if (section.type != 'tr') throw new Error('Found a <td> tag outside of a <tr>');
if (section.type != 'tr') {
console.warn('Found a <td> tag outside of a <tr>');
return;
}
if (n == 'th') section.isHeader = true;
@ -319,6 +372,9 @@ function enexXmlToMdArray(stream, resources) {
type: 'td',
lines: [],
parent: section,
toString: function() {
return processMdArrayNewLines(this.lines);
},
};
section.lines.push(newSection);
@ -342,17 +398,27 @@ function enexXmlToMdArray(stream, resources) {
}
} else if (isStrongTag(n)) {
section.lines.push("**");
} else if (n == 's') {
// Not supported
} else if (isStrikeTag(n)) {
section.lines.push('(');
} else if (n == 'samp') {
section.lines.push('`');
} else if (n == 'q') {
section.lines.push('"');
} else if (n == 'img') {
// TODO: TEST IMAGE
if (nodeAttributes.src) { // Many (most?) img tags don't have no source associated, especially when they were imported from HTML
let s = '![';
if (nodeAttributes.alt) s += nodeAttributes.alt;
s += '](' + nodeAttributes.src + ')';
section.lines.push(s);
}
} else if (isAnchor(n)) {
state.anchorAttributes.push(node.attributes);
state.anchorAttributes.push(nodeAttributes);
section.lines.push('[');
} else if (isEmTag(n)) {
section.lines.push("*");
} else if (n == "en-todo") {
let x = node.attributes && node.attributes.checked && node.attributes.checked.toLowerCase() == 'true' ? 'X' : ' ';
let x = nodeAttributes && nodeAttributes.checked && nodeAttributes.checked.toLowerCase() == 'true' ? 'X' : ' ';
section.lines.push('- [' + x + '] ');
} else if (n == "hr") {
// Needs to be surrounded by new lines so that it's properly rendered as a line when converting to HTML
@ -375,20 +441,20 @@ function enexXmlToMdArray(stream, resources) {
} else if (n == 'blockquote') {
section.lines.push(BLOCK_OPEN);
state.inQuote = true;
} else if (isCodeTag(n, node.attributes)) {
} else if (isCodeTag(n, nodeAttributes)) {
section.lines.push(BLOCK_OPEN);
state.inCode = true;
} else if (n == "br") {
section.lines.push(NEWLINE);
} else if (n == "en-media") {
const hash = node.attributes.hash;
const hash = nodeAttributes.hash;
let resource = null;
for (let i = 0; i < resources.length; i++) {
let r = resources[i];
if (r.id == hash) {
resource = r;
resources.splice(i, 1);
removeRemainingResource(r.id);
break;
}
}
@ -430,11 +496,11 @@ function enexXmlToMdArray(stream, resources) {
// </en-export>
let found = false;
for (let i = 0; i < resources.length; i++) {
let r = resources[i];
for (let i = 0; i < remainingResources.length; i++) {
let r = remainingResources[i];
if (!r.id) {
r.id = hash;
resources[i] = r;
remainingResources[i] = r;
found = true;
break;
}
@ -448,27 +514,29 @@ function enexXmlToMdArray(stream, resources) {
// means it's an attachment. It will be appended along with the
// other remaining resources at the bottom of the markdown text.
if (!!resource.id) {
section.lines = addResourceTag(section.lines, resource, node.attributes.alt);
section.lines = addResourceTag(section.lines, resource, nodeAttributes.alt);
}
}
} else if (n == "span" || n == "font" || n == 'sup') {
// Ignore
} else if (n == "span" || n == "font" || n == 'sup' || n == 'cite' || n == 'abbr' || n == 'small' || n == 'tt' || n == 'sub') {
// Inline tags that can be ignored in Markdown
} else {
console.warn("Unsupported start tag: " + n);
}
})
saxStream.on('closetag', function(n) {
n = n ? n.toLowerCase() : n;
if (n == 'en-note') {
// End of note
} else if (isNewLineOnlyEndTag(n)) {
section.lines.push(BLOCK_CLOSE);
} else if (n == 'td' || n == 'th') {
section = section.parent;
if (section && section.parent) section = section.parent;
} else if (n == 'tr') {
section = section.parent;
if (section && section.parent) section = section.parent;
} else if (n == 'table') {
section = section.parent;
if (section && section.parent) section = section.parent;
} else if (isIgnoredEndTag(n)) {
// Skip
} else if (isListTag(n)) {
@ -476,6 +544,10 @@ function enexXmlToMdArray(stream, resources) {
state.lists.pop();
} else if (isStrongTag(n)) {
section.lines.push("**");
} else if (isStrikeTag(n)) {
section.lines.push(')');
} else if (n == 'samp') {
section.lines.push('`');
} else if (isEmTag(n)) {
section.lines.push("*");
} else if (n == 'q') {
@ -527,7 +599,7 @@ function enexXmlToMdArray(stream, resources) {
saxStream.on('end', function() {
resolve({
content: section,
resources: resources,
resources: remainingResources,
});
})
@ -570,7 +642,7 @@ function colWidths(table) {
const tr = table.lines[trIndex];
for (let tdIndex = 0; tdIndex < tr.lines.length; tdIndex++) {
const td = tr.lines[tdIndex];
const w = cellWidth(td.content);
const w = Math.min(cellWidth(td.content), 20); // Have to set a max width otherwise it can be extremely long for notes that import entire web pages (eg. Hacker News comment pages)
if (output.length <= tdIndex) output.push(0);
if (w > output[tdIndex]) output[tdIndex] = w;
}

View File

@ -212,51 +212,92 @@ function importEnex(parentFolderId, filePath, importOptions = null) {
async function processNotes() {
if (processingNotes) return false;
processingNotes = true;
stream.pause();
try {
processingNotes = true;
stream.pause();
let chain = [];
while (notes.length) {
let note = notes.shift();
const contentStream = stringToStream(note.bodyXml);
chain.push(() => {
return enexXmlToMd(contentStream, note.resources).then((body) => {
delete note.bodyXml;
while (notes.length) {
let note = notes.shift();
const contentStream = stringToStream(note.bodyXml);
const body = await enexXmlToMd(contentStream, note.resources);
delete note.bodyXml;
// console.info('-----------------------------------------------------------');
// console.info(body);
// console.info('-----------------------------------------------------------');
// console.info('-----------------------------------------------------------');
// console.info(body);
// console.info('-----------------------------------------------------------');
note.id = uuid.create();
note.parent_id = parentFolderId;
note.body = body;
note.id = uuid.create();
note.parent_id = parentFolderId;
note.body = body;
// Notes in enex files always have a created timestamp but not always an
// updated timestamp (if the note has never been modified). For sync
// we require an updated_time property, so set it to create_time in that case
if (!note.updated_time) note.updated_time = note.created_time;
// Notes in enex files always have a created timestamp but not always an
// updated timestamp (if the note has never been modified). For sync
// we require an updated_time property, so set it to create_time in that case
if (!note.updated_time) note.updated_time = note.created_time;
return saveNoteToStorage(note, importOptions.fuzzyMatching);
}).then((result) => {
if (result.noteUpdated) {
progressState.updated++;
} else if (result.noteCreated) {
progressState.created++;
} else if (result.noteSkipped) {
progressState.skipped++;
}
progressState.resourcesCreated += result.resourcesCreated;
progressState.notesTagged += result.notesTagged;
importOptions.onProgress(progressState);
});
});
const result = await saveNoteToStorage(note, importOptions.fuzzyMatching);
if (result.noteUpdated) {
progressState.updated++;
} else if (result.noteCreated) {
progressState.created++;
} else if (result.noteSkipped) {
progressState.skipped++;
}
progressState.resourcesCreated += result.resourcesCreated;
progressState.notesTagged += result.notesTagged;
importOptions.onProgress(progressState);
}
} catch(error) {
console.error(error);
}
return promiseChain(chain).then(() => {
stream.resume();
processingNotes = false;
return true;
});
stream.resume();
processingNotes = false;
return true;
// let chain = [];
// while (notes.length) {
// let note = notes.shift();
// const contentStream = stringToStream(note.bodyXml);
// chain.push(() => {
// return enexXmlToMd(contentStream, note.resources).then((body) => {
// delete note.bodyXml;
// // console.info('-----------------------------------------------------------');
// // console.info(body);
// // console.info('-----------------------------------------------------------');
// note.id = uuid.create();
// note.parent_id = parentFolderId;
// note.body = body;
// // Notes in enex files always have a created timestamp but not always an
// // updated timestamp (it the note has never been modified). For sync
// // we require an updated_time property, so set it to create_time in that case
// if (!note.updated_time) note.updated_time = note.created_time;
// return saveNoteToStorage(note, importOptions.fuzzyMatching);
// }).then((result) => {
// if (result.noteUpdated) {
// progressState.updated++;
// } else if (result.noteCreated) {
// progressState.created++;
// } else if (result.noteSkipped) {
// progressState.skipped++;
// }
// progressState.resourcesCreated += result.resourcesCreated;
// progressState.notesTagged += result.notesTagged;
// importOptions.onProgress(progressState);
// });
// });
// }
// return promiseChain(chain).then(() => {
// stream.resume();
// processingNotes = false;
// return true;
// });
}
saxStream.on('error', (error) => {
@ -323,7 +364,11 @@ function importEnex(parentFolderId, filePath, importOptions = null) {
noteResourceRecognition.objID = extractRecognitionObjId(data);
} else if (note) {
if (n == 'content') {
note.bodyXml = data;
if ('bodyXml' in note) {
note.bodyXml += data;
} else {
note.bodyXml = data;
}
}
}
});