mirror of
https://github.com/laurent22/joplin.git
synced 2024-12-24 10:27:10 +02:00
All: Improved ENEX import for web pages that have been saved as notes
This commit is contained in:
parent
f42908b11c
commit
507e7e6014
@ -246,9 +246,13 @@ class Application extends BaseApplication {
|
||||
try {
|
||||
CommandClass = require(__dirname + '/command-' + name + '.js');
|
||||
} catch (error) {
|
||||
let e = new Error('No such command: ' + name);
|
||||
e.type = 'notFound';
|
||||
throw e;
|
||||
if (error.message && error.message.indexOf('Cannot find module') >= 0) {
|
||||
let e = new Error(_('No such command: %s', name));
|
||||
e.type = 'notFound';
|
||||
throw e;
|
||||
} else {
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
let cmd = new CommandClass();
|
||||
|
2
CliClient/package-lock.json
generated
2
CliClient/package-lock.json
generated
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "joplin",
|
||||
"version": "0.10.77",
|
||||
"version": "0.10.78",
|
||||
"lockfileVersion": 1,
|
||||
"requires": true,
|
||||
"dependencies": {
|
||||
|
@ -18,7 +18,7 @@
|
||||
],
|
||||
"owner": "Laurent Cozic"
|
||||
},
|
||||
"version": "0.10.77",
|
||||
"version": "0.10.78",
|
||||
"bin": {
|
||||
"joplin": "./main.js"
|
||||
},
|
||||
|
@ -194,11 +194,15 @@ function addResourceTag(lines, resource, alt = "") {
|
||||
|
||||
|
||||
/**
 * Reports whether a tag name is one of the block-level tags known to the
 * converter: div, p, dl, dd, dt or center.
 *
 * @param {string} n - Tag name to test.
 * @returns {boolean} True when `n` is exactly one of the listed tag names.
 */
function isBlockTag(n) {
	// A single list (including 'dt') with strict matching: a non-string value
	// such as ['div'] is never coerced into a match.
	return ['div', 'p', 'dl', 'dd', 'dt', 'center'].includes(n);
}
|
||||
|
||||
/**
 * Reports whether a tag name is rendered as strong text: strong, b or big.
 *
 * @param {string} n - Tag name to test.
 * @returns {boolean} True when `n` is exactly one of the listed tag names.
 */
function isStrongTag(n) {
	// 'big' is included in the one and only return, so it is actually reachable.
	return ['strong', 'b', 'big'].includes(n);
}
|
||||
|
||||
/**
 * Reports whether a tag name denotes struck-through text: strike, s or del.
 *
 * @param {string} n - Tag name to test.
 * @returns {boolean} True when `n` is exactly one of the listed tag names.
 */
function isStrikeTag(n) {
	// Strict matching: only an actual string equal to one of the names passes.
	return ['strike', 's', 'del'].includes(n);
}
|
||||
|
||||
function isEmTag(n) {
|
||||
@ -210,7 +214,7 @@ function isAnchor(n) {
|
||||
}
|
||||
|
||||
/**
 * Reports whether a closing tag needs no output at all. The `closetag`
 * handler skips every tag for which this returns true.
 *
 * @param {string} n - Closing tag name to test.
 * @returns {boolean} True when `n` is exactly one of the ignored tag names.
 */
function isIgnoredEndTag(n) {
	const ignoredEndTags = [
		'en-note', 'en-todo', 'span', 'body', 'html', 'font', 'br', 'hr', 's', 'tbody', 'sup',
		'img', 'abbr', 'cite', 'thead', 'small', 'tt', 'sub',
	];
	return ignoredEndTags.includes(n);
}
|
||||
|
||||
function isListTag(n) {
|
||||
@ -219,7 +223,7 @@ function isListTag(n) {
|
||||
|
||||
// Elements that don't require any special treatment beside adding a newline character
|
||||
/**
 * Reports whether a closing tag only needs a line break appended: div, p, li,
 * h1 to h6, dl, dd, dt or center.
 *
 * @param {string} n - Closing tag name to test.
 * @returns {boolean} True when `n` is exactly one of the listed tag names.
 */
function isNewLineOnlyEndTag(n) {
	const newLineOnlyEndTags = [
		'div', 'p', 'li',
		'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
		'dl', 'dd', 'dt', 'center',
	];
	return newLineOnlyEndTags.includes(n);
}
|
||||
|
||||
function isCodeTag(n) {
|
||||
@ -253,8 +257,27 @@ function xmlNodeText(xmlNode) {
|
||||
return xmlNode[0];
|
||||
}
|
||||
|
||||
/**
 * Builds a copy of a node's attribute map in which every attribute name is
 * lower-cased. Attribute values are copied unchanged.
 *
 * @param {{attributes?: Object}} node - Node whose `attributes` are read.
 * @returns {Object} A new plain object keyed by lower-cased attribute names;
 *   an empty object when the node has no `attributes`.
 */
function attributeToLowerCase(node) {
	if (!node.attributes) return {};

	const output = {};
	// Object.entries only yields own enumerable properties and does not call
	// any method on the attribute map itself, so a map without a prototype or
	// one containing an attribute named "hasOwnProperty" is handled safely.
	for (const [name, value] of Object.entries(node.attributes)) {
		output[name.toLowerCase()] = value;
	}
	return output;
}
|
||||
|
||||
function enexXmlToMdArray(stream, resources) {
|
||||
resources = resources.slice();
|
||||
let remainingResources = resources.slice();
|
||||
|
||||
// Removes, in place, every entry of `remainingResources` whose `id` is
// strictly equal to the given id. Does nothing when no entry matches.
const removeRemainingResource = (id) => {
	// Iterate from the end: splicing then only shifts entries that have
	// already been examined, so consecutive matches are all removed.
	for (let i = remainingResources.length - 1; i >= 0; i--) {
		if (remainingResources[i].id === id) remainingResources.splice(i, 1);
	}
};
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
let state = {
|
||||
@ -265,7 +288,7 @@ function enexXmlToMdArray(stream, resources) {
|
||||
};
|
||||
|
||||
let options = {};
|
||||
let strict = true;
|
||||
let strict = false;
|
||||
var saxStream = require('sax').createStream(strict, options)
|
||||
|
||||
let section = {
|
||||
@ -275,14 +298,18 @@ function enexXmlToMdArray(stream, resources) {
|
||||
};
|
||||
|
||||
saxStream.on('error', function(e) {
|
||||
reject(e);
|
||||
console.warn(e);
|
||||
//reject(e);
|
||||
})
|
||||
|
||||
saxStream.on('text', function(text) {
|
||||
if (['table', 'tr', 'tbody'].indexOf(section.type) >= 0) return;
|
||||
section.lines = collapseWhiteSpaceAndAppend(section.lines, state, text);
|
||||
})
|
||||
|
||||
saxStream.on('opentag', function(node) {
|
||||
const nodeAttributes = attributeToLowerCase(node);
|
||||
|
||||
let n = node.name.toLowerCase();
|
||||
if (n == 'en-note') {
|
||||
// Start of note
|
||||
@ -293,25 +320,51 @@ function enexXmlToMdArray(stream, resources) {
|
||||
type: 'table',
|
||||
lines: [],
|
||||
parent: section,
|
||||
toString: function() {
|
||||
let output = [];
|
||||
output.push(BLOCK_OPEN);
|
||||
for (let i = 0; i < this.lines.length; i++) {
|
||||
output = output.concat(this.lines[i].toMdLines());
|
||||
}
|
||||
output.push(BLOCK_CLOSE);
|
||||
return processMdArrayNewLines(output);
|
||||
},
|
||||
};
|
||||
section.lines.push(newSection);
|
||||
section = newSection;
|
||||
} else if (n == 'tbody') {
|
||||
} else if (n == 'tbody' || n == 'thead') {
|
||||
// Ignore it
|
||||
} else if (n == 'tr') {
|
||||
if (section.type != 'table') throw new Error('Found a <tr> tag outside of a table');
|
||||
if (section.type != 'table') {
|
||||
console.warn('Found a <tr> tag outside of a table');
|
||||
return;
|
||||
}
|
||||
|
||||
let newSection = {
|
||||
type: 'tr',
|
||||
lines: [],
|
||||
parent: section,
|
||||
isHeader: false,
|
||||
// Normally tables are rendered properly as markdown, but for table within table within table... we cannot
|
||||
// handle this in Markdown so simply render it as one cell per line.
|
||||
toMdLines: function() {
|
||||
let output = [];
|
||||
output.push(BLOCK_OPEN);
|
||||
for (let i = 0; i < this.lines.length; i++) {
|
||||
output.push(this.lines[i].toString());
|
||||
}
|
||||
output.push(BLOCK_CLOSE);
|
||||
return output;
|
||||
},
|
||||
}
|
||||
|
||||
section.lines.push(newSection);
|
||||
section = newSection;
|
||||
} else if (n == 'td' || n == 'th') {
|
||||
if (section.type != 'tr') throw new Error('Found a <td> tag outside of a <tr>');
|
||||
if (section.type != 'tr') {
|
||||
console.warn('Found a <td> tag outside of a <tr>');
|
||||
return;
|
||||
}
|
||||
|
||||
if (n == 'th') section.isHeader = true;
|
||||
|
||||
@ -319,6 +372,9 @@ function enexXmlToMdArray(stream, resources) {
|
||||
type: 'td',
|
||||
lines: [],
|
||||
parent: section,
|
||||
toString: function() {
|
||||
return processMdArrayNewLines(this.lines);
|
||||
},
|
||||
};
|
||||
|
||||
section.lines.push(newSection);
|
||||
@ -342,17 +398,27 @@ function enexXmlToMdArray(stream, resources) {
|
||||
}
|
||||
} else if (isStrongTag(n)) {
|
||||
section.lines.push("**");
|
||||
} else if (n == 's') {
|
||||
// Not supported
|
||||
} else if (isStrikeTag(n)) {
|
||||
section.lines.push('(');
|
||||
} else if (n == 'samp') {
|
||||
section.lines.push('`');
|
||||
} else if (n == 'q') {
|
||||
section.lines.push('"');
|
||||
} else if (n == 'img') {
|
||||
// TODO: TEST IMAGE
|
||||
if (nodeAttributes.src) { // Many (most?) img tags don't have no source associated, especially when they were imported from HTML
|
||||
let s = '![';
|
||||
if (nodeAttributes.alt) s += nodeAttributes.alt;
|
||||
s += '](' + nodeAttributes.src + ')';
|
||||
section.lines.push(s);
|
||||
}
|
||||
} else if (isAnchor(n)) {
|
||||
state.anchorAttributes.push(node.attributes);
|
||||
state.anchorAttributes.push(nodeAttributes);
|
||||
section.lines.push('[');
|
||||
} else if (isEmTag(n)) {
|
||||
section.lines.push("*");
|
||||
} else if (n == "en-todo") {
|
||||
let x = node.attributes && node.attributes.checked && node.attributes.checked.toLowerCase() == 'true' ? 'X' : ' ';
|
||||
let x = nodeAttributes && nodeAttributes.checked && nodeAttributes.checked.toLowerCase() == 'true' ? 'X' : ' ';
|
||||
section.lines.push('- [' + x + '] ');
|
||||
} else if (n == "hr") {
|
||||
// Needs to be surrounded by new lines so that it's properly rendered as a line when converting to HTML
|
||||
@ -375,20 +441,20 @@ function enexXmlToMdArray(stream, resources) {
|
||||
} else if (n == 'blockquote') {
|
||||
section.lines.push(BLOCK_OPEN);
|
||||
state.inQuote = true;
|
||||
} else if (isCodeTag(n, node.attributes)) {
|
||||
} else if (isCodeTag(n, nodeAttributes)) {
|
||||
section.lines.push(BLOCK_OPEN);
|
||||
state.inCode = true;
|
||||
} else if (n == "br") {
|
||||
section.lines.push(NEWLINE);
|
||||
} else if (n == "en-media") {
|
||||
const hash = node.attributes.hash;
|
||||
const hash = nodeAttributes.hash;
|
||||
|
||||
let resource = null;
|
||||
for (let i = 0; i < resources.length; i++) {
|
||||
let r = resources[i];
|
||||
if (r.id == hash) {
|
||||
resource = r;
|
||||
resources.splice(i, 1);
|
||||
removeRemainingResource(r.id);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -430,11 +496,11 @@ function enexXmlToMdArray(stream, resources) {
|
||||
// </en-export>
|
||||
|
||||
let found = false;
|
||||
for (let i = 0; i < resources.length; i++) {
|
||||
let r = resources[i];
|
||||
for (let i = 0; i < remainingResources.length; i++) {
|
||||
let r = remainingResources[i];
|
||||
if (!r.id) {
|
||||
r.id = hash;
|
||||
resources[i] = r;
|
||||
remainingResources[i] = r;
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
@ -448,27 +514,29 @@ function enexXmlToMdArray(stream, resources) {
|
||||
// means it's an attachment. It will be appended along with the
|
||||
// other remaining resources at the bottom of the markdown text.
|
||||
if (!!resource.id) {
|
||||
section.lines = addResourceTag(section.lines, resource, node.attributes.alt);
|
||||
section.lines = addResourceTag(section.lines, resource, nodeAttributes.alt);
|
||||
}
|
||||
}
|
||||
} else if (n == "span" || n == "font" || n == 'sup') {
|
||||
// Ignore
|
||||
} else if (n == "span" || n == "font" || n == 'sup' || n == 'cite' || n == 'abbr' || n == 'small' || n == 'tt' || n == 'sub') {
|
||||
// Inline tags that can be ignored in Markdown
|
||||
} else {
|
||||
console.warn("Unsupported start tag: " + n);
|
||||
}
|
||||
})
|
||||
|
||||
saxStream.on('closetag', function(n) {
|
||||
n = n ? n.toLowerCase() : n;
|
||||
|
||||
if (n == 'en-note') {
|
||||
// End of note
|
||||
} else if (isNewLineOnlyEndTag(n)) {
|
||||
section.lines.push(BLOCK_CLOSE);
|
||||
} else if (n == 'td' || n == 'th') {
|
||||
section = section.parent;
|
||||
if (section && section.parent) section = section.parent;
|
||||
} else if (n == 'tr') {
|
||||
section = section.parent;
|
||||
if (section && section.parent) section = section.parent;
|
||||
} else if (n == 'table') {
|
||||
section = section.parent;
|
||||
if (section && section.parent) section = section.parent;
|
||||
} else if (isIgnoredEndTag(n)) {
|
||||
// Skip
|
||||
} else if (isListTag(n)) {
|
||||
@ -476,6 +544,10 @@ function enexXmlToMdArray(stream, resources) {
|
||||
state.lists.pop();
|
||||
} else if (isStrongTag(n)) {
|
||||
section.lines.push("**");
|
||||
} else if (isStrikeTag(n)) {
|
||||
section.lines.push(')');
|
||||
} else if (n == 'samp') {
|
||||
section.lines.push('`');
|
||||
} else if (isEmTag(n)) {
|
||||
section.lines.push("*");
|
||||
} else if (n == 'q') {
|
||||
@ -527,7 +599,7 @@ function enexXmlToMdArray(stream, resources) {
|
||||
saxStream.on('end', function() {
|
||||
resolve({
|
||||
content: section,
|
||||
resources: resources,
|
||||
resources: remainingResources,
|
||||
});
|
||||
})
|
||||
|
||||
@ -570,7 +642,7 @@ function colWidths(table) {
|
||||
const tr = table.lines[trIndex];
|
||||
for (let tdIndex = 0; tdIndex < tr.lines.length; tdIndex++) {
|
||||
const td = tr.lines[tdIndex];
|
||||
const w = cellWidth(td.content);
|
||||
const w = Math.min(cellWidth(td.content), 20); // Have to set a max width otherwise it can be extremely long for notes that import entire web pages (eg. Hacker News comment pages)
|
||||
if (output.length <= tdIndex) output.push(0);
|
||||
if (w > output[tdIndex]) output[tdIndex] = w;
|
||||
}
|
||||
|
@ -212,51 +212,92 @@ function importEnex(parentFolderId, filePath, importOptions = null) {
|
||||
async function processNotes() {
|
||||
if (processingNotes) return false;
|
||||
|
||||
processingNotes = true;
|
||||
stream.pause();
|
||||
try {
|
||||
processingNotes = true;
|
||||
stream.pause();
|
||||
|
||||
let chain = [];
|
||||
while (notes.length) {
|
||||
let note = notes.shift();
|
||||
const contentStream = stringToStream(note.bodyXml);
|
||||
chain.push(() => {
|
||||
return enexXmlToMd(contentStream, note.resources).then((body) => {
|
||||
delete note.bodyXml;
|
||||
while (notes.length) {
|
||||
let note = notes.shift();
|
||||
const contentStream = stringToStream(note.bodyXml);
|
||||
const body = await enexXmlToMd(contentStream, note.resources);
|
||||
delete note.bodyXml;
|
||||
|
||||
// console.info('-----------------------------------------------------------');
|
||||
// console.info(body);
|
||||
// console.info('-----------------------------------------------------------');
|
||||
// console.info('-----------------------------------------------------------');
|
||||
// console.info(body);
|
||||
// console.info('-----------------------------------------------------------');
|
||||
|
||||
note.id = uuid.create();
|
||||
note.parent_id = parentFolderId;
|
||||
note.body = body;
|
||||
note.id = uuid.create();
|
||||
note.parent_id = parentFolderId;
|
||||
note.body = body;
|
||||
|
||||
// Notes in enex files always have a created timestamp but not always an
|
||||
// updated timestamp (if the note has never been modified). For sync
|
||||
// we require an updated_time property, so set it to created_time in that case
|
||||
if (!note.updated_time) note.updated_time = note.created_time;
|
||||
// Notes in enex files always have a created timestamp but not always an
|
||||
// updated timestamp (if the note has never been modified). For sync
|
||||
// we require an updated_time property, so set it to created_time in that case
|
||||
if (!note.updated_time) note.updated_time = note.created_time;
|
||||
|
||||
return saveNoteToStorage(note, importOptions.fuzzyMatching);
|
||||
}).then((result) => {
|
||||
if (result.noteUpdated) {
|
||||
progressState.updated++;
|
||||
} else if (result.noteCreated) {
|
||||
progressState.created++;
|
||||
} else if (result.noteSkipped) {
|
||||
progressState.skipped++;
|
||||
}
|
||||
progressState.resourcesCreated += result.resourcesCreated;
|
||||
progressState.notesTagged += result.notesTagged;
|
||||
importOptions.onProgress(progressState);
|
||||
});
|
||||
});
|
||||
const result = await saveNoteToStorage(note, importOptions.fuzzyMatching);
|
||||
|
||||
if (result.noteUpdated) {
|
||||
progressState.updated++;
|
||||
} else if (result.noteCreated) {
|
||||
progressState.created++;
|
||||
} else if (result.noteSkipped) {
|
||||
progressState.skipped++;
|
||||
}
|
||||
progressState.resourcesCreated += result.resourcesCreated;
|
||||
progressState.notesTagged += result.notesTagged;
|
||||
importOptions.onProgress(progressState);
|
||||
}
|
||||
} catch(error) {
|
||||
console.error(error);
|
||||
}
|
||||
|
||||
return promiseChain(chain).then(() => {
|
||||
stream.resume();
|
||||
processingNotes = false;
|
||||
return true;
|
||||
});
|
||||
stream.resume();
|
||||
processingNotes = false;
|
||||
return true;
|
||||
|
||||
// let chain = [];
|
||||
// while (notes.length) {
|
||||
// let note = notes.shift();
|
||||
// const contentStream = stringToStream(note.bodyXml);
|
||||
// chain.push(() => {
|
||||
// return enexXmlToMd(contentStream, note.resources).then((body) => {
|
||||
// delete note.bodyXml;
|
||||
|
||||
// // console.info('-----------------------------------------------------------');
|
||||
// // console.info(body);
|
||||
// // console.info('-----------------------------------------------------------');
|
||||
|
||||
// note.id = uuid.create();
|
||||
// note.parent_id = parentFolderId;
|
||||
// note.body = body;
|
||||
|
||||
// // Notes in enex files always have a created timestamp but not always an
|
||||
// // updated timestamp (if the note has never been modified). For sync
|
||||
// // we require an updated_time property, so set it to created_time in that case
|
||||
// if (!note.updated_time) note.updated_time = note.created_time;
|
||||
|
||||
// return saveNoteToStorage(note, importOptions.fuzzyMatching);
|
||||
// }).then((result) => {
|
||||
// if (result.noteUpdated) {
|
||||
// progressState.updated++;
|
||||
// } else if (result.noteCreated) {
|
||||
// progressState.created++;
|
||||
// } else if (result.noteSkipped) {
|
||||
// progressState.skipped++;
|
||||
// }
|
||||
// progressState.resourcesCreated += result.resourcesCreated;
|
||||
// progressState.notesTagged += result.notesTagged;
|
||||
// importOptions.onProgress(progressState);
|
||||
// });
|
||||
// });
|
||||
// }
|
||||
|
||||
// return promiseChain(chain).then(() => {
|
||||
// stream.resume();
|
||||
// processingNotes = false;
|
||||
// return true;
|
||||
// });
|
||||
}
|
||||
|
||||
saxStream.on('error', (error) => {
|
||||
@ -323,7 +364,11 @@ function importEnex(parentFolderId, filePath, importOptions = null) {
|
||||
noteResourceRecognition.objID = extractRecognitionObjId(data);
|
||||
} else if (note) {
|
||||
if (n == 'content') {
|
||||
note.bodyXml = data;
|
||||
if ('bodyXml' in note) {
|
||||
note.bodyXml += data;
|
||||
} else {
|
||||
note.bodyXml = data;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
Loading…
Reference in New Issue
Block a user