
Desktop, Cli: Fix importing of very large attachments (150MB+) from Evernote ENEX files

Laurent Cozic 2020-02-10 21:50:45 +00:00
parent 691521c5b9
commit 9ec1e84ed0
7 changed files with 3172 additions and 116 deletions

File diff suppressed because it is too large.


@@ -3,6 +3,9 @@
 	"description": "Joplin CLI Client",
 	"license": "MIT",
 	"author": "Laurent Cozic",
+	"scripts": {
+		"postinstall": "patch-package"
+	},
 	"bugs": {
 		"url": "https://github.com/laurent22/joplin/issues"
 	},
@@ -32,6 +35,7 @@
 		"app-module-path": "^2.2.0",
 		"async-mutex": "^0.1.3",
 		"base-64": "^0.1.0",
+		"base64-stream": "^1.0.0",
 		"clean-html": "^1.5.0",
 		"compare-version": "^0.1.2",
 		"diacritics": "^1.3.0",
@@ -39,31 +43,49 @@
 		"es6-promise-pool": "^2.5.0",
 		"file-uri-to-path": "^1.0.0",
 		"follow-redirects": "^1.2.4",
+		"font-awesome-filetypes": "^2.1.0",
 		"form-data": "^2.1.4",
 		"fs-extra": "^5.0.0",
+		"highlight.js": "^9.17.1",
 		"html-entities": "^1.2.1",
 		"html-minifier": "^3.5.15",
 		"image-data-uri": "^2.0.0",
 		"image-type": "^3.0.0",
 		"joplin-turndown": "^4.0.19",
 		"joplin-turndown-plugin-gfm": "^1.0.12",
+		"json-stringify-safe": "^5.0.1",
 		"jssha": "^2.3.0",
+		"katex": "^0.11.1",
 		"levenshtein": "^1.0.5",
 		"markdown-it": "^10.0.0",
+		"markdown-it-abbr": "^1.0.4",
+		"markdown-it-anchor": "^5.2.5",
+		"markdown-it-deflist": "^2.0.3",
+		"markdown-it-emoji": "^1.4.0",
+		"markdown-it-expand-tabs": "^1.0.13",
+		"markdown-it-footnote": "^3.0.2",
+		"markdown-it-ins": "^3.0.0",
+		"markdown-it-mark": "^3.0.0",
+		"markdown-it-multimd-table": "^4.0.1",
+		"markdown-it-sub": "^1.0.0",
+		"markdown-it-sup": "^1.0.0",
+		"markdown-it-toc-done-right": "^4.1.0",
 		"md5": "^2.2.1",
+		"md5-file": "^4.0.0",
 		"mime": "^2.0.3",
 		"moment": "^2.24.0",
 		"multiparty": "^4.2.1",
 		"node-emoji": "^1.8.1",
 		"node-fetch": "^1.7.1",
 		"node-persist": "^2.1.0",
+		"patch-package": "^6.2.0",
 		"promise": "^7.1.1",
 		"proper-lockfile": "^2.0.1",
 		"query-string": "4.3.4",
 		"read-chunk": "^2.1.0",
 		"redux": "^3.7.2",
 		"request": "^2.88.0",
-		"sax": "^1.2.2",
+		"sax": "^1.2.4",
 		"server-destroy": "^1.0.1",
 		"sharp": "^0.23.2",
 		"sprintf-js": "^1.1.1",
@@ -77,33 +99,14 @@
 		"terminal-kit": "^1.30.0",
 		"tkwidgets": "^0.5.26",
 		"url-parse": "^1.4.7",
+		"uslug": "^1.0.4",
 		"uuid": "^3.0.1",
 		"valid-url": "^1.0.9",
 		"word-wrap": "^1.2.3",
 		"xml2js": "^0.4.19",
-		"yargs-parser": "^7.0.0",
-		"font-awesome-filetypes": "^2.1.0",
-		"highlight.js": "^9.17.1",
-		"json-stringify-safe": "^5.0.1",
-		"katex": "^0.11.1",
-		"markdown-it-abbr": "^1.0.4",
-		"markdown-it-anchor": "^5.2.5",
-		"markdown-it-deflist": "^2.0.3",
-		"markdown-it-emoji": "^1.4.0",
-		"markdown-it-expand-tabs": "^1.0.13",
-		"markdown-it-footnote": "^3.0.2",
-		"markdown-it-ins": "^3.0.0",
-		"markdown-it-mark": "^3.0.0",
-		"markdown-it-multimd-table": "^4.0.1",
-		"markdown-it-sub": "^1.0.0",
-		"markdown-it-sup": "^1.0.0",
-		"markdown-it-toc-done-right": "^4.1.0",
-		"uslug": "^1.0.4"
+		"yargs-parser": "^7.0.0"
 	},
 	"devDependencies": {
 		"jasmine": "^3.5.0"
-	},
-	"scripts": {
-		"test": "jasmine"
-	}
+	}
 }


@@ -0,0 +1,18 @@
diff --git a/node_modules/sax/lib/sax.js b/node_modules/sax/lib/sax.js
index 795d607..ccad5d8 100644
--- a/node_modules/sax/lib/sax.js
+++ b/node_modules/sax/lib/sax.js
@@ -1040,6 +1040,13 @@
parser.textNode += c
}
}
+
+ // Sax is kind of buggy when handling large text nodes. It has a function to check that
+ // the buffer doesn't run out of space, but it doesn't seem to call it for text nodes.
+ // The result is that parser.textNode can reach 1GB, at which point the app crashes. So here
+ // we call checkBufferLength to make sure the buffer is cleared and the "text" event
+ // is emitted, so that the caller can handle memory properly.
+ checkBufferLength(parser);
continue
case S.SCRIPT:
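
For context, the effect of this one-line patch is that a single huge base64 text node now reaches the caller as a stream of bounded "text" events (each capped by sax's MAX_BUFFER_LENGTH) instead of one multi-gigabyte string. A minimal sketch of the consumer side, assuming the standard sax-js streaming API; the file paths are illustrative only:

const sax = require('sax');
const fs = require('fs');

const saxStream = sax.createStream(true, {});
saxStream.on('text', (text) => {
	// With the patch, each chunk is bounded in size, so it can be
	// appended to disk instead of accumulating in memory.
	fs.appendFileSync('/tmp/resource-data.base64', text);
});
fs.createReadStream('/path/to/export.enex').pipe(saxStream);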

File diff suppressed because it is too large.


@@ -8,7 +8,7 @@
 		"pack": "node_modules/.bin/electron-builder --dir",
 		"dist": "node_modules/.bin/electron-builder",
 		"publish": "build -p always",
-		"postinstall": "node compile.js && node compile-package-info.js && node copyPluginAssets.js && node electronRebuild.js",
+		"postinstall": "patch-package && node compile.js && node compile-package-info.js && node copyPluginAssets.js && node electronRebuild.js",
 		"compile": "node compile.js && node compile-package-info.js",
 		"install-141": "npm install --toolset=v141"
 	},
@@ -76,7 +76,8 @@
 		"babel-preset-react": "^6.24.1",
 		"electron": "^7.1.9",
 		"electron-builder": "22.3.2",
-		"electron-rebuild": "^1.8.8"
+		"electron-rebuild": "^1.8.8",
+		"patch-package": "^6.2.0"
 	},
 	"optionalDependencies": {
 		"7zip-bin-linux": "^1.0.1",
@@ -87,6 +88,7 @@
 		"app-module-path": "^2.2.0",
 		"async-mutex": "^0.1.3",
 		"base-64": "^0.1.0",
+		"base64-stream": "^1.0.0",
 		"chokidar": "^3.0.0",
 		"clean-html": "^1.5.0",
 		"compare-versions": "^3.2.1",
@@ -98,20 +100,37 @@
 		"es6-promise-pool": "^2.5.0",
 		"file-uri-to-path": "^1.0.0",
 		"follow-redirects": "^1.5.0",
+		"font-awesome-filetypes": "^2.1.0",
 		"form-data": "^2.3.2",
 		"formatcoords": "^1.1.3",
 		"fs-extra": "^5.0.0",
+		"highlight.js": "^9.17.1",
 		"html-entities": "^1.2.1",
 		"html-minifier": "^4.0.0",
 		"image-type": "^3.0.0",
 		"joplin-turndown": "^4.0.19",
 		"joplin-turndown-plugin-gfm": "^1.0.12",
+		"json-stringify-safe": "^5.0.1",
 		"jssha": "^2.3.1",
+		"katex": "^0.11.1",
 		"levenshtein": "^1.0.5",
 		"lodash": "^4.17.15",
 		"mark.js": "^8.11.1",
 		"markdown-it": "^10.0.0",
+		"markdown-it-abbr": "^1.0.4",
+		"markdown-it-anchor": "^5.2.5",
+		"markdown-it-deflist": "^2.0.3",
+		"markdown-it-emoji": "^1.4.0",
+		"markdown-it-expand-tabs": "^1.0.13",
+		"markdown-it-footnote": "^3.0.2",
+		"markdown-it-ins": "^3.0.0",
+		"markdown-it-mark": "^3.0.0",
+		"markdown-it-multimd-table": "^4.0.1",
+		"markdown-it-sub": "^1.0.0",
+		"markdown-it-sup": "^1.0.0",
+		"markdown-it-toc-done-right": "^4.1.0",
 		"md5": "^2.2.1",
+		"md5-file": "^4.0.0",
 		"moment": "^2.22.2",
 		"multiparty": "^4.2.1",
 		"mustache": "^3.0.1",
@@ -130,6 +149,7 @@
 		"readability-node": "^0.1.0",
 		"redux": "^3.7.2",
 		"reselect": "^4.0.0",
+		"sax": "^1.2.4",
 		"server-destroy": "^1.0.1",
 		"smalltalk": "^2.5.1",
 		"sprintf-js": "^1.1.1",
@@ -141,25 +161,9 @@
 		"tcp-port-used": "^0.1.2",
 		"uglifycss": "0.0.29",
 		"url-parse": "^1.4.3",
+		"uslug": "^1.0.4",
 		"uuid": "^3.2.1",
 		"valid-url": "^1.0.9",
-		"xml2js": "^0.4.19",
-		"font-awesome-filetypes": "^2.1.0",
-		"highlight.js": "^9.17.1",
-		"json-stringify-safe": "^5.0.1",
-		"katex": "^0.11.1",
-		"markdown-it-abbr": "^1.0.4",
-		"markdown-it-anchor": "^5.2.5",
-		"markdown-it-deflist": "^2.0.3",
-		"markdown-it-emoji": "^1.4.0",
-		"markdown-it-expand-tabs": "^1.0.13",
-		"markdown-it-footnote": "^3.0.2",
-		"markdown-it-ins": "^3.0.0",
-		"markdown-it-mark": "^3.0.0",
-		"markdown-it-multimd-table": "^4.0.1",
-		"markdown-it-sub": "^1.0.0",
-		"markdown-it-sup": "^1.0.0",
-		"markdown-it-toc-done-right": "^4.1.0",
-		"uslug": "^1.0.4"
+		"xml2js": "^0.4.19"
 	}
 }


@@ -0,0 +1,18 @@
diff --git a/node_modules/sax/lib/sax.js b/node_modules/sax/lib/sax.js
index 795d607..ccad5d8 100644
--- a/node_modules/sax/lib/sax.js
+++ b/node_modules/sax/lib/sax.js
@@ -1040,6 +1040,13 @@
parser.textNode += c
}
}
+
+ // Sax is kind of buggy when handling large text nodes. It has a function to check that
+ // the buffer doesn't run out of space, but it doesn't seem to call it for text nodes.
+ // The result is that parser.textNode can reach 1GB, at which point the app crashes. So here
+ // we call checkBufferLength to make sure the buffer is cleared and the "text" event
+ // is emitted, so that the caller can handle memory properly.
+ checkBufferLength(parser);
continue
case S.SCRIPT:


@@ -4,12 +4,15 @@ const BaseModel = require('lib/BaseModel.js');
 const Note = require('lib/models/Note.js');
 const Tag = require('lib/models/Tag.js');
 const Resource = require('lib/models/Resource.js');
+const Setting = require('lib/models/Setting.js');
 const { MarkupToHtml } = require('lib/joplin-renderer');
 const { enexXmlToMd } = require('./import-enex-md-gen.js');
 const { enexXmlToHtml } = require('./import-enex-html-gen.js');
 const { time } = require('lib/time-utils.js');
 const Levenshtein = require('levenshtein');
 const md5 = require('md5');
+const { Base64Decode } = require('base64-stream');
+const md5File = require('md5-file');
 // const Promise = require('promise');
 const fs = require('fs-extra');
@@ -35,8 +38,28 @@ function extractRecognitionObjId(recognitionXml) {
 	return r && r.length >= 2 ? r[1] : null;
 }
 
-async function filePutContents(filePath, content) {
-	await fs.writeFile(filePath, content);
-}
+async function decodeBase64File(sourceFile, destFile) {
+	return new Promise(function(resolve, reject) {
+		const sourceStream = fs.createReadStream(sourceFile);
+		const destStream = fs.createWriteStream(destFile);
+		sourceStream.pipe(new Base64Decode()).pipe(destStream);
+		sourceStream.on('end', () => resolve());
+		sourceStream.on('error', (error) => reject(error));
+	});
+}
+
+async function md5FileAsync(filePath) {
+	return new Promise((resolve, reject) => {
+		md5File(filePath, (error, hash) => {
+			if (error) {
+				reject(error);
+				return;
+			}
+			resolve(hash);
+		});
+	});
+}
 
 function removeUndefinedProperties(note) {
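
A note on decodeBase64File as committed: it resolves on the read stream's 'end' event, which can fire before the write stream has flushed the last decoded bytes to disk. A more defensive variant (a sketch, not what this commit ships) would resolve on the destination's 'finish' event instead, reusing the same fs and Base64Decode imports shown above:

function decodeBase64FileSafe(sourceFile, destFile) {
	return new Promise((resolve, reject) => {
		const sourceStream = fs.createReadStream(sourceFile);
		const destStream = fs.createWriteStream(destFile);
		sourceStream.on('error', reject);
		destStream.on('error', reject);
		// 'finish' fires only once all decoded bytes have been written out.
		destStream.on('finish', resolve);
		sourceStream.pipe(new Base64Decode()).pipe(destStream);
	});
}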
@@ -82,14 +105,51 @@ async function fuzzyMatch(note) {
 	return null;
 }
 
-async function saveNoteResources(note) {
+// At this point we have the resource as it's been parsed from the XML, but additional
+// processing needs to be done to get the final resource file, its size, MD5, etc.
+async function processNoteResource(resource) {
+	if (resource.dataEncoding == 'base64') {
+		const decodedFilePath = `${resource.dataFilePath}.decoded`;
+		await decodeBase64File(resource.dataFilePath, decodedFilePath);
+		resource.dataFilePath = decodedFilePath;
+	} else if (resource.dataEncoding) {
+		throw new Error(`Cannot decode resource with encoding: ${resource.dataEncoding}`);
+	}
+
+	const stats = fs.statSync(resource.dataFilePath);
+	resource.size = stats.size;
+
+	if (!resource.id) {
+		// If no resource ID is present, the resource ID is actually the MD5 of the data.
+		// This ID will match the "hash" attribute of the corresponding <en-media> tag.
+		// resourceId = md5(decodedData);
+		resource.id = await md5FileAsync(resource.dataFilePath);
+	}
+
+	if (!resource.id || !resource.size) {
+		const debugTemp = Object.assign({}, resource);
+		debugTemp.data = debugTemp.data ? `${debugTemp.data.substr(0, 32)}...` : debugTemp.data;
+		throw new Error(`This resource was not added because it has no ID or no content: ${JSON.stringify(debugTemp)}`);
+	}
+
+	return resource;
+}
+
+async function saveNoteResources(note, importOptions) {
 	let resourcesCreated = 0;
 	for (let i = 0; i < note.resources.length; i++) {
 		let resource = note.resources[i];
-		if (!resource.id) continue;
+
+		try {
+			resource = await processNoteResource(resource);
+		} catch (error) {
+			importOptions.onError(error);
+			continue;
+		}
 
 		let toSave = Object.assign({}, resource);
 		delete toSave.data;
+		delete toSave.dataFilePath;
+		delete toSave.dataEncoding;
 
 		// The same resource sometimes appear twice in the same enex (exact same ID and file).
 		// In that case, just skip it - it means two different notes might be linked to the
@@ -97,7 +157,7 @@ async function saveNoteResources(note) {
 		let existingResource = await Resource.load(toSave.id);
 		if (existingResource) continue;
 
-		await filePutContents(Resource.fullPath(toSave), resource.data);
+		await fs.move(resource.dataFilePath, Resource.fullPath(toSave), { overwrite: true });
 		await Resource.save(toSave, { isNew: true });
 		resourcesCreated++;
 	}
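
The MD5 comment in processNoteResource is worth unpacking: in an ENEX file, a note's body references its attachments by content hash, so when a <resource> carries no explicit ID, hashing the decoded bytes reproduces the key that <en-media> uses. An illustrative check using the same md5-file callback API required above; the file path and hash value are hypothetical:

// A note body references a resource like:
//   <en-media hash="b1946ac92492d2347c6235b4d2611184" type="image/png"/>
// and that hash is simply the MD5 of the decoded resource bytes:
md5File('/tmp/resource.base64.decoded', (error, hash) => {
	if (error) throw error;
	// For the matching resource, hash equals the en-media "hash" attribute,
	// which is why it can stand in as the resource ID.
	console.log(hash);
});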
@@ -119,10 +179,14 @@ async function saveNoteTags(note) {
 	return notesTagged;
 }
 
-async function saveNoteToStorage(note, fuzzyMatching = false) {
+async function saveNoteToStorage(note, importOptions) {
+	importOptions = Object.assign({}, {
+		fuzzyMatching: false,
+	}, importOptions);
+
 	note = Note.filter(note);
 
-	let existingNote = fuzzyMatching ? await fuzzyMatch(note) : null;
+	let existingNote = importOptions.fuzzyMatching ? await fuzzyMatch(note) : null;
 
 	let result = {
 		noteCreated: false,
@@ -132,7 +196,7 @@ async function saveNoteToStorage(note, fuzzyMatching = false) {
 		notesTagged: 0,
 	};
 
-	let resourcesCreated = await saveNoteResources(note);
+	let resourcesCreated = await saveNoteResources(note, importOptions);
 	result.resourcesCreated += resourcesCreated;
 
 	let notesTagged = await saveNoteTags(note);
@@ -241,7 +305,7 @@ function importEnex(parentFolderId, filePath, importOptions = null) {
 			// we require an updated_time property, so set it to create_time in that case
 			if (!note.updated_time) note.updated_time = note.created_time;
 
-			const result = await saveNoteToStorage(note, importOptions.fuzzyMatching);
+			const result = await saveNoteToStorage(note, importOptions);
 
 			if (result.noteUpdated) {
 				progressState.updated++;
@@ -276,11 +340,20 @@ function importEnex(parentFolderId, filePath, importOptions = null) {
 				noteResourceAttributes[n] = text;
 			} else if (noteResource) {
 				if (n == 'data') {
-					let attr = currentNodeAttributes();
-					noteResource.dataEncoding = attr.encoding;
+					if (!noteResource.dataEncoding) {
+						let attr = currentNodeAttributes();
+						noteResource.dataEncoding = attr.encoding;
+					}
+
+					if (!noteResource.dataFilePath) {
+						noteResource.dataFilePath = `${Setting.value('tempDir')}/${md5(Date.now() + Math.random())}.base64`;
+					}
+
+					fs.appendFileSync(noteResource.dataFilePath, text);
+				} else {
+					if (!(n in noteResource)) noteResource[n] = '';
+					noteResource[n] += text;
 				}
-
-				if (!(n in noteResource)) noteResource[n] = '';
-				noteResource[n] += text;
 			} else if (note) {
 				if (n == 'title') {
 					note.title = text;
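
This hunk is the heart of the memory fix: with the patched parser, one resource's base64 payload arrives as many bounded "text" events, so the handler reads the encoding attribute and creates the temp file only on the first chunk, then appends every chunk straight to disk. A condensed sketch of the resulting per-attachment flow, reusing decodeBase64File from above; insideResourceDataNode and finalResourcePath are hypothetical placeholders:

const tempFile = `${Setting.value('tempDir')}/${md5(Date.now() + Math.random())}.base64`;

saxStream.on('text', (text) => {
	// Bounded chunk from the patched parser: append it, don't accumulate it.
	if (insideResourceDataNode) fs.appendFileSync(tempFile, text);
});

saxStream.on('closetag', async (name) => {
	if (name !== 'resource') return;
	await decodeBase64File(tempFile, `${tempFile}.decoded`); // streaming base64 decode
	await fs.move(`${tempFile}.decoded`, finalResourcePath, { overwrite: true });
});

At no point does more than one chunk of the attachment sit in memory, which is what allows 150MB+ attachments to import without crashing.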
@@ -336,7 +409,7 @@ function importEnex(parentFolderId, filePath, importOptions = null) {
 			}
 		});
 
-		saxStream.on('closetag', function(n) {
+		saxStream.on('closetag', async function(n) {
 			nodes.pop();
 
 			if (n == 'note') {
@@ -372,56 +445,16 @@
 					note.source = noteAttributes.source ? `evernote.${noteAttributes.source}` : 'evernote';
 					note.source_url = noteAttributes['source-url'] ? noteAttributes['source-url'] : '';
 
-					// if (noteAttributes['reminder-time']) {
-					// 	console.info('======================================================');
-					// 	console.info(noteAttributes);
-					// 	console.info('------------------------------------------------------');
-					// 	console.info(note);
-					// 	console.info('======================================================');
-					// }
-
 					noteAttributes = null;
 				} else if (n == 'resource') {
-					let decodedData = null;
-					let resourceId = noteResource.id;
-
-					if (noteResource.dataEncoding == 'base64') {
-						try {
-							decodedData = Buffer.from(noteResource.data, 'base64');
-						} catch (error) {
-							importOptions.onError(error);
-						}
-					} else if (noteResource.dataEncoding) {
-						importOptions.onError(new Error(`Cannot decode resource with encoding: ${noteResource.dataEncoding}`));
-						decodedData = noteResource.data; // Just put the encoded data directly in the file so it can, potentially, be manually decoded later
-					}
-
-					if (!resourceId && decodedData) {
-						// If no resource ID is present, the resource ID is actually the MD5 of the data.
-						// This ID will match the "hash" attribute of the corresponding <en-media> tag.
-						resourceId = md5(decodedData);
-					}
-
-					if (!resourceId || !noteResource.data) {
-						const debugTemp = Object.assign({}, noteResource);
-						debugTemp.data = debugTemp.data ? `${debugTemp.data.substr(0, 32)}...` : debugTemp.data;
-						importOptions.onError(new Error(`This resource was not added because it has no ID or no content: ${JSON.stringify(debugTemp)}`));
-					} else {
-						let size = 0;
-						if (decodedData) {
-							size = 'byteLength' in decodedData ? decodedData.byteLength : decodedData.length;
-						}
-
-						let r = {
-							id: resourceId,
-							data: decodedData,
-							mime: noteResource.mime,
-							title: noteResource.filename ? noteResource.filename : '',
-							filename: noteResource.filename ? noteResource.filename : '',
-							size: size,
-						};
-
-						note.resources.push(r);
-					}
+					note.resources.push({
+						id: noteResource.id,
+						dataFilePath: noteResource.dataFilePath,
+						dataEncoding: noteResource.dataEncoding,
+						mime: noteResource.mime,
+						title: noteResource.filename ? noteResource.filename : '',
+						filename: noteResource.filename ? noteResource.filename : '',
+					});
 
 					noteResource = null;
 				}