
Desktop, Cli: Fix importing of very large attachments (150MB+) from Evernote ENEX files

Laurent Cozic 2020-02-10 21:50:45 +00:00
parent 691521c5b9
commit 9ec1e84ed0
7 changed files with 3172 additions and 116 deletions

File diff suppressed because it is too large.


@@ -3,6 +3,9 @@
 	"description": "Joplin CLI Client",
 	"license": "MIT",
 	"author": "Laurent Cozic",
+	"scripts": {
+		"postinstall": "patch-package"
+	},
 	"bugs": {
 		"url": "https://github.com/laurent22/joplin/issues"
 	},
@@ -32,6 +35,7 @@
 		"app-module-path": "^2.2.0",
 		"async-mutex": "^0.1.3",
 		"base-64": "^0.1.0",
+		"base64-stream": "^1.0.0",
 		"clean-html": "^1.5.0",
 		"compare-version": "^0.1.2",
 		"diacritics": "^1.3.0",
@@ -39,31 +43,49 @@
 		"es6-promise-pool": "^2.5.0",
 		"file-uri-to-path": "^1.0.0",
 		"follow-redirects": "^1.2.4",
+		"font-awesome-filetypes": "^2.1.0",
 		"form-data": "^2.1.4",
 		"fs-extra": "^5.0.0",
+		"highlight.js": "^9.17.1",
 		"html-entities": "^1.2.1",
 		"html-minifier": "^3.5.15",
 		"image-data-uri": "^2.0.0",
 		"image-type": "^3.0.0",
 		"joplin-turndown": "^4.0.19",
 		"joplin-turndown-plugin-gfm": "^1.0.12",
+		"json-stringify-safe": "^5.0.1",
 		"jssha": "^2.3.0",
+		"katex": "^0.11.1",
 		"levenshtein": "^1.0.5",
 		"markdown-it": "^10.0.0",
+		"markdown-it-abbr": "^1.0.4",
+		"markdown-it-anchor": "^5.2.5",
+		"markdown-it-deflist": "^2.0.3",
+		"markdown-it-emoji": "^1.4.0",
+		"markdown-it-expand-tabs": "^1.0.13",
+		"markdown-it-footnote": "^3.0.2",
+		"markdown-it-ins": "^3.0.0",
+		"markdown-it-mark": "^3.0.0",
+		"markdown-it-multimd-table": "^4.0.1",
+		"markdown-it-sub": "^1.0.0",
+		"markdown-it-sup": "^1.0.0",
+		"markdown-it-toc-done-right": "^4.1.0",
 		"md5": "^2.2.1",
+		"md5-file": "^4.0.0",
 		"mime": "^2.0.3",
 		"moment": "^2.24.0",
 		"multiparty": "^4.2.1",
 		"node-emoji": "^1.8.1",
 		"node-fetch": "^1.7.1",
 		"node-persist": "^2.1.0",
+		"patch-package": "^6.2.0",
 		"promise": "^7.1.1",
 		"proper-lockfile": "^2.0.1",
 		"query-string": "4.3.4",
 		"read-chunk": "^2.1.0",
 		"redux": "^3.7.2",
 		"request": "^2.88.0",
-		"sax": "^1.2.2",
+		"sax": "^1.2.4",
 		"server-destroy": "^1.0.1",
 		"sharp": "^0.23.2",
 		"sprintf-js": "^1.1.1",
@@ -77,33 +99,14 @@
 		"terminal-kit": "^1.30.0",
 		"tkwidgets": "^0.5.26",
 		"url-parse": "^1.4.7",
+		"uslug": "^1.0.4",
 		"uuid": "^3.0.1",
 		"valid-url": "^1.0.9",
 		"word-wrap": "^1.2.3",
 		"xml2js": "^0.4.19",
-		"yargs-parser": "^7.0.0",
-		"font-awesome-filetypes": "^2.1.0",
-		"highlight.js": "^9.17.1",
-		"json-stringify-safe": "^5.0.1",
-		"katex": "^0.11.1",
-		"markdown-it-abbr": "^1.0.4",
-		"markdown-it-anchor": "^5.2.5",
-		"markdown-it-deflist": "^2.0.3",
-		"markdown-it-emoji": "^1.4.0",
-		"markdown-it-expand-tabs": "^1.0.13",
-		"markdown-it-footnote": "^3.0.2",
-		"markdown-it-ins": "^3.0.0",
-		"markdown-it-mark": "^3.0.0",
-		"markdown-it-multimd-table": "^4.0.1",
-		"markdown-it-sub": "^1.0.0",
-		"markdown-it-sup": "^1.0.0",
-		"markdown-it-toc-done-right": "^4.1.0",
-		"uslug": "^1.0.4"
+		"yargs-parser": "^7.0.0"
 	},
 	"devDependencies": {
 		"jasmine": "^3.5.0"
-	},
-	"scripts": {
-		"test": "jasmine"
-	}
+	}
 }


@@ -0,0 +1,18 @@
diff --git a/node_modules/sax/lib/sax.js b/node_modules/sax/lib/sax.js
index 795d607..ccad5d8 100644
--- a/node_modules/sax/lib/sax.js
+++ b/node_modules/sax/lib/sax.js
@@ -1040,6 +1040,13 @@
parser.textNode += c
}
}
+
+ // Sax is kind of buggy when handling large text nodes. It has a function to check that
+ // the buffer doesn't run out of space, but it doesn't seem to call it for text nodes.
+ // The result is that parser.textNode can reach 1GB, at which point the app crashes. So here
+ // we call checkBufferLength to make sure the buffer is cleared and the "text" event
+ // is emitted, so that the caller can handle memory properly.
+ checkBufferLength(parser);
continue
case S.SCRIPT:
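
For context, the effect of this one-line patch is that a single huge base64 text node now reaches the caller as a stream of bounded "text" events (each capped by sax's MAX_BUFFER_LENGTH) instead of one multi-gigabyte string. A minimal sketch of the consumer side, assuming the standard sax-js streaming API; the file paths are illustrative only:

const sax = require('sax');
const fs = require('fs');

const saxStream = sax.createStream(true, {});
saxStream.on('text', (text) => {
	// With the patch, each chunk is bounded in size, so it can be
	// appended to disk instead of accumulating in memory.
	fs.appendFileSync('/tmp/resource-data.base64', text);
});
fs.createReadStream('/path/to/export.enex').pipe(saxStream);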

File diff suppressed because it is too large.


@@ -8,7 +8,7 @@
 		"pack": "node_modules/.bin/electron-builder --dir",
 		"dist": "node_modules/.bin/electron-builder",
 		"publish": "build -p always",
-		"postinstall": "node compile.js && node compile-package-info.js && node copyPluginAssets.js && node electronRebuild.js",
+		"postinstall": "patch-package && node compile.js && node compile-package-info.js && node copyPluginAssets.js && node electronRebuild.js",
 		"compile": "node compile.js && node compile-package-info.js",
 		"install-141": "npm install --toolset=v141"
 	},
@@ -76,7 +76,8 @@
 		"babel-preset-react": "^6.24.1",
 		"electron": "^7.1.9",
 		"electron-builder": "22.3.2",
-		"electron-rebuild": "^1.8.8"
+		"electron-rebuild": "^1.8.8",
+		"patch-package": "^6.2.0"
 	},
 	"optionalDependencies": {
 		"7zip-bin-linux": "^1.0.1",
@@ -87,6 +88,7 @@
 		"app-module-path": "^2.2.0",
 		"async-mutex": "^0.1.3",
 		"base-64": "^0.1.0",
+		"base64-stream": "^1.0.0",
 		"chokidar": "^3.0.0",
 		"clean-html": "^1.5.0",
 		"compare-versions": "^3.2.1",
@@ -98,20 +100,37 @@
 		"es6-promise-pool": "^2.5.0",
 		"file-uri-to-path": "^1.0.0",
 		"follow-redirects": "^1.5.0",
+		"font-awesome-filetypes": "^2.1.0",
 		"form-data": "^2.3.2",
 		"formatcoords": "^1.1.3",
 		"fs-extra": "^5.0.0",
+		"highlight.js": "^9.17.1",
 		"html-entities": "^1.2.1",
 		"html-minifier": "^4.0.0",
 		"image-type": "^3.0.0",
 		"joplin-turndown": "^4.0.19",
 		"joplin-turndown-plugin-gfm": "^1.0.12",
+		"json-stringify-safe": "^5.0.1",
 		"jssha": "^2.3.1",
+		"katex": "^0.11.1",
 		"levenshtein": "^1.0.5",
 		"lodash": "^4.17.15",
 		"mark.js": "^8.11.1",
 		"markdown-it": "^10.0.0",
+		"markdown-it-abbr": "^1.0.4",
+		"markdown-it-anchor": "^5.2.5",
+		"markdown-it-deflist": "^2.0.3",
+		"markdown-it-emoji": "^1.4.0",
+		"markdown-it-expand-tabs": "^1.0.13",
+		"markdown-it-footnote": "^3.0.2",
+		"markdown-it-ins": "^3.0.0",
+		"markdown-it-mark": "^3.0.0",
+		"markdown-it-multimd-table": "^4.0.1",
+		"markdown-it-sub": "^1.0.0",
+		"markdown-it-sup": "^1.0.0",
+		"markdown-it-toc-done-right": "^4.1.0",
 		"md5": "^2.2.1",
+		"md5-file": "^4.0.0",
 		"moment": "^2.22.2",
 		"multiparty": "^4.2.1",
 		"mustache": "^3.0.1",
@@ -130,6 +149,7 @@
 		"readability-node": "^0.1.0",
 		"redux": "^3.7.2",
 		"reselect": "^4.0.0",
+		"sax": "^1.2.4",
 		"server-destroy": "^1.0.1",
 		"smalltalk": "^2.5.1",
 		"sprintf-js": "^1.1.1",
@@ -141,25 +161,9 @@
 		"tcp-port-used": "^0.1.2",
 		"uglifycss": "0.0.29",
 		"url-parse": "^1.4.3",
+		"uslug": "^1.0.4",
 		"uuid": "^3.2.1",
 		"valid-url": "^1.0.9",
-		"xml2js": "^0.4.19",
-		"font-awesome-filetypes": "^2.1.0",
-		"highlight.js": "^9.17.1",
-		"json-stringify-safe": "^5.0.1",
-		"katex": "^0.11.1",
-		"markdown-it-abbr": "^1.0.4",
-		"markdown-it-anchor": "^5.2.5",
-		"markdown-it-deflist": "^2.0.3",
-		"markdown-it-emoji": "^1.4.0",
-		"markdown-it-expand-tabs": "^1.0.13",
-		"markdown-it-footnote": "^3.0.2",
-		"markdown-it-ins": "^3.0.0",
-		"markdown-it-mark": "^3.0.0",
-		"markdown-it-multimd-table": "^4.0.1",
-		"markdown-it-sub": "^1.0.0",
-		"markdown-it-sup": "^1.0.0",
-		"markdown-it-toc-done-right": "^4.1.0",
-		"uslug": "^1.0.4"
+		"xml2js": "^0.4.19"
 	}
 }


@@ -0,0 +1,18 @@
diff --git a/node_modules/sax/lib/sax.js b/node_modules/sax/lib/sax.js
index 795d607..ccad5d8 100644
--- a/node_modules/sax/lib/sax.js
+++ b/node_modules/sax/lib/sax.js
@@ -1040,6 +1040,13 @@
parser.textNode += c
}
}
+
+ // Sax is kind of buggy when handling large text nodes. It has a function to check that
+ // the buffer doesn't run out of space, but it doesn't seem to call it for text nodes.
+ // The result is that parser.textNode can reach 1GB, at which point the app crashes. So here
+ // we call checkBufferLength to make sure the buffer is cleared and the "text" event
+ // is emitted, so that the caller can handle memory properly.
+ checkBufferLength(parser);
continue
case S.SCRIPT:


@@ -4,12 +4,15 @@ const BaseModel = require('lib/BaseModel.js');
 const Note = require('lib/models/Note.js');
 const Tag = require('lib/models/Tag.js');
 const Resource = require('lib/models/Resource.js');
+const Setting = require('lib/models/Setting.js');
 const { MarkupToHtml } = require('lib/joplin-renderer');
 const { enexXmlToMd } = require('./import-enex-md-gen.js');
 const { enexXmlToHtml } = require('./import-enex-html-gen.js');
 const { time } = require('lib/time-utils.js');
 const Levenshtein = require('levenshtein');
 const md5 = require('md5');
+const { Base64Decode } = require('base64-stream');
+const md5File = require('md5-file');
 // const Promise = require('promise');
 const fs = require('fs-extra');
@@ -35,8 +38,28 @@ function extractRecognitionObjId(recognitionXml) {
 	return r && r.length >= 2 ? r[1] : null;
 }
 
-async function filePutContents(filePath, content) {
-	await fs.writeFile(filePath, content);
-}
+async function decodeBase64File(sourceFile, destFile) {
+	return new Promise(function(resolve, reject) {
+		const sourceStream = fs.createReadStream(sourceFile);
+		const destStream = fs.createWriteStream(destFile);
+		sourceStream.pipe(new Base64Decode()).pipe(destStream);
+		sourceStream.on('end', () => resolve());
+		sourceStream.on('error', (error) => reject(error));
+	});
+}
+
+async function md5FileAsync(filePath) {
+	return new Promise((resolve, reject) => {
+		md5File(filePath, (error, hash) => {
+			if (error) {
+				reject(error);
+				return;
+			}
+			resolve(hash);
+		});
+	});
+}
 
 function removeUndefinedProperties(note) {
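
A note on decodeBase64File as committed: it resolves on the read stream's 'end' event, which can fire before the write stream has flushed the last decoded bytes to disk. A more defensive variant (a sketch, not what this commit ships) would resolve on the destination's 'finish' event instead, reusing the same fs and Base64Decode imports shown above:

function decodeBase64FileSafe(sourceFile, destFile) {
	return new Promise((resolve, reject) => {
		const sourceStream = fs.createReadStream(sourceFile);
		const destStream = fs.createWriteStream(destFile);
		sourceStream.on('error', reject);
		destStream.on('error', reject);
		// 'finish' fires only once all decoded bytes have been written out.
		destStream.on('finish', resolve);
		sourceStream.pipe(new Base64Decode()).pipe(destStream);
	});
}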
@@ -82,14 +105,51 @@ async function fuzzyMatch(note) {
 	return null;
 }
 
-async function saveNoteResources(note) {
+// At this point we have the resource as it's been parsed from the XML, but additional
+// processing needs to be done to get the final resource file, its size, MD5, etc.
+async function processNoteResource(resource) {
+	if (resource.dataEncoding == 'base64') {
+		const decodedFilePath = `${resource.dataFilePath}.decoded`;
+		await decodeBase64File(resource.dataFilePath, decodedFilePath);
+		resource.dataFilePath = decodedFilePath;
+	} else if (resource.dataEncoding) {
+		throw new Error(`Cannot decode resource with encoding: ${resource.dataEncoding}`);
+	}
+
+	const stats = fs.statSync(resource.dataFilePath);
+	resource.size = stats.size;
+
+	if (!resource.id) {
+		// If no resource ID is present, the resource ID is actually the MD5 of the data.
+		// This ID will match the "hash" attribute of the corresponding <en-media> tag.
+		// resourceId = md5(decodedData);
+		resource.id = await md5FileAsync(resource.dataFilePath);
+	}
+
+	if (!resource.id || !resource.size) {
+		const debugTemp = Object.assign({}, resource);
+		debugTemp.data = debugTemp.data ? `${debugTemp.data.substr(0, 32)}...` : debugTemp.data;
+		throw new Error(`This resource was not added because it has no ID or no content: ${JSON.stringify(debugTemp)}`);
+	}
+
+	return resource;
+}
+
+async function saveNoteResources(note, importOptions) {
 	let resourcesCreated = 0;
 	for (let i = 0; i < note.resources.length; i++) {
 		let resource = note.resources[i];
-		if (!resource.id) continue;
+
+		try {
+			resource = await processNoteResource(resource);
+		} catch (error) {
+			importOptions.onError(error);
+			continue;
+		}
 
 		let toSave = Object.assign({}, resource);
 		delete toSave.data;
+		delete toSave.dataFilePath;
+		delete toSave.dataEncoding;
 
 		// The same resource sometimes appear twice in the same enex (exact same ID and file).
 		// In that case, just skip it - it means two different notes might be linked to the
@@ -97,7 +157,7 @@ async function saveNoteResources(note) {
 		let existingResource = await Resource.load(toSave.id);
 		if (existingResource) continue;
 
-		await filePutContents(Resource.fullPath(toSave), resource.data);
+		await fs.move(resource.dataFilePath, Resource.fullPath(toSave), { overwrite: true });
 		await Resource.save(toSave, { isNew: true });
 		resourcesCreated++;
 	}
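
The MD5 comment in processNoteResource is worth unpacking: in an ENEX file, a note's body references its attachments by content hash, so when a <resource> carries no explicit ID, hashing the decoded bytes reproduces the key that <en-media> uses. An illustrative check using the same md5-file callback API required above; the file path and hash value are hypothetical:

// A note body references a resource like:
//   <en-media hash="b1946ac92492d2347c6235b4d2611184" type="image/png"/>
// and that hash is simply the MD5 of the decoded resource bytes:
md5File('/tmp/resource.base64.decoded', (error, hash) => {
	if (error) throw error;
	// For the matching resource, hash equals the en-media "hash" attribute,
	// which is why it can stand in as the resource ID.
	console.log(hash);
});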
@@ -119,10 +179,14 @@ async function saveNoteTags(note) {
 	return notesTagged;
 }
 
-async function saveNoteToStorage(note, fuzzyMatching = false) {
+async function saveNoteToStorage(note, importOptions) {
+	importOptions = Object.assign({}, {
+		fuzzyMatching: false,
+	}, importOptions);
+
 	note = Note.filter(note);
 
-	let existingNote = fuzzyMatching ? await fuzzyMatch(note) : null;
+	let existingNote = importOptions.fuzzyMatching ? await fuzzyMatch(note) : null;
 
 	let result = {
 		noteCreated: false,
@@ -132,7 +196,7 @@ async function saveNoteToStorage(note, fuzzyMatching = false) {
 		notesTagged: 0,
 	};
 
-	let resourcesCreated = await saveNoteResources(note);
+	let resourcesCreated = await saveNoteResources(note, importOptions);
 	result.resourcesCreated += resourcesCreated;
 
 	let notesTagged = await saveNoteTags(note);
@@ -241,7 +305,7 @@ function importEnex(parentFolderId, filePath, importOptions = null) {
 			// we require an updated_time property, so set it to create_time in that case
 			if (!note.updated_time) note.updated_time = note.created_time;
 
-			const result = await saveNoteToStorage(note, importOptions.fuzzyMatching);
+			const result = await saveNoteToStorage(note, importOptions);
 
 			if (result.noteUpdated) {
 				progressState.updated++;
@@ -276,11 +340,20 @@ function importEnex(parentFolderId, filePath, importOptions = null) {
 				noteResourceAttributes[n] = text;
 			} else if (noteResource) {
 				if (n == 'data') {
-					let attr = currentNodeAttributes();
-					noteResource.dataEncoding = attr.encoding;
+					if (!noteResource.dataEncoding) {
+						let attr = currentNodeAttributes();
+						noteResource.dataEncoding = attr.encoding;
+					}
+
+					if (!noteResource.dataFilePath) {
+						noteResource.dataFilePath = `${Setting.value('tempDir')}/${md5(Date.now() + Math.random())}.base64`;
+					}
+
+					fs.appendFileSync(noteResource.dataFilePath, text);
+				} else {
+					if (!(n in noteResource)) noteResource[n] = '';
+					noteResource[n] += text;
 				}
-
-				if (!(n in noteResource)) noteResource[n] = '';
-				noteResource[n] += text;
 			} else if (note) {
 				if (n == 'title') {
 					note.title = text;
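
This hunk is the heart of the memory fix: with the patched parser, one resource's base64 payload arrives as many bounded "text" events, so the handler reads the encoding attribute and creates the temp file only on the first chunk, then appends every chunk straight to disk. A condensed sketch of the resulting per-attachment flow, reusing decodeBase64File from above; insideResourceDataNode and finalResourcePath are hypothetical placeholders:

const tempFile = `${Setting.value('tempDir')}/${md5(Date.now() + Math.random())}.base64`;

saxStream.on('text', (text) => {
	// Bounded chunk from the patched parser: append it, don't accumulate it.
	if (insideResourceDataNode) fs.appendFileSync(tempFile, text);
});

saxStream.on('closetag', async (name) => {
	if (name !== 'resource') return;
	await decodeBase64File(tempFile, `${tempFile}.decoded`); // streaming base64 decode
	await fs.move(`${tempFile}.decoded`, finalResourcePath, { overwrite: true });
});

At no point does more than one chunk of the attachment sit in memory, which is what allows 150MB+ attachments to import without crashing.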
@@ -336,7 +409,7 @@ function importEnex(parentFolderId, filePath, importOptions = null) {
 			}
 		});
 
-		saxStream.on('closetag', function(n) {
+		saxStream.on('closetag', async function(n) {
 			nodes.pop();
 
 			if (n == 'note') {
@@ -372,56 +445,16 @@
 					note.source = noteAttributes.source ? `evernote.${noteAttributes.source}` : 'evernote';
 					note.source_url = noteAttributes['source-url'] ? noteAttributes['source-url'] : '';
 
-					// if (noteAttributes['reminder-time']) {
-					// 	console.info('======================================================');
-					// 	console.info(noteAttributes);
-					// 	console.info('------------------------------------------------------');
-					// 	console.info(note);
-					// 	console.info('======================================================');
-					// }
-
 					noteAttributes = null;
 				} else if (n == 'resource') {
-					let decodedData = null;
-					let resourceId = noteResource.id;
-
-					if (noteResource.dataEncoding == 'base64') {
-						try {
-							decodedData = Buffer.from(noteResource.data, 'base64');
-						} catch (error) {
-							importOptions.onError(error);
-						}
-					} else if (noteResource.dataEncoding) {
-						importOptions.onError(new Error(`Cannot decode resource with encoding: ${noteResource.dataEncoding}`));
-						decodedData = noteResource.data; // Just put the encoded data directly in the file so it can, potentially, be manually decoded later
-					}
-
-					if (!resourceId && decodedData) {
-						// If no resource ID is present, the resource ID is actually the MD5 of the data.
-						// This ID will match the "hash" attribute of the corresponding <en-media> tag.
-						resourceId = md5(decodedData);
-					}
-
-					if (!resourceId || !noteResource.data) {
-						const debugTemp = Object.assign({}, noteResource);
-						debugTemp.data = debugTemp.data ? `${debugTemp.data.substr(0, 32)}...` : debugTemp.data;
-						importOptions.onError(new Error(`This resource was not added because it has no ID or no content: ${JSON.stringify(debugTemp)}`));
-					} else {
-						let size = 0;
-						if (decodedData) {
-							size = 'byteLength' in decodedData ? decodedData.byteLength : decodedData.length;
-						}
-
-						let r = {
-							id: resourceId,
-							data: decodedData,
-							mime: noteResource.mime,
-							title: noteResource.filename ? noteResource.filename : '',
-							filename: noteResource.filename ? noteResource.filename : '',
-							size: size,
-						};
-
-						note.resources.push(r);
-					}
+					note.resources.push({
+						id: noteResource.id,
+						dataFilePath: noteResource.dataFilePath,
+						dataEncoding: noteResource.dataEncoding,
+						mime: noteResource.mime,
+						title: noteResource.filename ? noteResource.filename : '',
+						filename: noteResource.filename ? noteResource.filename : '',
+					});
 
 					noteResource = null;
 				}