Desktop: Fixes #10050: Fixed OCR memory leak when processing PDF documents

2025-02-01 19:15:01 +02:00 · 2024-03-05 11:42:54 +00:00 · 2024-03-05 11:42:54 +00:00 · d9a16b5c0f
commit d9a16b5c0f
parent 8d5ee36745
1 changed file with 32 additions and 25 deletions
--- a/packages/lib/shim-init-node.ts
+++ b/packages/lib/shim-init-node.ts
@@ -739,21 +739,24 @@ function shimInit(options: ShimInitOptions = null) {
 	shim.pdfExtractEmbeddedText = async (pdfPath: string): Promise<string[]> => {
 		const loadingTask = pdfJs.getDocument(pdfPath);
 		const doc = await loadingTask.promise;
 		const textByPage = [];
-		for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
+		try {
-			const page = await doc.getPage(pageNum);
+			for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
-			const textContent = await page.getTextContent();
+				const page = await doc.getPage(pageNum);
 				const textContent = await page.getTextContent();
-			const strings = textContent.items.map(item => {
+				const strings = textContent.items.map(item => {
-				const text = (item as TextItem).str ?? '';
+					const text = (item as TextItem).str ?? '';
-				return text;
+					return text;
-			}).join('\n');
+				}).join('\n');
-			// Some PDFs contain unsupported characters that can lead to hard-to-debug issues.
+				// Some PDFs contain unsupported characters that can lead to hard-to-debug issues.
-			// We remove them here.
+				// We remove them here.
-			textByPage.push(replaceUnsupportedCharacters(strings));
+				textByPage.push(replaceUnsupportedCharacters(strings));
 			}
 		} finally {
 			await doc.destroy();
 		}
 		return textByPage;
@@ -791,23 +794,27 @@ function shimInit(options: ShimInitOptions = null) {
 		const loadingTask = pdfJs.getDocument(pdfPath);
 		const doc = await loadingTask.promise;
-		for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
+		try {
-			const page = await doc.getPage(pageNum);
+			for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
-			const viewport = page.getViewport({ scale: 2 });
+				const page = await doc.getPage(pageNum);
-			const canvas = createCanvas();
+				const viewport = page.getViewport({ scale: 2 });
-			const ctx = canvas.getContext('2d');
+				const canvas = createCanvas();
 				const ctx = canvas.getContext('2d');
-			canvas.height = viewport.height;
+				canvas.height = viewport.height;
-			canvas.width = viewport.width;
+				canvas.width = viewport.width;
-			const renderTask = page.render({ canvasContext: ctx, viewport: viewport });
+				const renderTask = page.render({ canvasContext: ctx, viewport: viewport });
-			await renderTask.promise;
+				await renderTask.promise;
-			const buffer = await canvasToBuffer(canvas);
+				const buffer = await canvasToBuffer(canvas);
-			const filePath = `${outputDirectoryPath}/${filePrefix}_${pageNum.toString().padStart(4, '0')}.jpg`;
+				const filePath = `${outputDirectoryPath}/${filePrefix}_${pageNum.toString().padStart(4, '0')}.jpg`;
-			output.push(filePath);
+				output.push(filePath);
-			await writeFile(filePath, buffer, 'binary');
+				await writeFile(filePath, buffer, 'binary');
-			if (!(await shim.fsDriver().exists(filePath))) throw new Error(`Could not write to file: ${filePath}`);
+				if (!(await shim.fsDriver().exists(filePath))) throw new Error(`Could not write to file: ${filePath}`);
 			}
 		} finally {
 			await doc.destroy();
 		}
 		return output;