Desktop: Resolves #9765: OCR: Use existing PDF text when available (#9764)

2025-11-23 22:36:32 +02:00 · 2024-02-02 14:59:15 -08:00
parent 564adb9a57
commit 815fe8d3ba
6 changed files with 45 additions and 3 deletions
--- a/packages/lib/shim-init-node.ts
+++ b/packages/lib/shim-init-node.ts
@@ -9,6 +9,7 @@ import * as fs from 'fs-extra';
 import * as pdfJsNamespace from 'pdfjs-dist';
 import { writeFile } from 'fs/promises';
 import { ResourceEntity } from './services/database/types';
+import { TextItem } from 'pdfjs-dist/types/src/display/api';

 const { FileApiDriverLocal } = require('./file-api-driver-local');
 const mimeUtils = require('./mime-utils.js').mime;
@@ -734,6 +735,26 @@ function shimInit(options: ShimInitOptions = null) {
 		}
 	};

+	// Extracts the text already embedded in the PDF at `pdfPath`. Resolves to one
+	// string per page, in page order, with that page's text items joined by newlines.
+	shim.pdfExtractEmbeddedText = async (pdfPath: string): Promise<string[]> => {
+		const doc = await pdfJs.getDocument(pdfPath).promise;
+		const textByPage: string[] = [];
+		try {
+			for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
+				const page = await doc.getPage(pageNum);
+				const textContent = await page.getTextContent();
+				// Items lacking `str` contribute an empty string.
+				const strings = textContent.items.map(item => (item as TextItem).str ?? '');
+				textByPage.push(strings.join('\n'));
+			}
+		} finally {
+			// Always release the loaded document, even when reading a page fails.
+			await doc.destroy();
+		}
+		return textByPage;
+	};
+
 	shim.pdfToImages = async (pdfPath: string, outputDirectoryPath: string): Promise<string[]> => {
 		// We handle both the Electron app and testing framework. Potentially
 		// the same code could be use to support the CLI app.