Desktop: Resolves #9765: OCR: Use existing PDF text when available (#9764)

2025-11-26 22:41:17 +02:00 · 2024-02-02 14:59:15 -08:00
parent 564adb9a57
commit 815fe8d3ba
6 changed files with 45 additions and 3 deletions
--- a/packages/app-cli/tests/ocr_samples/multi_page__embedded_text.pdf
+++ b/packages/app-cli/tests/ocr_samples/multi_page__embedded_text.pdf
--- a/packages/app-cli/tests/ocr_samples/multi_page__no_embedded_text.pdf
+++ b/packages/app-cli/tests/ocr_samples/multi_page__no_embedded_text.pdf
--- a/packages/lib/services/ocr/OcrService.test.ts
+++ b/packages/lib/services/ocr/OcrService.test.ts
@@ -82,15 +82,20 @@ describe('OcrService', () => {
 		// `jest.retryTimes(2)`
 	}, 60000 * 5);
-	it('should process PDF resources', async () => {
+	test.each([
-		const { resource } = await createNoteAndResource({ path: `${ocrSampleDir}/dummy.pdf` });
+		// Use embedded text (skip OCR)
 		['dummy.pdf', 'Dummy PDF file'],
 		['multi_page__embedded_text.pdf', 'This is a test.\nTesting...\nThis PDF has 3 pages.\nThis is page 3.'],
 		['multi_page__no_embedded_text.pdf', 'This is a multi-page PDF\nwith no embedded text.\nPage 2: more text.\nThe third page.'],
 	])('should process PDF resources', async (samplePath: string, expectedText: string) => {
 		const { resource } = await createNoteAndResource({ path: `${ocrSampleDir}/${samplePath}` });
 		const service = newOcrService();
 		await service.processResources();
 		const processedResource: ResourceEntity = await Resource.load(resource.id);
-		expect(processedResource.ocr_text).toBe('Dummy PDF file');
+		expect(processedResource.ocr_text).toBe(expectedText);
 		expect(processedResource.ocr_status).toBe(ResourceOcrStatus.Done);
 		expect(processedResource.ocr_error).toBe('');
--- a/packages/lib/services/ocr/OcrService.ts
+++ b/packages/lib/services/ocr/OcrService.ts
@@ -62,6 +62,17 @@ export default class OcrService {
 		const resourceFilePath = Resource.fullPath(resource);
 		if (resource.mime === 'application/pdf') {
 			// OCR can be slow for large PDFs.
 			// Skip it if the PDF already includes text.
 			const pageTexts = await shim.pdfExtractEmbeddedText(resourceFilePath);
 			const pagesWithText = pageTexts.filter(text => !!text.trim().length);
 			if (pagesWithText.length > 0) {
 				return {
 					text: pageTexts.join('\n'),
 				};
 			}
 			const imageFilePaths = await shim.pdfToImages(resourceFilePath, await this.pdfExtractDir());
 			const results: RecognizeResult[] = [];
--- a/packages/lib/shim-init-node.ts
+++ b/packages/lib/shim-init-node.ts
@@ -9,6 +9,7 @@ import * as fs from 'fs-extra';
 import * as pdfJsNamespace from 'pdfjs-dist';
 import { writeFile } from 'fs/promises';
 import { ResourceEntity } from './services/database/types';
 import { TextItem } from 'pdfjs-dist/types/src/display/api';
 const { FileApiDriverLocal } = require('./file-api-driver-local');
 const mimeUtils = require('./mime-utils.js').mime;
@@ -734,6 +735,26 @@ function shimInit(options: ShimInitOptions = null) {
 		}
 	};
 	// Extracts the text already embedded in the PDF at `pdfPath` (no OCR is
 	// performed). Resolves to one string per page, in page order; the text
 	// items of a page are joined with newlines, so a page without embedded
 	// text yields an empty string.
 	shim.pdfExtractEmbeddedText = async (pdfPath: string): Promise<string[]> => {
 		const loadingTask = pdfJs.getDocument(pdfPath);
 		const doc = await loadingTask.promise;
 		const textByPage: string[] = [];
 		try {
 			// pdf.js page numbers are 1-based.
 			for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
 				const page = await doc.getPage(pageNum);
 				const textContent = await page.getTextContent();
 				// Items that carry no `str` property contribute an empty line.
 				const pageText = textContent.items
 					.map(item => (item as TextItem).str ?? '')
 					.join('\n');
 				textByPage.push(pageText);
 			}
 		} finally {
 			// Always release the document (and its worker resources), even
 			// when reading a page fails, so that repeated calls do not leak.
 			await doc.destroy();
 		}
 		return textByPage;
 	};
 	shim.pdfToImages = async (pdfPath: string, outputDirectoryPath: string): Promise<string[]> => {
 		// We handle both the Electron app and testing framework. Potentially
 		// the same code could be used to support the CLI app.
--- a/packages/lib/shim.ts
+++ b/packages/lib/shim.ts
@@ -277,6 +277,11 @@ const shim = {
 		throw new Error('Not implemented');
 	},
 	// Does not do OCR -- just extracts existing text from a PDF.
 	// Platform-specific code is expected to replace this placeholder with an
 	// implementation that resolves to the embedded text of each page. Until
 	// then, calling it rejects with a "Not implemented" error.
 	pdfExtractEmbeddedText: async (_pdfPath: string): Promise<string[]> => {
 		// Name the actual function in the message so the missing
 		// implementation is easy to identify.
 		throw new Error('Not implemented: pdfExtractEmbeddedText');
 	},
 	// Placeholder: always rejects with "Not implemented". Platform-specific
 	// initialisation is expected to assign a real implementation.
 	pdfToImages: async (_pdfPath: string, _outputDirectoryPath: string): Promise<string[]> => {
 		const notImplemented = new Error('Not implemented');
 		throw notImplemented;
 	},