diff --git a/packages/app-cli/tests/ocr_samples/multi_page__embedded_text.pdf b/packages/app-cli/tests/ocr_samples/multi_page__embedded_text.pdf
new file mode 100644
index 000000000..8a847a490
Binary files /dev/null and b/packages/app-cli/tests/ocr_samples/multi_page__embedded_text.pdf differ
diff --git a/packages/app-cli/tests/ocr_samples/multi_page__no_embedded_text.pdf b/packages/app-cli/tests/ocr_samples/multi_page__no_embedded_text.pdf
new file mode 100644
index 000000000..bf907f5b8
Binary files /dev/null and b/packages/app-cli/tests/ocr_samples/multi_page__no_embedded_text.pdf differ
diff --git a/packages/lib/services/ocr/OcrService.test.ts b/packages/lib/services/ocr/OcrService.test.ts
index d92bd14c2..6f52b494f 100644
--- a/packages/lib/services/ocr/OcrService.test.ts
+++ b/packages/lib/services/ocr/OcrService.test.ts
@@ -82,15 +82,21 @@ describe('OcrService', () => {
 		// `jest.retryTimes(2)`
 	}, 60000 * 5);
 
-	it('should process PDF resources', async () => {
-		const { resource } = await createNoteAndResource({ path: `${ocrSampleDir}/dummy.pdf` });
+	test.each([
+		// Embedded text is used as-is (OCR is skipped)
+		['dummy.pdf', 'Dummy PDF file'],
+		['multi_page__embedded_text.pdf', 'This is a test.\nTesting...\nThis PDF has 3 pages.\nThis is page 3.'],
+		// No embedded text: each page is rendered to an image and OCR'd
+		['multi_page__no_embedded_text.pdf', 'This is a multi-page PDF\nwith no embedded text.\nPage 2: more text.\nThe third page.'],
+	])('should process PDF resources (%s)', async (samplePath: string, expectedText: string) => {
+		const { resource } = await createNoteAndResource({ path: `${ocrSampleDir}/${samplePath}` });
 
 		const service = newOcrService();
 
 		await service.processResources();
 
 		const processedResource: ResourceEntity = await Resource.load(resource.id);
-		expect(processedResource.ocr_text).toBe('Dummy PDF file');
+		expect(processedResource.ocr_text).toBe(expectedText);
 		expect(processedResource.ocr_status).toBe(ResourceOcrStatus.Done);
 		expect(processedResource.ocr_error).toBe('');
 
diff --git a/packages/lib/services/ocr/OcrService.ts b/packages/lib/services/ocr/OcrService.ts
index e1cf17e0b..a42d7a9c9 100644
--- a/packages/lib/services/ocr/OcrService.ts
+++ b/packages/lib/services/ocr/OcrService.ts
@@ -62,6 +62,18 @@ export default class OcrService {
 		const resourceFilePath = Resource.fullPath(resource);
 
 		if (resource.mime === 'application/pdf') {
+			// OCR can be slow for large PDFs, so skip it when the PDF already
+			// embeds text. Note that a single page with text is enough to skip
+			// OCR for the whole document, including any pages without text.
+			const pageTexts = await shim.pdfExtractEmbeddedText(resourceFilePath);
+			const hasEmbeddedText = pageTexts.some(text => text.trim().length > 0);
+
+			if (hasEmbeddedText) {
+				return {
+					text: pageTexts.join('\n'),
+				};
+			}
+
 			const imageFilePaths = await shim.pdfToImages(resourceFilePath, await this.pdfExtractDir());
 			const results: RecognizeResult[] = [];
 
diff --git a/packages/lib/shim-init-node.ts b/packages/lib/shim-init-node.ts
index e8f12f108..b06c7bfe4 100644
--- a/packages/lib/shim-init-node.ts
+++ b/packages/lib/shim-init-node.ts
@@ -9,6 +9,7 @@ import * as fs from 'fs-extra';
 import * as pdfJsNamespace from 'pdfjs-dist';
 import { writeFile } from 'fs/promises';
 import { ResourceEntity } from './services/database/types';
+import { TextItem } from 'pdfjs-dist/types/src/display/api';
 
 const { FileApiDriverLocal } = require('./file-api-driver-local');
 const mimeUtils = require('./mime-utils.js').mime;
@@ -734,6 +735,33 @@ function shimInit(options: ShimInitOptions = null) {
 		}
 	};
 
+	// Returns the text embedded in each page of the PDF (one string per page,
+	// in page order). Does not do OCR: pages without text items yield ''.
+	shim.pdfExtractEmbeddedText = async (pdfPath: string): Promise<string[]> => {
+		const loadingTask = pdfJs.getDocument(pdfPath);
+
+		try {
+			const doc = await loadingTask.promise;
+			const textByPage: string[] = [];
+
+			for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
+				const page = await doc.getPage(pageNum);
+				const textContent = await page.getTextContent();
+
+				// Items without `str` (e.g. marked content) contribute an empty line.
+				const pageText = textContent.items
+					.map(item => (item as TextItem).str ?? '')
+					.join('\n');
+				textByPage.push(pageText);
+			}
+
+			return textByPage;
+		} finally {
+			// Destroy the loading task so its resources are released even if extraction fails.
+			await loadingTask.destroy();
+		}
+	};
+
 	shim.pdfToImages = async (pdfPath: string, outputDirectoryPath: string): Promise<string[]> => {
 		// We handle both the Electron app and testing framework. Potentially
 		// the same code could be use to support the CLI app.
diff --git a/packages/lib/shim.ts b/packages/lib/shim.ts
index b3b628c3d..fe6901dca 100644
--- a/packages/lib/shim.ts
+++ b/packages/lib/shim.ts
@@ -277,6 +277,11 @@ const shim = {
 		throw new Error('Not implemented');
 	},
 
+	// Does not do OCR -- just extracts existing text from a PDF (one string per page).
+	pdfExtractEmbeddedText: async (_pdfPath: string): Promise<string[]> => {
+		throw new Error('Not implemented: pdfExtractEmbeddedText');
+	},
+
 	pdfToImages: async (_pdfPath: string, _outputDirectoryPath: string): Promise<string[]> => {
 		throw new Error('Not implemented');
 	},