mirror of
https://github.com/laurent22/joplin.git
synced 2025-01-11 18:24:43 +02:00
This commit is contained in:
parent
564adb9a57
commit
815fe8d3ba
BIN
packages/app-cli/tests/ocr_samples/multi_page__embedded_text.pdf
Normal file
BIN
packages/app-cli/tests/ocr_samples/multi_page__embedded_text.pdf
Normal file
Binary file not shown.
Binary file not shown.
@ -82,15 +82,20 @@ describe('OcrService', () => {
|
|||||||
// `jest.retryTimes(2)`
|
// `jest.retryTimes(2)`
|
||||||
}, 60000 * 5);
|
}, 60000 * 5);
|
||||||
|
|
||||||
it('should process PDF resources', async () => {
|
test.each([
|
||||||
const { resource } = await createNoteAndResource({ path: `${ocrSampleDir}/dummy.pdf` });
|
// Use embedded text (skip OCR)
|
||||||
|
['dummy.pdf', 'Dummy PDF file'],
|
||||||
|
['multi_page__embedded_text.pdf', 'This is a test.\nTesting...\nThis PDF has 3 pages.\nThis is page 3.'],
|
||||||
|
['multi_page__no_embedded_text.pdf', 'This is a multi-page PDF\nwith no embedded text.\nPage 2: more text.\nThe third page.'],
|
||||||
|
])('should process PDF resources', async (samplePath: string, expectedText: string) => {
|
||||||
|
const { resource } = await createNoteAndResource({ path: `${ocrSampleDir}/${samplePath}` });
|
||||||
|
|
||||||
const service = newOcrService();
|
const service = newOcrService();
|
||||||
|
|
||||||
await service.processResources();
|
await service.processResources();
|
||||||
|
|
||||||
const processedResource: ResourceEntity = await Resource.load(resource.id);
|
const processedResource: ResourceEntity = await Resource.load(resource.id);
|
||||||
expect(processedResource.ocr_text).toBe('Dummy PDF file');
|
expect(processedResource.ocr_text).toBe(expectedText);
|
||||||
expect(processedResource.ocr_status).toBe(ResourceOcrStatus.Done);
|
expect(processedResource.ocr_status).toBe(ResourceOcrStatus.Done);
|
||||||
expect(processedResource.ocr_error).toBe('');
|
expect(processedResource.ocr_error).toBe('');
|
||||||
|
|
||||||
|
@ -62,6 +62,17 @@ export default class OcrService {
|
|||||||
const resourceFilePath = Resource.fullPath(resource);
|
const resourceFilePath = Resource.fullPath(resource);
|
||||||
|
|
||||||
if (resource.mime === 'application/pdf') {
|
if (resource.mime === 'application/pdf') {
|
||||||
|
// OCR can be slow for large PDFs.
|
||||||
|
// Skip it if the PDF already includes text.
|
||||||
|
const pageTexts = await shim.pdfExtractEmbeddedText(resourceFilePath);
|
||||||
|
const pagesWithText = pageTexts.filter(text => !!text.trim().length);
|
||||||
|
|
||||||
|
if (pagesWithText.length > 0) {
|
||||||
|
return {
|
||||||
|
text: pageTexts.join('\n'),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
const imageFilePaths = await shim.pdfToImages(resourceFilePath, await this.pdfExtractDir());
|
const imageFilePaths = await shim.pdfToImages(resourceFilePath, await this.pdfExtractDir());
|
||||||
const results: RecognizeResult[] = [];
|
const results: RecognizeResult[] = [];
|
||||||
|
|
||||||
|
@ -9,6 +9,7 @@ import * as fs from 'fs-extra';
|
|||||||
import * as pdfJsNamespace from 'pdfjs-dist';
|
import * as pdfJsNamespace from 'pdfjs-dist';
|
||||||
import { writeFile } from 'fs/promises';
|
import { writeFile } from 'fs/promises';
|
||||||
import { ResourceEntity } from './services/database/types';
|
import { ResourceEntity } from './services/database/types';
|
||||||
|
import { TextItem } from 'pdfjs-dist/types/src/display/api';
|
||||||
|
|
||||||
const { FileApiDriverLocal } = require('./file-api-driver-local');
|
const { FileApiDriverLocal } = require('./file-api-driver-local');
|
||||||
const mimeUtils = require('./mime-utils.js').mime;
|
const mimeUtils = require('./mime-utils.js').mime;
|
||||||
@ -734,6 +735,26 @@ function shimInit(options: ShimInitOptions = null) {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
 * Extracts the text that is already embedded in a PDF, without running OCR.
 *
 * @param pdfPath Path of the PDF, passed as-is to `pdfJs.getDocument`.
 * @returns One string per page, in page order. Within a page, the text items
 *   are joined with newlines. Items without a `str` property contribute an
 *   empty string.
 */
shim.pdfExtractEmbeddedText = async (pdfPath: string): Promise<string[]> => {
	const loadingTask = pdfJs.getDocument(pdfPath);

	try {
		const doc = await loadingTask.promise;
		const textByPage: string[] = [];

		// pdf.js page numbers are 1-based.
		for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
			const page = await doc.getPage(pageNum);
			const textContent = await page.getTextContent();

			const pageText = textContent.items
				.map(item => (item as TextItem).str ?? '')
				.join('\n');

			textByPage.push(pageText);
		}

		return textByPage;
	} finally {
		// Release the document and its worker whether extraction succeeded or
		// failed. Otherwise each processed PDF keeps its resources alive.
		await loadingTask.destroy();
	}
};
|
||||||
|
|
||||||
shim.pdfToImages = async (pdfPath: string, outputDirectoryPath: string): Promise<string[]> => {
|
shim.pdfToImages = async (pdfPath: string, outputDirectoryPath: string): Promise<string[]> => {
|
||||||
// We handle both the Electron app and testing framework. Potentially
|
// We handle both the Electron app and testing framework. Potentially
|
||||||
// the same code could be used to support the CLI app.
|
// the same code could be used to support the CLI app.
|
||||||
|
@ -277,6 +277,11 @@ const shim = {
|
|||||||
throw new Error('Not implemented');
|
throw new Error('Not implemented');
|
||||||
},
|
},
|
||||||
|
|
||||||
|
// Does not do OCR -- just extracts existing text from a PDF.
// Default placeholder: always throws. A working version is expected to be
// assigned over this property elsewhere.
pdfExtractEmbeddedText: async (_pdfPath: string): Promise<string[]> => {
	throw new Error('Not implemented: pdfExtractEmbeddedText');
},
|
||||||
|
|
||||||
pdfToImages: async (_pdfPath: string, _outputDirectoryPath: string): Promise<string[]> => {
|
pdfToImages: async (_pdfPath: string, _outputDirectoryPath: string): Promise<string[]> => {
|
||||||
throw new Error('Not implemented');
|
throw new Error('Not implemented');
|
||||||
},
|
},
|
||||||
|
Loading…
Reference in New Issue
Block a user