1
0
mirror of https://github.com/laurent22/joplin.git synced 2025-01-02 12:47:41 +02:00
joplin/packages/lib/services/ocr/OcrService.ts

218 lines
6.5 KiB
TypeScript

import { toIso639 } from '../../locale';
import Resource from '../../models/Resource';
import Setting from '../../models/Setting';
import shim from '../../shim';
import { ResourceEntity, ResourceOcrStatus } from '../database/types';
import OcrDriverBase from './OcrDriverBase';
import { RecognizeResult } from './utils/types';
import { Minute } from '@joplin/utils/time';
import Logger from '@joplin/utils/Logger';
import filterOcrText from './utils/filterOcrText';
import TaskQueue from '../../TaskQueue';
import eventManager, { EventName } from '../../eventManager';
// Logger created under the name 'OcrService'; used by the code below for
// progress and error reporting.
const logger = Logger.create('OcrService');
// MIME types of the resources that are considered for OCR. This list is
// passed to Resource.needOcrCount() and Resource.needOcr() to select the
// resources to process.
//
// The image formats are from:
// https://github.com/naptha/tesseract.js/blob/master/docs/image-format.md
//
// 'application/pdf' is listed as well because OcrService.recognize() handles
// PDFs separately: it first tries the embedded text, then falls back to
// converting the pages to images before passing them to the OCR driver.
export const supportedMimeTypes = [
'application/pdf',
'image/bmp',
'image/jpeg',
'image/jpg',
'image/png',
'image/webp',
'image/x-portable-bitmap',
];
// Builds the short "<id> (type <mime>)" label used to identify a resource in
// log and error messages.
const resourceInfo = ({ id, mime }: ResourceEntity) => `${id} (type ${mime})`;
/**
 * Extracts text from resources (images and PDFs) using the supplied OCR
 * driver, and saves the result back onto each resource.
 *
 * The service can be run once via `processResources()`/`maintenance()`, or
 * periodically via `runInBackground()`/`stopRunInBackground()`.
 */
export default class OcrService {

	private driver_: OcrDriverBase;
	private isRunningInBackground_ = false;
	// Handle returned by shim.setInterval(), or null when no timer is active.
	private maintenanceTimer_: any = null;
	// Lazily created by pdfExtractDir().
	private pdfExtractDir_: string = null;
	// Guards against overlapping processResources() runs.
	private isProcessingResources_ = false;
	private recognizeQueue_: TaskQueue = null;

	/**
	 * @param driver - OCR backend that performs the actual text recognition.
	 */
	public constructor(driver: OcrDriverBase) {
		this.driver_ = driver;
		this.recognizeQueue_ = new TaskQueue('recognize', logger);
		this.recognizeQueue_.setConcurrency(5);
		this.recognizeQueue_.keepTaskResults = false;
	}

	/**
	 * Returns the directory into which PDF pages are extracted as images,
	 * creating it on first use and caching the path afterwards.
	 */
	private async pdfExtractDir(): Promise<string> {
		if (this.pdfExtractDir_ !== null) return this.pdfExtractDir_;
		const p = `${Setting.value('tempDir')}/ocr_pdf_extract`;
		await shim.fsDriver().mkdir(p);
		this.pdfExtractDir_ = p;
		return this.pdfExtractDir_;
	}

	/**
	 * Whether the background service is currently started, i.e.
	 * `runInBackground()` was called and `stopRunInBackground()` has not been
	 * called since.
	 */
	public get running() {
		// Return the state flag. Returning the `runInBackground` method itself
		// would always be truthy, regardless of the actual state.
		return this.isRunningInBackground_;
	}

	/**
	 * Extracts the text of a single resource.
	 *
	 * For PDFs, the embedded text is used when at least one page has
	 * non-blank text. Otherwise each page is converted to an image and passed
	 * to the OCR driver. Other resources are passed to the driver directly.
	 *
	 * @param language - Language code forwarded to the OCR driver.
	 * @param resource - Resource to process. Must not be encrypted.
	 * @returns The recognition result. For PDFs only `text` is set, with the
	 * pages joined by newlines.
	 * @throws Error if the resource is encrypted, or whatever the PDF
	 * helpers or the OCR driver throw.
	 */
	private async recognize(language: string, resource: ResourceEntity): Promise<RecognizeResult> {
		if (resource.encryption_applied) throw new Error(`Cannot OCR encrypted resource: ${resource.id}`);

		const resourceFilePath = Resource.fullPath(resource);

		if (resource.mime === 'application/pdf') {
			// OCR can be slow for large PDFs.
			// Skip it if the PDF already includes text.
			const pageTexts = await shim.pdfExtractEmbeddedText(resourceFilePath);
			const pagesWithText = pageTexts.filter(text => !!text.trim().length);

			if (pagesWithText.length > 0) {
				return {
					text: pageTexts.join('\n'),
				};
			}

			const imageFilePaths = await shim.pdfToImages(resourceFilePath, await this.pdfExtractDir());
			const results: RecognizeResult[] = [];

			try {
				let pageIndex = 0;
				for (const imageFilePath of imageFilePaths) {
					logger.info(`Recognize: ${resourceInfo(resource)}: Processing PDF page ${pageIndex + 1} / ${imageFilePaths.length}...`);
					results.push(await this.driver_.recognize(language, imageFilePath));
					pageIndex++;
				}
			} finally {
				// Always remove the temporary page images, including when the
				// driver fails part-way through, so that they do not
				// accumulate in the temp directory.
				for (const imageFilePath of imageFilePaths) {
					await shim.fsDriver().remove(imageFilePath);
				}
			}

			return {
				text: results.map(r => r.text).join('\n'),
			};
		} else {
			return this.driver_.recognize(language, resourceFilePath);
		}
	}

	/**
	 * Disposes of the underlying OCR driver.
	 */
	public async dispose() {
		await this.driver_.dispose();
	}

	/**
	 * Processes all the resources that currently need OCR.
	 *
	 * Resources are fetched in batches of 100 and pushed to the recognition
	 * queue. For each one, the recognised text (or the error) is saved on the
	 * resource. Resources that have not been downloaded yet are skipped
	 * without being modified. Does nothing if a run is already in progress.
	 */
	public async processResources() {
		if (this.isProcessingResources_) return;
		this.isProcessingResources_ = true;

		// Everything after the flag is set runs inside the try block so that
		// the flag is always reset, including when the initial count query
		// fails. Otherwise one failure would block all subsequent runs.
		try {
			const totalResourcesToProcess = await Resource.needOcrCount(supportedMimeTypes);

			// Both lists are passed to Resource.needOcr() as exclusions so
			// that the same resource is not returned again by the next batch.
			const inProcessResourceIds: string[] = [];
			const skippedResourceIds: string[] = [];

			logger.info(`Found ${totalResourcesToProcess} resources to process...`);

			const makeQueueAction = (totalProcessed: number, language: string, resource: ResourceEntity) => {
				return async () => {
					logger.info(`Processing resource ${totalProcessed + 1} / ${totalResourcesToProcess}: ${resourceInfo(resource)}...`);

					const toSave: ResourceEntity = {
						id: resource.id,
					};

					try {
						const fetchStatus = await Resource.localState(resource.id);

						if (fetchStatus.fetch_status === Resource.FETCH_STATUS_ERROR) {
							throw new Error(`Cannot process resource ${resourceInfo(resource)} because it cannot be fetched from the server: ${fetchStatus.fetch_error}`);
						}

						if (fetchStatus.fetch_status !== Resource.FETCH_STATUS_DONE) {
							// Not an error - the resource is left untouched so
							// that it can be picked up by a later run.
							skippedResourceIds.push(resource.id);
							logger.info(`Skipping resource ${resourceInfo(resource)} because it has not been downloaded yet`);
							return;
						}

						const result = await this.recognize(language, resource);
						toSave.ocr_status = ResourceOcrStatus.Done;
						toSave.ocr_text = filterOcrText(result.text);
						toSave.ocr_details = Resource.serializeOcrDetails(result.lines);
						toSave.ocr_error = '';
					} catch (error) {
						const errorMessage = typeof error === 'string' ? error : error?.message;
						logger.warn(`Could not process resource ${resourceInfo(resource)}`, error);
						toSave.ocr_status = ResourceOcrStatus.Error;
						toSave.ocr_text = '';
						toSave.ocr_details = '';
						toSave.ocr_error = errorMessage || 'Unknown error';
					}

					await Resource.save(toSave);
				};
			};

			const language = toIso639(Setting.value('locale'));

			// Number of resources pushed to the queue. This includes the ones
			// that end up being skipped or that fail.
			let totalProcessed = 0;

			while (true) {
				const resources = await Resource.needOcr(supportedMimeTypes, skippedResourceIds.concat(inProcessResourceIds), 100, {
					fields: [
						'id',
						'mime',
						'file_extension',
						'encryption_applied',
					],
				});

				if (!resources.length) break;

				for (const resource of resources) {
					inProcessResourceIds.push(resource.id);
					await this.recognizeQueue_.pushAsync(resource.id, makeQueueAction(totalProcessed++, language, resource));
				}
			}

			await this.recognizeQueue_.waitForAll();

			if (totalProcessed) {
				eventManager.emit(EventName.OcrServiceResourcesProcessed);
			}

			logger.info(`${totalProcessed} resources have been processed.`);
		} finally {
			this.isProcessingResources_ = false;
		}
	}

	/**
	 * Runs one maintenance pass, which currently consists of processing the
	 * resources that need OCR.
	 */
	public async maintenance() {
		await this.processResources();
	}

	/**
	 * Starts the background service: runs one maintenance pass immediately,
	 * then one every 5 minutes. Does nothing if already started.
	 */
	public async runInBackground() {
		if (this.isRunningInBackground_) return;

		this.isRunningInBackground_ = true;

		if (this.maintenanceTimer_) return;

		logger.info('Starting background service...');

		await this.maintenance();

		// The handle must be kept for as long as the interval is active, so
		// that stopRunInBackground() can clear it. It is only reset there.
		this.maintenanceTimer_ = shim.setInterval(async () => {
			try {
				await this.maintenance();
			} catch (error) {
				// There is no caller to report to from a timer callback, so
				// log the error rather than leaving the rejection unhandled.
				logger.warn('Background maintenance failed', error);
			}
		}, 5 * Minute);
	}

	/**
	 * Stops the background service: clears the maintenance timer, if any, and
	 * stops the recognition queue.
	 */
	public async stopRunInBackground() {
		logger.info('Stopping background service...');

		if (this.maintenanceTimer_) shim.clearInterval(this.maintenanceTimer_);
		this.maintenanceTimer_ = null;
		this.isRunningInBackground_ = false;
		await this.recognizeQueue_.stop();
	}

}