1
0
mirror of https://github.com/laurent22/joplin.git synced 2025-12-02 22:49:09 +02:00

Desktop: Add support for OCR (#8975)

This commit is contained in:
Laurent Cozic
2023-12-13 19:24:58 +00:00
committed by GitHub
parent 0e847685ff
commit bce94f1775
79 changed files with 2381 additions and 445 deletions

View File

@@ -1,6 +1,6 @@
import BaseModel, { ModelType } from '../BaseModel';
import shim from '../shim';
import eventManager from '../eventManager';
import eventManager, { EventName } from '../eventManager';
import { ItemChangeEntity } from '../services/database/types';
const Mutex = require('async-mutex').Mutex;
@@ -56,7 +56,7 @@ export default class ItemChange extends BaseModel {
release();
ItemChange.saveCalls_.pop();
eventManager.emit('itemChange', {
eventManager.emit(EventName.ItemChange, {
itemType: itemType,
itemId: itemId,
eventType: type,

View File

@@ -436,7 +436,7 @@ export default class Note extends BaseItem {
return this.modelSelectOne(`SELECT ${this.previewFieldsSql(options.fields)} FROM notes WHERE is_conflict = 0 AND id = ?`, [noteId]);
}
public static async search(options: any = null) {
public static async search(options: any = null): Promise<NoteEntity[]> {
if (!options) options = {};
if (!options.conditions) options.conditions = [];
if (!options.conditionsParams) options.conditionsParams = [];

View File

@@ -1,6 +1,7 @@
import BaseModel from '../BaseModel';
import { SqlQuery } from '../services/database/types';
import { NoteEntity, SqlQuery } from '../services/database/types';
import BaseItem from './BaseItem';
import { LoadOptions } from './utils/types';
// - If is_associated = 1, note_resources indicates which note_id is currently associated with the given resource_id
// - If is_associated = 0, note_resources indicates which note_id *was* associated with the given resource_id
@@ -76,6 +77,30 @@ export default class NoteResource extends BaseModel {
return rows.map((r: any) => r.note_id);
}
// Loads, for each of the given resource IDs, the rows of `note_resources`
// (left-joined with `notes`) that are flagged `is_associated = 1`, and groups
// them by resource ID.
//
// The `resource_id` and `note_id` columns are always selected, in addition to
// whatever is requested via `options.fields`. An empty object is returned
// when `resourceIds` is empty.
public static async associatedResourceNotes(resourceIds: string[], options: LoadOptions = null): Promise<Record<string, any>> {
	if (!resourceIds.length) return {};

	// Copy the requested fields so that the caller's array is left untouched
	const fields: string[] = options && options.fields ? (options.fields as string[]).slice() : [];
	fields.push('resource_id', 'note_id');

	const sql = [
		'',
		`SELECT ${this.selectFields({ ...options, fields })}`,
		'FROM note_resources',
		'LEFT JOIN notes',
		'ON notes.id = note_resources.note_id',
		`WHERE resource_id IN ("${resourceIds.join('", "')}") AND is_associated = 1`,
		'',
	].join('\n');

	const rows = await this.modelSelectAll(sql);

	// Bucket the rows by the resource they are attached to
	const rowsByResourceId: Record<string, NoteEntity[]> = {};
	for (const row of rows) {
		const bucket = rowsByResourceId[row.resource_id];
		if (bucket) {
			bucket.push(row);
		} else {
			rowsByResourceId[row.resource_id] = [row];
		}
	}

	return rowsByResourceId;
}
public static async setAssociatedResources(noteId: string, resourceIds: string[]) {
const existingRows = await this.modelSelectAll('SELECT * FROM note_resources WHERE note_id = ?', [noteId]);

View File

@@ -5,7 +5,7 @@ import Resource from '../models/Resource';
import shim from '../shim';
import { ErrorCode } from '../errors';
import { remove, pathExists } from 'fs-extra';
import { ResourceEntity } from '../services/database/types';
import { ResourceEntity, ResourceOcrStatus } from '../services/database/types';
const testImagePath = `${supportDir}/photo.jpg`;
@@ -152,4 +152,43 @@ describe('models/Resource', () => {
cleanup();
});
it('should return resources since a certain time and ID', async () => {
	// Expected to be empty before this test has saved anything
	expect((await Resource.allForNormalization(0, '')).length).toBe(0);

	// [resource ID, timestamp] pairs. Three of them share the same timestamp
	// and are deliberately not listed in ID order.
	const fixtures: [string, number][] = [
		['00000000000000000000000000000001', 1536700000000],
		['ddddddddddddddddddddddddddddddd1', 1536700000001],
		['ddddddddddddddddddddddddddddddd3', 1536700000001],
		['ddddddddddddddddddddddddddddddd2', 1536700000001],
		['bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb1', 1536700000002],
	];

	for (const fixture of fixtures) {
		const resourceId = fixture[0];
		const timestamp = fixture[1];

		await Resource.save({
			id: resourceId,
			created_time: timestamp,
			updated_time: timestamp,
			user_updated_time: timestamp,
			user_created_time: timestamp,
			mime: 'application/octet-stream',
			ocr_text: 'test',
			ocr_status: ResourceOcrStatus.Done,
		}, { isNew: true, autoTimestamp: false });
	}

	// Starting from (0, ''), every saved resource is returned
	const everything = await Resource.allForNormalization(0, '');
	expect(everything.length).toBe(fixtures.length);

	// Starting from the middle of the group that shares a timestamp
	const afterSharedTimestamp = await Resource.allForNormalization(1536700000001, 'ddddddddddddddddddddddddddddddd2');
	expect(afterSharedTimestamp.length).toBe(2);
	expect(afterSharedTimestamp.map(r => r.id)).toEqual(['ddddddddddddddddddddddddddddddd3', 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb1']);

	// Starting from the very first resource, which itself is excluded
	const afterFirst = await Resource.allForNormalization(1536700000000, '00000000000000000000000000000001');
	expect(afterFirst.length).toBe(4);
	expect(afterFirst.map(r => r.id)).toEqual(['ddddddddddddddddddddddddddddddd1', 'ddddddddddddddddddddddddddddddd2', 'ddddddddddddddddddddddddddddddd3', 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb1']);
});
});

View File

@@ -5,7 +5,7 @@ import NoteResource from './NoteResource';
import Setting from './Setting';
import markdownUtils from '../markdownUtils';
import { _ } from '../locale';
import { ResourceEntity, ResourceLocalStateEntity } from '../services/database/types';
import { ResourceEntity, ResourceLocalStateEntity, ResourceOcrStatus, SqlQuery } from '../services/database/types';
import ResourceLocalState from './ResourceLocalState';
const pathUtils = require('../path-utils');
const { mime } = require('../mime-utils.js');
@@ -15,9 +15,13 @@ import JoplinError from '../JoplinError';
import itemCanBeEncrypted from './utils/itemCanBeEncrypted';
import { getEncryptionEnabled } from '../services/synchronizer/syncInfoUtils';
import ShareService from '../services/share/ShareService';
import { LoadOptions } from './utils/types';
import { SaveOptions } from './utils/types';
import { MarkupLanguage } from '@joplin/renderer';
import { htmlentities } from '@joplin/utils/html';
import { RecognizeResultLine } from '../services/ocr/utils/types';
import eventManager, { EventName } from '../eventManager';
import { unique } from '../array';
export default class Resource extends BaseItem {
@@ -87,8 +91,9 @@ export default class Resource extends BaseItem {
return await this.db().exec('UPDATE resource_local_states SET fetch_status = ? WHERE fetch_status = ?', [Resource.FETCH_STATUS_IDLE, Resource.FETCH_STATUS_STARTED]);
}
public static resetErrorStatus(resourceId: string) {
return this.db().exec('UPDATE resource_local_states SET fetch_status = ?, fetch_error = "" WHERE resource_id = ?', [Resource.FETCH_STATUS_IDLE, resourceId]);
// Puts the local state of the given resource back to FETCH_STATUS_IDLE with
// an empty fetch error, then resets the OCR status of the resource as well.
public static async resetFetchErrorStatus(resourceId: string) {
	const sql = 'UPDATE resource_local_states SET fetch_status = ?, fetch_error = "" WHERE resource_id = ?';
	const params = [Resource.FETCH_STATUS_IDLE, resourceId];

	await this.db().exec(sql, params);
	await this.resetOcrStatus(resourceId);
}
public static fsDriver() {
@@ -284,7 +289,7 @@ export default class Resource extends BaseItem {
return url.substr(2);
}
public static async localState(resourceOrId: any) {
// Looks up the ResourceLocalState entry of a resource. `resourceOrId` can be
// either an object, in which case its `id` property is used, or the resource
// ID itself.
public static async localState(resourceOrId: any): Promise<ResourceLocalStateEntity> {
	let resourceId = resourceOrId;
	if (typeof resourceOrId === 'object') resourceId = resourceOrId.id;
	return ResourceLocalState.byResourceId(resourceId);
}
@@ -323,6 +328,7 @@ export default class Resource extends BaseItem {
await super.batchDelete([id], options);
await this.fsDriver().remove(path);
await NoteResource.deleteByResource(id); // Clean up note/resource relationships
await this.db().exec('DELETE FROM items_normalized WHERE item_id = ?', [id]);
}
await ResourceLocalState.batchDelete(ids);
@@ -454,6 +460,21 @@ export default class Resource extends BaseItem {
return folder;
}
// Tells whether a conflict between the local and remote versions of a
// resource needs to be handled. Throws if the two versions do not have the
// same ID.
public static mustHandleConflict(local: ResourceEntity, remote: ResourceEntity) {
	// Comparing two different resources is not expected to happen, so it is
	// treated as an error
	const isSameResource = local.id === remote.id;
	if (!isSameResource) throw new Error('Cannot handle conflict for two different resources');

	// The conflict only needs handling when the content (blob) has changed.
	// When nothing has changed, or when only the metadata has, the remote
	// version is simply kept: most of the resource metadata is not
	// user-editable so no data is lost. This can happen for example when the
	// same resource is OCRed by two different clients.
	const contentHasChanged = local.blob_updated_time !== remote.blob_updated_time;
	return contentHasChanged;
}
public static async createConflictResourceNote(resource: ResourceEntity) {
const Note = this.getClass('Note');
const conflictResource = await Resource.duplicateResource(resource.id);
@@ -465,10 +486,90 @@ export default class Resource extends BaseItem {
}, { changeSource: ItemChange.SOURCE_SYNC });
}
// Builds the query shared by the "need OCR" methods: it selects `selectSql`
// from the resources that are not encrypted, whose MIME type is one of
// `supportedMimeTypes`, and whose OCR status matches the bound parameter
// (ResourceOcrStatus.Todo).
private static baseNeedOcrQuery(selectSql: string, supportedMimeTypes: string[]): SqlQuery {
	const mimeTypeList = supportedMimeTypes.join('","');

	const sql = [
		'',
		`SELECT ${selectSql}`,
		'FROM resources',
		'WHERE',
		'ocr_status = ? AND',
		'encryption_applied = 0 AND',
		`mime IN ("${mimeTypeList}")`,
		'',
	].join('\n');

	return { sql, params: [ResourceOcrStatus.Todo] };
}
// Counts the resources matched by the base "need OCR" query for the given
// MIME types. Returns 0 when the query yields no row.
public static async needOcrCount(supportedMimeTypes: string[]): Promise<number> {
	const { sql, params } = this.baseNeedOcrQuery('count(*) as total', supportedMimeTypes);
	const row = await this.db().selectOne(sql, params);
	if (!row) return 0;
	return row['total'];
}
// Returns at most `limit` resources matched by the base "need OCR" query,
// most recently updated first, leaving out those listed in
// `skippedResourceIds`. The selected fields are derived from `options`.
public static async needOcr(supportedMimeTypes: string[], skippedResourceIds: string[], limit: number, options: LoadOptions): Promise<ResourceEntity[]> {
	const baseQuery = this.baseNeedOcrQuery(this.selectFields(options), supportedMimeTypes);

	// The exclusion clause is only needed when there is something to skip
	let exclusionSql = '';
	if (skippedResourceIds.length) {
		exclusionSql = `AND resources.id NOT IN ("${skippedResourceIds.join('","')}")`;
	}

	const sql = [
		'',
		baseQuery.sql,
		exclusionSql,
		'ORDER BY updated_time DESC',
		`LIMIT ${limit}`,
		'',
	].join('\n');

	return await this.db().selectAll(sql, baseQuery.params);
}
// Saves the resource with an empty OCR error, an empty OCR text and the
// ResourceOcrStatus.Todo status.
private static async resetOcrStatus(resourceId: string) {
	const resetOcrFields: ResourceEntity = {
		id: resourceId,
		ocr_error: '',
		ocr_text: '',
		ocr_status: ResourceOcrStatus.Todo,
	};

	await Resource.save(resetOcrFields);
}
// Converts the OCR result lines to a JSON string. A missing or empty list is
// serialized as an empty string.
public static serializeOcrDetails(details: RecognizeResultLine[]) {
	if (details && details.length) return JSON.stringify(details);
	return '';
}
// Parses OCR details from their JSON string form.
//
// Returns `null` when `s` is empty or when the parsed value is falsy (for
// example the JSON literal `null`). Throws when `s` is not valid JSON or when
// the parsed value is not an array; in both cases the message of the thrown
// error is prefixed with some context and the same error object is rethrown.
public static unserializeOcrDetails(s: string): RecognizeResultLine[] | null {
	if (!s) return null;

	try {
		const r = JSON.parse(s);
		if (!r) return null;
		if (!Array.isArray(r)) throw new Error('OCR details are not valid (not an array)');
		return r;
	} catch (error) {
		// Add context to the message but keep the original error object (and
		// its stack) so that callers can still inspect it
		error.message = `Could not unserialize OCR data: ${error.message}`;
		throw error;
	}
}
// Loads the `id` and `ocr_text` columns of the resources with the given IDs.
// Duplicate IDs are removed before querying, and an empty array is returned
// when no ID is provided.
public static async resourceOcrTextsByIds(ids: string[]): Promise<ResourceEntity[]> {
	if (!ids.length) return [];

	const idList = unique(ids).join('","');
	return this.modelSelectAll(`SELECT id, ocr_text FROM resources WHERE id IN ("${idList}")`);
}
// Returns up to `limit` resources (100 by default) that have a non-empty OCR
// text and the ResourceOcrStatus.Done status, and whose (updated_time, id)
// pair is strictly greater than the given (`updatedTime`, `id`) pair. The
// results are sorted by `updated_time`, then `id`, both ascending. The
// selected fields are derived from `options`.
public static allForNormalization(updatedTime: number, id: string, limit = 100, options: LoadOptions = null) {
	const sql = [
		'',
		`SELECT ${this.selectFields(options)} FROM resources`,
		'WHERE (updated_time, id) > (?, ?)',
		'AND ocr_text != ""',
		'AND ocr_status = ?',
		'ORDER BY updated_time ASC, id ASC',
		'LIMIT ?',
		'',
	].join('\n');

	return this.modelSelectAll<ResourceEntity>(sql, [updatedTime, id, ResourceOcrStatus.Done, limit]);
}
public static async save(o: ResourceEntity, options: SaveOptions = null): Promise<ResourceEntity> {
const resource = { ...o };
if (this.isNew(o, options)) {
const isNew = this.isNew(o, options);
if (isNew) {
const now = Date.now();
options = { ...options, autoTimestamp: false };
if (!resource.created_time) resource.created_time = now;
@@ -476,7 +577,9 @@ export default class Resource extends BaseItem {
if (!resource.blob_updated_time) resource.blob_updated_time = now;
}
return await super.save(resource, options);
const output = await super.save(resource, options);
if (isNew) eventManager.emit(EventName.ResourceCreate);
return output;
}
}

View File

@@ -1,6 +1,6 @@
import shim from '../shim';
import { _, supportedLocalesToLanguages, defaultLocale } from '../locale';
import eventManager from '../eventManager';
import eventManager, { EventName } from '../eventManager';
import BaseModel from '../BaseModel';
import Database from '../database';
import SyncTargetRegistry from '../SyncTargetRegistry';
@@ -837,6 +837,17 @@ class Setting extends BaseModel {
isGlobal: true,
},
'ocr.enabled': {
value: false,
type: SettingItemType.Bool,
public: true,
appTypes: [AppType.Desktop],
label: () => _('Enable optical character recognition (OCR)'),
description: () => _('When enabled, the application will scan your attachments and extract the text from it. This will allow you to search for text in these attachments.'),
storage: SettingStorage.File,
isGlobal: true,
},
theme: {
value: Setting.THEME_LIGHT,
type: SettingItemType.Int,
@@ -1592,6 +1603,7 @@ class Setting extends BaseModel {
'revisionService.lastProcessedChangeId': { value: 0, type: SettingItemType.Int, public: false },
'searchEngine.initialIndexingDone': { value: false, type: SettingItemType.Bool, public: false },
'searchEngine.lastProcessedResource': { value: '', type: SettingItemType.String, public: false },
'revisionService.enabled': { section: 'revisionService', storage: SettingStorage.File, value: true, type: SettingItemType.Bool, public: true, label: () => _('Enable note history') },
'revisionService.ttlDays': {
@@ -2490,7 +2502,7 @@ class Setting extends BaseModel {
const keys = this.changedKeys_.slice();
this.changedKeys_ = [];
eventManager.emit('settingsChange', { keys });
eventManager.emit(EventName.SettingsChange, { keys });
}
public static scheduleSave() {