You've already forked joplin
mirror of
https://github.com/laurent22/joplin.git
synced 2025-12-02 22:49:09 +02:00
Desktop: Add support for OCR (#8975)
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
import BaseModel, { ModelType } from '../BaseModel';
|
||||
import shim from '../shim';
|
||||
import eventManager from '../eventManager';
|
||||
import eventManager, { EventName } from '../eventManager';
|
||||
import { ItemChangeEntity } from '../services/database/types';
|
||||
const Mutex = require('async-mutex').Mutex;
|
||||
|
||||
@@ -56,7 +56,7 @@ export default class ItemChange extends BaseModel {
|
||||
release();
|
||||
ItemChange.saveCalls_.pop();
|
||||
|
||||
eventManager.emit('itemChange', {
|
||||
eventManager.emit(EventName.ItemChange, {
|
||||
itemType: itemType,
|
||||
itemId: itemId,
|
||||
eventType: type,
|
||||
|
||||
@@ -436,7 +436,7 @@ export default class Note extends BaseItem {
|
||||
return this.modelSelectOne(`SELECT ${this.previewFieldsSql(options.fields)} FROM notes WHERE is_conflict = 0 AND id = ?`, [noteId]);
|
||||
}
|
||||
|
||||
public static async search(options: any = null) {
|
||||
public static async search(options: any = null): Promise<NoteEntity[]> {
|
||||
if (!options) options = {};
|
||||
if (!options.conditions) options.conditions = [];
|
||||
if (!options.conditionsParams) options.conditionsParams = [];
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import BaseModel from '../BaseModel';
|
||||
import { SqlQuery } from '../services/database/types';
|
||||
import { NoteEntity, SqlQuery } from '../services/database/types';
|
||||
import BaseItem from './BaseItem';
|
||||
import { LoadOptions } from './utils/types';
|
||||
|
||||
// - If is_associated = 1, note_resources indicates which note_id is currently associated with the given resource_id
|
||||
// - If is_associated = 0, note_resources indicates which note_id *was* associated with the given resource_id
|
||||
@@ -76,6 +77,30 @@ export default class NoteResource extends BaseModel {
|
||||
return rows.map((r: any) => r.note_id);
|
||||
}
|
||||
|
||||
// Looks up the notes currently associated (is_associated = 1) with each of
// the given resources and returns them grouped by resource ID.
//
// - resourceIds: IDs of the resources to look up. An empty list yields an
//   empty object without querying the database.
// - options: load options; `options.fields` selects the columns to return.
//   `resource_id` and `note_id` are always appended to that list.
//
// Because the query LEFT JOINs `notes`, a row is produced for an association
// even when no matching row exists in `notes`.
public static async associatedResourceNotes(resourceIds: string[], options: LoadOptions = null): Promise<Record<string, any>> {
	if (!resourceIds.length) return {};

	// Work on a copy so that the caller's field list is left untouched
	const requestedFields: string[] = options && options.fields ? (options.fields as string[]).slice() : [];
	const fields = requestedFields.concat(['resource_id', 'note_id']);

	const selectedFieldsSql = this.selectFields({ ...options, fields: fields });
	const quotedIds = resourceIds.join('", "');

	const rows = await this.modelSelectAll(`
		SELECT ${selectedFieldsSql}
		FROM note_resources
		LEFT JOIN notes
		ON notes.id = note_resources.note_id
		WHERE resource_id IN ("${quotedIds}") AND is_associated = 1
	`);

	// Group the flat result set by resource ID
	const notesByResourceId: Record<string, NoteEntity[]> = {};

	for (const row of rows) {
		const resourceId = row.resource_id;
		if (!notesByResourceId[resourceId]) notesByResourceId[resourceId] = [];
		notesByResourceId[resourceId].push(row);
	}

	return notesByResourceId;
}
|
||||
|
||||
public static async setAssociatedResources(noteId: string, resourceIds: string[]) {
|
||||
const existingRows = await this.modelSelectAll('SELECT * FROM note_resources WHERE note_id = ?', [noteId]);
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ import Resource from '../models/Resource';
|
||||
import shim from '../shim';
|
||||
import { ErrorCode } from '../errors';
|
||||
import { remove, pathExists } from 'fs-extra';
|
||||
import { ResourceEntity } from '../services/database/types';
|
||||
import { ResourceEntity, ResourceOcrStatus } from '../services/database/types';
|
||||
|
||||
const testImagePath = `${supportDir}/photo.jpg`;
|
||||
|
||||
@@ -152,4 +152,43 @@ describe('models/Resource', () => {
|
||||
cleanup();
|
||||
});
|
||||
|
||||
it('should return resources since a certain time and ID', async () => {
	// Nothing has been saved yet, so nothing should be returned
	const initialResources = await Resource.allForNormalization(0, '');
	expect(initialResources.length).toBe(0);

	// [resource ID, timestamp] pairs. Three of them share the same timestamp
	// and are deliberately listed out of ID order.
	const fixtures: [string, number][] = [
		['00000000000000000000000000000001', 1536700000000],
		['ddddddddddddddddddddddddddddddd1', 1536700000001],
		['ddddddddddddddddddddddddddddddd3', 1536700000001],
		['ddddddddddddddddddddddddddddddd2', 1536700000001],
		['bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb1', 1536700000002],
	];

	for (const fixture of fixtures) {
		const resourceId = fixture[0];
		const timestamp = fixture[1];

		await Resource.save({
			id: resourceId,
			created_time: timestamp,
			updated_time: timestamp,
			user_updated_time: timestamp,
			user_created_time: timestamp,
			mime: 'application/octet-stream',
			ocr_text: 'test',
			ocr_status: ResourceOcrStatus.Done,
		}, { isNew: true, autoTimestamp: false });
	}

	const allResources = await Resource.allForNormalization(0, '');
	expect(allResources.length).toBe(fixtures.length);

	// Starting point in the middle of the group that shares a timestamp:
	// only the remaining ID of that group and the later resource are expected
	const afterSharedTimestamp = await Resource.allForNormalization(1536700000001, 'ddddddddddddddddddddddddddddddd2');
	expect(afterSharedTimestamp.length).toBe(2);
	expect(afterSharedTimestamp.map(r => r.id)).toEqual(['ddddddddddddddddddddddddddddddd3', 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb1']);

	// Starting point on the very first resource: everything else is
	// expected, ordered by timestamp and then by ID
	const afterFirstResource = await Resource.allForNormalization(1536700000000, '00000000000000000000000000000001');
	expect(afterFirstResource.length).toBe(4);
	expect(afterFirstResource.map(r => r.id)).toEqual(['ddddddddddddddddddddddddddddddd1', 'ddddddddddddddddddddddddddddddd2', 'ddddddddddddddddddddddddddddddd3', 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb1']);
});
|
||||
|
||||
});
|
||||
|
||||
@@ -5,7 +5,7 @@ import NoteResource from './NoteResource';
|
||||
import Setting from './Setting';
|
||||
import markdownUtils from '../markdownUtils';
|
||||
import { _ } from '../locale';
|
||||
import { ResourceEntity, ResourceLocalStateEntity } from '../services/database/types';
|
||||
import { ResourceEntity, ResourceLocalStateEntity, ResourceOcrStatus, SqlQuery } from '../services/database/types';
|
||||
import ResourceLocalState from './ResourceLocalState';
|
||||
const pathUtils = require('../path-utils');
|
||||
const { mime } = require('../mime-utils.js');
|
||||
@@ -15,9 +15,13 @@ import JoplinError from '../JoplinError';
|
||||
import itemCanBeEncrypted from './utils/itemCanBeEncrypted';
|
||||
import { getEncryptionEnabled } from '../services/synchronizer/syncInfoUtils';
|
||||
import ShareService from '../services/share/ShareService';
|
||||
import { LoadOptions } from './utils/types';
|
||||
import { SaveOptions } from './utils/types';
|
||||
import { MarkupLanguage } from '@joplin/renderer';
|
||||
import { htmlentities } from '@joplin/utils/html';
|
||||
import { RecognizeResultLine } from '../services/ocr/utils/types';
|
||||
import eventManager, { EventName } from '../eventManager';
|
||||
import { unique } from '../array';
|
||||
|
||||
export default class Resource extends BaseItem {
|
||||
|
||||
@@ -87,8 +91,9 @@ export default class Resource extends BaseItem {
|
||||
return await this.db().exec('UPDATE resource_local_states SET fetch_status = ? WHERE fetch_status = ?', [Resource.FETCH_STATUS_IDLE, Resource.FETCH_STATUS_STARTED]);
|
||||
}
|
||||
|
||||
public static resetErrorStatus(resourceId: string) {
|
||||
return this.db().exec('UPDATE resource_local_states SET fetch_status = ?, fetch_error = "" WHERE resource_id = ?', [Resource.FETCH_STATUS_IDLE, resourceId]);
|
||||
public static async resetFetchErrorStatus(resourceId: string) {
|
||||
await this.db().exec('UPDATE resource_local_states SET fetch_status = ?, fetch_error = "" WHERE resource_id = ?', [Resource.FETCH_STATUS_IDLE, resourceId]);
|
||||
await this.resetOcrStatus(resourceId);
|
||||
}
|
||||
|
||||
public static fsDriver() {
|
||||
@@ -284,7 +289,7 @@ export default class Resource extends BaseItem {
|
||||
return url.substr(2);
|
||||
}
|
||||
|
||||
public static async localState(resourceOrId: any) {
|
||||
public static async localState(resourceOrId: any): Promise<ResourceLocalStateEntity> {
|
||||
return ResourceLocalState.byResourceId(typeof resourceOrId === 'object' ? resourceOrId.id : resourceOrId);
|
||||
}
|
||||
|
||||
@@ -323,6 +328,7 @@ export default class Resource extends BaseItem {
|
||||
await super.batchDelete([id], options);
|
||||
await this.fsDriver().remove(path);
|
||||
await NoteResource.deleteByResource(id); // Clean up note/resource relationships
|
||||
await this.db().exec('DELETE FROM items_normalized WHERE item_id = ?', [id]);
|
||||
}
|
||||
|
||||
await ResourceLocalState.batchDelete(ids);
|
||||
@@ -454,6 +460,21 @@ export default class Resource extends BaseItem {
|
||||
return folder;
|
||||
}
|
||||
|
||||
// Tells whether a conflict between the local and remote versions of a
// resource must be handled.
//
// Returns true when `blob_updated_time` differs between the two versions,
// false otherwise. Throws if the two entities do not have the same ID.
public static mustHandleConflict(local: ResourceEntity, remote: ResourceEntity) {
	// Comparing two different resources should never happen, so fail loudly
	const isSameResource = local.id === remote.id;
	if (!isSameResource) throw new Error('Cannot handle conflict for two different resources');

	// A changed content is the only case in which the conflict has to be
	// handled. When nothing changed, or when only the metadata changed, the
	// remote version is simply kept: most of the resource metadata is not
	// user-editable so no data is lost. Such a conflict might happen for
	// example if a resource is OCRed by two different clients.
	const contentHasChanged = local.blob_updated_time !== remote.blob_updated_time;
	return contentHasChanged;
}
|
||||
|
||||
public static async createConflictResourceNote(resource: ResourceEntity) {
|
||||
const Note = this.getClass('Note');
|
||||
const conflictResource = await Resource.duplicateResource(resource.id);
|
||||
@@ -465,10 +486,90 @@ export default class Resource extends BaseItem {
|
||||
}, { changeSource: ItemChange.SOURCE_SYNC });
|
||||
}
|
||||
|
||||
// Builds the query shared by the "need OCR" lookups: it matches the
// resources whose `ocr_status` is Todo, that are not encrypted
// (encryption_applied = 0) and whose MIME type is in `supportedMimeTypes`.
//
// - selectSql: the SQL placed right after SELECT (a column list or an
//   aggregate).
// - supportedMimeTypes: MIME types, inlined as-is in the IN (...) clause.
//
// The returned SQL stops after the WHERE conditions.
private static baseNeedOcrQuery(selectSql: string, supportedMimeTypes: string[]): SqlQuery {
	const quotedMimeTypes = supportedMimeTypes.join('","');

	const sql = `
		SELECT ${selectSql}
		FROM resources
		WHERE
			ocr_status = ? AND
			encryption_applied = 0 AND
			mime IN ("${quotedMimeTypes}")
	`;

	// The only bound parameter is the OCR status
	const params = [
		ResourceOcrStatus.Todo,
	];

	return { sql: sql, params: params };
}
|
||||
|
||||
// Counts the resources matched by `baseNeedOcrQuery()` for the given MIME
// types. Returns 0 when the database returns no row.
public static async needOcrCount(supportedMimeTypes: string[]): Promise<number> {
	const countQuery = this.baseNeedOcrQuery('count(*) as total', supportedMimeTypes);
	const row = await this.db().selectOne(countQuery.sql, countQuery.params);
	if (!row) return 0;
	return row['total'];
}
|
||||
|
||||
// Returns the resources matched by `baseNeedOcrQuery()`, most recently
// updated first.
//
// - supportedMimeTypes: MIME types to include.
// - skippedResourceIds: IDs of resources to leave out of the result. When
//   empty, no exclusion clause is added.
// - limit: maximum number of rows, inlined in the LIMIT clause.
// - options: load options used to build the list of selected fields.
public static async needOcr(supportedMimeTypes: string[], skippedResourceIds: string[], limit: number, options: LoadOptions): Promise<ResourceEntity[]> {
	const baseQuery = this.baseNeedOcrQuery(this.selectFields(options), supportedMimeTypes);

	let exclusionSql = '';
	if (skippedResourceIds.length) {
		exclusionSql = `AND resources.id NOT IN ("${skippedResourceIds.join('","')}")`;
	}

	const sql = `
		${baseQuery.sql}
		${exclusionSql}
		ORDER BY updated_time DESC
		LIMIT ${limit}
	`;

	return await this.db().selectAll(sql, baseQuery.params);
}
|
||||
|
||||
// Saves the given resource with an empty OCR error, an empty OCR text and
// its OCR status set back to Todo.
private static async resetOcrStatus(resourceId: string) {
	const resetFields: ResourceEntity = {
		id: resourceId,
		ocr_error: '',
		ocr_text: '',
		ocr_status: ResourceOcrStatus.Todo,
	};

	await Resource.save(resetFields);
}
|
||||
|
||||
// Converts OCR result lines to a JSON string. A missing or empty list is
// serialized to an empty string.
public static serializeOcrDetails(details: RecognizeResultLine[]) {
	const hasDetails = Boolean(details && details.length);
	return hasDetails ? JSON.stringify(details) : '';
}
|
||||
|
||||
// Parses a JSON string of OCR result lines.
//
// - s: the serialized details. An empty (falsy) string yields null.
//
// Returns the parsed array, or null when the input is empty or when the JSON
// decodes to a falsy value (for example `null`).
//
// Throws when the string is not valid JSON or when the decoded value is not
// an array. In both cases the original error object is re-thrown with its
// message prefixed by "Could not unserialize OCR data: ".
public static unserializeOcrDetails(s: string): RecognizeResultLine[] | null {
	if (!s) return null;

	try {
		const parsed = JSON.parse(s);
		if (!parsed) return null;
		if (!Array.isArray(parsed)) throw new Error('OCR details are not valid (not an array)');
		return parsed;
	} catch (error) {
		// Add context to the message but keep the same error object, so that
		// its type and stack trace are preserved
		error.message = `Could not unserialize OCR data: ${error.message}`;
		throw error;
	}
}
|
||||
|
||||
// Loads the `id` and `ocr_text` columns of the resources with the given IDs.
// The list is passed through `unique()` before being inlined in the query.
// An empty list yields an empty array without querying the database.
public static async resourceOcrTextsByIds(ids: string[]): Promise<ResourceEntity[]> {
	if (!ids.length) return [];

	const quotedIds = unique(ids).join('","');
	return this.modelSelectAll(`SELECT id, ocr_text FROM resources WHERE id IN ("${quotedIds}")`);
}
|
||||
|
||||
// Returns the resources that have a non-empty `ocr_text` and an `ocr_status`
// of Done, and that come strictly after the (updatedTime, id) pair when
// ordered by `updated_time` and then by `id`, both ascending.
//
// - updatedTime, id: the pair after which results start. The pair itself is
//   excluded.
// - limit: maximum number of rows returned (100 by default).
// - options: load options used to build the list of selected fields.
public static allForNormalization(updatedTime: number, id: string, limit = 100, options: LoadOptions = null) {
	const sql = `
		SELECT ${this.selectFields(options)} FROM resources
		WHERE (updated_time, id) > (?, ?)
		AND ocr_text != ""
		AND ocr_status = ?
		ORDER BY updated_time ASC, id ASC
		LIMIT ?
	`;

	// Same order as the placeholders in the query above
	const params = [updatedTime, id, ResourceOcrStatus.Done, limit];

	return this.modelSelectAll<ResourceEntity>(sql, params);
}
|
||||
|
||||
public static async save(o: ResourceEntity, options: SaveOptions = null): Promise<ResourceEntity> {
|
||||
const resource = { ...o };
|
||||
|
||||
if (this.isNew(o, options)) {
|
||||
const isNew = this.isNew(o, options);
|
||||
|
||||
if (isNew) {
|
||||
const now = Date.now();
|
||||
options = { ...options, autoTimestamp: false };
|
||||
if (!resource.created_time) resource.created_time = now;
|
||||
@@ -476,7 +577,9 @@ export default class Resource extends BaseItem {
|
||||
if (!resource.blob_updated_time) resource.blob_updated_time = now;
|
||||
}
|
||||
|
||||
return await super.save(resource, options);
|
||||
const output = await super.save(resource, options);
|
||||
if (isNew) eventManager.emit(EventName.ResourceCreate);
|
||||
return output;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import shim from '../shim';
|
||||
import { _, supportedLocalesToLanguages, defaultLocale } from '../locale';
|
||||
import eventManager from '../eventManager';
|
||||
import eventManager, { EventName } from '../eventManager';
|
||||
import BaseModel from '../BaseModel';
|
||||
import Database from '../database';
|
||||
import SyncTargetRegistry from '../SyncTargetRegistry';
|
||||
@@ -837,6 +837,17 @@ class Setting extends BaseModel {
|
||||
isGlobal: true,
|
||||
},
|
||||
|
||||
'ocr.enabled': {
|
||||
value: false,
|
||||
type: SettingItemType.Bool,
|
||||
public: true,
|
||||
appTypes: [AppType.Desktop],
|
||||
label: () => _('Enable optical character recognition (OCR)'),
|
||||
description: () => _('When enabled, the application will scan your attachments and extract the text from it. This will allow you to search for text in these attachments.'),
|
||||
storage: SettingStorage.File,
|
||||
isGlobal: true,
|
||||
},
|
||||
|
||||
theme: {
|
||||
value: Setting.THEME_LIGHT,
|
||||
type: SettingItemType.Int,
|
||||
@@ -1592,6 +1603,7 @@ class Setting extends BaseModel {
|
||||
'revisionService.lastProcessedChangeId': { value: 0, type: SettingItemType.Int, public: false },
|
||||
|
||||
'searchEngine.initialIndexingDone': { value: false, type: SettingItemType.Bool, public: false },
|
||||
'searchEngine.lastProcessedResource': { value: '', type: SettingItemType.String, public: false },
|
||||
|
||||
'revisionService.enabled': { section: 'revisionService', storage: SettingStorage.File, value: true, type: SettingItemType.Bool, public: true, label: () => _('Enable note history') },
|
||||
'revisionService.ttlDays': {
|
||||
@@ -2490,7 +2502,7 @@ class Setting extends BaseModel {
|
||||
|
||||
const keys = this.changedKeys_.slice();
|
||||
this.changedKeys_ = [];
|
||||
eventManager.emit('settingsChange', { keys });
|
||||
eventManager.emit(EventName.SettingsChange, { keys });
|
||||
}
|
||||
|
||||
public static scheduleSave() {
|
||||
|
||||
Reference in New Issue
Block a user