2024-01-05 14:43:24 +00:00
|
|
|
import { createNoteAndResource, newOcrService, ocrSampleDir, resourceFetcher, setupDatabaseAndSynchronizer, supportDir, switchClient, synchronizerStart } from '../../testing/test-utils';
|
2023-12-13 19:24:58 +00:00
|
|
|
import { supportedMimeTypes } from './OcrService';
|
|
|
|
import Resource from '../../models/Resource';
|
|
|
|
import { ResourceEntity, ResourceOcrStatus } from '../database/types';
|
|
|
|
import { msleep } from '@joplin/utils/time';
|
|
|
|
import Logger from '@joplin/utils/Logger';
|
|
|
|
|
|
|
|
describe('OcrService', () => {
|
|
|
|
|
|
|
|
beforeEach(async () => {
|
|
|
|
await setupDatabaseAndSynchronizer(1);
|
|
|
|
await setupDatabaseAndSynchronizer(2);
|
|
|
|
await switchClient(1);
|
|
|
|
});
|
|
|
|
|
|
|
|
it('should process resources', async () => {
|
|
|
|
const { resource: resource1 } = await createNoteAndResource({ path: `${ocrSampleDir}/testocr.png` });
|
|
|
|
const { resource: resource2 } = await createNoteAndResource({ path: `${supportDir}/photo.jpg` });
|
|
|
|
const { resource: resource3 } = await createNoteAndResource({ path: `${ocrSampleDir}/with_bullets.png` });
|
|
|
|
|
|
|
|
// Wait to make sure that updated_time is updated
|
|
|
|
await msleep(1);
|
|
|
|
|
|
|
|
expect(await Resource.needOcrCount(supportedMimeTypes)).toBe(3);
|
|
|
|
|
2024-01-05 14:43:24 +00:00
|
|
|
const service = newOcrService();
|
2023-12-13 19:24:58 +00:00
|
|
|
await service.processResources();
|
|
|
|
|
|
|
|
const expectedText = 'This is a lot of 12 point text to test the\n' +
|
|
|
|
'ocr code and see if it works on all types\n' +
|
|
|
|
'of file format.\n' +
|
|
|
|
'The quick brown dog jumped over the\n' +
|
|
|
|
'lazy fox. The quick brown dog jumped\n' +
|
|
|
|
'over the lazy fox. The quick brown dog\n' +
|
|
|
|
'jumped over the lazy fox. The quick\n' +
|
|
|
|
'brown dog jumped over the lazy fox.';
|
|
|
|
const processedResource1: ResourceEntity = await Resource.load(resource1.id);
|
|
|
|
expect(processedResource1.ocr_text).toBe(expectedText);
|
|
|
|
expect(processedResource1.ocr_status).toBe(ResourceOcrStatus.Done);
|
|
|
|
expect(processedResource1.ocr_error).toBe('');
|
|
|
|
|
|
|
|
const details = Resource.unserializeOcrDetails(processedResource1.ocr_details);
|
|
|
|
const lines = details.map(l => l.words.map(w => w.t).join(' ')).join('\n');
|
|
|
|
expect(lines).toBe(expectedText);
|
|
|
|
expect(details[0].words[0].t).toBe('This');
|
|
|
|
expect(details[0].words[0]).toEqual({ 't': 'This', 'bb': [36, 96, 92, 116], 'bl': [36, 96, 116, 116] });
|
|
|
|
|
|
|
|
// Also check that the resource blob has not been updated
|
|
|
|
expect(processedResource1.blob_updated_time).toBe(resource1.blob_updated_time);
|
|
|
|
expect(processedResource1.updated_time).toBeGreaterThan(resource1.updated_time);
|
|
|
|
|
|
|
|
const processedResource2: ResourceEntity = await Resource.load(resource2.id);
|
|
|
|
expect(processedResource2.ocr_text).toBe('');
|
|
|
|
expect(processedResource2.ocr_status).toBe(ResourceOcrStatus.Done);
|
|
|
|
expect(processedResource2.ocr_error).toBe('');
|
|
|
|
|
|
|
|
const processedResource3: ResourceEntity = await Resource.load(resource3.id);
|
|
|
|
expect(processedResource3.ocr_text).toBe('Declaration\n' +
|
|
|
|
'| declare that:\n' +
|
|
|
|
'® | will arrive in the UK within the next 48 hours\n' +
|
|
|
|
'® | understand | have to provide proof of a negative COVID 19 test prior to departure to the UK (unless\n' +
|
|
|
|
'exempt)\n' +
|
|
|
|
'® | have provided my seat number, if relevant\n' +
|
|
|
|
'® The information | have entered in this form is correct\n' +
|
|
|
|
'® | understand it could be a criminal offence to provide false details and | may be prosecuted\n' +
|
|
|
|
'If any of your information changes once you have submitted your details, such as travel details, seat number, or\n' +
|
|
|
|
'contact information, you must complete a new form.\n' +
|
|
|
|
'| confirm that | understand and agree with the above declarations.',
|
|
|
|
);
|
|
|
|
expect(processedResource3.ocr_status).toBe(ResourceOcrStatus.Done);
|
|
|
|
expect(processedResource3.ocr_error).toBe('');
|
|
|
|
|
|
|
|
// Also check that the resource blob has not been updated
|
|
|
|
expect(processedResource2.blob_updated_time).toBe(resource2.blob_updated_time);
|
|
|
|
expect(processedResource2.updated_time).toBeGreaterThan(resource2.updated_time);
|
|
|
|
|
|
|
|
await service.dispose();
|
2024-01-13 19:10:34 +00:00
|
|
|
|
|
|
|
// On CI these tests can randomly throw the error "Exceeded timeout of
|
|
|
|
// 90000 ms for a test.". So for now increase the timeout and if that's
|
|
|
|
// not sufficient it means the test is simply stuck, and we should use
|
|
|
|
// `jest.retryTimes(2)`
|
|
|
|
}, 60000 * 5);
|
2023-12-13 19:24:58 +00:00
|
|
|
|
2024-02-02 14:59:15 -08:00
|
|
|
test.each([
|
|
|
|
// Use embedded text (skip OCR)
|
|
|
|
['dummy.pdf', 'Dummy PDF file'],
|
|
|
|
['multi_page__embedded_text.pdf', 'This is a test.\nTesting...\nThis PDF has 3 pages.\nThis is page 3.'],
|
|
|
|
['multi_page__no_embedded_text.pdf', 'This is a multi-page PDF\nwith no embedded text.\nPage 2: more text.\nThe third page.'],
|
|
|
|
])('should process PDF resources', async (samplePath: string, expectedText: string) => {
|
|
|
|
const { resource } = await createNoteAndResource({ path: `${ocrSampleDir}/${samplePath}` });
|
2023-12-13 19:24:58 +00:00
|
|
|
|
2024-01-05 14:43:24 +00:00
|
|
|
const service = newOcrService();
|
2023-12-13 19:24:58 +00:00
|
|
|
|
|
|
|
await service.processResources();
|
|
|
|
|
|
|
|
const processedResource: ResourceEntity = await Resource.load(resource.id);
|
2024-02-02 14:59:15 -08:00
|
|
|
expect(processedResource.ocr_text).toBe(expectedText);
|
2023-12-13 19:24:58 +00:00
|
|
|
expect(processedResource.ocr_status).toBe(ResourceOcrStatus.Done);
|
|
|
|
expect(processedResource.ocr_error).toBe('');
|
|
|
|
|
|
|
|
await service.dispose();
|
|
|
|
});
|
|
|
|
|
|
|
|
it('should handle case where resource blob has not yet been downloaded', async () => {
|
|
|
|
await createNoteAndResource({ path: `${ocrSampleDir}/dummy.pdf` });
|
|
|
|
|
|
|
|
await synchronizerStart();
|
|
|
|
|
|
|
|
await switchClient(2);
|
|
|
|
|
|
|
|
await synchronizerStart();
|
|
|
|
|
|
|
|
await msleep(1);
|
|
|
|
|
2024-01-05 14:43:24 +00:00
|
|
|
const service = newOcrService();
|
2023-12-13 19:24:58 +00:00
|
|
|
|
|
|
|
await service.processResources();
|
|
|
|
|
|
|
|
{
|
|
|
|
const resource: ResourceEntity = (await Resource.all())[0];
|
|
|
|
expect(resource.ocr_text).toBe('');
|
|
|
|
expect(resource.ocr_error).toBe('');
|
|
|
|
expect(resource.ocr_status).toBe(ResourceOcrStatus.Todo);
|
|
|
|
}
|
|
|
|
|
|
|
|
await resourceFetcher().startAndWait();
|
|
|
|
|
|
|
|
await service.processResources();
|
|
|
|
|
|
|
|
{
|
|
|
|
const resource: ResourceEntity = (await Resource.all())[0];
|
|
|
|
expect(resource.ocr_text).toBe('Dummy PDF file');
|
|
|
|
expect(resource.ocr_error).toBe('');
|
|
|
|
expect(resource.ocr_status).toBe(ResourceOcrStatus.Done);
|
|
|
|
}
|
|
|
|
|
|
|
|
await service.dispose();
|
|
|
|
});
|
|
|
|
|
|
|
|
it('should handle case where resource blob cannot be downloaded', async () => {
|
|
|
|
await createNoteAndResource({ path: `${ocrSampleDir}/dummy.pdf` });
|
|
|
|
|
|
|
|
await synchronizerStart();
|
|
|
|
|
|
|
|
await switchClient(2);
|
|
|
|
|
|
|
|
await synchronizerStart();
|
|
|
|
|
|
|
|
const resource: ResourceEntity = (await Resource.all())[0];
|
|
|
|
|
|
|
|
// ----------------------------------------------------------------
|
|
|
|
// Fetch status is an error so OCR status will be an error too
|
|
|
|
// ----------------------------------------------------------------
|
|
|
|
|
|
|
|
await Resource.setLocalState(resource.id, {
|
|
|
|
resource_id: resource.id,
|
|
|
|
fetch_status: Resource.FETCH_STATUS_ERROR,
|
|
|
|
fetch_error: 'cannot be downloaded',
|
|
|
|
});
|
|
|
|
|
2024-01-05 14:43:24 +00:00
|
|
|
const service = newOcrService();
|
2023-12-13 19:24:58 +00:00
|
|
|
|
2024-02-26 10:16:23 +00:00
|
|
|
// The service will print a warning so we disable it in tests
|
2023-12-13 19:24:58 +00:00
|
|
|
Logger.globalLogger.enabled = false;
|
|
|
|
await service.processResources();
|
|
|
|
Logger.globalLogger.enabled = true;
|
|
|
|
|
|
|
|
{
|
|
|
|
const resource: ResourceEntity = (await Resource.all())[0];
|
|
|
|
expect(resource.ocr_text).toBe('');
|
|
|
|
expect(resource.ocr_error).toContain('Cannot process resource');
|
|
|
|
expect(resource.ocr_error).toContain('cannot be downloaded');
|
|
|
|
expect(resource.ocr_status).toBe(ResourceOcrStatus.Error);
|
|
|
|
}
|
|
|
|
|
|
|
|
// ----------------------------------------------------------------
|
|
|
|
// After the fetch status is reset and the resource downloaded, it
|
|
|
|
// should also retry OCR and succeed.
|
|
|
|
// ----------------------------------------------------------------
|
|
|
|
|
|
|
|
await Resource.resetFetchErrorStatus(resource.id);
|
|
|
|
|
|
|
|
await resourceFetcher().startAndWait();
|
|
|
|
|
|
|
|
await service.processResources();
|
|
|
|
|
|
|
|
{
|
|
|
|
const resource: ResourceEntity = (await Resource.all())[0];
|
|
|
|
expect(resource.ocr_text).toBe('Dummy PDF file');
|
|
|
|
expect(resource.ocr_error).toBe('');
|
|
|
|
expect(resource.ocr_status).toBe(ResourceOcrStatus.Done);
|
|
|
|
}
|
|
|
|
|
|
|
|
await service.dispose();
|
|
|
|
});
|
|
|
|
|
|
|
|
it('should handle conflicts if two clients process the same resource then sync', async () => {
|
|
|
|
await createNoteAndResource({ path: `${ocrSampleDir}/dummy.pdf` });
|
|
|
|
|
2024-01-05 14:43:24 +00:00
|
|
|
const service1 = newOcrService();
|
2023-12-13 19:24:58 +00:00
|
|
|
await synchronizerStart();
|
|
|
|
await service1.processResources();
|
|
|
|
|
|
|
|
await switchClient(2);
|
|
|
|
|
|
|
|
await synchronizerStart();
|
|
|
|
await msleep(1);
|
|
|
|
await resourceFetcher().startAndWait();
|
2024-01-05 14:43:24 +00:00
|
|
|
const service2 = newOcrService();
|
2023-12-13 19:24:58 +00:00
|
|
|
await service2.processResources();
|
|
|
|
await synchronizerStart();
|
2024-02-26 10:16:23 +00:00
|
|
|
const expectedResourceUpdatedTime = (await Resource.all())[0].updated_time;
|
2023-12-13 19:24:58 +00:00
|
|
|
|
|
|
|
await switchClient(1);
|
|
|
|
|
|
|
|
await synchronizerStart();
|
|
|
|
|
|
|
|
// A conflict happened during sync, but it is resolved by keeping the
|
|
|
|
// remote version.
|
|
|
|
|
|
|
|
expect((await Resource.all()).length).toBe(1);
|
|
|
|
|
|
|
|
{
|
|
|
|
const resource: ResourceEntity = (await Resource.all())[0];
|
|
|
|
expect(resource.ocr_text).toBe('Dummy PDF file');
|
|
|
|
expect(resource.ocr_error).toBe('');
|
|
|
|
expect(resource.ocr_status).toBe(ResourceOcrStatus.Done);
|
2024-02-26 10:16:23 +00:00
|
|
|
expect(resource.updated_time).toBe(expectedResourceUpdatedTime);
|
2023-12-13 19:24:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
await service1.dispose();
|
|
|
|
await service2.dispose();
|
|
|
|
});
|
|
|
|
|
|
|
|
// Use this to quickly test with specific images:
|
|
|
|
|
|
|
|
// it('should process resources 2', async () => {
|
|
|
|
// await createNoteAndResource({ path: `${require('os').homedir()}/Desktop/AllClients.png` });
|
|
|
|
|
2024-01-05 14:43:24 +00:00
|
|
|
// const service = newOcrService();
|
2023-12-13 19:24:58 +00:00
|
|
|
// await service.processResources();
|
|
|
|
|
|
|
|
// console.info(await Resource.all());
|
|
|
|
|
|
|
|
// await service.dispose();
|
|
|
|
// });
|
|
|
|
|
|
|
|
});
|