1
0
mirror of https://github.com/laurent22/joplin.git synced 2025-01-17 18:44:45 +02:00
joplin/packages/lib/services/ocr/OcrService.test.ts

250 lines
9.0 KiB
TypeScript
Raw Normal View History

2024-01-05 14:43:24 +00:00
import { createNoteAndResource, newOcrService, ocrSampleDir, resourceFetcher, setupDatabaseAndSynchronizer, supportDir, switchClient, synchronizerStart } from '../../testing/test-utils';
2023-12-13 19:24:58 +00:00
import { supportedMimeTypes } from './OcrService';
import Resource from '../../models/Resource';
import { ResourceEntity, ResourceOcrStatus } from '../database/types';
import { msleep } from '@joplin/utils/time';
import Logger from '@joplin/utils/Logger';
describe('OcrService', () => {
// Start every test from a fresh database/synchronizer on both test clients,
// with client 1 selected as the active one.
beforeEach(async () => {
	for (const clientId of [1, 2]) {
		await setupDatabaseAndSynchronizer(clientId);
	}

	await switchClient(1);
});
// Processes three image resources (two containing text, one photo for which
// no text is expected) and checks the extracted text, the OCR status/error
// fields, the serialized word details, and the resource timestamps.
it('should process resources', async () => {
	const { resource: resource1 } = await createNoteAndResource({ path: `${ocrSampleDir}/testocr.png` });
	const { resource: resource2 } = await createNoteAndResource({ path: `${supportDir}/photo.jpg` });
	const { resource: resource3 } = await createNoteAndResource({ path: `${ocrSampleDir}/with_bullets.png` });

	// Wait to make sure that updated_time is updated
	await msleep(1);

	expect(await Resource.needOcrCount(supportedMimeTypes)).toBe(3);

	const service = newOcrService();

	try {
		await service.processResources();

		const expectedText = 'This is a lot of 12 point text to test the\n' +
			'ocr code and see if it works on all types\n' +
			'of file format.\n' +
			'The quick brown dog jumped over the\n' +
			'lazy fox. The quick brown dog jumped\n' +
			'over the lazy fox. The quick brown dog\n' +
			'jumped over the lazy fox. The quick\n' +
			'brown dog jumped over the lazy fox.';

		const processedResource1: ResourceEntity = await Resource.load(resource1.id);
		expect(processedResource1.ocr_text).toBe(expectedText);
		expect(processedResource1.ocr_status).toBe(ResourceOcrStatus.Done);
		expect(processedResource1.ocr_error).toBe('');

		// The per-word details, once joined back together, must reproduce the
		// plain OCR text.
		const details = Resource.unserializeOcrDetails(processedResource1.ocr_details);
		const lines = details.map(l => l.words.map(w => w.t).join(' ')).join('\n');
		expect(lines).toBe(expectedText);
		expect(details[0].words[0].t).toBe('This');
		expect(details[0].words[0]).toEqual({ 't': 'This', 'bb': [36, 96, 92, 116], 'bl': [36, 96, 116, 116] });

		// Also check that the resource blob has not been updated
		expect(processedResource1.blob_updated_time).toBe(resource1.blob_updated_time);
		expect(processedResource1.updated_time).toBeGreaterThan(resource1.updated_time);

		const processedResource2: ResourceEntity = await Resource.load(resource2.id);
		expect(processedResource2.ocr_text).toBe('');
		expect(processedResource2.ocr_status).toBe(ResourceOcrStatus.Done);
		expect(processedResource2.ocr_error).toBe('');

		// Also check that the resource blob has not been updated
		expect(processedResource2.blob_updated_time).toBe(resource2.blob_updated_time);
		expect(processedResource2.updated_time).toBeGreaterThan(resource2.updated_time);

		const processedResource3: ResourceEntity = await Resource.load(resource3.id);
		expect(processedResource3.ocr_text).toBe('Declaration\n' +
			'| declare that:\n' +
			'® | will arrive in the UK within the next 48 hours\n' +
			'® | understand | have to provide proof of a negative COVID 19 test prior to departure to the UK (unless\n' +
			'exempt)\n' +
			'® | have provided my seat number, if relevant\n' +
			'® The information | have entered in this form is correct\n' +
			'® | understand it could be a criminal offence to provide false details and | may be prosecuted\n' +
			'If any of your information changes once you have submitted your details, such as travel details, seat number, or\n' +
			'contact information, you must complete a new form.\n' +
			'| confirm that | understand and agree with the above declarations.',
		);
		expect(processedResource3.ocr_status).toBe(ResourceOcrStatus.Done);
		expect(processedResource3.ocr_error).toBe('');
	} finally {
		// Dispose of the service even when an assertion above throws, so that
		// a failing test does not leave it running.
		await service.dispose();
	}

	// On CI these tests can randomly throw the error "Exceeded timeout of
	// 90000 ms for a test.". So for now increase the timeout and if that's
	// not sufficient it means the test is simply stuck, and we should use
	// `jest.retryTimes(2)`
}, 60000 * 5);
2023-12-13 19:24:58 +00:00
// Table-driven test: each row is [sample file name in ocrSampleDir, text
// expected in ocr_text once the resource has been processed].
test.each([
	// Use embedded text (skip OCR)
	['dummy.pdf', 'Dummy PDF file'],
	['multi_page__embedded_text.pdf', 'This is a test.\nTesting...\nThis PDF has 3 pages.\nThis is page 3.'],

	// No embedded text - presumably the pages have to go through OCR
	// (inferred from the sample name, confirm against the service)
	['multi_page__no_embedded_text.pdf', 'This is a multi-page PDF\nwith no embedded text.\nPage 2: more text.\nThe third page.'],
])('should process PDF resources', async (samplePath: string, expectedText: string) => {
	const { resource } = await createNoteAndResource({ path: `${ocrSampleDir}/${samplePath}` });

	const service = newOcrService();

	try {
		await service.processResources();

		const processedResource: ResourceEntity = await Resource.load(resource.id);
		expect(processedResource.ocr_text).toBe(expectedText);
		expect(processedResource.ocr_status).toBe(ResourceOcrStatus.Done);
		expect(processedResource.ocr_error).toBe('');
	} finally {
		// Dispose of the service even when an assertion above throws
		await service.dispose();
	}
});
// Client 2 receives the resource via sync but has not fetched the blob yet:
// the first processing pass must leave the resource in the Todo state, and a
// second pass after the blob has been fetched must complete the OCR.
it('should handle case where resource blob has not yet been downloaded', async () => {
	await createNoteAndResource({ path: `${ocrSampleDir}/dummy.pdf` });

	await synchronizerStart();

	await switchClient(2);

	await synchronizerStart();

	await msleep(1);

	const service = newOcrService();

	try {
		await service.processResources();

		{
			// Before the resource fetcher has run - nothing should have been
			// processed and no error should have been recorded
			const resource: ResourceEntity = (await Resource.all())[0];
			expect(resource.ocr_text).toBe('');
			expect(resource.ocr_error).toBe('');
			expect(resource.ocr_status).toBe(ResourceOcrStatus.Todo);
		}

		await resourceFetcher().startAndWait();

		await service.processResources();

		{
			// After the resource fetcher has run - OCR should now succeed
			const resource: ResourceEntity = (await Resource.all())[0];
			expect(resource.ocr_text).toBe('Dummy PDF file');
			expect(resource.ocr_error).toBe('');
			expect(resource.ocr_status).toBe(ResourceOcrStatus.Done);
		}
	} finally {
		// Dispose of the service even when an assertion above throws
		await service.dispose();
	}
});
// Forces the fetch status of a synced resource to "error" on client 2 and
// checks that OCR reports an error too, then that OCR is retried and succeeds
// once the fetch error has been reset and the blob fetched.
it('should handle case where resource blob cannot be downloaded', async () => {
	await createNoteAndResource({ path: `${ocrSampleDir}/dummy.pdf` });

	await synchronizerStart();

	await switchClient(2);

	await synchronizerStart();

	const syncedResource: ResourceEntity = (await Resource.all())[0];

	// ----------------------------------------------------------------
	// Fetch status is an error so OCR status will be an error too
	// ----------------------------------------------------------------

	await Resource.setLocalState(syncedResource.id, {
		resource_id: syncedResource.id,
		fetch_status: Resource.FETCH_STATUS_ERROR,
		fetch_error: 'cannot be downloaded',
	});

	const service = newOcrService();

	try {
		// The service will print a warning so we disable it in tests
		Logger.globalLogger.enabled = false;
		try {
			await service.processResources();
		} finally {
			// Re-enable logging even if processing throws, otherwise the
			// global logger would stay silent for the tests that follow.
			Logger.globalLogger.enabled = true;
		}

		{
			const resource: ResourceEntity = (await Resource.all())[0];
			expect(resource.ocr_text).toBe('');
			expect(resource.ocr_error).toContain('Cannot process resource');
			expect(resource.ocr_error).toContain('cannot be downloaded');
			expect(resource.ocr_status).toBe(ResourceOcrStatus.Error);
		}

		// ----------------------------------------------------------------
		// After the fetch status is reset and the resource downloaded, it
		// should also retry OCR and succeed.
		// ----------------------------------------------------------------

		await Resource.resetFetchErrorStatus(syncedResource.id);

		await resourceFetcher().startAndWait();

		await service.processResources();

		{
			const resource: ResourceEntity = (await Resource.all())[0];
			expect(resource.ocr_text).toBe('Dummy PDF file');
			expect(resource.ocr_error).toBe('');
			expect(resource.ocr_status).toBe(ResourceOcrStatus.Done);
		}
	} finally {
		// Dispose of the service even when an assertion above throws
		await service.dispose();
	}
});
// Both clients run OCR on the same resource. Client 2 syncs its result first,
// then client 1 syncs: there must still be a single resource, carrying the
// OCR result and the updated_time that client 2 uploaded.
it('should handle conflicts if two clients process the same resource then sync', async () => {
	await createNoteAndResource({ path: `${ocrSampleDir}/dummy.pdf` });

	const service1 = newOcrService();

	// Created later, once client 2 is the active client. Declared here so
	// that the `finally` block can dispose of it.
	let service2: ReturnType<typeof newOcrService> | null = null;

	try {
		await synchronizerStart();
		await service1.processResources();

		await switchClient(2);

		await synchronizerStart();
		await msleep(1);
		await resourceFetcher().startAndWait();

		service2 = newOcrService();
		await service2.processResources();
		await synchronizerStart();

		const expectedResourceUpdatedTime = (await Resource.all())[0].updated_time;

		await switchClient(1);

		await synchronizerStart();

		// A conflict happened during sync, but it is resolved by keeping the
		// remote version.

		expect((await Resource.all()).length).toBe(1);

		{
			const resource: ResourceEntity = (await Resource.all())[0];
			expect(resource.ocr_text).toBe('Dummy PDF file');
			expect(resource.ocr_error).toBe('');
			expect(resource.ocr_status).toBe(ResourceOcrStatus.Done);
			expect(resource.updated_time).toBe(expectedResourceUpdatedTime);
		}
	} finally {
		// Dispose of both services even when a step above throws. The second
		// service only exists if the test got far enough to create it.
		await service1.dispose();
		if (service2) await service2.dispose();
	}
});
// Use this to quickly test with specific images:

// it('should process resources 2', async () => {
// 	await createNoteAndResource({ path: `${require('os').homedir()}/Desktop/AllClients.png` });
// 	const service = newOcrService();
// 	await service.processResources();
// 	console.info(await Resource.all());
// 	await service.dispose();
// });
});