1
0
mirror of https://github.com/laurent22/joplin.git synced 2025-02-01 19:15:01 +02:00

Desktop: Fixes #10050: Fixed OCR memory leak when processing PDF documents

This commit is contained in:
Laurent Cozic 2024-03-05 11:42:54 +00:00
parent 8d5ee36745
commit d9a16b5c0f

View File

@@ -739,21 +739,24 @@ function shimInit(options: ShimInitOptions = null) {
shim.pdfExtractEmbeddedText = async (pdfPath: string): Promise<string[]> => { shim.pdfExtractEmbeddedText = async (pdfPath: string): Promise<string[]> => {
const loadingTask = pdfJs.getDocument(pdfPath); const loadingTask = pdfJs.getDocument(pdfPath);
const doc = await loadingTask.promise; const doc = await loadingTask.promise;
const textByPage = []; const textByPage = [];
for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) { try {
const page = await doc.getPage(pageNum); for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
const textContent = await page.getTextContent(); const page = await doc.getPage(pageNum);
const textContent = await page.getTextContent();
const strings = textContent.items.map(item => { const strings = textContent.items.map(item => {
const text = (item as TextItem).str ?? ''; const text = (item as TextItem).str ?? '';
return text; return text;
}).join('\n'); }).join('\n');
// Some PDFs contain unsupported characters that can lead to hard-to-debug issues. // Some PDFs contain unsupported characters that can lead to hard-to-debug issues.
// We remove them here. // We remove them here.
textByPage.push(replaceUnsupportedCharacters(strings)); textByPage.push(replaceUnsupportedCharacters(strings));
}
} finally {
await doc.destroy();
} }
return textByPage; return textByPage;
@@ -791,23 +794,27 @@ function shimInit(options: ShimInitOptions = null) {
const loadingTask = pdfJs.getDocument(pdfPath); const loadingTask = pdfJs.getDocument(pdfPath);
const doc = await loadingTask.promise; const doc = await loadingTask.promise;
for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) { try {
const page = await doc.getPage(pageNum); for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
const viewport = page.getViewport({ scale: 2 }); const page = await doc.getPage(pageNum);
const canvas = createCanvas(); const viewport = page.getViewport({ scale: 2 });
const ctx = canvas.getContext('2d'); const canvas = createCanvas();
const ctx = canvas.getContext('2d');
canvas.height = viewport.height; canvas.height = viewport.height;
canvas.width = viewport.width; canvas.width = viewport.width;
const renderTask = page.render({ canvasContext: ctx, viewport: viewport }); const renderTask = page.render({ canvasContext: ctx, viewport: viewport });
await renderTask.promise; await renderTask.promise;
const buffer = await canvasToBuffer(canvas); const buffer = await canvasToBuffer(canvas);
const filePath = `${outputDirectoryPath}/${filePrefix}_${pageNum.toString().padStart(4, '0')}.jpg`; const filePath = `${outputDirectoryPath}/${filePrefix}_${pageNum.toString().padStart(4, '0')}.jpg`;
output.push(filePath); output.push(filePath);
await writeFile(filePath, buffer, 'binary'); await writeFile(filePath, buffer, 'binary');
if (!(await shim.fsDriver().exists(filePath))) throw new Error(`Could not write to file: ${filePath}`); if (!(await shim.fsDriver().exists(filePath))) throw new Error(`Could not write to file: ${filePath}`);
}
} finally {
await doc.destroy();
} }
return output; return output;