mirror of
https://github.com/laurent22/joplin.git
synced 2025-02-01 19:15:01 +02:00
Desktop: Fixes #10050: Fixed OCR memory leak when processing PDF documents
This commit is contained in:
parent
8d5ee36745
commit
d9a16b5c0f
@@ -739,21 +739,24 @@ function shimInit(options: ShimInitOptions = null) {
|
|||||||
shim.pdfExtractEmbeddedText = async (pdfPath: string): Promise<string[]> => {
|
shim.pdfExtractEmbeddedText = async (pdfPath: string): Promise<string[]> => {
|
||||||
const loadingTask = pdfJs.getDocument(pdfPath);
|
const loadingTask = pdfJs.getDocument(pdfPath);
|
||||||
const doc = await loadingTask.promise;
|
const doc = await loadingTask.promise;
|
||||||
|
|
||||||
const textByPage = [];
|
const textByPage = [];
|
||||||
|
|
||||||
for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
|
try {
|
||||||
const page = await doc.getPage(pageNum);
|
for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
|
||||||
const textContent = await page.getTextContent();
|
const page = await doc.getPage(pageNum);
|
||||||
|
const textContent = await page.getTextContent();
|
||||||
|
|
||||||
const strings = textContent.items.map(item => {
|
const strings = textContent.items.map(item => {
|
||||||
const text = (item as TextItem).str ?? '';
|
const text = (item as TextItem).str ?? '';
|
||||||
return text;
|
return text;
|
||||||
}).join('\n');
|
}).join('\n');
|
||||||
|
|
||||||
// Some PDFs contain unsupported characters that can lead to hard-to-debug issues.
|
// Some PDFs contain unsupported characters that can lead to hard-to-debug issues.
|
||||||
// We remove them here.
|
// We remove them here.
|
||||||
textByPage.push(replaceUnsupportedCharacters(strings));
|
textByPage.push(replaceUnsupportedCharacters(strings));
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
await doc.destroy();
|
||||||
}
|
}
|
||||||
|
|
||||||
return textByPage;
|
return textByPage;
|
||||||
@@ -791,23 +794,27 @@ function shimInit(options: ShimInitOptions = null) {
|
|||||||
const loadingTask = pdfJs.getDocument(pdfPath);
|
const loadingTask = pdfJs.getDocument(pdfPath);
|
||||||
const doc = await loadingTask.promise;
|
const doc = await loadingTask.promise;
|
||||||
|
|
||||||
for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
|
try {
|
||||||
const page = await doc.getPage(pageNum);
|
for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
|
||||||
const viewport = page.getViewport({ scale: 2 });
|
const page = await doc.getPage(pageNum);
|
||||||
const canvas = createCanvas();
|
const viewport = page.getViewport({ scale: 2 });
|
||||||
const ctx = canvas.getContext('2d');
|
const canvas = createCanvas();
|
||||||
|
const ctx = canvas.getContext('2d');
|
||||||
|
|
||||||
canvas.height = viewport.height;
|
canvas.height = viewport.height;
|
||||||
canvas.width = viewport.width;
|
canvas.width = viewport.width;
|
||||||
|
|
||||||
const renderTask = page.render({ canvasContext: ctx, viewport: viewport });
|
const renderTask = page.render({ canvasContext: ctx, viewport: viewport });
|
||||||
await renderTask.promise;
|
await renderTask.promise;
|
||||||
|
|
||||||
const buffer = await canvasToBuffer(canvas);
|
const buffer = await canvasToBuffer(canvas);
|
||||||
const filePath = `${outputDirectoryPath}/${filePrefix}_${pageNum.toString().padStart(4, '0')}.jpg`;
|
const filePath = `${outputDirectoryPath}/${filePrefix}_${pageNum.toString().padStart(4, '0')}.jpg`;
|
||||||
output.push(filePath);
|
output.push(filePath);
|
||||||
await writeFile(filePath, buffer, 'binary');
|
await writeFile(filePath, buffer, 'binary');
|
||||||
if (!(await shim.fsDriver().exists(filePath))) throw new Error(`Could not write to file: ${filePath}`);
|
if (!(await shim.fsDriver().exists(filePath))) throw new Error(`Could not write to file: ${filePath}`);
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
await doc.destroy();
|
||||||
}
|
}
|
||||||
|
|
||||||
return output;
|
return output;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user