1
0
mirror of https://github.com/laurent22/joplin.git synced 2024-12-21 09:38:01 +02:00

Desktop: PDF search text: Remove NULL characters early to avoid possible sync issues (#9862)

This commit is contained in:
Henry Heino 2024-02-06 08:24:00 -08:00 committed by GitHub
parent 8b9ce9ec72
commit a906e73b22
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 49 additions and 5 deletions

View File

@ -1002,6 +1002,8 @@ packages/lib/types.js
packages/lib/utils/credentialFiles.js
packages/lib/utils/joplinCloud.js
packages/lib/utils/processStartFlags.js
packages/lib/utils/replaceUnsupportedCharacters.test.js
packages/lib/utils/replaceUnsupportedCharacters.js
packages/lib/utils/userFetcher.js
packages/lib/utils/webDAVUtils.test.js
packages/lib/utils/webDAVUtils.js

2
.gitignore vendored
View File

@ -982,6 +982,8 @@ packages/lib/types.js
packages/lib/utils/credentialFiles.js
packages/lib/utils/joplinCloud.js
packages/lib/utils/processStartFlags.js
packages/lib/utils/replaceUnsupportedCharacters.test.js
packages/lib/utils/replaceUnsupportedCharacters.js
packages/lib/utils/userFetcher.js
packages/lib/utils/webDAVUtils.test.js
packages/lib/utils/webDAVUtils.js

View File

@ -322,10 +322,21 @@ describe('services/SearchEngine', () => {
}));
// Regression test: text that follows a NUL (\x00) character in a note body
// must still be reachable through search.
it('should support searching through documents that contain null characters', (async () => {
await Note.save({ title: 'Test', body: 'Test\x00testing' });
await Note.save({
title: 'Test',
body: `
NUL characters, "\x00", have been known to break FTS search.
Previously, all characters after a NUL (\x00) character in a note
would not show up in search results. NUL characters may have also
broken search for other notes.
In this note, "testing" only appears after the NUL characters.
`,
});
await engine.syncTables();
// Each query below is expected to return exactly one result.
expect((await engine.search('previously')).length).toBe(1);
expect((await engine.search('testing')).length).toBe(1);
}));

View File

@ -13,6 +13,7 @@ import JoplinDatabase from '../../JoplinDatabase';
import NoteResource from '../../models/NoteResource';
import BaseItem from '../../models/BaseItem';
import { isCallbackUrl, parseCallbackUrl } from '../../callbackUrlUtils';
import replaceUnsupportedCharacters from '../../utils/replaceUnsupportedCharacters';
const { sprintf } = require('sprintf-js');
const { pregQuote, scriptType, removeDiacritics } = require('../../string-utils.js');
@ -603,9 +604,8 @@ export default class SearchEngine {
// Returns a normalized form of `text`: Unicode-normalized (when
// `String.prototype.normalize` is available), with NULL characters
// replaced, lower-cased, and passed through `removeDiacritics`.
private normalizeText_(text: string) {
let normalizedText = text.normalize ? text.normalize() : text;
// Null characters can break FTS. Remove them.
// eslint-disable-next-line no-control-regex
normalizedText = normalizedText.replace(/\x00/g, ' ');
// NULL characters can break FTS. Remove them.
normalizedText = replaceUnsupportedCharacters(normalizedText);
return removeDiacritics(normalizedText.toLowerCase());
}

View File

@ -10,6 +10,7 @@ import * as pdfJsNamespace from 'pdfjs-dist';
import { writeFile } from 'fs/promises';
import { ResourceEntity } from './services/database/types';
import { TextItem } from 'pdfjs-dist/types/src/display/api';
import replaceUnsupportedCharacters from './utils/replaceUnsupportedCharacters';
const { FileApiDriverLocal } = require('./file-api-driver-local');
const mimeUtils = require('./mime-utils.js').mime;
@ -749,7 +750,10 @@ function shimInit(options: ShimInitOptions = null) {
const text = (item as TextItem).str ?? '';
return text;
}).join('\n');
textByPage.push(strings);
// Some PDFs contain unsupported characters that can lead to hard-to-debug issues.
// We remove them here.
textByPage.push(replaceUnsupportedCharacters(strings));
}
return textByPage;

View File

@ -0,0 +1,8 @@
import replaceUnsupportedCharacters from './replaceUnsupportedCharacters';
// Verifies that every NULL (\x00) character in the input is swapped for the
// replacement character shown in the expected strings below.
describe('replaceUnsupportedCharacters', () => {
test('should replace NULL characters', () => {
// A single NULL character in the middle of the string.
expect(replaceUnsupportedCharacters('Test\x00...')).toBe('Test�...');
// Several NULL characters, including one at the very start.
expect(replaceUnsupportedCharacters('\x00Test\x00...')).toBe('�Test�...');
});
});

View File

@ -0,0 +1,17 @@
// Returns a copy of `text` in which every NULL (\x00) character has been
// replaced with the Unicode replacement character (U+FFFD).
//
// NULL characters have caused hard-to-debug problems in the past:
// - Sync issue: https://github.com/laurent22/joplin/issues/5046
// - Search issue: https://github.com/laurent22/joplin/issues/9775
//
// Substituting U+FFFD for \x00 follows the commonmark spec
// (see https://spec.commonmark.org/0.31.2/#insecure-characters).
const replaceUnsupportedCharacters = (text: string): string => {
	const nullCharacter = '\x00';
	const replacementCharacter = '\uFFFD';

	// Splitting on the NULL character and re-joining with the replacement
	// substitutes every occurrence, without needing a control-character regex.
	const segments = text.split(nullCharacter);
	return segments.join(replacementCharacter);
};
export default replaceUnsupportedCharacters;