From a906e73b222d4780d5e77e40064f32ca535f4a1c Mon Sep 17 00:00:00 2001 From: Henry Heino <46334387+personalizedrefrigerator@users.noreply.github.com> Date: Tue, 6 Feb 2024 08:24:00 -0800 Subject: [PATCH] Desktop: PDF search text: Remove NULL characters early to avoid possible sync issues (#9862) --- .eslintignore | 2 ++ .gitignore | 2 ++ .../lib/services/search/SearchEngine.test.ts | 13 ++++++++++++- packages/lib/services/search/SearchEngine.ts | 6 +++--- packages/lib/shim-init-node.ts | 6 +++++- .../utils/replaceUnsupportedCharacters.test.ts | 8 ++++++++ .../lib/utils/replaceUnsupportedCharacters.ts | 17 +++++++++++++++++ 7 files changed, 49 insertions(+), 5 deletions(-) create mode 100644 packages/lib/utils/replaceUnsupportedCharacters.test.ts create mode 100644 packages/lib/utils/replaceUnsupportedCharacters.ts diff --git a/.eslintignore b/.eslintignore index 2effa4b77..e9834718c 100644 --- a/.eslintignore +++ b/.eslintignore @@ -1002,6 +1002,8 @@ packages/lib/types.js packages/lib/utils/credentialFiles.js packages/lib/utils/joplinCloud.js packages/lib/utils/processStartFlags.js +packages/lib/utils/replaceUnsupportedCharacters.test.js +packages/lib/utils/replaceUnsupportedCharacters.js packages/lib/utils/userFetcher.js packages/lib/utils/webDAVUtils.test.js packages/lib/utils/webDAVUtils.js diff --git a/.gitignore b/.gitignore index 64f1ba0db..abaa3b592 100644 --- a/.gitignore +++ b/.gitignore @@ -982,6 +982,8 @@ packages/lib/types.js packages/lib/utils/credentialFiles.js packages/lib/utils/joplinCloud.js packages/lib/utils/processStartFlags.js +packages/lib/utils/replaceUnsupportedCharacters.test.js +packages/lib/utils/replaceUnsupportedCharacters.js packages/lib/utils/userFetcher.js packages/lib/utils/webDAVUtils.test.js packages/lib/utils/webDAVUtils.js diff --git a/packages/lib/services/search/SearchEngine.test.ts b/packages/lib/services/search/SearchEngine.test.ts index 8f7ed39f3..fc4b85900 100644 --- a/packages/lib/services/search/SearchEngine.test.ts +++ b/packages/lib/services/search/SearchEngine.test.ts @@ -322,10 +322,21 @@ describe('services/SearchEngine', () => { })); it('should support searching through documents that contain null characters', (async () => { - await Note.save({ title: 'Test', body: 'Test\x00testing' }); + await Note.save({ + title: 'Test', + body: ` + NUL characters, "\x00", have been known to break FTS search. + Previously, all characters after a NUL (\x00) character in a note + would not show up in search results. NUL characters may have also + broken search for other notes. + + In this note, "testing" only appears after the NUL characters. + `, + }); await engine.syncTables(); + expect((await engine.search('previously')).length).toBe(1); expect((await engine.search('testing')).length).toBe(1); })); diff --git a/packages/lib/services/search/SearchEngine.ts b/packages/lib/services/search/SearchEngine.ts index f6cbee6dd..12f6a389a 100644 --- a/packages/lib/services/search/SearchEngine.ts +++ b/packages/lib/services/search/SearchEngine.ts @@ -13,6 +13,7 @@ import JoplinDatabase from '../../JoplinDatabase'; import NoteResource from '../../models/NoteResource'; import BaseItem from '../../models/BaseItem'; import { isCallbackUrl, parseCallbackUrl } from '../../callbackUrlUtils'; +import replaceUnsupportedCharacters from '../../utils/replaceUnsupportedCharacters'; const { sprintf } = require('sprintf-js'); const { pregQuote, scriptType, removeDiacritics } = require('../../string-utils.js'); @@ -603,9 +604,8 @@ export default class SearchEngine { private normalizeText_(text: string) { let normalizedText = text.normalize ? text.normalize() : text; - // Null characters can break FTS. Remove them. - // eslint-disable-next-line no-control-regex - normalizedText = normalizedText.replace(/\x00/g, ' '); + // NULL characters can break FTS. Remove them. + normalizedText = replaceUnsupportedCharacters(normalizedText); return removeDiacritics(normalizedText.toLowerCase()); } diff --git a/packages/lib/shim-init-node.ts b/packages/lib/shim-init-node.ts index b06c7bfe4..87a70ff44 100644 --- a/packages/lib/shim-init-node.ts +++ b/packages/lib/shim-init-node.ts @@ -10,6 +10,7 @@ import * as pdfJsNamespace from 'pdfjs-dist'; import { writeFile } from 'fs/promises'; import { ResourceEntity } from './services/database/types'; import { TextItem } from 'pdfjs-dist/types/src/display/api'; +import replaceUnsupportedCharacters from './utils/replaceUnsupportedCharacters'; const { FileApiDriverLocal } = require('./file-api-driver-local'); const mimeUtils = require('./mime-utils.js').mime; @@ -749,7 +750,10 @@ function shimInit(options: ShimInitOptions = null) { const text = (item as TextItem).str ?? ''; return text; }).join('\n'); - textByPage.push(strings); + + // Some PDFs contain unsupported characters that can lead to hard-to-debug issues. + // We remove them here. + textByPage.push(replaceUnsupportedCharacters(strings)); } return textByPage; diff --git a/packages/lib/utils/replaceUnsupportedCharacters.test.ts b/packages/lib/utils/replaceUnsupportedCharacters.test.ts new file mode 100644 index 000000000..7c48ec875 --- /dev/null +++ b/packages/lib/utils/replaceUnsupportedCharacters.test.ts @@ -0,0 +1,8 @@ +import replaceUnsupportedCharacters from './replaceUnsupportedCharacters'; + +describe('replaceUnsupportedCharacters', () => { + test('should replace NULL characters', () => { + expect(replaceUnsupportedCharacters('Test\x00...')).toBe('Test�...'); + expect(replaceUnsupportedCharacters('\x00Test\x00...')).toBe('�Test�...'); + }); +}); diff --git a/packages/lib/utils/replaceUnsupportedCharacters.ts b/packages/lib/utils/replaceUnsupportedCharacters.ts new file mode 100644 index 000000000..010234ca3 --- /dev/null +++ b/packages/lib/utils/replaceUnsupportedCharacters.ts @@ -0,0 +1,17 @@ + +const replaceUnsupportedCharacters = (text: string) => { + // In the past, NULL characters have caused sync and search issues. + // Because these issues are often difficult to debug, we remove these characters entirely. + // + // See + // - Sync issue: https://github.com/laurent22/joplin/issues/5046 + // - Search issue: https://github.com/laurent22/joplin/issues/9775 + // + // As per the commonmark spec, we replace \x00 with the replacement character. + // (see https://spec.commonmark.org/0.31.2/#insecure-characters). + // + // eslint-disable-next-line no-control-regex + return text.replace(/\x00/g, '\uFFFD'); +}; + +export default replaceUnsupportedCharacters;