mirror of
https://github.com/laurent22/joplin.git
synced 2024-12-21 09:38:01 +02:00
Desktop: PDF search text: Remove NULL characters early to avoid possible sync issues (#9862)
This commit is contained in:
parent
8b9ce9ec72
commit
a906e73b22
@ -1002,6 +1002,8 @@ packages/lib/types.js
|
||||
packages/lib/utils/credentialFiles.js
|
||||
packages/lib/utils/joplinCloud.js
|
||||
packages/lib/utils/processStartFlags.js
|
||||
packages/lib/utils/replaceUnsupportedCharacters.test.js
|
||||
packages/lib/utils/replaceUnsupportedCharacters.js
|
||||
packages/lib/utils/userFetcher.js
|
||||
packages/lib/utils/webDAVUtils.test.js
|
||||
packages/lib/utils/webDAVUtils.js
|
||||
|
2
.gitignore
vendored
2
.gitignore
vendored
@ -982,6 +982,8 @@ packages/lib/types.js
|
||||
packages/lib/utils/credentialFiles.js
|
||||
packages/lib/utils/joplinCloud.js
|
||||
packages/lib/utils/processStartFlags.js
|
||||
packages/lib/utils/replaceUnsupportedCharacters.test.js
|
||||
packages/lib/utils/replaceUnsupportedCharacters.js
|
||||
packages/lib/utils/userFetcher.js
|
||||
packages/lib/utils/webDAVUtils.test.js
|
||||
packages/lib/utils/webDAVUtils.js
|
||||
|
@ -322,10 +322,21 @@ describe('services/SearchEngine', () => {
|
||||
}));
|
||||
|
||||
it('should support searching through documents that contain null characters', (async () => {
|
||||
await Note.save({ title: 'Test', body: 'Test\x00testing' });
|
||||
await Note.save({
|
||||
title: 'Test',
|
||||
body: `
|
||||
NUL characters, "\x00", have been known to break FTS search.
|
||||
Previously, all characters after a NUL (\x00) character in a note
|
||||
would not show up in search results. NUL characters may have also
|
||||
broken search for other notes.
|
||||
|
||||
In this note, "testing" only appears after the NUL characters.
|
||||
`,
|
||||
});
|
||||
|
||||
await engine.syncTables();
|
||||
|
||||
expect((await engine.search('previously')).length).toBe(1);
|
||||
expect((await engine.search('testing')).length).toBe(1);
|
||||
}));
|
||||
|
||||
|
@ -13,6 +13,7 @@ import JoplinDatabase from '../../JoplinDatabase';
|
||||
import NoteResource from '../../models/NoteResource';
|
||||
import BaseItem from '../../models/BaseItem';
|
||||
import { isCallbackUrl, parseCallbackUrl } from '../../callbackUrlUtils';
|
||||
import replaceUnsupportedCharacters from '../../utils/replaceUnsupportedCharacters';
|
||||
const { sprintf } = require('sprintf-js');
|
||||
const { pregQuote, scriptType, removeDiacritics } = require('../../string-utils.js');
|
||||
|
||||
@ -603,9 +604,8 @@ export default class SearchEngine {
|
||||
private normalizeText_(text: string) {
|
||||
let normalizedText = text.normalize ? text.normalize() : text;
|
||||
|
||||
// Null characters can break FTS. Remove them.
|
||||
// eslint-disable-next-line no-control-regex
|
||||
normalizedText = normalizedText.replace(/\x00/g, ' ');
|
||||
// NULL characters can break FTS. Replace them with the Unicode replacement character (U+FFFD).
|
||||
normalizedText = replaceUnsupportedCharacters(normalizedText);
|
||||
|
||||
return removeDiacritics(normalizedText.toLowerCase());
|
||||
}
|
||||
|
@ -10,6 +10,7 @@ import * as pdfJsNamespace from 'pdfjs-dist';
|
||||
import { writeFile } from 'fs/promises';
|
||||
import { ResourceEntity } from './services/database/types';
|
||||
import { TextItem } from 'pdfjs-dist/types/src/display/api';
|
||||
import replaceUnsupportedCharacters from './utils/replaceUnsupportedCharacters';
|
||||
|
||||
const { FileApiDriverLocal } = require('./file-api-driver-local');
|
||||
const mimeUtils = require('./mime-utils.js').mime;
|
||||
@ -749,7 +750,10 @@ function shimInit(options: ShimInitOptions = null) {
|
||||
const text = (item as TextItem).str ?? '';
|
||||
return text;
|
||||
}).join('\n');
|
||||
textByPage.push(strings);
|
||||
|
||||
// Some PDFs contain unsupported characters that can lead to hard-to-debug issues.
|
||||
// We replace them here (NULL characters become U+FFFD) before storing the page text.
|
||||
textByPage.push(replaceUnsupportedCharacters(strings));
|
||||
}
|
||||
|
||||
return textByPage;
|
||||
|
8
packages/lib/utils/replaceUnsupportedCharacters.test.ts
Normal file
8
packages/lib/utils/replaceUnsupportedCharacters.test.ts
Normal file
@ -0,0 +1,8 @@
|
||||
import replaceUnsupportedCharacters from './replaceUnsupportedCharacters';
|
||||
|
||||
describe('replaceUnsupportedCharacters', () => {
|
||||
test('should replace NULL characters', () => {
|
||||
expect(replaceUnsupportedCharacters('Test\x00...')).toBe('Test�...');
|
||||
expect(replaceUnsupportedCharacters('\x00Test\x00...')).toBe('�Test�...');
|
||||
});
|
||||
});
|
17
packages/lib/utils/replaceUnsupportedCharacters.ts
Normal file
17
packages/lib/utils/replaceUnsupportedCharacters.ts
Normal file
@ -0,0 +1,17 @@
|
||||
|
||||
const replaceUnsupportedCharacters = (text: string) => {
|
||||
// In the past, NULL characters have caused sync and search issues.
|
||||
// Because these issues are often difficult to debug, we replace every such character.
|
||||
//
|
||||
// See
|
||||
// - Sync issue: https://github.com/laurent22/joplin/issues/5046
|
||||
// - Search issue: https://github.com/laurent22/joplin/issues/9775
|
||||
//
|
||||
// As per the commonmark spec, we replace \x00 with the replacement character.
|
||||
// (see https://spec.commonmark.org/0.31.2/#insecure-characters).
|
||||
//
|
||||
// eslint-disable-next-line no-control-regex
|
||||
return text.replace(/\x00/g, '\uFFFD');
|
||||
};
|
||||
|
||||
export default replaceUnsupportedCharacters;
|
Loading…
Reference in New Issue
Block a user