mirror of
https://github.com/laurent22/joplin.git
synced 2025-03-11 14:09:55 +02:00
Desktop: PDF search text: Remove NULL characters early to avoid possible sync issues (#9862)
This commit is contained in:
parent
8b9ce9ec72
commit
a906e73b22
@ -1002,6 +1002,8 @@ packages/lib/types.js
|
|||||||
packages/lib/utils/credentialFiles.js
|
packages/lib/utils/credentialFiles.js
|
||||||
packages/lib/utils/joplinCloud.js
|
packages/lib/utils/joplinCloud.js
|
||||||
packages/lib/utils/processStartFlags.js
|
packages/lib/utils/processStartFlags.js
|
||||||
|
packages/lib/utils/replaceUnsupportedCharacters.test.js
|
||||||
|
packages/lib/utils/replaceUnsupportedCharacters.js
|
||||||
packages/lib/utils/userFetcher.js
|
packages/lib/utils/userFetcher.js
|
||||||
packages/lib/utils/webDAVUtils.test.js
|
packages/lib/utils/webDAVUtils.test.js
|
||||||
packages/lib/utils/webDAVUtils.js
|
packages/lib/utils/webDAVUtils.js
|
||||||
|
2
.gitignore
vendored
2
.gitignore
vendored
@ -982,6 +982,8 @@ packages/lib/types.js
|
|||||||
packages/lib/utils/credentialFiles.js
|
packages/lib/utils/credentialFiles.js
|
||||||
packages/lib/utils/joplinCloud.js
|
packages/lib/utils/joplinCloud.js
|
||||||
packages/lib/utils/processStartFlags.js
|
packages/lib/utils/processStartFlags.js
|
||||||
|
packages/lib/utils/replaceUnsupportedCharacters.test.js
|
||||||
|
packages/lib/utils/replaceUnsupportedCharacters.js
|
||||||
packages/lib/utils/userFetcher.js
|
packages/lib/utils/userFetcher.js
|
||||||
packages/lib/utils/webDAVUtils.test.js
|
packages/lib/utils/webDAVUtils.test.js
|
||||||
packages/lib/utils/webDAVUtils.js
|
packages/lib/utils/webDAVUtils.js
|
||||||
|
@ -322,10 +322,21 @@ describe('services/SearchEngine', () => {
|
|||||||
}));
|
}));
|
||||||
|
|
||||||
it('should support searching through documents that contain null characters', (async () => {
|
it('should support searching through documents that contain null characters', (async () => {
|
||||||
await Note.save({ title: 'Test', body: 'Test\x00testing' });
|
await Note.save({
|
||||||
|
title: 'Test',
|
||||||
|
body: `
|
||||||
|
NUL characters, "\x00", have been known to break FTS search.
|
||||||
|
Previously, all characters after a NUL (\x00) character in a note
|
||||||
|
would not show up in search results. NUL characters may have also
|
||||||
|
broken search for other notes.
|
||||||
|
|
||||||
|
In this note, "testing" only appears after the NUL characters.
|
||||||
|
`,
|
||||||
|
});
|
||||||
|
|
||||||
await engine.syncTables();
|
await engine.syncTables();
|
||||||
|
|
||||||
|
expect((await engine.search('previously')).length).toBe(1);
|
||||||
expect((await engine.search('testing')).length).toBe(1);
|
expect((await engine.search('testing')).length).toBe(1);
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
@ -13,6 +13,7 @@ import JoplinDatabase from '../../JoplinDatabase';
|
|||||||
import NoteResource from '../../models/NoteResource';
|
import NoteResource from '../../models/NoteResource';
|
||||||
import BaseItem from '../../models/BaseItem';
|
import BaseItem from '../../models/BaseItem';
|
||||||
import { isCallbackUrl, parseCallbackUrl } from '../../callbackUrlUtils';
|
import { isCallbackUrl, parseCallbackUrl } from '../../callbackUrlUtils';
|
||||||
|
import replaceUnsupportedCharacters from '../../utils/replaceUnsupportedCharacters';
|
||||||
const { sprintf } = require('sprintf-js');
|
const { sprintf } = require('sprintf-js');
|
||||||
const { pregQuote, scriptType, removeDiacritics } = require('../../string-utils.js');
|
const { pregQuote, scriptType, removeDiacritics } = require('../../string-utils.js');
|
||||||
|
|
||||||
@ -603,9 +604,8 @@ export default class SearchEngine {
|
|||||||
private normalizeText_(text: string) {
|
private normalizeText_(text: string) {
|
||||||
let normalizedText = text.normalize ? text.normalize() : text;
|
let normalizedText = text.normalize ? text.normalize() : text;
|
||||||
|
|
||||||
// Null characters can break FTS. Remove them.
|
// NULL characters can break FTS. Replace them.
|
||||||
// eslint-disable-next-line no-control-regex
|
normalizedText = replaceUnsupportedCharacters(normalizedText);
|
||||||
normalizedText = normalizedText.replace(/\x00/g, ' ');
|
|
||||||
|
|
||||||
return removeDiacritics(normalizedText.toLowerCase());
|
return removeDiacritics(normalizedText.toLowerCase());
|
||||||
}
|
}
|
||||||
|
@ -10,6 +10,7 @@ import * as pdfJsNamespace from 'pdfjs-dist';
|
|||||||
import { writeFile } from 'fs/promises';
|
import { writeFile } from 'fs/promises';
|
||||||
import { ResourceEntity } from './services/database/types';
|
import { ResourceEntity } from './services/database/types';
|
||||||
import { TextItem } from 'pdfjs-dist/types/src/display/api';
|
import { TextItem } from 'pdfjs-dist/types/src/display/api';
|
||||||
|
import replaceUnsupportedCharacters from './utils/replaceUnsupportedCharacters';
|
||||||
|
|
||||||
const { FileApiDriverLocal } = require('./file-api-driver-local');
|
const { FileApiDriverLocal } = require('./file-api-driver-local');
|
||||||
const mimeUtils = require('./mime-utils.js').mime;
|
const mimeUtils = require('./mime-utils.js').mime;
|
||||||
@ -749,7 +750,10 @@ function shimInit(options: ShimInitOptions = null) {
|
|||||||
const text = (item as TextItem).str ?? '';
|
const text = (item as TextItem).str ?? '';
|
||||||
return text;
|
return text;
|
||||||
}).join('\n');
|
}).join('\n');
|
||||||
textByPage.push(strings);
|
|
||||||
|
// Some PDFs contain unsupported characters that can lead to hard-to-debug issues.
|
||||||
|
// We replace them here.
|
||||||
|
textByPage.push(replaceUnsupportedCharacters(strings));
|
||||||
}
|
}
|
||||||
|
|
||||||
return textByPage;
|
return textByPage;
|
||||||
|
8
packages/lib/utils/replaceUnsupportedCharacters.test.ts
Normal file
8
packages/lib/utils/replaceUnsupportedCharacters.test.ts
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
import replaceUnsupportedCharacters from './replaceUnsupportedCharacters';
|
||||||
|
|
||||||
|
describe('replaceUnsupportedCharacters', () => {
|
||||||
|
test('should replace NULL characters', () => {
|
||||||
|
expect(replaceUnsupportedCharacters('Test\x00...')).toBe('Test�...');
|
||||||
|
expect(replaceUnsupportedCharacters('\x00Test\x00...')).toBe('�Test�...');
|
||||||
|
});
|
||||||
|
});
|
17
packages/lib/utils/replaceUnsupportedCharacters.ts
Normal file
17
packages/lib/utils/replaceUnsupportedCharacters.ts
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
|
||||||
|
const replaceUnsupportedCharacters = (text: string) => {
|
||||||
|
// In the past, NULL characters have caused sync and search issues.
|
||||||
|
// Because these issues are often difficult to debug, we replace every occurrence of these characters.
|
||||||
|
//
|
||||||
|
// See
|
||||||
|
// - Sync issue: https://github.com/laurent22/joplin/issues/5046
|
||||||
|
// - Search issue: https://github.com/laurent22/joplin/issues/9775
|
||||||
|
//
|
||||||
|
// As per the commonmark spec, we replace \x00 with the replacement character.
|
||||||
|
// (see https://spec.commonmark.org/0.31.2/#insecure-characters).
|
||||||
|
//
|
||||||
|
// eslint-disable-next-line no-control-regex
|
||||||
|
return text.replace(/\x00/g, '\uFFFD');
|
||||||
|
};
|
||||||
|
|
||||||
|
export default replaceUnsupportedCharacters;
|
Loading…
x
Reference in New Issue
Block a user