2018-12-09 22:45:50 +02:00
|
|
|
const { Logger } = require('lib/logger.js');
|
2018-12-10 20:54:46 +02:00
|
|
|
const ItemChange = require('lib/models/ItemChange.js');
|
|
|
|
const Setting = require('lib/models/Setting.js');
|
|
|
|
const Note = require('lib/models/Note.js');
|
|
|
|
const BaseModel = require('lib/BaseModel.js');
|
2019-01-14 21:11:54 +02:00
|
|
|
const ItemChangeUtils = require('lib/services/ItemChangeUtils');
|
|
|
|
const { pregQuote, scriptType } = require('lib/string-utils.js');
|
|
|
|
const removeDiacritics = require('diacritics').remove;
|
2019-06-28 01:48:52 +02:00
|
|
|
const { sprintf } = require('sprintf-js');
|
2020-08-08 01:13:21 +02:00
|
|
|
const filterParser = require('./filterParser').default;
|
|
|
|
const queryBuilder = require('./queryBuilder').default;
|
2018-12-09 22:45:50 +02:00
|
|
|
|
|
|
|
// Maintains the normalised note index (notes_normalized / notes_fts and,
// optionally, notes_spellfix) and runs note searches against it, falling back
// to a basic pattern search when FTS is unavailable or unsuitable.
class SearchEngine {

	// Creates an engine with a default logger and no database. setDb() must be
	// called before indexing or searching.
	constructor() {
		this.dispatch = () => {};
		this.logger_ = new Logger();
		this.db_ = null;
		this.isIndexing_ = false;
		this.syncCalls_ = [];
		// Handle of the timer created by scheduleSyncTables(), if one is pending.
		this.scheduleSyncTablesIID_ = null;
	}

	// Returns the shared instance, creating it on first use.
	static instance() {
		if (!SearchEngine.instance_) SearchEngine.instance_ = new SearchEngine();
		return SearchEngine.instance_;
	}

	setLogger(logger) {
		this.logger_ = logger;
	}

	logger() {
		return this.logger_;
	}

	setDb(db) {
		this.db_ = db;
	}

	db() {
		return this.db_;
	}

	// Returns the note with the given ID from `notes`, or null if there is none.
	noteById_(notes, noteId) {
		const match = notes.find(note => note.id === noteId);

		// The note may have been deleted since the change was recorded. For example in this case:
		// - Note created (Some Change object is recorded)
		// - Note is deleted
		// - ResourceService indexer runs.
		// In that case, there will be a change for the note, but the note will be gone.
		return match || null;
	}

	// Builds the batch query that inserts one normalised note into
	// notes_normalized. `noteId` is passed separately from `n` so that callers
	// can use the ID recorded on the change rather than the one on the note.
	insertNormalizedNoteQuery_(noteId, n) {
		return {
			sql: `
				INSERT INTO notes_normalized(${SearchEngine.relevantFields})
				VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
			params: [noteId, n.title, n.body, n.user_created_time, n.user_updated_time, n.is_todo, n.todo_completed, n.parent_id, n.latitude, n.longitude, n.altitude, n.source_url],
		};
	}

	// Rebuilds notes_normalized from scratch, in batches of 100 notes, then
	// records the last change ID seen before the rebuild started so that
	// syncTables_() resumes from there.
	async rebuildIndex_() {
		const noteRows = await this.db().selectAll('SELECT id FROM notes WHERE is_conflict = 0 AND encryption_applied = 0');
		const noteIds = noteRows.map(n => n.id);

		const lastChangeId = await ItemChange.lastChangeId();

		// First delete content of note_normalized, in case the previous initial indexing failed
		await this.db().exec('DELETE FROM notes_normalized');

		while (noteIds.length) {
			const currentIds = noteIds.splice(0, 100);
			const notes = await Note.modelSelectAll(`
				SELECT ${SearchEngine.relevantFields}
				FROM notes
				WHERE id IN ("${currentIds.join('","')}") AND is_conflict = 0 AND encryption_applied = 0`);

			const queries = notes.map(note => {
				const n = this.normalizeNote_(note);
				return this.insertNormalizedNoteQuery_(n.id, n);
			});

			if (!noteIds.length && (Setting.value('db.fuzzySearchEnabled') === 1)) {
				// On the last loop
				queries.push({ sql: 'INSERT INTO notes_spellfix(word,rank) SELECT term, documents FROM search_aux WHERE col=\'*\'' });
			}

			await this.db().transactionExecBatch(queries);
		}

		Setting.setValue('searchEngine.lastProcessedChangeId', lastChangeId);
	}

	// Schedules a syncTables() call in 10 seconds, unless one is already pending.
	scheduleSyncTables() {
		if (this.scheduleSyncTablesIID_) return;

		this.scheduleSyncTablesIID_ = setTimeout(async () => {
			try {
				await this.syncTables();
			} catch (error) {
				this.logger().error('SearchEngine::scheduleSyncTables: Error while syncing tables:', error);
			}
			this.scheduleSyncTablesIID_ = null;
		}, 10000);
	}

	// Resets the indexing state and runs a sync, which performs a full rebuild.
	async rebuildIndex() {
		Setting.setValue('searchEngine.lastProcessedChangeId', 0);
		Setting.setValue('searchEngine.initialIndexingDone', false);
		return this.syncTables();
	}

	// Brings notes_normalized up to date: performs the initial full indexing if
	// it has not been done yet, otherwise replays the note changes recorded in
	// item_changes since the last processed change, 10 at a time.
	// Does nothing if an indexing run is already in progress.
	async syncTables_() {
		if (this.isIndexing_) return;

		this.isIndexing_ = true;

		// The flag is reset in `finally` so that an unexpected failure (for
		// example in the initial rebuild) cannot leave the engine permanently
		// stuck in the "indexing" state, which would make all later calls no-ops.
		try {
			this.logger().info('SearchEngine: Updating FTS table...');

			await ItemChange.waitForAllSaved();

			if (!Setting.value('searchEngine.initialIndexingDone')) {
				await this.rebuildIndex_();
				Setting.setValue('searchEngine.initialIndexingDone', true);
				return;
			}

			const startTime = Date.now();

			const report = {
				inserted: 0,
				deleted: 0,
			};

			let lastChangeId = Setting.value('searchEngine.lastProcessedChangeId');

			try {
				while (true) {
					const changes = await ItemChange.modelSelectAll(
						`
						SELECT id, item_id, type
						FROM item_changes
						WHERE item_type = ?
						AND id > ?
						ORDER BY id ASC
						LIMIT 10
						`,
						[BaseModel.TYPE_NOTE, lastChangeId]
					);

					const queries = [];

					if (!changes.length) {
						// All changes have been processed: refresh the spellfix
						// vocabulary from the updated index before leaving.
						if (Setting.value('db.fuzzySearchEnabled') === 1) {
							queries.push({ sql: 'DELETE FROM notes_spellfix' });
							queries.push({ sql: 'INSERT INTO notes_spellfix(word,rank) SELECT term, documents FROM search_aux WHERE col=\'*\'' });
							await this.db().transactionExecBatch(queries);
						}
						break;
					}

					const noteIds = changes.map(a => a.item_id);
					const notes = await Note.modelSelectAll(`
						SELECT ${SearchEngine.relevantFields}
						FROM notes WHERE id IN ("${noteIds.join('","')}") AND is_conflict = 0 AND encryption_applied = 0`
					);

					for (const change of changes) {
						if (change.type === ItemChange.TYPE_CREATE || change.type === ItemChange.TYPE_UPDATE) {
							// Always drop the previous row; it is only re-inserted
							// if the note still exists and is indexable.
							queries.push({ sql: 'DELETE FROM notes_normalized WHERE id = ?', params: [change.item_id] });
							const note = this.noteById_(notes, change.item_id);
							if (note) {
								const n = this.normalizeNote_(note);
								queries.push(this.insertNormalizedNoteQuery_(change.item_id, n));
								report.inserted++;
							}
						} else if (change.type === ItemChange.TYPE_DELETE) {
							queries.push({ sql: 'DELETE FROM notes_normalized WHERE id = ?', params: [change.item_id] });
							report.deleted++;
						} else {
							throw new Error(`Invalid change type: ${change.type}`);
						}

						lastChangeId = change.id;
					}

					await this.db().transactionExecBatch(queries);
					Setting.setValue('searchEngine.lastProcessedChangeId', lastChangeId);
					await Setting.saveAll();
				}
			} catch (error) {
				this.logger().error('SearchEngine: Error while processing changes:', error);
			}

			await ItemChangeUtils.deleteProcessedChanges();

			this.logger().info(sprintf('SearchEngine: Updated FTS table in %dms. Inserted: %d. Deleted: %d', Date.now() - startTime, report.inserted, report.deleted));
		} finally {
			this.isIndexing_ = false;
		}
	}

	// Public entry point for syncTables_(). Tracks in-flight calls in
	// syncCalls_ so that destroy() can wait for them to finish.
	async syncTables() {
		this.syncCalls_.push(true);
		try {
			await this.syncTables_();
		} finally {
			this.syncCalls_.pop();
		}
	}

	// Returns the number of rows in notes_fts (0 if the count is unavailable).
	async countRows() {
		const row = await this.db().selectOne('SELECT count(*) as total FROM notes_fts');
		return row && row['total'] ? row['total'] : 0;
	}

	// Given the flattened output of the FTS offsets() function (groups of four
	// integers, the first being the column index), returns the distinct
	// notes_normalized field names that contain a match, in order of first
	// appearance.
	fieldNamesFromOffsets_(offsets) {
		const notesNormalizedFieldNames = this.db().tableFieldNames('notes_normalized');
		const occurrenceCount = Math.floor(offsets.length / 4);
		const fieldNames = new Set();

		for (let i = 0; i < occurrenceCount; i++) {
			const colIndex = offsets[i * 4];
			fieldNames.add(notesNormalizedFieldNames[colIndex]);
		}

		return Array.from(fieldNames);
	}

	// Computes a simple relevance weight from the flattened FTS offsets() output.
	calculateWeight_(offsets, termCount) {
		// Offset doc: https://www.sqlite.org/fts3.html#offsets

		// - If there's only one term in the query string, the content with the most matches goes on top
		// - If there are multiple terms, the result with the most occurrences that are closest to each other go on top.
		//   eg. if query is "abcd efgh", "abcd efgh" will go before "abcd XX efgh".

		const occurrenceCount = Math.floor(offsets.length / 4);

		if (termCount === 1) return occurrenceCount;

		let spread = 0;
		let previousDist = null;
		for (let i = 0; i < occurrenceCount; i++) {
			const dist = offsets[i * 4 + 2];
			if (previousDist !== null) spread += dist - previousDist;
			previousDist = dist;
		}

		// With zero or one occurrence (or occurrences at the same offset) there
		// is no spread to divide by. Fall back to the plain occurrence count
		// instead of producing Infinity or NaN.
		if (spread === 0) return occurrenceCount;

		// Divide the number of occurrences by the spread so even if a note has many times the searched terms
		// but these terms are very spread apart, they'll be given a lower weight than a note that has the
		// terms once or twice but just next to each other.
		return occurrenceCount / spread;
	}

	// Sets, on each row, `weight` (BM25 score over the title and body columns),
	// `fuzziness` (lowest fuzzy score among the phrases found in the row, 1000
	// if none) and `wordFound` (one boolean per query phrase). Each row must
	// have a `matchinfo` typed array produced with the "pcnalx" format.
	// `fuzzyScore` holds one score per query phrase.
	calculateWeightBM25_(rows, fuzzyScore) {
		// https://www.sqlite.org/fts3.html#matchinfo
		// pcnalx are the arguments passed to matchinfo
		// p - The number of matchable phrases in the query.
		// c - The number of user defined columns in the FTS table
		// n - The number of rows in the FTS4 table.
		// a - avg number of tokens in the text values stored in the column.
		// l - For each column, the length of the value stored in the current
		// row of the FTS4 table, in tokens.
		// x - For each distinct combination of a phrase and table column, the
		// following three values:
		// hits_this_row
		// hits_all_rows
		// docs_with_hits

		if (rows.length === 0) return;

		const matchInfo = rows.map(row => new Uint32Array(row.matchinfo.buffer));
		const generalInfo = matchInfo[0];

		const K1 = 1.2;
		const B = 0.75;

		const TITLE_COLUMN = 1;
		const BODY_COLUMN = 2;
		const columns = [TITLE_COLUMN, BODY_COLUMN];

		const numPhrases = generalInfo[0]; // p
		const numColumns = generalInfo[1]; // c
		const numRows = generalInfo[2]; // n

		// Layout of a matchinfo array: [p, c, n, a * c, l * c, x * (3 * c * p)].
		// The offsets are derived from the column count reported by SQLite
		// rather than assuming a fixed number of indexed columns.
		const avgOffset = 3;
		const lengthOffset = avgOffset + numColumns;
		const hitsOffset = lengthOffset + numColumns;

		// We only need cols 1 and 2; index 0 is a placeholder.
		const avgTokens = [null, generalInfo[avgOffset + TITLE_COLUMN], generalInfo[avgOffset + BODY_COLUMN]]; // a
		const numTokens = [
			null,
			matchInfo.map(m => m[lengthOffset + TITLE_COLUMN]), // l
			matchInfo.map(m => m[lengthOffset + BODY_COLUMN]),
		];

		const X = matchInfo.map(m => m.slice(hitsOffset)); // x

		const hitsThisRow = (array, c, p) => array[3 * (c + p * numColumns) + 0];
		const docsWithHits = (array, c, p) => array[3 * (c + p * numColumns) + 2];

		// if a term occurs in over half the documents in the collection
		// then this model gives a negative term weight, which is presumably undesirable.
		// But, assuming the use of a stop list, this normally doesn't happen,
		// and the value for each summand can be given a floor of 0.
		const IDF = (n, N) => Math.max(Math.log((N - n + 0.5) / (n + 0.5)), 0);

		// https://en.wikipedia.org/wiki/Okapi_BM25
		const BM25 = (idf, freq, tokenCount, avgTokenCount) => {
			if (avgTokenCount === 0) return 0; // To prevent division by zero
			return idf * (freq * (K1 + 1)) / (freq + K1 * (1 - B + B * (tokenCount / avgTokenCount)));
		};

		rows.forEach((row, i) => {
			row.weight = 0;
			row.fuzziness = 1000;
			row.wordFound = [];

			for (let j = 0; j < numPhrases; j++) {
				let found = false;

				for (const column of columns) {
					const rowsWithHits = docsWithHits(X[i], column, j);
					const frequencyHits = hitsThisRow(X[i], column, j);
					const idf = IDF(rowsWithHits, numRows);

					if (frequencyHits > 0) {
						found = true;
						row.fuzziness = Math.min(row.fuzziness, fuzzyScore[j]);
					}

					row.weight += BM25(idf, frequencyHits, numTokens[column][i], avgTokens[column]);
				}

				row.wordFound.push(found);
			}
		});
	}

	// Sets `fields`, `weight` and `fuzziness` on rows returned by the basic
	// (non-FTS) search, which provides no match information of its own.
	processBasicSearchResults_(rows, parsedQuery) {
		// Wildcard terms already carry a regex. Plain terms are escaped so that
		// characters such as "+" or "(" are matched literally instead of making
		// the RegExp constructor throw.
		const valueRegexs = parsedQuery.keys.includes('_') ? parsedQuery.terms['_'].map(term => term.valueRegex || pregQuote(term.value)) : [];
		const isTitleSearch = parsedQuery.keys.includes('title');
		const isOnlyTitle = parsedQuery.keys.length === 1 && isTitleSearch;

		for (const row of rows) {
			const testTitle = regex => new RegExp(regex, 'ig').test(row.title);
			const matchedFields = {
				title: isTitleSearch || valueRegexs.some(testTitle),
				body: !isOnlyTitle,
			};

			row.fields = Object.keys(matchedFields).filter(key => matchedFields[key]);
			row.weight = 0;
			row.fuzziness = 0;
		}
	}

	// Annotates `rows` with ranking information and sorts them in place by:
	// lowest fuzziness, title matches first, highest weight, completed to-dos
	// last, most recently updated first.
	processResults_(rows, parsedQuery, isBasicSearchResults = false) {
		// `wordsFound` holds one flag per fuzzy variant; `numFuzzyMatches` says
		// how many consecutive variants belong to each original word. The row
		// qualifies only if every original word has at least one variant found.
		const rowContainsAllWords = (wordsFound, numFuzzyMatches) => {
			let start = 0;
			for (const matchCount of numFuzzyMatches) {
				const end = start + matchCount;
				// This note doesn't contain any fuzzy matches for the word
				if (!wordsFound.slice(start, end).some(found => found)) return false;
				start = end;
			}
			return true;
		};

		if (isBasicSearchResults) {
			this.processBasicSearchResults_(rows, parsedQuery);
		} else {
			this.calculateWeightBM25_(rows, parsedQuery.fuzzyScore);
			for (const row of rows) {
				row.include = (parsedQuery.fuzzy && !parsedQuery.any) ? rowContainsAllWords(row.wordFound, parsedQuery.numFuzzyMatches) : true;
				const offsets = row.offsets.split(' ').map(o => Number(o));
				row.fields = this.fieldNamesFromOffsets_(offsets);
			}
		}

		const isCompletedTodo = row => !!(row.is_todo && row.todo_completed);

		rows.sort((a, b) => {
			if (a.fuzziness < b.fuzziness) return -1;
			if (a.fuzziness > b.fuzziness) return +1;

			const aInTitle = a.fields.includes('title');
			const bInTitle = b.fields.includes('title');
			if (aInTitle && !bInTitle) return -1;
			if (!aInTitle && bInTitle) return +1;

			if (a.weight < b.weight) return +1;
			if (a.weight > b.weight) return -1;

			// Only order by completion when it differs, so that the comparator
			// stays consistent and two completed to-dos are still ordered by
			// their update time.
			const aDone = isCompletedTodo(a);
			const bDone = isCompletedTodo(b);
			if (aDone !== bDone) return aDone ? +1 : -1;

			if (a.user_updated_time < b.user_updated_time) return +1;
			if (a.user_updated_time > b.user_updated_time) return -1;
			return 0;
		});
	}

	// Converts a wildcard query term to a regex source string: leading "*" are
	// dropped and a trailing "*" becomes a lazy run of non-separator characters.
	// https://stackoverflow.com/a/13818704/561309
	queryTermToRegex(term) {
		while (term.length && term.indexOf('*') === 0) {
			term = term.substr(1);
		}

		let regexString = pregQuote(term);
		if (regexString[regexString.length - 1] === '*') {
			// pregQuote() has escaped the trailing "*" as "\*", hence the two
			// characters removed here.
			regexString = `${regexString.substr(0, regexString.length - 2)}[^${pregQuote(' \t\n\r,.,+-*?!={}<>|:"\'()[]')}]` + '*?';
		}

		return regexString;
	}

	// For each word, looks up at most three close matches in notes_spellfix.
	// Resolves to one array of { word, score } rows per input word, in order.
	async fuzzifier(words) {
		const lookups = words.map(word => {
			return this.db().selectAll('SELECT word, score FROM notes_spellfix WHERE word MATCH ? AND top=3', [word]);
		});
		return await Promise.all(lookups);
	}

	// Parses a search query into filter terms. When `fuzzy` is null the
	// db.fuzzySearchEnabled setting decides whether words are expanded to their
	// fuzzy matches. Returns an object with:
	// - termCount: number of text terms
	// - keys: columns ("_", "title", "body") that have at least one term
	// - terms: text terms per column, as { type, value, scriptType[, valueRegex] }
	// - allTerms: every filter term, with text terms replaced by their fuzzy
	//   versions in fuzzy mode
	// - fuzzyScore: one score per non-negated text term
	// - numFuzzyMatches: number of fuzzy variants per original word (fuzzy mode)
	// - fuzzy: whether fuzzy mode was used
	// - any: whether the query contains an "any" filter
	async parseQuery(query, fuzzy = null) {
		if (fuzzy === null) fuzzy = Setting.value('db.fuzzySearchEnabled') === 1;

		const trimQuotes = (str) => str.startsWith('"') ? str.substr(1, str.length - 2) : str;
		const termValues = (termList) => termList.map(x => trimQuotes(x.value));
		const isTextTerm = (x) => x.name === 'text' || x.name === 'title' || x.name === 'body';

		let allTerms = [];
		let allFuzzyTerms = [];

		try {
			allTerms = filterParser(query);
		} catch (error) {
			// An unparsable query is treated as having no terms.
			console.warn(error);
		}

		const textTerms = allTerms.filter(x => x.name === 'text' && !x.negated);
		const titleTerms = allTerms.filter(x => x.name === 'title' && !x.negated);
		const bodyTerms = allTerms.filter(x => x.name === 'body' && !x.negated);

		const fuzzyScore = [];
		let numFuzzyMatches = [];
		let terms = null;

		if (fuzzy) {
			const fuzzyText = await this.fuzzifier(termValues(textTerms.filter(x => !(x.quoted || x.wildcard))));
			const fuzzyTitle = await this.fuzzifier(termValues(titleTerms.filter(x => !x.wildcard)));
			const fuzzyBody = await this.fuzzifier(termValues(bodyTerms.filter(x => !x.wildcard)));

			const phraseTextSearch = textTerms.filter(x => x.quoted);
			const wildCardSearch = textTerms.concat(titleTerms).concat(bodyTerms).filter(x => x.wildcard);

			// Save number of fuzzy matches we got for each word
			// fuzzifier() is currently set to return at most 3 matches
			// We need to know which fuzzy words go together so that we can filter out notes that don't contain a required word.
			numFuzzyMatches = fuzzyText.concat(fuzzyTitle).concat(fuzzyBody).map(x => x.length);
			for (let i = 0; i < phraseTextSearch.length + wildCardSearch.length; i++) {
				// Phrase searches and wildcard searches are preserved without fuzzification (A single match)
				numFuzzyMatches.push(1);
			}

			// Flattens the per-word match lists and turns each match into a filter term.
			const toFuzzyTerms = (name, matchesPerWord) => {
				return [].concat.apply([], matchesPerWord).map(x => {
					return { name: name, value: x.word, negated: false, score: x.score };
				});
			};

			const fuzzyTextTerms = toFuzzyTerms('text', fuzzyText);
			const fuzzyTitleTerms = toFuzzyTerms('title', fuzzyTitle);
			const fuzzyBodyTerms = toFuzzyTerms('body', fuzzyBody);

			// Remove previous text, title and body and replace with fuzzy versions
			// NOTE(review): this also removes negated text/title/body terms, which
			// are not added back below - confirm that this is intended.
			allTerms = allTerms.filter(x => !isTextTerm(x));

			// The order matters here!
			// The text goes first, then title, then body, then phrase and finally wildcard
			// This is because it needs to match with numFuzzyMatches.
			allFuzzyTerms = allTerms.concat(fuzzyTextTerms).concat(fuzzyTitleTerms).concat(fuzzyBodyTerms).concat(phraseTextSearch).concat(wildCardSearch);

			// One score per text term, in the order above. The scores are read
			// from the text terms only: non-text filters placed before them in
			// allFuzzyTerms must not shift the scores onto the wrong terms.
			for (const term of allFuzzyTerms.filter(isTextTerm)) {
				// Phrase searches and wildcard searches will get a fuzziness score of zero.
				// This means that they will go first in the sort order (Even if there are other words with matches in the title)
				// Undesirable?
				fuzzyScore.push(term.score ? term.score : 0);
			}

			const wildCardTextTerms = termValues(wildCardSearch.filter(x => x.name === 'text'));
			const wildCardTitleTerms = termValues(wildCardSearch.filter(x => x.name === 'title'));
			const wildCardBodyTerms = termValues(wildCardSearch.filter(x => x.name === 'body'));
			const phraseTextTerms = termValues(phraseTextSearch);

			terms = {
				_: termValues(fuzzyTextTerms).concat(phraseTextTerms).concat(wildCardTextTerms),
				title: termValues(fuzzyTitleTerms).concat(wildCardTitleTerms),
				body: termValues(fuzzyBodyTerms).concat(wildCardBodyTerms),
			};
		} else {
			const nonNegatedTextTerms = textTerms.length + titleTerms.length + bodyTerms.length;
			for (let i = 0; i < nonNegatedTextTerms; i++) {
				fuzzyScore.push(0);
			}
			terms = { _: termValues(textTerms), 'title': termValues(titleTerms), 'body': termValues(bodyTerms) };
		}

		// Filter terms:
		// - Convert wildcards to regex
		// - Remove columns with no results
		// - Add count of terms

		let termCount = 0;
		const keys = [];
		for (const col of Object.keys(terms)) {
			if (!terms[col].length) {
				delete terms[col];
				continue;
			}

			terms[col] = terms[col]
				// SQLite FTS doesn't allow "*" queries and neither shall we
				.filter(term => term !== '*')
				.map(term => {
					if (term.indexOf('*') >= 0) {
						return { type: 'regex', value: term, scriptType: scriptType(term), valueRegex: this.queryTermToRegex(term) };
					}
					return { type: 'text', value: term, scriptType: scriptType(term) };
				});

			termCount += terms[col].length;
			keys.push(col);
		}

		return {
			termCount: termCount,
			keys: keys,
			terms: terms, // text terms
			allTerms: fuzzy ? allFuzzyTerms : allTerms,
			fuzzyScore: fuzzyScore,
			numFuzzyMatches: numFuzzyMatches,
			fuzzy: fuzzy,
			any: !!allTerms.find(term => term.name === 'any'),
		};
	}

	// Returns all the text terms of a parsed query as a single flat array.
	allParsedQueryTerms(parsedQuery) {
		if (!parsedQuery || !parsedQuery.termCount) return [];

		return Object.keys(parsedQuery.terms).reduce((output, col) => {
			return output.concat(parsedQuery.terms[col]);
		}, []);
	}

	// Unicode-normalises (when supported), lower-cases and strips diacritics.
	normalizeText_(text) {
		const normalizedText = text.normalize ? text.normalize() : text;
		return removeDiacritics(normalizedText.toLowerCase());
	}

	// Returns a shallow copy of the note with normalised title and body.
	normalizeNote_(note) {
		const n = Object.assign({}, note);
		n.title = this.normalizeText_(n.title);
		n.body = this.normalizeText_(n.body);
		return n;
	}

	// Pattern-based search used when FTS cannot be used. Only the first term of
	// each column is searched for, as a "contains" pattern.
	async basicSearch(query) {
		// NOTE(review): without the "g" flag only the first asterisk is removed -
		// confirm whether all of them should be.
		query = query.replace(/\*/, '');
		const parsedQuery = await this.parseQuery(query);
		const searchOptions = {};

		for (const key of parsedQuery.keys) {
			if (parsedQuery.terms[key].length === 0) continue;

			const pattern = `*${parsedQuery.terms[key][0].value}*`;
			if (key === '_') searchOptions.anywherePattern = pattern;
			if (key === 'title') searchOptions.titlePattern = pattern;
			if (key === 'body') searchOptions.bodyPattern = pattern;
		}

		return Note.previews(null, searchOptions);
	}

	// Picks the search implementation for a query: basic when requested, when
	// FTS is disabled or when the query script is ja/zh/ko/th; otherwise FTS,
	// fuzzy or not depending on `options.fuzzy`.
	determineSearchType_(query, options) {
		if (options.searchType === SearchEngine.SEARCH_TYPE_BASIC) return SearchEngine.SEARCH_TYPE_BASIC;

		// If preferredSearchType is "fts" we auto-detect anyway
		// because it's not always supported.

		const st = scriptType(query);

		if (!Setting.value('db.ftsEnabled') || ['ja', 'zh', 'ko', 'th'].includes(st)) return SearchEngine.SEARCH_TYPE_BASIC;
		if (options.fuzzy) return SearchEngine.SEARCH_TYPE_FTS_FUZZY;
		return SearchEngine.SEARCH_TYPE_FTS;
	}

	// Searches notes and returns the matching rows, sorted by relevance.
	// Options: `searchType` (one of the SEARCH_TYPE_* constants, "auto" by
	// default) and `fuzzy` (defaults to the db.fuzzySearchEnabled setting).
	// Returns an empty array if the FTS query cannot be executed.
	async search(searchString, options = null) {
		options = Object.assign({}, {
			searchType: SearchEngine.SEARCH_TYPE_AUTO,
			fuzzy: Setting.value('db.fuzzySearchEnabled') === 1,
		}, options);

		searchString = this.normalizeText_(searchString);

		const searchType = this.determineSearchType_(searchString, options);

		if (searchType === SearchEngine.SEARCH_TYPE_BASIC) {
			// Non-alphabetical languages aren't support by SQLite FTS (except with extensions which are not available in all platforms)
			const rows = await this.basicSearch(searchString);
			const parsedQuery = await this.parseQuery(searchString);
			this.processResults_(rows, parsedQuery, true);
			return rows;
		}

		// SEARCH_TYPE_FTS or SEARCH_TYPE_FTS_FUZZY
		// FTS will ignore all special characters, like "-" in the index. So if
		// we search for "this-phrase" it won't find it because it will only
		// see "this phrase" in the index. Because of this, we remove the dashes
		// when searching.
		// https://github.com/laurent22/joplin/issues/1075#issuecomment-459258856

		const isFuzzy = searchType === SearchEngine.SEARCH_TYPE_FTS_FUZZY;
		const parsedQuery = await this.parseQuery(searchString, isFuzzy);

		try {
			const { query, params } = queryBuilder(parsedQuery.allTerms, isFuzzy);
			const rows = await this.db().selectAll(query, params);
			this.processResults_(rows, parsedQuery);
			if (isFuzzy && !parsedQuery.any) {
				return rows.filter(row => row.include);
			}
			return rows;
		} catch (error) {
			this.logger().warn(`Cannot execute MATCH query: ${searchString}: ${error.message}`);
			return [];
		}
	}

	// Cancels any scheduled sync, clears the shared instance and resolves once
	// all in-flight syncTables() calls have completed (polled every 100 ms).
	async destroy() {
		if (this.scheduleSyncTablesIID_) {
			clearTimeout(this.scheduleSyncTablesIID_);
			this.scheduleSyncTablesIID_ = null;
		}
		SearchEngine.instance_ = null;

		return new Promise((resolve) => {
			const iid = setInterval(() => {
				if (!this.syncCalls_.length) {
					clearInterval(iid);
					// Clear the singleton again in case instance() was called
					// while waiting. The slot lives on the class, not on `this`.
					SearchEngine.instance_ = null;
					resolve();
				}
			}, 100);
		});
	}
}
|
|
|
|
|
2020-08-08 01:13:21 +02:00
|
|
|
// Class-level configuration and constants.
Object.assign(SearchEngine, {
	// Comma-separated list of the note columns that are selected from `notes`
	// and inserted into `notes_normalized`.
	relevantFields: 'id, title, body, user_created_time, user_updated_time, is_todo, todo_completed, parent_id, latitude, longitude, altitude, source_url',

	// Shared instance returned by SearchEngine.instance().
	instance_: null,

	// Values accepted by the `searchType` option of search().
	SEARCH_TYPE_AUTO: 'auto',
	SEARCH_TYPE_BASIC: 'basic',
	SEARCH_TYPE_FTS: 'fts',
	SEARCH_TYPE_FTS_FUZZY: 'fts_fuzzy',
});

module.exports = SearchEngine;
|