diff --git a/packages/lib/services/searchengine/SearchEngine.test.js b/packages/lib/services/searchengine/SearchEngine.test.js index 17294d1795..58a2e5c124 100644 --- a/packages/lib/services/searchengine/SearchEngine.test.js +++ b/packages/lib/services/searchengine/SearchEngine.test.js @@ -1,6 +1,6 @@ /* eslint-disable no-unused-vars, @typescript-eslint/no-unused-vars, prefer-const */ -const { setupDatabaseAndSynchronizer, db, sleep, switchClient } = require('../../testing/test-utils.js'); +const { setupDatabaseAndSynchronizer, db, sleep, switchClient, msleep } = require('../../testing/test-utils.js'); const SearchEngine = require('../../services/searchengine/SearchEngine').default; const Note = require('../../models/Note').default; const ItemChange = require('../../models/ItemChange').default; @@ -148,6 +148,21 @@ describe('services_SearchEngine', () => { expect(rows[1].id).toBe(n2.id); })); + it('should order search results by relevance BM25 - 2', async () => { + // This simple test case didn't even work before due to a bug in the IDF + // calculation, and would just order by timestamp. + const n1 = await Note.save({ title: 'abcd abcd' }); // 1 + await msleep(1); + const n2 = await Note.save({ title: 'abcd' }); // 2 + + await engine.syncTables(); + + const rows = await engine.search('abcd'); + + expect(rows[0].id).toBe(n1.id); + expect(rows[1].id).toBe(n2.id); + }); + // TODO: Need to update and replace jasmine.mockDate() calls with Jest // equivalent diff --git a/packages/lib/services/searchengine/SearchEngine.ts b/packages/lib/services/searchengine/SearchEngine.ts index 7512899dc2..99b1ac137b 100644 --- a/packages/lib/services/searchengine/SearchEngine.ts +++ b/packages/lib/services/searchengine/SearchEngine.ts @@ -2,12 +2,13 @@ import Logger from '@joplin/utils/Logger'; import ItemChange from '../../models/ItemChange'; import Setting from '../../models/Setting'; import Note from '../../models/Note'; -import BaseModel from '../../BaseModel'; +import BaseModel, { ModelType } from '../../BaseModel'; import ItemChangeUtils from '../ItemChangeUtils'; import shim from '../../shim'; import filterParser, { Term } from './filterParser'; import queryBuilder from './queryBuilder'; import { ItemChangeEntity, NoteEntity } from '../database/types'; +import JoplinDatabase from '../../JoplinDatabase'; const { sprintf } = require('sprintf-js'); const { pregQuote, scriptType, removeDiacritics } = require('../../string-utils.js'); @@ -28,6 +29,17 @@ interface SearchOptions { appendWildCards?: boolean; } +interface ProcessResultsRow { + offsets: string; + user_updated_time: number; + matchinfo: Buffer; + item_type?: ModelType; + fields?: string[]; + weight?: number; + is_todo?: number; + todo_completed?: number; +} + export interface ComplexTerm { type: 'regex' | 'text'; value: string; @@ -53,7 +65,7 @@ export default class SearchEngine { // eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied public dispatch: Function = (_o: any) => {}; private logger_ = new Logger(); - private db_: any = null; + private db_: JoplinDatabase = null; private isIndexing_ = false; private syncCalls_: any[] = []; private scheduleSyncTablesIID_: any; @@ -93,8 +105,8 @@ export default class SearchEngine { } private async rebuildIndex_() { - let noteIds: string[] = await this.db().selectAll('SELECT id FROM notes WHERE is_conflict = 0 AND encryption_applied = 0'); - noteIds = noteIds.map((n: any) => n.id); + const notes = await this.db().selectAll('SELECT id FROM notes WHERE is_conflict = 0 AND encryption_applied = 0'); + const noteIds = notes.map(n => n.id); const lastChangeId = await ItemChange.lastChangeId(); @@ -292,7 +304,7 @@ export default class SearchEngine { - private calculateWeightBM25_(rows: any[]) { + private calculateWeightBM25_(rows: ProcessResultsRow[]) { // https://www.sqlite.org/fts3.html#matchinfo // pcnalx are the arguments passed to matchinfo // p - The number of matchable phrases in the query. @@ -318,7 +330,6 @@ export default class SearchEngine { const TITLE_COLUMN = 1; const BODY_COLUMN = 2; const columns = [TITLE_COLUMN, BODY_COLUMN]; - // const NUM_COLS = 12; const numPhrases = generalInfo[0]; // p const numColumns = generalInfo[1]; // c @@ -332,21 +343,23 @@ export default class SearchEngine { const numBodyTokens = matchInfo.map(m => m[5 + numColumns]); const numTokens = [null, numTitleTokens, numBodyTokens]; - const X = matchInfo.map(m => m.slice(27)); // x + // In byte size, we have for notes_normalized: + // + // p 1 + // c 1 + // n 1 + // a 12 + // l 12 + const X = matchInfo.map(m => m.slice(1 + 1 + 1 + numColumns + numColumns)); // x const hitsThisRow = (array: any, c: number, p: number) => array[3 * (c + p * numColumns) + 0]; // const hitsAllRows = (array, c, p) => array[3 * (c + p*NUM_COLS) + 1]; const docsWithHits = (array: any, c: number, p: number) => array[3 * (c + p * numColumns) + 2]; - - // if a term occurs in over half the documents in the collection - // then this model gives a negative term weight, which is presumably undesirable. - // But, assuming the use of a stop list, this normally doesn't happen, - // and the value for each summand can be given a floor of 0. - const IDF = (n: number, N: number) => Math.max(Math.log((N - n + 0.5) / (n + 0.5)), 0); + const IDF = (n: number, N: number) => Math.max(Math.log(((N - n + 0.5) / (n + 0.5)) + 1), 0); // https://en.wikipedia.org/wiki/Okapi_BM25 - const BM25 = (idf: any, freq: any, numTokens: number, avgTokens: any) => { + const BM25 = (idf: number, freq: number, numTokens: number, avgTokens: number) => { if (avgTokens === 0) { return 0; // To prevent division by zero } @@ -355,7 +368,7 @@ export default class SearchEngine { const msSinceEpoch = Math.round(new Date().getTime()); const msPerDay = 86400000; - const weightForDaysSinceLastUpdate = (row: any) => { + const weightForDaysSinceLastUpdate = (row: ProcessResultsRow) => { // BM25 weights typically range 0-10, and last updated date should weight similarly, though prioritizing recency logarithmically. // An alpha of 200 ensures matches in the last week will show up front (11.59) and often so for matches within 2 weeks (5.99), // but is much less of a factor at 30 days (2.84) or very little after 90 days (0.95), focusing mostly on content at that point. @@ -405,7 +418,7 @@ export default class SearchEngine { } } - private processResults_(rows: any[], parsedQuery: any, isBasicSearchResults = false) { + private processResults_(rows: ProcessResultsRow[], parsedQuery: any, isBasicSearchResults = false) { if (isBasicSearchResults) { this.processBasicSearchResults_(rows, parsedQuery); } else { @@ -642,7 +655,7 @@ export default class SearchEngine { try { const { query, params } = queryBuilder(parsedQuery.allTerms, useFts); const rows = await this.db().selectAll(query, params); - this.processResults_(rows, parsedQuery, !useFts); + this.processResults_(rows as ProcessResultsRow[], parsedQuery, !useFts); return rows; } catch (error) { this.logger().warn(`Cannot execute MATCH query: ${searchString}: ${error.message}`);