mirror of
https://github.com/laurent22/joplin.git
synced 2025-02-04 19:16:07 +02:00
All: Fix search engine ranking algorithm
This commit is contained in:
parent
3585412fcb
commit
f504cf1a83
@ -1,6 +1,6 @@
|
||||
/* eslint-disable no-unused-vars, @typescript-eslint/no-unused-vars, prefer-const */
|
||||
|
||||
const { setupDatabaseAndSynchronizer, db, sleep, switchClient } = require('../../testing/test-utils.js');
|
||||
const { setupDatabaseAndSynchronizer, db, sleep, switchClient, msleep } = require('../../testing/test-utils.js');
|
||||
const SearchEngine = require('../../services/searchengine/SearchEngine').default;
|
||||
const Note = require('../../models/Note').default;
|
||||
const ItemChange = require('../../models/ItemChange').default;
|
||||
@ -148,6 +148,21 @@ describe('services_SearchEngine', () => {
|
||||
expect(rows[1].id).toBe(n2.id);
|
||||
}));
|
||||
|
||||
it('should order search results by relevance BM25 - 2', async () => {
|
||||
// This simple test case didn't even work before due to a bug in the IDF
|
||||
// calculation, and would just order by timestamp.
|
||||
const n1 = await Note.save({ title: 'abcd abcd' }); // 1
|
||||
await msleep(1);
|
||||
const n2 = await Note.save({ title: 'abcd' }); // 2
|
||||
|
||||
await engine.syncTables();
|
||||
|
||||
const rows = await engine.search('abcd');
|
||||
|
||||
expect(rows[0].id).toBe(n1.id);
|
||||
expect(rows[1].id).toBe(n2.id);
|
||||
});
|
||||
|
||||
// TODO: Need to update and replace jasmine.mockDate() calls with Jest
|
||||
// equivalent
|
||||
|
||||
|
@ -2,12 +2,13 @@ import Logger from '@joplin/utils/Logger';
|
||||
import ItemChange from '../../models/ItemChange';
|
||||
import Setting from '../../models/Setting';
|
||||
import Note from '../../models/Note';
|
||||
import BaseModel from '../../BaseModel';
|
||||
import BaseModel, { ModelType } from '../../BaseModel';
|
||||
import ItemChangeUtils from '../ItemChangeUtils';
|
||||
import shim from '../../shim';
|
||||
import filterParser, { Term } from './filterParser';
|
||||
import queryBuilder from './queryBuilder';
|
||||
import { ItemChangeEntity, NoteEntity } from '../database/types';
|
||||
import JoplinDatabase from '../../JoplinDatabase';
|
||||
const { sprintf } = require('sprintf-js');
|
||||
const { pregQuote, scriptType, removeDiacritics } = require('../../string-utils.js');
|
||||
|
||||
@ -28,6 +29,17 @@ interface SearchOptions {
|
||||
appendWildCards?: boolean;
|
||||
}
|
||||
|
||||
interface ProcessResultsRow {
|
||||
offsets: string;
|
||||
user_updated_time: number;
|
||||
matchinfo: Buffer;
|
||||
item_type?: ModelType;
|
||||
fields?: string[];
|
||||
weight?: number;
|
||||
is_todo?: number;
|
||||
todo_completed?: number;
|
||||
}
|
||||
|
||||
export interface ComplexTerm {
|
||||
type: 'regex' | 'text';
|
||||
value: string;
|
||||
@ -53,7 +65,7 @@ export default class SearchEngine {
|
||||
// eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied
|
||||
public dispatch: Function = (_o: any) => {};
|
||||
private logger_ = new Logger();
|
||||
private db_: any = null;
|
||||
private db_: JoplinDatabase = null;
|
||||
private isIndexing_ = false;
|
||||
private syncCalls_: any[] = [];
|
||||
private scheduleSyncTablesIID_: any;
|
||||
@ -93,8 +105,8 @@ export default class SearchEngine {
|
||||
}
|
||||
|
||||
private async rebuildIndex_() {
|
||||
let noteIds: string[] = await this.db().selectAll('SELECT id FROM notes WHERE is_conflict = 0 AND encryption_applied = 0');
|
||||
noteIds = noteIds.map((n: any) => n.id);
|
||||
const notes = await this.db().selectAll('SELECT id FROM notes WHERE is_conflict = 0 AND encryption_applied = 0');
|
||||
const noteIds = notes.map(n => n.id);
|
||||
|
||||
const lastChangeId = await ItemChange.lastChangeId();
|
||||
|
||||
@ -292,7 +304,7 @@ export default class SearchEngine {
|
||||
|
||||
|
||||
|
||||
private calculateWeightBM25_(rows: any[]) {
|
||||
private calculateWeightBM25_(rows: ProcessResultsRow[]) {
|
||||
// https://www.sqlite.org/fts3.html#matchinfo
|
||||
// pcnalx are the arguments passed to matchinfo
|
||||
// p - The number of matchable phrases in the query.
|
||||
@ -318,7 +330,6 @@ export default class SearchEngine {
|
||||
const TITLE_COLUMN = 1;
|
||||
const BODY_COLUMN = 2;
|
||||
const columns = [TITLE_COLUMN, BODY_COLUMN];
|
||||
// const NUM_COLS = 12;
|
||||
|
||||
const numPhrases = generalInfo[0]; // p
|
||||
const numColumns = generalInfo[1]; // c
|
||||
@ -332,21 +343,23 @@ export default class SearchEngine {
|
||||
const numBodyTokens = matchInfo.map(m => m[5 + numColumns]);
|
||||
const numTokens = [null, numTitleTokens, numBodyTokens];
|
||||
|
||||
const X = matchInfo.map(m => m.slice(27)); // x
|
||||
// In byte size, we have for notes_normalized:
|
||||
//
|
||||
// p 1
|
||||
// c 1
|
||||
// n 1
|
||||
// a 12
|
||||
// l 12
|
||||
const X = matchInfo.map(m => m.slice(1 + 1 + 1 + numColumns + numColumns)); // x
|
||||
|
||||
const hitsThisRow = (array: any, c: number, p: number) => array[3 * (c + p * numColumns) + 0];
|
||||
// const hitsAllRows = (array, c, p) => array[3 * (c + p*NUM_COLS) + 1];
|
||||
const docsWithHits = (array: any, c: number, p: number) => array[3 * (c + p * numColumns) + 2];
|
||||
|
||||
|
||||
// if a term occurs in over half the documents in the collection
|
||||
// then this model gives a negative term weight, which is presumably undesirable.
|
||||
// But, assuming the use of a stop list, this normally doesn't happen,
|
||||
// and the value for each summand can be given a floor of 0.
|
||||
const IDF = (n: number, N: number) => Math.max(Math.log((N - n + 0.5) / (n + 0.5)), 0);
|
||||
const IDF = (n: number, N: number) => Math.max(Math.log(((N - n + 0.5) / (n + 0.5)) + 1), 0);
|
||||
|
||||
// https://en.wikipedia.org/wiki/Okapi_BM25
|
||||
const BM25 = (idf: any, freq: any, numTokens: number, avgTokens: any) => {
|
||||
const BM25 = (idf: number, freq: number, numTokens: number, avgTokens: number) => {
|
||||
if (avgTokens === 0) {
|
||||
return 0; // To prevent division by zero
|
||||
}
|
||||
@ -355,7 +368,7 @@ export default class SearchEngine {
|
||||
|
||||
const msSinceEpoch = Math.round(new Date().getTime());
|
||||
const msPerDay = 86400000;
|
||||
const weightForDaysSinceLastUpdate = (row: any) => {
|
||||
const weightForDaysSinceLastUpdate = (row: ProcessResultsRow) => {
|
||||
// BM25 weights typically range 0-10, and last updated date should weight similarly, though prioritizing recency logarithmically.
|
||||
// An alpha of 200 ensures matches in the last week will show up front (11.59) and often so for matches within 2 weeks (5.99),
|
||||
// but is much less of a factor at 30 days (2.84) or very little after 90 days (0.95), focusing mostly on content at that point.
|
||||
@ -405,7 +418,7 @@ export default class SearchEngine {
|
||||
}
|
||||
}
|
||||
|
||||
private processResults_(rows: any[], parsedQuery: any, isBasicSearchResults = false) {
|
||||
private processResults_(rows: ProcessResultsRow[], parsedQuery: any, isBasicSearchResults = false) {
|
||||
if (isBasicSearchResults) {
|
||||
this.processBasicSearchResults_(rows, parsedQuery);
|
||||
} else {
|
||||
@ -642,7 +655,7 @@ export default class SearchEngine {
|
||||
try {
|
||||
const { query, params } = queryBuilder(parsedQuery.allTerms, useFts);
|
||||
const rows = await this.db().selectAll(query, params);
|
||||
this.processResults_(rows, parsedQuery, !useFts);
|
||||
this.processResults_(rows as ProcessResultsRow[], parsedQuery, !useFts);
|
||||
return rows;
|
||||
} catch (error) {
|
||||
this.logger().warn(`Cannot execute MATCH query: ${searchString}: ${error.message}`);
|
||||
|
Loading…
x
Reference in New Issue
Block a user