mirror of
https://github.com/laurent22/joplin.git
synced 2025-02-10 19:41:43 +02:00
All: Fix search engine ranking algorithm
This commit is contained in:
parent
3585412fcb
commit
f504cf1a83
@ -1,6 +1,6 @@
|
|||||||
/* eslint-disable no-unused-vars, @typescript-eslint/no-unused-vars, prefer-const */
|
/* eslint-disable no-unused-vars, @typescript-eslint/no-unused-vars, prefer-const */
|
||||||
|
|
||||||
const { setupDatabaseAndSynchronizer, db, sleep, switchClient } = require('../../testing/test-utils.js');
|
const { setupDatabaseAndSynchronizer, db, sleep, switchClient, msleep } = require('../../testing/test-utils.js');
|
||||||
const SearchEngine = require('../../services/searchengine/SearchEngine').default;
|
const SearchEngine = require('../../services/searchengine/SearchEngine').default;
|
||||||
const Note = require('../../models/Note').default;
|
const Note = require('../../models/Note').default;
|
||||||
const ItemChange = require('../../models/ItemChange').default;
|
const ItemChange = require('../../models/ItemChange').default;
|
||||||
@ -148,6 +148,21 @@ describe('services_SearchEngine', () => {
|
|||||||
expect(rows[1].id).toBe(n2.id);
|
expect(rows[1].id).toBe(n2.id);
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
it('should order search results by relevance BM25 - 2', async () => {
|
||||||
|
// This simple test case didn't even work before due to a bug in the IDF
|
||||||
|
// calculation, and would just order by timestamp.
|
||||||
|
const n1 = await Note.save({ title: 'abcd abcd' }); // 1
|
||||||
|
await msleep(1);
|
||||||
|
const n2 = await Note.save({ title: 'abcd' }); // 2
|
||||||
|
|
||||||
|
await engine.syncTables();
|
||||||
|
|
||||||
|
const rows = await engine.search('abcd');
|
||||||
|
|
||||||
|
expect(rows[0].id).toBe(n1.id);
|
||||||
|
expect(rows[1].id).toBe(n2.id);
|
||||||
|
});
|
||||||
|
|
||||||
// TODO: Need to update and replace jasmine.mockDate() calls with Jest
|
// TODO: Need to update and replace jasmine.mockDate() calls with Jest
|
||||||
// equivalent
|
// equivalent
|
||||||
|
|
||||||
|
@ -2,12 +2,13 @@ import Logger from '@joplin/utils/Logger';
|
|||||||
import ItemChange from '../../models/ItemChange';
|
import ItemChange from '../../models/ItemChange';
|
||||||
import Setting from '../../models/Setting';
|
import Setting from '../../models/Setting';
|
||||||
import Note from '../../models/Note';
|
import Note from '../../models/Note';
|
||||||
import BaseModel from '../../BaseModel';
|
import BaseModel, { ModelType } from '../../BaseModel';
|
||||||
import ItemChangeUtils from '../ItemChangeUtils';
|
import ItemChangeUtils from '../ItemChangeUtils';
|
||||||
import shim from '../../shim';
|
import shim from '../../shim';
|
||||||
import filterParser, { Term } from './filterParser';
|
import filterParser, { Term } from './filterParser';
|
||||||
import queryBuilder from './queryBuilder';
|
import queryBuilder from './queryBuilder';
|
||||||
import { ItemChangeEntity, NoteEntity } from '../database/types';
|
import { ItemChangeEntity, NoteEntity } from '../database/types';
|
||||||
|
import JoplinDatabase from '../../JoplinDatabase';
|
||||||
const { sprintf } = require('sprintf-js');
|
const { sprintf } = require('sprintf-js');
|
||||||
const { pregQuote, scriptType, removeDiacritics } = require('../../string-utils.js');
|
const { pregQuote, scriptType, removeDiacritics } = require('../../string-utils.js');
|
||||||
|
|
||||||
@ -28,6 +29,17 @@ interface SearchOptions {
|
|||||||
appendWildCards?: boolean;
|
appendWildCards?: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
interface ProcessResultsRow {
|
||||||
|
offsets: string;
|
||||||
|
user_updated_time: number;
|
||||||
|
matchinfo: Buffer;
|
||||||
|
item_type?: ModelType;
|
||||||
|
fields?: string[];
|
||||||
|
weight?: number;
|
||||||
|
is_todo?: number;
|
||||||
|
todo_completed?: number;
|
||||||
|
}
|
||||||
|
|
||||||
export interface ComplexTerm {
|
export interface ComplexTerm {
|
||||||
type: 'regex' | 'text';
|
type: 'regex' | 'text';
|
||||||
value: string;
|
value: string;
|
||||||
@ -53,7 +65,7 @@ export default class SearchEngine {
|
|||||||
// eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied
|
// eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied
|
||||||
public dispatch: Function = (_o: any) => {};
|
public dispatch: Function = (_o: any) => {};
|
||||||
private logger_ = new Logger();
|
private logger_ = new Logger();
|
||||||
private db_: any = null;
|
private db_: JoplinDatabase = null;
|
||||||
private isIndexing_ = false;
|
private isIndexing_ = false;
|
||||||
private syncCalls_: any[] = [];
|
private syncCalls_: any[] = [];
|
||||||
private scheduleSyncTablesIID_: any;
|
private scheduleSyncTablesIID_: any;
|
||||||
@ -93,8 +105,8 @@ export default class SearchEngine {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private async rebuildIndex_() {
|
private async rebuildIndex_() {
|
||||||
let noteIds: string[] = await this.db().selectAll('SELECT id FROM notes WHERE is_conflict = 0 AND encryption_applied = 0');
|
const notes = await this.db().selectAll('SELECT id FROM notes WHERE is_conflict = 0 AND encryption_applied = 0');
|
||||||
noteIds = noteIds.map((n: any) => n.id);
|
const noteIds = notes.map(n => n.id);
|
||||||
|
|
||||||
const lastChangeId = await ItemChange.lastChangeId();
|
const lastChangeId = await ItemChange.lastChangeId();
|
||||||
|
|
||||||
@ -292,7 +304,7 @@ export default class SearchEngine {
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
private calculateWeightBM25_(rows: any[]) {
|
private calculateWeightBM25_(rows: ProcessResultsRow[]) {
|
||||||
// https://www.sqlite.org/fts3.html#matchinfo
|
// https://www.sqlite.org/fts3.html#matchinfo
|
||||||
// pcnalx are the arguments passed to matchinfo
|
// pcnalx are the arguments passed to matchinfo
|
||||||
// p - The number of matchable phrases in the query.
|
// p - The number of matchable phrases in the query.
|
||||||
@ -318,7 +330,6 @@ export default class SearchEngine {
|
|||||||
const TITLE_COLUMN = 1;
|
const TITLE_COLUMN = 1;
|
||||||
const BODY_COLUMN = 2;
|
const BODY_COLUMN = 2;
|
||||||
const columns = [TITLE_COLUMN, BODY_COLUMN];
|
const columns = [TITLE_COLUMN, BODY_COLUMN];
|
||||||
// const NUM_COLS = 12;
|
|
||||||
|
|
||||||
const numPhrases = generalInfo[0]; // p
|
const numPhrases = generalInfo[0]; // p
|
||||||
const numColumns = generalInfo[1]; // c
|
const numColumns = generalInfo[1]; // c
|
||||||
@ -332,21 +343,23 @@ export default class SearchEngine {
|
|||||||
const numBodyTokens = matchInfo.map(m => m[5 + numColumns]);
|
const numBodyTokens = matchInfo.map(m => m[5 + numColumns]);
|
||||||
const numTokens = [null, numTitleTokens, numBodyTokens];
|
const numTokens = [null, numTitleTokens, numBodyTokens];
|
||||||
|
|
||||||
const X = matchInfo.map(m => m.slice(27)); // x
|
// In number of 32-bit integers (array elements, not bytes), we have for notes_normalized:
|
||||||
|
//
|
||||||
|
// p 1
|
||||||
|
// c 1
|
||||||
|
// n 1
|
||||||
|
// a 12
|
||||||
|
// l 12
|
||||||
|
const X = matchInfo.map(m => m.slice(1 + 1 + 1 + numColumns + numColumns)); // x
|
||||||
|
|
||||||
const hitsThisRow = (array: any, c: number, p: number) => array[3 * (c + p * numColumns) + 0];
|
const hitsThisRow = (array: any, c: number, p: number) => array[3 * (c + p * numColumns) + 0];
|
||||||
// const hitsAllRows = (array, c, p) => array[3 * (c + p*NUM_COLS) + 1];
|
// const hitsAllRows = (array, c, p) => array[3 * (c + p*NUM_COLS) + 1];
|
||||||
const docsWithHits = (array: any, c: number, p: number) => array[3 * (c + p * numColumns) + 2];
|
const docsWithHits = (array: any, c: number, p: number) => array[3 * (c + p * numColumns) + 2];
|
||||||
|
|
||||||
|
const IDF = (n: number, N: number) => Math.max(Math.log(((N - n + 0.5) / (n + 0.5)) + 1), 0);
|
||||||
// if a term occurs in over half the documents in the collection
|
|
||||||
// then this model gives a negative term weight, which is presumably undesirable.
|
|
||||||
// But, assuming the use of a stop list, this normally doesn't happen,
|
|
||||||
// and the value for each summand can be given a floor of 0.
|
|
||||||
const IDF = (n: number, N: number) => Math.max(Math.log((N - n + 0.5) / (n + 0.5)), 0);
|
|
||||||
|
|
||||||
// https://en.wikipedia.org/wiki/Okapi_BM25
|
// https://en.wikipedia.org/wiki/Okapi_BM25
|
||||||
const BM25 = (idf: any, freq: any, numTokens: number, avgTokens: any) => {
|
const BM25 = (idf: number, freq: number, numTokens: number, avgTokens: number) => {
|
||||||
if (avgTokens === 0) {
|
if (avgTokens === 0) {
|
||||||
return 0; // To prevent division by zero
|
return 0; // To prevent division by zero
|
||||||
}
|
}
|
||||||
@ -355,7 +368,7 @@ export default class SearchEngine {
|
|||||||
|
|
||||||
const msSinceEpoch = Math.round(new Date().getTime());
|
const msSinceEpoch = Math.round(new Date().getTime());
|
||||||
const msPerDay = 86400000;
|
const msPerDay = 86400000;
|
||||||
const weightForDaysSinceLastUpdate = (row: any) => {
|
const weightForDaysSinceLastUpdate = (row: ProcessResultsRow) => {
|
||||||
// BM25 weights typically range 0-10, and last updated date should weight similarly, though prioritizing recency logarithmically.
|
// BM25 weights typically range 0-10, and last updated date should weight similarly, though prioritizing recency logarithmically.
|
||||||
// An alpha of 200 ensures matches in the last week will show up front (11.59) and often so for matches within 2 weeks (5.99),
|
// An alpha of 200 ensures matches in the last week will show up front (11.59) and often so for matches within 2 weeks (5.99),
|
||||||
// but is much less of a factor at 30 days (2.84) or very little after 90 days (0.95), focusing mostly on content at that point.
|
// but is much less of a factor at 30 days (2.84) or very little after 90 days (0.95), focusing mostly on content at that point.
|
||||||
@ -405,7 +418,7 @@ export default class SearchEngine {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private processResults_(rows: any[], parsedQuery: any, isBasicSearchResults = false) {
|
private processResults_(rows: ProcessResultsRow[], parsedQuery: any, isBasicSearchResults = false) {
|
||||||
if (isBasicSearchResults) {
|
if (isBasicSearchResults) {
|
||||||
this.processBasicSearchResults_(rows, parsedQuery);
|
this.processBasicSearchResults_(rows, parsedQuery);
|
||||||
} else {
|
} else {
|
||||||
@ -642,7 +655,7 @@ export default class SearchEngine {
|
|||||||
try {
|
try {
|
||||||
const { query, params } = queryBuilder(parsedQuery.allTerms, useFts);
|
const { query, params } = queryBuilder(parsedQuery.allTerms, useFts);
|
||||||
const rows = await this.db().selectAll(query, params);
|
const rows = await this.db().selectAll(query, params);
|
||||||
this.processResults_(rows, parsedQuery, !useFts);
|
this.processResults_(rows as ProcessResultsRow[], parsedQuery, !useFts);
|
||||||
return rows;
|
return rows;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger().warn(`Cannot execute MATCH query: ${searchString}: ${error.message}`);
|
this.logger().warn(`Cannot execute MATCH query: ${searchString}: ${error.message}`);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user