1
0
mirror of https://github.com/laurent22/joplin.git synced 2025-01-26 18:58:21 +02:00

All: When searching, weight notes using Okapi BM25 score (#3454)

This commit is contained in:
Naveen M V 2020-08-19 04:23:28 +05:30 committed by GitHub
parent 0ae8d454ad
commit 82e96840e9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 196 additions and 26 deletions

View File

@ -16,6 +16,39 @@ process.on('unhandledRejection', (reason, p) => {
let engine = null;
// Inverse document frequency component of BM25.
// N is the total number of notes, n the number of notes containing the term.
// The result is floored at zero so that very common terms never contribute
// a negative weight.
const IDF = (N, n) => {
	const ratio = (N - n + 0.5) / (n + 0.5);
	const idf = Math.log(ratio);
	return Math.max(idf, 0);
};
// Counts how many times `word` occurs as a whole word in `string`.
// The word is matched literally: regex metacharacters it contains are escaped
// before it is embedded in the pattern, so a term such as "a.b" only matches
// "a.b" and a term such as "c(" cannot produce an invalid pattern.
const frequency = (word, string) => {
	const escapedWord = word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
	const re = new RegExp(`\\b(${escapedWord})\\b`, 'g');
	return (string.match(re) || []).length;
};
// Reference BM25 implementation used to cross-check the search engine.
//
// searchString: a single search term, counted in note titles via frequency().
// notes: array of objects with a `title` string.
//
// Returns the BM25 title scores of the notes that contain the term, sorted
// from highest to lowest score.
const calculateScore = (searchString, notes) => {
	const K1 = 1.2;
	const B = 0.75;

	const freqTitle = notes.map(note => frequency(searchString, note.title));
	const notesWithWord = freqTitle.filter(count => count !== 0).length;
	const numTokens = notes.map(note => note.title.split(' ').length);
	// The average title length is rounded to an integer before use.
	const avgTokens = Math.round(numTokens.reduce((a, b) => a + b, 0) / notes.length);

	// -1 is kept as the score when the average length is zero and the
	// formula below cannot be evaluated.
	const titleBM25 = new Array(notes.length).fill(-1);
	if (avgTokens !== 0) {
		// The IDF term is the same for every note, so compute it once.
		const idf = IDF(notes.length, notesWithWord);
		for (let i = 0; i < notes.length; i++) {
			titleBM25[i] = idf * ((freqTitle[i] * (K1 + 1)) / (freqTitle[i] + K1 * (1 - B + B * (numTokens[i] / avgTokens))));
		}
	}

	// Only notes that actually contain the term are part of the result.
	const scores = titleBM25.filter((score, i) => freqTitle[i]);

	// Sort numerically, highest first. The default Array.prototype.sort()
	// compares elements as strings, which would put e.g. 11.4 below 5.9.
	scores.sort((a, b) => b - a);
	return scores;
};
describe('services_SearchEngine', function() {
beforeEach(async (done) => {
@ -79,17 +112,92 @@ describe('services_SearchEngine', function() {
}));
it('should order search results by relevance (1)', asyncTest(async () => {
it('should order search results by relevance BM25', asyncTest(async () => {
// BM25 is based on term frequency - inverse document frequency (tf-idf).
// The tf–idf value increases proportionally to the number of times a word appears in the document
// and is offset by the number of documents in the corpus that contain the word, which helps to adjust
// for the fact that some words appear more frequently in general.
// BM25 returns a weight of zero for a search term that occurs in more than half the notes,
// so terms that are abundant across the notes have zero relevance w.r.t. BM25.
// The trailing number on the notes below is the position expected for that
// note in the results of the 'abcd' search.
const n1 = await Note.save({ title: 'abcd efgh' }); // expected position 3
const n2 = await Note.save({ title: 'abcd aaaaa abcd abcd' }); // expected position 1
const n2 = await Note.save({ title: 'abcd efgh abcd abcd' }); // expected position 1
const n3 = await Note.save({ title: 'abcd aaaaa bbbb eeee abcd' }); // expected position 2
// Filler notes whose titles contain neither 'abcd' nor 'efgh'.
const n4 = await Note.save({ title: 'xyz xyz' });
const n5 = await Note.save({ title: 'xyz xyz xyz xyz' });
const n6 = await Note.save({ title: 'xyz xyz xyz xyz xyz xyz' });
const n7 = await Note.save({ title: 'xyz xyz xyz xyz xyz xyz' });
const n8 = await Note.save({ title: 'xyz xyz xyz xyz xyz xyz xyz xyz' });
await engine.syncTables();
const rows = await engine.search('abcd');
let rows = await engine.search('abcd');
// Single-term query: results are expected in the order n2, n3, n1.
expect(rows[0].id).toBe(n2.id);
expect(rows[1].id).toBe(n3.id);
expect(rows[2].id).toBe(n1.id);
// Two-term query: n1 is expected ahead of n2.
rows = await engine.search('abcd efgh');
expect(rows[0].id).toBe(n1.id); // shorter note; also 'efgh' is more rare than 'abcd'.
expect(rows[1].id).toBe(n2.id);
}));
it('should correctly weigh notes using BM25', asyncTest(async () => {
	// Notes are saved with a title only; the same data is fed to the
	// reference implementation (calculateScore) and to the search engine.
	const noteData = [
		{ title: 'abc test2 test2' },
		{ title: 'foo foo' },
		{ title: 'dead beef' },
		{ title: 'test2 bar' },
		{ title: 'blah blah abc' },
	];

	// Save the notes one at a time, in declaration order.
	for (const data of noteData) {
		await Note.save(data);
	}

	await engine.syncTables();

	// Each entry is [search term, number of leading results whose weight
	// is compared against the reference score at the same position].
	const checks = [
		['abc', 2],
		['test2', 2],
		['foo', 1],
	];

	for (const [term, comparedCount] of checks) {
		const expectedScores = calculateScore(term, noteData);
		const foundRows = await engine.search(term);

		for (let k = 0; k < comparedCount; k++) {
			expect(foundRows[k].weight).toEqual(expectedScores[k]);
		}
	}
}));
it('should tell where the results are found', asyncTest(async () => {
@ -118,28 +226,6 @@ describe('services_SearchEngine', function() {
}
}));
it('should order search results by relevance (2)', asyncTest(async () => {
// Checks the order in which notes are returned for the two-term query
// 'abcd efgh'. The comment above each note gives the position at which
// it is expected in the results (1 = first).

// Expected position 1 (the only note that also has a body containing both terms)
const n1 = await Note.save({ title: 'abcd efgh', body: 'XX abcd XX efgh' });
// Expected position 4
const n2 = await Note.save({ title: 'abcd aaaaa bbbb eeee efgh' });
// Expected position 3
const n3 = await Note.save({ title: 'abcd aaaaa efgh' });
// Expected position 2
const n4 = await Note.save({ title: 'blablablabla blabla bla abcd X efgh' });
// Expected position 5 (both terms occur several times, separated by many other words)
const n5 = await Note.save({ title: 'occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh' });
await engine.syncTables();
const rows = await engine.search('abcd efgh');
// Results must come back in the order n1, n4, n3, n2, n5.
expect(rows[0].id).toBe(n1.id);
expect(rows[1].id).toBe(n4.id);
expect(rows[2].id).toBe(n3.id);
expect(rows[3].id).toBe(n2.id);
expect(rows[4].id).toBe(n5.id);
}));
it('should order search results by relevance (last updated first)', asyncTest(async () => {
let rows;

View File

@ -252,6 +252,84 @@ class SearchEngine {
return occurenceCount / spread;
}
calculateWeightBM25_(rows) {
// https://www.sqlite.org/fts3.html#matchinfo
// pcnalx are the arguments passed to matchinfo
// p - The number of matchable phrases in the query.
// c - The number of user defined columns in the FTS table
// n - The number of rows in the FTS4 table.
// a - avg number of tokens in the text values stored in the column.
// l - For each column, the length of the value stored in the current
// row of the FTS4 table, in tokens.
// x - For each distinct combination of a phrase and table column, the
// following three values:
// hits_this_row
// hits_all_rows
// docs_with_hits
if (rows.length === 0) return;
const matchInfo = rows.map(row => new Uint32Array(row.matchinfo.buffer));
const generalInfo = matchInfo[0];
const K1 = 1.2;
const B = 0.75;
const TITLE_COLUMN = 1;
const BODY_COLUMN = 2;
const columns = [TITLE_COLUMN, BODY_COLUMN];
// const NUM_COLS = 12;
const numPhrases = generalInfo[0]; // p
const numColumns = generalInfo[1]; // c
const numRows = generalInfo[2]; // n
const avgTitleTokens = generalInfo[4]; // a
const avgBodyTokens = generalInfo[5];
const avgTokens = [null, avgTitleTokens, avgBodyTokens]; // we only need cols 1 and 2
const numTitleTokens = matchInfo.map(m => m[4 + numColumns]); // l
const numBodyTokens = matchInfo.map(m => m[5 + numColumns]);
const numTokens = [null, numTitleTokens, numBodyTokens];
const X = matchInfo.map(m => m.slice(27)); // x
const hitsThisRow = (array, c, p) => array[3 * (c + p * numColumns) + 0];
// const hitsAllRows = (array, c, p) => array[3 * (c + p*NUM_COLS) + 1];
const docsWithHits = (array, c, p) => array[3 * (c + p * numColumns) + 2];
// if a term occurs in over half the documents in the collection
// then this model gives a negative term weight, which is presumably undesirable.
// But, assuming the use of a stop list, this normally doesn't happen,
// and the value for each summand can be given a floor of 0.
const IDF = (n, N) => Math.max(Math.log((N - n + 0.5) / (n + 0.5)), 0);
// https://en.wikipedia.org/wiki/Okapi_BM25
const BM25 = (idf, freq, numTokens, avgTokens) => {
if (avgTokens === 0) {
return 0; // To prevent division by zero
}
return idf * (freq * (K1 + 1)) / (freq + K1 * (1 - B + B * (numTokens / avgTokens)));
};
for (let i = 0; i < rows.length; i++) {
const row = rows[i];
row.weight = 0;
for (let j = 0; j < numPhrases; j++) {
columns.forEach(column => {
const rowsWithHits = docsWithHits(X[i], column, j);
const frequencyHits = hitsThisRow(X[i], column, j);
const idf = IDF(rowsWithHits, numRows);
row.weight += BM25(idf, frequencyHits, numTokens[column][i], avgTokens[column]);
});
}
}
}
processBasicSearchResults_(rows, parsedQuery) {
const valueRegexs = parsedQuery.keys.includes('_') ? parsedQuery.terms['_'].map(term => term.valueRegex || term.value) : [];
const isTitleSearch = parsedQuery.keys.includes('title');
@ -274,10 +352,10 @@ class SearchEngine {
if (isBasicSearchResults) {
this.processBasicSearchResults_(rows, parsedQuery);
} else {
this.calculateWeightBM25_(rows);
for (let i = 0; i < rows.length; i++) {
const row = rows[i];
const offsets = row.offsets.split(' ').map(o => Number(o));
row.weight = this.calculateWeight_(offsets, parsedQuery.termCount);
row.fields = this.fieldNamesFromOffsets_(offsets);
}
}

View File

@ -42,6 +42,11 @@ class SearchEngineUtils {
if (idWasAutoAdded) delete sortedNotes[idx].id;
}
// Note that when the search engine index is somehow corrupted, it might contain
// references to notes that don't exist. Not clear how it can happen, but anyway
// handle it here by checking if `user_updated_time` IS NOT NULL. Was causing this
// issue: https://discourse.joplinapp.org/t/how-to-recover-corrupted-database/9367
if (noteIds.length !== notes.length) {
// remove null objects
return sortedNotes.filter(n => n);

View File

@ -389,6 +389,7 @@ export default function queryBuilder(terms: Term[]) {
notes_fts.id,
notes_fts.title,
offsets(notes_fts) AS offsets,
matchinfo(notes_fts, 'pcnalx') AS matchinfo,
notes_fts.user_created_time,
notes_fts.user_updated_time,
notes_fts.is_todo,