1
0
mirror of https://github.com/laurent22/joplin.git synced 2025-06-21 23:17:42 +02:00

All: When searching, weight notes using Okapi BM25 score (#3454)

This commit is contained in:
Naveen M V
2020-08-19 04:23:28 +05:30
committed by GitHub
parent 0ae8d454ad
commit 82e96840e9
4 changed files with 196 additions and 26 deletions

View File

@ -252,6 +252,84 @@ class SearchEngine {
return occurenceCount / spread;
}
calculateWeightBM25_(rows) {
// https://www.sqlite.org/fts3.html#matchinfo
// pcnalx are the arguments passed to matchinfo
// p - The number of matchable phrases in the query.
// c - The number of user defined columns in the FTS table
// n - The number of rows in the FTS4 table.
// a - avg number of tokens in the text values stored in the column.
// l - For each column, the length of the value stored in the current
// row of the FTS4 table, in tokens.
// x - For each distinct combination of a phrase and table column, the
// following three values:
// hits_this_row
// hits_all_rows
// docs_with_hits
if (rows.length === 0) return;
const matchInfo = rows.map(row => new Uint32Array(row.matchinfo.buffer));
const generalInfo = matchInfo[0];
const K1 = 1.2;
const B = 0.75;
const TITLE_COLUMN = 1;
const BODY_COLUMN = 2;
const columns = [TITLE_COLUMN, BODY_COLUMN];
// const NUM_COLS = 12;
const numPhrases = generalInfo[0]; // p
const numColumns = generalInfo[1]; // c
const numRows = generalInfo[2]; // n
const avgTitleTokens = generalInfo[4]; // a
const avgBodyTokens = generalInfo[5];
const avgTokens = [null, avgTitleTokens, avgBodyTokens]; // we only need cols 1 and 2
const numTitleTokens = matchInfo.map(m => m[4 + numColumns]); // l
const numBodyTokens = matchInfo.map(m => m[5 + numColumns]);
const numTokens = [null, numTitleTokens, numBodyTokens];
const X = matchInfo.map(m => m.slice(27)); // x
const hitsThisRow = (array, c, p) => array[3 * (c + p * numColumns) + 0];
// const hitsAllRows = (array, c, p) => array[3 * (c + p*NUM_COLS) + 1];
const docsWithHits = (array, c, p) => array[3 * (c + p * numColumns) + 2];
// if a term occurs in over half the documents in the collection
// then this model gives a negative term weight, which is presumably undesirable.
// But, assuming the use of a stop list, this normally doesn't happen,
// and the value for each summand can be given a floor of 0.
const IDF = (n, N) => Math.max(Math.log((N - n + 0.5) / (n + 0.5)), 0);
// https://en.wikipedia.org/wiki/Okapi_BM25
const BM25 = (idf, freq, numTokens, avgTokens) => {
if (avgTokens === 0) {
return 0; // To prevent division by zero
}
return idf * (freq * (K1 + 1)) / (freq + K1 * (1 - B + B * (numTokens / avgTokens)));
};
for (let i = 0; i < rows.length; i++) {
const row = rows[i];
row.weight = 0;
for (let j = 0; j < numPhrases; j++) {
columns.forEach(column => {
const rowsWithHits = docsWithHits(X[i], column, j);
const frequencyHits = hitsThisRow(X[i], column, j);
const idf = IDF(rowsWithHits, numRows);
row.weight += BM25(idf, frequencyHits, numTokens[column][i], avgTokens[column]);
});
}
}
}
processBasicSearchResults_(rows, parsedQuery) {
const valueRegexs = parsedQuery.keys.includes('_') ? parsedQuery.terms['_'].map(term => term.valueRegex || term.value) : [];
const isTitleSearch = parsedQuery.keys.includes('title');
@ -274,10 +352,10 @@ class SearchEngine {
if (isBasicSearchResults) {
this.processBasicSearchResults_(rows, parsedQuery);
} else {
this.calculateWeightBM25_(rows);
for (let i = 0; i < rows.length; i++) {
const row = rows[i];
const offsets = row.offsets.split(' ').map(o => Number(o));
row.weight = this.calculateWeight_(offsets, parsedQuery.termCount);
row.fields = this.fieldNamesFromOffsets_(offsets);
}
}

View File

@ -42,6 +42,11 @@ class SearchEngineUtils {
if (idWasAutoAdded) delete sortedNotes[idx].id;
}
// Note that when the search engine index is somehow corrupted, it might contain
// references to notes that don't exist. Not clear how it can happen, but anyway
// handle it here by checking if `user_updated_time` IS NOT NULL. Was causing this
// issue: https://discourse.joplinapp.org/t/how-to-recover-corrupted-database/9367
if (noteIds.length !== notes.length) {
// remove null objects
return sortedNotes.filter(n => n);

View File

@ -389,6 +389,7 @@ export default function queryBuilder(terms: Term[]) {
notes_fts.id,
notes_fts.title,
offsets(notes_fts) AS offsets,
matchinfo(notes_fts, 'pcnalx') AS matchinfo,
notes_fts.user_created_time,
notes_fts.user_updated_time,
notes_fts.is_todo,