All: Sort search results by average of multiple criteria, including 'Sort notes by' field setting (#3777)
* Weight search results by most recently updated

  As discussed here: https://github.com/laurent22/joplin/pull/3777#issuecomment-696491859

  Before this commit, results were rarely sorted by date: content weights and fuzziness were determined, and then the first criterion to differ would win in sort order (and user_updated_time was the last criterion checked). Now the weight score itself also includes the age of user_updated_time, surfacing fresh content. At the current alpha level, results are weighted logarithmically, prioritizing mostly within the last 30 days, and especially heavily within the past week.

* Updated unit tests to weight search results by last updated date

* Updated unit test title

* Fixed an issue with the weighted search engine test, and made it more deterministic using a mock date

  The date was being calculated only at the start of the test suite. It also wasn't using a set mock date, so the milliseconds between the real search engine calculations and the test calculation caused differences in results.

* Added initial Search Engine spec

* Added Search Engine spec to README.md

* Renamed Search Sorting spec per laurent22's suggested naming

* Revised copy in the search sorting spec

Co-authored-by: Laurent <laurent22@users.noreply.github.com>
This commit is contained in:
parent c42d9cf069
commit 5eb0417b1a
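To illustrate the change described in the commit message, here is a small hypothetical sketch (the note objects and numbers are made up; the boost formula mirrors the weightForDaysSinceLastUpdate helper added in the diffs below): with recency folded into the weight itself, a slightly weaker text match that was updated recently can now outrank a stronger but stale match.

```js
// Hypothetical illustration only, not code from this commit.
// The boost has the same shape as the weightForDaysSinceLastUpdate helper in the diffs below.
const recencyBoost = (daysSinceLastUpdate) => 200 * Math.log(1 + 1 / Math.max(daysSinceLastUpdate, 0.5));

const staleStrongMatch = { bm25: 9, daysSinceLastUpdate: 90 }; // made-up numbers
const freshWeakMatch = { bm25: 6, daysSinceLastUpdate: 2 };

// The sort weight is now the content score plus the recency boost.
const weight = (note) => note.bm25 + recencyBoost(note.daysSinceLastUpdate);

console.log(weight(freshWeakMatch) > weight(staleStrongMatch)); // true: the fresh note now sorts first
```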
@@ -4,7 +4,7 @@
 require('app-module-path').addPath(__dirname);

 const { time } = require('lib/time-utils.js');
-const { fileContentEqual, setupDatabase, setupDatabaseAndSynchronizer, asyncTest, db, synchronizer, fileApi, sleep, clearDatabase, switchClient, syncTargetId, objectsEqual, checkThrowAsync } = require('test-utils.js');
+const { fileContentEqual, setupDatabase, setupDatabaseAndSynchronizer, asyncTest, db, synchronizer, fileApi, sleep, clearDatabase, switchClient, syncTargetId, objectsEqual, checkThrowAsync, mockDate, restoreDate } = require('test-utils.js');
 const SearchEngine = require('lib/services/searchengine/SearchEngine');
 const Note = require('lib/models/Note');
 const ItemChange = require('lib/models/ItemChange');
@@ -33,16 +33,32 @@ const calculateScore = (searchString, notes) => {
 	const numTokens = notes.map(note => note.title.split(' ').length);
 	const avgTokens = Math.round(numTokens.reduce((a, b) => a + b, 0) / notes.length);

-	let titleBM25 = new Array(notes.length).fill(-1);
+	const msSinceEpoch = Math.round(new Date().getTime());
+	const msPerDay = 86400000;
+	const weightForDaysSinceLastUpdate = (row) => {
+		// BM25 weights typically range 0-10, and last updated date should weight similarly, though prioritizing recency logarithmically.
+		// An alpha of 200 ensures matches in the last week will show up front (11.59) and often so for matches within 2 weeks (5.99),
+		// but is much less of a factor at 30 days (2.84) or very little after 90 days (0.95), focusing mostly on content at that point.
+		if (!row.user_updated_time) {
+			return 0;
+		}
+
+		const alpha = 200;
+		const daysSinceLastUpdate = (msSinceEpoch - row.user_updated_time) / msPerDay;
+		return alpha * Math.log(1 + 1 / Math.max(daysSinceLastUpdate, 0.5));
+	};
+
+	let titleBM25WeightedByLastUpdate = new Array(notes.length).fill(-1);
 	if (avgTokens != 0) {
 		for (let i = 0; i < notes.length; i++) {
-			titleBM25[i] = IDF(notes.length, notesWithWord) * ((freqTitle[i] * (K1 + 1)) / (freqTitle[i] + K1 * (1 - B + B * (numTokens[i] / avgTokens))));
+			titleBM25WeightedByLastUpdate[i] = IDF(notes.length, notesWithWord) * ((freqTitle[i] * (K1 + 1)) / (freqTitle[i] + K1 * (1 - B + B * (numTokens[i] / avgTokens))));
+			titleBM25WeightedByLastUpdate[i] += weightForDaysSinceLastUpdate(notes[i]);
 		}
 	}

 	const scores = [];
 	for (let i = 0; i < notes.length; i++) {
-		if (freqTitle[i]) scores.push(titleBM25[i]);
+		if (freqTitle[i]) scores.push(titleBM25WeightedByLastUpdate[i]);
 	}

 	scores.sort().reverse();
@@ -142,33 +158,54 @@ describe('services_SearchEngine', function() {
 		expect(rows[1].id).toBe(n2.id);
 	}));

-	it('should correctly weigh notes using BM25', asyncTest(async () => {
+	it('should correctly weigh notes using BM25 and user_updated_time', asyncTest(async () => {
+		await mockDate(2020, 9, 30, 50);
 		const noteData = [
 			{
 				title: 'abc test2 test2',
+				updated_time: 1601425064756,
+				user_updated_time: 1601425064756,
+				created_time: 1601425064756,
+				user_created_time: 1601425064756,
 			},
 			{
 				title: 'foo foo',
+				updated_time: 1601425064758,
+				user_updated_time: 1601425064758,
+				created_time: 1601425064758,
+				user_created_time: 1601425064758,
 			},
 			{
 				title: 'dead beef',
+				updated_time: 1601425064760,
+				user_updated_time: 1601425064760,
+				created_time: 1601425064760,
+				user_created_time: 1601425064760,
 			},
 			{
 				title: 'test2 bar',
+				updated_time: 1601425064761,
+				user_updated_time: 1601425064761,
+				created_time: 1601425064761,
+				user_created_time: 1601425064761,
 			},
 			{
 				title: 'blah blah abc',
+				updated_time: 1601425064763,
+				user_updated_time: 1601425064763,
+				created_time: 1601425064763,
+				user_created_time: 1601425064763,
 			},
 		];

-		const n0 = await Note.save(noteData[0]);
-		const n1 = await Note.save(noteData[1]);
-		const n2 = await Note.save(noteData[2]);
-		const n3 = await Note.save(noteData[3]);
-		const n4 = await Note.save(noteData[4]);
+		const n0 = await Note.save(noteData[0], { autoTimestamp: false });
+		const n1 = await Note.save(noteData[1], { autoTimestamp: false });
+		const n2 = await Note.save(noteData[2], { autoTimestamp: false });
+		const n3 = await Note.save(noteData[3], { autoTimestamp: false });
+		const n4 = await Note.save(noteData[4], { autoTimestamp: false });
+		restoreDate();
 		await engine.syncTables();
+		await mockDate(2020, 9, 30, 50);

 		let searchString = 'abc';
 		let scores = calculateScore(searchString, noteData);
@@ -198,6 +235,7 @@ describe('services_SearchEngine', function() {
 		// console.log(scores);

 		expect(rows[0].weight).toEqual(scores[0]);
+		await restoreDate();
 	}));

 	it('should tell where the results are found', asyncTest(async () => {
@@ -634,6 +634,16 @@ function tempFilePath(ext) {
 	return `${Setting.value('tempDir')}/${md5(Date.now() + Math.random())}.${ext}`;
 }

+function mockDate(year, month, day, tick) {
+	const fixedDate = new Date(2020, 0, 1);
+	jasmine.clock().install();
+	jasmine.clock().mockDate(fixedDate);
+}
+
+function restoreDate() {
+	jasmine.clock().uninstall();
+}
+
 // Application for feature integration testing
 class TestApp extends BaseApplication {
 	constructor(hasGui = true) {
@@ -702,4 +712,4 @@ class TestApp extends BaseApplication {
 	}
 }

-module.exports = { synchronizerStart, syncTargetName, setSyncTargetName, syncDir, isNetworkSyncTarget, kvStore, expectThrow, logger, expectNotThrow, resourceService, resourceFetcher, tempFilePath, allSyncTargetItemsEncrypted, msleep, setupDatabase, revisionService, setupDatabaseAndSynchronizer, db, synchronizer, fileApi, sleep, clearDatabase, switchClient, syncTargetId, objectsEqual, checkThrowAsync, checkThrow, encryptionService, loadEncryptionMasterKey, fileContentEqual, decryptionWorker, asyncTest, currentClientId, id, ids, sortedIds, at, createNTestNotes, createNTestFolders, createNTestTags, TestApp };
+module.exports = { synchronizerStart, syncTargetName, setSyncTargetName, syncDir, isNetworkSyncTarget, kvStore, expectThrow, logger, expectNotThrow, resourceService, resourceFetcher, tempFilePath, allSyncTargetItemsEncrypted, msleep, setupDatabase, revisionService, setupDatabaseAndSynchronizer, db, synchronizer, fileApi, sleep, clearDatabase, switchClient, syncTargetId, objectsEqual, checkThrowAsync, checkThrow, encryptionService, loadEncryptionMasterKey, fileContentEqual, decryptionWorker, asyncTest, currentClientId, id, ids, sortedIds, at, createNTestNotes, createNTestFolders, createNTestTags, mockDate, restoreDate, TestApp };
@@ -100,6 +100,7 @@ The Web Clipper is a browser extension that allows you to save web pages and scr
 - [Note History spec](https://github.com/laurent22/joplin/blob/dev/readme/spec/history.md)
 - [Sync Lock spec](https://github.com/laurent22/joplin/blob/dev/readme/spec/sync_lock.md)
 - [Plugin Architecture spec](https://github.com/laurent22/joplin/blob/dev/readme/spec/plugins.md)
+- [Search Sorting spec](https://github.com/laurent22/joplin/blob/master/readme/spec/search_sorting.md)

 - Google Summer of Code 2020

@@ -345,7 +346,7 @@ You can also use search filters to further restrict the search.
 |**resource:**|Filter by attachment MIME type|`resource:image/jpeg` to return notes with a jpeg attachment.<br>`-resource:application/pdf` to return notes without a pdf attachment.<br>`resource:image/*` to return notes with any images.|
 |**sourceurl:**|Filter by source URL|`sourceurl:https://www.google.com`<br>`sourceurl:*joplinapp.org` to perform a wildcard search.|

 Note: In CliClient you have to escape the query using `--` when using negated filters.
 Eg. `:search -- "-tag:tag1"`.

 Notes are sorted by "relevance". Currently it means the notes that contain the requested terms the most times are on top. For queries with multiple terms, it also matters how close to each other the terms are. This is a bit experimental so if you notice a search query that returns unexpected results, please report it in the forum, providing as many details as possible to replicate the issue.
@@ -328,6 +328,21 @@ class SearchEngine {
 			return idf * (freq * (K1 + 1)) / (freq + K1 * (1 - B + B * (numTokens / avgTokens)));
 		};

+		const msSinceEpoch = Math.round(new Date().getTime());
+		const msPerDay = 86400000;
+		const weightForDaysSinceLastUpdate = (row) => {
+			// BM25 weights typically range 0-10, and last updated date should weight similarly, though prioritizing recency logarithmically.
+			// An alpha of 200 ensures matches in the last week will show up front (11.59) and often so for matches within 2 weeks (5.99),
+			// but is much less of a factor at 30 days (2.84) or very little after 90 days (0.95), focusing mostly on content at that point.
+			if (!row.user_updated_time) {
+				return 0;
+			}
+
+			const alpha = 200;
+			const daysSinceLastUpdate = (msSinceEpoch - row.user_updated_time) / msPerDay;
+			return alpha * Math.log(1 + 1 / Math.max(daysSinceLastUpdate, 0.5));
+		};
+
 		for (let i = 0; i < rows.length; i++) {
 			const row = rows[i];
 			row.weight = 0;
@@ -346,6 +361,8 @@ class SearchEngine {
 				});
 				row.wordFound.push(found);
 			}
+
+			row.weight += weightForDaysSinceLastUpdate(row);
 		}
 	}

readme/spec/search_sorting.md (new file, 37 lines)
@@ -0,0 +1,37 @@
# Search Engine

The Search Engine powers the Search input in the note list and the Goto Anything dialog.

## Search algorithm

### Discretely using only the most critical parameter in sorting

Sorting occurs as the Search Engine processes results, after searching for and weighting these results.

Parameters include fuzziness, title matching, weight (based on BM25 and age), the completed status of to-dos, and the note's age.

The Search Engine uses only the first relevant parameter to determine the order, rather than a weighted average.
In effect, this means search results with note title matches will appear above all results that only matched the note body,
regardless of weight or other parameters.
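To make the "first differing parameter wins" rule concrete, here is a minimal comparator sketch. It is illustrative only, not the actual SearchEngine implementation; the property names and the better/worse direction assumed for each criterion are made up for the example.

```js
// Illustrative sketch only; property names and orderings are assumptions.
const compareResults = (a, b) => {
	const criteria = [
		r => r.fuzziness,        // lower fuzziness (closer match) sorts first
		r => -r.titleMatches,    // more matches in the title sort first
		r => -r.weight,          // higher weight (BM25 + recency) sorts first
		r => r.todoCompleted,    // uncompleted to-dos sort first
		r => -r.userUpdatedTime, // newer notes sort first
	];

	for (const value of criteria) {
		const diff = value(a) - value(b);
		if (diff !== 0) return diff; // the first criterion that differs decides the order
	}
	return 0;
};

// results.sort(compareResults);
```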
### Determining weight as a sorting parameter

The Search Engine determines the weight parameter using both [BM25](https://en.wikipedia.org/wiki/Okapi_BM25)
and the number of days since the last user update.

#### BM25

The Search Engine determines BM25 based on "term frequency-inverse document frequency."
The "TF–IDF" value increases proportionally to the number of times a word appears in the document
and is offset by the number of documents in the corpus that contain the word, which helps to adjust
for the fact that some words appear more frequently in general.

BM25 returns weight zero for a search term that occurs in more than half the notes.
So terms that are abundant in all notes have zero relevance with respect to BM25.
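As a rough sketch of the per-term scoring described above (not the exact SearchEngine code; K1 and B are assumed to take the conventional BM25 defaults, and the clamped IDF below is one common form that yields the "more than half the notes" behaviour):

```js
// Minimal per-term BM25 sketch; K1, B and the exact IDF form are assumptions.
const K1 = 1.2;
const B = 0.75;

// IDF clamped at zero: a term present in more than half of the notes scores 0.
const idf = (totalNotes, notesWithTerm) =>
	Math.max(Math.log((totalNotes - notesWithTerm + 0.5) / (notesWithTerm + 0.5)), 0);

// freq: occurrences of the term in this note; numTokens / avgTokens: note length vs. average note length.
const bm25 = (freq, totalNotes, notesWithTerm, numTokens, avgTokens) => {
	if (!avgTokens || !freq) return 0;
	return idf(totalNotes, notesWithTerm) * (freq * (K1 + 1)) / (freq + K1 * (1 - B + B * (numTokens / avgTokens)));
};

console.log(bm25(3, 10, 6, 12, 10)); // 0: the term occurs in 6 of 10 notes, so it carries no weight
console.log(bm25(3, 10, 2, 12, 10) > 0); // true: a rarer term contributes a positive weight
```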
#### Days since last user update

Sorting increases the BM25 weight by a term based on the inverse of the number of days since the note was last updated by the user.
Recent notes will, therefore, be weighted highly in the search results.
This time-based weight decays logarithmically, becoming less of a factor than BM25 after months have passed.
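Transcribed from the weightForDaysSinceLastUpdate helper in the diff above, the recency term added to the BM25 weight is, with d the number of days since the last user update (floored at half a day):

```latex
w_{\text{recency}}(d) = \alpha \, \ln\!\left(1 + \frac{1}{\max(d,\ 0.5)}\right), \qquad \alpha = 200
```

The weight used for sorting is then `w_BM25 + w_recency(d)`; since `ln(1 + 1/d)` is approximately `1/d` once d is much larger than one day, the boost fades roughly in proportion to the inverse of the note's age.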