1
0
mirror of https://github.com/laurent22/joplin.git synced 2025-01-11 18:24:43 +02:00

Nearly finished search engine backend

This commit is contained in:
Laurent Cozic 2018-12-10 18:54:46 +00:00
parent cb16a10121
commit 460f826672
4 changed files with 206 additions and 50 deletions

View File

@ -6,6 +6,7 @@ const markdownUtils = require('lib/markdownUtils.js');
const SearchEngine = require('lib/services/SearchEngine'); const SearchEngine = require('lib/services/SearchEngine');
const Folder = require('lib/models/Folder'); const Folder = require('lib/models/Folder');
const Note = require('lib/models/Note'); const Note = require('lib/models/Note');
const ItemChange = require('lib/models/ItemChange');
const Tag = require('lib/models/Tag'); const Tag = require('lib/models/Tag');
const Resource = require('lib/models/Resource'); const Resource = require('lib/models/Resource');
@ -22,19 +23,103 @@ describe('services_SearchEngine', function() {
// Runs before every spec: resets the test database, then gives the spec a
// fresh SearchEngine whose FTS table has just been dropped and re-created.
beforeEach(async (done) => {
	await setupDatabaseAndSynchronizer(1);
	await switchClient(1);

	engine = new SearchEngine();
	engine.setDb(db());

	// Rebuild the index from scratch so rows indexed by a previous spec
	// cannot leak into this one.
	await engine.dropFtsTables();
	await engine.createFtsTables();

	done();
});
it('should create the FTS table', async (done) => { it('should create the FTS table', async (done) => {
let rows;
await Note.save({ title: "abcd efgh" }); await Note.save({ title: "abcd efgh" });
await Note.save({ title: "abcd aaaaa bbbb eeee efgh" }); rows = await engine.search('abcd efgh');
await Note.save({ title: "abcd aaaaa efgh" }); expect(rows.length).toBe(0);
await Note.save({ title: "blablablabla blabla bla abcd X efgh" });
await Note.save({ title: "occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh" }); rows = await engine.search('abcd efgh');
expect(await engine.countRows()).toBe(1);
done();
});
// Checks that updateFtsTables() picks up newly saved notes, and that calling
// it again without new changes leaves the row count untouched.
it('should update the FTS table', async (done) => {
	expect(await engine.countRows()).toBe(0);

	// Each saved note followed by an update must add exactly one row.
	for (const expectedTotal of [1, 2]) {
		await Note.save({ title: "abcd efgh" });
		await engine.updateFtsTables();
		expect(await engine.countRows()).toBe(expectedTotal);
	}

	// No pending changes: a further update must not alter the count.
	await engine.updateFtsTables();
	expect(await engine.countRows()).toBe(2);

	done();
});
// Saves five notes that all match "abcd efgh" with the terms increasingly
// far apart, then checks that search() returns them closest-match first.
it('should order search results by relevance', async (done) => {
	// The trailing comment on each note is its expected position in the results.
	const n1 = await Note.save({ title: "abcd efgh", body: "XX abcd XX efgh" }); // 1st
	const n2 = await Note.save({ title: "abcd aaaaa bbbb eeee efgh" }); // 4th
	const n3 = await Note.save({ title: "abcd aaaaa efgh" }); // 3rd
	const n4 = await Note.save({ title: "blablablabla blabla bla abcd X efgh" }); // 2nd
	const n5 = await Note.save({ title: "occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh" }); // 5th

	await engine.updateFtsTables();
	const rows = await engine.search('abcd efgh');

	const expectedOrder = [n1, n4, n3, n2, n5];
	expectedOrder.forEach((note, position) => {
		expect(rows[position].id).toBe(note.id);
	});

	done();
});
// Runs a series of FTS queries (plain terms, quoted phrases, column filters)
// against two indexed notes and checks how many rows each one returns.
it('should supports various query types', async (done) => {
	await Note.save({ title: "abcd efgh ijkl", body: "aaaa bbbb" });
	await Note.save({ title: "iiii efgh bbbb", body: "aaaa bbbb" });

	await engine.updateFtsTables();

	// [query, expected number of matching notes]
	const expectations = [
		['abcd ijkl', 1],
		['"abcd ijkl"', 0],
		['"abcd efgh"', 1],
		['title:abcd', 1],
		['title:efgh', 2],
		['body:abcd', 0],
		['body:bbbb', 2],
		['body:bbbb iiii', 1],
	];

	for (const [query, expectedCount] of expectations) {
		const rows = await engine.search(query);
		expect(rows.length).toBe(expectedCount);
	}

	done();
});

View File

@ -4,6 +4,7 @@ const { DatabaseDriverNode } = require('lib/database-driver-node.js');
const BaseModel = require('lib/BaseModel.js'); const BaseModel = require('lib/BaseModel.js');
const Folder = require('lib/models/Folder.js'); const Folder = require('lib/models/Folder.js');
const Note = require('lib/models/Note.js'); const Note = require('lib/models/Note.js');
const ItemChange = require('lib/models/ItemChange.js');
const Resource = require('lib/models/Resource.js'); const Resource = require('lib/models/Resource.js');
const Tag = require('lib/models/Tag.js'); const Tag = require('lib/models/Tag.js');
const NoteTag = require('lib/models/NoteTag.js'); const NoteTag = require('lib/models/NoteTag.js');
@ -122,6 +123,8 @@ async function switchClient(id) {
async function clearDatabase(id = null) { async function clearDatabase(id = null) {
if (id === null) id = currentClient_; if (id === null) id = currentClient_;
await ItemChange.waitForAllSaved();
let queries = [ let queries = [
'DELETE FROM notes', 'DELETE FROM notes',
'DELETE FROM folders', 'DELETE FROM folders',

View File

@ -159,6 +159,7 @@ class Setting extends BaseModel {
'api.token': { value: null, type: Setting.TYPE_STRING, public: false }, 'api.token': { value: null, type: Setting.TYPE_STRING, public: false },
'resourceService.lastProcessedChangeId': { value: 0, type: Setting.TYPE_INT, public: false }, 'resourceService.lastProcessedChangeId': { value: 0, type: Setting.TYPE_INT, public: false },
'searchEngine.lastProcessedChangeId': { value: 0, type: Setting.TYPE_INT, public: false },
}; };
return this.metadata_; return this.metadata_;

View File

@ -1,4 +1,8 @@
const { Logger } = require('lib/logger.js'); const { Logger } = require('lib/logger.js');
const ItemChange = require('lib/models/ItemChange.js');
const Setting = require('lib/models/Setting.js');
const Note = require('lib/models/Note.js');
const BaseModel = require('lib/BaseModel.js');
class SearchEngine { class SearchEngine {
@ -7,65 +11,63 @@ class SearchEngine {
this.logger_ = new Logger(); this.logger_ = new Logger();
this.db_ = null; this.db_ = null;
} }
async createFtsTables() {
await this.db().exec('CREATE VIRTUAL TABLE notes_fts USING fts4(content="notes", notindexed="id", id, title, body)');
await this.db().exec('INSERT INTO notes_fts(docid, id, title, body) SELECT rowid, id, title, body FROM notes WHERE is_conflict = 0 AND encryption_applied = 0');
}
async dropFtsTables() {
await this.db().exec('DROP TABLE IF EXISTS notes_fts');
}
async updateFtsTables() { async updateFtsTables() {
// CREATE VIRTUAL TABLE notes_fts USING fts4(content="notes", title, body);
// INSERT INTO notes_fts(docid, title, body) SELECT rowid, title, body FROM notes;
// SELECT title, offsets(notes_fts) length(offsets(notes_fts)) - length(replace(offsets(notes_fts), ' ', '')) + 1
// FROM notes_fts
// WHERE notes_fts
// MATCH 'test';
await this.db().exec('CREATE VIRTUAL TABLE notes_fts USING fts4(content="notes", title, body)'); // await this.db().exec('DELETE FROM notes_fts');
await this.db().exec('INSERT INTO notes_fts(docid, title, body) SELECT rowid, title, body FROM notes;'); // await this.db().exec('INSERT INTO notes_fts(docid, id, title, body) SELECT rowid, id, title, body FROM notes WHERE is_conflict = 0 AND encryption_applied = 0');
// return;
await ItemChange.waitForAllSaved();
const sql = `SELECT docid, title, offsets(notes_fts) as offsets FROM notes_fts WHERE notes_fts MATCH "abcd efgh" `;
const rows = await this.db().selectAll(sql); const startTime = Date.now();
const calculateWeight = (offsets) => { let lastChangeId = Setting.value('searchEngine.lastProcessedChangeId');
// Offset doc: https://www.sqlite.org/fts3.html#offsets
const occurenceCount = Math.floor(offsets.length / 4);
let spread = 0; while (true) {
let previousDist = null; const changes = await ItemChange.modelSelectAll(`
for (let i = 0; i < occurenceCount; i++) { SELECT id, item_id, type
const dist = offsets[i * 4 + 2]; FROM item_changes
WHERE item_type = ?
AND id > ?
ORDER BY id ASC
LIMIT 100
`, [BaseModel.TYPE_NOTE, lastChangeId]);
if (previousDist !== null) { if (!changes.length) break;
const delta = dist - previousDist;
spread += delta; const queries = [];
for (let i = 0; i < changes.length; i++) {
const change = changes[i];
if (change.type === ItemChange.TYPE_CREATE || change.type === ItemChange.TYPE_UPDATE) {
queries.push({ sql: 'DELETE FROM notes_fts WHERE id = ?', params: [change.item_id] });
queries.push({ sql: 'INSERT INTO notes_fts(docid, id, title, body) SELECT rowid, id, title, body FROM notes WHERE id = ?', params: [change.item_id] });
} else if (change.type === ItemChange.TYPE_DELETE) {
queries.push({ sql: 'DELETE FROM notes_fts WHERE id = ?', params: [change.item_id] });
} else {
throw new Error('Invalid change type: ' + change.type);
} }
previousDist = dist; lastChangeId = change.id;
} }
// Divide the number of occureances by the spread so even if a note has many times the searched terms await this.db().transactionExecBatch(queries);
// but these terms are very spread appart, they'll be given a lower weight than a note that has the Setting.setValue('searchEngine.lastProcessedChangeId', lastChangeId);
// terms once or twice but just next to each others. await Setting.saveAll();
return occurenceCount / spread;
} }
const orderResults = (rows) => { this.logger().info('SearchEngine: Updated FTS table in ' + (Date.now() - startTime) + 'ms');
for (let i = 0; i < rows.length; i++) {
const row = rows[i];
row.weight = calculateWeight(row.offsets.split(' ').map(o => Number(o)));
}
rows.sort((a, b) => {
if (a.weight < b.weight) return +1;
if (a.weight > b.weight) return -1;
return 0;
});
}
orderResults(rows);
console.info(rows);
// console.info(rows);
} }
static instance() { static instance() {
@ -90,6 +92,71 @@ class SearchEngine {
return this.db_; return this.db_;
} }
async countRows() {
const sql = 'SELECT count(*) as total FROM notes_fts'
const row = await this.db().selectOne(sql);
return row && row['total'] ? row['total'] : 0;
}
columnIndexesFromOffsets_(offsets) {
const occurenceCount = Math.floor(offsets.length / 4);
const indexes = [];
for (let i = 0; i < occurenceCount; i++) {
const colIndex = offsets[i * 4] - 1;
if (indexes.indexOf(colIndex) < 0) indexes.push(colIndex);
}
return indexes;
}
calculateWeight_(offsets) {
// Offset doc: https://www.sqlite.org/fts3.html#offsets
const occurenceCount = Math.floor(offsets.length / 4);
let spread = 0;
let previousDist = null;
for (let i = 0; i < occurenceCount; i++) {
const dist = offsets[i * 4 + 2];
if (previousDist !== null) {
const delta = dist - previousDist;
spread += delta;
}
previousDist = dist;
}
// Divide the number of occurences by the spread so even if a note has many times the searched terms
// but these terms are very spread appart, they'll be given a lower weight than a note that has the
// terms once or twice but just next to each others.
return occurenceCount / spread;
}
orderResults_(rows) {
for (let i = 0; i < rows.length; i++) {
const row = rows[i];
const offsets = row.offsets.split(' ').map(o => Number(o));
row.weight = this.calculateWeight_(offsets);
// row.colIndexes = this.columnIndexesFromOffsets_(offsets);
// row.offsets = offsets;
}
rows.sort((a, b) => {
if (a.weight < b.weight) return +1;
if (a.weight > b.weight) return -1;
return 0;
});
}
async search(query) {
const sql = 'SELECT id, offsets(notes_fts) AS offsets FROM notes_fts WHERE notes_fts MATCH ?'
const rows = await this.db().selectAll(sql, [query]);
this.orderResults_(rows);
return rows;
}
} }
module.exports = SearchEngine; module.exports = SearchEngine;