1
0
mirror of https://github.com/laurent22/joplin.git synced 2024-12-24 10:27:10 +02:00

Nearly finished search engine backend

This commit is contained in:
Laurent Cozic 2018-12-10 18:54:46 +00:00
parent cb16a10121
commit 460f826672
4 changed files with 206 additions and 50 deletions

View File

@ -6,6 +6,7 @@ const markdownUtils = require('lib/markdownUtils.js');
const SearchEngine = require('lib/services/SearchEngine');
const Folder = require('lib/models/Folder');
const Note = require('lib/models/Note');
const ItemChange = require('lib/models/ItemChange');
const Tag = require('lib/models/Tag');
const Resource = require('lib/models/Resource');
@ -22,19 +23,103 @@ describe('services_SearchEngine', function() {
// Per-spec setup: prepares the database/synchronizer for client 1, switches to it,
// then builds a fresh SearchEngine bound to the test database.
beforeEach(async (done) => {
await setupDatabaseAndSynchronizer(1);
await switchClient(1);
engine = new SearchEngine();
engine.setDb(db());
// Drop before create: createFtsTables() issues a plain CREATE VIRTUAL TABLE
// (no IF NOT EXISTS), whereas dropFtsTables() uses DROP TABLE IF EXISTS.
await engine.dropFtsTables();
await engine.createFtsTables();
// Completion callback for the asynchronous setup.
done();
});
// Spec: behaviour of search() / countRows() for notes saved after the FTS table was created.
// NOTE(review): the expectations below look inconsistent - five notes are saved and
// updateFtsTables() is never called, yet the spec expects zero search results and then
// a row count of 1. The result of the second search() is also never inspected.
// This reads like an unfinished spec; confirm the intended assertions.
it('should create the FTS table', async (done) => {
let rows;
await Note.save({ title: "abcd efgh" });
await Note.save({ title: "abcd aaaaa bbbb eeee efgh" });
await Note.save({ title: "abcd aaaaa efgh" });
await Note.save({ title: "blablablabla blabla bla abcd X efgh" });
await Note.save({ title: "occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh" });
// The notes above were saved after createFtsTables() ran in beforeEach.
rows = await engine.search('abcd efgh');
expect(rows.length).toBe(0);
rows = await engine.search('abcd efgh');
expect(await engine.countRows()).toBe(1);
done();
});
// Spec: each updateFtsTables() call picks up the notes saved since the previous call,
// and calling it again with no new notes leaves the row count unchanged.
it('should update the FTS table', async (done) => {
let rows;
// The table starts empty.
expect(await engine.countRows()).toBe(0);
await Note.save({ title: "abcd efgh" });
await engine.updateFtsTables();
expect(await engine.countRows()).toBe(1);
// A second note with the same title is still a separate row.
await Note.save({ title: "abcd efgh" });
await engine.updateFtsTables();
expect(await engine.countRows()).toBe(2);
// No new notes: the count must not change.
await engine.updateFtsTables();
expect(await engine.countRows()).toBe(2);
done();
});
// Spec: search() returns rows sorted by weight. The "Expected rank" comments give the
// position each note is asserted to have in the results for the query 'abcd efgh'.
it('should order search results by relevance', async (done) => {
// Expected rank: 1 (terms adjacent in the title and also present in the body)
const n1 = await Note.save({ title: "abcd efgh", body: "XX abcd XX efgh" });
// Expected rank: 4
const n2 = await Note.save({ title: "abcd aaaaa bbbb eeee efgh" });
// Expected rank: 3
const n3 = await Note.save({ title: "abcd aaaaa efgh" });
// Expected rank: 2
const n4 = await Note.save({ title: "blablablabla blabla bla abcd X efgh" });
// Expected rank: 5 (many occurrences, but spread far apart)
const n5 = await Note.save({ title: "occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh occurence many times but very abcd spread appart spread appart spread appart spread appart spread appart efgh" });
await engine.updateFtsTables();
const rows = await engine.search('abcd efgh');
// Results are expected from most to least relevant.
expect(rows[0].id).toBe(n1.id);
expect(rows[1].id).toBe(n4.id);
expect(rows[2].id).toBe(n3.id);
expect(rows[3].id).toBe(n2.id);
expect(rows[4].id).toBe(n5.id);
done();
});
// Spec: the query string is passed straight to the FTS MATCH operator, so plain terms,
// quoted phrases and column-prefixed terms are all expected to work.
it('should supports various query types', async (done) => {
let rows;
const n1 = await Note.save({ title: "abcd efgh ijkl", body: "aaaa bbbb" });
const n2 = await Note.save({ title: "iiii efgh bbbb", body: "aaaa bbbb" });
await engine.updateFtsTables();
// Two bare terms: only n1 contains both.
rows = await engine.search('abcd ijkl');
expect(rows.length).toBe(1);
// Quoted phrase: "abcd" and "ijkl" are not adjacent in any note.
rows = await engine.search('"abcd ijkl"');
expect(rows.length).toBe(0);
// Quoted phrase that appears verbatim in n1's title.
rows = await engine.search('"abcd efgh"');
expect(rows.length).toBe(1);
// Column filter on the title.
rows = await engine.search('title:abcd');
expect(rows.length).toBe(1);
rows = await engine.search('title:efgh');
expect(rows.length).toBe(2);
// Column filter on the body: "abcd" only appears in a title.
rows = await engine.search('body:abcd');
expect(rows.length).toBe(0);
rows = await engine.search('body:bbbb');
expect(rows.length).toBe(2);
// Column-filtered term combined with a bare term: only n2 has "iiii".
rows = await engine.search('body:bbbb iiii');
expect(rows.length).toBe(1);
done();
});

View File

@ -4,6 +4,7 @@ const { DatabaseDriverNode } = require('lib/database-driver-node.js');
const BaseModel = require('lib/BaseModel.js');
const Folder = require('lib/models/Folder.js');
const Note = require('lib/models/Note.js');
const ItemChange = require('lib/models/ItemChange.js');
const Resource = require('lib/models/Resource.js');
const Tag = require('lib/models/Tag.js');
const NoteTag = require('lib/models/NoteTag.js');
@ -122,6 +123,8 @@ async function switchClient(id) {
async function clearDatabase(id = null) {
if (id === null) id = currentClient_;
await ItemChange.waitForAllSaved();
let queries = [
'DELETE FROM notes',
'DELETE FROM folders',

View File

@ -159,6 +159,7 @@ class Setting extends BaseModel {
'api.token': { value: null, type: Setting.TYPE_STRING, public: false },
'resourceService.lastProcessedChangeId': { value: 0, type: Setting.TYPE_INT, public: false },
'searchEngine.lastProcessedChangeId': { value: 0, type: Setting.TYPE_INT, public: false },
};
return this.metadata_;

View File

@ -1,4 +1,8 @@
const { Logger } = require('lib/logger.js');
const ItemChange = require('lib/models/ItemChange.js');
const Setting = require('lib/models/Setting.js');
const Note = require('lib/models/Note.js');
const BaseModel = require('lib/BaseModel.js');
class SearchEngine {
@ -7,65 +11,63 @@ class SearchEngine {
this.logger_ = new Logger();
this.db_ = null;
}
async createFtsTables() {
await this.db().exec('CREATE VIRTUAL TABLE notes_fts USING fts4(content="notes", notindexed="id", id, title, body)');
await this.db().exec('INSERT INTO notes_fts(docid, id, title, body) SELECT rowid, id, title, body FROM notes WHERE is_conflict = 0 AND encryption_applied = 0');
}
async dropFtsTables() {
await this.db().exec('DROP TABLE IF EXISTS notes_fts');
}
async updateFtsTables() {
// CREATE VIRTUAL TABLE notes_fts USING fts4(content="notes", title, body);
// INSERT INTO notes_fts(docid, title, body) SELECT rowid, title, body FROM notes;
// SELECT title, offsets(notes_fts) length(offsets(notes_fts)) - length(replace(offsets(notes_fts), ' ', '')) + 1
// FROM notes_fts
// WHERE notes_fts
// MATCH 'test';
await this.db().exec('CREATE VIRTUAL TABLE notes_fts USING fts4(content="notes", title, body)');
await this.db().exec('INSERT INTO notes_fts(docid, title, body) SELECT rowid, title, body FROM notes;');
// await this.db().exec('DELETE FROM notes_fts');
// await this.db().exec('INSERT INTO notes_fts(docid, id, title, body) SELECT rowid, id, title, body FROM notes WHERE is_conflict = 0 AND encryption_applied = 0');
// return;
const sql = `SELECT docid, title, offsets(notes_fts) as offsets FROM notes_fts WHERE notes_fts MATCH "abcd efgh" `;
await ItemChange.waitForAllSaved();
const rows = await this.db().selectAll(sql);
const startTime = Date.now();
const calculateWeight = (offsets) => {
// Offset doc: https://www.sqlite.org/fts3.html#offsets
const occurenceCount = Math.floor(offsets.length / 4);
let lastChangeId = Setting.value('searchEngine.lastProcessedChangeId');
let spread = 0;
let previousDist = null;
for (let i = 0; i < occurenceCount; i++) {
const dist = offsets[i * 4 + 2];
while (true) {
const changes = await ItemChange.modelSelectAll(`
SELECT id, item_id, type
FROM item_changes
WHERE item_type = ?
AND id > ?
ORDER BY id ASC
LIMIT 100
`, [BaseModel.TYPE_NOTE, lastChangeId]);
if (previousDist !== null) {
const delta = dist - previousDist;
spread += delta;
if (!changes.length) break;
const queries = [];
for (let i = 0; i < changes.length; i++) {
const change = changes[i];
if (change.type === ItemChange.TYPE_CREATE || change.type === ItemChange.TYPE_UPDATE) {
queries.push({ sql: 'DELETE FROM notes_fts WHERE id = ?', params: [change.item_id] });
queries.push({ sql: 'INSERT INTO notes_fts(docid, id, title, body) SELECT rowid, id, title, body FROM notes WHERE id = ?', params: [change.item_id] });
} else if (change.type === ItemChange.TYPE_DELETE) {
queries.push({ sql: 'DELETE FROM notes_fts WHERE id = ?', params: [change.item_id] });
} else {
throw new Error('Invalid change type: ' + change.type);
}
previousDist = dist;
lastChangeId = change.id;
}
// Divide the number of occurrences by the spread so that even if a note contains the searched terms many times,
// but these terms are very spread apart, it will be given a lower weight than a note that has the
// terms once or twice but right next to each other.
return occurenceCount / spread;
await this.db().transactionExecBatch(queries);
Setting.setValue('searchEngine.lastProcessedChangeId', lastChangeId);
await Setting.saveAll();
}
const orderResults = (rows) => {
for (let i = 0; i < rows.length; i++) {
const row = rows[i];
row.weight = calculateWeight(row.offsets.split(' ').map(o => Number(o)));
}
rows.sort((a, b) => {
if (a.weight < b.weight) return +1;
if (a.weight > b.weight) return -1;
return 0;
});
}
orderResults(rows);
console.info(rows);
// console.info(rows);
this.logger().info('SearchEngine: Updated FTS table in ' + (Date.now() - startTime) + 'ms');
}
static instance() {
@ -90,6 +92,71 @@ class SearchEngine {
return this.db_;
}
async countRows() {
const sql = 'SELECT count(*) as total FROM notes_fts'
const row = await this.db().selectOne(sql);
return row && row['total'] ? row['total'] : 0;
}
columnIndexesFromOffsets_(offsets) {
const occurenceCount = Math.floor(offsets.length / 4);
const indexes = [];
for (let i = 0; i < occurenceCount; i++) {
const colIndex = offsets[i * 4] - 1;
if (indexes.indexOf(colIndex) < 0) indexes.push(colIndex);
}
return indexes;
}
calculateWeight_(offsets) {
// Offset doc: https://www.sqlite.org/fts3.html#offsets
const occurenceCount = Math.floor(offsets.length / 4);
let spread = 0;
let previousDist = null;
for (let i = 0; i < occurenceCount; i++) {
const dist = offsets[i * 4 + 2];
if (previousDist !== null) {
const delta = dist - previousDist;
spread += delta;
}
previousDist = dist;
}
// Divide the number of occurences by the spread so even if a note has many times the searched terms
// but these terms are very spread appart, they'll be given a lower weight than a note that has the
// terms once or twice but just next to each others.
return occurenceCount / spread;
}
orderResults_(rows) {
for (let i = 0; i < rows.length; i++) {
const row = rows[i];
const offsets = row.offsets.split(' ').map(o => Number(o));
row.weight = this.calculateWeight_(offsets);
// row.colIndexes = this.columnIndexesFromOffsets_(offsets);
// row.offsets = offsets;
}
rows.sort((a, b) => {
if (a.weight < b.weight) return +1;
if (a.weight > b.weight) return -1;
return 0;
});
}
async search(query) {
const sql = 'SELECT id, offsets(notes_fts) AS offsets FROM notes_fts WHERE notes_fts MATCH ?'
const rows = await this.db().selectAll(sql, [query]);
this.orderResults_(rows);
return rows;
}
}
module.exports = SearchEngine;