1
0
mirror of https://github.com/laurent22/joplin.git synced 2025-11-06 09:19:22 +02:00

Nearly finished search engine backend

This commit is contained in:
Laurent Cozic
2018-12-10 18:54:46 +00:00
parent cb16a10121
commit 460f826672
4 changed files with 206 additions and 50 deletions

View File

@@ -1,4 +1,8 @@
const { Logger } = require('lib/logger.js');
const ItemChange = require('lib/models/ItemChange.js');
const Setting = require('lib/models/Setting.js');
const Note = require('lib/models/Note.js');
const BaseModel = require('lib/BaseModel.js');
class SearchEngine {
@@ -7,65 +11,63 @@ class SearchEngine {
this.logger_ = new Logger();
this.db_ = null;
}
async createFtsTables() {
await this.db().exec('CREATE VIRTUAL TABLE notes_fts USING fts4(content="notes", notindexed="id", id, title, body)');
await this.db().exec('INSERT INTO notes_fts(docid, id, title, body) SELECT rowid, id, title, body FROM notes WHERE is_conflict = 0 AND encryption_applied = 0');
}
async dropFtsTables() {
await this.db().exec('DROP TABLE IF EXISTS notes_fts');
}
async updateFtsTables() {
// CREATE VIRTUAL TABLE notes_fts USING fts4(content="notes", title, body);
// INSERT INTO notes_fts(docid, title, body) SELECT rowid, title, body FROM notes;
// SELECT title, offsets(notes_fts) length(offsets(notes_fts)) - length(replace(offsets(notes_fts), ' ', '')) + 1
// FROM notes_fts
// WHERE notes_fts
// MATCH 'test';
await this.db().exec('CREATE VIRTUAL TABLE notes_fts USING fts4(content="notes", title, body)');
await this.db().exec('INSERT INTO notes_fts(docid, title, body) SELECT rowid, title, body FROM notes;');
// await this.db().exec('DELETE FROM notes_fts');
// await this.db().exec('INSERT INTO notes_fts(docid, id, title, body) SELECT rowid, id, title, body FROM notes WHERE is_conflict = 0 AND encryption_applied = 0');
// return;
const sql = `SELECT docid, title, offsets(notes_fts) as offsets FROM notes_fts WHERE notes_fts MATCH "abcd efgh" `;
await ItemChange.waitForAllSaved();
const rows = await this.db().selectAll(sql);
const startTime = Date.now();
const calculateWeight = (offsets) => {
// Offset doc: https://www.sqlite.org/fts3.html#offsets
const occurenceCount = Math.floor(offsets.length / 4);
let lastChangeId = Setting.value('searchEngine.lastProcessedChangeId');
let spread = 0;
let previousDist = null;
for (let i = 0; i < occurenceCount; i++) {
const dist = offsets[i * 4 + 2];
while (true) {
const changes = await ItemChange.modelSelectAll(`
SELECT id, item_id, type
FROM item_changes
WHERE item_type = ?
AND id > ?
ORDER BY id ASC
LIMIT 100
`, [BaseModel.TYPE_NOTE, lastChangeId]);
if (previousDist !== null) {
const delta = dist - previousDist;
spread += delta;
if (!changes.length) break;
const queries = [];
for (let i = 0; i < changes.length; i++) {
const change = changes[i];
if (change.type === ItemChange.TYPE_CREATE || change.type === ItemChange.TYPE_UPDATE) {
queries.push({ sql: 'DELETE FROM notes_fts WHERE id = ?', params: [change.item_id] });
queries.push({ sql: 'INSERT INTO notes_fts(docid, id, title, body) SELECT rowid, id, title, body FROM notes WHERE id = ?', params: [change.item_id] });
} else if (change.type === ItemChange.TYPE_DELETE) {
queries.push({ sql: 'DELETE FROM notes_fts WHERE id = ?', params: [change.item_id] });
} else {
throw new Error('Invalid change type: ' + change.type);
}
previousDist = dist;
lastChangeId = change.id;
}
// Divide the number of occureances by the spread so even if a note has many times the searched terms
// but these terms are very spread appart, they'll be given a lower weight than a note that has the
// terms once or twice but just next to each others.
return occurenceCount / spread;
await this.db().transactionExecBatch(queries);
Setting.setValue('searchEngine.lastProcessedChangeId', lastChangeId);
await Setting.saveAll();
}
const orderResults = (rows) => {
for (let i = 0; i < rows.length; i++) {
const row = rows[i];
row.weight = calculateWeight(row.offsets.split(' ').map(o => Number(o)));
}
rows.sort((a, b) => {
if (a.weight < b.weight) return +1;
if (a.weight > b.weight) return -1;
return 0;
});
}
orderResults(rows);
console.info(rows);
// console.info(rows);
this.logger().info('SearchEngine: Updated FTS table in ' + (Date.now() - startTime) + 'ms');
}
static instance() {
@@ -90,6 +92,71 @@ class SearchEngine {
return this.db_;
}
async countRows() {
const sql = 'SELECT count(*) as total FROM notes_fts'
const row = await this.db().selectOne(sql);
return row && row['total'] ? row['total'] : 0;
}
columnIndexesFromOffsets_(offsets) {
const occurenceCount = Math.floor(offsets.length / 4);
const indexes = [];
for (let i = 0; i < occurenceCount; i++) {
const colIndex = offsets[i * 4] - 1;
if (indexes.indexOf(colIndex) < 0) indexes.push(colIndex);
}
return indexes;
}
calculateWeight_(offsets) {
// Offset doc: https://www.sqlite.org/fts3.html#offsets
const occurenceCount = Math.floor(offsets.length / 4);
let spread = 0;
let previousDist = null;
for (let i = 0; i < occurenceCount; i++) {
const dist = offsets[i * 4 + 2];
if (previousDist !== null) {
const delta = dist - previousDist;
spread += delta;
}
previousDist = dist;
}
// Divide the number of occurences by the spread so even if a note has many times the searched terms
// but these terms are very spread appart, they'll be given a lower weight than a note that has the
// terms once or twice but just next to each others.
return occurenceCount / spread;
}
orderResults_(rows) {
for (let i = 0; i < rows.length; i++) {
const row = rows[i];
const offsets = row.offsets.split(' ').map(o => Number(o));
row.weight = this.calculateWeight_(offsets);
// row.colIndexes = this.columnIndexesFromOffsets_(offsets);
// row.offsets = offsets;
}
rows.sort((a, b) => {
if (a.weight < b.weight) return +1;
if (a.weight > b.weight) return -1;
return 0;
});
}
async search(query) {
const sql = 'SELECT id, offsets(notes_fts) AS offsets FROM notes_fts WHERE notes_fts MATCH ?'
const rows = await this.db().selectAll(sql, [query]);
this.orderResults_(rows);
return rows;
}
}
module.exports = SearchEngine;