2018-12-09 22:45:50 +02:00
const { Logger } = require ( 'lib/logger.js' ) ;
2018-12-10 20:58:49 +02:00
const { shim } = require ( 'lib/shim.js' ) ;
2018-12-10 20:54:46 +02:00
const ItemChange = require ( 'lib/models/ItemChange.js' ) ;
const Setting = require ( 'lib/models/Setting.js' ) ;
const Note = require ( 'lib/models/Note.js' ) ;
const BaseModel = require ( 'lib/BaseModel.js' ) ;
2018-12-16 19:32:42 +02:00
const { pregQuote } = require ( 'lib/string-utils.js' ) ;
2018-12-09 22:45:50 +02:00
/**
 * Search service for notes.
 *
 * Responsibilities visible in this class:
 * - `syncTables()` replays note changes recorded in `item_changes` into the
 *   `notes_normalized` table.
 * - `parseQuery()` splits a user query into per-column terms.
 * - `search()` runs a MATCH query against `notes_fts` and orders the rows by
 *   a weight computed from the FTS `offsets()` output.
 *
 * A shared instance is available through `SearchEngine.instance()`.
 */
class SearchEngine {

	constructor() {
		// No-op by default; can be replaced by the owner of the instance.
		this.dispatch = (action) => {};
		this.logger_ = new Logger();
		this.db_ = null;
	}

	/**
	 * Returns the shared instance, creating it on first use.
	 *
	 * @returns {SearchEngine}
	 */
	static instance() {
		if (this.instance_) return this.instance_;
		this.instance_ = new SearchEngine();
		return this.instance_;
	}

	/** @param {Object} logger - Logger used for progress messages. */
	setLogger(logger) {
		this.logger_ = logger;
	}

	/** @returns {Object} The current logger. */
	logger() {
		return this.logger_;
	}

	/** @param {Object} db - Database wrapper used by all queries of this class. */
	setDb(db) {
		this.db_ = db;
	}

	/** @returns {Object|null} The database wrapper, or null if none was set. */
	db() {
		return this.db_;
	}

	/**
	 * Brings `notes_normalized` up to date by replaying, in batches of 100,
	 * every note change recorded after the last processed change ID.
	 *
	 * The last processed ID is persisted after each batch (setting
	 * `searchEngine.lastProcessedChangeId`) so that an interrupted run resumes
	 * where it stopped.
	 *
	 * @returns {Promise<void>}
	 * @throws {Error} If a change record has an unknown type.
	 */
	async syncTables() {
		this.logger().info('SearchEngine: Updating FTS table...');

		await ItemChange.waitForAllSaved();

		const startTime = Date.now();

		let lastChangeId = Setting.value('searchEngine.lastProcessedChangeId');

		// TODO: if lastChangeId is undefined - index the whole notes table

		while (true) {
			const changes = await ItemChange.modelSelectAll(`
				SELECT id, item_id, type
				FROM item_changes
				WHERE item_type = ?
				AND id > ?
				ORDER BY id ASC
				LIMIT 100
			`, [BaseModel.TYPE_NOTE, lastChangeId]);

			if (!changes.length) break;

			const queries = [];

			for (const change of changes) {
				const isUpsert = change.type === ItemChange.TYPE_CREATE || change.type === ItemChange.TYPE_UPDATE;

				if (!isUpsert && change.type !== ItemChange.TYPE_DELETE) {
					throw new Error('Invalid change type: ' + change.type);
				}

				// Every change type starts by removing the existing row; created or
				// updated notes are then re-inserted from the `notes` table, unless
				// they are conflicts or still encrypted.
				queries.push({ sql: 'DELETE FROM notes_normalized WHERE id = ?', params: [change.item_id] });

				if (isUpsert) {
					queries.push({ sql: 'INSERT INTO notes_normalized(id, title, body) SELECT id, title, body FROM notes WHERE id = ? AND is_conflict = 0 AND encryption_applied = 0', params: [change.item_id] });
				}

				lastChangeId = change.id;
			}

			await this.db().transactionExecBatch(queries);
			Setting.setValue('searchEngine.lastProcessedChangeId', lastChangeId);
			await Setting.saveAll();
		}

		this.logger().info('SearchEngine: Updated FTS table in ' + (Date.now() - startTime) + 'ms');
	}

	/**
	 * Counts the rows of the `notes_fts` table.
	 *
	 * @returns {Promise<number>} The row count, or 0 if it cannot be read.
	 */
	async countRows() {
		const sql = 'SELECT count(*) as total FROM notes_fts';
		const row = await this.db().selectOne(sql);
		return row && row['total'] ? row['total'] : 0;
	}

	/**
	 * Extracts the distinct column indexes from a flat FTS offsets array.
	 *
	 * Offsets come in groups of four numbers; the first number of each group
	 * is the column number. One is subtracted from it before it is collected.
	 *
	 * @param {number[]} offsets - Flat offsets array (groups of four numbers).
	 * @returns {number[]} Distinct column indexes, in order of first appearance.
	 */
	columnIndexesFromOffsets_(offsets) {
		const occurenceCount = Math.floor(offsets.length / 4);
		const indexes = [];

		for (let i = 0; i < occurenceCount; i++) {
			const colIndex = offsets[i * 4] - 1;
			if (indexes.indexOf(colIndex) < 0) indexes.push(colIndex);
		}

		return indexes;
	}

	/**
	 * Computes the ranking weight of one result row.
	 *
	 * Offset doc: https://www.sqlite.org/fts3.html#offsets
	 *
	 * - If there's only one term in the query string, the content with the most
	 *   matches goes on top.
	 * - If there are multiple terms, the result with the most occurrences that
	 *   are closest to each other goes on top. E.g. if the query is "abcd efgh",
	 *   "abcd efgh" will go before "abcd XX efgh".
	 *
	 * @param {number[]} offsets - Flat offsets array (groups of four numbers).
	 * @param {number} termCount - Number of terms in the parsed query.
	 * @returns {number} A finite weight; higher means more relevant.
	 */
	calculateWeight_(offsets, termCount) {
		const occurenceCount = Math.floor(offsets.length / 4);

		if (termCount === 1) return occurenceCount;

		// Without any complete offset group the division below would be 0 / 0,
		// and a NaN weight makes the sort comparator inconsistent.
		if (!occurenceCount) return 0;

		let spread = 0;
		let previousDist = null;

		for (let i = 0; i < occurenceCount; i++) {
			// Third number of each group: offset of the match within the column.
			const dist = offsets[i * 4 + 2];

			if (previousDist !== null) {
				spread += dist - previousDist;
			}

			previousDist = dist;
		}

		// A zero (or non-numeric) spread would produce Infinity or NaN; in that
		// case fall back to the plain occurrence count.
		if (!spread) return occurenceCount;

		// Divide the number of occurrences by the spread so that even if a note
		// has the searched terms many times, but very spread apart, it is given a
		// lower weight than a note that has the terms once or twice but right
		// next to each other.
		return occurenceCount / spread;
	}

	/**
	 * Assigns a `weight` to every row and sorts the rows in place, highest
	 * weight first.
	 *
	 * @param {Object[]} rows - Result rows; each must have an `offsets` string
	 *   of space-separated numbers.
	 * @param {Object} parsedQuery - Result of `parseQuery()`; only `termCount`
	 *   is used.
	 */
	orderResults_(rows, parsedQuery) {
		for (const row of rows) {
			const offsets = row.offsets.split(' ').map(o => Number(o));
			row.weight = this.calculateWeight_(offsets, parsedQuery.termCount);
		}

		rows.sort((a, b) => {
			if (a.weight < b.weight) return +1;
			if (a.weight > b.weight) return -1;
			return 0;
		});
	}

	/**
	 * Converts a query term containing wildcards to a regex source string.
	 *
	 * Leading `*` characters are dropped, the rest is regex-escaped, and a
	 * trailing `*` becomes a lazy "match anything" (`.*?`).
	 *
	 * https://stackoverflow.com/a/13818704/561309
	 *
	 * @param {string} term - Query term.
	 * @returns {string} Regex source string.
	 */
	queryTermToRegex(term) {
		term = term.replace(/^\*+/, '');

		let regexString = pregQuote(term);

		if (regexString[regexString.length - 1] === '*') {
			// The escaped trailing wildcard is two characters long ("\*"). It is
			// replaced by "match anything" rather than by a "non-separator
			// characters only" class.
			regexString = regexString.slice(0, -2) + '.*?';
		}

		return regexString;
	}

	/**
	 * Parses a query string into terms grouped by column.
	 *
	 * Terms are separated by spaces; double quotes group several words into one
	 * term; a `name:` prefix assigns the following term to column `name`. Terms
	 * without a prefix go to the `_` column. Terms containing `*` are converted
	 * to `{ type: 'regex', value }` objects, and a lone `*` is discarded.
	 *
	 * @param {string} query - Raw query string.
	 * @returns {{termCount: number, keys: string[], terms: Object}} The number
	 *   of terms, the columns that have at least one term list, and the terms
	 *   per column.
	 */
	parseQuery(query) {
		// Column names come from user input, so never call methods such as
		// `hasOwnProperty` through `terms` itself: a column could shadow them.
		const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

		const terms = { _: [] };

		let inQuote = false;
		let currentCol = '_';
		let currentTerm = '';

		for (let i = 0; i < query.length; i++) {
			const c = query[i];

			if (c === '"') {
				if (inQuote) {
					terms[currentCol].push(currentTerm);
					currentTerm = '';
					inQuote = false;
				} else {
					inQuote = true;
				}
				continue;
			}

			if (c === ' ' && !inQuote) {
				if (!currentTerm) continue;
				terms[currentCol].push(currentTerm);
				currentCol = '_';
				currentTerm = '';
				continue;
			}

			if (c === ':' && !inQuote) {
				currentCol = currentTerm;
				// Keep the terms already collected when the same column appears
				// more than once in the query.
				if (!hasOwn(terms, currentCol)) terms[currentCol] = [];
				currentTerm = '';
				continue;
			}

			currentTerm += c;
		}

		if (currentTerm) terms[currentCol].push(currentTerm);

		// Filter terms:
		// - Convert wildcards to regex
		// - Remove columns with no results
		// - Add count of terms

		let termCount = 0;
		const keys = [];

		for (const col of Object.keys(terms)) {
			if (!terms[col].length) {
				delete terms[col];
				continue;
			}

			// Iterate backwards because entries may be removed along the way.
			for (let i = terms[col].length - 1; i >= 0; i--) {
				const term = terms[col][i];

				// SQLite FTS doesn't allow "*" queries and neither shall we
				if (term === '*') {
					terms[col].splice(i, 1);
					continue;
				}

				if (term.indexOf('*') >= 0) {
					terms[col][i] = { type: 'regex', value: this.queryTermToRegex(term) };
				}
			}

			termCount += terms[col].length;
			keys.push(col);
		}

		return {
			termCount: termCount,
			keys: keys,
			terms: terms,
		};
	}

	/**
	 * Flattens the terms of all columns of a parsed query into one array.
	 *
	 * @param {Object} parsedQuery - Result of `parseQuery()`.
	 * @returns {Array} All terms (strings or regex descriptors); empty if the
	 *   query is missing or has no terms.
	 */
	allParsedQueryTerms(parsedQuery) {
		if (!parsedQuery || !parsedQuery.termCount) return [];

		let output = [];
		for (const col of Object.keys(parsedQuery.terms)) {
			output = output.concat(parsedQuery.terms[col]);
		}
		return output;
	}

	/**
	 * Searches `notes_fts` for the given query and returns the matching rows,
	 * most relevant first.
	 *
	 * The raw query string is passed to MATCH as a bound parameter; the parsed
	 * form is only used for ranking.
	 *
	 * @param {string} query - Raw query string.
	 * @returns {Promise<Object[]>} Rows with `id`, `title`, `offsets`, `weight`.
	 */
	async search(query) {
		const parsedQuery = this.parseQuery(query);
		const sql = 'SELECT id, title, offsets(notes_fts) AS offsets FROM notes_fts WHERE notes_fts MATCH ?';
		const rows = await this.db().selectAll(sql, [query]);
		this.orderResults_(rows, parsedQuery);
		return rows;
	}

}
module . exports = SearchEngine ;