2018-12-09 22:45:50 +02:00
const { Logger } = require ( 'lib/logger.js' ) ;
2018-12-10 20:54:46 +02:00
const ItemChange = require ( 'lib/models/ItemChange.js' ) ;
const Setting = require ( 'lib/models/Setting.js' ) ;
const Note = require ( 'lib/models/Note.js' ) ;
const BaseModel = require ( 'lib/BaseModel.js' ) ;
2019-01-14 21:11:54 +02:00
const ItemChangeUtils = require ( 'lib/services/ItemChangeUtils' ) ;
const { pregQuote , scriptType } = require ( 'lib/string-utils.js' ) ;
const removeDiacritics = require ( 'diacritics' ) . remove ;
2019-06-28 01:48:52 +02:00
const { sprintf } = require ( 'sprintf-js' ) ;
2018-12-09 22:45:50 +02:00
/**
 * Full-text search engine for notes.
 *
 * Maintains the `notes_normalized` table (lower-cased, diacritics-stripped
 * copies of note title/body) in sync with note changes recorded in
 * `item_changes`, and answers queries via the SQLite FTS virtual table
 * `notes_fts`. For scripts SQLite FTS cannot tokenize (ja/zh/ko), or when
 * FTS is disabled, it falls back to a basic pattern search via Note.previews().
 */
class SearchEngine {
	constructor() {
		this.dispatch = () => {};
		this.logger_ = new Logger();
		this.db_ = null;
		// Guards syncTables() against concurrent runs.
		this.isIndexing_ = false;
	}

	// Lazily-created process-wide singleton.
	static instance() {
		if (this.instance_) return this.instance_;
		this.instance_ = new SearchEngine();
		return this.instance_;
	}

	setLogger(logger) {
		this.logger_ = logger;
	}

	logger() {
		return this.logger_;
	}

	setDb(db) {
		this.db_ = db;
	}

	db() {
		return this.db_;
	}

	// Returns the note with the given ID from `notes`, or null if absent.
	noteById_(notes, noteId) {
		for (let i = 0; i < notes.length; i++) {
			if (notes[i].id === noteId) return notes[i];
		}
		// The note may have been deleted since the change was recorded. For example in this case:
		// - Note created (Some Change object is recorded)
		// - Note is deleted
		// - ResourceService indexer runs.
		// In that case, there will be a change for the note, but the note will be gone.
		return null;
	}

	// Rebuilds notes_normalized from scratch, in batches of 100 notes.
	// Records the last change ID seen BEFORE indexing so that changes made
	// while indexing runs are processed on the next incremental sync.
	async rebuildIndex_() {
		let noteIds = await this.db().selectAll('SELECT id FROM notes WHERE is_conflict = 0 AND encryption_applied = 0');
		noteIds = noteIds.map(n => n.id);

		const lastChangeId = await ItemChange.lastChangeId();

		// First delete content of note_normalized, in case the previous initial indexing failed
		await this.db().exec('DELETE FROM notes_normalized');

		while (noteIds.length) {
			const currentIds = noteIds.splice(0, 100);
			const notes = await Note.modelSelectAll(`SELECT id, title, body FROM notes WHERE id IN ("${currentIds.join('","')}") AND is_conflict = 0 AND encryption_applied = 0`);
			const queries = [];

			for (let i = 0; i < notes.length; i++) {
				const note = notes[i];
				const n = this.normalizeNote_(note);
				queries.push({ sql: 'INSERT INTO notes_normalized(id, title, body) VALUES (?, ?, ?)', params: [n.id, n.title, n.body] });
			}

			await this.db().transactionExecBatch(queries);
		}

		Setting.setValue('searchEngine.lastProcessedChangeId', lastChangeId);
	}

	// Debounced index sync: schedules syncTables() 10s out, coalescing
	// repeated calls while a timer is already pending.
	scheduleSyncTables() {
		if (this.scheduleSyncTablesIID_) return;

		this.scheduleSyncTablesIID_ = setTimeout(async () => {
			try {
				await this.syncTables();
			} catch (error) {
				this.logger().error('SearchEngine::scheduleSyncTables: Error while syncing tables:', error);
			}
			this.scheduleSyncTablesIID_ = null;
		}, 10000);
	}

	// Forces a full re-index on the next syncTables() call (which it triggers).
	async rebuildIndex() {
		Setting.setValue('searchEngine.lastProcessedChangeId', 0);
		Setting.setValue('searchEngine.initialIndexingDone', false);
		return this.syncTables();
	}

	// Brings notes_normalized up to date: full rebuild on first run, then
	// incremental processing of item_changes rows newer than the last
	// processed change ID. No-op if an indexing pass is already running.
	async syncTables() {
		if (this.isIndexing_) return;

		this.isIndexing_ = true;

		this.logger().info('SearchEngine: Updating FTS table...');

		await ItemChange.waitForAllSaved();

		if (!Setting.value('searchEngine.initialIndexingDone')) {
			await this.rebuildIndex_();
			Setting.setValue('searchEngine.initialIndexingDone', true);
			this.isIndexing_ = false;
			return;
		}

		const startTime = Date.now();

		const report = {
			inserted: 0,
			deleted: 0,
		};

		let lastChangeId = Setting.value('searchEngine.lastProcessedChangeId');

		try {
			while (true) {
				const changes = await ItemChange.modelSelectAll(
					`
					SELECT id, item_id, type
					FROM item_changes
					WHERE item_type = ?
					AND id > ?
					ORDER BY id ASC
					LIMIT 10
				`,
					[BaseModel.TYPE_NOTE, lastChangeId]
				);

				if (!changes.length) break;

				const noteIds = changes.map(a => a.item_id);
				const notes = await Note.modelSelectAll(`SELECT id, title, body FROM notes WHERE id IN ("${noteIds.join('","')}") AND is_conflict = 0 AND encryption_applied = 0`);
				const queries = [];

				for (let i = 0; i < changes.length; i++) {
					const change = changes[i];

					if (change.type === ItemChange.TYPE_CREATE || change.type === ItemChange.TYPE_UPDATE) {
						// DELETE then INSERT so updates don't leave duplicate rows.
						queries.push({ sql: 'DELETE FROM notes_normalized WHERE id = ?', params: [change.item_id] });
						const note = this.noteById_(notes, change.item_id);
						if (note) {
							const n = this.normalizeNote_(note);
							queries.push({ sql: 'INSERT INTO notes_normalized(id, title, body) VALUES (?, ?, ?)', params: [change.item_id, n.title, n.body] });
							report.inserted++;
						}
					} else if (change.type === ItemChange.TYPE_DELETE) {
						queries.push({ sql: 'DELETE FROM notes_normalized WHERE id = ?', params: [change.item_id] });
						report.deleted++;
					} else {
						throw new Error(`Invalid change type: ${change.type}`);
					}

					lastChangeId = change.id;
				}

				await this.db().transactionExecBatch(queries);
				Setting.setValue('searchEngine.lastProcessedChangeId', lastChangeId);
				await Setting.saveAll();
			}
		} catch (error) {
			this.logger().error('SearchEngine: Error while processing changes:', error);
		}

		await ItemChangeUtils.deleteProcessedChanges();

		this.logger().info(sprintf('SearchEngine: Updated FTS table in %dms. Inserted: %d. Deleted: %d', Date.now() - startTime, report.inserted, report.deleted));

		this.isIndexing_ = false;
	}

	// Total number of rows in the FTS table.
	async countRows() {
		const sql = 'SELECT count(*) as total FROM notes_fts';
		const row = await this.db().selectOne(sql);
		return row && row['total'] ? row['total'] : 0;
	}

	// Given an FTS offsets() array (groups of 4 integers per match), returns
	// the distinct zero-based column indexes that contained a match.
	columnIndexesFromOffsets_(offsets) {
		const occurenceCount = Math.floor(offsets.length / 4);
		const indexes = [];

		for (let i = 0; i < occurenceCount; i++) {
			const colIndex = offsets[i * 4] - 1;
			if (indexes.indexOf(colIndex) < 0) indexes.push(colIndex);
		}

		return indexes;
	}

	// Computes a relevance weight for one result row from its FTS offsets.
	calculateWeight_(offsets, termCount) {
		// Offset doc: https://www.sqlite.org/fts3.html#offsets

		// - If there's only one term in the query string, the content with the most matches goes on top
		// - If there are multiple terms, the result with the most occurences that are closest to each others go on top.
		//   eg. if query is "abcd efgh", "abcd efgh" will go before "abcd XX efgh".

		const occurenceCount = Math.floor(offsets.length / 4);

		if (termCount === 1) return occurenceCount;

		let spread = 0;
		let previousDist = null;
		for (let i = 0; i < occurenceCount; i++) {
			const dist = offsets[i * 4 + 2];

			if (previousDist !== null) {
				const delta = dist - previousDist;
				spread += delta;
			}

			previousDist = dist;
		}

		// Divide the number of occurences by the spread so even if a note has many times the searched terms
		// but these terms are very spread appart, they'll be given a lower weight than a note that has the
		// terms once or twice but just next to each others.
		// NOTE(review): if all occurrences share the same byte offset, spread is 0
		// and this yields Infinity, ranking the row first — confirm whether that
		// edge case can occur in practice before guarding against it.
		return occurenceCount / spread;
	}

	// Sorts rows in place: higher weight first, completed todos pushed down,
	// then most recently updated first.
	orderResults_(rows, parsedQuery) {
		for (let i = 0; i < rows.length; i++) {
			const row = rows[i];
			const offsets = row.offsets.split(' ').map(o => Number(o));
			row.weight = this.calculateWeight_(offsets, parsedQuery.termCount);
			// row.colIndexes = this.columnIndexesFromOffsets_(offsets);
			// row.offsets = offsets;
		}

		rows.sort((a, b) => {
			if (a.weight < b.weight) return +1;
			if (a.weight > b.weight) return -1;
			// NOTE(review): when BOTH rows are completed todos this comparator is
			// inconsistent (a<b and b<a); behaviour kept as-is since engines
			// tolerate it, but worth confirming.
			if (a.is_todo && a.todo_completed) return +1;
			if (b.is_todo && b.todo_completed) return -1;
			if (a.user_updated_time < b.user_updated_time) return +1;
			if (a.user_updated_time > b.user_updated_time) return -1;
			return 0;
		});
	}

	// Converts a wildcard search term (e.g. "abc*") to a regex source string.
	// https://stackoverflow.com/a/13818704/561309
	queryTermToRegex(term) {
		// Leading wildcards are meaningless for prefix search - strip them.
		while (term.length && term.indexOf('*') === 0) {
			term = term.substr(1);
		}

		let regexString = pregQuote(term);
		if (regexString[regexString.length - 1] === '*') {
			// Replace the escaped trailing "\*" with a lazy run of non-separator chars.
			regexString = `${regexString.substr(0, regexString.length - 2)}[^${pregQuote(' \t\n\r,.,+-*?!={}<>|:"\'()[]')}]` + '*?';
			// regexString = regexString.substr(0, regexString.length - 2) + '.*?';
		}

		return regexString;
	}

	// Parses a query string into per-column term lists. Supports double-quoted
	// phrases, "column:term" prefixes, and "*" wildcards. Returns
	// { termCount, keys, terms } where terms maps column name ("_" = any) to
	// an array of { type, value, scriptType[, valueRegex] } objects.
	parseQuery(query) {
		const terms = { _: [] };

		let inQuote = false;
		let currentCol = '_';
		let currentTerm = '';
		for (let i = 0; i < query.length; i++) {
			const c = query[i];

			if (c === '"') {
				if (inQuote) {
					terms[currentCol].push(currentTerm);
					currentTerm = '';
					inQuote = false;
				} else {
					inQuote = true;
				}
				continue;
			}

			if (c === ' ' && !inQuote) {
				if (!currentTerm) continue;
				terms[currentCol].push(currentTerm);
				currentCol = '_';
				currentTerm = '';
				continue;
			}

			if (c === ':' && !inQuote) {
				currentCol = currentTerm;
				if (!terms[currentCol]) terms[currentCol] = [];
				currentTerm = '';
				continue;
			}

			currentTerm += c;
		}

		if (currentTerm) terms[currentCol].push(currentTerm);

		// Filter terms:
		// - Convert wildcards to regex
		// - Remove columns with no results
		// - Add count of terms
		let termCount = 0;
		const keys = [];
		for (let col in terms) {
			if (!terms.hasOwnProperty(col)) continue;

			if (!terms[col].length) {
				delete terms[col];
				continue;
			}

			for (let i = terms[col].length - 1; i >= 0; i--) {
				const term = terms[col][i];

				// SQlLite FTS doesn't allow "*" queries and neither shall we
				if (term === '*') {
					terms[col].splice(i, 1);
					continue;
				}

				if (term.indexOf('*') >= 0) {
					terms[col][i] = { type: 'regex', value: term, scriptType: scriptType(term), valueRegex: this.queryTermToRegex(term) };
				} else {
					terms[col][i] = { type: 'text', value: term, scriptType: scriptType(term) };
				}
			}

			termCount += terms[col].length;

			keys.push(col);
		}

		return {
			termCount: termCount,
			keys: keys,
			terms: terms,
		};
	}

	// Flattens parseQuery() output into a single array of term objects.
	allParsedQueryTerms(parsedQuery) {
		if (!parsedQuery || !parsedQuery.termCount) return [];

		let output = [];
		for (let col in parsedQuery.terms) {
			if (!parsedQuery.terms.hasOwnProperty(col)) continue;
			output = output.concat(parsedQuery.terms[col]);
		}
		return output;
	}

	// Lower-cases and strips diacritics so index and query text match.
	normalizeText_(text) {
		// Not all JS engines expose String.prototype.normalize - skip if absent.
		const normalizedText = text.normalize ? text.normalize() : text;
		return removeDiacritics(normalizedText.toLowerCase());
	}

	// Returns a shallow copy of the note with normalized title and body.
	normalizeNote_(note) {
		const n = Object.assign({}, note);
		n.title = this.normalizeText_(n.title);
		n.body = this.normalizeText_(n.body);
		return n;
	}

	// Fallback search used when FTS is unavailable or cannot handle the
	// query's script. Wildcards are stripped and each column's first term is
	// turned into a "*term*" pattern for Note.previews().
	async basicSearch(query) {
		// BUG FIX: the original used query.replace(/\*/, ''), which removed only
		// the FIRST "*" - later wildcards leaked into the patterns below.
		query = query.replace(/\*/g, '');
		const parsedQuery = this.parseQuery(query);
		const searchOptions = {};

		for (const key of parsedQuery.keys) {
			const term = parsedQuery.terms[key][0].value;
			if (key === '_') searchOptions.anywherePattern = `*${term}*`;
			if (key === 'title') searchOptions.titlePattern = `*${term}*`;
			if (key === 'body') searchOptions.bodyPattern = `*${term}*`;
		}

		return Note.previews(null, searchOptions);
	}

	// Main entry point: normalizes the query, picks FTS MATCH or the basic
	// fallback, and returns ordered result rows ([] on FTS query errors).
	async search(query) {
		query = this.normalizeText_(query);
		query = query.replace(/-/g, ' '); // https://github.com/laurent22/joplin/issues/1075#issuecomment-459258856

		const st = scriptType(query);

		if (!Setting.value('db.ftsEnabled') || ['ja', 'zh', 'ko'].indexOf(st) >= 0) {
			// Non-alphabetical languages aren't support by SQLite FTS (except with extensions which are not available in all platforms)
			return this.basicSearch(query);
		} else {
			const parsedQuery = this.parseQuery(query);

			const sql = 'SELECT notes_fts.id, notes_fts.title AS normalized_title, offsets(notes_fts) AS offsets, notes.title, notes.user_updated_time, notes.is_todo, notes.todo_completed, notes.parent_id FROM notes_fts LEFT JOIN notes ON notes_fts.id = notes.id WHERE notes_fts MATCH ?';
			try {
				const rows = await this.db().selectAll(sql, [query]);
				this.orderResults_(rows, parsedQuery);
				return rows;
			} catch (error) {
				// An invalid MATCH expression (e.g. unbalanced quotes) is a user
				// input problem, not a crash - log and return no results.
				this.logger().warn(`Cannot execute MATCH query: ${query}: ${error.message}`);
				return [];
			}
		}
	}
}
2019-07-29 15:43:53 +02:00
module . exports = SearchEngine ;