2018-12-09 21:45:50 +01:00
const { Logger } = require ( 'lib/logger.js' ) ;
2018-12-10 19:58:49 +01:00
const { shim } = require ( 'lib/shim.js' ) ;
2018-12-10 18:54:46 +00:00
const ItemChange = require ( 'lib/models/ItemChange.js' ) ;
const Setting = require ( 'lib/models/Setting.js' ) ;
const Note = require ( 'lib/models/Note.js' ) ;
const BaseModel = require ( 'lib/BaseModel.js' ) ;
2019-01-14 19:11:54 +00:00
const ItemChangeUtils = require ( 'lib/services/ItemChangeUtils' ) ;
const { pregQuote , scriptType } = require ( 'lib/string-utils.js' ) ;
const removeDiacritics = require ( 'diacritics' ) . remove ;
2018-12-09 21:45:50 +01:00
/**
 * Note search service.
 *
 * Maintains the `notes_normalized` table (lower-cased, diacritics-free copies
 * of note titles and bodies) from the recorded item changes, and runs queries
 * either against the `notes_fts` table or, when FTS is disabled or the query
 * script is Japanese/Chinese/Korean, through `Note.previews()` pattern search.
 */
class SearchEngine {

	constructor() {
		this.dispatch = (action) => {};
		this.logger_ = new Logger();
		this.db_ = null;
		// True while syncTables() is running, to prevent overlapping runs.
		this.isIndexing_ = false;
		// Handle of the pending scheduleSyncTables() timer, or null when none is pending.
		this.scheduleSyncTablesIID_ = null;
	}

	/**
	 * Returns the shared SearchEngine, creating it on first use.
	 *
	 * @returns {SearchEngine}
	 */
	static instance() {
		if (this.instance_) return this.instance_;
		this.instance_ = new SearchEngine();
		return this.instance_;
	}

	setLogger(logger) {
		this.logger_ = logger;
	}

	logger() {
		return this.logger_;
	}

	setDb(db) {
		this.db_ = db;
	}

	db() {
		return this.db_;
	}

	/**
	 * Finds a note by ID in an already-loaded list of notes.
	 *
	 * @param {Object[]} notes - Note objects, each with an `id` property.
	 * @param {string} noteId - ID to look for.
	 * @returns {Object|null} The matching note, or null if it is not in the list.
	 */
	noteById_(notes, noteId) {
		for (let i = 0; i < notes.length; i++) {
			if (notes[i].id === noteId) return notes[i];
		}
		// The note may have been deleted since the change was recorded. For example in this case:
		// - Note created (Some Change object is recorded)
		// - Note is deleted
		// - ResourceService indexer runs.
		// In that case, there will be a change for the note, but the note will be gone.
		return null;
	}

	/**
	 * Rebuilds `notes_normalized` from scratch, in batches of 100 notes, then
	 * records the last change ID that was current when the rebuild started.
	 *
	 * @returns {Promise<void>}
	 */
	async rebuildIndex_() {
		const rows = await this.db().selectAll('SELECT id FROM notes WHERE is_conflict = 0 AND encryption_applied = 0');
		const noteIds = rows.map(n => n.id);

		// Read before indexing so that changes recorded during the rebuild are
		// still picked up by the next incremental sync.
		const lastChangeId = await ItemChange.lastChangeId();

		// First delete content of notes_normalized, in case the previous initial indexing failed
		await this.db().exec('DELETE FROM notes_normalized');

		while (noteIds.length) {
			const currentIds = noteIds.splice(0, 100);
			const notes = await Note.modelSelectAll('SELECT id, title, body FROM notes WHERE id IN ("' + currentIds.join('","') + '") AND is_conflict = 0 AND encryption_applied = 0');
			const queries = [];

			for (let i = 0; i < notes.length; i++) {
				const n = this.normalizeNote_(notes[i]);
				queries.push({ sql: 'INSERT INTO notes_normalized(id, title, body) VALUES (?, ?, ?)', params: [n.id, n.title, n.body] });
			}

			await this.db().transactionExecBatch(queries);
		}

		Setting.setValue('searchEngine.lastProcessedChangeId', lastChangeId);
	}

	/**
	 * Schedules a single syncTables() run 10 seconds from now. Does nothing if
	 * a run is already scheduled.
	 */
	scheduleSyncTables() {
		if (this.scheduleSyncTablesIID_) return;

		this.scheduleSyncTablesIID_ = setTimeout(async () => {
			try {
				await this.syncTables();
			} catch (error) {
				// Nothing can catch a rejection from a timer callback, so report it here.
				this.logger().warn('SearchEngine: Could not update FTS table: ' + error.message);
			} finally {
				// Always clear the handle, otherwise one failure would block all future scheduling.
				this.scheduleSyncTablesIID_ = null;
			}
		}, 10000);
	}

	/**
	 * Brings `notes_normalized` up to date. On the first run this is a full
	 * rebuild; afterwards the note changes recorded in `item_changes` since
	 * `searchEngine.lastProcessedChangeId` are replayed in batches of 100.
	 * Returns immediately if a run is already in progress.
	 *
	 * @returns {Promise<void>}
	 * @throws {Error} If a change has an unknown type, or if a database call fails.
	 */
	async syncTables() {
		if (this.isIndexing_) return;

		this.isIndexing_ = true;

		try {
			this.logger().info('SearchEngine: Updating FTS table...');

			await ItemChange.waitForAllSaved();

			if (!Setting.value('searchEngine.initialIndexingDone')) {
				await this.rebuildIndex_();
				Setting.setValue('searchEngine.initialIndexingDone', true);
				return;
			}

			const startTime = Date.now();

			let lastChangeId = Setting.value('searchEngine.lastProcessedChangeId');

			while (true) {
				const changes = await ItemChange.modelSelectAll(`
					SELECT id, item_id, type
					FROM item_changes
					WHERE item_type = ?
					AND id > ?
					ORDER BY id ASC
					LIMIT 100
				`, [BaseModel.TYPE_NOTE, lastChangeId]);

				if (!changes.length) break;

				const noteIds = changes.map(a => a.item_id);
				const notes = await Note.modelSelectAll('SELECT id, title, body FROM notes WHERE id IN ("' + noteIds.join('","') + '") AND is_conflict = 0 AND encryption_applied = 0');
				const queries = [];

				for (let i = 0; i < changes.length; i++) {
					const change = changes[i];

					if (change.type === ItemChange.TYPE_CREATE || change.type === ItemChange.TYPE_UPDATE) {
						// Replace any existing row. The note is only re-inserted if it
						// still exists and passed the conflict/encryption filter above.
						queries.push({ sql: 'DELETE FROM notes_normalized WHERE id = ?', params: [change.item_id] });
						const note = this.noteById_(notes, change.item_id);
						if (note) {
							const n = this.normalizeNote_(note);
							queries.push({ sql: 'INSERT INTO notes_normalized(id, title, body) VALUES (?, ?, ?)', params: [change.item_id, n.title, n.body] });
						}
					} else if (change.type === ItemChange.TYPE_DELETE) {
						queries.push({ sql: 'DELETE FROM notes_normalized WHERE id = ?', params: [change.item_id] });
					} else {
						throw new Error('Invalid change type: ' + change.type);
					}

					lastChangeId = change.id;
				}

				await this.db().transactionExecBatch(queries);
				Setting.setValue('searchEngine.lastProcessedChangeId', lastChangeId);
				await Setting.saveAll();
			}

			await ItemChangeUtils.deleteProcessedChanges();

			this.logger().info('SearchEngine: Updated FTS table in ' + (Date.now() - startTime) + 'ms');
		} finally {
			// Reset on every exit path, including errors, so that a failed run
			// does not permanently disable indexing.
			this.isIndexing_ = false;
		}
	}

	/**
	 * Counts the rows of the `notes_fts` table.
	 *
	 * @returns {Promise<number>} Row count, or 0 if the query returned nothing.
	 */
	async countRows() {
		const sql = 'SELECT count(*) as total FROM notes_fts';
		const row = await this.db().selectOne(sql);
		return row && row['total'] ? row['total'] : 0;
	}

	/**
	 * Extracts the distinct values found in the first slot of each 4-number
	 * group of an FTS `offsets()` result, each decremented by one.
	 *
	 * @param {number[]} offsets - Flattened offsets() values, 4 numbers per match.
	 * @returns {number[]} Distinct values, in order of first appearance.
	 */
	columnIndexesFromOffsets_(offsets) {
		const occurrenceCount = Math.floor(offsets.length / 4);
		const indexes = [];

		for (let i = 0; i < occurrenceCount; i++) {
			const colIndex = offsets[i * 4] - 1;
			if (indexes.indexOf(colIndex) < 0) indexes.push(colIndex);
		}

		return indexes;
	}

	/**
	 * Computes the ranking weight of one search result from its FTS offsets.
	 *
	 * - If there's only one term in the query string, the content with the most matches goes on top.
	 * - If there are multiple terms, the result with the most occurrences that are closest to each
	 *   other goes on top. eg. if query is "abcd efgh", "abcd efgh" will go before "abcd XX efgh".
	 *
	 * Offset doc: https://www.sqlite.org/fts3.html#offsets
	 *
	 * @param {number[]} offsets - Flattened offsets() values, 4 numbers per match.
	 * @param {number} termCount - Number of terms in the parsed query.
	 * @returns {number} Weight; higher ranks first. Never NaN or Infinity.
	 */
	calculateWeight_(offsets, termCount) {
		const occurrenceCount = Math.floor(offsets.length / 4);

		if (termCount === 1) return occurrenceCount;

		// Without any match there is nothing to rank (and 0 / 0 would be NaN).
		if (!occurrenceCount) return 0;

		let spread = 0;
		let previousDist = null;

		for (let i = 0; i < occurrenceCount; i++) {
			const dist = offsets[i * 4 + 2];

			if (previousDist !== null) {
				const delta = dist - previousDist;
				spread += delta;
			}

			previousDist = dist;
		}

		// A zero spread (e.g. a single occurrence) would otherwise give Infinity,
		// so fall back to the plain occurrence count.
		if (spread === 0) return occurrenceCount;

		// Divide the number of occurrences by the spread so even if a note has many times the searched terms
		// but these terms are very spread apart, they'll be given a lower weight than a note that has the
		// terms once or twice but just next to each other.
		return occurrenceCount / spread;
	}

	/**
	 * Assigns a `weight` to each row and sorts the rows in place: highest
	 * weight first, then completed to-dos last, then most recently updated first.
	 *
	 * @param {Object[]} rows - Result rows; each must have an `offsets` string
	 *   of space-separated numbers. Mutated and reordered in place.
	 * @param {Object} parsedQuery - Result of parseQuery(); only `termCount` is read.
	 */
	orderResults_(rows, parsedQuery) {
		for (let i = 0; i < rows.length; i++) {
			const row = rows[i];
			const offsets = row.offsets.split(' ').map(o => Number(o));
			row.weight = this.calculateWeight_(offsets, parsedQuery.termCount);
		}

		rows.sort((a, b) => {
			if (a.weight < b.weight) return +1;
			if (a.weight > b.weight) return -1;

			// Only let completion status decide when it differs. Returning +1
			// whenever "a" is completed would make the comparator contradict
			// itself when both rows are completed to-dos.
			const aCompleted = !!(a.is_todo && a.todo_completed);
			const bCompleted = !!(b.is_todo && b.todo_completed);
			if (aCompleted !== bCompleted) return aCompleted ? +1 : -1;

			if (a.user_updated_time < b.user_updated_time) return +1;
			if (a.user_updated_time > b.user_updated_time) return -1;
			return 0;
		});
	}

	/**
	 * Converts a query term that may contain `*` wildcards to a regex source
	 * string. Leading asterisks are dropped, the rest is escaped with
	 * pregQuote(), and a trailing asterisk becomes a lazy match on any run of
	 * characters outside the listed separators.
	 *
	 * https://stackoverflow.com/a/13818704/561309
	 *
	 * @param {string} term - Query term.
	 * @returns {string} Regex source string.
	 */
	queryTermToRegex(term) {
		while (term.length && term.indexOf('*') === 0) {
			term = term.substr(1);
		}

		let regexString = pregQuote(term);
		if (regexString[regexString.length - 1] === '*') {
			// Drop the last two characters before appending the character class.
			// NOTE(review): this assumes pregQuote() escaped the trailing "*" as "\*" - confirm.
			regexString = regexString.substr(0, regexString.length - 2) + '[^' + pregQuote(' \t\n\r,.,+-*?!={}<>|:"\'()[]') + ']' + '*?';
		}

		return regexString;
	}

	/**
	 * Parses a search string into terms grouped by column.
	 *
	 * Terms are separated by spaces, double quotes group several words into one
	 * term, and `column:term` assigns a term to a column. Terms without a
	 * column are stored under the `_` key.
	 *
	 * @param {string} query - Search string.
	 * @returns {{termCount: number, keys: string[], terms: Object}} `terms` maps
	 *   each column to an array of `{ type, value, scriptType[, valueRegex] }`
	 *   objects; `keys` lists the columns that have at least one term.
	 */
	parseQuery(query) {
		const hasOwn = (object, key) => Object.prototype.hasOwnProperty.call(object, key);

		const terms = { _: [] };

		let inQuote = false;
		let currentCol = '_';
		let currentTerm = '';

		for (let i = 0; i < query.length; i++) {
			const c = query[i];

			if (c === '"') {
				if (inQuote) {
					terms[currentCol].push(currentTerm);
					currentTerm = '';
					inQuote = false;
				} else {
					inQuote = true;
				}
				continue;
			}

			if (c === ' ' && !inQuote) {
				if (!currentTerm) continue;
				terms[currentCol].push(currentTerm);
				currentCol = '_';
				currentTerm = '';
				continue;
			}

			if (c === ':' && !inQuote) {
				currentCol = currentTerm;
				// Test for an own property: a truthiness test would be fooled by inherited
				// members when the column is named, for example, "constructor".
				if (!hasOwn(terms, currentCol)) terms[currentCol] = [];
				currentTerm = '';
				continue;
			}

			currentTerm += c;
		}

		if (currentTerm) terms[currentCol].push(currentTerm);

		// Filter terms:
		// - Convert wildcards to regex
		// - Remove columns with no results
		// - Add count of terms

		let termCount = 0;
		const keys = [];
		for (let col in terms) {
			if (!hasOwn(terms, col)) continue;

			for (let i = terms[col].length - 1; i >= 0; i--) {
				const term = terms[col][i];

				// SQLite FTS doesn't allow "*" queries and neither shall we
				if (term === '*') {
					terms[col].splice(i, 1);
					continue;
				}

				if (term.indexOf('*') >= 0) {
					terms[col][i] = { type: 'regex', value: term, scriptType: scriptType(term), valueRegex: this.queryTermToRegex(term) };
				} else {
					terms[col][i] = { type: 'text', value: term, scriptType: scriptType(term) };
				}
			}

			// Checked after the "*" filtering above, so that a column left without
			// any term is not reported in `keys`.
			if (!terms[col].length) {
				delete terms[col];
				continue;
			}

			termCount += terms[col].length;
			keys.push(col);
		}

		return {
			termCount: termCount,
			keys: keys,
			terms: terms,
		};
	}

	/**
	 * Flattens the terms of every column of a parsed query into one array.
	 *
	 * @param {Object} parsedQuery - Result of parseQuery(); may be null or undefined.
	 * @returns {Object[]} All term objects, or an empty array if there are none.
	 */
	allParsedQueryTerms(parsedQuery) {
		if (!parsedQuery || !parsedQuery.termCount) return [];

		let output = [];
		for (let col in parsedQuery.terms) {
			if (!Object.prototype.hasOwnProperty.call(parsedQuery.terms, col)) continue;
			output = output.concat(parsedQuery.terms[col]);
		}
		return output;
	}

	/**
	 * Normalises text for indexing and querying: Unicode normalisation (when
	 * available), lower-casing, then diacritics removal.
	 *
	 * @param {string} text
	 * @returns {string}
	 */
	normalizeText_(text) {
		const normalizedText = text.normalize ? text.normalize() : text;
		return removeDiacritics(normalizedText.toLowerCase());
	}

	/**
	 * Returns a shallow copy of a note with its title and body normalised.
	 *
	 * @param {Object} note - Object with string `title` and `body` properties.
	 * @returns {Object} Copy of the note; the input is not modified.
	 */
	normalizeNote_(note) {
		const n = Object.assign({}, note);
		n.title = this.normalizeText_(n.title);
		n.body = this.normalizeText_(n.body);
		return n;
	}

	/**
	 * Fallback search that does not use FTS: builds `*term*` patterns from the
	 * first term of each supported column and passes them to Note.previews().
	 *
	 * @param {string} query - Search string.
	 * @returns {Promise<*>} Whatever Note.previews() resolves to.
	 */
	async basicSearch(query) {
		// NOTE(review): without the "g" flag only the first asterisk is removed - confirm
		// whether every asterisk was meant to be stripped.
		query = query.replace(/\*/, '');
		const parsedQuery = this.parseQuery(query);
		const searchOptions = {};

		for (const key of parsedQuery.keys) {
			const term = parsedQuery.terms[key][0].value;
			if (key === '_') searchOptions.anywherePattern = '*' + term + '*';
			if (key === 'title') searchOptions.titlePattern = '*' + term + '*';
			if (key === 'body') searchOptions.bodyPattern = '*' + term + '*';
		}

		return Note.previews(null, searchOptions);
	}

	/**
	 * Searches notes for the given query.
	 *
	 * @param {string} query - Search string.
	 * @returns {Promise<Object[]>} Matching rows ordered by relevance when FTS
	 *   is used; an empty array if the MATCH query fails.
	 */
	async search(query) {
		query = this.normalizeText_(query);
		query = query.replace(/-/g, ' '); // https://github.com/laurent22/joplin/issues/1075#issuecomment-459258856

		const st = scriptType(query);

		if (!Setting.value('db.ftsEnabled') || ['ja', 'zh', 'ko'].indexOf(st) >= 0) {
			// Non-alphabetical languages aren't supported by SQLite FTS (except with extensions which are not available on all platforms)
			return this.basicSearch(query);
		}

		const parsedQuery = this.parseQuery(query);
		const sql = 'SELECT notes_fts.id, notes_fts.title AS normalized_title, offsets(notes_fts) AS offsets, notes.title, notes.user_updated_time, notes.is_todo, notes.todo_completed, notes.parent_id FROM notes_fts LEFT JOIN notes ON notes_fts.id = notes.id WHERE notes_fts MATCH ?';

		try {
			const rows = await this.db().selectAll(sql, [query]);
			this.orderResults_(rows, parsedQuery);
			return rows;
		} catch (error) {
			// The query is passed to MATCH as typed, so a syntax it rejects is
			// reported as "no results" rather than as a failure.
			this.logger().warn('Cannot execute MATCH query: ' + query + ': ' + error.message);
			return [];
		}
	}

}
// Expose the SearchEngine class as this module's CommonJS export.
module . exports = SearchEngine ;