You've already forked joplin
mirror of
https://github.com/laurent22/joplin.git
synced 2025-11-06 09:19:22 +02:00
Desktop: Fuzzy search (#3632)
This commit is contained in:
@@ -81,6 +81,11 @@ class SearchEngine {
|
||||
);
|
||||
}
|
||||
|
||||
if (!noteIds.length && (Setting.value('db.fuzzySearchEnabled') == 1)) {
|
||||
// On the last loop
|
||||
queries.push({ sql: 'INSERT INTO notes_spellfix(word,rank) SELECT term, documents FROM search_aux WHERE col=\'*\'' });
|
||||
}
|
||||
|
||||
await this.db().transactionExecBatch(queries);
|
||||
}
|
||||
|
||||
@@ -145,7 +150,16 @@ class SearchEngine {
|
||||
[BaseModel.TYPE_NOTE, lastChangeId]
|
||||
);
|
||||
|
||||
if (!changes.length) break;
|
||||
const queries = [];
|
||||
|
||||
if (!changes.length) {
|
||||
if (Setting.value('db.fuzzySearchEnabled') === 1) {
|
||||
queries.push({ sql: 'DELETE FROM notes_spellfix' });
|
||||
queries.push({ sql: 'INSERT INTO notes_spellfix(word,rank) SELECT term, documents FROM search_aux WHERE col=\'*\'' });
|
||||
await this.db().transactionExecBatch(queries);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
const noteIds = changes.map(a => a.item_id);
|
||||
const notes = await Note.modelSelectAll(`
|
||||
@@ -153,8 +167,6 @@ class SearchEngine {
|
||||
FROM notes WHERE id IN ("${noteIds.join('","')}") AND is_conflict = 0 AND encryption_applied = 0`
|
||||
);
|
||||
|
||||
const queries = [];
|
||||
|
||||
for (let i = 0; i < changes.length; i++) {
|
||||
const change = changes[i];
|
||||
|
||||
@@ -254,7 +266,7 @@ class SearchEngine {
|
||||
|
||||
|
||||
|
||||
calculateWeightBM25_(rows) {
|
||||
calculateWeightBM25_(rows, fuzzyScore) {
|
||||
// https://www.sqlite.org/fts3.html#matchinfo
|
||||
// pcnalx are the arguments passed to matchinfo
|
||||
// p - The number of matchable phrases in the query.
|
||||
@@ -318,14 +330,20 @@ class SearchEngine {
|
||||
for (let i = 0; i < rows.length; i++) {
|
||||
const row = rows[i];
|
||||
row.weight = 0;
|
||||
row.fuzziness = 1000;
|
||||
row.wordFound = [];
|
||||
for (let j = 0; j < numPhrases; j++) {
|
||||
let found = false;
|
||||
columns.forEach(column => {
|
||||
const rowsWithHits = docsWithHits(X[i], column, j);
|
||||
const frequencyHits = hitsThisRow(X[i], column, j);
|
||||
|
||||
const idf = IDF(rowsWithHits, numRows);
|
||||
found = found ? found : (frequencyHits > 0);
|
||||
|
||||
row.weight += BM25(idf, frequencyHits, numTokens[column][i], avgTokens[column]);
|
||||
row.fuzziness = (frequencyHits > 0) ? Math.min(row.fuzziness, fuzzyScore[j]) : row.fuzziness;
|
||||
});
|
||||
row.wordFound.push(found);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -345,23 +363,40 @@ class SearchEngine {
|
||||
|
||||
row.fields = Object.keys(matchedFields).filter(key => matchedFields[key]);
|
||||
row.weight = 0;
|
||||
row.fuzziness = 0;
|
||||
}
|
||||
}
|
||||
|
||||
processResults_(rows, parsedQuery, isBasicSearchResults = false) {
|
||||
const rowContainsAllWords = (wordsFound, numFuzzyMatches) => {
|
||||
let start = 0;
|
||||
let end = 0;
|
||||
for (let i = 0; i < numFuzzyMatches.length; i++) {
|
||||
end = end + numFuzzyMatches[i];
|
||||
if (!(wordsFound.slice(start, end).find(x => x))) {
|
||||
// This note doesn't contain any fuzzy matches for the word
|
||||
return false;
|
||||
}
|
||||
start = end;
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
if (isBasicSearchResults) {
|
||||
this.processBasicSearchResults_(rows, parsedQuery);
|
||||
} else {
|
||||
this.calculateWeightBM25_(rows);
|
||||
this.calculateWeightBM25_(rows, parsedQuery.fuzzyScore);
|
||||
for (let i = 0; i < rows.length; i++) {
|
||||
const row = rows[i];
|
||||
row.include = (parsedQuery.fuzzy && !parsedQuery.any) ? rowContainsAllWords(row.wordFound, parsedQuery.numFuzzyMatches) : true;
|
||||
const offsets = row.offsets.split(' ').map(o => Number(o));
|
||||
row.fields = this.fieldNamesFromOffsets_(offsets);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
rows.sort((a, b) => {
|
||||
if (a.fuzziness < b.fuzziness) return -1;
|
||||
if (a.fuzziness > b.fuzziness) return +1;
|
||||
if (a.fields.includes('title') && !b.fields.includes('title')) return -1;
|
||||
if (!a.fields.includes('title') && b.fields.includes('title')) return +1;
|
||||
if (a.weight < b.weight) return +1;
|
||||
@@ -389,21 +424,75 @@ class SearchEngine {
|
||||
return regexString;
|
||||
}
|
||||
|
||||
parseQuery(query) {
|
||||
async fuzzifier(words) {
|
||||
const fuzzyMatches = [];
|
||||
words.forEach(word => {
|
||||
const fuzzyWords = this.db().selectAll('SELECT word, score FROM notes_spellfix WHERE word MATCH ? AND top=3', [word]);
|
||||
fuzzyMatches.push(fuzzyWords);
|
||||
});
|
||||
return await Promise.all(fuzzyMatches);
|
||||
}
|
||||
|
||||
async parseQuery(query, fuzzy = false) {
|
||||
// fuzzy = false;
|
||||
const trimQuotes = (str) => str.startsWith('"') ? str.substr(1, str.length - 2) : str;
|
||||
|
||||
let allTerms = [];
|
||||
let allFuzzyTerms = [];
|
||||
|
||||
try {
|
||||
allTerms = filterParser(query);
|
||||
} catch (error) {
|
||||
console.warn(error);
|
||||
}
|
||||
|
||||
const textTerms = allTerms.filter(x => x.name === 'text').map(x => trimQuotes(x.value));
|
||||
const titleTerms = allTerms.filter(x => x.name === 'title').map(x => trimQuotes(x.value));
|
||||
const bodyTerms = allTerms.filter(x => x.name === 'body').map(x => trimQuotes(x.value));
|
||||
const textTerms = allTerms.filter(x => x.name === 'text' && !x.negated);
|
||||
const titleTerms = allTerms.filter(x => x.name === 'title' && !x.negated);
|
||||
const bodyTerms = allTerms.filter(x => x.name === 'body' && !x.negated);
|
||||
|
||||
const terms = { _: textTerms, 'title': titleTerms, 'body': bodyTerms };
|
||||
const fuzzyScore = [];
|
||||
let numFuzzyMatches = [];
|
||||
let terms = null;
|
||||
if (fuzzy) {
|
||||
const fuzzyText = await this.fuzzifier(textTerms.filter(x => !x.quoted).map(x => trimQuotes(x.value)));
|
||||
const fuzzyTitle = await this.fuzzifier(titleTerms.map(x => trimQuotes(x.value)));
|
||||
const fuzzyBody = await this.fuzzifier(bodyTerms.map(x => trimQuotes(x.value)));
|
||||
const phraseSearches = textTerms.filter(x => x.quoted).map(x => x.value);
|
||||
|
||||
// Save number of matches we got for each word
|
||||
// fuzzifier() is currently set to return at most 3 matches)
|
||||
// We need to know which fuzzy words go together so that we can filter out notes that don't contain a required word.
|
||||
numFuzzyMatches = fuzzyText.concat(fuzzyTitle).concat(fuzzyBody).map(x => x.length);
|
||||
for (let i = 0; i < phraseSearches.length; i++) {
|
||||
numFuzzyMatches.push(1); // Phrase searches are preserved without fuzzification
|
||||
}
|
||||
|
||||
const mergedFuzzyText = [].concat.apply([], fuzzyText);
|
||||
const mergedFuzzyTitle = [].concat.apply([], fuzzyTitle);
|
||||
const mergedFuzzyBody = [].concat.apply([], fuzzyBody);
|
||||
|
||||
const fuzzyTextTerms = mergedFuzzyText.map(x => { return { name: 'text', value: x.word, negated: false, score: x.score }; });
|
||||
const fuzzyTitleTerms = mergedFuzzyTitle.map(x => { return { name: 'title', value: x.word, negated: false, score: x.score }; });
|
||||
const fuzzyBodyTerms = mergedFuzzyBody.map(x => { return { name: 'body', value: x.word, negated: false, score: x.score }; });
|
||||
const phraseTextTerms = phraseSearches.map(x => { return { name: 'text', value: x, negated: false, score: 0 }; });
|
||||
|
||||
allTerms = allTerms.filter(x => (x.name !== 'text' && x.name !== 'title' && x.name !== 'body'));
|
||||
|
||||
allFuzzyTerms = allTerms.concat(fuzzyTextTerms).concat(fuzzyTitleTerms).concat(fuzzyBodyTerms).concat(phraseTextTerms);
|
||||
|
||||
const allTextTerms = allFuzzyTerms.filter(x => x.name === 'title' || x.name === 'body' || x.name === 'text');
|
||||
for (let i = 0; i < allTextTerms.length; i++) {
|
||||
fuzzyScore.push(allFuzzyTerms[i].score ? allFuzzyTerms[i].score : 0);
|
||||
}
|
||||
|
||||
terms = { _: fuzzyTextTerms.concat(phraseTextTerms).map(x =>trimQuotes(x.value)), 'title': fuzzyTitleTerms.map(x =>trimQuotes(x.value)), 'body': fuzzyBodyTerms.map(x =>trimQuotes(x.value)) };
|
||||
} else {
|
||||
const nonNegatedTextTerms = textTerms.length + titleTerms.length + bodyTerms.length;
|
||||
for (let i = 0; i < nonNegatedTextTerms; i++) {
|
||||
fuzzyScore.push(0);
|
||||
}
|
||||
terms = { _: textTerms.map(x =>trimQuotes(x.value)), 'title': titleTerms.map(x =>trimQuotes(x.value)), 'body': bodyTerms.map(x =>trimQuotes(x.value)) };
|
||||
}
|
||||
|
||||
// Filter terms:
|
||||
// - Convert wildcards to regex
|
||||
@@ -445,7 +534,11 @@ class SearchEngine {
|
||||
termCount: termCount,
|
||||
keys: keys,
|
||||
terms: terms, // text terms
|
||||
allTerms: allTerms,
|
||||
allTerms: fuzzy ? allFuzzyTerms : allTerms,
|
||||
fuzzyScore: fuzzyScore,
|
||||
numFuzzyMatches: numFuzzyMatches,
|
||||
fuzzy: fuzzy,
|
||||
any: !!allTerms.find(term => term.name === 'any'),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -474,7 +567,7 @@ class SearchEngine {
|
||||
|
||||
async basicSearch(query) {
|
||||
query = query.replace(/\*/, '');
|
||||
const parsedQuery = this.parseQuery(query);
|
||||
const parsedQuery = await this.parseQuery(query);
|
||||
const searchOptions = {};
|
||||
|
||||
for (const key of parsedQuery.keys) {
|
||||
@@ -489,8 +582,8 @@ class SearchEngine {
|
||||
return Note.previews(null, searchOptions);
|
||||
}
|
||||
|
||||
determineSearchType_(query, preferredSearchType) {
|
||||
if (preferredSearchType === SearchEngine.SEARCH_TYPE_BASIC) return SearchEngine.SEARCH_TYPE_BASIC;
|
||||
determineSearchType_(query, options) {
|
||||
if (options.searchType === SearchEngine.SEARCH_TYPE_BASIC) return SearchEngine.SEARCH_TYPE_BASIC;
|
||||
|
||||
// If preferredSearchType is "fts" we auto-detect anyway
|
||||
// because it's not always supported.
|
||||
@@ -499,9 +592,12 @@ class SearchEngine {
|
||||
|
||||
if (!Setting.value('db.ftsEnabled') || ['ja', 'zh', 'ko', 'th'].indexOf(st) >= 0) {
|
||||
return SearchEngine.SEARCH_TYPE_BASIC;
|
||||
} else if ((Setting.value('db.fuzzySearchEnabled') === 1) && options.fuzzy) {
|
||||
return SearchEngine.SEARCH_TYPE_FTS_FUZZY;
|
||||
} else {
|
||||
return SearchEngine.SEARCH_TYPE_FTS;
|
||||
}
|
||||
|
||||
return SearchEngine.SEARCH_TYPE_FTS;
|
||||
}
|
||||
|
||||
async search(searchString, options = null) {
|
||||
@@ -511,28 +607,31 @@ class SearchEngine {
|
||||
|
||||
searchString = this.normalizeText_(searchString);
|
||||
|
||||
const searchType = this.determineSearchType_(searchString, options.searchType);
|
||||
const searchType = this.determineSearchType_(searchString, options);
|
||||
|
||||
if (searchType === SearchEngine.SEARCH_TYPE_BASIC) {
|
||||
// Non-alphabetical languages aren't support by SQLite FTS (except with extensions which are not available in all platforms)
|
||||
const rows = await this.basicSearch(searchString);
|
||||
const parsedQuery = this.parseQuery(searchString);
|
||||
const parsedQuery = await this.parseQuery(searchString);
|
||||
this.processResults_(rows, parsedQuery, true);
|
||||
return rows;
|
||||
} else {
|
||||
// SEARCH_TYPE_FTS
|
||||
// SEARCH_TYPE_FTS or SEARCH_TYPE_FTS_FUZZY
|
||||
// FTS will ignore all special characters, like "-" in the index. So if
|
||||
// we search for "this-phrase" it won't find it because it will only
|
||||
// see "this phrase" in the index. Because of this, we remove the dashes
|
||||
// when searching.
|
||||
// https://github.com/laurent22/joplin/issues/1075#issuecomment-459258856
|
||||
|
||||
const parsedQuery = this.parseQuery(searchString);
|
||||
const parsedQuery = await this.parseQuery(searchString, options.fuzzy);
|
||||
|
||||
try {
|
||||
const { query, params } = queryBuilder(parsedQuery.allTerms);
|
||||
const { query, params } = (searchType === SearchEngine.SEARCH_TYPE_FTS_FUZZY) ? queryBuilder(parsedQuery.allTerms, true) : queryBuilder(parsedQuery.allTerms, false);
|
||||
const rows = await this.db().selectAll(query, params);
|
||||
this.processResults_(rows, parsedQuery);
|
||||
if (searchType === SearchEngine.SEARCH_TYPE_FTS_FUZZY && !parsedQuery.any) {
|
||||
return rows.filter(row => row.include);
|
||||
}
|
||||
return rows;
|
||||
} catch (error) {
|
||||
this.logger().warn(`Cannot execute MATCH query: ${searchString}: ${error.message}`);
|
||||
@@ -567,5 +666,6 @@ SearchEngine.instance_ = null;
|
||||
SearchEngine.SEARCH_TYPE_AUTO = 'auto';
|
||||
SearchEngine.SEARCH_TYPE_BASIC = 'basic';
|
||||
SearchEngine.SEARCH_TYPE_FTS = 'fts';
|
||||
SearchEngine.SEARCH_TYPE_FTS_FUZZY = 'fts_fuzzy';
|
||||
|
||||
module.exports = SearchEngine;
|
||||
|
||||
@@ -11,7 +11,7 @@ class SearchEngineUtils {
|
||||
searchType = SearchEngine.SEARCH_TYPE_BASIC;
|
||||
}
|
||||
|
||||
const results = await SearchEngine.instance().search(query, { searchType });
|
||||
const results = await SearchEngine.instance().search(query, { searchType, fuzzy: options.fuzzy });
|
||||
const noteIds = results.map(n => n.id);
|
||||
|
||||
// We need at least the note ID to be able to sort them below so if not
|
||||
|
||||
@@ -3,6 +3,7 @@ interface Term {
|
||||
name: string
|
||||
value: string
|
||||
negated: boolean
|
||||
quoted?: boolean
|
||||
}
|
||||
|
||||
const makeTerm = (name: string, value: string): Term => {
|
||||
@@ -10,10 +11,9 @@ const makeTerm = (name: string, value: string): Term => {
|
||||
return { name: name, value: value, negated: false };
|
||||
};
|
||||
|
||||
const quoted = (s: string) => s.startsWith('"') && s.endsWith('"');
|
||||
|
||||
const quote = (s : string) => {
|
||||
const quoted = (s: string) => s.startsWith('"') && s.endsWith('"');
|
||||
|
||||
if (!quoted(s)) {
|
||||
return `"${s}"`;
|
||||
}
|
||||
@@ -71,7 +71,6 @@ const parseQuery = (query: string): Term[] => {
|
||||
'altitude', 'resource', 'sourceurl']);
|
||||
|
||||
const terms = getTerms(query);
|
||||
// console.log(terms);
|
||||
|
||||
const result: Term[] = [];
|
||||
for (let i = 0; i < terms.length; i++) {
|
||||
@@ -98,9 +97,9 @@ const parseQuery = (query: string): Term[] => {
|
||||
// Every word is quoted if not already.
|
||||
// By quoting the word, FTS match query will take care of removing dashes and other word seperators.
|
||||
if (value.startsWith('-')) {
|
||||
result.push({ name: 'text', value: quote(value.slice(1)) , negated: true });
|
||||
result.push({ name: 'text', value: quote(value.slice(1)) , negated: true, quoted: quoted(value) });
|
||||
} else {
|
||||
result.push({ name: 'text', value: quote(value), negated: false });
|
||||
result.push({ name: 'text', value: quote(value), negated: false, quoted: quoted(value) });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -300,7 +300,7 @@ const sourceUrlFilter = (terms: Term[], conditons: string[], params: string[], r
|
||||
};
|
||||
|
||||
|
||||
const textFilter = (terms: Term[], conditions: string[], params: string[], relation: Relation) => {
|
||||
const textFilter = (terms: Term[], conditions: string[], params: string[], relation: Relation, fuzzy: Boolean) => {
|
||||
const addExcludeTextConditions = (excludedTerms: Term[], conditions:string[], params: string[], relation: Relation) => {
|
||||
const type = excludedTerms[0].name === 'text' ? '' : `.${excludedTerms[0].name}`;
|
||||
|
||||
@@ -342,7 +342,7 @@ const textFilter = (terms: Term[], conditions: string[], params: string[], relat
|
||||
if (term.name === 'text') return term.value;
|
||||
else return `${term.name}:${term.value}`;
|
||||
});
|
||||
const matchQuery = (relation === 'OR') ? termsToMatch.join(' OR ') : termsToMatch.join(' ');
|
||||
const matchQuery = (fuzzy || (relation === 'OR')) ? termsToMatch.join(' OR ') : termsToMatch.join(' ');
|
||||
params.push(matchQuery);
|
||||
}
|
||||
|
||||
@@ -374,14 +374,11 @@ const getConnective = (terms: Term[], relation: Relation): string => {
|
||||
return (!notebookTerm && (relation === 'OR')) ? 'ROWID=-1' : '1'; // ROWID=-1 acts as 0 (something always false)
|
||||
};
|
||||
|
||||
export default function queryBuilder(terms: Term[]) {
|
||||
export default function queryBuilder(terms: Term[], fuzzy: boolean) {
|
||||
const queryParts: string[] = [];
|
||||
const params: string[] = [];
|
||||
const withs: string[] = [];
|
||||
|
||||
// console.log("testing beep beep boop boop")
|
||||
// console.log(terms);
|
||||
|
||||
const relation: Relation = getDefaultRelation(terms);
|
||||
|
||||
queryParts.push(`
|
||||
@@ -405,8 +402,8 @@ export default function queryBuilder(terms: Term[]) {
|
||||
|
||||
resourceFilter(terms, queryParts, params, relation, withs);
|
||||
|
||||
textFilter(terms, queryParts, params, relation, fuzzy);
|
||||
|
||||
textFilter(terms, queryParts, params, relation);
|
||||
|
||||
typeFilter(terms, queryParts, params, relation);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user