1
0
mirror of https://github.com/laurent22/joplin.git synced 2025-11-06 09:19:22 +02:00

Desktop: Fuzzy search (#3632)

This commit is contained in:
Naveen M V
2020-09-06 17:37:00 +05:30
committed by GitHub
parent a8296e2e37
commit e108fdb1d8
31 changed files with 602 additions and 209 deletions

View File

@@ -81,6 +81,11 @@ class SearchEngine {
);
}
if (!noteIds.length && (Setting.value('db.fuzzySearchEnabled') == 1)) {
// On the last loop
queries.push({ sql: 'INSERT INTO notes_spellfix(word,rank) SELECT term, documents FROM search_aux WHERE col=\'*\'' });
}
await this.db().transactionExecBatch(queries);
}
@@ -145,7 +150,16 @@ class SearchEngine {
[BaseModel.TYPE_NOTE, lastChangeId]
);
if (!changes.length) break;
const queries = [];
if (!changes.length) {
if (Setting.value('db.fuzzySearchEnabled') === 1) {
queries.push({ sql: 'DELETE FROM notes_spellfix' });
queries.push({ sql: 'INSERT INTO notes_spellfix(word,rank) SELECT term, documents FROM search_aux WHERE col=\'*\'' });
await this.db().transactionExecBatch(queries);
}
break;
}
const noteIds = changes.map(a => a.item_id);
const notes = await Note.modelSelectAll(`
@@ -153,8 +167,6 @@ class SearchEngine {
FROM notes WHERE id IN ("${noteIds.join('","')}") AND is_conflict = 0 AND encryption_applied = 0`
);
const queries = [];
for (let i = 0; i < changes.length; i++) {
const change = changes[i];
@@ -254,7 +266,7 @@ class SearchEngine {
calculateWeightBM25_(rows) {
calculateWeightBM25_(rows, fuzzyScore) {
// https://www.sqlite.org/fts3.html#matchinfo
// pcnalx are the arguments passed to matchinfo
// p - The number of matchable phrases in the query.
@@ -318,14 +330,20 @@ class SearchEngine {
for (let i = 0; i < rows.length; i++) {
const row = rows[i];
row.weight = 0;
row.fuzziness = 1000;
row.wordFound = [];
for (let j = 0; j < numPhrases; j++) {
let found = false;
columns.forEach(column => {
const rowsWithHits = docsWithHits(X[i], column, j);
const frequencyHits = hitsThisRow(X[i], column, j);
const idf = IDF(rowsWithHits, numRows);
found = found ? found : (frequencyHits > 0);
row.weight += BM25(idf, frequencyHits, numTokens[column][i], avgTokens[column]);
row.fuzziness = (frequencyHits > 0) ? Math.min(row.fuzziness, fuzzyScore[j]) : row.fuzziness;
});
row.wordFound.push(found);
}
}
}
@@ -345,23 +363,40 @@ class SearchEngine {
row.fields = Object.keys(matchedFields).filter(key => matchedFields[key]);
row.weight = 0;
row.fuzziness = 0;
}
}
processResults_(rows, parsedQuery, isBasicSearchResults = false) {
const rowContainsAllWords = (wordsFound, numFuzzyMatches) => {
let start = 0;
let end = 0;
for (let i = 0; i < numFuzzyMatches.length; i++) {
end = end + numFuzzyMatches[i];
if (!(wordsFound.slice(start, end).find(x => x))) {
// This note doesn't contain any fuzzy matches for the word
return false;
}
start = end;
}
return true;
};
if (isBasicSearchResults) {
this.processBasicSearchResults_(rows, parsedQuery);
} else {
this.calculateWeightBM25_(rows);
this.calculateWeightBM25_(rows, parsedQuery.fuzzyScore);
for (let i = 0; i < rows.length; i++) {
const row = rows[i];
row.include = (parsedQuery.fuzzy && !parsedQuery.any) ? rowContainsAllWords(row.wordFound, parsedQuery.numFuzzyMatches) : true;
const offsets = row.offsets.split(' ').map(o => Number(o));
row.fields = this.fieldNamesFromOffsets_(offsets);
}
}
rows.sort((a, b) => {
if (a.fuzziness < b.fuzziness) return -1;
if (a.fuzziness > b.fuzziness) return +1;
if (a.fields.includes('title') && !b.fields.includes('title')) return -1;
if (!a.fields.includes('title') && b.fields.includes('title')) return +1;
if (a.weight < b.weight) return +1;
@@ -389,21 +424,75 @@ class SearchEngine {
return regexString;
}
parseQuery(query) {
async fuzzifier(words) {
const fuzzyMatches = [];
words.forEach(word => {
const fuzzyWords = this.db().selectAll('SELECT word, score FROM notes_spellfix WHERE word MATCH ? AND top=3', [word]);
fuzzyMatches.push(fuzzyWords);
});
return await Promise.all(fuzzyMatches);
}
async parseQuery(query, fuzzy = false) {
// fuzzy = false;
const trimQuotes = (str) => str.startsWith('"') ? str.substr(1, str.length - 2) : str;
let allTerms = [];
let allFuzzyTerms = [];
try {
allTerms = filterParser(query);
} catch (error) {
console.warn(error);
}
const textTerms = allTerms.filter(x => x.name === 'text').map(x => trimQuotes(x.value));
const titleTerms = allTerms.filter(x => x.name === 'title').map(x => trimQuotes(x.value));
const bodyTerms = allTerms.filter(x => x.name === 'body').map(x => trimQuotes(x.value));
const textTerms = allTerms.filter(x => x.name === 'text' && !x.negated);
const titleTerms = allTerms.filter(x => x.name === 'title' && !x.negated);
const bodyTerms = allTerms.filter(x => x.name === 'body' && !x.negated);
const terms = { _: textTerms, 'title': titleTerms, 'body': bodyTerms };
const fuzzyScore = [];
let numFuzzyMatches = [];
let terms = null;
if (fuzzy) {
const fuzzyText = await this.fuzzifier(textTerms.filter(x => !x.quoted).map(x => trimQuotes(x.value)));
const fuzzyTitle = await this.fuzzifier(titleTerms.map(x => trimQuotes(x.value)));
const fuzzyBody = await this.fuzzifier(bodyTerms.map(x => trimQuotes(x.value)));
const phraseSearches = textTerms.filter(x => x.quoted).map(x => x.value);
// Save number of matches we got for each word
// fuzzifier() is currently set to return at most 3 matches)
// We need to know which fuzzy words go together so that we can filter out notes that don't contain a required word.
numFuzzyMatches = fuzzyText.concat(fuzzyTitle).concat(fuzzyBody).map(x => x.length);
for (let i = 0; i < phraseSearches.length; i++) {
numFuzzyMatches.push(1); // Phrase searches are preserved without fuzzification
}
const mergedFuzzyText = [].concat.apply([], fuzzyText);
const mergedFuzzyTitle = [].concat.apply([], fuzzyTitle);
const mergedFuzzyBody = [].concat.apply([], fuzzyBody);
const fuzzyTextTerms = mergedFuzzyText.map(x => { return { name: 'text', value: x.word, negated: false, score: x.score }; });
const fuzzyTitleTerms = mergedFuzzyTitle.map(x => { return { name: 'title', value: x.word, negated: false, score: x.score }; });
const fuzzyBodyTerms = mergedFuzzyBody.map(x => { return { name: 'body', value: x.word, negated: false, score: x.score }; });
const phraseTextTerms = phraseSearches.map(x => { return { name: 'text', value: x, negated: false, score: 0 }; });
allTerms = allTerms.filter(x => (x.name !== 'text' && x.name !== 'title' && x.name !== 'body'));
allFuzzyTerms = allTerms.concat(fuzzyTextTerms).concat(fuzzyTitleTerms).concat(fuzzyBodyTerms).concat(phraseTextTerms);
const allTextTerms = allFuzzyTerms.filter(x => x.name === 'title' || x.name === 'body' || x.name === 'text');
for (let i = 0; i < allTextTerms.length; i++) {
fuzzyScore.push(allFuzzyTerms[i].score ? allFuzzyTerms[i].score : 0);
}
terms = { _: fuzzyTextTerms.concat(phraseTextTerms).map(x =>trimQuotes(x.value)), 'title': fuzzyTitleTerms.map(x =>trimQuotes(x.value)), 'body': fuzzyBodyTerms.map(x =>trimQuotes(x.value)) };
} else {
const nonNegatedTextTerms = textTerms.length + titleTerms.length + bodyTerms.length;
for (let i = 0; i < nonNegatedTextTerms; i++) {
fuzzyScore.push(0);
}
terms = { _: textTerms.map(x =>trimQuotes(x.value)), 'title': titleTerms.map(x =>trimQuotes(x.value)), 'body': bodyTerms.map(x =>trimQuotes(x.value)) };
}
// Filter terms:
// - Convert wildcards to regex
@@ -445,7 +534,11 @@ class SearchEngine {
termCount: termCount,
keys: keys,
terms: terms, // text terms
allTerms: allTerms,
allTerms: fuzzy ? allFuzzyTerms : allTerms,
fuzzyScore: fuzzyScore,
numFuzzyMatches: numFuzzyMatches,
fuzzy: fuzzy,
any: !!allTerms.find(term => term.name === 'any'),
};
}
@@ -474,7 +567,7 @@ class SearchEngine {
async basicSearch(query) {
query = query.replace(/\*/, '');
const parsedQuery = this.parseQuery(query);
const parsedQuery = await this.parseQuery(query);
const searchOptions = {};
for (const key of parsedQuery.keys) {
@@ -489,8 +582,8 @@ class SearchEngine {
return Note.previews(null, searchOptions);
}
determineSearchType_(query, preferredSearchType) {
if (preferredSearchType === SearchEngine.SEARCH_TYPE_BASIC) return SearchEngine.SEARCH_TYPE_BASIC;
determineSearchType_(query, options) {
if (options.searchType === SearchEngine.SEARCH_TYPE_BASIC) return SearchEngine.SEARCH_TYPE_BASIC;
// If preferredSearchType is "fts" we auto-detect anyway
// because it's not always supported.
@@ -499,9 +592,12 @@ class SearchEngine {
if (!Setting.value('db.ftsEnabled') || ['ja', 'zh', 'ko', 'th'].indexOf(st) >= 0) {
return SearchEngine.SEARCH_TYPE_BASIC;
} else if ((Setting.value('db.fuzzySearchEnabled') === 1) && options.fuzzy) {
return SearchEngine.SEARCH_TYPE_FTS_FUZZY;
} else {
return SearchEngine.SEARCH_TYPE_FTS;
}
return SearchEngine.SEARCH_TYPE_FTS;
}
async search(searchString, options = null) {
@@ -511,28 +607,31 @@ class SearchEngine {
searchString = this.normalizeText_(searchString);
const searchType = this.determineSearchType_(searchString, options.searchType);
const searchType = this.determineSearchType_(searchString, options);
if (searchType === SearchEngine.SEARCH_TYPE_BASIC) {
// Non-alphabetical languages aren't support by SQLite FTS (except with extensions which are not available in all platforms)
const rows = await this.basicSearch(searchString);
const parsedQuery = this.parseQuery(searchString);
const parsedQuery = await this.parseQuery(searchString);
this.processResults_(rows, parsedQuery, true);
return rows;
} else {
// SEARCH_TYPE_FTS
// SEARCH_TYPE_FTS or SEARCH_TYPE_FTS_FUZZY
// FTS will ignore all special characters, like "-" in the index. So if
// we search for "this-phrase" it won't find it because it will only
// see "this phrase" in the index. Because of this, we remove the dashes
// when searching.
// https://github.com/laurent22/joplin/issues/1075#issuecomment-459258856
const parsedQuery = this.parseQuery(searchString);
const parsedQuery = await this.parseQuery(searchString, options.fuzzy);
try {
const { query, params } = queryBuilder(parsedQuery.allTerms);
const { query, params } = (searchType === SearchEngine.SEARCH_TYPE_FTS_FUZZY) ? queryBuilder(parsedQuery.allTerms, true) : queryBuilder(parsedQuery.allTerms, false);
const rows = await this.db().selectAll(query, params);
this.processResults_(rows, parsedQuery);
if (searchType === SearchEngine.SEARCH_TYPE_FTS_FUZZY && !parsedQuery.any) {
return rows.filter(row => row.include);
}
return rows;
} catch (error) {
this.logger().warn(`Cannot execute MATCH query: ${searchString}: ${error.message}`);
@@ -567,5 +666,6 @@ SearchEngine.instance_ = null;
SearchEngine.SEARCH_TYPE_AUTO = 'auto';
SearchEngine.SEARCH_TYPE_BASIC = 'basic';
SearchEngine.SEARCH_TYPE_FTS = 'fts';
SearchEngine.SEARCH_TYPE_FTS_FUZZY = 'fts_fuzzy';
module.exports = SearchEngine;

View File

@@ -11,7 +11,7 @@ class SearchEngineUtils {
searchType = SearchEngine.SEARCH_TYPE_BASIC;
}
const results = await SearchEngine.instance().search(query, { searchType });
const results = await SearchEngine.instance().search(query, { searchType, fuzzy: options.fuzzy });
const noteIds = results.map(n => n.id);
// We need at least the note ID to be able to sort them below so if not

View File

@@ -3,6 +3,7 @@ interface Term {
name: string
value: string
negated: boolean
quoted?: boolean
}
const makeTerm = (name: string, value: string): Term => {
@@ -10,10 +11,9 @@ const makeTerm = (name: string, value: string): Term => {
return { name: name, value: value, negated: false };
};
const quoted = (s: string) => s.startsWith('"') && s.endsWith('"');
const quote = (s : string) => {
const quoted = (s: string) => s.startsWith('"') && s.endsWith('"');
if (!quoted(s)) {
return `"${s}"`;
}
@@ -71,7 +71,6 @@ const parseQuery = (query: string): Term[] => {
'altitude', 'resource', 'sourceurl']);
const terms = getTerms(query);
// console.log(terms);
const result: Term[] = [];
for (let i = 0; i < terms.length; i++) {
@@ -98,9 +97,9 @@ const parseQuery = (query: string): Term[] => {
// Every word is quoted if not already.
// By quoting the word, FTS match query will take care of removing dashes and other word seperators.
if (value.startsWith('-')) {
result.push({ name: 'text', value: quote(value.slice(1)) , negated: true });
result.push({ name: 'text', value: quote(value.slice(1)) , negated: true, quoted: quoted(value) });
} else {
result.push({ name: 'text', value: quote(value), negated: false });
result.push({ name: 'text', value: quote(value), negated: false, quoted: quoted(value) });
}
}
}

View File

@@ -300,7 +300,7 @@ const sourceUrlFilter = (terms: Term[], conditons: string[], params: string[], r
};
const textFilter = (terms: Term[], conditions: string[], params: string[], relation: Relation) => {
const textFilter = (terms: Term[], conditions: string[], params: string[], relation: Relation, fuzzy: Boolean) => {
const addExcludeTextConditions = (excludedTerms: Term[], conditions:string[], params: string[], relation: Relation) => {
const type = excludedTerms[0].name === 'text' ? '' : `.${excludedTerms[0].name}`;
@@ -342,7 +342,7 @@ const textFilter = (terms: Term[], conditions: string[], params: string[], relat
if (term.name === 'text') return term.value;
else return `${term.name}:${term.value}`;
});
const matchQuery = (relation === 'OR') ? termsToMatch.join(' OR ') : termsToMatch.join(' ');
const matchQuery = (fuzzy || (relation === 'OR')) ? termsToMatch.join(' OR ') : termsToMatch.join(' ');
params.push(matchQuery);
}
@@ -374,14 +374,11 @@ const getConnective = (terms: Term[], relation: Relation): string => {
return (!notebookTerm && (relation === 'OR')) ? 'ROWID=-1' : '1'; // ROWID=-1 acts as 0 (something always false)
};
export default function queryBuilder(terms: Term[]) {
export default function queryBuilder(terms: Term[], fuzzy: boolean) {
const queryParts: string[] = [];
const params: string[] = [];
const withs: string[] = [];
// console.log("testing beep beep boop boop")
// console.log(terms);
const relation: Relation = getDefaultRelation(terms);
queryParts.push(`
@@ -405,8 +402,8 @@ export default function queryBuilder(terms: Term[]) {
resourceFilter(terms, queryParts, params, relation, withs);
textFilter(terms, queryParts, params, relation, fuzzy);
textFilter(terms, queryParts, params, relation);
typeFilter(terms, queryParts, params, relation);