You've already forked joplin
mirror of
https://github.com/laurent22/joplin.git
synced 2026-06-18 20:16:34 +02:00
Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| e4b8d21700 | |||
| 19ad180602 |
@@ -16,6 +16,7 @@
|
||||
- To compile TypeScript, use `yarn tsc`. To type-check without emitting files, use `yarn tsc --noEmit`.
|
||||
- Default to no comments. Only add one when the why is non-obvious (workaround, hidden constraint, subtle invariant). Never explain what the code does — names handle that. Keep necessary comments to one or two lines where possible.
|
||||
- SQL queries should only be done from within models (in packages/lib/models).
|
||||
- In markdown files, do not hard-wrap paragraphs. Let the renderer wrap lines; only insert newlines for actual paragraph or list breaks.
|
||||
|
||||
## Styling (desktop app)
|
||||
|
||||
|
||||
@@ -4,9 +4,8 @@ import { _ } from '@joplin/lib/locale';
|
||||
import EmbeddingIndexer from '@joplin/lib/services/ai/EmbeddingIndexer';
|
||||
import { IndexStatus } from '@joplin/lib/services/ai/types';
|
||||
|
||||
// Live status panel rendered under the "Enable AI features" toggle. Polls
|
||||
// EmbeddingIndexer.getStatus() while the AI settings page is visible.
|
||||
|
||||
// Live status panel under the "Enable AI features" toggle. Polls
|
||||
// EmbeddingIndexer.getStatus() while the AI section is visible.
|
||||
const POLL_INTERVAL_MS = 2000;
|
||||
|
||||
const modelStatusLabel = (s: IndexStatus['modelDownloadStatus']) => {
|
||||
@@ -33,18 +32,14 @@ const AiIndexStatus = () => {
|
||||
useEffect(() => {
|
||||
let cancelled = false;
|
||||
let timer: ReturnType<typeof setTimeout> | null = null;
|
||||
// Chain polls instead of using setInterval — a fixed-interval timer
|
||||
// would fire while a previous getStatus() was still in flight (e.g.
|
||||
// during model load competing for the renderer thread), letting
|
||||
// requests stack up. Recursive setTimeout pauses the cadence on slow
|
||||
// ticks and resumes cleanly.
|
||||
// Chained setTimeout, not setInterval — keeps polls from stacking up
|
||||
// when getStatus() is slow (e.g. competing with the model load).
|
||||
const tick = async () => {
|
||||
try {
|
||||
const s = await EmbeddingIndexer.instance().getStatus();
|
||||
if (!cancelled) setStatus(s);
|
||||
} catch {
|
||||
// Swallow — the status panel is decorative; we don't want a
|
||||
// transient DB error to crash the settings screen.
|
||||
// Swallow: this panel is decorative.
|
||||
}
|
||||
if (!cancelled) timer = setTimeout(() => void tick(), POLL_INTERVAL_MS);
|
||||
};
|
||||
|
||||
@@ -2,19 +2,13 @@ import BaseModel from '../BaseModel';
|
||||
import JoplinDatabase from '../JoplinDatabase';
|
||||
import { NoteEmbeddingsMetaEntity } from '../services/database/types';
|
||||
|
||||
// Storage for per-note chunk embeddings produced by the AI embeddings index.
|
||||
// Storage for per-note chunk embeddings.
|
||||
//
|
||||
// Metadata (chunk text, source note ID, model identifier) lives in the regular
|
||||
// `note_embeddings_meta` table created by migration 52.
|
||||
//
|
||||
// The associated vectors are stored in a sqlite-vec `vec0` virtual table
|
||||
// (`note_embeddings_vec`) created lazily by `ensureVecTable()` when sqlite-vec
|
||||
// is available. Joining is done by rowid — the meta row's id is the same as
|
||||
// the vec row's rowid.
|
||||
//
|
||||
// All vector-touching methods check `JoplinDatabase.sqliteVecAvailable()` and
|
||||
// throw a clear error if vector search isn't supported on this platform. The
|
||||
// metadata-only methods work regardless.
|
||||
// Metadata lives in `note_embeddings_meta` (regular SQLite table, migration
|
||||
// 52). Vectors live in `note_embeddings_vec`, a sqlite-vec `vec0` virtual
|
||||
// table created lazily when sqlite-vec is available. Joined by rowid.
|
||||
// Vector methods throw if sqlite-vec is missing; metadata methods work
|
||||
// regardless.
|
||||
|
||||
interface SaveChunk {
|
||||
chunkIndex: number;
|
||||
@@ -31,13 +25,9 @@ interface SimilarityResult {
|
||||
|
||||
interface SimilaritySearchOptions {
|
||||
k: number;
|
||||
// Maximum cosine distance to include. sqlite-vec returns L2 distance for
|
||||
// vec0 by default; for normalised vectors L2² ≈ 2·(1 − cosine). The caller
|
||||
// converts as needed; this layer just exposes the distance as returned by
|
||||
// the extension.
|
||||
// Raw L2 distance from vec0 (caller converts to cosine if needed).
|
||||
maxDistance?: number;
|
||||
// Restrict results to this set of note IDs. Used for scoped searches
|
||||
// (notebook, tag, note) — the caller resolves scope → note IDs first.
|
||||
// Pre-resolved scope filter. Empty array = search nothing.
|
||||
noteIds?: string[];
|
||||
}
|
||||
|
||||
@@ -61,9 +51,7 @@ export default class NoteEmbedding extends BaseModel {
|
||||
}
|
||||
}
|
||||
|
||||
// Creates the sqlite-vec virtual table if it doesn't already exist. Called
|
||||
// lazily because the CREATE VIRTUAL TABLE statement fails on platforms
|
||||
// without the extension.
|
||||
// Called lazily — CREATE VIRTUAL TABLE fails without sqlite-vec loaded.
|
||||
public static async ensureVecTable(dimension: number) {
|
||||
this.requireVec();
|
||||
await this.db().exec(
|
||||
@@ -93,27 +81,26 @@ export default class NoteEmbedding extends BaseModel {
|
||||
return row?.c ?? 0;
|
||||
}
|
||||
|
||||
// Picks indexable notes (not trashed, not in conflict) that haven't been
|
||||
// embedded yet. Used by the indexer's backfill path so existing notes
|
||||
// from before AI was enabled get picked up over successive ticks.
|
||||
public static async notYetIndexedNoteIds(limit: number): Promise<string[]> {
|
||||
// Indexable notes not yet embedded — drives the indexer's initial scan.
|
||||
// `excludeIds` lets the caller skip notes that already failed this session,
|
||||
// so a permanently bad note doesn't keep coming back.
|
||||
public static async notYetIndexedNoteIds(limit: number, excludeIds: string[] = []): Promise<string[]> {
|
||||
const excludeSql = excludeIds.length
|
||||
? ` AND n.id NOT IN (${excludeIds.map(() => '?').join(',')})`
|
||||
: '';
|
||||
const rows = await this.db().selectAll<{ id: string }>(
|
||||
`SELECT id FROM notes
|
||||
WHERE (deleted_time IS NULL OR deleted_time = 0)
|
||||
AND (is_conflict IS NULL OR is_conflict = 0)
|
||||
AND id NOT IN (SELECT DISTINCT note_id FROM note_embeddings_meta)
|
||||
`SELECT n.id FROM notes n
|
||||
WHERE (n.deleted_time IS NULL OR n.deleted_time = 0)
|
||||
AND (n.is_conflict IS NULL OR n.is_conflict = 0)
|
||||
AND NOT EXISTS (SELECT 1 FROM note_embeddings_meta m WHERE m.note_id = n.id)${excludeSql}
|
||||
LIMIT ?`,
|
||||
[limit],
|
||||
[...excludeIds, limit],
|
||||
);
|
||||
return rows.map(r => r.id);
|
||||
}
|
||||
|
||||
// Removes every chunk for a note from both the meta table and the vec
|
||||
// table. Used before re-indexing a changed note and during note deletion.
|
||||
//
|
||||
// The vec-table delete is guarded by both `sqliteVecAvailable()` AND the
|
||||
// table's existence — saveChunks creates the vec table lazily, so on a
|
||||
// fresh profile it may not exist yet.
|
||||
// Vec-table delete is guarded by sqliteVecAvailable() AND vecTableExists()
|
||||
// because saveChunks creates the vec table lazily.
|
||||
public static async deleteByNoteId(noteId: string) {
|
||||
const rows = await this.db().selectAll<{ id: number }>(
|
||||
'SELECT id FROM note_embeddings_meta WHERE note_id = ?',
|
||||
@@ -138,15 +125,10 @@ export default class NoteEmbedding extends BaseModel {
|
||||
return !!row;
|
||||
}
|
||||
|
||||
// Replaces every chunk for a note with a new set. The `modelId` is recorded
|
||||
// alongside each chunk so a future model change can trigger a re-index.
|
||||
//
|
||||
// Not transactional: if the process crashes mid-save the indexer will
|
||||
// later see the half-written set, treat the note as out-of-date via the
|
||||
// ItemChange cursor, and re-run saveChunks (which begins with a
|
||||
// deleteByNoteId so the partial set is cleared). A future PR can add a
|
||||
// proper transactional helper to `Database` once there are other callers
|
||||
// that need it.
|
||||
// Replaces every chunk for a note. modelId is stored per row so a future
|
||||
// model change can trigger a re-index. Not wrapped in a transaction: a
|
||||
// mid-save crash leaves a partial set, but the indexer reprocesses via
|
||||
// the ItemChange cursor and saveChunks begins by clearing the note.
|
||||
public static async saveChunks(noteId: string, modelId: string, chunks: SaveChunk[]) {
|
||||
this.requireVec();
|
||||
if (chunks.length === 0) {
|
||||
@@ -166,10 +148,9 @@ export default class NoteEmbedding extends BaseModel {
|
||||
|
||||
const now = Date.now();
|
||||
for (const chunk of chunks) {
|
||||
// Insert into meta, then read back the auto-assigned id via
|
||||
// last_insert_rowid() — node-sqlite3's exec() doesn't return it
|
||||
// directly. The driver serialises queries through a mutex so the
|
||||
// reading query runs on the same connection without races.
|
||||
// node-sqlite3's exec() doesn't return the insert id, so read it
|
||||
// back with last_insert_rowid(). The driver serialises queries on
|
||||
// the same connection so there's no race.
|
||||
await this.db().exec({
|
||||
sql: 'INSERT INTO note_embeddings_meta(note_id, chunk_index, model_id, chunk_text, created_time) VALUES (?, ?, ?, ?, ?)',
|
||||
params: [noteId, chunk.chunkIndex, modelId, chunk.chunkText, now],
|
||||
@@ -190,25 +171,17 @@ export default class NoteEmbedding extends BaseModel {
|
||||
options: SimilaritySearchOptions,
|
||||
): Promise<SimilarityResult[]> {
|
||||
this.requireVec();
|
||||
// On a fresh profile the vec table is created lazily by saveChunks(),
|
||||
// so it may not exist yet. Treat that as "no embeddings to match"
|
||||
// rather than letting the query throw "no such table".
|
||||
if (!await this.vecTableExists()) return [];
|
||||
|
||||
// An explicitly-empty noteIds means "search within zero notes" — return
|
||||
// nothing rather than silently widening the search to all notes.
|
||||
// Explicit empty noteIds = search nothing (vs undefined = search all).
|
||||
if (options.noteIds && options.noteIds.length === 0) return [];
|
||||
|
||||
const k = Math.max(1, options.k | 0);
|
||||
|
||||
// sqlite-vec's vec0 needs an inline `k = ?` constraint or a hardcoded
|
||||
// LIMIT — a parameter-bound LIMIT after a JOIN isn't visible to its
|
||||
// optimiser. When a noteIds filter is in play we over-fetch by 4× so
|
||||
// the post-join filter rarely starves the result set, then trim to k.
|
||||
// This is a heuristic — pathological cases (large global indexes where
|
||||
// most top matches fall outside the scope) can still under-fill. A
|
||||
// follow-up can switch to pre-resolving noteIds → rowids and pushing
|
||||
// the filter into the vec MATCH clause directly.
|
||||
// vec0 needs k inlined into the MATCH clause; a parameter-bound LIMIT
|
||||
// after a JOIN isn't visible to its planner. With a noteIds filter we
|
||||
// over-fetch 4× and trim after, so the post-join filter has room to
|
||||
// drop non-matches. Pathological vaults can still under-fill.
|
||||
const fetchSize = options.noteIds?.length ? k * 4 : k;
|
||||
|
||||
const whereParts: string[] = [`v.embedding MATCH ? AND k = ${fetchSize}`];
|
||||
@@ -248,10 +221,8 @@ export default class NoteEmbedding extends BaseModel {
|
||||
return results;
|
||||
}
|
||||
|
||||
// Loads the stored vectors for a note's chunks, in chunk-index order.
|
||||
// Used by the search service for noteId-based queries — re-using the
|
||||
// already-indexed vectors avoids both re-embedding and the asymmetry
|
||||
// you'd otherwise get from running them through embedQuery().
|
||||
// Loads the stored vectors for a note's chunks in chunk-index order.
|
||||
// Lets noteId-based searches reuse indexed vectors instead of re-embedding.
|
||||
public static async vectorsByNoteId(noteId: string): Promise<number[][]> {
|
||||
this.requireVec();
|
||||
if (!await this.vecTableExists()) return [];
|
||||
@@ -266,12 +237,9 @@ export default class NoteEmbedding extends BaseModel {
|
||||
return rows.map(r => JSON.parse(r.embedding) as number[]);
|
||||
}
|
||||
|
||||
// Drops every embedding from both tables. Used when the embedding model
|
||||
// changes — the existing vectors are no longer comparable to anything new.
|
||||
// Drops every embedding. Used when the active model's id changes.
|
||||
public static async clearAll() {
|
||||
const queries: string[] = ['DELETE FROM note_embeddings_meta'];
|
||||
// Vec-table delete is guarded the same way as deleteByNoteId — the
|
||||
// table is created lazily, so on a fresh profile it may not exist yet.
|
||||
if (this.joplinDb().sqliteVecAvailable() && await this.vecTableExists()) {
|
||||
queries.push('DELETE FROM note_embeddings_vec');
|
||||
}
|
||||
|
||||
@@ -807,6 +807,17 @@ const builtInMetadata = (Setting: typeof SettingType) => {
|
||||
storage: SettingStorage.Database,
|
||||
},
|
||||
|
||||
// Indexer lifecycle: false until the initial full-vault scan finishes;
|
||||
// after that the indexer only follows the ItemChange feed. Reset to
|
||||
// false when the model id changes (triggering a re-scan).
|
||||
'ai.embedding.initialScanDone': {
|
||||
value: false,
|
||||
type: SettingItemType.Bool,
|
||||
public: false,
|
||||
appTypes: [AppType.Desktop],
|
||||
storage: SettingStorage.Database,
|
||||
},
|
||||
|
||||
theme: {
|
||||
value: Setting.THEME_LIGHT,
|
||||
type: SettingItemType.Int,
|
||||
|
||||
@@ -298,6 +298,38 @@ describe('EmbeddingIndexer', () => {
|
||||
expect(await NoteEmbedding.distinctNoteIdCount()).toBe(2);
|
||||
});
|
||||
|
||||
it('skips a note that fails during the initial scan instead of looping on it', async () => {
|
||||
if (skipIfNoVec()) return;
|
||||
const folder = await Folder.save({ title: 'f' });
|
||||
const good = await Note.save({ title: 'good', body: 'this one works', parent_id: folder.id });
|
||||
const bad = await Note.save({ title: 'bad', body: 'this one will fail', parent_id: folder.id });
|
||||
await waitForChangesSince(0, 2);
|
||||
|
||||
// Provider that throws for the bad note's body, succeeds for everything else.
|
||||
let failureCalls = 0;
|
||||
const failingProvider = new (class extends TestEmbeddingProvider {
|
||||
public async embed(texts: string[]) {
|
||||
if (texts.some(t => t.includes('this one will fail'))) {
|
||||
failureCalls++;
|
||||
throw new Error('synthetic failure');
|
||||
}
|
||||
return super.embed(texts);
|
||||
}
|
||||
})();
|
||||
AiService.instance().setEmbeddingProvider(failingProvider);
|
||||
|
||||
// First tick processes both — bad fails, good succeeds.
|
||||
await EmbeddingIndexer.instance().maintenance();
|
||||
// Second tick must NOT retry the bad note (would increment failureCalls again).
|
||||
await EmbeddingIndexer.instance().maintenance();
|
||||
await EmbeddingIndexer.instance().maintenance();
|
||||
|
||||
expect(failureCalls).toBe(1);
|
||||
expect(await NoteEmbedding.countByNoteId(good.id)).toBeGreaterThan(0);
|
||||
expect(await NoteEmbedding.countByNoteId(bad.id)).toBe(0);
|
||||
expect(Setting.value('ai.embedding.initialScanDone')).toBe(true);
|
||||
});
|
||||
|
||||
it('getStatus counts indexed vs total notes and excludes trashed ones', async () => {
|
||||
if (skipIfNoVec()) return;
|
||||
const folder = await Folder.save({ title: 'f' });
|
||||
|
||||
@@ -13,32 +13,19 @@ import { EmbeddingProvider, IndexStatus } from './types';
|
||||
|
||||
const logger = Logger.create('EmbeddingIndexer');
|
||||
|
||||
// 5-minute interval matches OcrService — slow enough to avoid burning CPU on
|
||||
// every edit, fast enough that newly-saved notes are searchable within minutes.
|
||||
// Matches OcrService — slow enough to avoid burning CPU on every edit,
|
||||
// fast enough that newly-saved notes are searchable within minutes.
|
||||
const MAINTENANCE_INTERVAL = 5 * Minute;
|
||||
|
||||
// How many item_changes we process per maintenance tick. Keeps the indexer
|
||||
// responsive on huge backlogs (e.g. first run on an existing vault) by
|
||||
// committing progress after each batch rather than holding everything in
|
||||
// memory until done.
|
||||
// Caps both the per-tick change-feed drain and the initial-scan batch.
|
||||
const BATCH_SIZE = 100;
|
||||
|
||||
// Background service that watches `item_changes` for note edits, chunks the
|
||||
// note body, asks the active EmbeddingProvider to embed each chunk, and stores
|
||||
// the result via the NoteEmbedding model.
|
||||
//
|
||||
// Lifecycle is identical to OcrService: `runInBackground()` starts a timer,
|
||||
// `stopRunInBackground()` stops it. Both are idempotent.
|
||||
//
|
||||
// Progress is durable via two settings:
|
||||
// - `ai.embedding.lastProcessedChangeId` is the cursor into item_changes. We
|
||||
// resume from this on every restart, so a crashed indexer doesn't reprocess
|
||||
// notes it already handled.
|
||||
// - `ai.embedding.lastIndexedModelId` is the model that produced the current
|
||||
// vectors. If the active provider's modelId differs (e.g. the user upgraded
|
||||
// the local model, or switched to a cloud embedding provider), we wipe
|
||||
// note_embeddings and start over — vectors from different models aren't
|
||||
// comparable.
|
||||
// Background service that watches `item_changes`, chunks each modified note,
|
||||
// embeds the chunks via the active EmbeddingProvider, and stores them in
|
||||
// NoteEmbedding. Progress is durable via three settings: a cursor into
|
||||
// item_changes, the modelId that produced the current vectors (a mismatch
|
||||
// triggers a clear-and-rebuild — vectors from different models aren't
|
||||
// comparable), and a flag marking the initial full-vault scan as done.
|
||||
|
||||
export default class EmbeddingIndexer {
|
||||
|
||||
@@ -52,15 +39,18 @@ export default class EmbeddingIndexer {
|
||||
private maintenanceTimer_: ReturnType<typeof shim.setInterval> = null;
|
||||
private isRunningInBackground_ = false;
|
||||
private maintenanceRunning_ = false;
|
||||
// Notes that threw during this session's initial scan. Skipped for the
|
||||
// remainder of the session so the scan can complete; reset on process
|
||||
// restart so a fix to the underlying issue gets a fresh chance.
|
||||
private initialScanFailures_ = new Set<string>();
|
||||
|
||||
public async runInBackground() {
|
||||
if (this.isRunningInBackground_) return;
|
||||
this.isRunningInBackground_ = true;
|
||||
|
||||
logger.info('Starting background indexer');
|
||||
// Kick the first maintenance off without awaiting — model load + first
|
||||
// embed batch can take 5-15s and we don't want to block app startup on
|
||||
// it. The timer below picks up subsequent runs.
|
||||
// Fire-and-forget the first tick: model load + first embed batch can
|
||||
// be 5-15s and would otherwise block app startup.
|
||||
void this.maintenance();
|
||||
|
||||
this.maintenanceTimer_ = shim.setInterval(async () => {
|
||||
@@ -76,19 +66,17 @@ export default class EmbeddingIndexer {
|
||||
this.isRunningInBackground_ = false;
|
||||
}
|
||||
|
||||
// Snapshot of model + indexer state for the settings UI and the
|
||||
// joplin.ai.indexStatus() plugin API. Cheap to call (two COUNT(*) and one
|
||||
// provider state probe) so callers can poll on a UI tick.
|
||||
// Snapshot of indexer + model state for the settings UI. Cheap enough to
|
||||
// poll on a UI tick (two COUNTs + a provider probe).
|
||||
public async getStatus(): Promise<IndexStatus> {
|
||||
const provider = AiService.instance().getActiveEmbeddingProvider();
|
||||
|
||||
let modelDownloadStatus: IndexStatus['modelDownloadStatus'] = 'unavailable';
|
||||
if (provider) {
|
||||
// Providers without a downloadable artefact (remote, test stub)
|
||||
// surface as 'downloaded' so the UI doesn't need a special case.
|
||||
modelDownloadStatus = provider.modelDownloadStatus
|
||||
? await provider.modelDownloadStatus()
|
||||
// Remote / test providers without a downloadable artefact are
|
||||
// always "ready" — surfacing 'downloaded' lets the UI treat
|
||||
// them uniformly without a special "n/a" branch.
|
||||
: 'downloaded';
|
||||
}
|
||||
|
||||
@@ -103,41 +91,33 @@ export default class EmbeddingIndexer {
|
||||
indexerState = 'idle';
|
||||
}
|
||||
|
||||
// Both counts exclude trashed and conflict notes so the "indexed N/M"
|
||||
// ratio is interpretable: M is the universe the indexer cares about,
|
||||
// not the entire notes table.
|
||||
// Both counts exclude trashed/conflict notes so the displayed ratio
|
||||
// matches the indexer's universe.
|
||||
const notesIndexed = await NoteEmbedding.distinctNoteIdCount();
|
||||
const totalNotes = await Note.indexableCount();
|
||||
|
||||
return { modelDownloadStatus, indexerState, notesIndexed, totalNotes };
|
||||
}
|
||||
|
||||
// Single maintenance tick. Public so tests can drive the indexer without
|
||||
// waiting for a real timer fire.
|
||||
// Single maintenance tick. Public so tests can drive it without waiting
|
||||
// for the timer.
|
||||
public async maintenance() {
|
||||
if (this.maintenanceRunning_) {
|
||||
// Don't queue concurrent maintenance runs; if the previous one is
|
||||
// still going (e.g. a huge initial backlog), the next tick will
|
||||
// pick up where it left off.
|
||||
logger.info('Skipping maintenance — previous run still in flight');
|
||||
return;
|
||||
}
|
||||
if (this.maintenanceRunning_) return;
|
||||
this.maintenanceRunning_ = true;
|
||||
try {
|
||||
const provider = AiService.instance().getActiveEmbeddingProvider();
|
||||
if (!provider) {
|
||||
logger.info('No embedding provider configured — skipping');
|
||||
return;
|
||||
}
|
||||
if (!provider) return;
|
||||
|
||||
await this.handleModelChange(provider);
|
||||
// Drain the change feed first so a recent edit isn't starved by a
|
||||
// large backfill. Then top up the same batch slot with any
|
||||
// not-yet-indexed notes from before the cursor (existing notes on
|
||||
// first enable, or notes that pre-date the embeddings feature).
|
||||
const processed = await this.processChangeBatch(provider);
|
||||
if (processed < BATCH_SIZE) {
|
||||
await this.processBackfillBatch(provider, BATCH_SIZE - processed);
|
||||
// Until the initial scan completes, walk the whole vault one batch
|
||||
// per tick. Then switch to change-feed-only mode. The tick that
|
||||
// finishes the scan still runs the change feed so edits made
|
||||
// during the scan don't wait an extra interval.
|
||||
if (!Setting.value('ai.embedding.initialScanDone')) {
|
||||
await this.runInitialScanBatch(provider);
|
||||
}
|
||||
if (Setting.value('ai.embedding.initialScanDone')) {
|
||||
await this.processChangeBatch(provider);
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error('Maintenance run failed:', error);
|
||||
@@ -146,9 +126,8 @@ export default class EmbeddingIndexer {
|
||||
}
|
||||
}
|
||||
|
||||
// If the active provider's modelId doesn't match what's stored for the
|
||||
// existing vectors, clear everything and reset the cursor. The next
|
||||
// maintenance tick will rebuild from scratch.
|
||||
// Wipe-and-rebuild when the active provider's modelId changes — vectors
|
||||
// from different models aren't comparable.
|
||||
private async handleModelChange(provider: EmbeddingProvider) {
|
||||
const lastModelId = Setting.value('ai.embedding.lastIndexedModelId') as string;
|
||||
if (lastModelId === provider.modelId) return;
|
||||
@@ -157,18 +136,17 @@ export default class EmbeddingIndexer {
|
||||
await NoteEmbedding.clearAll();
|
||||
Setting.setValue('ai.embedding.lastProcessedChangeId', 0);
|
||||
Setting.setValue('ai.embedding.lastIndexedModelId', provider.modelId);
|
||||
Setting.setValue('ai.embedding.initialScanDone', false);
|
||||
this.initialScanFailures_.clear();
|
||||
}
|
||||
|
||||
// Returns the number of notes acted on so the caller can decide how much
|
||||
// backfill room is left in this tick.
|
||||
private async processChangeBatch(provider: EmbeddingProvider): Promise<number> {
|
||||
private async processChangeBatch(provider: EmbeddingProvider): Promise<void> {
|
||||
const cursor = Setting.value('ai.embedding.lastProcessedChangeId') as number;
|
||||
const changes = await ItemChange.changesSinceId(cursor, { limit: BATCH_SIZE });
|
||||
if (!changes.length) return 0;
|
||||
if (!changes.length) return;
|
||||
|
||||
// Collapse duplicates so we only embed each note once per batch even
|
||||
// when there are multiple updates queued for it. Process deletes last
|
||||
// so a delete-then-create within the batch lands in the right order.
|
||||
// Collapse duplicates so a note edited multiple times only gets
|
||||
// embedded once per tick.
|
||||
const latestPerNote = new Map<string, ItemChangeEntity>();
|
||||
for (const change of changes) {
|
||||
if (change.item_type !== ModelType.Note) continue;
|
||||
@@ -183,55 +161,53 @@ export default class EmbeddingIndexer {
|
||||
await this.indexNote(noteId, provider);
|
||||
}
|
||||
} catch (error) {
|
||||
// Don't let one bad note stop the whole batch. The cursor will
|
||||
// advance past it; if the user fixes the underlying issue
|
||||
// they can trigger a full re-index later.
|
||||
logger.warn(`Failed to index note ${noteId}:`, error);
|
||||
}
|
||||
}
|
||||
|
||||
// Advance the cursor to the highest change ID we just looked at. If
|
||||
// the indexer crashes mid-batch the cursor stays at its previous
|
||||
// value, so the next run reprocesses the partially-applied notes
|
||||
// (idempotently — saveChunks deletes existing chunks first).
|
||||
// Advance the cursor only after the batch finishes. A mid-batch crash
|
||||
// reprocesses everything from the previous cursor (saveChunks is
|
||||
// idempotent — it deletes existing chunks first).
|
||||
const highestId = changes[changes.length - 1].id;
|
||||
Setting.setValue('ai.embedding.lastProcessedChangeId', highestId);
|
||||
|
||||
return latestPerNote.size;
|
||||
}
|
||||
|
||||
// Picks up any indexable notes that haven't been embedded yet — typically
|
||||
// the existing notes on a fresh AI enable, or notes that pre-date this
|
||||
// feature. Bounded by the slot left over from the change-feed batch so
|
||||
// each tick stays predictable in cost.
|
||||
private async processBackfillBatch(provider: EmbeddingProvider, limit: number): Promise<void> {
|
||||
if (limit <= 0) return;
|
||||
// Walks the full notes table on first enable (or after a model swap).
|
||||
// One batch per tick so the indexer stays responsive on big vaults.
|
||||
// Per-note failures are remembered in-memory only — they get a fresh
|
||||
// attempt on the next session, which is when a fix to the underlying
|
||||
// issue would have been deployed.
|
||||
private async runInitialScanBatch(provider: EmbeddingProvider): Promise<void> {
|
||||
// Snap the change-feed cursor at the start of the scan, not the end —
|
||||
// edits and deletes that happen during the scan are then picked up
|
||||
// normally by the change feed instead of being lost.
|
||||
if ((Setting.value('ai.embedding.lastProcessedChangeId') as number) === 0) {
|
||||
Setting.setValue('ai.embedding.lastProcessedChangeId', await ItemChange.lastChangeId());
|
||||
}
|
||||
|
||||
const noteIds = await NoteEmbedding.notYetIndexedNoteIds(limit);
|
||||
if (!noteIds.length) return;
|
||||
const excluded = Array.from(this.initialScanFailures_);
|
||||
const noteIds = await NoteEmbedding.notYetIndexedNoteIds(BATCH_SIZE, excluded);
|
||||
|
||||
logger.info(`Backfill: indexing ${noteIds.length} note(s) not yet in the embeddings index`);
|
||||
if (!noteIds.length) {
|
||||
Setting.setValue('ai.embedding.initialScanDone', true);
|
||||
logger.info(`Initial scan complete (${this.initialScanFailures_.size} note(s) skipped after errors)`);
|
||||
return;
|
||||
}
|
||||
|
||||
logger.info(`Initial scan: indexing ${noteIds.length} note(s)`);
|
||||
for (const noteId of noteIds) {
|
||||
try {
|
||||
await this.indexNote(noteId, provider);
|
||||
} catch (error) {
|
||||
logger.warn(`Backfill failed for note ${noteId}:`, error);
|
||||
this.initialScanFailures_.add(noteId);
|
||||
logger.warn(`Initial scan failed for note ${noteId} (will retry next session):`, error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private async indexNote(noteId: string, provider: EmbeddingProvider) {
|
||||
const note = await Note.load(noteId) as NoteEntity | null;
|
||||
if (!note) {
|
||||
// Note may have been deleted between the change being recorded
|
||||
// and us getting around to it.
|
||||
await NoteEmbedding.deleteByNoteId(noteId);
|
||||
return;
|
||||
}
|
||||
|
||||
// Skip notes that retrieval should never return — keeps the index
|
||||
// smaller and avoids surprising search results.
|
||||
if (note.is_conflict || (note.deleted_time && note.deleted_time > 0)) {
|
||||
if (!note || note.is_conflict || (note.deleted_time && note.deleted_time > 0)) {
|
||||
await NoteEmbedding.deleteByNoteId(noteId);
|
||||
return;
|
||||
}
|
||||
@@ -239,26 +215,15 @@ export default class EmbeddingIndexer {
|
||||
const body = (note.body ?? '').trim();
|
||||
const title = (note.title ?? '').trim();
|
||||
if (!body && !title) {
|
||||
// Notes with neither a title nor a body have no meaningful signal
|
||||
// to embed. Make sure any stale rows from a previous edit are
|
||||
// cleaned up.
|
||||
await NoteEmbedding.deleteByNoteId(noteId);
|
||||
return;
|
||||
}
|
||||
|
||||
const chunks = chunkText(body);
|
||||
|
||||
// Inject the title into the first chunk twice. The title is often the
|
||||
// densest semantic signal a note carries — e.g. "Pet sitters for my
|
||||
// dog" with a body that's just an attachment reference. Without this,
|
||||
// searching for "dog walker" would never find that note because the
|
||||
// body has no relevant content.
|
||||
//
|
||||
// Why double, and why only chunk 0?
|
||||
// - Doubling boosts the title's weight in the chunk's embedding so
|
||||
// title-anchored queries pull harder on this note.
|
||||
// - Chunk 0 is the natural place to put it because it's also where
|
||||
// the body opening lives, which usually flows from the title.
|
||||
// Title is often the densest semantic signal (e.g. "Pet sitters for
|
||||
// my dog" with a body that's just an attachment link). Doubling it
|
||||
// into chunk 0 boosts its weight so title-anchored queries hit.
|
||||
if (title) {
|
||||
if (chunks.length === 0) {
|
||||
chunks.push(title);
|
||||
@@ -285,11 +250,4 @@ export default class EmbeddingIndexer {
|
||||
|
||||
await NoteEmbedding.saveChunks(noteId, provider.modelId, payload);
|
||||
}
|
||||
|
||||
// Reset state — exposed for tests and for a future "re-index all" button.
|
||||
public async clearProgress() {
|
||||
await NoteEmbedding.clearAll();
|
||||
Setting.setValue('ai.embedding.lastProcessedChangeId', 0);
|
||||
Setting.setValue('ai.embedding.lastIndexedModelId', '');
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,34 +4,15 @@ import shim from '../../shim';
|
||||
|
||||
const logger = Logger.create('EmbeddingModelDownloader');
|
||||
|
||||
// Handles downloading the local embedding model on first use and caching it
|
||||
// under the user's profile. The model itself is too large to ship with the
|
||||
// desktop installer (~140 MB), so we download it lazily the first time the
|
||||
// user enables AI.
|
||||
//
|
||||
// Cache layout (under `${cacheDir}/ai/embedding-models/`):
|
||||
//
|
||||
// multilingual-e5-small/
|
||||
// config.json
|
||||
// model_quantized.onnx
|
||||
// sentencepiece.bpe.model
|
||||
// special_tokens_map.json
|
||||
// tokenizer.json
|
||||
// tokenizer_config.json
|
||||
//
|
||||
// The model directory is created by extracting the tarball published as a
|
||||
// release asset in https://github.com/joplinapp/embedding-models. Each release
|
||||
// uses the model name as both the tag and the archive filename, so the
|
||||
// download URL is fully determined by the model id.
|
||||
// Downloads the local embedding model on first use and caches it under
|
||||
// ${cacheDir}/ai/embedding-models/<archiveName>/. The model is too large
|
||||
// (~140 MB) to ship with the installer.
|
||||
|
||||
export interface ModelDescriptor {
|
||||
// Stable identifier the indexer stores in note_embeddings_meta.model_id.
|
||||
// When this changes, EmbeddingIndexer.handleModelChange() wipes the index.
|
||||
// Stored alongside each chunk; changing it triggers a full re-index.
|
||||
id: string;
|
||||
// File name (without `.tar.gz`) that matches the GitHub release tag AND
|
||||
// the top-level directory inside the archive.
|
||||
// Base name of the tarball and the cache subdir.
|
||||
archiveName: string;
|
||||
// Public URL the tarball is downloaded from.
|
||||
downloadUrl: string;
|
||||
}
|
||||
|
||||
@@ -42,10 +23,8 @@ export const MULTILINGUAL_E5_SMALL: ModelDescriptor = {
|
||||
};
|
||||
|
||||
export interface DownloadProgress {
|
||||
// Bytes received so far. May be partial if the server doesn't advertise
|
||||
// Content-Length (rare with GitHub Releases, but defensive).
|
||||
bytesDownloaded: number;
|
||||
// Total bytes if known, else null. Use to compute a percentage when set.
|
||||
// Currently always null — shim.fetchBlob doesn't surface Content-Length.
|
||||
totalBytes: number | null;
|
||||
}
|
||||
|
||||
@@ -63,27 +42,17 @@ const archivePath = (model: ModelDescriptor) =>
|
||||
const modelDir = (model: ModelDescriptor) =>
|
||||
`${baseCacheDir()}/${model.archiveName}`;
|
||||
|
||||
// Returns the path to the local model directory if it's present and looks
|
||||
// usable (the marker file we extract is there). Returns null otherwise.
|
||||
// Returns the model dir if config.json (our extraction marker) is present.
|
||||
export const localModelPath = async (model: ModelDescriptor): Promise<string | null> => {
|
||||
const dir = modelDir(model);
|
||||
// A successful extract always leaves config.json behind. That's our cheap
|
||||
// "is the model cached?" check — no need to validate every file.
|
||||
const marker = `${dir}/config.json`;
|
||||
const exists = await shim.fsDriver().exists(marker);
|
||||
return exists ? dir : null;
|
||||
};
|
||||
|
||||
// Tracks in-flight downloads per model so concurrent callers share a single
|
||||
// download instead of racing on the same tarball + extract directory. Cleared
|
||||
// once the work either resolves or rejects so a later call after a failure can
|
||||
// retry from scratch.
|
||||
// Single-flight per model id so concurrent callers share one download.
|
||||
const inFlight: Map<string, Promise<string>> = new Map();
|
||||
|
||||
// Downloads, verifies, and extracts the model if it isn't already on disk.
|
||||
// Safe to call repeatedly and from concurrent callers: cache hot → returns
|
||||
// immediately; cache cold → first caller does the work and the rest await
|
||||
// the same promise.
|
||||
export const ensureModelDownloaded = async (
|
||||
model: ModelDescriptor,
|
||||
options: EnsureOptions = {},
|
||||
@@ -94,7 +63,7 @@ export const ensureModelDownloaded = async (
|
||||
const pending = inFlight.get(model.id);
|
||||
if (pending) return pending;
|
||||
|
||||
// eslint-disable-next-line promise/prefer-await-to-then -- .finally is the natural fit here: we need cleanup to run on both resolve and reject, without inverting the caller-facing await chain
|
||||
// eslint-disable-next-line promise/prefer-await-to-then -- .finally cleans up on both resolve and reject without inverting the caller's await chain
|
||||
const work = runDownload(model, options).finally(() => {
|
||||
inFlight.delete(model.id);
|
||||
});
|
||||
@@ -113,34 +82,28 @@ const runDownload = async (
|
||||
const tarPath = archivePath(model);
|
||||
const targetDir = modelDir(model);
|
||||
|
||||
// Wipe any stale half-downloaded tarball or partially-extracted directory
|
||||
// left over from a previous failed attempt.
|
||||
// Wipe any stale partial state from a previous failed attempt.
|
||||
if (await fsDriver.exists(tarPath)) await fsDriver.remove(tarPath);
|
||||
if (await fsDriver.exists(targetDir)) await fsDriver.remove(targetDir);
|
||||
|
||||
logger.info(`Downloading embedding model from ${model.downloadUrl}`);
|
||||
await downloadWithProgress(model.downloadUrl, tarPath, options.onProgress);
|
||||
|
||||
// The published tarball stores files at the archive root (no top-level
|
||||
// directory) so we have to create the target dir ourselves and extract
|
||||
// into it. If we extracted into cacheDir the model files would spill into
|
||||
// the cache root and clobber any sibling model.
|
||||
// Files in the tarball sit at the archive root (no wrapping dir), so we
|
||||
// create the target ourselves to avoid spilling into the cache root.
|
||||
logger.info(`Extracting embedding model into ${targetDir}`);
|
||||
await fsDriver.mkdir(targetDir);
|
||||
await fsDriver.tarExtract({ file: tarPath, cwd: targetDir });
|
||||
|
||||
// Sanity check: after extraction the archive directory should exist with
|
||||
// the marker file in place. If not, the tarball was malformed.
|
||||
const verified = await localModelPath(model);
|
||||
if (!verified) {
|
||||
throw new Error(`Embedding model archive did not extract as expected (missing ${targetDir}/config.json)`);
|
||||
}
|
||||
|
||||
// Tarball is no longer needed once extracted.
|
||||
try {
|
||||
await fsDriver.remove(tarPath);
|
||||
} catch (error) {
|
||||
// Non-fatal — the next download attempt will clean it up.
|
||||
// Non-fatal — next download attempt will clean it up.
|
||||
logger.warn(`Failed to delete model archive at ${tarPath}: ${error.message ?? error}`);
|
||||
}
|
||||
|
||||
@@ -152,17 +115,9 @@ const downloadWithProgress = async (
|
||||
destPath: string,
|
||||
onProgress?: ProgressCallback,
|
||||
): Promise<void> => {
|
||||
// shim.fetchBlob streams the body straight to disk and handles redirects
|
||||
// (GitHub Releases redirects to S3, so following redirects is mandatory).
|
||||
// We pass a minimal downloadController so the underlying node http code
|
||||
// hands us each chunk as it arrives — that's our progress signal.
|
||||
//
|
||||
// The timeout is per-socket-idle (not total), so a 60s value means "fail
|
||||
// after 60s of silence" — fine for a multi-minute download as long as
|
||||
// data keeps flowing. Without it, shim.fetchBlob defaults to no timeout
|
||||
// and a stalled connection (captive portal, dropped TCP traffic, hung
|
||||
// proxy) would block forever and freeze every concurrent caller waiting
|
||||
// on the shared in-flight promise.
|
||||
// The 60s timeout is per-socket-idle, not total — fail after silence, fine while
|
||||
// data keeps flowing. Without it a stalled connection would block every
|
||||
// concurrent caller forever via the shared in-flight promise.
|
||||
const downloadController = onProgress ? makeProgressController(onProgress) : undefined;
|
||||
const response = await shim.fetchBlob(url, {
|
||||
path: destPath,
|
||||
@@ -177,13 +132,10 @@ const downloadWithProgress = async (
|
||||
}
|
||||
};
|
||||
|
||||
// Minimal downloadController shape that conforms to the contract exercised by
|
||||
// shim-init-node's fetchBlob (only `handleChunk(request)` is called). We don't
|
||||
// care about the size-limit fields the real LimitedDownloadController uses —
|
||||
// they're set internally by callers that need them.
|
||||
// Minimal shape that matches what shim-init-node's fetchBlob actually calls
|
||||
// on a downloadController — only handleChunk is exercised here.
|
||||
const makeProgressController = (onProgress: ProgressCallback) => {
|
||||
let bytesDownloaded = 0;
|
||||
const totalBytes: number | null = null;
|
||||
return {
|
||||
totalBytes: 0,
|
||||
imagesCount: 0,
|
||||
@@ -191,16 +143,16 @@ const makeProgressController = (onProgress: ProgressCallback) => {
|
||||
maxImagesCount: 0,
|
||||
printStats: () => { /* no-op */ },
|
||||
limitMessage: () => '',
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- DownloadChunk type isn't re-exported; we read only .length
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- DownloadChunk type isn't re-exported; we only read .length
|
||||
handleChunk: (_request: unknown) => (chunk: any) => {
|
||||
bytesDownloaded += chunk.length ?? 0;
|
||||
onProgress({ bytesDownloaded, totalBytes });
|
||||
onProgress({ bytesDownloaded, totalBytes: null });
|
||||
},
|
||||
};
|
||||
};
|
||||
|
||||
// Clears the cached model. Used for the future "re-download" path and to
|
||||
// reclaim disk space when AI is disabled and the user wants a clean slate.
|
||||
// Wipes the cached model. Public for use by a future "re-download" action;
|
||||
// also covered by tests as the cleanup primitive.
|
||||
export const removeCachedModel = async (model: ModelDescriptor): Promise<void> => {
|
||||
const fsDriver = shim.fsDriver();
|
||||
const dir = modelDir(model);
|
||||
|
||||
@@ -12,25 +12,15 @@ import {
|
||||
const logger = Logger.create('LocalEmbeddingProvider');
|
||||
|
||||
// Runs the bundled multilingual-e5-small model locally via onnxruntime-node.
|
||||
// Tokenization is delegated to @xenova/transformers (AutoTokenizer); inference
|
||||
// runs through shim.onnxRuntime() so non-desktop builds (mobile, cli) that
|
||||
// haven't wired ONNX in degrade cleanly instead of crashing on require().
|
||||
//
|
||||
// e5 models expect inputs prefixed with "passage: " (for documents being
|
||||
// indexed) or "query: " (for search queries). For v1 we always use "passage:"
|
||||
// because this provider only feeds the indexer. When PR D adds search,
|
||||
// callers will provide pre-prefixed text or we'll add an `embedQuery()`
|
||||
// method — whichever fits the API best.
|
||||
// Tokenization is delegated to @xenova/transformers; inference runs through
|
||||
// shim.onnxRuntime() so non-desktop builds without ONNX wired in degrade
|
||||
// cleanly instead of crashing on require().
|
||||
|
||||
// Cap ONNX thread count so background indexing doesn't peg every core on the
|
||||
// user's machine. 2 threads is a good compromise: noticeably faster than 1
|
||||
// without making the laptop fans spin during a re-index.
|
||||
// Capped to keep background indexing from pegging every core.
|
||||
const INTRA_OP_NUM_THREADS = 2;
|
||||
|
||||
// e5-small returns 384-dim vectors. Hard-coded because the sqlite-vec table
|
||||
// dimension is fixed at first creation and we need to match it. If we ever
|
||||
// switch model, modelId changes too and the indexer rebuilds from scratch
|
||||
// against the new dimension.
|
||||
// Fixed at the model's known output size — the sqlite-vec table dimension
|
||||
// is set on first create() and must match.
|
||||
const E5_SMALL_DIMENSION = 384;
|
||||
|
||||
interface OnnxSession {
|
||||
@@ -83,10 +73,8 @@ export default class LocalEmbeddingProvider implements EmbeddingProvider {
|
||||
private initPromise_: Promise<void> | null = null;
|
||||
private session_: OnnxSession | null = null;
|
||||
private tokenizer_: Tokenizer | null = null;
|
||||
// `downloading` is set while we're inside ensureModelDownloaded(), so the
|
||||
// UI can show progress without polling the file system mid-download.
|
||||
// 'downloaded' once we've seen the marker on disk; reverts to whatever
|
||||
// the on-disk check returns if the cache is later wiped.
|
||||
// Set to 'downloading' while we're inside ensureModelDownloaded() so the status panel can
|
||||
// distinguish "downloading" from "not started" without polling disk.
|
||||
private downloadStatus_: ProviderModelDownloadStatus | null = null;
|
||||
|
||||
public constructor(options: LocalEmbeddingProviderOptions = {}) {
|
||||
@@ -109,9 +97,8 @@ export default class LocalEmbeddingProvider implements EmbeddingProvider {
|
||||
|
||||
await this.ensureInitialised();
|
||||
|
||||
// e5 is trained with asymmetric prefixes — "passage: " for indexed
|
||||
// documents, "query: " for search inputs. Mixing them up doesn't
|
||||
// crash but does measurably hurt retrieval quality.
|
||||
// e5 is trained with asymmetric prefixes — "passage: " for documents,
|
||||
// "query: " for searches. Mixing them up measurably hurts retrieval.
|
||||
const prefixed = texts.map(t => `${prefix}${t}`);
|
||||
|
||||
const tokenized = this.tokenizer_!(prefixed, {
|
||||
@@ -128,11 +115,9 @@ export default class LocalEmbeddingProvider implements EmbeddingProvider {
|
||||
input_ids: inputIds,
|
||||
attention_mask: attentionMask,
|
||||
};
|
||||
// e5 was exported expecting token_type_ids as an input even though
|
||||
// XLM-RoBERTa only ever uses a single segment. transformers.js
|
||||
// doesn't emit it for XLM-R, so we synthesise a zero tensor with the
|
||||
// same shape — that's what the model would see for any single-segment
|
||||
// input anyway.
|
||||
// e5's ONNX export declares token_type_ids as a required input even
|
||||
// though XLM-RoBERTa is single-segment. transformers.js doesn't emit
|
||||
// it, so we synthesise a zero tensor when the session asks for one.
|
||||
if (this.session_!.inputNames?.includes('token_type_ids')) {
|
||||
const ttiData = tokenized.token_type_ids?.data
|
||||
?? new BigInt64Array(tokenized.input_ids.data.length);
|
||||
@@ -151,9 +136,8 @@ export default class LocalEmbeddingProvider implements EmbeddingProvider {
|
||||
|
||||
private async ensureInitialised(): Promise<void> {
|
||||
if (this.session_ && this.tokenizer_) return;
|
||||
// Clear initPromise_ on rejection so a transient failure (e.g. a
|
||||
// dropped download) can be retried on the next call instead of every
|
||||
// future call inheriting the same rejected promise.
|
||||
// Drop the cached promise on rejection so a transient failure can be
|
||||
// retried — otherwise every future caller inherits the failed promise.
|
||||
// eslint-disable-next-line promise/prefer-await-to-then -- await would need a wrapper to keep the single-flight cache shape
|
||||
this.initPromise_ ??= this.initialise().catch(error => {
|
||||
this.initPromise_ = null;
|
||||
@@ -168,8 +152,8 @@ export default class LocalEmbeddingProvider implements EmbeddingProvider {
|
||||
throw new Error('ONNX runtime is not available. Local embeddings require the desktop build.');
|
||||
}
|
||||
|
||||
// Full-override path used by unit tests: skip disk + transformers.js
|
||||
// entirely so the test runner doesn't need a real model directory.
|
||||
// Test seam: with both overrides supplied, skip the disk + ESM-import
|
||||
// paths entirely.
|
||||
if (this.overrides_?.onnxRuntime && this.overrides_?.tokenizer) {
|
||||
this.session_ = await ort.InferenceSession.create('', {});
|
||||
this.tokenizer_ = this.overrides_.tokenizer;
|
||||
@@ -196,20 +180,15 @@ export default class LocalEmbeddingProvider implements EmbeddingProvider {
|
||||
this.downloadStatus_ = 'downloaded';
|
||||
return dir;
|
||||
} catch (error) {
|
||||
// On failure, drop the in-memory state so the next probe reflects
|
||||
// the real on-disk situation (cache may be empty, or a stale
|
||||
// partial may have been wiped by runDownload's cleanup).
|
||||
this.downloadStatus_ = null;
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
public async modelDownloadStatus(): Promise<ProviderModelDownloadStatus> {
|
||||
// In-flight download wins: the file system check would say "not
|
||||
// started" until the tarball lands and gets extracted.
|
||||
// In-flight download wins — the on-disk check still says "not started"
|
||||
// while the tarball is being fetched.
|
||||
if (this.downloadStatus_ === 'downloading') return 'downloading';
|
||||
// On-disk marker is the source of truth for the steady state, so we
|
||||
// notice an externally-wiped cache (manual rm or removeCachedModel).
|
||||
const dir = await localModelPath(this.model_);
|
||||
if (dir) {
|
||||
this.downloadStatus_ = 'downloaded';
|
||||
@@ -224,21 +203,16 @@ export default class LocalEmbeddingProvider implements EmbeddingProvider {
|
||||
|
||||
private async loadTokenizer(modelDir: string): Promise<Tokenizer> {
|
||||
if (this.overrides_?.tokenizer) return this.overrides_.tokenizer;
|
||||
// transformers.js is an ESM-only package, so we can't `require()` it
|
||||
// from this CommonJS module. A plain `await import()` is what we want
|
||||
// but TypeScript rewrites it to `require()` under `module: commonjs`.
|
||||
// `new Function('import(...)')` would also work but the renderer's CSP
|
||||
// forbids unsafe-eval. So we hide the dynamic import in a sibling .js
|
||||
// file that TypeScript can't see (and so can't rewrite).
|
||||
// eslint-disable-next-line @typescript-eslint/no-require-imports -- the .js helper preserves a native ESM import past TS lowering
|
||||
// See dynamicEsmImport.js for why this isn't a plain `await import()`.
|
||||
// eslint-disable-next-line @typescript-eslint/no-require-imports -- see dynamicEsmImport.js
|
||||
const dynamicImport = require('./dynamicEsmImport') as (s: string)=> Promise<{
|
||||
env: { localModelPath: string; allowRemoteModels: boolean };
|
||||
AutoTokenizer: { from_pretrained: (name: string)=> Promise<Tokenizer> };
|
||||
}>;
|
||||
const transformers = await dynamicImport('@xenova/transformers');
|
||||
// Point transformers.js at the parent dir of the extracted model
|
||||
// (so model_id `multilingual-e5-small` resolves to `${modelDir}`),
|
||||
// and disable network fetches so we never silently hit huggingface.co.
|
||||
// transformers.js resolves model_id against env.localModelPath, so we
|
||||
// point it at the parent and pass the model dir name as the id.
|
||||
// allowRemoteModels=false stops it falling back to huggingface.co.
|
||||
transformers.env.localModelPath = `${modelDir}/..`;
|
||||
transformers.env.allowRemoteModels = false;
|
||||
const tokenizer = await transformers.AutoTokenizer.from_pretrained(this.model_.archiveName);
|
||||
|
||||
@@ -10,34 +10,26 @@ export type { SearchOptions, SearchQuery, SearchRelevance, SearchResult, SearchS
|
||||
|
||||
const logger = Logger.create('SearchService');
|
||||
|
||||
// Semantic search over the local embedding index. Used by the plugin API
|
||||
// (joplin.ai.search) and, eventually, by core features that want to query
|
||||
// the vector index without going through the chat layer.
|
||||
//
|
||||
// The "relevance" preset is the public contract: it maps to model-specific
|
||||
// (k, minScore) values. Plugins target the preset; we own the mapping. When
|
||||
// the bundled model changes, we re-tune the table and plugins keep working.
|
||||
// Semantic search over the local embedding index.
|
||||
// The "relevance" preset is the plugin-facing contract; we own the mapping
|
||||
// to model-specific (k, minScore) so plugins survive model changes.
|
||||
|
||||
interface RelevanceTuning {
|
||||
k: number;
|
||||
minScore: number;
|
||||
}
|
||||
|
||||
// Defaults from the spec, calibrated for multilingual-e5-small. When more
|
||||
// models are supported, this becomes a per-model map keyed by modelId.
|
||||
// Tuned for multilingual-e5-small. Becomes a per-model map when we add more.
|
||||
const RELEVANCE_DEFAULTS: Record<SearchRelevance, RelevanceTuning> = {
|
||||
strict: { k: 5, minScore: 0.55 },
|
||||
normal: { k: 10, minScore: 0.40 },
|
||||
loose: { k: 20, minScore: 0.25 },
|
||||
};
|
||||
|
||||
// vec0 returns L2 distance. Our vectors are L2-normalised, so cosine
|
||||
// similarity = 1 − d²/2 exactly. Clamp to handle float drift on self-matches
|
||||
// and opposite-vector edges.
|
||||
const cosineFromDistance = (distance: number): number => {
|
||||
// vec0 stores its distance as L2 (Euclidean) by default. The vectors we
|
||||
// index are L2-normalised, so the exact relation L2² = 2·(1 − cosine)
|
||||
// holds and we recover cosine similarity as 1 − d²/2. Clamp to [0, 1]
|
||||
// so floating-point slop on perfect-match self-queries doesn't surface
|
||||
// negatives or values above 1 — and so an opposing-vector edge case
|
||||
// (cosine = −1) maps to 0 rather than a negative score.
|
||||
const score = 1 - (distance * distance) / 2;
|
||||
if (score < 0) return 0;
|
||||
if (score > 1) return 1;
|
||||
@@ -66,14 +58,11 @@ export default class SearchService {
|
||||
if (!queryVectors.length) return [];
|
||||
|
||||
const noteIds = await this.resolveScope(options.scope);
|
||||
// Scope resolved to an explicit empty list (e.g. tag with no notes).
|
||||
// similaritySearch treats an empty noteIds as "search within nothing"
|
||||
// — return early without hitting the vec table.
|
||||
// Empty scope = search nothing (e.g. tag with no notes).
|
||||
if (noteIds && noteIds.length === 0) return [];
|
||||
|
||||
// Merge results across multiple query vectors (the `{ noteId }` query
|
||||
// produces one vector per chunk). Per (noteId, chunkIndex) we keep the
|
||||
// best score seen, then sort and trim to k.
|
||||
// noteId queries produce one vector per chunk; merge by (note, chunk),
|
||||
// keeping the highest score seen.
|
||||
const best = new Map<string, SearchResult>();
|
||||
for (const queryVector of queryVectors) {
|
||||
const hits = await NoteEmbedding.similaritySearch(queryVector, {
|
||||
@@ -107,15 +96,14 @@ export default class SearchService {
|
||||
): Promise<number[][]> {
|
||||
if ('text' in query) {
|
||||
if (!query.text.trim()) return [];
|
||||
// Use the query-side encoding when the provider exposes one (e5
|
||||
// and friends). Otherwise fall back to the symmetric path.
|
||||
// Asymmetric providers (e5) get better retrieval with embedQuery;
|
||||
// symmetric ones fall back to embed.
|
||||
const embedQuery = provider.embedQuery?.bind(provider) ?? provider.embed.bind(provider);
|
||||
return embedQuery([query.text]);
|
||||
}
|
||||
|
||||
// noteId query: reuse the note's already-indexed chunks as the query
|
||||
// vector(s). Avoids re-embedding (cheap, and matches what the indexer
|
||||
// stored — so the math is symmetric).
|
||||
// Reuse stored vectors so the math stays symmetric and we avoid a
|
||||
// re-embed pass.
|
||||
const vectors = await NoteEmbedding.vectorsByNoteId(query.noteId);
|
||||
if (!vectors.length) {
|
||||
logger.info(`No embeddings indexed for note ${query.noteId} — returning empty result`);
|
||||
|
||||
@@ -1,35 +1,21 @@
|
||||
import { scriptType } from '../../string-utils';
|
||||
|
||||
// Splits a note body into roughly-equal chunks with overlap.
|
||||
//
|
||||
// Chunk sizes derive from three knobs:
|
||||
//
|
||||
// - TARGET_TOKENS_PER_CHUNK: how many tokens we want each chunk to contain.
|
||||
// Sized to fit inside the 512-token context window of the small embedding
|
||||
// models we plan to ship first (bge-small, nomic-embed-text,
|
||||
// mxbai-embed-small). Larger chunks would be silently truncated by those
|
||||
// models.
|
||||
// - OVERLAP_RATIO: fraction of each chunk that's also present in the next
|
||||
// chunk. ~10% is the common default in vector-search literature.
|
||||
// - CHARS_PER_TOKEN: how many characters one token covers, which varies by
|
||||
// language. We pick a conservative value per profile to avoid truncation.
|
||||
// Splits a note body into chunks sized to the embedding model's context
|
||||
// window. Chunk size = TARGET_TOKENS_PER_CHUNK × CHARS_PER_TOKEN; CHARS_PER_TOKEN
|
||||
// varies by script (Latin text averages more characters per token than CJK), so we pick a profile
|
||||
// from the text. 10% overlap matches common vector-search practice.
|
||||
|
||||
const TARGET_TOKENS_PER_CHUNK = 500;
|
||||
const OVERLAP_RATIO = 0.10;
|
||||
|
||||
// Conservative chars/token estimate for English and other Latin-script
|
||||
// languages. English is closer to 4 chars/token but French / Spanish /
|
||||
// German tokenize denser — we pick the worst case so European-language users
|
||||
// don't see truncation.
|
||||
// Conservative for Latin scripts (French/German tokenise denser than English).
|
||||
const DEFAULT_CHARS_PER_TOKEN = 3.5;
|
||||
|
||||
// CJK (Chinese / Japanese / Korean) characters are much more
|
||||
// information-dense per character. ~1.2 chars/token covers all three.
|
||||
// CJK is much denser per character.
|
||||
const CJK_CHARS_PER_TOKEN = 1.2;
|
||||
|
||||
// A note must be at least this fraction of CJK characters to be chunked with
|
||||
// the CJK profile. Catches dominantly-CJK notes while letting English notes
|
||||
// with a loanword or two stay on the default profile.
|
||||
// CJK profile kicks in only when at least this fraction of the text is CJK — a single
|
||||
// loanword in an English note shouldn't switch profiles.
|
||||
const CJK_DOMINANCE_THRESHOLD = 0.3;
|
||||
|
||||
export interface ChunkOptions {
|
||||
@@ -46,17 +32,9 @@ const makeOptions = (charsPerToken: number): ChunkOptions => {
|
||||
export const defaultChunkOptions: ChunkOptions = makeOptions(DEFAULT_CHARS_PER_TOKEN);
|
||||
export const cjkChunkOptions: ChunkOptions = makeOptions(CJK_CHARS_PER_TOKEN);
|
||||
|
||||
// Counts CJK characters in the text and returns true if they make up a
|
||||
// substantial fraction. `scriptType()` flags any presence of CJK characters,
|
||||
// which is too eager — a note with one Chinese loanword in an English page
|
||||
// shouldn't use the CJK chunking profile.
|
||||
//
|
||||
// Codepoint ranges: Hiragana + Katakana (U+3040–U+30FF), CJK Unified
|
||||
// Ideographs Extension A (U+3400–U+4DBF), CJK Unified Ideographs
|
||||
// (U+4E00–U+9FFF), Hangul Syllables (U+AC00–U+D7AF), CJK Compatibility
|
||||
// Ideographs (U+F900–U+FAFF). Using \u{...} escapes so the source file is
|
||||
// ASCII-safe — literal CJK characters here would be vulnerable to encoding
|
||||
// changes at save/transfer time.
|
||||
// Hiragana + Katakana, CJK Unified Ideographs (incl. Extension A),
|
||||
// Hangul Syllables, CJK Compatibility Ideographs. Escapes keep the source
|
||||
// ASCII-safe across encoding changes.
|
||||
const cjkRegex = /[\u{3040}-\u{30FF}\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{AC00}-\u{D7AF}\u{F900}-\u{FAFF}]/u;
|
||||
|
||||
const isCjkDominant = (text: string): boolean => {
|
||||
|
||||
@@ -1,19 +1,12 @@
|
||||
// Loads an ESM-only package from a CommonJS context.
|
||||
//
|
||||
// Tried approaches:
|
||||
// 1. `new Function('import(...)')` — blocked by the renderer's CSP
|
||||
// (unsafe-eval forbidden).
|
||||
// 2. `await import(specifier)` — TypeScript lowers this to `require()` under
|
||||
// `module: commonjs`, which can't load ESM.
|
||||
// 3. `import(specifier)` from a .js file — works in Node, but in Electron's
|
||||
// renderer the import() goes through the browser's module loader, which
|
||||
// can't resolve bare specifiers or transitive node_modules imports
|
||||
// (e.g. transformers.js pulling @huggingface/jinja).
|
||||
//
|
||||
// The reliable path is Node's own `require()`, which since 22.12 supports
|
||||
// loading ESM modules (`require(esm)`) and always uses Node's resolver. This
|
||||
// works in both the main and renderer processes when nodeIntegration is on.
|
||||
// The repo's engines.node enforces the 22.12+ requirement at install time.
|
||||
// Loads ESM-only packages (e.g. @xenova/transformers) from this CJS module.
|
||||
// Why this exists:
|
||||
// - `await import()` gets lowered to require() by tsc under module:commonjs,
|
||||
// which can't load ESM.
|
||||
// - `new Function('import(...)')` is blocked by the renderer's CSP.
|
||||
// - `import()` from a .js file uses the browser loader in the renderer,
|
||||
// which can't resolve bare specifiers (e.g. transitive @huggingface/jinja).
|
||||
// Node's require() since 22.12 loads ESM transparently using Node's resolver
|
||||
// and works in both processes. engines.node enforces the 22.12+ floor.
|
||||
module.exports = function dynamicEsmImport(specifier) {
|
||||
return Promise.resolve(require(specifier));
|
||||
};
|
||||
|
||||
@@ -30,32 +30,12 @@ export interface ChatProvider {
|
||||
chat(messages: ChatMessage[], options?: ChatOptions): Promise<ChatResult>;
|
||||
}
|
||||
|
||||
// Produces embedding vectors for arbitrary text. Implemented by the bundled
|
||||
// local provider (ONNX-backed, lands in a follow-up PR) and by a test stub
|
||||
// used in CI to exercise the indexer without a real model.
|
||||
//
|
||||
// `modelId` is stored alongside each chunk in note_embeddings_meta. When the
|
||||
// active provider's `modelId` differs from the value last seen by the indexer,
|
||||
// the index is cleared and rebuilt — vectors from different models aren't
|
||||
// comparable.
|
||||
//
|
||||
// `dimension` is the size of the vectors returned by `embed()`. It controls
|
||||
// the FLOAT[] size of the sqlite-vec virtual table, which is fixed at the
|
||||
// table's first creation.
|
||||
|
||||
// Provider-internal lifecycle state of the model artefact. The status
|
||||
// reporter widens this to include 'unavailable' (no provider active at
|
||||
// all), which providers themselves can't observe.
|
||||
// EmbeddingProvider (declared below) produces embedding vectors for text.
|
||||
// - modelId is stored per chunk; a change triggers a full re-index.
|
||||
// - dimension is fixed at first vec-table creation.
|
||||
export type ProviderModelDownloadStatus = 'not-started' | 'downloading' | 'downloaded';
|
||||
|
||||
// Combined model + indexer state surfaced by EmbeddingIndexer.getStatus().
|
||||
// Used by the settings panel — kept internal to lib/ for now.
|
||||
export type ModelDownloadStatus = ProviderModelDownloadStatus | 'unavailable';
|
||||
// 'ai-disabled' = the top-level AI toggle is off.
|
||||
// 'index-disabled' = AI is on but the indexer toggle is off (chat-only mode).
|
||||
// 'idle' = settings are on and the background indexer is waiting for its
|
||||
// next tick or there's nothing new to do.
|
||||
// 'running' = a maintenance tick is currently processing notes.
|
||||
export type IndexerState = 'idle' | 'running' | 'ai-disabled' | 'index-disabled';
|
||||
export interface IndexStatus {
|
||||
modelDownloadStatus: ModelDownloadStatus;
|
||||
@@ -70,14 +50,10 @@ export interface EmbeddingProvider {
|
||||
dimension: number;
|
||||
classification: ProviderClassification;
|
||||
embed(texts: string[]): Promise<number[][]>;
|
||||
// Optional query-side embedding. Some models (e5 family) get noticeably
|
||||
// better retrieval when documents and queries are encoded with different
|
||||
// prefixes; this method lets the provider apply the query-side one.
|
||||
// Providers without an asymmetric setup can omit it — callers fall back
|
||||
// to embed().
|
||||
// Asymmetric providers (e5) get better retrieval with a query-side
|
||||
// encoding. Symmetric providers omit it and callers fall back to embed().
|
||||
embedQuery?(texts: string[]): Promise<number[][]>;
|
||||
// Optional status accessor for surfacing model lifecycle in the UI.
|
||||
// Providers that don't have a downloadable artefact can omit this; the
|
||||
// status reporter then treats them as always-ready.
|
||||
// Providers without a downloadable artefact omit this; the reporter
|
||||
// treats them as always-ready.
|
||||
modelDownloadStatus?(): Promise<ProviderModelDownloadStatus>;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,82 @@
|
||||
# AI embeddings (implementation)
|
||||
|
||||
How Joplin builds and queries the local note-embeddings index. See [ai_primitives.md](ai_primitives.md) for the user-facing spec.
|
||||
|
||||
## Overview
|
||||
|
||||
```
|
||||
Notes ──► EmbeddingIndexer ──► embedding provider ──► NoteEmbedding
|
||||
(background, 5 min) (local ONNX) (sqlite-vec)
|
||||
▲
|
||||
│
|
||||
SearchService ◄── joplin.ai.search() │
|
||||
│
|
||||
AiIndexStatus ──► reads counts ───────┘
|
||||
(settings panel)
|
||||
```
|
||||
|
||||
- **EmbeddingIndexer** watches note changes, chunks each note, asks the active provider to embed the chunks, and persists them.
|
||||
- **NoteEmbedding** is the storage model. Metadata sits in a regular table; vectors sit in a sqlite-vec virtual table. Joined by rowid.
|
||||
- **LocalEmbeddingProvider** runs `multilingual-e5-small` via ONNX. The model (~140 MB) is downloaded from a GitHub release on first AI enable.
|
||||
- **SearchService** backs `joplin.ai.search()`. Takes a text or note-id query and a scope, returns ranked chunks.
|
||||
- **AiIndexStatus** is the settings panel that shows model + indexer state.
|
||||
|
||||
## Indexer behaviour
|
||||
|
||||
The indexer runs on a 5-minute timer (matching `OcrService`) and has two modes:
|
||||
|
||||
- **Initial scan.** On first enable (or after a model swap), the indexer walks the entire notes table, one 100-note batch per tick. At the start of the scan the change-feed cursor is snapped to the current `lastChangeId` so edits made *during* the scan are picked up normally by the change feed when the scan finishes. The scan is complete when no indexable notes remain that aren't already in `note_embeddings_meta`; at that point `ai.embedding.initialScanDone` flips to true.
|
||||
- **Change feed.** Once the scan completes, only the change-feed loop runs. Each tick drains `item_changes` past the durable cursor, collapses duplicates, processes deletes/creates/updates, then advances the cursor.
|
||||
|
||||
**Order of processing during the scan**: unspecified. The "not yet indexed" query has no `ORDER BY`, so SQLite returns rows in storage order (roughly insertion order). Determinism doesn't matter for the final state — every indexable note ends up embedded.
|
||||
|
||||
**Resume across restarts**: free, because progress is the disk state, not a counter. After a restart the indexer re-enters scan mode (flag still false) and the `NOT EXISTS` query naturally skips notes that already have rows. Already-indexed notes from the previous session don't get re-processed.
|
||||
|
||||
**Failure handling during the scan**: per-note failures are logged and the note is added to an in-memory skip set for the rest of the session — no retry loop, no log flood. The skip set is **not** persisted, so a restart that happens before the scan has completed retries the previously failed notes.
|
||||
|
||||
Once the scan completes, the indexer never sweeps the notes table again. A note that failed during the scan is only retried if (a) the user edits it (the change feed picks it up), or (b) the model id changes (full re-scan). It will not be retried just because time has passed.
|
||||
|
||||
Per note, the title is injected (doubled) into chunk 0. Without this, notes whose body is just an attachment link would never match title-anchored queries.
|
||||
|
||||
The first tick is launched fire-and-forget at startup so the model load doesn't block the splash screen. A model id change (e.g. switching providers) wipes the index, clears the scan flag, and rebuilds.
|
||||
|
||||
## Chunking
|
||||
|
||||
Fixed-size character windows sized to fit the model's 512-token context, with 10% overlap. Two profiles: a Latin-script default and a CJK profile (used when CJK is the dominant script — CJK tokenises ~3× denser).
|
||||
|
||||
## Search
|
||||
|
||||
- `query: { text }` runs the active provider's query-side encoding.
|
||||
- `query: { noteId }` reuses the note's stored vectors as the query — no re-embedding pass.
|
||||
- `scope`: `all` / `note` / `folder` / `tag`. Trashed and conflict notes are always excluded.
|
||||
- `relevance`: `strict` / `normal` / `loose`. Maps internally to model-specific `(k, minScore)` so plugins survive model changes.
|
||||
- Score is cosine similarity in `[0, 1]`, computed from the L2 distance sqlite-vec returns (vectors are L2-normalised, so `cos = 1 − d²/2`).
|
||||
|
||||
## Platform support
|
||||
|
||||
| Platform | Embeddings | Notes |
|
||||
|-------------------------|:----------:|----------------------------------------|
|
||||
| macOS Apple Silicon | ✓ | |
|
||||
| macOS Intel | ✗ | `onnxruntime-node` ships no darwin-x64 |
|
||||
| Linux x64 / arm64 | ✓ | |
|
||||
| Windows x64 / arm64 | ✓ | |
|
||||
| CLI / mobile / server | ✗ | No ONNX wired in; chat still works |
|
||||
|
||||
Unsupported platforms degrade cleanly: the provider isn't installed, the indexer stays paused, search throws a clear error.
|
||||
|
||||
## Performance, in rough numbers
|
||||
|
||||
- Model load: 2–15 s (one-time per process, runs in the background).
|
||||
- Per-chunk inference: 30–200 ms depending on CPU.
|
||||
- Search latency: dominated by the query-embedding pass; the vec MATCH itself is sub-millisecond on tens of thousands of chunks.
|
||||
- Big vaults take a while: 100 notes per tick × 5-min interval, so a fresh 10k-note vault is ~8 hours of background indexing. Intentional pacing.
|
||||
|
||||
## Durable state
|
||||
|
||||
| Setting | Purpose |
|
||||
|----------------------------------------|----------------------------------|
|
||||
| `ai.enabled` | Top-level AI toggle |
|
||||
| `ai.embedding.enabled` | Indexer kill switch (default on) |
|
||||
| `ai.embedding.lastProcessedChangeId` | Cursor into `item_changes` |
|
||||
| `ai.embedding.lastIndexedModelId` | Detects model swap → re-index |
|
||||
| `ai.embedding.initialScanDone` | True once the full-vault scan has finished |
|
||||
@@ -1,6 +1,6 @@
|
||||
# AI primitives
|
||||
|
||||
This spec describes the core AI primitives that will be added to Joplin. The goal is not to ship a single AI feature, but to provide a platform on which features and plugins can be built. The primitives below are validated against five target use cases:
|
||||
This spec describes the core AI primitives in Joplin. The goal is not to ship a single AI feature, but to provide a platform on which features and plugins can be built. The primitives below are validated against five target use cases:
|
||||
|
||||
- **Chat with your note** — a sidebar that can summarise, rewrite, or answer questions about the current note.
|
||||
- **Chat with your note collection** — ask a question across all notes and get a cited answer.
|
||||
@@ -22,6 +22,20 @@ The primitives are:
|
||||
|
||||
Primitives 1–3 are required for any of the five target use cases to work. Primitive 4 must be in place from day one. Primitive 5 is independently valuable and can ship in parallel.
|
||||
|
||||
## Implementation status
|
||||
|
||||
| Primitive | Status |
|
||||
|-------------------------------------|------------------------------------------------------------------------|
|
||||
| Provider abstraction (chat) | Shipped (Joplin Cloud, OpenAI-compatible, Anthropic) |
|
||||
| Provider abstraction (embeddings) | Shipped (local ONNX-backed) |
|
||||
| Local embeddings index | Shipped — multilingual-e5-small, downloaded on first enable |
|
||||
| Retrieval helpers (`search`) | Shipped as `joplin.ai.search()` |
|
||||
| Chat helper (`chat`) | Shipped as `joplin.ai.chat()` |
|
||||
| Privacy & cost guardrails | Shipped (off by default, remote-allow flag, classification, token tally)|
|
||||
| MCP server | Not started |
|
||||
|
||||
Implementation detail for the embeddings stack lives in [ai_embeddings.md](ai_embeddings.md).
|
||||
|
||||
## 1. Provider abstraction
|
||||
|
||||
A pluggable layer so users can pick their LLM and embedding model independently (cloud, self-hosted, or on-device). No provider is hardcoded.
|
||||
@@ -39,36 +53,35 @@ Users configure a **list** of providers (each with its own settings — API key,
|
||||
|
||||
### Built-in providers
|
||||
|
||||
- An **OpenAI-compatible** adapter (covers OpenAI, Ollama, LM Studio, vLLM, OpenRouter, and similar via base-URL override).
|
||||
- An **Anthropic** adapter.
|
||||
- A **bundled local embedding model** (see below)
|
||||
Chat:
|
||||
|
||||
- **Joplin Cloud AI** — zero-config for users on Joplin Cloud sync.
|
||||
- **OpenAI-compatible** adapter (covers OpenAI, Ollama, LM Studio, vLLM, OpenRouter, and similar via base-URL override).
|
||||
- **Anthropic** adapter.
|
||||
|
||||
Embeddings:
|
||||
|
||||
- **Bundled local embedding provider** (see below).
|
||||
|
||||
### Chat API
|
||||
|
||||
Plugins call `joplin.ai.chat(messages, options?)`. The active provider and model are taken from user settings — plugins cannot pick a model. Throws if AI is disabled, if the active provider is remote and the user hasn't allowed remote providers, or if the provider is misconfigured.
|
||||
|
||||
## 2. Local embeddings index
|
||||
|
||||
Notes are chunked, embedded, and stored locally so retrieval can run without a network call.
|
||||
Notes are chunked, embedded, and stored locally so retrieval can run without a network call. Embeddings are **not synced**: they are large, model-specific, and re-derivable. The model identifier is stored alongside each chunk so a model change triggers a clear-and-rebuild rather than silent corruption.
|
||||
|
||||
### Storage
|
||||
Indexing runs as a background service: on first enable it walks the entire vault, after which it follows the note-change feed incrementally. New and edited notes become searchable within minutes.
|
||||
|
||||
- A new local SQLite table holding `(note_id, chunk_index, model_id, vector)` and the source text of each chunk.
|
||||
- Implemented using the [sqlite-vec](https://github.com/asg017/sqlite-vec) extension for vector storage and similarity search.
|
||||
- **Not synced.** Embeddings are large, model-specific, and re-derivable.
|
||||
- Schema includes the model identifier so a model change triggers a re-index rather than silent corruption.
|
||||
|
||||
### Indexing
|
||||
|
||||
- Background task following the existing OCR Service pattern: timer-based, polls `ItemChange`, processes in chunks, persists progress in a settings key.
|
||||
- Chunking strategy: roughly 512–1024 tokens with overlap. Tunable internally.
|
||||
|
||||
### Embedding model
|
||||
|
||||
- Joplin ships a small embedding model (~100MB, e.g. from the nomic/mxbai/bge family) bundled with the desktop app, or downloaded after installation.
|
||||
- Runtime: **ONNX Runtime** (`onnxruntime-node`), loaded in-process. No external service, daemon, or Python required.
|
||||
- The bundled model is the default. Users may switch to a cloud embedding provider via the provider abstraction; doing so triggers a re-index.
|
||||
The bundled model is the default. Users may switch to a different embedding provider via the provider abstraction; doing so triggers a re-index because vectors from different models aren't comparable.
|
||||
|
||||
### Platform scope
|
||||
|
||||
- **Desktop and CLI**: full support.
|
||||
- **Mobile**: deferred. sqlite-vec packaging for iOS/Android and on-device embedding cost on mobile are separate efforts. Mobile may eventually query an existing index produced on desktop, but that is out of scope for the initial work.
|
||||
- **Desktop on Apple Silicon macOS, Linux x64/arm64, Windows x64/arm64**: full support.
|
||||
- **macOS Intel**: chat works; embeddings do not (no ONNX prebuild for `darwin-x64`).
|
||||
- **CLI / mobile / server**: no on-device embeddings. Chat with remote providers still works.
|
||||
|
||||
Implementation detail lives in [ai_embeddings.md](ai_embeddings.md).
|
||||
|
||||
## 3. Retrieval helpers
|
||||
|
||||
@@ -76,36 +89,23 @@ The shared query surface. All five target features differ mainly in *what* they
|
||||
|
||||
### API
|
||||
|
||||
A single primary call:
|
||||
Plugins call `joplin.ai.search({ query, scope?, relevance? })`. Returns matching chunks with the source note id, chunk text, and a similarity score.
|
||||
|
||||
- **`search({ query, scope, relevance })`** — returns matching chunks with their source note ID, the chunk text, and a similarity score.
|
||||
- `query`: either plain text (embedded internally using the active embedding provider) or `{ noteId }` to find chunks similar to an existing note. When a note ID is given, Joplin reuses the note's already-indexed chunks as the query vector(s), so no re-embedding is needed. This is what the tag-suggestion and semantic-graph use cases rely on.
|
||||
- `scope`: where to search. One of `'note'` (with a note ID), `'notebook'` (with a folder ID), `'tag'` (with a tag ID), or `'all'`. Trashed and conflict notes are excluded by default.
|
||||
- `relevance`: `'strict' | 'normal' | 'loose'`. A preset that maps internally to model-appropriate values for the number of results returned (`k`) and the minimum similarity threshold.
|
||||
- `query`: either plain text or `{ noteId }` to find chunks similar to an existing note. Note-id queries reuse the note's already-indexed chunks as the query — no re-embedding needed. This is what the tag-suggestion and semantic-graph use cases rely on.
|
||||
- `scope`: `all` (default), `note`, `folder`, or `tag`. Trashed and conflict notes are always excluded.
|
||||
- `relevance`: `strict` / `normal` / `loose`. A preset that maps internally to model-appropriate values for the number of results returned and the minimum similarity threshold.
|
||||
|
||||
Raw thresholds (`k`, `minScore`) are a leaky abstraction: the right values depend on the embedding model, and silently break when the model changes. Plugins calibrated against one model would produce poor results against another with no signal that anything had changed.
|
||||
|
||||
The `relevance` preset is the contract plugins write against. Joplin owns the mapping from preset to numeric values per model. When the bundled model changes, the mapping is re-tuned and plugins continue working without modification.
|
||||
|
||||
### Default mappings
|
||||
|
||||
Reference defaults (subject to per-model calibration):
|
||||
|
||||
| `relevance` preset | k | minScore (cosine) |
|
||||
|----------|----|-------------------|
|
||||
| strict | 5 | ~0.55 |
|
||||
| normal | 10 | ~0.40 |
|
||||
| loose | 20 | ~0.25 |
|
||||
|
||||
These are internal values and are not part of the public API contract.
|
||||
|
||||
### Hybrid search
|
||||
|
||||
Retrieval may be combined internally with the existing FTS-based keyword search. This is an implementation detail; plugins still call `search()` with the same shape.
|
||||
Retrieval may eventually be combined internally with the existing FTS-based keyword search. Plugins will not see a contract change when hybrid ranking lands.
|
||||
|
||||
### Prior art
|
||||
|
||||
The [Jarvis](https://github.com/alondmnt/joplin-plugin-jarvis) plugin already exposes a [semantic search API](https://github.com/alondmnt/joplin-plugin-jarvis/blob/master/docs/API.md) to other plugins, supporting both free-text and note-ID queries. It is a useful reference for anyone wanting to prototype against the shape proposed here.
|
||||
The [Jarvis](https://github.com/alondmnt/joplin-plugin-jarvis) plugin already exposes a [semantic search API](https://github.com/alondmnt/joplin-plugin-jarvis/blob/master/docs/API.md) to other plugins, supporting both free-text and note-ID queries. It is a useful reference.
|
||||
|
||||
### Mapping to features
|
||||
|
||||
@@ -113,7 +113,7 @@ How each target use case composes the primitives:
|
||||
|
||||
| Feature | Retrieval scope | Then |
|
||||
|--------------------------|-----------------------------|------------------------------------------|
|
||||
| Chat with note | `note` or `notebook` | Pass chunks as context to chat model |
|
||||
| Chat with note | `note` or `folder` | Pass chunks as context to chat model |
|
||||
| Chat with note collection | `all` | Pass top chunks (with note IDs) as context to chat model |
|
||||
| Fuzzy search | `all` | Show chunks directly as results |
|
||||
| Tag suggestions | `all`, query = note content | Inspect tags of returned chunks |
|
||||
@@ -123,18 +123,20 @@ Chat-based features additionally pass each chunk's source note ID into the promp
|
||||
|
||||
## 4. Privacy & cost guardrails
|
||||
|
||||
Enforced at the provider layer so every feature (core or plugin) inherits these checks automatically.
|
||||
Enforced at the provider layer so every feature — core or plugin — inherits these checks automatically.
|
||||
|
||||
### Requirements
|
||||
|
||||
- **AI features off by default.** A single top-level setting plus per-feature toggles.
|
||||
- **Offline by default**: User must explicitly grant permission to use online features.
|
||||
- **Per-provider classification** as `local` or `remote`. Surfaced in the provider picker and used by the indicator.
|
||||
- **Token accounting** per provider, queryable by plugins and shown to users.
|
||||
- **No silent enablement.** Switching from a local to a remote provider requires explicit user confirmation, with clear text about what data will be sent and where.
|
||||
- **AI features off by default.** A top-level toggle, plus a per-feature kill switch for the embeddings indexer (for users who want chat only).
|
||||
- **Offline by default.** Remote providers require a separate explicit opt-in.
|
||||
- **Per-provider classification** as `local` or `remote`. OpenAI-compatible providers can be either depending on the configured base URL (loopback addresses count as local).
|
||||
- **Token accounting** per provider, queryable by plugins and shown in settings.
|
||||
- **No silent enablement.** Switching from a local to a remote provider requires an explicit user choice; auto-defaults (e.g. selecting Joplin Cloud AI for Joplin Cloud users on first enable) only apply once.
|
||||
|
||||
## 5. MCP server
|
||||
|
||||
> Status: not started. Design recorded here for reference.
|
||||
|
||||
Joplin runs an optional [Model Context Protocol](https://modelcontextprotocol.io/) server that exposes notes to external AI applications (Claude Desktop, ChatGPT desktop, Cursor, Zed, etc.).
|
||||
|
||||
### Scope
|
||||
|
||||
Reference in New Issue
Block a user