From e3ef136bca8562185b307cf5ff159e0bbd8a9a57 Mon Sep 17 00:00:00 2001 From: rudi193-cmd Date: Mon, 8 Jun 2026 23:38:46 +0900 Subject: [PATCH] fix(memory): keep FTS keyword search model-agnostic Make lexical FTS/LIKE search ignore embedding model identity so exact keyword recall survives provider/model changes. Vector search remains model-scoped. Refreshed or stale FTS rows are now deleted by path/source regardless of model, and keyword search only returns FTS rows that still have a live chunk, so orphan rows left behind by the earlier model-scoped cleanup do not surface. Fixes #48300 --- .../src/memory/manager-fts-state.test.ts | 21 +- .../src/memory/manager-fts-state.ts | 8 +- .../src/memory/manager-search.test.ts | 318 ++++++++++++++---- .../memory-core/src/memory/manager-search.ts | 16 +- .../src/memory/manager-sync-ops.ts | 12 +- extensions/memory-core/src/memory/manager.ts | 3 - 6 files changed, 275 insertions(+), 103 deletions(-) diff --git a/extensions/memory-core/src/memory/manager-fts-state.test.ts b/extensions/memory-core/src/memory/manager-fts-state.test.ts index c20caf62483..8a25c967e82 100644 --- a/extensions/memory-core/src/memory/manager-fts-state.test.ts +++ b/extensions/memory-core/src/memory/manager-fts-state.test.ts @@ -11,7 +11,7 @@ describe("memory FTS state", () => { db = null; }); - it("only removes rows for the active model when a provider is active", () => { + it("removes rows for all models when a provider is active", () => { db = new DatabaseSync(":memory:"); db.exec("CREATE TABLE chunks_fts (path TEXT, source TEXT, model TEXT)"); db.prepare("INSERT INTO chunks_fts (path, source, model) VALUES (?, ?, ?)").run( @@ -24,6 +24,16 @@ describe("memory FTS state", () => { "memory", "other-model", ); + db.prepare("INSERT INTO chunks_fts (path, source, model) VALUES (?, ?, ?)").run( + "memory/2026-01-13.md", + "memory", + "other-model", + ); + db.prepare("INSERT INTO chunks_fts (path, source, model) VALUES (?, ?, ?)").run( + "memory/2026-01-12.md", + "sessions", + "other-model", + ); deleteMemoryFtsRows({ db, @@ -32,10 +42,15 @@ describe("memory 
FTS state", () => { currentModel: "mock-embed", }); - const rows = db.prepare("SELECT model FROM chunks_fts ORDER BY model").all() as Array<{ + const rows = db.prepare("SELECT path, source, model FROM chunks_fts ORDER BY path, source").all() as Array<{ + path: string; + source: string; model: string; }>; - expect(rows).toEqual([{ model: "other-model" }]); + expect(rows).toEqual([ + { path: "memory/2026-01-12.md", source: "sessions", model: "other-model" }, + { path: "memory/2026-01-13.md", source: "memory", model: "other-model" }, + ]); }); it("removes all rows for the path in FTS-only mode", () => { diff --git a/extensions/memory-core/src/memory/manager-fts-state.ts b/extensions/memory-core/src/memory/manager-fts-state.ts index 139fbfe44c6..eafa89f7e06 100644 --- a/extensions/memory-core/src/memory/manager-fts-state.ts +++ b/extensions/memory-core/src/memory/manager-fts-state.ts @@ -10,12 +10,8 @@ export function deleteMemoryFtsRows(params: { currentModel?: string; }): void { const tableName = params.tableName ?? "chunks_fts"; - if (params.currentModel) { - params.db - .prepare(`DELETE FROM ${tableName} WHERE path = ? AND source = ? AND model = ?`) - .run(params.path, params.source, params.currentModel); - return; - } + // Lexical search is model-agnostic, so refreshed/deleted files must not + // leave old-model FTS rows behind for the same path/source. params.db .prepare(`DELETE FROM ${tableName} WHERE path = ? AND source = ?`) .run(params.path, params.source); diff --git a/extensions/memory-core/src/memory/manager-search.test.ts b/extensions/memory-core/src/memory/manager-search.test.ts index edfe5fdbed2..2bde6136aac 100644 --- a/extensions/memory-core/src/memory/manager-search.test.ts +++ b/extensions/memory-core/src/memory/manager-search.test.ts @@ -1,4 +1,5 @@ // Memory Core tests cover manager search plugin behavior. 
+import type { DatabaseSync } from "node:sqlite"; import { ensureMemoryIndexSchema, loadSqliteVecExtension, @@ -11,6 +12,45 @@ import { searchKeyword, searchVector } from "./manager-search.js"; const vectorToBlob = (embedding: number[]): Buffer => Buffer.from(new Float32Array(embedding).buffer); +function insertKeywordFixture( + db: DatabaseSync, + params: { + text: string; + id: string; + path: string; + source: "memory" | "sessions"; + model: string; + startLine: number; + endLine: number; + }, +): void { + db.prepare( + "INSERT INTO chunks (id, path, source, start_line, end_line, hash, model, text, embedding, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + ).run( + params.id, + params.path, + params.source, + params.startLine, + params.endLine, + `${params.id}:hash`, + params.model, + params.text, + JSON.stringify([0]), + Date.now(), + ); + db.prepare( + "INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?, ?, ?, ?, ?, ?, ?)", + ).run( + params.text, + params.id, + params.path, + params.source, + params.model, + params.startLine, + params.endLine, + ); +} + describe("searchKeyword trigram fallback", () => { const { DatabaseSync } = requireNodeSqlite(); @@ -55,16 +95,20 @@ describe("searchKeyword trigram fallback", () => { }) { const db = createTrigramDb(); try { - const insert = db.prepare( - "INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?, ?, ?, ?, ?, ?, ?)", - ); for (const row of params.rows) { - insert.run(row.text, row.id, row.path, "memory", "mock-embed", 1, 1); + insertKeywordFixture(db, { + text: row.text, + id: row.id, + path: row.path, + source: "memory", + model: "mock-embed", + startLine: 1, + endLine: 1, + }); } return await searchKeyword({ db, ftsTable: "chunks_fts", - providerModel: "mock-embed", query: params.query, ftsTokenizer: "trigram", limit: 10, @@ -220,27 +264,24 @@ describe("searchKeyword FTS MATCH fallback", () => { itWithFts("falls back to LIKE search when 
FTS MATCH throws", async () => { const db = createFtsDb(); try { - const insert = db.prepare( - "INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?, ?, ?, ?, ?, ?, ?)", - ); - insert.run( - "The Agent framework handles API calls and cron jobs", - "1", - "doc.md", - "sessions", - "mock-embed", - 1, - 5, - ); - insert.run( - "Deploy the database cluster on Hetzner", - "2", - "ops.md", - "sessions", - "mock-embed", - 1, - 3, - ); + insertKeywordFixture(db, { + text: "The Agent framework handles API calls and cron jobs", + id: "1", + path: "doc.md", + source: "sessions", + model: "mock-embed", + startLine: 1, + endLine: 5, + }); + insertKeywordFixture(db, { + text: "Deploy the database cluster on Hetzner", + id: "2", + path: "ops.md", + source: "sessions", + model: "mock-embed", + startLine: 1, + endLine: 3, + }); // Simulate a buildFtsQuery that produces a broken MATCH expression const brokenBuildFtsQuery = () => "BROKEN_QUERY_SYNTAX <<<"; @@ -248,7 +289,6 @@ describe("searchKeyword FTS MATCH fallback", () => { const results = await searchKeyword({ db, ftsTable: "chunks_fts", - providerModel: "mock-embed", query: "Agent", ftsTokenizer: "unicode61", limit: 10, @@ -271,23 +311,19 @@ describe("searchKeyword FTS MATCH fallback", () => { itWithFts("returns BM25-scored results when FTS MATCH succeeds", async () => { const db = createFtsDb(); try { - const insert = db.prepare( - "INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?, ?, ?, ?, ?, ?, ?)", - ); - insert.run( - "The Transformer architecture powers modern LLMs", - "1", - "ml.md", - "memory", - "mock-embed", - 1, - 3, - ); + insertKeywordFixture(db, { + text: "The Transformer architecture powers modern LLMs", + id: "1", + path: "ml.md", + source: "memory", + model: "mock-embed", + startLine: 1, + endLine: 3, + }); const results = await searchKeyword({ db, ftsTable: "chunks_fts", - providerModel: "mock-embed", query: "Transformer", ftsTokenizer: 
"unicode61", limit: 10, @@ -310,17 +346,29 @@ describe("searchKeyword FTS MATCH fallback", () => { itWithFts("applies source filter in LIKE fallback", async () => { const db = createFtsDb(); try { - const insert = db.prepare( - "INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?, ?, ?, ?, ?, ?, ?)", - ); - insert.run("Agent handles API calls", "1", "doc.md", "sessions", "mock-embed", 1, 3); - insert.run("Agent design patterns", "2", "notes.md", "memory", "mock-embed", 1, 3); + insertKeywordFixture(db, { + text: "Agent handles API calls", + id: "1", + path: "doc.md", + source: "sessions", + model: "mock-embed", + startLine: 1, + endLine: 3, + }); + insertKeywordFixture(db, { + text: "Agent design patterns", + id: "2", + path: "notes.md", + source: "memory", + model: "mock-embed", + startLine: 1, + endLine: 3, + }); const brokenBuildFtsQuery = () => "BROKEN <<<"; const results = await searchKeyword({ db, ftsTable: "chunks_fts", - providerModel: "mock-embed", query: "Agent", ftsTokenizer: "unicode61", limit: 10, @@ -341,29 +389,26 @@ describe("searchKeyword FTS MATCH fallback", () => { itWithFts("splits multi-word query into per-token LIKE clauses in fallback", async () => { const db = createFtsDb(); try { - const insert = db.prepare( - "INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?, ?, ?, ?, ?, ?, ?)", - ); // "Agent" and "cron" appear in this row but not adjacent - insert.run( - "The Agent framework handles API calls and cron jobs", - "1", - "doc.md", - "sessions", - "mock-embed", - 1, - 5, - ); + insertKeywordFixture(db, { + text: "The Agent framework handles API calls and cron jobs", + id: "1", + path: "doc.md", + source: "sessions", + model: "mock-embed", + startLine: 1, + endLine: 5, + }); // Only "Agent" appears in this row - insert.run( - "Agent design patterns for microservices", - "2", - "arch.md", - "sessions", - "mock-embed", - 1, - 3, - ); + insertKeywordFixture(db, { + text: 
"Agent design patterns for microservices", + id: "2", + path: "arch.md", + source: "sessions", + model: "mock-embed", + startLine: 1, + endLine: 3, + }); // A single-substring LIKE '%Agent cron%' would miss row 1 because // the words are not adjacent. Per-token LIKE should find it. @@ -371,7 +416,6 @@ describe("searchKeyword FTS MATCH fallback", () => { const results = await searchKeyword({ db, ftsTable: "chunks_fts", - providerModel: "mock-embed", query: "Agent cron", ftsTokenizer: "unicode61", limit: 10, @@ -393,15 +437,19 @@ describe("searchKeyword FTS MATCH fallback", () => { const db = createFtsDb(); const warnSpy = vi.spyOn(console, "warn").mockImplementation(() => {}); try { - const insert = db.prepare( - "INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?, ?, ?, ?, ?, ?, ?)", - ); - insert.run("test content", "1", "doc.md", "sessions", "mock-embed", 1, 1); + insertKeywordFixture(db, { + text: "test content", + id: "1", + path: "doc.md", + source: "sessions", + model: "mock-embed", + startLine: 1, + endLine: 1, + }); await searchKeyword({ db, ftsTable: "chunks_fts", - providerModel: "mock-embed", query: "test", ftsTokenizer: "unicode61", limit: 10, @@ -426,6 +474,130 @@ describe("searchKeyword FTS MATCH fallback", () => { }); }); +describe("searchKeyword cross-model FTS visibility (issue #48300)", () => { + const { DatabaseSync } = requireNodeSqlite(); + + function supportsFts(): boolean { + const db = new DatabaseSync(":memory:"); + try { + const result = ensureMemoryIndexSchema({ + db, + embeddingCacheTable: "embedding_cache", + cacheEnabled: false, + ftsTable: "chunks_fts", + ftsEnabled: true, + }); + return result.ftsAvailable; + } finally { + db.close(); + } + } + + const itWithFts = supportsFts() ? 
it : it.skip; + + itWithFts("returns FTS hits indexed under a different embedding model", async () => { + const db = new DatabaseSync(":memory:"); + try { + const result = ensureMemoryIndexSchema({ + db, + embeddingCacheTable: "embedding_cache", + cacheEnabled: false, + ftsTable: "chunks_fts", + ftsEnabled: true, + }); + if (!result.ftsAvailable) { + throw new Error(result.ftsError ?? "FTS unavailable"); + } + insertKeywordFixture(db, { + text: "Persona notes for Clyde the assistant", + id: "clyde-old", + path: "memory/persona.md", + source: "memory", + model: "bge-m3", + startLine: 1, + endLine: 3, + }); + insertKeywordFixture(db, { + text: "Persona notes for Clyde the assistant", + id: "clyde-new", + path: "memory/persona.md", + source: "memory", + model: "nomic-embed-text", + startLine: 1, + endLine: 3, + }); + + const results = await searchKeyword({ + db, + ftsTable: "chunks_fts", + query: "Clyde", + ftsTokenizer: "unicode61", + limit: 10, + snippetMaxChars: 200, + sourceFilter: { sql: "", params: [] }, + buildFtsQuery, + bm25RankToScore, + }); + + expect(results.map((row) => row.id).toSorted()).toEqual(["clyde-new", "clyde-old"]); + } finally { + db.close(); + } + }); + + itWithFts("does not return orphaned old-model FTS rows without a live chunk", async () => { + const db = new DatabaseSync(":memory:"); + try { + const result = ensureMemoryIndexSchema({ + db, + embeddingCacheTable: "embedding_cache", + cacheEnabled: false, + ftsTable: "chunks_fts", + ftsEnabled: true, + }); + if (!result.ftsAvailable) { + throw new Error(result.ftsError ?? 
"FTS unavailable"); + } + insertKeywordFixture(db, { + text: "Current Clyde notes", + id: "live-clyde", + path: "memory/persona.md", + source: "memory", + model: "nomic-embed-text", + startLine: 1, + endLine: 3, + }); + db.prepare( + "INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?, ?, ?, ?, ?, ?, ?)", + ).run( + "Deleted Clyde notes from an older model", + "orphan-clyde", + "memory/persona.md", + "memory", + "bge-m3", + 1, + 3, + ); + + const results = await searchKeyword({ + db, + ftsTable: "chunks_fts", + query: "Clyde", + ftsTokenizer: "unicode61", + limit: 10, + snippetMaxChars: 200, + sourceFilter: { sql: "", params: [] }, + buildFtsQuery, + bm25RankToScore, + }); + + expect(results.map((row) => row.id)).toEqual(["live-clyde"]); + } finally { + db.close(); + } + }); +}); + describe("searchVector sqlite-vec KNN", () => { const { DatabaseSync } = requireNodeSqlite(); diff --git a/extensions/memory-core/src/memory/manager-search.ts b/extensions/memory-core/src/memory/manager-search.ts index ff907b19600..57d9cd1ccbd 100644 --- a/extensions/memory-core/src/memory/manager-search.ts +++ b/extensions/memory-core/src/memory/manager-search.ts @@ -308,7 +308,6 @@ async function searchChunksByEmbedding(params: { export async function searchKeyword(params: { db: DatabaseSync; ftsTable: string; - providerModel: string | undefined; query: string; ftsTokenizer?: "unicode61" | "trigram"; limit: number; @@ -330,9 +329,9 @@ export async function searchKeyword(params: { return []; } - // When providerModel is undefined (FTS-only mode), search all models - const modelClause = params.providerModel ? " AND model = ?" : ""; - const modelParams = params.providerModel ? [params.providerModel] : []; + // Lexical FTS is model-agnostic (issue #48300), but old databases may + // already contain orphaned FTS rows from prior model-scoped cleanup. 
+ const liveChunkClause = ` AND EXISTS (SELECT 1 FROM chunks c WHERE c.id = ${params.ftsTable}.id)`; const substringClause = plan.substringTerms.map(() => " AND text LIKE ? ESCAPE '\\'").join(""); const substringParams = plan.substringTerms.map((term) => `%${escapeLikePattern(term)}%`); @@ -354,14 +353,13 @@ export async function searchKeyword(params: { `SELECT id, path, source, start_line, end_line, text,\n` + ` bm25(${params.ftsTable}) AS rank\n` + ` FROM ${params.ftsTable}\n` + - ` WHERE ${params.ftsTable} MATCH ?${substringClause}${modelClause}${params.sourceFilter.sql}\n` + + ` WHERE ${params.ftsTable} MATCH ?${substringClause}${liveChunkClause}${params.sourceFilter.sql}\n` + ` ORDER BY rank ASC\n` + ` LIMIT ?`, ) .all( plan.matchQuery, ...substringParams, - ...modelParams, ...params.sourceFilter.params, params.limit, ) as typeof rows; @@ -381,12 +379,11 @@ export async function searchKeyword(params: { `SELECT id, path, source, start_line, end_line, text,\n` + ` 0 AS rank\n` + ` FROM ${params.ftsTable}\n` + - ` WHERE 1=1${fallbackLikeClause}${modelClause}${params.sourceFilter.sql}\n` + + ` WHERE 1=1${fallbackLikeClause}${liveChunkClause}${params.sourceFilter.sql}\n` + ` LIMIT ?`, ) .all( ...fallbackLikeParams, - ...modelParams, ...params.sourceFilter.params, params.limit, ) as typeof rows; @@ -397,12 +394,11 @@ export async function searchKeyword(params: { `SELECT id, path, source, start_line, end_line, text,\n` + ` 0 AS rank\n` + ` FROM ${params.ftsTable}\n` + - ` WHERE 1=1${substringClause}${modelClause}${params.sourceFilter.sql}\n` + + ` WHERE 1=1${substringClause}${liveChunkClause}${params.sourceFilter.sql}\n` + ` LIMIT ?`, ) .all( ...substringParams, - ...modelParams, ...params.sourceFilter.params, params.limit, ) as typeof rows; diff --git a/extensions/memory-core/src/memory/manager-sync-ops.ts b/extensions/memory-core/src/memory/manager-sync-ops.ts index 70df8b42f8b..76d67aa987b 100644 --- a/extensions/memory-core/src/memory/manager-sync-ops.ts +++ 
b/extensions/memory-core/src/memory/manager-sync-ops.ts @@ -1616,9 +1616,9 @@ export abstract class MemoryManagerSyncOps { `DELETE FROM ${VECTOR_TABLE} WHERE id IN (SELECT id FROM chunks WHERE path = ? AND source = ?)`, ) : null; - const deleteFtsRowsByPathSourceAndModel = + const deleteFtsRowsByPathAndSource = this.fts.enabled && this.fts.available - ? this.db.prepare(`DELETE FROM ${FTS_TABLE} WHERE path = ? AND source = ? AND model = ?`) + ? this.db.prepare(`DELETE FROM ${FTS_TABLE} WHERE path = ? AND source = ?`) : null; const targetSessionFiles = params.needsFullReindex @@ -1734,13 +1734,9 @@ export abstract class MemoryManagerSyncOps { } catch {} } deleteChunksByPathAndSource.run(stale.path, "sessions"); - if (deleteFtsRowsByPathSourceAndModel) { + if (deleteFtsRowsByPathAndSource) { try { - deleteFtsRowsByPathSourceAndModel.run( - stale.path, - "sessions", - this.provider?.model ?? "fts-only", - ); + deleteFtsRowsByPathAndSource.run(stale.path, "sessions"); } catch {} } } finally { diff --git a/extensions/memory-core/src/memory/manager.ts b/extensions/memory-core/src/memory/manager.ts index 74fcf70a033..326e160ec89 100644 --- a/extensions/memory-core/src/memory/manager.ts +++ b/extensions/memory-core/src/memory/manager.ts @@ -900,12 +900,9 @@ export class MemoryIndexManager extends MemoryManagerEmbeddingOps implements Mem return []; } const sourceFilter = this.buildSourceFilter(undefined, sourceFilterList); - // In FTS-only mode (no provider), search all models; otherwise filter by current provider's model - const providerModel = this.provider?.model; const results = await searchKeyword({ db: this.db, ftsTable: FTS_TABLE, - providerModel, query, ftsTokenizer: this.settings.store.fts.tokenizer, limit,