fix(memory): keep FTS keyword search model-agnostic

Make lexical FTS/LIKE search ignore embedding model identity so exact keyword recall survives provider/model changes. Vector search remains model-scoped, and refreshed or stale FTS rows are cleaned by path/source with live-chunk filtering to prevent old orphan rows from surfacing.

Fixes #48300
This commit is contained in:
rudi193-cmd
2026-06-08 23:38:46 +09:00
committed by Vincent Koc
parent 7499a020d9
commit e3ef136bca
6 changed files with 275 additions and 103 deletions

View File

@@ -11,7 +11,7 @@ describe("memory FTS state", () => {
db = null;
});
it("only removes rows for the active model when a provider is active", () => {
it("removes rows for all models when a provider is active", () => {
db = new DatabaseSync(":memory:");
db.exec("CREATE TABLE chunks_fts (path TEXT, source TEXT, model TEXT)");
db.prepare("INSERT INTO chunks_fts (path, source, model) VALUES (?, ?, ?)").run(
@@ -24,6 +24,16 @@ describe("memory FTS state", () => {
"memory",
"other-model",
);
db.prepare("INSERT INTO chunks_fts (path, source, model) VALUES (?, ?, ?)").run(
"memory/2026-01-13.md",
"memory",
"other-model",
);
db.prepare("INSERT INTO chunks_fts (path, source, model) VALUES (?, ?, ?)").run(
"memory/2026-01-12.md",
"sessions",
"other-model",
);
deleteMemoryFtsRows({
db,
@@ -32,10 +42,15 @@ describe("memory FTS state", () => {
currentModel: "mock-embed",
});
const rows = db.prepare("SELECT model FROM chunks_fts ORDER BY model").all() as Array<{
const rows = db.prepare("SELECT path, source, model FROM chunks_fts ORDER BY path, source").all() as Array<{
path: string;
source: string;
model: string;
}>;
expect(rows).toEqual([{ model: "other-model" }]);
expect(rows).toEqual([
{ path: "memory/2026-01-12.md", source: "sessions", model: "other-model" },
{ path: "memory/2026-01-13.md", source: "memory", model: "other-model" },
]);
});
it("removes all rows for the path in FTS-only mode", () => {

View File

@@ -10,12 +10,8 @@ export function deleteMemoryFtsRows(params: {
currentModel?: string;
}): void {
const tableName = params.tableName ?? "chunks_fts";
if (params.currentModel) {
params.db
.prepare(`DELETE FROM ${tableName} WHERE path = ? AND source = ? AND model = ?`)
.run(params.path, params.source, params.currentModel);
return;
}
// Lexical search is model-agnostic, so refreshed/deleted files must not
// leave old-model FTS rows behind for the same path/source.
params.db
.prepare(`DELETE FROM ${tableName} WHERE path = ? AND source = ?`)
.run(params.path, params.source);

View File

@@ -1,4 +1,5 @@
// Memory Core tests cover manager search plugin behavior.
import type { DatabaseSync } from "node:sqlite";
import {
ensureMemoryIndexSchema,
loadSqliteVecExtension,
@@ -11,6 +12,45 @@ import { searchKeyword, searchVector } from "./manager-search.js";
const vectorToBlob = (embedding: number[]): Buffer =>
Buffer.from(new Float32Array(embedding).buffer);
function insertKeywordFixture(
db: DatabaseSync,
params: {
text: string;
id: string;
path: string;
source: "memory" | "sessions";
model: string;
startLine: number;
endLine: number;
},
): void {
db.prepare(
"INSERT INTO chunks (id, path, source, start_line, end_line, hash, model, text, embedding, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
).run(
params.id,
params.path,
params.source,
params.startLine,
params.endLine,
`${params.id}:hash`,
params.model,
params.text,
JSON.stringify([0]),
Date.now(),
);
db.prepare(
"INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?, ?, ?, ?, ?, ?, ?)",
).run(
params.text,
params.id,
params.path,
params.source,
params.model,
params.startLine,
params.endLine,
);
}
describe("searchKeyword trigram fallback", () => {
const { DatabaseSync } = requireNodeSqlite();
@@ -55,16 +95,20 @@ describe("searchKeyword trigram fallback", () => {
}) {
const db = createTrigramDb();
try {
const insert = db.prepare(
"INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?, ?, ?, ?, ?, ?, ?)",
);
for (const row of params.rows) {
insert.run(row.text, row.id, row.path, "memory", "mock-embed", 1, 1);
insertKeywordFixture(db, {
text: row.text,
id: row.id,
path: row.path,
source: "memory",
model: "mock-embed",
startLine: 1,
endLine: 1,
});
}
return await searchKeyword({
db,
ftsTable: "chunks_fts",
providerModel: "mock-embed",
query: params.query,
ftsTokenizer: "trigram",
limit: 10,
@@ -220,27 +264,24 @@ describe("searchKeyword FTS MATCH fallback", () => {
itWithFts("falls back to LIKE search when FTS MATCH throws", async () => {
const db = createFtsDb();
try {
const insert = db.prepare(
"INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?, ?, ?, ?, ?, ?, ?)",
);
insert.run(
"The Agent framework handles API calls and cron jobs",
"1",
"doc.md",
"sessions",
"mock-embed",
1,
5,
);
insert.run(
"Deploy the database cluster on Hetzner",
"2",
"ops.md",
"sessions",
"mock-embed",
1,
3,
);
insertKeywordFixture(db, {
text: "The Agent framework handles API calls and cron jobs",
id: "1",
path: "doc.md",
source: "sessions",
model: "mock-embed",
startLine: 1,
endLine: 5,
});
insertKeywordFixture(db, {
text: "Deploy the database cluster on Hetzner",
id: "2",
path: "ops.md",
source: "sessions",
model: "mock-embed",
startLine: 1,
endLine: 3,
});
// Simulate a buildFtsQuery that produces a broken MATCH expression
const brokenBuildFtsQuery = () => "BROKEN_QUERY_SYNTAX <<<";
@@ -248,7 +289,6 @@ describe("searchKeyword FTS MATCH fallback", () => {
const results = await searchKeyword({
db,
ftsTable: "chunks_fts",
providerModel: "mock-embed",
query: "Agent",
ftsTokenizer: "unicode61",
limit: 10,
@@ -271,23 +311,19 @@ describe("searchKeyword FTS MATCH fallback", () => {
itWithFts("returns BM25-scored results when FTS MATCH succeeds", async () => {
const db = createFtsDb();
try {
const insert = db.prepare(
"INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?, ?, ?, ?, ?, ?, ?)",
);
insert.run(
"The Transformer architecture powers modern LLMs",
"1",
"ml.md",
"memory",
"mock-embed",
1,
3,
);
insertKeywordFixture(db, {
text: "The Transformer architecture powers modern LLMs",
id: "1",
path: "ml.md",
source: "memory",
model: "mock-embed",
startLine: 1,
endLine: 3,
});
const results = await searchKeyword({
db,
ftsTable: "chunks_fts",
providerModel: "mock-embed",
query: "Transformer",
ftsTokenizer: "unicode61",
limit: 10,
@@ -310,17 +346,29 @@ describe("searchKeyword FTS MATCH fallback", () => {
itWithFts("applies source filter in LIKE fallback", async () => {
const db = createFtsDb();
try {
const insert = db.prepare(
"INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?, ?, ?, ?, ?, ?, ?)",
);
insert.run("Agent handles API calls", "1", "doc.md", "sessions", "mock-embed", 1, 3);
insert.run("Agent design patterns", "2", "notes.md", "memory", "mock-embed", 1, 3);
insertKeywordFixture(db, {
text: "Agent handles API calls",
id: "1",
path: "doc.md",
source: "sessions",
model: "mock-embed",
startLine: 1,
endLine: 3,
});
insertKeywordFixture(db, {
text: "Agent design patterns",
id: "2",
path: "notes.md",
source: "memory",
model: "mock-embed",
startLine: 1,
endLine: 3,
});
const brokenBuildFtsQuery = () => "BROKEN <<<";
const results = await searchKeyword({
db,
ftsTable: "chunks_fts",
providerModel: "mock-embed",
query: "Agent",
ftsTokenizer: "unicode61",
limit: 10,
@@ -341,29 +389,26 @@ describe("searchKeyword FTS MATCH fallback", () => {
itWithFts("splits multi-word query into per-token LIKE clauses in fallback", async () => {
const db = createFtsDb();
try {
const insert = db.prepare(
"INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?, ?, ?, ?, ?, ?, ?)",
);
// "Agent" and "cron" appear in this row but not adjacent
insert.run(
"The Agent framework handles API calls and cron jobs",
"1",
"doc.md",
"sessions",
"mock-embed",
1,
5,
);
insertKeywordFixture(db, {
text: "The Agent framework handles API calls and cron jobs",
id: "1",
path: "doc.md",
source: "sessions",
model: "mock-embed",
startLine: 1,
endLine: 5,
});
// Only "Agent" appears in this row
insert.run(
"Agent design patterns for microservices",
"2",
"arch.md",
"sessions",
"mock-embed",
1,
3,
);
insertKeywordFixture(db, {
text: "Agent design patterns for microservices",
id: "2",
path: "arch.md",
source: "sessions",
model: "mock-embed",
startLine: 1,
endLine: 3,
});
// A single-substring LIKE '%Agent cron%' would miss row 1 because
// the words are not adjacent. Per-token LIKE should find it.
@@ -371,7 +416,6 @@ describe("searchKeyword FTS MATCH fallback", () => {
const results = await searchKeyword({
db,
ftsTable: "chunks_fts",
providerModel: "mock-embed",
query: "Agent cron",
ftsTokenizer: "unicode61",
limit: 10,
@@ -393,15 +437,19 @@ describe("searchKeyword FTS MATCH fallback", () => {
const db = createFtsDb();
const warnSpy = vi.spyOn(console, "warn").mockImplementation(() => {});
try {
const insert = db.prepare(
"INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?, ?, ?, ?, ?, ?, ?)",
);
insert.run("test content", "1", "doc.md", "sessions", "mock-embed", 1, 1);
insertKeywordFixture(db, {
text: "test content",
id: "1",
path: "doc.md",
source: "sessions",
model: "mock-embed",
startLine: 1,
endLine: 1,
});
await searchKeyword({
db,
ftsTable: "chunks_fts",
providerModel: "mock-embed",
query: "test",
ftsTokenizer: "unicode61",
limit: 10,
@@ -426,6 +474,130 @@ describe("searchKeyword FTS MATCH fallback", () => {
});
});
// Regression coverage for issue #48300: keyword search is not scoped to an
// embedding model, and FTS rows without a matching `chunks` row are excluded.
describe("searchKeyword cross-model FTS visibility (issue #48300)", () => {
  const { DatabaseSync } = requireNodeSqlite();

  /** Applies the memory index schema with FTS enabled and the cache disabled. */
  const applyFtsSchema = (db: DatabaseSync) =>
    ensureMemoryIndexSchema({
      db,
      embeddingCacheTable: "embedding_cache",
      cacheEnabled: false,
      ftsTable: "chunks_fts",
      ftsEnabled: true,
    });

  // Probe once with a throwaway database to decide whether the cases can run.
  const ftsSupported = ((): boolean => {
    const probe = new DatabaseSync(":memory:");
    try {
      return applyFtsSchema(probe).ftsAvailable;
    } finally {
      probe.close();
    }
  })();
  const itWithFts = ftsSupported ? it : it.skip;

  /**
   * Opens a fresh in-memory database, applies the schema, runs `body`, and
   * always closes the database afterwards. Throws when FTS is unavailable.
   */
  async function withFtsDb(body: (db: DatabaseSync) => Promise<void>): Promise<void> {
    const db = new DatabaseSync(":memory:");
    try {
      const schema = applyFtsSchema(db);
      if (!schema.ftsAvailable) {
        throw new Error(schema.ftsError ?? "FTS unavailable");
      }
      await body(db);
    } finally {
      db.close();
    }
  }

  /** Runs the keyword search for "Clyde" used by both cases. */
  const searchForClyde = (db: DatabaseSync) =>
    searchKeyword({
      db,
      ftsTable: "chunks_fts",
      query: "Clyde",
      ftsTokenizer: "unicode61",
      limit: 10,
      snippetMaxChars: 200,
      sourceFilter: { sql: "", params: [] },
      buildFtsQuery,
      bm25RankToScore,
    });

  itWithFts("returns FTS hits indexed under a different embedding model", async () => {
    await withFtsDb(async (db) => {
      // Same text and path indexed under two different embedding models.
      const indexedUnder: Array<{ id: string; model: string }> = [
        { id: "clyde-old", model: "bge-m3" },
        { id: "clyde-new", model: "nomic-embed-text" },
      ];
      for (const { id, model } of indexedUnder) {
        insertKeywordFixture(db, {
          text: "Persona notes for Clyde the assistant",
          id,
          path: "memory/persona.md",
          source: "memory",
          model,
          startLine: 1,
          endLine: 3,
        });
      }

      const results = await searchForClyde(db);
      const ids = results.map((row) => row.id).toSorted();
      expect(ids).toEqual(["clyde-new", "clyde-old"]);
    });
  });

  itWithFts("does not return orphaned old-model FTS rows without a live chunk", async () => {
    await withFtsDb(async (db) => {
      insertKeywordFixture(db, {
        text: "Current Clyde notes",
        id: "live-clyde",
        path: "memory/persona.md",
        source: "memory",
        model: "nomic-embed-text",
        startLine: 1,
        endLine: 3,
      });

      // Orphan: written to the FTS table only, with no row in `chunks`.
      const orphanInsert = db.prepare(
        "INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?, ?, ?, ?, ?, ?, ?)",
      );
      orphanInsert.run(
        "Deleted Clyde notes from an older model",
        "orphan-clyde",
        "memory/persona.md",
        "memory",
        "bge-m3",
        1,
        3,
      );

      const results = await searchForClyde(db);
      const ids = results.map((row) => row.id);
      expect(ids).toEqual(["live-clyde"]);
    });
  });
});
describe("searchVector sqlite-vec KNN", () => {
const { DatabaseSync } = requireNodeSqlite();

View File

@@ -308,7 +308,6 @@ async function searchChunksByEmbedding(params: {
export async function searchKeyword(params: {
db: DatabaseSync;
ftsTable: string;
providerModel: string | undefined;
query: string;
ftsTokenizer?: "unicode61" | "trigram";
limit: number;
@@ -330,9 +329,9 @@ export async function searchKeyword(params: {
return [];
}
// When providerModel is undefined (FTS-only mode), search all models
const modelClause = params.providerModel ? " AND model = ?" : "";
const modelParams = params.providerModel ? [params.providerModel] : [];
// Lexical FTS is model-agnostic (issue #48300), but old databases may
// already contain orphaned FTS rows from prior model-scoped cleanup.
const liveChunkClause = ` AND EXISTS (SELECT 1 FROM chunks c WHERE c.id = ${params.ftsTable}.id)`;
const substringClause = plan.substringTerms.map(() => " AND text LIKE ? ESCAPE '\\'").join("");
const substringParams = plan.substringTerms.map((term) => `%${escapeLikePattern(term)}%`);
@@ -354,14 +353,13 @@ export async function searchKeyword(params: {
`SELECT id, path, source, start_line, end_line, text,\n` +
` bm25(${params.ftsTable}) AS rank\n` +
` FROM ${params.ftsTable}\n` +
` WHERE ${params.ftsTable} MATCH ?${substringClause}${modelClause}${params.sourceFilter.sql}\n` +
` WHERE ${params.ftsTable} MATCH ?${substringClause}${liveChunkClause}${params.sourceFilter.sql}\n` +
` ORDER BY rank ASC\n` +
` LIMIT ?`,
)
.all(
plan.matchQuery,
...substringParams,
...modelParams,
...params.sourceFilter.params,
params.limit,
) as typeof rows;
@@ -381,12 +379,11 @@ export async function searchKeyword(params: {
`SELECT id, path, source, start_line, end_line, text,\n` +
` 0 AS rank\n` +
` FROM ${params.ftsTable}\n` +
` WHERE 1=1${fallbackLikeClause}${modelClause}${params.sourceFilter.sql}\n` +
` WHERE 1=1${fallbackLikeClause}${liveChunkClause}${params.sourceFilter.sql}\n` +
` LIMIT ?`,
)
.all(
...fallbackLikeParams,
...modelParams,
...params.sourceFilter.params,
params.limit,
) as typeof rows;
@@ -397,12 +394,11 @@ export async function searchKeyword(params: {
`SELECT id, path, source, start_line, end_line, text,\n` +
` 0 AS rank\n` +
` FROM ${params.ftsTable}\n` +
` WHERE 1=1${substringClause}${modelClause}${params.sourceFilter.sql}\n` +
` WHERE 1=1${substringClause}${liveChunkClause}${params.sourceFilter.sql}\n` +
` LIMIT ?`,
)
.all(
...substringParams,
...modelParams,
...params.sourceFilter.params,
params.limit,
) as typeof rows;

View File

@@ -1616,9 +1616,9 @@ export abstract class MemoryManagerSyncOps {
`DELETE FROM ${VECTOR_TABLE} WHERE id IN (SELECT id FROM chunks WHERE path = ? AND source = ?)`,
)
: null;
const deleteFtsRowsByPathSourceAndModel =
const deleteFtsRowsByPathAndSource =
this.fts.enabled && this.fts.available
? this.db.prepare(`DELETE FROM ${FTS_TABLE} WHERE path = ? AND source = ? AND model = ?`)
? this.db.prepare(`DELETE FROM ${FTS_TABLE} WHERE path = ? AND source = ?`)
: null;
const targetSessionFiles = params.needsFullReindex
@@ -1734,13 +1734,9 @@ export abstract class MemoryManagerSyncOps {
} catch {}
}
deleteChunksByPathAndSource.run(stale.path, "sessions");
if (deleteFtsRowsByPathSourceAndModel) {
if (deleteFtsRowsByPathAndSource) {
try {
deleteFtsRowsByPathSourceAndModel.run(
stale.path,
"sessions",
this.provider?.model ?? "fts-only",
);
deleteFtsRowsByPathAndSource.run(stale.path, "sessions");
} catch {}
}
} finally {

View File

@@ -900,12 +900,9 @@ export class MemoryIndexManager extends MemoryManagerEmbeddingOps implements Mem
return [];
}
const sourceFilter = this.buildSourceFilter(undefined, sourceFilterList);
// In FTS-only mode (no provider), search all models; otherwise filter by current provider's model
const providerModel = this.provider?.model;
const results = await searchKeyword({
db: this.db,
ftsTable: FTS_TABLE,
providerModel,
query,
ftsTokenizer: this.settings.store.fts.tokenizer,
limit,