mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 19:31:00 +00:00
fix(memory-core): bound fallback vector chunk scoring
- Stream fallback Memory Core vector scoring with SQLite iterate() and a bounded top-K result set.
- Add regression coverage and live-main lint/boundary helper repairs.
- Supersedes #73069.

Thanks @parkertoddbrooks.
This commit is contained in:
committed by
GitHub
parent
56875c4d32
commit
864c4f7ff4
@@ -3,7 +3,7 @@ import {
|
||||
loadSqliteVecExtension,
|
||||
requireNodeSqlite,
|
||||
} from "openclaw/plugin-sdk/memory-core-host-engine-storage";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
import { bm25RankToScore, buildFtsQuery } from "./hybrid.js";
|
||||
import { searchKeyword, searchVector } from "./manager-search.js";
|
||||
|
||||
@@ -182,6 +182,98 @@ describe("searchKeyword trigram fallback", () => {
|
||||
describe("searchVector sqlite-vec KNN", () => {
|
||||
const { DatabaseSync } = requireNodeSqlite();
|
||||
|
||||
it("streams fallback chunk scoring without materializing candidates", async () => {
|
||||
type ChunkRow = {
|
||||
id: string;
|
||||
path: string;
|
||||
start_line: number;
|
||||
end_line: number;
|
||||
text: string;
|
||||
embedding: string;
|
||||
source: string;
|
||||
};
|
||||
type StatementWithAll = {
|
||||
all: (...params: unknown[]) => ChunkRow[];
|
||||
};
|
||||
|
||||
const db = new DatabaseSync(":memory:");
|
||||
try {
|
||||
ensureMemoryIndexSchema({
|
||||
db,
|
||||
embeddingCacheTable: "embedding_cache",
|
||||
cacheEnabled: false,
|
||||
ftsTable: "chunks_fts",
|
||||
ftsEnabled: false,
|
||||
});
|
||||
|
||||
const insertChunk = db.prepare(
|
||||
"INSERT INTO chunks (id, path, source, start_line, end_line, hash, model, text, embedding, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
|
||||
);
|
||||
const addChunk = (params: { id: string; model: string; vector: [number, number] }) => {
|
||||
insertChunk.run(
|
||||
params.id,
|
||||
`memory/${params.id}.md`,
|
||||
"memory",
|
||||
1,
|
||||
1,
|
||||
params.id,
|
||||
params.model,
|
||||
`chunk ${params.id}`,
|
||||
JSON.stringify(params.vector),
|
||||
1,
|
||||
);
|
||||
};
|
||||
addChunk({ id: "target-1", model: "target-model", vector: [1, 0] });
|
||||
addChunk({ id: "target-2", model: "target-model", vector: [0.8, 0.2] });
|
||||
addChunk({ id: "target-3", model: "target-model", vector: [0, 1] });
|
||||
addChunk({ id: "other-1", model: "other-model", vector: [1, 0] });
|
||||
|
||||
const prepareTarget = db as unknown as { prepare: (sql: string) => unknown };
|
||||
const originalPrepare = prepareTarget.prepare.bind(db);
|
||||
const chunkRows = (
|
||||
originalPrepare(
|
||||
"SELECT id, path, start_line, end_line, text, embedding, source\n" +
|
||||
" FROM chunks\n" +
|
||||
" WHERE model = ?",
|
||||
) as StatementWithAll
|
||||
).all("target-model");
|
||||
const prepareSpy = vi.spyOn(prepareTarget, "prepare").mockImplementation((sql: string) => {
|
||||
if (
|
||||
sql.includes("SELECT id, path, start_line, end_line, text, embedding, source") &&
|
||||
sql.includes("FROM chunks")
|
||||
) {
|
||||
return {
|
||||
all: () => {
|
||||
throw new Error("fallback vector search must stream rows via iterate()");
|
||||
},
|
||||
iterate: () => chunkRows[Symbol.iterator](),
|
||||
};
|
||||
}
|
||||
return originalPrepare(sql);
|
||||
});
|
||||
|
||||
try {
|
||||
const results = await searchVector({
|
||||
db,
|
||||
vectorTable: "chunks_vec",
|
||||
providerModel: "target-model",
|
||||
queryVec: [1, 0],
|
||||
limit: 2,
|
||||
snippetMaxChars: 200,
|
||||
ensureVectorReady: async () => false,
|
||||
sourceFilterVec: { sql: "", params: [] },
|
||||
sourceFilterChunks: { sql: "", params: [] },
|
||||
});
|
||||
|
||||
expect(results.map((row) => row.id)).toEqual(["target-1", "target-2"]);
|
||||
} finally {
|
||||
prepareSpy.mockRestore();
|
||||
}
|
||||
} finally {
|
||||
db.close();
|
||||
}
|
||||
});
|
||||
|
||||
it("fills the requested limit after model filters prune nearest KNN candidates", async () => {
|
||||
const db = new DatabaseSync(":memory:", { allowExtension: true });
|
||||
try {
|
||||
|
||||
@@ -205,51 +205,34 @@ export async function searchVector(params: {
|
||||
}));
|
||||
}
|
||||
|
||||
const candidates = listChunks({
|
||||
return searchChunksByEmbedding({
|
||||
db: params.db,
|
||||
providerModel: params.providerModel,
|
||||
sourceFilter: params.sourceFilterChunks,
|
||||
queryVec: params.queryVec,
|
||||
limit: params.limit,
|
||||
snippetMaxChars: params.snippetMaxChars,
|
||||
});
|
||||
const scored = candidates
|
||||
.map((chunk) => ({
|
||||
chunk,
|
||||
score: cosineSimilarity(params.queryVec, chunk.embedding),
|
||||
}))
|
||||
.filter((entry) => Number.isFinite(entry.score));
|
||||
return scored
|
||||
.toSorted((a, b) => b.score - a.score)
|
||||
.slice(0, params.limit)
|
||||
.map((entry) => ({
|
||||
id: entry.chunk.id,
|
||||
path: entry.chunk.path,
|
||||
startLine: entry.chunk.startLine,
|
||||
endLine: entry.chunk.endLine,
|
||||
score: entry.score,
|
||||
snippet: truncateUtf16Safe(entry.chunk.text, params.snippetMaxChars),
|
||||
source: entry.chunk.source,
|
||||
}));
|
||||
}
|
||||
|
||||
export function listChunks(params: {
|
||||
export function searchChunksByEmbedding(params: {
|
||||
db: DatabaseSync;
|
||||
providerModel: string;
|
||||
sourceFilter: { sql: string; params: SearchSource[] };
|
||||
}): Array<{
|
||||
id: string;
|
||||
path: string;
|
||||
startLine: number;
|
||||
endLine: number;
|
||||
text: string;
|
||||
embedding: number[];
|
||||
source: SearchSource;
|
||||
}> {
|
||||
queryVec: number[];
|
||||
limit: number;
|
||||
snippetMaxChars: number;
|
||||
}): SearchRowResult[] {
|
||||
if (params.limit <= 0) {
|
||||
return [];
|
||||
}
|
||||
const rows = params.db
|
||||
.prepare(
|
||||
`SELECT id, path, start_line, end_line, text, embedding, source\n` +
|
||||
` FROM chunks\n` +
|
||||
` WHERE model = ?${params.sourceFilter.sql}`,
|
||||
)
|
||||
.all(params.providerModel, ...params.sourceFilter.params) as Array<{
|
||||
.iterate(params.providerModel, ...params.sourceFilter.params) as IterableIterator<{
|
||||
id: string;
|
||||
path: string;
|
||||
start_line: number;
|
||||
@@ -259,15 +242,36 @@ export function listChunks(params: {
|
||||
source: SearchSource;
|
||||
}>;
|
||||
|
||||
return rows.map((row) => ({
|
||||
id: row.id,
|
||||
path: row.path,
|
||||
startLine: row.start_line,
|
||||
endLine: row.end_line,
|
||||
text: row.text,
|
||||
embedding: parseEmbedding(row.embedding),
|
||||
source: row.source,
|
||||
}));
|
||||
const topResults: SearchRowResult[] = [];
|
||||
for (const row of rows) {
|
||||
const score = cosineSimilarity(params.queryVec, parseEmbedding(row.embedding));
|
||||
if (!Number.isFinite(score)) {
|
||||
continue;
|
||||
}
|
||||
const result: SearchRowResult = {
|
||||
id: row.id,
|
||||
path: row.path,
|
||||
startLine: row.start_line,
|
||||
endLine: row.end_line,
|
||||
score,
|
||||
snippet: truncateUtf16Safe(row.text, params.snippetMaxChars),
|
||||
source: row.source,
|
||||
};
|
||||
if (topResults.length < params.limit) {
|
||||
topResults.push(result);
|
||||
if (topResults.length === params.limit) {
|
||||
topResults.sort((a, b) => b.score - a.score);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
const lowest = topResults.at(-1);
|
||||
if (lowest && result.score > lowest.score) {
|
||||
topResults[topResults.length - 1] = result;
|
||||
topResults.sort((a, b) => b.score - a.score);
|
||||
}
|
||||
}
|
||||
topResults.sort((a, b) => b.score - a.score);
|
||||
return topResults;
|
||||
}
|
||||
|
||||
export async function searchKeyword(params: {
|
||||
|
||||
Reference in New Issue
Block a user