Files
openclaw/src/memory/embedding-chunk-limits.test.ts
2026-02-22 15:40:18 -08:00

103 lines
3.1 KiB
TypeScript

import { describe, expect, it } from "vitest";
import { enforceEmbeddingMaxInputTokens } from "./embedding-chunk-limits.js";
import { estimateUtf8Bytes } from "./embedding-input-limits.js";
import type { EmbeddingProvider } from "./embeddings.js";
function createProvider(maxInputTokens: number): EmbeddingProvider {
return {
id: "mock",
model: "mock-embed",
maxInputTokens,
embedQuery: async () => [0],
embedBatch: async () => [[0]],
};
}
function createProviderWithoutMaxInputTokens(params: {
id: string;
model: string;
}): EmbeddingProvider {
return {
id: params.id,
model: params.model,
embedQuery: async () => [0],
embedBatch: async () => [[0]],
};
}
describe("embedding chunk limits", () => {
it("splits oversized chunks so each embedding input stays <= maxInputTokens bytes", () => {
const provider = createProvider(8192);
const input = {
startLine: 1,
endLine: 1,
text: "x".repeat(9000),
hash: "ignored",
};
const out = enforceEmbeddingMaxInputTokens(provider, [input]);
expect(out.length).toBeGreaterThan(1);
expect(out.map((chunk) => chunk.text).join("")).toBe(input.text);
expect(out.every((chunk) => estimateUtf8Bytes(chunk.text) <= 8192)).toBe(true);
expect(out.every((chunk) => chunk.startLine === 1 && chunk.endLine === 1)).toBe(true);
expect(out.every((chunk) => typeof chunk.hash === "string" && chunk.hash.length > 0)).toBe(
true,
);
});
it("does not split inside surrogate pairs (emoji)", () => {
const provider = createProvider(8192);
const emoji = "😀";
const inputText = `${emoji.repeat(2100)}\n${emoji.repeat(2100)}`;
const out = enforceEmbeddingMaxInputTokens(provider, [
{ startLine: 1, endLine: 2, text: inputText, hash: "ignored" },
]);
expect(out.length).toBeGreaterThan(1);
expect(out.map((chunk) => chunk.text).join("")).toBe(inputText);
expect(out.every((chunk) => estimateUtf8Bytes(chunk.text) <= 8192)).toBe(true);
// If we split inside surrogate pairs we'd likely end up with replacement chars.
expect(out.map((chunk) => chunk.text).join("")).not.toContain("\uFFFD");
});
it("uses conservative fallback limits for local providers without declared maxInputTokens", () => {
const provider = createProviderWithoutMaxInputTokens({
id: "local",
model: "unknown-local-embedding",
});
const out = enforceEmbeddingMaxInputTokens(provider, [
{
startLine: 1,
endLine: 1,
text: "x".repeat(3000),
hash: "ignored",
},
]);
expect(out.length).toBeGreaterThan(1);
expect(out.every((chunk) => estimateUtf8Bytes(chunk.text) <= 2048)).toBe(true);
});
it("honors hard safety caps lower than provider maxInputTokens", () => {
const provider = createProvider(8192);
const out = enforceEmbeddingMaxInputTokens(
provider,
[
{
startLine: 1,
endLine: 1,
text: "x".repeat(8100),
hash: "ignored",
},
],
8000,
);
expect(out.length).toBeGreaterThan(1);
expect(out.every((chunk) => estimateUtf8Bytes(chunk.text) <= 8000)).toBe(true);
});
});