Files
openclaw/src/memory/embedding-chunk-limits.ts
2026-02-22 15:40:18 -08:00

36 lines
1.1 KiB
TypeScript

import { estimateUtf8Bytes, splitTextToUtf8ByteLimit } from "./embedding-input-limits.js";
import { resolveEmbeddingMaxInputTokens } from "./embedding-model-limits.js";
import type { EmbeddingProvider } from "./embeddings.js";
import { hashText, type MemoryChunk } from "./internal.js";
/**
 * Returns a chunk list in which every oversized chunk has been split to fit
 * the embedding input limit.
 *
 * The limit is the provider's value from `resolveEmbeddingMaxInputTokens`,
 * lowered to `hardMaxInputTokens` when that argument is a positive number.
 * Chunk size is measured with `estimateUtf8Bytes`, i.e. the UTF-8 byte
 * estimate is compared directly against the token limit.
 * NOTE(review): presumably bytes serve as a conservative stand-in for
 * tokens here — confirm against the helper modules.
 *
 * @param provider - Embedding provider whose input limit is resolved.
 * @param chunks - Chunks to check; this array is not modified.
 * @param hardMaxInputTokens - Optional extra cap; ignored unless it is a
 *   number greater than zero.
 * @returns A new array. Chunks within the limit are passed through as the
 *   same object; each oversized chunk is replaced, in place order, by one
 *   new chunk per piece produced by `splitTextToUtf8ByteLimit`.
 */
export function enforceEmbeddingMaxInputTokens(
  provider: EmbeddingProvider,
  chunks: MemoryChunk[],
  hardMaxInputTokens?: number,
): MemoryChunk[] {
  // Start from the provider's limit and tighten it only for a usable cap.
  let limit = resolveEmbeddingMaxInputTokens(provider);
  if (typeof hardMaxInputTokens === "number" && hardMaxInputTokens > 0) {
    limit = Math.min(limit, hardMaxInputTokens);
  }

  const result: MemoryChunk[] = [];
  for (const original of chunks) {
    const { startLine, endLine, text: fullText } = original;
    const fits = estimateUtf8Bytes(fullText) <= limit;

    if (fits) {
      result.push(original);
    } else {
      // Every piece keeps the parent's line range and gets a hash of its
      // own text; only these four fields are set on the new chunks.
      for (const piece of splitTextToUtf8ByteLimit(fullText, limit)) {
        result.push({
          startLine,
          endLine,
          text: piece,
          hash: hashText(piece),
        });
      }
    }
  }
  return result;
}