fix: enforce embedding model token limit to prevent overflow (#13455)

* fix: enforce embedding model token limit to prevent 8192 overflow

  - Replace EMBEDDING_APPROX_CHARS_PER_TOKEN=1 with UTF-8 byte length estimation (a safe upper bound for tokenizer output)
  - Add EMBEDDING_MODEL_MAX_TOKENS=8192 hard cap
  - Add splitChunkToTokenLimit() that binary-searches for the largest safe split point, with surrogate pair handling
  - Add enforceChunkTokenLimit() wrapper called in indexFile() after chunkMarkdown(), before any embedding API call
  - Fixes: session files with large JSONL entries could produce chunks exceeding text-embedding-3-small's 8192 token limit

  Tests: 2 new colocated tests in manager.embedding-token-limit.test.ts
  - Verifies oversized ASCII chunks are split to <=8192 bytes each
  - Verifies multibyte (emoji) content batching respects byte limits

* fix: make embedding token limit provider-aware

  - Add optional maxInputTokens to the EmbeddingProvider interface
  - Each provider (openai, gemini, voyage) reports its own limit
  - Known-limits map as fallback: openai 8192, gemini 2048, voyage 32K
  - Resolution order: provider field > known map > default 8192
  - Backward compatible: local/llama uses the fallback

* fix: enforce embedding input size limits (#13455) (thanks @rodrigouroz)

---------

Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
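The byte-length bound the first note relies on, in one line (illustrative snippet, not part of the diff; assumes a byte-level tokenizer such as the BPE family used by these embedding models):

// Every token covers at least one byte, so UTF-8 byte length can only
// over-count tokens, never under-count them.
const sample = "hello world 😀"; // 14 UTF-16 code units, 16 UTF-8 bytes
console.log(Buffer.byteLength(sample, "utf8")); // 16, a safe cap whatever the tokenizer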
src/memory/embedding-chunk-limits.ts (new file, 30 lines)
@@ -0,0 +1,30 @@
import type { EmbeddingProvider } from "./embeddings.js";
import { estimateUtf8Bytes, splitTextToUtf8ByteLimit } from "./embedding-input-limits.js";
import { resolveEmbeddingMaxInputTokens } from "./embedding-model-limits.js";
import { hashText, type MemoryChunk } from "./internal.js";

export function enforceEmbeddingMaxInputTokens(
  provider: EmbeddingProvider,
  chunks: MemoryChunk[],
): MemoryChunk[] {
  const maxInputTokens = resolveEmbeddingMaxInputTokens(provider);
  const out: MemoryChunk[] = [];

  for (const chunk of chunks) {
    if (estimateUtf8Bytes(chunk.text) <= maxInputTokens) {
      out.push(chunk);
      continue;
    }

    for (const text of splitTextToUtf8ByteLimit(chunk.text, maxInputTokens)) {
      out.push({
        startLine: chunk.startLine,
        endLine: chunk.endLine,
        text,
        hash: hashText(text),
      });
    }
  }

  return out;
}
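For orientation, a minimal usage sketch of the new helper (illustrative only, not part of the commit; hashText and the chunk shape come from the file above, and the embedQuery/embedBatch functions on the provider literal are stand-ins):

// A ~20,000-byte ASCII chunk against an 8192-token provider splits into three
// sub-chunks; each keeps the original line range and gets a fresh hash.
const provider = { id: "openai", model: "text-embedding-3-small", embedQuery, embedBatch };
const big = { startLine: 1, endLine: 400, text: "x".repeat(20_000), hash: hashText("x".repeat(20_000)) };
const limited = enforceEmbeddingMaxInputTokens(provider, [big]);
// limited.length === 3, and every limited[i].text is <= 8192 UTF-8 bytes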
src/memory/embedding-input-limits.ts (new file, 67 lines)
@@ -0,0 +1,67 @@
// Helpers for enforcing embedding model input size limits.
//
// We use UTF-8 byte length as a conservative upper bound for tokenizer output.
// Tokenizers operate over bytes; a token must contain at least one byte, so
// token_count <= utf8_byte_length.

export function estimateUtf8Bytes(text: string): number {
  if (!text) {
    return 0;
  }
  return Buffer.byteLength(text, "utf8");
}

export function splitTextToUtf8ByteLimit(text: string, maxUtf8Bytes: number): string[] {
  if (maxUtf8Bytes <= 0) {
    return [text];
  }
  if (estimateUtf8Bytes(text) <= maxUtf8Bytes) {
    return [text];
  }

  const parts: string[] = [];
  let cursor = 0;
  while (cursor < text.length) {
    // The number of UTF-16 code units is always <= the number of UTF-8 bytes.
    // This makes `cursor + maxUtf8Bytes` a safe upper bound on the next split point.
    let low = cursor + 1;
    let high = Math.min(text.length, cursor + maxUtf8Bytes);
    let best = cursor;

    while (low <= high) {
      const mid = Math.floor((low + high) / 2);
      const bytes = estimateUtf8Bytes(text.slice(cursor, mid));
      if (bytes <= maxUtf8Bytes) {
        best = mid;
        low = mid + 1;
      } else {
        high = mid - 1;
      }
    }

    if (best <= cursor) {
      best = Math.min(text.length, cursor + 1);
    }

    // Avoid splitting inside a surrogate pair.
    if (
      best < text.length &&
      best > cursor &&
      text.charCodeAt(best - 1) >= 0xd800 &&
      text.charCodeAt(best - 1) <= 0xdbff &&
      text.charCodeAt(best) >= 0xdc00 &&
      text.charCodeAt(best) <= 0xdfff
    ) {
      best -= 1;
    }

    const part = text.slice(cursor, best);
    if (!part) {
      break;
    }
    parts.push(part);
    cursor = best;
  }

  return parts;
}
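To see why the surrogate-pair guard matters (illustrative trace, not from the commit): "😀" is one astral code point, two UTF-16 code units, and four UTF-8 bytes; a split at an odd code-unit boundary would strand half the pair, which re-encodes as a 3-byte replacement character.

// With a 10-byte budget, two emoji (8 bytes) fit per part and no part ends
// in a lone high surrogate:
const parts = splitTextToUtf8ByteLimit("😀".repeat(10), 10);
// parts => ["😀😀", "😀😀", "😀😀", "😀😀", "😀😀"]
for (const part of parts) {
  console.assert(Buffer.byteLength(part, "utf8") <= 10);
  console.assert(!/[\ud800-\udbff]$/.test(part)); // never ends mid-pair
}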
src/memory/embedding-model-limits.ts (new file, 35 lines)
@@ -0,0 +1,35 @@
import type { EmbeddingProvider } from "./embeddings.js";

const DEFAULT_EMBEDDING_MAX_INPUT_TOKENS = 8192;

const KNOWN_EMBEDDING_MAX_INPUT_TOKENS: Record<string, number> = {
  "openai:text-embedding-3-small": 8192,
  "openai:text-embedding-3-large": 8192,
  "openai:text-embedding-ada-002": 8191,
  "gemini:text-embedding-004": 2048,
  "voyage:voyage-3": 32000,
  "voyage:voyage-3-lite": 16000,
  "voyage:voyage-code-3": 32000,
};

export function resolveEmbeddingMaxInputTokens(provider: EmbeddingProvider): number {
  if (typeof provider.maxInputTokens === "number") {
    return provider.maxInputTokens;
  }

  // Provider/model mapping is best-effort; different providers use different
  // limits and we prefer to be conservative when we don't know.
  const key = `${provider.id}:${provider.model}`.toLowerCase();
  const known = KNOWN_EMBEDDING_MAX_INPUT_TOKENS[key];
  if (typeof known === "number") {
    return known;
  }

  // Provider-specific conservative fallbacks. This prevents us from accidentally
  // using the OpenAI default for providers with much smaller limits.
  if (provider.id.toLowerCase() === "gemini") {
    return 2048;
  }

  return DEFAULT_EMBEDDING_MAX_INPUT_TOKENS;
}
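The resolution order (explicit provider field, then the known map, then the gemini fallback, then the 8192 default) in a quick sketch (illustrative calls; embedQuery/embedBatch and the unmapped model names are stand-ins):

resolveEmbeddingMaxInputTokens({ id: "openai", model: "text-embedding-3-small", maxInputTokens: 4000, embedQuery, embedBatch }); // 4000 (field wins)
resolveEmbeddingMaxInputTokens({ id: "voyage", model: "voyage-3-lite", embedQuery, embedBatch }); // 16000 (known map)
resolveEmbeddingMaxInputTokens({ id: "gemini", model: "some-future-model", embedQuery, embedBatch }); // 2048 (gemini fallback)
resolveEmbeddingMaxInputTokens({ id: "local", model: "llama-embed", embedQuery, embedBatch }); // 8192 (default)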
src/memory/embeddings-gemini.ts
@@ -12,6 +12,9 @@ export type GeminiEmbeddingClient = {
 
 const DEFAULT_GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
 export const DEFAULT_GEMINI_EMBEDDING_MODEL = "gemini-embedding-001";
+const GEMINI_MAX_INPUT_TOKENS: Record<string, number> = {
+  "text-embedding-004": 2048,
+};
 const debugEmbeddings = isTruthyEnvValue(process.env.OPENCLAW_DEBUG_MEMORY_EMBEDDINGS);
 const log = createSubsystemLogger("memory/embeddings");
 
@@ -117,6 +120,7 @@ export async function createGeminiEmbeddingProvider(
     provider: {
       id: "gemini",
       model: client.model,
+      maxInputTokens: GEMINI_MAX_INPUT_TOKENS[client.model],
       embedQuery,
       embedBatch,
     },
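Worth noting (an observation about the lines above, not a change): the map only covers "text-embedding-004", so for the default "gemini-embedding-001" the lookup yields undefined and resolveEmbeddingMaxInputTokens() falls through to its gemini-specific 2048 fallback.

GEMINI_MAX_INPUT_TOKENS["text-embedding-004"];   // 2048 (mapped)
GEMINI_MAX_INPUT_TOKENS["gemini-embedding-001"]; // undefined -> resolver falls back to 2048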
src/memory/embeddings-openai.ts
@@ -9,6 +9,11 @@ export type OpenAiEmbeddingClient = {
 
 export const DEFAULT_OPENAI_EMBEDDING_MODEL = "text-embedding-3-small";
 const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
+const OPENAI_MAX_INPUT_TOKENS: Record<string, number> = {
+  "text-embedding-3-small": 8192,
+  "text-embedding-3-large": 8192,
+  "text-embedding-ada-002": 8191,
+};
 
 export function normalizeOpenAiModel(model: string): string {
   const trimmed = model.trim();
@@ -51,6 +56,7 @@ export async function createOpenAiEmbeddingProvider(
     provider: {
       id: "openai",
       model: client.model,
+      maxInputTokens: OPENAI_MAX_INPUT_TOKENS[client.model],
       embedQuery: async (text) => {
         const [vec] = await embed([text]);
         return vec ?? [];
src/memory/embeddings-voyage.ts
@@ -9,6 +9,11 @@ export type VoyageEmbeddingClient = {
 
 export const DEFAULT_VOYAGE_EMBEDDING_MODEL = "voyage-4-large";
 const DEFAULT_VOYAGE_BASE_URL = "https://api.voyageai.com/v1";
+const VOYAGE_MAX_INPUT_TOKENS: Record<string, number> = {
+  "voyage-3": 32000,
+  "voyage-3-lite": 16000,
+  "voyage-code-3": 32000,
+};
 
 export function normalizeVoyageModel(model: string): string {
   const trimmed = model.trim();
@@ -59,6 +64,7 @@ export async function createVoyageEmbeddingProvider(
     provider: {
       id: "voyage",
       model: client.model,
+      maxInputTokens: VOYAGE_MAX_INPUT_TOKENS[client.model],
       embedQuery: async (text) => {
         const [vec] = await embed([text], "query");
         return vec ?? [];
src/memory/embeddings.ts
@@ -24,6 +24,7 @@ export type { VoyageEmbeddingClient } from "./embeddings-voyage.js";
 export type EmbeddingProvider = {
   id: string;
   model: string;
+  maxInputTokens?: number;
   embedQuery: (text: string) => Promise<number[]>;
   embedBatch: (texts: string[]) => Promise<number[][]>;
 };
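Because maxInputTokens is optional, existing providers keep compiling unchanged; a minimal conforming provider might look like this (illustrative sketch, not from the commit; the model name is hypothetical):

const localProvider: EmbeddingProvider = {
  id: "local",
  model: "llama-embed", // hypothetical model name
  // maxInputTokens omitted: resolveEmbeddingMaxInputTokens() falls back to 8192
  embedQuery: async () => new Array(8).fill(0),
  embedBatch: async (texts) => texts.map(() => new Array(8).fill(0)),
};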
src/memory/manager.embedding-token-limit.test.ts (new file, 120 lines)
@@ -0,0 +1,120 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { getMemorySearchManager, type MemoryIndexManager } from "./index.js";

const embedBatch = vi.fn(async (texts: string[]) => texts.map(() => [0, 1, 0]));
const embedQuery = vi.fn(async () => [0, 1, 0]);

vi.mock("./embeddings.js", () => ({
  createEmbeddingProvider: async () => ({
    requestedProvider: "openai",
    provider: {
      id: "mock",
      model: "mock-embed",
      maxInputTokens: 8192,
      embedQuery,
      embedBatch,
    },
  }),
}));

describe("memory embedding token limits", () => {
  let workspaceDir: string;
  let indexPath: string;
  let manager: MemoryIndexManager | null = null;

  beforeEach(async () => {
    embedBatch.mockReset();
    embedQuery.mockReset();
    embedBatch.mockImplementation(async (texts: string[]) => texts.map(() => [0, 1, 0]));
    embedQuery.mockImplementation(async () => [0, 1, 0]);
    workspaceDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-mem-token-"));
    indexPath = path.join(workspaceDir, "index.sqlite");
    await fs.mkdir(path.join(workspaceDir, "memory"));
  });

  afterEach(async () => {
    if (manager) {
      await manager.close();
      manager = null;
    }
    await fs.rm(workspaceDir, { recursive: true, force: true });
  });

  it("splits oversized chunks so each embedding input stays <= 8192 UTF-8 bytes", async () => {
    const content = "x".repeat(9500);
    await fs.writeFile(path.join(workspaceDir, "memory", "2026-01-09.md"), content);

    const cfg = {
      agents: {
        defaults: {
          workspace: workspaceDir,
          memorySearch: {
            provider: "openai",
            model: "mock-embed",
            store: { path: indexPath },
            chunking: { tokens: 10_000, overlap: 0 },
            sync: { watch: false, onSessionStart: false, onSearch: false },
            query: { minScore: 0 },
          },
        },
        list: [{ id: "main", default: true }],
      },
    };

    const result = await getMemorySearchManager({ cfg, agentId: "main" });
    expect(result.manager).not.toBeNull();
    if (!result.manager) {
      throw new Error("manager missing");
    }
    manager = result.manager;
    await manager.sync({ force: true });

    const inputs = embedBatch.mock.calls.flatMap((call) => call[0] ?? []);
    expect(inputs.length).toBeGreaterThan(1);
    expect(
      Math.max(...inputs.map((input) => Buffer.byteLength(input, "utf8"))),
    ).toBeLessThanOrEqual(8192);
  });

  it("uses UTF-8 byte estimates when batching multibyte chunks", async () => {
    const line = "😀".repeat(1800);
    const content = `${line}\n${line}\n${line}`;
    await fs.writeFile(path.join(workspaceDir, "memory", "2026-01-10.md"), content);

    const cfg = {
      agents: {
        defaults: {
          workspace: workspaceDir,
          memorySearch: {
            provider: "openai",
            model: "mock-embed",
            store: { path: indexPath },
            chunking: { tokens: 1000, overlap: 0 },
            sync: { watch: false, onSessionStart: false, onSearch: false },
            query: { minScore: 0 },
          },
        },
        list: [{ id: "main", default: true }],
      },
    };

    const result = await getMemorySearchManager({ cfg, agentId: "main" });
    expect(result.manager).not.toBeNull();
    if (!result.manager) {
      throw new Error("manager missing");
    }
    manager = result.manager;
    await manager.sync({ force: true });

    const batchSizes = embedBatch.mock.calls.map(
      (call) => (call[0] as string[] | undefined)?.length ?? 0,
    );
    expect(batchSizes.length).toBe(3);
    expect(batchSizes.every((size) => size === 1)).toBe(true);
    const inputs = embedBatch.mock.calls.flatMap((call) => call[0] ?? []);
    expect(inputs.every((input) => Buffer.byteLength(input, "utf8") <= 8192)).toBe(true);
  });
});
src/memory/manager.ts
@@ -27,6 +27,8 @@ import {
   runOpenAiEmbeddingBatches,
 } from "./batch-openai.js";
 import { type VoyageBatchRequest, runVoyageEmbeddingBatches } from "./batch-voyage.js";
+import { enforceEmbeddingMaxInputTokens } from "./embedding-chunk-limits.js";
+import { estimateUtf8Bytes } from "./embedding-input-limits.js";
 import { DEFAULT_GEMINI_EMBEDDING_MODEL } from "./embeddings-gemini.js";
 import { DEFAULT_OPENAI_EMBEDDING_MODEL } from "./embeddings-openai.js";
 import { DEFAULT_VOYAGE_EMBEDDING_MODEL } from "./embeddings-voyage.js";
@@ -87,7 +89,6 @@ const FTS_TABLE = "chunks_fts";
 const EMBEDDING_CACHE_TABLE = "embedding_cache";
 const SESSION_DIRTY_DEBOUNCE_MS = 5000;
 const EMBEDDING_BATCH_MAX_TOKENS = 8000;
-const EMBEDDING_APPROX_CHARS_PER_TOKEN = 1;
 const EMBEDDING_INDEX_CONCURRENCY = 4;
 const EMBEDDING_RETRY_MAX_ATTEMPTS = 3;
 const EMBEDDING_RETRY_BASE_DELAY_MS = 500;
@@ -1543,20 +1544,13 @@ export class MemoryIndexManager implements MemorySearchManager {
       .run(META_KEY, value);
   }
 
-  private estimateEmbeddingTokens(text: string): number {
-    if (!text) {
-      return 0;
-    }
-    return Math.ceil(text.length / EMBEDDING_APPROX_CHARS_PER_TOKEN);
-  }
-
   private buildEmbeddingBatches(chunks: MemoryChunk[]): MemoryChunk[][] {
     const batches: MemoryChunk[][] = [];
     let current: MemoryChunk[] = [];
     let currentTokens = 0;
 
     for (const chunk of chunks) {
-      const estimate = this.estimateEmbeddingTokens(chunk.text);
+      const estimate = estimateUtf8Bytes(chunk.text);
       const wouldExceed =
         current.length > 0 && currentTokens + estimate > EMBEDDING_BATCH_MAX_TOKENS;
       if (wouldExceed) {
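The batching rule above, as a standalone sketch (illustrative, not the class method itself): byte estimates of 5000, 4000, and 2000 against the 8000 cap produce batches [5000] and [4000, 2000].

function batchByByteBudget(sizes: number[], maxBytes: number): number[][] {
  const batches: number[][] = [];
  let current: number[] = [];
  let total = 0;
  for (const size of sizes) {
    // Flush the running batch when adding this item would exceed the cap.
    if (current.length > 0 && total + size > maxBytes) {
      batches.push(current);
      current = [];
      total = 0;
    }
    current.push(size);
    total += size;
  }
  if (current.length > 0) batches.push(current);
  return batches;
}
// batchByByteBudget([5000, 4000, 2000], 8000) => [[5000], [4000, 2000]]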
@@ -2206,8 +2200,11 @@ export class MemoryIndexManager implements MemorySearchManager {
     options: { source: MemorySource; content?: string },
   ) {
     const content = options.content ?? (await fs.readFile(entry.absPath, "utf-8"));
-    const chunks = chunkMarkdown(content, this.settings.chunking).filter(
-      (chunk) => chunk.text.trim().length > 0,
+    const chunks = enforceEmbeddingMaxInputTokens(
+      this.provider,
+      chunkMarkdown(content, this.settings.chunking).filter(
+        (chunk) => chunk.text.trim().length > 0,
+      ),
     );
     if (options.source === "sessions" && "lineMap" in entry) {
       remapChunkLines(chunks, entry.lineMap);