From 7f1712c1ba134f030276c28d78f9d0c47cdebe28 Mon Sep 17 00:00:00 2001 From: Rodrigo Uroz Date: Tue, 10 Feb 2026 23:10:17 -0300 Subject: [PATCH] (fix): enforce embedding model token limit to prevent overflow (#13455) * fix: enforce embedding model token limit to prevent 8192 overflow - Replace EMBEDDING_APPROX_CHARS_PER_TOKEN=1 with UTF-8 byte length estimation (safe upper bound for tokenizer output) - Add EMBEDDING_MODEL_MAX_TOKENS=8192 hard cap - Add splitChunkToTokenLimit() that binary-searches for the largest safe split point, with surrogate pair handling - Add enforceChunkTokenLimit() wrapper called in indexFile() after chunkMarkdown(), before any embedding API call - Fixes: session files with large JSONL entries could produce chunks exceeding text-embedding-3-small's 8192 token limit Tests: 2 new colocated tests in manager.embedding-token-limit.test.ts - Verifies oversized ASCII chunks are split to <=8192 bytes each - Verifies multibyte (emoji) content batching respects byte limits * fix: make embedding token limit provider-aware - Add optional maxInputTokens to EmbeddingProvider interface - Each provider (openai, gemini, voyage) reports its own limit - Known-limits map as fallback: openai 8192, gemini 2048, voyage 32K - Resolution: provider field > known map > default 8192 - Backward compatible: local/llama uses fallback * fix: enforce embedding input size limits (#13455) (thanks @rodrigouroz) --------- Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com> --- src/memory/embedding-chunk-limits.ts | 30 +++++ src/memory/embedding-input-limits.ts | 67 ++++++++++ src/memory/embedding-model-limits.ts | 35 +++++ src/memory/embeddings-gemini.ts | 4 + src/memory/embeddings-openai.ts | 6 + src/memory/embeddings-voyage.ts | 6 + src/memory/embeddings.ts | 1 + .../manager.embedding-token-limit.test.ts | 120 ++++++++++++++++++ src/memory/manager.ts | 19 ++- 9 files changed, 277 insertions(+), 11 deletions(-) create mode 100644 
src/memory/embedding-chunk-limits.ts create mode 100644 src/memory/embedding-input-limits.ts create mode 100644 src/memory/embedding-model-limits.ts create mode 100644 src/memory/manager.embedding-token-limit.test.ts diff --git a/src/memory/embedding-chunk-limits.ts b/src/memory/embedding-chunk-limits.ts new file mode 100644 index 00000000000..74b1637bd22 --- /dev/null +++ b/src/memory/embedding-chunk-limits.ts @@ -0,0 +1,30 @@ +import type { EmbeddingProvider } from "./embeddings.js"; +import { estimateUtf8Bytes, splitTextToUtf8ByteLimit } from "./embedding-input-limits.js"; +import { resolveEmbeddingMaxInputTokens } from "./embedding-model-limits.js"; +import { hashText, type MemoryChunk } from "./internal.js"; + +export function enforceEmbeddingMaxInputTokens( + provider: EmbeddingProvider, + chunks: MemoryChunk[], +): MemoryChunk[] { + const maxInputTokens = resolveEmbeddingMaxInputTokens(provider); + const out: MemoryChunk[] = []; + + for (const chunk of chunks) { + if (estimateUtf8Bytes(chunk.text) <= maxInputTokens) { + out.push(chunk); + continue; + } + + for (const text of splitTextToUtf8ByteLimit(chunk.text, maxInputTokens)) { + out.push({ + startLine: chunk.startLine, + endLine: chunk.endLine, + text, + hash: hashText(text), + }); + } + } + + return out; +} diff --git a/src/memory/embedding-input-limits.ts b/src/memory/embedding-input-limits.ts new file mode 100644 index 00000000000..dad83bb7aa7 --- /dev/null +++ b/src/memory/embedding-input-limits.ts @@ -0,0 +1,67 @@ +// Helpers for enforcing embedding model input size limits. +// +// We use UTF-8 byte length as a conservative upper bound for tokenizer output. +// Tokenizers operate over bytes; a token must contain at least one byte, so +// token_count <= utf8_byte_length. 
+ +export function estimateUtf8Bytes(text: string): number { + if (!text) { + return 0; + } + return Buffer.byteLength(text, "utf8"); +} + +export function splitTextToUtf8ByteLimit(text: string, maxUtf8Bytes: number): string[] { + if (maxUtf8Bytes <= 0) { + return [text]; + } + if (estimateUtf8Bytes(text) <= maxUtf8Bytes) { + return [text]; + } + + const parts: string[] = []; + let cursor = 0; + while (cursor < text.length) { + // The number of UTF-16 code units is always <= the number of UTF-8 bytes. + // This makes `cursor + maxUtf8Bytes` a safe upper bound on the next split point. + let low = cursor + 1; + let high = Math.min(text.length, cursor + maxUtf8Bytes); + let best = cursor; + + while (low <= high) { + const mid = Math.floor((low + high) / 2); + const bytes = estimateUtf8Bytes(text.slice(cursor, mid)); + if (bytes <= maxUtf8Bytes) { + best = mid; + low = mid + 1; + } else { + high = mid - 1; + } + } + + if (best <= cursor) { + best = Math.min(text.length, cursor + 1); + } + + // Avoid splitting inside a surrogate pair. 
+ if ( + best < text.length && + best > cursor && + text.charCodeAt(best - 1) >= 0xd800 && + text.charCodeAt(best - 1) <= 0xdbff && + text.charCodeAt(best) >= 0xdc00 && + text.charCodeAt(best) <= 0xdfff + ) { + best -= 1; + } + + const part = text.slice(cursor, best); + if (!part) { + break; + } + parts.push(part); + cursor = best; + } + + return parts; +} diff --git a/src/memory/embedding-model-limits.ts b/src/memory/embedding-model-limits.ts new file mode 100644 index 00000000000..0f6dad821eb --- /dev/null +++ b/src/memory/embedding-model-limits.ts @@ -0,0 +1,35 @@ +import type { EmbeddingProvider } from "./embeddings.js"; + +const DEFAULT_EMBEDDING_MAX_INPUT_TOKENS = 8192; + +const KNOWN_EMBEDDING_MAX_INPUT_TOKENS: Record<string, number> = { + "openai:text-embedding-3-small": 8192, + "openai:text-embedding-3-large": 8192, + "openai:text-embedding-ada-002": 8191, + "gemini:text-embedding-004": 2048, + "voyage:voyage-3": 32000, + "voyage:voyage-3-lite": 16000, + "voyage:voyage-code-3": 32000, +}; + +export function resolveEmbeddingMaxInputTokens(provider: EmbeddingProvider): number { + if (typeof provider.maxInputTokens === "number") { + return provider.maxInputTokens; + } + + // Provider/model mapping is best-effort; different providers use different + // limits and we prefer to be conservative when we don't know. + const key = `${provider.id}:${provider.model}`.toLowerCase(); + const known = KNOWN_EMBEDDING_MAX_INPUT_TOKENS[key]; + if (typeof known === "number") { + return known; + } + + // Provider-specific conservative fallbacks. This prevents us from accidentally + // using the OpenAI default for providers with much smaller limits. 
+ if (provider.id.toLowerCase() === "gemini") { + return 2048; + } + + return DEFAULT_EMBEDDING_MAX_INPUT_TOKENS; +} diff --git a/src/memory/embeddings-gemini.ts b/src/memory/embeddings-gemini.ts index 95f8137ea35..b4911163a4f 100644 --- a/src/memory/embeddings-gemini.ts +++ b/src/memory/embeddings-gemini.ts @@ -12,6 +12,9 @@ export type GeminiEmbeddingClient = { const DEFAULT_GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"; export const DEFAULT_GEMINI_EMBEDDING_MODEL = "gemini-embedding-001"; +const GEMINI_MAX_INPUT_TOKENS: Record<string, number> = { + "text-embedding-004": 2048, +}; const debugEmbeddings = isTruthyEnvValue(process.env.OPENCLAW_DEBUG_MEMORY_EMBEDDINGS); const log = createSubsystemLogger("memory/embeddings"); @@ -117,6 +120,7 @@ export async function createGeminiEmbeddingProvider( provider: { id: "gemini", model: client.model, + maxInputTokens: GEMINI_MAX_INPUT_TOKENS[client.model], embedQuery, embedBatch, }, diff --git a/src/memory/embeddings-openai.ts b/src/memory/embeddings-openai.ts index d125fa816b0..f4705fd6245 100644 --- a/src/memory/embeddings-openai.ts +++ b/src/memory/embeddings-openai.ts @@ -9,6 +9,11 @@ export type OpenAiEmbeddingClient = { export const DEFAULT_OPENAI_EMBEDDING_MODEL = "text-embedding-3-small"; const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"; +const OPENAI_MAX_INPUT_TOKENS: Record<string, number> = { + "text-embedding-3-small": 8192, + "text-embedding-3-large": 8192, + "text-embedding-ada-002": 8191, +}; export function normalizeOpenAiModel(model: string): string { const trimmed = model.trim(); @@ -51,6 +56,7 @@ export async function createOpenAiEmbeddingProvider( provider: { id: "openai", model: client.model, + maxInputTokens: OPENAI_MAX_INPUT_TOKENS[client.model], embedQuery: async (text) => { const [vec] = await embed([text]); return vec ?? 
[]; diff --git a/src/memory/embeddings-voyage.ts b/src/memory/embeddings-voyage.ts index 8585b3dc346..4e014a28fbd 100644 --- a/src/memory/embeddings-voyage.ts +++ b/src/memory/embeddings-voyage.ts @@ -9,6 +9,11 @@ export type VoyageEmbeddingClient = { export const DEFAULT_VOYAGE_EMBEDDING_MODEL = "voyage-4-large"; const DEFAULT_VOYAGE_BASE_URL = "https://api.voyageai.com/v1"; +const VOYAGE_MAX_INPUT_TOKENS: Record<string, number> = { + "voyage-3": 32000, + "voyage-3-lite": 16000, + "voyage-code-3": 32000, +}; export function normalizeVoyageModel(model: string): string { const trimmed = model.trim(); @@ -59,6 +64,7 @@ export async function createVoyageEmbeddingProvider( provider: { id: "voyage", model: client.model, + maxInputTokens: VOYAGE_MAX_INPUT_TOKENS[client.model], embedQuery: async (text) => { const [vec] = await embed([text], "query"); return vec ?? []; diff --git a/src/memory/embeddings.ts b/src/memory/embeddings.ts index e87b491f6f3..a81f5fbabfb 100644 --- a/src/memory/embeddings.ts +++ b/src/memory/embeddings.ts @@ -24,6 +24,7 @@ export type { VoyageEmbeddingClient } from "./embeddings-voyage.js"; export type EmbeddingProvider = { id: string; model: string; + maxInputTokens?: number; embedQuery: (text: string) => Promise<number[]>; embedBatch: (texts: string[]) => Promise<number[][]>; }; diff --git a/src/memory/manager.embedding-token-limit.test.ts b/src/memory/manager.embedding-token-limit.test.ts new file mode 100644 index 00000000000..4cd89c609a5 --- /dev/null +++ b/src/memory/manager.embedding-token-limit.test.ts @@ -0,0 +1,120 @@ +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { getMemorySearchManager, type MemoryIndexManager } from "./index.js"; + +const embedBatch = vi.fn(async (texts: string[]) => texts.map(() => [0, 1, 0])); +const embedQuery = vi.fn(async () => [0, 1, 0]); + +vi.mock("./embeddings.js", () => ({ + createEmbeddingProvider: async () => ({ + 
requestedProvider: "openai", + provider: { + id: "mock", + model: "mock-embed", + maxInputTokens: 8192, + embedQuery, + embedBatch, + }, + }), +})); + +describe("memory embedding token limits", () => { + let workspaceDir: string; + let indexPath: string; + let manager: MemoryIndexManager | null = null; + + beforeEach(async () => { + embedBatch.mockReset(); + embedQuery.mockReset(); + embedBatch.mockImplementation(async (texts: string[]) => texts.map(() => [0, 1, 0])); + embedQuery.mockImplementation(async () => [0, 1, 0]); + workspaceDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-mem-token-")); + indexPath = path.join(workspaceDir, "index.sqlite"); + await fs.mkdir(path.join(workspaceDir, "memory")); + }); + + afterEach(async () => { + if (manager) { + await manager.close(); + manager = null; + } + await fs.rm(workspaceDir, { recursive: true, force: true }); + }); + + it("splits oversized chunks so each embedding input stays <= 8192 UTF-8 bytes", async () => { + const content = "x".repeat(9500); + await fs.writeFile(path.join(workspaceDir, "memory", "2026-01-09.md"), content); + + const cfg = { + agents: { + defaults: { + workspace: workspaceDir, + memorySearch: { + provider: "openai", + model: "mock-embed", + store: { path: indexPath }, + chunking: { tokens: 10_000, overlap: 0 }, + sync: { watch: false, onSessionStart: false, onSearch: false }, + query: { minScore: 0 }, + }, + }, + list: [{ id: "main", default: true }], + }, + }; + + const result = await getMemorySearchManager({ cfg, agentId: "main" }); + expect(result.manager).not.toBeNull(); + if (!result.manager) { + throw new Error("manager missing"); + } + manager = result.manager; + await manager.sync({ force: true }); + + const inputs = embedBatch.mock.calls.flatMap((call) => call[0] ?? 
[]); + expect(inputs.length).toBeGreaterThan(1); + expect( + Math.max(...inputs.map((input) => Buffer.byteLength(input, "utf8"))), + ).toBeLessThanOrEqual(8192); + }); + + it("uses UTF-8 byte estimates when batching multibyte chunks", async () => { + const line = "😀".repeat(1800); + const content = `${line}\n${line}\n${line}`; + await fs.writeFile(path.join(workspaceDir, "memory", "2026-01-10.md"), content); + + const cfg = { + agents: { + defaults: { + workspace: workspaceDir, + memorySearch: { + provider: "openai", + model: "mock-embed", + store: { path: indexPath }, + chunking: { tokens: 1000, overlap: 0 }, + sync: { watch: false, onSessionStart: false, onSearch: false }, + query: { minScore: 0 }, + }, + }, + list: [{ id: "main", default: true }], + }, + }; + + const result = await getMemorySearchManager({ cfg, agentId: "main" }); + expect(result.manager).not.toBeNull(); + if (!result.manager) { + throw new Error("manager missing"); + } + manager = result.manager; + await manager.sync({ force: true }); + + const batchSizes = embedBatch.mock.calls.map( + (call) => (call[0] as string[] | undefined)?.length ?? 0, + ); + expect(batchSizes.length).toBe(3); + expect(batchSizes.every((size) => size === 1)).toBe(true); + const inputs = embedBatch.mock.calls.flatMap((call) => call[0] ?? 
[]); + expect(inputs.every((input) => Buffer.byteLength(input, "utf8") <= 8192)).toBe(true); + }); +}); diff --git a/src/memory/manager.ts b/src/memory/manager.ts index 2517474598b..715695e82da 100644 --- a/src/memory/manager.ts +++ b/src/memory/manager.ts @@ -27,6 +27,8 @@ import { runOpenAiEmbeddingBatches, } from "./batch-openai.js"; import { type VoyageBatchRequest, runVoyageEmbeddingBatches } from "./batch-voyage.js"; +import { enforceEmbeddingMaxInputTokens } from "./embedding-chunk-limits.js"; +import { estimateUtf8Bytes } from "./embedding-input-limits.js"; import { DEFAULT_GEMINI_EMBEDDING_MODEL } from "./embeddings-gemini.js"; import { DEFAULT_OPENAI_EMBEDDING_MODEL } from "./embeddings-openai.js"; import { DEFAULT_VOYAGE_EMBEDDING_MODEL } from "./embeddings-voyage.js"; @@ -87,7 +89,6 @@ const FTS_TABLE = "chunks_fts"; const EMBEDDING_CACHE_TABLE = "embedding_cache"; const SESSION_DIRTY_DEBOUNCE_MS = 5000; const EMBEDDING_BATCH_MAX_TOKENS = 8000; -const EMBEDDING_APPROX_CHARS_PER_TOKEN = 1; const EMBEDDING_INDEX_CONCURRENCY = 4; const EMBEDDING_RETRY_MAX_ATTEMPTS = 3; const EMBEDDING_RETRY_BASE_DELAY_MS = 500; @@ -1543,20 +1544,13 @@ export class MemoryIndexManager implements MemorySearchManager { .run(META_KEY, value); } - private estimateEmbeddingTokens(text: string): number { - if (!text) { - return 0; - } - return Math.ceil(text.length / EMBEDDING_APPROX_CHARS_PER_TOKEN); - } - private buildEmbeddingBatches(chunks: MemoryChunk[]): MemoryChunk[][] { const batches: MemoryChunk[][] = []; let current: MemoryChunk[] = []; let currentTokens = 0; for (const chunk of chunks) { - const estimate = this.estimateEmbeddingTokens(chunk.text); + const estimate = estimateUtf8Bytes(chunk.text); const wouldExceed = current.length > 0 && currentTokens + estimate > EMBEDDING_BATCH_MAX_TOKENS; if (wouldExceed) { @@ -2206,8 +2200,11 @@ export class MemoryIndexManager implements MemorySearchManager { options: { source: MemorySource; content?: string }, ) { const content = 
options.content ?? (await fs.readFile(entry.absPath, "utf-8")); - const chunks = chunkMarkdown(content, this.settings.chunking).filter( - (chunk) => chunk.text.trim().length > 0, + const chunks = enforceEmbeddingMaxInputTokens( + this.provider, + chunkMarkdown(content, this.settings.chunking).filter( + (chunk) => chunk.text.trim().length > 0, + ), ); if (options.source === "sessions" && "lineMap" in entry) { remapChunkLines(chunks, entry.lineMap);