(fix): enforce embedding model token limit to prevent overflow (#13455)

* fix: enforce embedding model token limit to prevent 8192 overflow

- Replace EMBEDDING_APPROX_CHARS_PER_TOKEN=1 with UTF-8 byte length
  estimation (safe upper bound for tokenizer output)
- Add EMBEDDING_MODEL_MAX_TOKENS=8192 hard cap
- Add splitChunkToTokenLimit() that binary-searches for the largest
  safe split point, with surrogate pair handling
- Add enforceChunkTokenLimit() wrapper called in indexFile() after
  chunkMarkdown(), before any embedding API call
- Fixes: session files with large JSONL entries could produce chunks
  exceeding text-embedding-3-small's 8192 token limit

Tests: 2 new colocated tests in manager.embedding-token-limit.test.ts
- Verifies oversized ASCII chunks are split to <=8192 bytes each
- Verifies multibyte (emoji) content batching respects byte limits

* fix: make embedding token limit provider-aware

- Add optional maxInputTokens to EmbeddingProvider interface
- Each provider (openai, gemini, voyage) reports its own limit
- Known-limits map as fallback: openai 8192, gemini 2048, voyage 16K-32K depending on model
- Resolution: provider field > known map > default 8192
- Backward compatible: local/llama uses fallback

* fix: enforce embedding input size limits (#13455) (thanks @rodrigouroz)

---------

Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
This commit is contained in:
Rodrigo Uroz
2026-02-10 23:10:17 -03:00
committed by GitHub
parent c95b3783ef
commit 7f1712c1ba
9 changed files with 277 additions and 11 deletions

View File

@@ -0,0 +1,30 @@
import type { EmbeddingProvider } from "./embeddings.js";
import { estimateUtf8Bytes, splitTextToUtf8ByteLimit } from "./embedding-input-limits.js";
import { resolveEmbeddingMaxInputTokens } from "./embedding-model-limits.js";
import { hashText, type MemoryChunk } from "./internal.js";
/**
 * Ensures no chunk exceeds the embedding provider's input limit.
 *
 * Chunks that already fit are passed through untouched. Oversized chunks are
 * replaced by several smaller chunks covering the same text; each new chunk
 * keeps the original start/end lines (we cannot recover a finer line mapping
 * after the text-level split) and gets a hash computed from its own text.
 *
 * @param provider Embedding provider whose limit should be enforced.
 * @param chunks Chunks produced by markdown chunking.
 * @returns A new array in the same order, with oversized chunks split.
 */
export function enforceEmbeddingMaxInputTokens(
  provider: EmbeddingProvider,
  chunks: MemoryChunk[],
): MemoryChunk[] {
  const limit = resolveEmbeddingMaxInputTokens(provider);
  return chunks.flatMap((chunk) => {
    if (estimateUtf8Bytes(chunk.text) <= limit) {
      return [chunk];
    }
    return splitTextToUtf8ByteLimit(chunk.text, limit).map((text) => ({
      startLine: chunk.startLine,
      endLine: chunk.endLine,
      text,
      hash: hashText(text),
    }));
  });
}

View File

@@ -0,0 +1,67 @@
// Helpers for enforcing embedding model input size limits.
//
// We use UTF-8 byte length as a conservative upper bound for tokenizer output.
// Tokenizers operate over bytes; a token must contain at least one byte, so
// token_count <= utf8_byte_length.
/**
 * Conservative upper bound on tokenizer output for `text`.
 *
 * Tokenizers operate over bytes and every token consumes at least one byte,
 * so `token_count <= utf8_byte_length` always holds.
 *
 * @param text Input string (may be empty).
 * @returns The UTF-8 encoded byte length of `text`; 0 for the empty string.
 */
export function estimateUtf8Bytes(text: string): number {
  return text ? Buffer.byteLength(text, "utf8") : 0;
}
/**
 * Splits `text` into consecutive pieces whose UTF-8 encodings are each at
 * most `maxUtf8Bytes` bytes, without cutting through a surrogate pair.
 *
 * For each piece we binary-search (over UTF-16 code-unit indices) for the
 * largest prefix that still fits the byte budget, then back off one unit if
 * the cut would land between a high and low surrogate.
 *
 * @param text Input string; the returned pieces concatenate back to it.
 * @param maxUtf8Bytes Per-piece byte budget. Non-positive budgets disable
 *   splitting and return `[text]` unchanged.
 * @returns Pieces in order. A piece may exceed the budget only when the
 *   budget is smaller than a single code point, which is indivisible.
 */
export function splitTextToUtf8ByteLimit(text: string, maxUtf8Bytes: number): string[] {
  if (maxUtf8Bytes <= 0) {
    return [text];
  }
  if (Buffer.byteLength(text, "utf8") <= maxUtf8Bytes) {
    return [text];
  }
  const parts: string[] = [];
  let cursor = 0;
  while (cursor < text.length) {
    // The number of UTF-16 code units is always <= the number of UTF-8 bytes,
    // so `cursor + maxUtf8Bytes` is a safe upper bound on the next split point.
    let low = cursor + 1;
    let high = Math.min(text.length, cursor + maxUtf8Bytes);
    let best = cursor;
    while (low <= high) {
      const mid = Math.floor((low + high) / 2);
      if (Buffer.byteLength(text.slice(cursor, mid), "utf8") <= maxUtf8Bytes) {
        best = mid;
        low = mid + 1;
      } else {
        high = mid - 1;
      }
    }
    if (best <= cursor) {
      // Budget is smaller than the next code unit's encoding: force progress
      // by one unit so the outer loop always terminates.
      best = Math.min(text.length, cursor + 1);
    }
    // Avoid splitting inside a surrogate pair.
    if (
      best < text.length &&
      best > cursor &&
      text.charCodeAt(best - 1) >= 0xd800 &&
      text.charCodeAt(best - 1) <= 0xdbff &&
      text.charCodeAt(best) >= 0xdc00 &&
      text.charCodeAt(best) <= 0xdfff
    ) {
      if (best - 1 > cursor) {
        best -= 1;
      } else {
        // Backing up would produce an empty part (the previous version then
        // broke out of the loop and silently dropped the rest of the input).
        // Emit the whole pair instead: it may exceed a sub-code-point budget,
        // but it guarantees forward progress and preserves all text.
        best += 1;
      }
    }
    // best > cursor is now an invariant, so every part is non-empty.
    parts.push(text.slice(cursor, best));
    cursor = best;
  }
  return parts;
}

View File

@@ -0,0 +1,35 @@
import type { EmbeddingProvider } from "./embeddings.js";
// Used when neither the provider nor the known-model map supplies a limit.
const DEFAULT_EMBEDDING_MAX_INPUT_TOKENS = 8192;

// Best-effort per-model limits, keyed by lowercase "provider:model".
const KNOWN_EMBEDDING_MAX_INPUT_TOKENS: Record<string, number> = {
  "openai:text-embedding-3-small": 8192,
  "openai:text-embedding-3-large": 8192,
  "openai:text-embedding-ada-002": 8191,
  "gemini:text-embedding-004": 2048,
  "voyage:voyage-3": 32000,
  "voyage:voyage-3-lite": 16000,
  "voyage:voyage-code-3": 32000,
};

/**
 * Resolves the maximum embedding input size (in tokens) for a provider.
 *
 * Resolution order: provider-reported limit > known per-model map >
 * provider-level conservative fallback > global default.
 *
 * @param provider Embedding provider whose limit should be determined.
 * @returns A positive token limit.
 */
export function resolveEmbeddingMaxInputTokens(provider: EmbeddingProvider): number {
  // Only trust the provider's own limit when it is a usable positive number.
  // A plain `typeof === "number"` check would let NaN, Infinity, or 0 through
  // and make the cap meaningless.
  if (
    typeof provider.maxInputTokens === "number" &&
    Number.isFinite(provider.maxInputTokens) &&
    provider.maxInputTokens > 0
  ) {
    return provider.maxInputTokens;
  }
  // Provider/model mapping is best-effort; different providers use different
  // limits and we prefer to be conservative when we don't know.
  const key = `${provider.id}:${provider.model}`.toLowerCase();
  const known = KNOWN_EMBEDDING_MAX_INPUT_TOKENS[key];
  if (typeof known === "number") {
    return known;
  }
  // Provider-specific conservative fallbacks. This prevents us from accidentally
  // using the OpenAI default for providers with much smaller limits.
  if (provider.id.toLowerCase() === "gemini") {
    return 2048;
  }
  return DEFAULT_EMBEDDING_MAX_INPUT_TOKENS;
}

View File

@@ -12,6 +12,9 @@ export type GeminiEmbeddingClient = {
const DEFAULT_GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
export const DEFAULT_GEMINI_EMBEDDING_MODEL = "gemini-embedding-001";
// Known per-model input token limits for Gemini embedding models. Models not
// listed here (including, apparently, the default gemini-embedding-001 —
// verify) yield `undefined`, leaving the limit to downstream resolution.
const GEMINI_MAX_INPUT_TOKENS: Record<string, number> = {
  "text-embedding-004": 2048,
};
const debugEmbeddings = isTruthyEnvValue(process.env.OPENCLAW_DEBUG_MEMORY_EMBEDDINGS);
const log = createSubsystemLogger("memory/embeddings");
@@ -117,6 +120,7 @@ export async function createGeminiEmbeddingProvider(
provider: {
id: "gemini",
model: client.model,
maxInputTokens: GEMINI_MAX_INPUT_TOKENS[client.model],
embedQuery,
embedBatch,
},

View File

@@ -9,6 +9,11 @@ export type OpenAiEmbeddingClient = {
export const DEFAULT_OPENAI_EMBEDDING_MODEL = "text-embedding-3-small";
const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
// Known per-model input token limits for OpenAI embedding models. Models not
// listed here yield `undefined`, leaving the limit to downstream resolution.
const OPENAI_MAX_INPUT_TOKENS: Record<string, number> = {
  "text-embedding-3-small": 8192,
  "text-embedding-3-large": 8192,
  "text-embedding-ada-002": 8191,
};
export function normalizeOpenAiModel(model: string): string {
const trimmed = model.trim();
@@ -51,6 +56,7 @@ export async function createOpenAiEmbeddingProvider(
provider: {
id: "openai",
model: client.model,
maxInputTokens: OPENAI_MAX_INPUT_TOKENS[client.model],
embedQuery: async (text) => {
const [vec] = await embed([text]);
return vec ?? [];

View File

@@ -9,6 +9,11 @@ export type VoyageEmbeddingClient = {
export const DEFAULT_VOYAGE_EMBEDDING_MODEL = "voyage-4-large";
const DEFAULT_VOYAGE_BASE_URL = "https://api.voyageai.com/v1";
// Known per-model input token limits for Voyage embedding models. Models not
// listed here (including, apparently, the default voyage-4-large — verify)
// yield `undefined`, leaving the limit to downstream resolution.
const VOYAGE_MAX_INPUT_TOKENS: Record<string, number> = {
  "voyage-3": 32000,
  "voyage-3-lite": 16000,
  "voyage-code-3": 32000,
};
export function normalizeVoyageModel(model: string): string {
const trimmed = model.trim();
@@ -59,6 +64,7 @@ export async function createVoyageEmbeddingProvider(
provider: {
id: "voyage",
model: client.model,
maxInputTokens: VOYAGE_MAX_INPUT_TOKENS[client.model],
embedQuery: async (text) => {
const [vec] = await embed([text], "query");
return vec ?? [];

View File

@@ -24,6 +24,7 @@ export type { VoyageEmbeddingClient } from "./embeddings-voyage.js";
/**
 * Contract implemented by each embedding backend (openai, gemini, voyage, …).
 */
export type EmbeddingProvider = {
  // Backend identifier, e.g. "openai" | "gemini" | "voyage".
  id: string;
  // Concrete model name as configured for the backend client.
  model: string;
  // Optional per-model input limit (in tokens) reported by the backend.
  // Absent for backends that do not declare one; consumers then fall back to
  // their own resolution (known-limits map / default).
  maxInputTokens?: number;
  // Embeds a single query string into one vector.
  embedQuery: (text: string) => Promise<number[]>;
  // Embeds a batch of document texts; one vector per input, same order.
  embedBatch: (texts: string[]) => Promise<number[][]>;
};

View File

@@ -0,0 +1,120 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { getMemorySearchManager, type MemoryIndexManager } from "./index.js";
// Shared spies so each test can inspect exactly which inputs were sent to the
// embedding backend. Both return fixed 3-dim vectors; they are reset and
// re-stubbed in beforeEach.
const embedBatch = vi.fn(async (texts: string[]) => texts.map(() => [0, 1, 0]));
const embedQuery = vi.fn(async () => [0, 1, 0]);

// vi.mock is hoisted above the imports at runtime, so the real embedding
// providers are never constructed. The stub reports maxInputTokens: 8192,
// giving the manager a known cap to enforce in these tests.
vi.mock("./embeddings.js", () => ({
  createEmbeddingProvider: async () => ({
    requestedProvider: "openai",
    provider: {
      id: "mock",
      model: "mock-embed",
      maxInputTokens: 8192,
      embedQuery,
      embedBatch,
    },
  }),
}));
describe("memory embedding token limits", () => {
  let workspaceDir: string;
  let indexPath: string;
  let manager: MemoryIndexManager | null = null;

  beforeEach(async () => {
    // Reset spy call history and restore the default stub implementations,
    // then create a fresh temp workspace with a memory/ directory.
    embedBatch.mockReset();
    embedQuery.mockReset();
    embedBatch.mockImplementation(async (texts: string[]) => texts.map(() => [0, 1, 0]));
    embedQuery.mockImplementation(async () => [0, 1, 0]);
    workspaceDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-mem-token-"));
    indexPath = path.join(workspaceDir, "index.sqlite");
    await fs.mkdir(path.join(workspaceDir, "memory"));
  });

  afterEach(async () => {
    // Close the manager before removing the workspace so the sqlite index
    // file is released first.
    if (manager) {
      await manager.close();
      manager = null;
    }
    await fs.rm(workspaceDir, { recursive: true, force: true });
  });

  it("splits oversized chunks so each embedding input stays <= 8192 UTF-8 bytes", async () => {
    // 9500 ASCII chars with chunking.tokens = 10_000 — presumably large
    // enough for chunkMarkdown to leave a single chunk exceeding the mock
    // provider's 8192-token cap, forcing the limit-enforcement split.
    const content = "x".repeat(9500);
    await fs.writeFile(path.join(workspaceDir, "memory", "2026-01-09.md"), content);
    const cfg = {
      agents: {
        defaults: {
          workspace: workspaceDir,
          memorySearch: {
            provider: "openai",
            model: "mock-embed",
            store: { path: indexPath },
            chunking: { tokens: 10_000, overlap: 0 },
            sync: { watch: false, onSessionStart: false, onSearch: false },
            query: { minScore: 0 },
          },
        },
        list: [{ id: "main", default: true }],
      },
    };
    const result = await getMemorySearchManager({ cfg, agentId: "main" });
    expect(result.manager).not.toBeNull();
    if (!result.manager) {
      throw new Error("manager missing");
    }
    manager = result.manager;
    await manager.sync({ force: true });
    // Flatten every input ever sent to the backend: the oversized chunk must
    // have been split (more than one input) and each piece must fit the cap.
    const inputs = embedBatch.mock.calls.flatMap((call) => call[0] ?? []);
    expect(inputs.length).toBeGreaterThan(1);
    expect(
      Math.max(...inputs.map((input) => Buffer.byteLength(input, "utf8"))),
    ).toBeLessThanOrEqual(8192);
  });

  it("uses UTF-8 byte estimates when batching multibyte chunks", async () => {
    // Each line is 1800 emoji: 3600 UTF-16 code units but 7200 UTF-8 bytes.
    // Char-based accounting would pack two such chunks per batch; byte-based
    // accounting must keep them apart — hence three single-chunk batches.
    const line = "😀".repeat(1800);
    const content = `${line}\n${line}\n${line}`;
    await fs.writeFile(path.join(workspaceDir, "memory", "2026-01-10.md"), content);
    const cfg = {
      agents: {
        defaults: {
          workspace: workspaceDir,
          memorySearch: {
            provider: "openai",
            model: "mock-embed",
            store: { path: indexPath },
            chunking: { tokens: 1000, overlap: 0 },
            sync: { watch: false, onSessionStart: false, onSearch: false },
            query: { minScore: 0 },
          },
        },
        list: [{ id: "main", default: true }],
      },
    };
    const result = await getMemorySearchManager({ cfg, agentId: "main" });
    expect(result.manager).not.toBeNull();
    if (!result.manager) {
      throw new Error("manager missing");
    }
    manager = result.manager;
    await manager.sync({ force: true });
    const batchSizes = embedBatch.mock.calls.map(
      (call) => (call[0] as string[] | undefined)?.length ?? 0,
    );
    expect(batchSizes.length).toBe(3);
    expect(batchSizes.every((size) => size === 1)).toBe(true);
    const inputs = embedBatch.mock.calls.flatMap((call) => call[0] ?? []);
    expect(inputs.every((input) => Buffer.byteLength(input, "utf8") <= 8192)).toBe(true);
  });
});

View File

@@ -27,6 +27,8 @@ import {
runOpenAiEmbeddingBatches,
} from "./batch-openai.js";
import { type VoyageBatchRequest, runVoyageEmbeddingBatches } from "./batch-voyage.js";
import { enforceEmbeddingMaxInputTokens } from "./embedding-chunk-limits.js";
import { estimateUtf8Bytes } from "./embedding-input-limits.js";
import { DEFAULT_GEMINI_EMBEDDING_MODEL } from "./embeddings-gemini.js";
import { DEFAULT_OPENAI_EMBEDDING_MODEL } from "./embeddings-openai.js";
import { DEFAULT_VOYAGE_EMBEDDING_MODEL } from "./embeddings-voyage.js";
@@ -87,7 +89,6 @@ const FTS_TABLE = "chunks_fts";
const EMBEDDING_CACHE_TABLE = "embedding_cache";
const SESSION_DIRTY_DEBOUNCE_MS = 5000;
const EMBEDDING_BATCH_MAX_TOKENS = 8000;
const EMBEDDING_APPROX_CHARS_PER_TOKEN = 1;
const EMBEDDING_INDEX_CONCURRENCY = 4;
const EMBEDDING_RETRY_MAX_ATTEMPTS = 3;
const EMBEDDING_RETRY_BASE_DELAY_MS = 500;
@@ -1543,20 +1544,13 @@ export class MemoryIndexManager implements MemorySearchManager {
.run(META_KEY, value);
}
private estimateEmbeddingTokens(text: string): number {
if (!text) {
return 0;
}
return Math.ceil(text.length / EMBEDDING_APPROX_CHARS_PER_TOKEN);
}
private buildEmbeddingBatches(chunks: MemoryChunk[]): MemoryChunk[][] {
const batches: MemoryChunk[][] = [];
let current: MemoryChunk[] = [];
let currentTokens = 0;
for (const chunk of chunks) {
const estimate = this.estimateEmbeddingTokens(chunk.text);
const estimate = estimateUtf8Bytes(chunk.text);
const wouldExceed =
current.length > 0 && currentTokens + estimate > EMBEDDING_BATCH_MAX_TOKENS;
if (wouldExceed) {
@@ -2206,8 +2200,11 @@ export class MemoryIndexManager implements MemorySearchManager {
options: { source: MemorySource; content?: string },
) {
const content = options.content ?? (await fs.readFile(entry.absPath, "utf-8"));
const chunks = chunkMarkdown(content, this.settings.chunking).filter(
(chunk) => chunk.text.trim().length > 0,
const chunks = enforceEmbeddingMaxInputTokens(
this.provider,
chunkMarkdown(content, this.settings.chunking).filter(
(chunk) => chunk.text.trim().length > 0,
),
);
if (options.source === "sessions" && "lineMap" in entry) {
remapChunkLines(chunks, entry.lineMap);