(fix): enforce embedding model token limit to prevent overflow (#13455)

* fix: enforce embedding model token limit to prevent 8192 overflow

- Replace EMBEDDING_APPROX_CHARS_PER_TOKEN=1 with UTF-8 byte length
  estimation (safe upper bound for tokenizer output)
- Add EMBEDDING_MODEL_MAX_TOKENS=8192 hard cap
- Add splitChunkToTokenLimit() that binary-searches for the largest
  safe split point, with surrogate pair handling
- Add enforceChunkTokenLimit() wrapper called in indexFile() after
  chunkMarkdown(), before any embedding API call
- Fixes: session files with large JSONL entries could produce chunks
  exceeding text-embedding-3-small's 8192 token limit

Tests: 2 new colocated tests in manager.embedding-token-limit.test.ts
- Verifies oversized ASCII chunks are split to <=8192 bytes each
- Verifies multibyte (emoji) content batching respects byte limits

* fix: make embedding token limit provider-aware

- Add optional maxInputTokens to EmbeddingProvider interface
- Each provider (openai, gemini, voyage) reports its own limit
- Known-limits map as fallback: openai 8192, gemini 2048, voyage 16K-32K depending on model
- Resolution: provider field > known map > default 8192
- Backward compatible: local/llama uses fallback

* fix: enforce embedding input size limits (#13455) (thanks @rodrigouroz)

---------

Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
This commit is contained in:
Rodrigo Uroz
2026-02-10 23:10:17 -03:00
committed by GitHub
parent c95b3783ef
commit 7f1712c1ba
9 changed files with 277 additions and 11 deletions

View File

@@ -0,0 +1,30 @@
import type { EmbeddingProvider } from "./embeddings.js";
import { estimateUtf8Bytes, splitTextToUtf8ByteLimit } from "./embedding-input-limits.js";
import { resolveEmbeddingMaxInputTokens } from "./embedding-model-limits.js";
import { hashText, type MemoryChunk } from "./internal.js";
/**
 * Ensures no chunk exceeds the embedding provider's input limit.
 *
 * Chunks that already fit are passed through untouched. Oversized chunks are
 * replaced by several smaller chunks covering the same text; each new chunk
 * keeps the original start/end lines (we cannot recover a finer line mapping
 * after the text-level split) and gets a hash computed from its own text.
 *
 * @param provider Embedding provider whose limit should be enforced.
 * @param chunks Chunks produced by markdown chunking.
 * @returns A new array in the same order, with oversized chunks split.
 */
export function enforceEmbeddingMaxInputTokens(
  provider: EmbeddingProvider,
  chunks: MemoryChunk[],
): MemoryChunk[] {
  const limit = resolveEmbeddingMaxInputTokens(provider);
  return chunks.flatMap((chunk) => {
    if (estimateUtf8Bytes(chunk.text) <= limit) {
      return [chunk];
    }
    return splitTextToUtf8ByteLimit(chunk.text, limit).map((text) => ({
      startLine: chunk.startLine,
      endLine: chunk.endLine,
      text,
      hash: hashText(text),
    }));
  });
}

View File

@@ -0,0 +1,67 @@
// Helpers for enforcing embedding model input size limits.
//
// We use UTF-8 byte length as a conservative upper bound for tokenizer output.
// Tokenizers operate over bytes; a token must contain at least one byte, so
// token_count <= utf8_byte_length.
/**
 * Conservative upper bound on tokenizer output for `text`.
 *
 * Tokenizers operate over bytes and every token consumes at least one byte,
 * so `token_count <= utf8_byte_length` always holds.
 *
 * @param text Input string (may be empty).
 * @returns The UTF-8 encoded byte length of `text`; 0 for the empty string.
 */
export function estimateUtf8Bytes(text: string): number {
  return text ? Buffer.byteLength(text, "utf8") : 0;
}
/**
 * Splits `text` into consecutive pieces whose UTF-8 encodings are each at
 * most `maxUtf8Bytes` bytes, without cutting through a surrogate pair.
 *
 * For each piece we binary-search (over UTF-16 code-unit indices) for the
 * largest prefix that still fits the byte budget, then back off one unit if
 * the cut would land between a high and low surrogate.
 *
 * @param text Input string; the returned pieces concatenate back to it.
 * @param maxUtf8Bytes Per-piece byte budget. Non-positive budgets disable
 *   splitting and return `[text]` unchanged.
 * @returns Pieces in order. A piece may exceed the budget only when the
 *   budget is smaller than a single code point, which is indivisible.
 */
export function splitTextToUtf8ByteLimit(text: string, maxUtf8Bytes: number): string[] {
  if (maxUtf8Bytes <= 0) {
    return [text];
  }
  if (Buffer.byteLength(text, "utf8") <= maxUtf8Bytes) {
    return [text];
  }
  const parts: string[] = [];
  let cursor = 0;
  while (cursor < text.length) {
    // The number of UTF-16 code units is always <= the number of UTF-8 bytes,
    // so `cursor + maxUtf8Bytes` is a safe upper bound on the next split point.
    let low = cursor + 1;
    let high = Math.min(text.length, cursor + maxUtf8Bytes);
    let best = cursor;
    while (low <= high) {
      const mid = Math.floor((low + high) / 2);
      if (Buffer.byteLength(text.slice(cursor, mid), "utf8") <= maxUtf8Bytes) {
        best = mid;
        low = mid + 1;
      } else {
        high = mid - 1;
      }
    }
    if (best <= cursor) {
      // Budget is smaller than the next code unit's encoding: force progress
      // by one unit so the outer loop always terminates.
      best = Math.min(text.length, cursor + 1);
    }
    // Avoid splitting inside a surrogate pair.
    if (
      best < text.length &&
      best > cursor &&
      text.charCodeAt(best - 1) >= 0xd800 &&
      text.charCodeAt(best - 1) <= 0xdbff &&
      text.charCodeAt(best) >= 0xdc00 &&
      text.charCodeAt(best) <= 0xdfff
    ) {
      if (best - 1 > cursor) {
        best -= 1;
      } else {
        // Backing up would produce an empty part (the previous version then
        // broke out of the loop and silently dropped the rest of the input).
        // Emit the whole pair instead: it may exceed a sub-code-point budget,
        // but it guarantees forward progress and preserves all text.
        best += 1;
      }
    }
    // best > cursor is now an invariant, so every part is non-empty.
    parts.push(text.slice(cursor, best));
    cursor = best;
  }
  return parts;
}

View File

@@ -0,0 +1,35 @@
import type { EmbeddingProvider } from "./embeddings.js";
// Used when neither the provider nor the known-model map supplies a limit.
const DEFAULT_EMBEDDING_MAX_INPUT_TOKENS = 8192;

// Best-effort per-model limits, keyed by lowercase "provider:model".
const KNOWN_EMBEDDING_MAX_INPUT_TOKENS: Record<string, number> = {
  "openai:text-embedding-3-small": 8192,
  "openai:text-embedding-3-large": 8192,
  "openai:text-embedding-ada-002": 8191,
  "gemini:text-embedding-004": 2048,
  "voyage:voyage-3": 32000,
  "voyage:voyage-3-lite": 16000,
  "voyage:voyage-code-3": 32000,
};

/**
 * Resolves the maximum embedding input size (in tokens) for a provider.
 *
 * Resolution order: provider-reported limit > known per-model map >
 * provider-level conservative fallback > global default.
 *
 * @param provider Embedding provider whose limit should be determined.
 * @returns A positive token limit.
 */
export function resolveEmbeddingMaxInputTokens(provider: EmbeddingProvider): number {
  // Only trust the provider's own limit when it is a usable positive number.
  // A plain `typeof === "number"` check would let NaN, Infinity, or 0 through
  // and make the cap meaningless.
  if (
    typeof provider.maxInputTokens === "number" &&
    Number.isFinite(provider.maxInputTokens) &&
    provider.maxInputTokens > 0
  ) {
    return provider.maxInputTokens;
  }
  // Provider/model mapping is best-effort; different providers use different
  // limits and we prefer to be conservative when we don't know.
  const key = `${provider.id}:${provider.model}`.toLowerCase();
  const known = KNOWN_EMBEDDING_MAX_INPUT_TOKENS[key];
  if (typeof known === "number") {
    return known;
  }
  // Provider-specific conservative fallbacks. This prevents us from accidentally
  // using the OpenAI default for providers with much smaller limits.
  if (provider.id.toLowerCase() === "gemini") {
    return 2048;
  }
  return DEFAULT_EMBEDDING_MAX_INPUT_TOKENS;
}

View File

@@ -12,6 +12,9 @@ export type GeminiEmbeddingClient = {
const DEFAULT_GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
export const DEFAULT_GEMINI_EMBEDDING_MODEL = "gemini-embedding-001";
// Known per-model input token limits for Gemini embedding models. Models not
// listed here (including, apparently, the default gemini-embedding-001 —
// verify) yield `undefined`, leaving the limit to downstream resolution.
const GEMINI_MAX_INPUT_TOKENS: Record<string, number> = {
  "text-embedding-004": 2048,
};
const debugEmbeddings = isTruthyEnvValue(process.env.OPENCLAW_DEBUG_MEMORY_EMBEDDINGS);
const log = createSubsystemLogger("memory/embeddings");
@@ -117,6 +120,7 @@ export async function createGeminiEmbeddingProvider(
provider: {
id: "gemini",
model: client.model,
maxInputTokens: GEMINI_MAX_INPUT_TOKENS[client.model],
embedQuery,
embedBatch,
},

View File

@@ -9,6 +9,11 @@ export type OpenAiEmbeddingClient = {
export const DEFAULT_OPENAI_EMBEDDING_MODEL = "text-embedding-3-small";
const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
// Known per-model input token limits for OpenAI embedding models. Models not
// listed here yield `undefined`, leaving the limit to downstream resolution.
const OPENAI_MAX_INPUT_TOKENS: Record<string, number> = {
  "text-embedding-3-small": 8192,
  "text-embedding-3-large": 8192,
  "text-embedding-ada-002": 8191,
};
export function normalizeOpenAiModel(model: string): string {
const trimmed = model.trim();
@@ -51,6 +56,7 @@ export async function createOpenAiEmbeddingProvider(
provider: {
id: "openai",
model: client.model,
maxInputTokens: OPENAI_MAX_INPUT_TOKENS[client.model],
embedQuery: async (text) => {
const [vec] = await embed([text]);
return vec ?? [];

View File

@@ -9,6 +9,11 @@ export type VoyageEmbeddingClient = {
export const DEFAULT_VOYAGE_EMBEDDING_MODEL = "voyage-4-large";
const DEFAULT_VOYAGE_BASE_URL = "https://api.voyageai.com/v1";
// Known per-model input token limits for Voyage embedding models. Models not
// listed here (including, apparently, the default voyage-4-large — verify)
// yield `undefined`, leaving the limit to downstream resolution.
const VOYAGE_MAX_INPUT_TOKENS: Record<string, number> = {
  "voyage-3": 32000,
  "voyage-3-lite": 16000,
  "voyage-code-3": 32000,
};
export function normalizeVoyageModel(model: string): string {
const trimmed = model.trim();
@@ -59,6 +64,7 @@ export async function createVoyageEmbeddingProvider(
provider: {
id: "voyage",
model: client.model,
maxInputTokens: VOYAGE_MAX_INPUT_TOKENS[client.model],
embedQuery: async (text) => {
const [vec] = await embed([text], "query");
return vec ?? [];

View File

@@ -24,6 +24,7 @@ export type { VoyageEmbeddingClient } from "./embeddings-voyage.js";
/**
 * Contract implemented by each embedding backend (openai, gemini, voyage, …).
 */
export type EmbeddingProvider = {
  // Backend identifier, e.g. "openai" | "gemini" | "voyage".
  id: string;
  // Concrete model name as configured for the backend client.
  model: string;
  // Optional per-model input limit (in tokens) reported by the backend.
  // Absent for backends that do not declare one; consumers then fall back to
  // their own resolution (known-limits map / default).
  maxInputTokens?: number;
  // Embeds a single query string into one vector.
  embedQuery: (text: string) => Promise<number[]>;
  // Embeds a batch of document texts; one vector per input, same order.
  embedBatch: (texts: string[]) => Promise<number[][]>;
};

View File

@@ -0,0 +1,120 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { getMemorySearchManager, type MemoryIndexManager } from "./index.js";
// Shared spies so each test can inspect exactly which inputs were sent to the
// embedding backend. Both return fixed 3-dim vectors; they are reset and
// re-stubbed in beforeEach.
const embedBatch = vi.fn(async (texts: string[]) => texts.map(() => [0, 1, 0]));
const embedQuery = vi.fn(async () => [0, 1, 0]);

// vi.mock is hoisted above the imports at runtime, so the real embedding
// providers are never constructed. The stub reports maxInputTokens: 8192,
// giving the manager a known cap to enforce in these tests.
vi.mock("./embeddings.js", () => ({
  createEmbeddingProvider: async () => ({
    requestedProvider: "openai",
    provider: {
      id: "mock",
      model: "mock-embed",
      maxInputTokens: 8192,
      embedQuery,
      embedBatch,
    },
  }),
}));
describe("memory embedding token limits", () => {
  let workspaceDir: string;
  let indexPath: string;
  let manager: MemoryIndexManager | null = null;

  beforeEach(async () => {
    // Reset spy call history and restore the default stub implementations,
    // then create a fresh temp workspace with a memory/ directory.
    embedBatch.mockReset();
    embedQuery.mockReset();
    embedBatch.mockImplementation(async (texts: string[]) => texts.map(() => [0, 1, 0]));
    embedQuery.mockImplementation(async () => [0, 1, 0]);
    workspaceDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-mem-token-"));
    indexPath = path.join(workspaceDir, "index.sqlite");
    await fs.mkdir(path.join(workspaceDir, "memory"));
  });

  afterEach(async () => {
    // Close the manager before removing the workspace so the sqlite index
    // file is released first.
    if (manager) {
      await manager.close();
      manager = null;
    }
    await fs.rm(workspaceDir, { recursive: true, force: true });
  });

  it("splits oversized chunks so each embedding input stays <= 8192 UTF-8 bytes", async () => {
    // 9500 ASCII chars with chunking.tokens = 10_000 — presumably large
    // enough for chunkMarkdown to leave a single chunk exceeding the mock
    // provider's 8192-token cap, forcing the limit-enforcement split.
    const content = "x".repeat(9500);
    await fs.writeFile(path.join(workspaceDir, "memory", "2026-01-09.md"), content);
    const cfg = {
      agents: {
        defaults: {
          workspace: workspaceDir,
          memorySearch: {
            provider: "openai",
            model: "mock-embed",
            store: { path: indexPath },
            chunking: { tokens: 10_000, overlap: 0 },
            sync: { watch: false, onSessionStart: false, onSearch: false },
            query: { minScore: 0 },
          },
        },
        list: [{ id: "main", default: true }],
      },
    };
    const result = await getMemorySearchManager({ cfg, agentId: "main" });
    expect(result.manager).not.toBeNull();
    if (!result.manager) {
      throw new Error("manager missing");
    }
    manager = result.manager;
    await manager.sync({ force: true });
    // Flatten every input ever sent to the backend: the oversized chunk must
    // have been split (more than one input) and each piece must fit the cap.
    const inputs = embedBatch.mock.calls.flatMap((call) => call[0] ?? []);
    expect(inputs.length).toBeGreaterThan(1);
    expect(
      Math.max(...inputs.map((input) => Buffer.byteLength(input, "utf8"))),
    ).toBeLessThanOrEqual(8192);
  });

  it("uses UTF-8 byte estimates when batching multibyte chunks", async () => {
    // Each line is 1800 emoji: 3600 UTF-16 code units but 7200 UTF-8 bytes.
    // Char-based accounting would pack two such chunks per batch; byte-based
    // accounting must keep them apart — hence three single-chunk batches.
    const line = "😀".repeat(1800);
    const content = `${line}\n${line}\n${line}`;
    await fs.writeFile(path.join(workspaceDir, "memory", "2026-01-10.md"), content);
    const cfg = {
      agents: {
        defaults: {
          workspace: workspaceDir,
          memorySearch: {
            provider: "openai",
            model: "mock-embed",
            store: { path: indexPath },
            chunking: { tokens: 1000, overlap: 0 },
            sync: { watch: false, onSessionStart: false, onSearch: false },
            query: { minScore: 0 },
          },
        },
        list: [{ id: "main", default: true }],
      },
    };
    const result = await getMemorySearchManager({ cfg, agentId: "main" });
    expect(result.manager).not.toBeNull();
    if (!result.manager) {
      throw new Error("manager missing");
    }
    manager = result.manager;
    await manager.sync({ force: true });
    const batchSizes = embedBatch.mock.calls.map(
      (call) => (call[0] as string[] | undefined)?.length ?? 0,
    );
    expect(batchSizes.length).toBe(3);
    expect(batchSizes.every((size) => size === 1)).toBe(true);
    const inputs = embedBatch.mock.calls.flatMap((call) => call[0] ?? []);
    expect(inputs.every((input) => Buffer.byteLength(input, "utf8") <= 8192)).toBe(true);
  });
});

View File

@@ -27,6 +27,8 @@ import {
runOpenAiEmbeddingBatches,
} from "./batch-openai.js";
import { type VoyageBatchRequest, runVoyageEmbeddingBatches } from "./batch-voyage.js";
import { enforceEmbeddingMaxInputTokens } from "./embedding-chunk-limits.js";
import { estimateUtf8Bytes } from "./embedding-input-limits.js";
import { DEFAULT_GEMINI_EMBEDDING_MODEL } from "./embeddings-gemini.js";
import { DEFAULT_OPENAI_EMBEDDING_MODEL } from "./embeddings-openai.js";
import { DEFAULT_VOYAGE_EMBEDDING_MODEL } from "./embeddings-voyage.js";
@@ -87,7 +89,6 @@ const FTS_TABLE = "chunks_fts";
const EMBEDDING_CACHE_TABLE = "embedding_cache";
const SESSION_DIRTY_DEBOUNCE_MS = 5000;
const EMBEDDING_BATCH_MAX_TOKENS = 8000;
const EMBEDDING_APPROX_CHARS_PER_TOKEN = 1;
const EMBEDDING_INDEX_CONCURRENCY = 4;
const EMBEDDING_RETRY_MAX_ATTEMPTS = 3;
const EMBEDDING_RETRY_BASE_DELAY_MS = 500;
@@ -1543,20 +1544,13 @@ export class MemoryIndexManager implements MemorySearchManager {
.run(META_KEY, value);
}
private estimateEmbeddingTokens(text: string): number {
if (!text) {
return 0;
}
return Math.ceil(text.length / EMBEDDING_APPROX_CHARS_PER_TOKEN);
}
private buildEmbeddingBatches(chunks: MemoryChunk[]): MemoryChunk[][] {
const batches: MemoryChunk[][] = [];
let current: MemoryChunk[] = [];
let currentTokens = 0;
for (const chunk of chunks) {
const estimate = this.estimateEmbeddingTokens(chunk.text);
const estimate = estimateUtf8Bytes(chunk.text);
const wouldExceed =
current.length > 0 && currentTokens + estimate > EMBEDDING_BATCH_MAX_TOKENS;
if (wouldExceed) {
@@ -2206,8 +2200,11 @@ export class MemoryIndexManager implements MemorySearchManager {
options: { source: MemorySource; content?: string },
) {
const content = options.content ?? (await fs.readFile(entry.absPath, "utf-8"));
const chunks = chunkMarkdown(content, this.settings.chunking).filter(
(chunk) => chunk.text.trim().length > 0,
const chunks = enforceEmbeddingMaxInputTokens(
this.provider,
chunkMarkdown(content, this.settings.chunking).filter(
(chunk) => chunk.text.trim().length > 0,
),
);
if (options.source === "sessions" && "lineMap" in entry) {
remapChunkLines(chunks, entry.lineMap);