From 87463eee46ab71be338e0b09db6b138d078dfe98 Mon Sep 17 00:00:00 2001
From: Gustavo Madeira Santana <gumadeiras@gmail.com>
Date: Wed, 11 Mar 2026 21:46:56 +0000
Subject: [PATCH] memory: harden multimodal indexing failures

---
 docs/concepts/memory.md             |   1 +
 src/config/schema.help.ts           |   4 +-
 src/memory/index.test.ts            |  37 ++++++++--
 src/memory/manager-embedding-ops.ts | 101 +++++++++++++++++++---------
 4 files changed, 105 insertions(+), 38 deletions(-)

diff --git a/docs/concepts/memory.md b/docs/concepts/memory.md
index 99519be871c..8ed755b394c 100644
--- a/docs/concepts/memory.md
+++ b/docs/concepts/memory.md
@@ -318,6 +318,7 @@ Notes:
 - Multimodal indexing applies only to files discovered through `memorySearch.extraPaths`.
 - Supported modalities in this phase: image and audio.
 - `memorySearch.fallback` must stay `"none"` while multimodal memory is enabled.
+- Matching image/audio file bytes are uploaded to the configured Gemini embedding endpoint during indexing.
 - Supported image extensions: `.jpg`, `.jpeg`, `.png`, `.webp`, `.gif`, `.heic`, `.heif`.
 - Supported audio extensions: `.mp3`, `.wav`, `.ogg`, `.opus`, `.m4a`, `.aac`, `.flac`.
 - Search queries remain text, but Gemini can compare those text queries against indexed image/audio embeddings.
diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts
index 5370853589e..3db7f40fe73 100644
--- a/src/config/schema.help.ts
+++ b/src/config/schema.help.ts
@@ -780,9 +780,9 @@ export const FIELD_HELP: Record<string, string> = {
   "agents.defaults.memorySearch.extraPaths":
     "Adds extra directories or .md files to the memory index beyond default memory files. Use this when key reference docs live elsewhere in your repo; when multimodal memory is enabled, matching image/audio files under these paths are also eligible for indexing.",
   "agents.defaults.memorySearch.multimodal":
-    'Optional multimodal memory settings for indexing image and audio files from configured extra paths. Keep this off unless your embedding model explicitly supports cross-modal embeddings, and set `memorySearch.fallback` to "none" while it is enabled.',
+    'Optional multimodal memory settings for indexing image and audio files from configured extra paths. Keep this off unless your embedding model explicitly supports cross-modal embeddings, and set `memorySearch.fallback` to "none" while it is enabled. Matching files are uploaded to the configured remote embedding provider during indexing.',
   "agents.defaults.memorySearch.multimodal.enabled":
-    "Enables image/audio memory indexing from extraPaths. This currently requires Gemini embedding-2, keeps the default memory roots Markdown-only, and disables memory-search fallback providers.",
+    "Enables image/audio memory indexing from extraPaths. This currently requires Gemini embedding-2, keeps the default memory roots Markdown-only, disables memory-search fallback providers, and uploads matching binary content to the configured remote embedding provider.",
   "agents.defaults.memorySearch.multimodal.modalities":
     'Selects which multimodal file types are indexed from extraPaths: "image", "audio", or "all". Keep this narrow to avoid indexing large binary corpora unintentionally.',
   "agents.defaults.memorySearch.multimodal.maxFileBytes":
diff --git a/src/memory/index.test.ts b/src/memory/index.test.ts
index c06eb703cbe..1a0c6988121 100644
--- a/src/memory/index.test.ts
+++ b/src/memory/index.test.ts
@@ -48,15 +48,19 @@ vi.mock("./embeddings.js", () => {
                   inputs: Array<{
                     text: string;
                     parts?: Array<
-                      { type: "text"; text: string } | { type: "inline-data"; mimeType: string }
+                      | { type: "text"; text: string }
+                      | { type: "inline-data"; mimeType: string; data: string }
                     >;
                   }>,
                 ) => {
                   embedBatchInputCalls += 1;
                   return inputs.map((input) => {
-                    const mimeType = input.parts?.find(
-                      (part) => part.type === "inline-data",
-                    )?.mimeType;
+                    const inlineData = input.parts?.find((part) => part.type === "inline-data");
+                    if (inlineData?.type === "inline-data" && inlineData.data.length > 9000) {
+                      throw new Error("payload too large");
+                    }
+                    const mimeType =
+                      inlineData?.type === "inline-data" ? inlineData.mimeType : undefined;
                     if (mimeType?.startsWith("image/")) {
                       return [0, 0, 1, 0];
                     }
@@ -311,6 +315,31 @@ describe("memory index", () => {
     expect(audioResults.some((result) => result.path.endsWith("meeting.wav"))).toBe(true);
   });
 
+  it("skips oversized multimodal inputs without aborting sync", async () => {
+    const mediaDir = path.join(workspaceDir, "media-oversize");
+    await fs.mkdir(mediaDir, { recursive: true });
+    await fs.writeFile(path.join(mediaDir, "huge.png"), Buffer.alloc(7000, 1));
+    // NOTE(review): 7000 raw bytes presumably become ~9336 chars once base64-encoded, tripping the mock's `data.length > 9000` "payload too large" throw — confirm the inline-data encoding.
+    const cfg = createCfg({
+      storePath: path.join(workspaceDir, `index-oversize-${randomUUID()}.sqlite`),
+      provider: "gemini",
+      model: "gemini-embedding-2-preview",
+      extraPaths: [mediaDir],
+      multimodal: { enabled: true, modalities: ["image"] },
+    });
+    const manager = requireManager(await getMemorySearchManager({ cfg, agentId: "main" }));
+    await manager.sync({ reason: "test" });
+    // sync() resolving is the "without aborting" half of the test; the counter check assumes embedBatchInputCalls is reset before each test — TODO confirm.
+    expect(embedBatchInputCalls).toBeGreaterThan(0);
+    const imageResults = await manager.search("image");
+    expect(imageResults.some((result) => result.path.endsWith("huge.png"))).toBe(false);
+    // The Markdown memory file must still be found, i.e. the rejected image did not break text search.
+    const alphaResults = await manager.search("alpha");
+    expect(alphaResults.some((result) => result.path.endsWith("memory/2026-01-12.md"))).toBe(true);
+    // `close` is called optionally (`?.`), so a manager without a close method is tolerated here.
+    await manager.close?.();
+  });
+
   it("keeps dirty false in status-only manager after prior indexing", async () => {
     const cfg = createCfg({ storePath: indexStatusPath });
 
diff --git a/src/memory/manager-embedding-ops.ts b/src/memory/manager-embedding-ops.ts
index dadaadb513f..2487cfa973c 100644
--- a/src/memory/manager-embedding-ops.ts
+++ b/src/memory/manager-embedding-ops.ts
@@ -758,6 +758,45 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps {
     return this.batch.enabled ? this.batch.concurrency : EMBEDDING_INDEX_CONCURRENCY;
   }
 
+  /** Deletes every `chunks` row for `pathname`/`source`, after a best-effort removal of the matching vector and FTS rows. */
+  private clearIndexedFileData(pathname: string, source: MemorySource): void {
+    const ignoreFailure = (deleteRows: () => void): void => {
+      try {
+        deleteRows();
+      } catch {}
+    };
+    if (this.vector.enabled) {
+      const vectorSql = `DELETE FROM ${VECTOR_TABLE} WHERE id IN (SELECT id FROM chunks WHERE path = ? AND source = ?)`;
+      ignoreFailure(() => this.db.prepare(vectorSql).run(pathname, source));
+    }
+    if (this.fts.enabled && this.fts.available && this.provider) {
+      const { model } = this.provider;
+      const ftsSql = `DELETE FROM ${FTS_TABLE} WHERE path = ? AND source = ? AND model = ?`;
+      ignoreFailure(() => this.db.prepare(ftsSql).run(pathname, source, model));
+    }
+    // Chunk rows go last: the vector DELETE above looks up its ids in `chunks`. Failures here are not swallowed.
+    this.db.prepare(`DELETE FROM chunks WHERE path = ? AND source = ?`).run(pathname, source);
+  }
+
+  /** Inserts the `files` row for `entry`, or overwrites its source/hash/mtime/size when a row for that path already exists. */
+  private upsertFileRecord(entry: MemoryFileEntry | SessionFileEntry, source: MemorySource): void {
+    const upsertSql = `INSERT INTO files (path, source, hash, mtime, size) VALUES (?, ?, ?, ?, ?)
+         ON CONFLICT(path) DO UPDATE SET
+           source=excluded.source,
+           hash=excluded.hash,
+           mtime=excluded.mtime,
+           size=excluded.size`;
+    // The conflict target is `path` alone, so an existing row stored under another source is overwritten too.
+    const { path: filePath, hash, mtimeMs, size } = entry;
+    this.db.prepare(upsertSql).run(filePath, source, hash, mtimeMs, size);
+  }
+
+  /** Reports whether a provider error message says the request was rejected for exceeding a size or token limit. */
+  private isStructuredInputTooLargeError(message: string): boolean {
+    // `\b413\b` matches the HTTP status only as a standalone number, never digits inside ids, byte counts or timestamps.
+    return /(\b413\b|payload too large|request too large|input too large|too many tokens|input limit|request size)/i.test(message);
+  }
+
   protected async indexFile(
     entry: MemoryFileEntry | SessionFileEntry,
     options: { source: MemorySource; content?: string },
@@ -772,11 +811,14 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps {
     }
 
     let chunks: MemoryChunk[];
+    let structuredInputBytes: number | undefined;
     if ("kind" in entry && entry.kind === "multimodal") {
       const embeddingInput = await loadMultimodalEmbeddingInput(entry);
       if (!embeddingInput) {
+        this.clearIndexedFileData(entry.path, options.source);
         return;
       }
+      structuredInputBytes = estimateStructuredEmbeddingInputBytes(embeddingInput);
       chunks = [
         {
           startLine: 1,
@@ -799,31 +841,35 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps {
         remapChunkLines(chunks, entry.lineMap);
       }
     }
-    const embeddings = this.batch.enabled
-      ? await this.embedChunksWithBatch(chunks, entry, options.source)
-      : await this.embedChunksInBatches(chunks);
+    let embeddings: number[][];
+    try {
+      embeddings = this.batch.enabled
+        ? await this.embedChunksWithBatch(chunks, entry, options.source)
+        : await this.embedChunksInBatches(chunks);
+    } catch (err) {
+      const message = err instanceof Error ? err.message : String(err);
+      if (
+        "kind" in entry &&
+        entry.kind === "multimodal" &&
+        this.isStructuredInputTooLargeError(message)
+      ) {
+        log.warn("memory embeddings: skipping multimodal file rejected as too large", {
+          path: entry.path,
+          bytes: structuredInputBytes,
+          provider: this.provider.id,
+          model: this.provider.model,
+          error: message,
+        });
+        this.clearIndexedFileData(entry.path, options.source);
+        this.upsertFileRecord(entry, options.source);
+        return;
+      }
+      throw err;
+    }
     const sample = embeddings.find((embedding) => embedding.length > 0);
     const vectorReady = sample ? await this.ensureVectorReady(sample.length) : false;
     const now = Date.now();
-    if (vectorReady) {
-      try {
-        this.db
-          .prepare(
-            `DELETE FROM ${VECTOR_TABLE} WHERE id IN (SELECT id FROM chunks WHERE path = ? AND source = ?)`,
-          )
-          .run(entry.path, options.source);
-      } catch {}
-    }
-    if (this.fts.enabled && this.fts.available) {
-      try {
-        this.db
-          .prepare(`DELETE FROM ${FTS_TABLE} WHERE path = ? AND source = ? AND model = ?`)
-          .run(entry.path, options.source, this.provider.model);
-      } catch {}
-    }
-    this.db
-      .prepare(`DELETE FROM chunks WHERE path = ? AND source = ?`)
-      .run(entry.path, options.source);
+    this.clearIndexedFileData(entry.path, options.source);
     for (let i = 0; i < chunks.length; i++) {
       const chunk = chunks[i];
       const embedding = embeddings[i] ?? [];
@@ -878,15 +924,6 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps {
           );
       }
     }
-    this.db
-      .prepare(
-        `INSERT INTO files (path, source, hash, mtime, size) VALUES (?, ?, ?, ?, ?)
-         ON CONFLICT(path) DO UPDATE SET
-           source=excluded.source,
-           hash=excluded.hash,
-           mtime=excluded.mtime,
-           size=excluded.size`,
-      )
-      .run(entry.path, options.source, entry.hash, entry.mtimeMs, entry.size);
+    this.upsertFileRecord(entry, options.source);
   }
 }