Memory: add multimodal image and audio indexing (#43460)

Merged via squash.

Prepared head SHA: a994c07190
Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com>
Reviewed-by: @gumadeiras
This commit is contained in:
Gustavo Madeira Santana
2026-03-11 22:28:34 +00:00
committed by GitHub
parent 20d097ac2f
commit d79ca52960
23 changed files with 1295 additions and 178 deletions

View File

@@ -19,6 +19,7 @@ Docs: https://docs.openclaw.ai
- iOS/TestFlight: add a local beta release flow with Fastlane prepare/archive/upload support, canonical beta bundle IDs, and watch-app archive fixes. (#42991) Thanks @ngutman.
- macOS/onboarding: detect when remote gateways need a shared auth token, explain where to find it on the gateway host, and clarify when a successful check used paired-device auth instead. (#43100) Thanks @ngutman.
- Onboarding/Ollama: add first-class Ollama setup with Local or Cloud + Local modes, browser-based cloud sign-in, curated model suggestions, and cloud-model handling that skips unnecessary local pulls. (#41529) Thanks @BruceMacD.
- Memory: add opt-in multimodal image and audio indexing for `memorySearch.extraPaths` with Gemini `gemini-embedding-2-preview`, strict fallback gating, and scope-based reindexing. (#43460) Thanks @gumadeiras.
### Breaking

View File

@@ -284,9 +284,46 @@ Notes:
- Paths can be absolute or workspace-relative.
- Directories are scanned recursively for `.md` files.
- Only Markdown files are indexed.
- By default, only Markdown files are indexed.
- If `memorySearch.multimodal.enabled = true`, OpenClaw also indexes supported image/audio files under `extraPaths` only. Default memory roots (`MEMORY.md`, `memory.md`, `memory/**/*.md`) stay Markdown-only.
- Symlinks are ignored (files or directories).
### Multimodal memory files (Gemini image + audio)
OpenClaw can index image and audio files from `memorySearch.extraPaths` when using Gemini embedding 2:
```json5
agents: {
defaults: {
memorySearch: {
provider: "gemini",
model: "gemini-embedding-2-preview",
extraPaths: ["assets/reference", "voice-notes"],
multimodal: {
enabled: true,
modalities: ["image", "audio"], // or ["all"]
maxFileBytes: 10000000
},
remote: {
apiKey: "YOUR_GEMINI_API_KEY"
}
}
}
}
```
Notes:
- Multimodal memory is currently supported only for `gemini-embedding-2-preview`.
- Multimodal indexing applies only to files discovered through `memorySearch.extraPaths`.
- Supported modalities in this phase: image and audio.
- `memorySearch.fallback` must stay `"none"` while multimodal memory is enabled.
- Matching image/audio file bytes are uploaded to the configured Gemini embedding endpoint during indexing.
- Supported image extensions: `.jpg`, `.jpeg`, `.png`, `.webp`, `.gif`, `.heic`, `.heif`.
- Supported audio extensions: `.mp3`, `.wav`, `.ogg`, `.opus`, `.m4a`, `.aac`, `.flac`.
- Search queries remain text, but Gemini can compare those text queries against indexed image/audio embeddings.
- `memory_get` still reads Markdown only; binary files are searchable but not returned as raw file contents.
### Gemini embeddings (native)
Set the provider to `gemini` to use the Gemini embeddings API directly:

View File

@@ -131,6 +131,113 @@ describe("memory search config", () => {
expect(resolved?.extraPaths).toEqual(["/shared/notes", "docs", "../team-notes"]);
});
// Verifies that multimodal settings survive config resolution and that the
// "all" shorthand is normalized to the explicit ["image", "audio"] list
// while an explicit maxFileBytes override is preserved.
it("normalizes multimodal settings", () => {
const cfg = asConfig({
agents: {
defaults: {
memorySearch: {
provider: "gemini",
model: "gemini-embedding-2-preview",
multimodal: {
enabled: true,
modalities: ["all"],
maxFileBytes: 8192,
},
},
},
},
});
const resolved = resolveMemorySearchConfig(cfg, "main");
// "all" expands to both concrete modalities; maxFileBytes stays as given.
expect(resolved?.multimodal).toEqual({
enabled: true,
modalities: ["image", "audio"],
maxFileBytes: 8192,
});
});
// An explicit empty modalities list must be kept as-is (not re-defaulted),
// while an unset maxFileBytes falls back to the 10 MiB default.
it("keeps an explicit empty multimodal modalities list empty", () => {
const cfg = asConfig({
agents: {
defaults: {
memorySearch: {
provider: "gemini",
model: "gemini-embedding-2-preview",
multimodal: {
enabled: true,
modalities: [],
},
},
},
},
});
const resolved = resolveMemorySearchConfig(cfg, "main");
expect(resolved?.multimodal).toEqual({
enabled: true,
modalities: [],
maxFileBytes: 10 * 1024 * 1024,
});
expect(resolved?.provider).toBe("gemini");
});
// With enabled=true but modalities=[], multimodal indexing is effectively
// inactive, so the gemini-only provider check and the fallback="none"
// requirement are not enforced: openai + fallback resolves without throwing.
it("does not enforce multimodal provider validation when no modalities are active", () => {
const cfg = asConfig({
agents: {
defaults: {
memorySearch: {
provider: "openai",
model: "text-embedding-3-small",
fallback: "openai",
multimodal: {
enabled: true,
modalities: [],
},
},
},
},
});
const resolved = resolveMemorySearchConfig(cfg, "main");
expect(resolved?.multimodal).toEqual({
enabled: true,
modalities: [],
maxFileBytes: 10 * 1024 * 1024,
});
});
// Active multimodal modalities on a non-gemini provider must fail fast at
// config-resolution time with an actionable error message.
it("rejects multimodal memory on unsupported providers", () => {
const cfg = asConfig({
agents: {
defaults: {
memorySearch: {
provider: "openai",
model: "text-embedding-3-small",
multimodal: { enabled: true, modalities: ["image"] },
},
},
},
});
expect(() => resolveMemorySearchConfig(cfg, "main")).toThrow(
/memorySearch\.multimodal requires memorySearch\.provider = "gemini"/,
);
});
// Even with a supported gemini model, configuring a fallback provider while
// multimodal is active must be rejected (fallback must stay "none").
it("rejects multimodal memory when fallback is configured", () => {
const cfg = asConfig({
agents: {
defaults: {
memorySearch: {
provider: "gemini",
model: "gemini-embedding-2-preview",
fallback: "openai",
multimodal: { enabled: true, modalities: ["image"] },
},
},
},
});
expect(() => resolveMemorySearchConfig(cfg, "main")).toThrow(
/memorySearch\.multimodal does not support memorySearch\.fallback/,
);
});
it("includes batch defaults for openai without remote overrides", () => {
const cfg = configWithDefaultProvider("openai");
const resolved = resolveMemorySearchConfig(cfg, "main");

View File

@@ -3,6 +3,12 @@ import path from "node:path";
import type { OpenClawConfig, MemorySearchConfig } from "../config/config.js";
import { resolveStateDir } from "../config/paths.js";
import type { SecretInput } from "../config/types.secrets.js";
import {
isMemoryMultimodalEnabled,
normalizeMemoryMultimodalSettings,
supportsMemoryMultimodalEmbeddings,
type MemoryMultimodalSettings,
} from "../memory/multimodal.js";
import { clampInt, clampNumber, resolveUserPath } from "../utils.js";
import { resolveAgentConfig } from "./agent-scope.js";
@@ -10,6 +16,7 @@ export type ResolvedMemorySearchConfig = {
enabled: boolean;
sources: Array<"memory" | "sessions">;
extraPaths: string[];
multimodal: MemoryMultimodalSettings;
provider: "openai" | "local" | "gemini" | "voyage" | "mistral" | "ollama" | "auto";
remote?: {
baseUrl?: string;
@@ -204,6 +211,11 @@ function mergeConfig(
.map((value) => value.trim())
.filter(Boolean);
const extraPaths = Array.from(new Set(rawPaths));
const multimodal = normalizeMemoryMultimodalSettings({
enabled: overrides?.multimodal?.enabled ?? defaults?.multimodal?.enabled,
modalities: overrides?.multimodal?.modalities ?? defaults?.multimodal?.modalities,
maxFileBytes: overrides?.multimodal?.maxFileBytes ?? defaults?.multimodal?.maxFileBytes,
});
const vector = {
enabled: overrides?.store?.vector?.enabled ?? defaults?.store?.vector?.enabled ?? true,
extensionPath:
@@ -307,6 +319,7 @@ function mergeConfig(
enabled,
sources,
extraPaths,
multimodal,
provider,
remote,
experimental: {
@@ -365,5 +378,22 @@ export function resolveMemorySearchConfig(
if (!resolved.enabled) {
return null;
}
const multimodalActive = isMemoryMultimodalEnabled(resolved.multimodal);
if (
multimodalActive &&
!supportsMemoryMultimodalEmbeddings({
provider: resolved.provider,
model: resolved.model,
})
) {
throw new Error(
'agents.*.memorySearch.multimodal requires memorySearch.provider = "gemini" and model = "gemini-embedding-2-preview".',
);
}
if (multimodalActive && resolved.fallback !== "none") {
throw new Error(
'agents.*.memorySearch.multimodal does not support memorySearch.fallback. Set fallback to "none".',
);
}
return resolved;
}

View File

@@ -72,6 +72,10 @@ const TARGET_KEYS = [
"agents.defaults.memorySearch.fallback",
"agents.defaults.memorySearch.sources",
"agents.defaults.memorySearch.extraPaths",
"agents.defaults.memorySearch.multimodal",
"agents.defaults.memorySearch.multimodal.enabled",
"agents.defaults.memorySearch.multimodal.modalities",
"agents.defaults.memorySearch.multimodal.maxFileBytes",
"agents.defaults.memorySearch.experimental.sessionMemory",
"agents.defaults.memorySearch.remote.baseUrl",
"agents.defaults.memorySearch.remote.apiKey",

View File

@@ -778,7 +778,15 @@ export const FIELD_HELP: Record<string, string> = {
"agents.defaults.memorySearch.sources":
'Chooses which sources are indexed: "memory" reads MEMORY.md + memory files, and "sessions" includes transcript history. Keep ["memory"] unless you need recall from prior chat transcripts.',
"agents.defaults.memorySearch.extraPaths":
"Adds extra directories or .md files to the memory index beyond default memory files. Use this when key reference docs live elsewhere in your repo; keep paths small and intentional to avoid noisy recall.",
"Adds extra directories or .md files to the memory index beyond default memory files. Use this when key reference docs live elsewhere in your repo; when multimodal memory is enabled, matching image/audio files under these paths are also eligible for indexing.",
"agents.defaults.memorySearch.multimodal":
'Optional multimodal memory settings for indexing image and audio files from configured extra paths. Keep this off unless your embedding model explicitly supports cross-modal embeddings, and set `memorySearch.fallback` to "none" while it is enabled. Matching files are uploaded to the configured remote embedding provider during indexing.',
"agents.defaults.memorySearch.multimodal.enabled":
"Enables image/audio memory indexing from extraPaths. This currently requires Gemini embedding-2, keeps the default memory roots Markdown-only, disables memory-search fallback providers, and uploads matching binary content to the configured remote embedding provider.",
"agents.defaults.memorySearch.multimodal.modalities":
'Selects which multimodal file types are indexed from extraPaths: "image", "audio", or "all". Keep this narrow to avoid indexing large binary corpora unintentionally.',
"agents.defaults.memorySearch.multimodal.maxFileBytes":
"Sets the maximum bytes allowed per multimodal file before it is skipped during memory indexing. Use this to cap upload cost and indexing latency, or raise it for short high-quality audio clips.",
"agents.defaults.memorySearch.experimental.sessionMemory":
"Indexes session transcripts into memory search so responses can reference prior chat turns. Keep this off unless transcript recall is needed, because indexing cost and storage usage both increase.",
"agents.defaults.memorySearch.provider":

View File

@@ -319,6 +319,10 @@ export const FIELD_LABELS: Record<string, string> = {
"agents.defaults.memorySearch.enabled": "Enable Memory Search",
"agents.defaults.memorySearch.sources": "Memory Search Sources",
"agents.defaults.memorySearch.extraPaths": "Extra Memory Paths",
"agents.defaults.memorySearch.multimodal": "Memory Search Multimodal",
"agents.defaults.memorySearch.multimodal.enabled": "Enable Memory Search Multimodal",
"agents.defaults.memorySearch.multimodal.modalities": "Memory Search Multimodal Modalities",
"agents.defaults.memorySearch.multimodal.maxFileBytes": "Memory Search Multimodal Max File Bytes",
"agents.defaults.memorySearch.experimental.sessionMemory":
"Memory Search Session Index (Experimental)",
"agents.defaults.memorySearch.provider": "Memory Search Provider",

View File

@@ -319,6 +319,15 @@ export type MemorySearchConfig = {
sources?: Array<"memory" | "sessions">;
/** Extra paths to include in memory search (directories or .md files). */
extraPaths?: string[];
/** Optional multimodal file indexing for selected extra paths. */
multimodal?: {
/** Enable image/audio embeddings from extraPaths. */
enabled?: boolean;
/** Which non-text file types to index. */
modalities?: Array<"image" | "audio" | "all">;
/** Max bytes allowed per multimodal file before it is skipped. */
maxFileBytes?: number;
};
/** Experimental memory search settings. */
experimental?: {
/** Enable session transcript indexing (experimental, default: false). */

View File

@@ -553,6 +553,16 @@ export const MemorySearchSchema = z
enabled: z.boolean().optional(),
sources: z.array(z.union([z.literal("memory"), z.literal("sessions")])).optional(),
extraPaths: z.array(z.string()).optional(),
multimodal: z
.object({
enabled: z.boolean().optional(),
modalities: z
.array(z.union([z.literal("image"), z.literal("audio"), z.literal("all")]))
.optional(),
maxFileBytes: z.number().int().positive().optional(),
})
.strict()
.optional(),
experimental: z
.object({
sessionMemory: z.boolean().optional(),

View File

@@ -12,6 +12,10 @@ const EXT_BY_MIME: Record<string, string> = {
"image/gif": ".gif",
"audio/ogg": ".ogg",
"audio/mpeg": ".mp3",
"audio/wav": ".wav",
"audio/flac": ".flac",
"audio/aac": ".aac",
"audio/opus": ".opus",
"audio/x-m4a": ".m4a",
"audio/mp4": ".m4a",
"video/mp4": ".mp4",

View File

@@ -1,4 +1,5 @@
import { estimateUtf8Bytes, splitTextToUtf8ByteLimit } from "./embedding-input-limits.js";
import { hasNonTextEmbeddingParts } from "./embedding-inputs.js";
import { resolveEmbeddingMaxInputTokens } from "./embedding-model-limits.js";
import type { EmbeddingProvider } from "./embeddings.js";
import { hashText, type MemoryChunk } from "./internal.js";
@@ -16,6 +17,10 @@ export function enforceEmbeddingMaxInputTokens(
const out: MemoryChunk[] = [];
for (const chunk of chunks) {
if (hasNonTextEmbeddingParts(chunk.embeddingInput)) {
out.push(chunk);
continue;
}
if (estimateUtf8Bytes(chunk.text) <= maxInputTokens) {
out.push(chunk);
continue;
@@ -27,6 +32,7 @@ export function enforceEmbeddingMaxInputTokens(
endLine: chunk.endLine,
text,
hash: hashText(text),
embeddingInput: { text },
});
}
}

View File

@@ -1,3 +1,5 @@
import type { EmbeddingInput } from "./embedding-inputs.js";
// Helpers for enforcing embedding model input size limits.
//
// We use UTF-8 byte length as a conservative upper bound for tokenizer output.
@@ -11,6 +13,22 @@ export function estimateUtf8Bytes(text: string): number {
return Buffer.byteLength(text, "utf8");
}
/**
 * Estimates the UTF-8 byte footprint of a structured embedding input.
 *
 * When the input carries no structured parts, the estimate is simply the
 * byte length of its plain text. Otherwise every part contributes: text
 * parts count their text, inline-data parts count their mime type plus
 * their (base64) data payload.
 */
export function estimateStructuredEmbeddingInputBytes(input: EmbeddingInput): number {
const parts = input.parts;
if (!parts?.length) {
return estimateUtf8Bytes(input.text);
}
return parts.reduce(
(sum, part) =>
part.type === "text"
? sum + estimateUtf8Bytes(part.text)
: sum + estimateUtf8Bytes(part.mimeType) + estimateUtf8Bytes(part.data),
0,
);
}
export function splitTextToUtf8ByteLimit(text: string, maxUtf8Bytes: number): string[] {
if (maxUtf8Bytes <= 0) {
return [text];

View File

@@ -0,0 +1,34 @@
/** A plain text part of a structured embedding input. */
export type EmbeddingInputTextPart = {
type: "text";
text: string;
};
/** A base64-encoded binary part (e.g. image or audio bytes) of an embedding input. */
export type EmbeddingInputInlineDataPart = {
type: "inline-data";
mimeType: string;
data: string;
};
export type EmbeddingInputPart = EmbeddingInputTextPart | EmbeddingInputInlineDataPart;
/**
 * Structured embedding input: `text` always carries the textual form, while
 * `parts` optionally carries a multimodal breakdown (text + inline data).
 */
export type EmbeddingInput = {
text: string;
parts?: EmbeddingInputPart[];
};
/** Wraps plain text in the structured embedding-input shape (no parts). */
export function buildTextEmbeddingInput(text: string): EmbeddingInput {
return { text };
}
/** Type guard for inline-data (binary) embedding-input parts. */
export function isInlineDataEmbeddingInputPart(
part: EmbeddingInputPart,
): part is EmbeddingInputInlineDataPart {
return part.type === "inline-data";
}
/** True when the input carries at least one non-text (inline-data) part. */
export function hasNonTextEmbeddingParts(input: EmbeddingInput | undefined): boolean {
return input?.parts?.some(isInlineDataEmbeddingInputPart) ?? false;
}

View File

@@ -1,16 +1,13 @@
import { afterEach, describe, expect, it, vi } from "vitest";
import * as authModule from "../agents/model-auth.js";
import {
buildFileDataPart,
buildGeminiParts,
buildGeminiEmbeddingRequest,
buildGeminiTextEmbeddingRequest,
buildInlineDataPart,
createGeminiEmbeddingProvider,
DEFAULT_GEMINI_EMBEDDING_MODEL,
GEMINI_EMBEDDING_2_MODELS,
isGeminiEmbedding2Model,
resolveGeminiOutputDimensionality,
type GeminiPart,
} from "./embeddings-gemini.js";
vi.mock("../agents/model-auth.js", async () => {
@@ -61,40 +58,6 @@ function mockResolvedProviderKey(apiKey = "test-key") {
});
}
// ---------- Helper function tests ----------
describe("buildGeminiParts", () => {
it("wraps a string into a single text part", () => {
expect(buildGeminiParts("hello")).toEqual([{ text: "hello" }]);
});
it("passes through an existing parts array", () => {
const parts: GeminiPart[] = [
{ text: "hello" },
{ inlineData: { mimeType: "image/png", data: "base64data" } },
];
expect(buildGeminiParts(parts)).toBe(parts);
});
});
describe("buildInlineDataPart", () => {
it("produces the correct shape", () => {
const part = buildInlineDataPart("image/jpeg", "abc123");
expect(part).toEqual({
inlineData: { mimeType: "image/jpeg", data: "abc123" },
});
});
});
describe("buildFileDataPart", () => {
it("produces the correct shape", () => {
const part = buildFileDataPart("application/pdf", "gs://bucket/file.pdf");
expect(part).toEqual({
fileData: { mimeType: "application/pdf", fileUri: "gs://bucket/file.pdf" },
});
});
});
describe("buildGeminiTextEmbeddingRequest", () => {
it("builds a text embedding request with optional model and dimensions", () => {
expect(
@@ -113,6 +76,35 @@ describe("buildGeminiTextEmbeddingRequest", () => {
});
});
// buildGeminiEmbeddingRequest maps structured EmbeddingInput parts onto the
// Gemini wire format: text parts become { text }, inline-data parts become
// { inlineData: { mimeType, data } }, with the model path, task type, and
// output dimensionality attached alongside the content.
describe("buildGeminiEmbeddingRequest", () => {
it("builds a multimodal request from structured input parts", () => {
expect(
buildGeminiEmbeddingRequest({
input: {
text: "Image file: diagram.png",
parts: [
{ type: "text", text: "Image file: diagram.png" },
{ type: "inline-data", mimeType: "image/png", data: "abc123" },
],
},
taskType: "RETRIEVAL_DOCUMENT",
modelPath: "models/gemini-embedding-2-preview",
outputDimensionality: 1536,
}),
).toEqual({
model: "models/gemini-embedding-2-preview",
content: {
parts: [
{ text: "Image file: diagram.png" },
{ inlineData: { mimeType: "image/png", data: "abc123" } },
],
},
taskType: "RETRIEVAL_DOCUMENT",
outputDimensionality: 1536,
});
});
});
// ---------- Model detection ----------
describe("isGeminiEmbedding2Model", () => {
@@ -319,6 +311,21 @@ describe("gemini-embedding-2-preview provider", () => {
expect(body.outputDimensionality).toBe(768);
});
// embedQuery responses are sanitized (non-finite values -> 0) and then
// L2-normalized: [3, 4, NaN] -> [3, 4, 0] -> [0.6, 0.8, 0].
it("sanitizes and normalizes embedQuery responses", async () => {
const fetchMock = createGeminiFetchMock([3, 4, Number.NaN]);
vi.stubGlobal("fetch", fetchMock);
mockResolvedProviderKey();
const { provider } = await createGeminiEmbeddingProvider({
config: {} as never,
provider: "gemini",
model: "gemini-embedding-2-preview",
fallback: "none",
});
await expect(provider.embedQuery("test")).resolves.toEqual([0.6, 0.8, 0]);
});
it("uses custom outputDimensionality for each embedBatch request", async () => {
const fetchMock = createGeminiBatchFetchMock(2);
vi.stubGlobal("fetch", fetchMock);
@@ -341,6 +348,88 @@ describe("gemini-embedding-2-preview provider", () => {
]);
});
// Structured (multimodal) batch responses go through the same sanitation and
// normalization path: [0, Infinity, 5] -> [0, 0, 5] -> [0, 0, 1].
it("sanitizes and normalizes structured batch responses", async () => {
const fetchMock = createGeminiBatchFetchMock(1, [0, Number.POSITIVE_INFINITY, 5]);
vi.stubGlobal("fetch", fetchMock);
mockResolvedProviderKey();
const { provider } = await createGeminiEmbeddingProvider({
config: {} as never,
provider: "gemini",
model: "gemini-embedding-2-preview",
fallback: "none",
});
await expect(
provider.embedBatchInputs?.([
{
text: "Image file: diagram.png",
parts: [
{ type: "text", text: "Image file: diagram.png" },
{ type: "inline-data", mimeType: "image/png", data: "img" },
],
},
]),
).resolves.toEqual([[0, 0, 1]]);
});
// Verifies the batch endpoint request body: each structured input becomes one
// request entry whose content.parts interleave text and inlineData, each
// tagged with the model path, the RETRIEVAL_DOCUMENT task type, and the
// default 3072 output dimensionality.
it("supports multimodal embedBatchInputs requests", async () => {
const fetchMock = createGeminiBatchFetchMock(2);
vi.stubGlobal("fetch", fetchMock);
mockResolvedProviderKey();
const { provider } = await createGeminiEmbeddingProvider({
config: {} as never,
provider: "gemini",
model: "gemini-embedding-2-preview",
fallback: "none",
});
expect(provider.embedBatchInputs).toBeDefined();
await provider.embedBatchInputs?.([
{
text: "Image file: diagram.png",
parts: [
{ type: "text", text: "Image file: diagram.png" },
{ type: "inline-data", mimeType: "image/png", data: "img" },
],
},
{
text: "Audio file: note.wav",
parts: [
{ type: "text", text: "Audio file: note.wav" },
{ type: "inline-data", mimeType: "audio/wav", data: "aud" },
],
},
]);
const body = parseFetchBody(fetchMock);
expect(body.requests).toEqual([
{
model: "models/gemini-embedding-2-preview",
content: {
parts: [
{ text: "Image file: diagram.png" },
{ inlineData: { mimeType: "image/png", data: "img" } },
],
},
taskType: "RETRIEVAL_DOCUMENT",
outputDimensionality: 3072,
},
{
model: "models/gemini-embedding-2-preview",
content: {
parts: [
{ text: "Audio file: note.wav" },
{ inlineData: { mimeType: "audio/wav", data: "aud" } },
],
},
taskType: "RETRIEVAL_DOCUMENT",
outputDimensionality: 3072,
},
]);
});
it("throws for invalid outputDimensionality", async () => {
mockResolvedProviderKey();

View File

@@ -5,6 +5,7 @@ import {
import { requireApiKey, resolveApiKeyForProvider } from "../agents/model-auth.js";
import { parseGeminiAuth } from "../infra/gemini-auth.js";
import type { SsrFPolicy } from "../infra/net/ssrf.js";
import type { EmbeddingInput } from "./embedding-inputs.js";
import { sanitizeAndNormalizeEmbedding } from "./embedding-vectors.js";
import { debugEmbeddingsLog } from "./embeddings-debug.js";
import type { EmbeddingProvider, EmbeddingProviderOptions } from "./embeddings.js";
@@ -50,34 +51,14 @@ export type GeminiTextPart = { text: string };
export type GeminiInlinePart = {
inlineData: { mimeType: string; data: string };
};
export type GeminiFilePart = {
fileData: { mimeType: string; fileUri: string };
};
export type GeminiPart = GeminiTextPart | GeminiInlinePart | GeminiFilePart;
export type GeminiTextEmbeddingRequest = {
content: { parts: GeminiTextPart[] };
export type GeminiPart = GeminiTextPart | GeminiInlinePart;
export type GeminiEmbeddingRequest = {
content: { parts: GeminiPart[] };
taskType: GeminiTaskType;
outputDimensionality?: number;
model?: string;
};
/** Convert a string or pre-built parts array into `GeminiPart[]`. */
export function buildGeminiParts(input: string | GeminiPart[]): GeminiPart[] {
if (typeof input === "string") {
return [{ text: input }];
}
return input;
}
/** Convenience: build an inline-data part for multimodal embeddings. */
export function buildInlineDataPart(mimeType: string, base64Data: string): GeminiInlinePart {
return { inlineData: { mimeType, data: base64Data } };
}
/** Convenience: build a file-data part for multimodal embeddings. */
export function buildFileDataPart(mimeType: string, fileUri: string): GeminiFilePart {
return { fileData: { mimeType, fileUri } };
}
export type GeminiTextEmbeddingRequest = GeminiEmbeddingRequest;
/** Builds the text-only Gemini embedding request shape used across direct and batch APIs. */
export function buildGeminiTextEmbeddingRequest(params: {
@@ -86,8 +67,30 @@ export function buildGeminiTextEmbeddingRequest(params: {
outputDimensionality?: number;
modelPath?: string;
}): GeminiTextEmbeddingRequest {
const request: GeminiTextEmbeddingRequest = {
content: { parts: [{ text: params.text }] },
return buildGeminiEmbeddingRequest({
input: { text: params.text },
taskType: params.taskType,
outputDimensionality: params.outputDimensionality,
modelPath: params.modelPath,
});
}
export function buildGeminiEmbeddingRequest(params: {
input: EmbeddingInput;
taskType: GeminiTaskType;
outputDimensionality?: number;
modelPath?: string;
}): GeminiEmbeddingRequest {
const request: GeminiEmbeddingRequest = {
content: {
parts: params.input.parts?.map((part) =>
part.type === "text"
? ({ text: part.text } satisfies GeminiTextPart)
: ({
inlineData: { mimeType: part.mimeType, data: part.data },
} satisfies GeminiInlinePart),
) ?? [{ text: params.input.text }],
},
taskType: params.taskType,
};
if (params.modelPath) {
@@ -143,7 +146,7 @@ function resolveRemoteApiKey(remoteApiKey: unknown): string | undefined {
return trimmed;
}
function normalizeGeminiModel(model: string): string {
export function normalizeGeminiModel(model: string): string {
const trimmed = model.trim();
if (!trimmed) {
return DEFAULT_GEMINI_EMBEDDING_MODEL;
@@ -158,6 +161,46 @@ function normalizeGeminiModel(model: string): string {
return withoutPrefix;
}
/**
 * POSTs a Gemini embedding request body to the given endpoint and returns the
 * parsed JSON payload.
 *
 * Shared by the single-embed and batch endpoints, hence the payload type
 * covers both the `embedding` and `embeddings` response shapes. The call is
 * wrapped in executeWithApiKeyRotation over the client's configured API keys
 * (presumably retrying across keys on auth failures — see that helper) and
 * routed through withRemoteHttpResponse with the client's SSRF policy.
 *
 * Throws on a non-OK response, embedding the status and response text in the
 * error message.
 */
async function fetchGeminiEmbeddingPayload(params: {
client: GeminiEmbeddingClient;
endpoint: string;
body: unknown;
}): Promise<{
embedding?: { values?: number[] };
embeddings?: Array<{ values?: number[] }>;
}> {
return await executeWithApiKeyRotation({
provider: "google",
apiKeys: params.client.apiKeys,
execute: async (apiKey) => {
const authHeaders = parseGeminiAuth(apiKey);
// Auth headers are spread first, so client-level headers win on key collisions.
const headers = {
...authHeaders.headers,
...params.client.headers,
};
return await withRemoteHttpResponse({
url: params.endpoint,
ssrfPolicy: params.client.ssrfPolicy,
init: {
method: "POST",
headers,
body: JSON.stringify(params.body),
},
onResponse: async (res) => {
if (!res.ok) {
const text = await res.text();
throw new Error(`gemini embeddings failed: ${res.status} ${text}`);
}
return (await res.json()) as {
embedding?: { values?: number[] };
embeddings?: Array<{ values?: number[] }>;
};
},
});
},
});
}
function normalizeGeminiBaseUrl(raw: string): string {
const trimmed = raw.replace(/\/+$/, "");
const openAiIndex = trimmed.indexOf("/openai");
@@ -181,71 +224,50 @@ export async function createGeminiEmbeddingProvider(
const isV2 = isGeminiEmbedding2Model(client.model);
const outputDimensionality = client.outputDimensionality;
const fetchWithGeminiAuth = async (apiKey: string, endpoint: string, body: unknown) => {
const authHeaders = parseGeminiAuth(apiKey);
const headers = {
...authHeaders.headers,
...client.headers,
};
const payload = await withRemoteHttpResponse({
url: endpoint,
ssrfPolicy: client.ssrfPolicy,
init: {
method: "POST",
headers,
body: JSON.stringify(body),
},
onResponse: async (res) => {
if (!res.ok) {
const text = await res.text();
throw new Error(`gemini embeddings failed: ${res.status} ${text}`);
}
return (await res.json()) as {
embedding?: { values?: number[] };
embeddings?: Array<{ values?: number[] }>;
};
},
});
return payload;
};
const embedQuery = async (text: string): Promise<number[]> => {
if (!text.trim()) {
return [];
}
const body = buildGeminiTextEmbeddingRequest({
text,
taskType: options.taskType ?? "RETRIEVAL_QUERY",
outputDimensionality: isV2 ? outputDimensionality : undefined,
});
const payload = await executeWithApiKeyRotation({
provider: "google",
apiKeys: client.apiKeys,
execute: (apiKey) => fetchWithGeminiAuth(apiKey, embedUrl, body),
const payload = await fetchGeminiEmbeddingPayload({
client,
endpoint: embedUrl,
body: buildGeminiTextEmbeddingRequest({
text,
taskType: options.taskType ?? "RETRIEVAL_QUERY",
outputDimensionality: isV2 ? outputDimensionality : undefined,
}),
});
return sanitizeAndNormalizeEmbedding(payload.embedding?.values ?? []);
};
const embedBatch = async (texts: string[]): Promise<number[][]> => {
if (texts.length === 0) {
const embedBatchInputs = async (inputs: EmbeddingInput[]): Promise<number[][]> => {
if (inputs.length === 0) {
return [];
}
const requests = texts.map((text) =>
buildGeminiTextEmbeddingRequest({
text,
modelPath: client.modelPath,
taskType: options.taskType ?? "RETRIEVAL_DOCUMENT",
outputDimensionality: isV2 ? outputDimensionality : undefined,
}),
);
const batchBody = { requests };
const payload = await executeWithApiKeyRotation({
provider: "google",
apiKeys: client.apiKeys,
execute: (apiKey) => fetchWithGeminiAuth(apiKey, batchUrl, batchBody),
const payload = await fetchGeminiEmbeddingPayload({
client,
endpoint: batchUrl,
body: {
requests: inputs.map((input) =>
buildGeminiEmbeddingRequest({
input,
modelPath: client.modelPath,
taskType: options.taskType ?? "RETRIEVAL_DOCUMENT",
outputDimensionality: isV2 ? outputDimensionality : undefined,
}),
),
},
});
const embeddings = Array.isArray(payload.embeddings) ? payload.embeddings : [];
return texts.map((_, index) => sanitizeAndNormalizeEmbedding(embeddings[index]?.values ?? []));
return inputs.map((_, index) => sanitizeAndNormalizeEmbedding(embeddings[index]?.values ?? []));
};
const embedBatch = async (texts: string[]): Promise<number[][]> => {
return await embedBatchInputs(
texts.map((text) => ({
text,
})),
);
};
return {
@@ -255,6 +277,7 @@ export async function createGeminiEmbeddingProvider(
maxInputTokens: GEMINI_MAX_INPUT_TOKENS[client.model],
embedQuery,
embedBatch,
embedBatchInputs,
},
client,
};

View File

@@ -4,6 +4,7 @@ import type { OpenClawConfig } from "../config/config.js";
import type { SecretInput } from "../config/types.secrets.js";
import { formatErrorMessage } from "../infra/errors.js";
import { resolveUserPath } from "../utils.js";
import type { EmbeddingInput } from "./embedding-inputs.js";
import { sanitizeAndNormalizeEmbedding } from "./embedding-vectors.js";
import {
createGeminiEmbeddingProvider,
@@ -31,6 +32,7 @@ export type EmbeddingProvider = {
maxInputTokens?: number;
embedQuery: (text: string) => Promise<number[]>;
embedBatch: (texts: string[]) => Promise<number[][]>;
embedBatchInputs?: (inputs: EmbeddingInput[]) => Promise<number[][]>;
};
export type EmbeddingProviderId = "openai" | "local" | "gemini" | "voyage" | "mistral" | "ollama";

View File

@@ -1,3 +1,4 @@
import { randomUUID } from "node:crypto";
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
@@ -6,6 +7,7 @@ import { getMemorySearchManager, type MemoryIndexManager } from "./index.js";
import "./test-runtime-mocks.js";
let embedBatchCalls = 0;
let embedBatchInputCalls = 0;
let providerCalls: Array<{ provider?: string; model?: string; outputDimensionality?: number }> = [];
vi.mock("./embeddings.js", () => {
@@ -13,7 +15,9 @@ vi.mock("./embeddings.js", () => {
const lower = text.toLowerCase();
const alpha = lower.split("alpha").length - 1;
const beta = lower.split("beta").length - 1;
return [alpha, beta];
const image = lower.split("image").length - 1;
const audio = lower.split("audio").length - 1;
return [alpha, beta, image, audio];
};
return {
createEmbeddingProvider: async (options: {
@@ -38,6 +42,36 @@ vi.mock("./embeddings.js", () => {
embedBatchCalls += 1;
return texts.map(embedText);
},
...(providerId === "gemini"
? {
embedBatchInputs: async (
inputs: Array<{
text: string;
parts?: Array<
| { type: "text"; text: string }
| { type: "inline-data"; mimeType: string; data: string }
>;
}>,
) => {
embedBatchInputCalls += 1;
return inputs.map((input) => {
const inlineData = input.parts?.find((part) => part.type === "inline-data");
if (inlineData?.type === "inline-data" && inlineData.data.length > 9000) {
throw new Error("payload too large");
}
const mimeType =
inlineData?.type === "inline-data" ? inlineData.mimeType : undefined;
if (mimeType?.startsWith("image/")) {
return [0, 0, 1, 0];
}
if (mimeType?.startsWith("audio/")) {
return [0, 0, 0, 1];
}
return embedText(input.text);
});
},
}
: {}),
},
...(providerId === "gemini"
? {
@@ -64,6 +98,7 @@ describe("memory index", () => {
let indexVectorPath = "";
let indexMainPath = "";
let indexExtraPath = "";
let indexMultimodalPath = "";
let indexStatusPath = "";
let indexSourceChangePath = "";
let indexModelPath = "";
@@ -97,6 +132,7 @@ describe("memory index", () => {
indexMainPath = path.join(workspaceDir, "index-main.sqlite");
indexVectorPath = path.join(workspaceDir, "index-vector.sqlite");
indexExtraPath = path.join(workspaceDir, "index-extra.sqlite");
indexMultimodalPath = path.join(workspaceDir, "index-multimodal.sqlite");
indexStatusPath = path.join(workspaceDir, "index-status.sqlite");
indexSourceChangePath = path.join(workspaceDir, "index-source-change.sqlite");
indexModelPath = path.join(workspaceDir, "index-model-change.sqlite");
@@ -119,6 +155,7 @@ describe("memory index", () => {
// Keep atomic reindex tests on the safe path.
vi.stubEnv("OPENCLAW_TEST_MEMORY_UNSAFE_REINDEX", "1");
embedBatchCalls = 0;
embedBatchInputCalls = 0;
providerCalls = [];
// Keep the workspace stable to allow manager reuse across tests.
@@ -149,6 +186,11 @@ describe("memory index", () => {
provider?: "openai" | "gemini";
model?: string;
outputDimensionality?: number;
multimodal?: {
enabled?: boolean;
modalities?: Array<"image" | "audio" | "all">;
maxFileBytes?: number;
};
vectorEnabled?: boolean;
cacheEnabled?: boolean;
minScore?: number;
@@ -172,6 +214,7 @@ describe("memory index", () => {
},
cache: params.cacheEnabled ? { enabled: true } : undefined,
extraPaths: params.extraPaths,
multimodal: params.multimodal,
sources: params.sources,
experimental: { sessionMemory: params.sessionMemory ?? false },
},
@@ -247,6 +290,103 @@ describe("memory index", () => {
);
});
// End-to-end happy path: image + audio files under an extraPaths directory are
// embedded through the Gemini structured-input path and become searchable.
it("indexes multimodal image and audio files from extra paths with Gemini structured inputs", async () => {
const mediaDir = path.join(workspaceDir, "media-memory");
await fs.mkdir(mediaDir, { recursive: true });
// File contents are placeholders; the mocked embedder keys off MIME type, not bytes.
await fs.writeFile(path.join(mediaDir, "diagram.png"), Buffer.from("png"));
await fs.writeFile(path.join(mediaDir, "meeting.wav"), Buffer.from("wav"));
const cfg = createCfg({
storePath: indexMultimodalPath,
provider: "gemini",
model: "gemini-embedding-2-preview",
extraPaths: [mediaDir],
multimodal: { enabled: true, modalities: ["image", "audio"] },
});
const manager = await getPersistentManager(cfg);
await manager.sync({ reason: "test" });
// The structured-input embedding API must have been used at least once.
expect(embedBatchInputCalls).toBeGreaterThan(0);
const imageResults = await manager.search("image");
expect(imageResults.some((result) => result.path.endsWith("diagram.png"))).toBe(true);
const audioResults = await manager.search("audio");
expect(audioResults.some((result) => result.path.endsWith("meeting.wav"))).toBe(true);
});
// The mocked Gemini embedder throws "payload too large" for inline data > 9000
// bytes (base64 of 7000 raw bytes exceeds that); the sync must skip the file
// rather than fail, and markdown indexing must proceed normally.
it("skips oversized multimodal inputs without aborting sync", async () => {
const mediaDir = path.join(workspaceDir, "media-oversize");
await fs.mkdir(mediaDir, { recursive: true });
await fs.writeFile(path.join(mediaDir, "huge.png"), Buffer.alloc(7000, 1));
const cfg = createCfg({
storePath: path.join(workspaceDir, `index-oversize-${randomUUID()}.sqlite`),
provider: "gemini",
model: "gemini-embedding-2-preview",
extraPaths: [mediaDir],
multimodal: { enabled: true, modalities: ["image"] },
});
const manager = requireManager(await getMemorySearchManager({ cfg, agentId: "main" }));
await manager.sync({ reason: "test" });
expect(embedBatchInputCalls).toBeGreaterThan(0);
// The oversized image must not appear in results...
const imageResults = await manager.search("image");
expect(imageResults.some((result) => result.path.endsWith("huge.png"))).toBe(false);
// ...while regular markdown memory files are still indexed.
const alphaResults = await manager.search("alpha");
expect(alphaResults.some((result) => result.path.endsWith("memory/2026-01-12.md"))).toBe(true);
await manager.close?.();
});
// Simulates a file that briefly disappears while being read mid-sync (ENOENT on
// the second read); a later sync must pick the file back up and index it.
it("reindexes a multimodal file after a transient mid-sync disappearance", async () => {
const mediaDir = path.join(workspaceDir, "media-race");
const imagePath = path.join(mediaDir, "diagram.png");
await fs.mkdir(mediaDir, { recursive: true });
await fs.writeFile(imagePath, Buffer.from("png"));
const cfg = createCfg({
storePath: path.join(workspaceDir, `index-race-${randomUUID()}.sqlite`),
provider: "gemini",
model: "gemini-embedding-2-preview",
extraPaths: [mediaDir],
multimodal: { enabled: true, modalities: ["image"] },
});
const manager = requireManager(await getMemorySearchManager({ cfg, agentId: "main" }));
const realReadFile = fs.readFile.bind(fs);
let imageReads = 0;
// Fail only the second read of the image to mimic a delete/replace race;
// all other reads pass through to the real fs implementation.
const readSpy = vi.spyOn(fs, "readFile").mockImplementation(async (...args) => {
const [targetPath] = args;
if (typeof targetPath === "string" && targetPath === imagePath) {
imageReads += 1;
if (imageReads === 2) {
const err = Object.assign(
new Error(`ENOENT: no such file or directory, open '${imagePath}'`),
{
code: "ENOENT",
},
) as NodeJS.ErrnoException;
throw err;
}
}
return await realReadFile(...args);
});
await manager.sync({ reason: "test" });
readSpy.mockRestore();
const callsAfterFirstSync = embedBatchInputCalls;
// Force another sync pass; the file should be re-embedded this time.
(manager as unknown as { dirty: boolean }).dirty = true;
await manager.sync({ reason: "test" });
expect(embedBatchInputCalls).toBeGreaterThan(callsAfterFirstSync);
const results = await manager.search("image");
expect(results.some((result) => result.path.endsWith("diagram.png"))).toBe(true);
await manager.close?.();
});
it("keeps dirty false in status-only manager after prior indexing", async () => {
const cfg = createCfg({ storePath: indexStatusPath });
@@ -433,6 +573,82 @@ describe("memory index", () => {
await secondManager.close?.();
});
// Scope-hash coverage: reopening the same store with a different extraPaths set
// must trigger a full reindex so stale entries from the old scope disappear.
it("reindexes when extraPaths change", async () => {
const storePath = path.join(workspaceDir, `index-scope-extra-${randomUUID()}.sqlite`);
const firstExtraDir = path.join(workspaceDir, "scope-extra-a");
const secondExtraDir = path.join(workspaceDir, "scope-extra-b");
await fs.rm(firstExtraDir, { recursive: true, force: true });
await fs.rm(secondExtraDir, { recursive: true, force: true });
await fs.mkdir(firstExtraDir, { recursive: true });
await fs.mkdir(secondExtraDir, { recursive: true });
await fs.writeFile(path.join(firstExtraDir, "a.md"), "alpha only");
await fs.writeFile(path.join(secondExtraDir, "b.md"), "beta only");
// First manager indexes scope-extra-a into the store.
const first = await getMemorySearchManager({
cfg: createCfg({
storePath,
extraPaths: [firstExtraDir],
}),
agentId: "main",
});
const firstManager = requireManager(first);
await firstManager.sync?.({ reason: "test" });
await firstManager.close?.();
// Second manager reuses the store but points at scope-extra-b only.
const second = await getMemorySearchManager({
cfg: createCfg({
storePath,
extraPaths: [secondExtraDir],
}),
agentId: "main",
});
const secondManager = requireManager(second);
await secondManager.sync?.({ reason: "test" });
const results = await secondManager.search("beta");
// New scope is present; the old scope's file must have been purged.
expect(results.some((result) => result.path.endsWith("scope-extra-b/b.md"))).toBe(true);
expect(results.some((result) => result.path.endsWith("scope-extra-a/a.md"))).toBe(false);
await secondManager.close?.();
});
// Enabling multimodal on an existing store changes the scope hash, so media
// files that were previously ignored must be picked up on the next sync.
it("reindexes when multimodal settings change", async () => {
const storePath = path.join(workspaceDir, `index-scope-multimodal-${randomUUID()}.sqlite`);
const mediaDir = path.join(workspaceDir, "scope-media");
await fs.rm(mediaDir, { recursive: true, force: true });
await fs.mkdir(mediaDir, { recursive: true });
await fs.writeFile(path.join(mediaDir, "diagram.png"), Buffer.from("png"));
// First pass: multimodal disabled, so the PNG is not indexed.
const first = await getMemorySearchManager({
cfg: createCfg({
storePath,
provider: "gemini",
model: "gemini-embedding-2-preview",
extraPaths: [mediaDir],
}),
agentId: "main",
});
const firstManager = requireManager(first);
await firstManager.sync?.({ reason: "test" });
const multimodalCallsAfterFirstSync = embedBatchInputCalls;
await firstManager.close?.();
// Second pass: same store, multimodal now enabled for images.
const second = await getMemorySearchManager({
cfg: createCfg({
storePath,
provider: "gemini",
model: "gemini-embedding-2-preview",
extraPaths: [mediaDir],
multimodal: { enabled: true, modalities: ["image"] },
}),
agentId: "main",
});
const secondManager = requireManager(second);
await secondManager.sync?.({ reason: "test" });
expect(embedBatchInputCalls).toBeGreaterThan(multimodalCallsAfterFirstSync);
const results = await secondManager.search("image");
expect(results.some((result) => result.path.endsWith("scope-media/diagram.png"))).toBe(true);
await secondManager.close?.();
});
it("reuses cached embeddings on forced reindex", async () => {
const cfg = createCfg({ storePath: indexMainPath, cacheEnabled: true });
const manager = await getPersistentManager(cfg);

View File

@@ -3,12 +3,17 @@ import os from "node:os";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it } from "vitest";
import {
buildMultimodalChunkForIndexing,
buildFileEntry,
chunkMarkdown,
listMemoryFiles,
normalizeExtraMemoryPaths,
remapChunkLines,
} from "./internal.js";
import {
DEFAULT_MEMORY_MULTIMODAL_MAX_FILE_BYTES,
type MemoryMultimodalSettings,
} from "./multimodal.js";
function setupTempDirLifecycle(prefix: string): () => string {
let tmpDir = "";
@@ -38,6 +43,11 @@ describe("normalizeExtraMemoryPaths", () => {
describe("listMemoryFiles", () => {
const getTmpDir = setupTempDirLifecycle("memory-test-");
// Shared fixture: multimodal enabled for both modalities at the default size cap.
const multimodal: MemoryMultimodalSettings = {
enabled: true,
modalities: ["image", "audio"],
maxFileBytes: DEFAULT_MEMORY_MULTIMODAL_MAX_FILE_BYTES,
};
it("includes files from additional paths (directory)", async () => {
const tmpDir = getTmpDir();
@@ -131,10 +141,29 @@ describe("listMemoryFiles", () => {
const memoryMatches = files.filter((file) => file.endsWith("MEMORY.md"));
expect(memoryMatches).toHaveLength(1);
});
// Only recognized multimodal extensions are listed; unknown binaries are skipped.
it("includes image and audio files from extra paths when multimodal is enabled", async () => {
const tmpDir = getTmpDir();
const extraDir = path.join(tmpDir, "media");
await fs.mkdir(extraDir, { recursive: true });
await fs.writeFile(path.join(extraDir, "diagram.png"), Buffer.from("png"));
await fs.writeFile(path.join(extraDir, "note.wav"), Buffer.from("wav"));
// .bin is not a supported modality extension and must be excluded.
await fs.writeFile(path.join(extraDir, "ignore.bin"), Buffer.from("bin"));
const files = await listMemoryFiles(tmpDir, [extraDir], multimodal);
expect(files.some((file) => file.endsWith("diagram.png"))).toBe(true);
expect(files.some((file) => file.endsWith("note.wav"))).toBe(true);
expect(files.some((file) => file.endsWith("ignore.bin"))).toBe(false);
});
});
describe("buildFileEntry", () => {
const getTmpDir = setupTempDirLifecycle("memory-build-entry-");
// Shared fixture: multimodal enabled for both modalities at the default size cap.
const multimodal: MemoryMultimodalSettings = {
enabled: true,
modalities: ["image", "audio"],
maxFileBytes: DEFAULT_MEMORY_MULTIMODAL_MAX_FILE_BYTES,
};
it("returns null when the file disappears before reading", async () => {
const tmpDir = getTmpDir();
@@ -154,6 +183,37 @@ describe("buildFileEntry", () => {
expect(entry?.path).toBe("note.md");
expect(entry?.size).toBeGreaterThan(0);
});
// buildFileEntry should classify the PNG as multimodal and attach MIME + label
// metadata instead of treating it as markdown content.
it("returns multimodal metadata for eligible image files", async () => {
const tmpDir = getTmpDir();
const target = path.join(tmpDir, "diagram.png");
await fs.writeFile(target, Buffer.from("png"));
const entry = await buildFileEntry(target, tmpDir, multimodal);
expect(entry).toMatchObject({
path: "diagram.png",
kind: "multimodal",
modality: "image",
mimeType: "image/png",
contentText: "Image file: diagram.png",
});
});
// The chunk's embedding input must carry the text label followed by the base64
// inline-data part, and a positive structured-size estimate for batching.
it("builds a multimodal chunk lazily for indexing", async () => {
const tmpDir = getTmpDir();
const target = path.join(tmpDir, "diagram.png");
await fs.writeFile(target, Buffer.from("png"));
const entry = await buildFileEntry(target, tmpDir, multimodal);
const built = await buildMultimodalChunkForIndexing(entry!);
expect(built?.chunk.embeddingInput?.parts).toEqual([
{ type: "text", text: "Image file: diagram.png" },
expect.objectContaining({ type: "inline-data", mimeType: "image/png" }),
]);
expect(built?.structuredInputBytes).toBeGreaterThan(0);
});
});
describe("chunkMarkdown", () => {

View File

@@ -2,8 +2,17 @@ import crypto from "node:crypto";
import fsSync from "node:fs";
import fs from "node:fs/promises";
import path from "node:path";
import { detectMime } from "../media/mime.js";
import { runTasksWithConcurrency } from "../utils/run-with-concurrency.js";
import { estimateStructuredEmbeddingInputBytes } from "./embedding-input-limits.js";
import { buildTextEmbeddingInput, type EmbeddingInput } from "./embedding-inputs.js";
import { isFileMissingError } from "./fs-utils.js";
import {
buildMemoryMultimodalLabel,
classifyMemoryMultimodalPath,
type MemoryMultimodalModality,
type MemoryMultimodalSettings,
} from "./multimodal.js";
export type MemoryFileEntry = {
path: string;
@@ -11,6 +20,10 @@ export type MemoryFileEntry = {
mtimeMs: number;
size: number;
hash: string;
kind?: "markdown" | "multimodal";
contentText?: string;
modality?: MemoryMultimodalModality;
mimeType?: string;
};
export type MemoryChunk = {
@@ -18,6 +31,18 @@ export type MemoryChunk = {
endLine: number;
text: string;
hash: string;
embeddingInput?: EmbeddingInput;
};
export type MultimodalMemoryChunk = {
chunk: MemoryChunk;
structuredInputBytes: number;
};
const DISABLED_MULTIMODAL_SETTINGS: MemoryMultimodalSettings = {
enabled: false,
modalities: [],
maxFileBytes: 0,
};
export function ensureDir(dir: string): string {
@@ -56,7 +81,16 @@ export function isMemoryPath(relPath: string): boolean {
return normalized.startsWith("memory/");
}
async function walkDir(dir: string, files: string[]) {
/**
 * Decides whether a path may be indexed as a memory file: Markdown always
 * qualifies; anything else qualifies only when it classifies as an enabled
 * multimodal modality (image/audio extension).
 *
 * NOTE(review): the ".md" check here is case-sensitive, while the watcher path
 * uses toLowerCase().endsWith(".md") — confirm whether "NOTES.MD" should index.
 */
function isAllowedMemoryFilePath(filePath: string, multimodal?: MemoryMultimodalSettings): boolean {
if (filePath.endsWith(".md")) {
return true;
}
// When no settings are supplied, use the disabled sentinel so classification fails.
return (
classifyMemoryMultimodalPath(filePath, multimodal ?? DISABLED_MULTIMODAL_SETTINGS) !== null
);
}
async function walkDir(dir: string, files: string[], multimodal?: MemoryMultimodalSettings) {
const entries = await fs.readdir(dir, { withFileTypes: true });
for (const entry of entries) {
const full = path.join(dir, entry.name);
@@ -64,13 +98,13 @@ async function walkDir(dir: string, files: string[]) {
continue;
}
if (entry.isDirectory()) {
await walkDir(full, files);
await walkDir(full, files, multimodal);
continue;
}
if (!entry.isFile()) {
continue;
}
if (!entry.name.endsWith(".md")) {
if (!isAllowedMemoryFilePath(full, multimodal)) {
continue;
}
files.push(full);
@@ -80,6 +114,7 @@ async function walkDir(dir: string, files: string[]) {
export async function listMemoryFiles(
workspaceDir: string,
extraPaths?: string[],
multimodal?: MemoryMultimodalSettings,
): Promise<string[]> {
const result: string[] = [];
const memoryFile = path.join(workspaceDir, "MEMORY.md");
@@ -117,10 +152,10 @@ export async function listMemoryFiles(
continue;
}
if (stat.isDirectory()) {
await walkDir(inputPath, result);
await walkDir(inputPath, result, multimodal);
continue;
}
if (stat.isFile() && inputPath.endsWith(".md")) {
if (stat.isFile() && isAllowedMemoryFilePath(inputPath, multimodal)) {
result.push(inputPath);
}
} catch {}
@@ -152,6 +187,7 @@ export function hashText(value: string): string {
export async function buildFileEntry(
absPath: string,
workspaceDir: string,
multimodal?: MemoryMultimodalSettings,
): Promise<MemoryFileEntry | null> {
let stat;
try {
@@ -162,6 +198,48 @@ export async function buildFileEntry(
}
throw err;
}
const normalizedPath = path.relative(workspaceDir, absPath).replace(/\\/g, "/");
const multimodalSettings = multimodal ?? DISABLED_MULTIMODAL_SETTINGS;
const modality = classifyMemoryMultimodalPath(absPath, multimodalSettings);
if (modality) {
if (stat.size > multimodalSettings.maxFileBytes) {
return null;
}
let buffer: Buffer;
try {
buffer = await fs.readFile(absPath);
} catch (err) {
if (isFileMissingError(err)) {
return null;
}
throw err;
}
const mimeType = await detectMime({ buffer: buffer.subarray(0, 512), filePath: absPath });
if (!mimeType || !mimeType.startsWith(`${modality}/`)) {
return null;
}
const contentText = buildMemoryMultimodalLabel(modality, normalizedPath);
const dataHash = crypto.createHash("sha256").update(buffer).digest("hex");
const chunkHash = hashText(
JSON.stringify({
path: normalizedPath,
contentText,
mimeType,
dataHash,
}),
);
return {
path: normalizedPath,
absPath,
mtimeMs: stat.mtimeMs,
size: stat.size,
hash: chunkHash,
kind: "multimodal",
contentText,
modality,
mimeType,
};
}
let content: string;
try {
content = await fs.readFile(absPath, "utf-8");
@@ -173,11 +251,59 @@ export async function buildFileEntry(
}
const hash = hashText(content);
return {
path: path.relative(workspaceDir, absPath).replace(/\\/g, "/"),
path: normalizedPath,
absPath,
mtimeMs: stat.mtimeMs,
size: stat.size,
hash,
kind: "markdown",
};
}
/**
 * Reads a multimodal entry's bytes from disk and packages them as a structured
 * embedding input: the text label first, then the base64-encoded inline data.
 *
 * Returns null when the entry is not multimodal, lacks label/MIME metadata, or
 * the underlying file vanished before the read (a benign race during sync).
 */
async function loadMultimodalEmbeddingInput(
  entry: Pick<MemoryFileEntry, "absPath" | "contentText" | "mimeType" | "kind">,
): Promise<EmbeddingInput | null> {
  const { absPath, contentText, mimeType, kind } = entry;
  if (kind !== "multimodal" || !contentText || !mimeType) {
    return null;
  }
  let raw: Buffer;
  try {
    raw = await fs.readFile(absPath);
  } catch (err) {
    // A file deleted mid-sync is expected; anything else is a real failure.
    if (isFileMissingError(err)) {
      return null;
    }
    throw err;
  }
  const parts: EmbeddingInput["parts"] = [
    { type: "text", text: contentText },
    {
      type: "inline-data",
      mimeType,
      data: raw.toString("base64"),
    },
  ];
  return { text: contentText, parts };
}
export async function buildMultimodalChunkForIndexing(
entry: Pick<MemoryFileEntry, "absPath" | "contentText" | "mimeType" | "kind" | "hash">,
): Promise<MultimodalMemoryChunk | null> {
const embeddingInput = await loadMultimodalEmbeddingInput(entry);
if (!embeddingInput) {
return null;
}
return {
chunk: {
startLine: 1,
endLine: 1,
text: entry.contentText ?? embeddingInput.text,
hash: entry.hash,
embeddingInput,
},
structuredInputBytes: estimateStructuredEmbeddingInputBytes(embeddingInput),
};
}
@@ -213,6 +339,7 @@ export function chunkMarkdown(
endLine,
text,
hash: hashText(text),
embeddingInput: buildTextEmbeddingInput(text),
});
};

View File

@@ -8,9 +8,14 @@ import {
} from "./batch-openai.js";
import { type VoyageBatchRequest, runVoyageEmbeddingBatches } from "./batch-voyage.js";
import { enforceEmbeddingMaxInputTokens } from "./embedding-chunk-limits.js";
import { estimateUtf8Bytes } from "./embedding-input-limits.js";
import { buildGeminiTextEmbeddingRequest } from "./embeddings-gemini.js";
import {
estimateStructuredEmbeddingInputBytes,
estimateUtf8Bytes,
} from "./embedding-input-limits.js";
import { type EmbeddingInput, hasNonTextEmbeddingParts } from "./embedding-inputs.js";
import { buildGeminiEmbeddingRequest } from "./embeddings-gemini.js";
import {
buildMultimodalChunkForIndexing,
chunkMarkdown,
hashText,
parseEmbedding,
@@ -53,7 +58,9 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps {
let currentTokens = 0;
for (const chunk of chunks) {
const estimate = estimateUtf8Bytes(chunk.text);
const estimate = chunk.embeddingInput
? estimateStructuredEmbeddingInputBytes(chunk.embeddingInput)
: estimateUtf8Bytes(chunk.text);
const wouldExceed =
current.length > 0 && currentTokens + estimate > EMBEDDING_BATCH_MAX_TOKENS;
if (wouldExceed) {
@@ -188,9 +195,22 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps {
const missingChunks = missing.map((m) => m.chunk);
const batches = this.buildEmbeddingBatches(missingChunks);
const toCache: Array<{ hash: string; embedding: number[] }> = [];
const provider = this.provider;
if (!provider) {
throw new Error("Cannot embed batch in FTS-only mode (no embedding provider)");
}
let cursor = 0;
for (const batch of batches) {
const batchEmbeddings = await this.embedBatchWithRetry(batch.map((chunk) => chunk.text));
const inputs = batch.map((chunk) => chunk.embeddingInput ?? { text: chunk.text });
const hasStructuredInputs = inputs.some((input) => hasNonTextEmbeddingParts(input));
if (hasStructuredInputs && !provider.embedBatchInputs) {
throw new Error(
`Embedding provider "${provider.id}" does not support multimodal memory inputs.`,
);
}
const batchEmbeddings = hasStructuredInputs
? await this.embedBatchInputsWithRetry(inputs)
: await this.embedBatchWithRetry(batch.map((chunk) => chunk.text));
for (let i = 0; i < batch.length; i += 1) {
const item = missing[cursor + i];
const embedding = batchEmbeddings[i] ?? [];
@@ -476,6 +496,9 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps {
source: MemorySource,
): Promise<number[][]> {
const gemini = this.gemini;
if (chunks.some((chunk) => hasNonTextEmbeddingParts(chunk.embeddingInput))) {
return await this.embedChunksInBatches(chunks);
}
return await this.embedChunksWithProviderBatch<GeminiBatchRequest>({
chunks,
entry,
@@ -483,9 +506,10 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps {
provider: "gemini",
enabled: Boolean(gemini),
buildRequest: (chunk) => ({
request: buildGeminiTextEmbeddingRequest({
text: chunk.text,
request: buildGeminiEmbeddingRequest({
input: chunk.embeddingInput ?? { text: chunk.text },
taskType: "RETRIEVAL_DOCUMENT",
modelPath: this.gemini?.modelPath,
outputDimensionality: this.gemini?.outputDimensionality,
}),
}),
@@ -536,6 +560,45 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps {
}
}
/**
 * Embeds a batch of structured (possibly multimodal) inputs with exponential
 * backoff on retryable provider errors.
 *
 * Falls back to the plain text batch path when the provider does not implement
 * embedBatchInputs. Non-retryable errors, or exhausting the retry budget,
 * rethrow to the caller.
 */
protected async embedBatchInputsWithRetry(inputs: EmbeddingInput[]): Promise<number[][]> {
if (inputs.length === 0) {
return [];
}
// Providers without structured-input support only receive the text labels.
if (!this.provider?.embedBatchInputs) {
return await this.embedBatchWithRetry(inputs.map((input) => input.text));
}
let attempt = 0;
let delayMs = EMBEDDING_RETRY_BASE_DELAY_MS;
while (true) {
try {
const timeoutMs = this.resolveEmbeddingTimeout("batch");
log.debug("memory embeddings: structured batch start", {
provider: this.provider.id,
items: inputs.length,
timeoutMs,
});
return await this.withTimeout(
this.provider.embedBatchInputs(inputs),
timeoutMs,
`memory embeddings batch timed out after ${Math.round(timeoutMs / 1000)}s`,
);
} catch (err) {
const message = err instanceof Error ? err.message : String(err);
// Only rate-limit/transient failures are retried, up to the attempt cap.
if (!this.isRetryableEmbeddingError(message) || attempt >= EMBEDDING_RETRY_MAX_ATTEMPTS) {
throw err;
}
// Jittered exponential backoff (up to +20%), capped at the max delay.
const waitMs = Math.min(
EMBEDDING_RETRY_MAX_DELAY_MS,
Math.round(delayMs * (1 + Math.random() * 0.2)),
);
log.warn(`memory embeddings rate limited; retrying structured batch in ${waitMs}ms`);
await new Promise((resolve) => setTimeout(resolve, waitMs));
delayMs *= 2;
attempt += 1;
}
}
}
private isRetryableEmbeddingError(message: string): boolean {
return /(rate[_ ]limit|too many requests|429|resource has been exhausted|5\d\d|cloudflare|tokens per day)/i.test(
message,
@@ -695,6 +758,49 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps {
return this.batch.enabled ? this.batch.concurrency : EMBEDDING_INDEX_CONCURRENCY;
}
/**
 * Deletes all indexed data (vector rows, FTS rows, chunk rows) for a single
 * file within one source scope. Vector/FTS deletes are best-effort: failures
 * are swallowed because those tables may be absent or partially initialized.
 */
private clearIndexedFileData(pathname: string, source: MemorySource): void {
if (this.vector.enabled) {
try {
// Vector rows are keyed by chunk id, so resolve ids via the chunks table.
this.db
.prepare(
`DELETE FROM ${VECTOR_TABLE} WHERE id IN (SELECT id FROM chunks WHERE path = ? AND source = ?)`,
)
.run(pathname, source);
} catch {}
}
if (this.fts.enabled && this.fts.available && this.provider) {
try {
// FTS rows are additionally scoped by embedding model.
this.db
.prepare(`DELETE FROM ${FTS_TABLE} WHERE path = ? AND source = ? AND model = ?`)
.run(pathname, source, this.provider.model);
} catch {}
}
// Chunk rows are removed last, after the id-dependent vector delete above.
this.db.prepare(`DELETE FROM chunks WHERE path = ? AND source = ?`).run(pathname, source);
}
/**
 * Inserts or refreshes the files-table row tracking a file's hash/mtime/size.
 * The path is the conflict key, so a path that moves between sources is
 * reassigned to the new source on update.
 */
private upsertFileRecord(entry: MemoryFileEntry | SessionFileEntry, source: MemorySource): void {
this.db
.prepare(
`INSERT INTO files (path, source, hash, mtime, size) VALUES (?, ?, ?, ?, ?)
ON CONFLICT(path) DO UPDATE SET
source=excluded.source,
hash=excluded.hash,
mtime=excluded.mtime,
size=excluded.size`,
)
.run(entry.path, source, entry.hash, entry.mtimeMs, entry.size);
}
/** Removes the files-table row for a path/source pair (chunk data is cleared separately). */
private deleteFileRecord(pathname: string, source: MemorySource): void {
this.db.prepare(`DELETE FROM files WHERE path = ? AND source = ?`).run(pathname, source);
}
/**
 * Heuristically detects provider errors meaning the structured (inline-data)
 * payload exceeded a size or token limit, so the file can be skipped instead
 * of failing the whole sync.
 */
private isStructuredInputTooLargeError(message: string): boolean {
  const oversizePattern =
    /(413|payload too large|request too large|input too large|too many tokens|input limit|request size)/i;
  return oversizePattern.test(message);
}
protected async indexFile(
entry: MemoryFileEntry | SessionFileEntry,
options: { source: MemorySource; content?: string },
@@ -708,42 +814,59 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps {
return;
}
const content = options.content ?? (await fs.readFile(entry.absPath, "utf-8"));
const chunks = enforceEmbeddingMaxInputTokens(
this.provider,
chunkMarkdown(content, this.settings.chunking).filter(
(chunk) => chunk.text.trim().length > 0,
),
EMBEDDING_BATCH_MAX_TOKENS,
);
if (options.source === "sessions" && "lineMap" in entry) {
remapChunkLines(chunks, entry.lineMap);
let chunks: MemoryChunk[];
let structuredInputBytes: number | undefined;
if ("kind" in entry && entry.kind === "multimodal") {
const multimodalChunk = await buildMultimodalChunkForIndexing(entry);
if (!multimodalChunk) {
this.clearIndexedFileData(entry.path, options.source);
this.deleteFileRecord(entry.path, options.source);
return;
}
structuredInputBytes = multimodalChunk.structuredInputBytes;
chunks = [multimodalChunk.chunk];
} else {
const content = options.content ?? (await fs.readFile(entry.absPath, "utf-8"));
chunks = enforceEmbeddingMaxInputTokens(
this.provider,
chunkMarkdown(content, this.settings.chunking).filter(
(chunk) => chunk.text.trim().length > 0,
),
EMBEDDING_BATCH_MAX_TOKENS,
);
if (options.source === "sessions" && "lineMap" in entry) {
remapChunkLines(chunks, entry.lineMap);
}
}
let embeddings: number[][];
try {
embeddings = this.batch.enabled
? await this.embedChunksWithBatch(chunks, entry, options.source)
: await this.embedChunksInBatches(chunks);
} catch (err) {
const message = err instanceof Error ? err.message : String(err);
if (
"kind" in entry &&
entry.kind === "multimodal" &&
this.isStructuredInputTooLargeError(message)
) {
log.warn("memory embeddings: skipping multimodal file rejected as too large", {
path: entry.path,
bytes: structuredInputBytes,
provider: this.provider.id,
model: this.provider.model,
error: message,
});
this.clearIndexedFileData(entry.path, options.source);
this.upsertFileRecord(entry, options.source);
return;
}
throw err;
}
const embeddings = this.batch.enabled
? await this.embedChunksWithBatch(chunks, entry, options.source)
: await this.embedChunksInBatches(chunks);
const sample = embeddings.find((embedding) => embedding.length > 0);
const vectorReady = sample ? await this.ensureVectorReady(sample.length) : false;
const now = Date.now();
if (vectorReady) {
try {
this.db
.prepare(
`DELETE FROM ${VECTOR_TABLE} WHERE id IN (SELECT id FROM chunks WHERE path = ? AND source = ?)`,
)
.run(entry.path, options.source);
} catch {}
}
if (this.fts.enabled && this.fts.available) {
try {
this.db
.prepare(`DELETE FROM ${FTS_TABLE} WHERE path = ? AND source = ? AND model = ?`)
.run(entry.path, options.source, this.provider.model);
} catch {}
}
this.db
.prepare(`DELETE FROM chunks WHERE path = ? AND source = ?`)
.run(entry.path, options.source);
this.clearIndexedFileData(entry.path, options.source);
for (let i = 0; i < chunks.length; i++) {
const chunk = chunks[i];
const embedding = embeddings[i] ?? [];
@@ -798,15 +921,6 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps {
);
}
}
this.db
.prepare(
`INSERT INTO files (path, source, hash, mtime, size) VALUES (?, ?, ?, ?, ?)
ON CONFLICT(path) DO UPDATE SET
source=excluded.source,
hash=excluded.hash,
mtime=excluded.mtime,
size=excluded.size`,
)
.run(entry.path, options.source, entry.hash, entry.mtimeMs, entry.size);
this.upsertFileRecord(entry, options.source);
}
}

View File

@@ -29,12 +29,18 @@ import { isFileMissingError } from "./fs-utils.js";
import {
buildFileEntry,
ensureDir,
hashText,
listMemoryFiles,
normalizeExtraMemoryPaths,
runWithConcurrency,
} from "./internal.js";
import { type MemoryFileEntry } from "./internal.js";
import { ensureMemoryIndexSchema } from "./memory-schema.js";
import {
buildCaseInsensitiveExtensionGlob,
classifyMemoryMultimodalPath,
getMemoryMultimodalExtensions,
} from "./multimodal.js";
import type { SessionFileEntry } from "./session-files.js";
import {
buildSessionEntry,
@@ -50,6 +56,7 @@ type MemoryIndexMeta = {
provider: string;
providerKey?: string;
sources?: MemorySource[];
scopeHash?: string;
chunkTokens: number;
chunkOverlap: number;
vectorDims?: number;
@@ -383,9 +390,22 @@ export abstract class MemoryManagerSyncOps {
}
if (stat.isDirectory()) {
watchPaths.add(path.join(entry, "**", "*.md"));
if (this.settings.multimodal.enabled) {
for (const modality of this.settings.multimodal.modalities) {
for (const extension of getMemoryMultimodalExtensions(modality)) {
watchPaths.add(
path.join(entry, "**", buildCaseInsensitiveExtensionGlob(extension)),
);
}
}
}
continue;
}
if (stat.isFile() && entry.toLowerCase().endsWith(".md")) {
if (
stat.isFile() &&
(entry.toLowerCase().endsWith(".md") ||
classifyMemoryMultimodalPath(entry, this.settings.multimodal) !== null)
) {
watchPaths.add(entry);
}
} catch {
@@ -649,9 +669,19 @@ export abstract class MemoryManagerSyncOps {
return;
}
const files = await listMemoryFiles(this.workspaceDir, this.settings.extraPaths);
const files = await listMemoryFiles(
this.workspaceDir,
this.settings.extraPaths,
this.settings.multimodal,
);
const fileEntries = (
await Promise.all(files.map(async (file) => buildFileEntry(file, this.workspaceDir)))
await runWithConcurrency(
files.map(
(file) => async () =>
await buildFileEntry(file, this.workspaceDir, this.settings.multimodal),
),
this.getIndexConcurrency(),
)
).filter((entry): entry is MemoryFileEntry => entry !== null);
log.debug("memory sync: indexing memory files", {
files: fileEntries.length,
@@ -868,6 +898,7 @@ export abstract class MemoryManagerSyncOps {
const vectorReady = await this.ensureVectorReady();
const meta = this.readMeta();
const configuredSources = this.resolveConfiguredSourcesForMeta();
const configuredScopeHash = this.resolveConfiguredScopeHash();
const needsFullReindex =
params?.force ||
!meta ||
@@ -875,6 +906,7 @@ export abstract class MemoryManagerSyncOps {
(this.provider && meta.provider !== this.provider.id) ||
meta.providerKey !== this.providerKey ||
this.metaSourcesDiffer(meta, configuredSources) ||
meta.scopeHash !== configuredScopeHash ||
meta.chunkTokens !== this.settings.chunking.tokens ||
meta.chunkOverlap !== this.settings.chunking.overlap ||
(vectorReady && !meta?.vectorDims);
@@ -1088,6 +1120,7 @@ export abstract class MemoryManagerSyncOps {
provider: this.provider?.id ?? "none",
providerKey: this.providerKey!,
sources: this.resolveConfiguredSourcesForMeta(),
scopeHash: this.resolveConfiguredScopeHash(),
chunkTokens: this.settings.chunking.tokens,
chunkOverlap: this.settings.chunking.overlap,
};
@@ -1159,6 +1192,7 @@ export abstract class MemoryManagerSyncOps {
provider: this.provider?.id ?? "none",
providerKey: this.providerKey!,
sources: this.resolveConfiguredSourcesForMeta(),
scopeHash: this.resolveConfiguredScopeHash(),
chunkTokens: this.settings.chunking.tokens,
chunkOverlap: this.settings.chunking.overlap,
};
@@ -1236,6 +1270,22 @@ export abstract class MemoryManagerSyncOps {
return normalized.length > 0 ? normalized : ["memory"];
}
/**
 * Computes a stable hash of the indexing scope (normalized extraPaths plus the
 * multimodal settings). Stored in index meta; any change forces a full
 * reindex. Inputs are sorted and slash-normalized so ordering and platform
 * path separators do not alter the hash.
 */
private resolveConfiguredScopeHash(): string {
const extraPaths = normalizeExtraMemoryPaths(this.workspaceDir, this.settings.extraPaths)
.map((value) => value.replace(/\\/g, "/"))
.toSorted();
return hashText(
JSON.stringify({
extraPaths,
multimodal: {
enabled: this.settings.multimodal.enabled,
modalities: [...this.settings.multimodal.modalities].toSorted(),
maxFileBytes: this.settings.multimodal.maxFileBytes,
},
}),
);
}
private metaSourcesDiffer(meta: MemoryIndexMeta, configuredSources: MemorySource[]): boolean {
const metaSources = this.normalizeMetaSources(meta);
if (metaSources.length !== configuredSources.length) {

View File

@@ -106,4 +106,50 @@ describe("memory watcher config", () => {
expect(ignored?.(path.join(workspaceDir, "memory", ".venv", "lib", "python.md"))).toBe(true);
expect(ignored?.(path.join(workspaceDir, "memory", "project", "notes.md"))).toBe(false);
});
// The file watcher should receive per-extension globs like "*.[pP][nN][gG]" so
// uppercase filenames (e.g. PHOTO.PNG) are matched on case-sensitive filesystems.
it("watches multimodal extensions with case-insensitive globs", async () => {
workspaceDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-memory-watch-"));
extraDir = path.join(workspaceDir, "extra");
await fs.mkdir(path.join(workspaceDir, "memory"), { recursive: true });
await fs.mkdir(extraDir, { recursive: true });
await fs.writeFile(path.join(extraDir, "PHOTO.PNG"), "png");
const cfg = {
agents: {
defaults: {
workspace: workspaceDir,
memorySearch: {
provider: "gemini",
model: "gemini-embedding-2-preview",
fallback: "none",
store: { path: path.join(workspaceDir, "index.sqlite"), vector: { enabled: false } },
sync: { watch: true, watchDebounceMs: 25, onSessionStart: false, onSearch: false },
query: { minScore: 0, hybrid: { enabled: false } },
extraPaths: [extraDir],
multimodal: { enabled: true, modalities: ["image", "audio"] },
},
},
list: [{ id: "main", default: true }],
},
} as OpenClawConfig;
const result = await getMemorySearchManager({ cfg, agentId: "main" });
expect(result.manager).not.toBeNull();
if (!result.manager) {
throw new Error("manager missing");
}
manager = result.manager as unknown as MemoryIndexManager;
expect(watchMock).toHaveBeenCalledTimes(1);
// Inspect the paths handed to the (mocked) watcher on its single invocation.
const [watchedPaths] = watchMock.mock.calls[0] as unknown as [
string[],
Record<string, unknown>,
];
expect(watchedPaths).toEqual(
expect.arrayContaining([
path.join(extraDir, "**", "*.[pP][nN][gG]"),
path.join(extraDir, "**", "*.[wW][aA][vV]"),
]),
);
});
});

118
src/memory/multimodal.ts Normal file
View File

@@ -0,0 +1,118 @@
// Per-modality indexing spec: the label prefix used when describing an indexed
// file, and the file extensions (lowercase, dot-prefixed) treated as that
// modality. The modality union and helper functions below derive from this
// table, so adding a modality here is the single point of change.
const MEMORY_MULTIMODAL_SPECS = {
  image: {
    labelPrefix: "Image file",
    extensions: [".jpg", ".jpeg", ".png", ".webp", ".gif", ".heic", ".heif"],
  },
  audio: {
    labelPrefix: "Audio file",
    extensions: [".mp3", ".wav", ".ogg", ".opus", ".m4a", ".aac", ".flac"],
  },
} as const;
// Concrete modality identifier: "image" | "audio" (keys of the spec table).
export type MemoryMultimodalModality = keyof typeof MEMORY_MULTIMODAL_SPECS;
export const MEMORY_MULTIMODAL_MODALITIES = Object.keys(
  MEMORY_MULTIMODAL_SPECS,
) as MemoryMultimodalModality[];
// Configuration-facing selection value: a concrete modality or "all".
export type MemoryMultimodalSelection = MemoryMultimodalModality | "all";
// Normalized multimodal settings consumed by the memory indexer.
export type MemoryMultimodalSettings = {
  enabled: boolean;
  modalities: MemoryMultimodalModality[];
  maxFileBytes: number;
};
// Default per-file size cap for multimodal indexing: 10 MiB.
export const DEFAULT_MEMORY_MULTIMODAL_MAX_FILE_BYTES = 10 * 1024 * 1024;
/**
 * Normalize a user-supplied modality selection to a deduplicated list of
 * concrete modalities.
 *
 * Omitting the selection or including "all" selects every known modality.
 * Unknown values are dropped; duplicates are removed while preserving the
 * first-occurrence order of the input.
 */
export function normalizeMemoryMultimodalModalities(
  raw: MemoryMultimodalSelection[] | undefined,
): MemoryMultimodalModality[] {
  if (raw === undefined || raw.includes("all")) {
    return [...MEMORY_MULTIMODAL_MODALITIES];
  }
  const normalized = new Set<MemoryMultimodalModality>();
  for (const value of raw) {
    // Validate against the spec-derived list instead of hard-coding
    // "image"/"audio", so modalities added to MEMORY_MULTIMODAL_SPECS are
    // accepted here automatically.
    if (value !== "all" && MEMORY_MULTIMODAL_MODALITIES.includes(value)) {
      normalized.add(value);
    }
  }
  return Array.from(normalized);
}
/**
 * Coerce raw (possibly partial) multimodal config into fully-populated
 * settings.
 *
 * The feature is on only for an explicit `enabled: true`; when off, the
 * modality list is emptied so downstream gates short-circuit. The size cap
 * falls back to the default for missing/non-finite values and is otherwise
 * floored and clamped to at least one byte.
 */
export function normalizeMemoryMultimodalSettings(raw: {
  enabled?: boolean;
  modalities?: MemoryMultimodalSelection[];
  maxFileBytes?: number;
}): MemoryMultimodalSettings {
  const enabled = raw.enabled === true;
  let maxFileBytes = DEFAULT_MEMORY_MULTIMODAL_MAX_FILE_BYTES;
  if (typeof raw.maxFileBytes === "number" && Number.isFinite(raw.maxFileBytes)) {
    maxFileBytes = Math.max(1, Math.floor(raw.maxFileBytes));
  }
  const modalities = enabled ? normalizeMemoryMultimodalModalities(raw.modalities) : [];
  return { enabled, modalities, maxFileBytes };
}
/**
 * Whether multimodal indexing is effectively active: the flag must be on AND
 * at least one modality must be selected.
 */
export function isMemoryMultimodalEnabled(settings: MemoryMultimodalSettings): boolean {
  if (!settings.enabled) {
    return false;
  }
  return settings.modalities.length !== 0;
}
/**
 * Return the dot-prefixed, lowercase file extensions recognized for the given
 * modality, straight from the spec table.
 */
export function getMemoryMultimodalExtensions(
  modality: MemoryMultimodalModality,
): readonly string[] {
  const { extensions } = MEMORY_MULTIMODAL_SPECS[modality];
  return extensions;
}
/**
 * Build the human-readable label for an indexed multimodal file,
 * e.g. `"Image file: extra/photo.png"`.
 */
export function buildMemoryMultimodalLabel(
  modality: MemoryMultimodalModality,
  normalizedPath: string,
): string {
  const prefix = MEMORY_MULTIMODAL_SPECS[modality].labelPrefix;
  return [prefix, normalizedPath].join(": ");
}
/**
 * Build a glob that matches a file extension case-insensitively by expanding
 * each character into a `[xX]` bracket class, e.g. `".png"` -> `"*.[pP][nN][gG]"`.
 * A blank/empty extension yields the match-anything glob `"*"`.
 */
export function buildCaseInsensitiveExtensionGlob(extension: string): string {
  // Drop surrounding whitespace and an optional leading dot before expanding.
  const cleaned = extension.trim().replace(/^\./, "").toLowerCase();
  if (cleaned === "") {
    return "*";
  }
  let pattern = "*.";
  for (const char of cleaned) {
    // `cleaned` is already lowercase, so the class is [lower upper].
    pattern += `[${char}${char.toUpperCase()}]`;
  }
  return pattern;
}
/**
 * Classify a file path as one of the enabled modalities by extension, or
 * return null when multimodal indexing is off or no enabled modality matches.
 * Matching is case-insensitive and ignores surrounding whitespace in the path.
 */
export function classifyMemoryMultimodalPath(
  filePath: string,
  settings: MemoryMultimodalSettings,
): MemoryMultimodalModality | null {
  if (!isMemoryMultimodalEnabled(settings)) {
    return null;
  }
  const candidate = filePath.trim().toLowerCase();
  const matched = settings.modalities.find((modality) =>
    getMemoryMultimodalExtensions(modality).some((extension) => candidate.endsWith(extension)),
  );
  return matched ?? null;
}
/**
 * Canonicalize a Gemini embedding model identifier by trimming whitespace and
 * stripping an optional leading `models/` segment followed by an optional
 * `gemini/` or `google/` provider segment. Blank input yields `""`.
 */
export function normalizeGeminiEmbeddingModelForMemory(model: string): string {
  const trimmed = model.trim();
  if (trimmed === "") {
    return "";
  }
  // Order matters: "models/gemini/foo" peels to "gemini/foo", then to "foo".
  const withoutModelsPrefix = trimmed.replace(/^models\//, "");
  return withoutModelsPrefix.replace(/^(gemini|google)\//, "");
}
/**
 * Strict gate for multimodal memory embeddings: only the Gemini provider with
 * the (normalized) `gemini-embedding-2-preview` model qualifies.
 */
export function supportsMemoryMultimodalEmbeddings(params: {
  provider: string;
  model: string;
}): boolean {
  const { provider, model } = params;
  if (provider !== "gemini") {
    return false;
  }
  const canonical = normalizeGeminiEmbeddingModelForMemory(model);
  return canonical === "gemini-embedding-2-preview";
}