diff --git a/CHANGELOG.md b/CHANGELOG.md index e88bd0d4638..73fb9ac030a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ Docs: https://docs.openclaw.ai - iOS/TestFlight: add a local beta release flow with Fastlane prepare/archive/upload support, canonical beta bundle IDs, and watch-app archive fixes. (#42991) Thanks @ngutman. - macOS/onboarding: detect when remote gateways need a shared auth token, explain where to find it on the gateway host, and clarify when a successful check used paired-device auth instead. (#43100) Thanks @ngutman. - Onboarding/Ollama: add first-class Ollama setup with Local or Cloud + Local modes, browser-based cloud sign-in, curated model suggestions, and cloud-model handling that skips unnecessary local pulls. (#41529) Thanks @BruceMacD. +- Memory: add opt-in multimodal image and audio indexing for `memorySearch.extraPaths` with Gemini `gemini-embedding-2-preview`, strict fallback gating, and scope-based reindexing. (#43460) Thanks @gumadeiras. ### Breaking diff --git a/docs/concepts/memory.md b/docs/concepts/memory.md index 35c51f6b523..8ed755b394c 100644 --- a/docs/concepts/memory.md +++ b/docs/concepts/memory.md @@ -284,9 +284,46 @@ Notes: - Paths can be absolute or workspace-relative. - Directories are scanned recursively for `.md` files. -- Only Markdown files are indexed. +- By default, only Markdown files are indexed. +- If `memorySearch.multimodal.enabled = true`, OpenClaw also indexes supported image/audio files under `extraPaths` only. Default memory roots (`MEMORY.md`, `memory.md`, `memory/**/*.md`) stay Markdown-only. - Symlinks are ignored (files or directories). +### Multimodal memory files (Gemini image + audio) + +OpenClaw can index image and audio files from `memorySearch.extraPaths` when using Gemini embedding 2: + +```json5 +agents: { + defaults: { + memorySearch: { + provider: "gemini", + model: "gemini-embedding-2-preview", + extraPaths: ["assets/reference", "voice-notes"], + multimodal: { + enabled: true, + modalities: ["image", "audio"], // or ["all"] + maxFileBytes: 10000000 + }, + remote: { + apiKey: "YOUR_GEMINI_API_KEY" + } + } + } +} +``` + +Notes: + +- Multimodal memory is currently supported only for `gemini-embedding-2-preview`. +- Multimodal indexing applies only to files discovered through `memorySearch.extraPaths`. +- Supported modalities in this phase: image and audio. +- `memorySearch.fallback` must stay `"none"` while multimodal memory is enabled. +- Matching image/audio file bytes are uploaded to the configured Gemini embedding endpoint during indexing. +- Supported image extensions: `.jpg`, `.jpeg`, `.png`, `.webp`, `.gif`, `.heic`, `.heif`. +- Supported audio extensions: `.mp3`, `.wav`, `.ogg`, `.opus`, `.m4a`, `.aac`, `.flac`. +- Search queries remain text, but Gemini can compare those text queries against indexed image/audio embeddings. +- `memory_get` still reads Markdown only; binary files are searchable but not returned as raw file contents. + ### Gemini embeddings (native) Set the provider to `gemini` to use the Gemini embeddings API directly: diff --git a/src/agents/memory-search.test.ts b/src/agents/memory-search.test.ts index 9372b4c7696..1d04b730351 100644 --- a/src/agents/memory-search.test.ts +++ b/src/agents/memory-search.test.ts @@ -131,6 +131,113 @@ describe("memory search config", () => { expect(resolved?.extraPaths).toEqual(["/shared/notes", "docs", "../team-notes"]); }); + it("normalizes multimodal settings", () => { + const cfg = asConfig({ + agents: { + defaults: { + memorySearch: { + provider: "gemini", + model: "gemini-embedding-2-preview", + multimodal: { + enabled: true, + modalities: ["all"], + maxFileBytes: 8192, + }, + }, + }, + }, + }); + const resolved = resolveMemorySearchConfig(cfg, "main"); + expect(resolved?.multimodal).toEqual({ + enabled: true, + modalities: ["image", "audio"], + maxFileBytes: 8192, + }); + }); + + it("keeps an explicit empty multimodal modalities list empty", () => { + const cfg = asConfig({ + agents: { + defaults: { + memorySearch: { + provider: "gemini", + model: "gemini-embedding-2-preview", + multimodal: { + enabled: true, + modalities: [], + }, + }, + }, + }, + }); + const resolved = resolveMemorySearchConfig(cfg, "main"); + expect(resolved?.multimodal).toEqual({ + enabled: true, + modalities: [], + maxFileBytes: 10 * 1024 * 1024, + }); + expect(resolved?.provider).toBe("gemini"); + }); + + it("does not enforce multimodal provider validation when no modalities are active", () => { + const cfg = asConfig({ + agents: { + defaults: { + memorySearch: { + provider: "openai", + model: "text-embedding-3-small", + fallback: "openai", + multimodal: { + enabled: true, + modalities: [], + }, + }, + }, + }, + }); + const resolved = resolveMemorySearchConfig(cfg, "main"); + expect(resolved?.multimodal).toEqual({ + enabled: true, + modalities: [], + maxFileBytes: 10 * 1024 * 1024, + }); + }); + + it("rejects multimodal memory on unsupported providers", () => { + const cfg = asConfig({ + agents: { + defaults: { + memorySearch: { + provider: "openai", + model: "text-embedding-3-small", + multimodal: { enabled: true, modalities: ["image"] }, + }, + }, + }, + }); + expect(() => resolveMemorySearchConfig(cfg, "main")).toThrow( + /memorySearch\.multimodal requires memorySearch\.provider = "gemini"/, + ); + }); + + it("rejects multimodal memory when fallback is configured", () => { + const cfg = asConfig({ + agents: { + defaults: { + memorySearch: { + provider: "gemini", + model: "gemini-embedding-2-preview", + fallback: "openai", + multimodal: { enabled: true, modalities: ["image"] }, + }, + }, + }, + }); + expect(() => resolveMemorySearchConfig(cfg, "main")).toThrow( + /memorySearch\.multimodal does not support memorySearch\.fallback/, + ); + }); + it("includes batch defaults for openai without remote overrides", () => { const cfg = configWithDefaultProvider("openai"); const resolved = resolveMemorySearchConfig(cfg, "main"); diff --git a/src/agents/memory-search.ts b/src/agents/memory-search.ts index 6bcacfec2db..d00dae70639 100644 --- a/src/agents/memory-search.ts +++ b/src/agents/memory-search.ts @@ -3,6 +3,12 @@ import path from "node:path"; import type { OpenClawConfig, MemorySearchConfig } from "../config/config.js"; import { resolveStateDir } from "../config/paths.js"; import type { SecretInput } from "../config/types.secrets.js"; +import { + isMemoryMultimodalEnabled, + normalizeMemoryMultimodalSettings, + supportsMemoryMultimodalEmbeddings, + type MemoryMultimodalSettings, +} from "../memory/multimodal.js"; import { clampInt, clampNumber, resolveUserPath } from "../utils.js"; import { resolveAgentConfig } from "./agent-scope.js"; @@ -10,6 +16,7 @@ export type ResolvedMemorySearchConfig = { enabled: boolean; sources: Array<"memory" | "sessions">; extraPaths: string[]; + multimodal: MemoryMultimodalSettings; provider: "openai" | "local" | "gemini" | "voyage" | "mistral" | "ollama" | "auto"; remote?: { baseUrl?: string; @@ -204,6 +211,11 @@ function mergeConfig( .map((value) => value.trim()) .filter(Boolean); const extraPaths = Array.from(new Set(rawPaths)); + const multimodal = normalizeMemoryMultimodalSettings({ + enabled: overrides?.multimodal?.enabled ?? defaults?.multimodal?.enabled, + modalities: overrides?.multimodal?.modalities ?? defaults?.multimodal?.modalities, + maxFileBytes: overrides?.multimodal?.maxFileBytes ?? defaults?.multimodal?.maxFileBytes, + }); const vector = { enabled: overrides?.store?.vector?.enabled ?? defaults?.store?.vector?.enabled ?? true, extensionPath: @@ -307,6 +319,7 @@ function mergeConfig( enabled, sources, extraPaths, + multimodal, provider, remote, experimental: { @@ -365,5 +378,22 @@ export function resolveMemorySearchConfig( if (!resolved.enabled) { return null; } + const multimodalActive = isMemoryMultimodalEnabled(resolved.multimodal); + if ( + multimodalActive && + !supportsMemoryMultimodalEmbeddings({ + provider: resolved.provider, + model: resolved.model, + }) + ) { + throw new Error( + 'agents.*.memorySearch.multimodal requires memorySearch.provider = "gemini" and model = "gemini-embedding-2-preview".', + ); + } + if (multimodalActive && resolved.fallback !== "none") { + throw new Error( + 'agents.*.memorySearch.multimodal does not support memorySearch.fallback. Set fallback to "none".', + ); + } return resolved; } diff --git a/src/config/schema.help.quality.test.ts b/src/config/schema.help.quality.test.ts index 730dd397831..965eed0e55d 100644 --- a/src/config/schema.help.quality.test.ts +++ b/src/config/schema.help.quality.test.ts @@ -72,6 +72,10 @@ const TARGET_KEYS = [ "agents.defaults.memorySearch.fallback", "agents.defaults.memorySearch.sources", "agents.defaults.memorySearch.extraPaths", + "agents.defaults.memorySearch.multimodal", + "agents.defaults.memorySearch.multimodal.enabled", + "agents.defaults.memorySearch.multimodal.modalities", + "agents.defaults.memorySearch.multimodal.maxFileBytes", "agents.defaults.memorySearch.experimental.sessionMemory", "agents.defaults.memorySearch.remote.baseUrl", "agents.defaults.memorySearch.remote.apiKey", diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts index bd93f711d91..3db7f40fe73 100644 --- a/src/config/schema.help.ts +++ b/src/config/schema.help.ts @@ -778,7 +778,15 @@ export const FIELD_HELP: Record = { "agents.defaults.memorySearch.sources": 'Chooses which sources are indexed: "memory" reads MEMORY.md + memory files, and "sessions" includes transcript history. Keep ["memory"] unless you need recall from prior chat transcripts.', "agents.defaults.memorySearch.extraPaths": - "Adds extra directories or .md files to the memory index beyond default memory files. Use this when key reference docs live elsewhere in your repo; keep paths small and intentional to avoid noisy recall.", + "Adds extra directories or .md files to the memory index beyond default memory files. Use this when key reference docs live elsewhere in your repo; when multimodal memory is enabled, matching image/audio files under these paths are also eligible for indexing.", + "agents.defaults.memorySearch.multimodal": + 'Optional multimodal memory settings for indexing image and audio files from configured extra paths. Keep this off unless your embedding model explicitly supports cross-modal embeddings, and set `memorySearch.fallback` to "none" while it is enabled. Matching files are uploaded to the configured remote embedding provider during indexing.', + "agents.defaults.memorySearch.multimodal.enabled": + "Enables image/audio memory indexing from extraPaths. This currently requires Gemini embedding-2, keeps the default memory roots Markdown-only, disables memory-search fallback providers, and uploads matching binary content to the configured remote embedding provider.", + "agents.defaults.memorySearch.multimodal.modalities": + 'Selects which multimodal file types are indexed from extraPaths: "image", "audio", or "all". Keep this narrow to avoid indexing large binary corpora unintentionally.', + "agents.defaults.memorySearch.multimodal.maxFileBytes": + "Sets the maximum bytes allowed per multimodal file before it is skipped during memory indexing. Use this to cap upload cost and indexing latency, or raise it for short high-quality audio clips.", "agents.defaults.memorySearch.experimental.sessionMemory": "Indexes session transcripts into memory search so responses can reference prior chat turns. Keep this off unless transcript recall is needed, because indexing cost and storage usage both increase.", "agents.defaults.memorySearch.provider": diff --git a/src/config/schema.labels.ts b/src/config/schema.labels.ts index b7477b4798a..01b8d0f57dd 100644 --- a/src/config/schema.labels.ts +++ b/src/config/schema.labels.ts @@ -319,6 +319,10 @@ export const FIELD_LABELS: Record = { "agents.defaults.memorySearch.enabled": "Enable Memory Search", "agents.defaults.memorySearch.sources": "Memory Search Sources", "agents.defaults.memorySearch.extraPaths": "Extra Memory Paths", + "agents.defaults.memorySearch.multimodal": "Memory Search Multimodal", + "agents.defaults.memorySearch.multimodal.enabled": "Enable Memory Search Multimodal", + "agents.defaults.memorySearch.multimodal.modalities": "Memory Search Multimodal Modalities", + "agents.defaults.memorySearch.multimodal.maxFileBytes": "Memory Search Multimodal Max File Bytes", "agents.defaults.memorySearch.experimental.sessionMemory": "Memory Search Session Index (Experimental)", "agents.defaults.memorySearch.provider": "Memory Search Provider", diff --git a/src/config/types.tools.ts b/src/config/types.tools.ts index 5de1b4cafa5..aaf6cb33e79 100644 --- a/src/config/types.tools.ts +++ b/src/config/types.tools.ts @@ -319,6 +319,15 @@ export type MemorySearchConfig = { sources?: Array<"memory" | "sessions">; /** Extra paths to include in memory search (directories or .md files). */ extraPaths?: string[]; + /** Optional multimodal file indexing for selected extra paths. */ + multimodal?: { + /** Enable image/audio embeddings from extraPaths. */ + enabled?: boolean; + /** Which non-text file types to index. */ + modalities?: Array<"image" | "audio" | "all">; + /** Max bytes allowed per multimodal file before it is skipped. */ + maxFileBytes?: number; + }; /** Experimental memory search settings. */ experimental?: { /** Enable session transcript indexing (experimental, default: false). */ diff --git a/src/config/zod-schema.agent-runtime.ts b/src/config/zod-schema.agent-runtime.ts index a240eba5d43..d5b9eeedb16 100644 --- a/src/config/zod-schema.agent-runtime.ts +++ b/src/config/zod-schema.agent-runtime.ts @@ -553,6 +553,16 @@ export const MemorySearchSchema = z enabled: z.boolean().optional(), sources: z.array(z.union([z.literal("memory"), z.literal("sessions")])).optional(), extraPaths: z.array(z.string()).optional(), + multimodal: z + .object({ + enabled: z.boolean().optional(), + modalities: z + .array(z.union([z.literal("image"), z.literal("audio"), z.literal("all")])) + .optional(), + maxFileBytes: z.number().int().positive().optional(), + }) + .strict() + .optional(), experimental: z .object({ sessionMemory: z.boolean().optional(), diff --git a/src/media/mime.ts b/src/media/mime.ts index e551350c057..4c9b6bf1db3 100644 --- a/src/media/mime.ts +++ b/src/media/mime.ts @@ -12,6 +12,10 @@ const EXT_BY_MIME: Record = { "image/gif": ".gif", "audio/ogg": ".ogg", "audio/mpeg": ".mp3", + "audio/wav": ".wav", + "audio/flac": ".flac", + "audio/aac": ".aac", + "audio/opus": ".opus", "audio/x-m4a": ".m4a", "audio/mp4": ".m4a", "video/mp4": ".mp4", diff --git a/src/memory/embedding-chunk-limits.ts b/src/memory/embedding-chunk-limits.ts index 033b30a84a3..5c8cf9020f3 100644 --- a/src/memory/embedding-chunk-limits.ts +++ b/src/memory/embedding-chunk-limits.ts @@ -1,4 +1,5 @@ import { estimateUtf8Bytes, splitTextToUtf8ByteLimit } from "./embedding-input-limits.js"; +import { hasNonTextEmbeddingParts } from "./embedding-inputs.js"; import { resolveEmbeddingMaxInputTokens } from "./embedding-model-limits.js"; import type { EmbeddingProvider } from "./embeddings.js"; import { hashText, type MemoryChunk } from "./internal.js"; @@ -16,6 +17,10 @@ export function enforceEmbeddingMaxInputTokens( const out: MemoryChunk[] = []; for (const chunk of chunks) { + if (hasNonTextEmbeddingParts(chunk.embeddingInput)) { + out.push(chunk); + continue; + } if (estimateUtf8Bytes(chunk.text) <= maxInputTokens) { out.push(chunk); continue; @@ -27,6 +32,7 @@ export function enforceEmbeddingMaxInputTokens( endLine: chunk.endLine, text, hash: hashText(text), + embeddingInput: { text }, }); } } diff --git a/src/memory/embedding-input-limits.ts b/src/memory/embedding-input-limits.ts index dad83bb7aa7..4eadf1bf48d 100644 --- a/src/memory/embedding-input-limits.ts +++ b/src/memory/embedding-input-limits.ts @@ -1,3 +1,5 @@ +import type { EmbeddingInput } from "./embedding-inputs.js"; + // Helpers for enforcing embedding model input size limits. // // We use UTF-8 byte length as a conservative upper bound for tokenizer output. @@ -11,6 +13,22 @@ export function estimateUtf8Bytes(text: string): number { return Buffer.byteLength(text, "utf8"); } +export function estimateStructuredEmbeddingInputBytes(input: EmbeddingInput): number { + if (!input.parts?.length) { + return estimateUtf8Bytes(input.text); + } + let total = 0; + for (const part of input.parts) { + if (part.type === "text") { + total += estimateUtf8Bytes(part.text); + continue; + } + total += estimateUtf8Bytes(part.mimeType); + total += estimateUtf8Bytes(part.data); + } + return total; +} + export function splitTextToUtf8ByteLimit(text: string, maxUtf8Bytes: number): string[] { if (maxUtf8Bytes <= 0) { return [text]; diff --git a/src/memory/embedding-inputs.ts b/src/memory/embedding-inputs.ts new file mode 100644 index 00000000000..767a463f740 --- /dev/null +++ b/src/memory/embedding-inputs.ts @@ -0,0 +1,34 @@ +export type EmbeddingInputTextPart = { + type: "text"; + text: string; +}; + +export type EmbeddingInputInlineDataPart = { + type: "inline-data"; + mimeType: string; + data: string; +}; + +export type EmbeddingInputPart = EmbeddingInputTextPart | EmbeddingInputInlineDataPart; + +export type EmbeddingInput = { + text: string; + parts?: EmbeddingInputPart[]; +}; + +export function buildTextEmbeddingInput(text: string): EmbeddingInput { + return { text }; +} + +export function isInlineDataEmbeddingInputPart( + part: EmbeddingInputPart, +): part is EmbeddingInputInlineDataPart { + return part.type === "inline-data"; +} + +export function hasNonTextEmbeddingParts(input: EmbeddingInput | undefined): boolean { + if (!input?.parts?.length) { + return false; + } + return input.parts.some((part) => isInlineDataEmbeddingInputPart(part)); +} diff --git a/src/memory/embeddings-gemini.test.ts b/src/memory/embeddings-gemini.test.ts index ae65c8d72b8..f97cc6cb142 100644 --- a/src/memory/embeddings-gemini.test.ts +++ b/src/memory/embeddings-gemini.test.ts @@ -1,16 +1,13 @@ import { afterEach, describe, expect, it, vi } from "vitest"; import * as authModule from "../agents/model-auth.js"; import { - buildFileDataPart, - buildGeminiParts, + buildGeminiEmbeddingRequest, buildGeminiTextEmbeddingRequest, - buildInlineDataPart, createGeminiEmbeddingProvider, DEFAULT_GEMINI_EMBEDDING_MODEL, GEMINI_EMBEDDING_2_MODELS, isGeminiEmbedding2Model, resolveGeminiOutputDimensionality, - type GeminiPart, } from "./embeddings-gemini.js"; vi.mock("../agents/model-auth.js", async () => { @@ -61,40 +58,6 @@ function mockResolvedProviderKey(apiKey = "test-key") { }); } -// ---------- Helper function tests ---------- - -describe("buildGeminiParts", () => { - it("wraps a string into a single text part", () => { - expect(buildGeminiParts("hello")).toEqual([{ text: "hello" }]); - }); - - it("passes through an existing parts array", () => { - const parts: GeminiPart[] = [ - { text: "hello" }, - { inlineData: { mimeType: "image/png", data: "base64data" } }, - ]; - expect(buildGeminiParts(parts)).toBe(parts); - }); -}); - -describe("buildInlineDataPart", () => { - it("produces the correct shape", () => { - const part = buildInlineDataPart("image/jpeg", "abc123"); - expect(part).toEqual({ - inlineData: { mimeType: "image/jpeg", data: "abc123" }, - }); - }); -}); - -describe("buildFileDataPart", () => { - it("produces the correct shape", () => { - const part = buildFileDataPart("application/pdf", "gs://bucket/file.pdf"); - expect(part).toEqual({ - fileData: { mimeType: "application/pdf", fileUri: "gs://bucket/file.pdf" }, - }); - }); -}); - describe("buildGeminiTextEmbeddingRequest", () => { it("builds a text embedding request with optional model and dimensions", () => { expect( @@ -113,6 +76,35 @@ describe("buildGeminiTextEmbeddingRequest", () => { }); }); +describe("buildGeminiEmbeddingRequest", () => { + it("builds a multimodal request from structured input parts", () => { + expect( + buildGeminiEmbeddingRequest({ + input: { + text: "Image file: diagram.png", + parts: [ + { type: "text", text: "Image file: diagram.png" }, + { type: "inline-data", mimeType: "image/png", data: "abc123" }, + ], + }, + taskType: "RETRIEVAL_DOCUMENT", + modelPath: "models/gemini-embedding-2-preview", + outputDimensionality: 1536, + }), + ).toEqual({ + model: "models/gemini-embedding-2-preview", + content: { + parts: [ + { text: "Image file: diagram.png" }, + { inlineData: { mimeType: "image/png", data: "abc123" } }, + ], + }, + taskType: "RETRIEVAL_DOCUMENT", + outputDimensionality: 1536, + }); + }); +}); + // ---------- Model detection ---------- describe("isGeminiEmbedding2Model", () => { @@ -319,6 +311,21 @@ describe("gemini-embedding-2-preview provider", () => { expect(body.outputDimensionality).toBe(768); }); + it("sanitizes and normalizes embedQuery responses", async () => { + const fetchMock = createGeminiFetchMock([3, 4, Number.NaN]); + vi.stubGlobal("fetch", fetchMock); + mockResolvedProviderKey(); + + const { provider } = await createGeminiEmbeddingProvider({ + config: {} as never, + provider: "gemini", + model: "gemini-embedding-2-preview", + fallback: "none", + }); + + await expect(provider.embedQuery("test")).resolves.toEqual([0.6, 0.8, 0]); + }); + it("uses custom outputDimensionality for each embedBatch request", async () => { const fetchMock = createGeminiBatchFetchMock(2); vi.stubGlobal("fetch", fetchMock); @@ -341,6 +348,88 @@ describe("gemini-embedding-2-preview provider", () => { ]); }); + it("sanitizes and normalizes structured batch responses", async () => { + const fetchMock = createGeminiBatchFetchMock(1, [0, Number.POSITIVE_INFINITY, 5]); + vi.stubGlobal("fetch", fetchMock); + mockResolvedProviderKey(); + + const { provider } = await createGeminiEmbeddingProvider({ + config: {} as never, + provider: "gemini", + model: "gemini-embedding-2-preview", + fallback: "none", + }); + + await expect( + provider.embedBatchInputs?.([ + { + text: "Image file: diagram.png", + parts: [ + { type: "text", text: "Image file: diagram.png" }, + { type: "inline-data", mimeType: "image/png", data: "img" }, + ], + }, + ]), + ).resolves.toEqual([[0, 0, 1]]); + }); + + it("supports multimodal embedBatchInputs requests", async () => { + const fetchMock = createGeminiBatchFetchMock(2); + vi.stubGlobal("fetch", fetchMock); + mockResolvedProviderKey(); + + const { provider } = await createGeminiEmbeddingProvider({ + config: {} as never, + provider: "gemini", + model: "gemini-embedding-2-preview", + fallback: "none", + }); + + expect(provider.embedBatchInputs).toBeDefined(); + await provider.embedBatchInputs?.([ + { + text: "Image file: diagram.png", + parts: [ + { type: "text", text: "Image file: diagram.png" }, + { type: "inline-data", mimeType: "image/png", data: "img" }, + ], + }, + { + text: "Audio file: note.wav", + parts: [ + { type: "text", text: "Audio file: note.wav" }, + { type: "inline-data", mimeType: "audio/wav", data: "aud" }, + ], + }, + ]); + + const body = parseFetchBody(fetchMock); + expect(body.requests).toEqual([ + { + model: "models/gemini-embedding-2-preview", + content: { + parts: [ + { text: "Image file: diagram.png" }, + { inlineData: { mimeType: "image/png", data: "img" } }, + ], + }, + taskType: "RETRIEVAL_DOCUMENT", + outputDimensionality: 3072, + }, + { + model: "models/gemini-embedding-2-preview", + content: { + parts: [ + { text: "Audio file: note.wav" }, + { inlineData: { mimeType: "audio/wav", data: "aud" } }, + ], + }, + taskType: "RETRIEVAL_DOCUMENT", + outputDimensionality: 3072, + }, + ]); + }); + it("throws for invalid outputDimensionality", async () => { mockResolvedProviderKey(); diff --git a/src/memory/embeddings-gemini.ts b/src/memory/embeddings-gemini.ts index 71c8b67fb1a..ab028241ed8 100644 --- a/src/memory/embeddings-gemini.ts +++ b/src/memory/embeddings-gemini.ts @@ -5,6 +5,7 @@ import { import { requireApiKey, resolveApiKeyForProvider } from "../agents/model-auth.js"; import { parseGeminiAuth } from "../infra/gemini-auth.js"; import type { SsrFPolicy } from "../infra/net/ssrf.js"; +import type { EmbeddingInput } from "./embedding-inputs.js"; import { sanitizeAndNormalizeEmbedding } from "./embedding-vectors.js"; import { debugEmbeddingsLog } from "./embeddings-debug.js"; import type { EmbeddingProvider, EmbeddingProviderOptions } from "./embeddings.js"; @@ -50,34 +51,14 @@ export type GeminiTextPart = { text: string }; export type GeminiInlinePart = { inlineData: { mimeType: string; data: string }; }; -export type GeminiFilePart = { - fileData: { mimeType: string; fileUri: string }; -}; -export type GeminiPart = GeminiTextPart | GeminiInlinePart | GeminiFilePart; -export type GeminiTextEmbeddingRequest = { - content: { parts: GeminiTextPart[] }; +export type GeminiPart = GeminiTextPart | GeminiInlinePart; +export type GeminiEmbeddingRequest = { + content: { parts: GeminiPart[] }; taskType: GeminiTaskType; outputDimensionality?: number; model?: string; }; - -/** Convert a string or pre-built parts array into `GeminiPart[]`. */ -export function buildGeminiParts(input: string | GeminiPart[]): GeminiPart[] { - if (typeof input === "string") { - return [{ text: input }]; - } - return input; -} - -/** Convenience: build an inline-data part for multimodal embeddings. */ -export function buildInlineDataPart(mimeType: string, base64Data: string): GeminiInlinePart { - return { inlineData: { mimeType, data: base64Data } }; -} - -/** Convenience: build a file-data part for multimodal embeddings. */ -export function buildFileDataPart(mimeType: string, fileUri: string): GeminiFilePart { - return { fileData: { mimeType, fileUri } }; -} +export type GeminiTextEmbeddingRequest = GeminiEmbeddingRequest; /** Builds the text-only Gemini embedding request shape used across direct and batch APIs. */ export function buildGeminiTextEmbeddingRequest(params: { @@ -86,8 +67,30 @@ export function buildGeminiTextEmbeddingRequest(params: { outputDimensionality?: number; modelPath?: string; }): GeminiTextEmbeddingRequest { - const request: GeminiTextEmbeddingRequest = { - content: { parts: [{ text: params.text }] }, + return buildGeminiEmbeddingRequest({ + input: { text: params.text }, + taskType: params.taskType, + outputDimensionality: params.outputDimensionality, + modelPath: params.modelPath, + }); +} + +export function buildGeminiEmbeddingRequest(params: { + input: EmbeddingInput; + taskType: GeminiTaskType; + outputDimensionality?: number; + modelPath?: string; +}): GeminiEmbeddingRequest { + const request: GeminiEmbeddingRequest = { + content: { + parts: params.input.parts?.map((part) => + part.type === "text" + ? ({ text: part.text } satisfies GeminiTextPart) + : ({ + inlineData: { mimeType: part.mimeType, data: part.data }, + } satisfies GeminiInlinePart), + ) ?? [{ text: params.input.text }], + }, taskType: params.taskType, }; if (params.modelPath) { @@ -143,7 +146,7 @@ function resolveRemoteApiKey(remoteApiKey: unknown): string | undefined { return trimmed; } -function normalizeGeminiModel(model: string): string { +export function normalizeGeminiModel(model: string): string { const trimmed = model.trim(); if (!trimmed) { return DEFAULT_GEMINI_EMBEDDING_MODEL; @@ -158,6 +161,46 @@ function normalizeGeminiModel(model: string): string { return withoutPrefix; } +async function fetchGeminiEmbeddingPayload(params: { + client: GeminiEmbeddingClient; + endpoint: string; + body: unknown; +}): Promise<{ + embedding?: { values?: number[] }; + embeddings?: Array<{ values?: number[] }>; +}> { + return await executeWithApiKeyRotation({ + provider: "google", + apiKeys: params.client.apiKeys, + execute: async (apiKey) => { + const authHeaders = parseGeminiAuth(apiKey); + const headers = { + ...authHeaders.headers, + ...params.client.headers, + }; + return await withRemoteHttpResponse({ + url: params.endpoint, + ssrfPolicy: params.client.ssrfPolicy, + init: { + method: "POST", + headers, + body: JSON.stringify(params.body), + }, + onResponse: async (res) => { + if (!res.ok) { + const text = await res.text(); + throw new Error(`gemini embeddings failed: ${res.status} ${text}`); + } + return (await res.json()) as { + embedding?: { values?: number[] }; + embeddings?: Array<{ values?: number[] }>; + }; + }, + }); + }, + }); +} + function normalizeGeminiBaseUrl(raw: string): string { const trimmed = raw.replace(/\/+$/, ""); const openAiIndex = trimmed.indexOf("/openai"); @@ -181,71 +224,50 @@ export async function createGeminiEmbeddingProvider( const isV2 = isGeminiEmbedding2Model(client.model); const outputDimensionality = client.outputDimensionality; - const fetchWithGeminiAuth = async (apiKey: string, endpoint: string, body: unknown) => { - const authHeaders = parseGeminiAuth(apiKey); - const headers = { - ...authHeaders.headers, - ...client.headers, - }; - const payload = await withRemoteHttpResponse({ - url: endpoint, - ssrfPolicy: client.ssrfPolicy, - init: { - method: "POST", - headers, - body: JSON.stringify(body), - }, - onResponse: async (res) => { - if (!res.ok) { - const text = await res.text(); - throw new Error(`gemini embeddings failed: ${res.status} ${text}`); - } - return (await res.json()) as { - embedding?: { values?: number[] }; - embeddings?: Array<{ values?: number[] }>; - }; - }, - }); - return payload; - }; - const embedQuery = async (text: string): Promise => { if (!text.trim()) { return []; } - const body = buildGeminiTextEmbeddingRequest({ - text, - taskType: options.taskType ?? "RETRIEVAL_QUERY", - outputDimensionality: isV2 ? outputDimensionality : undefined, - }); - const payload = await executeWithApiKeyRotation({ - provider: "google", - apiKeys: client.apiKeys, - execute: (apiKey) => fetchWithGeminiAuth(apiKey, embedUrl, body), + const payload = await fetchGeminiEmbeddingPayload({ + client, + endpoint: embedUrl, + body: buildGeminiTextEmbeddingRequest({ + text, + taskType: options.taskType ?? "RETRIEVAL_QUERY", + outputDimensionality: isV2 ? outputDimensionality : undefined, + }), }); return sanitizeAndNormalizeEmbedding(payload.embedding?.values ?? []); }; - const embedBatch = async (texts: string[]): Promise => { - if (texts.length === 0) { + const embedBatchInputs = async (inputs: EmbeddingInput[]): Promise => { + if (inputs.length === 0) { return []; } - const requests = texts.map((text) => - buildGeminiTextEmbeddingRequest({ - text, - modelPath: client.modelPath, - taskType: options.taskType ?? "RETRIEVAL_DOCUMENT", - outputDimensionality: isV2 ? outputDimensionality : undefined, - }), - ); - const batchBody = { requests }; - const payload = await executeWithApiKeyRotation({ - provider: "google", - apiKeys: client.apiKeys, - execute: (apiKey) => fetchWithGeminiAuth(apiKey, batchUrl, batchBody), + const payload = await fetchGeminiEmbeddingPayload({ + client, + endpoint: batchUrl, + body: { + requests: inputs.map((input) => + buildGeminiEmbeddingRequest({ + input, + modelPath: client.modelPath, + taskType: options.taskType ?? "RETRIEVAL_DOCUMENT", + outputDimensionality: isV2 ? outputDimensionality : undefined, + }), + ), + }, }); const embeddings = Array.isArray(payload.embeddings) ? payload.embeddings : []; - return texts.map((_, index) => sanitizeAndNormalizeEmbedding(embeddings[index]?.values ?? [])); + return inputs.map((_, index) => sanitizeAndNormalizeEmbedding(embeddings[index]?.values ?? [])); + }; + + const embedBatch = async (texts: string[]): Promise => { + return await embedBatchInputs( + texts.map((text) => ({ + text, + })), + ); }; return { @@ -255,6 +277,7 @@ export async function createGeminiEmbeddingProvider( maxInputTokens: GEMINI_MAX_INPUT_TOKENS[client.model], embedQuery, embedBatch, + embedBatchInputs, }, client, }; diff --git a/src/memory/embeddings.ts b/src/memory/embeddings.ts index a5da5222542..f9cc76eb19d 100644 --- a/src/memory/embeddings.ts +++ b/src/memory/embeddings.ts @@ -4,6 +4,7 @@ import type { OpenClawConfig } from "../config/config.js"; import type { SecretInput } from "../config/types.secrets.js"; import { formatErrorMessage } from "../infra/errors.js"; import { resolveUserPath } from "../utils.js"; +import type { EmbeddingInput } from "./embedding-inputs.js"; import { sanitizeAndNormalizeEmbedding } from "./embedding-vectors.js"; import { createGeminiEmbeddingProvider, @@ -31,6 +32,7 @@ export type EmbeddingProvider = { maxInputTokens?: number; embedQuery: (text: string) => Promise; embedBatch: (texts: string[]) => Promise; + embedBatchInputs?: (inputs: EmbeddingInput[]) => Promise; }; export type EmbeddingProviderId = "openai" | "local" | "gemini" | "voyage" | "mistral" | "ollama"; diff --git a/src/memory/index.test.ts b/src/memory/index.test.ts index 8010c419494..23371056b18 100644 --- a/src/memory/index.test.ts +++ b/src/memory/index.test.ts @@ -1,3 +1,4 @@ +import { randomUUID } from "node:crypto"; import fs from "node:fs/promises"; import os from "node:os"; import path from "node:path"; @@ -6,6 +7,7 @@ import { getMemorySearchManager, type MemoryIndexManager } from "./index.js"; import "./test-runtime-mocks.js"; let embedBatchCalls = 0; +let embedBatchInputCalls = 0; let providerCalls: Array<{ provider?: string; model?: string; outputDimensionality?: number }> = []; vi.mock("./embeddings.js", () => { @@ -13,7 +15,9 @@ vi.mock("./embeddings.js", () => { const lower = text.toLowerCase(); const alpha = lower.split("alpha").length - 1; const beta = lower.split("beta").length - 1; - return [alpha, beta]; + const image = lower.split("image").length - 1; + const audio = lower.split("audio").length - 1; + return [alpha, beta, image, audio]; }; return { createEmbeddingProvider: async (options: { @@ -38,6 +42,36 @@ vi.mock("./embeddings.js", () => { embedBatchCalls += 1; return texts.map(embedText); }, + ...(providerId === "gemini" + ? { + embedBatchInputs: async ( + inputs: Array<{ + text: string; + parts?: Array< + | { type: "text"; text: string } + | { type: "inline-data"; mimeType: string; data: string } + >; + }>, + ) => { + embedBatchInputCalls += 1; + return inputs.map((input) => { + const inlineData = input.parts?.find((part) => part.type === "inline-data"); + if (inlineData?.type === "inline-data" && inlineData.data.length > 9000) { + throw new Error("payload too large"); + } + const mimeType = + inlineData?.type === "inline-data" ? inlineData.mimeType : undefined; + if (mimeType?.startsWith("image/")) { + return [0, 0, 1, 0]; + } + if (mimeType?.startsWith("audio/")) { + return [0, 0, 0, 1]; + } + return embedText(input.text); + }); + }, + } + : {}), }, ...(providerId === "gemini" ? { @@ -64,6 +98,7 @@ describe("memory index", () => { let indexVectorPath = ""; let indexMainPath = ""; let indexExtraPath = ""; + let indexMultimodalPath = ""; let indexStatusPath = ""; let indexSourceChangePath = ""; let indexModelPath = ""; @@ -97,6 +132,7 @@ describe("memory index", () => { indexMainPath = path.join(workspaceDir, "index-main.sqlite"); indexVectorPath = path.join(workspaceDir, "index-vector.sqlite"); indexExtraPath = path.join(workspaceDir, "index-extra.sqlite"); + indexMultimodalPath = path.join(workspaceDir, "index-multimodal.sqlite"); indexStatusPath = path.join(workspaceDir, "index-status.sqlite"); indexSourceChangePath = path.join(workspaceDir, "index-source-change.sqlite"); indexModelPath = path.join(workspaceDir, "index-model-change.sqlite"); @@ -119,6 +155,7 @@ describe("memory index", () => { // Keep atomic reindex tests on the safe path. vi.stubEnv("OPENCLAW_TEST_MEMORY_UNSAFE_REINDEX", "1"); embedBatchCalls = 0; + embedBatchInputCalls = 0; providerCalls = []; // Keep the workspace stable to allow manager reuse across tests. @@ -149,6 +186,11 @@ describe("memory index", () => { provider?: "openai" | "gemini"; model?: string; outputDimensionality?: number; + multimodal?: { + enabled?: boolean; + modalities?: Array<"image" | "audio" | "all">; + maxFileBytes?: number; + }; vectorEnabled?: boolean; cacheEnabled?: boolean; minScore?: number; @@ -172,6 +214,7 @@ describe("memory index", () => { }, cache: params.cacheEnabled ? { enabled: true } : undefined, extraPaths: params.extraPaths, + multimodal: params.multimodal, sources: params.sources, experimental: { sessionMemory: params.sessionMemory ?? false }, }, @@ -247,6 +290,103 @@ describe("memory index", () => { ); }); + it("indexes multimodal image and audio files from extra paths with Gemini structured inputs", async () => { + const mediaDir = path.join(workspaceDir, "media-memory"); + await fs.mkdir(mediaDir, { recursive: true }); + await fs.writeFile(path.join(mediaDir, "diagram.png"), Buffer.from("png")); + await fs.writeFile(path.join(mediaDir, "meeting.wav"), Buffer.from("wav")); + + const cfg = createCfg({ + storePath: indexMultimodalPath, + provider: "gemini", + model: "gemini-embedding-2-preview", + extraPaths: [mediaDir], + multimodal: { enabled: true, modalities: ["image", "audio"] }, + }); + const manager = await getPersistentManager(cfg); + await manager.sync({ reason: "test" }); + + expect(embedBatchInputCalls).toBeGreaterThan(0); + + const imageResults = await manager.search("image"); + expect(imageResults.some((result) => result.path.endsWith("diagram.png"))).toBe(true); + + const audioResults = await manager.search("audio"); + expect(audioResults.some((result) => result.path.endsWith("meeting.wav"))).toBe(true); + }); + + it("skips oversized multimodal inputs without aborting sync", async () => { + const mediaDir = path.join(workspaceDir, "media-oversize"); + await fs.mkdir(mediaDir, { recursive: true }); + await fs.writeFile(path.join(mediaDir, "huge.png"), Buffer.alloc(7000, 1)); + + const cfg = createCfg({ + storePath: path.join(workspaceDir, `index-oversize-${randomUUID()}.sqlite`), + provider: "gemini", + model: "gemini-embedding-2-preview", + extraPaths: [mediaDir], + multimodal: { enabled: true, modalities: ["image"] }, + }); + const manager = requireManager(await getMemorySearchManager({ cfg, agentId: "main" })); + await manager.sync({ reason: "test" }); + + expect(embedBatchInputCalls).toBeGreaterThan(0); + const imageResults = await manager.search("image"); + expect(imageResults.some((result) => result.path.endsWith("huge.png"))).toBe(false); + + const alphaResults = await manager.search("alpha"); + expect(alphaResults.some((result) => result.path.endsWith("memory/2026-01-12.md"))).toBe(true); + + await manager.close?.(); + }); + + it("reindexes a multimodal file after a transient mid-sync disappearance", async () => { + const mediaDir = path.join(workspaceDir, "media-race"); + const imagePath = path.join(mediaDir, "diagram.png"); + await fs.mkdir(mediaDir, { recursive: true }); + await fs.writeFile(imagePath, Buffer.from("png")); + + const cfg = createCfg({ + storePath: path.join(workspaceDir, `index-race-${randomUUID()}.sqlite`), + provider: "gemini", + model: "gemini-embedding-2-preview", + extraPaths: [mediaDir], + multimodal: { enabled: true, modalities: ["image"] }, + }); + const manager = requireManager(await getMemorySearchManager({ cfg, agentId: "main" })); + const realReadFile = fs.readFile.bind(fs); + let imageReads = 0; + const readSpy = vi.spyOn(fs, "readFile").mockImplementation(async (...args) => { + const [targetPath] = args; + if (typeof targetPath === "string" && targetPath === imagePath) { + imageReads += 1; + if (imageReads === 2) { + const err = Object.assign( + new Error(`ENOENT: no such file or directory, open '${imagePath}'`), + { + code: "ENOENT", + }, + ) as NodeJS.ErrnoException; + throw err; + } + } + return await realReadFile(...args); + }); + + await manager.sync({ reason: "test" }); + readSpy.mockRestore(); + + const callsAfterFirstSync = embedBatchInputCalls; + (manager as unknown as { dirty: boolean }).dirty = true; + await manager.sync({ reason: "test" }); + + expect(embedBatchInputCalls).toBeGreaterThan(callsAfterFirstSync); + const results = await manager.search("image"); + expect(results.some((result) => result.path.endsWith("diagram.png"))).toBe(true); + + await manager.close?.(); + }); + it("keeps dirty false in status-only manager after prior indexing", async () => { const cfg = createCfg({ storePath: indexStatusPath }); @@ -433,6 +573,82 @@ describe("memory index", () => { await secondManager.close?.(); }); + it("reindexes when extraPaths change", async () => { + const storePath = path.join(workspaceDir, `index-scope-extra-${randomUUID()}.sqlite`); + const firstExtraDir = path.join(workspaceDir, "scope-extra-a"); + const secondExtraDir = path.join(workspaceDir, "scope-extra-b"); + await fs.rm(firstExtraDir, { recursive: true, force: true }); + await fs.rm(secondExtraDir, { recursive: true, force: true }); + await fs.mkdir(firstExtraDir, { recursive: true }); + await fs.mkdir(secondExtraDir, { recursive: true }); + await fs.writeFile(path.join(firstExtraDir, "a.md"), "alpha only"); + await fs.writeFile(path.join(secondExtraDir, "b.md"), "beta only"); + + const first = await getMemorySearchManager({ + cfg: createCfg({ + storePath, + extraPaths: [firstExtraDir], + }), + agentId: "main", + }); + const firstManager = requireManager(first); + await firstManager.sync?.({ reason: "test" }); + await firstManager.close?.(); + + const second = await getMemorySearchManager({ + cfg: createCfg({ + storePath, + extraPaths: [secondExtraDir], + }), + agentId: "main", + }); + const secondManager = requireManager(second); + await secondManager.sync?.({ reason: "test" }); + const results = await secondManager.search("beta"); + expect(results.some((result) => result.path.endsWith("scope-extra-b/b.md"))).toBe(true); + expect(results.some((result) => result.path.endsWith("scope-extra-a/a.md"))).toBe(false); + await secondManager.close?.(); + }); + + it("reindexes when multimodal settings change", async () => { + const storePath = path.join(workspaceDir, `index-scope-multimodal-${randomUUID()}.sqlite`); + const mediaDir = path.join(workspaceDir, "scope-media"); + await fs.rm(mediaDir, { recursive: true, force: true }); + await fs.mkdir(mediaDir, { recursive: true }); + await fs.writeFile(path.join(mediaDir, "diagram.png"), Buffer.from("png")); + + const first = await getMemorySearchManager({ + cfg: createCfg({ + storePath, + provider: "gemini", + model: "gemini-embedding-2-preview", + extraPaths: [mediaDir], + }), + agentId: "main", + }); + const firstManager = requireManager(first); + await firstManager.sync?.({ reason: "test" }); + const multimodalCallsAfterFirstSync = embedBatchInputCalls; + await firstManager.close?.(); + + const second = await getMemorySearchManager({ + cfg: createCfg({ + storePath, + provider: "gemini", + model: "gemini-embedding-2-preview", + extraPaths: [mediaDir], + multimodal: { enabled: true, modalities: ["image"] }, + }), + agentId: "main", + }); + const secondManager = requireManager(second); + await secondManager.sync?.({ reason: "test" }); + expect(embedBatchInputCalls).toBeGreaterThan(multimodalCallsAfterFirstSync); + const results = await secondManager.search("image"); + expect(results.some((result) => result.path.endsWith("scope-media/diagram.png"))).toBe(true); + await secondManager.close?.(); + }); + it("reuses cached embeddings on forced reindex", async () => { const cfg = createCfg({ storePath: indexMainPath, cacheEnabled: true }); const manager = await getPersistentManager(cfg); diff --git a/src/memory/internal.test.ts b/src/memory/internal.test.ts index 0f17843a88d..ec0f75f143b 100644 --- a/src/memory/internal.test.ts +++ b/src/memory/internal.test.ts @@ -3,12 +3,17 @@ import os from "node:os"; import path from "node:path"; import { afterEach, beforeEach, describe, expect, it } from "vitest"; import { + buildMultimodalChunkForIndexing, buildFileEntry, chunkMarkdown, listMemoryFiles, normalizeExtraMemoryPaths, remapChunkLines, } from "./internal.js"; +import { + DEFAULT_MEMORY_MULTIMODAL_MAX_FILE_BYTES, + type MemoryMultimodalSettings, +} from "./multimodal.js"; function setupTempDirLifecycle(prefix: string): () => string { let tmpDir = ""; @@ -38,6 +43,11 @@ describe("normalizeExtraMemoryPaths", () => { describe("listMemoryFiles", () => { const getTmpDir = setupTempDirLifecycle("memory-test-"); + const multimodal: MemoryMultimodalSettings = { + enabled: true, + modalities: ["image", "audio"], + maxFileBytes: DEFAULT_MEMORY_MULTIMODAL_MAX_FILE_BYTES, + }; it("includes files from additional paths (directory)", async () => { const tmpDir = getTmpDir(); @@ -131,10 +141,29 @@ describe("listMemoryFiles", () => { const memoryMatches = files.filter((file) => file.endsWith("MEMORY.md")); expect(memoryMatches).toHaveLength(1); }); + + it("includes image and audio files from extra paths when multimodal is enabled", async () => { + const tmpDir = getTmpDir(); + const extraDir = path.join(tmpDir, "media"); + await fs.mkdir(extraDir, { recursive: true }); + await fs.writeFile(path.join(extraDir, "diagram.png"), Buffer.from("png")); + await fs.writeFile(path.join(extraDir, "note.wav"), Buffer.from("wav")); + await fs.writeFile(path.join(extraDir, "ignore.bin"), Buffer.from("bin")); + + const files = await listMemoryFiles(tmpDir, [extraDir], multimodal); + expect(files.some((file) => file.endsWith("diagram.png"))).toBe(true); + expect(files.some((file) => file.endsWith("note.wav"))).toBe(true); + expect(files.some((file) => file.endsWith("ignore.bin"))).toBe(false); + }); }); describe("buildFileEntry", () => { const getTmpDir = setupTempDirLifecycle("memory-build-entry-"); + const multimodal: MemoryMultimodalSettings = { + enabled: true, + modalities: ["image", "audio"], + maxFileBytes: DEFAULT_MEMORY_MULTIMODAL_MAX_FILE_BYTES, + }; it("returns null when the file disappears before reading", async () => { const tmpDir = getTmpDir(); @@ -154,6 +183,37 @@ describe("buildFileEntry", () => { expect(entry?.path).toBe("note.md"); expect(entry?.size).toBeGreaterThan(0); }); + + it("returns multimodal metadata for eligible image files", async () => { + const tmpDir = getTmpDir(); + const target = path.join(tmpDir, "diagram.png"); + await fs.writeFile(target, Buffer.from("png")); + + const entry = await buildFileEntry(target, tmpDir, multimodal); + + expect(entry).toMatchObject({ + path: "diagram.png", + kind: "multimodal", + modality: "image", + mimeType: "image/png", + contentText: "Image file: diagram.png", + }); + }); + + it("builds a multimodal chunk lazily for indexing", async () => { + const tmpDir = getTmpDir(); + const target = path.join(tmpDir, "diagram.png"); + await fs.writeFile(target, Buffer.from("png")); + + const entry = await buildFileEntry(target, tmpDir, multimodal); + const built = await buildMultimodalChunkForIndexing(entry!); + + expect(built?.chunk.embeddingInput?.parts).toEqual([ + { type: "text", text: "Image file: diagram.png" }, + expect.objectContaining({ type: "inline-data", mimeType: "image/png" }), + ]); + expect(built?.structuredInputBytes).toBeGreaterThan(0); + }); }); describe("chunkMarkdown", () => { diff --git a/src/memory/internal.ts b/src/memory/internal.ts index d39e355d2c0..96ce0e918ad 100644 --- a/src/memory/internal.ts +++ b/src/memory/internal.ts @@ -2,8 +2,17 @@ import crypto from "node:crypto"; import fsSync from "node:fs"; import fs from "node:fs/promises"; import path from "node:path"; +import { detectMime } from "../media/mime.js"; import { runTasksWithConcurrency } from "../utils/run-with-concurrency.js"; +import { estimateStructuredEmbeddingInputBytes } from "./embedding-input-limits.js"; +import { buildTextEmbeddingInput, type EmbeddingInput } from "./embedding-inputs.js"; import { isFileMissingError } from "./fs-utils.js"; +import { + buildMemoryMultimodalLabel, + classifyMemoryMultimodalPath, + type MemoryMultimodalModality, + type MemoryMultimodalSettings, +} from "./multimodal.js"; export type MemoryFileEntry = { path: string; @@ -11,6 +20,10 @@ export type MemoryFileEntry = { mtimeMs: number; size: number; hash: string; + kind?: "markdown" | "multimodal"; + contentText?: string; + modality?: MemoryMultimodalModality; + mimeType?: string; }; export type MemoryChunk = { @@ -18,6 +31,18 @@ export type MemoryChunk = { endLine: number; text: string; hash: string; + embeddingInput?: EmbeddingInput; +}; + +export type MultimodalMemoryChunk = { + chunk: MemoryChunk; + structuredInputBytes: number; +}; + +const DISABLED_MULTIMODAL_SETTINGS: MemoryMultimodalSettings = { + enabled: false, + modalities: [], + maxFileBytes: 0, }; export function ensureDir(dir: string): string { @@ -56,7 +81,16 @@ export function isMemoryPath(relPath: string): boolean { return normalized.startsWith("memory/"); } -async function walkDir(dir: string, files: string[]) { +function isAllowedMemoryFilePath(filePath: string, multimodal?: MemoryMultimodalSettings): boolean { + if (filePath.endsWith(".md")) { + return true; + } + return ( + classifyMemoryMultimodalPath(filePath, multimodal ?? DISABLED_MULTIMODAL_SETTINGS) !== null + ); +} + +async function walkDir(dir: string, files: string[], multimodal?: MemoryMultimodalSettings) { const entries = await fs.readdir(dir, { withFileTypes: true }); for (const entry of entries) { const full = path.join(dir, entry.name); @@ -64,13 +98,13 @@ async function walkDir(dir: string, files: string[]) { continue; } if (entry.isDirectory()) { - await walkDir(full, files); + await walkDir(full, files, multimodal); continue; } if (!entry.isFile()) { continue; } - if (!entry.name.endsWith(".md")) { + if (!isAllowedMemoryFilePath(full, multimodal)) { continue; } files.push(full); @@ -80,6 +114,7 @@ async function walkDir(dir: string, files: string[]) { export async function listMemoryFiles( workspaceDir: string, extraPaths?: string[], + multimodal?: MemoryMultimodalSettings, ): Promise { const result: string[] = []; const memoryFile = path.join(workspaceDir, "MEMORY.md"); @@ -117,10 +152,10 @@ export async function listMemoryFiles( continue; } if (stat.isDirectory()) { - await walkDir(inputPath, result); + await walkDir(inputPath, result, multimodal); continue; } - if (stat.isFile() && inputPath.endsWith(".md")) { + if (stat.isFile() && isAllowedMemoryFilePath(inputPath, multimodal)) { result.push(inputPath); } } catch {} @@ -152,6 +187,7 @@ export function hashText(value: string): string { export async function buildFileEntry( absPath: string, workspaceDir: string, + multimodal?: MemoryMultimodalSettings, ): Promise { let stat; try { @@ -162,6 +198,48 @@ export async function buildFileEntry( } throw err; } + const normalizedPath = path.relative(workspaceDir, absPath).replace(/\\/g, "/"); + const multimodalSettings = multimodal ?? DISABLED_MULTIMODAL_SETTINGS; + const modality = classifyMemoryMultimodalPath(absPath, multimodalSettings); + if (modality) { + if (stat.size > multimodalSettings.maxFileBytes) { + return null; + } + let buffer: Buffer; + try { + buffer = await fs.readFile(absPath); + } catch (err) { + if (isFileMissingError(err)) { + return null; + } + throw err; + } + const mimeType = await detectMime({ buffer: buffer.subarray(0, 512), filePath: absPath }); + if (!mimeType || !mimeType.startsWith(`${modality}/`)) { + return null; + } + const contentText = buildMemoryMultimodalLabel(modality, normalizedPath); + const dataHash = crypto.createHash("sha256").update(buffer).digest("hex"); + const chunkHash = hashText( + JSON.stringify({ + path: normalizedPath, + contentText, + mimeType, + dataHash, + }), + ); + return { + path: normalizedPath, + absPath, + mtimeMs: stat.mtimeMs, + size: stat.size, + hash: chunkHash, + kind: "multimodal", + contentText, + modality, + mimeType, + }; + } let content: string; try { content = await fs.readFile(absPath, "utf-8"); @@ -173,11 +251,59 @@ export async function buildFileEntry( } const hash = hashText(content); return { - path: path.relative(workspaceDir, absPath).replace(/\\/g, "/"), + path: normalizedPath, absPath, mtimeMs: stat.mtimeMs, size: stat.size, hash, + kind: "markdown", + }; +} + +async function loadMultimodalEmbeddingInput( + entry: Pick, +): Promise { + if (entry.kind !== "multimodal" || !entry.contentText || !entry.mimeType) { + return null; + } + let buffer: Buffer; + try { + buffer = await fs.readFile(entry.absPath); + } catch (err) { + if (isFileMissingError(err)) { + return null; + } + throw err; + } + return { + text: entry.contentText, + parts: [ + { type: "text", text: entry.contentText }, + { + type: "inline-data", + mimeType: entry.mimeType, + data: buffer.toString("base64"), + }, + ], + }; +} + +export async function buildMultimodalChunkForIndexing( + entry: Pick, +): Promise { + const embeddingInput = await loadMultimodalEmbeddingInput(entry); + if (!embeddingInput) { + return null; + } + return { + chunk: { + startLine: 1, + endLine: 1, + text: entry.contentText ?? embeddingInput.text, + hash: entry.hash, + embeddingInput, + }, + structuredInputBytes: estimateStructuredEmbeddingInputBytes(embeddingInput), }; } @@ -213,6 +339,7 @@ export function chunkMarkdown( endLine, text, hash: hashText(text), + embeddingInput: buildTextEmbeddingInput(text), }); }; diff --git a/src/memory/manager-embedding-ops.ts b/src/memory/manager-embedding-ops.ts index bcc653fda7a..49171d809cb 100644 --- a/src/memory/manager-embedding-ops.ts +++ b/src/memory/manager-embedding-ops.ts @@ -8,9 +8,14 @@ import { } from "./batch-openai.js"; import { type VoyageBatchRequest, runVoyageEmbeddingBatches } from "./batch-voyage.js"; import { enforceEmbeddingMaxInputTokens } from "./embedding-chunk-limits.js"; -import { estimateUtf8Bytes } from "./embedding-input-limits.js"; -import { buildGeminiTextEmbeddingRequest } from "./embeddings-gemini.js"; import { + estimateStructuredEmbeddingInputBytes, + estimateUtf8Bytes, +} from "./embedding-input-limits.js"; +import { type EmbeddingInput, hasNonTextEmbeddingParts } from "./embedding-inputs.js"; +import { buildGeminiEmbeddingRequest } from "./embeddings-gemini.js"; +import { + buildMultimodalChunkForIndexing, chunkMarkdown, hashText, parseEmbedding, @@ -53,7 +58,9 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps { let currentTokens = 0; for (const chunk of chunks) { - const estimate = estimateUtf8Bytes(chunk.text); + const estimate = chunk.embeddingInput + ? estimateStructuredEmbeddingInputBytes(chunk.embeddingInput) + : estimateUtf8Bytes(chunk.text); const wouldExceed = current.length > 0 && currentTokens + estimate > EMBEDDING_BATCH_MAX_TOKENS; if (wouldExceed) { @@ -188,9 +195,22 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps { const missingChunks = missing.map((m) => m.chunk); const batches = this.buildEmbeddingBatches(missingChunks); const toCache: Array<{ hash: string; embedding: number[] }> = []; + const provider = this.provider; + if (!provider) { + throw new Error("Cannot embed batch in FTS-only mode (no embedding provider)"); + } let cursor = 0; for (const batch of batches) { - const batchEmbeddings = await this.embedBatchWithRetry(batch.map((chunk) => chunk.text)); + const inputs = batch.map((chunk) => chunk.embeddingInput ?? { text: chunk.text }); + const hasStructuredInputs = inputs.some((input) => hasNonTextEmbeddingParts(input)); + if (hasStructuredInputs && !provider.embedBatchInputs) { + throw new Error( + `Embedding provider "${provider.id}" does not support multimodal memory inputs.`, + ); + } + const batchEmbeddings = hasStructuredInputs + ? await this.embedBatchInputsWithRetry(inputs) + : await this.embedBatchWithRetry(batch.map((chunk) => chunk.text)); for (let i = 0; i < batch.length; i += 1) { const item = missing[cursor + i]; const embedding = batchEmbeddings[i] ?? []; @@ -476,6 +496,9 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps { source: MemorySource, ): Promise { const gemini = this.gemini; + if (chunks.some((chunk) => hasNonTextEmbeddingParts(chunk.embeddingInput))) { + return await this.embedChunksInBatches(chunks); + } return await this.embedChunksWithProviderBatch({ chunks, entry, @@ -483,9 +506,10 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps { provider: "gemini", enabled: Boolean(gemini), buildRequest: (chunk) => ({ - request: buildGeminiTextEmbeddingRequest({ - text: chunk.text, + request: buildGeminiEmbeddingRequest({ + input: chunk.embeddingInput ?? { text: chunk.text }, taskType: "RETRIEVAL_DOCUMENT", + modelPath: this.gemini?.modelPath, outputDimensionality: this.gemini?.outputDimensionality, }), }), @@ -536,6 +560,45 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps { } } + protected async embedBatchInputsWithRetry(inputs: EmbeddingInput[]): Promise { + if (inputs.length === 0) { + return []; + } + if (!this.provider?.embedBatchInputs) { + return await this.embedBatchWithRetry(inputs.map((input) => input.text)); + } + let attempt = 0; + let delayMs = EMBEDDING_RETRY_BASE_DELAY_MS; + while (true) { + try { + const timeoutMs = this.resolveEmbeddingTimeout("batch"); + log.debug("memory embeddings: structured batch start", { + provider: this.provider.id, + items: inputs.length, + timeoutMs, + }); + return await this.withTimeout( + this.provider.embedBatchInputs(inputs), + timeoutMs, + `memory embeddings batch timed out after ${Math.round(timeoutMs / 1000)}s`, + ); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + if (!this.isRetryableEmbeddingError(message) || attempt >= EMBEDDING_RETRY_MAX_ATTEMPTS) { + throw err; + } + const waitMs = Math.min( + EMBEDDING_RETRY_MAX_DELAY_MS, + Math.round(delayMs * (1 + Math.random() * 0.2)), + ); + log.warn(`memory embeddings rate limited; retrying structured batch in ${waitMs}ms`); + await new Promise((resolve) => setTimeout(resolve, waitMs)); + delayMs *= 2; + attempt += 1; + } + } + } + private isRetryableEmbeddingError(message: string): boolean { return /(rate[_ ]limit|too many requests|429|resource has been exhausted|5\d\d|cloudflare|tokens per day)/i.test( message, @@ -695,6 +758,49 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps { return this.batch.enabled ? this.batch.concurrency : EMBEDDING_INDEX_CONCURRENCY; } + private clearIndexedFileData(pathname: string, source: MemorySource): void { + if (this.vector.enabled) { + try { + this.db + .prepare( + `DELETE FROM ${VECTOR_TABLE} WHERE id IN (SELECT id FROM chunks WHERE path = ? AND source = ?)`, + ) + .run(pathname, source); + } catch {} + } + if (this.fts.enabled && this.fts.available && this.provider) { + try { + this.db + .prepare(`DELETE FROM ${FTS_TABLE} WHERE path = ? AND source = ? AND model = ?`) + .run(pathname, source, this.provider.model); + } catch {} + } + this.db.prepare(`DELETE FROM chunks WHERE path = ? AND source = ?`).run(pathname, source); + } + + private upsertFileRecord(entry: MemoryFileEntry | SessionFileEntry, source: MemorySource): void { + this.db + .prepare( + `INSERT INTO files (path, source, hash, mtime, size) VALUES (?, ?, ?, ?, ?) + ON CONFLICT(path) DO UPDATE SET + source=excluded.source, + hash=excluded.hash, + mtime=excluded.mtime, + size=excluded.size`, + ) + .run(entry.path, source, entry.hash, entry.mtimeMs, entry.size); + } + + private deleteFileRecord(pathname: string, source: MemorySource): void { + this.db.prepare(`DELETE FROM files WHERE path = ? AND source = ?`).run(pathname, source); + } + + private isStructuredInputTooLargeError(message: string): boolean { + return /(413|payload too large|request too large|input too large|too many tokens|input limit|request size)/i.test( + message, + ); + } + protected async indexFile( entry: MemoryFileEntry | SessionFileEntry, options: { source: MemorySource; content?: string }, @@ -708,42 +814,59 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps { return; } - const content = options.content ?? (await fs.readFile(entry.absPath, "utf-8")); - const chunks = enforceEmbeddingMaxInputTokens( - this.provider, - chunkMarkdown(content, this.settings.chunking).filter( - (chunk) => chunk.text.trim().length > 0, - ), - EMBEDDING_BATCH_MAX_TOKENS, - ); - if (options.source === "sessions" && "lineMap" in entry) { - remapChunkLines(chunks, entry.lineMap); + let chunks: MemoryChunk[]; + let structuredInputBytes: number | undefined; + if ("kind" in entry && entry.kind === "multimodal") { + const multimodalChunk = await buildMultimodalChunkForIndexing(entry); + if (!multimodalChunk) { + this.clearIndexedFileData(entry.path, options.source); + this.deleteFileRecord(entry.path, options.source); + return; + } + structuredInputBytes = multimodalChunk.structuredInputBytes; + chunks = [multimodalChunk.chunk]; + } else { + const content = options.content ?? (await fs.readFile(entry.absPath, "utf-8")); + chunks = enforceEmbeddingMaxInputTokens( + this.provider, + chunkMarkdown(content, this.settings.chunking).filter( + (chunk) => chunk.text.trim().length > 0, + ), + EMBEDDING_BATCH_MAX_TOKENS, + ); + if (options.source === "sessions" && "lineMap" in entry) { + remapChunkLines(chunks, entry.lineMap); + } + } + let embeddings: number[][]; + try { + embeddings = this.batch.enabled + ? await this.embedChunksWithBatch(chunks, entry, options.source) + : await this.embedChunksInBatches(chunks); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + if ( + "kind" in entry && + entry.kind === "multimodal" && + this.isStructuredInputTooLargeError(message) + ) { + log.warn("memory embeddings: skipping multimodal file rejected as too large", { + path: entry.path, + bytes: structuredInputBytes, + provider: this.provider.id, + model: this.provider.model, + error: message, + }); + this.clearIndexedFileData(entry.path, options.source); + this.upsertFileRecord(entry, options.source); + return; + } + throw err; } - const embeddings = this.batch.enabled - ? await this.embedChunksWithBatch(chunks, entry, options.source) - : await this.embedChunksInBatches(chunks); const sample = embeddings.find((embedding) => embedding.length > 0); const vectorReady = sample ? await this.ensureVectorReady(sample.length) : false; const now = Date.now(); - if (vectorReady) { - try { - this.db - .prepare( - `DELETE FROM ${VECTOR_TABLE} WHERE id IN (SELECT id FROM chunks WHERE path = ? AND source = ?)`, - ) - .run(entry.path, options.source); - } catch {} - } - if (this.fts.enabled && this.fts.available) { - try { - this.db - .prepare(`DELETE FROM ${FTS_TABLE} WHERE path = ? AND source = ? AND model = ?`) - .run(entry.path, options.source, this.provider.model); - } catch {} - } - this.db - .prepare(`DELETE FROM chunks WHERE path = ? AND source = ?`) - .run(entry.path, options.source); + this.clearIndexedFileData(entry.path, options.source); for (let i = 0; i < chunks.length; i++) { const chunk = chunks[i]; const embedding = embeddings[i] ?? []; @@ -798,15 +921,6 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps { ); } } - this.db - .prepare( - `INSERT INTO files (path, source, hash, mtime, size) VALUES (?, ?, ?, ?, ?) - ON CONFLICT(path) DO UPDATE SET - source=excluded.source, - hash=excluded.hash, - mtime=excluded.mtime, - size=excluded.size`, - ) - .run(entry.path, options.source, entry.hash, entry.mtimeMs, entry.size); + this.upsertFileRecord(entry, options.source); } } diff --git a/src/memory/manager-sync-ops.ts b/src/memory/manager-sync-ops.ts index 7bdf8fcdd2e..6fd3e6bb9c0 100644 --- a/src/memory/manager-sync-ops.ts +++ b/src/memory/manager-sync-ops.ts @@ -29,12 +29,18 @@ import { isFileMissingError } from "./fs-utils.js"; import { buildFileEntry, ensureDir, + hashText, listMemoryFiles, normalizeExtraMemoryPaths, runWithConcurrency, } from "./internal.js"; import { type MemoryFileEntry } from "./internal.js"; import { ensureMemoryIndexSchema } from "./memory-schema.js"; +import { + buildCaseInsensitiveExtensionGlob, + classifyMemoryMultimodalPath, + getMemoryMultimodalExtensions, +} from "./multimodal.js"; import type { SessionFileEntry } from "./session-files.js"; import { buildSessionEntry, @@ -50,6 +56,7 @@ type MemoryIndexMeta = { provider: string; providerKey?: string; sources?: MemorySource[]; + scopeHash?: string; chunkTokens: number; chunkOverlap: number; vectorDims?: number; @@ -383,9 +390,22 @@ export abstract class MemoryManagerSyncOps { } if (stat.isDirectory()) { watchPaths.add(path.join(entry, "**", "*.md")); + if (this.settings.multimodal.enabled) { + for (const modality of this.settings.multimodal.modalities) { + for (const extension of getMemoryMultimodalExtensions(modality)) { + watchPaths.add( + path.join(entry, "**", buildCaseInsensitiveExtensionGlob(extension)), + ); + } + } + } continue; } - if (stat.isFile() && entry.toLowerCase().endsWith(".md")) { + if ( + stat.isFile() && + (entry.toLowerCase().endsWith(".md") || + classifyMemoryMultimodalPath(entry, this.settings.multimodal) !== null) + ) { watchPaths.add(entry); } } catch { @@ -649,9 +669,19 @@ export abstract class MemoryManagerSyncOps { return; } - const files = await listMemoryFiles(this.workspaceDir, this.settings.extraPaths); + const files = await listMemoryFiles( + this.workspaceDir, + this.settings.extraPaths, + this.settings.multimodal, + ); const fileEntries = ( - await Promise.all(files.map(async (file) => buildFileEntry(file, this.workspaceDir))) + await runWithConcurrency( + files.map( + (file) => async () => + await buildFileEntry(file, this.workspaceDir, this.settings.multimodal), + ), + this.getIndexConcurrency(), + ) ).filter((entry): entry is MemoryFileEntry => entry !== null); log.debug("memory sync: indexing memory files", { files: fileEntries.length, @@ -868,6 +898,7 @@ export abstract class MemoryManagerSyncOps { const vectorReady = await this.ensureVectorReady(); const meta = this.readMeta(); const configuredSources = this.resolveConfiguredSourcesForMeta(); + const configuredScopeHash = this.resolveConfiguredScopeHash(); const needsFullReindex = params?.force || !meta || @@ -875,6 +906,7 @@ export abstract class MemoryManagerSyncOps { (this.provider && meta.provider !== this.provider.id) || meta.providerKey !== this.providerKey || this.metaSourcesDiffer(meta, configuredSources) || + meta.scopeHash !== configuredScopeHash || meta.chunkTokens !== this.settings.chunking.tokens || meta.chunkOverlap !== this.settings.chunking.overlap || (vectorReady && !meta?.vectorDims); @@ -1088,6 +1120,7 @@ export abstract class MemoryManagerSyncOps { provider: this.provider?.id ?? "none", providerKey: this.providerKey!, sources: this.resolveConfiguredSourcesForMeta(), + scopeHash: this.resolveConfiguredScopeHash(), chunkTokens: this.settings.chunking.tokens, chunkOverlap: this.settings.chunking.overlap, }; @@ -1159,6 +1192,7 @@ export abstract class MemoryManagerSyncOps { provider: this.provider?.id ?? "none", providerKey: this.providerKey!, sources: this.resolveConfiguredSourcesForMeta(), + scopeHash: this.resolveConfiguredScopeHash(), chunkTokens: this.settings.chunking.tokens, chunkOverlap: this.settings.chunking.overlap, }; @@ -1236,6 +1270,22 @@ export abstract class MemoryManagerSyncOps { return normalized.length > 0 ? normalized : ["memory"]; } + private resolveConfiguredScopeHash(): string { + const extraPaths = normalizeExtraMemoryPaths(this.workspaceDir, this.settings.extraPaths) + .map((value) => value.replace(/\\/g, "/")) + .toSorted(); + return hashText( + JSON.stringify({ + extraPaths, + multimodal: { + enabled: this.settings.multimodal.enabled, + modalities: [...this.settings.multimodal.modalities].toSorted(), + maxFileBytes: this.settings.multimodal.maxFileBytes, + }, + }), + ); + } + private metaSourcesDiffer(meta: MemoryIndexMeta, configuredSources: MemorySource[]): boolean { const metaSources = this.normalizeMetaSources(meta); if (metaSources.length !== configuredSources.length) { diff --git a/src/memory/manager.watcher-config.test.ts b/src/memory/manager.watcher-config.test.ts index 77221df34b6..43682183676 100644 --- a/src/memory/manager.watcher-config.test.ts +++ b/src/memory/manager.watcher-config.test.ts @@ -106,4 +106,50 @@ describe("memory watcher config", () => { expect(ignored?.(path.join(workspaceDir, "memory", ".venv", "lib", "python.md"))).toBe(true); expect(ignored?.(path.join(workspaceDir, "memory", "project", "notes.md"))).toBe(false); }); + + it("watches multimodal extensions with case-insensitive globs", async () => { + workspaceDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-memory-watch-")); + extraDir = path.join(workspaceDir, "extra"); + await fs.mkdir(path.join(workspaceDir, "memory"), { recursive: true }); + await fs.mkdir(extraDir, { recursive: true }); + await fs.writeFile(path.join(extraDir, "PHOTO.PNG"), "png"); + + const cfg = { + agents: { + defaults: { + workspace: workspaceDir, + memorySearch: { + provider: "gemini", + model: "gemini-embedding-2-preview", + fallback: "none", + store: { path: path.join(workspaceDir, "index.sqlite"), vector: { enabled: false } }, + sync: { watch: true, watchDebounceMs: 25, onSessionStart: false, onSearch: false }, + query: { minScore: 0, hybrid: { enabled: false } }, + extraPaths: [extraDir], + multimodal: { enabled: true, modalities: ["image", "audio"] }, + }, + }, + list: [{ id: "main", default: true }], + }, + } as OpenClawConfig; + + const result = await getMemorySearchManager({ cfg, agentId: "main" }); + expect(result.manager).not.toBeNull(); + if (!result.manager) { + throw new Error("manager missing"); + } + manager = result.manager as unknown as MemoryIndexManager; + + expect(watchMock).toHaveBeenCalledTimes(1); + const [watchedPaths] = watchMock.mock.calls[0] as unknown as [ + string[], + Record, + ]; + expect(watchedPaths).toEqual( + expect.arrayContaining([ + path.join(extraDir, "**", "*.[pP][nN][gG]"), + path.join(extraDir, "**", "*.[wW][aA][vV]"), + ]), + ); + }); }); diff --git a/src/memory/multimodal.ts b/src/memory/multimodal.ts new file mode 100644 index 00000000000..df72ed8c495 --- /dev/null +++ b/src/memory/multimodal.ts @@ -0,0 +1,118 @@ +const MEMORY_MULTIMODAL_SPECS = { + image: { + labelPrefix: "Image file", + extensions: [".jpg", ".jpeg", ".png", ".webp", ".gif", ".heic", ".heif"], + }, + audio: { + labelPrefix: "Audio file", + extensions: [".mp3", ".wav", ".ogg", ".opus", ".m4a", ".aac", ".flac"], + }, +} as const; + +export type MemoryMultimodalModality = keyof typeof MEMORY_MULTIMODAL_SPECS; +export const MEMORY_MULTIMODAL_MODALITIES = Object.keys( + MEMORY_MULTIMODAL_SPECS, +) as MemoryMultimodalModality[]; +export type MemoryMultimodalSelection = MemoryMultimodalModality | "all"; + +export type MemoryMultimodalSettings = { + enabled: boolean; + modalities: MemoryMultimodalModality[]; + maxFileBytes: number; +}; + +export const DEFAULT_MEMORY_MULTIMODAL_MAX_FILE_BYTES = 10 * 1024 * 1024; + +export function normalizeMemoryMultimodalModalities( + raw: MemoryMultimodalSelection[] | undefined, +): MemoryMultimodalModality[] { + if (raw === undefined || raw.includes("all")) { + return [...MEMORY_MULTIMODAL_MODALITIES]; + } + const normalized = new Set(); + for (const value of raw) { + if (value === "image" || value === "audio") { + normalized.add(value); + } + } + return Array.from(normalized); +} + +export function normalizeMemoryMultimodalSettings(raw: { + enabled?: boolean; + modalities?: MemoryMultimodalSelection[]; + maxFileBytes?: number; +}): MemoryMultimodalSettings { + const enabled = raw.enabled === true; + const maxFileBytes = + typeof raw.maxFileBytes === "number" && Number.isFinite(raw.maxFileBytes) + ? Math.max(1, Math.floor(raw.maxFileBytes)) + : DEFAULT_MEMORY_MULTIMODAL_MAX_FILE_BYTES; + return { + enabled, + modalities: enabled ? normalizeMemoryMultimodalModalities(raw.modalities) : [], + maxFileBytes, + }; +} + +export function isMemoryMultimodalEnabled(settings: MemoryMultimodalSettings): boolean { + return settings.enabled && settings.modalities.length > 0; +} + +export function getMemoryMultimodalExtensions( + modality: MemoryMultimodalModality, +): readonly string[] { + return MEMORY_MULTIMODAL_SPECS[modality].extensions; +} + +export function buildMemoryMultimodalLabel( + modality: MemoryMultimodalModality, + normalizedPath: string, +): string { + return `${MEMORY_MULTIMODAL_SPECS[modality].labelPrefix}: ${normalizedPath}`; +} + +export function buildCaseInsensitiveExtensionGlob(extension: string): string { + const normalized = extension.trim().replace(/^\./, "").toLowerCase(); + if (!normalized) { + return "*"; + } + const parts = Array.from(normalized, (char) => `[${char.toLowerCase()}${char.toUpperCase()}]`); + return `*.${parts.join("")}`; +} + +export function classifyMemoryMultimodalPath( + filePath: string, + settings: MemoryMultimodalSettings, +): MemoryMultimodalModality | null { + if (!isMemoryMultimodalEnabled(settings)) { + return null; + } + const lower = filePath.trim().toLowerCase(); + for (const modality of settings.modalities) { + for (const extension of getMemoryMultimodalExtensions(modality)) { + if (lower.endsWith(extension)) { + return modality; + } + } + } + return null; +} + +export function normalizeGeminiEmbeddingModelForMemory(model: string): string { + const trimmed = model.trim(); + if (!trimmed) { + return ""; + } + return trimmed.replace(/^models\//, "").replace(/^(gemini|google)\//, ""); +} + +export function supportsMemoryMultimodalEmbeddings(params: { + provider: string; + model: string; +}): boolean { + if (params.provider !== "gemini") { + return false; + } + return normalizeGeminiEmbeddingModelForMemory(params.model) === "gemini-embedding-2-preview"; +}