From 78d5dcfb77d91f2b0adf85d9941edf7f050de35d Mon Sep 17 00:00:00 2001 From: Gustavo Madeira Santana Date: Wed, 11 Mar 2026 21:14:21 +0000 Subject: [PATCH] memory: defer multimodal payload loading --- src/agents/memory-search.test.ts | 23 +++++++++++++++ src/memory/internal.test.ts | 14 ++++++++- src/memory/internal.ts | 45 ++++++++++++++++++----------- src/memory/manager-embedding-ops.ts | 11 +++++-- src/memory/manager-sync-ops.ts | 8 +++-- src/memory/multimodal.ts | 2 +- 6 files changed, 78 insertions(+), 25 deletions(-) diff --git a/src/agents/memory-search.test.ts b/src/agents/memory-search.test.ts index edbdb417eaa..4aa3b2f5ebe 100644 --- a/src/agents/memory-search.test.ts +++ b/src/agents/memory-search.test.ts @@ -155,6 +155,29 @@ describe("memory search config", () => { }); }); + it("keeps an explicit empty multimodal modalities list empty", () => { + const cfg = asConfig({ + agents: { + defaults: { + memorySearch: { + provider: "gemini", + model: "gemini-embedding-2-preview", + multimodal: { + enabled: true, + modalities: [], + }, + }, + }, + }, + }); + const resolved = resolveMemorySearchConfig(cfg, "main"); + expect(resolved?.multimodal).toEqual({ + enabled: true, + modalities: [], + maxFileBytes: 10 * 1024 * 1024, + }); + }); + it("rejects multimodal memory on unsupported providers", () => { const cfg = asConfig({ agents: { diff --git a/src/memory/internal.test.ts b/src/memory/internal.test.ts index f508791d369..59310d43251 100644 --- a/src/memory/internal.test.ts +++ b/src/memory/internal.test.ts @@ -6,6 +6,7 @@ import { buildFileEntry, chunkMarkdown, listMemoryFiles, + loadMultimodalEmbeddingInput, normalizeExtraMemoryPaths, remapChunkLines, } from "./internal.js"; @@ -197,7 +198,18 @@ describe("buildFileEntry", () => { mimeType: "image/png", contentText: "Image file: diagram.png", }); - expect(entry?.embeddingInput?.parts).toEqual([ + expect(entry?.embeddingInput).toBeUndefined(); + }); + + it("loads multimodal embedding input lazily", async () => { + const tmpDir = getTmpDir(); + const target = path.join(tmpDir, "diagram.png"); + await fs.writeFile(target, Buffer.from("png")); + + const entry = await buildFileEntry(target, tmpDir, multimodal); + const input = await loadMultimodalEmbeddingInput(entry!); + + expect(input?.parts).toEqual([ { type: "text", text: "Image file: diagram.png" }, expect.objectContaining({ type: "inline-data", mimeType: "image/png" }), ]); diff --git a/src/memory/internal.ts b/src/memory/internal.ts index a2d5591fccd..e4084968154 100644 --- a/src/memory/internal.ts +++ b/src/memory/internal.ts @@ -8,7 +8,6 @@ import { buildTextEmbeddingInput, type EmbeddingInput } from "./embedding-inputs import { isFileMissingError } from "./fs-utils.js"; import { classifyMemoryMultimodalPath, - isMemoryMultimodalEnabled, type MemoryMultimodalModality, type MemoryMultimodalSettings, } from "./multimodal.js"; @@ -21,7 +20,6 @@ export type MemoryFileEntry = { hash: string; kind?: "markdown" | "multimodal"; contentText?: string; - embeddingInput?: EmbeddingInput; modality?: MemoryMultimodalModality; mimeType?: string; }; @@ -197,9 +195,6 @@ export async function buildFileEntry( const multimodalSettings = multimodal ?? DISABLED_MULTIMODAL_SETTINGS; const modality = classifyMemoryMultimodalPath(absPath, multimodalSettings); if (modality) { - if (!isMemoryMultimodalEnabled(multimodalSettings)) { - return null; - } if (stat.size > multimodalSettings.maxFileBytes) { return null; } @@ -217,17 +212,6 @@ export async function buildFileEntry( return null; } const contentText = `${modality === "image" ? "Image" : "Audio"} file: ${normalizedPath}`; - const embeddingInput: EmbeddingInput = { - text: contentText, - parts: [ - { type: "text", text: contentText }, - { - type: "inline-data", - mimeType, - data: buffer.toString("base64"), - }, - ], - }; const dataHash = crypto.createHash("sha256").update(buffer).digest("hex"); const chunkHash = hashText( JSON.stringify({ @@ -245,7 +229,6 @@ export async function buildFileEntry( hash: chunkHash, kind: "multimodal", contentText, - embeddingInput, modality, mimeType, }; @@ -270,6 +253,34 @@ export async function buildFileEntry( }; } +export async function loadMultimodalEmbeddingInput( + entry: Pick, +): Promise { + if (entry.kind !== "multimodal" || !entry.contentText || !entry.mimeType) { + return null; + } + let buffer: Buffer; + try { + buffer = await fs.readFile(entry.absPath); + } catch (err) { + if (isFileMissingError(err)) { + return null; + } + throw err; + } + return { + text: entry.contentText, + parts: [ + { type: "text", text: entry.contentText }, + { + type: "inline-data", + mimeType: entry.mimeType, + data: buffer.toString("base64"), + }, + ], + }; +} + export function chunkMarkdown( content: string, chunking: { tokens: number; overlap: number }, diff --git a/src/memory/manager-embedding-ops.ts b/src/memory/manager-embedding-ops.ts index 53f487e72df..dadaadb513f 100644 --- a/src/memory/manager-embedding-ops.ts +++ b/src/memory/manager-embedding-ops.ts @@ -17,6 +17,7 @@ import { buildGeminiEmbeddingRequest } from "./embeddings-gemini.js"; import { chunkMarkdown, hashText, + loadMultimodalEmbeddingInput, parseEmbedding, remapChunkLines, type MemoryChunk, @@ -771,14 +772,18 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps { } let chunks: MemoryChunk[]; - if ("kind" in entry && entry.kind === "multimodal" && entry.embeddingInput) { + if ("kind" in entry && entry.kind === "multimodal") { + const embeddingInput = await loadMultimodalEmbeddingInput(entry); + if (!embeddingInput) { + return; + } chunks = [ { startLine: 1, endLine: 1, - text: entry.contentText ?? entry.embeddingInput.text, + text: entry.contentText ?? embeddingInput.text, hash: entry.hash, - embeddingInput: entry.embeddingInput, + embeddingInput, }, ]; } else { diff --git a/src/memory/manager-sync-ops.ts b/src/memory/manager-sync-ops.ts index 8b61b8cae1a..d59d4ff04d8 100644 --- a/src/memory/manager-sync-ops.ts +++ b/src/memory/manager-sync-ops.ts @@ -671,10 +671,12 @@ export abstract class MemoryManagerSyncOps { this.settings.multimodal, ); const fileEntries = ( - await Promise.all( - files.map(async (file) => - buildFileEntry(file, this.workspaceDir, this.settings.multimodal), + await runWithConcurrency( + files.map( + (file) => async () => + await buildFileEntry(file, this.workspaceDir, this.settings.multimodal), ), + this.getIndexConcurrency(), ) ).filter((entry): entry is MemoryFileEntry => entry !== null); log.debug("memory sync: indexing memory files", { diff --git a/src/memory/multimodal.ts b/src/memory/multimodal.ts index bbbb5269e8d..45796071055 100644 --- a/src/memory/multimodal.ts +++ b/src/memory/multimodal.ts @@ -17,7 +17,7 @@ const AUDIO_EXTENSIONS = new Set([".mp3", ".wav", ".ogg", ".opus", ".m4a", ".aac export function normalizeMemoryMultimodalModalities( raw: MemoryMultimodalSelection[] | undefined, ): MemoryMultimodalModality[] { - if (!raw?.length || raw.includes("all")) { + if (raw === undefined || raw.includes("all")) { return [...MEMORY_MULTIMODAL_MODALITIES]; } const normalized = new Set();