From b980c086fbaf1ddbeeb623e44692b9f84f5e69ec Mon Sep 17 00:00:00 2001 From: Gustavo Madeira Santana Date: Wed, 11 Mar 2026 21:56:10 +0000 Subject: [PATCH] memory: simplify multimodal embedding plumbing --- src/memory/embeddings-gemini.test.ts | 38 ---------------------------- src/memory/embeddings-gemini.ts | 23 +---------------- src/memory/internal.test.ts | 9 ++++--- src/memory/internal.ts | 30 ++++++++++++++++++++-- src/memory/manager-embedding-ops.ts | 18 ++++--------- src/memory/manager-sync-ops.ts | 12 ++++----- src/memory/multimodal.ts | 37 +++++++++++++++++++++------ 7 files changed, 74 insertions(+), 93 deletions(-) diff --git a/src/memory/embeddings-gemini.test.ts b/src/memory/embeddings-gemini.test.ts index 8926a42d6b6..25d5f373431 100644 --- a/src/memory/embeddings-gemini.test.ts +++ b/src/memory/embeddings-gemini.test.ts @@ -2,16 +2,12 @@ import { afterEach, describe, expect, it, vi } from "vitest"; import * as authModule from "../agents/model-auth.js"; import { buildGeminiEmbeddingRequest, - buildFileDataPart, - buildGeminiParts, buildGeminiTextEmbeddingRequest, - buildInlineDataPart, createGeminiEmbeddingProvider, DEFAULT_GEMINI_EMBEDDING_MODEL, GEMINI_EMBEDDING_2_MODELS, isGeminiEmbedding2Model, resolveGeminiOutputDimensionality, - type GeminiPart, } from "./embeddings-gemini.js"; vi.mock("../agents/model-auth.js", async () => { @@ -62,40 +58,6 @@ function mockResolvedProviderKey(apiKey = "test-key") { }); } -// ---------- Helper function tests ---------- - -describe("buildGeminiParts", () => { - it("wraps a string into a single text part", () => { - expect(buildGeminiParts("hello")).toEqual([{ text: "hello" }]); - }); - - it("passes through an existing parts array", () => { - const parts: GeminiPart[] = [ - { text: "hello" }, - { inlineData: { mimeType: "image/png", data: "base64data" } }, - ]; - expect(buildGeminiParts(parts)).toBe(parts); - }); -}); - -describe("buildInlineDataPart", () => { - it("produces the correct shape", () => { - const part = buildInlineDataPart("image/jpeg", "abc123"); - expect(part).toEqual({ - inlineData: { mimeType: "image/jpeg", data: "abc123" }, - }); - }); -}); - -describe("buildFileDataPart", () => { - it("produces the correct shape", () => { - const part = buildFileDataPart("application/pdf", "gs://bucket/file.pdf"); - expect(part).toEqual({ - fileData: { mimeType: "application/pdf", fileUri: "gs://bucket/file.pdf" }, - }); - }); -}); - describe("buildGeminiTextEmbeddingRequest", () => { it("builds a text embedding request with optional model and dimensions", () => { expect( diff --git a/src/memory/embeddings-gemini.ts b/src/memory/embeddings-gemini.ts index e8f7e098f36..cb29af009d9 100644 --- a/src/memory/embeddings-gemini.ts +++ b/src/memory/embeddings-gemini.ts @@ -51,10 +51,7 @@ export type GeminiTextPart = { text: string }; export type GeminiInlinePart = { inlineData: { mimeType: string; data: string }; }; -export type GeminiFilePart = { - fileData: { mimeType: string; fileUri: string }; -}; -export type GeminiPart = GeminiTextPart | GeminiInlinePart | GeminiFilePart; +export type GeminiPart = GeminiTextPart | GeminiInlinePart; export type GeminiEmbeddingRequest = { content: { parts: GeminiPart[] }; taskType: GeminiTaskType; @@ -63,24 +60,6 @@ export type GeminiEmbeddingRequest = { }; export type GeminiTextEmbeddingRequest = GeminiEmbeddingRequest; -/** Convert a string or pre-built parts array into `GeminiPart[]`. */ -export function buildGeminiParts(input: string | GeminiPart[]): GeminiPart[] { - if (typeof input === "string") { - return [{ text: input }]; - } - return input; -} - -/** Convenience: build an inline-data part for multimodal embeddings. */ -export function buildInlineDataPart(mimeType: string, base64Data: string): GeminiInlinePart { - return { inlineData: { mimeType, data: base64Data } }; -} - -/** Convenience: build a file-data part for multimodal embeddings. */ -export function buildFileDataPart(mimeType: string, fileUri: string): GeminiFilePart { - return { fileData: { mimeType, fileUri } }; -} - /** Builds the text-only Gemini embedding request shape used across direct and batch APIs. */ export function buildGeminiTextEmbeddingRequest(params: { text: string; diff --git a/src/memory/internal.test.ts b/src/memory/internal.test.ts index 59310d43251..598f9d0e551 100644 --- a/src/memory/internal.test.ts +++ b/src/memory/internal.test.ts @@ -3,10 +3,10 @@ import os from "node:os"; import path from "node:path"; import { afterEach, beforeEach, describe, expect, it } from "vitest"; import { + buildMultimodalChunkForIndexing, buildFileEntry, chunkMarkdown, listMemoryFiles, - loadMultimodalEmbeddingInput, normalizeExtraMemoryPaths, remapChunkLines, } from "./internal.js"; @@ -201,18 +201,19 @@ describe("buildFileEntry", () => { expect(entry?.embeddingInput).toBeUndefined(); }); - it("loads multimodal embedding input lazily", async () => { + it("builds a multimodal chunk lazily for indexing", async () => { const tmpDir = getTmpDir(); const target = path.join(tmpDir, "diagram.png"); await fs.writeFile(target, Buffer.from("png")); const entry = await buildFileEntry(target, tmpDir, multimodal); - const input = await loadMultimodalEmbeddingInput(entry!); + const built = await buildMultimodalChunkForIndexing(entry!); - expect(input?.parts).toEqual([ + expect(built?.chunk.embeddingInput?.parts).toEqual([ { type: "text", text: "Image file: diagram.png" }, expect.objectContaining({ type: "inline-data", mimeType: "image/png" }), ]); + expect(built?.structuredInputBytes).toBeGreaterThan(0); }); }); diff --git a/src/memory/internal.ts b/src/memory/internal.ts index e4084968154..96ce0e918ad 100644 --- a/src/memory/internal.ts +++ b/src/memory/internal.ts @@ -4,9 +4,11 @@ import fs from "node:fs/promises"; import path from "node:path"; import { detectMime } from "../media/mime.js"; import { runTasksWithConcurrency } from "../utils/run-with-concurrency.js"; +import { estimateStructuredEmbeddingInputBytes } from "./embedding-input-limits.js"; import { buildTextEmbeddingInput, type EmbeddingInput } from "./embedding-inputs.js"; import { isFileMissingError } from "./fs-utils.js"; import { + buildMemoryMultimodalLabel, classifyMemoryMultimodalPath, type MemoryMultimodalModality, type MemoryMultimodalSettings, @@ -32,6 +34,11 @@ export type MemoryChunk = { embeddingInput?: EmbeddingInput; }; +export type MultimodalMemoryChunk = { + chunk: MemoryChunk; + structuredInputBytes: number; +}; + const DISABLED_MULTIMODAL_SETTINGS: MemoryMultimodalSettings = { enabled: false, modalities: [], @@ -211,7 +218,7 @@ export async function buildFileEntry( if (!mimeType || !mimeType.startsWith(`${modality}/`)) { return null; } - const contentText = `${modality === "image" ? "Image" : "Audio"} file: ${normalizedPath}`; + const contentText = buildMemoryMultimodalLabel(modality, normalizedPath); const dataHash = crypto.createHash("sha256").update(buffer).digest("hex"); const chunkHash = hashText( JSON.stringify({ @@ -253,7 +260,7 @@ export async function buildFileEntry( }; } -export async function loadMultimodalEmbeddingInput( +async function loadMultimodalEmbeddingInput( entry: Pick, ): Promise { if (entry.kind !== "multimodal" || !entry.contentText || !entry.mimeType) { @@ -281,6 +288,25 @@ export async function loadMultimodalEmbeddingInput( }; } +export async function buildMultimodalChunkForIndexing( + entry: Pick, +): Promise { + const embeddingInput = await loadMultimodalEmbeddingInput(entry); + if (!embeddingInput) { + return null; + } + return { + chunk: { + startLine: 1, + endLine: 1, + text: entry.contentText ?? embeddingInput.text, + hash: entry.hash, + embeddingInput, + }, + structuredInputBytes: estimateStructuredEmbeddingInputBytes(embeddingInput), + }; +} + export function chunkMarkdown( content: string, chunking: { tokens: number; overlap: number }, diff --git a/src/memory/manager-embedding-ops.ts b/src/memory/manager-embedding-ops.ts index 2487cfa973c..3f4521e12d1 100644 --- a/src/memory/manager-embedding-ops.ts +++ b/src/memory/manager-embedding-ops.ts @@ -15,9 +15,9 @@ import { import { type EmbeddingInput, hasNonTextEmbeddingParts } from "./embedding-inputs.js"; import { buildGeminiEmbeddingRequest } from "./embeddings-gemini.js"; import { + buildMultimodalChunkForIndexing, chunkMarkdown, hashText, - loadMultimodalEmbeddingInput, parseEmbedding, remapChunkLines, type MemoryChunk, @@ -813,21 +813,13 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps { let chunks: MemoryChunk[]; let structuredInputBytes: number | undefined; if ("kind" in entry && entry.kind === "multimodal") { - const embeddingInput = await loadMultimodalEmbeddingInput(entry); - if (!embeddingInput) { + const multimodalChunk = await buildMultimodalChunkForIndexing(entry); + if (!multimodalChunk) { this.clearIndexedFileData(entry.path, options.source); return; } - structuredInputBytes = estimateStructuredEmbeddingInputBytes(embeddingInput); - chunks = [ - { - startLine: 1, - endLine: 1, - text: entry.contentText ?? embeddingInput.text, - hash: entry.hash, - embeddingInput, - }, - ]; + structuredInputBytes = multimodalChunk.structuredInputBytes; + chunks = [multimodalChunk.chunk]; } else { const content = options.content ?? (await fs.readFile(entry.absPath, "utf-8")); chunks = enforceEmbeddingMaxInputTokens( diff --git a/src/memory/manager-sync-ops.ts b/src/memory/manager-sync-ops.ts index f883c0d53b6..6fd3e6bb9c0 100644 --- a/src/memory/manager-sync-ops.ts +++ b/src/memory/manager-sync-ops.ts @@ -36,7 +36,11 @@ import { } from "./internal.js"; import { type MemoryFileEntry } from "./internal.js"; import { ensureMemoryIndexSchema } from "./memory-schema.js"; -import { buildCaseInsensitiveExtensionGlob, classifyMemoryMultimodalPath } from "./multimodal.js"; +import { + buildCaseInsensitiveExtensionGlob, + classifyMemoryMultimodalPath, + getMemoryMultimodalExtensions, +} from "./multimodal.js"; import type { SessionFileEntry } from "./session-files.js"; import { buildSessionEntry, @@ -388,11 +392,7 @@ export abstract class MemoryManagerSyncOps { watchPaths.add(path.join(entry, "**", "*.md")); if (this.settings.multimodal.enabled) { for (const modality of this.settings.multimodal.modalities) { - const extensions = - modality === "image" - ? [".jpg", ".jpeg", ".png", ".webp", ".gif", ".heic", ".heif"] - : [".mp3", ".wav", ".ogg", ".opus", ".m4a", ".aac", ".flac"]; - for (const extension of extensions) { + for (const extension of getMemoryMultimodalExtensions(modality)) { watchPaths.add( path.join(entry, "**", buildCaseInsensitiveExtensionGlob(extension)), ); diff --git a/src/memory/multimodal.ts b/src/memory/multimodal.ts index 40c1707f512..df72ed8c495 100644 --- a/src/memory/multimodal.ts +++ b/src/memory/multimodal.ts @@ -1,5 +1,18 @@ -export const MEMORY_MULTIMODAL_MODALITIES = ["image", "audio"] as const; -export type MemoryMultimodalModality = (typeof MEMORY_MULTIMODAL_MODALITIES)[number]; +const MEMORY_MULTIMODAL_SPECS = { + image: { + labelPrefix: "Image file", + extensions: [".jpg", ".jpeg", ".png", ".webp", ".gif", ".heic", ".heif"], + }, + audio: { + labelPrefix: "Audio file", + extensions: [".mp3", ".wav", ".ogg", ".opus", ".m4a", ".aac", ".flac"], + }, +} as const; + +export type MemoryMultimodalModality = keyof typeof MEMORY_MULTIMODAL_SPECS; +export const MEMORY_MULTIMODAL_MODALITIES = Object.keys( + MEMORY_MULTIMODAL_SPECS, +) as MemoryMultimodalModality[]; export type MemoryMultimodalSelection = MemoryMultimodalModality | "all"; export type MemoryMultimodalSettings = { @@ -10,10 +23,6 @@ export type MemoryMultimodalSettings = { export const DEFAULT_MEMORY_MULTIMODAL_MAX_FILE_BYTES = 10 * 1024 * 1024; -const IMAGE_EXTENSIONS = new Set([".jpg", ".jpeg", ".png", ".webp", ".gif", ".heic", ".heif"]); - -const AUDIO_EXTENSIONS = new Set([".mp3", ".wav", ".ogg", ".opus", ".m4a", ".aac", ".flac"]); - export function normalizeMemoryMultimodalModalities( raw: MemoryMultimodalSelection[] | undefined, ): MemoryMultimodalModality[] { @@ -50,6 +59,19 @@ export function isMemoryMultimodalEnabled(settings: MemoryMultimodalSettings): b return settings.enabled && settings.modalities.length > 0; } +export function getMemoryMultimodalExtensions( + modality: MemoryMultimodalModality, +): readonly string[] { + return MEMORY_MULTIMODAL_SPECS[modality].extensions; +} + +export function buildMemoryMultimodalLabel( + modality: MemoryMultimodalModality, + normalizedPath: string, +): string { + return `${MEMORY_MULTIMODAL_SPECS[modality].labelPrefix}: ${normalizedPath}`; +} + export function buildCaseInsensitiveExtensionGlob(extension: string): string { const normalized = extension.trim().replace(/^\./, "").toLowerCase(); if (!normalized) { @@ -68,8 +90,7 @@ export function classifyMemoryMultimodalPath( } const lower = filePath.trim().toLowerCase(); for (const modality of settings.modalities) { - const extensionSet = modality === "image" ? IMAGE_EXTENSIONS : AUDIO_EXTENSIONS; - for (const extension of extensionSet) { + for (const extension of getMemoryMultimodalExtensions(modality)) { if (lower.endsWith(extension)) { return modality; }