mirror of
https://github.com/openclaw/openclaw.git
synced 2026-03-12 07:20:45 +00:00
memory: simplify multimodal embedding plumbing
This commit is contained in:
@@ -2,16 +2,12 @@ import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
import * as authModule from "../agents/model-auth.js";
|
||||
import {
|
||||
buildGeminiEmbeddingRequest,
|
||||
buildFileDataPart,
|
||||
buildGeminiParts,
|
||||
buildGeminiTextEmbeddingRequest,
|
||||
buildInlineDataPart,
|
||||
createGeminiEmbeddingProvider,
|
||||
DEFAULT_GEMINI_EMBEDDING_MODEL,
|
||||
GEMINI_EMBEDDING_2_MODELS,
|
||||
isGeminiEmbedding2Model,
|
||||
resolveGeminiOutputDimensionality,
|
||||
type GeminiPart,
|
||||
} from "./embeddings-gemini.js";
|
||||
|
||||
vi.mock("../agents/model-auth.js", async () => {
|
||||
@@ -62,40 +58,6 @@ function mockResolvedProviderKey(apiKey = "test-key") {
|
||||
});
|
||||
}
|
||||
|
||||
// ---------- Helper function tests ----------
|
||||
|
||||
// Covers both input shapes accepted by buildGeminiParts: a bare string and a
// pre-built parts array.
describe("buildGeminiParts", () => {
  it("wraps a string into a single text part", () => {
    expect(buildGeminiParts("hello")).toEqual([{ text: "hello" }]);
  });

  it("passes through an existing parts array", () => {
    const parts: GeminiPart[] = [
      { text: "hello" },
      { inlineData: { mimeType: "image/png", data: "base64data" } },
    ];
    // toBe (identity), not toEqual: the same array instance must come back.
    expect(buildGeminiParts(parts)).toBe(parts);
  });
});
|
||||
|
||||
// Pins the object shape produced by buildInlineDataPart.
describe("buildInlineDataPart", () => {
  it("produces the correct shape", () => {
    const part = buildInlineDataPart("image/jpeg", "abc123");
    expect(part).toEqual({
      inlineData: { mimeType: "image/jpeg", data: "abc123" },
    });
  });
});
|
||||
|
||||
// Pins the object shape produced by buildFileDataPart.
describe("buildFileDataPart", () => {
  it("produces the correct shape", () => {
    const part = buildFileDataPart("application/pdf", "gs://bucket/file.pdf");
    expect(part).toEqual({
      fileData: { mimeType: "application/pdf", fileUri: "gs://bucket/file.pdf" },
    });
  });
});
|
||||
|
||||
describe("buildGeminiTextEmbeddingRequest", () => {
|
||||
it("builds a text embedding request with optional model and dimensions", () => {
|
||||
expect(
|
||||
|
||||
@@ -51,10 +51,7 @@ export type GeminiTextPart = { text: string };
|
||||
/** A request part that carries its payload inline, tagged with a MIME type. */
export type GeminiInlinePart = {
  inlineData: { mimeType: string; data: string };
};
|
||||
/** A request part that points at a file by URI, tagged with a MIME type. */
export type GeminiFilePart = {
  fileData: { mimeType: string; fileUri: string };
};
|
||||
export type GeminiPart = GeminiTextPart | GeminiInlinePart | GeminiFilePart;
|
||||
export type GeminiPart = GeminiTextPart | GeminiInlinePart;
|
||||
export type GeminiEmbeddingRequest = {
|
||||
content: { parts: GeminiPart[] };
|
||||
taskType: GeminiTaskType;
|
||||
@@ -63,24 +60,6 @@ export type GeminiEmbeddingRequest = {
|
||||
};
|
||||
export type GeminiTextEmbeddingRequest = GeminiEmbeddingRequest;
|
||||
|
||||
/**
 * Convert a string or pre-built parts array into `GeminiPart[]`.
 *
 * @param input - Either raw text or an already-assembled list of parts.
 * @returns A one-element array holding a text part when `input` is a string;
 *   otherwise `input` itself (the same array instance, not a copy).
 */
export function buildGeminiParts(input: string | GeminiPart[]): GeminiPart[] {
  if (typeof input === "string") {
    return [{ text: input }];
  }
  // Already a parts array: hand it back untouched so callers keep identity.
  return input;
}
|
||||
|
||||
/**
 * Convenience: build an inline-data part for multimodal embeddings.
 *
 * @param mimeType - MIME type recorded on the part.
 * @param base64Data - Payload stored verbatim in the part's `data` field; this
 *   function performs no encoding or validation.
 * @returns A part of the form `{ inlineData: { mimeType, data } }`.
 */
export function buildInlineDataPart(mimeType: string, base64Data: string): GeminiInlinePart {
  return { inlineData: { mimeType, data: base64Data } };
}
|
||||
|
||||
/**
 * Convenience: build a file-data part for multimodal embeddings.
 *
 * @param mimeType - MIME type recorded on the part.
 * @param fileUri - URI stored verbatim in the part's `fileUri` field; it is not
 *   parsed or validated here.
 * @returns A part of the form `{ fileData: { mimeType, fileUri } }`.
 */
export function buildFileDataPart(mimeType: string, fileUri: string): GeminiFilePart {
  return { fileData: { mimeType, fileUri } };
}
|
||||
|
||||
/** Builds the text-only Gemini embedding request shape used across direct and batch APIs. */
|
||||
export function buildGeminiTextEmbeddingRequest(params: {
|
||||
text: string;
|
||||
|
||||
@@ -3,10 +3,10 @@ import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { afterEach, beforeEach, describe, expect, it } from "vitest";
|
||||
import {
|
||||
buildMultimodalChunkForIndexing,
|
||||
buildFileEntry,
|
||||
chunkMarkdown,
|
||||
listMemoryFiles,
|
||||
loadMultimodalEmbeddingInput,
|
||||
normalizeExtraMemoryPaths,
|
||||
remapChunkLines,
|
||||
} from "./internal.js";
|
||||
@@ -201,18 +201,19 @@ describe("buildFileEntry", () => {
|
||||
expect(entry?.embeddingInput).toBeUndefined();
|
||||
});
|
||||
|
||||
it("loads multimodal embedding input lazily", async () => {
|
||||
it("builds a multimodal chunk lazily for indexing", async () => {
|
||||
const tmpDir = getTmpDir();
|
||||
const target = path.join(tmpDir, "diagram.png");
|
||||
await fs.writeFile(target, Buffer.from("png"));
|
||||
|
||||
const entry = await buildFileEntry(target, tmpDir, multimodal);
|
||||
const input = await loadMultimodalEmbeddingInput(entry!);
|
||||
const built = await buildMultimodalChunkForIndexing(entry!);
|
||||
|
||||
expect(input?.parts).toEqual([
|
||||
expect(built?.chunk.embeddingInput?.parts).toEqual([
|
||||
{ type: "text", text: "Image file: diagram.png" },
|
||||
expect.objectContaining({ type: "inline-data", mimeType: "image/png" }),
|
||||
]);
|
||||
expect(built?.structuredInputBytes).toBeGreaterThan(0);
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
@@ -4,9 +4,11 @@ import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { detectMime } from "../media/mime.js";
|
||||
import { runTasksWithConcurrency } from "../utils/run-with-concurrency.js";
|
||||
import { estimateStructuredEmbeddingInputBytes } from "./embedding-input-limits.js";
|
||||
import { buildTextEmbeddingInput, type EmbeddingInput } from "./embedding-inputs.js";
|
||||
import { isFileMissingError } from "./fs-utils.js";
|
||||
import {
|
||||
buildMemoryMultimodalLabel,
|
||||
classifyMemoryMultimodalPath,
|
||||
type MemoryMultimodalModality,
|
||||
type MemoryMultimodalSettings,
|
||||
@@ -32,6 +34,11 @@ export type MemoryChunk = {
|
||||
embeddingInput?: EmbeddingInput;
|
||||
};
|
||||
|
||||
/**
 * Result of preparing a multimodal file for indexing: the chunk to embed plus
 * the byte estimate computed for its structured embedding input.
 */
export type MultimodalMemoryChunk = {
  chunk: MemoryChunk;
  structuredInputBytes: number;
};
|
||||
|
||||
const DISABLED_MULTIMODAL_SETTINGS: MemoryMultimodalSettings = {
|
||||
enabled: false,
|
||||
modalities: [],
|
||||
@@ -211,7 +218,7 @@ export async function buildFileEntry(
|
||||
if (!mimeType || !mimeType.startsWith(`${modality}/`)) {
|
||||
return null;
|
||||
}
|
||||
const contentText = `${modality === "image" ? "Image" : "Audio"} file: ${normalizedPath}`;
|
||||
const contentText = buildMemoryMultimodalLabel(modality, normalizedPath);
|
||||
const dataHash = crypto.createHash("sha256").update(buffer).digest("hex");
|
||||
const chunkHash = hashText(
|
||||
JSON.stringify({
|
||||
@@ -253,7 +260,7 @@ export async function buildFileEntry(
|
||||
};
|
||||
}
|
||||
|
||||
export async function loadMultimodalEmbeddingInput(
|
||||
async function loadMultimodalEmbeddingInput(
|
||||
entry: Pick<MemoryFileEntry, "absPath" | "contentText" | "mimeType" | "kind">,
|
||||
): Promise<EmbeddingInput | null> {
|
||||
if (entry.kind !== "multimodal" || !entry.contentText || !entry.mimeType) {
|
||||
@@ -281,6 +288,25 @@ export async function loadMultimodalEmbeddingInput(
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Build the single chunk used to index a multimodal file entry.
 *
 * Loads the entry's embedding input via `loadMultimodalEmbeddingInput` and
 * wraps it in a one-line chunk, together with the size estimate returned by
 * `estimateStructuredEmbeddingInputBytes`.
 *
 * @param entry - The file entry fields needed to load the input and label the
 *   chunk (`absPath`, `contentText`, `mimeType`, `kind`, `hash`).
 * @returns The chunk and its structured-input byte estimate, or `null` when
 *   no embedding input could be loaded for the entry.
 */
export async function buildMultimodalChunkForIndexing(
  entry: Pick<MemoryFileEntry, "absPath" | "contentText" | "mimeType" | "kind" | "hash">,
): Promise<MultimodalMemoryChunk | null> {
  const embeddingInput = await loadMultimodalEmbeddingInput(entry);
  if (!embeddingInput) {
    return null;
  }
  return {
    chunk: {
      // The whole file is represented as a single chunk pinned to line 1.
      startLine: 1,
      endLine: 1,
      // Prefer the entry's own label text; fall back to the loaded input's text.
      text: entry.contentText ?? embeddingInput.text,
      hash: entry.hash,
      embeddingInput,
    },
    structuredInputBytes: estimateStructuredEmbeddingInputBytes(embeddingInput),
  };
}
|
||||
|
||||
export function chunkMarkdown(
|
||||
content: string,
|
||||
chunking: { tokens: number; overlap: number },
|
||||
|
||||
@@ -15,9 +15,9 @@ import {
|
||||
import { type EmbeddingInput, hasNonTextEmbeddingParts } from "./embedding-inputs.js";
|
||||
import { buildGeminiEmbeddingRequest } from "./embeddings-gemini.js";
|
||||
import {
|
||||
buildMultimodalChunkForIndexing,
|
||||
chunkMarkdown,
|
||||
hashText,
|
||||
loadMultimodalEmbeddingInput,
|
||||
parseEmbedding,
|
||||
remapChunkLines,
|
||||
type MemoryChunk,
|
||||
@@ -813,21 +813,13 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps {
|
||||
let chunks: MemoryChunk[];
|
||||
let structuredInputBytes: number | undefined;
|
||||
if ("kind" in entry && entry.kind === "multimodal") {
|
||||
const embeddingInput = await loadMultimodalEmbeddingInput(entry);
|
||||
if (!embeddingInput) {
|
||||
const multimodalChunk = await buildMultimodalChunkForIndexing(entry);
|
||||
if (!multimodalChunk) {
|
||||
this.clearIndexedFileData(entry.path, options.source);
|
||||
return;
|
||||
}
|
||||
structuredInputBytes = estimateStructuredEmbeddingInputBytes(embeddingInput);
|
||||
chunks = [
|
||||
{
|
||||
startLine: 1,
|
||||
endLine: 1,
|
||||
text: entry.contentText ?? embeddingInput.text,
|
||||
hash: entry.hash,
|
||||
embeddingInput,
|
||||
},
|
||||
];
|
||||
structuredInputBytes = multimodalChunk.structuredInputBytes;
|
||||
chunks = [multimodalChunk.chunk];
|
||||
} else {
|
||||
const content = options.content ?? (await fs.readFile(entry.absPath, "utf-8"));
|
||||
chunks = enforceEmbeddingMaxInputTokens(
|
||||
|
||||
@@ -36,7 +36,11 @@ import {
|
||||
} from "./internal.js";
|
||||
import { type MemoryFileEntry } from "./internal.js";
|
||||
import { ensureMemoryIndexSchema } from "./memory-schema.js";
|
||||
import { buildCaseInsensitiveExtensionGlob, classifyMemoryMultimodalPath } from "./multimodal.js";
|
||||
import {
|
||||
buildCaseInsensitiveExtensionGlob,
|
||||
classifyMemoryMultimodalPath,
|
||||
getMemoryMultimodalExtensions,
|
||||
} from "./multimodal.js";
|
||||
import type { SessionFileEntry } from "./session-files.js";
|
||||
import {
|
||||
buildSessionEntry,
|
||||
@@ -388,11 +392,7 @@ export abstract class MemoryManagerSyncOps {
|
||||
watchPaths.add(path.join(entry, "**", "*.md"));
|
||||
if (this.settings.multimodal.enabled) {
|
||||
for (const modality of this.settings.multimodal.modalities) {
|
||||
const extensions =
|
||||
modality === "image"
|
||||
? [".jpg", ".jpeg", ".png", ".webp", ".gif", ".heic", ".heif"]
|
||||
: [".mp3", ".wav", ".ogg", ".opus", ".m4a", ".aac", ".flac"];
|
||||
for (const extension of extensions) {
|
||||
for (const extension of getMemoryMultimodalExtensions(modality)) {
|
||||
watchPaths.add(
|
||||
path.join(entry, "**", buildCaseInsensitiveExtensionGlob(extension)),
|
||||
);
|
||||
|
||||
@@ -1,5 +1,18 @@
|
||||
export const MEMORY_MULTIMODAL_MODALITIES = ["image", "audio"] as const;
|
||||
export type MemoryMultimodalModality = (typeof MEMORY_MULTIMODAL_MODALITIES)[number];
|
||||
/**
 * Per-modality table: the label prefix and the file extensions (lowercase,
 * with leading dot) associated with each supported modality.
 */
const MEMORY_MULTIMODAL_SPECS = {
  image: {
    labelPrefix: "Image file",
    extensions: [".jpg", ".jpeg", ".png", ".webp", ".gif", ".heic", ".heif"],
  },
  audio: {
    labelPrefix: "Audio file",
    extensions: [".mp3", ".wav", ".ogg", ".opus", ".m4a", ".aac", ".flac"],
  },
} as const;
|
||||
|
||||
export type MemoryMultimodalModality = keyof typeof MEMORY_MULTIMODAL_SPECS;
|
||||
export const MEMORY_MULTIMODAL_MODALITIES = Object.keys(
|
||||
MEMORY_MULTIMODAL_SPECS,
|
||||
) as MemoryMultimodalModality[];
|
||||
export type MemoryMultimodalSelection = MemoryMultimodalModality | "all";
|
||||
|
||||
export type MemoryMultimodalSettings = {
|
||||
@@ -10,10 +23,6 @@ export type MemoryMultimodalSettings = {
|
||||
|
||||
export const DEFAULT_MEMORY_MULTIMODAL_MAX_FILE_BYTES = 10 * 1024 * 1024;
|
||||
|
||||
const IMAGE_EXTENSIONS = new Set([".jpg", ".jpeg", ".png", ".webp", ".gif", ".heic", ".heif"]);
|
||||
|
||||
const AUDIO_EXTENSIONS = new Set([".mp3", ".wav", ".ogg", ".opus", ".m4a", ".aac", ".flac"]);
|
||||
|
||||
export function normalizeMemoryMultimodalModalities(
|
||||
raw: MemoryMultimodalSelection[] | undefined,
|
||||
): MemoryMultimodalModality[] {
|
||||
@@ -50,6 +59,19 @@ export function isMemoryMultimodalEnabled(settings: MemoryMultimodalSettings): b
|
||||
return settings.enabled && settings.modalities.length > 0;
|
||||
}
|
||||
|
||||
/**
 * Look up the file extensions registered for a modality.
 *
 * @param modality - The modality whose extensions are wanted.
 * @returns The `extensions` list stored for that modality in
 *   `MEMORY_MULTIMODAL_SPECS` (returned directly, typed read-only).
 */
export function getMemoryMultimodalExtensions(
  modality: MemoryMultimodalModality,
): readonly string[] {
  return MEMORY_MULTIMODAL_SPECS[modality].extensions;
}
|
||||
|
||||
/**
 * Build the text label for a multimodal file.
 *
 * @param modality - Selects which `labelPrefix` from `MEMORY_MULTIMODAL_SPECS`
 *   to use.
 * @param normalizedPath - Path text appended after the prefix, unchanged.
 * @returns A string of the form `"<labelPrefix>: <normalizedPath>"`.
 */
export function buildMemoryMultimodalLabel(
  modality: MemoryMultimodalModality,
  normalizedPath: string,
): string {
  return `${MEMORY_MULTIMODAL_SPECS[modality].labelPrefix}: ${normalizedPath}`;
}
|
||||
|
||||
export function buildCaseInsensitiveExtensionGlob(extension: string): string {
|
||||
const normalized = extension.trim().replace(/^\./, "").toLowerCase();
|
||||
if (!normalized) {
|
||||
@@ -68,8 +90,7 @@ export function classifyMemoryMultimodalPath(
|
||||
}
|
||||
const lower = filePath.trim().toLowerCase();
|
||||
for (const modality of settings.modalities) {
|
||||
const extensionSet = modality === "image" ? IMAGE_EXTENSIONS : AUDIO_EXTENSIONS;
|
||||
for (const extension of extensionSet) {
|
||||
for (const extension of getMemoryMultimodalExtensions(modality)) {
|
||||
if (lower.endsWith(extension)) {
|
||||
return modality;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user