fix(cycles): split media understanding runtime contracts

This commit is contained in:
Vincent Koc
2026-04-11 16:15:05 +01:00
parent 0f77fdf4a0
commit 6e74d77a42
8 changed files with 115 additions and 69 deletions

View File

@@ -0,0 +1,4 @@
/**
 * A provider paired with an optional model name.
 *
 * NOTE(review): presumably identifies the model currently selected for media
 * understanding; confirm against the callers that pass `activeModel`.
 */
export type ActiveMediaModel = {
  /** Provider identifier; always required. */
  provider: string;
  /** Model name; may be left out. */
  model?: string;
};

View File

@@ -14,6 +14,7 @@ import {
normalizeLowercaseStringOrEmpty,
normalizeOptionalLowercaseString,
} from "../shared/string-coerce.js";
import type { ActiveMediaModel } from "./active-model.types.js";
import { resolveAttachmentKind } from "./attachments.js";
import { runWithConcurrency } from "./concurrency.js";
import { DEFAULT_ECHO_TRANSCRIPT_FORMAT, sendTranscriptEcho } from "./echo-transcript.js";
@@ -24,7 +25,6 @@ import {
} from "./format.js";
import { resolveConcurrency } from "./resolve.js";
import {
type ActiveMediaModel,
buildProviderRegistry,
createMediaAttachmentCache,
normalizeMediaAttachments,

View File

@@ -1,13 +1,10 @@
import type { MsgContext } from "../auto-reply/templating.js";
import type { OpenClawConfig } from "../config/types.js";
import { logVerbose, shouldLogVerbose } from "../globals.js";
import type { ActiveMediaModel } from "./active-model.types.js";
import { isAudioAttachment } from "./attachments.js";
import { runAudioTranscription } from "./audio-transcription-runner.js";
import {
type ActiveMediaModel,
normalizeMediaAttachments,
resolveMediaAttachmentLocalRoots,
} from "./runner.js";
import { normalizeMediaAttachments, resolveMediaAttachmentLocalRoots } from "./runner.js";
import type { MediaUnderstandingProvider } from "./types.js";
/**

View File

@@ -1,7 +1,7 @@
import type { MsgContext } from "../auto-reply/templating.js";
import type { OpenClawConfig } from "../config/types.js";
import type { ActiveMediaModel } from "./active-model.types.js";
import {
type ActiveMediaModel,
buildProviderRegistry,
createMediaAttachmentCache,
normalizeMediaAttachments,

View File

@@ -26,6 +26,7 @@ import { getDefaultMediaLocalRoots } from "../media/local-roots.js";
import { runExec } from "../process/exec.js";
import { normalizeLowercaseStringOrEmpty } from "../shared/string-coerce.js";
import { normalizeOptionalString } from "../shared/string-coerce.js";
import type { ActiveMediaModel } from "./active-model.types.js";
import { MediaAttachmentCache, selectAttachments } from "./attachments.js";
import { resolveAutoMediaKeyProviders, resolveDefaultMediaModel } from "./defaults.js";
import { isMediaUnderstandingSkipError } from "./errors.js";
@@ -52,11 +53,7 @@ import type {
MediaUnderstandingProvider,
} from "./types.js";
export { createMediaAttachmentCache, normalizeMediaAttachments } from "./runner.attachments.js";
export type ActiveMediaModel = {
provider: string;
model?: string;
};
export type { ActiveMediaModel } from "./active-model.types.js";
type ProviderRegistry = Map<string, MediaUnderstandingProvider>;

View File

@@ -0,0 +1,73 @@
import type { OpenClawConfig } from "../config/types.js";
import type { ActiveMediaModel } from "./active-model.types.js";
import type { MediaUnderstandingOutput, MediaUnderstandingProvider } from "./types.js";
/**
 * Arguments for running media understanding against a single file.
 *
 * This is the most general request shape: the per-capability param types in
 * this module carry the same fields minus `capability`.
 */
export type RunMediaUnderstandingFileParams = {
  /** Which kind of media understanding to perform. */
  capability: "image" | "audio" | "video";
  /** Path of the media file to process. */
  filePath: string;
  /** Configuration object handed through to the runner. */
  cfg: OpenClawConfig;
  /** Optional agent directory. NOTE(review): exact use is not visible from this module. */
  agentDir?: string;
  /** Optional MIME type for the file (presumably a hint when it cannot be inferred). */
  mime?: string;
  /** Optional provider/model pair supplied by the caller. */
  activeModel?: ActiveMediaModel;
};
/**
 * Outcome of a media understanding run on a single file.
 *
 * `text` is always present as a key but may hold `undefined`; the remaining
 * members are optional and may be absent entirely.
 */
export type RunMediaUnderstandingFileResult = {
  /** Text produced by the run, or `undefined` when there is none. */
  text: string | undefined;
  /** Provider reported for the result, when available. */
  provider?: string;
  /** Model reported for the result, when available. */
  model?: string;
  /** The underlying media understanding output, when available. */
  output?: MediaUnderstandingOutput;
};
/**
 * Arguments for describing an image file.
 *
 * Carries the same fields as `RunMediaUnderstandingFileParams` except
 * `capability`, which the image entry point fixes itself.
 */
export type DescribeImageFileParams = {
  /** Path of the image file to describe. */
  filePath: string;
  /** Configuration object handed through to the runner. */
  cfg: OpenClawConfig;
  /** Optional agent directory. NOTE(review): exact use is not visible from this module. */
  agentDir?: string;
  /** Optional MIME type for the file. */
  mime?: string;
  /** Optional provider/model pair supplied by the caller. */
  activeModel?: ActiveMediaModel;
};
/**
 * Arguments for describing an image file with an explicitly chosen provider
 * and model.
 *
 * Unlike `DescribeImageFileParams`, `provider`, `model` and `prompt` are all
 * mandatory and there is no `activeModel` member.
 */
export type DescribeImageFileWithModelParams = {
  /** Path of the image file to describe. */
  filePath: string;
  /** Configuration object; the runtime uses it to build the provider registry. */
  cfg: OpenClawConfig;
  /** Optional agent directory. NOTE(review): exact use is not visible from this module. */
  agentDir?: string;
  /** Optional MIME type for the file. */
  mime?: string;
  /** Provider identifier; the runtime normalizes it before the registry lookup. */
  provider: string;
  /** Model name to use with the provider. */
  model: string;
  /** Prompt text for the description request. */
  prompt: string;
  /** Optional token limit. NOTE(review): how providers apply it is not visible here. */
  maxTokens?: number;
  /** Optional timeout; the runtime substitutes 30_000 when this is null or undefined. */
  timeoutMs?: number;
};
/**
 * The awaited return value of a provider's `describeImage` function.
 *
 * `NonNullable` removes `null`/`undefined` from the member type before
 * `ReturnType` is applied (presumably because `describeImage` is an optional
 * member of `MediaUnderstandingProvider`; confirm in ./types.js).
 */
export type DescribeImageFileWithModelResult = Awaited<
  ReturnType<NonNullable<MediaUnderstandingProvider["describeImage"]>>
>;
/**
 * Arguments for describing a video file.
 *
 * Carries the same fields as `RunMediaUnderstandingFileParams` except
 * `capability`, which the video entry point fixes itself.
 */
export type DescribeVideoFileParams = {
  /** Path of the video file to describe. */
  filePath: string;
  /** Configuration object handed through to the runner. */
  cfg: OpenClawConfig;
  /** Optional agent directory. NOTE(review): exact use is not visible from this module. */
  agentDir?: string;
  /** Optional MIME type for the file. */
  mime?: string;
  /** Optional provider/model pair supplied by the caller. */
  activeModel?: ActiveMediaModel;
};
/**
 * Arguments for transcribing an audio file.
 *
 * Extends the common file-request fields with two optional transcription
 * hints, `language` and `prompt`.
 */
export type TranscribeAudioFileParams = {
  /** Path of the audio file to transcribe. */
  filePath: string;
  /** Configuration object handed through to the runner. */
  cfg: OpenClawConfig;
  /** Optional agent directory. NOTE(review): exact use is not visible from this module. */
  agentDir?: string;
  /** Optional MIME type for the file. */
  mime?: string;
  /** Optional provider/model pair supplied by the caller. */
  activeModel?: ActiveMediaModel;
  /**
   * Optional language hint.
   * NOTE(review): the runtime appears to derive an adjusted config when this
   * or `prompt` is truthy; confirm in ./runtime.js.
   */
  language?: string;
  /** Optional prompt hint for the transcription; see the note on `language`. */
  prompt?: string;
};
/**
 * Type-only contract describing the media understanding entry points.
 *
 * Each member is a function-typed property, so consumers can pick out a
 * single signature with an indexed access such as
 * `MediaUnderstandingRuntime["describeImageFile"]` without importing the
 * module that implements it.
 */
export type MediaUnderstandingRuntime = {
  /** Runs the capability named in `params.capability` against one file. */
  runMediaUnderstandingFile: (
    params: RunMediaUnderstandingFileParams,
  ) => Promise<RunMediaUnderstandingFileResult>;

  /** Describes an image file; resolves to the general run result shape. */
  describeImageFile: (
    params: DescribeImageFileParams,
  ) => Promise<RunMediaUnderstandingFileResult>;

  /** Describes an image file using an explicitly chosen provider and model. */
  describeImageFileWithModel: (
    params: DescribeImageFileWithModelParams,
  ) => Promise<DescribeImageFileWithModelResult>;

  /** Describes a video file; resolves to the general run result shape. */
  describeVideoFile: (
    params: DescribeVideoFileParams,
  ) => Promise<RunMediaUnderstandingFileResult>;

  /** Transcribes an audio file; only the `text` member is exposed. */
  transcribeAudioFile: (
    params: TranscribeAudioFileParams,
  ) => Promise<{ text: string | undefined }>;
};

View File

@@ -1,14 +1,28 @@
import fs from "node:fs/promises";
import path from "node:path";
import type { OpenClawConfig } from "../config/types.js";
import { normalizeMediaProviderId } from "./provider-registry.js";
import {
buildProviderRegistry,
createMediaAttachmentCache,
normalizeMediaAttachments,
runCapability,
type ActiveMediaModel,
} from "./runner.js";
import type {
DescribeImageFileParams,
DescribeImageFileWithModelParams,
DescribeVideoFileParams,
RunMediaUnderstandingFileParams,
RunMediaUnderstandingFileResult,
TranscribeAudioFileParams,
} from "./runtime-types.js";
export type {
DescribeImageFileParams,
DescribeImageFileWithModelParams,
DescribeVideoFileParams,
RunMediaUnderstandingFileParams,
RunMediaUnderstandingFileResult,
TranscribeAudioFileParams,
} from "./runtime-types.js";
type MediaUnderstandingCapability = "image" | "audio" | "video";
type MediaUnderstandingOutput = Awaited<ReturnType<typeof runCapability>>["outputs"][number];
@@ -19,22 +33,6 @@ const KIND_BY_CAPABILITY: Record<MediaUnderstandingCapability, MediaUnderstandin
video: "video.description",
};
export type RunMediaUnderstandingFileParams = {
capability: MediaUnderstandingCapability;
filePath: string;
cfg: OpenClawConfig;
agentDir?: string;
mime?: string;
activeModel?: ActiveMediaModel;
};
export type RunMediaUnderstandingFileResult = {
text: string | undefined;
provider?: string;
model?: string;
output?: MediaUnderstandingOutput;
};
function buildFileContext(params: { filePath: string; mime?: string }) {
return {
MediaPath: params.filePath,
@@ -92,27 +90,13 @@ export async function runMediaUnderstandingFile(
}
}
export async function describeImageFile(params: {
filePath: string;
cfg: OpenClawConfig;
agentDir?: string;
mime?: string;
activeModel?: ActiveMediaModel;
}): Promise<RunMediaUnderstandingFileResult> {
export async function describeImageFile(
params: DescribeImageFileParams,
): Promise<RunMediaUnderstandingFileResult> {
return await runMediaUnderstandingFile({ ...params, capability: "image" });
}
export async function describeImageFileWithModel(params: {
filePath: string;
cfg: OpenClawConfig;
agentDir?: string;
mime?: string;
provider: string;
model: string;
prompt: string;
maxTokens?: number;
timeoutMs?: number;
}) {
export async function describeImageFileWithModel(params: DescribeImageFileWithModelParams) {
const timeoutMs = params.timeoutMs ?? 30_000;
const providerRegistry = buildProviderRegistry(undefined, params.cfg);
const provider = providerRegistry.get(normalizeMediaProviderId(params.provider));
@@ -134,25 +118,15 @@ export async function describeImageFileWithModel(params: {
});
}
export async function describeVideoFile(params: {
filePath: string;
cfg: OpenClawConfig;
agentDir?: string;
mime?: string;
activeModel?: ActiveMediaModel;
}): Promise<RunMediaUnderstandingFileResult> {
export async function describeVideoFile(
params: DescribeVideoFileParams,
): Promise<RunMediaUnderstandingFileResult> {
return await runMediaUnderstandingFile({ ...params, capability: "video" });
}
export async function transcribeAudioFile(params: {
filePath: string;
cfg: OpenClawConfig;
agentDir?: string;
mime?: string;
activeModel?: ActiveMediaModel;
language?: string;
prompt?: string;
}): Promise<{ text: string | undefined }> {
export async function transcribeAudioFile(
params: TranscribeAudioFileParams,
): Promise<{ text: string | undefined }> {
const cfg =
params.language || params.prompt
? {

View File

@@ -4,6 +4,7 @@ import type {
} from "../../agents/pi-embedded-runtime.types.js";
import type { HeartbeatRunResult } from "../../infra/heartbeat-wake.js";
import type { LogLevel } from "../../logging/levels.js";
import type { MediaUnderstandingRuntime } from "../../media-understanding/runtime-types.js";
export type { HeartbeatRunResult };
@@ -91,11 +92,11 @@ export type PluginRuntimeCore = {
listVoices: typeof import("../../tts/tts.js").listSpeechVoices;
};
mediaUnderstanding: {
runFile: typeof import("../../media-understanding/runtime.js").runMediaUnderstandingFile;
describeImageFile: typeof import("../../media-understanding/runtime.js").describeImageFile;
describeImageFileWithModel: typeof import("../../media-understanding/runtime.js").describeImageFileWithModel;
describeVideoFile: typeof import("../../media-understanding/runtime.js").describeVideoFile;
transcribeAudioFile: typeof import("../../media-understanding/runtime.js").transcribeAudioFile;
runFile: MediaUnderstandingRuntime["runMediaUnderstandingFile"];
describeImageFile: MediaUnderstandingRuntime["describeImageFile"];
describeImageFileWithModel: MediaUnderstandingRuntime["describeImageFileWithModel"];
describeVideoFile: MediaUnderstandingRuntime["describeVideoFile"];
transcribeAudioFile: MediaUnderstandingRuntime["transcribeAudioFile"];
};
imageGeneration: {
generate: (