fix(cycles): split media understanding runtime contracts

2026-04-17 20:21:13 +00:00 · 2026-04-11 16:15:05 +01:00
parent 0f77fdf4a0
commit 6e74d77a42
8 changed files with 115 additions and 69 deletions
--- a/src/media-understanding/active-model.types.ts
+++ b/src/media-understanding/active-model.types.ts
@@ -0,0 +1,4 @@
+export type ActiveMediaModel = { // Identifies a media model by provider and, optionally, model name.
+  provider: string; // Required provider identifier.
+  model?: string; // Optional model name; NOTE(review): fallback when omitted is not visible here — confirm.
+};
--- a/src/media-understanding/apply.ts
+++ b/src/media-understanding/apply.ts
@@ -14,6 +14,7 @@ import {
  normalizeLowercaseStringOrEmpty,
  normalizeOptionalLowercaseString,
 } from "../shared/string-coerce.js";
+import type { ActiveMediaModel } from "./active-model.types.js";
 import { resolveAttachmentKind } from "./attachments.js";
 import { runWithConcurrency } from "./concurrency.js";
 import { DEFAULT_ECHO_TRANSCRIPT_FORMAT, sendTranscriptEcho } from "./echo-transcript.js";
@@ -24,7 +25,6 @@ import {
 } from "./format.js";
 import { resolveConcurrency } from "./resolve.js";
 import {
-  type ActiveMediaModel,
  buildProviderRegistry,
  createMediaAttachmentCache,
  normalizeMediaAttachments,
--- a/src/media-understanding/audio-preflight.ts
+++ b/src/media-understanding/audio-preflight.ts
@@ -1,13 +1,10 @@
 import type { MsgContext } from "../auto-reply/templating.js";
 import type { OpenClawConfig } from "../config/types.js";
 import { logVerbose, shouldLogVerbose } from "../globals.js";
+import type { ActiveMediaModel } from "./active-model.types.js";
 import { isAudioAttachment } from "./attachments.js";
 import { runAudioTranscription } from "./audio-transcription-runner.js";
-import {
-  type ActiveMediaModel,
-  normalizeMediaAttachments,
-  resolveMediaAttachmentLocalRoots,
-} from "./runner.js";
+import { normalizeMediaAttachments, resolveMediaAttachmentLocalRoots } from "./runner.js";
 import type { MediaUnderstandingProvider } from "./types.js";

 /**
--- a/src/media-understanding/audio-transcription-runner.ts
+++ b/src/media-understanding/audio-transcription-runner.ts
@@ -1,7 +1,7 @@
 import type { MsgContext } from "../auto-reply/templating.js";
 import type { OpenClawConfig } from "../config/types.js";
+import type { ActiveMediaModel } from "./active-model.types.js";
 import {
-  type ActiveMediaModel,
  buildProviderRegistry,
  createMediaAttachmentCache,
  normalizeMediaAttachments,
--- a/src/media-understanding/runner.ts
+++ b/src/media-understanding/runner.ts
@@ -26,6 +26,7 @@ import { getDefaultMediaLocalRoots } from "../media/local-roots.js";
 import { runExec } from "../process/exec.js";
 import { normalizeLowercaseStringOrEmpty } from "../shared/string-coerce.js";
 import { normalizeOptionalString } from "../shared/string-coerce.js";
+import type { ActiveMediaModel } from "./active-model.types.js";
 import { MediaAttachmentCache, selectAttachments } from "./attachments.js";
 import { resolveAutoMediaKeyProviders, resolveDefaultMediaModel } from "./defaults.js";
 import { isMediaUnderstandingSkipError } from "./errors.js";
@@ -52,11 +53,7 @@ import type {
  MediaUnderstandingProvider,
 } from "./types.js";
 export { createMediaAttachmentCache, normalizeMediaAttachments } from "./runner.attachments.js";
-
-export type ActiveMediaModel = {
-  provider: string;
-  model?: string;
-};
+export type { ActiveMediaModel } from "./active-model.types.js";

 type ProviderRegistry = Map<string, MediaUnderstandingProvider>;

--- a/src/media-understanding/runtime-types.ts
+++ b/src/media-understanding/runtime-types.ts
@@ -0,0 +1,73 @@
+import type { OpenClawConfig } from "../config/types.js";
+import type { ActiveMediaModel } from "./active-model.types.js";
+import type { MediaUnderstandingOutput, MediaUnderstandingProvider } from "./types.js";
+
+export type RunMediaUnderstandingFileParams = { // Parameter object accepted by MediaUnderstandingRuntime.runMediaUnderstandingFile.
+  capability: "image" | "audio" | "video"; // Which of the three media capabilities to run.
+  filePath: string; // Path of the media file to process.
+  cfg: OpenClawConfig; // Configuration object passed through to the runtime.
+  agentDir?: string; // Optional; NOTE(review): how this directory is used is not visible here — confirm.
+  mime?: string; // Optional MIME type accompanying filePath.
+  activeModel?: ActiveMediaModel; // Optional provider/model pair; presumably a caller-preferred model — confirm.
+};
+
+export type RunMediaUnderstandingFileResult = { // Resolved result of the run/describe-image/describe-video runtime functions.
+  text: string | undefined; // Required key whose value may be undefined.
+  provider?: string; // Optional; presumably the provider that produced the output — confirm.
+  model?: string; // Optional; presumably the model that produced the output — confirm.
+  output?: MediaUnderstandingOutput; // Optional full output record as typed in ./types.js.
+};
+
+export type DescribeImageFileParams = { // Same fields as RunMediaUnderstandingFileParams minus capability; describeImageFile supplies capability "image".
+  filePath: string; // Path of the image file to describe.
+  cfg: OpenClawConfig; // Configuration object passed through to the runtime.
+  agentDir?: string; // Optional; NOTE(review): usage not visible here — confirm.
+  mime?: string; // Optional MIME type accompanying filePath.
+  activeModel?: ActiveMediaModel; // Optional provider/model pair.
+};
+
+export type DescribeImageFileWithModelParams = { // Parameter object for describeImageFileWithModel, which names the provider and model explicitly.
+  filePath: string; // Path of the image file to describe.
+  cfg: OpenClawConfig; // Configuration object; also used to build the provider registry.
+  agentDir?: string; // Optional; NOTE(review): usage not visible here — confirm.
+  mime?: string; // Optional MIME type accompanying filePath.
+  provider: string; // Required provider id; normalized via normalizeMediaProviderId before the registry lookup.
+  model: string; // Required model name (unlike ActiveMediaModel.model, which is optional).
+  prompt: string; // Required prompt text.
+  maxTokens?: number; // Optional; NOTE(review): default when omitted is not visible here — confirm.
+  timeoutMs?: number; // Optional; describeImageFileWithModel falls back to 30_000 when this is null/undefined.
+};
+
+export type DescribeImageFileWithModelResult = Awaited< // Resolved (awaited) return value of a provider's describeImage method.
+  ReturnType<NonNullable<MediaUnderstandingProvider["describeImage"]>> // NonNullable strips null/undefined from the member type before ReturnType is applied.
+>;
+
+export type DescribeVideoFileParams = { // Same fields as RunMediaUnderstandingFileParams minus capability; describeVideoFile supplies capability "video".
+  filePath: string; // Path of the video file to describe.
+  cfg: OpenClawConfig; // Configuration object passed through to the runtime.
+  agentDir?: string; // Optional; NOTE(review): usage not visible here — confirm.
+  mime?: string; // Optional MIME type accompanying filePath.
+  activeModel?: ActiveMediaModel; // Optional provider/model pair.
+};
+
+export type TranscribeAudioFileParams = { // Parameter object for transcribeAudioFile.
+  filePath: string; // Path of the audio file to transcribe.
+  cfg: OpenClawConfig; // Base configuration; transcribeAudioFile derives its effective cfg from this.
+  agentDir?: string; // Optional; NOTE(review): usage not visible here — confirm.
+  mime?: string; // Optional MIME type accompanying filePath.
+  activeModel?: ActiveMediaModel; // Optional provider/model pair.
+  language?: string; // Optional; when truthy, transcribeAudioFile takes the cfg-override branch (details not visible here).
+  prompt?: string; // Optional; when truthy, transcribeAudioFile takes the cfg-override branch (details not visible here).
+};
+
+export type MediaUnderstandingRuntime = { // Function-signature contract; PluginRuntimeCore.mediaUnderstanding picks its member types from here by indexed access.
+  runMediaUnderstandingFile: ( // Exposed under the shorter key runFile in PluginRuntimeCore.mediaUnderstanding.
+    params: RunMediaUnderstandingFileParams,
+  ) => Promise<RunMediaUnderstandingFileResult>;
+  describeImageFile: (params: DescribeImageFileParams) => Promise<RunMediaUnderstandingFileResult>; // Image description with the shared result shape.
+  describeImageFileWithModel: ( // Variant taking an explicit provider/model/prompt; resolves to the provider's own describeImage result.
+    params: DescribeImageFileWithModelParams,
+  ) => Promise<DescribeImageFileWithModelResult>;
+  describeVideoFile: (params: DescribeVideoFileParams) => Promise<RunMediaUnderstandingFileResult>; // Video description with the shared result shape.
+  transcribeAudioFile: (params: TranscribeAudioFileParams) => Promise<{ text: string | undefined }>; // Resolves to text only; no provider/model/output fields.
+};
--- a/src/media-understanding/runtime.ts
+++ b/src/media-understanding/runtime.ts
@@ -1,14 +1,28 @@
 import fs from "node:fs/promises";
 import path from "node:path";
-import type { OpenClawConfig } from "../config/types.js";
 import { normalizeMediaProviderId } from "./provider-registry.js";
 import {
  buildProviderRegistry,
  createMediaAttachmentCache,
  normalizeMediaAttachments,
  runCapability,
-  type ActiveMediaModel,
 } from "./runner.js";
+import type {
+  DescribeImageFileParams,
+  DescribeImageFileWithModelParams,
+  DescribeVideoFileParams,
+  RunMediaUnderstandingFileParams,
+  RunMediaUnderstandingFileResult,
+  TranscribeAudioFileParams,
+} from "./runtime-types.js";
+export type {
+  DescribeImageFileParams,
+  DescribeImageFileWithModelParams,
+  DescribeVideoFileParams,
+  RunMediaUnderstandingFileParams,
+  RunMediaUnderstandingFileResult,
+  TranscribeAudioFileParams,
+} from "./runtime-types.js";

 type MediaUnderstandingCapability = "image" | "audio" | "video";
 type MediaUnderstandingOutput = Awaited<ReturnType<typeof runCapability>>["outputs"][number];
@@ -19,22 +33,6 @@ const KIND_BY_CAPABILITY: Record<MediaUnderstandingCapability, MediaUnderstandin
  video: "video.description",
 };

-export type RunMediaUnderstandingFileParams = {
-  capability: MediaUnderstandingCapability;
-  filePath: string;
-  cfg: OpenClawConfig;
-  agentDir?: string;
-  mime?: string;
-  activeModel?: ActiveMediaModel;
-};
-
-export type RunMediaUnderstandingFileResult = {
-  text: string | undefined;
-  provider?: string;
-  model?: string;
-  output?: MediaUnderstandingOutput;
-};
-
 function buildFileContext(params: { filePath: string; mime?: string }) {
  return {
    MediaPath: params.filePath,
@@ -92,27 +90,13 @@ export async function runMediaUnderstandingFile(
  }
 }

-export async function describeImageFile(params: {
-  filePath: string;
-  cfg: OpenClawConfig;
-  agentDir?: string;
-  mime?: string;
-  activeModel?: ActiveMediaModel;
-}): Promise<RunMediaUnderstandingFileResult> {
+export async function describeImageFile(
+  params: DescribeImageFileParams,
+): Promise<RunMediaUnderstandingFileResult> {
  return await runMediaUnderstandingFile({ ...params, capability: "image" });
 }

-export async function describeImageFileWithModel(params: {
-  filePath: string;
-  cfg: OpenClawConfig;
-  agentDir?: string;
-  mime?: string;
-  provider: string;
-  model: string;
-  prompt: string;
-  maxTokens?: number;
-  timeoutMs?: number;
-}) {
+export async function describeImageFileWithModel(params: DescribeImageFileWithModelParams) {
  const timeoutMs = params.timeoutMs ?? 30_000;
  const providerRegistry = buildProviderRegistry(undefined, params.cfg);
  const provider = providerRegistry.get(normalizeMediaProviderId(params.provider));
@@ -134,25 +118,15 @@ export async function describeImageFileWithModel(params: {
  });
 }

-export async function describeVideoFile(params: {
-  filePath: string;
-  cfg: OpenClawConfig;
-  agentDir?: string;
-  mime?: string;
-  activeModel?: ActiveMediaModel;
-}): Promise<RunMediaUnderstandingFileResult> {
+export async function describeVideoFile(
+  params: DescribeVideoFileParams,
+): Promise<RunMediaUnderstandingFileResult> {
  return await runMediaUnderstandingFile({ ...params, capability: "video" });
 }

-export async function transcribeAudioFile(params: {
-  filePath: string;
-  cfg: OpenClawConfig;
-  agentDir?: string;
-  mime?: string;
-  activeModel?: ActiveMediaModel;
-  language?: string;
-  prompt?: string;
-}): Promise<{ text: string | undefined }> {
+export async function transcribeAudioFile(
+  params: TranscribeAudioFileParams,
+): Promise<{ text: string | undefined }> {
  const cfg =
    params.language || params.prompt
      ? {
--- a/src/plugins/runtime/types-core.ts
+++ b/src/plugins/runtime/types-core.ts
@@ -4,6 +4,7 @@ import type {
 } from "../../agents/pi-embedded-runtime.types.js";
 import type { HeartbeatRunResult } from "../../infra/heartbeat-wake.js";
 import type { LogLevel } from "../../logging/levels.js";
+import type { MediaUnderstandingRuntime } from "../../media-understanding/runtime-types.js";

 export type { HeartbeatRunResult };

@@ -91,11 +92,11 @@ export type PluginRuntimeCore = {
    listVoices: typeof import("../../tts/tts.js").listSpeechVoices;
  };
  mediaUnderstanding: {
-    runFile: typeof import("../../media-understanding/runtime.js").runMediaUnderstandingFile;
-    describeImageFile: typeof import("../../media-understanding/runtime.js").describeImageFile;
-    describeImageFileWithModel: typeof import("../../media-understanding/runtime.js").describeImageFileWithModel;
-    describeVideoFile: typeof import("../../media-understanding/runtime.js").describeVideoFile;
-    transcribeAudioFile: typeof import("../../media-understanding/runtime.js").transcribeAudioFile;
+    runFile: MediaUnderstandingRuntime["runMediaUnderstandingFile"];
+    describeImageFile: MediaUnderstandingRuntime["describeImageFile"];
+    describeImageFileWithModel: MediaUnderstandingRuntime["describeImageFileWithModel"];
+    describeVideoFile: MediaUnderstandingRuntime["describeVideoFile"];
+    transcribeAudioFile: MediaUnderstandingRuntime["transcribeAudioFile"];
  };
  imageGeneration: {
    generate: (