From 67db9df5899e007bca81ffd7b47def3b031be307 Mon Sep 17 00:00:00 2001 From: Gustavo Madeira Santana Date: Sun, 15 Mar 2026 19:58:16 +0000 Subject: [PATCH] Media: extract runtime planning helpers --- src/extension-host/media-runtime-config.ts | 190 +++++++++++++++++ src/extension-host/media-runtime-decision.ts | 58 +++++ src/extension-host/media-runtime-execution.ts | 66 +----- .../media-runtime-orchestration.ts | 10 +- src/media-understanding/resolve.ts | 198 +----------------- src/media-understanding/runner.entries.ts | 64 +----- 6 files changed, 273 insertions(+), 313 deletions(-) create mode 100644 src/extension-host/media-runtime-config.ts create mode 100644 src/extension-host/media-runtime-decision.ts diff --git a/src/extension-host/media-runtime-config.ts b/src/extension-host/media-runtime-config.ts new file mode 100644 index 00000000000..8b74a4c88f9 --- /dev/null +++ b/src/extension-host/media-runtime-config.ts @@ -0,0 +1,190 @@ +import type { MsgContext } from "../auto-reply/templating.js"; +import type { OpenClawConfig } from "../config/config.js"; +import type { + MediaUnderstandingConfig, + MediaUnderstandingModelConfig, + MediaUnderstandingScopeConfig, +} from "../config/types.tools.js"; +import { logVerbose, shouldLogVerbose } from "../globals.js"; +import { + DEFAULT_MAX_BYTES, + DEFAULT_MAX_CHARS_BY_CAPABILITY, + DEFAULT_MEDIA_CONCURRENCY, + DEFAULT_PROMPT, +} from "../media-understanding/defaults.js"; +import { + normalizeMediaUnderstandingChatType, + resolveMediaUnderstandingScope, +} from "../media-understanding/scope.js"; +import type { MediaUnderstandingCapability } from "../media-understanding/types.js"; +import { normalizeExtensionHostMediaProviderId } from "./media-runtime-registry.js"; + +export function resolveTimeoutMs(seconds: number | undefined, fallbackSeconds: number): number { + const value = typeof seconds === "number" && Number.isFinite(seconds) ? seconds : fallbackSeconds; + return Math.max(1000, Math.floor(value * 1000)); +} + +export function resolvePrompt( + capability: MediaUnderstandingCapability, + prompt?: string, + maxChars?: number, +): string { + const base = prompt?.trim() || DEFAULT_PROMPT[capability]; + if (!maxChars || capability === "audio") { + return base; + } + return `${base} Respond in at most ${maxChars} characters.`; +} + +export function resolveMaxChars(params: { + capability: MediaUnderstandingCapability; + entry: MediaUnderstandingModelConfig; + cfg: OpenClawConfig; + config?: MediaUnderstandingConfig; +}): number | undefined { + const { capability, entry, cfg } = params; + const configured = + entry.maxChars ?? params.config?.maxChars ?? cfg.tools?.media?.[capability]?.maxChars; + if (typeof configured === "number") { + return configured; + } + return DEFAULT_MAX_CHARS_BY_CAPABILITY[capability]; +} + +export function resolveMaxBytes(params: { + capability: MediaUnderstandingCapability; + entry: MediaUnderstandingModelConfig; + cfg: OpenClawConfig; + config?: MediaUnderstandingConfig; +}): number { + const configured = + params.entry.maxBytes ?? + params.config?.maxBytes ?? + params.cfg.tools?.media?.[params.capability]?.maxBytes; + if (typeof configured === "number") { + return configured; + } + return DEFAULT_MAX_BYTES[params.capability]; +} + +export function resolveCapabilityConfig( + cfg: OpenClawConfig, + capability: MediaUnderstandingCapability, +): MediaUnderstandingConfig | undefined { + return cfg.tools?.media?.[capability]; +} + +export function resolveScopeDecision(params: { + scope?: MediaUnderstandingScopeConfig; + ctx: MsgContext; +}): "allow" | "deny" { + return resolveMediaUnderstandingScope({ + scope: params.scope, + sessionKey: params.ctx.SessionKey, + channel: params.ctx.Surface ?? params.ctx.Provider, + chatType: normalizeMediaUnderstandingChatType(params.ctx.ChatType), + }); +} + +function resolveEntryCapabilities(params: { + entry: MediaUnderstandingModelConfig; + providerRegistry: Map; +}): MediaUnderstandingCapability[] | undefined { + const entryType = params.entry.type ?? (params.entry.command ? "cli" : "provider"); + if (entryType === "cli") { + return undefined; + } + const providerId = normalizeExtensionHostMediaProviderId(params.entry.provider ?? ""); + if (!providerId) { + return undefined; + } + return params.providerRegistry.get(providerId)?.capabilities; +} + +export function resolveModelEntries(params: { + cfg: OpenClawConfig; + capability: MediaUnderstandingCapability; + config?: MediaUnderstandingConfig; + providerRegistry: Map; +}): MediaUnderstandingModelConfig[] { + const { cfg, capability, config } = params; + const sharedModels = cfg.tools?.media?.models ?? []; + const entries = [ + ...(config?.models ?? []).map((entry) => ({ entry, source: "capability" as const })), + ...sharedModels.map((entry) => ({ entry, source: "shared" as const })), + ]; + if (entries.length === 0) { + return []; + } + + return entries + .filter(({ entry, source }) => { + const caps = + entry.capabilities && entry.capabilities.length > 0 + ? entry.capabilities + : source === "shared" + ? resolveEntryCapabilities({ entry, providerRegistry: params.providerRegistry }) + : undefined; + if (!caps || caps.length === 0) { + if (source === "shared") { + if (shouldLogVerbose()) { + logVerbose( + `Skipping shared media model without capabilities: ${entry.provider ?? entry.command ?? "unknown"}`, + ); + } + return false; + } + return true; + } + return caps.includes(capability); + }) + .map(({ entry }) => entry); +} + +export function resolveConcurrency(cfg: OpenClawConfig): number { + const configured = cfg.tools?.media?.concurrency; + if (typeof configured === "number" && Number.isFinite(configured) && configured > 0) { + return Math.floor(configured); + } + return DEFAULT_MEDIA_CONCURRENCY; +} + +export function resolveEntriesWithActiveFallback(params: { + cfg: OpenClawConfig; + capability: MediaUnderstandingCapability; + config?: MediaUnderstandingConfig; + providerRegistry: Map; + activeModel?: { provider: string; model?: string }; +}): MediaUnderstandingModelConfig[] { + const entries = resolveModelEntries({ + cfg: params.cfg, + capability: params.capability, + config: params.config, + providerRegistry: params.providerRegistry, + }); + if (entries.length > 0) { + return entries; + } + if (params.config?.enabled !== true) { + return entries; + } + const activeProviderRaw = params.activeModel?.provider?.trim(); + if (!activeProviderRaw) { + return entries; + } + const activeProvider = normalizeExtensionHostMediaProviderId(activeProviderRaw); + if (!activeProvider) { + return entries; + } + const capabilities = params.providerRegistry.get(activeProvider)?.capabilities; + if (!capabilities || !capabilities.includes(params.capability)) { + return entries; + } + return [ + { + type: "provider", + provider: activeProvider, + model: params.activeModel?.model, + }, + ]; +} diff --git a/src/extension-host/media-runtime-decision.ts b/src/extension-host/media-runtime-decision.ts new file mode 100644 index 00000000000..9ca95b80493 --- /dev/null +++ b/src/extension-host/media-runtime-decision.ts @@ -0,0 +1,58 @@ +import type { MediaUnderstandingModelConfig } from "../config/types.tools.js"; +import type { + MediaUnderstandingDecision, + MediaUnderstandingModelDecision, +} from "../media-understanding/types.js"; +import { normalizeExtensionHostMediaProviderId } from "./media-runtime-registry.js"; + +export function buildModelDecision(params: { + entry: MediaUnderstandingModelConfig; + entryType: "provider" | "cli"; + outcome: MediaUnderstandingModelDecision["outcome"]; + reason?: string; +}): MediaUnderstandingModelDecision { + if (params.entryType === "cli") { + const command = params.entry.command?.trim(); + return { + type: "cli", + provider: command ?? "cli", + model: params.entry.model ?? command, + outcome: params.outcome, + reason: params.reason, + }; + } + const providerIdRaw = params.entry.provider?.trim(); + const providerId = providerIdRaw + ? normalizeExtensionHostMediaProviderId(providerIdRaw) + : undefined; + return { + type: "provider", + provider: providerId ?? providerIdRaw, + model: params.entry.model, + outcome: params.outcome, + reason: params.reason, + }; +} + +export function formatDecisionSummary(decision: MediaUnderstandingDecision): string { + const attachments = Array.isArray(decision.attachments) ? decision.attachments : []; + const total = attachments.length; + const success = attachments.filter((entry) => entry?.chosen?.outcome === "success").length; + const chosen = attachments.find((entry) => entry?.chosen)?.chosen; + const provider = typeof chosen?.provider === "string" ? chosen.provider.trim() : undefined; + const model = typeof chosen?.model === "string" ? chosen.model.trim() : undefined; + const modelLabel = provider ? (model ? `${provider}/${model}` : provider) : undefined; + const reason = attachments + .flatMap((entry) => { + const attempts = Array.isArray(entry?.attempts) ? entry.attempts : []; + return attempts + .map((attempt) => (typeof attempt?.reason === "string" ? attempt.reason : undefined)) + .filter((value): value is string => Boolean(value)); + }) + .find((value) => value.trim().length > 0); + const shortReason = reason ? reason.split(":")[0]?.trim() : undefined; + const countLabel = total > 0 ? ` (${success}/${total})` : ""; + const viaLabel = modelLabel ? ` via ${modelLabel}` : ""; + const reasonLabel = shortReason ? ` reason=${shortReason}` : ""; + return `${decision.capability}: ${decision.outcome}${countLabel}${viaLabel}${reasonLabel}`; +} diff --git a/src/extension-host/media-runtime-execution.ts b/src/extension-host/media-runtime-execution.ts index 83cdbdbb87f..6162ec037f8 100644 --- a/src/extension-host/media-runtime-execution.ts +++ b/src/extension-host/media-runtime-execution.ts @@ -29,21 +29,19 @@ import { import { MediaUnderstandingSkipError } from "../media-understanding/errors.js"; import { fileExists } from "../media-understanding/fs.js"; import { extractGeminiResponse } from "../media-understanding/output-extract.js"; -import { - resolveMaxBytes, - resolveMaxChars, - resolvePrompt, - resolveTimeoutMs, -} from "../media-understanding/resolve.js"; import type { MediaUnderstandingCapability, - MediaUnderstandingDecision, - MediaUnderstandingModelDecision, MediaUnderstandingOutput, MediaUnderstandingProvider, } from "../media-understanding/types.js"; import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "../media-understanding/video.js"; import { runExec } from "../process/exec.js"; +import { + resolveMaxBytes, + resolveMaxChars, + resolvePrompt, + resolveTimeoutMs, +} from "./media-runtime-config.js"; export type ProviderRegistry = Map; @@ -292,35 +290,6 @@ function resolveProviderQuery(params: { return Object.keys(query).length > 0 ? query : undefined; } -export function buildModelDecision(params: { - entry: MediaUnderstandingModelConfig; - entryType: "provider" | "cli"; - outcome: MediaUnderstandingModelDecision["outcome"]; - reason?: string; -}): MediaUnderstandingModelDecision { - if (params.entryType === "cli") { - const command = params.entry.command?.trim(); - return { - type: "cli", - provider: command ?? "cli", - model: params.entry.model ?? command, - outcome: params.outcome, - reason: params.reason, - }; - } - const providerIdRaw = params.entry.provider?.trim(); - const providerId = providerIdRaw - ? normalizeExtensionHostMediaProviderId(providerIdRaw) - : undefined; - return { - type: "provider", - provider: providerId ?? providerIdRaw, - model: params.entry.model, - outcome: params.outcome, - reason: params.reason, - }; -} - function resolveEntryRunOptions(params: { capability: MediaUnderstandingCapability; entry: MediaUnderstandingModelConfig; @@ -389,29 +358,6 @@ async function resolveProviderExecutionContext(params: { return { apiKeys, baseUrl, headers }; } -export function formatDecisionSummary(decision: MediaUnderstandingDecision): string { - const attachments = Array.isArray(decision.attachments) ? decision.attachments : []; - const total = attachments.length; - const success = attachments.filter((entry) => entry?.chosen?.outcome === "success").length; - const chosen = attachments.find((entry) => entry?.chosen)?.chosen; - const provider = typeof chosen?.provider === "string" ? chosen.provider.trim() : undefined; - const model = typeof chosen?.model === "string" ? chosen.model.trim() : undefined; - const modelLabel = provider ? (model ? `${provider}/${model}` : provider) : undefined; - const reason = attachments - .flatMap((entry) => { - const attempts = Array.isArray(entry?.attempts) ? entry.attempts : []; - return attempts - .map((attempt) => (typeof attempt?.reason === "string" ? attempt.reason : undefined)) - .filter((value): value is string => Boolean(value)); - }) - .find((value) => value.trim().length > 0); - const shortReason = reason ? reason.split(":")[0]?.trim() : undefined; - const countLabel = total > 0 ? ` (${success}/${total})` : ""; - const viaLabel = modelLabel ? ` via ${modelLabel}` : ""; - const reasonLabel = shortReason ? ` reason=${shortReason}` : ""; - return `${decision.capability}: ${decision.outcome}${countLabel}${viaLabel}${reasonLabel}`; -} - function assertMinAudioSize(params: { size: number; attachmentIndex: number }): void { if (params.size >= MIN_AUDIO_FILE_BYTES) { return; diff --git a/src/extension-host/media-runtime-orchestration.ts b/src/extension-host/media-runtime-orchestration.ts index 79161068de7..5bc6f4ce47b 100644 --- a/src/extension-host/media-runtime-orchestration.ts +++ b/src/extension-host/media-runtime-orchestration.ts @@ -12,13 +12,7 @@ import type { import { logVerbose, shouldLogVerbose } from "../globals.js"; import { MediaAttachmentCache, selectAttachments } from "../media-understanding/attachments.js"; import { isMediaUnderstandingSkipError } from "../media-understanding/errors.js"; -import { resolveModelEntries, resolveScopeDecision } from "../media-understanding/resolve.js"; -import { - buildModelDecision, - formatDecisionSummary, - runCliEntry, - runProviderEntry, -} from "../media-understanding/runner.entries.js"; +import { runCliEntry, runProviderEntry } from "../media-understanding/runner.entries.js"; import type { MediaAttachment, MediaUnderstandingCapability, @@ -28,6 +22,8 @@ import type { MediaUnderstandingProvider, } from "../media-understanding/types.js"; import { resolveAutoEntries, type ActiveMediaModel } from "./media-runtime-auto.js"; +import { resolveModelEntries, resolveScopeDecision } from "./media-runtime-config.js"; +import { buildModelDecision, formatDecisionSummary } from "./media-runtime-decision.js"; type ProviderRegistry = Map; diff --git a/src/media-understanding/resolve.ts b/src/media-understanding/resolve.ts index bc852bb71ec..971cc5719a4 100644 --- a/src/media-understanding/resolve.ts +++ b/src/media-understanding/resolve.ts @@ -1,187 +1,11 @@ -import type { MsgContext } from "../auto-reply/templating.js"; -import type { OpenClawConfig } from "../config/config.js"; -import type { - MediaUnderstandingConfig, - MediaUnderstandingModelConfig, - MediaUnderstandingScopeConfig, -} from "../config/types.tools.js"; -import { normalizeExtensionHostMediaProviderId } from "../extension-host/media-runtime-registry.js"; -import { logVerbose, shouldLogVerbose } from "../globals.js"; -import { - DEFAULT_MAX_BYTES, - DEFAULT_MAX_CHARS_BY_CAPABILITY, - DEFAULT_MEDIA_CONCURRENCY, - DEFAULT_PROMPT, -} from "./defaults.js"; -import { normalizeMediaUnderstandingChatType, resolveMediaUnderstandingScope } from "./scope.js"; -import type { MediaUnderstandingCapability } from "./types.js"; - -export function resolveTimeoutMs(seconds: number | undefined, fallbackSeconds: number): number { - const value = typeof seconds === "number" && Number.isFinite(seconds) ? seconds : fallbackSeconds; - return Math.max(1000, Math.floor(value * 1000)); -} - -export function resolvePrompt( - capability: MediaUnderstandingCapability, - prompt?: string, - maxChars?: number, -): string { - const base = prompt?.trim() || DEFAULT_PROMPT[capability]; - if (!maxChars || capability === "audio") { - return base; - } - return `${base} Respond in at most ${maxChars} characters.`; -} - -export function resolveMaxChars(params: { - capability: MediaUnderstandingCapability; - entry: MediaUnderstandingModelConfig; - cfg: OpenClawConfig; - config?: MediaUnderstandingConfig; -}): number | undefined { - const { capability, entry, cfg } = params; - const configured = - entry.maxChars ?? params.config?.maxChars ?? cfg.tools?.media?.[capability]?.maxChars; - if (typeof configured === "number") { - return configured; - } - return DEFAULT_MAX_CHARS_BY_CAPABILITY[capability]; -} - -export function resolveMaxBytes(params: { - capability: MediaUnderstandingCapability; - entry: MediaUnderstandingModelConfig; - cfg: OpenClawConfig; - config?: MediaUnderstandingConfig; -}): number { - const configured = - params.entry.maxBytes ?? - params.config?.maxBytes ?? - params.cfg.tools?.media?.[params.capability]?.maxBytes; - if (typeof configured === "number") { - return configured; - } - return DEFAULT_MAX_BYTES[params.capability]; -} - -export function resolveCapabilityConfig( - cfg: OpenClawConfig, - capability: MediaUnderstandingCapability, -): MediaUnderstandingConfig | undefined { - return cfg.tools?.media?.[capability]; -} - -export function resolveScopeDecision(params: { - scope?: MediaUnderstandingScopeConfig; - ctx: MsgContext; -}): "allow" | "deny" { - return resolveMediaUnderstandingScope({ - scope: params.scope, - sessionKey: params.ctx.SessionKey, - channel: params.ctx.Surface ?? params.ctx.Provider, - chatType: normalizeMediaUnderstandingChatType(params.ctx.ChatType), - }); -} - -function resolveEntryCapabilities(params: { - entry: MediaUnderstandingModelConfig; - providerRegistry: Map; -}): MediaUnderstandingCapability[] | undefined { - const entryType = params.entry.type ?? (params.entry.command ? "cli" : "provider"); - if (entryType === "cli") { - return undefined; - } - const providerId = normalizeExtensionHostMediaProviderId(params.entry.provider ?? ""); - if (!providerId) { - return undefined; - } - return params.providerRegistry.get(providerId)?.capabilities; -} - -export function resolveModelEntries(params: { - cfg: OpenClawConfig; - capability: MediaUnderstandingCapability; - config?: MediaUnderstandingConfig; - providerRegistry: Map; -}): MediaUnderstandingModelConfig[] { - const { cfg, capability, config } = params; - const sharedModels = cfg.tools?.media?.models ?? []; - const entries = [ - ...(config?.models ?? []).map((entry) => ({ entry, source: "capability" as const })), - ...sharedModels.map((entry) => ({ entry, source: "shared" as const })), - ]; - if (entries.length === 0) { - return []; - } - - return entries - .filter(({ entry, source }) => { - const caps = - entry.capabilities && entry.capabilities.length > 0 - ? entry.capabilities - : source === "shared" - ? resolveEntryCapabilities({ entry, providerRegistry: params.providerRegistry }) - : undefined; - if (!caps || caps.length === 0) { - if (source === "shared") { - if (shouldLogVerbose()) { - logVerbose( - `Skipping shared media model without capabilities: ${entry.provider ?? entry.command ?? "unknown"}`, - ); - } - return false; - } - return true; - } - return caps.includes(capability); - }) - .map(({ entry }) => entry); -} - -export function resolveConcurrency(cfg: OpenClawConfig): number { - const configured = cfg.tools?.media?.concurrency; - if (typeof configured === "number" && Number.isFinite(configured) && configured > 0) { - return Math.floor(configured); - } - return DEFAULT_MEDIA_CONCURRENCY; -} - -export function resolveEntriesWithActiveFallback(params: { - cfg: OpenClawConfig; - capability: MediaUnderstandingCapability; - config?: MediaUnderstandingConfig; - providerRegistry: Map; - activeModel?: { provider: string; model?: string }; -}): MediaUnderstandingModelConfig[] { - const entries = resolveModelEntries({ - cfg: params.cfg, - capability: params.capability, - config: params.config, - providerRegistry: params.providerRegistry, - }); - if (entries.length > 0) { - return entries; - } - if (params.config?.enabled !== true) { - return entries; - } - const activeProviderRaw = params.activeModel?.provider?.trim(); - if (!activeProviderRaw) { - return entries; - } - const activeProvider = normalizeExtensionHostMediaProviderId(activeProviderRaw); - if (!activeProvider) { - return entries; - } - const capabilities = params.providerRegistry.get(activeProvider)?.capabilities; - if (!capabilities || !capabilities.includes(params.capability)) { - return entries; - } - return [ - { - type: "provider", - provider: activeProvider, - model: params.activeModel?.model, - }, - ]; -} +export { + resolveTimeoutMs, + resolvePrompt, + resolveMaxChars, + resolveMaxBytes, + resolveCapabilityConfig, + resolveScopeDecision, + resolveModelEntries, + resolveConcurrency, + resolveEntriesWithActiveFallback, +} from "../extension-host/media-runtime-config.js"; diff --git a/src/media-understanding/runner.entries.ts b/src/media-understanding/runner.entries.ts index 85c4fd3cf11..0c8a079290c 100644 --- a/src/media-understanding/runner.entries.ts +++ b/src/media-understanding/runner.entries.ts @@ -4,69 +4,15 @@ import type { MediaUnderstandingConfig, MediaUnderstandingModelConfig, } from "../config/types.tools.js"; -import { normalizeExtensionHostMediaProviderId } from "../extension-host/media-runtime-registry.js"; +export { + buildModelDecision, + formatDecisionSummary, +} from "../extension-host/media-runtime-decision.js"; import type { MediaAttachmentCache } from "./attachments.js"; -import type { - MediaUnderstandingCapability, - MediaUnderstandingDecision, - MediaUnderstandingModelDecision, - MediaUnderstandingOutput, -} from "./types.js"; +import type { MediaUnderstandingCapability, MediaUnderstandingOutput } from "./types.js"; export type ProviderRegistry = Map; -export function buildModelDecision(params: { - entry: MediaUnderstandingModelConfig; - entryType: "provider" | "cli"; - outcome: MediaUnderstandingModelDecision["outcome"]; - reason?: string; -}): MediaUnderstandingModelDecision { - if (params.entryType === "cli") { - const command = params.entry.command?.trim(); - return { - type: "cli", - provider: command ?? "cli", - model: params.entry.model ?? command, - outcome: params.outcome, - reason: params.reason, - }; - } - const providerIdRaw = params.entry.provider?.trim(); - const providerId = providerIdRaw - ? normalizeExtensionHostMediaProviderId(providerIdRaw) - : undefined; - return { - type: "provider", - provider: providerId ?? providerIdRaw, - model: params.entry.model, - outcome: params.outcome, - reason: params.reason, - }; -} - -export function formatDecisionSummary(decision: MediaUnderstandingDecision): string { - const attachments = Array.isArray(decision.attachments) ? decision.attachments : []; - const total = attachments.length; - const success = attachments.filter((entry) => entry?.chosen?.outcome === "success").length; - const chosen = attachments.find((entry) => entry?.chosen)?.chosen; - const provider = typeof chosen?.provider === "string" ? chosen.provider.trim() : undefined; - const model = typeof chosen?.model === "string" ? chosen.model.trim() : undefined; - const modelLabel = provider ? (model ? `${provider}/${model}` : provider) : undefined; - const reason = attachments - .flatMap((entry) => { - const attempts = Array.isArray(entry?.attempts) ? entry.attempts : []; - return attempts - .map((attempt) => (typeof attempt?.reason === "string" ? attempt.reason : undefined)) - .filter((value): value is string => Boolean(value)); - }) - .find((value) => value.trim().length > 0); - const shortReason = reason ? reason.split(":")[0]?.trim() : undefined; - const countLabel = total > 0 ? ` (${success}/${total})` : ""; - const viaLabel = modelLabel ? ` via ${modelLabel}` : ""; - const reasonLabel = shortReason ? ` reason=${shortReason}` : ""; - return `${decision.capability}: ${decision.outcome}${countLabel}${viaLabel}${reasonLabel}`; -} - export async function runProviderEntry(params: { capability: MediaUnderstandingCapability; entry: MediaUnderstandingModelConfig;