From 87aa0f813c4b126917bea83efa5f31b1f8c3a23e Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 25 Apr 2026 11:29:15 +0100 Subject: [PATCH] fix(cli): forward video generation options --- docs/cli/infer.md | 5 ++- src/cli/capability-cli.test.ts | 55 +++++++++++++++++++++++ src/cli/capability-cli.ts | 79 +++++++++++++++++++++++++++++++++- 3 files changed, 135 insertions(+), 4 deletions(-) diff --git a/docs/cli/infer.md b/docs/cli/infer.md index c7c7b221b98..bbb3f6a6dea 100644 --- a/docs/cli/infer.md +++ b/docs/cli/infer.md @@ -114,7 +114,7 @@ This table maps common inference tasks to the corresponding infer command. | Describe an image file | `openclaw infer image describe --file ./image.png --json` | `--model` must be an image-capable `<provider/model>` | | Transcribe audio | `openclaw infer audio transcribe --file ./memo.m4a --json` | `--model` must be `<provider/model>` | | Synthesize speech | `openclaw infer tts convert --text "..." --output ./speech.mp3 --json` | `tts status` is gateway-oriented | -| Generate a video | `openclaw infer video generate --prompt "..." --json` | | +| Generate a video | `openclaw infer video generate --prompt "..." --json` | Supports provider hints such as `--resolution` | | Describe a video file | `openclaw infer video describe --file ./clip.mp4 --json` | `--model` must be `<provider/model>` | | Search the web | `openclaw infer web search --query "..." --json` | | | Fetch a web page | `openclaw infer web fetch --url https://example.com --json` | | @@ -223,13 +223,14 @@ Use `video` for generation and description. 
// Regression guard: every video-generation flag accepted on the command line
// must arrive at generateVideo() under its runtime parameter name.
it("passes video generation parameters through to runtime", async () => {
  // Minimal successful provider response so the command can run to completion.
  const providerResult = {
    provider: "minimax",
    model: "MiniMax-Hailuo-2.3",
    attempts: [],
    videos: [
      {
        buffer: Buffer.from("video-bytes"),
        mimeType: "video/mp4",
        fileName: "provider-name.mp4",
      },
    ],
  };
  mocks.generateVideo.mockResolvedValue(providerResult);

  // Note the lower-case "768p": the expectation below checks it is upper-cased.
  const argv = [
    "capability",
    "video",
    "generate",
    "--prompt",
    "friendly lobster",
    "--model",
    "minimax/MiniMax-Hailuo-2.3",
    "--size",
    "1280x768",
    "--aspect-ratio",
    "16:9",
    "--resolution",
    "768p",
    "--duration",
    "6",
    "--audio",
    "--watermark",
    "--timeout-ms",
    "300000",
    "--json",
  ];
  await runRegisteredCli({
    register: registerCapabilityCli as (program: Command) => void,
    argv,
  });

  // Numeric flags are expected as numbers and boolean flags as `true`.
  const forwardedParams = {
    prompt: "friendly lobster",
    modelOverride: "minimax/MiniMax-Hailuo-2.3",
    size: "1280x768",
    aspectRatio: "16:9",
    resolution: "768P",
    durationSeconds: 6,
    audio: true,
    watermark: true,
    timeoutMs: 300000,
  };
  expect(mocks.generateVideo).toHaveBeenCalledWith(expect.objectContaining(forwardedParams));
});
/** Optional inclusive limits checked by {@link parseOptionalFiniteNumber}. */
type FiniteNumberBounds = { min?: number; max?: number };

/**
 * Parses an optional numeric CLI value such as `--duration` or `--timeout-ms`.
 *
 * @param raw - Value handed over by the option parser. `undefined`, `null`,
 *   and blank strings all mean "flag not provided".
 * @param label - Flag name used as the prefix of every error message.
 * @param bounds - Optional inclusive range. Defaults to no range check, so
 *   existing two-argument callers keep their behavior; pass `{ min: 0 }` to
 *   reject negative durations or timeouts.
 * @returns The parsed number, or `undefined` when the flag was not provided.
 * @throws Error when the value is not a string or number, does not parse to a
 *   finite number, or falls outside `bounds`.
 */
function parseOptionalFiniteNumber(
  raw: string | number | null | undefined,
  label: string,
  bounds: FiniteNumberBounds = {},
): number | undefined {
  // `Number(null)` is 0, so null has to be treated as absent, not coerced.
  if (raw === undefined || raw === null) {
    return undefined;
  }
  if (typeof raw === "string" && raw.trim() === "") {
    return undefined;
  }
  // Callers may hand over loosely typed option values; `Number(true)` is 1 and
  // `Number([])` is 0, so anything that is not a string or number is rejected.
  if (typeof raw !== "string" && typeof raw !== "number") {
    throw new Error(`${label} must be a finite number`);
  }
  const value = Number(raw);
  if (!Number.isFinite(value)) {
    throw new Error(`${label} must be a finite number`);
  }
  if (bounds.min !== undefined && value < bounds.min) {
    throw new Error(`${label} must be at least ${bounds.min}`);
  }
  if (bounds.max !== undefined && value > bounds.max) {
    throw new Error(`${label} must be at most ${bounds.max}`);
  }
  return value;
}

/** Resolution hints accepted by `--resolution`, in canonical upper-case form. */
const CLI_VIDEO_RESOLUTIONS: readonly VideoGenerationResolution[] = [
  "480P",
  "720P",
  "768P",
  "1080P",
];

/**
 * Normalizes a `--resolution` value to its canonical upper-case form.
 *
 * @param raw - Value as typed by the user; surrounding whitespace and letter
 *   case are ignored, so `" 768p "` is accepted.
 * @returns The canonical resolution, or `undefined` when `raw` is missing or blank.
 * @throws Error when the value is not one of the supported resolutions.
 */
function normalizeVideoResolution(raw: string | undefined): VideoGenerationResolution | undefined {
  const normalized = raw?.trim().toUpperCase();
  if (!normalized) {
    return undefined;
  }
  // Returning the table entry (not the input string) keeps the result typed
  // without a cast.
  const match = CLI_VIDEO_RESOLUTIONS.find((candidate) => candidate === normalized);
  if (match === undefined) {
    throw new Error("video resolution must be one of 480P, 720P, 768P, or 1080P");
  }
  return match;
}
normalizeVideoResolution(opts.resolution as string | undefined), + durationSeconds: parseOptionalFiniteNumber(opts.duration, "--duration"), + audio: opts.audio === true ? true : undefined, + watermark: opts.watermark === true ? true : undefined, + timeoutMs: parseOptionalFiniteNumber(opts.timeoutMs, "--timeout-ms"), }); emitJsonOrText(defaultRuntime, Boolean(opts.json), result, formatEnvelopeForText); });