diff --git a/CHANGELOG.md b/CHANGELOG.md index 8dee748f167..e1f63b360a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -191,6 +191,7 @@ Docs: https://docs.openclaw.ai - Update/npm: prefer the npm binary that owns the installed global OpenClaw prefix so mixed Homebrew-plus-nvm setups update the right install. (#60153) Thanks @jayeshp19. - Windows/restart: clean up stale gateway listeners before Windows self-restart and treat listener and argv probe failures as inconclusive, so scheduled-task relaunch no longer falls into an `EADDRINUSE` retry loop. (#60480) Thanks @arifahmedjoy. - Plugins: suppress trust-warning noise during non-activating snapshot and CLI metadata loads. (#61427) Thanks @gumadeiras. +- Agents/video generation: accept `agents.defaults.videoGenerationModel` in strict config validation and `openclaw config set/get`, so gateways using `video_generate` no longer fail to boot after enabling a video model. ## 2026.4.2 diff --git a/apps/shared/OpenClawKit/Sources/OpenClawKit/Resources/tool-display.json b/apps/shared/OpenClawKit/Sources/OpenClawKit/Resources/tool-display.json index 52bd890e716..5453bc9a34c 100644 --- a/apps/shared/OpenClawKit/Sources/OpenClawKit/Resources/tool-display.json +++ b/apps/shared/OpenClawKit/Sources/OpenClawKit/Resources/tool-display.json @@ -1030,6 +1030,31 @@ } } }, + "video_generate": { + "emoji": "🎬", + "title": "Video Generation", + "actions": { + "generate": { + "label": "generate", + "detailKeys": [ + "prompt", + "model", + "durationSeconds", + "resolution", + "aspectRatio", + "audio", + "watermark" + ] + }, + "list": { + "label": "list", + "detailKeys": [ + "provider", + "model" + ] + } + } + }, "pdf": { "emoji": "📑", "title": "PDF", diff --git a/docs/concepts/models.md b/docs/concepts/models.md index 6acd34db45b..bf8d0e8366b 100644 --- a/docs/concepts/models.md +++ b/docs/concepts/models.md @@ -30,7 +30,7 @@ Related: falls back to `agents.defaults.imageModel`, then the resolved session/default model. 
- `agents.defaults.imageGenerationModel` is used by the shared image-generation capability. If omitted, `image_generate` can still infer an auth-backed provider default. It tries the current default provider first, then the remaining registered image-generation providers in provider-id order. If you set a specific provider/model, also configure that provider's auth/API key. -- `agents.defaults.videoGenerationModel` is used by the shared video-generation capability. Unlike image generation, this does not infer a provider default today. Set an explicit `provider/model` such as `qwen/wan2.6-t2v`, and configure that provider's auth/API key too. +- `agents.defaults.videoGenerationModel` is used by the shared video-generation capability. If omitted, `video_generate` can still infer an auth-backed provider default. It tries the current default provider first, then the remaining registered video-generation providers in provider-id order. If you set a specific provider/model, also configure that provider's auth/API key. - Per-agent defaults can override `agents.defaults.model` via `agents.list[].model` plus bindings (see [/concepts/multi-agent](/concepts/multi-agent)). ## Quick model policy @@ -252,4 +252,5 @@ This applies whenever OpenClaw regenerates `models.json`, including command-driv - [Model Providers](/concepts/model-providers) — provider routing and auth - [Model Failover](/concepts/model-failover) — fallback chains - [Image Generation](/tools/image-generation) — image model configuration +- [Video Generation](/tools/video-generation) — video model configuration - [Configuration Reference](/gateway/configuration-reference#agent-defaults) — model config keys diff --git a/docs/gateway/configuration-reference.md b/docs/gateway/configuration-reference.md index f8f1fe23c96..f7d2a7c43fc 100644 --- a/docs/gateway/configuration-reference.md +++ b/docs/gateway/configuration-reference.md @@ -1026,9 +1026,9 @@ Time format in system prompt. Default: `auto` (OS preference). 
- If you select a provider/model directly, configure the matching provider auth/API key too (for example `GEMINI_API_KEY` or `GOOGLE_API_KEY` for `google/*`, `OPENAI_API_KEY` for `openai/*`, `FAL_KEY` for `fal/*`). - If omitted, `image_generate` can still infer an auth-backed provider default. It tries the current default provider first, then the remaining registered image-generation providers in provider-id order. - `videoGenerationModel`: accepts either a string (`"provider/model"`) or an object (`{ primary, fallbacks }`). - - Used by the shared video-generation capability. + - Used by the shared video-generation capability and the built-in `video_generate` tool. - Typical values: `qwen/wan2.6-t2v`, `qwen/wan2.6-i2v`, `qwen/wan2.6-r2v`, `qwen/wan2.6-r2v-flash`, or `qwen/wan2.7-r2v`. - - Set this explicitly before using shared video generation. Unlike `imageGenerationModel`, the video-generation runtime does not infer a provider default yet. + - If omitted, `video_generate` can still infer an auth-backed provider default. It tries the current default provider first, then the remaining registered video-generation providers in provider-id order. - If you select a provider/model directly, configure the matching provider auth/API key too. - The bundled Qwen video-generation provider currently supports up to 1 output video, 1 input image, 4 input videos, 10 seconds duration, and provider-level `size`, `aspectRatio`, `resolution`, `audio`, and `watermark` options. - `pdfModel`: accepts either a string (`"provider/model"`) or an object (`{ primary, fallbacks }`). @@ -1936,12 +1936,12 @@ Defaults for Talk mode (macOS/iOS/Android). Local onboarding defaults new local configs to `tools.profile: "coding"` when unset (existing explicit profiles are preserved). 
-| Profile | Includes | -| ----------- | ------------------------------------------------------------------------------------------------------------- | -| `minimal` | `session_status` only | -| `coding` | `group:fs`, `group:runtime`, `group:web`, `group:sessions`, `group:memory`, `cron`, `image`, `image_generate` | -| `messaging` | `group:messaging`, `sessions_list`, `sessions_history`, `sessions_send`, `session_status` | -| `full` | No restriction (same as unset) | +| Profile | Includes | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------- | +| `minimal` | `session_status` only | +| `coding` | `group:fs`, `group:runtime`, `group:web`, `group:sessions`, `group:memory`, `cron`, `image`, `image_generate`, `video_generate` | +| `messaging` | `group:messaging`, `sessions_list`, `sessions_history`, `sessions_send`, `session_status` | +| `full` | No restriction (same as unset) | ### Tool groups @@ -1957,7 +1957,7 @@ Local onboarding defaults new local configs to `tools.profile: "coding"` when un | `group:messaging` | `message` | | `group:nodes` | `nodes` | | `group:agents` | `agents_list` | -| `group:media` | `image`, `image_generate`, `tts` | +| `group:media` | `image`, `image_generate`, `video_generate`, `tts` | | `group:openclaw` | All built-in tools (excludes provider plugins) | ### `tools.allow` / `tools.deny` diff --git a/docs/gateway/sandbox-vs-tool-policy-vs-elevated.md b/docs/gateway/sandbox-vs-tool-policy-vs-elevated.md index f9388783186..c5168722f85 100644 --- a/docs/gateway/sandbox-vs-tool-policy-vs-elevated.md +++ b/docs/gateway/sandbox-vs-tool-policy-vs-elevated.md @@ -98,7 +98,7 @@ Available groups: - `group:messaging`: `message` - `group:nodes`: `nodes` - `group:agents`: `agents_list` -- `group:media`: `image`, `image_generate`, `tts` +- `group:media`: `image`, `image_generate`, `video_generate`, `tts` - `group:openclaw`: all built-in OpenClaw tools (excludes 
provider plugins) ## Elevated: exec-only "run on host" diff --git a/docs/providers/qwen.md b/docs/providers/qwen.md index 669187b867b..3581a53ca07 100644 --- a/docs/providers/qwen.md +++ b/docs/providers/qwen.md @@ -123,6 +123,9 @@ Current bundled Qwen video-generation limits: - Up to **4** input videos - Up to **10 seconds** duration - Supports `size`, `aspectRatio`, `resolution`, `audio`, and `watermark` +- Reference image/video mode currently requires **remote http(s) URLs**. Local + file paths are rejected up front because the DashScope video endpoint does not + accept uploaded local buffers for those references. See [Qwen / Model Studio](/providers/qwen_modelstudio) for endpoint-level detail and compatibility notes. diff --git a/docs/tools/index.md b/docs/tools/index.md index e47679cf289..1dbb97a0624 100644 --- a/docs/tools/index.md +++ b/docs/tools/index.md @@ -53,25 +53,28 @@ OpenClaw has three layers that work together: These tools ship with OpenClaw and are available without installing any plugins: -| Tool | What it does | Page | -| ------------------------------------------ | --------------------------------------------------------------------- | --------------------------------------- | -| `exec` / `process` | Run shell commands, manage background processes | [Exec](/tools/exec) | -| `code_execution` | Run sandboxed remote Python analysis | [Code Execution](/tools/code-execution) | -| `browser` | Control a Chromium browser (navigate, click, screenshot) | [Browser](/tools/browser) | -| `web_search` / `x_search` / `web_fetch` | Search the web, search X posts, fetch page content | [Web](/tools/web) | -| `read` / `write` / `edit` | File I/O in the workspace | | -| `apply_patch` | Multi-hunk file patches | [Apply Patch](/tools/apply-patch) | -| `message` | Send messages across all channels | [Agent Send](/tools/agent-send) | -| `canvas` | Drive node Canvas (present, eval, snapshot) | | -| `nodes` | Discover and target paired devices | | -| `cron` / `gateway` 
| Manage scheduled jobs; inspect, patch, restart, or update the gateway | | -| `image` / `image_generate` | Analyze or generate images | | -| `tts` | One-shot text-to-speech conversion | [TTS](/tools/tts) | -| `sessions_*` / `subagents` / `agents_list` | Session management, status, and sub-agent orchestration | [Sub-agents](/tools/subagents) | -| `session_status` | Lightweight `/status`-style readback and session model override | [Session Tools](/concepts/session-tool) | +| Tool | What it does | Page | +| ------------------------------------------ | --------------------------------------------------------------------- | ------------------------------------------- | +| `exec` / `process` | Run shell commands, manage background processes | [Exec](/tools/exec) | +| `code_execution` | Run sandboxed remote Python analysis | [Code Execution](/tools/code-execution) | +| `browser` | Control a Chromium browser (navigate, click, screenshot) | [Browser](/tools/browser) | +| `web_search` / `x_search` / `web_fetch` | Search the web, search X posts, fetch page content | [Web](/tools/web) | +| `read` / `write` / `edit` | File I/O in the workspace | | +| `apply_patch` | Multi-hunk file patches | [Apply Patch](/tools/apply-patch) | +| `message` | Send messages across all channels | [Agent Send](/tools/agent-send) | +| `canvas` | Drive node Canvas (present, eval, snapshot) | | +| `nodes` | Discover and target paired devices | | +| `cron` / `gateway` | Manage scheduled jobs; inspect, patch, restart, or update the gateway | | +| `image` / `image_generate` | Analyze or generate images | [Image Generation](/tools/image-generation) | +| `video_generate` | Generate videos | [Video Generation](/tools/video-generation) | +| `tts` | One-shot text-to-speech conversion | [TTS](/tools/tts) | +| `sessions_*` / `subagents` / `agents_list` | Session management, status, and sub-agent orchestration | [Sub-agents](/tools/subagents) | +| `session_status` | Lightweight `/status`-style readback and 
session model override | [Session Tools](/concepts/session-tool) | For image work, use `image` for analysis and `image_generate` for generation or editing. If you target `openai/*`, `google/*`, `fal/*`, or another non-default image provider, configure that provider's auth/API key first. +For video work, use `video_generate`. If you target `qwen/*` or another non-default video provider, configure that provider's auth/API key first. + `session_status` is the lightweight status/readback tool in the sessions group. It answers `/status`-style questions about the current session and can optionally set a per-session model override; `model=default` clears that @@ -121,12 +124,12 @@ config. Deny always wins over allow. `tools.profile` sets a base allowlist before `allow`/`deny` is applied. Per-agent override: `agents.list[].tools.profile`. -| Profile | What it includes | -| ----------- | ------------------------------------------------------------------------------------------------------------- | -| `full` | No restriction (same as unset) | -| `coding` | `group:fs`, `group:runtime`, `group:web`, `group:sessions`, `group:memory`, `cron`, `image`, `image_generate` | -| `messaging` | `group:messaging`, `sessions_list`, `sessions_history`, `sessions_send`, `session_status` | -| `minimal` | `session_status` only | +| Profile | What it includes | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------- | +| `full` | No restriction (same as unset) | +| `coding` | `group:fs`, `group:runtime`, `group:web`, `group:sessions`, `group:memory`, `cron`, `image`, `image_generate`, `video_generate` | +| `messaging` | `group:messaging`, `sessions_list`, `sessions_history`, `sessions_send`, `session_status` | +| `minimal` | `session_status` only | ### Tool groups @@ -144,7 +147,7 @@ Use `group:*` shorthands in allow/deny lists: | `group:messaging` | message | | `group:nodes` | nodes | | `group:agents` | 
agents_list | -| `group:media` | image, image_generate, tts | +| `group:media` | image, image_generate, video_generate, tts | | `group:openclaw` | All built-in OpenClaw tools (excludes plugin tools) | `sessions_history` returns a bounded, safety-filtered recall view. It strips diff --git a/docs/tools/video-generation.md b/docs/tools/video-generation.md new file mode 100644 index 00000000000..434f6c170be --- /dev/null +++ b/docs/tools/video-generation.md @@ -0,0 +1,109 @@ +--- +summary: "Generate videos using configured providers such as Qwen" +read_when: + - Generating videos via the agent + - Configuring video generation providers and models + - Understanding the video_generate tool parameters +title: "Video Generation" +--- + +# Video Generation + +The `video_generate` tool lets the agent create videos using your configured providers. Generated videos are delivered automatically as media attachments in the agent's reply. + + +The tool only appears when at least one video-generation provider is available. If you don't see `video_generate` in your agent's tools, configure `agents.defaults.videoGenerationModel` or set up a provider API key. + + +## Quick start + +1. Set an API key for at least one provider (for example `QWEN_API_KEY`). +2. Optionally set your preferred model: + +```json5 +{ + agents: { + defaults: { + videoGenerationModel: "qwen/wan2.6-t2v", + }, + }, +} +``` + +3. Ask the agent: _"Generate a 5-second cinematic video of a friendly lobster surfing at sunset."_ + +The agent calls `video_generate` automatically. No tool allow-listing needed — it's enabled by default when a provider is available. 
+ +## Supported providers + +| Provider | Default model | Reference inputs | API key | +| -------- | ------------- | ---------------- | ---------------------------------------------------------- | +| Qwen | `wan2.6-t2v` | Yes, remote URLs | `QWEN_API_KEY`, `MODELSTUDIO_API_KEY`, `DASHSCOPE_API_KEY` | + +Use `action: "list"` to inspect available providers and models at runtime: + +``` +/tool video_generate action=list +``` + +## Tool parameters + +| Parameter | Type | Description | +| ----------------- | -------- | ------------------------------------------------------------------------------------- | +| `prompt` | string | Video generation prompt (required for `action: "generate"`) | +| `action` | string | `"generate"` (default) or `"list"` to inspect providers | +| `model` | string | Provider/model override, e.g. `qwen/wan2.6-t2v` | +| `image` | string | Single reference image path or URL | +| `images` | string[] | Multiple reference images (up to 5) | +| `video` | string | Single reference video path or URL | +| `videos` | string[] | Multiple reference videos (up to 4) | +| `size` | string | Size hint when the provider supports it | +| `aspectRatio` | string | Aspect ratio: `1:1`, `2:3`, `3:2`, `3:4`, `4:3`, `4:5`, `5:4`, `9:16`, `16:9`, `21:9` | +| `resolution` | string | Resolution hint: `480P`, `720P`, or `1080P` | +| `durationSeconds` | number | Target duration in seconds | +| `audio` | boolean | Enable generated audio when the provider supports it | +| `watermark` | boolean | Toggle provider watermarking when supported | +| `filename` | string | Output filename hint | + +Not all providers support all parameters. The tool validates provider capability limits before it submits the request. 
+ +## Configuration + +### Model selection + +```json5 +{ + agents: { + defaults: { + videoGenerationModel: { + primary: "qwen/wan2.6-t2v", + fallbacks: ["qwen/wan2.6-r2v-flash"], + }, + }, + }, +} +``` + +### Provider selection order + +When generating a video, OpenClaw tries providers in this order: + +1. **`model` parameter** from the tool call (if the agent specifies one) +2. **`videoGenerationModel.primary`** from config +3. **`videoGenerationModel.fallbacks`** in order +4. **Auto-detection** — uses auth-backed provider defaults only: + - current default provider first + - remaining registered video-generation providers in provider-id order + +If a provider fails, the next candidate is tried automatically. If all fail, the error includes details from each attempt. + +## Qwen reference inputs + +The bundled Qwen provider supports text-to-video plus image/video reference modes, but the upstream DashScope video endpoint currently requires **remote http(s) URLs** for reference inputs. Local file paths and uploaded buffers are rejected up front instead of being silently ignored. 
+ +## Related + +- [Tools Overview](/tools) — all available agent tools +- [Qwen](/providers/qwen) — Qwen-specific setup and limits +- [Configuration Reference](/gateway/configuration-reference#agent-defaults) — `videoGenerationModel` config +- [Models](/concepts/models) — model configuration and failover diff --git a/extensions/qwen/video-generation-provider.test.ts b/extensions/qwen/video-generation-provider.test.ts index 93a6b121c92..5ccb35035f7 100644 --- a/extensions/qwen/video-generation-provider.test.ts +++ b/extensions/qwen/video-generation-provider.test.ts @@ -107,4 +107,21 @@ describe("qwen video generation provider", () => { }), ); }); + + it("fails fast when reference inputs are local buffers instead of remote URLs", async () => { + const provider = buildQwenVideoGenerationProvider(); + + await expect( + provider.generateVideo({ + provider: "qwen", + model: "wan2.6-i2v", + prompt: "animate this local frame", + cfg: {}, + inputImages: [{ buffer: Buffer.from("png-bytes"), mimeType: "image/png" }], + }), + ).rejects.toThrow( + "Qwen video generation currently requires remote http(s) URLs for reference images/videos.", + ); + expect(postJsonRequestMock).not.toHaveBeenCalled(); + }); }); diff --git a/extensions/qwen/video-generation-provider.ts b/extensions/qwen/video-generation-provider.ts index 8db83a4604f..207bb0b087c 100644 --- a/extensions/qwen/video-generation-provider.ts +++ b/extensions/qwen/video-generation-provider.ts @@ -90,7 +90,22 @@ function resolveReferenceUrls( .filter((value): value is string => Boolean(value)); } +function assertQwenReferenceInputsSupported( + inputImages: VideoGenerationSourceAsset[] | undefined, + inputVideos: VideoGenerationSourceAsset[] | undefined, +): void { + const unsupported = [...(inputImages ?? []), ...(inputVideos ?? 
[])].some( + (asset) => !asset.url?.trim() && asset.buffer, + ); + if (unsupported) { + throw new Error( + "Qwen video generation currently requires remote http(s) URLs for reference images/videos.", + ); + } +} + function buildQwenVideoGenerationInput(req: VideoGenerationRequest): Record { + assertQwenReferenceInputsSupported(req.inputImages, req.inputVideos); const input: Record = { prompt: req.prompt, }; diff --git a/src/agents/openclaw-tools.ts b/src/agents/openclaw-tools.ts index c9603d90926..ac2be1a84df 100644 --- a/src/agents/openclaw-tools.ts +++ b/src/agents/openclaw-tools.ts @@ -31,6 +31,7 @@ import { createSessionsYieldTool } from "./tools/sessions-yield-tool.js"; import { createSubagentsTool } from "./tools/subagents-tool.js"; import { createTtsTool } from "./tools/tts-tool.js"; import { createUpdatePlanTool } from "./tools/update-plan-tool.js"; +import { createVideoGenerateTool } from "./tools/video-generate-tool.js"; import { createWebFetchTool, createWebSearchTool } from "./tools/web-tools.js"; import { resolveWorkspaceRoot } from "./workspace-dir.js"; @@ -159,6 +160,13 @@ export function createOpenClawTools( sandbox, fsPolicy: options?.fsPolicy, }); + const videoGenerateTool = createVideoGenerateTool({ + config: options?.config, + agentDir: options?.agentDir, + workspaceDir, + sandbox, + fsPolicy: options?.fsPolicy, + }); const pdfTool = options?.agentDir?.trim() ? createPdfTool({ config: options?.config, @@ -216,6 +224,7 @@ export function createOpenClawTools( config: options?.config, }), ...(imageGenerateTool ? [imageGenerateTool] : []), + ...(videoGenerateTool ? 
[videoGenerateTool] : []), createGatewayTool({ agentSessionKey: options?.agentSessionKey, config: options?.config, diff --git a/src/agents/openclaw-tools.video-generation.test.ts b/src/agents/openclaw-tools.video-generation.test.ts new file mode 100644 index 00000000000..582ef89094a --- /dev/null +++ b/src/agents/openclaw-tools.video-generation.test.ts @@ -0,0 +1,91 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import type { OpenClawConfig } from "../config/config.js"; +import * as videoGenerationRuntime from "../video-generation/runtime.js"; +import { createOpenClawTools } from "./openclaw-tools.js"; + +vi.mock("../plugins/tools.js", () => ({ + resolvePluginTools: () => [], + copyPluginToolMeta: () => undefined, + getPluginToolMeta: () => undefined, +})); + +function asConfig(value: unknown): OpenClawConfig { + return value as OpenClawConfig; +} + +function stubVideoGenerationProviders() { + vi.spyOn(videoGenerationRuntime, "listRuntimeVideoGenerationProviders").mockReturnValue([ + { + id: "qwen", + defaultModel: "wan2.6-t2v", + models: ["wan2.6-t2v"], + capabilities: { + maxVideos: 1, + maxInputImages: 1, + maxInputVideos: 4, + maxDurationSeconds: 10, + supportsSize: true, + supportsAspectRatio: true, + supportsResolution: true, + supportsAudio: true, + supportsWatermark: true, + }, + generateVideo: vi.fn(async () => { + throw new Error("not used"); + }), + }, + ]); +} + +describe("openclaw tools video generation registration", () => { + beforeEach(() => { + vi.stubEnv("QWEN_API_KEY", ""); + vi.stubEnv("MODELSTUDIO_API_KEY", ""); + vi.stubEnv("DASHSCOPE_API_KEY", ""); + }); + + afterEach(() => { + vi.restoreAllMocks(); + vi.unstubAllEnvs(); + }); + + it("registers video_generate when video-generation config is present", () => { + const tools = createOpenClawTools({ + config: asConfig({ + agents: { + defaults: { + videoGenerationModel: { + primary: "qwen/wan2.6-t2v", + }, + }, + }, + }), + agentDir: "/tmp/openclaw-agent-main", + 
}); + + expect(tools.map((tool) => tool.name)).toContain("video_generate"); + }); + + it("registers video_generate when a compatible provider has env-backed auth", () => { + stubVideoGenerationProviders(); + vi.stubEnv("QWEN_API_KEY", "qwen-test"); + + const tools = createOpenClawTools({ + config: asConfig({}), + agentDir: "/tmp/openclaw-agent-main", + }); + + expect(tools.map((tool) => tool.name)).toContain("video_generate"); + }); + + it("omits video_generate when config is absent and no compatible provider auth exists", () => { + stubVideoGenerationProviders(); + + const tools = createOpenClawTools({ + config: asConfig({}), + agentDir: "/tmp/openclaw-agent-main", + }); + + expect(tools.map((tool) => tool.name)).not.toContain("video_generate"); + }); +}); diff --git a/src/agents/openclaw-tools.web-runtime.test.ts b/src/agents/openclaw-tools.web-runtime.test.ts index 5240d42b9a9..77bab620be6 100644 --- a/src/agents/openclaw-tools.web-runtime.test.ts +++ b/src/agents/openclaw-tools.web-runtime.test.ts @@ -46,6 +46,9 @@ vi.mock("./tools/gateway-tool.js", () => ({ vi.mock("./tools/image-generate-tool.js", () => ({ createImageGenerateTool: mockToolFactory("image_generate_stub"), })); +vi.mock("./tools/video-generate-tool.js", () => ({ + createVideoGenerateTool: mockToolFactory("video_generate_stub"), +})); vi.mock("./tools/image-tool.js", () => ({ createImageTool: mockToolFactory("image_stub"), })); diff --git a/src/agents/pi-embedded-subscribe.tools.media.test.ts b/src/agents/pi-embedded-subscribe.tools.media.test.ts index 9d61d7f30e5..545c2d3b770 100644 --- a/src/agents/pi-embedded-subscribe.tools.media.test.ts +++ b/src/agents/pi-embedded-subscribe.tools.media.test.ts @@ -265,6 +265,10 @@ describe("extractToolResultMediaPaths", () => { expect(isToolResultMediaTrusted("image_generate")).toBe(true); }); + it("trusts video_generate local MEDIA paths", () => { + expect(isToolResultMediaTrusted("video_generate")).toBe(true); + }); + it("does not trust local MEDIA paths 
for MCP-provenance results", () => { expect( filterToolResultMediaUrls("browser", ["/tmp/screenshot.png"], { diff --git a/src/agents/pi-embedded-subscribe.tools.ts b/src/agents/pi-embedded-subscribe.tools.ts index a0c4e6ec864..f792c5cc6f5 100644 --- a/src/agents/pi-embedded-subscribe.tools.ts +++ b/src/agents/pi-embedded-subscribe.tools.ts @@ -156,6 +156,7 @@ const TRUSTED_TOOL_RESULT_MEDIA = new Set([ "sessions_spawn", "subagents", "tts", + "video_generate", "web_fetch", "web_search", "x_search", diff --git a/src/agents/test-helpers/fast-openclaw-tools.ts b/src/agents/test-helpers/fast-openclaw-tools.ts index 408b8148491..34f26eaa887 100644 --- a/src/agents/test-helpers/fast-openclaw-tools.ts +++ b/src/agents/test-helpers/fast-openclaw-tools.ts @@ -32,6 +32,7 @@ const coreTools = [ stubActionTool("session_status", ["get", "show"]), stubTool("tts"), stubTool("image_generate"), + stubTool("video_generate"), stubTool("web_fetch"), stubTool("image"), stubTool("pdf"), diff --git a/src/agents/test-helpers/fast-tool-stubs.ts b/src/agents/test-helpers/fast-tool-stubs.ts index 7eb885f503b..d86eede29f0 100644 --- a/src/agents/test-helpers/fast-tool-stubs.ts +++ b/src/agents/test-helpers/fast-tool-stubs.ts @@ -23,6 +23,10 @@ vi.mock("../tools/image-generate-tool.js", () => ({ createImageGenerateTool: () => stubTool("image_generate"), })); +vi.mock("../tools/video-generate-tool.js", () => ({ + createVideoGenerateTool: () => stubTool("video_generate"), +})); + vi.mock("../tools/web-tools.js", () => ({ createWebSearchTool: () => null, createWebFetchTool: () => null, diff --git a/src/agents/tool-catalog.test.ts b/src/agents/tool-catalog.test.ts index 55ea1530465..216b0fcab84 100644 --- a/src/agents/tool-catalog.test.ts +++ b/src/agents/tool-catalog.test.ts @@ -10,6 +10,7 @@ describe("tool-catalog", () => { expect(policy!.allow).toContain("x_search"); expect(policy!.allow).toContain("web_fetch"); expect(policy!.allow).toContain("image_generate"); + 
expect(policy!.allow).toContain("video_generate"); expect(policy!.allow).toContain("update_plan"); }); }); diff --git a/src/agents/tool-catalog.ts b/src/agents/tool-catalog.ts index fe4d1e96e30..c28249fd584 100644 --- a/src/agents/tool-catalog.ts +++ b/src/agents/tool-catalog.ts @@ -277,6 +277,14 @@ const CORE_TOOL_DEFINITIONS: CoreToolDefinition[] = [ profiles: ["coding"], includeInOpenClawGroup: true, }, + { + id: "video_generate", + label: "video_generate", + description: "Video generation", + sectionId: "media", + profiles: ["coding"], + includeInOpenClawGroup: true, + }, { id: "tts", label: "tts", diff --git a/src/agents/tool-display-config.ts b/src/agents/tool-display-config.ts index 4e6948500f2..f5dcd69f6a2 100644 --- a/src/agents/tool-display-config.ts +++ b/src/agents/tool-display-config.ts @@ -640,6 +640,28 @@ export const TOOL_DISPLAY_CONFIG: ToolDisplayConfig = { }, }, }, + video_generate: { + emoji: "🎬", + title: "Video Generation", + actions: { + generate: { + label: "generate", + detailKeys: [ + "prompt", + "model", + "durationSeconds", + "resolution", + "aspectRatio", + "audio", + "watermark", + ], + }, + list: { + label: "list", + detailKeys: ["provider", "model"], + }, + }, + }, pdf: { emoji: "📑", title: "PDF", diff --git a/src/agents/tools/media-tool-shared.ts b/src/agents/tools/media-tool-shared.ts index 6b65c595b5e..4e384380442 100644 --- a/src/agents/tools/media-tool-shared.ts +++ b/src/agents/tools/media-tool-shared.ts @@ -32,9 +32,16 @@ export function applyImageGenerationModelConfigDefaults( return applyAgentDefaultModelConfig(cfg, "imageGenerationModel", imageGenerationModelConfig); } +export function applyVideoGenerationModelConfigDefaults( + cfg: OpenClawConfig | undefined, + videoGenerationModelConfig: ToolModelConfig, +): OpenClawConfig | undefined { + return applyAgentDefaultModelConfig(cfg, "videoGenerationModel", videoGenerationModelConfig); +} + function applyAgentDefaultModelConfig( cfg: OpenClawConfig | undefined, - key: 
"imageModel" | "imageGenerationModel", + key: "imageModel" | "imageGenerationModel" | "videoGenerationModel", modelConfig: ToolModelConfig, ): OpenClawConfig | undefined { if (!cfg) { diff --git a/src/agents/tools/video-generate-tool.test.ts b/src/agents/tools/video-generate-tool.test.ts new file mode 100644 index 00000000000..a69d2980642 --- /dev/null +++ b/src/agents/tools/video-generate-tool.test.ts @@ -0,0 +1,91 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import type { OpenClawConfig } from "../../config/config.js"; +import * as mediaStore from "../../media/store.js"; +import * as videoGenerationRuntime from "../../video-generation/runtime.js"; +import { createVideoGenerateTool } from "./video-generate-tool.js"; + +function asConfig(value: unknown): OpenClawConfig { + return value as OpenClawConfig; +} + +describe("createVideoGenerateTool", () => { + beforeEach(() => { + vi.restoreAllMocks(); + }); + + afterEach(() => { + vi.unstubAllEnvs(); + }); + + it("returns null when no video-generation config or auth-backed provider is available", () => { + vi.spyOn(videoGenerationRuntime, "listRuntimeVideoGenerationProviders").mockReturnValue([]); + + expect(createVideoGenerateTool({ config: asConfig({}) })).toBeNull(); + }); + + it("registers when video-generation config is present", () => { + expect( + createVideoGenerateTool({ + config: asConfig({ + agents: { + defaults: { + videoGenerationModel: { primary: "qwen/wan2.6-t2v" }, + }, + }, + }), + }), + ).not.toBeNull(); + }); + + it("generates videos, saves them, and emits MEDIA paths", async () => { + vi.spyOn(videoGenerationRuntime, "generateVideo").mockResolvedValue({ + provider: "qwen", + model: "wan2.6-t2v", + attempts: [], + videos: [ + { + buffer: Buffer.from("video-bytes"), + mimeType: "video/mp4", + fileName: "lobster.mp4", + }, + ], + metadata: { taskId: "task-1" }, + }); + vi.spyOn(mediaStore, "saveMediaBuffer").mockResolvedValueOnce({ + path: "/tmp/generated-lobster.mp4", 
+ id: "generated-lobster.mp4", + size: 11, + contentType: "video/mp4", + }); + + const tool = createVideoGenerateTool({ + config: asConfig({ + agents: { + defaults: { + videoGenerationModel: { primary: "qwen/wan2.6-t2v" }, + }, + }, + }), + }); + expect(tool).not.toBeNull(); + if (!tool) { + throw new Error("expected video_generate tool"); + } + + const result = await tool.execute("call-1", { prompt: "friendly lobster surfing" }); + const text = (result.content?.[0] as { text: string } | undefined)?.text ?? ""; + + expect(text).toContain("Generated 1 video with qwen/wan2.6-t2v."); + expect(text).toContain("MEDIA:/tmp/generated-lobster.mp4"); + expect(result.details).toMatchObject({ + provider: "qwen", + model: "wan2.6-t2v", + count: 1, + media: { + mediaUrls: ["/tmp/generated-lobster.mp4"], + }, + paths: ["/tmp/generated-lobster.mp4"], + metadata: { taskId: "task-1" }, + }); + }); +}); diff --git a/src/agents/tools/video-generate-tool.ts b/src/agents/tools/video-generate-tool.ts new file mode 100644 index 00000000000..a75069f60e0 --- /dev/null +++ b/src/agents/tools/video-generate-tool.ts @@ -0,0 +1,735 @@ +import { Type } from "@sinclair/typebox"; +import type { OpenClawConfig } from "../../config/config.js"; +import { loadConfig } from "../../config/config.js"; +import { saveMediaBuffer } from "../../media/store.js"; +import { loadWebMedia } from "../../media/web-media.js"; +import { readSnakeCaseParamRaw } from "../../param-key.js"; +import { getProviderEnvVars } from "../../secrets/provider-env-vars.js"; +import { resolveUserPath } from "../../utils.js"; +import { parseVideoGenerationModelRef } from "../../video-generation/model-ref.js"; +import { + generateVideo, + listRuntimeVideoGenerationProviders, +} from "../../video-generation/runtime.js"; +import type { + VideoGenerationProvider, + VideoGenerationResolution, + VideoGenerationSourceAsset, +} from "../../video-generation/types.js"; +import { normalizeProviderId } from "../provider-id.js"; +import { + 
ToolInputError, + readNumberParam, + readStringArrayParam, + readStringParam, +} from "./common.js"; +import { decodeDataUrl } from "./image-tool.helpers.js"; +import { + applyVideoGenerationModelConfigDefaults, + resolveMediaToolLocalRoots, +} from "./media-tool-shared.js"; +import { + buildToolModelConfigFromCandidates, + coerceToolModelConfig, + hasAuthForProvider, + hasToolModelConfig, + resolveDefaultModelRef, + type ToolModelConfig, +} from "./model-config.helpers.js"; +import { + createSandboxBridgeReadFile, + resolveSandboxedBridgeMediaPath, + type AnyAgentTool, + type SandboxFsBridge, + type ToolFsPolicy, +} from "./tool-runtime.helpers.js"; + +const MAX_INPUT_IMAGES = 5; +const MAX_INPUT_VIDEOS = 4; +const SUPPORTED_ASPECT_RATIOS = new Set([ + "1:1", + "2:3", + "3:2", + "3:4", + "4:3", + "4:5", + "5:4", + "9:16", + "16:9", + "21:9", +]); + +const VideoGenerateToolSchema = Type.Object({ + action: Type.Optional( + Type.String({ + description: + 'Optional action: "generate" (default) or "list" to inspect available providers/models.', + }), + ), + prompt: Type.Optional(Type.String({ description: "Video generation prompt." })), + image: Type.Optional( + Type.String({ + description: "Optional single reference image path or URL.", + }), + ), + images: Type.Optional( + Type.Array(Type.String(), { + description: `Optional reference images (up to ${MAX_INPUT_IMAGES}).`, + }), + ), + video: Type.Optional( + Type.String({ + description: "Optional single reference video path or URL.", + }), + ), + videos: Type.Optional( + Type.Array(Type.String(), { + description: `Optional reference videos (up to ${MAX_INPUT_VIDEOS}).`, + }), + ), + model: Type.Optional( + Type.String({ description: "Optional provider/model override, e.g. qwen/wan2.6-t2v." }), + ), + filename: Type.Optional( + Type.String({ + description: + "Optional output filename hint. 
OpenClaw preserves the basename and saves under its managed media directory.",
+    }),
+  ),
+  size: Type.Optional(
+    Type.String({
+      description: "Optional size hint like 1280x720 or 1920x1080 when the provider supports it.",
+    }),
+  ),
+  aspectRatio: Type.Optional(
+    Type.String({
+      description:
+        "Optional aspect ratio hint: 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, or 21:9.",
+    }),
+  ),
+  resolution: Type.Optional(
+    Type.String({
+      description: "Optional resolution hint: 480P, 720P, or 1080P.",
+    }),
+  ),
+  durationSeconds: Type.Optional(
+    Type.Number({
+      description: "Optional target duration in seconds.",
+      minimum: 1,
+    }),
+  ),
+  audio: Type.Optional(
+    Type.Boolean({
+      description: "Optional audio toggle when the provider supports generated audio.",
+    }),
+  ),
+  watermark: Type.Optional(
+    Type.Boolean({
+      description: "Optional watermark toggle when the provider supports it.",
+    }),
+  ),
+});
+
+function getVideoGenerationProviderAuthEnvVars(providerId: string): string[] {
+  return getProviderEnvVars(providerId);
+}
+
+function resolveVideoGenerationModelCandidates(params: {
+  cfg?: OpenClawConfig;
+  agentDir?: string;
+}): Array<string> {
+  const providerDefaults = new Map<string, string>();
+  for (const provider of listRuntimeVideoGenerationProviders({ config: params.cfg })) {
+    const providerId = provider.id.trim();
+    const modelId = provider.defaultModel?.trim();
+    if (
+      !providerId ||
+      !modelId ||
+      providerDefaults.has(providerId) ||
+      !isVideoGenerationProviderConfigured({
+        provider,
+        cfg: params.cfg,
+        agentDir: params.agentDir,
+      })
+    ) {
+      continue;
+    }
+    providerDefaults.set(providerId, `${providerId}/${modelId}`);
+  }
+
+  const primaryProvider = resolveDefaultModelRef(params.cfg).provider;
+  const orderedProviders = [
+    primaryProvider,
+    ...[...providerDefaults.keys()]
+      .filter((providerId) => providerId !== primaryProvider)
+      .toSorted(),
+  ];
+  const orderedRefs: string[] = [];
+  const seen = new Set<string>();
+  for (const providerId of orderedProviders) {
+    
const ref = providerDefaults.get(providerId); + if (!ref || seen.has(ref)) { + continue; + } + seen.add(ref); + orderedRefs.push(ref); + } + return orderedRefs; +} + +export function resolveVideoGenerationModelConfigForTool(params: { + cfg?: OpenClawConfig; + agentDir?: string; +}): ToolModelConfig | null { + const explicit = coerceToolModelConfig(params.cfg?.agents?.defaults?.videoGenerationModel); + if (hasToolModelConfig(explicit)) { + return explicit; + } + return buildToolModelConfigFromCandidates({ + explicit, + agentDir: params.agentDir, + candidates: resolveVideoGenerationModelCandidates(params), + isProviderConfigured: (providerId) => + isVideoGenerationProviderConfigured({ + providerId, + cfg: params.cfg, + agentDir: params.agentDir, + }), + }); +} + +function isVideoGenerationProviderConfigured(params: { + provider?: VideoGenerationProvider; + providerId?: string; + cfg?: OpenClawConfig; + agentDir?: string; +}): boolean { + const provider = + params.provider ?? + listRuntimeVideoGenerationProviders({ config: params.cfg }).find((candidate) => { + const normalizedId = normalizeProviderId(params.providerId ?? ""); + return ( + normalizeProviderId(candidate.id) === normalizedId || + (candidate.aliases ?? []).some((alias) => normalizeProviderId(alias) === normalizedId) + ); + }); + if (!provider) { + return params.providerId + ? 
hasAuthForProvider({ provider: params.providerId, agentDir: params.agentDir })
+      : false;
+  }
+  if (provider.isConfigured) {
+    return provider.isConfigured({
+      cfg: params.cfg,
+      agentDir: params.agentDir,
+    });
+  }
+  return hasAuthForProvider({ provider: provider.id, agentDir: params.agentDir });
+}
+
+function resolveAction(args: Record<string, unknown>): "generate" | "list" {
+  const raw = readStringParam(args, "action");
+  if (!raw) {
+    return "generate";
+  }
+  const normalized = raw.trim().toLowerCase();
+  if (normalized === "generate" || normalized === "list") {
+    return normalized;
+  }
+  throw new ToolInputError('action must be "generate" or "list"');
+}
+
+function normalizeResolution(raw: string | undefined): VideoGenerationResolution | undefined {
+  const normalized = raw?.trim().toUpperCase();
+  if (!normalized) {
+    return undefined;
+  }
+  if (normalized === "480P" || normalized === "720P" || normalized === "1080P") {
+    return normalized;
+  }
+  throw new ToolInputError("resolution must be one of 480P, 720P, or 1080P");
+}
+
+function normalizeAspectRatio(raw: string | undefined): string | undefined {
+  const normalized = raw?.trim();
+  if (!normalized) {
+    return undefined;
+  }
+  if (SUPPORTED_ASPECT_RATIOS.has(normalized)) {
+    return normalized;
+  }
+  throw new ToolInputError(
+    "aspectRatio must be one of 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, or 21:9",
+  );
+}
+
+function readBooleanParam(params: Record<string, unknown>, key: string): boolean | undefined {
+  const raw = readSnakeCaseParamRaw(params, key);
+  if (typeof raw === "boolean") {
+    return raw;
+  }
+  if (typeof raw === "string") {
+    const normalized = raw.trim().toLowerCase();
+    if (normalized === "true") {
+      return true;
+    }
+    if (normalized === "false") {
+      return false;
+    }
+  }
+  return undefined;
+}
+
+function normalizeReferenceInputs(params: {
+  args: Record<string, unknown>;
+  singularKey: "image" | "video";
+  pluralKey: "images" | "videos";
+  maxCount: number;
+}): string[] {
+  const single = readStringParam(params.args, 
params.singularKey); + const multiple = readStringArrayParam(params.args, params.pluralKey); + const combined = [...(single ? [single] : []), ...(multiple ?? [])]; + const deduped: string[] = []; + const seen = new Set(); + for (const candidate of combined) { + const trimmed = candidate.trim(); + const dedupe = trimmed.startsWith("@") ? trimmed.slice(1).trim() : trimmed; + if (!dedupe || seen.has(dedupe)) { + continue; + } + seen.add(dedupe); + deduped.push(trimmed); + } + if (deduped.length > params.maxCount) { + throw new ToolInputError( + `Too many reference ${params.pluralKey}: ${deduped.length} provided, maximum is ${params.maxCount}.`, + ); + } + return deduped; +} + +function resolveSelectedVideoGenerationProvider(params: { + config?: OpenClawConfig; + videoGenerationModelConfig: ToolModelConfig; + modelOverride?: string; +}): VideoGenerationProvider | undefined { + const selectedRef = + parseVideoGenerationModelRef(params.modelOverride) ?? + parseVideoGenerationModelRef(params.videoGenerationModelConfig.primary); + if (!selectedRef) { + return undefined; + } + const selectedProvider = normalizeProviderId(selectedRef.provider); + return listRuntimeVideoGenerationProviders({ config: params.config }).find( + (provider) => + normalizeProviderId(provider.id) === selectedProvider || + (provider.aliases ?? []).some((alias) => normalizeProviderId(alias) === selectedProvider), + ); +} + +function validateVideoGenerationCapabilities(params: { + provider: VideoGenerationProvider | undefined; + inputImageCount: number; + inputVideoCount: number; + size?: string; + aspectRatio?: string; + resolution?: VideoGenerationResolution; + durationSeconds?: number; + audio?: boolean; + watermark?: boolean; +}) { + const provider = params.provider; + if (!provider) { + return; + } + const caps = provider.capabilities; + if (params.inputImageCount > 0) { + const maxInputImages = caps.maxInputImages ?? 
MAX_INPUT_IMAGES; + if (params.inputImageCount > maxInputImages) { + throw new ToolInputError( + `${provider.id} supports at most ${maxInputImages} reference image${maxInputImages === 1 ? "" : "s"}.`, + ); + } + } + if (params.inputVideoCount > 0) { + const maxInputVideos = caps.maxInputVideos ?? MAX_INPUT_VIDEOS; + if (params.inputVideoCount > maxInputVideos) { + throw new ToolInputError( + `${provider.id} supports at most ${maxInputVideos} reference video${maxInputVideos === 1 ? "" : "s"}.`, + ); + } + } + if (params.size && !caps.supportsSize) { + throw new ToolInputError(`${provider.id} does not support size overrides.`); + } + if (params.aspectRatio && !caps.supportsAspectRatio) { + throw new ToolInputError(`${provider.id} does not support aspectRatio overrides.`); + } + if (params.resolution && !caps.supportsResolution) { + throw new ToolInputError(`${provider.id} does not support resolution overrides.`); + } + if ( + typeof params.durationSeconds === "number" && + Number.isFinite(params.durationSeconds) && + typeof caps.maxDurationSeconds === "number" && + params.durationSeconds > caps.maxDurationSeconds + ) { + throw new ToolInputError( + `${provider.id} supports at most ${caps.maxDurationSeconds} seconds per video.`, + ); + } + if (typeof params.audio === "boolean" && !caps.supportsAudio) { + throw new ToolInputError(`${provider.id} does not support audio toggles.`); + } + if (typeof params.watermark === "boolean" && !caps.supportsWatermark) { + throw new ToolInputError(`${provider.id} does not support watermark toggles.`); + } +} + +type VideoGenerateSandboxConfig = { + root: string; + bridge: SandboxFsBridge; +}; + +async function loadReferenceAssets(params: { + inputs: string[]; + expectedKind: "image" | "video"; + maxBytes?: number; + workspaceDir?: string; + sandboxConfig: { root: string; bridge: SandboxFsBridge; workspaceOnly: boolean } | null; +}): Promise< + Array<{ + sourceAsset: VideoGenerationSourceAsset; + resolvedInput: string; + 
rewrittenFrom?: string; + }> +> { + const loaded: Array<{ + sourceAsset: VideoGenerationSourceAsset; + resolvedInput: string; + rewrittenFrom?: string; + }> = []; + + for (const rawInput of params.inputs) { + const trimmed = rawInput.trim(); + const inputRaw = trimmed.startsWith("@") ? trimmed.slice(1).trim() : trimmed; + if (!inputRaw) { + throw new ToolInputError(`${params.expectedKind} required (empty string in array)`); + } + const looksLikeWindowsDrivePath = /^[a-zA-Z]:[\\/]/.test(inputRaw); + const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(inputRaw); + const isFileUrl = /^file:/i.test(inputRaw); + const isHttpUrl = /^https?:\/\//i.test(inputRaw); + const isDataUrl = /^data:/i.test(inputRaw); + if (hasScheme && !looksLikeWindowsDrivePath && !isFileUrl && !isHttpUrl && !isDataUrl) { + throw new ToolInputError( + `Unsupported ${params.expectedKind} reference: ${rawInput}. Use a file path, a file:// URL, a data: URL, or an http(s) URL.`, + ); + } + if (params.sandboxConfig && isHttpUrl) { + throw new ToolInputError( + `Sandboxed video_generate does not allow remote ${params.expectedKind} URLs.`, + ); + } + + const resolvedInput = (() => { + if (params.sandboxConfig) { + return inputRaw; + } + if (inputRaw.startsWith("~")) { + return resolveUserPath(inputRaw); + } + return inputRaw; + })(); + + if (isHttpUrl && !params.sandboxConfig) { + loaded.push({ + sourceAsset: { url: resolvedInput }, + resolvedInput, + }); + continue; + } + + const resolvedPathInfo: { resolved: string; rewrittenFrom?: string } = isDataUrl + ? { resolved: "" } + : params.sandboxConfig + ? await resolveSandboxedBridgeMediaPath({ + sandbox: params.sandboxConfig, + mediaPath: resolvedInput, + inboundFallbackDir: "media/inbound", + }) + : { + resolved: resolvedInput.startsWith("file://") + ? resolvedInput.slice("file://".length) + : resolvedInput, + }; + const resolvedPath = isDataUrl ? 
null : resolvedPathInfo.resolved; + const localRoots = resolveMediaToolLocalRoots( + params.workspaceDir, + { + workspaceOnly: params.sandboxConfig?.workspaceOnly === true, + }, + resolvedPath ? [resolvedPath] : undefined, + ); + const media = isDataUrl + ? params.expectedKind === "image" + ? decodeDataUrl(resolvedInput) + : (() => { + throw new ToolInputError("Video data: URLs are not supported for video_generate."); + })() + : params.sandboxConfig + ? await loadWebMedia(resolvedPath ?? resolvedInput, { + maxBytes: params.maxBytes, + sandboxValidated: true, + readFile: createSandboxBridgeReadFile({ sandbox: params.sandboxConfig }), + }) + : await loadWebMedia(resolvedPath ?? resolvedInput, { + maxBytes: params.maxBytes, + localRoots, + }); + if (media.kind !== params.expectedKind) { + throw new ToolInputError(`Unsupported media type: ${media.kind ?? "unknown"}`); + } + const mimeType = "mimeType" in media ? media.mimeType : media.contentType; + const fileName = "fileName" in media ? media.fileName : undefined; + loaded.push({ + sourceAsset: { + buffer: media.buffer, + mimeType, + fileName, + }, + resolvedInput, + ...(resolvedPathInfo.rewrittenFrom ? { rewrittenFrom: resolvedPathInfo.rewrittenFrom } : {}), + }); + } + + return loaded; +} + +export function createVideoGenerateTool(options?: { + config?: OpenClawConfig; + agentDir?: string; + workspaceDir?: string; + sandbox?: VideoGenerateSandboxConfig; + fsPolicy?: ToolFsPolicy; +}): AnyAgentTool | null { + const cfg: OpenClawConfig = options?.config ?? loadConfig(); + const videoGenerationModelConfig = resolveVideoGenerationModelConfigForTool({ + cfg, + agentDir: options?.agentDir, + }); + if (!videoGenerationModelConfig) { + return null; + } + + const sandboxConfig = options?.sandbox + ? 
{
+        root: options.sandbox.root,
+        bridge: options.sandbox.bridge,
+        workspaceOnly: options.fsPolicy?.workspaceOnly === true,
+      }
+    : null;
+
+  return {
+    label: "Video Generation",
+    name: "video_generate",
+    displaySummary: "Generate videos",
+    description:
+      "Generate videos using configured providers. Generated videos are saved under OpenClaw-managed media storage and delivered automatically as attachments.",
+    parameters: VideoGenerateToolSchema,
+    execute: async (_toolCallId, rawArgs) => {
+      const args = rawArgs as Record<string, unknown>;
+      const action = resolveAction(args);
+      const effectiveCfg =
+        applyVideoGenerationModelConfigDefaults(cfg, videoGenerationModelConfig) ?? cfg;
+
+      if (action === "list") {
+        const providers = listRuntimeVideoGenerationProviders({ config: effectiveCfg });
+        if (providers.length === 0) {
+          return {
+            content: [{ type: "text", text: "No video-generation providers are registered." }],
+            details: { providers: [] },
+          };
+        }
+        const lines = providers.map((provider) => {
+          const authHints = getVideoGenerationProviderAuthEnvVars(provider.id);
+          const capabilities = [
+            provider.capabilities.maxVideos ? `maxVideos=${provider.capabilities.maxVideos}` : null,
+            provider.capabilities.maxInputImages
+              ? `maxInputImages=${provider.capabilities.maxInputImages}`
+              : null,
+            provider.capabilities.maxInputVideos
+              ? `maxInputVideos=${provider.capabilities.maxInputVideos}`
+              : null,
+            provider.capabilities.maxDurationSeconds
+              ? `maxDurationSeconds=${provider.capabilities.maxDurationSeconds}`
+              : null,
+            provider.capabilities.supportsResolution ? "resolution" : null,
+            provider.capabilities.supportsAspectRatio ? "aspectRatio" : null,
+            provider.capabilities.supportsSize ? "size" : null,
+            provider.capabilities.supportsAudio ? "audio" : null,
+            provider.capabilities.supportsWatermark ? "watermark" : null,
+          ]
+            .filter((entry): entry is string => Boolean(entry))
+            .join(", ");
+          return [
+            `${provider.id}: default=${provider.defaultModel ?? 
"none"}`, + provider.models?.length ? `models=${provider.models.join(", ")}` : null, + capabilities ? `capabilities=${capabilities}` : null, + authHints.length > 0 ? `auth=${authHints.join(" / ")}` : null, + ] + .filter((entry): entry is string => Boolean(entry)) + .join(" | "); + }); + return { + content: [{ type: "text", text: lines.join("\n") }], + details: { + providers: providers.map((provider) => ({ + id: provider.id, + defaultModel: provider.defaultModel, + models: provider.models ?? [], + authEnvVars: getVideoGenerationProviderAuthEnvVars(provider.id), + capabilities: provider.capabilities, + })), + }, + }; + } + + const prompt = readStringParam(args, "prompt", { required: true }); + const model = readStringParam(args, "model"); + const filename = readStringParam(args, "filename"); + const size = readStringParam(args, "size"); + const aspectRatio = normalizeAspectRatio(readStringParam(args, "aspectRatio")); + const resolution = normalizeResolution(readStringParam(args, "resolution")); + const durationSeconds = readNumberParam(args, "durationSeconds", { + integer: true, + strict: true, + }); + const audio = readBooleanParam(args, "audio"); + const watermark = readBooleanParam(args, "watermark"); + const imageInputs = normalizeReferenceInputs({ + args, + singularKey: "image", + pluralKey: "images", + maxCount: MAX_INPUT_IMAGES, + }); + const videoInputs = normalizeReferenceInputs({ + args, + singularKey: "video", + pluralKey: "videos", + maxCount: MAX_INPUT_VIDEOS, + }); + + const selectedProvider = resolveSelectedVideoGenerationProvider({ + config: effectiveCfg, + videoGenerationModelConfig, + modelOverride: model, + }); + const loadedReferenceImages = await loadReferenceAssets({ + inputs: imageInputs, + expectedKind: "image", + workspaceDir: options?.workspaceDir, + sandboxConfig, + }); + const loadedReferenceVideos = await loadReferenceAssets({ + inputs: videoInputs, + expectedKind: "video", + workspaceDir: options?.workspaceDir, + sandboxConfig, + }); + 
validateVideoGenerationCapabilities({ + provider: selectedProvider, + inputImageCount: loadedReferenceImages.length, + inputVideoCount: loadedReferenceVideos.length, + size, + aspectRatio, + resolution, + durationSeconds, + audio, + watermark, + }); + + const result = await generateVideo({ + cfg: effectiveCfg, + prompt, + agentDir: options?.agentDir, + modelOverride: model, + size, + aspectRatio, + resolution, + durationSeconds, + audio, + watermark, + inputImages: loadedReferenceImages.map((entry) => entry.sourceAsset), + inputVideos: loadedReferenceVideos.map((entry) => entry.sourceAsset), + }); + const savedVideos = await Promise.all( + result.videos.map((video) => + saveMediaBuffer( + video.buffer, + video.mimeType, + "tool-video-generation", + undefined, + filename || video.fileName, + ), + ), + ); + const lines = [ + `Generated ${savedVideos.length} video${savedVideos.length === 1 ? "" : "s"} with ${result.provider}/${result.model}.`, + ...savedVideos.map((video) => `MEDIA:${video.path}`), + ]; + + return { + content: [{ type: "text", text: lines.join("\n") }], + details: { + provider: result.provider, + model: result.model, + count: savedVideos.length, + media: { + mediaUrls: savedVideos.map((video) => video.path), + }, + paths: savedVideos.map((video) => video.path), + ...(loadedReferenceImages.length === 1 + ? { + image: loadedReferenceImages[0]?.resolvedInput, + ...(loadedReferenceImages[0]?.rewrittenFrom + ? { rewrittenFrom: loadedReferenceImages[0].rewrittenFrom } + : {}), + } + : loadedReferenceImages.length > 1 + ? { + images: loadedReferenceImages.map((entry) => ({ + image: entry.resolvedInput, + ...(entry.rewrittenFrom ? { rewrittenFrom: entry.rewrittenFrom } : {}), + })), + } + : {}), + ...(loadedReferenceVideos.length === 1 + ? { + video: loadedReferenceVideos[0]?.resolvedInput, + ...(loadedReferenceVideos[0]?.rewrittenFrom + ? { videoRewrittenFrom: loadedReferenceVideos[0].rewrittenFrom } + : {}), + } + : loadedReferenceVideos.length > 1 + ? 
{ + videos: loadedReferenceVideos.map((entry) => ({ + video: entry.resolvedInput, + ...(entry.rewrittenFrom ? { rewrittenFrom: entry.rewrittenFrom } : {}), + })), + } + : {}), + ...(size ? { size } : {}), + ...(aspectRatio ? { aspectRatio } : {}), + ...(resolution ? { resolution } : {}), + ...(typeof durationSeconds === "number" ? { durationSeconds } : {}), + ...(typeof audio === "boolean" ? { audio } : {}), + ...(typeof watermark === "boolean" ? { watermark } : {}), + ...(filename ? { filename } : {}), + attempts: result.attempts, + metadata: result.metadata, + }, + }; + }, + }; +} diff --git a/src/cli/config-cli.test.ts b/src/cli/config-cli.test.ts index e82d9885f85..2334f0dc826 100644 --- a/src/cli/config-cli.test.ts +++ b/src/cli/config-cli.test.ts @@ -242,6 +242,37 @@ describe("config cli", () => { expect(written.gateway?.auth).toEqual({ mode: "token" }); }); + it("writes agents.defaults.videoGenerationModel.primary without disturbing sibling defaults", async () => { + const resolved: OpenClawConfig = { + agents: { + defaults: { + model: "openai/gpt-5.4", + imageGenerationModel: { + primary: "openai/gpt-image-1", + }, + }, + }, + }; + setSnapshot(resolved, resolved); + + await runConfigCommand([ + "config", + "set", + "agents.defaults.videoGenerationModel.primary", + "qwen/wan2.6-t2v", + ]); + + expect(mockWriteConfigFile).toHaveBeenCalledTimes(1); + const written = mockWriteConfigFile.mock.calls[0]?.[0]; + expect(written.agents?.defaults?.model).toBe("openai/gpt-5.4"); + expect(written.agents?.defaults?.imageGenerationModel).toEqual({ + primary: "openai/gpt-image-1", + }); + expect(written.agents?.defaults?.videoGenerationModel).toEqual({ + primary: "qwen/wan2.6-t2v", + }); + }); + it("drops gateway.auth.password when switching mode to token", async () => { const resolved: OpenClawConfig = { gateway: { diff --git a/src/config/schema.base.generated.test.ts b/src/config/schema.base.generated.test.ts index 647181d8241..d7bc682947f 100644 --- 
a/src/config/schema.base.generated.test.ts
+++ b/src/config/schema.base.generated.test.ts
@@ -40,4 +40,25 @@ describe("generated base config schema", () => {
     expect(hooksInternalProperties?.handlers).toBeUndefined();
     expect(uiHints["hooks.internal.handlers"]).toBeUndefined();
   });
+
+  it("includes videoGenerationModel in the public schema payload", () => {
+    const agentDefaultsProperties = (
+      GENERATED_BASE_CONFIG_SCHEMA.schema as {
+        properties?: {
+          agents?: {
+            properties?: {
+              defaults?: {
+                properties?: Record<string, unknown>;
+              };
+            };
+          };
+        };
+      }
+    ).properties?.agents?.properties?.defaults?.properties;
+    const uiHints = GENERATED_BASE_CONFIG_SCHEMA.uiHints as Record<string, unknown>;
+
+    expect(agentDefaultsProperties?.videoGenerationModel).toBeDefined();
+    expect(uiHints["agents.defaults.videoGenerationModel.primary"]).toBeDefined();
+    expect(uiHints["agents.defaults.videoGenerationModel.fallbacks"]).toBeDefined();
+  });
 });
diff --git a/src/config/zod-schema.agent-defaults.test.ts b/src/config/zod-schema.agent-defaults.test.ts
index 1a99b73bb21..878dc59a9ed 100644
--- a/src/config/zod-schema.agent-defaults.test.ts
+++ b/src/config/zod-schema.agent-defaults.test.ts
@@ -11,4 +11,15 @@ describe("agent defaults schema", () => {
       }),
     ).not.toThrow();
   });
+
+  it("accepts videoGenerationModel", () => {
+    expect(() =>
+      AgentDefaultsSchema.parse({
+        videoGenerationModel: {
+          primary: "qwen/wan2.6-t2v",
+          fallbacks: ["minimax/video-01"],
+        },
+      }),
+    ).not.toThrow();
+  });
 });