diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8dee748f167..e1f63b360a1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -191,6 +191,7 @@ Docs: https://docs.openclaw.ai
- Update/npm: prefer the npm binary that owns the installed global OpenClaw prefix so mixed Homebrew-plus-nvm setups update the right install. (#60153) Thanks @jayeshp19.
- Windows/restart: clean up stale gateway listeners before Windows self-restart and treat listener and argv probe failures as inconclusive, so scheduled-task relaunch no longer falls into an `EADDRINUSE` retry loop. (#60480) Thanks @arifahmedjoy.
- Plugins: suppress trust-warning noise during non-activating snapshot and CLI metadata loads. (#61427) Thanks @gumadeiras.
+- Agents/video generation: accept `agents.defaults.videoGenerationModel` in strict config validation and `openclaw config set/get`, so gateways using `video_generate` no longer fail to boot after enabling a video model.
## 2026.4.2
diff --git a/apps/shared/OpenClawKit/Sources/OpenClawKit/Resources/tool-display.json b/apps/shared/OpenClawKit/Sources/OpenClawKit/Resources/tool-display.json
index 52bd890e716..5453bc9a34c 100644
--- a/apps/shared/OpenClawKit/Sources/OpenClawKit/Resources/tool-display.json
+++ b/apps/shared/OpenClawKit/Sources/OpenClawKit/Resources/tool-display.json
@@ -1030,6 +1030,31 @@
}
}
},
+ "video_generate": {
+ "emoji": "🎬",
+ "title": "Video Generation",
+ "actions": {
+ "generate": {
+ "label": "generate",
+ "detailKeys": [
+ "prompt",
+ "model",
+ "durationSeconds",
+ "resolution",
+ "aspectRatio",
+ "audio",
+ "watermark"
+ ]
+ },
+ "list": {
+ "label": "list",
+ "detailKeys": [
+ "provider",
+ "model"
+ ]
+ }
+ }
+ },
"pdf": {
"emoji": "📑",
"title": "PDF",
diff --git a/docs/concepts/models.md b/docs/concepts/models.md
index 6acd34db45b..bf8d0e8366b 100644
--- a/docs/concepts/models.md
+++ b/docs/concepts/models.md
@@ -30,7 +30,7 @@ Related:
falls back to `agents.defaults.imageModel`, then the resolved session/default
model.
- `agents.defaults.imageGenerationModel` is used by the shared image-generation capability. If omitted, `image_generate` can still infer an auth-backed provider default. It tries the current default provider first, then the remaining registered image-generation providers in provider-id order. If you set a specific provider/model, also configure that provider's auth/API key.
-- `agents.defaults.videoGenerationModel` is used by the shared video-generation capability. Unlike image generation, this does not infer a provider default today. Set an explicit `provider/model` such as `qwen/wan2.6-t2v`, and configure that provider's auth/API key too.
+- `agents.defaults.videoGenerationModel` is used by the shared video-generation capability. If omitted, `video_generate` can still infer an auth-backed provider default. It tries the current default provider first, then the remaining registered video-generation providers in provider-id order. If you set a specific provider/model, also configure that provider's auth/API key.
- Per-agent defaults can override `agents.defaults.model` via `agents.list[].model` plus bindings (see [/concepts/multi-agent](/concepts/multi-agent)).
## Quick model policy
@@ -252,4 +252,5 @@ This applies whenever OpenClaw regenerates `models.json`, including command-driv
- [Model Providers](/concepts/model-providers) — provider routing and auth
- [Model Failover](/concepts/model-failover) — fallback chains
- [Image Generation](/tools/image-generation) — image model configuration
+- [Video Generation](/tools/video-generation) — video model configuration
- [Configuration Reference](/gateway/configuration-reference#agent-defaults) — model config keys
diff --git a/docs/gateway/configuration-reference.md b/docs/gateway/configuration-reference.md
index f8f1fe23c96..f7d2a7c43fc 100644
--- a/docs/gateway/configuration-reference.md
+++ b/docs/gateway/configuration-reference.md
@@ -1026,9 +1026,9 @@ Time format in system prompt. Default: `auto` (OS preference).
- If you select a provider/model directly, configure the matching provider auth/API key too (for example `GEMINI_API_KEY` or `GOOGLE_API_KEY` for `google/*`, `OPENAI_API_KEY` for `openai/*`, `FAL_KEY` for `fal/*`).
- If omitted, `image_generate` can still infer an auth-backed provider default. It tries the current default provider first, then the remaining registered image-generation providers in provider-id order.
- `videoGenerationModel`: accepts either a string (`"provider/model"`) or an object (`{ primary, fallbacks }`).
- - Used by the shared video-generation capability.
+ - Used by the shared video-generation capability and the built-in `video_generate` tool.
- Typical values: `qwen/wan2.6-t2v`, `qwen/wan2.6-i2v`, `qwen/wan2.6-r2v`, `qwen/wan2.6-r2v-flash`, or `qwen/wan2.7-r2v`.
- - Set this explicitly before using shared video generation. Unlike `imageGenerationModel`, the video-generation runtime does not infer a provider default yet.
+ - If omitted, `video_generate` can still infer an auth-backed provider default. It tries the current default provider first, then the remaining registered video-generation providers in provider-id order.
- If you select a provider/model directly, configure the matching provider auth/API key too.
- The bundled Qwen video-generation provider currently supports up to 1 output video, 1 input image, 4 input videos, 10 seconds duration, and provider-level `size`, `aspectRatio`, `resolution`, `audio`, and `watermark` options.
- `pdfModel`: accepts either a string (`"provider/model"`) or an object (`{ primary, fallbacks }`).
@@ -1936,12 +1936,12 @@ Defaults for Talk mode (macOS/iOS/Android).
Local onboarding defaults new local configs to `tools.profile: "coding"` when unset (existing explicit profiles are preserved).
-| Profile | Includes |
-| ----------- | ------------------------------------------------------------------------------------------------------------- |
-| `minimal` | `session_status` only |
-| `coding` | `group:fs`, `group:runtime`, `group:web`, `group:sessions`, `group:memory`, `cron`, `image`, `image_generate` |
-| `messaging` | `group:messaging`, `sessions_list`, `sessions_history`, `sessions_send`, `session_status` |
-| `full` | No restriction (same as unset) |
+| Profile | Includes |
+| ----------- | ------------------------------------------------------------------------------------------------------------------------------- |
+| `minimal` | `session_status` only |
+| `coding` | `group:fs`, `group:runtime`, `group:web`, `group:sessions`, `group:memory`, `cron`, `image`, `image_generate`, `video_generate` |
+| `messaging` | `group:messaging`, `sessions_list`, `sessions_history`, `sessions_send`, `session_status` |
+| `full` | No restriction (same as unset) |
### Tool groups
@@ -1957,7 +1957,7 @@ Local onboarding defaults new local configs to `tools.profile: "coding"` when un
| `group:messaging` | `message` |
| `group:nodes` | `nodes` |
| `group:agents` | `agents_list` |
-| `group:media` | `image`, `image_generate`, `tts` |
+| `group:media` | `image`, `image_generate`, `video_generate`, `tts` |
| `group:openclaw` | All built-in tools (excludes provider plugins) |
### `tools.allow` / `tools.deny`
diff --git a/docs/gateway/sandbox-vs-tool-policy-vs-elevated.md b/docs/gateway/sandbox-vs-tool-policy-vs-elevated.md
index f9388783186..c5168722f85 100644
--- a/docs/gateway/sandbox-vs-tool-policy-vs-elevated.md
+++ b/docs/gateway/sandbox-vs-tool-policy-vs-elevated.md
@@ -98,7 +98,7 @@ Available groups:
- `group:messaging`: `message`
- `group:nodes`: `nodes`
- `group:agents`: `agents_list`
-- `group:media`: `image`, `image_generate`, `tts`
+- `group:media`: `image`, `image_generate`, `video_generate`, `tts`
- `group:openclaw`: all built-in OpenClaw tools (excludes provider plugins)
## Elevated: exec-only "run on host"
diff --git a/docs/providers/qwen.md b/docs/providers/qwen.md
index 669187b867b..3581a53ca07 100644
--- a/docs/providers/qwen.md
+++ b/docs/providers/qwen.md
@@ -123,6 +123,9 @@ Current bundled Qwen video-generation limits:
- Up to **4** input videos
- Up to **10 seconds** duration
- Supports `size`, `aspectRatio`, `resolution`, `audio`, and `watermark`
+- Reference image/video mode currently requires **remote http(s) URLs**. Local
+ file paths are rejected up front because the DashScope video endpoint does not
+ accept uploaded local buffers for those references.
See [Qwen / Model Studio](/providers/qwen_modelstudio) for endpoint-level detail
and compatibility notes.
diff --git a/docs/tools/index.md b/docs/tools/index.md
index e47679cf289..1dbb97a0624 100644
--- a/docs/tools/index.md
+++ b/docs/tools/index.md
@@ -53,25 +53,28 @@ OpenClaw has three layers that work together:
These tools ship with OpenClaw and are available without installing any plugins:
-| Tool | What it does | Page |
-| ------------------------------------------ | --------------------------------------------------------------------- | --------------------------------------- |
-| `exec` / `process` | Run shell commands, manage background processes | [Exec](/tools/exec) |
-| `code_execution` | Run sandboxed remote Python analysis | [Code Execution](/tools/code-execution) |
-| `browser` | Control a Chromium browser (navigate, click, screenshot) | [Browser](/tools/browser) |
-| `web_search` / `x_search` / `web_fetch` | Search the web, search X posts, fetch page content | [Web](/tools/web) |
-| `read` / `write` / `edit` | File I/O in the workspace | |
-| `apply_patch` | Multi-hunk file patches | [Apply Patch](/tools/apply-patch) |
-| `message` | Send messages across all channels | [Agent Send](/tools/agent-send) |
-| `canvas` | Drive node Canvas (present, eval, snapshot) | |
-| `nodes` | Discover and target paired devices | |
-| `cron` / `gateway` | Manage scheduled jobs; inspect, patch, restart, or update the gateway | |
-| `image` / `image_generate` | Analyze or generate images | |
-| `tts` | One-shot text-to-speech conversion | [TTS](/tools/tts) |
-| `sessions_*` / `subagents` / `agents_list` | Session management, status, and sub-agent orchestration | [Sub-agents](/tools/subagents) |
-| `session_status` | Lightweight `/status`-style readback and session model override | [Session Tools](/concepts/session-tool) |
+| Tool | What it does | Page |
+| ------------------------------------------ | --------------------------------------------------------------------- | ------------------------------------------- |
+| `exec` / `process` | Run shell commands, manage background processes | [Exec](/tools/exec) |
+| `code_execution` | Run sandboxed remote Python analysis | [Code Execution](/tools/code-execution) |
+| `browser` | Control a Chromium browser (navigate, click, screenshot) | [Browser](/tools/browser) |
+| `web_search` / `x_search` / `web_fetch` | Search the web, search X posts, fetch page content | [Web](/tools/web) |
+| `read` / `write` / `edit` | File I/O in the workspace | |
+| `apply_patch` | Multi-hunk file patches | [Apply Patch](/tools/apply-patch) |
+| `message` | Send messages across all channels | [Agent Send](/tools/agent-send) |
+| `canvas` | Drive node Canvas (present, eval, snapshot) | |
+| `nodes` | Discover and target paired devices | |
+| `cron` / `gateway` | Manage scheduled jobs; inspect, patch, restart, or update the gateway | |
+| `image` / `image_generate` | Analyze or generate images | [Image Generation](/tools/image-generation) |
+| `video_generate` | Generate videos | [Video Generation](/tools/video-generation) |
+| `tts` | One-shot text-to-speech conversion | [TTS](/tools/tts) |
+| `sessions_*` / `subagents` / `agents_list` | Session management, status, and sub-agent orchestration | [Sub-agents](/tools/subagents) |
+| `session_status` | Lightweight `/status`-style readback and session model override | [Session Tools](/concepts/session-tool) |
For image work, use `image` for analysis and `image_generate` for generation or editing. If you target `openai/*`, `google/*`, `fal/*`, or another non-default image provider, configure that provider's auth/API key first.
+For video work, use `video_generate`. If you target `qwen/*` or another non-default video provider, configure that provider's auth/API key first.
+
`session_status` is the lightweight status/readback tool in the sessions group.
It answers `/status`-style questions about the current session and can
optionally set a per-session model override; `model=default` clears that
@@ -121,12 +124,12 @@ config. Deny always wins over allow.
`tools.profile` sets a base allowlist before `allow`/`deny` is applied.
Per-agent override: `agents.list[].tools.profile`.
-| Profile | What it includes |
-| ----------- | ------------------------------------------------------------------------------------------------------------- |
-| `full` | No restriction (same as unset) |
-| `coding` | `group:fs`, `group:runtime`, `group:web`, `group:sessions`, `group:memory`, `cron`, `image`, `image_generate` |
-| `messaging` | `group:messaging`, `sessions_list`, `sessions_history`, `sessions_send`, `session_status` |
-| `minimal` | `session_status` only |
+| Profile | What it includes |
+| ----------- | ------------------------------------------------------------------------------------------------------------------------------- |
+| `full` | No restriction (same as unset) |
+| `coding` | `group:fs`, `group:runtime`, `group:web`, `group:sessions`, `group:memory`, `cron`, `image`, `image_generate`, `video_generate` |
+| `messaging` | `group:messaging`, `sessions_list`, `sessions_history`, `sessions_send`, `session_status` |
+| `minimal` | `session_status` only |
### Tool groups
@@ -144,7 +147,7 @@ Use `group:*` shorthands in allow/deny lists:
| `group:messaging` | message |
| `group:nodes` | nodes |
| `group:agents` | agents_list |
-| `group:media` | image, image_generate, tts |
+| `group:media` | image, image_generate, video_generate, tts |
| `group:openclaw` | All built-in OpenClaw tools (excludes plugin tools) |
`sessions_history` returns a bounded, safety-filtered recall view. It strips
diff --git a/docs/tools/video-generation.md b/docs/tools/video-generation.md
new file mode 100644
index 00000000000..434f6c170be
--- /dev/null
+++ b/docs/tools/video-generation.md
@@ -0,0 +1,109 @@
+---
+summary: "Generate videos using configured providers such as Qwen"
+read_when:
+ - Generating videos via the agent
+ - Configuring video generation providers and models
+ - Understanding the video_generate tool parameters
+title: "Video Generation"
+---
+
+# Video Generation
+
+The `video_generate` tool lets the agent create videos using your configured providers. Generated videos are delivered automatically as media attachments in the agent's reply.
+
+
+The tool only appears when at least one video-generation provider is available. If you don't see `video_generate` in your agent's tools, configure `agents.defaults.videoGenerationModel` or set up a provider API key.
+
+
+## Quick start
+
+1. Set an API key for at least one provider (for example `QWEN_API_KEY`).
+2. Optionally set your preferred model:
+
+   ```json5
+   {
+     agents: {
+       defaults: {
+         videoGenerationModel: "qwen/wan2.6-t2v",
+       },
+     },
+   }
+   ```
+
+3. Ask the agent: _"Generate a 5-second cinematic video of a friendly lobster surfing at sunset."_
+
+The agent calls `video_generate` automatically. No tool allow-listing needed — it's enabled by default when a provider is available.
+
+## Supported providers
+
+| Provider | Default model | Reference inputs | API key |
+| -------- | ------------- | ---------------- | ---------------------------------------------------------- |
+| Qwen | `wan2.6-t2v` | Yes, remote URLs | `QWEN_API_KEY`, `MODELSTUDIO_API_KEY`, `DASHSCOPE_API_KEY` |
+
+Use `action: "list"` to inspect available providers and models at runtime:
+
+```
+/tool video_generate action=list
+```
+
+## Tool parameters
+
+| Parameter | Type | Description |
+| ----------------- | -------- | ------------------------------------------------------------------------------------- |
+| `prompt` | string | Video generation prompt (required for `action: "generate"`) |
+| `action` | string | `"generate"` (default) or `"list"` to inspect providers |
+| `model` | string | Provider/model override, e.g. `qwen/wan2.6-t2v` |
+| `image` | string | Single reference image path or URL |
+| `images` | string[] | Multiple reference images (up to 5) |
+| `video` | string | Single reference video path or URL |
+| `videos` | string[] | Multiple reference videos (up to 4) |
+| `size` | string | Size hint when the provider supports it |
+| `aspectRatio` | string | Aspect ratio: `1:1`, `2:3`, `3:2`, `3:4`, `4:3`, `4:5`, `5:4`, `9:16`, `16:9`, `21:9` |
+| `resolution` | string | Resolution hint: `480P`, `720P`, or `1080P` |
+| `durationSeconds` | number | Target duration in seconds |
+| `audio` | boolean | Enable generated audio when the provider supports it |
+| `watermark` | boolean | Toggle provider watermarking when supported |
+| `filename` | string | Output filename hint |
+
+Not all providers support all parameters. The tool validates provider capability limits before it submits the request.
+
+## Configuration
+
+### Model selection
+
+```json5
+{
+ agents: {
+ defaults: {
+ videoGenerationModel: {
+ primary: "qwen/wan2.6-t2v",
+ fallbacks: ["qwen/wan2.6-r2v-flash"],
+ },
+ },
+ },
+}
+```
+
+### Provider selection order
+
+When generating a video, OpenClaw tries providers in this order:
+
+1. **`model` parameter** from the tool call (if the agent specifies one)
+2. **`videoGenerationModel.primary`** from config
+3. **`videoGenerationModel.fallbacks`** in order
+4. **Auto-detection** — uses auth-backed provider defaults only:
+ - current default provider first
+ - remaining registered video-generation providers in provider-id order
+
+If a provider fails, the next candidate is tried automatically. If all fail, the error includes details from each attempt.
+
+## Qwen reference inputs
+
+The bundled Qwen provider supports text-to-video plus image/video reference modes, but the upstream DashScope video endpoint currently requires **remote http(s) URLs** for reference inputs. Local file paths and uploaded buffers are rejected up front instead of being silently ignored.
+
+## Related
+
+- [Tools Overview](/tools) — all available agent tools
+- [Qwen](/providers/qwen) — Qwen-specific setup and limits
+- [Configuration Reference](/gateway/configuration-reference#agent-defaults) — `videoGenerationModel` config
+- [Models](/concepts/models) — model configuration and failover
diff --git a/extensions/qwen/video-generation-provider.test.ts b/extensions/qwen/video-generation-provider.test.ts
index 93a6b121c92..5ccb35035f7 100644
--- a/extensions/qwen/video-generation-provider.test.ts
+++ b/extensions/qwen/video-generation-provider.test.ts
@@ -107,4 +107,21 @@ describe("qwen video generation provider", () => {
}),
);
});
+
+ it("fails fast when reference inputs are local buffers instead of remote URLs", async () => {
+ const provider = buildQwenVideoGenerationProvider();
+
+ await expect(
+ provider.generateVideo({
+ provider: "qwen",
+ model: "wan2.6-i2v",
+ prompt: "animate this local frame",
+ cfg: {},
+ inputImages: [{ buffer: Buffer.from("png-bytes"), mimeType: "image/png" }],
+ }),
+ ).rejects.toThrow(
+ "Qwen video generation currently requires remote http(s) URLs for reference images/videos.",
+ );
+ expect(postJsonRequestMock).not.toHaveBeenCalled();
+ });
});
diff --git a/extensions/qwen/video-generation-provider.ts b/extensions/qwen/video-generation-provider.ts
index 8db83a4604f..207bb0b087c 100644
--- a/extensions/qwen/video-generation-provider.ts
+++ b/extensions/qwen/video-generation-provider.ts
@@ -90,7 +90,22 @@ function resolveReferenceUrls(
.filter((value): value is string => Boolean(value));
}
+function assertQwenReferenceInputsSupported(
+ inputImages: VideoGenerationSourceAsset[] | undefined,
+ inputVideos: VideoGenerationSourceAsset[] | undefined,
+): void {
+ const unsupported = [...(inputImages ?? []), ...(inputVideos ?? [])].some(
+ (asset) => !asset.url?.trim() && asset.buffer,
+ );
+ if (unsupported) {
+ throw new Error(
+ "Qwen video generation currently requires remote http(s) URLs for reference images/videos.",
+ );
+ }
+}
+
function buildQwenVideoGenerationInput(req: VideoGenerationRequest): Record {
+ assertQwenReferenceInputsSupported(req.inputImages, req.inputVideos);
const input: Record = {
prompt: req.prompt,
};
diff --git a/src/agents/openclaw-tools.ts b/src/agents/openclaw-tools.ts
index c9603d90926..ac2be1a84df 100644
--- a/src/agents/openclaw-tools.ts
+++ b/src/agents/openclaw-tools.ts
@@ -31,6 +31,7 @@ import { createSessionsYieldTool } from "./tools/sessions-yield-tool.js";
import { createSubagentsTool } from "./tools/subagents-tool.js";
import { createTtsTool } from "./tools/tts-tool.js";
import { createUpdatePlanTool } from "./tools/update-plan-tool.js";
+import { createVideoGenerateTool } from "./tools/video-generate-tool.js";
import { createWebFetchTool, createWebSearchTool } from "./tools/web-tools.js";
import { resolveWorkspaceRoot } from "./workspace-dir.js";
@@ -159,6 +160,13 @@ export function createOpenClawTools(
sandbox,
fsPolicy: options?.fsPolicy,
});
+ const videoGenerateTool = createVideoGenerateTool({
+ config: options?.config,
+ agentDir: options?.agentDir,
+ workspaceDir,
+ sandbox,
+ fsPolicy: options?.fsPolicy,
+ });
const pdfTool = options?.agentDir?.trim()
? createPdfTool({
config: options?.config,
@@ -216,6 +224,7 @@ export function createOpenClawTools(
config: options?.config,
}),
...(imageGenerateTool ? [imageGenerateTool] : []),
+ ...(videoGenerateTool ? [videoGenerateTool] : []),
createGatewayTool({
agentSessionKey: options?.agentSessionKey,
config: options?.config,
diff --git a/src/agents/openclaw-tools.video-generation.test.ts b/src/agents/openclaw-tools.video-generation.test.ts
new file mode 100644
index 00000000000..582ef89094a
--- /dev/null
+++ b/src/agents/openclaw-tools.video-generation.test.ts
@@ -0,0 +1,91 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import type { OpenClawConfig } from "../config/config.js";
+import * as videoGenerationRuntime from "../video-generation/runtime.js";
+import { createOpenClawTools } from "./openclaw-tools.js";
+
+vi.mock("../plugins/tools.js", () => ({
+ resolvePluginTools: () => [],
+ copyPluginToolMeta: () => undefined,
+ getPluginToolMeta: () => undefined,
+}));
+
+function asConfig(value: unknown): OpenClawConfig {
+ return value as OpenClawConfig;
+}
+
+function stubVideoGenerationProviders() {
+ vi.spyOn(videoGenerationRuntime, "listRuntimeVideoGenerationProviders").mockReturnValue([
+ {
+ id: "qwen",
+ defaultModel: "wan2.6-t2v",
+ models: ["wan2.6-t2v"],
+ capabilities: {
+ maxVideos: 1,
+ maxInputImages: 1,
+ maxInputVideos: 4,
+ maxDurationSeconds: 10,
+ supportsSize: true,
+ supportsAspectRatio: true,
+ supportsResolution: true,
+ supportsAudio: true,
+ supportsWatermark: true,
+ },
+ generateVideo: vi.fn(async () => {
+ throw new Error("not used");
+ }),
+ },
+ ]);
+}
+
+describe("openclaw tools video generation registration", () => {
+ beforeEach(() => {
+ vi.stubEnv("QWEN_API_KEY", "");
+ vi.stubEnv("MODELSTUDIO_API_KEY", "");
+ vi.stubEnv("DASHSCOPE_API_KEY", "");
+ });
+
+ afterEach(() => {
+ vi.restoreAllMocks();
+ vi.unstubAllEnvs();
+ });
+
+ it("registers video_generate when video-generation config is present", () => {
+ const tools = createOpenClawTools({
+ config: asConfig({
+ agents: {
+ defaults: {
+ videoGenerationModel: {
+ primary: "qwen/wan2.6-t2v",
+ },
+ },
+ },
+ }),
+ agentDir: "/tmp/openclaw-agent-main",
+ });
+
+ expect(tools.map((tool) => tool.name)).toContain("video_generate");
+ });
+
+ it("registers video_generate when a compatible provider has env-backed auth", () => {
+ stubVideoGenerationProviders();
+ vi.stubEnv("QWEN_API_KEY", "qwen-test");
+
+ const tools = createOpenClawTools({
+ config: asConfig({}),
+ agentDir: "/tmp/openclaw-agent-main",
+ });
+
+ expect(tools.map((tool) => tool.name)).toContain("video_generate");
+ });
+
+ it("omits video_generate when config is absent and no compatible provider auth exists", () => {
+ stubVideoGenerationProviders();
+
+ const tools = createOpenClawTools({
+ config: asConfig({}),
+ agentDir: "/tmp/openclaw-agent-main",
+ });
+
+ expect(tools.map((tool) => tool.name)).not.toContain("video_generate");
+ });
+});
diff --git a/src/agents/openclaw-tools.web-runtime.test.ts b/src/agents/openclaw-tools.web-runtime.test.ts
index 5240d42b9a9..77bab620be6 100644
--- a/src/agents/openclaw-tools.web-runtime.test.ts
+++ b/src/agents/openclaw-tools.web-runtime.test.ts
@@ -46,6 +46,9 @@ vi.mock("./tools/gateway-tool.js", () => ({
vi.mock("./tools/image-generate-tool.js", () => ({
createImageGenerateTool: mockToolFactory("image_generate_stub"),
}));
+vi.mock("./tools/video-generate-tool.js", () => ({
+ createVideoGenerateTool: mockToolFactory("video_generate_stub"),
+}));
vi.mock("./tools/image-tool.js", () => ({
createImageTool: mockToolFactory("image_stub"),
}));
diff --git a/src/agents/pi-embedded-subscribe.tools.media.test.ts b/src/agents/pi-embedded-subscribe.tools.media.test.ts
index 9d61d7f30e5..545c2d3b770 100644
--- a/src/agents/pi-embedded-subscribe.tools.media.test.ts
+++ b/src/agents/pi-embedded-subscribe.tools.media.test.ts
@@ -265,6 +265,10 @@ describe("extractToolResultMediaPaths", () => {
expect(isToolResultMediaTrusted("image_generate")).toBe(true);
});
+ it("trusts video_generate local MEDIA paths", () => {
+ expect(isToolResultMediaTrusted("video_generate")).toBe(true);
+ });
+
it("does not trust local MEDIA paths for MCP-provenance results", () => {
expect(
filterToolResultMediaUrls("browser", ["/tmp/screenshot.png"], {
diff --git a/src/agents/pi-embedded-subscribe.tools.ts b/src/agents/pi-embedded-subscribe.tools.ts
index a0c4e6ec864..f792c5cc6f5 100644
--- a/src/agents/pi-embedded-subscribe.tools.ts
+++ b/src/agents/pi-embedded-subscribe.tools.ts
@@ -156,6 +156,7 @@ const TRUSTED_TOOL_RESULT_MEDIA = new Set([
"sessions_spawn",
"subagents",
"tts",
+ "video_generate",
"web_fetch",
"web_search",
"x_search",
diff --git a/src/agents/test-helpers/fast-openclaw-tools.ts b/src/agents/test-helpers/fast-openclaw-tools.ts
index 408b8148491..34f26eaa887 100644
--- a/src/agents/test-helpers/fast-openclaw-tools.ts
+++ b/src/agents/test-helpers/fast-openclaw-tools.ts
@@ -32,6 +32,7 @@ const coreTools = [
stubActionTool("session_status", ["get", "show"]),
stubTool("tts"),
stubTool("image_generate"),
+ stubTool("video_generate"),
stubTool("web_fetch"),
stubTool("image"),
stubTool("pdf"),
diff --git a/src/agents/test-helpers/fast-tool-stubs.ts b/src/agents/test-helpers/fast-tool-stubs.ts
index 7eb885f503b..d86eede29f0 100644
--- a/src/agents/test-helpers/fast-tool-stubs.ts
+++ b/src/agents/test-helpers/fast-tool-stubs.ts
@@ -23,6 +23,10 @@ vi.mock("../tools/image-generate-tool.js", () => ({
createImageGenerateTool: () => stubTool("image_generate"),
}));
+vi.mock("../tools/video-generate-tool.js", () => ({
+ createVideoGenerateTool: () => stubTool("video_generate"),
+}));
+
vi.mock("../tools/web-tools.js", () => ({
createWebSearchTool: () => null,
createWebFetchTool: () => null,
diff --git a/src/agents/tool-catalog.test.ts b/src/agents/tool-catalog.test.ts
index 55ea1530465..216b0fcab84 100644
--- a/src/agents/tool-catalog.test.ts
+++ b/src/agents/tool-catalog.test.ts
@@ -10,6 +10,7 @@ describe("tool-catalog", () => {
expect(policy!.allow).toContain("x_search");
expect(policy!.allow).toContain("web_fetch");
expect(policy!.allow).toContain("image_generate");
+ expect(policy!.allow).toContain("video_generate");
expect(policy!.allow).toContain("update_plan");
});
});
diff --git a/src/agents/tool-catalog.ts b/src/agents/tool-catalog.ts
index fe4d1e96e30..c28249fd584 100644
--- a/src/agents/tool-catalog.ts
+++ b/src/agents/tool-catalog.ts
@@ -277,6 +277,14 @@ const CORE_TOOL_DEFINITIONS: CoreToolDefinition[] = [
profiles: ["coding"],
includeInOpenClawGroup: true,
},
+ {
+ id: "video_generate",
+ label: "video_generate",
+ description: "Video generation",
+ sectionId: "media",
+ profiles: ["coding"],
+ includeInOpenClawGroup: true,
+ },
{
id: "tts",
label: "tts",
diff --git a/src/agents/tool-display-config.ts b/src/agents/tool-display-config.ts
index 4e6948500f2..f5dcd69f6a2 100644
--- a/src/agents/tool-display-config.ts
+++ b/src/agents/tool-display-config.ts
@@ -640,6 +640,28 @@ export const TOOL_DISPLAY_CONFIG: ToolDisplayConfig = {
},
},
},
+ video_generate: {
+ emoji: "🎬",
+ title: "Video Generation",
+ actions: {
+ generate: {
+ label: "generate",
+ detailKeys: [
+ "prompt",
+ "model",
+ "durationSeconds",
+ "resolution",
+ "aspectRatio",
+ "audio",
+ "watermark",
+ ],
+ },
+ list: {
+ label: "list",
+ detailKeys: ["provider", "model"],
+ },
+ },
+ },
pdf: {
emoji: "📑",
title: "PDF",
diff --git a/src/agents/tools/media-tool-shared.ts b/src/agents/tools/media-tool-shared.ts
index 6b65c595b5e..4e384380442 100644
--- a/src/agents/tools/media-tool-shared.ts
+++ b/src/agents/tools/media-tool-shared.ts
@@ -32,9 +32,16 @@ export function applyImageGenerationModelConfigDefaults(
return applyAgentDefaultModelConfig(cfg, "imageGenerationModel", imageGenerationModelConfig);
}
+export function applyVideoGenerationModelConfigDefaults(
+ cfg: OpenClawConfig | undefined,
+ videoGenerationModelConfig: ToolModelConfig,
+): OpenClawConfig | undefined {
+ return applyAgentDefaultModelConfig(cfg, "videoGenerationModel", videoGenerationModelConfig);
+}
+
function applyAgentDefaultModelConfig(
cfg: OpenClawConfig | undefined,
- key: "imageModel" | "imageGenerationModel",
+ key: "imageModel" | "imageGenerationModel" | "videoGenerationModel",
modelConfig: ToolModelConfig,
): OpenClawConfig | undefined {
if (!cfg) {
diff --git a/src/agents/tools/video-generate-tool.test.ts b/src/agents/tools/video-generate-tool.test.ts
new file mode 100644
index 00000000000..a69d2980642
--- /dev/null
+++ b/src/agents/tools/video-generate-tool.test.ts
@@ -0,0 +1,91 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import type { OpenClawConfig } from "../../config/config.js";
+import * as mediaStore from "../../media/store.js";
+import * as videoGenerationRuntime from "../../video-generation/runtime.js";
+import { createVideoGenerateTool } from "./video-generate-tool.js";
+
+function asConfig(value: unknown): OpenClawConfig {
+ return value as OpenClawConfig;
+}
+
+describe("createVideoGenerateTool", () => {
+ beforeEach(() => {
+ vi.restoreAllMocks();
+ });
+
+ afterEach(() => {
+ vi.unstubAllEnvs();
+ });
+
+ it("returns null when no video-generation config or auth-backed provider is available", () => {
+ vi.spyOn(videoGenerationRuntime, "listRuntimeVideoGenerationProviders").mockReturnValue([]);
+
+ expect(createVideoGenerateTool({ config: asConfig({}) })).toBeNull();
+ });
+
+ it("registers when video-generation config is present", () => {
+ expect(
+ createVideoGenerateTool({
+ config: asConfig({
+ agents: {
+ defaults: {
+ videoGenerationModel: { primary: "qwen/wan2.6-t2v" },
+ },
+ },
+ }),
+ }),
+ ).not.toBeNull();
+ });
+
+ it("generates videos, saves them, and emits MEDIA paths", async () => {
+ vi.spyOn(videoGenerationRuntime, "generateVideo").mockResolvedValue({
+ provider: "qwen",
+ model: "wan2.6-t2v",
+ attempts: [],
+ videos: [
+ {
+ buffer: Buffer.from("video-bytes"),
+ mimeType: "video/mp4",
+ fileName: "lobster.mp4",
+ },
+ ],
+ metadata: { taskId: "task-1" },
+ });
+ vi.spyOn(mediaStore, "saveMediaBuffer").mockResolvedValueOnce({
+ path: "/tmp/generated-lobster.mp4",
+ id: "generated-lobster.mp4",
+ size: 11,
+ contentType: "video/mp4",
+ });
+
+ const tool = createVideoGenerateTool({
+ config: asConfig({
+ agents: {
+ defaults: {
+ videoGenerationModel: { primary: "qwen/wan2.6-t2v" },
+ },
+ },
+ }),
+ });
+ expect(tool).not.toBeNull();
+ if (!tool) {
+ throw new Error("expected video_generate tool");
+ }
+
+ const result = await tool.execute("call-1", { prompt: "friendly lobster surfing" });
+ const text = (result.content?.[0] as { text: string } | undefined)?.text ?? "";
+
+ expect(text).toContain("Generated 1 video with qwen/wan2.6-t2v.");
+ expect(text).toContain("MEDIA:/tmp/generated-lobster.mp4");
+ expect(result.details).toMatchObject({
+ provider: "qwen",
+ model: "wan2.6-t2v",
+ count: 1,
+ media: {
+ mediaUrls: ["/tmp/generated-lobster.mp4"],
+ },
+ paths: ["/tmp/generated-lobster.mp4"],
+ metadata: { taskId: "task-1" },
+ });
+ });
+});
diff --git a/src/agents/tools/video-generate-tool.ts b/src/agents/tools/video-generate-tool.ts
new file mode 100644
index 00000000000..a75069f60e0
--- /dev/null
+++ b/src/agents/tools/video-generate-tool.ts
@@ -0,0 +1,735 @@
+import { Type } from "@sinclair/typebox";
+import type { OpenClawConfig } from "../../config/config.js";
+import { loadConfig } from "../../config/config.js";
+import { saveMediaBuffer } from "../../media/store.js";
+import { loadWebMedia } from "../../media/web-media.js";
+import { readSnakeCaseParamRaw } from "../../param-key.js";
+import { getProviderEnvVars } from "../../secrets/provider-env-vars.js";
+import { resolveUserPath } from "../../utils.js";
+import { parseVideoGenerationModelRef } from "../../video-generation/model-ref.js";
+import {
+ generateVideo,
+ listRuntimeVideoGenerationProviders,
+} from "../../video-generation/runtime.js";
+import type {
+ VideoGenerationProvider,
+ VideoGenerationResolution,
+ VideoGenerationSourceAsset,
+} from "../../video-generation/types.js";
+import { normalizeProviderId } from "../provider-id.js";
+import {
+ ToolInputError,
+ readNumberParam,
+ readStringArrayParam,
+ readStringParam,
+} from "./common.js";
+import { decodeDataUrl } from "./image-tool.helpers.js";
+import {
+ applyVideoGenerationModelConfigDefaults,
+ resolveMediaToolLocalRoots,
+} from "./media-tool-shared.js";
+import {
+ buildToolModelConfigFromCandidates,
+ coerceToolModelConfig,
+ hasAuthForProvider,
+ hasToolModelConfig,
+ resolveDefaultModelRef,
+ type ToolModelConfig,
+} from "./model-config.helpers.js";
+import {
+ createSandboxBridgeReadFile,
+ resolveSandboxedBridgeMediaPath,
+ type AnyAgentTool,
+ type SandboxFsBridge,
+ type ToolFsPolicy,
+} from "./tool-runtime.helpers.js";
+
// Upper bounds on reference assets accepted per request; individual providers
// may enforce lower caps via their declared capabilities.
const MAX_INPUT_IMAGES = 5;
const MAX_INPUT_VIDEOS = 4;
// Aspect-ratio hints the tool accepts; validated before provider dispatch.
const SUPPORTED_ASPECT_RATIOS = new Set([
  "1:1",
  "2:3",
  "3:2",
  "3:4",
  "4:3",
  "4:5",
  "5:4",
  "9:16",
  "16:9",
  "21:9",
]);
+
// TypeBox parameter schema for the tool. Every field is optional at the schema
// level; `prompt` is enforced at execute-time for the "generate" action.
const VideoGenerateToolSchema = Type.Object({
  action: Type.Optional(
    Type.String({
      description:
        'Optional action: "generate" (default) or "list" to inspect available providers/models.',
    }),
  ),
  prompt: Type.Optional(Type.String({ description: "Video generation prompt." })),
  // Reference assets come in singular and plural forms; they are merged and
  // deduped by normalizeReferenceInputs.
  image: Type.Optional(
    Type.String({
      description: "Optional single reference image path or URL.",
    }),
  ),
  images: Type.Optional(
    Type.Array(Type.String(), {
      description: `Optional reference images (up to ${MAX_INPUT_IMAGES}).`,
    }),
  ),
  video: Type.Optional(
    Type.String({
      description: "Optional single reference video path or URL.",
    }),
  ),
  videos: Type.Optional(
    Type.Array(Type.String(), {
      description: `Optional reference videos (up to ${MAX_INPUT_VIDEOS}).`,
    }),
  ),
  model: Type.Optional(
    Type.String({ description: "Optional provider/model override, e.g. qwen/wan2.6-t2v." }),
  ),
  filename: Type.Optional(
    Type.String({
      description:
        "Optional output filename hint. OpenClaw preserves the basename and saves under its managed media directory.",
    }),
  ),
  // Provider-dependent generation hints; validated against capabilities.
  size: Type.Optional(
    Type.String({
      description: "Optional size hint like 1280x720 or 1920x1080 when the provider supports it.",
    }),
  ),
  aspectRatio: Type.Optional(
    Type.String({
      description:
        "Optional aspect ratio hint: 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, or 21:9.",
    }),
  ),
  resolution: Type.Optional(
    Type.String({
      description: "Optional resolution hint: 480P, 720P, or 1080P.",
    }),
  ),
  durationSeconds: Type.Optional(
    Type.Number({
      description: "Optional target duration in seconds.",
      minimum: 1,
    }),
  ),
  audio: Type.Optional(
    Type.Boolean({
      description: "Optional audio toggle when the provider supports generated audio.",
    }),
  ),
  watermark: Type.Optional(
    Type.Boolean({
      description: "Optional watermark toggle when the provider supports it.",
    }),
  ),
});
+
// Env-var names that can satisfy auth for a provider; surfaced as hints in the
// "list" action output.
function getVideoGenerationProviderAuthEnvVars(providerId: string): string[] {
  return getProviderEnvVars(providerId);
}
+
+function resolveVideoGenerationModelCandidates(params: {
+ cfg?: OpenClawConfig;
+ agentDir?: string;
+}): Array {
+ const providerDefaults = new Map();
+ for (const provider of listRuntimeVideoGenerationProviders({ config: params.cfg })) {
+ const providerId = provider.id.trim();
+ const modelId = provider.defaultModel?.trim();
+ if (
+ !providerId ||
+ !modelId ||
+ providerDefaults.has(providerId) ||
+ !isVideoGenerationProviderConfigured({
+ provider,
+ cfg: params.cfg,
+ agentDir: params.agentDir,
+ })
+ ) {
+ continue;
+ }
+ providerDefaults.set(providerId, `${providerId}/${modelId}`);
+ }
+
+ const primaryProvider = resolveDefaultModelRef(params.cfg).provider;
+ const orderedProviders = [
+ primaryProvider,
+ ...[...providerDefaults.keys()]
+ .filter((providerId) => providerId !== primaryProvider)
+ .toSorted(),
+ ];
+ const orderedRefs: string[] = [];
+ const seen = new Set();
+ for (const providerId of orderedProviders) {
+ const ref = providerDefaults.get(providerId);
+ if (!ref || seen.has(ref)) {
+ continue;
+ }
+ seen.add(ref);
+ orderedRefs.push(ref);
+ }
+ return orderedRefs;
+}
+
+export function resolveVideoGenerationModelConfigForTool(params: {
+ cfg?: OpenClawConfig;
+ agentDir?: string;
+}): ToolModelConfig | null {
+ const explicit = coerceToolModelConfig(params.cfg?.agents?.defaults?.videoGenerationModel);
+ if (hasToolModelConfig(explicit)) {
+ return explicit;
+ }
+ return buildToolModelConfigFromCandidates({
+ explicit,
+ agentDir: params.agentDir,
+ candidates: resolveVideoGenerationModelCandidates(params),
+ isProviderConfigured: (providerId) =>
+ isVideoGenerationProviderConfigured({
+ providerId,
+ cfg: params.cfg,
+ agentDir: params.agentDir,
+ }),
+ });
+}
+
+function isVideoGenerationProviderConfigured(params: {
+ provider?: VideoGenerationProvider;
+ providerId?: string;
+ cfg?: OpenClawConfig;
+ agentDir?: string;
+}): boolean {
+ const provider =
+ params.provider ??
+ listRuntimeVideoGenerationProviders({ config: params.cfg }).find((candidate) => {
+ const normalizedId = normalizeProviderId(params.providerId ?? "");
+ return (
+ normalizeProviderId(candidate.id) === normalizedId ||
+ (candidate.aliases ?? []).some((alias) => normalizeProviderId(alias) === normalizedId)
+ );
+ });
+ if (!provider) {
+ return params.providerId
+ ? hasAuthForProvider({ provider: params.providerId, agentDir: params.agentDir })
+ : false;
+ }
+ if (provider.isConfigured) {
+ return provider.isConfigured({
+ cfg: params.cfg,
+ agentDir: params.agentDir,
+ });
+ }
+ return hasAuthForProvider({ provider: provider.id, agentDir: params.agentDir });
+}
+
+function resolveAction(args: Record): "generate" | "list" {
+ const raw = readStringParam(args, "action");
+ if (!raw) {
+ return "generate";
+ }
+ const normalized = raw.trim().toLowerCase();
+ if (normalized === "generate" || normalized === "list") {
+ return normalized;
+ }
+ throw new ToolInputError('action must be "generate" or "list"');
+}
+
+function normalizeResolution(raw: string | undefined): VideoGenerationResolution | undefined {
+ const normalized = raw?.trim().toUpperCase();
+ if (!normalized) {
+ return undefined;
+ }
+ if (normalized === "480P" || normalized === "720P" || normalized === "1080P") {
+ return normalized;
+ }
+ throw new ToolInputError("resolution must be one of 480P, 720P, or 1080P");
+}
+
+function normalizeAspectRatio(raw: string | undefined): string | undefined {
+ const normalized = raw?.trim();
+ if (!normalized) {
+ return undefined;
+ }
+ if (SUPPORTED_ASPECT_RATIOS.has(normalized)) {
+ return normalized;
+ }
+ throw new ToolInputError(
+ "aspectRatio must be one of 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, or 21:9",
+ );
+}
+
+function readBooleanParam(params: Record, key: string): boolean | undefined {
+ const raw = readSnakeCaseParamRaw(params, key);
+ if (typeof raw === "boolean") {
+ return raw;
+ }
+ if (typeof raw === "string") {
+ const normalized = raw.trim().toLowerCase();
+ if (normalized === "true") {
+ return true;
+ }
+ if (normalized === "false") {
+ return false;
+ }
+ }
+ return undefined;
+}
+
+function normalizeReferenceInputs(params: {
+ args: Record;
+ singularKey: "image" | "video";
+ pluralKey: "images" | "videos";
+ maxCount: number;
+}): string[] {
+ const single = readStringParam(params.args, params.singularKey);
+ const multiple = readStringArrayParam(params.args, params.pluralKey);
+ const combined = [...(single ? [single] : []), ...(multiple ?? [])];
+ const deduped: string[] = [];
+ const seen = new Set();
+ for (const candidate of combined) {
+ const trimmed = candidate.trim();
+ const dedupe = trimmed.startsWith("@") ? trimmed.slice(1).trim() : trimmed;
+ if (!dedupe || seen.has(dedupe)) {
+ continue;
+ }
+ seen.add(dedupe);
+ deduped.push(trimmed);
+ }
+ if (deduped.length > params.maxCount) {
+ throw new ToolInputError(
+ `Too many reference ${params.pluralKey}: ${deduped.length} provided, maximum is ${params.maxCount}.`,
+ );
+ }
+ return deduped;
+}
+
+function resolveSelectedVideoGenerationProvider(params: {
+ config?: OpenClawConfig;
+ videoGenerationModelConfig: ToolModelConfig;
+ modelOverride?: string;
+}): VideoGenerationProvider | undefined {
+ const selectedRef =
+ parseVideoGenerationModelRef(params.modelOverride) ??
+ parseVideoGenerationModelRef(params.videoGenerationModelConfig.primary);
+ if (!selectedRef) {
+ return undefined;
+ }
+ const selectedProvider = normalizeProviderId(selectedRef.provider);
+ return listRuntimeVideoGenerationProviders({ config: params.config }).find(
+ (provider) =>
+ normalizeProviderId(provider.id) === selectedProvider ||
+ (provider.aliases ?? []).some((alias) => normalizeProviderId(alias) === selectedProvider),
+ );
+}
+
+function validateVideoGenerationCapabilities(params: {
+ provider: VideoGenerationProvider | undefined;
+ inputImageCount: number;
+ inputVideoCount: number;
+ size?: string;
+ aspectRatio?: string;
+ resolution?: VideoGenerationResolution;
+ durationSeconds?: number;
+ audio?: boolean;
+ watermark?: boolean;
+}) {
+ const provider = params.provider;
+ if (!provider) {
+ return;
+ }
+ const caps = provider.capabilities;
+ if (params.inputImageCount > 0) {
+ const maxInputImages = caps.maxInputImages ?? MAX_INPUT_IMAGES;
+ if (params.inputImageCount > maxInputImages) {
+ throw new ToolInputError(
+ `${provider.id} supports at most ${maxInputImages} reference image${maxInputImages === 1 ? "" : "s"}.`,
+ );
+ }
+ }
+ if (params.inputVideoCount > 0) {
+ const maxInputVideos = caps.maxInputVideos ?? MAX_INPUT_VIDEOS;
+ if (params.inputVideoCount > maxInputVideos) {
+ throw new ToolInputError(
+ `${provider.id} supports at most ${maxInputVideos} reference video${maxInputVideos === 1 ? "" : "s"}.`,
+ );
+ }
+ }
+ if (params.size && !caps.supportsSize) {
+ throw new ToolInputError(`${provider.id} does not support size overrides.`);
+ }
+ if (params.aspectRatio && !caps.supportsAspectRatio) {
+ throw new ToolInputError(`${provider.id} does not support aspectRatio overrides.`);
+ }
+ if (params.resolution && !caps.supportsResolution) {
+ throw new ToolInputError(`${provider.id} does not support resolution overrides.`);
+ }
+ if (
+ typeof params.durationSeconds === "number" &&
+ Number.isFinite(params.durationSeconds) &&
+ typeof caps.maxDurationSeconds === "number" &&
+ params.durationSeconds > caps.maxDurationSeconds
+ ) {
+ throw new ToolInputError(
+ `${provider.id} supports at most ${caps.maxDurationSeconds} seconds per video.`,
+ );
+ }
+ if (typeof params.audio === "boolean" && !caps.supportsAudio) {
+ throw new ToolInputError(`${provider.id} does not support audio toggles.`);
+ }
+ if (typeof params.watermark === "boolean" && !caps.supportsWatermark) {
+ throw new ToolInputError(`${provider.id} does not support watermark toggles.`);
+ }
+}
+
// Sandbox wiring for the tool: the sandbox root plus the fs bridge used to
// read files across the sandbox boundary.
type VideoGenerateSandboxConfig = {
  root: string;
  bridge: SandboxFsBridge;
};

/**
 * Loads reference assets (images or videos) from file paths, file:// URLs,
 * data: URLs, or http(s) URLs, returning the decoded buffers alongside the
 * resolved input string and any sandbox path rewrite that occurred.
 *
 * Rules enforced here:
 * - other URL schemes are rejected (Windows drive paths like C:\ are exempted
 *   from the scheme check);
 * - sandboxed runs may not fetch remote http(s) URLs;
 * - data: URLs are only supported for images (delegated to decodeDataUrl);
 * - in non-sandbox mode, http(s) URLs are passed through by reference
 *   (no download here) and "~" paths are expanded to the user home.
 *
 * @throws ToolInputError on empty entries, unsupported schemes/kinds, or a
 *   media kind mismatch against `expectedKind`.
 */
async function loadReferenceAssets(params: {
  inputs: string[];
  expectedKind: "image" | "video";
  maxBytes?: number;
  workspaceDir?: string;
  sandboxConfig: { root: string; bridge: SandboxFsBridge; workspaceOnly: boolean } | null;
}): Promise<
  Array<{
    sourceAsset: VideoGenerationSourceAsset;
    resolvedInput: string;
    rewrittenFrom?: string;
  }>
> {
  const loaded: Array<{
    sourceAsset: VideoGenerationSourceAsset;
    resolvedInput: string;
    rewrittenFrom?: string;
  }> = [];

  for (const rawInput of params.inputs) {
    // Strip the optional "@" attachment marker before classification.
    const trimmed = rawInput.trim();
    const inputRaw = trimmed.startsWith("@") ? trimmed.slice(1).trim() : trimmed;
    if (!inputRaw) {
      throw new ToolInputError(`${params.expectedKind} required (empty string in array)`);
    }
    // Classify the input; "C:\..." looks like a scheme but is a local path.
    const looksLikeWindowsDrivePath = /^[a-zA-Z]:[\\/]/.test(inputRaw);
    const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(inputRaw);
    const isFileUrl = /^file:/i.test(inputRaw);
    const isHttpUrl = /^https?:\/\//i.test(inputRaw);
    const isDataUrl = /^data:/i.test(inputRaw);
    if (hasScheme && !looksLikeWindowsDrivePath && !isFileUrl && !isHttpUrl && !isDataUrl) {
      throw new ToolInputError(
        `Unsupported ${params.expectedKind} reference: ${rawInput}. Use a file path, a file:// URL, a data: URL, or an http(s) URL.`,
      );
    }
    if (params.sandboxConfig && isHttpUrl) {
      throw new ToolInputError(
        `Sandboxed video_generate does not allow remote ${params.expectedKind} URLs.`,
      );
    }

    // Sandbox paths are resolved later by the bridge; only expand "~" here
    // for non-sandboxed local paths.
    const resolvedInput = (() => {
      if (params.sandboxConfig) {
        return inputRaw;
      }
      if (inputRaw.startsWith("~")) {
        return resolveUserPath(inputRaw);
      }
      return inputRaw;
    })();

    // Non-sandboxed remote URLs are forwarded by reference, not downloaded.
    if (isHttpUrl && !params.sandboxConfig) {
      loaded.push({
        sourceAsset: { url: resolvedInput },
        resolvedInput,
      });
      continue;
    }

    // Resolve a concrete filesystem path (empty placeholder for data: URLs);
    // sandboxed paths may be rewritten into the inbound media directory.
    const resolvedPathInfo: { resolved: string; rewrittenFrom?: string } = isDataUrl
      ? { resolved: "" }
      : params.sandboxConfig
        ? await resolveSandboxedBridgeMediaPath({
            sandbox: params.sandboxConfig,
            mediaPath: resolvedInput,
            inboundFallbackDir: "media/inbound",
          })
        : {
            resolved: resolvedInput.startsWith("file://")
              ? resolvedInput.slice("file://".length)
              : resolvedInput,
          };
    const resolvedPath = isDataUrl ? null : resolvedPathInfo.resolved;
    const localRoots = resolveMediaToolLocalRoots(
      params.workspaceDir,
      {
        workspaceOnly: params.sandboxConfig?.workspaceOnly === true,
      },
      resolvedPath ? [resolvedPath] : undefined,
    );
    // Decode/load the media: data: URLs inline (images only), sandboxed files
    // through the bridge reader, everything else via loadWebMedia with the
    // allowed local roots.
    const media = isDataUrl
      ? params.expectedKind === "image"
        ? decodeDataUrl(resolvedInput)
        : (() => {
            throw new ToolInputError("Video data: URLs are not supported for video_generate.");
          })()
      : params.sandboxConfig
        ? await loadWebMedia(resolvedPath ?? resolvedInput, {
            maxBytes: params.maxBytes,
            sandboxValidated: true,
            readFile: createSandboxBridgeReadFile({ sandbox: params.sandboxConfig }),
          })
        : await loadWebMedia(resolvedPath ?? resolvedInput, {
            maxBytes: params.maxBytes,
            localRoots,
          });
    if (media.kind !== params.expectedKind) {
      throw new ToolInputError(`Unsupported media type: ${media.kind ?? "unknown"}`);
    }
    // The two loaders expose slightly different shapes; normalize here.
    const mimeType = "mimeType" in media ? media.mimeType : media.contentType;
    const fileName = "fileName" in media ? media.fileName : undefined;
    loaded.push({
      sourceAsset: {
        buffer: media.buffer,
        mimeType,
        fileName,
      },
      resolvedInput,
      ...(resolvedPathInfo.rewrittenFrom ? { rewrittenFrom: resolvedPathInfo.rewrittenFrom } : {}),
    });
  }

  return loaded;
}
+
+export function createVideoGenerateTool(options?: {
+ config?: OpenClawConfig;
+ agentDir?: string;
+ workspaceDir?: string;
+ sandbox?: VideoGenerateSandboxConfig;
+ fsPolicy?: ToolFsPolicy;
+}): AnyAgentTool | null {
+ const cfg: OpenClawConfig = options?.config ?? loadConfig();
+ const videoGenerationModelConfig = resolveVideoGenerationModelConfigForTool({
+ cfg,
+ agentDir: options?.agentDir,
+ });
+ if (!videoGenerationModelConfig) {
+ return null;
+ }
+
+ const sandboxConfig = options?.sandbox
+ ? {
+ root: options.sandbox.root,
+ bridge: options.sandbox.bridge,
+ workspaceOnly: options.fsPolicy?.workspaceOnly === true,
+ }
+ : null;
+
+ return {
+ label: "Video Generation",
+ name: "video_generate",
+ displaySummary: "Generate videos",
+ description:
+ "Generate videos using configured providers. Generated videos are saved under OpenClaw-managed media storage and delivered automatically as attachments.",
+ parameters: VideoGenerateToolSchema,
+ execute: async (_toolCallId, rawArgs) => {
+ const args = rawArgs as Record;
+ const action = resolveAction(args);
+ const effectiveCfg =
+ applyVideoGenerationModelConfigDefaults(cfg, videoGenerationModelConfig) ?? cfg;
+
+ if (action === "list") {
+ const providers = listRuntimeVideoGenerationProviders({ config: effectiveCfg });
+ if (providers.length === 0) {
+ return {
+ content: [{ type: "text", text: "No video-generation providers are registered." }],
+ details: { providers: [] },
+ };
+ }
+ const lines = providers.map((provider) => {
+ const authHints = getVideoGenerationProviderAuthEnvVars(provider.id);
+ const capabilities = [
+ provider.capabilities.maxVideos ? `maxVideos=${provider.capabilities.maxVideos}` : null,
+ provider.capabilities.maxInputImages
+ ? `maxInputImages=${provider.capabilities.maxInputImages}`
+ : null,
+ provider.capabilities.maxInputVideos
+ ? `maxInputVideos=${provider.capabilities.maxInputVideos}`
+ : null,
+ provider.capabilities.maxDurationSeconds
+ ? `maxDurationSeconds=${provider.capabilities.maxDurationSeconds}`
+ : null,
+ provider.capabilities.supportsResolution ? "resolution" : null,
+ provider.capabilities.supportsAspectRatio ? "aspectRatio" : null,
+ provider.capabilities.supportsSize ? "size" : null,
+ provider.capabilities.supportsAudio ? "audio" : null,
+ provider.capabilities.supportsWatermark ? "watermark" : null,
+ ]
+ .filter((entry): entry is string => Boolean(entry))
+ .join(", ");
+ return [
+ `${provider.id}: default=${provider.defaultModel ?? "none"}`,
+ provider.models?.length ? `models=${provider.models.join(", ")}` : null,
+ capabilities ? `capabilities=${capabilities}` : null,
+ authHints.length > 0 ? `auth=${authHints.join(" / ")}` : null,
+ ]
+ .filter((entry): entry is string => Boolean(entry))
+ .join(" | ");
+ });
+ return {
+ content: [{ type: "text", text: lines.join("\n") }],
+ details: {
+ providers: providers.map((provider) => ({
+ id: provider.id,
+ defaultModel: provider.defaultModel,
+ models: provider.models ?? [],
+ authEnvVars: getVideoGenerationProviderAuthEnvVars(provider.id),
+ capabilities: provider.capabilities,
+ })),
+ },
+ };
+ }
+
+ const prompt = readStringParam(args, "prompt", { required: true });
+ const model = readStringParam(args, "model");
+ const filename = readStringParam(args, "filename");
+ const size = readStringParam(args, "size");
+ const aspectRatio = normalizeAspectRatio(readStringParam(args, "aspectRatio"));
+ const resolution = normalizeResolution(readStringParam(args, "resolution"));
+ const durationSeconds = readNumberParam(args, "durationSeconds", {
+ integer: true,
+ strict: true,
+ });
+ const audio = readBooleanParam(args, "audio");
+ const watermark = readBooleanParam(args, "watermark");
+ const imageInputs = normalizeReferenceInputs({
+ args,
+ singularKey: "image",
+ pluralKey: "images",
+ maxCount: MAX_INPUT_IMAGES,
+ });
+ const videoInputs = normalizeReferenceInputs({
+ args,
+ singularKey: "video",
+ pluralKey: "videos",
+ maxCount: MAX_INPUT_VIDEOS,
+ });
+
+ const selectedProvider = resolveSelectedVideoGenerationProvider({
+ config: effectiveCfg,
+ videoGenerationModelConfig,
+ modelOverride: model,
+ });
+ const loadedReferenceImages = await loadReferenceAssets({
+ inputs: imageInputs,
+ expectedKind: "image",
+ workspaceDir: options?.workspaceDir,
+ sandboxConfig,
+ });
+ const loadedReferenceVideos = await loadReferenceAssets({
+ inputs: videoInputs,
+ expectedKind: "video",
+ workspaceDir: options?.workspaceDir,
+ sandboxConfig,
+ });
+ validateVideoGenerationCapabilities({
+ provider: selectedProvider,
+ inputImageCount: loadedReferenceImages.length,
+ inputVideoCount: loadedReferenceVideos.length,
+ size,
+ aspectRatio,
+ resolution,
+ durationSeconds,
+ audio,
+ watermark,
+ });
+
+ const result = await generateVideo({
+ cfg: effectiveCfg,
+ prompt,
+ agentDir: options?.agentDir,
+ modelOverride: model,
+ size,
+ aspectRatio,
+ resolution,
+ durationSeconds,
+ audio,
+ watermark,
+ inputImages: loadedReferenceImages.map((entry) => entry.sourceAsset),
+ inputVideos: loadedReferenceVideos.map((entry) => entry.sourceAsset),
+ });
+ const savedVideos = await Promise.all(
+ result.videos.map((video) =>
+ saveMediaBuffer(
+ video.buffer,
+ video.mimeType,
+ "tool-video-generation",
+ undefined,
+ filename || video.fileName,
+ ),
+ ),
+ );
+ const lines = [
+ `Generated ${savedVideos.length} video${savedVideos.length === 1 ? "" : "s"} with ${result.provider}/${result.model}.`,
+ ...savedVideos.map((video) => `MEDIA:${video.path}`),
+ ];
+
+ return {
+ content: [{ type: "text", text: lines.join("\n") }],
+ details: {
+ provider: result.provider,
+ model: result.model,
+ count: savedVideos.length,
+ media: {
+ mediaUrls: savedVideos.map((video) => video.path),
+ },
+ paths: savedVideos.map((video) => video.path),
+ ...(loadedReferenceImages.length === 1
+ ? {
+ image: loadedReferenceImages[0]?.resolvedInput,
+ ...(loadedReferenceImages[0]?.rewrittenFrom
+ ? { rewrittenFrom: loadedReferenceImages[0].rewrittenFrom }
+ : {}),
+ }
+ : loadedReferenceImages.length > 1
+ ? {
+ images: loadedReferenceImages.map((entry) => ({
+ image: entry.resolvedInput,
+ ...(entry.rewrittenFrom ? { rewrittenFrom: entry.rewrittenFrom } : {}),
+ })),
+ }
+ : {}),
+ ...(loadedReferenceVideos.length === 1
+ ? {
+ video: loadedReferenceVideos[0]?.resolvedInput,
+ ...(loadedReferenceVideos[0]?.rewrittenFrom
+ ? { videoRewrittenFrom: loadedReferenceVideos[0].rewrittenFrom }
+ : {}),
+ }
+ : loadedReferenceVideos.length > 1
+ ? {
+ videos: loadedReferenceVideos.map((entry) => ({
+ video: entry.resolvedInput,
+ ...(entry.rewrittenFrom ? { rewrittenFrom: entry.rewrittenFrom } : {}),
+ })),
+ }
+ : {}),
+ ...(size ? { size } : {}),
+ ...(aspectRatio ? { aspectRatio } : {}),
+ ...(resolution ? { resolution } : {}),
+ ...(typeof durationSeconds === "number" ? { durationSeconds } : {}),
+ ...(typeof audio === "boolean" ? { audio } : {}),
+ ...(typeof watermark === "boolean" ? { watermark } : {}),
+ ...(filename ? { filename } : {}),
+ attempts: result.attempts,
+ metadata: result.metadata,
+ },
+ };
+ },
+ };
+}
diff --git a/src/cli/config-cli.test.ts b/src/cli/config-cli.test.ts
index e82d9885f85..2334f0dc826 100644
--- a/src/cli/config-cli.test.ts
+++ b/src/cli/config-cli.test.ts
@@ -242,6 +242,37 @@ describe("config cli", () => {
expect(written.gateway?.auth).toEqual({ mode: "token" });
});
+ it("writes agents.defaults.videoGenerationModel.primary without disturbing sibling defaults", async () => {
+ const resolved: OpenClawConfig = {
+ agents: {
+ defaults: {
+ model: "openai/gpt-5.4",
+ imageGenerationModel: {
+ primary: "openai/gpt-image-1",
+ },
+ },
+ },
+ };
+ setSnapshot(resolved, resolved);
+
+ await runConfigCommand([
+ "config",
+ "set",
+ "agents.defaults.videoGenerationModel.primary",
+ "qwen/wan2.6-t2v",
+ ]);
+
+ expect(mockWriteConfigFile).toHaveBeenCalledTimes(1);
+ const written = mockWriteConfigFile.mock.calls[0]?.[0];
+ expect(written.agents?.defaults?.model).toBe("openai/gpt-5.4");
+ expect(written.agents?.defaults?.imageGenerationModel).toEqual({
+ primary: "openai/gpt-image-1",
+ });
+ expect(written.agents?.defaults?.videoGenerationModel).toEqual({
+ primary: "qwen/wan2.6-t2v",
+ });
+ });
+
it("drops gateway.auth.password when switching mode to token", async () => {
const resolved: OpenClawConfig = {
gateway: {
diff --git a/src/config/schema.base.generated.test.ts b/src/config/schema.base.generated.test.ts
index 647181d8241..d7bc682947f 100644
--- a/src/config/schema.base.generated.test.ts
+++ b/src/config/schema.base.generated.test.ts
@@ -40,4 +40,25 @@ describe("generated base config schema", () => {
expect(hooksInternalProperties?.handlers).toBeUndefined();
expect(uiHints["hooks.internal.handlers"]).toBeUndefined();
});
+
+ it("includes videoGenerationModel in the public schema payload", () => {
+ const agentDefaultsProperties = (
+ GENERATED_BASE_CONFIG_SCHEMA.schema as {
+ properties?: {
+ agents?: {
+ properties?: {
+ defaults?: {
+ properties?: Record;
+ };
+ };
+ };
+ };
+ }
+ ).properties?.agents?.properties?.defaults?.properties;
+ const uiHints = GENERATED_BASE_CONFIG_SCHEMA.uiHints as Record;
+
+ expect(agentDefaultsProperties?.videoGenerationModel).toBeDefined();
+ expect(uiHints["agents.defaults.videoGenerationModel.primary"]).toBeDefined();
+ expect(uiHints["agents.defaults.videoGenerationModel.fallbacks"]).toBeDefined();
+ });
});
diff --git a/src/config/zod-schema.agent-defaults.test.ts b/src/config/zod-schema.agent-defaults.test.ts
index 1a99b73bb21..878dc59a9ed 100644
--- a/src/config/zod-schema.agent-defaults.test.ts
+++ b/src/config/zod-schema.agent-defaults.test.ts
@@ -11,4 +11,15 @@ describe("agent defaults schema", () => {
}),
).not.toThrow();
});
+
+ it("accepts videoGenerationModel", () => {
+ expect(() =>
+ AgentDefaultsSchema.parse({
+ videoGenerationModel: {
+ primary: "qwen/wan2.6-t2v",
+ fallbacks: ["minimax/video-01"],
+ },
+ }),
+ ).not.toThrow();
+ });
});