mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-12 01:31:08 +00:00
feat(agents): add video_generate tool
This commit is contained in:
@@ -191,6 +191,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Update/npm: prefer the npm binary that owns the installed global OpenClaw prefix so mixed Homebrew-plus-nvm setups update the right install. (#60153) Thanks @jayeshp19.
|
||||
- Windows/restart: clean up stale gateway listeners before Windows self-restart and treat listener and argv probe failures as inconclusive, so scheduled-task relaunch no longer falls into an `EADDRINUSE` retry loop. (#60480) Thanks @arifahmedjoy.
|
||||
- Plugins: suppress trust-warning noise during non-activating snapshot and CLI metadata loads. (#61427) Thanks @gumadeiras.
|
||||
- Agents/video generation: accept `agents.defaults.videoGenerationModel` in strict config validation and `openclaw config set/get`, so gateways using `video_generate` no longer fail to boot after enabling a video model.
|
||||
|
||||
## 2026.4.2
|
||||
|
||||
|
||||
@@ -1030,6 +1030,31 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"video_generate": {
|
||||
"emoji": "🎬",
|
||||
"title": "Video Generation",
|
||||
"actions": {
|
||||
"generate": {
|
||||
"label": "generate",
|
||||
"detailKeys": [
|
||||
"prompt",
|
||||
"model",
|
||||
"durationSeconds",
|
||||
"resolution",
|
||||
"aspectRatio",
|
||||
"audio",
|
||||
"watermark"
|
||||
]
|
||||
},
|
||||
"list": {
|
||||
"label": "list",
|
||||
"detailKeys": [
|
||||
"provider",
|
||||
"model"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"pdf": {
|
||||
"emoji": "📑",
|
||||
"title": "PDF",
|
||||
|
||||
@@ -30,7 +30,7 @@ Related:
|
||||
falls back to `agents.defaults.imageModel`, then the resolved session/default
|
||||
model.
|
||||
- `agents.defaults.imageGenerationModel` is used by the shared image-generation capability. If omitted, `image_generate` can still infer an auth-backed provider default. It tries the current default provider first, then the remaining registered image-generation providers in provider-id order. If you set a specific provider/model, also configure that provider's auth/API key.
|
||||
- `agents.defaults.videoGenerationModel` is used by the shared video-generation capability. Unlike image generation, this does not infer a provider default today. Set an explicit `provider/model` such as `qwen/wan2.6-t2v`, and configure that provider's auth/API key too.
|
||||
- `agents.defaults.videoGenerationModel` is used by the shared video-generation capability. If omitted, `video_generate` can still infer an auth-backed provider default. It tries the current default provider first, then the remaining registered video-generation providers in provider-id order. If you set a specific provider/model, also configure that provider's auth/API key.
|
||||
- Per-agent defaults can override `agents.defaults.model` via `agents.list[].model` plus bindings (see [/concepts/multi-agent](/concepts/multi-agent)).
|
||||
|
||||
## Quick model policy
|
||||
@@ -252,4 +252,5 @@ This applies whenever OpenClaw regenerates `models.json`, including command-driv
|
||||
- [Model Providers](/concepts/model-providers) — provider routing and auth
|
||||
- [Model Failover](/concepts/model-failover) — fallback chains
|
||||
- [Image Generation](/tools/image-generation) — image model configuration
|
||||
- [Video Generation](/tools/video-generation) — video model configuration
|
||||
- [Configuration Reference](/gateway/configuration-reference#agent-defaults) — model config keys
|
||||
|
||||
@@ -1026,9 +1026,9 @@ Time format in system prompt. Default: `auto` (OS preference).
|
||||
- If you select a provider/model directly, configure the matching provider auth/API key too (for example `GEMINI_API_KEY` or `GOOGLE_API_KEY` for `google/*`, `OPENAI_API_KEY` for `openai/*`, `FAL_KEY` for `fal/*`).
|
||||
- If omitted, `image_generate` can still infer an auth-backed provider default. It tries the current default provider first, then the remaining registered image-generation providers in provider-id order.
|
||||
- `videoGenerationModel`: accepts either a string (`"provider/model"`) or an object (`{ primary, fallbacks }`).
|
||||
- Used by the shared video-generation capability.
|
||||
- Used by the shared video-generation capability and the built-in `video_generate` tool.
|
||||
- Typical values: `qwen/wan2.6-t2v`, `qwen/wan2.6-i2v`, `qwen/wan2.6-r2v`, `qwen/wan2.6-r2v-flash`, or `qwen/wan2.7-r2v`.
|
||||
- Set this explicitly before using shared video generation. Unlike `imageGenerationModel`, the video-generation runtime does not infer a provider default yet.
|
||||
- If omitted, `video_generate` can still infer an auth-backed provider default. It tries the current default provider first, then the remaining registered video-generation providers in provider-id order.
|
||||
- If you select a provider/model directly, configure the matching provider auth/API key too.
|
||||
- The bundled Qwen video-generation provider currently supports up to 1 output video, 1 input image, 4 input videos, 10 seconds duration, and provider-level `size`, `aspectRatio`, `resolution`, `audio`, and `watermark` options.
|
||||
- `pdfModel`: accepts either a string (`"provider/model"`) or an object (`{ primary, fallbacks }`).
|
||||
@@ -1936,12 +1936,12 @@ Defaults for Talk mode (macOS/iOS/Android).
|
||||
|
||||
Local onboarding defaults new local configs to `tools.profile: "coding"` when unset (existing explicit profiles are preserved).
|
||||
|
||||
| Profile | Includes |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------- |
|
||||
| `minimal` | `session_status` only |
|
||||
| `coding` | `group:fs`, `group:runtime`, `group:web`, `group:sessions`, `group:memory`, `cron`, `image`, `image_generate` |
|
||||
| `messaging` | `group:messaging`, `sessions_list`, `sessions_history`, `sessions_send`, `session_status` |
|
||||
| `full` | No restriction (same as unset) |
|
||||
| Profile | Includes |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `minimal` | `session_status` only |
|
||||
| `coding` | `group:fs`, `group:runtime`, `group:web`, `group:sessions`, `group:memory`, `cron`, `image`, `image_generate`, `video_generate` |
|
||||
| `messaging` | `group:messaging`, `sessions_list`, `sessions_history`, `sessions_send`, `session_status` |
|
||||
| `full` | No restriction (same as unset) |
|
||||
|
||||
### Tool groups
|
||||
|
||||
@@ -1957,7 +1957,7 @@ Local onboarding defaults new local configs to `tools.profile: "coding"` when un
|
||||
| `group:messaging` | `message` |
|
||||
| `group:nodes` | `nodes` |
|
||||
| `group:agents` | `agents_list` |
|
||||
| `group:media` | `image`, `image_generate`, `tts` |
|
||||
| `group:media` | `image`, `image_generate`, `video_generate`, `tts` |
|
||||
| `group:openclaw` | All built-in tools (excludes provider plugins) |
|
||||
|
||||
### `tools.allow` / `tools.deny`
|
||||
|
||||
@@ -98,7 +98,7 @@ Available groups:
|
||||
- `group:messaging`: `message`
|
||||
- `group:nodes`: `nodes`
|
||||
- `group:agents`: `agents_list`
|
||||
- `group:media`: `image`, `image_generate`, `tts`
|
||||
- `group:media`: `image`, `image_generate`, `video_generate`, `tts`
|
||||
- `group:openclaw`: all built-in OpenClaw tools (excludes provider plugins)
|
||||
|
||||
## Elevated: exec-only "run on host"
|
||||
|
||||
@@ -123,6 +123,9 @@ Current bundled Qwen video-generation limits:
|
||||
- Up to **4** input videos
|
||||
- Up to **10 seconds** duration
|
||||
- Supports `size`, `aspectRatio`, `resolution`, `audio`, and `watermark`
|
||||
- Reference image/video mode currently requires **remote http(s) URLs**. Local
|
||||
file paths are rejected up front because the DashScope video endpoint does not
|
||||
accept uploaded local buffers for those references.
|
||||
|
||||
See [Qwen / Model Studio](/providers/qwen_modelstudio) for endpoint-level detail
|
||||
and compatibility notes.
|
||||
|
||||
@@ -53,25 +53,28 @@ OpenClaw has three layers that work together:
|
||||
|
||||
These tools ship with OpenClaw and are available without installing any plugins:
|
||||
|
||||
| Tool | What it does | Page |
|
||||
| ------------------------------------------ | --------------------------------------------------------------------- | --------------------------------------- |
|
||||
| `exec` / `process` | Run shell commands, manage background processes | [Exec](/tools/exec) |
|
||||
| `code_execution` | Run sandboxed remote Python analysis | [Code Execution](/tools/code-execution) |
|
||||
| `browser` | Control a Chromium browser (navigate, click, screenshot) | [Browser](/tools/browser) |
|
||||
| `web_search` / `x_search` / `web_fetch` | Search the web, search X posts, fetch page content | [Web](/tools/web) |
|
||||
| `read` / `write` / `edit` | File I/O in the workspace | |
|
||||
| `apply_patch` | Multi-hunk file patches | [Apply Patch](/tools/apply-patch) |
|
||||
| `message` | Send messages across all channels | [Agent Send](/tools/agent-send) |
|
||||
| `canvas` | Drive node Canvas (present, eval, snapshot) | |
|
||||
| `nodes` | Discover and target paired devices | |
|
||||
| `cron` / `gateway` | Manage scheduled jobs; inspect, patch, restart, or update the gateway | |
|
||||
| `image` / `image_generate` | Analyze or generate images | |
|
||||
| `tts` | One-shot text-to-speech conversion | [TTS](/tools/tts) |
|
||||
| `sessions_*` / `subagents` / `agents_list` | Session management, status, and sub-agent orchestration | [Sub-agents](/tools/subagents) |
|
||||
| `session_status` | Lightweight `/status`-style readback and session model override | [Session Tools](/concepts/session-tool) |
|
||||
| Tool | What it does | Page |
|
||||
| ------------------------------------------ | --------------------------------------------------------------------- | ------------------------------------------- |
|
||||
| `exec` / `process` | Run shell commands, manage background processes | [Exec](/tools/exec) |
|
||||
| `code_execution` | Run sandboxed remote Python analysis | [Code Execution](/tools/code-execution) |
|
||||
| `browser` | Control a Chromium browser (navigate, click, screenshot) | [Browser](/tools/browser) |
|
||||
| `web_search` / `x_search` / `web_fetch` | Search the web, search X posts, fetch page content | [Web](/tools/web) |
|
||||
| `read` / `write` / `edit` | File I/O in the workspace | |
|
||||
| `apply_patch` | Multi-hunk file patches | [Apply Patch](/tools/apply-patch) |
|
||||
| `message` | Send messages across all channels | [Agent Send](/tools/agent-send) |
|
||||
| `canvas` | Drive node Canvas (present, eval, snapshot) | |
|
||||
| `nodes` | Discover and target paired devices | |
|
||||
| `cron` / `gateway` | Manage scheduled jobs; inspect, patch, restart, or update the gateway | |
|
||||
| `image` / `image_generate` | Analyze or generate images | [Image Generation](/tools/image-generation) |
|
||||
| `video_generate` | Generate videos | [Video Generation](/tools/video-generation) |
|
||||
| `tts` | One-shot text-to-speech conversion | [TTS](/tools/tts) |
|
||||
| `sessions_*` / `subagents` / `agents_list` | Session management, status, and sub-agent orchestration | [Sub-agents](/tools/subagents) |
|
||||
| `session_status` | Lightweight `/status`-style readback and session model override | [Session Tools](/concepts/session-tool) |
|
||||
|
||||
For image work, use `image` for analysis and `image_generate` for generation or editing. If you target `openai/*`, `google/*`, `fal/*`, or another non-default image provider, configure that provider's auth/API key first.
|
||||
|
||||
For video work, use `video_generate`. If you target `qwen/*` or another non-default video provider, configure that provider's auth/API key first.
|
||||
|
||||
`session_status` is the lightweight status/readback tool in the sessions group.
|
||||
It answers `/status`-style questions about the current session and can
|
||||
optionally set a per-session model override; `model=default` clears that
|
||||
@@ -121,12 +124,12 @@ config. Deny always wins over allow.
|
||||
`tools.profile` sets a base allowlist before `allow`/`deny` is applied.
|
||||
Per-agent override: `agents.list[].tools.profile`.
|
||||
|
||||
| Profile | What it includes |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------- |
|
||||
| `full` | No restriction (same as unset) |
|
||||
| `coding` | `group:fs`, `group:runtime`, `group:web`, `group:sessions`, `group:memory`, `cron`, `image`, `image_generate` |
|
||||
| `messaging` | `group:messaging`, `sessions_list`, `sessions_history`, `sessions_send`, `session_status` |
|
||||
| `minimal` | `session_status` only |
|
||||
| Profile | What it includes |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `full` | No restriction (same as unset) |
|
||||
| `coding` | `group:fs`, `group:runtime`, `group:web`, `group:sessions`, `group:memory`, `cron`, `image`, `image_generate`, `video_generate` |
|
||||
| `messaging` | `group:messaging`, `sessions_list`, `sessions_history`, `sessions_send`, `session_status` |
|
||||
| `minimal` | `session_status` only |
|
||||
|
||||
### Tool groups
|
||||
|
||||
@@ -144,7 +147,7 @@ Use `group:*` shorthands in allow/deny lists:
|
||||
| `group:messaging` | message |
|
||||
| `group:nodes` | nodes |
|
||||
| `group:agents` | agents_list |
|
||||
| `group:media` | image, image_generate, tts |
|
||||
| `group:media` | image, image_generate, video_generate, tts |
|
||||
| `group:openclaw` | All built-in OpenClaw tools (excludes plugin tools) |
|
||||
|
||||
`sessions_history` returns a bounded, safety-filtered recall view. It strips
|
||||
|
||||
109
docs/tools/video-generation.md
Normal file
109
docs/tools/video-generation.md
Normal file
@@ -0,0 +1,109 @@
|
||||
---
|
||||
summary: "Generate videos using configured providers such as Qwen"
|
||||
read_when:
|
||||
- Generating videos via the agent
|
||||
- Configuring video generation providers and models
|
||||
- Understanding the video_generate tool parameters
|
||||
title: "Video Generation"
|
||||
---
|
||||
|
||||
# Video Generation
|
||||
|
||||
The `video_generate` tool lets the agent create videos using your configured providers. Generated videos are delivered automatically as media attachments in the agent's reply.
|
||||
|
||||
<Note>
|
||||
The tool only appears when at least one video-generation provider is available. If you don't see `video_generate` in your agent's tools, configure `agents.defaults.videoGenerationModel` or set up a provider API key.
|
||||
</Note>
|
||||
|
||||
## Quick start
|
||||
|
||||
1. Set an API key for at least one provider (for example `QWEN_API_KEY`).
|
||||
2. Optionally set your preferred model:
|
||||
|
||||
```json5
|
||||
{
|
||||
agents: {
|
||||
defaults: {
|
||||
videoGenerationModel: "qwen/wan2.6-t2v",
|
||||
},
|
||||
},
|
||||
}
|
||||
```
|
||||
|
||||
3. Ask the agent: _"Generate a 5-second cinematic video of a friendly lobster surfing at sunset."_
|
||||
|
||||
The agent calls `video_generate` automatically. No tool allow-listing needed — it's enabled by default when a provider is available.
|
||||
|
||||
## Supported providers
|
||||
|
||||
| Provider | Default model | Reference inputs | API key |
|
||||
| -------- | ------------- | ---------------- | ---------------------------------------------------------- |
|
||||
| Qwen | `wan2.6-t2v` | Yes, remote URLs | `QWEN_API_KEY`, `MODELSTUDIO_API_KEY`, `DASHSCOPE_API_KEY` |
|
||||
|
||||
Use `action: "list"` to inspect available providers and models at runtime:
|
||||
|
||||
```
|
||||
/tool video_generate action=list
|
||||
```
|
||||
|
||||
## Tool parameters
|
||||
|
||||
| Parameter | Type | Description |
|
||||
| ----------------- | -------- | ------------------------------------------------------------------------------------- |
|
||||
| `prompt` | string | Video generation prompt (required for `action: "generate"`) |
|
||||
| `action` | string | `"generate"` (default) or `"list"` to inspect providers |
|
||||
| `model` | string | Provider/model override, e.g. `qwen/wan2.6-t2v` |
|
||||
| `image` | string | Single reference image path or URL |
|
||||
| `images`          | string[] | Multiple reference images (tool accepts up to 5; provider limits may be lower — the bundled Qwen provider currently accepts 1 input image) |
|
||||
| `video` | string | Single reference video path or URL |
|
||||
| `videos` | string[] | Multiple reference videos (up to 4) |
|
||||
| `size` | string | Size hint when the provider supports it |
|
||||
| `aspectRatio` | string | Aspect ratio: `1:1`, `2:3`, `3:2`, `3:4`, `4:3`, `4:5`, `5:4`, `9:16`, `16:9`, `21:9` |
|
||||
| `resolution` | string | Resolution hint: `480P`, `720P`, or `1080P` |
|
||||
| `durationSeconds` | number | Target duration in seconds |
|
||||
| `audio` | boolean | Enable generated audio when the provider supports it |
|
||||
| `watermark` | boolean | Toggle provider watermarking when supported |
|
||||
| `filename` | string | Output filename hint |
|
||||
|
||||
Not all providers support all parameters. The tool validates provider capability limits before it submits the request.
|
||||
|
||||
## Configuration
|
||||
|
||||
### Model selection
|
||||
|
||||
```json5
|
||||
{
|
||||
agents: {
|
||||
defaults: {
|
||||
videoGenerationModel: {
|
||||
primary: "qwen/wan2.6-t2v",
|
||||
fallbacks: ["qwen/wan2.6-r2v-flash"],
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
```
|
||||
|
||||
### Provider selection order
|
||||
|
||||
When generating a video, OpenClaw tries providers in this order:
|
||||
|
||||
1. **`model` parameter** from the tool call (if the agent specifies one)
|
||||
2. **`videoGenerationModel.primary`** from config
|
||||
3. **`videoGenerationModel.fallbacks`** in order
|
||||
4. **Auto-detection** — uses auth-backed provider defaults only:
|
||||
- current default provider first
|
||||
- remaining registered video-generation providers in provider-id order
|
||||
|
||||
If a provider fails, the next candidate is tried automatically. If all fail, the error includes details from each attempt.
|
||||
|
||||
## Qwen reference inputs
|
||||
|
||||
The bundled Qwen provider supports text-to-video plus image/video reference modes, but the upstream DashScope video endpoint currently requires **remote http(s) URLs** for reference inputs. Local file paths and uploaded buffers are rejected up front instead of being silently ignored.
|
||||
|
||||
## Related
|
||||
|
||||
- [Tools Overview](/tools) — all available agent tools
|
||||
- [Qwen](/providers/qwen) — Qwen-specific setup and limits
|
||||
- [Configuration Reference](/gateway/configuration-reference#agent-defaults) — `videoGenerationModel` config
|
||||
- [Models](/concepts/models) — model configuration and failover
|
||||
@@ -107,4 +107,21 @@ describe("qwen video generation provider", () => {
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("fails fast when reference inputs are local buffers instead of remote URLs", async () => {
|
||||
const provider = buildQwenVideoGenerationProvider();
|
||||
|
||||
await expect(
|
||||
provider.generateVideo({
|
||||
provider: "qwen",
|
||||
model: "wan2.6-i2v",
|
||||
prompt: "animate this local frame",
|
||||
cfg: {},
|
||||
inputImages: [{ buffer: Buffer.from("png-bytes"), mimeType: "image/png" }],
|
||||
}),
|
||||
).rejects.toThrow(
|
||||
"Qwen video generation currently requires remote http(s) URLs for reference images/videos.",
|
||||
);
|
||||
expect(postJsonRequestMock).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
@@ -90,7 +90,22 @@ function resolveReferenceUrls(
|
||||
.filter((value): value is string => Boolean(value));
|
||||
}
|
||||
|
||||
function assertQwenReferenceInputsSupported(
|
||||
inputImages: VideoGenerationSourceAsset[] | undefined,
|
||||
inputVideos: VideoGenerationSourceAsset[] | undefined,
|
||||
): void {
|
||||
const unsupported = [...(inputImages ?? []), ...(inputVideos ?? [])].some(
|
||||
(asset) => !asset.url?.trim() && asset.buffer,
|
||||
);
|
||||
if (unsupported) {
|
||||
throw new Error(
|
||||
"Qwen video generation currently requires remote http(s) URLs for reference images/videos.",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
function buildQwenVideoGenerationInput(req: VideoGenerationRequest): Record<string, unknown> {
|
||||
assertQwenReferenceInputsSupported(req.inputImages, req.inputVideos);
|
||||
const input: Record<string, unknown> = {
|
||||
prompt: req.prompt,
|
||||
};
|
||||
|
||||
@@ -31,6 +31,7 @@ import { createSessionsYieldTool } from "./tools/sessions-yield-tool.js";
|
||||
import { createSubagentsTool } from "./tools/subagents-tool.js";
|
||||
import { createTtsTool } from "./tools/tts-tool.js";
|
||||
import { createUpdatePlanTool } from "./tools/update-plan-tool.js";
|
||||
import { createVideoGenerateTool } from "./tools/video-generate-tool.js";
|
||||
import { createWebFetchTool, createWebSearchTool } from "./tools/web-tools.js";
|
||||
import { resolveWorkspaceRoot } from "./workspace-dir.js";
|
||||
|
||||
@@ -159,6 +160,13 @@ export function createOpenClawTools(
|
||||
sandbox,
|
||||
fsPolicy: options?.fsPolicy,
|
||||
});
|
||||
const videoGenerateTool = createVideoGenerateTool({
|
||||
config: options?.config,
|
||||
agentDir: options?.agentDir,
|
||||
workspaceDir,
|
||||
sandbox,
|
||||
fsPolicy: options?.fsPolicy,
|
||||
});
|
||||
const pdfTool = options?.agentDir?.trim()
|
||||
? createPdfTool({
|
||||
config: options?.config,
|
||||
@@ -216,6 +224,7 @@ export function createOpenClawTools(
|
||||
config: options?.config,
|
||||
}),
|
||||
...(imageGenerateTool ? [imageGenerateTool] : []),
|
||||
...(videoGenerateTool ? [videoGenerateTool] : []),
|
||||
createGatewayTool({
|
||||
agentSessionKey: options?.agentSessionKey,
|
||||
config: options?.config,
|
||||
|
||||
91
src/agents/openclaw-tools.video-generation.test.ts
Normal file
91
src/agents/openclaw-tools.video-generation.test.ts
Normal file
@@ -0,0 +1,91 @@
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import * as videoGenerationRuntime from "../video-generation/runtime.js";
|
||||
import { createOpenClawTools } from "./openclaw-tools.js";
|
||||
|
||||
vi.mock("../plugins/tools.js", () => ({
|
||||
resolvePluginTools: () => [],
|
||||
copyPluginToolMeta: () => undefined,
|
||||
getPluginToolMeta: () => undefined,
|
||||
}));
|
||||
|
||||
function asConfig(value: unknown): OpenClawConfig {
|
||||
return value as OpenClawConfig;
|
||||
}
|
||||
|
||||
function stubVideoGenerationProviders() {
|
||||
vi.spyOn(videoGenerationRuntime, "listRuntimeVideoGenerationProviders").mockReturnValue([
|
||||
{
|
||||
id: "qwen",
|
||||
defaultModel: "wan2.6-t2v",
|
||||
models: ["wan2.6-t2v"],
|
||||
capabilities: {
|
||||
maxVideos: 1,
|
||||
maxInputImages: 1,
|
||||
maxInputVideos: 4,
|
||||
maxDurationSeconds: 10,
|
||||
supportsSize: true,
|
||||
supportsAspectRatio: true,
|
||||
supportsResolution: true,
|
||||
supportsAudio: true,
|
||||
supportsWatermark: true,
|
||||
},
|
||||
generateVideo: vi.fn(async () => {
|
||||
throw new Error("not used");
|
||||
}),
|
||||
},
|
||||
]);
|
||||
}
|
||||
|
||||
describe("openclaw tools video generation registration", () => {
|
||||
beforeEach(() => {
|
||||
vi.stubEnv("QWEN_API_KEY", "");
|
||||
vi.stubEnv("MODELSTUDIO_API_KEY", "");
|
||||
vi.stubEnv("DASHSCOPE_API_KEY", "");
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
vi.restoreAllMocks();
|
||||
vi.unstubAllEnvs();
|
||||
});
|
||||
|
||||
it("registers video_generate when video-generation config is present", () => {
|
||||
const tools = createOpenClawTools({
|
||||
config: asConfig({
|
||||
agents: {
|
||||
defaults: {
|
||||
videoGenerationModel: {
|
||||
primary: "qwen/wan2.6-t2v",
|
||||
},
|
||||
},
|
||||
},
|
||||
}),
|
||||
agentDir: "/tmp/openclaw-agent-main",
|
||||
});
|
||||
|
||||
expect(tools.map((tool) => tool.name)).toContain("video_generate");
|
||||
});
|
||||
|
||||
it("registers video_generate when a compatible provider has env-backed auth", () => {
|
||||
stubVideoGenerationProviders();
|
||||
vi.stubEnv("QWEN_API_KEY", "qwen-test");
|
||||
|
||||
const tools = createOpenClawTools({
|
||||
config: asConfig({}),
|
||||
agentDir: "/tmp/openclaw-agent-main",
|
||||
});
|
||||
|
||||
expect(tools.map((tool) => tool.name)).toContain("video_generate");
|
||||
});
|
||||
|
||||
it("omits video_generate when config is absent and no compatible provider auth exists", () => {
|
||||
stubVideoGenerationProviders();
|
||||
|
||||
const tools = createOpenClawTools({
|
||||
config: asConfig({}),
|
||||
agentDir: "/tmp/openclaw-agent-main",
|
||||
});
|
||||
|
||||
expect(tools.map((tool) => tool.name)).not.toContain("video_generate");
|
||||
});
|
||||
});
|
||||
@@ -46,6 +46,9 @@ vi.mock("./tools/gateway-tool.js", () => ({
|
||||
vi.mock("./tools/image-generate-tool.js", () => ({
|
||||
createImageGenerateTool: mockToolFactory("image_generate_stub"),
|
||||
}));
|
||||
vi.mock("./tools/video-generate-tool.js", () => ({
|
||||
createVideoGenerateTool: mockToolFactory("video_generate_stub"),
|
||||
}));
|
||||
vi.mock("./tools/image-tool.js", () => ({
|
||||
createImageTool: mockToolFactory("image_stub"),
|
||||
}));
|
||||
|
||||
@@ -265,6 +265,10 @@ describe("extractToolResultMediaPaths", () => {
|
||||
expect(isToolResultMediaTrusted("image_generate")).toBe(true);
|
||||
});
|
||||
|
||||
it("trusts video_generate local MEDIA paths", () => {
|
||||
expect(isToolResultMediaTrusted("video_generate")).toBe(true);
|
||||
});
|
||||
|
||||
it("does not trust local MEDIA paths for MCP-provenance results", () => {
|
||||
expect(
|
||||
filterToolResultMediaUrls("browser", ["/tmp/screenshot.png"], {
|
||||
|
||||
@@ -156,6 +156,7 @@ const TRUSTED_TOOL_RESULT_MEDIA = new Set([
|
||||
"sessions_spawn",
|
||||
"subagents",
|
||||
"tts",
|
||||
"video_generate",
|
||||
"web_fetch",
|
||||
"web_search",
|
||||
"x_search",
|
||||
|
||||
@@ -32,6 +32,7 @@ const coreTools = [
|
||||
stubActionTool("session_status", ["get", "show"]),
|
||||
stubTool("tts"),
|
||||
stubTool("image_generate"),
|
||||
stubTool("video_generate"),
|
||||
stubTool("web_fetch"),
|
||||
stubTool("image"),
|
||||
stubTool("pdf"),
|
||||
|
||||
@@ -23,6 +23,10 @@ vi.mock("../tools/image-generate-tool.js", () => ({
|
||||
createImageGenerateTool: () => stubTool("image_generate"),
|
||||
}));
|
||||
|
||||
vi.mock("../tools/video-generate-tool.js", () => ({
|
||||
createVideoGenerateTool: () => stubTool("video_generate"),
|
||||
}));
|
||||
|
||||
vi.mock("../tools/web-tools.js", () => ({
|
||||
createWebSearchTool: () => null,
|
||||
createWebFetchTool: () => null,
|
||||
|
||||
@@ -10,6 +10,7 @@ describe("tool-catalog", () => {
|
||||
expect(policy!.allow).toContain("x_search");
|
||||
expect(policy!.allow).toContain("web_fetch");
|
||||
expect(policy!.allow).toContain("image_generate");
|
||||
expect(policy!.allow).toContain("video_generate");
|
||||
expect(policy!.allow).toContain("update_plan");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -277,6 +277,14 @@ const CORE_TOOL_DEFINITIONS: CoreToolDefinition[] = [
|
||||
profiles: ["coding"],
|
||||
includeInOpenClawGroup: true,
|
||||
},
|
||||
{
|
||||
id: "video_generate",
|
||||
label: "video_generate",
|
||||
description: "Video generation",
|
||||
sectionId: "media",
|
||||
profiles: ["coding"],
|
||||
includeInOpenClawGroup: true,
|
||||
},
|
||||
{
|
||||
id: "tts",
|
||||
label: "tts",
|
||||
|
||||
@@ -640,6 +640,28 @@ export const TOOL_DISPLAY_CONFIG: ToolDisplayConfig = {
|
||||
},
|
||||
},
|
||||
},
|
||||
video_generate: {
|
||||
emoji: "🎬",
|
||||
title: "Video Generation",
|
||||
actions: {
|
||||
generate: {
|
||||
label: "generate",
|
||||
detailKeys: [
|
||||
"prompt",
|
||||
"model",
|
||||
"durationSeconds",
|
||||
"resolution",
|
||||
"aspectRatio",
|
||||
"audio",
|
||||
"watermark",
|
||||
],
|
||||
},
|
||||
list: {
|
||||
label: "list",
|
||||
detailKeys: ["provider", "model"],
|
||||
},
|
||||
},
|
||||
},
|
||||
pdf: {
|
||||
emoji: "📑",
|
||||
title: "PDF",
|
||||
|
||||
@@ -32,9 +32,16 @@ export function applyImageGenerationModelConfigDefaults(
|
||||
return applyAgentDefaultModelConfig(cfg, "imageGenerationModel", imageGenerationModelConfig);
|
||||
}
|
||||
|
||||
export function applyVideoGenerationModelConfigDefaults(
|
||||
cfg: OpenClawConfig | undefined,
|
||||
videoGenerationModelConfig: ToolModelConfig,
|
||||
): OpenClawConfig | undefined {
|
||||
return applyAgentDefaultModelConfig(cfg, "videoGenerationModel", videoGenerationModelConfig);
|
||||
}
|
||||
|
||||
function applyAgentDefaultModelConfig(
|
||||
cfg: OpenClawConfig | undefined,
|
||||
key: "imageModel" | "imageGenerationModel",
|
||||
key: "imageModel" | "imageGenerationModel" | "videoGenerationModel",
|
||||
modelConfig: ToolModelConfig,
|
||||
): OpenClawConfig | undefined {
|
||||
if (!cfg) {
|
||||
|
||||
91
src/agents/tools/video-generate-tool.test.ts
Normal file
91
src/agents/tools/video-generate-tool.test.ts
Normal file
@@ -0,0 +1,91 @@
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import type { OpenClawConfig } from "../../config/config.js";
|
||||
import * as mediaStore from "../../media/store.js";
|
||||
import * as videoGenerationRuntime from "../../video-generation/runtime.js";
|
||||
import { createVideoGenerateTool } from "./video-generate-tool.js";
|
||||
|
||||
function asConfig(value: unknown): OpenClawConfig {
|
||||
return value as OpenClawConfig;
|
||||
}
|
||||
|
||||
describe("createVideoGenerateTool", () => {
  beforeEach(() => {
    // Reset spies between tests so provider/media mocks don't leak.
    vi.restoreAllMocks();
  });

  afterEach(() => {
    vi.unstubAllEnvs();
  });

  it("returns null when no video-generation config or auth-backed provider is available", () => {
    // With zero runtime providers and an empty config, the factory must opt out.
    vi.spyOn(videoGenerationRuntime, "listRuntimeVideoGenerationProviders").mockReturnValue([]);

    expect(createVideoGenerateTool({ config: asConfig({}) })).toBeNull();
  });

  it("registers when video-generation config is present", () => {
    // An explicit agents.defaults.videoGenerationModel is enough to register.
    expect(
      createVideoGenerateTool({
        config: asConfig({
          agents: {
            defaults: {
              videoGenerationModel: { primary: "qwen/wan2.6-t2v" },
            },
          },
        }),
      }),
    ).not.toBeNull();
  });

  it("generates videos, saves them, and emits MEDIA paths", async () => {
    // Stub the provider call so no network/auth is needed.
    vi.spyOn(videoGenerationRuntime, "generateVideo").mockResolvedValue({
      provider: "qwen",
      model: "wan2.6-t2v",
      attempts: [],
      videos: [
        {
          buffer: Buffer.from("video-bytes"),
          mimeType: "video/mp4",
          fileName: "lobster.mp4",
        },
      ],
      metadata: { taskId: "task-1" },
    });
    // Stub persistence: the tool should surface this exact saved path.
    vi.spyOn(mediaStore, "saveMediaBuffer").mockResolvedValueOnce({
      path: "/tmp/generated-lobster.mp4",
      id: "generated-lobster.mp4",
      size: 11,
      contentType: "video/mp4",
    });

    const tool = createVideoGenerateTool({
      config: asConfig({
        agents: {
          defaults: {
            videoGenerationModel: { primary: "qwen/wan2.6-t2v" },
          },
        },
      }),
    });
    expect(tool).not.toBeNull();
    if (!tool) {
      throw new Error("expected video_generate tool");
    }

    const result = await tool.execute("call-1", { prompt: "friendly lobster surfing" });
    const text = (result.content?.[0] as { text: string } | undefined)?.text ?? "";

    // The saved path must appear both as a MEDIA: line and in details.
    expect(text).toContain("Generated 1 video with qwen/wan2.6-t2v.");
    expect(text).toContain("MEDIA:/tmp/generated-lobster.mp4");
    expect(result.details).toMatchObject({
      provider: "qwen",
      model: "wan2.6-t2v",
      count: 1,
      media: {
        mediaUrls: ["/tmp/generated-lobster.mp4"],
      },
      paths: ["/tmp/generated-lobster.mp4"],
      metadata: { taskId: "task-1" },
    });
  });
});
|
||||
735
src/agents/tools/video-generate-tool.ts
Normal file
735
src/agents/tools/video-generate-tool.ts
Normal file
@@ -0,0 +1,735 @@
|
||||
import { Type } from "@sinclair/typebox";
|
||||
import type { OpenClawConfig } from "../../config/config.js";
|
||||
import { loadConfig } from "../../config/config.js";
|
||||
import { saveMediaBuffer } from "../../media/store.js";
|
||||
import { loadWebMedia } from "../../media/web-media.js";
|
||||
import { readSnakeCaseParamRaw } from "../../param-key.js";
|
||||
import { getProviderEnvVars } from "../../secrets/provider-env-vars.js";
|
||||
import { resolveUserPath } from "../../utils.js";
|
||||
import { parseVideoGenerationModelRef } from "../../video-generation/model-ref.js";
|
||||
import {
|
||||
generateVideo,
|
||||
listRuntimeVideoGenerationProviders,
|
||||
} from "../../video-generation/runtime.js";
|
||||
import type {
|
||||
VideoGenerationProvider,
|
||||
VideoGenerationResolution,
|
||||
VideoGenerationSourceAsset,
|
||||
} from "../../video-generation/types.js";
|
||||
import { normalizeProviderId } from "../provider-id.js";
|
||||
import {
|
||||
ToolInputError,
|
||||
readNumberParam,
|
||||
readStringArrayParam,
|
||||
readStringParam,
|
||||
} from "./common.js";
|
||||
import { decodeDataUrl } from "./image-tool.helpers.js";
|
||||
import {
|
||||
applyVideoGenerationModelConfigDefaults,
|
||||
resolveMediaToolLocalRoots,
|
||||
} from "./media-tool-shared.js";
|
||||
import {
|
||||
buildToolModelConfigFromCandidates,
|
||||
coerceToolModelConfig,
|
||||
hasAuthForProvider,
|
||||
hasToolModelConfig,
|
||||
resolveDefaultModelRef,
|
||||
type ToolModelConfig,
|
||||
} from "./model-config.helpers.js";
|
||||
import {
|
||||
createSandboxBridgeReadFile,
|
||||
resolveSandboxedBridgeMediaPath,
|
||||
type AnyAgentTool,
|
||||
type SandboxFsBridge,
|
||||
type ToolFsPolicy,
|
||||
} from "./tool-runtime.helpers.js";
|
||||
|
||||
// Tool-level upper bounds on reference assets; providers may declare tighter
// limits via capabilities.maxInputImages / maxInputVideos.
const MAX_INPUT_IMAGES = 5;
const MAX_INPUT_VIDEOS = 4;
// Aspect-ratio hints accepted by normalizeAspectRatio().
const SUPPORTED_ASPECT_RATIOS = new Set([
  "1:1",
  "2:3",
  "3:2",
  "3:4",
  "4:3",
  "4:5",
  "5:4",
  "9:16",
  "16:9",
  "21:9",
]);
|
||||
|
||||
// Tool-call parameter schema. Every field is optional at the schema level;
// the "generate" action additionally requires `prompt` at runtime (enforced
// in execute() via readStringParam(..., { required: true })).
const VideoGenerateToolSchema = Type.Object({
  action: Type.Optional(
    Type.String({
      description:
        'Optional action: "generate" (default) or "list" to inspect available providers/models.',
    }),
  ),
  prompt: Type.Optional(Type.String({ description: "Video generation prompt." })),
  image: Type.Optional(
    Type.String({
      description: "Optional single reference image path or URL.",
    }),
  ),
  images: Type.Optional(
    Type.Array(Type.String(), {
      description: `Optional reference images (up to ${MAX_INPUT_IMAGES}).`,
    }),
  ),
  video: Type.Optional(
    Type.String({
      description: "Optional single reference video path or URL.",
    }),
  ),
  videos: Type.Optional(
    Type.Array(Type.String(), {
      description: `Optional reference videos (up to ${MAX_INPUT_VIDEOS}).`,
    }),
  ),
  model: Type.Optional(
    Type.String({ description: "Optional provider/model override, e.g. qwen/wan2.6-t2v." }),
  ),
  filename: Type.Optional(
    Type.String({
      description:
        "Optional output filename hint. OpenClaw preserves the basename and saves under its managed media directory.",
    }),
  ),
  size: Type.Optional(
    Type.String({
      description: "Optional size hint like 1280x720 or 1920x1080 when the provider supports it.",
    }),
  ),
  aspectRatio: Type.Optional(
    Type.String({
      description:
        "Optional aspect ratio hint: 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, or 21:9.",
    }),
  ),
  resolution: Type.Optional(
    Type.String({
      description: "Optional resolution hint: 480P, 720P, or 1080P.",
    }),
  ),
  durationSeconds: Type.Optional(
    Type.Number({
      description: "Optional target duration in seconds.",
      minimum: 1,
    }),
  ),
  audio: Type.Optional(
    Type.Boolean({
      description: "Optional audio toggle when the provider supports generated audio.",
    }),
  ),
  watermark: Type.Optional(
    Type.Boolean({
      description: "Optional watermark toggle when the provider supports it.",
    }),
  ),
});
|
||||
|
||||
// Env-var names that may carry credentials for a provider; surfaced as auth
// hints in the "list" action output. Thin alias over the shared secrets helper.
function getVideoGenerationProviderAuthEnvVars(providerId: string): string[] {
  return getProviderEnvVars(providerId);
}
|
||||
|
||||
function resolveVideoGenerationModelCandidates(params: {
|
||||
cfg?: OpenClawConfig;
|
||||
agentDir?: string;
|
||||
}): Array<string | undefined> {
|
||||
const providerDefaults = new Map<string, string>();
|
||||
for (const provider of listRuntimeVideoGenerationProviders({ config: params.cfg })) {
|
||||
const providerId = provider.id.trim();
|
||||
const modelId = provider.defaultModel?.trim();
|
||||
if (
|
||||
!providerId ||
|
||||
!modelId ||
|
||||
providerDefaults.has(providerId) ||
|
||||
!isVideoGenerationProviderConfigured({
|
||||
provider,
|
||||
cfg: params.cfg,
|
||||
agentDir: params.agentDir,
|
||||
})
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
providerDefaults.set(providerId, `${providerId}/${modelId}`);
|
||||
}
|
||||
|
||||
const primaryProvider = resolveDefaultModelRef(params.cfg).provider;
|
||||
const orderedProviders = [
|
||||
primaryProvider,
|
||||
...[...providerDefaults.keys()]
|
||||
.filter((providerId) => providerId !== primaryProvider)
|
||||
.toSorted(),
|
||||
];
|
||||
const orderedRefs: string[] = [];
|
||||
const seen = new Set<string>();
|
||||
for (const providerId of orderedProviders) {
|
||||
const ref = providerDefaults.get(providerId);
|
||||
if (!ref || seen.has(ref)) {
|
||||
continue;
|
||||
}
|
||||
seen.add(ref);
|
||||
orderedRefs.push(ref);
|
||||
}
|
||||
return orderedRefs;
|
||||
}
|
||||
|
||||
export function resolveVideoGenerationModelConfigForTool(params: {
|
||||
cfg?: OpenClawConfig;
|
||||
agentDir?: string;
|
||||
}): ToolModelConfig | null {
|
||||
const explicit = coerceToolModelConfig(params.cfg?.agents?.defaults?.videoGenerationModel);
|
||||
if (hasToolModelConfig(explicit)) {
|
||||
return explicit;
|
||||
}
|
||||
return buildToolModelConfigFromCandidates({
|
||||
explicit,
|
||||
agentDir: params.agentDir,
|
||||
candidates: resolveVideoGenerationModelCandidates(params),
|
||||
isProviderConfigured: (providerId) =>
|
||||
isVideoGenerationProviderConfigured({
|
||||
providerId,
|
||||
cfg: params.cfg,
|
||||
agentDir: params.agentDir,
|
||||
}),
|
||||
});
|
||||
}
|
||||
|
||||
function isVideoGenerationProviderConfigured(params: {
|
||||
provider?: VideoGenerationProvider;
|
||||
providerId?: string;
|
||||
cfg?: OpenClawConfig;
|
||||
agentDir?: string;
|
||||
}): boolean {
|
||||
const provider =
|
||||
params.provider ??
|
||||
listRuntimeVideoGenerationProviders({ config: params.cfg }).find((candidate) => {
|
||||
const normalizedId = normalizeProviderId(params.providerId ?? "");
|
||||
return (
|
||||
normalizeProviderId(candidate.id) === normalizedId ||
|
||||
(candidate.aliases ?? []).some((alias) => normalizeProviderId(alias) === normalizedId)
|
||||
);
|
||||
});
|
||||
if (!provider) {
|
||||
return params.providerId
|
||||
? hasAuthForProvider({ provider: params.providerId, agentDir: params.agentDir })
|
||||
: false;
|
||||
}
|
||||
if (provider.isConfigured) {
|
||||
return provider.isConfigured({
|
||||
cfg: params.cfg,
|
||||
agentDir: params.agentDir,
|
||||
});
|
||||
}
|
||||
return hasAuthForProvider({ provider: provider.id, agentDir: params.agentDir });
|
||||
}
|
||||
|
||||
function resolveAction(args: Record<string, unknown>): "generate" | "list" {
|
||||
const raw = readStringParam(args, "action");
|
||||
if (!raw) {
|
||||
return "generate";
|
||||
}
|
||||
const normalized = raw.trim().toLowerCase();
|
||||
if (normalized === "generate" || normalized === "list") {
|
||||
return normalized;
|
||||
}
|
||||
throw new ToolInputError('action must be "generate" or "list"');
|
||||
}
|
||||
|
||||
function normalizeResolution(raw: string | undefined): VideoGenerationResolution | undefined {
|
||||
const normalized = raw?.trim().toUpperCase();
|
||||
if (!normalized) {
|
||||
return undefined;
|
||||
}
|
||||
if (normalized === "480P" || normalized === "720P" || normalized === "1080P") {
|
||||
return normalized;
|
||||
}
|
||||
throw new ToolInputError("resolution must be one of 480P, 720P, or 1080P");
|
||||
}
|
||||
|
||||
function normalizeAspectRatio(raw: string | undefined): string | undefined {
|
||||
const normalized = raw?.trim();
|
||||
if (!normalized) {
|
||||
return undefined;
|
||||
}
|
||||
if (SUPPORTED_ASPECT_RATIOS.has(normalized)) {
|
||||
return normalized;
|
||||
}
|
||||
throw new ToolInputError(
|
||||
"aspectRatio must be one of 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, or 21:9",
|
||||
);
|
||||
}
|
||||
|
||||
function readBooleanParam(params: Record<string, unknown>, key: string): boolean | undefined {
|
||||
const raw = readSnakeCaseParamRaw(params, key);
|
||||
if (typeof raw === "boolean") {
|
||||
return raw;
|
||||
}
|
||||
if (typeof raw === "string") {
|
||||
const normalized = raw.trim().toLowerCase();
|
||||
if (normalized === "true") {
|
||||
return true;
|
||||
}
|
||||
if (normalized === "false") {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function normalizeReferenceInputs(params: {
|
||||
args: Record<string, unknown>;
|
||||
singularKey: "image" | "video";
|
||||
pluralKey: "images" | "videos";
|
||||
maxCount: number;
|
||||
}): string[] {
|
||||
const single = readStringParam(params.args, params.singularKey);
|
||||
const multiple = readStringArrayParam(params.args, params.pluralKey);
|
||||
const combined = [...(single ? [single] : []), ...(multiple ?? [])];
|
||||
const deduped: string[] = [];
|
||||
const seen = new Set<string>();
|
||||
for (const candidate of combined) {
|
||||
const trimmed = candidate.trim();
|
||||
const dedupe = trimmed.startsWith("@") ? trimmed.slice(1).trim() : trimmed;
|
||||
if (!dedupe || seen.has(dedupe)) {
|
||||
continue;
|
||||
}
|
||||
seen.add(dedupe);
|
||||
deduped.push(trimmed);
|
||||
}
|
||||
if (deduped.length > params.maxCount) {
|
||||
throw new ToolInputError(
|
||||
`Too many reference ${params.pluralKey}: ${deduped.length} provided, maximum is ${params.maxCount}.`,
|
||||
);
|
||||
}
|
||||
return deduped;
|
||||
}
|
||||
|
||||
function resolveSelectedVideoGenerationProvider(params: {
|
||||
config?: OpenClawConfig;
|
||||
videoGenerationModelConfig: ToolModelConfig;
|
||||
modelOverride?: string;
|
||||
}): VideoGenerationProvider | undefined {
|
||||
const selectedRef =
|
||||
parseVideoGenerationModelRef(params.modelOverride) ??
|
||||
parseVideoGenerationModelRef(params.videoGenerationModelConfig.primary);
|
||||
if (!selectedRef) {
|
||||
return undefined;
|
||||
}
|
||||
const selectedProvider = normalizeProviderId(selectedRef.provider);
|
||||
return listRuntimeVideoGenerationProviders({ config: params.config }).find(
|
||||
(provider) =>
|
||||
normalizeProviderId(provider.id) === selectedProvider ||
|
||||
(provider.aliases ?? []).some((alias) => normalizeProviderId(alias) === selectedProvider),
|
||||
);
|
||||
}
|
||||
|
||||
function validateVideoGenerationCapabilities(params: {
|
||||
provider: VideoGenerationProvider | undefined;
|
||||
inputImageCount: number;
|
||||
inputVideoCount: number;
|
||||
size?: string;
|
||||
aspectRatio?: string;
|
||||
resolution?: VideoGenerationResolution;
|
||||
durationSeconds?: number;
|
||||
audio?: boolean;
|
||||
watermark?: boolean;
|
||||
}) {
|
||||
const provider = params.provider;
|
||||
if (!provider) {
|
||||
return;
|
||||
}
|
||||
const caps = provider.capabilities;
|
||||
if (params.inputImageCount > 0) {
|
||||
const maxInputImages = caps.maxInputImages ?? MAX_INPUT_IMAGES;
|
||||
if (params.inputImageCount > maxInputImages) {
|
||||
throw new ToolInputError(
|
||||
`${provider.id} supports at most ${maxInputImages} reference image${maxInputImages === 1 ? "" : "s"}.`,
|
||||
);
|
||||
}
|
||||
}
|
||||
if (params.inputVideoCount > 0) {
|
||||
const maxInputVideos = caps.maxInputVideos ?? MAX_INPUT_VIDEOS;
|
||||
if (params.inputVideoCount > maxInputVideos) {
|
||||
throw new ToolInputError(
|
||||
`${provider.id} supports at most ${maxInputVideos} reference video${maxInputVideos === 1 ? "" : "s"}.`,
|
||||
);
|
||||
}
|
||||
}
|
||||
if (params.size && !caps.supportsSize) {
|
||||
throw new ToolInputError(`${provider.id} does not support size overrides.`);
|
||||
}
|
||||
if (params.aspectRatio && !caps.supportsAspectRatio) {
|
||||
throw new ToolInputError(`${provider.id} does not support aspectRatio overrides.`);
|
||||
}
|
||||
if (params.resolution && !caps.supportsResolution) {
|
||||
throw new ToolInputError(`${provider.id} does not support resolution overrides.`);
|
||||
}
|
||||
if (
|
||||
typeof params.durationSeconds === "number" &&
|
||||
Number.isFinite(params.durationSeconds) &&
|
||||
typeof caps.maxDurationSeconds === "number" &&
|
||||
params.durationSeconds > caps.maxDurationSeconds
|
||||
) {
|
||||
throw new ToolInputError(
|
||||
`${provider.id} supports at most ${caps.maxDurationSeconds} seconds per video.`,
|
||||
);
|
||||
}
|
||||
if (typeof params.audio === "boolean" && !caps.supportsAudio) {
|
||||
throw new ToolInputError(`${provider.id} does not support audio toggles.`);
|
||||
}
|
||||
if (typeof params.watermark === "boolean" && !caps.supportsWatermark) {
|
||||
throw new ToolInputError(`${provider.id} does not support watermark toggles.`);
|
||||
}
|
||||
}
|
||||
|
||||
// Sandbox wiring for the tool: the sandbox root plus the fs bridge used to
// resolve and read media paths from inside the sandbox.
type VideoGenerateSandboxConfig = {
  root: string;
  bridge: SandboxFsBridge;
};
|
||||
|
||||
/**
 * Load user-supplied reference images/videos into provider-ready source
 * assets. Accepts plain paths, `~` paths (expanded only outside the sandbox),
 * file:// URLs, data: URLs (images only), and http(s) URLs. In sandbox mode
 * remote URLs are rejected and paths are resolved via the sandbox bridge;
 * outside the sandbox http(s) URLs are passed through by reference without
 * downloading.
 *
 * @throws ToolInputError for empty, unsupported-scheme, or wrongly-typed inputs.
 */
async function loadReferenceAssets(params: {
  inputs: string[];
  expectedKind: "image" | "video";
  maxBytes?: number;
  workspaceDir?: string;
  sandboxConfig: { root: string; bridge: SandboxFsBridge; workspaceOnly: boolean } | null;
}): Promise<
  Array<{
    sourceAsset: VideoGenerationSourceAsset;
    resolvedInput: string;
    rewrittenFrom?: string;
  }>
> {
  const loaded: Array<{
    sourceAsset: VideoGenerationSourceAsset;
    resolvedInput: string;
    rewrittenFrom?: string;
  }> = [];

  for (const rawInput of params.inputs) {
    // Strip the optional "@" attachment prefix before classifying the input.
    const trimmed = rawInput.trim();
    const inputRaw = trimmed.startsWith("@") ? trimmed.slice(1).trim() : trimmed;
    if (!inputRaw) {
      throw new ToolInputError(`${params.expectedKind} required (empty string in array)`);
    }
    // Classify: Windows drive paths look like schemes ("C:\...") and must not
    // be rejected by the unknown-scheme check below.
    const looksLikeWindowsDrivePath = /^[a-zA-Z]:[\\/]/.test(inputRaw);
    const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(inputRaw);
    const isFileUrl = /^file:/i.test(inputRaw);
    const isHttpUrl = /^https?:\/\//i.test(inputRaw);
    const isDataUrl = /^data:/i.test(inputRaw);
    if (hasScheme && !looksLikeWindowsDrivePath && !isFileUrl && !isHttpUrl && !isDataUrl) {
      throw new ToolInputError(
        `Unsupported ${params.expectedKind} reference: ${rawInput}. Use a file path, a file:// URL, a data: URL, or an http(s) URL.`,
      );
    }
    if (params.sandboxConfig && isHttpUrl) {
      throw new ToolInputError(
        `Sandboxed video_generate does not allow remote ${params.expectedKind} URLs.`,
      );
    }

    // "~" expansion is host-only; sandbox paths are resolved by the bridge.
    const resolvedInput = (() => {
      if (params.sandboxConfig) {
        return inputRaw;
      }
      if (inputRaw.startsWith("~")) {
        return resolveUserPath(inputRaw);
      }
      return inputRaw;
    })();

    // Outside the sandbox, remote URLs are forwarded by reference (no download).
    if (isHttpUrl && !params.sandboxConfig) {
      loaded.push({
        sourceAsset: { url: resolvedInput },
        resolvedInput,
      });
      continue;
    }

    // Resolve a concrete filesystem path: sandbox bridge resolution, or a
    // plain file:// prefix strip on the host. data: URLs have no path.
    const resolvedPathInfo: { resolved: string; rewrittenFrom?: string } = isDataUrl
      ? { resolved: "" }
      : params.sandboxConfig
        ? await resolveSandboxedBridgeMediaPath({
            sandbox: params.sandboxConfig,
            mediaPath: resolvedInput,
            inboundFallbackDir: "media/inbound",
          })
        : {
            resolved: resolvedInput.startsWith("file://")
              ? resolvedInput.slice("file://".length)
              : resolvedInput,
          };
    const resolvedPath = isDataUrl ? null : resolvedPathInfo.resolved;
    const localRoots = resolveMediaToolLocalRoots(
      params.workspaceDir,
      {
        workspaceOnly: params.sandboxConfig?.workspaceOnly === true,
      },
      resolvedPath ? [resolvedPath] : undefined,
    );
    // Load the bytes: inline data: URLs (images only), sandbox-bridge reads,
    // or host reads restricted to the computed local roots.
    const media = isDataUrl
      ? params.expectedKind === "image"
        ? decodeDataUrl(resolvedInput)
        : (() => {
            throw new ToolInputError("Video data: URLs are not supported for video_generate.");
          })()
      : params.sandboxConfig
        ? await loadWebMedia(resolvedPath ?? resolvedInput, {
            maxBytes: params.maxBytes,
            sandboxValidated: true,
            readFile: createSandboxBridgeReadFile({ sandbox: params.sandboxConfig }),
          })
        : await loadWebMedia(resolvedPath ?? resolvedInput, {
            maxBytes: params.maxBytes,
            localRoots,
          });
    if (media.kind !== params.expectedKind) {
      throw new ToolInputError(`Unsupported media type: ${media.kind ?? "unknown"}`);
    }
    // The two loaders expose slightly different shapes; normalize here.
    const mimeType = "mimeType" in media ? media.mimeType : media.contentType;
    const fileName = "fileName" in media ? media.fileName : undefined;
    loaded.push({
      sourceAsset: {
        buffer: media.buffer,
        mimeType,
        fileName,
      },
      resolvedInput,
      ...(resolvedPathInfo.rewrittenFrom ? { rewrittenFrom: resolvedPathInfo.rewrittenFrom } : {}),
    });
  }

  return loaded;
}
|
||||
|
||||
/**
 * Build the `video_generate` agent tool, or return null when no video model
 * is configured and no auth-backed provider default can be resolved.
 *
 * Actions: "list" describes registered providers/capabilities/auth hints;
 * "generate" (default, requires `prompt`) calls the provider, persists every
 * returned video via saveMediaBuffer, and emits `MEDIA:<path>` lines so the
 * gateway attaches the files automatically.
 */
export function createVideoGenerateTool(options?: {
  config?: OpenClawConfig;
  agentDir?: string;
  workspaceDir?: string;
  sandbox?: VideoGenerateSandboxConfig;
  fsPolicy?: ToolFsPolicy;
}): AnyAgentTool | null {
  const cfg: OpenClawConfig = options?.config ?? loadConfig();
  const videoGenerationModelConfig = resolveVideoGenerationModelConfigForTool({
    cfg,
    agentDir: options?.agentDir,
  });
  // No usable model config: the tool does not register at all.
  if (!videoGenerationModelConfig) {
    return null;
  }

  const sandboxConfig = options?.sandbox
    ? {
        root: options.sandbox.root,
        bridge: options.sandbox.bridge,
        workspaceOnly: options.fsPolicy?.workspaceOnly === true,
      }
    : null;

  return {
    label: "Video Generation",
    name: "video_generate",
    displaySummary: "Generate videos",
    description:
      "Generate videos using configured providers. Generated videos are saved under OpenClaw-managed media storage and delivered automatically as attachments.",
    parameters: VideoGenerateToolSchema,
    execute: async (_toolCallId, rawArgs) => {
      const args = rawArgs as Record<string, unknown>;
      const action = resolveAction(args);
      // Fold the resolved model config back into the config used downstream.
      const effectiveCfg =
        applyVideoGenerationModelConfigDefaults(cfg, videoGenerationModelConfig) ?? cfg;

      if (action === "list") {
        const providers = listRuntimeVideoGenerationProviders({ config: effectiveCfg });
        if (providers.length === 0) {
          return {
            content: [{ type: "text", text: "No video-generation providers are registered." }],
            details: { providers: [] },
          };
        }
        // One human-readable summary line per provider; falsy capability
        // values (0/false/undefined) are simply omitted.
        const lines = providers.map((provider) => {
          const authHints = getVideoGenerationProviderAuthEnvVars(provider.id);
          const capabilities = [
            provider.capabilities.maxVideos ? `maxVideos=${provider.capabilities.maxVideos}` : null,
            provider.capabilities.maxInputImages
              ? `maxInputImages=${provider.capabilities.maxInputImages}`
              : null,
            provider.capabilities.maxInputVideos
              ? `maxInputVideos=${provider.capabilities.maxInputVideos}`
              : null,
            provider.capabilities.maxDurationSeconds
              ? `maxDurationSeconds=${provider.capabilities.maxDurationSeconds}`
              : null,
            provider.capabilities.supportsResolution ? "resolution" : null,
            provider.capabilities.supportsAspectRatio ? "aspectRatio" : null,
            provider.capabilities.supportsSize ? "size" : null,
            provider.capabilities.supportsAudio ? "audio" : null,
            provider.capabilities.supportsWatermark ? "watermark" : null,
          ]
            .filter((entry): entry is string => Boolean(entry))
            .join(", ");
          return [
            `${provider.id}: default=${provider.defaultModel ?? "none"}`,
            provider.models?.length ? `models=${provider.models.join(", ")}` : null,
            capabilities ? `capabilities=${capabilities}` : null,
            authHints.length > 0 ? `auth=${authHints.join(" / ")}` : null,
          ]
            .filter((entry): entry is string => Boolean(entry))
            .join(" | ");
        });
        return {
          content: [{ type: "text", text: lines.join("\n") }],
          details: {
            providers: providers.map((provider) => ({
              id: provider.id,
              defaultModel: provider.defaultModel,
              models: provider.models ?? [],
              authEnvVars: getVideoGenerationProviderAuthEnvVars(provider.id),
              capabilities: provider.capabilities,
            })),
          },
        };
      }

      // --- "generate" path: parse and validate the inputs. ---
      const prompt = readStringParam(args, "prompt", { required: true });
      const model = readStringParam(args, "model");
      const filename = readStringParam(args, "filename");
      const size = readStringParam(args, "size");
      const aspectRatio = normalizeAspectRatio(readStringParam(args, "aspectRatio"));
      const resolution = normalizeResolution(readStringParam(args, "resolution"));
      const durationSeconds = readNumberParam(args, "durationSeconds", {
        integer: true,
        strict: true,
      });
      const audio = readBooleanParam(args, "audio");
      const watermark = readBooleanParam(args, "watermark");
      const imageInputs = normalizeReferenceInputs({
        args,
        singularKey: "image",
        pluralKey: "images",
        maxCount: MAX_INPUT_IMAGES,
      });
      const videoInputs = normalizeReferenceInputs({
        args,
        singularKey: "video",
        pluralKey: "videos",
        maxCount: MAX_INPUT_VIDEOS,
      });

      // Resolve the target provider (may be undefined; capability validation
      // is then skipped and the runtime enforces its own limits).
      const selectedProvider = resolveSelectedVideoGenerationProvider({
        config: effectiveCfg,
        videoGenerationModelConfig,
        modelOverride: model,
      });
      const loadedReferenceImages = await loadReferenceAssets({
        inputs: imageInputs,
        expectedKind: "image",
        workspaceDir: options?.workspaceDir,
        sandboxConfig,
      });
      const loadedReferenceVideos = await loadReferenceAssets({
        inputs: videoInputs,
        expectedKind: "video",
        workspaceDir: options?.workspaceDir,
        sandboxConfig,
      });
      validateVideoGenerationCapabilities({
        provider: selectedProvider,
        inputImageCount: loadedReferenceImages.length,
        inputVideoCount: loadedReferenceVideos.length,
        size,
        aspectRatio,
        resolution,
        durationSeconds,
        audio,
        watermark,
      });

      const result = await generateVideo({
        cfg: effectiveCfg,
        prompt,
        agentDir: options?.agentDir,
        modelOverride: model,
        size,
        aspectRatio,
        resolution,
        durationSeconds,
        audio,
        watermark,
        inputImages: loadedReferenceImages.map((entry) => entry.sourceAsset),
        inputVideos: loadedReferenceVideos.map((entry) => entry.sourceAsset),
      });
      // Persist every returned video under managed media storage. The
      // user-supplied filename hint (if any) is applied to each saved video.
      const savedVideos = await Promise.all(
        result.videos.map((video) =>
          saveMediaBuffer(
            video.buffer,
            video.mimeType,
            "tool-video-generation",
            undefined,
            filename || video.fileName,
          ),
        ),
      );
      // MEDIA: lines trigger automatic attachment delivery downstream.
      const lines = [
        `Generated ${savedVideos.length} video${savedVideos.length === 1 ? "" : "s"} with ${result.provider}/${result.model}.`,
        ...savedVideos.map((video) => `MEDIA:${video.path}`),
      ];

      return {
        content: [{ type: "text", text: lines.join("\n") }],
        details: {
          provider: result.provider,
          model: result.model,
          count: savedVideos.length,
          media: {
            mediaUrls: savedVideos.map((video) => video.path),
          },
          paths: savedVideos.map((video) => video.path),
          // Singular keys for one reference asset, plural arrays for several.
          ...(loadedReferenceImages.length === 1
            ? {
                image: loadedReferenceImages[0]?.resolvedInput,
                ...(loadedReferenceImages[0]?.rewrittenFrom
                  ? { rewrittenFrom: loadedReferenceImages[0].rewrittenFrom }
                  : {}),
              }
            : loadedReferenceImages.length > 1
              ? {
                  images: loadedReferenceImages.map((entry) => ({
                    image: entry.resolvedInput,
                    ...(entry.rewrittenFrom ? { rewrittenFrom: entry.rewrittenFrom } : {}),
                  })),
                }
              : {}),
          ...(loadedReferenceVideos.length === 1
            ? {
                video: loadedReferenceVideos[0]?.resolvedInput,
                ...(loadedReferenceVideos[0]?.rewrittenFrom
                  ? { videoRewrittenFrom: loadedReferenceVideos[0].rewrittenFrom }
                  : {}),
              }
            : loadedReferenceVideos.length > 1
              ? {
                  videos: loadedReferenceVideos.map((entry) => ({
                    video: entry.resolvedInput,
                    ...(entry.rewrittenFrom ? { rewrittenFrom: entry.rewrittenFrom } : {}),
                  })),
                }
              : {}),
          // Echo only the options the caller actually supplied.
          ...(size ? { size } : {}),
          ...(aspectRatio ? { aspectRatio } : {}),
          ...(resolution ? { resolution } : {}),
          ...(typeof durationSeconds === "number" ? { durationSeconds } : {}),
          ...(typeof audio === "boolean" ? { audio } : {}),
          ...(typeof watermark === "boolean" ? { watermark } : {}),
          ...(filename ? { filename } : {}),
          attempts: result.attempts,
          metadata: result.metadata,
        },
      };
    },
  };
}
|
||||
@@ -242,6 +242,37 @@ describe("config cli", () => {
|
||||
expect(written.gateway?.auth).toEqual({ mode: "token" });
|
||||
});
|
||||
|
||||
it("writes agents.defaults.videoGenerationModel.primary without disturbing sibling defaults", async () => {
  // Seed a snapshot with existing sibling defaults that must survive the set.
  const resolved: OpenClawConfig = {
    agents: {
      defaults: {
        model: "openai/gpt-5.4",
        imageGenerationModel: {
          primary: "openai/gpt-image-1",
        },
      },
    },
  };
  setSnapshot(resolved, resolved);

  await runConfigCommand([
    "config",
    "set",
    "agents.defaults.videoGenerationModel.primary",
    "qwen/wan2.6-t2v",
  ]);

  // One write: siblings untouched, new nested key created.
  expect(mockWriteConfigFile).toHaveBeenCalledTimes(1);
  const written = mockWriteConfigFile.mock.calls[0]?.[0];
  expect(written.agents?.defaults?.model).toBe("openai/gpt-5.4");
  expect(written.agents?.defaults?.imageGenerationModel).toEqual({
    primary: "openai/gpt-image-1",
  });
  expect(written.agents?.defaults?.videoGenerationModel).toEqual({
    primary: "qwen/wan2.6-t2v",
  });
});
|
||||
|
||||
it("drops gateway.auth.password when switching mode to token", async () => {
|
||||
const resolved: OpenClawConfig = {
|
||||
gateway: {
|
||||
|
||||
@@ -40,4 +40,25 @@ describe("generated base config schema", () => {
|
||||
expect(hooksInternalProperties?.handlers).toBeUndefined();
|
||||
expect(uiHints["hooks.internal.handlers"]).toBeUndefined();
|
||||
});
|
||||
|
||||
it("includes videoGenerationModel in the public schema payload", () => {
  // Drill into the generated JSON schema: agents -> defaults -> properties.
  const agentDefaultsProperties = (
    GENERATED_BASE_CONFIG_SCHEMA.schema as {
      properties?: {
        agents?: {
          properties?: {
            defaults?: {
              properties?: Record<string, unknown>;
            };
          };
        };
      };
    }
  ).properties?.agents?.properties?.defaults?.properties;
  const uiHints = GENERATED_BASE_CONFIG_SCHEMA.uiHints as Record<string, unknown>;

  // Both the schema property and its UI hints must be published.
  expect(agentDefaultsProperties?.videoGenerationModel).toBeDefined();
  expect(uiHints["agents.defaults.videoGenerationModel.primary"]).toBeDefined();
  expect(uiHints["agents.defaults.videoGenerationModel.fallbacks"]).toBeDefined();
});
|
||||
});
|
||||
|
||||
@@ -11,4 +11,15 @@ describe("agent defaults schema", () => {
|
||||
}),
|
||||
).not.toThrow();
|
||||
});
|
||||
|
||||
it("accepts videoGenerationModel", () => {
  // Strict defaults validation must allow primary + fallbacks refs.
  expect(() =>
    AgentDefaultsSchema.parse({
      videoGenerationModel: {
        primary: "qwen/wan2.6-t2v",
        fallbacks: ["minimax/video-01"],
      },
    }),
  ).not.toThrow();
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user