diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e2d265d708..4ad365feb08 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -158,6 +158,7 @@ Docs: https://docs.openclaw.ai - QA/lab: add character-vibes evaluation reports with model selection and parallel runs so live QA can compare candidate behavior faster. - Plugins/provider-auth: let provider manifests declare `providerAuthAliases` so provider variants can share env vars, auth profiles, config-backed auth, and API-key onboarding choices without core-specific wiring. - iOS: pin release versioning to an explicit CalVer in `apps/ios/version.json`, keep TestFlight iteration on the same short version until maintainers intentionally promote the next gateway version, and add the documented `pnpm ios:version:pin -- --from-gateway` workflow for release trains. (#63001) Thanks @ngutman. +- Tools/video_generate: extend the tool and the Plugin SDK with `providerOptions` (vendor-specific options forwarded as a JSON object), `inputAudios` / `audioRef` / `audioRefs` reference audio inputs, per-asset semantic role hints (`imageRoles` / `videoRoles` / `audioRoles`) using a typed `VideoGenerationAssetRole` union, a new `"adaptive"` aspect-ratio sentinel, and `maxInputAudios` provider capability declarations. Providers opt into `providerOptions` by declaring a typed `capabilities.providerOptions` schema (`{ seed: "number", draft: "boolean", ... }`); unknown keys and type mismatches cause the runtime fallback loop to skip the candidate with a visible warning and an `attempts` entry, so vendor-specific options never silently reach the wrong provider. Also raises the in-tool image input cap to 9 and updates the docs table to list all new parameters. (#61987) Thanks @xieyongliang. 
### Fixes diff --git a/docs/.generated/plugin-sdk-api-baseline.sha256 b/docs/.generated/plugin-sdk-api-baseline.sha256 index 21a9beb741f..907eec06192 100644 --- a/docs/.generated/plugin-sdk-api-baseline.sha256 +++ b/docs/.generated/plugin-sdk-api-baseline.sha256 @@ -1,2 +1,2 @@ -ee16273fa5ad8c5408e9dad8d96fde86dfa666ef8eb44840b78135814ff97173 plugin-sdk-api-baseline.json -2bd0d5edf23e6a889d6bedb74d0d06411dd7750dac6ebf24971c789f8a69253a plugin-sdk-api-baseline.jsonl +7a9bb7a5e4b243e2123af94301ba363d57eddab2baa6378d16cd37a1cb8a55f7 plugin-sdk-api-baseline.json +2bdca027d5fda72399479569927cd34d18b56b242e4b12ac45e7c2352e551c77 plugin-sdk-api-baseline.jsonl diff --git a/docs/tools/video-generation.md b/docs/tools/video-generation.md index ee77c800f73..7de3bc29082 100644 --- a/docs/tools/video-generation.md +++ b/docs/tools/video-generation.md @@ -1,5 +1,5 @@ --- -summary: "Generate videos from text, images, or existing videos using 12 provider backends" +summary: "Generate videos from text, images, or existing videos using 14 provider backends" read_when: - Generating videos via the agent - Configuring video generation providers and models @@ -9,7 +9,7 @@ title: "Video Generation" # Video Generation -OpenClaw agents can generate videos from text prompts, reference images, or existing videos. Twelve provider backends are supported, each with different model options, input modes, and feature sets. The agent picks the right provider automatically based on your configuration and available API keys. +OpenClaw agents can generate videos from text prompts, reference images, or existing videos. Fourteen provider backends are supported, each with different model options, input modes, and feature sets. The agent picks the right provider automatically based on your configuration and available API keys. The `video_generate` tool only appears when at least one video-generation provider is available. 
If you do not see it in your agent tools, set a provider API key or configure `agents.defaults.videoGenerationModel`. @@ -78,20 +78,22 @@ Duplicate prevention: if a video task is already `queued` or `running` for the c ## Supported providers -| Provider | Default model | Text | Image ref | Video ref | API key | -| -------- | ------------------------------- | ---- | ----------------- | ---------------- | ---------------------------------------- | -| Alibaba | `wan2.6-t2v` | Yes | Yes (remote URL) | Yes (remote URL) | `MODELSTUDIO_API_KEY` | -| BytePlus | `seedance-1-0-lite-t2v-250428` | Yes | 1 image | No | `BYTEPLUS_API_KEY` | -| ComfyUI | `workflow` | Yes | 1 image | No | `COMFY_API_KEY` or `COMFY_CLOUD_API_KEY` | -| fal | `fal-ai/minimax/video-01-live` | Yes | 1 image | No | `FAL_KEY` | -| Google | `veo-3.1-fast-generate-preview` | Yes | 1 image | 1 video | `GEMINI_API_KEY` | -| MiniMax | `MiniMax-Hailuo-2.3` | Yes | 1 image | No | `MINIMAX_API_KEY` | -| OpenAI | `sora-2` | Yes | 1 image | 1 video | `OPENAI_API_KEY` | -| Qwen | `wan2.6-t2v` | Yes | Yes (remote URL) | Yes (remote URL) | `QWEN_API_KEY` | -| Runway | `gen4.5` | Yes | 1 image | 1 video | `RUNWAYML_API_SECRET` | -| Together | `Wan-AI/Wan2.2-T2V-A14B` | Yes | 1 image | No | `TOGETHER_API_KEY` | -| Vydra | `veo3` | Yes | 1 image (`kling`) | No | `VYDRA_API_KEY` | -| xAI | `grok-imagine-video` | Yes | 1 image | 1 video | `XAI_API_KEY` | +| Provider | Default model | Text | Image ref | Video ref | API key | +| --------------------- | ------------------------------- | ---- | ---------------------------------------------------- | ---------------- | ---------------------------------------- | +| Alibaba | `wan2.6-t2v` | Yes | Yes (remote URL) | Yes (remote URL) | `MODELSTUDIO_API_KEY` | +| BytePlus (1.0) | `seedance-1-0-pro-250528` | Yes | Up to 2 images (I2V models only; first + last frame) | No | `BYTEPLUS_API_KEY` | +| BytePlus Seedance 1.5 | `seedance-1-5-pro-251215` | Yes | Up to 2 images (first + last 
frame via role) | No | `BYTEPLUS_API_KEY` | +| BytePlus Seedance 2.0 | `dreamina-seedance-2-0-260128` | Yes | Up to 9 reference images | Up to 3 videos | `BYTEPLUS_API_KEY` | +| ComfyUI | `workflow` | Yes | 1 image | No | `COMFY_API_KEY` or `COMFY_CLOUD_API_KEY` | +| fal | `fal-ai/minimax/video-01-live` | Yes | 1 image | No | `FAL_KEY` | +| Google | `veo-3.1-fast-generate-preview` | Yes | 1 image | 1 video | `GEMINI_API_KEY` | +| MiniMax | `MiniMax-Hailuo-2.3` | Yes | 1 image | No | `MINIMAX_API_KEY` | +| OpenAI | `sora-2` | Yes | 1 image | 1 video | `OPENAI_API_KEY` | +| Qwen | `wan2.6-t2v` | Yes | Yes (remote URL) | Yes (remote URL) | `QWEN_API_KEY` | +| Runway | `gen4.5` | Yes | 1 image | 1 video | `RUNWAYML_API_SECRET` | +| Together | `Wan-AI/Wan2.2-T2V-A14B` | Yes | 1 image | No | `TOGETHER_API_KEY` | +| Vydra | `veo3` | Yes | 1 image (`kling`) | No | `VYDRA_API_KEY` | +| xAI | `grok-imagine-video` | Yes | 1 image | 1 video | `XAI_API_KEY` | Some providers accept additional or alternate API key env vars. See individual [provider pages](#related) for details. @@ -128,31 +130,49 @@ and the shared live sweep. ### Content inputs -| Parameter | Type | Description | -| --------- | -------- | ------------------------------------ | -| `image` | string | Single reference image (path or URL) | -| `images` | string[] | Multiple reference images (up to 5) | -| `video` | string | Single reference video (path or URL) | -| `videos` | string[] | Multiple reference videos (up to 4) | +| Parameter | Type | Description | +| ------------ | -------- | -------------------------------------------------------------------------------------------------------------------------------------- | +| `image` | string | Single reference image (path or URL) | +| `images` | string[] | Multiple reference images (up to 9) | +| `imageRoles` | string[] | Optional per-position role hints parallel to the combined image list. 
Canonical values: `first_frame`, `last_frame`, `reference_image` | +| `video` | string | Single reference video (path or URL) | +| `videos` | string[] | Multiple reference videos (up to 4) | +| `videoRoles` | string[] | Optional per-position role hints parallel to the combined video list. Canonical value: `reference_video` | +| `audioRef` | string | Single reference audio (path or URL). Used for e.g. background music or voice reference when the provider supports audio inputs | +| `audioRefs` | string[] | Multiple reference audios (up to 3) | +| `audioRoles` | string[] | Optional per-position role hints parallel to the combined audio list. Canonical value: `reference_audio` | + +Role hints are forwarded to the provider as-is. Canonical values come from +the `VideoGenerationAssetRole` union but providers may accept additional +role strings. `*Roles` arrays must not have more entries than the +corresponding reference list; off-by-one mistakes fail with a clear error. +Use an empty string to leave a slot unset. 
### Style controls -| Parameter | Type | Description | -| ----------------- | ------- | ------------------------------------------------------------------------ | -| `aspectRatio` | string | `1:1`, `2:3`, `3:2`, `3:4`, `4:3`, `4:5`, `5:4`, `9:16`, `16:9`, `21:9` | -| `resolution` | string | `480P`, `720P`, `768P`, or `1080P` | -| `durationSeconds` | number | Target duration in seconds (rounded to nearest provider-supported value) | -| `size` | string | Size hint when the provider supports it | -| `audio` | boolean | Enable generated audio when supported | -| `watermark` | boolean | Toggle provider watermarking when supported | +| Parameter | Type | Description | +| ----------------- | ------- | --------------------------------------------------------------------------------------- | +| `aspectRatio` | string | `1:1`, `2:3`, `3:2`, `3:4`, `4:3`, `4:5`, `5:4`, `9:16`, `16:9`, `21:9`, or `adaptive` | +| `resolution` | string | `480P`, `720P`, `768P`, or `1080P` | +| `durationSeconds` | number | Target duration in seconds (rounded to nearest provider-supported value) | +| `size` | string | Size hint when the provider supports it | +| `audio` | boolean | Enable generated audio in the output when supported. Distinct from `audioRef*` (inputs) | +| `watermark` | boolean | Toggle provider watermarking when supported | + +`adaptive` is a provider-specific sentinel: it is forwarded as-is to +providers that declare `adaptive` in their capabilities (e.g. BytePlus +Seedance uses it to auto-detect the ratio from the input image +dimensions). Providers that do not declare it surface the value via +`details.ignoredOverrides` in the tool result so the drop is visible. ### Advanced -| Parameter | Type | Description | -| ---------- | ------ | ----------------------------------------------- | -| `action` | string | `"generate"` (default), `"status"`, or `"list"` | -| `model` | string | Provider/model override (e.g. 
`runway/gen4.5`) | -| `filename` | string | Output filename hint | +| Parameter | Type | Description | +| ----------------- | ------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `action` | string | `"generate"` (default), `"status"`, or `"list"` | +| `model` | string | Provider/model override (e.g. `runway/gen4.5`) | +| `filename` | string | Output filename hint | +| `providerOptions` | object | Provider-specific options as a JSON object (e.g. `{"seed": 42, "draft": true}`). Providers that declare a typed schema validate the keys and types; unknown keys or mismatches skip the candidate during fallback. Providers without a declared schema receive the options as-is. Run `video_generate action=list` to see what each provider accepts | Not all providers support all parameters. OpenClaw already normalizes duration to the closest provider-supported value, and it also remaps translated geometry hints such as size-to-aspect-ratio when a fallback provider exposes a different control surface. Truly unsupported overrides are ignored on a best-effort basis and reported as warnings in the tool result. Hard capability limits (such as too many reference inputs) fail before submission. @@ -163,10 +183,37 @@ Reference inputs also select the runtime mode: - No reference media: `generate` - Any image reference: `imageToVideo` - Any video reference: `videoToVideo` +- Reference audio inputs do not change the resolved mode; they apply on top of whatever mode the image/video references select, and only work with providers that declare `maxInputAudios` Mixed image and video references are not a stable shared capability surface. Prefer one reference type per request. 
+#### Fallback and typed options + +Some capability checks are applied at the fallback layer rather than the +tool boundary so that a request that exceeds the primary provider's limits +can still run on a capable fallback: + +- If the active candidate declares no `maxInputAudios` (or declares it as + `0`), it is skipped when the request contains audio references, and the + next candidate is tried. +- If the active candidate's `maxDurationSeconds` is below the requested + `durationSeconds` and the candidate does not declare a + `supportedDurationSeconds` list, it is skipped. +- If the request contains `providerOptions` and the active candidate + explicitly declares a typed `providerOptions` schema, the candidate is + skipped when the supplied keys are not in the schema or the value types do + not match. Providers that have not yet declared a schema receive the + options as-is (backward-compatible pass-through). A provider can + explicitly opt out of all provider options by declaring an empty schema + (`capabilities.providerOptions: {}`), which causes the same skip as a + type mismatch. + +The first skip reason in a request is logged at `warn` so operators see +when their primary provider was passed over; subsequent skips log at +`debug` to keep long fallback chains quiet. If every candidate is skipped, +the aggregated error includes the skip reason for each. + ## Actions - **generate** (default) -- create a video from the given prompt and optional reference inputs. @@ -201,50 +248,24 @@ entries. 
} ``` -HeyGen video-agent on fal can be pinned with: - -```json5 -{ - agents: { - defaults: { - videoGenerationModel: { - primary: "fal/fal-ai/heygen/v2/video-agent", - }, - }, - }, -} -``` - -Seedance 2.0 on fal can be pinned with: - -```json5 -{ - agents: { - defaults: { - videoGenerationModel: { - primary: "fal/bytedance/seedance-2.0/fast/text-to-video", - }, - }, - }, -} -``` - ## Provider notes -| Provider | Notes | -| -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Alibaba | Uses DashScope/Model Studio async endpoint. Reference images and videos must be remote `http(s)` URLs. | -| BytePlus | Single image reference only. | -| ComfyUI | Workflow-driven local or cloud execution. Supports text-to-video and image-to-video through the configured graph. | -| fal | Uses queue-backed flow for long-running jobs. Single image reference only. Includes HeyGen video-agent and Seedance 2.0 text-to-video and image-to-video model refs. | -| Google | Uses Gemini/Veo. Supports one image or one video reference. | -| MiniMax | Single image reference only. | -| OpenAI | Only `size` override is forwarded. Other style overrides (`aspectRatio`, `resolution`, `audio`, `watermark`) are ignored with a warning. | -| Qwen | Same DashScope backend as Alibaba. Reference inputs must be remote `http(s)` URLs; local files are rejected upfront. | -| Runway | Supports local files via data URIs. Video-to-video requires `runway/gen4_aleph`. Text-only runs expose `16:9` and `9:16` aspect ratios. | -| Together | Single image reference only. | -| Vydra | Uses `https://www.vydra.ai/api/v1` directly to avoid auth-dropping redirects. `veo3` is bundled as text-to-video only; `kling` requires a remote image URL. | -| xAI | Supports text-to-video, image-to-video, and remote video edit/extend flows. 
| +| Provider | Notes | +| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| Alibaba | Uses DashScope/Model Studio async endpoint. Reference images and videos must be remote `http(s)` URLs. | +| BytePlus (1.0) | Provider id `byteplus`. Models: `seedance-1-0-pro-250528` (default), `seedance-1-0-pro-t2v-250528`, `seedance-1-0-pro-fast-251015`, `seedance-1-0-lite-t2v-250428`, `seedance-1-0-lite-i2v-250428`. T2V models (`*-t2v-*`) do not accept image inputs; I2V models and general `*-pro-*` models support a single reference image (first frame). Pass the image positionally or set `role: "first_frame"`. T2V model IDs are automatically switched to the corresponding I2V variant when an image is provided. Supported `providerOptions` keys: `seed` (number), `draft` (boolean, forces 480p), `camera_fixed` (boolean). | +| BytePlus Seedance 1.5 | Requires the [`@openclaw/byteplus-modelark`](https://www.npmjs.com/package/@openclaw/byteplus-modelark) plugin. Provider id `byteplus-seedance15`. Model: `seedance-1-5-pro-251215`. Uses the unified `content[]` API. Supports at most 2 input images (first_frame + last_frame). All inputs must be remote `https://` URLs. Set `role: "first_frame"` / `"last_frame"` on each image, or pass images positionally. `aspectRatio: "adaptive"` auto-detects ratio from the input image. 
`audio: true` maps to `generate_audio`. `providerOptions.seed` (number) is forwarded. | +| BytePlus Seedance 2.0 | Requires the [`@openclaw/byteplus-modelark`](https://www.npmjs.com/package/@openclaw/byteplus-modelark) plugin. Provider id `byteplus-seedance2`. Models: `dreamina-seedance-2-0-260128`, `dreamina-seedance-2-0-fast-260128`. Uses the unified `content[]` API. Supports up to 9 reference images, 3 reference videos, and 3 reference audios. All inputs must be remote `https://` URLs. Set `role` on each asset — supported values: `"first_frame"`, `"last_frame"`, `"reference_image"`, `"reference_video"`, `"reference_audio"`. `aspectRatio: "adaptive"` auto-detects ratio from the input image. `audio: true` maps to `generate_audio`. `providerOptions.seed` (number) is forwarded. | +| ComfyUI | Workflow-driven local or cloud execution. Supports text-to-video and image-to-video through the configured graph. | +| fal | Uses queue-backed flow for long-running jobs. Single image reference only. | +| Google | Uses Gemini/Veo. Supports one image or one video reference. | +| MiniMax | Single image reference only. | +| OpenAI | Only `size` override is forwarded. Other style overrides (`aspectRatio`, `resolution`, `audio`, `watermark`) are ignored with a warning. | +| Qwen | Same DashScope backend as Alibaba. Reference inputs must be remote `http(s)` URLs; local files are rejected upfront. | +| Runway | Supports local files via data URIs. Video-to-video requires `runway/gen4_aleph`. Text-only runs expose `16:9` and `9:16` aspect ratios. | +| Together | Single image reference only. | +| Vydra | Uses `https://www.vydra.ai/api/v1` directly to avoid auth-dropping redirects. `veo3` is bundled as text-to-video only; `kling` requires a remote image URL. | +| xAI | Supports text-to-video, image-to-video, and remote video edit/extend flows. 
| ## Provider capability modes diff --git a/extensions/byteplus/video-generation-provider.test.ts b/extensions/byteplus/video-generation-provider.test.ts index 975668374e3..998b43b9dbe 100644 --- a/extensions/byteplus/video-generation-provider.test.ts +++ b/extensions/byteplus/video-generation-provider.test.ts @@ -14,31 +14,35 @@ beforeAll(async () => { installProviderHttpMockCleanup(); +function mockSuccessfulBytePlusTask(params?: { model?: string }) { + postJsonRequestMock.mockResolvedValue({ + response: { + json: async () => ({ + id: "task_123", + }), + }, + release: vi.fn(async () => {}), + }); + fetchWithTimeoutMock + .mockResolvedValueOnce({ + json: async () => ({ + id: "task_123", + status: "succeeded", + content: { + video_url: "https://example.com/byteplus.mp4", + }, + model: params?.model ?? "seedance-1-0-lite-t2v-250428", + }), + }) + .mockResolvedValueOnce({ + headers: new Headers({ "content-type": "video/mp4" }), + arrayBuffer: async () => Buffer.from("mp4-bytes"), + }); +} + describe("byteplus video generation provider", () => { it("creates a content-generation task, polls, and downloads the video", async () => { - postJsonRequestMock.mockResolvedValue({ - response: { - json: async () => ({ - id: "task_123", - }), - }, - release: vi.fn(async () => {}), - }); - fetchWithTimeoutMock - .mockResolvedValueOnce({ - json: async () => ({ - id: "task_123", - status: "succeeded", - content: { - video_url: "https://example.com/byteplus.mp4", - }, - model: "seedance-1-0-lite-t2v-250428", - }), - }) - .mockResolvedValueOnce({ - headers: new Headers({ "content-type": "video/mp4" }), - arrayBuffer: async () => Buffer.from("mp4-bytes"), - }); + mockSuccessfulBytePlusTask(); const provider = buildBytePlusVideoGenerationProvider(); const result = await provider.generateVideo({ @@ -60,4 +64,57 @@ describe("byteplus video generation provider", () => { }), ); }); + + it("switches t2v image requests to i2v models and lowercases resolution", async () => { + 
mockSuccessfulBytePlusTask({ model: "seedance-1-0-lite-i2v-250428" }); + + const provider = buildBytePlusVideoGenerationProvider(); + await provider.generateVideo({ + provider: "byteplus", + model: "seedance-1-0-lite-t2v-250428", + prompt: "Animate this still image", + resolution: "720P", + inputImages: [{ url: "https://example.com/first-frame.png" }], + cfg: {}, + }); + + const request = postJsonRequestMock.mock.calls[0]?.[0] as { body?: Record }; + expect(request.body).toMatchObject({ + model: "seedance-1-0-lite-i2v-250428", + resolution: "720p", + content: [ + { type: "text", text: "Animate this still image" }, + { + type: "image_url", + image_url: { url: "https://example.com/first-frame.png" }, + role: "first_frame", + }, + ], + }); + }); + + it("maps declared providerOptions into the request body", async () => { + mockSuccessfulBytePlusTask({ model: "seedance-1-0-pro-250528" }); + + const provider = buildBytePlusVideoGenerationProvider(); + await provider.generateVideo({ + provider: "byteplus", + model: "seedance-1-0-pro-250528", + prompt: "A cinematic lobster montage", + providerOptions: { + seed: 42, + draft: true, + camera_fixed: false, + }, + cfg: {}, + }); + + const request = postJsonRequestMock.mock.calls[0]?.[0] as { body?: Record }; + expect(request.body).toMatchObject({ + model: "seedance-1-0-pro-250528", + seed: 42, + resolution: "480p", + camera_fixed: false, + }); + }); }); diff --git a/extensions/byteplus/video-generation-provider.ts b/extensions/byteplus/video-generation-provider.ts index 90ebdce7704..10e1de91432 100644 --- a/extensions/byteplus/video-generation-provider.ts +++ b/extensions/byteplus/video-generation-provider.ts @@ -141,6 +141,11 @@ export function buildBytePlusVideoGenerationProvider(): VideoGenerationProvider agentDir, }), capabilities: { + providerOptions: { + seed: "number", + draft: "boolean", + camera_fixed: "boolean", + }, generate: { maxVideos: 1, maxDurationSeconds: 12, @@ -191,6 +196,17 @@ export function 
buildBytePlusVideoGenerationProvider(): VideoGenerationProvider capability: "video", transport: "http", }); + // Seedance 1.0 has separate T2V and I2V model IDs (e.g. seedance-1-0-lite-t2v-250428 vs + // seedance-1-0-lite-i2v-250428). When input images are provided with a T2V model, auto- + // switch to the corresponding I2V variant so the API does not reject with task_type mismatch. + // 1.5 Pro uses a single model ID for both modes and is unaffected by this substitution. + const hasInputImages = (req.inputImages?.length ?? 0) > 0; + const requestedModel = normalizeOptionalString(req.model) || DEFAULT_BYTEPLUS_VIDEO_MODEL; + const resolvedModel = + hasInputImages && requestedModel.includes("-t2v-") + ? requestedModel.replace("-t2v-", "-i2v-") + : requestedModel; + const content: Array> = [{ type: "text", text: req.prompt }]; const imageUrl = resolveBytePlusImageUrl(req); if (imageUrl) { @@ -201,15 +217,18 @@ export function buildBytePlusVideoGenerationProvider(): VideoGenerationProvider }); } const body: Record = { - model: normalizeOptionalString(req.model) || DEFAULT_BYTEPLUS_VIDEO_MODEL, + model: resolvedModel, content, }; const aspectRatio = normalizeOptionalString(req.aspectRatio); if (aspectRatio) { body.ratio = aspectRatio; } - if (req.resolution) { - body.resolution = req.resolution; + // Seedance API requires lowercase resolution values (e.g. "480p", "720p"); uppercase + // variants like "480P" are rejected with InvalidParameter. + const resolution = normalizeOptionalString(req.resolution)?.toLowerCase(); + if (resolution) { + body.resolution = resolution; } if (typeof req.durationSeconds === "number" && Number.isFinite(req.durationSeconds)) { body.duration = Math.max(1, Math.round(req.durationSeconds)); @@ -221,6 +240,23 @@ export function buildBytePlusVideoGenerationProvider(): VideoGenerationProvider body.watermark = req.watermark; } + // Forward declared providerOptions: seed, draft, camerafixed. 
+ // draft=true forces 480p resolution for faster generation. + const opts = req.providerOptions ?? {}; + const seed = typeof opts.seed === "number" ? opts.seed : undefined; + const draft = opts.draft === true; + // Official JSON body field is camera_fixed (with underscore). + const cameraFixed = typeof opts.camera_fixed === "boolean" ? opts.camera_fixed : undefined; + if (seed != null) { + body.seed = seed; + } + if (draft && !body.resolution) { + body.resolution = "480p"; + } + if (cameraFixed != null) { + body.camera_fixed = cameraFixed; + } + const { response, release } = await postJsonRequest({ url: `${baseUrl}/contents/generations/tasks`, headers, @@ -255,7 +291,7 @@ export function buildBytePlusVideoGenerationProvider(): VideoGenerationProvider }); return { videos: [video], - model: completed.model ?? req.model ?? DEFAULT_BYTEPLUS_VIDEO_MODEL, + model: completed.model ?? resolvedModel, metadata: { taskId, status: completed.status, diff --git a/src/agents/tools/video-generate-tool.actions.ts b/src/agents/tools/video-generate-tool.actions.ts index ec1796670ea..a5e2e8d7605 100644 --- a/src/agents/tools/video-generate-tool.actions.ts +++ b/src/agents/tools/video-generate-tool.actions.ts @@ -21,11 +21,36 @@ function summarizeVideoGenerationCapabilities( const generate = provider.capabilities.generate; const imageToVideo = provider.capabilities.imageToVideo; const videoToVideo = provider.capabilities.videoToVideo; + // providerOptions may be declared at the mode level (generate) or at the flat + // provider-capabilities level. The runtime checks both; surface the union so + // the agent sees a single merged view of which opaque keys each provider + // actually accepts. + const declaredProviderOptions: Record = {}; + for (const [key, type] of Object.entries(provider.capabilities.providerOptions ?? {})) { + declaredProviderOptions[key] = type; + } + for (const [key, type] of Object.entries(generate?.providerOptions ?? 
{})) { + declaredProviderOptions[key] = type; + } + for (const [key, type] of Object.entries(imageToVideo?.providerOptions ?? {})) { + declaredProviderOptions[key] = type; + } + for (const [key, type] of Object.entries(videoToVideo?.providerOptions ?? {})) { + declaredProviderOptions[key] = type; + } + const maxInputAudios = + generate?.maxInputAudios ?? + imageToVideo?.maxInputAudios ?? + videoToVideo?.maxInputAudios ?? + provider.capabilities.maxInputAudios; const capabilities = [ supportedModes.length > 0 ? `modes=${supportedModes.join("/")}` : null, generate?.maxVideos ? `maxVideos=${generate.maxVideos}` : null, imageToVideo?.maxInputImages ? `maxInputImages=${imageToVideo.maxInputImages}` : null, videoToVideo?.maxInputVideos ? `maxInputVideos=${videoToVideo.maxInputVideos}` : null, + typeof maxInputAudios === "number" && maxInputAudios > 0 + ? `maxInputAudios=${maxInputAudios}` + : null, generate?.maxDurationSeconds ? `maxDurationSeconds=${generate.maxDurationSeconds}` : null, generate?.supportedDurationSeconds?.length ? `supportedDurationSeconds=${generate.supportedDurationSeconds.join("/")}` @@ -41,6 +66,11 @@ function summarizeVideoGenerationCapabilities( generate?.supportsSize ? "size" : null, generate?.supportsAudio ? "audio" : null, generate?.supportsWatermark ? "watermark" : null, + Object.keys(declaredProviderOptions).length > 0 + ? 
`providerOptions={${Object.entries(declaredProviderOptions) + .map(([key, type]) => `${key}:${type}`) + .join(", ")}}` + : null, ] .filter((entry): entry is string => Boolean(entry)) .join(", "); diff --git a/src/agents/tools/video-generate-tool.test.ts b/src/agents/tools/video-generate-tool.test.ts index a71420dd032..931568a93a2 100644 --- a/src/agents/tools/video-generate-tool.test.ts +++ b/src/agents/tools/video-generate-tool.test.ts @@ -550,4 +550,359 @@ describe("createVideoGenerateTool", () => { expect(result.details).not.toHaveProperty("audio"); expect(result.details).not.toHaveProperty("watermark"); }); + + it("rejects providerOptions that is not a plain JSON object", async () => { + vi.spyOn(videoGenerationRuntime, "listRuntimeVideoGenerationProviders").mockReturnValue([ + { + id: "video-plugin", + defaultModel: "vid-v1", + models: ["vid-v1"], + capabilities: {}, + generateVideo: vi.fn(async () => ({ + videos: [{ buffer: Buffer.from("x"), mimeType: "video/mp4" }], + })), + }, + ]); + const generateSpy = vi.spyOn(videoGenerationRuntime, "generateVideo"); + + const tool = createVideoGenerateTool({ + config: asConfig({ + agents: { + defaults: { + videoGenerationModel: { primary: "video-plugin/vid-v1" }, + }, + }, + }), + }); + if (!tool) { + throw new Error("expected video_generate tool"); + } + + // Array-shaped providerOptions should be rejected up front, not cast to a + // Record with numeric-string keys and silently forwarded. + await expect( + tool.execute("call-1", { + prompt: "lobster", + providerOptions: ["seed", 42] as unknown as Record, + }), + ).rejects.toThrow( + "providerOptions must be a JSON object keyed by provider-specific option name.", + ); + // String providerOptions should also be rejected. 
+ await expect( + tool.execute("call-2", { + prompt: "lobster", + providerOptions: "seed=42" as unknown as Record, + }), + ).rejects.toThrow( + "providerOptions must be a JSON object keyed by provider-specific option name.", + ); + expect(generateSpy).not.toHaveBeenCalled(); + }); + + it("forwards providerOptions to the runtime for valid JSON-object payloads", async () => { + vi.spyOn(videoGenerationRuntime, "listRuntimeVideoGenerationProviders").mockReturnValue([ + { + id: "video-plugin", + defaultModel: "vid-v1", + models: ["vid-v1"], + capabilities: { + providerOptions: { seed: "number", draft: "boolean" }, + }, + generateVideo: vi.fn(async () => ({ + videos: [{ buffer: Buffer.from("x"), mimeType: "video/mp4" }], + })), + }, + ]); + const generateSpy = vi.spyOn(videoGenerationRuntime, "generateVideo").mockResolvedValue({ + provider: "video-plugin", + model: "vid-v1", + attempts: [], + ignoredOverrides: [], + videos: [{ buffer: Buffer.from("video-bytes"), mimeType: "video/mp4", fileName: "out.mp4" }], + }); + vi.spyOn(mediaStore, "saveMediaBuffer").mockResolvedValueOnce({ + path: "/tmp/out.mp4", + id: "out.mp4", + size: 11, + contentType: "video/mp4", + }); + + const tool = createVideoGenerateTool({ + config: asConfig({ + agents: { + defaults: { + videoGenerationModel: { primary: "video-plugin/vid-v1" }, + }, + }, + }), + }); + if (!tool) { + throw new Error("expected video_generate tool"); + } + + await tool.execute("call-1", { + prompt: "lobster", + providerOptions: { seed: 42, draft: true }, + }); + + expect(generateSpy).toHaveBeenCalledWith( + expect.objectContaining({ + providerOptions: { seed: 42, draft: true }, + }), + ); + }); + + it("rejects *Roles arrays that are longer than the asset list", async () => { + vi.spyOn(videoGenerationRuntime, "listRuntimeVideoGenerationProviders").mockReturnValue([ + { + id: "video-plugin", + defaultModel: "vid-v1", + models: ["vid-v1"], + capabilities: { + imageToVideo: { enabled: true, maxInputImages: 2 }, + }, + 
generateVideo: vi.fn(async () => ({ + videos: [{ buffer: Buffer.from("x"), mimeType: "video/mp4" }], + })), + }, + ]); + const generateSpy = vi.spyOn(videoGenerationRuntime, "generateVideo"); + + const tool = createVideoGenerateTool({ + config: asConfig({ + agents: { + defaults: { + videoGenerationModel: { primary: "video-plugin/vid-v1" }, + }, + }, + }), + }); + if (!tool) { + throw new Error("expected video_generate tool"); + } + + await expect( + tool.execute("call-1", { + prompt: "lobster", + image: "data:image/png;base64,cG5n", + // Only one image is provided, so passing two roles is an off-by-one bug. + imageRoles: ["first_frame", "last_frame"], + }), + ).rejects.toThrow(/imageRoles has 2 entries but only 1 reference image/); + expect(generateSpy).not.toHaveBeenCalled(); + }); + + it("rejects *Roles that are not arrays", async () => { + vi.spyOn(videoGenerationRuntime, "listRuntimeVideoGenerationProviders").mockReturnValue([ + { + id: "video-plugin", + defaultModel: "vid-v1", + models: ["vid-v1"], + capabilities: {}, + generateVideo: vi.fn(async () => ({ + videos: [{ buffer: Buffer.from("x"), mimeType: "video/mp4" }], + })), + }, + ]); + const generateSpy = vi.spyOn(videoGenerationRuntime, "generateVideo"); + const tool = createVideoGenerateTool({ + config: asConfig({ + agents: { + defaults: { + videoGenerationModel: { primary: "video-plugin/vid-v1" }, + }, + }, + }), + }); + if (!tool) { + throw new Error("expected video_generate tool"); + } + + await expect( + tool.execute("call-1", { + prompt: "lobster", + imageRoles: "first_frame" as unknown as string[], + }), + ).rejects.toThrow( + "imageRoles must be a JSON array of role strings, parallel to the reference list.", + ); + expect(generateSpy).not.toHaveBeenCalled(); + }); + + it("attaches positional role hints to loaded reference assets", async () => { + vi.spyOn(videoGenerationRuntime, "listRuntimeVideoGenerationProviders").mockReturnValue([ + { + id: "video-plugin", + defaultModel: "vid-v1", + models: 
["vid-v1"], + capabilities: { + imageToVideo: { enabled: true, maxInputImages: 2 }, + }, + generateVideo: vi.fn(async () => ({ + videos: [{ buffer: Buffer.from("x"), mimeType: "video/mp4" }], + })), + }, + ]); + const generateSpy = vi.spyOn(videoGenerationRuntime, "generateVideo").mockResolvedValue({ + provider: "video-plugin", + model: "vid-v1", + attempts: [], + ignoredOverrides: [], + videos: [{ buffer: Buffer.from("video-bytes"), mimeType: "video/mp4", fileName: "out.mp4" }], + }); + vi.spyOn(mediaStore, "saveMediaBuffer").mockResolvedValueOnce({ + path: "/tmp/out.mp4", + id: "out.mp4", + size: 11, + contentType: "video/mp4", + }); + + const tool = createVideoGenerateTool({ + config: asConfig({ + agents: { + defaults: { + videoGenerationModel: { primary: "video-plugin/vid-v1" }, + }, + }, + }), + }); + if (!tool) { + throw new Error("expected video_generate tool"); + } + + await tool.execute("call-1", { + prompt: "lobster", + images: ["data:image/png;base64,Zmlyc3Q=", "data:image/png;base64,bGFzdA=="], + imageRoles: ["first_frame", "last_frame"], + }); + + expect(generateSpy).toHaveBeenCalledTimes(1); + const call = generateSpy.mock.calls[0]?.[0] as { + inputImages?: Array<{ role?: string }>; + }; + expect(call.inputImages).toHaveLength(2); + expect(call.inputImages?.[0]?.role).toBe("first_frame"); + expect(call.inputImages?.[1]?.role).toBe("last_frame"); + }); + + it("rejects audio data: URLs via the templated rejection branch", async () => { + vi.spyOn(videoGenerationRuntime, "listRuntimeVideoGenerationProviders").mockReturnValue([ + { + id: "video-plugin", + defaultModel: "vid-v1", + models: ["vid-v1"], + capabilities: { + maxInputAudios: 1, + }, + generateVideo: vi.fn(async () => ({ + videos: [{ buffer: Buffer.from("x"), mimeType: "video/mp4" }], + })), + }, + ]); + const generateSpy = vi.spyOn(videoGenerationRuntime, "generateVideo"); + + const tool = createVideoGenerateTool({ + config: asConfig({ + agents: { + defaults: { + videoGenerationModel: { 
primary: "video-plugin/vid-v1" }, + }, + }, + }), + }); + if (!tool) { + throw new Error("expected video_generate tool"); + } + + await expect( + tool.execute("call-1", { + prompt: "lobster", + audioRef: "data:audio/mpeg;base64,bXAz", + }), + ).rejects.toThrow("audio data: URLs are not supported for video_generate."); + expect(generateSpy).not.toHaveBeenCalled(); + }); + + it("accepts aspectRatio=adaptive and forwards it to the runtime", async () => { + vi.spyOn(videoGenerationRuntime, "listRuntimeVideoGenerationProviders").mockReturnValue([ + { + id: "video-plugin", + defaultModel: "vid-v1", + models: ["vid-v1"], + capabilities: {}, + generateVideo: vi.fn(async () => ({ + videos: [{ buffer: Buffer.from("x"), mimeType: "video/mp4" }], + })), + }, + ]); + const generateSpy = vi.spyOn(videoGenerationRuntime, "generateVideo").mockResolvedValue({ + provider: "video-plugin", + model: "vid-v1", + attempts: [], + ignoredOverrides: [], + videos: [{ buffer: Buffer.from("video-bytes"), mimeType: "video/mp4", fileName: "out.mp4" }], + }); + vi.spyOn(mediaStore, "saveMediaBuffer").mockResolvedValueOnce({ + path: "/tmp/out.mp4", + id: "out.mp4", + size: 11, + contentType: "video/mp4", + }); + + const tool = createVideoGenerateTool({ + config: asConfig({ + agents: { + defaults: { + videoGenerationModel: { primary: "video-plugin/vid-v1" }, + }, + }, + }), + }); + if (!tool) { + throw new Error("expected video_generate tool"); + } + + await tool.execute("call-1", { + prompt: "lobster", + aspectRatio: "adaptive", + }); + + expect(generateSpy).toHaveBeenCalledWith(expect.objectContaining({ aspectRatio: "adaptive" })); + }); + + it("rejects unsupported aspectRatio values", async () => { + vi.spyOn(videoGenerationRuntime, "listRuntimeVideoGenerationProviders").mockReturnValue([ + { + id: "video-plugin", + defaultModel: "vid-v1", + models: ["vid-v1"], + capabilities: {}, + generateVideo: vi.fn(async () => ({ + videos: [{ buffer: Buffer.from("x"), mimeType: "video/mp4" }], + })), + }, + 
]); + const tool = createVideoGenerateTool({ + config: asConfig({ + agents: { + defaults: { + videoGenerationModel: { primary: "video-plugin/vid-v1" }, + }, + }, + }), + }); + if (!tool) { + throw new Error("expected video_generate tool"); + } + + await expect( + tool.execute("call-1", { + prompt: "lobster", + aspectRatio: "17:9", + }), + ).rejects.toThrow( + "aspectRatio must be one of 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, 21:9, or adaptive", + ); + }); }); diff --git a/src/agents/tools/video-generate-tool.ts b/src/agents/tools/video-generate-tool.ts index c20d98e07db..df5bde4551d 100644 --- a/src/agents/tools/video-generate-tool.ts +++ b/src/agents/tools/video-generate-tool.ts @@ -5,6 +5,7 @@ import { formatErrorMessage } from "../../infra/errors.js"; import { createSubsystemLogger } from "../../logging/subsystem.js"; import { saveMediaBuffer } from "../../media/store.js"; import { loadWebMedia } from "../../media/web-media.js"; +import { readSnakeCaseParamRaw } from "../../param-key.js"; import { resolveUserPath } from "../../utils.js"; import type { DeliveryContext } from "../../utils/delivery-context.js"; import { @@ -58,8 +59,9 @@ import { } from "./video-generate-tool.actions.js"; const log = createSubsystemLogger("agents/tools/video-generate"); -const MAX_INPUT_IMAGES = 5; +const MAX_INPUT_IMAGES = 9; const MAX_INPUT_VIDEOS = 4; +const MAX_INPUT_AUDIOS = 3; const SUPPORTED_ASPECT_RATIOS = new Set([ "1:1", "2:3", @@ -71,6 +73,14 @@ const SUPPORTED_ASPECT_RATIOS = new Set([ "9:16", "16:9", "21:9", + // Provider-specific sentinel: accepted at the tool boundary, then forwarded + // to the active provider only if that provider declares "adaptive" in its + // capabilities.aspectRatios list. Providers that do not declare it see the + // value pushed into `ignoredOverrides` in the normalization layer so the + // tool surfaces a user-visible "ignored override" warning rather than + // silently dropping the request. 
Seedance uses this to auto-detect the + // ratio from input image dimensions. + "adaptive", ]); const VideoGenerateToolSchema = Type.Object({ @@ -91,6 +101,17 @@ const VideoGenerateToolSchema = Type.Object({ description: `Optional reference images (up to ${MAX_INPUT_IMAGES}).`, }), ), + imageRoles: Type.Optional( + Type.Array(Type.String(), { + description: + "Optional semantic roles for the combined reference image list, parallel by index. " + + "The list is `image` (if provided) followed by each entry in `images`, in order, " + + "after de-duplication. " + + 'Canonical values: "first_frame", "last_frame", "reference_image". ' + + "Providers may accept additional role strings. " + + "Must not have more entries than the combined image list; use an empty string to leave a position unset.", + }), + ), video: Type.Optional( Type.String({ description: "Optional single reference video path or URL.", @@ -101,6 +122,36 @@ const VideoGenerateToolSchema = Type.Object({ description: `Optional reference videos (up to ${MAX_INPUT_VIDEOS}).`, }), ), + videoRoles: Type.Optional( + Type.Array(Type.String(), { + description: + "Optional semantic roles for the combined reference video list, parallel by index. " + + "The list is `video` (if provided) followed by each entry in `videos`, in order, " + + "after de-duplication. " + + 'Canonical value: "reference_video". Providers may accept additional role strings. ' + + "Must not have more entries than the combined video list; use an empty string to leave a position unset.", + }), + ), + audioRef: Type.Optional( + Type.String({ + description: "Optional single reference audio path or URL (e.g. background music).", + }), + ), + audioRefs: Type.Optional( + Type.Array(Type.String(), { + description: `Optional reference audios (up to ${MAX_INPUT_AUDIOS}).`, + }), + ), + audioRoles: Type.Optional( + Type.Array(Type.String(), { + description: + "Optional semantic roles for the combined reference audio list, parallel by index. 
" + + "The list is `audioRef` (if provided) followed by each entry in `audioRefs`, in order, " + + "after de-duplication. " + + 'Canonical value: "reference_audio". Providers may accept additional role strings. ' + + "Must not have more entries than the combined audio list; use an empty string to leave a position unset.", + }), + ), model: Type.Optional( Type.String({ description: "Optional provider/model override, e.g. qwen/wan2.6-t2v." }), ), @@ -118,7 +169,7 @@ const VideoGenerateToolSchema = Type.Object({ aspectRatio: Type.Optional( Type.String({ description: - "Optional aspect ratio hint: 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, or 21:9.", + 'Optional aspect ratio hint: 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, 21:9, or "adaptive".', }), ), resolution: Type.Optional( @@ -143,6 +194,16 @@ const VideoGenerateToolSchema = Type.Object({ description: "Optional watermark toggle when the provider supports it.", }), ), + providerOptions: Type.Optional( + Type.Record(Type.String(), Type.Unknown(), { + description: + 'Optional provider-specific options as a JSON object, e.g. `{"seed": 42, "draft": true}`. ' + + "Each provider declares its own accepted keys and primitive types (number/boolean/string) " + + "via its capabilities; unknown keys or type mismatches skip the candidate during fallback " + + "and never silently reach the wrong provider. Run `video_generate action=list` to see which " + + "keys each provider accepts.", + }), + ), }); export function resolveVideoGenerationModelConfigForTool(params: { @@ -190,14 +251,44 @@ function normalizeAspectRatio(raw: string | undefined): string | undefined { return normalized; } throw new ToolInputError( - "aspectRatio must be one of 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, or 21:9", + "aspectRatio must be one of 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, 21:9, or adaptive", ); } +/** + * Parse a `*Roles` parallel string array for `video_generate`. 
Throws when + * the caller supplies more roles than assets so off-by-one alignment bugs + * fail loudly at the tool boundary instead of silently dropping the + * trailing roles. Empty strings in the array are allowed and mean "no + * role at this position". Non-string entries are coerced to empty strings + * and treated as "unset" so providers can leave individual slots empty. + */ +function parseRoleArray(params: { + raw: unknown; + kind: "imageRoles" | "videoRoles" | "audioRoles"; + assetCount: number; +}): string[] { + if (params.raw === undefined || params.raw === null) { + return []; + } + if (!Array.isArray(params.raw)) { + throw new ToolInputError( + `${params.kind} must be a JSON array of role strings, parallel to the reference list.`, + ); + } + const roles = params.raw.map((entry) => (typeof entry === "string" ? entry.trim() : "")); + if (roles.length > params.assetCount) { + throw new ToolInputError( + `${params.kind} has ${roles.length} entries but only ${params.assetCount} reference ${params.kind === "imageRoles" ? "image" : params.kind === "videoRoles" ? "video" : "audio"}${params.assetCount === 1 ? "" : "s"} were provided; extra roles cannot be aligned positionally.`, + ); + } + return roles; +} + function normalizeReferenceInputs(params: { args: Record; - singularKey: "image" | "video"; - pluralKey: "images" | "videos"; + singularKey: "image" | "video" | "audioRef"; + pluralKey: "images" | "videos" | "audioRefs"; maxCount: number; }): string[] { return normalizeMediaReferenceInputs({ @@ -227,6 +318,7 @@ function validateVideoGenerationCapabilities(params: { model?: string; inputImageCount: number; inputVideoCount: number; + inputAudioCount: number; size?: string; aspectRatio?: string; resolution?: VideoGenerationResolution; @@ -288,6 +380,15 @@ function validateVideoGenerationCapabilities(params: { ); } } + // Audio-count validation is intentionally deferred to runtime.ts (generateVideo). 
+ // The runtime guard skips per-candidate providers that lack audio support, allowing + // fallback candidates that do support audio to run. A ToolInputError here would fire + // against only the primary provider and prevent valid fallback-based audio requests. + // maxDurationSeconds validation is intentionally deferred to runtime.ts (generateVideo). + // The runtime guard skips per-candidate providers whose hard cap is below the requested + // duration, allowing a fallback with a higher cap to run — same rationale as the audio + // check above. When providers declare an explicit supportedDurationSeconds list, runtime + // normalization snaps to the nearest valid value instead of skipping. } function formatIgnoredVideoGenerationOverride(override: VideoGenerationIgnoredOverride): string { @@ -313,7 +414,7 @@ function defaultScheduleVideoGenerateBackgroundWork(work: () => Promise) { async function loadReferenceAssets(params: { inputs: string[]; - expectedKind: "image" | "video"; + expectedKind: "image" | "video" | "audio"; maxBytes?: number; workspaceDir?: string; sandboxConfig: { root: string; bridge: SandboxFsBridge; workspaceOnly: boolean } | null; @@ -395,7 +496,9 @@ async function loadReferenceAssets(params: { ? params.expectedKind === "image" ? decodeDataUrl(resolvedInput) : (() => { - throw new ToolInputError("Video data: URLs are not supported for video_generate."); + throw new ToolInputError( + `${params.expectedKind} data: URLs are not supported for video_generate.`, + ); })() : params.sandboxConfig ? await loadWebMedia(resolvedPath ?? 
resolvedInput, { @@ -451,7 +554,9 @@ async function executeVideoGenerationJob(params: { filename?: string; loadedReferenceImages: LoadedReferenceAsset[]; loadedReferenceVideos: LoadedReferenceAsset[]; + loadedReferenceAudios: LoadedReferenceAsset[]; taskHandle?: VideoGenerationTaskHandle | null; + providerOptions?: Record; }): Promise { if (params.taskHandle) { recordVideoGenerationTaskProgress({ @@ -472,6 +577,8 @@ async function executeVideoGenerationJob(params: { watermark: params.watermark, inputImages: params.loadedReferenceImages.map((entry) => entry.sourceAsset), inputVideos: params.loadedReferenceVideos.map((entry) => entry.sourceAsset), + inputAudios: params.loadedReferenceAudios.map((entry) => entry.sourceAsset), + providerOptions: params.providerOptions, }); if (params.taskHandle) { recordVideoGenerationTaskProgress({ @@ -479,6 +586,7 @@ async function executeVideoGenerationJob(params: { progressSummary: "Saving generated video", }); } + const savedVideos = await Promise.all( result.videos.map((video) => saveMediaBuffer( @@ -683,18 +791,56 @@ export function createVideoGenerateTool(options?: { }); const audio = readBooleanToolParam(args, "audio"); const watermark = readBooleanToolParam(args, "watermark"); + // providerOptions must be a plain object. Arrays are objects in JS, so + // exclude them explicitly — a bogus call like `providerOptions: ["seed", 42]` + // would otherwise be cast to `Record` with numeric-string + // keys and silently forwarded to the provider. + const providerOptionsRaw = readSnakeCaseParamRaw(args, "providerOptions"); + if ( + providerOptionsRaw != null && + (typeof providerOptionsRaw !== "object" || Array.isArray(providerOptionsRaw)) + ) { + throw new ToolInputError( + "providerOptions must be a JSON object keyed by provider-specific option name.", + ); + } + const providerOptions = + providerOptionsRaw != null ? 
(providerOptionsRaw as Record) : undefined; const imageInputs = normalizeReferenceInputs({ args, singularKey: "image", pluralKey: "images", maxCount: MAX_INPUT_IMAGES, }); + // *Roles: parallel string arrays giving each asset a semantic role hint. + // Use readSnakeCaseParamRaw so both camelCase and snake_case keys are accepted. + const imageRoles = parseRoleArray({ + raw: readSnakeCaseParamRaw(args, "imageRoles"), + kind: "imageRoles", + assetCount: imageInputs.length, + }); const videoInputs = normalizeReferenceInputs({ args, singularKey: "video", pluralKey: "videos", maxCount: MAX_INPUT_VIDEOS, }); + const videoRoles = parseRoleArray({ + raw: readSnakeCaseParamRaw(args, "videoRoles"), + kind: "videoRoles", + assetCount: videoInputs.length, + }); + const audioInputs = normalizeReferenceInputs({ + args, + singularKey: "audioRef", + pluralKey: "audioRefs", + maxCount: MAX_INPUT_AUDIOS, + }); + const audioRoles = parseRoleArray({ + raw: readSnakeCaseParamRaw(args, "audioRoles"), + kind: "audioRoles", + assetCount: audioInputs.length, + }); const selectedProvider = resolveSelectedVideoGenerationProvider({ config: effectiveCfg, @@ -707,18 +853,44 @@ export function createVideoGenerateTool(options?: { workspaceDir: options?.workspaceDir, sandboxConfig, }); + // Attach roles to the loaded image assets (positional, by index into images[]). 
+ for (let i = 0; i < loadedReferenceImages.length; i++) { + const role = imageRoles[i]; + if (role) { + loadedReferenceImages[i].sourceAsset.role = role; + } + } const loadedReferenceVideos = await loadReferenceAssets({ inputs: videoInputs, expectedKind: "video", workspaceDir: options?.workspaceDir, sandboxConfig, }); + for (let i = 0; i < loadedReferenceVideos.length; i++) { + const role = videoRoles[i]; + if (role) { + loadedReferenceVideos[i].sourceAsset.role = role; + } + } + const loadedReferenceAudios = await loadReferenceAssets({ + inputs: audioInputs, + expectedKind: "audio", + workspaceDir: options?.workspaceDir, + sandboxConfig, + }); + for (let i = 0; i < loadedReferenceAudios.length; i++) { + const role = audioRoles[i]; + if (role) { + loadedReferenceAudios[i].sourceAsset.role = role; + } + } validateVideoGenerationCapabilities({ provider: selectedProvider, model: parseVideoGenerationModelRef(model)?.model ?? model ?? selectedProvider?.defaultModel, inputImageCount: loadedReferenceImages.length, inputVideoCount: loadedReferenceVideos.length, + inputAudioCount: loadedReferenceAudios.length, size, aspectRatio, resolution, @@ -751,7 +923,9 @@ export function createVideoGenerateTool(options?: { filename, loadedReferenceImages, loadedReferenceVideos, + loadedReferenceAudios, taskHandle, + providerOptions, }); completeVideoGenerationTaskRun({ handle: taskHandle, @@ -843,7 +1017,9 @@ export function createVideoGenerateTool(options?: { filename, loadedReferenceImages, loadedReferenceVideos, + loadedReferenceAudios, taskHandle, + providerOptions, }); completeVideoGenerationTaskRun({ handle: taskHandle, diff --git a/src/plugin-sdk/video-generation.ts b/src/plugin-sdk/video-generation.ts index f1731ace696..0e537c7dd91 100644 --- a/src/plugin-sdk/video-generation.ts +++ b/src/plugin-sdk/video-generation.ts @@ -7,11 +7,13 @@ import type { AuthProfileStore } from "../agents/auth-profiles.js"; import type { OpenClawConfig } from "../config/config.js"; import type { 
GeneratedVideoAsset as CoreGeneratedVideoAsset, + VideoGenerationAssetRole as CoreVideoGenerationAssetRole, VideoGenerationMode as CoreVideoGenerationMode, VideoGenerationModeCapabilities as CoreVideoGenerationModeCapabilities, VideoGenerationProvider as CoreVideoGenerationProvider, VideoGenerationProviderCapabilities as CoreVideoGenerationProviderCapabilities, VideoGenerationProviderConfiguredContext as CoreVideoGenerationProviderConfiguredContext, + VideoGenerationProviderOptionType as CoreVideoGenerationProviderOptionType, VideoGenerationRequest as CoreVideoGenerationRequest, VideoGenerationResolution as CoreVideoGenerationResolution, VideoGenerationResult as CoreVideoGenerationResult, @@ -28,11 +30,29 @@ export type GeneratedVideoAsset = { export type VideoGenerationResolution = "480P" | "720P" | "768P" | "1080P"; +/** + * Canonical semantic role hints for reference assets (first/last frame, + * reference image/video/audio). Providers may accept additional role strings; + * the asset.role type accepts both canonical values and arbitrary strings. + */ +export type VideoGenerationAssetRole = + | "first_frame" + | "last_frame" + | "reference_image" + | "reference_video" + | "reference_audio"; + export type VideoGenerationSourceAsset = { url?: string; buffer?: Buffer; mimeType?: string; fileName?: string; + /** + * Optional semantic role hint forwarded to the provider. Canonical values + * come from `VideoGenerationAssetRole`; plain strings are accepted for + * provider-specific extensions. + */ + role?: VideoGenerationAssetRole | (string & {}); metadata?: Record; }; @@ -57,6 +77,10 @@ export type VideoGenerationRequest = { watermark?: boolean; inputImages?: VideoGenerationSourceAsset[]; inputVideos?: VideoGenerationSourceAsset[]; + /** Reference audio assets (e.g. background music) forwarded to the provider. */ + inputAudios?: VideoGenerationSourceAsset[]; + /** Arbitrary provider-specific parameters forwarded as-is (e.g. seed, draft, camerafixed). 
*/ + providerOptions?: Record; }; export type VideoGenerationResult = { @@ -67,10 +91,19 @@ export type VideoGenerationResult = { export type VideoGenerationMode = "generate" | "imageToVideo" | "videoToVideo"; +/** + * Primitive type tag for a declared `providerOptions` key. Keep narrow — + * plugins that need richer shapes should leave them out of the typed contract + * and interpret the forwarded opaque value inside their own provider code. + */ +export type VideoGenerationProviderOptionType = "number" | "boolean" | "string"; + export type VideoGenerationModeCapabilities = { maxVideos?: number; maxInputImages?: number; maxInputVideos?: number; + /** Max number of reference audio assets the provider accepts (e.g. background music, voice reference). */ + maxInputAudios?: number; maxDurationSeconds?: number; supportedDurationSeconds?: readonly number[]; supportedDurationSecondsByModel?: Readonly>; @@ -82,6 +115,14 @@ export type VideoGenerationModeCapabilities = { supportsResolution?: boolean; supportsAudio?: boolean; supportsWatermark?: boolean; + /** + * Declared typed schema for `VideoGenerationRequest.providerOptions`. Keys + * listed here are accepted and validated against the declared primitive + * type before forwarding; unknown keys or type mismatches skip the + * candidate provider at runtime so mis-typed or provider-specific options + * never silently reach the wrong provider. 
+ */ + providerOptions?: Readonly>; }; export type VideoGenerationTransformCapabilities = VideoGenerationModeCapabilities & { @@ -110,6 +151,10 @@ type AssertAssignable<_Left extends _Right, _Right> = true; type _VideoGenerationSdkCompat = [ AssertAssignable, AssertAssignable, + AssertAssignable, + AssertAssignable, + AssertAssignable, + AssertAssignable, AssertAssignable, AssertAssignable, AssertAssignable, diff --git a/src/video-generation/normalization.ts b/src/video-generation/normalization.ts index 030cc8aeea3..e8efbe626d6 100644 --- a/src/video-generation/normalization.ts +++ b/src/video-generation/normalization.ts @@ -103,6 +103,13 @@ export function resolveVideoGenerationOverrides(params: { requested: aspectRatio, applied: normalizedAspectRatio, }; + } else if (!normalizedAspectRatio) { + // Provider-specific sentinel values like `"adaptive"` are unparseable as a + // numeric ratio, so `resolveClosestAspectRatio` returns undefined for + // providers that don't list the sentinel in `caps.aspectRatios`. Surface + // the drop via `ignoredOverrides` so the tool result warning picks it up + // instead of silently forgetting the requested value. 
+ ignoredOverrides.push({ key: "aspectRatio", value: aspectRatio }); } aspectRatio = normalizedAspectRatio; } else if (!caps.supportsAspectRatio && aspectRatio) { diff --git a/src/video-generation/runtime.test.ts b/src/video-generation/runtime.test.ts index 8ef7624586a..ad7b498f9db 100644 --- a/src/video-generation/runtime.test.ts +++ b/src/video-generation/runtime.test.ts @@ -5,7 +5,7 @@ import { } from "../../test/helpers/media-generation/runtime-module-mocks.js"; import type { OpenClawConfig } from "../config/types.js"; import { generateVideo, listRuntimeVideoGenerationProviders } from "./runtime.js"; -import type { VideoGenerationProvider } from "./types.js"; +import type { VideoGenerationProvider, VideoGenerationProviderOptionType } from "./types.js"; const mocks = getMediaGenerationRuntimeMocks(); @@ -135,6 +135,388 @@ describe("video-generation runtime", () => { ]); }); + it("forwards providerOptions to providers that declare the matching schema", async () => { + mocks.resolveAgentModelPrimaryValue.mockReturnValue("video-plugin/vid-v1"); + let seenProviderOptions: unknown; + const provider: VideoGenerationProvider = { + id: "video-plugin", + capabilities: { + providerOptions: { + seed: "number", + draft: "boolean", + camera_fixed: "boolean", + }, + }, + async generateVideo(req) { + seenProviderOptions = req.providerOptions; + return { videos: [{ buffer: Buffer.from("x"), mimeType: "video/mp4" }] }; + }, + }; + mocks.getVideoGenerationProvider.mockReturnValue(provider); + + await generateVideo({ + cfg: { + agents: { defaults: { videoGenerationModel: { primary: "video-plugin/vid-v1" } } }, + } as OpenClawConfig, + prompt: "test", + providerOptions: { seed: 42, draft: true, camera_fixed: false }, + }); + + expect(seenProviderOptions).toEqual({ seed: 42, draft: true, camera_fixed: false }); + }); + + it("passes providerOptions through to providers that do not declare any schema", async () => { + // Undeclared schema = backward-compatible pass-through: the 
provider receives the + // options and can handle or ignore them. No skip occurs. + mocks.resolveAgentModelPrimaryValue.mockReturnValue("video-plugin/vid-v1"); + let seenProviderOptions: unknown; + const provider: VideoGenerationProvider = { + id: "video-plugin", + capabilities: {}, // no providerOptions declared + async generateVideo(req) { + seenProviderOptions = req.providerOptions; + return { videos: [{ buffer: Buffer.from("x"), mimeType: "video/mp4" }] }; + }, + }; + mocks.getVideoGenerationProvider.mockReturnValue(provider); + + await generateVideo({ + cfg: { + agents: { defaults: { videoGenerationModel: { primary: "video-plugin/vid-v1" } } }, + } as OpenClawConfig, + prompt: "test", + providerOptions: { seed: 42 }, + }); + + expect(seenProviderOptions).toEqual({ seed: 42 }); + }); + + it("skips candidates that explicitly declare an empty providerOptions schema", async () => { + // Explicitly declared empty schema ({}) = provider has opted in and supports no options. + mocks.resolveAgentModelPrimaryValue.mockReturnValue("video-plugin/vid-v1"); + const provider: VideoGenerationProvider = { + id: "video-plugin", + capabilities: { providerOptions: {} as Record }, // explicitly empty + async generateVideo() { + throw new Error("should not be called"); + }, + }; + mocks.getVideoGenerationProvider.mockReturnValue(provider); + + await expect( + generateVideo({ + cfg: { + agents: { defaults: { videoGenerationModel: { primary: "video-plugin/vid-v1" } } }, + } as OpenClawConfig, + prompt: "test", + providerOptions: { seed: 42 }, + }), + ).rejects.toThrow(/does not accept providerOptions/); + }); + + it("skips candidates that declare a providerOptions schema missing the requested key", async () => { + mocks.resolveAgentModelPrimaryValue.mockReturnValue("video-plugin/vid-v1"); + const provider: VideoGenerationProvider = { + id: "video-plugin", + capabilities: { + providerOptions: { draft: "boolean" }, + }, + async generateVideo() { + throw new Error("should not be 
called"); + }, + }; + mocks.getVideoGenerationProvider.mockReturnValue(provider); + + await expect( + generateVideo({ + cfg: { + agents: { defaults: { videoGenerationModel: { primary: "video-plugin/vid-v1" } } }, + } as OpenClawConfig, + prompt: "test", + providerOptions: { seed: 42 }, + }), + ).rejects.toThrow(/does not accept providerOptions keys: seed \(accepted: draft\)/); + }); + + it("skips candidates when providerOptions values do not match the declared type", async () => { + mocks.resolveAgentModelPrimaryValue.mockReturnValue("video-plugin/vid-v1"); + const provider: VideoGenerationProvider = { + id: "video-plugin", + capabilities: { + providerOptions: { seed: "number" }, + }, + async generateVideo() { + throw new Error("should not be called"); + }, + }; + mocks.getVideoGenerationProvider.mockReturnValue(provider); + + await expect( + generateVideo({ + cfg: { + agents: { defaults: { videoGenerationModel: { primary: "video-plugin/vid-v1" } } }, + } as OpenClawConfig, + prompt: "test", + providerOptions: { seed: "forty-two" }, + }), + ).rejects.toThrow(/expects providerOptions\.seed to be a finite number, got string/); + }); + + it("falls over from a provider with explicitly empty providerOptions schema to one that has it", async () => { + // Explicitly empty schema ({}) causes a skip; undeclared schema passes through. + // Here "openai" declares {} to signal it has been audited and truly accepts no options. 
+ mocks.getVideoGenerationProvider.mockImplementation((providerId: string) => { + if (providerId === "openai") { + return { + id: "openai", + defaultModel: "sora-2", + capabilities: { + providerOptions: {} as Record, + }, // explicitly empty: accepts no options + isConfigured: () => true, + async generateVideo() { + throw new Error("should not be called"); + }, + }; + } + if (providerId === "byteplus") { + return { + id: "byteplus", + defaultModel: "seedance-1-0-pro-250528", + capabilities: { + providerOptions: { seed: "number" }, + }, + isConfigured: () => true, + async generateVideo(req) { + expect(req.providerOptions).toEqual({ seed: 42 }); + return { + videos: [{ buffer: Buffer.from("mp4-bytes"), mimeType: "video/mp4" }], + model: "seedance-1-0-pro-250528", + }; + }, + }; + } + return undefined; + }); + mocks.listVideoGenerationProviders.mockReturnValue([ + { + id: "openai", + defaultModel: "sora-2", + capabilities: { providerOptions: {} as Record }, + isConfigured: () => true, + generateVideo: async () => ({ videos: [] }), + }, + { + id: "byteplus", + defaultModel: "seedance-1-0-pro-250528", + capabilities: { providerOptions: { seed: "number" } }, + isConfigured: () => true, + generateVideo: async () => ({ videos: [] }), + }, + ]); + + const result = await generateVideo({ + cfg: {} as OpenClawConfig, + prompt: "animate a cat", + providerOptions: { seed: 42 }, + }); + + expect(result.provider).toBe("byteplus"); + expect(result.attempts).toHaveLength(1); + expect(result.attempts[0]?.provider).toBe("openai"); + expect(result.attempts[0]?.error).toMatch(/does not accept providerOptions/); + }); + + it("skips providers that cannot satisfy reference audio inputs and falls back", async () => { + mocks.getVideoGenerationProvider.mockImplementation((providerId: string) => { + if (providerId === "openai") { + return { + id: "openai", + defaultModel: "sora-2", + capabilities: {}, + isConfigured: () => true, + async generateVideo() { + throw new Error("should not be 
called"); + }, + }; + } + if (providerId === "byteplus") { + return { + id: "byteplus", + defaultModel: "seedance-1-0-pro-250528", + capabilities: { + maxInputAudios: 1, + }, + isConfigured: () => true, + async generateVideo(req) { + expect(req.inputAudios).toEqual([ + { url: "https://example.com/reference-audio.mp3", role: "reference_audio" }, + ]); + return { + videos: [{ buffer: Buffer.from("mp4-bytes"), mimeType: "video/mp4" }], + model: "seedance-1-0-pro-250528", + }; + }, + }; + } + return undefined; + }); + mocks.listVideoGenerationProviders.mockReturnValue([ + { + id: "openai", + defaultModel: "sora-2", + capabilities: {}, + isConfigured: () => true, + generateVideo: async () => ({ videos: [] }), + }, + { + id: "byteplus", + defaultModel: "seedance-1-0-pro-250528", + capabilities: { maxInputAudios: 1 }, + isConfigured: () => true, + generateVideo: async () => ({ videos: [] }), + }, + ]); + + const result = await generateVideo({ + cfg: { + agents: { + defaults: { + videoGenerationModel: { primary: "openai/sora-2" }, + }, + }, + } as OpenClawConfig, + prompt: "animate a cat", + inputAudios: [{ url: "https://example.com/reference-audio.mp3", role: "reference_audio" }], + }); + + expect(result.provider).toBe("byteplus"); + expect(result.attempts).toHaveLength(1); + expect(result.attempts[0]?.provider).toBe("openai"); + expect(result.attempts[0]?.error).toMatch(/does not support reference audio inputs/); + }); + + it("fails when every candidate is skipped for unsupported reference audio inputs", async () => { + mocks.resolveAgentModelPrimaryValue.mockReturnValue("openai/sora-2"); + mocks.getVideoGenerationProvider.mockReturnValue({ + id: "openai", + capabilities: {}, + async generateVideo() { + throw new Error("should not be called"); + }, + }); + + await expect( + generateVideo({ + cfg: { + agents: { defaults: { videoGenerationModel: { primary: "openai/sora-2" } } }, + } as OpenClawConfig, + prompt: "animate a cat", + inputAudios: [{ url: 
"https://example.com/reference-audio.mp3" }], + }), + ).rejects.toThrow(/does not support reference audio inputs/); + }); + + it("skips providers whose hard duration cap is below the request and falls back", async () => { + let seenDurationSeconds: number | undefined; + mocks.getVideoGenerationProvider.mockImplementation((providerId: string) => { + if (providerId === "openai") { + return { + id: "openai", + defaultModel: "sora-2", + capabilities: { + generate: { + maxDurationSeconds: 4, + }, + }, + isConfigured: () => true, + async generateVideo() { + throw new Error("should not be called"); + }, + }; + } + if (providerId === "runway") { + return { + id: "runway", + defaultModel: "gen4.5", + capabilities: { + generate: { + maxDurationSeconds: 8, + }, + }, + isConfigured: () => true, + async generateVideo(req) { + seenDurationSeconds = req.durationSeconds; + return { + videos: [{ buffer: Buffer.from("mp4-bytes"), mimeType: "video/mp4" }], + model: "gen4.5", + }; + }, + }; + } + return undefined; + }); + mocks.listVideoGenerationProviders.mockReturnValue([ + { + id: "openai", + defaultModel: "sora-2", + capabilities: { generate: { maxDurationSeconds: 4 } }, + isConfigured: () => true, + generateVideo: async () => ({ videos: [] }), + }, + { + id: "runway", + defaultModel: "gen4.5", + capabilities: { generate: { maxDurationSeconds: 8 } }, + isConfigured: () => true, + generateVideo: async () => ({ videos: [] }), + }, + ]); + + const result = await generateVideo({ + cfg: { + agents: { + defaults: { + videoGenerationModel: { primary: "openai/sora-2" }, + }, + }, + } as OpenClawConfig, + prompt: "animate a cat", + durationSeconds: 6, + }); + + expect(result.provider).toBe("runway"); + expect(seenDurationSeconds).toBe(6); + expect(result.attempts).toHaveLength(1); + expect(result.attempts[0]?.provider).toBe("openai"); + expect(result.attempts[0]?.error).toMatch(/supports at most 4s per video, 6s requested/); + }); + + it("fails when every candidate is skipped for exceeding 
hard duration caps", async () => { + mocks.resolveAgentModelPrimaryValue.mockReturnValue("openai/sora-2"); + mocks.getVideoGenerationProvider.mockReturnValue({ + id: "openai", + capabilities: { + generate: { + maxDurationSeconds: 4, + }, + }, + async generateVideo() { + throw new Error("should not be called"); + }, + }); + + await expect( + generateVideo({ + cfg: { + agents: { defaults: { videoGenerationModel: { primary: "openai/sora-2" } } }, + } as OpenClawConfig, + prompt: "animate a cat", + durationSeconds: 6, + }), + ).rejects.toThrow(/supports at most 4s per video, 6s requested/); + }); + it("lists runtime video-generation providers through the provider registry", () => { const providers: VideoGenerationProvider[] = [ { diff --git a/src/video-generation/runtime.ts b/src/video-generation/runtime.ts index f1af46dc3e7..a6a59b1f532 100644 --- a/src/video-generation/runtime.ts +++ b/src/video-generation/runtime.ts @@ -10,6 +10,8 @@ import { resolveCapabilityModelCandidates, throwCapabilityGenerationFailure, } from "../media-generation/runtime-shared.js"; +import { resolveVideoGenerationModeCapabilities } from "./capabilities.js"; +import { resolveVideoGenerationSupportedDurations } from "./duration-support.js"; import { parseVideoGenerationModelRef } from "./model-ref.js"; import { resolveVideoGenerationOverrides } from "./normalization.js"; import { getVideoGenerationProvider, listVideoGenerationProviders } from "./provider-registry.js"; @@ -17,6 +19,7 @@ import type { GeneratedVideoAsset, VideoGenerationIgnoredOverride, VideoGenerationNormalization, + VideoGenerationProviderOptionType, VideoGenerationResolution, VideoGenerationResult, VideoGenerationSourceAsset, @@ -24,6 +27,62 @@ import type { const log = createSubsystemLogger("video-generation"); +/** + * Validate agent-supplied providerOptions against the candidate's declared + * schema. 
Returns a human-readable skip reason when the candidate cannot + accept the supplied options, or undefined when everything checks out. + * + * Backward-compatible behavior: + * - Provider declares no schema (undefined): pass options through as-is. + * The provider receives them and may silently ignore unknown keys. This is + * the safe default for legacy / not-yet-migrated providers. + * - Provider explicitly declares an empty schema ({}): rejects any options. + * This is the opt-in signal that the provider has been audited and truly + * supports no provider-specific options. + * - Provider declares a typed schema: validates each key name and value type, + * skipping the candidate on any mismatch. + */ +function validateProviderOptionsAgainstDeclaration(params: { + providerId: string; + model: string; + providerOptions: Record<string, unknown>; + declaration: Readonly<Record<string, VideoGenerationProviderOptionType>> | undefined; +}): string | undefined { + const { providerId, model, providerOptions, declaration } = params; + const keys = Object.keys(providerOptions); + if (keys.length === 0) { + return undefined; + } + // Undeclared schema: pass through for backward compatibility. + if (declaration === undefined) { + return undefined; + } + // Explicitly declared empty schema: provider accepts no options.
+ if (Object.keys(declaration).length === 0) { + return `${providerId}/${model} does not accept providerOptions (caller supplied: ${keys.join(", ")}); skipping`; + } + const unknown = keys.filter((key) => !Object.hasOwn(declaration, key)); + if (unknown.length > 0) { + const accepted = Object.keys(declaration).join(", "); + return `${providerId}/${model} does not accept providerOptions keys: ${unknown.join(", ")} (accepted: ${accepted}); skipping`; + } + for (const key of keys) { + const expected = declaration[key]; + const value = providerOptions[key]; + const actual = typeof value; + if (expected === "number" && (actual !== "number" || !Number.isFinite(value as number))) { + return `${providerId}/${model} expects providerOptions.${key} to be a finite number, got ${actual}; skipping`; + } + if (expected === "boolean" && actual !== "boolean") { + return `${providerId}/${model} expects providerOptions.${key} to be a boolean, got ${actual}; skipping`; + } + if (expected === "string" && actual !== "string") { + return `${providerId}/${model} expects providerOptions.${key} to be a string, got ${actual}; skipping`; + } + } + return undefined; +} + export type GenerateVideoParams = { cfg: OpenClawConfig; prompt: string; @@ -38,6 +97,9 @@ export type GenerateVideoParams = { watermark?: boolean; inputImages?: VideoGenerationSourceAsset[]; inputVideos?: VideoGenerationSourceAsset[]; + inputAudios?: VideoGenerationSourceAsset[]; + /** Arbitrary provider-specific options forwarded to provider.generateVideo. Keys and value types are validated against the provider's declared capabilities.providerOptions schema when one exists; otherwise forwarded as-is without validation.
*/ + providerOptions?: Record<string, unknown>; }; export type GenerateVideoRuntimeResult = { @@ -79,6 +141,17 @@ export async function generateVideo( const attempts: FallbackAttempt[] = []; let lastError: unknown; + let skipWarnEmitted = false; + const warnOnFirstSkip = (reason: string) => { + // Skip events are common in normal fallback flow, so log the *first* one in + // a request at warn level with the reason, and leave the rest at debug. + // This gives the operator visible feedback that their primary provider was + // passed over without flooding logs on long fallback chains. + if (!skipWarnEmitted) { + skipWarnEmitted = true; + log.warn(`video-generation candidate skipped: ${reason}`); + } + }; for (const candidate of candidates) { const provider = getVideoGenerationProvider(candidate.provider, params.cfg); @@ -93,6 +166,109 @@ continue; } + // Guard: skip candidates that cannot satisfy reference-input counts so + // we never silently drop audio/image/video refs by falling over to a + // provider that ignores them and "succeeds" without the caller's assets. + const inputImageCount = params.inputImages?.length ?? 0; + const inputVideoCount = params.inputVideos?.length ?? 0; + const inputAudioCount = params.inputAudios?.length ?? 0; + if (inputAudioCount > 0) { + const { capabilities: candCaps } = resolveVideoGenerationModeCapabilities({ + provider, + inputImageCount, + inputVideoCount, + }); + // Fall back to flat provider.capabilities.maxInputAudios for providers that + // set the all-modes default directly rather than nesting it in capabilities.generate etc. + const maxAudio = candCaps?.maxInputAudios ?? provider.capabilities.maxInputAudios ?? 0; + if (inputAudioCount > maxAudio) { + const error = + maxAudio === 0 + ?
`${candidate.provider}/${candidate.model} does not support reference audio inputs; skipping to avoid silent audio drop` + : `${candidate.provider}/${candidate.model} supports at most ${maxAudio} reference audio(s), ${inputAudioCount} requested; skipping`; + attempts.push({ provider: candidate.provider, model: candidate.model, error }); + lastError = new Error(error); + warnOnFirstSkip(error); + log.debug( + `video-generation candidate skipped (audio capability): ${candidate.provider}/${candidate.model}`, + ); + continue; + } + } + + // Guard: skip candidates that do not accept the requested providerOptions keys, + // or whose declared providerOptions schema does not match the supplied value + // types. Same skip-in-fallback rationale as the audio guard above — we never + // want to silently forward provider-specific options to the wrong provider, + // but we also do not want to block valid fallback candidates that *do* accept + // them. Providers opt in by declaring `capabilities.providerOptions` on the + // active mode or on the flat provider capabilities. + if ( + params.providerOptions && + typeof params.providerOptions === "object" && + Object.keys(params.providerOptions).length > 0 + ) { + const { capabilities: optCaps } = resolveVideoGenerationModeCapabilities({ + provider, + inputImageCount, + inputVideoCount, + }); + const declaredOptions = + optCaps?.providerOptions ?? provider.capabilities.providerOptions ?? 
undefined; + const mismatch = validateProviderOptionsAgainstDeclaration({ + providerId: candidate.provider, + model: candidate.model, + providerOptions: params.providerOptions, + declaration: declaredOptions, + }); + if (mismatch) { + attempts.push({ provider: candidate.provider, model: candidate.model, error: mismatch }); + lastError = new Error(mismatch); + warnOnFirstSkip(mismatch); + log.debug( + `video-generation candidate skipped (providerOptions): ${candidate.provider}/${candidate.model}`, + ); + continue; + } + } + + // Guard: skip candidates whose maxDurationSeconds hard cap is below the requested + // duration. Only applies when the provider uses a simple max with no explicit + // supported-durations list — when a list exists, runtime normalization snaps to the + // nearest valid value so skipping is not appropriate. + const requestedDuration = params.durationSeconds; + if (typeof requestedDuration === "number" && Number.isFinite(requestedDuration)) { + const { capabilities: durCaps } = resolveVideoGenerationModeCapabilities({ + provider, + inputImageCount, + inputVideoCount, + }); + const supportedDurations = resolveVideoGenerationSupportedDurations({ + provider, + model: candidate.model, + inputImageCount, + inputVideoCount, + }); + const maxDuration = durCaps?.maxDurationSeconds ?? provider.capabilities.maxDurationSeconds; + if ( + !supportedDurations && + typeof maxDuration === "number" && + // Compare the normalized (rounded) duration, not the raw float, since + // resolveVideoGenerationOverrides applies Math.round before sending to the provider. + // A request for 4.4s against maxDurationSeconds=4 rounds to 4 and is valid. 
+ Math.round(requestedDuration) > maxDuration + ) { + const error = `${candidate.provider}/${candidate.model} supports at most ${maxDuration}s per video, ${requestedDuration}s requested; skipping`; + attempts.push({ provider: candidate.provider, model: candidate.model, error }); + lastError = new Error(error); + warnOnFirstSkip(error); + log.debug( + `video-generation candidate skipped (duration capability): ${candidate.provider}/${candidate.model}`, + ); + continue; + } + } + try { const sanitized = resolveVideoGenerationOverrides({ provider, @@ -103,8 +279,8 @@ export async function generateVideo( durationSeconds: params.durationSeconds, audio: params.audio, watermark: params.watermark, - inputImageCount: params.inputImages?.length ?? 0, - inputVideoCount: params.inputVideos?.length ?? 0, + inputImageCount, + inputVideoCount, }); const result: VideoGenerationResult = await provider.generateVideo({ provider: candidate.provider, @@ -121,6 +297,8 @@ export async function generateVideo( watermark: sanitized.watermark, inputImages: params.inputImages, inputVideos: params.inputVideos, + inputAudios: params.inputAudios, + providerOptions: params.providerOptions, }); if (!Array.isArray(result.videos) || result.videos.length === 0) { throw new Error("Video generation provider returned no videos."); diff --git a/src/video-generation/types.ts b/src/video-generation/types.ts index 83fd595d960..fc257b8c437 100644 --- a/src/video-generation/types.ts +++ b/src/video-generation/types.ts @@ -11,11 +11,33 @@ export type GeneratedVideoAsset = { export type VideoGenerationResolution = "480P" | "720P" | "768P" | "1080P"; +/** + * Canonical semantic role hints for reference assets. The list covers the + * near-universal I2V vocabulary plus per-kind reference roles. Providers may + * accept additional role strings (extend the asset.role type with a plain + * string at call sites) — core forwards whatever value is set. 
+ */ +export type VideoGenerationAssetRole = + | "first_frame" + | "last_frame" + | "reference_image" + | "reference_video" + | "reference_audio"; + export type VideoGenerationSourceAsset = { url?: string; buffer?: Buffer; mimeType?: string; fileName?: string; + /** + * Optional semantic role hint forwarded to the provider. Canonical values + * come from `VideoGenerationAssetRole`; plain strings are accepted for + * provider-specific extensions. Core does not validate the value beyond + * shape. + */ + // Union with `(string & {})` keeps autocomplete on the canonical values while + // still accepting arbitrary provider-specific role strings. + role?: VideoGenerationAssetRole | (string & {}); metadata?: Record<string, unknown>; }; @@ -36,10 +58,15 @@ export type VideoGenerationRequest = { aspectRatio?: string; resolution?: VideoGenerationResolution; durationSeconds?: number; + /** Enable generated audio in the output when the provider supports it. Distinct from inputAudios (reference audio input). */ audio?: boolean; watermark?: boolean; inputImages?: VideoGenerationSourceAsset[]; inputVideos?: VideoGenerationSourceAsset[]; + /** Reference audio assets (e.g. background music). Role field on each asset is forwarded to the provider as-is. */ + inputAudios?: VideoGenerationSourceAsset[]; + /** Arbitrary provider-specific options forwarded as-is to provider.generateVideo; when the provider declares a capabilities.providerOptions schema, keys and value types were already validated at the runtime fallback boundary. */ + providerOptions?: Record<string, unknown>; }; export type VideoGenerationResult = { @@ -55,10 +82,21 @@ export type VideoGenerationIgnoredOverride = { export type VideoGenerationMode = "generate" | "imageToVideo" | "videoToVideo"; +/** + * Primitive type tag for a declared `providerOptions` key. Core validates + * the agent-supplied value against this tag before forwarding it to the + * provider. Kept deliberately narrow — plugins that need richer shapes + * should keep those fields out of the typed contract and reinterpret the + * forwarded opaque value inside their own provider code.
+ */ +export type VideoGenerationProviderOptionType = "number" | "boolean" | "string"; + export type VideoGenerationModeCapabilities = { maxVideos?: number; maxInputImages?: number; maxInputVideos?: number; + /** Max number of reference audio assets the provider accepts (e.g. background music, voice reference). */ + maxInputAudios?: number; maxDurationSeconds?: number; supportedDurationSeconds?: readonly number[]; supportedDurationSecondsByModel?: Readonly<Record<string, readonly number[]>>; @@ -68,8 +106,17 @@ supportedSize?: boolean; supportsSize?: boolean; supportsAspectRatio?: boolean; supportsResolution?: boolean; + /** Provider can generate audio in the output video. */ supportsAudio?: boolean; supportsWatermark?: boolean; + /** + * Declared typed schema for the opaque `VideoGenerationRequest.providerOptions` + * bag. Keys listed here are accepted; any other keys the agent passes are + * rejected at the runtime fallback boundary so mis-typed or provider-specific + * options never silently reach the wrong provider. Plugins that currently + * accept no providerOptions should leave this undefined or set to `{}`.
+ providerOptions?: Readonly<Record<string, VideoGenerationProviderOptionType>>; }; export type VideoGenerationTransformCapabilities = VideoGenerationModeCapabilities & { diff --git a/test/helpers/media-generation/runtime-module-mocks.ts b/test/helpers/media-generation/runtime-module-mocks.ts index 56fe5bd6451..521000d10ab 100644 --- a/test/helpers/media-generation/runtime-module-mocks.ts +++ b/test/helpers/media-generation/runtime-module-mocks.ts @@ -9,6 +9,7 @@ type ModelRef = { provider: string; model: string }; const mediaRuntimeMocks = vi.hoisted(() => { const debug = vi.fn(); + const warn = vi.fn(); const parseGenerationModelRef = (raw?: string): ModelRef | undefined => { const trimmed = raw?.trim(); if (!trimmed) { return undefined; } @@ -24,7 +25,7 @@ }; }; return { - createSubsystemLogger: vi.fn(() => ({ debug })), + createSubsystemLogger: vi.fn(() => ({ debug, warn })), describeFailoverError: vi.fn(), getImageGenerationProvider: vi.fn< (providerId: string, config?: OpenClawConfig) => ImageGenerationProvider | undefined @@ -56,6 +57,7 @@ resolveAgentModelPrimaryValue: vi.fn<(value: unknown) => string | undefined>(() => undefined), resolveProviderAuthEnvVarCandidates: vi.fn(() => ({})), debug, + warn, }; });