From a932a58e8785fd7b885d2afbdad034ad95e082c7 Mon Sep 17 00:00:00 2001 From: Shivanker Goel Date: Sun, 26 Apr 2026 02:30:23 +0100 Subject: [PATCH] feat(fal): support Seedance reference video Adds fal Seedance 2.0 reference-to-video support with model-aware reference input limits. --- CHANGELOG.md | 3 + docs/plugins/sdk-provider-plugins.md | 8 +- docs/providers/fal.md | 29 ++- docs/tools/video-generation.md | 41 +-- .../fal/video-generation-provider.test.ts | 238 +++++++++++++++++- extensions/fal/video-generation-provider.ts | 202 ++++++++++++--- src/agents/tools/media-tool-shared.test.ts | 2 +- src/agents/tools/video-generate-tool.ts | 1 + src/plugin-sdk/video-generation.ts | 3 + src/video-generation/capabilities.test.ts | 81 ++++++ src/video-generation/capabilities.ts | 47 +++- src/video-generation/duration-support.ts | 1 + src/video-generation/live-test-helpers.ts | 3 + src/video-generation/normalization.ts | 1 + src/video-generation/runtime.test.ts | 55 ++++ src/video-generation/runtime.ts | 3 + src/video-generation/types.ts | 3 + .../provider-capability-assertions.ts | 24 +- 18 files changed, 675 insertions(+), 70 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 83e157d73df..2b1ef914f1d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -58,6 +58,9 @@ Docs: https://docs.openclaw.ai - Providers/Volcengine: add Volcengine/BytePlus Seed Speech as a bundled TTS provider with API-key auth, native Ogg/Opus voice-note output, and MP3 audio-file output. (#55641) Thanks @xuruiray. - Android/Talk Mode: expose Talk Mode in the Voice tab with runtime-owned voice capture modes and microphone foreground-service escalation. Thanks @alex-latitude. - Providers/LiteLLM: register `litellm` as an image-generation provider so `image_generate model=litellm/...` calls and `agents.defaults.imageGenerationModel.fallbacks` entries resolve through the LiteLLM proxy. Thanks @zqchris. 
+- Providers/fal: add Seedance 2.0 reference-to-video models with multi-image, + video, and audio reference input mapping plus model-specific capability limits + for `video_generate`. Thanks @shivanker. - Codex harness: require Codex app-server `0.125.0` or newer and cover native MCP `PreToolUse`, `PostToolUse`, and `PermissionRequest` payloads through the OpenClaw hook relay. - Agents/Codex: teach prompts and `agents_list` to surface native Codex app-server availability so agents prefer `/codex ...` over Codex ACP unless ACP/acpx is explicit. Thanks @vincentkoc. - ACPX/Droid: add Factory Droid to the live ACP bind Docker matrix, including diff --git a/docs/plugins/sdk-provider-plugins.md b/docs/plugins/sdk-provider-plugins.md index 574fb5e5dff..c88761bd3ca 100644 --- a/docs/plugins/sdk-provider-plugins.md +++ b/docs/plugins/sdk-provider-plugins.md @@ -626,7 +626,13 @@ API key auth, and dynamic model resolution. label: "Acme Video", capabilities: { generate: { maxVideos: 1, maxDurationSeconds: 10, supportsResolution: true }, - imageToVideo: { enabled: true, maxVideos: 1, maxInputImages: 1, maxDurationSeconds: 5 }, + imageToVideo: { + enabled: true, + maxVideos: 1, + maxInputImages: 1, + maxInputImagesByModel: { "acme/reference-to-video": 9 }, + maxDurationSeconds: 5, + }, videoToVideo: { enabled: false }, }, generateVideo: async (req) => ({ videos: [] }), diff --git a/docs/providers/fal.md b/docs/providers/fal.md index 8c5db1e1b45..3a444f05cbd 100644 --- a/docs/providers/fal.md +++ b/docs/providers/fal.md @@ -79,10 +79,10 @@ To use fal as the default image provider: The bundled `fal` video-generation provider defaults to `fal/fal-ai/minimax/video-01-live`. 
-| Capability | Value | -| ---------- | ------------------------------------------------------------ | -| Modes | Text-to-video, single-image reference | -| Runtime | Queue-backed submit/status/result flow for long-running jobs | +| Capability | Value | +| ---------- | ------------------------------------------------------------------ | +| Modes | Text-to-video, single-image reference, Seedance reference-to-video | +| Runtime | Queue-backed submit/status/result flow for long-running jobs | @@ -94,8 +94,10 @@ The bundled `fal` video-generation provider defaults to - `fal/bytedance/seedance-2.0/fast/text-to-video` - `fal/bytedance/seedance-2.0/fast/image-to-video` + - `fal/bytedance/seedance-2.0/fast/reference-to-video` - `fal/bytedance/seedance-2.0/text-to-video` - `fal/bytedance/seedance-2.0/image-to-video` + - `fal/bytedance/seedance-2.0/reference-to-video` @@ -113,6 +115,25 @@ The bundled `fal` video-generation provider defaults to ``` + + ```json5 + { + agents: { + defaults: { + videoGenerationModel: { + primary: "fal/bytedance/seedance-2.0/fast/reference-to-video", + }, + }, + }, + } + ``` + + Reference-to-video accepts up to 9 images, 3 videos, and 3 audio references + through the shared `video_generate` `images`, `videos`, and `audioRefs` + parameters, with at most 12 total reference files. 
+ + + ```json5 { diff --git a/docs/tools/video-generation.md b/docs/tools/video-generation.md index 8ca3345c839..1871a5ceb14 100644 --- a/docs/tools/video-generation.md +++ b/docs/tools/video-generation.md @@ -82,22 +82,22 @@ Duplicate prevention: if a video task is already `queued` or `running` for the c ## Supported providers -| Provider | Default model | Text | Image ref | Video ref | API key | -| --------------------- | ------------------------------- | ---- | ---------------------------------------------------- | ---------------- | ---------------------------------------- | -| Alibaba | `wan2.6-t2v` | Yes | Yes (remote URL) | Yes (remote URL) | `MODELSTUDIO_API_KEY` | -| BytePlus (1.0) | `seedance-1-0-pro-250528` | Yes | Up to 2 images (I2V models only; first + last frame) | No | `BYTEPLUS_API_KEY` | -| BytePlus Seedance 1.5 | `seedance-1-5-pro-251215` | Yes | Up to 2 images (first + last frame via role) | No | `BYTEPLUS_API_KEY` | -| BytePlus Seedance 2.0 | `dreamina-seedance-2-0-260128` | Yes | Up to 9 reference images | Up to 3 videos | `BYTEPLUS_API_KEY` | -| ComfyUI | `workflow` | Yes | 1 image | No | `COMFY_API_KEY` or `COMFY_CLOUD_API_KEY` | -| fal | `fal-ai/minimax/video-01-live` | Yes | 1 image | No | `FAL_KEY` | -| Google | `veo-3.1-fast-generate-preview` | Yes | 1 image | 1 video | `GEMINI_API_KEY` | -| MiniMax | `MiniMax-Hailuo-2.3` | Yes | 1 image | No | `MINIMAX_API_KEY` or MiniMax OAuth | -| OpenAI | `sora-2` | Yes | 1 image | 1 video | `OPENAI_API_KEY` | -| Qwen | `wan2.6-t2v` | Yes | Yes (remote URL) | Yes (remote URL) | `QWEN_API_KEY` | -| Runway | `gen4.5` | Yes | 1 image | 1 video | `RUNWAYML_API_SECRET` | -| Together | `Wan-AI/Wan2.2-T2V-A14B` | Yes | 1 image | No | `TOGETHER_API_KEY` | -| Vydra | `veo3` | Yes | 1 image (`kling`) | No | `VYDRA_API_KEY` | -| xAI | `grok-imagine-video` | Yes | 1 first-frame image or up to 7 `reference_image`s | 1 video | `XAI_API_KEY` | +| Provider | Default model | Text | Image ref | Video ref | API key | 
+| --------------------- | ------------------------------- | ---- | ---------------------------------------------------- | ----------------------------------------------- | ---------------------------------------- | +| Alibaba | `wan2.6-t2v` | Yes | Yes (remote URL) | Yes (remote URL) | `MODELSTUDIO_API_KEY` | +| BytePlus (1.0) | `seedance-1-0-pro-250528` | Yes | Up to 2 images (I2V models only; first + last frame) | No | `BYTEPLUS_API_KEY` | +| BytePlus Seedance 1.5 | `seedance-1-5-pro-251215` | Yes | Up to 2 images (first + last frame via role) | No | `BYTEPLUS_API_KEY` | +| BytePlus Seedance 2.0 | `dreamina-seedance-2-0-260128` | Yes | Up to 9 reference images | Up to 3 videos | `BYTEPLUS_API_KEY` | +| ComfyUI | `workflow` | Yes | 1 image | No | `COMFY_API_KEY` or `COMFY_CLOUD_API_KEY` | +| fal | `fal-ai/minimax/video-01-live` | Yes | 1 image; up to 9 with Seedance reference-to-video | Up to 3 videos with Seedance reference-to-video | `FAL_KEY` | +| Google | `veo-3.1-fast-generate-preview` | Yes | 1 image | 1 video | `GEMINI_API_KEY` | +| MiniMax | `MiniMax-Hailuo-2.3` | Yes | 1 image | No | `MINIMAX_API_KEY` or MiniMax OAuth | +| OpenAI | `sora-2` | Yes | 1 image | 1 video | `OPENAI_API_KEY` | +| Qwen | `wan2.6-t2v` | Yes | Yes (remote URL) | Yes (remote URL) | `QWEN_API_KEY` | +| Runway | `gen4.5` | Yes | 1 image | 1 video | `RUNWAYML_API_SECRET` | +| Together | `Wan-AI/Wan2.2-T2V-A14B` | Yes | 1 image | No | `TOGETHER_API_KEY` | +| Vydra | `veo3` | Yes | 1 image (`kling`) | No | `VYDRA_API_KEY` | +| xAI | `grok-imagine-video` | Yes | 1 first-frame image or up to 7 `reference_image`s | 1 video | `XAI_API_KEY` | Some providers accept additional or alternate API key env vars. See individual [provider pages](#related) for details. @@ -114,7 +114,7 @@ and the shared live sweep. 
| Alibaba | Yes | Yes | Yes | `generate`, `imageToVideo`; `videoToVideo` skipped because this provider needs remote `http(s)` video URLs | | BytePlus | Yes | Yes | No | `generate`, `imageToVideo` | | ComfyUI | Yes | Yes | No | Not in the shared sweep; workflow-specific coverage lives with Comfy tests | -| fal | Yes | Yes | No | `generate`, `imageToVideo` | +| fal | Yes | Yes | Yes | `generate`, `imageToVideo`; `videoToVideo` only when using Seedance reference-to-video | | Google | Yes | Yes | Yes | `generate`, `imageToVideo`; shared `videoToVideo` skipped because the current buffer-backed Gemini/Veo sweep does not accept that input | | MiniMax | Yes | Yes | No | `generate`, `imageToVideo` | | OpenAI | Yes | Yes | Yes | `generate`, `imageToVideo`; shared `videoToVideo` skipped because this org/input path currently needs provider-side inpaint/remix access | @@ -296,7 +296,7 @@ entries. - Uses a queue-backed flow for long-running jobs. Single image reference only. + Uses a queue-backed flow for long-running jobs. Most fal video models accept a single image reference. Seedance 2.0 reference-to-video models accept up to 9 images, 3 videos, and 3 audio references, with at most 12 total reference files. @@ -349,6 +349,7 @@ capabilities: { enabled: true, maxVideos: 1, maxInputImages: 1, + maxInputImagesByModel: { "provider/reference-to-video": 9 }, maxDurationSeconds: 5, }, videoToVideo: { @@ -366,6 +367,10 @@ enough to advertise transform-mode support. Providers should declare contract tests, and the shared `video_generate` tool can validate mode support deterministically. +When one model in a provider has wider reference-input support than the rest, +use `maxInputImagesByModel`, `maxInputVideosByModel`, or +`maxInputAudiosByModel` instead of raising the mode-wide limit. 
+ ## Live tests Opt-in live coverage for the shared bundled providers: diff --git a/extensions/fal/video-generation-provider.test.ts b/extensions/fal/video-generation-provider.test.ts index 71ddc8ab701..f0ac363ed64 100644 --- a/extensions/fal/video-generation-provider.test.ts +++ b/extensions/fal/video-generation-provider.test.ts @@ -81,6 +81,13 @@ describe("fal video generation provider", () => { .mockResolvedValueOnce(releasedVideo({ contentType: "video/mp4", bytes: params.bytes })); } + function getSubmitBody(): Record { + return JSON.parse(String(fetchGuardMock.mock.calls[0]?.[0]?.init?.body ?? "{}")) as Record< + string, + unknown + >; + } + afterEach(() => { vi.restoreAllMocks(); fetchGuardMock.mockReset(); @@ -88,7 +95,21 @@ describe("fal video generation provider", () => { }); it("declares explicit mode capabilities", () => { - expectExplicitVideoGenerationCapabilities(buildFalVideoGenerationProvider()); + const provider = buildFalVideoGenerationProvider(); + expectExplicitVideoGenerationCapabilities(provider); + expect(provider.capabilities.imageToVideo?.maxInputImages).toBe(1); + expect( + provider.capabilities.imageToVideo?.maxInputImagesByModel?.[ + "bytedance/seedance-2.0/fast/reference-to-video" + ], + ).toBe(9); + expect(provider.capabilities.videoToVideo?.maxInputVideos).toBe(0); + expect( + Object.keys(provider.capabilities.videoToVideo?.supportedDurationSecondsByModel ?? 
{}), + ).toEqual([ + "bytedance/seedance-2.0/fast/reference-to-video", + "bytedance/seedance-2.0/reference-to-video", + ]); }); it("submits fal video jobs through the queue API and downloads the completed result", async () => { @@ -152,8 +173,10 @@ describe("fal video generation provider", () => { "fal-ai/heygen/v2/video-agent", "bytedance/seedance-2.0/fast/text-to-video", "bytedance/seedance-2.0/fast/image-to-video", + "bytedance/seedance-2.0/fast/reference-to-video", "bytedance/seedance-2.0/text-to-video", "bytedance/seedance-2.0/image-to-video", + "bytedance/seedance-2.0/reference-to-video", ]), ); }); @@ -187,10 +210,7 @@ describe("fal video generation provider", () => { url: "https://queue.fal.run/fal-ai/heygen/v2/video-agent", }), ); - const submitBody = JSON.parse( - String(fetchGuardMock.mock.calls[0]?.[0]?.init?.body ?? "{}"), - ) as Record; - expect(submitBody).toEqual({ + expect(getSubmitBody()).toEqual({ prompt: "A founder explains OpenClaw in a concise studio video", }); expect(result.metadata).toEqual({ @@ -229,10 +249,7 @@ describe("fal video generation provider", () => { url: "https://queue.fal.run/bytedance/seedance-2.0/fast/text-to-video", }), ); - const submitBody = JSON.parse( - String(fetchGuardMock.mock.calls[0]?.[0]?.init?.body ?? 
"{}"), - ) as Record; - expect(submitBody).toEqual({ + expect(getSubmitBody()).toEqual({ prompt: "A chrome lobster drives a tiny kart across a neon pier", aspect_ratio: "16:9", resolution: "720p", @@ -244,4 +261,207 @@ describe("fal video generation provider", () => { seed: 42, }); }); + + it("submits Seedance 2 image-to-video requests with a single image_url", async () => { + mockFalProviderRuntime(); + mockCompletedFalVideoJob({ + requestId: "seedance-i2v-req-123", + statusUrl: + "https://queue.fal.run/bytedance/seedance-2.0/fast/image-to-video/requests/seedance-i2v-req-123/status", + responseUrl: + "https://queue.fal.run/bytedance/seedance-2.0/fast/image-to-video/requests/seedance-i2v-req-123", + videoUrl: "https://fal.run/files/seedance-i2v.mp4", + bytes: "seedance-i2v-mp4-bytes", + }); + + const provider = buildFalVideoGenerationProvider(); + await provider.generateVideo({ + provider: "fal", + model: "bytedance/seedance-2.0/fast/image-to-video", + prompt: "Animate this product still with a slow orbit", + durationSeconds: 6, + inputImages: [{ url: "https://example.com/start-frame.png" }], + cfg: {}, + }); + + expect(getSubmitBody()).toEqual({ + prompt: "Animate this product still with a slow orbit", + image_url: "https://example.com/start-frame.png", + duration: "6", + }); + }); + + it("submits Seedance 2 reference-to-video requests with image, video, and audio URLs", async () => { + mockFalProviderRuntime(); + mockCompletedFalVideoJob({ + requestId: "seedance-ref-req-123", + statusUrl: + "https://queue.fal.run/bytedance/seedance-2.0/fast/reference-to-video/requests/seedance-ref-req-123/status", + responseUrl: + "https://queue.fal.run/bytedance/seedance-2.0/fast/reference-to-video/requests/seedance-ref-req-123", + videoUrl: "https://fal.run/files/seedance-ref.mp4", + bytes: "seedance-ref-mp4-bytes", + responseExtras: { seed: 1234 }, + }); + + const provider = buildFalVideoGenerationProvider(); + const result = await provider.generateVideo({ + provider: "fal", + 
model: "bytedance/seedance-2.0/fast/reference-to-video", + prompt: "Blend @Image1, @Image2, @Video1, @Video2, and @Audio1 into one short film", + durationSeconds: 8, + aspectRatio: "9:16", + resolution: "480P", + audio: false, + inputImages: [ + { url: "https://example.com/reference-1.png" }, + { buffer: Buffer.from("local-image"), mimeType: "image/webp" }, + ], + inputVideos: [ + { url: "https://example.com/reference-1.mp4" }, + { buffer: Buffer.from("local-video"), mimeType: "video/quicktime" }, + ], + inputAudios: [ + { url: "https://example.com/reference-1.mp3" }, + { buffer: Buffer.from("local-audio"), mimeType: "audio/wav" }, + ], + cfg: {}, + }); + + expect(fetchGuardMock).toHaveBeenNthCalledWith( + 1, + expect.objectContaining({ + url: "https://queue.fal.run/bytedance/seedance-2.0/fast/reference-to-video", + }), + ); + expect(getSubmitBody()).toEqual({ + prompt: "Blend @Image1, @Image2, @Video1, @Video2, and @Audio1 into one short film", + image_urls: [ + "https://example.com/reference-1.png", + `data:image/webp;base64,${Buffer.from("local-image").toString("base64")}`, + ], + video_urls: [ + "https://example.com/reference-1.mp4", + `data:video/quicktime;base64,${Buffer.from("local-video").toString("base64")}`, + ], + audio_urls: [ + "https://example.com/reference-1.mp3", + `data:audio/wav;base64,${Buffer.from("local-audio").toString("base64")}`, + ], + aspect_ratio: "9:16", + resolution: "480p", + duration: "8", + generate_audio: false, + }); + expect(result.metadata).toEqual({ + requestId: "seedance-ref-req-123", + seed: 1234, + }); + }); + + it("rejects video, audio, and multiple image references for non-reference fal models", async () => { + const provider = buildFalVideoGenerationProvider(); + + await expect( + provider.generateVideo({ + provider: "fal", + model: "fal-ai/minimax/video-01-live", + prompt: "Animate this", + inputImages: [ + { url: "https://example.com/one.png" }, + { url: "https://example.com/two.png" }, + ], + cfg: {}, + }), + 
).rejects.toThrow("fal video generation supports at most one image reference."); + + await expect( + provider.generateVideo({ + provider: "fal", + model: "fal-ai/minimax/video-01-live", + prompt: "Animate this", + inputVideos: [{ url: "https://example.com/reference.mp4" }], + cfg: {}, + }), + ).rejects.toThrow("fal video generation does not support video reference inputs."); + + await expect( + provider.generateVideo({ + provider: "fal", + model: "fal-ai/minimax/video-01-live", + prompt: "Animate this", + inputAudios: [{ url: "https://example.com/reference.mp3" }], + cfg: {}, + }), + ).rejects.toThrow("fal video generation does not support audio reference inputs."); + }); + + it("rejects over-limit and audio-only Seedance reference-to-video requests", async () => { + const provider = buildFalVideoGenerationProvider(); + const model = "bytedance/seedance-2.0/fast/reference-to-video"; + + await expect( + provider.generateVideo({ + provider: "fal", + model, + prompt: "Too many images", + inputImages: Array.from({ length: 10 }, (_, index) => ({ + url: `https://example.com/image-${index}.png`, + })), + cfg: {}, + }), + ).rejects.toThrow("fal Seedance reference-to-video supports at most 9 reference images."); + + await expect( + provider.generateVideo({ + provider: "fal", + model, + prompt: "Too many videos", + inputVideos: Array.from({ length: 4 }, (_, index) => ({ + url: `https://example.com/video-${index}.mp4`, + })), + cfg: {}, + }), + ).rejects.toThrow("fal Seedance reference-to-video supports at most 3 reference videos."); + + await expect( + provider.generateVideo({ + provider: "fal", + model, + prompt: "Too many audios", + inputAudios: Array.from({ length: 4 }, (_, index) => ({ + url: `https://example.com/audio-${index}.mp3`, + })), + cfg: {}, + }), + ).rejects.toThrow("fal Seedance reference-to-video supports at most 3 reference audios."); + + await expect( + provider.generateVideo({ + provider: "fal", + model, + prompt: "Too many total files", + inputImages: 
Array.from({ length: 9 }, (_, index) => ({ + url: `https://example.com/image-${index}.png`, + })), + inputVideos: Array.from({ length: 3 }, (_, index) => ({ + url: `https://example.com/video-${index}.mp4`, + })), + inputAudios: [{ url: "https://example.com/audio.mp3" }], + cfg: {}, + }), + ).rejects.toThrow("fal Seedance reference-to-video supports at most 12 total reference files."); + + await expect( + provider.generateVideo({ + provider: "fal", + model, + prompt: "Audio only", + inputAudios: [{ url: "https://example.com/audio.mp3" }], + cfg: {}, + }), + ).rejects.toThrow( + "fal Seedance reference-to-video requires at least one image or video reference when audio references are provided.", + ); + }); }); diff --git a/extensions/fal/video-generation-provider.ts b/extensions/fal/video-generation-provider.ts index ce4d145b201..0bb0e80c2c9 100644 --- a/extensions/fal/video-generation-provider.ts +++ b/extensions/fal/video-generation-provider.ts @@ -23,13 +23,34 @@ const DEFAULT_FAL_BASE_URL = "https://fal.run"; const DEFAULT_FAL_QUEUE_BASE_URL = "https://queue.fal.run"; const DEFAULT_FAL_VIDEO_MODEL = "fal-ai/minimax/video-01-live"; const HEYGEN_VIDEO_AGENT_MODEL = "fal-ai/heygen/v2/video-agent"; -const SEEDANCE_2_VIDEO_MODELS = [ +const SEEDANCE_2_TEXT_IMAGE_VIDEO_MODELS = [ "bytedance/seedance-2.0/fast/text-to-video", "bytedance/seedance-2.0/fast/image-to-video", "bytedance/seedance-2.0/text-to-video", "bytedance/seedance-2.0/image-to-video", ] as const; +const SEEDANCE_2_REFERENCE_VIDEO_MODELS = [ + "bytedance/seedance-2.0/fast/reference-to-video", + "bytedance/seedance-2.0/reference-to-video", +] as const; +const SEEDANCE_2_VIDEO_MODELS = [ + ...SEEDANCE_2_TEXT_IMAGE_VIDEO_MODELS, + ...SEEDANCE_2_REFERENCE_VIDEO_MODELS, +] as const; const SEEDANCE_2_DURATION_SECONDS = [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] as const; +const SEEDANCE_REFERENCE_MAX_IMAGES = 9; +const SEEDANCE_REFERENCE_MAX_VIDEOS = 3; +const SEEDANCE_REFERENCE_MAX_AUDIOS = 3; +const 
SEEDANCE_REFERENCE_MAX_FILES = 12; +const SEEDANCE_REFERENCE_MAX_IMAGES_BY_MODEL = Object.fromEntries( + SEEDANCE_2_REFERENCE_VIDEO_MODELS.map((model) => [model, SEEDANCE_REFERENCE_MAX_IMAGES]), +); +const SEEDANCE_REFERENCE_MAX_VIDEOS_BY_MODEL = Object.fromEntries( + SEEDANCE_2_REFERENCE_VIDEO_MODELS.map((model) => [model, SEEDANCE_REFERENCE_MAX_VIDEOS]), +); +const SEEDANCE_REFERENCE_MAX_AUDIOS_BY_MODEL = Object.fromEntries( + SEEDANCE_2_REFERENCE_VIDEO_MODELS.map((model) => [model, SEEDANCE_REFERENCE_MAX_AUDIOS]), +); const DEFAULT_HTTP_TIMEOUT_MS = 30_000; const DEFAULT_OPERATION_TIMEOUT_MS = 600_000; const POLL_INTERVAL_MS = 5_000; @@ -128,6 +149,12 @@ function isFalSeedance2Model(model: string): boolean { return SEEDANCE_2_VIDEO_MODELS.includes(model as (typeof SEEDANCE_2_VIDEO_MODELS)[number]); } +function isFalSeedance2ReferenceModel(model: string): boolean { + return SEEDANCE_2_REFERENCE_VIDEO_MODELS.includes( + model as (typeof SEEDANCE_2_REFERENCE_VIDEO_MODELS)[number], + ); +} + function isFalHeyGenVideoAgentModel(model: string): boolean { return normalizeLowercaseStringOrEmpty(model) === HEYGEN_VIDEO_AGENT_MODEL; } @@ -156,6 +183,55 @@ function resolveFalDuration( return duration; } +function resolveFalReferenceUrl( + asset: NonNullable[number] | undefined, + defaultMimeType: string, + label: string, +): string { + const assetUrl = normalizeOptionalString(asset?.url); + if (assetUrl) { + return assetUrl; + } + if (!asset?.buffer) { + throw new Error(`fal ${label} is missing media data.`); + } + return toDataUrl(asset.buffer, normalizeOptionalString(asset.mimeType) ?? defaultMimeType); +} + +function resolveFalReferenceUrls( + assets: VideoGenerationRequest["inputImages"], + defaultMimeType: string, + label: string, +): string[] { + return (assets ?? 
[]).map((asset) => resolveFalReferenceUrl(asset, defaultMimeType, label)); +} + +function applyFalSeedanceControls(params: { + req: VideoGenerationRequest; + model: string; + body: Record; +}): void { + const aspectRatio = normalizeOptionalString(params.req.aspectRatio); + if (aspectRatio) { + params.body.aspect_ratio = aspectRatio; + } + const size = normalizeOptionalString(params.req.size); + if (size) { + params.body.size = size; + } + const resolution = resolveFalResolution(params.req.resolution, params.model); + if (resolution) { + params.body.resolution = resolution; + } + const duration = resolveFalDuration(params.req.durationSeconds, params.model); + if (duration) { + params.body.duration = duration; + } + if (isFalSeedance2Model(params.model) && typeof params.req.audio === "boolean") { + params.body.generate_audio = params.req.audio; + } +} + function buildFalVideoRequestBody(params: { req: VideoGenerationRequest; model: string; @@ -163,6 +239,36 @@ function buildFalVideoRequestBody(params: { const requestBody: Record = { prompt: params.req.prompt, }; + + if (isFalSeedance2ReferenceModel(params.model)) { + const imageUrls = resolveFalReferenceUrls( + params.req.inputImages, + "image/png", + "reference image", + ); + const videoUrls = resolveFalReferenceUrls( + params.req.inputVideos, + "video/mp4", + "reference video", + ); + const audioUrls = resolveFalReferenceUrls( + params.req.inputAudios, + "audio/mpeg", + "reference audio", + ); + if (imageUrls.length > 0) { + requestBody.image_urls = imageUrls; + } + if (videoUrls.length > 0) { + requestBody.video_urls = videoUrls; + } + if (audioUrls.length > 0) { + requestBody.audio_urls = audioUrls; + } + applyFalSeedanceControls({ req: params.req, model: params.model, body: requestBody }); + return requestBody; + } + const input = params.req.inputImages?.[0]; if (input) { requestBody.image_url = normalizeOptionalString(input.url) @@ -177,28 +283,58 @@ function buildFalVideoRequestBody(params: { if 
(isFalMiniMaxLiveModel(params.model) || isFalHeyGenVideoAgentModel(params.model)) { return requestBody; } - const aspectRatio = normalizeOptionalString(params.req.aspectRatio); - if (aspectRatio) { - requestBody.aspect_ratio = aspectRatio; - } - const size = normalizeOptionalString(params.req.size); - if (size) { - requestBody.size = size; - } - const resolution = resolveFalResolution(params.req.resolution, params.model); - if (resolution) { - requestBody.resolution = resolution; - } - const duration = resolveFalDuration(params.req.durationSeconds, params.model); - if (duration) { - requestBody.duration = duration; - } - if (isFalSeedance2Model(params.model) && typeof params.req.audio === "boolean") { - requestBody.generate_audio = params.req.audio; - } + applyFalSeedanceControls({ req: params.req, model: params.model, body: requestBody }); return requestBody; } +function validateFalVideoReferenceInputs(params: { + req: VideoGenerationRequest; + model: string; +}): void { + const imageCount = params.req.inputImages?.length ?? 0; + const videoCount = params.req.inputVideos?.length ?? 0; + const audioCount = params.req.inputAudios?.length ?? 
0; + if (isFalSeedance2ReferenceModel(params.model)) { + if (imageCount > SEEDANCE_REFERENCE_MAX_IMAGES) { + throw new Error( + `fal Seedance reference-to-video supports at most ${SEEDANCE_REFERENCE_MAX_IMAGES} reference images.`, + ); + } + if (videoCount > SEEDANCE_REFERENCE_MAX_VIDEOS) { + throw new Error( + `fal Seedance reference-to-video supports at most ${SEEDANCE_REFERENCE_MAX_VIDEOS} reference videos.`, + ); + } + if (audioCount > SEEDANCE_REFERENCE_MAX_AUDIOS) { + throw new Error( + `fal Seedance reference-to-video supports at most ${SEEDANCE_REFERENCE_MAX_AUDIOS} reference audios.`, + ); + } + const totalFiles = imageCount + videoCount + audioCount; + if (totalFiles > SEEDANCE_REFERENCE_MAX_FILES) { + throw new Error( + `fal Seedance reference-to-video supports at most ${SEEDANCE_REFERENCE_MAX_FILES} total reference files.`, + ); + } + if (audioCount > 0 && imageCount === 0 && videoCount === 0) { + throw new Error( + "fal Seedance reference-to-video requires at least one image or video reference when audio references are provided.", + ); + } + return; + } + + if (videoCount > 0) { + throw new Error("fal video generation does not support video reference inputs."); + } + if (audioCount > 0) { + throw new Error("fal video generation does not support audio reference inputs."); + } + if (imageCount > 1) { + throw new Error("fal video generation supports at most one image reference."); + } +} + async function fetchFalJson(params: { url: string; init?: RequestInit; @@ -317,6 +453,8 @@ export function buildFalVideoGenerationProvider(): VideoGenerationProvider { enabled: true, maxVideos: 1, maxInputImages: 1, + maxInputImagesByModel: SEEDANCE_REFERENCE_MAX_IMAGES_BY_MODEL, + maxInputAudiosByModel: SEEDANCE_REFERENCE_MAX_AUDIOS_BY_MODEL, supportedDurationSecondsByModel: Object.fromEntries( SEEDANCE_2_VIDEO_MODELS.map((model) => [model, SEEDANCE_2_DURATION_SECONDS]), ), @@ -326,16 +464,25 @@ export function buildFalVideoGenerationProvider(): VideoGenerationProvider 
{ supportsAudio: true, }, videoToVideo: { - enabled: false, + enabled: true, + maxVideos: 1, + maxInputImages: 0, + maxInputImagesByModel: SEEDANCE_REFERENCE_MAX_IMAGES_BY_MODEL, + maxInputVideos: 0, + maxInputVideosByModel: SEEDANCE_REFERENCE_MAX_VIDEOS_BY_MODEL, + maxInputAudiosByModel: SEEDANCE_REFERENCE_MAX_AUDIOS_BY_MODEL, + supportedDurationSecondsByModel: Object.fromEntries( + SEEDANCE_2_REFERENCE_VIDEO_MODELS.map((model) => [model, SEEDANCE_2_DURATION_SECONDS]), + ), + supportsAspectRatio: true, + supportsResolution: true, + supportsSize: true, + supportsAudio: true, }, }, async generateVideo(req) { - if ((req.inputVideos?.length ?? 0) > 0) { - throw new Error("fal video generation does not support video reference inputs."); - } - if ((req.inputImages?.length ?? 0) > 1) { - throw new Error("fal video generation supports at most one image reference."); - } + const model = normalizeOptionalString(req.model) || DEFAULT_FAL_VIDEO_MODEL; + validateFalVideoReferenceInputs({ req, model }); const auth = await resolveApiKeyForProvider({ provider: "fal", cfg: req.cfg, @@ -358,7 +505,6 @@ export function buildFalVideoGenerationProvider(): VideoGenerationProvider { capability: "video", transport: "http", }); - const model = normalizeOptionalString(req.model) || DEFAULT_FAL_VIDEO_MODEL; const requestBody = buildFalVideoRequestBody({ req, model }); const policy = buildPolicy(allowPrivateNetwork); const queueBaseUrl = resolveFalQueueBaseUrl(baseUrl); diff --git a/src/agents/tools/media-tool-shared.test.ts b/src/agents/tools/media-tool-shared.test.ts index 772ea09adad..0d229e6b54c 100644 --- a/src/agents/tools/media-tool-shared.test.ts +++ b/src/agents/tools/media-tool-shared.test.ts @@ -96,5 +96,5 @@ describe("resolveModelFromRegistry", () => { ["kimchi", "kimchi/claude-opus-4-6"], ]); expect(result).toBe(foundModel); - }); + }, 180_000); }); diff --git a/src/agents/tools/video-generate-tool.ts b/src/agents/tools/video-generate-tool.ts index 835c12d31b0..44115387d54 
100644 --- a/src/agents/tools/video-generate-tool.ts +++ b/src/agents/tools/video-generate-tool.ts @@ -350,6 +350,7 @@ function validateVideoGenerationCapabilities(params: { }); const { capabilities: caps } = resolveVideoGenerationModeCapabilities({ provider, + model: params.model, inputImageCount: params.inputImageCount, inputVideoCount: params.inputVideoCount, }); diff --git a/src/plugin-sdk/video-generation.ts b/src/plugin-sdk/video-generation.ts index 2ba2b091856..6223a27ef96 100644 --- a/src/plugin-sdk/video-generation.ts +++ b/src/plugin-sdk/video-generation.ts @@ -106,9 +106,12 @@ export type VideoGenerationProviderOptionType = "number" | "boolean" | "string"; export type VideoGenerationModeCapabilities = { maxVideos?: number; maxInputImages?: number; + maxInputImagesByModel?: Readonly>; maxInputVideos?: number; + maxInputVideosByModel?: Readonly>; /** Max number of reference audio assets the provider accepts (e.g. background music, voice reference). */ maxInputAudios?: number; + maxInputAudiosByModel?: Readonly>; maxDurationSeconds?: number; supportedDurationSeconds?: readonly number[]; supportedDurationSecondsByModel?: Readonly>; diff --git a/src/video-generation/capabilities.test.ts b/src/video-generation/capabilities.test.ts index dab2e2f16fb..926959bb0ba 100644 --- a/src/video-generation/capabilities.test.ts +++ b/src/video-generation/capabilities.test.ts @@ -75,4 +75,85 @@ describe("video-generation capabilities", () => { capabilities: undefined, }); }); + + it("uses explicit video-to-video capabilities for mixed reference requests", () => { + const provider = createProvider({ + imageToVideo: { + enabled: true, + maxInputImages: 2, + }, + videoToVideo: { + enabled: true, + maxInputImages: 2, + maxInputVideos: 3, + maxInputAudios: 1, + }, + }); + + expect(resolveVideoGenerationMode({ inputImageCount: 1, inputVideoCount: 1 })).toBeNull(); + expect( + resolveVideoGenerationModeCapabilities({ + provider, + inputImageCount: 1, + inputVideoCount: 1, + }), + 
).toEqual({ + mode: null, + capabilities: { + enabled: true, + maxInputImages: 2, + maxInputVideos: 3, + maxInputAudios: 1, + }, + }); + }); + + it("applies model-specific reference input limits", () => { + const provider = createProvider({ + imageToVideo: { + enabled: true, + maxInputImages: 1, + maxInputImagesByModel: { + "vendor/reference-to-video": 9, + }, + }, + videoToVideo: { + enabled: true, + maxInputImages: 0, + maxInputImagesByModel: { + "vendor/reference-to-video": 9, + }, + maxInputVideos: 0, + maxInputVideosByModel: { + "vendor/reference-to-video": 3, + }, + }, + }); + + expect( + resolveVideoGenerationModeCapabilities({ + provider, + model: "vendor/text-to-video", + inputImageCount: 2, + }).capabilities?.maxInputImages, + ).toBe(1); + expect( + resolveVideoGenerationModeCapabilities({ + provider, + model: "vendor/reference-to-video", + inputImageCount: 2, + }).capabilities?.maxInputImages, + ).toBe(9); + expect( + resolveVideoGenerationModeCapabilities({ + provider, + model: "vendor/reference-to-video", + inputImageCount: 1, + inputVideoCount: 1, + }).capabilities, + ).toMatchObject({ + maxInputImages: 9, + maxInputVideos: 3, + }); + }); }); diff --git a/src/video-generation/capabilities.ts b/src/video-generation/capabilities.ts index d2d789700c7..37451252b2e 100644 --- a/src/video-generation/capabilities.ts +++ b/src/video-generation/capabilities.ts @@ -40,33 +40,74 @@ export function listSupportedVideoGenerationModes( export function resolveVideoGenerationModeCapabilities(params: { provider?: Pick<VideoGenerationProvider, "capabilities">; + model?: string; inputImageCount?: number; inputVideoCount?: number; }): { mode: VideoGenerationMode | null; capabilities: VideoGenerationModeCapabilities | VideoGenerationTransformCapabilities | undefined; } { + const inputImageCount = params.inputImageCount ?? 0; + const inputVideoCount = params.inputVideoCount ?? 
0; const mode = resolveVideoGenerationMode(params); const capabilities = params.provider?.capabilities; + const withModelLimits = < + T extends VideoGenerationModeCapabilities | VideoGenerationTransformCapabilities | undefined, + >( + caps: T, + ): T => { + const model = params.model?.trim(); + if (!caps || !model) { + return caps; + } + const maxInputImages = caps.maxInputImagesByModel?.[model]; + const maxInputVideos = caps.maxInputVideosByModel?.[model]; + const maxInputAudios = caps.maxInputAudiosByModel?.[model]; + if ( + typeof maxInputImages !== "number" && + typeof maxInputVideos !== "number" && + typeof maxInputAudios !== "number" + ) { + return caps; + } + return { + ...caps, + ...(typeof maxInputImages === "number" ? { maxInputImages } : {}), + ...(typeof maxInputVideos === "number" ? { maxInputVideos } : {}), + ...(typeof maxInputAudios === "number" ? { maxInputAudios } : {}), + }; + }; if (!capabilities) { return { mode, capabilities: undefined }; } if (mode === "generate") { return { mode, - capabilities: capabilities.generate, + capabilities: withModelLimits(capabilities.generate), }; } if (mode === "imageToVideo") { return { mode, - capabilities: capabilities.imageToVideo, + capabilities: withModelLimits(capabilities.imageToVideo), }; } if (mode === "videoToVideo") { return { mode, - capabilities: capabilities.videoToVideo, + capabilities: withModelLimits(capabilities.videoToVideo), + }; + } + const videoToVideoCapabilities = withModelLimits(capabilities.videoToVideo); + if ( + inputImageCount > 0 && + inputVideoCount > 0 && + videoToVideoCapabilities?.enabled && + (videoToVideoCapabilities.maxInputImages ?? 
0) > 0 + ) { + return { + mode, + capabilities: videoToVideoCapabilities, }; } return { diff --git a/src/video-generation/duration-support.ts b/src/video-generation/duration-support.ts index 28f21e8020c..6bdae9774ae 100644 --- a/src/video-generation/duration-support.ts +++ b/src/video-generation/duration-support.ts @@ -23,6 +23,7 @@ export function resolveVideoGenerationSupportedDurations(params: { }): number[] | undefined { const { capabilities: caps } = resolveVideoGenerationModeCapabilities({ provider: params.provider, + model: params.model, inputImageCount: params.inputImageCount, inputVideoCount: params.inputVideoCount, }); diff --git a/src/video-generation/live-test-helpers.ts b/src/video-generation/live-test-helpers.ts index 3d783158aca..a04c1b3796d 100644 --- a/src/video-generation/live-test-helpers.ts +++ b/src/video-generation/live-test-helpers.ts @@ -55,6 +55,9 @@ export function canRunBufferBackedVideoToVideoLiveLane(params: { return false; } if (providerId !== "runway") { + if (providerId === "fal") { + return params.modelRef.includes("reference-to-video"); + } return true; } const slash = params.modelRef.indexOf("/"); diff --git a/src/video-generation/normalization.ts b/src/video-generation/normalization.ts index e8efbe626d6..b65fd7bd4f5 100644 --- a/src/video-generation/normalization.ts +++ b/src/video-generation/normalization.ts @@ -42,6 +42,7 @@ export function resolveVideoGenerationOverrides(params: { }): ResolvedVideoGenerationOverrides { const { capabilities: caps } = resolveVideoGenerationModeCapabilities({ provider: params.provider, + model: params.model, inputImageCount: params.inputImageCount, inputVideoCount: params.inputVideoCount, }); diff --git a/src/video-generation/runtime.test.ts b/src/video-generation/runtime.test.ts index 61366bbb46b..78292465bf9 100644 --- a/src/video-generation/runtime.test.ts +++ b/src/video-generation/runtime.test.ts @@ -405,6 +405,61 @@ describe("video-generation runtime", () => { 
expect(result.attempts[0]?.error).toMatch(/does not support reference audio inputs/); }); + it("forwards mixed image, video, and audio references when explicitly supported", async () => { + const seenRequest: { + inputImages?: unknown; + inputVideos?: unknown; + inputAudios?: unknown; + } = {}; + mocks.resolveAgentModelPrimaryValue.mockReturnValue( + "fal/bytedance/seedance-2.0/fast/reference-to-video", + ); + mocks.getVideoGenerationProvider.mockReturnValue({ + id: "fal", + capabilities: { + videoToVideo: { + enabled: true, + maxInputImages: 9, + maxInputVideos: 3, + maxInputAudios: 3, + }, + }, + async generateVideo(req) { + seenRequest.inputImages = req.inputImages; + seenRequest.inputVideos = req.inputVideos; + seenRequest.inputAudios = req.inputAudios; + return { + videos: [{ buffer: Buffer.from("mp4-bytes"), mimeType: "video/mp4" }], + model: "bytedance/seedance-2.0/fast/reference-to-video", + }; + }, + }); + + const result = await generateVideo({ + cfg: { + agents: { + defaults: { + videoGenerationModel: { + primary: "fal/bytedance/seedance-2.0/fast/reference-to-video", + }, + }, + }, + } as OpenClawConfig, + prompt: "Blend all references", + inputImages: [{ url: "https://example.com/reference.png" }], + inputVideos: [{ url: "https://example.com/reference.mp4" }], + inputAudios: [{ url: "https://example.com/reference.mp3" }], + }); + + expect(result.provider).toBe("fal"); + expect(result.attempts).toEqual([]); + expect(seenRequest).toEqual({ + inputImages: [{ url: "https://example.com/reference.png" }], + inputVideos: [{ url: "https://example.com/reference.mp4" }], + inputAudios: [{ url: "https://example.com/reference.mp3" }], + }); + }); + it("fails when every candidate is skipped for unsupported reference audio inputs", async () => { mocks.resolveAgentModelPrimaryValue.mockReturnValue("openai/sora-2"); mocks.getVideoGenerationProvider.mockReturnValue({ diff --git a/src/video-generation/runtime.ts b/src/video-generation/runtime.ts index 
fd6468746a4..192f2837636 100644 --- a/src/video-generation/runtime.ts +++ b/src/video-generation/runtime.ts @@ -136,6 +136,7 @@ export async function generateVideo( if (inputAudioCount > 0) { const { capabilities: candCaps } = resolveVideoGenerationModeCapabilities({ provider, + model: candidate.model, inputImageCount, inputVideoCount, }); @@ -171,6 +172,7 @@ export async function generateVideo( ) { const { capabilities: optCaps } = resolveVideoGenerationModeCapabilities({ provider, + model: candidate.model, inputImageCount, inputVideoCount, }); @@ -201,6 +203,7 @@ export async function generateVideo( if (typeof requestedDuration === "number" && Number.isFinite(requestedDuration)) { const { capabilities: durCaps } = resolveVideoGenerationModeCapabilities({ provider, + model: candidate.model, inputImageCount, inputVideoCount, }); diff --git a/src/video-generation/types.ts b/src/video-generation/types.ts index b21ae461dd0..e29e29e2cdc 100644 --- a/src/video-generation/types.ts +++ b/src/video-generation/types.ts @@ -100,9 +100,12 @@ export type VideoGenerationProviderOptionType = "number" | "boolean" | "string"; export type VideoGenerationModeCapabilities = { maxVideos?: number; maxInputImages?: number; + maxInputImagesByModel?: Readonly<Record<string, number>>; maxInputVideos?: number; + maxInputVideosByModel?: Readonly<Record<string, number>>; /** Max number of reference audio assets the provider accepts (e.g. background music, voice reference). 
*/ maxInputAudios?: number; + maxInputAudiosByModel?: Readonly<Record<string, number>>; maxDurationSeconds?: number; supportedDurationSeconds?: readonly number[]; supportedDurationSecondsByModel?: Readonly<Record<string, readonly number[]>>; diff --git a/test/helpers/media-generation/provider-capability-assertions.ts b/test/helpers/media-generation/provider-capability-assertions.ts index cf42203674a..b84493bcb84 100644 --- a/test/helpers/media-generation/provider-capability-assertions.ts +++ b/test/helpers/media-generation/provider-capability-assertions.ts @@ -6,6 +6,18 @@ import type { } from "../../../src/plugins/types.js"; import { listSupportedVideoGenerationModes } from "../../../src/video-generation/capabilities.js"; +function hasPositiveModeLimit( + value: number | undefined, + valuesByModel: Readonly<Record<string, number>> | undefined, +): boolean { + return ( + (value ?? 0) > 0 || + Object.values(valuesByModel ?? {}).some( + (modelValue) => Number.isFinite(modelValue) && modelValue > 0, + ) + ); +} + export function expectExplicitVideoGenerationCapabilities( provider: VideoGenerationProviderPlugin, ): void { @@ -28,16 +40,16 @@ export function expectExplicitVideoGenerationCapabilities( if (imageToVideo?.enabled) { expect( - imageToVideo.maxInputImages ?? 0, - `${provider.id} imageToVideo.enabled requires maxInputImages`, - ).toBeGreaterThan(0); + hasPositiveModeLimit(imageToVideo.maxInputImages, imageToVideo.maxInputImagesByModel), + `${provider.id} imageToVideo.enabled requires maxInputImages or maxInputImagesByModel`, + ).toBe(true); expect(supportedModes).toContain("imageToVideo"); } if (videoToVideo?.enabled) { expect( - videoToVideo.maxInputVideos ?? 0, - `${provider.id} videoToVideo.enabled requires maxInputVideos`, - ).toBeGreaterThan(0); + hasPositiveModeLimit(videoToVideo.maxInputVideos, videoToVideo.maxInputVideosByModel), + `${provider.id} videoToVideo.enabled requires maxInputVideos or maxInputVideosByModel`, + ).toBe(true); expect(supportedModes).toContain("videoToVideo"); } }