From a932a58e8785fd7b885d2afbdad034ad95e082c7 Mon Sep 17 00:00:00 2001
From: Shivanker Goel <shivanker.goel@gmail.com>
Date: Sun, 26 Apr 2026 02:30:23 +0100
Subject: [PATCH] feat(fal): support Seedance reference video

Adds fal Seedance 2.0 reference-to-video support with model-aware reference input limits.
---
 CHANGELOG.md                                  |   3 +
 docs/plugins/sdk-provider-plugins.md          |   8 +-
 docs/providers/fal.md                         |  29 ++-
 docs/tools/video-generation.md                |  41 +--
 .../fal/video-generation-provider.test.ts     | 238 +++++++++++++++++-
 extensions/fal/video-generation-provider.ts   | 202 ++++++++++++---
 src/agents/tools/media-tool-shared.test.ts    |   2 +-
 src/agents/tools/video-generate-tool.ts       |   1 +
 src/plugin-sdk/video-generation.ts            |   3 +
 src/video-generation/capabilities.test.ts     |  81 ++++++
 src/video-generation/capabilities.ts          |  47 +++-
 src/video-generation/duration-support.ts      |   1 +
 src/video-generation/live-test-helpers.ts     |   3 +
 src/video-generation/normalization.ts         |   1 +
 src/video-generation/runtime.test.ts          |  55 ++++
 src/video-generation/runtime.ts               |   3 +
 src/video-generation/types.ts                 |   3 +
 .../provider-capability-assertions.ts         |  24 +-
 18 files changed, 675 insertions(+), 70 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 83e157d73df..2b1ef914f1d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -58,6 +58,9 @@ Docs: https://docs.openclaw.ai
 - Providers/Volcengine: add Volcengine/BytePlus Seed Speech as a bundled TTS provider with API-key auth, native Ogg/Opus voice-note output, and MP3 audio-file output. (#55641) Thanks @xuruiray.
 - Android/Talk Mode: expose Talk Mode in the Voice tab with runtime-owned voice capture modes and microphone foreground-service escalation. Thanks @alex-latitude.
 - Providers/LiteLLM: register `litellm` as an image-generation provider so `image_generate model=litellm/...` calls and `agents.defaults.imageGenerationModel.fallbacks` entries resolve through the LiteLLM proxy. Thanks @zqchris.
+- Providers/fal: add Seedance 2.0 reference-to-video models with multi-image,
+  video, and audio reference input mapping plus model-specific capability limits
+  for `video_generate`. Thanks @shivanker.
 - Codex harness: require Codex app-server `0.125.0` or newer and cover native MCP `PreToolUse`, `PostToolUse`, and `PermissionRequest` payloads through the OpenClaw hook relay.
 - Agents/Codex: teach prompts and `agents_list` to surface native Codex app-server availability so agents prefer `/codex ...` over Codex ACP unless ACP/acpx is explicit. Thanks @vincentkoc.
 - ACPX/Droid: add Factory Droid to the live ACP bind Docker matrix, including
diff --git a/docs/plugins/sdk-provider-plugins.md b/docs/plugins/sdk-provider-plugins.md
index 574fb5e5dff..c88761bd3ca 100644
--- a/docs/plugins/sdk-provider-plugins.md
+++ b/docs/plugins/sdk-provider-plugins.md
@@ -626,7 +626,13 @@ API key auth, and dynamic model resolution.
           label: "Acme Video",
           capabilities: {
             generate: { maxVideos: 1, maxDurationSeconds: 10, supportsResolution: true },
-            imageToVideo: { enabled: true, maxVideos: 1, maxInputImages: 1, maxDurationSeconds: 5 },
+            imageToVideo: {
+              enabled: true,
+              maxVideos: 1,
+              maxInputImages: 1,
+              maxInputImagesByModel: { "acme/reference-to-video": 9 },
+              maxDurationSeconds: 5,
+            },
             videoToVideo: { enabled: false },
           },
           generateVideo: async (req) => ({ videos: [] }),
diff --git a/docs/providers/fal.md b/docs/providers/fal.md
index 8c5db1e1b45..3a444f05cbd 100644
--- a/docs/providers/fal.md
+++ b/docs/providers/fal.md
@@ -79,10 +79,10 @@ To use fal as the default image provider:
 The bundled `fal` video-generation provider defaults to
 `fal/fal-ai/minimax/video-01-live`.
 
-| Capability | Value                                                        |
-| ---------- | ------------------------------------------------------------ |
-| Modes      | Text-to-video, single-image reference                        |
-| Runtime    | Queue-backed submit/status/result flow for long-running jobs |
+| Capability | Value                                                              |
+| ---------- | ------------------------------------------------------------------ |
+| Modes      | Text-to-video, single-image reference, Seedance reference-to-video |
+| Runtime    | Queue-backed submit/status/result flow for long-running jobs       |
 
 <AccordionGroup>
   <Accordion title="Available video models">
@@ -94,8 +94,10 @@ The bundled `fal` video-generation provider defaults to
 
     - `fal/bytedance/seedance-2.0/fast/text-to-video`
     - `fal/bytedance/seedance-2.0/fast/image-to-video`
+    - `fal/bytedance/seedance-2.0/fast/reference-to-video`
     - `fal/bytedance/seedance-2.0/text-to-video`
     - `fal/bytedance/seedance-2.0/image-to-video`
+    - `fal/bytedance/seedance-2.0/reference-to-video`
 
   </Accordion>
 
@@ -113,6 +115,25 @@ The bundled `fal` video-generation provider defaults to
     ```
   </Accordion>
 
+  <Accordion title="Seedance 2.0 reference-to-video config example">
+    ```json5
+    {
+      agents: {
+        defaults: {
+          videoGenerationModel: {
+            primary: "fal/bytedance/seedance-2.0/fast/reference-to-video",
+          },
+        },
+      },
+    }
+    ```
+
+    Reference-to-video accepts up to 9 images, 3 videos, and 3 audio references (at most 12
+    reference files in total) through the shared `video_generate` `images`, `videos`, and
+    `audioRefs` parameters. Audio references require at least one image or video reference.
+
+  </Accordion>
+
   <Accordion title="HeyGen video-agent config example">
     ```json5
     {
diff --git a/docs/tools/video-generation.md b/docs/tools/video-generation.md
index 8ca3345c839..1871a5ceb14 100644
--- a/docs/tools/video-generation.md
+++ b/docs/tools/video-generation.md
@@ -82,22 +82,22 @@ Duplicate prevention: if a video task is already `queued` or `running` for the c
 
 ## Supported providers
 
-| Provider              | Default model                   | Text | Image ref                                            | Video ref        | API key                                  |
-| --------------------- | ------------------------------- | ---- | ---------------------------------------------------- | ---------------- | ---------------------------------------- |
-| Alibaba               | `wan2.6-t2v`                    | Yes  | Yes (remote URL)                                     | Yes (remote URL) | `MODELSTUDIO_API_KEY`                    |
-| BytePlus (1.0)        | `seedance-1-0-pro-250528`       | Yes  | Up to 2 images (I2V models only; first + last frame) | No               | `BYTEPLUS_API_KEY`                       |
-| BytePlus Seedance 1.5 | `seedance-1-5-pro-251215`       | Yes  | Up to 2 images (first + last frame via role)         | No               | `BYTEPLUS_API_KEY`                       |
-| BytePlus Seedance 2.0 | `dreamina-seedance-2-0-260128`  | Yes  | Up to 9 reference images                             | Up to 3 videos   | `BYTEPLUS_API_KEY`                       |
-| ComfyUI               | `workflow`                      | Yes  | 1 image                                              | No               | `COMFY_API_KEY` or `COMFY_CLOUD_API_KEY` |
-| fal                   | `fal-ai/minimax/video-01-live`  | Yes  | 1 image                                              | No               | `FAL_KEY`                                |
-| Google                | `veo-3.1-fast-generate-preview` | Yes  | 1 image                                              | 1 video          | `GEMINI_API_KEY`                         |
-| MiniMax               | `MiniMax-Hailuo-2.3`            | Yes  | 1 image                                              | No               | `MINIMAX_API_KEY` or MiniMax OAuth       |
-| OpenAI                | `sora-2`                        | Yes  | 1 image                                              | 1 video          | `OPENAI_API_KEY`                         |
-| Qwen                  | `wan2.6-t2v`                    | Yes  | Yes (remote URL)                                     | Yes (remote URL) | `QWEN_API_KEY`                           |
-| Runway                | `gen4.5`                        | Yes  | 1 image                                              | 1 video          | `RUNWAYML_API_SECRET`                    |
-| Together              | `Wan-AI/Wan2.2-T2V-A14B`        | Yes  | 1 image                                              | No               | `TOGETHER_API_KEY`                       |
-| Vydra                 | `veo3`                          | Yes  | 1 image (`kling`)                                    | No               | `VYDRA_API_KEY`                          |
-| xAI                   | `grok-imagine-video`            | Yes  | 1 first-frame image or up to 7 `reference_image`s    | 1 video          | `XAI_API_KEY`                            |
+| Provider              | Default model                   | Text | Image ref                                            | Video ref                                       | API key                                  |
+| --------------------- | ------------------------------- | ---- | ---------------------------------------------------- | ----------------------------------------------- | ---------------------------------------- |
+| Alibaba               | `wan2.6-t2v`                    | Yes  | Yes (remote URL)                                     | Yes (remote URL)                                | `MODELSTUDIO_API_KEY`                    |
+| BytePlus (1.0)        | `seedance-1-0-pro-250528`       | Yes  | Up to 2 images (I2V models only; first + last frame) | No                                              | `BYTEPLUS_API_KEY`                       |
+| BytePlus Seedance 1.5 | `seedance-1-5-pro-251215`       | Yes  | Up to 2 images (first + last frame via role)         | No                                              | `BYTEPLUS_API_KEY`                       |
+| BytePlus Seedance 2.0 | `dreamina-seedance-2-0-260128`  | Yes  | Up to 9 reference images                             | Up to 3 videos                                  | `BYTEPLUS_API_KEY`                       |
+| ComfyUI               | `workflow`                      | Yes  | 1 image                                              | No                                              | `COMFY_API_KEY` or `COMFY_CLOUD_API_KEY` |
+| fal                   | `fal-ai/minimax/video-01-live`  | Yes  | 1 image; up to 9 with Seedance reference-to-video    | Up to 3 videos with Seedance reference-to-video | `FAL_KEY`                                |
+| Google                | `veo-3.1-fast-generate-preview` | Yes  | 1 image                                              | 1 video                                         | `GEMINI_API_KEY`                         |
+| MiniMax               | `MiniMax-Hailuo-2.3`            | Yes  | 1 image                                              | No                                              | `MINIMAX_API_KEY` or MiniMax OAuth       |
+| OpenAI                | `sora-2`                        | Yes  | 1 image                                              | 1 video                                         | `OPENAI_API_KEY`                         |
+| Qwen                  | `wan2.6-t2v`                    | Yes  | Yes (remote URL)                                     | Yes (remote URL)                                | `QWEN_API_KEY`                           |
+| Runway                | `gen4.5`                        | Yes  | 1 image                                              | 1 video                                         | `RUNWAYML_API_SECRET`                    |
+| Together              | `Wan-AI/Wan2.2-T2V-A14B`        | Yes  | 1 image                                              | No                                              | `TOGETHER_API_KEY`                       |
+| Vydra                 | `veo3`                          | Yes  | 1 image (`kling`)                                    | No                                              | `VYDRA_API_KEY`                          |
+| xAI                   | `grok-imagine-video`            | Yes  | 1 first-frame image or up to 7 `reference_image`s    | 1 video                                         | `XAI_API_KEY`                            |
 
 Some providers accept additional or alternate API key env vars. See individual [provider pages](#related) for details.
 
@@ -114,7 +114,7 @@ and the shared live sweep.
 | Alibaba  | Yes        | Yes            | Yes            | `generate`, `imageToVideo`; `videoToVideo` skipped because this provider needs remote `http(s)` video URLs                               |
 | BytePlus | Yes        | Yes            | No             | `generate`, `imageToVideo`                                                                                                               |
 | ComfyUI  | Yes        | Yes            | No             | Not in the shared sweep; workflow-specific coverage lives with Comfy tests                                                               |
-| fal      | Yes        | Yes            | No             | `generate`, `imageToVideo`                                                                                                               |
+| fal      | Yes        | Yes            | Yes            | `generate`, `imageToVideo`; `videoToVideo` only when using Seedance reference-to-video                                                   |
 | Google   | Yes        | Yes            | Yes            | `generate`, `imageToVideo`; shared `videoToVideo` skipped because the current buffer-backed Gemini/Veo sweep does not accept that input  |
 | MiniMax  | Yes        | Yes            | No             | `generate`, `imageToVideo`                                                                                                               |
 | OpenAI   | Yes        | Yes            | Yes            | `generate`, `imageToVideo`; shared `videoToVideo` skipped because this org/input path currently needs provider-side inpaint/remix access |
@@ -296,7 +296,7 @@ entries.
   </Accordion>
 
   <Accordion title="fal">
-    Uses a queue-backed flow for long-running jobs. Single image reference only.
+    Uses a queue-backed flow for long-running jobs. Most fal video models accept a single image reference. Seedance 2.0 reference-to-video models accept up to 9 images, 3 videos, and 3 audio references, with at most 12 total reference files; audio references require at least one image or video reference.
   </Accordion>
 
   <Accordion title="Google (Gemini / Veo)">
@@ -349,6 +349,7 @@ capabilities: {
     enabled: true,
     maxVideos: 1,
     maxInputImages: 1,
+    maxInputImagesByModel: { "provider/reference-to-video": 9 },
     maxDurationSeconds: 5,
   },
   videoToVideo: {
@@ -366,6 +367,10 @@ enough to advertise transform-mode support. Providers should declare
 contract tests, and the shared `video_generate` tool can validate mode support
 deterministically.
 
+When one model in a provider has wider reference-input support than the rest,
+use `maxInputImagesByModel`, `maxInputVideosByModel`, or
+`maxInputAudiosByModel` instead of raising the mode-wide limit.
+
 ## Live tests
 
 Opt-in live coverage for the shared bundled providers:
diff --git a/extensions/fal/video-generation-provider.test.ts b/extensions/fal/video-generation-provider.test.ts
index 71ddc8ab701..f0ac363ed64 100644
--- a/extensions/fal/video-generation-provider.test.ts
+++ b/extensions/fal/video-generation-provider.test.ts
@@ -81,6 +81,13 @@ describe("fal video generation provider", () => {
       .mockResolvedValueOnce(releasedVideo({ contentType: "video/mp4", bytes: params.bytes }));
   }
 
+  function getSubmitBody(): Record<string, unknown> {
+    return JSON.parse(String(fetchGuardMock.mock.calls[0]?.[0]?.init?.body ?? "{}")) as Record<
+      string,
+      unknown
+    >;
+  }
+
   afterEach(() => {
     vi.restoreAllMocks();
     fetchGuardMock.mockReset();
@@ -88,7 +95,21 @@ describe("fal video generation provider", () => {
   });
 
   it("declares explicit mode capabilities", () => {
-    expectExplicitVideoGenerationCapabilities(buildFalVideoGenerationProvider());
+    const provider = buildFalVideoGenerationProvider();
+    expectExplicitVideoGenerationCapabilities(provider);
+    expect(provider.capabilities.imageToVideo?.maxInputImages).toBe(1);
+    expect(
+      provider.capabilities.imageToVideo?.maxInputImagesByModel?.[
+        "bytedance/seedance-2.0/fast/reference-to-video"
+      ],
+    ).toBe(9);
+    expect(provider.capabilities.videoToVideo?.maxInputVideos).toBe(0);
+    expect(
+      Object.keys(provider.capabilities.videoToVideo?.supportedDurationSecondsByModel ?? {}),
+    ).toEqual([
+      "bytedance/seedance-2.0/fast/reference-to-video",
+      "bytedance/seedance-2.0/reference-to-video",
+    ]);
   });
 
   it("submits fal video jobs through the queue API and downloads the completed result", async () => {
@@ -152,8 +173,10 @@ describe("fal video generation provider", () => {
         "fal-ai/heygen/v2/video-agent",
         "bytedance/seedance-2.0/fast/text-to-video",
         "bytedance/seedance-2.0/fast/image-to-video",
+        "bytedance/seedance-2.0/fast/reference-to-video",
         "bytedance/seedance-2.0/text-to-video",
         "bytedance/seedance-2.0/image-to-video",
+        "bytedance/seedance-2.0/reference-to-video",
       ]),
     );
   });
@@ -187,10 +210,7 @@ describe("fal video generation provider", () => {
         url: "https://queue.fal.run/fal-ai/heygen/v2/video-agent",
       }),
     );
-    const submitBody = JSON.parse(
-      String(fetchGuardMock.mock.calls[0]?.[0]?.init?.body ?? "{}"),
-    ) as Record<string, unknown>;
-    expect(submitBody).toEqual({
+    expect(getSubmitBody()).toEqual({
       prompt: "A founder explains OpenClaw in a concise studio video",
     });
     expect(result.metadata).toEqual({
@@ -229,10 +249,7 @@ describe("fal video generation provider", () => {
         url: "https://queue.fal.run/bytedance/seedance-2.0/fast/text-to-video",
       }),
     );
-    const submitBody = JSON.parse(
-      String(fetchGuardMock.mock.calls[0]?.[0]?.init?.body ?? "{}"),
-    ) as Record<string, unknown>;
-    expect(submitBody).toEqual({
+    expect(getSubmitBody()).toEqual({
       prompt: "A chrome lobster drives a tiny kart across a neon pier",
       aspect_ratio: "16:9",
       resolution: "720p",
@@ -244,4 +261,207 @@ describe("fal video generation provider", () => {
       seed: 42,
     });
   });
+
+  it("submits Seedance 2 image-to-video requests with a single image_url", async () => {
+    mockFalProviderRuntime();
+    mockCompletedFalVideoJob({
+      requestId: "seedance-i2v-req-123",
+      statusUrl:
+        "https://queue.fal.run/bytedance/seedance-2.0/fast/image-to-video/requests/seedance-i2v-req-123/status",
+      responseUrl:
+        "https://queue.fal.run/bytedance/seedance-2.0/fast/image-to-video/requests/seedance-i2v-req-123",
+      videoUrl: "https://fal.run/files/seedance-i2v.mp4",
+      bytes: "seedance-i2v-mp4-bytes",
+    });
+
+    const provider = buildFalVideoGenerationProvider();
+    await provider.generateVideo({
+      provider: "fal",
+      model: "bytedance/seedance-2.0/fast/image-to-video",
+      prompt: "Animate this product still with a slow orbit",
+      durationSeconds: 6,
+      inputImages: [{ url: "https://example.com/start-frame.png" }],
+      cfg: {},
+    });
+
+    expect(getSubmitBody()).toEqual({
+      prompt: "Animate this product still with a slow orbit",
+      image_url: "https://example.com/start-frame.png",
+      duration: "6",
+    });
+  });
+
+  it("submits Seedance 2 reference-to-video requests with image, video, and audio URLs", async () => {
+    mockFalProviderRuntime();
+    mockCompletedFalVideoJob({
+      requestId: "seedance-ref-req-123",
+      statusUrl:
+        "https://queue.fal.run/bytedance/seedance-2.0/fast/reference-to-video/requests/seedance-ref-req-123/status",
+      responseUrl:
+        "https://queue.fal.run/bytedance/seedance-2.0/fast/reference-to-video/requests/seedance-ref-req-123",
+      videoUrl: "https://fal.run/files/seedance-ref.mp4",
+      bytes: "seedance-ref-mp4-bytes",
+      responseExtras: { seed: 1234 },
+    });
+
+    const provider = buildFalVideoGenerationProvider();
+    const result = await provider.generateVideo({
+      provider: "fal",
+      model: "bytedance/seedance-2.0/fast/reference-to-video",
+      prompt: "Blend @Image1, @Image2, @Video1, @Video2, and @Audio1 into one short film",
+      durationSeconds: 8,
+      aspectRatio: "9:16",
+      resolution: "480P",
+      audio: false,
+      inputImages: [
+        { url: "https://example.com/reference-1.png" },
+        { buffer: Buffer.from("local-image"), mimeType: "image/webp" },
+      ],
+      inputVideos: [
+        { url: "https://example.com/reference-1.mp4" },
+        { buffer: Buffer.from("local-video"), mimeType: "video/quicktime" },
+      ],
+      inputAudios: [
+        { url: "https://example.com/reference-1.mp3" },
+        { buffer: Buffer.from("local-audio"), mimeType: "audio/wav" },
+      ],
+      cfg: {},
+    });
+
+    expect(fetchGuardMock).toHaveBeenNthCalledWith(
+      1,
+      expect.objectContaining({
+        url: "https://queue.fal.run/bytedance/seedance-2.0/fast/reference-to-video",
+      }),
+    );
+    expect(getSubmitBody()).toEqual({
+      prompt: "Blend @Image1, @Image2, @Video1, @Video2, and @Audio1 into one short film",
+      image_urls: [
+        "https://example.com/reference-1.png",
+        `data:image/webp;base64,${Buffer.from("local-image").toString("base64")}`,
+      ],
+      video_urls: [
+        "https://example.com/reference-1.mp4",
+        `data:video/quicktime;base64,${Buffer.from("local-video").toString("base64")}`,
+      ],
+      audio_urls: [
+        "https://example.com/reference-1.mp3",
+        `data:audio/wav;base64,${Buffer.from("local-audio").toString("base64")}`,
+      ],
+      aspect_ratio: "9:16",
+      resolution: "480p",
+      duration: "8",
+      generate_audio: false,
+    });
+    expect(result.metadata).toEqual({
+      requestId: "seedance-ref-req-123",
+      seed: 1234,
+    });
+  });
+
+  it("rejects video, audio, and multiple image references for non-reference fal models", async () => {
+    const provider = buildFalVideoGenerationProvider();
+
+    await expect(
+      provider.generateVideo({
+        provider: "fal",
+        model: "fal-ai/minimax/video-01-live",
+        prompt: "Animate this",
+        inputImages: [
+          { url: "https://example.com/one.png" },
+          { url: "https://example.com/two.png" },
+        ],
+        cfg: {},
+      }),
+    ).rejects.toThrow("fal video generation supports at most one image reference.");
+
+    await expect(
+      provider.generateVideo({
+        provider: "fal",
+        model: "fal-ai/minimax/video-01-live",
+        prompt: "Animate this",
+        inputVideos: [{ url: "https://example.com/reference.mp4" }],
+        cfg: {},
+      }),
+    ).rejects.toThrow("fal video generation does not support video reference inputs.");
+
+    await expect(
+      provider.generateVideo({
+        provider: "fal",
+        model: "fal-ai/minimax/video-01-live",
+        prompt: "Animate this",
+        inputAudios: [{ url: "https://example.com/reference.mp3" }],
+        cfg: {},
+      }),
+    ).rejects.toThrow("fal video generation does not support audio reference inputs.");
+  });
+
+  it("rejects over-limit and audio-only Seedance reference-to-video requests", async () => {
+    const provider = buildFalVideoGenerationProvider();
+    const model = "bytedance/seedance-2.0/fast/reference-to-video";
+
+    await expect(
+      provider.generateVideo({
+        provider: "fal",
+        model,
+        prompt: "Too many images",
+        inputImages: Array.from({ length: 10 }, (_, index) => ({
+          url: `https://example.com/image-${index}.png`,
+        })),
+        cfg: {},
+      }),
+    ).rejects.toThrow("fal Seedance reference-to-video supports at most 9 reference images.");
+
+    await expect(
+      provider.generateVideo({
+        provider: "fal",
+        model,
+        prompt: "Too many videos",
+        inputVideos: Array.from({ length: 4 }, (_, index) => ({
+          url: `https://example.com/video-${index}.mp4`,
+        })),
+        cfg: {},
+      }),
+    ).rejects.toThrow("fal Seedance reference-to-video supports at most 3 reference videos.");
+
+    await expect(
+      provider.generateVideo({
+        provider: "fal",
+        model,
+        prompt: "Too many audios",
+        inputAudios: Array.from({ length: 4 }, (_, index) => ({
+          url: `https://example.com/audio-${index}.mp3`,
+        })),
+        cfg: {},
+      }),
+    ).rejects.toThrow("fal Seedance reference-to-video supports at most 3 reference audios.");
+
+    await expect(
+      provider.generateVideo({
+        provider: "fal",
+        model,
+        prompt: "Too many total files",
+        inputImages: Array.from({ length: 9 }, (_, index) => ({
+          url: `https://example.com/image-${index}.png`,
+        })),
+        inputVideos: Array.from({ length: 3 }, (_, index) => ({
+          url: `https://example.com/video-${index}.mp4`,
+        })),
+        inputAudios: [{ url: "https://example.com/audio.mp3" }],
+        cfg: {},
+      }),
+    ).rejects.toThrow("fal Seedance reference-to-video supports at most 12 total reference files.");
+
+    await expect(
+      provider.generateVideo({
+        provider: "fal",
+        model,
+        prompt: "Audio only",
+        inputAudios: [{ url: "https://example.com/audio.mp3" }],
+        cfg: {},
+      }),
+    ).rejects.toThrow(
+      "fal Seedance reference-to-video requires at least one image or video reference when audio references are provided.",
+    );
+  });
 });
diff --git a/extensions/fal/video-generation-provider.ts b/extensions/fal/video-generation-provider.ts
index ce4d145b201..0bb0e80c2c9 100644
--- a/extensions/fal/video-generation-provider.ts
+++ b/extensions/fal/video-generation-provider.ts
@@ -23,13 +23,34 @@ const DEFAULT_FAL_BASE_URL = "https://fal.run";
 const DEFAULT_FAL_QUEUE_BASE_URL = "https://queue.fal.run";
 const DEFAULT_FAL_VIDEO_MODEL = "fal-ai/minimax/video-01-live";
 const HEYGEN_VIDEO_AGENT_MODEL = "fal-ai/heygen/v2/video-agent";
-const SEEDANCE_2_VIDEO_MODELS = [
+const SEEDANCE_2_TEXT_IMAGE_VIDEO_MODELS = [
   "bytedance/seedance-2.0/fast/text-to-video",
   "bytedance/seedance-2.0/fast/image-to-video",
   "bytedance/seedance-2.0/text-to-video",
   "bytedance/seedance-2.0/image-to-video",
 ] as const;
+const SEEDANCE_2_REFERENCE_VIDEO_MODELS = [
+  "bytedance/seedance-2.0/fast/reference-to-video",
+  "bytedance/seedance-2.0/reference-to-video",
+] as const;
+const SEEDANCE_2_VIDEO_MODELS = [
+  ...SEEDANCE_2_TEXT_IMAGE_VIDEO_MODELS,
+  ...SEEDANCE_2_REFERENCE_VIDEO_MODELS,
+] as const;
 const SEEDANCE_2_DURATION_SECONDS = [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] as const;
+const SEEDANCE_REFERENCE_MAX_IMAGES = 9;
+const SEEDANCE_REFERENCE_MAX_VIDEOS = 3;
+const SEEDANCE_REFERENCE_MAX_AUDIOS = 3;
+const SEEDANCE_REFERENCE_MAX_FILES = 12;
+const SEEDANCE_REFERENCE_MAX_IMAGES_BY_MODEL = Object.fromEntries(
+  SEEDANCE_2_REFERENCE_VIDEO_MODELS.map((model) => [model, SEEDANCE_REFERENCE_MAX_IMAGES]),
+);
+const SEEDANCE_REFERENCE_MAX_VIDEOS_BY_MODEL = Object.fromEntries(
+  SEEDANCE_2_REFERENCE_VIDEO_MODELS.map((model) => [model, SEEDANCE_REFERENCE_MAX_VIDEOS]),
+);
+const SEEDANCE_REFERENCE_MAX_AUDIOS_BY_MODEL = Object.fromEntries(
+  SEEDANCE_2_REFERENCE_VIDEO_MODELS.map((model) => [model, SEEDANCE_REFERENCE_MAX_AUDIOS]),
+);
 const DEFAULT_HTTP_TIMEOUT_MS = 30_000;
 const DEFAULT_OPERATION_TIMEOUT_MS = 600_000;
 const POLL_INTERVAL_MS = 5_000;
@@ -128,6 +149,12 @@ function isFalSeedance2Model(model: string): boolean {
   return SEEDANCE_2_VIDEO_MODELS.includes(model as (typeof SEEDANCE_2_VIDEO_MODELS)[number]);
 }
 
+function isFalSeedance2ReferenceModel(model: string): boolean {
+  return SEEDANCE_2_REFERENCE_VIDEO_MODELS.includes(
+    model as (typeof SEEDANCE_2_REFERENCE_VIDEO_MODELS)[number],
+  );
+}
+
 function isFalHeyGenVideoAgentModel(model: string): boolean {
   return normalizeLowercaseStringOrEmpty(model) === HEYGEN_VIDEO_AGENT_MODEL;
 }
@@ -156,6 +183,55 @@ function resolveFalDuration(
   return duration;
 }
 
+function resolveFalReferenceUrl(
+  asset: NonNullable<VideoGenerationRequest["inputImages"]>[number] | undefined,
+  defaultMimeType: string,
+  label: string,
+): string {
+  const assetUrl = normalizeOptionalString(asset?.url);
+  if (assetUrl) {
+    return assetUrl;
+  }
+  if (!asset?.buffer) {
+    throw new Error(`fal ${label} is missing media data.`);
+  }
+  return toDataUrl(asset.buffer, normalizeOptionalString(asset.mimeType) ?? defaultMimeType);
+}
+
+function resolveFalReferenceUrls(
+  assets: VideoGenerationRequest["inputImages"],
+  defaultMimeType: string,
+  label: string,
+): string[] {
+  return (assets ?? []).map((asset) => resolveFalReferenceUrl(asset, defaultMimeType, label));
+}
+
+function applyFalSeedanceControls(params: {
+  req: VideoGenerationRequest;
+  model: string;
+  body: Record<string, unknown>;
+}): void {
+  const aspectRatio = normalizeOptionalString(params.req.aspectRatio);
+  if (aspectRatio) {
+    params.body.aspect_ratio = aspectRatio;
+  }
+  const size = normalizeOptionalString(params.req.size);
+  if (size) {
+    params.body.size = size;
+  }
+  const resolution = resolveFalResolution(params.req.resolution, params.model);
+  if (resolution) {
+    params.body.resolution = resolution;
+  }
+  const duration = resolveFalDuration(params.req.durationSeconds, params.model);
+  if (duration) {
+    params.body.duration = duration;
+  }
+  if (isFalSeedance2Model(params.model) && typeof params.req.audio === "boolean") {
+    params.body.generate_audio = params.req.audio;
+  }
+}
+
 function buildFalVideoRequestBody(params: {
   req: VideoGenerationRequest;
   model: string;
@@ -163,6 +239,36 @@ function buildFalVideoRequestBody(params: {
   const requestBody: Record<string, unknown> = {
     prompt: params.req.prompt,
   };
+
+  if (isFalSeedance2ReferenceModel(params.model)) {
+    const imageUrls = resolveFalReferenceUrls(
+      params.req.inputImages,
+      "image/png",
+      "reference image",
+    );
+    const videoUrls = resolveFalReferenceUrls(
+      params.req.inputVideos,
+      "video/mp4",
+      "reference video",
+    );
+    const audioUrls = resolveFalReferenceUrls(
+      params.req.inputAudios,
+      "audio/mpeg",
+      "reference audio",
+    );
+    if (imageUrls.length > 0) {
+      requestBody.image_urls = imageUrls;
+    }
+    if (videoUrls.length > 0) {
+      requestBody.video_urls = videoUrls;
+    }
+    if (audioUrls.length > 0) {
+      requestBody.audio_urls = audioUrls;
+    }
+    applyFalSeedanceControls({ req: params.req, model: params.model, body: requestBody });
+    return requestBody;
+  }
+
   const input = params.req.inputImages?.[0];
   if (input) {
     requestBody.image_url = normalizeOptionalString(input.url)
@@ -177,28 +283,58 @@ function buildFalVideoRequestBody(params: {
   if (isFalMiniMaxLiveModel(params.model) || isFalHeyGenVideoAgentModel(params.model)) {
     return requestBody;
   }
-  const aspectRatio = normalizeOptionalString(params.req.aspectRatio);
-  if (aspectRatio) {
-    requestBody.aspect_ratio = aspectRatio;
-  }
-  const size = normalizeOptionalString(params.req.size);
-  if (size) {
-    requestBody.size = size;
-  }
-  const resolution = resolveFalResolution(params.req.resolution, params.model);
-  if (resolution) {
-    requestBody.resolution = resolution;
-  }
-  const duration = resolveFalDuration(params.req.durationSeconds, params.model);
-  if (duration) {
-    requestBody.duration = duration;
-  }
-  if (isFalSeedance2Model(params.model) && typeof params.req.audio === "boolean") {
-    requestBody.generate_audio = params.req.audio;
-  }
+  applyFalSeedanceControls({ req: params.req, model: params.model, body: requestBody });
   return requestBody;
 }
 
+/** Throws when the request's reference inputs exceed what the selected fal model accepts. */
+function validateFalVideoReferenceInputs(params: {
+  req: VideoGenerationRequest;
+  model: string;
+}): void {
+  const images = params.req.inputImages?.length ?? 0;
+  const videos = params.req.inputVideos?.length ?? 0;
+  const audios = params.req.inputAudios?.length ?? 0;
+  // Outside Seedance reference-to-video, only a single image reference is accepted.
+  if (!isFalSeedance2ReferenceModel(params.model)) {
+    if (videos > 0) {
+      throw new Error("fal video generation does not support video reference inputs.");
+    }
+    if (audios > 0) {
+      throw new Error("fal video generation does not support audio reference inputs.");
+    }
+    if (images > 1) {
+      throw new Error("fal video generation supports at most one image reference.");
+    }
+    return;
+  }
+  if (images > SEEDANCE_REFERENCE_MAX_IMAGES) {
+    throw new Error(
+      `fal Seedance reference-to-video supports at most ${SEEDANCE_REFERENCE_MAX_IMAGES} reference images.`,
+    );
+  }
+  if (videos > SEEDANCE_REFERENCE_MAX_VIDEOS) {
+    throw new Error(
+      `fal Seedance reference-to-video supports at most ${SEEDANCE_REFERENCE_MAX_VIDEOS} reference videos.`,
+    );
+  }
+  if (audios > SEEDANCE_REFERENCE_MAX_AUDIOS) {
+    throw new Error(
+      `fal Seedance reference-to-video supports at most ${SEEDANCE_REFERENCE_MAX_AUDIOS} reference audios.`,
+    );
+  }
+  if (images + videos + audios > SEEDANCE_REFERENCE_MAX_FILES) {
+    throw new Error(
+      `fal Seedance reference-to-video supports at most ${SEEDANCE_REFERENCE_MAX_FILES} total reference files.`,
+    );
+  }
+  if (audios > 0 && images + videos === 0) {
+    throw new Error(
+      "fal Seedance reference-to-video requires at least one image or video reference when audio references are provided.",
+    );
+  }
+}
+
 async function fetchFalJson(params: {
   url: string;
   init?: RequestInit;
@@ -317,6 +453,8 @@ export function buildFalVideoGenerationProvider(): VideoGenerationProvider {
         enabled: true,
         maxVideos: 1,
         maxInputImages: 1,
+        maxInputImagesByModel: SEEDANCE_REFERENCE_MAX_IMAGES_BY_MODEL,
+        maxInputAudiosByModel: SEEDANCE_REFERENCE_MAX_AUDIOS_BY_MODEL,
         supportedDurationSecondsByModel: Object.fromEntries(
           SEEDANCE_2_VIDEO_MODELS.map((model) => [model, SEEDANCE_2_DURATION_SECONDS]),
         ),
@@ -326,16 +464,25 @@ export function buildFalVideoGenerationProvider(): VideoGenerationProvider {
         supportsAudio: true,
       },
       videoToVideo: {
-        enabled: false,
+        enabled: true,
+        maxVideos: 1,
+        maxInputImages: 0,
+        maxInputImagesByModel: SEEDANCE_REFERENCE_MAX_IMAGES_BY_MODEL,
+        maxInputVideos: 0,
+        maxInputVideosByModel: SEEDANCE_REFERENCE_MAX_VIDEOS_BY_MODEL,
+        maxInputAudiosByModel: SEEDANCE_REFERENCE_MAX_AUDIOS_BY_MODEL,
+        supportedDurationSecondsByModel: Object.fromEntries(
+          SEEDANCE_2_REFERENCE_VIDEO_MODELS.map((model) => [model, SEEDANCE_2_DURATION_SECONDS]),
+        ),
+        supportsAspectRatio: true,
+        supportsResolution: true,
+        supportsSize: true,
+        supportsAudio: true,
       },
     },
     async generateVideo(req) {
-      if ((req.inputVideos?.length ?? 0) > 0) {
-        throw new Error("fal video generation does not support video reference inputs.");
-      }
-      if ((req.inputImages?.length ?? 0) > 1) {
-        throw new Error("fal video generation supports at most one image reference.");
-      }
+      const model = normalizeOptionalString(req.model) || DEFAULT_FAL_VIDEO_MODEL;
+      validateFalVideoReferenceInputs({ req, model });
       const auth = await resolveApiKeyForProvider({
         provider: "fal",
         cfg: req.cfg,
@@ -358,7 +505,6 @@ export function buildFalVideoGenerationProvider(): VideoGenerationProvider {
           capability: "video",
           transport: "http",
         });
-      const model = normalizeOptionalString(req.model) || DEFAULT_FAL_VIDEO_MODEL;
       const requestBody = buildFalVideoRequestBody({ req, model });
       const policy = buildPolicy(allowPrivateNetwork);
       const queueBaseUrl = resolveFalQueueBaseUrl(baseUrl);
diff --git a/src/agents/tools/media-tool-shared.test.ts b/src/agents/tools/media-tool-shared.test.ts
index 772ea09adad..0d229e6b54c 100644
--- a/src/agents/tools/media-tool-shared.test.ts
+++ b/src/agents/tools/media-tool-shared.test.ts
@@ -96,5 +96,5 @@ describe("resolveModelFromRegistry", () => {
       ["kimchi", "kimchi/claude-opus-4-6"],
     ]);
     expect(result).toBe(foundModel);
-  });
+  }, 180_000);
 });
diff --git a/src/agents/tools/video-generate-tool.ts b/src/agents/tools/video-generate-tool.ts
index 835c12d31b0..44115387d54 100644
--- a/src/agents/tools/video-generate-tool.ts
+++ b/src/agents/tools/video-generate-tool.ts
@@ -350,6 +350,7 @@ function validateVideoGenerationCapabilities(params: {
   });
   const { capabilities: caps } = resolveVideoGenerationModeCapabilities({
     provider,
+    model: params.model,
     inputImageCount: params.inputImageCount,
     inputVideoCount: params.inputVideoCount,
   });
diff --git a/src/plugin-sdk/video-generation.ts b/src/plugin-sdk/video-generation.ts
index 2ba2b091856..6223a27ef96 100644
--- a/src/plugin-sdk/video-generation.ts
+++ b/src/plugin-sdk/video-generation.ts
@@ -106,9 +106,12 @@ export type VideoGenerationProviderOptionType = "number" | "boolean" | "string";
 export type VideoGenerationModeCapabilities = {
   maxVideos?: number;
   maxInputImages?: number;
+  maxInputImagesByModel?: Readonly<Record<string, number>>;
   maxInputVideos?: number;
+  maxInputVideosByModel?: Readonly<Record<string, number>>;
   /** Max number of reference audio assets the provider accepts (e.g. background music, voice reference). */
   maxInputAudios?: number;
+  maxInputAudiosByModel?: Readonly<Record<string, number>>;
   maxDurationSeconds?: number;
   supportedDurationSeconds?: readonly number[];
   supportedDurationSecondsByModel?: Readonly<Record<string, readonly number[]>>;
diff --git a/src/video-generation/capabilities.test.ts b/src/video-generation/capabilities.test.ts
index dab2e2f16fb..926959bb0ba 100644
--- a/src/video-generation/capabilities.test.ts
+++ b/src/video-generation/capabilities.test.ts
@@ -75,4 +75,85 @@ describe("video-generation capabilities", () => {
       capabilities: undefined,
     });
   });
+
+  it("uses explicit video-to-video capabilities for mixed reference requests", () => {
+    const provider = createProvider({
+      imageToVideo: {
+        enabled: true,
+        maxInputImages: 2,
+      },
+      videoToVideo: {
+        enabled: true,
+        maxInputImages: 2,
+        maxInputVideos: 3,
+        maxInputAudios: 1,
+      },
+    });
+
+    expect(resolveVideoGenerationMode({ inputImageCount: 1, inputVideoCount: 1 })).toBeNull();
+    expect(
+      resolveVideoGenerationModeCapabilities({
+        provider,
+        inputImageCount: 1,
+        inputVideoCount: 1,
+      }),
+    ).toEqual({
+      mode: null,
+      capabilities: {
+        enabled: true,
+        maxInputImages: 2,
+        maxInputVideos: 3,
+        maxInputAudios: 1,
+      },
+    });
+  });
+
+  it("applies model-specific reference input limits", () => {
+    const provider = createProvider({
+      imageToVideo: {
+        enabled: true,
+        maxInputImages: 1,
+        maxInputImagesByModel: {
+          "vendor/reference-to-video": 9,
+        },
+      },
+      videoToVideo: {
+        enabled: true,
+        maxInputImages: 0,
+        maxInputImagesByModel: {
+          "vendor/reference-to-video": 9,
+        },
+        maxInputVideos: 0,
+        maxInputVideosByModel: {
+          "vendor/reference-to-video": 3,
+        },
+      },
+    });
+
+    expect(
+      resolveVideoGenerationModeCapabilities({
+        provider,
+        model: "vendor/text-to-video",
+        inputImageCount: 2,
+      }).capabilities?.maxInputImages,
+    ).toBe(1);
+    expect(
+      resolveVideoGenerationModeCapabilities({
+        provider,
+        model: "vendor/reference-to-video",
+        inputImageCount: 2,
+      }).capabilities?.maxInputImages,
+    ).toBe(9);
+    expect(
+      resolveVideoGenerationModeCapabilities({
+        provider,
+        model: "vendor/reference-to-video",
+        inputImageCount: 1,
+        inputVideoCount: 1,
+      }).capabilities,
+    ).toMatchObject({
+      maxInputImages: 9,
+      maxInputVideos: 3,
+    });
+  });
 });
diff --git a/src/video-generation/capabilities.ts b/src/video-generation/capabilities.ts
index d2d789700c7..37451252b2e 100644
--- a/src/video-generation/capabilities.ts
+++ b/src/video-generation/capabilities.ts
@@ -40,33 +40,74 @@ export function listSupportedVideoGenerationModes(
 
 export function resolveVideoGenerationModeCapabilities(params: {
   provider?: Pick<VideoGenerationProvider, "capabilities">;
+  model?: string;
   inputImageCount?: number;
   inputVideoCount?: number;
 }): {
   mode: VideoGenerationMode | null;
   capabilities: VideoGenerationModeCapabilities | VideoGenerationTransformCapabilities | undefined;
 } {
+  const inputImageCount = params.inputImageCount ?? 0;
+  const inputVideoCount = params.inputVideoCount ?? 0;
   const mode = resolveVideoGenerationMode(params);
   const capabilities = params.provider?.capabilities;
+  const withModelLimits = <
+    T extends VideoGenerationModeCapabilities | VideoGenerationTransformCapabilities | undefined,
+  >(
+    caps: T,
+  ): T => {
+    const model = params.model?.trim();
+    if (!caps || !model) {
+      return caps;
+    }
+    const maxInputImages = caps.maxInputImagesByModel?.[model];
+    const maxInputVideos = caps.maxInputVideosByModel?.[model];
+    const maxInputAudios = caps.maxInputAudiosByModel?.[model];
+    if (
+      typeof maxInputImages !== "number" &&
+      typeof maxInputVideos !== "number" &&
+      typeof maxInputAudios !== "number"
+    ) {
+      return caps;
+    }
+    return {
+      ...caps,
+      ...(typeof maxInputImages === "number" ? { maxInputImages } : {}),
+      ...(typeof maxInputVideos === "number" ? { maxInputVideos } : {}),
+      ...(typeof maxInputAudios === "number" ? { maxInputAudios } : {}),
+    };
+  };
   if (!capabilities) {
     return { mode, capabilities: undefined };
   }
   if (mode === "generate") {
     return {
       mode,
-      capabilities: capabilities.generate,
+      capabilities: withModelLimits(capabilities.generate),
     };
   }
   if (mode === "imageToVideo") {
     return {
       mode,
-      capabilities: capabilities.imageToVideo,
+      capabilities: withModelLimits(capabilities.imageToVideo),
     };
   }
   if (mode === "videoToVideo") {
     return {
       mode,
-      capabilities: capabilities.videoToVideo,
+      capabilities: withModelLimits(capabilities.videoToVideo),
+    };
+  }
+  const videoToVideoCapabilities = withModelLimits(capabilities.videoToVideo);
+  if (
+    inputImageCount > 0 &&
+    inputVideoCount > 0 &&
+    videoToVideoCapabilities?.enabled &&
+    (videoToVideoCapabilities.maxInputImages ?? 0) > 0
+  ) {
+    return {
+      mode,
+      capabilities: videoToVideoCapabilities,
     };
   }
   return {
diff --git a/src/video-generation/duration-support.ts b/src/video-generation/duration-support.ts
index 28f21e8020c..6bdae9774ae 100644
--- a/src/video-generation/duration-support.ts
+++ b/src/video-generation/duration-support.ts
@@ -23,6 +23,7 @@ export function resolveVideoGenerationSupportedDurations(params: {
 }): number[] | undefined {
   const { capabilities: caps } = resolveVideoGenerationModeCapabilities({
     provider: params.provider,
+    model: params.model,
     inputImageCount: params.inputImageCount,
     inputVideoCount: params.inputVideoCount,
   });
diff --git a/src/video-generation/live-test-helpers.ts b/src/video-generation/live-test-helpers.ts
index 3d783158aca..a04c1b3796d 100644
--- a/src/video-generation/live-test-helpers.ts
+++ b/src/video-generation/live-test-helpers.ts
@@ -55,6 +55,9 @@ export function canRunBufferBackedVideoToVideoLiveLane(params: {
     return false;
   }
   if (providerId !== "runway") {
+    if (providerId === "fal") {
+      return params.modelRef.includes("reference-to-video");
+    }
     return true;
   }
   const slash = params.modelRef.indexOf("/");
diff --git a/src/video-generation/normalization.ts b/src/video-generation/normalization.ts
index e8efbe626d6..b65fd7bd4f5 100644
--- a/src/video-generation/normalization.ts
+++ b/src/video-generation/normalization.ts
@@ -42,6 +42,7 @@ export function resolveVideoGenerationOverrides(params: {
 }): ResolvedVideoGenerationOverrides {
   const { capabilities: caps } = resolveVideoGenerationModeCapabilities({
     provider: params.provider,
+    model: params.model,
     inputImageCount: params.inputImageCount,
     inputVideoCount: params.inputVideoCount,
   });
diff --git a/src/video-generation/runtime.test.ts b/src/video-generation/runtime.test.ts
index 61366bbb46b..78292465bf9 100644
--- a/src/video-generation/runtime.test.ts
+++ b/src/video-generation/runtime.test.ts
@@ -405,6 +405,61 @@ describe("video-generation runtime", () => {
     expect(result.attempts[0]?.error).toMatch(/does not support reference audio inputs/);
   });
 
+  it("forwards mixed image, video, and audio references when explicitly supported", async () => {
+    const seenRequest: {
+      inputImages?: unknown;
+      inputVideos?: unknown;
+      inputAudios?: unknown;
+    } = {};
+    mocks.resolveAgentModelPrimaryValue.mockReturnValue(
+      "fal/bytedance/seedance-2.0/fast/reference-to-video",
+    );
+    mocks.getVideoGenerationProvider.mockReturnValue({
+      id: "fal",
+      capabilities: {
+        videoToVideo: {
+          enabled: true,
+          maxInputImages: 9,
+          maxInputVideos: 3,
+          maxInputAudios: 3,
+        },
+      },
+      async generateVideo(req) {
+        seenRequest.inputImages = req.inputImages;
+        seenRequest.inputVideos = req.inputVideos;
+        seenRequest.inputAudios = req.inputAudios;
+        return {
+          videos: [{ buffer: Buffer.from("mp4-bytes"), mimeType: "video/mp4" }],
+          model: "bytedance/seedance-2.0/fast/reference-to-video",
+        };
+      },
+    });
+
+    const result = await generateVideo({
+      cfg: {
+        agents: {
+          defaults: {
+            videoGenerationModel: {
+              primary: "fal/bytedance/seedance-2.0/fast/reference-to-video",
+            },
+          },
+        },
+      } as OpenClawConfig,
+      prompt: "Blend all references",
+      inputImages: [{ url: "https://example.com/reference.png" }],
+      inputVideos: [{ url: "https://example.com/reference.mp4" }],
+      inputAudios: [{ url: "https://example.com/reference.mp3" }],
+    });
+
+    expect(result.provider).toBe("fal");
+    expect(result.attempts).toEqual([]);
+    expect(seenRequest).toEqual({
+      inputImages: [{ url: "https://example.com/reference.png" }],
+      inputVideos: [{ url: "https://example.com/reference.mp4" }],
+      inputAudios: [{ url: "https://example.com/reference.mp3" }],
+    });
+  });
+
   it("fails when every candidate is skipped for unsupported reference audio inputs", async () => {
     mocks.resolveAgentModelPrimaryValue.mockReturnValue("openai/sora-2");
     mocks.getVideoGenerationProvider.mockReturnValue({
diff --git a/src/video-generation/runtime.ts b/src/video-generation/runtime.ts
index fd6468746a4..192f2837636 100644
--- a/src/video-generation/runtime.ts
+++ b/src/video-generation/runtime.ts
@@ -136,6 +136,7 @@ export async function generateVideo(
     if (inputAudioCount > 0) {
       const { capabilities: candCaps } = resolveVideoGenerationModeCapabilities({
         provider,
+        model: candidate.model,
         inputImageCount,
         inputVideoCount,
       });
@@ -171,6 +172,7 @@ export async function generateVideo(
     ) {
       const { capabilities: optCaps } = resolveVideoGenerationModeCapabilities({
         provider,
+        model: candidate.model,
         inputImageCount,
         inputVideoCount,
       });
@@ -201,6 +203,7 @@ export async function generateVideo(
     if (typeof requestedDuration === "number" && Number.isFinite(requestedDuration)) {
       const { capabilities: durCaps } = resolveVideoGenerationModeCapabilities({
         provider,
+        model: candidate.model,
         inputImageCount,
         inputVideoCount,
       });
diff --git a/src/video-generation/types.ts b/src/video-generation/types.ts
index b21ae461dd0..e29e29e2cdc 100644
--- a/src/video-generation/types.ts
+++ b/src/video-generation/types.ts
@@ -100,9 +100,12 @@ export type VideoGenerationProviderOptionType = "number" | "boolean" | "string";
 export type VideoGenerationModeCapabilities = {
   maxVideos?: number;
   maxInputImages?: number;
+  maxInputImagesByModel?: Readonly<Record<string, number>>;
   maxInputVideos?: number;
+  maxInputVideosByModel?: Readonly<Record<string, number>>;
   /** Max number of reference audio assets the provider accepts (e.g. background music, voice reference). */
   maxInputAudios?: number;
+  maxInputAudiosByModel?: Readonly<Record<string, number>>;
   maxDurationSeconds?: number;
   supportedDurationSeconds?: readonly number[];
   supportedDurationSecondsByModel?: Readonly<Record<string, readonly number[]>>;
diff --git a/test/helpers/media-generation/provider-capability-assertions.ts b/test/helpers/media-generation/provider-capability-assertions.ts
index cf42203674a..b84493bcb84 100644
--- a/test/helpers/media-generation/provider-capability-assertions.ts
+++ b/test/helpers/media-generation/provider-capability-assertions.ts
@@ -6,6 +6,18 @@ import type {
 } from "../../../src/plugins/types.js";
 import { listSupportedVideoGenerationModes } from "../../../src/video-generation/capabilities.js";
 
+/** True when the mode-wide limit or any finite per-model limit is greater than zero. */
+function hasPositiveModeLimit(
+  value: number | undefined,
+  valuesByModel: Readonly<Record<string, number>> | undefined,
+): boolean {
+  if ((value ?? 0) > 0) {
+    return true;
+  }
+  const perModelLimits = Object.values(valuesByModel ?? {});
+  return perModelLimits.some((limit) => Number.isFinite(limit) && limit > 0);
+}
+
 export function expectExplicitVideoGenerationCapabilities(
   provider: VideoGenerationProviderPlugin,
 ): void {
@@ -28,16 +40,16 @@ export function expectExplicitVideoGenerationCapabilities(
 
   if (imageToVideo?.enabled) {
     expect(
-      imageToVideo.maxInputImages ?? 0,
-      `${provider.id} imageToVideo.enabled requires maxInputImages`,
-    ).toBeGreaterThan(0);
+      hasPositiveModeLimit(imageToVideo.maxInputImages, imageToVideo.maxInputImagesByModel),
+      `${provider.id} imageToVideo.enabled requires maxInputImages or maxInputImagesByModel`,
+    ).toBe(true);
     expect(supportedModes).toContain("imageToVideo");
   }
   if (videoToVideo?.enabled) {
     expect(
-      videoToVideo.maxInputVideos ?? 0,
-      `${provider.id} videoToVideo.enabled requires maxInputVideos`,
-    ).toBeGreaterThan(0);
+      hasPositiveModeLimit(videoToVideo.maxInputVideos, videoToVideo.maxInputVideosByModel),
+      `${provider.id} videoToVideo.enabled requires maxInputVideos or maxInputVideosByModel`,
+    ).toBe(true);
     expect(supportedModes).toContain("videoToVideo");
   }
 }