fix(provider): bound Vydra and Comfy media downloads

2026-06-03 18:04:06 +00:00 · 2026-05-29 15:46:27 +02:00
parent c093e4508d
commit 0902ee723b
12 changed files with 336 additions and 11 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,7 +6,7 @@ Docs: https://docs.openclaw.ai

 ### Fixes

- Providers: bound generated video downloads from OpenAI, Runway, xAI, MiniMax, BytePlus, DashScope-compatible, FAL, OpenRouter, and Google providers, and bound generated FAL image downloads.
+- Providers: bound generated media downloads from OpenAI, Runway, xAI, MiniMax, BytePlus, DashScope-compatible, FAL, OpenRouter, Google, Vydra, and Comfy providers.
 - Cron: retry recurring jobs after transient model rate limits before waiting for the next scheduled slot.

 ## 2026.5.28
--- a/extensions/comfy/image-generation-provider.test.ts
+++ b/extensions/comfy/image-generation-provider.test.ts
@@ -201,6 +201,63 @@ describe("comfy image-generation provider", () => {
    });
  });

+  it("rejects generated image downloads that exceed the configured media cap", async () => {
+    setComfyFetchGuardForTesting(fetchWithSsrFGuardMock); // swap the provider's fetch guard for the mock
+    fetchWithSsrFGuardMock
+      .mockResolvedValueOnce({ // 1st guarded fetch: JSON carrying the prompt id
+        response: new Response(JSON.stringify({ prompt_id: "local-prompt-1" }), {
+          status: 200,
+          headers: { "content-type": "application/json" },
+        }),
+        release: vi.fn(async () => {}),
+      })
+      .mockResolvedValueOnce({ // 2nd guarded fetch: JSON listing one image under output node "9"
+        response: new Response(
+          JSON.stringify({
+            "local-prompt-1": {
+              outputs: {
+                "9": {
+                  images: [{ filename: "generated.png", subfolder: "", type: "output" }],
+                },
+              },
+            },
+          }),
+          {
+            status: 200,
+            headers: { "content-type": "application/json" },
+          },
+        ),
+        release: vi.fn(async () => {}),
+      })
+      .mockResolvedValueOnce({ // 3rd guarded fetch: binary body, presumably the output file download
+        response: new Response(Buffer.from("too-large"), { // 9-byte body, above the 1-byte cap set below
+          status: 200,
+          headers: { "content-type": "image/png" },
+        }),
+        release: vi.fn(async () => {}),
+      });
+
+    const provider = buildComfyImageGenerationProvider();
+    await expect(
+      provider.generateImage({
+        provider: "comfy",
+        model: "workflow",
+        prompt: "draw a lobster",
+        cfg: {
+          ...buildComfyConfig({
+            workflow: {
+              "6": { inputs: { text: "" } },
+              "9": { inputs: {} },
+            },
+            promptNodeId: "6",
+            outputNodeId: "9",
+          }),
+          agents: { defaults: { mediaMaxMb: 0.000001 } }, // 0.000001 * 1024 * 1024 floors to a 1-byte cap
+        } as never, // cast sidesteps the declared type of the cfg parameter
+      }),
+    ).rejects.toThrow("Comfy image output download exceeds 1 bytes"); // overflow error text for capability "image"
+  });
+
  it("reports malformed local workflow submit JSON as a provider error", async () => {
    setComfyFetchGuardForTesting(fetchWithSsrFGuardMock);
    const release = vi.fn(async () => {});
--- a/extensions/comfy/music-generation-provider.test.ts
+++ b/extensions/comfy/music-generation-provider.test.ts
@@ -98,4 +98,69 @@ describe("comfy music-generation provider", () => {
      },
    });
  });
+
+  it("rejects generated music downloads that exceed the configured media cap", async () => {
+    setComfyFetchGuardForTesting(fetchWithSsrFGuardMock); // swap the provider's fetch guard for the mock
+    fetchWithSsrFGuardMock
+      .mockResolvedValueOnce({ // 1st guarded fetch: JSON carrying the prompt id
+        response: new Response(JSON.stringify({ prompt_id: "music-job-1" }), {
+          status: 200,
+          headers: { "content-type": "application/json" },
+        }),
+        release: vi.fn(async () => {}),
+      })
+      .mockResolvedValueOnce({ // 2nd guarded fetch: JSON listing one audio file under output node "9"
+        response: new Response(
+          JSON.stringify({
+            "music-job-1": {
+              outputs: {
+                "9": {
+                  audio: [{ filename: "song.mp3", subfolder: "", type: "output" }],
+                },
+              },
+            },
+          }),
+          {
+            status: 200,
+            headers: { "content-type": "application/json" },
+          },
+        ),
+        release: vi.fn(async () => {}),
+      })
+      .mockResolvedValueOnce({ // 3rd guarded fetch: binary body, presumably the output file download
+        response: new Response(Buffer.from("too-large"), { // 9-byte body, above the 1-byte cap set below
+          status: 200,
+          headers: { "content-type": "audio/mpeg" },
+        }),
+        release: vi.fn(async () => {}),
+      });
+
+    const provider = buildComfyMusicGenerationProvider();
+    await expect(
+      provider.generateMusic({
+        provider: "comfy",
+        model: "workflow",
+        prompt: "gentle ambient synth loop",
+        cfg: {
+          plugins: {
+            entries: {
+              comfy: {
+                config: {
+                  music: { // music workflow config is spelled out inline in this test
+                    workflow: {
+                      "6": { inputs: { text: "" } },
+                      "9": { inputs: {} },
+                    },
+                    promptNodeId: "6",
+                    outputNodeId: "9",
+                  },
+                },
+              },
+            },
+          },
+          agents: { defaults: { mediaMaxMb: 0.000001 } }, // 0.000001 * 1024 * 1024 floors to a 1-byte cap
+        } as never, // cast sidesteps the declared type of the cfg parameter
+      }),
+    ).rejects.toThrow("Comfy music output download exceeds 1 bytes"); // overflow error text for capability "music"
+  });
 });
--- a/extensions/comfy/video-generation-provider.test.ts
+++ b/extensions/comfy/video-generation-provider.test.ts
@@ -144,6 +144,65 @@ describe("comfy video-generation provider", () => {
    });
  });

+  it("rejects generated video downloads that exceed the configured media cap", async () => {
+    setComfyFetchGuardForTesting(fetchWithSsrFGuardMock); // swap the provider's fetch guard for the mock
+    fetchWithSsrFGuardMock
+      .mockResolvedValueOnce({ // 1st guarded fetch: JSON carrying the prompt id
+        response: new Response(JSON.stringify({ prompt_id: "local-video-1" }), {
+          status: 200,
+          headers: { "content-type": "application/json" },
+        }),
+        release: vi.fn(async () => {}),
+      })
+      .mockResolvedValueOnce({ // 2nd guarded fetch: JSON listing one file under output node "9"
+        response: new Response(
+          JSON.stringify({
+            "local-video-1": {
+              outputs: {
+                "9": {
+                  gifs: [{ filename: "generated.mp4", subfolder: "", type: "output" }], // the mp4 is listed under the "gifs" key
+                },
+              },
+            },
+          }),
+          {
+            status: 200,
+            headers: { "content-type": "application/json" },
+          },
+        ),
+        release: vi.fn(async () => {}),
+      })
+      .mockResolvedValueOnce({ // 3rd guarded fetch: binary body, presumably the output file download
+        response: new Response(Buffer.from("too-large"), { // 9-byte body, above the 1-byte cap set below
+          status: 200,
+          headers: { "content-type": "video/mp4" },
+        }),
+        release: vi.fn(async () => {}),
+      });
+
+    const provider = buildComfyVideoGenerationProvider();
+    await expect(
+      provider.generateVideo({
+        provider: "comfy",
+        model: "workflow",
+        prompt: "animate a lobster",
+        cfg: {
+          ...buildComfyConfig({
+            video: {
+              workflow: {
+                "6": { inputs: { text: "" } },
+                "9": { inputs: {} },
+              },
+              promptNodeId: "6",
+              outputNodeId: "9",
+            },
+          }),
+          agents: { defaults: { mediaMaxMb: 0.000001 } }, // 0.000001 * 1024 * 1024 floors to a 1-byte cap
+        } as never, // cast sidesteps the declared type of the cfg parameter
+      }),
+    ).rejects.toThrow("Comfy video output download exceeds 1 bytes"); // overflow error text for capability "video"
+  });
+
  it("uses cloud endpoints for video workflows", async () => {
    mockComfyProviderApiKey();
    setComfyFetchGuardForTesting(fetchWithSsrFGuardMock);
--- a/extensions/comfy/workflow-runtime.ts
+++ b/extensions/comfy/workflow-runtime.ts
@@ -12,6 +12,7 @@ import {
  normalizeBaseUrl,
  resolveProviderHttpRequestConfig,
 } from "openclaw/plugin-sdk/provider-http";
+import { readResponseWithLimit } from "openclaw/plugin-sdk/response-limit-runtime";
 import {
  normalizeSecretInputString,
  resolveSecretInputString,
@@ -39,6 +40,8 @@ const DEFAULT_PROMPT_INPUT_NAME = "text";
 const DEFAULT_INPUT_IMAGE_INPUT_NAME = "image";
 const DEFAULT_POLL_INTERVAL_MS = 1_500;
 const DEFAULT_TIMEOUT_MS = 5 * 60_000;
+const DEFAULT_GENERATED_IMAGE_MAX_BYTES = 6 * 1024 * 1024;
+const DEFAULT_GENERATED_MEDIA_MAX_BYTES = 16 * 1024 * 1024;

 export const DEFAULT_COMFY_MODEL = "workflow";

@@ -113,6 +116,19 @@ export function setComfyFetchGuardForTesting(impl: typeof fetchWithSsrFGuard | n
  comfyFetchGuard = impl ?? fetchWithSsrFGuard;
 }

+function resolveComfyGeneratedOutputMaxBytes(params: { // returns the byte cap applied to each downloaded output
+  cfg: OpenClawConfig;
+  capability: ComfyCapability; // picks the fallback cap when no usable override is configured
+}): number {
+  const configured = params.cfg.agents?.defaults?.mediaMaxMb; // optional override, scaled by 1024 * 1024 below
+  if (typeof configured === "number" && Number.isFinite(configured) && configured > 0) { // finite positive numbers only
+    return Math.floor(configured * 1024 * 1024); // NOTE(review): positive values under 1/1048576 floor to 0 — confirm a 0 limit is handled
+  }
+  return params.capability === "image"
+    ? DEFAULT_GENERATED_IMAGE_MAX_BYTES // image fallback
+    : DEFAULT_GENERATED_MEDIA_MAX_BYTES; // fallback for every non-image capability
+}
+
 function readConfigBoolean(config: ComfyProviderConfig, key: string): boolean | undefined {
  return asBoolean(config[key]);
 }
@@ -505,6 +521,7 @@ async function downloadOutputFile(params: {
  file: ComfyOutputFile;
  mode: ComfyMode;
  capability: ComfyCapability;
+  maxBytes: number;
 }): Promise<{ buffer: Buffer; mimeType: string }> {
  const fileName =
    normalizeOptionalString(params.file.filename) || normalizeOptionalString(params.file.name);
@@ -557,7 +574,15 @@ async function downloadOutputFile(params: {
          normalizeOptionalString(redirected.response.headers.get("content-type")) ||
          "application/octet-stream";
        return {
-          buffer: Buffer.from(await redirected.response.arrayBuffer()),
+          buffer: await readResponseWithLimit(redirected.response, params.maxBytes, {
+            chunkTimeoutMs: params.timeoutMs,
+            onOverflow: ({ maxBytes }) =>
+              new Error(`Comfy ${params.capability} output download exceeds ${maxBytes} bytes`),
+            onIdleTimeout: ({ chunkTimeoutMs }) =>
+              new Error(
+                `Comfy ${params.capability} output download stalled after ${chunkTimeoutMs}ms`,
+              ),
+          }),
          mimeType,
        };
      } finally {
@@ -570,7 +595,13 @@ async function downloadOutputFile(params: {
      normalizeOptionalString(firstResponse.response.headers.get("content-type")) ||
      "application/octet-stream";
    return {
-      buffer: Buffer.from(await firstResponse.response.arrayBuffer()),
+      buffer: await readResponseWithLimit(firstResponse.response, params.maxBytes, {
+        chunkTimeoutMs: params.timeoutMs,
+        onOverflow: ({ maxBytes }) =>
+          new Error(`Comfy ${params.capability} output download exceeds ${maxBytes} bytes`),
+        onIdleTimeout: ({ chunkTimeoutMs }) =>
+          new Error(`Comfy ${params.capability} output download stalled after ${chunkTimeoutMs}ms`),
+      }),
      mimeType,
    };
  } finally {
@@ -794,6 +825,10 @@ export async function runComfyWorkflow(params: {
  }

  const assets: ComfyGeneratedAsset[] = [];
+  const maxOutputBytes = resolveComfyGeneratedOutputMaxBytes({
+    cfg: params.cfg,
+    capability: params.capability,
+  });
  let assetIndex = 0;
  for (const output of outputFiles) {
    const downloaded = await downloadOutputFile({
@@ -805,6 +840,7 @@ export async function runComfyWorkflow(params: {
      file: output.file,
      mode,
      capability: params.capability,
+      maxBytes: maxOutputBytes,
    });
    assetIndex += 1;
    const originalName =
--- a/extensions/vydra/image-generation-provider.test.ts
+++ b/extensions/vydra/image-generation-provider.test.ts
@@ -71,6 +71,28 @@ describe("vydra image-generation provider", () => {
    });
  });

+  it("rejects generated image downloads that exceed the configured media cap", async () => {
+    stubVydraApiKey();
+    stubFetch( // responses are listed in the order the test expects them to be consumed
+      jsonResponse({ // 1st response: an already-completed job that points at the image URL
+        jobId: "job-123",
+        status: "completed",
+        imageUrl: "https://cdn.vydra.ai/generated/test.png",
+      }),
+      binaryResponse("too-large", "image/png"), // 2nd response: asset body, presumably 9 bytes — above the 1-byte cap
+    );
+
+    const provider = buildVydraImageGenerationProvider();
+    await expect(
+      provider.generateImage({
+        provider: "vydra",
+        model: "grok-imagine",
+        prompt: "draw a cat",
+        cfg: { agents: { defaults: { mediaMaxMb: 0.000001 } } }, // 0.000001 * 1024 * 1024 floors to a 1-byte cap
+      }),
+    ).rejects.toThrow("Vydra image download exceeds 1 bytes"); // overflow error text for kind "image"
+  });
+
  it("passes request SSRF policy to the image creation request", async () => {
    stubVydraApiKey();
    const fetchMock = stubFetch(
--- a/extensions/vydra/image-generation-provider.ts
+++ b/extensions/vydra/image-generation-provider.ts
@@ -6,6 +6,7 @@ import {
  downloadVydraAsset,
  extractVydraResultUrls,
  resolveCompletedVydraPayload,
+  resolveVydraGeneratedMediaMaxBytes,
  resolveVydraResponseJobId,
  resolveVydraResponseStatus,
  resolveVydraRequestContext,
@@ -92,6 +93,7 @@ export function buildVydraImageGenerationProvider(): ImageGenerationProvider {
          kind: "image",
          timeoutMs: req.timeoutMs,
          fetchFn,
+          maxBytes: resolveVydraGeneratedMediaMaxBytes({ cfg: req.cfg, kind: "image" }),
        });
        return {
          images: [
--- a/extensions/vydra/shared.ts
+++ b/extensions/vydra/shared.ts
@@ -11,6 +11,7 @@ import {
  type ProviderOperationDeadline,
  type ProviderOperationTimeoutMs,
 } from "openclaw/plugin-sdk/provider-http";
+import { readResponseWithLimit } from "openclaw/plugin-sdk/response-limit-runtime";
 import {
  normalizeOptionalLowercaseString,
  normalizeOptionalString,
@@ -22,6 +23,9 @@ export const DEFAULT_VYDRA_VIDEO_MODEL = "veo3";
 export const DEFAULT_VYDRA_SPEECH_MODEL = "elevenlabs/tts";
 export const DEFAULT_VYDRA_VOICE_ID = "21m00Tcm4TlvDq8ikWAM";
 const DEFAULT_HTTP_TIMEOUT_MS = 120_000;
+const DEFAULT_GENERATED_IMAGE_MAX_BYTES = 6 * 1024 * 1024;
+const DEFAULT_GENERATED_AUDIO_MAX_BYTES = 16 * 1024 * 1024;
+const DEFAULT_GENERATED_VIDEO_MAX_BYTES = 16 * 1024 * 1024;
 const POLL_INTERVAL_MS = 2_500;
 const MAX_POLL_ATTEMPTS = 120;
 type VydraAuthStore = Parameters<typeof resolveApiKeyForProvider>[0]["store"];
@@ -210,27 +214,47 @@ function resolveVydraHttpTimeoutMs(timeoutMs: ProviderOperationTimeoutMs | undef
  return resolved;
 }

+export function resolveVydraGeneratedMediaMaxBytes(params: { // returns the byte cap for a generated-asset download
+  cfg: { agents?: { defaults?: { mediaMaxMb?: number } } }; // only the mediaMaxMb path is read
+  kind: VydraMediaKind; // picks the fallback cap when no usable override is configured
+}): number {
+  const configured = params.cfg.agents?.defaults?.mediaMaxMb; // optional override, scaled by 1024 * 1024 below
+  if (typeof configured === "number" && Number.isFinite(configured) && configured > 0) { // finite positive numbers only
+    return Math.floor(configured * 1024 * 1024); // NOTE(review): positive values under 1/1048576 floor to 0 — confirm a 0 limit is handled
+  }
+  if (params.kind === "image") {
+    return DEFAULT_GENERATED_IMAGE_MAX_BYTES; // image fallback
+  }
+  if (params.kind === "audio") {
+    return DEFAULT_GENERATED_AUDIO_MAX_BYTES; // audio fallback
+  }
+  return DEFAULT_GENERATED_VIDEO_MAX_BYTES; // reached for every kind that is neither "image" nor "audio"
+}
+
 export async function downloadVydraAsset(params: {
   url: string;
   kind: VydraMediaKind;
   timeoutMs?: ProviderOperationTimeoutMs;
   fetchFn: typeof fetch;
+  maxBytes: number; // upper bound, in bytes, passed to the capped body read below
 }): Promise<{ buffer: Buffer; mimeType: string; fileName: string }> {
-  const response = await fetchWithTimeout(
-    params.url,
-    { method: "GET" },
-    resolveVydraHttpTimeoutMs(params.timeoutMs),
-    params.fetchFn,
-  );
+  const timeoutMs = resolveVydraHttpTimeoutMs(params.timeoutMs); // resolved once; reused for the fetch and the body read
+  const response = await fetchWithTimeout(params.url, { method: "GET" }, timeoutMs, params.fetchFn);
   await assertOkOrThrowHttpError(response, `Vydra ${params.kind} download failed`);
   const mimeType =
     response.headers.get("content-type")?.trim() ||
     (params.kind === "image" ? "image/png" : params.kind === "audio" ? "audio/mpeg" : "video/mp4");
-  const arrayBuffer = await response.arrayBuffer();
+  const buffer = await readResponseWithLimit(response, params.maxBytes, { // capped read; replaces the unbounded arrayBuffer() call
+    chunkTimeoutMs: timeoutMs, // same resolved timeout that was given to the fetch
+    onOverflow: ({ maxBytes }) =>
+      new Error(`Vydra ${params.kind} download exceeds ${maxBytes} bytes`), // presumably surfaced when the body is larger than maxBytes
+    onIdleTimeout: ({ chunkTimeoutMs }) =>
+      new Error(`Vydra ${params.kind} download stalled after ${chunkTimeoutMs}ms`), // presumably surfaced when reading stalls — confirm
+  });
   const extension = resolveVydraFileExtension(params.kind, mimeType);
   const fileStem = params.kind === "image" ? "image" : params.kind === "audio" ? "audio" : "video";
   return {
-    buffer: Buffer.from(arrayBuffer),
+    buffer, // result of the capped read, returned without a Buffer.from wrapper
     mimeType,
     fileName: `${fileStem}-1.${extension}`,
   };
--- a/extensions/vydra/speech-provider.test.ts
+++ b/extensions/vydra/speech-provider.test.ts
@@ -69,4 +69,37 @@ describe("vydra speech provider", () => {
    expect(result.fileExtension).toBe(".mp3");
    expect(result.audioBuffer).toEqual(Buffer.from("mp3-data"));
  });
+
+  it("rejects generated audio downloads that exceed the configured media cap", async () => {
+    const fetchMock = vi
+      .fn()
+      .mockResolvedValueOnce( // 1st fetch: JSON pointing at the generated audio URL
+        new Response(
+          JSON.stringify({
+            audioUrl: "https://cdn.vydra.ai/generated/test.mp3",
+          }),
+          {
+            status: 200,
+            headers: { "Content-Type": "application/json" },
+          },
+        ),
+      )
+      .mockResolvedValueOnce( // 2nd fetch: binary body, presumably the audio asset download
+        new Response(Buffer.from("too-large"), { // 9-byte body, above the 1-byte cap set below
+          status: 200,
+          headers: { "Content-Type": "audio/mpeg" },
+        }),
+      );
+    vi.stubGlobal("fetch", fetchMock); // replace the global fetch with the mock
+
+    await expect(
+      provider.synthesize({
+        text: "OpenClaw test",
+        cfg: { agents: { defaults: { mediaMaxMb: 0.000001 } } } as never, // 0.000001 * 1024 * 1024 floors to a 1-byte cap
+        providerConfig: { apiKey: "vydra-test-key" },
+        target: "audio-file",
+        timeoutMs: 30_000,
+      }),
+    ).rejects.toThrow("Vydra audio download exceeds 1 bytes"); // overflow error text for kind "audio"
+  });
 });
--- a/extensions/vydra/speech-provider.ts
+++ b/extensions/vydra/speech-provider.ts
@@ -17,6 +17,7 @@ import {
  downloadVydraAsset,
  extractVydraResultUrls,
  normalizeVydraBaseUrl,
+  resolveVydraGeneratedMediaMaxBytes,
  trimToUndefined,
 } from "./shared.js";

@@ -137,6 +138,7 @@ export function buildVydraSpeechProvider(): SpeechProviderPlugin {
          kind: "audio",
          timeoutMs: req.timeoutMs,
          fetchFn,
+          maxBytes: resolveVydraGeneratedMediaMaxBytes({ cfg: req.cfg, kind: "audio" }),
        });
        return {
          audioBuffer: audio.buffer,
--- a/extensions/vydra/video-generation-provider.test.ts
+++ b/extensions/vydra/video-generation-provider.test.ts
@@ -72,6 +72,29 @@ describe("vydra video-generation provider", () => {
    });
  });

+  it("rejects generated video downloads that exceed the configured media cap", async () => {
+    stubVydraApiKey();
+    stubFetch( // responses are listed in the order the test expects them to be consumed
+      jsonResponse({ jobId: "job-123", status: "processing" }), // 1st response: job still processing
+      jsonResponse({ // 2nd response: completed job that points at the video URL
+        jobId: "job-123",
+        status: "completed",
+        videoUrl: "https://cdn.vydra.ai/generated/test.mp4",
+      }),
+      binaryResponse("too-large", "video/mp4"), // 3rd response: asset body, presumably 9 bytes — above the 1-byte cap
+    );
+
+    const provider = buildVydraVideoGenerationProvider();
+    await expect(
+      provider.generateVideo({
+        provider: "vydra",
+        model: "veo3",
+        prompt: "tiny city at sunrise",
+        cfg: { agents: { defaults: { mediaMaxMb: 0.000001 } } }, // 0.000001 * 1024 * 1024 floors to a 1-byte cap
+      }),
+    ).rejects.toThrow("Vydra video download exceeds 1 bytes"); // overflow error text for kind "video"
+  });
+
  it("requires a remote image url for kling", async () => {
    stubVydraApiKey();
    vi.stubGlobal("fetch", vi.fn());
--- a/extensions/vydra/video-generation-provider.ts
+++ b/extensions/vydra/video-generation-provider.ts
@@ -12,6 +12,7 @@ import {
  downloadVydraAsset,
  extractVydraResultUrls,
  resolveCompletedVydraPayload,
+  resolveVydraGeneratedMediaMaxBytes,
  resolveVydraResponseJobId,
  resolveVydraResponseStatus,
  resolveVydraRequestContext,
@@ -131,6 +132,7 @@ export function buildVydraVideoGenerationProvider(): VideoGenerationProvider {
            defaultTimeoutMs: DEFAULT_VYDRA_VIDEO_TIMEOUT_MS,
          }),
          fetchFn,
+          maxBytes: resolveVydraGeneratedMediaMaxBytes({ cfg: req.cfg, kind: "video" }),
        });
        return {
          videos: [