diff --git a/src/agents/tools/image-generate-tool.test.ts b/src/agents/tools/image-generate-tool.test.ts index 74e2f6782e1..d03a510e3fd 100644 --- a/src/agents/tools/image-generate-tool.test.ts +++ b/src/agents/tools/image-generate-tool.test.ts @@ -705,6 +705,26 @@ describe("createImageGenerateTool", () => { it("passes web_fetch SSRF policy to remote reference images", async () => { stubImageGenerationProviders(); stubEditedImageFlow({ width: 1024, height: 1024 }); + const defaultTool = requireImageGenerateTool( + createImageGenerateTool({ + config: { + agents: { + defaults: { imageGenerationModel: { primary: "google/gemini-3-pro-image-preview" } }, + }, + }, + workspaceDir: process.cwd(), + }), + ); + + await defaultTool.execute("call-edit-rfc2544-default", { + prompt: "Use this reference.", + image: "http://198.18.0.153/reference.png", + }); + expect(webMedia.loadWebMedia).toHaveBeenLastCalledWith( + "http://198.18.0.153/reference.png", + expect.not.objectContaining({ ssrfPolicy: expect.anything() }), + ); + const tool = requireImageGenerateTool( createImageGenerateTool({ config: { diff --git a/src/agents/tools/music-generate-tool.test.ts b/src/agents/tools/music-generate-tool.test.ts index aaa1c8313e1..3d508d59f4e 100644 --- a/src/agents/tools/music-generate-tool.test.ts +++ b/src/agents/tools/music-generate-tool.test.ts @@ -1,6 +1,7 @@ import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import type { OpenClawConfig } from "../../config/config.js"; import * as mediaStore from "../../media/store.js"; +import * as webMedia from "../../media/web-media.js"; import * as musicGenerationRuntime from "../../music-generation/runtime.js"; import * as musicGenerateBackground from "./music-generate-background.js"; import { createMusicGenerateTool } from "./music-generate-tool.js"; @@ -92,9 +93,15 @@ const musicGenerateBackgroundMocks = vi.hoisted(() => ({ vi.mock("../../config/config.js", () => configMocks); vi.mock("../../media/store.js", () => mediaStoreMocks); -vi.mock("../../media/web-media.js", () => ({ - loadWebMedia: vi.fn(), -})); +vi.mock("../../media/web-media.js", async () => { + const actual = await vi.importActual( + "../../media/web-media.js", + ); + return { + ...actual, + loadWebMedia: vi.fn(), + }; +}); vi.mock("../../music-generation/runtime.js", () => musicGenerationRuntimeMocks); vi.mock("./music-generate-background.js", () => musicGenerateBackgroundMocks); vi.mock("../../tasks/runtime-internal.js", () => taskRuntimeInternalMocks); @@ -509,4 +516,63 @@ describe("createMusicGenerateTool", () => { }, }); }); + + it("passes web_fetch SSRF policy when loading reference images", async () => { + vi.spyOn(musicGenerationRuntime, "listRuntimeMusicGenerationProviders").mockReturnValue([ + { + id: "minimax", + defaultModel: "music-2.5+", + models: ["music-2.5+"], + capabilities: { + edit: { enabled: true, maxInputImages: 1 }, + }, + generateMusic: vi.fn(async () => { + throw new Error("not used"); + }), + }, + ]); + vi.spyOn(webMedia, "loadWebMedia").mockResolvedValue({ + kind: "image", + buffer: Buffer.from("image"), + contentType: "image/png", + }); + vi.spyOn(musicGenerationRuntime, "generateMusic").mockResolvedValue({ + provider: "minimax", + model: "music-2.5+", + attempts: [], + ignoredOverrides: [], + tracks: [{ buffer: Buffer.from("music"), mimeType: "audio/mpeg" }], + }); + vi.spyOn(mediaStore, "saveMediaBuffer").mockResolvedValueOnce({ + path: "/tmp/generated-night-drive.mp3", + id: "generated-night-drive.mp3", + size: 11, + contentType: "audio/mpeg", + }); + const tool = createMusicGenerateTool({ + config: asConfig({ + agents: { + defaults: { + musicGenerationModel: { primary: "minimax/music-2.5+" }, + }, + }, + tools: { web: { fetch: { ssrfPolicy: { allowRfc2544BenchmarkRange: true } } } }, + }), + }); + if (!tool) { + throw new Error("expected music_generate tool"); + } + + await tool.execute("call-1", { + prompt: "night-drive synthwave", + image: "http://198.18.0.153/reference.png", + }); + + expect(webMedia.loadWebMedia).toHaveBeenCalledWith( + "http://198.18.0.153/reference.png", + expect.objectContaining({ + ssrfPolicy: { allowRfc2544BenchmarkRange: true }, + }), + ); + }); }); diff --git a/src/agents/tools/music-generate-tool.ts b/src/agents/tools/music-generate-tool.ts index b60b70e70cc..c6d2ebbf03e 100644 --- a/src/agents/tools/music-generate-tool.ts +++ b/src/agents/tools/music-generate-tool.ts @@ -2,6 +2,7 @@ import { Type } from "typebox"; import { loadConfig } from "../../config/config.js"; import type { OpenClawConfig } from "../../config/types.openclaw.js"; import { formatErrorMessage } from "../../infra/errors.js"; +import type { SsrFPolicy } from "../../infra/net/ssrf.js"; import { createSubsystemLogger } from "../../logging/subsystem.js"; import { resolveConfiguredMediaMaxBytes } from "../../media/configured-max-bytes.js"; import { @@ -236,6 +237,7 @@ async function loadReferenceImages(params: { inputs: string[]; workspaceDir?: string; sandboxConfig: { root: string; bridge: SandboxFsBridge; workspaceOnly: boolean } | null; + ssrfPolicy?: SsrFPolicy; }): Promise< Array<{ sourceImage: MusicGenerationSourceImage; @@ -303,6 +305,7 @@ async function loadReferenceImages(params: { }) : await loadWebMedia(resolvedPath ?? resolvedInput, { localRoots, + ssrfPolicy: params.ssrfPolicy, }); if (media.kind !== "image") { throw new ToolInputError(`Unsupported media type: ${media.kind ?? "unknown"}`); @@ -540,10 +543,12 @@ export function createMusicGenerateTool(options?: { musicGenerationModelConfig, modelOverride: model, }); + const remoteMediaSsrfPolicy = effectiveCfg.tools?.web?.fetch?.ssrfPolicy; const loadedReferenceImages = await loadReferenceImages({ inputs: imageInputs, workspaceDir: options?.workspaceDir, sandboxConfig, + ssrfPolicy: remoteMediaSsrfPolicy, }); validateMusicGenerationCapabilities({ provider: selectedProvider, diff --git a/src/agents/tools/video-generate-tool.test.ts b/src/agents/tools/video-generate-tool.test.ts index 841588cc6c1..e15879cff14 100644 --- a/src/agents/tools/video-generate-tool.test.ts +++ b/src/agents/tools/video-generate-tool.test.ts @@ -1,6 +1,7 @@ import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import type { OpenClawConfig } from "../../config/config.js"; import * as mediaStore from "../../media/store.js"; +import * as webMedia from "../../media/web-media.js"; import * as videoGenerationRuntime from "../../video-generation/runtime.js"; import * as videoGenerateBackground from "./video-generate-background.js"; import { createVideoGenerateTool } from "./video-generate-tool.js"; @@ -755,6 +756,43 @@ describe("createVideoGenerateTool", () => { expect(call.inputImages?.[1]?.role).toBe("last_frame"); }); + it("passes web_fetch SSRF policy when loading reference assets", async () => { + mockVideoPluginProvider({ + imageToVideo: { enabled: true, maxInputImages: 1 }, + }); + vi.spyOn(webMedia, "loadWebMedia").mockResolvedValue({ + kind: "image", + buffer: Buffer.from("image"), + contentType: "image/png", + }); + mockSavedVideoResult(); + const tool = createVideoGenerateTool({ + config: asConfig({ + agents: { + defaults: { + videoGenerationModel: { primary: "video-plugin/vid-v1" }, + }, + }, + tools: { web: { fetch: { ssrfPolicy: { allowRfc2544BenchmarkRange: true } } } }, + }), + }); + if (!tool) { + throw new Error("expected video_generate tool"); + } + + await tool.execute("call-1", { + prompt: "lobster", + image: "/tmp/reference.png", + }); + + expect(webMedia.loadWebMedia).toHaveBeenCalledWith( + "/tmp/reference.png", + expect.objectContaining({ + ssrfPolicy: { allowRfc2544BenchmarkRange: true }, + }), + ); + }); + it("rejects audio data: URLs via the templated rejection branch", async () => { mockVideoPluginProvider({ maxInputAudios: 1, diff --git a/src/agents/tools/video-generate-tool.ts b/src/agents/tools/video-generate-tool.ts index 173adf1bdea..5b8c7d9fc91 100644 --- a/src/agents/tools/video-generate-tool.ts +++ b/src/agents/tools/video-generate-tool.ts @@ -2,6 +2,7 @@ import { Type } from "typebox"; import { loadConfig } from "../../config/config.js"; import type { OpenClawConfig } from "../../config/types.openclaw.js"; import { formatErrorMessage } from "../../infra/errors.js"; +import type { SsrFPolicy } from "../../infra/net/ssrf.js"; import { createSubsystemLogger } from "../../logging/subsystem.js"; import { resolveConfiguredMediaMaxBytes } from "../../media/configured-max-bytes.js"; import { @@ -430,6 +431,7 @@ async function loadReferenceAssets(params: { maxBytes?: number; workspaceDir?: string; sandboxConfig: { root: string; bridge: SandboxFsBridge; workspaceOnly: boolean } | null; + ssrfPolicy?: SsrFPolicy; }): Promise< Array<{ sourceAsset: VideoGenerationSourceAsset; @@ -520,6 +522,7 @@ async function loadReferenceAssets(params: { : await loadWebMedia(resolvedPath ?? resolvedInput, { maxBytes: params.maxBytes, localRoots, + ssrfPolicy: params.ssrfPolicy, }); if (media.kind !== params.expectedKind) { throw new ToolInputError(`Unsupported media type: ${media.kind ?? "unknown"}`); @@ -810,6 +813,7 @@ export function createVideoGenerateTool(options?: { const action = resolveAction(args); const effectiveCfg = applyVideoGenerationModelConfigDefaults(cfg, videoGenerationModelConfig) ?? cfg; + const remoteMediaSsrfPolicy = effectiveCfg.tools?.web?.fetch?.ssrfPolicy; if (action === "list") { return createVideoGenerateListActionResult(effectiveCfg); @@ -900,6 +904,7 @@ export function createVideoGenerateTool(options?: { expectedKind: "image", workspaceDir: options?.workspaceDir, sandboxConfig, + ssrfPolicy: remoteMediaSsrfPolicy, }); // Attach roles to the loaded image assets (positional, by index into images[]). for (let i = 0; i < loadedReferenceImages.length; i++) { @@ -913,6 +918,7 @@ export function createVideoGenerateTool(options?: { expectedKind: "video", workspaceDir: options?.workspaceDir, sandboxConfig, + ssrfPolicy: remoteMediaSsrfPolicy, }); for (let i = 0; i < loadedReferenceVideos.length; i++) { const role = videoRoles[i]; @@ -925,6 +931,7 @@ export function createVideoGenerateTool(options?: { expectedKind: "audio", workspaceDir: options?.workspaceDir, sandboxConfig, + ssrfPolicy: remoteMediaSsrfPolicy, }); for (let i = 0; i < loadedReferenceAudios.length; i++) { const role = audioRoles[i];