From e0a2c568b28efd8b1606bc212d48743d334cbde9 Mon Sep 17 00:00:00 2001 From: xieyongliang Date: Sat, 11 Apr 2026 18:08:30 +0800 Subject: [PATCH] video_generate: support url-only delivery (#61988) (thanks @xieyongliang) (#61988) Co-authored-by: George Zhang --- CHANGELOG.md | 2 + .../.generated/plugin-sdk-api-baseline.sha256 | 4 +- src/agents/tools/video-generate-tool.test.ts | 69 +++++++++++--- src/agents/tools/video-generate-tool.ts | 48 ++++++++-- src/cli/capability-cli.test.ts | 89 ++++++++++++++++++- src/cli/capability-cli.ts | 60 ++++++++++--- src/plugin-sdk/video-generation.ts | 7 +- src/video-generation/runtime-types.ts | 1 + src/video-generation/runtime.test.ts | 24 +++++ src/video-generation/runtime.ts | 7 ++ src/video-generation/types.ts | 7 +- 11 files changed, 281 insertions(+), 37 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ad365feb08..f38844028a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ Docs: https://docs.openclaw.ai ### Changes +- Tools/video_generate: allow providers and plugins to return URL-only generated video assets so agent delivery and `openclaw capability video generate --output ...` can forward or stream large videos without requiring the full file in memory first. (#61988) Thanks @xieyongliang. + ### Fixes - WhatsApp: honor the configured default account when the active listener helper is used without an explicit account id, so named default accounts do not get registered under `default`. (#53918) Thanks @yhyatt. diff --git a/docs/.generated/plugin-sdk-api-baseline.sha256 b/docs/.generated/plugin-sdk-api-baseline.sha256 index 907eec06192..2cf48e08487 100644 --- a/docs/.generated/plugin-sdk-api-baseline.sha256 +++ b/docs/.generated/plugin-sdk-api-baseline.sha256 @@ -1,2 +1,2 @@ -7a9bb7a5e4b243e2123af94301ba363d57eddab2baa6378d16cd37a1cb8a55f7 plugin-sdk-api-baseline.json -2bdca027d5fda72399479569927cd34d18b56b242e4b12ac45e7c2352e551c77 plugin-sdk-api-baseline.jsonl +7a5c71593c9efbb936b9632f0b381a6c603e9bce44706b312a0172504fa51ef6 plugin-sdk-api-baseline.json +0b044de57266d20561838a5ae0edbaacaa53b323d4c8c068e701a48f92f0a264 plugin-sdk-api-baseline.jsonl diff --git a/src/agents/tools/video-generate-tool.test.ts b/src/agents/tools/video-generate-tool.test.ts index 931568a93a2..54c5c2167da 100644 --- a/src/agents/tools/video-generate-tool.test.ts +++ b/src/agents/tools/video-generate-tool.test.ts @@ -127,7 +127,55 @@ describe("createVideoGenerateTool", () => { expect(taskExecutorMocks.completeTaskRunByRunId).not.toHaveBeenCalled(); }); - it("starts background generation and wakes the session with MEDIA lines", async () => { + it("surfaces url-only generated videos without saving local files", async () => { + vi.spyOn(videoGenerationRuntime, "generateVideo").mockResolvedValue({ + provider: "vydra", + model: "veo3", + attempts: [], + ignoredOverrides: [], + videos: [ + { + url: "https://example.com/generated-lobster.mp4", + mimeType: "video/mp4", + fileName: "lobster.mp4", + }, + ], + metadata: { taskId: "task-1" }, + }); + const saveSpy = vi.spyOn(mediaStore, "saveMediaBuffer"); + + const tool = createVideoGenerateTool({ + config: asConfig({ + agents: { + defaults: { + videoGenerationModel: { primary: "vydra/veo3" }, + }, + }, + }), + }); + if (!tool) { + throw new Error("expected video_generate tool"); + } + + const result = await tool.execute("call-url", { prompt: "friendly lobster surfing" }); + const text = (result.content?.[0] as { text: string } | undefined)?.text ?? ""; + + expect(saveSpy).not.toHaveBeenCalled(); + expect(text).toContain("Generated 1 video with vydra/veo3."); + expect(text).toContain("MEDIA:https://example.com/generated-lobster.mp4"); + expect(result.details).toMatchObject({ + provider: "vydra", + model: "veo3", + count: 1, + media: { + mediaUrls: ["https://example.com/generated-lobster.mp4"], + }, + paths: ["https://example.com/generated-lobster.mp4"], + metadata: { taskId: "task-1" }, + }); + }); + + it("starts background generation and wakes the session with url-only MEDIA lines", async () => { taskExecutorMocks.createRunningTaskRun.mockReturnValue({ taskId: "task-123", runtime: "cli", @@ -143,33 +191,28 @@ describe("createVideoGenerateTool", () => { const wakeSpy = vi .spyOn(videoGenerateBackground, "wakeVideoGenerationTaskCompletion") .mockResolvedValue(undefined); + const saveSpy = vi.spyOn(mediaStore, "saveMediaBuffer"); vi.spyOn(videoGenerationRuntime, "generateVideo").mockResolvedValue({ - provider: "qwen", - model: "wan2.6-t2v", + provider: "vydra", + model: "veo3", attempts: [], ignoredOverrides: [], videos: [ { - buffer: Buffer.from("video-bytes"), + url: "https://example.com/generated-lobster.mp4", mimeType: "video/mp4", fileName: "lobster.mp4", }, ], metadata: { taskId: "task-1" }, }); - vi.spyOn(mediaStore, "saveMediaBuffer").mockResolvedValueOnce({ - path: "/tmp/generated-lobster.mp4", - id: "generated-lobster.mp4", - size: 11, - contentType: "video/mp4", - }); let scheduledWork: (() => Promise) | undefined; const tool = createVideoGenerateTool({ config: asConfig({ agents: { defaults: { - videoGenerationModel: { primary: "qwen/wan2.6-t2v" }, + videoGenerationModel: { primary: "vydra/veo3" }, }, }, }), @@ -200,6 +243,7 @@ describe("createVideoGenerateTool", () => { }); expect(typeof scheduledWork).toBe("function"); await scheduledWork?.(); + expect(saveSpy).not.toHaveBeenCalled(); expect(taskExecutorMocks.recordTaskRunProgressByRunId).toHaveBeenCalledWith( expect.objectContaining({ runId: expect.stringMatching(/^tool:video_generate:/), @@ -217,7 +261,8 @@ describe("createVideoGenerateTool", () => { taskId: "task-123", }), status: "ok", - result: expect.stringContaining("MEDIA:/tmp/generated-lobster.mp4"), + mediaUrls: ["https://example.com/generated-lobster.mp4"], + result: expect.stringContaining("MEDIA:https://example.com/generated-lobster.mp4"), }), ); }); diff --git a/src/agents/tools/video-generate-tool.ts b/src/agents/tools/video-generate-tool.ts index df5bde4551d..6d35a16c08e 100644 --- a/src/agents/tools/video-generate-tool.ts +++ b/src/agents/tools/video-generate-tool.ts @@ -535,6 +535,10 @@ type ExecutedVideoGeneration = { provider: string; model: string; savedPaths: string[]; + /** URLs of url-only assets that were not saved locally. */ + urlOnlyUrls: string[]; + /** Total generated video count, including url-only assets. */ + count: number; contentText: string; details: Record; wakeResult: string; @@ -587,8 +591,28 @@ async function executeVideoGenerationJob(params: { }); } + const urlOnlyVideos: Array<{ url: string; mimeType: string; fileName?: string }> = []; + const bufferVideos: Array<(typeof result.videos)[number] & { buffer: Buffer }> = []; + for (const video of result.videos) { + if (video.buffer) { + bufferVideos.push(video as (typeof result.videos)[number] & { buffer: Buffer }); + continue; + } + if (video.url) { + urlOnlyVideos.push({ + url: video.url, + mimeType: video.mimeType, + fileName: video.fileName, + }); + continue; + } + throw new Error( + `Provider ${result.provider} returned a video asset with neither buffer nor url — cannot deliver.`, + ); + } + const savedVideos = await Promise.all( - result.videos.map((video) => + bufferVideos.map((video) => saveMediaBuffer( video.buffer, video.mimeType, @@ -598,6 +622,7 @@ async function executeVideoGenerationJob(params: { ), ), ); + const totalCount = savedVideos.length + urlOnlyVideos.length; const requestedDurationSeconds = result.normalization?.durationSeconds?.requested ?? (typeof result.metadata?.requestedDurationSeconds === "number" && @@ -646,8 +671,12 @@ async function executeVideoGenerationJob(params: { typeof result.metadata?.requestedSize === "string" && result.metadata.requestedSize === params.size && Boolean(normalizedAspectRatio)); + const allMediaUrls = [ + ...savedVideos.map((video) => video.path), + ...urlOnlyVideos.map((video) => video.url), + ]; const lines = [ - `Generated ${savedVideos.length} video${savedVideos.length === 1 ? "" : "s"} with ${result.provider}/${result.model}.`, + `Generated ${totalCount} video${totalCount === 1 ? "" : "s"} with ${result.provider}/${result.model}.`, ...(warning ? [`Warning: ${warning}`] : []), typeof requestedDurationSeconds === "number" && typeof normalizedDurationSeconds === "number" && @@ -655,22 +684,25 @@ async function executeVideoGenerationJob(params: { ? `Duration normalized: requested ${requestedDurationSeconds}s; used ${normalizedDurationSeconds}s.` : null, ...savedVideos.map((video) => `MEDIA:${video.path}`), + ...urlOnlyVideos.map((video) => `MEDIA:${video.url}`), ].filter((entry): entry is string => Boolean(entry)); return { provider: result.provider, model: result.model, savedPaths: savedVideos.map((video) => video.path), + urlOnlyUrls: urlOnlyVideos.map((video) => video.url), + count: totalCount, contentText: lines.join("\n"), wakeResult: lines.join("\n"), details: { provider: result.provider, model: result.model, - count: savedVideos.length, + count: totalCount, media: { - mediaUrls: savedVideos.map((video) => video.path), + mediaUrls: allMediaUrls, }, - paths: savedVideos.map((video) => video.path), + paths: allMediaUrls, ...buildTaskRunDetails(params.taskHandle), ...buildMediaReferenceDetails({ entries: params.loadedReferenceImages, @@ -931,7 +963,7 @@ export function createVideoGenerateTool(options?: { handle: taskHandle, provider: executed.provider, model: executed.model, - count: executed.savedPaths.length, + count: executed.count, paths: executed.savedPaths, }); try { @@ -941,7 +973,7 @@ export function createVideoGenerateTool(options?: { status: "ok", statusLabel: "completed successfully", result: executed.wakeResult, - mediaUrls: executed.savedPaths, + mediaUrls: [...executed.savedPaths, ...executed.urlOnlyUrls], }); } catch (error) { log.warn("Video generation completion wake failed after successful generation", { @@ -1025,7 +1057,7 @@ export function createVideoGenerateTool(options?: { handle: taskHandle, provider: executed.provider, model: executed.model, - count: executed.savedPaths.length, + count: executed.count, paths: executed.savedPaths, }); diff --git a/src/cli/capability-cli.test.ts b/src/cli/capability-cli.test.ts index fea7468a730..4add51dad14 100644 --- a/src/cli/capability-cli.test.ts +++ b/src/cli/capability-cli.test.ts @@ -2,7 +2,7 @@ import fs from "node:fs/promises"; import os from "node:os"; import path from "node:path"; import { Command } from "commander"; -import { beforeEach, describe, expect, it, vi } from "vitest"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { runRegisteredCli } from "../test-utils/command-runner.js"; import { registerCapabilityCli } from "./capability-cli.js"; @@ -58,6 +58,7 @@ const mocks = vi.hoisted(() => ({ model: "gpt-4.1-mini", })), generateImage: vi.fn(), + generateVideo: vi.fn(), transcribeAudioFile: vi.fn(async () => ({ text: "meeting notes" })), textToSpeech: vi.fn(async () => ({ success: true, @@ -202,7 +203,7 @@ vi.mock("../image-generation/runtime.js", () => ({ })); vi.mock("../video-generation/runtime.js", () => ({ - generateVideo: vi.fn(), + generateVideo: mocks.generateVideo, listRuntimeVideoGenerationProviders: vi.fn(() => []), })); @@ -238,6 +239,10 @@ vi.mock("../web-fetch/runtime.js", () => ({ })); describe("capability cli", () => { + afterEach(() => { + vi.unstubAllGlobals(); + }); + beforeEach(() => { mocks.runtime.log.mockClear(); mocks.runtime.error.mockClear(); @@ -278,6 +283,7 @@ describe("capability cli", () => { }) as never); mocks.describeImageFile.mockClear(); mocks.generateImage.mockReset(); + mocks.generateVideo.mockReset(); mocks.transcribeAudioFile.mockClear(); mocks.textToSpeech.mockClear(); mocks.setTtsProvider.mockClear(); @@ -434,6 +440,85 @@ describe("capability cli", () => { ); }); + it("streams url-only generated videos to --output paths", async () => { + mocks.generateVideo.mockResolvedValue({ + provider: "vydra", + model: "veo3", + attempts: [], + videos: [ + { + url: "https://example.com/generated-video.mp4", + mimeType: "video/mp4", + fileName: "provider-name.mp4", + }, + ], + }); + const fetchMock = vi.fn( + async () => + new Response(Buffer.from("video-bytes"), { + status: 200, + headers: { "content-type": "video/mp4" }, + }), + ); + vi.stubGlobal("fetch", fetchMock); + + const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-video-generate-")); + const outputBase = path.join(tempDir, "result"); + + await runRegisteredCli({ + register: registerCapabilityCli as (program: Command) => void, + argv: [ + "capability", + "video", + "generate", + "--prompt", + "friendly lobster", + "--output", + outputBase, + "--json", + ], + }); + + const outputPath = `${outputBase}.mp4`; + expect(fetchMock).toHaveBeenCalledWith( + "https://example.com/generated-video.mp4", + expect.objectContaining({ signal: expect.any(AbortSignal) }), + ); + expect(await fs.readFile(outputPath, "utf8")).toBe("video-bytes"); + expect(mocks.runtime.writeJson).toHaveBeenCalledWith( + expect.objectContaining({ + capability: "video.generate", + provider: "vydra", + outputs: [ + expect.objectContaining({ + path: outputPath, + mimeType: "video/mp4", + size: 11, + }), + ], + }), + ); + }); + + it("fails video generate when a provider returns an undeliverable asset", async () => { + mocks.generateVideo.mockResolvedValue({ + provider: "vydra", + model: "veo3", + attempts: [], + videos: [{ mimeType: "video/mp4" }], + }); + + await expect( + runRegisteredCli({ + register: registerCapabilityCli as (program: Command) => void, + argv: ["capability", "video", "generate", "--prompt", "friendly lobster", "--json"], + }), + ).rejects.toThrow("exit 1"); + expect(mocks.runtime.error).toHaveBeenCalledWith( + expect.stringContaining("Video asset at index 0 has neither buffer nor url"), + ); + }); + it("routes audio transcribe through transcription, not realtime", async () => { await runRegisteredCli({ register: registerCapabilityCli as (program: Command) => void, diff --git a/src/cli/capability-cli.ts b/src/cli/capability-cli.ts index 872500e34a9..cdb291a85fb 100644 --- a/src/cli/capability-cli.ts +++ b/src/cli/capability-cli.ts @@ -815,17 +815,55 @@ async function runVideoGenerate(params: { prompt: string; model?: string; output modelOverride: params.model, }); const outputs = await Promise.all( - result.videos.map(async (video, index) => ({ - ...(await writeOutputAsset({ - buffer: video.buffer, - mimeType: video.mimeType, - originalFilename: video.fileName, - outputPath: params.output, - outputIndex: index, - outputCount: result.videos.length, - subdir: "generated", - })), - })), + result.videos.map(async (video, index) => { + if (!video.buffer && !video.url) { + throw new Error(`Video asset at index ${index} has neither buffer nor url`); + } + + let videoBuffer = video.buffer; + if (!videoBuffer && video.url) { + const response = await fetch(video.url, { signal: AbortSignal.timeout(120_000) }); + if (!response.ok) { + throw new Error(`Failed to download video from ${video.url}: ${response.status}`); + } + if (params.output && response.body) { + const { pipeline } = await import("node:stream/promises"); + const { Readable } = await import("node:stream"); + const { createWriteStream } = await import("node:fs"); + const mimeType = normalizeMimeType(video.mimeType); + const ext = + extensionForMime(mimeType) || + path.extname(video.fileName ?? "") || + path.extname(params.output ?? ""); + const resolvedOutput = path.resolve(params.output); + const parsed = path.parse(resolvedOutput); + const filePath = + result.videos.length <= 1 + ? path.join(parsed.dir, `${parsed.name}${ext}`) + : path.join(parsed.dir, `${parsed.name}-${String(index + 1)}${ext}`); + await fs.mkdir(path.dirname(filePath), { recursive: true }); + await pipeline( + Readable.fromWeb(response.body as import("node:stream/web").ReadableStream), + createWriteStream(filePath), + ); + const stat = await fs.stat(filePath); + return { path: filePath, mimeType: video.mimeType, size: stat.size }; + } + videoBuffer = Buffer.from(await response.arrayBuffer()); + } + + return { + ...(await writeOutputAsset({ + buffer: videoBuffer!, + mimeType: video.mimeType, + originalFilename: video.fileName, + outputPath: params.output, + outputIndex: index, + outputCount: result.videos.length, + subdir: "generated", + })), + }; + }), ); return { ok: true, diff --git a/src/plugin-sdk/video-generation.ts b/src/plugin-sdk/video-generation.ts index 0e537c7dd91..b008075ef72 100644 --- a/src/plugin-sdk/video-generation.ts +++ b/src/plugin-sdk/video-generation.ts @@ -22,7 +22,12 @@ import type { } from "../video-generation/types.js"; export type GeneratedVideoAsset = { - buffer: Buffer; + /** Raw video bytes. Either buffer or url must be present. */ + buffer?: Buffer; + /** Pre-signed or provider-hosted URL for the video. When set and buffer is + * absent, callers can deliver or download the asset without requiring the + * provider to materialize the full file in memory first. */ + url?: string; mimeType: string; fileName?: string; metadata?: Record; diff --git a/src/video-generation/runtime-types.ts b/src/video-generation/runtime-types.ts index a6a3e388d86..886c948cb01 100644 --- a/src/video-generation/runtime-types.ts +++ b/src/video-generation/runtime-types.ts @@ -25,6 +25,7 @@ export type GenerateVideoParams = { inputImages?: VideoGenerationSourceAsset[]; inputVideos?: VideoGenerationSourceAsset[]; inputAudios?: VideoGenerationSourceAsset[]; + /** Arbitrary provider-specific options forwarded as-is to provider.generateVideo. */ providerOptions?: Record; }; diff --git a/src/video-generation/runtime.test.ts b/src/video-generation/runtime.test.ts index ad7b498f9db..f08a4ff3923 100644 --- a/src/video-generation/runtime.test.ts +++ b/src/video-generation/runtime.test.ts @@ -517,6 +517,30 @@ describe("video-generation runtime", () => { ).rejects.toThrow(/supports at most 4s per video, 6s requested/); }); + it("rejects provider results that contain undeliverable assets", async () => { + mocks.resolveAgentModelPrimaryValue.mockReturnValue("video-plugin/vid-v1"); + mocks.getVideoGenerationProvider.mockReturnValue({ + id: "video-plugin", + capabilities: {}, + generateVideo: async () => ({ + videos: [{ mimeType: "video/mp4" }], + }), + }); + + await expect( + generateVideo({ + cfg: { + agents: { + defaults: { + videoGenerationModel: { primary: "video-plugin/vid-v1" }, + }, + }, + } as OpenClawConfig, + prompt: "animate a cat", + }), + ).rejects.toThrow(/neither buffer nor url is set/); + }); + it("lists runtime video-generation providers through the provider registry", () => { const providers: VideoGenerationProvider[] = [ { diff --git a/src/video-generation/runtime.ts b/src/video-generation/runtime.ts index fd6453cdbc4..da4f4888510 100644 --- a/src/video-generation/runtime.ts +++ b/src/video-generation/runtime.ts @@ -265,6 +265,13 @@ export async function generateVideo( if (!Array.isArray(result.videos) || result.videos.length === 0) { throw new Error("Video generation provider returned no videos."); } + for (const [index, video] of result.videos.entries()) { + if (!video.buffer && !video.url) { + throw new Error( + `Video generation provider returned an undeliverable asset at index ${index}: neither buffer nor url is set.`, + ); + } + } return { videos: result.videos, provider: candidate.provider, diff --git a/src/video-generation/types.ts b/src/video-generation/types.ts index 3246f2f0769..ccffab908c2 100644 --- a/src/video-generation/types.ts +++ b/src/video-generation/types.ts @@ -3,7 +3,12 @@ import type { OpenClawConfig } from "../config/types.openclaw.js"; import type { MediaNormalizationEntry } from "../media-generation/normalization.types.js"; export type GeneratedVideoAsset = { - buffer: Buffer; + /** Raw video bytes. Required for local delivery; omit when url is provided instead. */ + buffer?: Buffer; + /** External URL for the video (for example a pre-signed cloud storage URL). + * When set and buffer is absent, delivery surfaces can forward the URL + * without downloading the full video into memory first. */ + url?: string; mimeType: string; fileName?: string; metadata?: Record;