From 177167c846efff606219158a940edab3d3d60cb3 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Mon, 4 May 2026 22:26:10 -0700 Subject: [PATCH] fix(video): recover generation parameter fallbacks --- CHANGELOG.md | 1 + docs/tools/video-generation.md | 4 +- .../google/generation-provider-metadata.ts | 6 +- .../google/video-generation-provider.test.ts | 2 +- .../google/video-generation-provider.ts | 5 -- .../minimax/video-generation-provider.test.ts | 2 + .../minimax/video-generation-provider.ts | 53 +++++++++++- src/agents/tools/video-generate-tool.test.ts | 17 ++-- src/agents/tools/video-generate-tool.ts | 46 ++-------- src/plugin-sdk/video-generation.ts | 2 +- src/video-generation/normalization.ts | 10 +++ src/video-generation/runtime.test.ts | 85 +++++++++++++++++++ src/video-generation/types.ts | 2 +- 13 files changed, 177 insertions(+), 58 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 935865e0301..d1d1f9f3f90 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -68,6 +68,7 @@ Docs: https://docs.openclaw.ai - Update/restart: probe managed Gateway restarts with the service environment and add a Docker product lane that exercises candidate-owned `openclaw update --yes --json` restarts, so SecretRef-backed local gateway auth cannot regress behind mocked restart checks. Thanks @vincentkoc. - Webhooks/Gmail/Windows: resolve `gcloud`, `gog`, and `tailscale` PATH/PATHEXT shims before setup and watcher spawns, using the Windows-safe `.cmd` wrapper for long-lived `gog serve` processes. (#74881, fixes #54470) Thanks @Angfr95. +- Video generation: accept provider-specific aspect-ratio and resolution hints at the tool boundary, normalize `720P` to MiniMax's supported `768P`, and stop sending Google `generateAudio` on Gemini video requests so provider fallback can recover from model-specific parameter differences. Thanks @vincentkoc. - Plugins/install: honor the beta update channel for onboarding and doctor-managed plugin installs by requesting floating npm and ClawHub specs with `@beta` while keeping persistent install records on the catalog default. Thanks @vincentkoc. - WhatsApp/onboarding: canonicalize setup and pairing allowlist entries to WhatsApp's digit-only phone ids while still accepting E.164, JID, and `whatsapp:` inputs, so personal-phone allowlists match WhatsApp Web sender ids after setup. Thanks @vincentkoc. - Gateway/startup: load provider plugins that own explicitly configured image, video, or music generation defaults so generation tools become live after gateway restart instead of remaining catalog-only. Fixes #77244. Thanks @buyuangtampan, @Nikoxx99, and @vincentkoc. diff --git a/docs/tools/video-generation.md b/docs/tools/video-generation.md index 4f6ab8f12a4..a70299e81e6 100644 --- a/docs/tools/video-generation.md +++ b/docs/tools/video-generation.md @@ -198,9 +198,9 @@ role or use `first_frame` for single-image image-to-video. ### Style controls - `1:1`, `2:3`, `3:2`, `3:4`, `4:3`, `4:5`, `5:4`, `9:16`, `16:9`, `21:9`, or `adaptive`. + Aspect-ratio hint such as `1:1`, `16:9`, `9:16`, `adaptive`, or a provider-specific value. OpenClaw normalizes or ignores unsupported values per provider. -`480P`, `720P`, `768P`, or `1080P`. +Resolution hint such as `480P`, `720P`, `768P`, `1080P`, `4K`, or a provider-specific value. OpenClaw normalizes or ignores unsupported values per provider. Target duration in seconds (rounded to nearest provider-supported value). diff --git a/extensions/google/generation-provider-metadata.ts b/extensions/google/generation-provider-metadata.ts index ea942815981..9d2ac74218e 100644 --- a/extensions/google/generation-provider-metadata.ts +++ b/extensions/google/generation-provider-metadata.ts @@ -88,7 +88,7 @@ export function createGoogleVideoGenerationProviderMetadata(): Omit< supportsAspectRatio: true, supportsResolution: true, supportsSize: true, - supportsAudio: true, + supportsAudio: false, }, imageToVideo: { enabled: true, @@ -101,7 +101,7 @@ export function createGoogleVideoGenerationProviderMetadata(): Omit< supportsAspectRatio: true, supportsResolution: true, supportsSize: true, - supportsAudio: true, + supportsAudio: false, }, videoToVideo: { enabled: true, @@ -114,7 +114,7 @@ export function createGoogleVideoGenerationProviderMetadata(): Omit< supportsAspectRatio: true, supportsResolution: true, supportsSize: true, - supportsAudio: true, + supportsAudio: false, }, }, }; diff --git a/extensions/google/video-generation-provider.test.ts b/extensions/google/video-generation-provider.test.ts index 3db4de23aab..b2cede22a39 100644 --- a/extensions/google/video-generation-provider.test.ts +++ b/extensions/google/video-generation-provider.test.ts @@ -86,11 +86,11 @@ describe("google video generation provider", () => { durationSeconds: 4, aspectRatio: "16:9", resolution: "720p", - generateAudio: true, }), }), ); expect(request?.config).not.toHaveProperty("numberOfVideos"); + expect(request?.config).not.toHaveProperty("generateAudio"); expect(result.videos).toHaveLength(1); expect(result.videos[0]?.mimeType).toBe("video/mp4"); expect(createGoogleGenAIMock).toHaveBeenCalledWith( diff --git a/extensions/google/video-generation-provider.ts b/extensions/google/video-generation-provider.ts index b4c6fd104bc..daf78c9e2bc 100644 --- a/extensions/google/video-generation-provider.ts +++ b/extensions/google/video-generation-provider.ts @@ -322,7 +322,6 @@ async function generateGoogleVideoViaRest(params: { durationSeconds?: number; aspectRatio?: "16:9" | "9:16"; resolution?: "720p" | "1080p"; - audio?: boolean; }): Promise { let operation = await requestGoogleVideoJson({ url: `${params.baseUrl}/${resolveGoogleVideoRestModelPath(params.model)}:predictLongRunning`, @@ -337,7 +336,6 @@ async function generateGoogleVideoViaRest(params: { : {}), ...(params.aspectRatio ? { aspectRatio: params.aspectRatio } : {}), ...(params.resolution ? { resolution: params.resolution } : {}), - ...(params.audio === true ? { generateAudio: true } : {}), }, }, }); @@ -429,7 +427,6 @@ export function buildGoogleVideoGenerationProvider(): VideoGenerationProvider { ...(typeof durationSeconds === "number" ? { durationSeconds } : {}), ...(aspectRatio ? { aspectRatio } : {}), ...(resolution ? { resolution } : {}), - ...(req.audio === true ? { generateAudio: true } : {}), }, }); } catch (error) { @@ -446,7 +443,6 @@ export function buildGoogleVideoGenerationProvider(): VideoGenerationProvider { durationSeconds, aspectRatio, resolution, - audio: req.audio, }); } @@ -480,7 +476,6 @@ export function buildGoogleVideoGenerationProvider(): VideoGenerationProvider { durationSeconds, aspectRatio, resolution, - audio: req.audio, }); generatedVideos = extractGeneratedVideos(operation); } diff --git a/extensions/minimax/video-generation-provider.test.ts b/extensions/minimax/video-generation-provider.test.ts index ff71b0fca1c..2f717a23bf1 100644 --- a/extensions/minimax/video-generation-provider.test.ts +++ b/extensions/minimax/video-generation-provider.test.ts @@ -64,6 +64,7 @@ describe("minimax video generation provider", () => { prompt: "A fox sprints across snowy hills", cfg: {}, durationSeconds: 5, + resolution: "720P", }); expect(postJsonRequestMock).toHaveBeenCalledWith( @@ -71,6 +72,7 @@ describe("minimax video generation provider", () => { url: "https://api.minimax.io/v1/video_generation", body: expect.objectContaining({ duration: 6, + resolution: "768P", }), }), ); diff --git a/extensions/minimax/video-generation-provider.ts b/extensions/minimax/video-generation-provider.ts index 88181dc5b87..27cb6e24e74 100644 --- a/extensions/minimax/video-generation-provider.ts +++ b/extensions/minimax/video-generation-provider.ts @@ -25,6 +25,12 @@ const MINIMAX_MODEL_ALLOWED_DURATIONS: Readonly> = { + "MiniMax-Hailuo-2.3": ["768P", "1080P"], + "MiniMax-Hailuo-2.3-Fast": ["768P", "1080P"], + "MiniMax-Hailuo-02": ["768P", "1080P"], +}; +const MINIMAX_RESOLUTION_ORDER = ["480P", "720P", "768P", "1080P"] as const; type MinimaxBaseResp = { status_code?: number; @@ -112,6 +118,43 @@ function resolveDurationSeconds(params: { ); } +function resolveResolution(params: { + model: string; + resolution: string | undefined; +}): string | undefined { + const requested = normalizeOptionalString(params.resolution)?.toUpperCase(); + if (!requested) { + return undefined; + } + const allowed = MINIMAX_MODEL_ALLOWED_RESOLUTIONS[params.model]; + if (!allowed || allowed.length === 0 || allowed.includes(requested)) { + return requested; + } + const requestedIndex = MINIMAX_RESOLUTION_ORDER.indexOf( + requested as (typeof MINIMAX_RESOLUTION_ORDER)[number], + ); + if (requestedIndex < 0) { + return undefined; + } + return allowed.reduce((best, current) => { + const currentIndex = MINIMAX_RESOLUTION_ORDER.indexOf( + current as (typeof MINIMAX_RESOLUTION_ORDER)[number], + ); + const bestIndex = MINIMAX_RESOLUTION_ORDER.indexOf( + best as (typeof MINIMAX_RESOLUTION_ORDER)[number], + ); + if (currentIndex < 0) { + return best; + } + if (bestIndex < 0) { + return current; + } + return Math.abs(currentIndex - requestedIndex) < Math.abs(bestIndex - requestedIndex) + ? current + : best; + }); +} + async function pollMinimaxVideo(params: { taskId: string; headers: Headers; @@ -246,6 +289,7 @@ function buildMinimaxVideoProvider(providerId: string): VideoGenerationProvider maxVideos: 1, maxDurationSeconds: 10, supportedDurationSecondsByModel: MINIMAX_MODEL_ALLOWED_DURATIONS, + resolutions: ["768P", "1080P"], supportsResolution: true, supportsWatermark: false, }, @@ -255,6 +299,7 @@ function buildMinimaxVideoProvider(providerId: string): VideoGenerationProvider maxInputImages: 1, maxDurationSeconds: 10, supportedDurationSecondsByModel: MINIMAX_MODEL_ALLOWED_DURATIONS, + resolutions: ["768P", "1080P"], supportsResolution: true, supportsWatermark: false, }, @@ -303,8 +348,12 @@ function buildMinimaxVideoProvider(providerId: string): VideoGenerationProvider if (firstFrameImage) { body.first_frame_image = firstFrameImage; } - if (req.resolution) { - body.resolution = req.resolution; + const resolution = resolveResolution({ + model, + resolution: req.resolution, + }); + if (resolution) { + body.resolution = resolution; } const durationSeconds = resolveDurationSeconds({ model, diff --git a/src/agents/tools/video-generate-tool.test.ts b/src/agents/tools/video-generate-tool.test.ts index b34dd3a622e..0f6f6051c2e 100644 --- a/src/agents/tools/video-generate-tool.test.ts +++ b/src/agents/tools/video-generate-tool.test.ts @@ -1073,17 +1073,22 @@ describe("createVideoGenerateTool", () => { expect(generateSpy).toHaveBeenCalledWith(expect.objectContaining({ aspectRatio: "adaptive" })); }); - it("rejects unsupported aspectRatio values", async () => { + it("accepts provider-specific aspectRatio and resolution values and forwards them to the runtime", async () => { mockVideoPluginProvider(); + const generateSpy = mockSavedVideoResult(); const tool = createVideoPluginTool(); - await expect( - tool.execute("call-1", { - prompt: "lobster", + await tool.execute("call-1", { + prompt: "lobster", + aspectRatio: "17:9", + resolution: "draft-large", + }); + + expect(generateSpy).toHaveBeenCalledWith( + expect.objectContaining({ aspectRatio: "17:9", + resolution: "draft-large", }), - ).rejects.toThrow( - "aspectRatio must be one of 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, 21:9, or adaptive", ); }); }); diff --git a/src/agents/tools/video-generate-tool.ts b/src/agents/tools/video-generate-tool.ts index b32c774625a..b6ca6cbebb2 100644 --- a/src/agents/tools/video-generate-tool.ts +++ b/src/agents/tools/video-generate-tool.ts @@ -77,26 +77,6 @@ const log = createSubsystemLogger("agents/tools/video-generate"); const MAX_INPUT_IMAGES = 9; const MAX_INPUT_VIDEOS = 4; const MAX_INPUT_AUDIOS = 3; -const SUPPORTED_ASPECT_RATIOS = new Set([ - "1:1", - "2:3", - "3:2", - "3:4", - "4:3", - "4:5", - "5:4", - "9:16", - "16:9", - "21:9", - // Provider-specific sentinel: accepted at the tool boundary, then forwarded - // to the active provider only if that provider declares "adaptive" in its - // capabilities.aspectRatios list. Providers that do not declare it see the - // value pushed into `ignoredOverrides` in the normalization layer so the - // tool surfaces a user-visible "ignored override" warning rather than - // silently dropping the request. Seedance uses this to auto-detect the - // ratio from input image dimensions. - "adaptive", -]); const VideoGenerateToolSchema = Type.Object({ action: Type.Optional( @@ -184,12 +164,13 @@ const VideoGenerateToolSchema = Type.Object({ aspectRatio: Type.Optional( Type.String({ description: - 'Optional aspect ratio hint: 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, 21:9, or "adaptive".', + 'Optional aspect ratio hint such as 1:1, 16:9, 9:16, "adaptive", or a provider-specific value. OpenClaw normalizes or ignores unsupported values per provider.', }), ), resolution: Type.Optional( Type.String({ - description: "Optional resolution hint: 480P, 720P, 768P, or 1080P.", + description: + "Optional resolution hint such as 480P, 720P, 768P, 1080P, 4K, or a provider-specific value. OpenClaw normalizes or ignores unsupported values per provider.", }), ), durationSeconds: Type.Optional( @@ -254,19 +235,15 @@ function resolveAction(args: Record): "generate" | "list" | "st } function normalizeResolution(raw: string | undefined): VideoGenerationResolution | undefined { - const normalized = raw?.trim().toUpperCase(); + const normalized = raw?.trim(); if (!normalized) { return undefined; } - if ( - normalized === "480P" || - normalized === "720P" || - normalized === "768P" || - normalized === "1080P" - ) { - return normalized; + const uppercase = normalized.toUpperCase(); + if (/^\d+P$/.test(uppercase) || /^\d+K$/.test(uppercase)) { + return uppercase; } - throw new ToolInputError("resolution must be one of 480P, 720P, 768P, or 1080P"); + return normalized; } function normalizeAspectRatio(raw: string | undefined): string | undefined { @@ -274,12 +251,7 @@ function normalizeAspectRatio(raw: string | undefined): string | undefined { if (!normalized) { return undefined; } - if (SUPPORTED_ASPECT_RATIOS.has(normalized)) { - return normalized; - } - throw new ToolInputError( - "aspectRatio must be one of 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, 21:9, or adaptive", - ); + return normalized; } /** diff --git a/src/plugin-sdk/video-generation.ts b/src/plugin-sdk/video-generation.ts index 6223a27ef96..20d86f84b57 100644 --- a/src/plugin-sdk/video-generation.ts +++ b/src/plugin-sdk/video-generation.ts @@ -33,7 +33,7 @@ export type GeneratedVideoAsset = { metadata?: Record; }; -export type VideoGenerationResolution = "480P" | "720P" | "768P" | "1080P"; +export type VideoGenerationResolution = "480P" | "720P" | "768P" | "1080P" | (string & {}); /** * Canonical semantic role hints for reference assets (first/last frame, diff --git a/src/video-generation/normalization.ts b/src/video-generation/normalization.ts index b65fd7bd4f5..4cf7ef40243 100644 --- a/src/video-generation/normalization.ts +++ b/src/video-generation/normalization.ts @@ -16,6 +16,13 @@ import type { VideoGenerationResolution, } from "./types.js"; +const VIDEO_RESOLUTION_ORDER: readonly VideoGenerationResolution[] = [ + "480P", + "720P", + "768P", + "1080P", +]; + export type ResolvedVideoGenerationOverrides = { size?: string; aspectRatio?: string; @@ -138,12 +145,15 @@ export function resolveVideoGenerationOverrides(params: { const normalizedResolution = resolveClosestResolution({ requestedResolution: resolution, supportedResolutions: caps.resolutions, + order: VIDEO_RESOLUTION_ORDER, }); if (normalizedResolution && normalizedResolution !== resolution) { normalization.resolution = { requested: resolution, applied: normalizedResolution, }; + } else if (!normalizedResolution) { + ignoredOverrides.push({ key: "resolution", value: resolution }); } resolution = normalizedResolution; } else if (resolution && !caps.supportsResolution) { diff --git a/src/video-generation/runtime.test.ts b/src/video-generation/runtime.test.ts index ee7a842567a..c1381782ab1 100644 --- a/src/video-generation/runtime.test.ts +++ b/src/video-generation/runtime.test.ts @@ -690,6 +690,91 @@ describe("video-generation runtime", () => { ]); }); + it("normalizes video resolutions against provider-supported values", async () => { + let seenResolution: string | undefined; + providers = [ + { + id: "minimax", + capabilities: { + generate: { + supportsResolution: true, + resolutions: ["768P", "1080P"], + }, + }, + generateVideo: async (req) => { + seenResolution = req.resolution; + return { + videos: [{ buffer: Buffer.from("mp4-bytes"), mimeType: "video/mp4" }], + model: "MiniMax-Hailuo-2.3", + }; + }, + }, + ]; + + const result = await runGenerateVideo({ + cfg: { + agents: { + defaults: { + videoGenerationModel: { primary: "minimax/MiniMax-Hailuo-2.3" }, + }, + }, + } as OpenClawConfig, + prompt: "animate a lobster", + resolution: "720P", + }); + + expect(seenResolution).toBe("768P"); + expect(result.ignoredOverrides).toEqual([]); + expect(result.normalization).toMatchObject({ + resolution: { + requested: "720P", + applied: "768P", + }, + }); + expect(result.metadata).toMatchObject({ + requestedResolution: "720P", + normalizedResolution: "768P", + }); + }); + + it("ignores unparseable video resolutions instead of sending them to providers", async () => { + let seenResolution: string | undefined; + providers = [ + { + id: "minimax", + capabilities: { + generate: { + supportsResolution: true, + resolutions: ["768P", "1080P"], + }, + }, + generateVideo: async (req) => { + seenResolution = req.resolution; + return { + videos: [{ buffer: Buffer.from("mp4-bytes"), mimeType: "video/mp4" }], + model: "MiniMax-Hailuo-2.3", + }; + }, + }, + ]; + + const result = await runGenerateVideo({ + cfg: { + agents: { + defaults: { + videoGenerationModel: { primary: "minimax/MiniMax-Hailuo-2.3" }, + }, + }, + } as OpenClawConfig, + prompt: "animate a lobster", + resolution: "4K", + }); + + expect(seenResolution).toBeUndefined(); + expect(result.ignoredOverrides).toEqual([{ key: "resolution", value: "4K" }]); + expect(result.normalization).toBeUndefined(); + }); + it("uses mode-specific capabilities for image-to-video requests", async () => { let seenRequest: | { diff --git a/src/video-generation/types.ts b/src/video-generation/types.ts index e29e29e2cdc..8c42bbabdeb 100644 --- a/src/video-generation/types.ts +++ b/src/video-generation/types.ts @@ -14,7 +14,7 @@ export type GeneratedVideoAsset = { metadata?: Record; }; -export type VideoGenerationResolution = "480P" | "720P" | "768P" | "1080P"; +export type VideoGenerationResolution = "480P" | "720P" | "768P" | "1080P" | (string & {}); /** * Canonical semantic role hints for reference assets. The list covers the