From 6c3fcb8bfca11176208e9fbed7e7b7eb14e30d5b Mon Sep 17 00:00:00 2001 From: Shakker Date: Sat, 23 May 2026 00:50:33 +0100 Subject: [PATCH] fix: route openai video edits to edits endpoint --- CHANGELOG.md | 1 + docs/help/testing-live.md | 2 +- docs/providers/openai.md | 6 ++- docs/tools/video-generation.md | 32 ++++++------ .../openai/video-generation-provider.test.ts | 21 ++++++-- .../openai/video-generation-provider.ts | 49 +++++++++---------- 6 files changed, 64 insertions(+), 47 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a200e0fcf92..5fb2ffa4d1f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -57,6 +57,7 @@ Docs: https://docs.openclaw.ai - Codex app-server: restart the native app-server and retry once when server-side compaction times out, so preflight compaction stalls recover instead of failing every dispatch. (#85500) - Restore Control UI gateway token pairing [AI]. (#85459) Thanks @pgondhi987. - OpenAI video: honor configured provider request private-network opt-in for local/custom video endpoints so explicitly trusted mock and self-hosted providers are not blocked. Thanks @shakkernerd. +- OpenAI video: send uploaded video edit requests to the documented `/videos/edits` endpoint with a `video` file instead of posting MP4 references to `/videos`. Thanks @shakkernerd. - CLI/update: repair managed npm plugin `openclaw` peer links during post-core convergence and reject stale or wrong-target peer links before restart. (#83794) Thanks @fuller-stack-dev. - CLI/agents: default new omitted-account bindings to all accounts when the channel has multiple configured accounts, and clarify account-scope docs. (#49769) Thanks @Gcaufy. - Codex app-server: let authorized `/codex` control commands such as `/codex detach` escape plugin-owned conversation bindings while keeping unknown or unauthorized slash text routed to the bound plugin. Fixes #85157. (#85188) Thanks @TurboTheTurtle. diff --git a/docs/help/testing-live.md b/docs/help/testing-live.md index 0b8810ec57f..7c5fbc31b2f 100644 --- a/docs/help/testing-live.md +++ b/docs/help/testing-live.md @@ -552,7 +552,7 @@ request. Plugin dependencies are expected to be present before runtime load. - Current declared-but-skipped `videoToVideo` providers in the shared sweep: - `alibaba`, `qwen`, `xai` because those paths currently require remote `http(s)` / MP4 reference URLs - `google` because the current shared Gemini/Veo lane uses local buffer-backed input and that path is not accepted in the shared sweep - - `openai` because the current shared lane lacks org-specific video inpaint/remix access guarantees + - `openai` because the current shared lane lacks org-specific video edit access guarantees - Optional narrowing: - `OPENCLAW_LIVE_VIDEO_GENERATION_PROVIDERS="deepinfra,google,openai,runway"` - `OPENCLAW_LIVE_VIDEO_GENERATION_MODELS="google/veo-3.1-fast-generate-preview,openai/sora-2,runway/gen4_aleph"` diff --git a/docs/providers/openai.md b/docs/providers/openai.md index cf6ca10b33f..74ff11175e4 100644 --- a/docs/providers/openai.md +++ b/docs/providers/openai.md @@ -516,9 +516,13 @@ The bundled `openai` plugin registers video generation through the `video_genera | Default model | `openai/sora-2` | | Modes | Text-to-video, image-to-video, single-video edit | | Reference inputs | 1 image or 1 video | -| Size overrides | Supported | +| Size overrides | Supported for text-to-video and image-to-video | | Other overrides | `aspectRatio`, `resolution`, `audio`, `watermark` are ignored with a tool warning | +OpenAI image-to-video requests use `POST /v1/videos` with an image +`input_reference`. Single-video edits use `POST /v1/videos/edits` with the +uploaded video in the `video` field. + ```json5 { agents: { diff --git a/docs/tools/video-generation.md b/docs/tools/video-generation.md index def7108bfcc..2497adce239 100644 --- a/docs/tools/video-generation.md +++ b/docs/tools/video-generation.md @@ -137,22 +137,22 @@ runtime modes at runtime. The explicit mode contract used by `video_generate`, contract tests, and the shared live sweep: -| Provider | `generate` | `imageToVideo` | `videoToVideo` | Shared live lanes today | -| ---------- | :--------: | :------------: | :------------: | ---------------------------------------------------------------------------------------------------------------------------------------- | -| Alibaba | ✓ | ✓ | ✓ | `generate`, `imageToVideo`; `videoToVideo` skipped because this provider needs remote `http(s)` video URLs | -| BytePlus | ✓ | ✓ | - | `generate`, `imageToVideo` | -| ComfyUI | ✓ | ✓ | - | Not in the shared sweep; workflow-specific coverage lives with Comfy tests | -| DeepInfra | ✓ | - | - | `generate`; native DeepInfra video schemas are text-to-video in the bundled contract | -| fal | ✓ | ✓ | ✓ | `generate`, `imageToVideo`; `videoToVideo` only when using Seedance reference-to-video | -| Google | ✓ | ✓ | ✓ | `generate`, `imageToVideo`; shared `videoToVideo` skipped because the current buffer-backed Gemini/Veo sweep does not accept that input | -| MiniMax | ✓ | ✓ | - | `generate`, `imageToVideo` | -| OpenAI | ✓ | ✓ | ✓ | `generate`, `imageToVideo`; shared `videoToVideo` skipped because this org/input path currently needs provider-side inpaint/remix access | -| OpenRouter | ✓ | ✓ | - | `generate`, `imageToVideo` | -| Qwen | ✓ | ✓ | ✓ | `generate`, `imageToVideo`; `videoToVideo` skipped because this provider needs remote `http(s)` video URLs | -| Runway | ✓ | ✓ | ✓ | `generate`, `imageToVideo`; `videoToVideo` runs only when the selected model is `runway/gen4_aleph` | -| Together | ✓ | ✓ | - | `generate`, `imageToVideo` | -| Vydra | ✓ | ✓ | - | `generate`; shared `imageToVideo` skipped because bundled `veo3` is text-only and bundled `kling` requires a remote image URL | -| xAI | ✓ | ✓ | ✓ | `generate`, `imageToVideo`; `videoToVideo` skipped because this provider currently needs a remote MP4 URL | +| Provider | `generate` | `imageToVideo` | `videoToVideo` | Shared live lanes today | +| ---------- | :--------: | :------------: | :------------: | --------------------------------------------------------------------------------------------------------------------------------------- | +| Alibaba | ✓ | ✓ | ✓ | `generate`, `imageToVideo`; `videoToVideo` skipped because this provider needs remote `http(s)` video URLs | +| BytePlus | ✓ | ✓ | - | `generate`, `imageToVideo` | +| ComfyUI | ✓ | ✓ | - | Not in the shared sweep; workflow-specific coverage lives with Comfy tests | +| DeepInfra | ✓ | - | - | `generate`; native DeepInfra video schemas are text-to-video in the bundled contract | +| fal | ✓ | ✓ | ✓ | `generate`, `imageToVideo`; `videoToVideo` only when using Seedance reference-to-video | +| Google | ✓ | ✓ | ✓ | `generate`, `imageToVideo`; shared `videoToVideo` skipped because the current buffer-backed Gemini/Veo sweep does not accept that input | +| MiniMax | ✓ | ✓ | - | `generate`, `imageToVideo` | +| OpenAI | ✓ | ✓ | ✓ | `generate`, `imageToVideo`; shared `videoToVideo` skipped because this org/input path currently needs provider-side video edit access | +| OpenRouter | ✓ | ✓ | - | `generate`, `imageToVideo` | +| Qwen | ✓ | ✓ | ✓ | `generate`, `imageToVideo`; `videoToVideo` skipped because this provider needs remote `http(s)` video URLs | +| Runway | ✓ | ✓ | ✓ | `generate`, `imageToVideo`; `videoToVideo` runs only when the selected model is `runway/gen4_aleph` | +| Together | ✓ | ✓ | - | `generate`, `imageToVideo` | +| Vydra | ✓ | ✓ | - | `generate`; shared `imageToVideo` skipped because bundled `veo3` is text-only and bundled `kling` requires a remote image URL | +| xAI | ✓ | ✓ | ✓ | `generate`, `imageToVideo`; `videoToVideo` skipped because this provider currently needs a remote MP4 URL | ## Tool parameters diff --git a/extensions/openai/video-generation-provider.test.ts b/extensions/openai/video-generation-provider.test.ts index 356178317d9..532373522cf 100644 --- a/extensions/openai/video-generation-provider.test.ts +++ b/extensions/openai/video-generation-provider.test.ts @@ -98,6 +98,16 @@ describe("openai video generation provider", () => { expectExplicitVideoGenerationCapabilities(buildOpenAIVideoGenerationProvider()); }); + it("does not claim size or duration controls for OpenAI video edits", () => { + const provider = buildOpenAIVideoGenerationProvider(); + + expect(provider.capabilities.videoToVideo).toEqual({ + enabled: true, + maxVideos: 1, + maxInputVideos: 1, + }); + }); + it("uses JSON for text-only Sora requests", async () => { postJsonRequestMock.mockResolvedValue({ response: { @@ -440,7 +450,7 @@ describe("openai video generation provider", () => { expect(secondRelease).toHaveBeenCalledTimes(1); }); - it("uses multipart input_reference for video-to-video uploads", async () => { + it("uses the video edits endpoint for video-to-video uploads", async () => { fetchWithTimeoutMock .mockResolvedValueOnce({ ok: true, @@ -473,8 +483,13 @@ describe("openai video generation provider", () => { expect(postJsonRequestMock).not.toHaveBeenCalled(); const createRequest = postMultipartRequest(); - expect(createRequest.url).toBe("https://api.openai.com/v1/videos"); + expect(createRequest.url).toBe("https://api.openai.com/v1/videos/edits"); expect(createRequest.body).toBeInstanceOf(FormData); + const form = createRequest.body as FormData; + expect(form.get("prompt")).toBe("Remix this clip"); + expect(form.get("model")).toBe("sora-2"); + expect(form.get("video")).toBeInstanceOf(File); + expect(form.get("input_reference")).toBeNull(); expect(createRequest.timeoutMs).toBe(120000); expect(createRequest.fetchFn).toBe(fetch); expect(createRequest.allowPrivateNetwork).toBe(false); @@ -523,7 +538,7 @@ describe("openai video generation provider", () => { expect(postJsonRequestMock).not.toHaveBeenCalled(); const createRequest = postMultipartRequest(); - expect(createRequest.url).toBe("http://127.0.0.1:44080/v1/videos"); + expect(createRequest.url).toBe("http://127.0.0.1:44080/v1/videos/edits"); expect(createRequest.body).toBeInstanceOf(FormData); expect(createRequest.allowPrivateNetwork).toBe(true); expect(pollProviderOperationRequest().allowPrivateNetwork).toBe(true); diff --git a/extensions/openai/video-generation-provider.ts b/extensions/openai/video-generation-provider.ts index ba881cf15e5..a489cdb660c 100644 --- a/extensions/openai/video-generation-provider.ts +++ b/extensions/openai/video-generation-provider.ts @@ -39,6 +39,13 @@ type OpenAIVideoRequestPolicy = { type OpenAIVideoStatus = "queued" | "in_progress" | "completed" | "failed"; +type OpenAIReferenceAsset = { + kind: "image" | "video"; + file: File; + buffer: Buffer; + mimeType: string; +}; + type OpenAIVideoResponse = { id?: string; model?: string; @@ -99,7 +106,7 @@ function resolveSize(params: { return undefined; } -function resolveReferenceAsset(req: VideoGenerationRequest) { +function resolveReferenceAsset(req: VideoGenerationRequest): OpenAIReferenceAsset | null { const allAssets = [...(req.inputImages ?? []), ...(req.inputVideos ?? [])]; if (allAssets.length === 0) { return null; @@ -113,15 +120,20 @@ function resolveReferenceAsset(req: VideoGenerationRequest) { "OpenAI video generation currently requires local image/video uploads for reference assets.", ); } + const kind = (req.inputVideos?.length ?? 0) > 0 ? "video" : "image"; const mimeType = - normalizeOptionalString(asset.mimeType) || - ((req.inputVideos?.length ?? 0) > 0 ? "video/mp4" : "image/png"); + normalizeOptionalString(asset.mimeType) || (kind === "video" ? "video/mp4" : "image/png"); const extension = extensionForMime(mimeType)?.slice(1) ?? (mimeType.startsWith("video/") ? "mp4" : "png"); const fileName = normalizeOptionalString(asset.fileName) || - `${(req.inputVideos?.length ?? 0) > 0 ? "reference-video" : "reference-image"}.${extension}`; - return new File([toBlobBytes(asset.buffer)], fileName, { type: mimeType }); + `${kind === "video" ? "reference-video" : "reference-image"}.${extension}`; + return { + kind, + file: new File([toBlobBytes(asset.buffer)], fileName, { type: mimeType }), + buffer: asset.buffer, + mimeType, + }; } async function pollOpenAIVideo( @@ -285,10 +297,6 @@ export function buildOpenAIVideoGenerationProvider(): VideoGenerationProvider { enabled: true, maxVideos: 1, maxInputVideos: 1, - maxDurationSeconds: 12, - supportedDurationSeconds: OPENAI_VIDEO_SECONDS, - supportsSize: true, - sizes: OPENAI_VIDEO_SIZES, }, }, async generateVideo(req) { @@ -328,16 +336,14 @@ export function buildOpenAIVideoGenerationProvider(): VideoGenerationProvider { aspectRatio: req.aspectRatio, resolution: req.resolution, }); - const inputImage = req.inputImages?.[0]; const referenceAsset = resolveReferenceAsset(req); - const requestUrl = `${baseUrl}/videos`; const requestResult = referenceAsset - ? inputImage?.buffer + ? referenceAsset.kind === "image" ? await (() => { const jsonHeaders = new Headers(headers); jsonHeaders.set("Content-Type", "application/json"); return postJsonRequest({ - url: requestUrl, + url: `${baseUrl}/videos`, headers: jsonHeaders, body: { prompt: req.prompt, @@ -345,10 +351,7 @@ export function buildOpenAIVideoGenerationProvider(): VideoGenerationProvider { ...(seconds ? { seconds } : {}), ...(size ? { size } : {}), input_reference: { - image_url: toOpenAIDataUrl( - inputImage.buffer, - normalizeOptionalString(inputImage.mimeType) ?? "image/png", - ), + image_url: toOpenAIDataUrl(referenceAsset.buffer, referenceAsset.mimeType), }, }, timeoutMs: resolveProviderOperationTimeoutMs({ @@ -364,17 +367,11 @@ export function buildOpenAIVideoGenerationProvider(): VideoGenerationProvider { const form = new FormData(); form.set("prompt", req.prompt); form.set("model", model); - if (seconds) { - form.set("seconds", seconds); - } - if (size) { - form.set("size", size); - } - form.set("input_reference", referenceAsset); + form.set("video", referenceAsset.file); const multipartHeaders = new Headers(headers); multipartHeaders.delete("Content-Type"); return postMultipartRequest({ - url: requestUrl, + url: `${baseUrl}/videos/edits`, headers: multipartHeaders, body: form, timeoutMs: resolveProviderOperationTimeoutMs({ @@ -390,7 +387,7 @@ export function buildOpenAIVideoGenerationProvider(): VideoGenerationProvider { const jsonHeaders = new Headers(headers); jsonHeaders.set("Content-Type", "application/json"); return postJsonRequest({ - url: requestUrl, + url: `${baseUrl}/videos`, headers: jsonHeaders, body: { prompt: req.prompt,