From 67506ac2a9c51999e3254fbc95ab09aab530e2c4 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 25 Apr 2026 18:14:36 +0100 Subject: [PATCH] fix(xai): support video reference images --- docs/providers/xai.md | 10 +- docs/tools/video-generation.md | 8 +- .../xai/video-generation-provider.test.ts | 130 ++++++++++++++++++ extensions/xai/video-generation-provider.ts | 58 +++++++- 4 files changed, 195 insertions(+), 11 deletions(-) diff --git a/docs/providers/xai.md b/docs/providers/xai.md index 31551a38137..b8221a11dd3 100644 --- a/docs/providers/xai.md +++ b/docs/providers/xai.md @@ -132,12 +132,14 @@ Legacy aliases still normalize to the canonical bundled ids: `video_generate` tool. - Default video model: `xai/grok-imagine-video` - - Modes: text-to-video, image-to-video, remote video edit, and remote video - extension + - Modes: text-to-video, image-to-video, reference-image generation, remote + video edit, and remote video extension - Aspect ratios: `1:1`, `16:9`, `9:16`, `4:3`, `3:4`, `3:2`, `2:3` - Resolutions: `480P`, `720P` - - Duration: 1-15 seconds for generation/image-to-video, 2-10 seconds for - extension + - Duration: 1-15 seconds for generation/image-to-video, 1-10 seconds when + using `reference_image` roles, 2-10 seconds for extension + - Reference-image generation: set `imageRoles` to `reference_image` for + every supplied image; xAI accepts up to 7 such images Local video buffers are not accepted. Use remote `http(s)` URLs for diff --git a/docs/tools/video-generation.md b/docs/tools/video-generation.md index c62af2c1daa..48fefb4e188 100644 --- a/docs/tools/video-generation.md +++ b/docs/tools/video-generation.md @@ -97,7 +97,7 @@ Duplicate prevention: if a video task is already `queued` or `running` for the c | Runway | `gen4.5` | Yes | 1 image | 1 video | `RUNWAYML_API_SECRET` | | Together | `Wan-AI/Wan2.2-T2V-A14B` | Yes | 1 image | No | `TOGETHER_API_KEY` | | Vydra | `veo3` | Yes | 1 image (`kling`) | No | `VYDRA_API_KEY` | -| xAI | `grok-imagine-video` | Yes | 1 image | 1 video | `XAI_API_KEY` | +| xAI | `grok-imagine-video` | Yes | 1 first-frame image or up to 7 `reference_image`s | 1 video | `XAI_API_KEY` | Some providers accept additional or alternate API key env vars. See individual [provider pages](#related) for details. @@ -150,7 +150,9 @@ Role hints are forwarded to the provider as-is. Canonical values come from the `VideoGenerationAssetRole` union but providers may accept additional role strings. `*Roles` arrays must not have more entries than the corresponding reference list; off-by-one mistakes fail with a clear error. -Use an empty string to leave a slot unset. +Use an empty string to leave a slot unset. For xAI, set every image role to +`reference_image` to use its `reference_images` generation mode; omit the role +or use `first_frame` for single-image image-to-video. ### Style controls @@ -326,7 +328,7 @@ entries. - Supports text-to-video, image-to-video, and remote video edit/extend flows. + Supports text-to-video, single first-frame image-to-video, up to 7 `reference_image` inputs through xAI `reference_images`, and remote video edit/extend flows. diff --git a/extensions/xai/video-generation-provider.test.ts b/extensions/xai/video-generation-provider.test.ts index f93c05cb640..bed2b9b814b 100644 --- a/extensions/xai/video-generation-provider.test.ts +++ b/extensions/xai/video-generation-provider.test.ts @@ -81,6 +81,136 @@ describe("xai video generation provider", () => { ); }); + it("sends a single unroled image as xAI first-frame image-to-video", async () => { + postJsonRequestMock.mockResolvedValue({ + response: { + json: async () => ({ + request_id: "req_image", + }), + }, + release: vi.fn(async () => {}), + }); + fetchWithTimeoutMock + .mockResolvedValueOnce({ + json: async () => ({ + request_id: "req_image", + status: "done", + video: { url: "https://cdn.x.ai/image-video.mp4" }, + }), + }) + .mockResolvedValueOnce({ + headers: new Headers({ "content-type": "video/mp4" }), + arrayBuffer: async () => Buffer.from("image-video-bytes"), + }); + + const provider = buildXaiVideoGenerationProvider(); + const result = await provider.generateVideo({ + provider: "xai", + model: "grok-imagine-video", + prompt: "Animate this logo into a clean bumper", + cfg: {}, + inputImages: [{ buffer: Buffer.from("png-bytes"), mimeType: "image/png" }], + }); + + const body = postJsonRequestMock.mock.calls[0]?.[0]?.body as Record; + expect(postJsonRequestMock).toHaveBeenCalledWith( + expect.objectContaining({ + url: "https://api.x.ai/v1/videos/generations", + body: expect.objectContaining({ + image: { + url: expect.stringMatching(/^data:image\/png;base64,/), + }, + }), + }), + ); + expect(body).not.toHaveProperty("reference_images"); + expect(result.metadata).toEqual( + expect.objectContaining({ + mode: "generate", + }), + ); + }); + + it("sends reference_image roles through xAI reference_images mode", async () => { + postJsonRequestMock.mockResolvedValue({ + response: { + json: async () => ({ + request_id: "req_refs", + }), + }, + release: vi.fn(async () => {}), + }); + fetchWithTimeoutMock + .mockResolvedValueOnce({ + json: async () => ({ + request_id: "req_refs", + status: "done", + video: { url: "https://cdn.x.ai/reference-video.mp4" }, + }), + }) + .mockResolvedValueOnce({ + headers: new Headers({ "content-type": "video/mp4" }), + arrayBuffer: async () => Buffer.from("reference-video-bytes"), + }); + + const provider = buildXaiVideoGenerationProvider(); + const result = await provider.generateVideo({ + provider: "xai", + model: "grok-imagine-video", + prompt: "Make a cinematic brand vignette using these references", + cfg: {}, + durationSeconds: 12, + aspectRatio: "9:16", + resolution: "720P", + inputImages: [ + { url: "https://example.com/subject.png", role: "reference_image" }, + { url: "https://example.com/style.png", role: "reference_image" }, + ], + }); + + const body = postJsonRequestMock.mock.calls[0]?.[0]?.body as Record; + expect(postJsonRequestMock).toHaveBeenCalledWith( + expect.objectContaining({ + url: "https://api.x.ai/v1/videos/generations", + body: expect.objectContaining({ + reference_images: [ + { url: "https://example.com/subject.png" }, + { url: "https://example.com/style.png" }, + ], + duration: 10, + aspect_ratio: "9:16", + resolution: "720p", + }), + }), + ); + expect(body).not.toHaveProperty("image"); + expect(result.metadata).toEqual( + expect.objectContaining({ + mode: "referenceToVideo", + }), + ); + }); + + it("rejects mixed xAI first-frame and reference-image roles", async () => { + const provider = buildXaiVideoGenerationProvider(); + + await expect( + provider.generateVideo({ + provider: "xai", + model: "grok-imagine-video", + prompt: "Use both images", + cfg: {}, + inputImages: [ + { url: "https://example.com/subject.png", role: "reference_image" }, + { url: "https://example.com/first-frame.png", role: "first_frame" }, + ], + }), + ).rejects.toThrow( + "xAI reference-image video generation requires every image role to be reference_image.", + ); + expect(postJsonRequestMock).not.toHaveBeenCalled(); + }); + it("routes video inputs to the extension endpoint when duration is set", async () => { postJsonRequestMock.mockResolvedValue({ response: { diff --git a/extensions/xai/video-generation-provider.ts b/extensions/xai/video-generation-provider.ts index bb007f98f0b..35724e4bcb1 100644 --- a/extensions/xai/video-generation-provider.ts +++ b/extensions/xai/video-generation-provider.ts @@ -47,6 +47,7 @@ type VideoGenerationSourceInput = { url?: string; buffer?: Buffer; mimeType?: string; + role?: string; }; function resolveXaiVideoBaseUrl(req: VideoGenerationRequest): string { @@ -73,6 +74,18 @@ function resolveImageUrl(input: VideoGenerationSourceInput | undefined): string return toDataUrl(input.buffer, normalizeOptionalString(input.mimeType) ?? "image/png"); } +function resolveRequiredImageUrl(input: VideoGenerationSourceInput): string { + const imageUrl = resolveImageUrl(input); + if (!imageUrl) { + throw new Error("xAI image-to-video input is missing image data."); + } + return imageUrl; +} + +function isReferenceImage(input: VideoGenerationSourceInput): boolean { + return normalizeOptionalString(input.role)?.toLowerCase() === "reference_image"; +} + function resolveInputVideoUrl(input: VideoGenerationSourceInput | undefined): string | undefined { if (!input) { return undefined; @@ -117,8 +130,13 @@ function resolveResolution(value: string | undefined): "480p" | "720p" | undefin return undefined; } -function resolveXaiVideoMode(req: VideoGenerationRequest): "generate" | "edit" | "extend" { +function resolveXaiVideoMode( + req: VideoGenerationRequest, +): "generate" | "referenceToVideo" | "edit" | "extend" { const hasVideoInput = (req.inputVideos?.length ?? 0) > 0; + if (!hasVideoInput && (req.inputImages ?? []).some(isReferenceImage)) { + return "referenceToVideo"; + } if (!hasVideoInput) { return "generate"; } @@ -132,8 +150,18 @@ function resolveXaiVideoMode(req: VideoGenerationRequest): "generate" | "edit" | } function buildCreateBody(req: VideoGenerationRequest): Record { - if ((req.inputImages?.length ?? 0) > 1) { - throw new Error("xAI video generation supports at most one reference image."); + const inputImages = req.inputImages ?? []; + const hasReferenceImages = inputImages.some(isReferenceImage); + if (hasReferenceImages && !inputImages.every(isReferenceImage)) { + throw new Error( + "xAI reference-image video generation requires every image role to be reference_image.", + ); + } + if (!hasReferenceImages && inputImages.length > 1) { + throw new Error("xAI image-to-video generation supports at most one first-frame image."); + } + if (hasReferenceImages && inputImages.length > 7) { + throw new Error("xAI reference-image video generation supports at most 7 reference images."); } if ((req.inputVideos?.length ?? 0) > 1) { throw new Error("xAI video generation supports at most one input video."); @@ -172,6 +200,27 @@ function buildCreateBody(req: VideoGenerationRequest): Record { return body; } + if (mode === "referenceToVideo") { + body.reference_images = inputImages.map((image) => ({ url: resolveRequiredImageUrl(image) })); + const duration = resolveDurationSeconds({ + durationSeconds: req.durationSeconds, + min: 1, + max: 10, + }); + if (typeof duration === "number") { + body.duration = duration; + } + const aspectRatio = resolveAspectRatio(req.aspectRatio); + if (aspectRatio) { + body.aspect_ratio = aspectRatio; + } + const resolution = resolveResolution(req.resolution); + if (resolution) { + body.resolution = resolution; + } + return body; + } + body.video = { url: resolveInputVideoUrl(req.inputVideos?.[0]) }; if (mode === "extend") { const duration = resolveDurationSeconds({ @@ -192,6 +241,7 @@ function resolveCreateEndpoint(req: VideoGenerationRequest): string { return "/videos/edits"; case "extend": return "/videos/extensions"; + case "referenceToVideo": case "generate": default: return "/videos/generations"; @@ -284,7 +334,7 @@ export function buildXaiVideoGenerationProvider(): VideoGenerationProvider { imageToVideo: { enabled: true, maxVideos: 1, - maxInputImages: 1, + maxInputImages: 7, maxDurationSeconds: 15, aspectRatios: [...XAI_VIDEO_ASPECT_RATIOS], resolutions: ["480P", "720P"],