From de0097a23c7a0295a6b305826bc687cf2b306c9a Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 25 Apr 2026 19:28:25 +0100 Subject: [PATCH] fix: support transparent OpenAI image generation --- CHANGELOG.md | 1 + docs/gateway/config-agents.md | 4 +- docs/providers/openai.md | 20 ++- docs/tools/image-generation.md | 58 +++++-- .../openai/image-generation-provider.test.ts | 152 +++++++++++++++++- .../openai/image-generation-provider.ts | 36 ++++- .../openai-ws-message-conversion.test.ts | 39 ++++- src/agents/tools/image-generate-tool.test.ts | 68 ++++++++ src/agents/tools/image-generate-tool.ts | 10 +- 9 files changed, 362 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 89496d4615c..3dac30c8452 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -87,6 +87,7 @@ Docs: https://docs.openclaw.ai - Gateway/dashboard: render Control UI and WebSocket links with `https://`/`wss://` when `gateway.tls.enabled=true`, including `openclaw gateway status`. Fixes #71494. (#71499) Thanks @deepkilo. - Agents/OpenAI-compatible: default proxy/local completions tool requests to `tool_choice: "auto"` when tools are present, so providers enter native tool-calling mode instead of replying with plain-text tool directives. (#71472) Thanks @Speed-maker. - OpenAI image generation: use `gpt-5.5` for the Codex OAuth responses transport instead of the retired `gpt-5.4` model, fixing 500s from ChatGPT Codex image generation. Fixes #71513. Thanks @baolongl. +- OpenAI image generation: route transparent-background default-model requests to `gpt-image-1.5`, document the expected `image_generate` call shape, and keep Azure/custom OpenAI-compatible deployment names untouched. Thanks @steipete. - Google video generation: download direct MLDev Veo `video.uri` results instead of passing them through the Files API path, fixing 404s after successful generation/polling. Fixes #71200. Thanks @panhaishan. 
- Google video generation: fall back to the REST `predictLongRunning` Veo endpoint for text-only SDK 404s while keeping reference image/video generation on the SDK path. Fixes #62309 and #63008. (#62343) Thanks @leoleedev. - MiniMax music generation: switch the bundled default model from the unsupported `music-2.5+` id to the current `music-2.6` API model. Fixes #64870 and addresses the music default from #62315. Thanks @noahclanman and @edwardzheng1. diff --git a/docs/gateway/config-agents.md b/docs/gateway/config-agents.md index 7e6b63b0493..c735efa1ecf 100644 --- a/docs/gateway/config-agents.md +++ b/docs/gateway/config-agents.md @@ -342,8 +342,8 @@ Time format in system prompt. Default: `auto` (OS preference). - Also used as fallback routing when the selected/default model cannot accept image input. - `imageGenerationModel`: accepts either a string (`"provider/model"`) or an object (`{ primary, fallbacks }`). - Used by the shared image-generation capability and any future tool/plugin surface that generates images. - - Typical values: `google/gemini-3.1-flash-image-preview` for native Gemini image generation, `fal/fal-ai/flux/dev` for fal, or `openai/gpt-image-2` for OpenAI Images. - - If you select a provider/model directly, configure matching provider auth too (for example `GEMINI_API_KEY` or `GOOGLE_API_KEY` for `google/*`, `OPENAI_API_KEY` or OpenAI Codex OAuth for `openai/gpt-image-2`, `FAL_KEY` for `fal/*`). + - Typical values: `google/gemini-3.1-flash-image-preview` for native Gemini image generation, `fal/fal-ai/flux/dev` for fal, `openai/gpt-image-2` for OpenAI Images, or `openai/gpt-image-1.5` for transparent-background OpenAI PNG/WebP output. + - If you select a provider/model directly, configure matching provider auth too (for example `GEMINI_API_KEY` or `GOOGLE_API_KEY` for `google/*`, `OPENAI_API_KEY` or OpenAI Codex OAuth for `openai/gpt-image-2` / `openai/gpt-image-1.5`, `FAL_KEY` for `fal/*`). 
- If omitted, `image_generate` can still infer an auth-backed provider default. It tries the current default provider first, then the remaining registered image-generation providers in provider-id order. - `musicGenerationModel`: accepts either a string (`"provider/model"`) or an object (`{ primary, fallbacks }`). - Used by the shared music-generation capability and the built-in `music_generate` tool. diff --git a/docs/providers/openai.md b/docs/providers/openai.md index 3643a1f597e..bfaacf22ea5 100644 --- a/docs/providers/openai.md +++ b/docs/providers/openai.md @@ -27,6 +27,7 @@ changing config. | GPT-5.5 with ChatGPT/Codex subscription auth | `openai-codex/gpt-5.5` | Default PI route for Codex OAuth. Best first choice for subscription setups. | | GPT-5.5 with native Codex app-server behavior | `openai/gpt-5.5` plus `embeddedHarness.runtime: "codex"` | Forces the Codex app-server harness for that model ref. | | Image generation or editing | `openai/gpt-image-2` | Works with either `OPENAI_API_KEY` or OpenAI Codex OAuth. | +| Transparent-background images | `openai/gpt-image-1.5` | Use `outputFormat=png` or `webp` and `openai.background=transparent`. | GPT-5.5 is available through both direct OpenAI Platform API-key access and @@ -254,8 +255,17 @@ See [Image Generation](/tools/image-generation) for shared tool parameters, prov `gpt-image-2` is the default for both OpenAI text-to-image generation and image -editing. `gpt-image-1` remains usable as an explicit model override, but new -OpenAI image workflows should use `openai/gpt-image-2`. +editing. `gpt-image-1.5`, `gpt-image-1`, and `gpt-image-1-mini` remain usable as +explicit model overrides. Use `openai/gpt-image-1.5` for transparent-background +PNG/WebP output; the current `gpt-image-2` API rejects +`background: "transparent"`. 
+ +For a transparent-background request, agents should call `image_generate` with +`model: "openai/gpt-image-1.5"`, `outputFormat: "png"` or `"webp"`, and +`openai.background: "transparent"`. OpenClaw also protects the public OpenAI and +OpenAI Codex OAuth routes by rewriting default `openai/gpt-image-2` transparent +requests to `gpt-image-1.5`; Azure and custom OpenAI-compatible endpoints keep +their configured deployment/model names. For Codex OAuth installs, keep the same `openai/gpt-image-2` ref. When an `openai-codex` OAuth profile is configured, OpenClaw resolves that stored OAuth @@ -275,6 +285,12 @@ Generate: /tool image_generate model=openai/gpt-image-2 prompt="A polished launch poster for OpenClaw on macOS" size=3840x2160 count=1 ``` +Generate a transparent PNG: + +``` +/tool image_generate model=openai/gpt-image-1.5 prompt="A simple red circle sticker on a transparent background" outputFormat=png openai='{"background":"transparent"}' +``` + Edit: ``` diff --git a/docs/tools/image-generation.md b/docs/tools/image-generation.md index 715b11ae5e1..b9bcaa19c3b 100644 --- a/docs/tools/image-generation.md +++ b/docs/tools/image-generation.md @@ -48,13 +48,14 @@ The agent calls `image_generate` automatically. 
No tool allow-listing needed — ## Common routes -| Goal | Model ref | Auth | -| ---------------------------------------------------- | -------------------------------------------------- | ------------------------------------ | -| OpenAI image generation with API billing | `openai/gpt-image-2` | `OPENAI_API_KEY` | -| OpenAI image generation with Codex subscription auth | `openai/gpt-image-2` | OpenAI Codex OAuth | -| OpenRouter image generation | `openrouter/google/gemini-3.1-flash-image-preview` | `OPENROUTER_API_KEY` | -| LiteLLM image generation | `litellm/gpt-image-2` | `LITELLM_API_KEY` | -| Google Gemini image generation | `google/gemini-3.1-flash-image-preview` | `GEMINI_API_KEY` or `GOOGLE_API_KEY` | +| Goal | Model ref | Auth | +| ---------------------------------------------------- | -------------------------------------------------- | -------------------------------------- | +| OpenAI image generation with API billing | `openai/gpt-image-2` | `OPENAI_API_KEY` | +| OpenAI image generation with Codex subscription auth | `openai/gpt-image-2` | OpenAI Codex OAuth | +| OpenAI transparent-background PNG/WebP | `openai/gpt-image-1.5` | `OPENAI_API_KEY` or OpenAI Codex OAuth | +| OpenRouter image generation | `openrouter/google/gemini-3.1-flash-image-preview` | `OPENROUTER_API_KEY` | +| LiteLLM image generation | `litellm/gpt-image-2` | `LITELLM_API_KEY` | +| Google Gemini image generation | `google/gemini-3.1-flash-image-preview` | `GEMINI_API_KEY` or `GOOGLE_API_KEY` | The same `image_generate` tool handles text-to-image and reference-image editing. Use `image` for one reference or `images` for multiple references. @@ -93,7 +94,8 @@ Use `"list"` to inspect available providers and models at runtime. -Provider/model override, e.g. `openai/gpt-image-2`. +Provider/model override, e.g. `openai/gpt-image-2`; use +`openai/gpt-image-1.5` for transparent OpenAI backgrounds. @@ -233,9 +235,10 @@ through the Codex Responses backend. 
Legacy Codex base URLs such as `https://chatgpt.com/backend-api/codex` for image requests. It does not silently fall back to `OPENAI_API_KEY` for that request. To force direct OpenAI Images API routing, configure `models.providers.openai` explicitly with an API -key, custom base URL, or Azure endpoint. The older -`openai/gpt-image-1` model can still be selected explicitly, but new OpenAI -image-generation and image-editing requests should use `gpt-image-2`. +key, custom base URL, or Azure endpoint. The `openai/gpt-image-1.5`, +`openai/gpt-image-1`, and `openai/gpt-image-1-mini` models can still be +selected explicitly. Use `gpt-image-1.5` for transparent-background PNG/WebP +output; the current `gpt-image-2` API rejects `background: "transparent"`. `gpt-image-2` supports both text-to-image generation and reference-image editing through the same `image_generate` tool. OpenClaw forwards `prompt`, @@ -260,8 +263,31 @@ OpenAI-specific options live under the `openai` object: ``` `openai.background` accepts `transparent`, `opaque`, or `auto`; transparent -outputs require `outputFormat` `png` or `webp`. `openai.outputCompression` -applies to JPEG/WebP outputs. +outputs require `outputFormat` `png` or `webp` and a transparency-capable OpenAI +image model. OpenClaw routes default `gpt-image-2` transparent-background +requests to `gpt-image-1.5`. `openai.outputCompression` applies to JPEG/WebP +outputs. + +When asking an agent for a transparent-background OpenAI image, the expected +tool call is: + +```json +{ + "model": "openai/gpt-image-1.5", + "prompt": "A simple red circle sticker on a transparent background", + "outputFormat": "png", + "openai": { + "background": "transparent" + } +} +``` + +The explicit `openai/gpt-image-1.5` model keeps the request portable across +tool summaries and harnesses. 
If the agent instead uses the default +`openai/gpt-image-2` with `openai.background: "transparent"` on the public +OpenAI or OpenAI Codex OAuth route, OpenClaw rewrites the provider request to +`gpt-image-1.5`. Azure and custom OpenAI-compatible endpoints keep their +configured deployment/model names. Generate one 4K landscape image: @@ -269,6 +295,12 @@ Generate one 4K landscape image: /tool image_generate action=generate model=openai/gpt-image-2 prompt="A clean editorial poster for OpenClaw image generation" size=3840x2160 count=1 ``` +Generate a transparent PNG: + +``` +/tool image_generate action=generate model=openai/gpt-image-1.5 prompt="A simple red circle sticker on a transparent background" outputFormat=png openai='{"background":"transparent"}' +``` + Generate two square images: ``` diff --git a/extensions/openai/image-generation-provider.test.ts b/extensions/openai/image-generation-provider.test.ts index 9305e5a4e59..0f1853546f8 100644 --- a/extensions/openai/image-generation-provider.test.ts +++ b/extensions/openai/image-generation-provider.test.ts @@ -194,7 +194,12 @@ describe("openai image generation provider", () => { const provider = buildOpenAIImageGenerationProvider(); expect(provider.defaultModel).toBe("gpt-image-2"); - expect(provider.models).toEqual(["gpt-image-2"]); + expect(provider.models).toEqual([ + "gpt-image-2", + "gpt-image-1.5", + "gpt-image-1", + "gpt-image-1-mini", + ]); expect(provider.capabilities.geometry?.sizes).toEqual( expect.arrayContaining(["2048x2048", "3840x2160", "2160x3840"]), ); @@ -428,6 +433,74 @@ describe("openai image generation provider", () => { }); }); + it("routes transparent default-model requests to the OpenAI image model that supports alpha", async () => { + mockGeneratedPngResponse(); + + const provider = buildOpenAIImageGenerationProvider(); + const result = await provider.generateImage({ + provider: "openai", + model: "gpt-image-2", + prompt: "Transparent sticker", + cfg: {}, + outputFormat: "png", + 
providerOptions: { + openai: { + background: "transparent", + }, + }, + }); + + expect(postJsonRequestMock).toHaveBeenCalledWith( + expect.objectContaining({ + url: "https://api.openai.com/v1/images/generations", + body: expect.objectContaining({ + model: "gpt-image-1.5", + output_format: "png", + background: "transparent", + }), + }), + ); + expect(result.model).toBe("gpt-image-1.5"); + }); + + it("does not reroute transparent requests for custom OpenAI-compatible endpoints", async () => { + mockGeneratedPngResponse(); + + const provider = buildOpenAIImageGenerationProvider(); + await provider.generateImage({ + provider: "openai", + model: "gpt-image-2", + prompt: "Transparent custom endpoint sticker", + cfg: { + models: { + providers: { + openai: { + baseUrl: "https://openai-compatible.example.com/v1", + models: [], + }, + }, + }, + }, + outputFormat: "png", + providerOptions: { + openai: { + background: "transparent", + }, + }, + }); + + expect(postJsonRequestMock).toHaveBeenCalledWith( + expect.objectContaining({ + url: "https://openai-compatible.example.com/v1/images/generations", + body: expect.objectContaining({ + model: "gpt-image-2", + output_format: "png", + background: "transparent", + }), + }), + ); + }); + it("allows loopback image requests for the synthetic mock-openai provider", async () => { mockGeneratedPngResponse(); @@ -684,6 +757,43 @@ describe("openai image generation provider", () => { }); }); + it("routes transparent default-model Codex OAuth requests to the alpha-capable image model", async () => { + mockCodexAuthOnly(); + mockCodexImageStream({ imageData: "codex-transparent-image" }); + + const provider = buildOpenAIImageGenerationProvider(); + const result = await provider.generateImage({ + provider: "openai", + model: "gpt-image-2", + prompt: "Draw a transparent Codex sticker", + cfg: {}, + authStore: { version: 1, profiles: {} }, + outputFormat: "png", + providerOptions: { + openai: { + background: "transparent", + }, + }, + }); + + 
expect(postJsonRequestMock).toHaveBeenCalledWith( + expect.objectContaining({ + url: "https://chatgpt.com/backend-api/codex/responses", + body: expect.objectContaining({ + tools: [ + expect.objectContaining({ + type: "image_generation", + model: "gpt-image-1.5", + output_format: "png", + background: "transparent", + }), + ], + }), + }), + ); + expect(result.model).toBe("gpt-image-1.5"); + }); + it("uses configured Codex OAuth directly instead of probing an available OpenAI API key", async () => { resolveApiKeyForProviderMock.mockImplementation(async (params?: { provider?: string }) => { if (params?.provider === "openai") { @@ -1213,6 +1323,46 @@ describe("openai image generation provider", () => { ); }); + it("does not reroute transparent background requests for Azure deployment names", async () => { + mockGeneratedPngResponse(); + + const provider = buildOpenAIImageGenerationProvider(); + await provider.generateImage({ + provider: "openai", + model: "gpt-image-2", + prompt: "Transparent Azure sticker", + cfg: { + models: { + providers: { + openai: { + baseUrl: "https://myresource.openai.azure.com", + models: [], + }, + }, + }, + }, + outputFormat: "png", + providerOptions: { + openai: { + background: "transparent", + }, + }, + }); + + expect(postJsonRequestMock).toHaveBeenCalledWith( + expect.objectContaining({ + url: "https://myresource.openai.azure.com/openai/deployments/gpt-image-2/images/generations?api-version=2024-12-01-preview", + body: { + prompt: "Transparent Azure sticker", + n: 1, + size: "1024x1024", + output_format: "png", + background: "transparent", + }, + }), + ); + }); + it("uses api-key header and deployment-scoped URL for .cognitiveservices.azure.com hosts", async () => { mockGeneratedPngResponse(); diff --git a/extensions/openai/image-generation-provider.ts b/extensions/openai/image-generation-provider.ts index 31a30e65dd1..96497811fd9 100644 --- a/extensions/openai/image-generation-provider.ts +++ 
b/extensions/openai/image-generation-provider.ts @@ -30,6 +30,7 @@ const DEFAULT_OPENAI_IMAGE_BASE_URL = "https://api.openai.com/v1"; const DEFAULT_OPENAI_CODEX_IMAGE_BASE_URL = OPENAI_CODEX_RESPONSES_BASE_URL; const DEFAULT_OPENAI_CODEX_IMAGE_RESPONSES_MODEL = "gpt-5.5"; const OPENAI_CODEX_IMAGE_INSTRUCTIONS = "You are an image generation assistant."; +const OPENAI_TRANSPARENT_BACKGROUND_IMAGE_MODEL = "gpt-image-1.5"; const DEFAULT_OPENAI_IMAGE_TIMEOUT_MS = 180_000; const DEFAULT_OUTPUT_MIME = "image/png"; const DEFAULT_OUTPUT_EXTENSION = "png"; @@ -52,6 +53,12 @@ const LOG_VALUE_MAX_CHARS = 256; const MOCK_OPENAI_PROVIDER_ID = "mock-openai"; const OPENAI_OUTPUT_FORMATS = ["png", "jpeg", "webp"] as const; const OPENAI_QUALITIES = ["low", "medium", "high", "auto"] as const; +const OPENAI_IMAGE_MODELS = [ + DEFAULT_OPENAI_IMAGE_MODEL, + OPENAI_TRANSPARENT_BACKGROUND_IMAGE_MODEL, + "gpt-image-1", + "gpt-image-1-mini", +] as const; const log = createSubsystemLogger("image-generation/openai"); const AZURE_HOSTNAME_SUFFIXES = [ @@ -186,6 +193,21 @@ function appendOpenAIImageOptions( } } +function resolveOpenAIImageRequestModel( + req: Parameters[0], + options?: { allowTransparentDefaultReroute?: boolean }, +): string { + const model = req.model || DEFAULT_OPENAI_IMAGE_MODEL; + if ( + options?.allowTransparentDefaultReroute === true && + model === DEFAULT_OPENAI_IMAGE_MODEL && + req.providerOptions?.openai?.background === "transparent" + ) { + return OPENAI_TRANSPARENT_BACKGROUND_IMAGE_MODEL; + } + return model; +} + function shouldAllowPrivateImageEndpoint(req: { provider: string; cfg: OpenClawConfig | undefined; @@ -468,7 +490,7 @@ function createOpenAIImageGenerationProviderBase(params: { id: params.id, label: params.label, defaultModel: DEFAULT_OPENAI_IMAGE_MODEL, - models: [DEFAULT_OPENAI_IMAGE_MODEL], + models: [...OPENAI_IMAGE_MODELS], isConfigured: params.isConfigured, capabilities: { generate: { @@ -517,7 +539,9 @@ function logCodexImageAuthSelected(params: { 
authMode?: unknown; timeoutMs: number; }) { - const model = params.req.model || DEFAULT_OPENAI_IMAGE_MODEL; + const model = resolveOpenAIImageRequestModel(params.req, { + allowTransparentDefaultReroute: true, + }); log.info( `image auth selected: provider=openai-codex mode=${sanitizeLogValue( params.authMode, @@ -549,7 +573,9 @@ async function generateOpenAICodexImage(params: { transport: "http", }); - const model = req.model || DEFAULT_OPENAI_IMAGE_MODEL; + const model = resolveOpenAIImageRequestModel(req, { + allowTransparentDefaultReroute: true, + }); const count = resolveOpenAIImageCount(req.count); const size = req.size ?? DEFAULT_SIZE; const timeoutMs = resolveOpenAIImageTimeoutMs(req.timeoutMs); @@ -711,7 +737,9 @@ export function buildOpenAIImageGenerationProvider(): ImageGenerationProvider { transport: "http", }); - const model = req.model || DEFAULT_OPENAI_IMAGE_MODEL; + const model = resolveOpenAIImageRequestModel(req, { + allowTransparentDefaultReroute: publicOpenAIBaseUrl, + }); const count = resolveOpenAIImageCount(req.count); const size = req.size ?? DEFAULT_SIZE; const timeoutMs = resolveOpenAIImageTimeoutMs(req.timeoutMs); diff --git a/src/agents/openai-ws-message-conversion.test.ts b/src/agents/openai-ws-message-conversion.test.ts index 4702f517192..4a818ef4cae 100644 --- a/src/agents/openai-ws-message-conversion.test.ts +++ b/src/agents/openai-ws-message-conversion.test.ts @@ -1,8 +1,45 @@ import { describe, expect, it } from "vitest"; import type { ResponseObject } from "./openai-ws-connection.js"; -import { buildAssistantMessageFromResponse } from "./openai-ws-message-conversion.js"; +import { buildAssistantMessageFromResponse, convertTools } from "./openai-ws-message-conversion.js"; describe("openai ws message conversion", () => { + it("preserves image_generate transparent-background guidance in OpenAI tool payloads", () => { + const [tool] = convertTools([ + { + name: "image_generate", + description: + 'Generate images. 
For transparent OpenAI backgrounds, use outputFormat="png" or "webp" and openai.background="transparent"; OpenClaw routes the default OpenAI image model to gpt-image-1.5 for that mode.', + parameters: { + type: "object", + properties: { + model: { + type: "string", + description: + "Optional provider/model override; use openai/gpt-image-1.5 for transparent OpenAI backgrounds.", + }, + outputFormat: { type: "string", enum: ["png", "jpeg", "webp"] }, + openai: { + type: "object", + properties: { + background: { + type: "string", + enum: ["transparent", "opaque", "auto"], + description: + "For transparent output use outputFormat png or webp; OpenClaw routes the default OpenAI image model to gpt-image-1.5 for this mode.", + }, + }, + }, + }, + }, + }, + ]); + + expect(tool?.description).toContain('openai.background="transparent"'); + expect(tool?.description).toContain("gpt-image-1.5"); + expect(JSON.stringify(tool?.parameters)).toContain("openai/gpt-image-1.5"); + expect(JSON.stringify(tool?.parameters)).toContain("transparent"); + }); + it("preserves cached token usage from responses usage details", () => { const response: ResponseObject = { id: "resp_123", diff --git a/src/agents/tools/image-generate-tool.test.ts b/src/agents/tools/image-generate-tool.test.ts index bc395916b5f..2c73706615e 100644 --- a/src/agents/tools/image-generate-tool.test.ts +++ b/src/agents/tools/image-generate-tool.test.ts @@ -218,6 +218,18 @@ describe("createImageGenerateTool", () => { expect(createImageGenerateTool({ config: {} })).toBeNull(); }); + it("tells agents how to request transparent OpenAI backgrounds", () => { + vi.stubEnv("OPENAI_API_KEY", "openai-key"); + stubImageGenerationProviders(); + + const tool = requireImageGenerateTool(createImageGenerateTool({ config: {} })); + + expect(tool.description).toContain('outputFormat="png" or "webp"'); + expect(tool.description).toContain('openai.background="transparent"'); + expect(tool.description).toContain("gpt-image-1.5"); + 
expect(JSON.stringify(tool.parameters)).toContain("openai/gpt-image-1.5"); + }); + it("matches image-generation providers across canonical provider aliases", () => { vi.spyOn(imageGenerationRuntime, "listRuntimeImageGenerationProviders").mockReturnValue([ { @@ -595,6 +607,62 @@ describe("createImageGenerateTool", () => { }); }); + it("forwards transparent OpenAI background requests with a PNG output format", async () => { + const generateImage = vi.spyOn(imageGenerationRuntime, "generateImage").mockResolvedValue({ + provider: "openai", + model: "gpt-image-1.5", + attempts: [], + ignoredOverrides: [], + images: [ + { + buffer: Buffer.from("png-out"), + mimeType: "image/png", + fileName: "transparent.png", + }, + ], + }); + vi.spyOn(mediaStore, "saveMediaBuffer").mockResolvedValue({ + path: "/tmp/transparent.png", + id: "transparent.png", + size: 7, + contentType: "image/png", + }); + + const tool = createToolWithPrimaryImageModel("openai/gpt-image-1.5"); + const result = await tool.execute("call-openai-transparent", { + prompt: "A transparent badge", + outputFormat: "png", + openai: { + background: "transparent", + }, + }); + + expect(generateImage).toHaveBeenCalledWith( + expect.objectContaining({ + cfg: expect.objectContaining({ + agents: expect.objectContaining({ + defaults: expect.objectContaining({ + imageGenerationModel: { primary: "openai/gpt-image-1.5" }, + }), + }), + }), + outputFormat: "png", + providerOptions: { + openai: { + background: "transparent", + }, + }, + }), + ); + expect(result).toMatchObject({ + details: { + provider: "openai", + model: "gpt-image-1.5", + outputFormat: "png", + }, + }); + }); + it("includes MEDIA paths in content text so follow-up replies use the real saved file", async () => { vi.spyOn(imageGenerationRuntime, "listRuntimeImageGenerationProviders").mockReturnValue([ { diff --git a/src/agents/tools/image-generate-tool.ts b/src/agents/tools/image-generate-tool.ts index 2a04fb47961..be232146cae 100644 --- 
a/src/agents/tools/image-generate-tool.ts +++ b/src/agents/tools/image-generate-tool.ts @@ -96,7 +96,10 @@ const ImageGenerateToolSchema = Type.Object({ }), ), model: Type.Optional( - Type.String({ description: "Optional provider/model override, e.g. openai/gpt-image-2." }), + Type.String({ + description: + "Optional provider/model override, e.g. openai/gpt-image-2; use openai/gpt-image-1.5 for transparent OpenAI backgrounds.", + }), ), filename: Type.Optional( Type.String({ @@ -131,7 +134,8 @@ const ImageGenerateToolSchema = Type.Object({ openai: Type.Optional( Type.Object({ background: optionalStringEnum(SUPPORTED_OPENAI_BACKGROUNDS, { - description: "OpenAI-only background hint: transparent, opaque, or auto.", + description: + "OpenAI-only background hint: transparent, opaque, or auto. For transparent output use outputFormat png or webp; OpenClaw routes the default OpenAI image model to gpt-image-1.5 for this mode.", }), moderation: optionalStringEnum(SUPPORTED_OPENAI_MODERATIONS, { description: "OpenAI-only moderation hint: low or auto.", @@ -570,7 +574,7 @@ export function createImageGenerateTool(options?: { label: "Image Generation", name: "image_generate", description: - 'Generate new images or edit reference images with the configured or inferred image-generation model. Set agents.defaults.imageGenerationModel.primary to pick a provider/model. Providers declare their own auth/readiness; use action="list" to inspect registered providers, models, readiness, and auth hints. Generated images are delivered automatically from the tool result as MEDIA paths.', + 'Generate new images or edit reference images with the configured or inferred image-generation model. For transparent OpenAI backgrounds, use outputFormat="png" or "webp" and openai.background="transparent"; OpenClaw routes the default OpenAI image model to gpt-image-1.5 for that mode. Set agents.defaults.imageGenerationModel.primary to pick a provider/model. 
Providers declare their own auth/readiness; use action="list" to inspect registered providers, models, readiness, and auth hints. Generated images are delivered automatically from the tool result as MEDIA paths.', parameters: ImageGenerateToolSchema, execute: async (_toolCallId, args) => { const params = args as Record<string, unknown>;