diff --git a/CHANGELOG.md b/CHANGELOG.md index df50c7681f4..6616a7f2aec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ Docs: https://docs.openclaw.ai - Agents/subagents: add optional forked context for native `sessions_spawn` runs so agents can let a child inherit the requester transcript when needed, while keeping clean isolated sessions as the default; includes prompt guidance, context-engine hook metadata, docs, and QA coverage. - Providers/OpenAI: add forward-compatible `gpt-5.5` and `gpt-5.5-pro` support for OpenAI API keys, OpenAI Codex OAuth, and the Codex CLI default model. +- Providers/OpenAI Codex: add image generation and reference-image editing through Codex OAuth, so `openai-codex/gpt-image-2` works without an `OPENAI_API_KEY`. Fixes #70703. ### Fixes diff --git a/docs/providers/openai.md b/docs/providers/openai.md index 8fcfec2d1ed..d438220cc5f 100644 --- a/docs/providers/openai.md +++ b/docs/providers/openai.md @@ -210,12 +210,23 @@ See [Image Generation](/tools/image-generation) for shared tool parameters, prov editing. `gpt-image-1` remains usable as an explicit model override, but new OpenAI image workflows should use `openai/gpt-image-2`. +The `openai-codex` provider also exposes `gpt-image-2` for image generation and +reference-image editing through OpenAI Codex OAuth. Use +`openai-codex/gpt-image-2` when the agent is signed in with Codex OAuth but does +not have an `OPENAI_API_KEY`. 
+ Generate: ``` /tool image_generate model=openai/gpt-image-2 prompt="A polished launch poster for OpenClaw on macOS" size=3840x2160 count=1 ``` +Generate with Codex OAuth: + +``` +/tool image_generate model=openai-codex/gpt-image-2 prompt="A polished launch poster for OpenClaw on macOS" size=3840x2160 count=1 +``` + Edit: ``` diff --git a/docs/tools/image-generation.md b/docs/tools/image-generation.md index 21c54c57063..f679bed6621 100644 --- a/docs/tools/image-generation.md +++ b/docs/tools/image-generation.md @@ -36,15 +36,16 @@ The agent calls `image_generate` automatically. No tool allow-listing needed — ## Supported providers -| Provider | Default model | Edit support | API key | -| -------- | -------------------------------- | ---------------------------------- | ----------------------------------------------------- | -| OpenAI | `gpt-image-2` | Yes (up to 5 images) | `OPENAI_API_KEY` | -| Google | `gemini-3.1-flash-image-preview` | Yes | `GEMINI_API_KEY` or `GOOGLE_API_KEY` | -| fal | `fal-ai/flux/dev` | Yes | `FAL_KEY` | -| MiniMax | `image-01` | Yes (subject reference) | `MINIMAX_API_KEY` or MiniMax OAuth (`minimax-portal`) | -| ComfyUI | `workflow` | Yes (1 image, workflow-configured) | `COMFY_API_KEY` or `COMFY_CLOUD_API_KEY` for cloud | -| Vydra | `grok-imagine` | No | `VYDRA_API_KEY` | -| xAI | `grok-imagine-image` | Yes (up to 5 images) | `XAI_API_KEY` | +| Provider | Default model | Edit support | API key | +| ------------ | -------------------------------- | ---------------------------------- | ----------------------------------------------------- | +| OpenAI | `gpt-image-2` | Yes (up to 4 images) | `OPENAI_API_KEY` | +| OpenAI Codex | `gpt-image-2` | Yes (up to 4 images) | OpenAI Codex OAuth | +| Google | `gemini-3.1-flash-image-preview` | Yes | `GEMINI_API_KEY` or `GOOGLE_API_KEY` | +| fal | `fal-ai/flux/dev` | Yes | `FAL_KEY` | +| MiniMax | `image-01` | Yes (subject reference) | `MINIMAX_API_KEY` or MiniMax OAuth (`minimax-portal`) | +| ComfyUI 
| `workflow` | Yes (1 image, workflow-configured) | `COMFY_API_KEY` or `COMFY_CLOUD_API_KEY` for cloud | +| Vydra | `grok-imagine` | No | `VYDRA_API_KEY` | +| xAI | `grok-imagine-image` | Yes (up to 5 images) | `XAI_API_KEY` | Use `action: "list"` to inspect available providers and models at runtime: diff --git a/extensions/openai/image-generation-provider.test.ts b/extensions/openai/image-generation-provider.test.ts index 626866217bd..371ada818ed 100644 --- a/extensions/openai/image-generation-provider.test.ts +++ b/extensions/openai/image-generation-provider.test.ts @@ -1,5 +1,8 @@ import { afterEach, describe, expect, it, vi } from "vitest"; -import { buildOpenAIImageGenerationProvider } from "./image-generation-provider.js"; +import { + buildOpenAICodexImageGenerationProvider, + buildOpenAIImageGenerationProvider, +} from "./image-generation-provider.js"; const { resolveApiKeyForProviderMock, @@ -47,6 +50,32 @@ function mockGeneratedPngResponse() { }); } +function mockCodexImageStream(params: { imageData?: string; revisedPrompt?: string } = {}) { + const image = Buffer.from(params.imageData ?? "codex-png-bytes").toString("base64"); + const events = [ + { + type: "response.output_item.done", + item: { + type: "image_generation_call", + result: image, + ...(params.revisedPrompt ? 
{ revised_prompt: params.revisedPrompt } : {}), + }, + }, + { + type: "response.completed", + response: { + usage: { input_tokens: 10, output_tokens: 20, total_tokens: 30 }, + tool_usage: { image_gen: { total_tokens: 30 } }, + }, + }, + ]; + const body = events.map((event) => `data: ${JSON.stringify(event)}\n\n`).join(""); + postJsonRequestMock.mockImplementation(async () => ({ + response: new Response(body), + release: vi.fn(async () => {}), + })); +} + describe("openai image generation provider", () => { afterEach(() => { resolveApiKeyForProviderMock.mockClear(); @@ -252,6 +281,132 @@ describe("openai image generation provider", () => { expect(result.images).toHaveLength(1); }); + it("registers Codex OAuth image generation through Responses streaming", async () => { + mockCodexImageStream({ imageData: "codex-image", revisedPrompt: "revised codex prompt" }); + + const provider = buildOpenAICodexImageGenerationProvider(); + const authStore = { version: 1, profiles: {} }; + const result = await provider.generateImage({ + provider: "openai-codex", + model: "gpt-image-2", + prompt: "Draw a Codex lighthouse", + cfg: {}, + authStore, + count: 1, + size: "1024x1536", + }); + + expect(resolveApiKeyForProviderMock).toHaveBeenCalledWith( + expect.objectContaining({ + provider: "openai-codex", + store: authStore, + }), + ); + expect(resolveProviderHttpRequestConfigMock).toHaveBeenCalledWith( + expect.objectContaining({ + defaultBaseUrl: "https://chatgpt.com/backend-api/codex", + defaultHeaders: expect.objectContaining({ + Authorization: "Bearer openai-key", + Accept: "text/event-stream", + }), + provider: "openai-codex", + api: "openai-codex-responses", + capability: "image", + }), + ); + expect(postJsonRequestMock).toHaveBeenCalledWith( + expect.objectContaining({ + url: "https://chatgpt.com/backend-api/codex/responses", + body: expect.objectContaining({ + model: "gpt-5.4", + instructions: "You are an image generation assistant.", + stream: true, + store: false, + tools: [ 
+ { + type: "image_generation", + model: "gpt-image-2", + size: "1024x1536", + }, + ], + tool_choice: { type: "image_generation" }, + }), + }), + ); + expect(postMultipartRequestMock).not.toHaveBeenCalled(); + expect(result.images).toEqual([ + { + buffer: Buffer.from("codex-image"), + mimeType: "image/png", + fileName: "image-1.png", + revisedPrompt: "revised codex prompt", + }, + ]); + expect(result.metadata).toEqual({ + responses: [ + { + usage: { input_tokens: 10, output_tokens: 20, total_tokens: 30 }, + toolUsage: { image_gen: { total_tokens: 30 } }, + }, + ], + }); + }); + + it("sends Codex reference images as Responses input images", async () => { + mockCodexImageStream(); + + const provider = buildOpenAICodexImageGenerationProvider(); + await provider.generateImage({ + provider: "openai-codex", + model: "gpt-image-2", + prompt: "Use the reference image", + cfg: {}, + inputImages: [ + { buffer: Buffer.from("png-bytes"), mimeType: "image/png", fileName: "ref.png" }, + ], + }); + + const body = postJsonRequestMock.mock.calls[0]?.[0].body as { + input: Array<{ content: Array<Record<string, unknown>> }>; + }; + expect(body.input[0]?.content).toEqual([ + { type: "input_text", text: "Use the reference image" }, + { + type: "input_image", + image_url: `data:image/png;base64,${Buffer.from("png-bytes").toString("base64")}`, + detail: "auto", + }, + ]); + expect(postJsonRequestMock).not.toHaveBeenCalledWith( + expect.objectContaining({ url: expect.stringContaining("/images/edits") }), + ); + expect(postMultipartRequestMock).not.toHaveBeenCalled(); + }); + + it("satisfies Codex count by issuing one Responses request per image", async () => { + mockCodexImageStream({ imageData: "codex-image" }); + + const provider = buildOpenAICodexImageGenerationProvider(); + const result = await provider.generateImage({ + provider: "openai-codex", + model: "gpt-image-2", + prompt: "Draw two Codex icons", + cfg: {}, + count: 2, + }); + + expect(postJsonRequestMock).toHaveBeenCalledTimes(2); + const firstBody = 
postJsonRequestMock.mock.calls[0]?.[0].body as { + tools: Array<Record<string, unknown>>; + }; + expect(firstBody.tools[0]).toEqual({ + type: "image_generation", + model: "gpt-image-2", + size: "1024x1024", + }); + expect(result.images.map((image) => image.fileName)).toEqual(["image-1.png", "image-2.png"]); + }); + it("forwards SSRF guard fields to multipart edit requests", async () => { mockGeneratedPngResponse(); diff --git a/extensions/openai/image-generation-provider.ts b/extensions/openai/image-generation-provider.ts index bf3906f5f28..6e3df1af7ed 100644 --- a/extensions/openai/image-generation-provider.ts +++ b/extensions/openai/image-generation-provider.ts @@ -1,6 +1,10 @@ import path from "node:path"; import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime"; -import type { ImageGenerationProvider } from "openclaw/plugin-sdk/image-generation"; +import type { + ImageGenerationProvider, + ImageGenerationResult, + ImageGenerationSourceImage, +} from "openclaw/plugin-sdk/image-generation"; import { isProviderApiKeyConfigured } from "openclaw/plugin-sdk/provider-auth"; import { resolveApiKeyForProvider } from "openclaw/plugin-sdk/provider-auth-runtime"; import { @@ -13,6 +17,8 @@ import { OPENAI_DEFAULT_IMAGE_MODEL as DEFAULT_OPENAI_IMAGE_MODEL } from "./defa import { resolveConfiguredOpenAIBaseUrl } from "./shared.js"; const DEFAULT_OPENAI_IMAGE_BASE_URL = "https://api.openai.com/v1"; +const DEFAULT_OPENAI_CODEX_IMAGE_BASE_URL = "https://chatgpt.com/backend-api/codex"; +const OPENAI_CODEX_IMAGE_INSTRUCTIONS = "You are an image generation assistant."; const DEFAULT_OUTPUT_MIME = "image/png"; const DEFAULT_SIZE = "1024x1024"; const OPENAI_SUPPORTED_SIZES = [ @@ -85,6 +91,24 @@ type OpenAIImageApiResponse = { }>; }; +type OpenAICodexImageGenerationEvent = { + type?: string; + item?: { + type?: string; + result?: string; + revised_prompt?: string; + }; + response?: { + usage?: unknown; + tool_usage?: unknown; + }; + error?: { + code?: string; + message?: string; + }; + 
message?: string; +}; + function inferImageUploadFileName(params: { fileName?: string; mimeType?: string; @@ -99,17 +123,115 @@ function inferImageUploadFileName(params: { return `image-${params.index + 1}.${ext}`; } -export function buildOpenAIImageGenerationProvider(): ImageGenerationProvider { +function toOpenAIDataUrl(image: ImageGenerationSourceImage): string { + const mimeType = image.mimeType?.trim() || DEFAULT_OUTPUT_MIME; + return `data:${mimeType};base64,${Buffer.from(image.buffer).toString("base64")}`; +} + +async function readResponseBodyText(response: Response): Promise<string> { + if (!response.body) { + return await response.text(); + } + const reader = response.body.getReader(); + const decoder = new TextDecoder(); + let text = ""; + try { + while (true) { + const { value, done } = await reader.read(); + if (value) { + text += decoder.decode(value, { stream: !done }); + } + if (done) { + text += decoder.decode(); + return text; + } + } + } finally { + reader.releaseLock(); + } +} + +function parseCodexImageGenerationEvents(body: string): OpenAICodexImageGenerationEvent[] { + const events: OpenAICodexImageGenerationEvent[] = []; + for (const line of body.split(/\r?\n/)) { + if (!line.startsWith("data: ")) { + continue; + } + const data = line.slice(6).trim(); + if (!data || data === "[DONE]") { + continue; + } + try { + events.push(JSON.parse(data) as OpenAICodexImageGenerationEvent); + } catch { + // Ignore non-JSON SSE payloads from intermediaries; failed HTTP statuses + // are handled before this parser runs. + } + } + return events; +} + +function extractCodexImageGenerationResult(params: { + body: string; + model: string; +}): ImageGenerationResult { + const events = parseCodexImageGenerationEvents(params.body); + const failure = events.find( + (event) => event.type === "response.failed" || event.type === "error", + ); + if (failure) { + const message = + failure.error?.message ?? + failure.message ?? + (failure.error?.code ? 
`OpenAI Codex image generation failed (${failure.error.code})` : ""); + throw new Error(message || "OpenAI Codex image generation failed"); + } + const completedResponse = events.find((event) => event.type === "response.completed"); + const images = events + .filter( + (event) => + event.type === "response.output_item.done" && + event.item?.type === "image_generation_call" && + typeof event.item.result === "string" && + event.item.result.length > 0, + ) + .map((event, index) => + Object.assign( + { + buffer: Buffer.from(event.item?.result ?? "", "base64"), + mimeType: DEFAULT_OUTPUT_MIME, + fileName: `image-${index + 1}.png`, + }, + event.item?.revised_prompt ? { revisedPrompt: event.item.revised_prompt } : {}, + ), + ); + return { - id: "openai", - label: "OpenAI", + images, + model: params.model, + ...(completedResponse?.response + ? { + metadata: { + usage: completedResponse.response.usage, + toolUsage: completedResponse.response.tool_usage, + }, + } + : {}), + }; +} + +function createOpenAIImageGenerationProviderBase(params: { + id: "openai" | "openai-codex"; + label: string; + isConfigured: ImageGenerationProvider["isConfigured"]; + generateImage: ImageGenerationProvider["generateImage"]; +}): ImageGenerationProvider { + return { + id: params.id, + label: params.label, defaultModel: DEFAULT_OPENAI_IMAGE_MODEL, models: [DEFAULT_OPENAI_IMAGE_MODEL], - isConfigured: ({ agentDir }) => - isProviderApiKeyConfigured({ - provider: "openai", - agentDir, - }), + isConfigured: params.isConfigured, capabilities: { generate: { maxCount: 4, @@ -129,6 +251,19 @@ export function buildOpenAIImageGenerationProvider(): ImageGenerationProvider { sizes: [...OPENAI_SUPPORTED_SIZES], }, }, + generateImage: params.generateImage, + }; +} + +export function buildOpenAIImageGenerationProvider(): ImageGenerationProvider { + return createOpenAIImageGenerationProviderBase({ + id: "openai", + label: "OpenAI", + isConfigured: ({ agentDir }) => + isProviderApiKeyConfigured({ + provider: 
"openai", + agentDir, + }), async generateImage(req) { const inputImages = req.inputImages ?? []; const isEdit = inputImages.length > 0; @@ -245,5 +380,110 @@ export function buildOpenAIImageGenerationProvider(): ImageGenerationProvider { await release(); } }, - }; + }); +} + +export function buildOpenAICodexImageGenerationProvider(): ImageGenerationProvider { + return createOpenAIImageGenerationProviderBase({ + id: "openai-codex", + label: "OpenAI Codex", + isConfigured: ({ agentDir }) => + isProviderApiKeyConfigured({ + provider: "openai-codex", + agentDir, + }), + async generateImage(req) { + const inputImages = req.inputImages ?? []; + const auth = await resolveApiKeyForProvider({ + provider: "openai-codex", + cfg: req.cfg, + agentDir: req.agentDir, + store: req.authStore, + }); + if (!auth.apiKey) { + throw new Error("OpenAI Codex OAuth missing"); + } + + const { baseUrl, allowPrivateNetwork, headers, dispatcherPolicy } = + resolveProviderHttpRequestConfig({ + defaultBaseUrl: DEFAULT_OPENAI_CODEX_IMAGE_BASE_URL, + defaultHeaders: { + Authorization: `Bearer ${auth.apiKey}`, + Accept: "text/event-stream", + }, + provider: "openai-codex", + api: "openai-codex-responses", + capability: "image", + transport: "http", + }); + + const model = req.model || DEFAULT_OPENAI_IMAGE_MODEL; + const count = req.count ?? 1; + const size = req.size ?? 
DEFAULT_SIZE; + headers.set("Content-Type", "application/json"); + const content: Array<Record<string, unknown>> = [ + { type: "input_text", text: req.prompt }, + ...inputImages.map((image) => ({ + type: "input_image", + image_url: toOpenAIDataUrl(image), + detail: "auto", + })), + ]; + const results: ImageGenerationResult[] = []; + for (let index = 0; index < count; index += 1) { + const requestResult = await postJsonRequest({ + url: `${baseUrl}/responses`, + headers, + body: { + model: "gpt-5.4", + input: [ + { + role: "user", + content, + }, + ], + instructions: OPENAI_CODEX_IMAGE_INSTRUCTIONS, + tools: [ + { + type: "image_generation", + model, + size, + }, + ], + tool_choice: { type: "image_generation" }, + stream: true, + store: false, + }, + timeoutMs: req.timeoutMs, + fetchFn: fetch, + allowPrivateNetwork, + dispatcherPolicy, + }); + const { response, release } = requestResult; + try { + await assertOkOrThrowHttpError(response, "OpenAI Codex image generation failed"); + results.push( + extractCodexImageGenerationResult({ + body: await readResponseBodyText(response), + model, + }), + ); + } finally { + await release(); + } + } + const images = results.flatMap((result) => result.images); + return { + images: images.map((image, index) => + Object.assign({}, image, { + fileName: `image-${index + 1}.png`, + }), + ), + model, + metadata: { + responses: results.map((result) => result.metadata).filter(Boolean), + }, + }; + }, + }); } diff --git a/extensions/openai/index.ts b/extensions/openai/index.ts index c7f4b9d5642..8540217daf0 100644 --- a/extensions/openai/index.ts +++ b/extensions/openai/index.ts @@ -2,7 +2,10 @@ import { resolvePluginConfigObject } from "openclaw/plugin-sdk/config-runtime"; import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry"; import { buildProviderToolCompatFamilyHooks } from "openclaw/plugin-sdk/provider-tools"; import { buildOpenAICodexCliBackend } from "./cli-backend.js"; -import { buildOpenAIImageGenerationProvider } from 
"./image-generation-provider.js"; +import { + buildOpenAICodexImageGenerationProvider, + buildOpenAIImageGenerationProvider, +} from "./image-generation-provider.js"; import { openaiCodexMediaUnderstandingProvider, openaiMediaUnderstandingProvider, @@ -49,6 +52,7 @@ export default definePluginEntry({ api.registerProvider(buildProviderWithPromptContribution(buildOpenAICodexProviderPlugin())); api.registerMemoryEmbeddingProvider(openAiMemoryEmbeddingProviderAdapter); api.registerImageGenerationProvider(buildOpenAIImageGenerationProvider()); + api.registerImageGenerationProvider(buildOpenAICodexImageGenerationProvider()); api.registerRealtimeTranscriptionProvider(buildOpenAIRealtimeTranscriptionProvider()); api.registerRealtimeVoiceProvider(buildOpenAIRealtimeVoiceProvider()); api.registerSpeechProvider(buildOpenAISpeechProvider()); diff --git a/extensions/openai/openclaw.plugin.json b/extensions/openai/openclaw.plugin.json index 100fed03b2b..e5148bac7ec 100644 --- a/extensions/openai/openclaw.plugin.json +++ b/extensions/openai/openclaw.plugin.json @@ -54,7 +54,7 @@ "realtimeVoiceProviders": ["openai"], "memoryEmbeddingProviders": ["openai"], "mediaUnderstandingProviders": ["openai", "openai-codex"], - "imageGenerationProviders": ["openai"], + "imageGenerationProviders": ["openai", "openai-codex"], "videoGenerationProviders": ["openai"] }, "mediaUnderstandingProviderMetadata": { diff --git a/test/helpers/plugins/plugin-registration-contract-cases.ts b/test/helpers/plugins/plugin-registration-contract-cases.ts index 2eef2e7ff34..cfff7e38f56 100644 --- a/test/helpers/plugins/plugin-registration-contract-cases.ts +++ b/test/helpers/plugins/plugin-registration-contract-cases.ts @@ -104,7 +104,7 @@ export const pluginRegistrationContractCases = { realtimeTranscriptionProviderIds: ["openai"], realtimeVoiceProviderIds: ["openai"], mediaUnderstandingProviderIds: ["openai", "openai-codex"], - imageGenerationProviderIds: ["openai"], + imageGenerationProviderIds: ["openai", 
"openai-codex"], requireSpeechVoices: true, requireDescribeImages: true, requireGenerateImage: true,