fix: support OpenAI Codex media understanding (#54829) (thanks @neeravmakwana)

* OpenAI: register Codex media understanding provider

* fix: route codex image prompts through system instructions

* fix: add changelog for codex image tool fix (#54829) (thanks @neeravmakwana)

* fix: remove any from provider registration tests (#54829) (thanks @neeravmakwana)

---------

Co-authored-by: Ayaan Zaidi <hi@obviy.us>
This commit is contained in:
Neerav Makwana
2026-03-26 00:40:11 -04:00
committed by GitHub
parent 76ff0d9298
commit 6fd9d2ff38
9 changed files with 154 additions and 113 deletions

View File

@@ -177,6 +177,67 @@ describe("describeImageWithModel", () => {
expect(minimaxUnderstandImageMock).not.toHaveBeenCalled();
});
// Regression test: for an "openai-codex" model, the image prompt must be sent as
// `systemPrompt`, and the user message must carry only the image part.
it("passes image prompt as system instructions for codex image requests", async () => {
// Stub model discovery so every find() call yields an image-capable openai-codex model.
discoverModelsMock.mockReturnValue({
find: vi.fn(() => ({
provider: "openai-codex",
id: "gpt-5.4",
input: ["text", "image"],
baseUrl: "https://chatgpt.com/backend-api",
})),
});
// Stub the completion call with an assistant reply containing a single text part.
completeMock.mockResolvedValue({
role: "assistant",
api: "openai-codex-responses",
provider: "openai-codex",
model: "gpt-5.4",
stopReason: "stop",
timestamp: Date.now(),
content: [{ type: "text", text: "codex ok" }],
});
const result = await describeImageWithModel({
cfg: {},
agentDir: "/tmp/openclaw-agent",
provider: "openai-codex",
model: "gpt-5.4",
buffer: Buffer.from("png-bytes"),
fileName: "image.png",
mime: "image/png",
prompt: "Describe the image.",
timeoutMs: 1000,
});
// The returned description is the stubbed reply text paired with the requested model id.
expect(result).toEqual({
text: "codex ok",
model: "gpt-5.4",
});
expect(completeMock).toHaveBeenCalledOnce();
// Argument 1: the resolved model; argument 2: the request context; argument 3: some options object.
expect(completeMock).toHaveBeenCalledWith(
expect.objectContaining({
provider: "openai-codex",
id: "gpt-5.4",
}),
expect.objectContaining({
// The prompt travels as system instructions rather than as a user text part.
systemPrompt: "Describe the image.",
messages: [
expect.objectContaining({
role: "user",
content: [
expect.objectContaining({
type: "image",
mimeType: "image/png",
}),
],
}),
],
}),
expect.any(Object),
);
// `?? []` keeps the destructuring from throwing if no call was recorded.
const [, context] = completeMock.mock.calls[0] ?? [];
// Explicitly pin that the user message has exactly one content part (the image).
expect(context?.messages?.[0]?.content).toHaveLength(1);
});
it("normalizes deprecated google flash ids before lookup and keeps profile auth selection", async () => {
const findMock = vi.fn((provider: string, modelId: string) => {
expect(provider).toBe("google");

View File

@@ -73,17 +73,15 @@ function buildImageContext(
images: Array<{ buffer: Buffer; mime?: string }>,
): Context {
return {
systemPrompt: prompt,
messages: [
{
role: "user",
content: [
{ type: "text", text: prompt },
...images.map((image) => ({
type: "image" as const,
data: image.buffer.toString("base64"),
mimeType: image.mime ?? "image/jpeg",
})),
],
content: images.map((image) => ({
type: "image" as const,
data: image.buffer.toString("base64"),
mimeType: image.mime ?? "image/jpeg",
})),
timestamp: Date.now(),
},
],

View File

@@ -184,7 +184,10 @@ describe("plugin contract registry", () => {
]);
expect(findMediaUnderstandingProviderIdsForPlugin("mistral")).toEqual(["mistral"]);
expect(findMediaUnderstandingProviderIdsForPlugin("moonshot")).toEqual(["moonshot"]);
expect(findMediaUnderstandingProviderIdsForPlugin("openai")).toEqual(["openai"]);
expect(findMediaUnderstandingProviderIdsForPlugin("openai")).toEqual([
"openai",
"openai-codex",
]);
expect(findMediaUnderstandingProviderIdsForPlugin("zai")).toEqual(["zai"]);
});
@@ -244,7 +247,7 @@ describe("plugin contract registry", () => {
expect(findRegistrationForPlugin("openai")).toMatchObject({
providerIds: ["openai", "openai-codex"],
speechProviderIds: ["openai"],
mediaUnderstandingProviderIds: ["openai"],
mediaUnderstandingProviderIds: ["openai", "openai-codex"],
imageGenerationProviderIds: ["openai"],
videoGenerationProviderIds: [],
});