From 2285bacd21db6445fcc326386c34cf98cc5092e7 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 6 Apr 2026 04:46:10 +0100 Subject: [PATCH] fix(qa): support image understanding inputs --- .../qa-lab/src/mock-openai-server.test.ts | 52 +++++++++++++++++++ extensions/qa-lab/src/mock-openai-server.ts | 25 +++++++++ extensions/qa-lab/src/qa-gateway-config.ts | 4 +- 3 files changed, 79 insertions(+), 2 deletions(-) diff --git a/extensions/qa-lab/src/mock-openai-server.test.ts b/extensions/qa-lab/src/mock-openai-server.test.ts index 572b22db21e..59c0addcc55 100644 --- a/extensions/qa-lab/src/mock-openai-server.test.ts +++ b/extensions/qa-lab/src/mock-openai-server.test.ts @@ -2,6 +2,8 @@ import { afterEach, describe, expect, it } from "vitest"; import { startQaMockOpenAiServer } from "./mock-openai-server.js"; const cleanups: Array<() => Promise> = []; +const QA_IMAGE_PNG_BASE64 = + "iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAT0lEQVR42u3RQQkAMAzAwPg33Wnos+wgBo40dboAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANYADwAAAAAAAAAAAAAAAAAAAAAAAAAAAAC+Azy47PDiI4pA2wAAAABJRU5ErkJggg=="; afterEach(async () => { while (cleanups.length > 0) { @@ -332,6 +334,56 @@ describe("qa mock openai server", () => { }); }); + it("records image inputs and describes attached images", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: false, + model: "mock-openai/gpt-5.4", + input: [ + { + role: "user", + content: [ + { type: "input_text", text: "Image understanding check: what do you see?" }, + { + type: "input_image", + source: { + type: "base64", + mime_type: "image/png", + data: QA_IMAGE_PNG_BASE64, + }, + }, + ], + }, + ], + }), + }); + expect(response.status).toBe(200); + const payload = (await response.json()) as { + output?: Array<{ content?: Array<{ text?: string }> }>; + }; + const text = payload.output?.[0]?.content?.[0]?.text ?? ""; + expect(text.toLowerCase()).toContain("red"); + expect(text.toLowerCase()).toContain("blue"); + + const debug = await fetch(`${server.baseUrl}/debug/requests`); + expect(debug.status).toBe(200); + expect(await debug.json()).toMatchObject([ + expect.objectContaining({ + imageInputCount: 1, + }), + ]); + }); + it("ignores stale tool output from prior turns when planning the current turn", async () => { const server = await startQaMockOpenAiServer({ host: "127.0.0.1", diff --git a/extensions/qa-lab/src/mock-openai-server.ts b/extensions/qa-lab/src/mock-openai-server.ts index b555bcbde91..64fe47e7ddd 100644 --- a/extensions/qa-lab/src/mock-openai-server.ts +++ b/extensions/qa-lab/src/mock-openai-server.ts @@ -27,6 +27,7 @@ type MockOpenAiRequestSnapshot = { allInputText: string; toolOutput: string; model: string; + imageInputCount: number; plannedToolName?: string; }; @@ -159,6 +160,25 @@ function extractAllInputTexts(input: ResponsesInputItem[]) { return texts.join("\n"); } +function countImageInputs(input: ResponsesInputItem[]) { + let count = 0; + for (const item of input) { + if (!Array.isArray(item.content)) { + continue; + } + for (const entry of item.content) { + if ( + entry && + typeof entry === "object" && + (entry as { type?: unknown }).type === "input_image" + ) { + count += 1; + } + } + } + return count; +} + function parseToolOutputJson(toolOutput: string): Record | null { if (!toolOutput.trim()) { return null; @@ -301,6 +321,7 @@ function buildAssistantText(input: ResponsesInputItem[], body: Record 0) { + return "Protocol note: the attached image is split horizontally, with red on top and blue on the bottom."; + } if (toolOutput && /delegate|subagent/i.test(prompt)) { return `Protocol note: delegated result acknowledged. The bounded subagent task returned and is folded back into the main thread.`; } @@ -558,6 +582,7 @@ export async function startQaMockOpenAiServer(params?: { host?: string; port?: n allInputText: extractAllInputTexts(input), toolOutput: extractToolOutput(input), model: typeof body.model === "string" ? body.model : "", + imageInputCount: countImageInputs(input), plannedToolName: extractPlannedToolName(events), }; requests.push(lastRequest); diff --git a/extensions/qa-lab/src/qa-gateway-config.ts b/extensions/qa-lab/src/qa-gateway-config.ts index eaf1353a527..e64ded3d41a 100644 --- a/extensions/qa-lab/src/qa-gateway-config.ts +++ b/extensions/qa-lab/src/qa-gateway-config.ts @@ -49,7 +49,7 @@ export function buildQaGatewayConfig(params: { name: "gpt-5.4", api: "openai-responses", reasoning: false, - input: ["text"], + input: ["text", "image"], cost: { input: 0, output: 0, @@ -64,7 +64,7 @@ export function buildQaGatewayConfig(params: { name: "gpt-5.4-alt", api: "openai-responses", reasoning: false, - input: ["text"], + input: ["text", "image"], cost: { input: 0, output: 0,