fix(qa): support image understanding inputs

This commit is contained in:
Peter Steinberger
2026-04-06 04:46:10 +01:00
parent 9f8900bb3c
commit 2285bacd21
3 changed files with 79 additions and 2 deletions

View File

@@ -2,6 +2,8 @@ import { afterEach, describe, expect, it } from "vitest";
import { startQaMockOpenAiServer } from "./mock-openai-server.js";
const cleanups: Array<() => Promise<void>> = [];
const QA_IMAGE_PNG_BASE64 =
"iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAT0lEQVR42u3RQQkAMAzAwPg33Wnos+wgBo40dboAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANYADwAAAAAAAAAAAAAAAAAAAAAAAAAAAAC+Azy47PDiI4pA2wAAAABJRU5ErkJggg==";
afterEach(async () => {
while (cleanups.length > 0) {
@@ -332,6 +334,56 @@ describe("qa mock openai server", () => {
});
});
it("records image inputs and describes attached images", async () => {
  // Boot the mock OpenAI server on an ephemeral port and register teardown.
  const server = await startQaMockOpenAiServer({ host: "127.0.0.1", port: 0 });
  cleanups.push(async () => {
    await server.stop();
  });
  // Non-streaming /v1/responses request carrying one text part plus one
  // base64-encoded PNG image part in a single user message.
  const requestBody = {
    stream: false,
    model: "mock-openai/gpt-5.4",
    input: [
      {
        role: "user",
        content: [
          { type: "input_text", text: "Image understanding check: what do you see?" },
          {
            type: "input_image",
            source: {
              type: "base64",
              mime_type: "image/png",
              data: QA_IMAGE_PNG_BASE64,
            },
          },
        ],
      },
    ],
  };
  const response = await fetch(`${server.baseUrl}/v1/responses`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify(requestBody),
  });
  expect(response.status).toBe(200);
  const payload = (await response.json()) as {
    output?: Array<{ content?: Array<{ text?: string }> }>;
  };
  // The mock replies with a description of the red/blue test image.
  const replyText = payload.output?.[0]?.content?.[0]?.text ?? "";
  const lowered = replyText.toLowerCase();
  expect(lowered).toContain("red");
  expect(lowered).toContain("blue");
  // The debug endpoint must have snapshotted exactly one image input part.
  const debug = await fetch(`${server.baseUrl}/debug/requests`);
  expect(debug.status).toBe(200);
  expect(await debug.json()).toMatchObject([
    expect.objectContaining({
      imageInputCount: 1,
    }),
  ]);
});
it("ignores stale tool output from prior turns when planning the current turn", async () => {
const server = await startQaMockOpenAiServer({
host: "127.0.0.1",

View File

@@ -27,6 +27,7 @@ type MockOpenAiRequestSnapshot = {
allInputText: string;
toolOutput: string;
model: string;
imageInputCount: number;
plannedToolName?: string;
};
@@ -159,6 +160,25 @@ function extractAllInputTexts(input: ResponsesInputItem[]) {
return texts.join("\n");
}
// Counts how many `input_image` content parts appear anywhere in the
// request input. Items whose `content` is not an array contribute nothing;
// non-object entries inside a content array are ignored.
function countImageInputs(input: ResponsesInputItem[]) {
  let total = 0;
  for (const item of input) {
    const parts = item.content;
    if (!Array.isArray(parts)) {
      continue;
    }
    total += parts.filter(
      (part) =>
        Boolean(part) &&
        typeof part === "object" &&
        (part as { type?: unknown }).type === "input_image",
    ).length;
  }
  return total;
}
function parseToolOutputJson(toolOutput: string): Record<string, unknown> | null {
if (!toolOutput.trim()) {
return null;
@@ -301,6 +321,7 @@ function buildAssistantText(input: ResponsesInputItem[], body: Record<string, un
const orbitCode = extractOrbitCode(memorySnippet);
const mediaPath = /MEDIA:([^\n]+)/.exec(toolOutput)?.[1]?.trim();
const exactReplyDirective = extractExactReplyDirective(allInputText);
const imageInputCount = countImageInputs(input);
if (/what was the qa canary code/i.test(prompt) && rememberedFact) {
return `Protocol note: the QA canary code was ${rememberedFact}.`;
@@ -332,6 +353,9 @@ function buildAssistantText(input: ResponsesInputItem[], body: Record<string, un
if (/image generation check/i.test(prompt) && mediaPath) {
return `Protocol note: generated the QA lighthouse image successfully.\nMEDIA:${mediaPath}`;
}
if (/image understanding check/i.test(prompt) && imageInputCount > 0) {
return "Protocol note: the attached image is split horizontally, with red on top and blue on the bottom.";
}
if (toolOutput && /delegate|subagent/i.test(prompt)) {
return `Protocol note: delegated result acknowledged. The bounded subagent task returned and is folded back into the main thread.`;
}
@@ -558,6 +582,7 @@ export async function startQaMockOpenAiServer(params?: { host?: string; port?: n
allInputText: extractAllInputTexts(input),
toolOutput: extractToolOutput(input),
model: typeof body.model === "string" ? body.model : "",
imageInputCount: countImageInputs(input),
plannedToolName: extractPlannedToolName(events),
};
requests.push(lastRequest);

View File

@@ -49,7 +49,7 @@ export function buildQaGatewayConfig(params: {
name: "gpt-5.4",
api: "openai-responses",
reasoning: false,
input: ["text"],
input: ["text", "image"],
cost: {
input: 0,
output: 0,
@@ -64,7 +64,7 @@ export function buildQaGatewayConfig(params: {
name: "gpt-5.4-alt",
api: "openai-responses",
reasoning: false,
input: ["text"],
input: ["text", "image"],
cost: {
input: 0,
output: 0,