Mirror of https://github.com/openclaw/openclaw.git — synced 2026-04-12 09:41:11 +00:00
fix(qa): support image understanding inputs
This commit is contained in:
@@ -2,6 +2,8 @@ import { afterEach, describe, expect, it } from "vitest";
|
||||
import { startQaMockOpenAiServer } from "./mock-openai-server.js";
|
||||
|
||||
const cleanups: Array<() => Promise<void>> = [];
|
||||
const QA_IMAGE_PNG_BASE64 =
|
||||
"iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAT0lEQVR42u3RQQkAMAzAwPg33Wnos+wgBo40dboAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANYADwAAAAAAAAAAAAAAAAAAAAAAAAAAAAC+Azy47PDiI4pA2wAAAABJRU5ErkJggg==";
|
||||
|
||||
afterEach(async () => {
|
||||
while (cleanups.length > 0) {
|
||||
@@ -332,6 +334,56 @@ describe("qa mock openai server", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("records image inputs and describes attached images", async () => {
|
||||
const server = await startQaMockOpenAiServer({
|
||||
host: "127.0.0.1",
|
||||
port: 0,
|
||||
});
|
||||
cleanups.push(async () => {
|
||||
await server.stop();
|
||||
});
|
||||
|
||||
const response = await fetch(`${server.baseUrl}/v1/responses`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
stream: false,
|
||||
model: "mock-openai/gpt-5.4",
|
||||
input: [
|
||||
{
|
||||
role: "user",
|
||||
content: [
|
||||
{ type: "input_text", text: "Image understanding check: what do you see?" },
|
||||
{
|
||||
type: "input_image",
|
||||
source: {
|
||||
type: "base64",
|
||||
mime_type: "image/png",
|
||||
data: QA_IMAGE_PNG_BASE64,
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
}),
|
||||
});
|
||||
expect(response.status).toBe(200);
|
||||
const payload = (await response.json()) as {
|
||||
output?: Array<{ content?: Array<{ text?: string }> }>;
|
||||
};
|
||||
const text = payload.output?.[0]?.content?.[0]?.text ?? "";
|
||||
expect(text.toLowerCase()).toContain("red");
|
||||
expect(text.toLowerCase()).toContain("blue");
|
||||
|
||||
const debug = await fetch(`${server.baseUrl}/debug/requests`);
|
||||
expect(debug.status).toBe(200);
|
||||
expect(await debug.json()).toMatchObject([
|
||||
expect.objectContaining({
|
||||
imageInputCount: 1,
|
||||
}),
|
||||
]);
|
||||
});
|
||||
|
||||
it("ignores stale tool output from prior turns when planning the current turn", async () => {
|
||||
const server = await startQaMockOpenAiServer({
|
||||
host: "127.0.0.1",
|
||||
|
||||
@@ -27,6 +27,7 @@ type MockOpenAiRequestSnapshot = {
|
||||
allInputText: string;
|
||||
toolOutput: string;
|
||||
model: string;
|
||||
imageInputCount: number;
|
||||
plannedToolName?: string;
|
||||
};
|
||||
|
||||
@@ -159,6 +160,25 @@ function extractAllInputTexts(input: ResponsesInputItem[]) {
|
||||
return texts.join("\n");
|
||||
}
|
||||
|
||||
function countImageInputs(input: ResponsesInputItem[]) {
|
||||
let count = 0;
|
||||
for (const item of input) {
|
||||
if (!Array.isArray(item.content)) {
|
||||
continue;
|
||||
}
|
||||
for (const entry of item.content) {
|
||||
if (
|
||||
entry &&
|
||||
typeof entry === "object" &&
|
||||
(entry as { type?: unknown }).type === "input_image"
|
||||
) {
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
function parseToolOutputJson(toolOutput: string): Record<string, unknown> | null {
|
||||
if (!toolOutput.trim()) {
|
||||
return null;
|
||||
@@ -301,6 +321,7 @@ function buildAssistantText(input: ResponsesInputItem[], body: Record<string, un
|
||||
const orbitCode = extractOrbitCode(memorySnippet);
|
||||
const mediaPath = /MEDIA:([^\n]+)/.exec(toolOutput)?.[1]?.trim();
|
||||
const exactReplyDirective = extractExactReplyDirective(allInputText);
|
||||
const imageInputCount = countImageInputs(input);
|
||||
|
||||
if (/what was the qa canary code/i.test(prompt) && rememberedFact) {
|
||||
return `Protocol note: the QA canary code was ${rememberedFact}.`;
|
||||
@@ -332,6 +353,9 @@ function buildAssistantText(input: ResponsesInputItem[], body: Record<string, un
|
||||
if (/image generation check/i.test(prompt) && mediaPath) {
|
||||
return `Protocol note: generated the QA lighthouse image successfully.\nMEDIA:${mediaPath}`;
|
||||
}
|
||||
if (/image understanding check/i.test(prompt) && imageInputCount > 0) {
|
||||
return "Protocol note: the attached image is split horizontally, with red on top and blue on the bottom.";
|
||||
}
|
||||
if (toolOutput && /delegate|subagent/i.test(prompt)) {
|
||||
return `Protocol note: delegated result acknowledged. The bounded subagent task returned and is folded back into the main thread.`;
|
||||
}
|
||||
@@ -558,6 +582,7 @@ export async function startQaMockOpenAiServer(params?: { host?: string; port?: n
|
||||
allInputText: extractAllInputTexts(input),
|
||||
toolOutput: extractToolOutput(input),
|
||||
model: typeof body.model === "string" ? body.model : "",
|
||||
imageInputCount: countImageInputs(input),
|
||||
plannedToolName: extractPlannedToolName(events),
|
||||
};
|
||||
requests.push(lastRequest);
|
||||
|
||||
@@ -49,7 +49,7 @@ export function buildQaGatewayConfig(params: {
|
||||
name: "gpt-5.4",
|
||||
api: "openai-responses",
|
||||
reasoning: false,
|
||||
input: ["text"],
|
||||
input: ["text", "image"],
|
||||
cost: {
|
||||
input: 0,
|
||||
output: 0,
|
||||
@@ -64,7 +64,7 @@ export function buildQaGatewayConfig(params: {
|
||||
name: "gpt-5.4-alt",
|
||||
api: "openai-responses",
|
||||
reasoning: false,
|
||||
input: ["text"],
|
||||
input: ["text", "image"],
|
||||
cost: {
|
||||
input: 0,
|
||||
output: 0,
|
||||
|
||||
Reference in New Issue
Block a user