Mirror of https://github.com/openclaw/openclaw.git — synced 2026-04-12 09:41:11 +00:00
fix(qa): support image understanding inputs
This commit is contained in:
@@ -2,6 +2,8 @@ import { afterEach, describe, expect, it } from "vitest";
|
||||
import { startQaMockOpenAiServer } from "./mock-openai-server.js";
|
||||
|
||||
const cleanups: Array<() => Promise<void>> = [];
|
||||
const QA_IMAGE_PNG_BASE64 =
|
||||
"iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAT0lEQVR42u3RQQkAMAzAwPg33Wnos+wgBo40dboAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANYADwAAAAAAAAAAAAAAAAAAAAAAAAAAAAC+Azy47PDiI4pA2wAAAABJRU5ErkJggg==";
|
||||
|
||||
afterEach(async () => {
|
||||
while (cleanups.length > 0) {
|
||||
@@ -332,6 +334,56 @@ describe("qa mock openai server", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("records image inputs and describes attached images", async () => {
|
||||
const server = await startQaMockOpenAiServer({
|
||||
host: "127.0.0.1",
|
||||
port: 0,
|
||||
});
|
||||
cleanups.push(async () => {
|
||||
await server.stop();
|
||||
});
|
||||
|
||||
const response = await fetch(`${server.baseUrl}/v1/responses`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
stream: false,
|
||||
model: "mock-openai/gpt-5.4",
|
||||
input: [
|
||||
{
|
||||
role: "user",
|
||||
content: [
|
||||
{ type: "input_text", text: "Image understanding check: what do you see?" },
|
||||
{
|
||||
type: "input_image",
|
||||
source: {
|
||||
type: "base64",
|
||||
mime_type: "image/png",
|
||||
data: QA_IMAGE_PNG_BASE64,
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
}),
|
||||
});
|
||||
expect(response.status).toBe(200);
|
||||
const payload = (await response.json()) as {
|
||||
output?: Array<{ content?: Array<{ text?: string }> }>;
|
||||
};
|
||||
const text = payload.output?.[0]?.content?.[0]?.text ?? "";
|
||||
expect(text.toLowerCase()).toContain("red");
|
||||
expect(text.toLowerCase()).toContain("blue");
|
||||
|
||||
const debug = await fetch(`${server.baseUrl}/debug/requests`);
|
||||
expect(debug.status).toBe(200);
|
||||
expect(await debug.json()).toMatchObject([
|
||||
expect.objectContaining({
|
||||
imageInputCount: 1,
|
||||
}),
|
||||
]);
|
||||
});
|
||||
|
||||
it("ignores stale tool output from prior turns when planning the current turn", async () => {
|
||||
const server = await startQaMockOpenAiServer({
|
||||
host: "127.0.0.1",
|
||||
|
||||
@@ -27,6 +27,7 @@ type MockOpenAiRequestSnapshot = {
|
||||
allInputText: string;
|
||||
toolOutput: string;
|
||||
model: string;
|
||||
imageInputCount: number;
|
||||
plannedToolName?: string;
|
||||
};
|
||||
|
||||
@@ -159,6 +160,25 @@ function extractAllInputTexts(input: ResponsesInputItem[]) {
|
||||
return texts.join("\n");
|
||||
}
|
||||
|
||||
function countImageInputs(input: ResponsesInputItem[]) {
|
||||
let count = 0;
|
||||
for (const item of input) {
|
||||
if (!Array.isArray(item.content)) {
|
||||
continue;
|
||||
}
|
||||
for (const entry of item.content) {
|
||||
if (
|
||||
entry &&
|
||||
typeof entry === "object" &&
|
||||
(entry as { type?: unknown }).type === "input_image"
|
||||
) {
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
function parseToolOutputJson(toolOutput: string): Record<string, unknown> | null {
|
||||
if (!toolOutput.trim()) {
|
||||
return null;
|
||||
@@ -301,6 +321,7 @@ function buildAssistantText(input: ResponsesInputItem[], body: Record<string, un
|
||||
const orbitCode = extractOrbitCode(memorySnippet);
|
||||
const mediaPath = /MEDIA:([^\n]+)/.exec(toolOutput)?.[1]?.trim();
|
||||
const exactReplyDirective = extractExactReplyDirective(allInputText);
|
||||
const imageInputCount = countImageInputs(input);
|
||||
|
||||
if (/what was the qa canary code/i.test(prompt) && rememberedFact) {
|
||||
return `Protocol note: the QA canary code was ${rememberedFact}.`;
|
||||
@@ -332,6 +353,9 @@ function buildAssistantText(input: ResponsesInputItem[], body: Record<string, un
|
||||
if (/image generation check/i.test(prompt) && mediaPath) {
|
||||
return `Protocol note: generated the QA lighthouse image successfully.\nMEDIA:${mediaPath}`;
|
||||
}
|
||||
if (/image understanding check/i.test(prompt) && imageInputCount > 0) {
|
||||
return "Protocol note: the attached image is split horizontally, with red on top and blue on the bottom.";
|
||||
}
|
||||
if (toolOutput && /delegate|subagent/i.test(prompt)) {
|
||||
return `Protocol note: delegated result acknowledged. The bounded subagent task returned and is folded back into the main thread.`;
|
||||
}
|
||||
@@ -558,6 +582,7 @@ export async function startQaMockOpenAiServer(params?: { host?: string; port?: n
|
||||
allInputText: extractAllInputTexts(input),
|
||||
toolOutput: extractToolOutput(input),
|
||||
model: typeof body.model === "string" ? body.model : "",
|
||||
imageInputCount: countImageInputs(input),
|
||||
plannedToolName: extractPlannedToolName(events),
|
||||
};
|
||||
requests.push(lastRequest);
|
||||
|
||||
@@ -49,7 +49,7 @@ export function buildQaGatewayConfig(params: {
|
||||
name: "gpt-5.4",
|
||||
api: "openai-responses",
|
||||
reasoning: false,
|
||||
input: ["text"],
|
||||
input: ["text", "image"],
|
||||
cost: {
|
||||
input: 0,
|
||||
output: 0,
|
||||
@@ -64,7 +64,7 @@ export function buildQaGatewayConfig(params: {
|
||||
name: "gpt-5.4-alt",
|
||||
api: "openai-responses",
|
||||
reasoning: false,
|
||||
input: ["text"],
|
||||
input: ["text", "image"],
|
||||
cost: {
|
||||
input: 0,
|
||||
output: 0,
|
||||
|
||||
Reference in New Issue
Block a user