From e63cbe831b14474384412d849a0b389598cfd1ff Mon Sep 17 00:00:00 2001
From: Vincent Koc
Date: Tue, 14 Apr 2026 01:39:49 +0100
Subject: [PATCH] test(qa-lab): cover GPT-style broken turns

---
Reviewer notes (free-form commentary placed after the three-dash separator;
not part of the commit message and not part of any hunk):

* mock-openai-server.ts adds buildReasoningOnlyEvents(), which emits a
  reasoning output item with no assistant message, plus four branches in
  buildResponsesPayload() selected by matching the combined request text:
    - "reasoning-only continuation qa check": read tool call, then a
      reasoning-only turn, then "REASONING-RECOVERED-OK" once the
      reasoning-only retry instruction is present in the input.
    - "reasoning-only after write safety check": write tool call, then a
      reasoning-only turn; if the retry instruction ever shows up the mock
      answers "BUG-SHOULD-NOT-AUTO-RETRY".
    - "empty response continuation qa check": read tool call, then an empty
      assistant text, then "EMPTY-RECOVERED-OK" once the empty-response
      retry instruction is present.
    - "empty response exhaustion qa check": read tool call, then an empty
      assistant text on every later request.
* mock-openai-server.test.ts adds small request helpers (startMockServer,
  postResponses, expectResponsesText, expectResponsesJson, makeUserInput)
  and one test per branch above.
* NOTE(review): this copy of the patch had lost its line breaks and its
  tag-like generic arguments. `expectResponsesJson<T>` is restored from its
  call sites; `Promise<void>` and `Record<string, unknown>` on context lines
  are the presumed originals - confirm against the tree before applying.

 .../qa-lab/src/mock-openai-server.test.ts     | 250 ++++++++++++++++++
 extensions/qa-lab/src/mock-openai-server.ts   |  83 ++++++
 2 files changed, 333 insertions(+)

diff --git a/extensions/qa-lab/src/mock-openai-server.test.ts b/extensions/qa-lab/src/mock-openai-server.test.ts
index 5e598f7949c..6baf6c02cc4 100644
--- a/extensions/qa-lab/src/mock-openai-server.test.ts
+++ b/extensions/qa-lab/src/mock-openai-server.test.ts
@@ -4,6 +4,18 @@ import { resolveProviderVariant, startQaMockOpenAiServer } from "./mock-openai-s
 const cleanups: Array<() => Promise<void>> = [];
 const QA_IMAGE_PNG_BASE64 =
   "iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAT0lEQVR42u3RQQkAMAzAwPg33Wnos+wgBo40dboAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANYADwAAAAAAAAAAAAAAAAAAAAAAAAAAAAC+Azy47PDiI4pA2wAAAABJRU5ErkJggg==";
+const QA_REASONING_ONLY_RECOVERY_PROMPT =
+  "Reasoning-only continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly REASONING-RECOVERED-OK.";
+const QA_REASONING_ONLY_SIDE_EFFECT_PROMPT =
+  "Reasoning-only after write safety check: write reasoning-only-side-effect.txt, then answer with exactly SIDE-EFFECT-GUARD-OK.";
+const QA_EMPTY_RESPONSE_RECOVERY_PROMPT =
+  "Empty response continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly EMPTY-RECOVERED-OK.";
+const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT =
+  "Empty response exhaustion QA check: read QA_KICKOFF_TASK.md, then answer with exactly EMPTY-EXHAUSTED-OK.";
+const QA_REASONING_ONLY_RETRY_INSTRUCTION =
+  "The previous assistant turn recorded reasoning but did not produce a user-visible answer. Continue from that partial turn and produce the visible answer now. Do not restate the reasoning or restart from scratch.";
+const QA_EMPTY_RESPONSE_RETRY_INSTRUCTION =
+  "The previous attempt did not produce a user-visible answer. Continue from the current state and produce the visible answer now. Do not restart from scratch.";
 
 afterEach(async () => {
   while (cleanups.length > 0) {
@@ -11,6 +23,46 @@ afterEach(async () => {
   }
 });
 
+async function startMockServer() {
+  const server = await startQaMockOpenAiServer({
+    host: "127.0.0.1",
+    port: 0,
+  });
+  cleanups.push(async () => {
+    await server.stop();
+  });
+  return server;
+}
+
+async function postResponses(server: { baseUrl: string }, body: unknown) {
+  return fetch(`${server.baseUrl}/v1/responses`, {
+    method: "POST",
+    headers: {
+      "content-type": "application/json",
+    },
+    body: JSON.stringify(body),
+  });
+}
+
+async function expectResponsesText(server: { baseUrl: string }, body: unknown) {
+  const response = await postResponses(server, body);
+  expect(response.status).toBe(200);
+  return response.text();
+}
+
+async function expectResponsesJson<T>(server: { baseUrl: string }, body: unknown) {
+  const response = await postResponses(server, body);
+  expect(response.status).toBe(200);
+  return (await response.json()) as T;
+}
+
+function makeUserInput(text: string) {
+  return {
+    role: "user" as const,
+    content: [{ type: "input_text" as const, text }],
+  };
+}
+
 describe("qa mock openai server", () => {
   it("serves health and streamed responses", async () => {
     const server = await startQaMockOpenAiServer({
@@ -1750,6 +1802,204 @@ describe("qa mock openai server", () => {
     const debug = (await debugResponse.json()) as { model: string };
     expect(debug.model).toBe("claude-opus-4-6");
   });
+
+  it("scripts a reasoning-only recovery sequence after a replay-safe read", async () => {
+    const server = await startMockServer();
+
+    const toolPlan = await expectResponsesText(server, {
+      stream: true,
+      model: "gpt-5.4",
+      input: [makeUserInput(QA_REASONING_ONLY_RECOVERY_PROMPT)],
+    });
+    expect(toolPlan).toContain('"name":"read"');
+    expect(toolPlan).toContain("QA_KICKOFF_TASK.md");
+
+    expect(
+      await expectResponsesJson<{
+        output?: Array<{ type?: string; id?: string; summary?: Array<{ text?: string }> }>;
+      }>(server, {
+        stream: false,
+        model: "gpt-5.4",
+        input: [
+          makeUserInput(QA_REASONING_ONLY_RECOVERY_PROMPT),
+          {
+            type: "function_call_output",
+            output: "QA mission: Understand this OpenClaw repo from source + docs before acting.",
+          },
+        ],
+      }),
+    ).toMatchObject({
+      output: [
+        {
+          type: "reasoning",
+          id: "rs_mock_reasoning_recovery",
+          summary: [{ text: expect.stringContaining("Need visible answer") }],
+        },
+      ],
+    });
+
+    expect(
+      await expectResponsesJson<{
+        output?: Array<{ content?: Array<{ text?: string }> }>;
+      }>(server, {
+        stream: false,
+        model: "gpt-5.4",
+        input: [
+          makeUserInput(QA_REASONING_ONLY_RECOVERY_PROMPT),
+          makeUserInput(QA_REASONING_ONLY_RETRY_INSTRUCTION),
+          {
+            type: "function_call_output",
+            output: "QA mission: Understand this OpenClaw repo from source + docs before acting.",
+          },
+        ],
+      }),
+    ).toMatchObject({
+      output: [
+        {
+          content: [{ text: "REASONING-RECOVERED-OK" }],
+        },
+      ],
+    });
+
+    const requests = await fetch(`${server.baseUrl}/debug/requests`);
+    expect(requests.status).toBe(200);
+    expect(await requests.json()).toMatchObject([
+      { plannedToolName: "read" },
+      { allInputText: expect.stringContaining(QA_REASONING_ONLY_RECOVERY_PROMPT) },
+      { allInputText: expect.stringContaining(QA_REASONING_ONLY_RETRY_INSTRUCTION) },
+    ]);
+  });
+
+  it("keeps the reasoning-only side-effect path ready for no-auto-retry QA coverage", async () => {
+    const server = await startMockServer();
+
+    const toolPlan = await expectResponsesText(server, {
+      stream: true,
+      model: "gpt-5.4",
+      input: [makeUserInput(QA_REASONING_ONLY_SIDE_EFFECT_PROMPT)],
+    });
+    expect(toolPlan).toContain('"name":"write"');
+    expect(toolPlan).toContain("reasoning-only-side-effect.txt");
+
+    expect(
+      await expectResponsesJson<{
+        output?: Array<{ type?: string; id?: string }>;
+      }>(server, {
+        stream: false,
+        model: "gpt-5.4",
+        input: [
+          makeUserInput(QA_REASONING_ONLY_SIDE_EFFECT_PROMPT),
+          {
+            type: "function_call_output",
+            output: "Successfully wrote 28 bytes to reasoning-only-side-effect.txt.",
+          },
+        ],
+      }),
+    ).toMatchObject({
+      output: [{ type: "reasoning", id: "rs_mock_reasoning_side_effect" }],
+    });
+
+    const requests = await fetch(`${server.baseUrl}/debug/requests`);
+    expect(requests.status).toBe(200);
+    expect((await requests.json()) as Array<{ allInputText?: string }>).toHaveLength(2);
+  });
+
+  it("scripts an empty-response recovery sequence after a replay-safe read", async () => {
+    const server = await startMockServer();
+
+    const toolPlan = await expectResponsesText(server, {
+      stream: true,
+      model: "gpt-5.4",
+      input: [makeUserInput(QA_EMPTY_RESPONSE_RECOVERY_PROMPT)],
+    });
+    expect(toolPlan).toContain('"name":"read"');
+
+    expect(
+      await expectResponsesJson<{
+        output?: Array<{ content?: Array<{ type?: string; text?: string }> }>;
+      }>(server, {
+        stream: false,
+        model: "gpt-5.4",
+        input: [
+          makeUserInput(QA_EMPTY_RESPONSE_RECOVERY_PROMPT),
+          {
+            type: "function_call_output",
+            output: "QA mission: Understand this OpenClaw repo from source + docs before acting.",
+          },
+        ],
+      }),
+    ).toMatchObject({
+      output: [
+        {
+          content: [{ type: "output_text", text: "" }],
+        },
+      ],
+    });
+
+    expect(
+      await expectResponsesJson<{
+        output?: Array<{ content?: Array<{ text?: string }> }>;
+      }>(server, {
+        stream: false,
+        model: "gpt-5.4",
+        input: [
+          makeUserInput(QA_EMPTY_RESPONSE_RECOVERY_PROMPT),
+          makeUserInput(QA_EMPTY_RESPONSE_RETRY_INSTRUCTION),
+          {
+            type: "function_call_output",
+            output: "QA mission: Understand this OpenClaw repo from source + docs before acting.",
+          },
+        ],
+      }),
+    ).toMatchObject({
+      output: [
+        {
+          content: [{ text: "EMPTY-RECOVERED-OK" }],
+        },
+      ],
+    });
+  });
+
+  it("can keep emitting empty GPT turns when the single retry budget should exhaust", async () => {
+    const server = await startMockServer();
+
+    await expectResponsesText(server, {
+      stream: true,
+      model: "gpt-5.4",
+      input: [makeUserInput(QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT)],
+    });
+
+    const firstEmpty = await expectResponsesJson<{
+      output?: Array<{ content?: Array<{ text?: string }> }>;
+    }>(server, {
+      stream: false,
+      model: "gpt-5.4",
+      input: [
+        makeUserInput(QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT),
+        {
+          type: "function_call_output",
+          output: "QA mission: Understand this OpenClaw repo from source + docs before acting.",
+        },
+      ],
+    });
+    expect(firstEmpty.output?.[0]?.content?.[0]?.text).toBe("");
+
+    const secondEmpty = await expectResponsesJson<{
+      output?: Array<{ content?: Array<{ text?: string }> }>;
+    }>(server, {
+      stream: false,
+      model: "gpt-5.4",
+      input: [
+        makeUserInput(QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT),
+        makeUserInput(QA_EMPTY_RESPONSE_RETRY_INSTRUCTION),
+        {
+          type: "function_call_output",
+          output: "QA mission: Understand this OpenClaw repo from source + docs before acting.",
+        },
+      ],
+    });
+    expect(secondEmpty.output?.[0]?.content?.[0]?.text).toBe("");
+  });
 });
 
 describe("resolveProviderVariant", () => {
diff --git a/extensions/qa-lab/src/mock-openai-server.ts b/extensions/qa-lab/src/mock-openai-server.ts
index c0e9b6fdcab..c70a5d973ef 100644
--- a/extensions/qa-lab/src/mock-openai-server.ts
+++ b/extensions/qa-lab/src/mock-openai-server.ts
@@ -124,6 +124,14 @@ type AnthropicMessagesRequest = {
 
 const TINY_PNG_BASE64 =
   "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO7Z0nQAAAAASUVORK5CYII=";
+const QA_REASONING_ONLY_RECOVERY_PROMPT_RE = /reasoning-only continuation qa check/i;
+const QA_REASONING_ONLY_SIDE_EFFECT_PROMPT_RE = /reasoning-only after write safety check/i;
+const QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE = /empty response continuation qa check/i;
+const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT_RE = /empty response exhaustion qa check/i;
+const QA_REASONING_ONLY_RETRY_NEEDLE =
+  "recorded reasoning but did not produce a user-visible answer";
+const QA_EMPTY_RESPONSE_RETRY_NEEDLE =
+  "The previous attempt did not produce a user-visible answer.";
 
 type MockScenarioState = {
   subagentFanoutPhase: number;
@@ -718,6 +726,37 @@ function buildAssistantEvents(text: string): StreamEvent[] {
   ];
 }
 
+function buildReasoningOnlyEvents(summaryText: string, id: string): StreamEvent[] {
+  const reasoningItem = {
+    type: "reasoning",
+    id,
+    summary: [{ text: summaryText }],
+  } as const;
+  return [
+    {
+      type: "response.output_item.added",
+      item: {
+        type: "reasoning",
+        id,
+        summary: [],
+      },
+    },
+    {
+      type: "response.output_item.done",
+      item: reasoningItem,
+    },
+    {
+      type: "response.completed",
+      response: {
+        id: `resp_${id}`,
+        status: "completed",
+        output: [reasoningItem],
+        usage: { input_tokens: 64, output_tokens: 8, total_tokens: 72 },
+      },
+    },
+  ];
+}
+
 async function buildResponsesPayload(
   body: Record<string, unknown>,
   scenarioState: MockScenarioState,
@@ -729,12 +768,56 @@ async function buildResponsesPayload(
   const allInputText = extractAllRequestTexts(input, body);
   const isGroupChat = allInputText.includes('"is_group_chat": true');
   const isBaselineUnmentionedChannelChatter = /\bno bot ping here\b/i.test(prompt);
+  const hasReasoningOnlyRetryInstruction = allInputText.includes(QA_REASONING_ONLY_RETRY_NEEDLE);
+  const hasEmptyResponseRetryInstruction = allInputText.includes(QA_EMPTY_RESPONSE_RETRY_NEEDLE);
   if (/remember this fact/i.test(prompt)) {
     return buildAssistantEvents(buildAssistantText(input, body, scenarioState));
   }
   if (isHeartbeatPrompt(prompt)) {
     return buildAssistantEvents("HEARTBEAT_OK");
   }
+  if (QA_REASONING_ONLY_RECOVERY_PROMPT_RE.test(allInputText)) {
+    if (!toolOutput) {
+      return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });
+    }
+    if (!hasReasoningOnlyRetryInstruction) {
+      return buildReasoningOnlyEvents(
+        "Need visible answer after reading the QA kickoff task.",
+        "rs_mock_reasoning_recovery",
+      );
+    }
+    return buildAssistantEvents("REASONING-RECOVERED-OK");
+  }
+  if (QA_REASONING_ONLY_SIDE_EFFECT_PROMPT_RE.test(allInputText)) {
+    if (!toolOutput) {
+      return buildToolCallEventsWithArgs("write", {
+        path: "reasoning-only-side-effect.txt",
+        content: "side effects already happened\n",
+      });
+    }
+    if (!hasReasoningOnlyRetryInstruction) {
+      return buildReasoningOnlyEvents(
+        "Need visible answer after the write, but the write already happened.",
+        "rs_mock_reasoning_side_effect",
+      );
+    }
+    return buildAssistantEvents("BUG-SHOULD-NOT-AUTO-RETRY");
+  }
+  if (QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE.test(allInputText)) {
+    if (!toolOutput) {
+      return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });
+    }
+    if (!hasEmptyResponseRetryInstruction) {
+      return buildAssistantEvents("");
+    }
+    return buildAssistantEvents("EMPTY-RECOVERED-OK");
+  }
+  if (QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT_RE.test(allInputText)) {
+    if (!toolOutput) {
+      return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });
+    }
+    return buildAssistantEvents("");
+  }
   if (/lobster invaders/i.test(prompt)) {
     if (!toolOutput) {
       return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });