test(qa-lab): cover GPT-style broken turns

This commit is contained in:
Vincent Koc
2026-04-14 01:39:49 +01:00
parent df3e65c8d3
commit e63cbe831b
2 changed files with 333 additions and 0 deletions

View File

@@ -4,6 +4,18 @@ import { resolveProviderVariant, startQaMockOpenAiServer } from "./mock-openai-s
const cleanups: Array<() => Promise<void>> = [];
const QA_IMAGE_PNG_BASE64 =
"iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAT0lEQVR42u3RQQkAMAzAwPg33Wnos+wgBo40dboAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANYADwAAAAAAAAAAAAAAAAAAAAAAAAAAAAC+Azy47PDiI4pA2wAAAABJRU5ErkJggg==";
const QA_REASONING_ONLY_RECOVERY_PROMPT =
"Reasoning-only continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly REASONING-RECOVERED-OK.";
const QA_REASONING_ONLY_SIDE_EFFECT_PROMPT =
"Reasoning-only after write safety check: write reasoning-only-side-effect.txt, then answer with exactly SIDE-EFFECT-GUARD-OK.";
const QA_EMPTY_RESPONSE_RECOVERY_PROMPT =
"Empty response continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly EMPTY-RECOVERED-OK.";
const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT =
"Empty response exhaustion QA check: read QA_KICKOFF_TASK.md, then answer with exactly EMPTY-EXHAUSTED-OK.";
const QA_REASONING_ONLY_RETRY_INSTRUCTION =
"The previous assistant turn recorded reasoning but did not produce a user-visible answer. Continue from that partial turn and produce the visible answer now. Do not restate the reasoning or restart from scratch.";
const QA_EMPTY_RESPONSE_RETRY_INSTRUCTION =
"The previous attempt did not produce a user-visible answer. Continue from the current state and produce the visible answer now. Do not restart from scratch.";
afterEach(async () => {
while (cleanups.length > 0) {
@@ -11,6 +23,46 @@ afterEach(async () => {
}
});
async function startMockServer() {
const server = await startQaMockOpenAiServer({
host: "127.0.0.1",
port: 0,
});
cleanups.push(async () => {
await server.stop();
});
return server;
}
async function postResponses(server: { baseUrl: string }, body: unknown) {
return fetch(`${server.baseUrl}/v1/responses`, {
method: "POST",
headers: {
"content-type": "application/json",
},
body: JSON.stringify(body),
});
}
async function expectResponsesText(server: { baseUrl: string }, body: unknown) {
const response = await postResponses(server, body);
expect(response.status).toBe(200);
return response.text();
}
async function expectResponsesJson<T>(server: { baseUrl: string }, body: unknown) {
const response = await postResponses(server, body);
expect(response.status).toBe(200);
return (await response.json()) as T;
}
function makeUserInput(text: string) {
return {
role: "user" as const,
content: [{ type: "input_text" as const, text }],
};
}
describe("qa mock openai server", () => {
it("serves health and streamed responses", async () => {
const server = await startQaMockOpenAiServer({
@@ -1750,6 +1802,204 @@ describe("qa mock openai server", () => {
const debug = (await debugResponse.json()) as { model: string };
expect(debug.model).toBe("claude-opus-4-6");
});
it("scripts a reasoning-only recovery sequence after a replay-safe read", async () => {
const server = await startMockServer();
const toolPlan = await expectResponsesText(server, {
stream: true,
model: "gpt-5.4",
input: [makeUserInput(QA_REASONING_ONLY_RECOVERY_PROMPT)],
});
expect(toolPlan).toContain('"name":"read"');
expect(toolPlan).toContain("QA_KICKOFF_TASK.md");
expect(
await expectResponsesJson<{
output?: Array<{ type?: string; id?: string; summary?: Array<{ text?: string }> }>;
}>(server, {
stream: false,
model: "gpt-5.4",
input: [
makeUserInput(QA_REASONING_ONLY_RECOVERY_PROMPT),
{
type: "function_call_output",
output: "QA mission: Understand this OpenClaw repo from source + docs before acting.",
},
],
}),
).toMatchObject({
output: [
{
type: "reasoning",
id: "rs_mock_reasoning_recovery",
summary: [{ text: expect.stringContaining("Need visible answer") }],
},
],
});
expect(
await expectResponsesJson<{
output?: Array<{ content?: Array<{ text?: string }> }>;
}>(server, {
stream: false,
model: "gpt-5.4",
input: [
makeUserInput(QA_REASONING_ONLY_RECOVERY_PROMPT),
makeUserInput(QA_REASONING_ONLY_RETRY_INSTRUCTION),
{
type: "function_call_output",
output: "QA mission: Understand this OpenClaw repo from source + docs before acting.",
},
],
}),
).toMatchObject({
output: [
{
content: [{ text: "REASONING-RECOVERED-OK" }],
},
],
});
const requests = await fetch(`${server.baseUrl}/debug/requests`);
expect(requests.status).toBe(200);
expect(await requests.json()).toMatchObject([
{ plannedToolName: "read" },
{ allInputText: expect.stringContaining(QA_REASONING_ONLY_RECOVERY_PROMPT) },
{ allInputText: expect.stringContaining(QA_REASONING_ONLY_RETRY_INSTRUCTION) },
]);
});
it("keeps the reasoning-only side-effect path ready for no-auto-retry QA coverage", async () => {
const server = await startMockServer();
const toolPlan = await expectResponsesText(server, {
stream: true,
model: "gpt-5.4",
input: [makeUserInput(QA_REASONING_ONLY_SIDE_EFFECT_PROMPT)],
});
expect(toolPlan).toContain('"name":"write"');
expect(toolPlan).toContain("reasoning-only-side-effect.txt");
expect(
await expectResponsesJson<{
output?: Array<{ type?: string; id?: string }>;
}>(server, {
stream: false,
model: "gpt-5.4",
input: [
makeUserInput(QA_REASONING_ONLY_SIDE_EFFECT_PROMPT),
{
type: "function_call_output",
output: "Successfully wrote 28 bytes to reasoning-only-side-effect.txt.",
},
],
}),
).toMatchObject({
output: [{ type: "reasoning", id: "rs_mock_reasoning_side_effect" }],
});
const requests = await fetch(`${server.baseUrl}/debug/requests`);
expect(requests.status).toBe(200);
expect((await requests.json()) as Array<{ allInputText?: string }>).toHaveLength(2);
});
it("scripts an empty-response recovery sequence after a replay-safe read", async () => {
const server = await startMockServer();
const toolPlan = await expectResponsesText(server, {
stream: true,
model: "gpt-5.4",
input: [makeUserInput(QA_EMPTY_RESPONSE_RECOVERY_PROMPT)],
});
expect(toolPlan).toContain('"name":"read"');
expect(
await expectResponsesJson<{
output?: Array<{ content?: Array<{ type?: string; text?: string }> }>;
}>(server, {
stream: false,
model: "gpt-5.4",
input: [
makeUserInput(QA_EMPTY_RESPONSE_RECOVERY_PROMPT),
{
type: "function_call_output",
output: "QA mission: Understand this OpenClaw repo from source + docs before acting.",
},
],
}),
).toMatchObject({
output: [
{
content: [{ type: "output_text", text: "" }],
},
],
});
expect(
await expectResponsesJson<{
output?: Array<{ content?: Array<{ text?: string }> }>;
}>(server, {
stream: false,
model: "gpt-5.4",
input: [
makeUserInput(QA_EMPTY_RESPONSE_RECOVERY_PROMPT),
makeUserInput(QA_EMPTY_RESPONSE_RETRY_INSTRUCTION),
{
type: "function_call_output",
output: "QA mission: Understand this OpenClaw repo from source + docs before acting.",
},
],
}),
).toMatchObject({
output: [
{
content: [{ text: "EMPTY-RECOVERED-OK" }],
},
],
});
});
it("can keep emitting empty GPT turns when the single retry budget should exhaust", async () => {
const server = await startMockServer();
await expectResponsesText(server, {
stream: true,
model: "gpt-5.4",
input: [makeUserInput(QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT)],
});
const firstEmpty = await expectResponsesJson<{
output?: Array<{ content?: Array<{ text?: string }> }>;
}>(server, {
stream: false,
model: "gpt-5.4",
input: [
makeUserInput(QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT),
{
type: "function_call_output",
output: "QA mission: Understand this OpenClaw repo from source + docs before acting.",
},
],
});
expect(firstEmpty.output?.[0]?.content?.[0]?.text).toBe("");
const secondEmpty = await expectResponsesJson<{
output?: Array<{ content?: Array<{ text?: string }> }>;
}>(server, {
stream: false,
model: "gpt-5.4",
input: [
makeUserInput(QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT),
makeUserInput(QA_EMPTY_RESPONSE_RETRY_INSTRUCTION),
{
type: "function_call_output",
output: "QA mission: Understand this OpenClaw repo from source + docs before acting.",
},
],
});
expect(secondEmpty.output?.[0]?.content?.[0]?.text).toBe("");
});
});
describe("resolveProviderVariant", () => {

View File

@@ -124,6 +124,14 @@ type AnthropicMessagesRequest = {
const TINY_PNG_BASE64 =
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO7Z0nQAAAAASUVORK5CYII=";
const QA_REASONING_ONLY_RECOVERY_PROMPT_RE = /reasoning-only continuation qa check/i;
const QA_REASONING_ONLY_SIDE_EFFECT_PROMPT_RE = /reasoning-only after write safety check/i;
const QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE = /empty response continuation qa check/i;
const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT_RE = /empty response exhaustion qa check/i;
const QA_REASONING_ONLY_RETRY_NEEDLE =
"recorded reasoning but did not produce a user-visible answer";
const QA_EMPTY_RESPONSE_RETRY_NEEDLE =
"The previous attempt did not produce a user-visible answer.";
type MockScenarioState = {
subagentFanoutPhase: number;
@@ -718,6 +726,37 @@ function buildAssistantEvents(text: string): StreamEvent[] {
];
}
function buildReasoningOnlyEvents(summaryText: string, id: string): StreamEvent[] {
const reasoningItem = {
type: "reasoning",
id,
summary: [{ text: summaryText }],
} as const;
return [
{
type: "response.output_item.added",
item: {
type: "reasoning",
id,
summary: [],
},
},
{
type: "response.output_item.done",
item: reasoningItem,
},
{
type: "response.completed",
response: {
id: `resp_${id}`,
status: "completed",
output: [reasoningItem],
usage: { input_tokens: 64, output_tokens: 8, total_tokens: 72 },
},
},
];
}
async function buildResponsesPayload(
body: Record<string, unknown>,
scenarioState: MockScenarioState,
@@ -729,12 +768,56 @@ async function buildResponsesPayload(
const allInputText = extractAllRequestTexts(input, body);
const isGroupChat = allInputText.includes('"is_group_chat": true');
const isBaselineUnmentionedChannelChatter = /\bno bot ping here\b/i.test(prompt);
const hasReasoningOnlyRetryInstruction = allInputText.includes(QA_REASONING_ONLY_RETRY_NEEDLE);
const hasEmptyResponseRetryInstruction = allInputText.includes(QA_EMPTY_RESPONSE_RETRY_NEEDLE);
if (/remember this fact/i.test(prompt)) {
return buildAssistantEvents(buildAssistantText(input, body, scenarioState));
}
if (isHeartbeatPrompt(prompt)) {
return buildAssistantEvents("HEARTBEAT_OK");
}
if (QA_REASONING_ONLY_RECOVERY_PROMPT_RE.test(allInputText)) {
if (!toolOutput) {
return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });
}
if (!hasReasoningOnlyRetryInstruction) {
return buildReasoningOnlyEvents(
"Need visible answer after reading the QA kickoff task.",
"rs_mock_reasoning_recovery",
);
}
return buildAssistantEvents("REASONING-RECOVERED-OK");
}
if (QA_REASONING_ONLY_SIDE_EFFECT_PROMPT_RE.test(allInputText)) {
if (!toolOutput) {
return buildToolCallEventsWithArgs("write", {
path: "reasoning-only-side-effect.txt",
content: "side effects already happened\n",
});
}
if (!hasReasoningOnlyRetryInstruction) {
return buildReasoningOnlyEvents(
"Need visible answer after the write, but the write already happened.",
"rs_mock_reasoning_side_effect",
);
}
return buildAssistantEvents("BUG-SHOULD-NOT-AUTO-RETRY");
}
if (QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE.test(allInputText)) {
if (!toolOutput) {
return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });
}
if (!hasEmptyResponseRetryInstruction) {
return buildAssistantEvents("");
}
return buildAssistantEvents("EMPTY-RECOVERED-OK");
}
if (QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT_RE.test(allInputText)) {
if (!toolOutput) {
return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });
}
return buildAssistantEvents("");
}
if (/lobster invaders/i.test(prompt)) {
if (!toolOutput) {
return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });