test: add gpt-5.4 thinking visibility QA

This commit is contained in:
Peter Steinberger
2026-04-21 06:12:17 +01:00
parent 663501206f
commit f5be489266
10 changed files with 419 additions and 12 deletions

View File

@@ -8,6 +8,10 @@ const QA_REASONING_ONLY_RECOVERY_PROMPT =
"Reasoning-only continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly REASONING-RECOVERED-OK.";
const QA_REASONING_ONLY_SIDE_EFFECT_PROMPT =
"Reasoning-only after write safety check: write reasoning-only-side-effect.txt, then answer with exactly SIDE-EFFECT-GUARD-OK.";
// Prompt for the "thinking off" leg of the GPT-5.4 visibility QA flow; the mock
// server matches it case-insensitively and replies with a plain assistant message.
const QA_THINKING_VISIBILITY_OFF_PROMPT =
"QA thinking visibility check off: answer exactly THINKING-OFF-OK.";
// Prompt for the "thinking max" leg; the mock replies with a reasoning output
// item followed by the final answer message.
const QA_THINKING_VISIBILITY_MAX_PROMPT =
"QA thinking visibility check max: verify 17+24=41 internally, then answer exactly THINKING-MAX-OK.";
const QA_EMPTY_RESPONSE_RECOVERY_PROMPT =
"Empty response continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly EMPTY-RECOVERED-OK.";
const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT =
@@ -2049,6 +2053,54 @@ describe("qa mock openai server", () => {
]);
});
it("scripts the GPT-5.4 thinking visibility switch prompts", async () => {
const server = await startMockServer();
expect(
await expectResponsesJson<{
output?: Array<{ type?: string; content?: Array<{ text?: string }> }>;
}>(server, {
stream: false,
model: "gpt-5.4",
input: [makeUserInput(QA_THINKING_VISIBILITY_OFF_PROMPT)],
}),
).toMatchObject({
output: [
{
type: "message",
content: [{ text: "THINKING-OFF-OK" }],
},
],
});
expect(
await expectResponsesJson<{
output?: Array<{
type?: string;
id?: string;
summary?: Array<{ text?: string }>;
content?: Array<{ text?: string }>;
}>;
}>(server, {
stream: false,
model: "gpt-5.4",
input: [makeUserInput(QA_THINKING_VISIBILITY_MAX_PROMPT)],
}),
).toMatchObject({
output: [
{
type: "reasoning",
id: "rs_mock_thinking_visibility_max",
summary: [],
},
{
type: "message",
content: [{ text: "THINKING-MAX-OK" }],
},
],
});
});
it("keeps the reasoning-only side-effect path ready for no-auto-retry QA coverage", async () => {
const server = await startMockServer();

View File

@@ -140,6 +140,8 @@ const TINY_PNG_BASE64 =
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO7Z0nQAAAAASUVORK5CYII=";
const QA_REASONING_ONLY_RECOVERY_PROMPT_RE = /reasoning-only continuation qa check/i;
const QA_REASONING_ONLY_SIDE_EFFECT_PROMPT_RE = /reasoning-only after write safety check/i;
// Case-insensitive matchers that route the thinking-visibility QA prompts to
// their scripted mock responses in buildResponsesPayload.
const QA_THINKING_VISIBILITY_OFF_PROMPT_RE = /qa thinking visibility check off/i;
const QA_THINKING_VISIBILITY_MAX_PROMPT_RE = /qa thinking visibility check max/i;
const QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE = /empty response continuation qa check/i;
const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT_RE = /empty response exhaustion qa check/i;
const QA_QUIET_STREAMING_PROMPT_RE = /quiet streaming qa check/i;
@@ -924,6 +926,61 @@ function buildReasoningOnlyEvents(summaryText: string, id: string): StreamEvent[
];
}
/**
 * Builds the mock streamed event sequence for a turn that surfaces a reasoning
 * output item (with an empty summary) followed by a final assistant answer.
 *
 * Emits added/done pairs for both items and closes with a
 * `response.completed` event carrying fixed token usage.
 */
function buildReasoningAndAssistantEvents(params: {
  reasoningId: string;
  answerText: string;
  answerId?: string;
}): StreamEvent[] {
  const { reasoningId, answerText, answerId } = params;

  const reasoningItem = {
    type: "reasoning",
    id: reasoningId,
    summary: [],
  } as const;

  const answerItem = buildAssistantOutputItem({
    id: answerId ?? "msg_mock_reasoned_answer",
    phase: "final_answer",
    text: answerText,
  });

  const events: StreamEvent[] = [
    // Reasoning item lifecycle.
    {
      type: "response.output_item.added",
      item: { ...reasoningItem },
    },
    {
      type: "response.output_item.done",
      item: reasoningItem,
    },
    // Assistant message lifecycle: added with empty content, done with text.
    {
      type: "response.output_item.added",
      item: {
        type: "message",
        id: answerItem.id,
        role: "assistant",
        phase: "final_answer",
        content: [],
        status: "in_progress",
      },
    },
    {
      type: "response.output_item.done",
      item: answerItem,
    },
    // Terminal event: full output plus deterministic mock usage numbers.
    {
      type: "response.completed",
      response: {
        id: `resp_${reasoningId}`,
        status: "completed",
        output: [reasoningItem, answerItem],
        usage: { input_tokens: 64, output_tokens: 16, total_tokens: 80 },
      },
    },
  ];
  return events;
}
async function buildResponsesPayload(
body: Record<string, unknown>,
scenarioState: MockScenarioState,
@@ -981,6 +1038,15 @@ async function buildResponsesPayload(
}
return buildAssistantEvents("BUG-SHOULD-NOT-AUTO-RETRY");
}
if (QA_THINKING_VISIBILITY_MAX_PROMPT_RE.test(prompt)) {
return buildReasoningAndAssistantEvents({
reasoningId: "rs_mock_thinking_visibility_max",
answerText: "THINKING-MAX-OK",
});
}
if (QA_THINKING_VISIBILITY_OFF_PROMPT_RE.test(prompt)) {
return buildAssistantEvents("THINKING-OFF-OK");
}
if (QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE.test(allInputText)) {
if (!toolOutput) {
return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });

View File

@@ -123,6 +123,32 @@ describe("qa scenario catalog", () => {
);
});
it("includes the GPT-5.4 thinking visibility switch scenario", () => {
  const scenarioId = "gpt54-thinking-visibility-switch";

  // Shape of the execution config this scenario is expected to declare.
  type ThinkingSwitchConfig = {
    requiredLiveProvider?: string;
    requiredLiveModel?: string;
    offDirective?: string;
    maxDirective?: string;
    reasoningDirective?: string;
  };

  const scenario = readQaScenarioById(scenarioId);
  const config = readQaScenarioExecutionConfig(scenarioId) as
    | ThinkingSwitchConfig
    | undefined;

  expect(scenario.sourcePath).toBe("qa/scenarios/models/gpt54-thinking-visibility-switch.md");
  expect(config?.requiredLiveProvider).toBe("openai");
  expect(config?.requiredLiveModel).toBe("gpt-5.4");
  expect(config?.offDirective).toBe("/think off");
  expect(config?.maxDirective).toBe("/think max");
  expect(config?.reasoningDirective).toBe("/reasoning on");

  const stepNames = scenario.execution.flow?.steps.map((step) => step.name);
  expect(stepNames).toEqual([
    "enables reasoning display and disables thinking",
    "switches to max thinking",
    "verifies max thinking emits visible reasoning",
    "verifies max thinking completes the answer",
  ]);
});
it("includes the seeded mock-only broken-turn scenarios in the markdown pack", () => {
const scenarioIds = [
"reasoning-only-recovery-replay-safe-read",