test: add gpt-5.4 thinking visibility QA

This commit is contained in:
Peter Steinberger
2026-04-21 06:12:17 +01:00
parent 663501206f
commit f5be489266
10 changed files with 419 additions and 12 deletions

View File

@@ -8,6 +8,10 @@ const QA_REASONING_ONLY_RECOVERY_PROMPT =
"Reasoning-only continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly REASONING-RECOVERED-OK.";
const QA_REASONING_ONLY_SIDE_EFFECT_PROMPT =
"Reasoning-only after write safety check: write reasoning-only-side-effect.txt, then answer with exactly SIDE-EFFECT-GUARD-OK.";
const QA_THINKING_VISIBILITY_OFF_PROMPT =
"QA thinking visibility check off: answer exactly THINKING-OFF-OK.";
const QA_THINKING_VISIBILITY_MAX_PROMPT =
"QA thinking visibility check max: verify 17+24=41 internally, then answer exactly THINKING-MAX-OK.";
const QA_EMPTY_RESPONSE_RECOVERY_PROMPT =
"Empty response continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly EMPTY-RECOVERED-OK.";
const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT =
@@ -2049,6 +2053,54 @@ describe("qa mock openai server", () => {
]);
});
it("scripts the GPT-5.4 thinking visibility switch prompts", async () => {
  const server = await startMockServer();

  // Thinking off: the mock must answer with only a visible assistant message.
  const offResponse = await expectResponsesJson<{
    output?: Array<{ type?: string; content?: Array<{ text?: string }> }>;
  }>(server, {
    stream: false,
    model: "gpt-5.4",
    input: [makeUserInput(QA_THINKING_VISIBILITY_OFF_PROMPT)],
  });
  expect(offResponse).toMatchObject({
    output: [{ type: "message", content: [{ text: "THINKING-OFF-OK" }] }],
  });

  // Thinking max: a reasoning item (with an empty summary) must precede the answer.
  const maxResponse = await expectResponsesJson<{
    output?: Array<{
      type?: string;
      id?: string;
      summary?: Array<{ text?: string }>;
      content?: Array<{ text?: string }>;
    }>;
  }>(server, {
    stream: false,
    model: "gpt-5.4",
    input: [makeUserInput(QA_THINKING_VISIBILITY_MAX_PROMPT)],
  });
  expect(maxResponse).toMatchObject({
    output: [
      {
        type: "reasoning",
        id: "rs_mock_thinking_visibility_max",
        summary: [],
      },
      {
        type: "message",
        content: [{ text: "THINKING-MAX-OK" }],
      },
    ],
  });
});
it("keeps the reasoning-only side-effect path ready for no-auto-retry QA coverage", async () => {
const server = await startMockServer();

View File

@@ -140,6 +140,8 @@ const TINY_PNG_BASE64 =
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO7Z0nQAAAAASUVORK5CYII=";
const QA_REASONING_ONLY_RECOVERY_PROMPT_RE = /reasoning-only continuation qa check/i;
const QA_REASONING_ONLY_SIDE_EFFECT_PROMPT_RE = /reasoning-only after write safety check/i;
const QA_THINKING_VISIBILITY_OFF_PROMPT_RE = /qa thinking visibility check off/i;
const QA_THINKING_VISIBILITY_MAX_PROMPT_RE = /qa thinking visibility check max/i;
const QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE = /empty response continuation qa check/i;
const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT_RE = /empty response exhaustion qa check/i;
const QA_QUIET_STREAMING_PROMPT_RE = /quiet streaming qa check/i;
@@ -924,6 +926,61 @@ function buildReasoningOnlyEvents(summaryText: string, id: string): StreamEvent[
];
}
/**
 * Builds the Responses-API stream events for a turn that emits a reasoning
 * item (with a deliberately empty summary) followed by a final assistant
 * answer. Used to mock providers that produce native reasoning but return
 * no summary text.
 */
function buildReasoningAndAssistantEvents(params: {
  reasoningId: string;
  answerText: string;
  answerId?: string;
}): StreamEvent[] {
  const { reasoningId, answerText, answerId } = params;
  // Completed reasoning output item; summary stays empty on purpose.
  const reasoningItem = {
    type: "reasoning",
    id: reasoningId,
    summary: [],
  } as const;
  // Final assistant message that carries the visible answer text.
  const answerItem = buildAssistantOutputItem({
    id: answerId ?? "msg_mock_reasoned_answer",
    phase: "final_answer",
    text: answerText,
  });
  const events: StreamEvent[] = [
    // Reasoning lifecycle: added, then done.
    {
      type: "response.output_item.added",
      item: { type: "reasoning", id: reasoningId, summary: [] },
    },
    { type: "response.output_item.done", item: reasoningItem },
    // Assistant message lifecycle: added with empty in-progress content, then done.
    {
      type: "response.output_item.added",
      item: {
        type: "message",
        id: answerItem.id,
        role: "assistant",
        phase: "final_answer",
        content: [],
        status: "in_progress",
      },
    },
    { type: "response.output_item.done", item: answerItem },
    // Terminal event repeats both completed items with fixed mock usage numbers.
    {
      type: "response.completed",
      response: {
        id: `resp_${reasoningId}`,
        status: "completed",
        output: [reasoningItem, answerItem],
        usage: { input_tokens: 64, output_tokens: 16, total_tokens: 80 },
      },
    },
  ];
  return events;
}
async function buildResponsesPayload(
body: Record<string, unknown>,
scenarioState: MockScenarioState,
@@ -981,6 +1038,15 @@ async function buildResponsesPayload(
}
return buildAssistantEvents("BUG-SHOULD-NOT-AUTO-RETRY");
}
if (QA_THINKING_VISIBILITY_MAX_PROMPT_RE.test(prompt)) {
return buildReasoningAndAssistantEvents({
reasoningId: "rs_mock_thinking_visibility_max",
answerText: "THINKING-MAX-OK",
});
}
if (QA_THINKING_VISIBILITY_OFF_PROMPT_RE.test(prompt)) {
return buildAssistantEvents("THINKING-OFF-OK");
}
if (QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE.test(allInputText)) {
if (!toolOutput) {
return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });

View File

@@ -123,6 +123,32 @@ describe("qa scenario catalog", () => {
);
});
it("includes the GPT-5.4 thinking visibility switch scenario", () => {
  // Shape of the execution config this scenario is expected to declare.
  type ThinkingSwitchConfig = {
    requiredLiveProvider?: string;
    requiredLiveModel?: string;
    offDirective?: string;
    maxDirective?: string;
    reasoningDirective?: string;
  };
  const scenario = readQaScenarioById("gpt54-thinking-visibility-switch");
  const config = readQaScenarioExecutionConfig("gpt54-thinking-visibility-switch") as
    | ThinkingSwitchConfig
    | undefined;
  expect(scenario.sourcePath).toBe("qa/scenarios/models/gpt54-thinking-visibility-switch.md");
  expect(config?.requiredLiveProvider).toBe("openai");
  expect(config?.requiredLiveModel).toBe("gpt-5.4");
  expect(config?.offDirective).toBe("/think off");
  expect(config?.maxDirective).toBe("/think max");
  expect(config?.reasoningDirective).toBe("/reasoning on");
  const stepNames = scenario.execution.flow?.steps.map((step) => step.name);
  expect(stepNames).toEqual([
    "enables reasoning display and disables thinking",
    "switches to max thinking",
    "verifies max thinking emits visible reasoning",
    "verifies max thinking completes the answer",
  ]);
});
it("includes the seeded mock-only broken-turn scenarios in the markdown pack", () => {
const scenarioIds = [
"reasoning-only-recovery-replay-safe-read",

View File

@@ -0,0 +1,211 @@
# GPT-5.4 thinking visibility switch
```yaml qa-scenario
id: gpt54-thinking-visibility-switch
title: GPT-5.4 thinking visibility switch
surface: models
coverage:
primary:
- models.thinking
secondary:
- runtime.reasoning-visibility
objective: Verify GPT-5.4 can switch from disabled thinking to max thinking while reasoning display stays enabled.
successCriteria:
- Live runs target openai/gpt-5.4, not a mini or pro variant.
- The session enables reasoning display before the comparison turns.
- The disabled-thinking turn returns its visible marker without a Reasoning-prefixed message.
- The max-thinking turn returns its visible marker and a separate Reasoning-prefixed message.
docsRefs:
- docs/tools/thinking.md
- docs/help/testing.md
- docs/concepts/qa-e2e-automation.md
codeRefs:
- src/auto-reply/reply/directives.ts
- src/auto-reply/thinking.shared.ts
- src/agents/pi-embedded-runner/run/payloads.ts
- extensions/openai/openai-provider.ts
- extensions/qa-lab/src/providers/mock-openai/server.ts
execution:
kind: flow
summary: Toggle reasoning display and GPT-5.4 thinking between off/none and max/high, then verify visible reasoning only on the max turn.
config:
requiredLiveProvider: openai
requiredLiveModel: gpt-5.4
offDirective: /think off
maxDirective: /think max
reasoningDirective: /reasoning on
conversationId: qa-thinking-visibility
offPrompt: "QA thinking visibility check off: answer exactly THINKING-OFF-OK."
maxPrompt: "QA thinking visibility check max: verify 17+24=41 internally, then answer exactly THINKING-MAX-OK."
offMarker: THINKING-OFF-OK
maxMarker: THINKING-MAX-OK
```
```yaml qa-flow
steps:
- name: enables reasoning display and disables thinking
actions:
- call: waitForGatewayHealthy
args:
- ref: env
- 60000
- call: waitForQaChannelReady
args:
- ref: env
- 60000
- call: reset
- set: selected
value:
expr: splitModelRef(env.primaryModel)
- assert:
expr: "env.providerMode !== 'live-frontier' || (selected?.provider === config.requiredLiveProvider && selected?.model === config.requiredLiveModel)"
message:
expr: "`expected live GPT-5.4, got ${env.primaryModel}`"
- call: state.addInboundMessage
args:
- conversation:
id:
expr: config.conversationId
kind: direct
senderId: qa-operator
senderName: QA Operator
text:
expr: config.reasoningDirective
- call: waitForCondition
saveAs: reasoningAck
args:
- lambda:
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Reasoning visibility enabled/i.test(candidate.text)).at(-1)"
- expr: liveTurnTimeoutMs(env, 20000)
- call: state.addInboundMessage
args:
- conversation:
id:
expr: config.conversationId
kind: direct
senderId: qa-operator
senderName: QA Operator
text:
expr: config.offDirective
- call: waitForCondition
saveAs: offAck
args:
- lambda:
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking disabled/i.test(candidate.text)).at(-1)"
- expr: liveTurnTimeoutMs(env, 20000)
- set: offCursor
value:
expr: state.getSnapshot().messages.length
- call: state.addInboundMessage
args:
- conversation:
id:
expr: config.conversationId
kind: direct
senderId: qa-operator
senderName: QA Operator
text:
expr: "`${config.offDirective} ${config.offPrompt}`"
- call: waitForCondition
saveAs: offAnswer
args:
- lambda:
expr: "state.getSnapshot().messages.slice(offCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && candidate.text.includes(config.offMarker)).at(-1)"
- expr: liveTurnTimeoutMs(env, 30000)
- set: offMessages
value:
expr: "state.getSnapshot().messages.slice(offCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId)"
- assert:
expr: "offMessages.some((candidate) => candidate.text.includes(config.offMarker))"
message:
expr: "`missing off marker; saw ${offMessages.map((message) => message.text).join(' | ')}`"
- assert:
expr: "!offMessages.some((candidate) => candidate.text.trimStart().startsWith('Reasoning:'))"
message:
expr: "`disabled thinking unexpectedly emitted reasoning: ${offMessages.map((message) => message.text).join(' | ')}`"
- if:
expr: "Boolean(env.mock)"
then:
- set: requests
value:
expr: "await fetchJson(`${env.mock.baseUrl}/debug/requests`)"
- set: offRequest
value:
expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.offPrompt))"
- assert:
expr: "String(offRequest?.model ?? '').includes('gpt-5.4')"
message:
expr: "`expected GPT-5.4 off mock request, got ${String(offRequest?.model ?? '')}`"
detailsExpr: "`off ack=${offAck.text}; off answer=${offAnswer.text}`"
- name: switches to max thinking
actions:
- call: state.addInboundMessage
args:
- conversation:
id:
expr: config.conversationId
kind: direct
senderId: qa-operator
senderName: QA Operator
text:
expr: config.maxDirective
- call: waitForCondition
saveAs: maxAck
args:
- lambda:
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking level set to high/i.test(candidate.text)).at(-1)"
- expr: liveTurnTimeoutMs(env, 20000)
detailsExpr: "`max ack=${maxAck.text}`"
- name: verifies max thinking emits visible reasoning
actions:
- set: maxCursor
value:
expr: state.getSnapshot().messages.length
- call: state.addInboundMessage
args:
- conversation:
id:
expr: config.conversationId
kind: direct
senderId: qa-operator
senderName: QA Operator
text:
expr: "`${config.maxDirective} ${config.maxPrompt}`"
- call: waitForCondition
saveAs: maxReasoning
args:
- lambda:
expr: "state.getSnapshot().messages.slice(maxCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && candidate.text.trimStart().startsWith('Reasoning:')).at(-1)"
- expr: liveTurnTimeoutMs(env, 120000)
- assert:
expr: "maxReasoning.text.trimStart().startsWith('Reasoning:')"
message:
expr: "`missing max reasoning message near answer: ${recentOutboundSummary(state, 6)}`"
detailsExpr: "`reasoning=${maxReasoning.text}`"
- name: verifies max thinking completes the answer
actions:
- call: waitForCondition
saveAs: maxAnswer
args:
- lambda:
expr: "state.getSnapshot().messages.slice(maxCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && candidate.text.includes(config.maxMarker)).at(-1)"
- expr: liveTurnTimeoutMs(env, 120000)
- assert:
expr: "maxAnswer.text.includes(config.maxMarker)"
message:
expr: "`missing max marker: ${maxAnswer.text}`"
- if:
expr: "Boolean(env.mock)"
then:
- set: requests
value:
expr: "await fetchJson(`${env.mock.baseUrl}/debug/requests`)"
- set: maxRequest
value:
expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.maxPrompt))"
- assert:
expr: "String(maxRequest?.model ?? '').includes('gpt-5.4')"
message:
expr: "`expected GPT-5.4 mock request, got ${String(maxRequest?.model ?? '')}`"
detailsExpr: "`answer=${maxAnswer.text}`"
```

View File

@@ -513,7 +513,7 @@ describe("openai transport stream", () => {
expect(params.input?.[0]).toMatchObject({ role: "developer" });
});
it("defaults OpenAI Responses reasoning effort to high when unset", () => {
it("does not infer high reasoning when Pi passes thinking off", () => {
const params = buildOpenAIResponsesParams(
{
id: "gpt-5.4",
@@ -535,8 +535,8 @@ describe("openai transport stream", () => {
undefined,
) as { reasoning?: unknown; include?: string[] };
expect(params.reasoning).toEqual({ effort: "high", summary: "auto" });
expect(params.include).toEqual(["reasoning.encrypted_content"]);
expect(params.reasoning).toEqual({ effort: "none" });
expect(params).not.toHaveProperty("include");
});
it("uses shared stream reasoning as OpenAI Responses effort", () => {

View File

@@ -814,16 +814,12 @@ export function buildOpenAIResponsesParams(
} else if (model.provider !== "github-copilot") {
const reasoningEffort = resolveOpenAIReasoningEffortForModel({
model,
effort: "high",
effort: "none",
});
if (reasoningEffort) {
params.reasoning = {
effort: reasoningEffort,
...(reasoningEffort === "none" ? {} : { summary: "auto" }),
};
if (reasoningEffort !== "none") {
params.include = ["reasoning.encrypted_content"];
}
}
}
}

View File

@@ -2,6 +2,7 @@ import type { AssistantMessage } from "@mariozechner/pi-ai";
import { describe, expect, it } from "vitest";
import {
extractAssistantText,
extractAssistantThinking,
extractAssistantVisibleText,
formatReasoningMessage,
promoteThinkingTagsToBlocks,
@@ -641,6 +642,27 @@ describe("formatReasoningMessage", () => {
});
});
describe("extractAssistantThinking", () => {
  it("surfaces signed native reasoning even when the provider returns an empty summary", () => {
    // A thinking block whose text is empty but which carries a signed
    // native-reasoning payload in its signature.
    const signature = JSON.stringify({ type: "reasoning", id: "rs_live", summary: [] });
    const msg = makeAssistantMessage({
      role: "assistant",
      content: [
        { type: "thinking", thinking: "", thinkingSignature: signature },
        { type: "text", text: "Done." },
      ],
      timestamp: Date.now(),
    });
    const extracted = extractAssistantThinking(msg);
    expect(extracted).toBe("Native reasoning was produced; no summary text was returned.");
  });
});
describe("stripDowngradedToolCallText", () => {
it("strips downgraded marker blocks while preserving surrounding user-facing text", () => {
const cases = [

View File

@@ -147,7 +147,13 @@ export function extractAssistantThinking(msg: AssistantMessage): string {
}
const record = block as unknown as Record<string, unknown>;
if (record.type === "thinking" && typeof record.thinking === "string") {
return record.thinking.trim();
const thinking = record.thinking.trim();
if (thinking) {
return thinking;
}
if (typeof record.thinkingSignature === "string" && record.thinkingSignature.trim()) {
return "Native reasoning was produced; no summary text was returned.";
}
}
return "";
})

View File

@@ -89,6 +89,7 @@ function parseInlineDirectivesForTest(body: string) {
async function resolveHelloWithModelDefaults(params: {
defaultThinking: "off" | "low";
defaultReasoning: "on";
sessionEntry?: SessionEntry;
}) {
const resolveDefaultThinkingLevel = vi.fn(async () => params.defaultThinking);
const resolveDefaultReasoningLevel = vi.fn(async () => params.defaultReasoning);
@@ -119,7 +120,7 @@ async function resolveHelloWithModelDefaults(params: {
CommandBody: "hello",
Provider: "whatsapp",
} as TemplateContext,
sessionEntry: makeSessionEntry(),
sessionEntry: params.sessionEntry ?? makeSessionEntry(),
sessionStore: {},
sessionKey: "agent:main:whatsapp:+2000",
storePath: "/tmp/sessions.json",
@@ -427,6 +428,23 @@ describe("resolveReplyDirectives", () => {
expect(resolveDefaultReasoningLevel).toHaveBeenCalledOnce();
});
it("does not re-enable model reasoning when thinking was explicitly disabled", async () => {
  // A session that already pinned thinking off must resolve reasoning off too,
  // without ever consulting the model's default reasoning level.
  const explicitOffSession = makeSessionEntry({ thinkingLevel: "off" });
  const { result, resolveDefaultReasoningLevel } = await resolveHelloWithModelDefaults({
    defaultThinking: "off",
    defaultReasoning: "on",
    sessionEntry: explicitOffSession,
  });
  expect(resolveDefaultReasoningLevel).not.toHaveBeenCalled();
  expect(result).toEqual({
    kind: "continue",
    result: expect.objectContaining({
      resolvedThinkLevel: "off",
      resolvedReasoningLevel: "off",
    }),
  });
});
it("skips the model reasoning default when thinking is active", async () => {
const { result, resolveDefaultReasoningLevel } = await resolveHelloWithModelDefaults({
defaultThinking: "low",

View File

@@ -495,9 +495,14 @@ export async function resolveReplyDirectives(params: {
(await modelState.resolveDefaultThinkingLevel()) ??
(agentCfg?.thinkingDefault as ThinkLevel | undefined);
const thinkingExplicitlySet =
directives.thinkLevel !== undefined ||
targetSessionEntry?.thinkingLevel !== undefined ||
agentCfg?.thinkingDefault !== undefined;
// When neither directive nor session nor agent set reasoning, default to model capability
// (e.g. OpenRouter with reasoning: true). Skip the model default when thinking is active
// (to avoid redundant Reasoning: output alongside internal thinking blocks) or when
// thinking was explicitly disabled.
const hasAgentReasoningDefault =
agentEntry?.reasoningDefault !== undefined && agentEntry?.reasoningDefault !== null;
const reasoningExplicitlySet =
@@ -506,7 +511,12 @@ export async function resolveReplyDirectives(params: {
targetSessionEntry?.reasoningLevel !== null) ||
hasAgentReasoningDefault;
const thinkingActive = resolvedThinkLevelWithDefault !== "off";
if (!reasoningExplicitlySet && resolvedReasoningLevel === "off" && !thinkingActive) {
if (
!reasoningExplicitlySet &&
resolvedReasoningLevel === "off" &&
!thinkingActive &&
!thinkingExplicitlySet
) {
resolvedReasoningLevel = await modelState.resolveDefaultReasoningLevel();
}