From f5be489266e3cb6457d09b0c1c26df14e8db97f8 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 21 Apr 2026 06:12:17 +0100 Subject: [PATCH] test: add gpt-5.4 thinking visibility QA --- .../src/providers/mock-openai/server.test.ts | 52 +++++ .../src/providers/mock-openai/server.ts | 66 ++++++ .../qa-lab/src/scenario-catalog.test.ts | 26 +++ .../gpt54-thinking-visibility-switch.md | 211 ++++++++++++++++++ src/agents/openai-transport-stream.test.ts | 6 +- src/agents/openai-transport-stream.ts | 6 +- src/agents/pi-embedded-utils.test.ts | 22 ++ src/agents/pi-embedded-utils.ts | 8 +- ...et-reply-directives.target-session.test.ts | 20 +- src/auto-reply/reply/get-reply-directives.ts | 14 +- 10 files changed, 419 insertions(+), 12 deletions(-) create mode 100644 qa/scenarios/models/gpt54-thinking-visibility-switch.md diff --git a/extensions/qa-lab/src/providers/mock-openai/server.test.ts b/extensions/qa-lab/src/providers/mock-openai/server.test.ts index caebf0f2cfb..55264701dc0 100644 --- a/extensions/qa-lab/src/providers/mock-openai/server.test.ts +++ b/extensions/qa-lab/src/providers/mock-openai/server.test.ts @@ -8,6 +8,10 @@ const QA_REASONING_ONLY_RECOVERY_PROMPT = "Reasoning-only continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly REASONING-RECOVERED-OK."; const QA_REASONING_ONLY_SIDE_EFFECT_PROMPT = "Reasoning-only after write safety check: write reasoning-only-side-effect.txt, then answer with exactly SIDE-EFFECT-GUARD-OK."; +const QA_THINKING_VISIBILITY_OFF_PROMPT = + "QA thinking visibility check off: answer exactly THINKING-OFF-OK."; +const QA_THINKING_VISIBILITY_MAX_PROMPT = + "QA thinking visibility check max: verify 17+24=41 internally, then answer exactly THINKING-MAX-OK."; const QA_EMPTY_RESPONSE_RECOVERY_PROMPT = "Empty response continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly EMPTY-RECOVERED-OK."; const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT = @@ -2049,6 +2053,54 @@ describe("qa mock openai server", () => { ]); }); + it("scripts the GPT-5.4 thinking visibility switch prompts", async () => { + const server = await startMockServer(); + + expect( + await expectResponsesJson<{ + output?: Array<{ type?: string; content?: Array<{ text?: string }> }>; + }>(server, { + stream: false, + model: "gpt-5.4", + input: [makeUserInput(QA_THINKING_VISIBILITY_OFF_PROMPT)], + }), + ).toMatchObject({ + output: [ + { + type: "message", + content: [{ text: "THINKING-OFF-OK" }], + }, + ], + }); + + expect( + await expectResponsesJson<{ + output?: Array<{ + type?: string; + id?: string; + summary?: Array<{ text?: string }>; + content?: Array<{ text?: string }>; + }>; + }>(server, { + stream: false, + model: "gpt-5.4", + input: [makeUserInput(QA_THINKING_VISIBILITY_MAX_PROMPT)], + }), + ).toMatchObject({ + output: [ + { + type: "reasoning", + id: "rs_mock_thinking_visibility_max", + summary: [], + }, + { + type: "message", + content: [{ text: "THINKING-MAX-OK" }], + }, + ], + }); + }); + it("keeps the reasoning-only side-effect path ready for no-auto-retry QA coverage", async () => { const server = await startMockServer(); diff --git a/extensions/qa-lab/src/providers/mock-openai/server.ts b/extensions/qa-lab/src/providers/mock-openai/server.ts index fc91e3b1d33..a8363b46a8e 100644 --- a/extensions/qa-lab/src/providers/mock-openai/server.ts +++ b/extensions/qa-lab/src/providers/mock-openai/server.ts @@ -140,6 +140,8 @@ const TINY_PNG_BASE64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO7Z0nQAAAAASUVORK5CYII="; const QA_REASONING_ONLY_RECOVERY_PROMPT_RE = /reasoning-only continuation qa check/i; const QA_REASONING_ONLY_SIDE_EFFECT_PROMPT_RE = /reasoning-only after write safety check/i; +const QA_THINKING_VISIBILITY_OFF_PROMPT_RE = /qa thinking visibility check off/i; +const QA_THINKING_VISIBILITY_MAX_PROMPT_RE = /qa thinking visibility check max/i; const QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE = /empty response continuation qa check/i; const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT_RE = /empty response exhaustion qa check/i; const QA_QUIET_STREAMING_PROMPT_RE = /quiet streaming qa check/i; @@ -924,6 +926,61 @@ function buildReasoningOnlyEvents(summaryText: string, id: string): StreamEvent[ ]; } +function buildReasoningAndAssistantEvents(params: { + reasoningId: string; + answerText: string; + answerId?: string; +}): StreamEvent[] { + const reasoningItem = { + type: "reasoning", + id: params.reasoningId, + summary: [], + } as const; + const answerItem = buildAssistantOutputItem({ + id: params.answerId ?? "msg_mock_reasoned_answer", + phase: "final_answer", + text: params.answerText, + }); + return [ + { + type: "response.output_item.added", + item: { + type: "reasoning", + id: params.reasoningId, + summary: [], + }, + }, + { + type: "response.output_item.done", + item: reasoningItem, + }, + { + type: "response.output_item.added", + item: { + type: "message", + id: answerItem.id, + role: "assistant", + phase: "final_answer", + content: [], + status: "in_progress", + }, + }, + { + type: "response.output_item.done", + item: answerItem, + }, + { + type: "response.completed", + response: { + id: `resp_${params.reasoningId}`, + status: "completed", + output: [reasoningItem, answerItem], + usage: { input_tokens: 64, output_tokens: 16, total_tokens: 80 }, + }, + }, + ]; +} + async function buildResponsesPayload( body: Record, scenarioState: MockScenarioState, @@ -981,6 +1038,15 @@ async function buildResponsesPayload( } return buildAssistantEvents("BUG-SHOULD-NOT-AUTO-RETRY"); } + if (QA_THINKING_VISIBILITY_MAX_PROMPT_RE.test(prompt)) { + return buildReasoningAndAssistantEvents({ + reasoningId: "rs_mock_thinking_visibility_max", + answerText: "THINKING-MAX-OK", + }); + } + if (QA_THINKING_VISIBILITY_OFF_PROMPT_RE.test(prompt)) { + return buildAssistantEvents("THINKING-OFF-OK"); + } if (QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE.test(allInputText)) { if (!toolOutput) { return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" }); diff --git a/extensions/qa-lab/src/scenario-catalog.test.ts b/extensions/qa-lab/src/scenario-catalog.test.ts index c237283535c..c38087d9fbb 100644 --- a/extensions/qa-lab/src/scenario-catalog.test.ts +++ b/extensions/qa-lab/src/scenario-catalog.test.ts @@ -123,6 +123,32 @@ describe("qa scenario catalog", () => { ); }); + it("includes the GPT-5.4 thinking visibility switch scenario", () => { + const scenario = readQaScenarioById("gpt54-thinking-visibility-switch"); + const config = readQaScenarioExecutionConfig("gpt54-thinking-visibility-switch") as + | { + requiredLiveProvider?: string; + requiredLiveModel?: string; + offDirective?: string; + maxDirective?: string; + reasoningDirective?: string; + } + | undefined; + + expect(scenario.sourcePath).toBe("qa/scenarios/models/gpt54-thinking-visibility-switch.md"); + expect(config?.requiredLiveProvider).toBe("openai"); + expect(config?.requiredLiveModel).toBe("gpt-5.4"); + expect(config?.offDirective).toBe("/think off"); + expect(config?.maxDirective).toBe("/think max"); + expect(config?.reasoningDirective).toBe("/reasoning on"); + expect(scenario.execution.flow?.steps.map((step) => step.name)).toEqual([ + "enables reasoning display and disables thinking", + "switches to max thinking", + "verifies max thinking emits visible reasoning", + "verifies max thinking completes the answer", + ]); + }); + it("includes the seeded mock-only broken-turn scenarios in the markdown pack", () => { const scenarioIds = [ "reasoning-only-recovery-replay-safe-read", diff --git a/qa/scenarios/models/gpt54-thinking-visibility-switch.md b/qa/scenarios/models/gpt54-thinking-visibility-switch.md new file mode 100644 index 00000000000..9b560f6c606 --- /dev/null +++ b/qa/scenarios/models/gpt54-thinking-visibility-switch.md @@ -0,0 +1,211 @@ +# GPT-5.4 thinking visibility switch + +```yaml qa-scenario +id: gpt54-thinking-visibility-switch +title: GPT-5.4 thinking visibility switch +surface: models +coverage: + primary: + - models.thinking + secondary: + - runtime.reasoning-visibility +objective: Verify GPT-5.4 can switch from disabled thinking to max thinking while reasoning display stays enabled. +successCriteria: + - Live runs target openai/gpt-5.4, not a mini or pro variant. + - The session enables reasoning display before the comparison turns. + - The disabled-thinking turn returns its visible marker without a Reasoning-prefixed message. + - The max-thinking turn returns its visible marker and a separate Reasoning-prefixed message. +docsRefs: + - docs/tools/thinking.md + - docs/help/testing.md + - docs/concepts/qa-e2e-automation.md +codeRefs: + - src/auto-reply/reply/directives.ts + - src/auto-reply/thinking.shared.ts + - src/agents/pi-embedded-runner/run/payloads.ts + - extensions/openai/openai-provider.ts + - extensions/qa-lab/src/providers/mock-openai/server.ts +execution: + kind: flow + summary: Toggle reasoning display and GPT-5.4 thinking between off/none and max/high, then verify visible reasoning only on the max turn. + config: + requiredLiveProvider: openai + requiredLiveModel: gpt-5.4 + offDirective: /think off + maxDirective: /think max + reasoningDirective: /reasoning on + conversationId: qa-thinking-visibility + offPrompt: "QA thinking visibility check off: answer exactly THINKING-OFF-OK." + maxPrompt: "QA thinking visibility check max: verify 17+24=41 internally, then answer exactly THINKING-MAX-OK." + offMarker: THINKING-OFF-OK + maxMarker: THINKING-MAX-OK +``` + +```yaml qa-flow +steps: + - name: enables reasoning display and disables thinking + actions: + - call: waitForGatewayHealthy + args: + - ref: env + - 60000 + - call: waitForQaChannelReady + args: + - ref: env + - 60000 + - call: reset + - set: selected + value: + expr: splitModelRef(env.primaryModel) + - assert: + expr: "env.providerMode !== 'live-frontier' || (selected?.provider === config.requiredLiveProvider && selected?.model === config.requiredLiveModel)" + message: + expr: "`expected live GPT-5.4, got ${env.primaryModel}`" + - call: state.addInboundMessage + args: + - conversation: + id: + expr: config.conversationId + kind: direct + senderId: qa-operator + senderName: QA Operator + text: + expr: config.reasoningDirective + - call: waitForCondition + saveAs: reasoningAck + args: + - lambda: + expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Reasoning visibility enabled/i.test(candidate.text)).at(-1)" + - expr: liveTurnTimeoutMs(env, 20000) + - call: state.addInboundMessage + args: + - conversation: + id: + expr: config.conversationId + kind: direct + senderId: qa-operator + senderName: QA Operator + text: + expr: config.offDirective + - call: waitForCondition + saveAs: offAck + args: + - lambda: + expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking disabled/i.test(candidate.text)).at(-1)" + - expr: liveTurnTimeoutMs(env, 20000) + - set: offCursor + value: + expr: state.getSnapshot().messages.length + - call: state.addInboundMessage + args: + - conversation: + id: + expr: config.conversationId + kind: direct + senderId: qa-operator + senderName: QA Operator + text: + expr: "`${config.offDirective} ${config.offPrompt}`" + - call: waitForCondition + saveAs: offAnswer + args: + - lambda: + expr: "state.getSnapshot().messages.slice(offCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && candidate.text.includes(config.offMarker)).at(-1)" + - expr: liveTurnTimeoutMs(env, 30000) + - set: offMessages + value: + expr: "state.getSnapshot().messages.slice(offCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId)" + - assert: + expr: "offMessages.some((candidate) => candidate.text.includes(config.offMarker))" + message: + expr: "`missing off marker; saw ${offMessages.map((message) => message.text).join(' | ')}`" + - assert: + expr: "!offMessages.some((candidate) => candidate.text.trimStart().startsWith('Reasoning:'))" + message: + expr: "`disabled thinking unexpectedly emitted reasoning: ${offMessages.map((message) => message.text).join(' | ')}`" + - if: + expr: "Boolean(env.mock)" + then: + - set: requests + value: + expr: "await fetchJson(`${env.mock.baseUrl}/debug/requests`)" + - set: offRequest + value: + expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.offPrompt))" + - assert: + expr: "String(offRequest?.model ?? '').includes('gpt-5.4')" + message: + expr: "`expected GPT-5.4 off mock request, got ${String(offRequest?.model ?? '')}`" + detailsExpr: "`off ack=${offAck.text}; off answer=${offAnswer.text}`" + - name: switches to max thinking + actions: + - call: state.addInboundMessage + args: + - conversation: + id: + expr: config.conversationId + kind: direct + senderId: qa-operator + senderName: QA Operator + text: + expr: config.maxDirective + - call: waitForCondition + saveAs: maxAck + args: + - lambda: + expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking level set to high/i.test(candidate.text)).at(-1)" + - expr: liveTurnTimeoutMs(env, 20000) + detailsExpr: "`max ack=${maxAck.text}`" + - name: verifies max thinking emits visible reasoning + actions: + - set: maxCursor + value: + expr: state.getSnapshot().messages.length + - call: state.addInboundMessage + args: + - conversation: + id: + expr: config.conversationId + kind: direct + senderId: qa-operator + senderName: QA Operator + text: + expr: "`${config.maxDirective} ${config.maxPrompt}`" + - call: waitForCondition + saveAs: maxReasoning + args: + - lambda: + expr: "state.getSnapshot().messages.slice(maxCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && candidate.text.trimStart().startsWith('Reasoning:')).at(-1)" + - expr: liveTurnTimeoutMs(env, 120000) + - assert: + expr: "maxReasoning.text.trimStart().startsWith('Reasoning:')" + message: + expr: "`missing max reasoning message near answer: ${recentOutboundSummary(state, 6)}`" + detailsExpr: "`reasoning=${maxReasoning.text}`" + - name: verifies max thinking completes the answer + actions: + - call: waitForCondition + saveAs: maxAnswer + args: + - lambda: + expr: "state.getSnapshot().messages.slice(maxCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && candidate.text.includes(config.maxMarker)).at(-1)" + - expr: liveTurnTimeoutMs(env, 120000) + - assert: + expr: "maxAnswer.text.includes(config.maxMarker)" + message: + expr: "`missing max marker: ${maxAnswer.text}`" + - if: + expr: "Boolean(env.mock)" + then: + - set: requests + value: + expr: "await fetchJson(`${env.mock.baseUrl}/debug/requests`)" + - set: maxRequest + value: + expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.maxPrompt))" + - assert: + expr: "String(maxRequest?.model ?? '').includes('gpt-5.4')" + message: + expr: "`expected GPT-5.4 mock request, got ${String(maxRequest?.model ?? '')}`" + detailsExpr: "`answer=${maxAnswer.text}`" +``` diff --git a/src/agents/openai-transport-stream.test.ts b/src/agents/openai-transport-stream.test.ts index fd0e3c88114..59aae01b080 100644 --- a/src/agents/openai-transport-stream.test.ts +++ b/src/agents/openai-transport-stream.test.ts @@ -513,7 +513,7 @@ describe("openai transport stream", () => { expect(params.input?.[0]).toMatchObject({ role: "developer" }); }); - it("defaults OpenAI Responses reasoning effort to high when unset", () => { + it("does not infer high reasoning when Pi passes thinking off", () => { const params = buildOpenAIResponsesParams( { id: "gpt-5.4", @@ -535,8 +535,8 @@ describe("openai transport stream", () => { undefined, ) as { reasoning?: unknown; include?: string[] }; - expect(params.reasoning).toEqual({ effort: "high", summary: "auto" }); - expect(params.include).toEqual(["reasoning.encrypted_content"]); + expect(params.reasoning).toEqual({ effort: "none" }); + expect(params).not.toHaveProperty("include"); }); it("uses shared stream reasoning as OpenAI Responses effort", () => { diff --git a/src/agents/openai-transport-stream.ts b/src/agents/openai-transport-stream.ts index 9d9096a6cc1..d98f97560f0 100644 --- a/src/agents/openai-transport-stream.ts +++ b/src/agents/openai-transport-stream.ts @@ -814,16 +814,12 @@ export function buildOpenAIResponsesParams( } else if (model.provider !== "github-copilot") { const reasoningEffort = resolveOpenAIReasoningEffortForModel({ model, - effort: "high", + effort: "none", }); if (reasoningEffort) { params.reasoning = { effort: reasoningEffort, - ...(reasoningEffort === "none" ? {} : { summary: "auto" }), }; - if (reasoningEffort !== "none") { - params.include = ["reasoning.encrypted_content"]; - } } } } diff --git a/src/agents/pi-embedded-utils.test.ts b/src/agents/pi-embedded-utils.test.ts index 3b5b27500b4..2c69c9614bd 100644 --- a/src/agents/pi-embedded-utils.test.ts +++ b/src/agents/pi-embedded-utils.test.ts @@ -2,6 +2,7 @@ import type { AssistantMessage } from "@mariozechner/pi-ai"; import { describe, expect, it } from "vitest"; import { extractAssistantText, + extractAssistantThinking, extractAssistantVisibleText, formatReasoningMessage, promoteThinkingTagsToBlocks, @@ -641,6 +642,27 @@ describe("formatReasoningMessage", () => { }); }); +describe("extractAssistantThinking", () => { + it("surfaces signed native reasoning even when the provider returns an empty summary", () => { + const msg = makeAssistantMessage({ + role: "assistant", + content: [ + { + type: "thinking", + thinking: "", + thinkingSignature: JSON.stringify({ type: "reasoning", id: "rs_live", summary: [] }), + }, + { type: "text", text: "Done." }, + ], + timestamp: Date.now(), + }); + + expect(extractAssistantThinking(msg)).toBe( + "Native reasoning was produced; no summary text was returned.", + ); + }); +}); + describe("stripDowngradedToolCallText", () => { it("strips downgraded marker blocks while preserving surrounding user-facing text", () => { const cases = [ diff --git a/src/agents/pi-embedded-utils.ts b/src/agents/pi-embedded-utils.ts index 7a54079f4a3..666716ed20b 100644 --- a/src/agents/pi-embedded-utils.ts +++ b/src/agents/pi-embedded-utils.ts @@ -147,7 +147,13 @@ export function extractAssistantThinking(msg: AssistantMessage): string { } const record = block as unknown as Record; if (record.type === "thinking" && typeof record.thinking === "string") { - return record.thinking.trim(); + const thinking = record.thinking.trim(); + if (thinking) { + return thinking; + } + if (typeof record.thinkingSignature === "string" && record.thinkingSignature.trim()) { + return "Native reasoning was produced; no summary text was returned."; + } } return ""; }) diff --git a/src/auto-reply/reply/get-reply-directives.target-session.test.ts b/src/auto-reply/reply/get-reply-directives.target-session.test.ts index b199a05576e..91e2c054809 100644 --- a/src/auto-reply/reply/get-reply-directives.target-session.test.ts +++ b/src/auto-reply/reply/get-reply-directives.target-session.test.ts @@ -89,6 +89,7 @@ function parseInlineDirectivesForTest(body: string) { async function resolveHelloWithModelDefaults(params: { defaultThinking: "off" | "low"; defaultReasoning: "on"; + sessionEntry?: SessionEntry; }) { const resolveDefaultThinkingLevel = vi.fn(async () => params.defaultThinking); const resolveDefaultReasoningLevel = vi.fn(async () => params.defaultReasoning); @@ -119,7 +120,7 @@ async function resolveHelloWithModelDefaults(params: { CommandBody: "hello", Provider: "whatsapp", } as TemplateContext, - sessionEntry: makeSessionEntry(), + sessionEntry: params.sessionEntry ?? makeSessionEntry(), sessionStore: {}, sessionKey: "agent:main:whatsapp:+2000", storePath: "/tmp/sessions.json", @@ -427,6 +428,23 @@ describe("resolveReplyDirectives", () => { expect(resolveDefaultReasoningLevel).toHaveBeenCalledOnce(); }); + it("does not re-enable model reasoning when thinking was explicitly disabled", async () => { + const { result, resolveDefaultReasoningLevel } = await resolveHelloWithModelDefaults({ + defaultThinking: "off", + defaultReasoning: "on", + sessionEntry: makeSessionEntry({ thinkingLevel: "off" }), + }); + + expect(result).toEqual({ + kind: "continue", + result: expect.objectContaining({ + resolvedThinkLevel: "off", + resolvedReasoningLevel: "off", + }), + }); + expect(resolveDefaultReasoningLevel).not.toHaveBeenCalled(); + }); + it("skips the model reasoning default when thinking is active", async () => { const { result, resolveDefaultReasoningLevel } = await resolveHelloWithModelDefaults({ defaultThinking: "low", diff --git a/src/auto-reply/reply/get-reply-directives.ts b/src/auto-reply/reply/get-reply-directives.ts index d7b9d0d0f48..9eb3096f7d8 100644 --- a/src/auto-reply/reply/get-reply-directives.ts +++ b/src/auto-reply/reply/get-reply-directives.ts @@ -495,9 +495,14 @@ export async function resolveReplyDirectives(params: { (await modelState.resolveDefaultThinkingLevel()) ?? (agentCfg?.thinkingDefault as ThinkLevel | undefined); + const thinkingExplicitlySet = + directives.thinkLevel !== undefined || + targetSessionEntry?.thinkingLevel !== undefined || + agentCfg?.thinkingDefault !== undefined; + // When neither directive nor session nor agent set reasoning, default to model capability // (e.g. OpenRouter with reasoning: true). Skip model default when thinking is active - // to avoid redundant Reasoning: output alongside internal thinking blocks. + // or when thinking was explicitly disabled. const hasAgentReasoningDefault = agentEntry?.reasoningDefault !== undefined && agentEntry?.reasoningDefault !== null; const reasoningExplicitlySet = @@ -506,7 +511,12 @@ export async function resolveReplyDirectives(params: { targetSessionEntry?.reasoningLevel !== null) || hasAgentReasoningDefault; const thinkingActive = resolvedThinkLevelWithDefault !== "off"; - if (!reasoningExplicitlySet && resolvedReasoningLevel === "off" && !thinkingActive) { + if ( + !reasoningExplicitlySet && + resolvedReasoningLevel === "off" && + !thinkingActive && + !thinkingExplicitlySet + ) { resolvedReasoningLevel = await modelState.resolveDefaultReasoningLevel(); }