test: add gpt-5.4 thinking visibility QA

This commit is contained in:
Peter Steinberger
2026-04-21 06:12:17 +01:00
parent 663501206f
commit f5be489266
10 changed files with 419 additions and 12 deletions

View File

@@ -8,6 +8,10 @@ const QA_REASONING_ONLY_RECOVERY_PROMPT =
"Reasoning-only continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly REASONING-RECOVERED-OK.";
const QA_REASONING_ONLY_SIDE_EFFECT_PROMPT =
"Reasoning-only after write safety check: write reasoning-only-side-effect.txt, then answer with exactly SIDE-EFFECT-GUARD-OK.";
const QA_THINKING_VISIBILITY_OFF_PROMPT =
"QA thinking visibility check off: answer exactly THINKING-OFF-OK.";
const QA_THINKING_VISIBILITY_MAX_PROMPT =
"QA thinking visibility check max: verify 17+24=41 internally, then answer exactly THINKING-MAX-OK.";
const QA_EMPTY_RESPONSE_RECOVERY_PROMPT =
"Empty response continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly EMPTY-RECOVERED-OK.";
const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT =
@@ -2049,6 +2053,54 @@ describe("qa mock openai server", () => {
]);
});
it("scripts the GPT-5.4 thinking visibility switch prompts", async () => {
  const server = await startMockServer();

  // Thinking off: the mock must answer with only a visible assistant message.
  const offResponse = await expectResponsesJson<{
    output?: Array<{ type?: string; content?: Array<{ text?: string }> }>;
  }>(server, {
    stream: false,
    model: "gpt-5.4",
    input: [makeUserInput(QA_THINKING_VISIBILITY_OFF_PROMPT)],
  });
  expect(offResponse).toMatchObject({
    output: [{ type: "message", content: [{ text: "THINKING-OFF-OK" }] }],
  });

  // Thinking max: a reasoning item (with an empty summary) must precede the answer.
  const maxResponse = await expectResponsesJson<{
    output?: Array<{
      type?: string;
      id?: string;
      summary?: Array<{ text?: string }>;
      content?: Array<{ text?: string }>;
    }>;
  }>(server, {
    stream: false,
    model: "gpt-5.4",
    input: [makeUserInput(QA_THINKING_VISIBILITY_MAX_PROMPT)],
  });
  expect(maxResponse).toMatchObject({
    output: [
      {
        type: "reasoning",
        id: "rs_mock_thinking_visibility_max",
        summary: [],
      },
      {
        type: "message",
        content: [{ text: "THINKING-MAX-OK" }],
      },
    ],
  });
});
it("keeps the reasoning-only side-effect path ready for no-auto-retry QA coverage", async () => {
const server = await startMockServer();

View File

@@ -140,6 +140,8 @@ const TINY_PNG_BASE64 =
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO7Z0nQAAAAASUVORK5CYII=";
const QA_REASONING_ONLY_RECOVERY_PROMPT_RE = /reasoning-only continuation qa check/i;
const QA_REASONING_ONLY_SIDE_EFFECT_PROMPT_RE = /reasoning-only after write safety check/i;
const QA_THINKING_VISIBILITY_OFF_PROMPT_RE = /qa thinking visibility check off/i;
const QA_THINKING_VISIBILITY_MAX_PROMPT_RE = /qa thinking visibility check max/i;
const QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE = /empty response continuation qa check/i;
const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT_RE = /empty response exhaustion qa check/i;
const QA_QUIET_STREAMING_PROMPT_RE = /quiet streaming qa check/i;
@@ -924,6 +926,61 @@ function buildReasoningOnlyEvents(summaryText: string, id: string): StreamEvent[
];
}
/**
 * Builds the Responses-API stream events for a turn that emits a reasoning
 * item (with a deliberately empty summary) followed by a final assistant
 * answer. Used to mock providers that produce native reasoning but return
 * no summary text.
 */
function buildReasoningAndAssistantEvents(params: {
  reasoningId: string;
  answerText: string;
  answerId?: string;
}): StreamEvent[] {
  const { reasoningId, answerText, answerId } = params;
  // Completed reasoning output item; summary stays empty on purpose.
  const reasoningItem = {
    type: "reasoning",
    id: reasoningId,
    summary: [],
  } as const;
  // Final assistant message that carries the visible answer text.
  const answerItem = buildAssistantOutputItem({
    id: answerId ?? "msg_mock_reasoned_answer",
    phase: "final_answer",
    text: answerText,
  });
  const events: StreamEvent[] = [
    // Reasoning lifecycle: added, then done.
    {
      type: "response.output_item.added",
      item: { type: "reasoning", id: reasoningId, summary: [] },
    },
    { type: "response.output_item.done", item: reasoningItem },
    // Assistant message lifecycle: added with empty in-progress content, then done.
    {
      type: "response.output_item.added",
      item: {
        type: "message",
        id: answerItem.id,
        role: "assistant",
        phase: "final_answer",
        content: [],
        status: "in_progress",
      },
    },
    { type: "response.output_item.done", item: answerItem },
    // Terminal event repeats both completed items with fixed mock usage numbers.
    {
      type: "response.completed",
      response: {
        id: `resp_${reasoningId}`,
        status: "completed",
        output: [reasoningItem, answerItem],
        usage: { input_tokens: 64, output_tokens: 16, total_tokens: 80 },
      },
    },
  ];
  return events;
}
async function buildResponsesPayload(
body: Record<string, unknown>,
scenarioState: MockScenarioState,
@@ -981,6 +1038,15 @@ async function buildResponsesPayload(
}
return buildAssistantEvents("BUG-SHOULD-NOT-AUTO-RETRY");
}
if (QA_THINKING_VISIBILITY_MAX_PROMPT_RE.test(prompt)) {
return buildReasoningAndAssistantEvents({
reasoningId: "rs_mock_thinking_visibility_max",
answerText: "THINKING-MAX-OK",
});
}
if (QA_THINKING_VISIBILITY_OFF_PROMPT_RE.test(prompt)) {
return buildAssistantEvents("THINKING-OFF-OK");
}
if (QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE.test(allInputText)) {
if (!toolOutput) {
return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });

View File

@@ -123,6 +123,32 @@ describe("qa scenario catalog", () => {
);
});
it("includes the GPT-5.4 thinking visibility switch scenario", () => {
  // Shape of the execution config this scenario is expected to declare.
  type ThinkingSwitchConfig = {
    requiredLiveProvider?: string;
    requiredLiveModel?: string;
    offDirective?: string;
    maxDirective?: string;
    reasoningDirective?: string;
  };
  const scenario = readQaScenarioById("gpt54-thinking-visibility-switch");
  const config = readQaScenarioExecutionConfig("gpt54-thinking-visibility-switch") as
    | ThinkingSwitchConfig
    | undefined;
  expect(scenario.sourcePath).toBe("qa/scenarios/models/gpt54-thinking-visibility-switch.md");
  expect(config?.requiredLiveProvider).toBe("openai");
  expect(config?.requiredLiveModel).toBe("gpt-5.4");
  expect(config?.offDirective).toBe("/think off");
  expect(config?.maxDirective).toBe("/think max");
  expect(config?.reasoningDirective).toBe("/reasoning on");
  const stepNames = scenario.execution.flow?.steps.map((step) => step.name);
  expect(stepNames).toEqual([
    "enables reasoning display and disables thinking",
    "switches to max thinking",
    "verifies max thinking emits visible reasoning",
    "verifies max thinking completes the answer",
  ]);
});
it("includes the seeded mock-only broken-turn scenarios in the markdown pack", () => {
const scenarioIds = [
"reasoning-only-recovery-replay-safe-read",

View File

@@ -0,0 +1,211 @@
# GPT-5.4 thinking visibility switch
```yaml qa-scenario
id: gpt54-thinking-visibility-switch
title: GPT-5.4 thinking visibility switch
surface: models
coverage:
primary:
- models.thinking
secondary:
- runtime.reasoning-visibility
objective: Verify GPT-5.4 can switch from disabled thinking to max thinking while reasoning display stays enabled.
successCriteria:
- Live runs target openai/gpt-5.4, not a mini or pro variant.
- The session enables reasoning display before the comparison turns.
- The disabled-thinking turn returns its visible marker without a Reasoning-prefixed message.
- The max-thinking turn returns its visible marker and a separate Reasoning-prefixed message.
docsRefs:
- docs/tools/thinking.md
- docs/help/testing.md
- docs/concepts/qa-e2e-automation.md
codeRefs:
- src/auto-reply/reply/directives.ts
- src/auto-reply/thinking.shared.ts
- src/agents/pi-embedded-runner/run/payloads.ts
- extensions/openai/openai-provider.ts
- extensions/qa-lab/src/providers/mock-openai/server.ts
execution:
kind: flow
summary: Toggle reasoning display and GPT-5.4 thinking between off/none and max/high, then verify visible reasoning only on the max turn.
config:
requiredLiveProvider: openai
requiredLiveModel: gpt-5.4
offDirective: /think off
maxDirective: /think max
reasoningDirective: /reasoning on
conversationId: qa-thinking-visibility
offPrompt: "QA thinking visibility check off: answer exactly THINKING-OFF-OK."
maxPrompt: "QA thinking visibility check max: verify 17+24=41 internally, then answer exactly THINKING-MAX-OK."
offMarker: THINKING-OFF-OK
maxMarker: THINKING-MAX-OK
```
```yaml qa-flow
steps:
- name: enables reasoning display and disables thinking
actions:
- call: waitForGatewayHealthy
args:
- ref: env
- 60000
- call: waitForQaChannelReady
args:
- ref: env
- 60000
- call: reset
- set: selected
value:
expr: splitModelRef(env.primaryModel)
- assert:
expr: "env.providerMode !== 'live-frontier' || (selected?.provider === config.requiredLiveProvider && selected?.model === config.requiredLiveModel)"
message:
expr: "`expected live GPT-5.4, got ${env.primaryModel}`"
- call: state.addInboundMessage
args:
- conversation:
id:
expr: config.conversationId
kind: direct
senderId: qa-operator
senderName: QA Operator
text:
expr: config.reasoningDirective
- call: waitForCondition
saveAs: reasoningAck
args:
- lambda:
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Reasoning visibility enabled/i.test(candidate.text)).at(-1)"
- expr: liveTurnTimeoutMs(env, 20000)
- call: state.addInboundMessage
args:
- conversation:
id:
expr: config.conversationId
kind: direct
senderId: qa-operator
senderName: QA Operator
text:
expr: config.offDirective
- call: waitForCondition
saveAs: offAck
args:
- lambda:
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking disabled/i.test(candidate.text)).at(-1)"
- expr: liveTurnTimeoutMs(env, 20000)
- set: offCursor
value:
expr: state.getSnapshot().messages.length
- call: state.addInboundMessage
args:
- conversation:
id:
expr: config.conversationId
kind: direct
senderId: qa-operator
senderName: QA Operator
text:
expr: "`${config.offDirective} ${config.offPrompt}`"
- call: waitForCondition
saveAs: offAnswer
args:
- lambda:
expr: "state.getSnapshot().messages.slice(offCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && candidate.text.includes(config.offMarker)).at(-1)"
- expr: liveTurnTimeoutMs(env, 30000)
- set: offMessages
value:
expr: "state.getSnapshot().messages.slice(offCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId)"
- assert:
expr: "offMessages.some((candidate) => candidate.text.includes(config.offMarker))"
message:
expr: "`missing off marker; saw ${offMessages.map((message) => message.text).join(' | ')}`"
- assert:
expr: "!offMessages.some((candidate) => candidate.text.trimStart().startsWith('Reasoning:'))"
message:
expr: "`disabled thinking unexpectedly emitted reasoning: ${offMessages.map((message) => message.text).join(' | ')}`"
- if:
expr: "Boolean(env.mock)"
then:
- set: requests
value:
expr: "await fetchJson(`${env.mock.baseUrl}/debug/requests`)"
- set: offRequest
value:
expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.offPrompt))"
- assert:
expr: "String(offRequest?.model ?? '').includes('gpt-5.4')"
message:
expr: "`expected GPT-5.4 off mock request, got ${String(offRequest?.model ?? '')}`"
detailsExpr: "`off ack=${offAck.text}; off answer=${offAnswer.text}`"
- name: switches to max thinking
actions:
- call: state.addInboundMessage
args:
- conversation:
id:
expr: config.conversationId
kind: direct
senderId: qa-operator
senderName: QA Operator
text:
expr: config.maxDirective
- call: waitForCondition
saveAs: maxAck
args:
- lambda:
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking level set to high/i.test(candidate.text)).at(-1)"
- expr: liveTurnTimeoutMs(env, 20000)
detailsExpr: "`max ack=${maxAck.text}`"
- name: verifies max thinking emits visible reasoning
actions:
- set: maxCursor
value:
expr: state.getSnapshot().messages.length
- call: state.addInboundMessage
args:
- conversation:
id:
expr: config.conversationId
kind: direct
senderId: qa-operator
senderName: QA Operator
text:
expr: "`${config.maxDirective} ${config.maxPrompt}`"
- call: waitForCondition
saveAs: maxReasoning
args:
- lambda:
expr: "state.getSnapshot().messages.slice(maxCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && candidate.text.trimStart().startsWith('Reasoning:')).at(-1)"
- expr: liveTurnTimeoutMs(env, 120000)
- assert:
expr: "maxReasoning.text.trimStart().startsWith('Reasoning:')"
message:
expr: "`missing max reasoning message near answer: ${recentOutboundSummary(state, 6)}`"
detailsExpr: "`reasoning=${maxReasoning.text}`"
- name: verifies max thinking completes the answer
actions:
- call: waitForCondition
saveAs: maxAnswer
args:
- lambda:
expr: "state.getSnapshot().messages.slice(maxCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && candidate.text.includes(config.maxMarker)).at(-1)"
- expr: liveTurnTimeoutMs(env, 120000)
- assert:
expr: "maxAnswer.text.includes(config.maxMarker)"
message:
expr: "`missing max marker: ${maxAnswer.text}`"
- if:
expr: "Boolean(env.mock)"
then:
- set: requests
value:
expr: "await fetchJson(`${env.mock.baseUrl}/debug/requests`)"
- set: maxRequest
value:
expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.maxPrompt))"
- assert:
expr: "String(maxRequest?.model ?? '').includes('gpt-5.4')"
message:
expr: "`expected GPT-5.4 mock request, got ${String(maxRequest?.model ?? '')}`"
detailsExpr: "`answer=${maxAnswer.text}`"
```

View File

@@ -513,7 +513,7 @@ describe("openai transport stream", () => {
expect(params.input?.[0]).toMatchObject({ role: "developer" });
});
it("defaults OpenAI Responses reasoning effort to high when unset", () => {
it("does not infer high reasoning when Pi passes thinking off", () => {
const params = buildOpenAIResponsesParams(
{
id: "gpt-5.4",
@@ -535,8 +535,8 @@ describe("openai transport stream", () => {
undefined,
) as { reasoning?: unknown; include?: string[] };
expect(params.reasoning).toEqual({ effort: "high", summary: "auto" });
expect(params.include).toEqual(["reasoning.encrypted_content"]);
expect(params.reasoning).toEqual({ effort: "none" });
expect(params).not.toHaveProperty("include");
});
it("uses shared stream reasoning as OpenAI Responses effort", () => {

View File

@@ -814,16 +814,12 @@ export function buildOpenAIResponsesParams(
} else if (model.provider !== "github-copilot") {
const reasoningEffort = resolveOpenAIReasoningEffortForModel({
model,
effort: "high",
effort: "none",
});
if (reasoningEffort) {
params.reasoning = {
effort: reasoningEffort,
...(reasoningEffort === "none" ? {} : { summary: "auto" }),
};
if (reasoningEffort !== "none") {
params.include = ["reasoning.encrypted_content"];
}
}
}
}

View File

@@ -2,6 +2,7 @@ import type { AssistantMessage } from "@mariozechner/pi-ai";
import { describe, expect, it } from "vitest";
import {
extractAssistantText,
extractAssistantThinking,
extractAssistantVisibleText,
formatReasoningMessage,
promoteThinkingTagsToBlocks,
@@ -641,6 +642,27 @@ describe("formatReasoningMessage", () => {
});
});
describe("extractAssistantThinking", () => {
  it("surfaces signed native reasoning even when the provider returns an empty summary", () => {
    // A thinking block whose text is empty but which carries a signed
    // native-reasoning payload in its signature.
    const signature = JSON.stringify({ type: "reasoning", id: "rs_live", summary: [] });
    const msg = makeAssistantMessage({
      role: "assistant",
      content: [
        { type: "thinking", thinking: "", thinkingSignature: signature },
        { type: "text", text: "Done." },
      ],
      timestamp: Date.now(),
    });
    const extracted = extractAssistantThinking(msg);
    expect(extracted).toBe("Native reasoning was produced; no summary text was returned.");
  });
});
describe("stripDowngradedToolCallText", () => {
it("strips downgraded marker blocks while preserving surrounding user-facing text", () => {
const cases = [

View File

@@ -147,7 +147,13 @@ export function extractAssistantThinking(msg: AssistantMessage): string {
}
const record = block as unknown as Record<string, unknown>;
if (record.type === "thinking" && typeof record.thinking === "string") {
return record.thinking.trim();
const thinking = record.thinking.trim();
if (thinking) {
return thinking;
}
if (typeof record.thinkingSignature === "string" && record.thinkingSignature.trim()) {
return "Native reasoning was produced; no summary text was returned.";
}
}
return "";
})

View File

@@ -89,6 +89,7 @@ function parseInlineDirectivesForTest(body: string) {
async function resolveHelloWithModelDefaults(params: {
defaultThinking: "off" | "low";
defaultReasoning: "on";
sessionEntry?: SessionEntry;
}) {
const resolveDefaultThinkingLevel = vi.fn(async () => params.defaultThinking);
const resolveDefaultReasoningLevel = vi.fn(async () => params.defaultReasoning);
@@ -119,7 +120,7 @@ async function resolveHelloWithModelDefaults(params: {
CommandBody: "hello",
Provider: "whatsapp",
} as TemplateContext,
sessionEntry: makeSessionEntry(),
sessionEntry: params.sessionEntry ?? makeSessionEntry(),
sessionStore: {},
sessionKey: "agent:main:whatsapp:+2000",
storePath: "/tmp/sessions.json",
@@ -427,6 +428,23 @@ describe("resolveReplyDirectives", () => {
expect(resolveDefaultReasoningLevel).toHaveBeenCalledOnce();
});
it("does not re-enable model reasoning when thinking was explicitly disabled", async () => {
  // A session that already pinned thinking off must resolve reasoning off too,
  // without ever consulting the model's default reasoning level.
  const explicitOffSession = makeSessionEntry({ thinkingLevel: "off" });
  const { result, resolveDefaultReasoningLevel } = await resolveHelloWithModelDefaults({
    defaultThinking: "off",
    defaultReasoning: "on",
    sessionEntry: explicitOffSession,
  });
  expect(resolveDefaultReasoningLevel).not.toHaveBeenCalled();
  expect(result).toEqual({
    kind: "continue",
    result: expect.objectContaining({
      resolvedThinkLevel: "off",
      resolvedReasoningLevel: "off",
    }),
  });
});
it("skips the model reasoning default when thinking is active", async () => {
const { result, resolveDefaultReasoningLevel } = await resolveHelloWithModelDefaults({
defaultThinking: "low",

View File

@@ -495,9 +495,14 @@ export async function resolveReplyDirectives(params: {
(await modelState.resolveDefaultThinkingLevel()) ??
(agentCfg?.thinkingDefault as ThinkLevel | undefined);
const thinkingExplicitlySet =
directives.thinkLevel !== undefined ||
targetSessionEntry?.thinkingLevel !== undefined ||
agentCfg?.thinkingDefault !== undefined;
// When neither directive nor session nor agent set reasoning, default to model capability
// (e.g. OpenRouter with reasoning: true). Skip the model default when thinking is active
// (to avoid redundant Reasoning: output alongside internal thinking blocks) or when
// thinking was explicitly disabled.
const hasAgentReasoningDefault =
agentEntry?.reasoningDefault !== undefined && agentEntry?.reasoningDefault !== null;
const reasoningExplicitlySet =
@@ -506,7 +511,12 @@ export async function resolveReplyDirectives(params: {
targetSessionEntry?.reasoningLevel !== null) ||
hasAgentReasoningDefault;
const thinkingActive = resolvedThinkLevelWithDefault !== "off";
if (!reasoningExplicitlySet && resolvedReasoningLevel === "off" && !thinkingActive) {
if (
!reasoningExplicitlySet &&
resolvedReasoningLevel === "off" &&
!thinkingActive &&
!thinkingExplicitlySet
) {
resolvedReasoningLevel = await modelState.resolveDefaultReasoningLevel();
}