test: add gpt-5.4 thinking visibility QA

This commit is contained in:
Peter Steinberger
2026-04-21 06:12:17 +01:00
parent 663501206f
commit f5be489266
10 changed files with 419 additions and 12 deletions

View File

@@ -8,6 +8,10 @@ const QA_REASONING_ONLY_RECOVERY_PROMPT =
"Reasoning-only continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly REASONING-RECOVERED-OK.";
const QA_REASONING_ONLY_SIDE_EFFECT_PROMPT =
"Reasoning-only after write safety check: write reasoning-only-side-effect.txt, then answer with exactly SIDE-EFFECT-GUARD-OK.";
// Prompt for the "thinking off" leg of the GPT-5.4 visibility QA flow; the mock
// server matches it case-insensitively and replies with a plain assistant message.
const QA_THINKING_VISIBILITY_OFF_PROMPT =
"QA thinking visibility check off: answer exactly THINKING-OFF-OK.";
// Prompt for the "thinking max" leg; the mock replies with a reasoning output
// item followed by the final answer message.
const QA_THINKING_VISIBILITY_MAX_PROMPT =
"QA thinking visibility check max: verify 17+24=41 internally, then answer exactly THINKING-MAX-OK.";
const QA_EMPTY_RESPONSE_RECOVERY_PROMPT =
"Empty response continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly EMPTY-RECOVERED-OK.";
const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT =
@@ -2049,6 +2053,54 @@ describe("qa mock openai server", () => {
]);
});
it("scripts the GPT-5.4 thinking visibility switch prompts", async () => {
const server = await startMockServer();
expect(
await expectResponsesJson<{
output?: Array<{ type?: string; content?: Array<{ text?: string }> }>;
}>(server, {
stream: false,
model: "gpt-5.4",
input: [makeUserInput(QA_THINKING_VISIBILITY_OFF_PROMPT)],
}),
).toMatchObject({
output: [
{
type: "message",
content: [{ text: "THINKING-OFF-OK" }],
},
],
});
expect(
await expectResponsesJson<{
output?: Array<{
type?: string;
id?: string;
summary?: Array<{ text?: string }>;
content?: Array<{ text?: string }>;
}>;
}>(server, {
stream: false,
model: "gpt-5.4",
input: [makeUserInput(QA_THINKING_VISIBILITY_MAX_PROMPT)],
}),
).toMatchObject({
output: [
{
type: "reasoning",
id: "rs_mock_thinking_visibility_max",
summary: [],
},
{
type: "message",
content: [{ text: "THINKING-MAX-OK" }],
},
],
});
});
it("keeps the reasoning-only side-effect path ready for no-auto-retry QA coverage", async () => {
const server = await startMockServer();

View File

@@ -140,6 +140,8 @@ const TINY_PNG_BASE64 =
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO7Z0nQAAAAASUVORK5CYII=";
const QA_REASONING_ONLY_RECOVERY_PROMPT_RE = /reasoning-only continuation qa check/i;
const QA_REASONING_ONLY_SIDE_EFFECT_PROMPT_RE = /reasoning-only after write safety check/i;
// Case-insensitive matchers that route the thinking-visibility QA prompts to
// their scripted mock responses in buildResponsesPayload.
const QA_THINKING_VISIBILITY_OFF_PROMPT_RE = /qa thinking visibility check off/i;
const QA_THINKING_VISIBILITY_MAX_PROMPT_RE = /qa thinking visibility check max/i;
const QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE = /empty response continuation qa check/i;
const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT_RE = /empty response exhaustion qa check/i;
const QA_QUIET_STREAMING_PROMPT_RE = /quiet streaming qa check/i;
@@ -924,6 +926,61 @@ function buildReasoningOnlyEvents(summaryText: string, id: string): StreamEvent[
];
}
/**
 * Builds the mock streamed event sequence for a turn that surfaces a reasoning
 * output item (with an empty summary) followed by a final assistant answer.
 *
 * Emits added/done pairs for both items and closes with a
 * `response.completed` event carrying fixed token usage.
 */
function buildReasoningAndAssistantEvents(params: {
  reasoningId: string;
  answerText: string;
  answerId?: string;
}): StreamEvent[] {
  const { reasoningId, answerText, answerId } = params;

  const reasoningItem = {
    type: "reasoning",
    id: reasoningId,
    summary: [],
  } as const;

  const answerItem = buildAssistantOutputItem({
    id: answerId ?? "msg_mock_reasoned_answer",
    phase: "final_answer",
    text: answerText,
  });

  const events: StreamEvent[] = [
    // Reasoning item lifecycle.
    {
      type: "response.output_item.added",
      item: { ...reasoningItem },
    },
    {
      type: "response.output_item.done",
      item: reasoningItem,
    },
    // Assistant message lifecycle: added with empty content, done with text.
    {
      type: "response.output_item.added",
      item: {
        type: "message",
        id: answerItem.id,
        role: "assistant",
        phase: "final_answer",
        content: [],
        status: "in_progress",
      },
    },
    {
      type: "response.output_item.done",
      item: answerItem,
    },
    // Terminal event: full output plus deterministic mock usage numbers.
    {
      type: "response.completed",
      response: {
        id: `resp_${reasoningId}`,
        status: "completed",
        output: [reasoningItem, answerItem],
        usage: { input_tokens: 64, output_tokens: 16, total_tokens: 80 },
      },
    },
  ];
  return events;
}
async function buildResponsesPayload(
body: Record<string, unknown>,
scenarioState: MockScenarioState,
@@ -981,6 +1038,15 @@ async function buildResponsesPayload(
}
return buildAssistantEvents("BUG-SHOULD-NOT-AUTO-RETRY");
}
if (QA_THINKING_VISIBILITY_MAX_PROMPT_RE.test(prompt)) {
return buildReasoningAndAssistantEvents({
reasoningId: "rs_mock_thinking_visibility_max",
answerText: "THINKING-MAX-OK",
});
}
if (QA_THINKING_VISIBILITY_OFF_PROMPT_RE.test(prompt)) {
return buildAssistantEvents("THINKING-OFF-OK");
}
if (QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE.test(allInputText)) {
if (!toolOutput) {
return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });

View File

@@ -123,6 +123,32 @@ describe("qa scenario catalog", () => {
);
});
it("includes the GPT-5.4 thinking visibility switch scenario", () => {
  const scenarioId = "gpt54-thinking-visibility-switch";

  // Shape of the execution config this scenario is expected to declare.
  type ThinkingSwitchConfig = {
    requiredLiveProvider?: string;
    requiredLiveModel?: string;
    offDirective?: string;
    maxDirective?: string;
    reasoningDirective?: string;
  };

  const scenario = readQaScenarioById(scenarioId);
  const config = readQaScenarioExecutionConfig(scenarioId) as
    | ThinkingSwitchConfig
    | undefined;

  expect(scenario.sourcePath).toBe("qa/scenarios/models/gpt54-thinking-visibility-switch.md");
  expect(config?.requiredLiveProvider).toBe("openai");
  expect(config?.requiredLiveModel).toBe("gpt-5.4");
  expect(config?.offDirective).toBe("/think off");
  expect(config?.maxDirective).toBe("/think max");
  expect(config?.reasoningDirective).toBe("/reasoning on");

  const stepNames = scenario.execution.flow?.steps.map((step) => step.name);
  expect(stepNames).toEqual([
    "enables reasoning display and disables thinking",
    "switches to max thinking",
    "verifies max thinking emits visible reasoning",
    "verifies max thinking completes the answer",
  ]);
});
it("includes the seeded mock-only broken-turn scenarios in the markdown pack", () => {
const scenarioIds = [
"reasoning-only-recovery-replay-safe-read",