mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 18:40:44 +00:00
test: add gpt-5.4 thinking visibility QA
This commit is contained in:
@@ -8,6 +8,10 @@ const QA_REASONING_ONLY_RECOVERY_PROMPT =
|
||||
"Reasoning-only continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly REASONING-RECOVERED-OK.";
|
||||
// Prompt for the reasoning-only side-effect check: asks for a file write and then the
// exact sentinel SIDE-EFFECT-GUARD-OK.
const QA_REASONING_ONLY_SIDE_EFFECT_PROMPT =
|
||||
"Reasoning-only after write safety check: write reasoning-only-side-effect.txt, then answer with exactly SIDE-EFFECT-GUARD-OK.";
|
||||
// Prompt for the thinking-off case; the test in this file expects a single message
// output whose text is THINKING-OFF-OK.
const QA_THINKING_VISIBILITY_OFF_PROMPT =
|
||||
"QA thinking visibility check off: answer exactly THINKING-OFF-OK.";
|
||||
// Prompt for the max-thinking case; the test in this file expects a reasoning output
// item followed by a message output whose text is THINKING-MAX-OK.
const QA_THINKING_VISIBILITY_MAX_PROMPT =
|
||||
"QA thinking visibility check max: verify 17+24=41 internally, then answer exactly THINKING-MAX-OK.";
|
||||
// Prompt for the empty-response recovery check: asks for a read of QA_KICKOFF_TASK.md
// and then the exact sentinel EMPTY-RECOVERED-OK.
const QA_EMPTY_RESPONSE_RECOVERY_PROMPT =
|
||||
"Empty response continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly EMPTY-RECOVERED-OK.";
|
||||
const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT =
|
||||
@@ -2049,6 +2053,54 @@ describe("qa mock openai server", () => {
|
||||
]);
|
||||
});
|
||||
|
||||
// Sends the thinking-off and thinking-max prompts to the mock server (non-streaming,
// model gpt-5.4) and checks the scripted output items each one produces.
it("scripts the GPT-5.4 thinking visibility switch prompts", async () => {
  type MessageOnlyResponse = {
    output?: Array<{ type?: string; content?: Array<{ text?: string }> }>;
  };
  type ReasoningThenMessageResponse = {
    output?: Array<{
      type?: string;
      id?: string;
      summary?: Array<{ text?: string }>;
      content?: Array<{ text?: string }>;
    }>;
  };

  const server = await startMockServer();

  // Both requests differ only in the prompt text.
  const requestFor = (promptText: string) => ({
    stream: false,
    model: "gpt-5.4",
    input: [makeUserInput(promptText)],
  });

  // Thinking off: only the final assistant message is expected.
  const offResponse = await expectResponsesJson<MessageOnlyResponse>(
    server,
    requestFor(QA_THINKING_VISIBILITY_OFF_PROMPT),
  );
  expect(offResponse).toMatchObject({
    output: [
      {
        type: "message",
        content: [{ text: "THINKING-OFF-OK" }],
      },
    ],
  });

  // Thinking max: a reasoning item with an empty summary precedes the message.
  const maxResponse = await expectResponsesJson<ReasoningThenMessageResponse>(
    server,
    requestFor(QA_THINKING_VISIBILITY_MAX_PROMPT),
  );
  expect(maxResponse).toMatchObject({
    output: [
      {
        type: "reasoning",
        id: "rs_mock_thinking_visibility_max",
        summary: [],
      },
      {
        type: "message",
        content: [{ text: "THINKING-MAX-OK" }],
      },
    ],
  });
});
|
||||
|
||||
it("keeps the reasoning-only side-effect path ready for no-auto-retry QA coverage", async () => {
|
||||
const server = await startMockServer();
|
||||
|
||||
|
||||
@@ -140,6 +140,8 @@ const TINY_PNG_BASE64 =
|
||||
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO7Z0nQAAAAASUVORK5CYII=";
|
||||
// Case-insensitive patterns that recognise the scripted QA prompts by their leading phrase.
const QA_REASONING_ONLY_RECOVERY_PROMPT_RE = /reasoning-only continuation qa check/i;
|
||||
const QA_REASONING_ONLY_SIDE_EFFECT_PROMPT_RE = /reasoning-only after write safety check/i;
|
||||
// Thinking visibility "off" prompt: buildResponsesPayload answers it with THINKING-OFF-OK.
const QA_THINKING_VISIBILITY_OFF_PROMPT_RE = /qa thinking visibility check off/i;
|
||||
// Thinking visibility "max" prompt: buildResponsesPayload answers it with a reasoning
// item followed by THINKING-MAX-OK.
const QA_THINKING_VISIBILITY_MAX_PROMPT_RE = /qa thinking visibility check max/i;
|
||||
// Empty-response recovery prompt: buildResponsesPayload tests it against allInputText.
const QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE = /empty response continuation qa check/i;
|
||||
const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT_RE = /empty response exhaustion qa check/i;
|
||||
const QA_QUIET_STREAMING_PROMPT_RE = /quiet streaming qa check/i;
|
||||
@@ -924,6 +926,61 @@ function buildReasoningOnlyEvents(summaryText: string, id: string): StreamEvent[
|
||||
];
|
||||
}
|
||||
|
||||
/**
 * Builds the scripted event sequence for a turn that emits a reasoning item with an
 * empty summary followed by a final-answer assistant message.
 *
 * The sequence is: reasoning item added, reasoning item done, message item added
 * (in progress, empty content), message item done, then `response.completed` carrying
 * both finished items and fixed usage numbers.
 *
 * @param params.reasoningId Id of the reasoning item; also used to derive the response id (`resp_<reasoningId>`).
 * @param params.answerText Text passed to `buildAssistantOutputItem` for the final answer.
 * @param params.answerId Optional message id; falls back to `"msg_mock_reasoned_answer"`.
 * @returns The five events in emission order.
 */
function buildReasoningAndAssistantEvents(params: {
  reasoningId: string;
  answerText: string;
  answerId?: string;
}): StreamEvent[] {
  const { reasoningId, answerText } = params;

  // Finished output items; reused by the "done" events and the completed response.
  const completedReasoning = {
    type: "reasoning",
    id: reasoningId,
    summary: [],
  } as const;
  const completedAnswer = buildAssistantOutputItem({
    id: params.answerId ?? "msg_mock_reasoned_answer",
    phase: "final_answer",
    text: answerText,
  });

  const reasoningAdded: StreamEvent = {
    type: "response.output_item.added",
    item: {
      type: "reasoning",
      id: reasoningId,
      summary: [],
    },
  };
  const reasoningDone: StreamEvent = {
    type: "response.output_item.done",
    item: completedReasoning,
  };
  // The message is announced with no content yet and an in-progress status.
  const answerAdded: StreamEvent = {
    type: "response.output_item.added",
    item: {
      type: "message",
      id: completedAnswer.id,
      role: "assistant",
      phase: "final_answer",
      content: [],
      status: "in_progress",
    },
  };
  const answerDone: StreamEvent = {
    type: "response.output_item.done",
    item: completedAnswer,
  };
  const responseCompleted: StreamEvent = {
    type: "response.completed",
    response: {
      id: `resp_${reasoningId}`,
      status: "completed",
      output: [completedReasoning, completedAnswer],
      usage: { input_tokens: 64, output_tokens: 16, total_tokens: 80 },
    },
  };

  return [reasoningAdded, reasoningDone, answerAdded, answerDone, responseCompleted];
}
|
||||
|
||||
async function buildResponsesPayload(
|
||||
body: Record<string, unknown>,
|
||||
scenarioState: MockScenarioState,
|
||||
@@ -981,6 +1038,15 @@ async function buildResponsesPayload(
|
||||
}
|
||||
return buildAssistantEvents("BUG-SHOULD-NOT-AUTO-RETRY");
|
||||
}
|
||||
if (QA_THINKING_VISIBILITY_MAX_PROMPT_RE.test(prompt)) {
|
||||
return buildReasoningAndAssistantEvents({
|
||||
reasoningId: "rs_mock_thinking_visibility_max",
|
||||
answerText: "THINKING-MAX-OK",
|
||||
});
|
||||
}
|
||||
if (QA_THINKING_VISIBILITY_OFF_PROMPT_RE.test(prompt)) {
|
||||
return buildAssistantEvents("THINKING-OFF-OK");
|
||||
}
|
||||
if (QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE.test(allInputText)) {
|
||||
if (!toolOutput) {
|
||||
return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });
|
||||
|
||||
@@ -123,6 +123,32 @@ describe("qa scenario catalog", () => {
|
||||
);
|
||||
});
|
||||
|
||||
// Checks that the catalog exposes the GPT-5.4 thinking visibility scenario with the
// expected source path, execution config values and flow step names.
it("includes the GPT-5.4 thinking visibility switch scenario", () => {
  type ThinkingVisibilityConfig = {
    requiredLiveProvider?: string;
    requiredLiveModel?: string;
    offDirective?: string;
    maxDirective?: string;
    reasoningDirective?: string;
  };

  const scenarioId = "gpt54-thinking-visibility-switch";
  const scenario = readQaScenarioById(scenarioId);
  const config = readQaScenarioExecutionConfig(scenarioId) as
    | ThinkingVisibilityConfig
    | undefined;

  expect(scenario.sourcePath).toBe("qa/scenarios/models/gpt54-thinking-visibility-switch.md");

  // Live-run requirements and the directives the scenario sends.
  expect(config?.requiredLiveProvider).toBe("openai");
  expect(config?.requiredLiveModel).toBe("gpt-5.4");
  expect(config?.offDirective).toBe("/think off");
  expect(config?.maxDirective).toBe("/think max");
  expect(config?.reasoningDirective).toBe("/reasoning on");

  const stepNames = scenario.execution.flow?.steps.map((step) => step.name);
  expect(stepNames).toEqual([
    "enables reasoning display and disables thinking",
    "switches to max thinking",
    "verifies max thinking emits visible reasoning",
    "verifies max thinking completes the answer",
  ]);
});
|
||||
|
||||
it("includes the seeded mock-only broken-turn scenarios in the markdown pack", () => {
|
||||
const scenarioIds = [
|
||||
"reasoning-only-recovery-replay-safe-read",
|
||||
|
||||
Reference in New Issue
Block a user