test: update QA parity fixtures for GPT-5.5

This commit is contained in:
Peter Steinberger
2026-04-25 18:05:13 +01:00
parent 39343088ed
commit 6b3e4b88d6
59 changed files with 407 additions and 399 deletions

View File

@@ -24,10 +24,10 @@ codeRefs:
- extensions/qa-lab/src/suite.ts
execution:
kind: flow
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario codex-harness-no-meta-leak`.
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --fast --thinking medium --scenario codex-harness-no-meta-leak`.
config:
requiredProvider: codex
requiredModel: gpt-5.4
requiredModel: gpt-5.5
harnessRuntime: codex
harnessFallback: none
expectedReply: QA_LEAK_OK
@@ -47,7 +47,7 @@ execution:
```yaml qa-flow
steps:
- name: confirms GPT-5.4 Codex harness target
- name: confirms GPT-5.5 Codex harness target
actions:
- set: selected
value:

View File

@@ -1,17 +1,17 @@
# GPT-5.4 thinking visibility switch
# GPT-5.5 thinking visibility switch
```yaml qa-scenario
id: gpt54-thinking-visibility-switch
title: GPT-5.4 thinking visibility switch
id: gpt55-thinking-visibility-switch
title: GPT-5.5 thinking visibility switch
surface: models
coverage:
primary:
- models.thinking
secondary:
- runtime.reasoning-visibility
objective: Verify GPT-5.4 can switch from disabled thinking to medium thinking while reasoning display stays enabled.
objective: Verify GPT-5.5 can switch from disabled thinking to medium thinking while reasoning display stays enabled.
successCriteria:
- Live runs target openai/gpt-5.4, not a mini or pro variant.
- Live runs target openai/gpt-5.5, not a mini or pro variant.
- The session enables reasoning display before the comparison turns.
- The disabled-thinking turn returns its visible marker without a Reasoning-prefixed message.
- The medium-thinking turn returns its visible marker and a separate Reasoning-prefixed message.
@@ -27,10 +27,10 @@ codeRefs:
- extensions/qa-lab/src/providers/mock-openai/server.ts
execution:
kind: flow
summary: Toggle reasoning display and GPT-5.4 thinking between off/none and medium, then verify visible reasoning only on the medium turn.
summary: Toggle reasoning display and GPT-5.5 thinking between off/none and medium, then verify visible reasoning only on the medium turn.
config:
requiredLiveProvider: openai
requiredLiveModel: gpt-5.4
requiredLiveModel: gpt-5.5
offDirective: /think off
maxDirective: /think medium
reasoningDirective: /reasoning on
@@ -60,7 +60,7 @@ steps:
- assert:
expr: "env.providerMode !== 'live-frontier' || (selected?.provider === config.requiredLiveProvider && selected?.model === config.requiredLiveModel)"
message:
expr: "`expected live GPT-5.4, got ${env.primaryModel}`"
expr: "`expected live GPT-5.5, got ${env.primaryModel}`"
- call: state.addInboundMessage
args:
- conversation:
@@ -133,9 +133,9 @@ steps:
value:
expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.offPrompt))"
- assert:
expr: "String(offRequest?.model ?? '').includes('gpt-5.4')"
expr: "String(offRequest?.model ?? '').includes('gpt-5.5')"
message:
expr: "`expected GPT-5.4 off mock request, got ${String(offRequest?.model ?? '')}`"
expr: "`expected GPT-5.5 off mock request, got ${String(offRequest?.model ?? '')}`"
detailsExpr: "`off ack=${offAck.text}; off answer=${offAnswer.text}`"
- name: switches to medium thinking
actions:
@@ -204,8 +204,8 @@ steps:
value:
expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.maxPrompt))"
- assert:
expr: "String(maxRequest?.model ?? '').includes('gpt-5.4')"
expr: "String(maxRequest?.model ?? '').includes('gpt-5.5')"
message:
expr: "`expected GPT-5.4 mock request, got ${String(maxRequest?.model ?? '')}`"
expr: "`expected GPT-5.5 mock request, got ${String(maxRequest?.model ?? '')}`"
detailsExpr: "`answer=${maxAnswer.text}`"
```

View File

@@ -72,8 +72,8 @@ steps:
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && (() => { const lower = normalizeLowercaseStringOrEmpty(candidate.text); return lower.includes('switch') || lower.includes('handoff'); })()).at(-1)"
- expr: resolveQaLiveTurnTimeoutMs(env, 20000, env.alternateModel)
- assert:
expr: "!env.mock || ((await fetchJson(`${env.mock.baseUrl}/debug/last-request`))?.body?.model === 'gpt-5.4-alt')"
expr: "!env.mock || ((await fetchJson(`${env.mock.baseUrl}/debug/last-request`))?.body?.model === 'gpt-5.5-alt')"
message:
expr: "`expected gpt-5.4-alt, got ${String((await fetchJson(`${env.mock.baseUrl}/debug/last-request`))?.body?.model ?? '')}`"
expr: "`expected gpt-5.5-alt, got ${String((await fetchJson(`${env.mock.baseUrl}/debug/last-request`))?.body?.model ?? '')}`"
detailsExpr: outbound.text
```

View File

@@ -12,7 +12,7 @@ coverage:
objective: Verify a live OpenAI GPT model can use OpenAI native web_search when OpenClaw web search is enabled in auto mode.
successCriteria:
- A live-frontier run fails fast unless the selected primary provider is openai.
- The selected primary model is GPT-5.4, not a mini or pro variant.
- The selected primary model is GPT-5.5, not a mini or pro variant.
- Web search is enabled without pinning a managed web_search provider.
- The live reply includes the required marker plus an official OpenAI News URL and headline found through web search.
gatewayConfigPatch:
@@ -32,10 +32,10 @@ codeRefs:
- extensions/qa-lab/src/suite.ts
execution:
kind: flow
summary: Run with `OPENCLAW_LIVE_OPENAI_KEY="${OPENAI_API_KEY}" pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario openai-native-web-search-live`.
summary: Run with `OPENCLAW_LIVE_OPENAI_KEY="${OPENAI_API_KEY}" pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --fast --thinking medium --scenario openai-native-web-search-live`.
config:
requiredProvider: openai
requiredModel: gpt-5.4
requiredModel: gpt-5.5
expectedMarker: WEB-SEARCH-OK
failureMarker: WEB-SEARCH-FAILED
searchPrompt: |-
@@ -49,7 +49,7 @@ execution:
```yaml qa-flow
steps:
- name: confirms live OpenAI GPT-5.4 web search auto mode
- name: confirms live OpenAI GPT-5.5 web search auto mode
- name: confirms live OpenAI GPT-5.5 web search auto mode
actions:
- call: waitForGatewayHealthy
args:

View File

@@ -21,8 +21,8 @@ gatewayConfigPatch:
params: {}
successCriteria:
- Anthropic Claude Sonnet 4.6 advertises adaptive but not OpenAI-only xhigh or Opus max.
- A stored adaptive level remaps to medium when switching to OpenAI GPT-5.4.
- OpenAI GPT-5.4 advertises xhigh but not adaptive or max.
- A stored adaptive level remaps to medium when switching to OpenAI GPT-5.5.
- OpenAI GPT-5.5 advertises xhigh but not adaptive or max.
- A stored xhigh level remaps to high when switching to an Anthropic model without xhigh support.
docsRefs:
- docs/tools/thinking.md
@@ -41,7 +41,7 @@ execution:
config:
requiredProviderMode: live-frontier
anthropicModelRef: anthropic/claude-sonnet-4-6
openAiXhighModelRef: openai/gpt-5.4
openAiXhighModelRef: openai/gpt-5.5
noXhighModelRef: anthropic/claude-sonnet-4-6
conversationId: thinking-slash-remap
sessionKey: agent:qa:main
@@ -142,7 +142,7 @@ steps:
- assert:
expr: "/Options: .*\\bxhigh\\b/i.test(openAiThinkStatus.text) && !/Options: .*\\badaptive\\b/i.test(openAiThinkStatus.text) && !/Options: .*\\bmax\\b/i.test(openAiThinkStatus.text)"
message:
expr: "`expected OpenAI GPT-5.4 /think options to include xhigh only, got ${openAiThinkStatus.text}`"
expr: "`expected OpenAI GPT-5.5 /think options to include xhigh only, got ${openAiThinkStatus.text}`"
detailsExpr: "`adaptive=${adaptiveAck.text}; switch=${JSON.stringify(openAiModelAck.resolved)}; think=${openAiThinkStatus.text}`"
- name: maps xhigh to high on a model without xhigh
actions:

View File

@@ -17,7 +17,7 @@ successCriteria:
- Scenario details preserve the observed compaction count for review context.
docsRefs:
- docs/help/testing.md
- docs/help/gpt54-codex-agentic-parity.md
- docs/help/gpt55-codex-agentic-parity.md
codeRefs:
- extensions/qa-lab/src/suite.ts
- extensions/qa-lab/src/mock-openai-server.ts

View File

@@ -17,7 +17,7 @@ successCriteria:
- Mock trace stops after the write-side reasoning-only terminal turn instead of attempting a continuation.
docsRefs:
- docs/help/testing.md
- docs/help/gpt54-codex-agentic-parity.md
- docs/help/gpt55-codex-agentic-parity.md
codeRefs:
- extensions/qa-lab/src/mock-openai-server.ts
- src/agents/pi-embedded-runner/run/incomplete-turn.ts

View File

@@ -11,7 +11,7 @@ coverage:
- models.codex-cli
objective: Verify the Codex app-server harness can plan and build a medium-complex self-contained browser game.
successCriteria:
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.4 with the Codex harness forced.
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.5 with the Codex harness forced.
- The scenario forces the Codex embedded harness and disables PI fallback.
- The prompt explicitly asks the agent to enter plan mode before editing.
- The agent writes a self-contained HTML game with a canvas loop, controls, scoring, waves, pause, and restart.
@@ -25,10 +25,10 @@ codeRefs:
- extensions/qa-lab/src/suite.ts
execution:
kind: flow
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario medium-game-plan-codex-harness`.
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --fast --thinking medium --scenario medium-game-plan-codex-harness`.
config:
requiredProvider: codex
requiredModel: gpt-5.4
requiredModel: gpt-5.5
harnessRuntime: codex
harnessFallback: none
artifactFile: star-garden-defenders-codex.html
@@ -52,7 +52,7 @@ execution:
```yaml qa-flow
steps:
- name: confirms GPT-5.4 Codex harness target
- name: confirms GPT-5.5 Codex harness target
actions:
- set: selected
value:

View File

@@ -9,9 +9,9 @@ coverage:
- workspace.planning
secondary:
- agents.pi-harness
objective: Verify GPT-5.4 can use the PI harness to plan and build a medium-complex self-contained browser game.
objective: Verify GPT-5.5 can use the PI harness to plan and build a medium-complex self-contained browser game.
successCriteria:
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.4.
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.5.
- The scenario forces the embedded PI harness before the build turn.
- The prompt explicitly asks the agent to enter plan mode before editing.
- The agent writes a self-contained HTML game with a canvas loop, controls, scoring, waves, pause, and restart.
@@ -25,10 +25,10 @@ codeRefs:
- extensions/qa-lab/src/suite.ts
execution:
kind: flow
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario medium-game-plan-pi-harness`.
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --fast --thinking medium --scenario medium-game-plan-pi-harness`.
config:
requiredProvider: openai
requiredModel: gpt-5.4
requiredModel: gpt-5.5
harnessRuntime: pi
harnessFallback: pi
artifactFile: star-garden-defenders-pi.html
@@ -52,7 +52,7 @@ execution:
```yaml qa-flow
steps:
- name: confirms GPT-5.4 PI harness target
- name: confirms GPT-5.5 PI harness target
actions:
- set: selected
value: