test: update QA parity fixtures for GPT-5.5

This commit is contained in:
Peter Steinberger
2026-04-25 18:05:13 +01:00
parent 39343088ed
commit 6b3e4b88d6
59 changed files with 407 additions and 399 deletions

View File

@@ -24,10 +24,10 @@ codeRefs:
- extensions/qa-lab/src/suite.ts
execution:
kind: flow
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario codex-harness-no-meta-leak`.
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --fast --thinking medium --scenario codex-harness-no-meta-leak`.
config:
requiredProvider: codex
requiredModel: gpt-5.4
requiredModel: gpt-5.5
harnessRuntime: codex
harnessFallback: none
expectedReply: QA_LEAK_OK
@@ -47,7 +47,7 @@ execution:
```yaml qa-flow
steps:
- name: confirms GPT-5.4 Codex harness target
- name: confirms GPT-5.5 Codex harness target
actions:
- set: selected
value:

View File

@@ -1,17 +1,17 @@
# GPT-5.4 thinking visibility switch
# GPT-5.5 thinking visibility switch
```yaml qa-scenario
id: gpt54-thinking-visibility-switch
title: GPT-5.4 thinking visibility switch
id: gpt55-thinking-visibility-switch
title: GPT-5.5 thinking visibility switch
surface: models
coverage:
primary:
- models.thinking
secondary:
- runtime.reasoning-visibility
objective: Verify GPT-5.4 can switch from disabled thinking to medium thinking while reasoning display stays enabled.
objective: Verify GPT-5.5 can switch from disabled thinking to medium thinking while reasoning display stays enabled.
successCriteria:
- Live runs target openai/gpt-5.4, not a mini or pro variant.
- Live runs target openai/gpt-5.5, not a mini or pro variant.
- The session enables reasoning display before the comparison turns.
- The disabled-thinking turn returns its visible marker without a Reasoning-prefixed message.
- The medium-thinking turn returns its visible marker and a separate Reasoning-prefixed message.
@@ -27,10 +27,10 @@ codeRefs:
- extensions/qa-lab/src/providers/mock-openai/server.ts
execution:
kind: flow
summary: Toggle reasoning display and GPT-5.4 thinking between off/none and medium, then verify visible reasoning only on the medium turn.
summary: Toggle reasoning display and GPT-5.5 thinking between off/none and medium, then verify visible reasoning only on the medium turn.
config:
requiredLiveProvider: openai
requiredLiveModel: gpt-5.4
requiredLiveModel: gpt-5.5
offDirective: /think off
maxDirective: /think medium
reasoningDirective: /reasoning on
@@ -60,7 +60,7 @@ steps:
- assert:
expr: "env.providerMode !== 'live-frontier' || (selected?.provider === config.requiredLiveProvider && selected?.model === config.requiredLiveModel)"
message:
expr: "`expected live GPT-5.4, got ${env.primaryModel}`"
expr: "`expected live GPT-5.5, got ${env.primaryModel}`"
- call: state.addInboundMessage
args:
- conversation:
@@ -133,9 +133,9 @@ steps:
value:
expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.offPrompt))"
- assert:
expr: "String(offRequest?.model ?? '').includes('gpt-5.4')"
expr: "String(offRequest?.model ?? '').includes('gpt-5.5')"
message:
expr: "`expected GPT-5.4 off mock request, got ${String(offRequest?.model ?? '')}`"
expr: "`expected GPT-5.5 off mock request, got ${String(offRequest?.model ?? '')}`"
detailsExpr: "`off ack=${offAck.text}; off answer=${offAnswer.text}`"
- name: switches to medium thinking
actions:
@@ -204,8 +204,8 @@ steps:
value:
expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.maxPrompt))"
- assert:
expr: "String(maxRequest?.model ?? '').includes('gpt-5.4')"
expr: "String(maxRequest?.model ?? '').includes('gpt-5.5')"
message:
expr: "`expected GPT-5.4 mock request, got ${String(maxRequest?.model ?? '')}`"
expr: "`expected GPT-5.5 mock request, got ${String(maxRequest?.model ?? '')}`"
detailsExpr: "`answer=${maxAnswer.text}`"
```

View File

@@ -72,8 +72,8 @@ steps:
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && (() => { const lower = normalizeLowercaseStringOrEmpty(candidate.text); return lower.includes('switch') || lower.includes('handoff'); })()).at(-1)"
- expr: resolveQaLiveTurnTimeoutMs(env, 20000, env.alternateModel)
- assert:
expr: "!env.mock || ((await fetchJson(`${env.mock.baseUrl}/debug/last-request`))?.body?.model === 'gpt-5.4-alt')"
expr: "!env.mock || ((await fetchJson(`${env.mock.baseUrl}/debug/last-request`))?.body?.model === 'gpt-5.5-alt')"
message:
expr: "`expected gpt-5.4-alt, got ${String((await fetchJson(`${env.mock.baseUrl}/debug/last-request`))?.body?.model ?? '')}`"
expr: "`expected gpt-5.5-alt, got ${String((await fetchJson(`${env.mock.baseUrl}/debug/last-request`))?.body?.model ?? '')}`"
detailsExpr: outbound.text
```

View File

@@ -12,7 +12,7 @@ coverage:
objective: Verify a live OpenAI GPT model can use OpenAI native web_search when OpenClaw web search is enabled in auto mode.
successCriteria:
- A live-frontier run fails fast unless the selected primary provider is openai.
- The selected primary model is GPT-5.4, not a mini or pro variant.
- The selected primary model is GPT-5.5, not a mini or pro variant.
- Web search is enabled without pinning a managed web_search provider.
- The live reply includes the required marker plus an official OpenAI News URL and headline found through web search.
gatewayConfigPatch:
@@ -32,10 +32,10 @@ codeRefs:
- extensions/qa-lab/src/suite.ts
execution:
kind: flow
summary: Run with `OPENCLAW_LIVE_OPENAI_KEY="${OPENAI_API_KEY}" pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario openai-native-web-search-live`.
summary: Run with `OPENCLAW_LIVE_OPENAI_KEY="${OPENAI_API_KEY}" pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --fast --thinking medium --scenario openai-native-web-search-live`.
config:
requiredProvider: openai
requiredModel: gpt-5.4
requiredModel: gpt-5.5
expectedMarker: WEB-SEARCH-OK
failureMarker: WEB-SEARCH-FAILED
searchPrompt: |-
@@ -49,7 +49,7 @@ execution:
```yaml qa-flow
steps:
- name: confirms live OpenAI GPT-5.4 web search auto mode
- name: confirms live OpenAI GPT-5.5 web search auto mode
- name: confirms live OpenAI GPT-5.5 web search auto mode
actions:
- call: waitForGatewayHealthy
args:

View File

@@ -21,8 +21,8 @@ gatewayConfigPatch:
params: {}
successCriteria:
- Anthropic Claude Sonnet 4.6 advertises adaptive but not OpenAI-only xhigh or Opus max.
- A stored adaptive level remaps to medium when switching to OpenAI GPT-5.4.
- OpenAI GPT-5.4 advertises xhigh but not adaptive or max.
- A stored adaptive level remaps to medium when switching to OpenAI GPT-5.5.
- OpenAI GPT-5.5 advertises xhigh but not adaptive or max.
- A stored xhigh level remaps to high when switching to an Anthropic model without xhigh support.
docsRefs:
- docs/tools/thinking.md
@@ -41,7 +41,7 @@ execution:
config:
requiredProviderMode: live-frontier
anthropicModelRef: anthropic/claude-sonnet-4-6
openAiXhighModelRef: openai/gpt-5.4
openAiXhighModelRef: openai/gpt-5.5
noXhighModelRef: anthropic/claude-sonnet-4-6
conversationId: thinking-slash-remap
sessionKey: agent:qa:main
@@ -142,7 +142,7 @@ steps:
- assert:
expr: "/Options: .*\\bxhigh\\b/i.test(openAiThinkStatus.text) && !/Options: .*\\badaptive\\b/i.test(openAiThinkStatus.text) && !/Options: .*\\bmax\\b/i.test(openAiThinkStatus.text)"
message:
expr: "`expected OpenAI GPT-5.4 /think options to include xhigh only, got ${openAiThinkStatus.text}`"
expr: "`expected OpenAI GPT-5.5 /think options to include xhigh only, got ${openAiThinkStatus.text}`"
detailsExpr: "`adaptive=${adaptiveAck.text}; switch=${JSON.stringify(openAiModelAck.resolved)}; think=${openAiThinkStatus.text}`"
- name: maps xhigh to high on a model without xhigh
actions:

View File

@@ -17,7 +17,7 @@ successCriteria:
- Scenario details preserve the observed compaction count for review context.
docsRefs:
- docs/help/testing.md
- docs/help/gpt54-codex-agentic-parity.md
- docs/help/gpt55-codex-agentic-parity.md
codeRefs:
- extensions/qa-lab/src/suite.ts
- extensions/qa-lab/src/mock-openai-server.ts

View File

@@ -17,7 +17,7 @@ successCriteria:
- Mock trace stops after the write-side reasoning-only terminal turn instead of attempting a continuation.
docsRefs:
- docs/help/testing.md
- docs/help/gpt54-codex-agentic-parity.md
- docs/help/gpt55-codex-agentic-parity.md
codeRefs:
- extensions/qa-lab/src/mock-openai-server.ts
- src/agents/pi-embedded-runner/run/incomplete-turn.ts

View File

@@ -11,7 +11,7 @@ coverage:
- models.codex-cli
objective: Verify the Codex app-server harness can plan and build a medium-complex self-contained browser game.
successCriteria:
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.4 with the Codex harness forced.
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.5 with the Codex harness forced.
- The scenario forces the Codex embedded harness and disables PI fallback.
- The prompt explicitly asks the agent to enter plan mode before editing.
- The agent writes a self-contained HTML game with a canvas loop, controls, scoring, waves, pause, and restart.
@@ -25,10 +25,10 @@ codeRefs:
- extensions/qa-lab/src/suite.ts
execution:
kind: flow
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario medium-game-plan-codex-harness`.
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --fast --thinking medium --scenario medium-game-plan-codex-harness`.
config:
requiredProvider: codex
requiredModel: gpt-5.4
requiredModel: gpt-5.5
harnessRuntime: codex
harnessFallback: none
artifactFile: star-garden-defenders-codex.html
@@ -52,7 +52,7 @@ execution:
```yaml qa-flow
steps:
- name: confirms GPT-5.4 Codex harness target
- name: confirms GPT-5.5 Codex harness target
actions:
- set: selected
value:

View File

@@ -9,9 +9,9 @@ coverage:
- workspace.planning
secondary:
- agents.pi-harness
objective: Verify GPT-5.4 can use the PI harness to plan and build a medium-complex self-contained browser game.
objective: Verify GPT-5.5 can use the PI harness to plan and build a medium-complex self-contained browser game.
successCriteria:
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.4.
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.5.
- The scenario forces the embedded PI harness before the build turn.
- The prompt explicitly asks the agent to enter plan mode before editing.
- The agent writes a self-contained HTML game with a canvas loop, controls, scoring, waves, pause, and restart.
@@ -25,10 +25,10 @@ codeRefs:
- extensions/qa-lab/src/suite.ts
execution:
kind: flow
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario medium-game-plan-pi-harness`.
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --fast --thinking medium --scenario medium-game-plan-pi-harness`.
config:
requiredProvider: openai
requiredModel: gpt-5.4
requiredModel: gpt-5.5
harnessRuntime: pi
harnessFallback: pi
artifactFile: star-garden-defenders-pi.html
@@ -52,7 +52,7 @@ execution:
```yaml qa-flow
steps:
- name: confirms GPT-5.4 PI harness target
- name: confirms GPT-5.5 PI harness target
actions:
- set: selected
value: