test(openai): cover GPT-5.5 defaults

2026-05-06 21:40:44 +00:00 · 2026-04-23 20:00:51 +01:00
parent a36903b94c
commit cd5bc2fc93
65 changed files with 437 additions and 181 deletions
--- a/qa/frontier-harness-plan.md
+++ b/qa/frontier-harness-plan.md
@@ -37,8 +37,8 @@ GPT baseline:
 ```bash
 pnpm openclaw qa suite \
  --provider-mode live-frontier \
-  --model openai/gpt-5.4 \
-  --alt-model openai/gpt-5.4 \
+  --model openai/gpt-5.5 \
+  --alt-model openai/gpt-5.5 \
  --fast \
  --scenario approval-turn-tool-followthrough \
  --scenario model-switch-tool-continuity \
@@ -104,8 +104,8 @@ GPT manual lane:
 ```bash
 pnpm openclaw qa manual \
  --provider-mode live-frontier \
-  --model openai/gpt-5.4 \
-  --alt-model openai/gpt-5.4 \
+  --model openai/gpt-5.5 \
+  --alt-model openai/gpt-5.5 \
  --fast \
  --message "read QA_KICKOFF_TASK.md, tell me what feels half-baked about this qa mission, and keep it to two short sentences"
 ```
--- a/qa/scenarios/models/codex-harness-no-meta-leak.md
+++ b/qa/scenarios/models/codex-harness-no-meta-leak.md
@@ -24,10 +24,10 @@ codeRefs:
  - extensions/qa-lab/src/suite.ts
 execution:
  kind: flow
-  summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model codex/gpt-5.4 --alt-model codex/gpt-5.4 --scenario codex-harness-no-meta-leak`.
+  summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model codex/gpt-5.5 --alt-model codex/gpt-5.5 --scenario codex-harness-no-meta-leak`.
  config:
    requiredProvider: codex
-    requiredModel: gpt-5.4
+    requiredModel: gpt-5.5
    harnessRuntime: codex
    harnessFallback: none
    expectedReply: QA_LEAK_OK
@@ -47,7 +47,7 @@ execution:

 ```yaml qa-flow
 steps:
-  - name: confirms GPT-5.4 Codex harness target
+  - name: confirms GPT-5.5 Codex harness target
    actions:
      - set: selected
        value:
--- a/qa/scenarios/models/gpt54-thinking-visibility-switch.md
+++ b/qa/scenarios/models/gpt54-thinking-visibility-switch.md
@@ -1,17 +1,17 @@
-# GPT-5.4 thinking visibility switch
+# GPT-5.5 thinking visibility switch

 ```yaml qa-scenario
 id: gpt54-thinking-visibility-switch
-title: GPT-5.4 thinking visibility switch
+title: GPT-5.5 thinking visibility switch
 surface: models
 coverage:
  primary:
    - models.thinking
  secondary:
    - runtime.reasoning-visibility
-objective: Verify GPT-5.4 can switch from disabled thinking to max thinking while reasoning display stays enabled.
+objective: Verify GPT-5.5 can switch from disabled thinking to max thinking while reasoning display stays enabled.
 successCriteria:
-  - Live runs target openai/gpt-5.4, not a mini or pro variant.
+  - Live runs target openai/gpt-5.5, not a mini or pro variant.
  - The session enables reasoning display before the comparison turns.
  - The disabled-thinking turn returns its visible marker without a Reasoning-prefixed message.
  - The max-thinking turn returns its visible marker and a separate Reasoning-prefixed message.
@@ -27,10 +27,10 @@ codeRefs:
  - extensions/qa-lab/src/providers/mock-openai/server.ts
 execution:
  kind: flow
-  summary: Toggle reasoning display and GPT-5.4 thinking between off/none and max/high, then verify visible reasoning only on the max turn.
+  summary: Toggle reasoning display and GPT-5.5 thinking between off/none and max/high, then verify visible reasoning only on the max turn.
  config:
    requiredLiveProvider: openai
-    requiredLiveModel: gpt-5.4
+    requiredLiveModel: gpt-5.5
    offDirective: /think off
    maxDirective: /think max
    reasoningDirective: /reasoning on
@@ -60,7 +60,7 @@ steps:
      - assert:
          expr: "env.providerMode !== 'live-frontier' || (selected?.provider === config.requiredLiveProvider && selected?.model === config.requiredLiveModel)"
          message:
-            expr: "`expected live GPT-5.4, got ${env.primaryModel}`"
+            expr: "`expected live GPT-5.5, got ${env.primaryModel}`"
      - call: state.addInboundMessage
        args:
          - conversation:
@@ -133,9 +133,9 @@ steps:
              value:
                expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.offPrompt))"
            - assert:
-                expr: "String(offRequest?.model ?? '').includes('gpt-5.4')"
+                expr: "String(offRequest?.model ?? '').includes('gpt-5.5')"
                message:
-                  expr: "`expected GPT-5.4 off mock request, got ${String(offRequest?.model ?? '')}`"
+                  expr: "`expected GPT-5.5 off mock request, got ${String(offRequest?.model ?? '')}`"
    detailsExpr: "`off ack=${offAck.text}; off answer=${offAnswer.text}`"
  - name: switches to max thinking
    actions:
@@ -204,8 +204,8 @@ steps:
              value:
                expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.maxPrompt))"
            - assert:
-                expr: "String(maxRequest?.model ?? '').includes('gpt-5.4')"
+                expr: "String(maxRequest?.model ?? '').includes('gpt-5.5')"
                message:
-                  expr: "`expected GPT-5.4 mock request, got ${String(maxRequest?.model ?? '')}`"
+                  expr: "`expected GPT-5.5 mock request, got ${String(maxRequest?.model ?? '')}`"
    detailsExpr: "`answer=${maxAnswer.text}`"
 ```
--- a/qa/scenarios/models/openai-native-web-search-live.md
+++ b/qa/scenarios/models/openai-native-web-search-live.md
@@ -12,7 +12,7 @@ coverage:
 objective: Verify a live OpenAI GPT model can use OpenAI native web_search when OpenClaw web search is enabled in auto mode.
 successCriteria:
  - A live-frontier run fails fast unless the selected primary provider is openai.
-  - The selected primary model is GPT-5.4, not a mini or pro variant.
+  - The selected primary model is GPT-5.5, not a mini or pro variant.
  - Web search is enabled without pinning a managed web_search provider.
  - The live reply includes the required marker plus an official OpenAI News URL and headline found through web search.
 gatewayConfigPatch:
@@ -32,10 +32,10 @@ codeRefs:
  - extensions/qa-lab/src/suite.ts
 execution:
  kind: flow
-  summary: Run with `OPENCLAW_LIVE_OPENAI_KEY="${OPENAI_API_KEY}" pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --scenario openai-native-web-search-live`.
+  summary: Run with `OPENCLAW_LIVE_OPENAI_KEY="${OPENAI_API_KEY}" pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --scenario openai-native-web-search-live`.
  config:
    requiredProvider: openai
-    requiredModel: gpt-5.4
+    requiredModel: gpt-5.5
    expectedMarker: WEB-SEARCH-OK
    failureMarker: WEB-SEARCH-FAILED
    searchPrompt: |-
@@ -49,7 +49,7 @@ execution:

 ```yaml qa-flow
 steps:
-  - name: confirms live OpenAI GPT-5.4 web search auto mode
+  - name: confirms live OpenAI GPT-5.5 web search auto mode
    actions:
      - call: waitForGatewayHealthy
        args:
--- a/qa/scenarios/models/thinking-slash-model-remap.md
+++ b/qa/scenarios/models/thinking-slash-model-remap.md
@@ -13,8 +13,8 @@ coverage:
 objective: Verify /think lists provider-owned levels and remaps stored thinking levels when /model changes provider capabilities.
 successCriteria:
  - Anthropic Claude Sonnet 4.6 advertises adaptive but not OpenAI-only xhigh or Opus max.
-  - A stored adaptive level remaps to medium when switching to OpenAI GPT-5.4.
-  - OpenAI GPT-5.4 advertises xhigh but not adaptive or max.
+  - A stored adaptive level remaps to medium when switching to OpenAI GPT-5.5.
+  - OpenAI GPT-5.5 advertises xhigh but not adaptive or max.
  - A stored xhigh level remaps to high when switching to an Anthropic model without xhigh support.
 docsRefs:
  - docs/tools/thinking.md
@@ -33,7 +33,7 @@ execution:
  config:
    requiredProviderMode: live-frontier
    anthropicModelRef: anthropic/claude-sonnet-4-6
-    openAiXhighModelRef: openai/gpt-5.4
+    openAiXhighModelRef: openai/gpt-5.5
    noXhighModelRef: anthropic/claude-sonnet-4-6
    conversationId: qa-thinking-slash-remap
 ```
@@ -165,7 +165,7 @@ steps:
      - assert:
          expr: "/Options: .*\\bxhigh\\b/i.test(openAiThinkStatus.text) && !/Options: .*\\badaptive\\b/i.test(openAiThinkStatus.text) && !/Options: .*\\bmax\\b/i.test(openAiThinkStatus.text)"
          message:
-            expr: "`expected OpenAI GPT-5.4 /think options to include xhigh only, got ${openAiThinkStatus.text}`"
+            expr: "`expected OpenAI GPT-5.5 /think options to include xhigh only, got ${openAiThinkStatus.text}`"
    detailsExpr: "`adaptive=${adaptiveAck.text}; switch=${openAiModelAck.text}; think=${openAiThinkStatus.text}`"
  - name: maps xhigh to high on a model without xhigh
    actions:
--- a/qa/scenarios/workspace/medium-game-plan-codex-harness.md
+++ b/qa/scenarios/workspace/medium-game-plan-codex-harness.md
@@ -11,7 +11,7 @@ coverage:
    - models.codex-cli
 objective: Verify the Codex app-server harness can plan and build a medium-complex self-contained browser game.
 successCriteria:
-  - A live-frontier run fails fast unless the selected primary model is codex/gpt-5.4.
+  - A live-frontier run fails fast unless the selected primary model is codex/gpt-5.5.
  - The scenario forces the Codex embedded harness and disables PI fallback.
  - The prompt explicitly asks the agent to enter plan mode before editing.
  - The agent writes a self-contained HTML game with a canvas loop, controls, scoring, waves, pause, and restart.
@@ -25,10 +25,10 @@ codeRefs:
  - extensions/qa-lab/src/suite.ts
 execution:
  kind: flow
-  summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model codex/gpt-5.4 --alt-model codex/gpt-5.4 --scenario medium-game-plan-codex-harness`.
+  summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model codex/gpt-5.5 --alt-model codex/gpt-5.5 --scenario medium-game-plan-codex-harness`.
  config:
    requiredProvider: codex
-    requiredModel: gpt-5.4
+    requiredModel: gpt-5.5
    harnessRuntime: codex
    harnessFallback: none
    artifactFile: star-garden-defenders-codex.html
@@ -52,7 +52,7 @@ execution:

 ```yaml qa-flow
 steps:
-  - name: confirms GPT-5.4 Codex harness target
+  - name: confirms GPT-5.5 Codex harness target
    actions:
      - set: selected
        value:
--- a/qa/scenarios/workspace/medium-game-plan-pi-harness.md
+++ b/qa/scenarios/workspace/medium-game-plan-pi-harness.md
@@ -9,9 +9,9 @@ coverage:
    - workspace.planning
  secondary:
    - agents.pi-harness
-objective: Verify GPT-5.4 can use the PI harness to plan and build a medium-complex self-contained browser game.
+objective: Verify GPT-5.5 can use the PI harness to plan and build a medium-complex self-contained browser game.
 successCriteria:
-  - A live-frontier run fails fast unless the selected primary model is openai/gpt-5.4.
+  - A live-frontier run fails fast unless the selected primary model is openai/gpt-5.5.
  - The scenario forces the embedded PI harness before the build turn.
  - The prompt explicitly asks the agent to enter plan mode before editing.
  - The agent writes a self-contained HTML game with a canvas loop, controls, scoring, waves, pause, and restart.
@@ -25,10 +25,10 @@ codeRefs:
  - extensions/qa-lab/src/suite.ts
 execution:
  kind: flow
-  summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --scenario medium-game-plan-pi-harness`.
+  summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --scenario medium-game-plan-pi-harness`.
  config:
    requiredProvider: openai
-    requiredModel: gpt-5.4
+    requiredModel: gpt-5.5
    harnessRuntime: pi
    harnessFallback: pi
    artifactFile: star-garden-defenders-pi.html
@@ -52,7 +52,7 @@ execution:

 ```yaml qa-flow
 steps:
-  - name: confirms GPT-5.4 PI harness target
+  - name: confirms GPT-5.5 PI harness target
    actions:
      - set: selected
        value: