mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 21:40:44 +00:00
test(openai): cover GPT-5.5 defaults
This commit is contained in:
@@ -37,8 +37,8 @@ GPT baseline:
|
||||
```bash
|
||||
pnpm openclaw qa suite \
|
||||
--provider-mode live-frontier \
|
||||
- --model openai/gpt-5.4 \
|
||||
- --alt-model openai/gpt-5.4 \
|
||||
+ --model openai/gpt-5.5 \
|
||||
+ --alt-model openai/gpt-5.5 \
|
||||
--fast \
|
||||
--scenario approval-turn-tool-followthrough \
|
||||
--scenario model-switch-tool-continuity \
|
||||
@@ -104,8 +104,8 @@ GPT manual lane:
|
||||
```bash
|
||||
pnpm openclaw qa manual \
|
||||
--provider-mode live-frontier \
|
||||
- --model openai/gpt-5.4 \
|
||||
- --alt-model openai/gpt-5.4 \
|
||||
+ --model openai/gpt-5.5 \
|
||||
+ --alt-model openai/gpt-5.5 \
|
||||
--fast \
|
||||
--message "read QA_KICKOFF_TASK.md, tell me what feels half-baked about this qa mission, and keep it to two short sentences"
|
||||
```
|
||||
|
||||
@@ -24,10 +24,10 @@ codeRefs:
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: flow
|
||||
- summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model codex/gpt-5.4 --alt-model codex/gpt-5.4 --scenario codex-harness-no-meta-leak`.
|
||||
+ summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model codex/gpt-5.5 --alt-model codex/gpt-5.5 --scenario codex-harness-no-meta-leak`.
|
||||
config:
|
||||
requiredProvider: codex
|
||||
- requiredModel: gpt-5.4
|
||||
+ requiredModel: gpt-5.5
|
||||
harnessRuntime: codex
|
||||
harnessFallback: none
|
||||
expectedReply: QA_LEAK_OK
|
||||
@@ -47,7 +47,7 @@ execution:
|
||||
|
||||
```yaml qa-flow
|
||||
steps:
|
||||
- - name: confirms GPT-5.4 Codex harness target
|
||||
+ - name: confirms GPT-5.5 Codex harness target
|
||||
actions:
|
||||
- set: selected
|
||||
value:
|
||||
|
||||
@@ -1,17 +1,17 @@
|
||||
- # GPT-5.4 thinking visibility switch
|
||||
+ # GPT-5.5 thinking visibility switch
|
||||
|
||||
```yaml qa-scenario
|
||||
id: gpt54-thinking-visibility-switch
|
||||
- title: GPT-5.4 thinking visibility switch
|
||||
+ title: GPT-5.5 thinking visibility switch
|
||||
surface: models
|
||||
coverage:
|
||||
primary:
|
||||
- models.thinking
|
||||
secondary:
|
||||
- runtime.reasoning-visibility
|
||||
- objective: Verify GPT-5.4 can switch from disabled thinking to max thinking while reasoning display stays enabled.
|
||||
+ objective: Verify GPT-5.5 can switch from disabled thinking to max thinking while reasoning display stays enabled.
|
||||
successCriteria:
|
||||
- - Live runs target openai/gpt-5.4, not a mini or pro variant.
|
||||
+ - Live runs target openai/gpt-5.5, not a mini or pro variant.
|
||||
- The session enables reasoning display before the comparison turns.
|
||||
- The disabled-thinking turn returns its visible marker without a Reasoning-prefixed message.
|
||||
- The max-thinking turn returns its visible marker and a separate Reasoning-prefixed message.
|
||||
@@ -27,10 +27,10 @@ codeRefs:
|
||||
- extensions/qa-lab/src/providers/mock-openai/server.ts
|
||||
execution:
|
||||
kind: flow
|
||||
- summary: Toggle reasoning display and GPT-5.4 thinking between off/none and max/high, then verify visible reasoning only on the max turn.
|
||||
+ summary: Toggle reasoning display and GPT-5.5 thinking between off/none and max/high, then verify visible reasoning only on the max turn.
|
||||
config:
|
||||
requiredLiveProvider: openai
|
||||
- requiredLiveModel: gpt-5.4
|
||||
+ requiredLiveModel: gpt-5.5
|
||||
offDirective: /think off
|
||||
maxDirective: /think max
|
||||
reasoningDirective: /reasoning on
|
||||
@@ -60,7 +60,7 @@ steps:
|
||||
- assert:
|
||||
expr: "env.providerMode !== 'live-frontier' || (selected?.provider === config.requiredLiveProvider && selected?.model === config.requiredLiveModel)"
|
||||
message:
|
||||
- expr: "`expected live GPT-5.4, got ${env.primaryModel}`"
|
||||
+ expr: "`expected live GPT-5.5, got ${env.primaryModel}`"
|
||||
- call: state.addInboundMessage
|
||||
args:
|
||||
- conversation:
|
||||
@@ -133,9 +133,9 @@ steps:
|
||||
value:
|
||||
expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.offPrompt))"
|
||||
- assert:
|
||||
- expr: "String(offRequest?.model ?? '').includes('gpt-5.4')"
|
||||
+ expr: "String(offRequest?.model ?? '').includes('gpt-5.5')"
|
||||
message:
|
||||
- expr: "`expected GPT-5.4 off mock request, got ${String(offRequest?.model ?? '')}`"
|
||||
+ expr: "`expected GPT-5.5 off mock request, got ${String(offRequest?.model ?? '')}`"
|
||||
detailsExpr: "`off ack=${offAck.text}; off answer=${offAnswer.text}`"
|
||||
- name: switches to max thinking
|
||||
actions:
|
||||
@@ -204,8 +204,8 @@ steps:
|
||||
value:
|
||||
expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.maxPrompt))"
|
||||
- assert:
|
||||
- expr: "String(maxRequest?.model ?? '').includes('gpt-5.4')"
|
||||
+ expr: "String(maxRequest?.model ?? '').includes('gpt-5.5')"
|
||||
message:
|
||||
- expr: "`expected GPT-5.4 mock request, got ${String(maxRequest?.model ?? '')}`"
|
||||
+ expr: "`expected GPT-5.5 mock request, got ${String(maxRequest?.model ?? '')}`"
|
||||
detailsExpr: "`answer=${maxAnswer.text}`"
|
||||
```
|
||||
|
||||
@@ -12,7 +12,7 @@ coverage:
|
||||
objective: Verify a live OpenAI GPT model can use OpenAI native web_search when OpenClaw web search is enabled in auto mode.
|
||||
successCriteria:
|
||||
- A live-frontier run fails fast unless the selected primary provider is openai.
|
||||
- - The selected primary model is GPT-5.4, not a mini or pro variant.
|
||||
+ - The selected primary model is GPT-5.5, not a mini or pro variant.
|
||||
- Web search is enabled without pinning a managed web_search provider.
|
||||
- The live reply includes the required marker plus an official OpenAI News URL and headline found through web search.
|
||||
gatewayConfigPatch:
|
||||
@@ -32,10 +32,10 @@ codeRefs:
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: flow
|
||||
- summary: Run with `OPENCLAW_LIVE_OPENAI_KEY="${OPENAI_API_KEY}" pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --scenario openai-native-web-search-live`.
|
||||
+ summary: Run with `OPENCLAW_LIVE_OPENAI_KEY="${OPENAI_API_KEY}" pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --scenario openai-native-web-search-live`.
|
||||
config:
|
||||
requiredProvider: openai
|
||||
- requiredModel: gpt-5.4
|
||||
+ requiredModel: gpt-5.5
|
||||
expectedMarker: WEB-SEARCH-OK
|
||||
failureMarker: WEB-SEARCH-FAILED
|
||||
searchPrompt: |-
|
||||
@@ -49,7 +49,7 @@ execution:
|
||||
|
||||
```yaml qa-flow
|
||||
steps:
|
||||
- - name: confirms live OpenAI GPT-5.4 web search auto mode
|
||||
+ - name: confirms live OpenAI GPT-5.5 web search auto mode
|
||||
actions:
|
||||
- call: waitForGatewayHealthy
|
||||
args:
|
||||
|
||||
@@ -13,8 +13,8 @@ coverage:
|
||||
objective: Verify /think lists provider-owned levels and remaps stored thinking levels when /model changes provider capabilities.
|
||||
successCriteria:
|
||||
- Anthropic Claude Sonnet 4.6 advertises adaptive but not OpenAI-only xhigh or Opus max.
|
||||
- - A stored adaptive level remaps to medium when switching to OpenAI GPT-5.4.
|
||||
- - OpenAI GPT-5.4 advertises xhigh but not adaptive or max.
|
||||
+ - A stored adaptive level remaps to medium when switching to OpenAI GPT-5.5.
|
||||
+ - OpenAI GPT-5.5 advertises xhigh but not adaptive or max.
|
||||
- A stored xhigh level remaps to high when switching to an Anthropic model without xhigh support.
|
||||
docsRefs:
|
||||
- docs/tools/thinking.md
|
||||
@@ -33,7 +33,7 @@ execution:
|
||||
config:
|
||||
requiredProviderMode: live-frontier
|
||||
anthropicModelRef: anthropic/claude-sonnet-4-6
|
||||
- openAiXhighModelRef: openai/gpt-5.4
|
||||
+ openAiXhighModelRef: openai/gpt-5.5
|
||||
noXhighModelRef: anthropic/claude-sonnet-4-6
|
||||
conversationId: qa-thinking-slash-remap
|
||||
```
|
||||
@@ -165,7 +165,7 @@ steps:
|
||||
- assert:
|
||||
expr: "/Options: .*\\bxhigh\\b/i.test(openAiThinkStatus.text) && !/Options: .*\\badaptive\\b/i.test(openAiThinkStatus.text) && !/Options: .*\\bmax\\b/i.test(openAiThinkStatus.text)"
|
||||
message:
|
||||
- expr: "`expected OpenAI GPT-5.4 /think options to include xhigh only, got ${openAiThinkStatus.text}`"
|
||||
+ expr: "`expected OpenAI GPT-5.5 /think options to include xhigh only, got ${openAiThinkStatus.text}`"
|
||||
detailsExpr: "`adaptive=${adaptiveAck.text}; switch=${openAiModelAck.text}; think=${openAiThinkStatus.text}`"
|
||||
- name: maps xhigh to high on a model without xhigh
|
||||
actions:
|
||||
|
||||
@@ -11,7 +11,7 @@ coverage:
|
||||
- models.codex-cli
|
||||
objective: Verify the Codex app-server harness can plan and build a medium-complex self-contained browser game.
|
||||
successCriteria:
|
||||
- - A live-frontier run fails fast unless the selected primary model is codex/gpt-5.4.
|
||||
+ - A live-frontier run fails fast unless the selected primary model is codex/gpt-5.5.
|
||||
- The scenario forces the Codex embedded harness and disables PI fallback.
|
||||
- The prompt explicitly asks the agent to enter plan mode before editing.
|
||||
- The agent writes a self-contained HTML game with a canvas loop, controls, scoring, waves, pause, and restart.
|
||||
@@ -25,10 +25,10 @@ codeRefs:
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: flow
|
||||
- summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model codex/gpt-5.4 --alt-model codex/gpt-5.4 --scenario medium-game-plan-codex-harness`.
|
||||
+ summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model codex/gpt-5.5 --alt-model codex/gpt-5.5 --scenario medium-game-plan-codex-harness`.
|
||||
config:
|
||||
requiredProvider: codex
|
||||
- requiredModel: gpt-5.4
|
||||
+ requiredModel: gpt-5.5
|
||||
harnessRuntime: codex
|
||||
harnessFallback: none
|
||||
artifactFile: star-garden-defenders-codex.html
|
||||
@@ -52,7 +52,7 @@ execution:
|
||||
|
||||
```yaml qa-flow
|
||||
steps:
|
||||
- - name: confirms GPT-5.4 Codex harness target
|
||||
+ - name: confirms GPT-5.5 Codex harness target
|
||||
actions:
|
||||
- set: selected
|
||||
value:
|
||||
|
||||
@@ -9,9 +9,9 @@ coverage:
|
||||
- workspace.planning
|
||||
secondary:
|
||||
- agents.pi-harness
|
||||
- objective: Verify GPT-5.4 can use the PI harness to plan and build a medium-complex self-contained browser game.
|
||||
+ objective: Verify GPT-5.5 can use the PI harness to plan and build a medium-complex self-contained browser game.
|
||||
successCriteria:
|
||||
- - A live-frontier run fails fast unless the selected primary model is openai/gpt-5.4.
|
||||
+ - A live-frontier run fails fast unless the selected primary model is openai/gpt-5.5.
|
||||
- The scenario forces the embedded PI harness before the build turn.
|
||||
- The prompt explicitly asks the agent to enter plan mode before editing.
|
||||
- The agent writes a self-contained HTML game with a canvas loop, controls, scoring, waves, pause, and restart.
|
||||
@@ -25,10 +25,10 @@ codeRefs:
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: flow
|
||||
- summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --scenario medium-game-plan-pi-harness`.
|
||||
+ summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --scenario medium-game-plan-pi-harness`.
|
||||
config:
|
||||
requiredProvider: openai
|
||||
- requiredModel: gpt-5.4
|
||||
+ requiredModel: gpt-5.5
|
||||
harnessRuntime: pi
|
||||
harnessFallback: pi
|
||||
artifactFile: star-garden-defenders-pi.html
|
||||
@@ -52,7 +52,7 @@ execution:
|
||||
|
||||
```yaml qa-flow
|
||||
steps:
|
||||
- - name: confirms GPT-5.4 PI harness target
|
||||
+ - name: confirms GPT-5.5 PI harness target
|
||||
actions:
|
||||
- set: selected
|
||||
value:
|
||||
|
||||
Reference in New Issue
Block a user