mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 12:30:44 +00:00
test: update QA parity fixtures for GPT-5.5
This commit is contained in:
@@ -24,10 +24,10 @@ codeRefs:
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: flow
|
||||
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario codex-harness-no-meta-leak`.
|
||||
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --fast --thinking medium --scenario codex-harness-no-meta-leak`.
|
||||
config:
|
||||
requiredProvider: codex
|
||||
requiredModel: gpt-5.4
|
||||
requiredModel: gpt-5.5
|
||||
harnessRuntime: codex
|
||||
harnessFallback: none
|
||||
expectedReply: QA_LEAK_OK
|
||||
@@ -47,7 +47,7 @@ execution:
|
||||
|
||||
```yaml qa-flow
|
||||
steps:
|
||||
- name: confirms GPT-5.4 Codex harness target
|
||||
- name: confirms GPT-5.5 Codex harness target
|
||||
actions:
|
||||
- set: selected
|
||||
value:
|
||||
|
||||
@@ -1,17 +1,17 @@
|
||||
# GPT-5.4 thinking visibility switch
|
||||
# GPT-5.5 thinking visibility switch
|
||||
|
||||
```yaml qa-scenario
|
||||
id: gpt54-thinking-visibility-switch
|
||||
title: GPT-5.4 thinking visibility switch
|
||||
id: gpt55-thinking-visibility-switch
|
||||
title: GPT-5.5 thinking visibility switch
|
||||
surface: models
|
||||
coverage:
|
||||
primary:
|
||||
- models.thinking
|
||||
secondary:
|
||||
- runtime.reasoning-visibility
|
||||
objective: Verify GPT-5.4 can switch from disabled thinking to medium thinking while reasoning display stays enabled.
|
||||
objective: Verify GPT-5.5 can switch from disabled thinking to medium thinking while reasoning display stays enabled.
|
||||
successCriteria:
|
||||
- Live runs target openai/gpt-5.4, not a mini or pro variant.
|
||||
- Live runs target openai/gpt-5.5, not a mini or pro variant.
|
||||
- The session enables reasoning display before the comparison turns.
|
||||
- The disabled-thinking turn returns its visible marker without a Reasoning-prefixed message.
|
||||
- The medium-thinking turn returns its visible marker and a separate Reasoning-prefixed message.
|
||||
@@ -27,10 +27,10 @@ codeRefs:
|
||||
- extensions/qa-lab/src/providers/mock-openai/server.ts
|
||||
execution:
|
||||
kind: flow
|
||||
summary: Toggle reasoning display and GPT-5.4 thinking between off/none and medium, then verify visible reasoning only on the medium turn.
|
||||
summary: Toggle reasoning display and GPT-5.5 thinking between off/none and medium, then verify visible reasoning only on the medium turn.
|
||||
config:
|
||||
requiredLiveProvider: openai
|
||||
requiredLiveModel: gpt-5.4
|
||||
requiredLiveModel: gpt-5.5
|
||||
offDirective: /think off
|
||||
maxDirective: /think medium
|
||||
reasoningDirective: /reasoning on
|
||||
@@ -60,7 +60,7 @@ steps:
|
||||
- assert:
|
||||
expr: "env.providerMode !== 'live-frontier' || (selected?.provider === config.requiredLiveProvider && selected?.model === config.requiredLiveModel)"
|
||||
message:
|
||||
expr: "`expected live GPT-5.4, got ${env.primaryModel}`"
|
||||
expr: "`expected live GPT-5.5, got ${env.primaryModel}`"
|
||||
- call: state.addInboundMessage
|
||||
args:
|
||||
- conversation:
|
||||
@@ -133,9 +133,9 @@ steps:
|
||||
value:
|
||||
expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.offPrompt))"
|
||||
- assert:
|
||||
expr: "String(offRequest?.model ?? '').includes('gpt-5.4')"
|
||||
expr: "String(offRequest?.model ?? '').includes('gpt-5.5')"
|
||||
message:
|
||||
expr: "`expected GPT-5.4 off mock request, got ${String(offRequest?.model ?? '')}`"
|
||||
expr: "`expected GPT-5.5 off mock request, got ${String(offRequest?.model ?? '')}`"
|
||||
detailsExpr: "`off ack=${offAck.text}; off answer=${offAnswer.text}`"
|
||||
- name: switches to medium thinking
|
||||
actions:
|
||||
@@ -204,8 +204,8 @@ steps:
|
||||
value:
|
||||
expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.maxPrompt))"
|
||||
- assert:
|
||||
expr: "String(maxRequest?.model ?? '').includes('gpt-5.4')"
|
||||
expr: "String(maxRequest?.model ?? '').includes('gpt-5.5')"
|
||||
message:
|
||||
expr: "`expected GPT-5.4 mock request, got ${String(maxRequest?.model ?? '')}`"
|
||||
expr: "`expected GPT-5.5 mock request, got ${String(maxRequest?.model ?? '')}`"
|
||||
detailsExpr: "`answer=${maxAnswer.text}`"
|
||||
```
|
||||
@@ -72,8 +72,8 @@ steps:
|
||||
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && (() => { const lower = normalizeLowercaseStringOrEmpty(candidate.text); return lower.includes('switch') || lower.includes('handoff'); })()).at(-1)"
|
||||
- expr: resolveQaLiveTurnTimeoutMs(env, 20000, env.alternateModel)
|
||||
- assert:
|
||||
expr: "!env.mock || ((await fetchJson(`${env.mock.baseUrl}/debug/last-request`))?.body?.model === 'gpt-5.4-alt')"
|
||||
expr: "!env.mock || ((await fetchJson(`${env.mock.baseUrl}/debug/last-request`))?.body?.model === 'gpt-5.5-alt')"
|
||||
message:
|
||||
expr: "`expected gpt-5.4-alt, got ${String((await fetchJson(`${env.mock.baseUrl}/debug/last-request`))?.body?.model ?? '')}`"
|
||||
expr: "`expected gpt-5.5-alt, got ${String((await fetchJson(`${env.mock.baseUrl}/debug/last-request`))?.body?.model ?? '')}`"
|
||||
detailsExpr: outbound.text
|
||||
```
|
||||
|
||||
@@ -12,7 +12,7 @@ coverage:
|
||||
objective: Verify a live OpenAI GPT model can use OpenAI native web_search when OpenClaw web search is enabled in auto mode.
|
||||
successCriteria:
|
||||
- A live-frontier run fails fast unless the selected primary provider is openai.
|
||||
- The selected primary model is GPT-5.4, not a mini or pro variant.
|
||||
- The selected primary model is GPT-5.5, not a mini or pro variant.
|
||||
- Web search is enabled without pinning a managed web_search provider.
|
||||
- The live reply includes the required marker plus an official OpenAI News URL and headline found through web search.
|
||||
gatewayConfigPatch:
|
||||
@@ -32,10 +32,10 @@ codeRefs:
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: flow
|
||||
summary: Run with `OPENCLAW_LIVE_OPENAI_KEY="${OPENAI_API_KEY}" pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario openai-native-web-search-live`.
|
||||
summary: Run with `OPENCLAW_LIVE_OPENAI_KEY="${OPENAI_API_KEY}" pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --fast --thinking medium --scenario openai-native-web-search-live`.
|
||||
config:
|
||||
requiredProvider: openai
|
||||
requiredModel: gpt-5.4
|
||||
requiredModel: gpt-5.5
|
||||
expectedMarker: WEB-SEARCH-OK
|
||||
failureMarker: WEB-SEARCH-FAILED
|
||||
searchPrompt: |-
|
||||
@@ -49,7 +49,7 @@ execution:
|
||||
|
||||
```yaml qa-flow
|
||||
steps:
|
||||
- name: confirms live OpenAI GPT-5.4 web search auto mode
|
||||
- name: confirms live OpenAI GPT-5.5 web search auto mode
|
||||
actions:
|
||||
- call: waitForGatewayHealthy
|
||||
args:
|
||||
|
||||
@@ -21,8 +21,8 @@ gatewayConfigPatch:
|
||||
params: {}
|
||||
successCriteria:
|
||||
- Anthropic Claude Sonnet 4.6 advertises adaptive but not OpenAI-only xhigh or Opus max.
|
||||
- A stored adaptive level remaps to medium when switching to OpenAI GPT-5.4.
|
||||
- OpenAI GPT-5.4 advertises xhigh but not adaptive or max.
|
||||
- A stored adaptive level remaps to medium when switching to OpenAI GPT-5.5.
|
||||
- OpenAI GPT-5.5 advertises xhigh but not adaptive or max.
|
||||
- A stored xhigh level remaps to high when switching to an Anthropic model without xhigh support.
|
||||
docsRefs:
|
||||
- docs/tools/thinking.md
|
||||
@@ -41,7 +41,7 @@ execution:
|
||||
config:
|
||||
requiredProviderMode: live-frontier
|
||||
anthropicModelRef: anthropic/claude-sonnet-4-6
|
||||
openAiXhighModelRef: openai/gpt-5.4
|
||||
openAiXhighModelRef: openai/gpt-5.5
|
||||
noXhighModelRef: anthropic/claude-sonnet-4-6
|
||||
conversationId: thinking-slash-remap
|
||||
sessionKey: agent:qa:main
|
||||
@@ -142,7 +142,7 @@ steps:
|
||||
- assert:
|
||||
expr: "/Options: .*\\bxhigh\\b/i.test(openAiThinkStatus.text) && !/Options: .*\\badaptive\\b/i.test(openAiThinkStatus.text) && !/Options: .*\\bmax\\b/i.test(openAiThinkStatus.text)"
|
||||
message:
|
||||
expr: "`expected OpenAI GPT-5.4 /think options to include xhigh only, got ${openAiThinkStatus.text}`"
|
||||
expr: "`expected OpenAI GPT-5.5 /think options to include xhigh only, got ${openAiThinkStatus.text}`"
|
||||
detailsExpr: "`adaptive=${adaptiveAck.text}; switch=${JSON.stringify(openAiModelAck.resolved)}; think=${openAiThinkStatus.text}`"
|
||||
- name: maps xhigh to high on a model without xhigh
|
||||
actions:
|
||||
|
||||
@@ -17,7 +17,7 @@ successCriteria:
|
||||
- Scenario details preserve the observed compaction count for review context.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
- docs/help/gpt54-codex-agentic-parity.md
|
||||
- docs/help/gpt55-codex-agentic-parity.md
|
||||
codeRefs:
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
- extensions/qa-lab/src/mock-openai-server.ts
|
||||
|
||||
@@ -17,7 +17,7 @@ successCriteria:
|
||||
- Mock trace stops after the write-side reasoning-only terminal turn instead of attempting a continuation.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
- docs/help/gpt54-codex-agentic-parity.md
|
||||
- docs/help/gpt55-codex-agentic-parity.md
|
||||
codeRefs:
|
||||
- extensions/qa-lab/src/mock-openai-server.ts
|
||||
- src/agents/pi-embedded-runner/run/incomplete-turn.ts
|
||||
|
||||
@@ -11,7 +11,7 @@ coverage:
|
||||
- models.codex-cli
|
||||
objective: Verify the Codex app-server harness can plan and build a medium-complex self-contained browser game.
|
||||
successCriteria:
|
||||
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.4 with the Codex harness forced.
|
||||
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.5 with the Codex harness forced.
|
||||
- The scenario forces the Codex embedded harness and disables PI fallback.
|
||||
- The prompt explicitly asks the agent to enter plan mode before editing.
|
||||
- The agent writes a self-contained HTML game with a canvas loop, controls, scoring, waves, pause, and restart.
|
||||
@@ -25,10 +25,10 @@ codeRefs:
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: flow
|
||||
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario medium-game-plan-codex-harness`.
|
||||
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --fast --thinking medium --scenario medium-game-plan-codex-harness`.
|
||||
config:
|
||||
requiredProvider: codex
|
||||
requiredModel: gpt-5.4
|
||||
requiredModel: gpt-5.5
|
||||
harnessRuntime: codex
|
||||
harnessFallback: none
|
||||
artifactFile: star-garden-defenders-codex.html
|
||||
@@ -52,7 +52,7 @@ execution:
|
||||
|
||||
```yaml qa-flow
|
||||
steps:
|
||||
- name: confirms GPT-5.4 Codex harness target
|
||||
- name: confirms GPT-5.5 Codex harness target
|
||||
actions:
|
||||
- set: selected
|
||||
value:
|
||||
|
||||
@@ -9,9 +9,9 @@ coverage:
|
||||
- workspace.planning
|
||||
secondary:
|
||||
- agents.pi-harness
|
||||
objective: Verify GPT-5.4 can use the PI harness to plan and build a medium-complex self-contained browser game.
|
||||
objective: Verify GPT-5.5 can use the PI harness to plan and build a medium-complex self-contained browser game.
|
||||
successCriteria:
|
||||
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.4.
|
||||
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.5.
|
||||
- The scenario forces the embedded PI harness before the build turn.
|
||||
- The prompt explicitly asks the agent to enter plan mode before editing.
|
||||
- The agent writes a self-contained HTML game with a canvas loop, controls, scoring, waves, pause, and restart.
|
||||
@@ -25,10 +25,10 @@ codeRefs:
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: flow
|
||||
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario medium-game-plan-pi-harness`.
|
||||
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --fast --thinking medium --scenario medium-game-plan-pi-harness`.
|
||||
config:
|
||||
requiredProvider: openai
|
||||
requiredModel: gpt-5.4
|
||||
requiredModel: gpt-5.5
|
||||
harnessRuntime: pi
|
||||
harnessFallback: pi
|
||||
artifactFile: star-garden-defenders-pi.html
|
||||
@@ -52,7 +52,7 @@ execution:
|
||||
|
||||
```yaml qa-flow
|
||||
steps:
|
||||
- name: confirms GPT-5.4 PI harness target
|
||||
- name: confirms GPT-5.5 PI harness target
|
||||
actions:
|
||||
- set: selected
|
||||
value:
|
||||
|
||||
Reference in New Issue
Block a user