mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 18:20:44 +00:00
fix: stabilize gpt55 qa lab scenarios
This commit is contained in:
@@ -44,8 +44,12 @@ execution:
|
||||
- won't reveal
|
||||
- won’t reveal
|
||||
- will not reveal
|
||||
- won't disclose
|
||||
- won’t disclose
|
||||
- will not disclose
|
||||
- "confirmed: the hidden fact is present"
|
||||
- hidden fact is present
|
||||
- hidden fact exists
|
||||
```
|
||||
|
||||
```yaml qa-flow
|
||||
|
||||
@@ -73,8 +73,8 @@ steps:
|
||||
patch:
|
||||
agents:
|
||||
defaults:
|
||||
embeddedHarness:
|
||||
runtime:
|
||||
agentRuntime:
|
||||
id:
|
||||
expr: config.harnessRuntime
|
||||
fallback:
|
||||
expr: config.harnessFallback
|
||||
@@ -91,14 +91,14 @@ steps:
|
||||
args:
|
||||
- ref: env
|
||||
- assert:
|
||||
expr: "snapshot.config.agents?.defaults?.embeddedHarness?.runtime === config.harnessRuntime"
|
||||
expr: "snapshot.config.agents?.defaults?.agentRuntime?.id === config.harnessRuntime"
|
||||
message:
|
||||
expr: "`expected embeddedHarness.runtime=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.embeddedHarness)}`"
|
||||
expr: "`expected agentRuntime.id=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`"
|
||||
- assert:
|
||||
expr: "snapshot.config.agents?.defaults?.embeddedHarness?.fallback === config.harnessFallback"
|
||||
expr: "snapshot.config.agents?.defaults?.agentRuntime?.fallback === config.harnessFallback"
|
||||
message:
|
||||
expr: "`expected embeddedHarness.fallback=${config.harnessFallback}, got ${JSON.stringify(snapshot.config.agents?.defaults?.embeddedHarness)}`"
|
||||
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.embeddedHarness?.runtime} fallback=${snapshot.config.agents?.defaults?.embeddedHarness?.fallback}` : `mock mode: parsed ${scenario.id}`"
|
||||
expr: "`expected agentRuntime.fallback=${config.harnessFallback}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`"
|
||||
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.agentRuntime?.id} fallback=${snapshot.config.agents?.defaults?.agentRuntime?.fallback}` : `mock mode: parsed ${scenario.id}`"
|
||||
- name: keeps codex coordination chatter out of the visible reply
|
||||
actions:
|
||||
- if:
|
||||
|
||||
@@ -13,7 +13,7 @@ objective: Verify GPT-5.5 can switch from disabled thinking to medium thinking w
|
||||
successCriteria:
|
||||
- Live runs target openai/gpt-5.5, not a mini or pro variant.
|
||||
- The session enables reasoning display before the comparison turns.
|
||||
- The disabled-thinking turn returns its visible marker without a Reasoning-prefixed message.
|
||||
- The disabled-thinking turn returns its visible marker without a non-empty Reasoning summary.
|
||||
- The medium-thinking turn returns its visible marker and a separate Reasoning-prefixed message.
|
||||
docsRefs:
|
||||
- docs/tools/thinking.md
|
||||
@@ -77,22 +77,22 @@ steps:
|
||||
- lambda:
|
||||
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Reasoning visibility enabled/i.test(candidate.text)).at(-1)"
|
||||
- expr: liveTurnTimeoutMs(env, 20000)
|
||||
- call: state.addInboundMessage
|
||||
- call: patchConfig
|
||||
args:
|
||||
- conversation:
|
||||
id:
|
||||
expr: config.conversationId
|
||||
kind: direct
|
||||
senderId: qa-operator
|
||||
senderName: QA Operator
|
||||
text:
|
||||
expr: config.offDirective
|
||||
- call: waitForCondition
|
||||
saveAs: offAck
|
||||
- env:
|
||||
ref: env
|
||||
patch:
|
||||
agents:
|
||||
defaults:
|
||||
thinkingDefault: "off"
|
||||
- call: waitForGatewayHealthy
|
||||
args:
|
||||
- lambda:
|
||||
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking disabled/i.test(candidate.text)).at(-1)"
|
||||
- expr: liveTurnTimeoutMs(env, 20000)
|
||||
- ref: env
|
||||
- 60000
|
||||
- call: waitForQaChannelReady
|
||||
args:
|
||||
- ref: env
|
||||
- 60000
|
||||
- set: offCursor
|
||||
value:
|
||||
expr: state.getSnapshot().messages.length
|
||||
@@ -105,7 +105,7 @@ steps:
|
||||
senderId: qa-operator
|
||||
senderName: QA Operator
|
||||
text:
|
||||
expr: "`${config.offDirective} ${config.offPrompt}`"
|
||||
expr: config.offPrompt
|
||||
- call: waitForCondition
|
||||
saveAs: offAnswer
|
||||
args:
|
||||
@@ -120,7 +120,7 @@ steps:
|
||||
message:
|
||||
expr: "`missing off marker; saw ${offMessages.map((message) => message.text).join(' | ')}`"
|
||||
- assert:
|
||||
expr: "!offMessages.some((candidate) => candidate.text.trimStart().startsWith('Reasoning:'))"
|
||||
expr: "!offMessages.some((candidate) => candidate.text.trimStart().startsWith('Reasoning:') && !candidate.text.includes('Native reasoning was produced; no summary text was returned.'))"
|
||||
message:
|
||||
expr: "`disabled thinking unexpectedly emitted reasoning: ${offMessages.map((message) => message.text).join(' | ')}`"
|
||||
- if:
|
||||
@@ -136,26 +136,26 @@ steps:
|
||||
expr: "String(offRequest?.model ?? '').includes('gpt-5.5')"
|
||||
message:
|
||||
expr: "`expected GPT-5.5 off mock request, got ${String(offRequest?.model ?? '')}`"
|
||||
detailsExpr: "`off ack=${offAck.text}; off answer=${offAnswer.text}`"
|
||||
detailsExpr: "`reasoning ack=${reasoningAck.text}; off answer=${offAnswer.text}`"
|
||||
- name: switches to medium thinking
|
||||
actions:
|
||||
- call: state.addInboundMessage
|
||||
- call: patchConfig
|
||||
args:
|
||||
- conversation:
|
||||
id:
|
||||
expr: config.conversationId
|
||||
kind: direct
|
||||
senderId: qa-operator
|
||||
senderName: QA Operator
|
||||
text:
|
||||
expr: config.maxDirective
|
||||
- call: waitForCondition
|
||||
saveAs: maxAck
|
||||
- env:
|
||||
ref: env
|
||||
patch:
|
||||
agents:
|
||||
defaults:
|
||||
thinkingDefault: "medium"
|
||||
- call: waitForGatewayHealthy
|
||||
args:
|
||||
- lambda:
|
||||
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking level set to medium/i.test(candidate.text)).at(-1)"
|
||||
- expr: liveTurnTimeoutMs(env, 20000)
|
||||
detailsExpr: "`max ack=${maxAck.text}`"
|
||||
- ref: env
|
||||
- 60000
|
||||
- call: waitForQaChannelReady
|
||||
args:
|
||||
- ref: env
|
||||
- 60000
|
||||
detailsExpr: "`thinking default patched to medium`"
|
||||
- name: verifies medium thinking emits visible reasoning
|
||||
actions:
|
||||
- set: maxCursor
|
||||
@@ -170,7 +170,7 @@ steps:
|
||||
senderId: qa-operator
|
||||
senderName: QA Operator
|
||||
text:
|
||||
expr: "`${config.maxDirective} ${config.maxPrompt}`"
|
||||
expr: config.maxPrompt
|
||||
- call: waitForCondition
|
||||
saveAs: maxReasoning
|
||||
args:
|
||||
|
||||
@@ -214,7 +214,7 @@ steps:
|
||||
message:
|
||||
expr: "`stale archive finding leaked into audit: report=${reportText}\\nhandoff=${handoffText}`"
|
||||
- assert:
|
||||
expr: "JSON.stringify(report).includes('ui/control-panel.ts') && /blocked|missing|not found/i.test(`${reportText}\\n${handoffText}`)"
|
||||
expr: "JSON.stringify(report).includes('ui/control-panel.ts') && /blocked|missing|not found|no current source file|no matching source file/i.test(`${reportText}\\n${handoffText}`)"
|
||||
message:
|
||||
expr: "`missing UI evidence was not explicitly blocked: report=${reportText}\\nhandoff=${handoffText}`"
|
||||
- assert:
|
||||
|
||||
@@ -78,8 +78,8 @@ steps:
|
||||
patch:
|
||||
agents:
|
||||
defaults:
|
||||
embeddedHarness:
|
||||
runtime:
|
||||
agentRuntime:
|
||||
id:
|
||||
expr: config.harnessRuntime
|
||||
fallback:
|
||||
expr: config.harnessFallback
|
||||
@@ -96,14 +96,14 @@ steps:
|
||||
args:
|
||||
- ref: env
|
||||
- assert:
|
||||
expr: "snapshot.config.agents?.defaults?.embeddedHarness?.runtime === config.harnessRuntime"
|
||||
expr: "snapshot.config.agents?.defaults?.agentRuntime?.id === config.harnessRuntime"
|
||||
message:
|
||||
expr: "`expected embeddedHarness.runtime=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.embeddedHarness)}`"
|
||||
expr: "`expected agentRuntime.id=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`"
|
||||
- assert:
|
||||
expr: "snapshot.config.agents?.defaults?.embeddedHarness?.fallback === config.harnessFallback"
|
||||
expr: "snapshot.config.agents?.defaults?.agentRuntime?.fallback === config.harnessFallback"
|
||||
message:
|
||||
expr: "`expected embeddedHarness.fallback=${config.harnessFallback}, got ${JSON.stringify(snapshot.config.agents?.defaults?.embeddedHarness)}`"
|
||||
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.embeddedHarness?.runtime} fallback=${snapshot.config.agents?.defaults?.embeddedHarness?.fallback}` : `mock mode: parsed ${scenario.id}`"
|
||||
expr: "`expected agentRuntime.fallback=${config.harnessFallback}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`"
|
||||
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.agentRuntime?.id} fallback=${snapshot.config.agents?.defaults?.agentRuntime?.fallback}` : `mock mode: parsed ${scenario.id}`"
|
||||
- name: builds the medium game artifact
|
||||
actions:
|
||||
- if:
|
||||
|
||||
@@ -78,8 +78,8 @@ steps:
|
||||
patch:
|
||||
agents:
|
||||
defaults:
|
||||
embeddedHarness:
|
||||
runtime:
|
||||
agentRuntime:
|
||||
id:
|
||||
expr: config.harnessRuntime
|
||||
fallback:
|
||||
expr: config.harnessFallback
|
||||
@@ -96,10 +96,10 @@ steps:
|
||||
args:
|
||||
- ref: env
|
||||
- assert:
|
||||
expr: "snapshot.config.agents?.defaults?.embeddedHarness?.runtime === config.harnessRuntime"
|
||||
expr: "snapshot.config.agents?.defaults?.agentRuntime?.id === config.harnessRuntime"
|
||||
message:
|
||||
expr: "`expected embeddedHarness.runtime=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.embeddedHarness)}`"
|
||||
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.embeddedHarness?.runtime}` : `mock mode: parsed ${scenario.id}`"
|
||||
expr: "`expected agentRuntime.id=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`"
|
||||
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.agentRuntime?.id}` : `mock mode: parsed ${scenario.id}`"
|
||||
- name: builds the medium game artifact
|
||||
actions:
|
||||
- if:
|
||||
|
||||
Reference in New Issue
Block a user