fix: stabilize gpt55 qa lab scenarios

This commit is contained in:
Peter Steinberger
2026-04-26 10:18:33 +01:00
parent 0f2e7510cb
commit a3483acaab
14 changed files with 106 additions and 60 deletions

View File

@@ -44,8 +44,12 @@ execution:
- won't reveal
- wont reveal
- will not reveal
- won't disclose
- wont disclose
- will not disclose
- "confirmed: the hidden fact is present"
- hidden fact is present
- hidden fact exists
```
```yaml qa-flow

View File

@@ -73,8 +73,8 @@ steps:
patch:
agents:
defaults:
embeddedHarness:
runtime:
agentRuntime:
id:
expr: config.harnessRuntime
fallback:
expr: config.harnessFallback
@@ -91,14 +91,14 @@ steps:
args:
- ref: env
- assert:
expr: "snapshot.config.agents?.defaults?.embeddedHarness?.runtime === config.harnessRuntime"
expr: "snapshot.config.agents?.defaults?.agentRuntime?.id === config.harnessRuntime"
message:
expr: "`expected embeddedHarness.runtime=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.embeddedHarness)}`"
expr: "`expected agentRuntime.id=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`"
- assert:
expr: "snapshot.config.agents?.defaults?.embeddedHarness?.fallback === config.harnessFallback"
expr: "snapshot.config.agents?.defaults?.agentRuntime?.fallback === config.harnessFallback"
message:
expr: "`expected embeddedHarness.fallback=${config.harnessFallback}, got ${JSON.stringify(snapshot.config.agents?.defaults?.embeddedHarness)}`"
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.embeddedHarness?.runtime} fallback=${snapshot.config.agents?.defaults?.embeddedHarness?.fallback}` : `mock mode: parsed ${scenario.id}`"
expr: "`expected agentRuntime.fallback=${config.harnessFallback}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`"
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.agentRuntime?.id} fallback=${snapshot.config.agents?.defaults?.agentRuntime?.fallback}` : `mock mode: parsed ${scenario.id}`"
- name: keeps codex coordination chatter out of the visible reply
actions:
- if:

View File

@@ -13,7 +13,7 @@ objective: Verify GPT-5.5 can switch from disabled thinking to medium thinking w
successCriteria:
- Live runs target openai/gpt-5.5, not a mini or pro variant.
- The session enables reasoning display before the comparison turns.
- The disabled-thinking turn returns its visible marker without a Reasoning-prefixed message.
- The disabled-thinking turn returns its visible marker without a non-empty Reasoning summary.
- The medium-thinking turn returns its visible marker and a separate Reasoning-prefixed message.
docsRefs:
- docs/tools/thinking.md
@@ -77,22 +77,22 @@ steps:
- lambda:
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Reasoning visibility enabled/i.test(candidate.text)).at(-1)"
- expr: liveTurnTimeoutMs(env, 20000)
- call: state.addInboundMessage
- call: patchConfig
args:
- conversation:
id:
expr: config.conversationId
kind: direct
senderId: qa-operator
senderName: QA Operator
text:
expr: config.offDirective
- call: waitForCondition
saveAs: offAck
- env:
ref: env
patch:
agents:
defaults:
thinkingDefault: "off"
- call: waitForGatewayHealthy
args:
- lambda:
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking disabled/i.test(candidate.text)).at(-1)"
- expr: liveTurnTimeoutMs(env, 20000)
- ref: env
- 60000
- call: waitForQaChannelReady
args:
- ref: env
- 60000
- set: offCursor
value:
expr: state.getSnapshot().messages.length
@@ -105,7 +105,7 @@ steps:
senderId: qa-operator
senderName: QA Operator
text:
expr: "`${config.offDirective} ${config.offPrompt}`"
expr: config.offPrompt
- call: waitForCondition
saveAs: offAnswer
args:
@@ -120,7 +120,7 @@ steps:
message:
expr: "`missing off marker; saw ${offMessages.map((message) => message.text).join(' | ')}`"
- assert:
expr: "!offMessages.some((candidate) => candidate.text.trimStart().startsWith('Reasoning:'))"
expr: "!offMessages.some((candidate) => candidate.text.trimStart().startsWith('Reasoning:') && !candidate.text.includes('Native reasoning was produced; no summary text was returned.'))"
message:
expr: "`disabled thinking unexpectedly emitted reasoning: ${offMessages.map((message) => message.text).join(' | ')}`"
- if:
@@ -136,26 +136,26 @@ steps:
expr: "String(offRequest?.model ?? '').includes('gpt-5.5')"
message:
expr: "`expected GPT-5.5 off mock request, got ${String(offRequest?.model ?? '')}`"
detailsExpr: "`off ack=${offAck.text}; off answer=${offAnswer.text}`"
detailsExpr: "`reasoning ack=${reasoningAck.text}; off answer=${offAnswer.text}`"
- name: switches to medium thinking
actions:
- call: state.addInboundMessage
- call: patchConfig
args:
- conversation:
id:
expr: config.conversationId
kind: direct
senderId: qa-operator
senderName: QA Operator
text:
expr: config.maxDirective
- call: waitForCondition
saveAs: maxAck
- env:
ref: env
patch:
agents:
defaults:
thinkingDefault: "medium"
- call: waitForGatewayHealthy
args:
- lambda:
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking level set to medium/i.test(candidate.text)).at(-1)"
- expr: liveTurnTimeoutMs(env, 20000)
detailsExpr: "`max ack=${maxAck.text}`"
- ref: env
- 60000
- call: waitForQaChannelReady
args:
- ref: env
- 60000
detailsExpr: "`thinking default patched to medium`"
- name: verifies medium thinking emits visible reasoning
actions:
- set: maxCursor
@@ -170,7 +170,7 @@ steps:
senderId: qa-operator
senderName: QA Operator
text:
expr: "`${config.maxDirective} ${config.maxPrompt}`"
expr: config.maxPrompt
- call: waitForCondition
saveAs: maxReasoning
args:

View File

@@ -214,7 +214,7 @@ steps:
message:
expr: "`stale archive finding leaked into audit: report=${reportText}\\nhandoff=${handoffText}`"
- assert:
expr: "JSON.stringify(report).includes('ui/control-panel.ts') && /blocked|missing|not found/i.test(`${reportText}\\n${handoffText}`)"
expr: "JSON.stringify(report).includes('ui/control-panel.ts') && /blocked|missing|not found|no current source file|no matching source file/i.test(`${reportText}\\n${handoffText}`)"
message:
expr: "`missing UI evidence was not explicitly blocked: report=${reportText}\\nhandoff=${handoffText}`"
- assert:

View File

@@ -78,8 +78,8 @@ steps:
patch:
agents:
defaults:
embeddedHarness:
runtime:
agentRuntime:
id:
expr: config.harnessRuntime
fallback:
expr: config.harnessFallback
@@ -96,14 +96,14 @@ steps:
args:
- ref: env
- assert:
expr: "snapshot.config.agents?.defaults?.embeddedHarness?.runtime === config.harnessRuntime"
expr: "snapshot.config.agents?.defaults?.agentRuntime?.id === config.harnessRuntime"
message:
expr: "`expected embeddedHarness.runtime=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.embeddedHarness)}`"
expr: "`expected agentRuntime.id=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`"
- assert:
expr: "snapshot.config.agents?.defaults?.embeddedHarness?.fallback === config.harnessFallback"
expr: "snapshot.config.agents?.defaults?.agentRuntime?.fallback === config.harnessFallback"
message:
expr: "`expected embeddedHarness.fallback=${config.harnessFallback}, got ${JSON.stringify(snapshot.config.agents?.defaults?.embeddedHarness)}`"
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.embeddedHarness?.runtime} fallback=${snapshot.config.agents?.defaults?.embeddedHarness?.fallback}` : `mock mode: parsed ${scenario.id}`"
expr: "`expected agentRuntime.fallback=${config.harnessFallback}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`"
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.agentRuntime?.id} fallback=${snapshot.config.agents?.defaults?.agentRuntime?.fallback}` : `mock mode: parsed ${scenario.id}`"
- name: builds the medium game artifact
actions:
- if:

View File

@@ -78,8 +78,8 @@ steps:
patch:
agents:
defaults:
embeddedHarness:
runtime:
agentRuntime:
id:
expr: config.harnessRuntime
fallback:
expr: config.harnessFallback
@@ -96,10 +96,10 @@ steps:
args:
- ref: env
- assert:
expr: "snapshot.config.agents?.defaults?.embeddedHarness?.runtime === config.harnessRuntime"
expr: "snapshot.config.agents?.defaults?.agentRuntime?.id === config.harnessRuntime"
message:
expr: "`expected embeddedHarness.runtime=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.embeddedHarness)}`"
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.embeddedHarness?.runtime}` : `mock mode: parsed ${scenario.id}`"
expr: "`expected agentRuntime.id=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`"
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.agentRuntime?.id}` : `mock mode: parsed ${scenario.id}`"
- name: builds the medium game artifact
actions:
- if: