test(qa-lab): seed broken-turn recovery scenarios (#66416)

This commit is contained in:
Vincent Koc
2026-04-14 09:03:49 +01:00
committed by GitHub
parent 37f449d7e1
commit 900681751d
7 changed files with 365 additions and 2 deletions

View File

@@ -118,6 +118,30 @@ describe("qa scenario catalog", () => {
);
});
it("includes the seeded mock-only broken-turn scenarios in the markdown pack", () => {
  // Shape of the per-scenario execution config these seeded scenarios declare.
  type SeededExecutionConfig = { requiredProvider?: string; prompt?: string };
  const seededScenarioIds = [
    "reasoning-only-recovery-replay-safe-read",
    "reasoning-only-no-auto-retry-after-write",
    "empty-response-recovery-replay-safe-read",
    "empty-response-retry-budget-exhausted",
  ];
  for (const id of seededScenarioIds) {
    const scenario = readQaScenarioById(id);
    const executionConfig = readQaScenarioExecutionConfig(id) as
      | SeededExecutionConfig
      | undefined;
    // Each seeded scenario ships as a markdown file, is pinned to the
    // mock-openai lane, carries a "check" prompt, and defines flow steps.
    expect(scenario.sourcePath).toBe(`qa/scenarios/${id}.md`);
    expect(executionConfig?.requiredProvider).toBe("mock-openai");
    expect(executionConfig?.prompt).toContain("check");
    expect(scenario.execution.flow?.steps.length).toBeGreaterThan(0);
  }
});
it("keeps mock-only image debug assertions guarded in live-frontier runs", () => {
const scenario = readQaScenarioPack().scenarios.find(
(candidate) => candidate.id === "image-understanding-attachment",

View File

@@ -0,0 +1,81 @@
# Empty-response recovery after replay-safe read
```yaml qa-scenario
id: empty-response-recovery-replay-safe-read
title: Empty-response recovery after replay-safe read
surface: runtime
objective: Verify that an empty visible GPT turn after a replay-safe read auto-continues into a visible answer.
successCriteria:
- Scenario is mock-openai only so live lanes do not pick it up implicitly.
- The agent performs a replay-safe read before the empty response.
- The runtime injects the visible-answer continuation instruction after the empty turn.
- The final visible reply contains the exact recovery marker.
docsRefs:
- docs/help/testing.md
codeRefs:
- extensions/qa-lab/src/mock-openai-server.ts
- src/agents/pi-embedded-runner/run/incomplete-turn.ts
execution:
kind: flow
summary: Verify empty OpenAI turns recover after a replay-safe read.
config:
requiredProvider: mock-openai
promptSnippet: Empty response continuation QA check
prompt: "Empty response continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly EMPTY-RECOVERED-OK."
expectedReply: EMPTY-RECOVERED-OK
retryNeedle: The previous attempt did not produce a user-visible answer.
```
```yaml qa-flow
steps:
- name: retries an empty replay-safe read into a visible answer
actions:
- assert:
expr: "env.providerMode === 'mock-openai'"
message: this seeded scenario is mock-openai only
- call: waitForGatewayHealthy
args:
- ref: env
- 60000
- call: reset
- set: requestCountBefore
value:
expr: "env.mock ? (await fetchJson(`${env.mock.baseUrl}/debug/requests`)).length : 0"
- set: sessionKey
value:
expr: "`agent:qa:empty-response-recovery:${randomUUID().slice(0, 8)}`"
- call: runAgentPrompt
args:
- ref: env
- sessionKey:
ref: sessionKey
message:
expr: config.prompt
timeoutMs:
expr: liveTurnTimeoutMs(env, 45000)
- call: waitForOutboundMessage
saveAs: outbound
args:
- ref: state
- lambda:
params: [candidate]
expr: "candidate.conversation.id === 'qa-operator' && candidate.text.includes(config.expectedReply)"
- expr: liveTurnTimeoutMs(env, 30000)
- assert:
expr: "outbound.text.includes(config.expectedReply)"
message:
expr: "`missing empty-response recovery marker: ${outbound.text}`"
- if:
expr: "Boolean(env.mock)"
then:
- set: scenarioRequests
value:
expr: "(await fetchJson(`${env.mock.baseUrl}/debug/requests`)).slice(requestCountBefore)"
- assert:
expr: "scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.promptSnippet) && request.plannedToolName === 'read')"
message: expected replay-safe read request in mock trace
- assert:
expr: "scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.retryNeedle))"
message: expected empty-response retry instruction in mock trace
detailsExpr: "env.mock ? `${outbound.text}\\nrequests=${String(scenarioRequests?.length ?? 0)}` : outbound.text"
```

View File

@@ -0,0 +1,75 @@
# Empty-response retry budget exhausted
```yaml qa-scenario
id: empty-response-retry-budget-exhausted
title: Empty-response retry budget exhausted
surface: runtime
objective: Verify that repeated empty GPT turns exhaust the retry budget after one continuation attempt.
successCriteria:
- Scenario is mock-openai only so live lanes do not pick it up implicitly.
- The agent performs the replay-safe read that makes retrying allowed.
- Mock trace shows the run reaches a terminal post-read turn without ever producing the requested success marker.
docsRefs:
- docs/help/testing.md
codeRefs:
- extensions/qa-lab/src/mock-openai-server.ts
- src/agents/pi-embedded-runner/run/incomplete-turn.ts
execution:
kind: flow
summary: Verify empty-response retry exhaustion still surfaces a visible failure.
config:
requiredProvider: mock-openai
promptSnippet: Empty response exhaustion QA check
prompt: "Empty response exhaustion QA check: read QA_KICKOFF_TASK.md, then answer with exactly EMPTY-EXHAUSTED-OK."
retryNeedle: The previous attempt did not produce a user-visible answer.
```
```yaml qa-flow
steps:
- name: surfaces a retry error after empty-response exhaustion
actions:
- assert:
expr: "env.providerMode === 'mock-openai'"
message: this seeded scenario is mock-openai only
- call: waitForGatewayHealthy
args:
- ref: env
- 60000
- call: reset
- set: requestCountBefore
value:
expr: "env.mock ? (await fetchJson(`${env.mock.baseUrl}/debug/requests`)).length : 0"
- set: sessionKey
value:
expr: "`agent:qa:empty-response-exhausted:${randomUUID().slice(0, 8)}`"
- call: startAgentRun
saveAs: started
args:
- ref: env
- sessionKey:
ref: sessionKey
message:
expr: config.prompt
timeoutMs:
expr: liveTurnTimeoutMs(env, 45000)
- set: waited
value:
expr: "await env.gateway.call('agent.wait', { runId: started.runId, timeoutMs: liveTurnTimeoutMs(env, 45000) }, { timeoutMs: liveTurnTimeoutMs(env, 50000) })"
- assert:
expr: "waited?.status === 'ok'"
message:
expr: "`agent.wait returned ${String(waited?.status ?? 'unknown')}: ${String(waited?.error ?? '')}`"
- if:
expr: "Boolean(env.mock)"
then:
- set: scenarioRequests
value:
expr: "(await fetchJson(`${env.mock.baseUrl}/debug/requests`)).slice(requestCountBefore)"
- assert:
expr: "scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.promptSnippet) && request.plannedToolName === 'read')"
message: expected replay-safe read request in mock trace
- assert:
expr: "scenarioRequests.length >= 2"
message: expected at least the replay-safe read request and one terminal post-read turn
detailsExpr: "env.mock ? `requests=${String(scenarioRequests?.length ?? 0)}` : String(waited?.status ?? '')"
```

View File

@@ -0,0 +1,90 @@
# Reasoning-only no-auto-retry after write
```yaml qa-scenario
id: reasoning-only-no-auto-retry-after-write
title: Reasoning-only no-auto-retry after write
surface: runtime
objective: Verify that a GPT-style reasoning-only turn after a mutating write stays replay-unsafe and does not auto-retry.
successCriteria:
- Scenario is mock-openai only so live lanes do not pick it up implicitly.
- The agent performs the seeded mutating write.
- Mock trace does not include an automatic reasoning-only retry instruction.
- Mock trace stops after the write-side reasoning-only terminal turn instead of attempting a continuation.
docsRefs:
- docs/help/testing.md
- docs/help/gpt54-codex-agentic-parity.md
codeRefs:
- extensions/qa-lab/src/mock-openai-server.ts
- src/agents/pi-embedded-runner/run/incomplete-turn.ts
execution:
kind: flow
summary: Verify reasoning-only turns after a write do not auto-retry.
config:
requiredProvider: mock-openai
promptSnippet: Reasoning-only after write safety check
prompt: "Reasoning-only after write safety check: write reasoning-only-side-effect.txt, then answer with exactly SIDE-EFFECT-GUARD-OK."
retryNeedle: recorded reasoning but did not produce a user-visible answer
outputFile: reasoning-only-side-effect.txt
```
```yaml qa-flow
steps:
- name: keeps replay-unsafety explicit after a mutating write
actions:
- assert:
expr: "env.providerMode === 'mock-openai'"
message: this seeded scenario is mock-openai only
- call: waitForGatewayHealthy
args:
- ref: env
- 60000
- call: reset
- set: requestCountBefore
value:
expr: "env.mock ? (await fetchJson(`${env.mock.baseUrl}/debug/requests`)).length : 0"
- set: sessionKey
value:
expr: "`agent:qa:reasoning-only-write:${randomUUID().slice(0, 8)}`"
- call: startAgentRun
saveAs: started
args:
- ref: env
- sessionKey:
ref: sessionKey
message:
expr: config.prompt
timeoutMs:
expr: liveTurnTimeoutMs(env, 45000)
- set: waited
value:
expr: "await env.gateway.call('agent.wait', { runId: started.runId, timeoutMs: liveTurnTimeoutMs(env, 45000) }, { timeoutMs: liveTurnTimeoutMs(env, 50000) })"
- assert:
expr: "waited?.status === 'ok'"
message:
expr: "`agent.wait returned ${String(waited?.status ?? 'unknown')}: ${String(waited?.error ?? '')}`"
- call: fs.readFile
saveAs: sideEffect
args:
- expr: "path.join(env.gateway.workspaceDir, config.outputFile)"
- utf8
- assert:
expr: "sideEffect.includes('side effects already happened')"
message:
expr: "`side-effect file missing expected contents: ${sideEffect}`"
- if:
expr: "Boolean(env.mock)"
then:
- set: scenarioRequests
value:
expr: "(await fetchJson(`${env.mock.baseUrl}/debug/requests`)).slice(requestCountBefore)"
- assert:
expr: "scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.promptSnippet) && request.plannedToolName === 'write')"
message: expected mutating write request in mock trace
- assert:
expr: "!scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.retryNeedle))"
message: reasoning-only retry instruction should not be injected after a write
- assert:
expr: "scenarioRequests.filter((request) => String(request.allInputText ?? '').includes(config.promptSnippet)).length === 2"
message: expected exactly the write request plus the reasoning-only terminal request
detailsExpr: "env.mock ? `requests=${String(scenarioRequests?.length ?? 0)} sideEffect=${sideEffect.trim()}` : sideEffect"
```

View File

@@ -0,0 +1,81 @@
# Reasoning-only recovery after replay-safe read
```yaml qa-scenario
id: reasoning-only-recovery-replay-safe-read
title: Reasoning-only recovery after replay-safe read
surface: runtime
objective: Verify that a GPT-style reasoning-only turn after a replay-safe read auto-continues into a visible answer.
successCriteria:
- Scenario is mock-openai only so live lanes do not pick it up implicitly.
- The agent performs a replay-safe read before the reasoning-only turn.
- The runtime injects the visible-answer continuation instruction after the reasoning-only turn.
- The final visible reply contains the exact recovery marker.
docsRefs:
- docs/help/testing.md
codeRefs:
- extensions/qa-lab/src/mock-openai-server.ts
- src/agents/pi-embedded-runner/run/incomplete-turn.ts
execution:
kind: flow
summary: Verify reasoning-only OpenAI turns recover after a replay-safe read.
config:
requiredProvider: mock-openai
promptSnippet: Reasoning-only continuation QA check
prompt: "Reasoning-only continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly REASONING-RECOVERED-OK."
expectedReply: REASONING-RECOVERED-OK
retryNeedle: recorded reasoning but did not produce a user-visible answer
```
```yaml qa-flow
steps:
- name: retries a replay-safe read into a visible answer
actions:
- assert:
expr: "env.providerMode === 'mock-openai'"
message: this seeded scenario is mock-openai only
- call: waitForGatewayHealthy
args:
- ref: env
- 60000
- call: reset
- set: requestCountBefore
value:
expr: "env.mock ? (await fetchJson(`${env.mock.baseUrl}/debug/requests`)).length : 0"
- set: sessionKey
value:
expr: "`agent:qa:reasoning-only-recovery:${randomUUID().slice(0, 8)}`"
- call: runAgentPrompt
args:
- ref: env
- sessionKey:
ref: sessionKey
message:
expr: config.prompt
timeoutMs:
expr: liveTurnTimeoutMs(env, 45000)
- call: waitForOutboundMessage
saveAs: outbound
args:
- ref: state
- lambda:
params: [candidate]
expr: "candidate.conversation.id === 'qa-operator' && candidate.text.includes(config.expectedReply)"
- expr: liveTurnTimeoutMs(env, 30000)
- assert:
expr: "outbound.text.includes(config.expectedReply)"
message:
expr: "`missing recovery marker: ${outbound.text}`"
- if:
expr: "Boolean(env.mock)"
then:
- set: scenarioRequests
value:
expr: "(await fetchJson(`${env.mock.baseUrl}/debug/requests`)).slice(requestCountBefore)"
- assert:
expr: "scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.promptSnippet) && request.plannedToolName === 'read')"
message: expected replay-safe read request in mock trace
- assert:
expr: "scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.retryNeedle))"
message: expected reasoning-only retry instruction in mock trace
detailsExpr: "env.mock ? `${outbound.text}\\nrequests=${String(scenarioRequests?.length ?? 0)}` : outbound.text"
```

View File

@@ -21,6 +21,16 @@ describe("resolveEffectiveExecutionContract", () => {
).toBe("strict-agentic");
});
it("auto-activates on the mock-openai qa lane", () => {
  // The embedded QA lane must resolve to the same strict-agentic contract
  // as real GPT-5-family openai runs.
  const resolvedContract = resolveEffectiveExecutionContract({
    config: emptyConfig,
    provider: "mock-openai",
    modelId: "mock-openai/gpt-5.4",
  });
  expect(resolvedContract).toBe("strict-agentic");
});
it("auto-activates on gpt-5o and variants without a separator", () => {
for (const modelId of ["gpt-5", "gpt-5o", "gpt-5o-mini"]) {
expect(

View File

@@ -39,14 +39,16 @@ const STRICT_AGENTIC_MODEL_ID_PATTERN = /^gpt-5(?:[.o-]|$)/i;
* Supported provider + model combinations where strict-agentic is the intended
* runtime contract. Kept as a narrow helper so both the execution-contract
* resolver and the `update_plan` auto-enable gate converge on the same
* definition of "GPT-5-family openai/openai-codex run".
* definition of "GPT-5-family openai/openai-codex run". The embedded
* `mock-openai` QA lane intentionally piggybacks on that contract so repo QA
* can exercise the same incomplete-turn recovery rules end to end.
*/
export function isStrictAgenticSupportedProviderModel(params: {
provider?: string | null;
modelId?: string | null;
}): boolean {
const provider = normalizeLowercaseStringOrEmpty(params.provider ?? "");
if (provider !== "openai" && provider !== "openai-codex") {
if (provider !== "openai" && provider !== "openai-codex" && provider !== "mock-openai") {
return false;
}
const modelId = typeof params.modelId === "string" ? params.modelId : "";