From 900681751da19048d0c17f4466ee9b67d5ebb490 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Tue, 14 Apr 2026 09:03:49 +0100 Subject: [PATCH] test(qa-lab): seed broken-turn recovery scenarios (#66416) --- .../qa-lab/src/scenario-catalog.test.ts | 24 +++++ ...mpty-response-recovery-replay-safe-read.md | 81 +++++++++++++++++ .../empty-response-retry-budget-exhausted.md | 75 ++++++++++++++++ ...easoning-only-no-auto-retry-after-write.md | 90 +++++++++++++++++++ ...easoning-only-recovery-replay-safe-read.md | 81 +++++++++++++++++ src/agents/execution-contract.test.ts | 10 +++ src/agents/execution-contract.ts | 6 +- 7 files changed, 365 insertions(+), 2 deletions(-) create mode 100644 qa/scenarios/empty-response-recovery-replay-safe-read.md create mode 100644 qa/scenarios/empty-response-retry-budget-exhausted.md create mode 100644 qa/scenarios/reasoning-only-no-auto-retry-after-write.md create mode 100644 qa/scenarios/reasoning-only-recovery-replay-safe-read.md diff --git a/extensions/qa-lab/src/scenario-catalog.test.ts b/extensions/qa-lab/src/scenario-catalog.test.ts index 7918d285b29..db630b9a143 100644 --- a/extensions/qa-lab/src/scenario-catalog.test.ts +++ b/extensions/qa-lab/src/scenario-catalog.test.ts @@ -118,6 +118,30 @@ describe("qa scenario catalog", () => { ); }); + it("includes the seeded mock-only broken-turn scenarios in the markdown pack", () => { + const scenarioIds = [ + "reasoning-only-recovery-replay-safe-read", + "reasoning-only-no-auto-retry-after-write", + "empty-response-recovery-replay-safe-read", + "empty-response-retry-budget-exhausted", + ]; + + for (const scenarioId of scenarioIds) { + const scenario = readQaScenarioById(scenarioId); + const config = readQaScenarioExecutionConfig(scenarioId) as + | { + requiredProvider?: string; + prompt?: string; + } + | undefined; + + expect(scenario.sourcePath).toBe(`qa/scenarios/${scenarioId}.md`); + expect(config?.requiredProvider).toBe("mock-openai"); + expect(config?.prompt).toContain("check"); + expect(scenario.execution.flow?.steps.length).toBeGreaterThan(0); + } + }); + it("keeps mock-only image debug assertions guarded in live-frontier runs", () => { const scenario = readQaScenarioPack().scenarios.find( (candidate) => candidate.id === "image-understanding-attachment", diff --git a/qa/scenarios/empty-response-recovery-replay-safe-read.md b/qa/scenarios/empty-response-recovery-replay-safe-read.md new file mode 100644 index 00000000000..0f25b56b5bb --- /dev/null +++ b/qa/scenarios/empty-response-recovery-replay-safe-read.md @@ -0,0 +1,81 @@ +# Empty-response recovery after replay-safe read + +```yaml qa-scenario +id: empty-response-recovery-replay-safe-read +title: Empty-response recovery after replay-safe read +surface: runtime +objective: Verify an empty visible GPT turn after a replay-safe read auto-continues into a visible answer. +successCriteria: + - Scenario is mock-openai only so live lanes do not pick it up implicitly. + - The agent performs a replay-safe read before the empty response. + - The runtime injects the visible-answer continuation instruction after the empty turn. + - The final visible reply contains the exact recovery marker. +docsRefs: + - docs/help/testing.md +codeRefs: + - extensions/qa-lab/src/mock-openai-server.ts + - src/agents/pi-embedded-runner/run/incomplete-turn.ts +execution: + kind: flow + summary: Verify empty OpenAI turns recover after a replay-safe read. + config: + requiredProvider: mock-openai + promptSnippet: Empty response continuation QA check + prompt: "Empty response continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly EMPTY-RECOVERED-OK." + expectedReply: EMPTY-RECOVERED-OK + retryNeedle: The previous attempt did not produce a user-visible answer. +``` + +```yaml qa-flow +steps: + - name: retries an empty replay-safe read into a visible answer + actions: + - assert: + expr: "env.providerMode === 'mock-openai'" + message: this seeded scenario is mock-openai only + - call: waitForGatewayHealthy + args: + - ref: env + - 60000 + - call: reset + - set: requestCountBefore + value: + expr: "env.mock ? (await fetchJson(`${env.mock.baseUrl}/debug/requests`)).length : 0" + - set: sessionKey + value: + expr: "`agent:qa:empty-response-recovery:${randomUUID().slice(0, 8)}`" + - call: runAgentPrompt + args: + - ref: env + - sessionKey: + ref: sessionKey + message: + expr: config.prompt + timeoutMs: + expr: liveTurnTimeoutMs(env, 45000) + - call: waitForOutboundMessage + saveAs: outbound + args: + - ref: state + - lambda: + params: [candidate] + expr: "candidate.conversation.id === 'qa-operator' && candidate.text.includes(config.expectedReply)" + - expr: liveTurnTimeoutMs(env, 30000) + - assert: + expr: "outbound.text.includes(config.expectedReply)" + message: + expr: "`missing empty-response recovery marker: ${outbound.text}`" + - if: + expr: "Boolean(env.mock)" + then: + - set: scenarioRequests + value: + expr: "(await fetchJson(`${env.mock.baseUrl}/debug/requests`)).slice(requestCountBefore)" + - assert: + expr: "scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.promptSnippet) && request.plannedToolName === 'read')" + message: expected replay-safe read request in mock trace + - assert: + expr: "scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.retryNeedle))" + message: expected empty-response retry instruction in mock trace + detailsExpr: "env.mock ? `${outbound.text}\\nrequests=${String(scenarioRequests?.length ?? 0)}` : outbound.text" +``` diff --git a/qa/scenarios/empty-response-retry-budget-exhausted.md b/qa/scenarios/empty-response-retry-budget-exhausted.md new file mode 100644 index 00000000000..1e69b1ef603 --- /dev/null +++ b/qa/scenarios/empty-response-retry-budget-exhausted.md @@ -0,0 +1,75 @@ +# Empty-response retry budget exhausted + +```yaml qa-scenario +id: empty-response-retry-budget-exhausted +title: Empty-response retry budget exhausted +surface: runtime +objective: Verify repeated empty GPT turns exhaust the retry budget after one continuation attempt. +successCriteria: + - Scenario is mock-openai only so live lanes do not pick it up implicitly. + - The agent performs the replay-safe read that makes retrying allowed. + - Mock trace shows the run reaches a terminal post-read turn without ever producing the requested success marker. +docsRefs: + - docs/help/testing.md +codeRefs: + - extensions/qa-lab/src/mock-openai-server.ts + - src/agents/pi-embedded-runner/run/incomplete-turn.ts +execution: + kind: flow + summary: Verify empty-response retry exhaustion still surfaces a visible failure. + config: + requiredProvider: mock-openai + promptSnippet: Empty response exhaustion QA check + prompt: "Empty response exhaustion QA check: read QA_KICKOFF_TASK.md, then answer with exactly EMPTY-EXHAUSTED-OK." + retryNeedle: The previous attempt did not produce a user-visible answer. +``` + +```yaml qa-flow +steps: + - name: surfaces a retry error after empty-response exhaustion + actions: + - assert: + expr: "env.providerMode === 'mock-openai'" + message: this seeded scenario is mock-openai only + - call: waitForGatewayHealthy + args: + - ref: env + - 60000 + - call: reset + - set: requestCountBefore + value: + expr: "env.mock ? (await fetchJson(`${env.mock.baseUrl}/debug/requests`)).length : 0" + - set: sessionKey + value: + expr: "`agent:qa:empty-response-exhausted:${randomUUID().slice(0, 8)}`" + - call: startAgentRun + saveAs: started + args: + - ref: env + - sessionKey: + ref: sessionKey + message: + expr: config.prompt + timeoutMs: + expr: liveTurnTimeoutMs(env, 45000) + - set: waited + value: + expr: "await env.gateway.call('agent.wait', { runId: started.runId, timeoutMs: liveTurnTimeoutMs(env, 45000) }, { timeoutMs: liveTurnTimeoutMs(env, 50000) })" + - assert: + expr: "waited?.status === 'ok'" + message: + expr: "`agent.wait returned ${String(waited?.status ?? 'unknown')}: ${String(waited?.error ?? '')}`" + - if: + expr: "Boolean(env.mock)" + then: + - set: scenarioRequests + value: + expr: "(await fetchJson(`${env.mock.baseUrl}/debug/requests`)).slice(requestCountBefore)" + - assert: + expr: "scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.promptSnippet) && request.plannedToolName === 'read')" + message: expected replay-safe read request in mock trace + - assert: + expr: "scenarioRequests.length >= 2" + message: expected at least the replay-safe read request and one terminal post-read turn + detailsExpr: "env.mock ? `requests=${String(scenarioRequests?.length ?? 0)}` : String(waited?.status ?? '')" +``` diff --git a/qa/scenarios/reasoning-only-no-auto-retry-after-write.md b/qa/scenarios/reasoning-only-no-auto-retry-after-write.md new file mode 100644 index 00000000000..21a15d54457 --- /dev/null +++ b/qa/scenarios/reasoning-only-no-auto-retry-after-write.md @@ -0,0 +1,90 @@ +# Reasoning-only no-auto-retry after write + +```yaml qa-scenario +id: reasoning-only-no-auto-retry-after-write +title: Reasoning-only no-auto-retry after write +surface: runtime +objective: Verify a GPT-style reasoning-only turn after a mutating write stays replay-unsafe and does not auto-retry. +successCriteria: + - Scenario is mock-openai only so live lanes do not pick it up implicitly. + - The agent performs the seeded mutating write. + - Mock trace does not include an automatic reasoning-only retry instruction. + - Mock trace stops after the write-side reasoning-only terminal turn instead of attempting a continuation. +docsRefs: + - docs/help/testing.md + - docs/help/gpt54-codex-agentic-parity.md +codeRefs: + - extensions/qa-lab/src/mock-openai-server.ts + - src/agents/pi-embedded-runner/run/incomplete-turn.ts +execution: + kind: flow + summary: Verify reasoning-only turns after a write do not auto-retry. + config: + requiredProvider: mock-openai + promptSnippet: Reasoning-only after write safety check + prompt: "Reasoning-only after write safety check: write reasoning-only-side-effect.txt, then answer with exactly SIDE-EFFECT-GUARD-OK." + retryNeedle: recorded reasoning but did not produce a user-visible answer + outputFile: reasoning-only-side-effect.txt +``` + +```yaml qa-flow +steps: + - name: keeps replay-unsafety explicit after a mutating write + actions: + - assert: + expr: "env.providerMode === 'mock-openai'" + message: this seeded scenario is mock-openai only + - call: waitForGatewayHealthy + args: + - ref: env + - 60000 + - call: reset + - set: requestCountBefore + value: + expr: "env.mock ? (await fetchJson(`${env.mock.baseUrl}/debug/requests`)).length : 0" + - set: sessionKey + value: + expr: "`agent:qa:reasoning-only-write:${randomUUID().slice(0, 8)}`" + - call: startAgentRun + saveAs: started + args: + - ref: env + - sessionKey: + ref: sessionKey + message: + expr: config.prompt + timeoutMs: + expr: liveTurnTimeoutMs(env, 45000) + - set: waited + value: + expr: "await env.gateway.call('agent.wait', { runId: started.runId, timeoutMs: liveTurnTimeoutMs(env, 45000) }, { timeoutMs: liveTurnTimeoutMs(env, 50000) })" + - assert: + expr: "waited?.status === 'ok'" + message: + expr: "`agent.wait returned ${String(waited?.status ?? 'unknown')}: ${String(waited?.error ?? '')}`" + - call: fs.readFile + saveAs: sideEffect + args: + - expr: "path.join(env.gateway.workspaceDir, config.outputFile)" + - utf8 + - assert: + expr: "sideEffect.includes('side effects already happened')" + message: + expr: "`side-effect file missing expected contents: ${sideEffect}`" + - if: + expr: "Boolean(env.mock)" + then: + - set: scenarioRequests + value: + expr: "(await fetchJson(`${env.mock.baseUrl}/debug/requests`)).slice(requestCountBefore)" + - assert: + expr: "scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.promptSnippet) && request.plannedToolName === 'write')" + message: expected mutating write request in mock trace + - assert: + expr: "!scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.retryNeedle))" + message: reasoning-only retry instruction should not be injected after a write + - assert: + expr: "scenarioRequests.filter((request) => String(request.allInputText ?? '').includes(config.promptSnippet)).length === 2" + message: expected exactly the write request plus the reasoning-only terminal request + detailsExpr: "env.mock ? `requests=${String(scenarioRequests?.length ?? 0)} sideEffect=${sideEffect.trim()}` : sideEffect" +``` diff --git a/qa/scenarios/reasoning-only-recovery-replay-safe-read.md b/qa/scenarios/reasoning-only-recovery-replay-safe-read.md new file mode 100644 index 00000000000..95489b00c0f --- /dev/null +++ b/qa/scenarios/reasoning-only-recovery-replay-safe-read.md @@ -0,0 +1,81 @@ +# Reasoning-only recovery after replay-safe read + +```yaml qa-scenario +id: reasoning-only-recovery-replay-safe-read +title: Reasoning-only recovery after replay-safe read +surface: runtime +objective: Verify a GPT-style reasoning-only turn after a replay-safe read auto-continues into a visible answer. +successCriteria: + - Scenario is mock-openai only so live lanes do not pick it up implicitly. + - The agent performs a replay-safe read before the reasoning-only turn. + - The runtime injects the visible-answer continuation instruction after the reasoning-only turn. + - The final visible reply contains the exact recovery marker. +docsRefs: + - docs/help/testing.md +codeRefs: + - extensions/qa-lab/src/mock-openai-server.ts + - src/agents/pi-embedded-runner/run/incomplete-turn.ts +execution: + kind: flow + summary: Verify reasoning-only OpenAI turns recover after a replay-safe read. + config: + requiredProvider: mock-openai + promptSnippet: Reasoning-only continuation QA check + prompt: "Reasoning-only continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly REASONING-RECOVERED-OK." + expectedReply: REASONING-RECOVERED-OK + retryNeedle: recorded reasoning but did not produce a user-visible answer +``` + +```yaml qa-flow +steps: + - name: retries a replay-safe read into a visible answer + actions: + - assert: + expr: "env.providerMode === 'mock-openai'" + message: this seeded scenario is mock-openai only + - call: waitForGatewayHealthy + args: + - ref: env + - 60000 + - call: reset + - set: requestCountBefore + value: + expr: "env.mock ? (await fetchJson(`${env.mock.baseUrl}/debug/requests`)).length : 0" + - set: sessionKey + value: + expr: "`agent:qa:reasoning-only-recovery:${randomUUID().slice(0, 8)}`" + - call: runAgentPrompt + args: + - ref: env + - sessionKey: + ref: sessionKey + message: + expr: config.prompt + timeoutMs: + expr: liveTurnTimeoutMs(env, 45000) + - call: waitForOutboundMessage + saveAs: outbound + args: + - ref: state + - lambda: + params: [candidate] + expr: "candidate.conversation.id === 'qa-operator' && candidate.text.includes(config.expectedReply)" + - expr: liveTurnTimeoutMs(env, 30000) + - assert: + expr: "outbound.text.includes(config.expectedReply)" + message: + expr: "`missing recovery marker: ${outbound.text}`" + - if: + expr: "Boolean(env.mock)" + then: + - set: scenarioRequests + value: + expr: "(await fetchJson(`${env.mock.baseUrl}/debug/requests`)).slice(requestCountBefore)" + - assert: + expr: "scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.promptSnippet) && request.plannedToolName === 'read')" + message: expected replay-safe read request in mock trace + - assert: + expr: "scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.retryNeedle))" + message: expected reasoning-only retry instruction in mock trace + detailsExpr: "env.mock ? `${outbound.text}\\nrequests=${String(scenarioRequests?.length ?? 0)}` : outbound.text" +``` diff --git a/src/agents/execution-contract.test.ts b/src/agents/execution-contract.test.ts index fbdff9e4f6f..d426893a5b1 100644 --- a/src/agents/execution-contract.test.ts +++ b/src/agents/execution-contract.test.ts @@ -21,6 +21,16 @@ describe("resolveEffectiveExecutionContract", () => { ).toBe("strict-agentic"); }); + it("auto-activates on the mock-openai qa lane", () => { + expect( + resolveEffectiveExecutionContract({ + config: emptyConfig, + provider: "mock-openai", + modelId: "mock-openai/gpt-5.4", + }), + ).toBe("strict-agentic"); + }); + it("auto-activates on gpt-5o and variants without a separator", () => { for (const modelId of ["gpt-5", "gpt-5o", "gpt-5o-mini"]) { expect( diff --git a/src/agents/execution-contract.ts b/src/agents/execution-contract.ts index f7bc1624bfd..bb3d72b822d 100644 --- a/src/agents/execution-contract.ts +++ b/src/agents/execution-contract.ts @@ -39,14 +39,16 @@ const STRICT_AGENTIC_MODEL_ID_PATTERN = /^gpt-5(?:[.o-]|$)/i; * Supported provider + model combinations where strict-agentic is the intended * runtime contract. Kept as a narrow helper so both the execution-contract * resolver and the `update_plan` auto-enable gate converge on the same - * definition of "GPT-5-family openai/openai-codex run". + * definition of "GPT-5-family openai/openai-codex run". The embedded + * `mock-openai` QA lane intentionally piggybacks on that contract so repo QA + * can exercise the same incomplete-turn recovery rules end to end. */ export function isStrictAgenticSupportedProviderModel(params: { provider?: string | null; modelId?: string | null; }): boolean { const provider = normalizeLowercaseStringOrEmpty(params.provider ?? ""); - if (provider !== "openai" && provider !== "openai-codex") { + if (provider !== "openai" && provider !== "openai-codex" && provider !== "mock-openai") { return false; } const modelId = typeof params.modelId === "string" ? params.modelId : "";