mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 09:40:43 +00:00
test(qa-lab): seed broken-turn recovery scenarios (#66416)
This commit is contained in:
@@ -118,6 +118,30 @@ describe("qa scenario catalog", () => {
|
||||
);
|
||||
});
|
||||
|
||||
// Guards the four seeded broken-turn scenarios: each one must resolve from its
// own markdown file, be pinned to the mock-openai provider, carry a prompt that
// mentions "check", and declare at least one flow step.
it("includes the seeded mock-only broken-turn scenarios in the markdown pack", () => {
  // Only the execution-config fields this test inspects.
  type SeededScenarioConfig = {
    requiredProvider?: string;
    prompt?: string;
  };

  const seededScenarioIds = [
    "reasoning-only-recovery-replay-safe-read",
    "reasoning-only-no-auto-retry-after-write",
    "empty-response-recovery-replay-safe-read",
    "empty-response-retry-budget-exhausted",
  ];

  seededScenarioIds.forEach((seededId) => {
    const seededScenario = readQaScenarioById(seededId);
    const executionConfig = readQaScenarioExecutionConfig(seededId) as
      | SeededScenarioConfig
      | undefined;

    // The source path is derived from the scenario id, so a renamed or
    // misplaced markdown file fails here.
    expect(seededScenario.sourcePath).toBe(`qa/scenarios/${seededId}.md`);
    expect(executionConfig?.requiredProvider).toBe("mock-openai");
    expect(executionConfig?.prompt).toContain("check");
    expect(seededScenario.execution.flow?.steps.length).toBeGreaterThan(0);
  });
});
|
||||
|
||||
it("keeps mock-only image debug assertions guarded in live-frontier runs", () => {
|
||||
const scenario = readQaScenarioPack().scenarios.find(
|
||||
(candidate) => candidate.id === "image-understanding-attachment",
|
||||
|
||||
81
qa/scenarios/empty-response-recovery-replay-safe-read.md
Normal file
81
qa/scenarios/empty-response-recovery-replay-safe-read.md
Normal file
@@ -0,0 +1,81 @@
|
||||
# Empty-response recovery after replay-safe read
|
||||
|
||||
```yaml qa-scenario
|
||||
id: empty-response-recovery-replay-safe-read
|
||||
title: Empty-response recovery after replay-safe read
|
||||
surface: runtime
|
||||
objective: Verify an empty visible GPT turn after a replay-safe read auto-continues into a visible answer.
|
||||
successCriteria:
|
||||
- Scenario is mock-openai only so live lanes do not pick it up implicitly.
|
||||
- The agent performs a replay-safe read before the empty response.
|
||||
- The runtime injects the visible-answer continuation instruction after the empty turn.
|
||||
- The final visible reply contains the exact recovery marker.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
codeRefs:
|
||||
- extensions/qa-lab/src/mock-openai-server.ts
|
||||
- src/agents/pi-embedded-runner/run/incomplete-turn.ts
|
||||
execution:
|
||||
kind: flow
|
||||
summary: Verify empty OpenAI turns recover after a replay-safe read.
|
||||
config:
|
||||
requiredProvider: mock-openai
|
||||
promptSnippet: Empty response continuation QA check
|
||||
prompt: "Empty response continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly EMPTY-RECOVERED-OK."
|
||||
expectedReply: EMPTY-RECOVERED-OK
|
||||
retryNeedle: The previous attempt did not produce a user-visible answer.
|
||||
```
|
||||
|
||||
```yaml qa-flow
|
||||
steps:
|
||||
- name: retries an empty replay-safe read into a visible answer
|
||||
actions:
|
||||
- assert:
|
||||
expr: "env.providerMode === 'mock-openai'"
|
||||
message: this seeded scenario is mock-openai only
|
||||
- call: waitForGatewayHealthy
|
||||
args:
|
||||
- ref: env
|
||||
- 60000
|
||||
- call: reset
|
||||
- set: requestCountBefore
|
||||
value:
|
||||
expr: "env.mock ? (await fetchJson(`${env.mock.baseUrl}/debug/requests`)).length : 0"
|
||||
- set: sessionKey
|
||||
value:
|
||||
expr: "`agent:qa:empty-response-recovery:${randomUUID().slice(0, 8)}`"
|
||||
- call: runAgentPrompt
|
||||
args:
|
||||
- ref: env
|
||||
- sessionKey:
|
||||
ref: sessionKey
|
||||
message:
|
||||
expr: config.prompt
|
||||
timeoutMs:
|
||||
expr: liveTurnTimeoutMs(env, 45000)
|
||||
- call: waitForOutboundMessage
|
||||
saveAs: outbound
|
||||
args:
|
||||
- ref: state
|
||||
- lambda:
|
||||
params: [candidate]
|
||||
expr: "candidate.conversation.id === 'qa-operator' && candidate.text.includes(config.expectedReply)"
|
||||
- expr: liveTurnTimeoutMs(env, 30000)
|
||||
- assert:
|
||||
expr: "outbound.text.includes(config.expectedReply)"
|
||||
message:
|
||||
expr: "`missing empty-response recovery marker: ${outbound.text}`"
|
||||
- if:
|
||||
expr: "Boolean(env.mock)"
|
||||
then:
|
||||
- set: scenarioRequests
|
||||
value:
|
||||
expr: "(await fetchJson(`${env.mock.baseUrl}/debug/requests`)).slice(requestCountBefore)"
|
||||
- assert:
|
||||
expr: "scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.promptSnippet) && request.plannedToolName === 'read')"
|
||||
message: expected replay-safe read request in mock trace
|
||||
- assert:
|
||||
expr: "scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.retryNeedle))"
|
||||
message: expected empty-response retry instruction in mock trace
|
||||
detailsExpr: "env.mock ? `${outbound.text}\\nrequests=${String(scenarioRequests?.length ?? 0)}` : outbound.text"
|
||||
```
|
||||
75
qa/scenarios/empty-response-retry-budget-exhausted.md
Normal file
75
qa/scenarios/empty-response-retry-budget-exhausted.md
Normal file
@@ -0,0 +1,75 @@
|
||||
# Empty-response retry budget exhausted
|
||||
|
||||
```yaml qa-scenario
|
||||
id: empty-response-retry-budget-exhausted
|
||||
title: Empty-response retry budget exhausted
|
||||
surface: runtime
|
||||
objective: Verify repeated empty GPT turns exhaust the retry budget after one continuation attempt.
|
||||
successCriteria:
|
||||
- Scenario is mock-openai only so live lanes do not pick it up implicitly.
|
||||
- The agent performs the replay-safe read that makes retrying allowed.
|
||||
- Mock trace shows the run reaches a terminal post-read turn without ever producing the requested success marker.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
codeRefs:
|
||||
- extensions/qa-lab/src/mock-openai-server.ts
|
||||
- src/agents/pi-embedded-runner/run/incomplete-turn.ts
|
||||
execution:
|
||||
kind: flow
|
||||
summary: Verify empty-response retry exhaustion still surfaces a visible failure.
|
||||
config:
|
||||
requiredProvider: mock-openai
|
||||
promptSnippet: Empty response exhaustion QA check
|
||||
prompt: "Empty response exhaustion QA check: read QA_KICKOFF_TASK.md, then answer with exactly EMPTY-EXHAUSTED-OK."
|
||||
retryNeedle: The previous attempt did not produce a user-visible answer.
|
||||
```
|
||||
|
||||
```yaml qa-flow
|
||||
steps:
|
||||
- name: surfaces a retry error after empty-response exhaustion
|
||||
actions:
|
||||
- assert:
|
||||
expr: "env.providerMode === 'mock-openai'"
|
||||
message: this seeded scenario is mock-openai only
|
||||
- call: waitForGatewayHealthy
|
||||
args:
|
||||
- ref: env
|
||||
- 60000
|
||||
- call: reset
|
||||
- set: requestCountBefore
|
||||
value:
|
||||
expr: "env.mock ? (await fetchJson(`${env.mock.baseUrl}/debug/requests`)).length : 0"
|
||||
- set: sessionKey
|
||||
value:
|
||||
expr: "`agent:qa:empty-response-exhausted:${randomUUID().slice(0, 8)}`"
|
||||
- call: startAgentRun
|
||||
saveAs: started
|
||||
args:
|
||||
- ref: env
|
||||
- sessionKey:
|
||||
ref: sessionKey
|
||||
message:
|
||||
expr: config.prompt
|
||||
timeoutMs:
|
||||
expr: liveTurnTimeoutMs(env, 45000)
|
||||
- set: waited
|
||||
value:
|
||||
expr: "await env.gateway.call('agent.wait', { runId: started.runId, timeoutMs: liveTurnTimeoutMs(env, 45000) }, { timeoutMs: liveTurnTimeoutMs(env, 50000) })"
|
||||
- assert:
|
||||
expr: "waited?.status === 'ok'"
|
||||
message:
|
||||
expr: "`agent.wait returned ${String(waited?.status ?? 'unknown')}: ${String(waited?.error ?? '')}`"
|
||||
- if:
|
||||
expr: "Boolean(env.mock)"
|
||||
then:
|
||||
- set: scenarioRequests
|
||||
value:
|
||||
expr: "(await fetchJson(`${env.mock.baseUrl}/debug/requests`)).slice(requestCountBefore)"
|
||||
- assert:
|
||||
expr: "scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.promptSnippet) && request.plannedToolName === 'read')"
|
||||
message: expected replay-safe read request in mock trace
|
||||
- assert:
|
||||
expr: "scenarioRequests.length >= 2"
|
||||
message: expected at least the replay-safe read request and one terminal post-read turn
|
||||
detailsExpr: "env.mock ? `requests=${String(scenarioRequests?.length ?? 0)}` : String(waited?.status ?? '')"
|
||||
```
|
||||
90
qa/scenarios/reasoning-only-no-auto-retry-after-write.md
Normal file
90
qa/scenarios/reasoning-only-no-auto-retry-after-write.md
Normal file
@@ -0,0 +1,90 @@
|
||||
# Reasoning-only no-auto-retry after write
|
||||
|
||||
```yaml qa-scenario
|
||||
id: reasoning-only-no-auto-retry-after-write
|
||||
title: Reasoning-only no-auto-retry after write
|
||||
surface: runtime
|
||||
objective: Verify a GPT-style reasoning-only turn after a mutating write stays replay-unsafe and does not auto-retry.
|
||||
successCriteria:
|
||||
- Scenario is mock-openai only so live lanes do not pick it up implicitly.
|
||||
- The agent performs the seeded mutating write.
|
||||
- Mock trace does not include an automatic reasoning-only retry instruction.
|
||||
- Mock trace stops after the write-side reasoning-only terminal turn instead of attempting a continuation.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
- docs/help/gpt54-codex-agentic-parity.md
|
||||
codeRefs:
|
||||
- extensions/qa-lab/src/mock-openai-server.ts
|
||||
- src/agents/pi-embedded-runner/run/incomplete-turn.ts
|
||||
execution:
|
||||
kind: flow
|
||||
summary: Verify reasoning-only turns after a write do not auto-retry.
|
||||
config:
|
||||
requiredProvider: mock-openai
|
||||
promptSnippet: Reasoning-only after write safety check
|
||||
prompt: "Reasoning-only after write safety check: write reasoning-only-side-effect.txt, then answer with exactly SIDE-EFFECT-GUARD-OK."
|
||||
retryNeedle: recorded reasoning but did not produce a user-visible answer
|
||||
outputFile: reasoning-only-side-effect.txt
|
||||
```
|
||||
|
||||
```yaml qa-flow
|
||||
steps:
|
||||
- name: keeps replay-unsafety explicit after a mutating write
|
||||
actions:
|
||||
- assert:
|
||||
expr: "env.providerMode === 'mock-openai'"
|
||||
message: this seeded scenario is mock-openai only
|
||||
- call: waitForGatewayHealthy
|
||||
args:
|
||||
- ref: env
|
||||
- 60000
|
||||
- call: reset
|
||||
- set: requestCountBefore
|
||||
value:
|
||||
expr: "env.mock ? (await fetchJson(`${env.mock.baseUrl}/debug/requests`)).length : 0"
|
||||
- set: sessionKey
|
||||
value:
|
||||
expr: "`agent:qa:reasoning-only-write:${randomUUID().slice(0, 8)}`"
|
||||
- call: startAgentRun
|
||||
saveAs: started
|
||||
args:
|
||||
- ref: env
|
||||
- sessionKey:
|
||||
ref: sessionKey
|
||||
message:
|
||||
expr: config.prompt
|
||||
timeoutMs:
|
||||
expr: liveTurnTimeoutMs(env, 45000)
|
||||
- set: waited
|
||||
value:
|
||||
expr: "await env.gateway.call('agent.wait', { runId: started.runId, timeoutMs: liveTurnTimeoutMs(env, 45000) }, { timeoutMs: liveTurnTimeoutMs(env, 50000) })"
|
||||
- assert:
|
||||
expr: "waited?.status === 'ok'"
|
||||
message:
|
||||
expr: "`agent.wait returned ${String(waited?.status ?? 'unknown')}: ${String(waited?.error ?? '')}`"
|
||||
- call: fs.readFile
|
||||
saveAs: sideEffect
|
||||
args:
|
||||
- expr: "path.join(env.gateway.workspaceDir, config.outputFile)"
|
||||
- utf8
|
||||
- assert:
|
||||
expr: "sideEffect.includes('side effects already happened')"
|
||||
message:
|
||||
expr: "`side-effect file missing expected contents: ${sideEffect}`"
|
||||
- if:
|
||||
expr: "Boolean(env.mock)"
|
||||
then:
|
||||
- set: scenarioRequests
|
||||
value:
|
||||
expr: "(await fetchJson(`${env.mock.baseUrl}/debug/requests`)).slice(requestCountBefore)"
|
||||
- assert:
|
||||
expr: "scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.promptSnippet) && request.plannedToolName === 'write')"
|
||||
message: expected mutating write request in mock trace
|
||||
- assert:
|
||||
expr: "!scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.retryNeedle))"
|
||||
message: reasoning-only retry instruction should not be injected after a write
|
||||
- assert:
|
||||
expr: "scenarioRequests.filter((request) => String(request.allInputText ?? '').includes(config.promptSnippet)).length === 2"
|
||||
message: expected exactly the write request plus the reasoning-only terminal request
|
||||
detailsExpr: "env.mock ? `requests=${String(scenarioRequests?.length ?? 0)} sideEffect=${sideEffect.trim()}` : sideEffect"
|
||||
```
|
||||
81
qa/scenarios/reasoning-only-recovery-replay-safe-read.md
Normal file
81
qa/scenarios/reasoning-only-recovery-replay-safe-read.md
Normal file
@@ -0,0 +1,81 @@
|
||||
# Reasoning-only recovery after replay-safe read
|
||||
|
||||
```yaml qa-scenario
|
||||
id: reasoning-only-recovery-replay-safe-read
|
||||
title: Reasoning-only recovery after replay-safe read
|
||||
surface: runtime
|
||||
objective: Verify a GPT-style reasoning-only turn after a replay-safe read auto-continues into a visible answer.
|
||||
successCriteria:
|
||||
- Scenario is mock-openai only so live lanes do not pick it up implicitly.
|
||||
- The agent performs a replay-safe read before the reasoning-only turn.
|
||||
- The runtime injects the visible-answer continuation instruction after the reasoning-only turn.
|
||||
- The final visible reply contains the exact recovery marker.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
codeRefs:
|
||||
- extensions/qa-lab/src/mock-openai-server.ts
|
||||
- src/agents/pi-embedded-runner/run/incomplete-turn.ts
|
||||
execution:
|
||||
kind: flow
|
||||
summary: Verify reasoning-only OpenAI turns recover after a replay-safe read.
|
||||
config:
|
||||
requiredProvider: mock-openai
|
||||
promptSnippet: Reasoning-only continuation QA check
|
||||
prompt: "Reasoning-only continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly REASONING-RECOVERED-OK."
|
||||
expectedReply: REASONING-RECOVERED-OK
|
||||
retryNeedle: recorded reasoning but did not produce a user-visible answer
|
||||
```
|
||||
|
||||
```yaml qa-flow
|
||||
steps:
|
||||
- name: retries a replay-safe read into a visible answer
|
||||
actions:
|
||||
- assert:
|
||||
expr: "env.providerMode === 'mock-openai'"
|
||||
message: this seeded scenario is mock-openai only
|
||||
- call: waitForGatewayHealthy
|
||||
args:
|
||||
- ref: env
|
||||
- 60000
|
||||
- call: reset
|
||||
- set: requestCountBefore
|
||||
value:
|
||||
expr: "env.mock ? (await fetchJson(`${env.mock.baseUrl}/debug/requests`)).length : 0"
|
||||
- set: sessionKey
|
||||
value:
|
||||
expr: "`agent:qa:reasoning-only-recovery:${randomUUID().slice(0, 8)}`"
|
||||
- call: runAgentPrompt
|
||||
args:
|
||||
- ref: env
|
||||
- sessionKey:
|
||||
ref: sessionKey
|
||||
message:
|
||||
expr: config.prompt
|
||||
timeoutMs:
|
||||
expr: liveTurnTimeoutMs(env, 45000)
|
||||
- call: waitForOutboundMessage
|
||||
saveAs: outbound
|
||||
args:
|
||||
- ref: state
|
||||
- lambda:
|
||||
params: [candidate]
|
||||
expr: "candidate.conversation.id === 'qa-operator' && candidate.text.includes(config.expectedReply)"
|
||||
- expr: liveTurnTimeoutMs(env, 30000)
|
||||
- assert:
|
||||
expr: "outbound.text.includes(config.expectedReply)"
|
||||
message:
|
||||
expr: "`missing recovery marker: ${outbound.text}`"
|
||||
- if:
|
||||
expr: "Boolean(env.mock)"
|
||||
then:
|
||||
- set: scenarioRequests
|
||||
value:
|
||||
expr: "(await fetchJson(`${env.mock.baseUrl}/debug/requests`)).slice(requestCountBefore)"
|
||||
- assert:
|
||||
expr: "scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.promptSnippet) && request.plannedToolName === 'read')"
|
||||
message: expected replay-safe read request in mock trace
|
||||
- assert:
|
||||
expr: "scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.retryNeedle))"
|
||||
message: expected reasoning-only retry instruction in mock trace
|
||||
detailsExpr: "env.mock ? `${outbound.text}\\nrequests=${String(scenarioRequests?.length ?? 0)}` : outbound.text"
|
||||
```
|
||||
@@ -21,6 +21,16 @@ describe("resolveEffectiveExecutionContract", () => {
|
||||
).toBe("strict-agentic");
|
||||
});
|
||||
|
||||
// With an empty config, the mock-openai provider paired with a
// "mock-openai/gpt-5.4" model id must resolve to the strict-agentic contract.
it("auto-activates on the mock-openai qa lane", () => {
  const resolvedContract = resolveEffectiveExecutionContract({
    config: emptyConfig,
    provider: "mock-openai",
    modelId: "mock-openai/gpt-5.4",
  });

  expect(resolvedContract).toBe("strict-agentic");
});
|
||||
|
||||
it("auto-activates on gpt-5o and variants without a separator", () => {
|
||||
for (const modelId of ["gpt-5", "gpt-5o", "gpt-5o-mini"]) {
|
||||
expect(
|
||||
|
||||
@@ -39,14 +39,16 @@ const STRICT_AGENTIC_MODEL_ID_PATTERN = /^gpt-5(?:[.o-]|$)/i;
|
||||
* Supported provider + model combinations where strict-agentic is the intended
|
||||
* runtime contract. Kept as a narrow helper so both the execution-contract
|
||||
* resolver and the `update_plan` auto-enable gate converge on the same
|
||||
* definition of "GPT-5-family openai/openai-codex run".
|
||||
* definition of "GPT-5-family openai/openai-codex run". The embedded
|
||||
* `mock-openai` QA lane intentionally piggybacks on that contract so repo QA
|
||||
* can exercise the same incomplete-turn recovery rules end to end.
|
||||
*/
|
||||
export function isStrictAgenticSupportedProviderModel(params: {
|
||||
provider?: string | null;
|
||||
modelId?: string | null;
|
||||
}): boolean {
|
||||
const provider = normalizeLowercaseStringOrEmpty(params.provider ?? "");
|
||||
if (provider !== "openai" && provider !== "openai-codex") {
|
||||
if (provider !== "openai" && provider !== "openai-codex" && provider !== "mock-openai") {
|
||||
return false;
|
||||
}
|
||||
const modelId = typeof params.modelId === "string" ? params.modelId : "";
|
||||
|
||||
Reference in New Issue
Block a user