From e66a6c8c8d4fbe3331be19c8167f9d757e2bb2ba Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sun, 17 May 2026 13:49:15 +0800 Subject: [PATCH] test(qa-lab): add runtime parity depth scenarios --- CHANGELOG.md | 1 + .../qa-lab/src/scenario-catalog.test.ts | 13 ++++ extensions/qa-lab/src/scenario-catalog.ts | 4 ++ qa/scenarios/index.md | 4 +- qa/scenarios/runtime/first-hour-20-turn.md | 69 +++++++++++++++++++ qa/scenarios/runtime/soak-100-turn.md | 68 ++++++++++++++++++ 6 files changed, 158 insertions(+), 1 deletion(-) create mode 100644 qa/scenarios/runtime/first-hour-20-turn.md create mode 100644 qa/scenarios/runtime/soak-100-turn.md diff --git a/CHANGELOG.md b/CHANGELOG.md index a6574fb1a83..e54abd52a61 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ Docs: https://docs.openclaw.ai ### Changes - Proxy: support HTTPS managed forward-proxy endpoints and scoped `proxy.tls.caFile` CA trust for proxy endpoint TLS. (#79171) Thanks @jesse-merhi. +- QA-Lab: add first-hour 20-turn and optional 100-turn runtime parity scenarios, with tier metadata for standard and soak QA gates. (#80323) Thanks @100yenadmin. 
### Fixes diff --git a/extensions/qa-lab/src/scenario-catalog.test.ts b/extensions/qa-lab/src/scenario-catalog.test.ts index a69475d4179..7e1ce8698b7 100644 --- a/extensions/qa-lab/src/scenario-catalog.test.ts +++ b/extensions/qa-lab/src/scenario-catalog.test.ts @@ -103,6 +103,19 @@ describe("qa scenario catalog", () => { expect(scenario.gatewayRuntime?.forwardHostHome).toBe(true); }); + it("loads runtime parity tier metadata for first-hour and soak lanes", () => { + const firstHour = readQaScenarioById("runtime-first-hour-20-turn"); + const soak = readQaScenarioById("runtime-soak-100-turn"); + + expect(firstHour.runtimeParityTier).toBe("standard"); + expect(readQaScenarioExecutionConfig(firstHour.id)).toMatchObject({ + runtimeParityComparison: "outcome-only", + turnCount: 20, + }); + expect(soak.runtimeParityTier).toBe("soak"); + expect(readQaScenarioExecutionConfig(soak.id)).toMatchObject({ turnCount: 100 }); + }); + it("keeps the character eval scenario natural and task-shaped", () => { const characterConfig = readQaScenarioExecutionConfig("character-vibes-gollum") as | { diff --git a/extensions/qa-lab/src/scenario-catalog.ts b/extensions/qa-lab/src/scenario-catalog.ts index 554147732ee..1d93ba31951 100644 --- a/extensions/qa-lab/src/scenario-catalog.ts +++ b/extensions/qa-lab/src/scenario-catalog.ts @@ -93,6 +93,8 @@ const qaScenarioGatewayRuntimeSchema = z.object({ forwardHostHome: z.boolean().optional(), }); +const qaRuntimeParityTierSchema = z.enum(["standard", "optional", "live-only", "soak"]); + const qaFlowCallActionSchema = z.object({ call: z.string().trim().min(1), args: z.array(z.unknown()).optional(), @@ -176,6 +178,7 @@ const qaSeedScenarioSchema = z.object({ title: z.string().trim().min(1), surface: z.string().trim().min(1), category: z.string().trim().min(1).optional(), + runtimeParityTier: qaRuntimeParityTierSchema.optional(), coverage: qaScenarioCoverageSchema.optional(), surfaces: z.array(z.string().trim().min(1)).min(1).optional(), risk: 
z.enum(["low", "medium", "high"]).optional(), @@ -206,6 +209,7 @@ const qaScenarioPackSchema = z.object({ export type QaScenarioExecution = z.infer; export type QaScenarioFlow = z.infer; +export type QaRuntimeParityTier = z.infer<typeof qaRuntimeParityTierSchema>; export type QaSeedScenario = z.infer<typeof qaSeedScenarioSchema>; export type QaSeedScenarioWithSource = QaSeedScenario & { sourcePath: string; diff --git a/qa/scenarios/index.md b/qa/scenarios/index.md index 0b9fe1710a1..7a7f019bad6 100644 --- a/qa/scenarios/index.md +++ b/qa/scenarios/index.md @@ -6,7 +6,7 @@ Single source of truth for repo-backed QA suite bootstrap data. - `index.md` defines pack-level bootstrap data - each nested `*.md` scenario defines one runnable test via `qa-scenario` + `qa-flow` - scenario markdown may also define coverage IDs, category metadata, required plugins, - lane filters, and gateway config patching + lane filters, runtime parity tiers, and gateway config patching - kickoff mission - QA operator identity @@ -20,6 +20,8 @@ Coverage tracking: - prefer reusing an existing feature ID over minting a scenario-shaped ID - avoid copying the scenario title into coverage IDs - use `pnpm openclaw qa coverage` to render the current inventory +- use `runtimeParityTier` for runtime-pair gate membership: `standard`, + `optional`, `live-only`, or `soak` - treat the old `coverage: ["id"]` / `coverage: - id` list shape as invalid - keep source-path tracking in the report, not in the scenario schema diff --git a/qa/scenarios/runtime/first-hour-20-turn.md b/qa/scenarios/runtime/first-hour-20-turn.md new file mode 100644 index 00000000000..d9041c45eec --- /dev/null +++ b/qa/scenarios/runtime/first-hour-20-turn.md @@ -0,0 +1,69 @@ +# First-hour 20-turn runtime parity + +```yaml qa-scenario +id: runtime-first-hour-20-turn +title: First-hour 20-turn runtime parity +surface: runtime +runtimeParityTier: standard +coverage: + primary: + - runtime.first-hour-20 + secondary: + - runtime.long-context +objective: Verify both runtimes preserve a same-session 
conversation across the required 20-turn maintainer gate. +successCriteria: + - The same QA session accepts 20 sequential user turns. + - Every turn receives the requested marker reply without losing session state. + - Runtime parity captures wall-clock and token data for the whole 20-turn cell. +docsRefs: + - docs/concepts/qa-e2e-automation.md + - qa/scenarios/index.md +codeRefs: + - extensions/qa-lab/src/suite.ts + - extensions/qa-lab/src/runtime-parity.ts +execution: + kind: flow + summary: Run 20 deterministic same-session marker turns through the runtime pair. + config: + runtimeParityComparison: outcome-only + sessionKey: agent:qa:first-hour-20-turn + turnCount: 20 +``` + +```yaml qa-flow +steps: + - name: runs 20 same-session marker turns + actions: + - call: waitForGatewayHealthy + args: + - ref: env + - 60000 + - call: reset + - set: turns + value: + expr: "Array.from({ length: config.turnCount }, (_entry, index) => ({ index, marker: `FIRST-HOUR-20-${String(index + 1).padStart(2, '0')}` }))" + - forEach: + items: + ref: turns + item: turn + actions: + - set: cursor + value: + expr: state.getSnapshot().messages.length + - call: runAgentPrompt + args: + - ref: env + - sessionKey: + expr: config.sessionKey + message: + expr: "'first-hour 20-turn marker check ' + (turn.index + 1) + ': reply exactly `' + turn.marker + '`'" + timeoutMs: + expr: liveTurnTimeoutMs(env, 60000) + - call: waitForCondition + args: + - lambda: + expr: "state.getSnapshot().messages.slice(cursor).some((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && normalizeLowercaseStringOrEmpty(candidate.text).includes(normalizeLowercaseStringOrEmpty(turn.marker)))" + - expr: liveTurnTimeoutMs(env, 60000) + - expr: "env.providerMode === 'mock-openai' ? 
100 : 250" + detailsExpr: "`completed ${turns.length} first-hour depth turns`" +``` diff --git a/qa/scenarios/runtime/soak-100-turn.md b/qa/scenarios/runtime/soak-100-turn.md new file mode 100644 index 00000000000..d4f4caa8b80 --- /dev/null +++ b/qa/scenarios/runtime/soak-100-turn.md @@ -0,0 +1,68 @@ +# 100-turn runtime parity soak + +```yaml qa-scenario +id: runtime-soak-100-turn +title: 100-turn runtime parity soak +surface: runtime +runtimeParityTier: soak +coverage: + primary: + - runtime.soak-100 + secondary: + - runtime.long-context +objective: Provide an optional long-run soak that can be scheduled or run in Testbox without entering the maintainer default gate. +successCriteria: + - The same QA session accepts 100 sequential user turns. + - Every turn receives the requested marker reply without losing session state. + - Runtime parity captures token estimate or live token usage for the full soak cell. +docsRefs: + - docs/concepts/qa-e2e-automation.md + - qa/scenarios/index.md +codeRefs: + - extensions/qa-lab/src/suite.ts + - extensions/qa-lab/src/runtime-parity.ts +execution: + kind: flow + summary: Run the optional 100-turn same-session runtime soak. 
+ config: + sessionKey: agent:qa:runtime-soak-100 + turnCount: 100 +``` + +```yaml qa-flow +steps: + - name: runs 100 same-session marker turns + actions: + - call: waitForGatewayHealthy + args: + - ref: env + - 60000 + - call: reset + - set: turns + value: + expr: "Array.from({ length: config.turnCount }, (_entry, index) => ({ index, marker: `SOAK-100-${String(index + 1).padStart(3, '0')}` }))" + - forEach: + items: + ref: turns + item: turn + actions: + - set: cursor + value: + expr: state.getSnapshot().messages.length + - call: runAgentPrompt + args: + - ref: env + - sessionKey: + expr: config.sessionKey + message: + expr: "'runtime 100-turn soak marker check ' + (turn.index + 1) + ': reply exactly `' + turn.marker + '`'" + timeoutMs: + expr: liveTurnTimeoutMs(env, 60000) + - call: waitForCondition + args: + - lambda: + expr: "state.getSnapshot().messages.slice(cursor).some((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && normalizeLowercaseStringOrEmpty(candidate.text).includes(normalizeLowercaseStringOrEmpty(turn.marker)))" + - expr: liveTurnTimeoutMs(env, 60000) + - expr: "env.providerMode === 'mock-openai' ? 100 : 250" + detailsExpr: "`completed ${turns.length} soak turns`" +```