test(qa-lab): add runtime parity depth scenarios

2026-05-18 16:24:46 +00:00 · 2026-05-17 13:49:15 +08:00
parent 16ef041b5d
commit e66a6c8c8d
6 changed files with 158 additions and 1 deletion
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ Docs: https://docs.openclaw.ai
 ### Changes

 - Proxy: support HTTPS managed forward-proxy endpoints and scoped `proxy.tls.caFile` CA trust for proxy endpoint TLS. (#79171) Thanks @jesse-merhi.
+- QA-Lab: add first-hour 20-turn and optional 100-turn runtime parity scenarios, with tier metadata for standard and soak QA gates. (#80323) Thanks @100yenadmin.

 ### Fixes

--- a/extensions/qa-lab/src/scenario-catalog.test.ts
+++ b/extensions/qa-lab/src/scenario-catalog.test.ts
@@ -103,6 +103,19 @@ describe("qa scenario catalog", () => {
    expect(scenario.gatewayRuntime?.forwardHostHome).toBe(true);
  });

+  it("loads runtime parity tier metadata for first-hour and soak lanes", () => {
+    const firstHour = readQaScenarioById("runtime-first-hour-20-turn");
+    const soak = readQaScenarioById("runtime-soak-100-turn");
+
+    expect(firstHour.runtimeParityTier).toBe("standard");
+    expect(readQaScenarioExecutionConfig(firstHour.id)).toMatchObject({
+      runtimeParityComparison: "outcome-only",
+      turnCount: 20,
+    });
+    expect(soak.runtimeParityTier).toBe("soak");
+    expect(readQaScenarioExecutionConfig(soak.id)).toMatchObject({ turnCount: 100 });
+  });
+
  it("keeps the character eval scenario natural and task-shaped", () => {
    const characterConfig = readQaScenarioExecutionConfig("character-vibes-gollum") as
      | {
--- a/extensions/qa-lab/src/scenario-catalog.ts
+++ b/extensions/qa-lab/src/scenario-catalog.ts
@@ -93,6 +93,8 @@ const qaScenarioGatewayRuntimeSchema = z.object({
  forwardHostHome: z.boolean().optional(),
 });

+const qaRuntimeParityTierSchema = z.enum(["standard", "optional", "live-only", "soak"]);
+
 const qaFlowCallActionSchema = z.object({
  call: z.string().trim().min(1),
  args: z.array(z.unknown()).optional(),
@@ -176,6 +178,7 @@ const qaSeedScenarioSchema = z.object({
  title: z.string().trim().min(1),
  surface: z.string().trim().min(1),
  category: z.string().trim().min(1).optional(),
+  runtimeParityTier: qaRuntimeParityTierSchema.optional(),
  coverage: qaScenarioCoverageSchema.optional(),
  surfaces: z.array(z.string().trim().min(1)).min(1).optional(),
  risk: z.enum(["low", "medium", "high"]).optional(),
@@ -206,6 +209,7 @@ const qaScenarioPackSchema = z.object({

 export type QaScenarioExecution = z.infer<typeof qaScenarioExecutionSchema>;
 export type QaScenarioFlow = z.infer<typeof qaFlowSchema>;
+export type QaRuntimeParityTier = z.infer<typeof qaRuntimeParityTierSchema>;
 export type QaSeedScenario = z.infer<typeof qaSeedScenarioSchema>;
 export type QaSeedScenarioWithSource = QaSeedScenario & {
  sourcePath: string;
--- a/qa/scenarios/index.md
+++ b/qa/scenarios/index.md
@@ -6,7 +6,7 @@ Single source of truth for repo-backed QA suite bootstrap data.
 - `index.md` defines pack-level bootstrap data
 - each nested `*.md` scenario defines one runnable test via `qa-scenario` + `qa-flow`
 - scenario markdown may also define coverage IDs, category metadata, required plugins,
-  lane filters, and gateway config patching
+  lane filters, runtime parity tiers, and gateway config patching

 - kickoff mission
 - QA operator identity
@@ -20,6 +20,8 @@ Coverage tracking:
 - prefer reusing an existing feature ID over minting a scenario-shaped ID
 - avoid copying the scenario title into coverage IDs
 - use `pnpm openclaw qa coverage` to render the current inventory
+- use `runtimeParityTier` for runtime-pair gate membership: `standard`,
+  `optional`, `live-only`, or `soak`
 - treat the old `coverage: ["id"]` / `coverage: - id` list shape as invalid
 - keep source-path tracking in the report, not in the scenario schema

--- a/qa/scenarios/runtime/first-hour-20-turn.md
+++ b/qa/scenarios/runtime/first-hour-20-turn.md
@@ -0,0 +1,69 @@
+# First-hour 20-turn runtime parity
+
+```yaml qa-scenario
+id: runtime-first-hour-20-turn
+title: First-hour 20-turn runtime parity
+surface: runtime
+runtimeParityTier: standard
+coverage:
+  primary:
+    - runtime.first-hour-20
+  secondary:
+    - runtime.long-context
+objective: Verify both runtimes preserve a same-session conversation across the required 20-turn maintainer gate.
+successCriteria:
+  - The same QA session accepts 20 sequential user turns.
+  - Every turn receives the requested marker reply without losing session state.
+  - Runtime parity captures wall-clock and token data for the whole 20-turn cell.
+docsRefs:
+  - docs/concepts/qa-e2e-automation.md
+  - qa/scenarios/index.md
+codeRefs:
+  - extensions/qa-lab/src/suite.ts
+  - extensions/qa-lab/src/runtime-parity.ts
+execution:
+  kind: flow
+  summary: Run 20 deterministic same-session marker turns through the runtime pair.
+  config:
+    runtimeParityComparison: outcome-only
+    sessionKey: agent:qa:first-hour-20-turn
+    turnCount: 20
+```
+
+```yaml qa-flow
+steps:
+  - name: runs 20 same-session marker turns
+    actions:
+      - call: waitForGatewayHealthy
+        args:
+          - ref: env
+          - 60000
+      - call: reset
+      - set: turns
+        value:
+          expr: "Array.from({ length: config.turnCount }, (_entry, index) => ({ index, marker: `FIRST-HOUR-20-${String(index + 1).padStart(2, '0')}` }))"
+      - forEach:
+          items:
+            ref: turns
+          item: turn
+          actions:
+            - set: cursor
+              value:
+                expr: state.getSnapshot().messages.length
+            - call: runAgentPrompt
+              args:
+                - ref: env
+                - sessionKey:
+                    expr: config.sessionKey
+                  message:
+                    expr: "'first-hour 20-turn marker check ' + (turn.index + 1) + ': reply exactly `' + turn.marker + '`'"
+                  timeoutMs:
+                    expr: liveTurnTimeoutMs(env, 60000)
+            - call: waitForCondition
+              args:
+                - lambda:
+                    expr: "state.getSnapshot().messages.slice(cursor).some((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && normalizeLowercaseStringOrEmpty(candidate.text).includes(normalizeLowercaseStringOrEmpty(turn.marker)))"
+                - expr: liveTurnTimeoutMs(env, 60000)
+                - expr: "env.providerMode === 'mock-openai' ? 100 : 250"
+    detailsExpr: "`completed ${turns.length} first-hour depth turns`"
+```
--- a/qa/scenarios/runtime/soak-100-turn.md
+++ b/qa/scenarios/runtime/soak-100-turn.md
@@ -0,0 +1,68 @@
+# 100-turn runtime parity soak
+
+```yaml qa-scenario
+id: runtime-soak-100-turn
+title: 100-turn runtime parity soak
+surface: runtime
+runtimeParityTier: soak
+coverage:
+  primary:
+    - runtime.soak-100
+  secondary:
+    - runtime.long-context
+objective: Provide an optional long-run soak that can be scheduled or run in Testbox without entering the maintainer default gate.
+successCriteria:
+  - The same QA session accepts 100 sequential user turns.
+  - Every turn receives the requested marker reply without losing session state.
+  - Runtime parity captures token estimate or live token usage for the full soak cell.
+docsRefs:
+  - docs/concepts/qa-e2e-automation.md
+  - qa/scenarios/index.md
+codeRefs:
+  - extensions/qa-lab/src/suite.ts
+  - extensions/qa-lab/src/runtime-parity.ts
+execution:
+  kind: flow
+  summary: Run the optional 100-turn same-session runtime soak.
+  config:
+    sessionKey: agent:qa:runtime-soak-100
+    turnCount: 100
+```
+
+```yaml qa-flow
+steps:
+  - name: runs 100 same-session marker turns
+    actions:
+      - call: waitForGatewayHealthy
+        args:
+          - ref: env
+          - 60000
+      - call: reset
+      - set: turns
+        value:
+          expr: "Array.from({ length: config.turnCount }, (_entry, index) => ({ index, marker: `SOAK-100-${String(index + 1).padStart(3, '0')}` }))"
+      - forEach:
+          items:
+            ref: turns
+          item: turn
+          actions:
+            - set: cursor
+              value:
+                expr: state.getSnapshot().messages.length
+            - call: runAgentPrompt
+              args:
+                - ref: env
+                - sessionKey:
+                    expr: config.sessionKey
+                  message:
+                    expr: "'runtime 100-turn soak marker check ' + (turn.index + 1) + ': reply exactly `' + turn.marker + '`'"
+                  timeoutMs:
+                    expr: liveTurnTimeoutMs(env, 60000)
+            - call: waitForCondition
+              args:
+                - lambda:
+                    expr: "state.getSnapshot().messages.slice(cursor).some((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && normalizeLowercaseStringOrEmpty(candidate.text).includes(normalizeLowercaseStringOrEmpty(turn.marker)))"
+                - expr: liveTurnTimeoutMs(env, 60000)
+                - expr: "env.providerMode === 'mock-openai' ? 100 : 250"
+    detailsExpr: "`completed ${turns.length} soak turns`"
+```