From 9bd97d2c60ebe68ea40c3a65d346ea56b0ed5a6c Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 22 May 2026 09:48:57 +0800 Subject: [PATCH] test(qa-lab): remove generic evidence wording --- CHANGELOG.md | 1 + extensions/qa-lab/src/model-switch-eval.test.ts | 14 +++++++------- extensions/qa-lab/src/model-switch-eval.ts | 2 +- extensions/qa-lab/src/scenario-catalog.ts | 2 +- extensions/qa-lab/src/scenario-runtime-api.test.ts | 2 +- extensions/qa-lab/src/scenario-runtime-api.ts | 6 +++--- extensions/qa-lab/src/suite-runtime-flow.test.ts | 4 ++-- extensions/qa-lab/src/suite-runtime-flow.ts | 4 ++-- qa/scenarios/index.md | 4 ++-- .../models/model-switch-tool-continuity.md | 6 +++--- .../runtime/approval-turn-tool-followthrough.md | 2 +- 11 files changed, 24 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dba9e9cd6c2..e505e43cd33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai - Dependencies: refresh provider, plugin, UI, and tooling packages, update `protobufjs` to 8.4.0 to clear the current npm advisory, and carry the Claude ACP completion patch forward to `@agentclientprotocol/claude-agent-acp` 0.36.1. - Agents/tools: remove the old sender-owner tool gating path so configured tools stay visible for trusted sessions while command and channel-action auth still carry real sender identity. - QA-Lab: add curated mock JSONL replay fixtures and first-drift reporting for runtime-parity audits. (#80323, refs #80176) Thanks @100yenadmin. +- QA-Lab: replace generic evidence framing in seeded scenario prompts with concrete observed QA behavior. - QA-Lab: include the optional 100-turn runtime parity soak in release-soak artifacts so long-run Codex/Pi transcript drift stays visible outside the default gate. (#80395) Thanks @100yenadmin. - QA-Lab: add a live-only long-context progress watchdog scenario for Codex app-server timeout and stalled-run sentinels. (#80323) Thanks @100yenadmin. 
- QA-Lab: tag gateway restart recovery and streaming final-integrity scenarios as live-only runtime parity lanes. (#80323) Thanks @100yenadmin. diff --git a/extensions/qa-lab/src/model-switch-eval.test.ts b/extensions/qa-lab/src/model-switch-eval.test.ts index 26dbe14c194..ee3af0e2e67 100644 --- a/extensions/qa-lab/src/model-switch-eval.test.ts +++ b/extensions/qa-lab/src/model-switch-eval.test.ts @@ -1,10 +1,10 @@ import { describe, expect, it } from "vitest"; -import { hasModelSwitchContinuityEvidence } from "./model-switch-eval.js"; +import { hasModelSwitchContinuitySignal } from "./model-switch-eval.js"; describe("qa model-switch evaluation", () => { it("accepts direct handoff replies that mention the kickoff task", () => { expect( - hasModelSwitchContinuityEvidence( + hasModelSwitchContinuitySignal( "Handoff confirmed: I reread QA_KICKOFF_TASK.md and switched to gpt.", ), ).toBe(true); @@ -12,7 +12,7 @@ describe("qa model-switch evaluation", () => { it("accepts short mission-oriented switch confirmations", () => { expect( - hasModelSwitchContinuityEvidence( + hasModelSwitchContinuitySignal( "model switch complete. 
reread the kickoff task; qa mission stays the same.", ), ).toBe(true); @@ -20,7 +20,7 @@ describe("qa model-switch evaluation", () => { it("accepts concise kickoff note confirmations", () => { expect( - hasModelSwitchContinuityEvidence( + hasModelSwitchContinuitySignal( "Handoff clean: after the model switch, I reread the kickoff note.", ), ).toBe(true); @@ -28,7 +28,7 @@ describe("qa model-switch evaluation", () => { it("accepts concise paraphrases of the kickoff task after a handoff", () => { expect( - hasModelSwitchContinuityEvidence( + hasModelSwitchContinuitySignal( "Handoff is clear: after the model switch, read source and docs first, run seeded qa-channel scenarios, and report worked, failed, blocked, and follow-up.", ), ).toBe(true); @@ -36,7 +36,7 @@ describe("qa model-switch evaluation", () => { it("rejects unrelated handoff chatter that never confirms the kickoff reread", () => { expect( - hasModelSwitchContinuityEvidence( + hasModelSwitchContinuitySignal( "subagent-handoff confirmed. qa report update: scenario pass. qa run complete.", ), ).toBe(false); @@ -44,7 +44,7 @@ describe("qa model-switch evaluation", () => { it("rejects over-scoped multi-line wrap-ups even if they mention a switch and the mission", () => { expect( - hasModelSwitchContinuityEvidence( + hasModelSwitchContinuitySignal( `model switch acknowledged. qa mission stays the same. Final QA tally update: all mandatory scenarios resolved. 
QA run complete.`, diff --git a/extensions/qa-lab/src/model-switch-eval.ts b/extensions/qa-lab/src/model-switch-eval.ts index 7583bbe08bf..7cf030d7ae9 100644 --- a/extensions/qa-lab/src/model-switch-eval.ts +++ b/extensions/qa-lab/src/model-switch-eval.ts @@ -1,6 +1,6 @@ import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/string-coerce-runtime"; -export function hasModelSwitchContinuityEvidence(text: string) { +export function hasModelSwitchContinuitySignal(text: string) { const lower = normalizeLowercaseStringOrEmpty(text); const mentionsHandoff = lower.includes("handoff") || lower.includes("model switch") || lower.includes("switched"); diff --git a/extensions/qa-lab/src/scenario-catalog.ts b/extensions/qa-lab/src/scenario-catalog.ts index ab0a82c0593..02f8cb3268d 100644 --- a/extensions/qa-lab/src/scenario-catalog.ts +++ b/extensions/qa-lab/src/scenario-catalog.ts @@ -17,7 +17,7 @@ Persona: Style: - read source and docs first - test systematically -- record evidence +- record what happened - end with a concise protocol report`; const qaScenarioConfigSchema = z.record(z.string(), z.unknown()).superRefine((config, ctx) => { diff --git a/extensions/qa-lab/src/scenario-runtime-api.test.ts b/extensions/qa-lab/src/scenario-runtime-api.test.ts index effee177572..9f15cc68dd4 100644 --- a/extensions/qa-lab/src/scenario-runtime-api.test.ts +++ b/extensions/qa-lab/src/scenario-runtime-api.test.ts @@ -83,7 +83,7 @@ function createDeps(overrides?: Partial<QaScenarioRuntimeDeps>): QaScenarioRunti hasDiscoveryLabels: fn, reportsDiscoveryScopeLeak: fn, reportsMissingDiscoveryFiles: fn, - hasModelSwitchContinuityEvidence: fn, + hasModelSwitchContinuitySignal: fn, ...overrides, }; } diff --git a/extensions/qa-lab/src/scenario-runtime-api.ts b/extensions/qa-lab/src/scenario-runtime-api.ts index 07274038b07..f607b14cce6 100644 --- a/extensions/qa-lab/src/scenario-runtime-api.ts +++ b/extensions/qa-lab/src/scenario-runtime-api.ts @@ -95,7 +95,7 @@ export type QaScenarioRuntimeDeps = {
hasDiscoveryLabels: QaScenarioRuntimeFunction; reportsDiscoveryScopeLeak: QaScenarioRuntimeFunction; reportsMissingDiscoveryFiles: QaScenarioRuntimeFunction; - hasModelSwitchContinuityEvidence: QaScenarioRuntimeFunction; + hasModelSwitchContinuitySignal: QaScenarioRuntimeFunction; }; export type QaScenarioRuntimeConstants = { @@ -186,7 +186,7 @@ type QaScenarioRuntimeApi< hasDiscoveryLabels: TDeps["hasDiscoveryLabels"]; reportsDiscoveryScopeLeak: TDeps["reportsDiscoveryScopeLeak"]; reportsMissingDiscoveryFiles: TDeps["reportsMissingDiscoveryFiles"]; - hasModelSwitchContinuityEvidence: TDeps["hasModelSwitchContinuityEvidence"]; + hasModelSwitchContinuitySignal: TDeps["hasModelSwitchContinuitySignal"]; imageUnderstandingPngBase64: string; imageUnderstandingLargePngBase64: string; imageUnderstandingValidPngBase64: string; @@ -292,7 +292,7 @@ export function createQaScenarioRuntimeApi< hasDiscoveryLabels: params.deps.hasDiscoveryLabels, reportsDiscoveryScopeLeak: params.deps.reportsDiscoveryScopeLeak, reportsMissingDiscoveryFiles: params.deps.reportsMissingDiscoveryFiles, - hasModelSwitchContinuityEvidence: params.deps.hasModelSwitchContinuityEvidence, + hasModelSwitchContinuitySignal: params.deps.hasModelSwitchContinuitySignal, imageUnderstandingPngBase64: params.constants.imageUnderstandingPngBase64, imageUnderstandingLargePngBase64: params.constants.imageUnderstandingLargePngBase64, imageUnderstandingValidPngBase64: params.constants.imageUnderstandingValidPngBase64, diff --git a/extensions/qa-lab/src/suite-runtime-flow.test.ts b/extensions/qa-lab/src/suite-runtime-flow.test.ts index fc2001280aa..611fc838026 100644 --- a/extensions/qa-lab/src/suite-runtime-flow.test.ts +++ b/extensions/qa-lab/src/suite-runtime-flow.test.ts @@ -54,7 +54,7 @@ const webEvaluate = vi.hoisted(() => vi.fn()); const hasDiscoveryLabels = vi.hoisted(() => vi.fn()); const reportsDiscoveryScopeLeak = vi.hoisted(() => vi.fn()); const reportsMissingDiscoveryFiles = vi.hoisted(() => vi.fn()); 
-const hasModelSwitchContinuityEvidence = vi.hoisted(() => vi.fn()); +const hasModelSwitchContinuitySignal = vi.hoisted(() => vi.fn()); const qaChannelPlugin = vi.hoisted(() => ({ id: "qa-channel" })); const scanGatewayLogSentinels = vi.hoisted(() => vi.fn()); const assertNoGatewayLogSentinels = vi.hoisted(() => vi.fn()); @@ -144,7 +144,7 @@ vi.mock("./runtime-tool-fixture.js", () => ({ })); vi.mock("./model-switch-eval.js", () => ({ - hasModelSwitchContinuityEvidence, + hasModelSwitchContinuitySignal, })); vi.mock("./runtime-api.js", () => ({ diff --git a/extensions/qa-lab/src/suite-runtime-flow.ts b/extensions/qa-lab/src/suite-runtime-flow.ts index 01c23b8fb28..21f1139b8ed 100644 --- a/extensions/qa-lab/src/suite-runtime-flow.ts +++ b/extensions/qa-lab/src/suite-runtime-flow.ts @@ -21,7 +21,7 @@ import { } from "./discovery-eval.js"; import { extractQaToolPayload } from "./extract-tool-payload.js"; import { assertNoGatewayLogSentinels, scanGatewayLogSentinels } from "./gateway-log-sentinel.js"; -import { hasModelSwitchContinuityEvidence } from "./model-switch-eval.js"; +import { hasModelSwitchContinuitySignal } from "./model-switch-eval.js"; import { qaChannelPlugin } from "./runtime-api.js"; import { runRuntimeToolFixture } from "./runtime-tool-fixture.js"; import type { QaSeedScenarioWithSource } from "./scenario-catalog.js"; @@ -213,7 +213,7 @@ function createQaSuiteScenarioDeps(params: QaSuiteScenarioDepsParams) { hasDiscoveryLabels, reportsDiscoveryScopeLeak, reportsMissingDiscoveryFiles, - hasModelSwitchContinuityEvidence, + hasModelSwitchContinuitySignal, }; } diff --git a/qa/scenarios/index.md b/qa/scenarios/index.md index 8f0ee210377..7c246d742e8 100644 --- a/qa/scenarios/index.md +++ b/qa/scenarios/index.md @@ -76,7 +76,7 @@ agent: Style: - read source and docs first - test systematically - - record evidence + - record what happened - end with a concise protocol report kickoffTask: |- QA mission: @@ -84,7 +84,7 @@ kickoffTask: |- The repo is available 
in your workspace at `./repo/`. Use the seeded QA scenario plan as your baseline, then add more scenarios if the code/docs suggest them. Run the scenarios through the real qa-channel surfaces where possible. - Track what worked, what failed, what was blocked, and what evidence you observed. + Track what worked, what failed, what was blocked, and what you observed. End with a concise report grouped into worked / failed / blocked / follow-up. Important expectations: diff --git a/qa/scenarios/models/model-switch-tool-continuity.md b/qa/scenarios/models/model-switch-tool-continuity.md index 067bae0ec41..6bcf49b38d0 100644 --- a/qa/scenarios/models/model-switch-tool-continuity.md +++ b/qa/scenarios/models/model-switch-tool-continuity.md @@ -13,7 +13,7 @@ objective: Verify switching models preserves session context and tool use instea successCriteria: - Alternate model is actually requested. - A tool call still happens after the model switch. - - Final answer acknowledges the handoff and uses the tool-derived evidence. + - Final answer acknowledges the handoff and the reread of the QA mission.
docsRefs: - docs/help/testing.md - docs/concepts/model-failover.md @@ -68,10 +68,10 @@ steps: saveAs: outbound args: - lambda: - expr: "state.getSnapshot().messages.slice(beforeSwitchCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && hasModelSwitchContinuityEvidence(candidate.text)).at(-1)" + expr: "state.getSnapshot().messages.slice(beforeSwitchCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && hasModelSwitchContinuitySignal(candidate.text)).at(-1)" - expr: resolveQaLiveTurnTimeoutMs(env, 20000, env.alternateModel) - assert: - expr: hasModelSwitchContinuityEvidence(outbound.text) + expr: hasModelSwitchContinuitySignal(outbound.text) message: expr: "`switch reply missed kickoff continuity: ${outbound.text}`" - if: diff --git a/qa/scenarios/runtime/approval-turn-tool-followthrough.md b/qa/scenarios/runtime/approval-turn-tool-followthrough.md index cdd8c7700ac..72b001e1dc8 100644 --- a/qa/scenarios/runtime/approval-turn-tool-followthrough.md +++ b/qa/scenarios/runtime/approval-turn-tool-followthrough.md @@ -13,7 +13,7 @@ objective: Verify a short approval like "ok do it" triggers immediate tool use i successCriteria: - Agent can keep the pre-action turn brief. - The short approval leads to a real tool call on the next turn. - - Final answer uses tool-derived evidence instead of placeholder progress text. + - Final answer cites the actual file read instead of placeholder progress text. docsRefs: - docs/help/testing.md - docs/channels/qa-channel.md