From 9bd97d2c60ebe68ea40c3a65d346ea56b0ed5a6c Mon Sep 17 00:00:00 2001
From: Vincent Koc <vincentkoc@ieee.org>
Date: Fri, 22 May 2026 09:48:57 +0800
Subject: [PATCH] test(qa-lab): remove generic evidence wording

---
 CHANGELOG.md                                       |  1 +
 extensions/qa-lab/src/model-switch-eval.test.ts    | 14 +++++++-------
 extensions/qa-lab/src/model-switch-eval.ts         |  2 +-
 extensions/qa-lab/src/scenario-catalog.ts          |  2 +-
 extensions/qa-lab/src/scenario-runtime-api.test.ts |  2 +-
 extensions/qa-lab/src/scenario-runtime-api.ts      |  6 +++---
 extensions/qa-lab/src/suite-runtime-flow.test.ts   |  4 ++--
 extensions/qa-lab/src/suite-runtime-flow.ts        |  4 ++--
 qa/scenarios/index.md                              |  4 ++--
 .../models/model-switch-tool-continuity.md         |  6 +++---
 .../runtime/approval-turn-tool-followthrough.md    |  2 +-
 11 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index dba9e9cd6c2..e505e43cd33 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai
 - Dependencies: refresh provider, plugin, UI, and tooling packages, update `protobufjs` to 8.4.0 to clear the current npm advisory, and carry the Claude ACP completion patch forward to `@agentclientprotocol/claude-agent-acp` 0.36.1.
 - Agents/tools: remove the old sender-owner tool gating path so configured tools stay visible for trusted sessions while command and channel-action auth still carry real sender identity.
 - QA-Lab: add curated mock JSONL replay fixtures and first-drift reporting for runtime-parity audits. (#80323, refs #80176) Thanks @100yenadmin.
+- QA-Lab: replace generic "evidence" wording in seeded scenario prompts and success criteria with concrete descriptions of the observed QA behavior.
 - QA-Lab: include the optional 100-turn runtime parity soak in release-soak artifacts so long-run Codex/Pi transcript drift stays visible outside the default gate. (#80395) Thanks @100yenadmin.
 - QA-Lab: add a live-only long-context progress watchdog scenario for Codex app-server timeout and stalled-run sentinels. (#80323) Thanks @100yenadmin.
 - QA-Lab: tag gateway restart recovery and streaming final-integrity scenarios as live-only runtime parity lanes. (#80323) Thanks @100yenadmin.
diff --git a/extensions/qa-lab/src/model-switch-eval.test.ts b/extensions/qa-lab/src/model-switch-eval.test.ts
index 26dbe14c194..ee3af0e2e67 100644
--- a/extensions/qa-lab/src/model-switch-eval.test.ts
+++ b/extensions/qa-lab/src/model-switch-eval.test.ts
@@ -1,10 +1,10 @@
 import { describe, expect, it } from "vitest";
-import { hasModelSwitchContinuityEvidence } from "./model-switch-eval.js";
+import { hasModelSwitchContinuitySignal } from "./model-switch-eval.js";
 
 describe("qa model-switch evaluation", () => {
   it("accepts direct handoff replies that mention the kickoff task", () => {
     expect(
-      hasModelSwitchContinuityEvidence(
+      hasModelSwitchContinuitySignal(
         "Handoff confirmed: I reread QA_KICKOFF_TASK.md and switched to gpt.",
       ),
     ).toBe(true);
@@ -12,7 +12,7 @@ describe("qa model-switch evaluation", () => {
 
   it("accepts short mission-oriented switch confirmations", () => {
     expect(
-      hasModelSwitchContinuityEvidence(
+      hasModelSwitchContinuitySignal(
         "model switch complete. reread the kickoff task; qa mission stays the same.",
       ),
     ).toBe(true);
@@ -20,7 +20,7 @@ describe("qa model-switch evaluation", () => {
 
   it("accepts concise kickoff note confirmations", () => {
     expect(
-      hasModelSwitchContinuityEvidence(
+      hasModelSwitchContinuitySignal(
         "Handoff clean: after the model switch, I reread the kickoff note.",
       ),
     ).toBe(true);
@@ -28,7 +28,7 @@ describe("qa model-switch evaluation", () => {
 
   it("accepts concise paraphrases of the kickoff task after a handoff", () => {
     expect(
-      hasModelSwitchContinuityEvidence(
+      hasModelSwitchContinuitySignal(
         "Handoff is clear: after the model switch, read source and docs first, run seeded qa-channel scenarios, and report worked, failed, blocked, and follow-up.",
       ),
     ).toBe(true);
@@ -36,7 +36,7 @@ describe("qa model-switch evaluation", () => {
 
   it("rejects unrelated handoff chatter that never confirms the kickoff reread", () => {
     expect(
-      hasModelSwitchContinuityEvidence(
+      hasModelSwitchContinuitySignal(
         "subagent-handoff confirmed. qa report update: scenario pass. qa run complete.",
       ),
     ).toBe(false);
@@ -44,7 +44,7 @@ describe("qa model-switch evaluation", () => {
 
   it("rejects over-scoped multi-line wrap-ups even if they mention a switch and the mission", () => {
     expect(
-      hasModelSwitchContinuityEvidence(
+      hasModelSwitchContinuitySignal(
         `model switch acknowledged. qa mission stays the same.
 
 Final QA tally update: all mandatory scenarios resolved. QA run complete.`,
diff --git a/extensions/qa-lab/src/model-switch-eval.ts b/extensions/qa-lab/src/model-switch-eval.ts
index 7583bbe08bf..7cf030d7ae9 100644
--- a/extensions/qa-lab/src/model-switch-eval.ts
+++ b/extensions/qa-lab/src/model-switch-eval.ts
@@ -1,6 +1,6 @@
 import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/string-coerce-runtime";
 
-export function hasModelSwitchContinuityEvidence(text: string) {
+export function hasModelSwitchContinuitySignal(text: string) {
   const lower = normalizeLowercaseStringOrEmpty(text);
   const mentionsHandoff =
     lower.includes("handoff") || lower.includes("model switch") || lower.includes("switched");
diff --git a/extensions/qa-lab/src/scenario-catalog.ts b/extensions/qa-lab/src/scenario-catalog.ts
index ab0a82c0593..02f8cb3268d 100644
--- a/extensions/qa-lab/src/scenario-catalog.ts
+++ b/extensions/qa-lab/src/scenario-catalog.ts
@@ -17,7 +17,7 @@ Persona:
 Style:
 - read source and docs first
 - test systematically
-- record evidence
+- record what happened
 - end with a concise protocol report`;
 
 const qaScenarioConfigSchema = z.record(z.string(), z.unknown()).superRefine((config, ctx) => {
diff --git a/extensions/qa-lab/src/scenario-runtime-api.test.ts b/extensions/qa-lab/src/scenario-runtime-api.test.ts
index effee177572..9f15cc68dd4 100644
--- a/extensions/qa-lab/src/scenario-runtime-api.test.ts
+++ b/extensions/qa-lab/src/scenario-runtime-api.test.ts
@@ -83,7 +83,7 @@ function createDeps(overrides?: Partial<QaScenarioRuntimeDeps>): QaScenarioRunti
     hasDiscoveryLabels: fn,
     reportsDiscoveryScopeLeak: fn,
     reportsMissingDiscoveryFiles: fn,
-    hasModelSwitchContinuityEvidence: fn,
+    hasModelSwitchContinuitySignal: fn,
     ...overrides,
   };
 }
diff --git a/extensions/qa-lab/src/scenario-runtime-api.ts b/extensions/qa-lab/src/scenario-runtime-api.ts
index 07274038b07..f607b14cce6 100644
--- a/extensions/qa-lab/src/scenario-runtime-api.ts
+++ b/extensions/qa-lab/src/scenario-runtime-api.ts
@@ -95,7 +95,7 @@ export type QaScenarioRuntimeDeps = {
   hasDiscoveryLabels: QaScenarioRuntimeFunction;
   reportsDiscoveryScopeLeak: QaScenarioRuntimeFunction;
   reportsMissingDiscoveryFiles: QaScenarioRuntimeFunction;
-  hasModelSwitchContinuityEvidence: QaScenarioRuntimeFunction;
+  hasModelSwitchContinuitySignal: QaScenarioRuntimeFunction;
 };
 
 export type QaScenarioRuntimeConstants = {
@@ -186,7 +186,7 @@ type QaScenarioRuntimeApi<
   hasDiscoveryLabels: TDeps["hasDiscoveryLabels"];
   reportsDiscoveryScopeLeak: TDeps["reportsDiscoveryScopeLeak"];
   reportsMissingDiscoveryFiles: TDeps["reportsMissingDiscoveryFiles"];
-  hasModelSwitchContinuityEvidence: TDeps["hasModelSwitchContinuityEvidence"];
+  hasModelSwitchContinuitySignal: TDeps["hasModelSwitchContinuitySignal"];
   imageUnderstandingPngBase64: string;
   imageUnderstandingLargePngBase64: string;
   imageUnderstandingValidPngBase64: string;
@@ -292,7 +292,7 @@ export function createQaScenarioRuntimeApi<
     hasDiscoveryLabels: params.deps.hasDiscoveryLabels,
     reportsDiscoveryScopeLeak: params.deps.reportsDiscoveryScopeLeak,
     reportsMissingDiscoveryFiles: params.deps.reportsMissingDiscoveryFiles,
-    hasModelSwitchContinuityEvidence: params.deps.hasModelSwitchContinuityEvidence,
+    hasModelSwitchContinuitySignal: params.deps.hasModelSwitchContinuitySignal,
     imageUnderstandingPngBase64: params.constants.imageUnderstandingPngBase64,
     imageUnderstandingLargePngBase64: params.constants.imageUnderstandingLargePngBase64,
     imageUnderstandingValidPngBase64: params.constants.imageUnderstandingValidPngBase64,
diff --git a/extensions/qa-lab/src/suite-runtime-flow.test.ts b/extensions/qa-lab/src/suite-runtime-flow.test.ts
index fc2001280aa..611fc838026 100644
--- a/extensions/qa-lab/src/suite-runtime-flow.test.ts
+++ b/extensions/qa-lab/src/suite-runtime-flow.test.ts
@@ -54,7 +54,7 @@ const webEvaluate = vi.hoisted(() => vi.fn());
 const hasDiscoveryLabels = vi.hoisted(() => vi.fn());
 const reportsDiscoveryScopeLeak = vi.hoisted(() => vi.fn());
 const reportsMissingDiscoveryFiles = vi.hoisted(() => vi.fn());
-const hasModelSwitchContinuityEvidence = vi.hoisted(() => vi.fn());
+const hasModelSwitchContinuitySignal = vi.hoisted(() => vi.fn());
 const qaChannelPlugin = vi.hoisted(() => ({ id: "qa-channel" }));
 const scanGatewayLogSentinels = vi.hoisted(() => vi.fn());
 const assertNoGatewayLogSentinels = vi.hoisted(() => vi.fn());
@@ -144,7 +144,7 @@ vi.mock("./runtime-tool-fixture.js", () => ({
 }));
 
 vi.mock("./model-switch-eval.js", () => ({
-  hasModelSwitchContinuityEvidence,
+  hasModelSwitchContinuitySignal,
 }));
 
 vi.mock("./runtime-api.js", () => ({
diff --git a/extensions/qa-lab/src/suite-runtime-flow.ts b/extensions/qa-lab/src/suite-runtime-flow.ts
index 01c23b8fb28..21f1139b8ed 100644
--- a/extensions/qa-lab/src/suite-runtime-flow.ts
+++ b/extensions/qa-lab/src/suite-runtime-flow.ts
@@ -21,7 +21,7 @@ import {
 } from "./discovery-eval.js";
 import { extractQaToolPayload } from "./extract-tool-payload.js";
 import { assertNoGatewayLogSentinels, scanGatewayLogSentinels } from "./gateway-log-sentinel.js";
-import { hasModelSwitchContinuityEvidence } from "./model-switch-eval.js";
+import { hasModelSwitchContinuitySignal } from "./model-switch-eval.js";
 import { qaChannelPlugin } from "./runtime-api.js";
 import { runRuntimeToolFixture } from "./runtime-tool-fixture.js";
 import type { QaSeedScenarioWithSource } from "./scenario-catalog.js";
@@ -213,7 +213,7 @@ function createQaSuiteScenarioDeps(params: QaSuiteScenarioDepsParams) {
     hasDiscoveryLabels,
     reportsDiscoveryScopeLeak,
     reportsMissingDiscoveryFiles,
-    hasModelSwitchContinuityEvidence,
+    hasModelSwitchContinuitySignal,
   };
 }
 
diff --git a/qa/scenarios/index.md b/qa/scenarios/index.md
index 8f0ee210377..7c246d742e8 100644
--- a/qa/scenarios/index.md
+++ b/qa/scenarios/index.md
@@ -76,7 +76,7 @@ agent:
     Style:
     - read source and docs first
     - test systematically
-    - record evidence
+    - record what happened
     - end with a concise protocol report
 kickoffTask: |-
   QA mission:
@@ -84,7 +84,7 @@ kickoffTask: |-
   The repo is available in your workspace at `./repo/`.
   Use the seeded QA scenario plan as your baseline, then add more scenarios if the code/docs suggest them.
   Run the scenarios through the real qa-channel surfaces where possible.
-  Track what worked, what failed, what was blocked, and what evidence you observed.
+  Track what worked, what failed, what was blocked, and what you observed.
   End with a concise report grouped into worked / failed / blocked / follow-up.
 
   Important expectations:
diff --git a/qa/scenarios/models/model-switch-tool-continuity.md b/qa/scenarios/models/model-switch-tool-continuity.md
index 067bae0ec41..6bcf49b38d0 100644
--- a/qa/scenarios/models/model-switch-tool-continuity.md
+++ b/qa/scenarios/models/model-switch-tool-continuity.md
@@ -13,7 +13,7 @@ objective: Verify switching models preserves session context and tool use instea
 successCriteria:
   - Alternate model is actually requested.
   - A tool call still happens after the model switch.
-  - Final answer acknowledges the handoff and uses the tool-derived evidence.
+  - Final answer acknowledges the handoff and confirms it reread the QA mission.
 docsRefs:
   - docs/help/testing.md
   - docs/concepts/model-failover.md
@@ -68,10 +68,10 @@ steps:
         saveAs: outbound
         args:
           - lambda:
-              expr: "state.getSnapshot().messages.slice(beforeSwitchCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && hasModelSwitchContinuityEvidence(candidate.text)).at(-1)"
+              expr: "state.getSnapshot().messages.slice(beforeSwitchCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && hasModelSwitchContinuitySignal(candidate.text)).at(-1)"
           - expr: resolveQaLiveTurnTimeoutMs(env, 20000, env.alternateModel)
       - assert:
-          expr: hasModelSwitchContinuityEvidence(outbound.text)
+          expr: hasModelSwitchContinuitySignal(outbound.text)
           message:
             expr: "`switch reply missed kickoff continuity: ${outbound.text}`"
       - if:
diff --git a/qa/scenarios/runtime/approval-turn-tool-followthrough.md b/qa/scenarios/runtime/approval-turn-tool-followthrough.md
index cdd8c7700ac..72b001e1dc8 100644
--- a/qa/scenarios/runtime/approval-turn-tool-followthrough.md
+++ b/qa/scenarios/runtime/approval-turn-tool-followthrough.md
@@ -13,7 +13,7 @@ objective: Verify a short approval like "ok do it" triggers immediate tool use i
 successCriteria:
   - Agent can keep the pre-action turn brief.
   - The short approval leads to a real tool call on the next turn.
-  - Final answer uses tool-derived evidence instead of placeholder progress text.
+  - Final answer cites the actual file read instead of placeholder progress text.
 docsRefs:
   - docs/help/testing.md
   - docs/channels/qa-channel.md