test(qa-lab): remove generic evidence wording

2026-07-12 09:06:07 +00:00 · 2026-05-22 09:48:57 +08:00
parent a9176e9190
commit 9bd97d2c60
11 changed files with 24 additions and 23 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai
 - Dependencies: refresh provider, plugin, UI, and tooling packages, update `protobufjs` to 8.4.0 to clear the current npm advisory, and carry the Claude ACP completion patch forward to `@agentclientprotocol/claude-agent-acp` 0.36.1.
 - Agents/tools: remove the old sender-owner tool gating path so configured tools stay visible for trusted sessions while command and channel-action auth still carry real sender identity.
 - QA-Lab: add curated mock JSONL replay fixtures and first-drift reporting for runtime-parity audits. (#80323, refs #80176) Thanks @100yenadmin.
+- QA-Lab: replace generic evidence framing in seeded scenario prompts with concrete observed QA behavior.
 - QA-Lab: include the optional 100-turn runtime parity soak in release-soak artifacts so long-run Codex/Pi transcript drift stays visible outside the default gate. (#80395) Thanks @100yenadmin.
 - QA-Lab: add a live-only long-context progress watchdog scenario for Codex app-server timeout and stalled-run sentinels. (#80323) Thanks @100yenadmin.
 - QA-Lab: tag gateway restart recovery and streaming final-integrity scenarios as live-only runtime parity lanes. (#80323) Thanks @100yenadmin.
--- a/extensions/qa-lab/src/model-switch-eval.test.ts
+++ b/extensions/qa-lab/src/model-switch-eval.test.ts
@@ -1,10 +1,10 @@
 import { describe, expect, it } from "vitest";
-import { hasModelSwitchContinuityEvidence } from "./model-switch-eval.js";
+import { hasModelSwitchContinuitySignal } from "./model-switch-eval.js";

 describe("qa model-switch evaluation", () => {
  it("accepts direct handoff replies that mention the kickoff task", () => {
    expect(
-      hasModelSwitchContinuityEvidence(
+      hasModelSwitchContinuitySignal(
        "Handoff confirmed: I reread QA_KICKOFF_TASK.md and switched to gpt.",
      ),
    ).toBe(true);
@@ -12,7 +12,7 @@ describe("qa model-switch evaluation", () => {

  it("accepts short mission-oriented switch confirmations", () => {
    expect(
-      hasModelSwitchContinuityEvidence(
+      hasModelSwitchContinuitySignal(
        "model switch complete. reread the kickoff task; qa mission stays the same.",
      ),
    ).toBe(true);
@@ -20,7 +20,7 @@ describe("qa model-switch evaluation", () => {

  it("accepts concise kickoff note confirmations", () => {
    expect(
-      hasModelSwitchContinuityEvidence(
+      hasModelSwitchContinuitySignal(
        "Handoff clean: after the model switch, I reread the kickoff note.",
      ),
    ).toBe(true);
@@ -28,7 +28,7 @@ describe("qa model-switch evaluation", () => {

  it("accepts concise paraphrases of the kickoff task after a handoff", () => {
    expect(
-      hasModelSwitchContinuityEvidence(
+      hasModelSwitchContinuitySignal(
        "Handoff is clear: after the model switch, read source and docs first, run seeded qa-channel scenarios, and report worked, failed, blocked, and follow-up.",
      ),
    ).toBe(true);
@@ -36,7 +36,7 @@ describe("qa model-switch evaluation", () => {

  it("rejects unrelated handoff chatter that never confirms the kickoff reread", () => {
    expect(
-      hasModelSwitchContinuityEvidence(
+      hasModelSwitchContinuitySignal(
        "subagent-handoff confirmed. qa report update: scenario pass. qa run complete.",
      ),
    ).toBe(false);
@@ -44,7 +44,7 @@ describe("qa model-switch evaluation", () => {

  it("rejects over-scoped multi-line wrap-ups even if they mention a switch and the mission", () => {
    expect(
-      hasModelSwitchContinuityEvidence(
+      hasModelSwitchContinuitySignal(
        `model switch acknowledged. qa mission stays the same.

 Final QA tally update: all mandatory scenarios resolved. QA run complete.`,
--- a/extensions/qa-lab/src/model-switch-eval.ts
+++ b/extensions/qa-lab/src/model-switch-eval.ts
@@ -1,6 +1,6 @@
 import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/string-coerce-runtime";

-export function hasModelSwitchContinuityEvidence(text: string) {
+export function hasModelSwitchContinuitySignal(text: string) {
  const lower = normalizeLowercaseStringOrEmpty(text);
  const mentionsHandoff =
    lower.includes("handoff") || lower.includes("model switch") || lower.includes("switched");
--- a/extensions/qa-lab/src/scenario-catalog.ts
+++ b/extensions/qa-lab/src/scenario-catalog.ts
@@ -17,7 +17,7 @@ Persona:
 Style:
 - read source and docs first
 - test systematically
- record evidence
+- record what happened
 - end with a concise protocol report`;

 const qaScenarioConfigSchema = z.record(z.string(), z.unknown()).superRefine((config, ctx) => {
--- a/extensions/qa-lab/src/scenario-runtime-api.test.ts
+++ b/extensions/qa-lab/src/scenario-runtime-api.test.ts
@@ -83,7 +83,7 @@ function createDeps(overrides?: Partial<QaScenarioRuntimeDeps>): QaScenarioRunti
    hasDiscoveryLabels: fn,
    reportsDiscoveryScopeLeak: fn,
    reportsMissingDiscoveryFiles: fn,
-    hasModelSwitchContinuityEvidence: fn,
+    hasModelSwitchContinuitySignal: fn,
    ...overrides,
  };
 }
--- a/extensions/qa-lab/src/scenario-runtime-api.ts
+++ b/extensions/qa-lab/src/scenario-runtime-api.ts
@@ -95,7 +95,7 @@ export type QaScenarioRuntimeDeps = {
  hasDiscoveryLabels: QaScenarioRuntimeFunction;
  reportsDiscoveryScopeLeak: QaScenarioRuntimeFunction;
  reportsMissingDiscoveryFiles: QaScenarioRuntimeFunction;
-  hasModelSwitchContinuityEvidence: QaScenarioRuntimeFunction;
+  hasModelSwitchContinuitySignal: QaScenarioRuntimeFunction;
 };

 export type QaScenarioRuntimeConstants = {
@@ -186,7 +186,7 @@ type QaScenarioRuntimeApi<
  hasDiscoveryLabels: TDeps["hasDiscoveryLabels"];
  reportsDiscoveryScopeLeak: TDeps["reportsDiscoveryScopeLeak"];
  reportsMissingDiscoveryFiles: TDeps["reportsMissingDiscoveryFiles"];
-  hasModelSwitchContinuityEvidence: TDeps["hasModelSwitchContinuityEvidence"];
+  hasModelSwitchContinuitySignal: TDeps["hasModelSwitchContinuitySignal"];
  imageUnderstandingPngBase64: string;
  imageUnderstandingLargePngBase64: string;
  imageUnderstandingValidPngBase64: string;
@@ -292,7 +292,7 @@ export function createQaScenarioRuntimeApi<
    hasDiscoveryLabels: params.deps.hasDiscoveryLabels,
    reportsDiscoveryScopeLeak: params.deps.reportsDiscoveryScopeLeak,
    reportsMissingDiscoveryFiles: params.deps.reportsMissingDiscoveryFiles,
-    hasModelSwitchContinuityEvidence: params.deps.hasModelSwitchContinuityEvidence,
+    hasModelSwitchContinuitySignal: params.deps.hasModelSwitchContinuitySignal,
    imageUnderstandingPngBase64: params.constants.imageUnderstandingPngBase64,
    imageUnderstandingLargePngBase64: params.constants.imageUnderstandingLargePngBase64,
    imageUnderstandingValidPngBase64: params.constants.imageUnderstandingValidPngBase64,
--- a/extensions/qa-lab/src/suite-runtime-flow.test.ts
+++ b/extensions/qa-lab/src/suite-runtime-flow.test.ts
@@ -54,7 +54,7 @@ const webEvaluate = vi.hoisted(() => vi.fn());
 const hasDiscoveryLabels = vi.hoisted(() => vi.fn());
 const reportsDiscoveryScopeLeak = vi.hoisted(() => vi.fn());
 const reportsMissingDiscoveryFiles = vi.hoisted(() => vi.fn());
-const hasModelSwitchContinuityEvidence = vi.hoisted(() => vi.fn());
+const hasModelSwitchContinuitySignal = vi.hoisted(() => vi.fn());
 const qaChannelPlugin = vi.hoisted(() => ({ id: "qa-channel" }));
 const scanGatewayLogSentinels = vi.hoisted(() => vi.fn());
 const assertNoGatewayLogSentinels = vi.hoisted(() => vi.fn());
@@ -144,7 +144,7 @@ vi.mock("./runtime-tool-fixture.js", () => ({
 }));

 vi.mock("./model-switch-eval.js", () => ({
-  hasModelSwitchContinuityEvidence,
+  hasModelSwitchContinuitySignal,
 }));

 vi.mock("./runtime-api.js", () => ({
--- a/extensions/qa-lab/src/suite-runtime-flow.ts
+++ b/extensions/qa-lab/src/suite-runtime-flow.ts
@@ -21,7 +21,7 @@ import {
 } from "./discovery-eval.js";
 import { extractQaToolPayload } from "./extract-tool-payload.js";
 import { assertNoGatewayLogSentinels, scanGatewayLogSentinels } from "./gateway-log-sentinel.js";
-import { hasModelSwitchContinuityEvidence } from "./model-switch-eval.js";
+import { hasModelSwitchContinuitySignal } from "./model-switch-eval.js";
 import { qaChannelPlugin } from "./runtime-api.js";
 import { runRuntimeToolFixture } from "./runtime-tool-fixture.js";
 import type { QaSeedScenarioWithSource } from "./scenario-catalog.js";
@@ -213,7 +213,7 @@ function createQaSuiteScenarioDeps(params: QaSuiteScenarioDepsParams) {
    hasDiscoveryLabels,
    reportsDiscoveryScopeLeak,
    reportsMissingDiscoveryFiles,
-    hasModelSwitchContinuityEvidence,
+    hasModelSwitchContinuitySignal,
  };
 }

--- a/qa/scenarios/index.md
+++ b/qa/scenarios/index.md
@@ -76,7 +76,7 @@ agent:
    Style:
    - read source and docs first
    - test systematically
-    - record evidence
+    - record what happened
    - end with a concise protocol report
 kickoffTask: |-
  QA mission:
@@ -84,7 +84,7 @@ kickoffTask: |-
  The repo is available in your workspace at `./repo/`.
  Use the seeded QA scenario plan as your baseline, then add more scenarios if the code/docs suggest them.
  Run the scenarios through the real qa-channel surfaces where possible.
-  Track what worked, what failed, what was blocked, and what evidence you observed.
+  Track what worked, what failed, what was blocked, and what you observed.
  End with a concise report grouped into worked / failed / blocked / follow-up.

  Important expectations:
--- a/qa/scenarios/models/model-switch-tool-continuity.md
+++ b/qa/scenarios/models/model-switch-tool-continuity.md
@@ -13,7 +13,7 @@ objective: Verify switching models preserves session context and tool use instea
 successCriteria:
  - Alternate model is actually requested.
  - A tool call still happens after the model switch.
-  - Final answer acknowledges the handoff and uses the tool-derived evidence.
+  - Final answer acknowledges the handoff and confirms the QA mission was reread.
 docsRefs:
  - docs/help/testing.md
  - docs/concepts/model-failover.md
@@ -68,10 +68,10 @@ steps:
        saveAs: outbound
        args:
          - lambda:
-              expr: "state.getSnapshot().messages.slice(beforeSwitchCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && hasModelSwitchContinuityEvidence(candidate.text)).at(-1)"
+              expr: "state.getSnapshot().messages.slice(beforeSwitchCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && hasModelSwitchContinuitySignal(candidate.text)).at(-1)"
          - expr: resolveQaLiveTurnTimeoutMs(env, 20000, env.alternateModel)
      - assert:
-          expr: hasModelSwitchContinuityEvidence(outbound.text)
+          expr: hasModelSwitchContinuitySignal(outbound.text)
          message:
            expr: "`switch reply missed kickoff continuity: ${outbound.text}`"
      - if:
--- a/qa/scenarios/runtime/approval-turn-tool-followthrough.md
+++ b/qa/scenarios/runtime/approval-turn-tool-followthrough.md
@@ -13,7 +13,7 @@ objective: Verify a short approval like "ok do it" triggers immediate tool use i
 successCriteria:
  - Agent can keep the pre-action turn brief.
  - The short approval leads to a real tool call on the next turn.
-  - Final answer uses tool-derived evidence instead of placeholder progress text.
+  - Final answer cites the actual file read instead of placeholder progress text.
 docsRefs:
  - docs/help/testing.md
  - docs/channels/qa-channel.md