test(qa-lab): remove generic evidence wording

This commit is contained in:
Vincent Koc
2026-05-22 09:48:57 +08:00
parent a9176e9190
commit 9bd97d2c60
11 changed files with 24 additions and 23 deletions

View File

@@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai
- Dependencies: refresh provider, plugin, UI, and tooling packages, update `protobufjs` to 8.4.0 to clear the current npm advisory, and carry the Claude ACP completion patch forward to `@agentclientprotocol/claude-agent-acp` 0.36.1.
- Agents/tools: remove the old sender-owner tool gating path so configured tools stay visible for trusted sessions while command and channel-action auth still carry real sender identity.
- QA-Lab: add curated mock JSONL replay fixtures and first-drift reporting for runtime-parity audits. (#80323, refs #80176) Thanks @100yenadmin.
- QA-Lab: replace generic evidence framing in seeded scenario prompts with concrete observed QA behavior.
- QA-Lab: include the optional 100-turn runtime parity soak in release-soak artifacts so long-run Codex/Pi transcript drift stays visible outside the default gate. (#80395) Thanks @100yenadmin.
- QA-Lab: add a live-only long-context progress watchdog scenario for Codex app-server timeout and stalled-run sentinels. (#80323) Thanks @100yenadmin.
- QA-Lab: tag gateway restart recovery and streaming final-integrity scenarios as live-only runtime parity lanes. (#80323) Thanks @100yenadmin.

View File

@@ -1,10 +1,10 @@
import { describe, expect, it } from "vitest";
import { hasModelSwitchContinuityEvidence } from "./model-switch-eval.js";
import { hasModelSwitchContinuitySignal } from "./model-switch-eval.js";
describe("qa model-switch evaluation", () => {
it("accepts direct handoff replies that mention the kickoff task", () => {
expect(
hasModelSwitchContinuityEvidence(
hasModelSwitchContinuitySignal(
"Handoff confirmed: I reread QA_KICKOFF_TASK.md and switched to gpt.",
),
).toBe(true);
@@ -12,7 +12,7 @@ describe("qa model-switch evaluation", () => {
it("accepts short mission-oriented switch confirmations", () => {
expect(
hasModelSwitchContinuityEvidence(
hasModelSwitchContinuitySignal(
"model switch complete. reread the kickoff task; qa mission stays the same.",
),
).toBe(true);
@@ -20,7 +20,7 @@ describe("qa model-switch evaluation", () => {
it("accepts concise kickoff note confirmations", () => {
expect(
hasModelSwitchContinuityEvidence(
hasModelSwitchContinuitySignal(
"Handoff clean: after the model switch, I reread the kickoff note.",
),
).toBe(true);
@@ -28,7 +28,7 @@ describe("qa model-switch evaluation", () => {
it("accepts concise paraphrases of the kickoff task after a handoff", () => {
expect(
hasModelSwitchContinuityEvidence(
hasModelSwitchContinuitySignal(
"Handoff is clear: after the model switch, read source and docs first, run seeded qa-channel scenarios, and report worked, failed, blocked, and follow-up.",
),
).toBe(true);
@@ -36,7 +36,7 @@ describe("qa model-switch evaluation", () => {
it("rejects unrelated handoff chatter that never confirms the kickoff reread", () => {
expect(
hasModelSwitchContinuityEvidence(
hasModelSwitchContinuitySignal(
"subagent-handoff confirmed. qa report update: scenario pass. qa run complete.",
),
).toBe(false);
@@ -44,7 +44,7 @@ describe("qa model-switch evaluation", () => {
it("rejects over-scoped multi-line wrap-ups even if they mention a switch and the mission", () => {
expect(
hasModelSwitchContinuityEvidence(
hasModelSwitchContinuitySignal(
`model switch acknowledged. qa mission stays the same.
Final QA tally update: all mandatory scenarios resolved. QA run complete.`,

View File

@@ -1,6 +1,6 @@
import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/string-coerce-runtime";
export function hasModelSwitchContinuityEvidence(text: string) {
export function hasModelSwitchContinuitySignal(text: string) {
const lower = normalizeLowercaseStringOrEmpty(text);
const mentionsHandoff =
lower.includes("handoff") || lower.includes("model switch") || lower.includes("switched");

View File

@@ -17,7 +17,7 @@ Persona:
Style:
- read source and docs first
- test systematically
- record evidence
- record what happened
- end with a concise protocol report`;
const qaScenarioConfigSchema = z.record(z.string(), z.unknown()).superRefine((config, ctx) => {

View File

@@ -83,7 +83,7 @@ function createDeps(overrides?: Partial<QaScenarioRuntimeDeps>): QaScenarioRunti
hasDiscoveryLabels: fn,
reportsDiscoveryScopeLeak: fn,
reportsMissingDiscoveryFiles: fn,
hasModelSwitchContinuityEvidence: fn,
hasModelSwitchContinuitySignal: fn,
...overrides,
};
}

View File

@@ -95,7 +95,7 @@ export type QaScenarioRuntimeDeps = {
hasDiscoveryLabels: QaScenarioRuntimeFunction;
reportsDiscoveryScopeLeak: QaScenarioRuntimeFunction;
reportsMissingDiscoveryFiles: QaScenarioRuntimeFunction;
hasModelSwitchContinuityEvidence: QaScenarioRuntimeFunction;
hasModelSwitchContinuitySignal: QaScenarioRuntimeFunction;
};
export type QaScenarioRuntimeConstants = {
@@ -186,7 +186,7 @@ type QaScenarioRuntimeApi<
hasDiscoveryLabels: TDeps["hasDiscoveryLabels"];
reportsDiscoveryScopeLeak: TDeps["reportsDiscoveryScopeLeak"];
reportsMissingDiscoveryFiles: TDeps["reportsMissingDiscoveryFiles"];
hasModelSwitchContinuityEvidence: TDeps["hasModelSwitchContinuityEvidence"];
hasModelSwitchContinuitySignal: TDeps["hasModelSwitchContinuitySignal"];
imageUnderstandingPngBase64: string;
imageUnderstandingLargePngBase64: string;
imageUnderstandingValidPngBase64: string;
@@ -292,7 +292,7 @@ export function createQaScenarioRuntimeApi<
hasDiscoveryLabels: params.deps.hasDiscoveryLabels,
reportsDiscoveryScopeLeak: params.deps.reportsDiscoveryScopeLeak,
reportsMissingDiscoveryFiles: params.deps.reportsMissingDiscoveryFiles,
hasModelSwitchContinuityEvidence: params.deps.hasModelSwitchContinuityEvidence,
hasModelSwitchContinuitySignal: params.deps.hasModelSwitchContinuitySignal,
imageUnderstandingPngBase64: params.constants.imageUnderstandingPngBase64,
imageUnderstandingLargePngBase64: params.constants.imageUnderstandingLargePngBase64,
imageUnderstandingValidPngBase64: params.constants.imageUnderstandingValidPngBase64,

View File

@@ -54,7 +54,7 @@ const webEvaluate = vi.hoisted(() => vi.fn());
const hasDiscoveryLabels = vi.hoisted(() => vi.fn());
const reportsDiscoveryScopeLeak = vi.hoisted(() => vi.fn());
const reportsMissingDiscoveryFiles = vi.hoisted(() => vi.fn());
const hasModelSwitchContinuityEvidence = vi.hoisted(() => vi.fn());
const hasModelSwitchContinuitySignal = vi.hoisted(() => vi.fn());
const qaChannelPlugin = vi.hoisted(() => ({ id: "qa-channel" }));
const scanGatewayLogSentinels = vi.hoisted(() => vi.fn());
const assertNoGatewayLogSentinels = vi.hoisted(() => vi.fn());
@@ -144,7 +144,7 @@ vi.mock("./runtime-tool-fixture.js", () => ({
}));
vi.mock("./model-switch-eval.js", () => ({
hasModelSwitchContinuityEvidence,
hasModelSwitchContinuitySignal,
}));
vi.mock("./runtime-api.js", () => ({

View File

@@ -21,7 +21,7 @@ import {
} from "./discovery-eval.js";
import { extractQaToolPayload } from "./extract-tool-payload.js";
import { assertNoGatewayLogSentinels, scanGatewayLogSentinels } from "./gateway-log-sentinel.js";
import { hasModelSwitchContinuityEvidence } from "./model-switch-eval.js";
import { hasModelSwitchContinuitySignal } from "./model-switch-eval.js";
import { qaChannelPlugin } from "./runtime-api.js";
import { runRuntimeToolFixture } from "./runtime-tool-fixture.js";
import type { QaSeedScenarioWithSource } from "./scenario-catalog.js";
@@ -213,7 +213,7 @@ function createQaSuiteScenarioDeps(params: QaSuiteScenarioDepsParams) {
hasDiscoveryLabels,
reportsDiscoveryScopeLeak,
reportsMissingDiscoveryFiles,
hasModelSwitchContinuityEvidence,
hasModelSwitchContinuitySignal,
};
}

View File

@@ -76,7 +76,7 @@ agent:
Style:
- read source and docs first
- test systematically
- record evidence
- record what happened
- end with a concise protocol report
kickoffTask: |-
QA mission:
@@ -84,7 +84,7 @@ kickoffTask: |-
The repo is available in your workspace at `./repo/`.
Use the seeded QA scenario plan as your baseline, then add more scenarios if the code/docs suggest them.
Run the scenarios through the real qa-channel surfaces where possible.
Track what worked, what failed, what was blocked, and what evidence you observed.
Track what worked, what failed, what was blocked, and what you observed.
End with a concise report grouped into worked / failed / blocked / follow-up.
Important expectations:

View File

@@ -13,7 +13,7 @@ objective: Verify switching models preserves session context and tool use instea
successCriteria:
- Alternate model is actually requested.
- A tool call still happens after the model switch.
- Final answer acknowledges the handoff and uses the tool-derived evidence.
- Final answer acknowledges the handoff and confirms the QA mission was reread.
docsRefs:
- docs/help/testing.md
- docs/concepts/model-failover.md
@@ -68,10 +68,10 @@ steps:
saveAs: outbound
args:
- lambda:
expr: "state.getSnapshot().messages.slice(beforeSwitchCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && hasModelSwitchContinuityEvidence(candidate.text)).at(-1)"
expr: "state.getSnapshot().messages.slice(beforeSwitchCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && hasModelSwitchContinuitySignal(candidate.text)).at(-1)"
- expr: resolveQaLiveTurnTimeoutMs(env, 20000, env.alternateModel)
- assert:
expr: hasModelSwitchContinuityEvidence(outbound.text)
expr: hasModelSwitchContinuitySignal(outbound.text)
message:
expr: "`switch reply missed kickoff continuity: ${outbound.text}`"
- if:

View File

@@ -13,7 +13,7 @@ objective: Verify a short approval like "ok do it" triggers immediate tool use i
successCriteria:
- Agent can keep the pre-action turn brief.
- The short approval leads to a real tool call on the next turn.
- Final answer uses tool-derived evidence instead of placeholder progress text.
- Final answer cites the actual file read instead of placeholder progress text.
docsRefs:
- docs/help/testing.md
- docs/channels/qa-channel.md