From b5d2bd6f41f76bee851cd7ee1afc0cac95976eb8 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 7 Apr 2026 16:25:24 +0100 Subject: [PATCH] fix(qa): tighten frontier scope evals --- extensions/qa-lab/src/discovery-eval.test.ts | 29 ++++++++++++++++++- extensions/qa-lab/src/discovery-eval.ts | 14 +++++++++ .../qa-lab/src/model-switch-eval.test.ts | 10 +++++++ extensions/qa-lab/src/model-switch-eval.ts | 10 ++++++- extensions/qa-lab/src/suite.ts | 9 +++++- 5 files changed, 69 insertions(+), 3 deletions(-) diff --git a/extensions/qa-lab/src/discovery-eval.test.ts b/extensions/qa-lab/src/discovery-eval.test.ts index 8478319fe75..981357e8358 100644 --- a/extensions/qa-lab/src/discovery-eval.test.ts +++ b/extensions/qa-lab/src/discovery-eval.test.ts @@ -1,5 +1,9 @@ import { describe, expect, it } from "vitest"; -import { hasDiscoveryLabels, reportsMissingDiscoveryFiles } from "./discovery-eval.js"; +import { + hasDiscoveryLabels, + reportsDiscoveryScopeLeak, + reportsMissingDiscoveryFiles, +} from "./discovery-eval.js"; describe("qa discovery evaluation", () => { it("accepts rich discovery reports that explicitly confirm all required files were read", () => { @@ -18,6 +22,7 @@ The helper text mentions banned phrases like "not present", "missing files", "bl expect(hasDiscoveryLabels(report)).toBe(true); expect(reportsMissingDiscoveryFiles(report)).toBe(false); + expect(reportsDiscoveryScopeLeak(report)).toBe(false); }); it("accepts numeric 'all 4 required files read' confirmations", () => { @@ -37,6 +42,7 @@ The report may quote phrases like "not present" while describing the evaluator, expect(hasDiscoveryLabels(report)).toBe(true); expect(reportsMissingDiscoveryFiles(report)).toBe(false); + expect(reportsDiscoveryScopeLeak(report)).toBe(false); }); it("accepts claude-style 'all four files retrieved' discovery summaries", () => { @@ -54,6 +60,7 @@ Follow-up expect(hasDiscoveryLabels(report)).toBe(true); expect(reportsMissingDiscoveryFiles(report)).toBe(false); + expect(reportsDiscoveryScopeLeak(report)).toBe(false); }); it("still flags genuine file-miss language when the report never confirms the required reads", () => { @@ -70,5 +77,25 @@ Follow-up expect(hasDiscoveryLabels(report)).toBe(true); expect(reportsMissingDiscoveryFiles(report)).toBe(true); + expect(reportsDiscoveryScopeLeak(report)).toBe(false); + }); + + it("flags discovery replies that drift into unrelated suite wrap-up claims", () => { + const report = ` +Worked +- All four requested files were read: repo/qa/seed-scenarios.json, repo/qa/QA_KICKOFF_TASK.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md. +Failed +- None. +Blocked +- Runtime execution not attempted here. +Follow-up +- Run the live suite next. + +Final QA tally update: all mandatory scenarios resolved. QA run complete. +`.trim(); + + expect(hasDiscoveryLabels(report)).toBe(true); + expect(reportsMissingDiscoveryFiles(report)).toBe(false); + expect(reportsDiscoveryScopeLeak(report)).toBe(true); }); }); diff --git a/extensions/qa-lab/src/discovery-eval.ts b/extensions/qa-lab/src/discovery-eval.ts index da0e9bb824a..2145a9a38c9 100644 --- a/extensions/qa-lab/src/discovery-eval.ts +++ b/extensions/qa-lab/src/discovery-eval.ts @@ -5,6 +5,15 @@ const REQUIRED_DISCOVERY_REFS = [ "repo/docs/help/testing.md", ] as const; +const DISCOVERY_SCOPE_LEAK_PHRASES = [ + "all mandatory scenarios", + "final qa tally", + "final qa tally update", + "qa run complete", + "scenario: `subagent-handoff`", + "scenario: subagent-handoff", +] as const; + function confirmsDiscoveryFileRead(text: string) { const lower = text.toLowerCase(); const mentionsAllRefs = REQUIRED_DISCOVERY_REFS.every((ref) => lower.includes(ref.toLowerCase())); @@ -41,3 +50,8 @@ export function reportsMissingDiscoveryFiles(text: string) { lower.includes("could not inspect") ); } + +export function reportsDiscoveryScopeLeak(text: string) { + const lower = text.toLowerCase(); + return DISCOVERY_SCOPE_LEAK_PHRASES.some((phrase) => lower.includes(phrase)); +} diff --git a/extensions/qa-lab/src/model-switch-eval.test.ts b/extensions/qa-lab/src/model-switch-eval.test.ts index cb146c406e2..d4c36b25b31 100644 --- a/extensions/qa-lab/src/model-switch-eval.test.ts +++ b/extensions/qa-lab/src/model-switch-eval.test.ts @@ -25,4 +25,14 @@ describe("qa model-switch evaluation", () => { ), ).toBe(false); }); + + it("rejects over-scoped multi-line wrap-ups even if they mention a switch and the mission", () => { + expect( + hasModelSwitchContinuityEvidence( + `model switch acknowledged. qa mission stays the same. + +Final QA tally update: all mandatory scenarios resolved. QA run complete.`, + ), + ).toBe(false); + }); }); diff --git a/extensions/qa-lab/src/model-switch-eval.ts b/extensions/qa-lab/src/model-switch-eval.ts index 9ec300485d8..b716c1eff6b 100644 --- a/extensions/qa-lab/src/model-switch-eval.ts +++ b/extensions/qa-lab/src/model-switch-eval.ts @@ -6,5 +6,13 @@ export function hasModelSwitchContinuityEvidence(text: string) { lower.includes("qa_kickoff_task") || lower.includes("kickoff task") || lower.includes("qa mission"); - return mentionsHandoff && mentionsKickoffTask; + const hasScopeLeak = + lower.includes("subagent-handoff") || + lower.includes("delegated task") || + lower.includes("final qa tally") || + lower.includes("qa run complete") || + lower.includes("all mandatory scenarios"); + const looksOverlong = + text.length > 280 || text.includes("\n\n") || text.includes("|---") || text.includes("### "); + return mentionsHandoff && mentionsKickoffTask && !hasScopeLeak && !looksOverlong; } diff --git a/extensions/qa-lab/src/suite.ts b/extensions/qa-lab/src/suite.ts index c9b154ee8ce..99e234a0ada 100644 --- a/extensions/qa-lab/src/suite.ts +++ b/extensions/qa-lab/src/suite.ts @@ -14,7 +14,11 @@ import { import { buildAgentSessionKey } from "openclaw/plugin-sdk/routing"; import type { QaBusState } from "./bus-state.js"; import { waitForCronRunCompletion } from "./cron-run-wait.js"; -import { hasDiscoveryLabels, reportsMissingDiscoveryFiles } from "./discovery-eval.js"; +import { + hasDiscoveryLabels, + reportsDiscoveryScopeLeak, + reportsMissingDiscoveryFiles, +} from "./discovery-eval.js"; import { extractQaToolPayload } from "./extract-tool-payload.js"; import { startQaGatewayChild } from "./gateway-child.js"; import { startQaLabServer } from "./lab-server.js"; @@ -1068,6 +1072,9 @@ function buildScenarioMap(env: QaSuiteEnvironment) { if (reportsMissingDiscoveryFiles(outbound.text)) { throw new Error(`discovery report still missed repo files: ${outbound.text}`); } + if (reportsDiscoveryScopeLeak(outbound.text)) { + throw new Error(`discovery report drifted beyond scope: ${outbound.text}`); + } return outbound.text; }, },