fix(qa): tighten frontier scope evals

2026-04-12 09:41:11 +00:00 · 2026-04-07 16:25:24 +01:00
parent 4e69a9b329
commit b5d2bd6f41
5 changed files with 69 additions and 3 deletions
--- a/extensions/qa-lab/src/discovery-eval.test.ts
+++ b/extensions/qa-lab/src/discovery-eval.test.ts
@@ -1,5 +1,9 @@
 import { describe, expect, it } from "vitest";
-import { hasDiscoveryLabels, reportsMissingDiscoveryFiles } from "./discovery-eval.js";
+import {
+  hasDiscoveryLabels,
+  reportsDiscoveryScopeLeak,
+  reportsMissingDiscoveryFiles,
+} from "./discovery-eval.js";

 describe("qa discovery evaluation", () => {
  it("accepts rich discovery reports that explicitly confirm all required files were read", () => {
@@ -18,6 +22,7 @@ The helper text mentions banned phrases like "not present", "missing files", "bl

    expect(hasDiscoveryLabels(report)).toBe(true);
    expect(reportsMissingDiscoveryFiles(report)).toBe(false);
+    expect(reportsDiscoveryScopeLeak(report)).toBe(false);
  });

  it("accepts numeric 'all 4 required files read' confirmations", () => {
@@ -37,6 +42,7 @@ The report may quote phrases like "not present" while describing the evaluator,

    expect(hasDiscoveryLabels(report)).toBe(true);
    expect(reportsMissingDiscoveryFiles(report)).toBe(false);
+    expect(reportsDiscoveryScopeLeak(report)).toBe(false);
  });

  it("accepts claude-style 'all four files retrieved' discovery summaries", () => {
@@ -54,6 +60,7 @@ Follow-up

    expect(hasDiscoveryLabels(report)).toBe(true);
    expect(reportsMissingDiscoveryFiles(report)).toBe(false);
+    expect(reportsDiscoveryScopeLeak(report)).toBe(false);
  });

  it("still flags genuine file-miss language when the report never confirms the required reads", () => {
@@ -70,5 +77,25 @@ Follow-up

    expect(hasDiscoveryLabels(report)).toBe(true);
    expect(reportsMissingDiscoveryFiles(report)).toBe(true);
+    expect(reportsDiscoveryScopeLeak(report)).toBe(false);
+  });
+
+  it("flags discovery replies that drift into unrelated suite wrap-up claims", () => { // labels and file reads are valid; only the trailing wrap-up line is out of scope
+    const report = `
+Worked
+- All four requested files were read: repo/qa/seed-scenarios.json, repo/qa/QA_KICKOFF_TASK.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md.
+Failed
+- None.
+Blocked
+- Runtime execution not attempted here.
+Follow-up
+- Run the live suite next.
+
+Final QA tally update: all mandatory scenarios resolved. QA run complete.
+`.trim();
+
+    expect(hasDiscoveryLabels(report)).toBe(true);
+    expect(reportsMissingDiscoveryFiles(report)).toBe(false);
+    expect(reportsDiscoveryScopeLeak(report)).toBe(true); // last report line contains several of the listed leak phrases
   });
 });
--- a/extensions/qa-lab/src/discovery-eval.ts
+++ b/extensions/qa-lab/src/discovery-eval.ts
@@ -5,6 +5,15 @@ const REQUIRED_DISCOVERY_REFS = [
  "repo/docs/help/testing.md",
 ] as const;

+const DISCOVERY_SCOPE_LEAK_PHRASES = [ // all-lowercase substrings; compared against the lowercased report text
+  "all mandatory scenarios",
+  "final qa tally",
+  "final qa tally update", // NOTE(review): redundant, any text containing this already contains "final qa tally"
+  "qa run complete",
+  "scenario: `subagent-handoff`", // backtick-quoted spelling
+  "scenario: subagent-handoff", // plain spelling
+] as const;
+
 function confirmsDiscoveryFileRead(text: string) {
  const lower = text.toLowerCase();
  const mentionsAllRefs = REQUIRED_DISCOVERY_REFS.every((ref) => lower.includes(ref.toLowerCase()));
@@ -41,3 +50,8 @@ export function reportsMissingDiscoveryFiles(text: string) {
    lower.includes("could not inspect")
  );
 }
+
+export function reportsDiscoveryScopeLeak(text: string) {
+  const haystack = text.toLowerCase(); // the phrase list is lowercase, so normalize the report once
+  return DISCOVERY_SCOPE_LEAK_PHRASES.some((needle) => haystack.includes(needle)); // true if any phrase occurs
+}
--- a/extensions/qa-lab/src/model-switch-eval.test.ts
+++ b/extensions/qa-lab/src/model-switch-eval.test.ts
@@ -25,4 +25,14 @@ describe("qa model-switch evaluation", () => {
      ),
    ).toBe(false);
  });
+
+  it("rejects over-scoped multi-line wrap-ups even if they mention a switch and the mission", () => {
+    expect(
+      hasModelSwitchContinuityEvidence(
+        `model switch acknowledged. qa mission stays the same.
+
+Final QA tally update: all mandatory scenarios resolved. QA run complete.`,
+      ),
+    ).toBe(false); // the blank line alone trips the "\n\n" overlong check; the wrap-up phrases presumably trip the leak check too (confirm `lower` is lowercased text)
+  });
 });
--- a/extensions/qa-lab/src/model-switch-eval.ts
+++ b/extensions/qa-lab/src/model-switch-eval.ts
@@ -6,5 +6,13 @@ export function hasModelSwitchContinuityEvidence(text: string) {
    lower.includes("qa_kickoff_task") ||
    lower.includes("kickoff task") ||
    lower.includes("qa mission");
-  return mentionsHandoff && mentionsKickoffTask;
+  const hasScopeLeak =
+    lower.includes("subagent-handoff") ||
+    lower.includes("delegated task") ||
+    lower.includes("final qa tally") ||
+    lower.includes("qa run complete") ||
+    lower.includes("all mandatory scenarios");
+  const looksOverlong =
+    text.length > 280 || text.includes("\n\n") || text.includes("|---") || text.includes("### ");
+  return mentionsHandoff && mentionsKickoffTask && !hasScopeLeak && !looksOverlong;
 }
--- a/extensions/qa-lab/src/suite.ts
+++ b/extensions/qa-lab/src/suite.ts
@@ -14,7 +14,11 @@ import {
 import { buildAgentSessionKey } from "openclaw/plugin-sdk/routing";
 import type { QaBusState } from "./bus-state.js";
 import { waitForCronRunCompletion } from "./cron-run-wait.js";
-import { hasDiscoveryLabels, reportsMissingDiscoveryFiles } from "./discovery-eval.js";
+import {
+  hasDiscoveryLabels,
+  reportsDiscoveryScopeLeak,
+  reportsMissingDiscoveryFiles,
+} from "./discovery-eval.js";
 import { extractQaToolPayload } from "./extract-tool-payload.js";
 import { startQaGatewayChild } from "./gateway-child.js";
 import { startQaLabServer } from "./lab-server.js";
@@ -1068,6 +1072,9 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
              if (reportsMissingDiscoveryFiles(outbound.text)) {
                throw new Error(`discovery report still missed repo files: ${outbound.text}`);
              }
+              if (reportsDiscoveryScopeLeak(outbound.text)) {
+                throw new Error(`discovery report drifted beyond scope: ${outbound.text}`);
+              }
              return outbound.text;
            },
          },