fix(qa): tighten frontier scope evals

This commit is contained in:
Peter Steinberger
2026-04-07 16:25:24 +01:00
parent 4e69a9b329
commit b5d2bd6f41
5 changed files with 69 additions and 3 deletions

View File

@@ -1,5 +1,9 @@
import { describe, expect, it } from "vitest";
import { hasDiscoveryLabels, reportsMissingDiscoveryFiles } from "./discovery-eval.js";
import {
hasDiscoveryLabels,
reportsDiscoveryScopeLeak,
reportsMissingDiscoveryFiles,
} from "./discovery-eval.js";
describe("qa discovery evaluation", () => {
it("accepts rich discovery reports that explicitly confirm all required files were read", () => {
@@ -18,6 +22,7 @@ The helper text mentions banned phrases like "not present", "missing files", "bl
expect(hasDiscoveryLabels(report)).toBe(true);
expect(reportsMissingDiscoveryFiles(report)).toBe(false);
expect(reportsDiscoveryScopeLeak(report)).toBe(false);
});
it("accepts numeric 'all 4 required files read' confirmations", () => {
@@ -37,6 +42,7 @@ The report may quote phrases like "not present" while describing the evaluator,
expect(hasDiscoveryLabels(report)).toBe(true);
expect(reportsMissingDiscoveryFiles(report)).toBe(false);
expect(reportsDiscoveryScopeLeak(report)).toBe(false);
});
it("accepts claude-style 'all four files retrieved' discovery summaries", () => {
@@ -54,6 +60,7 @@ Follow-up
expect(hasDiscoveryLabels(report)).toBe(true);
expect(reportsMissingDiscoveryFiles(report)).toBe(false);
expect(reportsDiscoveryScopeLeak(report)).toBe(false);
});
it("still flags genuine file-miss language when the report never confirms the required reads", () => {
@@ -70,5 +77,25 @@ Follow-up
expect(hasDiscoveryLabels(report)).toBe(true);
expect(reportsMissingDiscoveryFiles(report)).toBe(true);
expect(reportsDiscoveryScopeLeak(report)).toBe(false);
});
// Regression case for scope drift: the report below is expected to pass the label and
// missing-file checks, yet its last line adds suite wrap-up claims that go beyond discovery.
it("flags discovery replies that drift into unrelated suite wrap-up claims", () => {
const report = `
Worked
- All four requested files were read: repo/qa/seed-scenarios.json, repo/qa/QA_KICKOFF_TASK.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md.
Failed
- None.
Blocked
- Runtime execution not attempted here.
Follow-up
- Run the live suite next.
Final QA tally update: all mandatory scenarios resolved. QA run complete.
`.trim();
// The other two evaluators must not be affected by the trailing wrap-up line.
expect(hasDiscoveryLabels(report)).toBe(true);
expect(reportsMissingDiscoveryFiles(report)).toBe(false);
// The trailing line contains "final qa tally", "all mandatory scenarios" and "qa run complete",
// each of which is a scope-leak phrase once the text is lowercased.
expect(reportsDiscoveryScopeLeak(report)).toBe(true);
});
});

View File

@@ -5,6 +5,15 @@ const REQUIRED_DISCOVERY_REFS = [
"repo/docs/help/testing.md",
] as const;
/**
 * Phrases that mark a discovery reply as having drifted beyond the discovery task
 * (suite wrap-up claims or references to the subagent-handoff scenario).
 *
 * Entries are matched as substrings of the lowercased report text, so every entry must be
 * lowercase, and no entry should contain another entry: the longer one could never change
 * the outcome.
 */
const DISCOVERY_SCOPE_LEAK_PHRASES = [
  "all mandatory scenarios",
  // Also matches the longer "final qa tally update" wording.
  "final qa tally",
  "qa run complete",
  "scenario: `subagent-handoff`",
  "scenario: subagent-handoff",
] as const;
function confirmsDiscoveryFileRead(text: string) {
const lower = text.toLowerCase();
const mentionsAllRefs = REQUIRED_DISCOVERY_REFS.every((ref) => lower.includes(ref.toLowerCase()));
@@ -41,3 +50,8 @@ export function reportsMissingDiscoveryFiles(text: string) {
lower.includes("could not inspect")
);
}
/**
 * Reports whether a discovery reply contains any scope-leak phrase.
 *
 * The text is lowercased before a plain substring comparison against each entry of
 * `DISCOVERY_SCOPE_LEAK_PHRASES`.
 *
 * @param text - The discovery report text to inspect.
 * @returns `true` as soon as one phrase is found in the text, otherwise `false`.
 */
export function reportsDiscoveryScopeLeak(text: string) {
  const normalized = text.toLowerCase();
  for (const leakPhrase of DISCOVERY_SCOPE_LEAK_PHRASES) {
    if (normalized.includes(leakPhrase)) {
      return true;
    }
  }
  return false;
}

View File

@@ -25,4 +25,14 @@ describe("qa model-switch evaluation", () => {
),
).toBe(false);
});
// Per the test title, the first line alone mentions the switch and the QA mission; the second
// line adds suite wrap-up wording ("final qa tally", "all mandatory scenarios", "qa run complete"),
// so the evaluator is expected to reject the reply as a whole.
it("rejects over-scoped multi-line wrap-ups even if they mention a switch and the mission", () => {
expect(
hasModelSwitchContinuityEvidence(
`model switch acknowledged. qa mission stays the same.
Final QA tally update: all mandatory scenarios resolved. QA run complete.`,
),
).toBe(false);
});
});

View File

@@ -6,5 +6,13 @@ export function hasModelSwitchContinuityEvidence(text: string) {
lower.includes("qa_kickoff_task") ||
lower.includes("kickoff task") ||
lower.includes("qa mission");
return mentionsHandoff && mentionsKickoffTask;
const hasScopeLeak =
lower.includes("subagent-handoff") ||
lower.includes("delegated task") ||
lower.includes("final qa tally") ||
lower.includes("qa run complete") ||
lower.includes("all mandatory scenarios");
const looksOverlong =
text.length > 280 || text.includes("\n\n") || text.includes("|---") || text.includes("### ");
return mentionsHandoff && mentionsKickoffTask && !hasScopeLeak && !looksOverlong;
}

View File

@@ -14,7 +14,11 @@ import {
import { buildAgentSessionKey } from "openclaw/plugin-sdk/routing";
import type { QaBusState } from "./bus-state.js";
import { waitForCronRunCompletion } from "./cron-run-wait.js";
import { hasDiscoveryLabels, reportsMissingDiscoveryFiles } from "./discovery-eval.js";
import {
hasDiscoveryLabels,
reportsDiscoveryScopeLeak,
reportsMissingDiscoveryFiles,
} from "./discovery-eval.js";
import { extractQaToolPayload } from "./extract-tool-payload.js";
import { startQaGatewayChild } from "./gateway-child.js";
import { startQaLabServer } from "./lab-server.js";
@@ -1068,6 +1072,9 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
if (reportsMissingDiscoveryFiles(outbound.text)) {
throw new Error(`discovery report still missed repo files: ${outbound.text}`);
}
if (reportsDiscoveryScopeLeak(outbound.text)) {
throw new Error(`discovery report drifted beyond scope: ${outbound.text}`);
}
return outbound.text;
},
},