mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-12 09:41:11 +00:00
fix(qa): tighten frontier scope evals
This commit is contained in:
@@ -1,5 +1,9 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { hasDiscoveryLabels, reportsMissingDiscoveryFiles } from "./discovery-eval.js";
|
||||
import {
|
||||
hasDiscoveryLabels,
|
||||
reportsDiscoveryScopeLeak,
|
||||
reportsMissingDiscoveryFiles,
|
||||
} from "./discovery-eval.js";
|
||||
|
||||
describe("qa discovery evaluation", () => {
|
||||
it("accepts rich discovery reports that explicitly confirm all required files were read", () => {
|
||||
@@ -18,6 +22,7 @@ The helper text mentions banned phrases like "not present", "missing files", "bl
|
||||
|
||||
expect(hasDiscoveryLabels(report)).toBe(true);
|
||||
expect(reportsMissingDiscoveryFiles(report)).toBe(false);
|
||||
expect(reportsDiscoveryScopeLeak(report)).toBe(false);
|
||||
});
|
||||
|
||||
it("accepts numeric 'all 4 required files read' confirmations", () => {
|
||||
@@ -37,6 +42,7 @@ The report may quote phrases like "not present" while describing the evaluator,
|
||||
|
||||
expect(hasDiscoveryLabels(report)).toBe(true);
|
||||
expect(reportsMissingDiscoveryFiles(report)).toBe(false);
|
||||
expect(reportsDiscoveryScopeLeak(report)).toBe(false);
|
||||
});
|
||||
|
||||
it("accepts claude-style 'all four files retrieved' discovery summaries", () => {
|
||||
@@ -54,6 +60,7 @@ Follow-up
|
||||
|
||||
expect(hasDiscoveryLabels(report)).toBe(true);
|
||||
expect(reportsMissingDiscoveryFiles(report)).toBe(false);
|
||||
expect(reportsDiscoveryScopeLeak(report)).toBe(false);
|
||||
});
|
||||
|
||||
it("still flags genuine file-miss language when the report never confirms the required reads", () => {
|
||||
@@ -70,5 +77,25 @@ Follow-up
|
||||
|
||||
expect(hasDiscoveryLabels(report)).toBe(true);
|
||||
expect(reportsMissingDiscoveryFiles(report)).toBe(true);
|
||||
expect(reportsDiscoveryScopeLeak(report)).toBe(false);
|
||||
});
|
||||
|
||||
it("flags discovery replies that drift into unrelated suite wrap-up claims", () => {
|
||||
const report = `
|
||||
Worked
|
||||
- All four requested files were read: repo/qa/seed-scenarios.json, repo/qa/QA_KICKOFF_TASK.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md.
|
||||
Failed
|
||||
- None.
|
||||
Blocked
|
||||
- Runtime execution not attempted here.
|
||||
Follow-up
|
||||
- Run the live suite next.
|
||||
|
||||
Final QA tally update: all mandatory scenarios resolved. QA run complete.
|
||||
`.trim();
|
||||
|
||||
expect(hasDiscoveryLabels(report)).toBe(true);
|
||||
expect(reportsMissingDiscoveryFiles(report)).toBe(false);
|
||||
expect(reportsDiscoveryScopeLeak(report)).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -5,6 +5,15 @@ const REQUIRED_DISCOVERY_REFS = [
|
||||
"repo/docs/help/testing.md",
|
||||
] as const;
|
||||
|
||||
/**
 * Phrases whose presence in a discovery reply signals that the reply drifted
 * into unrelated suite wrap-up or other-scenario claims.
 *
 * Entries are matched as substrings of the lowercased reply, so every entry
 * must itself be lowercase. Keep the list free of entries that contain another
 * entry: the shorter phrase already matches everything the longer one would
 * (e.g. "final qa tally" covers "final qa tally update").
 */
const DISCOVERY_SCOPE_LEAK_PHRASES = [
  "all mandatory scenarios",
  "final qa tally",
  "qa run complete",
  // Both the backtick-quoted and the bare spelling of the scenario label.
  "scenario: `subagent-handoff`",
  "scenario: subagent-handoff",
] as const;
|
||||
|
||||
function confirmsDiscoveryFileRead(text: string) {
|
||||
const lower = text.toLowerCase();
|
||||
const mentionsAllRefs = REQUIRED_DISCOVERY_REFS.every((ref) => lower.includes(ref.toLowerCase()));
|
||||
@@ -41,3 +50,8 @@ export function reportsMissingDiscoveryFiles(text: string) {
|
||||
lower.includes("could not inspect")
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Reports whether a discovery reply contains any of the scope-leak phrases
 * listed in `DISCOVERY_SCOPE_LEAK_PHRASES`.
 *
 * Matching is case-insensitive and whitespace-tolerant: the text is lowercased
 * and every run of whitespace (spaces, tabs, newlines) is collapsed to a single
 * space before the substring check, so a phrase that was wrapped across lines
 * or padded with extra spaces is still detected.
 *
 * @param text - Raw reply text to inspect.
 * @returns `true` when at least one scope-leak phrase occurs in `text`.
 */
export function reportsDiscoveryScopeLeak(text: string): boolean {
  // The phrases use single spaces; normalize the reply to the same shape.
  const normalized = text.toLowerCase().replace(/\s+/g, " ");
  return DISCOVERY_SCOPE_LEAK_PHRASES.some((phrase) => normalized.includes(phrase));
}
|
||||
|
||||
@@ -25,4 +25,14 @@ describe("qa model-switch evaluation", () => {
|
||||
),
|
||||
).toBe(false);
|
||||
});
|
||||
|
||||
it("rejects over-scoped multi-line wrap-ups even if they mention a switch and the mission", () => {
|
||||
expect(
|
||||
hasModelSwitchContinuityEvidence(
|
||||
`model switch acknowledged. qa mission stays the same.
|
||||
|
||||
Final QA tally update: all mandatory scenarios resolved. QA run complete.`,
|
||||
),
|
||||
).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -6,5 +6,13 @@ export function hasModelSwitchContinuityEvidence(text: string) {
|
||||
lower.includes("qa_kickoff_task") ||
|
||||
lower.includes("kickoff task") ||
|
||||
lower.includes("qa mission");
|
||||
return mentionsHandoff && mentionsKickoffTask;
|
||||
const hasScopeLeak =
|
||||
lower.includes("subagent-handoff") ||
|
||||
lower.includes("delegated task") ||
|
||||
lower.includes("final qa tally") ||
|
||||
lower.includes("qa run complete") ||
|
||||
lower.includes("all mandatory scenarios");
|
||||
const looksOverlong =
|
||||
text.length > 280 || text.includes("\n\n") || text.includes("|---") || text.includes("### ");
|
||||
return mentionsHandoff && mentionsKickoffTask && !hasScopeLeak && !looksOverlong;
|
||||
}
|
||||
|
||||
@@ -14,7 +14,11 @@ import {
|
||||
import { buildAgentSessionKey } from "openclaw/plugin-sdk/routing";
|
||||
import type { QaBusState } from "./bus-state.js";
|
||||
import { waitForCronRunCompletion } from "./cron-run-wait.js";
|
||||
import { hasDiscoveryLabels, reportsMissingDiscoveryFiles } from "./discovery-eval.js";
|
||||
import {
|
||||
hasDiscoveryLabels,
|
||||
reportsDiscoveryScopeLeak,
|
||||
reportsMissingDiscoveryFiles,
|
||||
} from "./discovery-eval.js";
|
||||
import { extractQaToolPayload } from "./extract-tool-payload.js";
|
||||
import { startQaGatewayChild } from "./gateway-child.js";
|
||||
import { startQaLabServer } from "./lab-server.js";
|
||||
@@ -1068,6 +1072,9 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
if (reportsMissingDiscoveryFiles(outbound.text)) {
|
||||
throw new Error(`discovery report still missed repo files: ${outbound.text}`);
|
||||
}
|
||||
if (reportsDiscoveryScopeLeak(outbound.text)) {
|
||||
throw new Error(`discovery report drifted beyond scope: ${outbound.text}`);
|
||||
}
|
||||
return outbound.text;
|
||||
},
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user