test: harden qa eval scenarios

This commit is contained in:
Peter Steinberger
2026-04-10 10:11:04 +01:00
parent 005b629b6d
commit 68b4b36a90
6 changed files with 42 additions and 10 deletions

View File

@@ -18,6 +18,22 @@ describe("qa model-switch evaluation", () => {
).toBe(true);
});
// Positive case: a one-sentence reply that mentions the model switch and says
// the "kickoff note" was reread must be accepted as continuity evidence.
it("accepts concise kickoff note confirmations", () => {
expect(
hasModelSwitchContinuityEvidence(
"Handoff clean: after the model switch, I reread the kickoff note.",
),
).toBe(true);
});
// Positive case: the reply never uses the word "kickoff"; it only restates the
// task steps (source and docs, qa-channel scenarios, worked/failed/blocked/
// follow-up report). Such a paraphrase must still be accepted as continuity
// evidence.
it("accepts concise paraphrases of the kickoff task after a handoff", () => {
expect(
hasModelSwitchContinuityEvidence(
"Handoff is clear: after the model switch, read source and docs first, run seeded qa-channel scenarios, and report worked, failed, blocked, and follow-up.",
),
).toBe(true);
});
it("rejects unrelated handoff chatter that never confirms the kickoff reread", () => {
expect(
hasModelSwitchContinuityEvidence(

View File

@@ -7,7 +7,13 @@ export function hasModelSwitchContinuityEvidence(text: string) {
const mentionsKickoffTask =
lower.includes("qa_kickoff_task") ||
lower.includes("kickoff task") ||
lower.includes("qa mission");
lower.includes("kickoff note") ||
lower.includes("qa mission") ||
(lower.includes("source and docs") &&
lower.includes("qa-channel scenarios") &&
lower.includes("worked") &&
lower.includes("blocked") &&
lower.includes("follow-up"));
const hasScopeLeak =
lower.includes("subagent-handoff") ||
lower.includes("delegated task") ||

View File

@@ -38,6 +38,9 @@ describe("qa scenario catalog", () => {
const discovery = readQaScenarioById("source-docs-discovery-report");
const discoveryConfig = readQaScenarioExecutionConfig("source-docs-discovery-report");
const fallbackConfig = readQaScenarioExecutionConfig("memory-failure-fallback");
const fanoutConfig = readQaScenarioExecutionConfig("subagent-fanout-synthesis") as
| { expectedReplyGroups?: unknown[][] }
| undefined;
expect(discovery.title).toBe("Source and docs discovery report");
expect((discoveryConfig?.requiredFiles as string[] | undefined)?.[0]).toBe(
@@ -46,6 +49,8 @@ describe("qa scenario catalog", () => {
expect(fallbackConfig?.gracefulFallbackAny as string[] | undefined).toContain(
"will not reveal",
);
expect(fanoutConfig?.expectedReplyGroups?.flat()).toContain("subagent-1: ok");
expect(fanoutConfig?.expectedReplyGroups?.flat()).toContain("subagent-2: ok");
});
it("keeps the character eval scenario natural and task-shaped", () => {