fix(qa): harden frontier claude bakeoffs

This commit is contained in:
Vincent Koc
2026-04-07 09:58:30 +01:00
committed by Peter Steinberger
parent 18fb171179
commit 4f421fa0f1
7 changed files with 264 additions and 40 deletions

View File

@@ -0,0 +1,74 @@
import { describe, expect, it } from "vitest";
import { hasDiscoveryLabels, reportsMissingDiscoveryFiles } from "./discovery-eval.js";
// Regression tests for the discovery-report heuristics imported from ./discovery-eval.js.
// Every case builds a report carrying the Worked / Failed / Blocked / Follow-up labels and
// checks both hasDiscoveryLabels and reportsMissingDiscoveryFiles against it.
describe("qa discovery evaluation", () => {
// The report names all four required paths and says "Read all four requested files", so the
// quoted "not present" / "missing files" phrases near the end must not be reported as a miss.
it("accepts rich discovery reports that explicitly confirm all required files were read", () => {
const report = `
Worked
- Read all four requested files: repo/qa/seed-scenarios.json, repo/qa/QA_KICKOFF_TASK.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md.
Failed
- None.
Blocked
- Runtime execution not attempted here.
Follow-up
- Run the live suite next.
The helper text mentions banned phrases like "not present", "missing files", "blocked by missing", and "could not inspect", but only as quoted examples.
`.trim();
expect(hasDiscoveryLabels(report)).toBe(true);
expect(reportsMissingDiscoveryFiles(report)).toBe(false);
});
// Same expectation when the count is written as the digit "4" and the verb follows "files".
it("accepts numeric 'all 4 required files read' confirmations", () => {
const report = `
Worked
- Source: repo/qa/seed-scenarios.json, repo/qa/QA_KICKOFF_TASK.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md
- all 4 required files read.
Failed
- None.
Blocked
- No runtime execution in this pass.
Follow-up
- Run the live suite next.
The report may quote phrases like "not present" while describing the evaluator, but the files were read.
`.trim();
expect(hasDiscoveryLabels(report)).toBe(true);
expect(reportsMissingDiscoveryFiles(report)).toBe(false);
});
// Covers the "All four files retrieved" / "All four mandated files read successfully" wording.
// This report contains none of the miss phrases, so it must not be flagged either.
it("accepts claude-style 'all four files retrieved' discovery summaries", () => {
const report = `
Worked
- All four files retrieved. Now let me compile the protocol report.
- All four mandated files read successfully: repo/qa/seed-scenarios.json, repo/qa/QA_KICKOFF_TASK.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md.
Failed
- None.
Blocked
- Runtime execution not attempted here.
Follow-up
- Run the live suite next.
`.trim();
expect(hasDiscoveryLabels(report)).toBe(true);
expect(reportsMissingDiscoveryFiles(report)).toBe(false);
});
// Negative control: only one required path is named and nothing confirms the reads, so the
// "not present" and "Could not inspect" lines must still be reported as missing files.
it("still flags genuine file-miss language when the report never confirms the required reads", () => {
const report = `
Worked
- Read some of the requested files.
Failed
- repo/docs/help/testing.md was not present.
Blocked
- Could not inspect the remaining refs.
Follow-up
- Fix the workspace mount.
`.trim();
expect(hasDiscoveryLabels(report)).toBe(true);
expect(reportsMissingDiscoveryFiles(report)).toBe(true);
});
});

View File

@@ -0,0 +1,43 @@
/** Workspace-relative paths a discovery report must name before its read confirmation counts. */
const REQUIRED_DISCOVERY_REFS = [
  "repo/qa/seed-scenarios.json",
  "repo/qa/QA_KICKOFF_TASK.md",
  "repo/extensions/qa-lab/src/suite.ts",
  "repo/docs/help/testing.md",
] as const;

/**
 * Phrasings (matched against lower-cased text) that state all four files were read:
 * verb-first ("read all four requested files"), verb-last ("all 4 required files were read"),
 * and the literal "all four seeded files readable".
 */
const DISCOVERY_READ_CONFIRMATIONS = [
  /(?:read|retrieved|inspected|loaded|accessed|digested)\s+all\s+(?:four|4)\s+(?:(?:requested|required|mandated|seeded)\s+)?files/g,
  /all\s+(?:four|4)\s+(?:(?:requested|required|mandated|seeded)\s+)?files\s+(?:were\s+)?(?:read|retrieved|inspected|loaded|accessed|digested)/g,
  /all (?:four|4) seeded files readable/g,
] as const;

/**
 * Negation cue sitting directly in front of a confirmation phrase on the same line, e.g.
 * "could not ", "didn't ", "never ", "unable to ", "failed to ". Only spaces/tabs may separate
 * the cue from the phrase so a cue on the previous line or sentence does not cancel it.
 */
const NEGATED_CONFIRMATION_LEAD_IN =
  /(?:\bnot|n['’]t|\bnever|\bcannot|\bunable[ \t]+to|\bfailed[ \t]+to)[ \t]+$/;

/**
 * Decides whether a report affirmatively confirms that every required discovery file was read.
 *
 * Both conditions must hold:
 * 1. every path in REQUIRED_DISCOVERY_REFS appears in the text (case-insensitive), and
 * 2. at least one confirmation phrase occurs that is NOT directly preceded by a negation cue.
 *
 * The negation check prevents statements such as "could not read all four requested files"
 * or "not all four files were read" from being treated as a successful confirmation.
 *
 * @param text Raw report text.
 * @returns true when the report names all required files and affirms they were read.
 */
function confirmsDiscoveryFileRead(text: string) {
  const lower = text.toLowerCase();
  const mentionsAllRefs = REQUIRED_DISCOVERY_REFS.every((ref) =>
    lower.includes(ref.toLowerCase()),
  );
  if (!mentionsAllRefs) {
    return false;
  }
  return DISCOVERY_READ_CONFIRMATIONS.some((pattern) =>
    // matchAll works on a copy of the regex, so the shared global patterns keep no state.
    Array.from(lower.matchAll(pattern)).some(
      (match) => !NEGATED_CONFIRMATION_LEAD_IN.test(lower.slice(0, match.index ?? 0)),
    ),
  );
}
/**
 * Checks that a report contains the four discovery section labels.
 *
 * Matching is a case-insensitive substring search: "worked", "failed" and "blocked" must all
 * appear, plus either "follow-up" or "follow up".
 *
 * @param text Raw report text.
 * @returns true when every label is present somewhere in the text.
 */
export function hasDiscoveryLabels(text: string) {
  const haystack = text.toLowerCase();
  const mandatoryLabels = ["worked", "failed", "blocked"];
  if (!mandatoryLabels.every((label) => haystack.includes(label))) {
    return false;
  }
  // The follow-up label is accepted with or without the hyphen.
  const followUpSpellings = ["follow-up", "follow up"];
  return followUpSpellings.some((label) => haystack.includes(label));
}
/**
 * Detects reports that say the discovery files could not be found or inspected.
 *
 * A report accepted by confirmsDiscoveryFileRead is never flagged, even if it contains one of
 * the miss phrases. Otherwise the text is flagged when it contains (case-insensitively) any of
 * "not present", "missing files", "blocked by missing" or "could not inspect".
 *
 * @param text Raw report text.
 * @returns true when the report uses file-miss language without a read confirmation.
 */
export function reportsMissingDiscoveryFiles(text: string) {
  // An explicit read confirmation takes precedence over any miss phrase.
  if (confirmsDiscoveryFileRead(text)) {
    return false;
  }
  const haystack = text.toLowerCase();
  const missPhrases = ["not present", "missing files", "blocked by missing", "could not inspect"];
  return missPhrases.some((phrase) => haystack.includes(phrase));
}

View File

@@ -0,0 +1,57 @@
import { describe, expect, it } from "vitest";
import { resolveQaLiveTurnTimeoutMs } from "./live-timeout.js";
// Tests for resolveQaLiveTurnTimeoutMs from ./live-timeout.js. Every case passes a 30_000 ms
// fallback and checks which value comes back for a given provider mode and model ref.
describe("qa live timeout policy", () => {
// In "mock-openai" mode the fallback is returned unchanged, even with anthropic model refs.
it("keeps mock lanes on the caller fallback", () => {
expect(
resolveQaLiveTurnTimeoutMs(
{
providerMode: "mock-openai",
primaryModel: "anthropic/claude-sonnet-4-6",
alternateModel: "anthropic/claude-opus-4-6",
},
30_000,
),
).toBe(30_000);
});
// Live mode with a non-anthropic primary model raises the 30_000 fallback to 120_000.
it("uses the standard live floor for non-anthropic models", () => {
expect(
resolveQaLiveTurnTimeoutMs(
{
providerMode: "live-frontier",
primaryModel: "openai/gpt-5.4",
alternateModel: "openai/gpt-5.4",
},
30_000,
),
).toBe(120_000);
});
// No third argument is passed, so the primary (sonnet) model decides: expected 180_000.
it("uses the anthropic floor for sonnet turns", () => {
expect(
resolveQaLiveTurnTimeoutMs(
{
providerMode: "live-frontier",
primaryModel: "anthropic/claude-sonnet-4-6",
alternateModel: "anthropic/claude-opus-4-6",
},
30_000,
),
).toBe(180_000);
});
// The explicit third argument names the opus model, which overrides the primary: expected 240_000.
it("uses the opus floor when the switched turn runs on claude opus", () => {
expect(
resolveQaLiveTurnTimeoutMs(
{
providerMode: "live-frontier",
primaryModel: "anthropic/claude-sonnet-4-6",
alternateModel: "anthropic/claude-opus-4-6",
},
30_000,
"anthropic/claude-opus-4-6",
),
).toBe(240_000);
});
});

View File

@@ -0,0 +1,30 @@
/** Run settings consulted when choosing the timeout for a single QA turn. */
type QaLiveTimeoutProfile = {
  providerMode: "mock-openai" | "live-frontier";
  primaryModel: string;
  alternateModel: string;
};

/** True when the model ref begins with the "anthropic/" provider prefix. */
function isAnthropicModel(modelRef: string) {
  const providerPrefix = "anthropic/";
  return modelRef.slice(0, providerPrefix.length) === providerPrefix;
}

/** True when the ref is an anthropic model whose name contains "claude-opus". */
function isClaudeOpusModel(modelRef: string) {
  if (!isAnthropicModel(modelRef)) {
    return false;
  }
  return modelRef.includes("claude-opus");
}

/**
 * Resolves the timeout (in milliseconds) to use for one QA turn.
 *
 * In "mock-openai" mode the caller's fallback is returned untouched. Otherwise the fallback is
 * raised to a minimum floor chosen from the model ref: 240_000 for claude opus, 180_000 for any
 * other anthropic model, and 120_000 for everything else. A fallback above the floor is kept.
 *
 * @param profile Provider mode and model refs of the current run.
 * @param fallbackMs Timeout the caller would use without any floor.
 * @param modelRef Model that runs the turn; defaults to `profile.primaryModel`.
 * @returns The larger of `fallbackMs` and the applicable floor, or `fallbackMs` in mock mode.
 */
export function resolveQaLiveTurnTimeoutMs(
  profile: QaLiveTimeoutProfile,
  fallbackMs: number,
  modelRef = profile.primaryModel,
) {
  if (profile.providerMode === "mock-openai") {
    return fallbackMs;
  }
  let floorMs = 120_000;
  if (isClaudeOpusModel(modelRef)) {
    floorMs = 240_000;
  } else if (isAnthropicModel(modelRef)) {
    floorMs = 180_000;
  }
  return Math.max(fallbackMs, floorMs);
}

View File

@@ -0,0 +1,28 @@
import { describe, expect, it } from "vitest";
import { hasModelSwitchContinuityEvidence } from "./model-switch-eval.js";
// Tests for hasModelSwitchContinuityEvidence from ./model-switch-eval.js: a reply must mention
// both the switch/handoff and the kickoff task (or QA mission) to count as continuity evidence.
describe("qa model-switch evaluation", () => {
// "Handoff" and "switched" supply the handoff cue; "QA_KICKOFF_TASK.md" supplies the kickoff cue.
it("accepts direct handoff replies that mention the kickoff task", () => {
expect(
hasModelSwitchContinuityEvidence(
"Handoff confirmed: I reread QA_KICKOFF_TASK.md and switched to gpt.",
),
).toBe(true);
});
// "model switch" supplies the handoff cue; "kickoff task" and "qa mission" supply the other.
it("accepts short mission-oriented switch confirmations", () => {
expect(
hasModelSwitchContinuityEvidence(
"model switch complete. reread the kickoff task; qa mission stays the same.",
),
).toBe(true);
});
// Contains "handoff" but neither the kickoff task nor the QA mission, so it must be rejected.
it("rejects unrelated handoff chatter that never confirms the kickoff reread", () => {
expect(
hasModelSwitchContinuityEvidence(
"subagent-handoff confirmed. qa report update: scenario pass. qa run complete.",
),
).toBe(false);
});
});

View File

@@ -0,0 +1,10 @@
/**
 * Checks whether a reply shows that the kickoff context survived a model switch.
 *
 * The text (compared case-insensitively) must contain a handoff cue ("handoff", "model switch"
 * or "switched") AND a kickoff cue ("qa_kickoff_task", "kickoff task" or "qa mission").
 *
 * @param text Reply text to evaluate.
 * @returns true only when both kinds of cue are present.
 */
export function hasModelSwitchContinuityEvidence(text: string) {
  const haystack = text.toLowerCase();
  const containsAny = (cues: readonly string[]) => cues.some((cue) => haystack.includes(cue));
  if (!containsAny(["handoff", "model switch", "switched"])) {
    return false;
  }
  return containsAny(["qa_kickoff_task", "kickoff task", "qa mission"]);
}

View File

@@ -14,10 +14,12 @@ import {
import { buildAgentSessionKey } from "openclaw/plugin-sdk/routing";
import type { QaBusState } from "./bus-state.js";
import { waitForCronRunCompletion } from "./cron-run-wait.js";
import { hasDiscoveryLabels, reportsMissingDiscoveryFiles } from "./discovery-eval.js";
import { extractQaToolPayload } from "./extract-tool-payload.js";
import { startQaGatewayChild } from "./gateway-child.js";
import { startQaLabServer } from "./lab-server.js";
import type { QaLabLatestReport, QaLabScenarioOutcome } from "./lab-server.js";
import { resolveQaLiveTurnTimeoutMs } from "./live-timeout.js";
import { startQaMockOpenAiServer } from "./mock-openai-server.js";
import {
defaultQaModelForMode,
@@ -25,6 +27,7 @@ import {
normalizeQaProviderMode,
type QaProviderMode,
} from "./model-selection.js";
import { hasModelSwitchContinuityEvidence } from "./model-switch-eval.js";
import { renderQaMarkdownReport, type QaReportCheck, type QaReportScenario } from "./report.js";
import { qaChannelPlugin, type QaBusMessage } from "./runtime-api.js";
import { readQaBootstrapScenarioCatalog } from "./scenario-catalog.js";
@@ -93,27 +96,7 @@ function splitModelRef(ref: string) {
}
function liveTurnTimeoutMs(env: QaSuiteEnvironment, fallbackMs: number) {
return env.providerMode === "mock-openai" ? fallbackMs : Math.max(fallbackMs, 120_000);
}
function hasDiscoveryLabels(text: string) {
const lower = text.toLowerCase();
return (
lower.includes("worked") &&
lower.includes("failed") &&
lower.includes("blocked") &&
(lower.includes("follow-up") || lower.includes("follow up"))
);
}
function reportsMissingDiscoveryFiles(text: string) {
const lower = text.toLowerCase();
return (
lower.includes("not present") ||
lower.includes("missing files") ||
lower.includes("blocked by missing") ||
lower.includes("could not inspect")
);
return resolveQaLiveTurnTimeoutMs(env, fallbackMs);
}
export type QaSuiteResult = {
@@ -916,7 +899,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
message: "Continue the exchange after switching models and note the handoff.",
provider: alternate?.provider,
model: alternate?.model,
timeoutMs: liveTurnTimeoutMs(env, 30_000),
timeoutMs: resolveQaLiveTurnTimeoutMs(env, 30_000, env.alternateModel),
});
const outbound = await waitForCondition(
() =>
@@ -930,7 +913,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
candidate.text.toLowerCase().includes("handoff")),
)
.at(-1),
liveTurnTimeoutMs(env, 20_000),
resolveQaLiveTurnTimeoutMs(env, 20_000, env.alternateModel),
);
if (env.mock) {
const request = await fetchJson<{ body?: { model?: string } }>(
@@ -1630,24 +1613,23 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
"Switch models now. Tool continuity check: reread QA_KICKOFF_TASK.md and mention the handoff in one short sentence.",
provider: alternate?.provider,
model: alternate?.model,
timeoutMs: liveTurnTimeoutMs(env, 30_000),
timeoutMs: resolveQaLiveTurnTimeoutMs(env, 30_000, env.alternateModel),
});
const outbound = await waitForCondition(
() => {
const snapshot = state.getSnapshot();
return snapshot.messages
.slice(beforeSwitchCursor)
.filter(
(candidate) =>
candidate.direction === "outbound" &&
candidate.conversation.id === "qa-operator" &&
(candidate.text.toLowerCase().includes("model switch") ||
candidate.text.toLowerCase().includes("handoff")),
)
.at(-1);
},
liveTurnTimeoutMs(env, 30_000),
);
const outbound = await waitForCondition(() => {
const snapshot = state.getSnapshot();
return snapshot.messages
.slice(beforeSwitchCursor)
.filter(
(candidate) =>
candidate.direction === "outbound" &&
candidate.conversation.id === "qa-operator" &&
hasModelSwitchContinuityEvidence(candidate.text),
)
.at(-1);
}, 10_000);
if (!hasModelSwitchContinuityEvidence(outbound.text)) {
throw new Error(`switch reply missed kickoff continuity: ${outbound.text}`);
}
if (env.mock) {
const requests = await fetchJson<
Array<{ allInputText?: string; plannedToolName?: string; model?: string }>