mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-26 08:31:55 +00:00
fix(qa): harden frontier claude bakeoffs
This commit is contained in:
committed by
Peter Steinberger
parent
18fb171179
commit
4f421fa0f1
74
extensions/qa-lab/src/discovery-eval.test.ts
Normal file
74
extensions/qa-lab/src/discovery-eval.test.ts
Normal file
@@ -0,0 +1,74 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { hasDiscoveryLabels, reportsMissingDiscoveryFiles } from "./discovery-eval.js";
|
||||
|
||||
describe("qa discovery evaluation", () => {
  /** Assembles a report from individual lines, newline-separated. */
  const asReport = (...lines: string[]) => lines.join("\n");

  // Reports that confirm every required file was read; none may be flagged as a file miss.
  const confirmedReports = [
    {
      title: "accepts rich discovery reports that explicitly confirm all required files were read",
      report: asReport(
        "Worked",
        "- Read all four requested files: repo/qa/seed-scenarios.json, repo/qa/QA_KICKOFF_TASK.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md.",
        "Failed",
        "- None.",
        "Blocked",
        "- Runtime execution not attempted here.",
        "Follow-up",
        "- Run the live suite next.",
        "",
        'The helper text mentions banned phrases like "not present", "missing files", "blocked by missing", and "could not inspect", but only as quoted examples.',
      ),
    },
    {
      title: "accepts numeric 'all 4 required files read' confirmations",
      report: asReport(
        "Worked",
        "- Source: repo/qa/seed-scenarios.json, repo/qa/QA_KICKOFF_TASK.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md",
        "- all 4 required files read.",
        "Failed",
        "- None.",
        "Blocked",
        "- No runtime execution in this pass.",
        "Follow-up",
        "- Run the live suite next.",
        "",
        'The report may quote phrases like "not present" while describing the evaluator, but the files were read.',
      ),
    },
    {
      title: "accepts claude-style 'all four files retrieved' discovery summaries",
      report: asReport(
        "Worked",
        "- All four files retrieved. Now let me compile the protocol report.",
        "- All four mandated files read successfully: repo/qa/seed-scenarios.json, repo/qa/QA_KICKOFF_TASK.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md.",
        "Failed",
        "- None.",
        "Blocked",
        "- Runtime execution not attempted here.",
        "Follow-up",
        "- Run the live suite next.",
      ),
    },
  ];

  for (const { title, report } of confirmedReports) {
    it(title, () => {
      expect(hasDiscoveryLabels(report)).toBe(true);
      expect(reportsMissingDiscoveryFiles(report)).toBe(false);
    });
  }

  it("still flags genuine file-miss language when the report never confirms the required reads", () => {
    const unconfirmedReport = asReport(
      "Worked",
      "- Read some of the requested files.",
      "Failed",
      "- repo/docs/help/testing.md was not present.",
      "Blocked",
      "- Could not inspect the remaining refs.",
      "Follow-up",
      "- Fix the workspace mount.",
    );

    expect(hasDiscoveryLabels(unconfirmedReport)).toBe(true);
    expect(reportsMissingDiscoveryFiles(unconfirmedReport)).toBe(true);
  });
});
|
||||
43
extensions/qa-lab/src/discovery-eval.ts
Normal file
43
extensions/qa-lab/src/discovery-eval.ts
Normal file
@@ -0,0 +1,43 @@
|
||||
/** File references that a report must mention before a read confirmation is trusted. */
const REQUIRED_DISCOVERY_REFS = [
  "repo/qa/seed-scenarios.json",
  "repo/qa/QA_KICKOFF_TASK.md",
  "repo/extensions/qa-lab/src/suite.ts",
  "repo/docs/help/testing.md",
] as const;

/**
 * Phrasings (tested against lower-cased text) that state all four files were read.
 *
 * Each pattern starts with a negative lookbehind so that a negated sentence such as
 * "could not read all four requested files" or "not all four files were read" is
 * not mistaken for a confirmation.
 */
const DISCOVERY_READ_CONFIRMATIONS: readonly RegExp[] = [
  // "<verb> all four [requested] files", unless the verb is negated ("not", "n't", "failed to", ...).
  /(?<!(?:\bnot|\bnever|\bcannot|n['’]t|\bunable\s+to|\bfailed\s+to)\s+(?:re-?)?)(?:read|retrieved|inspected|loaded|accessed|digested)\s+all\s+(?:four|4)\s+(?:(?:requested|required|mandated|seeded)\s+)?files/,
  // "all four [requested] files [were] <verb>", unless preceded by "not".
  /(?<!\bnot\s+)all\s+(?:four|4)\s+(?:(?:requested|required|mandated|seeded)\s+)?files\s+(?:were\s+)?(?:read|retrieved|inspected|loaded|accessed|digested)/,
  // "all four seeded files readable", unless preceded by "not".
  /(?<!\bnot\s+)all (?:four|4) seeded files readable/,
];

/**
 * Reports whether `text` affirmatively confirms that every required discovery file was read.
 *
 * Both conditions must hold (matching is case-insensitive):
 * - every entry of `REQUIRED_DISCOVERY_REFS` appears somewhere in the text, and
 * - at least one non-negated confirmation phrase is present.
 *
 * @param text Report text to inspect.
 * @returns `true` only when all refs are mentioned and a confirmation phrase matches.
 */
function confirmsDiscoveryFileRead(text: string): boolean {
  const lower = text.toLowerCase();
  const mentionsAllRefs = REQUIRED_DISCOVERY_REFS.every((ref) =>
    lower.includes(ref.toLowerCase()),
  );
  if (!mentionsAllRefs) {
    return false;
  }
  return DISCOVERY_READ_CONFIRMATIONS.some((pattern) => pattern.test(lower));
}
|
||||
|
||||
/**
 * Checks that a discovery report contains all four section labels:
 * "Worked", "Failed", "Blocked", and "Follow-up" (or "Follow up").
 *
 * Matching is case-insensitive. A label must not be glued to a preceding letter or
 * digit, and the three single-word labels must not be followed by one either, so
 * words like "networked" or "unblocked" do not satisfy the check. Underscores and
 * other punctuation count as separators, so markdown emphasis such as `_Worked_` or
 * `**Worked**` still matches.
 *
 * @param text Report text to inspect.
 * @returns `true` when every required label is present.
 */
export function hasDiscoveryLabels(text: string): boolean {
  const lower = text.toLowerCase();
  const requiredLabels = [
    /(?<![a-z0-9])worked(?![a-z0-9])/,
    /(?<![a-z0-9])failed(?![a-z0-9])/,
    /(?<![a-z0-9])blocked(?![a-z0-9])/,
    // No trailing guard here so plural "follow-ups" keeps matching.
    /(?<![a-z0-9])follow[- ]up/,
  ];
  return requiredLabels.every((label) => label.test(lower));
}
|
||||
|
||||
/**
 * Tells whether a discovery report claims that required files were missing.
 *
 * A report that confirms the required reads (per `confirmsDiscoveryFileRead`) is never
 * flagged, even if it contains one of the miss phrases. Otherwise the report is flagged
 * when it contains any of the miss phrases, compared case-insensitively.
 */
export function reportsMissingDiscoveryFiles(text: string) {
  // An explicit read confirmation overrides any miss phrase found in the text.
  if (confirmsDiscoveryFileRead(text)) {
    return false;
  }

  const haystack = text.toLowerCase();
  const missPhrases = ["not present", "missing files", "blocked by missing", "could not inspect"];
  return missPhrases.some((phrase) => haystack.includes(phrase));
}
|
||||
57
extensions/qa-lab/src/live-timeout.test.ts
Normal file
57
extensions/qa-lab/src/live-timeout.test.ts
Normal file
@@ -0,0 +1,57 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { resolveQaLiveTurnTimeoutMs } from "./live-timeout.js";
|
||||
|
||||
describe("qa live timeout policy", () => {
  // Shared model pairings reused across the cases below.
  const claudeModels = {
    primaryModel: "anthropic/claude-sonnet-4-6",
    alternateModel: "anthropic/claude-opus-4-6",
  };
  const gptModels = {
    primaryModel: "openai/gpt-5.4",
    alternateModel: "openai/gpt-5.4",
  };

  it("keeps mock lanes on the caller fallback", () => {
    const timeoutMs = resolveQaLiveTurnTimeoutMs(
      { providerMode: "mock-openai", ...claudeModels },
      30_000,
    );
    expect(timeoutMs).toBe(30_000);
  });

  it("uses the standard live floor for non-anthropic models", () => {
    const timeoutMs = resolveQaLiveTurnTimeoutMs(
      { providerMode: "live-frontier", ...gptModels },
      30_000,
    );
    expect(timeoutMs).toBe(120_000);
  });

  it("uses the anthropic floor for sonnet turns", () => {
    const timeoutMs = resolveQaLiveTurnTimeoutMs(
      { providerMode: "live-frontier", ...claudeModels },
      30_000,
    );
    expect(timeoutMs).toBe(180_000);
  });

  it("uses the opus floor when the switched turn runs on claude opus", () => {
    const timeoutMs = resolveQaLiveTurnTimeoutMs(
      { providerMode: "live-frontier", ...claudeModels },
      30_000,
      "anthropic/claude-opus-4-6",
    );
    expect(timeoutMs).toBe(240_000);
  });
});
|
||||
30
extensions/qa-lab/src/live-timeout.ts
Normal file
30
extensions/qa-lab/src/live-timeout.ts
Normal file
@@ -0,0 +1,30 @@
|
||||
/** Fields consulted when choosing a per-turn timeout. */
type QaLiveTimeoutProfile = {
  providerMode: "mock-openai" | "live-frontier";
  primaryModel: string;
  alternateModel: string;
};

/** Minimum timeout for any live (non-mock) turn. */
const LIVE_TURN_FLOOR_MS = 120_000;
/** Minimum timeout for live turns on an `anthropic/` model. */
const ANTHROPIC_TURN_FLOOR_MS = 180_000;
/** Minimum timeout for live turns on an `anthropic/` model whose ref contains `claude-opus`. */
const CLAUDE_OPUS_TURN_FLOOR_MS = 240_000;

/** True when the model ref uses the `anthropic/` provider prefix. */
function isAnthropicModel(modelRef: string): boolean {
  return modelRef.startsWith("anthropic/");
}

/** True when the model ref is an Anthropic ref that contains `claude-opus`. */
function isClaudeOpusModel(modelRef: string): boolean {
  return isAnthropicModel(modelRef) && modelRef.includes("claude-opus");
}

/**
 * Resolves the timeout for a single QA turn.
 *
 * Mock lanes return `fallbackMs` untouched. Live lanes return `fallbackMs` raised to a
 * per-model floor: 240s for Claude Opus refs, 180s for other Anthropic refs, 120s for
 * everything else.
 *
 * @param profile Provider mode plus the primary/alternate model refs.
 * @param fallbackMs Caller-preferred timeout in milliseconds.
 * @param modelRef Model the turn runs on; defaults to `profile.primaryModel`.
 * @returns The timeout in milliseconds.
 */
export function resolveQaLiveTurnTimeoutMs(
  profile: QaLiveTimeoutProfile,
  fallbackMs: number,
  modelRef = profile.primaryModel,
): number {
  if (profile.providerMode === "mock-openai") {
    return fallbackMs;
  }

  let floorMs = LIVE_TURN_FLOOR_MS;
  if (isClaudeOpusModel(modelRef)) {
    floorMs = CLAUDE_OPUS_TURN_FLOOR_MS;
  } else if (isAnthropicModel(modelRef)) {
    floorMs = ANTHROPIC_TURN_FLOOR_MS;
  }

  // Math.max propagates NaN, which would produce an unusable timeout; use the floor instead.
  return Number.isNaN(fallbackMs) ? floorMs : Math.max(fallbackMs, floorMs);
}
|
||||
28
extensions/qa-lab/src/model-switch-eval.test.ts
Normal file
28
extensions/qa-lab/src/model-switch-eval.test.ts
Normal file
@@ -0,0 +1,28 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { hasModelSwitchContinuityEvidence } from "./model-switch-eval.js";
|
||||
|
||||
describe("qa model-switch evaluation", () => {
  it("accepts direct handoff replies that mention the kickoff task", () => {
    const reply = "Handoff confirmed: I reread QA_KICKOFF_TASK.md and switched to gpt.";
    expect(hasModelSwitchContinuityEvidence(reply)).toBe(true);
  });

  it("accepts short mission-oriented switch confirmations", () => {
    const reply = "model switch complete. reread the kickoff task; qa mission stays the same.";
    expect(hasModelSwitchContinuityEvidence(reply)).toBe(true);
  });

  it("rejects unrelated handoff chatter that never confirms the kickoff reread", () => {
    const reply = "subagent-handoff confirmed. qa report update: scenario pass. qa run complete.";
    expect(hasModelSwitchContinuityEvidence(reply)).toBe(false);
  });
});
|
||||
10
extensions/qa-lab/src/model-switch-eval.ts
Normal file
10
extensions/qa-lab/src/model-switch-eval.ts
Normal file
@@ -0,0 +1,10 @@
|
||||
/**
 * Checks whether a reply shows continuity across a model switch.
 *
 * The reply must contain both (case-insensitively):
 * - a handoff mention: "handoff" / "hand-off" / "hand off", "model switch" /
 *   "model-switch", or "switched"; and
 * - a kickoff reference: "qa_kickoff_task", "kickoff task" / "kick-off task" /
 *   "kick off task", or "qa mission".
 *
 * Hyphenated and spaced spellings are accepted so that a reply is not rejected purely
 * for writing "hand-off" or "kick-off".
 *
 * @param text Reply text to inspect.
 * @returns `true` when both a handoff mention and a kickoff reference are present.
 */
export function hasModelSwitchContinuityEvidence(text: string): boolean {
  const lower = text.toLowerCase();
  const mentionsHandoff = /hand[- ]?off|model[- ]switch|switched/.test(lower);
  const mentionsKickoffTask = /qa_kickoff_task|kick[- ]?off task|qa mission/.test(lower);
  return mentionsHandoff && mentionsKickoffTask;
}
|
||||
@@ -14,10 +14,12 @@ import {
|
||||
import { buildAgentSessionKey } from "openclaw/plugin-sdk/routing";
|
||||
import type { QaBusState } from "./bus-state.js";
|
||||
import { waitForCronRunCompletion } from "./cron-run-wait.js";
|
||||
import { hasDiscoveryLabels, reportsMissingDiscoveryFiles } from "./discovery-eval.js";
|
||||
import { extractQaToolPayload } from "./extract-tool-payload.js";
|
||||
import { startQaGatewayChild } from "./gateway-child.js";
|
||||
import { startQaLabServer } from "./lab-server.js";
|
||||
import type { QaLabLatestReport, QaLabScenarioOutcome } from "./lab-server.js";
|
||||
import { resolveQaLiveTurnTimeoutMs } from "./live-timeout.js";
|
||||
import { startQaMockOpenAiServer } from "./mock-openai-server.js";
|
||||
import {
|
||||
defaultQaModelForMode,
|
||||
@@ -25,6 +27,7 @@ import {
|
||||
normalizeQaProviderMode,
|
||||
type QaProviderMode,
|
||||
} from "./model-selection.js";
|
||||
import { hasModelSwitchContinuityEvidence } from "./model-switch-eval.js";
|
||||
import { renderQaMarkdownReport, type QaReportCheck, type QaReportScenario } from "./report.js";
|
||||
import { qaChannelPlugin, type QaBusMessage } from "./runtime-api.js";
|
||||
import { readQaBootstrapScenarioCatalog } from "./scenario-catalog.js";
|
||||
@@ -93,27 +96,7 @@ function splitModelRef(ref: string) {
|
||||
}
|
||||
|
||||
function liveTurnTimeoutMs(env: QaSuiteEnvironment, fallbackMs: number) {
|
||||
return env.providerMode === "mock-openai" ? fallbackMs : Math.max(fallbackMs, 120_000);
|
||||
}
|
||||
|
||||
function hasDiscoveryLabels(text: string) {
|
||||
const lower = text.toLowerCase();
|
||||
return (
|
||||
lower.includes("worked") &&
|
||||
lower.includes("failed") &&
|
||||
lower.includes("blocked") &&
|
||||
(lower.includes("follow-up") || lower.includes("follow up"))
|
||||
);
|
||||
}
|
||||
|
||||
function reportsMissingDiscoveryFiles(text: string) {
|
||||
const lower = text.toLowerCase();
|
||||
return (
|
||||
lower.includes("not present") ||
|
||||
lower.includes("missing files") ||
|
||||
lower.includes("blocked by missing") ||
|
||||
lower.includes("could not inspect")
|
||||
);
|
||||
return resolveQaLiveTurnTimeoutMs(env, fallbackMs);
|
||||
}
|
||||
|
||||
export type QaSuiteResult = {
|
||||
@@ -916,7 +899,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
message: "Continue the exchange after switching models and note the handoff.",
|
||||
provider: alternate?.provider,
|
||||
model: alternate?.model,
|
||||
timeoutMs: liveTurnTimeoutMs(env, 30_000),
|
||||
timeoutMs: resolveQaLiveTurnTimeoutMs(env, 30_000, env.alternateModel),
|
||||
});
|
||||
const outbound = await waitForCondition(
|
||||
() =>
|
||||
@@ -930,7 +913,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
candidate.text.toLowerCase().includes("handoff")),
|
||||
)
|
||||
.at(-1),
|
||||
liveTurnTimeoutMs(env, 20_000),
|
||||
resolveQaLiveTurnTimeoutMs(env, 20_000, env.alternateModel),
|
||||
);
|
||||
if (env.mock) {
|
||||
const request = await fetchJson<{ body?: { model?: string } }>(
|
||||
@@ -1630,24 +1613,23 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
"Switch models now. Tool continuity check: reread QA_KICKOFF_TASK.md and mention the handoff in one short sentence.",
|
||||
provider: alternate?.provider,
|
||||
model: alternate?.model,
|
||||
timeoutMs: liveTurnTimeoutMs(env, 30_000),
|
||||
timeoutMs: resolveQaLiveTurnTimeoutMs(env, 30_000, env.alternateModel),
|
||||
});
|
||||
const outbound = await waitForCondition(
|
||||
() => {
|
||||
const snapshot = state.getSnapshot();
|
||||
return snapshot.messages
|
||||
.slice(beforeSwitchCursor)
|
||||
.filter(
|
||||
(candidate) =>
|
||||
candidate.direction === "outbound" &&
|
||||
candidate.conversation.id === "qa-operator" &&
|
||||
(candidate.text.toLowerCase().includes("model switch") ||
|
||||
candidate.text.toLowerCase().includes("handoff")),
|
||||
)
|
||||
.at(-1);
|
||||
},
|
||||
liveTurnTimeoutMs(env, 30_000),
|
||||
);
|
||||
const outbound = await waitForCondition(() => {
|
||||
const snapshot = state.getSnapshot();
|
||||
return snapshot.messages
|
||||
.slice(beforeSwitchCursor)
|
||||
.filter(
|
||||
(candidate) =>
|
||||
candidate.direction === "outbound" &&
|
||||
candidate.conversation.id === "qa-operator" &&
|
||||
hasModelSwitchContinuityEvidence(candidate.text),
|
||||
)
|
||||
.at(-1);
|
||||
}, 10_000);
|
||||
if (!hasModelSwitchContinuityEvidence(outbound.text)) {
|
||||
throw new Error(`switch reply missed kickoff continuity: ${outbound.text}`);
|
||||
}
|
||||
if (env.mock) {
|
||||
const requests = await fetchJson<
|
||||
Array<{ allInputText?: string; plannedToolName?: string; model?: string }>
|
||||
|
||||
Reference in New Issue
Block a user