fix(qa): harden frontier claude bakeoffs

2026-04-26 08:31:55 +00:00 · 2026-04-07 09:58:30 +01:00
parent 18fb171179
commit 4f421fa0f1
7 changed files with 264 additions and 40 deletions
--- a/extensions/qa-lab/src/discovery-eval.test.ts
+++ b/extensions/qa-lab/src/discovery-eval.test.ts
@@ -0,0 +1,74 @@
+import { describe, expect, it } from "vitest";
+import { hasDiscoveryLabels, reportsMissingDiscoveryFiles } from "./discovery-eval.js";
+
+describe("qa discovery evaluation", () => { // exercises hasDiscoveryLabels and reportsMissingDiscoveryFiles
+  it("accepts rich discovery reports that explicitly confirm all required files were read", () => {
+    const report = `
+Worked
+- Read all four requested files: repo/qa/seed-scenarios.json, repo/qa/QA_KICKOFF_TASK.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md.
+Failed
+- None.
+Blocked
+- Runtime execution not attempted here.
+Follow-up
+- Run the live suite next.
+
+The helper text mentions banned phrases like "not present", "missing files", "blocked by missing", and "could not inspect", but only as quoted examples.
+`.trim();
+
+    expect(hasDiscoveryLabels(report)).toBe(true);
+    expect(reportsMissingDiscoveryFiles(report)).toBe(false); // the read confirmation overrides the quoted miss phrases
+  });
+
+  it("accepts numeric 'all 4 required files read' confirmations", () => {
+    const report = `
+Worked
+- Source: repo/qa/seed-scenarios.json, repo/qa/QA_KICKOFF_TASK.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md
+- all 4 required files read.
+Failed
+- None.
+Blocked
+- No runtime execution in this pass.
+Follow-up
+- Run the live suite next.
+
+The report may quote phrases like "not present" while describing the evaluator, but the files were read.
+`.trim();
+
+    expect(hasDiscoveryLabels(report)).toBe(true);
+    expect(reportsMissingDiscoveryFiles(report)).toBe(false); // digit form "all 4 ... files read" also counts as a confirmation
+  });
+
+  it("accepts claude-style 'all four files retrieved' discovery summaries", () => {
+    const report = `
+Worked
+- All four files retrieved. Now let me compile the protocol report.
+- All four mandated files read successfully: repo/qa/seed-scenarios.json, repo/qa/QA_KICKOFF_TASK.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md.
+Failed
+- None.
+Blocked
+- Runtime execution not attempted here.
+Follow-up
+- Run the live suite next.
+`.trim();
+
+    expect(hasDiscoveryLabels(report)).toBe(true);
+    expect(reportsMissingDiscoveryFiles(report)).toBe(false); // this report contains none of the miss phrases at all
+  });
+
+  it("still flags genuine file-miss language when the report never confirms the required reads", () => {
+    const report = `
+Worked
+- Read some of the requested files.
+Failed
+- repo/docs/help/testing.md was not present.
+Blocked
+- Could not inspect the remaining refs.
+Follow-up
+- Fix the workspace mount.
+`.trim();
+
+    expect(hasDiscoveryLabels(report)).toBe(true);
+    expect(reportsMissingDiscoveryFiles(report)).toBe(true); // no confirmation, so "not present" / "could not inspect" are flagged
+  });
+});
--- a/extensions/qa-lab/src/discovery-eval.ts
+++ b/extensions/qa-lab/src/discovery-eval.ts
@@ -0,0 +1,43 @@
+const REQUIRED_DISCOVERY_REFS = [
+  "repo/qa/seed-scenarios.json",
+  "repo/qa/QA_KICKOFF_TASK.md",
+  "repo/extensions/qa-lab/src/suite.ts",
+  "repo/docs/help/testing.md",
+] as const; // a read confirmation only counts when the report text contains every one of these paths (case-insensitive)
+
+// True when the report affirms that all four files were read AND cites every required ref; negated claims do not count.
+function confirmsDiscoveryFileRead(text: string) {
+  const lower = text.toLowerCase();
+  // Void a claim that has a same-line negator at most one word earlier ("could not read all four files", "not all 4 files read").
+  const notNegated = String.raw`(?<!(?:\b(?:not|never|cannot|unable to|failed to)|n't)[ \t]+(?:\w+[ \t]+)?\w*)`;
+  const verb = "(?:read|retrieved|inspected|loaded|accessed|digested)";
+  const allFiles = String.raw`all\s+(?:four|4)\s+(?:(?:requested|required|mandated|seeded)\s+)?files`;
+  const confirmsRead =
+    new RegExp(String.raw`${notNegated}${verb}\s+${allFiles}`).test(lower) ||
+    new RegExp(String.raw`${notNegated}${allFiles}\s+(?:were\s+)?${verb}`).test(lower) ||
+    new RegExp(`${notNegated}all (?:four|4) seeded files readable`).test(lower);
+  return confirmsRead && REQUIRED_DISCOVERY_REFS.every((ref) => lower.includes(ref.toLowerCase()));
+}
+
+/** True when the text names every report section: Worked, Failed, Blocked and Follow-up (any letter case). */
+export function hasDiscoveryLabels(text: string) {
+  const haystack = text.toLowerCase();
+  const hasFollowUp = haystack.includes("follow-up") || haystack.includes("follow up");
+  if (!hasFollowUp) {
+    return false;
+  }
+  return ["worked", "failed", "blocked"].every((label) => haystack.includes(label));
+}
+
+/**
+ * Reports whether the text contains file-miss wording ("not present", "could not inspect", ...).
+ * An explicit read confirmation takes precedence and makes this return false.
+ */
+export function reportsMissingDiscoveryFiles(text: string) {
+  if (confirmsDiscoveryFileRead(text)) {
+    return false;
+  }
+  const haystack = text.toLowerCase();
+  const missPhrases = ["not present", "missing files", "blocked by missing", "could not inspect"];
+  return missPhrases.some((phrase) => haystack.includes(phrase));
+}
--- a/extensions/qa-lab/src/live-timeout.test.ts
+++ b/extensions/qa-lab/src/live-timeout.test.ts
@@ -0,0 +1,57 @@
+import { describe, expect, it } from "vitest";
+import { resolveQaLiveTurnTimeoutMs } from "./live-timeout.js";
+
+describe("qa live timeout policy", () => { // exercises resolveQaLiveTurnTimeoutMs
+  it("keeps mock lanes on the caller fallback", () => {
+    expect(
+      resolveQaLiveTurnTimeoutMs(
+        {
+          providerMode: "mock-openai",
+          primaryModel: "anthropic/claude-sonnet-4-6",
+          alternateModel: "anthropic/claude-opus-4-6",
+        },
+        30_000,
+      ),
+    ).toBe(30_000); // mock lane: the fallback comes back unchanged even for anthropic refs
+  });
+
+  it("uses the standard live floor for non-anthropic models", () => {
+    expect(
+      resolveQaLiveTurnTimeoutMs(
+        {
+          providerMode: "live-frontier",
+          primaryModel: "openai/gpt-5.4",
+          alternateModel: "openai/gpt-5.4",
+        },
+        30_000,
+      ),
+    ).toBe(120_000); // 30s fallback is raised to the 120s floor
+  });
+
+  it("uses the anthropic floor for sonnet turns", () => {
+    expect(
+      resolveQaLiveTurnTimeoutMs(
+        {
+          providerMode: "live-frontier",
+          primaryModel: "anthropic/claude-sonnet-4-6",
+          alternateModel: "anthropic/claude-opus-4-6",
+        },
+        30_000,
+      ),
+    ).toBe(180_000); // no modelRef argument, so primaryModel decides the floor
+  });
+
+  it("uses the opus floor when the switched turn runs on claude opus", () => {
+    expect(
+      resolveQaLiveTurnTimeoutMs(
+        {
+          providerMode: "live-frontier",
+          primaryModel: "anthropic/claude-sonnet-4-6",
+          alternateModel: "anthropic/claude-opus-4-6",
+        },
+        30_000,
+        "anthropic/claude-opus-4-6",
+      ),
+    ).toBe(240_000); // the explicit third argument overrides primaryModel
+  });
+});
--- a/extensions/qa-lab/src/live-timeout.ts
+++ b/extensions/qa-lab/src/live-timeout.ts
@@ -0,0 +1,30 @@
+type QaLiveTimeoutProfile = {
+  providerMode: "mock-openai" | "live-frontier"; // "mock-openai" makes the resolver return the caller's fallback as-is
+  primaryModel: string; // default model ref when the resolver gets no explicit modelRef
+  alternateModel: string; // not read by the resolver itself; callers may pass it as modelRef for switched turns
+};
+
+function isAnthropicModel(modelRef: string) {
+  return /^anthropic\//.test(modelRef); // the provider prefix must open the ref
+}
+
+function isClaudeOpusModel(modelRef: string) {
+  return modelRef.includes("claude-opus") ? isAnthropicModel(modelRef) : false; // needs both the opus name and the anthropic prefix
+}
+
+/**
+ * Picks the timeout for one turn: mock lanes return fallbackMs untouched, live lanes raise it to a
+ * per-model floor (Claude Opus 240s, other Anthropic refs 180s, everything else 120s).
+ */
+export function resolveQaLiveTurnTimeoutMs(
+  profile: QaLiveTimeoutProfile,
+  fallbackMs: number,
+  modelRef = profile.primaryModel,
+) {
+  if (profile.providerMode === "mock-openai") {
+    return fallbackMs;
+  }
+  const anthropic = isAnthropicModel(modelRef);
+  const floorMs = isClaudeOpusModel(modelRef) ? 240_000 : anthropic ? 180_000 : 120_000;
+  return Math.max(fallbackMs, floorMs);
+}
--- a/extensions/qa-lab/src/model-switch-eval.test.ts
+++ b/extensions/qa-lab/src/model-switch-eval.test.ts
@@ -0,0 +1,28 @@
+import { describe, expect, it } from "vitest";
+import { hasModelSwitchContinuityEvidence } from "./model-switch-eval.js";
+
+describe("qa model-switch evaluation", () => { // exercises hasModelSwitchContinuityEvidence
+  it("accepts direct handoff replies that mention the kickoff task", () => {
+    expect(
+      hasModelSwitchContinuityEvidence(
+        "Handoff confirmed: I reread QA_KICKOFF_TASK.md and switched to gpt.",
+      ),
+    ).toBe(true); // "handoff" / "switched" plus "qa_kickoff_task" after lower-casing
+  });
+
+  it("accepts short mission-oriented switch confirmations", () => {
+    expect(
+      hasModelSwitchContinuityEvidence(
+        "model switch complete. reread the kickoff task; qa mission stays the same.",
+      ),
+    ).toBe(true); // "model switch" plus "kickoff task"
+  });
+
+  it("rejects unrelated handoff chatter that never confirms the kickoff reread", () => {
+    expect(
+      hasModelSwitchContinuityEvidence(
+        "subagent-handoff confirmed. qa report update: scenario pass. qa run complete.",
+      ),
+    ).toBe(false); // contains "handoff" but none of the kickoff-task phrases
+  });
+});
--- a/extensions/qa-lab/src/model-switch-eval.ts
+++ b/extensions/qa-lab/src/model-switch-eval.ts
@@ -0,0 +1,10 @@
+// A switch reply counts only if it mentions the handoff AND refers back to the kickoff task (any letter case).
+export function hasModelSwitchContinuityEvidence(text: string) {
+  const haystack = text.toLowerCase();
+  const mentionsAny = (needles: readonly string[]) =>
+    needles.some((needle) => haystack.includes(needle));
+  if (!mentionsAny(["handoff", "model switch", "switched"])) {
+    return false;
+  }
+  return mentionsAny(["qa_kickoff_task", "kickoff task", "qa mission"]);
+}
--- a/extensions/qa-lab/src/suite.ts
+++ b/extensions/qa-lab/src/suite.ts
@@ -14,10 +14,12 @@ import {
 import { buildAgentSessionKey } from "openclaw/plugin-sdk/routing";
 import type { QaBusState } from "./bus-state.js";
 import { waitForCronRunCompletion } from "./cron-run-wait.js";
+import { hasDiscoveryLabels, reportsMissingDiscoveryFiles } from "./discovery-eval.js";
 import { extractQaToolPayload } from "./extract-tool-payload.js";
 import { startQaGatewayChild } from "./gateway-child.js";
 import { startQaLabServer } from "./lab-server.js";
 import type { QaLabLatestReport, QaLabScenarioOutcome } from "./lab-server.js";
+import { resolveQaLiveTurnTimeoutMs } from "./live-timeout.js";
 import { startQaMockOpenAiServer } from "./mock-openai-server.js";
 import {
  defaultQaModelForMode,
@@ -25,6 +27,7 @@ import {
  normalizeQaProviderMode,
  type QaProviderMode,
 } from "./model-selection.js";
+import { hasModelSwitchContinuityEvidence } from "./model-switch-eval.js";
 import { renderQaMarkdownReport, type QaReportCheck, type QaReportScenario } from "./report.js";
 import { qaChannelPlugin, type QaBusMessage } from "./runtime-api.js";
 import { readQaBootstrapScenarioCatalog } from "./scenario-catalog.js";
@@ -93,27 +96,7 @@ function splitModelRef(ref: string) {
 }

 function liveTurnTimeoutMs(env: QaSuiteEnvironment, fallbackMs: number) {
-  return env.providerMode === "mock-openai" ? fallbackMs : Math.max(fallbackMs, 120_000);
-}
-
-function hasDiscoveryLabels(text: string) {
-  const lower = text.toLowerCase();
-  return (
-    lower.includes("worked") &&
-    lower.includes("failed") &&
-    lower.includes("blocked") &&
-    (lower.includes("follow-up") || lower.includes("follow up"))
-  );
-}
-
-function reportsMissingDiscoveryFiles(text: string) {
-  const lower = text.toLowerCase();
-  return (
-    lower.includes("not present") ||
-    lower.includes("missing files") ||
-    lower.includes("blocked by missing") ||
-    lower.includes("could not inspect")
-  );
+  return resolveQaLiveTurnTimeoutMs(env, fallbackMs);
 }

 export type QaSuiteResult = {
@@ -916,7 +899,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
                message: "Continue the exchange after switching models and note the handoff.",
                provider: alternate?.provider,
                model: alternate?.model,
-                timeoutMs: liveTurnTimeoutMs(env, 30_000),
+                timeoutMs: resolveQaLiveTurnTimeoutMs(env, 30_000, env.alternateModel),
              });
              const outbound = await waitForCondition(
                () =>
@@ -930,7 +913,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
                          candidate.text.toLowerCase().includes("handoff")),
                    )
                    .at(-1),
-                liveTurnTimeoutMs(env, 20_000),
+                resolveQaLiveTurnTimeoutMs(env, 20_000, env.alternateModel),
              );
              if (env.mock) {
                const request = await fetchJson<{ body?: { model?: string } }>(
@@ -1630,24 +1613,23 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
                  "Switch models now. Tool continuity check: reread QA_KICKOFF_TASK.md and mention the handoff in one short sentence.",
                provider: alternate?.provider,
                model: alternate?.model,
-                timeoutMs: liveTurnTimeoutMs(env, 30_000),
+                timeoutMs: resolveQaLiveTurnTimeoutMs(env, 30_000, env.alternateModel),
              });
-              const outbound = await waitForCondition(
-                () => {
-                  const snapshot = state.getSnapshot();
-                  return snapshot.messages
-                    .slice(beforeSwitchCursor)
-                    .filter(
-                      (candidate) =>
-                        candidate.direction === "outbound" &&
-                        candidate.conversation.id === "qa-operator" &&
-                        (candidate.text.toLowerCase().includes("model switch") ||
-                          candidate.text.toLowerCase().includes("handoff")),
-                    )
-                    .at(-1);
-                },
-                liveTurnTimeoutMs(env, 30_000),
-              );
+              const outbound = await waitForCondition(() => {
+                const snapshot = state.getSnapshot();
+                return snapshot.messages
+                  .slice(beforeSwitchCursor)
+                  .filter(
+                    (candidate) =>
+                      candidate.direction === "outbound" &&
+                      candidate.conversation.id === "qa-operator" &&
+                      hasModelSwitchContinuityEvidence(candidate.text),
+                  )
+                  .at(-1);
+              }, 10_000);
+              if (!hasModelSwitchContinuityEvidence(outbound.text)) {
+                throw new Error(`switch reply missed kickoff continuity: ${outbound.text}`);
+              }
              if (env.mock) {
                const requests = await fetchJson<
                  Array<{ allInputText?: string; plannedToolName?: string; model?: string }>