test: filter live qa scenario lanes

2026-05-06 11:50:43 +00:00 · 2026-04-21 12:43:30 +01:00
parent 7e4a5f8a6e
commit b835337cd6
7 changed files with 61 additions and 8 deletions
--- a/extensions/qa-lab/src/model-switch-eval.test.ts
+++ b/extensions/qa-lab/src/model-switch-eval.test.ts
@@ -26,6 +26,14 @@ describe("qa model-switch evaluation", () => {
    ).toBe(true);
  });

+  it("accepts concise handed-off phrasing from live models", () => {
+    expect(
+      hasModelSwitchContinuityEvidence(
+        "The harness has handed off to the alternate model for this turn, and the read tool confirms continued access to the QA scenario pack mission.",
+      ),
+    ).toBe(true);
+  });
+
  it("accepts concise paraphrases of the kickoff task after a handoff", () => {
    expect(
      hasModelSwitchContinuityEvidence(
--- a/extensions/qa-lab/src/model-switch-eval.ts
+++ b/extensions/qa-lab/src/model-switch-eval.ts
@@ -3,7 +3,11 @@ import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtim
 export function hasModelSwitchContinuityEvidence(text: string) {
  const lower = normalizeLowercaseStringOrEmpty(text);
  const mentionsHandoff =
-    lower.includes("handoff") || lower.includes("model switch") || lower.includes("switched");
+    lower.includes("handoff") ||
+    lower.includes("handed off") ||
+    lower.includes("handed-off") ||
+    lower.includes("model switch") ||
+    lower.includes("switched");
  const mentionsKickoffTask =
    lower.includes("qa_kickoff_task") ||
    lower.includes("qa/scenarios/index.md") ||
--- a/extensions/qa-lab/src/scenario-catalog.test.ts
+++ b/extensions/qa-lab/src/scenario-catalog.test.ts
@@ -127,8 +127,8 @@ describe("qa scenario catalog", () => {
    const scenario = readQaScenarioById("gpt54-thinking-visibility-switch");
    const config = readQaScenarioExecutionConfig("gpt54-thinking-visibility-switch") as
      | {
-          requiredLiveProvider?: string;
-          requiredLiveModel?: string;
+          requiredProvider?: string;
+          requiredModel?: string;
          offDirective?: string;
          maxDirective?: string;
          reasoningDirective?: string;
@@ -136,8 +136,8 @@ describe("qa scenario catalog", () => {
      | undefined;

    expect(scenario.sourcePath).toBe("qa/scenarios/models/gpt54-thinking-visibility-switch.md");
-    expect(config?.requiredLiveProvider).toBe("openai");
-    expect(config?.requiredLiveModel).toBe("gpt-5.4");
+    expect(config?.requiredProvider).toBe("openai");
+    expect(config?.requiredModel).toBe("gpt-5.4");
    expect(config?.offDirective).toBe("/think off");
    expect(config?.maxDirective).toBe("/think max");
    expect(config?.reasoningDirective).toBe("/reasoning on");
--- a/extensions/qa-lab/src/suite-planning.test.ts
+++ b/extensions/qa-lab/src/suite-planning.test.ts
@@ -250,4 +250,38 @@ describe("qa suite planning helpers", () => {
      }).map((scenario) => scenario.id),
    ).toEqual(["generic", "claude-subscription"]);
  });
+
+  it("filters env-gated scenarios from an implicit live lane", () => {
+    const previous = process.env.OPENCLAW_LIVE_SETUP_TOKEN_VALUE;
+    delete process.env.OPENCLAW_LIVE_SETUP_TOKEN_VALUE;
+    try {
+      const scenarios = [
+        makeQaSuiteTestScenario("generic"),
+        makeQaSuiteTestScenario("anthropic-api-key", {
+          config: { requiredProvider: "anthropic", requiredModel: "claude-opus-4-6" },
+        }),
+        makeQaSuiteTestScenario("anthropic-setup-token", {
+          config: {
+            requiredProvider: "anthropic",
+            requiredModel: "claude-opus-4-6",
+            requiredEnv: "OPENCLAW_LIVE_SETUP_TOKEN_VALUE",
+          },
+        }),
+      ];
+
+      expect(
+        selectQaSuiteScenarios({
+          scenarios,
+          providerMode: "live-frontier",
+          primaryModel: "anthropic/claude-opus-4-6",
+        }).map((scenario) => scenario.id),
+      ).toEqual(["generic", "anthropic-api-key"]);
+    } finally {
+      if (previous === undefined) {
+        delete process.env.OPENCLAW_LIVE_SETUP_TOKEN_VALUE;
+      } else {
+        process.env.OPENCLAW_LIVE_SETUP_TOKEN_VALUE = previous;
+      }
+    }
+  });
 });
--- a/extensions/qa-lab/src/suite-planning.ts
+++ b/extensions/qa-lab/src/suite-planning.ts
@@ -32,10 +32,12 @@ function scenarioMatchesLiveLane(params: {
  primaryModel: string;
  providerMode: QaProviderMode;
  claudeCliAuthMode?: QaCliBackendAuthMode;
+  env?: NodeJS.ProcessEnv;
 }) {
  if (getQaProvider(params.providerMode).kind !== "live") {
    return true;
  }
+  const env = params.env ?? process.env;
  const selected = splitModelRef(params.primaryModel);
  const config = params.scenario.execution.config ?? {};
  const requiredProvider = normalizeQaConfigString(config.requiredProvider);
@@ -50,6 +52,10 @@ function scenarioMatchesLiveLane(params: {
  if (requiredAuthMode && params.claudeCliAuthMode !== requiredAuthMode) {
    return false;
  }
+  const requiredEnv = normalizeQaConfigString(config.requiredEnv);
+  if (requiredEnv && !env[requiredEnv]?.trim()) {
+    return false;
+  }
  return true;
 }

--- a/qa/scenarios/models/anthropic-opus-setup-token-smoke.md
+++ b/qa/scenarios/models/anthropic-opus-setup-token-smoke.md
@@ -28,6 +28,7 @@ execution:
  config:
    requiredProvider: anthropic
    requiredModel: claude-opus-4-6
+    requiredEnv: OPENCLAW_LIVE_SETUP_TOKEN_VALUE
    profileId: "anthropic:qa-setup-token"
    chatPrompt: "Anthropic Opus setup-token smoke. Reply exactly: ANTHROPIC-OPUS-SETUP-TOKEN-OK"
    chatExpected: ANTHROPIC-OPUS-SETUP-TOKEN-OK
--- a/qa/scenarios/models/gpt54-thinking-visibility-switch.md
+++ b/qa/scenarios/models/gpt54-thinking-visibility-switch.md
@@ -29,8 +29,8 @@ execution:
  kind: flow
  summary: Toggle reasoning display and GPT-5.4 thinking between off/none and max/high, then verify visible reasoning only on the max turn.
  config:
-    requiredLiveProvider: openai
-    requiredLiveModel: gpt-5.4
+    requiredProvider: openai
+    requiredModel: gpt-5.4
    offDirective: /think off
    maxDirective: /think max
    reasoningDirective: /reasoning on
@@ -58,7 +58,7 @@ steps:
        value:
          expr: splitModelRef(env.primaryModel)
      - assert:
-          expr: "env.providerMode !== 'live-frontier' || (selected?.provider === config.requiredLiveProvider && selected?.model === config.requiredLiveModel)"
+          expr: "env.providerMode !== 'live-frontier' || (selected?.provider === config.requiredProvider && selected?.model === config.requiredModel)"
          message:
            expr: "`expected live GPT-5.4, got ${env.primaryModel}`"
      - call: state.addInboundMessage