diff --git a/extensions/qa-lab/src/model-switch-eval.test.ts b/extensions/qa-lab/src/model-switch-eval.test.ts index 26dbe14c194..c21422560cb 100644 --- a/extensions/qa-lab/src/model-switch-eval.test.ts +++ b/extensions/qa-lab/src/model-switch-eval.test.ts @@ -26,6 +26,14 @@ describe("qa model-switch evaluation", () => { ).toBe(true); }); + it("accepts concise handed-off phrasing from live models", () => { + expect( + hasModelSwitchContinuityEvidence( + "The harness has handed off to the alternate model for this turn, and the read tool confirms continued access to the QA scenario pack mission.", + ), + ).toBe(true); + }); + it("accepts concise paraphrases of the kickoff task after a handoff", () => { expect( hasModelSwitchContinuityEvidence( diff --git a/extensions/qa-lab/src/model-switch-eval.ts b/extensions/qa-lab/src/model-switch-eval.ts index 4b6f0f35b63..6a195eba3e3 100644 --- a/extensions/qa-lab/src/model-switch-eval.ts +++ b/extensions/qa-lab/src/model-switch-eval.ts @@ -3,7 +3,11 @@ import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtim export function hasModelSwitchContinuityEvidence(text: string) { const lower = normalizeLowercaseStringOrEmpty(text); const mentionsHandoff = - lower.includes("handoff") || lower.includes("model switch") || lower.includes("switched"); + lower.includes("handoff") || + lower.includes("handed off") || + lower.includes("handed-off") || + lower.includes("model switch") || + lower.includes("switched"); const mentionsKickoffTask = lower.includes("qa_kickoff_task") || lower.includes("qa/scenarios/index.md") || diff --git a/extensions/qa-lab/src/scenario-catalog.test.ts b/extensions/qa-lab/src/scenario-catalog.test.ts index c38087d9fbb..244ce045275 100644 --- a/extensions/qa-lab/src/scenario-catalog.test.ts +++ b/extensions/qa-lab/src/scenario-catalog.test.ts @@ -127,8 +127,8 @@ describe("qa scenario catalog", () => { const scenario = readQaScenarioById("gpt54-thinking-visibility-switch"); const config = readQaScenarioExecutionConfig("gpt54-thinking-visibility-switch") as | { - requiredLiveProvider?: string; - requiredLiveModel?: string; + requiredProvider?: string; + requiredModel?: string; offDirective?: string; maxDirective?: string; reasoningDirective?: string; @@ -136,8 +136,8 @@ describe("qa scenario catalog", () => { | undefined; expect(scenario.sourcePath).toBe("qa/scenarios/models/gpt54-thinking-visibility-switch.md"); - expect(config?.requiredLiveProvider).toBe("openai"); - expect(config?.requiredLiveModel).toBe("gpt-5.4"); + expect(config?.requiredProvider).toBe("openai"); + expect(config?.requiredModel).toBe("gpt-5.4"); expect(config?.offDirective).toBe("/think off"); expect(config?.maxDirective).toBe("/think max"); expect(config?.reasoningDirective).toBe("/reasoning on"); diff --git a/extensions/qa-lab/src/suite-planning.test.ts b/extensions/qa-lab/src/suite-planning.test.ts index 15ca4a1e7a6..5b7742c9f3d 100644 --- a/extensions/qa-lab/src/suite-planning.test.ts +++ b/extensions/qa-lab/src/suite-planning.test.ts @@ -250,4 +250,38 @@ describe("qa suite planning helpers", () => { }).map((scenario) => scenario.id), ).toEqual(["generic", "claude-subscription"]); }); + + it("filters env-gated scenarios from an implicit live lane", () => { + const previous = process.env.OPENCLAW_LIVE_SETUP_TOKEN_VALUE; + delete process.env.OPENCLAW_LIVE_SETUP_TOKEN_VALUE; + try { + const scenarios = [ + makeQaSuiteTestScenario("generic"), + makeQaSuiteTestScenario("anthropic-api-key", { + config: { requiredProvider: "anthropic", requiredModel: "claude-opus-4-6" }, + }), + makeQaSuiteTestScenario("anthropic-setup-token", { + config: { + requiredProvider: "anthropic", + requiredModel: "claude-opus-4-6", + requiredEnv: "OPENCLAW_LIVE_SETUP_TOKEN_VALUE", + }, + }), + ]; + + expect( + selectQaSuiteScenarios({ + scenarios, + providerMode: "live-frontier", + primaryModel: "anthropic/claude-opus-4-6", + }).map((scenario) => scenario.id), + ).toEqual(["generic", "anthropic-api-key"]); + } finally { + if (previous === undefined) { + delete process.env.OPENCLAW_LIVE_SETUP_TOKEN_VALUE; + } else { + process.env.OPENCLAW_LIVE_SETUP_TOKEN_VALUE = previous; + } + } + }); }); diff --git a/extensions/qa-lab/src/suite-planning.ts b/extensions/qa-lab/src/suite-planning.ts index d130a36d170..dba1ce61403 100644 --- a/extensions/qa-lab/src/suite-planning.ts +++ b/extensions/qa-lab/src/suite-planning.ts @@ -32,10 +32,12 @@ function scenarioMatchesLiveLane(params: { primaryModel: string; providerMode: QaProviderMode; claudeCliAuthMode?: QaCliBackendAuthMode; + env?: NodeJS.ProcessEnv; }) { if (getQaProvider(params.providerMode).kind !== "live") { return true; } + const env = params.env ?? process.env; const selected = splitModelRef(params.primaryModel); const config = params.scenario.execution.config ?? {}; const requiredProvider = normalizeQaConfigString(config.requiredProvider); @@ -50,6 +52,10 @@ function scenarioMatchesLiveLane(params: { if (requiredAuthMode && params.claudeCliAuthMode !== requiredAuthMode) { return false; } + const requiredEnv = normalizeQaConfigString(config.requiredEnv); + if (requiredEnv && !env[requiredEnv]?.trim()) { + return false; + } return true; } diff --git a/qa/scenarios/models/anthropic-opus-setup-token-smoke.md b/qa/scenarios/models/anthropic-opus-setup-token-smoke.md index 231403d1e7c..7ea97dc9f08 100644 --- a/qa/scenarios/models/anthropic-opus-setup-token-smoke.md +++ b/qa/scenarios/models/anthropic-opus-setup-token-smoke.md @@ -28,6 +28,7 @@ execution: config: requiredProvider: anthropic requiredModel: claude-opus-4-6 + requiredEnv: OPENCLAW_LIVE_SETUP_TOKEN_VALUE profileId: "anthropic:qa-setup-token" chatPrompt: "Anthropic Opus setup-token smoke. Reply exactly: ANTHROPIC-OPUS-SETUP-TOKEN-OK" chatExpected: ANTHROPIC-OPUS-SETUP-TOKEN-OK diff --git a/qa/scenarios/models/gpt54-thinking-visibility-switch.md b/qa/scenarios/models/gpt54-thinking-visibility-switch.md index 243227853c2..7215ec52a05 100644 --- a/qa/scenarios/models/gpt54-thinking-visibility-switch.md +++ b/qa/scenarios/models/gpt54-thinking-visibility-switch.md @@ -29,8 +29,8 @@ execution: kind: flow summary: Toggle reasoning display and GPT-5.4 thinking between off/none and max/high, then verify visible reasoning only on the max turn. config: - requiredLiveProvider: openai - requiredLiveModel: gpt-5.4 + requiredProvider: openai + requiredModel: gpt-5.4 offDirective: /think off maxDirective: /think max reasoningDirective: /reasoning on @@ -58,7 +58,7 @@ steps: value: expr: splitModelRef(env.primaryModel) - assert: - expr: "env.providerMode !== 'live-frontier' || (selected?.provider === config.requiredLiveProvider && selected?.model === config.requiredLiveModel)" + expr: "env.providerMode !== 'live-frontier' || (selected?.provider === config.requiredProvider && selected?.model === config.requiredModel)" message: expr: "`expected live GPT-5.4, got ${env.primaryModel}`" - call: state.addInboundMessage