test: filter live qa scenario lanes

This commit is contained in:
Peter Steinberger
2026-04-21 12:43:30 +01:00
parent 7e4a5f8a6e
commit b835337cd6
7 changed files with 61 additions and 8 deletions

View File

@@ -26,6 +26,14 @@ describe("qa model-switch evaluation", () => {
).toBe(true);
});
// The reply below says "handed off" (two words) and mentions the QA scenario
// pack; the evaluator is expected to report continuity evidence for it.
it("accepts concise handed-off phrasing from live models", () => {
  const liveModelReply =
    "The harness has handed off to the alternate model for this turn, and the read tool confirms continued access to the QA scenario pack mission.";
  const hasEvidence = hasModelSwitchContinuityEvidence(liveModelReply);
  expect(hasEvidence).toBe(true);
});
it("accepts concise paraphrases of the kickoff task after a handoff", () => {
expect(
hasModelSwitchContinuityEvidence(

View File

@@ -3,7 +3,11 @@ import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtim
export function hasModelSwitchContinuityEvidence(text: string) {
const lower = normalizeLowercaseStringOrEmpty(text);
const mentionsHandoff =
lower.includes("handoff") || lower.includes("model switch") || lower.includes("switched");
lower.includes("handoff") ||
lower.includes("handed off") ||
lower.includes("handed-off") ||
lower.includes("model switch") ||
lower.includes("switched");
const mentionsKickoffTask =
lower.includes("qa_kickoff_task") ||
lower.includes("qa/scenarios/index.md") ||

View File

@@ -127,8 +127,8 @@ describe("qa scenario catalog", () => {
const scenario = readQaScenarioById("gpt54-thinking-visibility-switch");
const config = readQaScenarioExecutionConfig("gpt54-thinking-visibility-switch") as
| {
requiredLiveProvider?: string;
requiredLiveModel?: string;
requiredProvider?: string;
requiredModel?: string;
offDirective?: string;
maxDirective?: string;
reasoningDirective?: string;
@@ -136,8 +136,8 @@ describe("qa scenario catalog", () => {
| undefined;
expect(scenario.sourcePath).toBe("qa/scenarios/models/gpt54-thinking-visibility-switch.md");
expect(config?.requiredLiveProvider).toBe("openai");
expect(config?.requiredLiveModel).toBe("gpt-5.4");
expect(config?.requiredProvider).toBe("openai");
expect(config?.requiredModel).toBe("gpt-5.4");
expect(config?.offDirective).toBe("/think off");
expect(config?.maxDirective).toBe("/think max");
expect(config?.reasoningDirective).toBe("/reasoning on");

View File

@@ -250,4 +250,38 @@ describe("qa suite planning helpers", () => {
}).map((scenario) => scenario.id),
).toEqual(["generic", "claude-subscription"]);
});
// Runs the selection with OPENCLAW_LIVE_SETUP_TOKEN_VALUE unset and expects the
// scenario that declares it as `requiredEnv` to be left out, while the generic
// and API-key scenarios are kept. The variable's prior state is restored after.
it("filters env-gated scenarios from an implicit live lane", () => {
  const tokenEnvName = "OPENCLAW_LIVE_SETUP_TOKEN_VALUE";
  const savedToken = process.env[tokenEnvName];
  delete process.env[tokenEnvName];
  try {
    const genericScenario = makeQaSuiteTestScenario("generic");
    const apiKeyScenario = makeQaSuiteTestScenario("anthropic-api-key", {
      config: { requiredProvider: "anthropic", requiredModel: "claude-opus-4-6" },
    });
    const setupTokenScenario = makeQaSuiteTestScenario("anthropic-setup-token", {
      config: {
        requiredProvider: "anthropic",
        requiredModel: "claude-opus-4-6",
        requiredEnv: tokenEnvName,
      },
    });

    const selected = selectQaSuiteScenarios({
      scenarios: [genericScenario, apiKeyScenario, setupTokenScenario],
      providerMode: "live-frontier",
      primaryModel: "anthropic/claude-opus-4-6",
    });
    const selectedIds = selected.map((entry) => entry.id);

    expect(selectedIds).toEqual(["generic", "anthropic-api-key"]);
  } finally {
    // Put the variable back exactly as it was: re-set it if it had a value,
    // otherwise make sure it stays unset.
    if (savedToken !== undefined) {
      process.env[tokenEnvName] = savedToken;
    } else {
      delete process.env[tokenEnvName];
    }
  }
});
});

View File

@@ -32,10 +32,12 @@ function scenarioMatchesLiveLane(params: {
primaryModel: string;
providerMode: QaProviderMode;
claudeCliAuthMode?: QaCliBackendAuthMode;
env?: NodeJS.ProcessEnv;
}) {
if (getQaProvider(params.providerMode).kind !== "live") {
return true;
}
const env = params.env ?? process.env;
const selected = splitModelRef(params.primaryModel);
const config = params.scenario.execution.config ?? {};
const requiredProvider = normalizeQaConfigString(config.requiredProvider);
@@ -50,6 +52,10 @@ function scenarioMatchesLiveLane(params: {
if (requiredAuthMode && params.claudeCliAuthMode !== requiredAuthMode) {
return false;
}
const requiredEnv = normalizeQaConfigString(config.requiredEnv);
if (requiredEnv && !env[requiredEnv]?.trim()) {
return false;
}
return true;
}

View File

@@ -28,6 +28,7 @@ execution:
config:
requiredProvider: anthropic
requiredModel: claude-opus-4-6
requiredEnv: OPENCLAW_LIVE_SETUP_TOKEN_VALUE
profileId: "anthropic:qa-setup-token"
chatPrompt: "Anthropic Opus setup-token smoke. Reply exactly: ANTHROPIC-OPUS-SETUP-TOKEN-OK"
chatExpected: ANTHROPIC-OPUS-SETUP-TOKEN-OK

View File

@@ -29,8 +29,8 @@ execution:
kind: flow
summary: Toggle reasoning display and GPT-5.4 thinking between off/none and max/high, then verify visible reasoning only on the max turn.
config:
requiredLiveProvider: openai
requiredLiveModel: gpt-5.4
requiredProvider: openai
requiredModel: gpt-5.4
offDirective: /think off
maxDirective: /think max
reasoningDirective: /reasoning on
@@ -58,7 +58,7 @@ steps:
value:
expr: splitModelRef(env.primaryModel)
- assert:
expr: "env.providerMode !== 'live-frontier' || (selected?.provider === config.requiredLiveProvider && selected?.model === config.requiredLiveModel)"
expr: "env.providerMode !== 'live-frontier' || (selected?.provider === config.requiredProvider && selected?.model === config.requiredModel)"
message:
expr: "`expected live GPT-5.4, got ${env.primaryModel}`"
- call: state.addInboundMessage