fix: stabilize qa lab mock suite

Peter Steinberger
2026-04-24 02:46:25 +01:00
parent 2779020cbe
commit 903308dbf2
26 changed files with 302 additions and 104 deletions

View File

@@ -207,7 +207,7 @@ refs and write a judged Markdown report:
```bash
pnpm openclaw qa character-eval \
--model openai-codex/gpt-5.5,thinking=xhigh \
--model openai/gpt-5.4,thinking=medium,fast \
--model openai/gpt-5.2,thinking=xhigh \
--model openai/gpt-5,thinking=xhigh \
--model anthropic/claude-opus-4-6,thinking=high \
@@ -215,7 +215,7 @@ pnpm openclaw qa character-eval \
--model zai/glm-5.1,thinking=high \
--model moonshot/kimi-k2.5,thinking=high \
--model google/gemini-3.1-pro-preview,thinking=high \
--judge-model openai-codex/gpt-5.5,thinking=xhigh,fast \
--judge-model openai/gpt-5.4,thinking=xhigh,fast \
--judge-model anthropic/claude-opus-4-6,thinking=high \
--blind-judge-models \
--concurrency 16 \
@@ -227,13 +227,13 @@ scenarios should set the persona through `SOUL.md`, then run ordinary user turns
such as chat, workspace help, and small file tasks. The candidate model should
not be told that it is being evaluated. The command preserves each full
transcript, records basic run stats, then asks the judge models in fast mode with
`xhigh` reasoning to rank the runs by naturalness, vibe, and humor.
`xhigh` reasoning where supported to rank the runs by naturalness, vibe, and humor.
Use `--blind-judge-models` when comparing providers: the judge prompt still gets
every transcript and run status, but candidate refs are replaced with neutral
labels such as `candidate-01`; the report maps rankings back to real refs after
parsing.
Candidate runs default to `high` thinking, with `xhigh` for OpenAI models that
support it. Override a specific candidate inline with
Candidate runs default to `high` thinking, with `medium` for GPT-5.4 and `xhigh`
for older OpenAI eval refs that support it. Override a specific candidate inline with
`--model provider/model,thinking=<level>`. `--thinking <level>` still sets a
global fallback, and the older `--model-thinking <provider/model=level>` form is
kept for compatibility.
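For example, the three forms compose in one invocation (a minimal sketch; the model refs and levels here are illustrative, not recommendations):
```bash
# --thinking sets the global fallback; an inline thinking=<level> on a --model
# overrides it for that one candidate; --model-thinking is the legacy equivalent.
pnpm openclaw qa character-eval \
  --thinking high \
  --model openai/gpt-5.2,thinking=xhigh \
  --model anthropic/claude-opus-4-6 \
  --model-thinking anthropic/claude-opus-4-6=high
```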
@@ -247,12 +247,12 @@ Candidate and judge model runs both default to concurrency 16. Lower
`--concurrency` or `--judge-concurrency` when provider limits or local gateway
pressure make a run too noisy.
When no candidate `--model` is passed, the character eval defaults to
`openai-codex/gpt-5.5`, `openai/gpt-5.4`, `openai/gpt-5.2`, `anthropic/claude-opus-4-6`,
`openai/gpt-5.4`, `openai/gpt-5.2`, `openai/gpt-5`, `anthropic/claude-opus-4-6`,
`anthropic/claude-sonnet-4-6`, `zai/glm-5.1`,
`moonshot/kimi-k2.5`, and
`google/gemini-3.1-pro-preview`.
When no `--judge-model` is passed, the judges default to
`openai-codex/gpt-5.5,thinking=xhigh,fast` and
`openai/gpt-5.4,thinking=xhigh,fast` and
`anthropic/claude-opus-4-6,thinking=high`.
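As a sketch of tuning a noisy run (the flag values are illustrative):
```bash
# Pin a single judge and lower both concurrency pools when provider rate
# limits make the default fan-out unreliable.
pnpm openclaw qa character-eval \
  --judge-model anthropic/claude-opus-4-6,thinking=high \
  --concurrency 8 \
  --judge-concurrency 4
```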
## Related docs

View File

@@ -680,7 +680,7 @@ Docker notes:
`agent` method:
- load the bundled `codex` plugin
- select `OPENCLAW_AGENT_RUNTIME=codex`
- send a first gateway agent turn to `openai/gpt-5.5` with the Codex harness forced
- send a first gateway agent turn to `openai/gpt-5.4` with the Codex harness forced
- send a second turn to the same OpenClaw session and verify the app-server
thread can resume
- run `/codex status` and `/codex models` through the same gateway command
@@ -690,7 +690,7 @@ Docker notes:
denied so the agent asks back
- Test: `src/gateway/gateway-codex-harness.live.test.ts`
- Enable: `OPENCLAW_LIVE_CODEX_HARNESS=1`
- Default model: `openai/gpt-5.5`
- Default model: `openai/gpt-5.4`
- Optional image probe: `OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE=1`
- Optional MCP/tool probe: `OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE=1`
- Optional Guardian probe: `OPENCLAW_LIVE_CODEX_HARNESS_GUARDIAN_PROBE=1`
@@ -708,7 +708,7 @@ OPENCLAW_LIVE_CODEX_HARNESS=1 \
OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE=1 \
OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE=1 \
OPENCLAW_LIVE_CODEX_HARNESS_GUARDIAN_PROBE=1 \
OPENCLAW_LIVE_CODEX_HARNESS_MODEL=openai/gpt-5.5 \
OPENCLAW_LIVE_CODEX_HARNESS_MODEL=openai/gpt-5.4 \
pnpm test:live -- src/gateway/gateway-codex-harness.live.test.ts
```

View File

@@ -125,7 +125,7 @@ describe("runQaCharacterEval", () => {
expect.objectContaining({
judgeModel: "openai/gpt-5.4",
judgeThinkingDefault: "xhigh",
judgeFastMode: false,
judgeFastMode: true,
timeoutMs: 300_000,
}),
);
@@ -223,7 +223,7 @@ describe("runQaCharacterEval", () => {
expect(runSuite).toHaveBeenCalledTimes(8);
expect(runSuite.mock.calls.map(([params]) => params.primaryModel)).toEqual([
"openai/gpt-5.5",
"openai/gpt-5.4",
"openai/gpt-5.2",
"openai/gpt-5",
"anthropic/claude-opus-4-6",
@@ -233,7 +233,7 @@ describe("runQaCharacterEval", () => {
"google/gemini-3.1-pro-preview",
]);
expect(runSuite.mock.calls.map(([params]) => params.thinkingDefault)).toEqual([
"xhigh",
"medium",
"xhigh",
"xhigh",
"high",
@@ -254,14 +254,14 @@ describe("runQaCharacterEval", () => {
]);
expect(runJudge).toHaveBeenCalledTimes(2);
expect(runJudge.mock.calls.map(([params]) => params.judgeModel)).toEqual([
"openai/gpt-5.5",
"openai/gpt-5.4",
"anthropic/claude-opus-4-6",
]);
expect(runJudge.mock.calls.map(([params]) => params.judgeThinkingDefault)).toEqual([
"xhigh",
"high",
]);
expect(runJudge.mock.calls.map(([params]) => params.judgeFastMode)).toEqual([false, false]);
expect(runJudge.mock.calls.map(([params]) => params.judgeFastMode)).toEqual([true, false]);
});
it("runs candidate models with bounded concurrency while preserving result order", async () => {

View File

@@ -189,6 +189,7 @@ describe("qa cli runtime", () => {
primaryModel: "openai/gpt-5.4",
alternateModel: "anthropic/claude-sonnet-4-6",
fastMode: true,
thinking: "medium",
scenarioIds: ["approval-turn-tool-followthrough"],
});
@@ -200,6 +201,7 @@ describe("qa cli runtime", () => {
primaryModel: "openai/gpt-5.4",
alternateModel: "anthropic/claude-sonnet-4-6",
fastMode: true,
thinkingDefault: "medium",
scenarioIds: ["approval-turn-tool-followthrough"],
});
});
@@ -1135,8 +1137,8 @@ describe("qa cli runtime", () => {
repoRoot: path.resolve("/tmp/openclaw-repo"),
transportId: "qa-channel",
providerMode: "live-frontier",
primaryModel: "openai/gpt-5.5",
alternateModel: "openai/gpt-5.5",
primaryModel: "openai/gpt-5.4",
alternateModel: "openai/gpt-5.4",
fastMode: undefined,
message: "read qa kickoff and reply short",
timeoutMs: undefined,
@@ -1166,7 +1168,7 @@ describe("qa cli runtime", () => {
it("defaults manual frontier runs onto Codex OAuth when the runtime resolver prefers it", async () => {
defaultQaRuntimeModelForMode.mockImplementation((mode, options) =>
mode === "live-frontier"
? "openai/gpt-5.5"
? "openai/gpt-5.4"
: defaultQaProviderModelForMode(mode as QaProviderModeInput, options),
);
@@ -1179,8 +1181,8 @@ describe("qa cli runtime", () => {
repoRoot: path.resolve("/tmp/openclaw-repo"),
transportId: "qa-channel",
providerMode: "live-frontier",
primaryModel: "openai/gpt-5.5",
alternateModel: "openai/gpt-5.5",
primaryModel: "openai/gpt-5.4",
alternateModel: "openai/gpt-5.4",
fastMode: undefined,
message: "read qa kickoff and reply short",
timeoutMs: undefined,

View File

@@ -450,6 +450,7 @@ export async function runQaSuiteCommand(opts: {
primaryModel?: string;
alternateModel?: string;
fastMode?: boolean;
thinking?: string;
cliAuthMode?: string;
parityPack?: string;
scenarioIds?: string[];
@@ -490,6 +491,7 @@ export async function runQaSuiteCommand(opts: {
throw new Error("--cli-auth-mode requires --runner host.");
}
if (runner === "multipass") {
const thinkingDefault = parseQaThinkingLevel("--thinking", opts.thinking);
const result = await runQaMultipass({
repoRoot,
outputDir: resolveRepoRelativeOutputDir(repoRoot, opts.outputDir),
@@ -498,6 +500,7 @@ export async function runQaSuiteCommand(opts: {
primaryModel: opts.primaryModel,
alternateModel: opts.alternateModel,
fastMode: opts.fastMode,
...(thinkingDefault ? { thinkingDefault } : {}),
allowFailures: true,
scenarioIds,
...(opts.concurrency !== undefined
@@ -532,6 +535,7 @@ export async function runQaSuiteCommand(opts: {
});
return;
}
const thinkingDefault = parseQaThinkingLevel("--thinking", opts.thinking);
const result = await runQaSuiteFromRuntimeWithInfraRetry({
repoRoot,
outputDir: resolveRepoRelativeOutputDir(repoRoot, opts.outputDir),
@@ -540,6 +544,7 @@ export async function runQaSuiteCommand(opts: {
primaryModel: opts.primaryModel,
alternateModel: opts.alternateModel,
fastMode: opts.fastMode,
...(thinkingDefault ? { thinkingDefault } : {}),
...(claudeCliAuthMode ? { claudeCliAuthMode } : {}),
scenarioIds,
...(opts.concurrency !== undefined

View File

@@ -35,6 +35,7 @@ async function runQaSuite(opts: {
primaryModel?: string;
alternateModel?: string;
fastMode?: boolean;
thinking?: string;
allowFailures?: boolean;
cliAuthMode?: string;
parityPack?: string;
@@ -247,6 +248,10 @@ export function registerQaLabCli(program: Command) {
false,
)
.option("--fast", "Enable provider fast mode where supported", false)
.option(
"--thinking <level>",
"Suite thinking default: off|minimal|low|medium|high|xhigh|adaptive|max",
)
.option("--image <alias>", "Multipass image alias")
.option("--cpus <count>", "Multipass vCPU count", (value: string) => Number(value))
.option("--memory <size>", "Multipass memory size")
@@ -266,6 +271,7 @@ export function registerQaLabCli(program: Command) {
concurrency?: number;
allowFailures?: boolean;
fast?: boolean;
thinking?: string;
image?: string;
cpus?: number;
memory?: string;
@@ -281,6 +287,7 @@ export function registerQaLabCli(program: Command) {
primaryModel: opts.model,
alternateModel: opts.altModel,
fastMode: opts.fast,
thinking: opts.thinking,
cliAuthMode: opts.cliAuthMode,
parityPack: opts.parityPack,
scenarioIds: opts.scenario,

View File

@@ -2,7 +2,7 @@ import { describe, expect, it } from "vitest";
import { selectQaRunnerModelOptions } from "./model-catalog.runtime.js";
describe("qa runner model catalog", () => {
it("filters to available rows and prefers gpt-5.5 first", () => {
it("filters to available rows and prefers gpt-5.4 first", () => {
expect(
selectQaRunnerModelOptions([
{
@@ -13,8 +13,8 @@ describe("qa runner model catalog", () => {
missing: false,
},
{
key: "openai/gpt-5.5",
name: "gpt-5.5",
key: "openai/gpt-5.4",
name: "gpt-5.4",
input: "text,image",
available: true,
missing: false,
@@ -27,6 +27,6 @@ describe("qa runner model catalog", () => {
missing: false,
},
]).map((entry) => entry.key),
).toEqual(["openai/gpt-5.5", "anthropic/claude-sonnet-4-6"]);
).toEqual(["openai/gpt-5.4", "anthropic/claude-sonnet-4-6"]);
});
});

View File

@@ -34,7 +34,7 @@ describe("qa model selection runtime", () => {
resolveEnvApiKey.mockReturnValue({ apiKey: "sk-test" });
expect(resolveQaPreferredLiveModel()).toBeUndefined();
expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.5");
expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.4");
expect(loadAuthProfileStoreForRuntime).not.toHaveBeenCalled();
});
@@ -43,8 +43,8 @@ describe("qa model selection runtime", () => {
provider === "openai-codex" ? ["openai-codex:user@example.com"] : [],
);
expect(resolveQaPreferredLiveModel()).toBe("openai/gpt-5.5");
expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.5");
expect(resolveQaPreferredLiveModel()).toBe("openai/gpt-5.4");
expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.4");
});
it("keeps the OpenAI live default when stored OpenAI profiles are available", () => {
@@ -53,7 +53,7 @@ describe("qa model selection runtime", () => {
);
expect(resolveQaPreferredLiveModel()).toBeUndefined();
expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.5");
expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.4");
});
it("leaves mock defaults unchanged", () => {

View File

@@ -71,6 +71,7 @@ export type QaMultipassPlan = {
primaryModel?: string;
alternateModel?: string;
fastMode?: boolean;
thinkingDefault?: string;
scenarioIds: string[];
forwardedEnv: Record<string, string>;
hostCodexHomePath?: string;
@@ -237,6 +238,7 @@ export function createQaMultipassPlan(params: {
primaryModel?: string;
alternateModel?: string;
fastMode?: boolean;
thinkingDefault?: string;
allowFailures?: boolean;
scenarioIds?: string[];
concurrency?: number;
@@ -276,6 +278,7 @@ export function createQaMultipassPlan(params: {
...(params.primaryModel ? ["--model", params.primaryModel] : []),
...(params.alternateModel ? ["--alt-model", params.alternateModel] : []),
...(params.fastMode ? ["--fast"] : []),
...(params.thinkingDefault ? ["--thinking", params.thinkingDefault] : []),
...(params.allowFailures ? ["--allow-failures"] : []),
...(params.concurrency ? ["--concurrency", String(params.concurrency)] : []),
],
@@ -301,6 +304,7 @@ export function createQaMultipassPlan(params: {
primaryModel: params.primaryModel,
alternateModel: params.alternateModel,
fastMode: params.fastMode,
thinkingDefault: params.thinkingDefault,
scenarioIds,
forwardedEnv,
hostCodexHomePath,

View File

@@ -1,5 +1,5 @@
export const QA_FRONTIER_PROVIDER_IDS = ["anthropic", "google", "openai"] as const;
export const QA_FRONTIER_CATALOG_PRIMARY_MODEL = "openai/gpt-5.5";
export const QA_FRONTIER_CATALOG_PRIMARY_MODEL = "openai/gpt-5.4";
export const QA_FRONTIER_CATALOG_ALTERNATE_MODEL = "anthropic/claude-sonnet-4-6";
export function isPreferredQaLiveFrontierCatalogModel(modelRef: string) {

View File

@@ -6,7 +6,7 @@ type QaFrontierCharacterModelOptions = {
};
export const QA_FRONTIER_CHARACTER_EVAL_MODELS = Object.freeze([
"openai/gpt-5.5",
"openai/gpt-5.4",
"openai/gpt-5.2",
"openai/gpt-5",
"anthropic/claude-opus-4-6",
@@ -18,19 +18,19 @@ export const QA_FRONTIER_CHARACTER_EVAL_MODELS = Object.freeze([
export const QA_FRONTIER_CHARACTER_THINKING_BY_MODEL: Readonly<Record<string, QaThinkingLevel>> =
Object.freeze({
"openai/gpt-5.5": "xhigh",
"openai/gpt-5.4": "medium",
"openai/gpt-5.2": "xhigh",
"openai/gpt-5": "xhigh",
});
export const QA_FRONTIER_CHARACTER_JUDGE_MODELS = Object.freeze([
"openai/gpt-5.5",
"openai/gpt-5.4",
"anthropic/claude-opus-4-6",
]);
export const QA_FRONTIER_CHARACTER_JUDGE_MODEL_OPTIONS: Readonly<
Record<string, QaFrontierCharacterModelOptions>
> = Object.freeze({
"openai/gpt-5.5": { thinkingDefault: "xhigh" },
"openai/gpt-5.4": { thinkingDefault: "xhigh", fastMode: true },
"anthropic/claude-opus-4-6": { thinkingDefault: "high" },
});

View File

@@ -23,7 +23,7 @@ function isClaudeOpusModel(modelRef: string) {
export const liveFrontierProviderDefinition: QaProviderDefinition = {
mode: "live-frontier",
kind: "live",
defaultModel: (options) => options?.preferredLiveModel ?? "openai/gpt-5.5",
defaultModel: (options) => options?.preferredLiveModel ?? "openai/gpt-5.4",
defaultImageGenerationProviderIds: ["openai"],
defaultImageGenerationModel: ({ modelProviderIds }) =>
modelProviderIds.includes("openai") ? "openai/gpt-image-1" : null,

View File

@@ -4,7 +4,7 @@ import {
} from "openclaw/plugin-sdk/agent-runtime";
import { resolveEnvApiKey } from "openclaw/plugin-sdk/provider-auth";
const QA_CODEX_OAUTH_LIVE_MODEL = "openai/gpt-5.5";
const QA_CODEX_OAUTH_LIVE_MODEL = "openai/gpt-5.4";
export function resolveQaLiveFrontierPreferredModel() {
if (resolveEnvApiKey("openai")?.apiKey) {

View File

@@ -1,2 +1,2 @@
export const QA_FRONTIER_PARITY_CANDIDATE_LABEL = "openai/gpt-5.5";
export const QA_FRONTIER_PARITY_CANDIDATE_LABEL = "openai/gpt-5.4";
export const QA_FRONTIER_PARITY_BASELINE_LABEL = "anthropic/claude-opus-4-6";

View File

@@ -151,6 +151,10 @@ const QA_REASONING_ONLY_RETRY_NEEDLE =
"recorded reasoning but did not produce a user-visible answer";
const QA_EMPTY_RESPONSE_RETRY_NEEDLE =
"The previous attempt did not produce a user-visible answer.";
const QA_SKILL_WORKSHOP_GIF_PROMPT_RE =
/externally sourced animated GIF asset|animated GIF asset in a product UI/i;
const QA_SKILL_WORKSHOP_REVIEW_PROMPT_RE = /Review transcript for durable skill updates/i;
const QA_RELEASE_AUDIT_PROMPT_RE = /release readiness audit for the small project/i;
type MockScenarioState = {
subagentFanoutPhase: number;
@@ -727,6 +731,16 @@ function buildAssistantText(
if (/(image generation check|capability flip image check)/i.test(prompt) && mediaPath) {
return `Protocol note: generated the QA lighthouse image successfully.\nMEDIA:${mediaPath}`;
}
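// Skill workshop GIF prompt: once the checklist write tool has run, answer with the summary checklist.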
if (QA_SKILL_WORKSHOP_GIF_PROMPT_RE.test(prompt) && toolOutput) {
return [
"Animated GIF QA checklist ready.",
"- Confirm true animation, not a static preview.",
"- Verify dimensions and product UI fit.",
"- Record attribution and license.",
"- Keep a local copy before using the asset.",
"- Re-open the copied file for final verification.",
].join("\n");
}
if (/roundtrip image inspection check/i.test(prompt) && imageInputCount > 0) {
return "Protocol note: the generated attachment shows the same QA lighthouse scene from the previous step.";
}
@@ -808,6 +822,79 @@ function buildToolCallEvents(prompt: string): StreamEvent[] {
return buildToolCallEventsWithArgs("read", { path: targetPath });
}
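// Canned audit report: eight findings keyed to the fixture checklist, including one intentionally blocked item.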
function buildReleaseAuditJson() {
return `${JSON.stringify(
{
verified: true,
findings: [
{
id: "REL-GATEWAY-417",
source: "src/gateway/reconnect.ts",
status: "retry jitter verified, resume token fallback still needs manual spot check",
},
{
id: "REL-CHANNEL-238",
source: "src/channels/delivery.ts",
status: "thread replies preserve ordering, root-channel fallback needs handoff note",
},
{
id: "REL-CRON-904",
source: "src/scheduling/cron.ts",
status: "single-run lock verified for restart wakeups",
},
{
id: "REL-MEMORY-552",
source: "src/memory/recall.ts",
status:
"fallback summary survives empty memory search; ranking sample needs second reviewer",
},
{
id: "REL-PLUGIN-319",
source: "src/plugins/runtime.ts",
status: "bundled runtime manifest loads cleanly after restart",
},
{
id: "REL-INSTALL-846",
source: "install/update.ts",
status: "update smoke passed from previous stable tag",
},
{
id: "REL-DOCS-611",
source: "docs/operator-notes.md",
status:
"docs mention reconnect, cron, memory, plugin, and installer checks; channel ordering and UI notes need maintainer handoff",
},
{
id: "REL-UI-BLOCKED",
source: "ui/control-panel.ts",
status: "blocked: source file was referenced by checklist but missing from the fixture",
},
],
},
null,
2,
)}\n`;
}
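// Companion handoff note splitting the audit findings into ready and follow-up buckets.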
function buildReleaseHandoffMarkdown() {
return [
"# Release Handoff",
"",
"Ready:",
"- REL-GATEWAY-417: gateway reconnect handling checked in `src/gateway/reconnect.ts`.",
"- REL-CRON-904: cron duplicate prevention checked in `src/scheduling/cron.ts`.",
"- REL-PLUGIN-319: plugin runtime loading checked in `src/plugins/runtime.ts`.",
"- REL-INSTALL-846: installer update path checked in `install/update.ts`.",
"",
"Follow-up:",
"- REL-CHANNEL-238: channel delivery ordering needs maintainer handoff.",
"- REL-MEMORY-552: memory recall fallback ranking sample needs a second reviewer.",
"- REL-DOCS-611: docs update status needs channel ordering and UI notes.",
"- `ui/control-panel.ts` is blocked/not found in the fixture.",
"",
].join("\n");
}
function extractPlannedToolName(events: StreamEvent[]) {
for (const event of events) {
if (event.type !== "response.output_item.done") {
@@ -1128,6 +1215,63 @@ async function buildResponsesPayload(
},
]);
}
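// Skill workshop review prompt: emit a JSON create-skill proposal built from the transcript checklist.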
if (QA_SKILL_WORKSHOP_REVIEW_PROMPT_RE.test(allInputText)) {
return buildAssistantEvents(
JSON.stringify({
action: "create",
skillName: "animated-gif-workflow",
title: "Animated GIF Workflow",
reason: "Transcript captured a reusable animated media QA checklist.",
description: "Reusable workflow notes for animated GIF QA tasks.",
body: [
"- Confirm the asset has true animation, not a static preview.",
"- Check dimensions against the target product UI slot.",
"- Record attribution and license before using the file.",
"- Keep a local copy under the workspace before integration.",
"- Re-open the local copy for final verification.",
].join("\n"),
}),
);
}
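// First GIF turn (no tool output yet): plan a write call that drafts the checklist file.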
if (QA_SKILL_WORKSHOP_GIF_PROMPT_RE.test(prompt) && !toolOutput) {
return buildToolCallEventsWithArgs("write", {
path: "animated-gif-qa-checklist.md",
content: [
"# Animated GIF QA Checklist",
"",
"- Confirm true animation.",
"- Verify dimensions.",
"- Record attribution.",
"- Keep a local copy.",
"- Perform final verification.",
].join("\n"),
});
}
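// Release audit flow: step through the fixture reads and report writes, then emit the completion marker.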
if (QA_RELEASE_AUDIT_PROMPT_RE.test(prompt)) {
if (!toolOutput) {
return buildToolCallEventsWithArgs("read", { path: "audit-fixture/README.md" });
}
if (/Release readiness task|current checklist/i.test(toolOutput)) {
return buildToolCallEventsWithArgs("read", {
path: "audit-fixture/docs/current-readiness-checklist.md",
});
}
if (/Current release readiness requires checking eight areas/i.test(toolOutput)) {
return buildToolCallEventsWithArgs("write", {
path: "audit-fixture/release-audit.json",
content: buildReleaseAuditJson(),
});
}
if (/release-audit\.json/i.test(toolOutput)) {
return buildToolCallEventsWithArgs("write", {
path: "audit-fixture/release-handoff.md",
content: buildReleaseHandoffMarkdown(),
});
}
if (/release-handoff\.md/i.test(toolOutput)) {
return buildAssistantEvents("RELEASE-AUDIT-COMPLETE");
}
}
if (/lobster invaders/i.test(prompt)) {
if (!toolOutput) {
return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });

View File

@@ -45,8 +45,8 @@ describe("qa run config", () => {
it("creates a live-by-default selection that arms every scenario", () => {
expect(createDefaultQaRunSelection(scenarios)).toEqual({
providerMode: "live-frontier",
primaryModel: "openai/gpt-5.5",
alternateModel: "openai/gpt-5.5",
primaryModel: "openai/gpt-5.4",
alternateModel: "openai/gpt-5.4",
fastMode: true,
scenarioIds: ["dm-chat-baseline", "thread-lifecycle"],
});
@@ -57,7 +57,7 @@ describe("qa run config", () => {
normalizeQaRunSelection(
{
providerMode: "live-frontier",
primaryModel: "openai/gpt-5.5",
primaryModel: "openai/gpt-5.4",
alternateModel: "",
fastMode: false,
scenarioIds: ["thread-lifecycle", "missing", "thread-lifecycle"],
@@ -66,8 +66,8 @@ describe("qa run config", () => {
),
).toEqual({
providerMode: "live-frontier",
primaryModel: "openai/gpt-5.5",
alternateModel: "openai/gpt-5.5",
primaryModel: "openai/gpt-5.4",
alternateModel: "openai/gpt-5.4",
fastMode: true,
scenarioIds: ["thread-lifecycle"],
});
@@ -99,13 +99,13 @@ describe("qa run config", () => {
});
it("keeps idle snapshots on static defaults so startup does not inspect auth profiles", () => {
defaultQaRuntimeModelForMode.mockReturnValue("openai/gpt-5.5");
defaultQaRuntimeModelForMode.mockReturnValue("openai/gpt-5.4");
defaultQaRuntimeModelForMode.mockClear();
expect(createIdleQaRunnerSnapshot(scenarios).selection).toMatchObject({
providerMode: "live-frontier",
primaryModel: "openai/gpt-5.5",
alternateModel: "openai/gpt-5.5",
primaryModel: "openai/gpt-5.4",
alternateModel: "openai/gpt-5.4",
});
expect(defaultQaRuntimeModelForMode).not.toHaveBeenCalled();
});
@@ -138,14 +138,14 @@ describe("qa run config", () => {
it("prefers the Codex OAuth default when the runtime resolver says it is available", () => {
defaultQaRuntimeModelForMode.mockImplementation((mode, options) =>
mode === "live-frontier"
? "openai/gpt-5.5"
? "openai/gpt-5.4"
: defaultQaProviderModelForMode(mode as QaProviderModeInput, options),
);
expect(createDefaultQaRunSelection(scenarios)).toEqual({
providerMode: "live-frontier",
primaryModel: "openai/gpt-5.5",
alternateModel: "openai/gpt-5.5",
primaryModel: "openai/gpt-5.4",
alternateModel: "openai/gpt-5.4",
fastMode: true,
scenarioIds: ["dm-chat-baseline", "thread-lifecycle"],
});

View File

@@ -137,15 +137,15 @@ describe("qa scenario catalog", () => {
expect(scenario.sourcePath).toBe("qa/scenarios/models/gpt54-thinking-visibility-switch.md");
expect(config?.requiredLiveProvider).toBe("openai");
expect(config?.requiredLiveModel).toBe("gpt-5.5");
expect(config?.requiredLiveModel).toBe("gpt-5.4");
expect(config?.offDirective).toBe("/think off");
expect(config?.maxDirective).toBe("/think max");
expect(config?.maxDirective).toBe("/think medium");
expect(config?.reasoningDirective).toBe("/reasoning on");
expect(scenario.execution.flow?.steps.map((step) => step.name)).toEqual([
"enables reasoning display and disables thinking",
"switches to max thinking",
"verifies max thinking emits visible reasoning",
"verifies max thinking completes the answer",
"switches to medium thinking",
"verifies medium thinking emits visible reasoning",
"verifies medium thinking completes the answer",
]);
});
@@ -169,10 +169,10 @@ describe("qa scenario catalog", () => {
},
});
expect(config?.requiredProvider).toBe("openai");
expect(config?.requiredModel).toBe("gpt-5.5");
expect(config?.requiredModel).toBe("gpt-5.4");
expect(config?.expectedMarker).toBe("WEB-SEARCH-OK");
expect(scenario.execution.flow?.steps.map((step) => step.name)).toEqual([
"confirms live OpenAI GPT-5.5 web search auto mode",
"confirms live OpenAI GPT-5.4 web search auto mode",
"searches official OpenAI News through the live model",
]);
});
@@ -191,7 +191,7 @@ describe("qa scenario catalog", () => {
expect(scenario.sourcePath).toBe("qa/scenarios/models/thinking-slash-model-remap.md");
expect(config?.requiredProviderMode).toBe("live-frontier");
expect(config?.anthropicModelRef).toBe("anthropic/claude-sonnet-4-6");
expect(config?.openAiXhighModelRef).toBe("openai/gpt-5.5");
expect(config?.openAiXhighModelRef).toBe("openai/gpt-5.4");
expect(config?.noXhighModelRef).toBe("anthropic/claude-sonnet-4-6");
expect(scenario.execution.flow?.steps.map((step) => step.name)).toEqual([
"selects Anthropic and verifies adaptive options",

View File

@@ -250,4 +250,32 @@ describe("qa suite planning helpers", () => {
}).map((scenario) => scenario.id),
).toEqual(["generic", "claude-subscription"]);
});
it("filters provider-mode-specific scenarios from implicit suite selections", () => {
const scenarios = [
makeQaSuiteTestScenario("generic"),
makeQaSuiteTestScenario("live-only", {
config: { requiredProviderMode: "live-frontier" },
}),
makeQaSuiteTestScenario("mock-only", {
config: { requiredProviderMode: "mock-openai" },
}),
];
expect(
selectQaSuiteScenarios({
scenarios,
providerMode: "mock-openai",
primaryModel: "mock-openai/gpt-5.4",
}).map((scenario) => scenario.id),
).toEqual(["generic", "mock-only"]);
expect(
selectQaSuiteScenarios({
scenarios,
providerMode: "live-frontier",
primaryModel: "openai/gpt-5.4",
}).map((scenario) => scenario.id),
).toEqual(["generic", "live-only"]);
});
});

View File

@@ -33,11 +33,15 @@ function scenarioMatchesLiveLane(params: {
providerMode: QaProviderMode;
claudeCliAuthMode?: QaCliBackendAuthMode;
}) {
const config = params.scenario.execution.config ?? {};
const requiredProviderMode = normalizeQaConfigString(config.requiredProviderMode);
if (requiredProviderMode && params.providerMode !== requiredProviderMode) {
return false;
}
if (getQaProvider(params.providerMode).kind !== "live") {
return true;
}
const selected = splitModelRef(params.primaryModel);
const config = params.scenario.execution.config ?? {};
const requiredProvider = normalizeQaConfigString(config.requiredProvider);
if (requiredProvider && selected?.provider !== requiredProvider) {
return false;

View File

@@ -50,6 +50,9 @@ steps:
expr: "tools.has('image_generate')"
message: image_generate not present after imageGenerationModel patch
- call: reset
- set: generationStartedAt
value:
expr: Date.now()
- call: runAgentPrompt
args:
- ref: env
@@ -70,17 +73,18 @@ steps:
expr: "!env.mock || ((await fetchJson(`${env.mock.baseUrl}/debug/requests`)).find((request) => String(request.allInputText ?? '').includes(config.promptSnippet))?.plannedToolName === 'image_generate')"
message:
expr: "`expected image_generate, got ${String((await fetchJson(`${env.mock.baseUrl}/debug/requests`)).find((request) => String(request.allInputText ?? '').includes(config.promptSnippet))?.plannedToolName ?? '')}`"
- call: waitForCondition
saveAs: generated
- call: resolveGeneratedImagePath
saveAs: generatedPath
args:
- lambda:
async: true
expr: "!env.mock ? true : (await fetchJson(`${env.mock.baseUrl}/debug/image-generations`)).find((request) => request.model === 'gpt-image-1' && String(request.prompt ?? '').includes(config.generatedNeedle))"
- 15000
- 250
- env:
ref: env
promptSnippet:
expr: config.promptSnippet
startedAtMs:
ref: generationStartedAt
timeoutMs: 15000
- assert:
expr: "!env.mock || Boolean(generated)"
message:
expr: "`image provider was never invoked`"
detailsExpr: "env.mock ? `${outbound.text}\\nIMAGE_PROMPT:${generated.prompt ?? ''}` : outbound.text"
expr: "typeof generatedPath === 'string' && generatedPath.length > 0"
message: image generation did not produce a saved media path
detailsExpr: "`${outbound.text}\\nIMAGE_PATH:${generatedPath}`"
```

View File

@@ -24,10 +24,10 @@ codeRefs:
- extensions/qa-lab/src/suite.ts
execution:
kind: flow
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --scenario codex-harness-no-meta-leak`.
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario codex-harness-no-meta-leak`.
config:
requiredProvider: codex
requiredModel: gpt-5.5
requiredModel: gpt-5.4
harnessRuntime: codex
harnessFallback: none
expectedReply: QA_LEAK_OK
@@ -47,7 +47,7 @@ execution:
```yaml qa-flow
steps:
- name: confirms GPT-5.5 Codex harness target
- name: confirms GPT-5.4 Codex harness target
actions:
- set: selected
value:

View File

@@ -1,20 +1,20 @@
# GPT-5.5 thinking visibility switch
# GPT-5.4 thinking visibility switch
```yaml qa-scenario
id: gpt54-thinking-visibility-switch
title: GPT-5.5 thinking visibility switch
title: GPT-5.4 thinking visibility switch
surface: models
coverage:
primary:
- models.thinking
secondary:
- runtime.reasoning-visibility
objective: Verify GPT-5.5 can switch from disabled thinking to max thinking while reasoning display stays enabled.
objective: Verify GPT-5.4 can switch from disabled thinking to medium thinking while reasoning display stays enabled.
successCriteria:
- Live runs target openai/gpt-5.5, not a mini or pro variant.
- Live runs target openai/gpt-5.4, not a mini or pro variant.
- The session enables reasoning display before the comparison turns.
- The disabled-thinking turn returns its visible marker without a Reasoning-prefixed message.
- The max-thinking turn returns its visible marker and a separate Reasoning-prefixed message.
- The medium-thinking turn returns its visible marker and a separate Reasoning-prefixed message.
docsRefs:
- docs/tools/thinking.md
- docs/help/testing.md
@@ -27,12 +27,12 @@ codeRefs:
- extensions/qa-lab/src/providers/mock-openai/server.ts
execution:
kind: flow
summary: Toggle reasoning display and GPT-5.5 thinking between off/none and max/high, then verify visible reasoning only on the max turn.
summary: Toggle reasoning display and GPT-5.4 thinking between off/none and medium, then verify visible reasoning only on the medium turn.
config:
requiredLiveProvider: openai
requiredLiveModel: gpt-5.5
requiredLiveModel: gpt-5.4
offDirective: /think off
maxDirective: /think max
maxDirective: /think medium
reasoningDirective: /reasoning on
conversationId: qa-thinking-visibility
offPrompt: "QA thinking visibility check off: answer exactly THINKING-OFF-OK."
@@ -60,7 +60,7 @@ steps:
- assert:
expr: "env.providerMode !== 'live-frontier' || (selected?.provider === config.requiredLiveProvider && selected?.model === config.requiredLiveModel)"
message:
expr: "`expected live GPT-5.5, got ${env.primaryModel}`"
expr: "`expected live GPT-5.4, got ${env.primaryModel}`"
- call: state.addInboundMessage
args:
- conversation:
@@ -133,11 +133,11 @@ steps:
value:
expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.offPrompt))"
- assert:
expr: "String(offRequest?.model ?? '').includes('gpt-5.5')"
expr: "String(offRequest?.model ?? '').includes('gpt-5.4')"
message:
expr: "`expected GPT-5.5 off mock request, got ${String(offRequest?.model ?? '')}`"
expr: "`expected GPT-5.4 off mock request, got ${String(offRequest?.model ?? '')}`"
detailsExpr: "`off ack=${offAck.text}; off answer=${offAnswer.text}`"
- name: switches to max thinking
- name: switches to medium thinking
actions:
- call: state.addInboundMessage
args:
@@ -153,10 +153,10 @@ steps:
saveAs: maxAck
args:
- lambda:
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking level set to high/i.test(candidate.text)).at(-1)"
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking level set to medium/i.test(candidate.text)).at(-1)"
- expr: liveTurnTimeoutMs(env, 20000)
detailsExpr: "`max ack=${maxAck.text}`"
- name: verifies max thinking emits visible reasoning
- name: verifies medium thinking emits visible reasoning
actions:
- set: maxCursor
value:
@@ -182,7 +182,7 @@ steps:
message:
expr: "`missing max reasoning message near answer: ${recentOutboundSummary(state, 6)}`"
detailsExpr: "`reasoning=${maxReasoning.text}`"
- name: verifies max thinking completes the answer
- name: verifies medium thinking completes the answer
actions:
- call: waitForCondition
saveAs: maxAnswer
@@ -204,8 +204,8 @@ steps:
value:
expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.maxPrompt))"
- assert:
expr: "String(maxRequest?.model ?? '').includes('gpt-5.5')"
expr: "String(maxRequest?.model ?? '').includes('gpt-5.4')"
message:
expr: "`expected GPT-5.5 mock request, got ${String(maxRequest?.model ?? '')}`"
expr: "`expected GPT-5.4 mock request, got ${String(maxRequest?.model ?? '')}`"
detailsExpr: "`answer=${maxAnswer.text}`"
```

View File

@@ -12,7 +12,7 @@ coverage:
objective: Verify a live OpenAI GPT model can use OpenAI native web_search when OpenClaw web search is enabled in auto mode.
successCriteria:
- A live-frontier run fails fast unless the selected primary provider is openai.
- The selected primary model is GPT-5.5, not a mini or pro variant.
- The selected primary model is GPT-5.4, not a mini or pro variant.
- Web search is enabled without pinning a managed web_search provider.
- The live reply includes the required marker plus an official OpenAI News URL and headline found through web search.
gatewayConfigPatch:
@@ -32,10 +32,10 @@ codeRefs:
- extensions/qa-lab/src/suite.ts
execution:
kind: flow
summary: Run with `OPENCLAW_LIVE_OPENAI_KEY="${OPENAI_API_KEY}" pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --scenario openai-native-web-search-live`.
summary: Run with `OPENCLAW_LIVE_OPENAI_KEY="${OPENAI_API_KEY}" pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario openai-native-web-search-live`.
config:
requiredProvider: openai
requiredModel: gpt-5.5
requiredModel: gpt-5.4
expectedMarker: WEB-SEARCH-OK
failureMarker: WEB-SEARCH-FAILED
searchPrompt: |-
@@ -49,7 +49,7 @@ execution:
```yaml qa-flow
steps:
- name: confirms live OpenAI GPT-5.5 web search auto mode
- name: confirms live OpenAI GPT-5.4 web search auto mode
actions:
- call: waitForGatewayHealthy
args:

View File

@@ -13,8 +13,8 @@ coverage:
objective: Verify /think lists provider-owned levels and remaps stored thinking levels when /model changes provider capabilities.
successCriteria:
- Anthropic Claude Sonnet 4.6 advertises adaptive but not OpenAI-only xhigh or Opus max.
- A stored adaptive level remaps to medium when switching to OpenAI GPT-5.5.
- OpenAI GPT-5.5 advertises xhigh but not adaptive or max.
- A stored adaptive level remaps to medium when switching to OpenAI GPT-5.4.
- OpenAI GPT-5.4 advertises xhigh but not adaptive or max.
- A stored xhigh level remaps to high when switching to an Anthropic model without xhigh support.
docsRefs:
- docs/tools/thinking.md
@@ -33,7 +33,7 @@ execution:
config:
requiredProviderMode: live-frontier
anthropicModelRef: anthropic/claude-sonnet-4-6
openAiXhighModelRef: openai/gpt-5.5
openAiXhighModelRef: openai/gpt-5.4
noXhighModelRef: anthropic/claude-sonnet-4-6
conversationId: qa-thinking-slash-remap
```
@@ -165,7 +165,7 @@ steps:
- assert:
expr: "/Options: .*\\bxhigh\\b/i.test(openAiThinkStatus.text) && !/Options: .*\\badaptive\\b/i.test(openAiThinkStatus.text) && !/Options: .*\\bmax\\b/i.test(openAiThinkStatus.text)"
message:
expr: "`expected OpenAI GPT-5.5 /think options to include xhigh only, got ${openAiThinkStatus.text}`"
expr: "`expected OpenAI GPT-5.4 /think options to include xhigh only, got ${openAiThinkStatus.text}`"
detailsExpr: "`adaptive=${adaptiveAck.text}; switch=${openAiModelAck.text}; think=${openAiThinkStatus.text}`"
- name: maps xhigh to high on a model without xhigh
actions:

View File

@@ -11,7 +11,7 @@ coverage:
- models.codex-cli
objective: Verify the Codex app-server harness can plan and build a medium-complex self-contained browser game.
successCriteria:
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.5 with the Codex harness forced.
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.4 with the Codex harness forced.
- The scenario forces the Codex embedded harness and disables PI fallback.
- The prompt explicitly asks the agent to enter plan mode before editing.
- The agent writes a self-contained HTML game with a canvas loop, controls, scoring, waves, pause, and restart.
@@ -25,10 +25,10 @@ codeRefs:
- extensions/qa-lab/src/suite.ts
execution:
kind: flow
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --scenario medium-game-plan-codex-harness`.
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario medium-game-plan-codex-harness`.
config:
requiredProvider: codex
requiredModel: gpt-5.5
requiredModel: gpt-5.4
harnessRuntime: codex
harnessFallback: none
artifactFile: star-garden-defenders-codex.html
@@ -52,7 +52,7 @@ execution:
```yaml qa-flow
steps:
- name: confirms GPT-5.5 Codex harness target
- name: confirms GPT-5.4 Codex harness target
actions:
- set: selected
value:

View File

@@ -9,9 +9,9 @@ coverage:
- workspace.planning
secondary:
- agents.pi-harness
objective: Verify GPT-5.5 can use the PI harness to plan and build a medium-complex self-contained browser game.
objective: Verify GPT-5.4 can use the PI harness to plan and build a medium-complex self-contained browser game.
successCriteria:
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.5.
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.4.
- The scenario forces the embedded PI harness before the build turn.
- The prompt explicitly asks the agent to enter plan mode before editing.
- The agent writes a self-contained HTML game with a canvas loop, controls, scoring, waves, pause, and restart.
@@ -25,10 +25,10 @@ codeRefs:
- extensions/qa-lab/src/suite.ts
execution:
kind: flow
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --scenario medium-game-plan-pi-harness`.
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario medium-game-plan-pi-harness`.
config:
requiredProvider: openai
requiredModel: gpt-5.5
requiredModel: gpt-5.4
harnessRuntime: pi
harnessFallback: pi
artifactFile: star-garden-defenders-pi.html
@@ -52,7 +52,7 @@ execution:
```yaml qa-flow
steps:
- name: confirms GPT-5.5 PI harness target
- name: confirms GPT-5.4 PI harness target
actions:
- set: selected
value: