fix: stabilize qa lab mock suite

Peter Steinberger
2026-04-24 02:46:25 +01:00
parent 2779020cbe
commit 903308dbf2
26 changed files with 302 additions and 104 deletions

View File

@@ -207,7 +207,7 @@ refs and write a judged Markdown report:
```bash
pnpm openclaw qa character-eval \
--model openai-codex/gpt-5.5,thinking=xhigh \
--model openai/gpt-5.4,thinking=medium,fast \
--model openai/gpt-5.2,thinking=xhigh \
--model openai/gpt-5,thinking=xhigh \
--model anthropic/claude-opus-4-6,thinking=high \
@@ -215,7 +215,7 @@ pnpm openclaw qa character-eval \
--model zai/glm-5.1,thinking=high \
--model moonshot/kimi-k2.5,thinking=high \
--model google/gemini-3.1-pro-preview,thinking=high \
--judge-model openai-codex/gpt-5.5,thinking=xhigh,fast \
--judge-model openai/gpt-5.4,thinking=xhigh,fast \
--judge-model anthropic/claude-opus-4-6,thinking=high \
--blind-judge-models \
--concurrency 16 \
@@ -227,13 +227,13 @@ scenarios should set the persona through `SOUL.md`, then run ordinary user turns
such as chat, workspace help, and small file tasks. The candidate model should
not be told that it is being evaluated. The command preserves each full
transcript, records basic run stats, then asks the judge models in fast mode with
`xhigh` reasoning to rank the runs by naturalness, vibe, and humor.
`xhigh` reasoning where supported to rank the runs by naturalness, vibe, and humor.
Use `--blind-judge-models` when comparing providers: the judge prompt still gets
every transcript and run status, but candidate refs are replaced with neutral
labels such as `candidate-01`; the report maps rankings back to real refs after
parsing.
Candidate runs default to `high` thinking, with `xhigh` for OpenAI models that
support it. Override a specific candidate inline with
Candidate runs default to `high` thinking, with `medium` for GPT-5.4 and `xhigh`
for older OpenAI eval refs that support it. Override a specific candidate inline with
`--model provider/model,thinking=<level>`. `--thinking <level>` still sets a
global fallback, and the older `--model-thinking <provider/model=level>` form is
kept for compatibility.
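For example, the three forms compose in one invocation (a minimal sketch; the model refs and levels here are illustrative, not recommendations):
```bash
# --thinking sets the global fallback; an inline thinking=<level> on a --model
# overrides it for that one candidate; --model-thinking is the legacy equivalent.
pnpm openclaw qa character-eval \
  --thinking high \
  --model openai/gpt-5.2,thinking=xhigh \
  --model anthropic/claude-opus-4-6 \
  --model-thinking anthropic/claude-opus-4-6=high
```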
@@ -247,12 +247,12 @@ Candidate and judge model runs both default to concurrency 16. Lower
`--concurrency` or `--judge-concurrency` when provider limits or local gateway
pressure make a run too noisy.
When no candidate `--model` is passed, the character eval defaults to
`openai-codex/gpt-5.5`, `openai/gpt-5.4`, `openai/gpt-5.2`, `anthropic/claude-opus-4-6`,
`openai/gpt-5.4`, `openai/gpt-5.2`, `openai/gpt-5`, `anthropic/claude-opus-4-6`,
`anthropic/claude-sonnet-4-6`, `zai/glm-5.1`,
`moonshot/kimi-k2.5`, and
`google/gemini-3.1-pro-preview`.
When no `--judge-model` is passed, the judges default to
`openai-codex/gpt-5.5,thinking=xhigh,fast` and
`openai/gpt-5.4,thinking=xhigh,fast` and
`anthropic/claude-opus-4-6,thinking=high`.
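As a sketch of tuning a noisy run (the flag values are illustrative):
```bash
# Pin a single judge and lower both concurrency pools when provider rate
# limits make the default fan-out unreliable.
pnpm openclaw qa character-eval \
  --judge-model anthropic/claude-opus-4-6,thinking=high \
  --concurrency 8 \
  --judge-concurrency 4
```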
## Related docs

View File

@@ -680,7 +680,7 @@ Docker notes:
`agent` method:
- load the bundled `codex` plugin
- select `OPENCLAW_AGENT_RUNTIME=codex`
- send a first gateway agent turn to `openai/gpt-5.5` with the Codex harness forced
- send a first gateway agent turn to `openai/gpt-5.4` with the Codex harness forced
- send a second turn to the same OpenClaw session and verify the app-server
thread can resume
- run `/codex status` and `/codex models` through the same gateway command
@@ -690,7 +690,7 @@ Docker notes:
denied so the agent asks back
- Test: `src/gateway/gateway-codex-harness.live.test.ts`
- Enable: `OPENCLAW_LIVE_CODEX_HARNESS=1`
- Default model: `openai/gpt-5.5`
- Default model: `openai/gpt-5.4`
- Optional image probe: `OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE=1`
- Optional MCP/tool probe: `OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE=1`
- Optional Guardian probe: `OPENCLAW_LIVE_CODEX_HARNESS_GUARDIAN_PROBE=1`
@@ -708,7 +708,7 @@ OPENCLAW_LIVE_CODEX_HARNESS=1 \
OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE=1 \
OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE=1 \
OPENCLAW_LIVE_CODEX_HARNESS_GUARDIAN_PROBE=1 \
OPENCLAW_LIVE_CODEX_HARNESS_MODEL=openai/gpt-5.5 \
OPENCLAW_LIVE_CODEX_HARNESS_MODEL=openai/gpt-5.4 \
pnpm test:live -- src/gateway/gateway-codex-harness.live.test.ts
```

View File

@@ -125,7 +125,7 @@ describe("runQaCharacterEval", () => {
expect.objectContaining({
judgeModel: "openai/gpt-5.4",
judgeThinkingDefault: "xhigh",
judgeFastMode: false,
judgeFastMode: true,
timeoutMs: 300_000,
}),
);
@@ -223,7 +223,7 @@ describe("runQaCharacterEval", () => {
expect(runSuite).toHaveBeenCalledTimes(8);
expect(runSuite.mock.calls.map(([params]) => params.primaryModel)).toEqual([
"openai/gpt-5.5",
"openai/gpt-5.4",
"openai/gpt-5.2",
"openai/gpt-5",
"anthropic/claude-opus-4-6",
@@ -233,7 +233,7 @@ describe("runQaCharacterEval", () => {
"google/gemini-3.1-pro-preview",
]);
expect(runSuite.mock.calls.map(([params]) => params.thinkingDefault)).toEqual([
"xhigh",
"medium",
"xhigh",
"xhigh",
"high",
@@ -254,14 +254,14 @@ describe("runQaCharacterEval", () => {
]);
expect(runJudge).toHaveBeenCalledTimes(2);
expect(runJudge.mock.calls.map(([params]) => params.judgeModel)).toEqual([
"openai/gpt-5.5",
"openai/gpt-5.4",
"anthropic/claude-opus-4-6",
]);
expect(runJudge.mock.calls.map(([params]) => params.judgeThinkingDefault)).toEqual([
"xhigh",
"high",
]);
expect(runJudge.mock.calls.map(([params]) => params.judgeFastMode)).toEqual([false, false]);
expect(runJudge.mock.calls.map(([params]) => params.judgeFastMode)).toEqual([true, false]);
});
it("runs candidate models with bounded concurrency while preserving result order", async () => {

View File

@@ -189,6 +189,7 @@ describe("qa cli runtime", () => {
primaryModel: "openai/gpt-5.4",
alternateModel: "anthropic/claude-sonnet-4-6",
fastMode: true,
thinking: "medium",
scenarioIds: ["approval-turn-tool-followthrough"],
});
@@ -200,6 +201,7 @@ describe("qa cli runtime", () => {
primaryModel: "openai/gpt-5.4",
alternateModel: "anthropic/claude-sonnet-4-6",
fastMode: true,
thinkingDefault: "medium",
scenarioIds: ["approval-turn-tool-followthrough"],
});
});
@@ -1135,8 +1137,8 @@ describe("qa cli runtime", () => {
repoRoot: path.resolve("/tmp/openclaw-repo"),
transportId: "qa-channel",
providerMode: "live-frontier",
primaryModel: "openai/gpt-5.5",
alternateModel: "openai/gpt-5.5",
primaryModel: "openai/gpt-5.4",
alternateModel: "openai/gpt-5.4",
fastMode: undefined,
message: "read qa kickoff and reply short",
timeoutMs: undefined,
@@ -1166,7 +1168,7 @@ describe("qa cli runtime", () => {
it("defaults manual frontier runs onto Codex OAuth when the runtime resolver prefers it", async () => {
defaultQaRuntimeModelForMode.mockImplementation((mode, options) =>
mode === "live-frontier"
? "openai/gpt-5.5"
? "openai/gpt-5.4"
: defaultQaProviderModelForMode(mode as QaProviderModeInput, options),
);
@@ -1179,8 +1181,8 @@ describe("qa cli runtime", () => {
repoRoot: path.resolve("/tmp/openclaw-repo"),
transportId: "qa-channel",
providerMode: "live-frontier",
primaryModel: "openai/gpt-5.5",
alternateModel: "openai/gpt-5.5",
primaryModel: "openai/gpt-5.4",
alternateModel: "openai/gpt-5.4",
fastMode: undefined,
message: "read qa kickoff and reply short",
timeoutMs: undefined,

View File

@@ -450,6 +450,7 @@ export async function runQaSuiteCommand(opts: {
primaryModel?: string;
alternateModel?: string;
fastMode?: boolean;
thinking?: string;
cliAuthMode?: string;
parityPack?: string;
scenarioIds?: string[];
@@ -490,6 +491,7 @@ export async function runQaSuiteCommand(opts: {
throw new Error("--cli-auth-mode requires --runner host.");
}
if (runner === "multipass") {
const thinkingDefault = parseQaThinkingLevel("--thinking", opts.thinking);
const result = await runQaMultipass({
repoRoot,
outputDir: resolveRepoRelativeOutputDir(repoRoot, opts.outputDir),
@@ -498,6 +500,7 @@ export async function runQaSuiteCommand(opts: {
primaryModel: opts.primaryModel,
alternateModel: opts.alternateModel,
fastMode: opts.fastMode,
...(thinkingDefault ? { thinkingDefault } : {}),
allowFailures: true,
scenarioIds,
...(opts.concurrency !== undefined
@@ -532,6 +535,7 @@ export async function runQaSuiteCommand(opts: {
});
return;
}
const thinkingDefault = parseQaThinkingLevel("--thinking", opts.thinking);
const result = await runQaSuiteFromRuntimeWithInfraRetry({
repoRoot,
outputDir: resolveRepoRelativeOutputDir(repoRoot, opts.outputDir),
@@ -540,6 +544,7 @@ export async function runQaSuiteCommand(opts: {
primaryModel: opts.primaryModel,
alternateModel: opts.alternateModel,
fastMode: opts.fastMode,
...(thinkingDefault ? { thinkingDefault } : {}),
...(claudeCliAuthMode ? { claudeCliAuthMode } : {}),
scenarioIds,
...(opts.concurrency !== undefined

View File

@@ -35,6 +35,7 @@ async function runQaSuite(opts: {
primaryModel?: string;
alternateModel?: string;
fastMode?: boolean;
thinking?: string;
allowFailures?: boolean;
cliAuthMode?: string;
parityPack?: string;
@@ -247,6 +248,10 @@ export function registerQaLabCli(program: Command) {
false,
)
.option("--fast", "Enable provider fast mode where supported", false)
.option(
"--thinking <level>",
"Suite thinking default: off|minimal|low|medium|high|xhigh|adaptive|max",
)
.option("--image <alias>", "Multipass image alias")
.option("--cpus <count>", "Multipass vCPU count", (value: string) => Number(value))
.option("--memory <size>", "Multipass memory size")
@@ -266,6 +271,7 @@ export function registerQaLabCli(program: Command) {
concurrency?: number;
allowFailures?: boolean;
fast?: boolean;
thinking?: string;
image?: string;
cpus?: number;
memory?: string;
@@ -281,6 +287,7 @@ export function registerQaLabCli(program: Command) {
primaryModel: opts.model,
alternateModel: opts.altModel,
fastMode: opts.fast,
thinking: opts.thinking,
cliAuthMode: opts.cliAuthMode,
parityPack: opts.parityPack,
scenarioIds: opts.scenario,

View File

@@ -2,7 +2,7 @@ import { describe, expect, it } from "vitest";
import { selectQaRunnerModelOptions } from "./model-catalog.runtime.js";
describe("qa runner model catalog", () => {
it("filters to available rows and prefers gpt-5.5 first", () => {
it("filters to available rows and prefers gpt-5.4 first", () => {
expect(
selectQaRunnerModelOptions([
{
@@ -13,8 +13,8 @@ describe("qa runner model catalog", () => {
missing: false,
},
{
key: "openai/gpt-5.5",
name: "gpt-5.5",
key: "openai/gpt-5.4",
name: "gpt-5.4",
input: "text,image",
available: true,
missing: false,
@@ -27,6 +27,6 @@ describe("qa runner model catalog", () => {
missing: false,
},
]).map((entry) => entry.key),
).toEqual(["openai/gpt-5.5", "anthropic/claude-sonnet-4-6"]);
).toEqual(["openai/gpt-5.4", "anthropic/claude-sonnet-4-6"]);
});
});

View File

@@ -34,7 +34,7 @@ describe("qa model selection runtime", () => {
resolveEnvApiKey.mockReturnValue({ apiKey: "sk-test" });
expect(resolveQaPreferredLiveModel()).toBeUndefined();
expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.5");
expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.4");
expect(loadAuthProfileStoreForRuntime).not.toHaveBeenCalled();
});
@@ -43,8 +43,8 @@ describe("qa model selection runtime", () => {
provider === "openai-codex" ? ["openai-codex:user@example.com"] : [],
);
expect(resolveQaPreferredLiveModel()).toBe("openai/gpt-5.5");
expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.5");
expect(resolveQaPreferredLiveModel()).toBe("openai/gpt-5.4");
expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.4");
});
it("keeps the OpenAI live default when stored OpenAI profiles are available", () => {
@@ -53,7 +53,7 @@ describe("qa model selection runtime", () => {
);
expect(resolveQaPreferredLiveModel()).toBeUndefined();
expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.5");
expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.4");
});
it("leaves mock defaults unchanged", () => {

View File

@@ -71,6 +71,7 @@ export type QaMultipassPlan = {
primaryModel?: string;
alternateModel?: string;
fastMode?: boolean;
thinkingDefault?: string;
scenarioIds: string[];
forwardedEnv: Record<string, string>;
hostCodexHomePath?: string;
@@ -237,6 +238,7 @@ export function createQaMultipassPlan(params: {
primaryModel?: string;
alternateModel?: string;
fastMode?: boolean;
thinkingDefault?: string;
allowFailures?: boolean;
scenarioIds?: string[];
concurrency?: number;
@@ -276,6 +278,7 @@ export function createQaMultipassPlan(params: {
...(params.primaryModel ? ["--model", params.primaryModel] : []),
...(params.alternateModel ? ["--alt-model", params.alternateModel] : []),
...(params.fastMode ? ["--fast"] : []),
...(params.thinkingDefault ? ["--thinking", params.thinkingDefault] : []),
...(params.allowFailures ? ["--allow-failures"] : []),
...(params.concurrency ? ["--concurrency", String(params.concurrency)] : []),
],
@@ -301,6 +304,7 @@ export function createQaMultipassPlan(params: {
primaryModel: params.primaryModel,
alternateModel: params.alternateModel,
fastMode: params.fastMode,
thinkingDefault: params.thinkingDefault,
scenarioIds,
forwardedEnv,
hostCodexHomePath,

View File

@@ -1,5 +1,5 @@
export const QA_FRONTIER_PROVIDER_IDS = ["anthropic", "google", "openai"] as const;
export const QA_FRONTIER_CATALOG_PRIMARY_MODEL = "openai/gpt-5.5";
export const QA_FRONTIER_CATALOG_PRIMARY_MODEL = "openai/gpt-5.4";
export const QA_FRONTIER_CATALOG_ALTERNATE_MODEL = "anthropic/claude-sonnet-4-6";
export function isPreferredQaLiveFrontierCatalogModel(modelRef: string) {

View File

@@ -6,7 +6,7 @@ type QaFrontierCharacterModelOptions = {
};
export const QA_FRONTIER_CHARACTER_EVAL_MODELS = Object.freeze([
"openai/gpt-5.5",
"openai/gpt-5.4",
"openai/gpt-5.2",
"openai/gpt-5",
"anthropic/claude-opus-4-6",
@@ -18,19 +18,19 @@ export const QA_FRONTIER_CHARACTER_EVAL_MODELS = Object.freeze([
export const QA_FRONTIER_CHARACTER_THINKING_BY_MODEL: Readonly<Record<string, QaThinkingLevel>> =
Object.freeze({
"openai/gpt-5.5": "xhigh",
"openai/gpt-5.4": "medium",
"openai/gpt-5.2": "xhigh",
"openai/gpt-5": "xhigh",
});
export const QA_FRONTIER_CHARACTER_JUDGE_MODELS = Object.freeze([
"openai/gpt-5.5",
"openai/gpt-5.4",
"anthropic/claude-opus-4-6",
]);
export const QA_FRONTIER_CHARACTER_JUDGE_MODEL_OPTIONS: Readonly<
Record<string, QaFrontierCharacterModelOptions>
> = Object.freeze({
"openai/gpt-5.5": { thinkingDefault: "xhigh" },
"openai/gpt-5.4": { thinkingDefault: "xhigh", fastMode: true },
"anthropic/claude-opus-4-6": { thinkingDefault: "high" },
});

View File

@@ -23,7 +23,7 @@ function isClaudeOpusModel(modelRef: string) {
export const liveFrontierProviderDefinition: QaProviderDefinition = {
mode: "live-frontier",
kind: "live",
defaultModel: (options) => options?.preferredLiveModel ?? "openai/gpt-5.5",
defaultModel: (options) => options?.preferredLiveModel ?? "openai/gpt-5.4",
defaultImageGenerationProviderIds: ["openai"],
defaultImageGenerationModel: ({ modelProviderIds }) =>
modelProviderIds.includes("openai") ? "openai/gpt-image-1" : null,

View File

@@ -4,7 +4,7 @@ import {
} from "openclaw/plugin-sdk/agent-runtime";
import { resolveEnvApiKey } from "openclaw/plugin-sdk/provider-auth";
const QA_CODEX_OAUTH_LIVE_MODEL = "openai/gpt-5.5";
const QA_CODEX_OAUTH_LIVE_MODEL = "openai/gpt-5.4";
export function resolveQaLiveFrontierPreferredModel() {
if (resolveEnvApiKey("openai")?.apiKey) {

View File

@@ -1,2 +1,2 @@
export const QA_FRONTIER_PARITY_CANDIDATE_LABEL = "openai/gpt-5.5";
export const QA_FRONTIER_PARITY_CANDIDATE_LABEL = "openai/gpt-5.4";
export const QA_FRONTIER_PARITY_BASELINE_LABEL = "anthropic/claude-opus-4-6";

View File

@@ -151,6 +151,10 @@ const QA_REASONING_ONLY_RETRY_NEEDLE =
"recorded reasoning but did not produce a user-visible answer";
const QA_EMPTY_RESPONSE_RETRY_NEEDLE =
"The previous attempt did not produce a user-visible answer.";
const QA_SKILL_WORKSHOP_GIF_PROMPT_RE =
/externally sourced animated GIF asset|animated GIF asset in a product UI/i;
const QA_SKILL_WORKSHOP_REVIEW_PROMPT_RE = /Review transcript for durable skill updates/i;
const QA_RELEASE_AUDIT_PROMPT_RE = /release readiness audit for the small project/i;
type MockScenarioState = {
subagentFanoutPhase: number;
@@ -727,6 +731,16 @@ function buildAssistantText(
if (/(image generation check|capability flip image check)/i.test(prompt) && mediaPath) {
return `Protocol note: generated the QA lighthouse image successfully.\nMEDIA:${mediaPath}`;
}
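// Skill workshop GIF prompt: once the checklist write tool has run, answer with the summary checklist.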
if (QA_SKILL_WORKSHOP_GIF_PROMPT_RE.test(prompt) && toolOutput) {
return [
"Animated GIF QA checklist ready.",
"- Confirm true animation, not a static preview.",
"- Verify dimensions and product UI fit.",
"- Record attribution and license.",
"- Keep a local copy before using the asset.",
"- Re-open the copied file for final verification.",
].join("\n");
}
if (/roundtrip image inspection check/i.test(prompt) && imageInputCount > 0) {
return "Protocol note: the generated attachment shows the same QA lighthouse scene from the previous step.";
}
@@ -808,6 +822,79 @@ function buildToolCallEvents(prompt: string): StreamEvent[] {
return buildToolCallEventsWithArgs("read", { path: targetPath });
}
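// Canned audit report: eight findings keyed to the fixture checklist, including one intentionally blocked item.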
function buildReleaseAuditJson() {
return `${JSON.stringify(
{
verified: true,
findings: [
{
id: "REL-GATEWAY-417",
source: "src/gateway/reconnect.ts",
status: "retry jitter verified, resume token fallback still needs manual spot check",
},
{
id: "REL-CHANNEL-238",
source: "src/channels/delivery.ts",
status: "thread replies preserve ordering, root-channel fallback needs handoff note",
},
{
id: "REL-CRON-904",
source: "src/scheduling/cron.ts",
status: "single-run lock verified for restart wakeups",
},
{
id: "REL-MEMORY-552",
source: "src/memory/recall.ts",
status:
"fallback summary survives empty memory search; ranking sample needs second reviewer",
},
{
id: "REL-PLUGIN-319",
source: "src/plugins/runtime.ts",
status: "bundled runtime manifest loads cleanly after restart",
},
{
id: "REL-INSTALL-846",
source: "install/update.ts",
status: "update smoke passed from previous stable tag",
},
{
id: "REL-DOCS-611",
source: "docs/operator-notes.md",
status:
"docs mention reconnect, cron, memory, plugin, and installer checks; channel ordering and UI notes need maintainer handoff",
},
{
id: "REL-UI-BLOCKED",
source: "ui/control-panel.ts",
status: "blocked: source file was referenced by checklist but missing from the fixture",
},
],
},
null,
2,
)}\n`;
}
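// Companion handoff note splitting the audit findings into ready and follow-up buckets.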
function buildReleaseHandoffMarkdown() {
return [
"# Release Handoff",
"",
"Ready:",
"- REL-GATEWAY-417: gateway reconnect handling checked in `src/gateway/reconnect.ts`.",
"- REL-CRON-904: cron duplicate prevention checked in `src/scheduling/cron.ts`.",
"- REL-PLUGIN-319: plugin runtime loading checked in `src/plugins/runtime.ts`.",
"- REL-INSTALL-846: installer update path checked in `install/update.ts`.",
"",
"Follow-up:",
"- REL-CHANNEL-238: channel delivery ordering needs maintainer handoff.",
"- REL-MEMORY-552: memory recall fallback ranking sample needs a second reviewer.",
"- REL-DOCS-611: docs update status needs channel ordering and UI notes.",
"- `ui/control-panel.ts` is blocked/not found in the fixture.",
"",
].join("\n");
}
function extractPlannedToolName(events: StreamEvent[]) {
for (const event of events) {
if (event.type !== "response.output_item.done") {
@@ -1128,6 +1215,63 @@ async function buildResponsesPayload(
},
]);
}
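// Skill workshop review prompt: emit a JSON create-skill proposal built from the transcript checklist.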
if (QA_SKILL_WORKSHOP_REVIEW_PROMPT_RE.test(allInputText)) {
return buildAssistantEvents(
JSON.stringify({
action: "create",
skillName: "animated-gif-workflow",
title: "Animated GIF Workflow",
reason: "Transcript captured a reusable animated media QA checklist.",
description: "Reusable workflow notes for animated GIF QA tasks.",
body: [
"- Confirm the asset has true animation, not a static preview.",
"- Check dimensions against the target product UI slot.",
"- Record attribution and license before using the file.",
"- Keep a local copy under the workspace before integration.",
"- Re-open the local copy for final verification.",
].join("\n"),
}),
);
}
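// First GIF turn (no tool output yet): plan a write call that drafts the checklist file.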
if (QA_SKILL_WORKSHOP_GIF_PROMPT_RE.test(prompt) && !toolOutput) {
return buildToolCallEventsWithArgs("write", {
path: "animated-gif-qa-checklist.md",
content: [
"# Animated GIF QA Checklist",
"",
"- Confirm true animation.",
"- Verify dimensions.",
"- Record attribution.",
"- Keep a local copy.",
"- Perform final verification.",
].join("\n"),
});
}
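// Release audit flow: step through the fixture reads and report writes, then emit the completion marker.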
if (QA_RELEASE_AUDIT_PROMPT_RE.test(prompt)) {
if (!toolOutput) {
return buildToolCallEventsWithArgs("read", { path: "audit-fixture/README.md" });
}
if (/Release readiness task|current checklist/i.test(toolOutput)) {
return buildToolCallEventsWithArgs("read", {
path: "audit-fixture/docs/current-readiness-checklist.md",
});
}
if (/Current release readiness requires checking eight areas/i.test(toolOutput)) {
return buildToolCallEventsWithArgs("write", {
path: "audit-fixture/release-audit.json",
content: buildReleaseAuditJson(),
});
}
if (/release-audit\.json/i.test(toolOutput)) {
return buildToolCallEventsWithArgs("write", {
path: "audit-fixture/release-handoff.md",
content: buildReleaseHandoffMarkdown(),
});
}
if (/release-handoff\.md/i.test(toolOutput)) {
return buildAssistantEvents("RELEASE-AUDIT-COMPLETE");
}
}
if (/lobster invaders/i.test(prompt)) {
if (!toolOutput) {
return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });

View File

@@ -45,8 +45,8 @@ describe("qa run config", () => {
it("creates a live-by-default selection that arms every scenario", () => {
expect(createDefaultQaRunSelection(scenarios)).toEqual({
providerMode: "live-frontier",
primaryModel: "openai/gpt-5.5",
alternateModel: "openai/gpt-5.5",
primaryModel: "openai/gpt-5.4",
alternateModel: "openai/gpt-5.4",
fastMode: true,
scenarioIds: ["dm-chat-baseline", "thread-lifecycle"],
});
@@ -57,7 +57,7 @@ describe("qa run config", () => {
normalizeQaRunSelection(
{
providerMode: "live-frontier",
primaryModel: "openai/gpt-5.5",
primaryModel: "openai/gpt-5.4",
alternateModel: "",
fastMode: false,
scenarioIds: ["thread-lifecycle", "missing", "thread-lifecycle"],
@@ -66,8 +66,8 @@ describe("qa run config", () => {
),
).toEqual({
providerMode: "live-frontier",
primaryModel: "openai/gpt-5.5",
alternateModel: "openai/gpt-5.5",
primaryModel: "openai/gpt-5.4",
alternateModel: "openai/gpt-5.4",
fastMode: true,
scenarioIds: ["thread-lifecycle"],
});
@@ -99,13 +99,13 @@ describe("qa run config", () => {
});
it("keeps idle snapshots on static defaults so startup does not inspect auth profiles", () => {
defaultQaRuntimeModelForMode.mockReturnValue("openai/gpt-5.5");
defaultQaRuntimeModelForMode.mockReturnValue("openai/gpt-5.4");
defaultQaRuntimeModelForMode.mockClear();
expect(createIdleQaRunnerSnapshot(scenarios).selection).toMatchObject({
providerMode: "live-frontier",
primaryModel: "openai/gpt-5.5",
alternateModel: "openai/gpt-5.5",
primaryModel: "openai/gpt-5.4",
alternateModel: "openai/gpt-5.4",
});
expect(defaultQaRuntimeModelForMode).not.toHaveBeenCalled();
});
@@ -138,14 +138,14 @@ describe("qa run config", () => {
it("prefers the Codex OAuth default when the runtime resolver says it is available", () => {
defaultQaRuntimeModelForMode.mockImplementation((mode, options) =>
mode === "live-frontier"
? "openai/gpt-5.5"
? "openai/gpt-5.4"
: defaultQaProviderModelForMode(mode as QaProviderModeInput, options),
);
expect(createDefaultQaRunSelection(scenarios)).toEqual({
providerMode: "live-frontier",
primaryModel: "openai/gpt-5.5",
alternateModel: "openai/gpt-5.5",
primaryModel: "openai/gpt-5.4",
alternateModel: "openai/gpt-5.4",
fastMode: true,
scenarioIds: ["dm-chat-baseline", "thread-lifecycle"],
});

View File

@@ -137,15 +137,15 @@ describe("qa scenario catalog", () => {
expect(scenario.sourcePath).toBe("qa/scenarios/models/gpt54-thinking-visibility-switch.md");
expect(config?.requiredLiveProvider).toBe("openai");
expect(config?.requiredLiveModel).toBe("gpt-5.5");
expect(config?.requiredLiveModel).toBe("gpt-5.4");
expect(config?.offDirective).toBe("/think off");
expect(config?.maxDirective).toBe("/think max");
expect(config?.maxDirective).toBe("/think medium");
expect(config?.reasoningDirective).toBe("/reasoning on");
expect(scenario.execution.flow?.steps.map((step) => step.name)).toEqual([
"enables reasoning display and disables thinking",
"switches to max thinking",
"verifies max thinking emits visible reasoning",
"verifies max thinking completes the answer",
"switches to medium thinking",
"verifies medium thinking emits visible reasoning",
"verifies medium thinking completes the answer",
]);
});
@@ -169,10 +169,10 @@ describe("qa scenario catalog", () => {
},
});
expect(config?.requiredProvider).toBe("openai");
expect(config?.requiredModel).toBe("gpt-5.5");
expect(config?.requiredModel).toBe("gpt-5.4");
expect(config?.expectedMarker).toBe("WEB-SEARCH-OK");
expect(scenario.execution.flow?.steps.map((step) => step.name)).toEqual([
"confirms live OpenAI GPT-5.5 web search auto mode",
"confirms live OpenAI GPT-5.4 web search auto mode",
"searches official OpenAI News through the live model",
]);
});
@@ -191,7 +191,7 @@ describe("qa scenario catalog", () => {
expect(scenario.sourcePath).toBe("qa/scenarios/models/thinking-slash-model-remap.md");
expect(config?.requiredProviderMode).toBe("live-frontier");
expect(config?.anthropicModelRef).toBe("anthropic/claude-sonnet-4-6");
expect(config?.openAiXhighModelRef).toBe("openai/gpt-5.5");
expect(config?.openAiXhighModelRef).toBe("openai/gpt-5.4");
expect(config?.noXhighModelRef).toBe("anthropic/claude-sonnet-4-6");
expect(scenario.execution.flow?.steps.map((step) => step.name)).toEqual([
"selects Anthropic and verifies adaptive options",

View File

@@ -250,4 +250,32 @@ describe("qa suite planning helpers", () => {
}).map((scenario) => scenario.id),
).toEqual(["generic", "claude-subscription"]);
});
it("filters provider-mode-specific scenarios from implicit suite selections", () => {
const scenarios = [
makeQaSuiteTestScenario("generic"),
makeQaSuiteTestScenario("live-only", {
config: { requiredProviderMode: "live-frontier" },
}),
makeQaSuiteTestScenario("mock-only", {
config: { requiredProviderMode: "mock-openai" },
}),
];
expect(
selectQaSuiteScenarios({
scenarios,
providerMode: "mock-openai",
primaryModel: "mock-openai/gpt-5.4",
}).map((scenario) => scenario.id),
).toEqual(["generic", "mock-only"]);
expect(
selectQaSuiteScenarios({
scenarios,
providerMode: "live-frontier",
primaryModel: "openai/gpt-5.4",
}).map((scenario) => scenario.id),
).toEqual(["generic", "live-only"]);
});
});

View File

@@ -33,11 +33,15 @@ function scenarioMatchesLiveLane(params: {
providerMode: QaProviderMode;
claudeCliAuthMode?: QaCliBackendAuthMode;
}) {
const config = params.scenario.execution.config ?? {};
const requiredProviderMode = normalizeQaConfigString(config.requiredProviderMode);
if (requiredProviderMode && params.providerMode !== requiredProviderMode) {
return false;
}
if (getQaProvider(params.providerMode).kind !== "live") {
return true;
}
const selected = splitModelRef(params.primaryModel);
const config = params.scenario.execution.config ?? {};
const requiredProvider = normalizeQaConfigString(config.requiredProvider);
if (requiredProvider && selected?.provider !== requiredProvider) {
return false;

View File

@@ -50,6 +50,9 @@ steps:
expr: "tools.has('image_generate')"
message: image_generate not present after imageGenerationModel patch
- call: reset
- set: generationStartedAt
value:
expr: Date.now()
- call: runAgentPrompt
args:
- ref: env
@@ -70,17 +73,18 @@ steps:
expr: "!env.mock || ((await fetchJson(`${env.mock.baseUrl}/debug/requests`)).find((request) => String(request.allInputText ?? '').includes(config.promptSnippet))?.plannedToolName === 'image_generate')"
message:
expr: "`expected image_generate, got ${String((await fetchJson(`${env.mock.baseUrl}/debug/requests`)).find((request) => String(request.allInputText ?? '').includes(config.promptSnippet))?.plannedToolName ?? '')}`"
- call: waitForCondition
saveAs: generated
- call: resolveGeneratedImagePath
saveAs: generatedPath
args:
- lambda:
async: true
expr: "!env.mock ? true : (await fetchJson(`${env.mock.baseUrl}/debug/image-generations`)).find((request) => request.model === 'gpt-image-1' && String(request.prompt ?? '').includes(config.generatedNeedle))"
- 15000
- 250
- env:
ref: env
promptSnippet:
expr: config.promptSnippet
startedAtMs:
ref: generationStartedAt
timeoutMs: 15000
- assert:
expr: "!env.mock || Boolean(generated)"
message:
expr: "`image provider was never invoked`"
detailsExpr: "env.mock ? `${outbound.text}\\nIMAGE_PROMPT:${generated.prompt ?? ''}` : outbound.text"
expr: "typeof generatedPath === 'string' && generatedPath.length > 0"
message: image generation did not produce a saved media path
detailsExpr: "`${outbound.text}\\nIMAGE_PATH:${generatedPath}`"
```

View File

@@ -24,10 +24,10 @@ codeRefs:
- extensions/qa-lab/src/suite.ts
execution:
kind: flow
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --scenario codex-harness-no-meta-leak`.
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario codex-harness-no-meta-leak`.
config:
requiredProvider: codex
requiredModel: gpt-5.5
requiredModel: gpt-5.4
harnessRuntime: codex
harnessFallback: none
expectedReply: QA_LEAK_OK
@@ -47,7 +47,7 @@ execution:
```yaml qa-flow
steps:
- name: confirms GPT-5.5 Codex harness target
- name: confirms GPT-5.4 Codex harness target
actions:
- set: selected
value:

View File

@@ -1,20 +1,20 @@
# GPT-5.5 thinking visibility switch
# GPT-5.4 thinking visibility switch
```yaml qa-scenario
id: gpt54-thinking-visibility-switch
title: GPT-5.5 thinking visibility switch
title: GPT-5.4 thinking visibility switch
surface: models
coverage:
primary:
- models.thinking
secondary:
- runtime.reasoning-visibility
objective: Verify GPT-5.5 can switch from disabled thinking to max thinking while reasoning display stays enabled.
objective: Verify GPT-5.4 can switch from disabled thinking to medium thinking while reasoning display stays enabled.
successCriteria:
- Live runs target openai/gpt-5.5, not a mini or pro variant.
- Live runs target openai/gpt-5.4, not a mini or pro variant.
- The session enables reasoning display before the comparison turns.
- The disabled-thinking turn returns its visible marker without a Reasoning-prefixed message.
- The max-thinking turn returns its visible marker and a separate Reasoning-prefixed message.
- The medium-thinking turn returns its visible marker and a separate Reasoning-prefixed message.
docsRefs:
- docs/tools/thinking.md
- docs/help/testing.md
@@ -27,12 +27,12 @@ codeRefs:
- extensions/qa-lab/src/providers/mock-openai/server.ts
execution:
kind: flow
summary: Toggle reasoning display and GPT-5.5 thinking between off/none and max/high, then verify visible reasoning only on the max turn.
summary: Toggle reasoning display and GPT-5.4 thinking between off/none and medium, then verify visible reasoning only on the medium turn.
config:
requiredLiveProvider: openai
requiredLiveModel: gpt-5.5
requiredLiveModel: gpt-5.4
offDirective: /think off
maxDirective: /think max
maxDirective: /think medium
reasoningDirective: /reasoning on
conversationId: qa-thinking-visibility
offPrompt: "QA thinking visibility check off: answer exactly THINKING-OFF-OK."
@@ -60,7 +60,7 @@ steps:
- assert:
expr: "env.providerMode !== 'live-frontier' || (selected?.provider === config.requiredLiveProvider && selected?.model === config.requiredLiveModel)"
message:
expr: "`expected live GPT-5.5, got ${env.primaryModel}`"
expr: "`expected live GPT-5.4, got ${env.primaryModel}`"
- call: state.addInboundMessage
args:
- conversation:
@@ -133,11 +133,11 @@ steps:
value:
expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.offPrompt))"
- assert:
expr: "String(offRequest?.model ?? '').includes('gpt-5.5')"
expr: "String(offRequest?.model ?? '').includes('gpt-5.4')"
message:
expr: "`expected GPT-5.5 off mock request, got ${String(offRequest?.model ?? '')}`"
expr: "`expected GPT-5.4 off mock request, got ${String(offRequest?.model ?? '')}`"
detailsExpr: "`off ack=${offAck.text}; off answer=${offAnswer.text}`"
- name: switches to max thinking
- name: switches to medium thinking
actions:
- call: state.addInboundMessage
args:
@@ -153,10 +153,10 @@ steps:
saveAs: maxAck
args:
- lambda:
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking level set to high/i.test(candidate.text)).at(-1)"
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking level set to medium/i.test(candidate.text)).at(-1)"
- expr: liveTurnTimeoutMs(env, 20000)
detailsExpr: "`max ack=${maxAck.text}`"
- name: verifies max thinking emits visible reasoning
- name: verifies medium thinking emits visible reasoning
actions:
- set: maxCursor
value:
@@ -182,7 +182,7 @@ steps:
message:
expr: "`missing max reasoning message near answer: ${recentOutboundSummary(state, 6)}`"
detailsExpr: "`reasoning=${maxReasoning.text}`"
- name: verifies max thinking completes the answer
- name: verifies medium thinking completes the answer
actions:
- call: waitForCondition
saveAs: maxAnswer
@@ -204,8 +204,8 @@ steps:
value:
expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.maxPrompt))"
- assert:
expr: "String(maxRequest?.model ?? '').includes('gpt-5.5')"
expr: "String(maxRequest?.model ?? '').includes('gpt-5.4')"
message:
expr: "`expected GPT-5.5 mock request, got ${String(maxRequest?.model ?? '')}`"
expr: "`expected GPT-5.4 mock request, got ${String(maxRequest?.model ?? '')}`"
detailsExpr: "`answer=${maxAnswer.text}`"
```

View File

@@ -12,7 +12,7 @@ coverage:
objective: Verify a live OpenAI GPT model can use OpenAI native web_search when OpenClaw web search is enabled in auto mode.
successCriteria:
- A live-frontier run fails fast unless the selected primary provider is openai.
- The selected primary model is GPT-5.5, not a mini or pro variant.
- The selected primary model is GPT-5.4, not a mini or pro variant.
- Web search is enabled without pinning a managed web_search provider.
- The live reply includes the required marker plus an official OpenAI News URL and headline found through web search.
gatewayConfigPatch:
@@ -32,10 +32,10 @@ codeRefs:
- extensions/qa-lab/src/suite.ts
execution:
kind: flow
summary: Run with `OPENCLAW_LIVE_OPENAI_KEY="${OPENAI_API_KEY}" pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --scenario openai-native-web-search-live`.
summary: Run with `OPENCLAW_LIVE_OPENAI_KEY="${OPENAI_API_KEY}" pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario openai-native-web-search-live`.
config:
requiredProvider: openai
requiredModel: gpt-5.5
requiredModel: gpt-5.4
expectedMarker: WEB-SEARCH-OK
failureMarker: WEB-SEARCH-FAILED
searchPrompt: |-
@@ -49,7 +49,7 @@ execution:
```yaml qa-flow
steps:
- name: confirms live OpenAI GPT-5.5 web search auto mode
- name: confirms live OpenAI GPT-5.4 web search auto mode
actions:
- call: waitForGatewayHealthy
args:

View File

@@ -13,8 +13,8 @@ coverage:
objective: Verify /think lists provider-owned levels and remaps stored thinking levels when /model changes provider capabilities.
successCriteria:
- Anthropic Claude Sonnet 4.6 advertises adaptive but not OpenAI-only xhigh or Opus max.
- A stored adaptive level remaps to medium when switching to OpenAI GPT-5.5.
- OpenAI GPT-5.5 advertises xhigh but not adaptive or max.
- A stored adaptive level remaps to medium when switching to OpenAI GPT-5.4.
- OpenAI GPT-5.4 advertises xhigh but not adaptive or max.
- A stored xhigh level remaps to high when switching to an Anthropic model without xhigh support.
docsRefs:
- docs/tools/thinking.md
@@ -33,7 +33,7 @@ execution:
config:
requiredProviderMode: live-frontier
anthropicModelRef: anthropic/claude-sonnet-4-6
openAiXhighModelRef: openai/gpt-5.5
openAiXhighModelRef: openai/gpt-5.4
noXhighModelRef: anthropic/claude-sonnet-4-6
conversationId: qa-thinking-slash-remap
```
@@ -165,7 +165,7 @@ steps:
- assert:
expr: "/Options: .*\\bxhigh\\b/i.test(openAiThinkStatus.text) && !/Options: .*\\badaptive\\b/i.test(openAiThinkStatus.text) && !/Options: .*\\bmax\\b/i.test(openAiThinkStatus.text)"
message:
expr: "`expected OpenAI GPT-5.5 /think options to include xhigh only, got ${openAiThinkStatus.text}`"
expr: "`expected OpenAI GPT-5.4 /think options to include xhigh only, got ${openAiThinkStatus.text}`"
detailsExpr: "`adaptive=${adaptiveAck.text}; switch=${openAiModelAck.text}; think=${openAiThinkStatus.text}`"
- name: maps xhigh to high on a model without xhigh
actions:

View File

@@ -11,7 +11,7 @@ coverage:
- models.codex-cli
objective: Verify the Codex app-server harness can plan and build a medium-complex self-contained browser game.
successCriteria:
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.5 with the Codex harness forced.
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.4 with the Codex harness forced.
- The scenario forces the Codex embedded harness and disables PI fallback.
- The prompt explicitly asks the agent to enter plan mode before editing.
- The agent writes a self-contained HTML game with a canvas loop, controls, scoring, waves, pause, and restart.
@@ -25,10 +25,10 @@ codeRefs:
- extensions/qa-lab/src/suite.ts
execution:
kind: flow
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --scenario medium-game-plan-codex-harness`.
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario medium-game-plan-codex-harness`.
config:
requiredProvider: codex
requiredModel: gpt-5.5
requiredModel: gpt-5.4
harnessRuntime: codex
harnessFallback: none
artifactFile: star-garden-defenders-codex.html
@@ -52,7 +52,7 @@ execution:
```yaml qa-flow
steps:
- name: confirms GPT-5.5 Codex harness target
- name: confirms GPT-5.4 Codex harness target
actions:
- set: selected
value:

View File

@@ -9,9 +9,9 @@ coverage:
- workspace.planning
secondary:
- agents.pi-harness
objective: Verify GPT-5.5 can use the PI harness to plan and build a medium-complex self-contained browser game.
objective: Verify GPT-5.4 can use the PI harness to plan and build a medium-complex self-contained browser game.
successCriteria:
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.5.
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.4.
- The scenario forces the embedded PI harness before the build turn.
- The prompt explicitly asks the agent to enter plan mode before editing.
- The agent writes a self-contained HTML game with a canvas loop, controls, scoring, waves, pause, and restart.
@@ -25,10 +25,10 @@ codeRefs:
- extensions/qa-lab/src/suite.ts
execution:
kind: flow
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --scenario medium-game-plan-pi-harness`.
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario medium-game-plan-pi-harness`.
config:
requiredProvider: openai
requiredModel: gpt-5.5
requiredModel: gpt-5.4
harnessRuntime: pi
harnessFallback: pi
artifactFile: star-garden-defenders-pi.html
@@ -52,7 +52,7 @@ execution:
```yaml qa-flow
steps:
- name: confirms GPT-5.5 PI harness target
- name: confirms GPT-5.4 PI harness target
actions:
- set: selected
value: