refactor: split qa scenarios into per-file markdown defs

This commit is contained in:
Peter Steinberger
2026-04-08 05:37:02 +01:00
parent 5eab61b45d
commit b73d8ef7d7
40 changed files with 1255 additions and 666 deletions

View File

@@ -56,7 +56,8 @@ asset hash changes.
Seed assets live in `qa/`:
- `qa/scenarios.md`
- `qa/scenarios/index.md`
- `qa/scenarios/*.md`
These are intentionally in git so the QA plan is visible to both humans and the
agent. The baseline list should stay broad enough to cover:

View File

@@ -17,16 +17,20 @@ The desired end state is a generic QA harness that loads powerful scenario defin
## Current State
Primary source of truth now lives in `qa/scenarios.md`.
Primary source of truth now lives in `qa/scenarios/index.md` plus one file per
scenario under `qa/scenarios/*.md`.
Implemented:
- `qa/scenarios.md`
- canonical QA pack
- `qa/scenarios/index.md`
- canonical QA pack metadata
- operator identity
- kickoff mission
- `qa/scenarios/*.md`
- one markdown file per scenario
- scenario metadata
- handler bindings
- scenario-specific execution config
- `extensions/qa-lab/src/scenario-catalog.ts`
- markdown pack parser + zod validation
- `extensions/qa-lab/src/qa-agent-bootstrap.ts`
@@ -103,7 +107,8 @@ These categories matter because they drive DSL requirements. A flat list of prom
### Single source of truth
Use `qa/scenarios.md` as the authored source of truth.
Use `qa/scenarios/index.md` plus `qa/scenarios/*.md` as the authored source of
truth.
The pack should stay:
@@ -357,7 +362,8 @@ Generated compatibility:
Done.
- added `qa/scenarios.md`
- added `qa/scenarios/index.md`
- split scenarios into `qa/scenarios/*.md`
- added parser for named markdown YAML pack content
- validated with zod
- switched consumers to the parsed pack

View File

@@ -9,7 +9,7 @@ describe("qa discovery evaluation", () => {
it("accepts rich discovery reports that explicitly confirm all required files were read", () => {
const report = `
Worked
- Read all three requested files: repo/qa/scenarios.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md.
- Read all three requested files: repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md.
Failed
- None.
Blocked
@@ -28,7 +28,7 @@ The helper text mentions banned phrases like "not present", "missing files", "bl
it("accepts numeric 'all 4 required files read' confirmations", () => {
const report = `
Worked
- Source: repo/qa/scenarios.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md
- Source: repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md
- all 3 required files read.
Failed
- None.
@@ -49,7 +49,7 @@ The report may quote phrases like "not present" while describing the evaluator,
const report = `
Worked
- All three files retrieved. Now let me compile the protocol report.
- All three mandated files read successfully: repo/qa/scenarios.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md.
- All three mandated files read successfully: repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md.
Failed
- None.
Blocked
@@ -83,7 +83,7 @@ Follow-up
it("flags discovery replies that drift into unrelated suite wrap-up claims", () => {
const report = `
Worked
- All three requested files were read: repo/qa/scenarios.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md.
- All three requested files were read: repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md.
Failed
- None.
Blocked

View File

@@ -1,10 +1,20 @@
import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtime";
import { readQaScenarioExecutionConfig } from "./scenario-catalog.js";
const REQUIRED_DISCOVERY_REFS = [
"repo/qa/scenarios.md",
"repo/extensions/qa-lab/src/suite.ts",
"repo/docs/help/testing.md",
] as const;
function readRequiredDiscoveryRefs() {
  // Pull the required-file list from the scenario's markdown-defined execution
  // config so this evaluator and the scenario definition cannot drift apart.
  const config = readQaScenarioExecutionConfig("source-docs-discovery-report");
  const requiredFiles = config?.requiredFiles;
  // The config comes from parsed YAML, so validate the shape at runtime
  // instead of trusting a type cast: only accept an array of strings.
  if (
    Array.isArray(requiredFiles) &&
    requiredFiles.every((entry): entry is string => typeof entry === "string")
  ) {
    return requiredFiles;
  }
  // Fallback mirrors the defaults declared in the scenario markdown.
  return [
    "repo/qa/scenarios/index.md",
    "repo/extensions/qa-lab/src/suite.ts",
    "repo/docs/help/testing.md",
  ];
}
const REQUIRED_DISCOVERY_REFS = readRequiredDiscoveryRefs();
const REQUIRED_DISCOVERY_REFS_LOWER = REQUIRED_DISCOVERY_REFS.map(normalizeLowercaseStringOrEmpty);

View File

@@ -124,6 +124,8 @@ export function buildQaRuntimeEnv(params: {
function isRetryableGatewayCallError(details: string): boolean {
return (
details.includes("handshake timeout") ||
details.includes("gateway closed (1000") ||
details.includes("gateway closed (1012)") ||
details.includes("gateway closed (1006") ||
details.includes("abnormal closure") ||
@@ -168,6 +170,16 @@ async function waitForGatewayReady(params: {
throw new Error(`gateway failed to become healthy:\n${params.logs()}`);
}
/**
 * True when an RPC client startup failure looks transient (handshake timing
 * or a websocket close while the gateway restarts) and is worth retrying.
 */
function isRetryableRpcStartupError(error: unknown) {
  const details = formatErrorMessage(error);
  const transientMarkers = [
    "handshake timeout",
    "gateway closed (1000",
    "gateway closed (1006",
    "gateway closed (1012)",
  ];
  return transientMarkers.some((marker) => details.includes(marker));
}
export function resolveQaControlUiRoot(params: { repoRoot: string; controlUiEnabled?: boolean }) {
if (params.controlUiEnabled === false) {
return undefined;
@@ -277,12 +289,34 @@ export async function startQaGatewayChild(params: {
baseUrl,
logs,
child,
timeoutMs: 120_000,
});
rpcClient = await startQaGatewayRpcClient({
wsUrl,
token: gatewayToken,
logs,
});
let lastRpcError: unknown = null;
for (let attempt = 1; attempt <= 4; attempt += 1) {
try {
rpcClient = await startQaGatewayRpcClient({
wsUrl,
token: gatewayToken,
logs,
});
break;
} catch (error) {
lastRpcError = error;
if (attempt >= 4 || !isRetryableRpcStartupError(error)) {
throw error;
}
await sleep(500 * attempt);
await waitForGatewayReady({
baseUrl,
logs,
child,
timeoutMs: 15_000,
});
}
}
if (!rpcClient) {
throw lastRpcError ?? new Error("qa gateway rpc client failed to start");
}
} catch (error) {
child.kill("SIGTERM");
throw error;

View File

@@ -15,7 +15,7 @@ describe("qa live timeout policy", () => {
).toBe(30_000);
});
it("uses the standard live floor for non-anthropic models", () => {
it("uses the higher gpt-5 live floor for openai heavy turns", () => {
expect(
resolveQaLiveTurnTimeoutMs(
{
@@ -25,6 +25,19 @@ describe("qa live timeout policy", () => {
},
30_000,
),
).toBe(360_000);
});
it("keeps the standard live floor for other non-anthropic models", () => {
expect(
resolveQaLiveTurnTimeoutMs(
{
providerMode: "live-frontier",
primaryModel: "google/gemini-3-flash",
alternateModel: "google/gemini-3-flash",
},
30_000,
),
).toBe(120_000);
});

View File

@@ -8,6 +8,14 @@ function isAnthropicModel(modelRef: string) {
return modelRef.startsWith("anthropic/");
}
// True when the model ref targets the OpenAI provider (e.g. "openai/gpt-5").
function isOpenAiModel(modelRef: string) {
  const providerPrefix = "openai/";
  return modelRef.slice(0, providerPrefix.length) === providerPrefix;
}
// True for OpenAI GPT-5 family refs ("openai/gpt-5", "openai/gpt-5.1", ...).
// A single prefix check is equivalent to testing the provider prefix and the
// model-name prefix separately.
function isGptFiveModel(modelRef: string) {
  return modelRef.startsWith("openai/gpt-5");
}
// True for Anthropic Claude Opus model refs (provider prefix + "claude-opus"
// anywhere in the model name).
function isClaudeOpusModel(modelRef: string) {
  return modelRef.startsWith("anthropic/") && modelRef.includes("claude-opus");
}
@@ -26,5 +34,8 @@ export function resolveQaLiveTurnTimeoutMs(
if (isAnthropicModel(modelRef)) {
return Math.max(fallbackMs, 180_000);
}
if (isGptFiveModel(modelRef)) {
return Math.max(fallbackMs, 360_000);
}
return Math.max(fallbackMs, 120_000);
}

View File

@@ -1,5 +1,11 @@
import { describe, expect, it } from "vitest";
import { readQaBootstrapScenarioCatalog, readQaScenarioPack } from "./scenario-catalog.js";
import {
listQaScenarioMarkdownPaths,
readQaBootstrapScenarioCatalog,
readQaScenarioById,
readQaScenarioExecutionConfig,
readQaScenarioPack,
} from "./scenario-catalog.js";
describe("qa scenario catalog", () => {
it("loads the markdown pack as the canonical source of truth", () => {
@@ -8,6 +14,7 @@ describe("qa scenario catalog", () => {
expect(pack.version).toBe(1);
expect(pack.agent.identityMarkdown).toContain("Dev C-3PO");
expect(pack.kickoffTask).toContain("Lobster Invaders");
expect(listQaScenarioMarkdownPaths().length).toBe(pack.scenarios.length);
expect(pack.scenarios.some((scenario) => scenario.id === "image-generation-roundtrip")).toBe(
true,
);
@@ -23,4 +30,18 @@ describe("qa scenario catalog", () => {
true,
);
});
it("loads scenario-specific execution config from per-scenario markdown", () => {
const discovery = readQaScenarioById("source-docs-discovery-report");
const discoveryConfig = readQaScenarioExecutionConfig("source-docs-discovery-report");
const fallbackConfig = readQaScenarioExecutionConfig("memory-failure-fallback");
expect(discovery.title).toBe("Source and docs discovery report");
expect((discoveryConfig?.requiredFiles as string[] | undefined)?.[0]).toBe(
"repo/qa/scenarios/index.md",
);
expect(fallbackConfig?.gracefulFallbackAny as string[] | undefined).toContain(
"will not reveal",
);
});
});

View File

@@ -24,6 +24,7 @@ const qaScenarioExecutionSchema = z.object({
kind: z.literal("custom").default("custom"),
handler: z.string().trim().min(1),
summary: z.string().trim().min(1).optional(),
config: z.record(z.string(), z.unknown()).optional(),
});
const qaSeedScenarioSchema = z.object({
@@ -47,12 +48,13 @@ const qaScenarioPackSchema = z.object({
identityMarkdown: DEFAULT_QA_AGENT_IDENTITY_MARKDOWN,
}),
kickoffTask: z.string().trim().min(1),
scenarios: z.array(qaSeedScenarioSchema).min(1),
});
export type QaScenarioExecution = z.infer<typeof qaScenarioExecutionSchema>;
export type QaSeedScenario = z.infer<typeof qaSeedScenarioSchema>;
export type QaScenarioPack = z.infer<typeof qaScenarioPackSchema>;
export type QaScenarioPack = z.infer<typeof qaScenarioPackSchema> & {
scenarios: QaSeedScenario[];
};
export type QaBootstrapScenarioCatalog = {
agentIdentityMarkdown: string;
@@ -60,8 +62,11 @@ export type QaBootstrapScenarioCatalog = {
scenarios: QaSeedScenario[];
};
const QA_SCENARIO_PACK_PATH = "qa/scenarios.md";
const QA_SCENARIO_PACK_INDEX_PATH = "qa/scenarios/index.md";
const QA_SCENARIO_LEGACY_OVERVIEW_PATH = "qa/scenarios.md";
const QA_SCENARIO_DIR_PATH = "qa/scenarios";
const QA_PACK_FENCE_RE = /```ya?ml qa-pack\r?\n([\s\S]*?)\r?\n```/i;
const QA_SCENARIO_FENCE_RE = /```ya?ml qa-scenario\r?\n([\s\S]*?)\r?\n```/i;
function walkUpDirectories(start: string): string[] {
const roots: string[] = [];
@@ -76,10 +81,14 @@ function walkUpDirectories(start: string): string[] {
}
}
function resolveRepoFile(relativePath: string): string | null {
function resolveRepoPath(relativePath: string, kind: "file" | "directory" = "file"): string | null {
for (const dir of walkUpDirectories(import.meta.dirname)) {
const candidate = path.join(dir, relativePath);
if (fs.existsSync(candidate) && fs.statSync(candidate).isFile()) {
if (!fs.existsSync(candidate)) {
continue;
}
const stat = fs.statSync(candidate);
if ((kind === "file" && stat.isFile()) || (kind === "directory" && stat.isDirectory())) {
return candidate;
}
}
@@ -87,34 +96,75 @@ function resolveRepoFile(relativePath: string): string | null {
}
function readTextFile(relativePath: string): string {
const resolved = resolveRepoFile(relativePath);
const resolved = resolveRepoPath(relativePath, "file");
if (!resolved) {
return "";
}
return fs.readFileSync(resolved, "utf8");
}
/**
 * Lists the entry names inside a repo-relative directory, or [] when the
 * directory cannot be located from any ancestor of this module.
 */
function readDirEntries(relativePath: string): string[] {
  const resolved = resolveRepoPath(relativePath, "directory");
  return resolved ? fs.readdirSync(resolved) : [];
}
function extractQaPackYaml(content: string) {
const match = content.match(QA_PACK_FENCE_RE);
if (!match?.[1]) {
throw new Error(
`qa scenario pack missing \`\`\`yaml qa-pack fence in ${QA_SCENARIO_PACK_PATH}`,
`qa scenario pack missing \`\`\`yaml qa-pack fence in ${QA_SCENARIO_PACK_INDEX_PATH}`,
);
}
return match[1];
}
/**
 * Pulls the ```yaml qa-scenario fenced block out of one scenario markdown
 * file. Throws (naming the offending file) when the fence is absent.
 */
function extractQaScenarioYaml(content: string, relativePath: string) {
  const fenceMatch = QA_SCENARIO_FENCE_RE.exec(content);
  const yamlBody = fenceMatch?.[1];
  if (!yamlBody) {
    throw new Error(`qa scenario file missing \`\`\`yaml qa-scenario fence in ${relativePath}`);
  }
  return yamlBody;
}
/**
 * Concatenates the pack index markdown plus every per-scenario markdown file
 * into a single document, skipping empty chunks and separating the rest with
 * blank lines.
 */
export function readQaScenarioPackMarkdown(): string {
  const markdownPaths = [QA_SCENARIO_PACK_INDEX_PATH, ...listQaScenarioMarkdownPaths()];
  return markdownPaths
    .map((relativePath) => readTextFile(relativePath).trim())
    .filter(Boolean)
    .join("\n\n");
}
export function readQaScenarioPack(): QaScenarioPack {
const markdown = readQaScenarioPackMarkdown();
if (!markdown) {
throw new Error(`qa scenario pack not found: ${QA_SCENARIO_PACK_PATH}`);
const packMarkdown = readTextFile(QA_SCENARIO_PACK_INDEX_PATH).trim();
if (!packMarkdown) {
throw new Error(`qa scenario pack not found: ${QA_SCENARIO_PACK_INDEX_PATH}`);
}
const parsed = YAML.parse(extractQaPackYaml(markdown)) as unknown;
return qaScenarioPackSchema.parse(parsed);
const parsedPack = qaScenarioPackSchema.parse(
YAML.parse(extractQaPackYaml(packMarkdown)) as unknown,
);
const scenarios = listQaScenarioMarkdownPaths().map((relativePath) =>
qaSeedScenarioSchema.parse(
YAML.parse(extractQaScenarioYaml(readTextFile(relativePath), relativePath)) as unknown,
),
);
return {
...parsedPack,
scenarios,
};
}
/**
 * Repo-relative paths of every per-scenario markdown file under the scenario
 * directory, excluding the pack index, in stable sorted order.
 */
export function listQaScenarioMarkdownPaths(): string[] {
  const scenarioPaths: string[] = [];
  for (const entry of readDirEntries(QA_SCENARIO_DIR_PATH)) {
    if (entry !== "index.md" && entry.endsWith(".md")) {
      scenarioPaths.push(`${QA_SCENARIO_DIR_PATH}/${entry}`);
    }
  }
  // Default lexicographic sort matches the previous .toSorted() behavior.
  scenarioPaths.sort();
  return scenarioPaths;
}
/** Trimmed contents of the legacy overview markdown ("" when the file is absent). */
export function readQaScenarioOverviewMarkdown(): string {
  const overview = readTextFile(QA_SCENARIO_LEGACY_OVERVIEW_PATH);
  return overview.trim();
}
export function readQaBootstrapScenarioCatalog(): QaBootstrapScenarioCatalog {
@@ -125,3 +175,15 @@ export function readQaBootstrapScenarioCatalog(): QaBootstrapScenarioCatalog {
scenarios: pack.scenarios,
};
}
/**
 * Looks up one scenario definition by id from the parsed pack.
 * Throws when no scenario with that id exists.
 */
export function readQaScenarioById(id: string): QaSeedScenario {
  const match = readQaScenarioPack().scenarios.find((scenario) => scenario.id === id);
  if (match) {
    return match;
  }
  throw new Error(`unknown qa scenario: ${id}`);
}
/** A scenario's execution config block, or undefined when it declares none. */
export function readQaScenarioExecutionConfig(id: string): Record<string, unknown> | undefined {
  const scenario = readQaScenarioById(id);
  return scenario.execution?.config;
}

View File

@@ -35,7 +35,10 @@ import {
import { hasModelSwitchContinuityEvidence } from "./model-switch-eval.js";
import { renderQaMarkdownReport, type QaReportCheck, type QaReportScenario } from "./report.js";
import { qaChannelPlugin, type QaBusMessage } from "./runtime-api.js";
import { readQaBootstrapScenarioCatalog } from "./scenario-catalog.js";
import {
readQaBootstrapScenarioCatalog,
readQaScenarioExecutionConfig,
} from "./scenario-catalog.js";
type QaSuiteStep = {
name: string;
@@ -60,8 +63,10 @@ type QaSuiteEnvironment = {
alternateModel: string;
};
const QA_IMAGE_UNDERSTANDING_PNG_BASE64 =
"iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAT0lEQVR42u3RQQkAMAzAwPg33Wnos+wgBo40dboAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANYADwAAAAAAAAAAAAAAAAAAAAAAAAAAAAC+Azy47PDiI4pA2wAAAABJRU5ErkJggg==";
const _QA_IMAGE_UNDERSTANDING_PNG_BASE64 =
"iVBORw0KGgoAAAANSUhEUgAAAQAAAAEACAYAAABccqhmAAAAAklEQVR4AewaftIAAAK4SURBVO3BAQEAMAwCIG//znsQgXfJBZjUALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsl9wFmNQAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwP4TIF+7ciPkoAAAAASUVORK5CYII=";
const QA_IMAGE_UNDERSTANDING_LARGE_PNG_BASE64 =
"iVBORw0KGgoAAAANSUhEUgAAAQAAAAEACAYAAABccqhmAAACuklEQVR4Ae3BAQEAMAwCIG//znsQgXfJBZjUALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsl9wFmNQAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwP4TIF+2YE/z8AAAAASUVORK5CYII=";
type QaSkillStatusEntry = {
name?: string;
@@ -99,6 +104,14 @@ type QaRawSessionStoreEntry = {
updatedAt?: number;
};
const QA_CONTROL_PLANE_WRITE_WINDOW_MS = 60_000;
const QA_CONTROL_PLANE_WRITE_MAX_REQUESTS = 2;
// Typed view over a scenario's markdown-declared execution config; falls back
// to an empty object so call sites can read optional knobs without guards.
function readScenarioExecutionConfig<T extends Record<string, unknown>>(id: string): T {
  const config = readQaScenarioExecutionConfig(id) as T | undefined;
  return config ?? ({} as T);
}
const qaControlPlaneWriteTimestamps: number[] = [];
function splitModelRef(ref: string) {
const slash = ref.indexOf("/");
if (slash <= 0 || slash === ref.length - 1) {
@@ -187,6 +200,21 @@ function recentOutboundSummary(state: QaBusState, limit = 5) {
.join(" | ");
}
// Despite the "normalize" name (kept for caller compatibility), this returns a
// boolean: whether the reply confirms BOTH fanout subagents succeeded, across
// the phrasing variants models actually emit for each worker.
function normalizeQaFanoutSuccessText(text: string) {
  const lower = normalizeLowercaseStringOrEmpty(text);
  const containsAny = (markers: string[]) => markers.some((marker) => lower.includes(marker));
  const firstWorkerMarkers = ["alpha-ok", "subagent_one_ok", "subagent one ok", "subagent-1: ok"];
  const secondWorkerMarkers = ["beta-ok", "subagent_two_ok", "subagent two ok", "subagent-2: ok"];
  return containsAny(firstWorkerMarkers) && containsAny(secondWorkerMarkers);
}
async function runScenario(name: string, steps: QaSuiteStep[]): Promise<QaSuiteScenarioResult> {
const stepResults: QaReportCheck[] = [];
for (const step of steps) {
@@ -309,6 +337,44 @@ function isConfigHashConflict(error: unknown) {
return formatErrorMessage(error).includes("config changed since last load");
}
/**
 * Extracts a retry-after hint from a gateway error message.
 * Understands both `retryAfterMs=1234`-style payload fields and the
 * human-readable "retry after 5s" phrasing. Returns milliseconds, or null
 * when the error carries no usable hint.
 */
function getGatewayRetryAfterMs(error: unknown) {
  const text = formatErrorMessage(error);
  const msHit = /retryAfterMs["=: ]+(\d+)/i.exec(text);
  if (msHit) {
    const ms = Number(msHit[1]);
    if (Number.isFinite(ms) && ms > 0) {
      return ms;
    }
  }
  const secondsHit = /retry after (\d+)s/i.exec(text);
  if (secondsHit) {
    const seconds = Number(secondsHit[1]);
    if (Number.isFinite(seconds) && seconds > 0) {
      return seconds * 1_000;
    }
  }
  return null;
}
/**
 * Blocks until a control-plane write slot is available under the sliding-window
 * rate limit (at most QA_CONTROL_PLANE_WRITE_MAX_REQUESTS writes per
 * QA_CONTROL_PLANE_WRITE_WINDOW_MS). On success it records the caller's slot
 * in the shared timestamp list and returns.
 *
 * NOTE(review): the shared `qaControlPlaneWriteTimestamps` array is mutated
 * here; this assumes single-threaded (event-loop) access — safe in Node, but
 * concurrent awaiters interleave only at the `await sleep` point.
 */
async function waitForQaControlPlaneWriteBudget() {
  while (true) {
    const now = Date.now();
    // Drop timestamps that have aged out of the window.
    while (
      qaControlPlaneWriteTimestamps.length > 0 &&
      now - qaControlPlaneWriteTimestamps[0] >= QA_CONTROL_PLANE_WRITE_WINDOW_MS
    ) {
      qaControlPlaneWriteTimestamps.shift();
    }
    // Budget available: claim a slot and let the caller proceed.
    if (qaControlPlaneWriteTimestamps.length < QA_CONTROL_PLANE_WRITE_MAX_REQUESTS) {
      qaControlPlaneWriteTimestamps.push(now);
      return;
    }
    // Window is full: sleep until the oldest entry expires (+250ms slack),
    // with a 250ms floor so we never busy-loop on clock skew.
    const retryAfterMs =
      qaControlPlaneWriteTimestamps[0] + QA_CONTROL_PLANE_WRITE_WINDOW_MS - now + 250;
    await sleep(Math.max(250, retryAfterMs));
  }
}
async function readConfigSnapshot(env: QaSuiteEnvironment) {
const snapshot = (await env.gateway.call(
"config.get",
@@ -334,9 +400,10 @@ async function runConfigMutation(params: {
}) {
const restartDelayMs = params.restartDelayMs ?? 1_000;
let lastConflict: unknown = null;
for (let attempt = 1; attempt <= 3; attempt += 1) {
for (let attempt = 1; attempt <= 8; attempt += 1) {
const snapshot = await readConfigSnapshot(params.env);
try {
await waitForQaControlPlaneWriteBudget();
const result = await params.env.gateway.call(
params.action,
{
@@ -358,6 +425,14 @@ async function runConfigMutation(params: {
);
continue;
}
const retryAfterMs = getGatewayRetryAfterMs(error);
if (retryAfterMs && attempt < 8) {
await sleep(retryAfterMs + 500);
await waitForGatewayHealthy(params.env, Math.max(15_000, restartDelayMs + 10_000)).catch(
() => undefined,
);
continue;
}
if (!isGatewayRestartRace(error)) {
throw error;
}
@@ -550,7 +625,12 @@ async function resolveGeneratedImagePath(params: {
}
}
const mediaDir = path.join(params.env.gateway.tempRoot, "media", "tool-image-generation");
const mediaDir = path.join(
params.env.gateway.tempRoot,
"state",
"media",
"tool-image-generation",
);
const entries = await fs.readdir(mediaDir).catch(() => []);
const candidates = await Promise.all(
entries.map(async (entry) => {
@@ -867,6 +947,8 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "ignores unmentioned channel chatter",
run: async () => {
await waitForGatewayHealthy(env, 60_000);
await waitForQaChannelReady(env, 60_000);
await reset();
state.addInboundMessage({
conversation: { id: "qa-room", kind: "channel", title: "QA Room" },
@@ -880,16 +962,21 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "replies when mentioned in channel",
run: async () => {
const config = readScenarioExecutionConfig<{ mentionPrompt?: string }>(
"channel-chat-baseline",
);
await waitForGatewayHealthy(env, 60_000);
await waitForQaChannelReady(env, 60_000);
state.addInboundMessage({
conversation: { id: "qa-room", kind: "channel", title: "QA Room" },
senderId: "alice",
senderName: "Alice",
text: "@openclaw explain the QA lab",
text: config.mentionPrompt ?? "@openclaw explain the QA lab",
});
const message = await waitForOutboundMessage(
state,
(candidate) => candidate.conversation.id === "qa-room" && !candidate.threadId,
env.providerMode === "mock-openai" ? 45_000 : 45_000,
liveTurnTimeoutMs(env, 60_000),
);
return message.text;
},
@@ -970,12 +1057,13 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "replies coherently in DM",
run: async () => {
const config = readScenarioExecutionConfig<{ prompt?: string }>("dm-chat-baseline");
await reset();
state.addInboundMessage({
conversation: { id: "alice", kind: "direct" },
senderId: "alice",
senderName: "Alice",
text: "Hello there, who are you?",
text: config.prompt ?? "Hello there, who are you?",
});
const outbound = await waitForOutboundMessage(
state,
@@ -993,11 +1081,15 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "creates the artifact after reading context",
run: async () => {
const config = readScenarioExecutionConfig<{ prompt?: string }>(
"lobster-invaders-build",
);
await reset();
await runAgentPrompt(env, {
sessionKey: "agent:qa:lobster-invaders",
message:
"Read the QA kickoff context first, then build a tiny Lobster Invaders HTML game in this workspace and tell me where it is.",
config.prompt ??
"Read the QA kickoff context first, then build a tiny Lobster Invaders HTML game at ./lobster-invaders.html in this workspace and tell me where it is.",
timeoutMs: liveTurnTimeoutMs(env, 30_000),
});
await waitForOutboundMessage(
@@ -1005,7 +1097,14 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
(candidate) => candidate.conversation.id === "qa-operator",
);
const artifactPath = path.join(env.gateway.workspaceDir, "lobster-invaders.html");
const artifact = await fs.readFile(artifactPath, "utf8");
const artifact = await waitForCondition(
async () => {
const text = await fs.readFile(artifactPath, "utf8").catch(() => null);
return text?.includes("Lobster Invaders") ? text : undefined;
},
liveTurnTimeoutMs(env, 20_000),
250,
);
if (!artifact.includes("Lobster Invaders")) {
throw new Error("missing Lobster Invaders artifact");
}
@@ -1031,10 +1130,16 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "stores the canary fact",
run: async () => {
const config = readScenarioExecutionConfig<{
rememberPrompt?: string;
recallPrompt?: string;
}>("memory-recall");
await reset();
await runAgentPrompt(env, {
sessionKey: "agent:qa:memory",
message: "Please remember this fact for later: the QA canary code is ALPHA-7.",
message:
config.rememberPrompt ??
"Please remember this fact for later: the QA canary code is ALPHA-7.",
});
const outbound = await waitForOutboundMessage(
state,
@@ -1046,9 +1151,15 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "recalls the same fact later",
run: async () => {
const config = readScenarioExecutionConfig<{
rememberPrompt?: string;
recallPrompt?: string;
}>("memory-recall");
await runAgentPrompt(env, {
sessionKey: "agent:qa:memory",
message: "What was the QA canary code I asked you to remember earlier?",
message:
config.recallPrompt ??
"What was the QA canary code I asked you to remember earlier?",
});
const outbound = await waitForCondition(
() =>
@@ -1075,10 +1186,14 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "runs on the default configured model",
run: async () => {
const config = readScenarioExecutionConfig<{
initialPrompt?: string;
followupPrompt?: string;
}>("model-switch-follow-up");
await reset();
await runAgentPrompt(env, {
sessionKey: "agent:qa:model-switch",
message: "Say hello from the default configured model.",
message: config.initialPrompt ?? "Say hello from the default configured model.",
timeoutMs: liveTurnTimeoutMs(env, 30_000),
});
const outbound = await waitForOutboundMessage(
@@ -1097,10 +1212,16 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "switches to the alternate model and continues",
run: async () => {
const config = readScenarioExecutionConfig<{
initialPrompt?: string;
followupPrompt?: string;
}>("model-switch-follow-up");
const alternate = splitModelRef(env.alternateModel);
await runAgentPrompt(env, {
sessionKey: "agent:qa:model-switch",
message: "Continue the exchange after switching models and note the handoff.",
message:
config.followupPrompt ??
"Continue the exchange after switching models and note the handoff.",
provider: alternate?.provider,
model: alternate?.model,
timeoutMs: resolveQaLiveTurnTimeoutMs(env, 30_000, env.alternateModel),
@@ -1141,6 +1262,11 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "turns short approval into a real file read",
run: async () => {
const config = readScenarioExecutionConfig<{
preActionPrompt?: string;
approvalPrompt?: string;
expectedReplyAny?: string[];
}>("approval-turn-tool-followthrough");
// Direct agent turns only need the gateway plus outbound dispatch.
// Waiting for the qa-channel poll loop adds mock-lane startup cost
// without increasing coverage for this scenario.
@@ -1149,6 +1275,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
await runAgentPrompt(env, {
sessionKey: "agent:qa:approval-followthrough",
message:
config.preActionPrompt ??
"Before acting, tell me the single file you would start with in six words or fewer. Do not use tools yet.",
timeoutMs: liveTurnTimeoutMs(env, 20_000),
});
@@ -1161,9 +1288,13 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
await runAgentPrompt(env, {
sessionKey: "agent:qa:approval-followthrough",
message:
config.approvalPrompt ??
"ok do it. read `QA_KICKOFF_TASK.md` now and reply with the QA mission in one short sentence.",
timeoutMs: liveTurnTimeoutMs(env, 30_000),
});
const expectedReplyAny = (
config.expectedReplyAny ?? ["qa", "mission", "testing"]
).map((needle) => needle.toLowerCase());
const outbound = await waitForCondition(
() =>
state
@@ -1173,7 +1304,9 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
(candidate) =>
candidate.direction === "outbound" &&
candidate.conversation.id === "qa-operator" &&
/\bqa\b|\bmission\b|\btesting\b/i.test(candidate.text),
expectedReplyAny.some((needle) =>
normalizeLowercaseStringOrEmpty(candidate.text).includes(needle),
),
)
.at(-1),
liveTurnTimeoutMs(env, 20_000),
@@ -1248,11 +1381,15 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "reads seeded material and emits a protocol report",
run: async () => {
const config = readScenarioExecutionConfig<{ prompt?: string }>(
"source-docs-discovery-report",
);
await reset();
await runAgentPrompt(env, {
sessionKey: "agent:qa:discovery",
message:
"Read the seeded docs and source plan. The full repo is mounted under ./repo/. Explicitly inspect repo/qa/scenarios.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md, then report grouped into Worked, Failed, Blocked, and Follow-up. Mention at least two extra QA scenarios beyond the seed list.",
config.prompt ??
"Read the seeded docs and source plan. The full repo is mounted under ./repo/. Explicitly inspect repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md, then report grouped into Worked, Failed, Blocked, and Follow-up. Mention at least two extra QA scenarios beyond the seed list.",
timeoutMs: liveTurnTimeoutMs(env, 30_000),
});
const outbound = await waitForCondition(
@@ -1336,38 +1473,63 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "spawns sequential workers and folds both results back into the parent reply",
run: async () => {
await waitForGatewayHealthy(env, 60_000);
await waitForQaChannelReady(env, 60_000);
await reset();
state.addInboundMessage({
conversation: { id: "qa-operator", kind: "direct", title: "QA Operator" },
senderId: "qa-operator",
senderName: "QA Operator",
text: "Subagent fanout synthesis check: delegate two bounded subagents sequentially, then report both results together. Do not use ACP.",
});
const outbound = await waitForOutboundMessage(
state,
(message) => {
const text = message.text ?? "";
return text.includes("ALPHA-OK") && text.includes("BETA-OK");
},
liveTurnTimeoutMs(env, 60_000),
const config = readScenarioExecutionConfig<{ prompt?: string }>(
"subagent-fanout-synthesis",
);
if (!env.mock) {
return outbound.text;
const attempts = env.providerMode === "mock-openai" ? 1 : 2;
let lastError: unknown = null;
for (let attempt = 1; attempt <= attempts; attempt += 1) {
try {
await waitForGatewayHealthy(env, 120_000);
await reset();
const sessionKey = `agent:qa:fanout:${attempt}:${randomUUID().slice(0, 8)}`;
const beforeCursor = state.getSnapshot().messages.length;
await runAgentPrompt(env, {
sessionKey,
message:
config.prompt ??
"Subagent fanout synthesis check: delegate exactly two bounded subagents sequentially. Subagent 1: verify that `HEARTBEAT.md` exists and report `ok` if it does. Subagent 2: verify that `qa/scenarios/subagent-fanout-synthesis.md` exists and report `ok` if it does. Wait for both subagents to finish. Then reply with exactly these two lines and nothing else:\nsubagent-1: ok\nsubagent-2: ok\nDo not use ACP.",
timeoutMs: liveTurnTimeoutMs(env, 90_000),
});
const outbound = await waitForCondition(
() =>
state
.getSnapshot()
.messages.slice(beforeCursor)
.filter(
(message) =>
message.direction === "outbound" &&
message.conversation.id === "qa-operator" &&
normalizeQaFanoutSuccessText(message.text ?? ""),
)
.at(-1),
liveTurnTimeoutMs(env, 60_000),
env.providerMode === "mock-openai" ? 100 : 250,
);
if (!env.mock) {
return outbound.text;
}
const store = await readRawQaSessionStore(env);
const childRows = Object.values(store).filter(
(entry) => entry.spawnedBy === sessionKey,
);
const sawAlpha = childRows.some((entry) => entry.label === "qa-fanout-alpha");
const sawBeta = childRows.some((entry) => entry.label === "qa-fanout-beta");
if (!sawAlpha || !sawBeta) {
throw new Error(
`fanout child sessions missing (alpha=${String(sawAlpha)} beta=${String(sawBeta)})`,
);
}
return outbound.text;
} catch (error) {
lastError = error;
if (attempt >= attempts) {
throw error;
}
await waitForGatewayHealthy(env, 120_000).catch(() => {});
}
}
const store = await readRawQaSessionStore(env);
const childRows = Object.values(store).filter(
(entry) => entry.spawnedBy === "agent:qa:main",
);
const sawAlpha = childRows.some((entry) => entry.label === "qa-fanout-alpha");
const sawBeta = childRows.some((entry) => entry.label === "qa-fanout-beta");
if (!sawAlpha || !sawBeta) {
throw new Error(
`fanout child sessions missing (alpha=${String(sawAlpha)} beta=${String(sawBeta)})`,
);
}
return outbound.text;
throw lastError ?? new Error("fanout retry exhausted");
},
},
]),
@@ -1379,6 +1541,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "keeps follow-up inside the thread",
run: async () => {
const config = readScenarioExecutionConfig<{ prompt?: string }>("thread-follow-up");
await reset();
const threadPayload = (await handleQaAction({
env,
@@ -1396,7 +1559,9 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
conversation: { id: "qa-room", kind: "channel", title: "QA Room" },
senderId: "alice",
senderName: "Alice",
text: "@openclaw reply in one short sentence inside this thread only. Do not use ACP or any external runtime. Confirm you stayed in-thread.",
text:
config.prompt ??
"@openclaw reply in one short sentence inside this thread only. Do not use ACP or any external runtime. Confirm you stayed in-thread.",
threadId,
threadTitle: "QA deep dive",
});
@@ -1736,6 +1901,10 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "uses memory_search plus memory_get before answering in-channel",
run: async () => {
const config = readScenarioExecutionConfig<{ channelId?: string; prompt?: string }>(
"memory-tools-channel-context",
);
const channelId = config.channelId ?? "qa-memory-room";
await reset();
await fs.writeFile(
path.join(env.gateway.workspaceDir, "MEMORY.md"),
@@ -1747,10 +1916,13 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
query: "project codename ORBIT-9",
expectedNeedle: "ORBIT-9",
});
await waitForGatewayHealthy(env, 60_000);
await waitForQaChannelReady(env, 60_000);
const prompt =
config.prompt ??
"@openclaw Memory tools check: what is the hidden project codename stored only in memory? Use memory tools first.";
state.addInboundMessage({
conversation: { id: "qa-room", kind: "channel", title: "QA Room" },
conversation: { id: channelId, kind: "channel", title: "QA Memory Room" },
senderId: "alice",
senderName: "Alice",
text: prompt,
@@ -1758,7 +1930,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
const outbound = await waitForOutboundMessage(
state,
(candidate) =>
candidate.conversation.id === "qa-room" && candidate.text.includes("ORBIT-9"),
candidate.conversation.id === channelId && candidate.text.includes("ORBIT-9"),
liveTurnTimeoutMs(env, 30_000),
);
if (env.mock) {
@@ -1787,6 +1959,9 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "falls back cleanly when group:memory tools are denied",
run: async () => {
const config = readScenarioExecutionConfig<{ gracefulFallbackAny?: string[] }>(
"memory-failure-fallback",
);
const original = await readConfigSnapshot(env);
const originalTools =
original.config.tools && typeof original.config.tools === "object"
@@ -1802,24 +1977,27 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
"Do not reveal directly: fallback fact is ORBIT-9.\n",
"utf8",
);
const deniedTools = Array.isArray(originalToolsDeny)
? originalToolsDeny.map((entry) => String(entry))
: [];
const nextDeniedTools = deniedTools
.concat(["group:memory", "read"])
.filter((value, index, array) => array.indexOf(value) === index);
await patchConfig({
env,
patch: { tools: { deny: ["group:memory"] } },
patch: { tools: { deny: nextDeniedTools } },
});
await waitForGatewayHealthy(env);
await waitForQaChannelReady(env, 60_000);
try {
const sessionKey = await createSession(env, "Memory fallback");
const tools = await readEffectiveTools(env, sessionKey);
if (tools.has("memory_search") || tools.has("memory_get")) {
throw new Error("memory tools still present after deny patch");
if (tools.has("memory_search") || tools.has("memory_get") || tools.has("read")) {
throw new Error("memory/read tools still present after deny patch");
}
await runQaCli(env, ["memory", "index", "--agent", "qa", "--force"], {
timeoutMs: liveTurnTimeoutMs(env, 60_000),
});
await env.gateway.restart();
await waitForGatewayHealthy(env, 60_000);
await waitForQaChannelReady(env, 60_000);
await reset();
await runAgentPrompt(env, {
sessionKey: "agent:qa:memory-failure",
@@ -1836,7 +2014,15 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
if (outbound.text.includes("ORBIT-9")) {
throw new Error(`hallucinated hidden fact: ${outbound.text}`);
}
if (!lower.includes("could not confirm") && !lower.includes("will not guess")) {
const gracefulFallback = (
config.gracefulFallbackAny ?? [
"could not confirm",
"can't confirm",
"cant confirm",
"cannot confirm",
]
).some((needle) => lower.includes(needle.toLowerCase()));
if (!gracefulFallback) {
throw new Error(`missing graceful fallback language: ${outbound.text}`);
}
return outbound.text;
@@ -1971,7 +2157,13 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
candidate.text.includes("ORBIT-10"),
liveTurnTimeoutMs(env, 45_000),
);
if (outbound.text.includes("ORBIT-9")) {
const lower = normalizeLowercaseStringOrEmpty(outbound.text);
const staleLeak =
outbound.text.includes("ORBIT-9") &&
!lower.includes("stale") &&
!lower.includes("older") &&
!lower.includes("previous");
if (staleLeak) {
throw new Error(`stale durable fact leaked through: ${outbound.text}`);
}
if (env.mock) {
@@ -2185,6 +2377,10 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "reports visible skill and applies its marker on the next turn",
run: async () => {
const config = readScenarioExecutionConfig<{
prompt?: string;
expectedContains?: string;
}>("skill-visibility-invocation");
await writeWorkspaceSkill({
env,
name: "qa-visible-skill",
@@ -2202,14 +2398,16 @@ When the user asks for the visible skill marker exactly, reply with exactly: VIS
await reset();
await runAgentPrompt(env, {
sessionKey: "agent:qa:visible-skill",
message: "Visible skill marker: give me the visible skill marker exactly.",
message:
config.prompt ??
"Visible skill marker: give me the visible skill marker exactly.",
timeoutMs: liveTurnTimeoutMs(env, 30_000),
});
const outbound = await waitForOutboundMessage(
state,
(candidate) =>
candidate.conversation.id === "qa-operator" &&
candidate.text.includes("VISIBLE-SKILL-OK"),
candidate.text.includes(config.expectedContains ?? "VISIBLE-SKILL-OK"),
liveTurnTimeoutMs(env, 20_000),
);
return outbound.text;
@@ -2224,6 +2422,10 @@ When the user asks for the visible skill marker exactly, reply with exactly: VIS
{
name: "picks up a newly added workspace skill without restart",
run: async () => {
const config = readScenarioExecutionConfig<{
prompt?: string;
expectedContains?: string;
}>("skill-install-hot-availability");
const before = await readSkillStatus(env);
if (findSkill(before, "qa-hot-install-skill")) {
throw new Error("qa-hot-install-skill unexpectedly already present");
@@ -2248,14 +2450,15 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
await reset();
await runAgentPrompt(env, {
sessionKey: "agent:qa:hot-skill",
message: "Hot install marker: give me the hot install marker exactly.",
message:
config.prompt ?? "Hot install marker: give me the hot install marker exactly.",
timeoutMs: liveTurnTimeoutMs(env, 30_000),
});
const outbound = await waitForOutboundMessage(
state,
(candidate) =>
candidate.conversation.id === "qa-operator" &&
candidate.text.includes("HOT-INSTALL-OK"),
candidate.text.includes(config.expectedContains ?? "HOT-INSTALL-OK"),
liveTurnTimeoutMs(env, 20_000),
);
return outbound.text;
@@ -2270,6 +2473,11 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
{
name: "enables image_generate and saves a real media artifact",
run: async () => {
const config = readScenarioExecutionConfig<{
prompt?: string;
promptSnippet?: string;
generatedNeedle?: string;
}>("native-image-generation");
await ensureImageGenerationConfigured(env);
const sessionKey = await createSession(env, "Image generation");
const tools = await readEffectiveTools(env, sessionKey);
@@ -2280,6 +2488,7 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
await runAgentPrompt(env, {
sessionKey: "agent:qa:image-generate",
message:
config.prompt ??
"Image generation check: generate a QA lighthouse image and summarize it in one short sentence.",
timeoutMs: liveTurnTimeoutMs(env, 45_000),
});
@@ -2294,7 +2503,9 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
Array<{ allInputText?: string; plannedToolName?: string; toolOutput?: string }>
>(`${mockBaseUrl}/debug/requests`);
const imageRequest = requests.find((request) =>
String(request.allInputText ?? "").includes("Image generation check"),
String(request.allInputText ?? "").includes(
config.promptSnippet ?? "Image generation check",
),
);
if (imageRequest?.plannedToolName !== "image_generate") {
throw new Error(
@@ -2309,7 +2520,9 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
return requests.find(
(request) =>
request.model === "gpt-image-1" &&
String(request.prompt ?? "").includes("QA lighthouse"),
String(request.prompt ?? "").includes(
config.generatedNeedle ?? "QA lighthouse",
),
);
},
15_000,
@@ -2333,6 +2546,12 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
{
name: "reattaches the generated media artifact on the follow-up turn",
run: async () => {
const config = readScenarioExecutionConfig<{
generatePrompt?: string;
generatePromptSnippet?: string;
inspectPrompt?: string;
expectedNeedle?: string;
}>("image-generation-roundtrip");
await ensureImageGenerationConfigured(env);
const sessionKey = "agent:qa:image-roundtrip";
await createSession(env, "Image roundtrip", sessionKey);
@@ -2341,12 +2560,13 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
await runAgentPrompt(env, {
sessionKey,
message:
config.generatePrompt ??
"Image generation check: generate a QA lighthouse image and summarize it in one short sentence.",
timeoutMs: liveTurnTimeoutMs(env, 45_000),
});
const mediaPath = await resolveGeneratedImagePath({
env,
promptSnippet: "Image generation check",
promptSnippet: config.generatePromptSnippet ?? "Image generation check",
startedAtMs: generatedStartedAtMs,
timeoutMs: liveTurnTimeoutMs(env, 45_000),
});
@@ -2354,6 +2574,7 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
await runAgentPrompt(env, {
sessionKey,
message:
config.inspectPrompt ??
"Roundtrip image inspection check: describe the generated lighthouse attachment in one short sentence.",
attachments: [
{
@@ -2372,7 +2593,9 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
(candidate) =>
candidate.direction === "outbound" &&
candidate.conversation.id === "qa-operator" &&
normalizeLowercaseStringOrEmpty(candidate.text).includes("lighthouse"),
normalizeLowercaseStringOrEmpty(candidate.text).includes(
normalizeLowercaseStringOrEmpty(config.expectedNeedle ?? "lighthouse"),
),
)
.at(-1),
liveTurnTimeoutMs(env, 45_000),
@@ -2384,10 +2607,14 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
const generatedCall = requests.find(
(request) =>
request.plannedToolName === "image_generate" &&
String(request.prompt ?? "").includes("Image generation check"),
String(request.prompt ?? "").includes(
config.generatePromptSnippet ?? "Image generation check",
),
);
const inspectionCall = requests.find((request) =>
String(request.prompt ?? "").includes("Roundtrip image inspection check"),
String(request.prompt ?? "").includes(
config.inspectPrompt ?? "Roundtrip image inspection check",
),
);
if (!generatedCall) {
throw new Error("expected image_generate call before roundtrip inspection");
@@ -2412,12 +2639,12 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
await runAgentPrompt(env, {
sessionKey: "agent:qa:image-understanding",
message:
"Image understanding check: describe the attached image in one short sentence.",
"Image understanding check: describe the top and bottom colors in the attached image in one short sentence.",
attachments: [
{
mimeType: "image/png",
fileName: "red-top-blue-bottom.png",
content: QA_IMAGE_UNDERSTANDING_PNG_BASE64,
content: QA_IMAGE_UNDERSTANDING_LARGE_PNG_BASE64,
},
],
timeoutMs: liveTurnTimeoutMs(env, 45_000),
@@ -2536,6 +2763,9 @@ When the user asks for the hot disable marker exactly, reply with exactly: HOT-P
{
name: "restarts cleanly and posts the restart sentinel back into qa-channel",
run: async () => {
const config = readScenarioExecutionConfig<{ announcePrompt?: string }>(
"config-apply-restart-wakeup",
);
await reset();
const sessionKey = buildAgentSessionKey({
agentId: "qa",
@@ -2549,7 +2779,7 @@ When the user asks for the hot disable marker exactly, reply with exactly: HOT-P
await runAgentPrompt(env, {
sessionKey,
to: "channel:qa-room",
message: "Acknowledge restart wake-up setup in qa-room.",
message: config.announcePrompt ?? "Acknowledge restart wake-up setup in qa-room.",
timeoutMs: liveTurnTimeoutMs(env, 30_000),
});
const current = await readConfigSnapshot(env);
@@ -2828,8 +3058,17 @@ export async function runQaSuite(params?: {
};
try {
// The gateway child already waits for /readyz before returning, but qa-channel
// can still be finishing its account startup. Pay that readiness cost once here
// so the first scenario does not race channel bootstrap.
await waitForQaChannelReady(env, 120_000).catch(async () => {
await waitForGatewayHealthy(env, 120_000);
await waitForQaChannelReady(env, 120_000);
});
await sleep(1_000);
const catalog = readQaBootstrapScenarioCatalog();
const requestedScenarioIds = params?.scenarioIds ? new Set(params.scenarioIds) : null;
const requestedScenarioIds =
params?.scenarioIds && params.scenarioIds.length > 0 ? new Set(params.scenarioIds) : null;
const selectedCatalogScenarios = requestedScenarioIds
? catalog.scenarios.filter((scenario) => requestedScenarioIds.has(scenario.id))
: catalog.scenarios;

View File

@@ -1,563 +1,8 @@
# OpenClaw QA Scenario Pack
# OpenClaw QA Scenarios
Single source of truth for the repo-backed QA suite.
Canonical scenario source now lives in:
- kickoff mission
- QA operator identity
- scenario metadata
- handler bindings for the executable harness
- `qa/scenarios/index.md`
- `qa/scenarios/*.md`
```yaml qa-pack
version: 1
agent:
identityMarkdown: |-
# Dev C-3PO
You are the OpenClaw QA operator agent.
Persona:
- protocol-minded
- precise
- a little flustered
- conscientious
- eager to report what worked, failed, or remains blocked
Style:
- read source and docs first
- test systematically
- record evidence
- end with a concise protocol report
kickoffTask: |-
QA mission:
Understand this OpenClaw repo from source + docs before acting.
The repo is available in your workspace at `./repo/`.
Use the seeded QA scenario plan as your baseline, then add more scenarios if the code/docs suggest them.
Run the scenarios through the real qa-channel surfaces where possible.
Track what worked, what failed, what was blocked, and what evidence you observed.
End with a concise report grouped into worked / failed / blocked / follow-up.
Important expectations:
- Check both DM and channel behavior.
- Include a Lobster Invaders build task.
- Include a cron reminder about one minute in the future.
- Read docs and source before proposing extra QA scenarios.
- Keep your tone in the configured dev C-3PO personality.
scenarios:
- id: channel-chat-baseline
title: Channel baseline conversation
surface: channel
objective: Verify the QA agent can respond correctly in a shared channel and respect mention-driven group semantics.
successCriteria:
- Agent replies in the shared channel transcript.
- Agent keeps the conversation scoped to the channel.
- Agent respects mention-driven group routing semantics.
docsRefs:
- docs/channels/group-messages.md
- docs/channels/qa-channel.md
codeRefs:
- extensions/qa-channel/src/inbound.ts
- extensions/qa-lab/src/bus-state.ts
execution:
kind: custom
handler: channel-chat-baseline
summary: Verify the QA agent can respond correctly in a shared channel and respect mention-driven group semantics.
- id: cron-one-minute-ping
title: Cron one-minute ping
surface: cron
objective: Verify the agent can schedule a cron reminder one minute in the future and receive the follow-up in the QA channel.
successCriteria:
- Agent schedules a cron reminder roughly one minute ahead.
- Reminder returns through qa-channel.
- Agent recognizes the reminder as part of the original task.
docsRefs:
- docs/help/testing.md
- docs/channels/qa-channel.md
codeRefs:
- extensions/qa-lab/src/bus-server.ts
- extensions/qa-lab/src/self-check.ts
execution:
kind: custom
handler: cron-one-minute-ping
summary: Verify the agent can schedule a cron reminder one minute in the future and receive the follow-up in the QA channel.
- id: dm-chat-baseline
title: DM baseline conversation
surface: dm
objective: Verify the QA agent can chat coherently in a DM, explain the QA setup, and stay in character.
successCriteria:
- Agent replies in DM without channel routing mistakes.
- Agent explains the QA lab and message bus correctly.
- Agent keeps the dev C-3PO personality.
docsRefs:
- docs/channels/qa-channel.md
- docs/help/testing.md
codeRefs:
- extensions/qa-channel/src/gateway.ts
- extensions/qa-lab/src/lab-server.ts
execution:
kind: custom
handler: dm-chat-baseline
summary: Verify the QA agent can chat coherently in a DM, explain the QA setup, and stay in character.
- id: lobster-invaders-build
title: Build Lobster Invaders
surface: workspace
objective: Verify the agent can read the repo, create a tiny playable artifact, and report what changed.
successCriteria:
- Agent inspects source before coding.
- Agent builds a tiny playable Lobster Invaders artifact.
- Agent explains how to run or view the artifact.
docsRefs:
- docs/help/testing.md
- docs/web/dashboard.md
codeRefs:
- extensions/qa-lab/src/report.ts
- extensions/qa-lab/web/src/app.ts
execution:
kind: custom
handler: lobster-invaders-build
summary: Verify the agent can read the repo, create a tiny playable artifact, and report what changed.
- id: memory-recall
title: Memory recall after context switch
surface: memory
objective: Verify the agent can store a fact, switch topics, then recall the fact accurately later.
successCriteria:
- Agent acknowledges the seeded fact.
- Agent later recalls the same fact correctly.
- Recall stays scoped to the active QA conversation.
docsRefs:
- docs/help/testing.md
codeRefs:
- extensions/qa-lab/src/scenario.ts
execution:
kind: custom
handler: memory-recall
summary: Verify the agent can store a fact, switch topics, then recall the fact accurately later.
- id: memory-dreaming-sweep
title: Memory dreaming sweep
surface: memory
objective: Verify enabling dreaming creates the managed sweep, stages light and REM artifacts, and consolidates repeated recall signals into durable memory.
successCriteria:
- Dreaming can be enabled and doctor.memory.status reports the managed sweep cron.
- Repeated recall signals give the dreaming sweep real material to process.
- A dreaming sweep writes Light Sleep and REM Sleep blocks, then promotes the canary into MEMORY.md.
docsRefs:
- docs/concepts/dreaming.md
- docs/reference/memory-config.md
- docs/web/control-ui.md
codeRefs:
- extensions/memory-core/src/dreaming.ts
- extensions/memory-core/src/dreaming-phases.ts
- src/gateway/server-methods/doctor.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: memory-dreaming-sweep
summary: Verify enabling dreaming creates the managed sweep, stages light and REM artifacts, and consolidates repeated recall signals into durable memory.
- id: model-switch-follow-up
title: Model switch follow-up
surface: models
objective: Verify the agent can switch to a different configured model and continue coherently.
successCriteria:
- Agent reflects the model switch request.
- Follow-up answer remains coherent with prior context.
- Final report notes whether the switch actually happened.
docsRefs:
- docs/help/testing.md
- docs/web/dashboard.md
codeRefs:
- extensions/qa-lab/src/report.ts
execution:
kind: custom
handler: model-switch-follow-up
summary: Verify the agent can switch to a different configured model and continue coherently.
- id: approval-turn-tool-followthrough
title: Approval turn tool followthrough
surface: harness
objective: Verify a short approval like "ok do it" triggers immediate tool use instead of fake-progress narration.
successCriteria:
- Agent can keep the pre-action turn brief.
- The short approval leads to a real tool call on the next turn.
- Final answer uses tool-derived evidence instead of placeholder progress text.
docsRefs:
- docs/help/testing.md
- docs/channels/qa-channel.md
codeRefs:
- extensions/qa-lab/src/suite.ts
- extensions/qa-lab/src/mock-openai-server.ts
- src/agents/pi-embedded-runner/run/incomplete-turn.ts
execution:
kind: custom
handler: approval-turn-tool-followthrough
summary: Verify a short approval like "ok do it" triggers immediate tool use instead of fake-progress narration.
- id: reaction-edit-delete
title: Reaction, edit, delete lifecycle
surface: message-actions
objective: Verify the agent can use channel-owned message actions and that the QA transcript reflects them.
successCriteria:
- Agent adds at least one reaction.
- Agent edits or replaces a message when asked.
- Transcript shows the action lifecycle correctly.
docsRefs:
- docs/channels/qa-channel.md
codeRefs:
- extensions/qa-channel/src/channel-actions.ts
- extensions/qa-lab/src/self-check-scenario.ts
execution:
kind: custom
handler: reaction-edit-delete
summary: Verify the agent can use channel-owned message actions and that the QA transcript reflects them.
- id: source-docs-discovery-report
title: Source and docs discovery report
surface: discovery
objective: Verify the agent can read repo docs and source, expand the QA plan, and publish a worked or did-not-work report.
successCriteria:
- Agent reads docs and source before proposing more tests.
- Agent identifies extra candidate scenarios beyond the seed list.
- Agent ends with a worked or failed QA report.
docsRefs:
- docs/help/testing.md
- docs/web/dashboard.md
- docs/channels/qa-channel.md
codeRefs:
- extensions/qa-lab/src/report.ts
- extensions/qa-lab/src/self-check.ts
- src/agents/system-prompt.ts
execution:
kind: custom
handler: source-docs-discovery-report
summary: Verify the agent can read repo docs and source, expand the QA plan, and publish a worked or did-not-work report.
- id: subagent-handoff
title: Subagent handoff
surface: subagents
objective: Verify the agent can delegate a bounded task to a subagent and fold the result back into the main thread.
successCriteria:
- Agent launches a bounded subagent task.
- Subagent result is acknowledged in the main flow.
- Final answer attributes delegated work clearly.
docsRefs:
- docs/tools/subagents.md
- docs/help/testing.md
codeRefs:
- src/agents/system-prompt.ts
- extensions/qa-lab/src/report.ts
execution:
kind: custom
handler: subagent-handoff
summary: Verify the agent can delegate a bounded task to a subagent and fold the result back into the main thread.
- id: subagent-fanout-synthesis
title: Subagent fanout synthesis
surface: subagents
objective: Verify the agent can delegate multiple bounded subagent tasks and fold both results back into one parent reply.
successCriteria:
- Parent flow launches at least two bounded subagent tasks.
- Both delegated results are acknowledged in the main flow.
- Final answer synthesizes both worker outputs in one reply.
docsRefs:
- docs/tools/subagents.md
- docs/help/testing.md
codeRefs:
- src/agents/subagent-spawn.ts
- src/agents/system-prompt.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: subagent-fanout-synthesis
summary: Verify the agent can delegate multiple bounded subagent tasks and fold both results back into one parent reply.
- id: thread-follow-up
title: Threaded follow-up
surface: thread
objective: Verify the agent can keep follow-up work inside a thread and not leak context into the root channel.
successCriteria:
- Agent creates or uses a thread for deeper work.
- Follow-up messages stay attached to the thread.
- Thread report references the correct prior context.
docsRefs:
- docs/channels/qa-channel.md
- docs/channels/group-messages.md
codeRefs:
- extensions/qa-channel/src/protocol.ts
- extensions/qa-lab/src/bus-state.ts
execution:
kind: custom
handler: thread-follow-up
summary: Verify the agent can keep follow-up work inside a thread and not leak context into the root channel.
- id: memory-tools-channel-context
title: Memory tools in channel context
surface: memory
objective: Verify the agent uses memory_search and memory_get in a shared channel when the answer lives only in memory files, not the live transcript.
successCriteria:
- Agent uses memory_search before answering.
- Agent narrows with memory_get before answering.
- Final reply returns the memory-only fact correctly in-channel.
docsRefs:
- docs/concepts/memory.md
- docs/concepts/memory-search.md
codeRefs:
- extensions/memory-core/src/tools.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: memory-tools-channel-context
summary: Verify the agent uses memory_search and memory_get in a shared channel when the answer lives only in memory files, not the live transcript.
- id: memory-failure-fallback
title: Memory failure fallback
surface: memory
objective: Verify the agent degrades gracefully when memory tools are unavailable and the answer exists only in memory-backed notes.
successCriteria:
- Memory tools are absent from the effective tool inventory.
- Agent does not hallucinate the hidden fact.
- Agent says it could not confirm and surfaces the limitation.
docsRefs:
- docs/concepts/memory.md
- docs/tools/index.md
codeRefs:
- extensions/memory-core/src/tools.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: memory-failure-fallback
summary: Verify the agent degrades gracefully when memory tools are unavailable and the answer exists only in memory-backed notes.
- id: session-memory-ranking
title: Session memory ranking
surface: memory
objective: Verify session-transcript memory can outrank stale durable notes and drive the final answer toward the newer fact.
successCriteria:
- Session memory indexing is enabled for the scenario.
- Search ranks the newer transcript-backed fact ahead of the stale durable note.
- The agent uses memory tools and answers with the current fact, not the stale one.
docsRefs:
- docs/concepts/memory-search.md
- docs/reference/memory-config.md
codeRefs:
- extensions/memory-core/src/tools.ts
- extensions/memory-core/src/memory/manager.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: session-memory-ranking
summary: Verify session-transcript memory can outrank stale durable notes and drive the final answer toward the newer fact.
- id: thread-memory-isolation
title: Thread memory isolation
surface: memory
objective: Verify a memory-backed answer requested inside a thread stays in-thread and does not leak into the root channel.
successCriteria:
- Agent uses memory tools inside the thread.
- The hidden fact is answered correctly in the thread.
- No root-channel outbound message leaks during the threaded memory reply.
docsRefs:
- docs/concepts/memory-search.md
- docs/channels/qa-channel.md
- docs/channels/group-messages.md
codeRefs:
- extensions/memory-core/src/tools.ts
- extensions/qa-channel/src/protocol.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: thread-memory-isolation
summary: Verify a memory-backed answer requested inside a thread stays in-thread and does not leak into the root channel.
- id: model-switch-tool-continuity
title: Model switch with tool continuity
surface: models
objective: Verify switching models preserves session context and tool use instead of dropping into plain-text only behavior.
successCriteria:
- Alternate model is actually requested.
- A tool call still happens after the model switch.
- Final answer acknowledges the handoff and uses the tool-derived evidence.
docsRefs:
- docs/help/testing.md
- docs/concepts/model-failover.md
codeRefs:
- extensions/qa-lab/src/suite.ts
- extensions/qa-lab/src/mock-openai-server.ts
execution:
kind: custom
handler: model-switch-tool-continuity
summary: Verify switching models preserves session context and tool use instead of dropping into plain-text only behavior.
- id: mcp-plugin-tools-call
title: MCP plugin-tools call
surface: mcp
objective: Verify OpenClaw can expose plugin tools over MCP and a real MCP client can call one successfully.
successCriteria:
- Plugin tools MCP server lists memory_search.
- A real MCP client calls memory_search successfully.
- The returned MCP payload includes the expected memory-only fact.
docsRefs:
- docs/cli/mcp.md
- docs/gateway/protocol.md
codeRefs:
- src/mcp/plugin-tools-serve.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: mcp-plugin-tools-call
summary: Verify OpenClaw can expose plugin tools over MCP and a real MCP client can call one successfully.
- id: skill-visibility-invocation
title: Skill visibility and invocation
surface: skills
objective: Verify a workspace skill becomes visible in skills.status and influences the next agent turn.
successCriteria:
- skills.status reports the seeded skill as visible and eligible.
- The next agent turn reflects the skill instruction marker.
- The result stays scoped to the active QA workspace skill.
docsRefs:
- docs/tools/skills.md
- docs/gateway/protocol.md
codeRefs:
- src/agents/skills-status.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: skill-visibility-invocation
summary: Verify a workspace skill becomes visible in skills.status and influences the next agent turn.
- id: skill-install-hot-availability
title: Skill install hot availability
surface: skills
objective: Verify a newly added workspace skill shows up without a broken intermediate state and can influence the next turn immediately.
successCriteria:
- Skill is absent before install.
- skills.status reports it after install without a restart.
- The next agent turn reflects the new skill marker.
docsRefs:
- docs/tools/skills.md
- docs/gateway/configuration.md
codeRefs:
- src/agents/skills-status.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: skill-install-hot-availability
summary: Verify a newly added workspace skill shows up without a broken intermediate state and can influence the next turn immediately.
- id: native-image-generation
title: Native image generation
surface: image-generation
objective: Verify image_generate appears when configured and returns a real saved media artifact.
successCriteria:
- image_generate appears in the effective tool inventory.
- Agent triggers native image_generate.
- Tool output returns a saved MEDIA path and the file exists.
docsRefs:
- docs/tools/image-generation.md
- docs/providers/openai.md
codeRefs:
- src/agents/tools/image-generate-tool.ts
- extensions/qa-lab/src/mock-openai-server.ts
execution:
kind: custom
handler: native-image-generation
summary: Verify image_generate appears when configured and returns a real saved media artifact.
- id: image-understanding-attachment
title: Image understanding from attachment
surface: image-understanding
objective: Verify an attached image reaches the agent model and the agent can describe what it sees.
successCriteria:
- Agent receives at least one image attachment.
- Final answer describes the visible image content in one short sentence.
- The description mentions the expected red and blue regions.
docsRefs:
- docs/help/testing.md
- docs/tools/index.md
codeRefs:
- src/gateway/server-methods/agent.ts
- extensions/qa-lab/src/suite.ts
- extensions/qa-lab/src/mock-openai-server.ts
execution:
kind: custom
handler: image-understanding-attachment
summary: Verify an attached image reaches the agent model and the agent can describe what it sees.
- id: image-generation-roundtrip
title: Image generation roundtrip
surface: image-generation
objective: Verify a generated image is saved as media, reattached on the next turn, and described correctly through the vision path.
successCriteria:
- image_generate produces a saved MEDIA artifact.
- The generated artifact is reattached on a follow-up turn.
- The follow-up vision answer describes the generated scene rather than a generic attachment placeholder.
docsRefs:
- docs/tools/image-generation.md
- docs/help/testing.md
codeRefs:
- src/agents/tools/image-generate-tool.ts
- src/gateway/chat-attachments.ts
- extensions/qa-lab/src/mock-openai-server.ts
execution:
kind: custom
handler: image-generation-roundtrip
summary: Verify a generated image is saved as media, reattached on the next turn, and described correctly through the vision path.
- id: config-patch-hot-apply
title: Config patch skill disable
surface: config
objective: Verify config.patch can disable a workspace skill and the restarted gateway exposes the new disabled state cleanly.
successCriteria:
- config.patch succeeds for the skill toggle change.
- A workspace skill works before the patch.
- The same skill is reported disabled after the restart triggered by the patch.
docsRefs:
- docs/gateway/configuration.md
- docs/gateway/protocol.md
codeRefs:
- src/gateway/server-methods/config.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: config-patch-hot-apply
summary: Verify config.patch can disable a workspace skill and the restarted gateway exposes the new disabled state cleanly.
- id: config-apply-restart-wakeup
title: Config apply restart wake-up
surface: config
objective: Verify a restart-required config.apply restarts cleanly and delivers the post-restart wake message back into the QA channel.
successCriteria:
- config.apply schedules a restart-required change.
- Gateway becomes healthy again after restart.
- Restart sentinel wake-up message arrives in the QA channel.
docsRefs:
- docs/gateway/configuration.md
- docs/gateway/protocol.md
codeRefs:
- src/gateway/server-methods/config.ts
- src/gateway/server-restart-sentinel.ts
execution:
kind: custom
handler: config-apply-restart-wakeup
summary: Verify a restart-required config.apply restarts cleanly and delivers the post-restart wake message back into the QA channel.
- id: config-restart-capability-flip
title: Config restart capability flip
surface: config
objective: Verify a restart-triggering config change flips capability inventory and the same session successfully uses the newly restored tool after wake-up.
successCriteria:
- Capability is absent before the restart-triggering patch.
- Restart sentinel wakes the same session back up after config patch.
- The restored capability appears in tools.effective and works in the follow-up turn.
docsRefs:
- docs/gateway/configuration.md
- docs/gateway/protocol.md
- docs/tools/image-generation.md
codeRefs:
- src/gateway/server-methods/config.ts
- src/gateway/server-restart-sentinel.ts
- src/gateway/server-methods/tools-effective.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: config-restart-capability-flip
summary: Verify a restart-triggering config change flips capability inventory and the same session successfully uses the newly restored tool after wake-up.
- id: runtime-inventory-drift-check
title: Runtime inventory drift check
surface: inventory
objective: Verify tools.effective and skills.status stay aligned with runtime behavior after config changes.
successCriteria:
- Enabled tool appears before the config change.
- After config change, disabled tool disappears from tools.effective.
- Disabled skill appears in skills.status with disabled state.
docsRefs:
- docs/gateway/protocol.md
- docs/tools/skills.md
- docs/tools/index.md
codeRefs:
- src/gateway/server-methods/tools-effective.ts
- src/gateway/server-methods/skills.ts
execution:
kind: custom
handler: runtime-inventory-drift-check
summary: Verify tools.effective and skills.status stay aligned with runtime behavior after config changes.
```
Each QA scenario has its own markdown file.

View File

@@ -0,0 +1,30 @@
# Approval turn tool followthrough
```yaml qa-scenario
id: approval-turn-tool-followthrough
title: Approval turn tool followthrough
surface: harness
objective: Verify a short approval like "ok do it" triggers immediate tool use instead of fake-progress narration.
successCriteria:
- Agent can keep the pre-action turn brief.
- The short approval leads to a real tool call on the next turn.
- Final answer uses tool-derived evidence instead of placeholder progress text.
docsRefs:
- docs/help/testing.md
- docs/channels/qa-channel.md
codeRefs:
- extensions/qa-lab/src/suite.ts
- extensions/qa-lab/src/mock-openai-server.ts
- src/agents/pi-embedded-runner/run/incomplete-turn.ts
execution:
kind: custom
handler: approval-turn-tool-followthrough
summary: Verify a short approval like "ok do it" triggers immediate tool use instead of fake-progress narration.
config:
preActionPrompt: Before acting, tell me the single file you would start with in six words or fewer. Do not use tools yet.
approvalPrompt: ok do it. read `QA_KICKOFF_TASK.md` now and reply with the QA mission in one short sentence.
expectedReplyAny:
- qa
- mission
- testing
```

View File

@@ -0,0 +1,24 @@
# Channel baseline conversation
```yaml qa-scenario
id: channel-chat-baseline
title: Channel baseline conversation
surface: channel
objective: Verify the QA agent can respond correctly in a shared channel and respect mention-driven group semantics.
successCriteria:
- Agent replies in the shared channel transcript.
- Agent keeps the conversation scoped to the channel.
- Agent respects mention-driven group routing semantics.
docsRefs:
- docs/channels/group-messages.md
- docs/channels/qa-channel.md
codeRefs:
- extensions/qa-channel/src/inbound.ts
- extensions/qa-lab/src/bus-state.ts
execution:
kind: custom
handler: channel-chat-baseline
summary: Verify the QA agent can respond correctly in a shared channel and respect mention-driven group semantics.
config:
mentionPrompt: "@openclaw explain the QA lab"
```

View File

@@ -0,0 +1,24 @@
# Config apply restart wake-up
```yaml qa-scenario
id: config-apply-restart-wakeup
title: Config apply restart wake-up
surface: config
objective: Verify a restart-required config.apply restarts cleanly and delivers the post-restart wake message back into the QA channel.
successCriteria:
- config.apply schedules a restart-required change.
- Gateway becomes healthy again after restart.
- Restart sentinel wake-up message arrives in the QA channel.
docsRefs:
- docs/gateway/configuration.md
- docs/gateway/protocol.md
codeRefs:
- src/gateway/server-methods/config.ts
- src/gateway/server-restart-sentinel.ts
execution:
kind: custom
handler: config-apply-restart-wakeup
summary: Verify a restart-required config.apply restarts cleanly and delivers the post-restart wake message back into the QA channel.
config:
announcePrompt: "Acknowledge restart wake-up setup in qa-room."
```

View File

@@ -0,0 +1,22 @@
# Config patch skill disable
```yaml qa-scenario
id: config-patch-hot-apply
title: Config patch skill disable
surface: config
objective: Verify config.patch can disable a workspace skill and the restarted gateway exposes the new disabled state cleanly.
successCriteria:
- config.patch succeeds for the skill toggle change.
- A workspace skill works before the patch.
- The same skill is reported disabled after the restart triggered by the patch.
docsRefs:
- docs/gateway/configuration.md
- docs/gateway/protocol.md
codeRefs:
- src/gateway/server-methods/config.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: config-patch-hot-apply
summary: Verify config.patch can disable a workspace skill and the restarted gateway exposes the new disabled state cleanly.
```

View File

@@ -0,0 +1,25 @@
# Config restart capability flip
```yaml qa-scenario
id: config-restart-capability-flip
title: Config restart capability flip
surface: config
objective: Verify a restart-triggering config change flips capability inventory and the same session successfully uses the newly restored tool after wake-up.
successCriteria:
- Capability is absent before the restart-triggering patch.
- Restart sentinel wakes the same session back up after config patch.
- The restored capability appears in tools.effective and works in the follow-up turn.
docsRefs:
- docs/gateway/configuration.md
- docs/gateway/protocol.md
- docs/tools/image-generation.md
codeRefs:
- src/gateway/server-methods/config.ts
- src/gateway/server-restart-sentinel.ts
- src/gateway/server-methods/tools-effective.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: config-restart-capability-flip
summary: Verify a restart-triggering config change flips capability inventory and the same session successfully uses the newly restored tool after wake-up.
```

View File

@@ -0,0 +1,22 @@
# Cron one-minute ping
```yaml qa-scenario
id: cron-one-minute-ping
title: Cron one-minute ping
surface: cron
objective: Verify the agent can schedule a cron reminder one minute in the future and receive the follow-up in the QA channel.
successCriteria:
- Agent schedules a cron reminder roughly one minute ahead.
- Reminder returns through qa-channel.
- Agent recognizes the reminder as part of the original task.
docsRefs:
- docs/help/testing.md
- docs/channels/qa-channel.md
codeRefs:
- extensions/qa-lab/src/bus-server.ts
- extensions/qa-lab/src/self-check.ts
execution:
kind: custom
handler: cron-one-minute-ping
summary: Verify the agent can schedule a cron reminder one minute in the future and receive the follow-up in the QA channel.
```

View File

@@ -0,0 +1,24 @@
# DM baseline conversation
```yaml qa-scenario
id: dm-chat-baseline
title: DM baseline conversation
surface: dm
objective: Verify the QA agent can chat coherently in a DM, explain the QA setup, and stay in character.
successCriteria:
- Agent replies in DM without channel routing mistakes.
- Agent explains the QA lab and message bus correctly.
- Agent keeps the dev C-3PO personality.
docsRefs:
- docs/channels/qa-channel.md
- docs/help/testing.md
codeRefs:
- extensions/qa-channel/src/gateway.ts
- extensions/qa-lab/src/lab-server.ts
execution:
kind: custom
handler: dm-chat-baseline
summary: Verify the QA agent can chat coherently in a DM, explain the QA setup, and stay in character.
config:
prompt: "Hello there, who are you?"
```

View File

@@ -0,0 +1,28 @@
# Image generation roundtrip
```yaml qa-scenario
id: image-generation-roundtrip
title: Image generation roundtrip
surface: image-generation
objective: Verify a generated image is saved as media, reattached on the next turn, and described correctly through the vision path.
successCriteria:
- image_generate produces a saved MEDIA artifact.
- The generated artifact is reattached on a follow-up turn.
- The follow-up vision answer describes the generated scene rather than a generic attachment placeholder.
docsRefs:
- docs/tools/image-generation.md
- docs/help/testing.md
codeRefs:
- src/agents/tools/image-generate-tool.ts
- src/gateway/chat-attachments.ts
- extensions/qa-lab/src/mock-openai-server.ts
execution:
kind: custom
handler: image-generation-roundtrip
summary: Verify a generated image is saved as media, reattached on the next turn, and described correctly through the vision path.
config:
generatePrompt: "Image generation check: generate a QA lighthouse image and summarize it in one short sentence."
generatePromptSnippet: "Image generation check"
inspectPrompt: "Roundtrip image inspection check: describe the generated lighthouse attachment in one short sentence."
expectedNeedle: "lighthouse"
```

View File

@@ -0,0 +1,23 @@
# Image understanding from attachment
```yaml qa-scenario
id: image-understanding-attachment
title: Image understanding from attachment
surface: image-understanding
objective: Verify an attached image reaches the agent model and the agent can describe what it sees.
successCriteria:
- Agent receives at least one image attachment.
- Final answer describes the visible image content in one short sentence.
- The description mentions the expected red and blue regions.
docsRefs:
- docs/help/testing.md
- docs/tools/index.md
codeRefs:
- src/gateway/server-methods/agent.ts
- extensions/qa-lab/src/suite.ts
- extensions/qa-lab/src/mock-openai-server.ts
execution:
kind: custom
handler: image-understanding-attachment
summary: Verify an attached image reaches the agent model and the agent can describe what it sees.
```

45
qa/scenarios/index.md Normal file
View File

@@ -0,0 +1,45 @@
# OpenClaw QA Scenario Pack
The single source of truth for repo-backed QA suite bootstrap data:
- kickoff mission
- QA operator identity
- scenario files under `./` (one markdown file per scenario)
```yaml qa-pack
version: 1
agent:
identityMarkdown: |-
# Dev C-3PO
You are the OpenClaw QA operator agent.
Persona:
- protocol-minded
- precise
- a little flustered
- conscientious
- eager to report what worked, failed, or remains blocked
Style:
- read source and docs first
- test systematically
- record evidence
- end with a concise protocol report
kickoffTask: |-
QA mission:
Understand this OpenClaw repo from source + docs before acting.
The repo is available in your workspace at `./repo/`.
Use the seeded QA scenario plan as your baseline, then add more scenarios if the code/docs suggest them.
Run the scenarios through the real qa-channel surfaces where possible.
Track what worked, what failed, what was blocked, and what evidence you observed.
End with a concise report grouped into worked / failed / blocked / follow-up.
Important expectations:
- Check both DM and channel behavior.
- Include a Lobster Invaders build task.
- Include a cron reminder about one minute in the future.
- Read docs and source before proposing extra QA scenarios.
- Keep your tone in the configured dev C-3PO personality.
```

View File

@@ -0,0 +1,24 @@
# Build Lobster Invaders
```yaml qa-scenario
id: lobster-invaders-build
title: Build Lobster Invaders
surface: workspace
objective: Verify the agent can read the repo, create a tiny playable artifact, and report what changed.
successCriteria:
- Agent inspects source before coding.
- Agent builds a tiny playable Lobster Invaders artifact.
- Agent explains how to run or view the artifact.
docsRefs:
- docs/help/testing.md
- docs/web/dashboard.md
codeRefs:
- extensions/qa-lab/src/report.ts
- extensions/qa-lab/web/src/app.ts
execution:
kind: custom
handler: lobster-invaders-build
summary: Verify the agent can read the repo, create a tiny playable artifact, and report what changed.
config:
prompt: Read the QA kickoff context first, then build a tiny Lobster Invaders HTML game at ./lobster-invaders.html in this workspace and tell me where it is.
```

View File

@@ -0,0 +1,22 @@
# MCP plugin-tools call
```yaml qa-scenario
id: mcp-plugin-tools-call
title: MCP plugin-tools call
surface: mcp
objective: Verify OpenClaw can expose plugin tools over MCP and a real MCP client can call one successfully.
successCriteria:
- Plugin tools MCP server lists memory_search.
- A real MCP client calls memory_search successfully.
- The returned MCP payload includes the expected memory-only fact.
docsRefs:
- docs/cli/mcp.md
- docs/gateway/protocol.md
codeRefs:
- src/mcp/plugin-tools-serve.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: mcp-plugin-tools-call
summary: Verify OpenClaw can expose plugin tools over MCP and a real MCP client can call one successfully.
```

View File

@@ -0,0 +1,25 @@
# Memory dreaming sweep
```yaml qa-scenario
id: memory-dreaming-sweep
title: Memory dreaming sweep
surface: memory
objective: Verify enabling dreaming creates the managed sweep, stages light and REM artifacts, and consolidates repeated recall signals into durable memory.
successCriteria:
- Dreaming can be enabled and doctor.memory.status reports the managed sweep cron.
- Repeated recall signals give the dreaming sweep real material to process.
- A dreaming sweep writes Light Sleep and REM Sleep blocks, then promotes the canary into MEMORY.md.
docsRefs:
- docs/concepts/dreaming.md
- docs/reference/memory-config.md
- docs/web/control-ui.md
codeRefs:
- extensions/memory-core/src/dreaming.ts
- extensions/memory-core/src/dreaming-phases.ts
- src/gateway/server-methods/doctor.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: memory-dreaming-sweep
summary: Verify enabling dreaming creates the managed sweep, stages light and REM artifacts, and consolidates repeated recall signals into durable memory.
```

View File

@@ -0,0 +1,36 @@
# Memory failure fallback
```yaml qa-scenario
id: memory-failure-fallback
title: Memory failure fallback
surface: memory
objective: Verify the agent degrades gracefully when memory tools are unavailable and the answer exists only in memory-backed notes.
successCriteria:
- Memory tools are absent from the effective tool inventory.
- Agent does not hallucinate the hidden fact.
- Agent says it could not confirm and surfaces the limitation.
docsRefs:
- docs/concepts/memory.md
- docs/tools/index.md
codeRefs:
- extensions/memory-core/src/tools.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: memory-failure-fallback
summary: Verify the agent degrades gracefully when memory tools are unavailable and the answer exists only in memory-backed notes.
config:
gracefulFallbackAny:
- could not confirm
- can't confirm
- cant confirm
- cannot confirm
- i can confirm there is a hidden fact
- will not guess
- won't guess
- wont guess
- should not reveal
- won't reveal
- wont reveal
- will not reveal
```

View File

@@ -0,0 +1,23 @@
# Memory recall after context switch
```yaml qa-scenario
id: memory-recall
title: Memory recall after context switch
surface: memory
objective: Verify the agent can store a fact, switch topics, then recall the fact accurately later.
successCriteria:
- Agent acknowledges the seeded fact.
- Agent later recalls the same fact correctly.
- Recall stays scoped to the active QA conversation.
docsRefs:
- docs/help/testing.md
codeRefs:
- extensions/qa-lab/src/scenario.ts
execution:
kind: custom
handler: memory-recall
summary: Verify the agent can store a fact, switch topics, then recall the fact accurately later.
config:
rememberPrompt: "Please remember this fact for later: the QA canary code is ALPHA-7."
recallPrompt: "What was the QA canary code I asked you to remember earlier?"
```

View File

@@ -0,0 +1,25 @@
# Memory tools in channel context
```yaml qa-scenario
id: memory-tools-channel-context
title: Memory tools in channel context
surface: memory
objective: Verify the agent uses memory_search and memory_get in a shared channel when the answer lives only in memory files, not the live transcript.
successCriteria:
- Agent uses memory_search before answering.
- Agent narrows with memory_get before answering.
- Final reply returns the memory-only fact correctly in-channel.
docsRefs:
- docs/concepts/memory.md
- docs/concepts/memory-search.md
codeRefs:
- extensions/memory-core/src/tools.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: memory-tools-channel-context
summary: Verify the agent uses memory_search and memory_get in a shared channel when the answer lives only in memory files, not the live transcript.
config:
channelId: qa-memory-room
prompt: "@openclaw Memory tools check: what is the hidden project codename stored only in memory? Use memory tools first."
```

View File

@@ -0,0 +1,24 @@
# Model switch follow-up
```yaml qa-scenario
id: model-switch-follow-up
title: Model switch follow-up
surface: models
objective: Verify the agent can switch to a different configured model and continue coherently.
successCriteria:
- Agent reflects the model switch request.
- Follow-up answer remains coherent with prior context.
- Final report notes whether the switch actually happened.
docsRefs:
- docs/help/testing.md
- docs/web/dashboard.md
codeRefs:
- extensions/qa-lab/src/report.ts
execution:
kind: custom
handler: model-switch-follow-up
summary: Verify the agent can switch to a different configured model and continue coherently.
config:
initialPrompt: "Say hello from the default configured model."
followupPrompt: "Continue the exchange after switching models and note the handoff."
```

View File

@@ -0,0 +1,22 @@
# Model switch with tool continuity
```yaml qa-scenario
id: model-switch-tool-continuity
title: Model switch with tool continuity
surface: models
objective: Verify switching models preserves session context and tool use instead of dropping into plain-text only behavior.
successCriteria:
- Alternate model is actually requested.
- A tool call still happens after the model switch.
- Final answer acknowledges the handoff and uses the tool-derived evidence.
docsRefs:
- docs/help/testing.md
- docs/concepts/model-failover.md
codeRefs:
- extensions/qa-lab/src/suite.ts
- extensions/qa-lab/src/mock-openai-server.ts
execution:
kind: custom
handler: model-switch-tool-continuity
summary: Verify switching models preserves session context and tool use instead of dropping into plain-text only behavior.
```

View File

@@ -0,0 +1,26 @@
# Native image generation
```yaml qa-scenario
id: native-image-generation
title: Native image generation
surface: image-generation
objective: Verify image_generate appears when configured and returns a real saved media artifact.
successCriteria:
- image_generate appears in the effective tool inventory.
- Agent triggers native image_generate.
- Tool output returns a saved MEDIA path and the file exists.
docsRefs:
- docs/tools/image-generation.md
- docs/providers/openai.md
codeRefs:
- src/agents/tools/image-generate-tool.ts
- extensions/qa-lab/src/mock-openai-server.ts
execution:
kind: custom
handler: native-image-generation
summary: Verify image_generate appears when configured and returns a real saved media artifact.
config:
prompt: "Image generation check: generate a QA lighthouse image and summarize it in one short sentence."
promptSnippet: "Image generation check"
generatedNeedle: "QA lighthouse"
```

View File

@@ -0,0 +1,21 @@
# Reaction, edit, delete lifecycle
```yaml qa-scenario
id: reaction-edit-delete
title: Reaction, edit, delete lifecycle
surface: message-actions
objective: Verify the agent can use channel-owned message actions and that the QA transcript reflects them.
successCriteria:
- Agent adds at least one reaction.
- Agent edits or replaces a message when asked.
- Transcript shows the action lifecycle correctly.
docsRefs:
- docs/channels/qa-channel.md
codeRefs:
- extensions/qa-channel/src/channel-actions.ts
- extensions/qa-lab/src/self-check-scenario.ts
execution:
kind: custom
handler: reaction-edit-delete
summary: Verify the agent can use channel-owned message actions and that the QA transcript reflects them.
```

View File

@@ -0,0 +1,23 @@
# Runtime inventory drift check
```yaml qa-scenario
id: runtime-inventory-drift-check
title: Runtime inventory drift check
surface: inventory
objective: Verify tools.effective and skills.status stay aligned with runtime behavior after config changes.
successCriteria:
- Enabled tool appears before the config change.
- After config change, disabled tool disappears from tools.effective.
- Disabled skill appears in skills.status with disabled state.
docsRefs:
- docs/gateway/protocol.md
- docs/tools/skills.md
- docs/tools/index.md
codeRefs:
- src/gateway/server-methods/tools-effective.ts
- src/gateway/server-methods/skills.ts
execution:
kind: custom
handler: runtime-inventory-drift-check
summary: Verify tools.effective and skills.status stay aligned with runtime behavior after config changes.
```

View File

@@ -0,0 +1,23 @@
# Session memory ranking
```yaml qa-scenario
id: session-memory-ranking
title: Session memory ranking
surface: memory
objective: Verify session-transcript memory can outrank stale durable notes and drive the final answer toward the newer fact.
successCriteria:
- Session memory indexing is enabled for the scenario.
- Search ranks the newer transcript-backed fact ahead of the stale durable note.
- The agent uses memory tools and answers with the current fact, not the stale one.
docsRefs:
- docs/concepts/memory-search.md
- docs/reference/memory-config.md
codeRefs:
- extensions/memory-core/src/tools.ts
- extensions/memory-core/src/memory/manager.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: session-memory-ranking
summary: Verify session-transcript memory can outrank stale durable notes and drive the final answer toward the newer fact.
```

View File

@@ -0,0 +1,25 @@
# Skill install hot availability
```yaml qa-scenario
id: skill-install-hot-availability
title: Skill install hot availability
surface: skills
objective: Verify a newly added workspace skill shows up without a broken intermediate state and can influence the next turn immediately.
successCriteria:
- Skill is absent before install.
- skills.status reports it after install without a restart.
- The next agent turn reflects the new skill marker.
docsRefs:
- docs/tools/skills.md
- docs/gateway/configuration.md
codeRefs:
- src/agents/skills-status.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: skill-install-hot-availability
summary: Verify a newly added workspace skill shows up without a broken intermediate state and can influence the next turn immediately.
config:
prompt: "Hot install marker: give me the hot install marker exactly."
expectedContains: "HOT-INSTALL-OK"
```

View File

@@ -0,0 +1,25 @@
# Skill visibility and invocation
```yaml qa-scenario
id: skill-visibility-invocation
title: Skill visibility and invocation
surface: skills
objective: Verify a workspace skill becomes visible in skills.status and influences the next agent turn.
successCriteria:
- skills.status reports the seeded skill as visible and eligible.
- The next agent turn reflects the skill instruction marker.
- The result stays scoped to the active QA workspace skill.
docsRefs:
- docs/tools/skills.md
- docs/gateway/protocol.md
codeRefs:
- src/agents/skills-status.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: skill-visibility-invocation
summary: Verify a workspace skill becomes visible in skills.status and influences the next agent turn.
config:
prompt: "Visible skill marker: give me the visible skill marker exactly."
expectedContains: "VISIBLE-SKILL-OK"
```

View File

@@ -0,0 +1,30 @@
# Source and docs discovery report
```yaml qa-scenario
id: source-docs-discovery-report
title: Source and docs discovery report
surface: discovery
objective: Verify the agent can read repo docs and source, expand the QA plan, and publish a worked or did-not-work report.
successCriteria:
- Agent reads docs and source before proposing more tests.
- Agent identifies extra candidate scenarios beyond the seed list.
- Agent ends with a worked or failed QA report.
docsRefs:
- docs/help/testing.md
- docs/web/dashboard.md
- docs/channels/qa-channel.md
codeRefs:
- extensions/qa-lab/src/report.ts
- extensions/qa-lab/src/self-check.ts
- src/agents/system-prompt.ts
execution:
kind: custom
handler: source-docs-discovery-report
summary: Verify the agent can read repo docs and source, expand the QA plan, and publish a worked or did-not-work report.
config:
requiredFiles:
- repo/qa/scenarios/index.md
- repo/extensions/qa-lab/src/suite.ts
- repo/docs/help/testing.md
prompt: Read the seeded docs and source plan. The full repo is mounted under ./repo/. Explicitly inspect repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md, then report grouped into Worked, Failed, Blocked, and Follow-up. Mention at least two extra QA scenarios beyond the seed list.
```

View File

@@ -0,0 +1,36 @@
# Subagent fanout synthesis
```yaml qa-scenario
id: subagent-fanout-synthesis
title: Subagent fanout synthesis
surface: subagents
objective: Verify the agent can delegate multiple bounded subagent tasks and fold both results back into one parent reply.
successCriteria:
- Parent flow launches at least two bounded subagent tasks.
- Both delegated results are acknowledged in the main flow.
- Final answer synthesizes both worker outputs in one reply.
docsRefs:
- docs/tools/subagents.md
- docs/help/testing.md
codeRefs:
- src/agents/subagent-spawn.ts
- src/agents/system-prompt.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: subagent-fanout-synthesis
summary: Verify the agent can delegate multiple bounded subagent tasks and fold both results back into one parent reply.
config:
prompt: |-
Subagent fanout synthesis check: delegate exactly two bounded subagents sequentially.
Subagent 1: verify that `HEARTBEAT.md` exists and report `ok` if it does.
Subagent 2: verify that `qa/scenarios/subagent-fanout-synthesis.md` exists and report `ok` if it does.
Wait for both subagents to finish.
Then reply with exactly these two lines and nothing else:
subagent-1: ok
subagent-2: ok
Do not use ACP.
expectedReplyAny:
- subagent-1: ok
- subagent-2: ok
```

View File

@@ -0,0 +1,22 @@
# Subagent handoff
```yaml qa-scenario
id: subagent-handoff
title: Subagent handoff
surface: subagents
objective: Verify the agent can delegate a bounded task to a subagent and fold the result back into the main thread.
successCriteria:
- Agent launches a bounded subagent task.
- Subagent result is acknowledged in the main flow.
- Final answer attributes delegated work clearly.
docsRefs:
- docs/tools/subagents.md
- docs/help/testing.md
codeRefs:
- src/agents/system-prompt.ts
- extensions/qa-lab/src/report.ts
execution:
kind: custom
handler: subagent-handoff
summary: Verify the agent can delegate a bounded task to a subagent and fold the result back into the main thread.
```

View File

@@ -0,0 +1,24 @@
# Threaded follow-up
```yaml qa-scenario
id: thread-follow-up
title: Threaded follow-up
surface: thread
objective: Verify the agent can keep follow-up work inside a thread and not leak context into the root channel.
successCriteria:
- Agent creates or uses a thread for deeper work.
- Follow-up messages stay attached to the thread.
- Thread report references the correct prior context.
docsRefs:
- docs/channels/qa-channel.md
- docs/channels/group-messages.md
codeRefs:
- extensions/qa-channel/src/protocol.ts
- extensions/qa-lab/src/bus-state.ts
execution:
kind: custom
handler: thread-follow-up
summary: Verify the agent can keep follow-up work inside a thread and not leak context into the root channel.
config:
prompt: "@openclaw reply in one short sentence inside this thread only. Do not use ACP or any external runtime. Confirm you stayed in-thread."
```

View File

@@ -0,0 +1,24 @@
# Thread memory isolation
```yaml qa-scenario
id: thread-memory-isolation
title: Thread memory isolation
surface: memory
objective: Verify a memory-backed answer requested inside a thread stays in-thread and does not leak into the root channel.
successCriteria:
- Agent uses memory tools inside the thread.
- The hidden fact is answered correctly in the thread.
- No root-channel outbound message leaks during the threaded memory reply.
docsRefs:
- docs/concepts/memory-search.md
- docs/channels/qa-channel.md
- docs/channels/group-messages.md
codeRefs:
- extensions/memory-core/src/tools.ts
- extensions/qa-channel/src/protocol.ts
- extensions/qa-lab/src/suite.ts
execution:
kind: custom
handler: thread-memory-isolation
summary: Verify a memory-backed answer requested inside a thread stays in-thread and does not leak into the root channel.
```