QA: track scenario coverage intent

2026-05-06 05:20:43 +00:00 · 2026-04-17 14:01:20 -04:00
parent f334ca2b50
commit 3a1e469732
56 changed files with 576 additions and 3 deletions
--- a/extensions/qa-lab/src/cli.runtime.test.ts
+++ b/extensions/qa-lab/src/cli.runtime.test.ts
@@ -72,6 +72,7 @@ import {
  runQaDockerScaffoldCommand,
  runQaDockerUpCommand,
  runQaCharacterEvalCommand,
+  runQaCoverageReportCommand,
  runQaManualLaneCommand,
  runQaParityReportCommand,
  runQaSuiteCommand,
@@ -336,6 +337,13 @@ describe("qa cli runtime", () => {
    }
  });

+  it("prints a markdown coverage report from scenario metadata", async () => {
+    await runQaCoverageReportCommand({ repoRoot: process.cwd() });
+
+    expect(stdoutWrite).toHaveBeenCalledWith(expect.stringContaining("# QA Coverage Inventory"));
+    expect(stdoutWrite).toHaveBeenCalledWith(expect.stringContaining("memory.recall"));
+  });
+
  it("resolves character eval paths and passes model refs through", async () => {
    await runQaCharacterEvalCommand({
      repoRoot: "/tmp/openclaw-repo",
--- a/extensions/qa-lab/src/cli.runtime.ts
+++ b/extensions/qa-lab/src/cli.runtime.ts
@@ -9,6 +9,7 @@ import {
 import { resolveQaParityPackScenarioIds } from "./agentic-parity.js";
 import { runQaCharacterEval, type QaCharacterModelOptions } from "./character-eval.js";
 import { resolveRepoRelativeOutputDir } from "./cli-paths.js";
+import { buildQaCoverageInventory, renderQaCoverageMarkdownReport } from "./coverage-report.js";
 import { buildQaDockerHarnessImage, writeQaDockerHarnessFiles } from "./docker-harness.js";
 import { runQaDockerUp } from "./docker-up.runtime.js";
 import type { QaCliBackendAuthMode } from "./gateway-child.js";
@@ -36,6 +37,7 @@ import {
  type QaProviderMode,
  type QaProviderModeInput,
 } from "./run-config.js";
+import { readQaScenarioPack } from "./scenario-catalog.js";
 import { runQaSuiteFromRuntime } from "./suite-launch.runtime.js";

 type InterruptibleServer = {
@@ -442,6 +444,29 @@ export async function runQaParityReportCommand(opts: {
    process.exitCode = 1;
  }
 }
+
+export async function runQaCoverageReportCommand(opts: {
+  repoRoot?: string;
+  output?: string;
+  json?: boolean;
+}) {
+  const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
+  const inventory = buildQaCoverageInventory(readQaScenarioPack().scenarios);
+  const outputPath = opts.output ? path.resolve(repoRoot, opts.output) : undefined;
+  const body = opts.json
+    ? `${JSON.stringify(inventory, null, 2)}\n`
+    : renderQaCoverageMarkdownReport(inventory);
+
+  if (outputPath) {
+    await fs.mkdir(path.dirname(outputPath), { recursive: true });
+    await fs.writeFile(outputPath, body, "utf8");
+    process.stdout.write(`QA coverage report: ${outputPath}\n`);
+    return;
+  }
+
+  process.stdout.write(body);
+}
+
 export async function runQaCharacterEvalCommand(opts: {
  repoRoot?: string;
  outputDir?: string;
--- a/extensions/qa-lab/src/cli.test.ts
+++ b/extensions/qa-lab/src/cli.test.ts
@@ -44,12 +44,14 @@ const {
  runQaCredentialsAddCommand,
  runQaCredentialsListCommand,
  runQaCredentialsRemoveCommand,
+  runQaCoverageReportCommand,
  runQaProviderServerCommand,
  runQaTelegramCommand,
 } = vi.hoisted(() => ({
  runQaCredentialsAddCommand: vi.fn(),
  runQaCredentialsListCommand: vi.fn(),
  runQaCredentialsRemoveCommand: vi.fn(),
+  runQaCoverageReportCommand: vi.fn(),
  runQaProviderServerCommand: vi.fn(),
  runQaTelegramCommand: vi.fn(),
 }));
@@ -72,6 +74,7 @@ vi.mock("./cli.runtime.js", () => ({
  runQaCredentialsAddCommand,
  runQaCredentialsListCommand,
  runQaCredentialsRemoveCommand,
+  runQaCoverageReportCommand,
  runQaProviderServerCommand,
 }));

@@ -85,6 +88,7 @@ describe("qa cli registration", () => {
    runQaCredentialsAddCommand.mockReset();
    runQaCredentialsListCommand.mockReset();
    runQaCredentialsRemoveCommand.mockReset();
+    runQaCoverageReportCommand.mockReset();
    runQaProviderServerCommand.mockReset();
    runQaTelegramCommand.mockReset();
    listQaRunnerCliContributions
@@ -101,10 +105,30 @@ describe("qa cli registration", () => {
    const qa = program.commands.find((command) => command.name() === "qa");
    expect(qa).toBeDefined();
    expect(qa?.commands.map((command) => command.name())).toEqual(
-      expect.arrayContaining([TEST_QA_RUNNER.commandName, "telegram", "credentials"]),
+      expect.arrayContaining([TEST_QA_RUNNER.commandName, "telegram", "credentials", "coverage"]),
    );
  });

+  it("routes coverage report flags into the qa runtime command", async () => {
+    await program.parseAsync([
+      "node",
+      "openclaw",
+      "qa",
+      "coverage",
+      "--repo-root",
+      "/tmp/openclaw-repo",
+      "--output",
+      ".artifacts/qa-coverage.md",
+      "--json",
+    ]);
+
+    expect(runQaCoverageReportCommand).toHaveBeenCalledWith({
+      repoRoot: "/tmp/openclaw-repo",
+      output: ".artifacts/qa-coverage.md",
+      json: true,
+    });
+  });
+
  it("delegates discovered qa runner registration through the generic host seam", () => {
    const [{ registration }] = listQaRunnerCliContributions.mock.results[0]?.value;
    expect(registration.register).toHaveBeenCalledTimes(1);
--- a/extensions/qa-lab/src/cli.ts
+++ b/extensions/qa-lab/src/cli.ts
@@ -60,6 +60,12 @@ async function runQaParityReport(opts: {
  const runtime = await loadQaLabCliRuntime();
  await runtime.runQaParityReportCommand(opts);
 }
+
+async function runQaCoverageReport(opts: { repoRoot?: string; output?: string; json?: boolean }) {
+  const runtime = await loadQaLabCliRuntime();
+  await runtime.runQaCoverageReportCommand(opts);
+}
+
 async function runQaCharacterEval(opts: {
  repoRoot?: string;
  outputDir?: string;
@@ -302,6 +308,15 @@ export function registerQaLabCli(program: Command) {
      },
    );

+  qa.command("coverage")
+    .description("Print the markdown scenario coverage inventory")
+    .option("--repo-root <path>", "Repository root to target when writing --output")
+    .option("--output <path>", "Write the coverage inventory to this path")
+    .option("--json", "Print JSON instead of Markdown", false)
+    .action(async (opts: { repoRoot?: string; output?: string; json?: boolean }) => {
+      await runQaCoverageReport(opts);
+    });
+
  qa.command("character-eval")
    .description("Run the character QA scenario across live models and write a judged report")
    .option("--repo-root <path>", "Repository root to target when running from a neutral cwd")
--- a/extensions/qa-lab/src/coverage-report.test.ts
+++ b/extensions/qa-lab/src/coverage-report.test.ts
@@ -0,0 +1,31 @@
+import { describe, expect, it } from "vitest";
+import { buildQaCoverageInventory, renderQaCoverageMarkdownReport } from "./coverage-report.js";
+import { readQaScenarioPack } from "./scenario-catalog.js";
+
+describe("qa coverage report", () => {
+  it("groups scenario coverage metadata by theme and surface", () => {
+    const inventory = buildQaCoverageInventory(readQaScenarioPack().scenarios);
+
+    expect(inventory.scenarioCount).toBeGreaterThan(0);
+    expect(inventory.coverageIdCount).toBeGreaterThan(0);
+    expect(inventory.primaryCoverageIdCount).toBeGreaterThan(0);
+    expect(inventory.secondaryCoverageIdCount).toBeGreaterThan(0);
+    expect(inventory.overlappingCoverage.length).toBeGreaterThan(0);
+    expect(inventory.missingCoverage).toEqual([]);
+    expect(inventory.byTheme.memory.some((feature) => feature.id === "memory.recall")).toBe(true);
+    expect(inventory.bySurface.memory.some((feature) => feature.id === "memory.recall")).toBe(true);
+  });
+
+  it("renders a compact markdown inventory", () => {
+    const report = renderQaCoverageMarkdownReport(
+      buildQaCoverageInventory(readQaScenarioPack().scenarios),
+    );
+
+    expect(report).toContain("# QA Coverage Inventory");
+    expect(report).toContain("- Missing coverage metadata: 0");
+    expect(report).toContain("- Overlapping coverage IDs:");
+    expect(report).toContain("memory.recall");
+    expect(report).toContain("primary: memory-recall (qa/scenarios/memory/memory-recall.md)");
+    expect(report).toContain("secondary: active-memory-preprompt-recall");
+  });
+});
--- a/extensions/qa-lab/src/coverage-report.ts
+++ b/extensions/qa-lab/src/coverage-report.ts
@@ -0,0 +1,192 @@
+import type { QaSeedScenarioWithSource } from "./scenario-catalog.js";
+
+export type QaCoverageScenarioSummary = {
+  id: string;
+  title: string;
+  sourcePath: string;
+  theme: string;
+  surfaces: string[];
+  risk: string;
+};
+
+export type QaCoverageIntent = "primary" | "secondary";
+
+export type QaCoverageScenarioReference = QaCoverageScenarioSummary & {
+  intent: QaCoverageIntent;
+};
+
+export type QaCoverageFeatureSummary = {
+  id: string;
+  scenarios: QaCoverageScenarioReference[];
+};
+
+export type QaCoverageInventory = {
+  scenarioCount: number;
+  coverageIdCount: number;
+  primaryCoverageIdCount: number;
+  secondaryCoverageIdCount: number;
+  features: QaCoverageFeatureSummary[];
+  overlappingCoverage: QaCoverageFeatureSummary[];
+  missingCoverage: QaCoverageScenarioSummary[];
+  byTheme: Record<string, QaCoverageFeatureSummary[]>;
+  bySurface: Record<string, QaCoverageFeatureSummary[]>;
+};
+
+function scenarioTheme(sourcePath: string) {
+  const parts = sourcePath.split("/");
+  return parts[2] ?? "unknown";
+}
+
+function scenarioSurfaces(scenario: QaSeedScenarioWithSource) {
+  return scenario.surfaces && scenario.surfaces.length > 0 ? scenario.surfaces : [scenario.surface];
+}
+
+function scenarioRisk(scenario: QaSeedScenarioWithSource) {
+  return scenario.risk ?? scenario.riskLevel ?? "unassigned";
+}
+
+function summarizeScenario(scenario: QaSeedScenarioWithSource): QaCoverageScenarioSummary {
+  return {
+    id: scenario.id,
+    title: scenario.title,
+    sourcePath: scenario.sourcePath,
+    theme: scenarioTheme(scenario.sourcePath),
+    surfaces: scenarioSurfaces(scenario),
+    risk: scenarioRisk(scenario),
+  };
+}
+
+function sortFeatures(features: readonly QaCoverageFeatureSummary[]) {
+  return features.toSorted((left, right) => left.id.localeCompare(right.id));
+}
+
+export function buildQaCoverageInventory(
+  scenarios: readonly QaSeedScenarioWithSource[],
+): QaCoverageInventory {
+  const byCoverageId = new Map<string, QaCoverageFeatureSummary>();
+  const primaryCoverageIds = new Set<string>();
+  const secondaryCoverageIds = new Set<string>();
+  const missingCoverage: QaCoverageScenarioSummary[] = [];
+
+  const addCoverage = (
+    scenario: QaSeedScenarioWithSource,
+    coverageIds: readonly string[] | undefined,
+    intent: QaCoverageIntent,
+  ) => {
+    const summary = summarizeScenario(scenario);
+    for (const coverageId of coverageIds ?? []) {
+      const feature = byCoverageId.get(coverageId) ?? {
+        id: coverageId,
+        scenarios: [],
+      };
+      feature.scenarios.push({ ...summary, intent });
+      byCoverageId.set(coverageId, feature);
+      if (intent === "primary") {
+        primaryCoverageIds.add(coverageId);
+      } else {
+        secondaryCoverageIds.add(coverageId);
+      }
+    }
+  };
+
+  for (const scenario of scenarios) {
+    if (!scenario.coverage) {
+      missingCoverage.push(summarizeScenario(scenario));
+      continue;
+    }
+    addCoverage(scenario, scenario.coverage.primary, "primary");
+    addCoverage(scenario, scenario.coverage.secondary, "secondary");
+  }
+
+  const features = sortFeatures([...byCoverageId.values()]);
+  const overlappingCoverage = features.filter((feature) => feature.scenarios.length > 1);
+  const byTheme: Record<string, QaCoverageFeatureSummary[]> = {};
+  const bySurface: Record<string, QaCoverageFeatureSummary[]> = {};
+
+  for (const feature of features) {
+    const themes = new Set(feature.scenarios.map((scenario) => scenario.theme));
+    for (const theme of themes) {
+      byTheme[theme] ??= [];
+      byTheme[theme].push({
+        ...feature,
+        scenarios: feature.scenarios.filter((scenario) => scenario.theme === theme),
+      });
+    }
+    const surfaces = new Set(feature.scenarios.flatMap((scenario) => scenario.surfaces));
+    for (const surface of surfaces) {
+      bySurface[surface] ??= [];
+      bySurface[surface].push({
+        ...feature,
+        scenarios: feature.scenarios.filter((scenario) => scenario.surfaces.includes(surface)),
+      });
+    }
+  }
+
+  return {
+    scenarioCount: scenarios.length,
+    coverageIdCount: features.length,
+    primaryCoverageIdCount: primaryCoverageIds.size,
+    secondaryCoverageIdCount: secondaryCoverageIds.size,
+    features,
+    overlappingCoverage,
+    missingCoverage,
+    byTheme,
+    bySurface,
+  };
+}
+
+function pushFeatureLines(lines: string[], features: readonly QaCoverageFeatureSummary[]) {
+  for (const feature of sortFeatures(features)) {
+    const scenarios = feature.scenarios
+      .map((scenario) => `${scenario.intent}: ${scenario.id} (${scenario.sourcePath})`)
+      .join(", ");
+    lines.push(`- ${feature.id}: ${scenarios}`);
+  }
+}
+
+export function renderQaCoverageMarkdownReport(inventory: QaCoverageInventory): string {
+  const lines: string[] = [
+    "# QA Coverage Inventory",
+    "",
+    `- Scenarios: ${inventory.scenarioCount}`,
+    `- Coverage IDs: ${inventory.coverageIdCount}`,
+    `- Primary coverage IDs: ${inventory.primaryCoverageIdCount}`,
+    `- Secondary coverage IDs: ${inventory.secondaryCoverageIdCount}`,
+    `- Overlapping coverage IDs: ${inventory.overlappingCoverage.length}`,
+    `- Missing coverage metadata: ${inventory.missingCoverage.length}`,
+    "",
+    "## By Theme",
+    "",
+  ];
+
+  for (const theme of Object.keys(inventory.byTheme).toSorted()) {
+    lines.push(`### ${theme}`, "");
+    pushFeatureLines(lines, inventory.byTheme[theme] ?? []);
+    lines.push("");
+  }
+
+  lines.push("## By Surface", "");
+  for (const surface of Object.keys(inventory.bySurface).toSorted()) {
+    lines.push(`### ${surface}`, "");
+    pushFeatureLines(lines, inventory.bySurface[surface] ?? []);
+    lines.push("");
+  }
+
+  if (inventory.overlappingCoverage.length > 0) {
+    lines.push("## Overlap", "");
+    pushFeatureLines(lines, inventory.overlappingCoverage);
+    lines.push("");
+  }
+
+  if (inventory.missingCoverage.length > 0) {
+    lines.push("## Missing Metadata", "");
+    for (const scenario of inventory.missingCoverage.toSorted((left, right) =>
+      left.id.localeCompare(right.id),
+    )) {
+      lines.push(`- ${scenario.id}: ${scenario.sourcePath}`);
+    }
+    lines.push("");
+  }
+
+  return `${lines.join("\n").trimEnd()}\n`;
+}
--- a/extensions/qa-lab/src/scenario-catalog.test.ts
+++ b/extensions/qa-lab/src/scenario-catalog.test.ts
@@ -27,6 +27,8 @@ describe("qa scenario catalog", () => {
    expect(pack.scenarios.some((scenario) => scenario.id === "character-vibes-c3po")).toBe(true);
    expect(pack.scenarios.every((scenario) => scenario.execution?.kind === "flow")).toBe(true);
    expect(pack.scenarios.some((scenario) => scenario.execution.flow?.steps.length)).toBe(true);
+    expect(pack.scenarios.every((scenario) => scenario.coverage?.primary.length)).toBe(true);
+    expect(readQaScenarioById("memory-recall").coverage?.primary).toContain("memory.recall");
  });

  it("exposes bootstrap data from the markdown pack", () => {
--- a/extensions/qa-lab/src/scenario-catalog.ts
+++ b/extensions/qa-lab/src/scenario-catalog.ts
@@ -51,6 +51,44 @@ const qaScenarioExecutionSchema = z.object({
  config: qaScenarioConfigSchema.optional(),
 });

+const qaCoverageIdSchema = z
+  .string()
+  .trim()
+  .regex(/^[a-z0-9]+(?:[.-][a-z0-9]+)*$/, {
+    message: "coverage ids must use lowercase dotted or dashed tokens",
+  });
+
+const qaCoverageIdListSchema = z.array(qaCoverageIdSchema).min(1);
+
+const qaScenarioCoverageSchema = z
+  .object({
+    primary: qaCoverageIdListSchema,
+    secondary: qaCoverageIdListSchema.optional(),
+  })
+  .superRefine((coverage, ctx) => {
+    const seen = new Set<string>();
+    const coverageEntries = [
+      ["primary", coverage.primary],
+      ["secondary", coverage.secondary],
+    ] as const;
+    for (const [intent, ids] of coverageEntries) {
+      if (!ids) {
+        continue;
+      }
+      for (const [index, id] of ids.entries()) {
+        if (!seen.has(id)) {
+          seen.add(id);
+          continue;
+        }
+        ctx.addIssue({
+          code: z.ZodIssueCode.custom,
+          path: [intent, index],
+          message: `duplicate coverage id: ${id}`,
+        });
+      }
+    }
+  });
+
 const qaScenarioGatewayRuntimeSchema = z.object({
  forwardHostHome: z.boolean().optional(),
 });
@@ -138,6 +176,9 @@ const qaSeedScenarioSchema = z.object({
  title: z.string().trim().min(1),
  surface: z.string().trim().min(1),
  category: z.string().trim().min(1).optional(),
+  coverage: qaScenarioCoverageSchema.optional(),
+  surfaces: z.array(z.string().trim().min(1)).min(1).optional(),
+  risk: z.enum(["low", "medium", "high"]).optional(),
  capabilities: z.array(z.string().trim().min(1)).optional(),
  lane: z.record(z.string(), z.union([z.boolean(), z.string()])).optional(),
  riskLevel: z.string().trim().min(1).optional(),
--- a/qa/README.md
+++ b/qa/README.md
@@ -13,5 +13,6 @@ Key workflow:

 - `qa suite` is the executable frontier subset / regression loop.
 - `qa manual` is the scoped personality and style probe after the executable subset is green.
+- `qa coverage` prints the scenario coverage inventory from scenario frontmatter.

 Keep this folder in git. Add new scenarios here before wiring them into automation.
--- a/qa/scenarios/agents/instruction-followthrough-repo-contract.md
+++ b/qa/scenarios/agents/instruction-followthrough-repo-contract.md
@@ -4,6 +4,11 @@
 id: instruction-followthrough-repo-contract
 title: Instruction followthrough repo contract
 surface: repo-contract
+coverage:
+  primary:
+    - agents.instructions
+  secondary:
+    - runtime.first-action
 objective: Verify the agent reads repo instruction files first, follows the required tool order, and completes the first feasible action instead of stopping at a plan.
 successCriteria:
  - Agent reads the seeded instruction files before writing the requested artifact.
--- a/qa/scenarios/agents/subagent-fanout-synthesis.md
+++ b/qa/scenarios/agents/subagent-fanout-synthesis.md
@@ -4,6 +4,11 @@
 id: subagent-fanout-synthesis
 title: Subagent fanout synthesis
 surface: subagents
+coverage:
+  primary:
+    - agents.subagents
+  secondary:
+    - agents.synthesis
 objective: Verify the agent can delegate multiple bounded subagent tasks and fold both results back into one parent reply.
 successCriteria:
  - Parent flow launches at least two bounded subagent tasks.
--- a/qa/scenarios/agents/subagent-handoff.md
+++ b/qa/scenarios/agents/subagent-handoff.md
@@ -4,6 +4,9 @@
 id: subagent-handoff
 title: Subagent handoff
 surface: subagents
+coverage:
+  primary:
+    - agents.subagents
 objective: Verify the agent can delegate a bounded task to a subagent and fold the result back into the main thread.
 successCriteria:
  - Agent launches a bounded subagent task.
--- a/qa/scenarios/channels/channel-chat-baseline.md
+++ b/qa/scenarios/channels/channel-chat-baseline.md
@@ -4,6 +4,11 @@
 id: channel-chat-baseline
 title: Channel baseline conversation
 surface: channel
+coverage:
+  primary:
+    - channels.group-messages
+  secondary:
+    - channels.qa-channel
 objective: Verify the QA agent can respond correctly in a shared channel and respect mention-driven group semantics.
 successCriteria:
  - Agent replies in the shared channel transcript.
--- a/qa/scenarios/channels/dm-chat-baseline.md
+++ b/qa/scenarios/channels/dm-chat-baseline.md
@@ -4,6 +4,11 @@
 id: dm-chat-baseline
 title: DM baseline conversation
 surface: dm
+coverage:
+  primary:
+    - channels.dm
+  secondary:
+    - channels.qa-channel
 objective: Verify the QA agent can chat coherently in a DM, explain the QA setup, and stay in character.
 successCriteria:
  - Agent replies in DM without channel routing mistakes.
--- a/qa/scenarios/channels/reaction-edit-delete.md
+++ b/qa/scenarios/channels/reaction-edit-delete.md
@@ -4,6 +4,11 @@
 id: reaction-edit-delete
 title: Reaction, edit, delete lifecycle
 surface: message-actions
+coverage:
+  primary:
+    - channels.message-actions
+  secondary:
+    - channels.qa-channel
 objective: Verify the agent can use channel-owned message actions and that the QA transcript reflects them.
 successCriteria:
  - Agent adds at least one reaction.
--- a/qa/scenarios/channels/thread-follow-up.md
+++ b/qa/scenarios/channels/thread-follow-up.md
@@ -4,6 +4,11 @@
 id: thread-follow-up
 title: Threaded follow-up
 surface: thread
+coverage:
+  primary:
+    - channels.threads
+  secondary:
+    - channels.qa-channel
 objective: Verify the agent can keep follow-up work inside a thread and not leak context into the root channel.
 successCriteria:
  - Agent creates or uses a thread for deeper work.
--- a/qa/scenarios/character/character-vibes-c3po.md
+++ b/qa/scenarios/character/character-vibes-c3po.md
@@ -4,6 +4,11 @@
 id: character-vibes-c3po
 title: "Nervous release protocol chat"
 surface: character
+coverage:
+  primary:
+    - character.persona
+  secondary:
+    - workspace.artifacts
 objective: Capture a natural multi-turn C-3PO-flavored character conversation with real workspace help so another model can later grade naturalness, vibe, and funniness from the raw transcript.
 successCriteria:
  - Agent gets a natural multi-turn conversation, and any missed replies stay visible in the transcript instead of aborting capture.
--- a/qa/scenarios/character/character-vibes-gollum.md
+++ b/qa/scenarios/character/character-vibes-gollum.md
@@ -4,6 +4,11 @@
 id: character-vibes-gollum
 title: "Late-night deploy helper chat"
 surface: character
+coverage:
+  primary:
+    - character.persona
+  secondary:
+    - workspace.artifacts
 objective: Capture a natural multi-turn character conversation with real workspace help so another model can later grade naturalness, vibe, and funniness from the raw transcript.
 successCriteria:
  - Agent gets a natural multi-turn conversation, and any missed replies stay visible in the transcript instead of aborting capture.
--- a/qa/scenarios/config/config-apply-restart-wakeup.md
+++ b/qa/scenarios/config/config-apply-restart-wakeup.md
@@ -4,6 +4,11 @@
 id: config-apply-restart-wakeup
 title: Config apply restart wake-up
 surface: config
+coverage:
+  primary:
+    - config.restart-apply
+  secondary:
+    - runtime.gateway-restart
 objective: Verify a restart-required config.apply restarts cleanly and delivers the post-restart wake message back into the QA channel.
 successCriteria:
  - config.apply schedules a restart-required change.
--- a/qa/scenarios/config/config-patch-hot-apply.md
+++ b/qa/scenarios/config/config-patch-hot-apply.md
@@ -4,6 +4,11 @@
 id: config-patch-hot-apply
 title: Config patch skill disable
 surface: config
+coverage:
+  primary:
+    - config.hot-apply
+  secondary:
+    - plugins.skills
 objective: Verify config.patch can disable a workspace skill and the restarted gateway exposes the new disabled state cleanly.
 successCriteria:
  - config.patch succeeds for the skill toggle change.
--- a/qa/scenarios/config/config-restart-capability-flip.md
+++ b/qa/scenarios/config/config-restart-capability-flip.md
@@ -4,6 +4,11 @@
 id: config-restart-capability-flip
 title: Config restart capability flip
 surface: config
+coverage:
+  primary:
+    - config.restart-apply
+  secondary:
+    - plugins.capabilities
 objective: Verify a restart-triggering config change flips capability inventory and the same session successfully uses the newly restored tool after wake-up.
 successCriteria:
  - Capability is absent before the restart-triggering patch.
--- a/qa/scenarios/index.md
+++ b/qa/scenarios/index.md
@@ -5,13 +5,24 @@ Single source of truth for repo-backed QA suite bootstrap data.

 - `index.md` defines pack-level bootstrap data
 - each nested `*.md` scenario defines one runnable test via `qa-scenario` + `qa-flow`
-- scenario markdown may also define category metadata, required plugins, lane filters,
-  and gateway config patching
+- scenario markdown may also define coverage IDs, category metadata, required plugins,
+  lane filters, and gateway config patching

 - kickoff mission
 - QA operator identity
 - scenario files under one-level theme directories

+Coverage tracking:
+
+- add `coverage.primary` IDs to each scenario's `qa-scenario` block
+- add `coverage.secondary` only when a scenario intentionally protects another behavior
+- keep IDs behavior-shaped, broad enough to reuse, lowercase, and dotted or dashed
+- prefer reusing an existing feature ID over minting a scenario-shaped ID
+- avoid copying the scenario title into coverage IDs
+- use `pnpm openclaw qa coverage` to render the current inventory
+- treat the old `coverage: ["id"]` / `coverage: - id` list shape as invalid
+- keep source-path tracking in the report, not in the scenario schema
+
 Theme directories:

 - `agents/` - agent behavior, instructions, and subagent flows
--- a/qa/scenarios/media/image-generation-roundtrip.md
+++ b/qa/scenarios/media/image-generation-roundtrip.md
@@ -4,6 +4,11 @@
 id: image-generation-roundtrip
 title: Image generation roundtrip
 surface: image-generation
+coverage:
+  primary:
+    - media.image-generation
+  secondary:
+    - channels.qa-channel
 objective: Verify a generated image is saved as media, reattached on the next turn, and described correctly through the vision path.
 successCriteria:
  - image_generate produces a saved MEDIA artifact.
--- a/qa/scenarios/media/image-understanding-attachment.md
+++ b/qa/scenarios/media/image-understanding-attachment.md
@@ -4,6 +4,11 @@
 id: image-understanding-attachment
 title: Image understanding from attachment
 surface: image-understanding
+coverage:
+  primary:
+    - media.image-understanding
+  secondary:
+    - channels.qa-channel
 objective: Verify an attached image reaches the agent model and the agent can describe what it sees.
 successCriteria:
  - Agent receives at least one image attachment.
--- a/qa/scenarios/media/native-image-generation.md
+++ b/qa/scenarios/media/native-image-generation.md
@@ -4,6 +4,11 @@
 id: native-image-generation
 title: Native image generation
 surface: image-generation
+coverage:
+  primary:
+    - media.image-generation
+  secondary:
+    - tools.native-image-generation
 objective: Verify image_generate appears when configured and returns a real saved media artifact.
 successCriteria:
  - image_generate appears in the effective tool inventory.
--- a/qa/scenarios/memory/active-memory-preprompt-recall.md
+++ b/qa/scenarios/memory/active-memory-preprompt-recall.md
@@ -4,6 +4,11 @@
 id: active-memory-preprompt-recall
 title: Active Memory pre-reply recall
 surface: memory
+coverage:
+  primary:
+    - memory.active-recall
+  secondary:
+    - memory.recall
 objective: Verify Active Memory surfaces a memory-only preference before the main reply, and that the same question stays unresolved when the plugin is off.
 plugins:
  - active-memory
--- a/qa/scenarios/memory/memory-dreaming-sweep.md
+++ b/qa/scenarios/memory/memory-dreaming-sweep.md
@@ -4,6 +4,9 @@
 id: memory-dreaming-sweep
 title: Memory dreaming sweep
 surface: memory
+coverage:
+  primary:
+    - memory.dreaming
 objective: Verify enabling dreaming creates the managed sweep, stages light and REM artifacts, and consolidates repeated recall signals into durable memory.
 successCriteria:
  - Dreaming can be enabled and doctor.memory.status reports the managed sweep cron.
--- a/qa/scenarios/memory/memory-failure-fallback.md
+++ b/qa/scenarios/memory/memory-failure-fallback.md
@@ -4,6 +4,11 @@
 id: memory-failure-fallback
 title: Memory failure fallback
 surface: memory
+coverage:
+  primary:
+    - memory.failure-handling
+  secondary:
+    - runtime.fallbacks
 objective: Verify the agent degrades gracefully when memory tools are unavailable and the answer exists only in memory-backed notes.
 successCriteria:
  - Memory tools are absent from the effective tool inventory.
--- a/qa/scenarios/memory/memory-recall.md
+++ b/qa/scenarios/memory/memory-recall.md
@@ -35,6 +35,9 @@
 id: memory-recall
 title: Memory recall after context switch
 surface: memory
+coverage:
+  primary:
+    - memory.recall
 objective: Verify the agent can store a fact, switch topics, then recall the fact accurately later.
 successCriteria:
  - Agent acknowledges the seeded fact.
--- a/qa/scenarios/memory/memory-tools-channel-context.md
+++ b/qa/scenarios/memory/memory-tools-channel-context.md
@@ -4,6 +4,11 @@
 id: memory-tools-channel-context
 title: Memory tools in channel context
 surface: memory
+coverage:
+  primary:
+    - memory.tools
+  secondary:
+    - channels.group-messages
 objective: Verify the agent uses memory_search and memory_get in a shared channel when the answer lives only in memory files, not the live transcript.
 successCriteria:
  - Agent uses memory_search before answering.
--- a/qa/scenarios/memory/session-memory-ranking.md
+++ b/qa/scenarios/memory/session-memory-ranking.md
@@ -4,6 +4,11 @@
 id: session-memory-ranking
 title: Session memory ranking
 surface: memory
+coverage:
+  primary:
+    - memory.ranking
+  secondary:
+    - memory.recall
 objective: Verify session-transcript memory can outrank stale durable notes and drive the final answer toward the newer fact.
 successCriteria:
  - Session memory indexing is enabled for the scenario.
--- a/qa/scenarios/memory/thread-memory-isolation.md
+++ b/qa/scenarios/memory/thread-memory-isolation.md
@@ -4,6 +4,11 @@
 id: thread-memory-isolation
 title: Thread memory isolation
 surface: memory
+coverage:
+  primary:
+    - memory.thread-isolation
+  secondary:
+    - channels.threads
 objective: Verify a memory-backed answer requested inside a thread stays in-thread and does not leak into the root channel.
 successCriteria:
  - Agent uses memory tools inside the thread.
--- a/qa/scenarios/models/anthropic-opus-api-key-smoke.md
+++ b/qa/scenarios/models/anthropic-opus-api-key-smoke.md
@@ -4,6 +4,11 @@
 id: anthropic-opus-api-key-smoke
 title: Anthropic Opus API key smoke
 surface: model-provider
+coverage:
+  primary:
+    - models.provider-auth
+  secondary:
+    - models.anthropic
 objective: Verify the regular Anthropic Opus lane can complete a quick chat turn using API-key auth.
 successCriteria:
  - A live-frontier run fails fast unless the selected primary provider is anthropic.
--- a/qa/scenarios/models/anthropic-opus-setup-token-smoke.md
+++ b/qa/scenarios/models/anthropic-opus-setup-token-smoke.md
@@ -4,6 +4,11 @@
 id: anthropic-opus-setup-token-smoke
 title: Anthropic Opus setup-token smoke
 surface: model-provider
+coverage:
+  primary:
+    - models.provider-auth
+  secondary:
+    - models.anthropic
 objective: Verify the regular Anthropic Opus lane can complete a quick chat turn using setup-token auth.
 successCriteria:
  - A live-frontier run fails fast unless the selected primary provider is anthropic.
--- a/qa/scenarios/models/claude-cli-provider-capabilities-subscription.md
+++ b/qa/scenarios/models/claude-cli-provider-capabilities-subscription.md
@@ -4,6 +4,11 @@
 id: claude-cli-provider-capabilities-subscription
 title: Claude CLI provider capabilities subscription
 surface: model-provider
+coverage:
+  primary:
+    - models.provider-capabilities
+  secondary:
+    - models.claude-cli
 objective: Verify the Claude CLI model-provider lane can use native Claude subscription auth to talk, read an attached image, use bundled MCP tools, and apply workspace skills.
 successCriteria:
  - A live-frontier run fails fast unless the selected primary provider is claude-cli.
--- a/qa/scenarios/models/claude-cli-provider-capabilities.md
+++ b/qa/scenarios/models/claude-cli-provider-capabilities.md
@@ -4,6 +4,11 @@
 id: claude-cli-provider-capabilities
 title: Claude CLI provider capabilities API key
 surface: model-provider
+coverage:
+  primary:
+    - models.provider-capabilities
+  secondary:
+    - models.claude-cli
 objective: Verify the Claude CLI model-provider lane can use the Anthropic API key path to talk, read an attached image, use bundled MCP tools, and apply workspace skills.
 successCriteria:
  - A live-frontier run fails fast unless the selected primary provider is claude-cli.
--- a/qa/scenarios/models/codex-harness-no-meta-leak.md
+++ b/qa/scenarios/models/codex-harness-no-meta-leak.md
@@ -4,6 +4,11 @@
 id: codex-harness-no-meta-leak
 title: Codex harness no meta leak
 surface: dm
+coverage:
+  primary:
+    - models.codex-cli
+  secondary:
+    - runtime.no-meta-leak
 objective: Verify the Codex app-server harness keeps coordination/meta chatter out of the visible reply.
 successCriteria:
  - The scenario forces the Codex embedded harness and disables PI fallback.
--- a/qa/scenarios/models/model-switch-follow-up.md
+++ b/qa/scenarios/models/model-switch-follow-up.md
@@ -4,6 +4,11 @@
 id: model-switch-follow-up
 title: Model switch follow-up
 surface: models
+coverage:
+  primary:
+    - models.switching
+  secondary:
+    - runtime.session-continuity
 objective: Verify the agent can switch to a different configured model and continue coherently.
 successCriteria:
  - Agent reflects the model switch request.
--- a/qa/scenarios/models/model-switch-tool-continuity.md
+++ b/qa/scenarios/models/model-switch-tool-continuity.md
@@ -4,6 +4,11 @@
 id: model-switch-tool-continuity
 title: Model switch with tool continuity
 surface: models
+coverage:
+  primary:
+    - models.switching
+  secondary:
+    - runtime.tool-continuity
 objective: Verify switching models preserves session context and tool use instead of dropping into plain-text only behavior.
 successCriteria:
  - Alternate model is actually requested.
--- a/qa/scenarios/plugins/bundled-plugin-skill-runtime.md
+++ b/qa/scenarios/plugins/bundled-plugin-skill-runtime.md
@@ -4,6 +4,11 @@
 id: bundled-plugin-skill-runtime
 title: Bundled plugin skill runtime
 surface: skills
+coverage:
+  primary:
+    - plugins.skills
+  secondary:
+    - plugins.runtime
 objective: Verify packaged bundled plugin skills load from dist-runtime instead of being skipped by path-containment checks.
 successCriteria:
  - The runtime-packaged bundled plugin tree is used as OPENCLAW_BUNDLED_PLUGINS_DIR.
--- a/qa/scenarios/plugins/mcp-plugin-tools-call.md
+++ b/qa/scenarios/plugins/mcp-plugin-tools-call.md
@@ -4,6 +4,11 @@
 id: mcp-plugin-tools-call
 title: MCP plugin-tools call
 surface: mcp
+coverage:
+  primary:
+    - plugins.mcp-tools
+  secondary:
+    - tools.invocation
 objective: Verify OpenClaw can expose plugin tools over MCP and a real MCP client can call one successfully.
 successCriteria:
  - Plugin tools MCP server lists memory_search.
--- a/qa/scenarios/plugins/skill-install-hot-availability.md
+++ b/qa/scenarios/plugins/skill-install-hot-availability.md
@@ -4,6 +4,11 @@
 id: skill-install-hot-availability
 title: Skill install hot availability
 surface: skills
+coverage:
+  primary:
+    - plugins.skills
+  secondary:
+    - plugins.hot-install
 objective: Verify a newly added workspace skill shows up without a broken intermediate state and can influence the next turn immediately.
 successCriteria:
  - Skill is absent before install.
--- a/qa/scenarios/plugins/skill-visibility-invocation.md
+++ b/qa/scenarios/plugins/skill-visibility-invocation.md
@@ -4,6 +4,11 @@
 id: skill-visibility-invocation
 title: Skill visibility and invocation
 surface: skills
+coverage:
+  primary:
+    - plugins.skills
+  secondary:
+    - tools.invocation
 objective: Verify a workspace skill becomes visible in skills.status and influences the next agent turn.
 successCriteria:
  - skills.status reports the seeded skill as visible and eligible.
--- a/qa/scenarios/runtime/approval-turn-tool-followthrough.md
+++ b/qa/scenarios/runtime/approval-turn-tool-followthrough.md
@@ -4,6 +4,11 @@
 id: approval-turn-tool-followthrough
 title: Approval turn tool followthrough
 surface: harness
+coverage:
+  primary:
+    - runtime.approvals
+  secondary:
+    - tools.followthrough
 objective: Verify a short approval like "ok do it" triggers immediate tool use instead of fake-progress narration.
 successCriteria:
  - Agent can keep the pre-action turn brief.
--- a/qa/scenarios/runtime/compaction-retry-mutating-tool.md
+++ b/qa/scenarios/runtime/compaction-retry-mutating-tool.md
@@ -4,6 +4,11 @@
 id: compaction-retry-mutating-tool
 title: Compaction retry after mutating tool
 surface: runtime
+coverage:
+  primary:
+    - runtime.compaction
+  secondary:
+    - runtime.retry-policy
 objective: Verify a real mutating tool step keeps replay-unsafety explicit instead of disappearing into a clean-looking success if the run compacts or retries.
 successCriteria:
  - Agent reads the seeded large context before it writes.
--- a/qa/scenarios/runtime/empty-response-recovery-replay-safe-read.md
+++ b/qa/scenarios/runtime/empty-response-recovery-replay-safe-read.md
@@ -4,6 +4,11 @@
 id: empty-response-recovery-replay-safe-read
 title: Empty-response recovery after replay-safe read
 surface: runtime
+coverage:
+  primary:
+    - runtime.empty-response-recovery
+  secondary:
+    - runtime.retry-policy
 objective: Verify an empty visible GPT turn after a replay-safe read auto-continues into a visible answer.
 successCriteria:
  - Scenario is mock-openai only so live lanes do not pick it up implicitly.
--- a/qa/scenarios/runtime/empty-response-retry-budget-exhausted.md
+++ b/qa/scenarios/runtime/empty-response-retry-budget-exhausted.md
@@ -4,6 +4,11 @@
 id: empty-response-retry-budget-exhausted
 title: Empty-response retry budget exhausted
 surface: runtime
+coverage:
+  primary:
+    - runtime.empty-response-recovery
+  secondary:
+    - runtime.retry-policy
 objective: Verify repeated empty GPT turns exhaust the retry budget after one continuation attempt.
 successCriteria:
  - Scenario is mock-openai only so live lanes do not pick it up implicitly.
--- a/qa/scenarios/runtime/reasoning-only-no-auto-retry-after-write.md
+++ b/qa/scenarios/runtime/reasoning-only-no-auto-retry-after-write.md
@@ -4,6 +4,11 @@
 id: reasoning-only-no-auto-retry-after-write
 title: Reasoning-only no-auto-retry after write
 surface: runtime
+coverage:
+  primary:
+    - runtime.reasoning-only-recovery
+  secondary:
+    - runtime.retry-policy
 objective: Verify a GPT-style reasoning-only turn after a mutating write stays replay-unsafe and does not auto-retry.
 successCriteria:
  - Scenario is mock-openai only so live lanes do not pick it up implicitly.
--- a/qa/scenarios/runtime/reasoning-only-recovery-replay-safe-read.md
+++ b/qa/scenarios/runtime/reasoning-only-recovery-replay-safe-read.md
@@ -4,6 +4,11 @@
 id: reasoning-only-recovery-replay-safe-read
 title: Reasoning-only recovery after replay-safe read
 surface: runtime
+coverage:
+  primary:
+    - runtime.reasoning-only-recovery
+  secondary:
+    - runtime.retry-policy
 objective: Verify a GPT-style reasoning-only turn after a replay-safe read auto-continues into a visible answer.
 successCriteria:
  - Scenario is mock-openai only so live lanes do not pick it up implicitly.
--- a/qa/scenarios/runtime/runtime-inventory-drift-check.md
+++ b/qa/scenarios/runtime/runtime-inventory-drift-check.md
@@ -4,6 +4,9 @@
 id: runtime-inventory-drift-check
 title: Runtime inventory drift check
 surface: inventory
+coverage:
+  primary:
+    - runtime.inventory
 objective: Verify tools.effective and skills.status stay aligned with runtime behavior after config changes.
 successCriteria:
  - Enabled tool appears before the config change.
--- a/qa/scenarios/scheduling/cron-one-minute-ping.md
+++ b/qa/scenarios/scheduling/cron-one-minute-ping.md
@@ -4,6 +4,11 @@
 id: cron-one-minute-ping
 title: Cron one-minute ping
 surface: cron
+coverage:
+  primary:
+    - scheduling.cron
+  secondary:
+    - channels.qa-channel
 objective: Verify the agent can schedule a cron reminder one minute in the future and receive the follow-up in the QA channel.
 successCriteria:
  - Agent schedules a cron reminder roughly one minute ahead.
--- a/qa/scenarios/ui/control-ui-qa-channel-image-roundtrip.md
+++ b/qa/scenarios/ui/control-ui-qa-channel-image-roundtrip.md
@@ -4,6 +4,12 @@
 id: control-ui-qa-channel-image-roundtrip
 title: Control UI plus qa-channel image roundtrip
 surface: control-ui
+coverage:
+  primary:
+    - ui.control
+  secondary:
+    - media.image-understanding
+    - channels.qa-channel
 objective: Verify the embedded Control UI can observe a qa-channel-backed session while the fake channel injects text and image turns that the agent answers correctly.
 successCriteria:
  - Control UI opens directly on the target qa-channel session.
--- a/qa/scenarios/workspace/lobster-invaders-build.md
+++ b/qa/scenarios/workspace/lobster-invaders-build.md
@@ -4,6 +4,11 @@
 id: lobster-invaders-build
 title: Build Lobster Invaders
 surface: workspace
+coverage:
+  primary:
+    - workspace.artifacts
+  secondary:
+    - workspace.builds
 objective: Verify the agent can read the repo, create a tiny playable artifact, and report what changed.
 successCriteria:
  - Agent inspects source before coding.
--- a/qa/scenarios/workspace/medium-game-plan-codex-harness.md
+++ b/qa/scenarios/workspace/medium-game-plan-codex-harness.md
@@ -4,6 +4,11 @@
 id: medium-game-plan-codex-harness
 title: Medium game plan Codex harness
 surface: workspace
+coverage:
+  primary:
+    - workspace.planning
+  secondary:
+    - models.codex-cli
 objective: Verify the Codex app-server harness can plan and build a medium-complex self-contained browser game.
 successCriteria:
  - A live-frontier run fails fast unless the selected primary model is codex/gpt-5.4.
--- a/qa/scenarios/workspace/medium-game-plan-pi-harness.md
+++ b/qa/scenarios/workspace/medium-game-plan-pi-harness.md
@@ -4,6 +4,11 @@
 id: medium-game-plan-pi-harness
 title: Medium game plan PI harness
 surface: workspace
+coverage:
+  primary:
+    - workspace.planning
+  secondary:
+    - agents.pi-harness
 objective: Verify GPT-5.4 can use the PI harness to plan and build a medium-complex self-contained browser game.
 successCriteria:
  - A live-frontier run fails fast unless the selected primary model is openai/gpt-5.4.
--- a/qa/scenarios/workspace/source-docs-discovery-report.md
+++ b/qa/scenarios/workspace/source-docs-discovery-report.md
@@ -4,6 +4,11 @@
 id: source-docs-discovery-report
 title: Source and docs discovery report
 surface: discovery
+coverage:
+  primary:
+    - workspace.repo-discovery
+  secondary:
+    - docs.discovery
 objective: Verify the agent can read repo docs and source, expand the QA plan, and publish a worked or did-not-work report.
 successCriteria:
  - Agent reads docs and source before proposing more tests.