From 3a1e46973235b503167e976d493a71613c22a8f5 Mon Sep 17 00:00:00 2001 From: Gustavo Madeira Santana Date: Fri, 17 Apr 2026 14:01:20 -0400 Subject: [PATCH] QA: track scenario coverage intent --- extensions/qa-lab/src/cli.runtime.test.ts | 8 + extensions/qa-lab/src/cli.runtime.ts | 25 +++ extensions/qa-lab/src/cli.test.ts | 26 ++- extensions/qa-lab/src/cli.ts | 15 ++ extensions/qa-lab/src/coverage-report.test.ts | 31 +++ extensions/qa-lab/src/coverage-report.ts | 192 ++++++++++++++++++ .../qa-lab/src/scenario-catalog.test.ts | 2 + extensions/qa-lab/src/scenario-catalog.ts | 41 ++++ qa/README.md | 1 + ...instruction-followthrough-repo-contract.md | 5 + .../agents/subagent-fanout-synthesis.md | 5 + qa/scenarios/agents/subagent-handoff.md | 3 + .../channels/channel-chat-baseline.md | 5 + qa/scenarios/channels/dm-chat-baseline.md | 5 + qa/scenarios/channels/reaction-edit-delete.md | 5 + qa/scenarios/channels/thread-follow-up.md | 5 + .../character/character-vibes-c3po.md | 5 + .../character/character-vibes-gollum.md | 5 + .../config/config-apply-restart-wakeup.md | 5 + qa/scenarios/config/config-patch-hot-apply.md | 5 + .../config/config-restart-capability-flip.md | 5 + qa/scenarios/index.md | 15 +- .../media/image-generation-roundtrip.md | 5 + .../media/image-understanding-attachment.md | 5 + qa/scenarios/media/native-image-generation.md | 5 + .../memory/active-memory-preprompt-recall.md | 5 + qa/scenarios/memory/memory-dreaming-sweep.md | 3 + .../memory/memory-failure-fallback.md | 5 + qa/scenarios/memory/memory-recall.md | 3 + .../memory/memory-tools-channel-context.md | 5 + qa/scenarios/memory/session-memory-ranking.md | 5 + .../memory/thread-memory-isolation.md | 5 + .../models/anthropic-opus-api-key-smoke.md | 5 + .../anthropic-opus-setup-token-smoke.md | 5 + ...-cli-provider-capabilities-subscription.md | 5 + .../claude-cli-provider-capabilities.md | 5 + .../models/codex-harness-no-meta-leak.md | 5 + qa/scenarios/models/model-switch-follow-up.md | 5 + 
.../models/model-switch-tool-continuity.md | 5 + .../plugins/bundled-plugin-skill-runtime.md | 5 + qa/scenarios/plugins/mcp-plugin-tools-call.md | 5 + .../plugins/skill-install-hot-availability.md | 5 + .../plugins/skill-visibility-invocation.md | 5 + .../approval-turn-tool-followthrough.md | 5 + .../runtime/compaction-retry-mutating-tool.md | 5 + ...mpty-response-recovery-replay-safe-read.md | 5 + .../empty-response-retry-budget-exhausted.md | 5 + ...easoning-only-no-auto-retry-after-write.md | 5 + ...easoning-only-recovery-replay-safe-read.md | 5 + .../runtime/runtime-inventory-drift-check.md | 3 + .../scheduling/cron-one-minute-ping.md | 5 + .../control-ui-qa-channel-image-roundtrip.md | 6 + .../workspace/lobster-invaders-build.md | 5 + .../medium-game-plan-codex-harness.md | 5 + .../workspace/medium-game-plan-pi-harness.md | 5 + .../workspace/source-docs-discovery-report.md | 5 + 56 files changed, 576 insertions(+), 3 deletions(-) create mode 100644 extensions/qa-lab/src/coverage-report.test.ts create mode 100644 extensions/qa-lab/src/coverage-report.ts diff --git a/extensions/qa-lab/src/cli.runtime.test.ts b/extensions/qa-lab/src/cli.runtime.test.ts index a87d055b068..418d8a001c1 100644 --- a/extensions/qa-lab/src/cli.runtime.test.ts +++ b/extensions/qa-lab/src/cli.runtime.test.ts @@ -72,6 +72,7 @@ import { runQaDockerScaffoldCommand, runQaDockerUpCommand, runQaCharacterEvalCommand, + runQaCoverageReportCommand, runQaManualLaneCommand, runQaParityReportCommand, runQaSuiteCommand, @@ -336,6 +337,13 @@ describe("qa cli runtime", () => { } }); + it("prints a markdown coverage report from scenario metadata", async () => { + await runQaCoverageReportCommand({ repoRoot: process.cwd() }); + + expect(stdoutWrite).toHaveBeenCalledWith(expect.stringContaining("# QA Coverage Inventory")); + expect(stdoutWrite).toHaveBeenCalledWith(expect.stringContaining("memory.recall")); + }); + it("resolves character eval paths and passes model refs through", async () => { await 
runQaCharacterEvalCommand({ repoRoot: "/tmp/openclaw-repo", diff --git a/extensions/qa-lab/src/cli.runtime.ts b/extensions/qa-lab/src/cli.runtime.ts index 6b2f78a86ce..8fa2ba216a2 100644 --- a/extensions/qa-lab/src/cli.runtime.ts +++ b/extensions/qa-lab/src/cli.runtime.ts @@ -9,6 +9,7 @@ import { import { resolveQaParityPackScenarioIds } from "./agentic-parity.js"; import { runQaCharacterEval, type QaCharacterModelOptions } from "./character-eval.js"; import { resolveRepoRelativeOutputDir } from "./cli-paths.js"; +import { buildQaCoverageInventory, renderQaCoverageMarkdownReport } from "./coverage-report.js"; import { buildQaDockerHarnessImage, writeQaDockerHarnessFiles } from "./docker-harness.js"; import { runQaDockerUp } from "./docker-up.runtime.js"; import type { QaCliBackendAuthMode } from "./gateway-child.js"; @@ -36,6 +37,7 @@ import { type QaProviderMode, type QaProviderModeInput, } from "./run-config.js"; +import { readQaScenarioPack } from "./scenario-catalog.js"; import { runQaSuiteFromRuntime } from "./suite-launch.runtime.js"; type InterruptibleServer = { @@ -442,6 +444,29 @@ export async function runQaParityReportCommand(opts: { process.exitCode = 1; } } + +export async function runQaCoverageReportCommand(opts: { + repoRoot?: string; + output?: string; + json?: boolean; +}) { + const repoRoot = path.resolve(opts.repoRoot ?? process.cwd()); + const inventory = buildQaCoverageInventory(readQaScenarioPack().scenarios); + const outputPath = opts.output ? path.resolve(repoRoot, opts.output) : undefined; + const body = opts.json + ? 
`${JSON.stringify(inventory, null, 2)}\n` + : renderQaCoverageMarkdownReport(inventory); + + if (outputPath) { + await fs.mkdir(path.dirname(outputPath), { recursive: true }); + await fs.writeFile(outputPath, body, "utf8"); + process.stdout.write(`QA coverage report: ${outputPath}\n`); + return; + } + + process.stdout.write(body); +} + export async function runQaCharacterEvalCommand(opts: { repoRoot?: string; outputDir?: string; diff --git a/extensions/qa-lab/src/cli.test.ts b/extensions/qa-lab/src/cli.test.ts index f71ba5ca5f7..5db64663f3e 100644 --- a/extensions/qa-lab/src/cli.test.ts +++ b/extensions/qa-lab/src/cli.test.ts @@ -44,12 +44,14 @@ const { runQaCredentialsAddCommand, runQaCredentialsListCommand, runQaCredentialsRemoveCommand, + runQaCoverageReportCommand, runQaProviderServerCommand, runQaTelegramCommand, } = vi.hoisted(() => ({ runQaCredentialsAddCommand: vi.fn(), runQaCredentialsListCommand: vi.fn(), runQaCredentialsRemoveCommand: vi.fn(), + runQaCoverageReportCommand: vi.fn(), runQaProviderServerCommand: vi.fn(), runQaTelegramCommand: vi.fn(), })); @@ -72,6 +74,7 @@ vi.mock("./cli.runtime.js", () => ({ runQaCredentialsAddCommand, runQaCredentialsListCommand, runQaCredentialsRemoveCommand, + runQaCoverageReportCommand, runQaProviderServerCommand, })); @@ -85,6 +88,7 @@ describe("qa cli registration", () => { runQaCredentialsAddCommand.mockReset(); runQaCredentialsListCommand.mockReset(); runQaCredentialsRemoveCommand.mockReset(); + runQaCoverageReportCommand.mockReset(); runQaProviderServerCommand.mockReset(); runQaTelegramCommand.mockReset(); listQaRunnerCliContributions @@ -101,10 +105,30 @@ describe("qa cli registration", () => { const qa = program.commands.find((command) => command.name() === "qa"); expect(qa).toBeDefined(); expect(qa?.commands.map((command) => command.name())).toEqual( - expect.arrayContaining([TEST_QA_RUNNER.commandName, "telegram", "credentials"]), + expect.arrayContaining([TEST_QA_RUNNER.commandName, "telegram", 
"credentials", "coverage"]), ); }); + it("routes coverage report flags into the qa runtime command", async () => { + await program.parseAsync([ + "node", + "openclaw", + "qa", + "coverage", + "--repo-root", + "/tmp/openclaw-repo", + "--output", + ".artifacts/qa-coverage.md", + "--json", + ]); + + expect(runQaCoverageReportCommand).toHaveBeenCalledWith({ + repoRoot: "/tmp/openclaw-repo", + output: ".artifacts/qa-coverage.md", + json: true, + }); + }); + it("delegates discovered qa runner registration through the generic host seam", () => { const [{ registration }] = listQaRunnerCliContributions.mock.results[0]?.value; expect(registration.register).toHaveBeenCalledTimes(1); diff --git a/extensions/qa-lab/src/cli.ts b/extensions/qa-lab/src/cli.ts index 0abba6901d8..d4f1feea9aa 100644 --- a/extensions/qa-lab/src/cli.ts +++ b/extensions/qa-lab/src/cli.ts @@ -60,6 +60,12 @@ async function runQaParityReport(opts: { const runtime = await loadQaLabCliRuntime(); await runtime.runQaParityReportCommand(opts); } + +async function runQaCoverageReport(opts: { repoRoot?: string; output?: string; json?: boolean }) { + const runtime = await loadQaLabCliRuntime(); + await runtime.runQaCoverageReportCommand(opts); +} + async function runQaCharacterEval(opts: { repoRoot?: string; outputDir?: string; @@ -302,6 +308,15 @@ export function registerQaLabCli(program: Command) { }, ); + qa.command("coverage") + .description("Print the markdown scenario coverage inventory") + .option("--repo-root <path>", "Repository root to target when writing --output") + .option("--output <path>", "Write the coverage inventory to this path") + .option("--json", "Print JSON instead of Markdown", false) + .action(async (opts: { repoRoot?: string; output?: string; json?: boolean }) => { + await runQaCoverageReport(opts); + }); + qa.command("character-eval") .description("Run the character QA scenario across live models and write a judged report") .option("--repo-root <path>", "Repository root to target when running from a 
neutral cwd") diff --git a/extensions/qa-lab/src/coverage-report.test.ts b/extensions/qa-lab/src/coverage-report.test.ts new file mode 100644 index 00000000000..2ced93d062c --- /dev/null +++ b/extensions/qa-lab/src/coverage-report.test.ts @@ -0,0 +1,31 @@ +import { describe, expect, it } from "vitest"; +import { buildQaCoverageInventory, renderQaCoverageMarkdownReport } from "./coverage-report.js"; +import { readQaScenarioPack } from "./scenario-catalog.js"; + +describe("qa coverage report", () => { + it("groups scenario coverage metadata by theme and surface", () => { + const inventory = buildQaCoverageInventory(readQaScenarioPack().scenarios); + + expect(inventory.scenarioCount).toBeGreaterThan(0); + expect(inventory.coverageIdCount).toBeGreaterThan(0); + expect(inventory.primaryCoverageIdCount).toBeGreaterThan(0); + expect(inventory.secondaryCoverageIdCount).toBeGreaterThan(0); + expect(inventory.overlappingCoverage.length).toBeGreaterThan(0); + expect(inventory.missingCoverage).toEqual([]); + expect(inventory.byTheme.memory.some((feature) => feature.id === "memory.recall")).toBe(true); + expect(inventory.bySurface.memory.some((feature) => feature.id === "memory.recall")).toBe(true); + }); + + it("renders a compact markdown inventory", () => { + const report = renderQaCoverageMarkdownReport( + buildQaCoverageInventory(readQaScenarioPack().scenarios), + ); + + expect(report).toContain("# QA Coverage Inventory"); + expect(report).toContain("- Missing coverage metadata: 0"); + expect(report).toContain("- Overlapping coverage IDs:"); + expect(report).toContain("memory.recall"); + expect(report).toContain("primary: memory-recall (qa/scenarios/memory/memory-recall.md)"); + expect(report).toContain("secondary: active-memory-preprompt-recall"); + }); +}); diff --git a/extensions/qa-lab/src/coverage-report.ts b/extensions/qa-lab/src/coverage-report.ts new file mode 100644 index 00000000000..7ea2fa574a7 --- /dev/null +++ b/extensions/qa-lab/src/coverage-report.ts @@ -0,0 
+1,192 @@ +import type { QaSeedScenarioWithSource } from "./scenario-catalog.js"; + +export type QaCoverageScenarioSummary = { + id: string; + title: string; + sourcePath: string; + theme: string; + surfaces: string[]; + risk: string; +}; + +export type QaCoverageIntent = "primary" | "secondary"; + +export type QaCoverageScenarioReference = QaCoverageScenarioSummary & { + intent: QaCoverageIntent; +}; + +export type QaCoverageFeatureSummary = { + id: string; + scenarios: QaCoverageScenarioReference[]; +}; + +export type QaCoverageInventory = { + scenarioCount: number; + coverageIdCount: number; + primaryCoverageIdCount: number; + secondaryCoverageIdCount: number; + features: QaCoverageFeatureSummary[]; + overlappingCoverage: QaCoverageFeatureSummary[]; + missingCoverage: QaCoverageScenarioSummary[]; + byTheme: Record<string, QaCoverageFeatureSummary[]>; + bySurface: Record<string, QaCoverageFeatureSummary[]>; +}; + +function scenarioTheme(sourcePath: string) { + const parts = sourcePath.split("/"); + return parts[2] ?? "unknown"; +} + +function scenarioSurfaces(scenario: QaSeedScenarioWithSource) { + return scenario.surfaces && scenario.surfaces.length > 0 ? scenario.surfaces : [scenario.surface]; +} + +function scenarioRisk(scenario: QaSeedScenarioWithSource) { + return scenario.risk ?? scenario.riskLevel ?? 
"unassigned"; +} + +function summarizeScenario(scenario: QaSeedScenarioWithSource): QaCoverageScenarioSummary { + return { + id: scenario.id, + title: scenario.title, + sourcePath: scenario.sourcePath, + theme: scenarioTheme(scenario.sourcePath), + surfaces: scenarioSurfaces(scenario), + risk: scenarioRisk(scenario), + }; +} + +function sortFeatures(features: readonly QaCoverageFeatureSummary[]) { + return features.toSorted((left, right) => left.id.localeCompare(right.id)); +} + +export function buildQaCoverageInventory( + scenarios: readonly QaSeedScenarioWithSource[], +): QaCoverageInventory { + const byCoverageId = new Map<string, QaCoverageFeatureSummary>(); + const primaryCoverageIds = new Set<string>(); + const secondaryCoverageIds = new Set<string>(); + const missingCoverage: QaCoverageScenarioSummary[] = []; + + const addCoverage = ( + scenario: QaSeedScenarioWithSource, + coverageIds: readonly string[] | undefined, + intent: QaCoverageIntent, + ) => { + const summary = summarizeScenario(scenario); + for (const coverageId of coverageIds ?? []) { + const feature = byCoverageId.get(coverageId) ?? 
{ + id: coverageId, + scenarios: [], + }; + feature.scenarios.push({ ...summary, intent }); + byCoverageId.set(coverageId, feature); + if (intent === "primary") { + primaryCoverageIds.add(coverageId); + } else { + secondaryCoverageIds.add(coverageId); + } + } + }; + + for (const scenario of scenarios) { + if (!scenario.coverage) { + missingCoverage.push(summarizeScenario(scenario)); + continue; + } + addCoverage(scenario, scenario.coverage.primary, "primary"); + addCoverage(scenario, scenario.coverage.secondary, "secondary"); + } + + const features = sortFeatures([...byCoverageId.values()]); + const overlappingCoverage = features.filter((feature) => feature.scenarios.length > 1); + const byTheme: Record<string, QaCoverageFeatureSummary[]> = {}; + const bySurface: Record<string, QaCoverageFeatureSummary[]> = {}; + + for (const feature of features) { + const themes = new Set(feature.scenarios.map((scenario) => scenario.theme)); + for (const theme of themes) { + byTheme[theme] ??= []; + byTheme[theme].push({ + ...feature, + scenarios: feature.scenarios.filter((scenario) => scenario.theme === theme), + }); + } + const surfaces = new Set(feature.scenarios.flatMap((scenario) => scenario.surfaces)); + for (const surface of surfaces) { + bySurface[surface] ??= []; + bySurface[surface].push({ + ...feature, + scenarios: feature.scenarios.filter((scenario) => scenario.surfaces.includes(surface)), + }); + } + } + + return { + scenarioCount: scenarios.length, + coverageIdCount: features.length, + primaryCoverageIdCount: primaryCoverageIds.size, + secondaryCoverageIdCount: secondaryCoverageIds.size, + features, + overlappingCoverage, + missingCoverage, + byTheme, + bySurface, + }; +} + +function pushFeatureLines(lines: string[], features: readonly QaCoverageFeatureSummary[]) { + for (const feature of sortFeatures(features)) { + const scenarios = feature.scenarios + .map((scenario) => `${scenario.intent}: ${scenario.id} (${scenario.sourcePath})`) + .join(", "); + lines.push(`- ${feature.id}: ${scenarios}`); + } +} + +export function 
renderQaCoverageMarkdownReport(inventory: QaCoverageInventory): string { + const lines: string[] = [ + "# QA Coverage Inventory", + "", + `- Scenarios: ${inventory.scenarioCount}`, + `- Coverage IDs: ${inventory.coverageIdCount}`, + `- Primary coverage IDs: ${inventory.primaryCoverageIdCount}`, + `- Secondary coverage IDs: ${inventory.secondaryCoverageIdCount}`, + `- Overlapping coverage IDs: ${inventory.overlappingCoverage.length}`, + `- Missing coverage metadata: ${inventory.missingCoverage.length}`, + "", + "## By Theme", + "", + ]; + + for (const theme of Object.keys(inventory.byTheme).toSorted()) { + lines.push(`### ${theme}`, ""); + pushFeatureLines(lines, inventory.byTheme[theme] ?? []); + lines.push(""); + } + + lines.push("## By Surface", ""); + for (const surface of Object.keys(inventory.bySurface).toSorted()) { + lines.push(`### ${surface}`, ""); + pushFeatureLines(lines, inventory.bySurface[surface] ?? []); + lines.push(""); + } + + if (inventory.overlappingCoverage.length > 0) { + lines.push("## Overlap", ""); + pushFeatureLines(lines, inventory.overlappingCoverage); + lines.push(""); + } + + if (inventory.missingCoverage.length > 0) { + lines.push("## Missing Metadata", ""); + for (const scenario of inventory.missingCoverage.toSorted((left, right) => + left.id.localeCompare(right.id), + )) { + lines.push(`- ${scenario.id}: ${scenario.sourcePath}`); + } + lines.push(""); + } + + return `${lines.join("\n").trimEnd()}\n`; +} diff --git a/extensions/qa-lab/src/scenario-catalog.test.ts b/extensions/qa-lab/src/scenario-catalog.test.ts index dbebaa1182d..c237283535c 100644 --- a/extensions/qa-lab/src/scenario-catalog.test.ts +++ b/extensions/qa-lab/src/scenario-catalog.test.ts @@ -27,6 +27,8 @@ describe("qa scenario catalog", () => { expect(pack.scenarios.some((scenario) => scenario.id === "character-vibes-c3po")).toBe(true); expect(pack.scenarios.every((scenario) => scenario.execution?.kind === "flow")).toBe(true); expect(pack.scenarios.some((scenario) => 
scenario.execution.flow?.steps.length)).toBe(true); + expect(pack.scenarios.every((scenario) => scenario.coverage?.primary.length)).toBe(true); + expect(readQaScenarioById("memory-recall").coverage?.primary).toContain("memory.recall"); }); it("exposes bootstrap data from the markdown pack", () => { diff --git a/extensions/qa-lab/src/scenario-catalog.ts b/extensions/qa-lab/src/scenario-catalog.ts index 64dee666683..496ad55f96e 100644 --- a/extensions/qa-lab/src/scenario-catalog.ts +++ b/extensions/qa-lab/src/scenario-catalog.ts @@ -51,6 +51,44 @@ const qaScenarioExecutionSchema = z.object({ config: qaScenarioConfigSchema.optional(), }); +const qaCoverageIdSchema = z + .string() + .trim() + .regex(/^[a-z0-9]+(?:[.-][a-z0-9]+)*$/, { + message: "coverage ids must use lowercase dotted or dashed tokens", + }); + +const qaCoverageIdListSchema = z.array(qaCoverageIdSchema).min(1); + +const qaScenarioCoverageSchema = z + .object({ + primary: qaCoverageIdListSchema, + secondary: qaCoverageIdListSchema.optional(), + }) + .superRefine((coverage, ctx) => { + const seen = new Set(); + const coverageEntries = [ + ["primary", coverage.primary], + ["secondary", coverage.secondary], + ] as const; + for (const [intent, ids] of coverageEntries) { + if (!ids) { + continue; + } + for (const [index, id] of ids.entries()) { + if (!seen.has(id)) { + seen.add(id); + continue; + } + ctx.addIssue({ + code: z.ZodIssueCode.custom, + path: [intent, index], + message: `duplicate coverage id: ${id}`, + }); + } + } + }); + const qaScenarioGatewayRuntimeSchema = z.object({ forwardHostHome: z.boolean().optional(), }); @@ -138,6 +176,9 @@ const qaSeedScenarioSchema = z.object({ title: z.string().trim().min(1), surface: z.string().trim().min(1), category: z.string().trim().min(1).optional(), + coverage: qaScenarioCoverageSchema.optional(), + surfaces: z.array(z.string().trim().min(1)).min(1).optional(), + risk: z.enum(["low", "medium", "high"]).optional(), capabilities: 
z.array(z.string().trim().min(1)).optional(), lane: z.record(z.string(), z.union([z.boolean(), z.string()])).optional(), riskLevel: z.string().trim().min(1).optional(), diff --git a/qa/README.md b/qa/README.md index 98447b0c65c..cc07d65936d 100644 --- a/qa/README.md +++ b/qa/README.md @@ -13,5 +13,6 @@ Key workflow: - `qa suite` is the executable frontier subset / regression loop. - `qa manual` is the scoped personality and style probe after the executable subset is green. +- `qa coverage` prints the scenario coverage inventory from scenario frontmatter. Keep this folder in git. Add new scenarios here before wiring them into automation. diff --git a/qa/scenarios/agents/instruction-followthrough-repo-contract.md b/qa/scenarios/agents/instruction-followthrough-repo-contract.md index 8605da10c8c..8a7d756d298 100644 --- a/qa/scenarios/agents/instruction-followthrough-repo-contract.md +++ b/qa/scenarios/agents/instruction-followthrough-repo-contract.md @@ -4,6 +4,11 @@ id: instruction-followthrough-repo-contract title: Instruction followthrough repo contract surface: repo-contract +coverage: + primary: + - agents.instructions + secondary: + - runtime.first-action objective: Verify the agent reads repo instruction files first, follows the required tool order, and completes the first feasible action instead of stopping at a plan. successCriteria: - Agent reads the seeded instruction files before writing the requested artifact. 
diff --git a/qa/scenarios/agents/subagent-fanout-synthesis.md b/qa/scenarios/agents/subagent-fanout-synthesis.md index 60104f44de6..e8932431f01 100644 --- a/qa/scenarios/agents/subagent-fanout-synthesis.md +++ b/qa/scenarios/agents/subagent-fanout-synthesis.md @@ -4,6 +4,11 @@ id: subagent-fanout-synthesis title: Subagent fanout synthesis surface: subagents +coverage: + primary: + - agents.subagents + secondary: + - agents.synthesis objective: Verify the agent can delegate multiple bounded subagent tasks and fold both results back into one parent reply. successCriteria: - Parent flow launches at least two bounded subagent tasks. diff --git a/qa/scenarios/agents/subagent-handoff.md b/qa/scenarios/agents/subagent-handoff.md index 74853aa65d9..328935fbf06 100644 --- a/qa/scenarios/agents/subagent-handoff.md +++ b/qa/scenarios/agents/subagent-handoff.md @@ -4,6 +4,9 @@ id: subagent-handoff title: Subagent handoff surface: subagents +coverage: + primary: + - agents.subagents objective: Verify the agent can delegate a bounded task to a subagent and fold the result back into the main thread. successCriteria: - Agent launches a bounded subagent task. diff --git a/qa/scenarios/channels/channel-chat-baseline.md b/qa/scenarios/channels/channel-chat-baseline.md index 50d4b65b734..2aa90a60641 100644 --- a/qa/scenarios/channels/channel-chat-baseline.md +++ b/qa/scenarios/channels/channel-chat-baseline.md @@ -4,6 +4,11 @@ id: channel-chat-baseline title: Channel baseline conversation surface: channel +coverage: + primary: + - channels.group-messages + secondary: + - channels.qa-channel objective: Verify the QA agent can respond correctly in a shared channel and respect mention-driven group semantics. successCriteria: - Agent replies in the shared channel transcript. 
diff --git a/qa/scenarios/channels/dm-chat-baseline.md b/qa/scenarios/channels/dm-chat-baseline.md index a38ec8b2066..39d8fd474b9 100644 --- a/qa/scenarios/channels/dm-chat-baseline.md +++ b/qa/scenarios/channels/dm-chat-baseline.md @@ -4,6 +4,11 @@ id: dm-chat-baseline title: DM baseline conversation surface: dm +coverage: + primary: + - channels.dm + secondary: + - channels.qa-channel objective: Verify the QA agent can chat coherently in a DM, explain the QA setup, and stay in character. successCriteria: - Agent replies in DM without channel routing mistakes. diff --git a/qa/scenarios/channels/reaction-edit-delete.md b/qa/scenarios/channels/reaction-edit-delete.md index 9d858bdc9f5..67fa230126f 100644 --- a/qa/scenarios/channels/reaction-edit-delete.md +++ b/qa/scenarios/channels/reaction-edit-delete.md @@ -4,6 +4,11 @@ id: reaction-edit-delete title: Reaction, edit, delete lifecycle surface: message-actions +coverage: + primary: + - channels.message-actions + secondary: + - channels.qa-channel objective: Verify the agent can use channel-owned message actions and that the QA transcript reflects them. successCriteria: - Agent adds at least one reaction. diff --git a/qa/scenarios/channels/thread-follow-up.md b/qa/scenarios/channels/thread-follow-up.md index 0349445179a..1d13db939cf 100644 --- a/qa/scenarios/channels/thread-follow-up.md +++ b/qa/scenarios/channels/thread-follow-up.md @@ -4,6 +4,11 @@ id: thread-follow-up title: Threaded follow-up surface: thread +coverage: + primary: + - channels.threads + secondary: + - channels.qa-channel objective: Verify the agent can keep follow-up work inside a thread and not leak context into the root channel. successCriteria: - Agent creates or uses a thread for deeper work. 
diff --git a/qa/scenarios/character/character-vibes-c3po.md b/qa/scenarios/character/character-vibes-c3po.md index d708a6aa3a8..c75aee1ccb5 100644 --- a/qa/scenarios/character/character-vibes-c3po.md +++ b/qa/scenarios/character/character-vibes-c3po.md @@ -4,6 +4,11 @@ id: character-vibes-c3po title: "Nervous release protocol chat" surface: character +coverage: + primary: + - character.persona + secondary: + - workspace.artifacts objective: Capture a natural multi-turn C-3PO-flavored character conversation with real workspace help so another model can later grade naturalness, vibe, and funniness from the raw transcript. successCriteria: - Agent gets a natural multi-turn conversation, and any missed replies stay visible in the transcript instead of aborting capture. diff --git a/qa/scenarios/character/character-vibes-gollum.md b/qa/scenarios/character/character-vibes-gollum.md index 0fc0d62b642..e004ad07baf 100644 --- a/qa/scenarios/character/character-vibes-gollum.md +++ b/qa/scenarios/character/character-vibes-gollum.md @@ -4,6 +4,11 @@ id: character-vibes-gollum title: "Late-night deploy helper chat" surface: character +coverage: + primary: + - character.persona + secondary: + - workspace.artifacts objective: Capture a natural multi-turn character conversation with real workspace help so another model can later grade naturalness, vibe, and funniness from the raw transcript. successCriteria: - Agent gets a natural multi-turn conversation, and any missed replies stay visible in the transcript instead of aborting capture. 
diff --git a/qa/scenarios/config/config-apply-restart-wakeup.md b/qa/scenarios/config/config-apply-restart-wakeup.md index a22e97dd424..981569b679e 100644 --- a/qa/scenarios/config/config-apply-restart-wakeup.md +++ b/qa/scenarios/config/config-apply-restart-wakeup.md @@ -4,6 +4,11 @@ id: config-apply-restart-wakeup title: Config apply restart wake-up surface: config +coverage: + primary: + - config.restart-apply + secondary: + - runtime.gateway-restart objective: Verify a restart-required config.apply restarts cleanly and delivers the post-restart wake message back into the QA channel. successCriteria: - config.apply schedules a restart-required change. diff --git a/qa/scenarios/config/config-patch-hot-apply.md b/qa/scenarios/config/config-patch-hot-apply.md index 218f5f5199c..5569e3424b2 100644 --- a/qa/scenarios/config/config-patch-hot-apply.md +++ b/qa/scenarios/config/config-patch-hot-apply.md @@ -4,6 +4,11 @@ id: config-patch-hot-apply title: Config patch skill disable surface: config +coverage: + primary: + - config.hot-apply + secondary: + - plugins.skills objective: Verify config.patch can disable a workspace skill and the restarted gateway exposes the new disabled state cleanly. successCriteria: - config.patch succeeds for the skill toggle change. diff --git a/qa/scenarios/config/config-restart-capability-flip.md b/qa/scenarios/config/config-restart-capability-flip.md index b25cb5e40b7..0b180b5955a 100644 --- a/qa/scenarios/config/config-restart-capability-flip.md +++ b/qa/scenarios/config/config-restart-capability-flip.md @@ -4,6 +4,11 @@ id: config-restart-capability-flip title: Config restart capability flip surface: config +coverage: + primary: + - config.restart-apply + secondary: + - plugins.capabilities objective: Verify a restart-triggering config change flips capability inventory and the same session successfully uses the newly restored tool after wake-up. successCriteria: - Capability is absent before the restart-triggering patch. 
diff --git a/qa/scenarios/index.md b/qa/scenarios/index.md index 29ad2d1d6aa..d1d1edd4ef2 100644 --- a/qa/scenarios/index.md +++ b/qa/scenarios/index.md @@ -5,13 +5,24 @@ Single source of truth for repo-backed QA suite bootstrap data. - `index.md` defines pack-level bootstrap data - each nested `*.md` scenario defines one runnable test via `qa-scenario` + `qa-flow` -- scenario markdown may also define category metadata, required plugins, lane filters, - and gateway config patching +- scenario markdown may also define coverage IDs, category metadata, required plugins, + lane filters, and gateway config patching - kickoff mission - QA operator identity - scenario files under one-level theme directories +Coverage tracking: + +- add `coverage.primary` IDs to each scenario's `qa-scenario` block +- add `coverage.secondary` only when a scenario intentionally protects another behavior +- keep IDs behavior-shaped, broad enough to reuse, lowercase, and dotted or dashed +- prefer reusing an existing feature ID over minting a scenario-shaped ID +- avoid copying the scenario title into coverage IDs +- use `pnpm openclaw qa coverage` to render the current inventory +- treat the old `coverage: ["id"]` / `coverage: - id` list shape as invalid +- keep source-path tracking in the report, not in the scenario schema + Theme directories: - `agents/` - agent behavior, instructions, and subagent flows diff --git a/qa/scenarios/media/image-generation-roundtrip.md b/qa/scenarios/media/image-generation-roundtrip.md index 430e0f9dfdb..a3ba5ba6a04 100644 --- a/qa/scenarios/media/image-generation-roundtrip.md +++ b/qa/scenarios/media/image-generation-roundtrip.md @@ -4,6 +4,11 @@ id: image-generation-roundtrip title: Image generation roundtrip surface: image-generation +coverage: + primary: + - media.image-generation + secondary: + - channels.qa-channel objective: Verify a generated image is saved as media, reattached on the next turn, and described correctly through the vision path. 
successCriteria: - image_generate produces a saved MEDIA artifact. diff --git a/qa/scenarios/media/image-understanding-attachment.md b/qa/scenarios/media/image-understanding-attachment.md index 31801ee207f..c76d39ea588 100644 --- a/qa/scenarios/media/image-understanding-attachment.md +++ b/qa/scenarios/media/image-understanding-attachment.md @@ -4,6 +4,11 @@ id: image-understanding-attachment title: Image understanding from attachment surface: image-understanding +coverage: + primary: + - media.image-understanding + secondary: + - channels.qa-channel objective: Verify an attached image reaches the agent model and the agent can describe what it sees. successCriteria: - Agent receives at least one image attachment. diff --git a/qa/scenarios/media/native-image-generation.md b/qa/scenarios/media/native-image-generation.md index 805c54a7bc4..3a9ab415c9e 100644 --- a/qa/scenarios/media/native-image-generation.md +++ b/qa/scenarios/media/native-image-generation.md @@ -4,6 +4,11 @@ id: native-image-generation title: Native image generation surface: image-generation +coverage: + primary: + - media.image-generation + secondary: + - tools.native-image-generation objective: Verify image_generate appears when configured and returns a real saved media artifact. successCriteria: - image_generate appears in the effective tool inventory. diff --git a/qa/scenarios/memory/active-memory-preprompt-recall.md b/qa/scenarios/memory/active-memory-preprompt-recall.md index 02ca35fbb92..4f9a1c506e0 100644 --- a/qa/scenarios/memory/active-memory-preprompt-recall.md +++ b/qa/scenarios/memory/active-memory-preprompt-recall.md @@ -4,6 +4,11 @@ id: active-memory-preprompt-recall title: Active Memory pre-reply recall surface: memory +coverage: + primary: + - memory.active-recall + secondary: + - memory.recall objective: Verify Active Memory surfaces a memory-only preference before the main reply, and that the same question stays unresolved when the plugin is off. 
plugins: - active-memory diff --git a/qa/scenarios/memory/memory-dreaming-sweep.md b/qa/scenarios/memory/memory-dreaming-sweep.md index acd01a3c640..38ff22a8408 100644 --- a/qa/scenarios/memory/memory-dreaming-sweep.md +++ b/qa/scenarios/memory/memory-dreaming-sweep.md @@ -4,6 +4,9 @@ id: memory-dreaming-sweep title: Memory dreaming sweep surface: memory +coverage: + primary: + - memory.dreaming objective: Verify enabling dreaming creates the managed sweep, stages light and REM artifacts, and consolidates repeated recall signals into durable memory. successCriteria: - Dreaming can be enabled and doctor.memory.status reports the managed sweep cron. diff --git a/qa/scenarios/memory/memory-failure-fallback.md b/qa/scenarios/memory/memory-failure-fallback.md index ed48187376a..f8ca52ca509 100644 --- a/qa/scenarios/memory/memory-failure-fallback.md +++ b/qa/scenarios/memory/memory-failure-fallback.md @@ -4,6 +4,11 @@ id: memory-failure-fallback title: Memory failure fallback surface: memory +coverage: + primary: + - memory.failure-handling + secondary: + - runtime.fallbacks objective: Verify the agent degrades gracefully when memory tools are unavailable and the answer exists only in memory-backed notes. successCriteria: - Memory tools are absent from the effective tool inventory. diff --git a/qa/scenarios/memory/memory-recall.md b/qa/scenarios/memory/memory-recall.md index 908cbdca72c..bc1657170a8 100644 --- a/qa/scenarios/memory/memory-recall.md +++ b/qa/scenarios/memory/memory-recall.md @@ -35,6 +35,9 @@ id: memory-recall title: Memory recall after context switch surface: memory +coverage: + primary: + - memory.recall objective: Verify the agent can store a fact, switch topics, then recall the fact accurately later. successCriteria: - Agent acknowledges the seeded fact. 
diff --git a/qa/scenarios/memory/memory-tools-channel-context.md b/qa/scenarios/memory/memory-tools-channel-context.md index a13a1173d27..8e470d4c42e 100644 --- a/qa/scenarios/memory/memory-tools-channel-context.md +++ b/qa/scenarios/memory/memory-tools-channel-context.md @@ -4,6 +4,11 @@ id: memory-tools-channel-context title: Memory tools in channel context surface: memory +coverage: + primary: + - memory.tools + secondary: + - channels.group-messages objective: Verify the agent uses memory_search and memory_get in a shared channel when the answer lives only in memory files, not the live transcript. successCriteria: - Agent uses memory_search before answering. diff --git a/qa/scenarios/memory/session-memory-ranking.md b/qa/scenarios/memory/session-memory-ranking.md index dd153b5e6e6..a17dbcb24fb 100644 --- a/qa/scenarios/memory/session-memory-ranking.md +++ b/qa/scenarios/memory/session-memory-ranking.md @@ -4,6 +4,11 @@ id: session-memory-ranking title: Session memory ranking surface: memory +coverage: + primary: + - memory.ranking + secondary: + - memory.recall objective: Verify session-transcript memory can outrank stale durable notes and drive the final answer toward the newer fact. successCriteria: - Session memory indexing is enabled for the scenario. diff --git a/qa/scenarios/memory/thread-memory-isolation.md b/qa/scenarios/memory/thread-memory-isolation.md index 68d6923e603..49171352151 100644 --- a/qa/scenarios/memory/thread-memory-isolation.md +++ b/qa/scenarios/memory/thread-memory-isolation.md @@ -4,6 +4,11 @@ id: thread-memory-isolation title: Thread memory isolation surface: memory +coverage: + primary: + - memory.thread-isolation + secondary: + - channels.threads objective: Verify a memory-backed answer requested inside a thread stays in-thread and does not leak into the root channel. successCriteria: - Agent uses memory tools inside the thread. 
diff --git a/qa/scenarios/models/anthropic-opus-api-key-smoke.md b/qa/scenarios/models/anthropic-opus-api-key-smoke.md index b530620e50b..21b1f993171 100644 --- a/qa/scenarios/models/anthropic-opus-api-key-smoke.md +++ b/qa/scenarios/models/anthropic-opus-api-key-smoke.md @@ -4,6 +4,11 @@ id: anthropic-opus-api-key-smoke title: Anthropic Opus API key smoke surface: model-provider +coverage: + primary: + - models.provider-auth + secondary: + - models.anthropic objective: Verify the regular Anthropic Opus lane can complete a quick chat turn using API-key auth. successCriteria: - A live-frontier run fails fast unless the selected primary provider is anthropic. diff --git a/qa/scenarios/models/anthropic-opus-setup-token-smoke.md b/qa/scenarios/models/anthropic-opus-setup-token-smoke.md index df3a2ae6a06..231403d1e7c 100644 --- a/qa/scenarios/models/anthropic-opus-setup-token-smoke.md +++ b/qa/scenarios/models/anthropic-opus-setup-token-smoke.md @@ -4,6 +4,11 @@ id: anthropic-opus-setup-token-smoke title: Anthropic Opus setup-token smoke surface: model-provider +coverage: + primary: + - models.provider-auth + secondary: + - models.anthropic objective: Verify the regular Anthropic Opus lane can complete a quick chat turn using setup-token auth. successCriteria: - A live-frontier run fails fast unless the selected primary provider is anthropic. 
diff --git a/qa/scenarios/models/claude-cli-provider-capabilities-subscription.md b/qa/scenarios/models/claude-cli-provider-capabilities-subscription.md index 03d97d572fd..32778636aac 100644 --- a/qa/scenarios/models/claude-cli-provider-capabilities-subscription.md +++ b/qa/scenarios/models/claude-cli-provider-capabilities-subscription.md @@ -4,6 +4,11 @@ id: claude-cli-provider-capabilities-subscription title: Claude CLI provider capabilities subscription surface: model-provider +coverage: + primary: + - models.provider-capabilities + secondary: + - models.claude-cli objective: Verify the Claude CLI model-provider lane can use native Claude subscription auth to talk, read an attached image, use bundled MCP tools, and apply workspace skills. successCriteria: - A live-frontier run fails fast unless the selected primary provider is claude-cli. diff --git a/qa/scenarios/models/claude-cli-provider-capabilities.md b/qa/scenarios/models/claude-cli-provider-capabilities.md index 6d7cb123f27..f4b3cff31c7 100644 --- a/qa/scenarios/models/claude-cli-provider-capabilities.md +++ b/qa/scenarios/models/claude-cli-provider-capabilities.md @@ -4,6 +4,11 @@ id: claude-cli-provider-capabilities title: Claude CLI provider capabilities API key surface: model-provider +coverage: + primary: + - models.provider-capabilities + secondary: + - models.claude-cli objective: Verify the Claude CLI model-provider lane can use the Anthropic API key path to talk, read an attached image, use bundled MCP tools, and apply workspace skills. successCriteria: - A live-frontier run fails fast unless the selected primary provider is claude-cli. 
diff --git a/qa/scenarios/models/codex-harness-no-meta-leak.md b/qa/scenarios/models/codex-harness-no-meta-leak.md index a1ee6606207..1d568b003b0 100644 --- a/qa/scenarios/models/codex-harness-no-meta-leak.md +++ b/qa/scenarios/models/codex-harness-no-meta-leak.md @@ -4,6 +4,11 @@ id: codex-harness-no-meta-leak title: Codex harness no meta leak surface: dm +coverage: + primary: + - models.codex-cli + secondary: + - runtime.no-meta-leak objective: Verify the Codex app-server harness keeps coordination/meta chatter out of the visible reply. successCriteria: - The scenario forces the Codex embedded harness and disables PI fallback. diff --git a/qa/scenarios/models/model-switch-follow-up.md b/qa/scenarios/models/model-switch-follow-up.md index 2744dda4e47..733eff5e4fa 100644 --- a/qa/scenarios/models/model-switch-follow-up.md +++ b/qa/scenarios/models/model-switch-follow-up.md @@ -4,6 +4,11 @@ id: model-switch-follow-up title: Model switch follow-up surface: models +coverage: + primary: + - models.switching + secondary: + - runtime.session-continuity objective: Verify the agent can switch to a different configured model and continue coherently. successCriteria: - Agent reflects the model switch request. diff --git a/qa/scenarios/models/model-switch-tool-continuity.md b/qa/scenarios/models/model-switch-tool-continuity.md index 7e162b2e331..067bae0ec41 100644 --- a/qa/scenarios/models/model-switch-tool-continuity.md +++ b/qa/scenarios/models/model-switch-tool-continuity.md @@ -4,6 +4,11 @@ id: model-switch-tool-continuity title: Model switch with tool continuity surface: models +coverage: + primary: + - models.switching + secondary: + - runtime.tool-continuity objective: Verify switching models preserves session context and tool use instead of dropping into plain-text only behavior. successCriteria: - Alternate model is actually requested. 
diff --git a/qa/scenarios/plugins/bundled-plugin-skill-runtime.md b/qa/scenarios/plugins/bundled-plugin-skill-runtime.md index 0a959713abc..fdd29e141f5 100644 --- a/qa/scenarios/plugins/bundled-plugin-skill-runtime.md +++ b/qa/scenarios/plugins/bundled-plugin-skill-runtime.md @@ -4,6 +4,11 @@ id: bundled-plugin-skill-runtime title: Bundled plugin skill runtime surface: skills +coverage: + primary: + - plugins.skills + secondary: + - plugins.runtime objective: Verify packaged bundled plugin skills load from dist-runtime instead of being skipped by path-containment checks. successCriteria: - The runtime-packaged bundled plugin tree is used as OPENCLAW_BUNDLED_PLUGINS_DIR. diff --git a/qa/scenarios/plugins/mcp-plugin-tools-call.md b/qa/scenarios/plugins/mcp-plugin-tools-call.md index 04cdc26b79d..20a0f33a4b9 100644 --- a/qa/scenarios/plugins/mcp-plugin-tools-call.md +++ b/qa/scenarios/plugins/mcp-plugin-tools-call.md @@ -4,6 +4,11 @@ id: mcp-plugin-tools-call title: MCP plugin-tools call surface: mcp +coverage: + primary: + - plugins.mcp-tools + secondary: + - tools.invocation objective: Verify OpenClaw can expose plugin tools over MCP and a real MCP client can call one successfully. successCriteria: - Plugin tools MCP server lists memory_search. diff --git a/qa/scenarios/plugins/skill-install-hot-availability.md b/qa/scenarios/plugins/skill-install-hot-availability.md index 751f91e2230..39b669392c9 100644 --- a/qa/scenarios/plugins/skill-install-hot-availability.md +++ b/qa/scenarios/plugins/skill-install-hot-availability.md @@ -4,6 +4,11 @@ id: skill-install-hot-availability title: Skill install hot availability surface: skills +coverage: + primary: + - plugins.skills + secondary: + - plugins.hot-install objective: Verify a newly added workspace skill shows up without a broken intermediate state and can influence the next turn immediately. successCriteria: - Skill is absent before install. 
diff --git a/qa/scenarios/plugins/skill-visibility-invocation.md b/qa/scenarios/plugins/skill-visibility-invocation.md index 8ae68a2a302..4fc70003a48 100644 --- a/qa/scenarios/plugins/skill-visibility-invocation.md +++ b/qa/scenarios/plugins/skill-visibility-invocation.md @@ -4,6 +4,11 @@ id: skill-visibility-invocation title: Skill visibility and invocation surface: skills +coverage: + primary: + - plugins.skills + secondary: + - tools.invocation objective: Verify a workspace skill becomes visible in skills.status and influences the next agent turn. successCriteria: - skills.status reports the seeded skill as visible and eligible. diff --git a/qa/scenarios/runtime/approval-turn-tool-followthrough.md b/qa/scenarios/runtime/approval-turn-tool-followthrough.md index af2d87a3b47..bc086ca0674 100644 --- a/qa/scenarios/runtime/approval-turn-tool-followthrough.md +++ b/qa/scenarios/runtime/approval-turn-tool-followthrough.md @@ -4,6 +4,11 @@ id: approval-turn-tool-followthrough title: Approval turn tool followthrough surface: harness +coverage: + primary: + - runtime.approvals + secondary: + - tools.followthrough objective: Verify a short approval like "ok do it" triggers immediate tool use instead of fake-progress narration. successCriteria: - Agent can keep the pre-action turn brief. diff --git a/qa/scenarios/runtime/compaction-retry-mutating-tool.md b/qa/scenarios/runtime/compaction-retry-mutating-tool.md index 54c33702acb..c67ad6a53d9 100644 --- a/qa/scenarios/runtime/compaction-retry-mutating-tool.md +++ b/qa/scenarios/runtime/compaction-retry-mutating-tool.md @@ -4,6 +4,11 @@ id: compaction-retry-mutating-tool title: Compaction retry after mutating tool surface: runtime +coverage: + primary: + - runtime.compaction + secondary: + - runtime.retry-policy objective: Verify a real mutating tool step keeps replay-unsafety explicit instead of disappearing into a clean-looking success if the run compacts or retries. 
successCriteria: - Agent reads the seeded large context before it writes. diff --git a/qa/scenarios/runtime/empty-response-recovery-replay-safe-read.md b/qa/scenarios/runtime/empty-response-recovery-replay-safe-read.md index 0f25b56b5bb..d84107c0e42 100644 --- a/qa/scenarios/runtime/empty-response-recovery-replay-safe-read.md +++ b/qa/scenarios/runtime/empty-response-recovery-replay-safe-read.md @@ -4,6 +4,11 @@ id: empty-response-recovery-replay-safe-read title: Empty-response recovery after replay-safe read surface: runtime +coverage: + primary: + - runtime.empty-response-recovery + secondary: + - runtime.retry-policy objective: Verify an empty visible GPT turn after a replay-safe read auto-continues into a visible answer. successCriteria: - Scenario is mock-openai only so live lanes do not pick it up implicitly. diff --git a/qa/scenarios/runtime/empty-response-retry-budget-exhausted.md b/qa/scenarios/runtime/empty-response-retry-budget-exhausted.md index 1e69b1ef603..51fa187ca83 100644 --- a/qa/scenarios/runtime/empty-response-retry-budget-exhausted.md +++ b/qa/scenarios/runtime/empty-response-retry-budget-exhausted.md @@ -4,6 +4,11 @@ id: empty-response-retry-budget-exhausted title: Empty-response retry budget exhausted surface: runtime +coverage: + primary: + - runtime.empty-response-recovery + secondary: + - runtime.retry-policy objective: Verify repeated empty GPT turns exhaust the retry budget after one continuation attempt. successCriteria: - Scenario is mock-openai only so live lanes do not pick it up implicitly. 
diff --git a/qa/scenarios/runtime/reasoning-only-no-auto-retry-after-write.md b/qa/scenarios/runtime/reasoning-only-no-auto-retry-after-write.md index 21a15d54457..d98edf5491f 100644 --- a/qa/scenarios/runtime/reasoning-only-no-auto-retry-after-write.md +++ b/qa/scenarios/runtime/reasoning-only-no-auto-retry-after-write.md @@ -4,6 +4,11 @@ id: reasoning-only-no-auto-retry-after-write title: Reasoning-only no-auto-retry after write surface: runtime +coverage: + primary: + - runtime.reasoning-only-recovery + secondary: + - runtime.retry-policy objective: Verify a GPT-style reasoning-only turn after a mutating write stays replay-unsafe and does not auto-retry. successCriteria: - Scenario is mock-openai only so live lanes do not pick it up implicitly. diff --git a/qa/scenarios/runtime/reasoning-only-recovery-replay-safe-read.md b/qa/scenarios/runtime/reasoning-only-recovery-replay-safe-read.md index 95489b00c0f..1696cc6cadb 100644 --- a/qa/scenarios/runtime/reasoning-only-recovery-replay-safe-read.md +++ b/qa/scenarios/runtime/reasoning-only-recovery-replay-safe-read.md @@ -4,6 +4,11 @@ id: reasoning-only-recovery-replay-safe-read title: Reasoning-only recovery after replay-safe read surface: runtime +coverage: + primary: + - runtime.reasoning-only-recovery + secondary: + - runtime.retry-policy objective: Verify a GPT-style reasoning-only turn after a replay-safe read auto-continues into a visible answer. successCriteria: - Scenario is mock-openai only so live lanes do not pick it up implicitly. 
diff --git a/qa/scenarios/runtime/runtime-inventory-drift-check.md b/qa/scenarios/runtime/runtime-inventory-drift-check.md index 4305aa58482..9d3f978a175 100644 --- a/qa/scenarios/runtime/runtime-inventory-drift-check.md +++ b/qa/scenarios/runtime/runtime-inventory-drift-check.md @@ -4,6 +4,9 @@ id: runtime-inventory-drift-check title: Runtime inventory drift check surface: inventory +coverage: + primary: + - runtime.inventory objective: Verify tools.effective and skills.status stay aligned with runtime behavior after config changes. successCriteria: - Enabled tool appears before the config change. diff --git a/qa/scenarios/scheduling/cron-one-minute-ping.md b/qa/scenarios/scheduling/cron-one-minute-ping.md index 36039659460..2e7b5a464cf 100644 --- a/qa/scenarios/scheduling/cron-one-minute-ping.md +++ b/qa/scenarios/scheduling/cron-one-minute-ping.md @@ -4,6 +4,11 @@ id: cron-one-minute-ping title: Cron one-minute ping surface: cron +coverage: + primary: + - scheduling.cron + secondary: + - channels.qa-channel objective: Verify the agent can schedule a cron reminder one minute in the future and receive the follow-up in the QA channel. successCriteria: - Agent schedules a cron reminder roughly one minute ahead. diff --git a/qa/scenarios/ui/control-ui-qa-channel-image-roundtrip.md b/qa/scenarios/ui/control-ui-qa-channel-image-roundtrip.md index a8cc5f2bdbc..31ac791d7cc 100644 --- a/qa/scenarios/ui/control-ui-qa-channel-image-roundtrip.md +++ b/qa/scenarios/ui/control-ui-qa-channel-image-roundtrip.md @@ -4,6 +4,12 @@ id: control-ui-qa-channel-image-roundtrip title: Control UI plus qa-channel image roundtrip surface: control-ui +coverage: + primary: + - ui.control + secondary: + - media.image-understanding + - channels.qa-channel objective: Verify the embedded Control UI can observe a qa-channel-backed session while the fake channel injects text and image turns that the agent answers correctly. 
successCriteria: - Control UI opens directly on the target qa-channel session. diff --git a/qa/scenarios/workspace/lobster-invaders-build.md b/qa/scenarios/workspace/lobster-invaders-build.md index d10ac59c2ac..92292f8e013 100644 --- a/qa/scenarios/workspace/lobster-invaders-build.md +++ b/qa/scenarios/workspace/lobster-invaders-build.md @@ -4,6 +4,11 @@ id: lobster-invaders-build title: Build Lobster Invaders surface: workspace +coverage: + primary: + - workspace.artifacts + secondary: + - workspace.builds objective: Verify the agent can read the repo, create a tiny playable artifact, and report what changed. successCriteria: - Agent inspects source before coding. diff --git a/qa/scenarios/workspace/medium-game-plan-codex-harness.md b/qa/scenarios/workspace/medium-game-plan-codex-harness.md index e566f349f45..2e9d0bcb642 100644 --- a/qa/scenarios/workspace/medium-game-plan-codex-harness.md +++ b/qa/scenarios/workspace/medium-game-plan-codex-harness.md @@ -4,6 +4,11 @@ id: medium-game-plan-codex-harness title: Medium game plan Codex harness surface: workspace +coverage: + primary: + - workspace.planning + secondary: + - models.codex-cli objective: Verify the Codex app-server harness can plan and build a medium-complex self-contained browser game. successCriteria: - A live-frontier run fails fast unless the selected primary model is codex/gpt-5.4. diff --git a/qa/scenarios/workspace/medium-game-plan-pi-harness.md b/qa/scenarios/workspace/medium-game-plan-pi-harness.md index e4ce8ea56c1..9c22709285d 100644 --- a/qa/scenarios/workspace/medium-game-plan-pi-harness.md +++ b/qa/scenarios/workspace/medium-game-plan-pi-harness.md @@ -4,6 +4,11 @@ id: medium-game-plan-pi-harness title: Medium game plan PI harness surface: workspace +coverage: + primary: + - workspace.planning + secondary: + - agents.pi-harness objective: Verify GPT-5.4 can use the PI harness to plan and build a medium-complex self-contained browser game. 
successCriteria: - A live-frontier run fails fast unless the selected primary model is openai/gpt-5.4. diff --git a/qa/scenarios/workspace/source-docs-discovery-report.md b/qa/scenarios/workspace/source-docs-discovery-report.md index 8a4f999478a..e0f52673e99 100644 --- a/qa/scenarios/workspace/source-docs-discovery-report.md +++ b/qa/scenarios/workspace/source-docs-discovery-report.md @@ -4,6 +4,11 @@ id: source-docs-discovery-report title: Source and docs discovery report surface: discovery +coverage: + primary: + - workspace.repo-discovery + secondary: + - docs.discovery objective: Verify the agent can read repo docs and source, expand the QA plan, and publish a worked or did-not-work report. successCriteria: - Agent reads docs and source before proposing more tests.