QA: track scenario coverage intent

This commit is contained in:
Gustavo Madeira Santana
2026-04-17 14:01:20 -04:00
parent f334ca2b50
commit 3a1e469732
56 changed files with 576 additions and 3 deletions

View File

@@ -72,6 +72,7 @@ import {
runQaDockerScaffoldCommand,
runQaDockerUpCommand,
runQaCharacterEvalCommand,
runQaCoverageReportCommand,
runQaManualLaneCommand,
runQaParityReportCommand,
runQaSuiteCommand,
@@ -336,6 +337,13 @@ describe("qa cli runtime", () => {
}
});
// With no output path, the report body is expected to reach the `stdoutWrite` spy.
it("prints a markdown coverage report from scenario metadata", async () => {
  await runQaCoverageReportCommand({ repoRoot: process.cwd() });

  const expectedFragments = ["# QA Coverage Inventory", "memory.recall"];
  for (const fragment of expectedFragments) {
    expect(stdoutWrite).toHaveBeenCalledWith(expect.stringContaining(fragment));
  }
});
it("resolves character eval paths and passes model refs through", async () => {
await runQaCharacterEvalCommand({
repoRoot: "/tmp/openclaw-repo",

View File

@@ -9,6 +9,7 @@ import {
import { resolveQaParityPackScenarioIds } from "./agentic-parity.js";
import { runQaCharacterEval, type QaCharacterModelOptions } from "./character-eval.js";
import { resolveRepoRelativeOutputDir } from "./cli-paths.js";
import { buildQaCoverageInventory, renderQaCoverageMarkdownReport } from "./coverage-report.js";
import { buildQaDockerHarnessImage, writeQaDockerHarnessFiles } from "./docker-harness.js";
import { runQaDockerUp } from "./docker-up.runtime.js";
import type { QaCliBackendAuthMode } from "./gateway-child.js";
@@ -36,6 +37,7 @@ import {
type QaProviderMode,
type QaProviderModeInput,
} from "./run-config.js";
import { readQaScenarioPack } from "./scenario-catalog.js";
import { runQaSuiteFromRuntime } from "./suite-launch.runtime.js";
type InterruptibleServer = {
@@ -442,6 +444,29 @@ export async function runQaParityReportCommand(opts: {
process.exitCode = 1;
}
}
/**
 * Builds the QA coverage inventory from the scenario pack and emits it.
 *
 * The report is pretty-printed JSON when `opts.json` is set and Markdown otherwise.
 * When `opts.output` is given, the report is written to that path (resolved against
 * the repo root, creating parent directories as needed) and only the destination
 * path is printed. Without it, the report body itself is written to stdout.
 *
 * @param opts.repoRoot Base directory for resolving `opts.output`; defaults to the cwd.
 * @param opts.output Optional destination file for the report.
 * @param opts.json Emit JSON instead of Markdown.
 */
export async function runQaCoverageReportCommand(opts: {
  repoRoot?: string;
  output?: string;
  json?: boolean;
}) {
  const root = path.resolve(opts.repoRoot ?? process.cwd());
  const inventory = buildQaCoverageInventory(readQaScenarioPack().scenarios);

  let report: string;
  if (opts.json) {
    report = `${JSON.stringify(inventory, null, 2)}\n`;
  } else {
    report = renderQaCoverageMarkdownReport(inventory);
  }

  // No destination requested: stream the report itself.
  if (!opts.output) {
    process.stdout.write(report);
    return;
  }

  const destination = path.resolve(root, opts.output);
  await fs.mkdir(path.dirname(destination), { recursive: true });
  await fs.writeFile(destination, report, "utf8");
  process.stdout.write(`QA coverage report: ${destination}\n`);
}
export async function runQaCharacterEvalCommand(opts: {
repoRoot?: string;
outputDir?: string;

View File

@@ -44,12 +44,14 @@ const {
runQaCredentialsAddCommand,
runQaCredentialsListCommand,
runQaCredentialsRemoveCommand,
runQaCoverageReportCommand,
runQaProviderServerCommand,
runQaTelegramCommand,
} = vi.hoisted(() => ({
runQaCredentialsAddCommand: vi.fn(),
runQaCredentialsListCommand: vi.fn(),
runQaCredentialsRemoveCommand: vi.fn(),
runQaCoverageReportCommand: vi.fn(),
runQaProviderServerCommand: vi.fn(),
runQaTelegramCommand: vi.fn(),
}));
@@ -72,6 +74,7 @@ vi.mock("./cli.runtime.js", () => ({
runQaCredentialsAddCommand,
runQaCredentialsListCommand,
runQaCredentialsRemoveCommand,
runQaCoverageReportCommand,
runQaProviderServerCommand,
}));
@@ -85,6 +88,7 @@ describe("qa cli registration", () => {
runQaCredentialsAddCommand.mockReset();
runQaCredentialsListCommand.mockReset();
runQaCredentialsRemoveCommand.mockReset();
runQaCoverageReportCommand.mockReset();
runQaProviderServerCommand.mockReset();
runQaTelegramCommand.mockReset();
listQaRunnerCliContributions
@@ -101,10 +105,30 @@ describe("qa cli registration", () => {
const qa = program.commands.find((command) => command.name() === "qa");
expect(qa).toBeDefined();
expect(qa?.commands.map((command) => command.name())).toEqual(
expect.arrayContaining([TEST_QA_RUNNER.commandName, "telegram", "credentials"]),
expect.arrayContaining([TEST_QA_RUNNER.commandName, "telegram", "credentials", "coverage"]),
);
});
// The CLI flags should arrive at the runtime command as a plain options object.
it("routes coverage report flags into the qa runtime command", async () => {
  const repoRoot = "/tmp/openclaw-repo";
  const output = ".artifacts/qa-coverage.md";
  const argv = [
    "node",
    "openclaw",
    "qa",
    "coverage",
    "--repo-root",
    repoRoot,
    "--output",
    output,
    "--json",
  ];

  await program.parseAsync(argv);

  expect(runQaCoverageReportCommand).toHaveBeenCalledWith({ repoRoot, output, json: true });
});
it("delegates discovered qa runner registration through the generic host seam", () => {
const [{ registration }] = listQaRunnerCliContributions.mock.results[0]?.value;
expect(registration.register).toHaveBeenCalledTimes(1);

View File

@@ -60,6 +60,12 @@ async function runQaParityReport(opts: {
const runtime = await loadQaLabCliRuntime();
await runtime.runQaParityReportCommand(opts);
}
/**
 * Loads the QA lab CLI runtime on demand and forwards the coverage-report
 * options to its `runQaCoverageReportCommand` unchanged.
 */
async function runQaCoverageReport(opts: { repoRoot?: string; output?: string; json?: boolean }) {
  const qaLabRuntime = await loadQaLabCliRuntime();
  return void (await qaLabRuntime.runQaCoverageReportCommand(opts));
}
async function runQaCharacterEval(opts: {
repoRoot?: string;
outputDir?: string;
@@ -302,6 +308,15 @@ export function registerQaLabCli(program: Command) {
},
);
// `qa coverage`: registers the coverage inventory subcommand. The parsed flags are
// handed to runQaCoverageReport as-is; `--json` defaults to false.
qa.command("coverage")
.description("Print the markdown scenario coverage inventory")
.option("--repo-root <path>", "Repository root to target when writing --output")
.option("--output <path>", "Write the coverage inventory to this path")
.option("--json", "Print JSON instead of Markdown", false)
.action(async (opts: { repoRoot?: string; output?: string; json?: boolean }) => {
await runQaCoverageReport(opts);
});
qa.command("character-eval")
.description("Run the character QA scenario across live models and write a judged report")
.option("--repo-root <path>", "Repository root to target when running from a neutral cwd")

View File

@@ -0,0 +1,31 @@
import { describe, expect, it } from "vitest";
import { buildQaCoverageInventory, renderQaCoverageMarkdownReport } from "./coverage-report.js";
import { readQaScenarioPack } from "./scenario-catalog.js";
describe("qa coverage report", () => {
  // Both cases derive the inventory from the scenario pack returned by readQaScenarioPack().
  const loadInventory = () => buildQaCoverageInventory(readQaScenarioPack().scenarios);
  const listsMemoryRecall = (features: readonly { id: string }[]) =>
    features.some((feature) => feature.id === "memory.recall");

  it("groups scenario coverage metadata by theme and surface", () => {
    const inventory = loadInventory();

    expect(inventory.scenarioCount).toBeGreaterThan(0);
    expect(inventory.coverageIdCount).toBeGreaterThan(0);
    expect(inventory.primaryCoverageIdCount).toBeGreaterThan(0);
    expect(inventory.secondaryCoverageIdCount).toBeGreaterThan(0);
    expect(inventory.overlappingCoverage.length).toBeGreaterThan(0);
    expect(inventory.missingCoverage).toEqual([]);
    expect(listsMemoryRecall(inventory.byTheme.memory)).toBe(true);
    expect(listsMemoryRecall(inventory.bySurface.memory)).toBe(true);
  });

  it("renders a compact markdown inventory", () => {
    const report = renderQaCoverageMarkdownReport(loadInventory());

    const expectedFragments = [
      "# QA Coverage Inventory",
      "- Missing coverage metadata: 0",
      "- Overlapping coverage IDs:",
      "memory.recall",
      "primary: memory-recall (qa/scenarios/memory/memory-recall.md)",
      "secondary: active-memory-preprompt-recall",
    ];
    for (const fragment of expectedFragments) {
      expect(report).toContain(fragment);
    }
  });
});

View File

@@ -0,0 +1,192 @@
import type { QaSeedScenarioWithSource } from "./scenario-catalog.js";
/** Flattened description of one scenario as it appears in the coverage inventory. */
export type QaCoverageScenarioSummary = {
id: string;
title: string;
// Source path of the scenario, copied from the scenario record.
sourcePath: string;
// Theme derived from `sourcePath` by `scenarioTheme`.
theme: string;
// The scenario's `surfaces` list, or its single `surface` when that list is absent or empty.
surfaces: string[];
// `risk`, else `riskLevel`, else the literal "unassigned".
risk: string;
};
/** Whether a scenario lists a coverage ID under `coverage.primary` or `coverage.secondary`. */
export type QaCoverageIntent = "primary" | "secondary";
/** A scenario summary tagged with the intent under which it references a coverage ID. */
export type QaCoverageScenarioReference = QaCoverageScenarioSummary & {
intent: QaCoverageIntent;
};
/** One coverage ID together with every scenario reference that names it. */
export type QaCoverageFeatureSummary = {
// The coverage ID itself.
id: string;
scenarios: QaCoverageScenarioReference[];
};
/** Aggregated coverage report data produced by `buildQaCoverageInventory`. */
export type QaCoverageInventory = {
// Number of scenarios passed in, including those without coverage metadata.
scenarioCount: number;
// Number of distinct coverage IDs (length of `features`).
coverageIdCount: number;
// Distinct coverage IDs listed as primary by at least one scenario.
primaryCoverageIdCount: number;
// Distinct coverage IDs listed as secondary by at least one scenario.
secondaryCoverageIdCount: number;
// All coverage IDs, sorted by ID.
features: QaCoverageFeatureSummary[];
// Subset of `features` referenced by more than one scenario entry.
overlappingCoverage: QaCoverageFeatureSummary[];
// Scenarios that carry no `coverage` block.
missingCoverage: QaCoverageScenarioSummary[];
// Features keyed by theme; each entry keeps only the scenarios of that theme.
byTheme: Record<string, QaCoverageFeatureSummary[]>;
// Features keyed by surface; each entry keeps only the scenarios that include that surface.
bySurface: Record<string, QaCoverageFeatureSummary[]>;
};
/**
 * Derives a scenario's theme from its source path.
 *
 * For paths shaped like `qa/scenarios/<theme>/<file>.md` the theme is the third
 * path segment. The segment only counts as a theme when it is a non-empty
 * directory segment, i.e. at least one more segment follows it. A file that sits
 * directly in the scenarios root therefore reports "unknown" rather than its own
 * file name. Both `/` and `\` are accepted as separators.
 *
 * @param sourcePath Repo-relative path of the scenario file.
 * @returns The theme directory name, or "unknown" when none can be derived.
 */
function scenarioTheme(sourcePath: string) {
  const segments = sourcePath.split(/[\\/]/);
  const candidate = segments[2];
  // segments[2] is a directory only if something comes after it.
  const isDirectorySegment = segments.length > 3;
  return candidate && isDirectorySegment ? candidate : "unknown";
}
/**
 * Returns the surfaces a scenario touches: its explicit `surfaces` list when that
 * is non-empty, otherwise a one-element list holding its single `surface`.
 */
function scenarioSurfaces(scenario: QaSeedScenarioWithSource) {
  const declared = scenario.surfaces;
  if (!declared || declared.length === 0) {
    return [scenario.surface];
  }
  return declared;
}
/**
 * Picks the risk label for a scenario: `risk` first, then `riskLevel`, and the
 * literal "unassigned" when neither is set.
 */
function scenarioRisk(scenario: QaSeedScenarioWithSource) {
  if (scenario.risk != null) {
    return scenario.risk;
  }
  if (scenario.riskLevel != null) {
    return scenario.riskLevel;
  }
  return "unassigned";
}
/**
 * Projects a scenario onto the fields the coverage report needs, deriving the
 * theme, surfaces and risk through the helpers in this module.
 */
function summarizeScenario(scenario: QaSeedScenarioWithSource): QaCoverageScenarioSummary {
  const { id, title, sourcePath } = scenario;
  const theme = scenarioTheme(sourcePath);
  const surfaces = scenarioSurfaces(scenario);
  const risk = scenarioRisk(scenario);
  return { id, title, sourcePath, theme, surfaces, risk };
}
/** Returns a new array of the features ordered by coverage ID; the input is left untouched. */
function sortFeatures(features: readonly QaCoverageFeatureSummary[]) {
  const ordered = [...features];
  ordered.sort((a, b) => a.id.localeCompare(b.id));
  return ordered;
}
/**
 * Aggregates scenario coverage metadata into a report-ready inventory.
 *
 * Each coverage ID listed under a scenario's `coverage.primary` or
 * `coverage.secondary` becomes a feature that records every referencing
 * scenario together with its intent. Scenarios without a `coverage` block are
 * collected in `missingCoverage` and contribute to no feature.
 *
 * Grouping by theme and by surface is done in `Map`s and converted to plain
 * objects at the end. Theme and surface names are free-form strings, and
 * indexing a plain `{}` with a name such as "constructor" or "__proto__" would
 * hit an inherited member instead of creating a bucket.
 *
 * @param scenarios Scenarios to inventory.
 * @returns Counts, the sorted feature list, overlapping features (referenced by
 *   more than one scenario entry), scenarios missing coverage, and the features
 *   grouped by theme and by surface.
 */
export function buildQaCoverageInventory(
  scenarios: readonly QaSeedScenarioWithSource[],
): QaCoverageInventory {
  const byCoverageId = new Map<string, QaCoverageFeatureSummary>();
  const primaryCoverageIds = new Set<string>();
  const secondaryCoverageIds = new Set<string>();
  const missingCoverage: QaCoverageScenarioSummary[] = [];

  const addCoverage = (
    summary: QaCoverageScenarioSummary,
    coverageIds: readonly string[] | undefined,
    intent: QaCoverageIntent,
  ) => {
    const intentIds = intent === "primary" ? primaryCoverageIds : secondaryCoverageIds;
    for (const coverageId of coverageIds ?? []) {
      let feature = byCoverageId.get(coverageId);
      if (!feature) {
        feature = { id: coverageId, scenarios: [] };
        byCoverageId.set(coverageId, feature);
      }
      feature.scenarios.push({ ...summary, intent });
      intentIds.add(coverageId);
    }
  };

  for (const scenario of scenarios) {
    // Summarize once per scenario; each reference gets its own shallow copy.
    const summary = summarizeScenario(scenario);
    if (!scenario.coverage) {
      missingCoverage.push(summary);
      continue;
    }
    addCoverage(summary, scenario.coverage.primary, "primary");
    addCoverage(summary, scenario.coverage.secondary, "secondary");
  }

  const features = sortFeatures([...byCoverageId.values()]);
  const overlappingCoverage = features.filter((feature) => feature.scenarios.length > 1);

  const themeGroups = new Map<string, QaCoverageFeatureSummary[]>();
  const surfaceGroups = new Map<string, QaCoverageFeatureSummary[]>();
  const appendToGroup = (
    groups: Map<string, QaCoverageFeatureSummary[]>,
    key: string,
    entry: QaCoverageFeatureSummary,
  ) => {
    const bucket = groups.get(key);
    if (bucket) {
      bucket.push(entry);
    } else {
      groups.set(key, [entry]);
    }
  };

  for (const feature of features) {
    const themes = new Set(feature.scenarios.map((scenario) => scenario.theme));
    for (const theme of themes) {
      appendToGroup(themeGroups, theme, {
        ...feature,
        scenarios: feature.scenarios.filter((scenario) => scenario.theme === theme),
      });
    }
    const surfaces = new Set(feature.scenarios.flatMap((scenario) => scenario.surfaces));
    for (const surface of surfaces) {
      appendToGroup(surfaceGroups, surface, {
        ...feature,
        scenarios: feature.scenarios.filter((scenario) => scenario.surfaces.includes(surface)),
      });
    }
  }

  return {
    scenarioCount: scenarios.length,
    coverageIdCount: features.length,
    primaryCoverageIdCount: primaryCoverageIds.size,
    secondaryCoverageIdCount: secondaryCoverageIds.size,
    features,
    overlappingCoverage,
    missingCoverage,
    // Object.fromEntries defines own data properties, so every key is safe.
    byTheme: Object.fromEntries(themeGroups),
    bySurface: Object.fromEntries(surfaceGroups),
  };
}
/**
 * Appends one Markdown bullet per feature (sorted by coverage ID) to `lines`.
 * Each bullet lists the referencing scenarios as `intent: id (sourcePath)`,
 * comma-separated.
 */
function pushFeatureLines(lines: string[], features: readonly QaCoverageFeatureSummary[]) {
  for (const { id, scenarios } of sortFeatures(features)) {
    const references: string[] = [];
    for (const reference of scenarios) {
      references.push(`${reference.intent}: ${reference.id} (${reference.sourcePath})`);
    }
    lines.push(`- ${id}: ${references.join(", ")}`);
  }
}
/**
 * Renders a coverage inventory as a Markdown document.
 *
 * Layout: a title and summary counters, then "By Theme" and "By Surface"
 * sections (one sub-heading per key, keys sorted), followed by "Overlap" and
 * "Missing Metadata" sections that only appear when they have entries. The
 * result ends with exactly one trailing newline.
 */
export function renderQaCoverageMarkdownReport(inventory: QaCoverageInventory): string {
  const { overlappingCoverage, missingCoverage } = inventory;
  const lines: string[] = [];

  // Emits a "## heading" section with one "### key" sub-section per group.
  const pushGroupedSection = (
    heading: string,
    groups: Record<string, QaCoverageFeatureSummary[]>,
  ) => {
    lines.push(heading, "");
    const keys = Object.keys(groups);
    keys.sort();
    for (const key of keys) {
      lines.push(`### ${key}`, "");
      pushFeatureLines(lines, groups[key] ?? []);
      lines.push("");
    }
  };

  lines.push("# QA Coverage Inventory");
  lines.push("");
  lines.push(`- Scenarios: ${inventory.scenarioCount}`);
  lines.push(`- Coverage IDs: ${inventory.coverageIdCount}`);
  lines.push(`- Primary coverage IDs: ${inventory.primaryCoverageIdCount}`);
  lines.push(`- Secondary coverage IDs: ${inventory.secondaryCoverageIdCount}`);
  lines.push(`- Overlapping coverage IDs: ${overlappingCoverage.length}`);
  lines.push(`- Missing coverage metadata: ${missingCoverage.length}`);
  lines.push("");

  pushGroupedSection("## By Theme", inventory.byTheme);
  pushGroupedSection("## By Surface", inventory.bySurface);

  if (overlappingCoverage.length > 0) {
    lines.push("## Overlap", "");
    pushFeatureLines(lines, overlappingCoverage);
    lines.push("");
  }

  if (missingCoverage.length > 0) {
    lines.push("## Missing Metadata", "");
    const orderedMissing = [...missingCoverage];
    orderedMissing.sort((a, b) => a.id.localeCompare(b.id));
    for (const { id, sourcePath } of orderedMissing) {
      lines.push(`- ${id}: ${sourcePath}`);
    }
    lines.push("");
  }

  const document = lines.join("\n").trimEnd();
  return `${document}\n`;
}

View File

@@ -27,6 +27,8 @@ describe("qa scenario catalog", () => {
expect(pack.scenarios.some((scenario) => scenario.id === "character-vibes-c3po")).toBe(true);
expect(pack.scenarios.every((scenario) => scenario.execution?.kind === "flow")).toBe(true);
expect(pack.scenarios.some((scenario) => scenario.execution.flow?.steps.length)).toBe(true);
expect(pack.scenarios.every((scenario) => scenario.coverage?.primary.length)).toBe(true);
expect(readQaScenarioById("memory-recall").coverage?.primary).toContain("memory.recall");
});
it("exposes bootstrap data from the markdown pack", () => {

View File

@@ -51,6 +51,44 @@ const qaScenarioExecutionSchema = z.object({
config: qaScenarioConfigSchema.optional(),
});
// A single coverage ID: trimmed, then required to be lowercase alphanumeric tokens
// joined by single dots or dashes (e.g. "memory.recall", "runtime.first-action").
const qaCoverageIdSchema = z
.string()
.trim()
.regex(/^[a-z0-9]+(?:[.-][a-z0-9]+)*$/, {
message: "coverage ids must use lowercase dotted or dashed tokens",
});
// A non-empty list of coverage IDs.
const qaCoverageIdListSchema = z.array(qaCoverageIdSchema).min(1);
// Scenario coverage block: required `primary` IDs plus optional `secondary` IDs.
// An ID may appear only once across both lists; every repeat is reported at the
// path of the repeated entry.
const qaScenarioCoverageSchema = z
  .object({
    primary: qaCoverageIdListSchema,
    secondary: qaCoverageIdListSchema.optional(),
  })
  .superRefine((coverage, ctx) => {
    const alreadyListed = new Set<string>();
    const reportDuplicates = (
      intent: "primary" | "secondary",
      ids: readonly string[] | undefined,
    ) => {
      if (!ids) {
        return;
      }
      ids.forEach((id, index) => {
        if (alreadyListed.has(id)) {
          ctx.addIssue({
            code: z.ZodIssueCode.custom,
            path: [intent, index],
            message: `duplicate coverage id: ${id}`,
          });
          return;
        }
        alreadyListed.add(id);
      });
    };
    // Primary is checked first so a repeat in `secondary` is the one flagged.
    reportDuplicates("primary", coverage.primary);
    reportDuplicates("secondary", coverage.secondary);
  });
const qaScenarioGatewayRuntimeSchema = z.object({
forwardHostHome: z.boolean().optional(),
});
@@ -138,6 +176,9 @@ const qaSeedScenarioSchema = z.object({
title: z.string().trim().min(1),
surface: z.string().trim().min(1),
category: z.string().trim().min(1).optional(),
coverage: qaScenarioCoverageSchema.optional(),
surfaces: z.array(z.string().trim().min(1)).min(1).optional(),
risk: z.enum(["low", "medium", "high"]).optional(),
capabilities: z.array(z.string().trim().min(1)).optional(),
lane: z.record(z.string(), z.union([z.boolean(), z.string()])).optional(),
riskLevel: z.string().trim().min(1).optional(),

View File

@@ -13,5 +13,6 @@ Key workflow:
- `qa suite` is the executable frontier subset / regression loop.
- `qa manual` is the scoped personality and style probe after the executable subset is green.
- `qa coverage` prints the scenario coverage inventory from scenario frontmatter.
Keep this folder in git. Add new scenarios here before wiring them into automation.

View File

@@ -4,6 +4,11 @@
id: instruction-followthrough-repo-contract
title: Instruction followthrough repo contract
surface: repo-contract
coverage:
primary:
- agents.instructions
secondary:
- runtime.first-action
objective: Verify the agent reads repo instruction files first, follows the required tool order, and completes the first feasible action instead of stopping at a plan.
successCriteria:
- Agent reads the seeded instruction files before writing the requested artifact.

View File

@@ -4,6 +4,11 @@
id: subagent-fanout-synthesis
title: Subagent fanout synthesis
surface: subagents
coverage:
primary:
- agents.subagents
secondary:
- agents.synthesis
objective: Verify the agent can delegate multiple bounded subagent tasks and fold both results back into one parent reply.
successCriteria:
- Parent flow launches at least two bounded subagent tasks.

View File

@@ -4,6 +4,9 @@
id: subagent-handoff
title: Subagent handoff
surface: subagents
coverage:
primary:
- agents.subagents
objective: Verify the agent can delegate a bounded task to a subagent and fold the result back into the main thread.
successCriteria:
- Agent launches a bounded subagent task.

View File

@@ -4,6 +4,11 @@
id: channel-chat-baseline
title: Channel baseline conversation
surface: channel
coverage:
primary:
- channels.group-messages
secondary:
- channels.qa-channel
objective: Verify the QA agent can respond correctly in a shared channel and respect mention-driven group semantics.
successCriteria:
- Agent replies in the shared channel transcript.

View File

@@ -4,6 +4,11 @@
id: dm-chat-baseline
title: DM baseline conversation
surface: dm
coverage:
primary:
- channels.dm
secondary:
- channels.qa-channel
objective: Verify the QA agent can chat coherently in a DM, explain the QA setup, and stay in character.
successCriteria:
- Agent replies in DM without channel routing mistakes.

View File

@@ -4,6 +4,11 @@
id: reaction-edit-delete
title: Reaction, edit, delete lifecycle
surface: message-actions
coverage:
primary:
- channels.message-actions
secondary:
- channels.qa-channel
objective: Verify the agent can use channel-owned message actions and that the QA transcript reflects them.
successCriteria:
- Agent adds at least one reaction.

View File

@@ -4,6 +4,11 @@
id: thread-follow-up
title: Threaded follow-up
surface: thread
coverage:
primary:
- channels.threads
secondary:
- channels.qa-channel
objective: Verify the agent can keep follow-up work inside a thread and not leak context into the root channel.
successCriteria:
- Agent creates or uses a thread for deeper work.

View File

@@ -4,6 +4,11 @@
id: character-vibes-c3po
title: "Nervous release protocol chat"
surface: character
coverage:
primary:
- character.persona
secondary:
- workspace.artifacts
objective: Capture a natural multi-turn C-3PO-flavored character conversation with real workspace help so another model can later grade naturalness, vibe, and funniness from the raw transcript.
successCriteria:
- Agent gets a natural multi-turn conversation, and any missed replies stay visible in the transcript instead of aborting capture.

View File

@@ -4,6 +4,11 @@
id: character-vibes-gollum
title: "Late-night deploy helper chat"
surface: character
coverage:
primary:
- character.persona
secondary:
- workspace.artifacts
objective: Capture a natural multi-turn character conversation with real workspace help so another model can later grade naturalness, vibe, and funniness from the raw transcript.
successCriteria:
- Agent gets a natural multi-turn conversation, and any missed replies stay visible in the transcript instead of aborting capture.

View File

@@ -4,6 +4,11 @@
id: config-apply-restart-wakeup
title: Config apply restart wake-up
surface: config
coverage:
primary:
- config.restart-apply
secondary:
- runtime.gateway-restart
objective: Verify a restart-required config.apply restarts cleanly and delivers the post-restart wake message back into the QA channel.
successCriteria:
- config.apply schedules a restart-required change.

View File

@@ -4,6 +4,11 @@
id: config-patch-hot-apply
title: Config patch skill disable
surface: config
coverage:
primary:
- config.hot-apply
secondary:
- plugins.skills
objective: Verify config.patch can disable a workspace skill and the restarted gateway exposes the new disabled state cleanly.
successCriteria:
- config.patch succeeds for the skill toggle change.

View File

@@ -4,6 +4,11 @@
id: config-restart-capability-flip
title: Config restart capability flip
surface: config
coverage:
primary:
- config.restart-apply
secondary:
- plugins.capabilities
objective: Verify a restart-triggering config change flips capability inventory and the same session successfully uses the newly restored tool after wake-up.
successCriteria:
- Capability is absent before the restart-triggering patch.

View File

@@ -5,13 +5,24 @@ Single source of truth for repo-backed QA suite bootstrap data.
- `index.md` defines pack-level bootstrap data
- each nested `*.md` scenario defines one runnable test via `qa-scenario` + `qa-flow`
- scenario markdown may also define category metadata, required plugins, lane filters,
and gateway config patching
- scenario markdown may also define coverage IDs, category metadata, required plugins,
lane filters, and gateway config patching
- kickoff mission
- QA operator identity
- scenario files under one-level theme directories
Coverage tracking:
- add `coverage.primary` IDs to each scenario's `qa-scenario` block
- add `coverage.secondary` only when a scenario intentionally protects another behavior
- keep IDs behavior-shaped, broad enough to reuse, lowercase, and dotted or dashed
- prefer reusing an existing feature ID over minting a scenario-shaped ID
- avoid copying the scenario title into coverage IDs
- use `pnpm openclaw qa coverage` to render the current inventory
- treat the old `coverage: ["id"]` / `coverage: - id` list shape as invalid
- keep source-path tracking in the report, not in the scenario schema
Theme directories:
- `agents/` - agent behavior, instructions, and subagent flows

View File

@@ -4,6 +4,11 @@
id: image-generation-roundtrip
title: Image generation roundtrip
surface: image-generation
coverage:
primary:
- media.image-generation
secondary:
- channels.qa-channel
objective: Verify a generated image is saved as media, reattached on the next turn, and described correctly through the vision path.
successCriteria:
- image_generate produces a saved MEDIA artifact.

View File

@@ -4,6 +4,11 @@
id: image-understanding-attachment
title: Image understanding from attachment
surface: image-understanding
coverage:
primary:
- media.image-understanding
secondary:
- channels.qa-channel
objective: Verify an attached image reaches the agent model and the agent can describe what it sees.
successCriteria:
- Agent receives at least one image attachment.

View File

@@ -4,6 +4,11 @@
id: native-image-generation
title: Native image generation
surface: image-generation
coverage:
primary:
- media.image-generation
secondary:
- tools.native-image-generation
objective: Verify image_generate appears when configured and returns a real saved media artifact.
successCriteria:
- image_generate appears in the effective tool inventory.

View File

@@ -4,6 +4,11 @@
id: active-memory-preprompt-recall
title: Active Memory pre-reply recall
surface: memory
coverage:
primary:
- memory.active-recall
secondary:
- memory.recall
objective: Verify Active Memory surfaces a memory-only preference before the main reply, and that the same question stays unresolved when the plugin is off.
plugins:
- active-memory

View File

@@ -4,6 +4,9 @@
id: memory-dreaming-sweep
title: Memory dreaming sweep
surface: memory
coverage:
primary:
- memory.dreaming
objective: Verify enabling dreaming creates the managed sweep, stages light and REM artifacts, and consolidates repeated recall signals into durable memory.
successCriteria:
- Dreaming can be enabled and doctor.memory.status reports the managed sweep cron.

View File

@@ -4,6 +4,11 @@
id: memory-failure-fallback
title: Memory failure fallback
surface: memory
coverage:
primary:
- memory.failure-handling
secondary:
- runtime.fallbacks
objective: Verify the agent degrades gracefully when memory tools are unavailable and the answer exists only in memory-backed notes.
successCriteria:
- Memory tools are absent from the effective tool inventory.

View File

@@ -35,6 +35,9 @@
id: memory-recall
title: Memory recall after context switch
surface: memory
coverage:
primary:
- memory.recall
objective: Verify the agent can store a fact, switch topics, then recall the fact accurately later.
successCriteria:
- Agent acknowledges the seeded fact.

View File

@@ -4,6 +4,11 @@
id: memory-tools-channel-context
title: Memory tools in channel context
surface: memory
coverage:
primary:
- memory.tools
secondary:
- channels.group-messages
objective: Verify the agent uses memory_search and memory_get in a shared channel when the answer lives only in memory files, not the live transcript.
successCriteria:
- Agent uses memory_search before answering.

View File

@@ -4,6 +4,11 @@
id: session-memory-ranking
title: Session memory ranking
surface: memory
coverage:
primary:
- memory.ranking
secondary:
- memory.recall
objective: Verify session-transcript memory can outrank stale durable notes and drive the final answer toward the newer fact.
successCriteria:
- Session memory indexing is enabled for the scenario.

View File

@@ -4,6 +4,11 @@
id: thread-memory-isolation
title: Thread memory isolation
surface: memory
coverage:
primary:
- memory.thread-isolation
secondary:
- channels.threads
objective: Verify a memory-backed answer requested inside a thread stays in-thread and does not leak into the root channel.
successCriteria:
- Agent uses memory tools inside the thread.

View File

@@ -4,6 +4,11 @@
id: anthropic-opus-api-key-smoke
title: Anthropic Opus API key smoke
surface: model-provider
coverage:
primary:
- models.provider-auth
secondary:
- models.anthropic
objective: Verify the regular Anthropic Opus lane can complete a quick chat turn using API-key auth.
successCriteria:
- A live-frontier run fails fast unless the selected primary provider is anthropic.

View File

@@ -4,6 +4,11 @@
id: anthropic-opus-setup-token-smoke
title: Anthropic Opus setup-token smoke
surface: model-provider
coverage:
primary:
- models.provider-auth
secondary:
- models.anthropic
objective: Verify the regular Anthropic Opus lane can complete a quick chat turn using setup-token auth.
successCriteria:
- A live-frontier run fails fast unless the selected primary provider is anthropic.

View File

@@ -4,6 +4,11 @@
id: claude-cli-provider-capabilities-subscription
title: Claude CLI provider capabilities subscription
surface: model-provider
coverage:
primary:
- models.provider-capabilities
secondary:
- models.claude-cli
objective: Verify the Claude CLI model-provider lane can use native Claude subscription auth to talk, read an attached image, use bundled MCP tools, and apply workspace skills.
successCriteria:
- A live-frontier run fails fast unless the selected primary provider is claude-cli.

View File

@@ -4,6 +4,11 @@
id: claude-cli-provider-capabilities
title: Claude CLI provider capabilities API key
surface: model-provider
coverage:
primary:
- models.provider-capabilities
secondary:
- models.claude-cli
objective: Verify the Claude CLI model-provider lane can use the Anthropic API key path to talk, read an attached image, use bundled MCP tools, and apply workspace skills.
successCriteria:
- A live-frontier run fails fast unless the selected primary provider is claude-cli.

View File

@@ -4,6 +4,11 @@
id: codex-harness-no-meta-leak
title: Codex harness no meta leak
surface: dm
coverage:
primary:
- models.codex-cli
secondary:
- runtime.no-meta-leak
objective: Verify the Codex app-server harness keeps coordination/meta chatter out of the visible reply.
successCriteria:
- The scenario forces the Codex embedded harness and disables PI fallback.

View File

@@ -4,6 +4,11 @@
id: model-switch-follow-up
title: Model switch follow-up
surface: models
coverage:
primary:
- models.switching
secondary:
- runtime.session-continuity
objective: Verify the agent can switch to a different configured model and continue coherently.
successCriteria:
- Agent reflects the model switch request.

View File

@@ -4,6 +4,11 @@
id: model-switch-tool-continuity
title: Model switch with tool continuity
surface: models
coverage:
primary:
- models.switching
secondary:
- runtime.tool-continuity
objective: Verify switching models preserves session context and tool use instead of dropping into plain-text only behavior.
successCriteria:
- Alternate model is actually requested.

View File

@@ -4,6 +4,11 @@
id: bundled-plugin-skill-runtime
title: Bundled plugin skill runtime
surface: skills
coverage:
primary:
- plugins.skills
secondary:
- plugins.runtime
objective: Verify packaged bundled plugin skills load from dist-runtime instead of being skipped by path-containment checks.
successCriteria:
- The runtime-packaged bundled plugin tree is used as OPENCLAW_BUNDLED_PLUGINS_DIR.

View File

@@ -4,6 +4,11 @@
id: mcp-plugin-tools-call
title: MCP plugin-tools call
surface: mcp
coverage:
primary:
- plugins.mcp-tools
secondary:
- tools.invocation
objective: Verify OpenClaw can expose plugin tools over MCP and a real MCP client can call one successfully.
successCriteria:
- Plugin tools MCP server lists memory_search.

View File

@@ -4,6 +4,11 @@
id: skill-install-hot-availability
title: Skill install hot availability
surface: skills
coverage:
primary:
- plugins.skills
secondary:
- plugins.hot-install
objective: Verify a newly added workspace skill shows up without a broken intermediate state and can influence the next turn immediately.
successCriteria:
- Skill is absent before install.

View File

@@ -4,6 +4,11 @@
id: skill-visibility-invocation
title: Skill visibility and invocation
surface: skills
coverage:
primary:
- plugins.skills
secondary:
- tools.invocation
objective: Verify a workspace skill becomes visible in skills.status and influences the next agent turn.
successCriteria:
- skills.status reports the seeded skill as visible and eligible.

View File

@@ -4,6 +4,11 @@
id: approval-turn-tool-followthrough
title: Approval turn tool followthrough
surface: harness
coverage:
primary:
- runtime.approvals
secondary:
- tools.followthrough
objective: Verify a short approval like "ok do it" triggers immediate tool use instead of fake-progress narration.
successCriteria:
- Agent can keep the pre-action turn brief.

View File

@@ -4,6 +4,11 @@
id: compaction-retry-mutating-tool
title: Compaction retry after mutating tool
surface: runtime
coverage:
primary:
- runtime.compaction
secondary:
- runtime.retry-policy
objective: Verify a real mutating tool step keeps replay-unsafety explicit instead of disappearing into a clean-looking success if the run compacts or retries.
successCriteria:
- Agent reads the seeded large context before it writes.

View File

@@ -4,6 +4,11 @@
id: empty-response-recovery-replay-safe-read
title: Empty-response recovery after replay-safe read
surface: runtime
coverage:
primary:
- runtime.empty-response-recovery
secondary:
- runtime.retry-policy
objective: Verify an empty visible GPT turn after a replay-safe read auto-continues into a visible answer.
successCriteria:
- Scenario is mock-openai only so live lanes do not pick it up implicitly.

View File

@@ -4,6 +4,11 @@
id: empty-response-retry-budget-exhausted
title: Empty-response retry budget exhausted
surface: runtime
coverage:
primary:
- runtime.empty-response-recovery
secondary:
- runtime.retry-policy
objective: Verify repeated empty GPT turns exhaust the retry budget after one continuation attempt.
successCriteria:
- Scenario is mock-openai only so live lanes do not pick it up implicitly.

View File

@@ -4,6 +4,11 @@
id: reasoning-only-no-auto-retry-after-write
title: Reasoning-only no-auto-retry after write
surface: runtime
coverage:
primary:
- runtime.reasoning-only-recovery
secondary:
- runtime.retry-policy
objective: Verify a GPT-style reasoning-only turn after a mutating write stays replay-unsafe and does not auto-retry.
successCriteria:
- Scenario is mock-openai only so live lanes do not pick it up implicitly.

View File

@@ -4,6 +4,11 @@
id: reasoning-only-recovery-replay-safe-read
title: Reasoning-only recovery after replay-safe read
surface: runtime
coverage:
primary:
- runtime.reasoning-only-recovery
secondary:
- runtime.retry-policy
objective: Verify a GPT-style reasoning-only turn after a replay-safe read auto-continues into a visible answer.
successCriteria:
- Scenario is mock-openai only so live lanes do not pick it up implicitly.

View File

@@ -4,6 +4,9 @@
id: runtime-inventory-drift-check
title: Runtime inventory drift check
surface: inventory
coverage:
primary:
- runtime.inventory
objective: Verify tools.effective and skills.status stay aligned with runtime behavior after config changes.
successCriteria:
- Enabled tool appears before the config change.

View File

@@ -4,6 +4,11 @@
id: cron-one-minute-ping
title: Cron one-minute ping
surface: cron
coverage:
primary:
- scheduling.cron
secondary:
- channels.qa-channel
objective: Verify the agent can schedule a cron reminder one minute in the future and receive the follow-up in the QA channel.
successCriteria:
- Agent schedules a cron reminder roughly one minute ahead.

View File

@@ -4,6 +4,12 @@
id: control-ui-qa-channel-image-roundtrip
title: Control UI plus qa-channel image roundtrip
surface: control-ui
coverage:
primary:
- ui.control
secondary:
- media.image-understanding
- channels.qa-channel
objective: Verify the embedded Control UI can observe a qa-channel-backed session while the fake channel injects text and image turns that the agent answers correctly.
successCriteria:
- Control UI opens directly on the target qa-channel session.

View File

@@ -4,6 +4,11 @@
id: lobster-invaders-build
title: Build Lobster Invaders
surface: workspace
coverage:
primary:
- workspace.artifacts
secondary:
- workspace.builds
objective: Verify the agent can read the repo, create a tiny playable artifact, and report what changed.
successCriteria:
- Agent inspects source before coding.

View File

@@ -4,6 +4,11 @@
id: medium-game-plan-codex-harness
title: Medium game plan Codex harness
surface: workspace
coverage:
primary:
- workspace.planning
secondary:
- models.codex-cli
objective: Verify the Codex app-server harness can plan and build a medium-complex self-contained browser game.
successCriteria:
- A live-frontier run fails fast unless the selected primary model is codex/gpt-5.4.

View File

@@ -4,6 +4,11 @@
id: medium-game-plan-pi-harness
title: Medium game plan PI harness
surface: workspace
coverage:
primary:
- workspace.planning
secondary:
- agents.pi-harness
objective: Verify GPT-5.4 can use the PI harness to plan and build a medium-complex self-contained browser game.
successCriteria:
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.4.

View File

@@ -4,6 +4,11 @@
id: source-docs-discovery-report
title: Source and docs discovery report
surface: discovery
coverage:
primary:
- workspace.repo-discovery
secondary:
- docs.discovery
objective: Verify the agent can read repo docs and source, expand the QA plan, and publish a worked or did-not-work report.
successCriteria:
- Agent reads docs and source before proposing more tests.