mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:20:43 +00:00
QA: track scenario coverage intent
This commit is contained in:
@@ -72,6 +72,7 @@ import {
|
||||
runQaDockerScaffoldCommand,
|
||||
runQaDockerUpCommand,
|
||||
runQaCharacterEvalCommand,
|
||||
runQaCoverageReportCommand,
|
||||
runQaManualLaneCommand,
|
||||
runQaParityReportCommand,
|
||||
runQaSuiteCommand,
|
||||
@@ -336,6 +337,13 @@ describe("qa cli runtime", () => {
|
||||
}
|
||||
});
|
||||
|
||||
// Runs the coverage command with no --output, so the report is expected on stdout.
// NOTE(review): `stdoutWrite` is presumably a spy on process.stdout.write set up earlier in this file — confirm.
it("prints a markdown coverage report from scenario metadata", async () => {
  await runQaCoverageReportCommand({ repoRoot: process.cwd() });

  // The report heading and one known coverage ID must each appear in a stdout write.
  for (const fragment of ["# QA Coverage Inventory", "memory.recall"]) {
    expect(stdoutWrite).toHaveBeenCalledWith(expect.stringContaining(fragment));
  }
});
|
||||
|
||||
it("resolves character eval paths and passes model refs through", async () => {
|
||||
await runQaCharacterEvalCommand({
|
||||
repoRoot: "/tmp/openclaw-repo",
|
||||
|
||||
@@ -9,6 +9,7 @@ import {
|
||||
import { resolveQaParityPackScenarioIds } from "./agentic-parity.js";
|
||||
import { runQaCharacterEval, type QaCharacterModelOptions } from "./character-eval.js";
|
||||
import { resolveRepoRelativeOutputDir } from "./cli-paths.js";
|
||||
import { buildQaCoverageInventory, renderQaCoverageMarkdownReport } from "./coverage-report.js";
|
||||
import { buildQaDockerHarnessImage, writeQaDockerHarnessFiles } from "./docker-harness.js";
|
||||
import { runQaDockerUp } from "./docker-up.runtime.js";
|
||||
import type { QaCliBackendAuthMode } from "./gateway-child.js";
|
||||
@@ -36,6 +37,7 @@ import {
|
||||
type QaProviderMode,
|
||||
type QaProviderModeInput,
|
||||
} from "./run-config.js";
|
||||
import { readQaScenarioPack } from "./scenario-catalog.js";
|
||||
import { runQaSuiteFromRuntime } from "./suite-launch.runtime.js";
|
||||
|
||||
type InterruptibleServer = {
|
||||
@@ -442,6 +444,29 @@ export async function runQaParityReportCommand(opts: {
|
||||
process.exitCode = 1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Builds the QA coverage inventory from the scenario pack and emits it.
 *
 * The report body is pretty-printed JSON when `opts.json` is truthy and the
 * Markdown rendering otherwise. When `opts.output` is given, the body is
 * written to that path (resolved against `opts.repoRoot`, which defaults to
 * the current working directory), parent directories are created as needed,
 * and only the destination path is printed. Without `opts.output` the body
 * itself is written to stdout.
 */
export async function runQaCoverageReportCommand(opts: {
  repoRoot?: string;
  output?: string;
  json?: boolean;
}) {
  const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
  const { scenarios } = readQaScenarioPack();
  const inventory = buildQaCoverageInventory(scenarios);
  const outputPath = opts.output ? path.resolve(repoRoot, opts.output) : undefined;

  let body: string;
  if (opts.json) {
    body = `${JSON.stringify(inventory, null, 2)}\n`;
  } else {
    body = renderQaCoverageMarkdownReport(inventory);
  }

  // No destination file requested: stream the report straight to stdout.
  if (!outputPath) {
    process.stdout.write(body);
    return;
  }

  await fs.mkdir(path.dirname(outputPath), { recursive: true });
  await fs.writeFile(outputPath, body, "utf8");
  process.stdout.write(`QA coverage report: ${outputPath}\n`);
}
|
||||
|
||||
export async function runQaCharacterEvalCommand(opts: {
|
||||
repoRoot?: string;
|
||||
outputDir?: string;
|
||||
|
||||
@@ -44,12 +44,14 @@ const {
|
||||
runQaCredentialsAddCommand,
|
||||
runQaCredentialsListCommand,
|
||||
runQaCredentialsRemoveCommand,
|
||||
runQaCoverageReportCommand,
|
||||
runQaProviderServerCommand,
|
||||
runQaTelegramCommand,
|
||||
} = vi.hoisted(() => ({
|
||||
runQaCredentialsAddCommand: vi.fn(),
|
||||
runQaCredentialsListCommand: vi.fn(),
|
||||
runQaCredentialsRemoveCommand: vi.fn(),
|
||||
runQaCoverageReportCommand: vi.fn(),
|
||||
runQaProviderServerCommand: vi.fn(),
|
||||
runQaTelegramCommand: vi.fn(),
|
||||
}));
|
||||
@@ -72,6 +74,7 @@ vi.mock("./cli.runtime.js", () => ({
|
||||
runQaCredentialsAddCommand,
|
||||
runQaCredentialsListCommand,
|
||||
runQaCredentialsRemoveCommand,
|
||||
runQaCoverageReportCommand,
|
||||
runQaProviderServerCommand,
|
||||
}));
|
||||
|
||||
@@ -85,6 +88,7 @@ describe("qa cli registration", () => {
|
||||
runQaCredentialsAddCommand.mockReset();
|
||||
runQaCredentialsListCommand.mockReset();
|
||||
runQaCredentialsRemoveCommand.mockReset();
|
||||
runQaCoverageReportCommand.mockReset();
|
||||
runQaProviderServerCommand.mockReset();
|
||||
runQaTelegramCommand.mockReset();
|
||||
listQaRunnerCliContributions
|
||||
@@ -101,10 +105,30 @@ describe("qa cli registration", () => {
|
||||
const qa = program.commands.find((command) => command.name() === "qa");
|
||||
expect(qa).toBeDefined();
|
||||
expect(qa?.commands.map((command) => command.name())).toEqual(
|
||||
expect.arrayContaining([TEST_QA_RUNNER.commandName, "telegram", "credentials"]),
|
||||
expect.arrayContaining([TEST_QA_RUNNER.commandName, "telegram", "credentials", "coverage"]),
|
||||
);
|
||||
});
|
||||
|
||||
// Checks that the `qa coverage` flags are parsed and forwarded unchanged to
// the mocked runtime command.
it("routes coverage report flags into the qa runtime command", async () => {
  const repoRoot = "/tmp/openclaw-repo";
  const output = ".artifacts/qa-coverage.md";
  const argv = [
    "node",
    "openclaw",
    "qa",
    "coverage",
    "--repo-root",
    repoRoot,
    "--output",
    output,
    "--json",
  ];

  await program.parseAsync(argv);

  expect(runQaCoverageReportCommand).toHaveBeenCalledWith({
    repoRoot,
    output,
    json: true,
  });
});
|
||||
|
||||
it("delegates discovered qa runner registration through the generic host seam", () => {
|
||||
const [{ registration }] = listQaRunnerCliContributions.mock.results[0]?.value;
|
||||
expect(registration.register).toHaveBeenCalledTimes(1);
|
||||
|
||||
@@ -60,6 +60,12 @@ async function runQaParityReport(opts: {
|
||||
const runtime = await loadQaLabCliRuntime();
|
||||
await runtime.runQaParityReportCommand(opts);
|
||||
}
|
||||
|
||||
/**
 * Loads the QA lab CLI runtime and hands the `qa coverage` options to its
 * `runQaCoverageReportCommand`.
 */
async function runQaCoverageReport(opts: { repoRoot?: string; output?: string; json?: boolean }) {
  const qaLabRuntime = await loadQaLabCliRuntime();
  await qaLabRuntime.runQaCoverageReportCommand(opts);
}
|
||||
|
||||
async function runQaCharacterEval(opts: {
|
||||
repoRoot?: string;
|
||||
outputDir?: string;
|
||||
@@ -302,6 +308,15 @@ export function registerQaLabCli(program: Command) {
|
||||
},
|
||||
);
|
||||
|
||||
qa.command("coverage")
|
||||
.description("Print the markdown scenario coverage inventory")
|
||||
.option("--repo-root <path>", "Repository root to target when writing --output")
|
||||
.option("--output <path>", "Write the coverage inventory to this path")
|
||||
.option("--json", "Print JSON instead of Markdown", false)
|
||||
.action(async (opts: { repoRoot?: string; output?: string; json?: boolean }) => {
|
||||
await runQaCoverageReport(opts);
|
||||
});
|
||||
|
||||
qa.command("character-eval")
|
||||
.description("Run the character QA scenario across live models and write a judged report")
|
||||
.option("--repo-root <path>", "Repository root to target when running from a neutral cwd")
|
||||
|
||||
31
extensions/qa-lab/src/coverage-report.test.ts
Normal file
31
extensions/qa-lab/src/coverage-report.test.ts
Normal file
@@ -0,0 +1,31 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { buildQaCoverageInventory, renderQaCoverageMarkdownReport } from "./coverage-report.js";
|
||||
import { readQaScenarioPack } from "./scenario-catalog.js";
|
||||
|
||||
describe("qa coverage report", () => {
  // Both tests build the inventory from the pack returned by readQaScenarioPack().
  const loadInventory = () => buildQaCoverageInventory(readQaScenarioPack().scenarios);

  it("groups scenario coverage metadata by theme and surface", () => {
    const inventory = loadInventory();
    const isMemoryRecall = (feature: { id: string }) => feature.id === "memory.recall";

    // Every aggregate counter is expected to be populated by the pack.
    const populatedCounts = [
      inventory.scenarioCount,
      inventory.coverageIdCount,
      inventory.primaryCoverageIdCount,
      inventory.secondaryCoverageIdCount,
      inventory.overlappingCoverage.length,
    ];
    for (const count of populatedCounts) {
      expect(count).toBeGreaterThan(0);
    }

    expect(inventory.missingCoverage).toEqual([]);
    expect(inventory.byTheme.memory.some(isMemoryRecall)).toBe(true);
    expect(inventory.bySurface.memory.some(isMemoryRecall)).toBe(true);
  });

  it("renders a compact markdown inventory", () => {
    const report = renderQaCoverageMarkdownReport(loadInventory());

    const expectedFragments = [
      "# QA Coverage Inventory",
      "- Missing coverage metadata: 0",
      "- Overlapping coverage IDs:",
      "memory.recall",
      "primary: memory-recall (qa/scenarios/memory/memory-recall.md)",
      "secondary: active-memory-preprompt-recall",
    ];
    for (const fragment of expectedFragments) {
      expect(report).toContain(fragment);
    }
  });
});
|
||||
192
extensions/qa-lab/src/coverage-report.ts
Normal file
192
extensions/qa-lab/src/coverage-report.ts
Normal file
@@ -0,0 +1,192 @@
|
||||
import type { QaSeedScenarioWithSource } from "./scenario-catalog.js";
|
||||
|
||||
/** The per-scenario fields carried into the coverage report. */
export type QaCoverageScenarioSummary = {
  /** Scenario ID. */
  id: string;
  /** Scenario title. */
  title: string;
  /** Path of the scenario's source file, as recorded on the scenario. */
  sourcePath: string;
  /** Theme derived from `sourcePath`. */
  theme: string;
  /** Surfaces the scenario exercises. */
  surfaces: string[];
  /** Risk label for the scenario. */
  risk: string;
};

/** Whether a scenario lists a coverage ID under `coverage.primary` or `coverage.secondary`. */
export type QaCoverageIntent = "primary" | "secondary";

/** A scenario summary tagged with the intent under which it references a coverage ID. */
export type QaCoverageScenarioReference = QaCoverageScenarioSummary & {
  intent: QaCoverageIntent;
};

/** One coverage ID together with every scenario reference to it. */
export type QaCoverageFeatureSummary = {
  /** The coverage ID. */
  id: string;
  /** Scenarios that reference this coverage ID. */
  scenarios: QaCoverageScenarioReference[];
};

/** Aggregated coverage metadata for a set of scenarios. */
export type QaCoverageInventory = {
  /** Number of scenarios that were inspected. */
  scenarioCount: number;
  /** Number of distinct coverage IDs. */
  coverageIdCount: number;
  /** Number of distinct coverage IDs referenced with primary intent. */
  primaryCoverageIdCount: number;
  /** Number of distinct coverage IDs referenced with secondary intent. */
  secondaryCoverageIdCount: number;
  /** Every coverage ID with its scenario references. */
  features: QaCoverageFeatureSummary[];
  /** Coverage IDs that have more than one scenario reference. */
  overlappingCoverage: QaCoverageFeatureSummary[];
  /** Scenarios that declare no coverage metadata. */
  missingCoverage: QaCoverageScenarioSummary[];
  /** Features grouped by scenario theme. */
  byTheme: Record<string, QaCoverageFeatureSummary[]>;
  /** Features grouped by scenario surface. */
  bySurface: Record<string, QaCoverageFeatureSummary[]>;
};
|
||||
|
||||
/**
 * Derives a scenario's theme from its source path.
 *
 * The theme is the third path segment, e.g.
 * `qa/scenarios/memory/memory-recall.md` yields `memory`.
 *
 * @param sourcePath Path of the scenario file; both `/` and `\` are accepted
 *   as separators.
 * @returns The theme directory name, or `"unknown"` when the third segment is
 *   missing, is empty, or is the file name itself (a file that sits directly
 *   under the second-level directory has no theme directory).
 */
function scenarioTheme(sourcePath: string) {
  const segments = sourcePath.split(/[\\/]/);
  // The last segment is the file name, so a theme exists only when at least
  // one more segment follows index 2.
  const theme = segments.length > 3 ? segments[2] : undefined;
  return theme ? theme : "unknown";
}
|
||||
|
||||
/**
 * Returns the scenario's `surfaces` list when it is present and non-empty;
 * otherwise falls back to a one-element list holding its single `surface`.
 */
function scenarioSurfaces(scenario: QaSeedScenarioWithSource) {
  const declared = scenario.surfaces;
  if (declared && declared.length > 0) {
    return declared;
  }
  return [scenario.surface];
}
|
||||
|
||||
/**
 * Picks the scenario's risk label: `risk` when set, else `riskLevel`, else
 * the literal `"unassigned"`.
 */
function scenarioRisk(scenario: QaSeedScenarioWithSource) {
  const { risk, riskLevel } = scenario;
  if (risk != null) {
    return risk;
  }
  if (riskLevel != null) {
    return riskLevel;
  }
  return "unassigned";
}
|
||||
|
||||
/**
 * Reduces a scenario to the fields the coverage report needs: its identity
 * and source path plus the derived theme, surfaces, and risk label.
 */
function summarizeScenario(scenario: QaSeedScenarioWithSource): QaCoverageScenarioSummary {
  const { id, title, sourcePath } = scenario;
  const theme = scenarioTheme(sourcePath);
  const surfaces = scenarioSurfaces(scenario);
  const risk = scenarioRisk(scenario);
  return { id, title, sourcePath, theme, surfaces, risk };
}
|
||||
|
||||
/**
 * Returns a new array of the given features ordered by coverage ID using
 * `localeCompare`; the input array is left untouched.
 */
function sortFeatures(features: readonly QaCoverageFeatureSummary[]) {
  const ordered = [...features];
  ordered.sort((a, b) => a.id.localeCompare(b.id));
  return ordered;
}
|
||||
|
||||
/**
 * Aggregates scenario coverage metadata into an inventory.
 *
 * Each ID listed under a scenario's `coverage.primary` / `coverage.secondary`
 * becomes (or joins) a feature entry that records the referencing scenario and
 * its intent. Scenarios without a `coverage` block are collected in
 * `missingCoverage` and contribute no coverage IDs.
 *
 * @param scenarios Scenarios to inspect.
 * @returns The inventory: counters, all features sorted by ID, the features
 *   referenced by more than one scenario entry, the scenarios lacking coverage
 *   metadata, and the features regrouped by theme and by surface. Within a
 *   group, each feature lists only the scenarios that belong to that group.
 */
export function buildQaCoverageInventory(
  scenarios: readonly QaSeedScenarioWithSource[],
): QaCoverageInventory {
  const byCoverageId = new Map<string, QaCoverageFeatureSummary>();
  const primaryCoverageIds = new Set<string>();
  const secondaryCoverageIds = new Set<string>();
  const missingCoverage: QaCoverageScenarioSummary[] = [];

  // Records `summary` under every ID in `coverageIds` with the given intent.
  const addCoverage = (
    summary: QaCoverageScenarioSummary,
    coverageIds: readonly string[] | undefined,
    intent: QaCoverageIntent,
  ) => {
    const idsForIntent = intent === "primary" ? primaryCoverageIds : secondaryCoverageIds;
    for (const coverageId of coverageIds ?? []) {
      let feature = byCoverageId.get(coverageId);
      if (!feature) {
        feature = { id: coverageId, scenarios: [] };
        byCoverageId.set(coverageId, feature);
      }
      feature.scenarios.push({ ...summary, intent });
      idsForIntent.add(coverageId);
    }
  };

  for (const scenario of scenarios) {
    // Summarize once per scenario; the same summary feeds both intents.
    const summary = summarizeScenario(scenario);
    if (!scenario.coverage) {
      missingCoverage.push(summary);
      continue;
    }
    addCoverage(summary, scenario.coverage.primary, "primary");
    addCoverage(summary, scenario.coverage.secondary, "secondary");
  }

  const features = sortFeatures([...byCoverageId.values()]);
  const overlappingCoverage = features.filter((feature) => feature.scenarios.length > 1);

  // Groups features by the keys `keysOf` yields for their scenarios. Buckets
  // are accumulated in a Map and converted with Object.fromEntries so that a
  // key such as "constructor", "toString", or "__proto__" becomes an ordinary
  // own property. Indexing a plain `{}` with such a key would hit an inherited
  // member instead of creating a bucket.
  const groupFeatures = (
    keysOf: (scenario: QaCoverageScenarioReference) => readonly string[],
  ): Record<string, QaCoverageFeatureSummary[]> => {
    const groups = new Map<string, QaCoverageFeatureSummary[]>();
    for (const feature of features) {
      const keys = new Set(feature.scenarios.flatMap((scenario) => keysOf(scenario)));
      for (const key of keys) {
        const scoped: QaCoverageFeatureSummary = {
          ...feature,
          scenarios: feature.scenarios.filter((scenario) => keysOf(scenario).includes(key)),
        };
        const bucket = groups.get(key);
        if (bucket) {
          bucket.push(scoped);
        } else {
          groups.set(key, [scoped]);
        }
      }
    }
    return Object.fromEntries(groups);
  };

  return {
    scenarioCount: scenarios.length,
    coverageIdCount: features.length,
    primaryCoverageIdCount: primaryCoverageIds.size,
    secondaryCoverageIdCount: secondaryCoverageIds.size,
    features,
    overlappingCoverage,
    missingCoverage,
    byTheme: groupFeatures((scenario) => [scenario.theme]),
    bySurface: groupFeatures((scenario) => scenario.surfaces),
  };
}
|
||||
|
||||
/**
 * Appends one Markdown bullet per feature to `lines`, in coverage-ID order.
 * Each bullet has the form `- <id>: <intent>: <scenario id> (<source path>), …`.
 */
function pushFeatureLines(lines: string[], features: readonly QaCoverageFeatureSummary[]) {
  const describeReference = ({ intent, id, sourcePath }: QaCoverageScenarioReference) =>
    `${intent}: ${id} (${sourcePath})`;

  for (const { id, scenarios } of sortFeatures(features)) {
    const references = scenarios.map(describeReference);
    lines.push(`- ${id}: ${references.join(", ")}`);
  }
}
|
||||
|
||||
/**
 * Renders a coverage inventory as a Markdown document.
 *
 * Layout: a title, a bullet list of the inventory counters, a "By Theme" and a
 * "By Surface" section (one `###` subsection per group, names sorted), then
 * an "Overlap" section and a "Missing Metadata" section, each emitted only
 * when it has entries.
 *
 * @returns The report text, ending in exactly one newline.
 */
export function renderQaCoverageMarkdownReport(inventory: QaCoverageInventory): string {
  const { byTheme, bySurface, overlappingCoverage, missingCoverage } = inventory;
  const lines: string[] = [];

  lines.push("# QA Coverage Inventory", "");
  lines.push(
    `- Scenarios: ${inventory.scenarioCount}`,
    `- Coverage IDs: ${inventory.coverageIdCount}`,
    `- Primary coverage IDs: ${inventory.primaryCoverageIdCount}`,
    `- Secondary coverage IDs: ${inventory.secondaryCoverageIdCount}`,
    `- Overlapping coverage IDs: ${overlappingCoverage.length}`,
    `- Missing coverage metadata: ${missingCoverage.length}`,
  );
  lines.push("");

  // Emits a `##` heading followed by one `###` subsection per group name.
  const pushGroupedSection = (
    heading: string,
    groups: Record<string, QaCoverageFeatureSummary[]>,
  ) => {
    lines.push(heading, "");
    const groupNames = Object.keys(groups);
    groupNames.sort();
    for (const groupName of groupNames) {
      lines.push(`### ${groupName}`, "");
      pushFeatureLines(lines, groups[groupName] ?? []);
      lines.push("");
    }
  };

  pushGroupedSection("## By Theme", byTheme);
  pushGroupedSection("## By Surface", bySurface);

  if (overlappingCoverage.length > 0) {
    lines.push("## Overlap", "");
    pushFeatureLines(lines, overlappingCoverage);
    lines.push("");
  }

  if (missingCoverage.length > 0) {
    lines.push("## Missing Metadata", "");
    const orderedMissing = [...missingCoverage];
    orderedMissing.sort((a, b) => a.id.localeCompare(b.id));
    for (const { id, sourcePath } of orderedMissing) {
      lines.push(`- ${id}: ${sourcePath}`);
    }
    lines.push("");
  }

  // Collapse any trailing blank lines into a single terminating newline.
  return `${lines.join("\n").trimEnd()}\n`;
}
|
||||
@@ -27,6 +27,8 @@ describe("qa scenario catalog", () => {
|
||||
expect(pack.scenarios.some((scenario) => scenario.id === "character-vibes-c3po")).toBe(true);
|
||||
expect(pack.scenarios.every((scenario) => scenario.execution?.kind === "flow")).toBe(true);
|
||||
expect(pack.scenarios.some((scenario) => scenario.execution.flow?.steps.length)).toBe(true);
|
||||
expect(pack.scenarios.every((scenario) => scenario.coverage?.primary.length)).toBe(true);
|
||||
expect(readQaScenarioById("memory-recall").coverage?.primary).toContain("memory.recall");
|
||||
});
|
||||
|
||||
it("exposes bootstrap data from the markdown pack", () => {
|
||||
|
||||
@@ -51,6 +51,44 @@ const qaScenarioExecutionSchema = z.object({
|
||||
config: qaScenarioConfigSchema.optional(),
|
||||
});
|
||||
|
||||
// Lowercase alphanumeric tokens joined by single `.` or `-` separators, e.g. `memory.recall`.
const QA_COVERAGE_ID_PATTERN = /^[a-z0-9]+(?:[.-][a-z0-9]+)*$/;

/** A single coverage ID string; surrounding whitespace is trimmed. */
const qaCoverageIdSchema = z.string().trim().regex(QA_COVERAGE_ID_PATTERN, {
  message: "coverage ids must use lowercase dotted or dashed tokens",
});

/** A list of coverage IDs that must contain at least one entry. */
const qaCoverageIdListSchema = z.array(qaCoverageIdSchema).min(1);
|
||||
|
||||
/**
 * Coverage metadata for a scenario: a required `primary` ID list and an
 * optional `secondary` ID list. An ID may appear only once across both lists;
 * each repeat is reported as an issue at the path of the repeated entry.
 */
const qaScenarioCoverageSchema = z
  .object({
    primary: qaCoverageIdListSchema,
    secondary: qaCoverageIdListSchema.optional(),
  })
  .superRefine(({ primary, secondary }, ctx) => {
    // Shared across both lists so a secondary ID that repeats a primary one is flagged too.
    const seen = new Set<string>();

    const flagRepeats = (intent: "primary" | "secondary", ids: readonly string[] | undefined) => {
      ids?.forEach((id, index) => {
        if (seen.has(id)) {
          ctx.addIssue({
            code: z.ZodIssueCode.custom,
            path: [intent, index],
            message: `duplicate coverage id: ${id}`,
          });
          return;
        }
        seen.add(id);
      });
    };

    flagRepeats("primary", primary);
    flagRepeats("secondary", secondary);
  });
|
||||
|
||||
const qaScenarioGatewayRuntimeSchema = z.object({
|
||||
forwardHostHome: z.boolean().optional(),
|
||||
});
|
||||
@@ -138,6 +176,9 @@ const qaSeedScenarioSchema = z.object({
|
||||
title: z.string().trim().min(1),
|
||||
surface: z.string().trim().min(1),
|
||||
category: z.string().trim().min(1).optional(),
|
||||
coverage: qaScenarioCoverageSchema.optional(),
|
||||
surfaces: z.array(z.string().trim().min(1)).min(1).optional(),
|
||||
risk: z.enum(["low", "medium", "high"]).optional(),
|
||||
capabilities: z.array(z.string().trim().min(1)).optional(),
|
||||
lane: z.record(z.string(), z.union([z.boolean(), z.string()])).optional(),
|
||||
riskLevel: z.string().trim().min(1).optional(),
|
||||
|
||||
@@ -13,5 +13,6 @@ Key workflow:
|
||||
|
||||
- `qa suite` is the executable frontier subset / regression loop.
|
||||
- `qa manual` is the scoped personality and style probe after the executable subset is green.
|
||||
- `qa coverage` prints the scenario coverage inventory from scenario frontmatter.
|
||||
|
||||
Keep this folder in git. Add new scenarios here before wiring them into automation.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: instruction-followthrough-repo-contract
|
||||
title: Instruction followthrough repo contract
|
||||
surface: repo-contract
|
||||
coverage:
|
||||
primary:
|
||||
- agents.instructions
|
||||
secondary:
|
||||
- runtime.first-action
|
||||
objective: Verify the agent reads repo instruction files first, follows the required tool order, and completes the first feasible action instead of stopping at a plan.
|
||||
successCriteria:
|
||||
- Agent reads the seeded instruction files before writing the requested artifact.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: subagent-fanout-synthesis
|
||||
title: Subagent fanout synthesis
|
||||
surface: subagents
|
||||
coverage:
|
||||
primary:
|
||||
- agents.subagents
|
||||
secondary:
|
||||
- agents.synthesis
|
||||
objective: Verify the agent can delegate multiple bounded subagent tasks and fold both results back into one parent reply.
|
||||
successCriteria:
|
||||
- Parent flow launches at least two bounded subagent tasks.
|
||||
|
||||
@@ -4,6 +4,9 @@
|
||||
id: subagent-handoff
|
||||
title: Subagent handoff
|
||||
surface: subagents
|
||||
coverage:
|
||||
primary:
|
||||
- agents.subagents
|
||||
objective: Verify the agent can delegate a bounded task to a subagent and fold the result back into the main thread.
|
||||
successCriteria:
|
||||
- Agent launches a bounded subagent task.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: channel-chat-baseline
|
||||
title: Channel baseline conversation
|
||||
surface: channel
|
||||
coverage:
|
||||
primary:
|
||||
- channels.group-messages
|
||||
secondary:
|
||||
- channels.qa-channel
|
||||
objective: Verify the QA agent can respond correctly in a shared channel and respect mention-driven group semantics.
|
||||
successCriteria:
|
||||
- Agent replies in the shared channel transcript.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: dm-chat-baseline
|
||||
title: DM baseline conversation
|
||||
surface: dm
|
||||
coverage:
|
||||
primary:
|
||||
- channels.dm
|
||||
secondary:
|
||||
- channels.qa-channel
|
||||
objective: Verify the QA agent can chat coherently in a DM, explain the QA setup, and stay in character.
|
||||
successCriteria:
|
||||
- Agent replies in DM without channel routing mistakes.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: reaction-edit-delete
|
||||
title: Reaction, edit, delete lifecycle
|
||||
surface: message-actions
|
||||
coverage:
|
||||
primary:
|
||||
- channels.message-actions
|
||||
secondary:
|
||||
- channels.qa-channel
|
||||
objective: Verify the agent can use channel-owned message actions and that the QA transcript reflects them.
|
||||
successCriteria:
|
||||
- Agent adds at least one reaction.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: thread-follow-up
|
||||
title: Threaded follow-up
|
||||
surface: thread
|
||||
coverage:
|
||||
primary:
|
||||
- channels.threads
|
||||
secondary:
|
||||
- channels.qa-channel
|
||||
objective: Verify the agent can keep follow-up work inside a thread and not leak context into the root channel.
|
||||
successCriteria:
|
||||
- Agent creates or uses a thread for deeper work.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: character-vibes-c3po
|
||||
title: "Nervous release protocol chat"
|
||||
surface: character
|
||||
coverage:
|
||||
primary:
|
||||
- character.persona
|
||||
secondary:
|
||||
- workspace.artifacts
|
||||
objective: Capture a natural multi-turn C-3PO-flavored character conversation with real workspace help so another model can later grade naturalness, vibe, and funniness from the raw transcript.
|
||||
successCriteria:
|
||||
- Agent gets a natural multi-turn conversation, and any missed replies stay visible in the transcript instead of aborting capture.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: character-vibes-gollum
|
||||
title: "Late-night deploy helper chat"
|
||||
surface: character
|
||||
coverage:
|
||||
primary:
|
||||
- character.persona
|
||||
secondary:
|
||||
- workspace.artifacts
|
||||
objective: Capture a natural multi-turn character conversation with real workspace help so another model can later grade naturalness, vibe, and funniness from the raw transcript.
|
||||
successCriteria:
|
||||
- Agent gets a natural multi-turn conversation, and any missed replies stay visible in the transcript instead of aborting capture.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: config-apply-restart-wakeup
|
||||
title: Config apply restart wake-up
|
||||
surface: config
|
||||
coverage:
|
||||
primary:
|
||||
- config.restart-apply
|
||||
secondary:
|
||||
- runtime.gateway-restart
|
||||
objective: Verify a restart-required config.apply restarts cleanly and delivers the post-restart wake message back into the QA channel.
|
||||
successCriteria:
|
||||
- config.apply schedules a restart-required change.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: config-patch-hot-apply
|
||||
title: Config patch skill disable
|
||||
surface: config
|
||||
coverage:
|
||||
primary:
|
||||
- config.hot-apply
|
||||
secondary:
|
||||
- plugins.skills
|
||||
objective: Verify config.patch can disable a workspace skill and the restarted gateway exposes the new disabled state cleanly.
|
||||
successCriteria:
|
||||
- config.patch succeeds for the skill toggle change.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: config-restart-capability-flip
|
||||
title: Config restart capability flip
|
||||
surface: config
|
||||
coverage:
|
||||
primary:
|
||||
- config.restart-apply
|
||||
secondary:
|
||||
- plugins.capabilities
|
||||
objective: Verify a restart-triggering config change flips capability inventory and the same session successfully uses the newly restored tool after wake-up.
|
||||
successCriteria:
|
||||
- Capability is absent before the restart-triggering patch.
|
||||
|
||||
@@ -5,13 +5,24 @@ Single source of truth for repo-backed QA suite bootstrap data.
|
||||
|
||||
- `index.md` defines pack-level bootstrap data
|
||||
- each nested `*.md` scenario defines one runnable test via `qa-scenario` + `qa-flow`
|
||||
- scenario markdown may also define category metadata, required plugins, lane filters,
|
||||
and gateway config patching
|
||||
- scenario markdown may also define coverage IDs, category metadata, required plugins,
|
||||
lane filters, and gateway config patching
|
||||
|
||||
- kickoff mission
|
||||
- QA operator identity
|
||||
- scenario files under one-level theme directories
|
||||
|
||||
Coverage tracking:
|
||||
|
||||
- add `coverage.primary` IDs to each scenario's `qa-scenario` block
|
||||
- add `coverage.secondary` only when a scenario intentionally protects another behavior
|
||||
- keep IDs behavior-shaped, broad enough to reuse, lowercase, and dotted or dashed
|
||||
- prefer reusing an existing feature ID over minting a scenario-shaped ID
|
||||
- avoid copying the scenario title into coverage IDs
|
||||
- use `pnpm openclaw qa coverage` to render the current inventory
|
||||
- treat the old flat-list shape (`coverage: ["id"]`, or `coverage:` followed by `- id` items) as invalid; IDs must be nested under `coverage.primary` / `coverage.secondary`
|
||||
- keep source-path tracking in the report, not in the scenario schema
|
||||
|
||||
Theme directories:
|
||||
|
||||
- `agents/` - agent behavior, instructions, and subagent flows
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: image-generation-roundtrip
|
||||
title: Image generation roundtrip
|
||||
surface: image-generation
|
||||
coverage:
|
||||
primary:
|
||||
- media.image-generation
|
||||
secondary:
|
||||
- channels.qa-channel
|
||||
objective: Verify a generated image is saved as media, reattached on the next turn, and described correctly through the vision path.
|
||||
successCriteria:
|
||||
- image_generate produces a saved MEDIA artifact.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: image-understanding-attachment
|
||||
title: Image understanding from attachment
|
||||
surface: image-understanding
|
||||
coverage:
|
||||
primary:
|
||||
- media.image-understanding
|
||||
secondary:
|
||||
- channels.qa-channel
|
||||
objective: Verify an attached image reaches the agent model and the agent can describe what it sees.
|
||||
successCriteria:
|
||||
- Agent receives at least one image attachment.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: native-image-generation
|
||||
title: Native image generation
|
||||
surface: image-generation
|
||||
coverage:
|
||||
primary:
|
||||
- media.image-generation
|
||||
secondary:
|
||||
- tools.native-image-generation
|
||||
objective: Verify image_generate appears when configured and returns a real saved media artifact.
|
||||
successCriteria:
|
||||
- image_generate appears in the effective tool inventory.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: active-memory-preprompt-recall
|
||||
title: Active Memory pre-reply recall
|
||||
surface: memory
|
||||
coverage:
|
||||
primary:
|
||||
- memory.active-recall
|
||||
secondary:
|
||||
- memory.recall
|
||||
objective: Verify Active Memory surfaces a memory-only preference before the main reply, and that the same question stays unresolved when the plugin is off.
|
||||
plugins:
|
||||
- active-memory
|
||||
|
||||
@@ -4,6 +4,9 @@
|
||||
id: memory-dreaming-sweep
|
||||
title: Memory dreaming sweep
|
||||
surface: memory
|
||||
coverage:
|
||||
primary:
|
||||
- memory.dreaming
|
||||
objective: Verify enabling dreaming creates the managed sweep, stages light and REM artifacts, and consolidates repeated recall signals into durable memory.
|
||||
successCriteria:
|
||||
- Dreaming can be enabled and doctor.memory.status reports the managed sweep cron.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: memory-failure-fallback
|
||||
title: Memory failure fallback
|
||||
surface: memory
|
||||
coverage:
|
||||
primary:
|
||||
- memory.failure-handling
|
||||
secondary:
|
||||
- runtime.fallbacks
|
||||
objective: Verify the agent degrades gracefully when memory tools are unavailable and the answer exists only in memory-backed notes.
|
||||
successCriteria:
|
||||
- Memory tools are absent from the effective tool inventory.
|
||||
|
||||
@@ -35,6 +35,9 @@
|
||||
id: memory-recall
|
||||
title: Memory recall after context switch
|
||||
surface: memory
|
||||
coverage:
|
||||
primary:
|
||||
- memory.recall
|
||||
objective: Verify the agent can store a fact, switch topics, then recall the fact accurately later.
|
||||
successCriteria:
|
||||
- Agent acknowledges the seeded fact.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: memory-tools-channel-context
|
||||
title: Memory tools in channel context
|
||||
surface: memory
|
||||
coverage:
|
||||
primary:
|
||||
- memory.tools
|
||||
secondary:
|
||||
- channels.group-messages
|
||||
objective: Verify the agent uses memory_search and memory_get in a shared channel when the answer lives only in memory files, not the live transcript.
|
||||
successCriteria:
|
||||
- Agent uses memory_search before answering.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: session-memory-ranking
|
||||
title: Session memory ranking
|
||||
surface: memory
|
||||
coverage:
|
||||
primary:
|
||||
- memory.ranking
|
||||
secondary:
|
||||
- memory.recall
|
||||
objective: Verify session-transcript memory can outrank stale durable notes and drive the final answer toward the newer fact.
|
||||
successCriteria:
|
||||
- Session memory indexing is enabled for the scenario.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: thread-memory-isolation
|
||||
title: Thread memory isolation
|
||||
surface: memory
|
||||
coverage:
|
||||
primary:
|
||||
- memory.thread-isolation
|
||||
secondary:
|
||||
- channels.threads
|
||||
objective: Verify a memory-backed answer requested inside a thread stays in-thread and does not leak into the root channel.
|
||||
successCriteria:
|
||||
- Agent uses memory tools inside the thread.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: anthropic-opus-api-key-smoke
|
||||
title: Anthropic Opus API key smoke
|
||||
surface: model-provider
|
||||
coverage:
|
||||
primary:
|
||||
- models.provider-auth
|
||||
secondary:
|
||||
- models.anthropic
|
||||
objective: Verify the regular Anthropic Opus lane can complete a quick chat turn using API-key auth.
|
||||
successCriteria:
|
||||
- A live-frontier run fails fast unless the selected primary provider is anthropic.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: anthropic-opus-setup-token-smoke
|
||||
title: Anthropic Opus setup-token smoke
|
||||
surface: model-provider
|
||||
coverage:
|
||||
primary:
|
||||
- models.provider-auth
|
||||
secondary:
|
||||
- models.anthropic
|
||||
objective: Verify the regular Anthropic Opus lane can complete a quick chat turn using setup-token auth.
|
||||
successCriteria:
|
||||
- A live-frontier run fails fast unless the selected primary provider is anthropic.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: claude-cli-provider-capabilities-subscription
|
||||
title: Claude CLI provider capabilities subscription
|
||||
surface: model-provider
|
||||
coverage:
|
||||
primary:
|
||||
- models.provider-capabilities
|
||||
secondary:
|
||||
- models.claude-cli
|
||||
objective: Verify the Claude CLI model-provider lane can use native Claude subscription auth to talk, read an attached image, use bundled MCP tools, and apply workspace skills.
|
||||
successCriteria:
|
||||
- A live-frontier run fails fast unless the selected primary provider is claude-cli.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: claude-cli-provider-capabilities
|
||||
title: Claude CLI provider capabilities API key
|
||||
surface: model-provider
|
||||
coverage:
|
||||
primary:
|
||||
- models.provider-capabilities
|
||||
secondary:
|
||||
- models.claude-cli
|
||||
objective: Verify the Claude CLI model-provider lane can use the Anthropic API key path to talk, read an attached image, use bundled MCP tools, and apply workspace skills.
|
||||
successCriteria:
|
||||
- A live-frontier run fails fast unless the selected primary provider is claude-cli.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: codex-harness-no-meta-leak
|
||||
title: Codex harness no meta leak
|
||||
surface: dm
|
||||
coverage:
|
||||
primary:
|
||||
- models.codex-cli
|
||||
secondary:
|
||||
- runtime.no-meta-leak
|
||||
objective: Verify the Codex app-server harness keeps coordination/meta chatter out of the visible reply.
|
||||
successCriteria:
|
||||
- The scenario forces the Codex embedded harness and disables PI fallback.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: model-switch-follow-up
|
||||
title: Model switch follow-up
|
||||
surface: models
|
||||
coverage:
|
||||
primary:
|
||||
- models.switching
|
||||
secondary:
|
||||
- runtime.session-continuity
|
||||
objective: Verify the agent can switch to a different configured model and continue coherently.
|
||||
successCriteria:
|
||||
- Agent reflects the model switch request.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: model-switch-tool-continuity
|
||||
title: Model switch with tool continuity
|
||||
surface: models
|
||||
coverage:
|
||||
primary:
|
||||
- models.switching
|
||||
secondary:
|
||||
- runtime.tool-continuity
|
||||
objective: Verify switching models preserves session context and tool use instead of dropping into plain-text only behavior.
|
||||
successCriteria:
|
||||
- Alternate model is actually requested.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: bundled-plugin-skill-runtime
|
||||
title: Bundled plugin skill runtime
|
||||
surface: skills
|
||||
coverage:
|
||||
primary:
|
||||
- plugins.skills
|
||||
secondary:
|
||||
- plugins.runtime
|
||||
objective: Verify packaged bundled plugin skills load from dist-runtime instead of being skipped by path-containment checks.
|
||||
successCriteria:
|
||||
- The runtime-packaged bundled plugin tree is used as OPENCLAW_BUNDLED_PLUGINS_DIR.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: mcp-plugin-tools-call
|
||||
title: MCP plugin-tools call
|
||||
surface: mcp
|
||||
coverage:
|
||||
primary:
|
||||
- plugins.mcp-tools
|
||||
secondary:
|
||||
- tools.invocation
|
||||
objective: Verify OpenClaw can expose plugin tools over MCP and a real MCP client can call one successfully.
|
||||
successCriteria:
|
||||
- Plugin tools MCP server lists memory_search.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: skill-install-hot-availability
|
||||
title: Skill install hot availability
|
||||
surface: skills
|
||||
coverage:
|
||||
primary:
|
||||
- plugins.skills
|
||||
secondary:
|
||||
- plugins.hot-install
|
||||
objective: Verify a newly added workspace skill shows up without a broken intermediate state and can influence the next turn immediately.
|
||||
successCriteria:
|
||||
- Skill is absent before install.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: skill-visibility-invocation
|
||||
title: Skill visibility and invocation
|
||||
surface: skills
|
||||
coverage:
|
||||
primary:
|
||||
- plugins.skills
|
||||
secondary:
|
||||
- tools.invocation
|
||||
objective: Verify a workspace skill becomes visible in skills.status and influences the next agent turn.
|
||||
successCriteria:
|
||||
- skills.status reports the seeded skill as visible and eligible.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: approval-turn-tool-followthrough
|
||||
title: Approval turn tool followthrough
|
||||
surface: harness
|
||||
coverage:
|
||||
primary:
|
||||
- runtime.approvals
|
||||
secondary:
|
||||
- tools.followthrough
|
||||
objective: Verify a short approval like "ok do it" triggers immediate tool use instead of fake-progress narration.
|
||||
successCriteria:
|
||||
- Agent can keep the pre-action turn brief.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: compaction-retry-mutating-tool
|
||||
title: Compaction retry after mutating tool
|
||||
surface: runtime
|
||||
coverage:
|
||||
primary:
|
||||
- runtime.compaction
|
||||
secondary:
|
||||
- runtime.retry-policy
|
||||
objective: Verify a real mutating tool step keeps replay-unsafety explicit instead of disappearing into a clean-looking success if the run compacts or retries.
|
||||
successCriteria:
|
||||
- Agent reads the seeded large context before it writes.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: empty-response-recovery-replay-safe-read
|
||||
title: Empty-response recovery after replay-safe read
|
||||
surface: runtime
|
||||
coverage:
|
||||
primary:
|
||||
- runtime.empty-response-recovery
|
||||
secondary:
|
||||
- runtime.retry-policy
|
||||
objective: Verify an empty visible GPT turn after a replay-safe read auto-continues into a visible answer.
|
||||
successCriteria:
|
||||
- Scenario is mock-openai only so live lanes do not pick it up implicitly.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: empty-response-retry-budget-exhausted
|
||||
title: Empty-response retry budget exhausted
|
||||
surface: runtime
|
||||
coverage:
|
||||
primary:
|
||||
- runtime.empty-response-recovery
|
||||
secondary:
|
||||
- runtime.retry-policy
|
||||
objective: Verify repeated empty GPT turns exhaust the retry budget after one continuation attempt.
|
||||
successCriteria:
|
||||
- Scenario is mock-openai only so live lanes do not pick it up implicitly.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: reasoning-only-no-auto-retry-after-write
|
||||
title: Reasoning-only no-auto-retry after write
|
||||
surface: runtime
|
||||
coverage:
|
||||
primary:
|
||||
- runtime.reasoning-only-recovery
|
||||
secondary:
|
||||
- runtime.retry-policy
|
||||
objective: Verify a GPT-style reasoning-only turn after a mutating write stays replay-unsafe and does not auto-retry.
|
||||
successCriteria:
|
||||
- Scenario is mock-openai only so live lanes do not pick it up implicitly.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: reasoning-only-recovery-replay-safe-read
|
||||
title: Reasoning-only recovery after replay-safe read
|
||||
surface: runtime
|
||||
coverage:
|
||||
primary:
|
||||
- runtime.reasoning-only-recovery
|
||||
secondary:
|
||||
- runtime.retry-policy
|
||||
objective: Verify a GPT-style reasoning-only turn after a replay-safe read auto-continues into a visible answer.
|
||||
successCriteria:
|
||||
- Scenario is mock-openai only so live lanes do not pick it up implicitly.
|
||||
|
||||
@@ -4,6 +4,9 @@
|
||||
id: runtime-inventory-drift-check
|
||||
title: Runtime inventory drift check
|
||||
surface: inventory
|
||||
coverage:
|
||||
primary:
|
||||
- runtime.inventory
|
||||
objective: Verify tools.effective and skills.status stay aligned with runtime behavior after config changes.
|
||||
successCriteria:
|
||||
- Enabled tool appears before the config change.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: cron-one-minute-ping
|
||||
title: Cron one-minute ping
|
||||
surface: cron
|
||||
coverage:
|
||||
primary:
|
||||
- scheduling.cron
|
||||
secondary:
|
||||
- channels.qa-channel
|
||||
objective: Verify the agent can schedule a cron reminder one minute in the future and receive the follow-up in the QA channel.
|
||||
successCriteria:
|
||||
- Agent schedules a cron reminder roughly one minute ahead.
|
||||
|
||||
@@ -4,6 +4,12 @@
|
||||
id: control-ui-qa-channel-image-roundtrip
|
||||
title: Control UI plus qa-channel image roundtrip
|
||||
surface: control-ui
|
||||
coverage:
|
||||
primary:
|
||||
- ui.control
|
||||
secondary:
|
||||
- media.image-understanding
|
||||
- channels.qa-channel
|
||||
objective: Verify the embedded Control UI can observe a qa-channel-backed session while the fake channel injects text and image turns that the agent answers correctly.
|
||||
successCriteria:
|
||||
- Control UI opens directly on the target qa-channel session.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: lobster-invaders-build
|
||||
title: Build Lobster Invaders
|
||||
surface: workspace
|
||||
coverage:
|
||||
primary:
|
||||
- workspace.artifacts
|
||||
secondary:
|
||||
- workspace.builds
|
||||
objective: Verify the agent can read the repo, create a tiny playable artifact, and report what changed.
|
||||
successCriteria:
|
||||
- Agent inspects source before coding.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: medium-game-plan-codex-harness
|
||||
title: Medium game plan Codex harness
|
||||
surface: workspace
|
||||
coverage:
|
||||
primary:
|
||||
- workspace.planning
|
||||
secondary:
|
||||
- models.codex-cli
|
||||
objective: Verify the Codex app-server harness can plan and build a medium-complex self-contained browser game.
|
||||
successCriteria:
|
||||
- A live-frontier run fails fast unless the selected primary model is codex/gpt-5.4.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: medium-game-plan-pi-harness
|
||||
title: Medium game plan PI harness
|
||||
surface: workspace
|
||||
coverage:
|
||||
primary:
|
||||
- workspace.planning
|
||||
secondary:
|
||||
- agents.pi-harness
|
||||
objective: Verify GPT-5.4 can use the PI harness to plan and build a medium-complex self-contained browser game.
|
||||
successCriteria:
|
||||
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.4.
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
id: source-docs-discovery-report
|
||||
title: Source and docs discovery report
|
||||
surface: discovery
|
||||
coverage:
|
||||
primary:
|
||||
- workspace.repo-discovery
|
||||
secondary:
|
||||
- docs.discovery
|
||||
objective: Verify the agent can read repo docs and source, expand the QA plan, and publish a worked or did-not-work report.
|
||||
successCriteria:
|
||||
- Agent reads docs and source before proposing more tests.
|
||||
|
||||
Reference in New Issue
Block a user