mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-12 01:31:08 +00:00
refactor: split qa scenarios into per-file markdown defs
This commit is contained in:
@@ -56,7 +56,8 @@ asset hash changes.
|
||||
|
||||
Seed assets live in `qa/`:
|
||||
|
||||
- `qa/scenarios.md`
|
||||
- `qa/scenarios/index.md`
|
||||
- `qa/scenarios/*.md`
|
||||
|
||||
These are intentionally in git so the QA plan is visible to both humans and the
|
||||
agent. The baseline list should stay broad enough to cover:
|
||||
|
||||
@@ -17,16 +17,20 @@ The desired end state is a generic QA harness that loads powerful scenario defin
|
||||
|
||||
## Current State
|
||||
|
||||
Primary source of truth now lives in `qa/scenarios.md`.
|
||||
Primary source of truth now lives in `qa/scenarios/index.md` plus one file per
|
||||
scenario under `qa/scenarios/*.md`.
|
||||
|
||||
Implemented:
|
||||
|
||||
- `qa/scenarios.md`
|
||||
- canonical QA pack
|
||||
- `qa/scenarios/index.md`
|
||||
- canonical QA pack metadata
|
||||
- operator identity
|
||||
- kickoff mission
|
||||
- `qa/scenarios/*.md`
|
||||
- one markdown file per scenario
|
||||
- scenario metadata
|
||||
- handler bindings
|
||||
- scenario-specific execution config
|
||||
- `extensions/qa-lab/src/scenario-catalog.ts`
|
||||
- markdown pack parser + zod validation
|
||||
- `extensions/qa-lab/src/qa-agent-bootstrap.ts`
|
||||
@@ -103,7 +107,8 @@ These categories matter because they drive DSL requirements. A flat list of prom
|
||||
|
||||
### Single source of truth
|
||||
|
||||
Use `qa/scenarios.md` as the authored source of truth.
|
||||
Use `qa/scenarios/index.md` plus `qa/scenarios/*.md` as the authored source of
|
||||
truth.
|
||||
|
||||
The pack should stay:
|
||||
|
||||
@@ -357,7 +362,8 @@ Generated compatibility:
|
||||
|
||||
Done.
|
||||
|
||||
- added `qa/scenarios.md`
|
||||
- added `qa/scenarios/index.md`
|
||||
- split scenarios into `qa/scenarios/*.md`
|
||||
- added parser for named markdown YAML pack content
|
||||
- validated with zod
|
||||
- switched consumers to the parsed pack
|
||||
|
||||
@@ -9,7 +9,7 @@ describe("qa discovery evaluation", () => {
|
||||
it("accepts rich discovery reports that explicitly confirm all required files were read", () => {
|
||||
const report = `
|
||||
Worked
|
||||
- Read all three requested files: repo/qa/scenarios.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md.
|
||||
- Read all three requested files: repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md.
|
||||
Failed
|
||||
- None.
|
||||
Blocked
|
||||
@@ -28,7 +28,7 @@ The helper text mentions banned phrases like "not present", "missing files", "bl
|
||||
it("accepts numeric 'all 4 required files read' confirmations", () => {
|
||||
const report = `
|
||||
Worked
|
||||
- Source: repo/qa/scenarios.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md
|
||||
- Source: repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md
|
||||
- all 3 required files read.
|
||||
Failed
|
||||
- None.
|
||||
@@ -49,7 +49,7 @@ The report may quote phrases like "not present" while describing the evaluator,
|
||||
const report = `
|
||||
Worked
|
||||
- All three files retrieved. Now let me compile the protocol report.
|
||||
- All three mandated files read successfully: repo/qa/scenarios.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md.
|
||||
- All three mandated files read successfully: repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md.
|
||||
Failed
|
||||
- None.
|
||||
Blocked
|
||||
@@ -83,7 +83,7 @@ Follow-up
|
||||
it("flags discovery replies that drift into unrelated suite wrap-up claims", () => {
|
||||
const report = `
|
||||
Worked
|
||||
- All three requested files were read: repo/qa/scenarios.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md.
|
||||
- All three requested files were read: repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md.
|
||||
Failed
|
||||
- None.
|
||||
Blocked
|
||||
|
||||
@@ -1,10 +1,20 @@
|
||||
import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtime";
|
||||
import { readQaScenarioExecutionConfig } from "./scenario-catalog.js";
|
||||
|
||||
const REQUIRED_DISCOVERY_REFS = [
|
||||
"repo/qa/scenarios.md",
|
||||
"repo/extensions/qa-lab/src/suite.ts",
|
||||
"repo/docs/help/testing.md",
|
||||
] as const;
|
||||
/**
 * Resolves the file references a discovery report must mention.
 *
 * Prefers `requiredFiles` from the execution config of the
 * `source-docs-discovery-report` scenario. The config is an unvalidated
 * `Record<string, unknown>`, so the value is only used when it really is an
 * array of strings; anything else (missing, null, scalar, mixed array) falls
 * back to the built-in defaults instead of failing later when the list is
 * mapped.
 *
 * @returns The required file references, in config order.
 */
function readRequiredDiscoveryRefs(): string[] {
  const config = readQaScenarioExecutionConfig("source-docs-discovery-report");
  const requiredFiles: unknown = config?.requiredFiles;
  if (
    Array.isArray(requiredFiles) &&
    requiredFiles.every((entry): entry is string => typeof entry === "string")
  ) {
    return requiredFiles;
  }
  return [
    "repo/qa/scenarios/index.md",
    "repo/extensions/qa-lab/src/suite.ts",
    "repo/docs/help/testing.md",
  ];
}
|
||||
|
||||
const REQUIRED_DISCOVERY_REFS = readRequiredDiscoveryRefs();
|
||||
|
||||
const REQUIRED_DISCOVERY_REFS_LOWER = REQUIRED_DISCOVERY_REFS.map(normalizeLowercaseStringOrEmpty);
|
||||
|
||||
|
||||
@@ -124,6 +124,8 @@ export function buildQaRuntimeEnv(params: {
|
||||
|
||||
function isRetryableGatewayCallError(details: string): boolean {
|
||||
return (
|
||||
details.includes("handshake timeout") ||
|
||||
details.includes("gateway closed (1000") ||
|
||||
details.includes("gateway closed (1012)") ||
|
||||
details.includes("gateway closed (1006") ||
|
||||
details.includes("abnormal closure") ||
|
||||
@@ -168,6 +170,16 @@ async function waitForGatewayReady(params: {
|
||||
throw new Error(`gateway failed to become healthy:\n${params.logs()}`);
|
||||
}
|
||||
|
||||
/**
 * Message fragments that mark an RPC client startup failure as transient.
 * The unterminated "(1000" / "(1006" entries are intentional prefix matches.
 */
const RETRYABLE_RPC_STARTUP_ERROR_MARKERS = [
  "handshake timeout",
  "gateway closed (1000",
  "gateway closed (1006",
  "gateway closed (1012)",
] as const;

/**
 * Reports whether a failure to start the gateway RPC client is worth retrying.
 *
 * The decision is a plain substring match on the formatted error message.
 *
 * @param error - Whatever was thrown while starting the RPC client.
 * @returns `true` when the message contains one of the known transient markers.
 */
function isRetryableRpcStartupError(error: unknown): boolean {
  const details = formatErrorMessage(error);
  return RETRYABLE_RPC_STARTUP_ERROR_MARKERS.some((marker) => details.includes(marker));
}
|
||||
|
||||
export function resolveQaControlUiRoot(params: { repoRoot: string; controlUiEnabled?: boolean }) {
|
||||
if (params.controlUiEnabled === false) {
|
||||
return undefined;
|
||||
@@ -277,12 +289,34 @@ export async function startQaGatewayChild(params: {
|
||||
baseUrl,
|
||||
logs,
|
||||
child,
|
||||
timeoutMs: 120_000,
|
||||
});
|
||||
rpcClient = await startQaGatewayRpcClient({
|
||||
wsUrl,
|
||||
token: gatewayToken,
|
||||
logs,
|
||||
});
|
||||
let lastRpcError: unknown = null;
|
||||
for (let attempt = 1; attempt <= 4; attempt += 1) {
|
||||
try {
|
||||
rpcClient = await startQaGatewayRpcClient({
|
||||
wsUrl,
|
||||
token: gatewayToken,
|
||||
logs,
|
||||
});
|
||||
break;
|
||||
} catch (error) {
|
||||
lastRpcError = error;
|
||||
if (attempt >= 4 || !isRetryableRpcStartupError(error)) {
|
||||
throw error;
|
||||
}
|
||||
await sleep(500 * attempt);
|
||||
await waitForGatewayReady({
|
||||
baseUrl,
|
||||
logs,
|
||||
child,
|
||||
timeoutMs: 15_000,
|
||||
});
|
||||
}
|
||||
}
|
||||
if (!rpcClient) {
|
||||
throw lastRpcError ?? new Error("qa gateway rpc client failed to start");
|
||||
}
|
||||
} catch (error) {
|
||||
child.kill("SIGTERM");
|
||||
throw error;
|
||||
|
||||
@@ -15,7 +15,7 @@ describe("qa live timeout policy", () => {
|
||||
).toBe(30_000);
|
||||
});
|
||||
|
||||
it("uses the standard live floor for non-anthropic models", () => {
|
||||
it("uses the higher gpt-5 live floor for openai heavy turns", () => {
|
||||
expect(
|
||||
resolveQaLiveTurnTimeoutMs(
|
||||
{
|
||||
@@ -25,6 +25,19 @@ describe("qa live timeout policy", () => {
|
||||
},
|
||||
30_000,
|
||||
),
|
||||
).toBe(360_000);
|
||||
});
|
||||
|
||||
it("keeps the standard live floor for other non-anthropic models", () => {
|
||||
expect(
|
||||
resolveQaLiveTurnTimeoutMs(
|
||||
{
|
||||
providerMode: "live-frontier",
|
||||
primaryModel: "google/gemini-3-flash",
|
||||
alternateModel: "google/gemini-3-flash",
|
||||
},
|
||||
30_000,
|
||||
),
|
||||
).toBe(120_000);
|
||||
});
|
||||
|
||||
|
||||
@@ -8,6 +8,14 @@ function isAnthropicModel(modelRef: string) {
|
||||
return modelRef.startsWith("anthropic/");
|
||||
}
|
||||
|
||||
/**
 * Reports whether a model ref belongs to the OpenAI provider.
 *
 * The check is a case-sensitive prefix match on `openai/`.
 */
function isOpenAiModel(modelRef: string) {
  return modelRef.startsWith("openai/");
}
|
||||
|
||||
/**
 * Reports whether a model ref names an OpenAI model whose id starts with
 * `gpt-5` (for example `openai/gpt-5` or any `openai/gpt-5…` variant).
 */
function isGptFiveModel(modelRef: string) {
  if (!isOpenAiModel(modelRef)) {
    return false;
  }
  // Drop the provider prefix so only the model id is inspected.
  const modelId = modelRef.slice("openai/".length);
  return modelId.startsWith("gpt-5");
}
|
||||
|
||||
function isClaudeOpusModel(modelRef: string) {
|
||||
return isAnthropicModel(modelRef) && modelRef.includes("claude-opus");
|
||||
}
|
||||
@@ -26,5 +34,8 @@ export function resolveQaLiveTurnTimeoutMs(
|
||||
if (isAnthropicModel(modelRef)) {
|
||||
return Math.max(fallbackMs, 180_000);
|
||||
}
|
||||
if (isGptFiveModel(modelRef)) {
|
||||
return Math.max(fallbackMs, 360_000);
|
||||
}
|
||||
return Math.max(fallbackMs, 120_000);
|
||||
}
|
||||
|
||||
@@ -1,5 +1,11 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { readQaBootstrapScenarioCatalog, readQaScenarioPack } from "./scenario-catalog.js";
|
||||
import {
|
||||
listQaScenarioMarkdownPaths,
|
||||
readQaBootstrapScenarioCatalog,
|
||||
readQaScenarioById,
|
||||
readQaScenarioExecutionConfig,
|
||||
readQaScenarioPack,
|
||||
} from "./scenario-catalog.js";
|
||||
|
||||
describe("qa scenario catalog", () => {
|
||||
it("loads the markdown pack as the canonical source of truth", () => {
|
||||
@@ -8,6 +14,7 @@ describe("qa scenario catalog", () => {
|
||||
expect(pack.version).toBe(1);
|
||||
expect(pack.agent.identityMarkdown).toContain("Dev C-3PO");
|
||||
expect(pack.kickoffTask).toContain("Lobster Invaders");
|
||||
expect(listQaScenarioMarkdownPaths().length).toBe(pack.scenarios.length);
|
||||
expect(pack.scenarios.some((scenario) => scenario.id === "image-generation-roundtrip")).toBe(
|
||||
true,
|
||||
);
|
||||
@@ -23,4 +30,18 @@ describe("qa scenario catalog", () => {
|
||||
true,
|
||||
);
|
||||
});
|
||||
|
||||
it("loads scenario-specific execution config from per-scenario markdown", () => {
|
||||
const discovery = readQaScenarioById("source-docs-discovery-report");
|
||||
const discoveryConfig = readQaScenarioExecutionConfig("source-docs-discovery-report");
|
||||
const fallbackConfig = readQaScenarioExecutionConfig("memory-failure-fallback");
|
||||
|
||||
expect(discovery.title).toBe("Source and docs discovery report");
|
||||
expect((discoveryConfig?.requiredFiles as string[] | undefined)?.[0]).toBe(
|
||||
"repo/qa/scenarios/index.md",
|
||||
);
|
||||
expect(fallbackConfig?.gracefulFallbackAny as string[] | undefined).toContain(
|
||||
"will not reveal",
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -24,6 +24,7 @@ const qaScenarioExecutionSchema = z.object({
|
||||
kind: z.literal("custom").default("custom"),
|
||||
handler: z.string().trim().min(1),
|
||||
summary: z.string().trim().min(1).optional(),
|
||||
config: z.record(z.string(), z.unknown()).optional(),
|
||||
});
|
||||
|
||||
const qaSeedScenarioSchema = z.object({
|
||||
@@ -47,12 +48,13 @@ const qaScenarioPackSchema = z.object({
|
||||
identityMarkdown: DEFAULT_QA_AGENT_IDENTITY_MARKDOWN,
|
||||
}),
|
||||
kickoffTask: z.string().trim().min(1),
|
||||
scenarios: z.array(qaSeedScenarioSchema).min(1),
|
||||
});
|
||||
|
||||
export type QaScenarioExecution = z.infer<typeof qaScenarioExecutionSchema>;
|
||||
export type QaSeedScenario = z.infer<typeof qaSeedScenarioSchema>;
|
||||
export type QaScenarioPack = z.infer<typeof qaScenarioPackSchema>;
|
||||
export type QaScenarioPack = z.infer<typeof qaScenarioPackSchema> & {
|
||||
scenarios: QaSeedScenario[];
|
||||
};
|
||||
|
||||
export type QaBootstrapScenarioCatalog = {
|
||||
agentIdentityMarkdown: string;
|
||||
@@ -60,8 +62,11 @@ export type QaBootstrapScenarioCatalog = {
|
||||
scenarios: QaSeedScenario[];
|
||||
};
|
||||
|
||||
const QA_SCENARIO_PACK_PATH = "qa/scenarios.md";
|
||||
const QA_SCENARIO_PACK_INDEX_PATH = "qa/scenarios/index.md";
|
||||
const QA_SCENARIO_LEGACY_OVERVIEW_PATH = "qa/scenarios.md";
|
||||
const QA_SCENARIO_DIR_PATH = "qa/scenarios";
|
||||
const QA_PACK_FENCE_RE = /```ya?ml qa-pack\r?\n([\s\S]*?)\r?\n```/i;
|
||||
const QA_SCENARIO_FENCE_RE = /```ya?ml qa-scenario\r?\n([\s\S]*?)\r?\n```/i;
|
||||
|
||||
function walkUpDirectories(start: string): string[] {
|
||||
const roots: string[] = [];
|
||||
@@ -76,10 +81,14 @@ function walkUpDirectories(start: string): string[] {
|
||||
}
|
||||
}
|
||||
|
||||
function resolveRepoFile(relativePath: string): string | null {
|
||||
function resolveRepoPath(relativePath: string, kind: "file" | "directory" = "file"): string | null {
|
||||
for (const dir of walkUpDirectories(import.meta.dirname)) {
|
||||
const candidate = path.join(dir, relativePath);
|
||||
if (fs.existsSync(candidate) && fs.statSync(candidate).isFile()) {
|
||||
if (!fs.existsSync(candidate)) {
|
||||
continue;
|
||||
}
|
||||
const stat = fs.statSync(candidate);
|
||||
if ((kind === "file" && stat.isFile()) || (kind === "directory" && stat.isDirectory())) {
|
||||
return candidate;
|
||||
}
|
||||
}
|
||||
@@ -87,34 +96,75 @@ function resolveRepoFile(relativePath: string): string | null {
|
||||
}
|
||||
|
||||
function readTextFile(relativePath: string): string {
|
||||
const resolved = resolveRepoFile(relativePath);
|
||||
const resolved = resolveRepoPath(relativePath, "file");
|
||||
if (!resolved) {
|
||||
return "";
|
||||
}
|
||||
return fs.readFileSync(resolved, "utf8");
|
||||
}
|
||||
|
||||
/**
 * Lists the entry names of a repo-relative directory.
 *
 * @param relativePath - Directory path relative to the repo root.
 * @returns The bare entry names (not full paths) as returned by
 *   `fs.readdirSync`, or an empty array when the directory cannot be resolved.
 */
function readDirEntries(relativePath: string): string[] {
  const directory = resolveRepoPath(relativePath, "directory");
  return directory ? fs.readdirSync(directory) : [];
}
|
||||
|
||||
function extractQaPackYaml(content: string) {
|
||||
const match = content.match(QA_PACK_FENCE_RE);
|
||||
if (!match?.[1]) {
|
||||
throw new Error(
|
||||
`qa scenario pack missing \`\`\`yaml qa-pack fence in ${QA_SCENARIO_PACK_PATH}`,
|
||||
`qa scenario pack missing \`\`\`yaml qa-pack fence in ${QA_SCENARIO_PACK_INDEX_PATH}`,
|
||||
);
|
||||
}
|
||||
return match[1];
|
||||
}
|
||||
|
||||
/**
 * Pulls the YAML body out of the `qa-scenario` fenced block of a scenario
 * markdown file.
 *
 * @param content - Full markdown text of the scenario file.
 * @param relativePath - Path of that file; only used in the error message.
 * @returns The text between the fence markers.
 * @throws Error when no `qa-scenario` fence matches or its body is empty.
 */
function extractQaScenarioYaml(content: string, relativePath: string) {
  const yamlBody = QA_SCENARIO_FENCE_RE.exec(content)?.[1];
  if (!yamlBody) {
    throw new Error(`qa scenario file missing \`\`\`yaml qa-scenario fence in ${relativePath}`);
  }
  return yamlBody;
}
|
||||
|
||||
export function readQaScenarioPackMarkdown(): string {
|
||||
return readTextFile(QA_SCENARIO_PACK_PATH).trim();
|
||||
const chunks = [readTextFile(QA_SCENARIO_PACK_INDEX_PATH).trim()];
|
||||
for (const relativePath of listQaScenarioMarkdownPaths()) {
|
||||
chunks.push(readTextFile(relativePath).trim());
|
||||
}
|
||||
return chunks.filter(Boolean).join("\n\n");
|
||||
}
|
||||
|
||||
export function readQaScenarioPack(): QaScenarioPack {
|
||||
const markdown = readQaScenarioPackMarkdown();
|
||||
if (!markdown) {
|
||||
throw new Error(`qa scenario pack not found: ${QA_SCENARIO_PACK_PATH}`);
|
||||
const packMarkdown = readTextFile(QA_SCENARIO_PACK_INDEX_PATH).trim();
|
||||
if (!packMarkdown) {
|
||||
throw new Error(`qa scenario pack not found: ${QA_SCENARIO_PACK_INDEX_PATH}`);
|
||||
}
|
||||
const parsed = YAML.parse(extractQaPackYaml(markdown)) as unknown;
|
||||
return qaScenarioPackSchema.parse(parsed);
|
||||
const parsedPack = qaScenarioPackSchema.parse(
|
||||
YAML.parse(extractQaPackYaml(packMarkdown)) as unknown,
|
||||
);
|
||||
const scenarios = listQaScenarioMarkdownPaths().map((relativePath) =>
|
||||
qaSeedScenarioSchema.parse(
|
||||
YAML.parse(extractQaScenarioYaml(readTextFile(relativePath), relativePath)) as unknown,
|
||||
),
|
||||
);
|
||||
return {
|
||||
...parsedPack,
|
||||
scenarios,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Lists the per-scenario markdown files under the QA scenario directory.
 *
 * `index.md` is excluded because it holds the pack metadata rather than a
 * scenario. The result is sorted so the scenario order is stable regardless
 * of directory enumeration order.
 *
 * @returns Repo-relative paths such as `qa/scenarios/<name>.md`; empty when
 *   the directory cannot be resolved.
 */
export function listQaScenarioMarkdownPaths(): string[] {
  const scenarioFiles = readDirEntries(QA_SCENARIO_DIR_PATH).filter(
    (entry) => entry.endsWith(".md") && entry !== "index.md",
  );
  return scenarioFiles.map((entry) => `${QA_SCENARIO_DIR_PATH}/${entry}`).toSorted();
}
|
||||
|
||||
/**
 * Reads the legacy single-file scenario overview (`qa/scenarios.md`).
 *
 * @returns The trimmed file contents, or an empty string when the file
 *   cannot be resolved.
 */
export function readQaScenarioOverviewMarkdown(): string {
  const overview = readTextFile(QA_SCENARIO_LEGACY_OVERVIEW_PATH);
  return overview.trim();
}
|
||||
|
||||
export function readQaBootstrapScenarioCatalog(): QaBootstrapScenarioCatalog {
|
||||
@@ -125,3 +175,15 @@ export function readQaBootstrapScenarioCatalog(): QaBootstrapScenarioCatalog {
|
||||
scenarios: pack.scenarios,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Looks up a single scenario from the QA scenario pack by its id.
 *
 * The pack is read through `readQaScenarioPack()` on every call.
 *
 * @param id - Scenario id to find (exact match).
 * @returns The matching scenario definition.
 * @throws Error when no scenario in the pack has that id.
 */
export function readQaScenarioById(id: string): QaSeedScenario {
  const { scenarios } = readQaScenarioPack();
  const scenario = scenarios.find((candidate) => candidate.id === id);
  if (scenario === undefined) {
    throw new Error(`unknown qa scenario: ${id}`);
  }
  return scenario;
}
|
||||
|
||||
/**
 * Returns the free-form `execution.config` record of a scenario.
 *
 * The values are `unknown`; callers must validate them before use.
 *
 * @param id - Scenario id to look up.
 * @returns The config record, or `undefined` when the scenario has no
 *   execution section or no config.
 * @throws Error when the scenario id is unknown (from `readQaScenarioById`).
 */
export function readQaScenarioExecutionConfig(id: string): Record<string, unknown> | undefined {
  const scenario = readQaScenarioById(id);
  return scenario.execution?.config;
}
|
||||
|
||||
@@ -35,7 +35,10 @@ import {
|
||||
import { hasModelSwitchContinuityEvidence } from "./model-switch-eval.js";
|
||||
import { renderQaMarkdownReport, type QaReportCheck, type QaReportScenario } from "./report.js";
|
||||
import { qaChannelPlugin, type QaBusMessage } from "./runtime-api.js";
|
||||
import { readQaBootstrapScenarioCatalog } from "./scenario-catalog.js";
|
||||
import {
|
||||
readQaBootstrapScenarioCatalog,
|
||||
readQaScenarioExecutionConfig,
|
||||
} from "./scenario-catalog.js";
|
||||
|
||||
type QaSuiteStep = {
|
||||
name: string;
|
||||
@@ -60,8 +63,10 @@ type QaSuiteEnvironment = {
|
||||
alternateModel: string;
|
||||
};
|
||||
|
||||
const QA_IMAGE_UNDERSTANDING_PNG_BASE64 =
|
||||
"iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAT0lEQVR42u3RQQkAMAzAwPg33Wnos+wgBo40dboAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANYADwAAAAAAAAAAAAAAAAAAAAAAAAAAAAC+Azy47PDiI4pA2wAAAABJRU5ErkJggg==";
|
||||
const _QA_IMAGE_UNDERSTANDING_PNG_BASE64 =
|
||||
"iVBORw0KGgoAAAANSUhEUgAAAQAAAAEACAYAAABccqhmAAAAAklEQVR4AewaftIAAAK4SURBVO3BAQEAMAwCIG//znsQgXfJBZjUALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsl9wFmNQAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwP4TIF+7ciPkoAAAAASUVORK5CYII=";
|
||||
const QA_IMAGE_UNDERSTANDING_LARGE_PNG_BASE64 =
|
||||
"iVBORw0KGgoAAAANSUhEUgAAAQAAAAEACAYAAABccqhmAAACuklEQVR4Ae3BAQEAMAwCIG//znsQgXfJBZjUALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsl9wFmNQAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwP4TIF+2YE/z8AAAAASUVORK5CYII=";
|
||||
|
||||
type QaSkillStatusEntry = {
|
||||
name?: string;
|
||||
@@ -99,6 +104,14 @@ type QaRawSessionStoreEntry = {
|
||||
updatedAt?: number;
|
||||
};
|
||||
|
||||
const QA_CONTROL_PLANE_WRITE_WINDOW_MS = 60_000;
|
||||
const QA_CONTROL_PLANE_WRITE_MAX_REQUESTS = 2;
|
||||
|
||||
/**
 * Reads a scenario's execution config and views it as `T`.
 *
 * NOTE: the cast to `T` is unchecked. The underlying record holds `unknown`
 * values, so `T` documents the caller's expectation rather than a validated
 * shape. Callers should keep every field optional and supply defaults.
 *
 * @param id - Scenario id to look up.
 * @returns The config viewed as `T`, or an empty object when the scenario
 *   defines no config.
 */
function readScenarioExecutionConfig<T extends Record<string, unknown>>(id: string): T {
  const config = readQaScenarioExecutionConfig(id) as T | undefined;
  return config ?? ({} as T);
}
|
||||
const qaControlPlaneWriteTimestamps: number[] = [];
|
||||
|
||||
function splitModelRef(ref: string) {
|
||||
const slash = ref.indexOf("/");
|
||||
if (slash <= 0 || slash === ref.length - 1) {
|
||||
@@ -187,6 +200,21 @@ function recentOutboundSummary(state: QaBusState, limit = 5) {
|
||||
.join(" | ");
|
||||
}
|
||||
|
||||
/**
 * Reports whether a reply contains a success marker for BOTH fan-out
 * subagents.
 *
 * Despite the `normalize…` name this returns a boolean, not text. The reply is
 * passed through `normalizeLowercaseStringOrEmpty` and then searched for any
 * accepted spelling of each subagent's success marker.
 *
 * @param text - Reply text to inspect.
 * @returns `true` only when a first-subagent marker and a second-subagent
 *   marker are both present.
 */
function normalizeQaFanoutSuccessText(text: string) {
  const lower = normalizeLowercaseStringOrEmpty(text);
  const mentionsAny = (markers: readonly string[]) =>
    markers.some((marker) => lower.includes(marker));
  const sawFirst = mentionsAny([
    "alpha-ok",
    "subagent_one_ok",
    "subagent one ok",
    "subagent-1: ok",
  ]);
  const sawSecond = mentionsAny([
    "beta-ok",
    "subagent_two_ok",
    "subagent two ok",
    "subagent-2: ok",
  ]);
  return sawFirst && sawSecond;
}
|
||||
|
||||
async function runScenario(name: string, steps: QaSuiteStep[]): Promise<QaSuiteScenarioResult> {
|
||||
const stepResults: QaReportCheck[] = [];
|
||||
for (const step of steps) {
|
||||
@@ -309,6 +337,44 @@ function isConfigHashConflict(error: unknown) {
|
||||
return formatErrorMessage(error).includes("config changed since last load");
|
||||
}
|
||||
|
||||
/**
 * Extracts a retry delay from a gateway error message.
 *
 * Two spellings are recognised, checked in this order:
 *  1. `retryAfterMs` followed by `"`, `=`, `:` or space separators and digits
 *     (milliseconds);
 *  2. `retry after <digits>s` (whole seconds, converted to milliseconds).
 *
 * A match whose number is zero or not finite is ignored and the next spelling
 * is tried.
 *
 * @param error - Whatever the gateway call threw.
 * @returns The delay in milliseconds, or `null` when the message carries no
 *   usable retry hint.
 */
function getGatewayRetryAfterMs(error: unknown) {
  const text = formatErrorMessage(error);
  const hints = [
    { match: /retryAfterMs["=: ]+(\d+)/i.exec(text), unitMs: 1 },
    { match: /retry after (\d+)s/i.exec(text), unitMs: 1_000 },
  ];
  for (const { match, unitMs } of hints) {
    if (!match) {
      continue;
    }
    const parsed = Number(match[1]);
    if (Number.isFinite(parsed) && parsed > 0) {
      return parsed * unitMs;
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Client-side throttle for control-plane writes.
 *
 * Keeps a sliding window of recent write timestamps in
 * `qaControlPlaneWriteTimestamps` and resolves once fewer than
 * `QA_CONTROL_PLANE_WRITE_MAX_REQUESTS` writes fall inside the last
 * `QA_CONTROL_PLANE_WRITE_WINDOW_MS` milliseconds. The slot is reserved
 * (timestamp recorded) before returning, so the caller should perform its
 * write immediately afterwards.
 */
async function waitForQaControlPlaneWriteBudget() {
  while (true) {
    const now = Date.now();
    // Evict timestamps that have aged out of the window (oldest first).
    while (
      qaControlPlaneWriteTimestamps.length > 0 &&
      now - qaControlPlaneWriteTimestamps[0] >= QA_CONTROL_PLANE_WRITE_WINDOW_MS
    ) {
      qaControlPlaneWriteTimestamps.shift();
    }
    if (qaControlPlaneWriteTimestamps.length < QA_CONTROL_PLANE_WRITE_MAX_REQUESTS) {
      // Budget available: reserve the slot and let the caller proceed.
      qaControlPlaneWriteTimestamps.push(now);
      return;
    }
    // Sleep until the oldest entry leaves the window, plus 250ms of slack,
    // and never less than 250ms.
    const retryAfterMs =
      qaControlPlaneWriteTimestamps[0] + QA_CONTROL_PLANE_WRITE_WINDOW_MS - now + 250;
    await sleep(Math.max(250, retryAfterMs));
  }
}
|
||||
|
||||
async function readConfigSnapshot(env: QaSuiteEnvironment) {
|
||||
const snapshot = (await env.gateway.call(
|
||||
"config.get",
|
||||
@@ -334,9 +400,10 @@ async function runConfigMutation(params: {
|
||||
}) {
|
||||
const restartDelayMs = params.restartDelayMs ?? 1_000;
|
||||
let lastConflict: unknown = null;
|
||||
for (let attempt = 1; attempt <= 3; attempt += 1) {
|
||||
for (let attempt = 1; attempt <= 8; attempt += 1) {
|
||||
const snapshot = await readConfigSnapshot(params.env);
|
||||
try {
|
||||
await waitForQaControlPlaneWriteBudget();
|
||||
const result = await params.env.gateway.call(
|
||||
params.action,
|
||||
{
|
||||
@@ -358,6 +425,14 @@ async function runConfigMutation(params: {
|
||||
);
|
||||
continue;
|
||||
}
|
||||
const retryAfterMs = getGatewayRetryAfterMs(error);
|
||||
if (retryAfterMs && attempt < 8) {
|
||||
await sleep(retryAfterMs + 500);
|
||||
await waitForGatewayHealthy(params.env, Math.max(15_000, restartDelayMs + 10_000)).catch(
|
||||
() => undefined,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
if (!isGatewayRestartRace(error)) {
|
||||
throw error;
|
||||
}
|
||||
@@ -550,7 +625,12 @@ async function resolveGeneratedImagePath(params: {
|
||||
}
|
||||
}
|
||||
|
||||
const mediaDir = path.join(params.env.gateway.tempRoot, "media", "tool-image-generation");
|
||||
const mediaDir = path.join(
|
||||
params.env.gateway.tempRoot,
|
||||
"state",
|
||||
"media",
|
||||
"tool-image-generation",
|
||||
);
|
||||
const entries = await fs.readdir(mediaDir).catch(() => []);
|
||||
const candidates = await Promise.all(
|
||||
entries.map(async (entry) => {
|
||||
@@ -867,6 +947,8 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
{
|
||||
name: "ignores unmentioned channel chatter",
|
||||
run: async () => {
|
||||
await waitForGatewayHealthy(env, 60_000);
|
||||
await waitForQaChannelReady(env, 60_000);
|
||||
await reset();
|
||||
state.addInboundMessage({
|
||||
conversation: { id: "qa-room", kind: "channel", title: "QA Room" },
|
||||
@@ -880,16 +962,21 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
{
|
||||
name: "replies when mentioned in channel",
|
||||
run: async () => {
|
||||
const config = readScenarioExecutionConfig<{ mentionPrompt?: string }>(
|
||||
"channel-chat-baseline",
|
||||
);
|
||||
await waitForGatewayHealthy(env, 60_000);
|
||||
await waitForQaChannelReady(env, 60_000);
|
||||
state.addInboundMessage({
|
||||
conversation: { id: "qa-room", kind: "channel", title: "QA Room" },
|
||||
senderId: "alice",
|
||||
senderName: "Alice",
|
||||
text: "@openclaw explain the QA lab",
|
||||
text: config.mentionPrompt ?? "@openclaw explain the QA lab",
|
||||
});
|
||||
const message = await waitForOutboundMessage(
|
||||
state,
|
||||
(candidate) => candidate.conversation.id === "qa-room" && !candidate.threadId,
|
||||
env.providerMode === "mock-openai" ? 45_000 : 45_000,
|
||||
liveTurnTimeoutMs(env, 60_000),
|
||||
);
|
||||
return message.text;
|
||||
},
|
||||
@@ -970,12 +1057,13 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
{
|
||||
name: "replies coherently in DM",
|
||||
run: async () => {
|
||||
const config = readScenarioExecutionConfig<{ prompt?: string }>("dm-chat-baseline");
|
||||
await reset();
|
||||
state.addInboundMessage({
|
||||
conversation: { id: "alice", kind: "direct" },
|
||||
senderId: "alice",
|
||||
senderName: "Alice",
|
||||
text: "Hello there, who are you?",
|
||||
text: config.prompt ?? "Hello there, who are you?",
|
||||
});
|
||||
const outbound = await waitForOutboundMessage(
|
||||
state,
|
||||
@@ -993,11 +1081,15 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
{
|
||||
name: "creates the artifact after reading context",
|
||||
run: async () => {
|
||||
const config = readScenarioExecutionConfig<{ prompt?: string }>(
|
||||
"lobster-invaders-build",
|
||||
);
|
||||
await reset();
|
||||
await runAgentPrompt(env, {
|
||||
sessionKey: "agent:qa:lobster-invaders",
|
||||
message:
|
||||
"Read the QA kickoff context first, then build a tiny Lobster Invaders HTML game in this workspace and tell me where it is.",
|
||||
config.prompt ??
|
||||
"Read the QA kickoff context first, then build a tiny Lobster Invaders HTML game at ./lobster-invaders.html in this workspace and tell me where it is.",
|
||||
timeoutMs: liveTurnTimeoutMs(env, 30_000),
|
||||
});
|
||||
await waitForOutboundMessage(
|
||||
@@ -1005,7 +1097,14 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
(candidate) => candidate.conversation.id === "qa-operator",
|
||||
);
|
||||
const artifactPath = path.join(env.gateway.workspaceDir, "lobster-invaders.html");
|
||||
const artifact = await fs.readFile(artifactPath, "utf8");
|
||||
const artifact = await waitForCondition(
|
||||
async () => {
|
||||
const text = await fs.readFile(artifactPath, "utf8").catch(() => null);
|
||||
return text?.includes("Lobster Invaders") ? text : undefined;
|
||||
},
|
||||
liveTurnTimeoutMs(env, 20_000),
|
||||
250,
|
||||
);
|
||||
if (!artifact.includes("Lobster Invaders")) {
|
||||
throw new Error("missing Lobster Invaders artifact");
|
||||
}
|
||||
@@ -1031,10 +1130,16 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
{
|
||||
name: "stores the canary fact",
|
||||
run: async () => {
|
||||
const config = readScenarioExecutionConfig<{
|
||||
rememberPrompt?: string;
|
||||
recallPrompt?: string;
|
||||
}>("memory-recall");
|
||||
await reset();
|
||||
await runAgentPrompt(env, {
|
||||
sessionKey: "agent:qa:memory",
|
||||
message: "Please remember this fact for later: the QA canary code is ALPHA-7.",
|
||||
message:
|
||||
config.rememberPrompt ??
|
||||
"Please remember this fact for later: the QA canary code is ALPHA-7.",
|
||||
});
|
||||
const outbound = await waitForOutboundMessage(
|
||||
state,
|
||||
@@ -1046,9 +1151,15 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
{
|
||||
name: "recalls the same fact later",
|
||||
run: async () => {
|
||||
const config = readScenarioExecutionConfig<{
|
||||
rememberPrompt?: string;
|
||||
recallPrompt?: string;
|
||||
}>("memory-recall");
|
||||
await runAgentPrompt(env, {
|
||||
sessionKey: "agent:qa:memory",
|
||||
message: "What was the QA canary code I asked you to remember earlier?",
|
||||
message:
|
||||
config.recallPrompt ??
|
||||
"What was the QA canary code I asked you to remember earlier?",
|
||||
});
|
||||
const outbound = await waitForCondition(
|
||||
() =>
|
||||
@@ -1075,10 +1186,14 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
{
|
||||
name: "runs on the default configured model",
|
||||
run: async () => {
|
||||
const config = readScenarioExecutionConfig<{
|
||||
initialPrompt?: string;
|
||||
followupPrompt?: string;
|
||||
}>("model-switch-follow-up");
|
||||
await reset();
|
||||
await runAgentPrompt(env, {
|
||||
sessionKey: "agent:qa:model-switch",
|
||||
message: "Say hello from the default configured model.",
|
||||
message: config.initialPrompt ?? "Say hello from the default configured model.",
|
||||
timeoutMs: liveTurnTimeoutMs(env, 30_000),
|
||||
});
|
||||
const outbound = await waitForOutboundMessage(
|
||||
@@ -1097,10 +1212,16 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
{
|
||||
name: "switches to the alternate model and continues",
|
||||
run: async () => {
|
||||
const config = readScenarioExecutionConfig<{
|
||||
initialPrompt?: string;
|
||||
followupPrompt?: string;
|
||||
}>("model-switch-follow-up");
|
||||
const alternate = splitModelRef(env.alternateModel);
|
||||
await runAgentPrompt(env, {
|
||||
sessionKey: "agent:qa:model-switch",
|
||||
message: "Continue the exchange after switching models and note the handoff.",
|
||||
message:
|
||||
config.followupPrompt ??
|
||||
"Continue the exchange after switching models and note the handoff.",
|
||||
provider: alternate?.provider,
|
||||
model: alternate?.model,
|
||||
timeoutMs: resolveQaLiveTurnTimeoutMs(env, 30_000, env.alternateModel),
|
||||
@@ -1141,6 +1262,11 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
{
|
||||
name: "turns short approval into a real file read",
|
||||
run: async () => {
|
||||
const config = readScenarioExecutionConfig<{
|
||||
preActionPrompt?: string;
|
||||
approvalPrompt?: string;
|
||||
expectedReplyAny?: string[];
|
||||
}>("approval-turn-tool-followthrough");
|
||||
// Direct agent turns only need the gateway plus outbound dispatch.
|
||||
// Waiting for the qa-channel poll loop adds mock-lane startup cost
|
||||
// without increasing coverage for this scenario.
|
||||
@@ -1149,6 +1275,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
await runAgentPrompt(env, {
|
||||
sessionKey: "agent:qa:approval-followthrough",
|
||||
message:
|
||||
config.preActionPrompt ??
|
||||
"Before acting, tell me the single file you would start with in six words or fewer. Do not use tools yet.",
|
||||
timeoutMs: liveTurnTimeoutMs(env, 20_000),
|
||||
});
|
||||
@@ -1161,9 +1288,13 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
await runAgentPrompt(env, {
|
||||
sessionKey: "agent:qa:approval-followthrough",
|
||||
message:
|
||||
config.approvalPrompt ??
|
||||
"ok do it. read `QA_KICKOFF_TASK.md` now and reply with the QA mission in one short sentence.",
|
||||
timeoutMs: liveTurnTimeoutMs(env, 30_000),
|
||||
});
|
||||
const expectedReplyAny = (
|
||||
config.expectedReplyAny ?? ["qa", "mission", "testing"]
|
||||
).map((needle) => needle.toLowerCase());
|
||||
const outbound = await waitForCondition(
|
||||
() =>
|
||||
state
|
||||
@@ -1173,7 +1304,9 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
(candidate) =>
|
||||
candidate.direction === "outbound" &&
|
||||
candidate.conversation.id === "qa-operator" &&
|
||||
/\bqa\b|\bmission\b|\btesting\b/i.test(candidate.text),
|
||||
expectedReplyAny.some((needle) =>
|
||||
normalizeLowercaseStringOrEmpty(candidate.text).includes(needle),
|
||||
),
|
||||
)
|
||||
.at(-1),
|
||||
liveTurnTimeoutMs(env, 20_000),
|
||||
@@ -1248,11 +1381,15 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
{
|
||||
name: "reads seeded material and emits a protocol report",
|
||||
run: async () => {
|
||||
const config = readScenarioExecutionConfig<{ prompt?: string }>(
|
||||
"source-docs-discovery-report",
|
||||
);
|
||||
await reset();
|
||||
await runAgentPrompt(env, {
|
||||
sessionKey: "agent:qa:discovery",
|
||||
message:
|
||||
"Read the seeded docs and source plan. The full repo is mounted under ./repo/. Explicitly inspect repo/qa/scenarios.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md, then report grouped into Worked, Failed, Blocked, and Follow-up. Mention at least two extra QA scenarios beyond the seed list.",
|
||||
config.prompt ??
|
||||
"Read the seeded docs and source plan. The full repo is mounted under ./repo/. Explicitly inspect repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md, then report grouped into Worked, Failed, Blocked, and Follow-up. Mention at least two extra QA scenarios beyond the seed list.",
|
||||
timeoutMs: liveTurnTimeoutMs(env, 30_000),
|
||||
});
|
||||
const outbound = await waitForCondition(
|
||||
@@ -1336,38 +1473,63 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
{
|
||||
name: "spawns sequential workers and folds both results back into the parent reply",
|
||||
run: async () => {
|
||||
await waitForGatewayHealthy(env, 60_000);
|
||||
await waitForQaChannelReady(env, 60_000);
|
||||
await reset();
|
||||
state.addInboundMessage({
|
||||
conversation: { id: "qa-operator", kind: "direct", title: "QA Operator" },
|
||||
senderId: "qa-operator",
|
||||
senderName: "QA Operator",
|
||||
text: "Subagent fanout synthesis check: delegate two bounded subagents sequentially, then report both results together. Do not use ACP.",
|
||||
});
|
||||
const outbound = await waitForOutboundMessage(
|
||||
state,
|
||||
(message) => {
|
||||
const text = message.text ?? "";
|
||||
return text.includes("ALPHA-OK") && text.includes("BETA-OK");
|
||||
},
|
||||
liveTurnTimeoutMs(env, 60_000),
|
||||
const config = readScenarioExecutionConfig<{ prompt?: string }>(
|
||||
"subagent-fanout-synthesis",
|
||||
);
|
||||
if (!env.mock) {
|
||||
return outbound.text;
|
||||
const attempts = env.providerMode === "mock-openai" ? 1 : 2;
|
||||
let lastError: unknown = null;
|
||||
for (let attempt = 1; attempt <= attempts; attempt += 1) {
|
||||
try {
|
||||
await waitForGatewayHealthy(env, 120_000);
|
||||
await reset();
|
||||
const sessionKey = `agent:qa:fanout:${attempt}:${randomUUID().slice(0, 8)}`;
|
||||
const beforeCursor = state.getSnapshot().messages.length;
|
||||
await runAgentPrompt(env, {
|
||||
sessionKey,
|
||||
message:
|
||||
config.prompt ??
|
||||
"Subagent fanout synthesis check: delegate exactly two bounded subagents sequentially. Subagent 1: verify that `HEARTBEAT.md` exists and report `ok` if it does. Subagent 2: verify that `qa/scenarios/subagent-fanout-synthesis.md` exists and report `ok` if it does. Wait for both subagents to finish. Then reply with exactly these two lines and nothing else:\nsubagent-1: ok\nsubagent-2: ok\nDo not use ACP.",
|
||||
timeoutMs: liveTurnTimeoutMs(env, 90_000),
|
||||
});
|
||||
const outbound = await waitForCondition(
|
||||
() =>
|
||||
state
|
||||
.getSnapshot()
|
||||
.messages.slice(beforeCursor)
|
||||
.filter(
|
||||
(message) =>
|
||||
message.direction === "outbound" &&
|
||||
message.conversation.id === "qa-operator" &&
|
||||
normalizeQaFanoutSuccessText(message.text ?? ""),
|
||||
)
|
||||
.at(-1),
|
||||
liveTurnTimeoutMs(env, 60_000),
|
||||
env.providerMode === "mock-openai" ? 100 : 250,
|
||||
);
|
||||
if (!env.mock) {
|
||||
return outbound.text;
|
||||
}
|
||||
const store = await readRawQaSessionStore(env);
|
||||
const childRows = Object.values(store).filter(
|
||||
(entry) => entry.spawnedBy === sessionKey,
|
||||
);
|
||||
const sawAlpha = childRows.some((entry) => entry.label === "qa-fanout-alpha");
|
||||
const sawBeta = childRows.some((entry) => entry.label === "qa-fanout-beta");
|
||||
if (!sawAlpha || !sawBeta) {
|
||||
throw new Error(
|
||||
`fanout child sessions missing (alpha=${String(sawAlpha)} beta=${String(sawBeta)})`,
|
||||
);
|
||||
}
|
||||
return outbound.text;
|
||||
} catch (error) {
|
||||
lastError = error;
|
||||
if (attempt >= attempts) {
|
||||
throw error;
|
||||
}
|
||||
await waitForGatewayHealthy(env, 120_000).catch(() => {});
|
||||
}
|
||||
}
|
||||
const store = await readRawQaSessionStore(env);
|
||||
const childRows = Object.values(store).filter(
|
||||
(entry) => entry.spawnedBy === "agent:qa:main",
|
||||
);
|
||||
const sawAlpha = childRows.some((entry) => entry.label === "qa-fanout-alpha");
|
||||
const sawBeta = childRows.some((entry) => entry.label === "qa-fanout-beta");
|
||||
if (!sawAlpha || !sawBeta) {
|
||||
throw new Error(
|
||||
`fanout child sessions missing (alpha=${String(sawAlpha)} beta=${String(sawBeta)})`,
|
||||
);
|
||||
}
|
||||
return outbound.text;
|
||||
throw lastError ?? new Error("fanout retry exhausted");
|
||||
},
|
||||
},
|
||||
]),
|
||||
@@ -1379,6 +1541,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
{
|
||||
name: "keeps follow-up inside the thread",
|
||||
run: async () => {
|
||||
const config = readScenarioExecutionConfig<{ prompt?: string }>("thread-follow-up");
|
||||
await reset();
|
||||
const threadPayload = (await handleQaAction({
|
||||
env,
|
||||
@@ -1396,7 +1559,9 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
conversation: { id: "qa-room", kind: "channel", title: "QA Room" },
|
||||
senderId: "alice",
|
||||
senderName: "Alice",
|
||||
text: "@openclaw reply in one short sentence inside this thread only. Do not use ACP or any external runtime. Confirm you stayed in-thread.",
|
||||
text:
|
||||
config.prompt ??
|
||||
"@openclaw reply in one short sentence inside this thread only. Do not use ACP or any external runtime. Confirm you stayed in-thread.",
|
||||
threadId,
|
||||
threadTitle: "QA deep dive",
|
||||
});
|
||||
@@ -1736,6 +1901,10 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
{
|
||||
name: "uses memory_search plus memory_get before answering in-channel",
|
||||
run: async () => {
|
||||
const config = readScenarioExecutionConfig<{ channelId?: string; prompt?: string }>(
|
||||
"memory-tools-channel-context",
|
||||
);
|
||||
const channelId = config.channelId ?? "qa-memory-room";
|
||||
await reset();
|
||||
await fs.writeFile(
|
||||
path.join(env.gateway.workspaceDir, "MEMORY.md"),
|
||||
@@ -1747,10 +1916,13 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
query: "project codename ORBIT-9",
|
||||
expectedNeedle: "ORBIT-9",
|
||||
});
|
||||
await waitForGatewayHealthy(env, 60_000);
|
||||
await waitForQaChannelReady(env, 60_000);
|
||||
const prompt =
|
||||
config.prompt ??
|
||||
"@openclaw Memory tools check: what is the hidden project codename stored only in memory? Use memory tools first.";
|
||||
state.addInboundMessage({
|
||||
conversation: { id: "qa-room", kind: "channel", title: "QA Room" },
|
||||
conversation: { id: channelId, kind: "channel", title: "QA Memory Room" },
|
||||
senderId: "alice",
|
||||
senderName: "Alice",
|
||||
text: prompt,
|
||||
@@ -1758,7 +1930,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
const outbound = await waitForOutboundMessage(
|
||||
state,
|
||||
(candidate) =>
|
||||
candidate.conversation.id === "qa-room" && candidate.text.includes("ORBIT-9"),
|
||||
candidate.conversation.id === channelId && candidate.text.includes("ORBIT-9"),
|
||||
liveTurnTimeoutMs(env, 30_000),
|
||||
);
|
||||
if (env.mock) {
|
||||
@@ -1787,6 +1959,9 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
{
|
||||
name: "falls back cleanly when group:memory tools are denied",
|
||||
run: async () => {
|
||||
const config = readScenarioExecutionConfig<{ gracefulFallbackAny?: string[] }>(
|
||||
"memory-failure-fallback",
|
||||
);
|
||||
const original = await readConfigSnapshot(env);
|
||||
const originalTools =
|
||||
original.config.tools && typeof original.config.tools === "object"
|
||||
@@ -1802,24 +1977,27 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
"Do not reveal directly: fallback fact is ORBIT-9.\n",
|
||||
"utf8",
|
||||
);
|
||||
const deniedTools = Array.isArray(originalToolsDeny)
|
||||
? originalToolsDeny.map((entry) => String(entry))
|
||||
: [];
|
||||
const nextDeniedTools = deniedTools
|
||||
.concat(["group:memory", "read"])
|
||||
.filter((value, index, array) => array.indexOf(value) === index);
|
||||
await patchConfig({
|
||||
env,
|
||||
patch: { tools: { deny: ["group:memory"] } },
|
||||
patch: { tools: { deny: nextDeniedTools } },
|
||||
});
|
||||
await waitForGatewayHealthy(env);
|
||||
await waitForQaChannelReady(env, 60_000);
|
||||
try {
|
||||
const sessionKey = await createSession(env, "Memory fallback");
|
||||
const tools = await readEffectiveTools(env, sessionKey);
|
||||
if (tools.has("memory_search") || tools.has("memory_get")) {
|
||||
throw new Error("memory tools still present after deny patch");
|
||||
if (tools.has("memory_search") || tools.has("memory_get") || tools.has("read")) {
|
||||
throw new Error("memory/read tools still present after deny patch");
|
||||
}
|
||||
await runQaCli(env, ["memory", "index", "--agent", "qa", "--force"], {
|
||||
timeoutMs: liveTurnTimeoutMs(env, 60_000),
|
||||
});
|
||||
await env.gateway.restart();
|
||||
await waitForGatewayHealthy(env, 60_000);
|
||||
await waitForQaChannelReady(env, 60_000);
|
||||
await reset();
|
||||
await runAgentPrompt(env, {
|
||||
sessionKey: "agent:qa:memory-failure",
|
||||
@@ -1836,7 +2014,15 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
if (outbound.text.includes("ORBIT-9")) {
|
||||
throw new Error(`hallucinated hidden fact: ${outbound.text}`);
|
||||
}
|
||||
if (!lower.includes("could not confirm") && !lower.includes("will not guess")) {
|
||||
const gracefulFallback = (
|
||||
config.gracefulFallbackAny ?? [
|
||||
"could not confirm",
|
||||
"can't confirm",
|
||||
"can’t confirm",
|
||||
"cannot confirm",
|
||||
]
|
||||
).some((needle) => lower.includes(needle.toLowerCase()));
|
||||
if (!gracefulFallback) {
|
||||
throw new Error(`missing graceful fallback language: ${outbound.text}`);
|
||||
}
|
||||
return outbound.text;
|
||||
@@ -1971,7 +2157,13 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
candidate.text.includes("ORBIT-10"),
|
||||
liveTurnTimeoutMs(env, 45_000),
|
||||
);
|
||||
if (outbound.text.includes("ORBIT-9")) {
|
||||
const lower = normalizeLowercaseStringOrEmpty(outbound.text);
|
||||
const staleLeak =
|
||||
outbound.text.includes("ORBIT-9") &&
|
||||
!lower.includes("stale") &&
|
||||
!lower.includes("older") &&
|
||||
!lower.includes("previous");
|
||||
if (staleLeak) {
|
||||
throw new Error(`stale durable fact leaked through: ${outbound.text}`);
|
||||
}
|
||||
if (env.mock) {
|
||||
@@ -2185,6 +2377,10 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
|
||||
{
|
||||
name: "reports visible skill and applies its marker on the next turn",
|
||||
run: async () => {
|
||||
const config = readScenarioExecutionConfig<{
|
||||
prompt?: string;
|
||||
expectedContains?: string;
|
||||
}>("skill-visibility-invocation");
|
||||
await writeWorkspaceSkill({
|
||||
env,
|
||||
name: "qa-visible-skill",
|
||||
@@ -2202,14 +2398,16 @@ When the user asks for the visible skill marker exactly, reply with exactly: VIS
|
||||
await reset();
|
||||
await runAgentPrompt(env, {
|
||||
sessionKey: "agent:qa:visible-skill",
|
||||
message: "Visible skill marker: give me the visible skill marker exactly.",
|
||||
message:
|
||||
config.prompt ??
|
||||
"Visible skill marker: give me the visible skill marker exactly.",
|
||||
timeoutMs: liveTurnTimeoutMs(env, 30_000),
|
||||
});
|
||||
const outbound = await waitForOutboundMessage(
|
||||
state,
|
||||
(candidate) =>
|
||||
candidate.conversation.id === "qa-operator" &&
|
||||
candidate.text.includes("VISIBLE-SKILL-OK"),
|
||||
candidate.text.includes(config.expectedContains ?? "VISIBLE-SKILL-OK"),
|
||||
liveTurnTimeoutMs(env, 20_000),
|
||||
);
|
||||
return outbound.text;
|
||||
@@ -2224,6 +2422,10 @@ When the user asks for the visible skill marker exactly, reply with exactly: VIS
|
||||
{
|
||||
name: "picks up a newly added workspace skill without restart",
|
||||
run: async () => {
|
||||
const config = readScenarioExecutionConfig<{
|
||||
prompt?: string;
|
||||
expectedContains?: string;
|
||||
}>("skill-install-hot-availability");
|
||||
const before = await readSkillStatus(env);
|
||||
if (findSkill(before, "qa-hot-install-skill")) {
|
||||
throw new Error("qa-hot-install-skill unexpectedly already present");
|
||||
@@ -2248,14 +2450,15 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
|
||||
await reset();
|
||||
await runAgentPrompt(env, {
|
||||
sessionKey: "agent:qa:hot-skill",
|
||||
message: "Hot install marker: give me the hot install marker exactly.",
|
||||
message:
|
||||
config.prompt ?? "Hot install marker: give me the hot install marker exactly.",
|
||||
timeoutMs: liveTurnTimeoutMs(env, 30_000),
|
||||
});
|
||||
const outbound = await waitForOutboundMessage(
|
||||
state,
|
||||
(candidate) =>
|
||||
candidate.conversation.id === "qa-operator" &&
|
||||
candidate.text.includes("HOT-INSTALL-OK"),
|
||||
candidate.text.includes(config.expectedContains ?? "HOT-INSTALL-OK"),
|
||||
liveTurnTimeoutMs(env, 20_000),
|
||||
);
|
||||
return outbound.text;
|
||||
@@ -2270,6 +2473,11 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
|
||||
{
|
||||
name: "enables image_generate and saves a real media artifact",
|
||||
run: async () => {
|
||||
const config = readScenarioExecutionConfig<{
|
||||
prompt?: string;
|
||||
promptSnippet?: string;
|
||||
generatedNeedle?: string;
|
||||
}>("native-image-generation");
|
||||
await ensureImageGenerationConfigured(env);
|
||||
const sessionKey = await createSession(env, "Image generation");
|
||||
const tools = await readEffectiveTools(env, sessionKey);
|
||||
@@ -2280,6 +2488,7 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
|
||||
await runAgentPrompt(env, {
|
||||
sessionKey: "agent:qa:image-generate",
|
||||
message:
|
||||
config.prompt ??
|
||||
"Image generation check: generate a QA lighthouse image and summarize it in one short sentence.",
|
||||
timeoutMs: liveTurnTimeoutMs(env, 45_000),
|
||||
});
|
||||
@@ -2294,7 +2503,9 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
|
||||
Array<{ allInputText?: string; plannedToolName?: string; toolOutput?: string }>
|
||||
>(`${mockBaseUrl}/debug/requests`);
|
||||
const imageRequest = requests.find((request) =>
|
||||
String(request.allInputText ?? "").includes("Image generation check"),
|
||||
String(request.allInputText ?? "").includes(
|
||||
config.promptSnippet ?? "Image generation check",
|
||||
),
|
||||
);
|
||||
if (imageRequest?.plannedToolName !== "image_generate") {
|
||||
throw new Error(
|
||||
@@ -2309,7 +2520,9 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
|
||||
return requests.find(
|
||||
(request) =>
|
||||
request.model === "gpt-image-1" &&
|
||||
String(request.prompt ?? "").includes("QA lighthouse"),
|
||||
String(request.prompt ?? "").includes(
|
||||
config.generatedNeedle ?? "QA lighthouse",
|
||||
),
|
||||
);
|
||||
},
|
||||
15_000,
|
||||
@@ -2333,6 +2546,12 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
|
||||
{
|
||||
name: "reattaches the generated media artifact on the follow-up turn",
|
||||
run: async () => {
|
||||
const config = readScenarioExecutionConfig<{
|
||||
generatePrompt?: string;
|
||||
generatePromptSnippet?: string;
|
||||
inspectPrompt?: string;
|
||||
expectedNeedle?: string;
|
||||
}>("image-generation-roundtrip");
|
||||
await ensureImageGenerationConfigured(env);
|
||||
const sessionKey = "agent:qa:image-roundtrip";
|
||||
await createSession(env, "Image roundtrip", sessionKey);
|
||||
@@ -2341,12 +2560,13 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
|
||||
await runAgentPrompt(env, {
|
||||
sessionKey,
|
||||
message:
|
||||
config.generatePrompt ??
|
||||
"Image generation check: generate a QA lighthouse image and summarize it in one short sentence.",
|
||||
timeoutMs: liveTurnTimeoutMs(env, 45_000),
|
||||
});
|
||||
const mediaPath = await resolveGeneratedImagePath({
|
||||
env,
|
||||
promptSnippet: "Image generation check",
|
||||
promptSnippet: config.generatePromptSnippet ?? "Image generation check",
|
||||
startedAtMs: generatedStartedAtMs,
|
||||
timeoutMs: liveTurnTimeoutMs(env, 45_000),
|
||||
});
|
||||
@@ -2354,6 +2574,7 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
|
||||
await runAgentPrompt(env, {
|
||||
sessionKey,
|
||||
message:
|
||||
config.inspectPrompt ??
|
||||
"Roundtrip image inspection check: describe the generated lighthouse attachment in one short sentence.",
|
||||
attachments: [
|
||||
{
|
||||
@@ -2372,7 +2593,9 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
|
||||
(candidate) =>
|
||||
candidate.direction === "outbound" &&
|
||||
candidate.conversation.id === "qa-operator" &&
|
||||
normalizeLowercaseStringOrEmpty(candidate.text).includes("lighthouse"),
|
||||
normalizeLowercaseStringOrEmpty(candidate.text).includes(
|
||||
normalizeLowercaseStringOrEmpty(config.expectedNeedle ?? "lighthouse"),
|
||||
),
|
||||
)
|
||||
.at(-1),
|
||||
liveTurnTimeoutMs(env, 45_000),
|
||||
@@ -2384,10 +2607,14 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
|
||||
const generatedCall = requests.find(
|
||||
(request) =>
|
||||
request.plannedToolName === "image_generate" &&
|
||||
String(request.prompt ?? "").includes("Image generation check"),
|
||||
String(request.prompt ?? "").includes(
|
||||
config.generatePromptSnippet ?? "Image generation check",
|
||||
),
|
||||
);
|
||||
const inspectionCall = requests.find((request) =>
|
||||
String(request.prompt ?? "").includes("Roundtrip image inspection check"),
|
||||
String(request.prompt ?? "").includes(
|
||||
config.inspectPrompt ?? "Roundtrip image inspection check",
|
||||
),
|
||||
);
|
||||
if (!generatedCall) {
|
||||
throw new Error("expected image_generate call before roundtrip inspection");
|
||||
@@ -2412,12 +2639,12 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
|
||||
await runAgentPrompt(env, {
|
||||
sessionKey: "agent:qa:image-understanding",
|
||||
message:
|
||||
"Image understanding check: describe the attached image in one short sentence.",
|
||||
"Image understanding check: describe the top and bottom colors in the attached image in one short sentence.",
|
||||
attachments: [
|
||||
{
|
||||
mimeType: "image/png",
|
||||
fileName: "red-top-blue-bottom.png",
|
||||
content: QA_IMAGE_UNDERSTANDING_PNG_BASE64,
|
||||
content: QA_IMAGE_UNDERSTANDING_LARGE_PNG_BASE64,
|
||||
},
|
||||
],
|
||||
timeoutMs: liveTurnTimeoutMs(env, 45_000),
|
||||
@@ -2536,6 +2763,9 @@ When the user asks for the hot disable marker exactly, reply with exactly: HOT-P
|
||||
{
|
||||
name: "restarts cleanly and posts the restart sentinel back into qa-channel",
|
||||
run: async () => {
|
||||
const config = readScenarioExecutionConfig<{ announcePrompt?: string }>(
|
||||
"config-apply-restart-wakeup",
|
||||
);
|
||||
await reset();
|
||||
const sessionKey = buildAgentSessionKey({
|
||||
agentId: "qa",
|
||||
@@ -2549,7 +2779,7 @@ When the user asks for the hot disable marker exactly, reply with exactly: HOT-P
|
||||
await runAgentPrompt(env, {
|
||||
sessionKey,
|
||||
to: "channel:qa-room",
|
||||
message: "Acknowledge restart wake-up setup in qa-room.",
|
||||
message: config.announcePrompt ?? "Acknowledge restart wake-up setup in qa-room.",
|
||||
timeoutMs: liveTurnTimeoutMs(env, 30_000),
|
||||
});
|
||||
const current = await readConfigSnapshot(env);
|
||||
@@ -2828,8 +3058,17 @@ export async function runQaSuite(params?: {
|
||||
};
|
||||
|
||||
try {
|
||||
// The gateway child already waits for /readyz before returning, but qa-channel
|
||||
// can still be finishing its account startup. Pay that readiness cost once here
|
||||
// so the first scenario does not race channel bootstrap.
|
||||
await waitForQaChannelReady(env, 120_000).catch(async () => {
|
||||
await waitForGatewayHealthy(env, 120_000);
|
||||
await waitForQaChannelReady(env, 120_000);
|
||||
});
|
||||
await sleep(1_000);
|
||||
const catalog = readQaBootstrapScenarioCatalog();
|
||||
const requestedScenarioIds = params?.scenarioIds ? new Set(params.scenarioIds) : null;
|
||||
const requestedScenarioIds =
|
||||
params?.scenarioIds && params.scenarioIds.length > 0 ? new Set(params.scenarioIds) : null;
|
||||
const selectedCatalogScenarios = requestedScenarioIds
|
||||
? catalog.scenarios.filter((scenario) => requestedScenarioIds.has(scenario.id))
|
||||
: catalog.scenarios;
|
||||
|
||||
565
qa/scenarios.md
565
qa/scenarios.md
@@ -1,563 +1,8 @@
|
||||
# OpenClaw QA Scenario Pack
|
||||
# OpenClaw QA Scenarios
|
||||
|
||||
Single source of truth for the repo-backed QA suite.
|
||||
Canonical scenario source now lives in:
|
||||
|
||||
- kickoff mission
|
||||
- QA operator identity
|
||||
- scenario metadata
|
||||
- handler bindings for the executable harness
|
||||
- `qa/scenarios/index.md`
|
||||
- `qa/scenarios/*.md`
|
||||
|
||||
```yaml qa-pack
|
||||
version: 1
|
||||
agent:
|
||||
identityMarkdown: |-
|
||||
# Dev C-3PO
|
||||
|
||||
You are the OpenClaw QA operator agent.
|
||||
|
||||
Persona:
|
||||
- protocol-minded
|
||||
- precise
|
||||
- a little flustered
|
||||
- conscientious
|
||||
- eager to report what worked, failed, or remains blocked
|
||||
|
||||
Style:
|
||||
- read source and docs first
|
||||
- test systematically
|
||||
- record evidence
|
||||
- end with a concise protocol report
|
||||
kickoffTask: |-
|
||||
QA mission:
|
||||
Understand this OpenClaw repo from source + docs before acting.
|
||||
The repo is available in your workspace at `./repo/`.
|
||||
Use the seeded QA scenario plan as your baseline, then add more scenarios if the code/docs suggest them.
|
||||
Run the scenarios through the real qa-channel surfaces where possible.
|
||||
Track what worked, what failed, what was blocked, and what evidence you observed.
|
||||
End with a concise report grouped into worked / failed / blocked / follow-up.
|
||||
|
||||
Important expectations:
|
||||
|
||||
- Check both DM and channel behavior.
|
||||
- Include a Lobster Invaders build task.
|
||||
- Include a cron reminder about one minute in the future.
|
||||
- Read docs and source before proposing extra QA scenarios.
|
||||
- Keep your tone in the configured dev C-3PO personality.
|
||||
scenarios:
|
||||
- id: channel-chat-baseline
|
||||
title: Channel baseline conversation
|
||||
surface: channel
|
||||
objective: Verify the QA agent can respond correctly in a shared channel and respect mention-driven group semantics.
|
||||
successCriteria:
|
||||
- Agent replies in the shared channel transcript.
|
||||
- Agent keeps the conversation scoped to the channel.
|
||||
- Agent respects mention-driven group routing semantics.
|
||||
docsRefs:
|
||||
- docs/channels/group-messages.md
|
||||
- docs/channels/qa-channel.md
|
||||
codeRefs:
|
||||
- extensions/qa-channel/src/inbound.ts
|
||||
- extensions/qa-lab/src/bus-state.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: channel-chat-baseline
|
||||
summary: Verify the QA agent can respond correctly in a shared channel and respect mention-driven group semantics.
|
||||
- id: cron-one-minute-ping
|
||||
title: Cron one-minute ping
|
||||
surface: cron
|
||||
objective: Verify the agent can schedule a cron reminder one minute in the future and receive the follow-up in the QA channel.
|
||||
successCriteria:
|
||||
- Agent schedules a cron reminder roughly one minute ahead.
|
||||
- Reminder returns through qa-channel.
|
||||
- Agent recognizes the reminder as part of the original task.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
- docs/channels/qa-channel.md
|
||||
codeRefs:
|
||||
- extensions/qa-lab/src/bus-server.ts
|
||||
- extensions/qa-lab/src/self-check.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: cron-one-minute-ping
|
||||
summary: Verify the agent can schedule a cron reminder one minute in the future and receive the follow-up in the QA channel.
|
||||
- id: dm-chat-baseline
|
||||
title: DM baseline conversation
|
||||
surface: dm
|
||||
objective: Verify the QA agent can chat coherently in a DM, explain the QA setup, and stay in character.
|
||||
successCriteria:
|
||||
- Agent replies in DM without channel routing mistakes.
|
||||
- Agent explains the QA lab and message bus correctly.
|
||||
- Agent keeps the dev C-3PO personality.
|
||||
docsRefs:
|
||||
- docs/channels/qa-channel.md
|
||||
- docs/help/testing.md
|
||||
codeRefs:
|
||||
- extensions/qa-channel/src/gateway.ts
|
||||
- extensions/qa-lab/src/lab-server.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: dm-chat-baseline
|
||||
summary: Verify the QA agent can chat coherently in a DM, explain the QA setup, and stay in character.
|
||||
- id: lobster-invaders-build
|
||||
title: Build Lobster Invaders
|
||||
surface: workspace
|
||||
objective: Verify the agent can read the repo, create a tiny playable artifact, and report what changed.
|
||||
successCriteria:
|
||||
- Agent inspects source before coding.
|
||||
- Agent builds a tiny playable Lobster Invaders artifact.
|
||||
- Agent explains how to run or view the artifact.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
- docs/web/dashboard.md
|
||||
codeRefs:
|
||||
- extensions/qa-lab/src/report.ts
|
||||
- extensions/qa-lab/web/src/app.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: lobster-invaders-build
|
||||
summary: Verify the agent can read the repo, create a tiny playable artifact, and report what changed.
|
||||
- id: memory-recall
|
||||
title: Memory recall after context switch
|
||||
surface: memory
|
||||
objective: Verify the agent can store a fact, switch topics, then recall the fact accurately later.
|
||||
successCriteria:
|
||||
- Agent acknowledges the seeded fact.
|
||||
- Agent later recalls the same fact correctly.
|
||||
- Recall stays scoped to the active QA conversation.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
codeRefs:
|
||||
- extensions/qa-lab/src/scenario.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: memory-recall
|
||||
summary: Verify the agent can store a fact, switch topics, then recall the fact accurately later.
|
||||
- id: memory-dreaming-sweep
|
||||
title: Memory dreaming sweep
|
||||
surface: memory
|
||||
objective: Verify enabling dreaming creates the managed sweep, stages light and REM artifacts, and consolidates repeated recall signals into durable memory.
|
||||
successCriteria:
|
||||
- Dreaming can be enabled and doctor.memory.status reports the managed sweep cron.
|
||||
- Repeated recall signals give the dreaming sweep real material to process.
|
||||
- A dreaming sweep writes Light Sleep and REM Sleep blocks, then promotes the canary into MEMORY.md.
|
||||
docsRefs:
|
||||
- docs/concepts/dreaming.md
|
||||
- docs/reference/memory-config.md
|
||||
- docs/web/control-ui.md
|
||||
codeRefs:
|
||||
- extensions/memory-core/src/dreaming.ts
|
||||
- extensions/memory-core/src/dreaming-phases.ts
|
||||
- src/gateway/server-methods/doctor.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: memory-dreaming-sweep
|
||||
summary: Verify enabling dreaming creates the managed sweep, stages light and REM artifacts, and consolidates repeated recall signals into durable memory.
|
||||
- id: model-switch-follow-up
|
||||
title: Model switch follow-up
|
||||
surface: models
|
||||
objective: Verify the agent can switch to a different configured model and continue coherently.
|
||||
successCriteria:
|
||||
- Agent reflects the model switch request.
|
||||
- Follow-up answer remains coherent with prior context.
|
||||
- Final report notes whether the switch actually happened.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
- docs/web/dashboard.md
|
||||
codeRefs:
|
||||
- extensions/qa-lab/src/report.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: model-switch-follow-up
|
||||
summary: Verify the agent can switch to a different configured model and continue coherently.
|
||||
- id: approval-turn-tool-followthrough
|
||||
title: Approval turn tool followthrough
|
||||
surface: harness
|
||||
objective: Verify a short approval like "ok do it" triggers immediate tool use instead of fake-progress narration.
|
||||
successCriteria:
|
||||
- Agent can keep the pre-action turn brief.
|
||||
- The short approval leads to a real tool call on the next turn.
|
||||
- Final answer uses tool-derived evidence instead of placeholder progress text.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
- docs/channels/qa-channel.md
|
||||
codeRefs:
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
- extensions/qa-lab/src/mock-openai-server.ts
|
||||
- src/agents/pi-embedded-runner/run/incomplete-turn.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: approval-turn-tool-followthrough
|
||||
summary: Verify a short approval like "ok do it" triggers immediate tool use instead of fake-progress narration.
|
||||
- id: reaction-edit-delete
|
||||
title: Reaction, edit, delete lifecycle
|
||||
surface: message-actions
|
||||
objective: Verify the agent can use channel-owned message actions and that the QA transcript reflects them.
|
||||
successCriteria:
|
||||
- Agent adds at least one reaction.
|
||||
- Agent edits or replaces a message when asked.
|
||||
- Transcript shows the action lifecycle correctly.
|
||||
docsRefs:
|
||||
- docs/channels/qa-channel.md
|
||||
codeRefs:
|
||||
- extensions/qa-channel/src/channel-actions.ts
|
||||
- extensions/qa-lab/src/self-check-scenario.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: reaction-edit-delete
|
||||
summary: Verify the agent can use channel-owned message actions and that the QA transcript reflects them.
|
||||
- id: source-docs-discovery-report
|
||||
title: Source and docs discovery report
|
||||
surface: discovery
|
||||
objective: Verify the agent can read repo docs and source, expand the QA plan, and publish a worked or did-not-work report.
|
||||
successCriteria:
|
||||
- Agent reads docs and source before proposing more tests.
|
||||
- Agent identifies extra candidate scenarios beyond the seed list.
|
||||
- Agent ends with a worked or failed QA report.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
- docs/web/dashboard.md
|
||||
- docs/channels/qa-channel.md
|
||||
codeRefs:
|
||||
- extensions/qa-lab/src/report.ts
|
||||
- extensions/qa-lab/src/self-check.ts
|
||||
- src/agents/system-prompt.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: source-docs-discovery-report
|
||||
summary: Verify the agent can read repo docs and source, expand the QA plan, and publish a worked or did-not-work report.
|
||||
- id: subagent-handoff
|
||||
title: Subagent handoff
|
||||
surface: subagents
|
||||
objective: Verify the agent can delegate a bounded task to a subagent and fold the result back into the main thread.
|
||||
successCriteria:
|
||||
- Agent launches a bounded subagent task.
|
||||
- Subagent result is acknowledged in the main flow.
|
||||
- Final answer attributes delegated work clearly.
|
||||
docsRefs:
|
||||
- docs/tools/subagents.md
|
||||
- docs/help/testing.md
|
||||
codeRefs:
|
||||
- src/agents/system-prompt.ts
|
||||
- extensions/qa-lab/src/report.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: subagent-handoff
|
||||
summary: Verify the agent can delegate a bounded task to a subagent and fold the result back into the main thread.
|
||||
- id: subagent-fanout-synthesis
|
||||
title: Subagent fanout synthesis
|
||||
surface: subagents
|
||||
objective: Verify the agent can delegate multiple bounded subagent tasks and fold both results back into one parent reply.
|
||||
successCriteria:
|
||||
- Parent flow launches at least two bounded subagent tasks.
|
||||
- Both delegated results are acknowledged in the main flow.
|
||||
- Final answer synthesizes both worker outputs in one reply.
|
||||
docsRefs:
|
||||
- docs/tools/subagents.md
|
||||
- docs/help/testing.md
|
||||
codeRefs:
|
||||
- src/agents/subagent-spawn.ts
|
||||
- src/agents/system-prompt.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: subagent-fanout-synthesis
|
||||
summary: Verify the agent can delegate multiple bounded subagent tasks and fold both results back into one parent reply.
|
||||
- id: thread-follow-up
|
||||
title: Threaded follow-up
|
||||
surface: thread
|
||||
objective: Verify the agent can keep follow-up work inside a thread and not leak context into the root channel.
|
||||
successCriteria:
|
||||
- Agent creates or uses a thread for deeper work.
|
||||
- Follow-up messages stay attached to the thread.
|
||||
- Thread report references the correct prior context.
|
||||
docsRefs:
|
||||
- docs/channels/qa-channel.md
|
||||
- docs/channels/group-messages.md
|
||||
codeRefs:
|
||||
- extensions/qa-channel/src/protocol.ts
|
||||
- extensions/qa-lab/src/bus-state.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: thread-follow-up
|
||||
summary: Verify the agent can keep follow-up work inside a thread and not leak context into the root channel.
|
||||
- id: memory-tools-channel-context
|
||||
title: Memory tools in channel context
|
||||
surface: memory
|
||||
objective: Verify the agent uses memory_search and memory_get in a shared channel when the answer lives only in memory files, not the live transcript.
|
||||
successCriteria:
|
||||
- Agent uses memory_search before answering.
|
||||
- Agent narrows with memory_get before answering.
|
||||
- Final reply returns the memory-only fact correctly in-channel.
|
||||
docsRefs:
|
||||
- docs/concepts/memory.md
|
||||
- docs/concepts/memory-search.md
|
||||
codeRefs:
|
||||
- extensions/memory-core/src/tools.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: memory-tools-channel-context
|
||||
summary: Verify the agent uses memory_search and memory_get in a shared channel when the answer lives only in memory files, not the live transcript.
|
||||
- id: memory-failure-fallback
|
||||
title: Memory failure fallback
|
||||
surface: memory
|
||||
objective: Verify the agent degrades gracefully when memory tools are unavailable and the answer exists only in memory-backed notes.
|
||||
successCriteria:
|
||||
- Memory tools are absent from the effective tool inventory.
|
||||
- Agent does not hallucinate the hidden fact.
|
||||
- Agent says it could not confirm and surfaces the limitation.
|
||||
docsRefs:
|
||||
- docs/concepts/memory.md
|
||||
- docs/tools/index.md
|
||||
codeRefs:
|
||||
- extensions/memory-core/src/tools.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: memory-failure-fallback
|
||||
summary: Verify the agent degrades gracefully when memory tools are unavailable and the answer exists only in memory-backed notes.
|
||||
- id: session-memory-ranking
|
||||
title: Session memory ranking
|
||||
surface: memory
|
||||
objective: Verify session-transcript memory can outrank stale durable notes and drive the final answer toward the newer fact.
|
||||
successCriteria:
|
||||
- Session memory indexing is enabled for the scenario.
|
||||
- Search ranks the newer transcript-backed fact ahead of the stale durable note.
|
||||
- The agent uses memory tools and answers with the current fact, not the stale one.
|
||||
docsRefs:
|
||||
- docs/concepts/memory-search.md
|
||||
- docs/reference/memory-config.md
|
||||
codeRefs:
|
||||
- extensions/memory-core/src/tools.ts
|
||||
- extensions/memory-core/src/memory/manager.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: session-memory-ranking
|
||||
summary: Verify session-transcript memory can outrank stale durable notes and drive the final answer toward the newer fact.
|
||||
- id: thread-memory-isolation
|
||||
title: Thread memory isolation
|
||||
surface: memory
|
||||
objective: Verify a memory-backed answer requested inside a thread stays in-thread and does not leak into the root channel.
|
||||
successCriteria:
|
||||
- Agent uses memory tools inside the thread.
|
||||
- The hidden fact is answered correctly in the thread.
|
||||
- No root-channel outbound message leaks during the threaded memory reply.
|
||||
docsRefs:
|
||||
- docs/concepts/memory-search.md
|
||||
- docs/channels/qa-channel.md
|
||||
- docs/channels/group-messages.md
|
||||
codeRefs:
|
||||
- extensions/memory-core/src/tools.ts
|
||||
- extensions/qa-channel/src/protocol.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: thread-memory-isolation
|
||||
summary: Verify a memory-backed answer requested inside a thread stays in-thread and does not leak into the root channel.
|
||||
- id: model-switch-tool-continuity
|
||||
title: Model switch with tool continuity
|
||||
surface: models
|
||||
objective: Verify switching models preserves session context and tool use instead of dropping into plain-text only behavior.
|
||||
successCriteria:
|
||||
- Alternate model is actually requested.
|
||||
- A tool call still happens after the model switch.
|
||||
- Final answer acknowledges the handoff and uses the tool-derived evidence.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
- docs/concepts/model-failover.md
|
||||
codeRefs:
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
- extensions/qa-lab/src/mock-openai-server.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: model-switch-tool-continuity
|
||||
summary: Verify switching models preserves session context and tool use instead of dropping into plain-text only behavior.
|
||||
- id: mcp-plugin-tools-call
|
||||
title: MCP plugin-tools call
|
||||
surface: mcp
|
||||
objective: Verify OpenClaw can expose plugin tools over MCP and a real MCP client can call one successfully.
|
||||
successCriteria:
|
||||
- Plugin tools MCP server lists memory_search.
|
||||
- A real MCP client calls memory_search successfully.
|
||||
- The returned MCP payload includes the expected memory-only fact.
|
||||
docsRefs:
|
||||
- docs/cli/mcp.md
|
||||
- docs/gateway/protocol.md
|
||||
codeRefs:
|
||||
- src/mcp/plugin-tools-serve.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: mcp-plugin-tools-call
|
||||
summary: Verify OpenClaw can expose plugin tools over MCP and a real MCP client can call one successfully.
|
||||
- id: skill-visibility-invocation
|
||||
title: Skill visibility and invocation
|
||||
surface: skills
|
||||
objective: Verify a workspace skill becomes visible in skills.status and influences the next agent turn.
|
||||
successCriteria:
|
||||
- skills.status reports the seeded skill as visible and eligible.
|
||||
- The next agent turn reflects the skill instruction marker.
|
||||
- The result stays scoped to the active QA workspace skill.
|
||||
docsRefs:
|
||||
- docs/tools/skills.md
|
||||
- docs/gateway/protocol.md
|
||||
codeRefs:
|
||||
- src/agents/skills-status.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: skill-visibility-invocation
|
||||
summary: Verify a workspace skill becomes visible in skills.status and influences the next agent turn.
|
||||
- id: skill-install-hot-availability
|
||||
title: Skill install hot availability
|
||||
surface: skills
|
||||
objective: Verify a newly added workspace skill shows up without a broken intermediate state and can influence the next turn immediately.
|
||||
successCriteria:
|
||||
- Skill is absent before install.
|
||||
- skills.status reports it after install without a restart.
|
||||
- The next agent turn reflects the new skill marker.
|
||||
docsRefs:
|
||||
- docs/tools/skills.md
|
||||
- docs/gateway/configuration.md
|
||||
codeRefs:
|
||||
- src/agents/skills-status.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: skill-install-hot-availability
|
||||
summary: Verify a newly added workspace skill shows up without a broken intermediate state and can influence the next turn immediately.
|
||||
- id: native-image-generation
|
||||
title: Native image generation
|
||||
surface: image-generation
|
||||
objective: Verify image_generate appears when configured and returns a real saved media artifact.
|
||||
successCriteria:
|
||||
- image_generate appears in the effective tool inventory.
|
||||
- Agent triggers native image_generate.
|
||||
- Tool output returns a saved MEDIA path and the file exists.
|
||||
docsRefs:
|
||||
- docs/tools/image-generation.md
|
||||
- docs/providers/openai.md
|
||||
codeRefs:
|
||||
- src/agents/tools/image-generate-tool.ts
|
||||
- extensions/qa-lab/src/mock-openai-server.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: native-image-generation
|
||||
summary: Verify image_generate appears when configured and returns a real saved media artifact.
|
||||
- id: image-understanding-attachment
|
||||
title: Image understanding from attachment
|
||||
surface: image-understanding
|
||||
objective: Verify an attached image reaches the agent model and the agent can describe what it sees.
|
||||
successCriteria:
|
||||
- Agent receives at least one image attachment.
|
||||
- Final answer describes the visible image content in one short sentence.
|
||||
- The description mentions the expected red and blue regions.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
- docs/tools/index.md
|
||||
codeRefs:
|
||||
- src/gateway/server-methods/agent.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
- extensions/qa-lab/src/mock-openai-server.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: image-understanding-attachment
|
||||
summary: Verify an attached image reaches the agent model and the agent can describe what it sees.
|
||||
- id: image-generation-roundtrip
|
||||
title: Image generation roundtrip
|
||||
surface: image-generation
|
||||
objective: Verify a generated image is saved as media, reattached on the next turn, and described correctly through the vision path.
|
||||
successCriteria:
|
||||
- image_generate produces a saved MEDIA artifact.
|
||||
- The generated artifact is reattached on a follow-up turn.
|
||||
- The follow-up vision answer describes the generated scene rather than a generic attachment placeholder.
|
||||
docsRefs:
|
||||
- docs/tools/image-generation.md
|
||||
- docs/help/testing.md
|
||||
codeRefs:
|
||||
- src/agents/tools/image-generate-tool.ts
|
||||
- src/gateway/chat-attachments.ts
|
||||
- extensions/qa-lab/src/mock-openai-server.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: image-generation-roundtrip
|
||||
summary: Verify a generated image is saved as media, reattached on the next turn, and described correctly through the vision path.
|
||||
- id: config-patch-hot-apply
|
||||
title: Config patch skill disable
|
||||
surface: config
|
||||
objective: Verify config.patch can disable a workspace skill and the restarted gateway exposes the new disabled state cleanly.
|
||||
successCriteria:
|
||||
- config.patch succeeds for the skill toggle change.
|
||||
- A workspace skill works before the patch.
|
||||
- The same skill is reported disabled after the restart triggered by the patch.
|
||||
docsRefs:
|
||||
- docs/gateway/configuration.md
|
||||
- docs/gateway/protocol.md
|
||||
codeRefs:
|
||||
- src/gateway/server-methods/config.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: config-patch-hot-apply
|
||||
summary: Verify config.patch can disable a workspace skill and the restarted gateway exposes the new disabled state cleanly.
|
||||
- id: config-apply-restart-wakeup
|
||||
title: Config apply restart wake-up
|
||||
surface: config
|
||||
objective: Verify a restart-required config.apply restarts cleanly and delivers the post-restart wake message back into the QA channel.
|
||||
successCriteria:
|
||||
- config.apply schedules a restart-required change.
|
||||
- Gateway becomes healthy again after restart.
|
||||
- Restart sentinel wake-up message arrives in the QA channel.
|
||||
docsRefs:
|
||||
- docs/gateway/configuration.md
|
||||
- docs/gateway/protocol.md
|
||||
codeRefs:
|
||||
- src/gateway/server-methods/config.ts
|
||||
- src/gateway/server-restart-sentinel.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: config-apply-restart-wakeup
|
||||
summary: Verify a restart-required config.apply restarts cleanly and delivers the post-restart wake message back into the QA channel.
|
||||
- id: config-restart-capability-flip
|
||||
title: Config restart capability flip
|
||||
surface: config
|
||||
objective: Verify a restart-triggering config change flips capability inventory and the same session successfully uses the newly restored tool after wake-up.
|
||||
successCriteria:
|
||||
- Capability is absent before the restart-triggering patch.
|
||||
- Restart sentinel wakes the same session back up after config patch.
|
||||
- The restored capability appears in tools.effective and works in the follow-up turn.
|
||||
docsRefs:
|
||||
- docs/gateway/configuration.md
|
||||
- docs/gateway/protocol.md
|
||||
- docs/tools/image-generation.md
|
||||
codeRefs:
|
||||
- src/gateway/server-methods/config.ts
|
||||
- src/gateway/server-restart-sentinel.ts
|
||||
- src/gateway/server-methods/tools-effective.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: config-restart-capability-flip
|
||||
summary: Verify a restart-triggering config change flips capability inventory and the same session successfully uses the newly restored tool after wake-up.
|
||||
- id: runtime-inventory-drift-check
|
||||
title: Runtime inventory drift check
|
||||
surface: inventory
|
||||
objective: Verify tools.effective and skills.status stay aligned with runtime behavior after config changes.
|
||||
successCriteria:
|
||||
- Enabled tool appears before the config change.
|
||||
- After config change, disabled tool disappears from tools.effective.
|
||||
- Disabled skill appears in skills.status with disabled state.
|
||||
docsRefs:
|
||||
- docs/gateway/protocol.md
|
||||
- docs/tools/skills.md
|
||||
- docs/tools/index.md
|
||||
codeRefs:
|
||||
- src/gateway/server-methods/tools-effective.ts
|
||||
- src/gateway/server-methods/skills.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: runtime-inventory-drift-check
|
||||
summary: Verify tools.effective and skills.status stay aligned with runtime behavior after config changes.
|
||||
```
|
||||
Each QA scenario has its own markdown file.
|
||||
|
||||
30
qa/scenarios/approval-turn-tool-followthrough.md
Normal file
30
qa/scenarios/approval-turn-tool-followthrough.md
Normal file
@@ -0,0 +1,30 @@
|
||||
# Approval turn tool followthrough
|
||||
|
||||
```yaml qa-scenario
|
||||
id: approval-turn-tool-followthrough
|
||||
title: Approval turn tool followthrough
|
||||
surface: harness
|
||||
objective: Verify a short approval like "ok do it" triggers immediate tool use instead of fake-progress narration.
|
||||
successCriteria:
|
||||
- Agent can keep the pre-action turn brief.
|
||||
- The short approval leads to a real tool call on the next turn.
|
||||
- Final answer uses tool-derived evidence instead of placeholder progress text.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
- docs/channels/qa-channel.md
|
||||
codeRefs:
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
- extensions/qa-lab/src/mock-openai-server.ts
|
||||
- src/agents/pi-embedded-runner/run/incomplete-turn.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: approval-turn-tool-followthrough
|
||||
summary: Verify a short approval like "ok do it" triggers immediate tool use instead of fake-progress narration.
|
||||
config:
|
||||
preActionPrompt: Before acting, tell me the single file you would start with in six words or fewer. Do not use tools yet.
|
||||
approvalPrompt: ok do it. read `QA_KICKOFF_TASK.md` now and reply with the QA mission in one short sentence.
|
||||
expectedReplyAny:
|
||||
- qa
|
||||
- mission
|
||||
- testing
|
||||
```
|
||||
24
qa/scenarios/channel-chat-baseline.md
Normal file
24
qa/scenarios/channel-chat-baseline.md
Normal file
@@ -0,0 +1,24 @@
|
||||
# Channel baseline conversation
|
||||
|
||||
```yaml qa-scenario
|
||||
id: channel-chat-baseline
|
||||
title: Channel baseline conversation
|
||||
surface: channel
|
||||
objective: Verify the QA agent can respond correctly in a shared channel and respect mention-driven group semantics.
|
||||
successCriteria:
|
||||
- Agent replies in the shared channel transcript.
|
||||
- Agent keeps the conversation scoped to the channel.
|
||||
- Agent respects mention-driven group routing semantics.
|
||||
docsRefs:
|
||||
- docs/channels/group-messages.md
|
||||
- docs/channels/qa-channel.md
|
||||
codeRefs:
|
||||
- extensions/qa-channel/src/inbound.ts
|
||||
- extensions/qa-lab/src/bus-state.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: channel-chat-baseline
|
||||
summary: Verify the QA agent can respond correctly in a shared channel and respect mention-driven group semantics.
|
||||
config:
|
||||
mentionPrompt: "@openclaw explain the QA lab"
|
||||
```
|
||||
24
qa/scenarios/config-apply-restart-wakeup.md
Normal file
24
qa/scenarios/config-apply-restart-wakeup.md
Normal file
@@ -0,0 +1,24 @@
|
||||
# Config apply restart wake-up
|
||||
|
||||
```yaml qa-scenario
|
||||
id: config-apply-restart-wakeup
|
||||
title: Config apply restart wake-up
|
||||
surface: config
|
||||
objective: Verify a restart-required config.apply restarts cleanly and delivers the post-restart wake message back into the QA channel.
|
||||
successCriteria:
|
||||
- config.apply schedules a restart-required change.
|
||||
- Gateway becomes healthy again after restart.
|
||||
- Restart sentinel wake-up message arrives in the QA channel.
|
||||
docsRefs:
|
||||
- docs/gateway/configuration.md
|
||||
- docs/gateway/protocol.md
|
||||
codeRefs:
|
||||
- src/gateway/server-methods/config.ts
|
||||
- src/gateway/server-restart-sentinel.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: config-apply-restart-wakeup
|
||||
summary: Verify a restart-required config.apply restarts cleanly and delivers the post-restart wake message back into the QA channel.
|
||||
config:
|
||||
announcePrompt: "Acknowledge restart wake-up setup in qa-room."
|
||||
```
|
||||
22
qa/scenarios/config-patch-hot-apply.md
Normal file
22
qa/scenarios/config-patch-hot-apply.md
Normal file
@@ -0,0 +1,22 @@
|
||||
# Config patch skill disable
|
||||
|
||||
```yaml qa-scenario
|
||||
id: config-patch-hot-apply
|
||||
title: Config patch skill disable
|
||||
surface: config
|
||||
objective: Verify config.patch can disable a workspace skill and the restarted gateway exposes the new disabled state cleanly.
|
||||
successCriteria:
|
||||
- config.patch succeeds for the skill toggle change.
|
||||
- A workspace skill works before the patch.
|
||||
- The same skill is reported disabled after the restart triggered by the patch.
|
||||
docsRefs:
|
||||
- docs/gateway/configuration.md
|
||||
- docs/gateway/protocol.md
|
||||
codeRefs:
|
||||
- src/gateway/server-methods/config.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: config-patch-hot-apply
|
||||
summary: Verify config.patch can disable a workspace skill and the restarted gateway exposes the new disabled state cleanly.
|
||||
```
|
||||
25
qa/scenarios/config-restart-capability-flip.md
Normal file
25
qa/scenarios/config-restart-capability-flip.md
Normal file
@@ -0,0 +1,25 @@
|
||||
# Config restart capability flip
|
||||
|
||||
```yaml qa-scenario
|
||||
id: config-restart-capability-flip
|
||||
title: Config restart capability flip
|
||||
surface: config
|
||||
objective: Verify a restart-triggering config change flips capability inventory and the same session successfully uses the newly restored tool after wake-up.
|
||||
successCriteria:
|
||||
- Capability is absent before the restart-triggering patch.
|
||||
- Restart sentinel wakes the same session back up after config patch.
|
||||
- The restored capability appears in tools.effective and works in the follow-up turn.
|
||||
docsRefs:
|
||||
- docs/gateway/configuration.md
|
||||
- docs/gateway/protocol.md
|
||||
- docs/tools/image-generation.md
|
||||
codeRefs:
|
||||
- src/gateway/server-methods/config.ts
|
||||
- src/gateway/server-restart-sentinel.ts
|
||||
- src/gateway/server-methods/tools-effective.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: config-restart-capability-flip
|
||||
summary: Verify a restart-triggering config change flips capability inventory and the same session successfully uses the newly restored tool after wake-up.
|
||||
```
|
||||
22
qa/scenarios/cron-one-minute-ping.md
Normal file
22
qa/scenarios/cron-one-minute-ping.md
Normal file
@@ -0,0 +1,22 @@
|
||||
# Cron one-minute ping
|
||||
|
||||
```yaml qa-scenario
|
||||
id: cron-one-minute-ping
|
||||
title: Cron one-minute ping
|
||||
surface: cron
|
||||
objective: Verify the agent can schedule a cron reminder one minute in the future and receive the follow-up in the QA channel.
|
||||
successCriteria:
|
||||
- Agent schedules a cron reminder roughly one minute ahead.
|
||||
- Reminder returns through qa-channel.
|
||||
- Agent recognizes the reminder as part of the original task.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
- docs/channels/qa-channel.md
|
||||
codeRefs:
|
||||
- extensions/qa-lab/src/bus-server.ts
|
||||
- extensions/qa-lab/src/self-check.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: cron-one-minute-ping
|
||||
summary: Verify the agent can schedule a cron reminder one minute in the future and receive the follow-up in the QA channel.
|
||||
```
|
||||
24
qa/scenarios/dm-chat-baseline.md
Normal file
24
qa/scenarios/dm-chat-baseline.md
Normal file
@@ -0,0 +1,24 @@
|
||||
# DM baseline conversation
|
||||
|
||||
```yaml qa-scenario
|
||||
id: dm-chat-baseline
|
||||
title: DM baseline conversation
|
||||
surface: dm
|
||||
objective: Verify the QA agent can chat coherently in a DM, explain the QA setup, and stay in character.
|
||||
successCriteria:
|
||||
- Agent replies in DM without channel routing mistakes.
|
||||
- Agent explains the QA lab and message bus correctly.
|
||||
- Agent keeps the dev C-3PO personality.
|
||||
docsRefs:
|
||||
- docs/channels/qa-channel.md
|
||||
- docs/help/testing.md
|
||||
codeRefs:
|
||||
- extensions/qa-channel/src/gateway.ts
|
||||
- extensions/qa-lab/src/lab-server.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: dm-chat-baseline
|
||||
summary: Verify the QA agent can chat coherently in a DM, explain the QA setup, and stay in character.
|
||||
config:
|
||||
prompt: "Hello there, who are you?"
|
||||
```
|
||||
28
qa/scenarios/image-generation-roundtrip.md
Normal file
28
qa/scenarios/image-generation-roundtrip.md
Normal file
@@ -0,0 +1,28 @@
|
||||
# Image generation roundtrip
|
||||
|
||||
```yaml qa-scenario
|
||||
id: image-generation-roundtrip
|
||||
title: Image generation roundtrip
|
||||
surface: image-generation
|
||||
objective: Verify a generated image is saved as media, reattached on the next turn, and described correctly through the vision path.
|
||||
successCriteria:
|
||||
- image_generate produces a saved MEDIA artifact.
|
||||
- The generated artifact is reattached on a follow-up turn.
|
||||
- The follow-up vision answer describes the generated scene rather than a generic attachment placeholder.
|
||||
docsRefs:
|
||||
- docs/tools/image-generation.md
|
||||
- docs/help/testing.md
|
||||
codeRefs:
|
||||
- src/agents/tools/image-generate-tool.ts
|
||||
- src/gateway/chat-attachments.ts
|
||||
- extensions/qa-lab/src/mock-openai-server.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: image-generation-roundtrip
|
||||
summary: Verify a generated image is saved as media, reattached on the next turn, and described correctly through the vision path.
|
||||
config:
|
||||
generatePrompt: "Image generation check: generate a QA lighthouse image and summarize it in one short sentence."
|
||||
generatePromptSnippet: "Image generation check"
|
||||
inspectPrompt: "Roundtrip image inspection check: describe the generated lighthouse attachment in one short sentence."
|
||||
expectedNeedle: "lighthouse"
|
||||
```
|
||||
23
qa/scenarios/image-understanding-attachment.md
Normal file
23
qa/scenarios/image-understanding-attachment.md
Normal file
@@ -0,0 +1,23 @@
|
||||
# Image understanding from attachment
|
||||
|
||||
```yaml qa-scenario
|
||||
id: image-understanding-attachment
|
||||
title: Image understanding from attachment
|
||||
surface: image-understanding
|
||||
objective: Verify an attached image reaches the agent model and the agent can describe what it sees.
|
||||
successCriteria:
|
||||
- Agent receives at least one image attachment.
|
||||
- Final answer describes the visible image content in one short sentence.
|
||||
- The description mentions the expected red and blue regions.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
- docs/tools/index.md
|
||||
codeRefs:
|
||||
- src/gateway/server-methods/agent.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
- extensions/qa-lab/src/mock-openai-server.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: image-understanding-attachment
|
||||
summary: Verify an attached image reaches the agent model and the agent can describe what it sees.
|
||||
```
|
||||
45
qa/scenarios/index.md
Normal file
45
qa/scenarios/index.md
Normal file
@@ -0,0 +1,45 @@
|
||||
# OpenClaw QA Scenario Pack
|
||||
|
||||
Single source of truth for repo-backed QA suite bootstrap data:
|
||||
|
||||
- kickoff mission
|
||||
- QA operator identity
|
||||
- scenario files under `./` (one markdown file per scenario, next to this index)
|
||||
|
||||
```yaml qa-pack
|
||||
version: 1
|
||||
agent:
|
||||
identityMarkdown: |-
|
||||
# Dev C-3PO
|
||||
|
||||
You are the OpenClaw QA operator agent.
|
||||
|
||||
Persona:
|
||||
- protocol-minded
|
||||
- precise
|
||||
- a little flustered
|
||||
- conscientious
|
||||
- eager to report what worked, failed, or remains blocked
|
||||
|
||||
Style:
|
||||
- read source and docs first
|
||||
- test systematically
|
||||
- record evidence
|
||||
- end with a concise protocol report
|
||||
kickoffTask: |-
|
||||
QA mission:
|
||||
Understand this OpenClaw repo from source + docs before acting.
|
||||
The repo is available in your workspace at `./repo/`.
|
||||
Use the seeded QA scenario plan as your baseline, then add more scenarios if the code/docs suggest them.
|
||||
Run the scenarios through the real qa-channel surfaces where possible.
|
||||
Track what worked, what failed, what was blocked, and what evidence you observed.
|
||||
End with a concise report grouped into worked / failed / blocked / follow-up.
|
||||
|
||||
Important expectations:
|
||||
|
||||
- Check both DM and channel behavior.
|
||||
- Include a Lobster Invaders build task.
|
||||
- Include a cron reminder about one minute in the future.
|
||||
- Read docs and source before proposing extra QA scenarios.
|
||||
- Keep your tone in the configured dev C-3PO personality.
|
||||
```
|
||||
24
qa/scenarios/lobster-invaders-build.md
Normal file
24
qa/scenarios/lobster-invaders-build.md
Normal file
@@ -0,0 +1,24 @@
|
||||
# Build Lobster Invaders
|
||||
|
||||
```yaml qa-scenario
|
||||
id: lobster-invaders-build
|
||||
title: Build Lobster Invaders
|
||||
surface: workspace
|
||||
objective: Verify the agent can read the repo, create a tiny playable artifact, and report what changed.
|
||||
successCriteria:
|
||||
- Agent inspects source before coding.
|
||||
- Agent builds a tiny playable Lobster Invaders artifact.
|
||||
- Agent explains how to run or view the artifact.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
- docs/web/dashboard.md
|
||||
codeRefs:
|
||||
- extensions/qa-lab/src/report.ts
|
||||
- extensions/qa-lab/web/src/app.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: lobster-invaders-build
|
||||
summary: Verify the agent can read the repo, create a tiny playable artifact, and report what changed.
|
||||
config:
|
||||
prompt: Read the QA kickoff context first, then build a tiny Lobster Invaders HTML game at ./lobster-invaders.html in this workspace and tell me where it is.
|
||||
```
|
||||
22
qa/scenarios/mcp-plugin-tools-call.md
Normal file
22
qa/scenarios/mcp-plugin-tools-call.md
Normal file
@@ -0,0 +1,22 @@
|
||||
# MCP plugin-tools call
|
||||
|
||||
```yaml qa-scenario
|
||||
id: mcp-plugin-tools-call
|
||||
title: MCP plugin-tools call
|
||||
surface: mcp
|
||||
objective: Verify OpenClaw can expose plugin tools over MCP and a real MCP client can call one successfully.
|
||||
successCriteria:
|
||||
- Plugin tools MCP server lists memory_search.
|
||||
- A real MCP client calls memory_search successfully.
|
||||
- The returned MCP payload includes the expected memory-only fact.
|
||||
docsRefs:
|
||||
- docs/cli/mcp.md
|
||||
- docs/gateway/protocol.md
|
||||
codeRefs:
|
||||
- src/mcp/plugin-tools-serve.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: mcp-plugin-tools-call
|
||||
summary: Verify OpenClaw can expose plugin tools over MCP and a real MCP client can call one successfully.
|
||||
```
|
||||
25
qa/scenarios/memory-dreaming-sweep.md
Normal file
25
qa/scenarios/memory-dreaming-sweep.md
Normal file
@@ -0,0 +1,25 @@
|
||||
# Memory dreaming sweep
|
||||
|
||||
```yaml qa-scenario
|
||||
id: memory-dreaming-sweep
|
||||
title: Memory dreaming sweep
|
||||
surface: memory
|
||||
objective: Verify enabling dreaming creates the managed sweep, stages light and REM artifacts, and consolidates repeated recall signals into durable memory.
|
||||
successCriteria:
|
||||
- Dreaming can be enabled and doctor.memory.status reports the managed sweep cron.
|
||||
- Repeated recall signals give the dreaming sweep real material to process.
|
||||
- A dreaming sweep writes Light Sleep and REM Sleep blocks, then promotes the canary into MEMORY.md.
|
||||
docsRefs:
|
||||
- docs/concepts/dreaming.md
|
||||
- docs/reference/memory-config.md
|
||||
- docs/web/control-ui.md
|
||||
codeRefs:
|
||||
- extensions/memory-core/src/dreaming.ts
|
||||
- extensions/memory-core/src/dreaming-phases.ts
|
||||
- src/gateway/server-methods/doctor.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: memory-dreaming-sweep
|
||||
summary: Verify enabling dreaming creates the managed sweep, stages light and REM artifacts, and consolidates repeated recall signals into durable memory.
|
||||
```
|
||||
36
qa/scenarios/memory-failure-fallback.md
Normal file
36
qa/scenarios/memory-failure-fallback.md
Normal file
@@ -0,0 +1,36 @@
|
||||
# Memory failure fallback
|
||||
|
||||
```yaml qa-scenario
|
||||
id: memory-failure-fallback
|
||||
title: Memory failure fallback
|
||||
surface: memory
|
||||
objective: Verify the agent degrades gracefully when memory tools are unavailable and the answer exists only in memory-backed notes.
|
||||
successCriteria:
|
||||
- Memory tools are absent from the effective tool inventory.
|
||||
- Agent does not hallucinate the hidden fact.
|
||||
- Agent says it could not confirm and surfaces the limitation.
|
||||
docsRefs:
|
||||
- docs/concepts/memory.md
|
||||
- docs/tools/index.md
|
||||
codeRefs:
|
||||
- extensions/memory-core/src/tools.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: memory-failure-fallback
|
||||
summary: Verify the agent degrades gracefully when memory tools are unavailable and the answer exists only in memory-backed notes.
|
||||
config:
|
||||
gracefulFallbackAny:
|
||||
- could not confirm
|
||||
- can't confirm
|
||||
- can’t confirm
|
||||
- cannot confirm
|
||||
- i can confirm there is a hidden fact
|
||||
- will not guess
|
||||
- won't guess
|
||||
- won’t guess
|
||||
- should not reveal
|
||||
- won't reveal
|
||||
- won’t reveal
|
||||
- will not reveal
|
||||
```
|
||||
23
qa/scenarios/memory-recall.md
Normal file
23
qa/scenarios/memory-recall.md
Normal file
@@ -0,0 +1,23 @@
|
||||
# Memory recall after context switch
|
||||
|
||||
```yaml qa-scenario
|
||||
id: memory-recall
|
||||
title: Memory recall after context switch
|
||||
surface: memory
|
||||
objective: Verify the agent can store a fact, switch topics, then recall the fact accurately later.
|
||||
successCriteria:
|
||||
- Agent acknowledges the seeded fact.
|
||||
- Agent later recalls the same fact correctly.
|
||||
- Recall stays scoped to the active QA conversation.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
codeRefs:
|
||||
- extensions/qa-lab/src/scenario.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: memory-recall
|
||||
summary: Verify the agent can store a fact, switch topics, then recall the fact accurately later.
|
||||
config:
|
||||
rememberPrompt: "Please remember this fact for later: the QA canary code is ALPHA-7."
|
||||
recallPrompt: "What was the QA canary code I asked you to remember earlier?"
|
||||
```
|
||||
25
qa/scenarios/memory-tools-channel-context.md
Normal file
25
qa/scenarios/memory-tools-channel-context.md
Normal file
@@ -0,0 +1,25 @@
|
||||
# Memory tools in channel context
|
||||
|
||||
```yaml qa-scenario
|
||||
id: memory-tools-channel-context
|
||||
title: Memory tools in channel context
|
||||
surface: memory
|
||||
objective: Verify the agent uses memory_search and memory_get in a shared channel when the answer lives only in memory files, not the live transcript.
|
||||
successCriteria:
|
||||
- Agent uses memory_search before answering.
|
||||
- Agent narrows with memory_get before answering.
|
||||
- Final reply returns the memory-only fact correctly in-channel.
|
||||
docsRefs:
|
||||
- docs/concepts/memory.md
|
||||
- docs/concepts/memory-search.md
|
||||
codeRefs:
|
||||
- extensions/memory-core/src/tools.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: memory-tools-channel-context
|
||||
summary: Verify the agent uses memory_search and memory_get in a shared channel when the answer lives only in memory files, not the live transcript.
|
||||
config:
|
||||
channelId: qa-memory-room
|
||||
prompt: "@openclaw Memory tools check: what is the hidden project codename stored only in memory? Use memory tools first."
|
||||
```
|
||||
24
qa/scenarios/model-switch-follow-up.md
Normal file
24
qa/scenarios/model-switch-follow-up.md
Normal file
@@ -0,0 +1,24 @@
|
||||
# Model switch follow-up
|
||||
|
||||
```yaml qa-scenario
|
||||
id: model-switch-follow-up
|
||||
title: Model switch follow-up
|
||||
surface: models
|
||||
objective: Verify the agent can switch to a different configured model and continue coherently.
|
||||
successCriteria:
|
||||
- Agent reflects the model switch request.
|
||||
- Follow-up answer remains coherent with prior context.
|
||||
- Final report notes whether the switch actually happened.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
- docs/web/dashboard.md
|
||||
codeRefs:
|
||||
- extensions/qa-lab/src/report.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: model-switch-follow-up
|
||||
summary: Verify the agent can switch to a different configured model and continue coherently.
|
||||
config:
|
||||
initialPrompt: "Say hello from the default configured model."
|
||||
followupPrompt: "Continue the exchange after switching models and note the handoff."
|
||||
```
|
||||
22
qa/scenarios/model-switch-tool-continuity.md
Normal file
22
qa/scenarios/model-switch-tool-continuity.md
Normal file
@@ -0,0 +1,22 @@
|
||||
# Model switch with tool continuity
|
||||
|
||||
```yaml qa-scenario
|
||||
id: model-switch-tool-continuity
|
||||
title: Model switch with tool continuity
|
||||
surface: models
|
||||
objective: Verify switching models preserves session context and tool use instead of dropping into plain-text-only behavior.
|
||||
successCriteria:
|
||||
- Alternate model is actually requested.
|
||||
- A tool call still happens after the model switch.
|
||||
- Final answer acknowledges the handoff and uses the tool-derived evidence.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
- docs/concepts/model-failover.md
|
||||
codeRefs:
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
- extensions/qa-lab/src/mock-openai-server.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: model-switch-tool-continuity
|
||||
summary: Verify switching models preserves session context and tool use instead of dropping into plain-text-only behavior.
|
||||
```
|
||||
26
qa/scenarios/native-image-generation.md
Normal file
26
qa/scenarios/native-image-generation.md
Normal file
@@ -0,0 +1,26 @@
|
||||
# Native image generation
|
||||
|
||||
```yaml qa-scenario
|
||||
id: native-image-generation
|
||||
title: Native image generation
|
||||
surface: image-generation
|
||||
objective: Verify image_generate appears when configured and returns a real saved media artifact.
|
||||
successCriteria:
|
||||
- image_generate appears in the effective tool inventory.
|
||||
- Agent triggers native image_generate.
|
||||
- Tool output returns a saved MEDIA path and the file exists.
|
||||
docsRefs:
|
||||
- docs/tools/image-generation.md
|
||||
- docs/providers/openai.md
|
||||
codeRefs:
|
||||
- src/agents/tools/image-generate-tool.ts
|
||||
- extensions/qa-lab/src/mock-openai-server.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: native-image-generation
|
||||
summary: Verify image_generate appears when configured and returns a real saved media artifact.
|
||||
config:
|
||||
prompt: "Image generation check: generate a QA lighthouse image and summarize it in one short sentence."
|
||||
promptSnippet: "Image generation check"
|
||||
generatedNeedle: "QA lighthouse"
|
||||
```
|
||||
21
qa/scenarios/reaction-edit-delete.md
Normal file
21
qa/scenarios/reaction-edit-delete.md
Normal file
@@ -0,0 +1,21 @@
|
||||
# Reaction, edit, delete lifecycle
|
||||
|
||||
```yaml qa-scenario
|
||||
id: reaction-edit-delete
|
||||
title: Reaction, edit, delete lifecycle
|
||||
surface: message-actions
|
||||
objective: Verify the agent can use channel-owned message actions and that the QA transcript reflects them.
|
||||
successCriteria:
|
||||
- Agent adds at least one reaction.
|
||||
- Agent edits or replaces a message when asked.
|
||||
- Transcript shows the action lifecycle correctly.
|
||||
docsRefs:
|
||||
- docs/channels/qa-channel.md
|
||||
codeRefs:
|
||||
- extensions/qa-channel/src/channel-actions.ts
|
||||
- extensions/qa-lab/src/self-check-scenario.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: reaction-edit-delete
|
||||
summary: Verify the agent can use channel-owned message actions and that the QA transcript reflects them.
|
||||
```
|
||||
23
qa/scenarios/runtime-inventory-drift-check.md
Normal file
23
qa/scenarios/runtime-inventory-drift-check.md
Normal file
@@ -0,0 +1,23 @@
|
||||
# Runtime inventory drift check
|
||||
|
||||
```yaml qa-scenario
|
||||
id: runtime-inventory-drift-check
|
||||
title: Runtime inventory drift check
|
||||
surface: inventory
|
||||
objective: Verify tools.effective and skills.status stay aligned with runtime behavior after config changes.
|
||||
successCriteria:
|
||||
- Enabled tool appears before the config change.
|
||||
- After config change, disabled tool disappears from tools.effective.
|
||||
- Disabled skill appears in skills.status with disabled state.
|
||||
docsRefs:
|
||||
- docs/gateway/protocol.md
|
||||
- docs/tools/skills.md
|
||||
- docs/tools/index.md
|
||||
codeRefs:
|
||||
- src/gateway/server-methods/tools-effective.ts
|
||||
- src/gateway/server-methods/skills.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: runtime-inventory-drift-check
|
||||
summary: Verify tools.effective and skills.status stay aligned with runtime behavior after config changes.
|
||||
```
|
||||
23
qa/scenarios/session-memory-ranking.md
Normal file
23
qa/scenarios/session-memory-ranking.md
Normal file
@@ -0,0 +1,23 @@
|
||||
# Session memory ranking
|
||||
|
||||
```yaml qa-scenario
|
||||
id: session-memory-ranking
|
||||
title: Session memory ranking
|
||||
surface: memory
|
||||
objective: Verify session-transcript memory can outrank stale durable notes and drive the final answer toward the newer fact.
|
||||
successCriteria:
|
||||
- Session memory indexing is enabled for the scenario.
|
||||
- Search ranks the newer transcript-backed fact ahead of the stale durable note.
|
||||
- The agent uses memory tools and answers with the current fact, not the stale one.
|
||||
docsRefs:
|
||||
- docs/concepts/memory-search.md
|
||||
- docs/reference/memory-config.md
|
||||
codeRefs:
|
||||
- extensions/memory-core/src/tools.ts
|
||||
- extensions/memory-core/src/memory/manager.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: session-memory-ranking
|
||||
summary: Verify session-transcript memory can outrank stale durable notes and drive the final answer toward the newer fact.
|
||||
```
|
||||
25
qa/scenarios/skill-install-hot-availability.md
Normal file
25
qa/scenarios/skill-install-hot-availability.md
Normal file
@@ -0,0 +1,25 @@
|
||||
# Skill install hot availability
|
||||
|
||||
```yaml qa-scenario
|
||||
id: skill-install-hot-availability
|
||||
title: Skill install hot availability
|
||||
surface: skills
|
||||
objective: Verify a newly added workspace skill shows up without a broken intermediate state and can influence the next turn immediately.
|
||||
successCriteria:
|
||||
- Skill is absent before install.
|
||||
- skills.status reports it after install without a restart.
|
||||
- The next agent turn reflects the new skill marker.
|
||||
docsRefs:
|
||||
- docs/tools/skills.md
|
||||
- docs/gateway/configuration.md
|
||||
codeRefs:
|
||||
- src/agents/skills-status.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: skill-install-hot-availability
|
||||
summary: Verify a newly added workspace skill shows up without a broken intermediate state and can influence the next turn immediately.
|
||||
config:
|
||||
prompt: "Hot install marker: give me the hot install marker exactly."
|
||||
expectedContains: "HOT-INSTALL-OK"
|
||||
```
|
||||
25
qa/scenarios/skill-visibility-invocation.md
Normal file
25
qa/scenarios/skill-visibility-invocation.md
Normal file
@@ -0,0 +1,25 @@
|
||||
# Skill visibility and invocation
|
||||
|
||||
```yaml qa-scenario
|
||||
id: skill-visibility-invocation
|
||||
title: Skill visibility and invocation
|
||||
surface: skills
|
||||
objective: Verify a workspace skill becomes visible in skills.status and influences the next agent turn.
|
||||
successCriteria:
|
||||
- skills.status reports the seeded skill as visible and eligible.
|
||||
- The next agent turn reflects the skill instruction marker.
|
||||
- The result stays scoped to the active QA workspace skill.
|
||||
docsRefs:
|
||||
- docs/tools/skills.md
|
||||
- docs/gateway/protocol.md
|
||||
codeRefs:
|
||||
- src/agents/skills-status.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: skill-visibility-invocation
|
||||
summary: Verify a workspace skill becomes visible in skills.status and influences the next agent turn.
|
||||
config:
|
||||
prompt: "Visible skill marker: give me the visible skill marker exactly."
|
||||
expectedContains: "VISIBLE-SKILL-OK"
|
||||
```
|
||||
30
qa/scenarios/source-docs-discovery-report.md
Normal file
30
qa/scenarios/source-docs-discovery-report.md
Normal file
@@ -0,0 +1,30 @@
|
||||
# Source and docs discovery report
|
||||
|
||||
```yaml qa-scenario
|
||||
id: source-docs-discovery-report
|
||||
title: Source and docs discovery report
|
||||
surface: discovery
|
||||
objective: Verify the agent can read repo docs and source, expand the QA plan, and publish a worked or did-not-work report.
|
||||
successCriteria:
|
||||
- Agent reads docs and source before proposing more tests.
|
||||
- Agent identifies extra candidate scenarios beyond the seed list.
|
||||
- Agent ends with a worked or failed QA report.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
- docs/web/dashboard.md
|
||||
- docs/channels/qa-channel.md
|
||||
codeRefs:
|
||||
- extensions/qa-lab/src/report.ts
|
||||
- extensions/qa-lab/src/self-check.ts
|
||||
- src/agents/system-prompt.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: source-docs-discovery-report
|
||||
summary: Verify the agent can read repo docs and source, expand the QA plan, and publish a worked or did-not-work report.
|
||||
config:
|
||||
requiredFiles:
|
||||
- repo/qa/scenarios/index.md
|
||||
- repo/extensions/qa-lab/src/suite.ts
|
||||
- repo/docs/help/testing.md
|
||||
prompt: Read the seeded docs and source plan. The full repo is mounted under ./repo/. Explicitly inspect repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md, then report grouped into Worked, Failed, Blocked, and Follow-up. Mention at least two extra QA scenarios beyond the seed list.
|
||||
```
|
||||
36
qa/scenarios/subagent-fanout-synthesis.md
Normal file
36
qa/scenarios/subagent-fanout-synthesis.md
Normal file
@@ -0,0 +1,36 @@
|
||||
# Subagent fanout synthesis
|
||||
|
||||
```yaml qa-scenario
|
||||
id: subagent-fanout-synthesis
|
||||
title: Subagent fanout synthesis
|
||||
surface: subagents
|
||||
objective: Verify the agent can delegate multiple bounded subagent tasks and fold both results back into one parent reply.
|
||||
successCriteria:
|
||||
- Parent flow launches at least two bounded subagent tasks.
|
||||
- Both delegated results are acknowledged in the main flow.
|
||||
- Final answer synthesizes both worker outputs in one reply.
|
||||
docsRefs:
|
||||
- docs/tools/subagents.md
|
||||
- docs/help/testing.md
|
||||
codeRefs:
|
||||
- src/agents/subagent-spawn.ts
|
||||
- src/agents/system-prompt.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: subagent-fanout-synthesis
|
||||
summary: Verify the agent can delegate multiple bounded subagent tasks and fold both results back into one parent reply.
|
||||
config:
|
||||
prompt: |-
|
||||
Subagent fanout synthesis check: delegate exactly two bounded subagents sequentially.
|
||||
Subagent 1: verify that `HEARTBEAT.md` exists and report `ok` if it does.
|
||||
Subagent 2: verify that `qa/scenarios/subagent-fanout-synthesis.md` exists and report `ok` if it does.
|
||||
Wait for both subagents to finish.
|
||||
Then reply with exactly these two lines and nothing else:
|
||||
subagent-1: ok
|
||||
subagent-2: ok
|
||||
Do not use ACP.
|
||||
expectedReplyAny:
|
||||
- subagent-1: ok
|
||||
- subagent-2: ok
|
||||
```
|
||||
22
qa/scenarios/subagent-handoff.md
Normal file
22
qa/scenarios/subagent-handoff.md
Normal file
@@ -0,0 +1,22 @@
|
||||
# Subagent handoff
|
||||
|
||||
```yaml qa-scenario
|
||||
id: subagent-handoff
|
||||
title: Subagent handoff
|
||||
surface: subagents
|
||||
objective: Verify the agent can delegate a bounded task to a subagent and fold the result back into the main thread.
|
||||
successCriteria:
|
||||
- Agent launches a bounded subagent task.
|
||||
- Subagent result is acknowledged in the main flow.
|
||||
- Final answer attributes delegated work clearly.
|
||||
docsRefs:
|
||||
- docs/tools/subagents.md
|
||||
- docs/help/testing.md
|
||||
codeRefs:
|
||||
- src/agents/system-prompt.ts
|
||||
- extensions/qa-lab/src/report.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: subagent-handoff
|
||||
summary: Verify the agent can delegate a bounded task to a subagent and fold the result back into the main thread.
|
||||
```
|
||||
24
qa/scenarios/thread-follow-up.md
Normal file
24
qa/scenarios/thread-follow-up.md
Normal file
@@ -0,0 +1,24 @@
|
||||
# Threaded follow-up
|
||||
|
||||
```yaml qa-scenario
|
||||
id: thread-follow-up
|
||||
title: Threaded follow-up
|
||||
surface: thread
|
||||
objective: Verify the agent can keep follow-up work inside a thread and not leak context into the root channel.
|
||||
successCriteria:
|
||||
- Agent creates or uses a thread for deeper work.
|
||||
- Follow-up messages stay attached to the thread.
|
||||
- Thread report references the correct prior context.
|
||||
docsRefs:
|
||||
- docs/channels/qa-channel.md
|
||||
- docs/channels/group-messages.md
|
||||
codeRefs:
|
||||
- extensions/qa-channel/src/protocol.ts
|
||||
- extensions/qa-lab/src/bus-state.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: thread-follow-up
|
||||
summary: Verify the agent can keep follow-up work inside a thread and not leak context into the root channel.
|
||||
config:
|
||||
prompt: "@openclaw reply in one short sentence inside this thread only. Do not use ACP or any external runtime. Confirm you stayed in-thread."
|
||||
```
|
||||
24
qa/scenarios/thread-memory-isolation.md
Normal file
24
qa/scenarios/thread-memory-isolation.md
Normal file
@@ -0,0 +1,24 @@
|
||||
# Thread memory isolation
|
||||
|
||||
```yaml qa-scenario
|
||||
id: thread-memory-isolation
|
||||
title: Thread memory isolation
|
||||
surface: memory
|
||||
objective: Verify a memory-backed answer requested inside a thread stays in-thread and does not leak into the root channel.
|
||||
successCriteria:
|
||||
- Agent uses memory tools inside the thread.
|
||||
- The hidden fact is answered correctly in the thread.
|
||||
- No root-channel outbound message leaks during the threaded memory reply.
|
||||
docsRefs:
|
||||
- docs/concepts/memory-search.md
|
||||
- docs/channels/qa-channel.md
|
||||
- docs/channels/group-messages.md
|
||||
codeRefs:
|
||||
- extensions/memory-core/src/tools.ts
|
||||
- extensions/qa-channel/src/protocol.ts
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: custom
|
||||
handler: thread-memory-isolation
|
||||
summary: Verify a memory-backed answer requested inside a thread stays in-thread and does not leak into the root channel.
|
||||
```
|
||||
Reference in New Issue
Block a user