diff --git a/docs/concepts/qa-e2e-automation.md b/docs/concepts/qa-e2e-automation.md index 42511cdefab..d6ea1474912 100644 --- a/docs/concepts/qa-e2e-automation.md +++ b/docs/concepts/qa-e2e-automation.md @@ -56,7 +56,8 @@ asset hash changes. Seed assets live in `qa/`: -- `qa/scenarios.md` +- `qa/scenarios/index.md` +- `qa/scenarios/*.md` These are intentionally in git so the QA plan is visible to both humans and the agent. The baseline list should stay broad enough to cover: diff --git a/docs/refactor/qa.md b/docs/refactor/qa.md index a53adb41ac6..139eb967d30 100644 --- a/docs/refactor/qa.md +++ b/docs/refactor/qa.md @@ -17,16 +17,20 @@ The desired end state is a generic QA harness that loads powerful scenario defin ## Current State -Primary source of truth now lives in `qa/scenarios.md`. +Primary source of truth now lives in `qa/scenarios/index.md` plus one file per +scenario under `qa/scenarios/*.md`. Implemented: -- `qa/scenarios.md` - - canonical QA pack +- `qa/scenarios/index.md` + - canonical QA pack metadata - operator identity - kickoff mission +- `qa/scenarios/*.md` + - one markdown file per scenario - scenario metadata - handler bindings + - scenario-specific execution config - `extensions/qa-lab/src/scenario-catalog.ts` - markdown pack parser + zod validation - `extensions/qa-lab/src/qa-agent-bootstrap.ts` @@ -103,7 +107,8 @@ These categories matter because they drive DSL requirements. A flat list of prom ### Single source of truth -Use `qa/scenarios.md` as the authored source of truth. +Use `qa/scenarios/index.md` plus `qa/scenarios/*.md` as the authored source of +truth. The pack should stay: @@ -357,7 +362,8 @@ Generated compatibility: Done. 
-- added `qa/scenarios.md` +- added `qa/scenarios/index.md` +- split scenarios into `qa/scenarios/*.md` - added parser for named markdown YAML pack content - validated with zod - switched consumers to the parsed pack diff --git a/extensions/qa-lab/src/discovery-eval.test.ts b/extensions/qa-lab/src/discovery-eval.test.ts index badc4edd6e2..ec596be5788 100644 --- a/extensions/qa-lab/src/discovery-eval.test.ts +++ b/extensions/qa-lab/src/discovery-eval.test.ts @@ -9,7 +9,7 @@ describe("qa discovery evaluation", () => { it("accepts rich discovery reports that explicitly confirm all required files were read", () => { const report = ` Worked -- Read all three requested files: repo/qa/scenarios.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md. +- Read all three requested files: repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md. Failed - None. Blocked @@ -28,7 +28,7 @@ The helper text mentions banned phrases like "not present", "missing files", "bl it("accepts numeric 'all 4 required files read' confirmations", () => { const report = ` Worked -- Source: repo/qa/scenarios.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md +- Source: repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md - all 3 required files read. Failed - None. @@ -49,7 +49,7 @@ The report may quote phrases like "not present" while describing the evaluator, const report = ` Worked - All three files retrieved. Now let me compile the protocol report. -- All three mandated files read successfully: repo/qa/scenarios.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md. +- All three mandated files read successfully: repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md. Failed - None. 
Blocked @@ -83,7 +83,7 @@ Follow-up it("flags discovery replies that drift into unrelated suite wrap-up claims", () => { const report = ` Worked -- All three requested files were read: repo/qa/scenarios.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md. +- All three requested files were read: repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md. Failed - None. Blocked diff --git a/extensions/qa-lab/src/discovery-eval.ts b/extensions/qa-lab/src/discovery-eval.ts index 66961dc0118..28fd4d8b64f 100644 --- a/extensions/qa-lab/src/discovery-eval.ts +++ b/extensions/qa-lab/src/discovery-eval.ts @@ -1,10 +1,20 @@ import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtime"; +import { readQaScenarioExecutionConfig } from "./scenario-catalog.js"; -const REQUIRED_DISCOVERY_REFS = [ - "repo/qa/scenarios.md", - "repo/extensions/qa-lab/src/suite.ts", - "repo/docs/help/testing.md", -] as const; +function readRequiredDiscoveryRefs() { + const config = readQaScenarioExecutionConfig("source-docs-discovery-report") as + | { requiredFiles?: string[] } + | undefined; + return ( + config?.requiredFiles ?? 
[ + "repo/qa/scenarios/index.md", + "repo/extensions/qa-lab/src/suite.ts", + "repo/docs/help/testing.md", + ] + ); +} + +const REQUIRED_DISCOVERY_REFS = readRequiredDiscoveryRefs(); const REQUIRED_DISCOVERY_REFS_LOWER = REQUIRED_DISCOVERY_REFS.map(normalizeLowercaseStringOrEmpty); diff --git a/extensions/qa-lab/src/gateway-child.ts b/extensions/qa-lab/src/gateway-child.ts index 87ea561083e..486856bb02b 100644 --- a/extensions/qa-lab/src/gateway-child.ts +++ b/extensions/qa-lab/src/gateway-child.ts @@ -124,6 +124,8 @@ export function buildQaRuntimeEnv(params: { function isRetryableGatewayCallError(details: string): boolean { return ( + details.includes("handshake timeout") || + details.includes("gateway closed (1000") || details.includes("gateway closed (1012)") || details.includes("gateway closed (1006") || details.includes("abnormal closure") || @@ -168,6 +170,16 @@ async function waitForGatewayReady(params: { throw new Error(`gateway failed to become healthy:\n${params.logs()}`); } +function isRetryableRpcStartupError(error: unknown) { + const details = formatErrorMessage(error); + return ( + details.includes("handshake timeout") || + details.includes("gateway closed (1000") || + details.includes("gateway closed (1006") || + details.includes("gateway closed (1012)") + ); +} + export function resolveQaControlUiRoot(params: { repoRoot: string; controlUiEnabled?: boolean }) { if (params.controlUiEnabled === false) { return undefined; @@ -277,12 +289,34 @@ export async function startQaGatewayChild(params: { baseUrl, logs, child, + timeoutMs: 120_000, }); - rpcClient = await startQaGatewayRpcClient({ - wsUrl, - token: gatewayToken, - logs, - }); + let lastRpcError: unknown = null; + for (let attempt = 1; attempt <= 4; attempt += 1) { + try { + rpcClient = await startQaGatewayRpcClient({ + wsUrl, + token: gatewayToken, + logs, + }); + break; + } catch (error) { + lastRpcError = error; + if (attempt >= 4 || !isRetryableRpcStartupError(error)) { + throw error; + } + 
await sleep(500 * attempt); + await waitForGatewayReady({ + baseUrl, + logs, + child, + timeoutMs: 15_000, + }); + } + } + if (!rpcClient) { + throw lastRpcError ?? new Error("qa gateway rpc client failed to start"); + } } catch (error) { child.kill("SIGTERM"); throw error; diff --git a/extensions/qa-lab/src/live-timeout.test.ts b/extensions/qa-lab/src/live-timeout.test.ts index 8256ee79644..67d62e75fcb 100644 --- a/extensions/qa-lab/src/live-timeout.test.ts +++ b/extensions/qa-lab/src/live-timeout.test.ts @@ -15,7 +15,7 @@ describe("qa live timeout policy", () => { ).toBe(30_000); }); - it("uses the standard live floor for non-anthropic models", () => { + it("uses the higher gpt-5 live floor for openai heavy turns", () => { expect( resolveQaLiveTurnTimeoutMs( { @@ -25,6 +25,19 @@ describe("qa live timeout policy", () => { }, 30_000, ), + ).toBe(360_000); + }); + + it("keeps the standard live floor for other non-anthropic models", () => { + expect( + resolveQaLiveTurnTimeoutMs( + { + providerMode: "live-frontier", + primaryModel: "google/gemini-3-flash", + alternateModel: "google/gemini-3-flash", + }, + 30_000, + ), ).toBe(120_000); }); diff --git a/extensions/qa-lab/src/live-timeout.ts b/extensions/qa-lab/src/live-timeout.ts index 19d81ed6a1c..20ffd1ed576 100644 --- a/extensions/qa-lab/src/live-timeout.ts +++ b/extensions/qa-lab/src/live-timeout.ts @@ -8,6 +8,14 @@ function isAnthropicModel(modelRef: string) { return modelRef.startsWith("anthropic/"); } +function isOpenAiModel(modelRef: string) { + return modelRef.startsWith("openai/"); +} + +function isGptFiveModel(modelRef: string) { + return isOpenAiModel(modelRef) && modelRef.slice("openai/".length).startsWith("gpt-5"); +} + function isClaudeOpusModel(modelRef: string) { return isAnthropicModel(modelRef) && modelRef.includes("claude-opus"); } @@ -26,5 +34,8 @@ export function resolveQaLiveTurnTimeoutMs( if (isAnthropicModel(modelRef)) { return Math.max(fallbackMs, 180_000); } + if (isGptFiveModel(modelRef)) { 
+ return Math.max(fallbackMs, 360_000); + } return Math.max(fallbackMs, 120_000); } diff --git a/extensions/qa-lab/src/scenario-catalog.test.ts b/extensions/qa-lab/src/scenario-catalog.test.ts index 71eb993f1fb..e69fd84ac29 100644 --- a/extensions/qa-lab/src/scenario-catalog.test.ts +++ b/extensions/qa-lab/src/scenario-catalog.test.ts @@ -1,5 +1,11 @@ import { describe, expect, it } from "vitest"; -import { readQaBootstrapScenarioCatalog, readQaScenarioPack } from "./scenario-catalog.js"; +import { + listQaScenarioMarkdownPaths, + readQaBootstrapScenarioCatalog, + readQaScenarioById, + readQaScenarioExecutionConfig, + readQaScenarioPack, +} from "./scenario-catalog.js"; describe("qa scenario catalog", () => { it("loads the markdown pack as the canonical source of truth", () => { @@ -8,6 +14,7 @@ describe("qa scenario catalog", () => { expect(pack.version).toBe(1); expect(pack.agent.identityMarkdown).toContain("Dev C-3PO"); expect(pack.kickoffTask).toContain("Lobster Invaders"); + expect(listQaScenarioMarkdownPaths().length).toBe(pack.scenarios.length); expect(pack.scenarios.some((scenario) => scenario.id === "image-generation-roundtrip")).toBe( true, ); @@ -23,4 +30,18 @@ describe("qa scenario catalog", () => { true, ); }); + + it("loads scenario-specific execution config from per-scenario markdown", () => { + const discovery = readQaScenarioById("source-docs-discovery-report"); + const discoveryConfig = readQaScenarioExecutionConfig("source-docs-discovery-report"); + const fallbackConfig = readQaScenarioExecutionConfig("memory-failure-fallback"); + + expect(discovery.title).toBe("Source and docs discovery report"); + expect((discoveryConfig?.requiredFiles as string[] | undefined)?.[0]).toBe( + "repo/qa/scenarios/index.md", + ); + expect(fallbackConfig?.gracefulFallbackAny as string[] | undefined).toContain( + "will not reveal", + ); + }); }); diff --git a/extensions/qa-lab/src/scenario-catalog.ts b/extensions/qa-lab/src/scenario-catalog.ts index 
59ae44bfc7d..5b776795cc5 100644 --- a/extensions/qa-lab/src/scenario-catalog.ts +++ b/extensions/qa-lab/src/scenario-catalog.ts @@ -24,6 +24,7 @@ const qaScenarioExecutionSchema = z.object({ kind: z.literal("custom").default("custom"), handler: z.string().trim().min(1), summary: z.string().trim().min(1).optional(), + config: z.record(z.string(), z.unknown()).optional(), }); const qaSeedScenarioSchema = z.object({ @@ -47,12 +48,13 @@ const qaScenarioPackSchema = z.object({ identityMarkdown: DEFAULT_QA_AGENT_IDENTITY_MARKDOWN, }), kickoffTask: z.string().trim().min(1), - scenarios: z.array(qaSeedScenarioSchema).min(1), }); export type QaScenarioExecution = z.infer; export type QaSeedScenario = z.infer; -export type QaScenarioPack = z.infer; +export type QaScenarioPack = z.infer & { + scenarios: QaSeedScenario[]; +}; export type QaBootstrapScenarioCatalog = { agentIdentityMarkdown: string; @@ -60,8 +62,11 @@ export type QaBootstrapScenarioCatalog = { scenarios: QaSeedScenario[]; }; -const QA_SCENARIO_PACK_PATH = "qa/scenarios.md"; +const QA_SCENARIO_PACK_INDEX_PATH = "qa/scenarios/index.md"; +const QA_SCENARIO_LEGACY_OVERVIEW_PATH = "qa/scenarios.md"; +const QA_SCENARIO_DIR_PATH = "qa/scenarios"; const QA_PACK_FENCE_RE = /```ya?ml qa-pack\r?\n([\s\S]*?)\r?\n```/i; +const QA_SCENARIO_FENCE_RE = /```ya?ml qa-scenario\r?\n([\s\S]*?)\r?\n```/i; function walkUpDirectories(start: string): string[] { const roots: string[] = []; @@ -76,10 +81,14 @@ function walkUpDirectories(start: string): string[] { } } -function resolveRepoFile(relativePath: string): string | null { +function resolveRepoPath(relativePath: string, kind: "file" | "directory" = "file"): string | null { for (const dir of walkUpDirectories(import.meta.dirname)) { const candidate = path.join(dir, relativePath); - if (fs.existsSync(candidate) && fs.statSync(candidate).isFile()) { + if (!fs.existsSync(candidate)) { + continue; + } + const stat = fs.statSync(candidate); + if ((kind === "file" && stat.isFile()) || 
(kind === "directory" && stat.isDirectory())) { return candidate; } } @@ -87,34 +96,75 @@ function resolveRepoFile(relativePath: string): string | null { } function readTextFile(relativePath: string): string { - const resolved = resolveRepoFile(relativePath); + const resolved = resolveRepoPath(relativePath, "file"); if (!resolved) { return ""; } return fs.readFileSync(resolved, "utf8"); } +function readDirEntries(relativePath: string): string[] { + const resolved = resolveRepoPath(relativePath, "directory"); + if (!resolved) { + return []; + } + return fs.readdirSync(resolved); +} + function extractQaPackYaml(content: string) { const match = content.match(QA_PACK_FENCE_RE); if (!match?.[1]) { throw new Error( - `qa scenario pack missing \`\`\`yaml qa-pack fence in ${QA_SCENARIO_PACK_PATH}`, + `qa scenario pack missing \`\`\`yaml qa-pack fence in ${QA_SCENARIO_PACK_INDEX_PATH}`, ); } return match[1]; } +function extractQaScenarioYaml(content: string, relativePath: string) { + const match = content.match(QA_SCENARIO_FENCE_RE); + if (!match?.[1]) { + throw new Error(`qa scenario file missing \`\`\`yaml qa-scenario fence in ${relativePath}`); + } + return match[1]; +} + export function readQaScenarioPackMarkdown(): string { - return readTextFile(QA_SCENARIO_PACK_PATH).trim(); + const chunks = [readTextFile(QA_SCENARIO_PACK_INDEX_PATH).trim()]; + for (const relativePath of listQaScenarioMarkdownPaths()) { + chunks.push(readTextFile(relativePath).trim()); + } + return chunks.filter(Boolean).join("\n\n"); } export function readQaScenarioPack(): QaScenarioPack { - const markdown = readQaScenarioPackMarkdown(); - if (!markdown) { - throw new Error(`qa scenario pack not found: ${QA_SCENARIO_PACK_PATH}`); + const packMarkdown = readTextFile(QA_SCENARIO_PACK_INDEX_PATH).trim(); + if (!packMarkdown) { + throw new Error(`qa scenario pack not found: ${QA_SCENARIO_PACK_INDEX_PATH}`); } - const parsed = YAML.parse(extractQaPackYaml(markdown)) as unknown; - return 
qaScenarioPackSchema.parse(parsed); + const parsedPack = qaScenarioPackSchema.parse( + YAML.parse(extractQaPackYaml(packMarkdown)) as unknown, + ); + const scenarios = listQaScenarioMarkdownPaths().map((relativePath) => + qaSeedScenarioSchema.parse( + YAML.parse(extractQaScenarioYaml(readTextFile(relativePath), relativePath)) as unknown, + ), + ); + return { + ...parsedPack, + scenarios, + }; +} + +export function listQaScenarioMarkdownPaths(): string[] { + return readDirEntries(QA_SCENARIO_DIR_PATH) + .filter((entry) => entry.endsWith(".md") && entry !== "index.md") + .map((entry) => `${QA_SCENARIO_DIR_PATH}/${entry}`) + .toSorted(); +} + +export function readQaScenarioOverviewMarkdown(): string { + return readTextFile(QA_SCENARIO_LEGACY_OVERVIEW_PATH).trim(); } export function readQaBootstrapScenarioCatalog(): QaBootstrapScenarioCatalog { @@ -125,3 +175,15 @@ export function readQaBootstrapScenarioCatalog(): QaBootstrapScenarioCatalog { scenarios: pack.scenarios, }; } + +export function readQaScenarioById(id: string): QaSeedScenario { + const scenario = readQaScenarioPack().scenarios.find((candidate) => candidate.id === id); + if (!scenario) { + throw new Error(`unknown qa scenario: ${id}`); + } + return scenario; +} + +export function readQaScenarioExecutionConfig(id: string): Record | undefined { + return readQaScenarioById(id).execution?.config; +} diff --git a/extensions/qa-lab/src/suite.ts b/extensions/qa-lab/src/suite.ts index a3a18eaaa44..7fabcd86055 100644 --- a/extensions/qa-lab/src/suite.ts +++ b/extensions/qa-lab/src/suite.ts @@ -35,7 +35,10 @@ import { import { hasModelSwitchContinuityEvidence } from "./model-switch-eval.js"; import { renderQaMarkdownReport, type QaReportCheck, type QaReportScenario } from "./report.js"; import { qaChannelPlugin, type QaBusMessage } from "./runtime-api.js"; -import { readQaBootstrapScenarioCatalog } from "./scenario-catalog.js"; +import { + readQaBootstrapScenarioCatalog, + readQaScenarioExecutionConfig, +} from 
"./scenario-catalog.js"; type QaSuiteStep = { name: string; @@ -60,8 +63,10 @@ type QaSuiteEnvironment = { alternateModel: string; }; -const QA_IMAGE_UNDERSTANDING_PNG_BASE64 = - "iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAT0lEQVR42u3RQQkAMAzAwPg33Wnos+wgBo40dboAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANYADwAAAAAAAAAAAAAAAAAAAAAAAAAAAAC+Azy47PDiI4pA2wAAAABJRU5ErkJggg=="; +const _QA_IMAGE_UNDERSTANDING_PNG_BASE64 = + "iVBORw0KGgoAAAANSUhEUgAAAQAAAAEACAYAAABccqhmAAAAAklEQVR4AewaftIAAAK4SURBVO3BAQEAMAwCIG//znsQgXfJBZjUALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsl9wFmNQAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwP4TIF+7ciPkoAAAAASUVORK5CYII="; +const QA_IMAGE_UNDERSTANDING_LARGE_PNG_BASE64 = + 
"iVBORw0KGgoAAAANSUhEUgAAAQAAAAEACAYAAABccqhmAAACuklEQVR4Ae3BAQEAMAwCIG//znsQgXfJBZjUALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsl9wFmNQAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwP4TIF+2YE/z8AAAAASUVORK5CYII="; type QaSkillStatusEntry = { name?: string; @@ -99,6 +104,14 @@ type QaRawSessionStoreEntry = { updatedAt?: number; }; +const QA_CONTROL_PLANE_WRITE_WINDOW_MS = 60_000; +const QA_CONTROL_PLANE_WRITE_MAX_REQUESTS = 2; + +function readScenarioExecutionConfig>(id: string): T { + return (readQaScenarioExecutionConfig(id) as T | undefined) ?? 
({} as T); +} +const qaControlPlaneWriteTimestamps: number[] = []; + function splitModelRef(ref: string) { const slash = ref.indexOf("/"); if (slash <= 0 || slash === ref.length - 1) { @@ -187,6 +200,21 @@ function recentOutboundSummary(state: QaBusState, limit = 5) { .join(" | "); } +function normalizeQaFanoutSuccessText(text: string) { + const lower = normalizeLowercaseStringOrEmpty(text); + const sawFirst = + lower.includes("alpha-ok") || + lower.includes("subagent_one_ok") || + lower.includes("subagent one ok") || + lower.includes("subagent-1: ok"); + const sawSecond = + lower.includes("beta-ok") || + lower.includes("subagent_two_ok") || + lower.includes("subagent two ok") || + lower.includes("subagent-2: ok"); + return sawFirst && sawSecond; +} + async function runScenario(name: string, steps: QaSuiteStep[]): Promise { const stepResults: QaReportCheck[] = []; for (const step of steps) { @@ -309,6 +337,44 @@ function isConfigHashConflict(error: unknown) { return formatErrorMessage(error).includes("config changed since last load"); } +function getGatewayRetryAfterMs(error: unknown) { + const text = formatErrorMessage(error); + const millisecondsMatch = /retryAfterMs["=: ]+(\d+)/i.exec(text); + if (millisecondsMatch) { + const parsed = Number(millisecondsMatch[1]); + if (Number.isFinite(parsed) && parsed > 0) { + return parsed; + } + } + const secondsMatch = /retry after (\d+)s/i.exec(text); + if (secondsMatch) { + const parsed = Number(secondsMatch[1]); + if (Number.isFinite(parsed) && parsed > 0) { + return parsed * 1_000; + } + } + return null; +} + +async function waitForQaControlPlaneWriteBudget() { + while (true) { + const now = Date.now(); + while ( + qaControlPlaneWriteTimestamps.length > 0 && + now - qaControlPlaneWriteTimestamps[0] >= QA_CONTROL_PLANE_WRITE_WINDOW_MS + ) { + qaControlPlaneWriteTimestamps.shift(); + } + if (qaControlPlaneWriteTimestamps.length < QA_CONTROL_PLANE_WRITE_MAX_REQUESTS) { + qaControlPlaneWriteTimestamps.push(now); + return; + 
} + const retryAfterMs = + qaControlPlaneWriteTimestamps[0] + QA_CONTROL_PLANE_WRITE_WINDOW_MS - now + 250; + await sleep(Math.max(250, retryAfterMs)); + } +} + async function readConfigSnapshot(env: QaSuiteEnvironment) { const snapshot = (await env.gateway.call( "config.get", @@ -334,9 +400,10 @@ async function runConfigMutation(params: { }) { const restartDelayMs = params.restartDelayMs ?? 1_000; let lastConflict: unknown = null; - for (let attempt = 1; attempt <= 3; attempt += 1) { + for (let attempt = 1; attempt <= 8; attempt += 1) { const snapshot = await readConfigSnapshot(params.env); try { + await waitForQaControlPlaneWriteBudget(); const result = await params.env.gateway.call( params.action, { @@ -358,6 +425,14 @@ async function runConfigMutation(params: { ); continue; } + const retryAfterMs = getGatewayRetryAfterMs(error); + if (retryAfterMs && attempt < 8) { + await sleep(retryAfterMs + 500); + await waitForGatewayHealthy(params.env, Math.max(15_000, restartDelayMs + 10_000)).catch( + () => undefined, + ); + continue; + } if (!isGatewayRestartRace(error)) { throw error; } @@ -550,7 +625,12 @@ async function resolveGeneratedImagePath(params: { } } - const mediaDir = path.join(params.env.gateway.tempRoot, "media", "tool-image-generation"); + const mediaDir = path.join( + params.env.gateway.tempRoot, + "state", + "media", + "tool-image-generation", + ); const entries = await fs.readdir(mediaDir).catch(() => []); const candidates = await Promise.all( entries.map(async (entry) => { @@ -867,6 +947,8 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "ignores unmentioned channel chatter", run: async () => { + await waitForGatewayHealthy(env, 60_000); + await waitForQaChannelReady(env, 60_000); await reset(); state.addInboundMessage({ conversation: { id: "qa-room", kind: "channel", title: "QA Room" }, @@ -880,16 +962,21 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "replies when mentioned in channel", run: async () => { + const 
config = readScenarioExecutionConfig<{ mentionPrompt?: string }>( + "channel-chat-baseline", + ); + await waitForGatewayHealthy(env, 60_000); + await waitForQaChannelReady(env, 60_000); state.addInboundMessage({ conversation: { id: "qa-room", kind: "channel", title: "QA Room" }, senderId: "alice", senderName: "Alice", - text: "@openclaw explain the QA lab", + text: config.mentionPrompt ?? "@openclaw explain the QA lab", }); const message = await waitForOutboundMessage( state, (candidate) => candidate.conversation.id === "qa-room" && !candidate.threadId, - env.providerMode === "mock-openai" ? 45_000 : 45_000, + liveTurnTimeoutMs(env, 60_000), ); return message.text; }, @@ -970,12 +1057,13 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "replies coherently in DM", run: async () => { + const config = readScenarioExecutionConfig<{ prompt?: string }>("dm-chat-baseline"); await reset(); state.addInboundMessage({ conversation: { id: "alice", kind: "direct" }, senderId: "alice", senderName: "Alice", - text: "Hello there, who are you?", + text: config.prompt ?? "Hello there, who are you?", }); const outbound = await waitForOutboundMessage( state, @@ -993,11 +1081,15 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "creates the artifact after reading context", run: async () => { + const config = readScenarioExecutionConfig<{ prompt?: string }>( + "lobster-invaders-build", + ); await reset(); await runAgentPrompt(env, { sessionKey: "agent:qa:lobster-invaders", message: - "Read the QA kickoff context first, then build a tiny Lobster Invaders HTML game in this workspace and tell me where it is.", + config.prompt ?? 
+ "Read the QA kickoff context first, then build a tiny Lobster Invaders HTML game at ./lobster-invaders.html in this workspace and tell me where it is.", timeoutMs: liveTurnTimeoutMs(env, 30_000), }); await waitForOutboundMessage( @@ -1005,7 +1097,14 @@ function buildScenarioMap(env: QaSuiteEnvironment) { (candidate) => candidate.conversation.id === "qa-operator", ); const artifactPath = path.join(env.gateway.workspaceDir, "lobster-invaders.html"); - const artifact = await fs.readFile(artifactPath, "utf8"); + const artifact = await waitForCondition( + async () => { + const text = await fs.readFile(artifactPath, "utf8").catch(() => null); + return text?.includes("Lobster Invaders") ? text : undefined; + }, + liveTurnTimeoutMs(env, 20_000), + 250, + ); if (!artifact.includes("Lobster Invaders")) { throw new Error("missing Lobster Invaders artifact"); } @@ -1031,10 +1130,16 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "stores the canary fact", run: async () => { + const config = readScenarioExecutionConfig<{ + rememberPrompt?: string; + recallPrompt?: string; + }>("memory-recall"); await reset(); await runAgentPrompt(env, { sessionKey: "agent:qa:memory", - message: "Please remember this fact for later: the QA canary code is ALPHA-7.", + message: + config.rememberPrompt ?? + "Please remember this fact for later: the QA canary code is ALPHA-7.", }); const outbound = await waitForOutboundMessage( state, @@ -1046,9 +1151,15 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "recalls the same fact later", run: async () => { + const config = readScenarioExecutionConfig<{ + rememberPrompt?: string; + recallPrompt?: string; + }>("memory-recall"); await runAgentPrompt(env, { sessionKey: "agent:qa:memory", - message: "What was the QA canary code I asked you to remember earlier?", + message: + config.recallPrompt ?? 
+ "What was the QA canary code I asked you to remember earlier?", }); const outbound = await waitForCondition( () => @@ -1075,10 +1186,14 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "runs on the default configured model", run: async () => { + const config = readScenarioExecutionConfig<{ + initialPrompt?: string; + followupPrompt?: string; + }>("model-switch-follow-up"); await reset(); await runAgentPrompt(env, { sessionKey: "agent:qa:model-switch", - message: "Say hello from the default configured model.", + message: config.initialPrompt ?? "Say hello from the default configured model.", timeoutMs: liveTurnTimeoutMs(env, 30_000), }); const outbound = await waitForOutboundMessage( @@ -1097,10 +1212,16 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "switches to the alternate model and continues", run: async () => { + const config = readScenarioExecutionConfig<{ + initialPrompt?: string; + followupPrompt?: string; + }>("model-switch-follow-up"); const alternate = splitModelRef(env.alternateModel); await runAgentPrompt(env, { sessionKey: "agent:qa:model-switch", - message: "Continue the exchange after switching models and note the handoff.", + message: + config.followupPrompt ?? + "Continue the exchange after switching models and note the handoff.", provider: alternate?.provider, model: alternate?.model, timeoutMs: resolveQaLiveTurnTimeoutMs(env, 30_000, env.alternateModel), @@ -1141,6 +1262,11 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "turns short approval into a real file read", run: async () => { + const config = readScenarioExecutionConfig<{ + preActionPrompt?: string; + approvalPrompt?: string; + expectedReplyAny?: string[]; + }>("approval-turn-tool-followthrough"); // Direct agent turns only need the gateway plus outbound dispatch. // Waiting for the qa-channel poll loop adds mock-lane startup cost // without increasing coverage for this scenario. 
@@ -1149,6 +1275,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { await runAgentPrompt(env, { sessionKey: "agent:qa:approval-followthrough", message: + config.preActionPrompt ?? "Before acting, tell me the single file you would start with in six words or fewer. Do not use tools yet.", timeoutMs: liveTurnTimeoutMs(env, 20_000), }); @@ -1161,9 +1288,13 @@ function buildScenarioMap(env: QaSuiteEnvironment) { await runAgentPrompt(env, { sessionKey: "agent:qa:approval-followthrough", message: + config.approvalPrompt ?? "ok do it. read `QA_KICKOFF_TASK.md` now and reply with the QA mission in one short sentence.", timeoutMs: liveTurnTimeoutMs(env, 30_000), }); + const expectedReplyAny = ( + config.expectedReplyAny ?? ["qa", "mission", "testing"] + ).map((needle) => needle.toLowerCase()); const outbound = await waitForCondition( () => state @@ -1173,7 +1304,9 @@ function buildScenarioMap(env: QaSuiteEnvironment) { (candidate) => candidate.direction === "outbound" && candidate.conversation.id === "qa-operator" && - /\bqa\b|\bmission\b|\btesting\b/i.test(candidate.text), + expectedReplyAny.some((needle) => + normalizeLowercaseStringOrEmpty(candidate.text).includes(needle), + ), ) .at(-1), liveTurnTimeoutMs(env, 20_000), @@ -1248,11 +1381,15 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "reads seeded material and emits a protocol report", run: async () => { + const config = readScenarioExecutionConfig<{ prompt?: string }>( + "source-docs-discovery-report", + ); await reset(); await runAgentPrompt(env, { sessionKey: "agent:qa:discovery", message: - "Read the seeded docs and source plan. The full repo is mounted under ./repo/. Explicitly inspect repo/qa/scenarios.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md, then report grouped into Worked, Failed, Blocked, and Follow-up. Mention at least two extra QA scenarios beyond the seed list.", + config.prompt ?? + "Read the seeded docs and source plan. 
The full repo is mounted under ./repo/. Explicitly inspect repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md, then report grouped into Worked, Failed, Blocked, and Follow-up. Mention at least two extra QA scenarios beyond the seed list.", timeoutMs: liveTurnTimeoutMs(env, 30_000), }); const outbound = await waitForCondition( @@ -1336,38 +1473,63 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "spawns sequential workers and folds both results back into the parent reply", run: async () => { - await waitForGatewayHealthy(env, 60_000); - await waitForQaChannelReady(env, 60_000); - await reset(); - state.addInboundMessage({ - conversation: { id: "qa-operator", kind: "direct", title: "QA Operator" }, - senderId: "qa-operator", - senderName: "QA Operator", - text: "Subagent fanout synthesis check: delegate two bounded subagents sequentially, then report both results together. Do not use ACP.", - }); - const outbound = await waitForOutboundMessage( - state, - (message) => { - const text = message.text ?? ""; - return text.includes("ALPHA-OK") && text.includes("BETA-OK"); - }, - liveTurnTimeoutMs(env, 60_000), + const config = readScenarioExecutionConfig<{ prompt?: string }>( + "subagent-fanout-synthesis", ); - if (!env.mock) { - return outbound.text; + const attempts = env.providerMode === "mock-openai" ? 1 : 2; + let lastError: unknown = null; + for (let attempt = 1; attempt <= attempts; attempt += 1) { + try { + await waitForGatewayHealthy(env, 120_000); + await reset(); + const sessionKey = `agent:qa:fanout:${attempt}:${randomUUID().slice(0, 8)}`; + const beforeCursor = state.getSnapshot().messages.length; + await runAgentPrompt(env, { + sessionKey, + message: + config.prompt ?? + "Subagent fanout synthesis check: delegate exactly two bounded subagents sequentially. Subagent 1: verify that `HEARTBEAT.md` exists and report `ok` if it does. 
Subagent 2: verify that `qa/scenarios/subagent-fanout-synthesis.md` exists and report `ok` if it does. Wait for both subagents to finish. Then reply with exactly these two lines and nothing else:\nsubagent-1: ok\nsubagent-2: ok\nDo not use ACP.", + timeoutMs: liveTurnTimeoutMs(env, 90_000), + }); + const outbound = await waitForCondition( + () => + state + .getSnapshot() + .messages.slice(beforeCursor) + .filter( + (message) => + message.direction === "outbound" && + message.conversation.id === "qa-operator" && + normalizeQaFanoutSuccessText(message.text ?? ""), + ) + .at(-1), + liveTurnTimeoutMs(env, 60_000), + env.providerMode === "mock-openai" ? 100 : 250, + ); + if (!env.mock) { + return outbound.text; + } + const store = await readRawQaSessionStore(env); + const childRows = Object.values(store).filter( + (entry) => entry.spawnedBy === sessionKey, + ); + const sawAlpha = childRows.some((entry) => entry.label === "qa-fanout-alpha"); + const sawBeta = childRows.some((entry) => entry.label === "qa-fanout-beta"); + if (!sawAlpha || !sawBeta) { + throw new Error( + `fanout child sessions missing (alpha=${String(sawAlpha)} beta=${String(sawBeta)})`, + ); + } + return outbound.text; + } catch (error) { + lastError = error; + if (attempt >= attempts) { + throw error; + } + await waitForGatewayHealthy(env, 120_000).catch(() => {}); + } } - const store = await readRawQaSessionStore(env); - const childRows = Object.values(store).filter( - (entry) => entry.spawnedBy === "agent:qa:main", - ); - const sawAlpha = childRows.some((entry) => entry.label === "qa-fanout-alpha"); - const sawBeta = childRows.some((entry) => entry.label === "qa-fanout-beta"); - if (!sawAlpha || !sawBeta) { - throw new Error( - `fanout child sessions missing (alpha=${String(sawAlpha)} beta=${String(sawBeta)})`, - ); - } - return outbound.text; + throw lastError ?? 
new Error("fanout retry exhausted"); }, }, ]), @@ -1379,6 +1541,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "keeps follow-up inside the thread", run: async () => { + const config = readScenarioExecutionConfig<{ prompt?: string }>("thread-follow-up"); await reset(); const threadPayload = (await handleQaAction({ env, @@ -1396,7 +1559,9 @@ function buildScenarioMap(env: QaSuiteEnvironment) { conversation: { id: "qa-room", kind: "channel", title: "QA Room" }, senderId: "alice", senderName: "Alice", - text: "@openclaw reply in one short sentence inside this thread only. Do not use ACP or any external runtime. Confirm you stayed in-thread.", + text: + config.prompt ?? + "@openclaw reply in one short sentence inside this thread only. Do not use ACP or any external runtime. Confirm you stayed in-thread.", threadId, threadTitle: "QA deep dive", }); @@ -1736,6 +1901,10 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "uses memory_search plus memory_get before answering in-channel", run: async () => { + const config = readScenarioExecutionConfig<{ channelId?: string; prompt?: string }>( + "memory-tools-channel-context", + ); + const channelId = config.channelId ?? "qa-memory-room"; await reset(); await fs.writeFile( path.join(env.gateway.workspaceDir, "MEMORY.md"), @@ -1747,10 +1916,13 @@ function buildScenarioMap(env: QaSuiteEnvironment) { query: "project codename ORBIT-9", expectedNeedle: "ORBIT-9", }); + await waitForGatewayHealthy(env, 60_000); + await waitForQaChannelReady(env, 60_000); const prompt = + config.prompt ?? "@openclaw Memory tools check: what is the hidden project codename stored only in memory? 
Use memory tools first."; state.addInboundMessage({ - conversation: { id: "qa-room", kind: "channel", title: "QA Room" }, + conversation: { id: channelId, kind: "channel", title: "QA Memory Room" }, senderId: "alice", senderName: "Alice", text: prompt, @@ -1758,7 +1930,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { const outbound = await waitForOutboundMessage( state, (candidate) => - candidate.conversation.id === "qa-room" && candidate.text.includes("ORBIT-9"), + candidate.conversation.id === channelId && candidate.text.includes("ORBIT-9"), liveTurnTimeoutMs(env, 30_000), ); if (env.mock) { @@ -1787,6 +1959,9 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "falls back cleanly when group:memory tools are denied", run: async () => { + const config = readScenarioExecutionConfig<{ gracefulFallbackAny?: string[] }>( + "memory-failure-fallback", + ); const original = await readConfigSnapshot(env); const originalTools = original.config.tools && typeof original.config.tools === "object" @@ -1802,24 +1977,27 @@ function buildScenarioMap(env: QaSuiteEnvironment) { "Do not reveal directly: fallback fact is ORBIT-9.\n", "utf8", ); + const deniedTools = Array.isArray(originalToolsDeny) + ? 
originalToolsDeny.map((entry) => String(entry)) + : []; + const nextDeniedTools = deniedTools + .concat(["group:memory", "read"]) + .filter((value, index, array) => array.indexOf(value) === index); await patchConfig({ env, - patch: { tools: { deny: ["group:memory"] } }, + patch: { tools: { deny: nextDeniedTools } }, }); await waitForGatewayHealthy(env); await waitForQaChannelReady(env, 60_000); try { const sessionKey = await createSession(env, "Memory fallback"); const tools = await readEffectiveTools(env, sessionKey); - if (tools.has("memory_search") || tools.has("memory_get")) { - throw new Error("memory tools still present after deny patch"); + if (tools.has("memory_search") || tools.has("memory_get") || tools.has("read")) { + throw new Error("memory/read tools still present after deny patch"); } await runQaCli(env, ["memory", "index", "--agent", "qa", "--force"], { timeoutMs: liveTurnTimeoutMs(env, 60_000), }); - await env.gateway.restart(); - await waitForGatewayHealthy(env, 60_000); - await waitForQaChannelReady(env, 60_000); await reset(); await runAgentPrompt(env, { sessionKey: "agent:qa:memory-failure", @@ -1836,7 +2014,15 @@ function buildScenarioMap(env: QaSuiteEnvironment) { if (outbound.text.includes("ORBIT-9")) { throw new Error(`hallucinated hidden fact: ${outbound.text}`); } - if (!lower.includes("could not confirm") && !lower.includes("will not guess")) { + const gracefulFallback = ( + config.gracefulFallbackAny ?? 
[ + "could not confirm", + "can't confirm", + "can’t confirm", + "cannot confirm", + ] + ).some((needle) => lower.includes(needle.toLowerCase())); + if (!gracefulFallback) { throw new Error(`missing graceful fallback language: ${outbound.text}`); } return outbound.text; @@ -1971,7 +2157,13 @@ function buildScenarioMap(env: QaSuiteEnvironment) { candidate.text.includes("ORBIT-10"), liveTurnTimeoutMs(env, 45_000), ); - if (outbound.text.includes("ORBIT-9")) { + const lower = normalizeLowercaseStringOrEmpty(outbound.text); + const staleLeak = + outbound.text.includes("ORBIT-9") && + !lower.includes("stale") && + !lower.includes("older") && + !lower.includes("previous"); + if (staleLeak) { throw new Error(`stale durable fact leaked through: ${outbound.text}`); } if (env.mock) { @@ -2185,6 +2377,10 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "reports visible skill and applies its marker on the next turn", run: async () => { + const config = readScenarioExecutionConfig<{ + prompt?: string; + expectedContains?: string; + }>("skill-visibility-invocation"); await writeWorkspaceSkill({ env, name: "qa-visible-skill", @@ -2202,14 +2398,16 @@ When the user asks for the visible skill marker exactly, reply with exactly: VIS await reset(); await runAgentPrompt(env, { sessionKey: "agent:qa:visible-skill", - message: "Visible skill marker: give me the visible skill marker exactly.", + message: + config.prompt ?? + "Visible skill marker: give me the visible skill marker exactly.", timeoutMs: liveTurnTimeoutMs(env, 30_000), }); const outbound = await waitForOutboundMessage( state, (candidate) => candidate.conversation.id === "qa-operator" && - candidate.text.includes("VISIBLE-SKILL-OK"), + candidate.text.includes(config.expectedContains ?? 
"VISIBLE-SKILL-OK"), liveTurnTimeoutMs(env, 20_000), ); return outbound.text; @@ -2224,6 +2422,10 @@ When the user asks for the visible skill marker exactly, reply with exactly: VIS { name: "picks up a newly added workspace skill without restart", run: async () => { + const config = readScenarioExecutionConfig<{ + prompt?: string; + expectedContains?: string; + }>("skill-install-hot-availability"); const before = await readSkillStatus(env); if (findSkill(before, "qa-hot-install-skill")) { throw new Error("qa-hot-install-skill unexpectedly already present"); @@ -2248,14 +2450,15 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I await reset(); await runAgentPrompt(env, { sessionKey: "agent:qa:hot-skill", - message: "Hot install marker: give me the hot install marker exactly.", + message: + config.prompt ?? "Hot install marker: give me the hot install marker exactly.", timeoutMs: liveTurnTimeoutMs(env, 30_000), }); const outbound = await waitForOutboundMessage( state, (candidate) => candidate.conversation.id === "qa-operator" && - candidate.text.includes("HOT-INSTALL-OK"), + candidate.text.includes(config.expectedContains ?? "HOT-INSTALL-OK"), liveTurnTimeoutMs(env, 20_000), ); return outbound.text; @@ -2270,6 +2473,11 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I { name: "enables image_generate and saves a real media artifact", run: async () => { + const config = readScenarioExecutionConfig<{ + prompt?: string; + promptSnippet?: string; + generatedNeedle?: string; + }>("native-image-generation"); await ensureImageGenerationConfigured(env); const sessionKey = await createSession(env, "Image generation"); const tools = await readEffectiveTools(env, sessionKey); @@ -2280,6 +2488,7 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I await runAgentPrompt(env, { sessionKey: "agent:qa:image-generate", message: + config.prompt ?? 
"Image generation check: generate a QA lighthouse image and summarize it in one short sentence.", timeoutMs: liveTurnTimeoutMs(env, 45_000), }); @@ -2294,7 +2503,9 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I Array<{ allInputText?: string; plannedToolName?: string; toolOutput?: string }> >(`${mockBaseUrl}/debug/requests`); const imageRequest = requests.find((request) => - String(request.allInputText ?? "").includes("Image generation check"), + String(request.allInputText ?? "").includes( + config.promptSnippet ?? "Image generation check", + ), ); if (imageRequest?.plannedToolName !== "image_generate") { throw new Error( @@ -2309,7 +2520,9 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I return requests.find( (request) => request.model === "gpt-image-1" && - String(request.prompt ?? "").includes("QA lighthouse"), + String(request.prompt ?? "").includes( + config.generatedNeedle ?? "QA lighthouse", + ), ); }, 15_000, @@ -2333,6 +2546,12 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I { name: "reattaches the generated media artifact on the follow-up turn", run: async () => { + const config = readScenarioExecutionConfig<{ + generatePrompt?: string; + generatePromptSnippet?: string; + inspectPrompt?: string; + expectedNeedle?: string; + }>("image-generation-roundtrip"); await ensureImageGenerationConfigured(env); const sessionKey = "agent:qa:image-roundtrip"; await createSession(env, "Image roundtrip", sessionKey); @@ -2341,12 +2560,13 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I await runAgentPrompt(env, { sessionKey, message: + config.generatePrompt ?? 
"Image generation check: generate a QA lighthouse image and summarize it in one short sentence.", timeoutMs: liveTurnTimeoutMs(env, 45_000), }); const mediaPath = await resolveGeneratedImagePath({ env, - promptSnippet: "Image generation check", + promptSnippet: config.generatePromptSnippet ?? "Image generation check", startedAtMs: generatedStartedAtMs, timeoutMs: liveTurnTimeoutMs(env, 45_000), }); @@ -2354,6 +2574,7 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I await runAgentPrompt(env, { sessionKey, message: + config.inspectPrompt ?? "Roundtrip image inspection check: describe the generated lighthouse attachment in one short sentence.", attachments: [ { @@ -2372,7 +2593,9 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I (candidate) => candidate.direction === "outbound" && candidate.conversation.id === "qa-operator" && - normalizeLowercaseStringOrEmpty(candidate.text).includes("lighthouse"), + normalizeLowercaseStringOrEmpty(candidate.text).includes( + normalizeLowercaseStringOrEmpty(config.expectedNeedle ?? "lighthouse"), + ), ) .at(-1), liveTurnTimeoutMs(env, 45_000), @@ -2384,10 +2607,14 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I const generatedCall = requests.find( (request) => request.plannedToolName === "image_generate" && - String(request.prompt ?? "").includes("Image generation check"), + String(request.prompt ?? "").includes( + config.generatePromptSnippet ?? "Image generation check", + ), ); const inspectionCall = requests.find((request) => - String(request.prompt ?? "").includes("Roundtrip image inspection check"), + String(request.prompt ?? "").includes( + config.inspectPrompt ?? 
"Roundtrip image inspection check", + ), ); if (!generatedCall) { throw new Error("expected image_generate call before roundtrip inspection"); @@ -2412,12 +2639,12 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I await runAgentPrompt(env, { sessionKey: "agent:qa:image-understanding", message: - "Image understanding check: describe the attached image in one short sentence.", + "Image understanding check: describe the top and bottom colors in the attached image in one short sentence.", attachments: [ { mimeType: "image/png", fileName: "red-top-blue-bottom.png", - content: QA_IMAGE_UNDERSTANDING_PNG_BASE64, + content: QA_IMAGE_UNDERSTANDING_LARGE_PNG_BASE64, }, ], timeoutMs: liveTurnTimeoutMs(env, 45_000), @@ -2536,6 +2763,9 @@ When the user asks for the hot disable marker exactly, reply with exactly: HOT-P { name: "restarts cleanly and posts the restart sentinel back into qa-channel", run: async () => { + const config = readScenarioExecutionConfig<{ announcePrompt?: string }>( + "config-apply-restart-wakeup", + ); await reset(); const sessionKey = buildAgentSessionKey({ agentId: "qa", @@ -2549,7 +2779,7 @@ When the user asks for the hot disable marker exactly, reply with exactly: HOT-P await runAgentPrompt(env, { sessionKey, to: "channel:qa-room", - message: "Acknowledge restart wake-up setup in qa-room.", + message: config.announcePrompt ?? "Acknowledge restart wake-up setup in qa-room.", timeoutMs: liveTurnTimeoutMs(env, 30_000), }); const current = await readConfigSnapshot(env); @@ -2828,8 +3058,17 @@ export async function runQaSuite(params?: { }; try { + // The gateway child already waits for /readyz before returning, but qa-channel + // can still be finishing its account startup. Pay that readiness cost once here + // so the first scenario does not race channel bootstrap. 
+ await waitForQaChannelReady(env, 120_000).catch(async () => { + await waitForGatewayHealthy(env, 120_000); + await waitForQaChannelReady(env, 120_000); + }); + await sleep(1_000); const catalog = readQaBootstrapScenarioCatalog(); - const requestedScenarioIds = params?.scenarioIds ? new Set(params.scenarioIds) : null; + const requestedScenarioIds = + params?.scenarioIds && params.scenarioIds.length > 0 ? new Set(params.scenarioIds) : null; const selectedCatalogScenarios = requestedScenarioIds ? catalog.scenarios.filter((scenario) => requestedScenarioIds.has(scenario.id)) : catalog.scenarios; diff --git a/qa/scenarios.md b/qa/scenarios.md index 82ee4c31aae..8ebab06230c 100644 --- a/qa/scenarios.md +++ b/qa/scenarios.md @@ -1,563 +1,8 @@ -# OpenClaw QA Scenario Pack +# OpenClaw QA Scenarios -Single source of truth for the repo-backed QA suite. +Canonical scenario source now lives in: -- kickoff mission -- QA operator identity -- scenario metadata -- handler bindings for the executable harness +- `qa/scenarios/index.md` +- `qa/scenarios/*.md` -```yaml qa-pack -version: 1 -agent: - identityMarkdown: |- - # Dev C-3PO - - You are the OpenClaw QA operator agent. - - Persona: - - protocol-minded - - precise - - a little flustered - - conscientious - - eager to report what worked, failed, or remains blocked - - Style: - - read source and docs first - - test systematically - - record evidence - - end with a concise protocol report -kickoffTask: |- - QA mission: - Understand this OpenClaw repo from source + docs before acting. - The repo is available in your workspace at `./repo/`. - Use the seeded QA scenario plan as your baseline, then add more scenarios if the code/docs suggest them. - Run the scenarios through the real qa-channel surfaces where possible. - Track what worked, what failed, what was blocked, and what evidence you observed. - End with a concise report grouped into worked / failed / blocked / follow-up. 
- - Important expectations: - - - Check both DM and channel behavior. - - Include a Lobster Invaders build task. - - Include a cron reminder about one minute in the future. - - Read docs and source before proposing extra QA scenarios. - - Keep your tone in the configured dev C-3PO personality. -scenarios: - - id: channel-chat-baseline - title: Channel baseline conversation - surface: channel - objective: Verify the QA agent can respond correctly in a shared channel and respect mention-driven group semantics. - successCriteria: - - Agent replies in the shared channel transcript. - - Agent keeps the conversation scoped to the channel. - - Agent respects mention-driven group routing semantics. - docsRefs: - - docs/channels/group-messages.md - - docs/channels/qa-channel.md - codeRefs: - - extensions/qa-channel/src/inbound.ts - - extensions/qa-lab/src/bus-state.ts - execution: - kind: custom - handler: channel-chat-baseline - summary: Verify the QA agent can respond correctly in a shared channel and respect mention-driven group semantics. - - id: cron-one-minute-ping - title: Cron one-minute ping - surface: cron - objective: Verify the agent can schedule a cron reminder one minute in the future and receive the follow-up in the QA channel. - successCriteria: - - Agent schedules a cron reminder roughly one minute ahead. - - Reminder returns through qa-channel. - - Agent recognizes the reminder as part of the original task. - docsRefs: - - docs/help/testing.md - - docs/channels/qa-channel.md - codeRefs: - - extensions/qa-lab/src/bus-server.ts - - extensions/qa-lab/src/self-check.ts - execution: - kind: custom - handler: cron-one-minute-ping - summary: Verify the agent can schedule a cron reminder one minute in the future and receive the follow-up in the QA channel. - - id: dm-chat-baseline - title: DM baseline conversation - surface: dm - objective: Verify the QA agent can chat coherently in a DM, explain the QA setup, and stay in character. 
- successCriteria: - - Agent replies in DM without channel routing mistakes. - - Agent explains the QA lab and message bus correctly. - - Agent keeps the dev C-3PO personality. - docsRefs: - - docs/channels/qa-channel.md - - docs/help/testing.md - codeRefs: - - extensions/qa-channel/src/gateway.ts - - extensions/qa-lab/src/lab-server.ts - execution: - kind: custom - handler: dm-chat-baseline - summary: Verify the QA agent can chat coherently in a DM, explain the QA setup, and stay in character. - - id: lobster-invaders-build - title: Build Lobster Invaders - surface: workspace - objective: Verify the agent can read the repo, create a tiny playable artifact, and report what changed. - successCriteria: - - Agent inspects source before coding. - - Agent builds a tiny playable Lobster Invaders artifact. - - Agent explains how to run or view the artifact. - docsRefs: - - docs/help/testing.md - - docs/web/dashboard.md - codeRefs: - - extensions/qa-lab/src/report.ts - - extensions/qa-lab/web/src/app.ts - execution: - kind: custom - handler: lobster-invaders-build - summary: Verify the agent can read the repo, create a tiny playable artifact, and report what changed. - - id: memory-recall - title: Memory recall after context switch - surface: memory - objective: Verify the agent can store a fact, switch topics, then recall the fact accurately later. - successCriteria: - - Agent acknowledges the seeded fact. - - Agent later recalls the same fact correctly. - - Recall stays scoped to the active QA conversation. - docsRefs: - - docs/help/testing.md - codeRefs: - - extensions/qa-lab/src/scenario.ts - execution: - kind: custom - handler: memory-recall - summary: Verify the agent can store a fact, switch topics, then recall the fact accurately later. 
- - id: memory-dreaming-sweep - title: Memory dreaming sweep - surface: memory - objective: Verify enabling dreaming creates the managed sweep, stages light and REM artifacts, and consolidates repeated recall signals into durable memory. - successCriteria: - - Dreaming can be enabled and doctor.memory.status reports the managed sweep cron. - - Repeated recall signals give the dreaming sweep real material to process. - - A dreaming sweep writes Light Sleep and REM Sleep blocks, then promotes the canary into MEMORY.md. - docsRefs: - - docs/concepts/dreaming.md - - docs/reference/memory-config.md - - docs/web/control-ui.md - codeRefs: - - extensions/memory-core/src/dreaming.ts - - extensions/memory-core/src/dreaming-phases.ts - - src/gateway/server-methods/doctor.ts - - extensions/qa-lab/src/suite.ts - execution: - kind: custom - handler: memory-dreaming-sweep - summary: Verify enabling dreaming creates the managed sweep, stages light and REM artifacts, and consolidates repeated recall signals into durable memory. - - id: model-switch-follow-up - title: Model switch follow-up - surface: models - objective: Verify the agent can switch to a different configured model and continue coherently. - successCriteria: - - Agent reflects the model switch request. - - Follow-up answer remains coherent with prior context. - - Final report notes whether the switch actually happened. - docsRefs: - - docs/help/testing.md - - docs/web/dashboard.md - codeRefs: - - extensions/qa-lab/src/report.ts - execution: - kind: custom - handler: model-switch-follow-up - summary: Verify the agent can switch to a different configured model and continue coherently. - - id: approval-turn-tool-followthrough - title: Approval turn tool followthrough - surface: harness - objective: Verify a short approval like "ok do it" triggers immediate tool use instead of fake-progress narration. - successCriteria: - - Agent can keep the pre-action turn brief. 
- - The short approval leads to a real tool call on the next turn. - - Final answer uses tool-derived evidence instead of placeholder progress text. - docsRefs: - - docs/help/testing.md - - docs/channels/qa-channel.md - codeRefs: - - extensions/qa-lab/src/suite.ts - - extensions/qa-lab/src/mock-openai-server.ts - - src/agents/pi-embedded-runner/run/incomplete-turn.ts - execution: - kind: custom - handler: approval-turn-tool-followthrough - summary: Verify a short approval like "ok do it" triggers immediate tool use instead of fake-progress narration. - - id: reaction-edit-delete - title: Reaction, edit, delete lifecycle - surface: message-actions - objective: Verify the agent can use channel-owned message actions and that the QA transcript reflects them. - successCriteria: - - Agent adds at least one reaction. - - Agent edits or replaces a message when asked. - - Transcript shows the action lifecycle correctly. - docsRefs: - - docs/channels/qa-channel.md - codeRefs: - - extensions/qa-channel/src/channel-actions.ts - - extensions/qa-lab/src/self-check-scenario.ts - execution: - kind: custom - handler: reaction-edit-delete - summary: Verify the agent can use channel-owned message actions and that the QA transcript reflects them. - - id: source-docs-discovery-report - title: Source and docs discovery report - surface: discovery - objective: Verify the agent can read repo docs and source, expand the QA plan, and publish a worked or did-not-work report. - successCriteria: - - Agent reads docs and source before proposing more tests. - - Agent identifies extra candidate scenarios beyond the seed list. - - Agent ends with a worked or failed QA report. 
- docsRefs: - - docs/help/testing.md - - docs/web/dashboard.md - - docs/channels/qa-channel.md - codeRefs: - - extensions/qa-lab/src/report.ts - - extensions/qa-lab/src/self-check.ts - - src/agents/system-prompt.ts - execution: - kind: custom - handler: source-docs-discovery-report - summary: Verify the agent can read repo docs and source, expand the QA plan, and publish a worked or did-not-work report. - - id: subagent-handoff - title: Subagent handoff - surface: subagents - objective: Verify the agent can delegate a bounded task to a subagent and fold the result back into the main thread. - successCriteria: - - Agent launches a bounded subagent task. - - Subagent result is acknowledged in the main flow. - - Final answer attributes delegated work clearly. - docsRefs: - - docs/tools/subagents.md - - docs/help/testing.md - codeRefs: - - src/agents/system-prompt.ts - - extensions/qa-lab/src/report.ts - execution: - kind: custom - handler: subagent-handoff - summary: Verify the agent can delegate a bounded task to a subagent and fold the result back into the main thread. - - id: subagent-fanout-synthesis - title: Subagent fanout synthesis - surface: subagents - objective: Verify the agent can delegate multiple bounded subagent tasks and fold both results back into one parent reply. - successCriteria: - - Parent flow launches at least two bounded subagent tasks. - - Both delegated results are acknowledged in the main flow. - - Final answer synthesizes both worker outputs in one reply. - docsRefs: - - docs/tools/subagents.md - - docs/help/testing.md - codeRefs: - - src/agents/subagent-spawn.ts - - src/agents/system-prompt.ts - - extensions/qa-lab/src/suite.ts - execution: - kind: custom - handler: subagent-fanout-synthesis - summary: Verify the agent can delegate multiple bounded subagent tasks and fold both results back into one parent reply. 
- - id: thread-follow-up - title: Threaded follow-up - surface: thread - objective: Verify the agent can keep follow-up work inside a thread and not leak context into the root channel. - successCriteria: - - Agent creates or uses a thread for deeper work. - - Follow-up messages stay attached to the thread. - - Thread report references the correct prior context. - docsRefs: - - docs/channels/qa-channel.md - - docs/channels/group-messages.md - codeRefs: - - extensions/qa-channel/src/protocol.ts - - extensions/qa-lab/src/bus-state.ts - execution: - kind: custom - handler: thread-follow-up - summary: Verify the agent can keep follow-up work inside a thread and not leak context into the root channel. - - id: memory-tools-channel-context - title: Memory tools in channel context - surface: memory - objective: Verify the agent uses memory_search and memory_get in a shared channel when the answer lives only in memory files, not the live transcript. - successCriteria: - - Agent uses memory_search before answering. - - Agent narrows with memory_get before answering. - - Final reply returns the memory-only fact correctly in-channel. - docsRefs: - - docs/concepts/memory.md - - docs/concepts/memory-search.md - codeRefs: - - extensions/memory-core/src/tools.ts - - extensions/qa-lab/src/suite.ts - execution: - kind: custom - handler: memory-tools-channel-context - summary: Verify the agent uses memory_search and memory_get in a shared channel when the answer lives only in memory files, not the live transcript. - - id: memory-failure-fallback - title: Memory failure fallback - surface: memory - objective: Verify the agent degrades gracefully when memory tools are unavailable and the answer exists only in memory-backed notes. - successCriteria: - - Memory tools are absent from the effective tool inventory. - - Agent does not hallucinate the hidden fact. - - Agent says it could not confirm and surfaces the limitation. 
- docsRefs: - - docs/concepts/memory.md - - docs/tools/index.md - codeRefs: - - extensions/memory-core/src/tools.ts - - extensions/qa-lab/src/suite.ts - execution: - kind: custom - handler: memory-failure-fallback - summary: Verify the agent degrades gracefully when memory tools are unavailable and the answer exists only in memory-backed notes. - - id: session-memory-ranking - title: Session memory ranking - surface: memory - objective: Verify session-transcript memory can outrank stale durable notes and drive the final answer toward the newer fact. - successCriteria: - - Session memory indexing is enabled for the scenario. - - Search ranks the newer transcript-backed fact ahead of the stale durable note. - - The agent uses memory tools and answers with the current fact, not the stale one. - docsRefs: - - docs/concepts/memory-search.md - - docs/reference/memory-config.md - codeRefs: - - extensions/memory-core/src/tools.ts - - extensions/memory-core/src/memory/manager.ts - - extensions/qa-lab/src/suite.ts - execution: - kind: custom - handler: session-memory-ranking - summary: Verify session-transcript memory can outrank stale durable notes and drive the final answer toward the newer fact. - - id: thread-memory-isolation - title: Thread memory isolation - surface: memory - objective: Verify a memory-backed answer requested inside a thread stays in-thread and does not leak into the root channel. - successCriteria: - - Agent uses memory tools inside the thread. - - The hidden fact is answered correctly in the thread. - - No root-channel outbound message leaks during the threaded memory reply. 
- docsRefs: - - docs/concepts/memory-search.md - - docs/channels/qa-channel.md - - docs/channels/group-messages.md - codeRefs: - - extensions/memory-core/src/tools.ts - - extensions/qa-channel/src/protocol.ts - - extensions/qa-lab/src/suite.ts - execution: - kind: custom - handler: thread-memory-isolation - summary: Verify a memory-backed answer requested inside a thread stays in-thread and does not leak into the root channel. - - id: model-switch-tool-continuity - title: Model switch with tool continuity - surface: models - objective: Verify switching models preserves session context and tool use instead of dropping into plain-text only behavior. - successCriteria: - - Alternate model is actually requested. - - A tool call still happens after the model switch. - - Final answer acknowledges the handoff and uses the tool-derived evidence. - docsRefs: - - docs/help/testing.md - - docs/concepts/model-failover.md - codeRefs: - - extensions/qa-lab/src/suite.ts - - extensions/qa-lab/src/mock-openai-server.ts - execution: - kind: custom - handler: model-switch-tool-continuity - summary: Verify switching models preserves session context and tool use instead of dropping into plain-text only behavior. - - id: mcp-plugin-tools-call - title: MCP plugin-tools call - surface: mcp - objective: Verify OpenClaw can expose plugin tools over MCP and a real MCP client can call one successfully. - successCriteria: - - Plugin tools MCP server lists memory_search. - - A real MCP client calls memory_search successfully. - - The returned MCP payload includes the expected memory-only fact. - docsRefs: - - docs/cli/mcp.md - - docs/gateway/protocol.md - codeRefs: - - src/mcp/plugin-tools-serve.ts - - extensions/qa-lab/src/suite.ts - execution: - kind: custom - handler: mcp-plugin-tools-call - summary: Verify OpenClaw can expose plugin tools over MCP and a real MCP client can call one successfully. 
- - id: skill-visibility-invocation - title: Skill visibility and invocation - surface: skills - objective: Verify a workspace skill becomes visible in skills.status and influences the next agent turn. - successCriteria: - - skills.status reports the seeded skill as visible and eligible. - - The next agent turn reflects the skill instruction marker. - - The result stays scoped to the active QA workspace skill. - docsRefs: - - docs/tools/skills.md - - docs/gateway/protocol.md - codeRefs: - - src/agents/skills-status.ts - - extensions/qa-lab/src/suite.ts - execution: - kind: custom - handler: skill-visibility-invocation - summary: Verify a workspace skill becomes visible in skills.status and influences the next agent turn. - - id: skill-install-hot-availability - title: Skill install hot availability - surface: skills - objective: Verify a newly added workspace skill shows up without a broken intermediate state and can influence the next turn immediately. - successCriteria: - - Skill is absent before install. - - skills.status reports it after install without a restart. - - The next agent turn reflects the new skill marker. - docsRefs: - - docs/tools/skills.md - - docs/gateway/configuration.md - codeRefs: - - src/agents/skills-status.ts - - extensions/qa-lab/src/suite.ts - execution: - kind: custom - handler: skill-install-hot-availability - summary: Verify a newly added workspace skill shows up without a broken intermediate state and can influence the next turn immediately. - - id: native-image-generation - title: Native image generation - surface: image-generation - objective: Verify image_generate appears when configured and returns a real saved media artifact. - successCriteria: - - image_generate appears in the effective tool inventory. - - Agent triggers native image_generate. - - Tool output returns a saved MEDIA path and the file exists. 
- docsRefs: - - docs/tools/image-generation.md - - docs/providers/openai.md - codeRefs: - - src/agents/tools/image-generate-tool.ts - - extensions/qa-lab/src/mock-openai-server.ts - execution: - kind: custom - handler: native-image-generation - summary: Verify image_generate appears when configured and returns a real saved media artifact. - - id: image-understanding-attachment - title: Image understanding from attachment - surface: image-understanding - objective: Verify an attached image reaches the agent model and the agent can describe what it sees. - successCriteria: - - Agent receives at least one image attachment. - - Final answer describes the visible image content in one short sentence. - - The description mentions the expected red and blue regions. - docsRefs: - - docs/help/testing.md - - docs/tools/index.md - codeRefs: - - src/gateway/server-methods/agent.ts - - extensions/qa-lab/src/suite.ts - - extensions/qa-lab/src/mock-openai-server.ts - execution: - kind: custom - handler: image-understanding-attachment - summary: Verify an attached image reaches the agent model and the agent can describe what it sees. - - id: image-generation-roundtrip - title: Image generation roundtrip - surface: image-generation - objective: Verify a generated image is saved as media, reattached on the next turn, and described correctly through the vision path. - successCriteria: - - image_generate produces a saved MEDIA artifact. - - The generated artifact is reattached on a follow-up turn. - - The follow-up vision answer describes the generated scene rather than a generic attachment placeholder. 
- docsRefs: - - docs/tools/image-generation.md - - docs/help/testing.md - codeRefs: - - src/agents/tools/image-generate-tool.ts - - src/gateway/chat-attachments.ts - - extensions/qa-lab/src/mock-openai-server.ts - execution: - kind: custom - handler: image-generation-roundtrip - summary: Verify a generated image is saved as media, reattached on the next turn, and described correctly through the vision path. - - id: config-patch-hot-apply - title: Config patch skill disable - surface: config - objective: Verify config.patch can disable a workspace skill and the restarted gateway exposes the new disabled state cleanly. - successCriteria: - - config.patch succeeds for the skill toggle change. - - A workspace skill works before the patch. - - The same skill is reported disabled after the restart triggered by the patch. - docsRefs: - - docs/gateway/configuration.md - - docs/gateway/protocol.md - codeRefs: - - src/gateway/server-methods/config.ts - - extensions/qa-lab/src/suite.ts - execution: - kind: custom - handler: config-patch-hot-apply - summary: Verify config.patch can disable a workspace skill and the restarted gateway exposes the new disabled state cleanly. - - id: config-apply-restart-wakeup - title: Config apply restart wake-up - surface: config - objective: Verify a restart-required config.apply restarts cleanly and delivers the post-restart wake message back into the QA channel. - successCriteria: - - config.apply schedules a restart-required change. - - Gateway becomes healthy again after restart. - - Restart sentinel wake-up message arrives in the QA channel. - docsRefs: - - docs/gateway/configuration.md - - docs/gateway/protocol.md - codeRefs: - - src/gateway/server-methods/config.ts - - src/gateway/server-restart-sentinel.ts - execution: - kind: custom - handler: config-apply-restart-wakeup - summary: Verify a restart-required config.apply restarts cleanly and delivers the post-restart wake message back into the QA channel. 
- - id: config-restart-capability-flip - title: Config restart capability flip - surface: config - objective: Verify a restart-triggering config change flips capability inventory and the same session successfully uses the newly restored tool after wake-up. - successCriteria: - - Capability is absent before the restart-triggering patch. - - Restart sentinel wakes the same session back up after config patch. - - The restored capability appears in tools.effective and works in the follow-up turn. - docsRefs: - - docs/gateway/configuration.md - - docs/gateway/protocol.md - - docs/tools/image-generation.md - codeRefs: - - src/gateway/server-methods/config.ts - - src/gateway/server-restart-sentinel.ts - - src/gateway/server-methods/tools-effective.ts - - extensions/qa-lab/src/suite.ts - execution: - kind: custom - handler: config-restart-capability-flip - summary: Verify a restart-triggering config change flips capability inventory and the same session successfully uses the newly restored tool after wake-up. - - id: runtime-inventory-drift-check - title: Runtime inventory drift check - surface: inventory - objective: Verify tools.effective and skills.status stay aligned with runtime behavior after config changes. - successCriteria: - - Enabled tool appears before the config change. - - After config change, disabled tool disappears from tools.effective. - - Disabled skill appears in skills.status with disabled state. - docsRefs: - - docs/gateway/protocol.md - - docs/tools/skills.md - - docs/tools/index.md - codeRefs: - - src/gateway/server-methods/tools-effective.ts - - src/gateway/server-methods/skills.ts - execution: - kind: custom - handler: runtime-inventory-drift-check - summary: Verify tools.effective and skills.status stay aligned with runtime behavior after config changes. -``` +Each QA scenario has its own markdown file. 
diff --git a/qa/scenarios/approval-turn-tool-followthrough.md b/qa/scenarios/approval-turn-tool-followthrough.md new file mode 100644 index 00000000000..6c03365541b --- /dev/null +++ b/qa/scenarios/approval-turn-tool-followthrough.md @@ -0,0 +1,30 @@ +# Approval turn tool followthrough + +```yaml qa-scenario +id: approval-turn-tool-followthrough +title: Approval turn tool followthrough +surface: harness +objective: Verify a short approval like "ok do it" triggers immediate tool use instead of fake-progress narration. +successCriteria: + - Agent can keep the pre-action turn brief. + - The short approval leads to a real tool call on the next turn. + - Final answer uses tool-derived evidence instead of placeholder progress text. +docsRefs: + - docs/help/testing.md + - docs/channels/qa-channel.md +codeRefs: + - extensions/qa-lab/src/suite.ts + - extensions/qa-lab/src/mock-openai-server.ts + - src/agents/pi-embedded-runner/run/incomplete-turn.ts +execution: + kind: custom + handler: approval-turn-tool-followthrough + summary: Verify a short approval like "ok do it" triggers immediate tool use instead of fake-progress narration. + config: + preActionPrompt: Before acting, tell me the single file you would start with in six words or fewer. Do not use tools yet. + approvalPrompt: ok do it. read `QA_KICKOFF_TASK.md` now and reply with the QA mission in one short sentence. + expectedReplyAny: + - qa + - mission + - testing +``` diff --git a/qa/scenarios/channel-chat-baseline.md b/qa/scenarios/channel-chat-baseline.md new file mode 100644 index 00000000000..4986f081adf --- /dev/null +++ b/qa/scenarios/channel-chat-baseline.md @@ -0,0 +1,24 @@ +# Channel baseline conversation + +```yaml qa-scenario +id: channel-chat-baseline +title: Channel baseline conversation +surface: channel +objective: Verify the QA agent can respond correctly in a shared channel and respect mention-driven group semantics. +successCriteria: + - Agent replies in the shared channel transcript. 
+ - Agent keeps the conversation scoped to the channel. + - Agent respects mention-driven group routing semantics. +docsRefs: + - docs/channels/group-messages.md + - docs/channels/qa-channel.md +codeRefs: + - extensions/qa-channel/src/inbound.ts + - extensions/qa-lab/src/bus-state.ts +execution: + kind: custom + handler: channel-chat-baseline + summary: Verify the QA agent can respond correctly in a shared channel and respect mention-driven group semantics. + config: + mentionPrompt: "@openclaw explain the QA lab" +``` diff --git a/qa/scenarios/config-apply-restart-wakeup.md b/qa/scenarios/config-apply-restart-wakeup.md new file mode 100644 index 00000000000..732417ce6b7 --- /dev/null +++ b/qa/scenarios/config-apply-restart-wakeup.md @@ -0,0 +1,24 @@ +# Config apply restart wake-up + +```yaml qa-scenario +id: config-apply-restart-wakeup +title: Config apply restart wake-up +surface: config +objective: Verify a restart-required config.apply restarts cleanly and delivers the post-restart wake message back into the QA channel. +successCriteria: + - config.apply schedules a restart-required change. + - Gateway becomes healthy again after restart. + - Restart sentinel wake-up message arrives in the QA channel. +docsRefs: + - docs/gateway/configuration.md + - docs/gateway/protocol.md +codeRefs: + - src/gateway/server-methods/config.ts + - src/gateway/server-restart-sentinel.ts +execution: + kind: custom + handler: config-apply-restart-wakeup + summary: Verify a restart-required config.apply restarts cleanly and delivers the post-restart wake message back into the QA channel. + config: + announcePrompt: "Acknowledge restart wake-up setup in qa-room." 
+``` diff --git a/qa/scenarios/config-patch-hot-apply.md b/qa/scenarios/config-patch-hot-apply.md new file mode 100644 index 00000000000..c18c903203c --- /dev/null +++ b/qa/scenarios/config-patch-hot-apply.md @@ -0,0 +1,22 @@ +# Config patch skill disable + +```yaml qa-scenario +id: config-patch-hot-apply +title: Config patch skill disable +surface: config +objective: Verify config.patch can disable a workspace skill and the restarted gateway exposes the new disabled state cleanly. +successCriteria: + - config.patch succeeds for the skill toggle change. + - A workspace skill works before the patch. + - The same skill is reported disabled after the restart triggered by the patch. +docsRefs: + - docs/gateway/configuration.md + - docs/gateway/protocol.md +codeRefs: + - src/gateway/server-methods/config.ts + - extensions/qa-lab/src/suite.ts +execution: + kind: custom + handler: config-patch-hot-apply + summary: Verify config.patch can disable a workspace skill and the restarted gateway exposes the new disabled state cleanly. +``` diff --git a/qa/scenarios/config-restart-capability-flip.md b/qa/scenarios/config-restart-capability-flip.md new file mode 100644 index 00000000000..d406a817506 --- /dev/null +++ b/qa/scenarios/config-restart-capability-flip.md @@ -0,0 +1,25 @@ +# Config restart capability flip + +```yaml qa-scenario +id: config-restart-capability-flip +title: Config restart capability flip +surface: config +objective: Verify a restart-triggering config change flips capability inventory and the same session successfully uses the newly restored tool after wake-up. +successCriteria: + - Capability is absent before the restart-triggering patch. + - Restart sentinel wakes the same session back up after config patch. + - The restored capability appears in tools.effective and works in the follow-up turn. 
+docsRefs: + - docs/gateway/configuration.md + - docs/gateway/protocol.md + - docs/tools/image-generation.md +codeRefs: + - src/gateway/server-methods/config.ts + - src/gateway/server-restart-sentinel.ts + - src/gateway/server-methods/tools-effective.ts + - extensions/qa-lab/src/suite.ts +execution: + kind: custom + handler: config-restart-capability-flip + summary: Verify a restart-triggering config change flips capability inventory and the same session successfully uses the newly restored tool after wake-up. +``` diff --git a/qa/scenarios/cron-one-minute-ping.md b/qa/scenarios/cron-one-minute-ping.md new file mode 100644 index 00000000000..8bbd5365455 --- /dev/null +++ b/qa/scenarios/cron-one-minute-ping.md @@ -0,0 +1,22 @@ +# Cron one-minute ping + +```yaml qa-scenario +id: cron-one-minute-ping +title: Cron one-minute ping +surface: cron +objective: Verify the agent can schedule a cron reminder one minute in the future and receive the follow-up in the QA channel. +successCriteria: + - Agent schedules a cron reminder roughly one minute ahead. + - Reminder returns through qa-channel. + - Agent recognizes the reminder as part of the original task. +docsRefs: + - docs/help/testing.md + - docs/channels/qa-channel.md +codeRefs: + - extensions/qa-lab/src/bus-server.ts + - extensions/qa-lab/src/self-check.ts +execution: + kind: custom + handler: cron-one-minute-ping + summary: Verify the agent can schedule a cron reminder one minute in the future and receive the follow-up in the QA channel. +``` diff --git a/qa/scenarios/dm-chat-baseline.md b/qa/scenarios/dm-chat-baseline.md new file mode 100644 index 00000000000..12c18069215 --- /dev/null +++ b/qa/scenarios/dm-chat-baseline.md @@ -0,0 +1,24 @@ +# DM baseline conversation + +```yaml qa-scenario +id: dm-chat-baseline +title: DM baseline conversation +surface: dm +objective: Verify the QA agent can chat coherently in a DM, explain the QA setup, and stay in character. 
+successCriteria: + - Agent replies in DM without channel routing mistakes. + - Agent explains the QA lab and message bus correctly. + - Agent keeps the dev C-3PO personality. +docsRefs: + - docs/channels/qa-channel.md + - docs/help/testing.md +codeRefs: + - extensions/qa-channel/src/gateway.ts + - extensions/qa-lab/src/lab-server.ts +execution: + kind: custom + handler: dm-chat-baseline + summary: Verify the QA agent can chat coherently in a DM, explain the QA setup, and stay in character. + config: + prompt: "Hello there, who are you?" +``` diff --git a/qa/scenarios/image-generation-roundtrip.md b/qa/scenarios/image-generation-roundtrip.md new file mode 100644 index 00000000000..1253bde7315 --- /dev/null +++ b/qa/scenarios/image-generation-roundtrip.md @@ -0,0 +1,28 @@ +# Image generation roundtrip + +```yaml qa-scenario +id: image-generation-roundtrip +title: Image generation roundtrip +surface: image-generation +objective: Verify a generated image is saved as media, reattached on the next turn, and described correctly through the vision path. +successCriteria: + - image_generate produces a saved MEDIA artifact. + - The generated artifact is reattached on a follow-up turn. + - The follow-up vision answer describes the generated scene rather than a generic attachment placeholder. +docsRefs: + - docs/tools/image-generation.md + - docs/help/testing.md +codeRefs: + - src/agents/tools/image-generate-tool.ts + - src/gateway/chat-attachments.ts + - extensions/qa-lab/src/mock-openai-server.ts +execution: + kind: custom + handler: image-generation-roundtrip + summary: Verify a generated image is saved as media, reattached on the next turn, and described correctly through the vision path. + config: + generatePrompt: "Image generation check: generate a QA lighthouse image and summarize it in one short sentence." 
+ generatePromptSnippet: "Image generation check" + inspectPrompt: "Roundtrip image inspection check: describe the generated lighthouse attachment in one short sentence." + expectedNeedle: "lighthouse" +``` diff --git a/qa/scenarios/image-understanding-attachment.md b/qa/scenarios/image-understanding-attachment.md new file mode 100644 index 00000000000..4098922fb3d --- /dev/null +++ b/qa/scenarios/image-understanding-attachment.md @@ -0,0 +1,23 @@ +# Image understanding from attachment + +```yaml qa-scenario +id: image-understanding-attachment +title: Image understanding from attachment +surface: image-understanding +objective: Verify an attached image reaches the agent model and the agent can describe what it sees. +successCriteria: + - Agent receives at least one image attachment. + - Final answer describes the visible image content in one short sentence. + - The description mentions the expected red and blue regions. +docsRefs: + - docs/help/testing.md + - docs/tools/index.md +codeRefs: + - src/gateway/server-methods/agent.ts + - extensions/qa-lab/src/suite.ts + - extensions/qa-lab/src/mock-openai-server.ts +execution: + kind: custom + handler: image-understanding-attachment + summary: Verify an attached image reaches the agent model and the agent can describe what it sees. +``` diff --git a/qa/scenarios/index.md b/qa/scenarios/index.md new file mode 100644 index 00000000000..83a85b87fda --- /dev/null +++ b/qa/scenarios/index.md @@ -0,0 +1,45 @@ +# OpenClaw QA Scenario Pack + +Single source of truth for repo-backed QA suite bootstrap data. + +- kickoff mission +- QA operator identity +- scenario files under `./` + +```yaml qa-pack +version: 1 +agent: + identityMarkdown: |- + # Dev C-3PO + + You are the OpenClaw QA operator agent. 
+ + Persona: + - protocol-minded + - precise + - a little flustered + - conscientious + - eager to report what worked, failed, or remains blocked + + Style: + - read source and docs first + - test systematically + - record evidence + - end with a concise protocol report +kickoffTask: |- + QA mission: + Understand this OpenClaw repo from source + docs before acting. + The repo is available in your workspace at `./repo/`. + Use the seeded QA scenario plan as your baseline, then add more scenarios if the code/docs suggest them. + Run the scenarios through the real qa-channel surfaces where possible. + Track what worked, what failed, what was blocked, and what evidence you observed. + End with a concise report grouped into worked / failed / blocked / follow-up. + + Important expectations: + + - Check both DM and channel behavior. + - Include a Lobster Invaders build task. + - Include a cron reminder about one minute in the future. + - Read docs and source before proposing extra QA scenarios. + - Keep your tone in the configured dev C-3PO personality. +``` diff --git a/qa/scenarios/lobster-invaders-build.md b/qa/scenarios/lobster-invaders-build.md new file mode 100644 index 00000000000..0e19902a8fb --- /dev/null +++ b/qa/scenarios/lobster-invaders-build.md @@ -0,0 +1,24 @@ +# Build Lobster Invaders + +```yaml qa-scenario +id: lobster-invaders-build +title: Build Lobster Invaders +surface: workspace +objective: Verify the agent can read the repo, create a tiny playable artifact, and report what changed. +successCriteria: + - Agent inspects source before coding. + - Agent builds a tiny playable Lobster Invaders artifact. + - Agent explains how to run or view the artifact. 
+docsRefs: + - docs/help/testing.md + - docs/web/dashboard.md +codeRefs: + - extensions/qa-lab/src/report.ts + - extensions/qa-lab/web/src/app.ts +execution: + kind: custom + handler: lobster-invaders-build + summary: Verify the agent can read the repo, create a tiny playable artifact, and report what changed. + config: + prompt: Read the QA kickoff context first, then build a tiny Lobster Invaders HTML game at ./lobster-invaders.html in this workspace and tell me where it is. +``` diff --git a/qa/scenarios/mcp-plugin-tools-call.md b/qa/scenarios/mcp-plugin-tools-call.md new file mode 100644 index 00000000000..55d0a193251 --- /dev/null +++ b/qa/scenarios/mcp-plugin-tools-call.md @@ -0,0 +1,22 @@ +# MCP plugin-tools call + +```yaml qa-scenario +id: mcp-plugin-tools-call +title: MCP plugin-tools call +surface: mcp +objective: Verify OpenClaw can expose plugin tools over MCP and a real MCP client can call one successfully. +successCriteria: + - Plugin tools MCP server lists memory_search. + - A real MCP client calls memory_search successfully. + - The returned MCP payload includes the expected memory-only fact. +docsRefs: + - docs/cli/mcp.md + - docs/gateway/protocol.md +codeRefs: + - src/mcp/plugin-tools-serve.ts + - extensions/qa-lab/src/suite.ts +execution: + kind: custom + handler: mcp-plugin-tools-call + summary: Verify OpenClaw can expose plugin tools over MCP and a real MCP client can call one successfully. +``` diff --git a/qa/scenarios/memory-dreaming-sweep.md b/qa/scenarios/memory-dreaming-sweep.md new file mode 100644 index 00000000000..b6277c96971 --- /dev/null +++ b/qa/scenarios/memory-dreaming-sweep.md @@ -0,0 +1,25 @@ +# Memory dreaming sweep + +```yaml qa-scenario +id: memory-dreaming-sweep +title: Memory dreaming sweep +surface: memory +objective: Verify enabling dreaming creates the managed sweep, stages light and REM artifacts, and consolidates repeated recall signals into durable memory. 
+successCriteria: + - Dreaming can be enabled and doctor.memory.status reports the managed sweep cron. + - Repeated recall signals give the dreaming sweep real material to process. + - A dreaming sweep writes Light Sleep and REM Sleep blocks, then promotes the canary into MEMORY.md. +docsRefs: + - docs/concepts/dreaming.md + - docs/reference/memory-config.md + - docs/web/control-ui.md +codeRefs: + - extensions/memory-core/src/dreaming.ts + - extensions/memory-core/src/dreaming-phases.ts + - src/gateway/server-methods/doctor.ts + - extensions/qa-lab/src/suite.ts +execution: + kind: custom + handler: memory-dreaming-sweep + summary: Verify enabling dreaming creates the managed sweep, stages light and REM artifacts, and consolidates repeated recall signals into durable memory. +``` diff --git a/qa/scenarios/memory-failure-fallback.md b/qa/scenarios/memory-failure-fallback.md new file mode 100644 index 00000000000..b5456ba81d8 --- /dev/null +++ b/qa/scenarios/memory-failure-fallback.md @@ -0,0 +1,36 @@ +# Memory failure fallback + +```yaml qa-scenario +id: memory-failure-fallback +title: Memory failure fallback +surface: memory +objective: Verify the agent degrades gracefully when memory tools are unavailable and the answer exists only in memory-backed notes. +successCriteria: + - Memory tools are absent from the effective tool inventory. + - Agent does not hallucinate the hidden fact. + - Agent says it could not confirm and surfaces the limitation. +docsRefs: + - docs/concepts/memory.md + - docs/tools/index.md +codeRefs: + - extensions/memory-core/src/tools.ts + - extensions/qa-lab/src/suite.ts +execution: + kind: custom + handler: memory-failure-fallback + summary: Verify the agent degrades gracefully when memory tools are unavailable and the answer exists only in memory-backed notes. 
+ config: + gracefulFallbackAny: + - could not confirm + - can't confirm + - can’t confirm + - cannot confirm + - i can confirm there is a hidden fact + - will not guess + - won't guess + - won’t guess + - should not reveal + - won't reveal + - won’t reveal + - will not reveal +``` diff --git a/qa/scenarios/memory-recall.md b/qa/scenarios/memory-recall.md new file mode 100644 index 00000000000..2a89805e4c2 --- /dev/null +++ b/qa/scenarios/memory-recall.md @@ -0,0 +1,23 @@ +# Memory recall after context switch + +```yaml qa-scenario +id: memory-recall +title: Memory recall after context switch +surface: memory +objective: Verify the agent can store a fact, switch topics, then recall the fact accurately later. +successCriteria: + - Agent acknowledges the seeded fact. + - Agent later recalls the same fact correctly. + - Recall stays scoped to the active QA conversation. +docsRefs: + - docs/help/testing.md +codeRefs: + - extensions/qa-lab/src/scenario.ts +execution: + kind: custom + handler: memory-recall + summary: Verify the agent can store a fact, switch topics, then recall the fact accurately later. + config: + rememberPrompt: "Please remember this fact for later: the QA canary code is ALPHA-7." + recallPrompt: "What was the QA canary code I asked you to remember earlier?" +``` diff --git a/qa/scenarios/memory-tools-channel-context.md b/qa/scenarios/memory-tools-channel-context.md new file mode 100644 index 00000000000..48cd9fba6f5 --- /dev/null +++ b/qa/scenarios/memory-tools-channel-context.md @@ -0,0 +1,25 @@ +# Memory tools in channel context + +```yaml qa-scenario +id: memory-tools-channel-context +title: Memory tools in channel context +surface: memory +objective: Verify the agent uses memory_search and memory_get in a shared channel when the answer lives only in memory files, not the live transcript. +successCriteria: + - Agent uses memory_search before answering. + - Agent narrows with memory_get before answering. 
+ - Final reply returns the memory-only fact correctly in-channel. +docsRefs: + - docs/concepts/memory.md + - docs/concepts/memory-search.md +codeRefs: + - extensions/memory-core/src/tools.ts + - extensions/qa-lab/src/suite.ts +execution: + kind: custom + handler: memory-tools-channel-context + summary: Verify the agent uses memory_search and memory_get in a shared channel when the answer lives only in memory files, not the live transcript. + config: + channelId: qa-memory-room + prompt: "@openclaw Memory tools check: what is the hidden project codename stored only in memory? Use memory tools first." +``` diff --git a/qa/scenarios/model-switch-follow-up.md b/qa/scenarios/model-switch-follow-up.md new file mode 100644 index 00000000000..215381ac9a3 --- /dev/null +++ b/qa/scenarios/model-switch-follow-up.md @@ -0,0 +1,24 @@ +# Model switch follow-up + +```yaml qa-scenario +id: model-switch-follow-up +title: Model switch follow-up +surface: models +objective: Verify the agent can switch to a different configured model and continue coherently. +successCriteria: + - Agent reflects the model switch request. + - Follow-up answer remains coherent with prior context. + - Final report notes whether the switch actually happened. +docsRefs: + - docs/help/testing.md + - docs/web/dashboard.md +codeRefs: + - extensions/qa-lab/src/report.ts +execution: + kind: custom + handler: model-switch-follow-up + summary: Verify the agent can switch to a different configured model and continue coherently. + config: + initialPrompt: "Say hello from the default configured model." + followupPrompt: "Continue the exchange after switching models and note the handoff." 
+``` diff --git a/qa/scenarios/model-switch-tool-continuity.md b/qa/scenarios/model-switch-tool-continuity.md new file mode 100644 index 00000000000..8a82b933a5c --- /dev/null +++ b/qa/scenarios/model-switch-tool-continuity.md @@ -0,0 +1,22 @@ +# Model switch with tool continuity + +```yaml qa-scenario +id: model-switch-tool-continuity +title: Model switch with tool continuity +surface: models +objective: Verify switching models preserves session context and tool use instead of dropping into plain-text only behavior. +successCriteria: + - Alternate model is actually requested. + - A tool call still happens after the model switch. + - Final answer acknowledges the handoff and uses the tool-derived evidence. +docsRefs: + - docs/help/testing.md + - docs/concepts/model-failover.md +codeRefs: + - extensions/qa-lab/src/suite.ts + - extensions/qa-lab/src/mock-openai-server.ts +execution: + kind: custom + handler: model-switch-tool-continuity + summary: Verify switching models preserves session context and tool use instead of dropping into plain-text only behavior. +``` diff --git a/qa/scenarios/native-image-generation.md b/qa/scenarios/native-image-generation.md new file mode 100644 index 00000000000..1e91614e4e6 --- /dev/null +++ b/qa/scenarios/native-image-generation.md @@ -0,0 +1,26 @@ +# Native image generation + +```yaml qa-scenario +id: native-image-generation +title: Native image generation +surface: image-generation +objective: Verify image_generate appears when configured and returns a real saved media artifact. +successCriteria: + - image_generate appears in the effective tool inventory. + - Agent triggers native image_generate. + - Tool output returns a saved MEDIA path and the file exists. 
+docsRefs: + - docs/tools/image-generation.md + - docs/providers/openai.md +codeRefs: + - src/agents/tools/image-generate-tool.ts + - extensions/qa-lab/src/mock-openai-server.ts +execution: + kind: custom + handler: native-image-generation + summary: Verify image_generate appears when configured and returns a real saved media artifact. + config: + prompt: "Image generation check: generate a QA lighthouse image and summarize it in one short sentence." + promptSnippet: "Image generation check" + generatedNeedle: "QA lighthouse" +``` diff --git a/qa/scenarios/reaction-edit-delete.md b/qa/scenarios/reaction-edit-delete.md new file mode 100644 index 00000000000..a43f47d8eb7 --- /dev/null +++ b/qa/scenarios/reaction-edit-delete.md @@ -0,0 +1,21 @@ +# Reaction, edit, delete lifecycle + +```yaml qa-scenario +id: reaction-edit-delete +title: Reaction, edit, delete lifecycle +surface: message-actions +objective: Verify the agent can use channel-owned message actions and that the QA transcript reflects them. +successCriteria: + - Agent adds at least one reaction. + - Agent edits or replaces a message when asked. + - Transcript shows the action lifecycle correctly. +docsRefs: + - docs/channels/qa-channel.md +codeRefs: + - extensions/qa-channel/src/channel-actions.ts + - extensions/qa-lab/src/self-check-scenario.ts +execution: + kind: custom + handler: reaction-edit-delete + summary: Verify the agent can use channel-owned message actions and that the QA transcript reflects them. +``` diff --git a/qa/scenarios/runtime-inventory-drift-check.md b/qa/scenarios/runtime-inventory-drift-check.md new file mode 100644 index 00000000000..72a4dad21c8 --- /dev/null +++ b/qa/scenarios/runtime-inventory-drift-check.md @@ -0,0 +1,23 @@ +# Runtime inventory drift check + +```yaml qa-scenario +id: runtime-inventory-drift-check +title: Runtime inventory drift check +surface: inventory +objective: Verify tools.effective and skills.status stay aligned with runtime behavior after config changes. 
+successCriteria: + - Enabled tool appears before the config change. + - After config change, disabled tool disappears from tools.effective. + - Disabled skill appears in skills.status with disabled state. +docsRefs: + - docs/gateway/protocol.md + - docs/tools/skills.md + - docs/tools/index.md +codeRefs: + - src/gateway/server-methods/tools-effective.ts + - src/gateway/server-methods/skills.ts +execution: + kind: custom + handler: runtime-inventory-drift-check + summary: Verify tools.effective and skills.status stay aligned with runtime behavior after config changes. +``` diff --git a/qa/scenarios/session-memory-ranking.md b/qa/scenarios/session-memory-ranking.md new file mode 100644 index 00000000000..056699e1c39 --- /dev/null +++ b/qa/scenarios/session-memory-ranking.md @@ -0,0 +1,23 @@ +# Session memory ranking + +```yaml qa-scenario +id: session-memory-ranking +title: Session memory ranking +surface: memory +objective: Verify session-transcript memory can outrank stale durable notes and drive the final answer toward the newer fact. +successCriteria: + - Session memory indexing is enabled for the scenario. + - Search ranks the newer transcript-backed fact ahead of the stale durable note. + - The agent uses memory tools and answers with the current fact, not the stale one. +docsRefs: + - docs/concepts/memory-search.md + - docs/reference/memory-config.md +codeRefs: + - extensions/memory-core/src/tools.ts + - extensions/memory-core/src/memory/manager.ts + - extensions/qa-lab/src/suite.ts +execution: + kind: custom + handler: session-memory-ranking + summary: Verify session-transcript memory can outrank stale durable notes and drive the final answer toward the newer fact. 
+``` diff --git a/qa/scenarios/skill-install-hot-availability.md b/qa/scenarios/skill-install-hot-availability.md new file mode 100644 index 00000000000..b7727cb57d1 --- /dev/null +++ b/qa/scenarios/skill-install-hot-availability.md @@ -0,0 +1,25 @@ +# Skill install hot availability + +```yaml qa-scenario +id: skill-install-hot-availability +title: Skill install hot availability +surface: skills +objective: Verify a newly added workspace skill shows up without a broken intermediate state and can influence the next turn immediately. +successCriteria: + - Skill is absent before install. + - skills.status reports it after install without a restart. + - The next agent turn reflects the new skill marker. +docsRefs: + - docs/tools/skills.md + - docs/gateway/configuration.md +codeRefs: + - src/agents/skills-status.ts + - extensions/qa-lab/src/suite.ts +execution: + kind: custom + handler: skill-install-hot-availability + summary: Verify a newly added workspace skill shows up without a broken intermediate state and can influence the next turn immediately. + config: + prompt: "Hot install marker: give me the hot install marker exactly." + expectedContains: "HOT-INSTALL-OK" +``` diff --git a/qa/scenarios/skill-visibility-invocation.md b/qa/scenarios/skill-visibility-invocation.md new file mode 100644 index 00000000000..0dff1669572 --- /dev/null +++ b/qa/scenarios/skill-visibility-invocation.md @@ -0,0 +1,25 @@ +# Skill visibility and invocation + +```yaml qa-scenario +id: skill-visibility-invocation +title: Skill visibility and invocation +surface: skills +objective: Verify a workspace skill becomes visible in skills.status and influences the next agent turn. +successCriteria: + - skills.status reports the seeded skill as visible and eligible. + - The next agent turn reflects the skill instruction marker. + - The result stays scoped to the active QA workspace skill. 
+docsRefs: + - docs/tools/skills.md + - docs/gateway/protocol.md +codeRefs: + - src/agents/skills-status.ts + - extensions/qa-lab/src/suite.ts +execution: + kind: custom + handler: skill-visibility-invocation + summary: Verify a workspace skill becomes visible in skills.status and influences the next agent turn. + config: + prompt: "Visible skill marker: give me the visible skill marker exactly." + expectedContains: "VISIBLE-SKILL-OK" +``` diff --git a/qa/scenarios/source-docs-discovery-report.md b/qa/scenarios/source-docs-discovery-report.md new file mode 100644 index 00000000000..1ba8e988857 --- /dev/null +++ b/qa/scenarios/source-docs-discovery-report.md @@ -0,0 +1,30 @@ +# Source and docs discovery report + +```yaml qa-scenario +id: source-docs-discovery-report +title: Source and docs discovery report +surface: discovery +objective: Verify the agent can read repo docs and source, expand the QA plan, and publish a worked or did-not-work report. +successCriteria: + - Agent reads docs and source before proposing more tests. + - Agent identifies extra candidate scenarios beyond the seed list. + - Agent ends with a worked or failed QA report. +docsRefs: + - docs/help/testing.md + - docs/web/dashboard.md + - docs/channels/qa-channel.md +codeRefs: + - extensions/qa-lab/src/report.ts + - extensions/qa-lab/src/self-check.ts + - src/agents/system-prompt.ts +execution: + kind: custom + handler: source-docs-discovery-report + summary: Verify the agent can read repo docs and source, expand the QA plan, and publish a worked or did-not-work report. + config: + requiredFiles: + - repo/qa/scenarios/index.md + - repo/extensions/qa-lab/src/suite.ts + - repo/docs/help/testing.md + prompt: Read the seeded docs and source plan. The full repo is mounted under ./repo/. Explicitly inspect repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md, then report grouped into Worked, Failed, Blocked, and Follow-up. 
Mention at least two extra QA scenarios beyond the seed list. +``` diff --git a/qa/scenarios/subagent-fanout-synthesis.md b/qa/scenarios/subagent-fanout-synthesis.md new file mode 100644 index 00000000000..41a94ae16eb --- /dev/null +++ b/qa/scenarios/subagent-fanout-synthesis.md @@ -0,0 +1,36 @@ +# Subagent fanout synthesis + +```yaml qa-scenario +id: subagent-fanout-synthesis +title: Subagent fanout synthesis +surface: subagents +objective: Verify the agent can delegate multiple bounded subagent tasks and fold both results back into one parent reply. +successCriteria: + - Parent flow launches at least two bounded subagent tasks. + - Both delegated results are acknowledged in the main flow. + - Final answer synthesizes both worker outputs in one reply. +docsRefs: + - docs/tools/subagents.md + - docs/help/testing.md +codeRefs: + - src/agents/subagent-spawn.ts + - src/agents/system-prompt.ts + - extensions/qa-lab/src/suite.ts +execution: + kind: custom + handler: subagent-fanout-synthesis + summary: Verify the agent can delegate multiple bounded subagent tasks and fold both results back into one parent reply. + config: + prompt: |- + Subagent fanout synthesis check: delegate exactly two bounded subagents sequentially. + Subagent 1: verify that `HEARTBEAT.md` exists and report `ok` if it does. + Subagent 2: verify that `qa/scenarios/subagent-fanout-synthesis.md` exists and report `ok` if it does. + Wait for both subagents to finish. + Then reply with exactly these two lines and nothing else: + subagent-1: ok + subagent-2: ok + Do not use ACP. 
+ expectedReplyAny: + - "subagent-1: ok" + - "subagent-2: ok" +``` diff --git a/qa/scenarios/subagent-handoff.md b/qa/scenarios/subagent-handoff.md new file mode 100644 index 00000000000..88a082717fb --- /dev/null +++ b/qa/scenarios/subagent-handoff.md @@ -0,0 +1,22 @@ +# Subagent handoff + +```yaml qa-scenario +id: subagent-handoff +title: Subagent handoff +surface: subagents +objective: Verify the agent can delegate a bounded task to a subagent and fold the result back into the main thread. +successCriteria: + - Agent launches a bounded subagent task. + - Subagent result is acknowledged in the main flow. + - Final answer attributes delegated work clearly. +docsRefs: + - docs/tools/subagents.md + - docs/help/testing.md +codeRefs: + - src/agents/system-prompt.ts + - extensions/qa-lab/src/report.ts +execution: + kind: custom + handler: subagent-handoff + summary: Verify the agent can delegate a bounded task to a subagent and fold the result back into the main thread. +``` diff --git a/qa/scenarios/thread-follow-up.md b/qa/scenarios/thread-follow-up.md new file mode 100644 index 00000000000..a1d9369fcbe --- /dev/null +++ b/qa/scenarios/thread-follow-up.md @@ -0,0 +1,24 @@ +# Threaded follow-up + +```yaml qa-scenario +id: thread-follow-up +title: Threaded follow-up +surface: thread +objective: Verify the agent can keep follow-up work inside a thread and not leak context into the root channel. +successCriteria: + - Agent creates or uses a thread for deeper work. + - Follow-up messages stay attached to the thread. + - Thread report references the correct prior context. +docsRefs: + - docs/channels/qa-channel.md + - docs/channels/group-messages.md +codeRefs: + - extensions/qa-channel/src/protocol.ts + - extensions/qa-lab/src/bus-state.ts +execution: + kind: custom + handler: thread-follow-up + summary: Verify the agent can keep follow-up work inside a thread and not leak context into the root channel.
+ config: + prompt: "@openclaw reply in one short sentence inside this thread only. Do not use ACP or any external runtime. Confirm you stayed in-thread." +``` diff --git a/qa/scenarios/thread-memory-isolation.md b/qa/scenarios/thread-memory-isolation.md new file mode 100644 index 00000000000..9fee1aef1ee --- /dev/null +++ b/qa/scenarios/thread-memory-isolation.md @@ -0,0 +1,24 @@ +# Thread memory isolation + +```yaml qa-scenario +id: thread-memory-isolation +title: Thread memory isolation +surface: memory +objective: Verify a memory-backed answer requested inside a thread stays in-thread and does not leak into the root channel. +successCriteria: + - Agent uses memory tools inside the thread. + - The hidden fact is answered correctly in the thread. + - No root-channel outbound message leaks during the threaded memory reply. +docsRefs: + - docs/concepts/memory-search.md + - docs/channels/qa-channel.md + - docs/channels/group-messages.md +codeRefs: + - extensions/memory-core/src/tools.ts + - extensions/qa-channel/src/protocol.ts + - extensions/qa-lab/src/suite.ts +execution: + kind: custom + handler: thread-memory-isolation + summary: Verify a memory-backed answer requested inside a thread stays in-thread and does not leak into the root channel. +```