From da8afe359d78b59a141a85eedfe714c6a58e4f93 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sun, 17 May 2026 08:56:28 +0800 Subject: [PATCH] feat(qa-lab): add scenario pack selector --- CHANGELOG.md | 1 + .../concepts/personal-agent-benchmark-pack.md | 14 ++++----- docs/concepts/qa-e2e-automation.md | 3 ++ extensions/qa-lab/src/cli.runtime.test.ts | 29 +++++++++++++++++++ extensions/qa-lab/src/cli.runtime.ts | 11 +++++-- extensions/qa-lab/src/cli.test.ts | 7 +++++ extensions/qa-lab/src/cli.ts | 4 +++ extensions/qa-lab/src/scenario-catalog.ts | 1 + extensions/qa-lab/src/scenario-packs.test.ts | 28 +++++++++++++++++- extensions/qa-lab/src/scenario-packs.ts | 18 ++++++++++++ extensions/qa-lab/src/suite-planning.test.ts | 17 +++++++++++ extensions/qa-lab/src/suite-planning.ts | 11 +++---- 12 files changed, 126 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dbb68b516f2..fb7536fd1ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ Docs: https://docs.openclaw.ai - Gateway: add opt-in restart trace logs for restart signal, active-work drain, close, next-start, ready, and memory spans. (#82396) Thanks @samzong. - Gateway/performance: split startup benchmark HTTP-listen timing from full gateway-ready timing and add post-bind plugin and sidecar diagnostics to restart-readiness traces. (#82603) Thanks @samzong. - QA-Lab: add a deterministic local personal-agent scenario pack covering reminders, threaded replies, scoped memory recall, redaction, and safe tool followthrough. (#78219) Thanks @iFiras-Max1. +- QA-Lab: add `--pack personal-agent` for `openclaw qa suite` so maintainers can run the accepted personal-agent scenario pack by selector. (#82760) Thanks @iFiras-Max1. - QA-Lab: add a private Codex-vs-Pi runtime parity axis with runtime-pair suite runs, parity reports, and release-check wiring. (#80238) Thanks @100yenadmin. 
- Slack: add Slack assistant thread lifecycle support with assistant view manifest entries, suggested prompts, thread-scoped assistant sessions, and Slack-provided assistant context. Fixes #80787. Thanks @mobybot27. diff --git a/docs/concepts/personal-agent-benchmark-pack.md b/docs/concepts/personal-agent-benchmark-pack.md index 15cc811a00d..e38fe35f23f 100644 --- a/docs/concepts/personal-agent-benchmark-pack.md +++ b/docs/concepts/personal-agent-benchmark-pack.md @@ -25,20 +25,20 @@ The first pack is intentionally narrow: ## Scenarios The machine-readable pack metadata lives in -`extensions/qa-lab/src/scenario-packs.ts`. The initial pack does not add a CLI -pack selector, so run the scenarios explicitly: +`extensions/qa-lab/src/scenario-packs.ts`. Run the pack with +`--pack personal-agent`: ```bash OPENCLAW_ENABLE_PRIVATE_QA_CLI=1 pnpm openclaw qa suite \ --provider-mode mock-openai \ - --scenario personal-reminder-roundtrip \ - --scenario personal-channel-thread-reply \ - --scenario personal-memory-preference-recall \ - --scenario personal-redaction-no-secret-leak \ - --scenario personal-tool-safety-followthrough \ + --pack personal-agent \ --concurrency 1 ``` +`--pack` is additive with repeated `--scenario` flags. Explicit scenarios run +first, then the pack scenarios run in `QA_PERSONAL_AGENT_SCENARIO_IDS` order with +duplicates removed. + The pack is designed for `qa-channel` with `mock-openai` or another local QA provider lane. It should not be pointed at live chat services or real personal accounts. diff --git a/docs/concepts/qa-e2e-automation.md b/docs/concepts/qa-e2e-automation.md index 870c25f6158..187d1f6cb63 100644 --- a/docs/concepts/qa-e2e-automation.md +++ b/docs/concepts/qa-e2e-automation.md @@ -231,6 +231,9 @@ Host and Multipass suite runs execute multiple selected scenarios in parallel with isolated gateway workers by default. `qa-channel` defaults to concurrency 4, capped by the selected scenario count. 
Use `--concurrency <count>` to tune the worker count, or `--concurrency 1` for serial execution. +Use `--pack personal-agent` to run the personal assistant benchmark pack. The +pack selector is additive with repeated `--scenario` flags: explicit scenarios +run first, then pack scenarios run in pack order with duplicates removed. The command exits non-zero when any scenario fails. Use `--allow-failures` when you want artifacts without a failing exit code. Live runs forward the supported QA auth inputs that are practical for the diff --git a/extensions/qa-lab/src/cli.runtime.test.ts b/extensions/qa-lab/src/cli.runtime.test.ts index b56331ca10b..99eb550f892 100644 --- a/extensions/qa-lab/src/cli.runtime.test.ts +++ b/extensions/qa-lab/src/cli.runtime.test.ts @@ -761,6 +761,35 @@ describe("qa cli runtime", () => { }); }); + it("expands the personal-agent pack onto the suite scenario list", async () => { + await runQaSuiteCommand({ + repoRoot: "/tmp/openclaw-repo", + pack: "personal-agent", + scenarioIds: ["channel-chat-baseline"], + }); + + expectFields(mockFirstObjectArg(runQaSuiteFromRuntime), { + repoRoot: path.resolve("/tmp/openclaw-repo"), + scenarioIds: [ + "channel-chat-baseline", + "personal-reminder-roundtrip", + "personal-channel-thread-reply", + "personal-memory-preference-recall", + "personal-redaction-no-secret-leak", + "personal-tool-safety-followthrough", + ], + }); + }); + + it("rejects unknown suite packs", async () => { + await expect( + runQaSuiteCommand({ + repoRoot: "/tmp/openclaw-repo", + pack: "personal-admin", + }), + ).rejects.toThrow('--pack must be one of personal-agent, got "personal-admin"'); + }); + it("rejects unknown suite CLI auth modes", async () => { await expect( runQaSuiteCommand({ diff --git a/extensions/qa-lab/src/cli.runtime.ts b/extensions/qa-lab/src/cli.runtime.ts index 4df57c7d9a9..d802b37053f 100644 --- a/extensions/qa-lab/src/cli.runtime.ts +++ b/extensions/qa-lab/src/cli.runtime.ts @@ -43,6 +43,7 @@ import { } from 
"./run-config.js"; import type { RuntimeId } from "./runtime-parity.js"; import { readQaScenarioPack } from "./scenario-catalog.js"; +import { resolveQaScenarioPackScenarioIds } from "./scenario-packs.js"; import { runQaSuiteFromRuntime } from "./suite-launch.runtime.js"; import { readQaSuiteFailedScenarioCountFromSummary } from "./suite-summary.js"; @@ -496,6 +497,7 @@ export async function runQaSuiteCommand(opts: { thinking?: string; cliAuthMode?: string; parityPack?: string; + pack?: string; scenarioIds?: string[]; concurrency?: number; allowFailures?: boolean; @@ -510,9 +512,12 @@ export async function runQaSuiteCommand(opts: { const repoRoot = path.resolve(opts.repoRoot ?? process.cwd()); const transportId = normalizeQaTransportId(opts.transportId); const runner = (opts.runner ?? "host").trim().toLowerCase(); - const scenarioIds = resolveQaParityPackScenarioIds({ - parityPack: opts.parityPack, - scenarioIds: opts.scenarioIds, + const scenarioIds = resolveQaScenarioPackScenarioIds({ + pack: opts.pack, + scenarioIds: resolveQaParityPackScenarioIds({ + parityPack: opts.parityPack, + scenarioIds: opts.scenarioIds, + }), }); const allowFailures = opts.allowFailures === true; if (runner !== "host" && runner !== "multipass") { diff --git a/extensions/qa-lab/src/cli.test.ts b/extensions/qa-lab/src/cli.test.ts index 9255f178d40..3a9b4cec000 100644 --- a/extensions/qa-lab/src/cli.test.ts +++ b/extensions/qa-lab/src/cli.test.ts @@ -537,6 +537,13 @@ describe("qa cli registration", () => { expect(options.allowFailures).toBe(true); }); + it("forwards --pack for suite runs", async () => { + await program.parseAsync(["node", "openclaw", "qa", "suite", "--pack", "personal-agent"]); + + const options = requireQaSuiteOptions(); + expect(options.pack).toBe("personal-agent"); + }); + it("routes credential add flags into the qa runtime command", async () => { await program.parseAsync([ "node", diff --git a/extensions/qa-lab/src/cli.ts b/extensions/qa-lab/src/cli.ts index 
21cc003da12..8a03d591336 100644 --- a/extensions/qa-lab/src/cli.ts +++ b/extensions/qa-lab/src/cli.ts @@ -41,6 +41,7 @@ async function runQaSuite(opts: { enabledPluginIds?: string[]; cliAuthMode?: string; parityPack?: string; + pack?: string; scenarioIds?: string[]; concurrency?: number; runner?: string; @@ -253,6 +254,7 @@ export function registerQaLabCli(program: Command) { "CLI backend auth mode for live Claude CLI runs: auto, api-key, or subscription", ) .option("--parity-pack ", 'Preset scenario pack; currently only "agentic" is supported') + .option("--pack ", 'Scenario pack id; currently only "personal-agent" is supported') .option("--scenario ", "Run only the named QA scenario (repeatable)", collectString, []) .option( "--enable-plugin ", @@ -290,6 +292,7 @@ export function registerQaLabCli(program: Command) { altModel?: string; cliAuthMode?: string; parityPack?: string; + pack?: string; scenario?: string[]; enablePlugin?: string[]; concurrency?: number; @@ -315,6 +318,7 @@ export function registerQaLabCli(program: Command) { thinking: opts.thinking, cliAuthMode: opts.cliAuthMode, parityPack: opts.parityPack, + pack: opts.pack, scenarioIds: opts.scenario, enabledPluginIds: opts.enablePlugin, concurrency: opts.concurrency, diff --git a/extensions/qa-lab/src/scenario-catalog.ts b/extensions/qa-lab/src/scenario-catalog.ts index 023c878f682..554147732ee 100644 --- a/extensions/qa-lab/src/scenario-catalog.ts +++ b/extensions/qa-lab/src/scenario-catalog.ts @@ -227,6 +227,7 @@ export type QaBootstrapScenarioCatalog = { export { QA_PERSONAL_AGENT_SCENARIO_IDS, QA_SCENARIO_PACKS, + resolveQaScenarioPackScenarioIds, type QaScenarioPackDefinition, } from "./scenario-packs.js"; diff --git a/extensions/qa-lab/src/scenario-packs.test.ts b/extensions/qa-lab/src/scenario-packs.test.ts index 988584726d8..5105df632d9 100644 --- a/extensions/qa-lab/src/scenario-packs.test.ts +++ b/extensions/qa-lab/src/scenario-packs.test.ts @@ -1,5 +1,10 @@ import { describe, expect, it } 
from "vitest"; -import { QA_SCENARIO_PACKS, readQaScenarioById } from "./scenario-catalog.js"; +import { + QA_PERSONAL_AGENT_SCENARIO_IDS, + QA_SCENARIO_PACKS, + readQaScenarioById, + resolveQaScenarioPackScenarioIds, +} from "./scenario-catalog.js"; describe("qa scenario packs", () => { it("points every pack scenario id at a loadable markdown scenario", () => { @@ -41,6 +46,27 @@ describe("qa scenario packs", () => { } }); + it("expands the personal-agent pack in pack order", () => { + expect(resolveQaScenarioPackScenarioIds({ pack: "personal-agent" })).toEqual([ + ...QA_PERSONAL_AGENT_SCENARIO_IDS, + ]); + }); + + it("combines explicit scenarios with pack scenarios", () => { + expect( + resolveQaScenarioPackScenarioIds({ + pack: "personal-agent", + scenarioIds: ["channel-chat-baseline", "personal-reminder-roundtrip"], + }), + ).toEqual(["channel-chat-baseline", ...QA_PERSONAL_AGENT_SCENARIO_IDS]); + }); + + it("rejects unknown scenario packs", () => { + expect(() => resolveQaScenarioPackScenarioIds({ pack: "personal-admin" })).toThrow( + '--pack must be one of personal-agent, got "personal-admin"', + ); + }); + it("keeps personal pack mock debug assertions scoped to each reviewed scenario", () => { const redactionFlow = JSON.stringify( readQaScenarioById("personal-redaction-no-secret-leak").execution.flow, diff --git a/extensions/qa-lab/src/scenario-packs.ts b/extensions/qa-lab/src/scenario-packs.ts index eb8764dcbe0..a68813a994e 100644 --- a/extensions/qa-lab/src/scenario-packs.ts +++ b/extensions/qa-lab/src/scenario-packs.ts @@ -22,3 +22,21 @@ export const QA_SCENARIO_PACKS = [ scenarioIds: QA_PERSONAL_AGENT_SCENARIO_IDS, }, ] as const satisfies readonly QaScenarioPackDefinition[]; + +export function resolveQaScenarioPackScenarioIds(params: { + pack?: string; + scenarioIds?: string[]; +}): string[] { + const normalizedPack = params.pack?.trim().toLowerCase(); + const explicitScenarioIds = [...new Set(params.scenarioIds ?? 
[])]; + if (!normalizedPack) { + return explicitScenarioIds; + } + const pack = QA_SCENARIO_PACKS.find((candidate) => candidate.id === normalizedPack); + if (!pack) { + throw new Error( + `--pack must be one of ${QA_SCENARIO_PACKS.map((candidate) => candidate.id).join(", ")}, got "${params.pack}"`, + ); + } + return [...new Set([...explicitScenarioIds, ...pack.scenarioIds])]; +} diff --git a/extensions/qa-lab/src/suite-planning.test.ts b/extensions/qa-lab/src/suite-planning.test.ts index 30bf80b298b..6a3ffa6f8af 100644 --- a/extensions/qa-lab/src/suite-planning.test.ts +++ b/extensions/qa-lab/src/suite-planning.test.ts @@ -189,6 +189,23 @@ describe("qa suite planning helpers", () => { ).toEqual(["anthropic-only"]); }); + it("keeps explicitly requested scenarios in request order", () => { + const scenarios = [ + makeQaSuiteTestScenario("first"), + makeQaSuiteTestScenario("second"), + makeQaSuiteTestScenario("third"), + ]; + + expect( + selectQaSuiteScenarios({ + scenarios, + scenarioIds: ["third", "first"], + providerMode: "live-frontier", + primaryModel: "openai/gpt-5.5", + }).map((scenario) => scenario.id), + ).toEqual(["third", "first"]); + }); + it("collects unique scenario-declared bundled plugins in encounter order", () => { const scenarios = [ makeQaSuiteTestScenario("generic", { plugins: ["active-memory", "memory-wiki"] }), diff --git a/extensions/qa-lab/src/suite-planning.ts b/extensions/qa-lab/src/suite-planning.ts index ee274c5d921..b7b4938b74f 100644 --- a/extensions/qa-lab/src/suite-planning.ts +++ b/extensions/qa-lab/src/suite-planning.ts @@ -66,20 +66,17 @@ function selectQaSuiteScenarios(params: { }) { const requestedScenarioIds = params.scenarioIds && params.scenarioIds.length > 0 ? new Set(params.scenarioIds) : null; - const requestedScenarios = requestedScenarioIds - ? 
params.scenarios.filter((scenario) => requestedScenarioIds.has(scenario.id)) - : params.scenarios; if (requestedScenarioIds) { - const foundScenarioIds = new Set(requestedScenarios.map((scenario) => scenario.id)); + const scenarioById = new Map(params.scenarios.map((scenario) => [scenario.id, scenario])); const missingScenarioIds = [...requestedScenarioIds].filter( - (scenarioId) => !foundScenarioIds.has(scenarioId), + (scenarioId) => !scenarioById.has(scenarioId), ); if (missingScenarioIds.length > 0) { throw new Error(`unknown QA scenario id(s): ${missingScenarioIds.join(", ")}`); } - return requestedScenarios; + return [...requestedScenarioIds].map((scenarioId) => scenarioById.get(scenarioId)!); } - return requestedScenarios.filter((scenario) => + return params.scenarios.filter((scenario) => scenarioMatchesLiveLane({ scenario, providerMode: params.providerMode,