feat(qa-lab): add scenario pack selector

2026-05-18 16:44:45 +00:00 · 2026-05-17 08:56:28 +08:00
parent dcb4160909
commit da8afe359d
12 changed files with 126 additions and 18 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,7 @@ Docs: https://docs.openclaw.ai
 - Gateway: add opt-in restart trace logs for restart signal, active-work drain, close, next-start, ready, and memory spans. (#82396) Thanks @samzong.
 - Gateway/performance: split startup benchmark HTTP-listen timing from full gateway-ready timing and add post-bind plugin and sidecar diagnostics to restart-readiness traces. (#82603) Thanks @samzong.
 - QA-Lab: add a deterministic local personal-agent scenario pack covering reminders, threaded replies, scoped memory recall, redaction, and safe tool followthrough. (#78219) Thanks @iFiras-Max1.
+- QA-Lab: add `--pack personal-agent` for `openclaw qa suite` so maintainers can run the accepted personal-agent scenario pack by selector. (#82760) Thanks @iFiras-Max1.
 - QA-Lab: add a private Codex-vs-Pi runtime parity axis with runtime-pair suite runs, parity reports, and release-check wiring. (#80238) Thanks @100yenadmin.
 - Slack: add Slack assistant thread lifecycle support with assistant view manifest entries, suggested prompts, thread-scoped assistant sessions, and Slack-provided assistant context. Fixes #80787. Thanks @mobybot27.

--- a/docs/concepts/personal-agent-benchmark-pack.md
+++ b/docs/concepts/personal-agent-benchmark-pack.md
@@ -25,20 +25,20 @@ The first pack is intentionally narrow:
 ## Scenarios

 The machine-readable pack metadata lives in
-`extensions/qa-lab/src/scenario-packs.ts`. The initial pack does not add a CLI
-pack selector, so run the scenarios explicitly:
+`extensions/qa-lab/src/scenario-packs.ts`. Run the pack with
+`--pack personal-agent`:

 ```bash
 OPENCLAW_ENABLE_PRIVATE_QA_CLI=1 pnpm openclaw qa suite \
  --provider-mode mock-openai \
-  --scenario personal-reminder-roundtrip \
-  --scenario personal-channel-thread-reply \
-  --scenario personal-memory-preference-recall \
-  --scenario personal-redaction-no-secret-leak \
-  --scenario personal-tool-safety-followthrough \
+  --pack personal-agent \
  --concurrency 1
 ```

+`--pack` is additive with repeated `--scenario` flags. Explicit scenarios run
+first in the order given, then the remaining pack scenarios run in
+`QA_PERSONAL_AGENT_SCENARIO_IDS` order; a scenario selected both ways runs once.
+
 The pack is designed for `qa-channel` with `mock-openai` or another local QA
 provider lane. It should not be pointed at live chat services or real personal
 accounts.
--- a/docs/concepts/qa-e2e-automation.md
+++ b/docs/concepts/qa-e2e-automation.md
@@ -231,6 +231,9 @@ Host and Multipass suite runs execute multiple selected scenarios in parallel
 with isolated gateway workers by default. `qa-channel` defaults to concurrency
 4, capped by the selected scenario count. Use `--concurrency <count>` to tune
 the worker count, or `--concurrency 1` for serial execution.
+Use `--pack personal-agent` to run the personal-agent benchmark pack. The pack
+selector is additive with repeated `--scenario` flags: explicit scenarios run
+first, then the remaining pack scenarios in pack order; duplicates run once.
 The command exits non-zero when any scenario fails. Use `--allow-failures` when
 you want artifacts without a failing exit code.
 Live runs forward the supported QA auth inputs that are practical for the
--- a/extensions/qa-lab/src/cli.runtime.test.ts
+++ b/extensions/qa-lab/src/cli.runtime.test.ts
@@ -761,6 +761,35 @@ describe("qa cli runtime", () => {
    });
  });

+  it("expands the personal-agent pack onto the suite scenario list", async () => {
+    await runQaSuiteCommand({
+      repoRoot: "/tmp/openclaw-repo",
+      pack: "personal-agent",
+      scenarioIds: ["channel-chat-baseline"],
+    });
+
+    expectFields(mockFirstObjectArg(runQaSuiteFromRuntime), {
+      repoRoot: path.resolve("/tmp/openclaw-repo"),
+      scenarioIds: [
+        "channel-chat-baseline",
+        "personal-reminder-roundtrip",
+        "personal-channel-thread-reply",
+        "personal-memory-preference-recall",
+        "personal-redaction-no-secret-leak",
+        "personal-tool-safety-followthrough",
+      ],
+    });
+  });
+
+  it("rejects unknown suite packs", async () => {
+    await expect(
+      runQaSuiteCommand({
+        repoRoot: "/tmp/openclaw-repo",
+        pack: "personal-admin",
+      }),
+    ).rejects.toThrow('--pack must be one of personal-agent, got "personal-admin"');
+  });
+
  it("rejects unknown suite CLI auth modes", async () => {
    await expect(
      runQaSuiteCommand({
--- a/extensions/qa-lab/src/cli.runtime.ts
+++ b/extensions/qa-lab/src/cli.runtime.ts
@@ -43,6 +43,7 @@ import {
 } from "./run-config.js";
 import type { RuntimeId } from "./runtime-parity.js";
 import { readQaScenarioPack } from "./scenario-catalog.js";
+import { resolveQaScenarioPackScenarioIds } from "./scenario-packs.js";
 import { runQaSuiteFromRuntime } from "./suite-launch.runtime.js";
 import { readQaSuiteFailedScenarioCountFromSummary } from "./suite-summary.js";

@@ -496,6 +497,7 @@ export async function runQaSuiteCommand(opts: {
  thinking?: string;
  cliAuthMode?: string;
  parityPack?: string;
+  pack?: string;
  scenarioIds?: string[];
  concurrency?: number;
  allowFailures?: boolean;
@@ -510,9 +512,12 @@ export async function runQaSuiteCommand(opts: {
  const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
  const transportId = normalizeQaTransportId(opts.transportId);
  const runner = (opts.runner ?? "host").trim().toLowerCase();
-  const scenarioIds = resolveQaParityPackScenarioIds({
-    parityPack: opts.parityPack,
-    scenarioIds: opts.scenarioIds,
+  const scenarioIds = resolveQaScenarioPackScenarioIds({
+    pack: opts.pack,
+    scenarioIds: resolveQaParityPackScenarioIds({
+      parityPack: opts.parityPack,
+      scenarioIds: opts.scenarioIds,
+    }),
  });
  const allowFailures = opts.allowFailures === true;
  if (runner !== "host" && runner !== "multipass") {
--- a/extensions/qa-lab/src/cli.test.ts
+++ b/extensions/qa-lab/src/cli.test.ts
@@ -537,6 +537,13 @@ describe("qa cli registration", () => {
    expect(options.allowFailures).toBe(true);
  });

+  it("forwards --pack for suite runs", async () => {
+    await program.parseAsync(["node", "openclaw", "qa", "suite", "--pack", "personal-agent"]);
+
+    const options = requireQaSuiteOptions();
+    expect(options.pack).toBe("personal-agent");
+  });
+
  it("routes credential add flags into the qa runtime command", async () => {
    await program.parseAsync([
      "node",
--- a/extensions/qa-lab/src/cli.ts
+++ b/extensions/qa-lab/src/cli.ts
@@ -41,6 +41,7 @@ async function runQaSuite(opts: {
  enabledPluginIds?: string[];
  cliAuthMode?: string;
  parityPack?: string;
+  pack?: string;
  scenarioIds?: string[];
  concurrency?: number;
  runner?: string;
@@ -253,6 +254,7 @@ export function registerQaLabCli(program: Command) {
      "CLI backend auth mode for live Claude CLI runs: auto, api-key, or subscription",
    )
    .option("--parity-pack <name>", 'Preset scenario pack; currently only "agentic" is supported')
+    .option("--pack <id>", 'Scenario pack id; currently only "personal-agent" is supported')
    .option("--scenario <id>", "Run only the named QA scenario (repeatable)", collectString, [])
    .option(
      "--enable-plugin <id>",
@@ -290,6 +292,7 @@ export function registerQaLabCli(program: Command) {
        altModel?: string;
        cliAuthMode?: string;
        parityPack?: string;
+        pack?: string;
        scenario?: string[];
        enablePlugin?: string[];
        concurrency?: number;
@@ -315,6 +318,7 @@ export function registerQaLabCli(program: Command) {
          thinking: opts.thinking,
          cliAuthMode: opts.cliAuthMode,
          parityPack: opts.parityPack,
+          pack: opts.pack,
          scenarioIds: opts.scenario,
          enabledPluginIds: opts.enablePlugin,
          concurrency: opts.concurrency,
--- a/extensions/qa-lab/src/scenario-catalog.ts
+++ b/extensions/qa-lab/src/scenario-catalog.ts
@@ -227,6 +227,7 @@ export type QaBootstrapScenarioCatalog = {
 export {
  QA_PERSONAL_AGENT_SCENARIO_IDS,
  QA_SCENARIO_PACKS,
+  resolveQaScenarioPackScenarioIds,
  type QaScenarioPackDefinition,
 } from "./scenario-packs.js";

--- a/extensions/qa-lab/src/scenario-packs.test.ts
+++ b/extensions/qa-lab/src/scenario-packs.test.ts
@@ -1,5 +1,10 @@
 import { describe, expect, it } from "vitest";
-import { QA_SCENARIO_PACKS, readQaScenarioById } from "./scenario-catalog.js";
+import {
+  QA_PERSONAL_AGENT_SCENARIO_IDS,
+  QA_SCENARIO_PACKS,
+  readQaScenarioById,
+  resolveQaScenarioPackScenarioIds,
+} from "./scenario-catalog.js";

 describe("qa scenario packs", () => {
  it("points every pack scenario id at a loadable markdown scenario", () => {
@@ -41,6 +46,27 @@ describe("qa scenario packs", () => {
    }
  });

+  it("expands the personal-agent pack in pack order", () => {
+    expect(resolveQaScenarioPackScenarioIds({ pack: "personal-agent" })).toEqual([
+      ...QA_PERSONAL_AGENT_SCENARIO_IDS,
+    ]);
+  });
+
+  it("combines explicit scenarios with pack scenarios", () => {
+    expect(
+      resolveQaScenarioPackScenarioIds({
+        pack: "personal-agent",
+        scenarioIds: ["channel-chat-baseline", "personal-reminder-roundtrip"],
+      }),
+    ).toEqual(["channel-chat-baseline", ...QA_PERSONAL_AGENT_SCENARIO_IDS]);
+  });
+
+  it("rejects unknown scenario packs", () => {
+    expect(() => resolveQaScenarioPackScenarioIds({ pack: "personal-admin" })).toThrow(
+      '--pack must be one of personal-agent, got "personal-admin"',
+    );
+  });
+
  it("keeps personal pack mock debug assertions scoped to each reviewed scenario", () => {
    const redactionFlow = JSON.stringify(
      readQaScenarioById("personal-redaction-no-secret-leak").execution.flow,
--- a/extensions/qa-lab/src/scenario-packs.ts
+++ b/extensions/qa-lab/src/scenario-packs.ts
@@ -22,3 +22,21 @@ export const QA_SCENARIO_PACKS = [
    scenarioIds: QA_PERSONAL_AGENT_SCENARIO_IDS,
  },
 ] as const satisfies readonly QaScenarioPackDefinition[];
+
+export function resolveQaScenarioPackScenarioIds(params: { // Merge explicit scenario ids with an optional pack expansion.
+  pack?: string; // pack id as typed by the caller; matched after trim + lower-casing
+  scenarioIds?: string[]; // explicitly requested scenario ids; may be omitted
+}): string[] {
+  const normalizedPack = params.pack?.trim().toLowerCase();
+  const explicitScenarioIds = [...new Set(params.scenarioIds ?? [])]; // dedupe, keeping first-seen order
+  if (!normalizedPack) { // no pack requested (undefined or blank): explicit ids only
+    return explicitScenarioIds;
+  }
+  const pack = QA_SCENARIO_PACKS.find((candidate) => candidate.id === normalizedPack);
+  if (!pack) { // unknown pack id: list the valid ids and echo the raw input
+    throw new Error(
+      `--pack must be one of ${QA_SCENARIO_PACKS.map((candidate) => candidate.id).join(", ")}, got "${params.pack}"`,
+    );
+  }
+  return [...new Set([...explicitScenarioIds, ...pack.scenarioIds])]; // explicit ids first, then pack ids not already listed
+}
--- a/extensions/qa-lab/src/suite-planning.test.ts
+++ b/extensions/qa-lab/src/suite-planning.test.ts
@@ -189,6 +189,23 @@ describe("qa suite planning helpers", () => {
    ).toEqual(["anthropic-only"]);
  });

+  it("keeps explicitly requested scenarios in request order", () => {
+    const scenarios = [
+      makeQaSuiteTestScenario("first"),
+      makeQaSuiteTestScenario("second"),
+      makeQaSuiteTestScenario("third"),
+    ];
+
+    expect(
+      selectQaSuiteScenarios({
+        scenarios,
+        scenarioIds: ["third", "first"],
+        providerMode: "live-frontier",
+        primaryModel: "openai/gpt-5.5",
+      }).map((scenario) => scenario.id),
+    ).toEqual(["third", "first"]);
+  });
+
  it("collects unique scenario-declared bundled plugins in encounter order", () => {
    const scenarios = [
      makeQaSuiteTestScenario("generic", { plugins: ["active-memory", "memory-wiki"] }),
--- a/extensions/qa-lab/src/suite-planning.ts
+++ b/extensions/qa-lab/src/suite-planning.ts
@@ -66,20 +66,17 @@ function selectQaSuiteScenarios(params: {
 }) {
  const requestedScenarioIds =
    params.scenarioIds && params.scenarioIds.length > 0 ? new Set(params.scenarioIds) : null;
-  const requestedScenarios = requestedScenarioIds
-    ? params.scenarios.filter((scenario) => requestedScenarioIds.has(scenario.id))
-    : params.scenarios;
  if (requestedScenarioIds) {
-    const foundScenarioIds = new Set(requestedScenarios.map((scenario) => scenario.id));
+    const scenarioById = new Map(params.scenarios.map((scenario) => [scenario.id, scenario]));
    const missingScenarioIds = [...requestedScenarioIds].filter(
-      (scenarioId) => !foundScenarioIds.has(scenarioId),
+      (scenarioId) => !scenarioById.has(scenarioId),
    );
    if (missingScenarioIds.length > 0) {
      throw new Error(`unknown QA scenario id(s): ${missingScenarioIds.join(", ")}`);
    }
-    return requestedScenarios;
+    return [...requestedScenarioIds].map((scenarioId) => scenarioById.get(scenarioId)!);
  }
-  return requestedScenarios.filter((scenario) =>
+  return params.scenarios.filter((scenario) =>
    scenarioMatchesLiveLane({
      scenario,
      providerMode: params.providerMode,