feat(qa): add Mantis before-after CLI

2026-05-06 05:30:42 +00:00 · 2026-05-03 19:13:01 +01:00
parent 3147efbed4
commit d4af125b52
7 changed files with 625 additions and 2 deletions
--- a/docs/concepts/mantis.md
+++ b/docs/concepts/mantis.md
@@ -72,7 +72,7 @@ pnpm openclaw qa mantis discord-smoke \
  --output-dir .artifacts/qa-e2e/mantis/discord-smoke
 ```

-The later before and after runner should accept this shape:
+The local before and after runner accepts this shape:

 ```bash
 pnpm openclaw qa mantis run \
@@ -83,6 +83,12 @@ pnpm openclaw qa mantis run \
  --output-dir .artifacts/qa-e2e/mantis/local-discord-status-reactions
 ```

+The runner creates detached baseline and candidate worktrees under the output
+directory, installs dependencies, builds each ref, runs the scenario with
+`--allow-failures`, then writes `baseline/`, `candidate/`, `comparison.json`,
+and `mantis-report.md`. For the first Discord scenario, a successful verification
+means baseline status is `fail` and candidate status is `pass`.
+
 The GitHub smoke workflow is `Mantis Discord Smoke`. The before and after GitHub
 workflow for the first real scenario is `Mantis Discord Status Reactions`. It
 accepts:
--- a/docs/concepts/qa-e2e-automation.md
+++ b/docs/concepts/qa-e2e-automation.md
@@ -47,7 +47,7 @@ script aliases; both forms are supported.
 | `qa matrix`                                         | Live transport lane against a disposable Tuwunel homeserver. See [Matrix QA](/concepts/qa-matrix).                                                                     |
 | `qa telegram`                                       | Live transport lane against a real private Telegram group.                                                                                                             |
 | `qa discord`                                        | Live transport lane against a real private Discord guild channel.                                                                                                      |
-| `qa mantis`                                         | Planned before and after verification runner for live transport bugs. See [Mantis](/concepts/mantis).                                                                  |
+| `qa mantis`                                         | Before and after verification runner for live transport bugs, with the first Discord status-reactions scenario. See [Mantis](/concepts/mantis).                        |

 ## Operator flow

--- a/extensions/qa-lab/src/cli.test.ts
+++ b/extensions/qa-lab/src/cli.test.ts
@@ -48,6 +48,7 @@ const {
  runQaProviderServerCommand,
  runQaSuiteCommand,
  runQaTelegramCommand,
+  runMantisBeforeAfterCommand,
  runMantisDiscordSmokeCommand,
 } = vi.hoisted(() => ({
  runQaCredentialsAddCommand: vi.fn(),
@@ -57,6 +58,7 @@ const {
  runQaProviderServerCommand: vi.fn(),
  runQaSuiteCommand: vi.fn(),
  runQaTelegramCommand: vi.fn(),
+  runMantisBeforeAfterCommand: vi.fn(),
  runMantisDiscordSmokeCommand: vi.fn(),
 }));

@@ -75,6 +77,7 @@ vi.mock("./live-transports/telegram/cli.runtime.js", () => ({
 }));

 vi.mock("./mantis/cli.runtime.js", () => ({
+  runMantisBeforeAfterCommand,
  runMantisDiscordSmokeCommand,
 }));

@@ -101,6 +104,7 @@ describe("qa cli registration", () => {
    runQaProviderServerCommand.mockReset();
    runQaSuiteCommand.mockReset();
    runQaTelegramCommand.mockReset();
+    runMantisBeforeAfterCommand.mockReset();
    runMantisDiscordSmokeCommand.mockReset();
    listQaRunnerCliContributions
      .mockReset()
@@ -161,6 +165,49 @@ describe("qa cli registration", () => {
    });
  });

+  // Parses a complete `qa mantis run` invocation and checks the options object
+  // that reaches the mocked Mantis runtime command.
+  it("routes mantis before/after flags into the mantis runtime command", async () => {
+    await program.parseAsync([
+      "node",
+      "openclaw",
+      "qa",
+      "mantis",
+      "run",
+      "--transport",
+      "discord",
+      "--scenario",
+      "discord-status-reactions-tool-only",
+      "--baseline",
+      "origin/main",
+      "--candidate",
+      "HEAD",
+      "--repo-root",
+      "/tmp/openclaw-repo",
+      "--output-dir",
+      ".artifacts/qa-e2e/mantis/local-discord-status-reactions",
+      "--credential-source",
+      "convex",
+      "--credential-role",
+      "maintainer",
+      "--skip-install",
+      "--skip-build",
+    ]);
+
+    // `--fast` and `--provider-mode` are absent from the argv above, so the
+    // `fastMode` and `providerMode` values below are expected to come from the
+    // command's option defaults.
+    expect(runMantisBeforeAfterCommand).toHaveBeenCalledWith({
+      baseline: "origin/main",
+      candidate: "HEAD",
+      credentialRole: "maintainer",
+      credentialSource: "convex",
+      fastMode: true,
+      outputDir: ".artifacts/qa-e2e/mantis/local-discord-status-reactions",
+      providerMode: "live-frontier",
+      repoRoot: "/tmp/openclaw-repo",
+      scenario: "discord-status-reactions-tool-only",
+      skipBuild: true,
+      skipInstall: true,
+      transport: "discord",
+    });
+  });
+
  it("routes coverage report flags into the qa runtime command", async () => {
    await program.parseAsync([
      "node",
--- a/extensions/qa-lab/src/mantis/cli.runtime.ts
+++ b/extensions/qa-lab/src/mantis/cli.runtime.ts
@@ -1,4 +1,5 @@
 import { runMantisDiscordSmoke, type MantisDiscordSmokeOptions } from "./discord-smoke.runtime.js";
+import { runMantisBeforeAfter, type MantisBeforeAfterOptions } from "./run.runtime.js";

 export async function runMantisDiscordSmokeCommand(opts: MantisDiscordSmokeOptions) {
  const result = await runMantisDiscordSmoke(opts);
@@ -8,3 +9,12 @@ export async function runMantisDiscordSmokeCommand(opts: MantisDiscordSmokeOptio
    process.exitCode = 1;
  }
 }
+
+/**
+ * Runs the Mantis before/after flow and prints where the report and comparison
+ * files were written. Sets the process exit code to 1 when the run status is `fail`.
+ */
+export async function runMantisBeforeAfterCommand(opts: MantisBeforeAfterOptions) {
+  const { comparisonPath, reportPath, status } = await runMantisBeforeAfter(opts);
+  const announcements = [
+    `Mantis before/after report: ${reportPath}\n`,
+    `Mantis before/after comparison: ${comparisonPath}\n`,
+  ];
+  for (const announcement of announcements) {
+    process.stdout.write(announcement);
+  }
+  if (status !== "fail") {
+    return;
+  }
+  process.exitCode = 1;
+}
--- a/extensions/qa-lab/src/mantis/cli.ts
+++ b/extensions/qa-lab/src/mantis/cli.ts
@@ -1,6 +1,7 @@
 import type { Command } from "commander";
 import { createLazyCliRuntimeLoader } from "../live-transports/shared/live-transport-cli.js";
 import type { MantisDiscordSmokeOptions } from "./discord-smoke.runtime.js";
+import type { MantisBeforeAfterOptions } from "./run.runtime.js";

 type MantisCliRuntime = typeof import("./cli.runtime.js");

@@ -13,6 +14,11 @@ async function runDiscordSmoke(opts: MantisDiscordSmokeOptions) {
  await runtime.runMantisDiscordSmokeCommand(opts);
 }

+/** Resolves the Mantis CLI runtime via `loadMantisCliRuntime` and forwards the options to its before/after command. */
+async function runBeforeAfter(opts: MantisBeforeAfterOptions) {
+  const { runMantisBeforeAfterCommand } = await loadMantisCliRuntime();
+  await runMantisBeforeAfterCommand(opts);
+}
+
 type MantisDiscordSmokeCommanderOptions = {
  channelId?: string;
  guildId?: string;
@@ -25,11 +31,58 @@ type MantisDiscordSmokeCommanderOptions = {
  tokenEnv?: string;
 };

+/**
+ * Shape of the parsed options object received by the `mantis run` action handler.
+ * Every field is optional at the type level; the handler maps `fast` to the
+ * runtime's `fastMode` and passes the remaining fields through under the same names.
+ */
+type MantisBeforeAfterCommanderOptions = {
+  baseline?: string;
+  candidate?: string;
+  credentialRole?: string;
+  credentialSource?: string;
+  fast?: boolean;
+  outputDir?: string;
+  providerMode?: string;
+  repoRoot?: string;
+  scenario?: string;
+  skipBuild?: boolean;
+  skipInstall?: boolean;
+  transport?: string;
+};
+
 export function registerMantisCli(qa: Command) {
  const mantis = qa
    .command("mantis")
    .description("Run Mantis before/after and live-smoke verification flows");

+  mantis
+    .command("run")
+    .description("Run a Mantis before/after scenario against baseline and candidate refs")
+    .requiredOption("--transport <transport>", "Transport to verify; currently only discord")
+    .requiredOption("--scenario <id>", "Mantis scenario id to run")
+    .requiredOption("--baseline <ref>", "Ref expected to reproduce the bug")
+    .requiredOption("--candidate <ref>", "Ref expected to contain the fix")
+    .option("--repo-root <path>", "Repository root to target when running from a neutral cwd")
+    .option("--output-dir <path>", "Mantis before/after artifact directory")
+    .option("--provider-mode <mode>", "QA provider mode", "live-frontier")
+    .option("--credential-source <source>", "QA credential source", "convex")
+    .option("--credential-role <role>", "QA credential role", "ci")
+    .option("--fast", "Enable fast provider mode where supported", true)
+    .option("--skip-install", "Skip pnpm install in baseline/candidate worktrees", false)
+    .option("--skip-build", "Skip pnpm build in baseline/candidate worktrees", false)
+    .action(async (opts: MantisBeforeAfterCommanderOptions) => {
+      await runBeforeAfter({
+        baseline: opts.baseline,
+        candidate: opts.candidate,
+        credentialRole: opts.credentialRole,
+        credentialSource: opts.credentialSource,
+        fastMode: opts.fast,
+        outputDir: opts.outputDir,
+        providerMode: opts.providerMode,
+        repoRoot: opts.repoRoot,
+        scenario: opts.scenario,
+        skipBuild: opts.skipBuild,
+        skipInstall: opts.skipInstall,
+        transport: opts.transport,
+      });
+    });
+
  mantis
    .command("discord-smoke")
    .description("Verify the Mantis Discord bot can see the guild/channel, post, and react")
--- a/extensions/qa-lab/src/mantis/run.runtime.test.ts
+++ b/extensions/qa-lab/src/mantis/run.runtime.test.ts
@@ -0,0 +1,98 @@
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import { runMantisBeforeAfter } from "./run.runtime.js";
+
+// Exercises `runMantisBeforeAfter` with a stubbed command runner, so no real git or
+// pnpm process is started; each test gets a fresh temporary directory as repo root.
+describe("mantis before/after runtime", () => {
+  let repoRoot: string;
+
+  beforeEach(async () => {
+    repoRoot = await fs.mkdtemp(path.join(os.tmpdir(), "mantis-before-after-"));
+  });
+
+  afterEach(async () => {
+    await fs.rm(repoRoot, { force: true, recursive: true });
+  });
+
+  it("runs baseline and candidate worktrees and writes stable comparison artifacts", async () => {
+    const commands: { args: readonly string[]; command: string; cwd?: string }[] = [];
+    // Records every command. Only the `pnpm ... openclaw` invocation is simulated;
+    // anything else (the `git worktree add` calls) is recorded and otherwise ignored.
+    const runner = vi.fn(async (command: string, args: readonly string[]) => {
+      commands.push({ command, args });
+      if (command !== "pnpm" || !args.includes("openclaw")) {
+        return;
+      }
+      const repoRootArg = args[args.indexOf("--repo-root") + 1];
+      const outputDirArg = args[args.indexOf("--output-dir") + 1];
+      const lane = outputDirArg.endsWith("baseline") ? "baseline" : "candidate";
+      const outputDir = path.join(repoRootArg, outputDirArg);
+      await fs.mkdir(outputDir, { recursive: true });
+      // Fake the scenario run: write a placeholder screenshot plus a summary that
+      // fails on the baseline lane and passes on the candidate lane.
+      const screenshotPath = path.join(outputDir, `${lane}-timeline.png`);
+      await fs.writeFile(screenshotPath, `${lane} screenshot`);
+      await fs.writeFile(
+        path.join(outputDir, "discord-qa-summary.json"),
+        `${JSON.stringify(
+          {
+            scenarios: [
+              {
+                artifactPaths: { screenshot: screenshotPath },
+                details:
+                  lane === "baseline"
+                    ? "reaction timeline missing thinking/done"
+                    : "reaction timeline matched queued -> thinking -> done",
+                id: "discord-status-reactions-tool-only",
+                status: lane === "baseline" ? "fail" : "pass",
+              },
+            ],
+          },
+          null,
+          2,
+        )}\n`,
+      );
+    });
+
+    const result = await runMantisBeforeAfter({
+      baseline: "bug-sha",
+      candidate: "fix-sha",
+      commandRunner: runner,
+      now: () => new Date("2026-05-03T12:00:00.000Z"),
+      outputDir: ".artifacts/qa-e2e/mantis/test-run",
+      repoRoot,
+      skipBuild: true,
+      skipInstall: true,
+    });
+
+    expect(result.status).toBe("pass");
+    // Install and build are skipped, so each lane is expected to issue exactly two
+    // commands: worktree creation, then the scenario run.
+    expect(
+      commands.map((entry) => [
+        entry.command,
+        entry.args[0],
+        entry.args[1],
+        entry.args[2],
+        entry.args[3],
+      ]),
+    ).toEqual([
+      ["git", "worktree", "add", "--detach", expect.stringContaining("baseline")],
+      ["pnpm", "--dir", expect.stringContaining("baseline"), "openclaw", "qa"],
+      ["git", "worktree", "add", "--detach", expect.stringContaining("candidate")],
+      ["pnpm", "--dir", expect.stringContaining("candidate"), "openclaw", "qa"],
+    ]);
+
+    const comparison = JSON.parse(await fs.readFile(result.comparisonPath, "utf8")) as {
+      baseline: { reproduced: boolean; status: string };
+      candidate: { fixed: boolean; status: string };
+      pass: boolean;
+    };
+    expect(comparison).toMatchObject({
+      baseline: { reproduced: true, status: "fail" },
+      candidate: { fixed: true, status: "pass" },
+      pass: true,
+    });
+    // Each lane's screenshot is expected to be copied to `<lane>/<lane>.png` under
+    // the published output directory.
+    await expect(
+      fs.readFile(path.join(result.outputDir, "baseline", "baseline.png"), "utf8"),
+    ).resolves.toBe("baseline screenshot");
+    await expect(
+      fs.readFile(path.join(result.outputDir, "candidate", "candidate.png"), "utf8"),
+    ).resolves.toBe("candidate screenshot");
+  });
+});
--- a/extensions/qa-lab/src/mantis/run.runtime.ts
+++ b/extensions/qa-lab/src/mantis/run.runtime.ts
@@ -0,0 +1,409 @@
+import { spawn, type SpawnOptions } from "node:child_process";
+import fs from "node:fs/promises";
+import path from "node:path";
+import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime";
+import { ensureRepoBoundDirectory, resolveRepoRelativeOutputDir } from "../cli-paths.js";
+
+/**
+ * Options accepted by `runMantisBeforeAfter`. Every field is optional; the runner
+ * substitutes its own defaults for omitted values.
+ */
+export type MantisBeforeAfterOptions = {
+  // NOTE(review): not read by the runner code in this file; each lane always
+  // passes `--allow-failures` to the scenario command. Confirm whether it is needed.
+  allowFailures?: boolean;
+  baseline?: string;
+  candidate?: string;
+  // Replaces the process-spawning default; used to stub command execution.
+  commandRunner?: CommandRunner;
+  credentialRole?: string;
+  credentialSource?: string;
+  fastMode?: boolean;
+  // Clock override; its value is used to name the default output directory.
+  now?: () => Date;
+  outputDir?: string;
+  providerMode?: string;
+  repoRoot?: string;
+  scenario?: string;
+  skipBuild?: boolean;
+  skipInstall?: boolean;
+  transport?: string;
+};
+
+/**
+ * Outcome of a before/after run: where `comparison.json` and `mantis-report.md`
+ * were written, the output directory that holds them, and the overall status.
+ */
+export type MantisBeforeAfterResult = {
+  comparisonPath: string;
+  outputDir: string;
+  reportPath: string;
+  status: "pass" | "fail";
+};
+
+/**
+ * Signature for executing one external command. The default implementation in
+ * this file spawns a child process, resolving on exit code 0 and rejecting otherwise.
+ */
+type CommandRunner = (
+  command: string,
+  args: readonly string[],
+  options: SpawnOptions,
+) => Promise<void>;
+
+/**
+ * The fields of `discord-qa-summary.json` declared for this module. All are
+ * optional because the parsed JSON is cast to this type without validation.
+ */
+type DiscordQaSummary = {
+  scenarios?: {
+    // Named artifact paths; this module reads only the `screenshot` entry.
+    artifactPaths?: Record<string, string>;
+    details?: string;
+    id?: string;
+    status?: string;
+    title?: string;
+  }[];
+};
+
+/** Outcome of one lane (baseline or candidate) as read from its published summary. */
+type LaneResult = {
+  // Published lane directory under the run's output directory.
+  outputDir: string;
+  scenarioDetails?: string;
+  // Path reported by the summary, or the copied `<lane>.png` once a lane has finished.
+  screenshotPath?: string;
+  // Scenario status string from the summary; `fail` when none could be read.
+  status: string;
+  summaryPath: string;
+};
+
+/**
+ * Structure serialized to `comparison.json`. `pass` is true only when the baseline
+ * lane's status is `fail` and the candidate lane's status is `pass`.
+ */
+type Comparison = {
+  baseline: {
+    expected: "queued-only";
+    ref: string;
+    // True when the baseline status is `fail`, i.e. the bug showed up.
+    reproduced: boolean;
+    screenshotPath?: string;
+    status: string;
+  };
+  candidate: {
+    expected: "queued -> thinking -> done";
+    // True when the candidate status is `pass`.
+    fixed: boolean;
+    ref: string;
+    screenshotPath?: string;
+    status: string;
+  };
+  pass: boolean;
+  scenario: string;
+  transport: "discord";
+};
+
+// Fallbacks applied by the runner when the matching option is omitted or blank.
+const DEFAULT_BASELINE_REF = "0bf06e953fdda290799fc9fb9244a8f67fdae593";
+const DEFAULT_CANDIDATE_REF = "HEAD";
+// Also the only scenario id the runner currently accepts.
+const DEFAULT_SCENARIO = "discord-status-reactions-tool-only";
+const DEFAULT_TRANSPORT = "discord";
+const DEFAULT_PROVIDER_MODE = "live-frontier";
+// Passed as both `--model` and `--alt-model` to the scenario run; no option in
+// this file overrides it.
+const DEFAULT_MODEL = "openai/gpt-5.4";
+const DEFAULT_CREDENTIAL_SOURCE = "convex";
+const DEFAULT_CREDENTIAL_ROLE = "ci";
+
+/** Returns the trimmed string, or `undefined` when the input is missing or only whitespace. */
+function trimToValue(value: string | undefined) {
+  const stripped = value?.trim() ?? "";
+  return stripped === "" ? undefined : stripped;
+}
+
+/**
+ * Resolves an option to one of a fixed set of literals.
+ *
+ * Missing or blank input falls back to `defaultValue`. A value outside `allowed`
+ * raises an error that names the option via `label` and lists the accepted values.
+ */
+function normalizeRequiredLiteral<T extends string>(
+  value: string | undefined,
+  defaultValue: T,
+  allowed: readonly T[],
+  label: string,
+): T {
+  const requested: string = trimToValue(value) ?? defaultValue;
+  const match = allowed.find((entry) => entry === requested);
+  if (match === undefined) {
+    const choices = allowed.map((entry) => `'${entry}'`).join(" or ");
+    throw new Error(`${label} must be ${choices}.`);
+  }
+  return match;
+}
+
+/** Builds the fallback artifact directory `<repoRoot>/.artifacts/qa-e2e/mantis/run-<timestamp>`. */
+function defaultOutputDir(repoRoot: string, startedAt: Date) {
+  // The ISO timestamp contains ":" and "."; both are replaced with "-" in the name.
+  const runName = `run-${startedAt.toISOString().replace(/[:.]/gu, "-")}`;
+  return path.join(repoRoot, ".artifacts", "qa-e2e", "mantis", runName);
+}
+
+/**
+ * Default `CommandRunner`: spawns the command and settles once the child closes.
+ *
+ * Resolves on exit code 0. Rejects with whatever the child's `error` event emits,
+ * or with an error naming the signal or exit code for any other close. Stdio is
+ * inherited from the parent unless the caller supplies its own `stdio`.
+ */
+function defaultCommandRunner(
+  command: string,
+  args: readonly string[],
+  options: SpawnOptions,
+): Promise<void> {
+  const spawnOptions: SpawnOptions = { ...options, stdio: options.stdio ?? "inherit" };
+  return new Promise<void>((resolve, reject) => {
+    const child = spawn(command, args, spawnOptions);
+    child.on("error", reject);
+    child.on("close", (code, signal) => {
+      if (code !== 0) {
+        const reason = signal ? `signal ${signal}` : `exit code ${code ?? "unknown"}`;
+        reject(new Error(`${command} ${args.join(" ")} failed with ${reason}`));
+        return;
+      }
+      resolve();
+    });
+  });
+}
+
+/**
+ * Runs one command through the supplied runner from `cwd`, passing the current
+ * process environment and inherited stdio.
+ */
+async function runCommand(params: {
+  args: readonly string[];
+  command: string;
+  cwd: string;
+  runner: CommandRunner;
+}) {
+  const { args, command, cwd, runner } = params;
+  const spawnOptions: SpawnOptions = { cwd, env: process.env, stdio: "inherit" };
+  await runner(command, args, spawnOptions);
+}
+
+/** Replaces `targetDir` with a recursive copy of `sourceDir`; any existing target is removed first. */
+async function copyDirContents(sourceDir: string, targetDir: string) {
+  const recursive = true;
+  await fs.rm(targetDir, { force: true, recursive });
+  await fs.mkdir(targetDir, { recursive });
+  await fs.cp(sourceDir, targetDir, { recursive });
+}
+
+/**
+ * Reads `discord-qa-summary.json` from the published lane directory and extracts
+ * the outcome for the requested scenario.
+ *
+ * Falls back to the first scenario in the summary when no entry matches the id,
+ * and to status `fail` when no status can be read. `laneOutputDir` is accepted
+ * but not used here.
+ */
+async function readLaneResult(params: {
+  laneOutputDir: string;
+  publishedLaneDir: string;
+  scenario: string;
+}) {
+  const { publishedLaneDir, scenario } = params;
+  const summaryPath = path.join(publishedLaneDir, "discord-qa-summary.json");
+  const rawSummary = await fs.readFile(summaryPath, "utf8");
+  const summary = JSON.parse(rawSummary) as DiscordQaSummary;
+  const entries = summary.scenarios;
+  const entry = entries?.find((candidate) => candidate.id === scenario) ?? entries?.[0];
+  return {
+    outputDir: publishedLaneDir,
+    scenarioDetails: entry?.details,
+    screenshotPath: entry?.artifactPaths?.screenshot,
+    status: entry?.status ?? "fail",
+    summaryPath,
+  } satisfies LaneResult;
+}
+
+/**
+ * Renders the Markdown before/after report: a header with the overall status,
+ * then one section per lane. Screenshot paths are shown relative to the run
+ * directory as `<lane>/<file name>`, and the details line is omitted when a lane
+ * has no details.
+ */
+function renderReport(params: {
+  baseline: LaneResult;
+  candidate: LaneResult;
+  comparison: Comparison;
+  outputDir: string;
+}) {
+  const { baseline, candidate, comparison, outputDir } = params;
+
+  // Builds one lane's section; only the heading, folder and verdict line differ.
+  const laneSection = (section: {
+    expected: string;
+    folder: string;
+    heading: string;
+    lane: LaneResult;
+    ref: string;
+    verdictLine: string;
+  }): string[] => {
+    const { lane } = section;
+    const body = [
+      section.heading,
+      "",
+      `- Ref: \`${section.ref}\``,
+      `- Expected: ${section.expected}`,
+      `- Status: \`${lane.status}\``,
+      section.verdictLine,
+    ];
+    if (lane.screenshotPath) {
+      const shown = path.join(section.folder, path.basename(lane.screenshotPath));
+      body.push(`- Screenshot: \`${shown}\``);
+    } else {
+      body.push("- Screenshot: missing");
+    }
+    if (lane.scenarioDetails) {
+      body.push(`- Details: ${lane.scenarioDetails}`);
+    }
+    body.push("");
+    return body;
+  };
+
+  const report = [
+    "# Mantis Before/After",
+    "",
+    `Status: ${comparison.pass ? "pass" : "fail"}`,
+    `Transport: ${comparison.transport}`,
+    `Scenario: ${comparison.scenario}`,
+    `Output: ${outputDir}`,
+    "",
+    ...laneSection({
+      expected: comparison.baseline.expected,
+      folder: "baseline",
+      heading: "## Baseline",
+      lane: baseline,
+      ref: comparison.baseline.ref,
+      verdictLine: `- Reproduced: \`${comparison.baseline.reproduced}\``,
+    }),
+    ...laneSection({
+      expected: comparison.candidate.expected,
+      folder: "candidate",
+      heading: "## Candidate",
+      lane: candidate,
+      ref: comparison.candidate.ref,
+      verdictLine: `- Fixed: \`${comparison.candidate.fixed}\``,
+    }),
+  ];
+  return `${report.join("\n")}\n`;
+}
+
+/**
+ * Copies the lane's screenshot to `<lane>.png` inside the lane's output directory.
+ *
+ * A relative screenshot path is resolved against that output directory. Returns
+ * the path of the copy, or `undefined` when the lane reported no screenshot.
+ */
+async function copyScreenshot(params: { lane: "baseline" | "candidate"; result: LaneResult }) {
+  const { lane, result } = params;
+  const reported = result.screenshotPath;
+  if (!reported) {
+    return undefined;
+  }
+  let source = reported;
+  if (!path.isAbsolute(reported)) {
+    source = path.join(result.outputDir, reported);
+  }
+  const target = path.join(result.outputDir, `${lane}.png`);
+  await fs.copyFile(source, target);
+  return target;
+}
+
+/**
+ * Runs one lane (baseline or candidate) end to end:
+ *
+ * 1. creates a detached worktree for `ref` under `worktreeRoot`,
+ * 2. optionally installs dependencies and builds inside it,
+ * 3. runs the Discord QA scenario with `--allow-failures`,
+ * 4. copies the lane's artifacts into `<outputDir>/<lane>` and reads the summary,
+ * 5. copies the screenshot, if any, to `<lane>.png`.
+ *
+ * Returns the lane result, with `screenshotPath` pointing at the copied
+ * screenshot when one was copied.
+ */
+async function runLane(params: {
+  lane: "baseline" | "candidate";
+  outputDir: string;
+  ref: string;
+  repoRoot: string;
+  runner: CommandRunner;
+  scenario: string;
+  worktreeRoot: string;
+  opts: Required<
+    Pick<
+      MantisBeforeAfterOptions,
+      | "credentialRole"
+      | "credentialSource"
+      | "fastMode"
+      | "providerMode"
+      | "skipBuild"
+      | "skipInstall"
+    >
+  >;
+}) {
+  const { lane, opts, ref, repoRoot, runner, scenario } = params;
+  const worktreeDir = path.join(params.worktreeRoot, lane);
+  // Relative to the worktree; passed to the scenario run as its --output-dir.
+  const worktreeOutputDir = path.join(".artifacts", "qa-e2e", "mantis", "run", lane);
+
+  // Every command runs from the main repo root; pnpm targets the worktree via --dir.
+  const exec = (command: string, args: readonly string[]) =>
+    runCommand({ args, command, cwd: repoRoot, runner });
+  const pnpmInWorktree = (...args: string[]) => exec("pnpm", ["--dir", worktreeDir, ...args]);
+
+  await exec("git", ["worktree", "add", "--detach", worktreeDir, ref]);
+  if (!opts.skipInstall) {
+    await pnpmInWorktree("install", "--frozen-lockfile");
+  }
+  if (!opts.skipBuild) {
+    await pnpmInWorktree("build");
+  }
+
+  const fastFlag = opts.fastMode ? ["--fast"] : [];
+  await pnpmInWorktree(
+    "openclaw",
+    "qa",
+    "discord",
+    "--repo-root",
+    worktreeDir,
+    "--output-dir",
+    worktreeOutputDir,
+    "--provider-mode",
+    opts.providerMode,
+    "--model",
+    DEFAULT_MODEL,
+    "--alt-model",
+    DEFAULT_MODEL,
+    ...fastFlag,
+    "--credential-source",
+    opts.credentialSource,
+    "--credential-role",
+    opts.credentialRole,
+    "--scenario",
+    scenario,
+    "--allow-failures",
+  );
+
+  // Publish the lane's artifacts outside the worktree before reading them.
+  const laneOutputDir = path.join(worktreeDir, worktreeOutputDir);
+  const publishedLaneDir = path.join(params.outputDir, lane);
+  await copyDirContents(laneOutputDir, publishedLaneDir);
+  const result = await readLaneResult({ laneOutputDir, publishedLaneDir, scenario });
+  const copiedScreenshot = await copyScreenshot({ lane, result });
+  return {
+    ...result,
+    screenshotPath: copiedScreenshot ?? result.screenshotPath,
+  } satisfies LaneResult;
+}
+
+/**
+ * Runs a Mantis before/after verification: executes the scenario against the
+ * baseline ref and then the candidate ref, each in its own worktree, and writes
+ * `comparison.json` and `mantis-report.md` into the output directory.
+ *
+ * The run passes only when the baseline lane's status is `fail` and the
+ * candidate lane's status is `pass`.
+ *
+ * @param opts Run options; omitted or blank values fall back to the module defaults.
+ * @returns Paths of the written artifacts and the overall `pass`/`fail` status.
+ * @throws Error when `transport` or `scenario` is not an accepted value, and any
+ *   error raised while running a lane or writing artifacts. On lane/artifact
+ *   failures the message is also written to `error.txt` on a best-effort basis.
+ *
+ * NOTE(review): the worktrees created under `<outputDir>/worktrees` are not
+ * removed here, so re-running against the same output directory may make
+ * `git worktree add` fail — confirm whether cleanup is expected elsewhere.
+ */
+export async function runMantisBeforeAfter(
+  opts: MantisBeforeAfterOptions = {},
+): Promise<MantisBeforeAfterResult> {
+  const startedAt = (opts.now ?? (() => new Date()))();
+  const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
+  const outputDir = await ensureRepoBoundDirectory(
+    repoRoot,
+    resolveRepoRelativeOutputDir(repoRoot, opts.outputDir) ?? defaultOutputDir(repoRoot, startedAt),
+    "Mantis before/after output directory",
+    { mode: 0o755 },
+  );
+  const transport = normalizeRequiredLiteral(
+    opts.transport,
+    DEFAULT_TRANSPORT,
+    ["discord"],
+    "--transport",
+  );
+  const scenario = normalizeRequiredLiteral(
+    opts.scenario,
+    DEFAULT_SCENARIO,
+    [DEFAULT_SCENARIO],
+    "--scenario",
+  );
+  const baseline = trimToValue(opts.baseline) ?? DEFAULT_BASELINE_REF;
+  const candidate = trimToValue(opts.candidate) ?? DEFAULT_CANDIDATE_REF;
+  const runner = opts.commandRunner ?? defaultCommandRunner;
+  const worktreeRoot = path.join(outputDir, "worktrees");
+  const comparisonPath = path.join(outputDir, "comparison.json");
+  const reportPath = path.join(outputDir, "mantis-report.md");
+  await fs.mkdir(worktreeRoot, { recursive: true });
+
+  try {
+    const commonOpts = {
+      credentialRole: trimToValue(opts.credentialRole) ?? DEFAULT_CREDENTIAL_ROLE,
+      credentialSource: trimToValue(opts.credentialSource) ?? DEFAULT_CREDENTIAL_SOURCE,
+      fastMode: opts.fastMode ?? true,
+      providerMode: trimToValue(opts.providerMode) ?? DEFAULT_PROVIDER_MODE,
+      skipBuild: opts.skipBuild ?? false,
+      skipInstall: opts.skipInstall ?? false,
+    };
+    // Lanes run one after the other: baseline first, then candidate.
+    const baselineResult = await runLane({
+      lane: "baseline",
+      outputDir,
+      ref: baseline,
+      repoRoot,
+      runner,
+      scenario,
+      worktreeRoot,
+      opts: commonOpts,
+    });
+    const candidateResult = await runLane({
+      lane: "candidate",
+      outputDir,
+      ref: candidate,
+      repoRoot,
+      runner,
+      scenario,
+      worktreeRoot,
+      opts: commonOpts,
+    });
+    const comparison = {
+      baseline: {
+        expected: "queued-only",
+        ref: baseline,
+        reproduced: baselineResult.status === "fail",
+        screenshotPath: baselineResult.screenshotPath,
+        status: baselineResult.status,
+      },
+      candidate: {
+        expected: "queued -> thinking -> done",
+        fixed: candidateResult.status === "pass",
+        ref: candidate,
+        screenshotPath: candidateResult.screenshotPath,
+        status: candidateResult.status,
+      },
+      pass: baselineResult.status === "fail" && candidateResult.status === "pass",
+      scenario,
+      transport,
+    } satisfies Comparison;
+    await fs.writeFile(comparisonPath, `${JSON.stringify(comparison, null, 2)}\n`, "utf8");
+    await fs.writeFile(
+      reportPath,
+      renderReport({
+        baseline: baselineResult,
+        candidate: candidateResult,
+        comparison,
+        outputDir,
+      }),
+      "utf8",
+    );
+    return {
+      comparisonPath,
+      outputDir,
+      reportPath,
+      status: comparison.pass ? "pass" : "fail",
+    };
+  } catch (error) {
+    // Recording the failure is best-effort: if error.txt cannot be written, that
+    // secondary failure must not replace the error that actually broke the run.
+    try {
+      await fs.writeFile(
+        path.join(outputDir, "error.txt"),
+        `${formatErrorMessage(error)}\n`,
+        "utf8",
+      );
+    } catch {
+      // Ignored on purpose; the original error is rethrown below.
+    }
+    throw error;
+  }
+}