From 35266879de8d5cc56c82cd5c6ea6bf13d872d2be Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 5 May 2026 05:34:49 +0100 Subject: [PATCH] feat: add Mantis visual task video QA --- CHANGELOG.md | 1 + docs/concepts/qa-e2e-automation.md | 31 +- extensions/qa-lab/src/mantis/cli.runtime.ts | 36 + extensions/qa-lab/src/mantis/cli.ts | 145 ++- .../desktop-browser-smoke.runtime.test.ts | 11 +- .../mantis/desktop-browser-smoke.runtime.ts | 37 +- .../qa-lab/src/mantis/run.runtime.test.ts | 10 +- extensions/qa-lab/src/mantis/run.runtime.ts | 27 + .../slack-desktop-smoke.runtime.test.ts | 16 +- .../src/mantis/slack-desktop-smoke.runtime.ts | 41 +- .../src/mantis/visual-task.runtime.test.ts | 349 +++++++ .../qa-lab/src/mantis/visual-task.runtime.ts | 926 ++++++++++++++++++ 12 files changed, 1612 insertions(+), 18 deletions(-) create mode 100644 extensions/qa-lab/src/mantis/visual-task.runtime.test.ts create mode 100644 extensions/qa-lab/src/mantis/visual-task.runtime.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c85ba4e0fd..bc90ba75631 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,6 +40,7 @@ Docs: https://docs.openclaw.ai - Gateway/diagnostics: add startup phase spans, active work labels, stale terminal bridge markers, and default sync-I/O tracing in `pnpm gateway:watch` so slow Gateway turns are easier to attribute from logs and stability diagnostics. - Plugins/loader: preserve real compiled plugin module evaluation errors on the native fast path instead of treating every thrown `.js` module as a source-transform fallback miss. Thanks @vincentkoc. - QA/Mantis: add `pnpm openclaw qa mantis slack-desktop-smoke` to run Slack live QA inside a Crabbox VNC desktop, open Slack Web, and capture desktop screenshots beside the Slack QA artifacts. +- QA/Mantis: add visual desktop tasks with Crabbox MP4 recording, screenshot capture, and optional image-understanding assertions, and preserve video artifacts in Mantis before/after reports. 
- QA/Mantis: pass the runtime env through desktop-browser Crabbox and artifact-copy child commands, so embedded Mantis callers can provide Crabbox credentials without mutating the parent process. Thanks @vincentkoc. - QA/Mantis: return the copied Slack desktop screenshot path even when remote Slack QA fails, so the CLI still prints the failure screenshot artifact. Thanks @vincentkoc. - QA/Mantis: accept Blacksmith Testbox `tbx_...` lease ids from desktop smoke warmup, so provider overrides do not fail before inspect/run. Thanks @vincentkoc. diff --git a/docs/concepts/qa-e2e-automation.md b/docs/concepts/qa-e2e-automation.md index 46e32f91c31..da3d1655430 100644 --- a/docs/concepts/qa-e2e-automation.md +++ b/docs/concepts/qa-e2e-automation.md @@ -132,12 +132,37 @@ pnpm openclaw qa mantis slack-desktop-smoke \ That command leases a Crabbox desktop/browser machine, runs the Slack live lane inside the VM, opens Slack Web in the VNC browser, captures the desktop, and -copies `slack-qa/` plus `slack-desktop-smoke.png` back to the Mantis artifact -directory. Reuse `--lease-id ` after logging in to Slack Web manually +copies `slack-qa/`, `slack-desktop-smoke.png`, and, when video capture is available, +`slack-desktop-smoke.mp4` back to the Mantis artifact directory. Reuse `--lease-id ` after logging in to Slack Web manually through VNC. With `--gateway-setup`, Mantis leaves a persistent OpenClaw Slack gateway running inside the VM on port `38973`; without it, the command runs the normal bot-to-bot Slack QA lane and exits after artifact capture. 
+For an agent/CV style desktop task, run: + +```bash +pnpm openclaw qa mantis visual-task \ + --browser-url https://example.net \ + --expect-text "Example Domain" \ + --vision-model openai/gpt-5.4 +``` + +`visual-task` leases or reuses a Crabbox desktop/browser machine, starts +`crabbox record --while`, drives the visible browser through a nested +`visual-driver`, captures `visual-task.png`, runs `openclaw infer image describe` +against the screenshot when `--vision-mode image-describe` is selected, and +writes `visual-task.mp4`, `mantis-visual-task-summary.json`, +`mantis-visual-task-driver-result.json`, and `mantis-visual-task-report.md`. +When `--expect-text` is set, the vision prompt asks for a structured JSON +verdict and only passes when the model reports positive visible evidence; a +negative response that merely quotes the target text fails the assertion. +Use `--vision-mode metadata` for a no-model smoke that proves the desktop, +browser, screenshot, and video plumbing without calling an image-understanding +provider. Recording is a required artifact for `visual-task`; if Crabbox records +no non-empty `visual-task.mp4`, the task fails even when the visual driver +passed. On failure, Mantis keeps the lease for VNC unless the task had already +passed and `--keep-lease` was not set. + Before using pooled live credentials, run: ```bash @@ -266,7 +291,7 @@ Scenarios (`extensions/qa-lab/src/live-transports/discord/discord-live.runtime.t - `discord-canary` - `discord-mention-gating` - `discord-native-help-command-registration` -- `discord-status-reactions-tool-only` — opt-in Mantis scenario. Runs by itself because it switches the SUT to always-on, tool-only guild replies with `messages.statusReactions.enabled=true`, then captures a REST reaction timeline plus an HTML/PNG visual artifact. +- `discord-status-reactions-tool-only` — opt-in Mantis scenario. 
Runs by itself because it switches the SUT to always-on, tool-only guild replies with `messages.statusReactions.enabled=true`, then captures a REST reaction timeline plus HTML/PNG visual artifacts. Mantis before/after reports also preserve scenario-provided MP4 artifacts as `baseline.mp4` and `candidate.mp4`. Run the Mantis status-reaction scenario explicitly: diff --git a/extensions/qa-lab/src/mantis/cli.runtime.ts b/extensions/qa-lab/src/mantis/cli.runtime.ts index 4089f7d5771..d2643f33e69 100644 --- a/extensions/qa-lab/src/mantis/cli.runtime.ts +++ b/extensions/qa-lab/src/mantis/cli.runtime.ts @@ -8,6 +8,12 @@ import { runMantisSlackDesktopSmoke, type MantisSlackDesktopSmokeOptions, } from "./slack-desktop-smoke.runtime.js"; +import { + runMantisVisualDriver, + runMantisVisualTask, + type MantisVisualDriverOptions, + type MantisVisualTaskOptions, +} from "./visual-task.runtime.js"; export async function runMantisDiscordSmokeCommand(opts: MantisDiscordSmokeOptions) { const result = await runMantisDiscordSmoke(opts); @@ -34,6 +40,9 @@ export async function runMantisDesktopBrowserSmokeCommand(opts: MantisDesktopBro if (result.screenshotPath) { process.stdout.write(`Mantis desktop browser screenshot: ${result.screenshotPath}\n`); } + if (result.videoPath) { + process.stdout.write(`Mantis desktop browser video: ${result.videoPath}\n`); + } if (result.status === "fail") { process.exitCode = 1; } @@ -46,6 +55,33 @@ export async function runMantisSlackDesktopSmokeCommand(opts: MantisSlackDesktop if (result.screenshotPath) { process.stdout.write(`Mantis Slack desktop screenshot: ${result.screenshotPath}\n`); } + if (result.videoPath) { + process.stdout.write(`Mantis Slack desktop video: ${result.videoPath}\n`); + } + if (result.status === "fail") { + process.exitCode = 1; + } +} + +export async function runMantisVisualDriverCommand(opts: MantisVisualDriverOptions) { + const result = await runMantisVisualDriver(opts); + process.stdout.write(`Mantis visual driver result: 
${result.status}\n`); + process.stdout.write(`Mantis visual driver screenshot: ${result.screenshotPath}\n`); + if (result.status === "fail") { + process.exitCode = 1; + } +} + +export async function runMantisVisualTaskCommand(opts: MantisVisualTaskOptions) { + const result = await runMantisVisualTask(opts); + process.stdout.write(`Mantis visual task report: ${result.reportPath}\n`); + process.stdout.write(`Mantis visual task summary: ${result.summaryPath}\n`); + if (result.screenshotPath) { + process.stdout.write(`Mantis visual task screenshot: ${result.screenshotPath}\n`); + } + if (result.videoPath) { + process.stdout.write(`Mantis visual task video: ${result.videoPath}\n`); + } if (result.status === "fail") { process.exitCode = 1; } diff --git a/extensions/qa-lab/src/mantis/cli.ts b/extensions/qa-lab/src/mantis/cli.ts index 7647448bbbb..dc51f86c62b 100644 --- a/extensions/qa-lab/src/mantis/cli.ts +++ b/extensions/qa-lab/src/mantis/cli.ts @@ -4,6 +4,11 @@ import type { MantisDesktopBrowserSmokeOptions } from "./desktop-browser-smoke.r import type { MantisDiscordSmokeOptions } from "./discord-smoke.runtime.js"; import type { MantisBeforeAfterOptions } from "./run.runtime.js"; import type { MantisSlackDesktopSmokeOptions } from "./slack-desktop-smoke.runtime.js"; +import type { + MantisVisualDriverOptions, + MantisVisualTaskOptions, + MantisVisualTaskVisionMode, +} from "./visual-task.runtime.js"; type MantisCliRuntime = typeof import("./cli.runtime.js"); @@ -31,6 +36,16 @@ async function runSlackDesktopSmoke(opts: MantisSlackDesktopSmokeOptions) { await runtime.runMantisSlackDesktopSmokeCommand(opts); } +async function runVisualDriver(opts: MantisVisualDriverOptions) { + const runtime = await loadMantisCliRuntime(); + await runtime.runMantisVisualDriverCommand(opts); +} + +async function runVisualTask(opts: MantisVisualTaskOptions) { + const runtime = await loadMantisCliRuntime(); + await runtime.runMantisVisualTaskCommand(opts); +} + type 
MantisDiscordSmokeCommanderOptions = { channelId?: string; guildId?: string; @@ -96,10 +111,57 @@ type MantisSlackDesktopSmokeCommanderOptions = { ttl?: string; }; +type MantisVisualTaskCommanderOptions = { + browserUrl?: string; + class?: string; + crabboxBin?: string; + duration?: string; + expectText?: string; + idleTimeout?: string; + keepLease?: boolean; + leaseId?: string; + machineClass?: string; + outputDir?: string; + provider?: string; + repoRoot?: string; + settleMs?: string; + ttl?: string; + visionMode?: MantisVisualTaskVisionMode; + visionModel?: string; + visionPrompt?: string; + visionTimeoutMs?: string; +}; + +type MantisVisualDriverCommanderOptions = { + browserUrl?: string; + crabboxBin?: string; + expectText?: string; + leaseId?: string; + outputDir?: string; + provider?: string; + repoRoot?: string; + settleMs?: string; + visionMode?: MantisVisualTaskVisionMode; + visionModel?: string; + visionPrompt?: string; + visionTimeoutMs?: string; +}; + function collectString(value: string, previous: string[] = []) { return [...previous, value]; } +function parseOptionalInteger(value: string | undefined, label: string) { + if (value === undefined) { + return undefined; + } + const parsed = Number.parseInt(value, 10); + if (!Number.isFinite(parsed) || String(parsed) !== value || parsed < 0) { + throw new Error(`${label} must be a non-negative integer`); + } + return parsed; +} + export function registerMantisCli(qa: Command) { const mantis = qa .command("mantis") @@ -166,7 +228,7 @@ export function registerMantisCli(qa: Command) { mantis .command("desktop-browser-smoke") .description( - "Lease or reuse a Crabbox desktop, open a visible browser, and capture a VNC desktop screenshot", + "Lease or reuse a Crabbox desktop, open a visible browser, and capture VNC desktop screenshot/video artifacts", ) .option("--repo-root ", "Repository root to target when running from a neutral cwd") .option("--output-dir ", "Mantis desktop browser artifact directory") @@ 
-199,7 +261,7 @@ export function registerMantisCli(qa: Command) { mantis .command("slack-desktop-smoke") .description( - "Lease or reuse a Crabbox VNC desktop, run Slack QA inside it, open Slack in the browser, and capture a screenshot", + "Lease or reuse a Crabbox VNC desktop, run Slack QA inside it, open Slack in the browser, and capture screenshot/video artifacts", ) .option("--repo-root ", "Repository root to target when running from a neutral cwd") .option("--output-dir ", "Mantis Slack desktop artifact directory") @@ -249,4 +311,83 @@ export function registerMantisCli(qa: Command) { ttl: opts.ttl, }); }); + + mantis + .command("visual-task") + .description( + "Lease or reuse a Crabbox desktop, drive visible browser UI, record MP4, screenshot it, and optionally run image-understanding assertions", + ) + .option("--repo-root ", "Repository root to target when running from a neutral cwd") + .option("--output-dir ", "Mantis visual-task artifact directory") + .option("--crabbox-bin ", "Crabbox binary path") + .option("--provider ", "Crabbox provider") + .option("--machine-class ", "Crabbox machine class") + .option("--class ", "Alias for --machine-class") + .option("--lease-id ", "Reuse an existing Crabbox lease") + .option("--idle-timeout ", "Crabbox idle timeout") + .option("--ttl ", "Crabbox maximum lease lifetime") + .option("--keep-lease", "Keep a lease created by this run after a passing task") + .option("--browser-url ", "URL to open in the visible browser") + .option("--duration ", "Desktop recording duration") + .option("--settle-ms ", "Milliseconds to wait after launch before screenshot") + .option("--vision-mode ", "Vision mode: image-describe or metadata") + .option("--vision-prompt ", "Prompt for image understanding") + .option("--vision-model ", "Image-capable provider/model ref") + .option("--vision-timeout-ms ", "Image understanding timeout in milliseconds") + .option("--expect-text ", "Case-insensitive text expected in the vision output") + 
.action(async (opts: MantisVisualTaskCommanderOptions) => { + await runVisualTask({ + browserUrl: opts.browserUrl, + crabboxBin: opts.crabboxBin, + duration: opts.duration, + expectText: opts.expectText, + idleTimeout: opts.idleTimeout, + keepLease: opts.keepLease, + leaseId: opts.leaseId, + machineClass: opts.machineClass ?? opts.class, + outputDir: opts.outputDir, + provider: opts.provider, + repoRoot: opts.repoRoot, + settleMs: parseOptionalInteger(opts.settleMs, "--settle-ms"), + ttl: opts.ttl, + visionMode: opts.visionMode, + visionModel: opts.visionModel, + visionPrompt: opts.visionPrompt, + visionTimeoutMs: parseOptionalInteger(opts.visionTimeoutMs, "--vision-timeout-ms"), + }); + }); + + mantis + .command("visual-driver") + .description( + "Driver half for Mantis visual-task; launched by Crabbox record --while, then opens browser, screenshots, and runs vision", + ) + .option("--repo-root ", "Repository root to target when running from a neutral cwd") + .option("--output-dir ", "Mantis visual-task artifact directory") + .option("--crabbox-bin ", "Crabbox binary path") + .option("--provider ", "Crabbox provider") + .option("--lease-id ", "Crabbox lease id") + .option("--browser-url ", "URL to open in the visible browser") + .option("--settle-ms ", "Milliseconds to wait after launch before screenshot") + .option("--vision-mode ", "Vision mode: image-describe or metadata") + .option("--vision-prompt ", "Prompt for image understanding") + .option("--vision-model ", "Image-capable provider/model ref") + .option("--vision-timeout-ms ", "Image understanding timeout in milliseconds") + .option("--expect-text ", "Case-insensitive text expected in the vision output") + .action(async (opts: MantisVisualDriverCommanderOptions) => { + await runVisualDriver({ + browserUrl: opts.browserUrl, + crabboxBin: opts.crabboxBin, + expectText: opts.expectText, + leaseId: opts.leaseId, + outputDir: opts.outputDir, + provider: opts.provider, + repoRoot: opts.repoRoot, + settleMs: 
parseOptionalInteger(opts.settleMs, "--settle-ms"), + visionMode: opts.visionMode, + visionModel: opts.visionModel, + visionPrompt: opts.visionPrompt, + visionTimeoutMs: parseOptionalInteger(opts.visionTimeoutMs, "--vision-timeout-ms"), + }); + }); } diff --git a/extensions/qa-lab/src/mantis/desktop-browser-smoke.runtime.test.ts b/extensions/qa-lab/src/mantis/desktop-browser-smoke.runtime.test.ts index c8e4315c338..160779edb10 100644 --- a/extensions/qa-lab/src/mantis/desktop-browser-smoke.runtime.test.ts +++ b/extensions/qa-lab/src/mantis/desktop-browser-smoke.runtime.test.ts @@ -50,8 +50,10 @@ describe("mantis desktop browser smoke runtime", () => { expect(outputDir).toBeTypeOf("string"); await fs.mkdir(outputDir as string, { recursive: true }); await fs.writeFile(path.join(outputDir as string, "desktop-browser-smoke.png"), "png"); + await fs.writeFile(path.join(outputDir as string, "desktop-browser-smoke.mp4"), "mp4"); await fs.writeFile(path.join(outputDir as string, "remote-metadata.json"), "{}\n"); await fs.writeFile(path.join(outputDir as string, "chrome.log"), "chrome\n"); + await fs.writeFile(path.join(outputDir as string, "ffmpeg.log"), "ffmpeg\n"); return { stdout: "", stderr: "" }; } return { stdout: "", stderr: "" }; @@ -80,11 +82,10 @@ describe("mantis desktop browser smoke runtime", () => { expect(commands.every((entry) => entry.env === runtimeEnv)).toBe(true); const rsyncArgs = commands.find((entry) => entry.command === "rsync")?.args ?? 
[]; expect(rsyncArgs).not.toContain("--delete"); + expect(rsyncArgs).toEqual(expect.arrayContaining(["--exclude", "chrome-profile/**"])); expect(rsyncArgs).toEqual( expect.arrayContaining([ - "crabbox@203.0.113.10:/tmp/openclaw-mantis-desktop-2026-05-04T12-00-00-000Z/desktop-browser-smoke.png", - "crabbox@203.0.113.10:/tmp/openclaw-mantis-desktop-2026-05-04T12-00-00-000Z/remote-metadata.json", - "crabbox@203.0.113.10:/tmp/openclaw-mantis-desktop-2026-05-04T12-00-00-000Z/chrome.log", + "crabbox@203.0.113.10:/tmp/openclaw-mantis-desktop-2026-05-04T12-00-00-000Z/", ]), ); const remoteScript = commands @@ -94,9 +95,13 @@ describe("mantis desktop browser smoke runtime", () => { expect(remoteScript).toContain("${CHROME_BIN:-}"); expect(remoteScript).toContain("chromium-browser"); expect(remoteScript).toContain("base64 -d"); + expect(remoteScript).toContain("ffmpeg"); + expect(remoteScript).toContain('sudo apt-get update -y >>"$out/apt.log" 2>&1 || true'); + expect(remoteScript).toContain("desktop-browser-smoke.mp4"); expect(remoteScript).toContain('url="file://$out/input.html"'); expect(remoteScript).toContain('"browserBinary": "$browser_bin"'); await expect(fs.readFile(result.screenshotPath ?? "", "utf8")).resolves.toBe("png"); + await expect(fs.readFile(result.videoPath ?? 
"", "utf8")).resolves.toBe("mp4"); const summary = JSON.parse(await fs.readFile(result.summaryPath, "utf8")) as { browserUrl: string; crabbox: { id: string; vncCommand: string }; diff --git a/extensions/qa-lab/src/mantis/desktop-browser-smoke.runtime.ts b/extensions/qa-lab/src/mantis/desktop-browser-smoke.runtime.ts index 2c687690631..3a4f35e425a 100644 --- a/extensions/qa-lab/src/mantis/desktop-browser-smoke.runtime.ts +++ b/extensions/qa-lab/src/mantis/desktop-browser-smoke.runtime.ts @@ -28,6 +28,7 @@ export type MantisDesktopBrowserSmokeResult = { screenshotPath?: string; status: "pass" | "fail"; summaryPath: string; + videoPath?: string; }; type CommandResult = { @@ -58,6 +59,7 @@ type MantisDesktopBrowserSmokeSummary = { reportPath: string; screenshotPath?: string; summaryPath: string; + videoPath?: string; }; browserUrl: string; htmlFile?: string; @@ -232,6 +234,24 @@ if [ -z "$browser_bin" ]; then echo "No browser binary found. Checked BROWSER, CHROME_BIN, google-chrome, chromium, chromium-browser." >&2 exit 127 fi +video_pid="" +if command -v ffmpeg >/dev/null 2>&1; then + : +else + sudo apt-get update -y >>"$out/apt.log" 2>&1 || true + sudo DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg >>"$out/apt.log" 2>&1 || true +fi +if command -v ffmpeg >/dev/null 2>&1; then + display_input="$DISPLAY" + case "$display_input" in + *.*) ;; + *) display_input="$display_input.0" ;; + esac + ffmpeg -hide_banner -loglevel error -y -f x11grab -video_size 1280x900 -framerate 15 -i "$display_input" -t 10 -pix_fmt yuv420p "$out/desktop-browser-smoke.mp4" >"$out/ffmpeg.log" 2>&1 & + video_pid=$! 
+else + echo "ffmpeg missing; video artifact skipped" >"$out/ffmpeg.log" +fi "$browser_bin" \ --user-data-dir="$profile" \ --no-first-run \ @@ -248,6 +268,9 @@ cleanup() { trap cleanup EXIT sleep 8 scrot "$out/desktop-browser-smoke.png" +if [ -n "$video_pid" ]; then + wait "$video_pid" || true +fi cleanup trap - EXIT sleep 1 @@ -291,7 +314,11 @@ function renderReport(summary: MantisDesktopBrowserSmokeSummary) { summary.artifacts.screenshotPath ? `- Screenshot: \`${path.basename(summary.artifacts.screenshotPath)}\`` : "- Screenshot: missing", + summary.artifacts.videoPath + ? `- Video: \`${path.basename(summary.artifacts.videoPath)}\`` + : "- Video: missing", "- Remote metadata: `remote-metadata.json`", + "- FFmpeg log: `ffmpeg.log`", "- Chrome log: `chrome.log`", summary.error ? `- Error: ${summary.error}` : undefined, "", @@ -401,9 +428,9 @@ async function copyRemoteArtifacts(params: { "-o", "UserKnownHostsFile=/dev/null", ].join(" "), - `${sshUser}@${host}:${params.remoteOutputDir}/desktop-browser-smoke.png`, - `${sshUser}@${host}:${params.remoteOutputDir}/remote-metadata.json`, - `${sshUser}@${host}:${params.remoteOutputDir}/chrome.log`, + "--exclude", + "chrome-profile/**", + `${sshUser}@${host}:${params.remoteOutputDir}/`, `${params.outputDir}/`, ], cwd: params.cwd, @@ -524,14 +551,17 @@ export async function runMantisDesktopBrowserSmoke( runner, }); const screenshotPath = path.join(outputDir, "desktop-browser-smoke.png"); + const videoPath = path.join(outputDir, "desktop-browser-smoke.mp4"); if (!(await pathExists(screenshotPath))) { throw new Error("Desktop browser screenshot was not copied back from Crabbox."); } + const copiedVideoPath = (await pathExists(videoPath)) ? 
videoPath : undefined; summary = { artifacts: { reportPath, screenshotPath, summaryPath, + videoPath: copiedVideoPath, }, browserUrl, htmlFile, @@ -556,6 +586,7 @@ export async function runMantisDesktopBrowserSmoke( screenshotPath, status: "pass", summaryPath, + videoPath: copiedVideoPath, }; } catch (error) { summary = { diff --git a/extensions/qa-lab/src/mantis/run.runtime.test.ts b/extensions/qa-lab/src/mantis/run.runtime.test.ts index bd46e54aa8c..74000d27ddf 100644 --- a/extensions/qa-lab/src/mantis/run.runtime.test.ts +++ b/extensions/qa-lab/src/mantis/run.runtime.test.ts @@ -28,14 +28,16 @@ describe("mantis before/after runtime", () => { const outputDir = path.join(repoRootArg, outputDirArg); await fs.mkdir(outputDir, { recursive: true }); const screenshotPath = path.join(outputDir, `${lane}-timeline.png`); + const videoPath = path.join(outputDir, `${lane}-timeline.mp4`); await fs.writeFile(screenshotPath, `${lane} screenshot`); + await fs.writeFile(videoPath, `${lane} video`); await fs.writeFile( path.join(outputDir, "discord-qa-summary.json"), `${JSON.stringify( { scenarios: [ { - artifactPaths: { screenshot: screenshotPath }, + artifactPaths: { screenshot: screenshotPath, video: videoPath }, details: lane === "baseline" ? 
"reaction timeline missing thinking/done" @@ -94,5 +96,11 @@ describe("mantis before/after runtime", () => { await expect( fs.readFile(path.join(result.outputDir, "candidate", "candidate.png"), "utf8"), ).resolves.toBe("candidate screenshot"); + await expect( + fs.readFile(path.join(result.outputDir, "baseline", "baseline.mp4"), "utf8"), + ).resolves.toBe("baseline video"); + await expect( + fs.readFile(path.join(result.outputDir, "candidate", "candidate.mp4"), "utf8"), + ).resolves.toBe("candidate video"); }); }); diff --git a/extensions/qa-lab/src/mantis/run.runtime.ts b/extensions/qa-lab/src/mantis/run.runtime.ts index ade6d88cb8c..b5b35a6393f 100644 --- a/extensions/qa-lab/src/mantis/run.runtime.ts +++ b/extensions/qa-lab/src/mantis/run.runtime.ts @@ -51,6 +51,7 @@ type LaneResult = { screenshotPath?: string; status: string; summaryPath: string; + videoPath?: string; }; type Comparison = { @@ -60,6 +61,7 @@ type Comparison = { reproduced: boolean; screenshotPath?: string; status: string; + videoPath?: string; }; candidate: { expected: "queued -> thinking -> done"; @@ -67,6 +69,7 @@ type Comparison = { ref: string; screenshotPath?: string; status: string; + videoPath?: string; }; pass: boolean; scenario: string; @@ -157,12 +160,14 @@ async function readLaneResult(params: { summary.scenarios?.find((entry) => entry.id === params.scenario) ?? summary.scenarios?.[0]; const status = scenarioSummary?.status ?? "fail"; const screenshotPath = scenarioSummary?.artifactPaths?.screenshot; + const videoPath = scenarioSummary?.artifactPaths?.video; return { outputDir: params.publishedLaneDir, scenarioDetails: scenarioSummary?.details, screenshotPath, status, summaryPath, + videoPath, } satisfies LaneResult; } @@ -189,6 +194,9 @@ function renderReport(params: { params.baseline.screenshotPath ? `- Screenshot: \`${path.join("baseline", path.basename(params.baseline.screenshotPath))}\`` : "- Screenshot: missing", + params.baseline.videoPath + ? 
`- Video: \`${path.join("baseline", path.basename(params.baseline.videoPath))}\`` + : "- Video: missing", params.baseline.scenarioDetails ? `- Details: ${params.baseline.scenarioDetails}` : undefined, "", "## Candidate", @@ -200,6 +208,9 @@ function renderReport(params: { params.candidate.screenshotPath ? `- Screenshot: \`${path.join("candidate", path.basename(params.candidate.screenshotPath))}\`` : "- Screenshot: missing", + params.candidate.videoPath + ? `- Video: \`${path.join("candidate", path.basename(params.candidate.videoPath))}\`` + : "- Video: missing", params.candidate.scenarioDetails ? `- Details: ${params.candidate.scenarioDetails}` : undefined, "", ].filter((line) => line !== undefined); @@ -218,6 +229,18 @@ async function copyScreenshot(params: { lane: "baseline" | "candidate"; result: return target; } +async function copyVideo(params: { lane: "baseline" | "candidate"; result: LaneResult }) { + if (!params.result.videoPath) { + return undefined; + } + const source = path.isAbsolute(params.result.videoPath) + ? params.result.videoPath + : path.join(params.result.outputDir, params.result.videoPath); + const target = path.join(params.result.outputDir, `${params.lane}.mp4`); + await fs.copyFile(source, target); + return target; +} + async function runLane(params: { lane: "baseline" | "candidate"; outputDir: string; @@ -300,9 +323,11 @@ async function runLane(params: { scenario: params.scenario, }); const copiedScreenshot = await copyScreenshot({ lane: params.lane, result }); + const copiedVideo = await copyVideo({ lane: params.lane, result }); return { ...result, screenshotPath: copiedScreenshot ?? result.screenshotPath, + videoPath: copiedVideo ?? 
result.videoPath, } satisfies LaneResult; } @@ -373,6 +398,7 @@ export async function runMantisBeforeAfter( reproduced: baselineResult.status === "fail", screenshotPath: baselineResult.screenshotPath, status: baselineResult.status, + videoPath: baselineResult.videoPath, }, candidate: { expected: "queued -> thinking -> done", @@ -380,6 +406,7 @@ export async function runMantisBeforeAfter( ref: candidate, screenshotPath: candidateResult.screenshotPath, status: candidateResult.status, + videoPath: candidateResult.videoPath, }, pass: baselineResult.status === "fail" && candidateResult.status === "pass", scenario, diff --git a/extensions/qa-lab/src/mantis/slack-desktop-smoke.runtime.test.ts b/extensions/qa-lab/src/mantis/slack-desktop-smoke.runtime.test.ts index a91209de9e5..dad6cd7ac97 100644 --- a/extensions/qa-lab/src/mantis/slack-desktop-smoke.runtime.test.ts +++ b/extensions/qa-lab/src/mantis/slack-desktop-smoke.runtime.test.ts @@ -54,8 +54,10 @@ describe("mantis Slack desktop smoke runtime", () => { await fs.writeFile(path.join(outputDir as string, "slack-qa-report.md"), "# Slack\n"); } else { await fs.writeFile(path.join(outputDir as string, "slack-desktop-smoke.png"), "png"); + await fs.writeFile(path.join(outputDir as string, "slack-desktop-smoke.mp4"), "mp4"); await fs.writeFile(path.join(outputDir as string, "remote-metadata.json"), "{}\n"); await fs.writeFile(path.join(outputDir as string, "chrome.log"), "chrome\n"); + await fs.writeFile(path.join(outputDir as string, "ffmpeg.log"), "ffmpeg\n"); await fs.writeFile(path.join(outputDir as string, "slack-desktop-command.log"), "qa\n"); } return { stdout: "", stderr: "" }; @@ -97,6 +99,9 @@ describe("mantis Slack desktop smoke runtime", () => { expect(remoteScript).toContain("${CHROME_BIN:-}"); expect(remoteScript).toContain("pnpm install --frozen-lockfile"); expect(remoteScript).toContain("pnpm build"); + expect(remoteScript).toContain("ffmpeg"); + expect(remoteScript).toContain('sudo apt-get update -y 
>>"$out/apt.log" 2>&1 || true'); + expect(remoteScript).toContain("slack-desktop-smoke.mp4"); expect(remoteScript).toContain("openclaw qa slack"); expect(remoteScript).toContain("--scenario 'slack-canary'"); expect(remoteScript).toContain("OPENCLAW_MANTIS_SLACK_BROWSER_PROFILE_DIR"); @@ -106,11 +111,12 @@ describe("mantis Slack desktop smoke runtime", () => { expect(rsyncArgs).not.toContain("--delete"); expect(rsyncArgs).toEqual( expect.arrayContaining([ - "crabbox@203.0.113.10:/tmp/openclaw-mantis-slack-desktop-2026-05-04T13-00-00-000Z/slack-desktop-smoke.png", + "crabbox@203.0.113.10:/tmp/openclaw-mantis-slack-desktop-2026-05-04T13-00-00-000Z/", "crabbox@203.0.113.10:/tmp/openclaw-mantis-slack-desktop-2026-05-04T13-00-00-000Z/slack-qa/", ]), ); await expect(fs.readFile(result.screenshotPath ?? "", "utf8")).resolves.toBe("png"); + await expect(fs.readFile(result.videoPath ?? "", "utf8")).resolves.toBe("mp4"); const summary = JSON.parse(await fs.readFile(result.summaryPath, "utf8")) as { crabbox: { id: string; vncCommand: string }; status: string; @@ -146,8 +152,10 @@ describe("mantis Slack desktop smoke runtime", () => { const outputDir = args.at(-1); await fs.mkdir(outputDir as string, { recursive: true }); await fs.writeFile(path.join(outputDir as string, "slack-desktop-smoke.png"), "png"); + await fs.writeFile(path.join(outputDir as string, "slack-desktop-smoke.mp4"), "mp4"); await fs.writeFile(path.join(outputDir as string, "remote-metadata.json"), "{}\n"); await fs.writeFile(path.join(outputDir as string, "chrome.log"), "chrome\n"); + await fs.writeFile(path.join(outputDir as string, "ffmpeg.log"), "ffmpeg\n"); await fs.writeFile(path.join(outputDir as string, "slack-desktop-command.log"), "qa\n"); } return { stdout: "", stderr: "" }; @@ -163,17 +171,19 @@ describe("mantis Slack desktop smoke runtime", () => { expect(result.status).toBe("fail"); expect(result.screenshotPath).toBe(path.join(result.outputDir, "slack-desktop-smoke.png")); + 
expect(result.videoPath).toBe(path.join(result.outputDir, "slack-desktop-smoke.mp4")); await expect( fs.readFile(path.join(result.outputDir, "slack-desktop-smoke.png"), "utf8"), ).resolves.toBe("png"); const summary = JSON.parse(await fs.readFile(result.summaryPath, "utf8")) as { - artifacts: { screenshotPath?: string }; + artifacts: { screenshotPath?: string; videoPath?: string }; error?: string; status: string; }; expect(summary.status).toBe("fail"); expect(summary.error).toContain("remote Slack QA failed"); expect(summary.artifacts.screenshotPath).toContain("slack-desktop-smoke.png"); + expect(summary.artifacts.videoPath).toContain("slack-desktop-smoke.mp4"); }); it("accepts Blacksmith Testbox lease ids from Crabbox warmup", async () => { @@ -204,8 +214,10 @@ describe("mantis Slack desktop smoke runtime", () => { await fs.writeFile(path.join(outputDir as string, "slack-qa-report.md"), "# Slack\n"); } else { await fs.writeFile(path.join(outputDir as string, "slack-desktop-smoke.png"), "png"); + await fs.writeFile(path.join(outputDir as string, "slack-desktop-smoke.mp4"), "mp4"); await fs.writeFile(path.join(outputDir as string, "remote-metadata.json"), "{}\n"); await fs.writeFile(path.join(outputDir as string, "chrome.log"), "chrome\n"); + await fs.writeFile(path.join(outputDir as string, "ffmpeg.log"), "ffmpeg\n"); await fs.writeFile(path.join(outputDir as string, "slack-desktop-command.log"), "qa\n"); } } diff --git a/extensions/qa-lab/src/mantis/slack-desktop-smoke.runtime.ts b/extensions/qa-lab/src/mantis/slack-desktop-smoke.runtime.ts index 6f7a1a71ec0..f6d21c3cfa8 100644 --- a/extensions/qa-lab/src/mantis/slack-desktop-smoke.runtime.ts +++ b/extensions/qa-lab/src/mantis/slack-desktop-smoke.runtime.ts @@ -35,6 +35,7 @@ export type MantisSlackDesktopSmokeResult = { screenshotPath?: string; status: "pass" | "fail"; summaryPath: string; + videoPath?: string; }; type CommandResult = { @@ -66,6 +67,7 @@ type MantisSlackDesktopSmokeSummary = { screenshotPath?: 
string; slackQaDir?: string; summaryPath: string; + videoPath?: string; }; crabbox: { bin: string; @@ -302,6 +304,24 @@ fi if [ -z "$slack_url" ]; then slack_url="https://app.slack.com/client" fi +video_pid="" +if command -v ffmpeg >/dev/null 2>&1; then + : +else + sudo apt-get update -y >>"$out/apt.log" 2>&1 || true + sudo DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg >>"$out/apt.log" 2>&1 || true +fi +if command -v ffmpeg >/dev/null 2>&1; then + display_input="$DISPLAY" + case "$display_input" in + *.*) ;; + *) display_input="$display_input.0" ;; + esac + ffmpeg -hide_banner -loglevel error -y -f x11grab -video_size 1440x1000 -framerate 15 -i "$display_input" -t 45 -pix_fmt yuv420p "$out/slack-desktop-smoke.mp4" >"$out/ffmpeg.log" 2>&1 & + video_pid=$! +else + echo "ffmpeg missing; video artifact skipped" >"$out/ffmpeg.log" +fi if [ "$setup_gateway" = "1" ]; then nohup "$browser_bin" \ --user-data-dir="$profile" \ @@ -376,6 +396,9 @@ MANTIS_SLACK_PATCH } >"$out/slack-desktop-command.log" 2>&1 || qa_status=$? sleep 5 scrot "$out/slack-desktop-smoke.png" || true +if [ -n "$video_pid" ]; then + wait "$video_pid" || true +fi if [ "$setup_gateway" != "1" ]; then kill "$chrome_pid" >/dev/null 2>&1 || true fi @@ -422,9 +445,13 @@ function renderReport(summary: MantisSlackDesktopSmokeSummary) { summary.artifacts.screenshotPath ? `- Screenshot: \`${path.basename(summary.artifacts.screenshotPath)}\`` : "- Screenshot: missing", + summary.artifacts.videoPath + ? `- Video: \`${path.basename(summary.artifacts.videoPath)}\`` + : "- Video: missing", summary.artifacts.slackQaDir ? "- Slack QA artifacts: `slack-qa/`" : undefined, "- Remote metadata: `remote-metadata.json`", "- Remote command log: `slack-desktop-command.log`", + "- FFmpeg log: `ffmpeg.log`", "- Chrome log: `chrome.log`", summary.error ? 
`- Error: ${summary.error}` : undefined, "", @@ -544,10 +571,7 @@ async function copyRemoteArtifacts(params: { "-az", "-e", sshArgs, - `${sshUser}@${host}:${params.remoteOutputDir}/slack-desktop-smoke.png`, - `${sshUser}@${host}:${params.remoteOutputDir}/remote-metadata.json`, - `${sshUser}@${host}:${params.remoteOutputDir}/chrome.log`, - `${sshUser}@${host}:${params.remoteOutputDir}/slack-desktop-command.log`, + `${sshUser}@${host}:${params.remoteOutputDir}/`, `${params.outputDir}/`, ], cwd: params.cwd, @@ -636,6 +660,7 @@ export async function runMantisSlackDesktopSmoke( let summary: MantisSlackDesktopSmokeSummary | undefined; let screenshotPath: string | undefined; let slackQaDir: string | undefined; + let videoPath: string | undefined; try { leaseId = @@ -702,6 +727,10 @@ export async function runMantisSlackDesktopSmoke( runner, }); screenshotPath = path.join(outputDir, "slack-desktop-smoke.png"); + videoPath = path.join(outputDir, "slack-desktop-smoke.mp4"); + if (!(await pathExists(videoPath))) { + videoPath = undefined; + } slackQaDir = path.join(outputDir, "slack-qa"); if (!(await pathExists(screenshotPath))) { throw new Error("Slack desktop screenshot was not copied back from Crabbox."); @@ -715,6 +744,7 @@ export async function runMantisSlackDesktopSmoke( screenshotPath, slackQaDir, summaryPath, + videoPath, }, crabbox: { bin: crabboxBin, @@ -738,6 +768,7 @@ export async function runMantisSlackDesktopSmoke( screenshotPath, status: "pass", summaryPath, + videoPath, }; } catch (error) { summary = { @@ -746,6 +777,7 @@ export async function runMantisSlackDesktopSmoke( screenshotPath, slackQaDir, summaryPath, + videoPath, }, crabbox: { bin: crabboxBin, @@ -771,6 +803,7 @@ export async function runMantisSlackDesktopSmoke( screenshotPath, status: "fail", summaryPath, + videoPath, }; } finally { if (summary) { diff --git a/extensions/qa-lab/src/mantis/visual-task.runtime.test.ts b/extensions/qa-lab/src/mantis/visual-task.runtime.test.ts new file mode 100644 
index 00000000000..bcfd258906a --- /dev/null +++ b/extensions/qa-lab/src/mantis/visual-task.runtime.test.ts @@ -0,0 +1,349 @@ +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { runMantisVisualDriver, runMantisVisualTask } from "./visual-task.runtime.js"; + +describe("mantis visual task runtime", () => { + let repoRoot: string; + + beforeEach(async () => { + repoRoot = await fs.mkdtemp(path.join(os.tmpdir(), "mantis-visual-task-")); + }); + + afterEach(async () => { + await fs.rm(repoRoot, { force: true, recursive: true }); + }); + + it("records a visible browser task and keeps screenshot/video artifacts", async () => { + const commands: { args: readonly string[]; command: string }[] = []; + const runner = vi.fn(async (command: string, args: readonly string[]) => { + commands.push({ command, args }); + if (command === "/tmp/crabbox" && args[0] === "warmup") { + return { stdout: "ready lease cbx_abc123\n", stderr: "" }; + } + if (command === "/tmp/crabbox" && args[0] === "inspect") { + return { + stdout: `${JSON.stringify({ + id: "cbx_abc123", + provider: "hetzner", + slug: "brisk-mantis", + state: "active", + })}\n`, + stderr: "", + }; + } + if (command === "/tmp/crabbox" && args[0] === "record") { + const outputPath = args[args.indexOf("--output") + 1]; + const outputDir = args[args.indexOf("--output-dir") + 1]; + await fs.mkdir(path.dirname(outputPath), { recursive: true }); + await fs.writeFile(outputPath, "mp4"); + await fs.writeFile(path.join(outputDir, "visual-task.png"), "png"); + await fs.writeFile( + path.join(outputDir, "mantis-visual-task-driver-result.json"), + `${JSON.stringify({ + browserUrl: "https://example.net", + finishedAt: "2026-05-04T12:00:05.000Z", + matched: true, + outputDir, + screenshotPath: path.join(outputDir, "visual-task.png"), + startedAt: "2026-05-04T12:00:01.000Z", + status: "pass", + vision: { + mode: 
"metadata", + timeoutMs: 120000, + }, + })}\n`, + ); + } + return { stdout: "", stderr: "" }; + }); + + const result = await runMantisVisualTask({ + commandRunner: runner, + crabboxBin: "/tmp/crabbox", + duration: "12s", + env: { PATH: process.env.PATH }, + now: () => new Date("2026-05-04T12:00:00.000Z"), + outputDir: ".artifacts/qa-e2e/mantis/visual-task-test", + repoRoot, + settleMs: 0, + visionMode: "metadata", + }); + + expect(result.status).toBe("pass"); + expect(commands.map((entry) => [entry.command, entry.args[0]])).toEqual([ + ["/tmp/crabbox", "warmup"], + ["/tmp/crabbox", "inspect"], + ["/tmp/crabbox", "record"], + ["/tmp/crabbox", "stop"], + ]); + const recordArgs = commands.find((entry) => entry.args[0] === "record")?.args ?? []; + expect(recordArgs).toEqual( + expect.arrayContaining([ + "--duration", + "12s", + "--output", + path.join(repoRoot, ".artifacts/qa-e2e/mantis/visual-task-test/visual-task.mp4"), + "--while", + "--", + "pnpm", + "--dir", + repoRoot, + "openclaw", + "qa", + "mantis", + "visual-driver", + ]), + ); + await expect(fs.readFile(result.screenshotPath ?? "", "utf8")).resolves.toBe("png"); + await expect(fs.readFile(result.videoPath ?? 
"", "utf8")).resolves.toBe("mp4"); + const summary = JSON.parse(await fs.readFile(result.summaryPath, "utf8")) as { + crabbox: { id: string; vncCommand: string }; + status: string; + visionMode: string; + }; + expect(summary).toMatchObject({ + crabbox: { + id: "cbx_abc123", + vncCommand: "/tmp/crabbox vnc --provider hetzner --id cbx_abc123 --open", + }, + status: "pass", + visionMode: "metadata", + }); + }); + + it("fails when recording breaks after the visual driver passes", async () => { + const commands: { args: readonly string[]; command: string }[] = []; + const runner = vi.fn(async (command: string, args: readonly string[]) => { + commands.push({ command, args }); + if (command === "/tmp/crabbox" && args[0] === "warmup") { + return { stdout: "ready lease cbx_abc123\n", stderr: "" }; + } + if (command === "/tmp/crabbox" && args[0] === "inspect") { + return { + stdout: `${JSON.stringify({ + id: "cbx_abc123", + provider: "hetzner", + slug: "brisk-mantis", + state: "active", + })}\n`, + stderr: "", + }; + } + if (command === "/tmp/crabbox" && args[0] === "record") { + const outputDir = args[args.indexOf("--output-dir") + 1]; + await fs.mkdir(outputDir, { recursive: true }); + await fs.writeFile(path.join(outputDir, "visual-task.png"), "png"); + await fs.writeFile( + path.join(outputDir, "mantis-visual-task-driver-result.json"), + `${JSON.stringify({ + browserUrl: "https://example.net", + finishedAt: "2026-05-04T12:00:05.000Z", + matched: true, + outputDir, + screenshotPath: path.join(outputDir, "visual-task.png"), + startedAt: "2026-05-04T12:00:01.000Z", + status: "pass", + vision: { + mode: "metadata", + timeoutMs: 120000, + }, + })}\n`, + ); + throw new Error("crabbox record failed after driver exit"); + } + return { stdout: "", stderr: "" }; + }); + + const result = await runMantisVisualTask({ + commandRunner: runner, + crabboxBin: "/tmp/crabbox", + env: { PATH: process.env.PATH }, + now: () => new Date("2026-05-04T12:00:00.000Z"), + outputDir: 
".artifacts/qa-e2e/mantis/visual-task-recording-fail", + repoRoot, + settleMs: 0, + visionMode: "metadata", + }); + + expect(result).toMatchObject({ + status: "fail", + videoPath: undefined, + }); + expect(commands.map((entry) => [entry.command, entry.args[0]])).toEqual([ + ["/tmp/crabbox", "warmup"], + ["/tmp/crabbox", "inspect"], + ["/tmp/crabbox", "record"], + ]); + const summary = JSON.parse(await fs.readFile(result.summaryPath, "utf8")) as { + error?: string; + recording?: { error?: string; required: boolean }; + status: string; + }; + expect(summary).toMatchObject({ + error: "crabbox record failed after driver exit", + recording: { + error: "crabbox record failed after driver exit", + required: true, + }, + status: "fail", + }); + }); + + it("drives a lease, screenshots it, and verifies image-describe text", async () => { + const commands: { args: readonly string[]; command: string }[] = []; + const runner = vi.fn(async (command: string, args: readonly string[]) => { + commands.push({ command, args }); + if (command === "/tmp/crabbox" && args[0] === "screenshot") { + const outputPath = args[args.indexOf("--output") + 1]; + await fs.mkdir(path.dirname(outputPath), { recursive: true }); + await fs.writeFile(outputPath, "png"); + } + if (command === "pnpm") { + return { + stdout: `\n> openclaw qa mantis visual-driver --vision-prompt '{"visible": boolean}'\n${JSON.stringify( + { + ok: true, + outputs: [ + { + kind: "image.description", + text: JSON.stringify({ + evidence: 'The page heading reads "Example Domain".', + reason: "The expected text is visible as the main heading.", + visible: true, + }), + }, + ], + }, + )}\n`, + stderr: "", + }; + } + return { stdout: "", stderr: "" }; + }); + + const result = await runMantisVisualDriver({ + browserUrl: "https://example.net", + commandRunner: runner, + crabboxBin: "/tmp/crabbox", + env: { PATH: process.env.PATH }, + expectText: "Example Domain", + leaseId: "cbx_abc123", + outputDir: 
".artifacts/qa-e2e/mantis/visual-driver-test", + repoRoot, + settleMs: 0, + visionMode: "image-describe", + visionModel: "openai/gpt-5.4", + visionPrompt: "Read the page title", + }); + + expect(result.status).toBe("pass"); + expect(commands.map((entry) => [entry.command, entry.args[0], entry.args[1]])).toEqual([ + ["/tmp/crabbox", "desktop", "launch"], + ["/tmp/crabbox", "screenshot", "--provider"], + ["pnpm", "--dir", repoRoot], + ]); + const launchArgs = commands.find((entry) => entry.args[0] === "desktop")?.args ?? []; + expect(launchArgs).toEqual( + expect.arrayContaining(["--", "sh", "-lc", expect.stringContaining("--no-first-run")]), + ); + const visionArgs = commands.find((entry) => entry.command === "pnpm")?.args ?? []; + expect(visionArgs).toEqual( + expect.arrayContaining([ + "infer", + "image", + "describe", + "--file", + path.join(repoRoot, ".artifacts/qa-e2e/mantis/visual-driver-test/visual-task.png"), + "--model", + "openai/gpt-5.4", + ]), + ); + expect(visionArgs).toEqual( + expect.arrayContaining(["--prompt", expect.stringContaining("return only valid JSON")]), + ); + expect(result.vision.assertion).toMatchObject({ + evidence: 'The page heading reads "Example Domain".', + matched: true, + visible: true, + }); + }); + + it("fails image-describe text checks when the model gives negative evidence that quotes the target", async () => { + const runner = vi.fn(async (command: string, args: readonly string[]) => { + if (command === "/tmp/crabbox" && args[0] === "screenshot") { + const outputPath = args[args.indexOf("--output") + 1]; + await fs.mkdir(path.dirname(outputPath), { recursive: true }); + await fs.writeFile(outputPath, "png"); + } + if (command === "pnpm") { + return { + stdout: `${JSON.stringify({ + ok: true, + outputs: [ + { + kind: "image.description", + text: 'The screenshot does not contain "Example Domain".', + }, + ], + })}\n`, + stderr: "", + }; + } + return { stdout: "", stderr: "" }; + }); + + const result = await 
runMantisVisualDriver({ + commandRunner: runner, + crabboxBin: "/tmp/crabbox", + expectText: "Example Domain", + leaseId: "cbx_abc123", + outputDir: ".artifacts/qa-e2e/mantis/visual-driver-negative", + repoRoot, + settleMs: 0, + visionMode: "image-describe", + }); + + expect(result).toMatchObject({ + matched: false, + status: "fail", + vision: { + assertion: { + matched: false, + reason: "Image describe did not return a structured visual assertion.", + }, + }, + }); + }); + + it("fails metadata mode when text evidence is requested", async () => { + const runner = vi.fn(async (command: string, args: readonly string[]) => { + if (command === "/tmp/crabbox" && args[0] === "screenshot") { + const outputPath = args[args.indexOf("--output") + 1]; + await fs.mkdir(path.dirname(outputPath), { recursive: true }); + await fs.writeFile(outputPath, "png"); + } + return { stdout: "", stderr: "" }; + }); + + const result = await runMantisVisualDriver({ + commandRunner: runner, + crabboxBin: "/tmp/crabbox", + expectText: "Example Domain", + leaseId: "cbx_abc123", + outputDir: ".artifacts/qa-e2e/mantis/visual-driver-metadata", + repoRoot, + settleMs: 0, + visionMode: "metadata", + }); + + expect(result).toMatchObject({ + matched: false, + status: "fail", + vision: { + mode: "metadata", + }, + }); + }); +}); diff --git a/extensions/qa-lab/src/mantis/visual-task.runtime.ts b/extensions/qa-lab/src/mantis/visual-task.runtime.ts new file mode 100644 index 00000000000..68464cc8c0f --- /dev/null +++ b/extensions/qa-lab/src/mantis/visual-task.runtime.ts @@ -0,0 +1,926 @@ +import { spawn, type SpawnOptions } from "node:child_process"; +import fs from "node:fs/promises"; +import path from "node:path"; +import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime"; +import { ensureRepoBoundDirectory, resolveRepoRelativeOutputDir } from "../cli-paths.js"; + +export type MantisVisualTaskVisionMode = "image-describe" | "metadata"; + +export type MantisVisualTaskOptions = { + 
browserUrl?: string; + commandRunner?: CommandRunner; + crabboxBin?: string; + duration?: string; + env?: NodeJS.ProcessEnv; + expectText?: string; + idleTimeout?: string; + keepLease?: boolean; + leaseId?: string; + machineClass?: string; + now?: () => Date; + outputDir?: string; + provider?: string; + repoRoot?: string; + settleMs?: number; + ttl?: string; + visionMode?: MantisVisualTaskVisionMode; + visionModel?: string; + visionPrompt?: string; + visionTimeoutMs?: number; +}; + +export type MantisVisualDriverOptions = { + browserUrl?: string; + commandRunner?: CommandRunner; + crabboxBin?: string; + env?: NodeJS.ProcessEnv; + expectText?: string; + leaseId?: string; + outputDir?: string; + provider?: string; + repoRoot?: string; + settleMs?: number; + visionMode?: MantisVisualTaskVisionMode; + visionModel?: string; + visionPrompt?: string; + visionTimeoutMs?: number; +}; + +export type MantisVisualTaskResult = { + outputDir: string; + reportPath: string; + screenshotPath?: string; + status: "pass" | "fail"; + summaryPath: string; + videoPath?: string; +}; + +type CommandResult = { + stderr: string; + stdout: string; +}; + +type CommandRunner = ( + command: string, + args: readonly string[], + options: SpawnOptions, +) => Promise; + +type CrabboxInspect = { + id?: string; + provider?: string; + slug?: string; + state?: string; +}; + +type MantisVisualDriverResult = { + browserUrl: string; + error?: string; + expectText?: string; + finishedAt: string; + matched?: boolean; + outputDir: string; + screenshotPath: string; + startedAt: string; + status: "pass" | "fail"; + vision: { + assertion?: VisionAssertion; + mode: MantisVisualTaskVisionMode; + model?: string; + prompt?: string; + text?: string; + timeoutMs: number; + }; +}; + +type VisionAssertion = { + evidence?: string; + expectedText: string; + matched: boolean; + reason?: string; + visible?: boolean; +}; + +type MantisVisualTaskSummary = { + artifacts: { + driverResultPath: string; + reportPath: string; + 
screenshotPath?: string; + summaryPath: string; + videoPath?: string; + }; + browserUrl: string; + crabbox: { + bin: string; + createdLease: boolean; + id: string; + provider: string; + slug?: string; + state?: string; + vncCommand: string; + }; + driver?: MantisVisualDriverResult; + error?: string; + finishedAt: string; + outputDir: string; + recording: { + error?: string; + required: boolean; + }; + startedAt: string; + status: "pass" | "fail"; + visionMode: MantisVisualTaskVisionMode; +}; + +const DEFAULT_BROWSER_URL = "https://example.net"; +const DEFAULT_PROVIDER = "hetzner"; +const DEFAULT_CLASS = "beast"; +const DEFAULT_DURATION = "180s"; +const DEFAULT_IDLE_TIMEOUT = "60m"; +const DEFAULT_TTL = "120m"; +const DEFAULT_SETTLE_MS = 8000; +const DEFAULT_VISION_TIMEOUT_MS = 120000; +const CRABBOX_BIN_ENV = "OPENCLAW_MANTIS_CRABBOX_BIN"; +const CRABBOX_PROVIDER_ENV = "OPENCLAW_MANTIS_CRABBOX_PROVIDER"; +const CRABBOX_CLASS_ENV = "OPENCLAW_MANTIS_CRABBOX_CLASS"; +const CRABBOX_LEASE_ID_ENV = "OPENCLAW_MANTIS_CRABBOX_LEASE_ID"; +const CRABBOX_KEEP_ENV = "OPENCLAW_MANTIS_KEEP_VM"; +const CRABBOX_IDLE_TIMEOUT_ENV = "OPENCLAW_MANTIS_CRABBOX_IDLE_TIMEOUT"; +const CRABBOX_TTL_ENV = "OPENCLAW_MANTIS_CRABBOX_TTL"; + +function trimToValue(value: string | undefined) { + const trimmed = value?.trim(); + return trimmed && trimmed.length > 0 ? 
trimmed : undefined; +} + +function isTruthyOptIn(value: string | undefined) { + const normalized = value?.trim().toLowerCase(); + return normalized === "1" || normalized === "true" || normalized === "yes"; +} + +function defaultOutputDir(repoRoot: string, startedAt: Date) { + const stamp = startedAt.toISOString().replace(/[:.]/gu, "-"); + return path.join(repoRoot, ".artifacts", "qa-e2e", "mantis", `visual-task-${stamp}`); +} + +function resolveMantisOutputDir(repoRoot: string, outputDir: string | undefined, startedAt: Date) { + const configured = trimToValue(outputDir); + if (!configured) { + return defaultOutputDir(repoRoot, startedAt); + } + return path.isAbsolute(configured) + ? configured + : (resolveRepoRelativeOutputDir(repoRoot, configured) ?? defaultOutputDir(repoRoot, startedAt)); +} + +async function defaultCommandRunner( + command: string, + args: readonly string[], + options: SpawnOptions, +): Promise { + return new Promise((resolve, reject) => { + const child = spawn(command, args, { + ...options, + stdio: ["ignore", "pipe", "pipe"], + }); + let stdout = ""; + let stderr = ""; + child.stdout?.on("data", (chunk: Buffer) => { + const text = chunk.toString(); + stdout += text; + if (options.stdio === "inherit") { + process.stdout.write(text); + } + }); + child.stderr?.on("data", (chunk: Buffer) => { + const text = chunk.toString(); + stderr += text; + if (options.stdio === "inherit") { + process.stderr.write(text); + } + }); + child.on("error", reject); + child.on("close", (code, signal) => { + if (code === 0) { + resolve({ stdout, stderr }); + return; + } + const detail = signal ? `signal ${signal}` : `exit code ${code ?? 
"unknown"}`; + reject(new Error(`${command} ${args.join(" ")} failed with ${detail}`)); + }); + }); +} + +async function pathExists(filePath: string) { + try { + await fs.access(filePath); + return true; + } catch { + return false; + } +} + +async function nonEmptyFileExists(filePath: string) { + try { + const stat = await fs.stat(filePath); + return stat.isFile() && stat.size > 0; + } catch { + return false; + } +} + +async function resolveCrabboxBin(params: { + env: NodeJS.ProcessEnv; + explicit?: string; + repoRoot: string; +}) { + const configured = trimToValue(params.explicit) ?? trimToValue(params.env[CRABBOX_BIN_ENV]); + if (configured) { + return configured; + } + const sibling = path.resolve(params.repoRoot, "../crabbox/bin/crabbox"); + if (await pathExists(sibling)) { + return sibling; + } + return "crabbox"; +} + +function extractLeaseId(output: string) { + return output.match(/\b(?:cbx_[a-f0-9]+|tbx_[A-Za-z0-9_-]+)\b/u)?.[0]; +} + +function normalizeVisionMode(value: string | undefined): MantisVisualTaskVisionMode { + const normalized = trimToValue(value); + if (normalized === undefined || normalized === "image-describe") { + return "image-describe"; + } + if (normalized === "metadata") { + return "metadata"; + } + throw new Error(`Unsupported Mantis visual-task vision mode: ${normalized}`); +} + +function defaultVisionPrompt(expectText: string | undefined) { + if (expectText) { + return `Inspect this UI screenshot and determine whether the exact text "${expectText}" is visibly present.`; + } + return "Inspect this UI screenshot and describe the visible page state in one concise sentence."; +} + +function buildVisionPrompt(prompt: string | undefined, expectText: string | undefined) { + const base = trimToValue(prompt) ?? 
defaultVisionPrompt(expectText); + if (!expectText) { + return base; + } + if (base.includes("Visual assertion contract:")) { + return base; + } + return `${base}\n\nVisual assertion contract: return only valid JSON: {"visible": boolean, "evidence": string, "reason": string}. Set visible=true only when the exact text "${expectText}" is actually visible in the screenshot; text quoted in the prompt or a negative statement is not evidence.`; +} + +async function runCommand(params: { + args: readonly string[]; + command: string; + cwd: string; + env: NodeJS.ProcessEnv; + runner: CommandRunner; + stdio?: "inherit" | "pipe"; +}) { + return params.runner(params.command, params.args, { + cwd: params.cwd, + env: params.env, + stdio: params.stdio ?? "pipe", + }); +} + +async function warmupCrabbox(params: { + crabboxBin: string; + cwd: string; + env: NodeJS.ProcessEnv; + idleTimeout: string; + machineClass: string; + provider: string; + runner: CommandRunner; + ttl: string; +}) { + const result = await runCommand({ + command: params.crabboxBin, + args: [ + "warmup", + "--provider", + params.provider, + "--desktop", + "--browser", + "--class", + params.machineClass, + "--idle-timeout", + params.idleTimeout, + "--ttl", + params.ttl, + ], + cwd: params.cwd, + env: params.env, + runner: params.runner, + stdio: "inherit", + }); + const leaseId = extractLeaseId(`${result.stdout}\n${result.stderr}`); + if (!leaseId) { + throw new Error("Crabbox warmup did not print a lease id."); + } + return leaseId; +} + +async function inspectCrabbox(params: { + crabboxBin: string; + cwd: string; + env: NodeJS.ProcessEnv; + leaseId: string; + provider: string; + runner: CommandRunner; +}) { + const result = await runCommand({ + command: params.crabboxBin, + args: ["inspect", "--provider", params.provider, "--id", params.leaseId, "--json"], + cwd: params.cwd, + env: params.env, + runner: params.runner, + }); + return JSON.parse(result.stdout) as CrabboxInspect; +} + +async function 
stopCrabbox(params: { + crabboxBin: string; + cwd: string; + env: NodeJS.ProcessEnv; + leaseId: string; + provider: string; + runner: CommandRunner; +}) { + await runCommand({ + command: params.crabboxBin, + args: ["stop", "--provider", params.provider, params.leaseId], + cwd: params.cwd, + env: params.env, + runner: params.runner, + stdio: "inherit", + }); +} + +function buildVisualDriverArgs(params: { + browserUrl: string; + crabboxBin: string; + expectText?: string; + leaseId: string; + outputDir: string; + provider: string; + repoRoot: string; + settleMs: number; + visionMode: MantisVisualTaskVisionMode; + visionModel?: string; + visionPrompt: string; + visionTimeoutMs: number; +}) { + const args = [ + "--dir", + params.repoRoot, + "openclaw", + "qa", + "mantis", + "visual-driver", + "--repo-root", + params.repoRoot, + "--output-dir", + params.outputDir, + "--crabbox-bin", + params.crabboxBin, + "--provider", + params.provider, + "--lease-id", + params.leaseId, + "--browser-url", + params.browserUrl, + "--settle-ms", + String(params.settleMs), + "--vision-mode", + params.visionMode, + "--vision-prompt", + params.visionPrompt, + "--vision-timeout-ms", + String(params.visionTimeoutMs), + ]; + if (params.expectText) { + args.push("--expect-text", params.expectText); + } + if (params.visionModel) { + args.push("--vision-model", params.visionModel); + } + return args; +} + +function parseImageDescribeText(stdout: string) { + const parsed = parseJsonObjectFromText( + stdout, + (value): value is { outputs?: Array<{ text?: unknown }> } => + Boolean( + value && + typeof value === "object" && + Array.isArray((value as { outputs?: unknown }).outputs), + ), + ); + if (!parsed) { + throw new Error("Image describe did not return a JSON envelope with outputs."); + } + const text = parsed.outputs?.find((output) => typeof output.text === "string")?.text; + if (typeof text !== "string" || text.trim().length === 0) { + throw new Error("Image describe did not return output 
text."); + } + return text; +} + +function parseJsonObjectFromText(text: string, accepts: (value: unknown) => value is T) { + const starts = [...text.matchAll(/\{/gu)] + .map((match) => match.index) + .filter((index) => index !== undefined); + const ends = [...text.matchAll(/\}/gu)] + .map((match) => match.index) + .filter((index) => index !== undefined); + for (const start of starts) { + for (const end of ends.toReversed()) { + if (end < start) { + continue; + } + try { + const parsed = JSON.parse(text.slice(start, end + 1)) as unknown; + if (accepts(parsed)) { + return parsed; + } + } catch { + // Keep scanning: command wrappers can echo prompt schemas before the real JSON. + } + } + } + return undefined; +} + +function parseVisionAssertion(text: string, expectText: string): VisionAssertion { + const parsed = parseJsonObjectFromText(text, (value): value is Record => + Boolean(value && typeof value === "object" && "visible" in value), + ); + if (!parsed) { + return { + expectedText: expectText, + matched: false, + reason: "Image describe did not return a structured visual assertion.", + }; + } + const record = parsed; + const visible = record.visible; + const evidence = typeof record.evidence === "string" ? record.evidence.trim() : undefined; + const reason = typeof record.reason === "string" ? record.reason.trim() : undefined; + if (typeof visible !== "boolean") { + return { + evidence, + expectedText: expectText, + matched: false, + reason: reason ?? "Image describe visual assertion is missing boolean visible.", + }; + } + const normalizedExpected = expectText.toLowerCase(); + const positiveEvidence = [evidence, reason] + .filter((value): value is string => Boolean(value)) + .some((value) => value.toLowerCase().includes(normalizedExpected)); + return { + evidence, + expectedText: expectText, + matched: visible && Boolean(evidence) && positiveEvidence, + reason: positiveEvidence + ? reason + : (reason ?? 
`Visual assertion did not cite the expected text "${expectText}".`), + visible, + }; +} + +function evaluateVisualExpectation(text: string | undefined, expectText: string | undefined) { + if (!expectText) { + return { matched: true }; + } + if (!text) { + return { + assertion: { + expectedText: expectText, + matched: false, + reason: "Image describe did not return text.", + }, + matched: false, + }; + } + const assertion = parseVisionAssertion(text, expectText); + return { assertion, matched: assertion.matched }; +} + +function browserLaunchScript() { + return [ + 'browser="${BROWSER:-${CHROME_BIN:-google-chrome}}"', + 'profile="${TMPDIR:-/tmp}/openclaw-mantis-visual-chrome-profile"', + 'mkdir -p "$profile"', + 'exec "$browser" --user-data-dir="$profile" --no-first-run --no-default-browser-check --disable-default-apps --disable-dev-shm-usage --window-size=1280,900 --window-position=0,0 "$0"', + ].join("; "); +} + +function renderReport(summary: MantisVisualTaskSummary) { + const lines = [ + "# Mantis Visual Task", + "", + `Status: ${summary.status}`, + `Browser URL: ${summary.browserUrl}`, + `Vision mode: ${summary.visionMode}`, + `Output: ${summary.outputDir}`, + `Started: ${summary.startedAt}`, + `Finished: ${summary.finishedAt}`, + "", + "## Crabbox", + "", + `- Provider: ${summary.crabbox.provider}`, + `- Lease: ${summary.crabbox.id}${summary.crabbox.slug ? ` (${summary.crabbox.slug})` : ""}`, + `- Created by run: ${summary.crabbox.createdLease}`, + `- State: ${summary.crabbox.state ?? "unknown"}`, + `- VNC: \`${summary.crabbox.vncCommand}\``, + "", + "## Artifacts", + "", + summary.artifacts.screenshotPath + ? `- Screenshot: \`${path.basename(summary.artifacts.screenshotPath)}\`` + : "- Screenshot: missing", + summary.artifacts.videoPath + ? `- Video: \`${path.basename(summary.artifacts.videoPath)}\`` + : "- Video: missing", + `- Driver result: \`${path.basename(summary.artifacts.driverResultPath)}\``, + "", + "## Vision", + "", + summary.driver?.vision.text ? 
summary.driver.vision.text : "No vision text recorded.", + summary.driver?.expectText ? `Expected text: ${summary.driver.expectText}` : undefined, + summary.driver?.vision.assertion?.visible !== undefined + ? `Visible: ${summary.driver.vision.assertion.visible}` + : undefined, + summary.driver?.vision.assertion?.evidence + ? `Evidence: ${summary.driver.vision.assertion.evidence}` + : undefined, + summary.driver?.vision.assertion?.reason + ? `Reason: ${summary.driver.vision.assertion.reason}` + : undefined, + summary.driver?.matched !== undefined ? `Matched: ${summary.driver.matched}` : undefined, + summary.recording.error ? `Recording error: ${summary.recording.error}` : undefined, + summary.error ? `Error: ${summary.error}` : undefined, + "", + ].filter((line) => line !== undefined); + return `${lines.join("\n")}\n`; +}

/**
 * Runs one visual check against an already-leased Crabbox desktop.
 *
 * Steps, in order:
 *  1. `crabbox desktop launch --browser --url <browserUrl>` on the lease.
 *  2. Wait `opts.settleMs` (default `DEFAULT_SETTLE_MS`) milliseconds.
 *  3. `crabbox screenshot --output <outputDir>/visual-task.png`.
 *  4. When the normalized vision mode is `"image-describe"`, run
 *     `pnpm --dir <repoRoot> openclaw infer image describe ... --json` on the
 *     screenshot and parse its stdout with `parseImageDescribeText`.
 *  5. Evaluate the (possibly undefined) vision text against `opts.expectText`.
 *
 * The result is always written to
 * `<outputDir>/mantis-visual-task-driver-result.json` and returned. Errors
 * raised by steps 1-5 are not rethrown; they produce a result with
 * `status: "fail"`, `matched: false` and `error` set.
 *
 * Resolution order for the provider: `opts.provider`,
 * `CRABBOX_RECORD_PROVIDER`, `env[CRABBOX_PROVIDER_ENV]`, `DEFAULT_PROVIDER`.
 * Resolution order for the lease: `opts.leaseId`, `CRABBOX_RECORD_LEASE_ID`,
 * `env[CRABBOX_LEASE_ID_ENV]`.
 *
 * @param opts Driver options; every field is optional.
 * @returns The driver result that was written to disk.
 * @throws Error when no lease id can be resolved. Failures while preparing the
 *   output directory, resolving the Crabbox binary, or writing the result file
 *   also propagate, because they happen outside the guarded section.
 */
export async function runMantisVisualDriver(
  opts: MantisVisualDriverOptions = {},
): Promise<MantisVisualDriverResult> {
  const env = opts.env ?? process.env;
  const startedAt = new Date();
  const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
  const outputDir = await ensureRepoBoundDirectory(
    repoRoot,
    resolveMantisOutputDir(repoRoot, opts.outputDir, startedAt),
    "Mantis visual driver output directory",
    { mode: 0o755 },
  );
  const resultPath = path.join(outputDir, "mantis-visual-task-driver-result.json");
  const screenshotPath = path.join(outputDir, "visual-task.png");
  const crabboxBin = await resolveCrabboxBin({ env, explicit: opts.crabboxBin, repoRoot });
  const provider =
    trimToValue(opts.provider) ??
    trimToValue(env.CRABBOX_RECORD_PROVIDER) ??
    trimToValue(env[CRABBOX_PROVIDER_ENV]) ??
    DEFAULT_PROVIDER;
  const leaseId =
    trimToValue(opts.leaseId) ??
    trimToValue(env.CRABBOX_RECORD_LEASE_ID) ??
    trimToValue(env[CRABBOX_LEASE_ID_ENV]);
  if (!leaseId) {
    throw new Error("Mantis visual-driver needs --lease-id or CRABBOX_RECORD_LEASE_ID.");
  }
  const browserUrl = trimToValue(opts.browserUrl) ?? DEFAULT_BROWSER_URL;
  const visionMode = normalizeVisionMode(opts.visionMode);
  const expectText = trimToValue(opts.expectText);
  const visionPrompt = buildVisionPrompt(opts.visionPrompt, expectText);
  const visionTimeoutMs = opts.visionTimeoutMs ?? DEFAULT_VISION_TIMEOUT_MS;
  // Resolved once; used for the `--model` flag and for both result shapes.
  const visionModel = trimToValue(opts.visionModel);
  const runner = opts.commandRunner ?? defaultCommandRunner;
  // Options shared by every child command started below.
  const commandContext = { cwd: repoRoot, env, runner };
  let result: MantisVisualDriverResult;

  try {
    await runCommand({
      command: crabboxBin,
      args: [
        "desktop",
        "launch",
        "--provider",
        provider,
        "--id",
        leaseId,
        "--browser",
        "--url",
        browserUrl,
        "--reclaim",
        "--",
        "sh",
        "-lc",
        browserLaunchScript(),
      ],
      ...commandContext,
      stdio: "inherit",
    });
    // Give the page time to render before the screenshot is taken.
    await new Promise((resolve) => setTimeout(resolve, opts.settleMs ?? DEFAULT_SETTLE_MS));
    await runCommand({
      command: crabboxBin,
      args: [
        "screenshot",
        "--provider",
        provider,
        "--id",
        leaseId,
        "--output",
        screenshotPath,
        "--reclaim",
      ],
      ...commandContext,
      stdio: "inherit",
    });

    let visionText: string | undefined;
    if (visionMode === "image-describe") {
      const imageArgs = [
        "openclaw",
        "infer",
        "image",
        "describe",
        "--file",
        screenshotPath,
        "--prompt",
        visionPrompt,
        "--timeout-ms",
        String(visionTimeoutMs),
        "--json",
      ];
      if (visionModel) {
        imageArgs.push("--model", visionModel);
      }
      // No `stdio: "inherit"` here: stdout is needed to parse the description.
      const described = await runCommand({
        command: "pnpm",
        args: ["--dir", repoRoot, ...imageArgs],
        ...commandContext,
      });
      visionText = parseImageDescribeText(described.stdout);
    }

    const { assertion, matched } = evaluateVisualExpectation(visionText, expectText);
    result = {
      browserUrl,
      expectText,
      finishedAt: new Date().toISOString(),
      matched,
      outputDir,
      screenshotPath,
      startedAt: startedAt.toISOString(),
      status: matched ? "pass" : "fail",
      vision: {
        assertion,
        mode: visionMode,
        model: visionModel,
        prompt: visionPrompt,
        text: visionText,
        timeoutMs: visionTimeoutMs,
      },
    };
  } catch (error) {
    // Any command/parse failure becomes a recorded failure instead of a throw,
    // so a result file is still produced for whoever invoked the driver.
    result = {
      browserUrl,
      error: formatErrorMessage(error),
      expectText,
      finishedAt: new Date().toISOString(),
      matched: false,
      outputDir,
      screenshotPath,
      startedAt: startedAt.toISOString(),
      status: "fail",
      vision: {
        mode: visionMode,
        model: visionModel,
        prompt: visionPrompt,
        timeoutMs: visionTimeoutMs,
      },
    };
  }
  await fs.writeFile(resultPath, `${JSON.stringify(result, null, 2)}\n`, "utf8");
  return result;
}

/**
 * Runs a recorded Mantis visual task end to end.
 *
 * Flow:
 *  1. Use the lease from `opts.leaseId` / `env[CRABBOX_LEASE_ID_ENV]`, or warm
 *     up a new one via `warmupCrabbox`, then inspect it.
 *  2. Run `crabbox record ... --output <outputDir>/visual-task.mp4 --while --
 *     pnpm <driver args>`; the driver command comes from
 *     `buildVisualDriverArgs`.
 *  3. Read `<outputDir>/mantis-visual-task-driver-result.json`. If the record
 *     command failed but that file exists, the failure is kept as a recording
 *     error instead of aborting the run.
 *  4. The run passes only when the driver reported `"pass"` and a non-empty
 *     `visual-task.mp4` was produced without a recording error.
 *
 * Errors raised inside the guarded section are not rethrown: the function
 * writes `<outputDir>/error.txt` and returns `status: "fail"`. Whenever a
 * summary was built, `mantis-visual-task-summary.json` and
 * `mantis-visual-task-report.md` are written before returning.
 *
 * The lease is stopped only when the run passed, the lease was created by this
 * call, and `keepLease` (option, else `env[CRABBOX_KEEP_ENV]` opt-in) is not
 * set. Failed runs keep their lease — presumably so the desktop can still be
 * inspected through the recorded VNC command; confirm with the CLI docs.
 *
 * NOTE(review): the declared return type below reads `Promise` without a type
 * argument; the argument appears to have been lost in extraction and the
 * result type name is not visible here — restore it from the original file.
 *
 * @param opts Task options; every field is optional.
 * @returns Output directory, report/summary paths, status, and the screenshot
 *   and video paths when those artifacts exist.
 * @throws Only for failures outside the guarded section (output directory
 *   setup, Crabbox binary resolution) or while writing the error file, the
 *   summary/report, or stopping the lease.
 */
export async function runMantisVisualTask(
  opts: MantisVisualTaskOptions = {},
): Promise {
  const env = opts.env ?? process.env;
  const startedAt = (opts.now ?? (() => new Date()))();
  const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
  const outputDir = await ensureRepoBoundDirectory(
    repoRoot,
    resolveMantisOutputDir(repoRoot, opts.outputDir, startedAt),
    "Mantis visual task output directory",
    { mode: 0o755 },
  );
  const summaryPath = path.join(outputDir, "mantis-visual-task-summary.json");
  const reportPath = path.join(outputDir, "mantis-visual-task-report.md");
  const driverResultPath = path.join(outputDir, "mantis-visual-task-driver-result.json");
  const screenshotPath = path.join(outputDir, "visual-task.png");
  const videoPath = path.join(outputDir, "visual-task.mp4");
  const crabboxBin = await resolveCrabboxBin({ env, explicit: opts.crabboxBin, repoRoot });
  const provider =
    trimToValue(opts.provider) ?? trimToValue(env[CRABBOX_PROVIDER_ENV]) ?? DEFAULT_PROVIDER;
  const machineClass =
    trimToValue(opts.machineClass) ?? trimToValue(env[CRABBOX_CLASS_ENV]) ?? DEFAULT_CLASS;
  const idleTimeout =
    trimToValue(opts.idleTimeout) ??
    trimToValue(env[CRABBOX_IDLE_TIMEOUT_ENV]) ??
    DEFAULT_IDLE_TIMEOUT;
  const ttl = trimToValue(opts.ttl) ?? trimToValue(env[CRABBOX_TTL_ENV]) ?? DEFAULT_TTL;
  const explicitLeaseId = trimToValue(opts.leaseId) ?? trimToValue(env[CRABBOX_LEASE_ID_ENV]);
  const keepLease = opts.keepLease ?? isTruthyOptIn(env[CRABBOX_KEEP_ENV]);
  // A lease counts as "created" by this run whenever none was supplied.
  const createdLease = explicitLeaseId === undefined;
  const browserUrl = trimToValue(opts.browserUrl) ?? DEFAULT_BROWSER_URL;
  const expectText = trimToValue(opts.expectText);
  const visionMode = normalizeVisionMode(opts.visionMode);
  const visionPrompt = buildVisionPrompt(opts.visionPrompt, expectText);
  const runner = opts.commandRunner ?? defaultCommandRunner;
  let leaseId = explicitLeaseId;
  let inspected: CrabboxInspect = {};
  let summary: MantisVisualTaskSummary | undefined;

  try {
    leaseId =
      leaseId ??
      (await warmupCrabbox({
        crabboxBin,
        cwd: repoRoot,
        env,
        idleTimeout,
        machineClass,
        provider,
        runner,
        ttl,
      }));
    inspected = await inspectCrabbox({
      crabboxBin,
      cwd: repoRoot,
      env,
      leaseId,
      provider,
      runner,
    });

    let recordingError: string | undefined;
    try {
      await runCommand({
        command: crabboxBin,
        args: [
          "record",
          "--provider",
          provider,
          "--id",
          leaseId,
          "--duration",
          trimToValue(opts.duration) ?? DEFAULT_DURATION,
          "--output",
          videoPath,
          "--while",
          "--",
          "pnpm",
          ...buildVisualDriverArgs({
            browserUrl,
            crabboxBin,
            expectText,
            leaseId,
            outputDir,
            provider,
            repoRoot,
            settleMs: opts.settleMs ?? DEFAULT_SETTLE_MS,
            visionMode,
            visionModel: trimToValue(opts.visionModel),
            visionPrompt,
            visionTimeoutMs: opts.visionTimeoutMs ?? DEFAULT_VISION_TIMEOUT_MS,
          }),
        ],
        cwd: repoRoot,
        env,
        runner,
        stdio: "inherit",
      });
    } catch (error) {
      // Without a driver result there is nothing to report on: abort the run.
      // With one, keep going and surface the failure as a recording error.
      if (!(await pathExists(driverResultPath))) {
        throw error;
      }
      recordingError = formatErrorMessage(error);
    }

    const driver = JSON.parse(
      await fs.readFile(driverResultPath, "utf8"),
    ) as MantisVisualDriverResult;
    const copiedScreenshot = (await pathExists(screenshotPath)) ? screenshotPath : undefined;
    // An empty mp4 does not count as a captured video.
    const copiedVideo = (await nonEmptyFileExists(videoPath)) ? videoPath : undefined;
    const recordingFailure =
      recordingError ??
      (copiedVideo ? undefined : "Mantis visual task recording did not produce visual-task.mp4.");
    const status = driver.status === "pass" && !recordingFailure ? "pass" : "fail";
    summary = {
      artifacts: {
        driverResultPath,
        reportPath,
        screenshotPath: copiedScreenshot,
        summaryPath,
        videoPath: copiedVideo,
      },
      browserUrl,
      crabbox: {
        bin: crabboxBin,
        createdLease,
        id: leaseId,
        provider,
        slug: inspected.slug,
        state: inspected.state,
        vncCommand: `${crabboxBin} vnc --provider ${provider} --id ${leaseId} --open`,
      },
      driver,
      error: recordingFailure,
      finishedAt: new Date().toISOString(),
      outputDir,
      recording: {
        error: recordingFailure,
        required: true,
      },
      startedAt: startedAt.toISOString(),
      status,
      visionMode,
    };
    return {
      outputDir,
      reportPath,
      screenshotPath: copiedScreenshot,
      status,
      summaryPath,
      videoPath: copiedVideo,
    };
  } catch (error) {
    // Use the same "non-empty file" rule as the success path, and derive both
    // the artifact path and the recording error from one check so the summary
    // can never list a video while also reporting it as missing.
    const capturedVideo = (await nonEmptyFileExists(videoPath)) ? videoPath : undefined;
    summary = {
      artifacts: {
        driverResultPath,
        reportPath,
        summaryPath,
        videoPath: capturedVideo,
      },
      browserUrl,
      crabbox: {
        bin: crabboxBin,
        createdLease,
        id: leaseId ?? "unallocated",
        provider,
        slug: inspected.slug,
        state: inspected.state,
        vncCommand: leaseId
          ? `${crabboxBin} vnc --provider ${provider} --id ${leaseId} --open`
          : "unallocated",
      },
      error: formatErrorMessage(error),
      finishedAt: new Date().toISOString(),
      outputDir,
      recording: {
        error: capturedVideo ? undefined : "visual-task.mp4 missing",
        required: true,
      },
      startedAt: startedAt.toISOString(),
      status: "fail",
      visionMode,
    };
    await fs.writeFile(path.join(outputDir, "error.txt"), `${summary.error}\n`, "utf8");
    return {
      outputDir,
      reportPath,
      status: "fail",
      summaryPath,
      videoPath: summary.artifacts.videoPath,
    };
  } finally {
    try {
      if (summary) {
        // Stamp the time at which the artifacts are actually written.
        summary.finishedAt = new Date().toISOString();
        await fs.writeFile(summaryPath, `${JSON.stringify(summary, null, 2)}\n`, "utf8");
        await fs.writeFile(reportPath, renderReport(summary), "utf8");
      }
    } finally {
      // Nested so that a failed summary/report write cannot leak a lease that
      // this run created; the write error still propagates afterwards.
      if (summary?.status === "pass" && createdLease && leaseId && !keepLease) {
        await stopCrabbox({ crabboxBin, cwd: repoRoot, env, leaseId, provider, runner });
      }
    }
  }
}