From e8810c04a46bb5b68bc441e425be77e8212309ea Mon Sep 17 00:00:00 2001
From: Ayaan Zaidi
Date: Fri, 1 May 2026 11:16:04 +0530
Subject: [PATCH] feat(rtt): add published package measurement harness

---
 .gitignore                                 |   2 +
 package.json                               |   1 +
 scripts/lib/rtt-harness.ts                 | 215 ++++++++++++++++++++
 scripts/rtt.ts                             | 216 +++++++++++++++++++++
 test/fixtures/telegram-qa-summary-rtt.json |  31 +++
 test/scripts/rtt-harness.test.ts           | 172 ++++++++++++++++
 6 files changed, 637 insertions(+)
 create mode 100644 scripts/lib/rtt-harness.ts
 create mode 100644 scripts/rtt.ts
 create mode 100644 test/fixtures/telegram-qa-summary-rtt.json
 create mode 100644 test/scripts/rtt-harness.test.ts

diff --git a/.gitignore b/.gitignore
index 20096c48fae..b3a56ddb49b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -188,6 +188,8 @@ changelog/fragments/
 test/fixtures/openclaw-vitest-unit-report.json
 analysis/
 .artifacts/qa-e2e/
+/runs/
+/data/rtt.jsonl
 extensions/qa-lab/web/dist/
 
 # Generated bundled plugin runtime dependency manifests
diff --git a/package.json b/package.json
index 4737f482cf3..eec39ace283 100644
--- a/package.json
+++ b/package.json
@@ -1439,6 +1439,7 @@
     "release:plugins:clawhub:plan": "node --import tsx scripts/plugin-clawhub-release-plan.ts",
     "release:plugins:npm:check": "node --import tsx scripts/plugin-npm-release-check.ts",
     "release:plugins:npm:plan": "node --import tsx scripts/plugin-npm-release-plan.ts",
+    "rtt": "node --import tsx scripts/rtt.ts",
     "runtime-sidecars:check": "node --import tsx scripts/generate-runtime-sidecar-paths-baseline.ts --check",
     "runtime-sidecars:gen": "node --import tsx scripts/generate-runtime-sidecar-paths-baseline.ts --write",
     "stage:bundled-plugin-runtime-deps": "node scripts/stage-bundled-plugin-runtime-deps.mjs",
diff --git a/scripts/lib/rtt-harness.ts b/scripts/lib/rtt-harness.ts
new file mode 100644
index 00000000000..822da9d1c89
--- /dev/null
+++ b/scripts/lib/rtt-harness.ts
@@ -0,0 +1,215 @@
+import { execFile, spawn } from "node:child_process";
+import fs from "node:fs/promises";
+import path from "node:path";
+import { promisify } from "node:util";
+
+const execFileAsync = promisify(execFile);
+
+export type RttProviderMode = "mock-openai" | "live-frontier";
+
+export type RttCliOptions = {
+  providerMode: RttProviderMode;
+  runs: number;
+  harnessRoot: string;
+  output: string;
+  scenarios: string[];
+  timeoutMs: number;
+};
+
+export type RttResult = {
+  package: {
+    spec: string;
+    version: string;
+  };
+  run: {
+    id: string;
+    startedAt: string;
+    finishedAt: string;
+    durationMs: number;
+    status: "pass" | "fail";
+  };
+  mode: {
+    providerMode: RttProviderMode;
+    scenarios: string[];
+  };
+  rtt: {
+    canaryMs?: number;
+    mentionReplyMs?: number;
+  };
+  artifacts: {
+    rawSummaryPath: string;
+    rawReportPath: string;
+    rawObservedMessagesPath: string;
+    resultPath: string;
+  };
+};
+
+export type TelegramQaSummary = {
+  scenarios?: Array<{
+    id?: string;
+    rttMs?: number;
+    status?: string;
+  }>;
+};
+
+const OPENCLAW_PACKAGE_SPEC_RE =
+  /^openclaw@(beta|latest|[0-9]{4}\.[1-9][0-9]*\.[1-9][0-9]*(-[1-9][0-9]*|-beta\.[1-9][0-9]*)?)$/u;
+
+const REQUIRED_TELEGRAM_ENV = [
+  "OPENCLAW_QA_TELEGRAM_GROUP_ID",
+  "OPENCLAW_QA_TELEGRAM_DRIVER_BOT_TOKEN",
+  "OPENCLAW_QA_TELEGRAM_SUT_BOT_TOKEN",
+] as const;
+
+export function validateOpenClawPackageSpec(spec: string) {
+  if (!OPENCLAW_PACKAGE_SPEC_RE.test(spec)) {
+    throw new Error(
+      `Package spec must be openclaw@beta, openclaw@latest, or an exact OpenClaw release version; got: ${spec}`,
+    );
+  }
+  return spec;
+}
+
+export function safeRunLabel(input: string) {
+  return input.replace(/[^a-zA-Z0-9.-]+/gu, "_").replace(/^_+|_+$/gu, "");
+}
+
+export function buildRunId(params: { now: Date; spec: string; index?: number }) {
+  const stamp = params.now.toISOString().replaceAll(":", "").replaceAll(".", "");
+  const suffix = params.index === undefined ? "" : `-${params.index + 1}`;
+  return `${stamp}-${safeRunLabel(params.spec)}${suffix}`;
+}
+
+export function extractRtt(summary: TelegramQaSummary) {
+  const scenarios = summary.scenarios ?? [];
+  return {
+    canaryMs: scenarios.find((scenario) => scenario.id === "telegram-canary")?.rttMs,
+    mentionReplyMs: scenarios.find((scenario) => scenario.id === "telegram-mentioned-message-reply")
+      ?.rttMs,
+  };
+}
+
+export function createHarnessEnv(params: {
+  baseEnv: NodeJS.ProcessEnv;
+  providerMode: RttProviderMode;
+  scenarios: string[];
+  spec: string;
+  version: string;
+  rawOutputDir: string;
+  timeoutMs: number;
+}) {
+  return {
+    ...params.baseEnv,
+    OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC: params.spec,
+    OPENCLAW_NPM_TELEGRAM_PACKAGE_LABEL: `${params.spec} (${params.version})`,
+    OPENCLAW_NPM_TELEGRAM_PROVIDER_MODE: params.providerMode,
+    OPENCLAW_NPM_TELEGRAM_SCENARIOS: params.scenarios.join(","),
+    OPENCLAW_NPM_TELEGRAM_SKIP_HOTPATH: "1",
+    OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR: params.rawOutputDir,
+    OPENCLAW_NPM_TELEGRAM_FAST: params.baseEnv.OPENCLAW_NPM_TELEGRAM_FAST ?? "1",
+    OPENCLAW_QA_TELEGRAM_CANARY_TIMEOUT_MS: String(params.timeoutMs),
+    OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS: String(params.timeoutMs),
+  };
+}
+
+export function assertRequiredEnv(env: NodeJS.ProcessEnv) {
+  const missing = REQUIRED_TELEGRAM_ENV.filter((key) => !env[key]?.trim());
+  if (missing.length > 0) {
+    throw new Error(`Missing Telegram QA env: ${missing.join(", ")}`);
+  }
+}
+
+export async function assertHarnessRoot(harnessRoot: string) {
+  const scriptPath = path.join(harnessRoot, "scripts/e2e/npm-telegram-live-docker.sh");
+  try {
+    await fs.access(scriptPath);
+  } catch {
+    throw new Error(`Missing OpenClaw Telegram npm harness: ${scriptPath}`);
+  }
+}
+
+export async function assertDockerAvailable() {
+  try {
+    await execFileAsync("docker", ["version", "--format", "{{.Server.Version}}"], {
+      timeout: 10_000,
+    });
+  } catch {
+    throw new Error("Docker is required for RTT runs; install/start Docker and retry.");
+  }
+}
+
+export async function resolvePublishedVersion(spec: string) {
+  const { stdout } = await execFileAsync("npm", ["view", spec, "version", "--json"], {
+    timeout: 30_000,
+  });
+  const parsed = JSON.parse(stdout.trim()) as unknown;
+  if (typeof parsed !== "string" || parsed.trim().length === 0) {
+    throw new Error(`npm did not return a version for ${spec}.`);
+  }
+  return parsed.trim();
+}
+
+export async function readTelegramSummary(summaryPath: string) {
+  return JSON.parse(await fs.readFile(summaryPath, "utf8")) as TelegramQaSummary;
+}
+
+export async function writeJson(pathname: string, value: unknown) {
+  await fs.mkdir(path.dirname(pathname), { recursive: true });
+  await fs.writeFile(pathname, `${JSON.stringify(value, null, 2)}\n`);
+}
+
+export async function appendJsonl(pathname: string, value: unknown) {
+  await fs.mkdir(path.dirname(pathname), { recursive: true });
+  await fs.appendFile(pathname, `${JSON.stringify(value)}\n`);
+}
+
+export async function runHarness(params: { env: NodeJS.ProcessEnv; harnessRoot: string }) {
+  const scriptPath = path.join(params.harnessRoot, "scripts/e2e/npm-telegram-live-docker.sh");
+  const child = spawn("bash", [scriptPath], {
+    cwd: params.harnessRoot,
+    env: params.env,
+    stdio: "inherit",
+  });
+  const exitCode = await new Promise<number | null>((resolve, reject) => {
+    child.once("error", reject);
+    child.once("exit", resolve);
+  });
+  return exitCode ?? 1;
+}
+
+export function buildRttResult(params: {
+  artifacts: RttResult["artifacts"];
+  finishedAt: Date;
+  providerMode: RttProviderMode;
+  rawSummary: TelegramQaSummary;
+  runId: string;
+  scenarios: string[];
+  spec: string;
+  startedAt: Date;
+  version: string;
+}): RttResult {
+  const failed = (params.rawSummary.scenarios ?? []).some((scenario) => scenario.status === "fail");
+  return {
+    package: {
+      spec: params.spec,
+      version: params.version,
+    },
+    run: {
+      id: params.runId,
+      startedAt: params.startedAt.toISOString(),
+      finishedAt: params.finishedAt.toISOString(),
+      durationMs: params.finishedAt.getTime() - params.startedAt.getTime(),
+      status: failed ? "fail" : "pass",
+    },
+    mode: {
+      providerMode: params.providerMode,
+      scenarios: params.scenarios,
+    },
+    rtt: extractRtt(params.rawSummary),
+    artifacts: params.artifacts,
+  };
+}
+
+export const __testing = {
+  REQUIRED_TELEGRAM_ENV,
+};
diff --git a/scripts/rtt.ts b/scripts/rtt.ts
new file mode 100644
index 00000000000..5ae096ab3b8
--- /dev/null
+++ b/scripts/rtt.ts
@@ -0,0 +1,216 @@
+#!/usr/bin/env -S node --import tsx
+import fs from "node:fs/promises";
+import path from "node:path";
+import {
+  appendJsonl,
+  assertDockerAvailable,
+  assertHarnessRoot,
+  assertRequiredEnv,
+  buildRttResult,
+  buildRunId,
+  createHarnessEnv,
+  readTelegramSummary,
+  resolvePublishedVersion,
+  runHarness,
+  validateOpenClawPackageSpec,
+  writeJson,
+  type RttProviderMode,
+} from "./lib/rtt-harness.ts";
+
+const DEFAULT_SCENARIOS = ["telegram-mentioned-message-reply"];
+const DEFAULT_PROVIDER_MODE = "mock-openai" satisfies RttProviderMode;
+const DEFAULT_TIMEOUT_MS = 180_000;
+
+function usage() {
+  return [
+    "Usage: pnpm rtt <package-spec> [--provider mock-openai|live-frontier] [--runs N] [--timeout-ms N] [--harness-root PATH] [--output PATH]",
+    "",
+    "Examples:",
+    " pnpm rtt openclaw@beta",
+    " pnpm rtt openclaw@2026.4.30",
+    " pnpm rtt openclaw@latest --provider live-frontier",
+  ].join("\n");
+}
+
+function parseProviderMode(value: string): RttProviderMode {
+  if (value === "mock-openai" || value === "live-frontier") {
+    return value;
+  }
+  throw new Error(`--provider must be mock-openai or live-frontier; got: ${value}`);
+}
+
+function parsePositiveInt(label: string, value: string) {
+  const parsed = Number(value);
+  if (!Number.isInteger(parsed) || parsed < 1) {
+    throw new Error(`${label} must be a positive integer; got: ${value}`);
+  }
+  return parsed;
+}
+
+function resolveHome(input: string) {
+  if (input === "~") {
+    return process.env.HOME ?? input;
+  }
+  if (input.startsWith("~/")) {
+    return path.join(process.env.HOME ?? "~", input.slice(2));
+  }
+  return input;
+}
+
+function parseArgs(argv: string[]) {
+  let spec: string | undefined;
+  let providerMode = DEFAULT_PROVIDER_MODE;
+  let runs = 1;
+  let harnessRoot = "~/Developer/clawdbot";
+  let output = "runs";
+  let timeoutMs = DEFAULT_TIMEOUT_MS;
+
+  for (let index = 0; index < argv.length; index += 1) {
+    const arg = argv[index];
+    if (arg === "--help" || arg === "-h") {
+      process.stdout.write(`${usage()}\n`);
+      process.exit(0);
+    }
+    if (arg === "--provider") {
+      providerMode = parseProviderMode(argv[++index] ?? "");
+      continue;
+    }
+    if (arg === "--runs") {
+      runs = parsePositiveInt("--runs", argv[++index] ?? "");
+      continue;
+    }
+    if (arg === "--harness-root") {
+      harnessRoot = argv[++index] ?? "";
+      if (!harnessRoot.trim()) {
+        throw new Error("--harness-root requires a path.");
+      }
+      continue;
+    }
+    if (arg === "--timeout-ms") {
+      timeoutMs = parsePositiveInt("--timeout-ms", argv[++index] ?? "");
+      continue;
+    }
+    if (arg === "--output") {
+      output = argv[++index] ?? "";
+      if (!output.trim()) {
+        throw new Error("--output requires a path.");
+      }
+      continue;
+    }
+    if (arg.startsWith("--")) {
+      throw new Error(`Unknown option: ${arg}`);
+    }
+    if (spec) {
+      throw new Error(`Unexpected extra argument: ${arg}`);
+    }
+    spec = arg;
+  }
+
+  if (!spec) {
+    throw new Error(`Missing package spec.\n${usage()}`);
+  }
+
+  return {
+    spec: validateOpenClawPackageSpec(spec),
+    options: {
+      providerMode,
+      runs,
+      harnessRoot: path.resolve(resolveHome(harnessRoot)),
+      output: path.resolve(resolveHome(output)),
+      scenarios: DEFAULT_SCENARIOS,
+      timeoutMs,
+    },
+  };
+}
+
+async function runOne(params: {
+  index: number;
+  options: ReturnType<typeof parseArgs>["options"];
+  spec: string;
+  version: string;
+}) {
+  const runId = buildRunId({ now: new Date(), spec: params.spec, index: params.index });
+  const runDir = path.join(params.options.output, runId);
+  const rawDir = path.join(runDir, "raw");
+  const resultPath = path.join(runDir, "result.json");
+  const harnessRawDir = path.join(params.options.harnessRoot, ".artifacts/rtt", runId, "raw");
+  const rawOutputDir = path.relative(params.options.harnessRoot, harnessRawDir);
+  const startedAt = new Date();
+  const env = createHarnessEnv({
+    baseEnv: process.env,
+    providerMode: params.options.providerMode,
+    rawOutputDir,
+    scenarios: params.options.scenarios,
+    spec: params.spec,
+    timeoutMs: params.options.timeoutMs,
+    version: params.version,
+  });
+
+  process.stderr.write(`[rtt] run ${params.index + 1}/${params.options.runs}: ${params.spec}\n`);
+  const harnessExitCode = await runHarness({ env, harnessRoot: params.options.harnessRoot });
+  await readTelegramSummary(path.join(harnessRawDir, "telegram-qa-summary.json"));
+  await fs.rm(rawDir, { recursive: true, force: true });
+  await fs.mkdir(path.dirname(rawDir), { recursive: true });
+  await fs.cp(harnessRawDir, rawDir, { recursive: true });
+
+  const rawSummaryPath = path.join(rawDir, "telegram-qa-summary.json");
+  const rawReportPath = path.join(rawDir, "telegram-qa-report.md");
+  const rawObservedMessagesPath = path.join(rawDir, "telegram-qa-observed-messages.json");
+  const rawSummary = await readTelegramSummary(rawSummaryPath);
+  const finishedAt = new Date();
+  const result = buildRttResult({
+    artifacts: {
+      rawSummaryPath,
+      rawReportPath,
+      rawObservedMessagesPath,
+      resultPath,
+    },
+    finishedAt,
+    providerMode: params.options.providerMode,
+    rawSummary,
+    runId,
+    scenarios: params.options.scenarios,
+    spec: params.spec,
+    startedAt,
+    version: params.version,
+  });
+
+  await writeJson(resultPath, result);
+  await appendJsonl(path.resolve("data/rtt.jsonl"), result);
+  process.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
+  return {
+    harnessExitCode,
+    result,
+  };
+}
+
+async function main() {
+  const { spec, options } = parseArgs(process.argv.slice(2));
+  assertRequiredEnv(process.env);
+  await assertHarnessRoot(options.harnessRoot);
+  await assertDockerAvailable();
+  const version = await resolvePublishedVersion(spec);
+  let failed = false;
+  for (let index = 0; index < options.runs; index += 1) {
+    const run = await runOne({ index, options, spec, version });
+    failed = failed || run.harnessExitCode !== 0 || run.result.run.status === "fail";
+  }
+  if (failed) {
+    process.exitCode = 1;
+  }
+}
+
+if (import.meta.url === `file://${process.argv[1]}`) {
+  main().catch((error) => {
+    const message = error instanceof Error ? error.message : String(error);
+    process.stderr.write(`[rtt] ${message}\n`);
+    process.exitCode = 1;
+  });
+}
+
+export const __testing = {
+  parseArgs,
+  parseProviderMode,
+  parsePositiveInt,
+  resolveHome,
+};
diff --git a/test/fixtures/telegram-qa-summary-rtt.json b/test/fixtures/telegram-qa-summary-rtt.json
new file mode 100644
index 00000000000..36796aabb08
--- /dev/null
+++ b/test/fixtures/telegram-qa-summary-rtt.json
@@ -0,0 +1,31 @@
+{
+  "credentials": {
+    "kind": "telegram",
+    "source": "env"
+  },
+  "groupId": "-100123",
+  "startedAt": "2026-05-01T00:00:00.000Z",
+  "finishedAt": "2026-05-01T00:00:10.000Z",
+  "cleanupIssues": [],
+  "counts": {
+    "total": 2,
+    "passed": 2,
+    "failed": 0
+  },
+  "scenarios": [
+    {
+      "id": "telegram-canary",
+      "title": "Telegram canary",
+      "status": "pass",
+      "details": "reply matched in 1234ms",
+      "rttMs": 1234
+    },
+    {
+      "id": "telegram-mentioned-message-reply",
+      "title": "Telegram mentioned message gets a reply",
+      "status": "pass",
+      "details": "reply matched in 5678ms",
+      "rttMs": 5678
+    }
+  ]
+}
diff --git a/test/scripts/rtt-harness.test.ts b/test/scripts/rtt-harness.test.ts
new file mode 100644
index 00000000000..8643af3721e
--- /dev/null
+++ b/test/scripts/rtt-harness.test.ts
@@ -0,0 +1,172 @@
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+import { describe, expect, it } from "vitest";
+import {
+  appendJsonl,
+  buildRttResult,
+  buildRunId,
+  createHarnessEnv,
+  extractRtt,
+  readTelegramSummary,
+  safeRunLabel,
+  validateOpenClawPackageSpec,
+} from "../../scripts/lib/rtt-harness.ts";
+import { __testing as cliTesting } from "../../scripts/rtt.ts";
+
+const TEST_DIR = path.dirname(fileURLToPath(import.meta.url));
+const FIXTURE_PATH = path.resolve(TEST_DIR, "../fixtures/telegram-qa-summary-rtt.json");
+
+describe("RTT harness", () => {
+  it("validates published OpenClaw package specs", () => {
+    expect(validateOpenClawPackageSpec("openclaw@beta")).toBe("openclaw@beta");
+    expect(validateOpenClawPackageSpec("openclaw@latest")).toBe("openclaw@latest");
+    expect(validateOpenClawPackageSpec("openclaw@2026.4.30")).toBe("openclaw@2026.4.30");
+    expect(validateOpenClawPackageSpec("openclaw@2026.4.30-beta.2")).toBe(
+      "openclaw@2026.4.30-beta.2",
+    );
+
+    expect(() => validateOpenClawPackageSpec("@openclaw/openclaw@beta")).toThrow(
+      /Package spec must be/,
+    );
+    expect(() => validateOpenClawPackageSpec("openclaw@next")).toThrow(/Package spec must be/);
+  });
+
+  it("builds stable run labels", () => {
+    expect(safeRunLabel("openclaw@beta")).toBe("openclaw_beta");
+    expect(
+      buildRunId({
+        now: new Date("2026-05-01T03:04:05.678Z"),
+        spec: "openclaw@beta",
+        index: 1,
+      }),
+    ).toBe("2026-05-01T030405678Z-openclaw_beta-2");
+  });
+
+  it("constructs harness env without dropping caller env", () => {
+    const env = createHarnessEnv({
+      baseEnv: {
+        OPENCLAW_QA_TELEGRAM_GROUP_ID: "-100123",
+        OPENCLAW_NPM_TELEGRAM_FAST: "0",
+      },
+      providerMode: "mock-openai",
+      rawOutputDir: ".artifacts/rtt/run/raw",
+      scenarios: ["telegram-mentioned-message-reply"],
+      spec: "openclaw@beta",
+      timeoutMs: 180_000,
+      version: "2026.4.30-beta.1",
+    });
+
+    expect(env.OPENCLAW_QA_TELEGRAM_GROUP_ID).toBe("-100123");
+    expect(env.OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC).toBe("openclaw@beta");
+    expect(env.OPENCLAW_NPM_TELEGRAM_PACKAGE_LABEL).toBe("openclaw@beta (2026.4.30-beta.1)");
+    expect(env.OPENCLAW_NPM_TELEGRAM_PROVIDER_MODE).toBe("mock-openai");
+    expect(env.OPENCLAW_NPM_TELEGRAM_SCENARIOS).toBe("telegram-mentioned-message-reply");
+    expect(env.OPENCLAW_NPM_TELEGRAM_SKIP_HOTPATH).toBe("1");
+    expect(env.OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR).toBe(".artifacts/rtt/run/raw");
+    expect(env.OPENCLAW_NPM_TELEGRAM_FAST).toBe("0");
+    expect(env.OPENCLAW_QA_TELEGRAM_CANARY_TIMEOUT_MS).toBe("180000");
+    expect(env.OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS).toBe("180000");
+  });
+
+  it("extracts RTT values from Telegram QA summaries", async () => {
+    const summary = await readTelegramSummary(FIXTURE_PATH);
+    expect(extractRtt(summary)).toEqual({
+      canaryMs: 1234,
+      mentionReplyMs: 5678,
+    });
+  });
+
+  it("builds normalized result JSON", async () => {
+    const summary = await readTelegramSummary(FIXTURE_PATH);
+    const result = buildRttResult({
+      artifacts: {
+        rawObservedMessagesPath: "runs/run/raw/telegram-qa-observed-messages.json",
+        rawReportPath: "runs/run/raw/telegram-qa-report.md",
+        rawSummaryPath: "runs/run/raw/telegram-qa-summary.json",
+        resultPath: "runs/run/result.json",
+      },
+      finishedAt: new Date("2026-05-01T00:00:12.000Z"),
+      providerMode: "mock-openai",
+      rawSummary: summary,
+      runId: "run",
+      scenarios: ["telegram-mentioned-message-reply"],
+      spec: "openclaw@beta",
+      startedAt: new Date("2026-05-01T00:00:00.000Z"),
+      version: "2026.4.30-beta.1",
+    });
+
+    expect(result).toMatchObject({
+      package: { spec: "openclaw@beta", version: "2026.4.30-beta.1" },
+      run: { durationMs: 12_000, id: "run", status: "pass" },
+      mode: {
+        providerMode: "mock-openai",
+        scenarios: ["telegram-mentioned-message-reply"],
+      },
+      rtt: { canaryMs: 1234, mentionReplyMs: 5678 },
+    });
+  });
+
+  it("marks failed scenario summaries as failed results", () => {
+    const result = buildRttResult({
+      artifacts: {
+        rawObservedMessagesPath: "runs/run/raw/telegram-qa-observed-messages.json",
+        rawReportPath: "runs/run/raw/telegram-qa-report.md",
+        rawSummaryPath: "runs/run/raw/telegram-qa-summary.json",
+        resultPath: "runs/run/result.json",
+      },
+      finishedAt: new Date("2026-05-01T00:00:12.000Z"),
+      providerMode: "mock-openai",
+      rawSummary: {
+        scenarios: [
+          { id: "telegram-canary", rttMs: 5948, status: "pass" },
+          { id: "telegram-mentioned-message-reply", status: "fail" },
+        ],
+      },
+      runId: "run",
+      scenarios: ["telegram-mentioned-message-reply"],
+      spec: "openclaw@latest",
+      startedAt: new Date("2026-05-01T00:00:00.000Z"),
+      version: "2026.4.29",
+    });
+
+    expect(result.run.status).toBe("fail");
+    expect(result.rtt).toEqual({ canaryMs: 5948, mentionReplyMs: undefined });
+  });
+
+  it("appends JSONL rows", async () => {
+    const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-rtt-test-"));
+    const jsonlPath = path.join(tempDir, "data/rtt.jsonl");
+    await appendJsonl(jsonlPath, { run: 1 });
+    await appendJsonl(jsonlPath, { run: 2 });
+
+    await expect(fs.readFile(jsonlPath, "utf8")).resolves.toBe('{"run":1}\n{"run":2}\n');
+  });
+
+  it("parses CLI options", () => {
+    const parsed = cliTesting.parseArgs([
+      "openclaw@latest",
+      "--provider",
+      "live-frontier",
+      "--runs",
+      "3",
+      "--timeout-ms",
+      "240000",
+      "--harness-root",
+      "/tmp/openclaw",
+      "--output",
+      "/tmp/runs",
+    ]);
+
+    expect(parsed.spec).toBe("openclaw@latest");
+    expect(parsed.options).toMatchObject({
+      providerMode: "live-frontier",
+      runs: 3,
+      harnessRoot: "/tmp/openclaw",
+      output: "/tmp/runs",
+      scenarios: ["telegram-mentioned-message-reply"],
+      timeoutMs: 240_000,
+    });
+  });
+});