mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 09:50:42 +00:00
test(rtt): expose warm sample metrics
This commit is contained in:
@@ -10,6 +10,8 @@ export type RttProviderMode = "mock-openai" | "live-frontier";
|
||||
export type RttCliOptions = {
|
||||
providerMode: RttProviderMode;
|
||||
runs: number;
|
||||
samples: number;
|
||||
sampleTimeoutMs: number;
|
||||
harnessRoot: string;
|
||||
output: string;
|
||||
scenarios: string[];
|
||||
@@ -35,6 +37,12 @@ export type RttResult = {
|
||||
rtt: {
|
||||
canaryMs?: number;
|
||||
mentionReplyMs?: number;
|
||||
warmSamples?: number[];
|
||||
avgMs?: number;
|
||||
p50Ms?: number;
|
||||
p95Ms?: number;
|
||||
maxMs?: number;
|
||||
failedSamples?: number;
|
||||
};
|
||||
artifacts: {
|
||||
rawSummaryPath: string;
|
||||
@@ -49,6 +57,20 @@ export type TelegramQaSummary = {
|
||||
id?: string;
|
||||
rttMs?: number;
|
||||
status?: string;
|
||||
samples?: Array<{
|
||||
index?: number;
|
||||
status?: string;
|
||||
rttMs?: number;
|
||||
}>;
|
||||
stats?: {
|
||||
total?: number;
|
||||
passed?: number;
|
||||
failed?: number;
|
||||
avgMs?: number;
|
||||
p50Ms?: number;
|
||||
p95Ms?: number;
|
||||
maxMs?: number;
|
||||
};
|
||||
}>;
|
||||
};
|
||||
|
||||
@@ -82,11 +104,26 @@ export function buildRunId(params: { now: Date; spec: string; index?: number })
|
||||
|
||||
/**
 * Collects round-trip-time metrics from a Telegram QA summary.
 *
 * `canaryMs` comes from the `telegram-canary` scenario. All other fields come
 * from the `telegram-mentioned-message-reply` scenario:
 * - `mentionReplyMs` is the scenario's `stats.p50Ms` when present, otherwise
 *   the scenario-level `rttMs`.
 * - `warmSamples` lists the `rttMs` of every sample whose status is `"pass"`
 *   and that carries an `rttMs`, ordered by ascending sample `index` (a
 *   missing index sorts as 0). The key is omitted when no sample qualifies.
 * - `avgMs`, `p50Ms`, `p95Ms`, `maxMs` and `failedSamples` are copied from the
 *   scenario's `stats` object and are only set when `stats` exists.
 *
 * @param summary Parsed summary; a missing `scenarios` list is treated as empty.
 * @returns The `rtt` section of an `RttResult`.
 */
export function extractRtt(summary: TelegramQaSummary): RttResult["rtt"] {
  const scenarios = summary.scenarios ?? [];
  const mention = scenarios.find((scenario) => scenario.id === "telegram-mentioned-message-reply");

  // Narrow and filter in one pass so `rttMs` is a plain number from here on.
  const warmSamples = (mention?.samples ?? [])
    .flatMap((sample) =>
      sample.status === "pass" && sample.rttMs !== undefined
        ? [{ index: sample.index ?? 0, rttMs: sample.rttMs }]
        : [],
    )
    .sort((left, right) => left.index - right.index)
    .map((sample) => sample.rttMs);

  const rtt: RttResult["rtt"] = {
    canaryMs: scenarios.find((scenario) => scenario.id === "telegram-canary")?.rttMs,
    // Prefer the median of the warm samples over the single scenario-level value.
    mentionReplyMs: mention?.stats?.p50Ms ?? mention?.rttMs,
  };

  if (warmSamples.length > 0) {
    rtt.warmSamples = warmSamples;
  }

  if (mention?.stats) {
    rtt.avgMs = mention.stats.avgMs;
    rtt.p50Ms = mention.stats.p50Ms;
    rtt.p95Ms = mention.stats.p95Ms;
    rtt.maxMs = mention.stats.maxMs;
    rtt.failedSamples = mention.stats.failed;
  }

  return rtt;
}
|
||||
|
||||
export function createHarnessEnv(params: {
|
||||
@@ -96,6 +133,8 @@ export function createHarnessEnv(params: {
|
||||
spec: string;
|
||||
version: string;
|
||||
rawOutputDir: string;
|
||||
samples: number;
|
||||
sampleTimeoutMs: number;
|
||||
timeoutMs: number;
|
||||
}) {
|
||||
return {
|
||||
@@ -106,6 +145,8 @@ export function createHarnessEnv(params: {
|
||||
OPENCLAW_NPM_TELEGRAM_SCENARIOS: params.scenarios.join(","),
|
||||
OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR: params.rawOutputDir,
|
||||
OPENCLAW_NPM_TELEGRAM_FAST: params.baseEnv.OPENCLAW_NPM_TELEGRAM_FAST ?? "1",
|
||||
OPENCLAW_NPM_TELEGRAM_WARM_SAMPLES: String(params.samples),
|
||||
OPENCLAW_NPM_TELEGRAM_SAMPLE_TIMEOUT_MS: String(params.sampleTimeoutMs),
|
||||
OPENCLAW_QA_TELEGRAM_CANARY_TIMEOUT_MS: String(params.timeoutMs),
|
||||
OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS: String(params.timeoutMs),
|
||||
};
|
||||
|
||||
@@ -20,10 +20,12 @@ import {
|
||||
const DEFAULT_SCENARIOS = ["telegram-mentioned-message-reply"];
|
||||
const DEFAULT_PROVIDER_MODE = "mock-openai" satisfies RttProviderMode;
|
||||
const DEFAULT_TIMEOUT_MS = 180_000;
|
||||
const DEFAULT_SAMPLES = 20;
|
||||
const DEFAULT_SAMPLE_TIMEOUT_MS = 30_000;
|
||||
|
||||
function usage() {
|
||||
return [
|
||||
"Usage: pnpm rtt <openclaw@spec> [--provider mock-openai|live-frontier] [--runs N] [--timeout-ms N] [--harness-root PATH] [--output PATH]",
|
||||
"Usage: pnpm rtt <openclaw@spec> [--provider mock-openai|live-frontier] [--runs N] [--samples N] [--sample-timeout-ms N] [--timeout-ms N] [--harness-root PATH] [--output PATH]",
|
||||
"",
|
||||
"Examples:",
|
||||
" pnpm rtt openclaw@beta",
|
||||
@@ -61,6 +63,8 @@ function parseArgs(argv: string[]) {
|
||||
let spec: string | undefined;
|
||||
let providerMode = DEFAULT_PROVIDER_MODE;
|
||||
let runs = 1;
|
||||
let samples = DEFAULT_SAMPLES;
|
||||
let sampleTimeoutMs = DEFAULT_SAMPLE_TIMEOUT_MS;
|
||||
let harnessRoot = "~/Developer/clawdbot";
|
||||
let output = "runs";
|
||||
let timeoutMs = DEFAULT_TIMEOUT_MS;
|
||||
@@ -79,6 +83,14 @@ function parseArgs(argv: string[]) {
|
||||
runs = parsePositiveInt("--runs", argv[++index] ?? "");
|
||||
continue;
|
||||
}
|
||||
if (arg === "--samples") {
|
||||
samples = parsePositiveInt("--samples", argv[++index] ?? "");
|
||||
continue;
|
||||
}
|
||||
if (arg === "--sample-timeout-ms") {
|
||||
sampleTimeoutMs = parsePositiveInt("--sample-timeout-ms", argv[++index] ?? "");
|
||||
continue;
|
||||
}
|
||||
if (arg === "--harness-root") {
|
||||
harnessRoot = argv[++index] ?? "";
|
||||
if (!harnessRoot.trim()) {
|
||||
@@ -115,6 +127,8 @@ function parseArgs(argv: string[]) {
|
||||
options: {
|
||||
providerMode,
|
||||
runs,
|
||||
samples,
|
||||
sampleTimeoutMs,
|
||||
harnessRoot: path.resolve(resolveHome(harnessRoot)),
|
||||
output: path.resolve(resolveHome(output)),
|
||||
scenarios: DEFAULT_SCENARIOS,
|
||||
@@ -140,6 +154,8 @@ async function runOne(params: {
|
||||
baseEnv: process.env,
|
||||
providerMode: params.options.providerMode,
|
||||
rawOutputDir,
|
||||
samples: params.options.samples,
|
||||
sampleTimeoutMs: params.options.sampleTimeoutMs,
|
||||
scenarios: params.options.scenarios,
|
||||
spec: params.spec,
|
||||
timeoutMs: params.options.timeoutMs,
|
||||
|
||||
33
test/fixtures/telegram-qa-summary-rtt.json
vendored
33
test/fixtures/telegram-qa-summary-rtt.json
vendored
@@ -24,8 +24,37 @@
|
||||
"id": "telegram-mentioned-message-reply",
|
||||
"title": "Telegram mentioned message gets a reply",
|
||||
"status": "pass",
|
||||
"details": "reply matched in 5678ms",
|
||||
"rttMs": 5678
|
||||
"details": "3/3 warm samples passed",
|
||||
"rttMs": 5000,
|
||||
"samples": [
|
||||
{
|
||||
"index": 1,
|
||||
"status": "pass",
|
||||
"details": "observed SUT message 101",
|
||||
"rttMs": 4000
|
||||
},
|
||||
{
|
||||
"index": 2,
|
||||
"status": "pass",
|
||||
"details": "observed SUT message 102",
|
||||
"rttMs": 5000
|
||||
},
|
||||
{
|
||||
"index": 3,
|
||||
"status": "pass",
|
||||
"details": "observed SUT message 103",
|
||||
"rttMs": 7000
|
||||
}
|
||||
],
|
||||
"stats": {
|
||||
"total": 3,
|
||||
"passed": 3,
|
||||
"failed": 0,
|
||||
"avgMs": 5333,
|
||||
"p50Ms": 5000,
|
||||
"p95Ms": 7000,
|
||||
"maxMs": 7000
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -52,6 +52,8 @@ describe("RTT harness", () => {
|
||||
},
|
||||
providerMode: "mock-openai",
|
||||
rawOutputDir: ".artifacts/rtt/run/raw",
|
||||
samples: 20,
|
||||
sampleTimeoutMs: 30_000,
|
||||
scenarios: ["telegram-mentioned-message-reply"],
|
||||
spec: "openclaw@beta",
|
||||
timeoutMs: 180_000,
|
||||
@@ -65,6 +67,8 @@ describe("RTT harness", () => {
|
||||
expect(env.OPENCLAW_NPM_TELEGRAM_SCENARIOS).toBe("telegram-mentioned-message-reply");
|
||||
expect(env.OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR).toBe(".artifacts/rtt/run/raw");
|
||||
expect(env.OPENCLAW_NPM_TELEGRAM_FAST).toBe("0");
|
||||
expect(env.OPENCLAW_NPM_TELEGRAM_WARM_SAMPLES).toBe("20");
|
||||
expect(env.OPENCLAW_NPM_TELEGRAM_SAMPLE_TIMEOUT_MS).toBe("30000");
|
||||
expect(env.OPENCLAW_QA_TELEGRAM_CANARY_TIMEOUT_MS).toBe("180000");
|
||||
expect(env.OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS).toBe("180000");
|
||||
});
|
||||
@@ -73,7 +77,13 @@ describe("RTT harness", () => {
|
||||
const summary = await readTelegramSummary(FIXTURE_PATH);
|
||||
expect(extractRtt(summary)).toEqual({
|
||||
canaryMs: 1234,
|
||||
mentionReplyMs: 5678,
|
||||
mentionReplyMs: 5000,
|
||||
warmSamples: [4000, 5000, 7000],
|
||||
avgMs: 5333,
|
||||
p50Ms: 5000,
|
||||
p95Ms: 7000,
|
||||
maxMs: 7000,
|
||||
failedSamples: 0,
|
||||
});
|
||||
});
|
||||
|
||||
@@ -103,8 +113,17 @@ describe("RTT harness", () => {
|
||||
providerMode: "mock-openai",
|
||||
scenarios: ["telegram-mentioned-message-reply"],
|
||||
},
|
||||
rtt: { canaryMs: 1234, mentionReplyMs: 5678 },
|
||||
rtt: {
|
||||
canaryMs: 1234,
|
||||
mentionReplyMs: 5000,
|
||||
avgMs: 5333,
|
||||
p50Ms: 5000,
|
||||
p95Ms: 7000,
|
||||
maxMs: 7000,
|
||||
failedSamples: 0,
|
||||
},
|
||||
});
|
||||
expect(result.rtt.warmSamples).toEqual([4000, 5000, 7000]);
|
||||
});
|
||||
|
||||
it("marks failed scenario summaries as failed results", () => {
|
||||
@@ -150,6 +169,10 @@ describe("RTT harness", () => {
|
||||
"live-frontier",
|
||||
"--runs",
|
||||
"3",
|
||||
"--samples",
|
||||
"5",
|
||||
"--sample-timeout-ms",
|
||||
"30000",
|
||||
"--timeout-ms",
|
||||
"240000",
|
||||
"--harness-root",
|
||||
@@ -162,6 +185,8 @@ describe("RTT harness", () => {
|
||||
expect(parsed.options).toMatchObject({
|
||||
providerMode: "live-frontier",
|
||||
runs: 3,
|
||||
samples: 5,
|
||||
sampleTimeoutMs: 30_000,
|
||||
harnessRoot: "/tmp/openclaw",
|
||||
output: "/tmp/runs",
|
||||
scenarios: ["telegram-mentioned-message-reply"],
|
||||
|
||||
Reference in New Issue
Block a user