test(rtt): expose warm sample metrics

This commit is contained in:
Ayaan Zaidi
2026-05-01 16:32:52 +05:30
parent 3fd4d1d29d
commit aa5a0a36f8
4 changed files with 119 additions and 8 deletions

View File

@@ -24,8 +24,37 @@
"id": "telegram-mentioned-message-reply",
"title": "Telegram mentioned message gets a reply",
"status": "pass",
"details": "reply matched in 5678ms",
"rttMs": 5678
"details": "3/3 warm samples passed",
"rttMs": 5000,
"samples": [
{
"index": 1,
"status": "pass",
"details": "observed SUT message 101",
"rttMs": 4000
},
{
"index": 2,
"status": "pass",
"details": "observed SUT message 102",
"rttMs": 5000
},
{
"index": 3,
"status": "pass",
"details": "observed SUT message 103",
"rttMs": 7000
}
],
"stats": {
"total": 3,
"passed": 3,
"failed": 0,
"avgMs": 5333,
"p50Ms": 5000,
"p95Ms": 7000,
"maxMs": 7000
}
}
]
}

View File

@@ -52,6 +52,8 @@ describe("RTT harness", () => {
},
providerMode: "mock-openai",
rawOutputDir: ".artifacts/rtt/run/raw",
samples: 20,
sampleTimeoutMs: 30_000,
scenarios: ["telegram-mentioned-message-reply"],
spec: "openclaw@beta",
timeoutMs: 180_000,
@@ -65,6 +67,8 @@ describe("RTT harness", () => {
expect(env.OPENCLAW_NPM_TELEGRAM_SCENARIOS).toBe("telegram-mentioned-message-reply");
expect(env.OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR).toBe(".artifacts/rtt/run/raw");
expect(env.OPENCLAW_NPM_TELEGRAM_FAST).toBe("0");
expect(env.OPENCLAW_NPM_TELEGRAM_WARM_SAMPLES).toBe("20");
expect(env.OPENCLAW_NPM_TELEGRAM_SAMPLE_TIMEOUT_MS).toBe("30000");
expect(env.OPENCLAW_QA_TELEGRAM_CANARY_TIMEOUT_MS).toBe("180000");
expect(env.OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS).toBe("180000");
});
@@ -73,7 +77,13 @@ describe("RTT harness", () => {
const summary = await readTelegramSummary(FIXTURE_PATH);
expect(extractRtt(summary)).toEqual({
canaryMs: 1234,
mentionReplyMs: 5678,
mentionReplyMs: 5000,
warmSamples: [4000, 5000, 7000],
avgMs: 5333,
p50Ms: 5000,
p95Ms: 7000,
maxMs: 7000,
failedSamples: 0,
});
});
@@ -103,8 +113,17 @@ describe("RTT harness", () => {
providerMode: "mock-openai",
scenarios: ["telegram-mentioned-message-reply"],
},
rtt: { canaryMs: 1234, mentionReplyMs: 5678 },
rtt: {
canaryMs: 1234,
mentionReplyMs: 5000,
avgMs: 5333,
p50Ms: 5000,
p95Ms: 7000,
maxMs: 7000,
failedSamples: 0,
},
});
expect(result.rtt.warmSamples).toEqual([4000, 5000, 7000]);
});
it("marks failed scenario summaries as failed results", () => {
@@ -150,6 +169,10 @@ describe("RTT harness", () => {
"live-frontier",
"--runs",
"3",
"--samples",
"5",
"--sample-timeout-ms",
"30000",
"--timeout-ms",
"240000",
"--harness-root",
@@ -162,6 +185,8 @@ describe("RTT harness", () => {
expect(parsed.options).toMatchObject({
providerMode: "live-frontier",
runs: 3,
samples: 5,
sampleTimeoutMs: 30_000,
harnessRoot: "/tmp/openclaw",
output: "/tmp/runs",
scenarios: ["telegram-mentioned-message-reply"],