test(rtt): expose warm sample metrics

2026-05-06 11:20:43 +00:00 · 2026-05-01 16:32:52 +05:30
parent 3fd4d1d29d
commit aa5a0a36f8
4 changed files with 119 additions and 8 deletions
--- a/test/fixtures/telegram-qa-summary-rtt.json
+++ b/test/fixtures/telegram-qa-summary-rtt.json
@@ -24,8 +24,37 @@
      "id": "telegram-mentioned-message-reply",
      "title": "Telegram mentioned message gets a reply",
      "status": "pass",
-      "details": "reply matched in 5678ms",
-      "rttMs": 5678
+      "details": "3/3 warm samples passed",
+      "rttMs": 5000,
+      "samples": [
+        {
+          "index": 1,
+          "status": "pass",
+          "details": "observed SUT message 101",
+          "rttMs": 4000
+        },
+        {
+          "index": 2,
+          "status": "pass",
+          "details": "observed SUT message 102",
+          "rttMs": 5000
+        },
+        {
+          "index": 3,
+          "status": "pass",
+          "details": "observed SUT message 103",
+          "rttMs": 7000
+        }
+      ],
+      "stats": {
+        "total": 3,
+        "passed": 3,
+        "failed": 0,
+        "avgMs": 5333,
+        "p50Ms": 5000,
+        "p95Ms": 7000,
+        "maxMs": 7000
+      }
    }
  ]
 }
--- a/test/scripts/rtt-harness.test.ts
+++ b/test/scripts/rtt-harness.test.ts
@@ -52,6 +52,8 @@ describe("RTT harness", () => {
      },
      providerMode: "mock-openai",
      rawOutputDir: ".artifacts/rtt/run/raw",
+      samples: 20,
+      sampleTimeoutMs: 30_000,
      scenarios: ["telegram-mentioned-message-reply"],
      spec: "openclaw@beta",
      timeoutMs: 180_000,
@@ -65,6 +67,8 @@ describe("RTT harness", () => {
    expect(env.OPENCLAW_NPM_TELEGRAM_SCENARIOS).toBe("telegram-mentioned-message-reply");
    expect(env.OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR).toBe(".artifacts/rtt/run/raw");
    expect(env.OPENCLAW_NPM_TELEGRAM_FAST).toBe("0");
+    expect(env.OPENCLAW_NPM_TELEGRAM_WARM_SAMPLES).toBe("20");
+    expect(env.OPENCLAW_NPM_TELEGRAM_SAMPLE_TIMEOUT_MS).toBe("30000");
    expect(env.OPENCLAW_QA_TELEGRAM_CANARY_TIMEOUT_MS).toBe("180000");
    expect(env.OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS).toBe("180000");
  });
@@ -73,7 +77,13 @@ describe("RTT harness", () => {
    const summary = await readTelegramSummary(FIXTURE_PATH);
    expect(extractRtt(summary)).toEqual({
      canaryMs: 1234,
-      mentionReplyMs: 5678,
+      mentionReplyMs: 5000,
+      warmSamples: [4000, 5000, 7000],
+      avgMs: 5333,
+      p50Ms: 5000,
+      p95Ms: 7000,
+      maxMs: 7000,
+      failedSamples: 0,
    });
  });

@@ -103,8 +113,17 @@ describe("RTT harness", () => {
        providerMode: "mock-openai",
        scenarios: ["telegram-mentioned-message-reply"],
      },
-      rtt: { canaryMs: 1234, mentionReplyMs: 5678 },
+      rtt: {
+        canaryMs: 1234,
+        mentionReplyMs: 5000,
+        avgMs: 5333,
+        p50Ms: 5000,
+        p95Ms: 7000,
+        maxMs: 7000,
+        failedSamples: 0,
+      },
    });
+    expect(result.rtt.warmSamples).toEqual([4000, 5000, 7000]);
  });

  it("marks failed scenario summaries as failed results", () => {
@@ -150,6 +169,10 @@ describe("RTT harness", () => {
      "live-frontier",
      "--runs",
      "3",
+      "--samples",
+      "5",
+      "--sample-timeout-ms",
+      "30000",
      "--timeout-ms",
      "240000",
      "--harness-root",
@@ -162,6 +185,8 @@ describe("RTT harness", () => {
    expect(parsed.options).toMatchObject({
      providerMode: "live-frontier",
      runs: 3,
+      samples: 5,
+      sampleTimeoutMs: 30_000,
      harnessRoot: "/tmp/openclaw",
      output: "/tmp/runs",
      scenarios: ["telegram-mentioned-message-reply"],