test: tune live cache assertions

This commit is contained in:
Peter Steinberger
2026-04-04 15:16:43 +09:00
parent c4d2c4899d
commit cdb572d703
3 changed files with 65 additions and 16 deletions

View File

@@ -114,6 +114,7 @@ Per-agent heartbeat is supported at `agents.list[].heartbeat`.
- OpenAI responses expose cached prompt tokens via `usage.prompt_tokens_details.cached_tokens` (or `input_tokens_details.cached_tokens` on Responses API events). OpenClaw maps that to `cacheRead`.
- OpenAI does not expose a separate cache-write token counter, so `cacheWrite` stays `0` on OpenAI paths even when the provider is warming a cache.
- OpenAI returns useful tracing and rate-limit headers such as `x-request-id`, `openai-processing-ms`, and `x-ratelimit-*`, but cache-hit accounting should come from the usage payload, not from headers.
- In practice, OpenAI often behaves like an initial-prefix cache rather than Anthropic-style moving full-history reuse. Stable long-prefix text turns can land near a `4864` cached-token plateau in current live probes, while tool-heavy or MCP-style transcripts often plateau near `4608` cached tokens even on exact repeats.
### Amazon Bedrock
@@ -163,6 +164,44 @@ agents:
OpenClaw exposes dedicated cache-trace diagnostics for embedded agent runs.
## Live regression tests
OpenClaw keeps provider-specific live cache probes for repeated prefixes, tool turns, image turns, and MCP-style tool transcripts.
- `src/agents/pi-embedded-runner.cache.live.test.ts`
- `src/agents/pi-mcp-style.cache.live.test.ts`
These tests intentionally do not use identical success criteria across providers.
### Anthropic live expectations
- Expect explicit warmup writes via `cacheWrite`.
- Expect near-full history reuse on repeated turns because Anthropic cache control advances the cache breakpoint through the conversation.
- Current live assertions still use high hit-rate thresholds for stable, tool, and image paths.
### OpenAI live expectations
- Expect `cacheRead` only. `cacheWrite` remains `0`.
- Treat repeated-turn cache reuse as a provider-specific plateau, not as Anthropic-style moving full-history reuse.
- Current live assertions use conservative floor checks derived from observed live behavior on `gpt-5.4-mini`:
- stable prefix: `cacheRead >= 4608`, hit rate `>= 0.90`
- tool transcript: `cacheRead >= 4096`, hit rate `>= 0.85`
- image transcript: `cacheRead >= 3840`, hit rate `>= 0.82`
- MCP-style transcript: `cacheRead >= 4096`, hit rate `>= 0.85`
A fresh OpenAI live verification run on 2026-04-04 produced the following measurements:
- stable prefix: `cacheRead=4864`, hit rate `0.971`
- tool transcript: `cacheRead=4608`, hit rate `0.900`
- image transcript: `cacheRead=4864`, hit rate `0.959`
- MCP-style transcript: `cacheRead=4608`, hit rate `0.895`
Why the assertions differ between Anthropic and OpenAI:
- Anthropic exposes explicit cache breakpoints and moving conversation-history reuse.
- OpenAI prompt caching is still exact-prefix sensitive, but the effective reusable prefix in live Responses traffic can plateau earlier than the full prompt.
- Because of that, comparing Anthropic and OpenAI by a single cross-provider percentage threshold creates false regressions.
### `diagnostics.cacheTrace` config
```yaml

View File

@@ -27,6 +27,12 @@ const OPENAI_SESSION_ID = "live-cache-openai-stable-session";
const ANTHROPIC_SESSION_ID = "live-cache-anthropic-stable-session";
const OPENAI_PREFIX = buildStableCachePrefix("openai");
const ANTHROPIC_PREFIX = buildStableCachePrefix("anthropic");
const OPENAI_STABLE_PREFIX_MIN_CACHE_READ = 4_608;
const OPENAI_STABLE_PREFIX_MIN_HIT_RATE = 0.9;
const OPENAI_TOOL_MIN_CACHE_READ = 4_096;
const OPENAI_TOOL_MIN_HIT_RATE = 0.85;
const OPENAI_IMAGE_MIN_CACHE_READ = 3_840;
const OPENAI_IMAGE_MIN_HIT_RATE = 0.82;
const LIVE_TEST_PNG_URL = new URL(
"../../apps/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png",
import.meta.url,
@@ -625,7 +631,7 @@ describeCacheLive("pi embedded runner prompt caching (live)", () => {
}, 120_000);
it(
"hits a high cache-read rate on repeated stable prefixes",
"hits the expected OpenAI cache plateau on repeated stable prefixes",
async () => {
const warmup = await runOpenAiCacheProbe({
...fixture,
@@ -653,17 +659,19 @@ describeCacheLive("pi embedded runner prompt caching (live)", () => {
(candidate.usage.cacheRead ?? 0) > (best.usage.cacheRead ?? 0) ? candidate : best,
);
logLiveCache(
`openai best-hit suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
`openai stable-prefix plateau suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
);
expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThan(1_024);
expect(bestHit.hitRate).toBeGreaterThanOrEqual(0.7);
expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThanOrEqual(
OPENAI_STABLE_PREFIX_MIN_CACHE_READ,
);
expect(bestHit.hitRate).toBeGreaterThanOrEqual(OPENAI_STABLE_PREFIX_MIN_HIT_RATE);
},
6 * 60_000,
);
it(
"keeps high cache-read rates across tool-call followup turns",
"keeps the expected OpenAI cache plateau across tool-call followup turns",
async () => {
const warmup = await runOpenAiToolCacheProbe({
...fixture,
@@ -686,17 +694,17 @@ describeCacheLive("pi embedded runner prompt caching (live)", () => {
});
const bestHit = (hitA.usage.cacheRead ?? 0) >= (hitB.usage.cacheRead ?? 0) ? hitA : hitB;
logLiveCache(
`openai tool best-hit suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
`openai tool plateau suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
);
expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThan(1_024);
expect(bestHit.hitRate).toBeGreaterThanOrEqual(0.7);
expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThanOrEqual(OPENAI_TOOL_MIN_CACHE_READ);
expect(bestHit.hitRate).toBeGreaterThanOrEqual(OPENAI_TOOL_MIN_HIT_RATE);
},
8 * 60_000,
);
it(
"keeps high cache-read rates across image-heavy followup turns",
"keeps the expected OpenAI cache plateau across image-heavy followup turns",
async () => {
const warmup = await runOpenAiImageCacheProbe({
...fixture,
@@ -719,11 +727,11 @@ describeCacheLive("pi embedded runner prompt caching (live)", () => {
});
const bestHit = (hitA.usage.cacheRead ?? 0) >= (hitB.usage.cacheRead ?? 0) ? hitA : hitB;
logLiveCache(
`openai image best-hit suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
`openai image plateau suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
);
expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThan(1_024);
expect(bestHit.hitRate).toBeGreaterThanOrEqual(0.6);
expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThanOrEqual(OPENAI_IMAGE_MIN_CACHE_READ);
expect(bestHit.hitRate).toBeGreaterThanOrEqual(OPENAI_IMAGE_MIN_HIT_RATE);
},
6 * 60_000,
);

View File

@@ -16,6 +16,8 @@ const describeCacheLive = LIVE_CACHE_TEST_ENABLED ? describe : describe.skip;
const OPENAI_TIMEOUT_MS = 120_000;
const OPENAI_SESSION_ID = "live-cache-openai-mcp-style-session";
const OPENAI_PREFIX = buildStableCachePrefix("openai-mcp-style");
const OPENAI_MCP_STYLE_MIN_CACHE_READ = 4_096;
const OPENAI_MCP_STYLE_MIN_HIT_RATE = 0.85;
const MCP_TOOL: Tool = {
name: "bundleProbe__bundle_probe",
@@ -161,7 +163,7 @@ async function runOpenAiMcpStyleCacheProbe(params: {
describeCacheLive("MCP-style prompt caching (live)", () => {
it(
"keeps high cache-read rates across MCP-style followup turns",
"keeps an OpenAI cache plateau across MCP-style followup turns",
async () => {
const fixture = await resolveLiveDirectModel({
provider: "openai",
@@ -192,11 +194,11 @@ describeCacheLive("MCP-style prompt caching (live)", () => {
});
const bestHit = (hitA.usage.cacheRead ?? 0) >= (hitB.usage.cacheRead ?? 0) ? hitA : hitB;
logLiveCache(
`openai mcp-style best-hit suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
`openai mcp-style plateau suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
);
expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThan(1_024);
expect(bestHit.hitRate).toBeGreaterThanOrEqual(0.6);
expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThanOrEqual(OPENAI_MCP_STYLE_MIN_CACHE_READ);
expect(bestHit.hitRate).toBeGreaterThanOrEqual(OPENAI_MCP_STYLE_MIN_HIT_RATE);
},
10 * 60_000,
);