test: tune live cache assertions

This commit is contained in:
Peter Steinberger
2026-04-04 15:16:43 +09:00
parent c4d2c4899d
commit cdb572d703
3 changed files with 65 additions and 16 deletions

View File

@@ -114,6 +114,7 @@ Per-agent heartbeat is supported at `agents.list[].heartbeat`.
- OpenAI responses expose cached prompt tokens via `usage.prompt_tokens_details.cached_tokens` (or `input_tokens_details.cached_tokens` on Responses API events). OpenClaw maps that to `cacheRead`.
- OpenAI does not expose a separate cache-write token counter, so `cacheWrite` stays `0` on OpenAI paths even when the provider is warming a cache.
- OpenAI returns useful tracing and rate-limit headers such as `x-request-id`, `openai-processing-ms`, and `x-ratelimit-*`, but cache-hit accounting should come from the usage payload, not from headers.
- In practice, OpenAI often behaves like an initial-prefix cache rather than Anthropic-style moving full-history reuse. Stable long-prefix text turns can land near a `4864` cached-token plateau in current live probes, while tool-heavy or MCP-style transcripts often plateau near `4608` cached tokens even on exact repeats.
### Amazon Bedrock
@@ -163,6 +164,44 @@ agents:
OpenClaw exposes dedicated cache-trace diagnostics for embedded agent runs.
## Live regression tests
OpenClaw keeps provider-specific live cache probes for repeated prefixes, tool turns, image turns, and MCP-style tool transcripts.
- `src/agents/pi-embedded-runner.cache.live.test.ts`
- `src/agents/pi-mcp-style.cache.live.test.ts`
These tests intentionally do not use identical success criteria across providers.
### Anthropic live expectations
- Expect explicit warmup writes via `cacheWrite`.
- Expect near-full history reuse on repeated turns because Anthropic cache control advances the cache breakpoint through the conversation.
- Current live assertions still use high hit-rate thresholds for stable, tool, and image paths.
### OpenAI live expectations
- Expect `cacheRead` only. `cacheWrite` remains `0`.
- Treat repeated-turn cache reuse as a provider-specific plateau, not as Anthropic-style moving full-history reuse.
- Current live assertions use conservative floor checks derived from observed live behavior on `gpt-5.4-mini`:
- stable prefix: `cacheRead >= 4608`, hit rate `>= 0.90`
- tool transcript: `cacheRead >= 4096`, hit rate `>= 0.85`
- image transcript: `cacheRead >= 3840`, hit rate `>= 0.82`
- MCP-style transcript: `cacheRead >= 4096`, hit rate `>= 0.85`
A fresh OpenAI live verification run on 2026-04-04 produced the following measurements:
- stable prefix: `cacheRead=4864`, hit rate `0.971`
- tool transcript: `cacheRead=4608`, hit rate `0.900`
- image transcript: `cacheRead=4864`, hit rate `0.959`
- MCP-style transcript: `cacheRead=4608`, hit rate `0.895`
Why the assertions differ between Anthropic and OpenAI:
- Anthropic exposes explicit cache breakpoints and moving conversation-history reuse.
- OpenAI prompt caching is still exact-prefix sensitive, but the effective reusable prefix in live Responses traffic can plateau earlier than the full prompt.
- Because of that, comparing Anthropic and OpenAI by a single cross-provider percentage threshold creates false regressions.
### `diagnostics.cacheTrace` config
```yaml

View File

@@ -27,6 +27,12 @@ const OPENAI_SESSION_ID = "live-cache-openai-stable-session";
const ANTHROPIC_SESSION_ID = "live-cache-anthropic-stable-session";
const OPENAI_PREFIX = buildStableCachePrefix("openai");
const ANTHROPIC_PREFIX = buildStableCachePrefix("anthropic");
const OPENAI_STABLE_PREFIX_MIN_CACHE_READ = 4_608;
const OPENAI_STABLE_PREFIX_MIN_HIT_RATE = 0.9;
const OPENAI_TOOL_MIN_CACHE_READ = 4_096;
const OPENAI_TOOL_MIN_HIT_RATE = 0.85;
const OPENAI_IMAGE_MIN_CACHE_READ = 3_840;
const OPENAI_IMAGE_MIN_HIT_RATE = 0.82;
const LIVE_TEST_PNG_URL = new URL(
"../../apps/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png",
import.meta.url,
@@ -625,7 +631,7 @@ describeCacheLive("pi embedded runner prompt caching (live)", () => {
}, 120_000);
it(
"hits a high cache-read rate on repeated stable prefixes",
"hits the expected OpenAI cache plateau on repeated stable prefixes",
async () => {
const warmup = await runOpenAiCacheProbe({
...fixture,
@@ -653,17 +659,19 @@ describeCacheLive("pi embedded runner prompt caching (live)", () => {
(candidate.usage.cacheRead ?? 0) > (best.usage.cacheRead ?? 0) ? candidate : best,
);
logLiveCache(
`openai best-hit suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
`openai stable-prefix plateau suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
);
expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThan(1_024);
expect(bestHit.hitRate).toBeGreaterThanOrEqual(0.7);
expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThanOrEqual(
OPENAI_STABLE_PREFIX_MIN_CACHE_READ,
);
expect(bestHit.hitRate).toBeGreaterThanOrEqual(OPENAI_STABLE_PREFIX_MIN_HIT_RATE);
},
6 * 60_000,
);
it(
"keeps high cache-read rates across tool-call followup turns",
"keeps the expected OpenAI cache plateau across tool-call followup turns",
async () => {
const warmup = await runOpenAiToolCacheProbe({
...fixture,
@@ -686,17 +694,17 @@ describeCacheLive("pi embedded runner prompt caching (live)", () => {
});
const bestHit = (hitA.usage.cacheRead ?? 0) >= (hitB.usage.cacheRead ?? 0) ? hitA : hitB;
logLiveCache(
`openai tool best-hit suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
`openai tool plateau suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
);
expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThan(1_024);
expect(bestHit.hitRate).toBeGreaterThanOrEqual(0.7);
expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThanOrEqual(OPENAI_TOOL_MIN_CACHE_READ);
expect(bestHit.hitRate).toBeGreaterThanOrEqual(OPENAI_TOOL_MIN_HIT_RATE);
},
8 * 60_000,
);
it(
"keeps high cache-read rates across image-heavy followup turns",
"keeps the expected OpenAI cache plateau across image-heavy followup turns",
async () => {
const warmup = await runOpenAiImageCacheProbe({
...fixture,
@@ -719,11 +727,11 @@ describeCacheLive("pi embedded runner prompt caching (live)", () => {
});
const bestHit = (hitA.usage.cacheRead ?? 0) >= (hitB.usage.cacheRead ?? 0) ? hitA : hitB;
logLiveCache(
`openai image best-hit suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
`openai image plateau suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
);
expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThan(1_024);
expect(bestHit.hitRate).toBeGreaterThanOrEqual(0.6);
expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThanOrEqual(OPENAI_IMAGE_MIN_CACHE_READ);
expect(bestHit.hitRate).toBeGreaterThanOrEqual(OPENAI_IMAGE_MIN_HIT_RATE);
},
6 * 60_000,
);

View File

@@ -16,6 +16,8 @@ const describeCacheLive = LIVE_CACHE_TEST_ENABLED ? describe : describe.skip;
const OPENAI_TIMEOUT_MS = 120_000;
const OPENAI_SESSION_ID = "live-cache-openai-mcp-style-session";
const OPENAI_PREFIX = buildStableCachePrefix("openai-mcp-style");
const OPENAI_MCP_STYLE_MIN_CACHE_READ = 4_096;
const OPENAI_MCP_STYLE_MIN_HIT_RATE = 0.85;
const MCP_TOOL: Tool = {
name: "bundleProbe__bundle_probe",
@@ -161,7 +163,7 @@ async function runOpenAiMcpStyleCacheProbe(params: {
describeCacheLive("MCP-style prompt caching (live)", () => {
it(
"keeps high cache-read rates across MCP-style followup turns",
"keeps an OpenAI cache plateau across MCP-style followup turns",
async () => {
const fixture = await resolveLiveDirectModel({
provider: "openai",
@@ -192,11 +194,11 @@ describeCacheLive("MCP-style prompt caching (live)", () => {
});
const bestHit = (hitA.usage.cacheRead ?? 0) >= (hitB.usage.cacheRead ?? 0) ? hitA : hitB;
logLiveCache(
`openai mcp-style best-hit suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
`openai mcp-style plateau suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
);
expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThan(1_024);
expect(bestHit.hitRate).toBeGreaterThanOrEqual(0.6);
expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThanOrEqual(OPENAI_MCP_STYLE_MIN_CACHE_READ);
expect(bestHit.hitRate).toBeGreaterThanOrEqual(OPENAI_MCP_STYLE_MIN_HIT_RATE);
},
10 * 60_000,
);