test: tune live cache assertions
@@ -114,6 +114,7 @@ Per-agent heartbeat is supported at `agents.list[].heartbeat`.
- OpenAI responses expose cached prompt tokens via `usage.prompt_tokens_details.cached_tokens` (or `input_tokens_details.cached_tokens` on Responses API events). OpenClaw maps that value to `cacheRead`, as sketched after this list.
- OpenAI does not expose a separate cache-write token counter, so `cacheWrite` stays `0` on OpenAI paths even when the provider is warming a cache.
- OpenAI returns useful tracing and rate-limit headers such as `x-request-id`, `openai-processing-ms`, and `x-ratelimit-*`, but cache-hit accounting should come from the usage payload, not from headers.
- In practice, OpenAI often behaves like an initial-prefix cache rather than Anthropic-style moving full-history reuse. Stable long-prefix text turns can land near a `4864` cached-token plateau in current live probes, while tool-heavy or MCP-style transcripts often plateau near `4608` cached tokens even on exact repeats.
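A minimal sketch of that mapping, assuming simplified payload shapes; only the `cached_tokens` field paths above are confirmed, while `OpenClawUsage` and `mapOpenAiUsage` are illustrative stand-ins:

```ts
// Illustrative types: OpenClawUsage and mapOpenAiUsage are stand-ins,
// not OpenClaw's real internals.
type OpenClawUsage = { input: number; output: number; cacheRead: number; cacheWrite: number };

function mapOpenAiUsage(usage: {
  prompt_tokens?: number;
  completion_tokens?: number;
  prompt_tokens_details?: { cached_tokens?: number };
  // Responses API events report the same counter under input_tokens_details.
  input_tokens_details?: { cached_tokens?: number };
}): OpenClawUsage {
  const cachedTokens =
    usage.prompt_tokens_details?.cached_tokens ??
    usage.input_tokens_details?.cached_tokens ??
    0;
  return {
    input: usage.prompt_tokens ?? 0,
    output: usage.completion_tokens ?? 0,
    cacheRead: cachedTokens, // cached prompt tokens reported in the usage payload
    cacheWrite: 0, // OpenAI exposes no cache-write counter, so this stays 0
  };
}
```
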
### Amazon Bedrock
@@ -163,6 +164,44 @@ agents:
OpenClaw exposes dedicated cache-trace diagnostics for embedded agent runs.

## Live regression tests

OpenClaw keeps provider-specific live cache probes for repeated prefixes, tool turns, image turns, and MCP-style tool transcripts:

- `src/agents/pi-embedded-runner.cache.live.test.ts`
- `src/agents/pi-mcp-style.cache.live.test.ts`

These tests intentionally do not use identical success criteria across providers. They are also gated so they only run when live testing is explicitly enabled, as sketched below.
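A minimal sketch of that gating, assuming a Vitest-style runner; the `OPENCLAW_LIVE_CACHE` env var name is hypothetical, and only the `LIVE_CACHE_TEST_ENABLED ? describe : describe.skip` pattern appears verbatim in the suites below:

```ts
import { describe } from "vitest";

// Assumption: the env var name is illustrative; the real flag behind
// LIVE_CACHE_TEST_ENABLED is defined in the repo's test helpers.
const LIVE_CACHE_TEST_ENABLED = process.env.OPENCLAW_LIVE_CACHE === "1";
// Live suites run only when explicitly enabled; otherwise they are skipped.
const describeCacheLive = LIVE_CACHE_TEST_ENABLED ? describe : describe.skip;
```
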
### Anthropic live expectations
- Expect explicit warmup writes via `cacheWrite`.
- Expect near-full history reuse on repeated turns, because Anthropic cache control advances the cache breakpoint through the conversation (see the sketch after this list).
- Current live assertions still use high hit-rate thresholds for stable, tool, and image paths.
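For context, a minimal sketch of the provider mechanism behind that expectation, using Anthropic's public `cache_control` content-block field; the request shape is abbreviated, and the model id and history variable are illustrative:

```ts
// Anthropic caches the prefix up to the marked block; moving the marker
// forward on each turn lets the cached prefix grow with the conversation.
const conversationSoFar = "...prior turns concatenated..."; // assumed placeholder
const request = {
  model: "claude-sonnet-4-5", // illustrative model id
  max_tokens: 1024,
  messages: [
    {
      role: "user",
      content: [
        {
          type: "text",
          text: conversationSoFar,
          cache_control: { type: "ephemeral" }, // cache breakpoint marker
        },
      ],
    },
  ],
};
```
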
### OpenAI live expectations
- Expect `cacheRead` only; `cacheWrite` remains `0`.
- Treat repeated-turn cache reuse as a provider-specific plateau, not as Anthropic-style moving full-history reuse.
- Current live assertions use conservative floor checks derived from observed live behavior on `gpt-5.4-mini` (a worked hit-rate example follows the verification list below):
  - stable prefix: `cacheRead >= 4608`, hit rate `>= 0.90`
  - tool transcript: `cacheRead >= 4096`, hit rate `>= 0.85`
  - image transcript: `cacheRead >= 3840`, hit rate `>= 0.82`
  - MCP-style transcript: `cacheRead >= 4096`, hit rate `>= 0.85`

Fresh OpenAI verification on 2026-04-04 landed at:

- stable prefix: `cacheRead=4864`, hit rate `0.971`
- tool transcript: `cacheRead=4608`, hit rate `0.900`
- image transcript: `cacheRead=4864`, hit rate `0.959`
- MCP-style transcript: `cacheRead=4608`, hit rate `0.895`
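As a worked example of how those two figures relate, assuming hit rate is computed as `cacheRead / input` (consistent with the `cacheRead=… input=… rate=…` log lines in the diff below); the input total here is illustrative:

```ts
// Hypothetical arithmetic: 4_864 cached tokens out of an assumed 5_009 input
// tokens reproduces the observed stable-prefix rate.
const hitRate = (cacheRead: number, input: number) => cacheRead / input;
console.log(hitRate(4_864, 5_009).toFixed(3)); // "0.971"
```
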
Why the assertions differ:
- Anthropic exposes explicit cache breakpoints and moving conversation-history reuse.
- OpenAI prompt caching is still exact-prefix sensitive, but the effective reusable prefix in live Responses traffic can plateau earlier than the full prompt.
- Because of that, comparing Anthropic and OpenAI by a single cross-provider percentage threshold creates false regressions.
### `diagnostics.cacheTrace` config
```yaml
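# Sketch only: the keys under cacheTrace are assumptions for illustration,
# not confirmed OpenClaw schema; only the diagnostics.cacheTrace path
# comes from the heading above.
diagnostics:
  cacheTrace:
    enabled: true
    # assumed option: where per-run cache traces are written
    outputDir: .openclaw/diagnostics/cache-trace
```
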
@@ -27,6 +27,12 @@ const OPENAI_SESSION_ID = "live-cache-openai-stable-session";
const ANTHROPIC_SESSION_ID = "live-cache-anthropic-stable-session";
const OPENAI_PREFIX = buildStableCachePrefix("openai");
const ANTHROPIC_PREFIX = buildStableCachePrefix("anthropic");
+const OPENAI_STABLE_PREFIX_MIN_CACHE_READ = 4_608;
+const OPENAI_STABLE_PREFIX_MIN_HIT_RATE = 0.9;
+const OPENAI_TOOL_MIN_CACHE_READ = 4_096;
+const OPENAI_TOOL_MIN_HIT_RATE = 0.85;
+const OPENAI_IMAGE_MIN_CACHE_READ = 3_840;
+const OPENAI_IMAGE_MIN_HIT_RATE = 0.82;
const LIVE_TEST_PNG_URL = new URL(
  "../../apps/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png",
  import.meta.url,
@@ -625,7 +631,7 @@ describeCacheLive("pi embedded runner prompt caching (live)", () => {
}, 120_000);

it(
-  "hits a high cache-read rate on repeated stable prefixes",
+  "hits the expected OpenAI cache plateau on repeated stable prefixes",
  async () => {
    const warmup = await runOpenAiCacheProbe({
      ...fixture,
@@ -653,17 +659,19 @@ describeCacheLive("pi embedded runner prompt caching (live)", () => {
      (candidate.usage.cacheRead ?? 0) > (best.usage.cacheRead ?? 0) ? candidate : best,
    );
    logLiveCache(
-      `openai best-hit suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
+      `openai stable-prefix plateau suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
    );

-    expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThan(1_024);
-    expect(bestHit.hitRate).toBeGreaterThanOrEqual(0.7);
+    expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThanOrEqual(
+      OPENAI_STABLE_PREFIX_MIN_CACHE_READ,
+    );
+    expect(bestHit.hitRate).toBeGreaterThanOrEqual(OPENAI_STABLE_PREFIX_MIN_HIT_RATE);
  },
  6 * 60_000,
);

it(
-  "keeps high cache-read rates across tool-call followup turns",
+  "keeps the expected OpenAI cache plateau across tool-call followup turns",
  async () => {
    const warmup = await runOpenAiToolCacheProbe({
      ...fixture,
@@ -686,17 +694,17 @@ describeCacheLive("pi embedded runner prompt caching (live)", () => {
    });
    const bestHit = (hitA.usage.cacheRead ?? 0) >= (hitB.usage.cacheRead ?? 0) ? hitA : hitB;
    logLiveCache(
-      `openai tool best-hit suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
+      `openai tool plateau suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
    );

-    expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThan(1_024);
-    expect(bestHit.hitRate).toBeGreaterThanOrEqual(0.7);
+    expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThanOrEqual(OPENAI_TOOL_MIN_CACHE_READ);
+    expect(bestHit.hitRate).toBeGreaterThanOrEqual(OPENAI_TOOL_MIN_HIT_RATE);
  },
  8 * 60_000,
);

it(
-  "keeps high cache-read rates across image-heavy followup turns",
+  "keeps the expected OpenAI cache plateau across image-heavy followup turns",
  async () => {
    const warmup = await runOpenAiImageCacheProbe({
      ...fixture,
@@ -719,11 +727,11 @@ describeCacheLive("pi embedded runner prompt caching (live)", () => {
    });
    const bestHit = (hitA.usage.cacheRead ?? 0) >= (hitB.usage.cacheRead ?? 0) ? hitA : hitB;
    logLiveCache(
-      `openai image best-hit suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
+      `openai image plateau suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
    );

-    expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThan(1_024);
-    expect(bestHit.hitRate).toBeGreaterThanOrEqual(0.6);
+    expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThanOrEqual(OPENAI_IMAGE_MIN_CACHE_READ);
+    expect(bestHit.hitRate).toBeGreaterThanOrEqual(OPENAI_IMAGE_MIN_HIT_RATE);
  },
  6 * 60_000,
);

@@ -16,6 +16,8 @@ const describeCacheLive = LIVE_CACHE_TEST_ENABLED ? describe : describe.skip;
const OPENAI_TIMEOUT_MS = 120_000;
const OPENAI_SESSION_ID = "live-cache-openai-mcp-style-session";
const OPENAI_PREFIX = buildStableCachePrefix("openai-mcp-style");
+const OPENAI_MCP_STYLE_MIN_CACHE_READ = 4_096;
+const OPENAI_MCP_STYLE_MIN_HIT_RATE = 0.85;

const MCP_TOOL: Tool = {
  name: "bundleProbe__bundle_probe",
@@ -161,7 +163,7 @@ async function runOpenAiMcpStyleCacheProbe(params: {
describeCacheLive("MCP-style prompt caching (live)", () => {
  it(
-    "keeps high cache-read rates across MCP-style followup turns",
+    "keeps an OpenAI cache plateau across MCP-style followup turns",
    async () => {
      const fixture = await resolveLiveDirectModel({
        provider: "openai",
@@ -192,11 +194,11 @@ describeCacheLive("MCP-style prompt caching (live)", () => {
      });
      const bestHit = (hitA.usage.cacheRead ?? 0) >= (hitB.usage.cacheRead ?? 0) ? hitA : hitB;
      logLiveCache(
-        `openai mcp-style best-hit suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
+        `openai mcp-style plateau suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
      );

-      expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThan(1_024);
-      expect(bestHit.hitRate).toBeGreaterThanOrEqual(0.6);
+      expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThanOrEqual(OPENAI_MCP_STYLE_MIN_CACHE_READ);
+      expect(bestHit.hitRate).toBeGreaterThanOrEqual(OPENAI_MCP_STYLE_MIN_HIT_RATE);
    },
    10 * 60_000,
  );