diff --git a/src/agents/live-cache-regression-baseline.ts b/src/agents/live-cache-regression-baseline.ts
index 6c76212e632..68565908ae2 100644
--- a/src/agents/live-cache-regression-baseline.ts
+++ b/src/agents/live-cache-regression-baseline.ts
@@ -64,18 +64,21 @@ export const LIVE_CACHE_REGRESSION_BASELINE = {
       observedHitRate: 0.891,
       minCacheRead: 4_096,
       minHitRate: 0.85,
+      warnOnly: true,
     },
     stable: {
       observedCacheRead: 4_864,
       observedHitRate: 0.966,
       minCacheRead: 4_608,
       minHitRate: 0.9,
+      warnOnly: true,
     },
     tool: {
       observedCacheRead: 4_608,
       observedHitRate: 0.896,
       minCacheRead: 4_096,
       minHitRate: 0.85,
+      warnOnly: true,
     },
   },
 } as const satisfies Record>;
diff --git a/src/agents/live-cache-regression-runner.test.ts b/src/agents/live-cache-regression-runner.test.ts
index f462b1e214d..1ee0637a8ea 100644
--- a/src/agents/live-cache-regression-runner.test.ts
+++ b/src/agents/live-cache-regression-runner.test.ts
@@ -28,7 +28,7 @@ describe("live cache regression runner", () => {
     ]);
   });
 
-  it("keeps hard cache floors blocking for required OpenAI lanes", () => {
+  it("keeps OpenAI text cache floor misses advisory", () => {
     const regressions: string[] = [];
     const warnings: string[] = [];
 
@@ -47,11 +47,11 @@ describe("live cache regression runner", () => {
       warnings,
     });
 
-    expect(regressions).toEqual([
+    expect(regressions).toEqual([]);
+    expect(warnings).toEqual([
       "openai:stable cacheRead=0 < min=4608",
       "openai:stable hitRate=0.000 < min=0.900",
     ]);
-    expect(warnings).toEqual([]);
   });
 
   it("retries hard cache baseline misses once", () => {
@@ -122,6 +122,65 @@ describe("live cache regression runner", () => {
     ).toBe(false);
   });
 
+  it("keeps OpenAI cache probes above the reasoning output floor", () => {
+    expect(
+      __testing.resolveCacheProbeMaxTokens({
+        maxTokens: 32,
+        providerTag: "openai",
+      }),
+    ).toBe(256);
+    expect(
+      __testing.resolveCacheProbeMaxTokens({
+        maxTokens: 512,
+        providerTag: "openai",
+      }),
+    ).toBe(512);
+    expect(
+      __testing.resolveCacheProbeMaxTokens({
+        maxTokens: 32,
+        providerTag: "anthropic",
+      }),
+    ).toBe(32);
+  });
+
+  it("accepts empty OpenAI cache probe text only when usage is observable", () => {
+    expect(
+      __testing.shouldAcceptEmptyOpenAICacheProbe({
+        providerTag: "openai",
+        text: "",
+        usage: { input: 5_000 },
+      }),
+    ).toBe(true);
+    expect(
+      __testing.shouldAcceptEmptyOpenAICacheProbe({
+        providerTag: "openai",
+        text: "",
+        usage: { cacheRead: 4_608 },
+      }),
+    ).toBe(true);
+    expect(
+      __testing.shouldAcceptEmptyOpenAICacheProbe({
+        providerTag: "openai",
+        text: "wrong",
+        usage: { input: 5_000 },
+      }),
+    ).toBe(false);
+    expect(
+      __testing.shouldAcceptEmptyOpenAICacheProbe({
+        providerTag: "anthropic",
+        text: "",
+        usage: { input: 5_000 },
+      }),
+    ).toBe(false);
+    expect(
+      __testing.shouldAcceptEmptyOpenAICacheProbe({
+        providerTag: "openai",
+        text: "",
+        usage: {},
+      }),
+    ).toBe(false);
+  });
+
   it("accepts a warmup that already hits the provider cache", () => {
     const findings = __testing.evaluateAgainstBaseline({
       lane: "image",
diff --git a/src/agents/live-cache-regression-runner.ts b/src/agents/live-cache-regression-runner.ts
index 78d01634d43..ac91ba0f587 100644
--- a/src/agents/live-cache-regression-runner.ts
+++ b/src/agents/live-cache-regression-runner.ts
@@ -22,6 +22,7 @@ const ANTHROPIC_TIMEOUT_MS = 120_000;
 const LIVE_CACHE_LANE_RETRIES = 1;
 const LIVE_CACHE_RESPONSE_RETRIES = 2;
 const OPENAI_CACHE_REASONING = "low" as unknown as never;
+const OPENAI_CACHE_MIN_MAX_TOKENS = 256;
 const OPENAI_PREFIX = buildStableCachePrefix("openai");
 const OPENAI_MCP_PREFIX = buildStableCachePrefix("openai-mcp-style");
 const ANTHROPIC_PREFIX = buildStableCachePrefix("anthropic");
@@ -153,6 +154,32 @@ function shouldRetryCacheProbeText(params: {
   );
 }
 
+function resolveCacheProbeMaxTokens(params: {
+  maxTokens: number | undefined;
+  providerTag: "anthropic" | "openai";
+}): number {
+  const requested = params.maxTokens ?? 64;
+  if (params.providerTag !== "openai") {
+    return requested;
+  }
+  return Math.max(requested, OPENAI_CACHE_MIN_MAX_TOKENS);
+}
+
+function shouldAcceptEmptyOpenAICacheProbe(params: {
+  providerTag: "anthropic" | "openai";
+  text: string;
+  usage: CacheUsage;
+}): boolean {
+  if (params.providerTag !== "openai" || params.text.trim().length > 0) {
+    return false;
+  }
+  return (
+    (params.usage.input ?? 0) > 0 ||
+    (params.usage.cacheRead ?? 0) > 0 ||
+    (params.usage.cacheWrite ?? 0) > 0
+  );
+}
+
 async function runToolOnlyTurn(params: {
   apiKey: string;
   cacheRetention: "none" | "short" | "long";
@@ -242,7 +269,10 @@ async function completeCacheProbe(params: {
       apiKey: params.apiKey,
       cacheRetention: params.cacheRetention,
       sessionId: params.sessionId,
-      maxTokens: params.maxTokens ?? 64,
+      maxTokens: resolveCacheProbeMaxTokens({
+        maxTokens: params.maxTokens,
+        providerTag: params.providerTag,
+      }),
       temperature: 0,
       ...(params.providerTag === "openai" ? { reasoning: OPENAI_CACHE_REASONING } : {}),
     },
@@ -250,6 +280,24 @@
     timeoutMs,
   );
   const text = extractAssistantText(response);
+  const usage = normalizeCacheUsage(response.usage);
+  if (
+    shouldAcceptEmptyOpenAICacheProbe({
+      providerTag: params.providerTag,
+      text,
+      usage,
+    })
+  ) {
+    logLiveCache(
+      `${params.providerTag} cache lane ${params.suffix} accepted empty text with usage ${formatUsage(usage)}`,
+    );
+    return {
+      suffix: params.suffix,
+      text,
+      usage,
+      hitRate: computeCacheHitRate(usage),
+    };
+  }
   if (shouldRetryCacheProbeText({ attempt, suffix: params.suffix, text })) {
     logLiveCache(
       `${params.providerTag} cache lane ${params.suffix} response mismatch; retrying: ${JSON.stringify(text)}`,
@@ -262,7 +310,6 @@
   if (!responseTextLower.includes(markerLower)) {
     throw new CacheProbeTextMismatchError(params.suffix, text);
   }
-  const usage = normalizeCacheUsage(response.usage);
   return {
     suffix: params.suffix,
     text,
@@ -551,6 +598,8 @@ function appendBaselineFindings(target: BaselineFindings, source: BaselineFindin
 export const __testing = {
   assertAgainstBaseline,
   evaluateAgainstBaseline,
+  resolveCacheProbeMaxTokens,
+  shouldAcceptEmptyOpenAICacheProbe,
   shouldRetryCacheProbeText,
   shouldRetryBaselineFindings,
 };
@@ -562,7 +611,7 @@ export async function runLiveCacheRegression(): Promise
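
Reviewer note: a self-contained sketch of the two decision points this diff adds to the probe path. The `CacheUsage` stand-in below is an assumption inferred from the diff (optional `input`/`cacheRead`/`cacheWrite` counters); the real type lives in the runner module, and the real helpers are wired through `completeCacheProbe` rather than called directly.

```ts
// Minimal stand-in for the runner's CacheUsage shape (assumed from the diff).
type CacheUsage = { input?: number; cacheRead?: number; cacheWrite?: number };

const OPENAI_CACHE_MIN_MAX_TOKENS = 256;

// The OpenAI lane pins low reasoning effort, and a reasoning model can spend a
// small completion budget entirely on reasoning tokens. The probe therefore
// raises maxTokens to a floor for OpenAI; other providers keep the caller's value.
function resolveCacheProbeMaxTokens(params: {
  maxTokens: number | undefined;
  providerTag: "anthropic" | "openai";
}): number {
  const requested = params.maxTokens ?? 64;
  if (params.providerTag !== "openai") {
    return requested;
  }
  return Math.max(requested, OPENAI_CACHE_MIN_MAX_TOKENS);
}

// An empty OpenAI completion still counts as a usable cache probe, but only
// when the usage payload proves the request actually reached the provider.
function shouldAcceptEmptyOpenAICacheProbe(params: {
  providerTag: "anthropic" | "openai";
  text: string;
  usage: CacheUsage;
}): boolean {
  if (params.providerTag !== "openai" || params.text.trim().length > 0) {
    return false;
  }
  return (
    (params.usage.input ?? 0) > 0 ||
    (params.usage.cacheRead ?? 0) > 0 ||
    (params.usage.cacheWrite ?? 0) > 0
  );
}

// The two decision points a probe attempt walks through:
console.log(resolveCacheProbeMaxTokens({ maxTokens: 32, providerTag: "openai" })); // 256
console.log(
  shouldAcceptEmptyOpenAICacheProbe({
    providerTag: "openai",
    text: "",
    usage: { cacheRead: 4_608 }, // observable usage -> accept empty text
  }),
); // true
```

The second helper is why the runner now normalizes usage before the text-mismatch check: a usage-bearing empty completion short-circuits to a successful probe result instead of burning a retry and eventually raising `CacheProbeTextMismatchError`.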