diff --git a/src/agents/live-cache-regression-runner.test.ts b/src/agents/live-cache-regression-runner.test.ts index 0202d33bfda..e47b6bf42e5 100644 --- a/src/agents/live-cache-regression-runner.test.ts +++ b/src/agents/live-cache-regression-runner.test.ts @@ -143,6 +143,16 @@ describe("live cache regression runner", () => { ).toBe(1024); }); + it("classifies Anthropic tool-only probe misses as provider drift", () => { + expect(__testing.isAnthropicToolProbeDrift(new Error("expected tool call for noop"))).toBe(true); + expect( + __testing.isAnthropicToolProbeDrift( + new Error('expected tool-only response for noop, got "ok"'), + ), + ).toBe(true); + expect(__testing.isAnthropicToolProbeDrift(new Error("other failure"))).toBe(false); + }); + it("accepts empty cache probe text only when usage is observable", () => { expect( __testing.shouldAcceptEmptyCacheProbe({ diff --git a/src/agents/live-cache-regression-runner.ts b/src/agents/live-cache-regression-runner.ts index 6171043ace7..17183024d3f 100644 --- a/src/agents/live-cache-regression-runner.ts +++ b/src/agents/live-cache-regression-runner.ts @@ -605,6 +605,16 @@ function isAnthropicEmptyCacheProbe(error: unknown): boolean { return error instanceof CacheProbeTextMismatchError && error.text.trim().length === 0; } +function isAnthropicToolProbeDrift(error: unknown): boolean { + if (!(error instanceof Error)) { + return false; + } + return ( + error.message.startsWith("expected tool call for ") || + error.message.startsWith("expected tool-only response for ") + ); +} + function shouldSkipAnthropicCacheProviderDrift(error: unknown): boolean { return Boolean( shouldSkipLiveProviderDrift({ @@ -646,8 +656,16 @@ async function runAnthropicCacheLane(params: { } } - if (shouldSkipAnthropicCacheProviderDrift(lastError) || isAnthropicEmptyCacheProbe(lastError)) { - const reason = isAnthropicEmptyCacheProbe(lastError) ? "empty response" : "account drift"; + if ( + shouldSkipAnthropicCacheProviderDrift(lastError) || + isAnthropicEmptyCacheProbe(lastError) || + isAnthropicToolProbeDrift(lastError) + ) { + const reason = isAnthropicEmptyCacheProbe(lastError) + ? "empty response" + : isAnthropicToolProbeDrift(lastError) + ? "tool probe drift" + : "account drift"; const warning = `anthropic ${params.lane} skipped: ${reason}`; params.warnings.push(warning); logLiveCache(warning); @@ -682,6 +700,7 @@ export const __testing = { assertAgainstBaseline, evaluateAgainstBaseline, resolveCacheProbeMaxTokens, + isAnthropicToolProbeDrift, shouldAcceptEmptyCacheProbe, shouldRetryCacheProbeText, shouldRetryBaselineFindings,