diff --git a/src/agents/openai-ws-connection.ts b/src/agents/openai-ws-connection.ts
index a6e923ae1d5..e489cc36259 100644
--- a/src/agents/openai-ws-connection.ts
+++ b/src/agents/openai-ws-connection.ts
@@ -44,6 +44,9 @@ export interface UsageInfo {
   total_tokens?: number;
   prompt_tokens?: number;
   completion_tokens?: number;
+  input_tokens_details?: {
+    cached_tokens?: number;
+  };
 }
 
 export type OpenAIResponsesAssistantPhase = "commentary" | "final_answer";
diff --git a/src/agents/openai-ws-message-conversion.test.ts b/src/agents/openai-ws-message-conversion.test.ts
new file mode 100644
index 00000000000..4702f517192
--- /dev/null
+++ b/src/agents/openai-ws-message-conversion.test.ts
@@ -0,0 +1,82 @@
+import { describe, expect, it } from "vitest";
+import type { ResponseObject } from "./openai-ws-connection.js";
+import { buildAssistantMessageFromResponse } from "./openai-ws-message-conversion.js";
+
+describe("openai ws message conversion", () => {
+  it("preserves cached token usage from responses usage details", () => {
+    const response: ResponseObject = {
+      id: "resp_123",
+      object: "response",
+      created_at: Date.now(),
+      status: "completed",
+      model: "gpt-5",
+      output: [
+        {
+          type: "message",
+          id: "msg_123",
+          role: "assistant",
+          status: "completed",
+          content: [{ type: "output_text", text: "hello" }],
+        },
+      ],
+      usage: {
+        input_tokens: 120,
+        output_tokens: 30,
+        total_tokens: 250,
+        input_tokens_details: { cached_tokens: 100 },
+      },
+    };
+
+    const message = buildAssistantMessageFromResponse(response, {
+      api: "openai-responses",
+      provider: "openai",
+      id: "gpt-5",
+    });
+
+    expect(message.usage).toMatchObject({
+      input: 20,
+      output: 30,
+      cacheRead: 100,
+      cacheWrite: 0,
+      totalTokens: 250,
+    });
+  });
+
+  it("derives cache-inclusive total tokens when responses total is missing", () => {
+    const response: ResponseObject = {
+      id: "resp_124",
+      object: "response",
+      created_at: Date.now(),
+      status: "completed",
+      model: "gpt-5",
+      output: [
+        {
+          type: "message",
+          id: "msg_124",
+          role: "assistant",
+          status: "completed",
+          content: [{ type: "output_text", text: "hello" }],
+        },
+      ],
+      usage: {
+        input_tokens: 120,
+        output_tokens: 30,
+        input_tokens_details: { cached_tokens: 100 },
+      },
+    };
+
+    const message = buildAssistantMessageFromResponse(response, {
+      api: "openai-responses",
+      provider: "openai",
+      id: "gpt-5",
+    });
+
+    expect(message.usage).toMatchObject({
+      input: 20,
+      output: 30,
+      cacheRead: 100,
+      cacheWrite: 0,
+      totalTokens: 150,
+    });
+  });
+});
diff --git a/src/agents/openai-ws-message-conversion.ts b/src/agents/openai-ws-message-conversion.ts
index 3dd4665d303..506f03e0c36 100644
--- a/src/agents/openai-ws-message-conversion.ts
+++ b/src/agents/openai-ws-message-conversion.ts
@@ -563,6 +563,13 @@ export function buildAssistantMessageFromResponse(
   const stopReason: StopReason = hasToolCalls ? "toolUse" : "stop";
   const normalizedUsage = normalizeUsage(response.usage);
   const rawTotalTokens = normalizedUsage?.total;
+  const resolvedTotalTokens =
+    rawTotalTokens && rawTotalTokens > 0
+      ? rawTotalTokens
+      : (normalizedUsage?.input ?? 0) +
+        (normalizedUsage?.output ?? 0) +
+        (normalizedUsage?.cacheRead ?? 0) +
+        (normalizedUsage?.cacheWrite ?? 0);
 
   const message = buildAssistantMessage({
     model: modelInfo,
@@ -571,7 +578,9 @@
     usage: buildUsageWithNoCost({
       input: normalizedUsage?.input ?? 0,
       output: normalizedUsage?.output ?? 0,
-      totalTokens: rawTotalTokens && rawTotalTokens > 0 ? rawTotalTokens : undefined,
+      cacheRead: normalizedUsage?.cacheRead ?? 0,
+      cacheWrite: normalizedUsage?.cacheWrite ?? 0,
+      totalTokens: resolvedTotalTokens > 0 ? resolvedTotalTokens : undefined,
     }),
   });
 
diff --git a/src/agents/usage.test.ts b/src/agents/usage.test.ts
index 01b3bf893a3..3ac11a7bd93 100644
--- a/src/agents/usage.test.ts
+++ b/src/agents/usage.test.ts
@@ -63,7 +63,7 @@ describe("normalizeUsage", () => {
       cached_tokens: 19,
     });
     expect(usage).toEqual({
-      input: 30,
+      input: 11,
       output: 9,
       cacheRead: 19,
       cacheWrite: undefined,
@@ -80,7 +80,7 @@ describe("normalizeUsage", () => {
       prompt_tokens_details: { cached_tokens: 1024 },
     });
     expect(usage).toEqual({
-      input: 1113,
+      input: 89,
       output: 5,
       cacheRead: 1024,
       cacheWrite: undefined,
@@ -88,6 +88,22 @@ describe("normalizeUsage", () => {
     });
   });
 
+  it("handles OpenAI Responses input_tokens_details.cached_tokens field", () => {
+    const usage = normalizeUsage({
+      input_tokens: 120,
+      output_tokens: 30,
+      total_tokens: 250,
+      input_tokens_details: { cached_tokens: 100 },
+    });
+    expect(usage).toEqual({
+      input: 20,
+      output: 30,
+      cacheRead: 100,
+      cacheWrite: undefined,
+      total: 250,
+    });
+  });
+
   it("clamps negative input to zero (pre-subtracted cached_tokens > prompt_tokens)", () => {
     // pi-ai OpenAI-format providers subtract cached_tokens from prompt_tokens
     // upstream. When cached_tokens exceeds prompt_tokens the result is negative.
diff --git a/src/agents/usage.ts b/src/agents/usage.ts
index 251cb56155c..0071bb81263 100644
--- a/src/agents/usage.ts
+++ b/src/agents/usage.ts
@@ -17,6 +17,8 @@ export type UsageLike = {
   cache_creation_input_tokens?: number;
   // Moonshot/Kimi uses cached_tokens for cache read count (explicit caching API).
   cached_tokens?: number;
+  // OpenAI Responses reports cached prompt reuse here.
+  input_tokens_details?: { cached_tokens?: number };
   // Kimi K2 uses prompt_tokens_details.cached_tokens for automatic prefix caching.
   prompt_tokens_details?: { cached_tokens?: number };
   // Some agents/logs emit alternate naming.
@@ -90,13 +92,34 @@ export function normalizeUsage(raw?: UsageLike | null): NormalizedUsage | undefi
     return undefined;
   }
 
-  // Some providers (pi-ai OpenAI-format) pre-subtract cached_tokens from
-  // prompt_tokens upstream. When cached_tokens > prompt_tokens the result is
-  // negative, which is nonsensical. Clamp to 0.
-  const rawInput = asFiniteNumber(
-    raw.input ?? raw.inputTokens ?? raw.input_tokens ?? raw.promptTokens ?? raw.prompt_tokens,
+  const cacheRead = asFiniteNumber(
+    raw.cacheRead ??
+      raw.cache_read ??
+      raw.cache_read_input_tokens ??
+      raw.cached_tokens ??
+      raw.input_tokens_details?.cached_tokens ??
+      raw.prompt_tokens_details?.cached_tokens,
   );
-  const input = rawInput !== undefined && rawInput < 0 ? 0 : rawInput;
+
+  const rawInputValue =
+    raw.input ?? raw.inputTokens ?? raw.input_tokens ?? raw.promptTokens ?? raw.prompt_tokens;
+
+  const usesOpenAIStylePromptTotals =
+    raw.cached_tokens !== undefined ||
+    raw.input_tokens_details?.cached_tokens !== undefined ||
+    raw.prompt_tokens_details?.cached_tokens !== undefined;
+
+  // Some providers (pi-ai OpenAI-format) pre-subtract cached_tokens from
+  // prompt/input totals upstream, while OpenAI-style prompt/input aliases
+  // include cached tokens in the reported prompt total. Normalize both cases
+  // to uncached input tokens so downstream prompt-token math does not double-
+  // count cache reads.
+  const rawInput = asFiniteNumber(rawInputValue);
+  const normalizedInput =
+    rawInput !== undefined && usesOpenAIStylePromptTotals && cacheRead !== undefined
+      ? rawInput - cacheRead
+      : rawInput;
+  const input = normalizedInput !== undefined && normalizedInput < 0 ? 0 : normalizedInput;
   const output = asFiniteNumber(
     raw.output ??
       raw.outputTokens ??
@@ -104,13 +127,6 @@ export function normalizeUsage(raw?: UsageLike | null): NormalizedUsage | undefi
       raw.completionTokens ??
       raw.completion_tokens,
   );
-  const cacheRead = asFiniteNumber(
-    raw.cacheRead ??
-      raw.cache_read ??
-      raw.cache_read_input_tokens ??
-      raw.cached_tokens ??
-      raw.prompt_tokens_details?.cached_tokens,
-  );
   const cacheWrite = asFiniteNumber(
     raw.cacheWrite ?? raw.cache_write ?? raw.cache_creation_input_tokens,
   );
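Note (not part of the patch): a minimal standalone TypeScript sketch of the usage arithmetic the diff implements, assuming the OpenAI Responses field names shown above. The helper and type below are illustrative only; they are not the repository's normalizeUsage or UsageLike, and the authoritative clamping and total-fallback behavior is the one in the patch.

// sketch-usage-normalization.ts (illustrative only, not repository code)

type ResponsesUsageSketch = {
  input_tokens?: number;
  output_tokens?: number;
  total_tokens?: number;
  input_tokens_details?: { cached_tokens?: number };
};

function sketchNormalizeResponsesUsage(usage: ResponsesUsageSketch) {
  // OpenAI Responses reports cached prompt reuse under input_tokens_details.
  const cacheRead = usage.input_tokens_details?.cached_tokens ?? 0;
  // input_tokens includes cached tokens, so subtract them to get uncached
  // input; clamp at zero in case a provider already pre-subtracted them.
  const input = Math.max((usage.input_tokens ?? 0) - cacheRead, 0);
  const output = usage.output_tokens ?? 0;
  // Prefer the reported total; otherwise derive a cache-inclusive total.
  const totalTokens =
    usage.total_tokens && usage.total_tokens > 0
      ? usage.total_tokens
      : input + output + cacheRead;
  return { input, output, cacheRead, totalTokens };
}

// Mirrors the first test: 120 prompt tokens, 100 of them served from cache.
// -> { input: 20, output: 30, cacheRead: 100, totalTokens: 250 }
console.log(
  sketchNormalizeResponsesUsage({
    input_tokens: 120,
    output_tokens: 30,
    total_tokens: 250,
    input_tokens_details: { cached_tokens: 100 },
  }),
);

// Mirrors the second test: total_tokens is absent, so the total is derived.
// -> { input: 20, output: 30, cacheRead: 100, totalTokens: 150 }
console.log(
  sketchNormalizeResponsesUsage({
    input_tokens: 120,
    output_tokens: 30,
    input_tokens_details: { cached_tokens: 100 },
  }),
);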