fix: normalize cached prompt token accounting

Tak Hoffman
2026-04-06 15:55:27 -05:00
parent a29b501ec9
commit 079494aee5
5 changed files with 142 additions and 16 deletions
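Worked example of the accounting this fix targets, using the values from the new tests (the payload shape is the OpenAI Responses usage object):

// Raw usage as reported by the Responses API: input_tokens includes the cached portion.
const rawUsage = {
  input_tokens: 120,
  output_tokens: 30,
  total_tokens: 250,
  input_tokens_details: { cached_tokens: 100 },
};
// Normalized accounting after this change:
//   input       = 120 - 100 = 20   (uncached prompt tokens only)
//   cacheRead   = 100               (cached prompt tokens, counted once)
//   output      = 30
//   totalTokens = 250               (reported total kept; if it were missing, derived as 20 + 30 + 100 + 0 = 150)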

View File

@@ -44,6 +44,9 @@ export interface UsageInfo {
total_tokens?: number;
prompt_tokens?: number;
completion_tokens?: number;
input_tokens_details?: {
cached_tokens?: number;
};
}
export type OpenAIResponsesAssistantPhase = "commentary" | "final_answer";

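For reference, a UsageInfo value carrying the new optional field might look like this (a sketch with illustrative values; the shape follows the interface above):

const usage: UsageInfo = {
  prompt_tokens: 120,
  completion_tokens: 30,
  total_tokens: 250,
  input_tokens_details: { cached_tokens: 100 },
};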
View File

@@ -0,0 +1,82 @@
import { describe, expect, it } from "vitest";
import type { ResponseObject } from "./openai-ws-connection.js";
import { buildAssistantMessageFromResponse } from "./openai-ws-message-conversion.js";
describe("openai ws message conversion", () => {
it("preserves cached token usage from responses usage details", () => {
const response: ResponseObject = {
id: "resp_123",
object: "response",
created_at: Date.now(),
status: "completed",
model: "gpt-5",
output: [
{
type: "message",
id: "msg_123",
role: "assistant",
status: "completed",
content: [{ type: "output_text", text: "hello" }],
},
],
usage: {
input_tokens: 120,
output_tokens: 30,
total_tokens: 250,
input_tokens_details: { cached_tokens: 100 },
},
};
const message = buildAssistantMessageFromResponse(response, {
api: "openai-responses",
provider: "openai",
id: "gpt-5",
});
expect(message.usage).toMatchObject({
input: 20,
output: 30,
cacheRead: 100,
cacheWrite: 0,
totalTokens: 250,
});
});
it("derives cache-inclusive total tokens when responses total is missing", () => {
const response: ResponseObject = {
id: "resp_124",
object: "response",
created_at: Date.now(),
status: "completed",
model: "gpt-5",
output: [
{
type: "message",
id: "msg_124",
role: "assistant",
status: "completed",
content: [{ type: "output_text", text: "hello" }],
},
],
usage: {
input_tokens: 120,
output_tokens: 30,
input_tokens_details: { cached_tokens: 100 },
},
};
const message = buildAssistantMessageFromResponse(response, {
api: "openai-responses",
provider: "openai",
id: "gpt-5",
});
expect(message.usage).toMatchObject({
input: 20,
output: 30,
cacheRead: 100,
cacheWrite: 0,
totalTokens: 150,
});
});
});

View File

@@ -563,6 +563,13 @@ export function buildAssistantMessageFromResponse(
const stopReason: StopReason = hasToolCalls ? "toolUse" : "stop";
const normalizedUsage = normalizeUsage(response.usage);
const rawTotalTokens = normalizedUsage?.total;
const resolvedTotalTokens =
rawTotalTokens && rawTotalTokens > 0
? rawTotalTokens
: (normalizedUsage?.input ?? 0) +
(normalizedUsage?.output ?? 0) +
(normalizedUsage?.cacheRead ?? 0) +
(normalizedUsage?.cacheWrite ?? 0);
const message = buildAssistantMessage({
model: modelInfo,
@@ -571,7 +578,9 @@ export function buildAssistantMessageFromResponse(
usage: buildUsageWithNoCost({
input: normalizedUsage?.input ?? 0,
output: normalizedUsage?.output ?? 0,
totalTokens: rawTotalTokens && rawTotalTokens > 0 ? rawTotalTokens : undefined,
cacheRead: normalizedUsage?.cacheRead ?? 0,
cacheWrite: normalizedUsage?.cacheWrite ?? 0,
totalTokens: resolvedTotalTokens > 0 ? resolvedTotalTokens : undefined,
}),
});

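The total-token fallback above can be read as a small standalone helper (a sketch only; resolveTotalTokens is a hypothetical name, not part of this change):

// Prefer the provider-reported total; otherwise derive a cache-inclusive total
// from the normalized parts so cached prompt tokens are not dropped.
function resolveTotalTokens(u: {
  input?: number;
  output?: number;
  cacheRead?: number;
  cacheWrite?: number;
  total?: number;
}): number | undefined {
  const total =
    u.total && u.total > 0
      ? u.total
      : (u.input ?? 0) + (u.output ?? 0) + (u.cacheRead ?? 0) + (u.cacheWrite ?? 0);
  return total > 0 ? total : undefined;
}

// With the second test's values: resolveTotalTokens({ input: 20, output: 30, cacheRead: 100, cacheWrite: 0 }) === 150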
View File

@@ -63,7 +63,7 @@ describe("normalizeUsage", () => {
cached_tokens: 19,
});
expect(usage).toEqual({
input: 30,
input: 11,
output: 9,
cacheRead: 19,
cacheWrite: undefined,
@@ -80,7 +80,7 @@ describe("normalizeUsage", () => {
prompt_tokens_details: { cached_tokens: 1024 },
});
expect(usage).toEqual({
input: 1113,
input: 89,
output: 5,
cacheRead: 1024,
cacheWrite: undefined,
@@ -88,6 +88,22 @@ describe("normalizeUsage", () => {
});
});
it("handles OpenAI Responses input_tokens_details.cached_tokens field", () => {
const usage = normalizeUsage({
input_tokens: 120,
output_tokens: 30,
total_tokens: 250,
input_tokens_details: { cached_tokens: 100 },
});
expect(usage).toEqual({
input: 20,
output: 30,
cacheRead: 100,
cacheWrite: undefined,
total: 250,
});
});
it("clamps negative input to zero (pre-subtracted cached_tokens > prompt_tokens)", () => {
// pi-ai OpenAI-format providers subtract cached_tokens from prompt_tokens
// upstream. When cached_tokens exceeds prompt_tokens the result is negative.

View File

@@ -17,6 +17,8 @@ export type UsageLike = {
cache_creation_input_tokens?: number;
// Moonshot/Kimi uses cached_tokens for cache read count (explicit caching API).
cached_tokens?: number;
// OpenAI Responses reports cached prompt reuse here.
input_tokens_details?: { cached_tokens?: number };
// Kimi K2 uses prompt_tokens_details.cached_tokens for automatic prefix caching.
prompt_tokens_details?: { cached_tokens?: number };
// Some agents/logs emit alternate naming.
@@ -90,13 +92,34 @@ export function normalizeUsage(raw?: UsageLike | null): NormalizedUsage | undefi
return undefined;
}
// Some providers (pi-ai OpenAI-format) pre-subtract cached_tokens from
// prompt_tokens upstream. When cached_tokens > prompt_tokens the result is
// negative, which is nonsensical. Clamp to 0.
const rawInput = asFiniteNumber(
raw.input ?? raw.inputTokens ?? raw.input_tokens ?? raw.promptTokens ?? raw.prompt_tokens,
const cacheRead = asFiniteNumber(
raw.cacheRead ??
raw.cache_read ??
raw.cache_read_input_tokens ??
raw.cached_tokens ??
raw.input_tokens_details?.cached_tokens ??
raw.prompt_tokens_details?.cached_tokens,
);
const input = rawInput !== undefined && rawInput < 0 ? 0 : rawInput;
const rawInputValue =
raw.input ?? raw.inputTokens ?? raw.input_tokens ?? raw.promptTokens ?? raw.prompt_tokens;
const usesOpenAIStylePromptTotals =
raw.cached_tokens !== undefined ||
raw.input_tokens_details?.cached_tokens !== undefined ||
raw.prompt_tokens_details?.cached_tokens !== undefined;
// Some providers (pi-ai OpenAI-format) pre-subtract cached_tokens from
// prompt/input totals upstream, while OpenAI-style prompt/input aliases
// include cached tokens in the reported prompt total. Normalize both cases
// to uncached input tokens so downstream prompt-token math does not double-
// count cache reads.
const rawInput = asFiniteNumber(rawInputValue);
const normalizedInput =
rawInput !== undefined && usesOpenAIStylePromptTotals && cacheRead !== undefined
? rawInput - cacheRead
: rawInput;
const input = normalizedInput !== undefined && normalizedInput < 0 ? 0 : normalizedInput;
const output = asFiniteNumber(
raw.output ??
raw.outputTokens ??
@@ -104,13 +127,6 @@ export function normalizeUsage(raw?: UsageLike | null): NormalizedUsage | undefi
raw.completionTokens ??
raw.completion_tokens,
);
const cacheRead = asFiniteNumber(
raw.cacheRead ??
raw.cache_read ??
raw.cache_read_input_tokens ??
raw.cached_tokens ??
raw.prompt_tokens_details?.cached_tokens,
);
const cacheWrite = asFiniteNumber(
raw.cacheWrite ?? raw.cache_write ?? raw.cache_creation_input_tokens,
);
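For comparison with the Responses case above, the same normalization covers the other prompt-total styles exercised by this commit's tests (expected values taken from those tests; the clamp case keeps its existing behavior):

// Kimi K2 automatic prefix caching: prompt_tokens includes the cached portion,
// so cached tokens are subtracted out of input here as well.
normalizeUsage({
  prompt_tokens: 1113,
  completion_tokens: 5,
  prompt_tokens_details: { cached_tokens: 1024 },
});
// -> input: 89, output: 5, cacheRead: 1024

// Pre-subtracted providers (pi-ai OpenAI-format) already removed cached_tokens upstream;
// when cached_tokens exceeds the reported prompt total, input clamps to 0 instead of going negative.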