From 0c26623a965414df85088bf2d59f5912b4863b08 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 21 Apr 2026 03:48:17 +0100 Subject: [PATCH] fix: correct tiered model pricing costs --- src/gateway/model-pricing-cache.test.ts | 13 ++- src/gateway/model-pricing-cache.ts | 5 +- src/utils/usage-format.test.ts | 68 ++++----------- src/utils/usage-format.ts | 105 ++++++++---------------- 4 files changed, 66 insertions(+), 125 deletions(-) diff --git a/src/gateway/model-pricing-cache.test.ts b/src/gateway/model-pricing-cache.test.ts index 5e8dfbeb7fd..d88857c6728 100644 --- a/src/gateway/model-pricing-cache.test.ts +++ b/src/gateway/model-pricing-cache.test.ts @@ -279,21 +279,25 @@ describe("model-pricing-cache", () => { "volcengine/doubao-seed-2-0-pro": { input_cost_per_token: 4.6e-7, output_cost_per_token: 2.3e-6, + cache_creation_input_token_cost: 9.2e-7, litellm_provider: "volcengine", tiered_pricing: [ { input_cost_per_token: 4.6e-7, output_cost_per_token: 2.3e-6, + cache_creation_input_token_cost: 9.2e-8, range: [0, 32000], }, { input_cost_per_token: 7e-7, output_cost_per_token: 3.5e-6, + cache_creation_input_token_cost: 1.4e-7, range: [32000, 128000], }, { input_cost_per_token: 1.4e-6, output_cost_per_token: 7e-6, + cache_creation_input_token_cost: 2.8e-7, range: [128000, 256000], }, ], @@ -316,14 +320,16 @@ describe("model-pricing-cache", () => { expect(pricing).toBeDefined(); expect(pricing!.input).toBeCloseTo(0.46); expect(pricing!.output).toBeCloseTo(2.3); + expect(pricing!.cacheWrite).toBeCloseTo(0.92); expect(pricing!.tieredPricing).toHaveLength(3); expect(pricing!.tieredPricing![0]).toEqual({ input: expect.closeTo(0.46), output: expect.closeTo(2.3), cacheRead: 0, - cacheWrite: 0, + cacheWrite: expect.closeTo(0.092), range: [0, 32000], }); + expect(pricing!.tieredPricing![2].cacheWrite).toBeCloseTo(0.28); expect(pricing!.tieredPricing![2].range).toEqual([128000, 256000]); }); @@ -359,6 +365,7 @@ describe("model-pricing-cache", () => { { input_cost_per_token: 7e-7, output_cost_per_token: 3.5e-6, + cache_creation_input_token_cost: 1.4e-7, range: [32000], }, ], @@ -382,6 +389,7 @@ describe("model-pricing-cache", () => { expect(pricing!.tieredPricing).toHaveLength(2); expect(pricing!.tieredPricing![0].range).toEqual([0, 32000]); expect(pricing!.tieredPricing![1].range).toEqual([32000, Infinity]); + expect(pricing!.tieredPricing![1].cacheWrite).toBeCloseTo(0.14); }); it("merges OpenRouter flat pricing with LiteLLM tiered pricing", async () => { @@ -424,11 +432,13 @@ describe("model-pricing-cache", () => { { input_cost_per_token: 4e-7, output_cost_per_token: 2.4e-6, + cache_creation_input_token_cost: 8e-8, range: [0, 256000], }, { input_cost_per_token: 5e-7, output_cost_per_token: 3e-6, + cache_creation_input_token_cost: 1e-7, range: [256000, 1000000], }, ], @@ -455,6 +465,7 @@ describe("model-pricing-cache", () => { // LiteLLM tiered pricing is merged in expect(pricing!.tieredPricing).toHaveLength(2); expect(pricing!.tieredPricing![1].range).toEqual([256000, 1000000]); + expect(pricing!.tieredPricing![1].cacheWrite).toBeCloseTo(0.1); }); it("falls back gracefully when LiteLLM fetch fails", async () => { diff --git a/src/gateway/model-pricing-cache.ts b/src/gateway/model-pricing-cache.ts index 5003328d160..7f95a4dbbf8 100644 --- a/src/gateway/model-pricing-cache.ts +++ b/src/gateway/model-pricing-cache.ts @@ -153,6 +153,7 @@ type LiteLLMTierRaw = { input_cost_per_token?: unknown; output_cost_per_token?: unknown; cache_read_input_token_cost?: unknown; + cache_creation_input_token_cost?: unknown; range?: unknown; }; @@ -194,7 +195,7 @@ function parseLiteLLMTieredPricing(tiers: unknown): CachedPricingTier[] | undefi input: toPricePerMillion(inputPerToken), output: toPricePerMillion(outputPerToken), cacheRead: toPricePerMillion(parseNumberString(tier.cache_read_input_token_cost)), - cacheWrite: 0, + cacheWrite: toPricePerMillion(parseNumberString(tier.cache_creation_input_token_cost)), range: [start, end], }); } @@ -211,7 +212,7 @@ function parseLiteLLMPricing(entry: LiteLLMModelEntry): CachedModelPricing | nul input: toPricePerMillion(inputPerToken), output: toPricePerMillion(outputPerToken), cacheRead: toPricePerMillion(parseNumberString(entry.cache_read_input_token_cost)), - cacheWrite: 0, + cacheWrite: toPricePerMillion(parseNumberString(entry.cache_creation_input_token_cost)), }; const tieredPricing = parseLiteLLMTieredPricing(entry.tiered_pricing); if (tieredPricing) { diff --git a/src/utils/usage-format.test.ts b/src/utils/usage-format.test.ts index a54f2f32424..d22e83b54cd 100644 --- a/src/utils/usage-format.test.ts +++ b/src/utils/usage-format.test.ts @@ -282,7 +282,7 @@ describe("usage-format", () => { expect(total).toBeCloseTo(0.003); }); - it("estimates cost with two tiers — input split across tiers", () => { + it("uses the matching context tier instead of blending lower tiers", () => { // Tier 1: [0, 32000) → input $0.30/M, output $1.50/M // Tier 2: [32000, 128000) → input $0.50/M, output $2.50/M const tiers: PricingTier[] = [ @@ -291,17 +291,13 @@ describe("usage-format", () => { ]; const cost = { input: 0.3, output: 1.5, cacheRead: 0, cacheWrite: 0, tieredPricing: tiers }; - // 40000 input tokens, 10000 output tokens - // Tier 1 gets 32000/40000 = 80% of input → 32000 input tokens - // Tier 2 gets 8000/40000 = 20% of input → 8000 input tokens - // Input cost = (32000 * 0.3 + 8000 * 0.5) / 1M = (9600 + 4000) / 1M = 0.0136 - // Output cost = (10000 * 0.8 * 1.5 + 10000 * 0.2 * 2.5) / 1M = (12000 + 5000) / 1M = 0.017 - // Total = 0.0136 + 0.017 = 0.0306 + // 40000 input tokens selects Tier 2 for the whole request: + // (40000 * 0.5 + 10000 * 2.5) / 1M = 0.045 const total = estimateUsageCost({ usage: { input: 40_000, output: 10_000 }, cost, }); - expect(total).toBeCloseTo(0.0306, 4); + expect(total).toBeCloseTo(0.045, 4); }); it("estimates cost with three tiers — volcengine-style pricing", () => { @@ -316,22 +312,13 @@ describe("usage-format", () => { ]; const cost = { input: 0.46, output: 2.3, cacheRead: 0, cacheWrite: 0, tieredPricing: tiers }; - // 200000 input tokens, 5000 output tokens - // Tier 1: 32000 tokens, fraction = 32000/200000 = 0.16 - // Tier 2: 96000 tokens, fraction = 96000/200000 = 0.48 - // Tier 3: 72000 tokens, fraction = 72000/200000 = 0.36 - // - // Input cost = (32000*0.46 + 96000*0.70 + 72000*1.40) / 1M - // = (14720 + 67200 + 100800) / 1M = 182720 / 1M = 0.18272 - // Output cost = 5000 * (0.16*2.3 + 0.48*3.5 + 0.36*7.0) / 1M - // = 5000 * (0.368 + 1.68 + 2.52) / 1M - // = 5000 * 4.568 / 1M = 22840 / 1M = 0.02284 - // Total = 0.18272 + 0.02284 = 0.20556 + // 200000 input tokens selects Tier 3 for the whole request: + // (200000 * 1.40 + 5000 * 7.00) / 1M = 0.315 const total = estimateUsageCost({ usage: { input: 200_000, output: 5_000 }, cost, }); - expect(total).toBeCloseTo(0.20556, 4); + expect(total).toBeCloseTo(0.315, 4); }); it("uses first tier rates for output when input is zero", () => { @@ -375,22 +362,13 @@ describe("usage-format", () => { ]; const cost = { input: 0.3, output: 1.5, cacheRead: 0, cacheWrite: 0, tieredPricing: tiers }; - // 200000 input, 10000 output - // Tier 1: 32000 tokens, fraction = 32000/200000 = 0.16 - // Tier 2: 96000 tokens, fraction = 96000/200000 = 0.48 - // Overflow (at Tier 2 rates): 72000 tokens, fraction = 72000/200000 = 0.36 - // - // Input cost = (32000*0.3 + 96000*0.5 + 72000*0.5) / 1M - // = (9600 + 48000 + 36000) / 1M = 93600/1M = 0.0936 - // Output cost = 10000 * (0.16*1.5 + 0.48*2.5 + 0.36*2.5) / 1M - // = 10000 * (0.24 + 1.2 + 0.9) / 1M - // = 10000 * 2.34 / 1M = 23400/1M = 0.0234 - // Total = 0.0936 + 0.0234 = 0.117 + // 200000 input tokens exceeds the max range, so the last tier is the + // whole-request fallback: (200000 * 0.5 + 10000 * 2.5) / 1M = 0.125 const total = estimateUsageCost({ usage: { input: 200_000, output: 10_000 }, cost, }); - expect(total).toBeCloseTo(0.117, 4); + expect(total).toBeCloseTo(0.125, 4); }); it("bills overflow at last tier when only a single small-range tier exists (e.g. <30K)", () => { @@ -400,14 +378,7 @@ describe("usage-format", () => { ]; const cost = { input: 1.0, output: 3.0, cacheRead: 0.5, cacheWrite: 0, tieredPricing: tiers }; - // 100000 input, 5000 output, 2000 cacheRead - // Tier 1: 30000 tokens, fraction = 30000/100000 = 0.3 - // Overflow (at Tier 1 rates): 70000 tokens, fraction = 70000/100000 = 0.7 - // Fractions sum to 1.0 — all output/cache fully billed - // - // Input cost = (30000*1.0 + 70000*1.0) / 1M = 100000/1M = 0.1 - // Output cost = 5000 * (0.3*3.0 + 0.7*3.0) / 1M = 5000*3.0/1M = 0.015 - // CacheRead cost = 2000 * (0.3*0.5 + 0.7*0.5) / 1M = 2000*0.5/1M = 0.001 + // 100000 input exceeds the only range, so Tier 1 is the whole-request fallback. // Total = 0.1 + 0.015 + 0.001 = 0.116 const total = estimateUsageCost({ usage: { input: 100_000, output: 5_000, cacheRead: 2_000 }, @@ -425,19 +396,12 @@ describe("usage-format", () => { ]; const cost = { input: 0.3, output: 1.5, cacheRead: 0, cacheWrite: 0, tieredPricing: tiers }; - // 200000 input, 10000 output - // Tier 1: 32000 tokens, fraction = 32000/200000 = 0.16 - // Tier 2: 168000 tokens, fraction = 168000/200000 = 0.84 - // No overflow — Tier 2 absorbs everything beyond 32K - // - // Input cost = (32000*0.3 + 168000*0.5) / 1M = (9600 + 84000) / 1M = 0.0936 - // Output cost = 10000 * (0.16*1.5 + 0.84*2.5) / 1M = 10000 * (0.24 + 2.1) / 1M = 0.0234 - // Total = 0.0936 + 0.0234 = 0.117 + // 200000 input tokens selects the open-ended Tier 2 for the whole request. const total = estimateUsageCost({ usage: { input: 200_000, output: 10_000 }, cost, }); - expect(total).toBeCloseTo(0.117, 4); + expect(total).toBeCloseTo(0.125, 4); }); it("uses declared tier ranges instead of sequential widths", () => { @@ -452,10 +416,10 @@ describe("usage-format", () => { cost, }); - expect(total).toBeCloseTo(0.00125, 8); + expect(total).toBeCloseTo(0.00075, 8); }); - it("bills malformed tier gaps at a fallback tier instead of dropping them", () => { + it("bills malformed tier gaps at a whole-request fallback tier", () => { const tiers: PricingTier[] = [ { input: 1, output: 10, cacheRead: 0, cacheWrite: 0, range: [0, 50] }, { input: 3, output: 30, cacheRead: 0, cacheWrite: 0, range: [100, 150] }, @@ -467,7 +431,7 @@ describe("usage-format", () => { cost, }); - expect(total).toBeCloseTo(0.00175, 8); + expect(total).toBeCloseTo(0.00225, 8); }); it("normalizes open-ended range from models.json ([start] and [start, -1])", async () => { diff --git a/src/utils/usage-format.ts b/src/utils/usage-format.ts index 634a0589825..b805bd60567 100644 --- a/src/utils/usage-format.ts +++ b/src/utils/usage-format.ts @@ -313,23 +313,32 @@ export function resolveModelCostConfig(params: { const toNumber = (value: number | undefined): number => typeof value === "number" && Number.isFinite(value) ? value : 0; -/** - * Compute the cost for a single token dimension (input, output, cacheRead, - * or cacheWrite) across a set of sorted tiered-pricing tiers. - * - * The tiers define ranges on the **input** token axis. For each tier, - * the proportion of the total input that falls into that range determines - * the fraction of *all* token types billed at that tier's rates. - * - * For example, if the input is 40 000 tokens and the tiers are: - * [0, 32000) → $0.30/M input, $1.50/M output - * [32000, 128000) → $0.50/M input, $2.50/M output - * - * Then 80 % of every dimension is billed at the first tier and 20 % at the - * second tier. - * - * Prices are per-million; the caller divides by 1 000 000 after summing. - */ +function selectPricingTier(tiers: PricingTier[], input: number): PricingTier | undefined { + const sortedTiers = tiers.toSorted((a, b) => a.range[0] - b.range[0]); + if (sortedTiers.length === 0) { + return undefined; + } + if (input <= 0) { + return sortedTiers[0]; + } + + for (const tier of sortedTiers) { + const [start, end] = tier.range; + if (input >= start && input < end) { + return tier; + } + } + + for (let index = sortedTiers.length - 1; index >= 0; index -= 1) { + const tier = sortedTiers[index]; + if (input >= tier.range[0]) { + return tier; + } + } + + return sortedTiers[0]; +} + function computeTieredCost( tiers: PricingTier[], input: number, @@ -337,61 +346,17 @@ function computeTieredCost( cacheRead: number, cacheWrite: number, ): number { - const totalInputTokens = input; - const sortedTiers = tiers.toSorted((a, b) => a.range[0] - b.range[0]); - if (totalInputTokens <= 0) { - // If there are no input tokens the tier proportion is undefined; - // fall back to the first tier for any residual output/cache usage. - const tier = sortedTiers[0]; - if (!tier) { - return 0; - } - return output * tier.output + cacheRead * tier.cacheRead + cacheWrite * tier.cacheWrite; + const tier = selectPricingTier(tiers, input); + if (!tier) { + return 0; } - let total = 0; - let billedInput = 0; - let coveredUntil = 0; - let lastTier: PricingTier | undefined; - - for (const tier of sortedTiers) { - const [start, end] = tier.range; - const tierStart = Math.max(0, start, coveredUntil); - const tierEnd = Math.min(totalInputTokens, end); - const inputInTier = Math.max(0, tierEnd - tierStart); - if (end > coveredUntil) { - coveredUntil = end; - } - if (inputInTier <= 0) { - continue; - } - const fraction = inputInTier / totalInputTokens; - total += - inputInTier * tier.input + - output * fraction * tier.output + - cacheRead * fraction * tier.cacheRead + - cacheWrite * fraction * tier.cacheWrite; - billedInput += inputInTier; - lastTier = tier; - } - - // Bill any uncovered gaps or overflow at the highest matched tier's rate. - // This keeps malformed remote/user tier ranges from underestimating cost. - const unbilledInput = totalInputTokens - billedInput; - if (unbilledInput > 0) { - const fallbackTier = lastTier ?? sortedTiers[sortedTiers.length - 1]; - if (!fallbackTier) { - return total; - } - const fraction = unbilledInput / totalInputTokens; - total += - unbilledInput * fallbackTier.input + - output * fraction * fallbackTier.output + - cacheRead * fraction * fallbackTier.cacheRead + - cacheWrite * fraction * fallbackTier.cacheWrite; - } - - return total; + return ( + input * tier.input + + output * tier.output + + cacheRead * tier.cacheRead + + cacheWrite * tier.cacheWrite + ); } export function estimateUsageCost(params: {