fix: correct tiered model pricing costs

This commit is contained in:
Peter Steinberger
2026-04-21 03:48:17 +01:00
parent 04d41aeae1
commit 0c26623a96
4 changed files with 66 additions and 125 deletions

View File

@@ -279,21 +279,25 @@ describe("model-pricing-cache", () => {
"volcengine/doubao-seed-2-0-pro": {
input_cost_per_token: 4.6e-7,
output_cost_per_token: 2.3e-6,
cache_creation_input_token_cost: 9.2e-7,
litellm_provider: "volcengine",
tiered_pricing: [
{
input_cost_per_token: 4.6e-7,
output_cost_per_token: 2.3e-6,
cache_creation_input_token_cost: 9.2e-8,
range: [0, 32000],
},
{
input_cost_per_token: 7e-7,
output_cost_per_token: 3.5e-6,
cache_creation_input_token_cost: 1.4e-7,
range: [32000, 128000],
},
{
input_cost_per_token: 1.4e-6,
output_cost_per_token: 7e-6,
cache_creation_input_token_cost: 2.8e-7,
range: [128000, 256000],
},
],
@@ -316,14 +320,16 @@ describe("model-pricing-cache", () => {
expect(pricing).toBeDefined();
expect(pricing!.input).toBeCloseTo(0.46);
expect(pricing!.output).toBeCloseTo(2.3);
expect(pricing!.cacheWrite).toBeCloseTo(0.92);
expect(pricing!.tieredPricing).toHaveLength(3);
expect(pricing!.tieredPricing![0]).toEqual({
input: expect.closeTo(0.46),
output: expect.closeTo(2.3),
cacheRead: 0,
cacheWrite: 0,
cacheWrite: expect.closeTo(0.092),
range: [0, 32000],
});
expect(pricing!.tieredPricing![2].cacheWrite).toBeCloseTo(0.28);
expect(pricing!.tieredPricing![2].range).toEqual([128000, 256000]);
});
@@ -359,6 +365,7 @@ describe("model-pricing-cache", () => {
{
input_cost_per_token: 7e-7,
output_cost_per_token: 3.5e-6,
cache_creation_input_token_cost: 1.4e-7,
range: [32000],
},
],
@@ -382,6 +389,7 @@ describe("model-pricing-cache", () => {
expect(pricing!.tieredPricing).toHaveLength(2);
expect(pricing!.tieredPricing![0].range).toEqual([0, 32000]);
expect(pricing!.tieredPricing![1].range).toEqual([32000, Infinity]);
expect(pricing!.tieredPricing![1].cacheWrite).toBeCloseTo(0.14);
});
it("merges OpenRouter flat pricing with LiteLLM tiered pricing", async () => {
@@ -424,11 +432,13 @@ describe("model-pricing-cache", () => {
{
input_cost_per_token: 4e-7,
output_cost_per_token: 2.4e-6,
cache_creation_input_token_cost: 8e-8,
range: [0, 256000],
},
{
input_cost_per_token: 5e-7,
output_cost_per_token: 3e-6,
cache_creation_input_token_cost: 1e-7,
range: [256000, 1000000],
},
],
@@ -455,6 +465,7 @@ describe("model-pricing-cache", () => {
// LiteLLM tiered pricing is merged in
expect(pricing!.tieredPricing).toHaveLength(2);
expect(pricing!.tieredPricing![1].range).toEqual([256000, 1000000]);
expect(pricing!.tieredPricing![1].cacheWrite).toBeCloseTo(0.1);
});
it("falls back gracefully when LiteLLM fetch fails", async () => {

View File

@@ -153,6 +153,7 @@ type LiteLLMTierRaw = {
input_cost_per_token?: unknown;
output_cost_per_token?: unknown;
cache_read_input_token_cost?: unknown;
cache_creation_input_token_cost?: unknown;
range?: unknown;
};
@@ -194,7 +195,7 @@ function parseLiteLLMTieredPricing(tiers: unknown): CachedPricingTier[] | undefi
input: toPricePerMillion(inputPerToken),
output: toPricePerMillion(outputPerToken),
cacheRead: toPricePerMillion(parseNumberString(tier.cache_read_input_token_cost)),
cacheWrite: 0,
cacheWrite: toPricePerMillion(parseNumberString(tier.cache_creation_input_token_cost)),
range: [start, end],
});
}
@@ -211,7 +212,7 @@ function parseLiteLLMPricing(entry: LiteLLMModelEntry): CachedModelPricing | nul
input: toPricePerMillion(inputPerToken),
output: toPricePerMillion(outputPerToken),
cacheRead: toPricePerMillion(parseNumberString(entry.cache_read_input_token_cost)),
cacheWrite: 0,
cacheWrite: toPricePerMillion(parseNumberString(entry.cache_creation_input_token_cost)),
};
const tieredPricing = parseLiteLLMTieredPricing(entry.tiered_pricing);
if (tieredPricing) {

View File

@@ -282,7 +282,7 @@ describe("usage-format", () => {
expect(total).toBeCloseTo(0.003);
});
it("estimates cost with two tiers — input split across tiers", () => {
it("uses the matching context tier instead of blending lower tiers", () => {
// Tier 1: [0, 32000) → input $0.30/M, output $1.50/M
// Tier 2: [32000, 128000) → input $0.50/M, output $2.50/M
const tiers: PricingTier[] = [
@@ -291,17 +291,13 @@ describe("usage-format", () => {
];
const cost = { input: 0.3, output: 1.5, cacheRead: 0, cacheWrite: 0, tieredPricing: tiers };
// 40000 input tokens, 10000 output tokens
// Tier 1 gets 32000/40000 = 80% of input → 32000 input tokens
// Tier 2 gets 8000/40000 = 20% of input → 8000 input tokens
// Input cost = (32000 * 0.3 + 8000 * 0.5) / 1M = (9600 + 4000) / 1M = 0.0136
// Output cost = (10000 * 0.8 * 1.5 + 10000 * 0.2 * 2.5) / 1M = (12000 + 5000) / 1M = 0.017
// Total = 0.0136 + 0.017 = 0.0306
// 40000 input tokens selects Tier 2 for the whole request:
// (40000 * 0.5 + 10000 * 2.5) / 1M = 0.045
const total = estimateUsageCost({
usage: { input: 40_000, output: 10_000 },
cost,
});
expect(total).toBeCloseTo(0.0306, 4);
expect(total).toBeCloseTo(0.045, 4);
});
it("estimates cost with three tiers — volcengine-style pricing", () => {
@@ -316,22 +312,13 @@ describe("usage-format", () => {
];
const cost = { input: 0.46, output: 2.3, cacheRead: 0, cacheWrite: 0, tieredPricing: tiers };
// 200000 input tokens, 5000 output tokens
// Tier 1: 32000 tokens, fraction = 32000/200000 = 0.16
// Tier 2: 96000 tokens, fraction = 96000/200000 = 0.48
// Tier 3: 72000 tokens, fraction = 72000/200000 = 0.36
//
// Input cost = (32000*0.46 + 96000*0.70 + 72000*1.40) / 1M
// = (14720 + 67200 + 100800) / 1M = 182720 / 1M = 0.18272
// Output cost = 5000 * (0.16*2.3 + 0.48*3.5 + 0.36*7.0) / 1M
// = 5000 * (0.368 + 1.68 + 2.52) / 1M
// = 5000 * 4.568 / 1M = 22840 / 1M = 0.02284
// Total = 0.18272 + 0.02284 = 0.20556
// 200000 input tokens selects Tier 3 for the whole request:
// (200000 * 1.40 + 5000 * 7.00) / 1M = 0.315
const total = estimateUsageCost({
usage: { input: 200_000, output: 5_000 },
cost,
});
expect(total).toBeCloseTo(0.20556, 4);
expect(total).toBeCloseTo(0.315, 4);
});
it("uses first tier rates for output when input is zero", () => {
@@ -375,22 +362,13 @@ describe("usage-format", () => {
];
const cost = { input: 0.3, output: 1.5, cacheRead: 0, cacheWrite: 0, tieredPricing: tiers };
// 200000 input, 10000 output
// Tier 1: 32000 tokens, fraction = 32000/200000 = 0.16
// Tier 2: 96000 tokens, fraction = 96000/200000 = 0.48
// Overflow (at Tier 2 rates): 72000 tokens, fraction = 72000/200000 = 0.36
//
// Input cost = (32000*0.3 + 96000*0.5 + 72000*0.5) / 1M
// = (9600 + 48000 + 36000) / 1M = 93600/1M = 0.0936
// Output cost = 10000 * (0.16*1.5 + 0.48*2.5 + 0.36*2.5) / 1M
// = 10000 * (0.24 + 1.2 + 0.9) / 1M
// = 10000 * 2.34 / 1M = 23400/1M = 0.0234
// Total = 0.0936 + 0.0234 = 0.117
// 200000 input tokens exceeds the max range, so the last tier is the
// whole-request fallback: (200000 * 0.5 + 10000 * 2.5) / 1M = 0.125
const total = estimateUsageCost({
usage: { input: 200_000, output: 10_000 },
cost,
});
expect(total).toBeCloseTo(0.117, 4);
expect(total).toBeCloseTo(0.125, 4);
});
it("bills overflow at last tier when only a single small-range tier exists (e.g. <30K)", () => {
@@ -400,14 +378,7 @@ describe("usage-format", () => {
];
const cost = { input: 1.0, output: 3.0, cacheRead: 0.5, cacheWrite: 0, tieredPricing: tiers };
// 100000 input, 5000 output, 2000 cacheRead
// Tier 1: 30000 tokens, fraction = 30000/100000 = 0.3
// Overflow (at Tier 1 rates): 70000 tokens, fraction = 70000/100000 = 0.7
// Fractions sum to 1.0 — all output/cache fully billed
//
// Input cost = (30000*1.0 + 70000*1.0) / 1M = 100000/1M = 0.1
// Output cost = 5000 * (0.3*3.0 + 0.7*3.0) / 1M = 5000*3.0/1M = 0.015
// CacheRead cost = 2000 * (0.3*0.5 + 0.7*0.5) / 1M = 2000*0.5/1M = 0.001
// 100000 input exceeds the only range, so Tier 1 is the whole-request fallback.
// Total = 0.1 + 0.015 + 0.001 = 0.116
const total = estimateUsageCost({
usage: { input: 100_000, output: 5_000, cacheRead: 2_000 },
@@ -425,19 +396,12 @@ describe("usage-format", () => {
];
const cost = { input: 0.3, output: 1.5, cacheRead: 0, cacheWrite: 0, tieredPricing: tiers };
// 200000 input, 10000 output
// Tier 1: 32000 tokens, fraction = 32000/200000 = 0.16
// Tier 2: 168000 tokens, fraction = 168000/200000 = 0.84
// No overflow — Tier 2 absorbs everything beyond 32K
//
// Input cost = (32000*0.3 + 168000*0.5) / 1M = (9600 + 84000) / 1M = 0.0936
// Output cost = 10000 * (0.16*1.5 + 0.84*2.5) / 1M = 10000 * (0.24 + 2.1) / 1M = 0.0234
// Total = 0.0936 + 0.0234 = 0.117
// 200000 input tokens selects the open-ended Tier 2 for the whole request.
const total = estimateUsageCost({
usage: { input: 200_000, output: 10_000 },
cost,
});
expect(total).toBeCloseTo(0.117, 4);
expect(total).toBeCloseTo(0.125, 4);
});
it("uses declared tier ranges instead of sequential widths", () => {
@@ -452,10 +416,10 @@ describe("usage-format", () => {
cost,
});
expect(total).toBeCloseTo(0.00125, 8);
expect(total).toBeCloseTo(0.00075, 8);
});
it("bills malformed tier gaps at a fallback tier instead of dropping them", () => {
it("bills malformed tier gaps at a whole-request fallback tier", () => {
const tiers: PricingTier[] = [
{ input: 1, output: 10, cacheRead: 0, cacheWrite: 0, range: [0, 50] },
{ input: 3, output: 30, cacheRead: 0, cacheWrite: 0, range: [100, 150] },
@@ -467,7 +431,7 @@ describe("usage-format", () => {
cost,
});
expect(total).toBeCloseTo(0.00175, 8);
expect(total).toBeCloseTo(0.00225, 8);
});
it("normalizes open-ended range from models.json ([start] and [start, -1])", async () => {

View File

@@ -313,23 +313,32 @@ export function resolveModelCostConfig(params: {
const toNumber = (value: number | undefined): number =>
typeof value === "number" && Number.isFinite(value) ? value : 0;
/**
 * Pick the single pricing tier that applies to a request, based on its
 * input token count.
 *
 * Tiers are examined in ascending order of their range start (the caller's
 * array is copied, not mutated). Resolution order:
 *
 * 1. No tiers at all → `undefined`.
 * 2. `input <= 0` → the tier with the lowest range start.
 * 3. The first tier whose half-open range `[start, end)` contains `input`.
 * 4. Otherwise (input sits in a gap between ranges or beyond every range)
 *    → the tier with the highest range start that is still `<= input`.
 * 5. Otherwise (input is below every range start) → the tier with the
 *    lowest range start.
 *
 * @param tiers - Pricing tiers, in any order.
 * @param input - Input token count used to choose the tier.
 * @returns The selected tier, or `undefined` when `tiers` is empty.
 */
function selectPricingTier(tiers: PricingTier[], input: number): PricingTier | undefined {
  const ascending = [...tiers].sort((left, right) => left.range[0] - right.range[0]);
  if (ascending.length === 0) {
    return undefined;
  }

  const lowest = ascending[0];
  if (input <= 0) {
    return lowest;
  }

  // Exact hit: the range is half-open, so `end` itself belongs to the next tier.
  const containing = ascending.find(({ range: [start, end] }) => start <= input && input < end);
  if (containing !== undefined) {
    return containing;
  }

  // No range contains the input. Walking forward and keeping the latest tier
  // whose start does not exceed the input yields the highest such tier; if
  // none qualifies, the lowest tier is kept as the result.
  return ascending.reduce<PricingTier>(
    (chosen, candidate) => (candidate.range[0] <= input ? candidate : chosen),
    lowest,
  );
}
function computeTieredCost(
tiers: PricingTier[],
input: number,
@@ -337,61 +346,17 @@ function computeTieredCost(
cacheRead: number,
cacheWrite: number,
): number {
const totalInputTokens = input;
const sortedTiers = tiers.toSorted((a, b) => a.range[0] - b.range[0]);
if (totalInputTokens <= 0) {
// If there are no input tokens the tier proportion is undefined;
// fall back to the first tier for any residual output/cache usage.
const tier = sortedTiers[0];
if (!tier) {
return 0;
}
return output * tier.output + cacheRead * tier.cacheRead + cacheWrite * tier.cacheWrite;
const tier = selectPricingTier(tiers, input);
if (!tier) {
return 0;
}
let total = 0;
let billedInput = 0;
let coveredUntil = 0;
let lastTier: PricingTier | undefined;
for (const tier of sortedTiers) {
const [start, end] = tier.range;
const tierStart = Math.max(0, start, coveredUntil);
const tierEnd = Math.min(totalInputTokens, end);
const inputInTier = Math.max(0, tierEnd - tierStart);
if (end > coveredUntil) {
coveredUntil = end;
}
if (inputInTier <= 0) {
continue;
}
const fraction = inputInTier / totalInputTokens;
total +=
inputInTier * tier.input +
output * fraction * tier.output +
cacheRead * fraction * tier.cacheRead +
cacheWrite * fraction * tier.cacheWrite;
billedInput += inputInTier;
lastTier = tier;
}
// Bill any uncovered gaps or overflow at the highest matched tier's rate.
// This keeps malformed remote/user tier ranges from underestimating cost.
const unbilledInput = totalInputTokens - billedInput;
if (unbilledInput > 0) {
const fallbackTier = lastTier ?? sortedTiers[sortedTiers.length - 1];
if (!fallbackTier) {
return total;
}
const fraction = unbilledInput / totalInputTokens;
total +=
unbilledInput * fallbackTier.input +
output * fraction * fallbackTier.output +
cacheRead * fraction * fallbackTier.cacheRead +
cacheWrite * fraction * fallbackTier.cacheWrite;
}
return total;
return (
input * tier.input +
output * tier.output +
cacheRead * tier.cacheRead +
cacheWrite * tier.cacheWrite
);
}
export function estimateUsageCost(params: {