fix: correct tiered model pricing costs

2026-05-06 10:30:44 +00:00 · 2026-04-21 03:48:17 +01:00
parent 04d41aeae1
commit 0c26623a96
4 changed files with 66 additions and 125 deletions
--- a/src/gateway/model-pricing-cache.test.ts
+++ b/src/gateway/model-pricing-cache.test.ts
@@ -279,21 +279,25 @@ describe("model-pricing-cache", () => {
          "volcengine/doubao-seed-2-0-pro": {
            input_cost_per_token: 4.6e-7,
            output_cost_per_token: 2.3e-6,
+            cache_creation_input_token_cost: 9.2e-7,
            litellm_provider: "volcengine",
            tiered_pricing: [
              {
                input_cost_per_token: 4.6e-7,
                output_cost_per_token: 2.3e-6,
+                cache_creation_input_token_cost: 9.2e-8,
                range: [0, 32000],
              },
              {
                input_cost_per_token: 7e-7,
                output_cost_per_token: 3.5e-6,
+                cache_creation_input_token_cost: 1.4e-7,
                range: [32000, 128000],
              },
              {
                input_cost_per_token: 1.4e-6,
                output_cost_per_token: 7e-6,
+                cache_creation_input_token_cost: 2.8e-7,
                range: [128000, 256000],
              },
            ],
@@ -316,14 +320,16 @@ describe("model-pricing-cache", () => {
    expect(pricing).toBeDefined();
    expect(pricing!.input).toBeCloseTo(0.46);
    expect(pricing!.output).toBeCloseTo(2.3);
+    expect(pricing!.cacheWrite).toBeCloseTo(0.92);
    expect(pricing!.tieredPricing).toHaveLength(3);
    expect(pricing!.tieredPricing![0]).toEqual({
      input: expect.closeTo(0.46),
      output: expect.closeTo(2.3),
      cacheRead: 0,
-      cacheWrite: 0,
+      cacheWrite: expect.closeTo(0.092),
      range: [0, 32000],
    });
+    expect(pricing!.tieredPricing![2].cacheWrite).toBeCloseTo(0.28);
    expect(pricing!.tieredPricing![2].range).toEqual([128000, 256000]);
  });

@@ -359,6 +365,7 @@ describe("model-pricing-cache", () => {
              {
                input_cost_per_token: 7e-7,
                output_cost_per_token: 3.5e-6,
+                cache_creation_input_token_cost: 1.4e-7,
                range: [32000],
              },
            ],
@@ -382,6 +389,7 @@ describe("model-pricing-cache", () => {
    expect(pricing!.tieredPricing).toHaveLength(2);
    expect(pricing!.tieredPricing![0].range).toEqual([0, 32000]);
    expect(pricing!.tieredPricing![1].range).toEqual([32000, Infinity]);
+    expect(pricing!.tieredPricing![1].cacheWrite).toBeCloseTo(0.14);
  });

  it("merges OpenRouter flat pricing with LiteLLM tiered pricing", async () => {
@@ -424,11 +432,13 @@ describe("model-pricing-cache", () => {
              {
                input_cost_per_token: 4e-7,
                output_cost_per_token: 2.4e-6,
+                cache_creation_input_token_cost: 8e-8,
                range: [0, 256000],
              },
              {
                input_cost_per_token: 5e-7,
                output_cost_per_token: 3e-6,
+                cache_creation_input_token_cost: 1e-7,
                range: [256000, 1000000],
              },
            ],
@@ -455,6 +465,7 @@ describe("model-pricing-cache", () => {
    // LiteLLM tiered pricing is merged in
    expect(pricing!.tieredPricing).toHaveLength(2);
    expect(pricing!.tieredPricing![1].range).toEqual([256000, 1000000]);
+    expect(pricing!.tieredPricing![1].cacheWrite).toBeCloseTo(0.1);
  });

  it("falls back gracefully when LiteLLM fetch fails", async () => {
--- a/src/gateway/model-pricing-cache.ts
+++ b/src/gateway/model-pricing-cache.ts
@@ -153,6 +153,7 @@ type LiteLLMTierRaw = {
  input_cost_per_token?: unknown;
  output_cost_per_token?: unknown;
  cache_read_input_token_cost?: unknown;
+  cache_creation_input_token_cost?: unknown;
  range?: unknown;
 };

@@ -194,7 +195,7 @@ function parseLiteLLMTieredPricing(tiers: unknown): CachedPricingTier[] | undefi
      input: toPricePerMillion(inputPerToken),
      output: toPricePerMillion(outputPerToken),
      cacheRead: toPricePerMillion(parseNumberString(tier.cache_read_input_token_cost)),
-      cacheWrite: 0,
+      cacheWrite: toPricePerMillion(parseNumberString(tier.cache_creation_input_token_cost)),
      range: [start, end],
    });
  }
@@ -211,7 +212,7 @@ function parseLiteLLMPricing(entry: LiteLLMModelEntry): CachedModelPricing | nul
    input: toPricePerMillion(inputPerToken),
    output: toPricePerMillion(outputPerToken),
    cacheRead: toPricePerMillion(parseNumberString(entry.cache_read_input_token_cost)),
-    cacheWrite: 0,
+    cacheWrite: toPricePerMillion(parseNumberString(entry.cache_creation_input_token_cost)),
  };
  const tieredPricing = parseLiteLLMTieredPricing(entry.tiered_pricing);
  if (tieredPricing) {
--- a/src/utils/usage-format.test.ts
+++ b/src/utils/usage-format.test.ts
@@ -282,7 +282,7 @@ describe("usage-format", () => {
    expect(total).toBeCloseTo(0.003);
  });

-  it("estimates cost with two tiers — input split across tiers", () => {
+  it("uses the matching context tier instead of blending lower tiers", () => {
    // Tier 1: [0, 32000) → input $0.30/M, output $1.50/M
    // Tier 2: [32000, 128000) → input $0.50/M, output $2.50/M
    const tiers: PricingTier[] = [
@@ -291,17 +291,13 @@ describe("usage-format", () => {
    ];
    const cost = { input: 0.3, output: 1.5, cacheRead: 0, cacheWrite: 0, tieredPricing: tiers };

-    // 40000 input tokens, 10000 output tokens
-    // Tier 1 gets 32000/40000 = 80% of input → 32000 input tokens
-    // Tier 2 gets 8000/40000 = 20% of input → 8000 input tokens
-    // Input cost = (32000 * 0.3 + 8000 * 0.5) / 1M = (9600 + 4000) / 1M = 0.0136
-    // Output cost = (10000 * 0.8 * 1.5 + 10000 * 0.2 * 2.5) / 1M = (12000 + 5000) / 1M = 0.017
-    // Total = 0.0136 + 0.017 = 0.0306
+    // 40000 input tokens selects Tier 2 for the whole request:
+    // (40000 * 0.5 + 10000 * 2.5) / 1M = 0.045
    const total = estimateUsageCost({
      usage: { input: 40_000, output: 10_000 },
      cost,
    });
-    expect(total).toBeCloseTo(0.0306, 4);
+    expect(total).toBeCloseTo(0.045, 4);
  });

  it("estimates cost with three tiers — volcengine-style pricing", () => {
@@ -316,22 +312,13 @@ describe("usage-format", () => {
    ];
    const cost = { input: 0.46, output: 2.3, cacheRead: 0, cacheWrite: 0, tieredPricing: tiers };

-    // 200000 input tokens, 5000 output tokens
-    // Tier 1: 32000 tokens, fraction = 32000/200000 = 0.16
-    // Tier 2: 96000 tokens, fraction = 96000/200000 = 0.48
-    // Tier 3: 72000 tokens, fraction = 72000/200000 = 0.36
-    //
-    // Input cost = (32000*0.46 + 96000*0.70 + 72000*1.40) / 1M
-    //            = (14720 + 67200 + 100800) / 1M = 182720 / 1M = 0.18272
-    // Output cost = 5000 * (0.16*2.3 + 0.48*3.5 + 0.36*7.0) / 1M
-    //             = 5000 * (0.368 + 1.68 + 2.52) / 1M
-    //             = 5000 * 4.568 / 1M = 22840 / 1M = 0.02284
-    // Total = 0.18272 + 0.02284 = 0.20556
+    // 200000 input tokens selects Tier 3 for the whole request:
+    // (200000 * 1.40 + 5000 * 7.00) / 1M = 0.315
    const total = estimateUsageCost({
      usage: { input: 200_000, output: 5_000 },
      cost,
    });
-    expect(total).toBeCloseTo(0.20556, 4);
+    expect(total).toBeCloseTo(0.315, 4);
  });

  it("uses first tier rates for output when input is zero", () => {
@@ -375,22 +362,13 @@ describe("usage-format", () => {
    ];
    const cost = { input: 0.3, output: 1.5, cacheRead: 0, cacheWrite: 0, tieredPricing: tiers };

-    // 200000 input, 10000 output
-    // Tier 1: 32000 tokens, fraction = 32000/200000 = 0.16
-    // Tier 2: 96000 tokens, fraction = 96000/200000 = 0.48
-    // Overflow (at Tier 2 rates): 72000 tokens, fraction = 72000/200000 = 0.36
-    //
-    // Input cost = (32000*0.3 + 96000*0.5 + 72000*0.5) / 1M
-    //            = (9600 + 48000 + 36000) / 1M = 93600/1M = 0.0936
-    // Output cost = 10000 * (0.16*1.5 + 0.48*2.5 + 0.36*2.5) / 1M
-    //             = 10000 * (0.24 + 1.2 + 0.9) / 1M
-    //             = 10000 * 2.34 / 1M = 23400/1M = 0.0234
-    // Total = 0.0936 + 0.0234 = 0.117
+    // 200000 input tokens exceeds the max range, so the last tier is the
+    // whole-request fallback: (200000 * 0.5 + 10000 * 2.5) / 1M = 0.125
    const total = estimateUsageCost({
      usage: { input: 200_000, output: 10_000 },
      cost,
    });
-    expect(total).toBeCloseTo(0.117, 4);
+    expect(total).toBeCloseTo(0.125, 4);
  });

  it("bills overflow at last tier when only a single small-range tier exists (e.g. <30K)", () => {
@@ -400,14 +378,7 @@ describe("usage-format", () => {
    ];
    const cost = { input: 1.0, output: 3.0, cacheRead: 0.5, cacheWrite: 0, tieredPricing: tiers };

-    // 100000 input, 5000 output, 2000 cacheRead
-    // Tier 1: 30000 tokens, fraction = 30000/100000 = 0.3
-    // Overflow (at Tier 1 rates): 70000 tokens, fraction = 70000/100000 = 0.7
-    // Fractions sum to 1.0 — all output/cache fully billed
-    //
-    // Input cost = (30000*1.0 + 70000*1.0) / 1M = 100000/1M = 0.1
-    // Output cost = 5000 * (0.3*3.0 + 0.7*3.0) / 1M = 5000*3.0/1M = 0.015
-    // CacheRead cost = 2000 * (0.3*0.5 + 0.7*0.5) / 1M = 2000*0.5/1M = 0.001
+    // 100000 input exceeds the only range; fallback Tier 1 prices input + output + cacheRead:
    // Total = 0.1 + 0.015 + 0.001 = 0.116
    const total = estimateUsageCost({
      usage: { input: 100_000, output: 5_000, cacheRead: 2_000 },
@@ -425,19 +396,12 @@ describe("usage-format", () => {
    ];
    const cost = { input: 0.3, output: 1.5, cacheRead: 0, cacheWrite: 0, tieredPricing: tiers };

-    // 200000 input, 10000 output
-    // Tier 1: 32000 tokens, fraction = 32000/200000 = 0.16
-    // Tier 2: 168000 tokens, fraction = 168000/200000 = 0.84
-    // No overflow — Tier 2 absorbs everything beyond 32K
-    //
-    // Input cost = (32000*0.3 + 168000*0.5) / 1M = (9600 + 84000) / 1M = 0.0936
-    // Output cost = 10000 * (0.16*1.5 + 0.84*2.5) / 1M = 10000 * (0.24 + 2.1) / 1M = 0.0234
-    // Total = 0.0936 + 0.0234 = 0.117
+    // Open-ended Tier 2 prices the whole request: (200000 * 0.5 + 10000 * 2.5) / 1M = 0.125
    const total = estimateUsageCost({
      usage: { input: 200_000, output: 10_000 },
      cost,
    });
-    expect(total).toBeCloseTo(0.117, 4);
+    expect(total).toBeCloseTo(0.125, 4);
  });

  it("uses declared tier ranges instead of sequential widths", () => {
@@ -452,10 +416,10 @@ describe("usage-format", () => {
      cost,
    });

-    expect(total).toBeCloseTo(0.00125, 8);
+    expect(total).toBeCloseTo(0.00075, 8);
  });

-  it("bills malformed tier gaps at a fallback tier instead of dropping them", () => {
+  it("bills malformed tier gaps at a whole-request fallback tier", () => {
    const tiers: PricingTier[] = [
      { input: 1, output: 10, cacheRead: 0, cacheWrite: 0, range: [0, 50] },
      { input: 3, output: 30, cacheRead: 0, cacheWrite: 0, range: [100, 150] },
@@ -467,7 +431,7 @@ describe("usage-format", () => {
      cost,
    });

-    expect(total).toBeCloseTo(0.00175, 8);
+    expect(total).toBeCloseTo(0.00225, 8);
  });

  it("normalizes open-ended range from models.json ([start] and [start, -1])", async () => {
--- a/src/utils/usage-format.ts
+++ b/src/utils/usage-format.ts
@@ -313,23 +313,32 @@ export function resolveModelCostConfig(params: {
 const toNumber = (value: number | undefined): number =>
  typeof value === "number" && Number.isFinite(value) ? value : 0;

-/**
- * Compute the cost for a single token dimension (input, output, cacheRead,
- * or cacheWrite) across a set of sorted tiered-pricing tiers.
- *
- * The tiers define ranges on the **input** token axis.  For each tier,
- * the proportion of the total input that falls into that range determines
- * the fraction of *all* token types billed at that tier's rates.
- *
- * For example, if the input is 40 000 tokens and the tiers are:
- *   [0, 32000)  → $0.30/M input, $1.50/M output
- *   [32000, 128000) → $0.50/M input, $2.50/M output
- *
- * Then 80 % of every dimension is billed at the first tier and 20 % at the
- * second tier.
- *
- * Prices are per-million; the caller divides by 1 000 000 after summing.
- */
+function selectPricingTier(tiers: PricingTier[], input: number): PricingTier | undefined {
+  const sortedTiers = tiers.toSorted((a, b) => a.range[0] - b.range[0]);
+  if (sortedTiers.length === 0) {
+    return undefined;
+  }
+  if (input <= 0) {
+    return sortedTiers[0];
+  }
+
+  for (const tier of sortedTiers) {
+    const [start, end] = tier.range;
+    if (input >= start && input < end) {
+      return tier;
+    }
+  }
+
+  for (let index = sortedTiers.length - 1; index >= 0; index -= 1) {
+    const tier = sortedTiers[index];
+    if (input >= tier.range[0]) {
+      return tier;
+    }
+  }
+
+  return sortedTiers[0];
+}
+
 function computeTieredCost(
  tiers: PricingTier[],
  input: number,
@@ -337,61 +346,17 @@ function computeTieredCost(
  cacheRead: number,
  cacheWrite: number,
 ): number {
-  const totalInputTokens = input;
-  const sortedTiers = tiers.toSorted((a, b) => a.range[0] - b.range[0]);
-  if (totalInputTokens <= 0) {
-    // If there are no input tokens the tier proportion is undefined;
-    // fall back to the first tier for any residual output/cache usage.
-    const tier = sortedTiers[0];
-    if (!tier) {
-      return 0;
-    }
-    return output * tier.output + cacheRead * tier.cacheRead + cacheWrite * tier.cacheWrite;
+  const tier = selectPricingTier(tiers, input);
+  if (!tier) {
+    return 0;
  }

-  let total = 0;
-  let billedInput = 0;
-  let coveredUntil = 0;
-  let lastTier: PricingTier | undefined;
-
-  for (const tier of sortedTiers) {
-    const [start, end] = tier.range;
-    const tierStart = Math.max(0, start, coveredUntil);
-    const tierEnd = Math.min(totalInputTokens, end);
-    const inputInTier = Math.max(0, tierEnd - tierStart);
-    if (end > coveredUntil) {
-      coveredUntil = end;
-    }
-    if (inputInTier <= 0) {
-      continue;
-    }
-    const fraction = inputInTier / totalInputTokens;
-    total +=
-      inputInTier * tier.input +
-      output * fraction * tier.output +
-      cacheRead * fraction * tier.cacheRead +
-      cacheWrite * fraction * tier.cacheWrite;
-    billedInput += inputInTier;
-    lastTier = tier;
-  }
-
-  // Bill any uncovered gaps or overflow at the highest matched tier's rate.
-  // This keeps malformed remote/user tier ranges from underestimating cost.
-  const unbilledInput = totalInputTokens - billedInput;
-  if (unbilledInput > 0) {
-    const fallbackTier = lastTier ?? sortedTiers[sortedTiers.length - 1];
-    if (!fallbackTier) {
-      return total;
-    }
-    const fraction = unbilledInput / totalInputTokens;
-    total +=
-      unbilledInput * fallbackTier.input +
-      output * fraction * fallbackTier.output +
-      cacheRead * fraction * fallbackTier.cacheRead +
-      cacheWrite * fraction * fallbackTier.cacheWrite;
-  }
-
-  return total;
+  return (
+    input * tier.input +
+    output * tier.output +
+    cacheRead * tier.cacheRead +
+    cacheWrite * tier.cacheWrite
+  );
 }

 export function estimateUsageCost(params: {