fix(auth): use shorter backoff for auth_permanent failures

auth_permanent errors (e.g. API_KEY_INVALID) can be caused by transient
provider outages rather than genuinely revoked credentials. Previously,
such failures used the same 5h-24h backoff as billing failures, which left
providers disabled long after the upstream issue was resolved.

Introduce separate authPermanentBackoffMinutes (default: 10) and
authPermanentMaxMinutes (default: 60) config options so auth_permanent
failures recover in minutes rather than hours.

Fixes #56838
This commit is contained in:
Extra Small
2026-04-03 09:17:34 -07:00
committed by Peter Steinberger
parent 022a24ec48
commit 42e1d489fd
5 changed files with 47 additions and 4 deletions

View File

@@ -516,6 +516,8 @@ export function calculateAuthProfileCooldownMs(errorCount: number): number {
/**
 * Auth cooldown settings after config resolution, with every duration
 * normalized to milliseconds (the config itself is expressed in hours or
 * minutes and converted by resolveAuthCooldownConfig).
 */
type ResolvedAuthCooldownConfig = {
// Billing backoff, converted from hours. Presumably the base delay for
// "billing" failures, mirroring the auth_permanent pair below — confirm
// against the billing call site.
billingBackoffMs: number;
// Billing maximum, converted from hours. Presumably the cap on the billing
// backoff — confirm against the billing call site.
billingMaxMs: number;
// Base backoff for "auth_permanent" failures, converted from
// authPermanentBackoffMinutes (falls back to 10 minutes when unset or invalid).
authPermanentBackoffMs: number;
// Upper bound for the "auth_permanent" backoff, converted from
// authPermanentMaxMinutes (falls back to 60 minutes when unset or invalid).
authPermanentMaxMs: number;
// Failure window, converted from hours. NOTE(review): how this window is
// applied is not visible in this excerpt — verify against its consumers.
failureWindowMs: number;
};
@@ -556,9 +558,17 @@ function resolveAuthCooldownConfig(params: {
defaults.failureWindowHours,
);
const resolveMinutes = (value: unknown, fallback: number) =>
typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback;
const authPermanentBackoffMinutes = resolveMinutes(cooldowns?.authPermanentBackoffMinutes, 10);
const authPermanentMaxMinutes = resolveMinutes(cooldowns?.authPermanentMaxMinutes, 60);
return {
billingBackoffMs: billingBackoffHours * 60 * 60 * 1000,
billingMaxMs: billingMaxHours * 60 * 60 * 1000,
authPermanentBackoffMs: authPermanentBackoffMinutes * 60 * 1000,
authPermanentMaxMs: authPermanentMaxMinutes * 60 * 1000,
failureWindowMs: failureWindowHours * 60 * 60 * 1000,
};
}
@@ -662,7 +672,7 @@ function computeNextProfileUsageStats(params: {
lastFailureAt: params.now,
};
if (params.reason === "billing" || params.reason === "auth_permanent") {
if (params.reason === "billing") {
const billingCount = failureCounts[params.reason] ?? 1;
const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({
errorCount: billingCount,
@@ -677,6 +687,23 @@ function computeNextProfileUsageStats(params: {
recomputedUntil: params.now + backoffMs,
});
updatedStats.disabledReason = params.reason;
} else if (params.reason === "auth_permanent") {
// auth_permanent errors can be caused by transient provider outages (e.g.
// GCP returning API_KEY_INVALID during an incident). Use a much shorter
// backoff than billing so the provider recovers automatically once the
// upstream issue resolves.
const authPermCount = failureCounts[params.reason] ?? 1;
const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({
errorCount: authPermCount,
baseMs: params.cfgResolved.authPermanentBackoffMs,
maxMs: params.cfgResolved.authPermanentMaxMs,
});
updatedStats.disabledUntil = keepActiveWindowOrRecompute({
existingUntil: params.existing.disabledUntil,
now: params.now,
recomputedUntil: params.now + backoffMs,
});
updatedStats.disabledReason = params.reason;
} else {
const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount);
// Keep active cooldown windows immutable so retries within the window