diff --git a/src/agents/auth-profiles/usage.test.ts b/src/agents/auth-profiles/usage.test.ts index 9d51d54df6f..5cd4912a545 100644 --- a/src/agents/auth-profiles/usage.test.ts +++ b/src/agents/auth-profiles/usage.test.ts @@ -709,7 +709,7 @@ describe("markAuthProfileFailure — active windows do not extend on retry", () lastFailureAt: now - 60_000, }), // errorCount resets, billing count resets to 1 → - // calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h + // calculateDisabledLaneBackoffMs(1, 5h, 24h) = 5h expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000, readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil, }, @@ -724,7 +724,7 @@ describe("markAuthProfileFailure — active windows do not extend on retry", () lastFailureAt: now - 60_000, }), // errorCount resets, auth_permanent count resets to 1 → - // calculateAuthProfileBillingDisableMsWithConfig(1, 10m, 60m) = 10m + // calculateDisabledLaneBackoffMs(1, 10m, 60m) = 10m expectedUntil: (now: number) => now + 10 * 60 * 1000, readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil, }, diff --git a/src/agents/auth-profiles/usage.ts b/src/agents/auth-profiles/usage.ts index 6bc64fa79ae..aff80ce9c92 100644 --- a/src/agents/auth-profiles/usage.ts +++ b/src/agents/auth-profiles/usage.ts @@ -518,6 +518,27 @@ type ResolvedAuthCooldownConfig = { failureWindowMs: number; }; +type DisabledFailureReason = Extract<AuthProfileFailureReason, "billing" | "auth_permanent">; + +type DisabledFailureBackoffPolicy = { + baseMs: (cfg: ResolvedAuthCooldownConfig) => number; + maxMs: (cfg: ResolvedAuthCooldownConfig) => number; +}; + +const DISABLED_FAILURE_BACKOFF_POLICIES = { + billing: { + baseMs: (cfg) => cfg.billingBackoffMs, + maxMs: (cfg) => cfg.billingMaxMs, + }, + auth_permanent: { + // Keep high-confidence permanent-auth failures in the disabled lane, but + // recover much sooner than billing because some providers surface + // auth-looking payloads transiently during incidents. 
+ baseMs: (cfg) => cfg.authPermanentBackoffMs, + maxMs: (cfg) => cfg.authPermanentMaxMs, + }, +} as const satisfies Record<DisabledFailureReason, DisabledFailureBackoffPolicy>; + function resolveAuthCooldownConfig(params: { cfg?: OpenClawConfig; providerId: string; @@ -530,9 +551,7 @@ function resolveAuthCooldownConfig(params: { failureWindowHours: 24, } as const; - const resolveHours = (value: unknown, fallback: number) => - typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback; - const resolveMinutes = (value: unknown, fallback: number) => + const resolvePositiveNumber = (value: unknown, fallback: number) => typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback; const cooldowns = params.cfg?.auth?.cooldowns; @@ -549,20 +568,23 @@ function resolveAuthCooldownConfig(params: { return undefined; })(); - const billingBackoffHours = resolveHours( + const billingBackoffHours = resolvePositiveNumber( billingOverride ?? cooldowns?.billingBackoffHours, defaults.billingBackoffHours, ); - const billingMaxHours = resolveHours(cooldowns?.billingMaxHours, defaults.billingMaxHours); - const authPermanentBackoffMinutes = resolveMinutes( + const billingMaxHours = resolvePositiveNumber( + cooldowns?.billingMaxHours, + defaults.billingMaxHours, + ); + const authPermanentBackoffMinutes = resolvePositiveNumber( cooldowns?.authPermanentBackoffMinutes, defaults.authPermanentBackoffMinutes, ); - const authPermanentMaxMinutes = resolveMinutes( + const authPermanentMaxMinutes = resolvePositiveNumber( cooldowns?.authPermanentMaxMinutes, defaults.authPermanentMaxMinutes, ); - const failureWindowHours = resolveHours( + const failureWindowHours = resolvePositiveNumber( cooldowns?.failureWindowHours, defaults.failureWindowHours, ); @@ -576,7 +598,7 @@ function resolveAuthCooldownConfig(params: { }; } -function calculateAuthProfileBillingDisableMsWithConfig(params: { +function calculateDisabledLaneBackoffMs(params: { errorCount: number; baseMs: number; maxMs: number; @@ -589,6 +611,19 @@ 
function calculateAuthProfileBillingDisableMsWithConfig(params: { return Math.min(maxMs, raw); } +function resolveDisabledFailureBackoffMs(params: { + reason: DisabledFailureReason; + errorCount: number; + cfgResolved: ResolvedAuthCooldownConfig; +}): number { + const policy = DISABLED_FAILURE_BACKOFF_POLICIES[params.reason]; + return calculateDisabledLaneBackoffMs({ + errorCount: params.errorCount, + baseMs: policy.baseMs(params.cfgResolved), + maxMs: policy.maxMs(params.cfgResolved), + }); +} + export function resolveProfileUnusableUntilForDisplay( store: AuthProfileStore, profileId: string, @@ -675,12 +710,15 @@ function computeNextProfileUsageStats(params: { lastFailureAt: params.now, }; - if (params.reason === "billing") { - const billingCount = failureCounts[params.reason] ?? 1; - const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({ - errorCount: billingCount, - baseMs: params.cfgResolved.billingBackoffMs, - maxMs: params.cfgResolved.billingMaxMs, + const disabledFailureReason = + params.reason === "billing" || params.reason === "auth_permanent" ? params.reason : null; + + if (disabledFailureReason) { + const disableCount = failureCounts[disabledFailureReason] ?? 1; + const backoffMs = resolveDisabledFailureBackoffMs({ + reason: disabledFailureReason, + errorCount: disableCount, + cfgResolved: params.cfgResolved, }); // Keep active disable windows immutable so retries within the window cannot // extend recovery time indefinitely. @@ -689,23 +727,7 @@ function computeNextProfileUsageStats(params: { now: params.now, recomputedUntil: params.now + backoffMs, }); - updatedStats.disabledReason = params.reason; - } else if (params.reason === "auth_permanent") { - // Keep permanent-auth failures in the disabled lane, but use a much - // shorter backoff than billing. Some upstream incidents surface auth-ish - // payloads transiently, so the provider should recover automatically. - const authPermanentCount = failureCounts[params.reason] ?? 
1; - const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({ - errorCount: authPermanentCount, - baseMs: params.cfgResolved.authPermanentBackoffMs, - maxMs: params.cfgResolved.authPermanentMaxMs, - }); - updatedStats.disabledUntil = keepActiveWindowOrRecompute({ - existingUntil: params.existing.disabledUntil, - now: params.now, - recomputedUntil: params.now + backoffMs, - }); - updatedStats.disabledReason = params.reason; + updatedStats.disabledReason = disabledFailureReason; } else { const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount); // Keep active cooldown windows immutable so retries within the window diff --git a/src/agents/failover-error.test.ts b/src/agents/failover-error.test.ts index 492e3972595..d2b43a7fcd9 100644 --- a/src/agents/failover-error.test.ts +++ b/src/agents/failover-error.test.ts @@ -109,7 +109,7 @@ describe("failover-error", () => { status: 410, message: "invalid_api_key", }), - ).toBe("auth_permanent"); + ).toBe("auth"); expect( resolveFailoverReasonFromError({ status: 410, diff --git a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts index 2d4f30766ca..62eb6e692fc 100644 --- a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts +++ b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts @@ -68,14 +68,12 @@ describe("isAuthPermanentErrorMessage", () => { { name: "matches permanent auth failure patterns", samples: [ - "invalid_api_key", "api key revoked", "api key deactivated", "key has been disabled", "key has been revoked", "account has been deactivated", - "could not authenticate api key", - "could not validate credentials", + "OAuth authentication is currently not allowed for this organization", "API_KEY_REVOKED", "api_key_deleted", ], @@ -84,6 +82,8 @@ describe("isAuthPermanentErrorMessage", () => { { name: "does not match transient auth errors", samples: [ + "invalid_api_key", + "permission_error", "unauthorized", 
"invalid token", "authentication failed", @@ -102,8 +102,12 @@ describe("isAuthErrorMessage", () => { it.each([ 'No credentials found for profile "anthropic:default".', "No API key found for profile openai.", + "invalid_api_key", + "permission_error", "OAuth token refresh failed for anthropic: Failed to refresh OAuth token for anthropic. Please try again or re-authenticate.", "Please re-authenticate to continue.", + "could not authenticate api key", + "could not validate credentials", "Failed to extract accountId from token", ])("matches auth errors for %j", (sample) => { expect(isAuthErrorMessage(sample)).toBe(true); diff --git a/src/agents/pi-embedded-helpers/failover-matches.ts b/src/agents/pi-embedded-helpers/failover-matches.ts index 02c465f9c46..4ae7dda1a5a 100644 --- a/src/agents/pi-embedded-helpers/failover-matches.ts +++ b/src/agents/pi-embedded-helpers/failover-matches.ts @@ -3,6 +3,41 @@ type ErrorPattern = RegExp | string; const PERIODIC_USAGE_LIMIT_RE = /\b(?:daily|weekly|monthly)(?:\/(?:daily|weekly|monthly))* (?:usage )?limit(?:s)?(?: (?:exhausted|reached|exceeded))?\b/i; +const HIGH_CONFIDENCE_AUTH_PERMANENT_PATTERNS = [ + /api[_ ]?key[_ ]?(?:revoked|deactivated|deleted)/i, + "key has been disabled", + "key has been revoked", + "account has been deactivated", + "not allowed for this organization", +] as const satisfies readonly ErrorPattern[]; + +const AMBIGUOUS_AUTH_ERROR_PATTERNS = [ + /invalid[_ ]?api[_ ]?key/, + /could not (?:authenticate|validate).*(?:api[_ ]?key|credentials)/i, + "permission_error", +] as const satisfies readonly ErrorPattern[]; + +const COMMON_AUTH_ERROR_PATTERNS = [ + "incorrect api key", + "invalid token", + "authentication", + "re-authenticate", + "oauth token refresh failed", + "unauthorized", + "forbidden", + "access denied", + "insufficient permissions", + "insufficient permission", + /missing scopes?:/i, + "expired", + "token has expired", + /\b401\b/, + /\b403\b/, + "no credentials found", + "no api key found", + 
/\bfailed to (?:extract|parse|validate|decode)\b.*\btoken\b/, +] as const satisfies readonly ErrorPattern[]; + const ERROR_PATTERNS = { rateLimit: [ /rate[_ ]limit|too many requests|429/, @@ -80,36 +115,8 @@ const ERROR_PATTERNS = { "insufficient usd or diem balance", /requires?\s+more\s+credits/i, ], - authPermanent: [ - /api[_ ]?key[_ ]?(?:revoked|deactivated|deleted)/i, - "key has been disabled", - "key has been revoked", - "account has been deactivated", - "not allowed for this organization", - ], - auth: [ - /invalid[_ ]?api[_ ]?key/, - "incorrect api key", - "invalid token", - "authentication", - "re-authenticate", - "oauth token refresh failed", - /could not (?:authenticate|validate).*(?:api[_ ]?key|credentials)/i, - "permission_error", - "unauthorized", - "forbidden", - "access denied", - "insufficient permissions", - "insufficient permission", - /missing scopes?:/i, - "expired", - "token has expired", - /\b401\b/, - /\b403\b/, - "no credentials found", - "no api key found", - /\bfailed to (?:extract|parse|validate|decode)\b.*\btoken\b/, - ], + authPermanent: HIGH_CONFIDENCE_AUTH_PERMANENT_PATTERNS, + auth: [...AMBIGUOUS_AUTH_ERROR_PATTERNS, ...COMMON_AUTH_ERROR_PATTERNS], format: [ "string should match pattern", "tool_use.id", @@ -136,6 +143,13 @@ function matchesErrorPatterns(raw: string, patterns: readonly ErrorPattern[]): b ); } +function matchesErrorPatternGroups( + raw: string, + groups: readonly (readonly ErrorPattern[])[], +): boolean { + return groups.some((patterns) => matchesErrorPatterns(raw, patterns)); +} + export function matchesFormatErrorPattern(raw: string): boolean { return matchesErrorPatterns(raw, ERROR_PATTERNS.format); } @@ -176,11 +190,14 @@ export function isBillingErrorMessage(raw: string): boolean { } export function isAuthPermanentErrorMessage(raw: string): boolean { - return matchesErrorPatterns(raw, ERROR_PATTERNS.authPermanent); + return matchesErrorPatternGroups(raw, [HIGH_CONFIDENCE_AUTH_PERMANENT_PATTERNS]); } export 
function isAuthErrorMessage(raw: string): boolean { - return matchesErrorPatterns(raw, ERROR_PATTERNS.auth); + return matchesErrorPatternGroups(raw, [ + AMBIGUOUS_AUTH_ERROR_PATTERNS, + COMMON_AUTH_ERROR_PATTERNS, + ]); } export function isOverloadedErrorMessage(raw: string): boolean {