refactor: clarify auth failover policy

This commit is contained in:
Peter Steinberger
2026-04-04 02:47:01 +09:00
parent 1d1a8264ec
commit 6739c28718
5 changed files with 113 additions and 70 deletions

View File

@@ -709,7 +709,7 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
lastFailureAt: now - 60_000,
}),
// errorCount resets, billing count resets to 1 →
// calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h
// calculateDisabledLaneBackoffMs(1, 5h, 24h) = 5h
expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000,
readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil,
},
@@ -724,7 +724,7 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
lastFailureAt: now - 60_000,
}),
// errorCount resets, auth_permanent count resets to 1 →
// calculateAuthProfileBillingDisableMsWithConfig(1, 10m, 60m) = 10m
// calculateDisabledLaneBackoffMs(1, 10m, 60m) = 10m
expectedUntil: (now: number) => now + 10 * 60 * 1000,
readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil,
},

View File

@@ -518,6 +518,27 @@ type ResolvedAuthCooldownConfig = {
failureWindowMs: number;
};
/** Failure reasons that have an entry in the disabled-lane backoff policy table below. */
type DisabledFailureReason = Extract<AuthProfileFailureReason, "billing" | "auth_permanent">;

/**
 * Selectors that read a reason's starting backoff (`baseMs`) and its cap
 * (`maxMs`) out of the resolved cooldown configuration.
 */
type DisabledFailureBackoffPolicy = Record<
  "baseMs" | "maxMs",
  (cfg: ResolvedAuthCooldownConfig) => number
>;

/**
 * Per-reason backoff policy table. `satisfies` guarantees that every
 * `DisabledFailureReason` has an entry while the literal keys stay narrow.
 */
const DISABLED_FAILURE_BACKOFF_POLICIES = {
  billing: {
    baseMs: ({ billingBackoffMs }) => billingBackoffMs,
    maxMs: ({ billingMaxMs }) => billingMaxMs,
  },
  auth_permanent: {
    // High-confidence permanent-auth failures stay in the disabled lane, yet
    // they recover far sooner than billing failures: some providers emit
    // auth-looking payloads transiently while an incident is ongoing.
    baseMs: ({ authPermanentBackoffMs }) => authPermanentBackoffMs,
    maxMs: ({ authPermanentMaxMs }) => authPermanentMaxMs,
  },
} as const satisfies Record<DisabledFailureReason, DisabledFailureBackoffPolicy>;
function resolveAuthCooldownConfig(params: {
cfg?: OpenClawConfig;
providerId: string;
@@ -530,9 +551,7 @@ function resolveAuthCooldownConfig(params: {
failureWindowHours: 24,
} as const;
const resolveHours = (value: unknown, fallback: number) =>
typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback;
const resolveMinutes = (value: unknown, fallback: number) =>
const resolvePositiveNumber = (value: unknown, fallback: number) =>
typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback;
const cooldowns = params.cfg?.auth?.cooldowns;
@@ -549,20 +568,23 @@ function resolveAuthCooldownConfig(params: {
return undefined;
})();
const billingBackoffHours = resolveHours(
const billingBackoffHours = resolvePositiveNumber(
billingOverride ?? cooldowns?.billingBackoffHours,
defaults.billingBackoffHours,
);
const billingMaxHours = resolveHours(cooldowns?.billingMaxHours, defaults.billingMaxHours);
const authPermanentBackoffMinutes = resolveMinutes(
const billingMaxHours = resolvePositiveNumber(
cooldowns?.billingMaxHours,
defaults.billingMaxHours,
);
const authPermanentBackoffMinutes = resolvePositiveNumber(
cooldowns?.authPermanentBackoffMinutes,
defaults.authPermanentBackoffMinutes,
);
const authPermanentMaxMinutes = resolveMinutes(
const authPermanentMaxMinutes = resolvePositiveNumber(
cooldowns?.authPermanentMaxMinutes,
defaults.authPermanentMaxMinutes,
);
const failureWindowHours = resolveHours(
const failureWindowHours = resolvePositiveNumber(
cooldowns?.failureWindowHours,
defaults.failureWindowHours,
);
@@ -576,7 +598,7 @@ function resolveAuthCooldownConfig(params: {
};
}
function calculateAuthProfileBillingDisableMsWithConfig(params: {
function calculateDisabledLaneBackoffMs(params: {
errorCount: number;
baseMs: number;
maxMs: number;
@@ -589,6 +611,19 @@ function calculateAuthProfileBillingDisableMsWithConfig(params: {
return Math.min(maxMs, raw);
}
/**
 * Resolves the backoff duration for a disabled-lane failure.
 *
 * Looks up the policy registered for `reason`, reads the base and maximum
 * durations from the resolved cooldown config, and delegates the actual
 * computation to `calculateDisabledLaneBackoffMs`.
 *
 * @param params.reason - Which disabled-lane failure occurred.
 * @param params.errorCount - Failure count forwarded to the backoff calculation.
 * @param params.cfgResolved - Resolved cooldown configuration the policy reads from.
 * @returns The value produced by `calculateDisabledLaneBackoffMs`.
 */
function resolveDisabledFailureBackoffMs(params: {
  reason: DisabledFailureReason;
  errorCount: number;
  cfgResolved: ResolvedAuthCooldownConfig;
}): number {
  const { reason, errorCount, cfgResolved } = params;
  const { baseMs: selectBaseMs, maxMs: selectMaxMs } = DISABLED_FAILURE_BACKOFF_POLICIES[reason];
  return calculateDisabledLaneBackoffMs({
    errorCount,
    baseMs: selectBaseMs(cfgResolved),
    maxMs: selectMaxMs(cfgResolved),
  });
}
export function resolveProfileUnusableUntilForDisplay(
store: AuthProfileStore,
profileId: string,
@@ -675,12 +710,15 @@ function computeNextProfileUsageStats(params: {
lastFailureAt: params.now,
};
if (params.reason === "billing") {
const billingCount = failureCounts[params.reason] ?? 1;
const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({
errorCount: billingCount,
baseMs: params.cfgResolved.billingBackoffMs,
maxMs: params.cfgResolved.billingMaxMs,
const disabledFailureReason =
params.reason === "billing" || params.reason === "auth_permanent" ? params.reason : null;
if (disabledFailureReason) {
const disableCount = failureCounts[disabledFailureReason] ?? 1;
const backoffMs = resolveDisabledFailureBackoffMs({
reason: disabledFailureReason,
errorCount: disableCount,
cfgResolved: params.cfgResolved,
});
// Keep active disable windows immutable so retries within the window cannot
// extend recovery time indefinitely.
@@ -689,23 +727,7 @@ function computeNextProfileUsageStats(params: {
now: params.now,
recomputedUntil: params.now + backoffMs,
});
updatedStats.disabledReason = params.reason;
} else if (params.reason === "auth_permanent") {
// Keep permanent-auth failures in the disabled lane, but use a much
// shorter backoff than billing. Some upstream incidents surface auth-ish
// payloads transiently, so the provider should recover automatically.
const authPermanentCount = failureCounts[params.reason] ?? 1;
const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({
errorCount: authPermanentCount,
baseMs: params.cfgResolved.authPermanentBackoffMs,
maxMs: params.cfgResolved.authPermanentMaxMs,
});
updatedStats.disabledUntil = keepActiveWindowOrRecompute({
existingUntil: params.existing.disabledUntil,
now: params.now,
recomputedUntil: params.now + backoffMs,
});
updatedStats.disabledReason = params.reason;
updatedStats.disabledReason = disabledFailureReason;
} else {
const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount);
// Keep active cooldown windows immutable so retries within the window

View File

@@ -109,7 +109,7 @@ describe("failover-error", () => {
status: 410,
message: "invalid_api_key",
}),
).toBe("auth_permanent");
).toBe("auth");
expect(
resolveFailoverReasonFromError({
status: 410,

View File

@@ -68,14 +68,12 @@ describe("isAuthPermanentErrorMessage", () => {
{
name: "matches permanent auth failure patterns",
samples: [
"invalid_api_key",
"api key revoked",
"api key deactivated",
"key has been disabled",
"key has been revoked",
"account has been deactivated",
"could not authenticate api key",
"could not validate credentials",
"OAuth authentication is currently not allowed for this organization",
"API_KEY_REVOKED",
"api_key_deleted",
],
@@ -84,6 +82,8 @@ describe("isAuthPermanentErrorMessage", () => {
{
name: "does not match transient auth errors",
samples: [
"invalid_api_key",
"permission_error",
"unauthorized",
"invalid token",
"authentication failed",
@@ -102,8 +102,12 @@ describe("isAuthErrorMessage", () => {
it.each([
'No credentials found for profile "anthropic:default".',
"No API key found for profile openai.",
"invalid_api_key",
"permission_error",
"OAuth token refresh failed for anthropic: Failed to refresh OAuth token for anthropic. Please try again or re-authenticate.",
"Please re-authenticate to continue.",
"could not authenticate api key",
"could not validate credentials",
"Failed to extract accountId from token",
])("matches auth errors for %j", (sample) => {
expect(isAuthErrorMessage(sample)).toBe(true);

View File

@@ -3,6 +3,41 @@ type ErrorPattern = RegExp | string;
// Case-insensitive matcher for periodic quota phrases: a period word
// (daily/weekly/monthly, optionally slash-joined such as "daily/weekly"),
// followed by "limit" or "limits" with an optional "usage " prefix and an
// optional trailing "exhausted", "reached" or "exceeded".
// Examples: "daily limit", "weekly/monthly usage limits exceeded".
const PERIODIC_USAGE_LIMIT_RE =
/\b(?:daily|weekly|monthly)(?:\/(?:daily|weekly|monthly))* (?:usage )?limit(?:s)?(?: (?:exhausted|reached|exceeded))?\b/i;
// Error-message patterns classified as permanent auth failures. This list is
// what isAuthPermanentErrorMessage checks and what ERROR_PATTERNS.authPermanent
// points at. Entries describe revoked/deactivated/deleted/disabled keys,
// deactivated accounts, and organization-level denials.
const HIGH_CONFIDENCE_AUTH_PERMANENT_PATTERNS = [
// e.g. "api key revoked", "API_KEY_REVOKED", "api_key_deleted"
/api[_ ]?key[_ ]?(?:revoked|deactivated|deleted)/i,
"key has been disabled",
"key has been revoked",
"account has been deactivated",
"not allowed for this organization",
] as const satisfies readonly ErrorPattern[];
// Auth-looking patterns that are deliberately NOT part of the permanent list:
// they are only included in ERROR_PATTERNS.auth and in isAuthErrorMessage.
const AMBIGUOUS_AUTH_ERROR_PATTERNS = [
// NOTE(review): unlike the next regex this one has no `i` flag; presumably
// the matcher lowercases its input before testing. Confirm in matchesErrorPatterns.
/invalid[_ ]?api[_ ]?key/,
// e.g. "could not authenticate api key", "could not validate credentials"
/could not (?:authenticate|validate).*(?:api[_ ]?key|credentials)/i,
"permission_error",
] as const satisfies readonly ErrorPattern[];
// General auth error patterns. Combined with AMBIGUOUS_AUTH_ERROR_PATTERNS to
// form ERROR_PATTERNS.auth and the set checked by isAuthErrorMessage.
// NOTE(review): several entries overlap (e.g. "expired" / "token has expired",
// "insufficient permission" / "insufficient permissions"); if string patterns
// are substring-matched the longer ones are redundant. Verify against
// matchesErrorPatterns before pruning.
const COMMON_AUTH_ERROR_PATTERNS = [
"incorrect api key",
"invalid token",
"authentication",
"re-authenticate",
"oauth token refresh failed",
"unauthorized",
"forbidden",
"access denied",
"insufficient permissions",
"insufficient permission",
/missing scopes?:/i,
"expired",
"token has expired",
// Bare HTTP status codes appearing as standalone words in the message.
/\b401\b/,
/\b403\b/,
"no credentials found",
"no api key found",
// e.g. "failed to extract accountid from token"
/\bfailed to (?:extract|parse|validate|decode)\b.*\btoken\b/,
] as const satisfies readonly ErrorPattern[];
const ERROR_PATTERNS = {
rateLimit: [
/rate[_ ]limit|too many requests|429/,
@@ -80,36 +115,8 @@ const ERROR_PATTERNS = {
"insufficient usd or diem balance",
/requires?\s+more\s+credits/i,
],
authPermanent: [
/api[_ ]?key[_ ]?(?:revoked|deactivated|deleted)/i,
"key has been disabled",
"key has been revoked",
"account has been deactivated",
"not allowed for this organization",
],
auth: [
/invalid[_ ]?api[_ ]?key/,
"incorrect api key",
"invalid token",
"authentication",
"re-authenticate",
"oauth token refresh failed",
/could not (?:authenticate|validate).*(?:api[_ ]?key|credentials)/i,
"permission_error",
"unauthorized",
"forbidden",
"access denied",
"insufficient permissions",
"insufficient permission",
/missing scopes?:/i,
"expired",
"token has expired",
/\b401\b/,
/\b403\b/,
"no credentials found",
"no api key found",
/\bfailed to (?:extract|parse|validate|decode)\b.*\btoken\b/,
],
authPermanent: HIGH_CONFIDENCE_AUTH_PERMANENT_PATTERNS,
auth: [...AMBIGUOUS_AUTH_ERROR_PATTERNS, ...COMMON_AUTH_ERROR_PATTERNS],
format: [
"string should match pattern",
"tool_use.id",
@@ -136,6 +143,13 @@ function matchesErrorPatterns(raw: string, patterns: readonly ErrorPattern[]): b
);
}
/**
 * Tests `raw` against several pattern lists in order.
 *
 * @param raw - The error message text to classify.
 * @param groups - Pattern lists to try; checking stops at the first list that matches.
 * @returns `true` if `matchesErrorPatterns` accepts `raw` for at least one
 *   list, otherwise `false` (including when `groups` is empty).
 */
function matchesErrorPatternGroups(
  raw: string,
  groups: readonly (readonly ErrorPattern[])[],
): boolean {
  for (const group of groups) {
    if (matchesErrorPatterns(raw, group)) {
      return true;
    }
  }
  return false;
}
/**
 * Reports whether `raw` matches any entry of `ERROR_PATTERNS.format`.
 *
 * @param raw - The error message text to classify.
 * @returns The result of `matchesErrorPatterns` for the format pattern list.
 */
export function matchesFormatErrorPattern(raw: string): boolean {
  const formatPatterns = ERROR_PATTERNS.format;
  return matchesErrorPatterns(raw, formatPatterns);
}
@@ -176,11 +190,14 @@ export function isBillingErrorMessage(raw: string): boolean {
}
/**
 * Reports whether `raw` matches a high-confidence permanent auth failure
 * pattern (`HIGH_CONFIDENCE_AUTH_PERMANENT_PATTERNS`).
 *
 * The body previously held two consecutive `return` statements, leaving the
 * second one unreachable; only the group-based lookup is kept. Both returns
 * consulted the same pattern list, so the result is unchanged.
 *
 * @param raw - The error message text to classify.
 * @returns `true` when any permanent-auth pattern matches.
 */
export function isAuthPermanentErrorMessage(raw: string): boolean {
  return matchesErrorPatternGroups(raw, [HIGH_CONFIDENCE_AUTH_PERMANENT_PATTERNS]);
}
/**
 * Reports whether `raw` matches any auth error pattern, i.e. an entry of
 * `AMBIGUOUS_AUTH_ERROR_PATTERNS` or `COMMON_AUTH_ERROR_PATTERNS`.
 *
 * The body previously held two consecutive `return` statements, leaving the
 * second one unreachable; only the group-based lookup is kept. The two groups
 * are exactly the lists that make up `ERROR_PATTERNS.auth`, so the result is
 * unchanged.
 *
 * @param raw - The error message text to classify.
 * @returns `true` when any auth pattern matches.
 */
export function isAuthErrorMessage(raw: string): boolean {
  return matchesErrorPatternGroups(raw, [
    AMBIGUOUS_AUTH_ERROR_PATTERNS,
    COMMON_AUTH_ERROR_PATTERNS,
  ]);
}
export function isOverloadedErrorMessage(raw: string): boolean {