From 865fa2ba726d0401193befe183d475e4cd08a18b Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 4 Apr 2026 02:22:04 +0900 Subject: [PATCH] fix: narrow auth permanent lockouts --- docs/.generated/config-baseline.core.json | 32 +++++++++++++++++ docs/.generated/config-baseline.json | 32 +++++++++++++++++ docs/gateway/configuration-reference.md | 4 +++ ...th-profiles.markauthprofilefailure.test.ts | 30 +++++++++++++++- src/agents/auth-profiles/usage.ts | 34 ++++++++++--------- src/agents/failover-error.test.ts | 22 +++++------- ...dded-helpers.isbillingerrormessage.test.ts | 21 ++++++++---- .../pi-embedded-helpers/failover-matches.ts | 7 ++-- src/config/config-misc.test.ts | 14 ++++++++ src/config/schema.base.generated.ts | 18 ++++++++++ src/config/schema.help.ts | 2 +- src/config/types.auth.ts | 7 ++-- src/config/zod-schema.ts | 2 ++ 13 files changed, 180 insertions(+), 45 deletions(-) diff --git a/docs/.generated/config-baseline.core.json b/docs/.generated/config-baseline.core.json index 15a91a28d17..4798e55c83d 100644 --- a/docs/.generated/config-baseline.core.json +++ b/docs/.generated/config-baseline.core.json @@ -7816,6 +7816,38 @@ "help": "Cooldown/backoff controls for temporary profile suppression after billing-related failures and retry windows. Use these to prevent rapid re-selection of profiles that are still blocked.", "hasChildren": true }, + { + "path": "auth.cooldowns.authPermanentBackoffMinutes", + "kind": "core", + "type": "number", + "required": false, + "deprecated": false, + "sensitive": false, + "tags": [ + "access", + "auth", + "reliability" + ], + "label": "Auth-Permanent Backoff (minutes)", + "help": "Base backoff (minutes) for high-confidence auth_permanent failures (default: 10). Keep this shorter than billing so providers recover automatically after transient upstream auth incidents.", + "hasChildren": false + }, + { + "path": "auth.cooldowns.authPermanentMaxMinutes", + "kind": "core", + "type": "number", + "required": false, + "deprecated": false, + "sensitive": false, + "tags": [ + "access", + "auth", + "performance" + ], + "label": "Auth-Permanent Backoff Cap (minutes)", + "help": "Cap (minutes) for auth_permanent backoff (default: 60).", + "hasChildren": false + }, { "path": "auth.cooldowns.billingBackoffHours", "kind": "core", diff --git a/docs/.generated/config-baseline.json b/docs/.generated/config-baseline.json index 819b239d3f0..7c8d1c2f0b0 100644 --- a/docs/.generated/config-baseline.json +++ b/docs/.generated/config-baseline.json @@ -7815,6 +7815,38 @@ "help": "Cooldown/backoff controls for temporary profile suppression after billing-related failures and retry windows. Use these to prevent rapid re-selection of profiles that are still blocked.", "hasChildren": true }, + { + "path": "auth.cooldowns.authPermanentBackoffMinutes", + "kind": "core", + "type": "number", + "required": false, + "deprecated": false, + "sensitive": false, + "tags": [ + "access", + "auth", + "reliability" + ], + "label": "Auth-Permanent Backoff (minutes)", + "help": "Base backoff (minutes) for high-confidence auth_permanent failures (default: 10). Keep this shorter than billing so providers recover automatically after transient upstream auth incidents.", + "hasChildren": false + }, + { + "path": "auth.cooldowns.authPermanentMaxMinutes", + "kind": "core", + "type": "number", + "required": false, + "deprecated": false, + "sensitive": false, + "tags": [ + "access", + "auth", + "performance" + ], + "label": "Auth-Permanent Backoff Cap (minutes)", + "help": "Cap (minutes) for auth_permanent backoff (default: 60).", + "hasChildren": false + }, { "path": "auth.cooldowns.billingBackoffHours", "kind": "core", diff --git a/docs/gateway/configuration-reference.md b/docs/gateway/configuration-reference.md index 62a14b4bdd6..bf9db746e2c 100644 --- a/docs/gateway/configuration-reference.md +++ b/docs/gateway/configuration-reference.md @@ -3037,6 +3037,8 @@ Notes: billingBackoffHours: 5, billingBackoffHoursByProvider: { anthropic: 3, openai: 8 }, billingMaxHours: 24, + authPermanentBackoffMinutes: 10, + authPermanentMaxMinutes: 60, failureWindowHours: 24, overloadedProfileRotations: 1, overloadedBackoffMs: 0, @@ -3049,6 +3051,8 @@ Notes: - `billingBackoffHours`: base backoff in hours when a profile fails due to billing/insufficient credits (default: `5`). - `billingBackoffHoursByProvider`: optional per-provider overrides for billing backoff hours. - `billingMaxHours`: cap in hours for billing backoff exponential growth (default: `24`). +- `authPermanentBackoffMinutes`: base backoff in minutes for high-confidence `auth_permanent` failures (default: `10`). +- `authPermanentMaxMinutes`: cap in minutes for `auth_permanent` backoff growth (default: `60`). - `failureWindowHours`: rolling window in hours used for backoff counters (default: `24`). - `overloadedProfileRotations`: maximum same-provider auth-profile rotations for overloaded errors before switching to model fallback (default: `1`). - `overloadedBackoffMs`: fixed delay before retrying an overloaded provider/profile rotation (default: `0`). diff --git a/src/agents/auth-profiles.markauthprofilefailure.test.ts b/src/agents/auth-profiles.markauthprofilefailure.test.ts index dea3eddef5a..7807e58d9db 100644 --- a/src/agents/auth-profiles.markauthprofilefailure.test.ts +++ b/src/agents/auth-profiles.markauthprofilefailure.test.ts @@ -199,8 +199,9 @@ describe("markAuthProfileFailure", () => { expect(stats?.failureCounts?.overloaded).toBe(1); }); }); - it("disables auth_permanent failures via disabledUntil (like billing)", async () => { + it("disables auth_permanent failures for ~10 minutes by default", async () => { await withAuthProfileStore(async ({ agentDir, store }) => { + const startedAt = Date.now(); await markAuthProfileFailure({ store, profileId: "anthropic:default", @@ -213,6 +214,33 @@ describe("markAuthProfileFailure", () => { expect(stats?.disabledReason).toBe("auth_permanent"); // Should NOT set cooldownUntil (that's for transient errors) expect(stats?.cooldownUntil).toBeUndefined(); + const remainingMs = (stats?.disabledUntil as number) - startedAt; + expectCooldownInRange(remainingMs, 9 * 60 * 1000, 11 * 60 * 1000); + }); + }); + + it("honors auth_permanent backoff overrides", async () => { + await withAuthProfileStore(async ({ agentDir, store }) => { + const startedAt = Date.now(); + await markAuthProfileFailure({ + store, + profileId: "anthropic:default", + reason: "auth_permanent", + agentDir, + cfg: { + auth: { + cooldowns: { + authPermanentBackoffMinutes: 15, + authPermanentMaxMinutes: 45, + }, + }, + } as never, + }); + + const disabledUntil = store.usageStats?.["anthropic:default"]?.disabledUntil; + expect(typeof disabledUntil).toBe("number"); + const remainingMs = (disabledUntil as number) - startedAt; + expectCooldownInRange(remainingMs, 14 * 60 * 1000, 16 * 60 * 1000); }); }); it("resets backoff counters outside the failure window", async () => { diff --git a/src/agents/auth-profiles/usage.ts b/src/agents/auth-profiles/usage.ts index 5b6a7d27d79..6bc64fa79ae 100644 --- a/src/agents/auth-profiles/usage.ts +++ b/src/agents/auth-profiles/usage.ts @@ -130,10 +130,7 @@ function applyWhamCooldownResult(params: { : 0; return { ...params.computed, - cooldownUntil: Math.max( - existingActiveCooldownUntil, - params.now + params.whamResult.cooldownMs, - ), + cooldownUntil: Math.max(existingActiveCooldownUntil, params.now + params.whamResult.cooldownMs), }; } @@ -528,11 +525,15 @@ function resolveAuthCooldownConfig(params: { const defaults = { billingBackoffHours: 5, billingMaxHours: 24, + authPermanentBackoffMinutes: 10, + authPermanentMaxMinutes: 60, failureWindowHours: 24, } as const; const resolveHours = (value: unknown, fallback: number) => typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback; + const resolveMinutes = (value: unknown, fallback: number) => + typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback; const cooldowns = params.cfg?.auth?.cooldowns; const billingOverride = (() => { @@ -553,17 +554,19 @@ function resolveAuthCooldownConfig(params: { defaults.billingBackoffHours, ); const billingMaxHours = resolveHours(cooldowns?.billingMaxHours, defaults.billingMaxHours); + const authPermanentBackoffMinutes = resolveMinutes( + cooldowns?.authPermanentBackoffMinutes, + defaults.authPermanentBackoffMinutes, + ); + const authPermanentMaxMinutes = resolveMinutes( + cooldowns?.authPermanentMaxMinutes, + defaults.authPermanentMaxMinutes, + ); const failureWindowHours = resolveHours( cooldowns?.failureWindowHours, defaults.failureWindowHours, ); - const resolveMinutes = (value: unknown, fallback: number) => - typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback; - - const authPermanentBackoffMinutes = resolveMinutes(cooldowns?.authPermanentBackoffMinutes, 10); - const authPermanentMaxMinutes = resolveMinutes(cooldowns?.authPermanentMaxMinutes, 60); - return { billingBackoffMs: billingBackoffHours * 60 * 60 * 1000, billingMaxMs: billingMaxHours * 60 * 60 * 1000, @@ -688,13 +691,12 @@ function computeNextProfileUsageStats(params: { }); updatedStats.disabledReason = params.reason; } else if (params.reason === "auth_permanent") { - // auth_permanent errors can be caused by transient provider outages (e.g. - // GCP returning API_KEY_INVALID during an incident). Use a much shorter - // backoff than billing so the provider recovers automatically once the - // upstream issue resolves. - const authPermCount = failureCounts[params.reason] ?? 1; + // Keep permanent-auth failures in the disabled lane, but use a much + // shorter backoff than billing. Some upstream incidents surface auth-ish + // payloads transiently, so the provider should recover automatically. + const authPermanentCount = failureCounts[params.reason] ?? 1; const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({ - errorCount: authPermCount, + errorCount: authPermanentCount, baseMs: params.cfgResolved.authPermanentBackoffMs, maxMs: params.cfgResolved.authPermanentMaxMs, }); diff --git a/src/agents/failover-error.test.ts b/src/agents/failover-error.test.ts index fcb9388932a..492e3972595 100644 --- a/src/agents/failover-error.test.ts +++ b/src/agents/failover-error.test.ts @@ -500,9 +500,9 @@ describe("failover-error", () => { expect(resolveFailoverReasonFromError({ status: 403, message: "Forbidden" })).toBe("auth"); }); - it("401 with permanent auth message returns auth_permanent", () => { + it("401 with ambiguous auth message returns auth", () => { expect(resolveFailoverReasonFromError({ status: 401, message: "invalid_api_key" })).toBe( - "auth_permanent", + "auth", ); }); @@ -516,26 +516,22 @@ describe("failover-error", () => { expect(resolveFailoverStatus("auth_permanent")).toBe(403); }); - it("coerces permanent auth error with correct reason", () => { + it("coerces ambiguous auth error into the short auth lane", () => { const err = coerceToFailoverError( { status: 401, message: "invalid_api_key" }, { provider: "anthropic", model: "claude-opus-4-6" }, ); - expect(err?.reason).toBe("auth_permanent"); + expect(err?.reason).toBe("auth"); expect(err?.provider).toBe("anthropic"); }); - it("403 permission_error returns auth_permanent", () => { - expect( - resolveFailoverReasonFromError({ - status: 403, - message: - "permission_error: OAuth authentication is currently not allowed for this organization.", - }), - ).toBe("auth_permanent"); + it("403 bare permission_error returns auth", () => { + expect(resolveFailoverReasonFromError({ status: 403, message: "permission_error" })).toBe( + "auth", + ); }); - it("permission_error in error message string classifies as auth_permanent", () => { + it("permission_error with organization denial stays auth_permanent", () => { const err = coerceToFailoverError( "HTTP 403 permission_error: OAuth authentication is currently not allowed for this organization.", { provider: "anthropic", model: "claude-opus-4-6" }, diff --git a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts index dabe78c2d26..2d4f30766ca 100644 --- a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts +++ b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts @@ -528,8 +528,8 @@ describe("isTransientHttpError", () => { }); describe("classifyFailoverReasonFromHttpStatus", () => { - it("treats HTTP 401 permanent auth failures as auth_permanent", () => { - expect(classifyFailoverReasonFromHttpStatus(401, "invalid_api_key")).toBe("auth_permanent"); + it("treats HTTP 401 invalid_api_key as ambiguous auth", () => { + expect(classifyFailoverReasonFromHttpStatus(401, "invalid_api_key")).toBe("auth"); }); it("treats HTTP 422 as format error", () => { @@ -591,7 +591,7 @@ describe("classifyFailoverReasonFromHttpStatus", () => { }); it("preserves explicit billing and auth signals on HTTP 410", () => { - expect(classifyFailoverReasonFromHttpStatus(410, "invalid_api_key")).toBe("auth_permanent"); + expect(classifyFailoverReasonFromHttpStatus(410, "invalid_api_key")).toBe("auth"); expect(classifyFailoverReasonFromHttpStatus(410, "authentication failed")).toBe("auth"); expect(classifyFailoverReasonFromHttpStatus(410, "insufficient credits")).toBe("billing"); }); @@ -613,7 +613,7 @@ describe("classifyFailoverReason", () => { }); it("keeps explicit billing and auth signals on 410 text", () => { - expect(classifyFailoverReason("HTTP 410: invalid_api_key")).toBe("auth_permanent"); + expect(classifyFailoverReason("HTTP 410: invalid_api_key")).toBe("auth"); expect(classifyFailoverReason("HTTP 410: authentication failed")).toBe("auth"); expect(classifyFailoverReason("HTTP 410: insufficient credits")).toBe("billing"); }); @@ -935,11 +935,15 @@ describe("classifyFailoverReason", () => { expect(classifyFailoverReason("LLM error: monthly limit reached")).toBe("rate_limit"); expect(classifyFailoverReason("LLM error: daily limit exceeded")).toBe("rate_limit"); }); - it("classifies permanent auth errors as auth_permanent", () => { - expect(classifyFailoverReason("invalid_api_key")).toBe("auth_permanent"); + it("keeps only high-confidence auth failures in auth_permanent", () => { + expect(classifyFailoverReason("invalid_api_key")).toBe("auth"); + expect(classifyFailoverReason("permission_error")).toBe("auth"); expect(classifyFailoverReason("Your api key has been revoked")).toBe("auth_permanent"); expect(classifyFailoverReason("key has been disabled")).toBe("auth_permanent"); expect(classifyFailoverReason("account has been deactivated")).toBe("auth_permanent"); + expect( + classifyFailoverReason("OAuth authentication is currently not allowed for this organization"), + ).toBe("auth_permanent"); }); it("classifies JSON api_error with transient signal as timeout", () => { expect( @@ -1013,6 +1017,11 @@ describe("classifyFailoverReason", () => { classifyFailoverReason( '{"type":"error","error":{"type":"api_error","message":"permission_error"}}', ), + ).toBe("auth"); + expect( + classifyFailoverReason( + '{"type":"error","error":{"type":"api_error","message":"permission_error: OAuth authentication is currently not allowed for this organization"}}', + ), ).toBe("auth_permanent"); }); }); diff --git a/src/agents/pi-embedded-helpers/failover-matches.ts b/src/agents/pi-embedded-helpers/failover-matches.ts index 1b3cb262e15..02c465f9c46 100644 --- a/src/agents/pi-embedded-helpers/failover-matches.ts +++ b/src/agents/pi-embedded-helpers/failover-matches.ts @@ -81,13 +81,10 @@ const ERROR_PATTERNS = { /requires?\s+more\s+credits/i, ], authPermanent: [ - /api[_ ]?key[_ ]?(?:revoked|invalid|deactivated|deleted)/i, - "invalid_api_key", + /api[_ ]?key[_ ]?(?:revoked|deactivated|deleted)/i, "key has been disabled", "key has been revoked", "account has been deactivated", - /could not (?:authenticate|validate).*(?:api[_ ]?key|credentials)/i, - "permission_error", "not allowed for this organization", ], auth: [ @@ -97,6 +94,8 @@ const ERROR_PATTERNS = { "authentication", "re-authenticate", "oauth token refresh failed", + /could not (?:authenticate|validate).*(?:api[_ ]?key|credentials)/i, + "permission_error", "unauthorized", "forbidden", "access denied", diff --git a/src/config/config-misc.test.ts b/src/config/config-misc.test.ts index 455d0811803..4ba8fcb4f64 100644 --- a/src/config/config-misc.test.ts +++ b/src/config/config-misc.test.ts @@ -52,6 +52,20 @@ describe("plugins.slots.contextEngine", () => { }); }); +describe("auth.cooldowns auth_permanent backoff config", () => { + it("accepts auth_permanent backoff knobs", () => { + const result = OpenClawSchema.safeParse({ + auth: { + cooldowns: { + authPermanentBackoffMinutes: 10, + authPermanentMaxMinutes: 60, + }, + }, + }); + expect(result.success).toBe(true); + }); +}); + describe("ui.seamColor", () => { it("accepts hex colors", () => { const res = validateConfigObject({ ui: { seamColor: "#FF4500" } }); diff --git a/src/config/schema.base.generated.ts b/src/config/schema.base.generated.ts index 12438d2bb5c..f01cd58c982 100644 --- a/src/config/schema.base.generated.ts +++ b/src/config/schema.base.generated.ts @@ -782,6 +782,14 @@ export const GENERATED_BASE_CONFIG_SCHEMA = { type: "number", exclusiveMinimum: 0, }, + authPermanentBackoffMinutes: { + type: "number", + exclusiveMinimum: 0, + }, + authPermanentMaxMinutes: { + type: "number", + exclusiveMinimum: 0, + }, failureWindowHours: { type: "number", exclusiveMinimum: 0, @@ -22567,6 +22575,16 @@ export const GENERATED_BASE_CONFIG_SCHEMA = { help: "Cap (hours) for billing backoff (default: 24).", tags: ["auth", "access", "performance"], }, + "auth.cooldowns.authPermanentBackoffMinutes": { + label: "Auth-Permanent Backoff (minutes)", + help: "Base backoff (minutes) for high-confidence auth_permanent failures (default: 10). Keep this shorter than billing so providers recover automatically after transient upstream auth incidents.", + tags: ["auth", "access", "reliability"], + }, + "auth.cooldowns.authPermanentMaxMinutes": { + label: "Auth-Permanent Backoff Cap (minutes)", + help: "Cap (minutes) for auth_permanent backoff (default: 60).", + tags: ["auth", "access", "performance"], + }, "auth.cooldowns.failureWindowHours": { label: "Failover Window (hours)", help: "Failure window (hours) for backoff counters (default: 24).", diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts index 55eedc04b9e..18579ab2ee9 100644 --- a/src/config/schema.help.ts +++ b/src/config/schema.help.ts @@ -823,7 +823,7 @@ export const FIELD_HELP: Record = { "Optional per-provider overrides for billing backoff (hours).", "auth.cooldowns.billingMaxHours": "Cap (hours) for billing backoff (default: 24).", "auth.cooldowns.authPermanentBackoffMinutes": - "Base backoff (minutes) for auth_permanent failures (default: 10). Shorter than billing because these errors are often caused by transient provider outages.", + "Base backoff (minutes) for high-confidence auth_permanent failures (default: 10). Keep this shorter than billing so providers recover automatically after transient upstream auth incidents.", "auth.cooldowns.authPermanentMaxMinutes": "Cap (minutes) for auth_permanent backoff (default: 60).", "auth.cooldowns.failureWindowHours": "Failure window (hours) for backoff counters (default: 24).", diff --git a/src/config/types.auth.ts b/src/config/types.auth.ts index bc00dc7d841..ca9340dd6b9 100644 --- a/src/config/types.auth.ts +++ b/src/config/types.auth.ts @@ -22,13 +22,12 @@ export type AuthConfig = { /** Billing backoff cap (hours). Default: 24. */ billingMaxHours?: number; /** - * Base backoff for permanent-auth failures (minutes). These errors (e.g. - * API_KEY_INVALID) can be caused by transient provider outages, so the - * default is much shorter than billing backoff. Default: 10. + * Base backoff for high-confidence permanent-auth failures (minutes). + * Default: 10. */ authPermanentBackoffMinutes?: number; /** - * Cap for permanent-auth backoff (minutes). Default: 60. + * Cap for high-confidence permanent-auth backoff (minutes). Default: 60. */ authPermanentMaxMinutes?: number; /** diff --git a/src/config/zod-schema.ts b/src/config/zod-schema.ts index 62de4cd30ef..4f0e43b3e3f 100644 --- a/src/config/zod-schema.ts +++ b/src/config/zod-schema.ts @@ -443,6 +443,8 @@ export const OpenClawSchema = z billingBackoffHours: z.number().positive().optional(), billingBackoffHoursByProvider: z.record(z.string(), z.number().positive()).optional(), billingMaxHours: z.number().positive().optional(), + authPermanentBackoffMinutes: z.number().positive().optional(), + authPermanentMaxMinutes: z.number().positive().optional(), failureWindowHours: z.number().positive().optional(), overloadedProfileRotations: z.number().int().nonnegative().optional(), overloadedBackoffMs: z.number().int().nonnegative().optional(),