fix: narrow auth permanent lockouts

This commit is contained in:
Peter Steinberger
2026-04-04 02:22:04 +09:00
parent 42e1d489fd
commit 865fa2ba72
13 changed files with 180 additions and 45 deletions

View File

@@ -199,8 +199,9 @@ describe("markAuthProfileFailure", () => {
expect(stats?.failureCounts?.overloaded).toBe(1);
});
});
it("disables auth_permanent failures via disabledUntil (like billing)", async () => {
it("disables auth_permanent failures for ~10 minutes by default", async () => {
await withAuthProfileStore(async ({ agentDir, store }) => {
const startedAt = Date.now();
await markAuthProfileFailure({
store,
profileId: "anthropic:default",
@@ -213,6 +214,33 @@ describe("markAuthProfileFailure", () => {
expect(stats?.disabledReason).toBe("auth_permanent");
// Should NOT set cooldownUntil (that's for transient errors)
expect(stats?.cooldownUntil).toBeUndefined();
const remainingMs = (stats?.disabledUntil as number) - startedAt;
expectCooldownInRange(remainingMs, 9 * 60 * 1000, 11 * 60 * 1000);
});
});
// Marks one auth_permanent failure while passing explicit cooldown overrides
// in cfg, then checks that the resulting disabledUntil lands roughly
// 15 minutes after the test started (the configured base backoff).
it("honors auth_permanent backoff overrides", async () => {
  const MINUTE_MS = 60 * 1000;
  const profileId = "anthropic:default";
  // Cast mirrors the loose config typing the test relies on; only the
  // cooldown fields are populated.
  const cfgWithOverrides = {
    auth: {
      cooldowns: {
        authPermanentBackoffMinutes: 15,
        // The cap sits above the base, so presumably it does not clamp the
        // first failure — confirm against the backoff calculation.
        authPermanentMaxMinutes: 45,
      },
    },
  } as never;

  await withAuthProfileStore(async ({ agentDir, store }) => {
    // Captured before the call so the measured window can only over-estimate.
    const startedAt = Date.now();

    await markAuthProfileFailure({
      store,
      profileId,
      reason: "auth_permanent",
      agentDir,
      cfg: cfgWithOverrides,
    });

    const disabledUntil = store.usageStats?.[profileId]?.disabledUntil;
    expect(typeof disabledUntil).toBe("number");

    // Allow one minute of slack on either side of the 15-minute override.
    const elapsedWindowMs = (disabledUntil as number) - startedAt;
    expectCooldownInRange(elapsedWindowMs, 14 * MINUTE_MS, 16 * MINUTE_MS);
  });
});
it("resets backoff counters outside the failure window", async () => {

View File

@@ -130,10 +130,7 @@ function applyWhamCooldownResult(params: {
: 0;
return {
...params.computed,
cooldownUntil: Math.max(
existingActiveCooldownUntil,
params.now + params.whamResult.cooldownMs,
),
cooldownUntil: Math.max(existingActiveCooldownUntil, params.now + params.whamResult.cooldownMs),
};
}
@@ -528,11 +525,15 @@ function resolveAuthCooldownConfig(params: {
const defaults = {
billingBackoffHours: 5,
billingMaxHours: 24,
authPermanentBackoffMinutes: 10,
authPermanentMaxMinutes: 60,
failureWindowHours: 24,
} as const;
const resolveHours = (value: unknown, fallback: number) =>
typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback;
const resolveMinutes = (value: unknown, fallback: number) =>
typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback;
const cooldowns = params.cfg?.auth?.cooldowns;
const billingOverride = (() => {
@@ -553,17 +554,19 @@ function resolveAuthCooldownConfig(params: {
defaults.billingBackoffHours,
);
const billingMaxHours = resolveHours(cooldowns?.billingMaxHours, defaults.billingMaxHours);
const authPermanentBackoffMinutes = resolveMinutes(
cooldowns?.authPermanentBackoffMinutes,
defaults.authPermanentBackoffMinutes,
);
const authPermanentMaxMinutes = resolveMinutes(
cooldowns?.authPermanentMaxMinutes,
defaults.authPermanentMaxMinutes,
);
const failureWindowHours = resolveHours(
cooldowns?.failureWindowHours,
defaults.failureWindowHours,
);
const resolveMinutes = (value: unknown, fallback: number) =>
typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback;
const authPermanentBackoffMinutes = resolveMinutes(cooldowns?.authPermanentBackoffMinutes, 10);
const authPermanentMaxMinutes = resolveMinutes(cooldowns?.authPermanentMaxMinutes, 60);
return {
billingBackoffMs: billingBackoffHours * 60 * 60 * 1000,
billingMaxMs: billingMaxHours * 60 * 60 * 1000,
@@ -688,13 +691,12 @@ function computeNextProfileUsageStats(params: {
});
updatedStats.disabledReason = params.reason;
} else if (params.reason === "auth_permanent") {
// auth_permanent errors can be caused by transient provider outages (e.g.
// GCP returning API_KEY_INVALID during an incident). Use a much shorter
// backoff than billing so the provider recovers automatically once the
// upstream issue resolves.
const authPermCount = failureCounts[params.reason] ?? 1;
// Keep permanent-auth failures in the disabled lane, but use a much
// shorter backoff than billing. Some upstream incidents surface auth-ish
// payloads transiently, so the provider should recover automatically.
const authPermanentCount = failureCounts[params.reason] ?? 1;
const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({
errorCount: authPermCount,
errorCount: authPermanentCount,
baseMs: params.cfgResolved.authPermanentBackoffMs,
maxMs: params.cfgResolved.authPermanentMaxMs,
});

View File

@@ -500,9 +500,9 @@ describe("failover-error", () => {
expect(resolveFailoverReasonFromError({ status: 403, message: "Forbidden" })).toBe("auth");
});
it("401 with permanent auth message returns auth_permanent", () => {
it("401 with ambiguous auth message returns auth", () => {
expect(resolveFailoverReasonFromError({ status: 401, message: "invalid_api_key" })).toBe(
"auth_permanent",
"auth",
);
});
@@ -516,26 +516,22 @@ describe("failover-error", () => {
expect(resolveFailoverStatus("auth_permanent")).toBe(403);
});
it("coerces permanent auth error with correct reason", () => {
it("coerces ambiguous auth error into the short auth lane", () => {
const err = coerceToFailoverError(
{ status: 401, message: "invalid_api_key" },
{ provider: "anthropic", model: "claude-opus-4-6" },
);
expect(err?.reason).toBe("auth_permanent");
expect(err?.reason).toBe("auth");
expect(err?.provider).toBe("anthropic");
});
it("403 permission_error returns auth_permanent", () => {
expect(
resolveFailoverReasonFromError({
status: 403,
message:
"permission_error: OAuth authentication is currently not allowed for this organization.",
}),
).toBe("auth_permanent");
it("403 bare permission_error returns auth", () => {
expect(resolveFailoverReasonFromError({ status: 403, message: "permission_error" })).toBe(
"auth",
);
});
it("permission_error in error message string classifies as auth_permanent", () => {
it("permission_error with organization denial stays auth_permanent", () => {
const err = coerceToFailoverError(
"HTTP 403 permission_error: OAuth authentication is currently not allowed for this organization.",
{ provider: "anthropic", model: "claude-opus-4-6" },

View File

@@ -528,8 +528,8 @@ describe("isTransientHttpError", () => {
});
describe("classifyFailoverReasonFromHttpStatus", () => {
it("treats HTTP 401 permanent auth failures as auth_permanent", () => {
expect(classifyFailoverReasonFromHttpStatus(401, "invalid_api_key")).toBe("auth_permanent");
it("treats HTTP 401 invalid_api_key as ambiguous auth", () => {
expect(classifyFailoverReasonFromHttpStatus(401, "invalid_api_key")).toBe("auth");
});
it("treats HTTP 422 as format error", () => {
@@ -591,7 +591,7 @@ describe("classifyFailoverReasonFromHttpStatus", () => {
});
// Asserts the failover reason returned for HTTP status 410 when the message
// carries auth- or billing-flavored text.
it("preserves explicit billing and auth signals on HTTP 410", () => {
// NOTE(review): the next two assertions expect different reasons for the same
// (410, "invalid_api_key") input and cannot both pass. This looks like the
// removed and added sides of a diff rendered together — presumably only the
// "auth" expectation is current; confirm against the committed test file.
expect(classifyFailoverReasonFromHttpStatus(410, "invalid_api_key")).toBe("auth_permanent");
expect(classifyFailoverReasonFromHttpStatus(410, "invalid_api_key")).toBe("auth");
expect(classifyFailoverReasonFromHttpStatus(410, "authentication failed")).toBe("auth");
expect(classifyFailoverReasonFromHttpStatus(410, "insufficient credits")).toBe("billing");
});
@@ -613,7 +613,7 @@ describe("classifyFailoverReason", () => {
});
// Asserts the failover reason derived from raw error text that starts with
// "HTTP 410:" followed by auth- or billing-flavored wording.
it("keeps explicit billing and auth signals on 410 text", () => {
// NOTE(review): the next two assertions expect different reasons for the same
// "HTTP 410: invalid_api_key" input and cannot both pass. This looks like the
// removed and added sides of a diff rendered together — presumably only the
// "auth" expectation is current; confirm against the committed test file.
expect(classifyFailoverReason("HTTP 410: invalid_api_key")).toBe("auth_permanent");
expect(classifyFailoverReason("HTTP 410: invalid_api_key")).toBe("auth");
expect(classifyFailoverReason("HTTP 410: authentication failed")).toBe("auth");
expect(classifyFailoverReason("HTTP 410: insufficient credits")).toBe("billing");
});
@@ -935,11 +935,15 @@ describe("classifyFailoverReason", () => {
expect(classifyFailoverReason("LLM error: monthly limit reached")).toBe("rate_limit");
expect(classifyFailoverReason("LLM error: daily limit exceeded")).toBe("rate_limit");
});
it("classifies permanent auth errors as auth_permanent", () => {
expect(classifyFailoverReason("invalid_api_key")).toBe("auth_permanent");
it("keeps only high-confidence auth failures in auth_permanent", () => {
expect(classifyFailoverReason("invalid_api_key")).toBe("auth");
expect(classifyFailoverReason("permission_error")).toBe("auth");
expect(classifyFailoverReason("Your api key has been revoked")).toBe("auth_permanent");
expect(classifyFailoverReason("key has been disabled")).toBe("auth_permanent");
expect(classifyFailoverReason("account has been deactivated")).toBe("auth_permanent");
expect(
classifyFailoverReason("OAuth authentication is currently not allowed for this organization"),
).toBe("auth_permanent");
});
it("classifies JSON api_error with transient signal as timeout", () => {
expect(
@@ -1013,6 +1017,11 @@ describe("classifyFailoverReason", () => {
classifyFailoverReason(
'{"type":"error","error":{"type":"api_error","message":"permission_error"}}',
),
).toBe("auth");
expect(
classifyFailoverReason(
'{"type":"error","error":{"type":"api_error","message":"permission_error: OAuth authentication is currently not allowed for this organization"}}',
),
).toBe("auth_permanent");
});
});

View File

@@ -81,13 +81,10 @@ const ERROR_PATTERNS = {
/requires?\s+more\s+credits/i,
],
authPermanent: [
/api[_ ]?key[_ ]?(?:revoked|invalid|deactivated|deleted)/i,
"invalid_api_key",
/api[_ ]?key[_ ]?(?:revoked|deactivated|deleted)/i,
"key has been disabled",
"key has been revoked",
"account has been deactivated",
/could not (?:authenticate|validate).*(?:api[_ ]?key|credentials)/i,
"permission_error",
"not allowed for this organization",
],
auth: [
@@ -97,6 +94,8 @@ const ERROR_PATTERNS = {
"authentication",
"re-authenticate",
"oauth token refresh failed",
/could not (?:authenticate|validate).*(?:api[_ ]?key|credentials)/i,
"permission_error",
"unauthorized",
"forbidden",
"access denied",