diff --git a/CHANGELOG.md b/CHANGELOG.md index 7c9c649d7f9..d38c10cf6d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ Docs: https://docs.openclaw.ai - Agents/embedded logs: add structured, sanitized lifecycle and failover observation events so overload and provider failures are easier to tail and filter. (#41336) thanks @altaywtf. - iOS/gateway foreground recovery: reconnect immediately on foreground return after stale background sockets are torn down, so the app no longer stays disconnected until a later wake path happens. (#41384) Thanks @mbelinky. - Cron/subagent followup: do not misclassify empty or `NO_REPLY` cron responses as interim acknowledgements that need a rerun, so deliberately silent cron jobs are no longer retried. (#41383) thanks @jackal092927. +- Auth/cooldowns: reset expired auth-profile cooldown error counters before computing the next backoff so stale on-disk counters do not re-escalate into long cooldown loops after expiry. (#41028) thanks @zerone0x. ## 2026.3.8 diff --git a/src/agents/auth-profiles.markauthprofilefailure.test.ts b/src/agents/auth-profiles.markauthprofilefailure.test.ts index e5690f75c6a..5c4d73197b3 100644 --- a/src/agents/auth-profiles.markauthprofilefailure.test.ts +++ b/src/agents/auth-profiles.markauthprofilefailure.test.ts @@ -190,6 +190,58 @@ describe("markAuthProfileFailure", () => { } }); + it("resets error count when previous cooldown has expired to prevent escalation", async () => { + const agentDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-auth-")); + try { + const authPath = path.join(agentDir, "auth-profiles.json"); + const now = Date.now(); + // Simulate state left on disk after 3 rapid failures within a 1-min cooldown + // window. The cooldown has since expired, but clearExpiredCooldowns() only + // ran in-memory and never persisted — so disk still carries errorCount: 3. + fs.writeFileSync( + authPath, + JSON.stringify({ + version: 1, + profiles: { + "anthropic:default": { + type: "api_key", + provider: "anthropic", + key: "sk-default", + }, + }, + usageStats: { + "anthropic:default": { + errorCount: 3, + failureCounts: { rate_limit: 3 }, + lastFailureAt: now - 120_000, // 2 minutes ago + cooldownUntil: now - 60_000, // expired 1 minute ago + }, + }, + }), + ); + + const store = ensureAuthProfileStore(agentDir); + await markAuthProfileFailure({ + store, + profileId: "anthropic:default", + reason: "rate_limit", + agentDir, + }); + + const stats = store.usageStats?.["anthropic:default"]; + // Error count should reset to 1 (not escalate to 4) because the + // previous cooldown expired. Cooldown should be ~1 min, not ~60 min. + expect(stats?.errorCount).toBe(1); + expect(stats?.failureCounts?.rate_limit).toBe(1); + const cooldownMs = (stats?.cooldownUntil ?? 0) - now; + // calculateAuthProfileCooldownMs(1) = 60_000 (1 minute) + expect(cooldownMs).toBeLessThan(120_000); + expect(cooldownMs).toBeGreaterThan(0); + } finally { + fs.rmSync(agentDir, { recursive: true, force: true }); + } + }); + it("does not persist cooldown windows for OpenRouter profiles", async () => { await withAuthProfileStore(async ({ agentDir, store }) => { await markAuthProfileFailure({ diff --git a/src/agents/auth-profiles/usage.test.ts b/src/agents/auth-profiles/usage.test.ts index 120f75d3665..261eae6efd5 100644 --- a/src/agents/auth-profiles/usage.test.ts +++ b/src/agents/auth-profiles/usage.test.ts @@ -608,6 +608,10 @@ describe("markAuthProfileFailure — active windows do not extend on retry", () }); } + // When a cooldown/disabled window expires, the error count resets to prevent + // stale counters from escalating the next cooldown (the root cause of + // infinite cooldown loops — see #40989). The next failure should compute + // backoff from errorCount=1, not from the accumulated stale count. const expiredWindowCases = [ { label: "cooldownUntil", @@ -617,7 +621,8 @@ describe("markAuthProfileFailure — active windows do not extend on retry", () errorCount: 3, lastFailureAt: now - 60_000, }), - expectedUntil: (now: number) => now + 60 * 60 * 1000, + // errorCount resets → calculateAuthProfileCooldownMs(1) = 60_000 + expectedUntil: (now: number) => now + 60_000, readUntil: (stats: WindowStats | undefined) => stats?.cooldownUntil, }, { @@ -630,7 +635,9 @@ describe("markAuthProfileFailure — active windows do not extend on retry", () failureCounts: { billing: 2 }, lastFailureAt: now - 60_000, }), - expectedUntil: (now: number) => now + 20 * 60 * 60 * 1000, + // errorCount resets, billing count resets to 1 → + // calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h + expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000, readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil, }, { @@ -643,7 +650,9 @@ describe("markAuthProfileFailure — active windows do not extend on retry", () failureCounts: { auth_permanent: 2 }, lastFailureAt: now - 60_000, }), - expectedUntil: (now: number) => now + 20 * 60 * 60 * 1000, + // errorCount resets, auth_permanent count resets to 1 → + // calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h + expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000, readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil, }, ]; diff --git a/src/agents/auth-profiles/usage.ts b/src/agents/auth-profiles/usage.ts index c28b51e3e57..0d9ae6a6aaa 100644 --- a/src/agents/auth-profiles/usage.ts +++ b/src/agents/auth-profiles/usage.ts @@ -400,9 +400,19 @@ function computeNextProfileUsageStats(params: { params.existing.lastFailureAt > 0 && params.now - params.existing.lastFailureAt > windowMs; - const baseErrorCount = windowExpired ? 0 : (params.existing.errorCount ?? 0); + // If the previous cooldown has already expired, reset error counters so the + // profile gets a fresh backoff window. clearExpiredCooldowns() does this + // in-memory during profile ordering, but the on-disk state may still carry + // the old counters when the lock-based updater reads a fresh store. Without + // this check, stale error counts from an expired cooldown cause the next + // failure to escalate to a much longer cooldown (e.g. 1 min → 25 min). + const unusableUntil = resolveProfileUnusableUntil(params.existing); + const previousCooldownExpired = typeof unusableUntil === "number" && params.now >= unusableUntil; + + const shouldResetCounters = windowExpired || previousCooldownExpired; + const baseErrorCount = shouldResetCounters ? 0 : (params.existing.errorCount ?? 0); const nextErrorCount = baseErrorCount + 1; - const failureCounts = windowExpired ? {} : { ...params.existing.failureCounts }; + const failureCounts = shouldResetCounters ? {} : { ...params.existing.failureCounts }; failureCounts[params.reason] = (failureCounts[params.reason] ?? 0) + 1; const updatedStats: ProfileUsageStats = {