fix(auth): reset cooldown error counters on expiry to prevent infinite escalation (#41028)

Merged via squash.

Prepared head SHA: 89bd83f09a
Co-authored-by: zerone0x <39543393+zerone0x@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
2026-05-06 05:30:42 +00:00 · 2026-03-10 04:40:11 +08:00
parent 2b2e5e2038
commit 5f90883ad3
4 changed files with 77 additions and 5 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,7 @@ Docs: https://docs.openclaw.ai
 - Agents/embedded logs: add structured, sanitized lifecycle and failover observation events so overload and provider failures are easier to tail and filter. (#41336) thanks @altaywtf.
 - iOS/gateway foreground recovery: reconnect immediately on foreground return after stale background sockets are torn down, so the app no longer stays disconnected until a later wake path happens. (#41384) Thanks @mbelinky.
 - Cron/subagent followup: do not misclassify empty or `NO_REPLY` cron responses as interim acknowledgements that need a rerun, so deliberately silent cron jobs are no longer retried. (#41383) thanks @jackal092927.
+- Auth/cooldowns: reset expired auth-profile cooldown error counters before computing the next backoff so stale on-disk counters do not re-escalate into long cooldown loops after expiry. (#41028) thanks @zerone0x.

 ## 2026.3.8

--- a/src/agents/auth-profiles.markauthprofilefailure.test.ts
+++ b/src/agents/auth-profiles.markauthprofilefailure.test.ts
@@ -190,6 +190,58 @@ describe("markAuthProfileFailure", () => {
    }
  });

+  it("resets error count when previous cooldown has expired to prevent escalation", async () => {
+    const agentDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-auth-"));
+    try {
+      const authPath = path.join(agentDir, "auth-profiles.json");
+      const now = Date.now();
+      // Simulate state left on disk after 3 rapid failures within a 1-min cooldown
+      // window. The cooldown has since expired, but clearExpiredCooldowns() only
+      // ran in-memory and never persisted — so disk still carries errorCount: 3.
+      fs.writeFileSync(
+        authPath,
+        JSON.stringify({
+          version: 1,
+          profiles: {
+            "anthropic:default": {
+              type: "api_key",
+              provider: "anthropic",
+              key: "sk-default",
+            },
+          },
+          usageStats: {
+            "anthropic:default": {
+              errorCount: 3,
+              failureCounts: { rate_limit: 3 },
+              lastFailureAt: now - 120_000, // 2 minutes ago
+              cooldownUntil: now - 60_000, // expired 1 minute ago
+            },
+          },
+        }),
+      );
+
+      const store = ensureAuthProfileStore(agentDir);
+      await markAuthProfileFailure({
+        store,
+        profileId: "anthropic:default",
+        reason: "rate_limit",
+        agentDir,
+      });
+
+      const stats = store.usageStats?.["anthropic:default"];
+      // Error count should reset to 1 (not escalate to 4) because the
+      // previous cooldown expired. Cooldown should be ~1 min, not ~60 min.
+      expect(stats?.errorCount).toBe(1);
+      expect(stats?.failureCounts?.rate_limit).toBe(1);
+      const cooldownMs = (stats?.cooldownUntil ?? 0) - now;
+      // calculateAuthProfileCooldownMs(1) = 60_000 (1 minute)
+      expect(cooldownMs).toBeLessThan(120_000);
+      expect(cooldownMs).toBeGreaterThan(0);
+    } finally {
+      fs.rmSync(agentDir, { recursive: true, force: true });
+    }
+  });
+
  it("does not persist cooldown windows for OpenRouter profiles", async () => {
    await withAuthProfileStore(async ({ agentDir, store }) => {
      await markAuthProfileFailure({
--- a/src/agents/auth-profiles/usage.test.ts
+++ b/src/agents/auth-profiles/usage.test.ts
@@ -608,6 +608,10 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
    });
  }

+  // When a cooldown/disabled window expires, the error count resets to prevent
+  // stale counters from escalating the next cooldown (the root cause of
+  // infinite cooldown loops — see #40989). The next failure should compute
+  // backoff from errorCount=1, not from the accumulated stale count.
  const expiredWindowCases = [
    {
      label: "cooldownUntil",
@@ -617,7 +621,8 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
        errorCount: 3,
        lastFailureAt: now - 60_000,
      }),
-      expectedUntil: (now: number) => now + 60 * 60 * 1000,
+      // errorCount resets → calculateAuthProfileCooldownMs(1) = 60_000
+      expectedUntil: (now: number) => now + 60_000,
      readUntil: (stats: WindowStats | undefined) => stats?.cooldownUntil,
    },
    {
@@ -630,7 +635,9 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
        failureCounts: { billing: 2 },
        lastFailureAt: now - 60_000,
      }),
-      expectedUntil: (now: number) => now + 20 * 60 * 60 * 1000,
+      // errorCount resets, billing count resets to 1 →
+      // calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h
+      expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000,
      readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil,
    },
    {
@@ -643,7 +650,9 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
        failureCounts: { auth_permanent: 2 },
        lastFailureAt: now - 60_000,
      }),
-      expectedUntil: (now: number) => now + 20 * 60 * 60 * 1000,
+      // errorCount resets, auth_permanent count resets to 1 →
+      // calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h
+      expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000,
      readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil,
    },
  ];
--- a/src/agents/auth-profiles/usage.ts
+++ b/src/agents/auth-profiles/usage.ts
@@ -400,9 +400,19 @@ function computeNextProfileUsageStats(params: {
    params.existing.lastFailureAt > 0 &&
    params.now - params.existing.lastFailureAt > windowMs;

-  const baseErrorCount = windowExpired ? 0 : (params.existing.errorCount ?? 0);
+  // If the previous cooldown has already expired, reset error counters so the
+  // profile gets a fresh backoff window. clearExpiredCooldowns() does this
+  // in-memory during profile ordering, but the on-disk state may still carry
+  // the old counters when the lock-based updater reads a fresh store. Without
+  // this check, stale error counts from an expired cooldown cause the next
+  // failure to escalate to a much longer cooldown (e.g. 1 min → 25 min).
+  const unusableUntil = resolveProfileUnusableUntil(params.existing);
+  const previousCooldownExpired = typeof unusableUntil === "number" && params.now >= unusableUntil;
+
+  const shouldResetCounters = windowExpired || previousCooldownExpired;
+  const baseErrorCount = shouldResetCounters ? 0 : (params.existing.errorCount ?? 0);
  const nextErrorCount = baseErrorCount + 1;
-  const failureCounts = windowExpired ? {} : { ...params.existing.failureCounts };
+  const failureCounts = shouldResetCounters ? {} : { ...params.existing.failureCounts };
  failureCounts[params.reason] = (failureCounts[params.reason] ?? 0) + 1;

  const updatedStats: ProfileUsageStats = {