fix(auth): reset cooldown error counters on expiry to prevent infinite escalation (#41028)

Merged via squash. Prepared head SHA: 89bd83f09a Co-authored-by: zerone0x <39543393+zerone0x@users.noreply.github.com> Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com> Reviewed-by: @altaywtf
2026-03-12 07:20:45 +00:00 · 2026-03-10 04:40:11 +08:00
parent 2b2e5e2038
commit 5f90883ad3
4 changed files with 77 additions and 5 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,7 @@ Docs: https://docs.openclaw.ai
 - Agents/embedded logs: add structured, sanitized lifecycle and failover observation events so overload and provider failures are easier to tail and filter. (#41336) thanks @altaywtf.
 - iOS/gateway foreground recovery: reconnect immediately on foreground return after stale background sockets are torn down, so the app no longer stays disconnected until a later wake path happens. (#41384) Thanks @mbelinky.
 - Cron/subagent followup: do not misclassify empty or `NO_REPLY` cron responses as interim acknowledgements that need a rerun, so deliberately silent cron jobs are no longer retried. (#41383) thanks @jackal092927.
 - Auth/cooldowns: reset expired auth-profile cooldown error counters before computing the next backoff so stale on-disk counters do not re-escalate into long cooldown loops after expiry. (#41028) thanks @zerone0x.
 ## 2026.3.8
--- a/src/agents/auth-profiles.markauthprofilefailure.test.ts
+++ b/src/agents/auth-profiles.markauthprofilefailure.test.ts
@@ -190,6 +190,58 @@ describe("markAuthProfileFailure", () => {
    }
  });
  it("resets error count when previous cooldown has expired to prevent escalation", async () => {
    const agentDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-auth-"));
    try {
      const authPath = path.join(agentDir, "auth-profiles.json");
      const now = Date.now();
      // Simulate state left on disk after 3 rapid failures within a 1-min cooldown
      // window. The cooldown has since expired, but clearExpiredCooldowns() only
      // ran in-memory and never persisted — so disk still carries errorCount: 3.
      fs.writeFileSync(
        authPath,
        JSON.stringify({
          version: 1,
          profiles: {
            "anthropic:default": {
              type: "api_key",
              provider: "anthropic",
              key: "sk-default",
            },
          },
          usageStats: {
            "anthropic:default": {
              errorCount: 3,
              failureCounts: { rate_limit: 3 },
              lastFailureAt: now - 120_000, // 2 minutes ago
              cooldownUntil: now - 60_000, // expired 1 minute ago
            },
          },
        }),
      );
      const store = ensureAuthProfileStore(agentDir);
      await markAuthProfileFailure({
        store,
        profileId: "anthropic:default",
        reason: "rate_limit",
        agentDir,
      });
      const stats = store.usageStats?.["anthropic:default"];
      // Error count should reset to 1 (not escalate to 4) because the
      // previous cooldown expired. Cooldown should be ~1 min, not ~60 min.
      expect(stats?.errorCount).toBe(1);
      expect(stats?.failureCounts?.rate_limit).toBe(1);
      const cooldownMs = (stats?.cooldownUntil ?? 0) - now;
      // calculateAuthProfileCooldownMs(1) = 60_000 (1 minute)
      expect(cooldownMs).toBeLessThan(120_000);
      expect(cooldownMs).toBeGreaterThan(0);
    } finally {
      fs.rmSync(agentDir, { recursive: true, force: true });
    }
  });
  it("does not persist cooldown windows for OpenRouter profiles", async () => {
    await withAuthProfileStore(async ({ agentDir, store }) => {
      await markAuthProfileFailure({
--- a/src/agents/auth-profiles/usage.test.ts
+++ b/src/agents/auth-profiles/usage.test.ts
@@ -608,6 +608,10 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
    });
  }
  // When a cooldown/disabled window expires, the error count resets to prevent
  // stale counters from escalating the next cooldown (the root cause of
  // infinite cooldown loops — see #40989). The next failure should compute
  // backoff from errorCount=1, not from the accumulated stale count.
  const expiredWindowCases = [
    {
      label: "cooldownUntil",
@@ -617,7 +621,8 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
        errorCount: 3,
        lastFailureAt: now - 60_000,
      }),
-      expectedUntil: (now: number) => now + 60 * 60 * 1000,
+      // errorCount resets → calculateAuthProfileCooldownMs(1) = 60_000
      expectedUntil: (now: number) => now + 60_000,
      readUntil: (stats: WindowStats | undefined) => stats?.cooldownUntil,
    },
    {
@@ -630,7 +635,9 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
        failureCounts: { billing: 2 },
        lastFailureAt: now - 60_000,
      }),
-      expectedUntil: (now: number) => now + 20 * 60 * 60 * 1000,
+      // errorCount resets, billing count resets to 1 →
      // calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h
      expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000,
      readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil,
    },
    {
@@ -643,7 +650,9 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
        failureCounts: { auth_permanent: 2 },
        lastFailureAt: now - 60_000,
      }),
-      expectedUntil: (now: number) => now + 20 * 60 * 60 * 1000,
+      // errorCount resets, auth_permanent count resets to 1 →
      // calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h
      expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000,
      readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil,
    },
  ];
--- a/src/agents/auth-profiles/usage.ts
+++ b/src/agents/auth-profiles/usage.ts
@@ -400,9 +400,19 @@ function computeNextProfileUsageStats(params: {
    params.existing.lastFailureAt > 0 &&
    params.now - params.existing.lastFailureAt > windowMs;
-  const baseErrorCount = windowExpired ? 0 : (params.existing.errorCount ?? 0);
+  // If the previous cooldown has already expired, reset error counters so the
  // profile gets a fresh backoff window. clearExpiredCooldowns() does this
  // in-memory during profile ordering, but the on-disk state may still carry
  // the old counters when the lock-based updater reads a fresh store. Without
  // this check, stale error counts from an expired cooldown cause the next
  // failure to escalate to a much longer cooldown (e.g. 1 min → 25 min).
  const unusableUntil = resolveProfileUnusableUntil(params.existing);
  const previousCooldownExpired = typeof unusableUntil === "number" && params.now >= unusableUntil;
  const shouldResetCounters = windowExpired || previousCooldownExpired;
  const baseErrorCount = shouldResetCounters ? 0 : (params.existing.errorCount ?? 0);
  const nextErrorCount = baseErrorCount + 1;
-  const failureCounts = windowExpired ? {} : { ...params.existing.failureCounts };
+  const failureCounts = shouldResetCounters ? {} : { ...params.existing.failureCounts };
  failureCounts[params.reason] = (failureCounts[params.reason] ?? 0) + 1;
  const updatedStats: ProfileUsageStats = {