fix(auth): reset cooldown error counters on expiry to prevent infinite escalation (#41028)

Merged via squash.

Prepared head SHA: 89bd83f09a
Co-authored-by: zerone0x <39543393+zerone0x@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
This commit is contained in:
zerone0x
2026-03-10 04:40:11 +08:00
committed by GitHub
parent 2b2e5e2038
commit 5f90883ad3
4 changed files with 77 additions and 5 deletions

View File

@@ -23,6 +23,7 @@ Docs: https://docs.openclaw.ai
- Agents/embedded logs: add structured, sanitized lifecycle and failover observation events so overload and provider failures are easier to tail and filter. (#41336) thanks @altaywtf.
- iOS/gateway foreground recovery: reconnect immediately on foreground return after stale background sockets are torn down, so the app no longer stays disconnected until a later wake path happens. (#41384) Thanks @mbelinky.
- Cron/subagent followup: do not misclassify empty or `NO_REPLY` cron responses as interim acknowledgements that need a rerun, so deliberately silent cron jobs are no longer retried. (#41383) thanks @jackal092927.
- Auth/cooldowns: reset expired auth-profile cooldown error counters before computing the next backoff so stale on-disk counters do not re-escalate into long cooldown loops after expiry. (#41028) thanks @zerone0x.
## 2026.3.8

View File

@@ -190,6 +190,58 @@ describe("markAuthProfileFailure", () => {
}
});
it("resets error count when previous cooldown has expired to prevent escalation", async () => {
const agentDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-auth-"));
try {
const authPath = path.join(agentDir, "auth-profiles.json");
const now = Date.now();
// Simulate state left on disk after 3 rapid failures within a 1-min cooldown
// window. The cooldown has since expired, but clearExpiredCooldowns() only
// ran in-memory and never persisted — so disk still carries errorCount: 3.
fs.writeFileSync(
authPath,
JSON.stringify({
version: 1,
profiles: {
"anthropic:default": {
type: "api_key",
provider: "anthropic",
key: "sk-default",
},
},
usageStats: {
"anthropic:default": {
errorCount: 3,
failureCounts: { rate_limit: 3 },
lastFailureAt: now - 120_000, // 2 minutes ago
cooldownUntil: now - 60_000, // expired 1 minute ago
},
},
}),
);
const store = ensureAuthProfileStore(agentDir);
await markAuthProfileFailure({
store,
profileId: "anthropic:default",
reason: "rate_limit",
agentDir,
});
const stats = store.usageStats?.["anthropic:default"];
// Error count should reset to 1 (not escalate to 4) because the
// previous cooldown expired. Cooldown should be ~1 min, not ~60 min.
expect(stats?.errorCount).toBe(1);
expect(stats?.failureCounts?.rate_limit).toBe(1);
const cooldownMs = (stats?.cooldownUntil ?? 0) - now;
// calculateAuthProfileCooldownMs(1) = 60_000 (1 minute)
expect(cooldownMs).toBeLessThan(120_000);
expect(cooldownMs).toBeGreaterThan(0);
} finally {
fs.rmSync(agentDir, { recursive: true, force: true });
}
});
it("does not persist cooldown windows for OpenRouter profiles", async () => {
await withAuthProfileStore(async ({ agentDir, store }) => {
await markAuthProfileFailure({

View File

@@ -608,6 +608,10 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
});
}
// When a cooldown/disabled window expires, the error count resets to prevent
// stale counters from escalating the next cooldown (the root cause of
// infinite cooldown loops — see #40989). The next failure should compute
// backoff from errorCount=1, not from the accumulated stale count.
const expiredWindowCases = [
{
label: "cooldownUntil",
@@ -617,7 +621,8 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
errorCount: 3,
lastFailureAt: now - 60_000,
}),
expectedUntil: (now: number) => now + 60 * 60 * 1000,
// errorCount resets → calculateAuthProfileCooldownMs(1) = 60_000
expectedUntil: (now: number) => now + 60_000,
readUntil: (stats: WindowStats | undefined) => stats?.cooldownUntil,
},
{
@@ -630,7 +635,9 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
failureCounts: { billing: 2 },
lastFailureAt: now - 60_000,
}),
expectedUntil: (now: number) => now + 20 * 60 * 60 * 1000,
// errorCount resets, billing count resets to 1 →
// calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h
expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000,
readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil,
},
{
@@ -643,7 +650,9 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
failureCounts: { auth_permanent: 2 },
lastFailureAt: now - 60_000,
}),
expectedUntil: (now: number) => now + 20 * 60 * 60 * 1000,
// errorCount resets, auth_permanent count resets to 1 →
// calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h
expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000,
readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil,
},
];

View File

@@ -400,9 +400,19 @@ function computeNextProfileUsageStats(params: {
params.existing.lastFailureAt > 0 &&
params.now - params.existing.lastFailureAt > windowMs;
const baseErrorCount = windowExpired ? 0 : (params.existing.errorCount ?? 0);
// If the previous cooldown has already expired, reset error counters so the
// profile gets a fresh backoff window. clearExpiredCooldowns() does this
// in-memory during profile ordering, but the on-disk state may still carry
// the old counters when the lock-based updater reads a fresh store. Without
// this check, stale error counts from an expired cooldown cause the next
// failure to escalate to a much longer cooldown (e.g. 1 min → 25 min).
const unusableUntil = resolveProfileUnusableUntil(params.existing);
const previousCooldownExpired = typeof unusableUntil === "number" && params.now >= unusableUntil;
const shouldResetCounters = windowExpired || previousCooldownExpired;
const baseErrorCount = shouldResetCounters ? 0 : (params.existing.errorCount ?? 0);
const nextErrorCount = baseErrorCount + 1;
const failureCounts = windowExpired ? {} : { ...params.existing.failureCounts };
const failureCounts = shouldResetCounters ? {} : { ...params.existing.failureCounts };
failureCounts[params.reason] = (failureCounts[params.reason] ?? 0) + 1;
const updatedStats: ProfileUsageStats = {