mirror of
https://github.com/openclaw/openclaw.git
synced 2026-03-12 07:20:45 +00:00
fix(auth): reset cooldown error counters on expiry to prevent infinite escalation (#41028)
Merged via squash.
Prepared head SHA: 89bd83f09a
Co-authored-by: zerone0x <39543393+zerone0x@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
This commit is contained in:
@@ -23,6 +23,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Agents/embedded logs: add structured, sanitized lifecycle and failover observation events so overload and provider failures are easier to tail and filter. (#41336) thanks @altaywtf.
|
||||
- iOS/gateway foreground recovery: reconnect immediately on foreground return after stale background sockets are torn down, so the app no longer stays disconnected until a later wake path happens. (#41384) Thanks @mbelinky.
|
||||
- Cron/subagent followup: do not misclassify empty or `NO_REPLY` cron responses as interim acknowledgements that need a rerun, so deliberately silent cron jobs are no longer retried. (#41383) thanks @jackal092927.
|
||||
- Auth/cooldowns: reset expired auth-profile cooldown error counters before computing the next backoff so stale on-disk counters do not re-escalate into long cooldown loops after expiry. (#41028) thanks @zerone0x.
|
||||
|
||||
## 2026.3.8
|
||||
|
||||
|
||||
@@ -190,6 +190,58 @@ describe("markAuthProfileFailure", () => {
|
||||
}
|
||||
});
|
||||
|
||||
it("resets error count when previous cooldown has expired to prevent escalation", async () => {
|
||||
const agentDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-auth-"));
|
||||
try {
|
||||
const authPath = path.join(agentDir, "auth-profiles.json");
|
||||
const now = Date.now();
|
||||
// Simulate state left on disk after 3 rapid failures within a 1-min cooldown
|
||||
// window. The cooldown has since expired, but clearExpiredCooldowns() only
|
||||
// ran in-memory and never persisted — so disk still carries errorCount: 3.
|
||||
fs.writeFileSync(
|
||||
authPath,
|
||||
JSON.stringify({
|
||||
version: 1,
|
||||
profiles: {
|
||||
"anthropic:default": {
|
||||
type: "api_key",
|
||||
provider: "anthropic",
|
||||
key: "sk-default",
|
||||
},
|
||||
},
|
||||
usageStats: {
|
||||
"anthropic:default": {
|
||||
errorCount: 3,
|
||||
failureCounts: { rate_limit: 3 },
|
||||
lastFailureAt: now - 120_000, // 2 minutes ago
|
||||
cooldownUntil: now - 60_000, // expired 1 minute ago
|
||||
},
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const store = ensureAuthProfileStore(agentDir);
|
||||
await markAuthProfileFailure({
|
||||
store,
|
||||
profileId: "anthropic:default",
|
||||
reason: "rate_limit",
|
||||
agentDir,
|
||||
});
|
||||
|
||||
const stats = store.usageStats?.["anthropic:default"];
|
||||
// Error count should reset to 1 (not escalate to 4) because the
|
||||
// previous cooldown expired. Cooldown should be ~1 min, not ~60 min.
|
||||
expect(stats?.errorCount).toBe(1);
|
||||
expect(stats?.failureCounts?.rate_limit).toBe(1);
|
||||
const cooldownMs = (stats?.cooldownUntil ?? 0) - now;
|
||||
// calculateAuthProfileCooldownMs(1) = 60_000 (1 minute)
|
||||
expect(cooldownMs).toBeLessThan(120_000);
|
||||
expect(cooldownMs).toBeGreaterThan(0);
|
||||
} finally {
|
||||
fs.rmSync(agentDir, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
it("does not persist cooldown windows for OpenRouter profiles", async () => {
|
||||
await withAuthProfileStore(async ({ agentDir, store }) => {
|
||||
await markAuthProfileFailure({
|
||||
|
||||
@@ -608,6 +608,10 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
|
||||
});
|
||||
}
|
||||
|
||||
// When a cooldown/disabled window expires, the error count resets to prevent
|
||||
// stale counters from escalating the next cooldown (the root cause of
|
||||
// infinite cooldown loops — see #40989). The next failure should compute
|
||||
// backoff from errorCount=1, not from the accumulated stale count.
|
||||
const expiredWindowCases = [
|
||||
{
|
||||
label: "cooldownUntil",
|
||||
@@ -617,7 +621,8 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
|
||||
errorCount: 3,
|
||||
lastFailureAt: now - 60_000,
|
||||
}),
|
||||
expectedUntil: (now: number) => now + 60 * 60 * 1000,
|
||||
// errorCount resets → calculateAuthProfileCooldownMs(1) = 60_000
|
||||
expectedUntil: (now: number) => now + 60_000,
|
||||
readUntil: (stats: WindowStats | undefined) => stats?.cooldownUntil,
|
||||
},
|
||||
{
|
||||
@@ -630,7 +635,9 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
|
||||
failureCounts: { billing: 2 },
|
||||
lastFailureAt: now - 60_000,
|
||||
}),
|
||||
expectedUntil: (now: number) => now + 20 * 60 * 60 * 1000,
|
||||
// errorCount resets, billing count resets to 1 →
|
||||
// calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h
|
||||
expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000,
|
||||
readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil,
|
||||
},
|
||||
{
|
||||
@@ -643,7 +650,9 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
|
||||
failureCounts: { auth_permanent: 2 },
|
||||
lastFailureAt: now - 60_000,
|
||||
}),
|
||||
expectedUntil: (now: number) => now + 20 * 60 * 60 * 1000,
|
||||
// errorCount resets, auth_permanent count resets to 1 →
|
||||
// calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h
|
||||
expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000,
|
||||
readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil,
|
||||
},
|
||||
];
|
||||
|
||||
@@ -400,9 +400,19 @@ function computeNextProfileUsageStats(params: {
|
||||
params.existing.lastFailureAt > 0 &&
|
||||
params.now - params.existing.lastFailureAt > windowMs;
|
||||
|
||||
const baseErrorCount = windowExpired ? 0 : (params.existing.errorCount ?? 0);
|
||||
// If the previous cooldown has already expired, reset error counters so the
|
||||
// profile gets a fresh backoff window. clearExpiredCooldowns() does this
|
||||
// in-memory during profile ordering, but the on-disk state may still carry
|
||||
// the old counters when the lock-based updater reads a fresh store. Without
|
||||
// this check, stale error counts from an expired cooldown cause the next
|
||||
// failure to escalate to a much longer cooldown (e.g. 1 min → 25 min).
|
||||
const unusableUntil = resolveProfileUnusableUntil(params.existing);
|
||||
const previousCooldownExpired = typeof unusableUntil === "number" && params.now >= unusableUntil;
|
||||
|
||||
const shouldResetCounters = windowExpired || previousCooldownExpired;
|
||||
const baseErrorCount = shouldResetCounters ? 0 : (params.existing.errorCount ?? 0);
|
||||
const nextErrorCount = baseErrorCount + 1;
|
||||
const failureCounts = windowExpired ? {} : { ...params.existing.failureCounts };
|
||||
const failureCounts = shouldResetCounters ? {} : { ...params.existing.failureCounts };
|
||||
failureCounts[params.reason] = (failureCounts[params.reason] ?? 0) + 1;
|
||||
|
||||
const updatedStats: ProfileUsageStats = {
|
||||
|
||||
Reference in New Issue
Block a user