mirror of
https://github.com/openclaw/openclaw.git
synced 2026-03-12 07:20:45 +00:00
fix(auth): reset cooldown error counters on expiry to prevent infinite escalation (#41028)
Merged via squash.
Prepared head SHA: 89bd83f09a
Co-authored-by: zerone0x <39543393+zerone0x@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
This commit is contained in:
@@ -23,6 +23,7 @@ Docs: https://docs.openclaw.ai
|
|||||||
- Agents/embedded logs: add structured, sanitized lifecycle and failover observation events so overload and provider failures are easier to tail and filter. (#41336) thanks @altaywtf.
|
- Agents/embedded logs: add structured, sanitized lifecycle and failover observation events so overload and provider failures are easier to tail and filter. (#41336) thanks @altaywtf.
|
||||||
- iOS/gateway foreground recovery: reconnect immediately on foreground return after stale background sockets are torn down, so the app no longer stays disconnected until a later wake path happens. (#41384) Thanks @mbelinky.
|
- iOS/gateway foreground recovery: reconnect immediately on foreground return after stale background sockets are torn down, so the app no longer stays disconnected until a later wake path happens. (#41384) Thanks @mbelinky.
|
||||||
- Cron/subagent followup: do not misclassify empty or `NO_REPLY` cron responses as interim acknowledgements that need a rerun, so deliberately silent cron jobs are no longer retried. (#41383) thanks @jackal092927.
|
- Cron/subagent followup: do not misclassify empty or `NO_REPLY` cron responses as interim acknowledgements that need a rerun, so deliberately silent cron jobs are no longer retried. (#41383) thanks @jackal092927.
|
||||||
|
- Auth/cooldowns: reset expired auth-profile cooldown error counters before computing the next backoff so stale on-disk counters do not re-escalate into long cooldown loops after expiry. (#41028) thanks @zerone0x.
|
||||||
|
|
||||||
## 2026.3.8
|
## 2026.3.8
|
||||||
|
|
||||||
|
|||||||
@@ -190,6 +190,58 @@ describe("markAuthProfileFailure", () => {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("resets error count when previous cooldown has expired to prevent escalation", async () => {
|
||||||
|
const agentDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-auth-"));
|
||||||
|
try {
|
||||||
|
const authPath = path.join(agentDir, "auth-profiles.json");
|
||||||
|
const now = Date.now();
|
||||||
|
// Simulate state left on disk after 3 rapid failures within a 1-min cooldown
|
||||||
|
// window. The cooldown has since expired, but clearExpiredCooldowns() only
|
||||||
|
// ran in-memory and never persisted — so disk still carries errorCount: 3.
|
||||||
|
fs.writeFileSync(
|
||||||
|
authPath,
|
||||||
|
JSON.stringify({
|
||||||
|
version: 1,
|
||||||
|
profiles: {
|
||||||
|
"anthropic:default": {
|
||||||
|
type: "api_key",
|
||||||
|
provider: "anthropic",
|
||||||
|
key: "sk-default",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
usageStats: {
|
||||||
|
"anthropic:default": {
|
||||||
|
errorCount: 3,
|
||||||
|
failureCounts: { rate_limit: 3 },
|
||||||
|
lastFailureAt: now - 120_000, // 2 minutes ago
|
||||||
|
cooldownUntil: now - 60_000, // expired 1 minute ago
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
const store = ensureAuthProfileStore(agentDir);
|
||||||
|
await markAuthProfileFailure({
|
||||||
|
store,
|
||||||
|
profileId: "anthropic:default",
|
||||||
|
reason: "rate_limit",
|
||||||
|
agentDir,
|
||||||
|
});
|
||||||
|
|
||||||
|
const stats = store.usageStats?.["anthropic:default"];
|
||||||
|
// Error count should reset to 1 (not escalate to 4) because the
|
||||||
|
// previous cooldown expired. Cooldown should be ~1 min, not ~60 min.
|
||||||
|
expect(stats?.errorCount).toBe(1);
|
||||||
|
expect(stats?.failureCounts?.rate_limit).toBe(1);
|
||||||
|
const cooldownMs = (stats?.cooldownUntil ?? 0) - now;
|
||||||
|
// calculateAuthProfileCooldownMs(1) = 60_000 (1 minute)
|
||||||
|
expect(cooldownMs).toBeLessThan(120_000);
|
||||||
|
expect(cooldownMs).toBeGreaterThan(0);
|
||||||
|
} finally {
|
||||||
|
fs.rmSync(agentDir, { recursive: true, force: true });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
it("does not persist cooldown windows for OpenRouter profiles", async () => {
|
it("does not persist cooldown windows for OpenRouter profiles", async () => {
|
||||||
await withAuthProfileStore(async ({ agentDir, store }) => {
|
await withAuthProfileStore(async ({ agentDir, store }) => {
|
||||||
await markAuthProfileFailure({
|
await markAuthProfileFailure({
|
||||||
|
|||||||
@@ -608,6 +608,10 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// When a cooldown/disabled window expires, the error count resets to prevent
|
||||||
|
// stale counters from escalating the next cooldown (the root cause of
|
||||||
|
// infinite cooldown loops — see #40989). The next failure should compute
|
||||||
|
// backoff from errorCount=1, not from the accumulated stale count.
|
||||||
const expiredWindowCases = [
|
const expiredWindowCases = [
|
||||||
{
|
{
|
||||||
label: "cooldownUntil",
|
label: "cooldownUntil",
|
||||||
@@ -617,7 +621,8 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
|
|||||||
errorCount: 3,
|
errorCount: 3,
|
||||||
lastFailureAt: now - 60_000,
|
lastFailureAt: now - 60_000,
|
||||||
}),
|
}),
|
||||||
expectedUntil: (now: number) => now + 60 * 60 * 1000,
|
// errorCount resets → calculateAuthProfileCooldownMs(1) = 60_000
|
||||||
|
expectedUntil: (now: number) => now + 60_000,
|
||||||
readUntil: (stats: WindowStats | undefined) => stats?.cooldownUntil,
|
readUntil: (stats: WindowStats | undefined) => stats?.cooldownUntil,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -630,7 +635,9 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
|
|||||||
failureCounts: { billing: 2 },
|
failureCounts: { billing: 2 },
|
||||||
lastFailureAt: now - 60_000,
|
lastFailureAt: now - 60_000,
|
||||||
}),
|
}),
|
||||||
expectedUntil: (now: number) => now + 20 * 60 * 60 * 1000,
|
// errorCount resets, billing count resets to 1 →
|
||||||
|
// calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h
|
||||||
|
expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000,
|
||||||
readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil,
|
readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -643,7 +650,9 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
|
|||||||
failureCounts: { auth_permanent: 2 },
|
failureCounts: { auth_permanent: 2 },
|
||||||
lastFailureAt: now - 60_000,
|
lastFailureAt: now - 60_000,
|
||||||
}),
|
}),
|
||||||
expectedUntil: (now: number) => now + 20 * 60 * 60 * 1000,
|
// errorCount resets, auth_permanent count resets to 1 →
|
||||||
|
// calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h
|
||||||
|
expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000,
|
||||||
readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil,
|
readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil,
|
||||||
},
|
},
|
||||||
];
|
];
|
||||||
|
|||||||
@@ -400,9 +400,19 @@ function computeNextProfileUsageStats(params: {
|
|||||||
params.existing.lastFailureAt > 0 &&
|
params.existing.lastFailureAt > 0 &&
|
||||||
params.now - params.existing.lastFailureAt > windowMs;
|
params.now - params.existing.lastFailureAt > windowMs;
|
||||||
|
|
||||||
const baseErrorCount = windowExpired ? 0 : (params.existing.errorCount ?? 0);
|
// If the previous cooldown has already expired, reset error counters so the
|
||||||
|
// profile gets a fresh backoff window. clearExpiredCooldowns() does this
|
||||||
|
// in-memory during profile ordering, but the on-disk state may still carry
|
||||||
|
// the old counters when the lock-based updater reads a fresh store. Without
|
||||||
|
// this check, stale error counts from an expired cooldown cause the next
|
||||||
|
// failure to escalate to a much longer cooldown (e.g. 1 min → 25 min).
|
||||||
|
const unusableUntil = resolveProfileUnusableUntil(params.existing);
|
||||||
|
const previousCooldownExpired = typeof unusableUntil === "number" && params.now >= unusableUntil;
|
||||||
|
|
||||||
|
const shouldResetCounters = windowExpired || previousCooldownExpired;
|
||||||
|
const baseErrorCount = shouldResetCounters ? 0 : (params.existing.errorCount ?? 0);
|
||||||
const nextErrorCount = baseErrorCount + 1;
|
const nextErrorCount = baseErrorCount + 1;
|
||||||
const failureCounts = windowExpired ? {} : { ...params.existing.failureCounts };
|
const failureCounts = shouldResetCounters ? {} : { ...params.existing.failureCounts };
|
||||||
failureCounts[params.reason] = (failureCounts[params.reason] ?? 0) + 1;
|
failureCounts[params.reason] = (failureCounts[params.reason] ?? 0) + 1;
|
||||||
|
|
||||||
const updatedStats: ProfileUsageStats = {
|
const updatedStats: ProfileUsageStats = {
|
||||||
|
|||||||
Reference in New Issue
Block a user