Files
openclaw/src/agents/auth-profiles/usage.ts
kiranvk2011 84401223c7 fix: per-model cooldown scope, stepped backoff, and user-facing rate-limit message (#49834)
Merged via squash.

Prepared head SHA: 7c488c070c
Co-authored-by: kiranvk-2011 <91108465+kiranvk-2011@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
2026-03-25 22:03:49 +03:00

701 lines
22 KiB
TypeScript

import type { OpenClawConfig } from "../../config/config.js";
import { normalizeProviderId } from "../model-selection.js";
import { logAuthProfileFailureStateChange } from "./state-observation.js";
import { saveAuthProfileStore, updateAuthProfileStoreWithLock } from "./store.js";
import type { AuthProfileFailureReason, AuthProfileStore, ProfileUsageStats } from "./types.js";
/**
 * Mutable table of the store-persistence functions used by this module.
 * Calls go through this object (rather than the imports directly) so that
 * `__testing.setDepsForTest` can substitute replacements.
 */
const authProfileUsageDeps: {
  saveAuthProfileStore: typeof saveAuthProfileStore;
  updateAuthProfileStoreWithLock: typeof updateAuthProfileStoreWithLock;
} = { saveAuthProfileStore, updateAuthProfileStoreWithLock };
/** Test-only hooks for this module. */
export const __testing = {
  /**
   * Replace the persistence functions used by this module.
   *
   * Any function missing from `overrides` is restored to the real import;
   * passing `null` therefore restores both.
   */
  setDepsForTest(
    overrides: Partial<{
      saveAuthProfileStore: typeof saveAuthProfileStore;
      updateAuthProfileStoreWithLock: typeof updateAuthProfileStoreWithLock;
    }> | null,
  ) {
    const requestedSave = overrides ? overrides.saveAuthProfileStore : undefined;
    const requestedLockedUpdate = overrides
      ? overrides.updateAuthProfileStoreWithLock
      : undefined;
    authProfileUsageDeps.saveAuthProfileStore = requestedSave ?? saveAuthProfileStore;
    authProfileUsageDeps.updateAuthProfileStoreWithLock =
      requestedLockedUpdate ?? updateAuthProfileStoreWithLock;
  },
};
/**
 * Recognised failure reasons. Array position doubles as tie-break priority:
 * earlier entries win when two reasons end up with the same score.
 */
const FAILURE_REASON_PRIORITY: AuthProfileFailureReason[] = [
  "auth_permanent",
  "auth",
  "billing",
  "format",
  "model_not_found",
  "overloaded",
  "timeout",
  "rate_limit",
  "unknown",
];
/** Membership lookup used to validate reason strings. */
const FAILURE_REASON_SET = new Set<AuthProfileFailureReason>(FAILURE_REASON_PRIORITY);
/** Reason → index in `FAILURE_REASON_PRIORITY` (lower index = higher priority). */
const FAILURE_REASON_ORDER = new Map<AuthProfileFailureReason, number>();
FAILURE_REASON_PRIORITY.forEach((reason, index) => {
  FAILURE_REASON_ORDER.set(reason, index);
});
/**
 * Whether cooldown tracking is skipped for the given provider.
 *
 * The provider id is normalised first; a missing provider is normalised as
 * the empty string.
 */
function isAuthCooldownBypassedForProvider(provider: string | undefined): boolean {
  const providerId = normalizeProviderId(provider ?? "");
  switch (providerId) {
    case "openrouter":
    case "kilocode":
      return true;
    default:
      return false;
  }
}
/**
 * Latest timestamp until which a profile is blocked, considering both the
 * cooldown and the disable window.
 *
 * Only finite, strictly positive numbers count as recorded windows.
 *
 * @returns The larger of the valid `cooldownUntil` / `disabledUntil` values,
 *   or `null` when neither is valid. The value is not compared to the current
 *   time, so it may already be in the past.
 */
export function resolveProfileUnusableUntil(
  stats: Pick<ProfileUsageStats, "cooldownUntil" | "disabledUntil">,
): number | null {
  let latest: number | null = null;
  for (const candidate of [stats.cooldownUntil, stats.disabledUntil]) {
    if (typeof candidate !== "number" || !Number.isFinite(candidate) || candidate <= 0) {
      continue;
    }
    if (latest === null || candidate > latest) {
      latest = candidate;
    }
  }
  return latest;
}
/**
 * Report whether a profile is currently blocked by a cooldown or disable window.
 *
 * @param store - Auth profile store holding profiles and usage stats.
 * @param profileId - Profile to check.
 * @param now - Reference time in ms since epoch; defaults to `Date.now()`.
 * @param forModel - Model the caller intends to use. When the recorded
 *   cooldown is a rate limit scoped to a different model, the cooldown is
 *   ignored, unless a disable window is still active.
 * @returns `true` while the profile must not be used.
 */
export function isProfileInCooldown(
  store: AuthProfileStore,
  profileId: string,
  now?: number,
  forModel?: string,
): boolean {
  const provider = store.profiles[profileId]?.provider;
  if (isAuthCooldownBypassedForProvider(provider)) {
    return false;
  }

  const stats = store.usageStats?.[profileId];
  if (!stats) {
    return false;
  }

  const currentTime = now ?? Date.now();

  // A rate limit recorded against another model does not block this request.
  // Active disable windows are profile-wide and are still honoured by the
  // bypass helper.
  if (shouldBypassModelScopedCooldown(stats, currentTime, forModel)) {
    return false;
  }

  const blockedUntil = resolveProfileUnusableUntil(stats);
  if (!blockedUntil) {
    return false;
  }
  return currentTime < blockedUntil;
}
/**
 * Whether `until` is a valid window end (finite, positive number) that lies
 * strictly after `now`.
 */
function isActiveUnusableWindow(until: number | undefined, now: number): boolean {
  if (typeof until !== "number" || !Number.isFinite(until)) {
    return false;
  }
  return until > 0 && now < until;
}
/**
 * Pick the most likely reason why the given profiles are all unavailable.
 *
 * Scoring per profile:
 * - an active disable window with a recognised `disabledReason` contributes
 *   1000 to that reason and nothing else is considered for the profile;
 * - otherwise, if a cooldown window is active, every recognised entry in
 *   `failureCounts` contributes its count; when none qualifies, `"unknown"`
 *   receives a score of 1.
 *
 * The highest score wins; ties resolve in `FAILURE_REASON_PRIORITY` order.
 *
 * @returns The winning reason, or `null` when no profile contributed a score.
 */
export function resolveProfilesUnavailableReason(params: {
  store: AuthProfileStore;
  profileIds: string[];
  now?: number;
}): AuthProfileFailureReason | null {
  const { store, profileIds } = params;
  const currentTime = params.now ?? Date.now();
  const tally = new Map<AuthProfileFailureReason, number>();

  // Accumulate a score; silently ignores unknown reasons and non-positive or
  // non-finite weights.
  const bump = (reason: AuthProfileFailureReason, weight: number): void => {
    if (FAILURE_REASON_SET.has(reason) && weight > 0 && Number.isFinite(weight)) {
      tally.set(reason, (tally.get(reason) ?? 0) + weight);
    }
  };

  for (const profileId of profileIds) {
    const stats = store.usageStats?.[profileId];
    if (!stats) {
      continue;
    }

    const disableWindowActive = isActiveUnusableWindow(stats.disabledUntil, currentTime);
    if (
      disableWindowActive &&
      stats.disabledReason &&
      FAILURE_REASON_SET.has(stats.disabledReason)
    ) {
      // Explicit disable reasons are high-signal, so they dominate the tally.
      bump(stats.disabledReason, 1_000);
      continue;
    }

    if (!isActiveUnusableWindow(stats.cooldownUntil, currentTime)) {
      continue;
    }

    let sawClassifiedFailure = false;
    const recordedCounts = Object.entries(stats.failureCounts ?? {});
    for (const [rawReason, rawCount] of recordedCounts) {
      const reason = rawReason as AuthProfileFailureReason;
      const count = typeof rawCount === "number" ? rawCount : 0;
      if (FAILURE_REASON_SET.has(reason) && !(count <= 0)) {
        bump(reason, count);
        sawClassifiedFailure = true;
      }
    }

    if (!sawClassifiedFailure) {
      // Active cooldown without any classified failure count: report it as
      // "unknown" instead of guessing "rate_limit", which would produce
      // misleading rate-limit warnings.
      bump("unknown", 1);
    }
  }

  if (tally.size === 0) {
    return null;
  }

  let winner: AuthProfileFailureReason | null = null;
  let winnerScore = -1;
  let winnerRank = Number.MAX_SAFE_INTEGER;
  for (const candidate of FAILURE_REASON_PRIORITY) {
    const score = tally.get(candidate);
    if (typeof score !== "number") {
      continue;
    }
    const rank = FAILURE_REASON_ORDER.get(candidate) ?? Number.MAX_SAFE_INTEGER;
    const beatsCurrent = score > winnerScore || (score === winnerScore && rank < winnerRank);
    if (beatsCurrent) {
      winner = candidate;
      winnerScore = score;
      winnerRank = rank;
    }
  }
  return winner;
}
/**
 * Find the earliest `unusableUntil` timestamp (ms since epoch) across the
 * given profiles.
 *
 * Profiles without usage stats are skipped, as are profiles whose cooldown is
 * a model-scoped rate limit that does not apply to `options.forModel`.
 *
 * @param options.now - Reference time used for the model-scope check;
 *   defaults to `Date.now()`.
 * @param options.forModel - Model the caller intends to use.
 * @returns The smallest recorded timestamp, or `null` when no profile has one.
 *   The timestamp is not filtered against `now`, so it may lie in the past.
 */
export function getSoonestCooldownExpiry(
  store: AuthProfileStore,
  profileIds: string[],
  options?: { now?: number; forModel?: string },
): number | null {
  const currentTime = options?.now ?? Date.now();
  const forModel = options?.forModel;
  let earliest: number | null = null;

  for (const profileId of profileIds) {
    const stats = store.usageStats?.[profileId];
    if (!stats || shouldBypassModelScopedCooldown(stats, currentTime, forModel)) {
      continue;
    }
    const until = resolveProfileUnusableUntil(stats);
    if (typeof until === "number" && Number.isFinite(until) && until > 0) {
      earliest = earliest === null ? until : Math.min(earliest, until);
    }
  }

  return earliest;
}
/**
 * Decide whether a recorded cooldown can be ignored for `forModel`.
 *
 * The bypass applies only when all of the following hold: the caller named a
 * model, the cooldown reason is `"rate_limit"`, the cooldown is scoped to a
 * model other than `forModel`, and no disable window is currently active.
 */
function shouldBypassModelScopedCooldown(
  stats: Pick<ProfileUsageStats, "cooldownReason" | "cooldownModel" | "disabledUntil">,
  now: number,
  forModel?: string,
): boolean {
  if (!forModel) {
    return false;
  }
  if (stats.cooldownReason !== "rate_limit") {
    return false;
  }
  if (!stats.cooldownModel || stats.cooldownModel === forModel) {
    return false;
  }
  // Disable windows are profile-wide and always win over model scoping.
  return !isActiveUnusableWindow(stats.disabledUntil, now);
}
/**
 * Drop cooldown / disable windows that have already elapsed.
 *
 * For every profile, an elapsed `cooldownUntil` is removed together with its
 * reason and model scope, and an elapsed `disabledUntil` is removed together
 * with its reason. The two windows are evaluated independently. Once a
 * profile has no remaining window, `errorCount` is zeroed and `failureCounts`
 * is dropped so that the next transient failure starts from the first backoff
 * step instead of escalating from stale counters. `lastFailureAt` is left
 * untouched.
 *
 * Only the in-memory store is mutated; nothing is persisted here.
 *
 * @param now - Reference time in ms since epoch; defaults to `Date.now()`.
 * @returns `true` if any profile was modified.
 */
export function clearExpiredCooldowns(store: AuthProfileStore, now?: number): boolean {
  const usageStats = store.usageStats;
  if (!usageStats) {
    return false;
  }

  const currentTime = now ?? Date.now();
  // A window has lapsed when it is a valid timestamp that is not in the future.
  const hasLapsed = (until: number | undefined): boolean =>
    typeof until === "number" && Number.isFinite(until) && until > 0 && currentTime >= until;

  let anyChanged = false;
  for (const [profileId, stats] of Object.entries(usageStats)) {
    if (!stats) {
      continue;
    }

    // Evaluate both windows before mutating anything.
    const cooldownLapsed = hasLapsed(stats.cooldownUntil);
    const disableLapsed = hasLapsed(stats.disabledUntil);
    if (!cooldownLapsed && !disableLapsed) {
      continue;
    }

    if (cooldownLapsed) {
      stats.cooldownUntil = undefined;
      stats.cooldownReason = undefined;
      stats.cooldownModel = undefined;
    }
    if (disableLapsed) {
      stats.disabledUntil = undefined;
      stats.disabledReason = undefined;
    }

    // Counters are reset only when no window at all remains on the profile.
    if (!resolveProfileUnusableUntil(stats)) {
      stats.errorCount = 0;
      stats.failureCounts = undefined;
    }

    usageStats[profileId] = stats;
    anyChanged = true;
  }

  return anyChanged;
}
/**
 * Record a successful use of a profile: clears error/cooldown/disable state
 * and stamps `lastUsed` with the current time.
 *
 * The update is first attempted through the locked store updater. When that
 * yields a store, the caller's `store.usageStats` is replaced with the
 * updated stats. Otherwise the passed-in store is updated directly and saved,
 * provided it still contains the profile.
 */
export async function markAuthProfileUsed(params: {
  store: AuthProfileStore;
  profileId: string;
  agentDir?: string;
}): Promise<void> {
  const { store, profileId, agentDir } = params;

  // Applies the "success" reset to whichever store instance is being written.
  const recordSuccess = (target: AuthProfileStore): void => {
    updateUsageStatsEntry(target, profileId, (previous) =>
      resetUsageStats(previous, { lastUsed: Date.now() }),
    );
  };

  const lockedResult = await authProfileUsageDeps.updateAuthProfileStoreWithLock({
    agentDir,
    updater: (freshStore) => {
      if (!freshStore.profiles[profileId]) {
        return false;
      }
      recordSuccess(freshStore);
      return true;
    },
  });

  if (lockedResult) {
    store.usageStats = lockedResult.usageStats;
    return;
  }

  // Fallback: no store came back from the locked update.
  if (!store.profiles[profileId]) {
    return;
  }
  recordSuccess(store);
  authProfileUsageDeps.saveAuthProfileStore(store, agentDir);
}
/**
 * Stepped cooldown duration for transient failures.
 *
 * @param errorCount - Consecutive error count; values below 1 are treated as 1.
 * @returns 30 seconds for the first error, 1 minute for the second, and
 *   5 minutes (the cap) for anything beyond that.
 */
export function calculateAuthProfileCooldownMs(errorCount: number): number {
  const step = Math.max(1, errorCount);
  // [inclusive upper bound on the error count, cooldown in ms]
  const tiers: ReadonlyArray<readonly [number, number]> = [
    [1, 30_000],
    [2, 60_000],
  ];
  for (const [ceiling, cooldownMs] of tiers) {
    if (step <= ceiling) {
      return cooldownMs;
    }
  }
  return 5 * 60_000;
}
/**
 * Auth cooldown settings after defaults have been applied and hour-based
 * configuration values have been converted to milliseconds.
 */
type ResolvedAuthCooldownConfig = {
// Base duration of a billing / permanent-auth disable window, in ms.
billingBackoffMs: number;
// Upper bound for a billing / permanent-auth disable window, in ms.
billingMaxMs: number;
// Age (ms) of the last failure beyond which error counters start over.
failureWindowMs: number;
};
/**
 * Resolve the cooldown configuration for a provider, in milliseconds.
 *
 * Hour values come from `cfg.auth.cooldowns`. Anything that is not a finite,
 * positive number falls back to the built-in default (5 h billing backoff,
 * 24 h billing maximum, 24 h failure window). The billing backoff may be
 * overridden per provider through `billingBackoffHoursByProvider`; map keys
 * are normalised before being compared with `params.providerId`, and the
 * first matching key wins.
 */
function resolveAuthCooldownConfig(params: {
  cfg?: OpenClawConfig;
  providerId: string;
}): ResolvedAuthCooldownConfig {
  const DEFAULT_BILLING_BACKOFF_HOURS = 5;
  const DEFAULT_BILLING_MAX_HOURS = 24;
  const DEFAULT_FAILURE_WINDOW_HOURS = 24;

  const positiveHoursOr = (candidate: unknown, fallback: number): number => {
    if (typeof candidate === "number" && Number.isFinite(candidate) && candidate > 0) {
      return candidate;
    }
    return fallback;
  };
  const hoursToMs = (hours: number): number => hours * 60 * 60 * 1000;

  const cooldowns = params.cfg?.auth?.cooldowns;
  const overrideMap = cooldowns?.billingBackoffHoursByProvider;
  const providerOverride = overrideMap
    ? Object.entries(overrideMap).find(
        ([key]) => normalizeProviderId(key) === params.providerId,
      )?.[1]
    : undefined;

  const billingBackoffHours = positiveHoursOr(
    providerOverride ?? cooldowns?.billingBackoffHours,
    DEFAULT_BILLING_BACKOFF_HOURS,
  );
  const billingMaxHours = positiveHoursOr(cooldowns?.billingMaxHours, DEFAULT_BILLING_MAX_HOURS);
  const failureWindowHours = positiveHoursOr(
    cooldowns?.failureWindowHours,
    DEFAULT_FAILURE_WINDOW_HOURS,
  );

  return {
    billingBackoffMs: hoursToMs(billingBackoffHours),
    billingMaxMs: hoursToMs(billingMaxHours),
    failureWindowMs: hoursToMs(failureWindowHours),
  };
}
/**
 * Exponential disable duration for billing / permanent-auth failures.
 *
 * The base is floored at one minute and doubles with each error after the
 * first, for at most ten doublings. The result never exceeds `maxMs`, which
 * is itself raised to the (floored) base when it is smaller.
 *
 * @param params.errorCount - Failure count; values below 1 are treated as 1.
 * @param params.baseMs - Base disable duration in ms.
 * @param params.maxMs - Maximum disable duration in ms.
 */
function calculateAuthProfileBillingDisableMsWithConfig(params: {
  errorCount: number;
  baseMs: number;
  maxMs: number;
}): number {
  const { errorCount, baseMs, maxMs } = params;
  const floorMs = Math.max(60_000, baseMs);
  const ceilingMs = Math.max(floorMs, maxMs);
  const doublings = Math.min(Math.max(1, errorCount) - 1, 10);
  return Math.min(ceilingMs, floorMs * 2 ** doublings);
}
/**
 * Timestamp until which a profile should be shown as unusable.
 *
 * @returns `null` for providers that bypass cooldown tracking and for
 *   profiles without usage stats; otherwise the result of
 *   `resolveProfileUnusableUntil` for the profile's stats.
 */
export function resolveProfileUnusableUntilForDisplay(
  store: AuthProfileStore,
  profileId: string,
): number | null {
  const provider = store.profiles[profileId]?.provider;
  const stats = isAuthCooldownBypassedForProvider(provider)
    ? undefined
    : store.usageStats?.[profileId];
  return stats ? resolveProfileUnusableUntil(stats) : null;
}
/**
 * Build a copy of `existing` with all failure state wiped.
 *
 * `errorCount` becomes 0 and the cooldown, disable and failure-count fields
 * are set to `undefined`. Fields not listed here (for example `lastUsed` or
 * `lastFailureAt`) are carried over. `overrides` is applied last and can
 * replace any field. The input object is not mutated.
 */
function resetUsageStats(
  existing: ProfileUsageStats | undefined,
  overrides?: Partial<ProfileUsageStats>,
): ProfileUsageStats {
  const clearedFailureState = {
    errorCount: 0,
    cooldownUntil: undefined,
    cooldownReason: undefined,
    cooldownModel: undefined,
    disabledUntil: undefined,
    disabledReason: undefined,
    failureCounts: undefined,
  };
  return { ...existing, ...clearedFailureState, ...overrides };
}
/**
 * Replace the usage stats of one profile with the value returned by `updater`.
 *
 * Creates `store.usageStats` when it does not exist yet. `updater` receives
 * the current entry, or `undefined` when the profile has none.
 */
function updateUsageStatsEntry(
  store: AuthProfileStore,
  profileId: string,
  updater: (existing: ProfileUsageStats | undefined) => ProfileUsageStats,
): void {
  const statsByProfile = store.usageStats ?? {};
  // Attach the table before invoking the updater so the store is initialised
  // even if the updater throws.
  store.usageStats = statsByProfile;
  statsByProfile[profileId] = updater(statsByProfile[profileId]);
}
/**
 * Choose the end timestamp for a cooldown / disable window.
 *
 * An existing window that is still running (finite and later than `now`) is
 * kept unchanged; otherwise the freshly computed `recomputedUntil` is used.
 */
function keepActiveWindowOrRecompute(params: {
  existingUntil: number | undefined;
  now: number;
  recomputedUntil: number;
}): number {
  const { existingUntil, now, recomputedUntil } = params;
  if (typeof existingUntil !== "number") {
    return recomputedUntil;
  }
  const stillRunning = Number.isFinite(existingUntil) && existingUntil > now;
  return stillRunning ? existingUntil : recomputedUntil;
}
/**
 * Compute the usage stats that result from recording one more failure.
 *
 * Counters (`errorCount`, `failureCounts`) start over when the last failure is
 * older than the configured failure window, or when the previously recorded
 * cooldown / disable window has already ended.
 *
 * - `"billing"` and `"auth_permanent"` set the disable window
 *   (`disabledUntil` / `disabledReason`) using exponential backoff based on
 *   the per-reason failure count.
 * - Every other reason sets the cooldown window (`cooldownUntil`,
 *   `cooldownReason`, `cooldownModel`) using the stepped backoff based on the
 *   overall error count.
 *
 * A window that is still running is never extended. The input is not mutated.
 */
function computeNextProfileUsageStats(params: {
  existing: ProfileUsageStats;
  now: number;
  reason: AuthProfileFailureReason;
  cfgResolved: ResolvedAuthCooldownConfig;
  modelId?: string;
}): ProfileUsageStats {
  const { existing, now, reason, cfgResolved, modelId } = params;

  // --- Decide whether the failure counters start over. ---
  const lastFailureAt = existing.lastFailureAt;
  const failureWindowLapsed =
    typeof lastFailureAt === "number" &&
    lastFailureAt > 0 &&
    now - lastFailureAt > cfgResolved.failureWindowMs;
  // The persisted stats may still carry counters from a window that has
  // already ended; continuing from them would escalate the backoff for what is
  // effectively a first failure.
  const blockedUntil = resolveProfileUnusableUntil(existing);
  const priorBlockLapsed = typeof blockedUntil === "number" && now >= blockedUntil;
  const startFresh = failureWindowLapsed || priorBlockLapsed;

  const errorCount = (startFresh ? 0 : (existing.errorCount ?? 0)) + 1;
  const failureCounts = startFresh ? {} : { ...existing.failureCounts };
  failureCounts[reason] = (failureCounts[reason] ?? 0) + 1;

  const next: ProfileUsageStats = {
    ...existing,
    errorCount,
    failureCounts,
    lastFailureAt: now,
  };

  // --- Billing / permanent auth failures: long-lived disable window. ---
  if (reason === "billing" || reason === "auth_permanent") {
    const disableMs = calculateAuthProfileBillingDisableMsWithConfig({
      errorCount: failureCounts[reason] ?? 1,
      baseMs: cfgResolved.billingBackoffMs,
      maxMs: cfgResolved.billingMaxMs,
    });
    // A running disable window stays as is, so retries inside it cannot keep
    // pushing recovery further out.
    next.disabledUntil = keepActiveWindowOrRecompute({
      existingUntil: existing.disabledUntil,
      now,
      recomputedUntil: now + disableMs,
    });
    next.disabledReason = reason;
    return next;
  }

  // --- Everything else: short stepped cooldown window. ---
  next.cooldownUntil = keepActiveWindowOrRecompute({
    existingUntil: existing.cooldownUntil,
    now,
    recomputedUntil: now + calculateAuthProfileCooldownMs(errorCount),
  });
  // The most recent failure reason is always the one recorded.
  next.cooldownReason = reason;

  const windowStillActive =
    typeof existing.cooldownUntil === "number" && existing.cooldownUntil > now;
  if (!windowStillActive) {
    // New window: only rate limits are scoped to the failing model.
    next.cooldownModel = reason === "rate_limit" ? modelId : undefined;
    return next;
  }

  // Running window: work out whether the model scope must widen to "all
  // models" (represented by `undefined`).
  const scopedModel = existing.cooldownModel;
  // A second, different model failed while the window is running.
  const differentModelFailed = Boolean(scopedModel && modelId && scopedModel !== modelId);
  // Rate limit without a known model during a model-scoped window.
  const unattributedRateLimit = reason === "rate_limit" && !modelId && Boolean(scopedModel);
  // Reasons other than rate limits apply to the whole profile.
  const profileWideFailure = reason !== "rate_limit";
  const widenScope = differentModelFailed || unattributedRateLimit || profileWideFailure;
  next.cooldownModel = widenScope ? undefined : scopedModel;
  return next;
}
/**
 * Record a failure for a profile and persist the resulting cooldown state.
 *
 * Billing and permanent-auth failures produce a (longer) disable window; all
 * other reasons produce a regular cooldown window. Nothing is recorded for
 * unknown profiles or for providers that bypass cooldown tracking.
 *
 * The update is first attempted through the locked store updater. When that
 * yields a store, the caller's `store.usageStats` is replaced with the updated
 * stats. Otherwise the passed-in store is updated directly and saved. In both
 * cases the state transition is reported via
 * `logAuthProfileFailureStateChange` once new stats have been computed.
 *
 * @param params.modelId - Model that triggered the failure, if known; used to
 *   scope rate-limit cooldowns.
 */
export async function markAuthProfileFailure(params: {
  store: AuthProfileStore;
  profileId: string;
  reason: AuthProfileFailureReason;
  cfg?: OpenClawConfig;
  agentDir?: string;
  runId?: string;
  modelId?: string;
}): Promise<void> {
  const { store, profileId, reason, agentDir, cfg, runId, modelId } = params;
  const callerProfile = store.profiles[profileId];
  if (!callerProfile || isAuthCooldownBypassedForProvider(callerProfile.provider)) {
    return;
  }

  // Computes and stores the next stats on `target`; shared by both write paths.
  const recordFailure = (target: AuthProfileStore, providerId: string, at: number) => {
    const cfgResolved = resolveAuthCooldownConfig({ cfg, providerId });
    const previous = target.usageStats?.[profileId];
    const next = computeNextProfileUsageStats({
      existing: previous ?? {},
      now: at,
      reason,
      cfgResolved,
      modelId,
    });
    updateUsageStatsEntry(target, profileId, () => next);
    return { previous, next, at };
  };

  // Deliberately left without an initialiser: it is assigned from inside the
  // updater callback.
  let lockedTransition: ReturnType<typeof recordFailure> | undefined;
  const lockedResult = await authProfileUsageDeps.updateAuthProfileStoreWithLock({
    agentDir,
    updater: (freshStore) => {
      const freshProfile = freshStore.profiles[profileId];
      if (!freshProfile || isAuthCooldownBypassedForProvider(freshProfile.provider)) {
        return false;
      }
      const at = Date.now();
      lockedTransition = recordFailure(
        freshStore,
        normalizeProviderId(freshProfile.provider),
        at,
      );
      return true;
    },
  });

  if (lockedResult) {
    store.usageStats = lockedResult.usageStats;
    if (lockedTransition) {
      logAuthProfileFailureStateChange({
        runId,
        profileId,
        provider: callerProfile.provider,
        reason,
        previous: lockedTransition.previous,
        next: lockedTransition.next,
        now: lockedTransition.at,
      });
    }
    return;
  }

  // Fallback: no store came back from the locked update.
  if (!store.profiles[profileId]) {
    return;
  }
  const at = Date.now();
  const transition = recordFailure(
    store,
    normalizeProviderId(store.profiles[profileId]?.provider ?? ""),
    at,
  );
  authProfileUsageDeps.saveAuthProfileStore(store, agentDir);
  logAuthProfileFailureStateChange({
    runId,
    profileId,
    provider: store.profiles[profileId]?.provider ?? callerProfile.provider,
    reason,
    previous: transition.previous,
    next: transition.next,
    now: transition.at,
  });
}
/**
 * Put a profile into a transient cooldown.
 *
 * Equivalent to recording a failure with reason `"unknown"`, so the stepped
 * backoff (30 s, 1 min, then 5 min) applies. No config or model id is
 * forwarded.
 */
export async function markAuthProfileCooldown(params: {
  store: AuthProfileStore;
  profileId: string;
  agentDir?: string;
  runId?: string;
}): Promise<void> {
  const { store, profileId, agentDir, runId } = params;
  await markAuthProfileFailure({ store, profileId, reason: "unknown", agentDir, runId });
}
/**
 * Wipe the failure state of a profile (for example on a manual reset).
 *
 * Does nothing when the profile has no usage stats. The update is first
 * attempted through the locked store updater. When that yields a store, the
 * caller's `store.usageStats` is replaced with the updated stats. Otherwise
 * the passed-in store is updated directly and saved.
 */
export async function clearAuthProfileCooldown(params: {
  store: AuthProfileStore;
  profileId: string;
  agentDir?: string;
}): Promise<void> {
  const { store, profileId, agentDir } = params;

  const hasStats = (target: AuthProfileStore): boolean => Boolean(target.usageStats?.[profileId]);
  const wipeFailureState = (target: AuthProfileStore): void => {
    updateUsageStatsEntry(target, profileId, (previous) => resetUsageStats(previous));
  };

  const lockedResult = await authProfileUsageDeps.updateAuthProfileStoreWithLock({
    agentDir,
    updater: (freshStore) => {
      if (!hasStats(freshStore)) {
        return false;
      }
      wipeFailureState(freshStore);
      return true;
    },
  });

  if (lockedResult) {
    store.usageStats = lockedResult.usageStats;
    return;
  }

  // Fallback: no store came back from the locked update.
  if (!hasStats(store)) {
    return;
  }
  wipeFailureState(store);
  authProfileUsageDeps.saveAuthProfileStore(store, agentDir);
}