mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-24 14:19:31 +00:00
Summary:
- Add forced provider re-login support that clears cached auth profiles before running provider login again.
- Add provider-auth remediation guidance and a session-scoped skip cache for known-bad fallback auth attempts.
- Wire session ids through agent command, auto-reply, and embedded compaction fallback callers so the skip cache applies on real run paths.
- Fail closed when forced auth profile removal cannot update the profile store.
Verification:
- Local format, lint, diff-check, focused Vitest shards, and autoreview passed.
- PR CI, CodeQL Security High, and Critical Quality agent-runtime-boundary passed on head 1b4e9e753e.
Co-authored-by: Mert Basar <MertBasar0@users.noreply.github.com>
247 lines
7.5 KiB
TypeScript
247 lines
7.5 KiB
TypeScript
/**
|
|
* Session-scoped "known-bad candidate" cache for the model fallback chain.
|
|
*
|
|
* When explicitly enabled and a fallback candidate fails with a non-transient
|
|
* credential error (`auth` / `auth_permanent`), the chain can avoid retrying
|
|
* the same candidate on every subsequent turn until the user fixes their auth.
|
|
*
|
|
* This module records skip markers per `(sessionId, provider, model)` with a
|
|
* short TTL. The cache is intentionally in-memory only: a process restart
|
|
* clears it so a freshly-restarted gateway always tries every candidate at
|
|
* least once before deciding to skip again.
|
|
*
|
|
* The cache is global, not per-config, so any caller running fallbacks for the
|
|
* same `sessionId` shares the same skip set. Tests can reset state via
|
|
* `resetFallbackSkipCacheForTest()`.
|
|
*/
|
|
|
|
import { modelKey } from "./model-selection-normalize.js";
|
|
|
|
/**
|
|
* Default time-to-live for a skip marker. Disabled by default so existing
|
|
* fallback retry behavior stays unchanged unless an operator opts in with
|
|
* OPENCLAW_FALLBACK_SKIP_TTL_MS.
|
|
*/
|
|
export const DEFAULT_FALLBACK_SKIP_TTL_MS = 0;
|
|
const FALLBACK_SKIP_TTL_ENV = "OPENCLAW_FALLBACK_SKIP_TTL_MS";
|
|
const FALLBACK_SKIP_TTL_MIN_MS = 1_000;
|
|
const FALLBACK_SKIP_TTL_MAX_MS = 10 * 60_000;
|
|
|
|
function resolveConfiguredSkipTtlMs(env: NodeJS.ProcessEnv = process.env): number {
|
|
const raw = env[FALLBACK_SKIP_TTL_ENV];
|
|
if (!raw) {
|
|
return DEFAULT_FALLBACK_SKIP_TTL_MS;
|
|
}
|
|
const trimmed = raw.trim();
|
|
if (!trimmed) {
|
|
return DEFAULT_FALLBACK_SKIP_TTL_MS;
|
|
}
|
|
const parsed = Number.parseInt(trimmed, 10);
|
|
if (!Number.isFinite(parsed) || parsed < 0) {
|
|
return DEFAULT_FALLBACK_SKIP_TTL_MS;
|
|
}
|
|
if (parsed === 0) {
|
|
return 0;
|
|
}
|
|
return Math.min(FALLBACK_SKIP_TTL_MAX_MS, Math.max(FALLBACK_SKIP_TTL_MIN_MS, parsed));
|
|
}
|
|
|
|
type SkipEntry = {
|
|
expiresAtMs: number;
|
|
reason: string;
|
|
};
|
|
|
|
type SkipBySession = Map<string, Map<string, SkipEntry>>;
|
|
|
|
type SkipCacheState = {
|
|
buckets: SkipBySession;
|
|
lastGlobalPruneAtMs: number;
|
|
};
|
|
|
|
/**
|
|
* Minimum interval between two opportunistic global prunes. Keeps the
|
|
* worst-case cost of a hot write/check path amortized: even if a gateway
|
|
* tracks thousands of sessions, the cache is only walked every
|
|
* `GLOBAL_PRUNE_INTERVAL_MS`, not on every call.
|
|
*/
|
|
const GLOBAL_PRUNE_INTERVAL_MS = 5_000;
|
|
|
|
function getState(): SkipCacheState {
|
|
const globalStore = globalThis as typeof globalThis & {
|
|
openclawFallbackSkipCache?: SkipBySession;
|
|
openclawFallbackSkipCacheState?: SkipCacheState;
|
|
};
|
|
if (!globalStore.openclawFallbackSkipCacheState) {
|
|
// Reuse the existing buckets map if a prior version of this module already
|
|
// populated the legacy global; otherwise start fresh.
|
|
const buckets = globalStore.openclawFallbackSkipCache ?? new Map();
|
|
globalStore.openclawFallbackSkipCacheState = {
|
|
buckets,
|
|
lastGlobalPruneAtMs: 0,
|
|
};
|
|
globalStore.openclawFallbackSkipCache = buckets;
|
|
}
|
|
return globalStore.openclawFallbackSkipCacheState;
|
|
}
|
|
|
|
function getBuckets(): SkipBySession {
|
|
return getState().buckets;
|
|
}
|
|
|
|
function sessionBucket(sessionId: string, create: boolean): Map<string, SkipEntry> | undefined {
|
|
const buckets = getBuckets();
|
|
let bucket = buckets.get(sessionId);
|
|
if (!bucket && create) {
|
|
bucket = new Map();
|
|
buckets.set(sessionId, bucket);
|
|
}
|
|
return bucket;
|
|
}
|
|
|
|
function candidateKey(provider: string, model: string): string {
|
|
return modelKey(provider, model);
|
|
}
|
|
|
|
function pruneExpired(bucket: Map<string, SkipEntry>, now: number): void {
|
|
for (const [key, entry] of bucket.entries()) {
|
|
if (entry.expiresAtMs <= now) {
|
|
bucket.delete(key);
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Walk every session bucket, drop expired markers, and remove buckets that
|
|
* end up empty. Called opportunistically from the hot write/check paths so
|
|
* stale buckets left behind by one-off sessions cannot accumulate across the
|
|
* gateway's lifetime — the per-bucket prune only fires when the same session
|
|
* is queried again, which is not guaranteed for short-lived sessions.
|
|
*/
|
|
function pruneAllExpired(now: number): void {
|
|
const state = getState();
|
|
if (now - state.lastGlobalPruneAtMs < GLOBAL_PRUNE_INTERVAL_MS) {
|
|
return;
|
|
}
|
|
state.lastGlobalPruneAtMs = now;
|
|
for (const [sessionId, bucket] of state.buckets.entries()) {
|
|
pruneExpired(bucket, now);
|
|
if (bucket.size === 0) {
|
|
state.buckets.delete(sessionId);
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Record that `(sessionId, provider, model)` should be skipped for the
|
|
* configured TTL. Safe to call with falsy `sessionId` — the call becomes a
|
|
* no-op so callers do not need to guard themselves.
|
|
*/
|
|
export function markFallbackCandidateSkipped(params: {
|
|
sessionId: string | undefined;
|
|
provider: string;
|
|
model: string;
|
|
reason: string;
|
|
now?: number;
|
|
ttlMs?: number;
|
|
}): void {
|
|
if (!params.sessionId || !params.provider || !params.model) {
|
|
return;
|
|
}
|
|
const now = params.now ?? Date.now();
|
|
const ttlMs = params.ttlMs ?? resolveConfiguredSkipTtlMs();
|
|
if (ttlMs <= 0) {
|
|
return;
|
|
}
|
|
pruneAllExpired(now);
|
|
const bucket = sessionBucket(params.sessionId, true);
|
|
if (!bucket) {
|
|
return;
|
|
}
|
|
bucket.set(candidateKey(params.provider, params.model), {
|
|
expiresAtMs: now + ttlMs,
|
|
reason: params.reason,
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Returns true when `(sessionId, provider, model)` has an unexpired skip
|
|
* marker. Expired entries are pruned as a side-effect so the cache does not
|
|
* grow unbounded.
|
|
*/
|
|
export function isFallbackCandidateSkipped(params: {
|
|
sessionId: string | undefined;
|
|
provider: string;
|
|
model: string;
|
|
now?: number;
|
|
}): boolean {
|
|
if (!params.sessionId || !params.provider || !params.model) {
|
|
return false;
|
|
}
|
|
const now = params.now ?? Date.now();
|
|
pruneAllExpired(now);
|
|
const bucket = sessionBucket(params.sessionId, false);
|
|
if (!bucket) {
|
|
return false;
|
|
}
|
|
pruneExpired(bucket, now);
|
|
if (bucket.size === 0) {
|
|
getBuckets().delete(params.sessionId);
|
|
return false;
|
|
}
|
|
const entry = bucket.get(candidateKey(params.provider, params.model));
|
|
return Boolean(entry && entry.expiresAtMs > now);
|
|
}
|
|
|
|
/**
|
|
* Look up the recorded skip reason for a `(sessionId, provider, model)`
|
|
* triple. Returns `undefined` when no unexpired marker exists. Used by the
|
|
* fallback chain to surface the original failure reason in observation logs.
|
|
*/
|
|
export function getFallbackCandidateSkipReason(params: {
|
|
sessionId: string | undefined;
|
|
provider: string;
|
|
model: string;
|
|
now?: number;
|
|
}): string | undefined {
|
|
if (!params.sessionId || !params.provider || !params.model) {
|
|
return undefined;
|
|
}
|
|
const bucket = sessionBucket(params.sessionId, false);
|
|
if (!bucket) {
|
|
return undefined;
|
|
}
|
|
const now = params.now ?? Date.now();
|
|
const entry = bucket.get(candidateKey(params.provider, params.model));
|
|
if (!entry || entry.expiresAtMs <= now) {
|
|
return undefined;
|
|
}
|
|
return entry.reason;
|
|
}
|
|
|
|
/** Drop every skip marker associated with the given session. */
|
|
export function clearFallbackSkipCacheForSession(sessionId: string | undefined): void {
|
|
if (!sessionId) {
|
|
return;
|
|
}
|
|
getBuckets().delete(sessionId);
|
|
}
|
|
|
|
/**
|
|
* Test-only escape hatch. Production code must not call this; the global
|
|
* cache is meant to outlive individual fallback runs.
|
|
*/
|
|
export function resetFallbackSkipCacheForTest(): void {
|
|
const state = getState();
|
|
state.buckets.clear();
|
|
state.lastGlobalPruneAtMs = 0;
|
|
}
|
|
|
|
/**
|
|
* Test-only inspection hook for the global session-bucket map. Production
|
|
* code must not read this; the buckets are an implementation detail of the
|
|
* cache and may change shape.
|
|
*/
|
|
export function peekFallbackSkipBucketsForTest(): SkipBySession {
|
|
return getBuckets();
|
|
}
|