Agents: infer auth-profile unavailable failover reason

2026-03-12 07:20:45 +00:00 · 2026-02-22 16:10:24 -08:00
parent 331b728b8d
commit 5c7c37a02a
9 changed files with 340 additions and 4 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -36,6 +36,7 @@ Docs: https://docs.openclaw.ai

 - Install/Discord Voice: make `@discordjs/opus` an optional dependency so `openclaw` install/update no longer hard-fails when native Opus builds fail, while keeping `opusscript` as the runtime fallback decoder for Discord voice flows. (#23737, #23733, #23703) Thanks @jeadland, @Sheetaa, and @Breakyman.
 - Agents/Exec: honor explicit agent context when resolving `tools.exec` defaults for runs with opaque/non-agent session keys, so per-agent `host/security/ask` policies are applied consistently. (#11832)
+- Agents/Auth profiles: infer `all profiles unavailable` failover reasons from active profile cooldown/disabled stats (instead of hardcoded `rate_limit`) so auth/billing OAuth outages surface accurately in fallback errors. (#23996) Thanks @DerpyNoodlez.
 - Security/Sessions: redact sensitive token patterns from `sessions_history` tool output and surface `contentRedacted` metadata when masking occurs. (#16928) Thanks @aether-ai-agent.
 - Sandbox/Docker: default sandbox container user to the workspace owner `uid:gid` when `agents.*.sandbox.docker.user` is unset, fixing non-root gateway file-tool permissions under capability-dropped containers. (#20979)
 - Doctor/Security: add an explicit warning that `approvals.exec.enabled=false` disables forwarding only, while enforcement remains driven by host-local `exec-approvals.json` policy. (#15047)
--- a/src/agents/auth-profiles.ts
+++ b/src/agents/auth-profiles.ts
@@ -40,5 +40,6 @@ export {
  markAuthProfileCooldown,
  markAuthProfileFailure,
  markAuthProfileUsed,
+  resolveProfilesUnavailableReason,
  resolveProfileUnusableUntilForDisplay,
 } from "./auth-profiles/usage.js";
--- a/src/agents/auth-profiles/usage.test.ts
+++ b/src/agents/auth-profiles/usage.test.ts
@@ -5,6 +5,7 @@ import {
  clearExpiredCooldowns,
  isProfileInCooldown,
  markAuthProfileFailure,
+  resolveProfilesUnavailableReason,
  resolveProfileUnusableUntil,
 } from "./usage.js";

@@ -85,6 +86,101 @@ describe("isProfileInCooldown", () => {
  });
 });

+describe("resolveProfilesUnavailableReason", () => {
+  it("prefers active disabledReason when profiles are disabled", () => {
+    const now = Date.now();
+    const store = makeStore({
+      "anthropic:default": {
+        disabledUntil: now + 60_000, // disabled window is still open at `now`
+        disabledReason: "billing",
+      },
+    });
+
+    expect(
+      resolveProfilesUnavailableReason({
+        store,
+        profileIds: ["anthropic:default"],
+        now,
+      }),
+    ).toBe("billing");
+  });
+
+  it("uses recorded non-rate-limit failure counts for active cooldown windows", () => {
+    const now = Date.now();
+    const store = makeStore({
+      "anthropic:default": {
+        cooldownUntil: now + 60_000,
+        failureCounts: { auth: 3, rate_limit: 1 }, // auth has the larger count, so it should win
+      },
+    });
+
+    expect(
+      resolveProfilesUnavailableReason({
+        store,
+        profileIds: ["anthropic:default"],
+        now,
+      }),
+    ).toBe("auth");
+  });
+
+  it("falls back to rate_limit when active cooldown has no reason history", () => {
+    const now = Date.now();
+    const store = makeStore({
+      "anthropic:default": {
+        cooldownUntil: now + 60_000, // active cooldown with no failureCounts recorded
+      },
+    });
+
+    expect(
+      resolveProfilesUnavailableReason({
+        store,
+        profileIds: ["anthropic:default"],
+        now,
+      }),
+    ).toBe("rate_limit");
+  });
+
+  it("ignores expired windows and returns null when no profile is actively unavailable", () => {
+    const now = Date.now();
+    const store = makeStore({
+      "anthropic:default": {
+        cooldownUntil: now - 1_000, // already in the past
+        failureCounts: { auth: 5 },
+      },
+      "anthropic:backup": {
+        disabledUntil: now - 500, // already in the past
+        disabledReason: "billing",
+      },
+    });
+
+    expect(
+      resolveProfilesUnavailableReason({
+        store,
+        profileIds: ["anthropic:default", "anthropic:backup"],
+        now,
+      }),
+    ).toBeNull();
+  });
+
+  it("breaks ties by reason priority for equal active failure counts", () => {
+    const now = Date.now();
+    const store = makeStore({
+      "anthropic:default": {
+        cooldownUntil: now + 60_000,
+        failureCounts: { timeout: 2, auth: 2 }, // equal counts; auth precedes timeout in the priority list
+      },
+    });
+
+    expect(
+      resolveProfilesUnavailableReason({
+        store,
+        profileIds: ["anthropic:default"],
+        now,
+      }),
+    ).toBe("auth");
+  });
+});
+
 // ---------------------------------------------------------------------------
 // clearExpiredCooldowns
 // ---------------------------------------------------------------------------
--- a/src/agents/auth-profiles/usage.ts
+++ b/src/agents/auth-profiles/usage.ts
@@ -3,6 +3,20 @@ import { normalizeProviderId } from "../model-selection.js";
 import { saveAuthProfileStore, updateAuthProfileStoreWithLock } from "./store.js";
 import type { AuthProfileFailureReason, AuthProfileStore, ProfileUsageStats } from "./types.js";

+const FAILURE_REASON_PRIORITY: AuthProfileFailureReason[] = [ // earlier entries win when scores tie
+  "auth",
+  "billing",
+  "format",
+  "model_not_found",
+  "timeout",
+  "rate_limit",
+  "unknown",
+];
+const FAILURE_REASON_SET = new Set<AuthProfileFailureReason>(FAILURE_REASON_PRIORITY); // membership test for recognised reasons
+const FAILURE_REASON_ORDER = new Map<AuthProfileFailureReason, number>( // reason -> index in the priority list
+  FAILURE_REASON_PRIORITY.map((reason, index) => [reason, index]),
+);
+
 export function resolveProfileUnusableUntil(
  stats: Pick<ProfileUsageStats, "cooldownUntil" | "disabledUntil">,
 ): number | null {
@@ -27,6 +41,85 @@ export function isProfileInCooldown(store: AuthProfileStore, profileId: string):
  return unusableUntil ? Date.now() < unusableUntil : false;
 }

+function isActiveUnusableWindow(until: number | undefined, now: number): boolean { // true only for a finite, positive timestamp later than `now`
+  return typeof until === "number" && Number.isFinite(until) && until > 0 && now < until;
+}
+
+/**
+ * Infer the likeliest reason the given profiles are unavailable at `now` (default `Date.now()`).
+ * An active `disabledUntil` with a known `disabledReason` adds 1,000 to that reason; otherwise an
+ * active `cooldownUntil` adds each positive known-reason `failureCounts` entry, or 1 to `rate_limit`.
+ * Highest total wins (ties: earlier in `FAILURE_REASON_PRIORITY`); `null` when nothing scored.
+ */
+export function resolveProfilesUnavailableReason(params: {
+  store: AuthProfileStore;
+  profileIds: string[];
+  now?: number;
+}): AuthProfileFailureReason | null {
+  const now = params.now ?? Date.now();
+  const scores = new Map<AuthProfileFailureReason, number>();
+  const addScore = (reason: AuthProfileFailureReason, value: number) => {
+    if (!FAILURE_REASON_SET.has(reason) || value <= 0 || !Number.isFinite(value)) {
+      return;
+    }
+    scores.set(reason, (scores.get(reason) ?? 0) + value);
+  };
+
+  for (const profileId of params.profileIds) {
+    const stats = params.store.usageStats?.[profileId];
+    if (!stats) {
+      continue; // no usage stats recorded for this profile
+    }
+
+    const disabledActive = isActiveUnusableWindow(stats.disabledUntil, now);
+    if (disabledActive && stats.disabledReason && FAILURE_REASON_SET.has(stats.disabledReason)) {
+      // Disabled reasons are explicit and high-signal; weight heavily.
+      addScore(stats.disabledReason, 1_000);
+      continue; // failure counts are not consulted for an explicitly disabled profile
+    }
+
+    const cooldownActive = isActiveUnusableWindow(stats.cooldownUntil, now);
+    if (!cooldownActive) {
+      continue; // cooldown missing or expired, so this profile contributes nothing
+    }
+
+    let recordedReason = false;
+    for (const [rawReason, rawCount] of Object.entries(stats.failureCounts ?? {})) {
+      const reason = rawReason as AuthProfileFailureReason;
+      const count = typeof rawCount === "number" ? rawCount : 0; // non-numeric counts are treated as 0
+      if (!FAILURE_REASON_SET.has(reason) || count <= 0) {
+        continue;
+      }
+      addScore(reason, count);
+      recordedReason = true;
+    }
+    if (!recordedReason) {
+      addScore("rate_limit", 1); // active cooldown without usable history defaults to rate_limit
+    }
+  }
+
+  if (scores.size === 0) {
+    return null;
+  }
+
+  let best: AuthProfileFailureReason | null = null;
+  let bestScore = -1;
+  let bestPriority = Number.MAX_SAFE_INTEGER;
+  for (const reason of FAILURE_REASON_PRIORITY) { // walk reasons in priority order
+    const score = scores.get(reason);
+    if (typeof score !== "number") {
+      continue;
+    }
+    const priority = FAILURE_REASON_ORDER.get(reason) ?? Number.MAX_SAFE_INTEGER;
+    if (score > bestScore || (score === bestScore && priority < bestPriority)) {
+      best = reason;
+      bestScore = score;
+      bestPriority = priority;
+    }
+  }
+  return best;
+}
+
 /**
 * Return the soonest `unusableUntil` timestamp (ms epoch) among the given
 * profiles, or `null` when no profile has a recorded cooldown. Note: the
--- a/src/agents/model-fallback.probe.test.ts
+++ b/src/agents/model-fallback.probe.test.ts
@@ -8,6 +8,7 @@ vi.mock("./auth-profiles.js", () => ({
  ensureAuthProfileStore: vi.fn(),
  getSoonestCooldownExpiry: vi.fn(),
  isProfileInCooldown: vi.fn(),
+  resolveProfilesUnavailableReason: vi.fn(),
  resolveAuthProfileOrder: vi.fn(),
 }));

@@ -15,6 +16,7 @@ import {
  ensureAuthProfileStore,
  getSoonestCooldownExpiry,
  isProfileInCooldown,
+  resolveProfilesUnavailableReason,
  resolveAuthProfileOrder,
 } from "./auth-profiles.js";
 import { _probeThrottleInternals, runWithModelFallback } from "./model-fallback.js";
@@ -22,6 +24,7 @@ import { _probeThrottleInternals, runWithModelFallback } from "./model-fallback.
 const mockedEnsureAuthProfileStore = vi.mocked(ensureAuthProfileStore);
 const mockedGetSoonestCooldownExpiry = vi.mocked(getSoonestCooldownExpiry);
 const mockedIsProfileInCooldown = vi.mocked(isProfileInCooldown);
+const mockedResolveProfilesUnavailableReason = vi.mocked(resolveProfilesUnavailableReason);
 const mockedResolveAuthProfileOrder = vi.mocked(resolveAuthProfileOrder);

 const makeCfg = makeModelFallbackCfg;
@@ -98,6 +101,7 @@ describe("runWithModelFallback – probe logic", () => {
    mockedIsProfileInCooldown.mockImplementation((_store, profileId: string) => {
      return profileId.startsWith("openai");
    });
+    mockedResolveProfilesUnavailableReason.mockReturnValue("rate_limit");
  });

  afterEach(() => {
@@ -119,6 +123,22 @@ describe("runWithModelFallback – probe logic", () => {
    expectFallbackUsed(result, run);
  });

+  it("uses inferred unavailable reason when skipping a cooldowned primary model", async () => {
+    const cfg = makeCfg();
+    const expiresIn30Min = NOW + 30 * 60 * 1000; // presumably far enough out that the primary is skipped rather than probed
+    mockedGetSoonestCooldownExpiry.mockReturnValue(expiresIn30Min);
+    mockedResolveProfilesUnavailableReason.mockReturnValue("billing"); // replaces the "rate_limit" value configured earlier in this suite
+
+    const run = vi.fn().mockResolvedValue("ok");
+
+    const result = await runPrimaryCandidate(cfg, run);
+
+    expect(result.result).toBe("ok");
+    expect(run).toHaveBeenCalledTimes(1);
+    expect(run).toHaveBeenCalledWith("anthropic", "claude-haiku-3-5");
+    expect(result.attempts[0]?.reason).toBe("billing"); // first recorded attempt carries the mocked reason
+  });
+
  it("probes primary model when within 2-min margin of cooldown expiry", async () => {
    const cfg = makeCfg();
    // Cooldown expires in 1 minute — within 2-min probe margin
--- a/src/agents/model-fallback.test.ts
+++ b/src/agents/model-fallback.test.ts
@@ -348,6 +348,49 @@ describe("runWithModelFallback", () => {
    expect(result.attempts[0]?.reason).toBe("rate_limit");
  });

+  it("propagates disabled reason when all profiles are unavailable", async () => {
+    const provider = `disabled-test-${crypto.randomUUID()}`;
+    const profileId = `${provider}:default`;
+    const now = Date.now();
+
+    const store: AuthProfileStore = {
+      version: AUTH_STORE_VERSION,
+      profiles: {
+        [profileId]: {
+          type: "api_key",
+          provider,
+          key: "test-key",
+        },
+      },
+      usageStats: {
+        [profileId]: {
+          disabledUntil: now + 5 * 60_000,
+          disabledReason: "billing",
+          failureCounts: { rate_limit: 4 }, // must not override the active disabledReason
+        },
+      },
+    };
+
+    const cfg = makeProviderFallbackCfg(provider);
+    const run = vi.fn().mockImplementation(async (providerId, modelId) => {
+      if (providerId === "fallback") {
+        return "ok";
+      }
+      throw new Error(`unexpected provider: ${providerId}/${modelId}`);
+    });
+
+    const result = await runWithStoredAuth({
+      cfg,
+      store,
+      provider,
+      run,
+    });
+
+    expect(result.result).toBe("ok");
+    expect(run.mock.calls).toEqual([["fallback", "ok-model"]]); // only the fallback model is invoked
+    expect(result.attempts[0]?.reason).toBe("billing");
+  });
+
  it("does not skip when any profile is available", async () => {
    const provider = `cooldown-mixed-${crypto.randomUUID()}`;
    const profileA = `${provider}:a`;
--- a/src/agents/model-fallback.ts
+++ b/src/agents/model-fallback.ts
@@ -3,6 +3,7 @@ import {
  ensureAuthProfileStore,
  getSoonestCooldownExpiry,
  isProfileInCooldown,
+  resolveProfilesUnavailableReason,
  resolveAuthProfileOrder,
 } from "./auth-profiles.js";
 import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
@@ -342,12 +343,18 @@ export async function runWithModelFallback<T>(params: {
          profileIds,
        });
        if (!shouldProbe) {
+          const inferredReason =
+            resolveProfilesUnavailableReason({
+              store: authStore,
+              profileIds,
+              now,
+            }) ?? "rate_limit";
          // Skip without attempting
          attempts.push({
            provider: candidate.provider,
            model: candidate.model,
            error: `Provider ${candidate.provider} is in cooldown (all profiles unavailable)`,
-            reason: "rate_limit",
+            reason: inferredReason,
          });
          continue;
        }
--- a/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.test.ts
+++ b/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.test.ts
@@ -4,6 +4,7 @@ import path from "node:path";
 import type { AssistantMessage } from "@mariozechner/pi-ai";
 import { beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
 import type { OpenClawConfig } from "../config/config.js";
+import type { AuthProfileFailureReason } from "./auth-profiles.js";
 import type { EmbeddedRunAttemptResult } from "./pi-embedded-runner/run/types.js";

 const runEmbeddedAttemptMock = vi.fn<(params: unknown) => Promise<EmbeddedRunAttemptResult>>();
@@ -112,7 +113,16 @@ const writeAuthStore = async (
  agentDir: string,
  opts?: {
    includeAnthropic?: boolean;
-    usageStats?: Record<string, { lastUsed?: number; cooldownUntil?: number }>;
+    usageStats?: Record<
+      string,
+      {
+        lastUsed?: number;
+        cooldownUntil?: number;
+        disabledUntil?: number;
+        disabledReason?: AuthProfileFailureReason;
+        failureCounts?: Partial<Record<AuthProfileFailureReason, number>>;
+      }
+    >;
  },
 ) => {
  const authPath = path.join(agentDir, "auth-profiles.json");
@@ -184,7 +194,17 @@ async function runAutoPinnedOpenAiTurn(params: {
 async function readUsageStats(agentDir: string) {
  const stored = JSON.parse(
    await fs.readFile(path.join(agentDir, "auth-profiles.json"), "utf-8"),
-  ) as { usageStats?: Record<string, { lastUsed?: number; cooldownUntil?: number }> };
+  ) as {
+    usageStats?: Record<
+      string,
+      {
+        lastUsed?: number;
+        cooldownUntil?: number;
+        disabledUntil?: number;
+        disabledReason?: AuthProfileFailureReason;
+      }
+    >;
+  };
  return stored.usageStats ?? {};
 }

@@ -496,6 +516,50 @@ describe("runEmbeddedPiAgent auth profile rotation", () => {
    });
  });

+  it("fails over with disabled reason when all profiles are unavailable", async () => {
+    await withTimedAgentWorkspace(async ({ agentDir, workspaceDir, now }) => {
+      await writeAuthStore(agentDir, {
+        usageStats: {
+          "openai:p1": {
+            lastUsed: 1,
+            disabledUntil: now + 60 * 60 * 1000,
+            disabledReason: "billing",
+            failureCounts: { rate_limit: 4 }, // must not override the active disabledReason
+          },
+          "openai:p2": {
+            lastUsed: 2,
+            disabledUntil: now + 60 * 60 * 1000,
+            disabledReason: "billing",
+          },
+        },
+      });
+
+      await expect(
+        runEmbeddedPiAgent({
+          sessionId: "session:test",
+          sessionKey: "agent:test:disabled-failover",
+          sessionFile: path.join(workspaceDir, "session.jsonl"),
+          workspaceDir,
+          agentDir,
+          config: makeConfig({ fallbacks: ["openai/mock-2"] }),
+          prompt: "hello",
+          provider: "openai",
+          model: "mock-1",
+          authProfileIdSource: "auto",
+          timeoutMs: 5_000,
+          runId: "run:disabled-failover",
+        }),
+      ).rejects.toMatchObject({
+        name: "FailoverError",
+        reason: "billing",
+        provider: "openai",
+        model: "mock-1",
+      });
+
+      expect(runEmbeddedAttemptMock).not.toHaveBeenCalled(); // the run must fail before any attempt is started
+    });
+  });
+
  it("fails over when auth is unavailable and fallbacks are configured", async () => {
    const previousOpenAiKey = process.env.OPENAI_API_KEY;
    delete process.env.OPENAI_API_KEY;
--- a/src/agents/pi-embedded-runner/run.ts
+++ b/src/agents/pi-embedded-runner/run.ts
@@ -12,6 +12,7 @@ import {
  markAuthProfileFailure,
  markAuthProfileGood,
  markAuthProfileUsed,
+  resolveProfilesUnavailableReason,
 } from "../auth-profiles.js";
 import {
  CONTEXT_WINDOW_HARD_MIN_TOKENS,
@@ -364,9 +365,18 @@ export async function runEmbeddedPiAgent(
      const resolveAuthProfileFailoverReason = (params: {
        allInCooldown: boolean;
        message: string;
+        profileIds?: Array<string | undefined>;
      }): FailoverReason => {
        if (params.allInCooldown) {
-          return "rate_limit";
+          const profileIds = (params.profileIds ?? profileCandidates).filter(
+            (id): id is string => typeof id === "string" && id.length > 0,
+          );
+          return (
+            resolveProfilesUnavailableReason({
+              store: authStore,
+              profileIds,
+            }) ?? "rate_limit"
+          );
        }
        const classified = classifyFailoverReason(params.message);
        return classified ?? "auth";
@@ -385,6 +395,7 @@ export async function runEmbeddedPiAgent(
        const reason = resolveAuthProfileFailoverReason({
          allInCooldown: params.allInCooldown,
          message,
+          profileIds: profileCandidates,
        });
        if (fallbackConfigured) {
          throw new FailoverError(message, {