Agents: infer auth-profile unavailable failover reason

This commit is contained in:
Vignesh Natarajan
2026-02-22 16:10:24 -08:00
parent 331b728b8d
commit 5c7c37a02a
9 changed files with 340 additions and 4 deletions

View File

@@ -36,6 +36,7 @@ Docs: https://docs.openclaw.ai
- Install/Discord Voice: make `@discordjs/opus` an optional dependency so `openclaw` install/update no longer hard-fails when native Opus builds fail, while keeping `opusscript` as the runtime fallback decoder for Discord voice flows. (#23737, #23733, #23703) Thanks @jeadland, @Sheetaa, and @Breakyman.
- Agents/Exec: honor explicit agent context when resolving `tools.exec` defaults for runs with opaque/non-agent session keys, so per-agent `host/security/ask` policies are applied consistently. (#11832)
- Agents/Auth profiles: infer `all profiles unavailable` failover reasons from active profile cooldown/disabled stats (instead of hardcoded `rate_limit`) so auth/billing OAuth outages surface accurately in fallback errors. (#23996) Thanks @DerpyNoodlez.
- Security/Sessions: redact sensitive token patterns from `sessions_history` tool output and surface `contentRedacted` metadata when masking occurs. (#16928) Thanks @aether-ai-agent.
- Sandbox/Docker: default sandbox container user to the workspace owner `uid:gid` when `agents.*.sandbox.docker.user` is unset, fixing non-root gateway file-tool permissions under capability-dropped containers. (#20979)
- Doctor/Security: add an explicit warning that `approvals.exec.enabled=false` disables forwarding only, while enforcement remains driven by host-local `exec-approvals.json` policy. (#15047)

View File

@@ -40,5 +40,6 @@ export {
markAuthProfileCooldown,
markAuthProfileFailure,
markAuthProfileUsed,
resolveProfilesUnavailableReason,
resolveProfileUnusableUntilForDisplay,
} from "./auth-profiles/usage.js";

View File

@@ -5,6 +5,7 @@ import {
clearExpiredCooldowns,
isProfileInCooldown,
markAuthProfileFailure,
resolveProfilesUnavailableReason,
resolveProfileUnusableUntil,
} from "./usage.js";
@@ -85,6 +86,101 @@ describe("isProfileInCooldown", () => {
});
});
describe("resolveProfilesUnavailableReason", () => {
  // Thin wrapper so each case reads as "given this store, expect this reason".
  const inferReason = (
    store: ReturnType<typeof makeStore>,
    profileIds: string[],
    now: number,
  ) => resolveProfilesUnavailableReason({ store, profileIds, now });

  it("prefers active disabledReason when profiles are disabled", () => {
    const now = Date.now();
    const disabledForBilling = makeStore({
      "anthropic:default": {
        disabledUntil: now + 60_000,
        disabledReason: "billing",
      },
    });

    const reason = inferReason(disabledForBilling, ["anthropic:default"], now);

    expect(reason).toBe("billing");
  });

  it("uses recorded non-rate-limit failure counts for active cooldown windows", () => {
    const now = Date.now();
    // auth has the larger count, so it should outweigh rate_limit.
    const coolingDown = makeStore({
      "anthropic:default": {
        cooldownUntil: now + 60_000,
        failureCounts: { auth: 3, rate_limit: 1 },
      },
    });

    const reason = inferReason(coolingDown, ["anthropic:default"], now);

    expect(reason).toBe("auth");
  });

  it("falls back to rate_limit when active cooldown has no reason history", () => {
    const now = Date.now();
    const coolingDownWithoutHistory = makeStore({
      "anthropic:default": {
        cooldownUntil: now + 60_000,
      },
    });

    const reason = inferReason(coolingDownWithoutHistory, ["anthropic:default"], now);

    expect(reason).toBe("rate_limit");
  });

  it("ignores expired windows and returns null when no profile is actively unavailable", () => {
    const now = Date.now();
    // Both windows ended before `now`, so neither profile should contribute.
    const allExpired = makeStore({
      "anthropic:default": {
        cooldownUntil: now - 1_000,
        failureCounts: { auth: 5 },
      },
      "anthropic:backup": {
        disabledUntil: now - 500,
        disabledReason: "billing",
      },
    });

    const reason = inferReason(allExpired, ["anthropic:default", "anthropic:backup"], now);

    expect(reason).toBeNull();
  });

  it("breaks ties by reason priority for equal active failure counts", () => {
    const now = Date.now();
    // timeout and auth are tied at 2; auth is expected to win the tie.
    const tiedCounts = makeStore({
      "anthropic:default": {
        cooldownUntil: now + 60_000,
        failureCounts: { timeout: 2, auth: 2 },
      },
    });

    const reason = inferReason(tiedCounts, ["anthropic:default"], now);

    expect(reason).toBe("auth");
  });
});
// ---------------------------------------------------------------------------
// clearExpiredCooldowns
// ---------------------------------------------------------------------------

View File

@@ -3,6 +3,20 @@ import { normalizeProviderId } from "../model-selection.js";
import { saveAuthProfileStore, updateAuthProfileStoreWithLock } from "./store.js";
import type { AuthProfileFailureReason, AuthProfileStore, ProfileUsageStats } from "./types.js";
/**
 * Known failure reasons, ordered from highest to lowest priority. The order
 * is used to break ties when two reasons end up with the same score.
 */
const FAILURE_REASON_PRIORITY: AuthProfileFailureReason[] = [
  "auth",
  "billing",
  "format",
  "model_not_found",
  "timeout",
  "rate_limit",
  "unknown",
];

/** Membership lookup for the reasons listed in `FAILURE_REASON_PRIORITY`. */
const FAILURE_REASON_SET: Set<AuthProfileFailureReason> = new Set(FAILURE_REASON_PRIORITY);

/** Maps each reason to its index in `FAILURE_REASON_PRIORITY` (lower index = higher priority). */
const FAILURE_REASON_ORDER: Map<AuthProfileFailureReason, number> = new Map(
  FAILURE_REASON_PRIORITY.map((reason, rank): [AuthProfileFailureReason, number] => [
    reason,
    rank,
  ]),
);
export function resolveProfileUnusableUntil(
stats: Pick<ProfileUsageStats, "cooldownUntil" | "disabledUntil">,
): number | null {
@@ -27,6 +41,85 @@ export function isProfileInCooldown(store: AuthProfileStore, profileId: string):
return unusableUntil ? Date.now() < unusableUntil : false;
}
/**
 * Report whether `until` describes a window that is still open at `now`.
 *
 * Only finite, positive numbers count as a window; a missing or malformed
 * value is treated as "no window". The window is open strictly before `until`.
 */
function isActiveUnusableWindow(until: number | undefined, now: number): boolean {
  if (typeof until !== "number" || !Number.isFinite(until)) {
    return false;
  }
  if (until <= 0) {
    return false;
  }
  return until > now;
}
/**
 * Infer the most likely reason all candidate profiles are currently unavailable.
 *
 * Every profile with an active window contributes to a per-reason score:
 * - an active `disabledUntil` window with a recognized `disabledReason` adds a
 *   large fixed weight, so explicit disable reasons (for example billing/auth)
 *   outrank generic cooldown buckets;
 * - otherwise an active `cooldownUntil` window adds the profile's recorded
 *   `failureCounts`, or a single `rate_limit` point when no usable count exists.
 *
 * The highest-scoring reason wins; ties go to the reason listed earlier in
 * `FAILURE_REASON_PRIORITY`.
 *
 * @param params.store - Auth profile store whose `usageStats` are inspected.
 * @param params.profileIds - Candidate profile ids; ids without stats are ignored.
 * @param params.now - Reference time (ms epoch); defaults to `Date.now()`.
 * @returns The inferred reason, or `null` when no candidate has an active
 *   cooldown/disabled window.
 */
export function resolveProfilesUnavailableReason(params: {
  store: AuthProfileStore;
  profileIds: string[];
  now?: number;
}): AuthProfileFailureReason | null {
  // Disabled reasons are explicit and high-signal; weight heavily.
  const DISABLED_REASON_WEIGHT = 1_000;
  const now = params.now ?? Date.now();
  const scores = new Map<AuthProfileFailureReason, number>();
  const addScore = (reason: AuthProfileFailureReason, value: number): void => {
    scores.set(reason, (scores.get(reason) ?? 0) + value);
  };

  for (const profileId of params.profileIds) {
    const stats = params.store.usageStats?.[profileId];
    if (!stats) {
      continue;
    }

    const disabledReason = stats.disabledReason;
    if (
      isActiveUnusableWindow(stats.disabledUntil, now) &&
      disabledReason &&
      FAILURE_REASON_SET.has(disabledReason)
    ) {
      addScore(disabledReason, DISABLED_REASON_WEIGHT);
      continue;
    }

    // A disabled window without a recognized reason falls through here and is
    // only counted if the profile also has an active cooldown.
    if (!isActiveUnusableWindow(stats.cooldownUntil, now)) {
      continue;
    }

    let recordedReason = false;
    for (const [rawReason, rawCount] of Object.entries(stats.failureCounts ?? {})) {
      const reason = rawReason as AuthProfileFailureReason;
      // Validate the count up front: only a count that is really added to the
      // scores may suppress the rate_limit fallback below. Previously a NaN or
      // Infinity count was dropped by the scorer but still marked the profile
      // as "has history", leaving an unavailable profile with no reason at all.
      if (
        !FAILURE_REASON_SET.has(reason) ||
        typeof rawCount !== "number" ||
        !Number.isFinite(rawCount) ||
        rawCount <= 0
      ) {
        continue;
      }
      addScore(reason, rawCount);
      recordedReason = true;
    }
    if (!recordedReason) {
      // Active cooldown with no usable history: assume a plain rate limit.
      addScore("rate_limit", 1);
    }
  }

  let best: AuthProfileFailureReason | null = null;
  let bestScore = -1;
  let bestPriority = Number.MAX_SAFE_INTEGER;
  // Map iteration follows insertion order, so the priority table (not the
  // order profiles were visited in) is what decides ties.
  for (const [reason, score] of scores) {
    const priority = FAILURE_REASON_ORDER.get(reason) ?? Number.MAX_SAFE_INTEGER;
    if (score > bestScore || (score === bestScore && priority < bestPriority)) {
      best = reason;
      bestScore = score;
      bestPriority = priority;
    }
  }
  return best;
}
/**
* Return the soonest `unusableUntil` timestamp (ms epoch) among the given
* profiles, or `null` when no profile has a recorded cooldown. Note: the

View File

@@ -8,6 +8,7 @@ vi.mock("./auth-profiles.js", () => ({
ensureAuthProfileStore: vi.fn(),
getSoonestCooldownExpiry: vi.fn(),
isProfileInCooldown: vi.fn(),
resolveProfilesUnavailableReason: vi.fn(),
resolveAuthProfileOrder: vi.fn(),
}));
@@ -15,6 +16,7 @@ import {
ensureAuthProfileStore,
getSoonestCooldownExpiry,
isProfileInCooldown,
resolveProfilesUnavailableReason,
resolveAuthProfileOrder,
} from "./auth-profiles.js";
import { _probeThrottleInternals, runWithModelFallback } from "./model-fallback.js";
@@ -22,6 +24,7 @@ import { _probeThrottleInternals, runWithModelFallback } from "./model-fallback.
const mockedEnsureAuthProfileStore = vi.mocked(ensureAuthProfileStore);
const mockedGetSoonestCooldownExpiry = vi.mocked(getSoonestCooldownExpiry);
const mockedIsProfileInCooldown = vi.mocked(isProfileInCooldown);
const mockedResolveProfilesUnavailableReason = vi.mocked(resolveProfilesUnavailableReason);
const mockedResolveAuthProfileOrder = vi.mocked(resolveAuthProfileOrder);
const makeCfg = makeModelFallbackCfg;
@@ -98,6 +101,7 @@ describe("runWithModelFallback probe logic", () => {
mockedIsProfileInCooldown.mockImplementation((_store, profileId: string) => {
return profileId.startsWith("openai");
});
mockedResolveProfilesUnavailableReason.mockReturnValue("rate_limit");
});
afterEach(() => {
@@ -119,6 +123,22 @@ describe("runWithModelFallback probe logic", () => {
expectFallbackUsed(result, run);
});
it("uses inferred unavailable reason when skipping a cooldowned primary model", async () => {
  const cfg = makeCfg();
  // Cooldown ends 30 minutes after NOW; the assertions below expect the
  // primary to be skipped and only the fallback model to run.
  const THIRTY_MINUTES_MS = 30 * 60 * 1000;
  mockedGetSoonestCooldownExpiry.mockReturnValue(NOW + THIRTY_MINUTES_MS);
  // The skip entry should carry whatever reason the resolver reports.
  mockedResolveProfilesUnavailableReason.mockReturnValue("billing");
  const run = vi.fn().mockResolvedValue("ok");

  const outcome = await runPrimaryCandidate(cfg, run);

  expect(outcome.result).toBe("ok");
  expect(run).toHaveBeenCalledTimes(1);
  expect(run).toHaveBeenCalledWith("anthropic", "claude-haiku-3-5");
  const [skippedPrimary] = outcome.attempts;
  expect(skippedPrimary?.reason).toBe("billing");
});
it("probes primary model when within 2-min margin of cooldown expiry", async () => {
const cfg = makeCfg();
// Cooldown expires in 1 minute — within 2-min probe margin

View File

@@ -348,6 +348,49 @@ describe("runWithModelFallback", () => {
expect(result.attempts[0]?.reason).toBe("rate_limit");
});
it("propagates disabled reason when all profiles are unavailable", async () => {
  // Unique provider id per run so this case does not share state with others.
  const provider = `disabled-test-${crypto.randomUUID()}`;
  const profileId = `${provider}:default`;
  const now = Date.now();
  const disabledUntil = now + 5 * 60_000;

  // The only profile is disabled for billing; the rate_limit counts are a
  // decoy that must not override the explicit disabled reason.
  const store: AuthProfileStore = {
    version: AUTH_STORE_VERSION,
    profiles: {
      [profileId]: { type: "api_key", provider, key: "test-key" },
    },
    usageStats: {
      [profileId]: {
        disabledUntil,
        disabledReason: "billing",
        failureCounts: { rate_limit: 4 },
      },
    },
  };
  const cfg = makeProviderFallbackCfg(provider);

  // Only the fallback provider may be invoked; anything else fails the test.
  const run = vi.fn().mockImplementation(async (providerId, modelId) => {
    if (providerId !== "fallback") {
      throw new Error(`unexpected provider: ${providerId}/${modelId}`);
    }
    return "ok";
  });

  const outcome = await runWithStoredAuth({ cfg, store, provider, run });

  expect(outcome.result).toBe("ok");
  expect(run.mock.calls).toEqual([["fallback", "ok-model"]]);
  expect(outcome.attempts[0]?.reason).toBe("billing");
});
it("does not skip when any profile is available", async () => {
const provider = `cooldown-mixed-${crypto.randomUUID()}`;
const profileA = `${provider}:a`;

View File

@@ -3,6 +3,7 @@ import {
ensureAuthProfileStore,
getSoonestCooldownExpiry,
isProfileInCooldown,
resolveProfilesUnavailableReason,
resolveAuthProfileOrder,
} from "./auth-profiles.js";
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
@@ -342,12 +343,18 @@ export async function runWithModelFallback<T>(params: {
profileIds,
});
if (!shouldProbe) {
const inferredReason =
resolveProfilesUnavailableReason({
store: authStore,
profileIds,
now,
}) ?? "rate_limit";
// Skip without attempting
attempts.push({
provider: candidate.provider,
model: candidate.model,
error: `Provider ${candidate.provider} is in cooldown (all profiles unavailable)`,
reason: "rate_limit",
reason: inferredReason,
});
continue;
}

View File

@@ -4,6 +4,7 @@ import path from "node:path";
import type { AssistantMessage } from "@mariozechner/pi-ai";
import { beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
import type { OpenClawConfig } from "../config/config.js";
import type { AuthProfileFailureReason } from "./auth-profiles.js";
import type { EmbeddedRunAttemptResult } from "./pi-embedded-runner/run/types.js";
const runEmbeddedAttemptMock = vi.fn<(params: unknown) => Promise<EmbeddedRunAttemptResult>>();
@@ -112,7 +113,16 @@ const writeAuthStore = async (
agentDir: string,
opts?: {
includeAnthropic?: boolean;
usageStats?: Record<string, { lastUsed?: number; cooldownUntil?: number }>;
usageStats?: Record<
string,
{
lastUsed?: number;
cooldownUntil?: number;
disabledUntil?: number;
disabledReason?: AuthProfileFailureReason;
failureCounts?: Partial<Record<AuthProfileFailureReason, number>>;
}
>;
},
) => {
const authPath = path.join(agentDir, "auth-profiles.json");
@@ -184,7 +194,17 @@ async function runAutoPinnedOpenAiTurn(params: {
async function readUsageStats(agentDir: string) {
const stored = JSON.parse(
await fs.readFile(path.join(agentDir, "auth-profiles.json"), "utf-8"),
) as { usageStats?: Record<string, { lastUsed?: number; cooldownUntil?: number }> };
) as {
usageStats?: Record<
string,
{
lastUsed?: number;
cooldownUntil?: number;
disabledUntil?: number;
disabledReason?: AuthProfileFailureReason;
}
>;
};
return stored.usageStats ?? {};
}
@@ -496,6 +516,50 @@ describe("runEmbeddedPiAgent auth profile rotation", () => {
});
});
it("fails over with disabled reason when all profiles are unavailable", async () => {
  await withTimedAgentWorkspace(async ({ agentDir, workspaceDir, now }) => {
    const ONE_HOUR_MS = 60 * 60 * 1000;
    const disabledUntil = now + ONE_HOUR_MS;

    // Both OpenAI profiles are disabled for billing; p1 also carries
    // rate_limit counts that must not win over the explicit disabled reason.
    await writeAuthStore(agentDir, {
      usageStats: {
        "openai:p1": {
          lastUsed: 1,
          disabledUntil,
          disabledReason: "billing",
          failureCounts: { rate_limit: 4 },
        },
        "openai:p2": {
          lastUsed: 2,
          disabledUntil,
          disabledReason: "billing",
        },
      },
    });

    const pendingRun = runEmbeddedPiAgent({
      sessionId: "session:test",
      sessionKey: "agent:test:disabled-failover",
      sessionFile: path.join(workspaceDir, "session.jsonl"),
      workspaceDir,
      agentDir,
      config: makeConfig({ fallbacks: ["openai/mock-2"] }),
      prompt: "hello",
      provider: "openai",
      model: "mock-1",
      authProfileIdSource: "auto",
      timeoutMs: 5_000,
      runId: "run:disabled-failover",
    });

    await expect(pendingRun).rejects.toMatchObject({
      name: "FailoverError",
      reason: "billing",
      provider: "openai",
      model: "mock-1",
    });

    // The run must fail before any embedded attempt is made.
    expect(runEmbeddedAttemptMock).not.toHaveBeenCalled();
  });
});
it("fails over when auth is unavailable and fallbacks are configured", async () => {
const previousOpenAiKey = process.env.OPENAI_API_KEY;
delete process.env.OPENAI_API_KEY;

View File

@@ -12,6 +12,7 @@ import {
markAuthProfileFailure,
markAuthProfileGood,
markAuthProfileUsed,
resolveProfilesUnavailableReason,
} from "../auth-profiles.js";
import {
CONTEXT_WINDOW_HARD_MIN_TOKENS,
@@ -364,9 +365,18 @@ export async function runEmbeddedPiAgent(
const resolveAuthProfileFailoverReason = (params: {
allInCooldown: boolean;
message: string;
profileIds?: Array<string | undefined>;
}): FailoverReason => {
if (params.allInCooldown) {
return "rate_limit";
const profileIds = (params.profileIds ?? profileCandidates).filter(
(id): id is string => typeof id === "string" && id.length > 0,
);
return (
resolveProfilesUnavailableReason({
store: authStore,
profileIds,
}) ?? "rate_limit"
);
}
const classified = classifyFailoverReason(params.message);
return classified ?? "auth";
@@ -385,6 +395,7 @@ export async function runEmbeddedPiAgent(
const reason = resolveAuthProfileFailoverReason({
allInCooldown: params.allInCooldown,
message,
profileIds: profileCandidates,
});
if (fallbackConfigured) {
throw new FailoverError(message, {