Files
openclaw/src/agents/failover-error.ts
Mert Başar 0ff5fe3a80 fix(auth): add force re-login recovery and fallback auth skips
Summary:
- Add forced provider re-login support that clears cached auth profiles before running provider login again.
- Add provider-auth remediation guidance and a session-scoped skip cache for known-bad fallback auth attempts.
- Wire session ids through agent command, auto-reply, and embedded compaction fallback callers so the skip cache applies on real run paths.
- Fail closed when forced auth profile removal cannot update the profile store.

Verification:
- Local format, lint, diff-check, focused Vitest shards, and autoreview passed.
- PR CI, CodeQL Security High, and Critical Quality agent-runtime-boundary passed on head 1b4e9e753e.

Co-authored-by: Mert Basar <MertBasar0@users.noreply.github.com>
2026-05-31 19:01:51 +01:00

670 lines
19 KiB
TypeScript

import { parseStrictNonNegativeInteger } from "@openclaw/normalization-core/number-coercion";
import { formatCliCommand } from "../cli/command-format.js";
import { readErrorName } from "../infra/errors.js";
import {
classifyFailoverSignal,
inferSignalStatus,
isUnclassifiedNoBodyHttpSignal,
type FailoverClassification,
type FailoverSignal,
} from "./embedded-agent-helpers/errors.js";
import { isTimeoutErrorMessage } from "./embedded-agent-helpers/errors.js";
import type { FailoverReason } from "./embedded-agent-helpers/types.js";
import { isSessionWriteLockAcquireError } from "./session-write-lock-error.js";
const ABORT_TIMEOUT_RE = /request was aborted|request aborted/i;
const MAX_FAILOVER_CAUSE_DEPTH = 25;
export class FailoverError extends Error {
readonly reason: FailoverReason;
readonly provider?: string;
readonly model?: string;
readonly profileId?: string;
readonly status?: number;
readonly code?: string;
readonly rawError?: string;
// Originating request attribution propagated through wrapper errors so
// structured log ingestion (e.g. api_health_log) can attribute exhausted
// failover failures back to a session/lane and the last attempted provider.
// See #42713.
readonly sessionId?: string;
readonly lane?: string;
readonly suspend?: boolean;
constructor(
message: string,
params: {
reason: FailoverReason;
provider?: string;
model?: string;
profileId?: string;
status?: number;
code?: string;
rawError?: string;
sessionId?: string;
lane?: string;
cause?: unknown;
suspend?: boolean;
},
) {
super(message, { cause: params.cause });
this.name = "FailoverError";
this.reason = params.reason;
this.provider = params.provider;
this.model = params.model;
this.profileId = params.profileId;
this.status = params.status;
this.code = params.code;
this.rawError = params.rawError;
this.sessionId = params.sessionId;
this.lane = params.lane;
this.suspend = params.suspend;
}
}
export function isFailoverError(err: unknown): err is FailoverError {
if (err instanceof FailoverError) {
return true;
}
return Boolean(
err &&
typeof err === "object" &&
(err as { name?: unknown }).name === "FailoverError" &&
typeof (err as { reason?: unknown }).reason === "string",
);
}
export function resolveFailoverStatus(reason: FailoverReason): number | undefined {
switch (reason) {
case "billing":
return 402;
case "server_error":
return 500;
case "rate_limit":
return 429;
case "overloaded":
return 503;
case "auth":
return 401;
case "auth_permanent":
return 403;
case "timeout":
return 408;
case "format":
return 400;
case "model_not_found":
return 404;
case "session_expired":
return 410; // Gone - session no longer exists
default:
return undefined;
}
}
function findErrorProperty<T>(
err: unknown,
reader: (candidate: unknown) => T | undefined,
seen: Set<object> = new Set(),
): T | undefined {
const direct = reader(err);
if (direct !== undefined) {
return direct;
}
if (!err || typeof err !== "object") {
return undefined;
}
if (seen.has(err)) {
return undefined;
}
seen.add(err);
const candidate = err as { error?: unknown; cause?: unknown };
return (
findErrorProperty(candidate.error, reader, seen) ??
findErrorProperty(candidate.cause, reader, seen)
);
}
function readDirectStatusCode(err: unknown): number | undefined {
if (!err || typeof err !== "object") {
return undefined;
}
const candidate =
(err as { status?: unknown; statusCode?: unknown }).status ??
(err as { statusCode?: unknown }).statusCode;
if (typeof candidate === "number") {
return candidate;
}
if (typeof candidate === "string") {
return parseStrictNonNegativeInteger(candidate);
}
return undefined;
}
function getStatusCode(err: unknown): number | undefined {
return findErrorProperty(err, readDirectStatusCode);
}
function readDirectErrorCode(err: unknown): string | undefined {
if (!err || typeof err !== "object") {
return undefined;
}
const directCode = (err as { code?: unknown }).code;
if (typeof directCode === "string") {
const trimmed = directCode.trim();
return trimmed ? trimmed : undefined;
}
const detailCode = (err as { detail?: { code?: unknown } }).detail?.code;
if (typeof detailCode === "string") {
const trimmed = detailCode.trim();
return trimmed ? trimmed : undefined;
}
const status = (err as { status?: unknown }).status;
if (typeof status !== "string" || /^\d+$/.test(status)) {
return undefined;
}
const trimmed = status.trim();
return trimmed ? trimmed : undefined;
}
function getErrorCode(err: unknown): string | undefined {
return findErrorProperty(err, readDirectErrorCode);
}
function isStableProviderErrorType(value: string): boolean {
if (
/^(?:api|authentication|invalid_request|not_found|overloaded|permission|rate_limit|server)_error$/i.test(
value,
)
) {
return false;
}
return /^[A-Z][A-Z0-9_:-]*$/.test(value);
}
function readDirectErrorType(err: unknown): string | undefined {
if (!err || typeof err !== "object") {
return undefined;
}
const directType = (err as { errorType?: unknown }).errorType;
if (typeof directType === "string") {
const trimmed = directType.trim();
return trimmed && isStableProviderErrorType(trimmed) ? trimmed : undefined;
}
const detailType = (err as { detail?: { type?: unknown } }).detail?.type;
if (typeof detailType === "string") {
const trimmed = detailType.trim();
return trimmed && isStableProviderErrorType(trimmed) ? trimmed : undefined;
}
const type = (err as { type?: unknown }).type;
if (typeof type === "string") {
const trimmed = type.trim();
if (!trimmed || /^(?:error|exception)$/i.test(trimmed)) {
return undefined;
}
return isStableProviderErrorType(trimmed) ? trimmed : undefined;
}
return undefined;
}
function getErrorType(err: unknown): string | undefined {
return findErrorProperty(err, readDirectErrorType);
}
function readDirectProvider(err: unknown): string | undefined {
if (!err || typeof err !== "object") {
return undefined;
}
const provider = (err as { provider?: unknown }).provider;
if (typeof provider !== "string") {
return undefined;
}
const trimmed = provider.trim();
return trimmed || undefined;
}
function getProvider(err: unknown): string | undefined {
return findErrorProperty(err, readDirectProvider);
}
function readDirectErrorMessage(err: unknown): string | undefined {
if (err instanceof Error) {
return err.message || undefined;
}
if (typeof err === "string") {
return err || undefined;
}
if (typeof err === "number" || typeof err === "boolean" || typeof err === "bigint") {
return String(err);
}
if (typeof err === "symbol") {
return err.description ?? undefined;
}
if (err && typeof err === "object") {
const message = (err as { message?: unknown }).message;
if (typeof message === "string") {
return message || undefined;
}
}
return undefined;
}
function getErrorMessage(err: unknown): string {
return findErrorProperty(err, readDirectErrorMessage) ?? "";
}
function normalizeDirectErrorSignal(err: unknown): FailoverSignal {
const message = readDirectErrorMessage(err);
return {
status: readDirectStatusCode(err),
code: readDirectErrorCode(err),
errorType: readDirectErrorType(err),
message: message || undefined,
provider: readDirectProvider(err),
};
}
function hasSessionWriteLockContention(err: unknown, seen: Set<object> = new Set()): boolean {
if (isSessionWriteLockAcquireError(err)) {
return true;
}
if (!err || typeof err !== "object") {
return false;
}
if (seen.has(err)) {
return false;
}
seen.add(err);
const candidate = err as { error?: unknown; cause?: unknown; reason?: unknown };
return (
hasSessionWriteLockContention(candidate.error, seen) ||
hasSessionWriteLockContention(candidate.cause, seen) ||
hasSessionWriteLockContention(candidate.reason, seen)
);
}
function isEmbeddedAttemptSessionTakeover(err: unknown): boolean {
// Match by name to avoid importing embedded-agent-runner here (would create a cycle).
return Boolean(
err && typeof err === "object" && readErrorName(err) === "EmbeddedAttemptSessionTakeoverError",
);
}
function hasEmbeddedAttemptSessionTakeover(err: unknown, seen: Set<object> = new Set()): boolean {
if (isEmbeddedAttemptSessionTakeover(err)) {
return true;
}
if (!err || typeof err !== "object") {
return false;
}
if (seen.has(err)) {
return false;
}
seen.add(err);
const candidate = err as { error?: unknown; cause?: unknown; reason?: unknown };
return (
hasEmbeddedAttemptSessionTakeover(candidate.error, seen) ||
hasEmbeddedAttemptSessionTakeover(candidate.cause, seen) ||
hasEmbeddedAttemptSessionTakeover(candidate.reason, seen)
);
}
/**
* True when the error is a local runtime coordination error (session write-lock
* timeout or embedded attempt session takeover) rather than a provider/model
* failure. The model fallback chain must abort on these instead of consuming
* candidate slots — retrying any model would hit the same local condition.
* See #83510.
*/
export function isNonProviderRuntimeCoordinationError(err: unknown): boolean {
if (!hasSessionWriteLockContention(err) && !hasEmbeddedAttemptSessionTakeover(err)) {
return false;
}
if (isFailoverError(err)) {
return false;
}
if (isEmbeddedAttemptSessionTakeover(err)) {
return true;
}
return resolveFailoverClassificationFromError(err) === null;
}
function hasTimeoutHint(err: unknown): boolean {
if (!err) {
return false;
}
if (hasSessionWriteLockContention(err)) {
return false;
}
if (readErrorName(err) === "TimeoutError") {
return true;
}
const message = getErrorMessage(err);
return Boolean(message && isTimeoutErrorMessage(message));
}
export function isTimeoutError(err: unknown): boolean {
if (hasTimeoutHint(err)) {
return true;
}
if (!err || typeof err !== "object") {
return false;
}
if (readErrorName(err) !== "AbortError") {
return false;
}
if (hasSessionWriteLockContention(err)) {
return false;
}
const message = getErrorMessage(err);
if (message && ABORT_TIMEOUT_RE.test(message)) {
return true;
}
const cause = "cause" in err ? (err as { cause?: unknown }).cause : undefined;
const reason = "reason" in err ? (err as { reason?: unknown }).reason : undefined;
return hasTimeoutHint(cause) || hasTimeoutHint(reason);
}
function failoverReasonFromClassification(
classification: FailoverClassification | null,
): FailoverReason | null {
return classification?.kind === "reason" ? classification.reason : null;
}
function normalizeErrorSignal(err: unknown, providerHint?: string): FailoverSignal {
const message = getErrorMessage(err);
return {
status: getStatusCode(err),
code: getErrorCode(err),
errorType: getErrorType(err),
message: message || undefined,
provider: getProvider(err) ?? providerHint,
};
}
function getNestedErrorCandidates(err: unknown): unknown[] {
if (!err || typeof err !== "object") {
return [];
}
const candidate = err as { error?: unknown; cause?: unknown };
return [candidate.error, candidate.cause].filter(
(value): value is unknown => value !== undefined && value !== err,
);
}
function isFormatClassification(classification: FailoverClassification | null): boolean {
return classification?.kind === "reason" && classification.reason === "format";
}
function decideNestedFormatOverride(
candidate: unknown,
inheritedStatus: number | undefined,
seen: Set<object>,
depth: number,
): boolean | null {
if (depth > MAX_FAILOVER_CAUSE_DEPTH) {
return null;
}
if (candidate && typeof candidate === "object") {
if (seen.has(candidate)) {
return null;
}
seen.add(candidate);
}
const directSignal = normalizeDirectErrorSignal(candidate);
const nestedCandidates = getNestedErrorCandidates(candidate);
const nestedStatus = directSignal.status ?? inheritedStatus;
const hasDirectMessage = Boolean(directSignal.message?.trim());
if (
hasDirectMessage &&
isUnclassifiedNoBodyHttpSignal({ ...directSignal, status: nestedStatus })
) {
return true;
}
if (hasDirectMessage && (nestedCandidates.length === 0 || classifyFailoverSignal(directSignal))) {
return false;
}
for (const nestedCandidate of nestedCandidates) {
const decision = decideNestedFormatOverride(nestedCandidate, nestedStatus, seen, depth + 1);
if (decision !== null) {
return decision;
}
}
return null;
}
function resolveFailoverClassificationFromErrorInternal(
err: unknown,
seen: Set<object>,
depth: number,
providerHint?: string,
): FailoverClassification | null {
if (depth > MAX_FAILOVER_CAUSE_DEPTH) {
return null;
}
if (err && typeof err === "object") {
if (seen.has(err)) {
return null;
}
seen.add(err);
}
if (isFailoverError(err)) {
return {
kind: "reason",
reason: err.reason,
};
}
const signal = normalizeErrorSignal(err, providerHint);
const codeReason = signal.code
? failoverReasonFromClassification(classifyFailoverSignal({ code: signal.code }))
: null;
const hasExplicitFailoverMetadata =
typeof inferSignalStatus(signal) === "number" ||
(codeReason !== null && codeReason !== "timeout");
const hasSessionLock = hasSessionWriteLockContention(err);
const classification = classifyFailoverSignal(signal);
const nestedCandidates = getNestedErrorCandidates(err);
if (!classification || classification.kind === "context_overflow") {
for (const candidate of nestedCandidates) {
const nestedClassification = resolveFailoverClassificationFromErrorInternal(
candidate,
seen,
depth + 1,
providerHint,
);
if (nestedClassification) {
if (hasSessionLock && !hasExplicitFailoverMetadata) {
return null;
}
return nestedClassification;
}
}
}
if (isFormatClassification(classification)) {
for (const candidate of nestedCandidates) {
const shouldClearFormat = decideNestedFormatOverride(
candidate,
signal.status,
seen,
depth + 1,
);
if (shouldClearFormat === true) {
return null;
}
if (shouldClearFormat === false) {
break;
}
}
}
if (classification) {
if (hasSessionLock && !hasExplicitFailoverMetadata) {
return null;
}
return classification;
}
if (hasSessionLock) {
return null;
}
if (isTimeoutError(err)) {
return {
kind: "reason",
reason: "timeout",
};
}
return null;
}
function resolveFailoverClassificationFromError(
err: unknown,
providerHint?: string,
): FailoverClassification | null {
return resolveFailoverClassificationFromErrorInternal(err, new Set<object>(), 0, providerHint);
}
export function resolveFailoverReasonFromError(
err: unknown,
providerHint?: string,
): FailoverReason | null {
return failoverReasonFromClassification(
resolveFailoverClassificationFromError(err, providerHint),
);
}
/**
* Build an actionable remediation hint for a failover error when the failure
* reason is `auth` / `auth_permanent` and we have enough provider attribution
* to suggest a re-authentication command. Returns `undefined` for any other
* failure shape so callers can opportunistically append the hint without
* branching on every reason themselves.
*
* Keep the string short and copy-pasteable — operators see it in fallback
* summary errors and TUI status lines.
*/
export function buildFailoverRemediationHint(err: unknown): string | undefined {
if (!isFailoverError(err)) {
return undefined;
}
if (err.reason !== "auth" && err.reason !== "auth_permanent") {
return undefined;
}
const provider = err.provider?.trim();
if (!provider) {
return undefined;
}
const command = buildProviderReauthCommand(provider);
return command ? `Re-authenticate with: ${command}` : undefined;
}
function quotePosixShellArg(value: string): string {
return `'${value.replaceAll("'", "'\\''")}'`;
}
export function buildProviderReauthCommand(
provider: string,
env: Record<string, string | undefined> = process.env as Record<string, string | undefined>,
): string | undefined {
const trimmed = provider.trim();
if (!trimmed || hasControlCharacter(trimmed)) {
return undefined;
}
return formatCliCommand(
`openclaw models auth login --provider ${quotePosixShellArg(trimmed)} --force`,
env,
);
}
function hasControlCharacter(value: string): boolean {
for (let i = 0; i < value.length; i += 1) {
const code = value.charCodeAt(i);
if (code < 0x20 || code === 0x7f) {
return true;
}
}
return false;
}
export function describeFailoverError(err: unknown): {
message: string;
rawError?: string;
reason?: FailoverReason;
status?: number;
code?: string;
provider?: string;
model?: string;
profileId?: string;
sessionId?: string;
lane?: string;
} {
if (isFailoverError(err)) {
return {
message: err.message,
rawError: err.rawError,
reason: err.reason,
status: err.status,
code: err.code,
provider: err.provider,
model: err.model,
profileId: err.profileId,
sessionId: err.sessionId,
lane: err.lane,
};
}
const signal = normalizeErrorSignal(err);
const message = signal.message ?? String(err);
return {
message,
reason: resolveFailoverReasonFromError(err) ?? undefined,
status: signal.status,
code: signal.code,
provider: signal.provider,
};
}
export function coerceToFailoverError(
err: unknown,
context?: {
provider?: string;
model?: string;
profileId?: string;
sessionId?: string;
lane?: string;
},
): FailoverError | null {
if (isFailoverError(err)) {
return err;
}
const reason = resolveFailoverReasonFromError(err, context?.provider);
if (!reason) {
return null;
}
const signal = normalizeErrorSignal(err);
const message = signal.message ?? String(err);
const status = signal.status ?? resolveFailoverStatus(reason);
const code = signal.code;
// Suspend when hitting rate limits or billing issues in an attributed session
const shouldSuspend =
Boolean(context?.sessionId) && (reason === "rate_limit" || reason === "billing");
return new FailoverError(message, {
reason,
provider: context?.provider ?? signal.provider,
model: context?.model,
profileId: context?.profileId,
sessionId: context?.sessionId,
lane: context?.lane,
status,
code,
rawError: message,
cause: err instanceof Error ? err : undefined,
suspend: shouldSuspend,
});
}