fix(agents,failover): propagate sessionId/lane/provider attribution through FailoverError (#73506)

* fix(agents,failover): propagate sessionId/lane/provider attribution through FailoverError

Adds optional `sessionId` and `lane` fields to `FailoverError` and threads
them — together with the existing `provider`, `model`, `profileId` — through
`describeFailoverError` and `coerceToFailoverError` context, so structured
error log ingestion can attribute exhausted-fallback wrapper errors back
to the originating request instead of dropping the per-profile metadata
when the final wrapper is built.

Fixes #42713.

* fix: preserve failover error attribution

---------

Co-authored-by: Altay <altay@uinaf.dev>
This commit is contained in:
wenxu007
2026-05-01 16:26:56 +08:00
committed by GitHub
parent 29ed5266bf
commit 9df0ae6767
15 changed files with 227 additions and 7 deletions

View File

@@ -87,6 +87,7 @@ Docs: https://docs.openclaw.ai
- fix(logging): add redaction patterns for Tencent Cloud, Alibaba Cloud, HuggingFace and Replicate API keys (#58162). Thanks @gavyngong
- Pairing: surface unexpected allowlist filesystem stat errors instead of treating the allowlist as missing, so permission and I/O failures are visible during pairing authorization checks. (#63324) Thanks @franciscomaestre.
- macOS app: reserve layout space for exec approval command details so the allow dialog no longer overlaps the command, context, and action buttons. (#75470) Thanks @ngutman.
- Agents/failover: carry `sessionId`, `lane`, `provider`, `model`, and `profileId` attribution through `FailoverError`, `describeFailoverError`, and `coerceToFailoverError` so structured error logs (e.g. `gateway.err.log` ingestion) can attribute exhausted-fallback wrapper errors to the originating session and last-attempted provider, instead of dropping the per-profile metadata when the final wrapper error is built. Fixes #42713. (#73506) Thanks @wenxu007.
## 2026.4.29

View File

@@ -94,6 +94,7 @@ function buildPreparedContext(params?: {
sessionKey?: string;
cliSessionId?: string;
runId?: string;
lane?: string;
openClawHistoryPrompt?: string;
}): PreparedCliRunContext {
const backend = {
@@ -117,6 +118,7 @@ function buildPreparedContext(params?: {
thinkLevel: "low",
timeoutMs: 1_000,
runId: params?.runId ?? "run-2",
lane: params?.lane,
},
started: Date.now(),
workspaceDir: "/tmp",
@@ -173,6 +175,36 @@ describe("runCliAgent reliability", () => {
).rejects.toThrow("produced no output");
});
it("adds request attribution to CLI watchdog failover errors", async () => {
  // Simulate a managed run that ended with SIGKILL after a no-output timeout.
  const stalledRun = createManagedRun({
    reason: "no-output-timeout",
    exitCode: null,
    exitSignal: "SIGKILL",
    durationMs: 200,
    stdout: "",
    stderr: "",
    timedOut: true,
    noOutputTimedOut: true,
  });
  supervisorSpawnMock.mockResolvedValueOnce(stalledRun);

  const preparedContext = buildPreparedContext({
    cliSessionId: "thread-123",
    lane: "custom-lane",
    runId: "run-attribution",
  });
  const pendingRun = executePreparedCliRun(preparedContext, "thread-123");

  // The rejection must carry the lane passed above.
  // NOTE(review): "s1" is presumably the fixture's default session id — confirm in buildPreparedContext.
  await expect(pendingRun).rejects.toMatchObject({
    name: "FailoverError",
    sessionId: "s1",
    lane: "custom-lane",
  });
});
it("enqueues a system event and heartbeat wake on no-output watchdog timeout for session runs", async () => {
supervisorSpawnMock.mockResolvedValueOnce(
createManagedRun({

View File

@@ -188,6 +188,8 @@ export async function runPreparedCliAgent(
reason,
provider: params.provider,
model: context.modelId,
sessionId: params.sessionId,
lane: params.lane,
status,
});
}

View File

@@ -553,6 +553,8 @@ export async function executePreparedCliRun(
reason: "timeout",
provider: params.provider,
model: context.modelId,
sessionId: params.sessionId,
lane: params.lane,
status: resolveFailoverStatus("timeout"),
});
}
@@ -562,6 +564,8 @@ export async function executePreparedCliRun(
reason: "timeout",
provider: params.provider,
model: context.modelId,
sessionId: params.sessionId,
lane: params.lane,
status: resolveFailoverStatus("timeout"),
});
}
@@ -576,6 +580,8 @@ export async function executePreparedCliRun(
reason,
provider: params.provider,
model: context.modelId,
sessionId: params.sessionId,
lane: params.lane,
status,
});
}

View File

@@ -29,6 +29,7 @@ export type RunCliAgentParams = {
thinkLevel?: ThinkLevel;
timeoutMs: number;
runId: string;
lane?: string;
jobId?: string;
extraSystemPrompt?: string;
sourceReplyDeliveryMode?: SourceReplyDeliveryMode;

View File

@@ -972,4 +972,40 @@ describe("failover-error", () => {
expect(described.message).toBe("123");
expect(described.reason).toBeUndefined();
});
it("propagates sessionId/lane/provider attribution through FailoverError (#42713)", () => {
  const sessionId = "session:browser-abcd";
  const lane = "answer";

  // Build an error that carries every attribution field the constructor accepts.
  const failover = new FailoverError("all fallbacks exhausted", {
    reason: "rate_limit",
    provider: "anthropic",
    model: "claude-opus-4-6",
    profileId: "profile-2",
    sessionId,
    lane,
    status: 429,
  });

  // The new fields are exposed directly on the instance...
  expect(failover.sessionId).toBe(sessionId);
  expect(failover.lane).toBe(lane);

  // ...and echoed, together with the existing ones, by describeFailoverError.
  const described = describeFailoverError(failover);
  expect(described).toMatchObject({
    provider: "anthropic",
    model: "claude-opus-4-6",
    profileId: "profile-2",
    sessionId,
    lane,
    reason: "rate_limit",
    status: 429,
  });
});
it("coerceToFailoverError carries sessionId/lane from context (#42713)", () => {
  // Attribution supplied by the caller alongside the raw error value.
  const context = {
    provider: "openai",
    model: "gpt-5",
    profileId: "p1",
    sessionId: "session:browser-1234",
    lane: "draft",
  };

  const coerced = coerceToFailoverError("rate limit exceeded", context);

  // A null result would surface as `undefined` here and fail each expectation.
  expect(coerced?.sessionId).toBe("session:browser-1234");
  expect(coerced?.lane).toBe("draft");
  expect(coerced?.provider).toBe("openai");
});
});

View File

@@ -21,6 +21,12 @@ export class FailoverError extends Error {
readonly status?: number;
readonly code?: string;
readonly rawError?: string;
// Originating request attribution propagated through wrapper errors so
// structured log ingestion (e.g. api_health_log) can attribute exhausted
// failover failures back to a session/lane and the last attempted provider.
// See #42713.
readonly sessionId?: string;
readonly lane?: string;
constructor(
message: string,
@@ -32,6 +38,8 @@ export class FailoverError extends Error {
status?: number;
code?: string;
rawError?: string;
sessionId?: string;
lane?: string;
cause?: unknown;
},
) {
@@ -44,6 +52,8 @@ export class FailoverError extends Error {
this.status = params.status;
this.code = params.code;
this.rawError = params.rawError;
this.sessionId = params.sessionId;
this.lane = params.lane;
}
}
@@ -422,6 +432,11 @@ export function describeFailoverError(err: unknown): {
reason?: FailoverReason;
status?: number;
code?: string;
provider?: string;
model?: string;
profileId?: string;
sessionId?: string;
lane?: string;
} {
if (isFailoverError(err)) {
return {
@@ -430,6 +445,11 @@ export function describeFailoverError(err: unknown): {
reason: err.reason,
status: err.status,
code: err.code,
provider: err.provider,
model: err.model,
profileId: err.profileId,
sessionId: err.sessionId,
lane: err.lane,
};
}
const signal = normalizeErrorSignal(err);
@@ -439,6 +459,7 @@ export function describeFailoverError(err: unknown): {
reason: resolveFailoverReasonFromError(err) ?? undefined,
status: signal.status,
code: signal.code,
provider: signal.provider,
};
}
@@ -448,6 +469,8 @@ export function coerceToFailoverError(
provider?: string;
model?: string;
profileId?: string;
sessionId?: string;
lane?: string;
},
): FailoverError | null {
if (isFailoverError(err)) {
@@ -465,9 +488,11 @@ export function coerceToFailoverError(
return new FailoverError(message, {
reason,
provider: context?.provider,
provider: context?.provider ?? signal.provider,
model: context?.model,
profileId: context?.profileId,
sessionId: context?.sessionId,
lane: context?.lane,
status,
code,
rawError: message,

View File

@@ -46,6 +46,8 @@ export type ModelFallbackDecisionParams = {
| "candidate_failed"
| "candidate_succeeded";
runId?: string;
sessionId?: string;
lane?: string;
requestedProvider: string;
requestedModel: string;
candidate: ModelCandidate;
@@ -145,6 +147,8 @@ export function logModelFallbackDecision(
event: "model_fallback_decision",
tags: ["error_handling", "model_fallback", params.decision],
runId: params.runId,
sessionId: params.sessionId,
lane: params.lane,
decision: params.decision,
requestedProvider: params.requestedProvider,
requestedModel: params.requestedModel,

View File

@@ -8,7 +8,11 @@ import { AUTH_STORE_VERSION } from "./auth-profiles/constants.js";
import type { AuthProfileStore } from "./auth-profiles/types.js";
import { FailoverError } from "./failover-error.js";
import { LiveSessionModelSwitchError } from "./live-model-switch-error.js";
import { runWithImageModelFallback, runWithModelFallback } from "./model-fallback.js";
import {
FallbackSummaryError,
runWithImageModelFallback,
runWithModelFallback,
} from "./model-fallback.js";
import { classifyEmbeddedPiRunResultForModelFallback } from "./pi-embedded-runner/result-fallback-classifier.js";
import type { EmbeddedPiRunResult } from "./pi-embedded-runner/types.js";
import { makeModelFallbackCfg } from "./test-helpers/model-fallback-config-fixture.js";
@@ -474,6 +478,51 @@ describe("runWithModelFallback", () => {
});
});
it("carries request attribution through exhausted fallback summaries", async () => {
  const attribution = { sessionId: "session:browser-42713", lane: "answer" };
  const cfg = makeCfg({
    agents: {
      defaults: {
        model: {
          primary: "openai/gpt-5.4",
          fallbacks: ["anthropic/claude-opus-4-6"],
        },
      },
    },
  });

  // Both the primary and the single configured fallback reject.
  const rateLimited = Object.assign(new Error("rate limit exceeded"), { status: 429 });
  const overloaded = Object.assign(new Error("overloaded"), { status: 503 });
  const run = vi.fn().mockRejectedValueOnce(rateLimited).mockRejectedValueOnce(overloaded);

  try {
    await runWithModelFallback({
      cfg,
      provider: "openai",
      model: "gpt-5.4",
      runId: "run-42713",
      ...attribution,
      run,
    });
    // Reaching this line means the call resolved; the generic Error thrown here
    // then fails the instance check in the catch block below.
    throw new Error("expected fallback summary");
  } catch (err) {
    expect(err).toBeInstanceOf(FallbackSummaryError);
    if (!(err instanceof FallbackSummaryError)) {
      throw err;
    }
    // The summary wrapper carries the request attribution...
    expect(err).toMatchObject({ name: "FallbackSummaryError", ...attribution });
    // ...and so does the FailoverError it wraps as its cause.
    expect(err.cause).toMatchObject({ name: "FailoverError", ...attribution });
  }
});
it("uses optional result classification to continue to configured fallbacks", async () => {
const cfg = makeCfg({
agents: {

View File

@@ -42,6 +42,11 @@ import type { FailoverReason } from "./pi-embedded-helpers/types.js";
const log = createSubsystemLogger("model-fallback");
/**
 * Request attribution threaded through the fallback helpers in this module so
 * that the errors they build can be traced back to the originating request.
 * Both fields are optional and are forwarded as given.
 */
type FailoverAttribution = {
// Session id of the originating request (from `runWithModelFallback`'s `sessionId` param).
sessionId?: string;
// Lane of the originating request (from `runWithModelFallback`'s `lane` param).
lane?: string;
};
/**
* Structured error thrown when all model fallback candidates have been
* exhausted. Carries per-attempt details so callers can build informative
@@ -50,17 +55,22 @@ const log = createSubsystemLogger("model-fallback");
export class FallbackSummaryError extends Error {
  /** Per-candidate attempt records, stored exactly as supplied by the caller. */
  readonly attempts: FallbackAttempt[];
  /** Soonest cooldown expiry supplied by the caller, or `null` when none was given. */
  readonly soonestCooldownExpiry: number | null;
  /** Session id of the originating request, when attribution was provided. */
  readonly sessionId?: string;
  /** Lane of the originating request, when attribution was provided. */
  readonly lane?: string;

  /**
   * @param message Human-readable summary passed to `Error`.
   * @param attempts Attempt records to expose on `attempts`.
   * @param soonestCooldownExpiry Value to expose on `soonestCooldownExpiry`.
   * @param cause Optional underlying error, forwarded as the `cause` option of `Error`.
   * @param attribution Optional session/lane attribution copied onto the instance.
   */
  constructor(
    message: string,
    attempts: FallbackAttempt[],
    soonestCooldownExpiry: number | null,
    cause?: Error,
    attribution?: FailoverAttribution,
  ) {
    super(message, { cause });
    // Missing attribution leaves both fields undefined.
    const { sessionId, lane } = attribution ?? {};
    this.name = "FallbackSummaryError";
    this.attempts = attempts;
    this.soonestCooldownExpiry = soonestCooldownExpiry;
    this.sessionId = sessionId;
    this.lane = lane;
  }
}
@@ -197,6 +207,7 @@ async function runFallbackCandidate<T>(params: {
provider: string;
model: string;
options?: ModelFallbackRunOptions;
attribution?: FailoverAttribution;
}): Promise<{ ok: true; result: T } | { ok: false; error: unknown }> {
try {
const result = params.options
@@ -212,6 +223,8 @@ async function runFallbackCandidate<T>(params: {
const normalizedFailover = coerceToFailoverError(err, {
provider: params.provider,
model: params.model,
sessionId: params.attribution?.sessionId,
lane: params.attribution?.lane,
});
if (shouldRethrowAbort(err) && !normalizedFailover) {
throw err;
@@ -229,12 +242,14 @@ async function runFallbackAttempt<T>(params: {
classifyResult?: ModelFallbackResultClassifier<T>;
attempt: number;
total: number;
attribution?: FailoverAttribution;
}): Promise<{ success: ModelFallbackRunResult<T> } | { error: unknown }> {
const runResult = await runFallbackCandidate({
run: params.run,
provider: params.provider,
model: params.model,
options: params.options,
attribution: params.attribution,
});
if (runResult.ok) {
const classification = await params.classifyResult?.({
@@ -247,6 +262,7 @@ async function runFallbackAttempt<T>(params: {
const classifiedError = resolveResultClassificationError(classification, {
provider: params.provider,
model: params.model,
attribution: params.attribution,
});
if (classifiedError) {
return { error: classifiedError };
@@ -265,7 +281,7 @@ async function runFallbackAttempt<T>(params: {
function resolveResultClassificationError(
classification: ModelFallbackResultClassification,
params: { provider: string; model: string },
params: { provider: string; model: string; attribution?: FailoverAttribution },
) {
if (!classification) {
return null;
@@ -281,6 +297,8 @@ function resolveResultClassificationError(
reason: classification.reason ?? "unknown",
provider: params.provider,
model: params.model,
sessionId: params.attribution?.sessionId,
lane: params.attribution?.lane,
status: classification.status,
code: classification.code,
rawError: classification.rawError,
@@ -296,6 +314,8 @@ function recordFailedCandidateAttempt(params: {
candidate: ModelCandidate;
error: unknown;
runId?: string;
sessionId?: string;
lane?: string;
requestedProvider?: string;
requestedModel?: string;
attempt: number;
@@ -317,6 +337,8 @@ function recordFailedCandidateAttempt(params: {
return logModelFallbackDecision({
decision: "candidate_failed",
runId: params.runId,
sessionId: params.sessionId,
lane: params.lane,
requestedProvider: params.requestedProvider ?? params.candidate.provider,
requestedModel: params.requestedModel ?? params.candidate.model,
candidate: params.candidate,
@@ -355,6 +377,7 @@ function throwFallbackFailureSummary(params: {
label: string;
formatAttempt: (attempt: FallbackAttempt) => string;
soonestCooldownExpiry?: number | null;
attribution?: FailoverAttribution;
}): never {
if (params.attempts.length <= 1 && params.lastError) {
throw params.lastError;
@@ -366,6 +389,7 @@ function throwFallbackFailureSummary(params: {
params.attempts,
params.soonestCooldownExpiry ?? null,
params.lastError instanceof Error ? params.lastError : undefined,
params.attribution,
);
}
@@ -757,6 +781,8 @@ export async function runWithModelFallback<T>(params: {
provider: string;
model: string;
runId?: string;
sessionId?: string;
lane?: string;
agentDir?: string;
/** Optional explicit fallbacks list; when provided (even empty), replaces agents.defaults.model.fallbacks. */
fallbacksOverride?: string[];
@@ -849,6 +875,8 @@ export async function runWithModelFallback<T>(params: {
await observeDecision({
decision: "skip_candidate",
runId: params.runId,
sessionId: params.sessionId,
lane: params.lane,
requestedProvider: params.provider,
requestedModel: params.model,
candidate,
@@ -884,6 +912,8 @@ export async function runWithModelFallback<T>(params: {
await observeDecision({
decision: "skip_candidate",
runId: params.runId,
sessionId: params.sessionId,
lane: params.lane,
requestedProvider: params.provider,
requestedModel: params.model,
candidate,
@@ -908,6 +938,8 @@ export async function runWithModelFallback<T>(params: {
await observeDecision({
decision: "probe_cooldown_candidate",
runId: params.runId,
sessionId: params.sessionId,
lane: params.lane,
requestedProvider: params.provider,
requestedModel: params.model,
candidate,
@@ -932,12 +964,15 @@ export async function runWithModelFallback<T>(params: {
classifyResult: params.classifyResult,
attempt: i + 1,
total: candidates.length,
attribution: { sessionId: params.sessionId, lane: params.lane },
});
if ("success" in attemptRun) {
if (i > 0 || attempts.length > 0 || attemptedDuringCooldown) {
await observeDecision({
decision: "candidate_succeeded",
runId: params.runId,
sessionId: params.sessionId,
lane: params.lane,
requestedProvider: params.provider,
requestedModel: params.model,
candidate,
@@ -978,6 +1013,8 @@ export async function runWithModelFallback<T>(params: {
coerceToFailoverError(err, {
provider: candidate.provider,
model: candidate.model,
sessionId: params.sessionId,
lane: params.lane,
}) ?? err;
// LiveSessionModelSwitchError during fallback may point at a later
@@ -1001,6 +1038,8 @@ export async function runWithModelFallback<T>(params: {
reason: "unknown",
provider: candidate.provider,
model: candidate.model,
sessionId: params.sessionId,
lane: params.lane,
});
lastError = switchNormalized;
await observeFailedCandidate({
@@ -1008,6 +1047,8 @@ export async function runWithModelFallback<T>(params: {
candidate,
error: switchNormalized,
runId: params.runId,
sessionId: params.sessionId,
lane: params.lane,
requestedProvider: params.provider,
requestedModel: params.model,
attempt: i + 1,
@@ -1034,6 +1075,8 @@ export async function runWithModelFallback<T>(params: {
candidate,
error: normalized,
runId: params.runId,
sessionId: params.sessionId,
lane: params.lane,
requestedProvider: params.provider,
requestedModel: params.model,
attempt: i + 1,
@@ -1069,6 +1112,7 @@ export async function runWithModelFallback<T>(params: {
cfg: params.cfg,
candidates,
}),
attribution: { sessionId: params.sessionId, lane: params.lane },
});
}

View File

@@ -495,6 +495,8 @@ export async function runEmbeddedPiAgent(
reason: "model_not_found",
provider,
model: modelId,
sessionId: params.sessionId,
lane: globalLane,
});
}
let runtimeModel = model;
@@ -779,6 +781,8 @@ export async function runEmbeddedPiAgent(
const overloadFailoverBackoffMs = resolveOverloadFailoverBackoffMs(params.config);
const overloadProfileRotationLimit = resolveOverloadProfileRotationLimit(params.config);
const rateLimitProfileRotationLimit = resolveRateLimitProfileRotationLimit(params.config);
let activeSessionId = params.sessionId;
let activeSessionFile = params.sessionFile;
const maybeEscalateRateLimitProfileFallback = (params: {
failoverProvider: string;
failoverModel: string;
@@ -800,6 +804,8 @@ export async function runEmbeddedPiAgent(
provider: params.failoverProvider,
model: params.failoverModel,
profileId: lastProfileId,
sessionId: activeSessionId,
lane: globalLane,
status,
},
);
@@ -857,8 +863,6 @@ export async function runEmbeddedPiAgent(
});
startupStages.mark("context-engine");
try {
let activeSessionId = params.sessionId;
let activeSessionFile = params.sessionFile;
const resolveActiveHookContext = () => ({
...hookCtx,
sessionId: activeSessionId,
@@ -1633,6 +1637,8 @@ export async function runEmbeddedPiAgent(
provider: activeErrorContext.provider,
model: activeErrorContext.model,
profileId: lastProfileId,
sessionId: sessionIdUsed,
lane: globalLane,
});
const promptErrorDetails = normalizedPromptFailover
? describeFailoverError(normalizedPromptFailover)
@@ -1824,6 +1830,8 @@ export async function runEmbeddedPiAgent(
provider,
model: modelId,
profileId: lastProfileId,
sessionId: sessionIdUsed,
lane: globalLane,
status,
})
);

View File

@@ -44,6 +44,7 @@ import { logVerbose } from "../../globals.js";
import { emitAgentEvent, registerAgentRunContext } from "../../infra/agent-events.js";
import { formatErrorMessage } from "../../infra/errors.js";
import { CommandLaneClearedError, GatewayDrainingError } from "../../process/command-queue.js";
import { CommandLane } from "../../process/lanes.js";
import { defaultRuntime } from "../../runtime.js";
import {
hasNonEmptyString,
@@ -1187,9 +1188,12 @@ export async function runAgentTurnWithFallback(params: {
: undefined;
const onToolResult = params.opts?.onToolResult;
const outcomePlan = buildAgentRuntimeOutcomePlan();
const runLane = CommandLane.Main;
const fallbackResult = await runWithModelFallback<EmbeddedAgentRunResult>({
...resolveModelFallbackOptions(effectiveRun, runtimeConfig),
runId,
sessionId: params.followupRun.run.sessionId,
lane: runLane,
classifyResult: async ({ result, provider, model }) => {
const classification = outcomePlan.classifyRunResult({
result,
@@ -1288,6 +1292,7 @@ export async function runAgentTurnWithFallback(params: {
thinkLevel: params.followupRun.run.thinkLevel,
timeoutMs: params.followupRun.run.timeoutMs,
runId,
lane: runLane,
extraSystemPrompt: params.followupRun.run.extraSystemPrompt,
sourceReplyDeliveryMode: params.followupRun.run.sourceReplyDeliveryMode,
silentReplyPromptMode: params.followupRun.run.silentReplyPromptMode,

View File

@@ -25,6 +25,7 @@ import { readSessionMessages } from "../../gateway/session-utils.fs.js";
import { logVerbose } from "../../globals.js";
import { registerAgentRunContext } from "../../infra/agent-events.js";
import { resolveMemoryFlushPlan } from "../../plugins/memory-state.js";
import { CommandLane } from "../../process/lanes.js";
import { normalizeOptionalString } from "../../shared/string-coerce.js";
import type { TemplateContext } from "../templating.js";
import type { VerboseLevel } from "../thinking.js";
@@ -817,6 +818,8 @@ export async function runMemoryFlushIfNeeded(params: {
params.cfg,
),
runId: flushRunId,
sessionId: activeSessionEntry?.sessionId ?? params.followupRun.run.sessionId,
lane: CommandLane.Main,
run: async (provider, model, runOptions) => {
const { embeddedContext, senderContext, runBaseParams } = buildEmbeddedRunExecutionParams({
run: params.followupRun.run,

View File

@@ -1,5 +1,6 @@
export { resolveEffectiveModelFallbacks } from "../../agents/agent-scope.js";
export { resolveBootstrapWarningSignaturesSeen } from "../../agents/bootstrap-budget.js";
export { resolveCronAgentLane } from "../../agents/lanes.js";
export { LiveSessionModelSwitchError } from "../../agents/live-model-switch-error.js";
export { runWithModelFallback } from "../../agents/model-fallback.js";
export { isCliProvider } from "../../agents/model-selection-cli.js";

View File

@@ -17,6 +17,7 @@ import {
normalizeVerboseLevel,
registerAgentRunContext,
resolveBootstrapWarningSignaturesSeen,
resolveCronAgentLane,
resolveSessionTranscriptPath,
runCliAgent,
runWithModelFallback,
@@ -123,6 +124,8 @@ export function createCronPromptExecutor(params: {
provider: params.liveSelection.provider,
model: params.liveSelection.model,
runId: params.cronSession.sessionEntry.sessionId,
sessionId: params.cronSession.sessionEntry.sessionId,
lane: resolveCronAgentLane(params.lane),
agentDir: params.agentDir,
fallbacksOverride: cronFallbacksOverride,
run: async (providerOverride, modelOverride, runOptions) => {
@@ -150,6 +153,7 @@ export function createCronPromptExecutor(params: {
thinkLevel: params.thinkLevel,
timeoutMs: params.timeoutMs,
runId: params.cronSession.sessionEntry.sessionId,
lane: resolveCronAgentLane(params.lane),
cliSessionId,
skillsSnapshot: params.skillsSnapshot,
messageChannel: params.messageChannel,
@@ -164,8 +168,7 @@ export function createCronPromptExecutor(params: {
);
return result;
}
const { resolveCronAgentLane, resolveFastModeState, runEmbeddedPiAgent } =
await loadCronEmbeddedRuntime();
const { resolveFastModeState, runEmbeddedPiAgent } = await loadCronEmbeddedRuntime();
const currentChannelId = await resolveCurrentChannelTarget({
channel: params.messageChannel,
to: params.resolvedDelivery.to,