From ab95812d65a63cb89e805b6879020772893397f5 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 28 Apr 2026 05:05:02 +0100 Subject: [PATCH] fix: record model fallback steps in trajectories --- CHANGELOG.md | 1 + docs/tools/trajectory.md | 1 + .../agent-command.live-model-switch.test.ts | 49 ++++++++++++ src/agents/agent-command.ts | 18 +++++ src/agents/model-fallback-observation.ts | 75 ++++++++++--------- src/agents/model-fallback.probe.test.ts | 29 ++++++- src/agents/model-fallback.ts | 39 +++++++--- 7 files changed, 167 insertions(+), 45 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f13c4e4bbdb..b02eed075c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,6 +44,7 @@ Docs: https://docs.openclaw.ai - Gateway/hooks: route non-delivered hook completion and error summaries to the target agent's main session instead of the default agent session, preserving multi-agent hook isolation. Fixes #24693; carries forward #68667. Thanks @abersonFAC and @bluesky6868. - Discord: own the Carbon interaction listener and hand off Discord slash/component handling asynchronously, so compaction or long session locks no longer trip `InteractionEventListener` listener timeouts. Fixes #73204. Thanks @slideshow-dingo. - Compaction/diagnostics: keep unknown compaction failure classifications stable while logging sanitized detail for unclassified provider errors such as missing Ollama provider adapters. Thanks @gzsiang. +- Models/fallbacks: record first-class `model.fallback_step` trajectory events with from/to models, failure detail, chain position, and final outcome so support exports preserve the primary model failure even when a later fallback also fails. Fixes #71744. Thanks @nikolaykazakovvs-ux. - Gateway/startup: keep value-option foreground starts on the gateway fast path and skip proxy bootstrap unless proxy env is configured, reducing normal gateway startup RSS and avoiding full CLI graph loading. Thanks @vincentkoc. - Heartbeat/models: show heartbeat model bleed guidance on context-overflow resets when the last runtime model matches configured `heartbeat.model`, so smaller local heartbeat models point users to `isolatedSession` or `lightContext` instead of only compaction-buffer tuning. Fixes #67314. Thanks @Knightmare6890. - Subagents/models: persist `sessions_spawn.model` and configured subagent models as child-session model overrides before the first turn, so spawned subagents actually run on the requested provider/model instead of reverting to the target agent default. Fixes #73180. Thanks @danielzinhu99. diff --git a/docs/tools/trajectory.md b/docs/tools/trajectory.md index c6678ae2c6a..a9a9e34ebb1 100644 --- a/docs/tools/trajectory.md +++ b/docs/tools/trajectory.md @@ -64,6 +64,7 @@ Runtime events include: - `trace.metadata` - `context.compiled` - `prompt.submitted` +- `model.fallback_step`, including the source model, next model, failure reason/detail, chain position, and whether fallback advanced, succeeded, or exhausted the chain - `model.completed` - `trace.artifacts` - `session.ended` diff --git a/src/agents/agent-command.live-model-switch.test.ts b/src/agents/agent-command.live-model-switch.test.ts index ec32f60824d..c6c89e7b9d7 100644 --- a/src/agents/agent-command.live-model-switch.test.ts +++ b/src/agents/agent-command.live-model-switch.test.ts @@ -19,6 +19,8 @@ const state = vi.hoisted(() => ({ clearAgentRunContextMock: vi.fn(), updateSessionStoreAfterAgentRunMock: vi.fn(), deliverAgentCommandResultMock: vi.fn(), + trajectoryRecordEventMock: vi.fn(), + trajectoryFlushMock: vi.fn(async () => undefined), clearSessionAuthProfileOverrideMock: vi.fn(), authProfileStoreMock: { profiles: {} } as { profiles: Record }, sessionEntryMock: undefined as unknown, @@ -249,6 +251,15 @@ vi.mock("../terminal/ansi.js", () => ({ sanitizeForLog: (s: string) => s, })); +vi.mock("../trajectory/runtime.js", () => ({ + createTrajectoryRuntimeRecorder: () => ({ + enabled: true, + filePath: "/tmp/session.trajectory.jsonl", + recordEvent: (...args: unknown[]) => state.trajectoryRecordEventMock(...args), + flush: () => state.trajectoryFlushMock(), + }), +})); + vi.mock("../utils/message-channel.js", () => ({ resolveMessageChannel: () => "test", })); @@ -356,6 +367,7 @@ type FallbackRunnerParams = { provider: string; model: string; run: (provider: string, model: string) => Promise; + onFallbackStep?: (step: Record) => void | Promise; classifyResult?: (params: { provider: string; model: string; @@ -462,6 +474,7 @@ describe("agentCommand – LiveSessionModelSwitchError retry", () => { state.sessionStoreMock = undefined; state.deliverAgentCommandResultMock.mockResolvedValue(undefined); state.updateSessionStoreAfterAgentRunMock.mockResolvedValue(undefined); + state.trajectoryFlushMock.mockResolvedValue(undefined); }); afterEach(() => { @@ -493,6 +506,42 @@ describe("agentCommand – LiveSessionModelSwitchError retry", () => { expect(lifecycleEndCalls.length).toBeGreaterThanOrEqual(1); }); + it("records fallback steps to the session trajectory runtime", async () => { + state.runWithModelFallbackMock.mockImplementation(async (params: FallbackRunnerParams) => { + await params.onFallbackStep?.({ + fallbackStepType: "fallback_step", + fallbackStepFromModel: "ollama/llama3", + fallbackStepToModel: "openai/gpt-5.4", + fallbackStepFromFailureReason: "overloaded", + fallbackStepChainPosition: 1, + fallbackStepFinalOutcome: "next_fallback", + }); + const result = await params.run(params.provider, params.model); + return { + result, + provider: params.provider, + model: params.model, + attempts: [], + }; + }); + state.runAgentAttemptMock.mockResolvedValue(makeSuccessResult("openai", "gpt-5.4")); + + await runBasicAgentCommand(); + + expect(state.trajectoryRecordEventMock).toHaveBeenCalledWith( + "model.fallback_step", + expect.objectContaining({ + fallbackStepType: "fallback_step", + fallbackStepFromModel: "ollama/llama3", + fallbackStepToModel: "openai/gpt-5.4", + fallbackStepFromFailureReason: "overloaded", + fallbackStepChainPosition: 1, + fallbackStepFinalOutcome: "next_fallback", + }), + ); + expect(state.trajectoryFlushMock).toHaveBeenCalled(); + }); + it("propagates non-switch errors without retrying and emits lifecycle error", async () => { state.runWithModelFallbackMock.mockRejectedValueOnce(new Error("provider down")); diff --git a/src/agents/agent-command.ts b/src/agents/agent-command.ts index eade2283e18..5cb8bfd9075 100644 --- a/src/agents/agent-command.ts +++ b/src/agents/agent-command.ts @@ -25,6 +25,7 @@ import { applyModelOverrideToSessionEntry } from "../sessions/model-overrides.js import { resolveSendPolicy } from "../sessions/send-policy.js"; import { normalizeOptionalString } from "../shared/string-coerce.js"; import { sanitizeForLog } from "../terminal/ansi.js"; +import { createTrajectoryRuntimeRecorder } from "../trajectory/runtime.js"; import { resolveMessageChannel } from "../utils/message-channel.js"; import { resolveAgentRuntimeConfig } from "./agent-runtime-config.js"; import { @@ -909,6 +910,16 @@ async function agentCommandInternal( let fallbackModel = model; const MAX_LIVE_SWITCH_RETRIES = 5; let liveSwitchRetries = 0; + const fallbackTrajectoryRecorder = createTrajectoryRuntimeRecorder({ + cfg, + runId, + sessionId, + sessionKey, + sessionFile, + provider, + modelId: model, + workspaceDir, + }); for (;;) { try { const spawnedBy = normalizedSpawned.spawnedBy ?? sessionEntry?.spawnedBy; @@ -928,6 +939,9 @@ async function agentCommandInternal( runId, agentDir, fallbacksOverride: effectiveFallbacksOverride, + onFallbackStep: (step) => { + fallbackTrajectoryRecorder?.recordEvent("model.fallback_step", step); + }, classifyResult: ({ provider, model, result }) => classifyEmbeddedPiRunResultForModelFallback({ provider, @@ -1024,6 +1038,7 @@ async function agentCommandInternal( }, }); } + await fallbackTrajectoryRecorder?.flush(); throw new Error( `Exceeded maximum live model switch retries (${MAX_LIVE_SWITCH_RETRIES})`, { cause: err }, @@ -1048,6 +1063,7 @@ async function agentCommandInternal( }, }); } + await fallbackTrajectoryRecorder?.flush(); throw new Error( `Live model switch rejected: ${sanitizeForLog(err.provider)}/${sanitizeForLog(err.model)} is not in the agent allowlist`, { cause: err }, @@ -1094,9 +1110,11 @@ async function agentCommandInternal( }, }); } + await fallbackTrajectoryRecorder?.flush(); throw err; } } + await fallbackTrajectoryRecorder?.flush(); // Update token+model fields in the session store. if (sessionStore && sessionKey) { diff --git a/src/agents/model-fallback-observation.ts b/src/agents/model-fallback-observation.ts index 6f5c203fe1a..ae6ea8619a5 100644 --- a/src/agents/model-fallback-observation.ts +++ b/src/agents/model-fallback-observation.ts @@ -29,6 +29,41 @@ function buildErrorObservationFields(error?: string): { type FallbackStepOutcome = "next_fallback" | "succeeded" | "chain_exhausted"; +export type ModelFallbackStepFields = { + fallbackStepType: "fallback_step"; + fallbackStepFromModel: string; + fallbackStepToModel?: string; + fallbackStepFromFailureReason?: FailoverReason; + fallbackStepFromFailureDetail?: string; + fallbackStepChainPosition?: number; + fallbackStepFinalOutcome: FallbackStepOutcome; +}; + +export type ModelFallbackDecisionParams = { + decision: + | "skip_candidate" + | "probe_cooldown_candidate" + | "candidate_failed" + | "candidate_succeeded"; + runId?: string; + requestedProvider: string; + requestedModel: string; + candidate: ModelCandidate; + attempt?: number; + total?: number; + reason?: FailoverReason | null; + status?: number; + code?: string; + error?: string; + nextCandidate?: ModelCandidate; + isPrimary?: boolean; + requestedModelMatched?: boolean; + fallbackConfigured?: boolean; + allowTransientCooldownProbe?: boolean; + profileCount?: number; + previousAttempts?: FallbackAttempt[]; +}; + function formatModelRef(candidate: ModelCandidate): string { return `${candidate.provider}/${candidate.model}`; } @@ -41,17 +76,7 @@ function buildFallbackStepFields(params: { nextCandidate?: ModelCandidate; attempt?: number; previousAttempts?: FallbackAttempt[]; -}): - | { - fallbackStepType: "fallback_step"; - fallbackStepFromModel: string; - fallbackStepToModel?: string; - fallbackStepFromFailureReason?: FailoverReason; - fallbackStepFromFailureDetail?: string; - fallbackStepChainPosition?: number; - fallbackStepFinalOutcome: FallbackStepOutcome; - } - | undefined { +}): ModelFallbackStepFields | undefined { const lastPreviousAttempt = params.previousAttempts?.at(-1); if (params.decision === "candidate_succeeded") { if (!lastPreviousAttempt) { @@ -89,30 +114,9 @@ function buildFallbackStepFields(params: { }; } -export function logModelFallbackDecision(params: { - decision: - | "skip_candidate" - | "probe_cooldown_candidate" - | "candidate_failed" - | "candidate_succeeded"; - runId?: string; - requestedProvider: string; - requestedModel: string; - candidate: ModelCandidate; - attempt?: number; - total?: number; - reason?: FailoverReason | null; - status?: number; - code?: string; - error?: string; - nextCandidate?: ModelCandidate; - isPrimary?: boolean; - requestedModelMatched?: boolean; - fallbackConfigured?: boolean; - allowTransientCooldownProbe?: boolean; - profileCount?: number; - previousAttempts?: FallbackAttempt[]; -}): void { +export function logModelFallbackDecision( + params: ModelFallbackDecisionParams, +): ModelFallbackStepFields | undefined { const nextText = params.nextCandidate ? `${sanitizeForLog(params.nextCandidate.provider)}/${sanitizeForLog(params.nextCandidate.model)}` : "none"; @@ -172,4 +176,5 @@ export function logModelFallbackDecision(params: { `model fallback decision: decision=${params.decision} requested=${sanitizeForLog(params.requestedProvider)}/${sanitizeForLog(params.requestedModel)} ` + `candidate=${sanitizeForLog(params.candidate.provider)}/${sanitizeForLog(params.candidate.model)} reason=${reasonText}${providerErrorTypeSuffix} next=${nextText}${detailSuffix}`, }); + return fallbackStepFields; } diff --git a/src/agents/model-fallback.probe.test.ts b/src/agents/model-fallback.probe.test.ts index 90b2a6fa05f..945ff05c932 100644 --- a/src/agents/model-fallback.probe.test.ts +++ b/src/agents/model-fallback.probe.test.ts @@ -306,8 +306,15 @@ describe("runWithModelFallback – probe logic", () => { .fn() .mockRejectedValueOnce(Object.assign(new Error("rate limited"), { status: 429 })) .mockResolvedValueOnce("fallback-ok"); + const onFallbackStep = vi.fn(); - const fallbackResult = await runPrimaryCandidate(fallbackCfg, fallbackRun); + const fallbackResult = await runWithModelFallback({ + cfg: fallbackCfg, + provider: "openai", + model: "gpt-4.1-mini", + run: fallbackRun, + onFallbackStep, + }); await logCapture.flush(); expect(fallbackResult.result).toBe("fallback-ok"); @@ -369,6 +376,26 @@ describe("runWithModelFallback – probe logic", () => { }), ]), ); + expect(onFallbackStep).toHaveBeenCalledWith( + expect.objectContaining({ + fallbackStepType: "fallback_step", + fallbackStepFromModel: "openai/gpt-4.1-mini", + fallbackStepToModel: "anthropic/claude-haiku-3-5", + fallbackStepFromFailureReason: "rate_limit", + fallbackStepChainPosition: 1, + fallbackStepFinalOutcome: "next_fallback", + }), + ); + expect(onFallbackStep).toHaveBeenCalledWith( + expect.objectContaining({ + fallbackStepType: "fallback_step", + fallbackStepFromModel: "openai/gpt-4.1-mini", + fallbackStepToModel: "anthropic/claude-haiku-3-5", + fallbackStepFromFailureReason: "rate_limit", + fallbackStepChainPosition: 2, + fallbackStepFinalOutcome: "succeeded", + }), + ); }); it("probes primary model when cooldown already expired", async () => { diff --git a/src/agents/model-fallback.ts b/src/agents/model-fallback.ts index ebc3fb7b22e..bf10e92d757 100644 --- a/src/agents/model-fallback.ts +++ b/src/agents/model-fallback.ts @@ -23,7 +23,11 @@ import { shouldUseTransientCooldownProbeSlot, } from "./failover-policy.js"; import { LiveSessionModelSwitchError } from "./live-model-switch-error.js"; -import { logModelFallbackDecision } from "./model-fallback-observation.js"; +import { + logModelFallbackDecision, + type ModelFallbackDecisionParams, + type ModelFallbackStepFields, +} from "./model-fallback-observation.js"; import type { FallbackAttempt, ModelCandidate } from "./model-fallback.types.js"; import { modelKey, normalizeModelRef } from "./model-selection-normalize.js"; import { @@ -133,6 +137,8 @@ type ModelFallbackErrorHandler = (attempt: { total: number; }) => void | Promise; +type ModelFallbackStepHandler = (step: ModelFallbackStepFields) => void | Promise; + export type ModelFallbackResultClassification = | { message: string; @@ -297,7 +303,7 @@ function recordFailedCandidateAttempt(params: { isPrimary: boolean; requestedModelMatched: boolean; fallbackConfigured: boolean; -}) { +}): ModelFallbackStepFields | undefined { const described = describeFailoverError(params.error); params.attempts.push({ provider: params.candidate.provider, @@ -307,7 +313,7 @@ function recordFailedCandidateAttempt(params: { status: described.status, code: described.code, }); - logModelFallbackDecision({ + return logModelFallbackDecision({ decision: "candidate_failed", runId: params.runId, requestedProvider: params.requestedProvider ?? params.candidate.provider, @@ -737,6 +743,7 @@ export async function runWithModelFallback(params: { fallbacksOverride?: string[]; run: ModelFallbackRunFn; onError?: ModelFallbackErrorHandler; + onFallbackStep?: ModelFallbackStepHandler; classifyResult?: ModelFallbackResultClassifier; }): Promise> { const candidates = resolveFallbackCandidates({ @@ -755,6 +762,20 @@ export async function runWithModelFallback(params: { const attempts: FallbackAttempt[] = []; let lastError: unknown; const cooldownProbeUsedProviders = new Set(); + const observeDecision = async (decision: ModelFallbackDecisionParams) => { + const fallbackStep = logModelFallbackDecision(decision); + if (fallbackStep) { + await params.onFallbackStep?.(fallbackStep); + } + }; + const observeFailedCandidate = async ( + failedAttempt: Parameters[0], + ) => { + const fallbackStep = recordFailedCandidateAttempt(failedAttempt); + if (fallbackStep) { + await params.onFallbackStep?.(fallbackStep); + } + }; const hasFallbackCandidates = candidates.length > 1; @@ -799,7 +820,7 @@ export async function runWithModelFallback(params: { error: decision.error, reason: decision.reason, }); - logModelFallbackDecision({ + await observeDecision({ decision: "skip_candidate", runId: params.runId, requestedProvider: params.provider, @@ -834,7 +855,7 @@ export async function runWithModelFallback(params: { error, reason: decision.reason, }); - logModelFallbackDecision({ + await observeDecision({ decision: "skip_candidate", runId: params.runId, requestedProvider: params.provider, @@ -858,7 +879,7 @@ export async function runWithModelFallback(params: { } } attemptedDuringCooldown = true; - logModelFallbackDecision({ + await observeDecision({ decision: "probe_cooldown_candidate", runId: params.runId, requestedProvider: params.provider, @@ -888,7 +909,7 @@ export async function runWithModelFallback(params: { }); if ("success" in attemptRun) { if (i > 0 || attempts.length > 0 || attemptedDuringCooldown) { - logModelFallbackDecision({ + await observeDecision({ decision: "candidate_succeeded", runId: params.runId, requestedProvider: params.provider, @@ -956,7 +977,7 @@ export async function runWithModelFallback(params: { model: candidate.model, }); lastError = switchNormalized; - recordFailedCandidateAttempt({ + await observeFailedCandidate({ attempts, candidate, error: switchNormalized, @@ -982,7 +1003,7 @@ export async function runWithModelFallback(params: { } lastError = isKnownFailover ? normalized : err; - recordFailedCandidateAttempt({ + await observeFailedCandidate({ attempts, candidate, error: normalized,