From ab95812d65a63cb89e805b6879020772893397f5 Mon Sep 17 00:00:00 2001
From: Peter Steinberger <steipete@gmail.com>
Date: Tue, 28 Apr 2026 05:05:02 +0100
Subject: [PATCH] fix: record model fallback steps in trajectories

---
 CHANGELOG.md                                  |  1 +
 docs/tools/trajectory.md                      |  1 +
 .../agent-command.live-model-switch.test.ts   | 49 ++++++++++++
 src/agents/agent-command.ts                   | 18 +++++
 src/agents/model-fallback-observation.ts      | 75 ++++++++++---------
 src/agents/model-fallback.probe.test.ts       | 29 ++++++-
 src/agents/model-fallback.ts                  | 39 +++++++---
 7 files changed, 167 insertions(+), 45 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f13c4e4bbdb..b02eed075c6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -44,6 +44,7 @@ Docs: https://docs.openclaw.ai
 - Gateway/hooks: route non-delivered hook completion and error summaries to the target agent's main session instead of the default agent session, preserving multi-agent hook isolation. Fixes #24693; carries forward #68667. Thanks @abersonFAC and @bluesky6868.
 - Discord: own the Carbon interaction listener and hand off Discord slash/component handling asynchronously, so compaction or long session locks no longer trip `InteractionEventListener` listener timeouts. Fixes #73204. Thanks @slideshow-dingo.
 - Compaction/diagnostics: keep unknown compaction failure classifications stable while logging sanitized detail for unclassified provider errors such as missing Ollama provider adapters. Thanks @gzsiang.
+- Models/fallbacks: record first-class `model.fallback_step` trajectory events with from/to models, failure detail, chain position, and final outcome so support exports preserve the primary model failure even when a later fallback also fails. Fixes #71744. Thanks @nikolaykazakovvs-ux.
 - Gateway/startup: keep value-option foreground starts on the gateway fast path and skip proxy bootstrap unless proxy env is configured, reducing normal gateway startup RSS and avoiding full CLI graph loading. Thanks @vincentkoc.
 - Heartbeat/models: show heartbeat model bleed guidance on context-overflow resets when the last runtime model matches configured `heartbeat.model`, so smaller local heartbeat models point users to `isolatedSession` or `lightContext` instead of only compaction-buffer tuning. Fixes #67314. Thanks @Knightmare6890.
 - Subagents/models: persist `sessions_spawn.model` and configured subagent models as child-session model overrides before the first turn, so spawned subagents actually run on the requested provider/model instead of reverting to the target agent default. Fixes #73180. Thanks @danielzinhu99.
diff --git a/docs/tools/trajectory.md b/docs/tools/trajectory.md
index c6678ae2c6a..a9a9e34ebb1 100644
--- a/docs/tools/trajectory.md
+++ b/docs/tools/trajectory.md
@@ -64,6 +64,7 @@ Runtime events include:
 - `trace.metadata`
 - `context.compiled`
 - `prompt.submitted`
+- `model.fallback_step`, including the source model, next model, failure reason/detail, chain position, and whether fallback advanced, succeeded, or exhausted the chain
 - `model.completed`
 - `trace.artifacts`
 - `session.ended`
diff --git a/src/agents/agent-command.live-model-switch.test.ts b/src/agents/agent-command.live-model-switch.test.ts
index ec32f60824d..c6c89e7b9d7 100644
--- a/src/agents/agent-command.live-model-switch.test.ts
+++ b/src/agents/agent-command.live-model-switch.test.ts
@@ -19,6 +19,8 @@ const state = vi.hoisted(() => ({
   clearAgentRunContextMock: vi.fn(),
   updateSessionStoreAfterAgentRunMock: vi.fn(),
   deliverAgentCommandResultMock: vi.fn(),
+  trajectoryRecordEventMock: vi.fn(),
+  trajectoryFlushMock: vi.fn(async () => undefined),
   clearSessionAuthProfileOverrideMock: vi.fn(),
   authProfileStoreMock: { profiles: {} } as { profiles: Record<string, unknown> },
   sessionEntryMock: undefined as unknown,
@@ -249,6 +251,15 @@ vi.mock("../terminal/ansi.js", () => ({
   sanitizeForLog: (s: string) => s,
 }));
 
+vi.mock("../trajectory/runtime.js", () => ({
+  createTrajectoryRuntimeRecorder: () => ({
+    enabled: true,
+    filePath: "/tmp/session.trajectory.jsonl",
+    recordEvent: (...args: unknown[]) => state.trajectoryRecordEventMock(...args),
+    flush: () => state.trajectoryFlushMock(),
+  }),
+}));
+
 vi.mock("../utils/message-channel.js", () => ({
   resolveMessageChannel: () => "test",
 }));
@@ -356,6 +367,7 @@ type FallbackRunnerParams = {
   provider: string;
   model: string;
   run: (provider: string, model: string) => Promise<unknown>;
+  onFallbackStep?: (step: Record<string, unknown>) => void | Promise<void>;
   classifyResult?: (params: {
     provider: string;
     model: string;
@@ -462,6 +474,7 @@ describe("agentCommand – LiveSessionModelSwitchError retry", () => {
     state.sessionStoreMock = undefined;
     state.deliverAgentCommandResultMock.mockResolvedValue(undefined);
     state.updateSessionStoreAfterAgentRunMock.mockResolvedValue(undefined);
+    state.trajectoryFlushMock.mockResolvedValue(undefined);
   });
 
   afterEach(() => {
@@ -493,6 +506,42 @@ describe("agentCommand – LiveSessionModelSwitchError retry", () => {
     expect(lifecycleEndCalls.length).toBeGreaterThanOrEqual(1);
   });
 
+  it("records fallback steps to the session trajectory runtime", async () => {
+    state.runWithModelFallbackMock.mockImplementation(async (params: FallbackRunnerParams) => {
+      await params.onFallbackStep?.({
+        fallbackStepType: "fallback_step",
+        fallbackStepFromModel: "ollama/llama3",
+        fallbackStepToModel: "openai/gpt-5.4",
+        fallbackStepFromFailureReason: "overloaded",
+        fallbackStepChainPosition: 1,
+        fallbackStepFinalOutcome: "next_fallback",
+      });
+      const result = await params.run(params.provider, params.model);
+      return {
+        result,
+        provider: params.provider,
+        model: params.model,
+        attempts: [],
+      };
+    });
+    state.runAgentAttemptMock.mockResolvedValue(makeSuccessResult("openai", "gpt-5.4"));
+
+    await runBasicAgentCommand();
+
+    expect(state.trajectoryRecordEventMock).toHaveBeenCalledWith(
+      "model.fallback_step",
+      expect.objectContaining({
+        fallbackStepType: "fallback_step",
+        fallbackStepFromModel: "ollama/llama3",
+        fallbackStepToModel: "openai/gpt-5.4",
+        fallbackStepFromFailureReason: "overloaded",
+        fallbackStepChainPosition: 1,
+        fallbackStepFinalOutcome: "next_fallback",
+      }),
+    );
+    expect(state.trajectoryFlushMock).toHaveBeenCalled();
+  });
+
   it("propagates non-switch errors without retrying and emits lifecycle error", async () => {
     state.runWithModelFallbackMock.mockRejectedValueOnce(new Error("provider down"));
 
diff --git a/src/agents/agent-command.ts b/src/agents/agent-command.ts
index eade2283e18..5cb8bfd9075 100644
--- a/src/agents/agent-command.ts
+++ b/src/agents/agent-command.ts
@@ -25,6 +25,7 @@ import { applyModelOverrideToSessionEntry } from "../sessions/model-overrides.js
 import { resolveSendPolicy } from "../sessions/send-policy.js";
 import { normalizeOptionalString } from "../shared/string-coerce.js";
 import { sanitizeForLog } from "../terminal/ansi.js";
+import { createTrajectoryRuntimeRecorder } from "../trajectory/runtime.js";
 import { resolveMessageChannel } from "../utils/message-channel.js";
 import { resolveAgentRuntimeConfig } from "./agent-runtime-config.js";
 import {
@@ -909,6 +910,16 @@ async function agentCommandInternal(
     let fallbackModel = model;
     const MAX_LIVE_SWITCH_RETRIES = 5;
     let liveSwitchRetries = 0;
+    const fallbackTrajectoryRecorder = createTrajectoryRuntimeRecorder({
+      cfg,
+      runId,
+      sessionId,
+      sessionKey,
+      sessionFile,
+      provider,
+      modelId: model,
+      workspaceDir,
+    });
     for (;;) {
       try {
         const spawnedBy = normalizedSpawned.spawnedBy ?? sessionEntry?.spawnedBy;
@@ -928,6 +939,9 @@ async function agentCommandInternal(
           runId,
           agentDir,
           fallbacksOverride: effectiveFallbacksOverride,
+          onFallbackStep: (step) => {
+            fallbackTrajectoryRecorder?.recordEvent("model.fallback_step", step);
+          },
           classifyResult: ({ provider, model, result }) =>
             classifyEmbeddedPiRunResultForModelFallback({
               provider,
@@ -1024,6 +1038,7 @@ async function agentCommandInternal(
                 },
               });
             }
+            await fallbackTrajectoryRecorder?.flush();
             throw new Error(
               `Exceeded maximum live model switch retries (${MAX_LIVE_SWITCH_RETRIES})`,
               { cause: err },
@@ -1048,6 +1063,7 @@ async function agentCommandInternal(
                 },
               });
             }
+            await fallbackTrajectoryRecorder?.flush();
             throw new Error(
               `Live model switch rejected: ${sanitizeForLog(err.provider)}/${sanitizeForLog(err.model)} is not in the agent allowlist`,
               { cause: err },
@@ -1094,9 +1110,11 @@ async function agentCommandInternal(
             },
           });
         }
+        await fallbackTrajectoryRecorder?.flush();
         throw err;
       }
     }
+    await fallbackTrajectoryRecorder?.flush();
 
     // Update token+model fields in the session store.
     if (sessionStore && sessionKey) {
diff --git a/src/agents/model-fallback-observation.ts b/src/agents/model-fallback-observation.ts
index 6f5c203fe1a..ae6ea8619a5 100644
--- a/src/agents/model-fallback-observation.ts
+++ b/src/agents/model-fallback-observation.ts
@@ -29,6 +29,41 @@ function buildErrorObservationFields(error?: string): {
 
 type FallbackStepOutcome = "next_fallback" | "succeeded" | "chain_exhausted";
 
+export type ModelFallbackStepFields = {
+  fallbackStepType: "fallback_step";
+  fallbackStepFromModel: string;
+  fallbackStepToModel?: string;
+  fallbackStepFromFailureReason?: FailoverReason;
+  fallbackStepFromFailureDetail?: string;
+  fallbackStepChainPosition?: number;
+  fallbackStepFinalOutcome: FallbackStepOutcome;
+};
+
+export type ModelFallbackDecisionParams = {
+  decision:
+    | "skip_candidate"
+    | "probe_cooldown_candidate"
+    | "candidate_failed"
+    | "candidate_succeeded";
+  runId?: string;
+  requestedProvider: string;
+  requestedModel: string;
+  candidate: ModelCandidate;
+  attempt?: number;
+  total?: number;
+  reason?: FailoverReason | null;
+  status?: number;
+  code?: string;
+  error?: string;
+  nextCandidate?: ModelCandidate;
+  isPrimary?: boolean;
+  requestedModelMatched?: boolean;
+  fallbackConfigured?: boolean;
+  allowTransientCooldownProbe?: boolean;
+  profileCount?: number;
+  previousAttempts?: FallbackAttempt[];
+};
+
 function formatModelRef(candidate: ModelCandidate): string {
   return `${candidate.provider}/${candidate.model}`;
 }
@@ -41,17 +76,7 @@ function buildFallbackStepFields(params: {
   nextCandidate?: ModelCandidate;
   attempt?: number;
   previousAttempts?: FallbackAttempt[];
-}):
-  | {
-      fallbackStepType: "fallback_step";
-      fallbackStepFromModel: string;
-      fallbackStepToModel?: string;
-      fallbackStepFromFailureReason?: FailoverReason;
-      fallbackStepFromFailureDetail?: string;
-      fallbackStepChainPosition?: number;
-      fallbackStepFinalOutcome: FallbackStepOutcome;
-    }
-  | undefined {
+}): ModelFallbackStepFields | undefined {
   const lastPreviousAttempt = params.previousAttempts?.at(-1);
   if (params.decision === "candidate_succeeded") {
     if (!lastPreviousAttempt) {
@@ -89,30 +114,9 @@ function buildFallbackStepFields(params: {
   };
 }
 
-export function logModelFallbackDecision(params: {
-  decision:
-    | "skip_candidate"
-    | "probe_cooldown_candidate"
-    | "candidate_failed"
-    | "candidate_succeeded";
-  runId?: string;
-  requestedProvider: string;
-  requestedModel: string;
-  candidate: ModelCandidate;
-  attempt?: number;
-  total?: number;
-  reason?: FailoverReason | null;
-  status?: number;
-  code?: string;
-  error?: string;
-  nextCandidate?: ModelCandidate;
-  isPrimary?: boolean;
-  requestedModelMatched?: boolean;
-  fallbackConfigured?: boolean;
-  allowTransientCooldownProbe?: boolean;
-  profileCount?: number;
-  previousAttempts?: FallbackAttempt[];
-}): void {
+export function logModelFallbackDecision(
+  params: ModelFallbackDecisionParams,
+): ModelFallbackStepFields | undefined {
   const nextText = params.nextCandidate
     ? `${sanitizeForLog(params.nextCandidate.provider)}/${sanitizeForLog(params.nextCandidate.model)}`
     : "none";
@@ -172,4 +176,5 @@ export function logModelFallbackDecision(params: {
       `model fallback decision: decision=${params.decision} requested=${sanitizeForLog(params.requestedProvider)}/${sanitizeForLog(params.requestedModel)} ` +
       `candidate=${sanitizeForLog(params.candidate.provider)}/${sanitizeForLog(params.candidate.model)} reason=${reasonText}${providerErrorTypeSuffix} next=${nextText}${detailSuffix}`,
   });
+  return fallbackStepFields;
 }
diff --git a/src/agents/model-fallback.probe.test.ts b/src/agents/model-fallback.probe.test.ts
index 90b2a6fa05f..945ff05c932 100644
--- a/src/agents/model-fallback.probe.test.ts
+++ b/src/agents/model-fallback.probe.test.ts
@@ -306,8 +306,15 @@ describe("runWithModelFallback – probe logic", () => {
       .fn()
       .mockRejectedValueOnce(Object.assign(new Error("rate limited"), { status: 429 }))
       .mockResolvedValueOnce("fallback-ok");
+    const onFallbackStep = vi.fn();
 
-    const fallbackResult = await runPrimaryCandidate(fallbackCfg, fallbackRun);
+    const fallbackResult = await runWithModelFallback({
+      cfg: fallbackCfg,
+      provider: "openai",
+      model: "gpt-4.1-mini",
+      run: fallbackRun,
+      onFallbackStep,
+    });
     await logCapture.flush();
 
     expect(fallbackResult.result).toBe("fallback-ok");
@@ -369,6 +376,26 @@ describe("runWithModelFallback – probe logic", () => {
         }),
       ]),
     );
+    expect(onFallbackStep).toHaveBeenCalledWith(
+      expect.objectContaining({
+        fallbackStepType: "fallback_step",
+        fallbackStepFromModel: "openai/gpt-4.1-mini",
+        fallbackStepToModel: "anthropic/claude-haiku-3-5",
+        fallbackStepFromFailureReason: "rate_limit",
+        fallbackStepChainPosition: 1,
+        fallbackStepFinalOutcome: "next_fallback",
+      }),
+    );
+    expect(onFallbackStep).toHaveBeenCalledWith(
+      expect.objectContaining({
+        fallbackStepType: "fallback_step",
+        fallbackStepFromModel: "openai/gpt-4.1-mini",
+        fallbackStepToModel: "anthropic/claude-haiku-3-5",
+        fallbackStepFromFailureReason: "rate_limit",
+        fallbackStepChainPosition: 2,
+        fallbackStepFinalOutcome: "succeeded",
+      }),
+    );
   });
 
   it("probes primary model when cooldown already expired", async () => {
diff --git a/src/agents/model-fallback.ts b/src/agents/model-fallback.ts
index ebc3fb7b22e..bf10e92d757 100644
--- a/src/agents/model-fallback.ts
+++ b/src/agents/model-fallback.ts
@@ -23,7 +23,11 @@ import {
   shouldUseTransientCooldownProbeSlot,
 } from "./failover-policy.js";
 import { LiveSessionModelSwitchError } from "./live-model-switch-error.js";
-import { logModelFallbackDecision } from "./model-fallback-observation.js";
+import {
+  logModelFallbackDecision,
+  type ModelFallbackDecisionParams,
+  type ModelFallbackStepFields,
+} from "./model-fallback-observation.js";
 import type { FallbackAttempt, ModelCandidate } from "./model-fallback.types.js";
 import { modelKey, normalizeModelRef } from "./model-selection-normalize.js";
 import {
@@ -133,6 +137,8 @@ type ModelFallbackErrorHandler = (attempt: {
   total: number;
 }) => void | Promise<void>;
 
+type ModelFallbackStepHandler = (step: ModelFallbackStepFields) => void | Promise<void>;
+
 export type ModelFallbackResultClassification =
   | {
       message: string;
@@ -297,7 +303,7 @@ function recordFailedCandidateAttempt(params: {
   isPrimary: boolean;
   requestedModelMatched: boolean;
   fallbackConfigured: boolean;
-}) {
+}): ModelFallbackStepFields | undefined {
   const described = describeFailoverError(params.error);
   params.attempts.push({
     provider: params.candidate.provider,
@@ -307,7 +313,7 @@ function recordFailedCandidateAttempt(params: {
     status: described.status,
     code: described.code,
   });
-  logModelFallbackDecision({
+  return logModelFallbackDecision({
     decision: "candidate_failed",
     runId: params.runId,
     requestedProvider: params.requestedProvider ?? params.candidate.provider,
@@ -737,6 +743,7 @@ export async function runWithModelFallback<T>(params: {
   fallbacksOverride?: string[];
   run: ModelFallbackRunFn<T>;
   onError?: ModelFallbackErrorHandler;
+  onFallbackStep?: ModelFallbackStepHandler;
   classifyResult?: ModelFallbackResultClassifier<T>;
 }): Promise<ModelFallbackRunResult<T>> {
   const candidates = resolveFallbackCandidates({
@@ -755,6 +762,20 @@ export async function runWithModelFallback<T>(params: {
   const attempts: FallbackAttempt[] = [];
   let lastError: unknown;
   const cooldownProbeUsedProviders = new Set<string>();
+  const observeDecision = async (decision: ModelFallbackDecisionParams) => {
+    const fallbackStep = logModelFallbackDecision(decision);
+    if (fallbackStep) {
+      await params.onFallbackStep?.(fallbackStep);
+    }
+  };
+  const observeFailedCandidate = async (
+    failedAttempt: Parameters<typeof recordFailedCandidateAttempt>[0],
+  ) => {
+    const fallbackStep = recordFailedCandidateAttempt(failedAttempt);
+    if (fallbackStep) {
+      await params.onFallbackStep?.(fallbackStep);
+    }
+  };
 
   const hasFallbackCandidates = candidates.length > 1;
 
@@ -799,7 +820,7 @@ export async function runWithModelFallback<T>(params: {
             error: decision.error,
             reason: decision.reason,
           });
-          logModelFallbackDecision({
+          await observeDecision({
             decision: "skip_candidate",
             runId: params.runId,
             requestedProvider: params.provider,
@@ -834,7 +855,7 @@ export async function runWithModelFallback<T>(params: {
               error,
               reason: decision.reason,
             });
-            logModelFallbackDecision({
+            await observeDecision({
               decision: "skip_candidate",
               runId: params.runId,
               requestedProvider: params.provider,
@@ -858,7 +879,7 @@ export async function runWithModelFallback<T>(params: {
           }
         }
         attemptedDuringCooldown = true;
-        logModelFallbackDecision({
+        await observeDecision({
           decision: "probe_cooldown_candidate",
           runId: params.runId,
           requestedProvider: params.provider,
@@ -888,7 +909,7 @@ export async function runWithModelFallback<T>(params: {
     });
     if ("success" in attemptRun) {
       if (i > 0 || attempts.length > 0 || attemptedDuringCooldown) {
-        logModelFallbackDecision({
+        await observeDecision({
           decision: "candidate_succeeded",
           runId: params.runId,
           requestedProvider: params.provider,
@@ -956,7 +977,7 @@ export async function runWithModelFallback<T>(params: {
           model: candidate.model,
         });
         lastError = switchNormalized;
-        recordFailedCandidateAttempt({
+        await observeFailedCandidate({
           attempts,
           candidate,
           error: switchNormalized,
@@ -982,7 +1003,7 @@ export async function runWithModelFallback<T>(params: {
       }
 
       lastError = isKnownFailover ? normalized : err;
-      recordFailedCandidateAttempt({
+      await observeFailedCandidate({
         attempts,
         candidate,
         error: normalized,