fix: record model fallback steps in trajectories

This commit is contained in:
Peter Steinberger
2026-04-28 05:05:02 +01:00
parent 714f3b59cc
commit ab95812d65
7 changed files with 167 additions and 45 deletions

View File

@@ -44,6 +44,7 @@ Docs: https://docs.openclaw.ai
- Gateway/hooks: route non-delivered hook completion and error summaries to the target agent's main session instead of the default agent session, preserving multi-agent hook isolation. Fixes #24693; carries forward #68667. Thanks @abersonFAC and @bluesky6868.
- Discord: own the Carbon interaction listener and hand off Discord slash/component handling asynchronously, so compaction or long session locks no longer trip `InteractionEventListener` listener timeouts. Fixes #73204. Thanks @slideshow-dingo.
- Compaction/diagnostics: keep unknown compaction failure classifications stable while logging sanitized detail for unclassified provider errors such as missing Ollama provider adapters. Thanks @gzsiang.
- Models/fallbacks: record first-class `model.fallback_step` trajectory events with from/to models, failure detail, chain position, and final outcome so support exports preserve the primary model failure even when a later fallback also fails. Fixes #71744. Thanks @nikolaykazakovvs-ux.
- Gateway/startup: keep value-option foreground starts on the gateway fast path and skip proxy bootstrap unless proxy env is configured, reducing normal gateway startup RSS and avoiding full CLI graph loading. Thanks @vincentkoc.
- Heartbeat/models: show heartbeat model bleed guidance on context-overflow resets when the last runtime model matches configured `heartbeat.model`, so smaller local heartbeat models point users to `isolatedSession` or `lightContext` instead of only compaction-buffer tuning. Fixes #67314. Thanks @Knightmare6890.
- Subagents/models: persist `sessions_spawn.model` and configured subagent models as child-session model overrides before the first turn, so spawned subagents actually run on the requested provider/model instead of reverting to the target agent default. Fixes #73180. Thanks @danielzinhu99.

View File

@@ -64,6 +64,7 @@ Runtime events include:
- `trace.metadata`
- `context.compiled`
- `prompt.submitted`
- `model.fallback_step`, including the source model, next model, failure reason/detail, chain position, and whether fallback advanced, succeeded, or exhausted the chain
- `model.completed`
- `trace.artifacts`
- `session.ended`

View File

@@ -19,6 +19,8 @@ const state = vi.hoisted(() => ({
clearAgentRunContextMock: vi.fn(),
updateSessionStoreAfterAgentRunMock: vi.fn(),
deliverAgentCommandResultMock: vi.fn(),
trajectoryRecordEventMock: vi.fn(),
trajectoryFlushMock: vi.fn(async () => undefined),
clearSessionAuthProfileOverrideMock: vi.fn(),
authProfileStoreMock: { profiles: {} } as { profiles: Record<string, unknown> },
sessionEntryMock: undefined as unknown,
@@ -249,6 +251,15 @@ vi.mock("../terminal/ansi.js", () => ({
sanitizeForLog: (s: string) => s,
}));
// Stub the trajectory runtime so tests can observe recordEvent/flush calls
// via the hoisted state mocks instead of writing a real JSONL file.
vi.mock("../trajectory/runtime.js", () => ({
createTrajectoryRuntimeRecorder: () => ({
enabled: true,
filePath: "/tmp/session.trajectory.jsonl",
recordEvent: (...args: unknown[]) => state.trajectoryRecordEventMock(...args),
flush: () => state.trajectoryFlushMock(),
}),
}));
vi.mock("../utils/message-channel.js", () => ({
resolveMessageChannel: () => "test",
}));
@@ -356,6 +367,7 @@ type FallbackRunnerParams = {
provider: string;
model: string;
run: (provider: string, model: string) => Promise<unknown>;
onFallbackStep?: (step: Record<string, unknown>) => void | Promise<void>;
classifyResult?: (params: {
provider: string;
model: string;
@@ -462,6 +474,7 @@ describe("agentCommand LiveSessionModelSwitchError retry", () => {
state.sessionStoreMock = undefined;
state.deliverAgentCommandResultMock.mockResolvedValue(undefined);
state.updateSessionStoreAfterAgentRunMock.mockResolvedValue(undefined);
state.trajectoryFlushMock.mockResolvedValue(undefined);
});
afterEach(() => {
@@ -493,6 +506,42 @@ describe("agentCommand LiveSessionModelSwitchError retry", () => {
expect(lifecycleEndCalls.length).toBeGreaterThanOrEqual(1);
});
// Verifies that a fallback step reported by the fallback runner is forwarded
// to the session trajectory recorder as a "model.fallback_step" event, and
// that the recorder is flushed before the command completes.
it("records fallback steps to the session trajectory runtime", async () => {
// Simulate the runner emitting one fallback step before the run succeeds
// on the fallback model.
state.runWithModelFallbackMock.mockImplementation(async (params: FallbackRunnerParams) => {
await params.onFallbackStep?.({
fallbackStepType: "fallback_step",
fallbackStepFromModel: "ollama/llama3",
fallbackStepToModel: "openai/gpt-5.4",
fallbackStepFromFailureReason: "overloaded",
fallbackStepChainPosition: 1,
fallbackStepFinalOutcome: "next_fallback",
});
const result = await params.run(params.provider, params.model);
return {
result,
provider: params.provider,
model: params.model,
attempts: [],
};
});
state.runAgentAttemptMock.mockResolvedValue(makeSuccessResult("openai", "gpt-5.4"));
await runBasicAgentCommand();
// The step payload must reach the trajectory recorder unchanged.
expect(state.trajectoryRecordEventMock).toHaveBeenCalledWith(
"model.fallback_step",
expect.objectContaining({
fallbackStepType: "fallback_step",
fallbackStepFromModel: "ollama/llama3",
fallbackStepToModel: "openai/gpt-5.4",
fallbackStepFromFailureReason: "overloaded",
fallbackStepChainPosition: 1,
fallbackStepFinalOutcome: "next_fallback",
}),
);
// Flushing guarantees the event is persisted before the command returns.
expect(state.trajectoryFlushMock).toHaveBeenCalled();
});
it("propagates non-switch errors without retrying and emits lifecycle error", async () => {
state.runWithModelFallbackMock.mockRejectedValueOnce(new Error("provider down"));

View File

@@ -25,6 +25,7 @@ import { applyModelOverrideToSessionEntry } from "../sessions/model-overrides.js
import { resolveSendPolicy } from "../sessions/send-policy.js";
import { normalizeOptionalString } from "../shared/string-coerce.js";
import { sanitizeForLog } from "../terminal/ansi.js";
import { createTrajectoryRuntimeRecorder } from "../trajectory/runtime.js";
import { resolveMessageChannel } from "../utils/message-channel.js";
import { resolveAgentRuntimeConfig } from "./agent-runtime-config.js";
import {
@@ -909,6 +910,16 @@ async function agentCommandInternal(
let fallbackModel = model;
const MAX_LIVE_SWITCH_RETRIES = 5;
let liveSwitchRetries = 0;
const fallbackTrajectoryRecorder = createTrajectoryRuntimeRecorder({
cfg,
runId,
sessionId,
sessionKey,
sessionFile,
provider,
modelId: model,
workspaceDir,
});
for (;;) {
try {
const spawnedBy = normalizedSpawned.spawnedBy ?? sessionEntry?.spawnedBy;
@@ -928,6 +939,9 @@ async function agentCommandInternal(
runId,
agentDir,
fallbacksOverride: effectiveFallbacksOverride,
onFallbackStep: (step) => {
fallbackTrajectoryRecorder?.recordEvent("model.fallback_step", step);
},
classifyResult: ({ provider, model, result }) =>
classifyEmbeddedPiRunResultForModelFallback({
provider,
@@ -1024,6 +1038,7 @@ async function agentCommandInternal(
},
});
}
await fallbackTrajectoryRecorder?.flush();
throw new Error(
`Exceeded maximum live model switch retries (${MAX_LIVE_SWITCH_RETRIES})`,
{ cause: err },
@@ -1048,6 +1063,7 @@ async function agentCommandInternal(
},
});
}
await fallbackTrajectoryRecorder?.flush();
throw new Error(
`Live model switch rejected: ${sanitizeForLog(err.provider)}/${sanitizeForLog(err.model)} is not in the agent allowlist`,
{ cause: err },
@@ -1094,9 +1110,11 @@ async function agentCommandInternal(
},
});
}
await fallbackTrajectoryRecorder?.flush();
throw err;
}
}
await fallbackTrajectoryRecorder?.flush();
// Update token+model fields in the session store.
if (sessionStore && sessionKey) {

View File

@@ -29,6 +29,41 @@ function buildErrorObservationFields(error?: string): {
type FallbackStepOutcome = "next_fallback" | "succeeded" | "chain_exhausted";
// Payload describing one step of a model-fallback chain; used as the body of
// "model.fallback_step" trajectory events.
export type ModelFallbackStepFields = {
// Discriminator; always the literal "fallback_step".
fallbackStepType: "fallback_step";
// "provider/model" reference for the model that failed (or succeeded).
fallbackStepFromModel: string;
// "provider/model" reference for the next candidate, when one exists.
fallbackStepToModel?: string;
// Classified failure reason carried over from the failing attempt.
fallbackStepFromFailureReason?: FailoverReason;
// Free-form detail about the failure, when available.
fallbackStepFromFailureDetail?: string;
// 1-based position of this step within the fallback chain.
fallbackStepChainPosition?: number;
// Whether fallback advanced, succeeded, or exhausted the chain.
fallbackStepFinalOutcome: FallbackStepOutcome;
};
// Input to logModelFallbackDecision: one observed decision point in the
// fallback loop, plus enough context to log it and (when applicable) derive
// ModelFallbackStepFields from it.
export type ModelFallbackDecisionParams = {
// What the fallback loop decided for this candidate.
decision:
| "skip_candidate"
| "probe_cooldown_candidate"
| "candidate_failed"
| "candidate_succeeded";
runId?: string;
// The provider/model originally requested by the caller.
requestedProvider: string;
requestedModel: string;
// The candidate this decision applies to.
candidate: ModelCandidate;
// Attempt counters, when known.
attempt?: number;
total?: number;
// Failure classification and raw error details for failed candidates.
reason?: FailoverReason | null;
status?: number;
code?: string;
error?: string;
// The candidate the loop will try next, if any.
nextCandidate?: ModelCandidate;
isPrimary?: boolean;
requestedModelMatched?: boolean;
fallbackConfigured?: boolean;
allowTransientCooldownProbe?: boolean;
profileCount?: number;
// Attempts recorded before this decision; used to derive step fields.
previousAttempts?: FallbackAttempt[];
};
// Render a candidate as the conventional "provider/model" reference string.
function formatModelRef(candidate: ModelCandidate): string {
  const { provider, model } = candidate;
  return provider + "/" + model;
}
@@ -41,17 +76,7 @@ function buildFallbackStepFields(params: {
nextCandidate?: ModelCandidate;
attempt?: number;
previousAttempts?: FallbackAttempt[];
}):
| {
fallbackStepType: "fallback_step";
fallbackStepFromModel: string;
fallbackStepToModel?: string;
fallbackStepFromFailureReason?: FailoverReason;
fallbackStepFromFailureDetail?: string;
fallbackStepChainPosition?: number;
fallbackStepFinalOutcome: FallbackStepOutcome;
}
| undefined {
}): ModelFallbackStepFields | undefined {
const lastPreviousAttempt = params.previousAttempts?.at(-1);
if (params.decision === "candidate_succeeded") {
if (!lastPreviousAttempt) {
@@ -89,30 +114,9 @@ function buildFallbackStepFields(params: {
};
}
export function logModelFallbackDecision(params: {
decision:
| "skip_candidate"
| "probe_cooldown_candidate"
| "candidate_failed"
| "candidate_succeeded";
runId?: string;
requestedProvider: string;
requestedModel: string;
candidate: ModelCandidate;
attempt?: number;
total?: number;
reason?: FailoverReason | null;
status?: number;
code?: string;
error?: string;
nextCandidate?: ModelCandidate;
isPrimary?: boolean;
requestedModelMatched?: boolean;
fallbackConfigured?: boolean;
allowTransientCooldownProbe?: boolean;
profileCount?: number;
previousAttempts?: FallbackAttempt[];
}): void {
export function logModelFallbackDecision(
params: ModelFallbackDecisionParams,
): ModelFallbackStepFields | undefined {
const nextText = params.nextCandidate
? `${sanitizeForLog(params.nextCandidate.provider)}/${sanitizeForLog(params.nextCandidate.model)}`
: "none";
@@ -172,4 +176,5 @@ export function logModelFallbackDecision(params: {
`model fallback decision: decision=${params.decision} requested=${sanitizeForLog(params.requestedProvider)}/${sanitizeForLog(params.requestedModel)} ` +
`candidate=${sanitizeForLog(params.candidate.provider)}/${sanitizeForLog(params.candidate.model)} reason=${reasonText}${providerErrorTypeSuffix} next=${nextText}${detailSuffix}`,
});
return fallbackStepFields;
}

View File

@@ -306,8 +306,15 @@ describe("runWithModelFallback probe logic", () => {
.fn()
.mockRejectedValueOnce(Object.assign(new Error("rate limited"), { status: 429 }))
.mockResolvedValueOnce("fallback-ok");
const onFallbackStep = vi.fn();
const fallbackResult = await runPrimaryCandidate(fallbackCfg, fallbackRun);
const fallbackResult = await runWithModelFallback({
cfg: fallbackCfg,
provider: "openai",
model: "gpt-4.1-mini",
run: fallbackRun,
onFallbackStep,
});
await logCapture.flush();
expect(fallbackResult.result).toBe("fallback-ok");
@@ -369,6 +376,26 @@ describe("runWithModelFallback probe logic", () => {
}),
]),
);
expect(onFallbackStep).toHaveBeenCalledWith(
expect.objectContaining({
fallbackStepType: "fallback_step",
fallbackStepFromModel: "openai/gpt-4.1-mini",
fallbackStepToModel: "anthropic/claude-haiku-3-5",
fallbackStepFromFailureReason: "rate_limit",
fallbackStepChainPosition: 1,
fallbackStepFinalOutcome: "next_fallback",
}),
);
expect(onFallbackStep).toHaveBeenCalledWith(
expect.objectContaining({
fallbackStepType: "fallback_step",
fallbackStepFromModel: "openai/gpt-4.1-mini",
fallbackStepToModel: "anthropic/claude-haiku-3-5",
fallbackStepFromFailureReason: "rate_limit",
fallbackStepChainPosition: 2,
fallbackStepFinalOutcome: "succeeded",
}),
);
});
it("probes primary model when cooldown already expired", async () => {

View File

@@ -23,7 +23,11 @@ import {
shouldUseTransientCooldownProbeSlot,
} from "./failover-policy.js";
import { LiveSessionModelSwitchError } from "./live-model-switch-error.js";
import { logModelFallbackDecision } from "./model-fallback-observation.js";
import {
logModelFallbackDecision,
type ModelFallbackDecisionParams,
type ModelFallbackStepFields,
} from "./model-fallback-observation.js";
import type { FallbackAttempt, ModelCandidate } from "./model-fallback.types.js";
import { modelKey, normalizeModelRef } from "./model-selection-normalize.js";
import {
@@ -133,6 +137,8 @@ type ModelFallbackErrorHandler = (attempt: {
total: number;
}) => void | Promise<void>;
type ModelFallbackStepHandler = (step: ModelFallbackStepFields) => void | Promise<void>;
export type ModelFallbackResultClassification =
| {
message: string;
@@ -297,7 +303,7 @@ function recordFailedCandidateAttempt(params: {
isPrimary: boolean;
requestedModelMatched: boolean;
fallbackConfigured: boolean;
}) {
}): ModelFallbackStepFields | undefined {
const described = describeFailoverError(params.error);
params.attempts.push({
provider: params.candidate.provider,
@@ -307,7 +313,7 @@ function recordFailedCandidateAttempt(params: {
status: described.status,
code: described.code,
});
logModelFallbackDecision({
return logModelFallbackDecision({
decision: "candidate_failed",
runId: params.runId,
requestedProvider: params.requestedProvider ?? params.candidate.provider,
@@ -737,6 +743,7 @@ export async function runWithModelFallback<T>(params: {
fallbacksOverride?: string[];
run: ModelFallbackRunFn<T>;
onError?: ModelFallbackErrorHandler;
onFallbackStep?: ModelFallbackStepHandler;
classifyResult?: ModelFallbackResultClassifier<T>;
}): Promise<ModelFallbackRunResult<T>> {
const candidates = resolveFallbackCandidates({
@@ -755,6 +762,20 @@ export async function runWithModelFallback<T>(params: {
const attempts: FallbackAttempt[] = [];
let lastError: unknown;
const cooldownProbeUsedProviders = new Set<string>();
// Log a fallback decision and, when it maps to a concrete fallback step,
// notify the caller's onFallbackStep handler (awaited so recording completes
// before the loop continues).
const observeDecision = async (decision: ModelFallbackDecisionParams) => {
const fallbackStep = logModelFallbackDecision(decision);
if (fallbackStep) {
await params.onFallbackStep?.(fallbackStep);
}
};
// Record a failed candidate attempt and forward the derived fallback step
// (if any) to the caller's onFallbackStep handler.
const observeFailedCandidate = async (
failedAttempt: Parameters<typeof recordFailedCandidateAttempt>[0],
) => {
const fallbackStep = recordFailedCandidateAttempt(failedAttempt);
if (fallbackStep) {
await params.onFallbackStep?.(fallbackStep);
}
};
const hasFallbackCandidates = candidates.length > 1;
@@ -799,7 +820,7 @@ export async function runWithModelFallback<T>(params: {
error: decision.error,
reason: decision.reason,
});
logModelFallbackDecision({
await observeDecision({
decision: "skip_candidate",
runId: params.runId,
requestedProvider: params.provider,
@@ -834,7 +855,7 @@ export async function runWithModelFallback<T>(params: {
error,
reason: decision.reason,
});
logModelFallbackDecision({
await observeDecision({
decision: "skip_candidate",
runId: params.runId,
requestedProvider: params.provider,
@@ -858,7 +879,7 @@ export async function runWithModelFallback<T>(params: {
}
}
attemptedDuringCooldown = true;
logModelFallbackDecision({
await observeDecision({
decision: "probe_cooldown_candidate",
runId: params.runId,
requestedProvider: params.provider,
@@ -888,7 +909,7 @@ export async function runWithModelFallback<T>(params: {
});
if ("success" in attemptRun) {
if (i > 0 || attempts.length > 0 || attemptedDuringCooldown) {
logModelFallbackDecision({
await observeDecision({
decision: "candidate_succeeded",
runId: params.runId,
requestedProvider: params.provider,
@@ -956,7 +977,7 @@ export async function runWithModelFallback<T>(params: {
model: candidate.model,
});
lastError = switchNormalized;
recordFailedCandidateAttempt({
await observeFailedCandidate({
attempts,
candidate,
error: switchNormalized,
@@ -982,7 +1003,7 @@ export async function runWithModelFallback<T>(params: {
}
lastError = isKnownFailover ? normalized : err;
recordFailedCandidateAttempt({
await observeFailedCandidate({
attempts,
candidate,
error: normalized,