From ed86252aa5a514728a116ad9a9c9d43db41fcc65 Mon Sep 17 00:00:00 2001 From: Frank Yang Date: Mon, 2 Mar 2026 09:11:05 +0800 Subject: [PATCH] fix: handle CLI session expired errors gracefully instead of crashing gateway (#31090) * fix: handle CLI session expired errors gracefully - Add session_expired to FailoverReason type - Add isCliSessionExpiredErrorMessage to detect expired CLI sessions - Modify runCliAgent to retry with new session when session expires - Update agentCommand to clear expired session IDs from session store - Add proper error handling to prevent gateway crashes on expired sessions Fixes #30986 * fix: add session_expired to AuthProfileFailureReason and missing log import * fix: type cli-runner usage field to match EmbeddedPiAgentMeta * fix: harden CLI session-expiry recovery handling * build: regenerate host env security policy swift --------- Co-authored-by: Peter Steinberger --- .../HostEnvSecurityPolicy.generated.swift | 6 +- src/agents/auth-profiles/types.ts | 1 + src/agents/cli-runner.test.ts | 44 ++ src/agents/cli-runner.ts | 443 ++++++++++-------- src/agents/failover-error.ts | 2 + src/agents/pi-embedded-helpers/errors.ts | 24 + src/agents/pi-embedded-helpers/types.ts | 1 + src/commands/agent.test.ts | 63 +++ src/commands/agent.ts | 103 +++- 9 files changed, 481 insertions(+), 206 deletions(-) diff --git a/apps/macos/Sources/OpenClaw/HostEnvSecurityPolicy.generated.swift b/apps/macos/Sources/OpenClaw/HostEnvSecurityPolicy.generated.swift index e4927331b4f..b126d03de21 100644 --- a/apps/macos/Sources/OpenClaw/HostEnvSecurityPolicy.generated.swift +++ b/apps/macos/Sources/OpenClaw/HostEnvSecurityPolicy.generated.swift @@ -22,17 +22,17 @@ enum HostEnvSecurityPolicy { "PS4", "GCONV_PATH", "IFS", - "SSLKEYLOGFILE", + "SSLKEYLOGFILE" ] static let blockedOverrideKeys: Set = [ "HOME", - "ZDOTDIR", + "ZDOTDIR" ] static let blockedPrefixes: [String] = [ "DYLD_", "LD_", - "BASH_FUNC_", + "BASH_FUNC_" ] } diff --git a/src/agents/auth-profiles/types.ts b/src/agents/auth-profiles/types.ts index f4e56f59d68..3c186350667 100644 --- a/src/agents/auth-profiles/types.ts +++ b/src/agents/auth-profiles/types.ts @@ -43,6 +43,7 @@ export type AuthProfileFailureReason = | "billing" | "timeout" | "model_not_found" + | "session_expired" | "unknown"; /** Per-profile usage statistics for round-robin and cooldown tracking */ diff --git a/src/agents/cli-runner.test.ts b/src/agents/cli-runner.test.ts index 7d512dd4dbe..ec2ea4768c5 100644 --- a/src/agents/cli-runner.test.ts +++ b/src/agents/cli-runner.test.ts @@ -153,6 +153,50 @@ describe("runCliAgent with process supervisor", () => { ).rejects.toThrow("exceeded timeout"); }); + it("rethrows the retry failure when session-expired recovery retry also fails", async () => { + supervisorSpawnMock.mockResolvedValueOnce( + createManagedRun({ + reason: "exit", + exitCode: 1, + exitSignal: null, + durationMs: 150, + stdout: "", + stderr: "session expired", + timedOut: false, + noOutputTimedOut: false, + }), + ); + supervisorSpawnMock.mockResolvedValueOnce( + createManagedRun({ + reason: "exit", + exitCode: 1, + exitSignal: null, + durationMs: 150, + stdout: "", + stderr: "rate limit exceeded", + timedOut: false, + noOutputTimedOut: false, + }), + ); + + await expect( + runCliAgent({ + sessionId: "s1", + sessionKey: "agent:main:subagent:retry", + sessionFile: "/tmp/session.jsonl", + workspaceDir: "/tmp", + prompt: "hi", + provider: "codex-cli", + model: "gpt-5.2-codex", + timeoutMs: 1_000, + runId: "run-retry-failure", + cliSessionId: "thread-123", + }), + ).rejects.toThrow("rate limit exceeded"); + + expect(supervisorSpawnMock).toHaveBeenCalledTimes(2); + }); + it("falls back to per-agent workspace when workspaceDir is missing", async () => { const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-cli-runner-")); const fallbackWorkspace = path.join(tempDir, "workspace-main"); diff --git a/src/agents/cli-runner.ts b/src/agents/cli-runner.ts index cc19546b534..0757483b549 100644 --- a/src/agents/cli-runner.ts +++ b/src/agents/cli-runner.ts @@ -122,204 +122,221 @@ export async function runCliAgent(params: { agentId: sessionAgentId, }); - const { sessionId: cliSessionIdToSend, isNew } = resolveSessionIdToSend({ - backend, - cliSessionId: params.cliSessionId, - }); - const useResume = Boolean( - params.cliSessionId && - cliSessionIdToSend && - backend.resumeArgs && - backend.resumeArgs.length > 0, - ); - const sessionIdSent = cliSessionIdToSend - ? useResume || Boolean(backend.sessionArg) || Boolean(backend.sessionArgs?.length) - ? cliSessionIdToSend - : undefined - : undefined; - const systemPromptArg = resolveSystemPromptUsage({ - backend, - isNewSession: isNew, - systemPrompt, - }); - - let imagePaths: string[] | undefined; - let cleanupImages: (() => Promise) | undefined; - let prompt = params.prompt; - if (params.images && params.images.length > 0) { - const imagePayload = await writeCliImages(params.images); - imagePaths = imagePayload.paths; - cleanupImages = imagePayload.cleanup; - if (!backend.imageArg) { - prompt = appendImagePathsToPrompt(prompt, imagePaths); - } - } - - const { argsPrompt, stdin } = resolvePromptInput({ - backend, - prompt, - }); - const stdinPayload = stdin ?? ""; - const baseArgs = useResume ? (backend.resumeArgs ?? backend.args ?? []) : (backend.args ?? []); - const resolvedArgs = useResume - ? baseArgs.map((entry) => entry.replaceAll("{sessionId}", cliSessionIdToSend ?? "")) - : baseArgs; - const args = buildCliArgs({ - backend, - baseArgs: resolvedArgs, - modelId: normalizedModel, - sessionId: cliSessionIdToSend, - systemPrompt: systemPromptArg, - imagePaths, - promptArg: argsPrompt, - useResume, - }); - - const serialize = backend.serialize ?? true; - const queueKey = serialize ? backendResolved.id : `${backendResolved.id}:${params.runId}`; - - try { - const output = await enqueueCliRun(queueKey, async () => { - log.info( - `cli exec: provider=${params.provider} model=${normalizedModel} promptChars=${params.prompt.length}`, - ); - const logOutputText = isTruthyEnvValue(process.env.OPENCLAW_CLAUDE_CLI_LOG_OUTPUT); - if (logOutputText) { - const logArgs: string[] = []; - for (let i = 0; i < args.length; i += 1) { - const arg = args[i] ?? ""; - if (arg === backend.systemPromptArg) { - const systemPromptValue = args[i + 1] ?? ""; - logArgs.push(arg, ``); - i += 1; - continue; - } - if (arg === backend.sessionArg) { - logArgs.push(arg, args[i + 1] ?? ""); - i += 1; - continue; - } - if (arg === backend.modelArg) { - logArgs.push(arg, args[i + 1] ?? ""); - i += 1; - continue; - } - if (arg === backend.imageArg) { - logArgs.push(arg, ""); - i += 1; - continue; - } - logArgs.push(arg); - } - if (argsPrompt) { - const promptIndex = logArgs.indexOf(argsPrompt); - if (promptIndex >= 0) { - logArgs[promptIndex] = ``; - } - } - log.info(`cli argv: ${backend.command} ${logArgs.join(" ")}`); - } - - const env = (() => { - const next = { ...process.env, ...backend.env }; - for (const key of backend.clearEnv ?? []) { - delete next[key]; - } - return next; - })(); - const noOutputTimeoutMs = resolveCliNoOutputTimeoutMs({ - backend, - timeoutMs: params.timeoutMs, - useResume, - }); - const supervisor = getProcessSupervisor(); - const scopeKey = buildCliSupervisorScopeKey({ - backend, - backendId: backendResolved.id, - cliSessionId: useResume ? cliSessionIdToSend : undefined, - }); - - const managedRun = await supervisor.spawn({ - sessionId: params.sessionId, - backendId: backendResolved.id, - scopeKey, - replaceExistingScope: Boolean(useResume && scopeKey), - mode: "child", - argv: [backend.command, ...args], - timeoutMs: params.timeoutMs, - noOutputTimeoutMs, - cwd: workspaceDir, - env, - input: stdinPayload, - }); - const result = await managedRun.wait(); - - const stdout = result.stdout.trim(); - const stderr = result.stderr.trim(); - if (logOutputText) { - if (stdout) { - log.info(`cli stdout:\n${stdout}`); - } - if (stderr) { - log.info(`cli stderr:\n${stderr}`); - } - } - if (shouldLogVerbose()) { - if (stdout) { - log.debug(`cli stdout:\n${stdout}`); - } - if (stderr) { - log.debug(`cli stderr:\n${stderr}`); - } - } - - if (result.exitCode !== 0 || result.reason !== "exit") { - if (result.reason === "no-output-timeout" || result.noOutputTimedOut) { - const timeoutReason = `CLI produced no output for ${Math.round(noOutputTimeoutMs / 1000)}s and was terminated.`; - log.warn( - `cli watchdog timeout: provider=${params.provider} model=${modelId} session=${cliSessionIdToSend ?? params.sessionId} noOutputTimeoutMs=${noOutputTimeoutMs} pid=${managedRun.pid ?? "unknown"}`, - ); - throw new FailoverError(timeoutReason, { - reason: "timeout", - provider: params.provider, - model: modelId, - status: resolveFailoverStatus("timeout"), - }); - } - if (result.reason === "overall-timeout") { - const timeoutReason = `CLI exceeded timeout (${Math.round(params.timeoutMs / 1000)}s) and was terminated.`; - throw new FailoverError(timeoutReason, { - reason: "timeout", - provider: params.provider, - model: modelId, - status: resolveFailoverStatus("timeout"), - }); - } - const err = stderr || stdout || "CLI failed."; - const reason = classifyFailoverReason(err) ?? "unknown"; - const status = resolveFailoverStatus(reason); - throw new FailoverError(err, { - reason, - provider: params.provider, - model: modelId, - status, - }); - } - - const outputMode = useResume ? (backend.resumeOutput ?? backend.output) : backend.output; - - if (outputMode === "text") { - return { text: stdout, sessionId: undefined }; - } - if (outputMode === "jsonl") { - const parsed = parseCliJsonl(stdout, backend); - return parsed ?? { text: stdout }; - } - - const parsed = parseCliJson(stdout, backend); - return parsed ?? { text: stdout }; + // Helper function to execute CLI with given session ID + const executeCliWithSession = async ( + cliSessionIdToUse?: string, + ): Promise<{ + text: string; + sessionId?: string; + usage?: { + input?: number; + output?: number; + cacheRead?: number; + cacheWrite?: number; + total?: number; + }; + }> => { + const { sessionId: resolvedSessionId, isNew } = resolveSessionIdToSend({ + backend, + cliSessionId: cliSessionIdToUse, + }); + const useResume = Boolean( + cliSessionIdToUse && resolvedSessionId && backend.resumeArgs && backend.resumeArgs.length > 0, + ); + const systemPromptArg = resolveSystemPromptUsage({ + backend, + isNewSession: isNew, + systemPrompt, }); + let imagePaths: string[] | undefined; + let cleanupImages: (() => Promise) | undefined; + let prompt = params.prompt; + if (params.images && params.images.length > 0) { + const imagePayload = await writeCliImages(params.images); + imagePaths = imagePayload.paths; + cleanupImages = imagePayload.cleanup; + if (!backend.imageArg) { + prompt = appendImagePathsToPrompt(prompt, imagePaths); + } + } + + const { argsPrompt, stdin } = resolvePromptInput({ + backend, + prompt, + }); + const stdinPayload = stdin ?? ""; + const baseArgs = useResume ? (backend.resumeArgs ?? backend.args ?? []) : (backend.args ?? []); + const resolvedArgs = useResume + ? baseArgs.map((entry) => entry.replaceAll("{sessionId}", resolvedSessionId ?? "")) + : baseArgs; + const args = buildCliArgs({ + backend, + baseArgs: resolvedArgs, + modelId: normalizedModel, + sessionId: resolvedSessionId, + systemPrompt: systemPromptArg, + imagePaths, + promptArg: argsPrompt, + useResume, + }); + + const serialize = backend.serialize ?? true; + const queueKey = serialize ? backendResolved.id : `${backendResolved.id}:${params.runId}`; + + try { + const output = await enqueueCliRun(queueKey, async () => { + log.info( + `cli exec: provider=${params.provider} model=${normalizedModel} promptChars=${params.prompt.length}`, + ); + const logOutputText = isTruthyEnvValue(process.env.OPENCLAW_CLAUDE_CLI_LOG_OUTPUT); + if (logOutputText) { + const logArgs: string[] = []; + for (let i = 0; i < args.length; i += 1) { + const arg = args[i] ?? ""; + if (arg === backend.systemPromptArg) { + const systemPromptValue = args[i + 1] ?? ""; + logArgs.push(arg, ``); + i += 1; + continue; + } + if (arg === backend.sessionArg) { + logArgs.push(arg, args[i + 1] ?? ""); + i += 1; + continue; + } + if (arg === backend.modelArg) { + logArgs.push(arg, args[i + 1] ?? ""); + i += 1; + continue; + } + if (arg === backend.imageArg) { + logArgs.push(arg, ""); + i += 1; + continue; + } + logArgs.push(arg); + } + if (argsPrompt) { + const promptIndex = logArgs.indexOf(argsPrompt); + if (promptIndex >= 0) { + logArgs[promptIndex] = ``; + } + } + log.info(`cli argv: ${backend.command} ${logArgs.join(" ")}`); + } + + const env = (() => { + const next = { ...process.env, ...backend.env }; + for (const key of backend.clearEnv ?? []) { + delete next[key]; + } + return next; + })(); + const noOutputTimeoutMs = resolveCliNoOutputTimeoutMs({ + backend, + timeoutMs: params.timeoutMs, + useResume, + }); + const supervisor = getProcessSupervisor(); + const scopeKey = buildCliSupervisorScopeKey({ + backend, + backendId: backendResolved.id, + cliSessionId: useResume ? resolvedSessionId : undefined, + }); + + const managedRun = await supervisor.spawn({ + sessionId: params.sessionId, + backendId: backendResolved.id, + scopeKey, + replaceExistingScope: Boolean(useResume && scopeKey), + mode: "child", + argv: [backend.command, ...args], + timeoutMs: params.timeoutMs, + noOutputTimeoutMs, + cwd: workspaceDir, + env, + input: stdinPayload, + }); + const result = await managedRun.wait(); + + const stdout = result.stdout.trim(); + const stderr = result.stderr.trim(); + if (logOutputText) { + if (stdout) { + log.info(`cli stdout:\n${stdout}`); + } + if (stderr) { + log.info(`cli stderr:\n${stderr}`); + } + } + if (shouldLogVerbose()) { + if (stdout) { + log.debug(`cli stdout:\n${stdout}`); + } + if (stderr) { + log.debug(`cli stderr:\n${stderr}`); + } + } + + if (result.exitCode !== 0 || result.reason !== "exit") { + if (result.reason === "no-output-timeout" || result.noOutputTimedOut) { + const timeoutReason = `CLI produced no output for ${Math.round(noOutputTimeoutMs / 1000)}s and was terminated.`; + log.warn( + `cli watchdog timeout: provider=${params.provider} model=${modelId} session=${resolvedSessionId ?? params.sessionId} noOutputTimeoutMs=${noOutputTimeoutMs} pid=${managedRun.pid ?? "unknown"}`, + ); + throw new FailoverError(timeoutReason, { + reason: "timeout", + provider: params.provider, + model: modelId, + status: resolveFailoverStatus("timeout"), + }); + } + if (result.reason === "overall-timeout") { + const timeoutReason = `CLI exceeded timeout (${Math.round(params.timeoutMs / 1000)}s) and was terminated.`; + throw new FailoverError(timeoutReason, { + reason: "timeout", + provider: params.provider, + model: modelId, + status: resolveFailoverStatus("timeout"), + }); + } + const err = stderr || stdout || "CLI failed."; + const reason = classifyFailoverReason(err) ?? "unknown"; + const status = resolveFailoverStatus(reason); + throw new FailoverError(err, { + reason, + provider: params.provider, + model: modelId, + status, + }); + } + + const outputMode = useResume ? (backend.resumeOutput ?? backend.output) : backend.output; + + if (outputMode === "text") { + return { text: stdout, sessionId: undefined }; + } + if (outputMode === "jsonl") { + const parsed = parseCliJsonl(stdout, backend); + return parsed ?? { text: stdout }; + } + + const parsed = parseCliJson(stdout, backend); + return parsed ?? { text: stdout }; + }); + + return output; + } finally { + if (cleanupImages) { + await cleanupImages(); + } + } + }; + + // Try with the provided CLI session ID first + try { + const output = await executeCliWithSession(params.cliSessionId); const text = output.text?.trim(); const payloads = text ? [{ text }] : undefined; @@ -328,7 +345,7 @@ export async function runCliAgent(params: { meta: { durationMs: Date.now() - started, agentMeta: { - sessionId: output.sessionId ?? sessionIdSent ?? params.sessionId ?? "", + sessionId: output.sessionId ?? params.cliSessionId ?? params.sessionId ?? "", provider: params.provider, model: modelId, usage: output.usage, @@ -337,6 +354,34 @@ export async function runCliAgent(params: { }; } catch (err) { if (err instanceof FailoverError) { + // Check if this is a session expired error and we have a session to clear + if (err.reason === "session_expired" && params.cliSessionId && params.sessionKey) { + log.warn( + `CLI session expired, clearing session ID and retrying: provider=${params.provider} session=${redactRunIdentifier(params.cliSessionId)}`, + ); + + // Clear the expired session ID from the session entry + // This requires access to the session store, which we don't have here + // We'll need to modify the caller to handle this case + + // For now, retry without the session ID to create a new session + const output = await executeCliWithSession(undefined); + const text = output.text?.trim(); + const payloads = text ? [{ text }] : undefined; + + return { + payloads, + meta: { + durationMs: Date.now() - started, + agentMeta: { + sessionId: output.sessionId ?? params.sessionId ?? "", + provider: params.provider, + model: modelId, + usage: output.usage, + }, + }, + }; + } throw err; } const message = err instanceof Error ? err.message : String(err); @@ -351,10 +396,6 @@ export async function runCliAgent(params: { }); } throw err; - } finally { - if (cleanupImages) { - await cleanupImages(); - } } } diff --git a/src/agents/failover-error.ts b/src/agents/failover-error.ts index 708af55e322..ee287d79484 100644 --- a/src/agents/failover-error.ts +++ b/src/agents/failover-error.ts @@ -59,6 +59,8 @@ export function resolveFailoverStatus(reason: FailoverReason): number | undefine return 400; case "model_not_found": return 404; + case "session_expired": + return 410; // Gone - session no longer exists default: return undefined; } diff --git a/src/agents/pi-embedded-helpers/errors.ts b/src/agents/pi-embedded-helpers/errors.ts index 5f8d70e3bbc..3d608696705 100644 --- a/src/agents/pi-embedded-helpers/errors.ts +++ b/src/agents/pi-embedded-helpers/errors.ts @@ -883,6 +883,27 @@ export function isModelNotFoundErrorMessage(raw: string): boolean { return false; } +function isCliSessionExpiredErrorMessage(raw: string): boolean { + if (!raw) { + return false; + } + const lower = raw.toLowerCase(); + return ( + lower.includes("session not found") || + lower.includes("session does not exist") || + lower.includes("session expired") || + lower.includes("session invalid") || + lower.includes("conversation not found") || + lower.includes("conversation does not exist") || + lower.includes("conversation expired") || + lower.includes("conversation invalid") || + lower.includes("no such session") || + lower.includes("invalid session") || + lower.includes("session id not found") || + lower.includes("conversation id not found") + ); +} + export function classifyFailoverReason(raw: string): FailoverReason | null { if (isImageDimensionErrorMessage(raw)) { return null; @@ -890,6 +911,9 @@ export function classifyFailoverReason(raw: string): FailoverReason | null { if (isImageSizeError(raw)) { return null; } + if (isCliSessionExpiredErrorMessage(raw)) { + return "session_expired"; + } if (isModelNotFoundErrorMessage(raw)) { return "model_not_found"; } diff --git a/src/agents/pi-embedded-helpers/types.ts b/src/agents/pi-embedded-helpers/types.ts index 2440473d9f6..86ee1c4cda1 100644 --- a/src/agents/pi-embedded-helpers/types.ts +++ b/src/agents/pi-embedded-helpers/types.ts @@ -8,4 +8,5 @@ export type FailoverReason = | "billing" | "timeout" | "model_not_found" + | "session_expired" | "unknown"; diff --git a/src/commands/agent.test.ts b/src/commands/agent.test.ts index eca0169c256..b60eaffa9c3 100644 --- a/src/commands/agent.test.ts +++ b/src/commands/agent.test.ts @@ -4,7 +4,9 @@ import { beforeEach, describe, expect, it, type MockInstance, vi } from "vitest" import { withTempHome as withTempHomeBase } from "../../test/helpers/temp-home.js"; import "../cron/isolated-agent.mocks.js"; import * as cliRunnerModule from "../agents/cli-runner.js"; +import { FailoverError } from "../agents/failover-error.js"; import { loadModelCatalog } from "../agents/model-catalog.js"; +import * as modelSelectionModule from "../agents/model-selection.js"; import { runEmbeddedPiAgent } from "../agents/pi-embedded.js"; import type { OpenClawConfig } from "../config/config.js"; import * as configModule from "../config/config.js"; @@ -148,6 +150,7 @@ beforeEach(() => { }, }); vi.mocked(loadModelCatalog).mockResolvedValue([]); + vi.mocked(modelSelectionModule.isCliProvider).mockImplementation(() => false); }); describe("agentCommand", () => { @@ -640,6 +643,66 @@ describe("agentCommand", () => { }); }); + it("clears stale Claude CLI legacy session IDs before retrying after session expiration", async () => { + vi.mocked(modelSelectionModule.isCliProvider).mockImplementation( + (provider) => provider.trim().toLowerCase() === "claude-cli", + ); + try { + await withTempHome(async (home) => { + const store = path.join(home, "sessions.json"); + const sessionKey = "agent:main:subagent:cli-expired"; + writeSessionStoreSeed(store, { + [sessionKey]: { + sessionId: "session-cli-123", + updatedAt: Date.now(), + providerOverride: "claude-cli", + modelOverride: "opus", + cliSessionIds: { "claude-cli": "stale-cli-session" }, + claudeCliSessionId: "stale-legacy-session", + }, + }); + mockConfig(home, store, { + model: { primary: "claude-cli/opus", fallbacks: [] }, + models: { "claude-cli/opus": {} }, + }); + runCliAgentSpy + .mockRejectedValueOnce( + new FailoverError("session expired", { + reason: "session_expired", + provider: "claude-cli", + model: "opus", + status: 410, + }), + ) + .mockRejectedValue(new Error("retry failed")); + + await expect(agentCommand({ message: "hi", sessionKey }, runtime)).rejects.toThrow( + "retry failed", + ); + + expect(runCliAgentSpy).toHaveBeenCalledTimes(2); + const firstCall = runCliAgentSpy.mock.calls[0]?.[0] as + | { cliSessionId?: string } + | undefined; + const secondCall = runCliAgentSpy.mock.calls[1]?.[0] as + | { cliSessionId?: string } + | undefined; + expect(firstCall?.cliSessionId).toBe("stale-cli-session"); + expect(secondCall?.cliSessionId).toBeUndefined(); + + const saved = JSON.parse(fs.readFileSync(store, "utf-8")) as Record< + string, + { cliSessionIds?: Record; claudeCliSessionId?: string } + >; + const entry = saved[sessionKey]; + expect(entry?.cliSessionIds?.["claude-cli"]).toBeUndefined(); + expect(entry?.claudeCliSessionId).toBeUndefined(); + }); + } finally { + vi.mocked(modelSelectionModule.isCliProvider).mockImplementation(() => false); + } + }); + it("rejects unknown agent overrides", async () => { await withTempHome(async (home) => { const store = path.join(home, "sessions.json"); diff --git a/src/commands/agent.ts b/src/commands/agent.ts index 4dacd08cb72..0ebde04fe1e 100644 --- a/src/commands/agent.ts +++ b/src/commands/agent.ts @@ -1,6 +1,9 @@ import { getAcpSessionManager } from "../acp/control-plane/manager.js"; import { resolveAcpAgentPolicyError, resolveAcpDispatchPolicyError } from "../acp/policy.js"; import { toAcpRuntimeError } from "../acp/runtime/errors.js"; +import { createSubsystemLogger } from "../logging/subsystem.js"; + +const log = createSubsystemLogger("commands/agent"); import { listAgentIds, resolveAgentDir, @@ -12,8 +15,9 @@ import { import { ensureAuthProfileStore } from "../agents/auth-profiles.js"; import { clearSessionAuthProfileOverride } from "../agents/auth-profiles/session-override.js"; import { runCliAgent } from "../agents/cli-runner.js"; -import { getCliSessionId } from "../agents/cli-session.js"; +import { getCliSessionId, setCliSessionId } from "../agents/cli-session.js"; import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "../agents/defaults.js"; +import { FailoverError } from "../agents/failover-error.js"; import { formatAgentInternalEventsForPrompt } from "../agents/internal-events.js"; import { AGENT_LANE_SUBAGENT } from "../agents/lanes.js"; import { loadModelCatalog } from "../agents/model-catalog.js"; @@ -23,6 +27,7 @@ import { isCliProvider, modelKey, normalizeModelRef, + normalizeProviderId, resolveConfiguredModelRef, resolveDefaultModelForAgent, resolveThinkingDefault, @@ -89,7 +94,8 @@ type OverrideFieldClearedByDelete = | "authProfileOverrideCompactionCount" | "fallbackNoticeSelectedModel" | "fallbackNoticeActiveModel" - | "fallbackNoticeReason"; + | "fallbackNoticeReason" + | "claudeCliSessionId"; const OVERRIDE_FIELDS_CLEARED_BY_DELETE: OverrideFieldClearedByDelete[] = [ "providerOverride", @@ -100,6 +106,7 @@ const OVERRIDE_FIELDS_CLEARED_BY_DELETE: OverrideFieldClearedByDelete[] = [ "fallbackNoticeSelectedModel", "fallbackNoticeActiveModel", "fallbackNoticeReason", + "claudeCliSessionId", ]; async function persistSessionEntry(params: PersistSessionEntryParams): Promise { @@ -162,6 +169,8 @@ function runAgentAttempt(params: { agentDir: string; onAgentEvent: (evt: { stream: string; data?: Record }) => void; primaryProvider: string; + sessionStore?: Record; + storePath?: string; }) { const senderIsOwner = params.opts.senderIsOwner ?? true; const effectivePrompt = resolveFallbackRetryPrompt({ @@ -187,6 +196,94 @@ function runAgentAttempt(params: { cliSessionId, images: params.isFallbackRetry ? undefined : params.opts.images, streamParams: params.opts.streamParams, + }).catch(async (err) => { + // Handle CLI session expired error + if ( + err instanceof FailoverError && + err.reason === "session_expired" && + cliSessionId && + params.sessionKey && + params.sessionStore && + params.storePath + ) { + log.warn( + `CLI session expired, clearing from session store: provider=${params.providerOverride} sessionKey=${params.sessionKey}`, + ); + + // Clear the expired session ID from the session store + const entry = params.sessionStore[params.sessionKey]; + if (entry) { + const updatedEntry = { ...entry }; + if (params.providerOverride === "claude-cli") { + delete updatedEntry.claudeCliSessionId; + } + if (updatedEntry.cliSessionIds) { + const normalizedProvider = normalizeProviderId(params.providerOverride); + const newCliSessionIds = { ...updatedEntry.cliSessionIds }; + delete newCliSessionIds[normalizedProvider]; + updatedEntry.cliSessionIds = newCliSessionIds; + } + updatedEntry.updatedAt = Date.now(); + + await persistSessionEntry({ + sessionStore: params.sessionStore, + sessionKey: params.sessionKey, + storePath: params.storePath, + entry: updatedEntry, + }); + + // Update the session entry reference + params.sessionEntry = updatedEntry; + } + + // Retry with no session ID (will create a new session) + return runCliAgent({ + sessionId: params.sessionId, + sessionKey: params.sessionKey, + agentId: params.sessionAgentId, + sessionFile: params.sessionFile, + workspaceDir: params.workspaceDir, + config: params.cfg, + prompt: effectivePrompt, + provider: params.providerOverride, + model: params.modelOverride, + thinkLevel: params.resolvedThinkLevel, + timeoutMs: params.timeoutMs, + runId: params.runId, + extraSystemPrompt: params.opts.extraSystemPrompt, + cliSessionId: undefined, // No session ID to force new session + images: params.isFallbackRetry ? undefined : params.opts.images, + streamParams: params.opts.streamParams, + }).then(async (result) => { + // Update session store with new CLI session ID if available + if ( + result.meta.agentMeta?.sessionId && + params.sessionKey && + params.sessionStore && + params.storePath + ) { + const entry = params.sessionStore[params.sessionKey]; + if (entry) { + const updatedEntry = { ...entry }; + setCliSessionId( + updatedEntry, + params.providerOverride, + result.meta.agentMeta.sessionId, + ); + updatedEntry.updatedAt = Date.now(); + + await persistSessionEntry({ + sessionStore: params.sessionStore, + sessionKey: params.sessionKey, + storePath: params.storePath, + entry: updatedEntry, + }); + } + } + return result; + }); + } + throw err; }); } @@ -766,6 +863,8 @@ export async function agentCommand( resolvedVerboseLevel, agentDir, primaryProvider: provider, + sessionStore, + storePath, onAgentEvent: (evt) => { // Track lifecycle end for fallback emission below. if (