From 33a26cd807992bf88e4cc86e2df64d3b6447ec2b Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 2 May 2026 07:38:59 +0100 Subject: [PATCH] fix: restart closed codex app-server clients --- CHANGELOG.md | 1 + extensions/codex/src/app-server/client.ts | 10 +++ .../codex/src/app-server/run-attempt.test.ts | 78 ++++++++++++++++++- .../codex/src/app-server/run-attempt.ts | 59 +++++++++----- .../codex/src/app-server/thread-lifecycle.ts | 5 +- 5 files changed, 133 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2d45bffa24c..0cae76491b0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,7 @@ Docs: https://docs.openclaw.ai - Active Memory: use the configured recall timeout as the blocking prompt-build hook budget by default and move cold-start setup grace behind explicit `setupGraceTimeoutMs` config, so the plugin no longer silently extends 15000 ms configs to 45000 ms on the main lane. Fixes #75843. Thanks @vishutdhar. - Agents/sandbox: preserve existing workspace file modes when sandbox edits atomically replace files, so 0644 files do not collapse to 0600 after Write/Edit/apply_patch. Fixes #44077. Thanks @patosullivan. - Agents/models: keep legacy CLI runtime model refs such as `claude-cli/*` in the configured allowlist after canonical runtime migration, so cron `payload.model` overrides keep working. Fixes #75753. Thanks @RyanSandoval. +- Codex/app-server: restart the shared Codex app-server client once when it closes during startup thread resume, preserving the existing thread binding instead of retrying `thread/start` on a closed client. Thanks @vincentkoc. - Gateway/watch: keep colored subsystem log prefixes in the managed tmux pane even when the parent shell exports `NO_COLOR`, while preserving explicit `FORCE_COLOR=0` opt-out. Thanks @vincentkoc. - Agents/compaction: submit a non-empty runtime-event marker for pre-compaction memory flush turns, so strict Anthropic providers no longer reject the silent flush as an empty user message. Fixes #75305. Thanks @sableassistant3777-source. - Plugin SDK: re-export `isPrivateIpAddress` from `plugin-sdk/ssrf-runtime`, restoring source-checkout builds for SearXNG and Firecrawl private-network guards. Thanks @vincentkoc. diff --git a/extensions/codex/src/app-server/client.ts b/extensions/codex/src/app-server/client.ts index 38e36c756e5..be2d27cf8e6 100644 --- a/extensions/codex/src/app-server/client.ts +++ b/extensions/codex/src/app-server/client.ts @@ -48,6 +48,16 @@ export class CodexAppServerRpcError extends Error { } } +export function isCodexAppServerConnectionClosedError(error: unknown): boolean { + if (!(error instanceof Error)) { + return false; + } + return ( + error.message === "codex app-server client is closed" || + error.message.startsWith("codex app-server exited:") + ); +} + type CodexServerRequestHandler = ( request: Required> & { params?: JsonValue }, ) => Promise | JsonValue | undefined; diff --git a/extensions/codex/src/app-server/run-attempt.test.ts b/extensions/codex/src/app-server/run-attempt.test.ts index deedcdebece..cf2ca98521d 100644 --- a/extensions/codex/src/app-server/run-attempt.test.ts +++ b/extensions/codex/src/app-server/run-attempt.test.ts @@ -25,7 +25,7 @@ import { CODEX_GPT5_BEHAVIOR_CONTRACT } from "../../prompt-overlay.js"; import * as elicitationBridge from "./elicitation-bridge.js"; import type { CodexServerNotification } from "./protocol.js"; import { runCodexAppServerAttempt, __testing } from "./run-attempt.js"; -import { writeCodexAppServerBinding } from "./session-binding.js"; +import { readCodexAppServerBinding, writeCodexAppServerBinding } from "./session-binding.js"; import { createCodexTestModel } from "./test-support.js"; import { buildThreadResumeParams, @@ -1941,6 +1941,82 @@ describe("runCodexAppServerAttempt", () => { expect(request.mock.calls.map(([method]) => method)).toEqual(["thread/start", "thread/resume"]); }); + it("preserves the binding when the app-server closes during thread resume", async () => { + const sessionFile = path.join(tempDir, "session.jsonl"); + const workspaceDir = path.join(tempDir, "workspace"); + await writeExistingBinding(sessionFile, workspaceDir, { dynamicToolsFingerprint: "[]" }); + const appServer = createThreadLifecycleAppServerOptions(); + const request = vi.fn(async (method: string) => { + if (method === "thread/resume") { + throw new Error("codex app-server client is closed"); + } + throw new Error(`unexpected method: ${method}`); + }); + + await expect( + startOrResumeThread({ + client: { request } as never, + params: createParams(sessionFile, workspaceDir), + cwd: workspaceDir, + dynamicTools: [], + appServer, + }), + ).rejects.toThrow("codex app-server client is closed"); + + expect(request.mock.calls.map(([method]) => method)).toEqual(["thread/resume"]); + await expect(readCodexAppServerBinding(sessionFile)).resolves.toMatchObject({ + threadId: "thread-existing", + }); + }); + + it("restarts the app-server once when a shared client closes during startup", async () => { + const sessionFile = path.join(tempDir, "session.jsonl"); + const workspaceDir = path.join(tempDir, "workspace"); + await writeExistingBinding(sessionFile, workspaceDir, { dynamicToolsFingerprint: "[]" }); + const requests: string[][] = []; + let starts = 0; + let notify: (notification: CodexServerNotification) => Promise = async () => undefined; + __testing.setCodexAppServerClientFactoryForTests(async () => { + const startIndex = starts++; + const methods: string[] = []; + requests.push(methods); + return { + request: vi.fn(async (method: string) => { + methods.push(method); + if (method === "thread/resume" && startIndex === 0) { + throw new Error("codex app-server client is closed"); + } + if (method === "thread/resume") { + return threadStartResult("thread-existing"); + } + if (method === "turn/start") { + return turnStartResult(); + } + return {}; + }), + addNotificationHandler: (handler: typeof notify) => { + notify = handler; + return () => undefined; + }, + addRequestHandler: () => () => undefined, + } as never; + }); + + const run = runCodexAppServerAttempt(createParams(sessionFile, workspaceDir)); + await vi.waitFor(() => expect(requests[1]).toContain("turn/start"), { interval: 1 }); + await notify({ + method: "turn/completed", + params: { + threadId: "thread-existing", + turnId: "turn-1", + turn: { id: "turn-1", status: "completed" }, + }, + }); + + await expect(run).resolves.toMatchObject({ aborted: false }); + expect(requests).toEqual([["thread/resume"], ["thread/resume", "turn/start"]]); + }); + it("passes native hook relay config on thread start and resume", async () => { const sessionFile = path.join(tempDir, "session.jsonl"); const workspaceDir = path.join(tempDir, "workspace"); diff --git a/extensions/codex/src/app-server/run-attempt.ts b/extensions/codex/src/app-server/run-attempt.ts index 3f2faf337da..3f4ec9aa2c3 100644 --- a/extensions/codex/src/app-server/run-attempt.ts +++ b/extensions/codex/src/app-server/run-attempt.ts @@ -42,7 +42,11 @@ import { createCodexAppServerClientFactoryTestHooks, defaultCodexAppServerClientFactory, } from "./client-factory.js"; -import { isCodexAppServerApprovalRequest, type CodexAppServerClient } from "./client.js"; +import { + isCodexAppServerApprovalRequest, + isCodexAppServerConnectionClosedError, + type CodexAppServerClient, +} from "./client.js"; import { ensureCodexComputerUse } from "./computer-use.js"; import { readCodexPluginConfig, @@ -512,23 +516,42 @@ export async function runCodexAppServerAttempt( timeoutFloorMs: options.startupTimeoutFloorMs, signal: runAbortController.signal, operation: async () => { - const startupClient = await clientFactory(appServer.start, startupAuthProfileId, agentDir); - await ensureCodexComputerUse({ - client: startupClient, - pluginConfig: options.pluginConfig, - timeoutMs: appServer.requestTimeoutMs, - signal: runAbortController.signal, - }); - const startupThread = await startOrResumeThread({ - client: startupClient, - params, - cwd: effectiveWorkspace, - dynamicTools: toolBridge.specs, - appServer, - developerInstructions: promptBuild.developerInstructions, - config: nativeHookRelayConfig, - }); - return { client: startupClient, thread: startupThread }; + const startupAttempt = async () => { + const startupClient = await clientFactory( + appServer.start, + startupAuthProfileId, + agentDir, + ); + await ensureCodexComputerUse({ + client: startupClient, + pluginConfig: options.pluginConfig, + timeoutMs: appServer.requestTimeoutMs, + signal: runAbortController.signal, + }); + const startupThread = await startOrResumeThread({ + client: startupClient, + params, + cwd: effectiveWorkspace, + dynamicTools: toolBridge.specs, + appServer, + developerInstructions: promptBuild.developerInstructions, + config: nativeHookRelayConfig, + }); + return { client: startupClient, thread: startupThread }; + }; + try { + return await startupAttempt(); + } catch (error) { + if (runAbortController.signal.aborted || !isCodexAppServerConnectionClosedError(error)) { + throw error; + } + embeddedAgentLog.warn( + "codex app-server connection closed during startup; restarting app-server and retrying", + { error }, + ); + clearSharedCodexAppServerClient(); + return await startupAttempt(); + } }, })); emitCodexAppServerEvent(params, { diff --git a/extensions/codex/src/app-server/thread-lifecycle.ts b/extensions/codex/src/app-server/thread-lifecycle.ts index a60fdf89299..31405fcb840 100644 --- a/extensions/codex/src/app-server/thread-lifecycle.ts +++ b/extensions/codex/src/app-server/thread-lifecycle.ts @@ -4,7 +4,7 @@ import { } from "openclaw/plugin-sdk/agent-harness-runtime"; import { renderCodexPromptOverlay } from "../../prompt-overlay.js"; import { isModernCodexModel } from "../../provider.js"; -import type { CodexAppServerClient } from "./client.js"; +import { isCodexAppServerConnectionClosedError, type CodexAppServerClient } from "./client.js"; import { codexSandboxPolicyForTurn, type CodexAppServerRuntimeOptions } from "./config.js"; import { assertCodexThreadResumeResponse, @@ -86,6 +86,9 @@ export async function startOrResumeThread(params: { dynamicToolsFingerprint, }; } catch (error) { + if (isCodexAppServerConnectionClosedError(error)) { + throw error; + } embeddedAgentLog.warn("codex app-server thread resume failed; starting a new thread", { error, });