diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d318c548a3..752593556a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,6 +40,7 @@ Docs: https://docs.openclaw.ai - Memory Wiki: accept relative Markdown links that include the `.md` suffix during broken-wikilink validation, avoiding false positives for native render-mode links. Thanks @Kenneth8128. - OpenAI Codex: show the device-pairing code in the interactive SSH/headless prompt while keeping the short-lived code out of persistent runtime logs. Fixes #74212. Thanks @da22le123. - QA Lab: stop gateway children when the suite parent disappears, so interrupted local QA runs cannot leave hot orphaned gateways behind. +- Codex/app-server: tolerate a second connection close during startup recovery and include retry counts plus stringified errors in the restart warning, so concurrent lanes do not fail after one shared-client race. - Plugins/CLI: cache plugin CLI registration entries per command program so completion state generation does not repeat the full plugin sweep in one invocation. Thanks @ScientificProgrammer. - Plugins: reuse gateway-bindable plugin loader cache entries for later default-mode loads without serving default-built registries to gateway-bound requests, reducing repeated plugin registration during dispatch. Refs #61756. Thanks @DmitryPogodaev. - Gateway/secrets: include the caught error message in `secrets.reload` and `secrets.resolve` warning logs while keeping RPC errors generic, so operators can diagnose reload and permission failures. Thanks @davidangularme. 
diff --git a/extensions/codex/src/app-server/run-attempt.test.ts b/extensions/codex/src/app-server/run-attempt.test.ts index cf2ca98521d..6e49a7e558d 100644 --- a/extensions/codex/src/app-server/run-attempt.test.ts +++ b/extensions/codex/src/app-server/run-attempt.test.ts @@ -2017,6 +2017,58 @@ describe("runCodexAppServerAttempt", () => { expect(requests).toEqual([["thread/resume"], ["thread/resume", "turn/start"]]); }); + it("tolerates a second app-server close while retrying startup", async () => { + const sessionFile = path.join(tempDir, "session.jsonl"); + const workspaceDir = path.join(tempDir, "workspace"); + await writeExistingBinding(sessionFile, workspaceDir, { dynamicToolsFingerprint: "[]" }); + const requests: string[][] = []; + let starts = 0; + let notify: (notification: CodexServerNotification) => Promise<void> = async () => undefined; + __testing.setCodexAppServerClientFactoryForTests(async () => { + const startIndex = starts++; + const methods: string[] = []; + requests.push(methods); + return { + request: vi.fn(async (method: string) => { + methods.push(method); + if (method === "thread/resume" && startIndex < 2) { + throw new Error("codex app-server client is closed"); + } + if (method === "thread/resume") { + return threadStartResult("thread-existing"); + } + if (method === "turn/start") { + return turnStartResult(); + } + return {}; + }), + addNotificationHandler: (handler: typeof notify) => { + notify = handler; + return () => undefined; + }, + addRequestHandler: () => () => undefined, + } as never; + }); + + const run = runCodexAppServerAttempt(createParams(sessionFile, workspaceDir)); + await vi.waitFor(() => expect(requests[2]).toContain("turn/start"), { interval: 1 }); + await notify({ + method: "turn/completed", + params: { + threadId: "thread-existing", + turnId: "turn-1", + turn: { id: "turn-1", status: "completed" }, + }, + }); + + await expect(run).resolves.toMatchObject({ aborted: false }); + expect(requests).toEqual([ + ["thread/resume"], 
+ ["thread/resume"], + ["thread/resume", "turn/start"], + ]); + }); + it("passes native hook relay config on thread start and resume", async () => { const sessionFile = path.join(tempDir, "session.jsonl"); const workspaceDir = path.join(tempDir, "workspace"); diff --git a/extensions/codex/src/app-server/run-attempt.ts b/extensions/codex/src/app-server/run-attempt.ts index f4856592810..bfd765f6190 100644 --- a/extensions/codex/src/app-server/run-attempt.ts +++ b/extensions/codex/src/app-server/run-attempt.ts @@ -95,6 +95,7 @@ import { createCodexUserInputBridge } from "./user-input-bridge.js"; import { filterToolsForVisionInputs } from "./vision-tools.js"; const CODEX_DYNAMIC_TOOL_TIMEOUT_MS = 30_000; +const CODEX_APP_SERVER_STARTUP_CONNECTION_CLOSE_MAX_ATTEMPTS = 3; const CODEX_TURN_COMPLETION_IDLE_TIMEOUT_MS = 60_000; const CODEX_TURN_TERMINAL_IDLE_TIMEOUT_MS = 30 * 60_000; const CODEX_STEER_ALL_DEBOUNCE_MS = 500; @@ -543,24 +544,51 @@ export async function runCodexAppServerAttempt( }); return { client: startupClient, thread: startupThread }; }; - try { - return await startupAttempt(); - } catch (error) { - if (runAbortController.signal.aborted || !isCodexAppServerConnectionClosedError(error)) { - throw error; + for ( + let attempt = 1; + attempt <= CODEX_APP_SERVER_STARTUP_CONNECTION_CLOSE_MAX_ATTEMPTS; + attempt += 1 + ) { + try { + return await startupAttempt(); + } catch (error) { + if ( + runAbortController.signal.aborted || + !isCodexAppServerConnectionClosedError(error) + ) { + throw error; + } + const failedClient = attemptedClient; + const clearedSharedClient = clearSharedCodexAppServerClientIfCurrent(failedClient); + if (startupClientForCleanup === failedClient) { + startupClientForCleanup = undefined; + } + attemptedClient = undefined; + if (attempt >= CODEX_APP_SERVER_STARTUP_CONNECTION_CLOSE_MAX_ATTEMPTS) { + embeddedAgentLog.warn( + "codex app-server connection closed during startup; retries exhausted", + { + attempt, + maxAttempts: 
CODEX_APP_SERVER_STARTUP_CONNECTION_CLOSE_MAX_ATTEMPTS, + clearedSharedClient, + error: formatErrorMessage(error), + }, + ); + throw error; + } + embeddedAgentLog.warn( + "codex app-server connection closed during startup; restarting app-server and retrying", + { + attempt, + nextAttempt: attempt + 1, + maxAttempts: CODEX_APP_SERVER_STARTUP_CONNECTION_CLOSE_MAX_ATTEMPTS, + clearedSharedClient, + error: formatErrorMessage(error), + }, + ); } - embeddedAgentLog.warn( - "codex app-server connection closed during startup; restarting app-server and retrying", - { error }, - ); - const failedClient = attemptedClient; - clearSharedCodexAppServerClientIfCurrent(failedClient); - if (startupClientForCleanup === failedClient) { - startupClientForCleanup = undefined; - } - attemptedClient = undefined; - return await startupAttempt(); } + throw new Error("codex app-server startup retry loop exited unexpectedly"); }, })); startupClientForCleanup = undefined;