fix: retry codex app-server startup closes

This commit is contained in:
Peter Steinberger
2026-05-02 11:32:05 +01:00
parent 8631cadf5b
commit e5dc3f712e
3 changed files with 97 additions and 16 deletions

View File

@@ -40,6 +40,7 @@ Docs: https://docs.openclaw.ai
- Memory Wiki: accept relative Markdown links that include the `.md` suffix during broken-wikilink validation, avoiding false positives for native render-mode links. Thanks @Kenneth8128.
- OpenAI Codex: show the device-pairing code in the interactive SSH/headless prompt while keeping the short-lived code out of persistent runtime logs. Fixes #74212. Thanks @da22le123.
- QA Lab: stop gateway children when the suite parent disappears, so interrupted local QA runs cannot leave hot orphaned gateways behind.
- Codex/app-server: tolerate a second connection close during startup recovery and include retry counts plus stringified errors in the restart warning, so concurrent lanes do not fail after one shared-client race.
- Plugins/CLI: cache plugin CLI registration entries per command program so completion state generation does not repeat the full plugin sweep in one invocation. Thanks @ScientificProgrammer.
- Plugins: reuse gateway-bindable plugin loader cache entries for later default-mode loads without serving default-built registries to gateway-bound requests, reducing repeated plugin registration during dispatch. Refs #61756. Thanks @DmitryPogodaev.
- Gateway/secrets: include the caught error message in `secrets.reload` and `secrets.resolve` warning logs while keeping RPC errors generic, so operators can diagnose reload and permission failures. Thanks @davidangularme.

View File

@@ -2017,6 +2017,58 @@ describe("runCodexAppServerAttempt", () => {
expect(requests).toEqual([["thread/resume"], ["thread/resume", "turn/start"]]);
});
it("tolerates a second app-server close while retrying startup", async () => {
  const sessionFile = path.join(tempDir, "session.jsonl");
  const workspaceDir = path.join(tempDir, "workspace");
  // Write an existing binding for the session; the fake clients below are
  // then expected to receive `thread/resume` as their first request.
  await writeExistingBinding(sessionFile, workspaceDir, { dynamicToolsFingerprint: "[]" });

  // One inner array per client the factory hands out, holding the request
  // methods that client received, in order.
  const recordedCalls: string[][] = [];
  let clientsCreated = 0;
  // Replaced by whichever handler the code under test registers last.
  let emitNotification: (notification: CodexServerNotification) => Promise<void> = async () =>
    undefined;

  __testing.setCodexAppServerClientFactoryForTests(async () => {
    const clientIndex = clientsCreated;
    clientsCreated += 1;
    const seenMethods: string[] = [];
    recordedCalls.push(seenMethods);

    const fakeClient = {
      request: vi.fn(async (method: string) => {
        seenMethods.push(method);
        switch (method) {
          case "thread/resume":
            // The first two clients report a closed connection on resume;
            // only the third client lets startup succeed.
            if (clientIndex < 2) {
              throw new Error("codex app-server client is closed");
            }
            return threadStartResult("thread-existing");
          case "turn/start":
            return turnStartResult();
          default:
            return {};
        }
      }),
      addNotificationHandler: (handler: typeof emitNotification) => {
        emitNotification = handler;
        return () => undefined;
      },
      addRequestHandler: () => () => undefined,
    };
    return fakeClient as never;
  });

  const pendingRun = runCodexAppServerAttempt(createParams(sessionFile, workspaceDir));

  // Poll until the third client has been asked to start the turn.
  await vi.waitFor(
    () => {
      expect(recordedCalls[2]).toContain("turn/start");
    },
    { interval: 1 },
  );

  // Deliver the completion notification so the pending run can settle.
  await emitNotification({
    method: "turn/completed",
    params: {
      threadId: "thread-existing",
      turnId: "turn-1",
      turn: { id: "turn-1", status: "completed" },
    },
  });

  await expect(pendingRun).resolves.toMatchObject({ aborted: false });
  // Two failed resumes on separate clients, then a successful resume + turn.
  expect(recordedCalls).toEqual([
    ["thread/resume"],
    ["thread/resume"],
    ["thread/resume", "turn/start"],
  ]);
});
it("passes native hook relay config on thread start and resume", async () => {
const sessionFile = path.join(tempDir, "session.jsonl");
const workspaceDir = path.join(tempDir, "workspace");

View File

@@ -95,6 +95,7 @@ import { createCodexUserInputBridge } from "./user-input-bridge.js";
import { filterToolsForVisionInputs } from "./vision-tools.js";
// Timeout in milliseconds (30 s); presumably bounds a single dynamic tool call — confirm at the use site.
const CODEX_DYNAMIC_TOOL_TIMEOUT_MS = 30_000;
// Upper bound on startup attempts (the first try included) when startup fails
// because the app-server connection closed; once reached, the error is rethrown.
const CODEX_APP_SERVER_STARTUP_CONNECTION_CLOSE_MAX_ATTEMPTS = 3;
// Idle timeout in milliseconds (60 s); name suggests it applies while awaiting turn completion — confirm at the use site.
const CODEX_TURN_COMPLETION_IDLE_TIMEOUT_MS = 60_000;
// Idle timeout in milliseconds (30 min); name suggests it is the terminal idle limit for a turn — confirm at the use site.
const CODEX_TURN_TERMINAL_IDLE_TIMEOUT_MS = 30 * 60_000;
// Debounce interval in milliseconds (500 ms); presumably used for "steer all" requests — confirm at the use site.
const CODEX_STEER_ALL_DEBOUNCE_MS = 500;
@@ -543,24 +544,51 @@ export async function runCodexAppServerAttempt(
});
return { client: startupClient, thread: startupThread };
};
try {
return await startupAttempt();
} catch (error) {
if (runAbortController.signal.aborted || !isCodexAppServerConnectionClosedError(error)) {
throw error;
for (
let attempt = 1;
attempt <= CODEX_APP_SERVER_STARTUP_CONNECTION_CLOSE_MAX_ATTEMPTS;
attempt += 1
) {
try {
return await startupAttempt();
} catch (error) {
if (
runAbortController.signal.aborted ||
!isCodexAppServerConnectionClosedError(error)
) {
throw error;
}
const failedClient = attemptedClient;
const clearedSharedClient = clearSharedCodexAppServerClientIfCurrent(failedClient);
if (startupClientForCleanup === failedClient) {
startupClientForCleanup = undefined;
}
attemptedClient = undefined;
if (attempt >= CODEX_APP_SERVER_STARTUP_CONNECTION_CLOSE_MAX_ATTEMPTS) {
embeddedAgentLog.warn(
"codex app-server connection closed during startup; retries exhausted",
{
attempt,
maxAttempts: CODEX_APP_SERVER_STARTUP_CONNECTION_CLOSE_MAX_ATTEMPTS,
clearedSharedClient,
error: formatErrorMessage(error),
},
);
throw error;
}
embeddedAgentLog.warn(
"codex app-server connection closed during startup; restarting app-server and retrying",
{
attempt,
nextAttempt: attempt + 1,
maxAttempts: CODEX_APP_SERVER_STARTUP_CONNECTION_CLOSE_MAX_ATTEMPTS,
clearedSharedClient,
error: formatErrorMessage(error),
},
);
}
embeddedAgentLog.warn(
"codex app-server connection closed during startup; restarting app-server and retrying",
{ error },
);
const failedClient = attemptedClient;
clearSharedCodexAppServerClientIfCurrent(failedClient);
if (startupClientForCleanup === failedClient) {
startupClientForCleanup = undefined;
}
attemptedClient = undefined;
return await startupAttempt();
}
throw new Error("codex app-server startup retry loop exited unexpectedly");
},
}));
startupClientForCleanup = undefined;