diff --git a/CHANGELOG.md b/CHANGELOG.md index 84b1c55a5c2..ff6e0413cea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -62,6 +62,7 @@ Docs: https://docs.openclaw.ai - Channels/Telegram: publish webhook runtime state and warn when `setWebhook` has not completed after startup grace, so webhook-mode accounts no longer look healthy while registration is still failing or retrying. Refs #74299. Thanks @lolaopenclaw and @martingarramon. - Channels/Telegram: bound native command menu `deleteMyCommands` and `setMyCommands` Bot API calls and allow the same timeout-triggered transport fallback retry as other startup control calls, so Windows/WSL network stalls cannot leave command sync hanging behind an otherwise running provider. Refs #74086. Thanks @SymbolStar. - ACP/commands: accept forwarded ACP timeout config controls in the OpenClaw bridge, treat unsupported discard-close controls as recoverable cleanup, and restore native `/verbose full` plus no-arg status behavior, so Discord command menus and nested ACP turns no longer fail on supported session controls. Thanks @vincentkoc. +- Codex harness: interrupt and release native app-server turns that go quiet after an OpenClaw dynamic-tool response without sending `turn/completed`, so Discord and other chat lanes do not stay stuck in `processing`. Thanks @vincentkoc. - Codex harness: bound OpenClaw dynamic tool responses to 30 seconds and fail closed with an explicit tool result when the app-server bridge would otherwise strand the turn in `processing`. Thanks @vincentkoc. - TUI/status: clear stale `streaming` footer state when a final event arrives after the active run was already cleared and no tracked runs remain, while preserving concurrent-run ownership and inactive local `/btw` terminal handling. Fixes #64825; carries forward #64842, #64843, #64847, and #64862. Thanks @briandevans and @Yanhu007. 
- Channels/Discord: fail startup closed when Discord cannot resolve the bot's own identity and keep mention gating active when only configured mention patterns can detect mentions, so the provider no longer continues with a missing bot id. Fixes #42219; carries forward #46856 and #49218. Thanks @education-01 and @BenediktSchackenberg. diff --git a/docs/plugins/codex-harness.md b/docs/plugins/codex-harness.md index 5c77cca4dbe..dd9a835bfe7 100644 --- a/docs/plugins/codex-harness.md +++ b/docs/plugins/codex-harness.md @@ -575,6 +575,13 @@ an OpenClaw response within 30 seconds. On timeout, OpenClaw aborts the tool signal where supported and returns a failed dynamic-tool response to Codex so the turn can continue instead of leaving the session in `processing`. +After OpenClaw responds to a Codex turn-scoped app-server request, the harness +also expects Codex to finish the native turn with `turn/completed`. If the +app-server goes quiet for 60 seconds after that response, OpenClaw best-effort +interrupts the Codex turn, records a diagnostic timeout, and releases the +OpenClaw session lane so follow-up chat messages are not queued behind a stale +native turn. 
+ Environment overrides remain available for local testing: - `OPENCLAW_CODEX_APP_SERVER_BIN` diff --git a/extensions/codex/src/app-server/run-attempt.test.ts b/extensions/codex/src/app-server/run-attempt.test.ts index 645900320e9..ee6b880a8f4 100644 --- a/extensions/codex/src/app-server/run-attempt.test.ts +++ b/extensions/codex/src/app-server/run-attempt.test.ts @@ -370,6 +370,79 @@ describe("runCodexAppServerAttempt", () => { expect(onTimeout).toHaveBeenCalledTimes(1); }); + it("releases the session when Codex never completes after a dynamic tool response", async () => { + let handleRequest: + | ((request: { id: string; method: string; params?: unknown }) => Promise) + | undefined; + const request = vi.fn(async (method: string) => { + if (method === "thread/start") { + return threadStartResult("thread-1"); + } + if (method === "turn/start") { + return turnStartResult("turn-1", "inProgress"); + } + return {}; + }); + __testing.setCodexAppServerClientFactoryForTests( + async () => + ({ + request, + addNotificationHandler: () => () => undefined, + addRequestHandler: ( + handler: (request: { + id: string; + method: string; + params?: unknown; + }) => Promise, + ) => { + handleRequest = handler; + return () => undefined; + }, + }) as never, + ); + const params = createParams( + path.join(tempDir, "session.jsonl"), + path.join(tempDir, "workspace"), + ); + params.timeoutMs = 60_000; + + const run = runCodexAppServerAttempt(params, { turnCompletionIdleTimeoutMs: 5 }); + await vi.waitFor(() => expect(handleRequest).toBeTypeOf("function"), { interval: 1 }); + + await expect( + handleRequest?.({ + id: "request-tool-1", + method: "item/tool/call", + params: { + threadId: "thread-1", + turnId: "turn-1", + callId: "call-1", + namespace: null, + tool: "message", + arguments: { action: "send", text: "already sent" }, + }, + }), + ).resolves.toMatchObject({ + success: false, + contentItems: [{ type: "inputText", text: "Unknown OpenClaw tool: message" }], + }); + + await 
expect(run).resolves.toMatchObject({ + aborted: true, + timedOut: true, + promptError: "codex app-server turn idle timed out waiting for turn/completed", + }); + await vi.waitFor( + () => + expect(request).toHaveBeenCalledWith("turn/interrupt", { + threadId: "thread-1", + turnId: "turn-1", + }), + { interval: 1 }, + ); + expect(queueAgentHarnessMessage("session-1", "after timeout")).toBe(false); + }); + it("applies before_prompt_build to Codex developer instructions and turn input", async () => { const beforePromptBuild = vi.fn(async () => ({ systemPrompt: "custom codex system", diff --git a/extensions/codex/src/app-server/run-attempt.ts b/extensions/codex/src/app-server/run-attempt.ts index 5a13d9b7d61..536878381e7 100644 --- a/extensions/codex/src/app-server/run-attempt.ts +++ b/extensions/codex/src/app-server/run-attempt.ts @@ -83,6 +83,7 @@ import { createCodexUserInputBridge } from "./user-input-bridge.js"; import { filterToolsForVisionInputs } from "./vision-tools.js"; const CODEX_DYNAMIC_TOOL_TIMEOUT_MS = 30_000; +const CODEX_TURN_COMPLETION_IDLE_TIMEOUT_MS = 60_000; type OpenClawCodingToolsOptions = NonNullable< Parameters<(typeof import("openclaw/plugin-sdk/agent-harness"))["createOpenClawCodingTools"]>[0] @@ -132,6 +133,7 @@ export async function runCodexAppServerAttempt( gatewayTimeoutMs?: number; hookTimeoutSec?: number; }; + turnCompletionIdleTimeoutMs?: number; } = {}, ): Promise { const attemptStartedAt = Date.now(); @@ -364,6 +366,8 @@ export async function runCodexAppServerAttempt( let userInputBridge: ReturnType | undefined; let completed = false; let timedOut = false; + let turnCompletionIdleTimedOut = false; + let turnCompletionIdleTimeoutMessage: string | undefined; let lifecycleStarted = false; let lifecycleTerminalEmitted = false; let resolveCompletion: (() => void) | undefined; @@ -371,6 +375,82 @@ export async function runCodexAppServerAttempt( resolveCompletion = resolve; }); let notificationQueue: Promise = Promise.resolve(); + const 
turnCompletionIdleTimeoutMs = resolveCodexTurnCompletionIdleTimeoutMs( + options.turnCompletionIdleTimeoutMs, + ); + let turnCompletionIdleTimer: ReturnType | undefined; + let turnCompletionIdleWatchArmed = false; + let turnCompletionLastActivityAt = Date.now(); + let turnCompletionLastActivityReason = "startup"; + let activeAppServerTurnRequests = 0; + + const clearTurnCompletionIdleTimer = () => { + if (turnCompletionIdleTimer) { + clearTimeout(turnCompletionIdleTimer); + turnCompletionIdleTimer = undefined; + } + }; + + const fireTurnCompletionIdleTimeout = () => { + if ( + completed || + runAbortController.signal.aborted || + !turnCompletionIdleWatchArmed || + activeAppServerTurnRequests > 0 + ) { + return; + } + const idleMs = Math.max(0, Date.now() - turnCompletionLastActivityAt); + if (idleMs < turnCompletionIdleTimeoutMs) { + scheduleTurnCompletionIdleWatch(); + return; + } + timedOut = true; + turnCompletionIdleTimedOut = true; + turnCompletionIdleTimeoutMessage = + "codex app-server turn idle timed out waiting for turn/completed"; + projector?.markTimedOut(); + trajectoryRecorder?.recordEvent("turn.completion_idle_timeout", { + threadId: thread.threadId, + turnId, + idleMs, + timeoutMs: turnCompletionIdleTimeoutMs, + lastActivityReason: turnCompletionLastActivityReason, + }); + embeddedAgentLog.warn("codex app-server turn idle timed out waiting for completion", { + threadId: thread.threadId, + turnId, + idleMs, + timeoutMs: turnCompletionIdleTimeoutMs, + lastActivityReason: turnCompletionLastActivityReason, + }); + runAbortController.abort("turn_completion_idle_timeout"); + }; + + function scheduleTurnCompletionIdleWatch() { + clearTurnCompletionIdleTimer(); + if ( + completed || + runAbortController.signal.aborted || + !turnCompletionIdleWatchArmed || + activeAppServerTurnRequests > 0 + ) { + return; + } + const elapsedMs = Math.max(0, Date.now() - turnCompletionLastActivityAt); + const delayMs = Math.max(1, turnCompletionIdleTimeoutMs - elapsedMs); + 
turnCompletionIdleTimer = setTimeout(fireTurnCompletionIdleTimeout, delayMs); + turnCompletionIdleTimer.unref?.(); + } + + const touchTurnCompletionActivity = (reason: string, options?: { arm?: boolean }) => { + turnCompletionLastActivityAt = Date.now(); + turnCompletionLastActivityReason = reason; + if (options?.arm) { + turnCompletionIdleWatchArmed = true; + } + scheduleTurnCompletionIdleWatch(); + }; const emitLifecycleStart = () => { emitCodexAppServerEvent(params, { @@ -396,6 +476,7 @@ export async function runCodexAppServerAttempt( }; const handleNotification = async (notification: CodexServerNotification) => { + touchTurnCompletionActivity(`notification:${notification.method}`); userInputBridge?.handleNotification(notification); if (!projector || !turnId) { pendingNotifications.push(notification); @@ -417,6 +498,7 @@ export async function runCodexAppServerAttempt( } finally { if (isTurnCompletion) { completed = true; + clearTurnCompletionIdleTimer(); resolveCompletion?.(); } } @@ -431,78 +513,93 @@ export async function runCodexAppServerAttempt( const notificationCleanup = client.addNotificationHandler(enqueueNotification); const requestCleanup = client.addRequestHandler(async (request) => { - if (request.method === "account/chatgptAuthTokens/refresh") { - return refreshCodexAppServerAuthTokens({ - agentDir, - authProfileId: startupAuthProfileId, - }); - } - if (!turnId) { - return undefined; - } - if (request.method === "mcpServer/elicitation/request") { - return handleCodexAppServerElicitationRequest({ - requestParams: request.params, - paramsForRun: params, - threadId: thread.threadId, - turnId, - signal: runAbortController.signal, - }); - } - if (request.method === "item/tool/requestUserInput") { - return userInputBridge?.handleRequest({ - id: request.id, - params: request.params, - }); - } - if (request.method !== "item/tool/call") { - if (isCodexAppServerApprovalRequest(request.method)) { - return handleApprovalRequest({ - method: request.method, - 
params: request.params, + activeAppServerTurnRequests += 1; + clearTurnCompletionIdleTimer(); + touchTurnCompletionActivity(`request:${request.method}`); + let armCompletionWatchOnResponse = false; + try { /* every branch awaits its handler so the finally block runs only after the response has settled, not while an approval/elicitation/user-input answer is still pending */ + if (request.method === "account/chatgptAuthTokens/refresh") { + return await refreshCodexAppServerAuthTokens({ + agentDir, + authProfileId: startupAuthProfileId, + }); + } + if (!turnId) { + return undefined; + } + if (request.method === "mcpServer/elicitation/request") { + armCompletionWatchOnResponse = true; + return await handleCodexAppServerElicitationRequest({ + requestParams: request.params, paramsForRun: params, threadId: thread.threadId, turnId, signal: runAbortController.signal, }); } - return undefined; - } - const call = readDynamicToolCallParams(request.params); - if (!call || call.threadId !== thread.threadId || call.turnId !== turnId) { - return undefined; - } - trajectoryRecorder?.recordEvent("tool.call", { - threadId: call.threadId, - turnId: call.turnId, - toolCallId: call.callId, - name: call.tool, - arguments: call.arguments, - }); - const response = await handleDynamicToolCallWithTimeout({ - call, - toolBridge, - signal: runAbortController.signal, - timeoutMs: CODEX_DYNAMIC_TOOL_TIMEOUT_MS, - onTimeout: () => { - trajectoryRecorder?.recordEvent("tool.timeout", { - threadId: call.threadId, - turnId: call.turnId, - toolCallId: call.callId, - name: call.tool, - timeoutMs: CODEX_DYNAMIC_TOOL_TIMEOUT_MS, + if (request.method === "item/tool/requestUserInput") { + armCompletionWatchOnResponse = true; + return await userInputBridge?.handleRequest({ + id: request.id, + params: request.params, }); - }, - }); - trajectoryRecorder?.recordEvent("tool.result", { - threadId: call.threadId, - turnId: call.turnId, - toolCallId: call.callId, - name: call.tool, - success: response.success, - contentItems: response.contentItems, - }); - return response as JsonValue; + } + if (request.method !== "item/tool/call") { + if (isCodexAppServerApprovalRequest(request.method)) { + 
armCompletionWatchOnResponse = true; + return await handleApprovalRequest({ + method: request.method, + params: request.params, + paramsForRun: params, + threadId: thread.threadId, + turnId, + signal: runAbortController.signal, + }); + } + return undefined; + } + const call = readDynamicToolCallParams(request.params); + if (!call || call.threadId !== thread.threadId || call.turnId !== turnId) { + return undefined; + } + armCompletionWatchOnResponse = true; + trajectoryRecorder?.recordEvent("tool.call", { + threadId: call.threadId, + turnId: call.turnId, + toolCallId: call.callId, + name: call.tool, + arguments: call.arguments, + }); + const response = await handleDynamicToolCallWithTimeout({ + call, + toolBridge, + signal: runAbortController.signal, + timeoutMs: CODEX_DYNAMIC_TOOL_TIMEOUT_MS, + onTimeout: () => { + trajectoryRecorder?.recordEvent("tool.timeout", { + threadId: call.threadId, + turnId: call.turnId, + toolCallId: call.callId, + name: call.tool, + timeoutMs: CODEX_DYNAMIC_TOOL_TIMEOUT_MS, + }); + }, + }); + trajectoryRecorder?.recordEvent("tool.result", { + threadId: call.threadId, + turnId: call.turnId, + toolCallId: call.callId, + name: call.tool, + success: response.success, + contentItems: response.contentItems, + }); + return response as JsonValue; + } finally { + activeAppServerTurnRequests = Math.max(0, activeAppServerTurnRequests - 1); + touchTurnCompletionActivity(`request:${request.method}:response`, { + arm: armCompletionWatchOnResponse, + }); + } }); const llmInputEvent = { @@ -638,6 +735,7 @@ export async function runCodexAppServerAttempt( abort: () => runAbortController.abort("aborted"), }; setActiveEmbeddedRun(params.sessionId, handle, params.sessionKey); + touchTurnCompletionActivity("turn:start"); const timeout = setTimeout( () => { @@ -664,7 +762,11 @@ export async function runCodexAppServerAttempt( await completion; const result = activeProjector.buildResult(toolBridge.telemetry, { yieldDetected }); const finalAborted = result.aborted || 
runAbortController.signal.aborted; - const finalPromptError = timedOut ? "codex app-server attempt timed out" : result.promptError; + const finalPromptError = turnCompletionIdleTimedOut + ? turnCompletionIdleTimeoutMessage + : timedOut + ? "codex app-server attempt timed out" + : result.promptError; const finalPromptErrorSource = timedOut ? "prompt" : result.promptErrorSource; recordCodexTrajectoryCompletion(trajectoryRecorder, { attempt: params, @@ -787,6 +889,7 @@ export async function runCodexAppServerAttempt( await trajectoryRecorder?.flush(); userInputBridge?.cancelPending(); clearTimeout(timeout); + clearTurnCompletionIdleTimer(); notificationCleanup(); requestCleanup(); nativeHookRelay?.unregister(); @@ -1055,6 +1158,16 @@ async function withCodexStartupTimeout(params: { } } +function resolveCodexTurnCompletionIdleTimeoutMs(value: number | undefined): number { + if (value === undefined) { + return CODEX_TURN_COMPLETION_IDLE_TIMEOUT_MS; + } + if (!Number.isFinite(value)) { + return CODEX_TURN_COMPLETION_IDLE_TIMEOUT_MS; + } + return Math.max(1, Math.floor(value)); +} + function readDynamicToolCallParams( value: JsonValue | undefined, ): CodexDynamicToolCallParams | undefined { @@ -1166,6 +1279,7 @@ function handleApprovalRequest(params: { export const __testing = { CODEX_DYNAMIC_TOOL_TIMEOUT_MS, + CODEX_TURN_COMPLETION_IDLE_TIMEOUT_MS, filterToolsForVisionInputs, handleDynamicToolCallWithTimeout, ...createCodexAppServerClientFactoryTestHooks((factory) => {