mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-28 06:59:56 +00:00
fix(codex): keep turn timeouts inside Codex (#86476)
Keep Codex app-server turn timeouts within the Codex runtime boundary so they interrupt the active turn without retiring the shared app-server client, poisoning auth-profile cooldowns, or falling through to generic provider/model fallback. Preserve concrete non-timeout provider failures for auth-profile rotation and fallback, and add regression coverage for prompt-stage timeouts, assistant idle timeouts, auth-profile cooldowns, and app-server timeout handling. Thanks @pashpashpash.
This commit is contained in:
@@ -4160,7 +4160,7 @@ describe("runCodexAppServerAttempt", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("closes the app-server client when the active turn goes idle past the attempt timeout", async () => {
|
||||
it("interrupts but keeps the app-server client alive when a turn timeout fires", async () => {
|
||||
const close = vi.fn();
|
||||
const request = vi.fn(async (method: string) => {
|
||||
if (method === "thread/start") {
|
||||
@@ -4204,7 +4204,7 @@ describe("runCodexAppServerAttempt", () => {
|
||||
},
|
||||
{ timeoutMs: 5_000 },
|
||||
);
|
||||
expect(close).toHaveBeenCalledTimes(1);
|
||||
expect(close).not.toHaveBeenCalled();
|
||||
expect(queueActiveRunMessageForTest("session-1", "after timeout")).toBe(false);
|
||||
});
|
||||
|
||||
@@ -5519,6 +5519,7 @@ describe("runCodexAppServerAttempt", () => {
|
||||
});
|
||||
|
||||
it("does not treat global rate-limit notifications as turn progress", async () => {
|
||||
const warn = vi.spyOn(embeddedAgentLog, "warn").mockImplementation(() => undefined);
|
||||
const harness = createStartedThreadHarness();
|
||||
const params = createParams(
|
||||
path.join(tempDir, "session.jsonl"),
|
||||
@@ -5561,6 +5562,10 @@ describe("runCodexAppServerAttempt", () => {
|
||||
),
|
||||
{ interval: 1 },
|
||||
);
|
||||
expect(warn).not.toHaveBeenCalledWith(
|
||||
"codex app-server client retired after timed-out turn",
|
||||
expect.anything(),
|
||||
);
|
||||
});
|
||||
|
||||
it("yields a macrotask before processing queued app-server notifications", async () => {
|
||||
|
||||
@@ -3229,19 +3229,11 @@ export async function runCodexAppServerAttempt(
|
||||
touchTurnCompletionActivity("turn:start", { attemptProgress: true });
|
||||
|
||||
const abortListener = () => {
|
||||
const shouldRetireClient = timedOut;
|
||||
interruptCodexTurnBestEffort(client, {
|
||||
threadId: thread.threadId,
|
||||
turnId: activeTurnId,
|
||||
timeoutMs: shouldRetireClient ? CODEX_APP_SERVER_INTERRUPT_TIMEOUT_MS : undefined,
|
||||
timeoutMs: timedOut ? CODEX_APP_SERVER_INTERRUPT_TIMEOUT_MS : undefined,
|
||||
});
|
||||
if (shouldRetireClient) {
|
||||
retireCodexAppServerClientAfterTimedOutTurn(client, {
|
||||
threadId: thread.threadId,
|
||||
turnId: activeTurnId,
|
||||
reason: String(runAbortController.signal.reason ?? "timeout"),
|
||||
});
|
||||
}
|
||||
resolveCompletion?.();
|
||||
};
|
||||
runAbortController.signal.addEventListener("abort", abortListener, { once: true });
|
||||
@@ -4057,29 +4049,6 @@ async function unsubscribeCodexThreadBestEffort(
|
||||
}
|
||||
}
|
||||
|
||||
function retireCodexAppServerClientAfterTimedOutTurn(
|
||||
client: CodexAppServerClient,
|
||||
params: {
|
||||
threadId: string;
|
||||
turnId: string;
|
||||
reason: string;
|
||||
},
|
||||
): void {
|
||||
const clearedSharedClient = clearSharedCodexAppServerClientIfCurrent(client);
|
||||
if (!clearedSharedClient) {
|
||||
const close = (client as { close?: () => void }).close;
|
||||
if (typeof close === "function") {
|
||||
close.call(client);
|
||||
}
|
||||
}
|
||||
embeddedAgentLog.warn("codex app-server client retired after timed-out turn", {
|
||||
threadId: params.threadId,
|
||||
turnId: params.turnId,
|
||||
reason: params.reason,
|
||||
clearedSharedClient,
|
||||
});
|
||||
}
|
||||
|
||||
type DynamicToolBuildParams = {
|
||||
params: EmbeddedRunAttemptParams;
|
||||
resolvedWorkspace: string;
|
||||
|
||||
@@ -5,6 +5,7 @@ import {
|
||||
loadRunOverflowCompactionHarness,
|
||||
MockedFailoverError,
|
||||
mockedClassifyFailoverReason,
|
||||
mockedMarkAuthProfileFailure,
|
||||
mockedRunEmbeddedAttempt,
|
||||
overflowBaseRunParams,
|
||||
resetRunOverflowCompactionHarnessMocks,
|
||||
@@ -203,6 +204,7 @@ describe("runEmbeddedPiAgent Codex app-server recovery", () => {
|
||||
"codex app-server turn idle timed out waiting for turn/completed",
|
||||
);
|
||||
expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(1);
|
||||
expect(mockedMarkAuthProfileFailure).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("does not retry after visible assistant output", async () => {
|
||||
|
||||
@@ -221,6 +221,7 @@ export const mockedGetApiKeyForModel = vi.fn(
|
||||
mode: "api-key" as const,
|
||||
}),
|
||||
);
|
||||
export const mockedMarkAuthProfileFailure = vi.fn(async () => {});
|
||||
export const mockedEnsureAuthProfileStore = vi.fn(() => ({}));
|
||||
export const mockedEnsureAuthProfileStoreWithoutExternalProfiles = vi.fn(
|
||||
(_agentDir?: string, _options?: { allowKeychainPrompt?: boolean }) => ({}),
|
||||
@@ -408,6 +409,8 @@ export function resetRunOverflowCompactionHarnessMocks(): void {
|
||||
mode: "api-key",
|
||||
}),
|
||||
);
|
||||
mockedMarkAuthProfileFailure.mockReset();
|
||||
mockedMarkAuthProfileFailure.mockResolvedValue(undefined);
|
||||
mockedEnsureAuthProfileStore.mockReset();
|
||||
mockedEnsureAuthProfileStore.mockReturnValue({});
|
||||
mockedEnsureAuthProfileStoreWithoutExternalProfiles.mockReset();
|
||||
@@ -463,7 +466,7 @@ export async function loadRunOverflowCompactionHarness(): Promise<{
|
||||
|
||||
vi.doMock("../auth-profiles.js", () => ({
|
||||
isProfileInCooldown: vi.fn(() => false),
|
||||
markAuthProfileFailure: vi.fn(async () => {}),
|
||||
markAuthProfileFailure: mockedMarkAuthProfileFailure,
|
||||
markAuthProfileSuccess: mockedMarkAuthProfileSuccess,
|
||||
resolveProfilesUnavailableReason: vi.fn(() => undefined),
|
||||
}));
|
||||
|
||||
@@ -1153,6 +1153,11 @@ export async function runEmbeddedPiAgent(
|
||||
if (!profileId || !reason) {
|
||||
return;
|
||||
}
|
||||
if (pluginHarnessOwnsTransport && reason === "timeout") {
|
||||
// Harness-owned transport timeouts are lifecycle failures, not
|
||||
// credential evidence. Do not poison OpenClaw auth cooldowns.
|
||||
return;
|
||||
}
|
||||
await markAuthProfileFailure({
|
||||
store: profileFailureStore,
|
||||
profileId,
|
||||
@@ -2393,6 +2398,7 @@ export async function runEmbeddedPiAgent(
|
||||
fallbackConfigured,
|
||||
failoverFailure: promptFailoverFailure,
|
||||
failoverReason: promptFailoverReason,
|
||||
harnessOwnsTransport: pluginHarnessOwnsTransport,
|
||||
profileRotated: false,
|
||||
});
|
||||
if (
|
||||
@@ -2431,6 +2437,7 @@ export async function runEmbeddedPiAgent(
|
||||
fallbackConfigured,
|
||||
failoverFailure: promptFailoverFailure,
|
||||
failoverReason: promptFailoverReason,
|
||||
harnessOwnsTransport: pluginHarnessOwnsTransport,
|
||||
profileRotated: true,
|
||||
});
|
||||
}
|
||||
@@ -2597,6 +2604,7 @@ export async function runEmbeddedPiAgent(
|
||||
idleTimedOut,
|
||||
timedOutDuringCompaction,
|
||||
timedOutDuringToolExecution,
|
||||
harnessOwnsTransport: pluginHarnessOwnsTransport,
|
||||
profileRotated: false,
|
||||
});
|
||||
const assistantFailoverOutcome = await handleAssistantFailover({
|
||||
|
||||
@@ -298,6 +298,71 @@ describe("resolveRunFailoverDecision", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("does not rotate harness-owned assistant timeouts", () => {
|
||||
expect(
|
||||
resolveRunFailoverDecision({
|
||||
stage: "assistant",
|
||||
aborted: true,
|
||||
externalAbort: false,
|
||||
fallbackConfigured: true,
|
||||
failoverFailure: false,
|
||||
failoverReason: null,
|
||||
timedOut: true,
|
||||
idleTimedOut: false,
|
||||
timedOutDuringCompaction: false,
|
||||
timedOutDuringToolExecution: false,
|
||||
harnessOwnsTransport: true,
|
||||
profileRotated: false,
|
||||
}),
|
||||
).toEqual({
|
||||
action: "continue_normal",
|
||||
});
|
||||
});
|
||||
|
||||
it("rotates concrete assistant failover failures that accompany harness-owned timeouts", () => {
|
||||
expect(
|
||||
resolveRunFailoverDecision({
|
||||
stage: "assistant",
|
||||
aborted: false,
|
||||
externalAbort: false,
|
||||
fallbackConfigured: true,
|
||||
failoverFailure: true,
|
||||
failoverReason: "rate_limit",
|
||||
timedOut: true,
|
||||
idleTimedOut: false,
|
||||
timedOutDuringCompaction: false,
|
||||
timedOutDuringToolExecution: false,
|
||||
harnessOwnsTransport: true,
|
||||
profileRotated: false,
|
||||
}),
|
||||
).toEqual({
|
||||
action: "rotate_profile",
|
||||
reason: "rate_limit",
|
||||
});
|
||||
});
|
||||
|
||||
it("falls back with the concrete assistant failover reason after harness-owned timeout rotation is exhausted", () => {
|
||||
expect(
|
||||
resolveRunFailoverDecision({
|
||||
stage: "assistant",
|
||||
aborted: false,
|
||||
externalAbort: false,
|
||||
fallbackConfigured: true,
|
||||
failoverFailure: true,
|
||||
failoverReason: "billing",
|
||||
timedOut: true,
|
||||
idleTimedOut: false,
|
||||
timedOutDuringCompaction: false,
|
||||
timedOutDuringToolExecution: false,
|
||||
harnessOwnsTransport: true,
|
||||
profileRotated: true,
|
||||
}),
|
||||
).toEqual({
|
||||
action: "fallback_model",
|
||||
reason: "billing",
|
||||
});
|
||||
});
|
||||
|
||||
it("treats idle watchdog timeouts during tool execution as model silence", () => {
|
||||
expect(
|
||||
resolveRunFailoverDecision({
|
||||
@@ -403,6 +468,45 @@ describe("resolveRunFailoverDecision", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("does not fallback harness-owned LLM idle timeouts after profile rotation is exhausted", () => {
|
||||
expect(
|
||||
resolveRunFailoverDecision({
|
||||
stage: "assistant",
|
||||
aborted: false,
|
||||
externalAbort: false,
|
||||
fallbackConfigured: true,
|
||||
failoverFailure: false,
|
||||
failoverReason: null,
|
||||
timedOut: false,
|
||||
idleTimedOut: true,
|
||||
timedOutDuringCompaction: false,
|
||||
timedOutDuringToolExecution: false,
|
||||
harnessOwnsTransport: true,
|
||||
profileRotated: true,
|
||||
}),
|
||||
).toEqual({
|
||||
action: "continue_normal",
|
||||
});
|
||||
});
|
||||
|
||||
it("surfaces harness-owned prompt timeouts instead of falling back", () => {
|
||||
expect(
|
||||
resolveRunFailoverDecision({
|
||||
stage: "prompt",
|
||||
aborted: false,
|
||||
externalAbort: false,
|
||||
fallbackConfigured: true,
|
||||
failoverFailure: true,
|
||||
failoverReason: "timeout",
|
||||
harnessOwnsTransport: true,
|
||||
profileRotated: true,
|
||||
}),
|
||||
).toEqual({
|
||||
action: "surface_error",
|
||||
reason: "timeout",
|
||||
});
|
||||
});
|
||||
|
||||
it("surfaces error on LLM idle timeout when no fallback is configured and rotation is exhausted", () => {
|
||||
expect(
|
||||
resolveRunFailoverDecision({
|
||||
|
||||
@@ -45,6 +45,7 @@ type PromptDecisionParams = {
|
||||
fallbackConfigured: boolean;
|
||||
failoverFailure: boolean;
|
||||
failoverReason: FailoverReason | null;
|
||||
harnessOwnsTransport?: boolean;
|
||||
profileRotated: boolean;
|
||||
};
|
||||
|
||||
@@ -60,6 +61,7 @@ type AssistantDecisionParams = {
|
||||
idleTimedOut: boolean;
|
||||
timedOutDuringCompaction: boolean;
|
||||
timedOutDuringToolExecution: boolean;
|
||||
harnessOwnsTransport?: boolean;
|
||||
profileRotated: boolean;
|
||||
};
|
||||
|
||||
@@ -103,11 +105,33 @@ function isAssistantTimeoutFailure(params: AssistantDecisionParams): boolean {
|
||||
);
|
||||
}
|
||||
|
||||
function isConcreteNonTimeoutAssistantFailure(params: AssistantDecisionParams): boolean {
|
||||
return (
|
||||
params.failoverFailure && Boolean(params.failoverReason) && params.failoverReason !== "timeout"
|
||||
);
|
||||
}
|
||||
|
||||
function shouldRotateAssistant(params: AssistantDecisionParams): boolean {
|
||||
if (isTerminalFormatFailure(params)) {
|
||||
return false;
|
||||
}
|
||||
return (!params.aborted && params.failoverFailure) || isAssistantTimeoutFailure(params);
|
||||
const timeoutFailure = isAssistantTimeoutFailure(params);
|
||||
if (
|
||||
timeoutFailure &&
|
||||
params.harnessOwnsTransport &&
|
||||
!isConcreteNonTimeoutAssistantFailure(params)
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
return (!params.aborted && params.failoverFailure) || timeoutFailure;
|
||||
}
|
||||
|
||||
function assistantFallbackReason(params: AssistantDecisionParams): FailoverReason {
|
||||
const failoverReason = params.failoverReason;
|
||||
if (params.failoverFailure && failoverReason && failoverReason !== "timeout") {
|
||||
return failoverReason;
|
||||
}
|
||||
return isAssistantTimeoutFailure(params) ? "timeout" : (failoverReason ?? "unknown");
|
||||
}
|
||||
|
||||
export function mergeRetryFailoverReason(params: {
|
||||
@@ -146,6 +170,12 @@ export function resolveRunFailoverDecision(params: RunFailoverDecisionParams): R
|
||||
reason: params.failoverReason,
|
||||
};
|
||||
}
|
||||
if (params.harnessOwnsTransport && params.failoverReason === "timeout") {
|
||||
return {
|
||||
action: "surface_error",
|
||||
reason: params.failoverReason,
|
||||
};
|
||||
}
|
||||
if (!params.profileRotated && shouldRotatePrompt(params)) {
|
||||
return {
|
||||
action: "rotate_profile",
|
||||
@@ -186,7 +216,7 @@ export function resolveRunFailoverDecision(params: RunFailoverDecisionParams): R
|
||||
if (assistantShouldRotate && params.fallbackConfigured) {
|
||||
return {
|
||||
action: "fallback_model",
|
||||
reason: isAssistantTimeoutFailure(params) ? "timeout" : (params.failoverReason ?? "unknown"),
|
||||
reason: assistantFallbackReason(params),
|
||||
};
|
||||
}
|
||||
if (!assistantShouldRotate) {
|
||||
|
||||
Reference in New Issue
Block a user