diff --git a/CHANGELOG.md b/CHANGELOG.md index fff4a335577..628d6cc8a39 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Agents/failover: forward embedded run abort signals into provider-owned model streams, cap implicit LLM idle watchdogs below long run timeouts, and mark 429 responses without usable retry timing as non-retryable so GitHub Copilot rate limits fail over or surface promptly instead of hanging until run timeout. Fixes #71120. - Browser/tool: keep explicit AI snapshots from inheriting the efficient role-snapshot default and preserve numeric Playwright AI refs, so `--format ai` remains a real AI snapshot path. Fixes #62550. Thanks @ly85206559. - Gateway/config: keep in-process config patch reload comparisons on the resolved source snapshot when `${VAR}` env refs are restored on disk, avoiding false full gateway restarts for unchanged gateway/plugin secrets. Fixes #71208. Thanks @robbiethompson18. - Slack/messages: serialize write-client requests and whole outbound sends per target so rapid multi-message Slack replies preserve send order. Fixes #69101. (#69105) Thanks @nightq and @ztexydt-cqh. 
diff --git a/src/agents/pi-embedded-runner/run/llm-idle-timeout.test.ts b/src/agents/pi-embedded-runner/run/llm-idle-timeout.test.ts index 1eb8432da4e..6ff312d01f2 100644 --- a/src/agents/pi-embedded-runner/run/llm-idle-timeout.test.ts +++ b/src/agents/pi-embedded-runner/run/llm-idle-timeout.test.ts @@ -50,13 +50,22 @@ describe("resolveLlmIdleTimeoutMs", () => { expect(resolveLlmIdleTimeoutMs({ cfg })).toBe(DEFAULT_LLM_IDLE_TIMEOUT_MS); }); - it("falls back to agents.defaults.timeoutSeconds when llm.idleTimeoutSeconds is not set", () => { + it("caps agents.defaults.timeoutSeconds fallback at the default idle watchdog", () => { const cfg = { agents: { defaults: { timeoutSeconds: 300 } } } as OpenClawConfig; - expect(resolveLlmIdleTimeoutMs({ cfg })).toBe(300_000); + expect(resolveLlmIdleTimeoutMs({ cfg })).toBe(DEFAULT_LLM_IDLE_TIMEOUT_MS); }); - it("uses an explicit run timeout override when llm.idleTimeoutSeconds is not set", () => { - expect(resolveLlmIdleTimeoutMs({ runTimeoutMs: 900_000 })).toBe(900_000); + it("uses agents.defaults.timeoutSeconds when it is shorter than the default idle watchdog", () => { + const cfg = { agents: { defaults: { timeoutSeconds: 30 } } } as OpenClawConfig; + expect(resolveLlmIdleTimeoutMs({ cfg })).toBe(30_000); + }); + + it("caps an explicit run timeout override at the default idle watchdog", () => { + expect(resolveLlmIdleTimeoutMs({ runTimeoutMs: 900_000 })).toBe(DEFAULT_LLM_IDLE_TIMEOUT_MS); + }); + + it("uses an explicit run timeout override when shorter than the default idle watchdog", () => { + expect(resolveLlmIdleTimeoutMs({ runTimeoutMs: 30_000 })).toBe(30_000); }); it("disables the idle watchdog when an explicit run timeout disables timeouts", () => { @@ -91,9 +100,9 @@ describe("resolveLlmIdleTimeoutMs", () => { expect(resolveLlmIdleTimeoutMs({ cfg, trigger: "cron" })).toBe(0); }); - it("uses agents.defaults.timeoutSeconds for cron before disabling the default idle timeout", () => { + it("caps 
agents.defaults.timeoutSeconds for cron before disabling the default idle timeout", () => { const cfg = { agents: { defaults: { timeoutSeconds: 300 } } } as OpenClawConfig; - expect(resolveLlmIdleTimeoutMs({ cfg, trigger: "cron" })).toBe(300_000); + expect(resolveLlmIdleTimeoutMs({ cfg, trigger: "cron" })).toBe(DEFAULT_LLM_IDLE_TIMEOUT_MS); }); it("keeps an explicit cron idle timeout when configured", () => { diff --git a/src/agents/pi-embedded-runner/run/llm-idle-timeout.ts b/src/agents/pi-embedded-runner/run/llm-idle-timeout.ts index ab16eed42da..160cbb12de3 100644 --- a/src/agents/pi-embedded-runner/run/llm-idle-timeout.ts +++ b/src/agents/pi-embedded-runner/run/llm-idle-timeout.ts @@ -25,6 +25,8 @@ export function resolveLlmIdleTimeoutMs(params?: { runTimeoutMs?: number; }): number { const clampTimeoutMs = (valueMs: number) => Math.min(Math.floor(valueMs), MAX_SAFE_TIMEOUT_MS); + const clampImplicitTimeoutMs = (valueMs: number) => + clampTimeoutMs(Math.min(valueMs, DEFAULT_LLM_IDLE_TIMEOUT_MS)); const raw = params?.cfg?.agents?.defaults?.llm?.idleTimeoutSeconds; // 0 means explicitly disabled (no timeout). 
if (raw === 0) { @@ -39,7 +41,7 @@ export function resolveLlmIdleTimeoutMs(params?: { if (runTimeoutMs >= MAX_SAFE_TIMEOUT_MS) { return 0; } - return clampTimeoutMs(runTimeoutMs); + return clampImplicitTimeoutMs(runTimeoutMs); } const agentTimeoutSeconds = params?.cfg?.agents?.defaults?.timeoutSeconds; @@ -48,7 +50,7 @@ export function resolveLlmIdleTimeoutMs(params?: { Number.isFinite(agentTimeoutSeconds) && agentTimeoutSeconds > 0 ) { - return clampTimeoutMs(agentTimeoutSeconds * 1000); + return clampImplicitTimeoutMs(agentTimeoutSeconds * 1000); } if (params?.trigger === "cron") { diff --git a/src/agents/pi-embedded-runner/stream-resolution.test.ts b/src/agents/pi-embedded-runner/stream-resolution.test.ts index 287b234941e..ceea120e18b 100644 --- a/src/agents/pi-embedded-runner/stream-resolution.test.ts +++ b/src/agents/pi-embedded-runner/stream-resolution.test.ts @@ -153,4 +153,54 @@ describe("resolveEmbeddedAgentStreamFn", () => { expect(authStorage.getApiKey).not.toHaveBeenCalled(); expect(providerStreamFn).toHaveBeenCalledTimes(1); }); + + it("forwards the run abort signal into provider-owned stream functions", async () => { + const providerStreamFn = vi.fn(async (_model, _context, options) => options); + const signal = new AbortController().signal; + const streamFn = resolveEmbeddedAgentStreamFn({ + currentStreamFn: undefined, + providerStreamFn, + shouldUseWebSocketTransport: false, + sessionId: "session-1", + signal, + model: { + api: "openai-responses", + provider: "github-copilot", + id: "gpt-5.4", + } as never, + resolvedApiKey: "resolved-key", + }); + + await expect( + streamFn({ provider: "github-copilot", id: "gpt-5.4" } as never, {} as never, {}), + ).resolves.toMatchObject({ + signal, + }); + }); + + it("does not overwrite an explicit provider-owned stream signal", async () => { + const providerStreamFn = vi.fn(async (_model, _context, options) => options); + const runSignal = new AbortController().signal; + const explicitSignal = new 
AbortController().signal; + const streamFn = resolveEmbeddedAgentStreamFn({ + currentStreamFn: undefined, + providerStreamFn, + shouldUseWebSocketTransport: false, + sessionId: "session-1", + signal: runSignal, + model: { + api: "openai-responses", + provider: "github-copilot", + id: "gpt-5.4", + } as never, + }); + + await expect( + streamFn({ provider: "github-copilot", id: "gpt-5.4" } as never, {} as never, { + signal: explicitSignal, + }), + ).resolves.toMatchObject({ + signal: explicitSignal, + }); + }); }); diff --git a/src/agents/pi-embedded-runner/stream-resolution.ts b/src/agents/pi-embedded-runner/stream-resolution.ts index a3cbdc21bed..1f2b504690b 100644 --- a/src/agents/pi-embedded-runner/stream-resolution.ts +++ b/src/agents/pi-embedded-runner/stream-resolution.ts @@ -81,6 +81,10 @@ export function resolveEmbeddedAgentStreamFn(params: { systemPrompt: stripSystemPromptCacheBoundary(context.systemPrompt), } : context; + const mergeRunSignal = (options: Parameters<typeof inner>[2]) => { + const signal = options?.signal ?? params.signal; + return signal ? { ...options, signal } : options; + }; // Provider-owned transports bypass pi-coding-agent's default auth lookup, // so keep injecting the resolved runtime apiKey for streamSimple-compatible // transports that still read credentials from options.apiKey. @@ -93,12 +97,12 @@ export function resolveEmbeddedAgentStreamFn(params: { authStorage, }); return inner(m, normalizeContext(context), { - ...options, + ...mergeRunSignal(options), apiKey: apiKey ?? options?.apiKey, }); }; } - return (m, context, options) => inner(m, normalizeContext(context), options); + return (m, context, options) => inner(m, normalizeContext(context), mergeRunSignal(options)); } const currentStreamFn = params.currentStreamFn ??
streamSimple; diff --git a/src/agents/provider-transport-fetch.test.ts b/src/agents/provider-transport-fetch.test.ts index 5777f0bf622..7ed262e0647 100644 --- a/src/agents/provider-transport-fetch.test.ts +++ b/src/agents/provider-transport-fetch.test.ts @@ -213,6 +213,46 @@ describe("buildGuardedModelFetch", () => { expect(response.headers.get("x-should-retry")).toBe("false"); }); + it("injects x-should-retry:false for terminal 429 responses without retry-after", async () => { + fetchWithSsrFGuardMock.mockResolvedValue({ + response: new Response("Sorry, you've exceeded your weekly rate limit.", { + status: 429, + headers: { "content-type": "text/plain; charset=utf-8" }, + }), + finalUrl: "https://api.individual.githubcopilot.com/responses", + release: vi.fn(async () => undefined), + }); + + const { buildGuardedModelFetch } = await import("./provider-transport-fetch.js"); + const response = await buildGuardedModelFetch(openaiModel)( + "https://api.individual.githubcopilot.com/responses", + { method: "POST" }, + ); + + expect(response.status).toBe(429); + expect(response.headers.get("x-should-retry")).toBe("false"); + await expect(response.text()).resolves.toContain("weekly rate limit"); + }); + + it("keeps short retry-after 429 responses retryable", async () => { + fetchWithSsrFGuardMock.mockResolvedValue({ + response: new Response(null, { + status: 429, + headers: { "retry-after": "30" }, + }), + finalUrl: "https://api.anthropic.com/v1/messages", + release: vi.fn(async () => undefined), + }); + + const { buildGuardedModelFetch } = await import("./provider-transport-fetch.js"); + const response = await buildGuardedModelFetch(anthropicModel)( + "https://api.anthropic.com/v1/messages", + { method: "POST" }, + ); + + expect(response.headers.get("x-should-retry")).toBeNull(); + }); + it("can be disabled with OPENCLAW_SDK_RETRY_MAX_WAIT_SECONDS=0", async () => { process.env.OPENCLAW_SDK_RETRY_MAX_WAIT_SECONDS = "0"; fetchWithSsrFGuardMock.mockResolvedValue({ @@ -252,7 
+292,7 @@ describe("buildGuardedModelFetch", () => { expect(response.headers.get("x-should-retry")).toBeNull(); }); - it("ignores malformed retry-after values", async () => { + it("treats malformed 429 retry-after values as terminal", async () => { fetchWithSsrFGuardMock.mockResolvedValue({ response: new Response(null, { status: 429, @@ -268,7 +308,7 @@ describe("buildGuardedModelFetch", () => { { method: "POST" }, ); - expect(response.headers.get("x-should-retry")).toBeNull(); + expect(response.headers.get("x-should-retry")).toBe("false"); }); it("ignores retry-after on non-retryable responses", async () => { diff --git a/src/agents/provider-transport-fetch.ts b/src/agents/provider-transport-fetch.ts index 5842e553915..dadc4308242 100644 --- a/src/agents/provider-transport-fetch.ts +++ b/src/agents/provider-transport-fetch.ts @@ -68,7 +68,11 @@ function shouldBypassLongSdkRetry(response: Response): boolean { } const retryAfterSeconds = parseRetryAfterSeconds(response.headers); - return retryAfterSeconds !== undefined && retryAfterSeconds > maxWaitSeconds; + if (retryAfterSeconds !== undefined) { + return retryAfterSeconds > maxWaitSeconds; + } + + return status === 429; } function buildManagedResponse(response: Response, release: () => Promise<void>): Response {