fix(agents): fail fast on terminal provider 429s

This commit is contained in:
Peter Steinberger
2026-04-25 01:30:53 +01:00
parent 26bc5e47ee
commit d12987d725
7 changed files with 123 additions and 13 deletions

View File

@@ -11,6 +11,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Agents/failover: forward embedded run abort signals into provider-owned model streams; cap implicit LLM idle watchdogs below long run timeouts; and mark 429 responses without usable retry timing as non-retryable, so GitHub Copilot rate limits fail over or surface promptly instead of hanging until the run timeout. Fixes #71120.
- Browser/tool: keep explicit AI snapshots from inheriting the efficient role-snapshot default and preserve numeric Playwright AI refs, so `--format ai` remains a real AI snapshot path. Fixes #62550. Thanks @ly85206559.
- Gateway/config: keep in-process config patch reload comparisons on the resolved source snapshot when `${VAR}` env refs are restored on disk, avoiding false full gateway restarts for unchanged gateway/plugin secrets. Fixes #71208. Thanks @robbiethompson18.
- Slack/messages: serialize write-client requests and whole outbound sends per target so rapid multi-message Slack replies preserve send order. Fixes #69101. (#69105) Thanks @nightq and @ztexydt-cqh.

View File

@@ -50,13 +50,22 @@ describe("resolveLlmIdleTimeoutMs", () => {
expect(resolveLlmIdleTimeoutMs({ cfg })).toBe(DEFAULT_LLM_IDLE_TIMEOUT_MS);
});
it("falls back to agents.defaults.timeoutSeconds when llm.idleTimeoutSeconds is not set", () => {
it("caps agents.defaults.timeoutSeconds fallback at the default idle watchdog", () => {
const cfg = { agents: { defaults: { timeoutSeconds: 300 } } } as OpenClawConfig;
expect(resolveLlmIdleTimeoutMs({ cfg })).toBe(300_000);
expect(resolveLlmIdleTimeoutMs({ cfg })).toBe(DEFAULT_LLM_IDLE_TIMEOUT_MS);
});
it("uses an explicit run timeout override when llm.idleTimeoutSeconds is not set", () => {
expect(resolveLlmIdleTimeoutMs({ runTimeoutMs: 900_000 })).toBe(900_000);
it("uses agents.defaults.timeoutSeconds when it is shorter than the default idle watchdog", () => {
const cfg = { agents: { defaults: { timeoutSeconds: 30 } } } as OpenClawConfig;
expect(resolveLlmIdleTimeoutMs({ cfg })).toBe(30_000);
});
it("caps an explicit run timeout override at the default idle watchdog", () => {
expect(resolveLlmIdleTimeoutMs({ runTimeoutMs: 900_000 })).toBe(DEFAULT_LLM_IDLE_TIMEOUT_MS);
});
it("uses an explicit run timeout override when shorter than the default idle watchdog", () => {
expect(resolveLlmIdleTimeoutMs({ runTimeoutMs: 30_000 })).toBe(30_000);
});
it("disables the idle watchdog when an explicit run timeout disables timeouts", () => {
@@ -91,9 +100,9 @@ describe("resolveLlmIdleTimeoutMs", () => {
expect(resolveLlmIdleTimeoutMs({ cfg, trigger: "cron" })).toBe(0);
});
it("uses agents.defaults.timeoutSeconds for cron before disabling the default idle timeout", () => {
it("caps agents.defaults.timeoutSeconds for cron before disabling the default idle timeout", () => {
const cfg = { agents: { defaults: { timeoutSeconds: 300 } } } as OpenClawConfig;
expect(resolveLlmIdleTimeoutMs({ cfg, trigger: "cron" })).toBe(300_000);
expect(resolveLlmIdleTimeoutMs({ cfg, trigger: "cron" })).toBe(DEFAULT_LLM_IDLE_TIMEOUT_MS);
});
it("keeps an explicit cron idle timeout when configured", () => {

View File

@@ -25,6 +25,8 @@ export function resolveLlmIdleTimeoutMs(params?: {
runTimeoutMs?: number;
}): number {
const clampTimeoutMs = (valueMs: number) => Math.min(Math.floor(valueMs), MAX_SAFE_TIMEOUT_MS);
const clampImplicitTimeoutMs = (valueMs: number) =>
clampTimeoutMs(Math.min(valueMs, DEFAULT_LLM_IDLE_TIMEOUT_MS));
const raw = params?.cfg?.agents?.defaults?.llm?.idleTimeoutSeconds;
// 0 means explicitly disabled (no timeout).
if (raw === 0) {
@@ -39,7 +41,7 @@ export function resolveLlmIdleTimeoutMs(params?: {
if (runTimeoutMs >= MAX_SAFE_TIMEOUT_MS) {
return 0;
}
return clampTimeoutMs(runTimeoutMs);
return clampImplicitTimeoutMs(runTimeoutMs);
}
const agentTimeoutSeconds = params?.cfg?.agents?.defaults?.timeoutSeconds;
@@ -48,7 +50,7 @@ export function resolveLlmIdleTimeoutMs(params?: {
Number.isFinite(agentTimeoutSeconds) &&
agentTimeoutSeconds > 0
) {
return clampTimeoutMs(agentTimeoutSeconds * 1000);
return clampImplicitTimeoutMs(agentTimeoutSeconds * 1000);
}
if (params?.trigger === "cron") {

View File

@@ -153,4 +153,54 @@ describe("resolveEmbeddedAgentStreamFn", () => {
expect(authStorage.getApiKey).not.toHaveBeenCalled();
expect(providerStreamFn).toHaveBeenCalledTimes(1);
});
it("forwards the run abort signal into provider-owned stream functions", async () => {
const providerStreamFn = vi.fn(async (_model, _context, options) => options);
const signal = new AbortController().signal;
const streamFn = resolveEmbeddedAgentStreamFn({
currentStreamFn: undefined,
providerStreamFn,
shouldUseWebSocketTransport: false,
sessionId: "session-1",
signal,
model: {
api: "openai-responses",
provider: "github-copilot",
id: "gpt-5.4",
} as never,
resolvedApiKey: "resolved-key",
});
await expect(
streamFn({ provider: "github-copilot", id: "gpt-5.4" } as never, {} as never, {}),
).resolves.toMatchObject({
signal,
});
});
it("does not overwrite an explicit provider-owned stream signal", async () => {
const providerStreamFn = vi.fn(async (_model, _context, options) => options);
const runSignal = new AbortController().signal;
const explicitSignal = new AbortController().signal;
const streamFn = resolveEmbeddedAgentStreamFn({
currentStreamFn: undefined,
providerStreamFn,
shouldUseWebSocketTransport: false,
sessionId: "session-1",
signal: runSignal,
model: {
api: "openai-responses",
provider: "github-copilot",
id: "gpt-5.4",
} as never,
});
await expect(
streamFn({ provider: "github-copilot", id: "gpt-5.4" } as never, {} as never, {
signal: explicitSignal,
}),
).resolves.toMatchObject({
signal: explicitSignal,
});
});
});

View File

@@ -81,6 +81,10 @@ export function resolveEmbeddedAgentStreamFn(params: {
systemPrompt: stripSystemPromptCacheBoundary(context.systemPrompt),
}
: context;
const mergeRunSignal = (options: Parameters<StreamFn>[2]) => {
const signal = options?.signal ?? params.signal;
return signal ? { ...options, signal } : options;
};
// Provider-owned transports bypass pi-coding-agent's default auth lookup,
// so keep injecting the resolved runtime apiKey for streamSimple-compatible
// transports that still read credentials from options.apiKey.
@@ -93,12 +97,12 @@ export function resolveEmbeddedAgentStreamFn(params: {
authStorage,
});
return inner(m, normalizeContext(context), {
...options,
...mergeRunSignal(options),
apiKey: apiKey ?? options?.apiKey,
});
};
}
return (m, context, options) => inner(m, normalizeContext(context), options);
return (m, context, options) => inner(m, normalizeContext(context), mergeRunSignal(options));
}
const currentStreamFn = params.currentStreamFn ?? streamSimple;

View File

@@ -213,6 +213,46 @@ describe("buildGuardedModelFetch", () => {
expect(response.headers.get("x-should-retry")).toBe("false");
});
it("injects x-should-retry:false for terminal 429 responses without retry-after", async () => {
fetchWithSsrFGuardMock.mockResolvedValue({
response: new Response("Sorry, you've exceeded your weekly rate limit.", {
status: 429,
headers: { "content-type": "text/plain; charset=utf-8" },
}),
finalUrl: "https://api.individual.githubcopilot.com/responses",
release: vi.fn(async () => undefined),
});
const { buildGuardedModelFetch } = await import("./provider-transport-fetch.js");
const response = await buildGuardedModelFetch(openaiModel)(
"https://api.individual.githubcopilot.com/responses",
{ method: "POST" },
);
expect(response.status).toBe(429);
expect(response.headers.get("x-should-retry")).toBe("false");
await expect(response.text()).resolves.toContain("weekly rate limit");
});
it("keeps short retry-after 429 responses retryable", async () => {
fetchWithSsrFGuardMock.mockResolvedValue({
response: new Response(null, {
status: 429,
headers: { "retry-after": "30" },
}),
finalUrl: "https://api.anthropic.com/v1/messages",
release: vi.fn(async () => undefined),
});
const { buildGuardedModelFetch } = await import("./provider-transport-fetch.js");
const response = await buildGuardedModelFetch(anthropicModel)(
"https://api.anthropic.com/v1/messages",
{ method: "POST" },
);
expect(response.headers.get("x-should-retry")).toBeNull();
});
it("can be disabled with OPENCLAW_SDK_RETRY_MAX_WAIT_SECONDS=0", async () => {
process.env.OPENCLAW_SDK_RETRY_MAX_WAIT_SECONDS = "0";
fetchWithSsrFGuardMock.mockResolvedValue({
@@ -252,7 +292,7 @@ describe("buildGuardedModelFetch", () => {
expect(response.headers.get("x-should-retry")).toBeNull();
});
it("ignores malformed retry-after values", async () => {
it("treats malformed 429 retry-after values as terminal", async () => {
fetchWithSsrFGuardMock.mockResolvedValue({
response: new Response(null, {
status: 429,
@@ -268,7 +308,7 @@ describe("buildGuardedModelFetch", () => {
{ method: "POST" },
);
expect(response.headers.get("x-should-retry")).toBeNull();
expect(response.headers.get("x-should-retry")).toBe("false");
});
it("ignores retry-after on non-retryable responses", async () => {

View File

@@ -68,7 +68,11 @@ function shouldBypassLongSdkRetry(response: Response): boolean {
}
const retryAfterSeconds = parseRetryAfterSeconds(response.headers);
return retryAfterSeconds !== undefined && retryAfterSeconds > maxWaitSeconds;
if (retryAfterSeconds !== undefined) {
return retryAfterSeconds > maxWaitSeconds;
}
return status === 429;
}
function buildManagedResponse(response: Response, release: () => Promise<void>): Response {