fix(agents): fail fast on terminal provider 429s

This commit is contained in:
Peter Steinberger
2026-04-25 01:30:53 +01:00
parent 26bc5e47ee
commit d12987d725
7 changed files with 123 additions and 13 deletions

View File

@@ -11,6 +11,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Agents/failover: forward embedded run abort signals into provider-owned model streams; cap implicit LLM idle watchdogs below long run timeouts; and mark 429 responses without usable retry timing as non-retryable, so GitHub Copilot rate limits fail over or surface promptly instead of hanging until the run timeout. Fixes #71120.
- Browser/tool: keep explicit AI snapshots from inheriting the efficient role-snapshot default and preserve numeric Playwright AI refs, so `--format ai` remains a real AI snapshot path. Fixes #62550. Thanks @ly85206559.
- Gateway/config: keep in-process config patch reload comparisons on the resolved source snapshot when `${VAR}` env refs are restored on disk, avoiding false full gateway restarts for unchanged gateway/plugin secrets. Fixes #71208. Thanks @robbiethompson18.
- Slack/messages: serialize write-client requests and whole outbound sends per target so rapid multi-message Slack replies preserve send order. Fixes #69101. (#69105) Thanks @nightq and @ztexydt-cqh.

View File

@@ -50,13 +50,22 @@ describe("resolveLlmIdleTimeoutMs", () => {
expect(resolveLlmIdleTimeoutMs({ cfg })).toBe(DEFAULT_LLM_IDLE_TIMEOUT_MS);
});
it("falls back to agents.defaults.timeoutSeconds when llm.idleTimeoutSeconds is not set", () => {
it("caps agents.defaults.timeoutSeconds fallback at the default idle watchdog", () => {
const cfg = { agents: { defaults: { timeoutSeconds: 300 } } } as OpenClawConfig;
expect(resolveLlmIdleTimeoutMs({ cfg })).toBe(300_000);
expect(resolveLlmIdleTimeoutMs({ cfg })).toBe(DEFAULT_LLM_IDLE_TIMEOUT_MS);
});
it("uses an explicit run timeout override when llm.idleTimeoutSeconds is not set", () => {
expect(resolveLlmIdleTimeoutMs({ runTimeoutMs: 900_000 })).toBe(900_000);
it("uses agents.defaults.timeoutSeconds when it is shorter than the default idle watchdog", () => {
const cfg = { agents: { defaults: { timeoutSeconds: 30 } } } as OpenClawConfig;
expect(resolveLlmIdleTimeoutMs({ cfg })).toBe(30_000);
});
it("caps an explicit run timeout override at the default idle watchdog", () => {
expect(resolveLlmIdleTimeoutMs({ runTimeoutMs: 900_000 })).toBe(DEFAULT_LLM_IDLE_TIMEOUT_MS);
});
it("uses an explicit run timeout override when shorter than the default idle watchdog", () => {
expect(resolveLlmIdleTimeoutMs({ runTimeoutMs: 30_000 })).toBe(30_000);
});
it("disables the idle watchdog when an explicit run timeout disables timeouts", () => {
@@ -91,9 +100,9 @@ describe("resolveLlmIdleTimeoutMs", () => {
expect(resolveLlmIdleTimeoutMs({ cfg, trigger: "cron" })).toBe(0);
});
it("uses agents.defaults.timeoutSeconds for cron before disabling the default idle timeout", () => {
it("caps agents.defaults.timeoutSeconds for cron before disabling the default idle timeout", () => {
const cfg = { agents: { defaults: { timeoutSeconds: 300 } } } as OpenClawConfig;
expect(resolveLlmIdleTimeoutMs({ cfg, trigger: "cron" })).toBe(300_000);
expect(resolveLlmIdleTimeoutMs({ cfg, trigger: "cron" })).toBe(DEFAULT_LLM_IDLE_TIMEOUT_MS);
});
it("keeps an explicit cron idle timeout when configured", () => {

View File

@@ -25,6 +25,8 @@ export function resolveLlmIdleTimeoutMs(params?: {
runTimeoutMs?: number;
}): number {
const clampTimeoutMs = (valueMs: number) => Math.min(Math.floor(valueMs), MAX_SAFE_TIMEOUT_MS);
const clampImplicitTimeoutMs = (valueMs: number) =>
clampTimeoutMs(Math.min(valueMs, DEFAULT_LLM_IDLE_TIMEOUT_MS));
const raw = params?.cfg?.agents?.defaults?.llm?.idleTimeoutSeconds;
// 0 means explicitly disabled (no timeout).
if (raw === 0) {
@@ -39,7 +41,7 @@ export function resolveLlmIdleTimeoutMs(params?: {
if (runTimeoutMs >= MAX_SAFE_TIMEOUT_MS) {
return 0;
}
return clampTimeoutMs(runTimeoutMs);
return clampImplicitTimeoutMs(runTimeoutMs);
}
const agentTimeoutSeconds = params?.cfg?.agents?.defaults?.timeoutSeconds;
@@ -48,7 +50,7 @@ export function resolveLlmIdleTimeoutMs(params?: {
Number.isFinite(agentTimeoutSeconds) &&
agentTimeoutSeconds > 0
) {
return clampTimeoutMs(agentTimeoutSeconds * 1000);
return clampImplicitTimeoutMs(agentTimeoutSeconds * 1000);
}
if (params?.trigger === "cron") {

View File

@@ -153,4 +153,54 @@ describe("resolveEmbeddedAgentStreamFn", () => {
expect(authStorage.getApiKey).not.toHaveBeenCalled();
expect(providerStreamFn).toHaveBeenCalledTimes(1);
});
it("forwards the run abort signal into provider-owned stream functions", async () => {
const providerStreamFn = vi.fn(async (_model, _context, options) => options);
const signal = new AbortController().signal;
const streamFn = resolveEmbeddedAgentStreamFn({
currentStreamFn: undefined,
providerStreamFn,
shouldUseWebSocketTransport: false,
sessionId: "session-1",
signal,
model: {
api: "openai-responses",
provider: "github-copilot",
id: "gpt-5.4",
} as never,
resolvedApiKey: "resolved-key",
});
await expect(
streamFn({ provider: "github-copilot", id: "gpt-5.4" } as never, {} as never, {}),
).resolves.toMatchObject({
signal,
});
});
it("does not overwrite an explicit provider-owned stream signal", async () => {
const providerStreamFn = vi.fn(async (_model, _context, options) => options);
const runSignal = new AbortController().signal;
const explicitSignal = new AbortController().signal;
const streamFn = resolveEmbeddedAgentStreamFn({
currentStreamFn: undefined,
providerStreamFn,
shouldUseWebSocketTransport: false,
sessionId: "session-1",
signal: runSignal,
model: {
api: "openai-responses",
provider: "github-copilot",
id: "gpt-5.4",
} as never,
});
await expect(
streamFn({ provider: "github-copilot", id: "gpt-5.4" } as never, {} as never, {
signal: explicitSignal,
}),
).resolves.toMatchObject({
signal: explicitSignal,
});
});
});

View File

@@ -81,6 +81,10 @@ export function resolveEmbeddedAgentStreamFn(params: {
systemPrompt: stripSystemPromptCacheBoundary(context.systemPrompt),
}
: context;
const mergeRunSignal = (options: Parameters<StreamFn>[2]) => {
const signal = options?.signal ?? params.signal;
return signal ? { ...options, signal } : options;
};
// Provider-owned transports bypass pi-coding-agent's default auth lookup,
// so keep injecting the resolved runtime apiKey for streamSimple-compatible
// transports that still read credentials from options.apiKey.
@@ -93,12 +97,12 @@ export function resolveEmbeddedAgentStreamFn(params: {
authStorage,
});
return inner(m, normalizeContext(context), {
...options,
...mergeRunSignal(options),
apiKey: apiKey ?? options?.apiKey,
});
};
}
return (m, context, options) => inner(m, normalizeContext(context), options);
return (m, context, options) => inner(m, normalizeContext(context), mergeRunSignal(options));
}
const currentStreamFn = params.currentStreamFn ?? streamSimple;

View File

@@ -213,6 +213,46 @@ describe("buildGuardedModelFetch", () => {
expect(response.headers.get("x-should-retry")).toBe("false");
});
it("injects x-should-retry:false for terminal 429 responses without retry-after", async () => {
fetchWithSsrFGuardMock.mockResolvedValue({
response: new Response("Sorry, you've exceeded your weekly rate limit.", {
status: 429,
headers: { "content-type": "text/plain; charset=utf-8" },
}),
finalUrl: "https://api.individual.githubcopilot.com/responses",
release: vi.fn(async () => undefined),
});
const { buildGuardedModelFetch } = await import("./provider-transport-fetch.js");
const response = await buildGuardedModelFetch(openaiModel)(
"https://api.individual.githubcopilot.com/responses",
{ method: "POST" },
);
expect(response.status).toBe(429);
expect(response.headers.get("x-should-retry")).toBe("false");
await expect(response.text()).resolves.toContain("weekly rate limit");
});
it("keeps short retry-after 429 responses retryable", async () => {
fetchWithSsrFGuardMock.mockResolvedValue({
response: new Response(null, {
status: 429,
headers: { "retry-after": "30" },
}),
finalUrl: "https://api.anthropic.com/v1/messages",
release: vi.fn(async () => undefined),
});
const { buildGuardedModelFetch } = await import("./provider-transport-fetch.js");
const response = await buildGuardedModelFetch(anthropicModel)(
"https://api.anthropic.com/v1/messages",
{ method: "POST" },
);
expect(response.headers.get("x-should-retry")).toBeNull();
});
it("can be disabled with OPENCLAW_SDK_RETRY_MAX_WAIT_SECONDS=0", async () => {
process.env.OPENCLAW_SDK_RETRY_MAX_WAIT_SECONDS = "0";
fetchWithSsrFGuardMock.mockResolvedValue({
@@ -252,7 +292,7 @@ describe("buildGuardedModelFetch", () => {
expect(response.headers.get("x-should-retry")).toBeNull();
});
it("ignores malformed retry-after values", async () => {
it("treats malformed 429 retry-after values as terminal", async () => {
fetchWithSsrFGuardMock.mockResolvedValue({
response: new Response(null, {
status: 429,
@@ -268,7 +308,7 @@ describe("buildGuardedModelFetch", () => {
{ method: "POST" },
);
expect(response.headers.get("x-should-retry")).toBeNull();
expect(response.headers.get("x-should-retry")).toBe("false");
});
it("ignores retry-after on non-retryable responses", async () => {

View File

@@ -68,7 +68,11 @@ function shouldBypassLongSdkRetry(response: Response): boolean {
}
const retryAfterSeconds = parseRetryAfterSeconds(response.headers);
return retryAfterSeconds !== undefined && retryAfterSeconds > maxWaitSeconds;
if (retryAfterSeconds !== undefined) {
return retryAfterSeconds > maxWaitSeconds;
}
return status === 429;
}
function buildManagedResponse(response: Response, release: () => Promise<void>): Response {