diff --git a/CHANGELOG.md b/CHANGELOG.md index 1063cd2aea9..8c303d26c96 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -139,6 +139,7 @@ Docs: https://docs.openclaw.ai - Mattermost/plugin SDK import policy: replace remaining monolithic `openclaw/plugin-sdk` imports in Mattermost mention-gating paths/tests with scoped subpaths (`openclaw/plugin-sdk/compat` and `openclaw/plugin-sdk/mattermost`) so `pnpm check` passes `lint:plugins:no-monolithic-plugin-sdk-entry-imports` on baseline. (#36480) Thanks @Takhoffman. - Agents/failover cooldown classification: stop treating generic `cooling down` text as provider `rate_limit` so healthy models no longer show false global cooldown/rate-limit warnings while explicit `model_cooldown` markers still trigger failover. (#32972) thanks @stakeswky. +- Agents/failover service-unavailable handling: stop treating bare proxy/CDN `service unavailable` errors as provider overload while keeping them retryable via the timeout/failover path, so transient outages no longer show false rate-limit warnings or block fallback. (#36646) thanks @jnMetaCode. ## 2026.3.2 diff --git a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts index b3c874b855b..a46857ac851 100644 --- a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts +++ b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts @@ -549,11 +549,10 @@ describe("classifyFailoverReason", () => { ), ).toBe("rate_limit"); }); - it("does not classify bare 'service unavailable' as rate_limit (#32828)", () => { - // A generic "service unavailable" from a proxy/CDN should not trigger - // provider-overload failover — it may be a transient proxy error or an - // unrelated upstream failure. - expect(classifyFailoverReason("LLM error: service unavailable")).toBeNull(); + it("classifies bare 'service unavailable' as timeout instead of rate_limit (#32828)", () => { + // A generic "service unavailable" from a proxy/CDN should stay retryable, + // but it should not be treated as provider overload / rate limit. + expect(classifyFailoverReason("LLM error: service unavailable")).toBe("timeout"); }); it("classifies permanent auth errors as auth_permanent", () => { expect(classifyFailoverReason("invalid_api_key")).toBe("auth_permanent"); diff --git a/src/agents/pi-embedded-helpers/failover-matches.ts b/src/agents/pi-embedded-helpers/failover-matches.ts index 92bd7e20eaf..d1e266ff53d 100644 --- a/src/agents/pi-embedded-helpers/failover-matches.ts +++ b/src/agents/pi-embedded-helpers/failover-matches.ts @@ -24,6 +24,7 @@ const ERROR_PATTERNS = { timeout: [ "timeout", "timed out", + "service unavailable", "deadline exceeded", "context deadline exceeded", "connection error", diff --git a/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts b/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts index cfefc20cc67..95450d2efd4 100644 --- a/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts +++ b/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts @@ -658,6 +658,16 @@ describe("runEmbeddedPiAgent auth profile rotation", () => { expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined(); }); + it("rotates on bare service unavailable without cooling down the profile", async () => { + const { usageStats } = await runAutoPinnedRotationCase({ + errorMessage: "LLM error: service unavailable", + sessionKey: "agent:test:service-unavailable-no-cooldown", + runId: "run:service-unavailable-no-cooldown", + }); + expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number"); + expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined(); + }); + it("does not rotate for compaction timeouts", async () => { await withAgentWorkspace(async ({ agentDir, workspaceDir }) => { await writeAuthStore(agentDir);