Failover: treat bare service unavailable as timeout

2026-05-06 04:30:42 +00:00 · 2026-03-05 23:51:33 +03:00
parent d0ad9d0534
commit 46fb430612
4 changed files with 16 additions and 5 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -139,6 +139,7 @@ Docs: https://docs.openclaw.ai
 - Mattermost/plugin SDK import policy: replace remaining monolithic `openclaw/plugin-sdk` imports in Mattermost mention-gating paths/tests with scoped subpaths (`openclaw/plugin-sdk/compat` and `openclaw/plugin-sdk/mattermost`) so `pnpm check` passes `lint:plugins:no-monolithic-plugin-sdk-entry-imports` on baseline. (#36480) Thanks @Takhoffman.

 - Agents/failover cooldown classification: stop treating generic `cooling down` text as provider `rate_limit` so healthy models no longer show false global cooldown/rate-limit warnings while explicit `model_cooldown` markers still trigger failover. (#32972) thanks @stakeswky.
+- Agents/failover service-unavailable handling: stop treating bare proxy/CDN `service unavailable` errors as provider overload while keeping them retryable via the timeout/failover path, so transient outages no longer show false rate-limit warnings or block fallback. (#36646) thanks @jnMetaCode.

 ## 2026.3.2

--- a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts
+++ b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts
@@ -549,11 +549,10 @@ describe("classifyFailoverReason", () => {
      ),
    ).toBe("rate_limit");
  });
-  it("does not classify bare 'service unavailable' as rate_limit (#32828)", () => {
-    // A generic "service unavailable" from a proxy/CDN should not trigger
-    // provider-overload failover — it may be a transient proxy error or an
-    // unrelated upstream failure.
-    expect(classifyFailoverReason("LLM error: service unavailable")).toBeNull();
+  it("classifies bare 'service unavailable' as timeout instead of rate_limit (#32828)", () => {
+    // A generic "service unavailable" from a proxy/CDN should stay retryable
+    // (as "timeout"), but must not be treated as provider overload / rate limit.
+    expect(classifyFailoverReason("LLM error: service unavailable")).toBe("timeout");
  });
  it("classifies permanent auth errors as auth_permanent", () => {
    expect(classifyFailoverReason("invalid_api_key")).toBe("auth_permanent");
--- a/src/agents/pi-embedded-helpers/failover-matches.ts
+++ b/src/agents/pi-embedded-helpers/failover-matches.ts
@@ -24,6 +24,7 @@ const ERROR_PATTERNS = {
  timeout: [
    "timeout",
    "timed out",
+    "service unavailable",
    "deadline exceeded",
    "context deadline exceeded",
    "connection error",
--- a/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts
+++ b/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts
@@ -658,6 +658,16 @@ describe("runEmbeddedPiAgent auth profile rotation", () => {
    expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined();
  });

+  it("rotates on bare service unavailable without cooling down the profile", async () => {
+    const { usageStats } = await runAutoPinnedRotationCase({
+      errorMessage: "LLM error: service unavailable",
+      sessionKey: "agent:test:service-unavailable-no-cooldown",
+      runId: "run:service-unavailable-no-cooldown",
+    });
+    expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number");
+    expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined();
+  });
+
  it("does not rotate for compaction timeouts", async () => {
    await withAgentWorkspace(async ({ agentDir, workspaceDir }) => {
      await writeAuthStore(agentDir);