Failover: treat bare service unavailable as timeout

This commit is contained in:
Altay
2026-03-05 23:51:33 +03:00
parent d0ad9d0534
commit 46fb430612
4 changed files with 16 additions and 5 deletions

View File

@@ -139,6 +139,7 @@ Docs: https://docs.openclaw.ai
- Mattermost/plugin SDK import policy: replace remaining monolithic `openclaw/plugin-sdk` imports in Mattermost mention-gating paths/tests with scoped subpaths (`openclaw/plugin-sdk/compat` and `openclaw/plugin-sdk/mattermost`) so `pnpm check` passes `lint:plugins:no-monolithic-plugin-sdk-entry-imports` on baseline. (#36480) Thanks @Takhoffman.
- Agents/failover cooldown classification: stop treating generic `cooling down` text as provider `rate_limit` so healthy models no longer show false global cooldown/rate-limit warnings while explicit `model_cooldown` markers still trigger failover. (#32972) thanks @stakeswky.
- Agents/failover service-unavailable handling: stop treating bare proxy/CDN `service unavailable` errors as provider overload while keeping them retryable via the timeout/failover path, so transient outages no longer show false rate-limit warnings or block fallback. (#36646) thanks @jnMetaCode.
## 2026.3.2

View File

@@ -549,11 +549,10 @@ describe("classifyFailoverReason", () => {
),
).toBe("rate_limit");
});
it("does not classify bare 'service unavailable' as rate_limit (#32828)", () => {
// A generic "service unavailable" from a proxy/CDN should not trigger
// provider-overload failover — it may be a transient proxy error or an
// unrelated upstream failure.
expect(classifyFailoverReason("LLM error: service unavailable")).toBeNull();
it("classifies bare 'service unavailable' as timeout instead of rate_limit (#32828)", () => {
  // Proxy/CDN-style "service unavailable" text carries no provider-overload
  // signal: it must remain retryable (classified as "timeout") without being
  // reported as a rate limit.
  const reason = classifyFailoverReason("LLM error: service unavailable");
  expect(reason).toBe("timeout");
});
it("classifies permanent auth errors as auth_permanent", () => {
expect(classifyFailoverReason("invalid_api_key")).toBe("auth_permanent");

View File

@@ -24,6 +24,7 @@ const ERROR_PATTERNS = {
timeout: [
"timeout",
"timed out",
"service unavailable",
"deadline exceeded",
"context deadline exceeded",
"connection error",

View File

@@ -658,6 +658,16 @@ describe("runEmbeddedPiAgent auth profile rotation", () => {
expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined();
});
it("rotates on bare service unavailable without cooling down the profile", async () => {
  // Drive the shared rotation scenario with a bare "service unavailable" error.
  const outcome = await runAutoPinnedRotationCase({
    errorMessage: "LLM error: service unavailable",
    sessionKey: "agent:test:service-unavailable-no-cooldown",
    runId: "run:service-unavailable-no-cooldown",
  });
  const stats = outcome.usageStats;

  // The second profile must have been used, i.e. rotation happened...
  const rotatedProfile = stats["openai:p2"];
  expect(typeof rotatedProfile?.lastUsed).toBe("number");

  // ...while the first profile is left without any cooldown marker.
  const initialProfile = stats["openai:p1"];
  expect(initialProfile?.cooldownUntil).toBeUndefined();
});
it("does not rotate for compaction timeouts", async () => {
await withAgentWorkspace(async ({ agentDir, workspaceDir }) => {
await writeAuthStore(agentDir);