fix: make overload failover configurable

This commit is contained in:
Peter Steinberger
2026-03-31 21:33:35 +01:00
parent 2a60e34f2a
commit 418fa12dfa
14 changed files with 255 additions and 81 deletions

View File

@@ -7855,6 +7855,39 @@
"help": "Failure window (hours) for backoff counters (default: 24).",
"hasChildren": false
},
{
"path": "auth.cooldowns.overloadedBackoffMs",
"kind": "core",
"type": "integer",
"required": false,
"deprecated": false,
"sensitive": false,
"tags": [
"access",
"auth",
"reliability",
"storage"
],
"label": "Overloaded Backoff (ms)",
"help": "Fixed delay in milliseconds before retrying an overloaded provider/profile rotation (default: 0).",
"hasChildren": false
},
{
"path": "auth.cooldowns.overloadedProfileRotations",
"kind": "core",
"type": "integer",
"required": false,
"deprecated": false,
"sensitive": false,
"tags": [
"access",
"auth",
"storage"
],
"label": "Overloaded Profile Rotations",
"help": "Maximum same-provider auth-profile rotations allowed for overloaded errors before switching to model fallback (default: 1).",
"hasChildren": false
},
{
"path": "auth.order",
"kind": "core",

View File

@@ -1,4 +1,4 @@
{"generatedBy":"scripts/generate-config-doc-baseline.ts","recordType":"meta","totalPaths":5718}
{"generatedBy":"scripts/generate-config-doc-baseline.ts","recordType":"meta","totalPaths":5720}
{"recordType":"path","path":"acp","kind":"core","type":"object","required":false,"deprecated":false,"sensitive":false,"tags":["advanced"],"label":"ACP","help":"ACP runtime controls for enabling dispatch, selecting backends, constraining allowed agent targets, and tuning streamed turn projection behavior.","hasChildren":true}
{"recordType":"path","path":"acp.allowedAgents","kind":"core","type":"array","required":false,"deprecated":false,"sensitive":false,"tags":["access"],"label":"ACP Allowed Agents","help":"Allowlist of ACP target agent ids permitted for ACP runtime sessions. Empty means no additional allowlist restriction.","hasChildren":true}
{"recordType":"path","path":"acp.allowedAgents.*","kind":"core","type":"string","required":false,"deprecated":false,"sensitive":false,"tags":[],"hasChildren":false}
@@ -697,6 +697,8 @@
{"recordType":"path","path":"auth.cooldowns.billingBackoffHoursByProvider.*","kind":"core","type":"number","required":false,"deprecated":false,"sensitive":false,"tags":[],"hasChildren":false}
{"recordType":"path","path":"auth.cooldowns.billingMaxHours","kind":"core","type":"number","required":false,"deprecated":false,"sensitive":false,"tags":["access","auth","performance"],"label":"Billing Backoff Cap (hours)","help":"Cap (hours) for billing backoff (default: 24).","hasChildren":false}
{"recordType":"path","path":"auth.cooldowns.failureWindowHours","kind":"core","type":"number","required":false,"deprecated":false,"sensitive":false,"tags":["access","auth"],"label":"Failover Window (hours)","help":"Failure window (hours) for backoff counters (default: 24).","hasChildren":false}
{"recordType":"path","path":"auth.cooldowns.overloadedBackoffMs","kind":"core","type":"integer","required":false,"deprecated":false,"sensitive":false,"tags":["access","auth","reliability","storage"],"label":"Overloaded Backoff (ms)","help":"Fixed delay in milliseconds before retrying an overloaded provider/profile rotation (default: 0).","hasChildren":false}
{"recordType":"path","path":"auth.cooldowns.overloadedProfileRotations","kind":"core","type":"integer","required":false,"deprecated":false,"sensitive":false,"tags":["access","auth","storage"],"label":"Overloaded Profile Rotations","help":"Maximum same-provider auth-profile rotations allowed for overloaded errors before switching to model fallback (default: 1).","hasChildren":false}
{"recordType":"path","path":"auth.order","kind":"core","type":"object","required":false,"deprecated":false,"sensitive":false,"tags":["access","auth"],"label":"Auth Profile Order","help":"Ordered auth profile IDs per provider (used for automatic failover).","hasChildren":true}
{"recordType":"path","path":"auth.order.*","kind":"core","type":"array","required":false,"deprecated":false,"sensitive":false,"tags":[],"hasChildren":true}
{"recordType":"path","path":"auth.order.*.*","kind":"core","type":"string","required":false,"deprecated":false,"sensitive":false,"tags":[],"hasChildren":false}

View File

@@ -129,6 +129,8 @@ Defaults:
- Billing backoff starts at **5 hours**, doubles per billing failure, and caps at **24 hours**.
- Backoff counters reset if the profile hasnt failed for **24 hours** (configurable).
- Overloaded retries allow **1 same-provider profile rotation** before model fallback.
- Overloaded retries use **0 ms backoff** by default.
## Model fallback
@@ -136,6 +138,11 @@ If all profiles for a provider fail, OpenClaw moves to the next model in
`agents.defaults.model.fallbacks`. This applies to auth failures, rate limits, and
timeouts that exhausted profile rotation (other errors do not advance fallback).
Overloaded errors are handled more aggressively than billing cooldowns. By default,
OpenClaw allows one same-provider auth-profile retry, then switches to the next
configured model fallback without waiting. Tune this with
`auth.cooldowns.overloadedProfileRotations` and `auth.cooldowns.overloadedBackoffMs`.
When a run starts with a model override (hooks or CLI), fallbacks still end at
`agents.defaults.model.primary` after trying any configured fallbacks.
@@ -146,6 +153,7 @@ See [Gateway configuration](/gateway/configuration) for:
- `auth.profiles` / `auth.order`
- `auth.cooldowns.billingBackoffHours` / `auth.cooldowns.billingBackoffHoursByProvider`
- `auth.cooldowns.billingMaxHours` / `auth.cooldowns.failureWindowHours`
- `auth.cooldowns.overloadedProfileRotations` / `auth.cooldowns.overloadedBackoffMs`
- `agents.defaults.model.primary` / `agents.defaults.model.fallbacks`
- `agents.defaults.imageModel` routing

View File

@@ -3029,6 +3029,8 @@ Notes:
billingBackoffHoursByProvider: { anthropic: 3, openai: 8 },
billingMaxHours: 24,
failureWindowHours: 24,
overloadedProfileRotations: 1,
overloadedBackoffMs: 0,
},
},
}
@@ -3038,6 +3040,8 @@ Notes:
- `billingBackoffHoursByProvider`: optional per-provider overrides for billing backoff hours.
- `billingMaxHours`: cap in hours for billing backoff exponential growth (default: `24`).
- `failureWindowHours`: rolling window in hours used for backoff counters (default: `24`).
- `overloadedProfileRotations`: maximum same-provider auth-profile rotations for overloaded errors before switching to model fallback (default: `1`).
- `overloadedBackoffMs`: fixed delay before retrying an overloaded provider/profile rotation (default: `0`).
---