From 768736dc1971648a565639810de8ae616ba2ffdd Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 6 Mar 2026 15:14:52 -0500 Subject: [PATCH 01/16] Tooling: add Knip workspace config --- knip.config.ts | 105 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 knip.config.ts diff --git a/knip.config.ts b/knip.config.ts new file mode 100644 index 00000000000..e4daabd7e95 --- /dev/null +++ b/knip.config.ts @@ -0,0 +1,105 @@ +const rootEntries = [ + "openclaw.mjs!", + "src/index.ts!", + "src/entry.ts!", + "src/cli/daemon-cli.ts!", + "src/extensionAPI.ts!", + "src/infra/warning-filter.ts!", + "src/channels/plugins/agent-tools/whatsapp-login.ts!", + "src/channels/plugins/actions/discord.ts!", + "src/channels/plugins/actions/signal.ts!", + "src/channels/plugins/actions/telegram.ts!", + "src/telegram/audit.ts!", + "src/telegram/token.ts!", + "src/line/accounts.ts!", + "src/line/send.ts!", + "src/line/template-messages.ts!", + "src/hooks/bundled/*/handler.ts!", + "src/hooks/llm-slug-generator.ts!", + "src/plugin-sdk/*.ts!", +] as const; + +const config = { + ignoreFiles: [ + "scripts/**", + "**/__tests__/**", + "src/test-utils/**", + "**/test-helpers/**", + "**/test-fixtures/**", + "**/live-*.ts", + "**/test-*.ts", + "**/*test-helpers.ts", + "**/*test-fixtures.ts", + "**/*test-harness.ts", + "**/*test-utils.ts", + "**/*mocks.ts", + "**/*.e2e-mocks.ts", + "**/*.e2e-*.ts", + "**/*.harness.ts", + "**/*.job-fixtures.ts", + "**/*.mock-harness.ts", + "**/*.suite-helpers.ts", + "**/*.test-setup.ts", + "**/job-fixtures.ts", + "**/*test-mocks.ts", + "**/*test-runtime*.ts", + "**/*.mock-setup.ts", + "**/*.cases.ts", + "**/*.e2e-harness.ts", + "**/*.fixture.ts", + "**/*.fixtures.ts", + "**/*.mocks.ts", + "**/*.mocks.shared.ts", + "**/*.shared-test.ts", + "**/*.suite.ts", + "**/*.test-runtime.ts", + "**/*.testkit.ts", + "**/*.test-fixtures.ts", + "**/*.test-harness.ts", + "**/*.test-helper.ts", + "**/*.test-helpers.ts", + 
"**/*.test-mocks.ts", + "**/*.test-utils.ts", + "src/gateway/live-image-probe.ts", + "src/secrets/credential-matrix.ts", + "src/agents/claude-cli-runner.ts", + "src/agents/pi-auth-json.ts", + "src/agents/tool-policy.conformance.ts", + "src/auto-reply/reply/audio-tags.ts", + "src/gateway/live-tool-probe-utils.ts", + "src/gateway/server.auth.shared.ts", + "src/shared/text/assistant-visible-text.ts", + "src/telegram/bot/reply-threading.ts", + "src/telegram/draft-chunking.ts", + "extensions/msteams/src/conversation-store-memory.ts", + "extensions/msteams/src/polls-store-memory.ts", + "extensions/voice-call/src/providers/index.ts", + "extensions/voice-call/src/providers/tts-openai.ts", + ], + workspaces: { + ".": { + entry: rootEntries, + project: [ + "src/**/*.ts!", + "scripts/**/*.{js,mjs,cjs,ts,mts,cts}!", + "*.config.{js,mjs,cjs,ts,mts,cts}!", + "*.mjs!", + ], + }, + ui: { + entry: ["index.html!", "src/main.ts!", "vite.config.ts!", "vitest*.ts!"], + project: ["src/**/*.{ts,tsx}!"], + }, + "packages/*": { + entry: ["index.js!", "scripts/postinstall.js!"], + project: ["index.js!", "scripts/**/*.js!"], + }, + "extensions/*": { + entry: ["index.ts!"], + project: ["index.ts!", "src/**/*.ts!"], + ignoreDependencies: ["openclaw"], + }, + }, +} as const; + +export default config; From b70d3c4af3804ddf39d5dedc60bbcf973140db27 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 6 Mar 2026 15:14:59 -0500 Subject: [PATCH 02/16] Tooling: wire deadcode scripts to Knip --- package.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index a7b5e189dbc..8a8ea416427 100644 --- a/package.json +++ b/package.json @@ -231,8 +231,8 @@ "check:docs": "pnpm format:docs:check && pnpm lint:docs && pnpm docs:check-links", "check:host-env-policy:swift": "node scripts/generate-host-env-security-policy-swift.mjs --check", "check:loc": "node --import tsx scripts/check-ts-max-loc.ts --max 500", - "deadcode:ci": "pnpm deadcode:report:ci:knip && pnpm 
deadcode:report:ci:ts-prune && pnpm deadcode:report:ci:ts-unused", - "deadcode:knip": "pnpm dlx knip --no-progress", + "deadcode:ci": "pnpm deadcode:report:ci:knip", + "deadcode:knip": "pnpm dlx knip --config knip.config.ts --isolate-workspaces --production --no-progress --reporter compact --files --dependencies", "deadcode:report": "pnpm deadcode:knip; pnpm deadcode:ts-prune; pnpm deadcode:ts-unused", "deadcode:report:ci:knip": "mkdir -p .artifacts/deadcode && pnpm deadcode:knip > .artifacts/deadcode/knip.txt 2>&1 || true", "deadcode:report:ci:ts-prune": "mkdir -p .artifacts/deadcode && pnpm deadcode:ts-prune > .artifacts/deadcode/ts-prune.txt 2>&1 || true", From b17baca871dd0dbdd00fb0b7fb35652855984e50 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 6 Mar 2026 15:15:06 -0500 Subject: [PATCH 03/16] CI: enable report-only Knip deadcode job --- .github/workflows/ci.yml | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 817f4b94d00..0e3c21e9119 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -213,25 +213,13 @@ jobs: - name: Enforce safe external URL opening policy run: pnpm lint:ui:no-raw-window-open - # Report-only dead-code scans. Runs after scope detection and stores machine-readable - # results as artifacts for later triage before we enable hard gates. - # Temporarily disabled in CI while we process initial findings. + # Report-only dead-code scan. Runs after scope detection and stores the Knip + # report as an artifact so we can triage findings before enabling hard gates. 
deadcode: name: dead-code report needs: [docs-scope, changed-scope] - # if: needs.docs-scope.outputs.docs_only != 'true' && (github.event_name == 'push' || needs.changed-scope.outputs.run_node == 'true') - if: false + if: needs.docs-scope.outputs.docs_only != 'true' && (github.event_name == 'push' || needs.changed-scope.outputs.run_node == 'true') runs-on: blacksmith-16vcpu-ubuntu-2404 - strategy: - fail-fast: false - matrix: - include: - - tool: knip - command: pnpm deadcode:report:ci:knip - - tool: ts-prune - command: pnpm deadcode:report:ci:ts-prune - - tool: ts-unused-exports - command: pnpm deadcode:report:ci:ts-unused steps: - name: Checkout uses: actions/checkout@v4 @@ -244,13 +232,13 @@ jobs: install-bun: "false" use-sticky-disk: "true" - - name: Run ${{ matrix.tool }} dead-code scan - run: ${{ matrix.command }} + - name: Run Knip dead-code scan + run: pnpm deadcode:report:ci:knip - name: Upload dead-code results uses: actions/upload-artifact@v4 with: - name: dead-code-${{ matrix.tool }}-${{ github.run_id }} + name: dead-code-knip-${{ github.run_id }} path: .artifacts/deadcode # Validate docs (format, lint, broken links) only when docs files changed. 
From ab5fcfcc01281f1f6cd6e8f43f7c302c12806feb Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 6 Mar 2026 15:15:23 -0500 Subject: [PATCH 04/16] feat(gateway): add channel-backed readiness probes (#38285) * Changelog: add channel-backed readiness probe entry * Gateway: add channel-backed readiness probes * Docs: describe readiness probe behavior * Gateway: add readiness probe regression tests * Changelog: dedupe gateway probe entries * Docs: fix readiness startup grace description * Changelog: remove stale readiness entry * Gateway: cover readiness hardening * Gateway: harden readiness probes --- CHANGELOG.md | 2 +- docs/install/docker.md | 4 + src/channels/plugins/types.core.ts | 1 + src/gateway/channel-health-monitor.ts | 7 +- src/gateway/channel-health-policy.ts | 5 + src/gateway/server-channels.ts | 11 ++ src/gateway/server-http.probe.test.ts | 155 ++++++++++++++++++ src/gateway/server-http.test-harness.ts | 6 + src/gateway/server-http.ts | 84 ++++++++-- src/gateway/server-runtime-state.ts | 3 + src/gateway/server.impl.ts | 19 ++- src/gateway/server/readiness.test.ts | 202 ++++++++++++++++++++++++ src/gateway/server/readiness.ts | 79 +++++++++ 13 files changed, 558 insertions(+), 20 deletions(-) create mode 100644 src/gateway/server-http.probe.test.ts create mode 100644 src/gateway/server/readiness.test.ts create mode 100644 src/gateway/server/readiness.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b8cb0cde4b..99c04190526 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -207,7 +207,7 @@ Docs: https://docs.openclaw.ai - WhatsApp media upload caps: make outbound media sends and auto-replies honor `channels.whatsapp.mediaMaxMb` with per-account overrides so inbound and outbound limits use the same channel config. Thanks @vincentkoc. 
- Windows/Plugin install: when OpenClaw runs on Windows via Bun and `npm-cli.js` is not colocated with the runtime binary, fall back to `npm.cmd`/`npx.cmd` through the existing `cmd.exe` wrapper so `openclaw plugins install` no longer fails with `spawn EINVAL`. (#38056) Thanks @0xlin2023. - Telegram/send retry classification: retry grammY `Network request ... failed after N attempts` envelopes in send flows without reclassifying plain `Network request ... failed!` wrappers as transient, restoring the intended retry path while keeping broad send-context message matching tight. (#38056) Thanks @0xlin2023. -- Gateway/probe route precedence: keep `/health`, `/healthz`, `/ready`, and `/readyz` reachable when the Control UI is mounted at `/`, so root-mounted SPA fallbacks no longer swallow machine probe routes while plugin-owned routes on those paths still keep precedence. (#18446) Thanks @vibecodooor and @vincentkoc. +- Gateway/probes: keep `/health`, `/healthz`, `/ready`, and `/readyz` reachable when the Control UI is mounted at `/`, preserve plugin-owned route precedence on those paths, and make `/ready` and `/readyz` report channel-backed readiness with startup grace plus `503` on disconnected managed channels, while `/health` and `/healthz` stay shallow liveness probes. (#18446) Thanks @vibecodooor, @mahsumaktas, and @vincentkoc. ## 2026.3.2 diff --git a/docs/install/docker.md b/docs/install/docker.md index 8cbf2555e87..1dd0d2325d1 100644 --- a/docs/install/docker.md +++ b/docs/install/docker.md @@ -476,6 +476,10 @@ curl -fsS http://127.0.0.1:18789/readyz Aliases: `/health` and `/ready`. +`/healthz` is a shallow liveness probe for "the gateway process is up". +`/readyz` stays ready during startup grace, then becomes `503` only if required +managed channels are still disconnected after grace or disconnect later. + The Docker image includes a built-in `HEALTHCHECK` that pings `/healthz` in the background. 
In plain terms: Docker keeps checking if OpenClaw is still responsive. If checks keep failing, Docker marks the container as `unhealthy`, diff --git a/src/channels/plugins/types.core.ts b/src/channels/plugins/types.core.ts index 6cd5173e13b..22f8e458e79 100644 --- a/src/channels/plugins/types.core.ts +++ b/src/channels/plugins/types.core.ts @@ -102,6 +102,7 @@ export type ChannelAccountSnapshot = { linked?: boolean; running?: boolean; connected?: boolean; + restartPending?: boolean; reconnectAttempts?: number; lastConnectedAt?: number | null; lastDisconnect?: diff --git a/src/gateway/channel-health-monitor.ts b/src/gateway/channel-health-monitor.ts index e66bc4912af..4ed422468f0 100644 --- a/src/gateway/channel-health-monitor.ts +++ b/src/gateway/channel-health-monitor.ts @@ -1,6 +1,8 @@ import type { ChannelId } from "../channels/plugins/types.js"; import { createSubsystemLogger } from "../logging/subsystem.js"; import { + DEFAULT_CHANNEL_CONNECT_GRACE_MS, + DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS, evaluateChannelHealth, resolveChannelRestartReason, type ChannelHealthPolicy, @@ -21,9 +23,6 @@ const ONE_HOUR_MS = 60 * 60_000; * This catches the half-dead WebSocket scenario where the connection appears * alive (health checks pass) but Slack silently stops delivering events. */ -const DEFAULT_STALE_EVENT_THRESHOLD_MS = 30 * 60_000; -const DEFAULT_CHANNEL_CONNECT_GRACE_MS = 120_000; - export type ChannelHealthTimingPolicy = { monitorStartupGraceMs: number; channelConnectGraceMs: number; @@ -70,7 +69,7 @@ function resolveTimingPolicy( staleEventThresholdMs: deps.timing?.staleEventThresholdMs ?? deps.staleEventThresholdMs ?? 
- DEFAULT_STALE_EVENT_THRESHOLD_MS, + DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS, }; } diff --git a/src/gateway/channel-health-policy.ts b/src/gateway/channel-health-policy.ts index 31938a90471..d0616f04862 100644 --- a/src/gateway/channel-health-policy.ts +++ b/src/gateway/channel-health-policy.ts @@ -3,6 +3,7 @@ export type ChannelHealthSnapshot = { connected?: boolean; enabled?: boolean; configured?: boolean; + restartPending?: boolean; busy?: boolean; activeRuns?: number; lastRunActivityAt?: number | null; @@ -39,6 +40,10 @@ function isManagedAccount(snapshot: ChannelHealthSnapshot): boolean { } const BUSY_ACTIVITY_STALE_THRESHOLD_MS = 25 * 60_000; +// Keep these shared between the background health monitor and on-demand readiness +// probes so both surfaces evaluate channel lifecycle windows consistently. +export const DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS = 30 * 60_000; +export const DEFAULT_CHANNEL_CONNECT_GRACE_MS = 120_000; export function evaluateChannelHealth( snapshot: ChannelHealthSnapshot, diff --git a/src/gateway/server-channels.ts b/src/gateway/server-channels.ts index 6c291541369..4090791d285 100644 --- a/src/gateway/server-channels.ts +++ b/src/gateway/server-channels.ts @@ -180,6 +180,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage enabled: false, configured: true, running: false, + restartPending: false, lastError: plugin.config.disabledReason?.(account, cfg) ?? "disabled", }); return; @@ -195,6 +196,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage enabled: true, configured: false, running: false, + restartPending: false, lastError: plugin.config.unconfiguredReason?.(account, cfg) ?? "not configured", }); return; @@ -215,6 +217,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage enabled: true, configured: true, running: true, + restartPending: false, lastStartAt: Date.now(), lastError: null, reconnectAttempts: preserveRestartAttempts ? 
(restartAttempts.get(rKey) ?? 0) : 0, @@ -252,6 +255,11 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage const attempt = (restartAttempts.get(rKey) ?? 0) + 1; restartAttempts.set(rKey, attempt); if (attempt > MAX_RESTART_ATTEMPTS) { + setRuntime(channelId, id, { + accountId: id, + restartPending: false, + reconnectAttempts: attempt, + }); log.error?.(`[${id}] giving up after ${MAX_RESTART_ATTEMPTS} restart attempts`); return; } @@ -261,6 +269,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage ); setRuntime(channelId, id, { accountId: id, + restartPending: true, reconnectAttempts: attempt, }); try { @@ -349,6 +358,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage setRuntime(channelId, id, { accountId: id, running: false, + restartPending: false, lastStopAt: Date.now(), }); }), @@ -377,6 +387,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage const next: ChannelAccountSnapshot = { accountId: resolvedId, running: false, + restartPending: false, lastError: cleared ? 
"logged out" : current.lastError, }; if (typeof current.connected === "boolean") { diff --git a/src/gateway/server-http.probe.test.ts b/src/gateway/server-http.probe.test.ts new file mode 100644 index 00000000000..0e55ddeba32 --- /dev/null +++ b/src/gateway/server-http.probe.test.ts @@ -0,0 +1,155 @@ +import { describe, expect, it } from "vitest"; +import { + AUTH_TOKEN, + AUTH_NONE, + createRequest, + createResponse, + dispatchRequest, + withGatewayServer, +} from "./server-http.test-harness.js"; +import type { ReadinessChecker } from "./server/readiness.js"; + +describe("gateway probe endpoints", () => { + it("returns detailed readiness payload for local /ready requests", async () => { + const getReadiness: ReadinessChecker = () => ({ + ready: true, + failing: [], + uptimeMs: 45_000, + }); + + await withGatewayServer({ + prefix: "probe-ready", + resolvedAuth: AUTH_NONE, + overrides: { getReadiness }, + run: async (server) => { + const req = createRequest({ path: "/ready" }); + const { res, getBody } = createResponse(); + await dispatchRequest(server, req, res); + + expect(res.statusCode).toBe(200); + expect(JSON.parse(getBody())).toEqual({ ready: true, failing: [], uptimeMs: 45_000 }); + }, + }); + }); + + it("returns only readiness state for unauthenticated remote /ready requests", async () => { + const getReadiness: ReadinessChecker = () => ({ + ready: false, + failing: ["discord", "telegram"], + uptimeMs: 8_000, + }); + + await withGatewayServer({ + prefix: "probe-not-ready", + resolvedAuth: AUTH_NONE, + overrides: { getReadiness }, + run: async (server) => { + const req = createRequest({ + path: "/ready", + remoteAddress: "10.0.0.8", + host: "gateway.test", + }); + const { res, getBody } = createResponse(); + await dispatchRequest(server, req, res); + + expect(res.statusCode).toBe(503); + expect(JSON.parse(getBody())).toEqual({ ready: false }); + }, + }); + }); + + it("returns detailed readiness payload for authenticated remote /ready requests", async () => { 
+ const getReadiness: ReadinessChecker = () => ({ + ready: false, + failing: ["discord", "telegram"], + uptimeMs: 8_000, + }); + + await withGatewayServer({ + prefix: "probe-remote-authenticated", + resolvedAuth: AUTH_TOKEN, + overrides: { getReadiness }, + run: async (server) => { + const req = createRequest({ + path: "/ready", + remoteAddress: "10.0.0.8", + host: "gateway.test", + authorization: "Bearer test-token", + }); + const { res, getBody } = createResponse(); + await dispatchRequest(server, req, res); + + expect(res.statusCode).toBe(503); + expect(JSON.parse(getBody())).toEqual({ + ready: false, + failing: ["discord", "telegram"], + uptimeMs: 8_000, + }); + }, + }); + }); + + it("returns typed internal error payload when readiness evaluation throws", async () => { + const getReadiness: ReadinessChecker = () => { + throw new Error("boom"); + }; + + await withGatewayServer({ + prefix: "probe-throws", + resolvedAuth: AUTH_NONE, + overrides: { getReadiness }, + run: async (server) => { + const req = createRequest({ path: "/ready" }); + const { res, getBody } = createResponse(); + await dispatchRequest(server, req, res); + + expect(res.statusCode).toBe(503); + expect(JSON.parse(getBody())).toEqual({ ready: false, failing: ["internal"], uptimeMs: 0 }); + }, + }); + }); + + it("keeps /healthz shallow even when readiness checker reports failing channels", async () => { + const getReadiness: ReadinessChecker = () => ({ + ready: false, + failing: ["discord"], + uptimeMs: 999, + }); + + await withGatewayServer({ + prefix: "probe-healthz-unaffected", + resolvedAuth: AUTH_NONE, + overrides: { getReadiness }, + run: async (server) => { + const req = createRequest({ path: "/healthz" }); + const { res, getBody } = createResponse(); + await dispatchRequest(server, req, res); + + expect(res.statusCode).toBe(200); + expect(getBody()).toBe(JSON.stringify({ ok: true, status: "live" })); + }, + }); + }); + + it("reflects readiness status on HEAD /readyz without a response 
body", async () => { + const getReadiness: ReadinessChecker = () => ({ + ready: false, + failing: ["discord"], + uptimeMs: 5_000, + }); + + await withGatewayServer({ + prefix: "probe-readyz-head", + resolvedAuth: AUTH_NONE, + overrides: { getReadiness }, + run: async (server) => { + const req = createRequest({ path: "/readyz", method: "HEAD" }); + const { res, getBody } = createResponse(); + await dispatchRequest(server, req, res); + + expect(res.statusCode).toBe(503); + expect(getBody()).toBe(""); + }, + }); + }); +}); diff --git a/src/gateway/server-http.test-harness.ts b/src/gateway/server-http.test-harness.ts index bf963487038..24612d60b1f 100644 --- a/src/gateway/server-http.test-harness.ts +++ b/src/gateway/server-http.test-harness.ts @@ -28,11 +28,15 @@ export function createRequest(params: { path: string; authorization?: string; method?: string; + remoteAddress?: string; + host?: string; }): IncomingMessage { return createGatewayRequest({ path: params.path, authorization: params.authorization, method: params.method, + remoteAddress: params.remoteAddress, + host: params.host, }); } @@ -127,6 +131,8 @@ export async function sendRequest( path: string; authorization?: string; method?: string; + remoteAddress?: string; + host?: string; }, ): Promise> { const response = createResponse(); diff --git a/src/gateway/server-http.ts b/src/gateway/server-http.ts index 41911f35b49..612ce90dbba 100644 --- a/src/gateway/server-http.ts +++ b/src/gateway/server-http.ts @@ -20,7 +20,12 @@ import { normalizeRateLimitClientIp, type AuthRateLimiter, } from "./auth-rate-limit.js"; -import { type GatewayAuthResult, type ResolvedGatewayAuth } from "./auth.js"; +import { + authorizeHttpGatewayConnect, + isLocalDirectRequest, + type GatewayAuthResult, + type ResolvedGatewayAuth, +} from "./auth.js"; import { normalizeCanvasScopedUrl } from "./canvas-capability.js"; import { handleControlUiAvatarRequest, @@ -46,6 +51,7 @@ import { resolveHookDeliver, } from "./hooks.js"; import { 
sendGatewayAuthFailure, setDefaultSecurityHeaders } from "./http-common.js"; +import { getBearerToken } from "./http-utils.js"; import { handleOpenAiHttpRequest } from "./openai-http.js"; import { handleOpenResponsesHttpRequest } from "./openresponses-http.js"; import { @@ -59,6 +65,7 @@ import { type PluginHttpRequestHandler, type PluginRoutePathContext, } from "./server/plugins-http.js"; +import type { ReadinessChecker } from "./server/readiness.js"; import type { GatewayWsClient } from "./server/ws-types.js"; import { handleToolsInvokeHttpRequest } from "./tools-invoke-http.js"; @@ -150,11 +157,39 @@ function shouldEnforceDefaultPluginGatewayAuth(pathContext: PluginRoutePathConte ); } -function handleGatewayProbeRequest( +async function canRevealReadinessDetails(params: { + req: IncomingMessage; + resolvedAuth: ResolvedGatewayAuth; + trustedProxies: string[]; + allowRealIpFallback: boolean; +}): Promise { + if (isLocalDirectRequest(params.req, params.trustedProxies, params.allowRealIpFallback)) { + return true; + } + if (params.resolvedAuth.mode === "none") { + return false; + } + + const bearerToken = getBearerToken(params.req); + const authResult = await authorizeHttpGatewayConnect({ + auth: params.resolvedAuth, + connectAuth: bearerToken ? 
{ token: bearerToken, password: bearerToken } : null, + req: params.req, + trustedProxies: params.trustedProxies, + allowRealIpFallback: params.allowRealIpFallback, + }); + return authResult.ok; +} + +async function handleGatewayProbeRequest( req: IncomingMessage, res: ServerResponse, requestPath: string, -): boolean { + resolvedAuth: ResolvedGatewayAuth, + trustedProxies: string[], + allowRealIpFallback: boolean, + getReadiness?: ReadinessChecker, +): Promise { const status = GATEWAY_PROBE_STATUS_BY_PATH.get(requestPath); if (!status) { return false; @@ -169,14 +204,34 @@ function handleGatewayProbeRequest( return true; } - res.statusCode = 200; res.setHeader("Content-Type", "application/json; charset=utf-8"); res.setHeader("Cache-Control", "no-store"); - if (method === "HEAD") { - res.end(); - return true; + + let statusCode: number; + let body: string; + if (status === "ready" && getReadiness) { + const includeDetails = await canRevealReadinessDetails({ + req, + resolvedAuth, + trustedProxies, + allowRealIpFallback, + }); + try { + const result = getReadiness(); + statusCode = result.ready ? 200 : 503; + body = JSON.stringify(includeDetails ? result : { ready: result.ready }); + } catch { + statusCode = 503; + body = JSON.stringify( + includeDetails ? { ready: false, failing: ["internal"], uptimeMs: 0 } : { ready: false }, + ); + } + } else { + statusCode = 200; + body = JSON.stringify({ ok: true, status }); } - res.end(JSON.stringify({ ok: true, status })); + res.statusCode = statusCode; + res.end(method === "HEAD" ? undefined : body); return true; } @@ -519,6 +574,7 @@ export function createGatewayHttpServer(opts: { resolvedAuth: ResolvedGatewayAuth; /** Optional rate limiter for auth brute-force protection. 
*/ rateLimiter?: AuthRateLimiter; + getReadiness?: ReadinessChecker; tlsOptions?: TlsOptions; }): HttpServer { const { @@ -537,6 +593,7 @@ export function createGatewayHttpServer(opts: { shouldEnforcePluginGatewayAuth, resolvedAuth, rateLimiter, + getReadiness, } = opts; const httpServer: HttpServer = opts.tlsOptions ? createHttpsServer(opts.tlsOptions, (req, res) => { @@ -693,7 +750,16 @@ export function createGatewayHttpServer(opts: { requestStages.push({ name: "gateway-probes", - run: () => handleGatewayProbeRequest(req, res, requestPath), + run: () => + handleGatewayProbeRequest( + req, + res, + requestPath, + resolvedAuth, + trustedProxies, + allowRealIpFallback, + getReadiness, + ), }); if (await runGatewayHttpRequestStages(requestStages)) { diff --git a/src/gateway/server-runtime-state.ts b/src/gateway/server-runtime-state.ts index 9054b3a2a3f..5733f3671e4 100644 --- a/src/gateway/server-runtime-state.ts +++ b/src/gateway/server-runtime-state.ts @@ -32,6 +32,7 @@ import { shouldEnforceGatewayAuthForPluginPath, type PluginRoutePathContext, } from "./server/plugins-http.js"; +import type { ReadinessChecker } from "./server/readiness.js"; import type { GatewayTlsRuntime } from "./server/tls.js"; import type { GatewayWsClient } from "./server/ws-types.js"; @@ -61,6 +62,7 @@ export async function createGatewayRuntimeState(params: { log: { info: (msg: string) => void; warn: (msg: string) => void }; logHooks: ReturnType; logPlugins: ReturnType; + getReadiness?: ReadinessChecker; }): Promise<{ canvasHost: CanvasHostHandler | null; httpServer: HttpServer; @@ -156,6 +158,7 @@ export async function createGatewayRuntimeState(params: { shouldEnforcePluginGatewayAuth, resolvedAuth: params.resolvedAuth, rateLimiter: params.rateLimiter, + getReadiness: params.getReadiness, tlsOptions: params.gatewayTls?.enabled ? 
params.gatewayTls.tlsOptions : undefined, }); try { diff --git a/src/gateway/server.impl.ts b/src/gateway/server.impl.ts index efb95e7a7cf..e9c83156260 100644 --- a/src/gateway/server.impl.ts +++ b/src/gateway/server.impl.ts @@ -106,6 +106,7 @@ import { incrementPresenceVersion, refreshGatewayHealthSnapshot, } from "./server/health-state.js"; +import { createReadinessChecker } from "./server/readiness.js"; import { loadGatewayTlsRuntime } from "./server/tls.js"; import { ensureGatewayStartupAuth, @@ -546,6 +547,17 @@ export async function startGatewayServer( if (cfgAtStart.gateway?.tls?.enabled && !gatewayTls.enabled) { throw new Error(gatewayTls.error ?? "gateway tls: failed to enable"); } + const serverStartedAt = Date.now(); + const channelManager = createChannelManager({ + loadConfig, + channelLogs, + channelRuntimeEnvs, + channelRuntime: createPluginRuntime().channel, + }); + const getReadiness = createReadinessChecker({ + channelManager, + startedAt: serverStartedAt, + }); const { canvasHost, httpServer, @@ -589,6 +601,7 @@ export async function startGatewayServer( log, logHooks, logPlugins, + getReadiness, }); let bonjourStop: (() => Promise) | null = null; const nodeRegistry = new NodeRegistry(); @@ -618,12 +631,6 @@ export async function startGatewayServer( }); let { cron, storePath: cronStorePath } = cronState; - const channelManager = createChannelManager({ - loadConfig, - channelLogs, - channelRuntimeEnvs, - channelRuntime: createPluginRuntime().channel, - }); const { getRuntimeSnapshot, startChannels, startChannel, stopChannel, markChannelLoggedOut } = channelManager; diff --git a/src/gateway/server/readiness.test.ts b/src/gateway/server/readiness.test.ts new file mode 100644 index 00000000000..c41f8d050f2 --- /dev/null +++ b/src/gateway/server/readiness.test.ts @@ -0,0 +1,202 @@ +import { describe, expect, it, vi } from "vitest"; +import type { ChannelId } from "../../channels/plugins/index.js"; +import type { ChannelAccountSnapshot } from 
"../../channels/plugins/types.js"; +import type { ChannelManager, ChannelRuntimeSnapshot } from "../server-channels.js"; +import { createReadinessChecker } from "./readiness.js"; + +function snapshotWith( + accounts: Record>, +): ChannelRuntimeSnapshot { + const channels: ChannelRuntimeSnapshot["channels"] = {}; + const channelAccounts: ChannelRuntimeSnapshot["channelAccounts"] = {}; + + for (const [channelId, accountSnapshot] of Object.entries(accounts)) { + const resolved = { accountId: "default", ...accountSnapshot } as ChannelAccountSnapshot; + channels[channelId as ChannelId] = resolved; + channelAccounts[channelId as ChannelId] = { default: resolved }; + } + + return { channels, channelAccounts }; +} + +function createManager(snapshot: ChannelRuntimeSnapshot): ChannelManager { + return { + getRuntimeSnapshot: vi.fn(() => snapshot), + startChannels: vi.fn(), + startChannel: vi.fn(), + stopChannel: vi.fn(), + markChannelLoggedOut: vi.fn(), + isManuallyStopped: vi.fn(() => false), + resetRestartAttempts: vi.fn(), + }; +} + +describe("createReadinessChecker", () => { + it("reports ready when all managed channels are healthy", () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date("2026-03-06T12:00:00Z")); + const startedAt = Date.now() - 5 * 60_000; + const manager = createManager( + snapshotWith({ + discord: { + running: true, + connected: true, + enabled: true, + configured: true, + lastStartAt: startedAt, + lastEventAt: Date.now() - 1_000, + }, + }), + ); + + const readiness = createReadinessChecker({ channelManager: manager, startedAt }); + expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 300_000 }); + vi.useRealTimers(); + }); + + it("ignores disabled and unconfigured channels", () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date("2026-03-06T12:00:00Z")); + const startedAt = Date.now() - 5 * 60_000; + const manager = createManager( + snapshotWith({ + discord: { + running: false, + enabled: false, + configured: true, + lastStartAt: 
startedAt, + }, + telegram: { + running: false, + enabled: true, + configured: false, + lastStartAt: startedAt, + }, + }), + ); + + const readiness = createReadinessChecker({ channelManager: manager, startedAt }); + expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 300_000 }); + vi.useRealTimers(); + }); + + it("uses startup grace before marking disconnected channels not ready", () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date("2026-03-06T12:00:00Z")); + const startedAt = Date.now() - 30_000; + const manager = createManager( + snapshotWith({ + discord: { + running: true, + connected: false, + enabled: true, + configured: true, + lastStartAt: startedAt, + }, + }), + ); + + const readiness = createReadinessChecker({ channelManager: manager, startedAt }); + expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 30_000 }); + vi.useRealTimers(); + }); + + it("reports disconnected managed channels after startup grace", () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date("2026-03-06T12:00:00Z")); + const startedAt = Date.now() - 5 * 60_000; + const manager = createManager( + snapshotWith({ + discord: { + running: true, + connected: false, + enabled: true, + configured: true, + lastStartAt: startedAt, + }, + }), + ); + + const readiness = createReadinessChecker({ channelManager: manager, startedAt }); + expect(readiness()).toEqual({ ready: false, failing: ["discord"], uptimeMs: 300_000 }); + vi.useRealTimers(); + }); + + it("keeps restart-pending channels ready during reconnect backoff", () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date("2026-03-06T12:00:00Z")); + const startedAt = Date.now() - 5 * 60_000; + const manager = createManager( + snapshotWith({ + discord: { + running: false, + restartPending: true, + reconnectAttempts: 3, + enabled: true, + configured: true, + lastStartAt: startedAt - 30_000, + lastStopAt: Date.now() - 5_000, + }, + }), + ); + + const readiness = createReadinessChecker({ channelManager: manager, 
startedAt }); + expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 300_000 }); + vi.useRealTimers(); + }); + + it("treats stale-socket channels as ready to avoid pulling healthy idle pods", () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date("2026-03-06T12:00:00Z")); + const startedAt = Date.now() - 31 * 60_000; + const manager = createManager( + snapshotWith({ + discord: { + running: true, + connected: true, + enabled: true, + configured: true, + lastStartAt: startedAt, + lastEventAt: Date.now() - 31 * 60_000, + }, + }), + ); + + const readiness = createReadinessChecker({ channelManager: manager, startedAt }); + expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 1_860_000 }); + vi.useRealTimers(); + }); + + it("caches readiness snapshots briefly to keep repeated probes cheap", () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date("2026-03-06T12:00:00Z")); + const startedAt = Date.now() - 5 * 60_000; + const manager = createManager( + snapshotWith({ + discord: { + running: true, + connected: true, + enabled: true, + configured: true, + lastStartAt: startedAt, + lastEventAt: Date.now() - 1_000, + }, + }), + ); + + const readiness = createReadinessChecker({ + channelManager: manager, + startedAt, + cacheTtlMs: 1_000, + }); + expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 300_000 }); + vi.advanceTimersByTime(500); + expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 300_500 }); + expect(manager.getRuntimeSnapshot).toHaveBeenCalledTimes(1); + + vi.advanceTimersByTime(600); + expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 301_100 }); + expect(manager.getRuntimeSnapshot).toHaveBeenCalledTimes(2); + vi.useRealTimers(); + }); +}); diff --git a/src/gateway/server/readiness.ts b/src/gateway/server/readiness.ts new file mode 100644 index 00000000000..e6ad2d92afb --- /dev/null +++ b/src/gateway/server/readiness.ts @@ -0,0 +1,79 @@ +import type { ChannelAccountSnapshot } from 
"../../channels/plugins/types.js"; +import { + DEFAULT_CHANNEL_CONNECT_GRACE_MS, + DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS, + evaluateChannelHealth, + type ChannelHealthPolicy, + type ChannelHealthEvaluation, +} from "../channel-health-policy.js"; +import type { ChannelManager } from "../server-channels.js"; + +export type ReadinessResult = { + ready: boolean; + failing: string[]; + uptimeMs: number; +}; + +export type ReadinessChecker = () => ReadinessResult; + +const DEFAULT_READINESS_CACHE_TTL_MS = 1_000; + +function shouldIgnoreReadinessFailure( + accountSnapshot: ChannelAccountSnapshot, + health: ChannelHealthEvaluation, +): boolean { + if (health.reason === "unmanaged" || health.reason === "stale-socket") { + return true; + } + // Channel restarts spend time in backoff with running=false before the next + // lifecycle re-enters startup grace. Keep readiness green during that handoff + // window, but still surface hard failures once restart attempts are exhausted. + return health.reason === "not-running" && accountSnapshot.restartPending === true; +} + +export function createReadinessChecker(deps: { + channelManager: ChannelManager; + startedAt: number; + cacheTtlMs?: number; +}): ReadinessChecker { + const { channelManager, startedAt } = deps; + const cacheTtlMs = Math.max(0, deps.cacheTtlMs ?? 
DEFAULT_READINESS_CACHE_TTL_MS); + let cachedAt = 0; + let cachedState: Omit<ReadinessResult, "uptimeMs"> | null = null; + + return (): ReadinessResult => { + const now = Date.now(); + const uptimeMs = now - startedAt; + if (cachedState && now - cachedAt < cacheTtlMs) { + return { ...cachedState, uptimeMs }; + } + + const snapshot = channelManager.getRuntimeSnapshot(); + const failing: string[] = []; + const policy: ChannelHealthPolicy = { + now, + staleEventThresholdMs: DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS, + channelConnectGraceMs: DEFAULT_CHANNEL_CONNECT_GRACE_MS, + }; + + for (const [channelId, accounts] of Object.entries(snapshot.channelAccounts)) { + if (!accounts) { + continue; + } + for (const accountSnapshot of Object.values(accounts)) { + if (!accountSnapshot) { + continue; + } + const health = evaluateChannelHealth(accountSnapshot, policy); + if (!health.healthy && !shouldIgnoreReadinessFailure(accountSnapshot, health)) { + failing.push(channelId); + break; + } + } + } + + cachedAt = now; + cachedState = { ready: failing.length === 0, failing }; + return { ...cachedState, uptimeMs }; + }; +} From 7ce79c8972429d7bc94326662998c3c47f52c4fa Mon Sep 17 00:00:00 2001 From: AngryBird <48046333+angrybirddd@users.noreply.github.com> Date: Sat, 7 Mar 2026 05:22:19 +0800 Subject: [PATCH 05/16] docs: fix broken dashboard image on i18n pages (#38031) The dashboard screenshot uses a relative path `src="whatsapp-openclaw.jpg"` which resolves correctly on the English root page but produces 404 on zh-CN and ja-JP pages because Mintlify prepends the language subdirectory to the CDN path. Change to absolute path `/whatsapp-openclaw.jpg` in all three index files, consistent with other images on the same page that already use absolute paths (e.g. `/assets/openclaw-logo-text-dark.png`). 
--- docs/index.md | 2 +- docs/ja-JP/index.md | 2 +- docs/zh-CN/index.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/index.md b/docs/index.md index 606ff4828e5..2821cb1c84f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -124,7 +124,7 @@ Open the browser Control UI after the Gateway starts. - Remote access: [Web surfaces](/web) and [Tailscale](/gateway/tailscale)

- <img src="whatsapp-openclaw.jpg" alt="OpenClaw" /> + <img src="/whatsapp-openclaw.jpg" alt="OpenClaw" />

## Configuration (optional) diff --git a/docs/ja-JP/index.md b/docs/ja-JP/index.md index 63d83d74ab2..a47280c8dc2 100644 --- a/docs/ja-JP/index.md +++ b/docs/ja-JP/index.md @@ -118,7 +118,7 @@ Gatewayの起動後、ブラウザでControl UIを開きます。 - リモートアクセス: [Webサーフェス](/web)および[Tailscale](/gateway/tailscale)

- <img src="whatsapp-openclaw.jpg" alt="OpenClaw" /> + <img src="/whatsapp-openclaw.jpg" alt="OpenClaw" />

## 設定(オプション) diff --git a/docs/zh-CN/index.md b/docs/zh-CN/index.md index 65d2db9ea83..3999dc6fda4 100644 --- a/docs/zh-CN/index.md +++ b/docs/zh-CN/index.md @@ -118,7 +118,7 @@ Gateway 网关启动后,打开浏览器控制界面。 - 远程访问:[Web 界面](/web)和 [Tailscale](/gateway/tailscale)

- <img src="whatsapp-openclaw.jpg" alt="OpenClaw" /> + <img src="/whatsapp-openclaw.jpg" alt="OpenClaw" />

## 配置(可选) From 20db7afd5f2bf55fc23e97c4ad81ea03cd9e7f9c Mon Sep 17 00:00:00 2001 From: Anton Eicher <54324760+ant1eicher@users.noreply.github.com> Date: Fri, 6 Mar 2026 23:50:34 +0200 Subject: [PATCH 06/16] fix(feishu): remove invalid timeout properties from SDK method calls (#38267) The `timeout` property is not part of the Lark SDK method signatures, causing TS2353 errors. The client-level `httpTimeoutMs` already applies the timeout to all requests. Co-authored-by: Claude Opus 4.6 --- extensions/feishu/src/media.test.ts | 11 ++--------- extensions/feishu/src/media.ts | 4 ---- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/extensions/feishu/src/media.test.ts b/extensions/feishu/src/media.test.ts index 122b4477809..7fd5a0a7bd8 100644 --- a/extensions/feishu/src/media.test.ts +++ b/extensions/feishu/src/media.test.ts @@ -182,7 +182,7 @@ describe("sendMediaFeishu msg_type routing", () => { ); }); - it("uses image upload timeout override for image media", async () => { + it("uploads image for image media", async () => { await sendMediaFeishu({ cfg: {} as any, to: "user:ou_target", @@ -190,11 +190,7 @@ describe("sendMediaFeishu msg_type routing", () => { fileName: "photo.png", }); - expect(imageCreateMock).toHaveBeenCalledWith( - expect.objectContaining({ - timeout: 120_000, - }), - ); + expect(imageCreateMock).toHaveBeenCalled(); expect(messageCreateMock).toHaveBeenCalledWith( expect.objectContaining({ data: expect.objectContaining({ msg_type: "image" }), @@ -320,7 +316,6 @@ describe("sendMediaFeishu msg_type routing", () => { expect(imageGetMock).toHaveBeenCalledWith( expect.objectContaining({ path: { image_key: imageKey }, - timeout: 120_000, }), ); expect(result.buffer).toEqual(Buffer.from("image-data")); @@ -512,7 +507,6 @@ describe("downloadMessageResourceFeishu", () => { expect.objectContaining({ path: { message_id: "om_audio_msg", file_key: "file_key_audio" }, params: { type: "file" }, - timeout: 120_000, }), ); 
expect(result.buffer).toBeInstanceOf(Buffer); @@ -532,7 +526,6 @@ describe("downloadMessageResourceFeishu", () => { expect.objectContaining({ path: { message_id: "om_img_msg", file_key: "img_key_1" }, params: { type: "image" }, - timeout: 120_000, }), ); expect(result.buffer).toBeInstanceOf(Buffer); diff --git a/extensions/feishu/src/media.ts b/extensions/feishu/src/media.ts index 6d9f821c602..4aba038b4a9 100644 --- a/extensions/feishu/src/media.ts +++ b/extensions/feishu/src/media.ts @@ -106,7 +106,6 @@ export async function downloadImageFeishu(params: { const response = await client.im.image.get({ path: { image_key: normalizedImageKey }, - timeout: FEISHU_MEDIA_HTTP_TIMEOUT_MS, }); const buffer = await readFeishuResponseBuffer({ @@ -146,7 +145,6 @@ export async function downloadMessageResourceFeishu(params: { const response = await client.im.messageResource.get({ path: { message_id: messageId, file_key: normalizedFileKey }, params: { type }, - timeout: FEISHU_MEDIA_HTTP_TIMEOUT_MS, }); const buffer = await readFeishuResponseBuffer({ @@ -202,7 +200,6 @@ export async function uploadImageFeishu(params: { // eslint-disable-next-line @typescript-eslint/no-explicit-any -- SDK accepts Buffer or ReadStream image: imageData as any, }, - timeout: FEISHU_MEDIA_HTTP_TIMEOUT_MS, }); // SDK v1.30+ returns data directly without code wrapper on success @@ -277,7 +274,6 @@ export async function uploadFileFeishu(params: { file: fileData as any, ...(duration !== undefined && { duration }), }, - timeout: FEISHU_MEDIA_HTTP_TIMEOUT_MS, }); // SDK v1.30+ returns data directly without code wrapper on success From 864a1ecae7d0df2fb16b911bbb255001219ce4fc Mon Sep 17 00:00:00 2001 From: Shadow Date: Fri, 6 Mar 2026 15:53:10 -0600 Subject: [PATCH 07/16] docs: add changelog entry for Feishu timeouts (#38356) --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 99c04190526..bb22e361d2d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -208,6 
+208,7 @@ Docs: https://docs.openclaw.ai - Windows/Plugin install: when OpenClaw runs on Windows via Bun and `npm-cli.js` is not colocated with the runtime binary, fall back to `npm.cmd`/`npx.cmd` through the existing `cmd.exe` wrapper so `openclaw plugins install` no longer fails with `spawn EINVAL`. (#38056) Thanks @0xlin2023. - Telegram/send retry classification: retry grammY `Network request ... failed after N attempts` envelopes in send flows without reclassifying plain `Network request ... failed!` wrappers as transient, restoring the intended retry path while keeping broad send-context message matching tight. (#38056) Thanks @0xlin2023. - Gateway/probes: keep `/health`, `/healthz`, `/ready`, and `/readyz` reachable when the Control UI is mounted at `/`, preserve plugin-owned route precedence on those paths, and make `/ready` and `/readyz` report channel-backed readiness with startup grace plus `503` on disconnected managed channels, while `/health` and `/healthz` stay shallow liveness probes. (#18446) Thanks @vibecodooor, @mahsumaktas, and @vincentkoc. +- Feishu/media downloads: drop invalid timeout fields from SDK method calls now that client-level `httpTimeoutMs` applies to requests. (#38267) Thanks @ant1eicher and @thewilloftheshadow. 
## 2026.3.2 From c301c5d08345f532e00de6b47aeecb64f25ac2c5 Mon Sep 17 00:00:00 2001 From: Shadow Date: Fri, 6 Mar 2026 15:53:59 -0600 Subject: [PATCH 08/16] fix: add no-ci-pr auto-response label --- .github/workflows/auto-response.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/auto-response.yml b/.github/workflows/auto-response.yml index 5f20a699944..8ef4f407f44 100644 --- a/.github/workflows/auto-response.yml +++ b/.github/workflows/auto-response.yml @@ -49,6 +49,14 @@ jobs: message: "Please use [our support server](https://discord.gg/clawd) and ask in #help or #users-helping-users to resolve this, or follow the stuck FAQ at https://docs.openclaw.ai/help/faq#im-stuck-whats-the-fastest-way-to-get-unstuck.", }, + { + label: "r: no-ci-pr", + message: `Please don't make PRs for test failures on main. + +The team is aware of those and will handle them directly on the codebase, not only fixing the tests but also investigating what the root cause is. Having to sift through test-fix-PRs (including some that have been out of date for weeks...) on top of that doesn't help. There are already way too many PRs for humans to manage; please don't make the flood worse. + +Thank you.`, + }, { label: "r: too-many-prs", close: true, From 91494b259690e274174fe9620b451a2d6ba0a718 Mon Sep 17 00:00:00 2001 From: Shadow Date: Fri, 6 Mar 2026 16:24:50 -0600 Subject: [PATCH 09/16] fix: repair auto-response workflow YAML --- .github/workflows/auto-response.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/auto-response.yml b/.github/workflows/auto-response.yml index 8ef4f407f44..8fb76b99b9e 100644 --- a/.github/workflows/auto-response.yml +++ b/.github/workflows/auto-response.yml @@ -51,11 +51,10 @@ jobs: }, { label: "r: no-ci-pr", - message: `Please don't make PRs for test failures on main. 
- -The team is aware of those and will handle them directly on the codebase, not only fixing the tests but also investigating what the root cause is. Having to sift through test-fix-PRs (including some that have been out of date for weeks...) on top of that doesn't help. There are already way too many PRs for humans to manage; please don't make the flood worse. - -Thank you.`, + message: + "Please don't make PRs for test failures on main.\n\n" + + "The team is aware of those and will handle them directly on the codebase, not only fixing the tests but also investigating what the root cause is. Having to sift through test-fix-PRs (including some that have been out of date for weeks...) on top of that doesn't help. There are already way too many PRs for humans to manage; please don't make the flood worse.\n\n" + + "Thank you.", }, { label: "r: too-many-prs", From e601bf2d8ef41a4c799c6742e7a9cd875e83e9f7 Mon Sep 17 00:00:00 2001 From: Wei Zhou Date: Sat, 7 Mar 2026 06:31:15 +0800 Subject: [PATCH 10/16] fix(pi-embedded-runner): propagate sender identity to fix Feishu doc create auto-grant (#32915) Merged via squash. Prepared head SHA: efb229307559ad37062b454da444567f5dca8a96 Co-authored-by: cszhouwei <1811726+cszhouwei@users.noreply.github.com> Co-authored-by: jalehman <550978+jalehman@users.noreply.github.com> Reviewed-by: @jalehman --- CHANGELOG.md | 2 ++ src/agents/pi-embedded-runner/run.ts | 4 +++ .../usage-reporting.test.ts | 34 +++++++++++++++++++ 3 files changed, 40 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bb22e361d2d..60ec0602041 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -209,6 +209,7 @@ Docs: https://docs.openclaw.ai - Telegram/send retry classification: retry grammY `Network request ... failed after N attempts` envelopes in send flows without reclassifying plain `Network request ... failed!` wrappers as transient, restoring the intended retry path while keeping broad send-context message matching tight. (#38056) Thanks @0xlin2023. 
- Gateway/probes: keep `/health`, `/healthz`, `/ready`, and `/readyz` reachable when the Control UI is mounted at `/`, preserve plugin-owned route precedence on those paths, and make `/ready` and `/readyz` report channel-backed readiness with startup grace plus `503` on disconnected managed channels, while `/health` and `/healthz` stay shallow liveness probes. (#18446) Thanks @vibecodooor, @mahsumaktas, and @vincentkoc. - Feishu/media downloads: drop invalid timeout fields from SDK method calls now that client-level `httpTimeoutMs` applies to requests. (#38267) Thanks @ant1eicher and @thewilloftheshadow. +- PI embedded runner/Feishu docs: propagate sender identity into embedded attempts so Feishu doc auto-grant restores requester access for embedded-runner executions. (#32915) thanks @cszhouwei. ## 2026.3.2 @@ -2880,6 +2881,7 @@ Docs: https://docs.openclaw.ai - BlueBubbles: resolve short message IDs safely and expose full IDs in templates. (#1387) Thanks @tyler6204. - Infra: preserve fetch helper methods when wrapping abort signals. (#1387) - macOS: default distribution packaging to universal binaries. (#1396) Thanks @JustYannicc. +- Embedded runner: forward sender identity into attempt execution so Feishu doc auto-grant receives requester context again. (#32915) Thanks @cszhouwei. 
## 2026.1.20 diff --git a/src/agents/pi-embedded-runner/run.ts b/src/agents/pi-embedded-runner/run.ts index 52faf8514b7..11be807e120 100644 --- a/src/agents/pi-embedded-runner/run.ts +++ b/src/agents/pi-embedded-runner/run.ts @@ -803,6 +803,10 @@ export async function runEmbeddedPiAgent( groupChannel: params.groupChannel, groupSpace: params.groupSpace, spawnedBy: params.spawnedBy, + senderId: params.senderId, + senderName: params.senderName, + senderUsername: params.senderUsername, + senderE164: params.senderE164, senderIsOwner: params.senderIsOwner, currentChannelId: params.currentChannelId, currentThreadTs: params.currentThreadTs, diff --git a/src/agents/pi-embedded-runner/usage-reporting.test.ts b/src/agents/pi-embedded-runner/usage-reporting.test.ts index ed8d1227225..f4d6f5cbe44 100644 --- a/src/agents/pi-embedded-runner/usage-reporting.test.ts +++ b/src/agents/pi-embedded-runner/usage-reporting.test.ts @@ -10,6 +10,40 @@ describe("runEmbeddedPiAgent usage reporting", () => { vi.clearAllMocks(); }); + it("forwards sender identity fields into embedded attempts", async () => { + mockedRunEmbeddedAttempt.mockResolvedValueOnce({ + aborted: false, + promptError: null, + timedOut: false, + sessionIdUsed: "test-session", + assistantTexts: ["Response 1"], + // eslint-disable-next-line @typescript-eslint/no-explicit-any + } as any); + + await runEmbeddedPiAgent({ + sessionId: "test-session", + sessionKey: "test-key", + sessionFile: "/tmp/session.json", + workspaceDir: "/tmp/workspace", + prompt: "hello", + timeoutMs: 30000, + runId: "run-sender-forwarding", + senderId: "user-123", + senderName: "Josh Lehman", + senderUsername: "josh", + senderE164: "+15551234567", + }); + + expect(mockedRunEmbeddedAttempt).toHaveBeenCalledWith( + expect.objectContaining({ + senderId: "user-123", + senderName: "Josh Lehman", + senderUsername: "josh", + senderE164: "+15551234567", + }), + ); + }); + it("reports total usage from the last turn instead of accumulated total", async () => { 
// Simulate a multi-turn run result. // Turn 1: Input 100, Output 50. Total 150. From 110ca23bab2793a1dc89672425a670f73bdb1e0c Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 6 Mar 2026 17:33:38 -0500 Subject: [PATCH 11/16] Feishu: update media timeout tests --- extensions/feishu/src/media.test.ts | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/extensions/feishu/src/media.test.ts b/extensions/feishu/src/media.test.ts index 7fd5a0a7bd8..813e5090292 100644 --- a/extensions/feishu/src/media.test.ts +++ b/extensions/feishu/src/media.test.ts @@ -16,6 +16,8 @@ const messageCreateMock = vi.hoisted(() => vi.fn()); const messageResourceGetMock = vi.hoisted(() => vi.fn()); const messageReplyMock = vi.hoisted(() => vi.fn()); +const FEISHU_MEDIA_HTTP_TIMEOUT_MS = 120_000; + vi.mock("./client.js", () => ({ createFeishuClient: createFeishuClientMock, })); @@ -54,6 +56,14 @@ function expectPathIsolatedToTmpRoot(pathValue: string, key: string): void { expect(rel === ".." 
|| rel.startsWith(`..${path.sep}`)).toBe(false); } +function expectMediaTimeoutClientConfigured(): void { + expect(createFeishuClientMock).toHaveBeenCalledWith( + expect.objectContaining({ + httpTimeoutMs: FEISHU_MEDIA_HTTP_TIMEOUT_MS, + }), + ); +} + describe("sendMediaFeishu msg_type routing", () => { beforeEach(() => { vi.clearAllMocks(); @@ -182,7 +192,7 @@ describe("sendMediaFeishu msg_type routing", () => { ); }); - it("uploads image for image media", async () => { + it("configures the media client timeout for image uploads", async () => { await sendMediaFeishu({ cfg: {} as any, to: "user:ou_target", @@ -190,7 +200,7 @@ describe("sendMediaFeishu msg_type routing", () => { fileName: "photo.png", }); - expect(imageCreateMock).toHaveBeenCalled(); + expectMediaTimeoutClientConfigured(); expect(messageCreateMock).toHaveBeenCalledWith( expect.objectContaining({ data: expect.objectContaining({ msg_type: "image" }), @@ -318,6 +328,7 @@ describe("sendMediaFeishu msg_type routing", () => { path: { image_key: imageKey }, }), ); + expectMediaTimeoutClientConfigured(); expect(result.buffer).toEqual(Buffer.from("image-data")); expect(capturedPath).toBeDefined(); expectPathIsolatedToTmpRoot(capturedPath as string, imageKey); @@ -509,6 +520,7 @@ describe("downloadMessageResourceFeishu", () => { params: { type: "file" }, }), ); + expectMediaTimeoutClientConfigured(); expect(result.buffer).toBeInstanceOf(Buffer); }); @@ -528,6 +540,7 @@ describe("downloadMessageResourceFeishu", () => { params: { type: "image" }, }), ); + expectMediaTimeoutClientConfigured(); expect(result.buffer).toBeInstanceOf(Buffer); }); }); From 6e962d8b9e55f19c33f23a5a6973bcb7b7556589 Mon Sep 17 00:00:00 2001 From: Altay Date: Sat, 7 Mar 2026 01:42:11 +0300 Subject: [PATCH 12/16] fix(agents): handle overloaded failover separately (#38301) * fix(agents): skip auth-profile failure on overload * fix(agents): note overload auth-profile fallback fix * fix(agents): classify overloaded failures separately * 
fix(agents): back off before overload failover * fix(agents): tighten overload probe and backoff state * fix(agents): persist overloaded cooldown across runs * fix(agents): tighten overloaded status handling * test(agents): add overload regression coverage * fix(agents): restore runner imports after rebase * test(agents): add overload fallback integration coverage * fix(agents): harden overloaded failover abort handling * test(agents): tighten overload classifier coverage * test(agents): cover all-overloaded fallback exhaustion * fix(cron): retry overloaded fallback summaries * fix(cron): treat HTTP 529 as overloaded retry --- CHANGELOG.md | 1 + docs/automation/cron-jobs.md | 5 +- ...th-profiles.markauthprofilefailure.test.ts | 16 + src/agents/auth-profiles/types.ts | 1 + src/agents/auth-profiles/usage.test.ts | 18 + src/agents/auth-profiles/usage.ts | 5 +- src/agents/failover-error.test.ts | 31 +- src/agents/failover-error.ts | 2 + src/agents/model-fallback.probe.test.ts | 48 +- .../model-fallback.run-embedded.e2e.test.ts | 517 ++++++++++++++++++ src/agents/model-fallback.test.ts | 44 +- src/agents/model-fallback.ts | 12 +- ...dded-helpers.isbillingerrormessage.test.ts | 20 +- src/agents/pi-embedded-helpers/errors.ts | 33 +- src/agents/pi-embedded-helpers/types.ts | 1 + ...pi-agent.auth-profile-rotation.e2e.test.ts | 150 ++++- src/agents/pi-embedded-runner/run.ts | 82 ++- src/agents/pi-embedded-runner/run/params.ts | 6 +- .../reply/agent-runner-execution.ts | 2 +- src/auto-reply/reply/agent-runner-memory.ts | 2 +- src/auto-reply/reply/agent-runner-utils.ts | 4 +- .../agent-runner.runreplyagent.e2e.test.ts | 5 + src/auto-reply/reply/followup-runner.ts | 2 +- src/commands/agent.ts | 6 +- src/commands/models/list.probe.test.ts | 1 + src/commands/models/list.probe.ts | 2 +- src/config/config-misc.test.ts | 2 +- src/config/schema.help.ts | 4 +- src/config/types.cron.ts | 2 +- src/config/zod-schema.ts | 2 +- src/cron/isolated-agent/run.ts | 2 +- 
src/cron/service.issue-regressions.test.ts | 67 ++- src/cron/service/timer.ts | 2 + src/discord/monitor/auto-presence.test.ts | 20 + src/discord/monitor/auto-presence.ts | 1 + src/test-utils/model-fallback.mock.ts | 2 +- 36 files changed, 1036 insertions(+), 84 deletions(-) create mode 100644 src/agents/model-fallback.run-embedded.e2e.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 60ec0602041..1840fd3cde2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -121,6 +121,7 @@ Docs: https://docs.openclaw.ai - Security/auth labels: remove token and API-key snippets from user-facing auth status labels so `/status` and `/models` do not expose credential fragments. (#33262) thanks @cu1ch3n. - Auth/credential semantics: align profile eligibility + probe diagnostics with SecretRef/expiry rules and harden browser download atomic writes. (#33733) thanks @joshavant. - Security/audit denyCommands guidance: suggest likely exact node command IDs for unknown `gateway.nodes.denyCommands` entries so ineffective denylist entries are easier to correct. (#29713) thanks @liquidhorizon88-bot. +- Agents/overload failover handling: classify overloaded provider failures separately from rate limits/status timeouts, add short overload backoff before retry/failover, record overloaded prompt/assistant failures as transient auth-profile cooldowns (with probeable same-provider fallback) instead of treating them like persistent auth/billing failures, and keep one-shot cron retry classification aligned so overloaded fallback summaries still count as transient retries. - Docs/security hardening guidance: document Docker `DOCKER-USER` + UFW policy and add cross-linking from Docker install docs for VPS/public-host setups. (#27613) thanks @dorukardahan. - Docs/security threat-model links: replace relative `.md` links with Mintlify-compatible root-relative routes in security docs to prevent broken internal navigation. (#27698) thanks @clawdoo. 
- Plugins/Update integrity drift: avoid false integrity drift prompts when updating npm-installed plugins from unpinned specs, while keeping drift checks for exact pinned versions. (#37179) Thanks @vincentkoc. diff --git a/docs/automation/cron-jobs.md b/docs/automation/cron-jobs.md index 1421480a7a0..b0798898910 100644 --- a/docs/automation/cron-jobs.md +++ b/docs/automation/cron-jobs.md @@ -370,6 +370,7 @@ When a job fails, OpenClaw classifies errors as **transient** (retryable) or **p ### Transient errors (retried) - Rate limit (429, too many requests, resource exhausted) +- Provider overload (for example Anthropic `529 overloaded_error`, overload fallback summaries) - Network errors (timeout, ECONNRESET, fetch failed, socket) - Server errors (5xx) - Cloudflare-related errors @@ -407,7 +408,7 @@ Configure `cron.retry` to override these defaults (see [Configuration](/automati retry: { maxAttempts: 3, backoffMs: [60000, 120000, 300000], - retryOn: ["rate_limit", "network", "server_error"], + retryOn: ["rate_limit", "overloaded", "network", "server_error"], }, webhook: "https://example.invalid/legacy", // deprecated fallback for stored notify:true jobs webhookToken: "replace-with-dedicated-webhook-token", // optional bearer token for webhook mode @@ -665,7 +666,7 @@ openclaw system event --mode now --text "Next heartbeat: check battery." - OpenClaw applies exponential retry backoff for recurring jobs after consecutive errors: 30s, 1m, 5m, 15m, then 60m between retries. - Backoff resets automatically after the next successful run. -- One-shot (`at`) jobs retry transient errors (rate limit, network, server_error) up to 3 times with backoff; permanent errors disable immediately. See [Retry policy](/automation/cron-jobs#retry-policy). +- One-shot (`at`) jobs retry transient errors (rate limit, overloaded, network, server_error) up to 3 times with backoff; permanent errors disable immediately. See [Retry policy](/automation/cron-jobs#retry-policy). 
### Telegram delivers to the wrong place diff --git a/src/agents/auth-profiles.markauthprofilefailure.test.ts b/src/agents/auth-profiles.markauthprofilefailure.test.ts index 865fbf87816..e5690f75c6a 100644 --- a/src/agents/auth-profiles.markauthprofilefailure.test.ts +++ b/src/agents/auth-profiles.markauthprofilefailure.test.ts @@ -114,6 +114,22 @@ describe("markAuthProfileFailure", () => { expect(reloaded.usageStats?.["anthropic:default"]?.cooldownUntil).toBe(firstCooldownUntil); }); }); + it("records overloaded failures in the cooldown bucket", async () => { + await withAuthProfileStore(async ({ agentDir, store }) => { + await markAuthProfileFailure({ + store, + profileId: "anthropic:default", + reason: "overloaded", + agentDir, + }); + + const stats = store.usageStats?.["anthropic:default"]; + expect(typeof stats?.cooldownUntil).toBe("number"); + expect(stats?.disabledUntil).toBeUndefined(); + expect(stats?.disabledReason).toBeUndefined(); + expect(stats?.failureCounts?.overloaded).toBe(1); + }); + }); it("disables auth_permanent failures via disabledUntil (like billing)", async () => { await withAuthProfileStore(async ({ agentDir, store }) => { await markAuthProfileFailure({ diff --git a/src/agents/auth-profiles/types.ts b/src/agents/auth-profiles/types.ts index d01e7a07d68..127a444939b 100644 --- a/src/agents/auth-profiles/types.ts +++ b/src/agents/auth-profiles/types.ts @@ -39,6 +39,7 @@ export type AuthProfileFailureReason = | "auth" | "auth_permanent" | "format" + | "overloaded" | "rate_limit" | "billing" | "timeout" diff --git a/src/agents/auth-profiles/usage.test.ts b/src/agents/auth-profiles/usage.test.ts index 8c499654b49..ffd6ec2daa7 100644 --- a/src/agents/auth-profiles/usage.test.ts +++ b/src/agents/auth-profiles/usage.test.ts @@ -177,6 +177,24 @@ describe("resolveProfilesUnavailableReason", () => { ).toBe("auth"); }); + it("returns overloaded for active overloaded cooldown windows", () => { + const now = Date.now(); + const store = makeStore({ + 
"anthropic:default": { + cooldownUntil: now + 60_000, + failureCounts: { overloaded: 2, rate_limit: 1 }, + }, + }); + + expect( + resolveProfilesUnavailableReason({ + store, + profileIds: ["anthropic:default"], + now, + }), + ).toBe("overloaded"); + }); + it("falls back to rate_limit when active cooldown has no reason history", () => { const now = Date.now(); const store = makeStore({ diff --git a/src/agents/auth-profiles/usage.ts b/src/agents/auth-profiles/usage.ts index e78a36db28c..733a96e13c4 100644 --- a/src/agents/auth-profiles/usage.ts +++ b/src/agents/auth-profiles/usage.ts @@ -9,6 +9,7 @@ const FAILURE_REASON_PRIORITY: AuthProfileFailureReason[] = [ "billing", "format", "model_not_found", + "overloaded", "timeout", "rate_limit", "unknown", @@ -35,7 +36,7 @@ export function resolveProfileUnusableUntil( } /** - * Check if a profile is currently in cooldown (due to rate limiting or errors). + * Check if a profile is currently in cooldown (due to rate limits, overload, or other transient failures). */ export function isProfileInCooldown( store: AuthProfileStore, @@ -508,7 +509,7 @@ export async function markAuthProfileFailure(params: { } /** - * Mark a profile as failed/rate-limited. Applies exponential backoff cooldown. + * Mark a profile as transiently failed. Applies exponential backoff cooldown. * Cooldown times: 1min, 5min, 25min, max 1 hour. * Uses store lock to avoid overwriting concurrent usage updates. 
*/ diff --git a/src/agents/failover-error.test.ts b/src/agents/failover-error.test.ts index 60e7510e67e..f581dd0ede2 100644 --- a/src/agents/failover-error.test.ts +++ b/src/agents/failover-error.test.ts @@ -75,7 +75,7 @@ describe("failover-error", () => { expect(resolveFailoverReasonFromError({ status: 522 })).toBeNull(); expect(resolveFailoverReasonFromError({ status: 523 })).toBeNull(); expect(resolveFailoverReasonFromError({ status: 524 })).toBeNull(); - expect(resolveFailoverReasonFromError({ status: 529 })).toBe("rate_limit"); + expect(resolveFailoverReasonFromError({ status: 529 })).toBe("overloaded"); }); it("classifies documented provider error shapes at the error boundary", () => { @@ -90,7 +90,7 @@ describe("failover-error", () => { status: 529, message: ANTHROPIC_OVERLOADED_PAYLOAD, }), - ).toBe("rate_limit"); + ).toBe("overloaded"); expect( resolveFailoverReasonFromError({ status: 429, @@ -126,7 +126,22 @@ describe("failover-error", () => { status: 503, message: GROQ_SERVICE_UNAVAILABLE_MESSAGE, }), + ).toBe("overloaded"); + }); + + it("keeps status-only 503s conservative unless the payload is clearly overloaded", () => { + expect( + resolveFailoverReasonFromError({ + status: 503, + message: "Internal database error", + }), ).toBe("timeout"); + expect( + resolveFailoverReasonFromError({ + status: 503, + message: '{"error":{"message":"The model is overloaded. 
Please try later"}}', + }), + ).toBe("overloaded"); }); it("treats 400 insufficient_quota payloads as billing instead of format", () => { @@ -151,6 +166,14 @@ describe("failover-error", () => { ).toBe("rate_limit"); }); + it("treats overloaded provider payloads as overloaded", () => { + expect( + resolveFailoverReasonFromError({ + message: ANTHROPIC_OVERLOADED_PAYLOAD, + }), + ).toBe("overloaded"); + }); + it("keeps raw-text 402 weekly/monthly limit errors in billing", () => { expect( resolveFailoverReasonFromError({ @@ -221,6 +244,10 @@ describe("failover-error", () => { expect(err?.model).toBe("claude-opus-4-5"); }); + it("maps overloaded to a 503 fallback status", () => { + expect(resolveFailoverStatus("overloaded")).toBe(503); + }); + it("coerces format errors with a 400 status", () => { const err = coerceToFailoverError("invalid request format", { provider: "google", diff --git a/src/agents/failover-error.ts b/src/agents/failover-error.ts index 5c16d3508fd..a39685e1b16 100644 --- a/src/agents/failover-error.ts +++ b/src/agents/failover-error.ts @@ -49,6 +49,8 @@ export function resolveFailoverStatus(reason: FailoverReason): number | undefine return 402; case "rate_limit": return 429; + case "overloaded": + return 503; case "auth": return 401; case "auth_permanent": diff --git a/src/agents/model-fallback.probe.test.ts b/src/agents/model-fallback.probe.test.ts index f220646cf3d..8dafd6533da 100644 --- a/src/agents/model-fallback.probe.test.ts +++ b/src/agents/model-fallback.probe.test.ts @@ -53,7 +53,7 @@ function expectPrimaryProbeSuccess( expect(result.result).toBe(expectedResult); expect(run).toHaveBeenCalledTimes(1); expect(run).toHaveBeenCalledWith("openai", "gpt-4.1-mini", { - allowRateLimitCooldownProbe: true, + allowTransientCooldownProbe: true, }); } @@ -200,10 +200,48 @@ describe("runWithModelFallback – probe logic", () => { expect(result.result).toBe("fallback-ok"); expect(run).toHaveBeenCalledTimes(2); expect(run).toHaveBeenNthCalledWith(1, "openai", 
"gpt-4.1-mini", { - allowRateLimitCooldownProbe: true, + allowTransientCooldownProbe: true, }); expect(run).toHaveBeenNthCalledWith(2, "anthropic", "claude-haiku-3-5", { - allowRateLimitCooldownProbe: true, + allowTransientCooldownProbe: true, + }); + }); + + it("attempts non-primary fallbacks during overloaded cooldown after primary probe failure", async () => { + const cfg = makeCfg({ + agents: { + defaults: { + model: { + primary: "openai/gpt-4.1-mini", + fallbacks: ["anthropic/claude-haiku-3-5", "google/gemini-2-flash"], + }, + }, + }, + } as Partial); + + mockedIsProfileInCooldown.mockReturnValue(true); + mockedGetSoonestCooldownExpiry.mockReturnValue(NOW + 30 * 1000); + mockedResolveProfilesUnavailableReason.mockReturnValue("overloaded"); + + const run = vi + .fn() + .mockRejectedValueOnce(Object.assign(new Error("service overloaded"), { status: 503 })) + .mockResolvedValue("fallback-ok"); + + const result = await runWithModelFallback({ + cfg, + provider: "openai", + model: "gpt-4.1-mini", + run, + }); + + expect(result.result).toBe("fallback-ok"); + expect(run).toHaveBeenCalledTimes(2); + expect(run).toHaveBeenNthCalledWith(1, "openai", "gpt-4.1-mini", { + allowTransientCooldownProbe: true, + }); + expect(run).toHaveBeenNthCalledWith(2, "anthropic", "claude-haiku-3-5", { + allowTransientCooldownProbe: true, }); }); @@ -326,10 +364,10 @@ describe("runWithModelFallback – probe logic", () => { }); expect(run).toHaveBeenNthCalledWith(1, "openai", "gpt-4.1-mini", { - allowRateLimitCooldownProbe: true, + allowTransientCooldownProbe: true, }); expect(run).toHaveBeenNthCalledWith(2, "openai", "gpt-4.1-mini", { - allowRateLimitCooldownProbe: true, + allowTransientCooldownProbe: true, }); }); }); diff --git a/src/agents/model-fallback.run-embedded.e2e.test.ts b/src/agents/model-fallback.run-embedded.e2e.test.ts new file mode 100644 index 00000000000..61afb89c6bb --- /dev/null +++ b/src/agents/model-fallback.run-embedded.e2e.test.ts @@ -0,0 +1,517 @@ +import fs from 
"node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import type { AssistantMessage } from "@mariozechner/pi-ai"; +import { beforeAll, beforeEach, describe, expect, it, vi } from "vitest"; +import type { OpenClawConfig } from "../config/config.js"; +import type { AuthProfileFailureReason } from "./auth-profiles.js"; +import { runWithModelFallback } from "./model-fallback.js"; +import type { EmbeddedRunAttemptResult } from "./pi-embedded-runner/run/types.js"; + +const runEmbeddedAttemptMock = vi.fn<(params: unknown) => Promise>(); +const { computeBackoffMock, sleepWithAbortMock } = vi.hoisted(() => ({ + computeBackoffMock: vi.fn( + ( + _policy: { initialMs: number; maxMs: number; factor: number; jitter: number }, + _attempt: number, + ) => 321, + ), + sleepWithAbortMock: vi.fn(async (_ms: number, _abortSignal?: AbortSignal) => undefined), +})); + +vi.mock("./pi-embedded-runner/run/attempt.js", () => ({ + runEmbeddedAttempt: (params: unknown) => runEmbeddedAttemptMock(params), +})); + +vi.mock("../infra/backoff.js", () => ({ + computeBackoff: ( + policy: { initialMs: number; maxMs: number; factor: number; jitter: number }, + attempt: number, + ) => computeBackoffMock(policy, attempt), + sleepWithAbort: (ms: number, abortSignal?: AbortSignal) => sleepWithAbortMock(ms, abortSignal), +})); + +vi.mock("./models-config.js", async (importOriginal) => { + const mod = await importOriginal(); + return { + ...mod, + ensureOpenClawModelsJson: vi.fn(async () => ({ wrote: false })), + }; +}); + +let runEmbeddedPiAgent: typeof import("./pi-embedded-runner/run.js").runEmbeddedPiAgent; + +beforeAll(async () => { + ({ runEmbeddedPiAgent } = await import("./pi-embedded-runner/run.js")); +}); + +beforeEach(() => { + runEmbeddedAttemptMock.mockReset(); + computeBackoffMock.mockClear(); + sleepWithAbortMock.mockClear(); +}); + +const baseUsage = { + input: 0, + output: 0, + cacheRead: 0, + cacheWrite: 0, + totalTokens: 0, + cost: { input: 0, output: 0, 
cacheRead: 0, cacheWrite: 0, total: 0 }, +}; + +const OVERLOADED_ERROR_PAYLOAD = + '{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"}}'; + +const buildAssistant = (overrides: Partial): AssistantMessage => ({ + role: "assistant", + content: [], + api: "openai-responses", + provider: "openai", + model: "mock-1", + usage: baseUsage, + stopReason: "stop", + timestamp: Date.now(), + ...overrides, +}); + +const makeAttempt = (overrides: Partial): EmbeddedRunAttemptResult => ({ + aborted: false, + timedOut: false, + timedOutDuringCompaction: false, + promptError: null, + sessionIdUsed: "session:test", + systemPromptReport: undefined, + messagesSnapshot: [], + assistantTexts: [], + toolMetas: [], + lastAssistant: undefined, + didSendViaMessagingTool: false, + messagingToolSentTexts: [], + messagingToolSentMediaUrls: [], + messagingToolSentTargets: [], + cloudCodeAssistFormatError: false, + ...overrides, +}); + +function makeConfig(): OpenClawConfig { + return { + agents: { + defaults: { + model: { + primary: "openai/mock-1", + fallbacks: ["groq/mock-2"], + }, + }, + }, + models: { + providers: { + openai: { + api: "openai-responses", + apiKey: "sk-openai", + baseUrl: "https://example.com/openai", + models: [ + { + id: "mock-1", + name: "Mock 1", + reasoning: false, + input: ["text"], + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, + contextWindow: 16_000, + maxTokens: 2048, + }, + ], + }, + groq: { + api: "openai-responses", + apiKey: "sk-groq", + baseUrl: "https://example.com/groq", + models: [ + { + id: "mock-2", + name: "Mock 2", + reasoning: false, + input: ["text"], + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, + contextWindow: 16_000, + maxTokens: 2048, + }, + ], + }, + }, + }, + } satisfies OpenClawConfig; +} + +async function withAgentWorkspace( + fn: (ctx: { agentDir: string; workspaceDir: string }) => Promise, +): Promise { + const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-model-fallback-")); + 
const agentDir = path.join(root, "agent"); + const workspaceDir = path.join(root, "workspace"); + await fs.mkdir(agentDir, { recursive: true }); + await fs.mkdir(workspaceDir, { recursive: true }); + try { + return await fn({ agentDir, workspaceDir }); + } finally { + await fs.rm(root, { recursive: true, force: true }); + } +} + +async function writeAuthStore( + agentDir: string, + usageStats?: Record< + string, + { + lastUsed?: number; + cooldownUntil?: number; + disabledUntil?: number; + disabledReason?: AuthProfileFailureReason; + failureCounts?: Partial>; + } + >, +) { + await fs.writeFile( + path.join(agentDir, "auth-profiles.json"), + JSON.stringify({ + version: 1, + profiles: { + "openai:p1": { type: "api_key", provider: "openai", key: "sk-openai" }, + "groq:p1": { type: "api_key", provider: "groq", key: "sk-groq" }, + }, + usageStats: + usageStats ?? + ({ + "openai:p1": { lastUsed: 1 }, + "groq:p1": { lastUsed: 2 }, + } as const), + }), + ); +} + +async function readUsageStats(agentDir: string) { + const raw = await fs.readFile(path.join(agentDir, "auth-profiles.json"), "utf-8"); + return JSON.parse(raw).usageStats as Record | undefined>; +} + +async function runEmbeddedFallback(params: { + agentDir: string; + workspaceDir: string; + sessionKey: string; + runId: string; + abortSignal?: AbortSignal; +}) { + const cfg = makeConfig(); + return await runWithModelFallback({ + cfg, + provider: "openai", + model: "mock-1", + agentDir: params.agentDir, + run: (provider, model, options) => + runEmbeddedPiAgent({ + sessionId: `session:${params.runId}`, + sessionKey: params.sessionKey, + sessionFile: path.join(params.workspaceDir, `${params.runId}.jsonl`), + workspaceDir: params.workspaceDir, + agentDir: params.agentDir, + config: cfg, + prompt: "hello", + provider, + model, + authProfileIdSource: "auto", + allowTransientCooldownProbe: options?.allowTransientCooldownProbe, + timeoutMs: 5_000, + runId: params.runId, + abortSignal: params.abortSignal, + }), + }); +} + 
+function mockPrimaryOverloadedThenFallbackSuccess() { + runEmbeddedAttemptMock.mockImplementation(async (params: unknown) => { + const attemptParams = params as { provider: string; modelId: string; authProfileId?: string }; + if (attemptParams.provider === "openai") { + return makeAttempt({ + assistantTexts: [], + lastAssistant: buildAssistant({ + provider: "openai", + model: "mock-1", + stopReason: "error", + errorMessage: OVERLOADED_ERROR_PAYLOAD, + }), + }); + } + if (attemptParams.provider === "groq") { + return makeAttempt({ + assistantTexts: ["fallback ok"], + lastAssistant: buildAssistant({ + provider: "groq", + model: "mock-2", + stopReason: "stop", + content: [{ type: "text", text: "fallback ok" }], + }), + }); + } + throw new Error(`Unexpected provider ${attemptParams.provider}`); + }); +} + +function mockAllProvidersOverloaded() { + runEmbeddedAttemptMock.mockImplementation(async (params: unknown) => { + const attemptParams = params as { provider: string; modelId: string; authProfileId?: string }; + if (attemptParams.provider === "openai" || attemptParams.provider === "groq") { + return makeAttempt({ + assistantTexts: [], + lastAssistant: buildAssistant({ + provider: attemptParams.provider, + model: attemptParams.provider === "openai" ? 
"mock-1" : "mock-2", + stopReason: "error", + errorMessage: OVERLOADED_ERROR_PAYLOAD, + }), + }); + } + throw new Error(`Unexpected provider ${attemptParams.provider}`); + }); +} + +describe("runWithModelFallback + runEmbeddedPiAgent overload policy", () => { + it("falls back across providers after overloaded primary failure and persists transient cooldown", async () => { + await withAgentWorkspace(async ({ agentDir, workspaceDir }) => { + await writeAuthStore(agentDir); + mockPrimaryOverloadedThenFallbackSuccess(); + + const result = await runEmbeddedFallback({ + agentDir, + workspaceDir, + sessionKey: "agent:test:overloaded-cross-provider", + runId: "run:overloaded-cross-provider", + }); + + expect(result.provider).toBe("groq"); + expect(result.model).toBe("mock-2"); + expect(result.attempts[0]?.reason).toBe("overloaded"); + expect(result.result.payloads?.[0]?.text ?? "").toContain("fallback ok"); + + const usageStats = await readUsageStats(agentDir); + expect(typeof usageStats["openai:p1"]?.cooldownUntil).toBe("number"); + expect(usageStats["openai:p1"]?.failureCounts).toMatchObject({ overloaded: 1 }); + expect(typeof usageStats["groq:p1"]?.lastUsed).toBe("number"); + + expect(runEmbeddedAttemptMock).toHaveBeenCalledTimes(2); + const firstCall = runEmbeddedAttemptMock.mock.calls[0]?.[0] as + | { provider?: string } + | undefined; + const secondCall = runEmbeddedAttemptMock.mock.calls[1]?.[0] as + | { provider?: string } + | undefined; + expect(firstCall).toBeDefined(); + expect(secondCall).toBeDefined(); + expect(firstCall?.provider).toBe("openai"); + expect(secondCall?.provider).toBe("groq"); + expect(computeBackoffMock).toHaveBeenCalledTimes(1); + expect(sleepWithAbortMock).toHaveBeenCalledTimes(1); + }); + }); + + it("surfaces a bounded overloaded summary when every fallback candidate is overloaded", async () => { + await withAgentWorkspace(async ({ agentDir, workspaceDir }) => { + await writeAuthStore(agentDir); + mockAllProvidersOverloaded(); + + let 
thrown: unknown; + try { + await runEmbeddedFallback({ + agentDir, + workspaceDir, + sessionKey: "agent:test:all-overloaded", + runId: "run:all-overloaded", + }); + } catch (err) { + thrown = err; + } + + expect(thrown).toBeInstanceOf(Error); + expect((thrown as Error).message).toMatch(/^All models failed \(2\): /); + expect((thrown as Error).message).toMatch( + /openai\/mock-1: .* \(overloaded\) \| groq\/mock-2: .* \(overloaded\)/, + ); + + const usageStats = await readUsageStats(agentDir); + expect(typeof usageStats["openai:p1"]?.cooldownUntil).toBe("number"); + expect(typeof usageStats["groq:p1"]?.cooldownUntil).toBe("number"); + expect(usageStats["openai:p1"]?.failureCounts).toMatchObject({ overloaded: 1 }); + expect(usageStats["groq:p1"]?.failureCounts).toMatchObject({ overloaded: 1 }); + expect(usageStats["openai:p1"]?.disabledUntil).toBeUndefined(); + expect(usageStats["groq:p1"]?.disabledUntil).toBeUndefined(); + + expect(runEmbeddedAttemptMock).toHaveBeenCalledTimes(2); + expect(computeBackoffMock).toHaveBeenCalledTimes(2); + expect(sleepWithAbortMock).toHaveBeenCalledTimes(2); + }); + }); + + it("probes a provider already in overloaded cooldown before falling back", async () => { + await withAgentWorkspace(async ({ agentDir, workspaceDir }) => { + const now = Date.now(); + await writeAuthStore(agentDir, { + "openai:p1": { + lastUsed: 1, + cooldownUntil: now + 60_000, + failureCounts: { overloaded: 2 }, + }, + "groq:p1": { lastUsed: 2 }, + }); + mockPrimaryOverloadedThenFallbackSuccess(); + + const result = await runEmbeddedFallback({ + agentDir, + workspaceDir, + sessionKey: "agent:test:overloaded-probe-fallback", + runId: "run:overloaded-probe-fallback", + }); + + expect(result.provider).toBe("groq"); + expect(runEmbeddedAttemptMock).toHaveBeenCalledTimes(2); + const firstCall = runEmbeddedAttemptMock.mock.calls[0]?.[0] as + | { provider?: string; authProfileId?: string } + | undefined; + const secondCall = runEmbeddedAttemptMock.mock.calls[1]?.[0] as + 
| { provider?: string } + | undefined; + expect(firstCall).toBeDefined(); + expect(secondCall).toBeDefined(); + expect(firstCall?.provider).toBe("openai"); + expect(firstCall?.authProfileId).toBe("openai:p1"); + expect(secondCall?.provider).toBe("groq"); + }); + }); + + it("persists overloaded cooldown across turns while still allowing one probe and fallback", async () => { + await withAgentWorkspace(async ({ agentDir, workspaceDir }) => { + await writeAuthStore(agentDir); + mockPrimaryOverloadedThenFallbackSuccess(); + + const firstResult = await runEmbeddedFallback({ + agentDir, + workspaceDir, + sessionKey: "agent:test:overloaded-two-turns:first", + runId: "run:overloaded-two-turns:first", + }); + + expect(firstResult.provider).toBe("groq"); + + runEmbeddedAttemptMock.mockClear(); + computeBackoffMock.mockClear(); + sleepWithAbortMock.mockClear(); + + mockPrimaryOverloadedThenFallbackSuccess(); + + const secondResult = await runEmbeddedFallback({ + agentDir, + workspaceDir, + sessionKey: "agent:test:overloaded-two-turns:second", + runId: "run:overloaded-two-turns:second", + }); + + expect(secondResult.provider).toBe("groq"); + expect(runEmbeddedAttemptMock).toHaveBeenCalledTimes(2); + + const firstCall = runEmbeddedAttemptMock.mock.calls[0]?.[0] as + | { provider?: string; authProfileId?: string } + | undefined; + const secondCall = runEmbeddedAttemptMock.mock.calls[1]?.[0] as + | { provider?: string } + | undefined; + expect(firstCall).toBeDefined(); + expect(secondCall).toBeDefined(); + expect(firstCall?.provider).toBe("openai"); + expect(firstCall?.authProfileId).toBe("openai:p1"); + expect(secondCall?.provider).toBe("groq"); + + const usageStats = await readUsageStats(agentDir); + expect(typeof usageStats["openai:p1"]?.cooldownUntil).toBe("number"); + expect(usageStats["openai:p1"]?.failureCounts).toMatchObject({ overloaded: 2 }); + expect(computeBackoffMock).toHaveBeenCalledTimes(1); + expect(sleepWithAbortMock).toHaveBeenCalledTimes(1); + }); + }); + + 
it("keeps bare service-unavailable failures in the timeout lane without persisting cooldown", async () => { + await withAgentWorkspace(async ({ agentDir, workspaceDir }) => { + await writeAuthStore(agentDir); + runEmbeddedAttemptMock.mockImplementation(async (params: unknown) => { + const attemptParams = params as { provider: string }; + if (attemptParams.provider === "openai") { + return makeAttempt({ + assistantTexts: [], + lastAssistant: buildAssistant({ + provider: "openai", + model: "mock-1", + stopReason: "error", + errorMessage: "LLM error: service unavailable", + }), + }); + } + if (attemptParams.provider === "groq") { + return makeAttempt({ + assistantTexts: ["fallback ok"], + lastAssistant: buildAssistant({ + provider: "groq", + model: "mock-2", + stopReason: "stop", + content: [{ type: "text", text: "fallback ok" }], + }), + }); + } + throw new Error(`Unexpected provider ${attemptParams.provider}`); + }); + + const result = await runEmbeddedFallback({ + agentDir, + workspaceDir, + sessionKey: "agent:test:timeout-cross-provider", + runId: "run:timeout-cross-provider", + }); + + expect(result.provider).toBe("groq"); + expect(result.attempts[0]?.reason).toBe("timeout"); + + const usageStats = await readUsageStats(agentDir); + expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined(); + expect(usageStats["openai:p1"]?.failureCounts).toBeUndefined(); + expect(computeBackoffMock).not.toHaveBeenCalled(); + expect(sleepWithAbortMock).not.toHaveBeenCalled(); + }); + }); + + it("rethrows AbortError during overload backoff instead of falling through fallback", async () => { + await withAgentWorkspace(async ({ agentDir, workspaceDir }) => { + await writeAuthStore(agentDir); + const controller = new AbortController(); + mockPrimaryOverloadedThenFallbackSuccess(); + sleepWithAbortMock.mockImplementationOnce(async () => { + controller.abort(); + throw new Error("aborted"); + }); + + await expect( + runEmbeddedFallback({ + agentDir, + workspaceDir, + sessionKey: 
"agent:test:overloaded-backoff-abort", + runId: "run:overloaded-backoff-abort", + abortSignal: controller.signal, + }), + ).rejects.toMatchObject({ + name: "AbortError", + message: "Operation aborted", + }); + + expect(runEmbeddedAttemptMock).toHaveBeenCalledTimes(1); + const firstCall = runEmbeddedAttemptMock.mock.calls[0]?.[0] as + | { provider?: string } + | undefined; + expect(firstCall?.provider).toBe("openai"); + }); + }); +}); diff --git a/src/agents/model-fallback.test.ts b/src/agents/model-fallback.test.ts index 69a9ba01a29..6379d6e0222 100644 --- a/src/agents/model-fallback.test.ts +++ b/src/agents/model-fallback.test.ts @@ -1062,7 +1062,7 @@ describe("runWithModelFallback", () => { describe("fallback behavior with provider cooldowns", () => { async function makeAuthStoreWithCooldown( provider: string, - reason: "rate_limit" | "auth" | "billing", + reason: "rate_limit" | "overloaded" | "auth" | "billing", ): Promise<{ store: AuthProfileStore; dir: string }> { const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-test-")); const now = Date.now(); @@ -1073,12 +1073,12 @@ describe("runWithModelFallback", () => { }, usageStats: { [`${provider}:default`]: - reason === "rate_limit" + reason === "rate_limit" || reason === "overloaded" ? { - // Real rate-limit cooldowns are tracked through cooldownUntil - // and failureCounts, not disabledReason. + // Transient cooldown reasons are tracked through + // cooldownUntil and failureCounts, not disabledReason. 
cooldownUntil: now + 300000, - failureCounts: { rate_limit: 1 }, + failureCounts: { [reason]: 1 }, } : { // Auth/billing issues use disabledUntil @@ -1117,7 +1117,37 @@ describe("runWithModelFallback", () => { expect(result.result).toBe("sonnet success"); expect(run).toHaveBeenCalledTimes(1); // Primary skipped, fallback attempted expect(run).toHaveBeenNthCalledWith(1, "anthropic", "claude-sonnet-4-5", { - allowRateLimitCooldownProbe: true, + allowTransientCooldownProbe: true, + }); + }); + + it("attempts same-provider fallbacks during overloaded cooldown", async () => { + const { dir } = await makeAuthStoreWithCooldown("anthropic", "overloaded"); + const cfg = makeCfg({ + agents: { + defaults: { + model: { + primary: "anthropic/claude-opus-4-6", + fallbacks: ["anthropic/claude-sonnet-4-5", "groq/llama-3.3-70b-versatile"], + }, + }, + }, + }); + + const run = vi.fn().mockResolvedValueOnce("sonnet success"); + + const result = await runWithModelFallback({ + cfg, + provider: "anthropic", + model: "claude-opus-4-6", + run, + agentDir: dir, + }); + + expect(result.result).toBe("sonnet success"); + expect(run).toHaveBeenCalledTimes(1); + expect(run).toHaveBeenNthCalledWith(1, "anthropic", "claude-sonnet-4-5", { + allowTransientCooldownProbe: true, }); }); @@ -1224,7 +1254,7 @@ describe("runWithModelFallback", () => { expect(result.result).toBe("groq success"); expect(run).toHaveBeenCalledTimes(2); expect(run).toHaveBeenNthCalledWith(1, "anthropic", "claude-sonnet-4-5", { - allowRateLimitCooldownProbe: true, + allowTransientCooldownProbe: true, }); // Rate limit allows attempt expect(run).toHaveBeenNthCalledWith(2, "groq", "llama-3.3-70b-versatile"); // Cross-provider works }); diff --git a/src/agents/model-fallback.ts b/src/agents/model-fallback.ts index f1c99d26a70..517c4448a27 100644 --- a/src/agents/model-fallback.ts +++ b/src/agents/model-fallback.ts @@ -34,7 +34,7 @@ type ModelCandidate = { }; export type ModelFallbackRunOptions = { - allowRateLimitCooldownProbe?: 
boolean; + allowTransientCooldownProbe?: boolean; }; type ModelFallbackRunFn = ( @@ -428,11 +428,11 @@ function resolveCooldownDecision(params: { } // For primary: try when requested model or when probe allows. - // For same-provider fallbacks: only relax cooldown on rate_limit, which - // is commonly model-scoped and can recover on a sibling model. + // For same-provider fallbacks: only relax cooldown on transient provider + // limits, which are often model-scoped and can recover on a sibling model. const shouldAttemptDespiteCooldown = (params.isPrimary && (!params.requestedModel || shouldProbe)) || - (!params.isPrimary && inferredReason === "rate_limit"); + (!params.isPrimary && (inferredReason === "rate_limit" || inferredReason === "overloaded")); if (!shouldAttemptDespiteCooldown) { return { type: "skip", @@ -514,8 +514,8 @@ export async function runWithModelFallback(params: { if (decision.markProbe) { lastProbeAttempt.set(probeThrottleKey, now); } - if (decision.reason === "rate_limit") { - runOptions = { allowRateLimitCooldownProbe: true }; + if (decision.reason === "rate_limit" || decision.reason === "overloaded") { + runOptions = { allowTransientCooldownProbe: true }; } } } diff --git a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts index 9eb2657158b..4919bc607c0 100644 --- a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts +++ b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts @@ -509,12 +509,12 @@ describe("classifyFailoverReason", () => { it("classifies documented provider error messages", () => { expect(classifyFailoverReason(OPENAI_RATE_LIMIT_MESSAGE)).toBe("rate_limit"); expect(classifyFailoverReason(GEMINI_RESOURCE_EXHAUSTED_MESSAGE)).toBe("rate_limit"); - expect(classifyFailoverReason(ANTHROPIC_OVERLOADED_PAYLOAD)).toBe("rate_limit"); + expect(classifyFailoverReason(ANTHROPIC_OVERLOADED_PAYLOAD)).toBe("overloaded"); 
expect(classifyFailoverReason(OPENROUTER_CREDITS_MESSAGE)).toBe("billing"); expect(classifyFailoverReason(TOGETHER_PAYMENT_REQUIRED_MESSAGE)).toBe("billing"); - expect(classifyFailoverReason(TOGETHER_ENGINE_OVERLOADED_MESSAGE)).toBe("timeout"); + expect(classifyFailoverReason(TOGETHER_ENGINE_OVERLOADED_MESSAGE)).toBe("overloaded"); expect(classifyFailoverReason(GROQ_TOO_MANY_REQUESTS_MESSAGE)).toBe("rate_limit"); - expect(classifyFailoverReason(GROQ_SERVICE_UNAVAILABLE_MESSAGE)).toBe("timeout"); + expect(classifyFailoverReason(GROQ_SERVICE_UNAVAILABLE_MESSAGE)).toBe("overloaded"); }); it("classifies internal and compatibility error messages", () => { @@ -572,25 +572,29 @@ describe("classifyFailoverReason", () => { "rate_limit", ); }); - it("classifies provider high-demand / service-unavailable messages as rate_limit", () => { + it("classifies provider high-demand / service-unavailable messages as overloaded", () => { expect( classifyFailoverReason( "This model is currently experiencing high demand. Please try again later.", ), - ).toBe("rate_limit"); - // "service unavailable" combined with overload/capacity indicator → rate_limit + ).toBe("overloaded"); + // "service unavailable" combined with overload/capacity indicator → overloaded // (exercises the new regex — none of the standalone patterns match here) - expect(classifyFailoverReason("service unavailable due to capacity limits")).toBe("rate_limit"); + expect(classifyFailoverReason("service unavailable due to capacity limits")).toBe("overloaded"); expect( classifyFailoverReason( '{"error":{"code":503,"message":"The model is overloaded. Please try later","status":"UNAVAILABLE"}}', ), - ).toBe("rate_limit"); + ).toBe("overloaded"); }); it("classifies bare 'service unavailable' as timeout instead of rate_limit (#32828)", () => { // A generic "service unavailable" from a proxy/CDN should stay retryable, // but it should not be treated as provider overload / rate limit. 
expect(classifyFailoverReason("LLM error: service unavailable")).toBe("timeout"); + expect(classifyFailoverReason("503 Internal Database Error")).toBe("timeout"); + // Raw 529 text without explicit overload keywords still classifies as overloaded. + expect(classifyFailoverReason("529 API is busy")).toBe("overloaded"); + expect(classifyFailoverReason("529 Please try again")).toBe("overloaded"); }); it("classifies zhipuai Weekly/Monthly Limit Exhausted as rate_limit (#33785)", () => { expect( diff --git a/src/agents/pi-embedded-helpers/errors.ts b/src/agents/pi-embedded-helpers/errors.ts index e7cd440d779..5e4fc4c541e 100644 --- a/src/agents/pi-embedded-helpers/errors.ts +++ b/src/agents/pi-embedded-helpers/errors.ts @@ -293,13 +293,17 @@ export function classifyFailoverReasonFromHttpStatus( if (status === 408) { return "timeout"; } - // Keep the status-only path conservative and behavior-preserving. - // Message-path HTTP heuristics are broader and should not leak in here. - if (status === 502 || status === 503 || status === 504) { + if (status === 503) { + if (message && isOverloadedErrorMessage(message)) { + return "overloaded"; + } + return "timeout"; + } + if (status === 502 || status === 504) { return "timeout"; } if (status === 529) { - return "rate_limit"; + return "overloaded"; } if (status === 400) { // Some providers return quota/balance errors under HTTP 400, so do not @@ -854,13 +858,6 @@ export function classifyFailoverReason(raw: string): FailoverReason | null { if (isModelNotFoundErrorMessage(raw)) { return "model_not_found"; } - if (isTransientHttpError(raw)) { - // Treat transient 5xx provider failures as retryable transport issues. - return "timeout"; - } - if (isJsonApiInternalServerError(raw)) { - return "timeout"; - } if (isPeriodicUsageLimitErrorMessage(raw)) { return isBillingErrorMessage(raw) ? 
"billing" : "rate_limit"; } @@ -868,7 +865,19 @@ export function classifyFailoverReason(raw: string): FailoverReason | null { return "rate_limit"; } if (isOverloadedErrorMessage(raw)) { - return "rate_limit"; + return "overloaded"; + } + if (isTransientHttpError(raw)) { + // 529 is always overloaded, even without explicit overload keywords in the body. + const status = extractLeadingHttpStatus(raw.trim()); + if (status?.code === 529) { + return "overloaded"; + } + // Treat remaining transient 5xx provider failures as retryable transport issues. + return "timeout"; + } + if (isJsonApiInternalServerError(raw)) { + return "timeout"; } if (isCloudCodeAssistFormatError(raw)) { return "format"; diff --git a/src/agents/pi-embedded-helpers/types.ts b/src/agents/pi-embedded-helpers/types.ts index 86ee1c4cda1..5ae47d672d3 100644 --- a/src/agents/pi-embedded-helpers/types.ts +++ b/src/agents/pi-embedded-helpers/types.ts @@ -5,6 +5,7 @@ export type FailoverReason = | "auth_permanent" | "format" | "rate_limit" + | "overloaded" | "billing" | "timeout" | "model_not_found" diff --git a/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts b/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts index 8c1aef240f7..87ffa6963c9 100644 --- a/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts +++ b/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts @@ -9,11 +9,28 @@ import type { EmbeddedRunAttemptResult } from "./pi-embedded-runner/run/types.js const runEmbeddedAttemptMock = vi.fn<(params: unknown) => Promise>(); const resolveCopilotApiTokenMock = vi.fn(); +const { computeBackoffMock, sleepWithAbortMock } = vi.hoisted(() => ({ + computeBackoffMock: vi.fn( + ( + _policy: { initialMs: number; maxMs: number; factor: number; jitter: number }, + _attempt: number, + ) => 321, + ), + sleepWithAbortMock: vi.fn(async (_ms: number, _abortSignal?: AbortSignal) => 
undefined), +})); vi.mock("./pi-embedded-runner/run/attempt.js", () => ({ runEmbeddedAttempt: (params: unknown) => runEmbeddedAttemptMock(params), })); +vi.mock("../infra/backoff.js", () => ({ + computeBackoff: ( + policy: { initialMs: number; maxMs: number; factor: number; jitter: number }, + attempt: number, + ) => computeBackoffMock(policy, attempt), + sleepWithAbort: (ms: number, abortSignal?: AbortSignal) => sleepWithAbortMock(ms, abortSignal), +})); + vi.mock("../providers/github-copilot-token.js", () => ({ DEFAULT_COPILOT_API_BASE_URL: "https://api.individual.githubcopilot.com", resolveCopilotApiToken: (...args: unknown[]) => resolveCopilotApiTokenMock(...args), @@ -43,6 +60,8 @@ beforeEach(() => { vi.useRealTimers(); runEmbeddedAttemptMock.mockClear(); resolveCopilotApiTokenMock.mockReset(); + computeBackoffMock.mockClear(); + sleepWithAbortMock.mockClear(); }); const baseUsage = { @@ -252,6 +271,24 @@ const mockFailedThenSuccessfulAttempt = (errorMessage = "rate limit") => { ); }; +const mockPromptErrorThenSuccessfulAttempt = (errorMessage: string) => { + runEmbeddedAttemptMock + .mockResolvedValueOnce( + makeAttempt({ + promptError: new Error(errorMessage), + }), + ) + .mockResolvedValueOnce( + makeAttempt({ + assistantTexts: ["ok"], + lastAssistant: buildAssistant({ + stopReason: "stop", + content: [{ type: "text", text: "ok" }], + }), + }), + ); +}; + async function runAutoPinnedOpenAiTurn(params: { agentDir: string; workspaceDir: string; @@ -320,6 +357,28 @@ async function runAutoPinnedRotationCase(params: { }); } +async function runAutoPinnedPromptErrorRotationCase(params: { + errorMessage: string; + sessionKey: string; + runId: string; +}) { + runEmbeddedAttemptMock.mockClear(); + return withAgentWorkspace(async ({ agentDir, workspaceDir }) => { + await writeAuthStore(agentDir); + mockPromptErrorThenSuccessfulAttempt(params.errorMessage); + await runAutoPinnedOpenAiTurn({ + agentDir, + workspaceDir, + sessionKey: params.sessionKey, + runId: 
params.runId, + }); + + expect(runEmbeddedAttemptMock).toHaveBeenCalledTimes(2); + const usageStats = await readUsageStats(agentDir); + return { usageStats }; + }); +} + function mockSingleSuccessfulAttempt() { runEmbeddedAttemptMock.mockResolvedValueOnce( makeAttempt({ @@ -639,13 +698,48 @@ describe("runEmbeddedPiAgent auth profile rotation", () => { expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number"); }); - it("rotates for overloaded prompt failures across auto-pinned profiles", async () => { + it("rotates for overloaded assistant failures across auto-pinned profiles", async () => { const { usageStats } = await runAutoPinnedRotationCase({ errorMessage: '{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"}}', sessionKey: "agent:test:overloaded-rotation", runId: "run:overloaded-rotation", }); expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number"); + expect(typeof usageStats["openai:p1"]?.cooldownUntil).toBe("number"); + expect(computeBackoffMock).toHaveBeenCalledTimes(1); + expect(computeBackoffMock).toHaveBeenCalledWith( + expect.objectContaining({ + initialMs: 250, + maxMs: 1500, + factor: 2, + jitter: 0.2, + }), + 1, + ); + expect(sleepWithAbortMock).toHaveBeenCalledTimes(1); + expect(sleepWithAbortMock).toHaveBeenCalledWith(321, undefined); + }); + + it("rotates for overloaded prompt failures across auto-pinned profiles", async () => { + const { usageStats } = await runAutoPinnedPromptErrorRotationCase({ + errorMessage: '{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"}}', + sessionKey: "agent:test:overloaded-prompt-rotation", + runId: "run:overloaded-prompt-rotation", + }); + expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number"); + expect(typeof usageStats["openai:p1"]?.cooldownUntil).toBe("number"); + expect(computeBackoffMock).toHaveBeenCalledTimes(1); + expect(computeBackoffMock).toHaveBeenCalledWith( + expect.objectContaining({ + initialMs: 250, + maxMs: 1500, + factor: 2, + 
jitter: 0.2, + }), + 1, + ); + expect(sleepWithAbortMock).toHaveBeenCalledTimes(1); + expect(sleepWithAbortMock).toHaveBeenCalledWith(321, undefined); }); it("rotates on timeout without cooling down the timed-out profile", async () => { @@ -656,6 +750,8 @@ describe("runEmbeddedPiAgent auth profile rotation", () => { }); expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number"); expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined(); + expect(computeBackoffMock).not.toHaveBeenCalled(); + expect(sleepWithAbortMock).not.toHaveBeenCalled(); }); it("rotates on bare service unavailable without cooling down the profile", async () => { @@ -829,7 +925,7 @@ describe("runEmbeddedPiAgent auth profile rotation", () => { }); }); - it("can probe one cooldowned profile when rate-limit cooldown probe is explicitly allowed", async () => { + it("can probe one cooldowned profile when transient cooldown probe is explicitly allowed", async () => { await withTimedAgentWorkspace(async ({ agentDir, workspaceDir, now }) => { await writeAuthStore(agentDir, { usageStats: { @@ -859,7 +955,7 @@ describe("runEmbeddedPiAgent auth profile rotation", () => { provider: "openai", model: "mock-1", authProfileIdSource: "auto", - allowRateLimitCooldownProbe: true, + allowTransientCooldownProbe: true, timeoutMs: 5_000, runId: "run:cooldown-probe", }); @@ -869,6 +965,54 @@ describe("runEmbeddedPiAgent auth profile rotation", () => { }); }); + it("can probe one cooldowned profile when overloaded cooldown is explicitly probeable", async () => { + await withTimedAgentWorkspace(async ({ agentDir, workspaceDir, now }) => { + await writeAuthStore(agentDir, { + usageStats: { + "openai:p1": { + lastUsed: 1, + cooldownUntil: now + 60 * 60 * 1000, + failureCounts: { overloaded: 4 }, + }, + "openai:p2": { + lastUsed: 2, + cooldownUntil: now + 60 * 60 * 1000, + failureCounts: { overloaded: 4 }, + }, + }, + }); + + runEmbeddedAttemptMock.mockResolvedValueOnce( + makeAttempt({ + assistantTexts: ["ok"], + 
lastAssistant: buildAssistant({ + stopReason: "stop", + content: [{ type: "text", text: "ok" }], + }), + }), + ); + + const result = await runEmbeddedPiAgent({ + sessionId: "session:test", + sessionKey: "agent:test:overloaded-cooldown-probe", + sessionFile: path.join(workspaceDir, "session.jsonl"), + workspaceDir, + agentDir, + config: makeConfig({ fallbacks: ["openai/mock-2"] }), + prompt: "hello", + provider: "openai", + model: "mock-1", + authProfileIdSource: "auto", + allowTransientCooldownProbe: true, + timeoutMs: 5_000, + runId: "run:overloaded-cooldown-probe", + }); + + expect(runEmbeddedAttemptMock).toHaveBeenCalledTimes(1); + expect(result.payloads?.[0]?.text ?? "").toContain("ok"); + }); + }); + it("treats agent-level fallbacks as configured when defaults have none", async () => { await withTimedAgentWorkspace(async ({ agentDir, workspaceDir, now }) => { await writeAuthStore(agentDir, { diff --git a/src/agents/pi-embedded-runner/run.ts b/src/agents/pi-embedded-runner/run.ts index 11be807e120..c1d1d414c49 100644 --- a/src/agents/pi-embedded-runner/run.ts +++ b/src/agents/pi-embedded-runner/run.ts @@ -5,6 +5,7 @@ import { ensureContextEnginesInitialized, resolveContextEngine, } from "../../context-engine/index.js"; +import { computeBackoff, sleepWithAbort, type BackoffPolicy } from "../../infra/backoff.js"; import { generateSecureToken } from "../../infra/secure-random.js"; import { getGlobalHookRunner } from "../../plugins/hook-runner-global.js"; import type { PluginHookBeforeAgentStartResult } from "../../plugins/types.js"; @@ -14,6 +15,7 @@ import { resolveOpenClawAgentDir } from "../agent-paths.js"; import { hasConfiguredModelFallbacks } from "../agent-scope.js"; import { isProfileInCooldown, + type AuthProfileFailureReason, markAuthProfileFailure, markAuthProfileGood, markAuthProfileUsed, @@ -79,6 +81,14 @@ type CopilotTokenState = { const COPILOT_REFRESH_MARGIN_MS = 5 * 60 * 1000; const COPILOT_REFRESH_RETRY_MS = 60 * 1000; const 
COPILOT_REFRESH_MIN_DELAY_MS = 5 * 1000; +// Keep overload pacing noticeable enough to avoid tight retry bursts, but short +// enough that fallback still feels responsive within a single turn. +const OVERLOAD_FAILOVER_BACKOFF_POLICY: BackoffPolicy = { + initialMs: 250, + maxMs: 1_500, + factor: 2, + jitter: 0.2, +}; // Avoid Anthropic's refusal test token poisoning session transcripts. const ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL = "ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL"; @@ -649,21 +659,21 @@ export async function runEmbeddedPiAgent( profileIds: autoProfileCandidates, }) ?? "rate_limit") : null; - const allowRateLimitCooldownProbe = - params.allowRateLimitCooldownProbe === true && + const allowTransientCooldownProbe = + params.allowTransientCooldownProbe === true && allAutoProfilesInCooldown && - unavailableReason === "rate_limit"; - let didRateLimitCooldownProbe = false; + (unavailableReason === "rate_limit" || unavailableReason === "overloaded"); + let didTransientCooldownProbe = false; while (profileIndex < profileCandidates.length) { const candidate = profileCandidates[profileIndex]; const inCooldown = candidate && candidate !== lockedProfileId && isProfileInCooldown(authStore, candidate); if (inCooldown) { - if (allowRateLimitCooldownProbe && !didRateLimitCooldownProbe) { - didRateLimitCooldownProbe = true; + if (allowTransientCooldownProbe && !didTransientCooldownProbe) { + didTransientCooldownProbe = true; log.warn( - `probing cooldowned auth profile for ${provider}/${modelId} due to rate_limit unavailability`, + `probing cooldowned auth profile for ${provider}/${modelId} due to ${unavailableReason ?? 
"transient"} unavailability`, ); } else { profileIndex += 1; @@ -722,9 +732,10 @@ export async function runEmbeddedPiAgent( let lastRunPromptUsage: ReturnType | undefined; let autoCompactionCount = 0; let runLoopIterations = 0; + let overloadFailoverAttempts = 0; const maybeMarkAuthProfileFailure = async (failure: { profileId?: string; - reason?: Parameters[0]["reason"] | null; + reason?: AuthProfileFailureReason | null; config?: RunEmbeddedPiAgentParams["config"]; agentDir?: RunEmbeddedPiAgentParams["agentDir"]; }) => { @@ -740,6 +751,36 @@ export async function runEmbeddedPiAgent( agentDir, }); }; + const resolveAuthProfileFailureReason = ( + failoverReason: FailoverReason | null, + ): AuthProfileFailureReason | null => { + // Timeouts are transport/model-path failures, not auth health signals, + // so they should not persist auth-profile failure state. + if (!failoverReason || failoverReason === "timeout") { + return null; + } + return failoverReason; + }; + const maybeBackoffBeforeOverloadFailover = async (reason: FailoverReason | null) => { + if (reason !== "overloaded") { + return; + } + overloadFailoverAttempts += 1; + const delayMs = computeBackoff(OVERLOAD_FAILOVER_BACKOFF_POLICY, overloadFailoverAttempts); + log.warn( + `overload backoff before failover for ${provider}/${modelId}: attempt=${overloadFailoverAttempts} delayMs=${delayMs}`, + ); + try { + await sleepWithAbort(delayMs, params.abortSignal); + } catch (err) { + if (params.abortSignal?.aborted) { + const abortErr = new Error("Operation aborted", { cause: err }); + abortErr.name = "AbortError"; + throw abortErr; + } + throw err; + } + }; // Resolve the context engine once and reuse across retries to avoid // repeated initialization/connection overhead per attempt. 
ensureContextEnginesInitialized(); @@ -1165,15 +1206,19 @@ export async function runEmbeddedPiAgent( }; } const promptFailoverReason = classifyFailoverReason(errorText); + const promptProfileFailureReason = + resolveAuthProfileFailureReason(promptFailoverReason); await maybeMarkAuthProfileFailure({ profileId: lastProfileId, - reason: promptFailoverReason, + reason: promptProfileFailureReason, }); + const promptFailoverFailure = isFailoverErrorMessage(errorText); if ( - isFailoverErrorMessage(errorText) && + promptFailoverFailure && promptFailoverReason !== "timeout" && (await advanceAuthProfile()) ) { + await maybeBackoffBeforeOverloadFailover(promptFailoverReason); continue; } const fallbackThinking = pickFallbackThinkingLevel({ @@ -1187,9 +1232,11 @@ export async function runEmbeddedPiAgent( thinkLevel = fallbackThinking; continue; } - // FIX: Throw FailoverError for prompt errors when fallbacks configured - // This enables model fallback for quota/rate limit errors during prompt submission - if (fallbackConfigured && isFailoverErrorMessage(errorText)) { + // Throw FailoverError for prompt-side failover reasons when fallbacks + // are configured so outer model fallback can continue on overload, + // rate-limit, auth, or billing failures. + if (fallbackConfigured && promptFailoverFailure) { + await maybeBackoffBeforeOverloadFailover(promptFailoverReason); throw new FailoverError(errorText, { reason: promptFailoverReason ?? "unknown", provider, @@ -1218,6 +1265,8 @@ export async function runEmbeddedPiAgent( const billingFailure = isBillingAssistantError(lastAssistant); const failoverFailure = isFailoverAssistantError(lastAssistant); const assistantFailoverReason = classifyFailoverReason(lastAssistant?.errorMessage ?? 
""); + const assistantProfileFailureReason = + resolveAuthProfileFailureReason(assistantFailoverReason); const cloudCodeAssistFormatError = attempt.cloudCodeAssistFormatError; const imageDimensionError = parseImageDimensionError(lastAssistant?.errorMessage ?? ""); @@ -1257,10 +1306,7 @@ export async function runEmbeddedPiAgent( if (shouldRotate) { if (lastProfileId) { - const reason = - timedOut || assistantFailoverReason === "timeout" - ? "timeout" - : (assistantFailoverReason ?? "unknown"); + const reason = timedOut ? "timeout" : assistantProfileFailureReason; // Skip cooldown for timeouts: a timeout is model/network-specific, // not an auth issue. Marking the profile would poison fallback models // on the same provider (e.g. gpt-5.3 timeout blocks gpt-5.2). @@ -1280,10 +1326,12 @@ export async function runEmbeddedPiAgent( const rotated = await advanceAuthProfile(); if (rotated) { + await maybeBackoffBeforeOverloadFailover(assistantFailoverReason); continue; } if (fallbackConfigured) { + await maybeBackoffBeforeOverloadFailover(assistantFailoverReason); // Prefer formatted error message (user-friendly) over raw errorMessage const message = (lastAssistant diff --git a/src/agents/pi-embedded-runner/run/params.ts b/src/agents/pi-embedded-runner/run/params.ts index fd0f2112361..6d067c910bf 100644 --- a/src/agents/pi-embedded-runner/run/params.ts +++ b/src/agents/pi-embedded-runner/run/params.ts @@ -115,10 +115,10 @@ export type RunEmbeddedPiAgentParams = { enforceFinalTag?: boolean; /** * Allow a single run attempt even when all auth profiles are in cooldown, - * but only for inferred `rate_limit` cooldowns. + * but only for inferred transient cooldowns like `rate_limit` or `overloaded`. * * This is used by model fallback when trying sibling models on providers - * where rate limits are often model-scoped. + * where transient service pressure is often model-scoped. 
*/ - allowRateLimitCooldownProbe?: boolean; + allowTransientCooldownProbe?: boolean; }; diff --git a/src/auto-reply/reply/agent-runner-execution.ts b/src/auto-reply/reply/agent-runner-execution.ts index ed843a73014..524934ad469 100644 --- a/src/auto-reply/reply/agent-runner-execution.ts +++ b/src/auto-reply/reply/agent-runner-execution.ts @@ -311,7 +311,7 @@ export async function runAgentTurnWithFallback(params: { model, runId, authProfile, - allowRateLimitCooldownProbe: runOptions?.allowRateLimitCooldownProbe, + allowTransientCooldownProbe: runOptions?.allowTransientCooldownProbe, }); return (async () => { const result = await runEmbeddedPiAgent({ diff --git a/src/auto-reply/reply/agent-runner-memory.ts b/src/auto-reply/reply/agent-runner-memory.ts index ddb65d0fa22..374d37d52f7 100644 --- a/src/auto-reply/reply/agent-runner-memory.ts +++ b/src/auto-reply/reply/agent-runner-memory.ts @@ -487,7 +487,7 @@ export async function runMemoryFlushIfNeeded(params: { model, runId: flushRunId, authProfile, - allowRateLimitCooldownProbe: runOptions?.allowRateLimitCooldownProbe, + allowTransientCooldownProbe: runOptions?.allowTransientCooldownProbe, }); const result = await runEmbeddedPiAgent({ ...embeddedContext, diff --git a/src/auto-reply/reply/agent-runner-utils.ts b/src/auto-reply/reply/agent-runner-utils.ts index 960a1f21fed..b7ec4858e51 100644 --- a/src/auto-reply/reply/agent-runner-utils.ts +++ b/src/auto-reply/reply/agent-runner-utils.ts @@ -166,7 +166,7 @@ export function buildEmbeddedRunBaseParams(params: { model: string; runId: string; authProfile: ReturnType; - allowRateLimitCooldownProbe?: boolean; + allowTransientCooldownProbe?: boolean; }) { return { sessionFile: params.run.sessionFile, @@ -187,7 +187,7 @@ export function buildEmbeddedRunBaseParams(params: { bashElevated: params.run.bashElevated, timeoutMs: params.run.timeoutMs, runId: params.runId, - allowRateLimitCooldownProbe: params.allowRateLimitCooldownProbe, + allowTransientCooldownProbe: 
params.allowTransientCooldownProbe, }; } diff --git a/src/auto-reply/reply/agent-runner.runreplyagent.e2e.test.ts b/src/auto-reply/reply/agent-runner.runreplyagent.e2e.test.ts index a4f689412ab..83c1796515c 100644 --- a/src/auto-reply/reply/agent-runner.runreplyagent.e2e.test.ts +++ b/src/auto-reply/reply/agent-runner.runreplyagent.e2e.test.ts @@ -1054,6 +1054,11 @@ describe("runReplyAgent typing (heartbeat)", () => { reportedReason: "rate_limit", expectedReason: "rate limit", }, + { + existingReason: undefined, + reportedReason: "overloaded", + expectedReason: "overloaded", + }, { existingReason: "rate limit", reportedReason: "timeout", diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts index 7838a83bc4d..91e78138102 100644 --- a/src/auto-reply/reply/followup-runner.ts +++ b/src/auto-reply/reply/followup-runner.ts @@ -208,7 +208,7 @@ export function createFollowupRunner(params: { bashElevated: queued.run.bashElevated, timeoutMs: queued.run.timeoutMs, runId, - allowRateLimitCooldownProbe: runOptions?.allowRateLimitCooldownProbe, + allowTransientCooldownProbe: runOptions?.allowTransientCooldownProbe, blockReplyBreak: queued.run.blockReplyBreak, bootstrapPromptWarningSignaturesSeen, bootstrapPromptWarningSignature: diff --git a/src/commands/agent.ts b/src/commands/agent.ts index 215d249d964..fcbe593ec03 100644 --- a/src/commands/agent.ts +++ b/src/commands/agent.ts @@ -174,7 +174,7 @@ function runAgentAttempt(params: { primaryProvider: string; sessionStore?: Record; storePath?: string; - allowRateLimitCooldownProbe?: boolean; + allowTransientCooldownProbe?: boolean; }) { const effectivePrompt = resolveFallbackRetryPrompt({ body: params.body, @@ -325,7 +325,7 @@ function runAgentAttempt(params: { inputProvenance: params.opts.inputProvenance, streamParams: params.opts.streamParams, agentDir: params.agentDir, - allowRateLimitCooldownProbe: params.allowRateLimitCooldownProbe, + allowTransientCooldownProbe: 
params.allowTransientCooldownProbe, onAgentEvent: params.onAgentEvent, bootstrapPromptWarningSignaturesSeen, bootstrapPromptWarningSignature, @@ -868,7 +868,7 @@ async function agentCommandInternal( primaryProvider: provider, sessionStore, storePath, - allowRateLimitCooldownProbe: runOptions?.allowRateLimitCooldownProbe, + allowTransientCooldownProbe: runOptions?.allowTransientCooldownProbe, onAgentEvent: (evt) => { // Track lifecycle end for fallback emission below. if ( diff --git a/src/commands/models/list.probe.test.ts b/src/commands/models/list.probe.test.ts index 55c5ef064f3..70ffde1dd65 100644 --- a/src/commands/models/list.probe.test.ts +++ b/src/commands/models/list.probe.test.ts @@ -9,6 +9,7 @@ describe("mapFailoverReasonToProbeStatus", () => { it("keeps existing failover reason mappings", () => { expect(mapFailoverReasonToProbeStatus("auth")).toBe("auth"); expect(mapFailoverReasonToProbeStatus("rate_limit")).toBe("rate_limit"); + expect(mapFailoverReasonToProbeStatus("overloaded")).toBe("rate_limit"); expect(mapFailoverReasonToProbeStatus("billing")).toBe("billing"); expect(mapFailoverReasonToProbeStatus("timeout")).toBe("timeout"); expect(mapFailoverReasonToProbeStatus("format")).toBe("format"); diff --git a/src/commands/models/list.probe.ts b/src/commands/models/list.probe.ts index 433c005077d..8a2ec87adcc 100644 --- a/src/commands/models/list.probe.ts +++ b/src/commands/models/list.probe.ts @@ -106,7 +106,7 @@ export function mapFailoverReasonToProbeStatus(reason?: string | null): AuthProb // surface in the auth bucket instead of showing as unknown. 
return "auth"; } - if (reason === "rate_limit") { + if (reason === "rate_limit" || reason === "overloaded") { return "rate_limit"; } if (reason === "billing") { diff --git a/src/config/config-misc.test.ts b/src/config/config-misc.test.ts index b46b5b49766..647986a96e0 100644 --- a/src/config/config-misc.test.ts +++ b/src/config/config-misc.test.ts @@ -258,7 +258,7 @@ describe("cron webhook schema", () => { retry: { maxAttempts: 5, backoffMs: [60000, 120000, 300000], - retryOn: ["rate_limit", "network"], + retryOn: ["rate_limit", "overloaded", "network"], }, }, }); diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts index 39a43d46acb..f2ef2ff4ab8 100644 --- a/src/config/schema.help.ts +++ b/src/config/schema.help.ts @@ -1144,13 +1144,13 @@ export const FIELD_HELP: Record = { "cron.maxConcurrentRuns": "Limits how many cron jobs can execute at the same time when multiple schedules fire together. Use lower values to protect CPU/memory under heavy automation load, or raise carefully for higher throughput.", "cron.retry": - "Overrides the default retry policy for one-shot jobs when they fail with transient errors (rate limit, network, server_error). Omit to use defaults: maxAttempts 3, backoffMs [30000, 60000, 300000], retry all transient types.", + "Overrides the default retry policy for one-shot jobs when they fail with transient errors (rate limit, overloaded, network, server_error). Omit to use defaults: maxAttempts 3, backoffMs [30000, 60000, 300000], retry all transient types.", "cron.retry.maxAttempts": "Max retries for one-shot jobs on transient errors before permanent disable (default: 3).", "cron.retry.backoffMs": "Backoff delays in ms for each retry attempt (default: [30000, 60000, 300000]). Use shorter values for faster retries.", "cron.retry.retryOn": - "Error types to retry: rate_limit, network, timeout, server_error. 
Use to restrict which errors trigger retries; omit to retry all transient types.", + "Error types to retry: rate_limit, overloaded, network, timeout, server_error. Use to restrict which errors trigger retries; omit to retry all transient types.", "cron.webhook": 'Deprecated legacy fallback webhook URL used only for old jobs with `notify=true`. Migrate to per-job delivery using `delivery.mode="webhook"` plus `delivery.to`, and avoid relying on this global field.', "cron.webhookToken": diff --git a/src/config/types.cron.ts b/src/config/types.cron.ts index 251592251b6..0d3ee66dc19 100644 --- a/src/config/types.cron.ts +++ b/src/config/types.cron.ts @@ -1,7 +1,7 @@ import type { SecretInput } from "./types.secrets.js"; /** Error types that can trigger retries for one-shot jobs. */ -export type CronRetryOn = "rate_limit" | "network" | "timeout" | "server_error"; +export type CronRetryOn = "rate_limit" | "overloaded" | "network" | "timeout" | "server_error"; export type CronRetryConfig = { /** Max retries for transient errors before permanent disable (default: 3). */ diff --git a/src/config/zod-schema.ts b/src/config/zod-schema.ts index 033044238e8..0db5be508c3 100644 --- a/src/config/zod-schema.ts +++ b/src/config/zod-schema.ts @@ -440,7 +440,7 @@ export const OpenClawSchema = z maxAttempts: z.number().int().min(0).max(10).optional(), backoffMs: z.array(z.number().int().nonnegative()).min(1).max(10).optional(), retryOn: z - .array(z.enum(["rate_limit", "network", "timeout", "server_error"])) + .array(z.enum(["rate_limit", "overloaded", "network", "timeout", "server_error"])) .min(1) .optional(), }) diff --git a/src/cron/isolated-agent/run.ts b/src/cron/isolated-agent/run.ts index 1fbcc08bad8..8d5a1db73a5 100644 --- a/src/cron/isolated-agent/run.ts +++ b/src/cron/isolated-agent/run.ts @@ -534,7 +534,7 @@ export async function runCronIsolatedAgentTurn(params: { // be blocked by a target it cannot satisfy (#27898). 
requireExplicitMessageTarget: deliveryRequested && resolvedDelivery.ok, disableMessageTool: deliveryRequested || deliveryPlan.mode === "none", - allowRateLimitCooldownProbe: runOptions?.allowRateLimitCooldownProbe, + allowTransientCooldownProbe: runOptions?.allowTransientCooldownProbe, abortSignal, bootstrapPromptWarningSignaturesSeen, bootstrapPromptWarningSignature, diff --git a/src/cron/service.issue-regressions.test.ts b/src/cron/service.issue-regressions.test.ts index 9665d40ec55..9aec71b7315 100644 --- a/src/cron/service.issue-regressions.test.ts +++ b/src/cron/service.issue-regressions.test.ts @@ -580,6 +580,7 @@ describe("Cron issue regressions", () => { const runRetryScenario = async (params: { id: string; deleteAfterRun: boolean; + firstError?: string; }): Promise<{ state: ReturnType; runIsolatedAgentJob: ReturnType; @@ -600,7 +601,10 @@ describe("Cron issue regressions", () => { let now = scheduledAt; const runIsolatedAgentJob = vi .fn() - .mockResolvedValueOnce({ status: "error", error: "429 rate limit exceeded" }) + .mockResolvedValueOnce({ + status: "error", + error: params.firstError ?? 
"429 rate limit exceeded", + }) .mockResolvedValueOnce({ status: "ok", summary: "done" }); const state = createCronServiceState({ cronEnabled: true, @@ -644,6 +648,19 @@ describe("Cron issue regressions", () => { ); expect(deletedJob).toBeUndefined(); expect(deleteResult.runIsolatedAgentJob).toHaveBeenCalledTimes(2); + + const overloadedResult = await runRetryScenario({ + id: "oneshot-overloaded-retry", + deleteAfterRun: false, + firstError: + "All models failed (2): anthropic/claude-3-5-sonnet: LLM error overloaded_error: overloaded (overloaded); openai/gpt-5.3-codex: LLM error overloaded_error: overloaded (overloaded)", + }); + const overloadedJob = overloadedResult.state.store?.jobs.find( + (j) => j.id === "oneshot-overloaded-retry", + ); + expect(overloadedJob).toBeDefined(); + expect(overloadedJob!.state.lastStatus).toBe("ok"); + expect(overloadedResult.runIsolatedAgentJob).toHaveBeenCalledTimes(2); }); it("#24355: one-shot job disabled after max transient retries", async () => { @@ -735,6 +752,54 @@ describe("Cron issue regressions", () => { expect(runIsolatedAgentJob).toHaveBeenCalledTimes(3); }); + it("#24355: one-shot job retries status-only 529 failures when retryOn only includes overloaded", async () => { + const store = makeStorePath(); + const scheduledAt = Date.parse("2026-02-06T10:00:00.000Z"); + + const cronJob = createIsolatedRegressionJob({ + id: "oneshot-overloaded-529-only", + name: "reminder", + scheduledAt, + schedule: { kind: "at", at: new Date(scheduledAt).toISOString() }, + payload: { kind: "agentTurn", message: "remind me" }, + state: { nextRunAtMs: scheduledAt }, + }); + await writeCronJobs(store.storePath, [cronJob]); + + let now = scheduledAt; + const runIsolatedAgentJob = vi + .fn() + .mockResolvedValueOnce({ status: "error", error: "FailoverError: HTTP 529" }) + .mockResolvedValueOnce({ status: "ok", summary: "done" }); + const state = createCronServiceState({ + cronEnabled: true, + storePath: store.storePath, + log: noopLogger, + 
nowMs: () => now, + enqueueSystemEvent: vi.fn(), + requestHeartbeatNow: vi.fn(), + runIsolatedAgentJob, + cronConfig: { + retry: { maxAttempts: 1, backoffMs: [1000], retryOn: ["overloaded"] }, + }, + }); + + await onTimer(state); + const jobAfterRetry = state.store?.jobs.find((j) => j.id === "oneshot-overloaded-529-only"); + expect(jobAfterRetry).toBeDefined(); + expect(jobAfterRetry!.enabled).toBe(true); + expect(jobAfterRetry!.state.lastStatus).toBe("error"); + expect(jobAfterRetry!.state.nextRunAtMs).toBeGreaterThan(scheduledAt); + + now = (jobAfterRetry!.state.nextRunAtMs ?? now) + 1; + await onTimer(state); + + const finishedJob = state.store?.jobs.find((j) => j.id === "oneshot-overloaded-529-only"); + expect(finishedJob).toBeDefined(); + expect(finishedJob!.state.lastStatus).toBe("ok"); + expect(runIsolatedAgentJob).toHaveBeenCalledTimes(2); + }); + it("#24355: one-shot job disabled immediately on permanent error", async () => { const store = makeStorePath(); const scheduledAt = Date.parse("2026-02-06T10:00:00.000Z"); diff --git a/src/cron/service/timer.ts b/src/cron/service/timer.ts index 8d1d40024ed..8502f3b6fe8 100644 --- a/src/cron/service/timer.ts +++ b/src/cron/service/timer.ts @@ -120,6 +120,8 @@ const DEFAULT_MAX_TRANSIENT_RETRIES = 3; const TRANSIENT_PATTERNS: Record = { rate_limit: /(rate[_ ]limit|too many requests|429|resource has been exhausted|cloudflare)/i, + overloaded: + /\b529\b|\boverloaded(?:_error)?\b|high demand|temporar(?:ily|y) overloaded|capacity exceeded/i, network: /(network|econnreset|econnrefused|fetch failed|socket)/i, timeout: /(timeout|etimedout)/i, server_error: /\b5\d{2}\b/, diff --git a/src/discord/monitor/auto-presence.test.ts b/src/discord/monitor/auto-presence.test.ts index 0065ed77be7..b5a83d5242d 100644 --- a/src/discord/monitor/auto-presence.test.ts +++ b/src/discord/monitor/auto-presence.test.ts @@ -50,6 +50,26 @@ describe("discord auto presence", () => { expect(decision?.presence.activities[0]?.state).toBe("token 
exhausted"); }); + it("treats overloaded cooldown as exhausted", () => { + const now = Date.now(); + const decision = resolveDiscordAutoPresenceDecision({ + discordConfig: { + autoPresence: { + enabled: true, + exhaustedText: "token exhausted", + }, + }, + authStore: createStore({ cooldownUntil: now + 60_000, failureCounts: { overloaded: 2 } }), + gatewayConnected: true, + now, + }); + + expect(decision).toBeTruthy(); + expect(decision?.state).toBe("exhausted"); + expect(decision?.presence.status).toBe("dnd"); + expect(decision?.presence.activities[0]?.state).toBe("token exhausted"); + }); + it("recovers from exhausted to online once a profile becomes usable", () => { let now = Date.now(); let store = createStore({ cooldownUntil: now + 60_000, failureCounts: { rate_limit: 1 } }); diff --git a/src/discord/monitor/auto-presence.ts b/src/discord/monitor/auto-presence.ts index 74bdcab3617..8c139382dc6 100644 --- a/src/discord/monitor/auto-presence.ts +++ b/src/discord/monitor/auto-presence.ts @@ -104,6 +104,7 @@ function isExhaustedUnavailableReason(reason: AuthProfileFailureReason | null): } return ( reason === "rate_limit" || + reason === "overloaded" || reason === "billing" || reason === "auth" || reason === "auth_permanent" diff --git a/src/test-utils/model-fallback.mock.ts b/src/test-utils/model-fallback.mock.ts index 21053e2466e..4431db3db96 100644 --- a/src/test-utils/model-fallback.mock.ts +++ b/src/test-utils/model-fallback.mock.ts @@ -4,7 +4,7 @@ export async function runWithModelFallback(params: { run: ( provider: string, model: string, - options?: { allowRateLimitCooldownProbe?: boolean }, + options?: { allowTransientCooldownProbe?: boolean }, ) => Promise; }) { return { From a1902209671daa316c3192214d2354ccfb1db081 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 6 Mar 2026 17:45:35 -0500 Subject: [PATCH 13/16] Tests: serialize low-memory test runner lanes --- scripts/test-parallel.mjs | 51 +++++++++++++++++++++++++++++---------- 1 file changed, 38 
insertions(+), 13 deletions(-) diff --git a/scripts/test-parallel.mjs b/scripts/test-parallel.mjs index 176737d7be3..d524fb87438 100644 --- a/scripts/test-parallel.mjs +++ b/scripts/test-parallel.mjs @@ -111,8 +111,17 @@ const useVmForks = const disableIsolation = process.env.OPENCLAW_TEST_NO_ISOLATE === "1"; const includeGatewaySuite = process.env.OPENCLAW_TEST_INCLUDE_GATEWAY === "1"; const includeExtensionsSuite = process.env.OPENCLAW_TEST_INCLUDE_EXTENSIONS === "1"; +const rawTestProfile = process.env.OPENCLAW_TEST_PROFILE?.trim().toLowerCase(); +const testProfile = + rawTestProfile === "low" || + rawTestProfile === "max" || + rawTestProfile === "normal" || + rawTestProfile === "serial" + ? rawTestProfile + : "normal"; +const shouldSplitUnitRuns = testProfile !== "low" && testProfile !== "serial"; const runs = [ - ...(useVmForks + ...(shouldSplitUnitRuns ? [ { name: "unit-fast", @@ -121,7 +130,7 @@ const runs = [ "run", "--config", "vitest.unit.config.ts", - "--pool=vmForks", + `--pool=${useVmForks ? "vmForks" : "forks"}`, ...(disableIsolation ? ["--isolate=false"] : []), ...unitIsolatedFiles.flatMap((file) => ["--exclude", file]), ], @@ -141,7 +150,14 @@ const runs = [ : [ { name: "unit", - args: ["vitest", "run", "--config", "vitest.unit.config.ts"], + args: [ + "vitest", + "run", + "--config", + "vitest.unit.config.ts", + `--pool=${useVmForks ? "vmForks" : "forks"}`, + ...(disableIsolation ? ["--isolate=false"] : []), + ], }, ]), ...(includeExtensionsSuite @@ -207,14 +223,7 @@ const silentArgs = const rawPassthroughArgs = process.argv.slice(2); const passthroughArgs = rawPassthroughArgs[0] === "--" ? rawPassthroughArgs.slice(1) : rawPassthroughArgs; -const rawTestProfile = process.env.OPENCLAW_TEST_PROFILE?.trim().toLowerCase(); -const testProfile = - rawTestProfile === "low" || - rawTestProfile === "max" || - rawTestProfile === "normal" || - rawTestProfile === "serial" - ? 
rawTestProfile - : "normal"; +const topLevelParallelEnabled = testProfile !== "low" && testProfile !== "serial"; const overrideWorkers = Number.parseInt(process.env.OPENCLAW_TEST_WORKERS ?? "", 10); const resolvedOverride = Number.isFinite(overrideWorkers) && overrideWorkers > 0 ? overrideWorkers : null; @@ -399,6 +408,23 @@ const run = async (entry) => { return 0; }; +const runEntries = async (entries) => { + if (topLevelParallelEnabled) { + const codes = await Promise.all(entries.map(run)); + return codes.find((code) => code !== 0); + } + + for (const entry of entries) { + // eslint-disable-next-line no-await-in-loop + const code = await run(entry); + if (code !== 0) { + return code; + } + } + + return undefined; +}; + const shutdown = (signal) => { for (const child of children) { child.kill(signal); @@ -451,8 +477,7 @@ if (passthroughArgs.length > 0) { process.exit(Number(code) || 0); } -const parallelCodes = await Promise.all(parallelRuns.map(run)); -const failedParallel = parallelCodes.find((code) => code !== 0); +const failedParallel = await runEntries(parallelRuns); if (failedParallel !== undefined) { process.exit(failedParallel); } From 455430a6f8dd4767612b8d708db9dc21369de36e Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 6 Mar 2026 17:53:02 -0500 Subject: [PATCH 14/16] Dead code: remove unused helper modules (#38318) * Dead code: remove unused provider runtime policy helper * Dead code: remove unused shared env writer * Dead code: remove unused auth store path collector --- src/config/runtime-group-policy-provider.ts | 19 -------- src/infra/env-file.ts | 54 --------------------- src/secrets/auth-store-paths.ts | 36 -------------- 3 files changed, 109 deletions(-) delete mode 100644 src/config/runtime-group-policy-provider.ts delete mode 100644 src/infra/env-file.ts delete mode 100644 src/secrets/auth-store-paths.ts diff --git a/src/config/runtime-group-policy-provider.ts b/src/config/runtime-group-policy-provider.ts deleted file mode 100644 index 
887f35c3a0e..00000000000 --- a/src/config/runtime-group-policy-provider.ts +++ /dev/null @@ -1,19 +0,0 @@ -import { resolveRuntimeGroupPolicy } from "./runtime-group-policy.js"; -import type { GroupPolicy } from "./types.base.js"; - -export function resolveProviderRuntimeGroupPolicy(params: { - providerConfigPresent: boolean; - groupPolicy?: GroupPolicy; - defaultGroupPolicy?: GroupPolicy; -}): { - groupPolicy: GroupPolicy; - providerMissingFallbackApplied: boolean; -} { - return resolveRuntimeGroupPolicy({ - providerConfigPresent: params.providerConfigPresent, - groupPolicy: params.groupPolicy, - defaultGroupPolicy: params.defaultGroupPolicy, - configuredFallbackPolicy: "open", - missingProviderFallbackPolicy: "allowlist", - }); -} diff --git a/src/infra/env-file.ts b/src/infra/env-file.ts deleted file mode 100644 index 525af40bbae..00000000000 --- a/src/infra/env-file.ts +++ /dev/null @@ -1,54 +0,0 @@ -import fs from "node:fs"; -import path from "node:path"; -import { escapeRegExp, resolveConfigDir } from "../utils.js"; - -export function upsertSharedEnvVar(params: { - key: string; - value: string; - env?: NodeJS.ProcessEnv; -}): { path: string; updated: boolean; created: boolean } { - const env = params.env ?? process.env; - const dir = resolveConfigDir(env); - const filepath = path.join(dir, ".env"); - const key = params.key.trim(); - const value = params.value; - - let raw = ""; - if (fs.existsSync(filepath)) { - raw = fs.readFileSync(filepath, "utf8"); - } - - const lines = raw.length ? raw.split(/\r?\n/) : []; - const matcher = new RegExp(`^(\\s*(?:export\\s+)?)${escapeRegExp(key)}\\s*=`); - let updated = false; - let replaced = false; - - const nextLines = lines.map((line) => { - const match = line.match(matcher); - if (!match) { - return line; - } - replaced = true; - const prefix = match[1] ?? 
""; - const next = `${prefix}${key}=${value}`; - if (next !== line) { - updated = true; - } - return next; - }); - - if (!replaced) { - nextLines.push(`${key}=${value}`); - updated = true; - } - - if (!fs.existsSync(dir)) { - fs.mkdirSync(dir, { recursive: true, mode: 0o700 }); - } - - const output = `${nextLines.join("\n")}\n`; - fs.writeFileSync(filepath, output, "utf8"); - fs.chmodSync(filepath, 0o600); - - return { path: filepath, updated, created: !raw }; -} diff --git a/src/secrets/auth-store-paths.ts b/src/secrets/auth-store-paths.ts deleted file mode 100644 index 12fe01dda4d..00000000000 --- a/src/secrets/auth-store-paths.ts +++ /dev/null @@ -1,36 +0,0 @@ -import fs from "node:fs"; -import path from "node:path"; -import { listAgentIds, resolveAgentDir } from "../agents/agent-scope.js"; -import { resolveAuthStorePath } from "../agents/auth-profiles/paths.js"; -import type { OpenClawConfig } from "../config/config.js"; -import { resolveUserPath } from "../utils.js"; - -export function collectAuthStorePaths(config: OpenClawConfig, stateDir: string): string[] { - const paths = new Set(); - // Scope default auth store discovery to the provided stateDir instead of - // ambient process env, so callers do not touch unrelated host-global stores. 
- paths.add(path.join(resolveUserPath(stateDir), "agents", "main", "agent", "auth-profiles.json")); - - const agentsRoot = path.join(resolveUserPath(stateDir), "agents"); - if (fs.existsSync(agentsRoot)) { - for (const entry of fs.readdirSync(agentsRoot, { withFileTypes: true })) { - if (!entry.isDirectory()) { - continue; - } - paths.add(path.join(agentsRoot, entry.name, "agent", "auth-profiles.json")); - } - } - - for (const agentId of listAgentIds(config)) { - if (agentId === "main") { - paths.add( - path.join(resolveUserPath(stateDir), "agents", "main", "agent", "auth-profiles.json"), - ); - continue; - } - const agentDir = resolveAgentDir(config, agentId); - paths.add(resolveUserPath(resolveAuthStorePath(agentDir))); - } - - return [...paths]; -} From 03b9abab84865122a27300e669c4afc1982ae394 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Efe=20B=C3=BCken?= Date: Sat, 7 Mar 2026 01:57:15 +0300 Subject: [PATCH 15/16] feat(compaction): make post-compaction context sections configurable (#34556) Merged via squash. Prepared head SHA: 491bb28544b2e0d3563dd1c78593ed2d829d65f6 Co-authored-by: efe-arv <259833796+efe-arv@users.noreply.github.com> Co-authored-by: jalehman <550978+jalehman@users.noreply.github.com> Reviewed-by: @jalehman --- CHANGELOG.md | 1 + docs/gateway/configuration-reference.md | 2 + .../reply/post-compaction-context.test.ts | 190 ++++++++++++++---- .../reply/post-compaction-context.ts | 91 ++++++++- src/config/schema.help.quality.test.ts | 6 + src/config/schema.help.ts | 2 + src/config/schema.labels.ts | 1 + src/config/types.agent-defaults.ts | 6 + src/config/zod-schema.agent-defaults.ts | 1 + 9 files changed, 247 insertions(+), 53 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1840fd3cde2..8fc4a7cd81b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ Docs: https://docs.openclaw.ai - CLI: make read-only SecretRef status flows degrade safely (#37023) thanks @joshavant. 
- Docker/Podman extension dependency baking: add `OPENCLAW_EXTENSIONS` so container builds can preinstall selected bundled extension npm dependencies into the image for faster and more reproducible startup in container deployments. (#32223) Thanks @sallyom. - Onboarding/web search: add provider selection step and full provider list in configure wizard, with SecretRef ref-mode support during onboarding. (#34009) Thanks @kesku and @thewilloftheshadow. +- Agents/compaction post-context configurability: add `agents.defaults.compaction.postCompactionSections` so deployments can choose which `AGENTS.md` sections are re-injected after compaction, while preserving legacy fallback behavior when the documented default pair is configured in any order. (#34556) thanks @efe-arv. ### Breaking diff --git a/docs/gateway/configuration-reference.md b/docs/gateway/configuration-reference.md index 30559b5d55d..749b0d2b261 100644 --- a/docs/gateway/configuration-reference.md +++ b/docs/gateway/configuration-reference.md @@ -1003,6 +1003,7 @@ Periodic heartbeat runs. reserveTokensFloor: 24000, identifierPolicy: "strict", // strict | off | custom identifierInstructions: "Preserve deployment IDs, ticket IDs, and host:port pairs exactly.", // used when identifierPolicy=custom + postCompactionSections: ["Session Startup", "Red Lines"], // [] disables reinjection memoryFlush: { enabled: true, softThresholdTokens: 6000, @@ -1018,6 +1019,7 @@ Periodic heartbeat runs. - `mode`: `default` or `safeguard` (chunked summarization for long histories). See [Compaction](/concepts/compaction). - `identifierPolicy`: `strict` (default), `off`, or `custom`. `strict` prepends built-in opaque identifier retention guidance during compaction summarization. - `identifierInstructions`: optional custom identifier-preservation text used when `identifierPolicy=custom`. +- `postCompactionSections`: optional AGENTS.md H2/H3 section names to re-inject after compaction. 
Defaults to `["Session Startup", "Red Lines"]`; set `[]` to disable reinjection. When unset or explicitly set to that default pair, older `Every Session`/`Safety` headings are also accepted as a legacy fallback. - `memoryFlush`: silent agentic turn before auto-compaction to store durable memories. Skipped when workspace is read-only. ### `agents.defaults.contextPruning` diff --git a/src/auto-reply/reply/post-compaction-context.test.ts b/src/auto-reply/reply/post-compaction-context.test.ts index 34da43f2e7e..0c97df4d50b 100644 --- a/src/auto-reply/reply/post-compaction-context.test.ts +++ b/src/auto-reply/reply/post-compaction-context.test.ts @@ -228,56 +228,162 @@ Read WORKFLOW.md on startup. expect(result).toContain("Current time:"); }); - it("falls back to legacy section names (Every Session / Safety)", async () => { - const content = `# Rules + // ------------------------------------------------------------------------- + // postCompactionSections config + // ------------------------------------------------------------------------- + describe("agents.defaults.compaction.postCompactionSections", () => { + it("uses default sections (Session Startup + Red Lines) when config is not set", async () => { + const content = `## Session Startup\n\nDo startup.\n\n## Red Lines\n\nDo not break.\n\n## Other\n\nIgnore.\n`; + fs.writeFileSync(path.join(tmpDir, "AGENTS.md"), content); + const result = await readPostCompactionContext(tmpDir); + expect(result).toContain("Session Startup"); + expect(result).toContain("Red Lines"); + expect(result).not.toContain("Other"); + }); -## Every Session + it("uses custom section names from config instead of defaults", async () => { + const content = `## Session Startup\n\nDo startup.\n\n## Critical Rules\n\nMy custom rules.\n\n## Red Lines\n\nDefault section.\n`; + fs.writeFileSync(path.join(tmpDir, "AGENTS.md"), content); + const cfg = { + agents: { + defaults: { + compaction: { postCompactionSections: ["Critical Rules"] }, + }, + }, + } 
as OpenClawConfig; + const result = await readPostCompactionContext(tmpDir, cfg); + expect(result).not.toBeNull(); + expect(result).toContain("Critical Rules"); + expect(result).toContain("My custom rules"); + // Default sections must not be included when overridden + expect(result).not.toContain("Do startup"); + expect(result).not.toContain("Default section"); + }); -Read SOUL.md and USER.md. + it("supports multiple custom section names", async () => { + const content = `## Onboarding\n\nOnboard things.\n\n## Safety\n\nSafe things.\n\n## Noise\n\nIgnore.\n`; + fs.writeFileSync(path.join(tmpDir, "AGENTS.md"), content); + const cfg = { + agents: { + defaults: { + compaction: { postCompactionSections: ["Onboarding", "Safety"] }, + }, + }, + } as OpenClawConfig; + const result = await readPostCompactionContext(tmpDir, cfg); + expect(result).not.toBeNull(); + expect(result).toContain("Onboard things"); + expect(result).toContain("Safe things"); + expect(result).not.toContain("Ignore"); + }); -## Safety + it("returns null when postCompactionSections is explicitly set to [] (opt-out)", async () => { + const content = `## Session Startup\n\nDo startup.\n\n## Red Lines\n\nDo not break.\n`; + fs.writeFileSync(path.join(tmpDir, "AGENTS.md"), content); + const cfg = { + agents: { + defaults: { + compaction: { postCompactionSections: [] }, + }, + }, + } as OpenClawConfig; + const result = await readPostCompactionContext(tmpDir, cfg); + // Empty array = opt-out: no post-compaction context injection + expect(result).toBeNull(); + }); -Don't exfiltrate private data. 
+ it("returns null when custom sections are configured but none found in AGENTS.md", async () => { + const content = `## Session Startup\n\nDo startup.\n`; + fs.writeFileSync(path.join(tmpDir, "AGENTS.md"), content); + const cfg = { + agents: { + defaults: { + compaction: { postCompactionSections: ["Nonexistent Section"] }, + }, + }, + } as OpenClawConfig; + const result = await readPostCompactionContext(tmpDir, cfg); + expect(result).toBeNull(); + }); -## Other + it("does NOT reference 'Session Startup' in prose when custom sections are configured", async () => { + // Greptile review finding: hardcoded prose mentioned "Execute your Session Startup + // sequence now" even when custom section names were configured, causing agents to + // look for a non-existent section. Prose must adapt to the configured section names. + const content = `## Boot Sequence\n\nDo custom boot things.\n`; + fs.writeFileSync(path.join(tmpDir, "AGENTS.md"), content); + const cfg = { + agents: { + defaults: { + compaction: { postCompactionSections: ["Boot Sequence"] }, + }, + }, + } as OpenClawConfig; + const result = await readPostCompactionContext(tmpDir, cfg); + expect(result).not.toBeNull(); + // Must not reference the hardcoded default section name + expect(result).not.toContain("Session Startup"); + // Must reference the actual configured section names + expect(result).toContain("Boot Sequence"); + }); -Ignore this. 
-`; - fs.writeFileSync(path.join(tmpDir, "AGENTS.md"), content); - const result = await readPostCompactionContext(tmpDir); - expect(result).not.toBeNull(); - expect(result).toContain("Every Session"); - expect(result).toContain("Read SOUL.md"); - expect(result).toContain("Safety"); - expect(result).toContain("Don't exfiltrate"); - expect(result).not.toContain("Other"); - }); + it("uses default 'Session Startup' prose when default sections are active", async () => { + const content = `## Session Startup\n\nDo startup.\n`; + fs.writeFileSync(path.join(tmpDir, "AGENTS.md"), content); + const result = await readPostCompactionContext(tmpDir); + expect(result).not.toBeNull(); + expect(result).toContain("Execute your Session Startup sequence now"); + }); - it("prefers new section names over legacy when both exist", async () => { - const content = `# Rules + it("falls back to legacy sections when defaults are explicitly configured", async () => { + // Older AGENTS.md templates use "Every Session" / "Safety" instead of + // "Session Startup" / "Red Lines". Explicitly setting the defaults should + // still trigger the legacy fallback — same behavior as leaving the field unset. 
+ const content = `## Every Session\n\nDo startup things.\n\n## Safety\n\nBe safe.\n`; + fs.writeFileSync(path.join(tmpDir, "AGENTS.md"), content); + const cfg = { + agents: { + defaults: { + compaction: { postCompactionSections: ["Session Startup", "Red Lines"] }, + }, + }, + } as OpenClawConfig; + const result = await readPostCompactionContext(tmpDir, cfg); + expect(result).not.toBeNull(); + expect(result).toContain("Do startup things"); + expect(result).toContain("Be safe"); + }); -## Session Startup + it("falls back to legacy sections when default sections are configured in a different order", async () => { + const content = `## Every Session\n\nDo startup things.\n\n## Safety\n\nBe safe.\n`; + fs.writeFileSync(path.join(tmpDir, "AGENTS.md"), content); + const cfg = { + agents: { + defaults: { + compaction: { postCompactionSections: ["Red Lines", "Session Startup"] }, + }, + }, + } as OpenClawConfig; + const result = await readPostCompactionContext(tmpDir, cfg); + expect(result).not.toBeNull(); + expect(result).toContain("Do startup things"); + expect(result).toContain("Be safe"); + expect(result).toContain("Execute your Session Startup sequence now"); + }); -New startup instructions. - -## Every Session - -Old startup instructions. - -## Red Lines - -New red lines. - -## Safety - -Old safety rules. 
-`; - fs.writeFileSync(path.join(tmpDir, "AGENTS.md"), content); - const result = await readPostCompactionContext(tmpDir); - expect(result).not.toBeNull(); - expect(result).toContain("New startup instructions"); - expect(result).toContain("New red lines"); - expect(result).not.toContain("Old startup instructions"); - expect(result).not.toContain("Old safety rules"); + it("custom section names are matched case-insensitively", async () => { + const content = `## WORKFLOW INIT\n\nInit things.\n`; + fs.writeFileSync(path.join(tmpDir, "AGENTS.md"), content); + const cfg = { + agents: { + defaults: { + compaction: { postCompactionSections: ["workflow init"] }, + }, + }, + } as OpenClawConfig; + const result = await readPostCompactionContext(tmpDir, cfg); + expect(result).not.toBeNull(); + expect(result).toContain("Init things"); + }); }); }); diff --git a/src/auto-reply/reply/post-compaction-context.ts b/src/auto-reply/reply/post-compaction-context.ts index 9a326b59323..316ac3c29b1 100644 --- a/src/auto-reply/reply/post-compaction-context.ts +++ b/src/auto-reply/reply/post-compaction-context.ts @@ -6,6 +6,37 @@ import type { OpenClawConfig } from "../../config/config.js"; import { openBoundaryFile } from "../../infra/boundary-file-read.js"; const MAX_CONTEXT_CHARS = 3000; +const DEFAULT_POST_COMPACTION_SECTIONS = ["Session Startup", "Red Lines"]; +const LEGACY_POST_COMPACTION_SECTIONS = ["Every Session", "Safety"]; + +// Compare configured section names as a case-insensitive set so deployments can +// pin the documented defaults in any order without changing fallback semantics. +function matchesSectionSet(sectionNames: string[], expectedSections: string[]): boolean { + if (sectionNames.length !== expectedSections.length) { + return false; + } + + const counts = new Map(); + for (const name of expectedSections) { + const normalized = name.trim().toLowerCase(); + counts.set(normalized, (counts.get(normalized) ?? 
0) + 1); + } + + for (const name of sectionNames) { + const normalized = name.trim().toLowerCase(); + const count = counts.get(normalized); + if (!count) { + return false; + } + if (count === 1) { + counts.delete(normalized); + } else { + counts.set(normalized, count - 1); + } + } + + return counts.size === 0; +} function formatDateStamp(nowMs: number, timezone: string): string { const parts = new Intl.DateTimeFormat("en-US", { @@ -53,19 +84,39 @@ export async function readPostCompactionContext( } })(); - // Extract "## Session Startup" and "## Red Lines" sections. - // Also accept legacy names "Every Session" and "Safety" for backward - // compatibility with older AGENTS.md templates. - // Each section ends at the next "## " heading or end of file - let sections = extractSections(content, ["Session Startup", "Red Lines"]); - if (sections.length === 0) { - sections = extractSections(content, ["Every Session", "Safety"]); + // Extract configured sections from AGENTS.md (default: Session Startup + Red Lines). + // An explicit empty array disables post-compaction context injection entirely. + const configuredSections = cfg?.agents?.defaults?.compaction?.postCompactionSections; + const sectionNames = Array.isArray(configuredSections) + ? configuredSections + : DEFAULT_POST_COMPACTION_SECTIONS; + + if (sectionNames.length === 0) { + return null; + } + + const foundSectionNames: string[] = []; + let sections = extractSections(content, sectionNames, foundSectionNames); + + // Fall back to legacy section names ("Every Session" / "Safety") when using + // defaults and the current headings aren't found — preserves compatibility + // with older AGENTS.md templates. The fallback also applies when the user + // explicitly configures the default pair, so that pinning the documented + // defaults never silently changes behavior vs. leaving the field unset. 
+ const isDefaultSections = + !Array.isArray(configuredSections) || + matchesSectionSet(configuredSections, DEFAULT_POST_COMPACTION_SECTIONS); + if (sections.length === 0 && isDefaultSections) { + sections = extractSections(content, LEGACY_POST_COMPACTION_SECTIONS, foundSectionNames); } if (sections.length === 0) { return null; } + // Only reference section names that were actually found and injected. + const displayNames = foundSectionNames.length > 0 ? foundSectionNames : sectionNames; + const resolvedNowMs = nowMs ?? Date.now(); const timezone = resolveUserTimezone(cfg?.agents?.defaults?.userTimezone); const dateStamp = formatDateStamp(resolvedNowMs, timezone); @@ -79,11 +130,24 @@ export async function readPostCompactionContext( ? combined.slice(0, MAX_CONTEXT_CHARS) + "\n...[truncated]..." : combined; + // When using the default section set, use precise prose that names the + // "Session Startup" sequence explicitly. When custom sections are configured, + // use generic prose — referencing a hardcoded "Session Startup" sequence + // would be misleading for deployments that use different section names. + const prose = isDefaultSections + ? "Session was just compacted. The conversation summary above is a hint, NOT a substitute for your startup sequence. " + + "Execute your Session Startup sequence now — read the required files before responding to the user." + : `Session was just compacted. The conversation summary above is a hint, NOT a substitute for your full startup sequence. ` + + `Re-read the sections injected below (${displayNames.join(", ")}) and follow your configured startup procedure before responding to the user.`; + + const sectionLabel = isDefaultSections + ? "Critical rules from AGENTS.md:" + : `Injected sections from AGENTS.md (${displayNames.join(", ")}):`; + return ( "[Post-compaction context refresh]\n\n" + - "Session was just compacted. The conversation summary above is a hint, NOT a substitute for your startup sequence. 
" + - "Execute your Session Startup sequence now — read the required files before responding to the user.\n\n" + - `Critical rules from AGENTS.md:\n\n${safeContent}\n\n${timeLine}` + `${prose}\n\n` + + `${sectionLabel}\n\n${safeContent}\n\n${timeLine}` ); } catch { return null; @@ -96,7 +160,11 @@ export async function readPostCompactionContext( * Skips content inside fenced code blocks. * Captures until the next heading of same or higher level, or end of string. */ -export function extractSections(content: string, sectionNames: string[]): string[] { +export function extractSections( + content: string, + sectionNames: string[], + foundNames?: string[], +): string[] { const results: string[] = []; const lines = content.split("\n"); @@ -157,6 +225,7 @@ export function extractSections(content: string, sectionNames: string[]): string if (sectionLines.length > 0) { results.push(sectionLines.join("\n").trim()); + foundNames?.push(name); } } diff --git a/src/config/schema.help.quality.test.ts b/src/config/schema.help.quality.test.ts index 146ffc17101..2ef7d8aae3a 100644 --- a/src/config/schema.help.quality.test.ts +++ b/src/config/schema.help.quality.test.ts @@ -375,6 +375,7 @@ const TARGET_KEYS = [ "agents.defaults.compaction.qualityGuard", "agents.defaults.compaction.qualityGuard.enabled", "agents.defaults.compaction.qualityGuard.maxRetries", + "agents.defaults.compaction.postCompactionSections", "agents.defaults.compaction.memoryFlush", "agents.defaults.compaction.memoryFlush.enabled", "agents.defaults.compaction.memoryFlush.softThresholdTokens", @@ -795,6 +796,11 @@ describe("config help copy quality", () => { expect(identifierPolicy.includes('"off"')).toBe(true); expect(identifierPolicy.includes('"custom"')).toBe(true); + const postCompactionSections = FIELD_HELP["agents.defaults.compaction.postCompactionSections"]; + expect(/Session Startup|Red Lines/i.test(postCompactionSections)).toBe(true); + expect(/Every 
Session|Safety/i.test(postCompactionSections)).toBe(true); + expect(/\[\]|disable/i.test(postCompactionSections)).toBe(true); + const flush = FIELD_HELP["agents.defaults.compaction.memoryFlush.enabled"]; expect(/pre-compaction|memory flush|token/i.test(flush)).toBe(true); }); diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts index f2ef2ff4ab8..ee760f2d23f 100644 --- a/src/config/schema.help.ts +++ b/src/config/schema.help.ts @@ -1003,6 +1003,8 @@ export const FIELD_HELP: Record = { "Enables summary quality audits and regeneration retries for safeguard compaction. Default: false, so safeguard mode alone does not turn on retry behavior.", "agents.defaults.compaction.qualityGuard.maxRetries": "Maximum number of regeneration retries after a failed safeguard summary quality audit. Use small values to bound extra latency and token cost.", + "agents.defaults.compaction.postCompactionSections": + 'AGENTS.md H2/H3 section names re-injected after compaction so the agent reruns critical startup guidance. Leave unset to use "Session Startup"/"Red Lines" with legacy fallback to "Every Session"/"Safety"; set to [] to disable reinjection entirely.', "agents.defaults.compaction.memoryFlush": "Pre-compaction memory flush settings that run an agentic memory write before heavy compaction. 
Keep enabled for long sessions so salient context is persisted before aggressive trimming.", "agents.defaults.compaction.memoryFlush.enabled": diff --git a/src/config/schema.labels.ts b/src/config/schema.labels.ts index 64d444aab47..a5fec8dadcf 100644 --- a/src/config/schema.labels.ts +++ b/src/config/schema.labels.ts @@ -454,6 +454,7 @@ export const FIELD_LABELS: Record = { "agents.defaults.compaction.qualityGuard": "Compaction Quality Guard", "agents.defaults.compaction.qualityGuard.enabled": "Compaction Quality Guard Enabled", "agents.defaults.compaction.qualityGuard.maxRetries": "Compaction Quality Guard Max Retries", + "agents.defaults.compaction.postCompactionSections": "Post-Compaction Context Sections", "agents.defaults.compaction.memoryFlush": "Compaction Memory Flush", "agents.defaults.compaction.memoryFlush.enabled": "Compaction Memory Flush Enabled", "agents.defaults.compaction.memoryFlush.softThresholdTokens": diff --git a/src/config/types.agent-defaults.ts b/src/config/types.agent-defaults.ts index 6ceba822362..a7c40a5016b 100644 --- a/src/config/types.agent-defaults.ts +++ b/src/config/types.agent-defaults.ts @@ -314,6 +314,12 @@ export type AgentCompactionConfig = { qualityGuard?: AgentCompactionQualityGuardConfig; /** Pre-compaction memory flush (agentic turn). Default: enabled. */ memoryFlush?: AgentCompactionMemoryFlushConfig; + /** + * H2/H3 section names from AGENTS.md to inject after compaction. + * Defaults to ["Session Startup", "Red Lines"] when unset. + * Set to [] to disable post-compaction context injection entirely. 
+ */ + postCompactionSections?: string[]; }; export type AgentCompactionMemoryFlushConfig = { diff --git a/src/config/zod-schema.agent-defaults.ts b/src/config/zod-schema.agent-defaults.ts index 276f97f586d..7c43a5a382d 100644 --- a/src/config/zod-schema.agent-defaults.ts +++ b/src/config/zod-schema.agent-defaults.ts @@ -102,6 +102,7 @@ export const AgentDefaultsSchema = z }) .strict() .optional(), + postCompactionSections: z.array(z.string()).optional(), memoryFlush: z .object({ enabled: z.boolean().optional(), From ae96a8191649c5d1d44c6e06f8503015216cd880 Mon Sep 17 00:00:00 2001 From: Drew Wagner <42811278+taw0002@users.noreply.github.com> Date: Fri, 6 Mar 2026 18:18:13 -0500 Subject: [PATCH 16/16] fix: strip skill-injected env vars from ACP harness spawn env (#36280) (#36316) * fix: strip skill-injected env vars from ACP harness spawn env Skill apiKey entries (e.g., openai-image-gen with primaryEnv=OPENAI_API_KEY) are set on process.env during agent runs and only reverted after the run completes. ACP harnesses like Codex CLI inherit these vars, causing them to silently use API billing instead of their own auth (e.g., OAuth). The fix tracks which env vars are actively injected by skill overrides in a module-level Set (activeSkillEnvKeys) and strips them in resolveAcpClientSpawnEnv() before spawning ACP child processes. 
Fixes #36280 * ACP: type spawn env for stripped keys * Skills: cover active env key lifecycle * Changelog: note ACP skill env isolation * ACP: preserve shell marker after env stripping --------- Co-authored-by: Vincent Koc --- CHANGELOG.md | 1 + src/acp/client.test.ts | 43 ++++++++++++++++++++++ src/acp/client.ts | 15 +++++++- src/agents/skills.test.ts | 3 ++ src/agents/skills/env-overrides.runtime.ts | 1 + src/agents/skills/env-overrides.ts | 15 ++++++++ 6 files changed, 76 insertions(+), 2 deletions(-) create mode 100644 src/agents/skills/env-overrides.runtime.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 8fc4a7cd81b..dd664c4dadc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -206,6 +206,7 @@ Docs: https://docs.openclaw.ai - Skills/nano-banana-pro resolution override: respect explicit `--resolution` values during image editing and only auto-detect output size from input images when the flag is omitted. (#36880) Thanks @shuofengzhang and @vincentkoc. - Skills/openai-image-gen CLI validation: validate `--background` and `--style` inputs early, normalize supported values, and warn when those flags are ignored for incompatible models. (#36762) Thanks @shuofengzhang and @vincentkoc. - Skills/openai-image-gen output formats: validate `--output-format` values early, normalize aliases like `jpg -> jpeg`, and warn when the flag is ignored for incompatible models. (#36648) Thanks @shuofengzhang and @vincentkoc. +- ACP/skill env isolation: strip skill-injected API keys from ACP harness child-process environments so tools like Codex CLI keep their own auth flow instead of inheriting billed provider keys from active skills. (#36316) Thanks @taw0002 and @vincentkoc. - WhatsApp media upload caps: make outbound media sends and auto-replies honor `channels.whatsapp.mediaMaxMb` with per-account overrides so inbound and outbound limits use the same channel config. Thanks @vincentkoc. 
- Windows/Plugin install: when OpenClaw runs on Windows via Bun and `npm-cli.js` is not colocated with the runtime binary, fall back to `npm.cmd`/`npx.cmd` through the existing `cmd.exe` wrapper so `openclaw plugins install` no longer fails with `spawn EINVAL`. (#38056) Thanks @0xlin2023. - Telegram/send retry classification: retry grammY `Network request ... failed after N attempts` envelopes in send flows without reclassifying plain `Network request ... failed!` wrappers as transient, restoring the intended retry path while keeping broad send-context message matching tight. (#38056) Thanks @0xlin2023. diff --git a/src/acp/client.test.ts b/src/acp/client.test.ts index 72958ca57c2..bb5340115a1 100644 --- a/src/acp/client.test.ts +++ b/src/acp/client.test.ts @@ -60,6 +60,49 @@ describe("resolveAcpClientSpawnEnv", () => { }); expect(env.OPENCLAW_SHELL).toBe("acp-client"); }); + + it("strips skill-injected env keys when stripKeys is provided", () => { + const stripKeys = new Set(["OPENAI_API_KEY", "ELEVENLABS_API_KEY"]); + const env = resolveAcpClientSpawnEnv( + { + PATH: "/usr/bin", + OPENAI_API_KEY: "sk-leaked-from-skill", + ELEVENLABS_API_KEY: "el-leaked", + ANTHROPIC_API_KEY: "sk-keep-this", + }, + { stripKeys }, + ); + + expect(env.PATH).toBe("/usr/bin"); + expect(env.OPENCLAW_SHELL).toBe("acp-client"); + expect(env.ANTHROPIC_API_KEY).toBe("sk-keep-this"); + expect(env.OPENAI_API_KEY).toBeUndefined(); + expect(env.ELEVENLABS_API_KEY).toBeUndefined(); + }); + + it("does not modify the original baseEnv when stripping keys", () => { + const baseEnv: NodeJS.ProcessEnv = { + OPENAI_API_KEY: "sk-original", + PATH: "/usr/bin", + }; + const stripKeys = new Set(["OPENAI_API_KEY"]); + resolveAcpClientSpawnEnv(baseEnv, { stripKeys }); + + expect(baseEnv.OPENAI_API_KEY).toBe("sk-original"); + }); + + it("preserves OPENCLAW_SHELL even when stripKeys contains it", () => { + const env = resolveAcpClientSpawnEnv( + { + OPENCLAW_SHELL: "skill-overridden", + OPENAI_API_KEY: 
"sk-leaked", + }, + { stripKeys: new Set(["OPENCLAW_SHELL", "OPENAI_API_KEY"]) }, + ); + + expect(env.OPENCLAW_SHELL).toBe("acp-client"); + expect(env.OPENAI_API_KEY).toBeUndefined(); + }); }); describe("resolveAcpClientSpawnInvocation", () => { diff --git a/src/acp/client.ts b/src/acp/client.ts index 0cf9a194d88..54be5ffc455 100644 --- a/src/acp/client.ts +++ b/src/acp/client.ts @@ -348,8 +348,16 @@ function buildServerArgs(opts: AcpClientOptions): string[] { export function resolveAcpClientSpawnEnv( baseEnv: NodeJS.ProcessEnv = process.env, + options?: { stripKeys?: ReadonlySet }, ): NodeJS.ProcessEnv { - return { ...baseEnv, OPENCLAW_SHELL: "acp-client" }; + const env: NodeJS.ProcessEnv = { ...baseEnv }; + if (options?.stripKeys) { + for (const key of options.stripKeys) { + delete env[key]; + } + } + env.OPENCLAW_SHELL = "acp-client"; + return env; } type AcpSpawnRuntime = { @@ -450,7 +458,10 @@ export async function createAcpClient(opts: AcpClientOptions = {}): Promise { try { expect(process.env.ENV_KEY).toBe("injected"); + expect(getActiveSkillEnvKeys().has("ENV_KEY")).toBe(true); } finally { restore(); expect(process.env.ENV_KEY).toBeUndefined(); + expect(getActiveSkillEnvKeys().has("ENV_KEY")).toBe(false); } }); }); diff --git a/src/agents/skills/env-overrides.runtime.ts b/src/agents/skills/env-overrides.runtime.ts new file mode 100644 index 00000000000..ab8c4b305fb --- /dev/null +++ b/src/agents/skills/env-overrides.runtime.ts @@ -0,0 +1 @@ +export { getActiveSkillEnvKeys } from "./env-overrides.js"; diff --git a/src/agents/skills/env-overrides.ts b/src/agents/skills/env-overrides.ts index 83bb559bc7c..b56d02070df 100644 --- a/src/agents/skills/env-overrides.ts +++ b/src/agents/skills/env-overrides.ts @@ -12,6 +12,19 @@ const log = createSubsystemLogger("env-overrides"); type EnvUpdate = { key: string; prev: string | undefined }; type SkillConfig = NonNullable>; +/** + * Tracks env var keys that are currently injected by skill overrides. 
+ * Used by ACP harness spawn to strip skill-injected keys so they don't + * leak to child processes (e.g., OPENAI_API_KEY leaking to Codex CLI). + * @see https://github.com/openclaw/openclaw/issues/36280 + */ +const activeSkillEnvKeys = new Set(); + +/** Returns a read-only view of the live set of env var keys currently injected by skill overrides (not a copy; later injections and reverts are visible through it). */ +export function getActiveSkillEnvKeys(): ReadonlySet { + return activeSkillEnvKeys; +} + type SanitizedSkillEnvOverrides = { allowed: Record; blocked: string[]; @@ -135,12 +148,14 @@ function applySkillConfigEnvOverrides(params: { } updates.push({ key: envKey, prev: process.env[envKey] }); process.env[envKey] = envValue; + activeSkillEnvKeys.add(envKey); } } function createEnvReverter(updates: EnvUpdate[]) { return () => { for (const update of updates) { + activeSkillEnvKeys.delete(update.key); if (update.prev === undefined) { delete process.env[update.key]; } else {