From 0a2d635e6849a856e1f53be6d5f3c5d6f8a8af2e Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 28 Apr 2026 20:56:46 +0100 Subject: [PATCH] fix(gateway): harden local reachability checks Co-authored-by: arthurianresolve Co-authored-by: codexGW <9350182+codexGW@users.noreply.github.com> --- CHANGELOG.md | 3 + docs/gateway/protocol.md | 2 +- .../discord/src/monitor/gateway-plugin.ts | 11 +- .../src/monitor/provider.proxy.test.ts | 21 ++- src/commands/status.scan.shared.test.ts | 138 ++++++++++++++++++ src/commands/status.scan.shared.ts | 100 ++++++++++++- src/gateway/handshake-timeouts.test.ts | 25 ++++ src/gateway/handshake-timeouts.ts | 2 +- 8 files changed, 290 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2f288ca45ea..54482e65d5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ Docs: https://docs.openclaw.ai - Agents/Claude CLI: reuse already-cached macOS Keychain credentials for no-prompt Claude credential reads, so doctor/runtime checks do not miss fresh interactive Claude auth. Fixes #73682. Thanks @RyanSandoval. - Agents/transcripts: strip empty assistant text blocks while preserving valid text, images, and signatures, so Anthropic-style providers no longer reject sanitized transcript turns. Fixes #73640. Thanks @jowhee327. - Providers/Bedrock: omit deprecated `temperature` for Claude Opus 4.7 Bedrock model ids, named and application inference profiles, including dotted `opus-4.7` refs, and classify the nested validation response for failover. Fixes #73663. Thanks @bstanbury. +- Gateway: raise the preauth/connect-challenge timeout to 15s so cold CLI starts on slower hosts have more time to process the WebSocket challenge before the Gateway closes the connection. Fixes #51469; refs #73592 and #62060. Thanks @GothicFox and @jackychen-png. +- CLI/status: fall back to a bounded local `status` RPC when loopback detail probes time out or report unknown capability, so reachable local gateways are no longer marked unreachable by slow read diagnostics. Fixes #48360; refs #62762, #73535, #51357, and #42019. Thanks @RacecarGuy, @justinschille, @DJBlackhawk, @tianyaqpzm, and @0xrsydn. +- Channels/Discord: give Discord Gateway WebSocket handshakes a 30s timeout so stalled TLS/network transitions emit an error and Carbon can continue its reconnect loop instead of leaving the bot silent until restart. Refs #50046. Thanks @codexGW. - NVIDIA/NIM: persist the `NVIDIA_API_KEY` provider marker and mark bundled NVIDIA Chat Completions models as string-content compatible, so NIM models load from `models.json` and OpenAI-compatible subagent calls send plain text content. Fixes #73013 and #50107; refs #73014. Thanks @bautrey, @iot2edge, @ifearghal, and @futhgar. - Channels/Discord: let text-only configs drop the `GuildVoiceStates` gateway intent and expose a bounded `/gateway/bot` metadata timeout with rate-limited fallback logs, reducing idle CPU and warning floods. Fixes #73709 and #73585. Thanks @sanchezm86 and @trac3r00. - Agents/sessions: mark same-turn `sessions_send` and A2A reply prompts with an inter-session `isUser=false` envelope before they reach the model, so foreign session output no longer lands as bare active user text. Fixes #73702; refs #73698, #73609, #73595, and #73622. Thanks @alvelda. diff --git a/docs/gateway/protocol.md b/docs/gateway/protocol.md index ce433f9b2e3..6a07dc8874a 100644 --- a/docs/gateway/protocol.md +++ b/docs/gateway/protocol.md @@ -554,7 +554,7 @@ stable across protocol v3 and are the expected baseline for third-party clients. | ----------------------------------------- | ----------------------------------------------------- | ---------------------------------------------------------- | | `PROTOCOL_VERSION` | `3` | `src/gateway/protocol/schema/protocol-schemas.ts` | | Request timeout (per RPC) | `30_000` ms | `src/gateway/client.ts` (`requestTimeoutMs`) | -| Preauth / connect-challenge timeout | `10_000` ms | `src/gateway/handshake-timeouts.ts` (clamp `250`–`10_000`) | +| Preauth / connect-challenge timeout | `15_000` ms | `src/gateway/handshake-timeouts.ts` (clamp `250`–`15_000`) | | Initial reconnect backoff | `1_000` ms | `src/gateway/client.ts` (`backoffMs`) | | Max reconnect backoff | `30_000` ms | `src/gateway/client.ts` (`scheduleReconnect`) | | Fast-retry clamp after device-token close | `250` ms | `src/gateway/client.ts` | diff --git a/extensions/discord/src/monitor/gateway-plugin.ts b/extensions/discord/src/monitor/gateway-plugin.ts index 23cceeb5c5d..8439be08999 100644 --- a/extensions/discord/src/monitor/gateway-plugin.ts +++ b/extensions/discord/src/monitor/gateway-plugin.ts @@ -25,6 +25,7 @@ const DEFAULT_DISCORD_GATEWAY_INFO_TIMEOUT_MS = 30_000; const MAX_DISCORD_GATEWAY_INFO_TIMEOUT_MS = 120_000; const DISCORD_GATEWAY_INFO_TIMEOUT_ENV = "OPENCLAW_DISCORD_GATEWAY_INFO_TIMEOUT_MS"; const DISCORD_GATEWAY_METADATA_FALLBACK_LOG_INTERVAL_MS = 60_000; +const DISCORD_GATEWAY_HANDSHAKE_TIMEOUT_MS = 30_000; type DiscordGatewayMetadataResponse = Pick; type DiscordGatewayFetchInit = Record & { @@ -36,7 +37,10 @@ type DiscordGatewayFetch = ( ) => Promise; type DiscordGatewayMetadataError = Error & { transient?: boolean }; -type DiscordGatewayWebSocketCtor = new (url: string, options?: { agent?: unknown }) => ws.WebSocket; +type DiscordGatewayWebSocketCtor = new ( + url: string, + options?: { agent?: unknown; handshakeTimeout?: number }, +) => ws.WebSocket; const registrationPromises = new WeakMap>(); const gatewayMetadataFallbackLogLastAt = new WeakMap(); type CarbonGatewayRegistrationState = { @@ -421,7 +425,10 @@ function createGatewayPlugin(params: { // close-path crashes during Discord gateway teardown; the ws transport is // already our proxy path and behaves predictably for lifecycle cleanup. const WebSocketCtor = params.testing?.webSocketCtor ?? ws.default; - const socket = new WebSocketCtor(url, params.wsAgent ? { agent: params.wsAgent } : undefined); + const socket = new WebSocketCtor(url, { + handshakeTimeout: DISCORD_GATEWAY_HANDSHAKE_TIMEOUT_MS, + ...(params.wsAgent ? { agent: params.wsAgent } : {}), + }); const emitTransportActivity = () => { if ((this as unknown as { ws?: unknown }).ws !== socket) { return; diff --git a/extensions/discord/src/monitor/provider.proxy.test.ts b/extensions/discord/src/monitor/provider.proxy.test.ts index e3f6cdc3943..87f54126cc2 100644 --- a/extensions/discord/src/monitor/provider.proxy.test.ts +++ b/extensions/discord/src/monitor/provider.proxy.test.ts @@ -108,7 +108,10 @@ vi.mock("https-proxy-agent", () => ({ })); vi.mock("ws", () => ({ - default: function MockWebSocket(url: string, options?: { agent?: unknown }) { + default: function MockWebSocket( + url: string, + options?: { agent?: unknown; handshakeTimeout?: number }, + ) { webSocketSpy(url, options); }, })); @@ -159,9 +162,15 @@ describe("createDiscordGatewayPlugin", () => { return { HttpsProxyAgentCtor: HttpsProxyAgent as unknown as typeof import("https-proxy-agent").HttpsProxyAgent, - webSocketCtor: function WebSocketCtor(url: string, options?: { agent?: unknown }) { + webSocketCtor: function WebSocketCtor( + url: string, + options?: { agent?: unknown; handshakeTimeout?: number }, + ) { webSocketSpy(url, options); - } as unknown as new (url: string, options?: { agent?: unknown }) => import("ws").WebSocket, + } as unknown as new ( + url: string, + options?: { agent?: unknown; handshakeTimeout?: number }, + ) => import("ws").WebSocket, registerClient: async (_plugin: unknown, client: unknown) => { baseRegisterClientSpy(client); }, @@ -295,7 +304,9 @@ describe("createDiscordGatewayPlugin", () => { .createWebSocket; createWebSocket("wss://gateway.discord.gg"); - expect(webSocketSpy).toHaveBeenCalledWith("wss://gateway.discord.gg", undefined); + expect(webSocketSpy).toHaveBeenCalledWith("wss://gateway.discord.gg", { + handshakeTimeout: 30_000, + }); expect(wsProxyAgentSpy).not.toHaveBeenCalled(); }); @@ -409,7 +420,7 @@ describe("createDiscordGatewayPlugin", () => { expect(wsProxyAgentSpy).toHaveBeenCalledWith("http://127.0.0.1:8080"); expect(webSocketSpy).toHaveBeenCalledWith( "wss://gateway.discord.gg", - expect.objectContaining({ agent: getLastAgent() }), + expect.objectContaining({ agent: getLastAgent(), handshakeTimeout: 30_000 }), ); expect(runtime.log).toHaveBeenCalledWith("discord: gateway proxy enabled"); expect(runtime.error).not.toHaveBeenCalled(); diff --git a/src/commands/status.scan.shared.test.ts b/src/commands/status.scan.shared.test.ts index 72ebc7c574f..6e5feb0925a 100644 --- a/src/commands/status.scan.shared.test.ts +++ b/src/commands/status.scan.shared.test.ts @@ -8,6 +8,7 @@ const mocks = vi.hoisted(() => ({ buildGatewayConnectionDetailsWithResolvers: vi.fn(), resolveGatewayProbeTarget: vi.fn(), probeGateway: vi.fn(), + callGateway: vi.fn(), resolveGatewayProbeAuthResolution: vi.fn(), pickGatewaySelfPresence: vi.fn(), })); @@ -24,6 +25,10 @@ vi.mock("../gateway/probe.js", () => ({ probeGateway: mocks.probeGateway, })); +vi.mock("../gateway/call.js", () => ({ + callGateway: mocks.callGateway, +})); + vi.mock("./status.gateway-probe.js", () => ({ resolveGatewayProbeAuthResolution: mocks.resolveGatewayProbeAuthResolution, })); @@ -50,6 +55,7 @@ describe("resolveGatewayProbeSnapshot", () => { warning: "warn", }); mocks.pickGatewaySelfPresence.mockReturnValue({ host: "box" }); + mocks.callGateway.mockRejectedValue(new Error("status rpc unavailable")); }); it("skips auth resolution and probe for missing remote urls by default", async () => { @@ -178,6 +184,138 @@ describe("resolveGatewayProbeSnapshot", () => { expect(result.gatewayReachable).toBe(true); expect(result.gatewayProbe?.error).toBe("missing scope: operator.read; warn"); }); + + it("uses a bounded local status RPC fallback when the detail probe times out", async () => { + mocks.resolveGatewayProbeTarget.mockReturnValue({ + mode: "local", + gatewayMode: "local", + remoteUrlMissing: false, + }); + mocks.probeGateway.mockResolvedValue({ + ok: false, + url: "ws://127.0.0.1:18789", + connectLatencyMs: null, + error: "timeout", + close: null, + auth: { + role: null, + scopes: [], + capability: "unknown", + }, + health: null, + status: null, + presence: null, + configSnapshot: null, + }); + mocks.callGateway.mockResolvedValue({ sessions: 1 }); + + const result = await resolveGatewayProbeSnapshot({ + cfg: {}, + opts: { + timeoutMs: 8000, + }, + }); + + expect(mocks.callGateway).toHaveBeenCalledWith( + expect.objectContaining({ + config: {}, + method: "status", + token: "tok", + password: "pw", + timeoutMs: 2000, + mode: "backend", + clientName: "gateway-client", + }), + ); + expect(mocks.callGateway.mock.calls[0]?.[0]).not.toHaveProperty("deviceIdentity"); + expect(result.gatewayReachable).toBe(true); + expect(result.gatewayProbe).toMatchObject({ + ok: true, + error: "timeout", + status: { sessions: 1 }, + auth: { capability: "read_only" }, + }); + expect(result.gatewayProbeAuthWarning).toBe("warn"); + }); + + it("lets callGateway reuse paired-device auth for local status RPC fallback", async () => { + mocks.resolveGatewayProbeTarget.mockReturnValue({ + mode: "local", + gatewayMode: "local", + remoteUrlMissing: false, + }); + mocks.resolveGatewayProbeAuthResolution.mockResolvedValue({ + auth: {}, + warning: undefined, + }); + mocks.probeGateway.mockResolvedValue({ + ok: false, + url: "ws://127.0.0.1:18789", + connectLatencyMs: null, + error: "timeout", + close: null, + auth: { + role: "operator", + scopes: ["operator.read"], + capability: "read_only", + }, + health: null, + status: null, + presence: null, + configSnapshot: null, + }); + mocks.callGateway.mockResolvedValue({ sessions: 1 }); + + const result = await resolveGatewayProbeSnapshot({ + cfg: {}, + opts: {}, + }); + + expect(mocks.callGateway).toHaveBeenCalledWith( + expect.objectContaining({ + config: {}, + method: "status", + token: undefined, + password: undefined, + mode: "backend", + clientName: "gateway-client", + }), + ); + expect(mocks.callGateway.mock.calls[0]?.[0]).not.toHaveProperty("deviceIdentity"); + expect(result.gatewayReachable).toBe(true); + }); + + it("does not use the status RPC fallback for remote probe failures", async () => { + mocks.resolveGatewayProbeTarget.mockReturnValue({ + mode: "remote", + gatewayMode: "remote", + remoteUrlMissing: false, + }); + mocks.probeGateway.mockResolvedValue({ + ok: false, + url: "wss://gateway.example/ws", + connectLatencyMs: null, + error: "timeout", + close: null, + auth: { + role: null, + scopes: [], + capability: "unknown", + }, + health: null, + status: null, + presence: null, + configSnapshot: null, + }); + + const result = await resolveGatewayProbeSnapshot({ + cfg: { gateway: { mode: "remote", remote: { url: "wss://gateway.example/ws" } } }, + opts: {}, + }); + + expect(mocks.callGateway).not.toHaveBeenCalled(); + expect(result.gatewayReachable).toBe(false); + }); }); describe("resolveSharedMemoryStatusSnapshot", () => { diff --git a/src/commands/status.scan.shared.ts b/src/commands/status.scan.shared.ts index 03f4e348235..d5c03aaf39f 100644 --- a/src/commands/status.scan.shared.ts +++ b/src/commands/status.scan.shared.ts @@ -3,9 +3,11 @@ import type { OpenClawConfig } from "../config/types.js"; import { buildGatewayConnectionDetailsWithResolvers } from "../gateway/connection-details.js"; import { normalizeControlUiBasePath } from "../gateway/control-ui-shared.js"; import { resolveGatewayProbeTarget } from "../gateway/probe-target.js"; -import type { probeGateway as probeGatewayFn } from "../gateway/probe.js"; +import type { GatewayProbeResult, probeGateway as probeGatewayFn } from "../gateway/probe.js"; +import { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES } from "../gateway/protocol/client-info.js"; import type { MemoryProviderStatus } from "../memory-host-sdk/engine-storage.js"; import { defaultSlotIdForKey } from "../plugins/slots.js"; +import { isLoopbackIpAddress } from "../shared/net/ip.js"; import { normalizeOptionalLowercaseString, normalizeOptionalString, @@ -16,6 +18,7 @@ export { pickGatewaySelfPresence } from "./gateway-presence.js"; let gatewayProbeModulePromise: Promise | undefined; let probeGatewayModulePromise: Promise | undefined; +let gatewayCallModulePromise: Promise | undefined; function loadGatewayProbeModule() { gatewayProbeModulePromise ??= import("./status.gateway-probe.js"); @@ -27,6 +30,11 @@ function loadProbeGatewayModule() { return probeGatewayModulePromise; } +function loadGatewayCallModule() { + gatewayCallModulePromise ??= import("../gateway/call.js"); + return gatewayCallModulePromise; +} + export type MemoryStatusSnapshot = MemoryProviderStatus & { agentId: string; }; @@ -70,6 +78,83 @@ type StatusMemorySearchManagerResolver = (params: { manager: StatusMemorySearchManager | null; }>; +function isLoopbackGatewayUrl(rawUrl: string): boolean { + try { + const hostname = new URL(rawUrl).hostname.toLowerCase(); + const unbracketed = + hostname.startsWith("[") && hostname.endsWith("]") ? hostname.slice(1, -1) : hostname; + return unbracketed === "localhost" || isLoopbackIpAddress(unbracketed); + } catch { + return false; + } +} + +function shouldTryLocalStatusRpcFallback(params: { + gatewayMode: "local" | "remote"; + gatewayUrl: string; + gatewayProbe: GatewayProbeResult | null; +}): params is { + gatewayMode: "local"; + gatewayUrl: string; + gatewayProbe: GatewayProbeResult; +} { + if ( + params.gatewayMode !== "local" || + !params.gatewayProbe || + params.gatewayProbe.ok || + !isLoopbackGatewayUrl(params.gatewayUrl) + ) { + return false; + } + const error = params.gatewayProbe.error?.toLowerCase() ?? ""; + return error.includes("timeout") || params.gatewayProbe.auth?.capability === "unknown"; +} + +async function applyLocalStatusRpcFallback(params: { + cfg: OpenClawConfig; + gatewayMode: "local" | "remote"; + gatewayUrl: string; + gatewayProbe: GatewayProbeResult | null; + gatewayProbeAuth: { + token?: string; + password?: string; + }; + timeoutMs: number; +}): Promise { + if (!shouldTryLocalStatusRpcFallback(params)) { + return params.gatewayProbe; + } + const status = await loadGatewayCallModule() + .then(({ callGateway }) => + callGateway({ + config: params.cfg, + method: "status", + token: params.gatewayProbeAuth.token, + password: params.gatewayProbeAuth.password, + timeoutMs: Math.min(2000, Math.max(1000, params.timeoutMs)), + mode: GATEWAY_CLIENT_MODES.BACKEND, + clientName: GATEWAY_CLIENT_NAMES.GATEWAY_CLIENT, + }), + ) + .catch(() => null); + if (!status) { + return params.gatewayProbe; + } + const auth = params.gatewayProbe.auth; + return { + ...params.gatewayProbe, + ok: true, + status, + auth: + auth.capability === "unknown" + ? { + ...auth, + capability: "read_only", + } + : auth, + }; +} + export function hasExplicitMemorySearchConfig(cfg: OpenClawConfig, agentId: string): boolean { if ( cfg.agents?.defaults && @@ -121,18 +206,27 @@ export async function resolveGatewayProbeSnapshot(params: { ) : { auth: {}, warning: undefined }; let gatewayProbeAuthWarning = gatewayProbeAuthResolution.warning; - const gatewayProbe = shouldProbe + const probeTimeoutMs = Math.min(params.opts.all ? 5000 : 2500, params.opts.timeoutMs ?? 10_000); + const initialGatewayProbe = shouldProbe ? await loadProbeGatewayModule() .then(({ probeGateway }) => probeGateway({ url: gatewayConnection.url, auth: gatewayProbeAuthResolution.auth, - timeoutMs: Math.min(params.opts.all ? 5000 : 2500, params.opts.timeoutMs ?? 10_000), + timeoutMs: probeTimeoutMs, detailLevel: params.opts.detailLevel ?? "presence", }), ) .catch(() => null) : null; + const gatewayProbe = await applyLocalStatusRpcFallback({ + cfg: params.cfg, + gatewayMode, + gatewayUrl: gatewayConnection.url, + gatewayProbe: initialGatewayProbe, + gatewayProbeAuth: gatewayProbeAuthResolution.auth, + timeoutMs: probeTimeoutMs, + }); if ( (params.opts.mergeAuthWarningIntoProbeError ?? true) && gatewayProbeAuthWarning && diff --git a/src/gateway/handshake-timeouts.test.ts b/src/gateway/handshake-timeouts.test.ts index 9feccdab8ef..01530bb22a0 100644 --- a/src/gateway/handshake-timeouts.test.ts +++ b/src/gateway/handshake-timeouts.test.ts @@ -36,6 +36,31 @@ describe("gateway handshake timeouts", () => { ).toBe(20); }); + test("ignores invalid handshake timeout overrides and falls back safely", () => { + expect( + getPreauthHandshakeTimeoutMsFromEnv({ + OPENCLAW_HANDSHAKE_TIMEOUT_MS: "abc", + }), + ).toBe(DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS); + expect( + getPreauthHandshakeTimeoutMsFromEnv({ + OPENCLAW_HANDSHAKE_TIMEOUT_MS: "-1", + }), + ).toBe(DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS); + expect( + getPreauthHandshakeTimeoutMsFromEnv({ + OPENCLAW_HANDSHAKE_TIMEOUT_MS: "0", + }), + ).toBe(DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS); + expect( + getPreauthHandshakeTimeoutMsFromEnv({ + OPENCLAW_HANDSHAKE_TIMEOUT_MS: " ", + OPENCLAW_TEST_HANDSHAKE_TIMEOUT_MS: "20", + VITEST: "1", + }), + ).toBe(DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS); + }); + test("getConnectChallengeTimeoutMsFromEnv reads OPENCLAW_CONNECT_CHALLENGE_TIMEOUT_MS", () => { expect(getConnectChallengeTimeoutMsFromEnv({})).toBeUndefined(); expect( diff --git a/src/gateway/handshake-timeouts.ts b/src/gateway/handshake-timeouts.ts index 545d3e72d65..73e8ad5714a 100644 --- a/src/gateway/handshake-timeouts.ts +++ b/src/gateway/handshake-timeouts.ts @@ -1,4 +1,4 @@ -export const DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS = 10_000; +export const DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS = 15_000; export const MIN_CONNECT_CHALLENGE_TIMEOUT_MS = 250; export const MAX_CONNECT_CHALLENGE_TIMEOUT_MS = DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS;