fix(gateway): harden local reachability checks

Co-authored-by: arthurianresolve <arthurianresolve@users.noreply.github.com>
Co-authored-by: codexGW <9350182+codexGW@users.noreply.github.com>
This commit is contained in:
Peter Steinberger
2026-04-28 20:56:46 +01:00
parent 3d736f67cf
commit 0a2d635e68
8 changed files with 290 additions and 12 deletions

View File

@@ -22,6 +22,9 @@ Docs: https://docs.openclaw.ai
- Agents/Claude CLI: reuse already-cached macOS Keychain credentials for no-prompt Claude credential reads, so doctor/runtime checks do not miss fresh interactive Claude auth. Fixes #73682. Thanks @RyanSandoval.
- Agents/transcripts: strip empty assistant text blocks while preserving valid text, images, and signatures, so Anthropic-style providers no longer reject sanitized transcript turns. Fixes #73640. Thanks @jowhee327.
- Providers/Bedrock: omit deprecated `temperature` for Claude Opus 4.7 Bedrock model ids, named and application inference profiles, including dotted `opus-4.7` refs, and classify the nested validation response for failover. Fixes #73663. Thanks @bstanbury.
- Gateway: raise the preauth/connect-challenge timeout to 15s so cold CLI starts on slower hosts have more time to process the WebSocket challenge before the Gateway closes the connection. Fixes #51469; refs #73592 and #62060. Thanks @GothicFox and @jackychen-png.
- CLI/status: fall back to a bounded local `status` RPC when loopback detail probes time out or report unknown capability, so reachable local gateways are no longer marked unreachable by slow read diagnostics. Fixes #48360; refs #62762, #73535, #51357, and #42019. Thanks @RacecarGuy, @justinschille, @DJBlackhawk, @tianyaqpzm, and @0xrsydn.
- Channels/Discord: give Discord Gateway WebSocket handshakes a 30s timeout so stalled TLS/network transitions emit an error and Carbon can continue its reconnect loop instead of leaving the bot silent until restart. Refs #50046. Thanks @codexGW.
- NVIDIA/NIM: persist the `NVIDIA_API_KEY` provider marker and mark bundled NVIDIA Chat Completions models as string-content compatible, so NIM models load from `models.json` and OpenAI-compatible subagent calls send plain text content. Fixes #73013 and #50107; refs #73014. Thanks @bautrey, @iot2edge, @ifearghal, and @futhgar.
- Channels/Discord: let text-only configs drop the `GuildVoiceStates` gateway intent and expose a bounded `/gateway/bot` metadata timeout with rate-limited fallback logs, reducing idle CPU and warning floods. Fixes #73709 and #73585. Thanks @sanchezm86 and @trac3r00.
- Agents/sessions: mark same-turn `sessions_send` and A2A reply prompts with an inter-session `isUser=false` envelope before they reach the model, so foreign session output no longer lands as bare active user text. Fixes #73702; refs #73698, #73609, #73595, and #73622. Thanks @alvelda.

View File

@@ -554,7 +554,7 @@ stable across protocol v3 and are the expected baseline for third-party clients.
| ----------------------------------------- | ----------------------------------------------------- | ---------------------------------------------------------- |
| `PROTOCOL_VERSION` | `3` | `src/gateway/protocol/schema/protocol-schemas.ts` |
| Request timeout (per RPC) | `30_000` ms | `src/gateway/client.ts` (`requestTimeoutMs`) |
| Preauth / connect-challenge timeout | `10_000` ms | `src/gateway/handshake-timeouts.ts` (clamp `250`–`10_000`) |
| Preauth / connect-challenge timeout | `15_000` ms | `src/gateway/handshake-timeouts.ts` (clamp `250`–`15_000`) |
| Initial reconnect backoff | `1_000` ms | `src/gateway/client.ts` (`backoffMs`) |
| Max reconnect backoff | `30_000` ms | `src/gateway/client.ts` (`scheduleReconnect`) |
| Fast-retry clamp after device-token close | `250` ms | `src/gateway/client.ts` |

View File

@@ -25,6 +25,7 @@ const DEFAULT_DISCORD_GATEWAY_INFO_TIMEOUT_MS = 30_000;
const MAX_DISCORD_GATEWAY_INFO_TIMEOUT_MS = 120_000;
const DISCORD_GATEWAY_INFO_TIMEOUT_ENV = "OPENCLAW_DISCORD_GATEWAY_INFO_TIMEOUT_MS";
const DISCORD_GATEWAY_METADATA_FALLBACK_LOG_INTERVAL_MS = 60_000;
const DISCORD_GATEWAY_HANDSHAKE_TIMEOUT_MS = 30_000;
type DiscordGatewayMetadataResponse = Pick<Response, "ok" | "status" | "text">;
type DiscordGatewayFetchInit = Record<string, unknown> & {
@@ -36,7 +37,10 @@ type DiscordGatewayFetch = (
) => Promise<DiscordGatewayMetadataResponse>;
type DiscordGatewayMetadataError = Error & { transient?: boolean };
type DiscordGatewayWebSocketCtor = new (url: string, options?: { agent?: unknown }) => ws.WebSocket;
type DiscordGatewayWebSocketCtor = new (
url: string,
options?: { agent?: unknown; handshakeTimeout?: number },
) => ws.WebSocket;
const registrationPromises = new WeakMap<carbonGateway.GatewayPlugin, Promise<void>>();
const gatewayMetadataFallbackLogLastAt = new WeakMap<RuntimeEnv, number>();
type CarbonGatewayRegistrationState = {
@@ -421,7 +425,10 @@ function createGatewayPlugin(params: {
// close-path crashes during Discord gateway teardown; the ws transport is
// already our proxy path and behaves predictably for lifecycle cleanup.
const WebSocketCtor = params.testing?.webSocketCtor ?? ws.default;
const socket = new WebSocketCtor(url, params.wsAgent ? { agent: params.wsAgent } : undefined);
const socket = new WebSocketCtor(url, {
handshakeTimeout: DISCORD_GATEWAY_HANDSHAKE_TIMEOUT_MS,
...(params.wsAgent ? { agent: params.wsAgent } : {}),
});
const emitTransportActivity = () => {
if ((this as unknown as { ws?: unknown }).ws !== socket) {
return;

View File

@@ -108,7 +108,10 @@ vi.mock("https-proxy-agent", () => ({
}));
vi.mock("ws", () => ({
default: function MockWebSocket(url: string, options?: { agent?: unknown }) {
default: function MockWebSocket(
url: string,
options?: { agent?: unknown; handshakeTimeout?: number },
) {
webSocketSpy(url, options);
},
}));
@@ -159,9 +162,15 @@ describe("createDiscordGatewayPlugin", () => {
return {
HttpsProxyAgentCtor:
HttpsProxyAgent as unknown as typeof import("https-proxy-agent").HttpsProxyAgent,
webSocketCtor: function WebSocketCtor(url: string, options?: { agent?: unknown }) {
webSocketCtor: function WebSocketCtor(
url: string,
options?: { agent?: unknown; handshakeTimeout?: number },
) {
webSocketSpy(url, options);
} as unknown as new (url: string, options?: { agent?: unknown }) => import("ws").WebSocket,
} as unknown as new (
url: string,
options?: { agent?: unknown; handshakeTimeout?: number },
) => import("ws").WebSocket,
registerClient: async (_plugin: unknown, client: unknown) => {
baseRegisterClientSpy(client);
},
@@ -295,7 +304,9 @@ describe("createDiscordGatewayPlugin", () => {
.createWebSocket;
createWebSocket("wss://gateway.discord.gg");
expect(webSocketSpy).toHaveBeenCalledWith("wss://gateway.discord.gg", undefined);
expect(webSocketSpy).toHaveBeenCalledWith("wss://gateway.discord.gg", {
handshakeTimeout: 30_000,
});
expect(wsProxyAgentSpy).not.toHaveBeenCalled();
});
@@ -409,7 +420,7 @@ describe("createDiscordGatewayPlugin", () => {
expect(wsProxyAgentSpy).toHaveBeenCalledWith("http://127.0.0.1:8080");
expect(webSocketSpy).toHaveBeenCalledWith(
"wss://gateway.discord.gg",
expect.objectContaining({ agent: getLastAgent() }),
expect.objectContaining({ agent: getLastAgent(), handshakeTimeout: 30_000 }),
);
expect(runtime.log).toHaveBeenCalledWith("discord: gateway proxy enabled");
expect(runtime.error).not.toHaveBeenCalled();

View File

@@ -8,6 +8,7 @@ const mocks = vi.hoisted(() => ({
buildGatewayConnectionDetailsWithResolvers: vi.fn(),
resolveGatewayProbeTarget: vi.fn(),
probeGateway: vi.fn(),
callGateway: vi.fn(),
resolveGatewayProbeAuthResolution: vi.fn(),
pickGatewaySelfPresence: vi.fn(),
}));
@@ -24,6 +25,10 @@ vi.mock("../gateway/probe.js", () => ({
probeGateway: mocks.probeGateway,
}));
vi.mock("../gateway/call.js", () => ({
callGateway: mocks.callGateway,
}));
vi.mock("./status.gateway-probe.js", () => ({
resolveGatewayProbeAuthResolution: mocks.resolveGatewayProbeAuthResolution,
}));
@@ -50,6 +55,7 @@ describe("resolveGatewayProbeSnapshot", () => {
warning: "warn",
});
mocks.pickGatewaySelfPresence.mockReturnValue({ host: "box" });
mocks.callGateway.mockRejectedValue(new Error("status rpc unavailable"));
});
it("skips auth resolution and probe for missing remote urls by default", async () => {
@@ -178,6 +184,138 @@ describe("resolveGatewayProbeSnapshot", () => {
expect(result.gatewayReachable).toBe(true);
expect(result.gatewayProbe?.error).toBe("missing scope: operator.read; warn");
});
// Scenario: a local-mode probe fails with a "timeout" error and an "unknown"
// auth capability, while the `status` RPC (callGateway) succeeds. The snapshot
// is then expected to report the gateway as reachable.
it("uses a bounded local status RPC fallback when the detail probe times out", async () => {
mocks.resolveGatewayProbeTarget.mockReturnValue({
mode: "local",
gatewayMode: "local",
remoteUrlMissing: false,
});
// Failed detail probe against a loopback URL.
mocks.probeGateway.mockResolvedValue({
ok: false,
url: "ws://127.0.0.1:18789",
connectLatencyMs: null,
error: "timeout",
close: null,
auth: {
role: null,
scopes: [],
capability: "unknown",
},
health: null,
status: null,
presence: null,
configSnapshot: null,
});
// Overrides the rejecting default installed in the shared setup.
mocks.callGateway.mockResolvedValue({ sessions: 1 });
const result = await resolveGatewayProbeSnapshot({
cfg: {},
opts: {
timeoutMs: 8000,
},
});
// The caller asked for 8000ms, but the fallback RPC is expected to be issued
// with timeoutMs: 2000.
// NOTE(review): "tok"/"pw" presumably come from the shared auth-resolution
// mock in the suite setup, which is not fully visible here — confirm.
expect(mocks.callGateway).toHaveBeenCalledWith(
expect.objectContaining({
config: {},
method: "status",
token: "tok",
password: "pw",
timeoutMs: 2000,
mode: "backend",
clientName: "gateway-client",
}),
);
// The fallback call options must not carry a `deviceIdentity` key at all.
expect(mocks.callGateway.mock.calls[0]?.[0]).not.toHaveProperty("deviceIdentity");
expect(result.gatewayReachable).toBe(true);
// `ok` flips to true and the RPC payload becomes `status`, while the original
// probe `error` text is retained and the capability moves to "read_only".
expect(result.gatewayProbe).toMatchObject({
ok: true,
error: "timeout",
status: { sessions: 1 },
auth: { capability: "read_only" },
});
expect(result.gatewayProbeAuthWarning).toBe("warn");
});
// Scenario: auth resolution yields no token or password. The fallback RPC is
// expected to be called with both left undefined and without any
// `deviceIdentity` key in its options.
it("lets callGateway reuse paired-device auth for local status RPC fallback", async () => {
mocks.resolveGatewayProbeTarget.mockReturnValue({
mode: "local",
gatewayMode: "local",
remoteUrlMissing: false,
});
// Empty auth and no warning for this case.
mocks.resolveGatewayProbeAuthResolution.mockResolvedValue({
auth: {},
warning: undefined,
});
// The probe times out even though it already reports a "read_only" capability;
// the timeout error alone is expected to trigger the fallback.
mocks.probeGateway.mockResolvedValue({
ok: false,
url: "ws://127.0.0.1:18789",
connectLatencyMs: null,
error: "timeout",
close: null,
auth: {
role: "operator",
scopes: ["operator.read"],
capability: "read_only",
},
health: null,
status: null,
presence: null,
configSnapshot: null,
});
mocks.callGateway.mockResolvedValue({ sessions: 1 });
const result = await resolveGatewayProbeSnapshot({
cfg: {},
opts: {},
});
expect(mocks.callGateway).toHaveBeenCalledWith(
expect.objectContaining({
config: {},
method: "status",
token: undefined,
password: undefined,
mode: "backend",
clientName: "gateway-client",
}),
);
// No explicit device identity is passed in the call options.
expect(mocks.callGateway.mock.calls[0]?.[0]).not.toHaveProperty("deviceIdentity");
expect(result.gatewayReachable).toBe(true);
});
// Scenario: the same kind of probe failure (timeout, unknown capability) but
// in remote mode. The status RPC must not be attempted and the gateway stays
// unreachable.
it("does not use the status RPC fallback for remote probe failures", async () => {
mocks.resolveGatewayProbeTarget.mockReturnValue({
mode: "remote",
gatewayMode: "remote",
remoteUrlMissing: false,
});
mocks.probeGateway.mockResolvedValue({
ok: false,
url: "wss://gateway.example/ws",
connectLatencyMs: null,
error: "timeout",
close: null,
auth: {
role: null,
scopes: [],
capability: "unknown",
},
health: null,
status: null,
presence: null,
configSnapshot: null,
});
const result = await resolveGatewayProbeSnapshot({
cfg: { gateway: { mode: "remote", remote: { url: "wss://gateway.example/ws" } } },
opts: {},
});
// The fallback is skipped entirely for remote targets.
expect(mocks.callGateway).not.toHaveBeenCalled();
expect(result.gatewayReachable).toBe(false);
});
});
describe("resolveSharedMemoryStatusSnapshot", () => {

View File

@@ -3,9 +3,11 @@ import type { OpenClawConfig } from "../config/types.js";
import { buildGatewayConnectionDetailsWithResolvers } from "../gateway/connection-details.js";
import { normalizeControlUiBasePath } from "../gateway/control-ui-shared.js";
import { resolveGatewayProbeTarget } from "../gateway/probe-target.js";
import type { probeGateway as probeGatewayFn } from "../gateway/probe.js";
import type { GatewayProbeResult, probeGateway as probeGatewayFn } from "../gateway/probe.js";
import { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES } from "../gateway/protocol/client-info.js";
import type { MemoryProviderStatus } from "../memory-host-sdk/engine-storage.js";
import { defaultSlotIdForKey } from "../plugins/slots.js";
import { isLoopbackIpAddress } from "../shared/net/ip.js";
import {
normalizeOptionalLowercaseString,
normalizeOptionalString,
@@ -16,6 +18,7 @@ export { pickGatewaySelfPresence } from "./gateway-presence.js";
let gatewayProbeModulePromise: Promise<typeof import("./status.gateway-probe.js")> | undefined;
let probeGatewayModulePromise: Promise<typeof import("../gateway/probe.js")> | undefined;
let gatewayCallModulePromise: Promise<typeof import("../gateway/call.js")> | undefined;
function loadGatewayProbeModule() {
gatewayProbeModulePromise ??= import("./status.gateway-probe.js");
@@ -27,6 +30,11 @@ function loadProbeGatewayModule() {
return probeGatewayModulePromise;
}
/**
 * Lazily imports `../gateway/call.js`.
 *
 * The import promise is stored in `gatewayCallModulePromise` on first use and
 * the same promise is handed back on every later call.
 */
function loadGatewayCallModule() {
  if (gatewayCallModulePromise == null) {
    gatewayCallModulePromise = import("../gateway/call.js");
  }
  return gatewayCallModulePromise;
}
export type MemoryStatusSnapshot = MemoryProviderStatus & {
agentId: string;
};
@@ -70,6 +78,83 @@ type StatusMemorySearchManagerResolver = (params: {
manager: StatusMemorySearchManager | null;
}>;
/**
 * Reports whether a gateway URL targets the local machine.
 *
 * The URL's hostname is lower-cased and, when wrapped in square brackets (the
 * form `URL` uses for IPv6 literals), unwrapped. It counts as loopback when it
 * is the literal name `localhost` or when `isLoopbackIpAddress` accepts it.
 *
 * @param rawUrl - Gateway URL to inspect.
 * @returns `true` for a loopback host; `false` otherwise, including when the
 *   URL cannot be parsed or the check throws.
 */
function isLoopbackGatewayUrl(rawUrl: string): boolean {
  try {
    let host = new URL(rawUrl).hostname.toLowerCase();
    if (host.startsWith("[") && host.endsWith("]")) {
      host = host.slice(1, -1);
    }
    if (host === "localhost") {
      return true;
    }
    return isLoopbackIpAddress(host);
  } catch {
    // Unparseable input (or a failing address check) is treated as non-loopback.
    return false;
  }
}
/**
 * Decides whether a failed gateway probe qualifies for the local `status` RPC
 * fallback.
 *
 * All of the following must hold:
 * - the gateway mode is `"local"`;
 * - a probe result exists and its `ok` flag is falsy;
 * - the gateway URL is a loopback URL per `isLoopbackGatewayUrl`;
 * - the probe error mentions "timeout" (case-insensitive) or the probe's auth
 *   capability is `"unknown"`.
 *
 * Acts as a type guard: on `true`, `gatewayMode` is narrowed to `"local"` and
 * `gatewayProbe` to a non-null result.
 */
function shouldTryLocalStatusRpcFallback(params: {
  gatewayMode: "local" | "remote";
  gatewayUrl: string;
  gatewayProbe: GatewayProbeResult | null;
}): params is {
  gatewayMode: "local";
  gatewayUrl: string;
  gatewayProbe: GatewayProbeResult;
} {
  const { gatewayMode, gatewayUrl, gatewayProbe: probe } = params;
  if (gatewayMode !== "local") {
    return false;
  }
  if (!probe || probe.ok) {
    return false;
  }
  if (!isLoopbackGatewayUrl(gatewayUrl)) {
    return false;
  }
  const message = probe.error?.toLowerCase() ?? "";
  if (message.includes("timeout")) {
    return true;
  }
  return probe.auth?.capability === "unknown";
}
/**
 * Gives a failed local probe a second chance through a short `status` RPC.
 *
 * When `shouldTryLocalStatusRpcFallback` rejects the input, or when the RPC
 * fails or yields a falsy result, the incoming probe is returned unchanged.
 * Otherwise a copy of the probe is returned with `ok` set to `true`, `status`
 * set to the RPC result, and an `"unknown"` auth capability replaced by
 * `"read_only"`. Other probe fields, including `error`, are carried over.
 *
 * @param params.cfg - Config forwarded to `callGateway` as `config`.
 * @param params.gatewayProbeAuth - Token/password forwarded to `callGateway`.
 * @param params.timeoutMs - Probe timeout; the RPC timeout is this value
 *   clamped to the 1000–2000 ms range.
 * @returns The original probe, or the upgraded copy described above.
 */
async function applyLocalStatusRpcFallback(params: {
  cfg: OpenClawConfig;
  gatewayMode: "local" | "remote";
  gatewayUrl: string;
  gatewayProbe: GatewayProbeResult | null;
  gatewayProbeAuth: {
    token?: string;
    password?: string;
  };
  timeoutMs: number;
}): Promise<GatewayProbeResult | null> {
  if (!shouldTryLocalStatusRpcFallback(params)) {
    return params.gatewayProbe;
  }
  const failedProbe = params.gatewayProbe;
  const rpcTimeoutMs = Math.min(2000, Math.max(1000, params.timeoutMs));

  // Best-effort: any failure loading the module or calling the RPC maps to null.
  const status = await (async () => {
    try {
      const { callGateway } = await loadGatewayCallModule();
      return await callGateway({
        config: params.cfg,
        method: "status",
        token: params.gatewayProbeAuth.token,
        password: params.gatewayProbeAuth.password,
        timeoutMs: rpcTimeoutMs,
        mode: GATEWAY_CLIENT_MODES.BACKEND,
        clientName: GATEWAY_CLIENT_NAMES.GATEWAY_CLIENT,
      });
    } catch {
      return null;
    }
  })();

  if (!status) {
    return failedProbe;
  }

  const { auth } = failedProbe;
  return {
    ...failedProbe,
    ok: true,
    status,
    // A successful status call implies at least read access.
    auth: auth.capability !== "unknown" ? auth : { ...auth, capability: "read_only" },
  };
}
export function hasExplicitMemorySearchConfig(cfg: OpenClawConfig, agentId: string): boolean {
if (
cfg.agents?.defaults &&
@@ -121,18 +206,27 @@ export async function resolveGatewayProbeSnapshot(params: {
)
: { auth: {}, warning: undefined };
let gatewayProbeAuthWarning = gatewayProbeAuthResolution.warning;
const gatewayProbe = shouldProbe
const probeTimeoutMs = Math.min(params.opts.all ? 5000 : 2500, params.opts.timeoutMs ?? 10_000);
const initialGatewayProbe = shouldProbe
? await loadProbeGatewayModule()
.then(({ probeGateway }) =>
probeGateway({
url: gatewayConnection.url,
auth: gatewayProbeAuthResolution.auth,
timeoutMs: Math.min(params.opts.all ? 5000 : 2500, params.opts.timeoutMs ?? 10_000),
timeoutMs: probeTimeoutMs,
detailLevel: params.opts.detailLevel ?? "presence",
}),
)
.catch(() => null)
: null;
const gatewayProbe = await applyLocalStatusRpcFallback({
cfg: params.cfg,
gatewayMode,
gatewayUrl: gatewayConnection.url,
gatewayProbe: initialGatewayProbe,
gatewayProbeAuth: gatewayProbeAuthResolution.auth,
timeoutMs: probeTimeoutMs,
});
if (
(params.opts.mergeAuthWarningIntoProbeError ?? true) &&
gatewayProbeAuthWarning &&

View File

@@ -36,6 +36,31 @@ describe("gateway handshake timeouts", () => {
).toBe(20);
});
// Each OPENCLAW_HANDSHAKE_TIMEOUT_MS value below is expected to be rejected,
// so the result is DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS.
test("ignores invalid handshake timeout overrides and falls back safely", () => {
// Non-numeric text.
expect(
getPreauthHandshakeTimeoutMsFromEnv({
OPENCLAW_HANDSHAKE_TIMEOUT_MS: "abc",
}),
).toBe(DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS);
// Negative value.
expect(
getPreauthHandshakeTimeoutMsFromEnv({
OPENCLAW_HANDSHAKE_TIMEOUT_MS: "-1",
}),
).toBe(DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS);
// Zero.
expect(
getPreauthHandshakeTimeoutMsFromEnv({
OPENCLAW_HANDSHAKE_TIMEOUT_MS: "0",
}),
).toBe(DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS);
// Whitespace-only value. The default is still expected even though a test
// override of "20" and VITEST are present in the same environment.
expect(
getPreauthHandshakeTimeoutMsFromEnv({
OPENCLAW_HANDSHAKE_TIMEOUT_MS: " ",
OPENCLAW_TEST_HANDSHAKE_TIMEOUT_MS: "20",
VITEST: "1",
}),
).toBe(DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS);
});
test("getConnectChallengeTimeoutMsFromEnv reads OPENCLAW_CONNECT_CHALLENGE_TIMEOUT_MS", () => {
expect(getConnectChallengeTimeoutMsFromEnv({})).toBeUndefined();
expect(

View File

@@ -1,4 +1,4 @@
export const DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS = 10_000;
export const DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS = 15_000;
export const MIN_CONNECT_CHALLENGE_TIMEOUT_MS = 250;
export const MAX_CONNECT_CHALLENGE_TIMEOUT_MS = DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS;