diff --git a/src/cli/daemon-cli/lifecycle.test.ts b/src/cli/daemon-cli/lifecycle.test.ts index 20ee6a69a76..266d85e0775 100644 --- a/src/cli/daemon-cli/lifecycle.test.ts +++ b/src/cli/daemon-cli/lifecycle.test.ts @@ -39,16 +39,17 @@ const resolveGatewayPort = vi.hoisted(() => vi.fn((_cfg?: unknown, _env?: unknow const findVerifiedGatewayListenerPidsOnPortSync = vi.fn<(port: number) => number[]>(() => []); const signalVerifiedGatewayPidSync = vi.fn<(pid: number, signal: "SIGTERM" | "SIGUSR1") => void>(); const formatGatewayPidList = vi.fn<(pids: number[]) => string>((pids) => pids.join(", ")); -const probeGateway = vi.fn< - (opts: { - url: string; - auth?: { token?: string; password?: string }; - timeoutMs: number; - }) => Promise<{ - ok: boolean; - configSnapshot: unknown; - }> ->(); +const probeGateway = + vi.fn< + (opts: { + url: string; + auth?: { token?: string; password?: string }; + timeoutMs: number; + }) => Promise<{ + ok: boolean; + configSnapshot: unknown; + }> + >(); const isRestartEnabled = vi.fn<(config?: { commands?: unknown }) => boolean>(() => true); const loadConfig = vi.hoisted(() => vi.fn(() => ({}))); const recoverInstalledLaunchAgent = vi.hoisted(() => vi.fn()); diff --git a/src/gateway/call.ts b/src/gateway/call.ts index 51a5d6cc4fc..ce2612b4eb8 100644 --- a/src/gateway/call.ts +++ b/src/gateway/call.ts @@ -406,7 +406,17 @@ function formatGatewayCloseError( const hint = code === 1006 ? "abnormal closure (no close frame)" : code === 1000 ? "normal closure" : ""; const suffix = hint ? ` ${hint}` : ""; - return `gateway closed (${code}${suffix}): ${reasonText}\n${connectionDetails.message}`; + let message = `gateway closed (${code}${suffix}): ${reasonText}\n${connectionDetails.message}`; + // Add troubleshooting hints for common issues + if (code === 1006) { + message += + "\n\nPossible causes:" + + "\n- Gateway not yet ready to accept connections (retry after a moment)" + + "\n- TLS mismatch (connecting with ws:// to a wss:// gateway, or vice versa)" + + "\n- Gateway crashed or was terminated unexpectedly" + + "\nRun `openclaw doctor` for diagnostics."; + } + return message; } function formatGatewayTimeoutError( diff --git a/src/gateway/server-http.ts b/src/gateway/server-http.ts index 396e8b510eb..439fe864aad 100644 --- a/src/gateway/server-http.ts +++ b/src/gateway/server-http.ts @@ -1139,6 +1139,8 @@ export function attachGatewayUpgradeHandler(opts: { getResolvedAuth?: () => ResolvedGatewayAuth; /** Optional rate limiter for auth brute-force protection. */ rateLimiter?: AuthRateLimiter; + /** Optional logger for error diagnostics. */ + log?: { warn: (msg: string) => void }; }) { const { httpServer, @@ -1148,6 +1150,7 @@ export function attachGatewayUpgradeHandler(opts: { preauthConnectionBudget, resolvedAuth, rateLimiter, + log, } = opts; const getResolvedAuth = opts.getResolvedAuth ?? (() => resolvedAuth); httpServer.on("upgrade", (req, socket, head) => { @@ -1250,7 +1253,10 @@ export function attachGatewayUpgradeHandler(opts: { releaseUpgradeBudget(); throw new Error("gateway websocket upgrade failed"); } - })().catch(() => { + })().catch((err) => { + const remoteAddress = (socket as { remoteAddress?: string }).remoteAddress ?? "unknown"; + const errorMessage = err instanceof Error ? err.message : String(err); + log?.warn(`ws upgrade error from ${remoteAddress}: ${errorMessage}`); socket.destroy(); }); }); diff --git a/src/gateway/server-runtime-state.ts b/src/gateway/server-runtime-state.ts index 2058636c526..b77548ed615 100644 --- a/src/gateway/server-runtime-state.ts +++ b/src/gateway/server-runtime-state.ts @@ -84,6 +84,7 @@ export async function createGatewayRuntimeState(params: { httpServer: HttpServer; httpServers: HttpServer[]; httpBindHosts: string[]; + startListening: () => Promise; wss: WebSocketServer; preauthConnectionBudget: PreauthConnectionBudget; clients: Set; @@ -168,9 +169,18 @@ export async function createGatewayRuntimeState(params: { "Host-header origin fallback weakens origin checks and should only be used as break-glass.", ); } + // Create WebSocketServer first (with noServer: true) so we can attach upgrade handlers + // before HTTP servers start listening. This prevents a race condition where connections + // arrive before the upgrade handler is attached, which causes silent 1006 errors. + const wss = new WebSocketServer({ + noServer: true, + maxPayload: MAX_PREAUTH_PAYLOAD_BYTES, + }); + const preauthConnectionBudget = createPreauthConnectionBudget(); + const httpServers: HttpServer[] = []; const httpBindHosts: string[] = []; - for (const host of bindHosts) { + for (const _host of bindHosts) { const httpServer = createGatewayHttpServer({ canvasHost, clients, @@ -191,36 +201,9 @@ export async function createGatewayRuntimeState(params: { getReadiness: params.getReadiness, tlsOptions: params.gatewayTls?.enabled ? params.gatewayTls.tlsOptions : undefined, }); - try { - await listenGatewayHttpServer({ - httpServer, - bindHost: host, - port: params.port, - }); - httpServers.push(httpServer); - httpBindHosts.push(host); - } catch (err) { - if (host === bindHosts[0]) { - throw err; - } - params.log.warn( - `gateway: failed to bind loopback alias ${host}:${params.port} (${String(err)})`, - ); - } - } - const httpServer = httpServers[0]; - if (!httpServer) { - throw new Error("Gateway HTTP server failed to start"); - } - - const wss = new WebSocketServer({ - noServer: true, - maxPayload: MAX_PREAUTH_PAYLOAD_BYTES, - }); - const preauthConnectionBudget = createPreauthConnectionBudget(); - for (const server of httpServers) { + // Attach upgrade handler BEFORE listening to prevent race condition attachGatewayUpgradeHandler({ - httpServer: server, + httpServer, wss, canvasHost, clients, @@ -228,9 +211,53 @@ export async function createGatewayRuntimeState(params: { resolvedAuth: params.resolvedAuth, getResolvedAuth: params.getResolvedAuth, rateLimiter: params.rateLimiter, + log: params.log, }); + httpServers.push(httpServer); } - + const httpServer = httpServers[0]; + if (!httpServer) { + throw new Error("Gateway HTTP server failed to start"); + } + let startListeningPromise: Promise | null = null; + const startListening = async (): Promise => { + if (startListeningPromise) { + await startListeningPromise; + return; + } + startListeningPromise = (async () => { + for (const [index, host] of bindHosts.entries()) { + const server = httpServers[index]; + if (!server) { + throw new Error(`Missing gateway HTTP server for bind host ${host}`); + } + try { + await listenGatewayHttpServer({ + httpServer: server, + bindHost: host, + port: params.port, + }); + httpBindHosts.push(host); + } catch (err) { + if (host === bindHosts[0]) { + throw err; + } + params.log.warn( + `gateway: failed to bind loopback alias ${host}:${params.port} (${String(err)})`, + ); + } + } + if (httpBindHosts.length === 0) { + throw new Error("Gateway HTTP server failed to start"); + } + })(); + try { + await startListeningPromise; + } catch (err) { + startListeningPromise = null; + throw err; + } + }; const agentRunSeq = new Map(); const dedupe = new Map(); const chatRunState = createChatRunState(); @@ -257,6 +284,7 @@ export async function createGatewayRuntimeState(params: { httpServer, httpServers, httpBindHosts, + startListening, wss, preauthConnectionBudget, clients,