fix(gateway): make handshake timeout configurable

This commit is contained in:
Peter Steinberger
2026-04-28 23:50:16 +01:00
parent 75df09b9ec
commit bcc6a2400d
18 changed files with 210 additions and 14 deletions

View File

@@ -24,6 +24,7 @@ Docs: https://docs.openclaw.ai
- Plugins/runtime-deps: cache unchanged bundled runtime mirror dist-file materialization decisions and close file-lock handles on owner-write failures, reducing repeated startup chunk scans and avoiding FileHandle-GC recovery stalls. Refs #73532. Thanks @oadiazp and @bstanbury.
- CLI/TUI: keep `chat.history` off model-catalog discovery so initial Gateway-backed TUI history loads cannot block behind slow provider/plugin model scans on low-core hosts. Refs #73524. Thanks @harshcatsystems-collab.
- Channels/WhatsApp: flag recently reconnected linked accounts in channel status even when the socket is currently healthy, so flapping WhatsApp Web sessions no longer look clean after a brief reconnect. Refs #73602. Thanks @Vksh07.
- Gateway: expose `gateway.handshakeTimeoutMs` in config, schema, and docs while preserving `OPENCLAW_HANDSHAKE_TIMEOUT_MS` precedence, so loaded or low-powered hosts can tune local WebSocket pre-auth handshakes without patching dist files. Supersedes #51282; refs #73592 and #73652. Thanks @henry-the-frog.
- Agents/model selection: resolve slash-form aliases before provider/model parsing and keep alias-resolved primary models subject to transient provider cooldowns, so cron and persisted sessions do not retry cooled-down raw aliases. Fixes #73573 and #73657. Thanks @akai-shuuichi and @hashslingers.
- Agents/Claude CLI: reuse already-cached macOS Keychain credentials for no-prompt Claude credential reads, so doctor/runtime checks do not miss fresh interactive Claude auth. Fixes #73682. Thanks @RyanSandoval.
- Agents/transcripts: strip empty assistant text blocks while preserving valid text, images, and signatures, so Anthropic-style providers no longer reject sanitized transcript turns. Fixes #73640. Thanks @jowhee327.

View File

@@ -1,4 +1,4 @@
1265c4249f2740b6786b295d5a88391ba7eb0c30bdf460c60dfb4dfcb4153685 config-baseline.json
805bd3f63ff7327da45c01b78dbc990ed53bd13b89e0cbf50f319aa99334ba92 config-baseline.core.json
d4c98bce7b547349b9cbbe08ec1018eafce9900502d7794df993d07fdec0e2e0 config-baseline.json
6ce74b2ab3544e5375009a435a2360a3095e6bd759bb7dd8114293fb8a0e2b25 config-baseline.core.json
0e38bad86bdc96c38573f6d51ac9e6fc5306cc20fb4a454399c57c105a61ba87 config-baseline.channel.json
0dd6583fafae6c9134e46c4cf9bddee9822d6436436dcb1a6dcba6d012962e51 config-baseline.plugin.json

View File

@@ -441,6 +441,7 @@ See [Plugins](/tools/plugin).
- Relay-backed registrations are delegated to a specific gateway identity. The paired iOS app fetches `gateway.identity.get`, includes that identity in the relay registration, and forwards a registration-scoped send grant to the gateway. Another gateway cannot reuse that stored registration.
- `OPENCLAW_APNS_RELAY_BASE_URL` / `OPENCLAW_APNS_RELAY_TIMEOUT_MS`: temporary env overrides for the relay config above.
- `OPENCLAW_APNS_RELAY_ALLOW_HTTP=true`: development-only escape hatch for loopback HTTP relay URLs. Production relay URLs should stay on HTTPS.
- `gateway.handshakeTimeoutMs`: pre-auth Gateway WebSocket handshake timeout in milliseconds. Default: `15000`. `OPENCLAW_HANDSHAKE_TIMEOUT_MS` takes precedence when set. Increase this on loaded or low-powered hosts where local clients can connect while startup warmup is still settling.
- `gateway.channelHealthCheckMinutes`: channel health-monitor interval in minutes. Set `0` to disable health-monitor restarts globally. Default: `5`.
- `gateway.channelStaleEventThresholdMinutes`: stale-socket threshold in minutes. Keep this greater than or equal to `gateway.channelHealthCheckMinutes`. Default: `30`.
- `gateway.channelMaxRestartsPerHour`: maximum health-monitor restarts per channel/account in a rolling hour. Default: `10`.

View File

@@ -270,6 +270,24 @@ cannot roll back unrelated user settings.
</Accordion>
<Accordion title="Tune gateway WebSocket handshake timeout">
Give local clients more time to complete the pre-auth WebSocket handshake on
loaded or low-powered hosts:
```json5
{
gateway: {
handshakeTimeoutMs: 30000,
},
}
```
- Default is `15000` milliseconds.
- `OPENCLAW_HANDSHAKE_TIMEOUT_MS` still takes precedence for one-off service or shell overrides.
- Prefer fixing startup/event-loop stalls first; this knob is for hosts that are healthy but slow during warmup.
</Accordion>
<Accordion title="Configure sessions and resets">
Sessions control conversation continuity and isolation:

View File

@@ -409,6 +409,27 @@ describe("gateway.tools config", () => {
});
describe("gateway.channelHealthCheckMinutes", () => {
it("accepts preauth handshake timeout tuning", () => {
  // A positive integer handshake timeout must pass config validation.
  const outcome = validateConfigObject({ gateway: { handshakeTimeoutMs: 30_000 } });
  expect(outcome.ok).toBe(true);
});
it("rejects non-positive preauth handshake timeouts", () => {
  // Zero is below the schema minimum of 1, so validation must fail.
  const outcome = validateConfigObject({ gateway: { handshakeTimeoutMs: 0 } });
  expect(outcome.ok).toBe(false);
  if (outcome.ok) {
    // Already reported by the assertion above; nothing further to inspect.
    return;
  }
  // The first reported issue must point at the offending key.
  expect(outcome.issues[0]?.path).toBe("gateway.handshakeTimeoutMs");
});
it("accepts zero to disable monitor", () => {
const res = validateConfigObject({
gateway: {

View File

@@ -22343,6 +22343,14 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
},
additionalProperties: false,
},
handshakeTimeoutMs: {
type: "integer",
minimum: 1,
maximum: 9007199254740991,
title: "Gateway Handshake Timeout",
description:
"Pre-auth Gateway WebSocket handshake timeout in milliseconds. Use higher values on loaded or low-powered hosts where local clients can connect during startup warmup. OPENCLAW_HANDSHAKE_TIMEOUT_MS still takes precedence.",
},
channelHealthCheckMinutes: {
type: "integer",
minimum: 0,
@@ -24645,6 +24653,11 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
help: "Explicit gateway-level tool denylist to block risky tools even if lower-level policies allow them. Use deny rules for emergency response and defense-in-depth hardening.",
tags: ["access", "network"],
},
"gateway.handshakeTimeoutMs": {
label: "Gateway Handshake Timeout",
help: "Pre-auth Gateway WebSocket handshake timeout in milliseconds. Use higher values on loaded or low-powered hosts where local clients can connect during startup warmup. OPENCLAW_HANDSHAKE_TIMEOUT_MS still takes precedence.",
tags: ["network", "performance"],
},
"gateway.channelHealthCheckMinutes": {
label: "Gateway Channel Health Check Interval (min)",
help: "Interval in minutes for automatic channel health probing and status updates. Use lower intervals for faster detection, or higher intervals to reduce periodic probe noise.",

View File

@@ -95,6 +95,8 @@ export const FIELD_HELP: Record<string, string> = {
"Explicit gateway-level tool allowlist when you want a narrow set of tools available at runtime. Use this for locked-down environments where tool scope must be tightly controlled.",
"gateway.tools.deny":
"Explicit gateway-level tool denylist to block risky tools even if lower-level policies allow them. Use deny rules for emergency response and defense-in-depth hardening.",
"gateway.handshakeTimeoutMs":
"Pre-auth Gateway WebSocket handshake timeout in milliseconds. Use higher values on loaded or low-powered hosts where local clients can connect during startup warmup. OPENCLAW_HANDSHAKE_TIMEOUT_MS still takes precedence.",
"gateway.channelHealthCheckMinutes":
"Interval in minutes for automatic channel health probing and status updates. Use lower intervals for faster detection, or higher intervals to reduce periodic probe noise.",
"gateway.channelStaleEventThresholdMinutes":

View File

@@ -119,6 +119,7 @@ export const FIELD_LABELS: Record<string, string> = {
"gateway.tools": "Gateway Tool Exposure Policy",
"gateway.tools.allow": "Gateway Tool Allowlist",
"gateway.tools.deny": "Gateway Tool Denylist",
"gateway.handshakeTimeoutMs": "Gateway Handshake Timeout",
"gateway.channelHealthCheckMinutes": "Gateway Channel Health Check Interval (min)",
"gateway.channelStaleEventThresholdMinutes": "Gateway Channel Stale Event Threshold (min)",
"gateway.channelMaxRestartsPerHour": "Gateway Channel Max Restarts Per Hour",

View File

@@ -453,6 +453,11 @@ export type GatewayConfig = {
tools?: GatewayToolsConfig;
/** WebChat display/history settings. */
webchat?: GatewayWebchatConfig;
/**
* Pre-auth Gateway WebSocket handshake timeout in milliseconds.
* Env var OPENCLAW_HANDSHAKE_TIMEOUT_MS takes precedence. Default: 15000.
*/
handshakeTimeoutMs?: number;
/**
* Channel health monitor interval in minutes.
* Periodically checks channel health and restarts unhealthy channels.

View File

@@ -797,6 +797,7 @@ export const OpenClawSchema = z
})
.strict()
.optional(),
handshakeTimeoutMs: z.number().int().min(1).optional(),
channelHealthCheckMinutes: z.number().int().min(0).optional(),
channelStaleEventThresholdMinutes: z.number().int().min(1).optional(),
channelMaxRestartsPerHour: z.number().int().min(1).optional(),

View File

@@ -7,6 +7,7 @@ import {
MAX_CONNECT_CHALLENGE_TIMEOUT_MS,
MIN_CONNECT_CHALLENGE_TIMEOUT_MS,
resolveConnectChallengeTimeoutMs,
resolvePreauthHandshakeTimeoutMs,
} from "./handshake-timeouts.js";
describe("gateway handshake timeouts", () => {
@@ -36,6 +37,39 @@ describe("gateway handshake timeouts", () => {
).toBe(20);
});
test("resolves preauth handshake timeout with env over config over default", () => {
  const configuredTimeoutMs = 30_000;

  // A valid env override beats the configured value.
  const withValidEnv = resolvePreauthHandshakeTimeoutMs({
    env: { OPENCLAW_HANDSHAKE_TIMEOUT_MS: "75000" },
    configuredTimeoutMs,
  });
  expect(withValidEnv).toBe(75_000);

  // With no env override, the configured value is used.
  const withEmptyEnv = resolvePreauthHandshakeTimeoutMs({ env: {}, configuredTimeoutMs });
  expect(withEmptyEnv).toBe(30_000);

  // An unparseable env override is ignored in favor of the configured value.
  const withGarbageEnv = resolvePreauthHandshakeTimeoutMs({
    env: { OPENCLAW_HANDSHAKE_TIMEOUT_MS: "garbage" },
    configuredTimeoutMs,
  });
  expect(withGarbageEnv).toBe(30_000);

  // Neither env nor config: the built-in default applies.
  const withNothing = resolvePreauthHandshakeTimeoutMs({ env: {} });
  expect(withNothing).toBe(DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS);
});
test("resolves preauth handshake timeout from the test-only env before config", () => {
  // The test-only override is honored because VITEST is set and the primary env var is absent.
  const env = { VITEST: "1", OPENCLAW_TEST_HANDSHAKE_TIMEOUT_MS: "50" };
  const resolved = resolvePreauthHandshakeTimeoutMs({ env, configuredTimeoutMs: 30_000 });
  expect(resolved).toBe(50);
});
test("ignores invalid handshake timeout overrides and falls back safely", () => {
expect(
getPreauthHandshakeTimeoutMsFromEnv({

View File

@@ -44,3 +44,23 @@ export function getPreauthHandshakeTimeoutMsFromEnv(env: NodeJS.ProcessEnv = pro
}
return DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS;
}
/**
 * Resolves the pre-auth Gateway WebSocket handshake timeout in milliseconds.
 *
 * Precedence (the first usable candidate wins):
 * 1. `OPENCLAW_HANDSHAKE_TIMEOUT_MS` from the environment. Only when that
 *    variable is unset or empty, and `VITEST` is set, the test-only
 *    `OPENCLAW_TEST_HANDSHAKE_TIMEOUT_MS` is consulted instead.
 * 2. `params.configuredTimeoutMs` (the `gateway.handshakeTimeoutMs` setting).
 * 3. `DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS`.
 *
 * A candidate is usable when it is a finite number greater than zero; an
 * unparseable or non-positive env value falls through to the configured value.
 * Usable values are capped at 2147483647 because callers pass the result to
 * `setTimeout`, and Node.js coerces any larger delay to 1 ms.
 *
 * @param params.env Environment to read overrides from. Defaults to `process.env`.
 * @param params.configuredTimeoutMs Timeout from config, if any.
 * @returns The timeout to use, in milliseconds.
 */
export function resolvePreauthHandshakeTimeoutMs(params?: {
  env?: NodeJS.ProcessEnv;
  configuredTimeoutMs?: number | null;
}): number {
  // Largest delay (2^31 - 1 ms) that Node.js timers honor without resetting it to 1 ms.
  const maxTimerDelayMs = 2147483647;
  const env = params?.env ?? process.env;
  // `||` (not `??`) is deliberate: an empty-string env var counts as "not set".
  const envOverride =
    env.OPENCLAW_HANDSHAKE_TIMEOUT_MS || (env.VITEST && env.OPENCLAW_TEST_HANDSHAKE_TIMEOUT_MS);
  if (envOverride) {
    const parsed = Number(envOverride);
    if (Number.isFinite(parsed) && parsed > 0) {
      return Math.min(parsed, maxTimerDelayMs);
    }
  }
  const configured = params?.configuredTimeoutMs;
  if (typeof configured === "number" && Number.isFinite(configured) && configured > 0) {
    return Math.min(configured, maxTimerDelayMs);
  }
  return DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS;
}

View File

@@ -35,6 +35,7 @@ export function attachGatewayWsHandlers(params: GatewayWsRuntimeParams) {
getRequiredSharedGatewaySessionGeneration: params.getRequiredSharedGatewaySessionGeneration,
rateLimiter: params.rateLimiter,
browserRateLimiter: params.browserRateLimiter,
preauthHandshakeTimeoutMs: params.preauthHandshakeTimeoutMs,
gatewayMethods: params.gatewayMethods,
events: params.events,
refreshHealthSnapshot: params.context.refreshHealthSnapshot,

View File

@@ -535,6 +535,8 @@ export async function startGatewayServer(
current: resolveCurrentSharedGatewaySessionGeneration(),
required: null,
};
const preauthHandshakeTimeoutMs =
cfgAtStart.gateway?.handshakeTimeoutMs ?? getRuntimeConfig().gateway?.handshakeTimeoutMs;
const initialHooksConfig = runtimeConfig.hooksConfig;
const initialHookClientIpConfig = resolveHookClientIpConfig(cfgAtStart);
const canvasHostEnabled = runtimeConfig.canvasHostEnabled;
@@ -939,6 +941,7 @@ export async function startGatewayServer(
getRequiredSharedGatewaySessionGeneration(sharedGatewaySessionGenerationState),
rateLimiter: authRateLimiter,
browserRateLimiter: browserAuthRateLimiter,
preauthHandshakeTimeoutMs,
gatewayMethods: runtimeState.gatewayMethods,
events: GATEWAY_EVENTS,
logGateway: log,

View File

@@ -1,3 +1,4 @@
import { writeFile } from "node:fs/promises";
import http from "node:http";
import { afterEach, describe, expect, it } from "vitest";
import { WebSocketServer } from "ws";
@@ -151,6 +152,48 @@ describe("gateway pre-auth hardening", () => {
}
});
// Verifies that an idle, unauthenticated socket is closed with code 1000 when
// gateway.handshakeTimeoutMs is set through the config file on disk.
it("uses gateway.handshakeTimeoutMs for idle unauthenticated sockets", async () => {
const configPath = process.env.OPENCLAW_CONFIG_PATH;
if (!configPath) {
throw new Error("OPENCLAW_CONFIG_PATH missing in gateway preauth test");
}
// Write the short timeout before the harness is created, so the config is on disk first.
await writeFile(
configPath,
JSON.stringify(
{
gateway: {
handshakeTimeoutMs: 250,
},
},
null,
2,
),
"utf-8",
);
try {
const harness = await createGatewaySuiteHarness({
serverOptions: { auth: { mode: "none" } },
});
try {
const ws = await harness.openWs();
await readConnectChallengeNonce(ws);
// Never send a connect frame: just wait for the server to close the socket.
const close = await new Promise<{ code: number; elapsedMs: number }>((resolve) => {
// The clock starts after the challenge has been read, so elapsedMs excludes
// connection setup and challenge delivery.
const startedAt = Date.now();
ws.once("close", (code) => {
resolve({ code, elapsedMs: Date.now() - startedAt });
});
});
expect(close.code).toBe(1000);
expect(close.elapsedMs).toBeGreaterThan(0);
// NOTE(review): only an upper bound is asserted; presumably the limit constant is
// below the default timeout so this proves the configured value was used — confirm.
expect(close.elapsedMs).toBeLessThan(PREAUTH_HANDSHAKE_TEST_CLOSE_LIMIT_MS);
} finally {
await harness.close();
}
} finally {
// NOTE(review): resets the config to an empty object rather than restoring the
// previous file contents — confirm the suite's baseline config is empty.
await writeFile(configPath, "{}\n", "utf-8");
}
});
it("rejects oversized pre-auth connect frames before application-level auth responses", async () => {
resetDiagnosticEventsForTest();
const events: DiagnosticEventPayload[] = [];
@@ -227,7 +270,9 @@ describe("gateway pre-auth hardening", () => {
});
req.once("response", (res) => {
res.resume();
resolve(res.statusCode ?? 0);
res.once("end", () => {
resolve(res.statusCode ?? 0);
});
});
req.once("error", reject);
req.end();

View File

@@ -11,7 +11,7 @@ import { truncateUtf16Safe } from "../../utils.js";
import { isWebchatClient } from "../../utils/message-channel.js";
import type { AuthRateLimiter } from "../auth-rate-limit.js";
import type { ResolvedGatewayAuth } from "../auth.js";
import { getPreauthHandshakeTimeoutMsFromEnv } from "../handshake-timeouts.js";
import { resolvePreauthHandshakeTimeoutMs } from "../handshake-timeouts.js";
import { isLoopbackAddress } from "../net.js";
import { MAX_PAYLOAD_BYTES, MAX_PREAUTH_PAYLOAD_BYTES } from "../server-constants.js";
import { clearNodeWakeState } from "../server-methods/nodes-wake-state.js";
@@ -131,6 +131,7 @@ export type GatewayWsSharedHandlerParams = {
rateLimiter?: AuthRateLimiter;
/** Browser-origin fallback limiter (loopback is never exempt). */
browserRateLimiter?: AuthRateLimiter;
preauthHandshakeTimeoutMs?: number;
gatewayMethods: string[];
events: string[];
refreshHealthSnapshot: GatewayRequestContext["refreshHealthSnapshot"];
@@ -365,7 +366,9 @@ export function attachGatewayWsConnectionHandler(params: AttachGatewayWsConnecti
close();
});
const handshakeTimeoutMs = getPreauthHandshakeTimeoutMsFromEnv();
const handshakeTimeoutMs = resolvePreauthHandshakeTimeoutMs({
configuredTimeoutMs: params.preauthHandshakeTimeoutMs,
});
const handshakeTimer = setTimeout(() => {
if (!client) {
handshakeState = "failed";

View File

@@ -10,7 +10,7 @@ import {
type GatewayAuthResult,
type ResolvedGatewayAuth,
} from "../auth.js";
import { getPreauthHandshakeTimeoutMsFromEnv } from "../handshake-timeouts.js";
import { resolvePreauthHandshakeTimeoutMs } from "../handshake-timeouts.js";
import { VoiceClawGeminiLiveAdapter } from "./gemini-live.js";
import {
createVoiceClawRealtimeToolRuntime,
@@ -70,12 +70,17 @@ export class VoiceClawRealtimeSession {
}
attach(): void {
this.handshakeTimer = setTimeout(() => {
if (!this.config && !this.closed) {
log.warn(`session ${this.id} handshake timed out`);
this.ws.close(1000, "handshake timeout");
}
}, getPreauthHandshakeTimeoutMsFromEnv());
this.handshakeTimer = setTimeout(
() => {
if (!this.config && !this.closed) {
log.warn(`session ${this.id} handshake timed out`);
this.ws.close(1000, "handshake timeout");
}
},
resolvePreauthHandshakeTimeoutMs({
configuredTimeoutMs: this.gatewayConfig.gateway?.handshakeTimeoutMs,
}),
);
this.ws.on("message", (raw) => {
void this.handleRawMessage(raw).catch((err) => {

View File

@@ -69,12 +69,34 @@ describe("VoiceClaw realtime gateway upgrade", () => {
}
});
});
it("uses gateway.handshakeTimeoutMs for idle realtime sockets", async () => {
  // Gateway config with a short handshake timeout; the client below never sends anything.
  const cfg = { gateway: { auth: { mode: "none" }, handshakeTimeoutMs: 60 } };
  await withRealtimeGateway(async ({ port }) => {
    const socket = new WebSocket(`ws://127.0.0.1:${port}${VOICECLAW_REALTIME_PATH}`);
    try {
      await waitForOpen(socket);
      // The server is expected to close the idle socket with the handshake-timeout reason.
      const closed = waitForClose(socket);
      await expect(closed).resolves.toMatchObject({
        code: 1000,
        reason: "handshake timeout",
      });
    } finally {
      await closeWebSocket(socket);
    }
  }, cfg);
});
});
async function withRealtimeGateway(run: (params: { port: number }) => Promise<void>) {
async function withRealtimeGateway(
run: (params: { port: number }) => Promise<void>,
cfg: Record<string, unknown> = { gateway: { auth: { mode: "none" } } },
) {
const resolvedAuth: ResolvedGatewayAuth = { mode: "none", allowTailscale: false };
await withTempConfig({
cfg: { gateway: { auth: { mode: "none" } } },
cfg,
run: async () => {
const clients = new Set<GatewayWsClient>();
const httpServer = createGatewayHttpServer({