From f7c32fc8befda7e8e4ef6dbf2c1ec823ff68a3bb Mon Sep 17 00:00:00 2001 From: amittell Date: Thu, 28 May 2026 00:43:13 -0400 Subject: [PATCH] fix(telegram): lower polling keepalive delay (#83304) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(telegram): enable TCP keepalive on getUpdates connections to prevent NAT timeout stalls Long-polling connections to api.telegram.org stay idle for up to the getUpdates timeout (~900 s). Most home/office NAT tables expire idle TCP entries after 60–1800 s (commonly ~1000 s). When the NAT entry is silently dropped the connection hangs rather than returning an error, leaving the grammY runner stuck until the 90 s stall watchdog fires and forces a restart cycle. Fix: unconditionally set `keepAlive: true` and `keepAliveInitialDelay: 30_000` (30 s) on the undici Agent `connect` options built in `buildTelegramConnectOptions`. OS-level TCP keepalive probes sent every ~75 s (OS default) will: 1. Refresh the NAT table entry before it expires. 2. Surface dead connections immediately with ETIMEDOUT instead of hanging forever. The `return Object.keys(connect).length > 0 ? connect : null` guard is also removed; `connect` is now always non-empty so it always returns the object. Co-Authored-By: Claude Sonnet 4.6 (cherry picked from commit 92e454c0614256201cdf6f0f73c7897d006616d4) * fix(telegram): stop self-flagging disconnected on poll-cycle start; widen channel connect grace to 300s (cherry picked from commit 1ca963a05dac0d9d605e9a15dc97fced9cf7725e) * fix(telegram): catch hung polling startups that preserve inherited connected:true The widened 300s channel connect grace and the removal of connected:false from notePollingStart left a path where a polling restart could hang forever looking healthy. notePollingStart clears lastConnectedAt, lastEventAt, and lastTransportActivityAt but deliberately omits connected, so server-channels' patch-merge inherits a connected:true from the previous lifecycle. After grace, evaluateChannelHealth's stale-socket branch requires lastTransportActivityAt to be non-null and the connected:false branch is masked, so the channel sits healthy with no first getUpdates. Add a post-grace branch to evaluateChannelHealth that flags polling channels as stale-socket when connected:true is paired with null lastConnectedAt and null lastTransportActivityAt and a non-null lastStartAt. Scoped to mode:polling so webhook channels and channels without continuous transport tracking are not falsely flagged. Align TELEGRAM_POLLING_CONNECT_GRACE_MS in the Telegram status diagnostic with DEFAULT_CHANNEL_CONNECT_GRACE_MS so openclaw channels status agrees with the shared health monitor on the grace window. Refresh the notePollingStart comment to point at the new evaluateChannelHealth branch. Addresses clawsweeper review on #83304 (P1 connect-grace startup-hang, P2 diagnostic grace drift). Tests cover the new flagged path, the in-grace happy path, and the prior-successful-connect happy path. * fix(telegram): clear polling connected state on startup * fix(gateway): add defense-in-depth health-policy branch for hung polling startups Defense in depth on top of 87db46c576's notePollingStart connected:false fix. The primary path (notePollingStart writes connected:false explicitly so evaluateChannelHealth's existing connected===false branch catches a hung restart) is unchanged. This adds a defensive post-grace branch that catches the same hang via a different signature -- inherited connected:true paired with null lastConnectedAt and null lastTransportActivityAt -- in case a future code path forgets to clear the inherited connected flag on lifecycle start. Scoped to mode:polling so webhook channels and channels without continuous transport tracking are not falsely flagged. Also bump lastStartAt: Date.now() - 121_000 to 301_000 in the spool-handler timeout test added by upstream #83505 so it falls past the widened 300s TELEGRAM_POLLING_CONNECT_GRACE_MS suppression window (mirroring the same fixup already applied to the two adjacent polling-startup tests). * revert(telegram,gateway): keep connect grace at 120s Drop the 120s -> 300s widening from this PR after maintainer feedback that the extra grace masks real startup bugs. The defense-in-depth checks added in earlier commits (notePollingStart clearing inherited connected state, the stale-socket policy branch, the per-snapshot startup grace test) all work fine at 120s and remain valuable on their own. Reverts in: - src/gateway/channel-health-policy.ts: DEFAULT_CHANNEL_CONNECT_GRACE_MS 300 -> 120 - extensions/telegram/src/status-issues.ts: TELEGRAM_POLLING_CONNECT_GRACE_MS 300 -> 120 - extensions/telegram/src/status.test.ts: lastStartAt 301_000 -> 121_000 (3 cases) The new channel-health-policy.test.ts cases use explicit channelConnectGraceMs: 10_000 in the policy, so they are unaffected by the default constant change. * fix(telegram): narrow polling keepalive fix --------- Co-authored-by: Yibei Ou Co-authored-by: Claude Sonnet 4.6 Co-authored-by: Ayaan Zaidi --- extensions/telegram/src/fetch.test.ts | 8 ++++++ extensions/telegram/src/fetch.ts | 38 +++++++++++++++------------ 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/extensions/telegram/src/fetch.test.ts b/extensions/telegram/src/fetch.test.ts index 7514abb5ccb..bc7d581d4f4 100644 --- a/extensions/telegram/src/fetch.test.ts +++ b/extensions/telegram/src/fetch.test.ts @@ -281,6 +281,11 @@ function expectStickyAutoSelectDispatcher( expect(options?.autoSelectFamilyAttemptTimeout).toBe(300); } +function expectTelegramKeepAliveOptions(options: Record | undefined): void { + expect(options?.keepAlive).toBe(true); + expect(options?.keepAliveInitialDelay).toBe(30_000); +} + function expectHttp1OnlyDispatcher( dispatcher: | { @@ -424,6 +429,7 @@ describe("resolveTelegramFetch", () => { expectHttp1OnlyDispatcher(dispatcher); expect(dispatcher?.options?.connect?.autoSelectFamily).toBe(true); expect(dispatcher?.options?.connect?.autoSelectFamilyAttemptTimeout).toBe(300); + expectTelegramKeepAliveOptions(dispatcher?.options?.connect); expect(typeof dispatcher?.options?.connect?.lookup).toBe("function"); }); @@ -460,8 +466,10 @@ describe("resolveTelegramFetch", () => { expectHttp1OnlyDispatcher(dispatcher); expect(dispatcher?.options?.connect?.autoSelectFamily).toBe(false); expect(dispatcher?.options?.connect?.autoSelectFamilyAttemptTimeout).toBe(300); + expectTelegramKeepAliveOptions(dispatcher?.options?.connect); expect(dispatcher?.options?.proxyTls?.autoSelectFamily).toBe(false); expect(dispatcher?.options?.proxyTls?.autoSelectFamilyAttemptTimeout).toBe(300); + expectTelegramKeepAliveOptions(dispatcher?.options?.proxyTls); }); it("adds managed proxy CA trust to Telegram env proxy dispatchers", async () => { diff --git a/extensions/telegram/src/fetch.ts b/extensions/telegram/src/fetch.ts index 460f8694a3f..655b4f9896d 100644 --- a/extensions/telegram/src/fetch.ts +++ b/extensions/telegram/src/fetch.ts @@ -159,6 +159,8 @@ function createDnsResultOrderLookup( }; } +const TELEGRAM_KEEPALIVE_INITIAL_DELAY_MS = 30_000; + function buildTelegramConnectOptions(params: { autoSelectFamily: boolean | null; dnsResultOrder: TelegramDnsResultOrder | null; @@ -167,14 +169,21 @@ function buildTelegramConnectOptions(params: { autoSelectFamily?: boolean; autoSelectFamilyAttemptTimeout?: number; family?: number; + keepAlive?: boolean; + keepAliveInitialDelay?: number; lookup?: LookupFunction; -} | null { +} { const connect: { autoSelectFamily?: boolean; autoSelectFamilyAttemptTimeout?: number; family?: number; + keepAlive?: boolean; + keepAliveInitialDelay?: number; lookup?: LookupFunction; - } = {}; + } = { + keepAlive: true, + keepAliveInitialDelay: TELEGRAM_KEEPALIVE_INITIAL_DELAY_MS, + }; if (params.forceIpv4) { connect.family = 4; @@ -189,7 +198,7 @@ function buildTelegramConnectOptions(params: { connect.lookup = lookup; } - return Object.keys(connect).length > 0 ? connect : null; + return connect; } function shouldBypassEnvProxyForTelegramApi(env: NodeJS.ProcessEnv = process.env): boolean { @@ -252,18 +261,12 @@ function resolveTelegramDispatcherPolicy(params: { const explicitProxyUrl = params.proxyUrl?.trim(); if (explicitProxyUrl) { return { - policy: connect - ? { - mode: "explicit-proxy", - proxyUrl: explicitProxyUrl, - allowPrivateProxy: true, - proxyTls: { ...connect }, - } - : { - mode: "explicit-proxy", - proxyUrl: explicitProxyUrl, - allowPrivateProxy: true, - }, + policy: { + mode: "explicit-proxy", + proxyUrl: explicitProxyUrl, + allowPrivateProxy: true, + proxyTls: { ...connect }, + }, mode: "explicit-proxy", }; } @@ -271,7 +274,8 @@ function resolveTelegramDispatcherPolicy(params: { return { policy: { mode: "env-proxy", - ...(connect ? { connect: { ...connect }, proxyTls: { ...connect } } : {}), + connect: { ...connect }, + proxyTls: { ...connect }, }, mode: "env-proxy", }; @@ -279,7 +283,7 @@ function resolveTelegramDispatcherPolicy(params: { return { policy: { mode: "direct", - ...(connect ? { connect: { ...connect } } : {}), + connect: { ...connect }, }, mode: "direct", };