diff --git a/CHANGELOG.md b/CHANGELOG.md index da341abad8f..7c2b98df354 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -226,6 +226,7 @@ Docs: https://docs.openclaw.ai - Feishu/reply delivery reliability: disable block streaming in Feishu reply options so plain-text auto-render replies are no longer silently dropped before final delivery. (#38258) Thanks @xinhuagu. - Agents/reply MEDIA delivery: normalize local assistant `MEDIA:` paths before block/final delivery, keep media dedupe aligned with message-tool sends, and contain malformed media normalization failures so generated files send reliably instead of falling back to empty responses. (#38572) Thanks @obviyus. - Sessions/bootstrap cache rollover invalidation: clear cached workspace bootstrap snapshots whenever an existing `sessionKey` rolls to a new `sessionId` across auto-reply, command, and isolated cron session resolvers, so `AGENTS.md`/`MEMORY.md`/`USER.md` updates are reloaded after daily, idle, or forced session resets instead of staying stale until gateway restart. (#38494) Thanks @LivingInDrm. +- Gateway/Telegram polling health monitor: skip stale-socket restarts for Telegram long-polling channels and thread channel identity through shared health evaluation so polling connections are not restarted on the WebSocket stale-socket heuristic. (#38395) Thanks @ql-wade and @Takhoffman. ## 2026.3.2 diff --git a/src/gateway/channel-health-monitor.ts b/src/gateway/channel-health-monitor.ts index 4ed422468f0..fb8715a12f1 100644 --- a/src/gateway/channel-health-monitor.ts +++ b/src/gateway/channel-health-monitor.ts @@ -122,6 +122,7 @@ export function startChannelHealthMonitor(deps: ChannelHealthMonitorDeps): Chann continue; } const healthPolicy: ChannelHealthPolicy = { + channelId, now, staleEventThresholdMs: timing.staleEventThresholdMs, channelConnectGraceMs: timing.channelConnectGraceMs, diff --git a/src/gateway/channel-health-policy.test.ts b/src/gateway/channel-health-policy.test.ts index 71b8f7ce896..125658ae53a 100644 --- a/src/gateway/channel-health-policy.test.ts +++ b/src/gateway/channel-health-policy.test.ts @@ -10,6 +10,7 @@ describe("evaluateChannelHealth", () => { configured: true, }, { + channelId: "discord", now: 100_000, channelConnectGraceMs: 10_000, staleEventThresholdMs: 30_000, @@ -28,6 +29,7 @@ describe("evaluateChannelHealth", () => { lastStartAt: 95_000, }, { + channelId: "discord", now: 100_000, channelConnectGraceMs: 10_000, staleEventThresholdMs: 30_000, @@ -48,6 +50,7 @@ describe("evaluateChannelHealth", () => { lastRunActivityAt: now - 30_000, }, { + channelId: "discord", now, channelConnectGraceMs: 10_000, staleEventThresholdMs: 30_000, @@ -68,6 +71,7 @@ describe("evaluateChannelHealth", () => { lastRunActivityAt: now - 26 * 60_000, }, { + channelId: "discord", now, channelConnectGraceMs: 10_000, staleEventThresholdMs: 30_000, @@ -90,6 +94,7 @@ describe("evaluateChannelHealth", () => { lastRunActivityAt: now - 31_000, }, { + channelId: "discord", now, channelConnectGraceMs: 10_000, staleEventThresholdMs: 30_000, @@ -109,6 +114,7 @@ describe("evaluateChannelHealth", () => { lastEventAt: null, }, { + channelId: "discord", now: 100_000, channelConnectGraceMs: 10_000, staleEventThresholdMs: 30_000, @@ -116,6 +122,26 @@ describe("evaluateChannelHealth", () => { ); expect(evaluation).toEqual({ healthy: false, reason: "stale-socket" }); }); + + it("skips stale-socket detection for telegram long-polling channels", () => { + const evaluation = evaluateChannelHealth( + { + running: true, + connected: true, + enabled: true, + configured: true, + lastStartAt: 0, + lastEventAt: null, + }, + { + channelId: "telegram", + now: 100_000, + channelConnectGraceMs: 10_000, + staleEventThresholdMs: 30_000, + }, + ); + expect(evaluation).toEqual({ healthy: true, reason: "healthy" }); + }); }); describe("resolveChannelRestartReason", () => { diff --git a/src/gateway/channel-health-policy.ts b/src/gateway/channel-health-policy.ts index d0616f04862..80b0b3546ad 100644 --- a/src/gateway/channel-health-policy.ts +++ b/src/gateway/channel-health-policy.ts @@ -1,3 +1,5 @@ +import type { ChannelId } from "../channels/plugins/types.js"; + export type ChannelHealthSnapshot = { running?: boolean; connected?: boolean; @@ -28,6 +30,7 @@ export type ChannelHealthEvaluation = { }; export type ChannelHealthPolicy = { + channelId: ChannelId; now: number; staleEventThresholdMs: number; channelConnectGraceMs: number; @@ -97,14 +100,19 @@ export function evaluateChannelHealth( if (snapshot.connected === false) { return { healthy: false, reason: "disconnected" }; } - if (snapshot.lastEventAt != null || snapshot.lastStartAt != null) { - const upSince = snapshot.lastStartAt ?? 0; - const upDuration = policy.now - upSince; - if (upDuration > policy.staleEventThresholdMs) { - const lastEvent = snapshot.lastEventAt ?? 0; - const eventAge = policy.now - lastEvent; - if (eventAge > policy.staleEventThresholdMs) { - return { healthy: false, reason: "stale-socket" }; + // Skip stale-socket check for Telegram (long-polling mode). Each polling request + // acts as a heartbeat, so the half-dead WebSocket scenario this check is designed + // to catch does not apply to Telegram's long-polling architecture. + if (policy.channelId !== "telegram") { + if (snapshot.lastEventAt != null || snapshot.lastStartAt != null) { + const upSince = snapshot.lastStartAt ?? 0; + const upDuration = policy.now - upSince; + if (upDuration > policy.staleEventThresholdMs) { + const lastEvent = snapshot.lastEventAt ?? 0; + const eventAge = policy.now - lastEvent; + if (eventAge > policy.staleEventThresholdMs) { + return { healthy: false, reason: "stale-socket" }; + } } } } diff --git a/src/gateway/server/readiness.test.ts b/src/gateway/server/readiness.test.ts index c41f8d050f2..9e502077d20 100644 --- a/src/gateway/server/readiness.test.ts +++ b/src/gateway/server/readiness.test.ts @@ -167,6 +167,28 @@ describe("createReadinessChecker", () => { vi.useRealTimers(); }); + it("keeps telegram long-polling channels ready without stale-socket classification", () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date("2026-03-06T12:00:00Z")); + const startedAt = Date.now() - 31 * 60_000; + const manager = createManager( + snapshotWith({ + telegram: { + running: true, + connected: true, + enabled: true, + configured: true, + lastStartAt: startedAt, + lastEventAt: null, + }, + }), + ); + + const readiness = createReadinessChecker({ channelManager: manager, startedAt }); + expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 1_860_000 }); + vi.useRealTimers(); + }); + it("caches readiness snapshots briefly to keep repeated probes cheap", () => { vi.useFakeTimers(); vi.setSystemTime(new Date("2026-03-06T12:00:00Z")); diff --git a/src/gateway/server/readiness.ts b/src/gateway/server/readiness.ts index e6ad2d92afb..527dad24949 100644 --- a/src/gateway/server/readiness.ts +++ b/src/gateway/server/readiness.ts @@ -50,11 +50,6 @@ export function createReadinessChecker(deps: { const snapshot = channelManager.getRuntimeSnapshot(); const failing: string[] = []; - const policy: ChannelHealthPolicy = { - now, - staleEventThresholdMs: DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS, - channelConnectGraceMs: DEFAULT_CHANNEL_CONNECT_GRACE_MS, - }; for (const [channelId, accounts] of Object.entries(snapshot.channelAccounts)) { if (!accounts) { @@ -64,6 +59,12 @@ export function createReadinessChecker(deps: { if (!accountSnapshot) { continue; } + const policy: ChannelHealthPolicy = { + now, + staleEventThresholdMs: DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS, + channelConnectGraceMs: DEFAULT_CHANNEL_CONNECT_GRACE_MS, + channelId, + }; const health = evaluateChannelHealth(accountSnapshot, policy); if (!health.healthy && !shouldIgnoreReadinessFailure(accountSnapshot, health)) { failing.push(channelId);