fix(gateway): skip stale-socket restarts for Telegram polling (openclaw#38405)

Verified:
- pnpm build
- pnpm check
- pnpm test:macmini

Co-authored-by: ql-wade <262266039+ql-wade@users.noreply.github.com>
This commit is contained in:
ql-wade
2026-03-07 14:20:34 +08:00
committed by GitHub
parent 2e31aead39
commit a5c07fa115
6 changed files with 72 additions and 13 deletions

View File

@@ -226,6 +226,7 @@ Docs: https://docs.openclaw.ai
- Feishu/reply delivery reliability: disable block streaming in Feishu reply options so plain-text auto-render replies are no longer silently dropped before final delivery. (#38258) Thanks @xinhuagu.
- Agents/reply MEDIA delivery: normalize local assistant `MEDIA:` paths before block/final delivery, keep media dedupe aligned with message-tool sends, and contain malformed media normalization failures so generated files send reliably instead of falling back to empty responses. (#38572) Thanks @obviyus.
- Sessions/bootstrap cache rollover invalidation: clear cached workspace bootstrap snapshots whenever an existing `sessionKey` rolls to a new `sessionId` across auto-reply, command, and isolated cron session resolvers, so `AGENTS.md`/`MEMORY.md`/`USER.md` updates are reloaded after daily, idle, or forced session resets instead of staying stale until gateway restart. (#38494) Thanks @LivingInDrm.
- Gateway/Telegram polling health monitor: skip stale-socket restarts for Telegram long-polling channels and thread channel identity through shared health evaluation so polling connections are not restarted on the WebSocket stale-socket heuristic. (#38395) Thanks @ql-wade and @Takhoffman.
## 2026.3.2

View File

@@ -122,6 +122,7 @@ export function startChannelHealthMonitor(deps: ChannelHealthMonitorDeps): Chann
continue;
}
const healthPolicy: ChannelHealthPolicy = {
channelId,
now,
staleEventThresholdMs: timing.staleEventThresholdMs,
channelConnectGraceMs: timing.channelConnectGraceMs,

View File

@@ -10,6 +10,7 @@ describe("evaluateChannelHealth", () => {
configured: true,
},
{
channelId: "discord",
now: 100_000,
channelConnectGraceMs: 10_000,
staleEventThresholdMs: 30_000,
@@ -28,6 +29,7 @@ describe("evaluateChannelHealth", () => {
lastStartAt: 95_000,
},
{
channelId: "discord",
now: 100_000,
channelConnectGraceMs: 10_000,
staleEventThresholdMs: 30_000,
@@ -48,6 +50,7 @@ describe("evaluateChannelHealth", () => {
lastRunActivityAt: now - 30_000,
},
{
channelId: "discord",
now,
channelConnectGraceMs: 10_000,
staleEventThresholdMs: 30_000,
@@ -68,6 +71,7 @@ describe("evaluateChannelHealth", () => {
lastRunActivityAt: now - 26 * 60_000,
},
{
channelId: "discord",
now,
channelConnectGraceMs: 10_000,
staleEventThresholdMs: 30_000,
@@ -90,6 +94,7 @@ describe("evaluateChannelHealth", () => {
lastRunActivityAt: now - 31_000,
},
{
channelId: "discord",
now,
channelConnectGraceMs: 10_000,
staleEventThresholdMs: 30_000,
@@ -109,6 +114,7 @@ describe("evaluateChannelHealth", () => {
lastEventAt: null,
},
{
channelId: "discord",
now: 100_000,
channelConnectGraceMs: 10_000,
staleEventThresholdMs: 30_000,
@@ -116,6 +122,26 @@ describe("evaluateChannelHealth", () => {
);
expect(evaluation).toEqual({ healthy: false, reason: "stale-socket" });
});
// Regression test: a telegram channel must never be reported as "stale-socket".
// The snapshot below would trip the stale-socket branch for any other channel:
// with lastStartAt = 0 and lastEventAt = null (treated as 0 by the evaluator) at
// now = 100_000, both the uptime and the event age are 100_000 ms, which exceeds
// the 30_000 ms staleEventThresholdMs.
it("skips stale-socket detection for telegram long-polling channels", () => {
const evaluation = evaluateChannelHealth(
{
running: true,
connected: true,
enabled: true,
configured: true,
lastStartAt: 0,
// No event has ever been recorded for this account.
lastEventAt: null,
},
{
// The channel identity is what exempts this policy from the stale-socket check.
channelId: "telegram",
now: 100_000,
channelConnectGraceMs: 10_000,
staleEventThresholdMs: 30_000,
},
);
expect(evaluation).toEqual({ healthy: true, reason: "healthy" });
});
});
describe("resolveChannelRestartReason", () => {

View File

@@ -1,3 +1,5 @@
import type { ChannelId } from "../channels/plugins/types.js";
export type ChannelHealthSnapshot = {
running?: boolean;
connected?: boolean;
@@ -28,6 +30,7 @@ export type ChannelHealthEvaluation = {
};
export type ChannelHealthPolicy = {
channelId: ChannelId;
now: number;
staleEventThresholdMs: number;
channelConnectGraceMs: number;
@@ -97,14 +100,19 @@ export function evaluateChannelHealth(
if (snapshot.connected === false) {
return { healthy: false, reason: "disconnected" };
}
if (snapshot.lastEventAt != null || snapshot.lastStartAt != null) {
const upSince = snapshot.lastStartAt ?? 0;
const upDuration = policy.now - upSince;
if (upDuration > policy.staleEventThresholdMs) {
const lastEvent = snapshot.lastEventAt ?? 0;
const eventAge = policy.now - lastEvent;
if (eventAge > policy.staleEventThresholdMs) {
return { healthy: false, reason: "stale-socket" };
// Skip stale-socket check for Telegram (long-polling mode). Each polling request
// acts as a heartbeat, so the half-dead WebSocket scenario this check is designed
// to catch does not apply to Telegram's long-polling architecture.
if (policy.channelId !== "telegram") {
if (snapshot.lastEventAt != null || snapshot.lastStartAt != null) {
const upSince = snapshot.lastStartAt ?? 0;
const upDuration = policy.now - upSince;
if (upDuration > policy.staleEventThresholdMs) {
const lastEvent = snapshot.lastEventAt ?? 0;
const eventAge = policy.now - lastEvent;
if (eventAge > policy.staleEventThresholdMs) {
return { healthy: false, reason: "stale-socket" };
}
}
}
}

View File

@@ -167,6 +167,28 @@ describe("createReadinessChecker", () => {
vi.useRealTimers();
});
// Readiness-level counterpart of the telegram stale-socket exemption: a telegram
// account that has been up for 31 minutes without any recorded event must still
// be reported as ready, with no failing channels.
it("keeps telegram long-polling channels ready without stale-socket classification", () => {
// Pin the clock so Date.now() and the expected uptimeMs are deterministic.
vi.useFakeTimers();
vi.setSystemTime(new Date("2026-03-06T12:00:00Z"));
// 31 minutes of uptime; presumably longer than DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS
// so that a non-telegram channel would be flagged here — TODO confirm the constant.
const startedAt = Date.now() - 31 * 60_000;
const manager = createManager(
snapshotWith({
telegram: {
running: true,
connected: true,
enabled: true,
configured: true,
lastStartAt: startedAt,
// No event has ever been recorded for this account.
lastEventAt: null,
},
}),
);
const readiness = createReadinessChecker({ channelManager: manager, startedAt });
// 1_860_000 ms equals the 31 * 60_000 ms offset used for startedAt above.
expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 1_860_000 });
// NOTE(review): if the expectation above throws, this line is not reached and fake
// timers stay installed for later tests — confirm whether a shared afterEach resets them.
vi.useRealTimers();
});
it("caches readiness snapshots briefly to keep repeated probes cheap", () => {
vi.useFakeTimers();
vi.setSystemTime(new Date("2026-03-06T12:00:00Z"));

View File

@@ -50,11 +50,6 @@ export function createReadinessChecker(deps: {
const snapshot = channelManager.getRuntimeSnapshot();
const failing: string[] = [];
const policy: ChannelHealthPolicy = {
now,
staleEventThresholdMs: DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS,
channelConnectGraceMs: DEFAULT_CHANNEL_CONNECT_GRACE_MS,
};
for (const [channelId, accounts] of Object.entries(snapshot.channelAccounts)) {
if (!accounts) {
@@ -64,6 +59,12 @@ export function createReadinessChecker(deps: {
if (!accountSnapshot) {
continue;
}
const policy: ChannelHealthPolicy = {
now,
staleEventThresholdMs: DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS,
channelConnectGraceMs: DEFAULT_CHANNEL_CONNECT_GRACE_MS,
channelId,
};
const health = evaluateChannelHealth(accountSnapshot, policy);
if (!health.healthy && !shouldIgnoreReadinessFailure(accountSnapshot, health)) {
failing.push(channelId);