fix: use transport activity for stale health

This commit is contained in:
Peter Steinberger
2026-04-22 02:56:32 +01:00
parent 270003aefd
commit d8d0380297
29 changed files with 192 additions and 122 deletions

View File

@@ -1066,6 +1066,8 @@ export async function monitorDiscordProvider(opts: MonitorDiscordOpts = {}) {
// Inbound-event tracker. It only exists when the caller supplied a status
// sink; otherwise it stays undefined.
const trackInboundEvent = !opts.setStatus
  ? undefined
  : () => {
      // Carbon handles gateway heartbeats internally but does not expose a
      // stable heartbeat-ack event, so Discord app events stay app-level only.
      // One clock read feeds both fields so they always agree.
      const receivedAt = Date.now();
      opts.setStatus?.({ lastEventAt: receivedAt, lastInboundAt: receivedAt });
    };

View File

@@ -1,5 +1,8 @@
import type { ChannelAccountSnapshot } from "openclaw/plugin-sdk/channel-contract";
import { createConnectedChannelStatusPatch } from "openclaw/plugin-sdk/gateway-runtime";
import {
createConnectedChannelStatusPatch,
createTransportActivityStatusPatch,
} from "openclaw/plugin-sdk/gateway-runtime";
import { formatMatrixErrorMessage } from "../errors.js";
import {
isMatrixDisconnectedSyncState,
@@ -52,12 +55,15 @@ export function createMatrixMonitorStatusController(params: {
});
};
const noteConnected = (at = Date.now()) => {
const noteConnected = (at = Date.now(), options?: { transportActivity?: boolean }) => {
if (status.connected === true) {
status.lastEventAt = at;
} else {
Object.assign(status, createConnectedChannelStatusPatch(at));
}
if (options?.transportActivity) {
Object.assign(status, createTransportActivityStatusPatch(at));
}
status.lastError = null;
status.lastDisconnect = null;
status.healthState = "healthy";
@@ -83,7 +89,10 @@ export function createMatrixMonitorStatusController(params: {
return {
noteSyncState(state: MatrixSyncState, error?: unknown, at = Date.now()) {
if (isMatrixReadySyncState(state)) {
noteConnected(at);
// matrix-js-sdk emits SYNCING after each successful /sync response.
// PREPARED can be cache-backed and CATCHUP is a lifecycle bridge, so
// neither should refresh transport liveness.
noteConnected(at, { transportActivity: state === "SYNCING" });
return;
}
if (isMatrixDisconnectedSyncState(state)) {

View File

@@ -129,6 +129,42 @@ describe("createMatrixMonitorSyncLifecycle", () => {
);
});
it("only refreshes transport liveness for successful sync responses", async () => {
  // Freeze the clock so every timestamp observed below is deterministic.
  vi.useFakeTimers();
  vi.setSystemTime(new Date("2026-04-10T16:21:00.000Z"));
  const { client, lifecycle, setStatus } = createSyncLifecycleHarness();
  // Matcher: the status patch carries the given transport-activity timestamp.
  const withTransportActivityAt = (expected: unknown) =>
    expect.objectContaining({ lastTransportActivityAt: expected });
  try {
    setStatus.mockClear();

    // PREPARED: the latest patch must not carry a numeric transport timestamp.
    client.emit("sync.state", "PREPARED", null, undefined);
    expect(setStatus).toHaveBeenLastCalledWith(
      expect.not.objectContaining({ lastTransportActivityAt: expect.any(Number) }),
    );

    // SYNCING, two seconds later: the timestamp equals the current fake time.
    await vi.advanceTimersByTimeAsync(2_000);
    client.emit("sync.state", "SYNCING", "PREPARED", undefined);
    const syncingAt = Date.now();
    expect(setStatus).toHaveBeenLastCalledWith(withTransportActivityAt(syncingAt));

    // CATCHUP, three seconds after that: the SYNCING timestamp is still reported.
    await vi.advanceTimersByTimeAsync(3_000);
    client.emit("sync.state", "CATCHUP", "SYNCING", undefined);
    expect(setStatus).toHaveBeenLastCalledWith(withTransportActivityAt(syncingAt));
  } finally {
    // Always release the lifecycle and restore real timers for later tests.
    lifecycle.dispose();
    vi.useRealTimers();
  }
});
it("does not downgrade a fatal error to stopped during shutdown", async () => {
const { client, lifecycle, setStatus, setStopping, statusController } =
createSyncLifecycleHarness({

View File

@@ -284,10 +284,6 @@ describe("slackPlugin actions", () => {
});
describe("slackPlugin status", () => {
it("opts out of the generic stale socket health check", () => {
expect(slackPlugin.status?.skipStaleSocketHealthCheck).toBe(true);
});
it("uses the direct Slack probe helper when runtime is not initialized", async () => {
const probeSpy = vi.spyOn(probeModule, "probeSlack").mockResolvedValueOnce({
ok: true,

View File

@@ -508,9 +508,8 @@ export async function monitorSlackProvider(opts: MonitorSlackOpts = {}) {
removeAckAfterReply,
});
// Wire up event liveness tracking: update lastEventAt on every inbound event
// so the health monitor can detect "half-dead" sockets that pass health checks
// but silently stop delivering events.
// Slack's socket-mode client keeps ping/pong health private and closes on
// missed pongs. App events are useful status activity, but not transport proof.
const trackEvent = opts.setStatus
? () => {
// Read the clock once so lastEventAt and lastInboundAt always carry the
// same timestamp; two separate Date.now() calls can straddle a millisecond
// boundary and report different values for a single event.
const at = Date.now();
// Optional call instead of a non-null assertion: if the sink has been
// cleared by the time an event arrives, this is a no-op rather than a throw.
opts.setStatus?.({ lastEventAt: at, lastInboundAt: at });

View File

@@ -14,7 +14,7 @@ export type MonitorSlackOpts = {
abortSignal?: AbortSignal;
mediaMaxMb?: number;
slashCommand?: SlackSlashCommandConfig;
/** Callback to update the channel account status snapshot (e.g. lastEventAt). */
/** Callback to update app-level channel account activity (e.g. lastEventAt). */
setStatus?: (next: Record<string, unknown>) => void;
/** Callback to read the current channel account status snapshot. */
getStatus?: () => Record<string, unknown>;

View File

@@ -590,6 +590,7 @@ describe("TelegramPollingSession", () => {
connected: false,
lastConnectedAt: null,
lastEventAt: null,
lastTransportActivityAt: null,
});
const connectedPatch = setStatus.mock.calls.find(
([patch]) => (patch as Record<string, unknown>).connected === true,
@@ -599,9 +600,11 @@ describe("TelegramPollingSession", () => {
mode: "polling",
lastConnectedAt: expect.any(Number),
lastEventAt: expect.any(Number),
lastTransportActivityAt: expect.any(Number),
lastError: null,
});
expect(connectedPatch?.lastConnectedAt).toBe(connectedPatch?.lastEventAt);
expect(connectedPatch?.lastTransportActivityAt).toBe(connectedPatch?.lastEventAt);
abort.abort();
resolveFirstTask();
@@ -681,6 +684,7 @@ describe("TelegramPollingSession", () => {
mode: "polling",
lastConnectedAt: null,
lastEventAt: null,
lastTransportActivityAt: null,
});
expect(disconnectedPatches[1]?.[0]).toEqual({
mode: "polling",

View File

@@ -15,12 +15,14 @@ describe("createTelegramPollingStatusPublisher", () => {
connected: false,
lastConnectedAt: null,
lastEventAt: null,
lastTransportActivityAt: null,
});
expect(setStatus).toHaveBeenNthCalledWith(2, {
mode: "polling",
connected: true,
lastConnectedAt: 1234,
lastEventAt: 1234,
lastTransportActivityAt: 1234,
lastError: null,
});
expect(setStatus).toHaveBeenNthCalledWith(3, {

View File

@@ -1,5 +1,8 @@
import type { ChannelAccountSnapshot } from "openclaw/plugin-sdk/channel-contract";
import { createConnectedChannelStatusPatch } from "openclaw/plugin-sdk/gateway-runtime";
import {
createConnectedChannelStatusPatch,
createTransportActivityStatusPatch,
} from "openclaw/plugin-sdk/gateway-runtime";
type TelegramPollingStatusSink = (patch: Omit<ChannelAccountSnapshot, "accountId">) => void;
@@ -11,11 +14,15 @@ export function createTelegramPollingStatusPublisher(setStatus?: TelegramPolling
connected: false,
lastConnectedAt: null,
lastEventAt: null,
lastTransportActivityAt: null,
});
},
notePollSuccess(at = Date.now()) {
setStatus?.({
...createConnectedChannelStatusPatch(at),
// A successful getUpdates call proves the Telegram HTTP long-poll is alive
// even when the response has no user-visible updates.
...createTransportActivityStatusPatch(at),
mode: "polling",
lastError: null,
});