diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b8cb0cde4b..99c04190526 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -207,7 +207,7 @@ Docs: https://docs.openclaw.ai - WhatsApp media upload caps: make outbound media sends and auto-replies honor `channels.whatsapp.mediaMaxMb` with per-account overrides so inbound and outbound limits use the same channel config. Thanks @vincentkoc. - Windows/Plugin install: when OpenClaw runs on Windows via Bun and `npm-cli.js` is not colocated with the runtime binary, fall back to `npm.cmd`/`npx.cmd` through the existing `cmd.exe` wrapper so `openclaw plugins install` no longer fails with `spawn EINVAL`. (#38056) Thanks @0xlin2023. - Telegram/send retry classification: retry grammY `Network request ... failed after N attempts` envelopes in send flows without reclassifying plain `Network request ... failed!` wrappers as transient, restoring the intended retry path while keeping broad send-context message matching tight. (#38056) Thanks @0xlin2023. -- Gateway/probe route precedence: keep `/health`, `/healthz`, `/ready`, and `/readyz` reachable when the Control UI is mounted at `/`, so root-mounted SPA fallbacks no longer swallow machine probe routes while plugin-owned routes on those paths still keep precedence. (#18446) Thanks @vibecodooor and @vincentkoc. +- Gateway/probes: keep `/health`, `/healthz`, `/ready`, and `/readyz` reachable when the Control UI is mounted at `/`, preserve plugin-owned route precedence on those paths, and make `/ready` and `/readyz` report channel-backed readiness with startup grace plus `503` on disconnected managed channels, while `/health` and `/healthz` stay shallow liveness probes. (#18446) Thanks @vibecodooor, @mahsumaktas, and @vincentkoc. ## 2026.3.2 diff --git a/docs/install/docker.md b/docs/install/docker.md index 8cbf2555e87..1dd0d2325d1 100644 --- a/docs/install/docker.md +++ b/docs/install/docker.md @@ -476,6 +476,10 @@ curl -fsS http://127.0.0.1:18789/readyz Aliases: `/health` and `/ready`. +`/healthz` is a shallow liveness probe for "the gateway process is up". +`/readyz` stays ready during startup grace, then becomes `503` only if required +managed channels are still disconnected after grace or disconnect later. + The Docker image includes a built-in `HEALTHCHECK` that pings `/healthz` in the background. In plain terms: Docker keeps checking if OpenClaw is still responsive. If checks keep failing, Docker marks the container as `unhealthy`, diff --git a/src/channels/plugins/types.core.ts b/src/channels/plugins/types.core.ts index 6cd5173e13b..22f8e458e79 100644 --- a/src/channels/plugins/types.core.ts +++ b/src/channels/plugins/types.core.ts @@ -102,6 +102,7 @@ export type ChannelAccountSnapshot = { linked?: boolean; running?: boolean; connected?: boolean; + restartPending?: boolean; reconnectAttempts?: number; lastConnectedAt?: number | null; lastDisconnect?: diff --git a/src/gateway/channel-health-monitor.ts b/src/gateway/channel-health-monitor.ts index e66bc4912af..4ed422468f0 100644 --- a/src/gateway/channel-health-monitor.ts +++ b/src/gateway/channel-health-monitor.ts @@ -1,6 +1,8 @@ import type { ChannelId } from "../channels/plugins/types.js"; import { createSubsystemLogger } from "../logging/subsystem.js"; import { + DEFAULT_CHANNEL_CONNECT_GRACE_MS, + DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS, evaluateChannelHealth, resolveChannelRestartReason, type ChannelHealthPolicy, @@ -21,9 +23,6 @@ const ONE_HOUR_MS = 60 * 60_000; * This catches the half-dead WebSocket scenario where the connection appears * alive (health checks pass) but Slack silently stops delivering events. */ -const DEFAULT_STALE_EVENT_THRESHOLD_MS = 30 * 60_000; -const DEFAULT_CHANNEL_CONNECT_GRACE_MS = 120_000; - export type ChannelHealthTimingPolicy = { monitorStartupGraceMs: number; channelConnectGraceMs: number; @@ -70,7 +69,7 @@ function resolveTimingPolicy( staleEventThresholdMs: deps.timing?.staleEventThresholdMs ?? deps.staleEventThresholdMs ?? - DEFAULT_STALE_EVENT_THRESHOLD_MS, + DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS, }; } diff --git a/src/gateway/channel-health-policy.ts b/src/gateway/channel-health-policy.ts index 31938a90471..d0616f04862 100644 --- a/src/gateway/channel-health-policy.ts +++ b/src/gateway/channel-health-policy.ts @@ -3,6 +3,7 @@ export type ChannelHealthSnapshot = { connected?: boolean; enabled?: boolean; configured?: boolean; + restartPending?: boolean; busy?: boolean; activeRuns?: number; lastRunActivityAt?: number | null; @@ -39,6 +40,10 @@ function isManagedAccount(snapshot: ChannelHealthSnapshot): boolean { } const BUSY_ACTIVITY_STALE_THRESHOLD_MS = 25 * 60_000; +// Keep these shared between the background health monitor and on-demand readiness +// probes so both surfaces evaluate channel lifecycle windows consistently. +export const DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS = 30 * 60_000; +export const DEFAULT_CHANNEL_CONNECT_GRACE_MS = 120_000; export function evaluateChannelHealth( snapshot: ChannelHealthSnapshot, diff --git a/src/gateway/server-channels.ts b/src/gateway/server-channels.ts index 6c291541369..4090791d285 100644 --- a/src/gateway/server-channels.ts +++ b/src/gateway/server-channels.ts @@ -180,6 +180,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage enabled: false, configured: true, running: false, + restartPending: false, lastError: plugin.config.disabledReason?.(account, cfg) ?? "disabled", }); return; @@ -195,6 +196,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage enabled: true, configured: false, running: false, + restartPending: false, lastError: plugin.config.unconfiguredReason?.(account, cfg) ?? "not configured", }); return; @@ -215,6 +217,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage enabled: true, configured: true, running: true, + restartPending: false, lastStartAt: Date.now(), lastError: null, reconnectAttempts: preserveRestartAttempts ? (restartAttempts.get(rKey) ?? 0) : 0, @@ -252,6 +255,11 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage const attempt = (restartAttempts.get(rKey) ?? 0) + 1; restartAttempts.set(rKey, attempt); if (attempt > MAX_RESTART_ATTEMPTS) { + setRuntime(channelId, id, { + accountId: id, + restartPending: false, + reconnectAttempts: attempt, + }); log.error?.(`[${id}] giving up after ${MAX_RESTART_ATTEMPTS} restart attempts`); return; } @@ -261,6 +269,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage ); setRuntime(channelId, id, { accountId: id, + restartPending: true, reconnectAttempts: attempt, }); try { @@ -349,6 +358,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage setRuntime(channelId, id, { accountId: id, running: false, + restartPending: false, lastStopAt: Date.now(), }); }), @@ -377,6 +387,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage const next: ChannelAccountSnapshot = { accountId: resolvedId, running: false, + restartPending: false, lastError: cleared ? "logged out" : current.lastError, }; if (typeof current.connected === "boolean") { diff --git a/src/gateway/server-http.probe.test.ts b/src/gateway/server-http.probe.test.ts new file mode 100644 index 00000000000..0e55ddeba32 --- /dev/null +++ b/src/gateway/server-http.probe.test.ts @@ -0,0 +1,155 @@ +import { describe, expect, it } from "vitest"; +import { + AUTH_TOKEN, + AUTH_NONE, + createRequest, + createResponse, + dispatchRequest, + withGatewayServer, +} from "./server-http.test-harness.js"; +import type { ReadinessChecker } from "./server/readiness.js"; + +describe("gateway probe endpoints", () => { + it("returns detailed readiness payload for local /ready requests", async () => { + const getReadiness: ReadinessChecker = () => ({ + ready: true, + failing: [], + uptimeMs: 45_000, + }); + + await withGatewayServer({ + prefix: "probe-ready", + resolvedAuth: AUTH_NONE, + overrides: { getReadiness }, + run: async (server) => { + const req = createRequest({ path: "/ready" }); + const { res, getBody } = createResponse(); + await dispatchRequest(server, req, res); + + expect(res.statusCode).toBe(200); + expect(JSON.parse(getBody())).toEqual({ ready: true, failing: [], uptimeMs: 45_000 }); + }, + }); + }); + + it("returns only readiness state for unauthenticated remote /ready requests", async () => { + const getReadiness: ReadinessChecker = () => ({ + ready: false, + failing: ["discord", "telegram"], + uptimeMs: 8_000, + }); + + await withGatewayServer({ + prefix: "probe-not-ready", + resolvedAuth: AUTH_NONE, + overrides: { getReadiness }, + run: async (server) => { + const req = createRequest({ + path: "/ready", + remoteAddress: "10.0.0.8", + host: "gateway.test", + }); + const { res, getBody } = createResponse(); + await dispatchRequest(server, req, res); + + expect(res.statusCode).toBe(503); + expect(JSON.parse(getBody())).toEqual({ ready: false }); + }, + }); + }); + + it("returns detailed readiness payload for authenticated remote /ready requests", async () => { + const getReadiness: ReadinessChecker = () => ({ + ready: false, + failing: ["discord", "telegram"], + uptimeMs: 8_000, + }); + + await withGatewayServer({ + prefix: "probe-remote-authenticated", + resolvedAuth: AUTH_TOKEN, + overrides: { getReadiness }, + run: async (server) => { + const req = createRequest({ + path: "/ready", + remoteAddress: "10.0.0.8", + host: "gateway.test", + authorization: "Bearer test-token", + }); + const { res, getBody } = createResponse(); + await dispatchRequest(server, req, res); + + expect(res.statusCode).toBe(503); + expect(JSON.parse(getBody())).toEqual({ + ready: false, + failing: ["discord", "telegram"], + uptimeMs: 8_000, + }); + }, + }); + }); + + it("returns typed internal error payload when readiness evaluation throws", async () => { + const getReadiness: ReadinessChecker = () => { + throw new Error("boom"); + }; + + await withGatewayServer({ + prefix: "probe-throws", + resolvedAuth: AUTH_NONE, + overrides: { getReadiness }, + run: async (server) => { + const req = createRequest({ path: "/ready" }); + const { res, getBody } = createResponse(); + await dispatchRequest(server, req, res); + + expect(res.statusCode).toBe(503); + expect(JSON.parse(getBody())).toEqual({ ready: false, failing: ["internal"], uptimeMs: 0 }); + }, + }); + }); + + it("keeps /healthz shallow even when readiness checker reports failing channels", async () => { + const getReadiness: ReadinessChecker = () => ({ + ready: false, + failing: ["discord"], + uptimeMs: 999, + }); + + await withGatewayServer({ + prefix: "probe-healthz-unaffected", + resolvedAuth: AUTH_NONE, + overrides: { getReadiness }, + run: async (server) => { + const req = createRequest({ path: "/healthz" }); + const { res, getBody } = createResponse(); + await dispatchRequest(server, req, res); + + expect(res.statusCode).toBe(200); + expect(getBody()).toBe(JSON.stringify({ ok: true, status: "live" })); + }, + }); + }); + + it("reflects readiness status on HEAD /readyz without a response body", async () => { + const getReadiness: ReadinessChecker = () => ({ + ready: false, + failing: ["discord"], + uptimeMs: 5_000, + }); + + await withGatewayServer({ + prefix: "probe-readyz-head", + resolvedAuth: AUTH_NONE, + overrides: { getReadiness }, + run: async (server) => { + const req = createRequest({ path: "/readyz", method: "HEAD" }); + const { res, getBody } = createResponse(); + await dispatchRequest(server, req, res); + + expect(res.statusCode).toBe(503); + expect(getBody()).toBe(""); + }, + }); + }); +}); diff --git a/src/gateway/server-http.test-harness.ts b/src/gateway/server-http.test-harness.ts index bf963487038..24612d60b1f 100644 --- a/src/gateway/server-http.test-harness.ts +++ b/src/gateway/server-http.test-harness.ts @@ -28,11 +28,15 @@ export function createRequest(params: { path: string; authorization?: string; method?: string; + remoteAddress?: string; + host?: string; }): IncomingMessage { return createGatewayRequest({ path: params.path, authorization: params.authorization, method: params.method, + remoteAddress: params.remoteAddress, + host: params.host, }); } @@ -127,6 +131,8 @@ export async function sendRequest( path: string; authorization?: string; method?: string; + remoteAddress?: string; + host?: string; }, ): Promise> { const response = createResponse(); diff --git a/src/gateway/server-http.ts b/src/gateway/server-http.ts index 41911f35b49..612ce90dbba 100644 --- a/src/gateway/server-http.ts +++ b/src/gateway/server-http.ts @@ -20,7 +20,12 @@ import { normalizeRateLimitClientIp, type AuthRateLimiter, } from "./auth-rate-limit.js"; -import { type GatewayAuthResult, type ResolvedGatewayAuth } from "./auth.js"; +import { + authorizeHttpGatewayConnect, + isLocalDirectRequest, + type GatewayAuthResult, + type ResolvedGatewayAuth, +} from "./auth.js"; import { normalizeCanvasScopedUrl } from "./canvas-capability.js"; import { handleControlUiAvatarRequest, @@ -46,6 +51,7 @@ import { resolveHookDeliver, } from "./hooks.js"; import { sendGatewayAuthFailure, setDefaultSecurityHeaders } from "./http-common.js"; +import { getBearerToken } from "./http-utils.js"; import { handleOpenAiHttpRequest } from "./openai-http.js"; import { handleOpenResponsesHttpRequest } from "./openresponses-http.js"; import { @@ -59,6 +65,7 @@ import { type PluginHttpRequestHandler, type PluginRoutePathContext, } from "./server/plugins-http.js"; +import type { ReadinessChecker } from "./server/readiness.js"; import type { GatewayWsClient } from "./server/ws-types.js"; import { handleToolsInvokeHttpRequest } from "./tools-invoke-http.js"; @@ -150,11 +157,39 @@ function shouldEnforceDefaultPluginGatewayAuth(pathContext: PluginRoutePathConte ); } -function handleGatewayProbeRequest( +async function canRevealReadinessDetails(params: { + req: IncomingMessage; + resolvedAuth: ResolvedGatewayAuth; + trustedProxies: string[]; + allowRealIpFallback: boolean; +}): Promise { + if (isLocalDirectRequest(params.req, params.trustedProxies, params.allowRealIpFallback)) { + return true; + } + if (params.resolvedAuth.mode === "none") { + return false; + } + + const bearerToken = getBearerToken(params.req); + const authResult = await authorizeHttpGatewayConnect({ + auth: params.resolvedAuth, + connectAuth: bearerToken ? { token: bearerToken, password: bearerToken } : null, + req: params.req, + trustedProxies: params.trustedProxies, + allowRealIpFallback: params.allowRealIpFallback, + }); + return authResult.ok; +} + +async function handleGatewayProbeRequest( req: IncomingMessage, res: ServerResponse, requestPath: string, -): boolean { + resolvedAuth: ResolvedGatewayAuth, + trustedProxies: string[], + allowRealIpFallback: boolean, + getReadiness?: ReadinessChecker, +): Promise { const status = GATEWAY_PROBE_STATUS_BY_PATH.get(requestPath); if (!status) { return false; @@ -169,14 +204,34 @@ function handleGatewayProbeRequest( return true; } - res.statusCode = 200; res.setHeader("Content-Type", "application/json; charset=utf-8"); res.setHeader("Cache-Control", "no-store"); - if (method === "HEAD") { - res.end(); - return true; + + let statusCode: number; + let body: string; + if (status === "ready" && getReadiness) { + const includeDetails = await canRevealReadinessDetails({ + req, + resolvedAuth, + trustedProxies, + allowRealIpFallback, + }); + try { + const result = getReadiness(); + statusCode = result.ready ? 200 : 503; + body = JSON.stringify(includeDetails ? result : { ready: result.ready }); + } catch { + statusCode = 503; + body = JSON.stringify( + includeDetails ? { ready: false, failing: ["internal"], uptimeMs: 0 } : { ready: false }, + ); + } + } else { + statusCode = 200; + body = JSON.stringify({ ok: true, status }); } - res.end(JSON.stringify({ ok: true, status })); + res.statusCode = statusCode; + res.end(method === "HEAD" ? undefined : body); return true; } @@ -519,6 +574,7 @@ export function createGatewayHttpServer(opts: { resolvedAuth: ResolvedGatewayAuth; /** Optional rate limiter for auth brute-force protection. */ rateLimiter?: AuthRateLimiter; + getReadiness?: ReadinessChecker; tlsOptions?: TlsOptions; }): HttpServer { const { @@ -537,6 +593,7 @@ export function createGatewayHttpServer(opts: { shouldEnforcePluginGatewayAuth, resolvedAuth, rateLimiter, + getReadiness, } = opts; const httpServer: HttpServer = opts.tlsOptions ? createHttpsServer(opts.tlsOptions, (req, res) => { @@ -693,7 +750,16 @@ export function createGatewayHttpServer(opts: { requestStages.push({ name: "gateway-probes", - run: () => handleGatewayProbeRequest(req, res, requestPath), + run: () => + handleGatewayProbeRequest( + req, + res, + requestPath, + resolvedAuth, + trustedProxies, + allowRealIpFallback, + getReadiness, + ), }); if (await runGatewayHttpRequestStages(requestStages)) { diff --git a/src/gateway/server-runtime-state.ts b/src/gateway/server-runtime-state.ts index 9054b3a2a3f..5733f3671e4 100644 --- a/src/gateway/server-runtime-state.ts +++ b/src/gateway/server-runtime-state.ts @@ -32,6 +32,7 @@ import { shouldEnforceGatewayAuthForPluginPath, type PluginRoutePathContext, } from "./server/plugins-http.js"; +import type { ReadinessChecker } from "./server/readiness.js"; import type { GatewayTlsRuntime } from "./server/tls.js"; import type { GatewayWsClient } from "./server/ws-types.js"; @@ -61,6 +62,7 @@ export async function createGatewayRuntimeState(params: { log: { info: (msg: string) => void; warn: (msg: string) => void }; logHooks: ReturnType; logPlugins: ReturnType; + getReadiness?: ReadinessChecker; }): Promise<{ canvasHost: CanvasHostHandler | null; httpServer: HttpServer; @@ -156,6 +158,7 @@ export async function createGatewayRuntimeState(params: { shouldEnforcePluginGatewayAuth, resolvedAuth: params.resolvedAuth, rateLimiter: params.rateLimiter, + getReadiness: params.getReadiness, tlsOptions: params.gatewayTls?.enabled ? params.gatewayTls.tlsOptions : undefined, }); try { diff --git a/src/gateway/server.impl.ts b/src/gateway/server.impl.ts index efb95e7a7cf..e9c83156260 100644 --- a/src/gateway/server.impl.ts +++ b/src/gateway/server.impl.ts @@ -106,6 +106,7 @@ import { incrementPresenceVersion, refreshGatewayHealthSnapshot, } from "./server/health-state.js"; +import { createReadinessChecker } from "./server/readiness.js"; import { loadGatewayTlsRuntime } from "./server/tls.js"; import { ensureGatewayStartupAuth, @@ -546,6 +547,17 @@ export async function startGatewayServer( if (cfgAtStart.gateway?.tls?.enabled && !gatewayTls.enabled) { throw new Error(gatewayTls.error ?? "gateway tls: failed to enable"); } + const serverStartedAt = Date.now(); + const channelManager = createChannelManager({ + loadConfig, + channelLogs, + channelRuntimeEnvs, + channelRuntime: createPluginRuntime().channel, + }); + const getReadiness = createReadinessChecker({ + channelManager, + startedAt: serverStartedAt, + }); const { canvasHost, httpServer, @@ -589,6 +601,7 @@ export async function startGatewayServer( log, logHooks, logPlugins, + getReadiness, }); let bonjourStop: (() => Promise) | null = null; const nodeRegistry = new NodeRegistry(); @@ -618,12 +631,6 @@ export async function startGatewayServer( }); let { cron, storePath: cronStorePath } = cronState; - const channelManager = createChannelManager({ - loadConfig, - channelLogs, - channelRuntimeEnvs, - channelRuntime: createPluginRuntime().channel, - }); const { getRuntimeSnapshot, startChannels, startChannel, stopChannel, markChannelLoggedOut } = channelManager; diff --git a/src/gateway/server/readiness.test.ts b/src/gateway/server/readiness.test.ts new file mode 100644 index 00000000000..c41f8d050f2 --- /dev/null +++ b/src/gateway/server/readiness.test.ts @@ -0,0 +1,202 @@ +import { describe, expect, it, vi } from "vitest"; +import type { ChannelId } from "../../channels/plugins/index.js"; +import type { ChannelAccountSnapshot } from "../../channels/plugins/types.js"; +import type { ChannelManager, ChannelRuntimeSnapshot } from "../server-channels.js"; +import { createReadinessChecker } from "./readiness.js"; + +function snapshotWith( + accounts: Record>, +): ChannelRuntimeSnapshot { + const channels: ChannelRuntimeSnapshot["channels"] = {}; + const channelAccounts: ChannelRuntimeSnapshot["channelAccounts"] = {}; + + for (const [channelId, accountSnapshot] of Object.entries(accounts)) { + const resolved = { accountId: "default", ...accountSnapshot } as ChannelAccountSnapshot; + channels[channelId as ChannelId] = resolved; + channelAccounts[channelId as ChannelId] = { default: resolved }; + } + + return { channels, channelAccounts }; +} + +function createManager(snapshot: ChannelRuntimeSnapshot): ChannelManager { + return { + getRuntimeSnapshot: vi.fn(() => snapshot), + startChannels: vi.fn(), + startChannel: vi.fn(), + stopChannel: vi.fn(), + markChannelLoggedOut: vi.fn(), + isManuallyStopped: vi.fn(() => false), + resetRestartAttempts: vi.fn(), + }; +} + +describe("createReadinessChecker", () => { + it("reports ready when all managed channels are healthy", () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date("2026-03-06T12:00:00Z")); + const startedAt = Date.now() - 5 * 60_000; + const manager = createManager( + snapshotWith({ + discord: { + running: true, + connected: true, + enabled: true, + configured: true, + lastStartAt: startedAt, + lastEventAt: Date.now() - 1_000, + }, + }), + ); + + const readiness = createReadinessChecker({ channelManager: manager, startedAt }); + expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 300_000 }); + vi.useRealTimers(); + }); + + it("ignores disabled and unconfigured channels", () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date("2026-03-06T12:00:00Z")); + const startedAt = Date.now() - 5 * 60_000; + const manager = createManager( + snapshotWith({ + discord: { + running: false, + enabled: false, + configured: true, + lastStartAt: startedAt, + }, + telegram: { + running: false, + enabled: true, + configured: false, + lastStartAt: startedAt, + }, + }), + ); + + const readiness = createReadinessChecker({ channelManager: manager, startedAt }); + expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 300_000 }); + vi.useRealTimers(); + }); + + it("uses startup grace before marking disconnected channels not ready", () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date("2026-03-06T12:00:00Z")); + const startedAt = Date.now() - 30_000; + const manager = createManager( + snapshotWith({ + discord: { + running: true, + connected: false, + enabled: true, + configured: true, + lastStartAt: startedAt, + }, + }), + ); + + const readiness = createReadinessChecker({ channelManager: manager, startedAt }); + expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 30_000 }); + vi.useRealTimers(); + }); + + it("reports disconnected managed channels after startup grace", () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date("2026-03-06T12:00:00Z")); + const startedAt = Date.now() - 5 * 60_000; + const manager = createManager( + snapshotWith({ + discord: { + running: true, + connected: false, + enabled: true, + configured: true, + lastStartAt: startedAt, + }, + }), + ); + + const readiness = createReadinessChecker({ channelManager: manager, startedAt }); + expect(readiness()).toEqual({ ready: false, failing: ["discord"], uptimeMs: 300_000 }); + vi.useRealTimers(); + }); + + it("keeps restart-pending channels ready during reconnect backoff", () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date("2026-03-06T12:00:00Z")); + const startedAt = Date.now() - 5 * 60_000; + const manager = createManager( + snapshotWith({ + discord: { + running: false, + restartPending: true, + reconnectAttempts: 3, + enabled: true, + configured: true, + lastStartAt: startedAt - 30_000, + lastStopAt: Date.now() - 5_000, + }, + }), + ); + + const readiness = createReadinessChecker({ channelManager: manager, startedAt }); + expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 300_000 }); + vi.useRealTimers(); + }); + + it("treats stale-socket channels as ready to avoid pulling healthy idle pods", () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date("2026-03-06T12:00:00Z")); + const startedAt = Date.now() - 31 * 60_000; + const manager = createManager( + snapshotWith({ + discord: { + running: true, + connected: true, + enabled: true, + configured: true, + lastStartAt: startedAt, + lastEventAt: Date.now() - 31 * 60_000, + }, + }), + ); + + const readiness = createReadinessChecker({ channelManager: manager, startedAt }); + expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 1_860_000 }); + vi.useRealTimers(); + }); + + it("caches readiness snapshots briefly to keep repeated probes cheap", () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date("2026-03-06T12:00:00Z")); + const startedAt = Date.now() - 5 * 60_000; + const manager = createManager( + snapshotWith({ + discord: { + running: true, + connected: true, + enabled: true, + configured: true, + lastStartAt: startedAt, + lastEventAt: Date.now() - 1_000, + }, + }), + ); + + const readiness = createReadinessChecker({ + channelManager: manager, + startedAt, + cacheTtlMs: 1_000, + }); + expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 300_000 }); + vi.advanceTimersByTime(500); + expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 300_500 }); + expect(manager.getRuntimeSnapshot).toHaveBeenCalledTimes(1); + + vi.advanceTimersByTime(600); + expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 301_100 }); + expect(manager.getRuntimeSnapshot).toHaveBeenCalledTimes(2); + vi.useRealTimers(); + }); +}); diff --git a/src/gateway/server/readiness.ts b/src/gateway/server/readiness.ts new file mode 100644 index 00000000000..e6ad2d92afb --- /dev/null +++ b/src/gateway/server/readiness.ts @@ -0,0 +1,79 @@ +import type { ChannelAccountSnapshot } from "../../channels/plugins/types.js"; +import { + DEFAULT_CHANNEL_CONNECT_GRACE_MS, + DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS, + evaluateChannelHealth, + type ChannelHealthPolicy, + type ChannelHealthEvaluation, +} from "../channel-health-policy.js"; +import type { ChannelManager } from "../server-channels.js"; + +export type ReadinessResult = { + ready: boolean; + failing: string[]; + uptimeMs: number; +}; + +export type ReadinessChecker = () => ReadinessResult; + +const DEFAULT_READINESS_CACHE_TTL_MS = 1_000; + +function shouldIgnoreReadinessFailure( + accountSnapshot: ChannelAccountSnapshot, + health: ChannelHealthEvaluation, +): boolean { + if (health.reason === "unmanaged" || health.reason === "stale-socket") { + return true; + } + // Channel restarts spend time in backoff with running=false before the next + // lifecycle re-enters startup grace. Keep readiness green during that handoff + // window, but still surface hard failures once restart attempts are exhausted. + return health.reason === "not-running" && accountSnapshot.restartPending === true; +} + +export function createReadinessChecker(deps: { + channelManager: ChannelManager; + startedAt: number; + cacheTtlMs?: number; +}): ReadinessChecker { + const { channelManager, startedAt } = deps; + const cacheTtlMs = Math.max(0, deps.cacheTtlMs ?? DEFAULT_READINESS_CACHE_TTL_MS); + let cachedAt = 0; + let cachedState: Omit | null = null; + + return (): ReadinessResult => { + const now = Date.now(); + const uptimeMs = now - startedAt; + if (cachedState && now - cachedAt < cacheTtlMs) { + return { ...cachedState, uptimeMs }; + } + + const snapshot = channelManager.getRuntimeSnapshot(); + const failing: string[] = []; + const policy: ChannelHealthPolicy = { + now, + staleEventThresholdMs: DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS, + channelConnectGraceMs: DEFAULT_CHANNEL_CONNECT_GRACE_MS, + }; + + for (const [channelId, accounts] of Object.entries(snapshot.channelAccounts)) { + if (!accounts) { + continue; + } + for (const accountSnapshot of Object.values(accounts)) { + if (!accountSnapshot) { + continue; + } + const health = evaluateChannelHealth(accountSnapshot, policy); + if (!health.healthy && !shouldIgnoreReadinessFailure(accountSnapshot, health)) { + failing.push(channelId); + break; + } + } + } + + cachedAt = now; + cachedState = { ready: failing.length === 0, failing }; + return { ...cachedState, uptimeMs }; + }; +}