diff --git a/CHANGELOG.md b/CHANGELOG.md index 58731c16017..3b93f646ab9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ Docs: https://docs.openclaw.ai ### Fixes +- Gateway/startup: defer heartbeat, cron, and pending delivery recovery until sidecars finish so Sandbox wake and chat history startup gates cannot block channel resume. (#65365) Thanks @lml2468. + ## 2026.4.12-beta.1 ### Changes diff --git a/src/gateway/server-runtime-services.test.ts b/src/gateway/server-runtime-services.test.ts new file mode 100644 index 00000000000..6c61fc2c83b --- /dev/null +++ b/src/gateway/server-runtime-services.test.ts @@ -0,0 +1,126 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; + +const hoisted = vi.hoisted(() => { + const heartbeatRunner = { + stop: vi.fn(), + updateConfig: vi.fn(), + }; + return { + heartbeatRunner, + startHeartbeatRunner: vi.fn(() => heartbeatRunner), + startChannelHealthMonitor: vi.fn(() => ({ stop: vi.fn() })), + startGatewayModelPricingRefresh: vi.fn(() => vi.fn()), + recoverPendingDeliveries: vi.fn(async () => undefined), + deliverOutboundPayloads: vi.fn(), + }; +}); + +vi.mock("../infra/heartbeat-runner.js", () => ({ + startHeartbeatRunner: hoisted.startHeartbeatRunner, +})); + +vi.mock("../infra/outbound/deliver.js", () => ({ + deliverOutboundPayloads: hoisted.deliverOutboundPayloads, +})); + +vi.mock("../infra/outbound/delivery-queue.js", () => ({ + recoverPendingDeliveries: hoisted.recoverPendingDeliveries, +})); + +vi.mock("./channel-health-monitor.js", () => ({ + startChannelHealthMonitor: hoisted.startChannelHealthMonitor, +})); + +vi.mock("./model-pricing-cache.js", () => ({ + startGatewayModelPricingRefresh: hoisted.startGatewayModelPricingRefresh, +})); + +const { activateGatewayScheduledServices, startGatewayRuntimeServices } = + await import("./server-runtime-services.js"); + +describe("server-runtime-services", () => { + beforeEach(() => { + hoisted.heartbeatRunner.stop.mockClear(); + hoisted.heartbeatRunner.updateConfig.mockClear(); + hoisted.startHeartbeatRunner.mockClear(); + hoisted.startChannelHealthMonitor.mockClear(); + hoisted.startGatewayModelPricingRefresh.mockClear(); + hoisted.recoverPendingDeliveries.mockClear(); + hoisted.deliverOutboundPayloads.mockClear(); + }); + + it("keeps scheduled services inert during initial runtime setup", () => { + const services = startGatewayRuntimeServices({ + minimalTestGateway: false, + cfgAtStart: {} as never, + channelManager: { + getRuntimeSnapshot: vi.fn(), + isHealthMonitorEnabled: vi.fn(), + isManuallyStopped: vi.fn(), + } as never, + log: createLog(), + }); + + expect(hoisted.startChannelHealthMonitor).toHaveBeenCalledTimes(1); + expect(hoisted.startHeartbeatRunner).not.toHaveBeenCalled(); + expect(hoisted.recoverPendingDeliveries).not.toHaveBeenCalled(); + + services.heartbeatRunner.stop(); + expect(hoisted.heartbeatRunner.stop).not.toHaveBeenCalled(); + }); + + it("activates heartbeat, cron, and delivery recovery after sidecars are ready", async () => { + const cron = { start: vi.fn(async () => undefined) }; + const log = createLog(); + + const services = activateGatewayScheduledServices({ + minimalTestGateway: false, + cfgAtStart: {} as never, + cron, + logCron: { error: vi.fn() }, + log, + }); + + expect(hoisted.startHeartbeatRunner).toHaveBeenCalledTimes(1); + expect(cron.start).toHaveBeenCalledTimes(1); + expect(services.heartbeatRunner).toBe(hoisted.heartbeatRunner); + await vi.waitFor(() => { + expect(hoisted.recoverPendingDeliveries).toHaveBeenCalledWith( + expect.objectContaining({ + deliver: hoisted.deliverOutboundPayloads, + cfg: {}, + }), + ); + }); + }); + + it("keeps scheduled services disabled for minimal test gateways", () => { + const cron = { start: vi.fn(async () => undefined) }; + + const services = activateGatewayScheduledServices({ + minimalTestGateway: true, + cfgAtStart: {} as never, + cron, + logCron: { error: vi.fn() }, + log: createLog(), + }); + + expect(hoisted.startHeartbeatRunner).not.toHaveBeenCalled(); + expect(cron.start).not.toHaveBeenCalled(); + expect(hoisted.recoverPendingDeliveries).not.toHaveBeenCalled(); + + services.heartbeatRunner.stop(); + expect(hoisted.heartbeatRunner.stop).not.toHaveBeenCalled(); + }); +}); + +function createLog() { + return { + child: vi.fn(() => ({ + info: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + })), + error: vi.fn(), + }; +} diff --git a/src/gateway/server-runtime-services.ts b/src/gateway/server-runtime-services.ts index 5da78182f0f..0b79f3e02fc 100644 --- a/src/gateway/server-runtime-services.ts +++ b/src/gateway/server-runtime-services.ts @@ -77,9 +77,7 @@ export function startGatewayRuntimeServices(params: { channelHealthMonitor: ChannelHealthMonitor | null; stopModelPricingRefresh: () => void; } { - // Return a noop heartbeat runner for now. The real runner is created - // in activateGatewayScheduledServices() after sidecars finish and - // chat.history becomes available. See #65322. + // Keep scheduled work inert until post-attach sidecars finish. const channelHealthMonitor = startGatewayChannelHealthMonitor({ cfg: params.cfgAtStart, channelManager: params.channelManager, @@ -95,25 +93,9 @@ export function startGatewayRuntimeServices(params: { }; } -/** - * Activate cron scheduler and pending delivery recovery AFTER gateway - * sidecars are fully started and chat.history is available. - * - * Previously these ran inside startGatewayRuntimeServices(), which - * fires before sidecars finish — creating a race where cron/heartbeat - * jobs could call chat.history while it was still marked unavailable. - * See: https://github.com/openclaw/openclaw/issues/65322 - */ /** * Activate cron scheduler, heartbeat runner, and pending delivery recovery - * AFTER gateway sidecars are fully started and chat.history is available. - * - * Previously these ran inside startGatewayRuntimeServices(), which fires - * before sidecars finish — creating a race where cron/heartbeat jobs - * could call chat.history while it was still marked unavailable. - * See: https://github.com/openclaw/openclaw/issues/65322 - * - * Returns the real heartbeat runner so the caller can update runtimeState. + * after gateway sidecars are fully started and chat.history is available. */ export function activateGatewayScheduledServices(params: { minimalTestGateway: boolean;