From 92776b8d77c691e320dbb8afba07dc534ebaa454 Mon Sep 17 00:00:00 2001 From: limenglin Date: Sun, 12 Apr 2026 21:35:29 +0800 Subject: [PATCH] fix(gateway): defer cron AND heartbeat activation until sidecars are ready (#65322) startGatewayRuntimeServices() previously started both the cron scheduler AND heartbeat runner BEFORE gateway sidecars finished initialising. Because chat.history is marked unavailable until sidecars complete, any cron job or heartbeat tick that called chat.history during this window received a hard UNAVAILABLE error. Fix: create a noop heartbeat placeholder in the early startGatewayRuntimeServices() call, then activate the real heartbeat runner, cron scheduler, and pending delivery recovery in a new activateGatewayScheduledServices() function that runs AFTER startGatewayPostAttachRuntime() completes. channelHealthMonitor and model pricing refresh remain in the early call since they do not depend on chat.history. Root cause analysis by luban, cross-validated by tongluo. Reviewer feedback addressed: heartbeat runner is now also deferred (previously only cron was deferred). --- src/gateway/server-runtime-services.ts | 63 +++++++++++++++++++------- src/gateway/server.impl.ts | 20 ++++++-- 2 files changed, 63 insertions(+), 20 deletions(-) diff --git a/src/gateway/server-runtime-services.ts b/src/gateway/server-runtime-services.ts index 39376b6c696..5da78182f0f 100644 --- a/src/gateway/server-runtime-services.ts +++ b/src/gateway/server-runtime-services.ts @@ -71,35 +71,22 @@ export function startGatewayRuntimeServices(params: { minimalTestGateway: boolean; cfgAtStart: OpenClawConfig; channelManager: GatewayChannelManager; - cron: { start: () => Promise<void> }; - logCron: { error: (message: string) => void }; log: GatewayRuntimeServiceLogger; }): { heartbeatRunner: HeartbeatRunner; channelHealthMonitor: ChannelHealthMonitor | null; stopModelPricingRefresh: () => void; } { - const heartbeatRunner = params.minimalTestGateway - ? 
createNoopHeartbeatRunner() - : startHeartbeatRunner({ cfg: params.cfgAtStart }); + // Return a noop heartbeat runner for now. The real runner is created + // in activateGatewayScheduledServices() after sidecars finish and + // chat.history becomes available. See #65322. const channelHealthMonitor = startGatewayChannelHealthMonitor({ cfg: params.cfgAtStart, channelManager: params.channelManager, }); - if (!params.minimalTestGateway) { - startGatewayCronWithLogging({ - cron: params.cron, - logCron: params.logCron, - }); - recoverPendingOutboundDeliveries({ - cfg: params.cfgAtStart, - log: params.log, - }); - } - return { - heartbeatRunner, + heartbeatRunner: createNoopHeartbeatRunner(), channelHealthMonitor, stopModelPricingRefresh: !params.minimalTestGateway && process.env.VITEST !== "1" @@ -107,3 +94,36 @@ export function startGatewayRuntimeServices(params: { : () => {}, }; } + +/** + * Activate cron scheduler, heartbeat runner, and pending delivery recovery + * AFTER gateway sidecars are fully started and chat.history is available. + * + * Previously these ran inside startGatewayRuntimeServices(), which fires + * before sidecars finish — creating a race where cron/heartbeat jobs + * could call chat.history while it was still marked unavailable. + * See: https://github.com/openclaw/openclaw/issues/65322 + * + * Returns the real heartbeat runner so the caller can update runtimeState. 
+ */ +export function activateGatewayScheduledServices(params: { + minimalTestGateway: boolean; + cfgAtStart: OpenClawConfig; + cron: { start: () => Promise<void> }; + logCron: { error: (message: string) => void }; + log: GatewayRuntimeServiceLogger; +}): { heartbeatRunner: HeartbeatRunner } { + if (params.minimalTestGateway) { + return { heartbeatRunner: createNoopHeartbeatRunner() }; + } + const heartbeatRunner = startHeartbeatRunner({ cfg: params.cfgAtStart }); + startGatewayCronWithLogging({ + cron: params.cron, + logCron: params.logCron, + }); + recoverPendingOutboundDeliveries({ + cfg: params.cfgAtStart, + log: params.log, + }); + return { heartbeatRunner }; +} diff --git a/src/gateway/server.impl.ts b/src/gateway/server.impl.ts index 528e4577f10..29901b2f5ed 100644 --- a/src/gateway/server.impl.ts +++ b/src/gateway/server.impl.ts @@ -56,7 +56,10 @@ import { setFallbackGatewayContextResolver } from "./server-plugins.js"; import { startManagedGatewayConfigReloader } from "./server-reload-handlers.js"; import { createGatewayRequestContext } from "./server-request-context.js"; import { resolveGatewayRuntimeConfig } from "./server-runtime-config.js"; -import { startGatewayRuntimeServices } from "./server-runtime-services.js"; +import { + activateGatewayScheduledServices, + startGatewayRuntimeServices, +} from "./server-runtime-services.js"; import { createGatewayRuntimeState } from "./server-runtime-state.js"; import { startGatewayEventSubscriptions } from "./server-runtime-subscriptions.js"; import { resolveSessionKeyForRun } from "./server-session-key.js"; @@ -608,8 +611,6 @@ export async function startGatewayServer( minimalTestGateway, cfgAtStart, channelManager, - cron: runtimeState.cronState.cron, - logCron, log, }), ); @@ -755,6 +756,19 @@ export async function startGatewayServer( unavailableGatewayMethods, })); + // Activate cron scheduler, heartbeat runner, and pending delivery + // recovery now that sidecars are ready and chat.history is available. 
+ // Previously these ran before sidecars finished, causing a race. + // See #65322. + const activated = activateGatewayScheduledServices({ + minimalTestGateway, + cfgAtStart, + cron: runtimeState.cronState.cron, + logCron, + log, + }); + runtimeState.heartbeatRunner = activated.heartbeatRunner; + runtimeState.configReloader = startManagedGatewayConfigReloader({ minimalTestGateway, initialConfig: cfgAtStart,