fix(gateway): defer cron AND heartbeat activation until sidecars are ready (#65322)

startGatewayRuntimeServices() previously started both the cron
scheduler AND heartbeat runner BEFORE gateway sidecars finished
initialising.  Because chat.history is marked unavailable until
sidecars complete, any cron job or heartbeat tick that called
chat.history during this window received a hard UNAVAILABLE error.

Fix: create a noop heartbeat placeholder in the early
startGatewayRuntimeServices() call, then activate the real
heartbeat runner, cron scheduler, and pending delivery recovery
in a new activateGatewayScheduledServices() function that runs
AFTER startGatewayPostAttachRuntime() completes.

channelHealthMonitor and model pricing refresh remain in the
early call since they do not depend on chat.history.

Root cause analysis by luban, cross-validated by tongluo.
Reviewer feedback addressed: heartbeat runner is now also
deferred (previously only cron was deferred).
This commit is contained in:
limenglin
2026-04-12 21:35:29 +08:00
committed by Peter Steinberger
parent 03d042d2b9
commit 92776b8d77
2 changed files with 63 additions and 20 deletions

View File

@@ -71,35 +71,22 @@ export function startGatewayRuntimeServices(params: {
minimalTestGateway: boolean;
cfgAtStart: OpenClawConfig;
channelManager: GatewayChannelManager;
cron: { start: () => Promise<void> };
logCron: { error: (message: string) => void };
log: GatewayRuntimeServiceLogger;
}): {
heartbeatRunner: HeartbeatRunner;
channelHealthMonitor: ChannelHealthMonitor | null;
stopModelPricingRefresh: () => void;
} {
const heartbeatRunner = params.minimalTestGateway
? createNoopHeartbeatRunner()
: startHeartbeatRunner({ cfg: params.cfgAtStart });
// Return a noop heartbeat runner for now. The real runner is created
// in activateGatewayScheduledServices() after sidecars finish and
// chat.history becomes available. See #65322.
const channelHealthMonitor = startGatewayChannelHealthMonitor({
cfg: params.cfgAtStart,
channelManager: params.channelManager,
});
if (!params.minimalTestGateway) {
startGatewayCronWithLogging({
cron: params.cron,
logCron: params.logCron,
});
recoverPendingOutboundDeliveries({
cfg: params.cfgAtStart,
log: params.log,
});
}
return {
heartbeatRunner,
heartbeatRunner: createNoopHeartbeatRunner(),
channelHealthMonitor,
stopModelPricingRefresh:
!params.minimalTestGateway && process.env.VITEST !== "1"
@@ -107,3 +94,45 @@ export function startGatewayRuntimeServices(params: {
: () => {},
};
}
/**
 * Starts the gateway's scheduled services: the real heartbeat runner, the
 * cron scheduler, and recovery of pending outbound deliveries.
 *
 * Meant to be invoked only AFTER gateway sidecars are fully started and
 * chat.history is available. These services used to be launched from
 * startGatewayRuntimeServices(), which fires before sidecars finish; that
 * left a window in which a cron job or heartbeat tick could call
 * chat.history while it was still marked unavailable.
 * See: https://github.com/openclaw/openclaw/issues/65322
 *
 * @param params.minimalTestGateway - When true, nothing is started and a
 *   noop heartbeat runner is returned.
 * @param params.cfgAtStart - Config handed to the heartbeat runner and to
 *   pending-delivery recovery.
 * @param params.cron - Cron handle forwarded to startGatewayCronWithLogging().
 * @param params.logCron - Logger forwarded to startGatewayCronWithLogging().
 * @param params.log - Logger forwarded to recoverPendingOutboundDeliveries().
 * @returns The heartbeat runner, so the caller can replace the placeholder
 *   it stored on runtimeState.
 */
export function activateGatewayScheduledServices(params: {
  minimalTestGateway: boolean;
  cfgAtStart: OpenClawConfig;
  cron: { start: () => Promise<void> };
  logCron: { error: (message: string) => void };
  log: GatewayRuntimeServiceLogger;
}): { heartbeatRunner: HeartbeatRunner } {
  if (!params.minimalTestGateway) {
    const { cfgAtStart, cron, logCron, log } = params;

    // Order is kept as heartbeat -> cron -> delivery recovery.
    const liveRunner = startHeartbeatRunner({ cfg: cfgAtStart });
    startGatewayCronWithLogging({ cron, logCron });
    recoverPendingOutboundDeliveries({ cfg: cfgAtStart, log });

    return { heartbeatRunner: liveRunner };
  }

  // Minimal test gateways never run scheduled work; hand back a noop runner.
  return { heartbeatRunner: createNoopHeartbeatRunner() };
}

View File

@@ -56,7 +56,10 @@ import { setFallbackGatewayContextResolver } from "./server-plugins.js";
import { startManagedGatewayConfigReloader } from "./server-reload-handlers.js";
import { createGatewayRequestContext } from "./server-request-context.js";
import { resolveGatewayRuntimeConfig } from "./server-runtime-config.js";
import { startGatewayRuntimeServices } from "./server-runtime-services.js";
import {
activateGatewayScheduledServices,
startGatewayRuntimeServices,
} from "./server-runtime-services.js";
import { createGatewayRuntimeState } from "./server-runtime-state.js";
import { startGatewayEventSubscriptions } from "./server-runtime-subscriptions.js";
import { resolveSessionKeyForRun } from "./server-session-key.js";
@@ -608,8 +611,6 @@ export async function startGatewayServer(
minimalTestGateway,
cfgAtStart,
channelManager,
cron: runtimeState.cronState.cron,
logCron,
log,
}),
);
@@ -755,6 +756,19 @@ export async function startGatewayServer(
unavailableGatewayMethods,
}));
// Activate cron scheduler, heartbeat runner, and pending delivery
// recovery now that sidecars are ready and chat.history is available.
// Previously these ran before sidecars finished, causing a race.
// See #65322.
const activated = activateGatewayScheduledServices({
minimalTestGateway,
cfgAtStart,
cron: runtimeState.cronState.cron,
logCron,
log,
});
runtimeState.heartbeatRunner = activated.heartbeatRunner;
runtimeState.configReloader = startManagedGatewayConfigReloader({
minimalTestGateway,
initialConfig: cfgAtStart,