fix(gateway): keep cron startup after maintenance failure

This commit is contained in:
Vincent Koc
2026-05-03 21:32:55 -07:00
parent 32b4d1ec8a
commit 654b70dde8
4 changed files with 80 additions and 15 deletions

View File

@@ -47,6 +47,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Gateway/startup: start cron and record the post-ready memory trace even when deferred maintenance timers fail after readiness, so a non-fatal timer setup issue does not silently leave scheduled jobs idle. Thanks @vincentkoc.
- Agents/session status: keep semantic `session_status({ sessionKey: "current" })` on the live run session even before that run has a persisted session-store entry, instead of falling back to the sandbox policy key. Thanks @vincentkoc.
- QA/Slack: resolve bundled official plugin public-surface package aliases during source-mode QA runs, so release Slack live validation can load `@openclaw/slack/api.js` without workspace symlinks. Thanks @vincentkoc.
- Codex: pass the live run session key into app-server dynamic tools when sandbox policy uses a separate session key, so `session_status({ sessionKey: "current" })` reports the active run instead of the sandbox policy key. Thanks @vincentkoc.

View File

@@ -52,8 +52,11 @@ vi.mock("./model-pricing-cache.js", () => ({
startGatewayModelPricingRefresh: hoisted.startGatewayModelPricingRefresh,
}));
const { activateGatewayScheduledServices, startGatewayRuntimeServices } =
await import("./server-runtime-services.js");
const {
activateGatewayScheduledServices,
runGatewayPostReadyMaintenance,
startGatewayRuntimeServices,
} = await import("./server-runtime-services.js");
describe("server-runtime-services", () => {
beforeEach(() => {
@@ -217,6 +220,31 @@ describe("server-runtime-services", () => {
expect(hoisted.recoverPendingDeliveries).toHaveBeenCalledTimes(1);
});
// Regression guard: a rejected maintenance start must not block cron startup
// or the post-ready memory trace.
it("starts cron and records memory when post-ready maintenance fails", async () => {
  const log = createLog();
  const startCron = vi.fn(async () => undefined);
  const recordPostReadyMemory = vi.fn();
  // Simulates the maintenance phase rejecting after the gateway is ready.
  const failingMaintenance = vi.fn(async () => {
    throw new Error("timers unavailable");
  });

  await runGatewayPostReadyMaintenance({
    startMaintenance: failingMaintenance,
    applyMaintenance: vi.fn(),
    shouldStartCron: () => true,
    markCronStartHandled: vi.fn(),
    cron: { start: startCron },
    logCron: { error: vi.fn() },
    log,
    recordPostReadyMemory,
  });

  // The failure is reported as a warning rather than propagated.
  expect(log.warn).toHaveBeenCalledWith(
    "gateway post-ready maintenance startup failed: Error: timers unavailable",
  );
  // The later phases still ran exactly once each.
  expect(startCron).toHaveBeenCalledTimes(1);
  expect(recordPostReadyMemory).toHaveBeenCalledTimes(1);
});
it("keeps scheduled services disabled for minimal test gateways", () => {
const cron = { start: vi.fn(async () => undefined) };
@@ -247,6 +275,7 @@ function createLog() {
warn: vi.fn(),
error: vi.fn(),
})),
warn: vi.fn(),
error: vi.fn(),
};
}

View File

@@ -5,6 +5,7 @@ import type { PluginMetadataRegistryView } from "../plugins/plugin-metadata-snap
import type { ChannelHealthMonitor } from "./channel-health-monitor.js";
import { startChannelHealthMonitor } from "./channel-health-monitor.js";
import { isGatewayModelPricingEnabled } from "./model-pricing-config.js";
import type { startGatewayMaintenanceTimers } from "./server-maintenance.js";
type GatewayRuntimeServiceLogger = {
child: (name: string) => {
@@ -14,6 +15,12 @@ type GatewayRuntimeServiceLogger = {
};
error: (message: string) => void;
};
/** Logger surface required by `runGatewayPostReadyMaintenance`; only `warn` is needed. */
type GatewayPostReadyLogger = {
warn: (message: string) => void;
};
/**
 * Non-null resolved value of `startGatewayMaintenanceTimers`, derived from that
 * function's signature so the alias follows its return type automatically.
 */
type GatewayMaintenanceHandles = NonNullable<
Awaited<ReturnType<typeof startGatewayMaintenanceTimers>>
>;
export type GatewayChannelManager = Parameters<
typeof startChannelHealthMonitor
@@ -53,6 +60,34 @@ export function startGatewayCronWithLogging(params: {
void params.cron.start().catch((err) => params.logCron.error(`failed to start: ${String(err)}`));
}
/**
 * Runs the deferred post-ready startup sequence: maintenance, cron, memory trace.
 *
 * Each phase is guarded independently, so a failure in one phase is logged and
 * the following phases still run. Because every phase is guarded, the returned
 * promise does not reject (unless a logger itself throws), which makes it safe
 * for callers to fire-and-forget with `void`.
 *
 * @param params.startMaintenance - Starts maintenance; resolves to handles, or `null` when there is nothing to apply.
 * @param params.applyMaintenance - Receives the handles when `startMaintenance` resolves to a non-null value.
 * @param params.shouldStartCron - Consulted after the maintenance phase; cron is started only when it returns `true`.
 * @param params.markCronStartHandled - Invoked immediately before cron is started.
 * @param params.cron - Scheduler started through `startGatewayCronWithLogging`.
 * @param params.logCron - Receives cron start failures.
 * @param params.log - Receives warnings for maintenance and memory-trace failures.
 * @param params.recordPostReadyMemory - Invoked last, regardless of earlier failures.
 * @returns A promise that resolves once every phase has been attempted.
 */
export async function runGatewayPostReadyMaintenance(params: {
  startMaintenance: () => Promise<GatewayMaintenanceHandles | null>;
  applyMaintenance: (maintenance: GatewayMaintenanceHandles) => void;
  shouldStartCron: () => boolean;
  markCronStartHandled: () => void;
  cron: { start: () => Promise<void> };
  logCron: { error: (message: string) => void };
  log: GatewayPostReadyLogger;
  recordPostReadyMemory: () => void;
}): Promise<void> {
  // Phase 1: maintenance. Failures are non-fatal and must not block cron.
  try {
    const maintenance = await params.startMaintenance();
    if (maintenance) {
      params.applyMaintenance(maintenance);
    }
  } catch (err) {
    params.log.warn(`gateway post-ready maintenance startup failed: ${String(err)}`);
  }

  // Phase 2: cron. Asynchronous start failures are already logged by
  // startGatewayCronWithLogging; this guard covers synchronous throws so they
  // neither skip the memory trace nor surface as an unhandled rejection.
  try {
    if (params.shouldStartCron()) {
      params.markCronStartHandled();
      startGatewayCronWithLogging({
        cron: params.cron,
        logCron: params.logCron,
      });
    }
  } catch (err) {
    params.logCron.error(`failed to start: ${String(err)}`);
  }

  // Phase 3: memory trace. Diagnostic only, so a failure is downgraded to a warning.
  try {
    params.recordPostReadyMemory();
  } catch (err) {
    params.log.warn(`gateway post-ready memory trace failed: ${String(err)}`);
  }
}
function recoverPendingOutboundDeliveries(params: {
cfg: OpenClawConfig;
log: GatewayRuntimeServiceLogger;

View File

@@ -1491,24 +1491,24 @@ export async function startGatewayServer(
});
if (!minimalTestGateway) {
const handle = setTimeout(() => {
void (async () => {
const maintenance = await earlyRuntime.startMaintenance();
if (maintenance) {
void gatewayRuntimeServices.runGatewayPostReadyMaintenance({
startMaintenance: earlyRuntime.startMaintenance,
applyMaintenance: (maintenance) => {
runtimeState.tickInterval = maintenance.tickInterval;
runtimeState.healthInterval = maintenance.healthInterval;
runtimeState.dedupeCleanup = maintenance.dedupeCleanup;
runtimeState.mediaCleanup = maintenance.mediaCleanup;
}
if (!gatewayCronStartHandled) {
},
shouldStartCron: () => !gatewayCronStartHandled,
markCronStartHandled: () => {
gatewayCronStartHandled = true;
gatewayRuntimeServices.startGatewayCronWithLogging({
cron: runtimeState.cronState.cron,
logCron,
});
}
startupTrace.detail("memory.post-ready", collectProcessMemoryUsageMb());
})().catch((err) => {
log.warn(`gateway post-ready maintenance startup failed: ${String(err)}`);
},
cron: runtimeState.cronState.cron,
logCron,
log,
recordPostReadyMemory: () => {
startupTrace.detail("memory.post-ready", collectProcessMemoryUsageMb());
},
});
}, POST_READY_MAINTENANCE_DELAY_MS);
handle.unref?.();