fix(gateway): keep startup sidecars responsive

This commit is contained in:
Peter Steinberger
2026-04-27 22:44:16 +01:00
parent 75c03b28e0
commit 43ababf96b
10 changed files with 172 additions and 167 deletions

View File

@@ -18,6 +18,7 @@ Docs: https://docs.openclaw.ai
- Channels/commands: make generated `/dock-*` commands switch the active session reply route through `session.identityLinks` instead of falling through to normal chat. Fixes #69206; carries forward #73033. Thanks @clawbones and @michaelatamuk.
- Providers/Cloudflare AI Gateway: strip assistant prefill turns from Anthropic Messages payloads when thinking is enabled, so Claude requests through Cloudflare AI Gateway no longer fail Anthropic conversation-ending validation. Fixes #72905; carries forward #73005. Thanks @AaronFaby and @sahilsatralkar.
- Gateway/startup: keep primary-model startup prewarm on scoped metadata preparation, let native approval bootstraps retry outside channel startup, and skip the global hook runner when no `gateway_start` hook is registered, so clean post-ready sidecar work stays off the critical path. Refs #72846. Thanks @RayWoo, @livekm0309, and @mrz1836.
- Gateway/startup: scope primary-model provider discovery during channel prewarm to the configured provider owner and add split startup trace timings, so boot avoids staging unrelated bundled provider dependencies while setup discovery remains broad. Fixes #73002. Thanks @Schnup03.
- Channels/Microsoft Teams: unwrap staged CommonJS JWT runtime dependencies before Bot Connector token validation so inbound Teams messages no longer 401 after the bundled runtime-deps move. Fixes #73026. Thanks @kbrown10000.
- Channels/sessions: prevent guarded inbound session recording from creating route-only phantom sessions while still allowing last-route updates for sessions that already exist. Carries forward #73009. Thanks @jzakirov.

View File

@@ -188,9 +188,25 @@ describe("server-channels approval bootstrap", () => {
).toBeUndefined();
});
it("keeps the account stopped when approval bootstrap startup fails", async () => {
it("continues account startup when approval bootstrap startup fails", async () => {
const channelRuntime = createRuntimeChannel();
const startAccount = vi.fn(async () => {});
const stopped = createDeferred();
const startAccount = vi.fn(
async ({
abortSignal,
}: Parameters<NonNullable<NonNullable<ChannelPlugin["gateway"]>["startAccount"]>>[0]) => {
await new Promise<void>((resolve) => {
abortSignal.addEventListener(
"abort",
() => {
stopped.resolve();
resolve();
},
{ once: true },
);
});
},
);
hoisted.startChannelApprovalHandlerBootstrap.mockRejectedValue(new Error("boom"));
installTestRegistry(createTestPlugin({ startAccount }));
@@ -198,16 +214,19 @@ describe("server-channels approval bootstrap", () => {
await manager.startChannels();
expect(startAccount).not.toHaveBeenCalled();
expect(startAccount).toHaveBeenCalledTimes(1);
const accountSnapshot =
manager.getRuntimeSnapshot().channelAccounts.discord?.[DEFAULT_ACCOUNT_ID];
expect(accountSnapshot).toEqual(
expect.objectContaining({
accountId: DEFAULT_ACCOUNT_ID,
running: false,
running: true,
restartPending: false,
lastError: "boom",
lastError: null,
}),
);
await manager.stopChannel("discord", DEFAULT_ACCOUNT_ID);
await stopped.promise;
});
});

View File

@@ -119,6 +119,7 @@ function createManager(options?: {
resolveChannelRuntime?: () => PluginRuntime["channel"] | Promise<PluginRuntime["channel"]>;
getRuntimeConfig?: () => Record<string, unknown>;
channelIds?: ChannelId[];
startupTrace?: { measure: <T>(name: string, run: () => T | Promise<T>) => Promise<T> };
}) {
const log = createSubsystemLogger("gateway/server-channels-test");
const channelLogs = { discord: log } as Record<ChannelId, SubsystemLogger>;
@@ -137,6 +138,7 @@ function createManager(options?: {
...(options?.resolveChannelRuntime
? { resolveChannelRuntime: options.resolveChannelRuntime }
: {}),
...(options?.startupTrace ? { startupTrace: options.startupTrace } : {}),
});
}
@@ -456,6 +458,30 @@ describe("server-channels auto restart", () => {
expect(succeedingStart).toHaveBeenCalledTimes(1);
});
it("emits startup trace spans for channel preflight and handoff", async () => {
  // Pass-through recorder: captures every span name, then runs the wrapped work unchanged.
  const measureMock = vi.fn(async (spanName: string, work: () => unknown) => await work());
  const startupTrace = {
    async measure<T>(name: string, run: () => T | Promise<T>): Promise<T> {
      return (await measureMock(name, run)) as T;
    },
  };
  const startAccount = vi.fn(async () => {});
  installTestRegistry(createTestPlugin({ startAccount }));

  const manager = createManager({ startupTrace });
  await manager.startChannels();

  const recordedSpans = measureMock.mock.calls.map((call) => call[0]);
  const requiredSpans = [
    "channels.discord.start",
    "channels.discord.list-accounts",
    "channels.discord.runtime",
    "channels.discord.approval-bootstrap",
  ];
  // arrayContaining only requires these spans to be present; extra spans and ordering are not asserted.
  expect(recordedSpans).toEqual(expect.arrayContaining(requiredSpans));
});
it("evicts stale account lifecycle state during whole-channel reload", async () => {
let accountIds = [DEFAULT_ACCOUNT_ID];
const startAccount = vi.fn(

View File

@@ -47,6 +47,10 @@ type ChannelHealthMonitorConfig = HealthMonitorConfig & {
accounts?: Record<string, HealthMonitorConfig>;
};
/**
 * Minimal startup-tracing surface accepted through the channel manager options.
 *
 * `measure` is given a span name plus the (sync or async) work to run and
 * returns a promise for that work's result.
 */
type GatewayStartupTrace = {
measure: <T>(name: string, run: () => T | Promise<T>) => Promise<T>;
};
function createRuntimeStore(): ChannelRuntimeStore {
return {
aborts: new Map(),
@@ -161,6 +165,7 @@ type ChannelManagerOptions = {
* `createPluginRuntime().channel` surface.
*/
resolveChannelRuntime?: () => ChannelRuntimeSurface | Promise<ChannelRuntimeSurface>;
startupTrace?: GatewayStartupTrace;
};
type StartChannelOptions = {
@@ -187,6 +192,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
channelRuntimeEnvs,
channelRuntime,
resolveChannelRuntime,
startupTrace,
} = opts;
const channelStores = new Map<ChannelId, ChannelRuntimeStore>();
@@ -286,6 +292,9 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
const getChannelRuntime = async (): Promise<ChannelRuntimeSurface | undefined> => {
return channelRuntime ?? (await resolveChannelRuntime?.());
};
// Runs `run` inside the startup trace span `name` when a trace was supplied;
// without a trace the work is simply executed and its result returned.
const measureStartup = async <T>(name: string, run: () => T | Promise<T>): Promise<T> => {
  if (!startupTrace) {
    return await run();
  }
  return startupTrace.measure(name, run);
};
const evictStaleChannelAccountState = (
channelId: ChannelId,
@@ -322,7 +331,11 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
const cfg = getRuntimeConfig();
resetDirectoryCache({ channel: channelId, accountId });
const store = getStore(channelId);
const accountIds = accountId ? [accountId] : plugin.config.listAccountIds(cfg);
const accountIds = accountId
? [accountId]
: await measureStartup(`channels.${channelId}.list-accounts`, () =>
plugin.config.listAccountIds(cfg),
);
if (!accountId) {
evictStaleChannelAccountState(channelId, store, accountIds);
}
@@ -391,7 +404,9 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
let configured = true;
if (plugin.config.isConfigured) {
configured = await plugin.config.isConfigured(account, cfg);
configured = await measureStartup(`channels.${channelId}.is-configured`, () =>
plugin.config.isConfigured!(account, cfg),
);
}
if (!configured) {
setRuntime(channelId, id, {
@@ -420,21 +435,31 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
return;
}
scopedChannelRuntime = createTaskScopedChannelRuntime({
channelRuntime: await getChannelRuntime(),
});
scopedChannelRuntime = await measureStartup(`channels.${channelId}.runtime`, async () =>
createTaskScopedChannelRuntime({
channelRuntime: await getChannelRuntime(),
}),
);
channelRuntimeForTask = scopedChannelRuntime.channelRuntime;
if (!preserveRestartAttempts) {
restartAttempts.delete(rKey);
}
stopApprovalBootstrap = await startChannelApprovalHandlerBootstrap({
plugin,
cfg,
accountId: id,
channelRuntime: channelRuntimeForTask,
logger: log,
});
try {
stopApprovalBootstrap = await measureStartup(
`channels.${channelId}.approval-bootstrap`,
() =>
startChannelApprovalHandlerBootstrap({
plugin,
cfg,
accountId: id,
channelRuntime: channelRuntimeForTask,
logger: log,
}),
);
} catch (error) {
log.error?.(`[${id}] native approval bootstrap failed: ${formatErrorMessage(error)}`);
}
setRuntime(channelId, id, {
accountId: id,
enabled: true,
@@ -446,17 +471,19 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
reconnectAttempts: preserveRestartAttempts ? (restartAttempts.get(rKey) ?? 0) : 0,
});
const task = Promise.resolve().then(() =>
startAccount({
cfg,
accountId: id,
account,
runtime: channelRuntimeEnvs[channelId],
abortSignal: abort.signal,
log,
getStatus: () => getRuntime(channelId, id),
setStatus: (next) => setRuntime(channelId, id, next),
...(channelRuntimeForTask ? { channelRuntime: channelRuntimeForTask } : {}),
}),
measureStartup(`channels.${channelId}.start-account`, () =>
startAccount({
cfg,
accountId: id,
account,
runtime: channelRuntimeEnvs[channelId],
abortSignal: abort.signal,
log,
getStatus: () => getRuntime(channelId, id),
setStatus: (next) => setRuntime(channelId, id, next),
...(channelRuntimeForTask ? { channelRuntime: channelRuntimeForTask } : {}),
}),
),
);
const trackedPromise = task
.catch((err) => {
@@ -636,7 +663,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
return;
}
try {
await startChannel(plugin.id);
await measureStartup(`channels.${plugin.id}.start`, () => startChannel(plugin.id));
} catch (err) {
channelLogs[plugin.id]?.error?.(
`[${plugin.id}] channel startup failed: ${formatErrorMessage(err)}`,

View File

@@ -391,6 +391,10 @@ describe("startGatewayPostAttachRuntime", () => {
hooks: { internal: { enabled: false } },
plugins: { entries: { demo: { enabled: true } } },
} as never,
pluginRegistry: {
...createPostAttachParams().pluginRegistry,
typedHooks: [{ hookName: "gateway_start" }],
} as never,
deps: { cron: initialCron } as never,
});
@@ -428,6 +432,19 @@ describe("startGatewayPostAttachRuntime", () => {
expect(getCron()).toBe(reloadedCron);
});
it("does not resolve the global hook runner when no gateway_start hooks are registered", async () => {
  // Tripwire: resolving the hook runner at all rejects, on top of the call-count assertion below.
  const getGlobalHookRunner = vi.fn(async () => {
    throw new Error("should not load hook runner");
  });
  const params = createPostAttachParams();
  const runtimeDeps = createPostAttachRuntimeDeps({ getGlobalHookRunner });

  await startGatewayPostAttachRuntime(params, runtimeDeps);

  expect(getGlobalHookRunner).not.toHaveBeenCalled();
});
it("resolves gateway_start cron from the live runtime getter before deps fallback", async () => {
const runGatewayStart = vi.fn<
(event: PluginHookGatewayStartEvent, ctx: PluginHookGatewayContext) => Promise<void>
@@ -443,6 +460,10 @@ describe("startGatewayPostAttachRuntime", () => {
const params = createPostAttachParams({
deps: { cron: depsCron } as never,
getCronService: () => currentLiveCron,
pluginRegistry: {
...createPostAttachParams().pluginRegistry,
typedHooks: [{ hookName: "gateway_start" }],
} as never,
});
await startGatewayPostAttachRuntime(
@@ -509,6 +530,7 @@ function createPostAttachParams(overrides: Partial<PostAttachParams> = {}): Post
{ id: "cold", status: "disabled" },
{ id: "broken", status: "error" },
],
typedHooks: [],
} as never,
defaultWorkspaceDir: "/tmp/openclaw-workspace",
deps: {} as never,

View File

@@ -47,6 +47,10 @@ function shouldStartGatewayMemoryBackend(cfg: OpenClawConfig): boolean {
return cfg.memory?.backend === "qmd";
}
/**
 * Reports whether the loaded plugin registry contains at least one typed hook
 * whose `hookName` is `"gateway_start"`.
 */
function hasGatewayStartHooks(pluginRegistry: ReturnType<typeof loadOpenClawPlugins>): boolean {
  for (const registeredHook of pluginRegistry.typedHooks) {
    if (registeredHook.hookName === "gateway_start") {
      return true;
    }
  }
  return false;
}
function isConfiguredCliBackendPrimary(params: {
cfg: OpenClawConfig;
explicitPrimary: string;
@@ -116,18 +120,12 @@ async function prewarmConfiguredPrimaryModel(params: {
const [
{ resolveOpenClawAgentDir },
{ DEFAULT_MODEL, DEFAULT_PROVIDER },
{ selectAgentHarness },
{ isCliProvider, resolveConfiguredModelRef },
{ ensureOpenClawModelsJson },
{ resolveModel, resolveModelAsync },
{ resolveEmbeddedAgentRuntime },
] = await Promise.all([
import("../agents/agent-paths.js"),
import("../agents/defaults.js"),
import("../agents/harness/selection.js"),
import("../agents/model-selection.js"),
import("../agents/models-config.js"),
import("../agents/pi-embedded-runner/model.js"),
import("../agents/pi-embedded-runner/runtime.js"),
]);
const { provider, model } = resolveConfiguredModelRef({
@@ -142,26 +140,14 @@ async function prewarmConfiguredPrimaryModel(params: {
if (runtime !== "auto" && runtime !== "pi") {
return;
}
if (selectAgentHarness({ provider, modelId: model, config: params.cfg }).id !== "pi") {
return;
}
// Keep startup prewarm metadata-only; resolving models can import provider runtimes and block readiness.
const { ensureOpenClawModelsJson } = await import("../agents/models-config.js");
const agentDir = resolveOpenClawAgentDir();
try {
await ensureOpenClawModelsJson(params.cfg, agentDir, {
providerDiscoveryProviderIds: [provider],
providerDiscoveryTimeoutMs: STARTUP_PROVIDER_DISCOVERY_TIMEOUT_MS,
});
const resolved = resolveModel(provider, model, agentDir, params.cfg, {
skipProviderRuntimeHooks: true,
});
if (!resolved.model) {
const asyncResolved = await resolveModelAsync(provider, model, agentDir, params.cfg);
if (!asyncResolved.model) {
throw new Error(
resolved.error ?? asyncResolved.error ?? `Unknown model: ${provider}/${model}`,
);
}
}
} catch (err) {
params.log.warn(`startup model warmup failed for ${provider}/${model}: ${String(err)}`);
}
@@ -599,6 +585,10 @@ export async function startGatewayPostAttachRuntime(
if (params.minimalTestGateway) {
return;
}
if (!hasGatewayStartHooks(params.pluginRegistry)) {
return;
}
await new Promise<void>((resolve) => setImmediate(resolve));
const hookRunner = await runtimeDeps.getGlobalHookRunner();
if (hookRunner?.hasHooks("gateway_start")) {
void hookRunner

View File

@@ -8,36 +8,7 @@ const ensureOpenClawModelsJsonMock = vi.fn<
options?: unknown,
) => Promise<{ agentDir: string; wrote: boolean }>
>(async () => ({ agentDir: "/tmp/agent", wrote: false }));
const resolveModelMock = vi.fn<
(
provider: unknown,
modelId: unknown,
agentDir: unknown,
cfg: unknown,
options?: unknown,
) => { model: { id: string; provider: string; api: string } }
>(() => ({
model: {
id: "gpt-5.4",
provider: "openai-codex",
api: "openai-codex-responses",
},
}));
const resolveModelAsyncMock = vi.fn<
(
provider: unknown,
modelId: unknown,
agentDir: unknown,
cfg: unknown,
) => Promise<{ model?: { id: string; provider: string; api: string }; error?: string }>
>(async () => ({
model: {
id: "gpt-5.4",
provider: "openai-codex",
api: "openai-codex-responses",
},
}));
const selectAgentHarnessMock = vi.fn((_params: unknown) => ({ id: "pi" }));
const piModelModuleLoadedMock = vi.fn();
const resolveEmbeddedAgentRuntimeMock = vi.fn(() => "auto");
vi.mock("../agents/agent-paths.js", () => ({
@@ -49,21 +20,12 @@ vi.mock("../agents/models-config.js", () => ({
ensureOpenClawModelsJsonMock(config, agentDir, options),
}));
vi.mock("../agents/harness/selection.js", () => ({
selectAgentHarness: (params: unknown) => selectAgentHarnessMock(params),
}));
vi.mock("../agents/pi-embedded-runner/model.js", () => ({
resolveModel: (
provider: unknown,
modelId: unknown,
agentDir: unknown,
cfg: unknown,
options?: unknown,
) => resolveModelMock(provider, modelId, agentDir, cfg, options),
resolveModelAsync: (provider: unknown, modelId: unknown, agentDir: unknown, cfg: unknown) =>
resolveModelAsyncMock(provider, modelId, agentDir, cfg),
}));
vi.mock("../agents/pi-embedded-runner/model.js", () => {
piModelModuleLoadedMock();
return {
resolveModel: () => ({}),
};
});
vi.mock("../agents/pi-embedded-runner/runtime.js", () => ({
resolveEmbeddedAgentRuntime: () => resolveEmbeddedAgentRuntimeMock(),
@@ -80,10 +42,7 @@ describe("gateway startup primary model warmup", () => {
beforeEach(() => {
ensureOpenClawModelsJsonMock.mockClear();
resolveModelMock.mockClear();
resolveModelAsyncMock.mockClear();
selectAgentHarnessMock.mockClear();
selectAgentHarnessMock.mockReturnValue({ id: "pi" });
piModelModuleLoadedMock.mockClear();
resolveEmbeddedAgentRuntimeMock.mockClear();
resolveEmbeddedAgentRuntimeMock.mockReturnValue("auto");
});
@@ -112,9 +71,7 @@ describe("gateway startup primary model warmup", () => {
providerDiscoveryTimeoutMs: 5000,
}),
);
expect(resolveModelMock).toHaveBeenCalledWith("openai-codex", "gpt-5.4", "/tmp/agent", cfg, {
skipProviderRuntimeHooks: true,
});
expect(piModelModuleLoadedMock).not.toHaveBeenCalled();
});
it("skips warmup when no explicit primary model is configured", async () => {
@@ -124,7 +81,7 @@ describe("gateway startup primary model warmup", () => {
});
expect(ensureOpenClawModelsJsonMock).not.toHaveBeenCalled();
expect(resolveModelMock).not.toHaveBeenCalled();
expect(piModelModuleLoadedMock).not.toHaveBeenCalled();
});
it("skips static warmup for configured CLI backends", async () => {
@@ -148,33 +105,7 @@ describe("gateway startup primary model warmup", () => {
});
expect(ensureOpenClawModelsJsonMock).not.toHaveBeenCalled();
expect(resolveModelMock).not.toHaveBeenCalled();
});
it("skips static warmup when another agent harness handles the model", async () => {
selectAgentHarnessMock.mockReturnValue({ id: "codex" });
const cfg = {
agents: {
defaults: {
model: {
primary: "codex/gpt-5.4",
},
},
},
} as OpenClawConfig;
await prewarmConfiguredPrimaryModel({
cfg,
log: { warn: vi.fn() },
});
expect(selectAgentHarnessMock).toHaveBeenCalledWith({
provider: "codex",
modelId: "gpt-5.4",
config: cfg,
});
expect(ensureOpenClawModelsJsonMock).not.toHaveBeenCalled();
expect(resolveModelMock).not.toHaveBeenCalled();
expect(piModelModuleLoadedMock).not.toHaveBeenCalled();
});
it("skips static warmup when a non-PI agent runtime is forced", async () => {
@@ -192,9 +123,8 @@ describe("gateway startup primary model warmup", () => {
log: { warn: vi.fn() },
});
expect(selectAgentHarnessMock).not.toHaveBeenCalled();
expect(ensureOpenClawModelsJsonMock).not.toHaveBeenCalled();
expect(resolveModelMock).not.toHaveBeenCalled();
expect(piModelModuleLoadedMock).not.toHaveBeenCalled();
});
it("keeps PI static warmup when the PI agent runtime is forced", async () => {
@@ -214,11 +144,6 @@ describe("gateway startup primary model warmup", () => {
log: { warn: vi.fn() },
});
expect(selectAgentHarnessMock).toHaveBeenCalledWith({
provider: "openai-codex",
modelId: "gpt-5.4",
config: cfg,
});
expect(ensureOpenClawModelsJsonMock).toHaveBeenCalledWith(
cfg,
"/tmp/agent",
@@ -227,38 +152,11 @@ describe("gateway startup primary model warmup", () => {
providerDiscoveryTimeoutMs: 5000,
}),
);
expect(resolveModelMock).toHaveBeenCalled();
expect(piModelModuleLoadedMock).not.toHaveBeenCalled();
});
it("falls back to async model resolution before warning", async () => {
resolveModelMock.mockReturnValueOnce({ model: undefined } as never);
resolveModelAsyncMock.mockResolvedValueOnce({
model: {
id: "gpt-5.4",
provider: "codex",
api: "openai-codex-responses",
},
});
const warn = vi.fn();
const cfg = {
agents: {
defaults: {
model: {
primary: "codex/gpt-5.4",
},
},
},
} as OpenClawConfig;
await prewarmConfiguredPrimaryModel({ cfg, log: { warn } });
expect(resolveModelAsyncMock).toHaveBeenCalledWith("codex", "gpt-5.4", "/tmp/agent", cfg);
expect(warn).not.toHaveBeenCalled();
});
it("warns only when both static and async model resolution miss", async () => {
resolveModelMock.mockReturnValueOnce({ model: undefined, error: "static miss" } as never);
resolveModelAsyncMock.mockResolvedValueOnce({ error: "async miss" });
it("warns when scoped models.json preparation fails", async () => {
ensureOpenClawModelsJsonMock.mockRejectedValueOnce(new Error("models write failed"));
const warn = vi.fn();
await prewarmConfiguredPrimaryModel({

View File

@@ -558,6 +558,7 @@ export async function startGatewayServer(
channelLogs,
channelRuntimeEnvs,
resolveChannelRuntime: getChannelRuntime,
startupTrace,
});
const getReadiness = createReadinessChecker({
channelManager,

View File

@@ -98,6 +98,7 @@ describe("startChannelApprovalHandlerBootstrap", () => {
const lease = registerApprovalContext(channelRuntime);
const cleanup = await startTestBootstrap({ channelRuntime });
await flushTransitions();
expect(createChannelApprovalHandlerFromCapability).toHaveBeenCalledTimes(1);
expect(start).toHaveBeenCalledTimes(1);
@@ -107,6 +108,22 @@ describe("startChannelApprovalHandlerBootstrap", () => {
lease.dispose();
});
it("does not block bootstrap return on an existing runtime context", async () => {
  const channelRuntime = createRuntimeChannel();
  // Handler creation never settles, so the bootstrap can only return promptly
  // if it does not await the handler for the already-registered context.
  createChannelApprovalHandlerFromCapability.mockReturnValue(new Promise(() => {}));
  // Keep the lease so the registered context is released when the test ends.
  const lease = registerApprovalContext(channelRuntime);
  let timeoutTimer: ReturnType<typeof setTimeout> | undefined;

  try {
    const result = await Promise.race([
      startTestBootstrap({ channelRuntime }).then((cleanup) => ({ cleanup })),
      new Promise<"timeout">((resolve) => {
        timeoutTimer = setTimeout(() => resolve("timeout"), 50);
      }),
    ]);

    expect(result).not.toBe("timeout");
    if (result !== "timeout") {
      await result.cleanup();
    }
  } finally {
    // Do not leave the race timer or the approval context behind for later tests.
    clearTimeout(timeoutTimer);
    lease.dispose();
  }
});
it("does not start a handler after the runtime context is unregistered mid-boot", async () => {
const channelRuntime = createRuntimeChannel();
let resolveRuntime:

View File

@@ -155,7 +155,11 @@ export async function startChannelApprovalHandlerBootstrap(params: {
if (existingContext !== undefined) {
clearRetryTimer();
invalidateActiveHandler();
await startHandlerForContext(existingContext, activeGeneration);
const generation = activeGeneration;
spawn(
"failed to start native approval handler",
startHandlerForRegisteredContext(existingContext, generation),
);
}
return async () => {