From f85cfc8b6c7eaf9b3b0ea28359bf242e757f3b68 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sun, 22 Mar 2026 12:12:08 -0700 Subject: [PATCH] fix(gateway): harden first-turn startup readiness (#52387) * fix(gateway): harden first-turn startup readiness * fix(gateway): scope startup model retry --- CHANGELOG.md | 1 + .../model.startup-retry.test.ts | 98 +++++++++++++++++++ src/agents/pi-embedded-runner/model.ts | 37 ++++--- src/gateway/server-startup.test.ts | 88 +++++++++++++++++ src/gateway/server-startup.ts | 39 ++++++++ src/plugins/provider-runtime.ts | 6 +- 6 files changed, 256 insertions(+), 13 deletions(-) create mode 100644 src/agents/pi-embedded-runner/model.startup-retry.test.ts create mode 100644 src/gateway/server-startup.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ea85f5a99f..a3811fec512 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -123,6 +123,7 @@ Docs: https://docs.openclaw.ai - Doctor/extensions: keep Matrix DM `allowFrom` repairs on the canonical `dm.allowFrom` path and stop treating Zalouser group sender gating as if it fell back to `allowFrom`, so doctor warnings and `--fix` stay aligned with runtime access control. Thanks @vincentkoc. - Doctor/refactor: centralize built-in channel doctor semantics in one static capability registry with conservative fallback behavior for unknown/external channels, so future extension changes stop depending on scattered shared string checks. Thanks @vincentkoc. - Models/OpenRouter runtime capabilities: fetch uncatalogued OpenRouter model metadata on first use so newly added vision models keep image input instead of silently degrading to text-only, with top-level capability field fallbacks for `/api/v1/models`. (#45824) Thanks @DJjjjhao. +- Gateway/startup: prewarm the configured primary model before channel startup and retry one transient provider-runtime miss so the first Telegram or Discord message after boot no longer fails with `Unknown model: openai-codex/gpt-5.4`. Thanks @vincentkoc. 
- Channels/plugins: keep shared interactive payloads merge-ready by fixing Slack custom callback routing and repeat-click dedupe, allowing interactive-only sends, and preserving ordered Discord shared text blocks. (#47715) Thanks @vincentkoc. - Slack/interactive replies: preserve `channelData.slack.blocks` through live DM delivery and preview-finalized edits so Block Kit button and select directives render instead of falling back to raw text. (#45890) Thanks @vincentkoc. - Feishu/actions: expand the runtime action surface with message read/edit, explicit thread replies, pinning, and operator-facing chat/member inspection so Feishu can operate more of the workspace directly. (#47968) Thanks @Takhoffman. diff --git a/src/agents/pi-embedded-runner/model.startup-retry.test.ts b/src/agents/pi-embedded-runner/model.startup-retry.test.ts new file mode 100644 index 00000000000..1cfa0ae4609 --- /dev/null +++ b/src/agents/pi-embedded-runner/model.startup-retry.test.ts @@ -0,0 +1,98 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; + +const discoverAuthStorageMock = vi.fn<(agentDir?: string) => { mocked: true }>(() => ({ + mocked: true, +})); +const discoverModelsMock = vi.fn< + (authStorage: unknown, agentDir: string) => { find: ReturnType<typeof vi.fn> } +>(() => ({ find: vi.fn(() => null) })); + +let hookCacheCleared = false; +const clearProviderRuntimeHookCacheMock = vi.fn<() => void>(() => { + hookCacheCleared = true; +}); +const resolveProviderRuntimePluginMock = vi.fn<(params: unknown) => unknown>(() => + hookCacheCleared ? { id: "openai", label: "OpenAI", auth: [] } : undefined, +); +const prepareProviderDynamicModelMock = vi.fn<(params: unknown) => Promise<void>>(async () => {}); +const runProviderDynamicModelMock = vi.fn<(params: unknown) => unknown>(() => + hookCacheCleared + ? 
{ + id: "gpt-5.4", + name: "gpt-5.4", + provider: "openai-codex", + api: "openai-codex-responses", + baseUrl: "https://chatgpt.com/backend-api", + reasoning: true, + input: ["text"], + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, + contextWindow: 1_050_000, + maxTokens: 128_000, + } + : undefined, +); + +vi.mock("../pi-model-discovery.js", () => ({ + discoverAuthStorage: discoverAuthStorageMock, + discoverModels: discoverModelsMock, +})); + +vi.mock("../../plugins/provider-runtime.js", async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + clearProviderRuntimeHookCache: clearProviderRuntimeHookCacheMock, + normalizeProviderResolvedModelWithPlugin: () => undefined, + prepareProviderDynamicModel: (params: unknown) => prepareProviderDynamicModelMock(params), + resolveProviderRuntimePlugin: (params: unknown) => resolveProviderRuntimePluginMock(params), + runProviderDynamicModel: (params: unknown) => runProviderDynamicModelMock(params), + }; +}); + +describe("resolveModelAsync startup retry", () => { + beforeEach(() => { + hookCacheCleared = false; + clearProviderRuntimeHookCacheMock.mockClear(); + resolveProviderRuntimePluginMock.mockClear(); + prepareProviderDynamicModelMock.mockClear(); + runProviderDynamicModelMock.mockClear(); + discoverAuthStorageMock.mockClear(); + discoverModelsMock.mockClear(); + }); + + it("retries once after clearing the provider-runtime hook cache", async () => { + const { resolveModelAsync } = await import("./model.js"); + + const result = await resolveModelAsync( + "openai-codex", + "gpt-5.4", + "/tmp/agent", + {}, + { + retryTransientProviderRuntimeMiss: true, + }, + ); + + expect(result.error).toBeUndefined(); + expect(result.model).toMatchObject({ + provider: "openai-codex", + id: "gpt-5.4", + api: "openai-codex-responses", + }); + expect(clearProviderRuntimeHookCacheMock).toHaveBeenCalledTimes(1); + expect(resolveProviderRuntimePluginMock).toHaveBeenCalledTimes(2); + 
expect(runProviderDynamicModelMock).toHaveBeenCalledTimes(2); + }); + + it("does not clear the hook cache during steady-state misses", async () => { + const { resolveModelAsync } = await import("./model.js"); + + const result = await resolveModelAsync("openai-codex", "gpt-5.4", "/tmp/agent", {}); + + expect(result.model).toBeUndefined(); + expect(result.error).toBe("Unknown model: openai-codex/gpt-5.4"); + expect(clearProviderRuntimeHookCacheMock).not.toHaveBeenCalled(); + expect(resolveProviderRuntimePluginMock).toHaveBeenCalledTimes(1); + expect(runProviderDynamicModelMock).toHaveBeenCalledTimes(1); + }); +}); diff --git a/src/agents/pi-embedded-runner/model.ts b/src/agents/pi-embedded-runner/model.ts index 375ad55127a..997c9088dfd 100644 --- a/src/agents/pi-embedded-runner/model.ts +++ b/src/agents/pi-embedded-runner/model.ts @@ -3,6 +3,7 @@ import type { AuthStorage, ModelRegistry } from "@mariozechner/pi-coding-agent"; import type { OpenClawConfig } from "../../config/config.js"; import type { ModelDefinitionConfig } from "../../config/types.js"; import { + clearProviderRuntimeHookCache, prepareProviderDynamicModel, resolveProviderRuntimePlugin, runProviderDynamicModel, @@ -349,6 +350,9 @@ export async function resolveModelAsync( modelId: string, agentDir?: string, cfg?: OpenClawConfig, + options?: { + retryTransientProviderRuntimeMiss?: boolean; + }, ): Promise<{ model?: Model; error?: string; @@ -372,7 +376,11 @@ export async function resolveModelAsync( modelRegistry, }; } - if (!explicitModel) { + const providerConfig = resolveConfiguredProviderConfig(cfg, provider); + const resolveDynamicAttempt = async (options?: { clearHookCache?: boolean }) => { + if (options?.clearHookCache) { + clearProviderRuntimeHookCache(); + } const providerPlugin = resolveProviderRuntimePlugin({ provider, config: cfg, @@ -387,21 +395,26 @@ export async function resolveModelAsync( provider, modelId, modelRegistry, - providerConfig: resolveConfiguredProviderConfig(cfg, provider), + 
providerConfig, }, }); } + return resolveModelWithRegistry({ + provider, + modelId, + modelRegistry, + cfg, + agentDir: resolvedAgentDir, + }); + }; + let model = + explicitModel?.kind === "resolved" ? explicitModel.model : await resolveDynamicAttempt(); + if (!model && !explicitModel && options?.retryTransientProviderRuntimeMiss) { + // Startup can race the first provider-runtime snapshot load on a fresh + // gateway boot. Retry once with a cleared hook cache before surfacing a + // user-visible "Unknown model" that disappears on the next message. + model = await resolveDynamicAttempt({ clearHookCache: true }); } - const model = - explicitModel?.kind === "resolved" - ? explicitModel.model - : resolveModelWithRegistry({ - provider, - modelId, - modelRegistry, - cfg, - agentDir: resolvedAgentDir, - }); if (model) { return { model, authStorage, modelRegistry }; } diff --git a/src/gateway/server-startup.test.ts b/src/gateway/server-startup.test.ts new file mode 100644 index 00000000000..40b9d796e5c --- /dev/null +++ b/src/gateway/server-startup.test.ts @@ -0,0 +1,88 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; +import type { OpenClawConfig } from "../config/config.js"; + +const ensureOpenClawModelsJsonMock = vi.fn< + (config: unknown, agentDir: unknown) => Promise<{ agentDir: string; wrote: boolean }> +>(async () => ({ agentDir: "/tmp/agent", wrote: false })); +const resolveModelAsyncMock = vi.fn< + ( + provider: unknown, + modelId: unknown, + agentDir: unknown, + cfg: unknown, + options?: unknown, + ) => Promise<{ model: { id: string; provider: string; api: string } }> +>(async () => ({ + model: { + id: "gpt-5.4", + provider: "openai-codex", + api: "openai-codex-responses", + }, +})); + +vi.mock("../agents/agent-paths.js", () => ({ + resolveOpenClawAgentDir: () => "/tmp/agent", +})); + +vi.mock("../agents/models-config.js", () => ({ + ensureOpenClawModelsJson: (config: unknown, agentDir: unknown) => + ensureOpenClawModelsJsonMock(config, 
agentDir), +})); + +vi.mock("../agents/pi-embedded-runner/model.js", () => ({ + resolveModelAsync: ( + provider: unknown, + modelId: unknown, + agentDir: unknown, + cfg: unknown, + options?: unknown, + ) => resolveModelAsyncMock(provider, modelId, agentDir, cfg, options), +})); + +describe("gateway startup primary model warmup", () => { + beforeEach(() => { + ensureOpenClawModelsJsonMock.mockClear(); + resolveModelAsyncMock.mockClear(); + }); + + it("prewarms an explicit configured primary model", async () => { + const { __testing } = await import("./server-startup.js"); + const cfg = { + agents: { + defaults: { + model: { + primary: "openai-codex/gpt-5.4", + }, + }, + }, + } as OpenClawConfig; + + await __testing.prewarmConfiguredPrimaryModel({ + cfg, + log: { warn: vi.fn() }, + }); + + expect(ensureOpenClawModelsJsonMock).toHaveBeenCalledWith(cfg, "/tmp/agent"); + expect(resolveModelAsyncMock).toHaveBeenCalledWith( + "openai-codex", + "gpt-5.4", + "/tmp/agent", + cfg, + { + retryTransientProviderRuntimeMiss: true, + }, + ); + }); + + it("skips warmup when no explicit primary model is configured", async () => { + const { __testing } = await import("./server-startup.js"); + + await __testing.prewarmConfiguredPrimaryModel({ + cfg: {} as OpenClawConfig, + log: { warn: vi.fn() }, + }); + + expect(ensureOpenClawModelsJsonMock).not.toHaveBeenCalled(); + expect(resolveModelAsyncMock).not.toHaveBeenCalled(); + }); +}); diff --git a/src/gateway/server-startup.ts b/src/gateway/server-startup.ts index 01ec6266df6..69b6d3f2810 100644 --- a/src/gateway/server-startup.ts +++ b/src/gateway/server-startup.ts @@ -1,5 +1,6 @@ import { getAcpSessionManager } from "../acp/control-plane/manager.js"; import { ACP_SESSION_IDENTITY_RENDERER_VERSION } from "../acp/runtime/session-identifiers.js"; +import { resolveOpenClawAgentDir } from "../agents/agent-paths.js"; import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "../agents/defaults.js"; import { loadModelCatalog } from 
"../agents/model-catalog.js"; import { @@ -7,10 +8,13 @@ import { resolveConfiguredModelRef, resolveHooksGmailModel, } from "../agents/model-selection.js"; +import { ensureOpenClawModelsJson } from "../agents/models-config.js"; +import { resolveModelAsync } from "../agents/pi-embedded-runner/model.js"; import { resolveAgentSessionDirs } from "../agents/session-dirs.js"; import { cleanStaleLockFiles } from "../agents/session-write-lock.js"; import type { CliDeps } from "../cli/deps.js"; import type { loadConfig } from "../config/config.js"; +import { resolveAgentModelPrimaryValue } from "../config/model-input.js"; import { resolveStateDir } from "../config/paths.js"; import { startGmailWatcherWithLogs } from "../hooks/gmail-watcher-lifecycle.js"; import { @@ -31,6 +35,33 @@ import { startGatewayMemoryBackend } from "./server-startup-memory.js"; const SESSION_LOCK_STALE_MS = 30 * 60 * 1000; +async function prewarmConfiguredPrimaryModel(params: { + cfg: ReturnType<typeof loadConfig>; + log: { warn: (msg: string) => void }; +}): Promise<void> { + const explicitPrimary = resolveAgentModelPrimaryValue(params.cfg.agents?.defaults?.model)?.trim(); + if (!explicitPrimary) { + return; + } + const { provider, model } = resolveConfiguredModelRef({ + cfg: params.cfg, + defaultProvider: DEFAULT_PROVIDER, + defaultModel: DEFAULT_MODEL, + }); + const agentDir = resolveOpenClawAgentDir(); + try { + await ensureOpenClawModelsJson(params.cfg, agentDir); + const resolved = await resolveModelAsync(provider, model, agentDir, params.cfg, { + retryTransientProviderRuntimeMiss: true, + }); + if (!resolved.model) { + throw new Error(resolved.error ?? 
`Unknown model: ${provider}/${model}`); + } + } catch (err) { + params.log.warn(`startup model warmup failed for ${provider}/${model}: ${String(err)}`); + } +} + export async function startGatewaySidecars(params: { cfg: ReturnType; pluginRegistry: ReturnType; @@ -129,6 +160,10 @@ export async function startGatewaySidecars(params: { isTruthyEnvValue(process.env.OPENCLAW_SKIP_PROVIDERS); if (!skipChannels) { try { + await prewarmConfiguredPrimaryModel({ + cfg: params.cfg, + log: params.log, + }); await params.startChannels(); } catch (err) { params.logChannels.error(`channel startup failed: ${String(err)}`); @@ -189,3 +224,7 @@ export async function startGatewaySidecars(params: { return { browserControl, pluginServices }; } + +export const __testing = { + prewarmConfiguredPrimaryModel, +}; diff --git a/src/plugins/provider-runtime.ts b/src/plugins/provider-runtime.ts index 561154196f0..ab9f7518f82 100644 --- a/src/plugins/provider-runtime.ts +++ b/src/plugins/provider-runtime.ts @@ -89,7 +89,7 @@ function buildHookProviderCacheKey(params: { return `${roots.workspace ?? ""}::${roots.global}::${roots.stock ?? ""}::${JSON.stringify(params.onlyPluginIds ?? [])}`; } -export function resetProviderRuntimeHookCacheForTest(): void { +export function clearProviderRuntimeHookCache(): void { cachedHookProvidersWithoutConfig = new WeakMap< NodeJS.ProcessEnv, Map @@ -100,6 +100,10 @@ export function resetProviderRuntimeHookCacheForTest(): void { >(); } +export function resetProviderRuntimeHookCacheForTest(): void { + clearProviderRuntimeHookCache(); +} + function resolveProviderPluginsForHooks(params: { config?: OpenClawConfig; workspaceDir?: string;