fix(gateway): harden first-turn startup readiness (#52387)

* fix(gateway): harden first-turn startup readiness

* fix(gateway): scope startup model retry
This commit is contained in:
Vincent Koc
2026-03-22 12:12:08 -07:00
committed by GitHub
parent aef2c60aa5
commit f85cfc8b6c
6 changed files with 256 additions and 13 deletions

View File

@@ -123,6 +123,7 @@ Docs: https://docs.openclaw.ai
- Doctor/extensions: keep Matrix DM `allowFrom` repairs on the canonical `dm.allowFrom` path and stop treating Zalo user group sender gating as if it fell back to `allowFrom`, so doctor warnings and `--fix` stay aligned with runtime access control. Thanks @vincentkoc.
- Doctor/refactor: centralize built-in channel doctor semantics in one static capability registry with conservative fallback behavior for unknown/external channels, so future extension changes stop depending on scattered shared string checks. Thanks @vincentkoc.
- Models/OpenRouter runtime capabilities: fetch uncatalogued OpenRouter model metadata on first use so newly added vision models keep image input instead of silently degrading to text-only, with top-level capability field fallbacks for `/api/v1/models`. (#45824) Thanks @DJjjjhao.
- Gateway/startup: prewarm the configured primary model before channel startup and retry one transient provider-runtime miss so the first Telegram or Discord message after boot no longer fails with `Unknown model: openai-codex/gpt-5.4`. Thanks @vincentkoc.
- Channels/plugins: keep shared interactive payloads merge-ready by fixing Slack custom callback routing and repeat-click dedupe, allowing interactive-only sends, and preserving ordered Discord shared text blocks. (#47715) Thanks @vincentkoc.
- Slack/interactive replies: preserve `channelData.slack.blocks` through live DM delivery and preview-finalized edits so Block Kit button and select directives render instead of falling back to raw text. (#45890) Thanks @vincentkoc.
- Feishu/actions: expand the runtime action surface with message read/edit, explicit thread replies, pinning, and operator-facing chat/member inspection so Feishu can operate more of the workspace directly. (#47968) Thanks @Takhoffman.

View File

@@ -0,0 +1,98 @@
import { beforeEach, describe, expect, it, vi } from "vitest";

// Stub for discoverAuthStorage: returns a sentinel object so the resolver can
// thread "auth storage" through without touching real credentials on disk.
const discoverAuthStorageMock = vi.fn<(agentDir?: string) => { mocked: true }>(() => ({
  mocked: true,
}));

// Stub for discoverModels: its registry never finds anything, which forces
// resolveModelAsync down the dynamic provider-runtime path under test.
const discoverModelsMock = vi.fn<
  (authStorage: unknown, agentDir: string) => { find: ReturnType<typeof vi.fn> }
>(() => ({ find: vi.fn(() => null) }));

// Simulates the transient first-boot race: provider-runtime lookups miss
// until the hook cache has been cleared once, then start succeeding.
let hookCacheCleared = false;
const clearProviderRuntimeHookCacheMock = vi.fn<() => void>(() => {
  hookCacheCleared = true;
});

// Plugin resolution only succeeds after the cache clear, mirroring a
// provider-runtime snapshot that was not loaded yet on the first attempt.
const resolveProviderRuntimePluginMock = vi.fn<(params: unknown) => unknown>(() =>
  hookCacheCleared ? { id: "openai", label: "OpenAI", auth: [] } : undefined,
);
const prepareProviderDynamicModelMock = vi.fn<(params: unknown) => Promise<void>>(async () => {});

// Dynamic model lookup follows the same gate: undefined on the first pass,
// a concrete model definition once the cache has been cleared.
const runProviderDynamicModelMock = vi.fn<(params: unknown) => unknown>(() =>
  hookCacheCleared
    ? {
        id: "gpt-5.4",
        name: "gpt-5.4",
        provider: "openai-codex",
        api: "openai-codex-responses",
        baseUrl: "https://chatgpt.com/backend-api",
        reasoning: true,
        input: ["text"],
        cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
        contextWindow: 1_050_000,
        maxTokens: 128_000,
      }
    : undefined,
);

// Note: vi.mock calls are hoisted by vitest, so the mock factories below must
// only reference the vi.fn instances declared above.
vi.mock("../pi-model-discovery.js", () => ({
  discoverAuthStorage: discoverAuthStorageMock,
  discoverModels: discoverModelsMock,
}));
vi.mock("../../plugins/provider-runtime.js", async (importOriginal) => {
  const actual = await importOriginal<typeof import("../../plugins/provider-runtime.js")>();
  return {
    ...actual,
    clearProviderRuntimeHookCache: clearProviderRuntimeHookCacheMock,
    normalizeProviderResolvedModelWithPlugin: () => undefined,
    prepareProviderDynamicModel: (params: unknown) => prepareProviderDynamicModelMock(params),
    resolveProviderRuntimePlugin: (params: unknown) => resolveProviderRuntimePluginMock(params),
    runProviderDynamicModel: (params: unknown) => runProviderDynamicModelMock(params),
  };
});
describe("resolveModelAsync startup retry", () => {
  beforeEach(() => {
    // Reset the simulated "snapshot not yet loaded" state and all call counters.
    hookCacheCleared = false;
    for (const mock of [
      clearProviderRuntimeHookCacheMock,
      resolveProviderRuntimePluginMock,
      prepareProviderDynamicModelMock,
      runProviderDynamicModelMock,
      discoverAuthStorageMock,
      discoverModelsMock,
    ]) {
      mock.mockClear();
    }
  });

  it("retries once after clearing the provider-runtime hook cache", async () => {
    const { resolveModelAsync } = await import("./model.js");

    const outcome = await resolveModelAsync(
      "openai-codex",
      "gpt-5.4",
      "/tmp/agent",
      {},
      { retryTransientProviderRuntimeMiss: true },
    );

    // The retry path should surface the model, not the transient miss.
    expect(outcome.error).toBeUndefined();
    expect(outcome.model).toMatchObject({
      provider: "openai-codex",
      id: "gpt-5.4",
      api: "openai-codex-responses",
    });
    // Exactly one cache clear, and both resolution phases ran twice.
    expect(clearProviderRuntimeHookCacheMock).toHaveBeenCalledTimes(1);
    expect(resolveProviderRuntimePluginMock).toHaveBeenCalledTimes(2);
    expect(runProviderDynamicModelMock).toHaveBeenCalledTimes(2);
  });

  it("does not clear the hook cache during steady-state misses", async () => {
    const { resolveModelAsync } = await import("./model.js");

    const outcome = await resolveModelAsync("openai-codex", "gpt-5.4", "/tmp/agent", {});

    // Without the opt-in flag a miss is reported directly, with no retry.
    expect(outcome.model).toBeUndefined();
    expect(outcome.error).toBe("Unknown model: openai-codex/gpt-5.4");
    expect(clearProviderRuntimeHookCacheMock).not.toHaveBeenCalled();
    expect(resolveProviderRuntimePluginMock).toHaveBeenCalledTimes(1);
    expect(runProviderDynamicModelMock).toHaveBeenCalledTimes(1);
  });
});

View File

@@ -3,6 +3,7 @@ import type { AuthStorage, ModelRegistry } from "@mariozechner/pi-coding-agent";
import type { OpenClawConfig } from "../../config/config.js";
import type { ModelDefinitionConfig } from "../../config/types.js";
import {
clearProviderRuntimeHookCache,
prepareProviderDynamicModel,
resolveProviderRuntimePlugin,
runProviderDynamicModel,
@@ -349,6 +350,9 @@ export async function resolveModelAsync(
modelId: string,
agentDir?: string,
cfg?: OpenClawConfig,
options?: {
retryTransientProviderRuntimeMiss?: boolean;
},
): Promise<{
model?: Model<Api>;
error?: string;
@@ -372,7 +376,11 @@ export async function resolveModelAsync(
modelRegistry,
};
}
if (!explicitModel) {
const providerConfig = resolveConfiguredProviderConfig(cfg, provider);
const resolveDynamicAttempt = async (options?: { clearHookCache?: boolean }) => {
if (options?.clearHookCache) {
clearProviderRuntimeHookCache();
}
const providerPlugin = resolveProviderRuntimePlugin({
provider,
config: cfg,
@@ -387,21 +395,26 @@ export async function resolveModelAsync(
provider,
modelId,
modelRegistry,
providerConfig: resolveConfiguredProviderConfig(cfg, provider),
providerConfig,
},
});
}
return resolveModelWithRegistry({
provider,
modelId,
modelRegistry,
cfg,
agentDir: resolvedAgentDir,
});
};
let model =
explicitModel?.kind === "resolved" ? explicitModel.model : await resolveDynamicAttempt();
if (!model && !explicitModel && options?.retryTransientProviderRuntimeMiss) {
// Startup can race the first provider-runtime snapshot load on a fresh
// gateway boot. Retry once with a cleared hook cache before surfacing a
// user-visible "Unknown model" that disappears on the next message.
model = await resolveDynamicAttempt({ clearHookCache: true });
}
const model =
explicitModel?.kind === "resolved"
? explicitModel.model
: resolveModelWithRegistry({
provider,
modelId,
modelRegistry,
cfg,
agentDir: resolvedAgentDir,
});
if (model) {
return { model, authStorage, modelRegistry };
}

View File

@@ -0,0 +1,88 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
import type { OpenClawConfig } from "../config/config.js";

// Stub for ensureOpenClawModelsJson: reports that models.json already exists
// at a fixed agent dir, so the warmup performs no filesystem work.
const ensureOpenClawModelsJsonMock = vi.fn<
  (config: unknown, agentDir: unknown) => Promise<{ agentDir: string; wrote: boolean }>
>(async () => ({ agentDir: "/tmp/agent", wrote: false }));

// Stub for resolveModelAsync: always resolves to a concrete model so the
// warmup path completes without touching real provider runtimes.
const resolveModelAsyncMock = vi.fn<
  (
    provider: unknown,
    modelId: unknown,
    agentDir: unknown,
    cfg: unknown,
    options?: unknown,
  ) => Promise<{ model: { id: string; provider: string; api: string } }>
>(async () => ({
  model: {
    id: "gpt-5.4",
    provider: "openai-codex",
    api: "openai-codex-responses",
  },
}));

// Pin the agent dir so assertions can rely on a stable path.
vi.mock("../agents/agent-paths.js", () => ({
  resolveOpenClawAgentDir: () => "/tmp/agent",
}));
vi.mock("../agents/models-config.js", () => ({
  ensureOpenClawModelsJson: (config: unknown, agentDir: unknown) =>
    ensureOpenClawModelsJsonMock(config, agentDir),
}));
vi.mock("../agents/pi-embedded-runner/model.js", () => ({
  resolveModelAsync: (
    provider: unknown,
    modelId: unknown,
    agentDir: unknown,
    cfg: unknown,
    options?: unknown,
  ) => resolveModelAsyncMock(provider, modelId, agentDir, cfg, options),
}));
describe("gateway startup primary model warmup", () => {
  // Shared loader for the module's test-only export surface.
  const loadWarmup = async () => (await import("./server-startup.js")).__testing;

  beforeEach(() => {
    ensureOpenClawModelsJsonMock.mockClear();
    resolveModelAsyncMock.mockClear();
  });

  it("prewarms an explicit configured primary model", async () => {
    const testing = await loadWarmup();
    const cfg = {
      agents: {
        defaults: {
          model: {
            primary: "openai-codex/gpt-5.4",
          },
        },
      },
    } as OpenClawConfig;

    await testing.prewarmConfiguredPrimaryModel({ cfg, log: { warn: vi.fn() } });

    // Warmup must seed models.json first, then resolve with the retry opt-in.
    expect(ensureOpenClawModelsJsonMock).toHaveBeenCalledWith(cfg, "/tmp/agent");
    expect(resolveModelAsyncMock).toHaveBeenCalledWith(
      "openai-codex",
      "gpt-5.4",
      "/tmp/agent",
      cfg,
      { retryTransientProviderRuntimeMiss: true },
    );
  });

  it("skips warmup when no explicit primary model is configured", async () => {
    const testing = await loadWarmup();

    await testing.prewarmConfiguredPrimaryModel({
      cfg: {} as OpenClawConfig,
      log: { warn: vi.fn() },
    });

    // With no configured primary, neither side effect should fire.
    expect(ensureOpenClawModelsJsonMock).not.toHaveBeenCalled();
    expect(resolveModelAsyncMock).not.toHaveBeenCalled();
  });
});

View File

@@ -1,5 +1,6 @@
import { getAcpSessionManager } from "../acp/control-plane/manager.js";
import { ACP_SESSION_IDENTITY_RENDERER_VERSION } from "../acp/runtime/session-identifiers.js";
import { resolveOpenClawAgentDir } from "../agents/agent-paths.js";
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "../agents/defaults.js";
import { loadModelCatalog } from "../agents/model-catalog.js";
import {
@@ -7,10 +8,13 @@ import {
resolveConfiguredModelRef,
resolveHooksGmailModel,
} from "../agents/model-selection.js";
import { ensureOpenClawModelsJson } from "../agents/models-config.js";
import { resolveModelAsync } from "../agents/pi-embedded-runner/model.js";
import { resolveAgentSessionDirs } from "../agents/session-dirs.js";
import { cleanStaleLockFiles } from "../agents/session-write-lock.js";
import type { CliDeps } from "../cli/deps.js";
import type { loadConfig } from "../config/config.js";
import { resolveAgentModelPrimaryValue } from "../config/model-input.js";
import { resolveStateDir } from "../config/paths.js";
import { startGmailWatcherWithLogs } from "../hooks/gmail-watcher-lifecycle.js";
import {
@@ -31,6 +35,33 @@ import { startGatewayMemoryBackend } from "./server-startup-memory.js";
const SESSION_LOCK_STALE_MS = 30 * 60 * 1000;
/**
 * Best-effort warmup of the operator-configured primary model before channel
 * startup, so the first inbound message after boot does not race the initial
 * provider-runtime load. Failures are logged as warnings, never thrown.
 */
async function prewarmConfiguredPrimaryModel(params: {
  cfg: ReturnType<typeof loadConfig>;
  log: { warn: (msg: string) => void };
}): Promise<void> {
  // Only warm up when a primary model was explicitly configured; a blank or
  // absent value means defaults apply and no prewarm is needed.
  const configuredPrimary = resolveAgentModelPrimaryValue(
    params.cfg.agents?.defaults?.model,
  )?.trim();
  if (!configuredPrimary) {
    return;
  }
  const ref = resolveConfiguredModelRef({
    cfg: params.cfg,
    defaultProvider: DEFAULT_PROVIDER,
    defaultModel: DEFAULT_MODEL,
  });
  const agentDir = resolveOpenClawAgentDir();
  try {
    // Seed models.json first so resolution sees a consistent catalog, then
    // resolve with the one-shot transient-miss retry enabled.
    await ensureOpenClawModelsJson(params.cfg, agentDir);
    const resolution = await resolveModelAsync(ref.provider, ref.model, agentDir, params.cfg, {
      retryTransientProviderRuntimeMiss: true,
    });
    if (!resolution.model) {
      throw new Error(resolution.error ?? `Unknown model: ${ref.provider}/${ref.model}`);
    }
  } catch (err) {
    // Warmup is advisory: surface the problem but let startup continue.
    params.log.warn(`startup model warmup failed for ${ref.provider}/${ref.model}: ${String(err)}`);
  }
}
export async function startGatewaySidecars(params: {
cfg: ReturnType<typeof loadConfig>;
pluginRegistry: ReturnType<typeof loadOpenClawPlugins>;
@@ -129,6 +160,10 @@ export async function startGatewaySidecars(params: {
isTruthyEnvValue(process.env.OPENCLAW_SKIP_PROVIDERS);
if (!skipChannels) {
try {
await prewarmConfiguredPrimaryModel({
cfg: params.cfg,
log: params.log,
});
await params.startChannels();
} catch (err) {
params.logChannels.error(`channel startup failed: ${String(err)}`);
@@ -189,3 +224,7 @@ export async function startGatewaySidecars(params: {
return { browserControl, pluginServices };
}
// Exposed for unit tests only; not part of the public gateway API surface.
export const __testing = {
  prewarmConfiguredPrimaryModel,
};

View File

@@ -89,7 +89,7 @@ function buildHookProviderCacheKey(params: {
return `${roots.workspace ?? ""}::${roots.global}::${roots.stock ?? ""}::${JSON.stringify(params.onlyPluginIds ?? [])}`;
}
export function resetProviderRuntimeHookCacheForTest(): void {
export function clearProviderRuntimeHookCache(): void {
cachedHookProvidersWithoutConfig = new WeakMap<
NodeJS.ProcessEnv,
Map<string, ProviderPlugin[]>
@@ -100,6 +100,10 @@ export function resetProviderRuntimeHookCacheForTest(): void {
>();
}
// Backward-compatible alias kept after the rename to
// clearProviderRuntimeHookCache, so existing test callers keep working.
export function resetProviderRuntimeHookCacheForTest(): void {
  clearProviderRuntimeHookCache();
}
function resolveProviderPluginsForHooks(params: {
config?: OpenClawConfig;
workspaceDir?: string;