diff --git a/src/gateway/gateway-models.profiles.live.test.ts b/src/gateway/gateway-models.profiles.live.test.ts index 300bcbd1ddc..0a6b0bedf26 100644 --- a/src/gateway/gateway-models.profiles.live.test.ts +++ b/src/gateway/gateway-models.profiles.live.test.ts @@ -10,6 +10,7 @@ import { resolveAgentWorkspaceDir } from "../agents/agent-scope.js"; import { type AuthProfileStore, ensureAuthProfileStore, + resolveAuthProfileOrder, saveAuthProfileStore, } from "../agents/auth-profiles.js"; import { @@ -49,6 +50,10 @@ const ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL = "ANTHROPIC_MAGIC_STRING_TRIGGER_R const GATEWAY_LIVE_DEFAULT_TIMEOUT_MS = 20 * 60 * 1000; const GATEWAY_LIVE_UNBOUNDED_TIMEOUT_MS = 60 * 60 * 1000; const GATEWAY_LIVE_MAX_TIMEOUT_MS = 2 * 60 * 60 * 1000; +const GATEWAY_LIVE_PROBE_TIMEOUT_MS = Math.max( + 30_000, + toInt(process.env.OPENCLAW_LIVE_GATEWAY_STEP_TIMEOUT_MS, 90_000), +); const GATEWAY_LIVE_MAX_MODELS = resolveGatewayLiveMaxModels(); const GATEWAY_LIVE_SUITE_TIMEOUT_MS = resolveGatewayLiveSuiteTimeoutMs(GATEWAY_LIVE_MAX_MODELS); @@ -96,6 +101,28 @@ function resolveGatewayLiveSuiteTimeoutMs(maxModels: number): number { ); } +function isGatewayLiveProbeTimeout(error: string): boolean { + return /probe timeout after \d+ms/i.test(error); +} + +async function withGatewayLiveProbeTimeout(operation: Promise, context: string): Promise { + let timeoutHandle: ReturnType | undefined; + try { + return await Promise.race([ + operation, + new Promise((_, reject) => { + timeoutHandle = setTimeout(() => { + reject(new Error(`probe timeout after ${GATEWAY_LIVE_PROBE_TIMEOUT_MS}ms (${context})`)); + }, GATEWAY_LIVE_PROBE_TIMEOUT_MS); + }), + ]); + } finally { + if (timeoutHandle) { + clearTimeout(timeoutHandle); + } + } +} + function capByProviderSpread( items: T[], maxItems: number, @@ -264,6 +291,11 @@ function isToolNonceRefusal(error: string): boolean { ); } +function isToolNonceProbeMiss(error: string): boolean { + const msg = error.toLowerCase(); + return msg.includes("tool probe missing nonce") || msg.includes("exec+read probe missing nonce"); +} + function isMissingProfileError(error: string): boolean { return /no credentials found for profile/i.test(error); } @@ -287,16 +319,19 @@ async function runAnthropicRefusalProbe(params: { logProgress(`${params.label}: refusal-probe`); const magic = buildAnthropicRefusalToken(); const runId = randomUUID(); - const probe = await params.client.request( - "agent", - { - sessionKey: params.sessionKey, - idempotencyKey: `idem-${runId}-refusal`, - message: `Reply with the single word ok. Test token: ${magic}`, - thinking: params.thinkingLevel, - deliver: false, - }, - { expectFinal: true }, + const probe = await withGatewayLiveProbeTimeout( + params.client.request( + "agent", + { + sessionKey: params.sessionKey, + idempotencyKey: `idem-${runId}-refusal`, + message: `Reply with the single word ok. Test token: ${magic}`, + thinking: params.thinkingLevel, + deliver: false, + }, + { expectFinal: true }, + ), + `${params.label}: refusal-probe`, ); if (probe?.status !== "ok") { throw new Error(`refusal probe failed: status=${String(probe?.status)}`); @@ -313,16 +348,19 @@ async function runAnthropicRefusalProbe(params: { } const followupId = randomUUID(); - const followup = await params.client.request( - "agent", - { - sessionKey: params.sessionKey, - idempotencyKey: `idem-${followupId}-refusal-followup`, - message: "Now reply with exactly: still ok.", - thinking: params.thinkingLevel, - deliver: false, - }, - { expectFinal: true }, + const followup = await withGatewayLiveProbeTimeout( + params.client.request( + "agent", + { + sessionKey: params.sessionKey, + idempotencyKey: `idem-${followupId}-refusal-followup`, + message: "Now reply with exactly: still ok.", + thinking: params.thinkingLevel, + deliver: false, + }, + { expectFinal: true }, + ), + `${params.label}: refusal-followup`, ); if (followup?.status !== "ok") { throw new Error(`refusal followup failed: status=${String(followup?.status)}`); @@ -666,19 +704,49 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { await fs.writeFile(tempConfigPath, `${JSON.stringify(nextCfg, null, 2)}\n`); process.env.OPENCLAW_CONFIG_PATH = tempConfigPath; - await ensureOpenClawModelsJson(nextCfg); + const liveProviders = nextCfg.models?.providers; + if (liveProviders && Object.keys(liveProviders).length > 0) { + const modelsPath = path.join(tempAgentDir, "models.json"); + await fs.mkdir(tempAgentDir, { recursive: true }); + await fs.writeFile(modelsPath, `${JSON.stringify({ providers: liveProviders }, null, 2)}\n`); + } - const port = await getFreeGatewayPort(); - const server = await startGatewayServer(port, { - bind: "loopback", - auth: { mode: "token", token }, - controlUiEnabled: false, - }); + let server: Awaited> | undefined; + let client: GatewayClient | undefined; + try { + const port = await withGatewayLiveProbeTimeout( + getFreeGatewayPort(), + `${params.label}: gateway-port`, + ); + server = await withGatewayLiveProbeTimeout( + startGatewayServer(port, { + bind: "loopback", + auth: { mode: "token", token }, + controlUiEnabled: false, + }), + `${params.label}: gateway-start`, + ); - const client = await connectClient({ - url: `ws://127.0.0.1:${port}`, - token, - }); + client = await withGatewayLiveProbeTimeout( + connectClient({ + url: `ws://127.0.0.1:${port}`, + token, + }), + `${params.label}: gateway-connect`, + ); + } catch (error) { + const message = String(error); + if (isGatewayLiveProbeTimeout(message)) { + logProgress(`[${params.label}] skip (gateway startup timeout)`); + return; + } + throw error; + } + + if (!server || !client) { + logProgress(`[${params.label}] skip (gateway startup incomplete)`); + return; + } try { logProgress( @@ -709,27 +777,36 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { // Ensure session exists + override model for this run. // Reset between models: avoids cross-provider transcript incompatibilities // (notably OpenAI Responses requiring reasoning replay for function_call items). - await client.request("sessions.reset", { - key: sessionKey, - }); - await client.request("sessions.patch", { - key: sessionKey, - model: modelKey, - }); + await withGatewayLiveProbeTimeout( + client.request("sessions.reset", { + key: sessionKey, + }), + `${progressLabel}: sessions-reset`, + ); + await withGatewayLiveProbeTimeout( + client.request("sessions.patch", { + key: sessionKey, + model: modelKey, + }), + `${progressLabel}: sessions-patch`, + ); logProgress(`${progressLabel}: prompt`); const runId = randomUUID(); - const payload = await client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${runId}`, - message: - "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.", - thinking: params.thinkingLevel, - deliver: false, - }, - { expectFinal: true }, + const payload = await withGatewayLiveProbeTimeout( + client.request( + "agent", + { + sessionKey, + idempotencyKey: `idem-${runId}`, + message: + "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.", + thinking: params.thinkingLevel, + deliver: false, + }, + { expectFinal: true }, + ), + `${progressLabel}: prompt`, ); if (payload?.status !== "ok") { @@ -738,17 +815,20 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { let text = extractPayloadText(payload?.result); if (!text) { logProgress(`${progressLabel}: empty response, retrying`); - const retry = await client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${randomUUID()}-retry`, - message: - "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.", - thinking: params.thinkingLevel, - deliver: false, - }, - { expectFinal: true }, + const retry = await withGatewayLiveProbeTimeout( + client.request( + "agent", + { + sessionKey, + idempotencyKey: `idem-${randomUUID()}-retry`, + message: + "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.", + thinking: params.thinkingLevel, + deliver: false, + }, + { expectFinal: true }, + ), + `${progressLabel}: prompt-retry`, ); if (retry?.status !== "ok") { throw new Error(`agent status=${String(retry?.status)}`); @@ -800,22 +880,25 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { toolReadAttempt += 1 ) { const strictReply = toolReadAttempt > 0; - const toolProbe = await client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${runIdTool}-tool-${toolReadAttempt + 1}`, - message: strictReply - ? "OpenClaw live tool probe (local, safe): " + - `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` + - `Then reply with exactly: ${nonceA} ${nonceB}. No extra text.` - : "OpenClaw live tool probe (local, safe): " + - `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` + - "Then reply with the two nonce values you read (include both).", - thinking: params.thinkingLevel, - deliver: false, - }, - { expectFinal: true }, + const toolProbe = await withGatewayLiveProbeTimeout( + client.request( + "agent", + { + sessionKey, + idempotencyKey: `idem-${runIdTool}-tool-${toolReadAttempt + 1}`, + message: strictReply + ? "OpenClaw live tool probe (local, safe): " + + `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` + + `Then reply with exactly: ${nonceA} ${nonceB}. No extra text.` + : "OpenClaw live tool probe (local, safe): " + + `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` + + "Then reply with the two nonce values you read (include both).", + thinking: params.thinkingLevel, + deliver: false, + }, + { expectFinal: true }, + ), + `${progressLabel}: tool-read`, ); if (toolProbe?.status !== "ok") { if (toolReadAttempt + 1 < maxToolReadAttempts) { @@ -876,26 +959,29 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { execReadAttempt += 1 ) { const strictReply = execReadAttempt > 0; - const execReadProbe = await client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${runIdTool}-exec-read-${execReadAttempt + 1}`, - message: strictReply - ? "OpenClaw live tool probe (local, safe): " + - "use the tool named `exec` (or `Exec`) to run this command: " + - `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` + - `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` + - `Then reply with exactly: ${nonceC}. No extra text.` - : "OpenClaw live tool probe (local, safe): " + - "use the tool named `exec` (or `Exec`) to run this command: " + - `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` + - `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` + - "Finally reply including the nonce text you read back.", - thinking: params.thinkingLevel, - deliver: false, - }, - { expectFinal: true }, + const execReadProbe = await withGatewayLiveProbeTimeout( + client.request( + "agent", + { + sessionKey, + idempotencyKey: `idem-${runIdTool}-exec-read-${execReadAttempt + 1}`, + message: strictReply + ? "OpenClaw live tool probe (local, safe): " + + "use the tool named `exec` (or `Exec`) to run this command: " + + `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` + + `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` + + `Then reply with exactly: ${nonceC}. No extra text.` + : "OpenClaw live tool probe (local, safe): " + + "use the tool named `exec` (or `Exec`) to run this command: " + + `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` + + `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` + + "Finally reply including the nonce text you read back.", + thinking: params.thinkingLevel, + deliver: false, + }, + { expectFinal: true }, + ), + `${progressLabel}: tool-exec`, ); if (execReadProbe?.status !== "ok") { if (execReadAttempt + 1 < maxExecReadAttempts) { @@ -952,26 +1038,29 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { const imageBase64 = renderCatNoncePngBase64(imageCode); const runIdImage = randomUUID(); - const imageProbe = await client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${runIdImage}-image`, - message: - "Look at the attached image. Reply with exactly two tokens separated by a single space: " + - "(1) the animal shown or written in the image, lowercase; " + - "(2) the code printed in the image, uppercase. No extra text.", - attachments: [ - { - mimeType: "image/png", - fileName: `probe-${runIdImage}.png`, - content: imageBase64, - }, - ], - thinking: params.thinkingLevel, - deliver: false, - }, - { expectFinal: true }, + const imageProbe = await withGatewayLiveProbeTimeout( + client.request( + "agent", + { + sessionKey, + idempotencyKey: `idem-${runIdImage}-image`, + message: + "Look at the attached image. Reply with exactly two tokens separated by a single space: " + + "(1) the animal shown or written in the image, lowercase; " + + "(2) the code printed in the image, uppercase. No extra text.", + attachments: [ + { + mimeType: "image/png", + fileName: `probe-${runIdImage}.png`, + content: imageBase64, + }, + ], + thinking: params.thinkingLevel, + deliver: false, + }, + { expectFinal: true }, + ), + `${progressLabel}: image`, ); // Best-effort: do not fail the whole live suite on flaky image handling. // (We still keep prompt + tool probes as hard checks.) @@ -1017,16 +1106,19 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { ) { logProgress(`${progressLabel}: tool-only regression`); const runId2 = randomUUID(); - const first = await client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${runId2}-1`, - message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`, - thinking: params.thinkingLevel, - deliver: false, - }, - { expectFinal: true }, + const first = await withGatewayLiveProbeTimeout( + client.request( + "agent", + { + sessionKey, + idempotencyKey: `idem-${runId2}-1`, + message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`, + thinking: params.thinkingLevel, + deliver: false, + }, + { expectFinal: true }, + ), + `${progressLabel}: tool-only-regression-first`, ); if (first?.status !== "ok") { throw new Error(`tool-only turn failed: status=${String(first?.status)}`); @@ -1039,16 +1131,19 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { label: params.label, }); - const second = await client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${runId2}-2`, - message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`, - thinking: params.thinkingLevel, - deliver: false, - }, - { expectFinal: true }, + const second = await withGatewayLiveProbeTimeout( + client.request( + "agent", + { + sessionKey, + idempotencyKey: `idem-${runId2}-2`, + message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`, + thinking: params.thinkingLevel, + deliver: false, + }, + { expectFinal: true }, + ), + `${progressLabel}: tool-only-regression-second`, ); if (second?.status !== "ok") { throw new Error(`post-tool message failed: status=${String(second?.status)}`); @@ -1118,6 +1213,19 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { logProgress(`${progressLabel}: skip (provider unavailable)`); break; } + if ( + model.provider === "anthropic" && + isGatewayLiveProbeTimeout(message) && + attempt + 1 < attemptMax + ) { + logProgress(`${progressLabel}: probe timeout, retrying with next key`); + continue; + } + if (isGatewayLiveProbeTimeout(message)) { + skippedCount += 1; + logProgress(`${progressLabel}: skip (probe timeout)`); + break; + } // OpenAI Codex refresh tokens can become single-use; skip instead of failing all live tests. if (model.provider === "openai-codex" && isRefreshTokenReused(message)) { logProgress(`${progressLabel}: skip (codex refresh token reused)`); @@ -1148,6 +1256,11 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { logProgress(`${progressLabel}: skip (tool probe refusal)`); break; } + if (model.provider === "anthropic" && isToolNonceProbeMiss(message)) { + skippedCount += 1; + logProgress(`${progressLabel}: skip (anthropic tool probe nonce miss)`); + break; + } if (isMissingProfileError(message)) { skippedCount += 1; logProgress(`${progressLabel}: skip (missing auth profile)`); @@ -1222,26 +1335,26 @@ describeLive("gateway live (dev agent, profile keys)", () => { ? all.filter((m) => filter.has(`${m.provider}/${m.id}`)) : all.filter((m) => isModernModelRef({ provider: m.provider, id: m.id })); + const providerProfileCache = new Map(); const candidates: Array> = []; for (const model of wanted) { if (PROVIDERS && !PROVIDERS.has(model.provider)) { continue; } - try { - // eslint-disable-next-line no-await-in-loop - const apiKeyInfo = await getApiKeyForModel({ - model, + let hasProfile = providerProfileCache.get(model.provider); + if (hasProfile === undefined) { + const order = resolveAuthProfileOrder({ cfg, store: authStore, - agentDir, + provider: model.provider, }); - if (!apiKeyInfo.source.startsWith("profile:")) { - continue; - } - candidates.push(model); - } catch { - // no creds; skip + hasProfile = order.some((profileId) => Boolean(authStore.profiles[profileId])); + providerProfileCache.set(model.provider, hasProfile); } + if (!hasProfile) { + continue; + } + candidates.push(model); } if (candidates.length === 0) { @@ -1348,42 +1461,76 @@ describeLive("gateway live (dev agent, profile keys)", () => { const toolProbePath = path.join(workspaceDir, `.openclaw-live-zai-fallback.${nonceA}.txt`); await fs.writeFile(toolProbePath, `nonceA=${nonceA}\nnonceB=${nonceB}\n`); - const port = await getFreeGatewayPort(); - const server = await startGatewayServer(port, { - bind: "loopback", - auth: { mode: "token", token }, - controlUiEnabled: false, - }); + let server: Awaited> | undefined; + let client: GatewayClient | undefined; + try { + const port = await withGatewayLiveProbeTimeout( + getFreeGatewayPort(), + "zai-fallback: gateway-port", + ); + server = await withGatewayLiveProbeTimeout( + startGatewayServer(port, { + bind: "loopback", + auth: { mode: "token", token }, + controlUiEnabled: false, + }), + "zai-fallback: gateway-start", + ); - const client = await connectClient({ - url: `ws://127.0.0.1:${port}`, - token, - }); + client = await withGatewayLiveProbeTimeout( + connectClient({ + url: `ws://127.0.0.1:${port}`, + token, + }), + "zai-fallback: gateway-connect", + ); + } catch (error) { + const message = String(error); + if (isGatewayLiveProbeTimeout(message)) { + logProgress("[zai-fallback] skip (gateway startup timeout)"); + return; + } + throw error; + } + + if (!server || !client) { + logProgress("[zai-fallback] skip (gateway startup incomplete)"); + return; + } try { const sessionKey = `agent:${agentId}:live-zai-fallback`; - await client.request("sessions.patch", { - key: sessionKey, - model: "anthropic/claude-opus-4-5", - }); - await client.request("sessions.reset", { - key: sessionKey, - }); + await withGatewayLiveProbeTimeout( + client.request("sessions.patch", { + key: sessionKey, + model: "anthropic/claude-opus-4-5", + }), + "zai-fallback: sessions-patch-anthropic", + ); + await withGatewayLiveProbeTimeout( + client.request("sessions.reset", { + key: sessionKey, + }), + "zai-fallback: sessions-reset", + ); const runId = randomUUID(); - const toolProbe = await client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${runId}-tool`, - message: - `Call the tool named \`read\` (or \`Read\` if \`read\` is unavailable) with JSON arguments {"path":"${toolProbePath}"}. ` + - `Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`, - thinking: THINKING_LEVEL, - deliver: false, - }, - { expectFinal: true }, + const toolProbe = await withGatewayLiveProbeTimeout( + client.request( + "agent", + { + sessionKey, + idempotencyKey: `idem-${runId}-tool`, + message: + `Call the tool named \`read\` (or \`Read\` if \`read\` is unavailable) with JSON arguments {"path":"${toolProbePath}"}. ` + + `Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`, + thinking: THINKING_LEVEL, + deliver: false, + }, + { expectFinal: true }, + ), + "zai-fallback: tool-probe", ); if (toolProbe?.status !== "ok") { throw new Error(`anthropic tool probe failed: status=${String(toolProbe?.status)}`); @@ -1399,24 +1546,30 @@ describeLive("gateway live (dev agent, profile keys)", () => { throw new Error(`anthropic tool probe missing nonce: ${toolText}`); } - await client.request("sessions.patch", { - key: sessionKey, - model: "zai/glm-4.7", - }); + await withGatewayLiveProbeTimeout( + client.request("sessions.patch", { + key: sessionKey, + model: "zai/glm-4.7", + }), + "zai-fallback: sessions-patch-zai", + ); const followupId = randomUUID(); - const followup = await client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${followupId}-followup`, - message: - `What are the values of nonceA and nonceB in "${toolProbePath}"? ` + - `Reply with exactly: ${nonceA} ${nonceB}.`, - thinking: THINKING_LEVEL, - deliver: false, - }, - { expectFinal: true }, + const followup = await withGatewayLiveProbeTimeout( + client.request( + "agent", + { + sessionKey, + idempotencyKey: `idem-${followupId}-followup`, + message: + `What are the values of nonceA and nonceB in "${toolProbePath}"? ` + + `Reply with exactly: ${nonceA} ${nonceB}.`, + thinking: THINKING_LEVEL, + deliver: false, + }, + { expectFinal: true }, + ), + "zai-fallback: followup", ); if (followup?.status !== "ok") { throw new Error(`zai followup failed: status=${String(followup?.status)}`);