diff --git a/docs/help/testing.md b/docs/help/testing.md index 870e1daaeb7..78f4e2fee2c 100644 --- a/docs/help/testing.md +++ b/docs/help/testing.md @@ -498,7 +498,15 @@ If you want to rely on env keys (e.g. exported in your `~/.profile`), run local These Docker runners split into two buckets: -- Live-model runners: `test:docker:live-models` and `test:docker:live-gateway` run `pnpm test:live` inside the repo Docker image, mounting your local config dir and workspace (and sourcing `~/.profile` if mounted). +- Live-model runners: `test:docker:live-models` and `test:docker:live-gateway` run only their matching profile-key live file inside the repo Docker image (`src/agents/models.profiles.live.test.ts` and `src/gateway/gateway-models.profiles.live.test.ts`), mounting your local config dir and workspace (and sourcing `~/.profile` if mounted). The matching local entrypoints are `test:live:models-profiles` and `test:live:gateway-profiles`. +- Docker live runners default to a smaller smoke cap so a full Docker sweep stays practical: + `test:docker:live-models` defaults to `OPENCLAW_LIVE_MAX_MODELS=12`, and + `test:docker:live-gateway` defaults to `OPENCLAW_LIVE_GATEWAY_SMOKE=1`, + `OPENCLAW_LIVE_GATEWAY_MAX_MODELS=8`, + `OPENCLAW_LIVE_GATEWAY_STEP_TIMEOUT_MS=45000`, and + `OPENCLAW_LIVE_GATEWAY_MODEL_TIMEOUT_MS=90000`. Override those env vars when you + explicitly want the larger exhaustive scan. +- `test:docker:all` builds the live Docker image once via `test:docker:live-build`, then reuses it for the two live Docker lanes. - Container smoke runners: `test:docker:openwebui`, `test:docker:onboard`, `test:docker:gateway-network`, `test:docker:mcp-channels`, and `test:docker:plugins` boot one or more real containers and verify higher-level integration paths. The live-model Docker runners also bind-mount only the needed CLI auth homes (or all supported ones when the run is not narrowed), then copy them into the container home before the run so external-CLI OAuth can refresh tokens without mutating the host auth store: diff --git a/package.json b/package.json index f51b6b24775..d3eb1d6d967 100644 --- a/package.json +++ b/package.json @@ -1155,11 +1155,12 @@ "test:contracts:plugins": "OPENCLAW_TEST_PROFILE=serial pnpm exec vitest run --config vitest.contracts.config.ts src/plugins/contracts", "test:coverage": "vitest run --config vitest.unit.config.ts --coverage", "test:coverage:changed": "vitest run --config vitest.unit.config.ts --coverage --changed origin/main", - "test:docker:all": "pnpm test:docker:live-models && pnpm test:docker:live-gateway && pnpm test:docker:openwebui && pnpm test:docker:onboard && pnpm test:docker:gateway-network && pnpm test:docker:mcp-channels && pnpm test:docker:qr && pnpm test:docker:doctor-switch && pnpm test:docker:plugins && pnpm test:docker:cleanup", + "test:docker:all": "pnpm test:docker:live-build && OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-models && OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-gateway && pnpm test:docker:openwebui && pnpm test:docker:onboard && pnpm test:docker:gateway-network && pnpm test:docker:mcp-channels && pnpm test:docker:qr && pnpm test:docker:doctor-switch && pnpm test:docker:plugins && pnpm test:docker:cleanup", "test:docker:cleanup": "bash scripts/test-cleanup-docker.sh", "test:docker:doctor-switch": "bash scripts/e2e/doctor-install-switch-docker.sh", "test:docker:gateway-network": "bash scripts/e2e/gateway-network-docker.sh", "test:docker:live-acp-bind": "bash scripts/test-live-acp-bind-docker.sh", + "test:docker:live-build": "bash scripts/test-live-build-docker.sh", "test:docker:live-cli-backend": "bash scripts/test-live-cli-backend-docker.sh", "test:docker:live-gateway": "bash scripts/test-live-gateway-models-docker.sh", "test:docker:live-models": "bash scripts/test-live-models-docker.sh", @@ -1182,6 +1183,8 @@ "test:install:e2e:openai": "OPENCLAW_E2E_MODELS=openai bash scripts/test-install-sh-e2e-docker.sh", "test:install:smoke": "bash scripts/test-install-sh-docker.sh", "test:live": "node scripts/test-live.mjs", + "test:live:gateway-profiles": "node scripts/test-live.mjs -- src/gateway/gateway-models.profiles.live.test.ts", + "test:live:models-profiles": "node scripts/test-live.mjs -- src/agents/models.profiles.live.test.ts", "test:max": "node scripts/test-parallel.mjs --profile max", "test:parallels:linux": "bash scripts/e2e/parallels-linux-smoke.sh", "test:parallels:macos": "bash scripts/e2e/parallels-macos-smoke.sh", diff --git a/scripts/e2e/plugins-docker.sh b/scripts/e2e/plugins-docker.sh index 6c974755144..814cf732794 100755 --- a/scripts/e2e/plugins-docker.sh +++ b/scripts/e2e/plugins-docker.sh @@ -282,11 +282,13 @@ cat > "$demo_plugin_root/openclaw.plugin.json" <<'JSON' JSON node "$OPENCLAW_ENTRY" plugins list --json > /tmp/plugins.json +node "$OPENCLAW_ENTRY" plugins inspect demo-plugin --json > /tmp/plugins-inspect.json node - <<'NODE' const fs = require("node:fs"); const data = JSON.parse(fs.readFileSync("/tmp/plugins.json", "utf8")); +const inspect = JSON.parse(fs.readFileSync("/tmp/plugins-inspect.json", "utf8")); const plugin = (data.plugins || []).find((entry) => entry.id === "demo-plugin"); if (!plugin) throw new Error("plugin not found"); if (plugin.status !== "loaded") { @@ -299,10 +301,13 @@ const assertIncludes = (list, value, label) => { } }; -assertIncludes(plugin.toolNames, "demo_tool", "tool"); -assertIncludes(plugin.gatewayMethods, "demo.ping", "gateway method"); -assertIncludes(plugin.cliCommands, "demo", "cli command"); -assertIncludes(plugin.services, "demo-service", "service"); +const inspectToolNames = Array.isArray(inspect.tools) + ? inspect.tools.flatMap((entry) => (Array.isArray(entry?.names) ? entry.names : [])) + : []; +assertIncludes(inspectToolNames, "demo_tool", "tool"); +assertIncludes(inspect.gatewayMethods, "demo.ping", "gateway method"); +assertIncludes(inspect.cliCommands, "demo", "cli command"); +assertIncludes(inspect.services, "demo-service", "service"); const diagErrors = (data.diagnostics || []).filter((diag) => diag.level === "error"); if (diagErrors.length > 0) { @@ -344,17 +349,19 @@ tar -czf /tmp/demo-plugin-tgz.tgz -C "$pack_dir" package node "$OPENCLAW_ENTRY" plugins install /tmp/demo-plugin-tgz.tgz node "$OPENCLAW_ENTRY" plugins list --json > /tmp/plugins2.json +node "$OPENCLAW_ENTRY" plugins inspect demo-plugin-tgz --json > /tmp/plugins2-inspect.json node - <<'NODE' const fs = require("node:fs"); const data = JSON.parse(fs.readFileSync("/tmp/plugins2.json", "utf8")); +const inspect = JSON.parse(fs.readFileSync("/tmp/plugins2-inspect.json", "utf8")); const plugin = (data.plugins || []).find((entry) => entry.id === "demo-plugin-tgz"); if (!plugin) throw new Error("tgz plugin not found"); if (plugin.status !== "loaded") { throw new Error(`unexpected status: ${plugin.status}`); } -if (!Array.isArray(plugin.gatewayMethods) || !plugin.gatewayMethods.includes("demo.tgz")) { +if (!Array.isArray(inspect.gatewayMethods) || !inspect.gatewayMethods.includes("demo.tgz")) { throw new Error("expected gateway method demo.tgz"); } console.log("ok"); @@ -390,17 +397,19 @@ JSON node "$OPENCLAW_ENTRY" plugins install "$dir_plugin" node "$OPENCLAW_ENTRY" plugins list --json > /tmp/plugins3.json +node "$OPENCLAW_ENTRY" plugins inspect demo-plugin-dir --json > /tmp/plugins3-inspect.json node - <<'NODE' const fs = require("node:fs"); const data = JSON.parse(fs.readFileSync("/tmp/plugins3.json", "utf8")); +const inspect = JSON.parse(fs.readFileSync("/tmp/plugins3-inspect.json", "utf8")); const plugin = (data.plugins || []).find((entry) => entry.id === "demo-plugin-dir"); if (!plugin) throw new Error("dir plugin not found"); if (plugin.status !== "loaded") { throw new Error(`unexpected status: ${plugin.status}`); } -if (!Array.isArray(plugin.gatewayMethods) || !plugin.gatewayMethods.includes("demo.dir")) { +if (!Array.isArray(inspect.gatewayMethods) || !inspect.gatewayMethods.includes("demo.dir")) { throw new Error("expected gateway method demo.dir"); } console.log("ok"); @@ -437,17 +446,19 @@ JSON node "$OPENCLAW_ENTRY" plugins install "file:$file_pack_dir/package" node "$OPENCLAW_ENTRY" plugins list --json > /tmp/plugins4.json +node "$OPENCLAW_ENTRY" plugins inspect demo-plugin-file --json > /tmp/plugins4-inspect.json node - <<'NODE' const fs = require("node:fs"); const data = JSON.parse(fs.readFileSync("/tmp/plugins4.json", "utf8")); +const inspect = JSON.parse(fs.readFileSync("/tmp/plugins4-inspect.json", "utf8")); const plugin = (data.plugins || []).find((entry) => entry.id === "demo-plugin-file"); if (!plugin) throw new Error("file plugin not found"); if (plugin.status !== "loaded") { throw new Error(`unexpected status: ${plugin.status}`); } -if (!Array.isArray(plugin.gatewayMethods) || !plugin.gatewayMethods.includes("demo.file")) { +if (!Array.isArray(inspect.gatewayMethods) || !inspect.gatewayMethods.includes("demo.file")) { throw new Error("expected gateway method demo.file"); } console.log("ok"); @@ -704,11 +715,19 @@ NODE node "$OPENCLAW_ENTRY" plugins install marketplace-shortcut@claude-fixtures node "$OPENCLAW_ENTRY" plugins install marketplace-direct --marketplace claude-fixtures node "$OPENCLAW_ENTRY" plugins list --json > /tmp/plugins-marketplace.json +node "$OPENCLAW_ENTRY" plugins inspect marketplace-shortcut --json > /tmp/plugins-marketplace-shortcut-inspect.json +node "$OPENCLAW_ENTRY" plugins inspect marketplace-direct --json > /tmp/plugins-marketplace-direct-inspect.json node - <<'NODE' const fs = require("node:fs"); const data = JSON.parse(fs.readFileSync("/tmp/plugins-marketplace.json", "utf8")); +const shortcutInspect = JSON.parse( + fs.readFileSync("/tmp/plugins-marketplace-shortcut-inspect.json", "utf8"), +); +const directInspect = JSON.parse( + fs.readFileSync("/tmp/plugins-marketplace-direct-inspect.json", "utf8"), +); const getPlugin = (id) => { const plugin = (data.plugins || []).find((entry) => entry.id === id); if (!plugin) throw new Error(`plugin not found: ${id}`); @@ -726,10 +745,10 @@ if (shortcut.version !== "0.0.1") { if (direct.version !== "0.0.1") { throw new Error(`unexpected direct version: ${direct.version}`); } -if (!shortcut.gatewayMethods.includes("demo.marketplace.shortcut.v1")) { +if (!shortcutInspect.gatewayMethods.includes("demo.marketplace.shortcut.v1")) { throw new Error("expected marketplace shortcut gateway method"); } -if (!direct.gatewayMethods.includes("demo.marketplace.direct.v1")) { +if (!directInspect.gatewayMethods.includes("demo.marketplace.direct.v1")) { throw new Error("expected marketplace direct gateway method"); } console.log("ok"); @@ -766,18 +785,20 @@ write_fixture_plugin \ node "$OPENCLAW_ENTRY" plugins update marketplace-shortcut --dry-run node "$OPENCLAW_ENTRY" plugins update marketplace-shortcut node "$OPENCLAW_ENTRY" plugins list --json > /tmp/plugins-marketplace-updated.json +node "$OPENCLAW_ENTRY" plugins inspect marketplace-shortcut --json > /tmp/plugins-marketplace-updated-inspect.json node - <<'NODE' const fs = require("node:fs"); const data = JSON.parse(fs.readFileSync("/tmp/plugins-marketplace-updated.json", "utf8")); +const inspect = JSON.parse(fs.readFileSync("/tmp/plugins-marketplace-updated-inspect.json", "utf8")); const plugin = (data.plugins || []).find((entry) => entry.id === "marketplace-shortcut"); if (!plugin) throw new Error("updated marketplace plugin not found"); if (plugin.version !== "0.0.2") { throw new Error(`unexpected updated version: ${plugin.version}`); } -if (!plugin.gatewayMethods.includes("demo.marketplace.shortcut.v2")) { - throw new Error(`expected updated gateway method, got ${plugin.gatewayMethods.join(", ")}`); +if (!inspect.gatewayMethods.includes("demo.marketplace.shortcut.v2")) { + throw new Error(`expected updated gateway method, got ${inspect.gatewayMethods.join(", ")}`); } console.log("ok"); NODE diff --git a/scripts/test-live-build-docker.sh b/scripts/test-live-build-docker.sh new file mode 100755 index 00000000000..af8f815ca4b --- /dev/null +++ b/scripts/test-live-build-docker.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +IMAGE_NAME="${OPENCLAW_IMAGE:-openclaw:local}" +LIVE_IMAGE_NAME="${OPENCLAW_LIVE_IMAGE:-${IMAGE_NAME}-live}" + +if [[ "${OPENCLAW_SKIP_DOCKER_BUILD:-}" == "1" ]]; then + echo "==> Reuse live-test image: $LIVE_IMAGE_NAME" + exit 0 +fi + +echo "==> Build live-test image: $LIVE_IMAGE_NAME (target=build)" +docker build --target build -t "$LIVE_IMAGE_NAME" -f "$ROOT_DIR/Dockerfile" "$ROOT_DIR" diff --git a/scripts/test-live-gateway-models-docker.sh b/scripts/test-live-gateway-models-docker.sh index 98c32fe0940..e3dc63d037a 100755 --- a/scripts/test-live-gateway-models-docker.sh +++ b/scripts/test-live-gateway-models-docker.sh @@ -115,13 +115,13 @@ elif [ -d /app/dist/extensions ]; then export OPENCLAW_BUNDLED_PLUGINS_DIR=/app/dist/extensions fi cd "$tmp_dir" -pnpm test:live +pnpm test:live:gateway-profiles EOF -echo "==> Build live-test image: $LIVE_IMAGE_NAME (target=build)" -docker build --target build -t "$LIVE_IMAGE_NAME" -f "$ROOT_DIR/Dockerfile" "$ROOT_DIR" +"$ROOT_DIR/scripts/test-live-build-docker.sh" echo "==> Run gateway live model tests (profile keys)" +echo "==> Target: src/gateway/gateway-models.profiles.live.test.ts" echo "==> External auth dirs: ${AUTH_DIRS_CSV:-none}" echo "==> External auth files: ${AUTH_FILES_CSV:-none}" docker run --rm -t \ @@ -135,8 +135,10 @@ docker run --rm -t \ -e OPENCLAW_LIVE_TEST=1 \ -e OPENCLAW_LIVE_GATEWAY_MODELS="${OPENCLAW_LIVE_GATEWAY_MODELS:-modern}" \ -e OPENCLAW_LIVE_GATEWAY_PROVIDERS="${OPENCLAW_LIVE_GATEWAY_PROVIDERS:-}" \ - -e OPENCLAW_LIVE_GATEWAY_MAX_MODELS="${OPENCLAW_LIVE_GATEWAY_MAX_MODELS:-24}" \ - -e OPENCLAW_LIVE_GATEWAY_MODEL_TIMEOUT_MS="${OPENCLAW_LIVE_GATEWAY_MODEL_TIMEOUT_MS:-}" \ + -e OPENCLAW_LIVE_GATEWAY_SMOKE="${OPENCLAW_LIVE_GATEWAY_SMOKE:-1}" \ + -e OPENCLAW_LIVE_GATEWAY_MAX_MODELS="${OPENCLAW_LIVE_GATEWAY_MAX_MODELS:-8}" \ + -e OPENCLAW_LIVE_GATEWAY_STEP_TIMEOUT_MS="${OPENCLAW_LIVE_GATEWAY_STEP_TIMEOUT_MS:-45000}" \ + -e OPENCLAW_LIVE_GATEWAY_MODEL_TIMEOUT_MS="${OPENCLAW_LIVE_GATEWAY_MODEL_TIMEOUT_MS:-90000}" \ -v "$ROOT_DIR":/src:ro \ -v "$CONFIG_DIR":/home/node/.openclaw \ -v "$WORKSPACE_DIR":/home/node/.openclaw/workspace \ diff --git a/scripts/test-live-models-docker.sh b/scripts/test-live-models-docker.sh index e432847a341..7dba7ff677e 100755 --- a/scripts/test-live-models-docker.sh +++ b/scripts/test-live-models-docker.sh @@ -125,13 +125,13 @@ elif [ -d /app/dist/extensions ]; then export OPENCLAW_BUNDLED_PLUGINS_DIR=/app/dist/extensions fi cd "$tmp_dir" -pnpm test:live +pnpm test:live:models-profiles EOF -echo "==> Build live-test image: $LIVE_IMAGE_NAME (target=build)" -docker build --target build -t "$LIVE_IMAGE_NAME" -f "$ROOT_DIR/Dockerfile" "$ROOT_DIR" +"$ROOT_DIR/scripts/test-live-build-docker.sh" echo "==> Run live model tests (profile keys)" +echo "==> Target: src/agents/models.profiles.live.test.ts" echo "==> External auth dirs: ${AUTH_DIRS_CSV:-none}" echo "==> External auth files: ${AUTH_FILES_CSV:-none}" docker run --rm -t \ @@ -145,7 +145,7 @@ docker run --rm -t \ -e OPENCLAW_LIVE_TEST=1 \ -e OPENCLAW_LIVE_MODELS="${OPENCLAW_LIVE_MODELS:-modern}" \ -e OPENCLAW_LIVE_PROVIDERS="${OPENCLAW_LIVE_PROVIDERS:-}" \ - -e OPENCLAW_LIVE_MAX_MODELS="${OPENCLAW_LIVE_MAX_MODELS:-48}" \ + -e OPENCLAW_LIVE_MAX_MODELS="${OPENCLAW_LIVE_MAX_MODELS:-12}" \ -e OPENCLAW_LIVE_MODEL_TIMEOUT_MS="${OPENCLAW_LIVE_MODEL_TIMEOUT_MS:-}" \ -e OPENCLAW_LIVE_REQUIRE_PROFILE_KEYS="${OPENCLAW_LIVE_REQUIRE_PROFILE_KEYS:-}" \ -e OPENCLAW_LIVE_GATEWAY_MODELS="${OPENCLAW_LIVE_GATEWAY_MODELS:-}" \ diff --git a/src/gateway/gateway-models.profiles.live.test.ts b/src/gateway/gateway-models.profiles.live.test.ts index c1bde246dc4..8a41037bdc7 100644 --- a/src/gateway/gateway-models.profiles.live.test.ts +++ b/src/gateway/gateway-models.profiles.live.test.ts @@ -46,7 +46,10 @@ import { loadSessionEntry, readSessionMessages } from "./session-utils.js"; const ZAI_FALLBACK = isTruthyEnvValue(process.env.OPENCLAW_LIVE_GATEWAY_ZAI_FALLBACK); const REQUIRE_PROFILE_KEYS = isLiveProfileKeyModeEnabled(); const PROVIDERS = parseFilter(process.env.OPENCLAW_LIVE_GATEWAY_PROVIDERS); -const THINKING_LEVEL = "high"; +const GATEWAY_LIVE_SMOKE = isTruthyEnvValue(process.env.OPENCLAW_LIVE_GATEWAY_SMOKE); +const THINKING_LEVEL = GATEWAY_LIVE_SMOKE ? "low" : "high"; +const ENABLE_EXTRA_TOOL_PROBES = !GATEWAY_LIVE_SMOKE; +const ENABLE_EXTRA_IMAGE_PROBES = !GATEWAY_LIVE_SMOKE; const THINKING_TAG_RE = /<\s*\/?\s*(?:think(?:ing)?|thought|antthinking)\s*>/i; const FINAL_TAG_RE = /<\s*\/?\s*final\s*>/i; const ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL = "ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL"; @@ -57,6 +60,7 @@ const GATEWAY_LIVE_PROBE_TIMEOUT_MS = Math.max( 30_000, toInt(process.env.OPENCLAW_LIVE_GATEWAY_STEP_TIMEOUT_MS, 90_000), ); +const GATEWAY_LIVE_MODEL_TIMEOUT_MS = resolveGatewayLiveModelTimeoutMs(); const GATEWAY_LIVE_HEARTBEAT_MS = Math.max( 1_000, toInt(process.env.OPENCLAW_LIVE_GATEWAY_HEARTBEAT_MS, 30_000), @@ -90,6 +94,28 @@ function parseFilter(raw?: string): Set | null { return ids.length ? new Set(ids) : null; } +function shouldSuppressGatewayLiveOllamaWarnings(): boolean { + return PROVIDERS !== null && !PROVIDERS.has("ollama"); +} + +async function withSuppressedGatewayLiveWarnings(run: () => Promise): Promise { + if (!shouldSuppressGatewayLiveOllamaWarnings()) { + return await run(); + } + const originalWarn = console.warn; + console.warn = (...args: unknown[]) => { + if (args.some((arg) => typeof arg === "string" && isOllamaUnavailableErrorMessage(arg))) { + return; + } + originalWarn(...args); + }; + try { + return await run(); + } finally { + console.warn = originalWarn; + } +} + function toInt(value: string | undefined, fallback: number): number { const trimmed = value?.trim(); if (!trimmed) { @@ -120,28 +146,50 @@ function resolveGatewayLiveSuiteTimeoutMs(maxModels: number): number { ); } +function resolveGatewayLiveModelTimeoutMs( + gatewayModelTimeoutRaw = process.env.OPENCLAW_LIVE_GATEWAY_MODEL_TIMEOUT_MS, + liveModelTimeoutRaw = process.env.OPENCLAW_LIVE_MODEL_TIMEOUT_MS, + stepTimeoutMs = GATEWAY_LIVE_PROBE_TIMEOUT_MS, +): number { + const requested = toInt(gatewayModelTimeoutRaw, toInt(liveModelTimeoutRaw, 120_000)); + return Math.max(stepTimeoutMs, requested); +} + function isGatewayLiveProbeTimeout(error: string): boolean { return /probe timeout after \d+ms/i.test(error); } -async function withGatewayLiveProbeTimeout(operation: Promise, context: string): Promise { +function isGatewayLiveModelTimeout(error: string): boolean { + return /model timeout after \d+ms/i.test(error); +} + +async function withGatewayLiveTimeout(params: { + operation: Promise; + timeoutMs: number; + timeoutLabel: "probe" | "model"; + context: string; +}): Promise { let timeoutHandle: ReturnType | undefined; const startedAt = Date.now(); let heartbeatCount = 0; const heartbeat = setInterval(() => { heartbeatCount += 1; logProgress( - `${context}: still running (${Math.max(1, Math.round((Date.now() - startedAt) / 1_000))}s)`, + `${params.context}: still running (${Math.max(1, Math.round((Date.now() - startedAt) / 1_000))}s)`, ); }, GATEWAY_LIVE_HEARTBEAT_MS); heartbeat.unref?.(); try { return await Promise.race([ - operation, + params.operation, new Promise((_, reject) => { timeoutHandle = setTimeout(() => { - reject(new Error(`probe timeout after ${GATEWAY_LIVE_PROBE_TIMEOUT_MS}ms (${context})`)); - }, GATEWAY_LIVE_PROBE_TIMEOUT_MS); + reject( + new Error( + `${params.timeoutLabel} timeout after ${params.timeoutMs}ms (${params.context})`, + ), + ); + }, params.timeoutMs); }), ]); } finally { @@ -151,12 +199,30 @@ async function withGatewayLiveProbeTimeout(operation: Promise, context: st } if (heartbeatCount > 0) { logProgress( - `${context}: completed after ${Math.max(1, Math.round((Date.now() - startedAt) / 1_000))}s`, + `${params.context}: completed after ${Math.max(1, Math.round((Date.now() - startedAt) / 1_000))}s`, ); } } } +async function withGatewayLiveProbeTimeout(operation: Promise, context: string): Promise { + return await withGatewayLiveTimeout({ + operation, + timeoutMs: GATEWAY_LIVE_PROBE_TIMEOUT_MS, + timeoutLabel: "probe", + context, + }); +} + +async function withGatewayLiveModelTimeout(operation: Promise, context: string): Promise { + return await withGatewayLiveTimeout({ + operation, + timeoutMs: GATEWAY_LIVE_MODEL_TIMEOUT_MS, + timeoutLabel: "model", + context, + }); +} + function capByProviderSpread( items: T[], maxItems: number, @@ -422,6 +488,20 @@ describe("shouldSkipExecReadNonceMissForLiveModel", () => { }); }); +describe("resolveGatewayLiveModelTimeoutMs", () => { + it("prefers gateway-specific timeout when provided", () => { + expect(resolveGatewayLiveModelTimeoutMs("180000", "45000", 90_000)).toBe(180_000); + }); + + it("falls back to the shared live timeout", () => { + expect(resolveGatewayLiveModelTimeoutMs("", "45000", 30_000)).toBe(45_000); + }); + + it("never goes below the probe timeout", () => { + expect(resolveGatewayLiveModelTimeoutMs("45000", undefined, 90_000)).toBe(90_000); + }); +}); + function isGoogleModelNotFoundText(text: string): boolean { const trimmed = text.trim(); if (!trimmed) { @@ -1133,7 +1213,7 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { `[${params.label}] running ${params.candidates.length} models (thinking=${params.thinkingLevel})`, ); logProgress( - `[${params.label}] heartbeat=${Math.max(1, Math.round(GATEWAY_LIVE_HEARTBEAT_MS / 1_000))}s probe-timeout=${Math.max(1, Math.round(GATEWAY_LIVE_PROBE_TIMEOUT_MS / 1_000))}s`, + `[${params.label}] heartbeat=${Math.max(1, Math.round(GATEWAY_LIVE_HEARTBEAT_MS / 1_000))}s probe-timeout=${Math.max(1, Math.round(GATEWAY_LIVE_PROBE_TIMEOUT_MS / 1_000))}s model-timeout=${Math.max(1, Math.round(GATEWAY_LIVE_MODEL_TIMEOUT_MS / 1_000))}s`, ); const anthropicKeys = collectAnthropicApiKeys(); if (anthropicKeys.length > 0) { @@ -1157,347 +1237,355 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { process.env.ANTHROPIC_API_KEY = anthropicKeys[attempt]; } try { - // Ensure session exists + override model for this run. - // Reset between models: avoids cross-provider transcript incompatibilities - // (notably OpenAI Responses requiring reasoning replay for function_call items). - await withGatewayLiveProbeTimeout( - client.request("sessions.reset", { - key: sessionKey, - }), - `${progressLabel}: sessions-reset`, - ); - await withGatewayLiveProbeTimeout( - client.request("sessions.patch", { - key: sessionKey, - model: modelKey, - }), - `${progressLabel}: sessions-patch`, - ); - - logProgress(`${progressLabel}: prompt`); - let text = await requestGatewayAgentText({ - client, - sessionKey, - idempotencyKey: `idem-${randomUUID()}`, - modelKey, - message: - "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.", - thinkingLevel: params.thinkingLevel, - context: `${progressLabel}: prompt`, - }); - if (!text) { - logProgress(`${progressLabel}: empty response, retrying`); - text = await requestGatewayAgentText({ - client, - sessionKey, - idempotencyKey: `idem-${randomUUID()}-retry`, - modelKey, - message: - "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.", - thinkingLevel: params.thinkingLevel, - context: `${progressLabel}: prompt-retry`, - }); - } - if ( - !text && - shouldSkipEmptyResponseForLiveModel({ - provider: model.provider, - allowNotFoundSkip: params.allowNotFoundSkip, - }) - ) { - logProgress(`${progressLabel}: skip (${model.provider} empty response)`); - break; - } - if ( - isEmptyStreamText(text) && - shouldSkipEmptyResponseForLiveModel({ - provider: model.provider, - allowNotFoundSkip: params.allowNotFoundSkip, - }) - ) { - logProgress(`${progressLabel}: skip (${model.provider} empty response)`); - break; - } - if (isGoogleishProvider(model.provider) && isGoogleModelNotFoundText(text)) { - // Catalog drift: model IDs can disappear or become unavailable on the API. - // Treat as skip when scanning "all models" for Google. - logProgress(`${progressLabel}: skip (google model not found)`); - break; - } - if (params.allowNotFoundSkip && isModelNotFoundErrorMessage(text)) { - logProgress(`${progressLabel}: skip (model not found)`); - break; - } - assertNoReasoningTags({ - text, - model: modelKey, - phase: "prompt", - label: params.label, - }); - if (!isMeaningful(text)) { - if (isGoogleishProvider(model.provider) && /gemini/i.test(model.id)) { - logProgress(`${progressLabel}: skip (google not meaningful)`); - break; - } - throw new Error(`not meaningful: ${text}`); - } - if (!/\bmicro\s*-?\s*tasks?\b/i.test(text) || !/\bmacro\s*-?\s*tasks?\b/i.test(text)) { - throw new Error(`missing required keywords: ${text}`); - } - - // Real tool invocation: force the agent to Read a local file and echo a nonce. - logProgress(`${progressLabel}: tool-read`); - const runIdTool = randomUUID(); - const maxToolReadAttempts = 3; - let toolText = ""; - for ( - let toolReadAttempt = 0; - toolReadAttempt < maxToolReadAttempts; - toolReadAttempt += 1 - ) { - const strictReply = toolReadAttempt > 0; - toolText = await requestGatewayAgentText({ - client, - sessionKey, - idempotencyKey: `idem-${runIdTool}-tool-${toolReadAttempt + 1}`, - modelKey, - message: strictReply - ? "OpenClaw live tool probe (local, safe): " + - `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` + - `Then reply with exactly: ${nonceA} ${nonceB}. No extra text.` - : "OpenClaw live tool probe (local, safe): " + - `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` + - "Then reply with the two nonce values you read (include both).", - thinkingLevel: params.thinkingLevel, - context: `${progressLabel}: tool-read`, - }); - if ( - isEmptyStreamText(toolText) && - shouldSkipEmptyResponseForLiveModel({ - provider: model.provider, - allowNotFoundSkip: params.allowNotFoundSkip, - }) - ) { - logProgress(`${progressLabel}: skip (${model.provider} empty response)`); - break; - } - assertNoReasoningTags({ - text: toolText, - model: modelKey, - phase: "tool-read", - label: params.label, - }); - if (hasExpectedToolNonce(toolText, nonceA, nonceB)) { - break; - } - if ( - shouldRetryToolReadProbe({ - text: toolText, - nonceA, - nonceB, - provider: model.provider, - attempt: toolReadAttempt, - maxAttempts: maxToolReadAttempts, - }) - ) { - logProgress( - `${progressLabel}: tool-read retry (${toolReadAttempt + 2}/${maxToolReadAttempts}) malformed tool output`, + const modelResult = await withGatewayLiveModelTimeout<"done" | "skip">( + (async () => { + // Ensure session exists + override model for this run. + // Reset between models: avoids cross-provider transcript incompatibilities + // (notably OpenAI Responses requiring reasoning replay for function_call items). + await withGatewayLiveProbeTimeout( + client.request("sessions.reset", { + key: sessionKey, + }), + `${progressLabel}: sessions-reset`, + ); + await withGatewayLiveProbeTimeout( + client.request("sessions.patch", { + key: sessionKey, + model: modelKey, + }), + `${progressLabel}: sessions-patch`, ); - continue; - } - throw new Error(`tool probe missing nonce: ${toolText}`); - } - if (!hasExpectedToolNonce(toolText, nonceA, nonceB)) { - throw new Error(`tool probe missing nonce: ${toolText}`); - } - if (params.extraToolProbes) { - logProgress(`${progressLabel}: tool-exec`); - const nonceC = randomUUID(); - const toolWritePath = path.join(tempDir, `write-${runIdTool}.txt`); - const maxExecReadAttempts = 3; - let execReadText = ""; - for ( - let execReadAttempt = 0; - execReadAttempt < maxExecReadAttempts; - execReadAttempt += 1 - ) { - const strictReply = execReadAttempt > 0; - execReadText = await requestGatewayAgentText({ + logProgress(`${progressLabel}: prompt`); + let text = await requestGatewayAgentText({ client, sessionKey, - idempotencyKey: `idem-${runIdTool}-exec-read-${execReadAttempt + 1}`, + idempotencyKey: `idem-${randomUUID()}`, modelKey, - message: strictReply - ? "OpenClaw live tool probe (local, safe): " + - "use the tool named `exec` (or `Exec`) to run this command: " + - `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` + - `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` + - `Then reply with exactly: ${nonceC}. No extra text.` - : "OpenClaw live tool probe (local, safe): " + - "use the tool named `exec` (or `Exec`) to run this command: " + - `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` + - `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` + - "Finally reply including the nonce text you read back.", + message: + "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.", thinkingLevel: params.thinkingLevel, - context: `${progressLabel}: tool-exec`, + context: `${progressLabel}: prompt`, }); + if (!text) { + logProgress(`${progressLabel}: empty response, retrying`); + text = await requestGatewayAgentText({ + client, + sessionKey, + idempotencyKey: `idem-${randomUUID()}-retry`, + modelKey, + message: + "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.", + thinkingLevel: params.thinkingLevel, + context: `${progressLabel}: prompt-retry`, + }); + } if ( - isEmptyStreamText(execReadText) && + !text && shouldSkipEmptyResponseForLiveModel({ provider: model.provider, allowNotFoundSkip: params.allowNotFoundSkip, }) ) { logProgress(`${progressLabel}: skip (${model.provider} empty response)`); - break; - } - assertNoReasoningTags({ - text: execReadText, - model: modelKey, - phase: "tool-exec", - label: params.label, - }); - if (hasExpectedSingleNonce(execReadText, nonceC)) { - break; + return "skip"; } if ( - shouldRetryExecReadProbe({ - text: execReadText, - nonce: nonceC, + isEmptyStreamText(text) && + shouldSkipEmptyResponseForLiveModel({ provider: model.provider, - attempt: execReadAttempt, - maxAttempts: maxExecReadAttempts, + allowNotFoundSkip: params.allowNotFoundSkip, }) ) { - logProgress( - `${progressLabel}: tool-exec retry (${execReadAttempt + 2}/${maxExecReadAttempts}) malformed tool output`, - ); - continue; + logProgress(`${progressLabel}: skip (${model.provider} empty response)`); + return "skip"; + } + if (isGoogleishProvider(model.provider) && isGoogleModelNotFoundText(text)) { + // Catalog drift: model IDs can disappear or become unavailable on the API. + // Treat as skip when scanning "all models" for Google. + logProgress(`${progressLabel}: skip (google model not found)`); + return "skip"; + } + if (params.allowNotFoundSkip && isModelNotFoundErrorMessage(text)) { + logProgress(`${progressLabel}: skip (model not found)`); + return "skip"; } - throw new Error(`exec+read probe missing nonce: ${execReadText}`); - } - if (!hasExpectedSingleNonce(execReadText, nonceC)) { - throw new Error(`exec+read probe missing nonce: ${execReadText}`); - } - - await fs.rm(toolWritePath, { force: true }); - } - - if (params.extraImageProbes && model.input?.includes("image")) { - logProgress(`${progressLabel}: image`); - // Shorter code => less OCR flake across providers, still tests image attachments end-to-end. - const imageCode = randomImageProbeCode(); - const imageBase64 = renderCatNoncePngBase64(imageCode); - const runIdImage = randomUUID(); - - const imageText = await requestGatewayAgentText({ - client, - sessionKey, - idempotencyKey: `idem-${runIdImage}-image`, - modelKey, - message: - "Look at the attached image. Reply with exactly two tokens separated by a single space: " + - "(1) the animal shown or written in the image, lowercase; " + - "(2) the code printed in the image, uppercase. No extra text.", - attachments: [ - { - mimeType: "image/png", - fileName: `probe-${runIdImage}.png`, - content: imageBase64, - }, - ], - thinkingLevel: params.thinkingLevel, - context: `${progressLabel}: image`, - }); - // Best-effort: do not fail the whole live suite on flaky image handling. - // (We still keep prompt + tool probes as hard checks.) - if ( - isEmptyStreamText(imageText) && - shouldSkipEmptyResponseForLiveModel({ - provider: model.provider, - allowNotFoundSkip: params.allowNotFoundSkip, - }) - ) { - logProgress(`${progressLabel}: image skip (${model.provider} empty response)`); - } else { assertNoReasoningTags({ - text: imageText, + text, model: modelKey, - phase: "image", + phase: "prompt", label: params.label, }); - if (!/\bcat\b/i.test(imageText)) { - logProgress(`${progressLabel}: image skip (missing 'cat')`); - } else { - const candidates = imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? []; - const bestDistance = candidates.reduce((best, cand) => { - if (Math.abs(cand.length - imageCode.length) > 2) { - return best; + if (!isMeaningful(text)) { + if (isGoogleishProvider(model.provider) && /gemini/i.test(model.id)) { + logProgress(`${progressLabel}: skip (google not meaningful)`); + return "skip"; + } + throw new Error(`not meaningful: ${text}`); + } + if ( + !/\bmicro\s*-?\s*tasks?\b/i.test(text) || + !/\bmacro\s*-?\s*tasks?\b/i.test(text) + ) { + throw new Error(`missing required keywords: ${text}`); + } + + // Real tool invocation: force the agent to Read a local file and echo a nonce. + logProgress(`${progressLabel}: tool-read`); + const runIdTool = randomUUID(); + const maxToolReadAttempts = 3; + let toolText = ""; + for ( + let toolReadAttempt = 0; + toolReadAttempt < maxToolReadAttempts; + toolReadAttempt += 1 + ) { + const strictReply = toolReadAttempt > 0; + toolText = await requestGatewayAgentText({ + client, + sessionKey, + idempotencyKey: `idem-${runIdTool}-tool-${toolReadAttempt + 1}`, + modelKey, + message: strictReply + ? "OpenClaw live tool probe (local, safe): " + + `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` + + `Then reply with exactly: ${nonceA} ${nonceB}. No extra text.` + : "OpenClaw live tool probe (local, safe): " + + `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` + + "Then reply with the two nonce values you read (include both).", + thinkingLevel: params.thinkingLevel, + context: `${progressLabel}: tool-read`, + }); + if ( + isEmptyStreamText(toolText) && + shouldSkipEmptyResponseForLiveModel({ + provider: model.provider, + allowNotFoundSkip: params.allowNotFoundSkip, + }) + ) { + logProgress(`${progressLabel}: skip (${model.provider} empty response)`); + return "skip"; + } + assertNoReasoningTags({ + text: toolText, + model: modelKey, + phase: "tool-read", + label: params.label, + }); + if (hasExpectedToolNonce(toolText, nonceA, nonceB)) { + break; + } + if ( + shouldRetryToolReadProbe({ + text: toolText, + nonceA, + nonceB, + provider: model.provider, + attempt: toolReadAttempt, + maxAttempts: maxToolReadAttempts, + }) + ) { + logProgress( + `${progressLabel}: tool-read retry (${toolReadAttempt + 2}/${maxToolReadAttempts}) malformed tool output`, + ); + continue; + } + throw new Error(`tool probe missing nonce: ${toolText}`); + } + if (!hasExpectedToolNonce(toolText, nonceA, nonceB)) { + throw new Error(`tool probe missing nonce: ${toolText}`); + } + + if (params.extraToolProbes) { + logProgress(`${progressLabel}: tool-exec`); + const nonceC = randomUUID(); + const toolWritePath = path.join(tempDir, `write-${runIdTool}.txt`); + const maxExecReadAttempts = 3; + let execReadText = ""; + for ( + let execReadAttempt = 0; + execReadAttempt < maxExecReadAttempts; + execReadAttempt += 1 + ) { + const strictReply = execReadAttempt > 0; + execReadText = await requestGatewayAgentText({ + client, + sessionKey, + idempotencyKey: `idem-${runIdTool}-exec-read-${execReadAttempt + 1}`, + modelKey, + message: strictReply + ? "OpenClaw live tool probe (local, safe): " + + "use the tool named `exec` (or `Exec`) to run this command: " + + `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` + + `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` + + `Then reply with exactly: ${nonceC}. No extra text.` + : "OpenClaw live tool probe (local, safe): " + + "use the tool named `exec` (or `Exec`) to run this command: " + + `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` + + `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` + + "Finally reply including the nonce text you read back.", + thinkingLevel: params.thinkingLevel, + context: `${progressLabel}: tool-exec`, + }); + if ( + isEmptyStreamText(execReadText) && + shouldSkipEmptyResponseForLiveModel({ + provider: model.provider, + allowNotFoundSkip: params.allowNotFoundSkip, + }) + ) { + logProgress(`${progressLabel}: skip (${model.provider} empty response)`); + return "skip"; + } + assertNoReasoningTags({ + text: execReadText, + model: modelKey, + phase: "tool-exec", + label: params.label, + }); + if (hasExpectedSingleNonce(execReadText, nonceC)) { + break; + } + if ( + shouldRetryExecReadProbe({ + text: execReadText, + nonce: nonceC, + provider: model.provider, + attempt: execReadAttempt, + maxAttempts: maxExecReadAttempts, + }) + ) { + logProgress( + `${progressLabel}: tool-exec retry (${execReadAttempt + 2}/${maxExecReadAttempts}) malformed tool output`, + ); + continue; + } + throw new Error(`exec+read probe missing nonce: ${execReadText}`); + } + if (!hasExpectedSingleNonce(execReadText, nonceC)) { + throw new Error(`exec+read probe missing nonce: ${execReadText}`); + } + + await fs.rm(toolWritePath, { force: true }); + } + + if (params.extraImageProbes && model.input?.includes("image")) { + logProgress(`${progressLabel}: image`); + // Shorter code => less OCR flake across providers, still tests image attachments end-to-end. + const imageCode = randomImageProbeCode(); + const imageBase64 = renderCatNoncePngBase64(imageCode); + const runIdImage = randomUUID(); + + const imageText = await requestGatewayAgentText({ + client, + sessionKey, + idempotencyKey: `idem-${runIdImage}-image`, + modelKey, + message: + "Look at the attached image. Reply with exactly two tokens separated by a single space: " + + "(1) the animal shown or written in the image, lowercase; " + + "(2) the code printed in the image, uppercase. No extra text.", + attachments: [ + { + mimeType: "image/png", + fileName: `probe-${runIdImage}.png`, + content: imageBase64, + }, + ], + thinkingLevel: params.thinkingLevel, + context: `${progressLabel}: image`, + }); + if ( + isEmptyStreamText(imageText) && + shouldSkipEmptyResponseForLiveModel({ + provider: model.provider, + allowNotFoundSkip: params.allowNotFoundSkip, + }) + ) { + logProgress(`${progressLabel}: image skip (${model.provider} empty response)`); + } else { + assertNoReasoningTags({ + text: imageText, + model: modelKey, + phase: "image", + label: params.label, + }); + if (!/\bcat\b/i.test(imageText)) { + logProgress(`${progressLabel}: image skip (missing 'cat')`); + } else { + const candidates = imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? []; + const bestDistance = candidates.reduce((best, cand) => { + if (Math.abs(cand.length - imageCode.length) > 2) { + return best; + } + return Math.min(best, editDistance(cand, imageCode)); + }, Number.POSITIVE_INFINITY); + if (!(bestDistance <= 3)) { + logProgress(`${progressLabel}: image skip (code mismatch)`); + } } - return Math.min(best, editDistance(cand, imageCode)); - }, Number.POSITIVE_INFINITY); - // OCR / image-read flake: allow a small edit distance, but still require the "cat" token above. - if (!(bestDistance <= 3)) { - logProgress(`${progressLabel}: image skip (code mismatch)`); } } - } + + if ( + (model.provider === "openai" && model.api === "openai-responses") || + (model.provider === "openai-codex" && model.api === "openai-codex-responses") + ) { + logProgress(`${progressLabel}: tool-only regression`); + const runId2 = randomUUID(); + const firstText = await requestGatewayAgentText({ + client, + sessionKey, + idempotencyKey: `idem-${runId2}-1`, + modelKey, + message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`, + thinkingLevel: params.thinkingLevel, + context: `${progressLabel}: tool-only-regression-first`, + }); + assertNoReasoningTags({ + text: firstText, + model: modelKey, + phase: "tool-only", + label: params.label, + }); + + const reply = await requestGatewayAgentText({ + client, + sessionKey, + idempotencyKey: `idem-${runId2}-2`, + modelKey, + message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`, + thinkingLevel: params.thinkingLevel, + context: `${progressLabel}: tool-only-regression-second`, + }); + assertNoReasoningTags({ + text: reply, + model: modelKey, + phase: "tool-only-followup", + label: params.label, + }); + if (!reply.includes(nonceA) || !reply.includes(nonceB)) { + throw new Error(`unexpected reply: ${reply}`); + } + } + + if (model.provider === "anthropic") { + await runAnthropicRefusalProbe({ + client, + sessionKey, + modelKey, + label: progressLabel, + thinkingLevel: params.thinkingLevel, + }); + } + return "done"; + })(), + `${progressLabel}: model`, + ); + if (modelResult === "skip") { + skippedCount += 1; + break; } - - // Regression: tool-call-only turn followed by a user message (OpenAI responses bug class). - if ( - (model.provider === "openai" && model.api === "openai-responses") || - (model.provider === "openai-codex" && model.api === "openai-codex-responses") - ) { - logProgress(`${progressLabel}: tool-only regression`); - const runId2 = randomUUID(); - const firstText = await requestGatewayAgentText({ - client, - sessionKey, - idempotencyKey: `idem-${runId2}-1`, - modelKey, - message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`, - thinkingLevel: params.thinkingLevel, - context: `${progressLabel}: tool-only-regression-first`, - }); - assertNoReasoningTags({ - text: firstText, - model: modelKey, - phase: "tool-only", - label: params.label, - }); - - const reply = await requestGatewayAgentText({ - client, - sessionKey, - idempotencyKey: `idem-${runId2}-2`, - modelKey, - message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`, - thinkingLevel: params.thinkingLevel, - context: `${progressLabel}: tool-only-regression-second`, - }); - assertNoReasoningTags({ - text: reply, - model: modelKey, - phase: "tool-only-followup", - label: params.label, - }); - if (!reply.includes(nonceA) || !reply.includes(nonceB)) { - throw new Error(`unexpected reply: ${reply}`); - } - } - - if (model.provider === "anthropic") { - await runAnthropicRefusalProbe({ - client, - sessionKey, - modelKey, - label: progressLabel, - thinkingLevel: params.thinkingLevel, - }); - } - logProgress(`${progressLabel}: done`); break; } catch (err) { @@ -1591,6 +1679,11 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { logProgress(`${progressLabel}: skip (probe timeout)`); break; } + if (isGatewayLiveModelTimeout(message)) { + skippedCount += 1; + logProgress(`${progressLabel}: skip (model timeout)`); + break; + } // OpenAI Codex refresh tokens can become single-use; skip instead of failing all live tests. if (model.provider === "openai-codex" && isRefreshTokenReused(message)) { skippedCount += 1; @@ -1698,110 +1791,113 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { describeLive("gateway live (dev agent, profile keys)", () => { it( "runs meaningful prompts across models with available keys", - async () => { - clearRuntimeConfigSnapshot(); - const cfg = loadConfig(); - await ensureOpenClawModelsJson(cfg); + async () => + await withSuppressedGatewayLiveWarnings(async () => { + clearRuntimeConfigSnapshot(); + const cfg = loadConfig(); + await ensureOpenClawModelsJson(cfg); - const agentDir = resolveOpenClawAgentDir(); - const authStorage = discoverAuthStorage(agentDir); - const modelRegistry = discoverModels(authStorage, agentDir); - const all = modelRegistry.getAll(); + const agentDir = resolveOpenClawAgentDir(); + const authStorage = discoverAuthStorage(agentDir); + const modelRegistry = discoverModels(authStorage, agentDir); + const all = modelRegistry.getAll(); - const rawModels = process.env.OPENCLAW_LIVE_GATEWAY_MODELS?.trim(); - const useModern = !rawModels || rawModels === "modern" || rawModels === "all"; - const useExplicit = Boolean(rawModels) && !useModern; - const filter = useExplicit ? parseFilter(rawModels) : null; - const maxModels = GATEWAY_LIVE_MAX_MODELS; - const wanted = filter - ? all.filter((m) => filter.has(`${m.provider}/${m.id}`)) - : all.filter((m) => isHighSignalLiveModelRef({ provider: m.provider, id: m.id })); + const rawModels = process.env.OPENCLAW_LIVE_GATEWAY_MODELS?.trim(); + const useModern = !rawModels || rawModels === "modern" || rawModels === "all"; + const useExplicit = Boolean(rawModels) && !useModern; + const filter = useExplicit ? parseFilter(rawModels) : null; + const maxModels = GATEWAY_LIVE_MAX_MODELS; + const wanted = filter + ? all.filter((m) => filter.has(`${m.provider}/${m.id}`)) + : all.filter((m) => isHighSignalLiveModelRef({ provider: m.provider, id: m.id })); - const candidates: Array> = []; - const skipped: Array<{ model: string; error: string }> = []; - for (const model of wanted) { - if (shouldSuppressBuiltInModel({ provider: model.provider, id: model.id })) { - continue; - } - if (PROVIDERS && !PROVIDERS.has(model.provider)) { - continue; - } - const modelRef = `${model.provider}/${model.id}`; - try { - const apiKeyInfo = await getApiKeyForModel({ model, cfg }); - if (REQUIRE_PROFILE_KEYS && !apiKeyInfo.source.startsWith("profile:")) { - skipped.push({ - model: modelRef, - error: `non-profile credential source: ${apiKeyInfo.source}`, - }); + const candidates: Array> = []; + const skipped: Array<{ model: string; error: string }> = []; + for (const model of wanted) { + if (shouldSuppressBuiltInModel({ provider: model.provider, id: model.id })) { continue; } - candidates.push(model); - } catch (error) { - skipped.push({ model: modelRef, error: String(error) }); + if (PROVIDERS && !PROVIDERS.has(model.provider)) { + continue; + } + const modelRef = `${model.provider}/${model.id}`; + try { + const apiKeyInfo = await getApiKeyForModel({ model, cfg }); + if (REQUIRE_PROFILE_KEYS && !apiKeyInfo.source.startsWith("profile:")) { + skipped.push({ + model: modelRef, + error: `non-profile credential source: ${apiKeyInfo.source}`, + }); + continue; + } + candidates.push(model); + } catch (error) { + skipped.push({ model: modelRef, error: String(error) }); + } } - } - if (candidates.length === 0) { - if (skipped.length > 0) { + if (candidates.length === 0) { + if (skipped.length > 0) { + logProgress( + `[all-models] auth lookup skipped candidates:\n${formatFailurePreview(skipped, 8)}`, + ); + } + logProgress("[all-models] no API keys found; skipping"); + return; + } + const selectedCandidates = capByProviderSpread( + candidates, + maxModels > 0 ? maxModels : candidates.length, + (model) => model.provider, + ); + logProgress(`[all-models] selection=${useExplicit ? "explicit" : "high-signal"}`); + if (selectedCandidates.length < candidates.length) { logProgress( - `[all-models] auth lookup skipped candidates:\n${formatFailurePreview(skipped, 8)}`, + `[all-models] capped to ${selectedCandidates.length}/${candidates.length} via OPENCLAW_LIVE_GATEWAY_MAX_MODELS=${maxModels}`, ); } - logProgress("[all-models] no API keys found; skipping"); - return; - } - const selectedCandidates = capByProviderSpread( - candidates, - maxModels > 0 ? maxModels : candidates.length, - (model) => model.provider, - ); - logProgress(`[all-models] selection=${useExplicit ? "explicit" : "high-signal"}`); - if (selectedCandidates.length < candidates.length) { - logProgress( - `[all-models] capped to ${selectedCandidates.length}/${candidates.length} via OPENCLAW_LIVE_GATEWAY_MAX_MODELS=${maxModels}`, - ); - } - const imageCandidates = selectedCandidates.filter((m) => m.input?.includes("image")); - if (imageCandidates.length === 0) { - logProgress("[all-models] no image-capable models selected; image probe will be skipped"); - } - await runGatewayModelSuite({ - label: "all-models", - cfg, - candidates: selectedCandidates, - allowNotFoundSkip: useModern, - extraToolProbes: true, - extraImageProbes: true, - thinkingLevel: THINKING_LEVEL, - }); - - const minimaxCandidates = selectedCandidates.filter((model) => model.provider === "minimax"); - if (minimaxCandidates.length === 0) { - logProgress("[minimax] no candidates with keys; skipping dual endpoint probes"); - return; - } - - const minimaxAnthropic = buildMinimaxProviderOverride({ - cfg, - api: "anthropic-messages", - baseUrl: "https://api.minimax.io/anthropic", - }); - if (minimaxAnthropic) { + const imageCandidates = selectedCandidates.filter((m) => m.input?.includes("image")); + if (imageCandidates.length === 0) { + logProgress("[all-models] no image-capable models selected; image probe will be skipped"); + } await runGatewayModelSuite({ - label: "minimax-anthropic", + label: "all-models", cfg, - candidates: minimaxCandidates, + candidates: selectedCandidates, allowNotFoundSkip: useModern, - extraToolProbes: true, - extraImageProbes: true, + extraToolProbes: ENABLE_EXTRA_TOOL_PROBES, + extraImageProbes: ENABLE_EXTRA_IMAGE_PROBES, thinkingLevel: THINKING_LEVEL, - providerOverrides: { minimax: minimaxAnthropic }, }); - } else { - logProgress("[minimax-anthropic] missing minimax provider config; skipping"); - } - }, + + const minimaxCandidates = selectedCandidates.filter( + (model) => model.provider === "minimax", + ); + if (minimaxCandidates.length === 0) { + logProgress("[minimax] no candidates with keys; skipping dual endpoint probes"); + return; + } + + const minimaxAnthropic = buildMinimaxProviderOverride({ + cfg, + api: "anthropic-messages", + baseUrl: "https://api.minimax.io/anthropic", + }); + if (minimaxAnthropic) { + await runGatewayModelSuite({ + label: "minimax-anthropic", + cfg, + candidates: minimaxCandidates, + allowNotFoundSkip: useModern, + extraToolProbes: ENABLE_EXTRA_TOOL_PROBES, + extraImageProbes: ENABLE_EXTRA_IMAGE_PROBES, + thinkingLevel: THINKING_LEVEL, + providerOverrides: { minimax: minimaxAnthropic }, + }); + } else { + logProgress("[minimax-anthropic] missing minimax provider config; skipping"); + } + }), GATEWAY_LIVE_SUITE_TIMEOUT_MS, );