From ac063568d354c14e0a0a43bcffb9725bb3fca651 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Fri, 24 Apr 2026 05:03:47 +0100 Subject: [PATCH] test: speed up live model sweeps --- docs/help/testing-live.md | 36 +++++++++++++------------ package.json | 6 ++--- src/agents/live-model-filter.ts | 31 +++++++++++++++++++++ src/agents/model-compat.test.ts | 28 +++++++++++++++++++ src/agents/models.profiles.live.test.ts | 24 +++++++++++------ 5 files changed, 97 insertions(+), 28 deletions(-) diff --git a/docs/help/testing-live.md b/docs/help/testing-live.md index c0a61b6013e..a0642a1b2bc 100644 --- a/docs/help/testing-live.md +++ b/docs/help/testing-live.md @@ -48,10 +48,12 @@ Live tests are split into two layers so we can isolate failures: - `pnpm test:live` (or `OPENCLAW_LIVE_TEST=1` if invoking Vitest directly) - Set `OPENCLAW_LIVE_MODELS=modern` (or `all`, alias for modern) to actually run this suite; otherwise it skips to keep `pnpm test:live` focused on gateway smoke - How to select models: - - `OPENCLAW_LIVE_MODELS=modern` to run the modern allowlist (Opus/Sonnet 4.6+, GPT-5.x + Codex, Gemini 3, GLM 4.7, MiniMax M2.7, Grok 4) + - `OPENCLAW_LIVE_MODELS=modern` to run the modern allowlist (Opus/Sonnet 4.6+, GPT-5.2 + Codex, Gemini 3, GLM 4.7, MiniMax M2.7, Grok 4) - `OPENCLAW_LIVE_MODELS=all` is an alias for the modern allowlist - - or `OPENCLAW_LIVE_MODELS="openai/gpt-5.4,openai-codex/gpt-5.5,anthropic/claude-opus-4-6,..."` (comma allowlist) + - or `OPENCLAW_LIVE_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-6,..."` (comma allowlist) - Modern/all sweeps default to a curated high-signal cap; set `OPENCLAW_LIVE_MAX_MODELS=0` for an exhaustive modern sweep or a positive number for a smaller cap. + - Exhaustive sweeps use `OPENCLAW_LIVE_TEST_TIMEOUT_MS` for the whole direct-model test timeout. Default: 60 minutes. + - Set `OPENCLAW_LIVE_MODEL_CONCURRENCY=10` to run direct-model probes in parallel. Default: 1. - How to select providers: - `OPENCLAW_LIVE_PROVIDERS="google,google-antigravity,google-gemini-cli"` (comma allowlist) - Where keys come from: @@ -80,7 +82,7 @@ Live tests are split into two layers so we can isolate failures: - How to enable: - `pnpm test:live` (or `OPENCLAW_LIVE_TEST=1` if invoking Vitest directly) - How to select models: - - Default: modern allowlist (Opus/Sonnet 4.6+, GPT-5.x + Codex, Gemini 3, GLM 4.7, MiniMax M2.7, Grok 4) + - Default: modern allowlist (Opus/Sonnet 4.6+, GPT-5.2 + Codex, Gemini 3, GLM 4.7, MiniMax M2.7, Grok 4) - `OPENCLAW_LIVE_GATEWAY_MODELS=all` is an alias for the modern allowlist - Or set `OPENCLAW_LIVE_GATEWAY_MODELS="provider/model"` (or comma list) to narrow - Modern/all gateway sweeps default to a curated high-signal cap; set `OPENCLAW_LIVE_GATEWAY_MAX_MODELS=0` for an exhaustive modern sweep or a positive number for a smaller cap. @@ -115,7 +117,7 @@ openclaw models list --json - Default provider/model: `claude-cli/claude-sonnet-4-6` - Command/args/image behavior come from the owning CLI backend plugin metadata. - Overrides (optional): - - `OPENCLAW_LIVE_CLI_BACKEND_MODEL="codex-cli/gpt-5.5"` + - `OPENCLAW_LIVE_CLI_BACKEND_MODEL="codex-cli/gpt-5.2"` - `OPENCLAW_LIVE_CLI_BACKEND_COMMAND="/full/path/to/codex"` - `OPENCLAW_LIVE_CLI_BACKEND_ARGS='["exec","--json","--color","never","--sandbox","read-only","--skip-git-repo-check"]'` - `OPENCLAW_LIVE_CLI_BACKEND_IMAGE_PROBE=1` to send a real image attachment (paths are injected into the prompt). 
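Taken together, the direct-sweep knobs above (`OPENCLAW_LIVE_MODELS`, `OPENCLAW_LIVE_MAX_MODELS`, `OPENCLAW_LIVE_MODEL_CONCURRENCY`, `OPENCLAW_LIVE_TEST_TIMEOUT_MS`) compose into a single command. As an illustrative example (the variable names come from this patch; the concrete values are assumptions): `OPENCLAW_LIVE_MODELS=modern OPENCLAW_LIVE_MAX_MODELS=0 OPENCLAW_LIVE_MODEL_CONCURRENCY=4 OPENCLAW_LIVE_TEST_TIMEOUT_MS=5400000 pnpm test:live src/agents/models.profiles.live.test.ts` would run an exhaustive modern sweep, four probes at a time, under a 90-minute overall timeout.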
@@ -128,7 +130,7 @@ Example: ```bash OPENCLAW_LIVE_CLI_BACKEND=1 \ - OPENCLAW_LIVE_CLI_BACKEND_MODEL="codex-cli/gpt-5.5" \ + OPENCLAW_LIVE_CLI_BACKEND_MODEL="codex-cli/gpt-5.2" \ pnpm test:live src/gateway/gateway-cli-backend.live.test.ts ``` @@ -178,8 +180,8 @@ Notes: - `OPENCLAW_LIVE_ACP_BIND_AGENT=gemini` - `OPENCLAW_LIVE_ACP_BIND_AGENTS=claude,codex,gemini` - `OPENCLAW_LIVE_ACP_BIND_AGENT_COMMAND='npx -y @agentclientprotocol/claude-agent-acp@'` - - `OPENCLAW_LIVE_ACP_BIND_CODEX_MODEL=gpt-5.5` - - `OPENCLAW_LIVE_ACP_BIND_PARENT_MODEL=openai/gpt-5.4` + - `OPENCLAW_LIVE_ACP_BIND_CODEX_MODEL=gpt-5.2` + - `OPENCLAW_LIVE_ACP_BIND_PARENT_MODEL=openai/gpt-5.2` - Notes: - This lane uses the gateway `chat.send` surface with admin-only synthetic originating-route fields so tests can attach message-channel context without pretending to deliver externally. - When `OPENCLAW_LIVE_ACP_BIND_AGENT_COMMAND` is unset, the test uses the embedded `acpx` plugin's built-in agent registry for the selected ACP harness agent. @@ -220,7 +222,7 @@ Docker notes: `agent` method: - load the bundled `codex` plugin - select `OPENCLAW_AGENT_RUNTIME=codex` - - send a first gateway agent turn to `openai/gpt-5.4` with the Codex harness forced + - send a first gateway agent turn to `openai/gpt-5.2` with the Codex harness forced - send a second turn to the same OpenClaw session and verify the app-server thread can resume - run `/codex status` and `/codex models` through the same gateway command @@ -230,7 +232,7 @@ Docker notes: denied so the agent asks back - Test: `src/gateway/gateway-codex-harness.live.test.ts` - Enable: `OPENCLAW_LIVE_CODEX_HARNESS=1` -- Default model: `openai/gpt-5.4` +- Default model: `openai/gpt-5.2` - Optional image probe: `OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE=1` - Optional MCP/tool probe: `OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE=1` - Optional Guardian probe: `OPENCLAW_LIVE_CODEX_HARNESS_GUARDIAN_PROBE=1` @@ -248,7 +250,7 @@ OPENCLAW_LIVE_CODEX_HARNESS=1 \ OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE=1 \ OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE=1 \ OPENCLAW_LIVE_CODEX_HARNESS_GUARDIAN_PROBE=1 \ - OPENCLAW_LIVE_CODEX_HARNESS_MODEL=openai/gpt-5.4 \ + OPENCLAW_LIVE_CODEX_HARNESS_MODEL=openai/gpt-5.2 \ pnpm test:live -- src/gateway/gateway-codex-harness.live.test.ts ``` @@ -279,13 +281,13 @@ Docker notes: Narrow, explicit allowlists are fastest and least flaky: - Single model, direct (no gateway): - - `OPENCLAW_LIVE_MODELS="openai/gpt-5.4" pnpm test:live src/agents/models.profiles.live.test.ts` + - `OPENCLAW_LIVE_MODELS="openai/gpt-5.2" pnpm test:live src/agents/models.profiles.live.test.ts` - Single model, gateway smoke: - - `OPENCLAW_LIVE_GATEWAY_MODELS="openai/gpt-5.4" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` + - `OPENCLAW_LIVE_GATEWAY_MODELS="openai/gpt-5.2" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` - Tool calling across several providers: - - `OPENCLAW_LIVE_GATEWAY_MODELS="openai/gpt-5.4,openai-codex/gpt-5.5,anthropic/claude-opus-4-6,google/gemini-3-flash-preview,zai/glm-4.7,minimax/MiniMax-M2.7" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` + - `OPENCLAW_LIVE_GATEWAY_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-6,google/gemini-3-flash-preview,zai/glm-4.7,minimax/MiniMax-M2.7" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` - Google focus (Gemini API key + Antigravity): - Gemini (API key): `OPENCLAW_LIVE_GATEWAY_MODELS="google/gemini-3-flash-preview" pnpm test:live 
src/gateway/gateway-models.profiles.live.test.ts` @@ -308,8 +310,8 @@ There is no fixed “CI model list” (live is opt-in), but these are the **reco This is the “common models” run we expect to keep working: -- OpenAI (non-Codex): `openai/gpt-5.4` (optional: `openai/gpt-5.4-mini`) -- OpenAI Codex OAuth: `openai-codex/gpt-5.5` +- OpenAI (non-Codex): `openai/gpt-5.2` +- OpenAI Codex OAuth: `openai-codex/gpt-5.2` - Anthropic: `anthropic/claude-opus-4-6` (or `anthropic/claude-sonnet-4-6`) - Google (Gemini API): `google/gemini-3.1-pro-preview` and `google/gemini-3-flash-preview` (avoid older Gemini 2.x models) - Google (Antigravity): `google-antigravity/claude-opus-4-6-thinking` and `google-antigravity/gemini-3-flash` @@ -317,13 +319,13 @@ This is the “common models” run we expect to keep working: - MiniMax: `minimax/MiniMax-M2.7` Run gateway smoke with tools + image: -`OPENCLAW_LIVE_GATEWAY_MODELS="openai/gpt-5.4,openai-codex/gpt-5.5,anthropic/claude-opus-4-6,google/gemini-3.1-pro-preview,google/gemini-3-flash-preview,google-antigravity/claude-opus-4-6-thinking,google-antigravity/gemini-3-flash,zai/glm-4.7,minimax/MiniMax-M2.7" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` +`OPENCLAW_LIVE_GATEWAY_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-6,google/gemini-3.1-pro-preview,google/gemini-3-flash-preview,google-antigravity/claude-opus-4-6-thinking,google-antigravity/gemini-3-flash,zai/glm-4.7,minimax/MiniMax-M2.7" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` ### Baseline: tool calling (Read + optional Exec) Pick at least one per provider family: -- OpenAI: `openai/gpt-5.4` (or `openai/gpt-5.4-mini`) +- OpenAI: `openai/gpt-5.2` - Anthropic: `anthropic/claude-opus-4-6` (or `anthropic/claude-sonnet-4-6`) - Google: `google/gemini-3-flash-preview` (or `google/gemini-3.1-pro-preview`) - Z.AI (GLM): `zai/glm-4.7` diff --git a/package.json b/package.json index a935b924a65..4a447400496 100644 --- a/package.json +++ b/package.json @@ -1450,17 +1450,17 @@ "test:docker:live-cli-backend": "bash scripts/test-live-cli-backend-docker.sh", "test:docker:live-cli-backend:claude": "OPENCLAW_LIVE_CLI_BACKEND_MODEL=claude-cli/claude-sonnet-4-6 bash scripts/test-live-cli-backend-docker.sh", "test:docker:live-cli-backend:claude-subscription": "OPENCLAW_LIVE_CLI_BACKEND_AUTH=subscription OPENCLAW_LIVE_CLI_BACKEND_MODEL=claude-cli/claude-sonnet-4-6 OPENCLAW_LIVE_CLI_BACKEND_DISABLE_MCP_CONFIG=1 OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE=0 OPENCLAW_LIVE_CLI_BACKEND_RESUME_PROBE=1 OPENCLAW_LIVE_CLI_BACKEND_IMAGE_PROBE=0 OPENCLAW_LIVE_CLI_BACKEND_MCP_PROBE=0 bash scripts/test-live-cli-backend-docker.sh", - "test:docker:live-cli-backend:codex": "OPENCLAW_LIVE_CLI_BACKEND_MODEL=codex-cli/gpt-5.5 bash scripts/test-live-cli-backend-docker.sh", + "test:docker:live-cli-backend:codex": "OPENCLAW_LIVE_CLI_BACKEND_MODEL=codex-cli/gpt-5.2 bash scripts/test-live-cli-backend-docker.sh", "test:docker:live-cli-backend:gemini": "OPENCLAW_LIVE_CLI_BACKEND_MODEL=google-gemini-cli/gemini-3-flash-preview bash scripts/test-live-cli-backend-docker.sh", "test:docker:live-codex-bind": "OPENCLAW_LIVE_CODEX_BIND=1 OPENCLAW_LIVE_CODEX_TEST_FILES=src/gateway/gateway-codex-bind.live.test.ts bash scripts/test-live-codex-harness-docker.sh", "test:docker:live-codex-harness": "bash scripts/test-live-codex-harness-docker.sh", "test:docker:live-gateway": "bash scripts/test-live-gateway-models-docker.sh", "test:docker:live-gateway:claude": "OPENCLAW_LIVE_GATEWAY_PROVIDERS=claude-cli 
OPENCLAW_LIVE_GATEWAY_MODELS=claude-cli/claude-sonnet-4-6 bash scripts/test-live-gateway-models-docker.sh", - "test:docker:live-gateway:codex": "OPENCLAW_LIVE_GATEWAY_PROVIDERS=codex-cli OPENCLAW_LIVE_GATEWAY_MODELS=codex-cli/gpt-5.5 bash scripts/test-live-gateway-models-docker.sh", + "test:docker:live-gateway:codex": "OPENCLAW_LIVE_GATEWAY_PROVIDERS=codex-cli OPENCLAW_LIVE_GATEWAY_MODELS=codex-cli/gpt-5.2 bash scripts/test-live-gateway-models-docker.sh", "test:docker:live-gateway:gemini": "OPENCLAW_LIVE_GATEWAY_PROVIDERS=google-gemini-cli OPENCLAW_LIVE_GATEWAY_MODELS=google-gemini-cli/gemini-3.1-pro-preview bash scripts/test-live-gateway-models-docker.sh", "test:docker:live-models": "bash scripts/test-live-models-docker.sh", "test:docker:live-models:claude": "OPENCLAW_LIVE_PROVIDERS=claude-cli OPENCLAW_LIVE_MODELS=claude-cli/claude-sonnet-4-6 bash scripts/test-live-models-docker.sh", - "test:docker:live-models:codex": "OPENCLAW_LIVE_PROVIDERS=codex-cli OPENCLAW_LIVE_MODELS=codex-cli/gpt-5.5 bash scripts/test-live-models-docker.sh", + "test:docker:live-models:codex": "OPENCLAW_LIVE_PROVIDERS=codex-cli OPENCLAW_LIVE_MODELS=codex-cli/gpt-5.2 bash scripts/test-live-models-docker.sh", "test:docker:live-models:gemini": "OPENCLAW_LIVE_PROVIDERS=google-gemini-cli OPENCLAW_LIVE_MODELS=google-gemini-cli/gemini-3.1-pro-preview bash scripts/test-live-models-docker.sh", "test:docker:mcp-channels": "bash scripts/e2e/mcp-channels-docker.sh", "test:docker:npm-onboard-channel-agent": "bash scripts/e2e/npm-onboard-channel-agent-docker.sh", diff --git a/src/agents/live-model-filter.ts b/src/agents/live-model-filter.ts index 69d44cebdbd..b0a70b3c936 100644 --- a/src/agents/live-model-filter.ts +++ b/src/agents/live-model-filter.ts @@ -70,6 +70,33 @@ function isPreGemini3ModelId(id: string): boolean { return Number.isFinite(major) && major < 3; } +function isOpenAiFamilyLiveModel(provider: string, id: string): boolean { + const normalized = normalizeLowercaseStringOrEmpty(id); + const modelName = normalized.split("/").pop() ?? ""; + if (provider === "openrouter") { + return normalized.startsWith("openai/"); + } + if (provider === "opencode") { + return modelName.startsWith("gpt-"); + } + return ( + provider === "openai" || + provider === "openai-codex" || + provider === "codex-cli" || + provider === "opencode" || + provider === "github-copilot" || + provider === "microsoft-foundry" + ); +} + +function isUnsupportedOpenAiLiveModelRef(provider: string, id: string): boolean { + if (!isOpenAiFamilyLiveModel(provider, id)) { + return false; + } + const modelName = normalizeLowercaseStringOrEmpty(id).split("/").pop() ?? ""; + return !modelName.startsWith("gpt-5.2"); +} + export function isModernModelRef(ref: ModelRef): boolean { const provider = normalizeProviderId(ref.provider ?? ""); const id = normalizeLowercaseStringOrEmpty(ref.id); @@ -91,6 +118,7 @@ export function isModernModelRef(ref: ModelRef): boolean { } export function isHighSignalLiveModelRef(ref: ModelRef): boolean { + const provider = normalizeProviderId(ref.provider ?? 
""); const id = normalizeLowercaseStringOrEmpty(ref.id); if (!isModernModelRef(ref) || !id) { return false; @@ -98,6 +126,9 @@ export function isHighSignalLiveModelRef(ref: ModelRef): boolean { if (isPreGemini3ModelId(id)) { return false; } + if (isUnsupportedOpenAiLiveModelRef(provider, id)) { + return false; + } return isHighSignalClaudeModelId(id); } diff --git a/src/agents/model-compat.test.ts b/src/agents/model-compat.test.ts index c51a7597d15..2769da3b9b9 100644 --- a/src/agents/model-compat.test.ts +++ b/src/agents/model-compat.test.ts @@ -475,6 +475,34 @@ describe("isHighSignalLiveModelRef", () => { true, ); }); + + it("keeps only GPT-5.2 OpenAI-family models in the default live matrix", () => { + providerRuntimeMocks.resolveProviderModernModelRef.mockReturnValue(true); + + expect(isHighSignalLiveModelRef({ provider: "openrouter", id: "openai/gpt-3.5-turbo" })).toBe( + false, + ); + expect(isHighSignalLiveModelRef({ provider: "openrouter", id: "openai/gpt-oss-120b" })).toBe( + false, + ); + expect(isHighSignalLiveModelRef({ provider: "openrouter", id: "openai/o1" })).toBe(false); + expect(isHighSignalLiveModelRef({ provider: "openai", id: "gpt-4.1" })).toBe(false); + expect(isHighSignalLiveModelRef({ provider: "openai", id: "gpt-4o" })).toBe(false); + expect(isHighSignalLiveModelRef({ provider: "openai", id: "gpt-5" })).toBe(false); + expect(isHighSignalLiveModelRef({ provider: "openai", id: "gpt-5.1" })).toBe(false); + expect(isHighSignalLiveModelRef({ provider: "openai", id: "gpt-5.4" })).toBe(false); + expect(isHighSignalLiveModelRef({ provider: "openai", id: "gpt-5.5" })).toBe(false); + expect(isHighSignalLiveModelRef({ provider: "openrouter", id: "openai/gpt-5.1-chat" })).toBe( + false, + ); + expect(isHighSignalLiveModelRef({ provider: "opencode", id: "gpt-5.1-codex-mini" })).toBe( + false, + ); + expect(isHighSignalLiveModelRef({ provider: "openai", id: "gpt-5.2" })).toBe(true); + expect(isHighSignalLiveModelRef({ provider: "openrouter", id: "openai/gpt-5.2-chat" })).toBe( + true, + ); + }); }); describe("selectHighSignalLiveItems", () => { diff --git a/src/agents/models.profiles.live.test.ts b/src/agents/models.profiles.live.test.ts index 7af749ee3ab..bfb41c87052 100644 --- a/src/agents/models.profiles.live.test.ts +++ b/src/agents/models.profiles.live.test.ts @@ -3,6 +3,7 @@ import { Type } from "typebox"; import { describe, expect, it } from "vitest"; import { loadConfig } from "../config/config.js"; import { parseLiveCsvFilter } from "../media-generation/live-test-helpers.js"; +import { runTasksWithConcurrency } from "../utils/run-with-concurrency.js"; import { resolveOpenClawAgentDir } from "./agent-paths.js"; import { collectAnthropicApiKeys, @@ -52,6 +53,11 @@ const LIVE_SETUP_TIMEOUT_MS = Math.max( 1_000, toInt(process.env.OPENCLAW_LIVE_SETUP_TIMEOUT_MS, 45_000), ); +const LIVE_TEST_TIMEOUT_MS = Math.max( + 1_000, + toInt(process.env.OPENCLAW_LIVE_TEST_TIMEOUT_MS, 60 * 60 * 1000), +); +const LIVE_MODEL_CONCURRENCY = Math.max(1, toInt(process.env.OPENCLAW_LIVE_MODEL_CONCURRENCY, 1)); const LIVE_MODELS_JSON_TIMEOUT_MS = resolveLiveModelsJsonTimeoutMs(); const LIVE_FILE_PROBE_ENABLED = isLiveModelProbeEnabled(process.env, LIVE_MODEL_FILE_PROBE_ENV); const LIVE_IMAGE_PROBE_ENABLED = isLiveModelProbeEnabled(process.env, LIVE_MODEL_IMAGE_PROBE_ENV); @@ -316,9 +322,6 @@ function resolveTestReasoning( if (id.includes("deep-research")) { return "medium"; } - if (id === "gpt-5.4-pro") { - return "medium"; - } if (model.provider === "openrouter" && id.startsWith("qwq")) { 
return undefined; } @@ -657,11 +660,11 @@ describeLive("live models (profile keys)", () => { } logProgress(`[live-models] running ${selectedCandidates.length} models`); logProgress( - `[live-models] heartbeat=${formatElapsedSeconds(LIVE_HEARTBEAT_MS)} timeout=${formatElapsedSeconds(perModelTimeoutMs)}`, + `[live-models] heartbeat=${formatElapsedSeconds(LIVE_HEARTBEAT_MS)} timeout=${formatElapsedSeconds(perModelTimeoutMs)} concurrency=${LIVE_MODEL_CONCURRENCY}`, ); const total = selectedCandidates.length; - for (const [index, entry] of selectedCandidates.entries()) { + const tasks = selectedCandidates.map((entry, index) => async () => { const { model, apiKeyInfo } = entry; const id = `${model.provider}/${model.id}`; const progressLabel = `[live-models] ${index + 1}/${total} ${id}`; @@ -680,7 +683,7 @@ describeLive("live models (profile keys)", () => { if ( model.provider === "openai" && model.api === "openai-responses" && - (model.id === "gpt-5.5" || model.id === "gpt-5.4") + model.id === "gpt-5.2" ) { logProgress(`${progressLabel}: tool-only regression`); const noopTool = { @@ -1023,7 +1026,12 @@ describeLive("live models (profile keys)", () => { break; } } - } + }); + + await runTasksWithConcurrency({ + tasks, + limit: LIVE_MODEL_CONCURRENCY, + }); if (failures.length > 0) { const preview = formatFailurePreview(failures, 20); @@ -1034,6 +1042,6 @@ describeLive("live models (profile keys)", () => { void skipped; }, - 15 * 60 * 1000, + LIVE_TEST_TIMEOUT_MS, ); });
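The converted loop hands its per-model closures to `runTasksWithConcurrency` from `src/utils/run-with-concurrency.ts`, which this patch imports but does not include. A minimal worker-pool sketch that matches the `{ tasks, limit }` call shape might look like the following; this is an assumption about the helper's behavior, not the repository's actual implementation:

```ts
// Hypothetical sketch of src/utils/run-with-concurrency.ts; the real helper
// may differ in error handling and typing.
export async function runTasksWithConcurrency<T>(options: {
  tasks: Array<() => Promise<T>>;
  limit: number;
}): Promise<T[]> {
  const { tasks, limit } = options;
  const results = new Array<T>(tasks.length);
  let next = 0;

  // Each worker synchronously claims the next unstarted index before its
  // first await, so at most `limit` tasks are in flight at any moment.
  const worker = async (): Promise<void> => {
    while (next < tasks.length) {
      const index = next++;
      results[index] = await tasks[index]();
    }
  };

  const poolSize = Math.max(1, Math.min(limit, tasks.length));
  await Promise.all(Array.from({ length: poolSize }, () => worker()));
  return results;
}
```

At the default `OPENCLAW_LIVE_MODEL_CONCURRENCY=1` a single worker drains the tasks in order, so behavior matches the sequential `for...of` loop this patch replaces; higher values interleave probes, which is why the heartbeat log line now reports `concurrency=`.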