From 364ec537850072a2275d246adc67a64f07c1f3f6 Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Fri, 1 May 2026 21:42:12 +0100
Subject: [PATCH] test(release): prefer GPT-5.5 smoke models

---
 AGENTS.md                                        |  2 +-
 docs/ci.md                                       |  2 +-
 docs/help/testing.md                             |  2 +-
 docs/reference/RELEASING.md                      |  2 +-
 scripts/docker/install-sh-e2e/run.sh             |  6 ++----
 .../upgrade-survivor/config-recipe/agents.json   |  6 +++---
 scripts/e2e/openwebui-docker.sh                  |  6 +++---
 scripts/e2e/parallels/provider-auth.ts           |  4 ++--
 scripts/lib/openclaw-test-state.mjs              | 12 ++++++------
 scripts/release-check.ts                         |  2 +-
 .../auto-reply/trigger-handling-test-harness.ts  |  2 +-
 test/scripts/parallels-smoke-model.test.ts       | 15 ++++++---------
 12 files changed, 28 insertions(+), 33 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index cbcdfc16877..9f216346dfb 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -125,7 +125,7 @@ Telegraph style. Root rules only. Read scoped `AGENTS.md` before subtree work.
 
 ## Tests
 
-- Vitest. Colocated `*.test.ts`; e2e `*.e2e.test.ts`; example models `sonnet-4.6`, `gpt-5.4`.
+- Vitest. Colocated `*.test.ts`; e2e `*.e2e.test.ts`; example models `sonnet-4.6`, `gpt-5.5`; GPT in tests: prefer 5.5, 5.4 ok; no GPT-4.x agent-smoke defaults.
 - Avoid brittle tests that grep workflow/docs strings for operator policy. Prefer executable behavior, parsed config/schema checks, or live run proof; put release/CI policy reminders in AGENTS/docs instead.
 - Clean timers/env/globals/mocks/sockets/temp dirs/module state; `--isolate=false` safe.
 - Hot tests: avoid per-test `vi.resetModules()` + heavy imports. Measure with `pnpm test:perf:imports <file>` / `pnpm test:perf:hotspots --limit N`.
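The new AGENTS.md bullet reads as an executable policy. A minimal Vitest sketch of such a guard, assuming a hypothetical `agentSmokeDefaults()` helper standing in for wherever a repo actually collects its agent-smoke default model ids (not part of this patch):

import { describe, expect, it } from "vitest";

// Hypothetical helper, for illustration only; the real defaults live in
// the scripts patched below.
function agentSmokeDefaults(): string[] {
  return ["openai/gpt-5.5", "openai/gpt-5.4-mini"];
}

describe("agent-smoke model policy", () => {
  it("prefers GPT-5.5 and never defaults to GPT-4.x", () => {
    const models = agentSmokeDefaults();
    expect(models[0]).toBe("openai/gpt-5.5");
    for (const model of models) {
      expect(model).not.toMatch(/^openai\/gpt-4/);
    }
  });
});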
diff --git a/docs/ci.md b/docs/ci.md
index ca0312e006f..3bd293e4c8a 100644
--- a/docs/ci.md
+++ b/docs/ci.md
@@ -188,7 +188,7 @@ Keep `workflow_ref` and `package_ref` separate. `workflow_ref` is the trusted wo
 
 The `package` profile uses offline plugin coverage so published-package validation is not gated on live ClawHub availability. The optional Telegram lane reuses the `package-under-test` artifact in `NPM Telegram Beta E2E`, with the published npm spec path kept for standalone dispatches.
 
-Release checks call Package Acceptance with `source=ref`, `package_ref=<ref>`, `workflow_ref=<ref>`, `suite_profile=custom`, `docker_lanes='plugins-offline plugin-update'`, and `telegram_mode=mock-openai`. Release-path Docker chunks cover the overlapping package/update/plugin lanes; Package Acceptance keeps offline plugin, update, and Telegram proof against the same resolved package tarball. Cross-OS release checks still cover OS-specific onboarding, installer, and platform behavior; package/update product validation should start with Package Acceptance. The `published-upgrade-survivor` Docker lane validates one published package baseline per run. In Package Acceptance, the resolved `package-under-test` tarball is always the candidate and `published_upgrade_survivor_baseline` selects the fallback published baseline, defaulting to `openclaw@latest`; failed-lane rerun commands preserve that baseline. Set `published_upgrade_survivor_baselines=release-history` to expand the lane across a deduped history matrix: the latest six stable releases, `2026.4.23`, and the latest stable release before `2026-03-15`. Set `published_upgrade_survivor_scenarios=reported-issues` to expand the same baselines across issue-shaped fixtures for Feishu config, preserved bootstrap/persona files, tilde log paths, and stale legacy plugin dependency roots. Local aggregate runs can pass exact package specs with `OPENCLAW_UPGRADE_SURVIVOR_BASELINE_SPECS`, keep a single lane with `OPENCLAW_UPGRADE_SURVIVOR_BASELINE_SPEC` such as `openclaw@2026.4.15`, or set `OPENCLAW_UPGRADE_SURVIVOR_SCENARIOS` for the scenario matrix. The published lane configures the baseline with a baked `openclaw config set` command recipe, records recipe steps in `summary.json`, and probes `/healthz`, `/readyz`, plus RPC status after Gateway start. The Windows packaged and installer fresh lanes also verify that an installed package can import a browser-control override from a raw absolute Windows path. The OpenAI cross-OS agent-turn smoke defaults to `OPENCLAW_CROSS_OS_OPENAI_MODEL` when set, otherwise `openai/gpt-5.4-mini`, so the install and gateway proof stays fast and deterministic.
+Release checks call Package Acceptance with `source=ref`, `package_ref=<ref>`, `workflow_ref=<ref>`, `suite_profile=custom`, `docker_lanes='plugins-offline plugin-update'`, and `telegram_mode=mock-openai`. Release-path Docker chunks cover the overlapping package/update/plugin lanes; Package Acceptance keeps offline plugin, update, and Telegram proof against the same resolved package tarball. Cross-OS release checks still cover OS-specific onboarding, installer, and platform behavior; package/update product validation should start with Package Acceptance. The `published-upgrade-survivor` Docker lane validates one published package baseline per run. In Package Acceptance, the resolved `package-under-test` tarball is always the candidate and `published_upgrade_survivor_baseline` selects the fallback published baseline, defaulting to `openclaw@latest`; failed-lane rerun commands preserve that baseline. Set `published_upgrade_survivor_baselines=release-history` to expand the lane across a deduped history matrix: the latest six stable releases, `2026.4.23`, and the latest stable release before `2026-03-15`. Set `published_upgrade_survivor_scenarios=reported-issues` to expand the same baselines across issue-shaped fixtures for Feishu config, preserved bootstrap/persona files, tilde log paths, and stale legacy plugin dependency roots. Local aggregate runs can pass exact package specs with `OPENCLAW_UPGRADE_SURVIVOR_BASELINE_SPECS`, keep a single lane with `OPENCLAW_UPGRADE_SURVIVOR_BASELINE_SPEC` such as `openclaw@2026.4.15`, or set `OPENCLAW_UPGRADE_SURVIVOR_SCENARIOS` for the scenario matrix. The published lane configures the baseline with a baked `openclaw config set` command recipe, records recipe steps in `summary.json`, and probes `/healthz`, `/readyz`, plus RPC status after Gateway start. The Windows packaged and installer fresh lanes also verify that an installed package can import a browser-control override from a raw absolute Windows path. The OpenAI cross-OS agent-turn smoke defaults to `OPENCLAW_CROSS_OS_OPENAI_MODEL` when set, otherwise `openai/gpt-5.5`, so the install and gateway proof stays on the preferred GPT-5 test model.
 
 ### Legacy compatibility windows
 
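The cross-OS model-fallback rule described in the paragraph above is simple enough to state as code; a sketch with a hypothetical function name (the actual resolution happens inside the workflow scripts):

// Hypothetical resolver, for illustration: mirrors the documented rule that
// OPENCLAW_CROSS_OS_OPENAI_MODEL wins when set, otherwise openai/gpt-5.5.
export function resolveCrossOsOpenAiModel(env: NodeJS.ProcessEnv = process.env): string {
  const override = env.OPENCLAW_CROSS_OS_OPENAI_MODEL?.trim();
  return override || "openai/gpt-5.5";
}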
diff --git a/docs/help/testing.md b/docs/help/testing.md
index 6c209f47c5f..f037a71121b 100644
--- a/docs/help/testing.md
+++ b/docs/help/testing.md
@@ -248,7 +248,7 @@ gh workflow run package-acceptance.yml --ref main \
 - Use `--platform macos`, `--platform windows`, or `--platform linux` while
   iterating on one guest. Use `--json` for the summary artifact path and
   per-lane status.
-- The OpenAI lane uses `openai/gpt-5.4` for the live agent-turn proof by
+- The OpenAI lane uses `openai/gpt-5.5` for the live agent-turn proof by
   default. Pass `--model <model>` or set `OPENCLAW_PARALLELS_OPENAI_MODEL` when
   deliberately validating another OpenAI model.

diff --git a/docs/reference/RELEASING.md b/docs/reference/RELEASING.md
index 4332bb85f5b..43b3841d065 100644
--- a/docs/reference/RELEASING.md
+++ b/docs/reference/RELEASING.md
@@ -276,7 +276,7 @@ ref once as `release-package-under-test` and reuses that artifact in both
 release-path Docker checks and Package Acceptance. This keeps all
 package-facing boxes on the same bytes and avoids repeated package builds.
 The cross-OS OpenAI install smoke uses `OPENCLAW_CROSS_OS_OPENAI_MODEL` when the
-repo/org variable is set, otherwise `openai/gpt-5.4-mini`, because this lane is
+repo/org variable is set, otherwise `openai/gpt-5.5`, because this lane is
 proving package install, onboarding, gateway startup, and one live agent turn
 rather than benchmarking the slowest default model. The broader live provider
 matrix remains the place for model-specific coverage.

diff --git a/scripts/docker/install-sh-e2e/run.sh b/scripts/docker/install-sh-e2e/run.sh
index 5b4e10dfb7c..e4c4a59e73f 100755
--- a/scripts/docker/install-sh-e2e/run.sh
+++ b/scripts/docker/install-sh-e2e/run.sh
@@ -605,11 +605,9 @@ run_profile() {
   if [[ "$agent_model_provider" == "openai" ]]; then
     agent_model="$(set_agent_model "$profile" \
       "openai/gpt-5.5" \
-      "openai/gpt-4o-mini" \
-      "openai/gpt-4o")"
+      "openai/gpt-5.4-mini")"
     image_model="$(set_image_model "$profile" \
-      "openai/gpt-4o-mini" \
-      "openai/gpt-4o")"
+      "openai/gpt-5.4-image-2")"
   else
     agent_model="$(set_agent_model "$profile" \
       "anthropic/claude-opus-4-6" \

diff --git a/scripts/e2e/lib/upgrade-survivor/config-recipe/agents.json b/scripts/e2e/lib/upgrade-survivor/config-recipe/agents.json
index cd9d479f513..bf5a663613a 100644
--- a/scripts/e2e/lib/upgrade-survivor/config-recipe/agents.json
+++ b/scripts/e2e/lib/upgrade-survivor/config-recipe/agents.json
@@ -1,7 +1,7 @@
 {
   "defaults": {
     "model": {
-      "primary": "openai/gpt-4.1-mini"
+      "primary": "openai/gpt-5.5"
     },
     "contextTokens": 64000
   },
@@ -12,7 +12,7 @@
       "name": "Main",
       "workspace": "~/workspace",
       "model": {
-        "primary": "openai/gpt-4.1-mini"
+        "primary": "openai/gpt-5.5"
       },
       "thinkingDefault": "low",
       "skills": ["memory"]
@@ -22,7 +22,7 @@
       "name": "Ops",
       "workspace": "~/workspace/ops",
       "model": {
-        "primary": "openai/gpt-4.1-mini"
+        "primary": "openai/gpt-5.5"
      },
       "fastModeDefault": true
     }
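The `set_agent_model` calls in the run.sh hunk above pass a preference-ordered candidate list. In TypeScript terms the selection behaves roughly like this sketch (hypothetical `isAvailable` probe; the real logic is the shell function in run.sh):

// Pick the first candidate the probe accepts; fall back to the most-preferred
// entry when nothing probes as available. Illustrative sketch only.
export function pickAgentModel(
  candidates: string[],
  isAvailable: (model: string) => boolean,
): string {
  return candidates.find(isAvailable) ?? candidates[0];
}

// e.g. pickAgentModel(["openai/gpt-5.5", "openai/gpt-5.4-mini"], probe)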
+MODEL="${OPENCLAW_OPENWEBUI_MODEL:-openai/gpt-5.5}" PROMPT_NONCE="OPENWEBUI_DOCKER_E2E_$(date +%s)_$$" PROMPT="${OPENCLAW_OPENWEBUI_PROMPT:-Reply with exactly this token and nothing else: ${PROMPT_NONCE}}" PORT="${OPENCLAW_OPENWEBUI_GATEWAY_PORT:-18789}" diff --git a/scripts/e2e/parallels/provider-auth.ts b/scripts/e2e/parallels/provider-auth.ts index 48c17522532..030a2ceabc9 100644 --- a/scripts/e2e/parallels/provider-auth.ts +++ b/scripts/e2e/parallels/provider-auth.ts @@ -42,7 +42,7 @@ export function resolveProviderAuth(input: { apiKeyEnv: input.apiKeyEnv || "OPENAI_API_KEY", authChoice: "openai-api-key", authKeyFlag: "openai-api-key", - modelId: input.modelId || process.env.OPENCLAW_PARALLELS_OPENAI_MODEL || "openai/gpt-5.4", + modelId: input.modelId || process.env.OPENCLAW_PARALLELS_OPENAI_MODEL || "openai/gpt-5.5", }, }; const resolved = providerDefaults[input.provider]; @@ -69,7 +69,7 @@ export function resolveWindowsProviderAuth(input: { if (process.env.OPENCLAW_PARALLELS_OPENAI_MODEL?.trim()) { return auth; } - return { ...auth, modelId: "openai/gpt-4.1-mini" }; + return { ...auth, modelId: "openai/gpt-5.5" }; } export function providerIdFromModelId(modelId: string): string { diff --git a/scripts/lib/openclaw-test-state.mjs b/scripts/lib/openclaw-test-state.mjs index fce0e5e1563..d4cd1f2284c 100644 --- a/scripts/lib/openclaw-test-state.mjs +++ b/scripts/lib/openclaw-test-state.mjs @@ -117,7 +117,7 @@ function scenarioConfig(scenario, options = {}) { agents: { defaults: { model: { - primary: "openai/gpt-4.1-mini", + primary: "openai/gpt-5.5", }, contextTokens: 64000, skills: ["memory"], @@ -129,7 +129,7 @@ function scenarioConfig(scenario, options = {}) { name: "Main", workspace: "~/workspace", model: { - primary: "openai/gpt-4.1-mini", + primary: "openai/gpt-5.5", }, thinkingDefault: "low", skills: ["memory"], @@ -140,7 +140,7 @@ function scenarioConfig(scenario, options = {}) { name: "Ops", workspace: "~/workspace/ops", model: { - primary: "openai/gpt-4.1-mini", + primary: "openai/gpt-5.5", }, fastModeDefault: true, }, @@ -433,7 +433,7 @@ OPENCLAW_TEST_STATE_JSON "agents": { "defaults": { "model": { - "primary": "openai/gpt-4.1-mini" + "primary": "openai/gpt-5.5" }, "contextTokens": 64000, "skills": [ @@ -447,7 +447,7 @@ OPENCLAW_TEST_STATE_JSON "name": "Main", "workspace": "~/workspace", "model": { - "primary": "openai/gpt-4.1-mini" + "primary": "openai/gpt-5.5" }, "thinkingDefault": "low", "skills": [ @@ -460,7 +460,7 @@ OPENCLAW_TEST_STATE_JSON "name": "Ops", "workspace": "~/workspace/ops", "model": { - "primary": "openai/gpt-4.1-mini" + "primary": "openai/gpt-5.5" }, "fastModeDefault": true } diff --git a/scripts/release-check.ts b/scripts/release-check.ts index 990f0924822..cc558a6dc40 100755 --- a/scripts/release-check.ts +++ b/scripts/release-check.ts @@ -322,7 +322,7 @@ function writePackedBundledPluginActivationConfig(homeDir: string): void { { agents: { defaults: { - model: { primary: "openai/gpt-4.1-mini" }, + model: { primary: "openai/gpt-5.5" }, }, }, channels: { diff --git a/test/helpers/auto-reply/trigger-handling-test-harness.ts b/test/helpers/auto-reply/trigger-handling-test-harness.ts index 177ad51075a..d00302253f4 100644 --- a/test/helpers/auto-reply/trigger-handling-test-harness.ts +++ b/test/helpers/auto-reply/trigger-handling-test-harness.ts @@ -97,7 +97,7 @@ const modelCatalogMocks = getSharedMocks("openclaw.trigger-handling.model-catalo name: "Claude Opus 4.5 (OpenRouter)", contextWindow: 200000, }, - { provider: "openai", id: "gpt-4.1-mini", name: 
"GPT-4.1 mini" }, + { provider: "openai", id: "gpt-5.4-mini", name: "GPT-5.4 mini" }, { provider: "openai", id: "gpt-5.5", name: "GPT-5.5" }, { provider: "openai-codex", id: "gpt-5.5", name: "GPT-5.5 (Codex)" }, { provider: "minimax", id: "MiniMax-M2.7", name: "MiniMax M2.7" }, diff --git a/test/scripts/parallels-smoke-model.test.ts b/test/scripts/parallels-smoke-model.test.ts index eccb60c3fef..5d4b3ba57c0 100644 --- a/test/scripts/parallels-smoke-model.test.ts +++ b/test/scripts/parallels-smoke-model.test.ts @@ -88,8 +88,7 @@ describe("Parallels smoke model selection", () => { expect(providerAuth).toContain("OPENCLAW_PARALLELS_OPENAI_MODEL"); expect(providerAuth).toContain("OPENCLAW_PARALLELS_WINDOWS_OPENAI_MODEL"); - expect(providerAuth).toContain("openai/gpt-5.4"); - expect(providerAuth).toContain("openai/gpt-4.1-mini"); + expect(providerAuth).toContain("openai/gpt-5.5"); expect(providerAuth).toContain('authChoice: "openai-api-key"'); expect(providerAuth).toContain('authChoice: "apiKey"'); expect(providerAuth).toContain('authChoice: "minimax-global-api"'); @@ -106,16 +105,14 @@ describe("Parallels smoke model selection", () => { it("writes full model ids as config map keys in provider batches", () => { const source = ` import { modelProviderConfigBatchJson } from "./${TS_PATHS.common}"; -const result = modelProviderConfigBatchJson("openai/gpt-4.1-mini", "windows"); +const result = modelProviderConfigBatchJson("openai/gpt-5.5", "windows"); console.log(result); `; const batch = JSON.parse(runTsEval(source, { OPENAI_API_KEY: "sk-openai" })) as Array<{ path: string; }>; - expect(batch.map((entry) => entry.path)).toContain( - 'agents.defaults.models["openai/gpt-4.1-mini"]', - ); + expect(batch.map((entry) => entry.path)).toContain('agents.defaults.models["openai/gpt-5.5"]'); }); it("keeps snapshot, host, package, and quote helpers shared", () => { @@ -242,7 +239,7 @@ console.log(resolveUbuntuVmName("Ubuntu missing")); apiKeyValue: "sk-openai", authChoice: "openai-api-key", authKeyFlag: "openai-api-key", - modelId: "openai/gpt-5.4", + modelId: "openai/gpt-5.5", }); expect( @@ -260,7 +257,7 @@ console.log(resolveUbuntuVmName("Ubuntu missing")); }); }); - it("uses the faster OpenAI model for Windows smoke unless overridden", () => { + it("uses the shared GPT-5 OpenAI model for Windows smoke unless overridden", () => { const source = ` import { resolveWindowsProviderAuth } from "./${TS_PATHS.common}"; const result = resolveWindowsProviderAuth({ @@ -270,7 +267,7 @@ console.log(JSON.stringify(result)); `; expect(JSON.parse(runTsEval(source, { OPENAI_API_KEY: "sk-openai" }))).toMatchObject({ apiKeyEnv: "OPENAI_API_KEY", - modelId: "openai/gpt-4.1-mini", + modelId: "openai/gpt-5.5", }); expect(