test(release): prefer GPT-5.5 smoke models

This commit is contained in:
Peter Steinberger
2026-05-01 21:42:12 +01:00
parent ac8633debe
commit 364ec53785
12 changed files with 28 additions and 33 deletions

View File

@@ -125,7 +125,7 @@ Telegraph style. Root rules only. Read scoped `AGENTS.md` before subtree work.
## Tests
- Vitest. Colocated `*.test.ts`; e2e `*.e2e.test.ts`; example models `sonnet-4.6`, `gpt-5.4`.
- Vitest. Colocated `*.test.ts`; e2e `*.e2e.test.ts`; example models `sonnet-4.6`, `gpt-5.5`; for GPT test models prefer 5.5, accept 5.4, never default agent smokes to GPT-4.x.
- Avoid brittle tests that grep workflow/docs strings for operator policy. Prefer executable behavior, parsed config/schema checks, or live run proof; put release/CI policy reminders in AGENTS/docs instead.
- Clean timers/env/globals/mocks/sockets/temp dirs/module state; `--isolate=false` safe.
- Hot tests: avoid per-test `vi.resetModules()` + heavy imports. Measure with `pnpm test:perf:imports <file>` / `pnpm test:perf:hotspots --limit N`.

View File

@@ -188,7 +188,7 @@ Keep `workflow_ref` and `package_ref` separate. `workflow_ref` is the trusted wo
The `package` profile uses offline plugin coverage so published-package validation is not gated on live ClawHub availability. The optional Telegram lane reuses the `package-under-test` artifact in `NPM Telegram Beta E2E`, with the published npm spec path kept for standalone dispatches.
Release checks call Package Acceptance with `source=ref`, `package_ref=<release-ref>`, `workflow_ref=<release workflow ref>`, `suite_profile=custom`, `docker_lanes='plugins-offline plugin-update'`, and `telegram_mode=mock-openai`. Release-path Docker chunks cover the overlapping package/update/plugin lanes; Package Acceptance keeps offline plugin, update, and Telegram proof against the same resolved package tarball. Cross-OS release checks still cover OS-specific onboarding, installer, and platform behavior; package/update product validation should start with Package Acceptance. The `published-upgrade-survivor` Docker lane validates one published package baseline per run. In Package Acceptance, the resolved `package-under-test` tarball is always the candidate and `published_upgrade_survivor_baseline` selects the fallback published baseline, defaulting to `openclaw@latest`; failed-lane rerun commands preserve that baseline. Set `published_upgrade_survivor_baselines=release-history` to expand the lane across a deduped history matrix: the latest six stable releases, `2026.4.23`, and the latest stable release before `2026-03-15`. Set `published_upgrade_survivor_scenarios=reported-issues` to expand the same baselines across issue-shaped fixtures for Feishu config, preserved bootstrap/persona files, tilde log paths, and stale legacy plugin dependency roots. Local aggregate runs can pass exact package specs with `OPENCLAW_UPGRADE_SURVIVOR_BASELINE_SPECS`, keep a single lane with `OPENCLAW_UPGRADE_SURVIVOR_BASELINE_SPEC` such as `openclaw@2026.4.15`, or set `OPENCLAW_UPGRADE_SURVIVOR_SCENARIOS` for the scenario matrix. The published lane configures the baseline with a baked `openclaw config set` command recipe, records recipe steps in `summary.json`, and probes `/healthz`, `/readyz`, plus RPC status after Gateway start. 
The Windows packaged and installer fresh lanes also verify that an installed package can import a browser-control override from a raw absolute Windows path. The OpenAI cross-OS agent-turn smoke defaults to `OPENCLAW_CROSS_OS_OPENAI_MODEL` when set, otherwise `openai/gpt-5.4-mini`, so the install and gateway proof stays fast and deterministic.
Release checks call Package Acceptance with `source=ref`, `package_ref=<release-ref>`, `workflow_ref=<release workflow ref>`, `suite_profile=custom`, `docker_lanes='plugins-offline plugin-update'`, and `telegram_mode=mock-openai`. Release-path Docker chunks cover the overlapping package/update/plugin lanes; Package Acceptance keeps offline plugin, update, and Telegram proof against the same resolved package tarball. Cross-OS release checks still cover OS-specific onboarding, installer, and platform behavior; package/update product validation should start with Package Acceptance. The `published-upgrade-survivor` Docker lane validates one published package baseline per run. In Package Acceptance, the resolved `package-under-test` tarball is always the candidate and `published_upgrade_survivor_baseline` selects the fallback published baseline, defaulting to `openclaw@latest`; failed-lane rerun commands preserve that baseline. Set `published_upgrade_survivor_baselines=release-history` to expand the lane across a deduped history matrix: the latest six stable releases, `2026.4.23`, and the latest stable release before `2026-03-15`. Set `published_upgrade_survivor_scenarios=reported-issues` to expand the same baselines across issue-shaped fixtures for Feishu config, preserved bootstrap/persona files, tilde log paths, and stale legacy plugin dependency roots. Local aggregate runs can pass exact package specs with `OPENCLAW_UPGRADE_SURVIVOR_BASELINE_SPECS`, keep a single lane with `OPENCLAW_UPGRADE_SURVIVOR_BASELINE_SPEC` such as `openclaw@2026.4.15`, or set `OPENCLAW_UPGRADE_SURVIVOR_SCENARIOS` for the scenario matrix. The published lane configures the baseline with a baked `openclaw config set` command recipe, records recipe steps in `summary.json`, and probes `/healthz`, `/readyz`, plus RPC status after Gateway start. 
The Windows packaged and installer fresh lanes also verify that an installed package can import a browser-control override from a raw absolute Windows path. The OpenAI cross-OS agent-turn smoke defaults to `OPENCLAW_CROSS_OS_OPENAI_MODEL` when set, otherwise `openai/gpt-5.5`, so the install and gateway proof stays on the preferred GPT-5 test model.
### Legacy compatibility windows

View File

@@ -248,7 +248,7 @@ gh workflow run package-acceptance.yml --ref main \
- Use `--platform macos`, `--platform windows`, or `--platform linux` while
iterating on one guest. Use `--json` for the summary artifact path and
per-lane status.
- The OpenAI lane uses `openai/gpt-5.4` for the live agent-turn proof by
- The OpenAI lane uses `openai/gpt-5.5` for the live agent-turn proof by
default. Pass `--model <provider/model>` or set
`OPENCLAW_PARALLELS_OPENAI_MODEL` when deliberately validating another
OpenAI model.

View File

@@ -276,7 +276,7 @@ ref once as `release-package-under-test` and reuses that artifact in both
release-path Docker checks and Package Acceptance. This keeps all
package-facing boxes on the same bytes and avoids repeated package builds.
The cross-OS OpenAI install smoke uses `OPENCLAW_CROSS_OS_OPENAI_MODEL` when the
repo/org variable is set, otherwise `openai/gpt-5.4-mini`, because this lane is
repo/org variable is set, otherwise `openai/gpt-5.5`, because this lane is
proving package install, onboarding, gateway startup, and one live agent turn
rather than benchmarking the slowest default model. The broader live provider
matrix remains the place for model-specific coverage.

View File

@@ -605,11 +605,9 @@ run_profile() {
if [[ "$agent_model_provider" == "openai" ]]; then
agent_model="$(set_agent_model "$profile" \
"openai/gpt-5.5" \
"openai/gpt-4o-mini" \
"openai/gpt-4o")"
"openai/gpt-5.4-mini")"
image_model="$(set_image_model "$profile" \
"openai/gpt-4o-mini" \
"openai/gpt-4o")"
"openai/gpt-5.4-image-2")"
else
agent_model="$(set_agent_model "$profile" \
"anthropic/claude-opus-4-6" \

View File

@@ -1,7 +1,7 @@
{
"defaults": {
"model": {
"primary": "openai/gpt-4.1-mini"
"primary": "openai/gpt-5.5"
},
"contextTokens": 64000
},
@@ -12,7 +12,7 @@
"name": "Main",
"workspace": "~/workspace",
"model": {
"primary": "openai/gpt-4.1-mini"
"primary": "openai/gpt-5.5"
},
"thinkingDefault": "low",
"skills": ["memory"]
@@ -22,7 +22,7 @@
"name": "Ops",
"workspace": "~/workspace/ops",
"model": {
"primary": "openai/gpt-4.1-mini"
"primary": "openai/gpt-5.5"
},
"fastModeDefault": true
}

View File

@@ -8,9 +8,9 @@ source "$ROOT_DIR/scripts/lib/docker-e2e-image.sh"
IMAGE_NAME="$(docker_e2e_resolve_image "openclaw-openwebui-e2e" OPENCLAW_OPENWEBUI_E2E_IMAGE)"
OPENWEBUI_IMAGE="${OPENWEBUI_IMAGE:-ghcr.io/open-webui/open-webui:v0.8.10}"
# Keep the default on a broadly available non-reasoning OpenAI model for
# Open WebUI compatibility smoke. Callers can still override this explicitly.
MODEL="${OPENCLAW_OPENWEBUI_MODEL:-openai/gpt-4.1-mini}"
# Keep the default on the preferred GPT-5 OpenAI model for Open WebUI
# compatibility smoke. Callers can still override this explicitly.
MODEL="${OPENCLAW_OPENWEBUI_MODEL:-openai/gpt-5.5}"
PROMPT_NONCE="OPENWEBUI_DOCKER_E2E_$(date +%s)_$$"
PROMPT="${OPENCLAW_OPENWEBUI_PROMPT:-Reply with exactly this token and nothing else: ${PROMPT_NONCE}}"
PORT="${OPENCLAW_OPENWEBUI_GATEWAY_PORT:-18789}"

View File

@@ -42,7 +42,7 @@ export function resolveProviderAuth(input: {
apiKeyEnv: input.apiKeyEnv || "OPENAI_API_KEY",
authChoice: "openai-api-key",
authKeyFlag: "openai-api-key",
modelId: input.modelId || process.env.OPENCLAW_PARALLELS_OPENAI_MODEL || "openai/gpt-5.4",
modelId: input.modelId || process.env.OPENCLAW_PARALLELS_OPENAI_MODEL || "openai/gpt-5.5",
},
};
const resolved = providerDefaults[input.provider];
@@ -69,7 +69,7 @@ export function resolveWindowsProviderAuth(input: {
if (process.env.OPENCLAW_PARALLELS_OPENAI_MODEL?.trim()) {
return auth;
}
return { ...auth, modelId: "openai/gpt-4.1-mini" };
return { ...auth, modelId: "openai/gpt-5.5" };
}
export function providerIdFromModelId(modelId: string): string {

View File

@@ -117,7 +117,7 @@ function scenarioConfig(scenario, options = {}) {
agents: {
defaults: {
model: {
primary: "openai/gpt-4.1-mini",
primary: "openai/gpt-5.5",
},
contextTokens: 64000,
skills: ["memory"],
@@ -129,7 +129,7 @@ function scenarioConfig(scenario, options = {}) {
name: "Main",
workspace: "~/workspace",
model: {
primary: "openai/gpt-4.1-mini",
primary: "openai/gpt-5.5",
},
thinkingDefault: "low",
skills: ["memory"],
@@ -140,7 +140,7 @@ function scenarioConfig(scenario, options = {}) {
name: "Ops",
workspace: "~/workspace/ops",
model: {
primary: "openai/gpt-4.1-mini",
primary: "openai/gpt-5.5",
},
fastModeDefault: true,
},
@@ -433,7 +433,7 @@ OPENCLAW_TEST_STATE_JSON
"agents": {
"defaults": {
"model": {
"primary": "openai/gpt-4.1-mini"
"primary": "openai/gpt-5.5"
},
"contextTokens": 64000,
"skills": [
@@ -447,7 +447,7 @@ OPENCLAW_TEST_STATE_JSON
"name": "Main",
"workspace": "~/workspace",
"model": {
"primary": "openai/gpt-4.1-mini"
"primary": "openai/gpt-5.5"
},
"thinkingDefault": "low",
"skills": [
@@ -460,7 +460,7 @@ OPENCLAW_TEST_STATE_JSON
"name": "Ops",
"workspace": "~/workspace/ops",
"model": {
"primary": "openai/gpt-4.1-mini"
"primary": "openai/gpt-5.5"
},
"fastModeDefault": true
}

View File

@@ -322,7 +322,7 @@ function writePackedBundledPluginActivationConfig(homeDir: string): void {
{
agents: {
defaults: {
model: { primary: "openai/gpt-4.1-mini" },
model: { primary: "openai/gpt-5.5" },
},
},
channels: {

View File

@@ -97,7 +97,7 @@ const modelCatalogMocks = getSharedMocks("openclaw.trigger-handling.model-catalo
name: "Claude Opus 4.5 (OpenRouter)",
contextWindow: 200000,
},
{ provider: "openai", id: "gpt-4.1-mini", name: "GPT-4.1 mini" },
{ provider: "openai", id: "gpt-5.4-mini", name: "GPT-5.4 mini" },
{ provider: "openai", id: "gpt-5.5", name: "GPT-5.5" },
{ provider: "openai-codex", id: "gpt-5.5", name: "GPT-5.5 (Codex)" },
{ provider: "minimax", id: "MiniMax-M2.7", name: "MiniMax M2.7" },

View File

@@ -88,8 +88,7 @@ describe("Parallels smoke model selection", () => {
expect(providerAuth).toContain("OPENCLAW_PARALLELS_OPENAI_MODEL");
expect(providerAuth).toContain("OPENCLAW_PARALLELS_WINDOWS_OPENAI_MODEL");
expect(providerAuth).toContain("openai/gpt-5.4");
expect(providerAuth).toContain("openai/gpt-4.1-mini");
expect(providerAuth).toContain("openai/gpt-5.5");
expect(providerAuth).toContain('authChoice: "openai-api-key"');
expect(providerAuth).toContain('authChoice: "apiKey"');
expect(providerAuth).toContain('authChoice: "minimax-global-api"');
@@ -106,16 +105,14 @@ describe("Parallels smoke model selection", () => {
it("writes full model ids as config map keys in provider batches", () => {
const source = `
import { modelProviderConfigBatchJson } from "./${TS_PATHS.common}";
const result = modelProviderConfigBatchJson("openai/gpt-4.1-mini", "windows");
const result = modelProviderConfigBatchJson("openai/gpt-5.5", "windows");
console.log(result);
`;
const batch = JSON.parse(runTsEval(source, { OPENAI_API_KEY: "sk-openai" })) as Array<{
path: string;
}>;
expect(batch.map((entry) => entry.path)).toContain(
'agents.defaults.models["openai/gpt-4.1-mini"]',
);
expect(batch.map((entry) => entry.path)).toContain('agents.defaults.models["openai/gpt-5.5"]');
});
it("keeps snapshot, host, package, and quote helpers shared", () => {
@@ -242,7 +239,7 @@ console.log(resolveUbuntuVmName("Ubuntu missing"));
apiKeyValue: "sk-openai",
authChoice: "openai-api-key",
authKeyFlag: "openai-api-key",
modelId: "openai/gpt-5.4",
modelId: "openai/gpt-5.5",
});
expect(
@@ -260,7 +257,7 @@ console.log(resolveUbuntuVmName("Ubuntu missing"));
});
});
it("uses the faster OpenAI model for Windows smoke unless overridden", () => {
it("uses the shared GPT-5 OpenAI model for Windows smoke unless overridden", () => {
const source = `
import { resolveWindowsProviderAuth } from "./${TS_PATHS.common}";
const result = resolveWindowsProviderAuth({
@@ -270,7 +267,7 @@ console.log(JSON.stringify(result));
`;
expect(JSON.parse(runTsEval(source, { OPENAI_API_KEY: "sk-openai" }))).toMatchObject({
apiKeyEnv: "OPENAI_API_KEY",
modelId: "openai/gpt-4.1-mini",
modelId: "openai/gpt-5.5",
});
expect(