From e973275fd0fd4b51bfbfe9c5e91be3fc736308a0 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 7 Apr 2026 16:05:39 +0100 Subject: [PATCH] fix: harden claude-cli live switch smoke --- docs/help/testing.md | 4 +- scripts/lib/live-docker-stage.sh | 9 ++- scripts/test-live-cli-backend-docker.sh | 1 + .../gateway-cli-backend.live-helpers.test.ts | 22 +++++++ .../gateway-cli-backend.live-helpers.ts | 23 +++++++ src/gateway/gateway-cli-backend.live.test.ts | 65 ++++++++++++++++++- 6 files changed, 119 insertions(+), 5 deletions(-) diff --git a/docs/help/testing.md b/docs/help/testing.md index 6a7b7b26f45..24bc96bb063 100644 --- a/docs/help/testing.md +++ b/docs/help/testing.md @@ -272,6 +272,7 @@ openclaw models list --json - `OPENCLAW_LIVE_CLI_BACKEND_IMAGE_ARG="--image"` to pass image file paths as CLI args instead of prompt injection. - `OPENCLAW_LIVE_CLI_BACKEND_IMAGE_MODE="repeat"` (or `"list"`) to control how image args are passed when `IMAGE_ARG` is set. - `OPENCLAW_LIVE_CLI_BACKEND_RESUME_PROBE=1` to send a second turn and validate resume flow. + - `OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE=0` to disable the default Claude Sonnet -> Opus same-session continuity probe (set to `1` to force it on when the selected model supports a switch target). Example: @@ -301,6 +302,7 @@ Notes: - It runs the live CLI-backend smoke inside the repo Docker image as the non-root `node` user. - It resolves CLI smoke metadata from the owning extension, then installs the matching Linux CLI package (`@anthropic-ai/claude-code`, `@openai/codex`, or `@google/gemini-cli`) into a cached writable prefix at `OPENCLAW_DOCKER_CLI_TOOLS_DIR` (default: `~/.cache/openclaw/docker-cli-tools`). - The live CLI-backend smoke now exercises the same end-to-end flow for Claude, Codex, and Gemini: text turn, image classification turn, then MCP `cron` tool call verified through the gateway CLI. 
+- Claude's default smoke also patches the session from Sonnet to Opus and verifies the resumed session still remembers an earlier note. ## Live: ACP bind smoke (`/acp spawn ... --bind here`) @@ -448,7 +450,7 @@ Live tests discover credentials the same way the CLI does. Practical implication - Per-agent auth profiles: `~/.openclaw/agents/<agentId>/agent/auth-profiles.json` (this is what “profile keys” means in the live tests) - Config: `~/.openclaw/openclaw.json` (or `OPENCLAW_CONFIG_PATH`) - Legacy state dir: `~/.openclaw/credentials/` (copied into the staged live home when present, but not the main profile-key store) -- Live local runs copy the active config, per-agent `auth-profiles.json` files, legacy `credentials/`, and supported external CLI auth dirs into a temp test home by default; `agents.*.workspace` / `agentDir` path overrides are stripped in that staged config so probes stay off your real host workspace. +- Live local runs copy the active config, per-agent `auth-profiles.json` files, legacy `credentials/`, and supported external CLI auth dirs into a temp test home by default; staged live homes skip `workspace/` and `sandboxes/`, and `agents.*.workspace` / `agentDir` path overrides are stripped so probes stay off your real host workspace. If you want to rely on env keys (e.g. exported in your `~/.profile`), run local tests after `source ~/.profile`, or use the Docker runners below (they can mount `~/.profile` into the container). diff --git a/scripts/lib/live-docker-stage.sh b/scripts/lib/live-docker-stage.sh index 61dc3d993e8..c7778a77209 100644 --- a/scripts/lib/live-docker-stage.sh +++ b/scripts/lib/live-docker-stage.sh @@ -40,7 +40,14 @@ openclaw_live_stage_state_dir() { mkdir -p "$dest_dir" if [ -d "$source_dir" ]; then - tar -C "$source_dir" --exclude=workspace -cf - . | tar -C "$dest_dir" -xf - + # Sandbox workspaces can accumulate root-owned artifacts from prior Docker + # runs.
They are not needed for live-test auth/config staging and can make + # temp-dir cleanup fail on exit, so keep them out of the staged state copy. + tar -C "$source_dir" \ + --exclude=workspace \ + --exclude=sandboxes \ + -cf - . | tar -C "$dest_dir" -xf - + chmod -R u+rwX "$dest_dir" || true if [ -d "$source_dir/workspace" ] && [ ! -e "$dest_dir/workspace" ]; then ln -s "$source_dir/workspace" "$dest_dir/workspace" fi diff --git a/scripts/test-live-cli-backend-docker.sh b/scripts/test-live-cli-backend-docker.sh index 254c11a8d60..aeeb28718be 100644 --- a/scripts/test-live-cli-backend-docker.sh +++ b/scripts/test-live-cli-backend-docker.sh @@ -210,6 +210,7 @@ docker run --rm -t \ -e OPENCLAW_LIVE_CLI_BACKEND_PRESERVE_ENV="${OPENCLAW_LIVE_CLI_BACKEND_PRESERVE_ENV:-}" \ -e OPENCLAW_LIVE_CLI_BACKEND_DISABLE_MCP_CONFIG="$CLI_DISABLE_MCP_CONFIG" \ -e OPENCLAW_LIVE_CLI_BACKEND_RESUME_PROBE="${OPENCLAW_LIVE_CLI_BACKEND_RESUME_PROBE:-}" \ + -e OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE="${OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE:-}" \ -e OPENCLAW_LIVE_CLI_BACKEND_IMAGE_PROBE="${OPENCLAW_LIVE_CLI_BACKEND_IMAGE_PROBE:-}" \ -e OPENCLAW_LIVE_CLI_BACKEND_IMAGE_ARG="${OPENCLAW_LIVE_CLI_BACKEND_IMAGE_ARG:-}" \ -e OPENCLAW_LIVE_CLI_BACKEND_IMAGE_MODE="${OPENCLAW_LIVE_CLI_BACKEND_IMAGE_MODE:-}" \ diff --git a/src/gateway/gateway-cli-backend.live-helpers.test.ts b/src/gateway/gateway-cli-backend.live-helpers.test.ts index 812bf5cf6c5..3d43b8f8af9 100644 --- a/src/gateway/gateway-cli-backend.live-helpers.test.ts +++ b/src/gateway/gateway-cli-backend.live-helpers.test.ts @@ -100,4 +100,26 @@ describe("gateway cli backend live helpers", () => { }); expect(gatewayClientState.lastOptions).not.toHaveProperty("requestTimeoutMs"); }); + + it("defaults the model switch probe to Claude Sonnet -> Opus", async () => { + const { resolveCliModelSwitchProbeTarget, shouldRunCliModelSwitchProbe } = + await import("./gateway-cli-backend.live-helpers.js"); + + delete 
process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE; + + expect(resolveCliModelSwitchProbeTarget("claude-cli", "claude-cli/claude-sonnet-4-6")).toBe( + "claude-cli/claude-opus-4-6", + ); + expect(shouldRunCliModelSwitchProbe("claude-cli", "claude-cli/claude-sonnet-4-6")).toBe(true); + expect(shouldRunCliModelSwitchProbe("claude-cli", "claude-cli/claude-opus-4-6")).toBe(false); + expect(shouldRunCliModelSwitchProbe("codex-cli", "codex-cli/gpt-5.4")).toBe(false); + }); + + it("lets env disable the model switch probe", async () => { + const { shouldRunCliModelSwitchProbe } = await import("./gateway-cli-backend.live-helpers.js"); + + process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE = "0"; + + expect(shouldRunCliModelSwitchProbe("claude-cli", "claude-cli/claude-sonnet-4-6")).toBe(false); + }); }); diff --git a/src/gateway/gateway-cli-backend.live-helpers.ts b/src/gateway/gateway-cli-backend.live-helpers.ts index c25b0d903f8..016c4c1df16 100644 --- a/src/gateway/gateway-cli-backend.live-helpers.ts +++ b/src/gateway/gateway-cli-backend.live-helpers.ts @@ -97,6 +97,29 @@ export function shouldRunCliMcpProbe(providerId: string): boolean { return resolveCliBackendLiveTest(providerId)?.defaultMcpProbe === true; } +export function resolveCliModelSwitchProbeTarget( + providerId: string, + modelRef: string, +): string | undefined { + const normalizedProvider = providerId.trim().toLowerCase(); + const normalizedModelRef = modelRef.trim().toLowerCase(); + if (normalizedProvider !== "claude-cli") { + return undefined; + } + if (normalizedModelRef !== "claude-cli/claude-sonnet-4-6") { + return undefined; + } + return "claude-cli/claude-opus-4-6"; +} + +export function shouldRunCliModelSwitchProbe(providerId: string, modelRef: string): boolean { + const raw = process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE?.trim(); + if (raw) { + return isTruthyEnvValue(raw); + } + return typeof resolveCliModelSwitchProbeTarget(providerId, modelRef) === "string"; +} + export 
function matchesCliBackendReply(text: string, expected: string): boolean { const normalized = text.trim(); const target = expected.trim(); diff --git a/src/gateway/gateway-cli-backend.live.test.ts b/src/gateway/gateway-cli-backend.live.test.ts index cce304282f5..bce1ec7d336 100644 --- a/src/gateway/gateway-cli-backend.live.test.ts +++ b/src/gateway/gateway-cli-backend.live.test.ts @@ -16,8 +16,10 @@ import { matchesCliBackendReply, parseImageMode, parseJsonStringArray, + resolveCliModelSwitchProbeTarget, restoreCliBackendLiveEnv, shouldRunCliImageProbe, + shouldRunCliModelSwitchProbe, shouldRunCliMcpProbe, snapshotCliBackendLiveEnv, type SystemPromptReport, @@ -81,11 +83,17 @@ describeLive("gateway live (cli backend)", () => { const backendResolved = resolveCliBackendConfig(providerId); const enableCliImageProbe = shouldRunCliImageProbe(providerId); const enableCliMcpProbe = shouldRunCliMcpProbe(providerId); + const enableCliModelSwitchProbe = shouldRunCliModelSwitchProbe(providerId, modelKey); + const modelSwitchTarget = enableCliModelSwitchProbe + ? resolveCliModelSwitchProbeTarget(providerId, modelKey) + : undefined; logCliBackendLiveStep("model-selected", { providerId, modelKey, enableCliImageProbe, enableCliMcpProbe, + enableCliModelSwitchProbe, + modelSwitchTarget, }); const providerDefaults = backendResolved?.config; @@ -173,7 +181,10 @@ describeLive("gateway live (cli backend)", () => { ...cfg.agents?.defaults, ...(bootstrapWorkspace ? { workspace: bootstrapWorkspace.workspaceRootDir } : {}), model: { primary: modelKey }, - models: { [modelKey]: {} }, + models: { + [modelKey]: {}, + ...(modelSwitchTarget ? 
{ [modelSwitchTarget]: {} } : {}), + }, cliBackends: { ...existingBackends, [providerId]: { @@ -216,6 +227,8 @@ describeLive("gateway live (cli backend)", () => { try { const sessionKey = "agent:dev:live-cli-backend"; const nonce = randomBytes(3).toString("hex").toUpperCase(); + const memoryNonce = randomBytes(3).toString("hex").toUpperCase(); + const memoryToken = `CLI-MEM-${memoryNonce}`; logCliBackendLiveStep("agent-request:start", { sessionKey, nonce }); const payload = await client.request( "agent", @@ -225,7 +238,11 @@ describeLive("gateway live (cli backend)", () => { message: providerId === "codex-cli" ? `Please include the token CLI-BACKEND-${nonce} in your reply.` - : `Reply with exactly: CLI backend OK ${nonce}.`, + : enableCliModelSwitchProbe + ? `Reply with exactly: CLI backend OK ${nonce}.` + + ` Also remember this session note for later: ${memoryToken}.` + + " Do not include the note in your reply." + : `Reply with exactly: CLI backend OK ${nonce}.`, deliver: false, }, { expectFinal: true }, @@ -250,7 +267,49 @@ describeLive("gateway live (cli backend)", () => { ).toEqual(expect.arrayContaining(bootstrapWorkspace?.expectedInjectedFiles ?? [])); } - if (CLI_RESUME) { + if (modelSwitchTarget) { + const switchNonce = randomBytes(3).toString("hex").toUpperCase(); + logCliBackendLiveStep("agent-switch:start", { + sessionKey, + fromModel: modelKey, + toModel: modelSwitchTarget, + switchNonce, + memoryToken, + }); + const patchPayload = await client.request("sessions.patch", { + key: sessionKey, + model: modelSwitchTarget, + }); + if (!patchPayload || typeof patchPayload !== "object" || !("ok" in patchPayload)) { + throw new Error( + `sessions.patch failed for model switch: ${JSON.stringify(patchPayload)}`, + ); + } + const switchPayload = await client.request( + "agent", + { + sessionKey, + idempotencyKey: `idem-${randomUUID()}`, + message: + "We just switched from Claude Sonnet to Claude Opus in the same session. 
" + + `What session note did I ask you to remember earlier? ` + + `Reply with exactly: CLI backend SWITCH OK ${switchNonce} <note>.`, + deliver: false, + }, + { expectFinal: true }, + ); + if (switchPayload?.status !== "ok") { + throw new Error(`switch status=${String(switchPayload?.status)}`); + } + logCliBackendLiveStep("agent-switch:done", { status: switchPayload?.status }); + const switchText = extractPayloadText(switchPayload?.result); + expect( + matchesCliBackendReply( + switchText, + `CLI backend SWITCH OK ${switchNonce} ${memoryToken}.`, + ), + ).toBe(true); + } else if (CLI_RESUME) { const resumeNonce = randomBytes(3).toString("hex").toUpperCase(); logCliBackendLiveStep("agent-resume:start", { sessionKey, resumeNonce }); const resumePayload = await client.request(