From e973275fd0fd4b51bfbfe9c5e91be3fc736308a0 Mon Sep 17 00:00:00 2001
From: Peter Steinberger <steipete@gmail.com>
Date: Tue, 7 Apr 2026 16:05:39 +0100
Subject: [PATCH] fix: harden claude-cli live switch smoke

---
 docs/help/testing.md                          |  4 +-
 scripts/lib/live-docker-stage.sh              |  9 ++-
 scripts/test-live-cli-backend-docker.sh       |  1 +
 .../gateway-cli-backend.live-helpers.test.ts  | 22 +++++++
 .../gateway-cli-backend.live-helpers.ts       | 23 +++++++
 src/gateway/gateway-cli-backend.live.test.ts  | 65 ++++++++++++++++++-
 6 files changed, 119 insertions(+), 5 deletions(-)
diff --git a/docs/help/testing.md b/docs/help/testing.md
index 6a7b7b26f45..24bc96bb063 100644
--- a/docs/help/testing.md
+++ b/docs/help/testing.md
@@ -272,6 +272,7 @@ openclaw models list --json
   - `OPENCLAW_LIVE_CLI_BACKEND_IMAGE_ARG="--image"` to pass image file paths as CLI args instead of prompt injection.
   - `OPENCLAW_LIVE_CLI_BACKEND_IMAGE_MODE="repeat"` (or `"list"`) to control how image args are passed when `IMAGE_ARG` is set.
   - `OPENCLAW_LIVE_CLI_BACKEND_RESUME_PROBE=1` to send a second turn and validate resume flow.
+  - `OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE=0` to disable the default Claude Sonnet -> Opus same-session continuity probe (set to `1` to force it on when the selected model supports a switch target).
 
 Example:
 
@@ -301,6 +302,7 @@ Notes:
 - It runs the live CLI-backend smoke inside the repo Docker image as the non-root `node` user.
 - It resolves CLI smoke metadata from the owning extension, then installs the matching Linux CLI package (`@anthropic-ai/claude-code`, `@openai/codex`, or `@google/gemini-cli`) into a cached writable prefix at `OPENCLAW_DOCKER_CLI_TOOLS_DIR` (default: `~/.cache/openclaw/docker-cli-tools`).
 - The live CLI-backend smoke now exercises the same end-to-end flow for Claude, Codex, and Gemini: text turn, image classification turn, then MCP `cron` tool call verified through the gateway CLI.
+- Claude's default smoke also patches the session from Sonnet to Opus and verifies the resumed session still remembers an earlier note.
 
 ## Live: ACP bind smoke (`/acp spawn ... --bind here`)
 
@@ -448,7 +450,7 @@ Live tests discover credentials the same way the CLI does. Practical implication
 - Per-agent auth profiles: `~/.openclaw/agents/<agentId>/agent/auth-profiles.json` (this is what “profile keys” means in the live tests)
 - Config: `~/.openclaw/openclaw.json` (or `OPENCLAW_CONFIG_PATH`)
 - Legacy state dir: `~/.openclaw/credentials/` (copied into the staged live home when present, but not the main profile-key store)
-- Live local runs copy the active config, per-agent `auth-profiles.json` files, legacy `credentials/`, and supported external CLI auth dirs into a temp test home by default; `agents.*.workspace` / `agentDir` path overrides are stripped in that staged config so probes stay off your real host workspace.
+- Live local runs copy the active config, per-agent `auth-profiles.json` files, legacy `credentials/`, and supported external CLI auth dirs into a temp test home by default; staged live homes skip `workspace/` and `sandboxes/`, and `agents.*.workspace` / `agentDir` path overrides are stripped so probes stay off your real host workspace.
 
 If you want to rely on env keys (e.g. exported in your `~/.profile`), run local tests after `source ~/.profile`, or use the Docker runners below (they can mount `~/.profile` into the container).
 
diff --git a/scripts/lib/live-docker-stage.sh b/scripts/lib/live-docker-stage.sh
index 61dc3d993e8..c7778a77209 100644
--- a/scripts/lib/live-docker-stage.sh
+++ b/scripts/lib/live-docker-stage.sh
@@ -40,7 +40,14 @@ openclaw_live_stage_state_dir() {
 
   mkdir -p "$dest_dir"
   if [ -d "$source_dir" ]; then
-    tar -C "$source_dir" --exclude=workspace -cf - . | tar -C "$dest_dir" -xf -
+    # Sandbox workspaces can accumulate root-owned artifacts from prior Docker
+    # runs. They are not needed for live-test auth/config staging and can make
+    # temp-dir cleanup fail on exit, so keep them out of the staged state copy.
+    tar -C "$source_dir" \
+      --exclude=workspace \
+      --exclude=sandboxes \
+      -cf - . | tar -C "$dest_dir" -xf -
+    chmod -R u+rwX "$dest_dir" || true
     if [ -d "$source_dir/workspace" ] && [ ! -e "$dest_dir/workspace" ]; then
       ln -s "$source_dir/workspace" "$dest_dir/workspace"
     fi
diff --git a/scripts/test-live-cli-backend-docker.sh b/scripts/test-live-cli-backend-docker.sh
index 254c11a8d60..aeeb28718be 100644
--- a/scripts/test-live-cli-backend-docker.sh
+++ b/scripts/test-live-cli-backend-docker.sh
@@ -210,6 +210,7 @@ docker run --rm -t \
   -e OPENCLAW_LIVE_CLI_BACKEND_PRESERVE_ENV="${OPENCLAW_LIVE_CLI_BACKEND_PRESERVE_ENV:-}" \
   -e OPENCLAW_LIVE_CLI_BACKEND_DISABLE_MCP_CONFIG="$CLI_DISABLE_MCP_CONFIG" \
   -e OPENCLAW_LIVE_CLI_BACKEND_RESUME_PROBE="${OPENCLAW_LIVE_CLI_BACKEND_RESUME_PROBE:-}" \
+  -e OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE="${OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE:-}" \
   -e OPENCLAW_LIVE_CLI_BACKEND_IMAGE_PROBE="${OPENCLAW_LIVE_CLI_BACKEND_IMAGE_PROBE:-}" \
   -e OPENCLAW_LIVE_CLI_BACKEND_IMAGE_ARG="${OPENCLAW_LIVE_CLI_BACKEND_IMAGE_ARG:-}" \
   -e OPENCLAW_LIVE_CLI_BACKEND_IMAGE_MODE="${OPENCLAW_LIVE_CLI_BACKEND_IMAGE_MODE:-}" \
diff --git a/src/gateway/gateway-cli-backend.live-helpers.test.ts b/src/gateway/gateway-cli-backend.live-helpers.test.ts
index 812bf5cf6c5..3d43b8f8af9 100644
--- a/src/gateway/gateway-cli-backend.live-helpers.test.ts
+++ b/src/gateway/gateway-cli-backend.live-helpers.test.ts
@@ -100,4 +100,26 @@ describe("gateway cli backend live helpers", () => {
     });
     expect(gatewayClientState.lastOptions).not.toHaveProperty("requestTimeoutMs");
   });
+
+  it("defaults the model switch probe to Claude Sonnet -> Opus", async () => {
+    const { resolveCliModelSwitchProbeTarget, shouldRunCliModelSwitchProbe } =
+      await import("./gateway-cli-backend.live-helpers.js");
+    // Clear any explicit override so the provider/model default is what gets asserted.
+    delete process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE;
+    // Only claude-cli Sonnet 4.6 has a switch target; Opus and codex-cli default to off.
+    expect(resolveCliModelSwitchProbeTarget("claude-cli", "claude-cli/claude-sonnet-4-6")).toBe(
+      "claude-cli/claude-opus-4-6",
+    );
+    expect(shouldRunCliModelSwitchProbe("claude-cli", "claude-cli/claude-sonnet-4-6")).toBe(true);
+    expect(shouldRunCliModelSwitchProbe("claude-cli", "claude-cli/claude-opus-4-6")).toBe(false);
+    expect(shouldRunCliModelSwitchProbe("codex-cli", "codex-cli/gpt-5.4")).toBe(false);
+  });
+
+  it("lets env disable the model switch probe", async () => {
+    const { shouldRunCliModelSwitchProbe } = await import("./gateway-cli-backend.live-helpers.js");
+    // "0" is an explicit opt-out read from process.env by the helper.
+    process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE = "0";
+    // Same provider/model pair that is enabled by default when the env var is unset.
+    expect(shouldRunCliModelSwitchProbe("claude-cli", "claude-cli/claude-sonnet-4-6")).toBe(false);
+  });
 });
diff --git a/src/gateway/gateway-cli-backend.live-helpers.ts b/src/gateway/gateway-cli-backend.live-helpers.ts
index c25b0d903f8..016c4c1df16 100644
--- a/src/gateway/gateway-cli-backend.live-helpers.ts
+++ b/src/gateway/gateway-cli-backend.live-helpers.ts
@@ -97,6 +97,29 @@ export function shouldRunCliMcpProbe(providerId: string): boolean {
   return resolveCliBackendLiveTest(providerId)?.defaultMcpProbe === true;
 }
 
+/**
+ * Resolves the model ref the live switch probe patches the session to, or
+ * `undefined` when the provider/model pair has no switch target.
+ */
+export function resolveCliModelSwitchProbeTarget(
+  providerId: string,
+  modelRef: string,
+): string | undefined {
+  const provider = providerId.trim().toLowerCase();
+  const model = modelRef.trim().toLowerCase();
+  // Matching ignores case and surrounding whitespace; only Sonnet 4.6 maps to a target.
+  const isSonnet = provider === "claude-cli" && model === "claude-cli/claude-sonnet-4-6";
+  return isSonnet ? "claude-cli/claude-opus-4-6" : undefined;
+}
+
+/** Runs only when a switch target exists; the env var may then disable the default. */
+export function shouldRunCliModelSwitchProbe(providerId: string, modelRef: string): boolean {
+  // No target means nothing to patch to, so a truthy env value must not enable the probe.
+  const hasTarget = typeof resolveCliModelSwitchProbeTarget(providerId, modelRef) === "string";
+  const raw = process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE?.trim();
+  return hasTarget && (raw ? isTruthyEnvValue(raw) : true);
+}
+
 export function matchesCliBackendReply(text: string, expected: string): boolean {
   const normalized = text.trim();
   const target = expected.trim();
diff --git a/src/gateway/gateway-cli-backend.live.test.ts b/src/gateway/gateway-cli-backend.live.test.ts
index cce304282f5..bce1ec7d336 100644
--- a/src/gateway/gateway-cli-backend.live.test.ts
+++ b/src/gateway/gateway-cli-backend.live.test.ts
@@ -16,8 +16,10 @@ import {
   matchesCliBackendReply,
   parseImageMode,
   parseJsonStringArray,
+  resolveCliModelSwitchProbeTarget,
   restoreCliBackendLiveEnv,
   shouldRunCliImageProbe,
+  shouldRunCliModelSwitchProbe,
   shouldRunCliMcpProbe,
   snapshotCliBackendLiveEnv,
   type SystemPromptReport,
@@ -81,11 +83,17 @@ describeLive("gateway live (cli backend)", () => {
       const backendResolved = resolveCliBackendConfig(providerId);
       const enableCliImageProbe = shouldRunCliImageProbe(providerId);
       const enableCliMcpProbe = shouldRunCliMcpProbe(providerId);
+      const enableCliModelSwitchProbe = shouldRunCliModelSwitchProbe(providerId, modelKey);
+      const modelSwitchTarget = enableCliModelSwitchProbe
+        ? resolveCliModelSwitchProbeTarget(providerId, modelKey)
+        : undefined;
       logCliBackendLiveStep("model-selected", {
         providerId,
         modelKey,
         enableCliImageProbe,
         enableCliMcpProbe,
+        enableCliModelSwitchProbe,
+        modelSwitchTarget,
       });
       const providerDefaults = backendResolved?.config;
 
@@ -173,7 +181,10 @@ describeLive("gateway live (cli backend)", () => {
             ...cfg.agents?.defaults,
             ...(bootstrapWorkspace ? { workspace: bootstrapWorkspace.workspaceRootDir } : {}),
             model: { primary: modelKey },
-            models: { [modelKey]: {} },
+            models: {
+              [modelKey]: {},
+              ...(modelSwitchTarget ? { [modelSwitchTarget]: {} } : {}),
+            },
             cliBackends: {
               ...existingBackends,
               [providerId]: {
@@ -216,6 +227,8 @@ describeLive("gateway live (cli backend)", () => {
       try {
         const sessionKey = "agent:dev:live-cli-backend";
         const nonce = randomBytes(3).toString("hex").toUpperCase();
+        const memoryNonce = randomBytes(3).toString("hex").toUpperCase();
+        const memoryToken = `CLI-MEM-${memoryNonce}`;
         logCliBackendLiveStep("agent-request:start", { sessionKey, nonce });
         const payload = await client.request(
           "agent",
@@ -225,7 +238,11 @@ describeLive("gateway live (cli backend)", () => {
             message:
               providerId === "codex-cli"
                 ? `Please include the token CLI-BACKEND-${nonce} in your reply.`
-                : `Reply with exactly: CLI backend OK ${nonce}.`,
+                : enableCliModelSwitchProbe
+                  ? `Reply with exactly: CLI backend OK ${nonce}.` +
+                    ` Also remember this session note for later: ${memoryToken}.` +
+                    " Do not include the note in your reply."
+                  : `Reply with exactly: CLI backend OK ${nonce}.`,
             deliver: false,
           },
           { expectFinal: true },
@@ -250,7 +267,49 @@ describeLive("gateway live (cli backend)", () => {
           ).toEqual(expect.arrayContaining(bootstrapWorkspace?.expectedInjectedFiles ?? []));
         }
 
-        if (CLI_RESUME) {
+        if (modelSwitchTarget) {
+          const switchNonce = randomBytes(3).toString("hex").toUpperCase();
+          logCliBackendLiveStep("agent-switch:start", {
+            sessionKey,
+            fromModel: modelKey,
+            toModel: modelSwitchTarget,
+            switchNonce,
+            memoryToken,
+          });
+          const patchPayload = await client.request("sessions.patch", {
+            key: sessionKey,
+            model: modelSwitchTarget,
+          });
+          if (!patchPayload || typeof patchPayload !== "object" || !("ok" in patchPayload)) {
+            throw new Error(
+              `sessions.patch failed for model switch: ${JSON.stringify(patchPayload)}`,
+            );
+          }
+          const switchPayload = await client.request(
+            "agent",
+            {
+              sessionKey,
+              idempotencyKey: `idem-${randomUUID()}`,
+              message:
+                "We just switched from Claude Sonnet to Claude Opus in the same session. " +
+                `What session note did I ask you to remember earlier? ` +
+                `Reply with exactly: CLI backend SWITCH OK ${switchNonce} <remembered-note>.`,
+              deliver: false,
+            },
+            { expectFinal: true },
+          );
+          if (switchPayload?.status !== "ok") {
+            throw new Error(`switch status=${String(switchPayload?.status)}`);
+          }
+          logCliBackendLiveStep("agent-switch:done", { status: switchPayload?.status });
+          const switchText = extractPayloadText(switchPayload?.result);
+          expect(
+            matchesCliBackendReply(
+              switchText,
+              `CLI backend SWITCH OK ${switchNonce} ${memoryToken}.`,
+            ),
+          ).toBe(true);
+        } else if (CLI_RESUME) {
           const resumeNonce = randomBytes(3).toString("hex").toUpperCase();
           logCliBackendLiveStep("agent-resume:start", { sessionKey, resumeNonce });
           const resumePayload = await client.request(