fix: harden claude-cli live switch smoke

2026-04-12 01:31:08 +00:00 · 2026-04-07 16:05:39 +01:00
parent 9c56c84ce0
commit e973275fd0
6 changed files with 119 additions and 5 deletions
--- a/docs/help/testing.md
+++ b/docs/help/testing.md
@@ -272,6 +272,7 @@ openclaw models list --json
  - `OPENCLAW_LIVE_CLI_BACKEND_IMAGE_ARG="--image"` to pass image file paths as CLI args instead of prompt injection.
  - `OPENCLAW_LIVE_CLI_BACKEND_IMAGE_MODE="repeat"` (or `"list"`) to control how image args are passed when `IMAGE_ARG` is set.
  - `OPENCLAW_LIVE_CLI_BACKEND_RESUME_PROBE=1` to send a second turn and validate resume flow.
+  - `OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE=0` to disable the Claude Sonnet -> Opus same-session continuity probe, which runs by default when the selected model has a switch target (set to `1` to request it explicitly; the switch turn still only runs when the selected model supports a switch target).

 Example:

@@ -301,6 +302,7 @@ Notes:
 - It runs the live CLI-backend smoke inside the repo Docker image as the non-root `node` user.
 - It resolves CLI smoke metadata from the owning extension, then installs the matching Linux CLI package (`@anthropic-ai/claude-code`, `@openai/codex`, or `@google/gemini-cli`) into a cached writable prefix at `OPENCLAW_DOCKER_CLI_TOOLS_DIR` (default: `~/.cache/openclaw/docker-cli-tools`).
 - The live CLI-backend smoke now exercises the same end-to-end flow for Claude, Codex, and Gemini: text turn, image classification turn, then MCP `cron` tool call verified through the gateway CLI.
+- Claude's default smoke also patches the session from Sonnet to Opus and verifies the resumed session still remembers an earlier note.

 ## Live: ACP bind smoke (`/acp spawn ... --bind here`)

@@ -448,7 +450,7 @@ Live tests discover credentials the same way the CLI does. Practical implication
 - Per-agent auth profiles: `~/.openclaw/agents/<agentId>/agent/auth-profiles.json` (this is what “profile keys” means in the live tests)
 - Config: `~/.openclaw/openclaw.json` (or `OPENCLAW_CONFIG_PATH`)
 - Legacy state dir: `~/.openclaw/credentials/` (copied into the staged live home when present, but not the main profile-key store)
- Live local runs copy the active config, per-agent `auth-profiles.json` files, legacy `credentials/`, and supported external CLI auth dirs into a temp test home by default; `agents.*.workspace` / `agentDir` path overrides are stripped in that staged config so probes stay off your real host workspace.
+- Live local runs copy the active config, per-agent `auth-profiles.json` files, legacy `credentials/`, and supported external CLI auth dirs into a temp test home by default; staged live homes skip `workspace/` and `sandboxes/`, and `agents.*.workspace` / `agentDir` path overrides are stripped so probes stay off your real host workspace.

 If you want to rely on env keys (e.g. exported in your `~/.profile`), run local tests after `source ~/.profile`, or use the Docker runners below (they can mount `~/.profile` into the container).

--- a/scripts/lib/live-docker-stage.sh
+++ b/scripts/lib/live-docker-stage.sh
@@ -40,7 +40,14 @@ openclaw_live_stage_state_dir() {

  mkdir -p "$dest_dir"
  if [ -d "$source_dir" ]; then
-    tar -C "$source_dir" --exclude=workspace -cf - . | tar -C "$dest_dir" -xf -
+    # Sandbox workspaces can accumulate root-owned artifacts from prior Docker
+    # runs. They are not needed for live-test auth/config staging and can make
+    # temp-dir cleanup fail on exit, so keep them out of the staged state copy.
+    tar -C "$source_dir" \
+      --exclude=workspace \
+      --exclude=sandboxes \
+      -cf - . | tar -C "$dest_dir" -xf -
+    chmod -R u+rwX "$dest_dir" || true
    if [ -d "$source_dir/workspace" ] && [ ! -e "$dest_dir/workspace" ]; then
      ln -s "$source_dir/workspace" "$dest_dir/workspace"
    fi
--- a/scripts/test-live-cli-backend-docker.sh
+++ b/scripts/test-live-cli-backend-docker.sh
@@ -210,6 +210,7 @@ docker run --rm -t \
  -e OPENCLAW_LIVE_CLI_BACKEND_PRESERVE_ENV="${OPENCLAW_LIVE_CLI_BACKEND_PRESERVE_ENV:-}" \
  -e OPENCLAW_LIVE_CLI_BACKEND_DISABLE_MCP_CONFIG="$CLI_DISABLE_MCP_CONFIG" \
  -e OPENCLAW_LIVE_CLI_BACKEND_RESUME_PROBE="${OPENCLAW_LIVE_CLI_BACKEND_RESUME_PROBE:-}" \
+  -e OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE="${OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE:-}" \
  -e OPENCLAW_LIVE_CLI_BACKEND_IMAGE_PROBE="${OPENCLAW_LIVE_CLI_BACKEND_IMAGE_PROBE:-}" \
  -e OPENCLAW_LIVE_CLI_BACKEND_IMAGE_ARG="${OPENCLAW_LIVE_CLI_BACKEND_IMAGE_ARG:-}" \
  -e OPENCLAW_LIVE_CLI_BACKEND_IMAGE_MODE="${OPENCLAW_LIVE_CLI_BACKEND_IMAGE_MODE:-}" \
--- a/src/gateway/gateway-cli-backend.live-helpers.test.ts
+++ b/src/gateway/gateway-cli-backend.live-helpers.test.ts
@@ -100,4 +100,26 @@ describe("gateway cli backend live helpers", () => {
    });
    expect(gatewayClientState.lastOptions).not.toHaveProperty("requestTimeoutMs");
  });
+
+  it("defaults the model switch probe to Claude Sonnet -> Opus", async () => {
+    const { resolveCliModelSwitchProbeTarget, shouldRunCliModelSwitchProbe } =
+      await import("./gateway-cli-backend.live-helpers.js");
+
+    // Clear any ambient override so the assertions exercise the built-in
+    // default, then put it back so this test does not leak env changes into
+    // other tests running in the same process.
+    const previousProbeEnv = process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE;
+    delete process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE;
+    try {
+      expect(resolveCliModelSwitchProbeTarget("claude-cli", "claude-cli/claude-sonnet-4-6")).toBe(
+        "claude-cli/claude-opus-4-6",
+      );
+      expect(shouldRunCliModelSwitchProbe("claude-cli", "claude-cli/claude-sonnet-4-6")).toBe(true);
+      expect(shouldRunCliModelSwitchProbe("claude-cli", "claude-cli/claude-opus-4-6")).toBe(false);
+      expect(shouldRunCliModelSwitchProbe("codex-cli", "codex-cli/gpt-5.4")).toBe(false);
+    } finally {
+      if (previousProbeEnv === undefined) {
+        delete process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE;
+      } else {
+        process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE = previousProbeEnv;
+      }
+    }
+  });
+
+  it("lets env disable the model switch probe", async () => {
+    const { shouldRunCliModelSwitchProbe } = await import("./gateway-cli-backend.live-helpers.js");
+
+    // Scope the override to this test: leaving "0" behind would silently
+    // disable the probe for anything else that runs later in this process.
+    const previousProbeEnv = process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE;
+    process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE = "0";
+    try {
+      expect(shouldRunCliModelSwitchProbe("claude-cli", "claude-cli/claude-sonnet-4-6")).toBe(
+        false,
+      );
+    } finally {
+      if (previousProbeEnv === undefined) {
+        delete process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE;
+      } else {
+        process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE = previousProbeEnv;
+      }
+    }
+  });
 });
--- a/src/gateway/gateway-cli-backend.live-helpers.ts
+++ b/src/gateway/gateway-cli-backend.live-helpers.ts
@@ -97,6 +97,29 @@ export function shouldRunCliMcpProbe(providerId: string): boolean {
  return resolveCliBackendLiveTest(providerId)?.defaultMcpProbe === true;
 }

+/**
+ * Resolves the model ref that the model switch probe should switch to.
+ *
+ * Both arguments are trimmed and lower-cased before comparison. The only
+ * supported pairing is provider `claude-cli` with model
+ * `claude-cli/claude-sonnet-4-6`, which maps to `claude-cli/claude-opus-4-6`.
+ *
+ * @param providerId - CLI backend provider id (e.g. `claude-cli`).
+ * @param modelRef - Currently selected model ref in `provider/model` form.
+ * @returns The switch target model ref, or `undefined` when the
+ *   provider/model pair has no switch target.
+ */
+export function resolveCliModelSwitchProbeTarget(
+  providerId: string,
+  modelRef: string,
+): string | undefined {
+  const normalizedProvider = providerId.trim().toLowerCase();
+  const normalizedModelRef = modelRef.trim().toLowerCase();
+  if (normalizedProvider !== "claude-cli") {
+    return undefined;
+  }
+  if (normalizedModelRef !== "claude-cli/claude-sonnet-4-6") {
+    return undefined;
+  }
+  return "claude-cli/claude-opus-4-6";
+}
+
+/**
+ * Decides whether the live smoke should run the same-session model switch
+ * probe for the given provider/model.
+ *
+ * The probe needs a switch target, so when
+ * `resolveCliModelSwitchProbeTarget` yields none this returns `false` even if
+ * the env override requests the probe; otherwise callers would see the probe
+ * reported as enabled while having nothing to switch to. When a target
+ * exists, a non-empty `OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE` decides
+ * (parsed by `isTruthyEnvValue`), and an unset/blank value leaves the probe
+ * on by default.
+ *
+ * @param providerId - CLI backend provider id.
+ * @param modelRef - Currently selected model ref in `provider/model` form.
+ * @returns `true` when the model switch probe should run.
+ */
+export function shouldRunCliModelSwitchProbe(providerId: string, modelRef: string): boolean {
+  if (resolveCliModelSwitchProbeTarget(providerId, modelRef) === undefined) {
+    return false;
+  }
+  const raw = process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE?.trim();
+  if (raw) {
+    return isTruthyEnvValue(raw);
+  }
+  return true;
+}
+
 export function matchesCliBackendReply(text: string, expected: string): boolean {
  const normalized = text.trim();
  const target = expected.trim();
--- a/src/gateway/gateway-cli-backend.live.test.ts
+++ b/src/gateway/gateway-cli-backend.live.test.ts
@@ -16,8 +16,10 @@ import {
  matchesCliBackendReply,
  parseImageMode,
  parseJsonStringArray,
+  resolveCliModelSwitchProbeTarget,
  restoreCliBackendLiveEnv,
  shouldRunCliImageProbe,
+  shouldRunCliModelSwitchProbe,
  shouldRunCliMcpProbe,
  snapshotCliBackendLiveEnv,
  type SystemPromptReport,
@@ -81,11 +83,17 @@ describeLive("gateway live (cli backend)", () => {
      const backendResolved = resolveCliBackendConfig(providerId);
      const enableCliImageProbe = shouldRunCliImageProbe(providerId);
      const enableCliMcpProbe = shouldRunCliMcpProbe(providerId);
+      const enableCliModelSwitchProbe = shouldRunCliModelSwitchProbe(providerId, modelKey);
+      const modelSwitchTarget = enableCliModelSwitchProbe
+        ? resolveCliModelSwitchProbeTarget(providerId, modelKey)
+        : undefined;
      logCliBackendLiveStep("model-selected", {
        providerId,
        modelKey,
        enableCliImageProbe,
        enableCliMcpProbe,
+        enableCliModelSwitchProbe,
+        modelSwitchTarget,
      });
      const providerDefaults = backendResolved?.config;

@@ -173,7 +181,10 @@ describeLive("gateway live (cli backend)", () => {
            ...cfg.agents?.defaults,
            ...(bootstrapWorkspace ? { workspace: bootstrapWorkspace.workspaceRootDir } : {}),
            model: { primary: modelKey },
-            models: { [modelKey]: {} },
+            models: {
+              [modelKey]: {},
+              ...(modelSwitchTarget ? { [modelSwitchTarget]: {} } : {}),
+            },
            cliBackends: {
              ...existingBackends,
              [providerId]: {
@@ -216,6 +227,8 @@ describeLive("gateway live (cli backend)", () => {
      try {
        const sessionKey = "agent:dev:live-cli-backend";
        const nonce = randomBytes(3).toString("hex").toUpperCase();
+        const memoryNonce = randomBytes(3).toString("hex").toUpperCase();
+        const memoryToken = `CLI-MEM-${memoryNonce}`;
        logCliBackendLiveStep("agent-request:start", { sessionKey, nonce });
        const payload = await client.request(
          "agent",
@@ -225,7 +238,11 @@ describeLive("gateway live (cli backend)", () => {
            message:
              providerId === "codex-cli"
                ? `Please include the token CLI-BACKEND-${nonce} in your reply.`
-                : `Reply with exactly: CLI backend OK ${nonce}.`,
+                : enableCliModelSwitchProbe
+                  ? `Reply with exactly: CLI backend OK ${nonce}.` +
+                    ` Also remember this session note for later: ${memoryToken}.` +
+                    " Do not include the note in your reply."
+                  : `Reply with exactly: CLI backend OK ${nonce}.`,
            deliver: false,
          },
          { expectFinal: true },
@@ -250,7 +267,49 @@ describeLive("gateway live (cli backend)", () => {
          ).toEqual(expect.arrayContaining(bootstrapWorkspace?.expectedInjectedFiles ?? []));
        }

-        if (CLI_RESUME) {
+        if (modelSwitchTarget) {
+          const switchNonce = randomBytes(3).toString("hex").toUpperCase();
+          logCliBackendLiveStep("agent-switch:start", {
+            sessionKey,
+            fromModel: modelKey,
+            toModel: modelSwitchTarget,
+            switchNonce,
+            memoryToken,
+          });
+          const patchPayload = await client.request("sessions.patch", {
+            key: sessionKey,
+            model: modelSwitchTarget,
+          });
+          if (!patchPayload || typeof patchPayload !== "object" || !("ok" in patchPayload)) {
+            throw new Error(
+              `sessions.patch failed for model switch: ${JSON.stringify(patchPayload)}`,
+            );
+          }
+          const switchPayload = await client.request(
+            "agent",
+            {
+              sessionKey,
+              idempotencyKey: `idem-${randomUUID()}`,
+              message:
+                "We just switched from Claude Sonnet to Claude Opus in the same session. " +
+                `What session note did I ask you to remember earlier? ` +
+                `Reply with exactly: CLI backend SWITCH OK ${switchNonce} <remembered-note>.`,
+              deliver: false,
+            },
+            { expectFinal: true },
+          );
+          if (switchPayload?.status !== "ok") {
+            throw new Error(`switch status=${String(switchPayload?.status)}`);
+          }
+          logCliBackendLiveStep("agent-switch:done", { status: switchPayload?.status });
+          const switchText = extractPayloadText(switchPayload?.result);
+          expect(
+            matchesCliBackendReply(
+              switchText,
+              `CLI backend SWITCH OK ${switchNonce} ${memoryToken}.`,
+            ),
+          ).toBe(true);
+        } else if (CLI_RESUME) {
          const resumeNonce = randomBytes(3).toString("hex").toUpperCase();
          logCliBackendLiveStep("agent-resume:start", { sessionKey, resumeNonce });
          const resumePayload = await client.request(