fix: harden claude-cli live switch smoke

This commit is contained in:
Peter Steinberger
2026-04-07 16:05:39 +01:00
parent 9c56c84ce0
commit e973275fd0
6 changed files with 119 additions and 5 deletions

View File

@@ -272,6 +272,7 @@ openclaw models list --json
- `OPENCLAW_LIVE_CLI_BACKEND_IMAGE_ARG="--image"` to pass image file paths as CLI args instead of prompt injection.
- `OPENCLAW_LIVE_CLI_BACKEND_IMAGE_MODE="repeat"` (or `"list"`) to control how image args are passed when `IMAGE_ARG` is set.
- `OPENCLAW_LIVE_CLI_BACKEND_RESUME_PROBE=1` to send a second turn and validate resume flow.
- `OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE=0` to disable the default Claude Sonnet -> Opus same-session continuity probe (set to `1` to force it on when the selected model supports a switch target).
Example:
@@ -301,6 +302,7 @@ Notes:
- It runs the live CLI-backend smoke inside the repo Docker image as the non-root `node` user.
- It resolves CLI smoke metadata from the owning extension, then installs the matching Linux CLI package (`@anthropic-ai/claude-code`, `@openai/codex`, or `@google/gemini-cli`) into a cached writable prefix at `OPENCLAW_DOCKER_CLI_TOOLS_DIR` (default: `~/.cache/openclaw/docker-cli-tools`).
- The live CLI-backend smoke now exercises the same end-to-end flow for Claude, Codex, and Gemini: text turn, image classification turn, then MCP `cron` tool call verified through the gateway CLI.
- Claude's default smoke also patches the session from Sonnet to Opus and verifies the resumed session still remembers an earlier note.
## Live: ACP bind smoke (`/acp spawn ... --bind here`)
@@ -448,7 +450,7 @@ Live tests discover credentials the same way the CLI does. Practical implication
- Per-agent auth profiles: `~/.openclaw/agents/<agentId>/agent/auth-profiles.json` (this is what “profile keys” means in the live tests)
- Config: `~/.openclaw/openclaw.json` (or `OPENCLAW_CONFIG_PATH`)
- Legacy state dir: `~/.openclaw/credentials/` (copied into the staged live home when present, but not the main profile-key store)
- Live local runs copy the active config, per-agent `auth-profiles.json` files, legacy `credentials/`, and supported external CLI auth dirs into a temp test home by default; `agents.*.workspace` / `agentDir` path overrides are stripped in that staged config so probes stay off your real host workspace.
- Live local runs copy the active config, per-agent `auth-profiles.json` files, legacy `credentials/`, and supported external CLI auth dirs into a temp test home by default; staged live homes skip `workspace/` and `sandboxes/`, and `agents.*.workspace` / `agentDir` path overrides are stripped so probes stay off your real host workspace.
If you want to rely on env keys (e.g. exported in your `~/.profile`), run local tests after `source ~/.profile`, or use the Docker runners below (they can mount `~/.profile` into the container).

View File

@@ -40,7 +40,14 @@ openclaw_live_stage_state_dir() {
mkdir -p "$dest_dir"
if [ -d "$source_dir" ]; then
tar -C "$source_dir" --exclude=workspace -cf - . | tar -C "$dest_dir" -xf -
# Sandbox workspaces can accumulate root-owned artifacts from prior Docker
# runs. They are not needed for live-test auth/config staging and can make
# temp-dir cleanup fail on exit, so keep them out of the staged state copy.
tar -C "$source_dir" \
--exclude=workspace \
--exclude=sandboxes \
-cf - . | tar -C "$dest_dir" -xf -
chmod -R u+rwX "$dest_dir" || true
if [ -d "$source_dir/workspace" ] && [ ! -e "$dest_dir/workspace" ]; then
ln -s "$source_dir/workspace" "$dest_dir/workspace"
fi

View File

@@ -210,6 +210,7 @@ docker run --rm -t \
-e OPENCLAW_LIVE_CLI_BACKEND_PRESERVE_ENV="${OPENCLAW_LIVE_CLI_BACKEND_PRESERVE_ENV:-}" \
-e OPENCLAW_LIVE_CLI_BACKEND_DISABLE_MCP_CONFIG="$CLI_DISABLE_MCP_CONFIG" \
-e OPENCLAW_LIVE_CLI_BACKEND_RESUME_PROBE="${OPENCLAW_LIVE_CLI_BACKEND_RESUME_PROBE:-}" \
-e OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE="${OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE:-}" \
-e OPENCLAW_LIVE_CLI_BACKEND_IMAGE_PROBE="${OPENCLAW_LIVE_CLI_BACKEND_IMAGE_PROBE:-}" \
-e OPENCLAW_LIVE_CLI_BACKEND_IMAGE_ARG="${OPENCLAW_LIVE_CLI_BACKEND_IMAGE_ARG:-}" \
-e OPENCLAW_LIVE_CLI_BACKEND_IMAGE_MODE="${OPENCLAW_LIVE_CLI_BACKEND_IMAGE_MODE:-}" \

View File

@@ -100,4 +100,26 @@ describe("gateway cli backend live helpers", () => {
});
expect(gatewayClientState.lastOptions).not.toHaveProperty("requestTimeoutMs");
});
it("defaults the model switch probe to Claude Sonnet -> Opus", async () => {
  const { resolveCliModelSwitchProbeTarget, shouldRunCliModelSwitchProbe } =
    await import("./gateway-cli-backend.live-helpers.js");
  // Snapshot the override before clearing it so this test cannot leak an
  // altered environment into whichever test runs next.
  const previousOverride = process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE;
  delete process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE;
  try {
    // Sonnet on claude-cli is the only pair with a switch target.
    expect(resolveCliModelSwitchProbeTarget("claude-cli", "claude-cli/claude-sonnet-4-6")).toBe(
      "claude-cli/claude-opus-4-6",
    );
    expect(shouldRunCliModelSwitchProbe("claude-cli", "claude-cli/claude-sonnet-4-6")).toBe(true);
    // Already on Opus, or on another backend: nothing to switch to.
    expect(shouldRunCliModelSwitchProbe("claude-cli", "claude-cli/claude-opus-4-6")).toBe(false);
    expect(shouldRunCliModelSwitchProbe("codex-cli", "codex-cli/gpt-5.4")).toBe(false);
  } finally {
    // Assigning `undefined` to process.env would store the string "undefined",
    // so an originally-unset variable has to be removed with `delete`.
    if (previousOverride === undefined) {
      delete process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE;
    } else {
      process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE = previousOverride;
    }
  }
});
it("lets env disable the model switch probe", async () => {
  const { shouldRunCliModelSwitchProbe } = await import("./gateway-cli-backend.live-helpers.js");
  // Snapshot the override: leaving "0" behind would silently disable the
  // default probe for any test that runs after this one in the same process.
  const previousOverride = process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE;
  process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE = "0";
  try {
    expect(shouldRunCliModelSwitchProbe("claude-cli", "claude-cli/claude-sonnet-4-6")).toBe(false);
  } finally {
    // Assigning `undefined` to process.env would store the string "undefined",
    // so an originally-unset variable has to be removed with `delete`.
    if (previousOverride === undefined) {
      delete process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE;
    } else {
      process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE = previousOverride;
    }
  }
});
});

View File

@@ -97,6 +97,29 @@ export function shouldRunCliMcpProbe(providerId: string): boolean {
return resolveCliBackendLiveTest(providerId)?.defaultMcpProbe === true;
}
/**
 * Picks the model the live smoke should switch to for its same-session
 * continuity probe.
 *
 * Both arguments are compared after trimming and lower-casing. Only the
 * `claude-cli` provider running `claude-cli/claude-sonnet-4-6` has a target.
 *
 * @param providerId - CLI backend provider id, e.g. `claude-cli`.
 * @param modelRef - Fully qualified model reference, e.g. `claude-cli/claude-sonnet-4-6`.
 * @returns `claude-cli/claude-opus-4-6` for the supported pair, otherwise `undefined`.
 */
export function resolveCliModelSwitchProbeTarget(
  providerId: string,
  modelRef: string,
): string | undefined {
  const provider = providerId.trim().toLowerCase();
  const model = modelRef.trim().toLowerCase();
  const isClaudeSonnet = provider === "claude-cli" && model === "claude-cli/claude-sonnet-4-6";
  return isClaudeSonnet ? "claude-cli/claude-opus-4-6" : undefined;
}
/**
 * Decides whether the live smoke should run the model switch continuity probe.
 *
 * The probe needs a switch target, so this returns `false` whenever
 * `resolveCliModelSwitchProbeTarget` yields none, even if the env override
 * asks for the probe. Otherwise callers would prime the first turn with a
 * "remember this note" prompt for a switch that never runs.
 *
 * When a target exists, a non-empty `OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE`
 * value decides the outcome via `isTruthyEnvValue`; an unset or blank value
 * leaves the probe enabled.
 *
 * @param providerId - CLI backend provider id, e.g. `claude-cli`.
 * @param modelRef - Fully qualified model reference selected for the run.
 * @returns `true` when the probe should run for this provider/model pair.
 */
export function shouldRunCliModelSwitchProbe(providerId: string, modelRef: string): boolean {
  const hasSwitchTarget = resolveCliModelSwitchProbeTarget(providerId, modelRef) !== undefined;
  if (!hasSwitchTarget) {
    // An env override can only opt in or out of a probe that is possible.
    return false;
  }
  const raw = process.env.OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE?.trim();
  // A blank or unset override falls through to the default: enabled.
  return raw ? isTruthyEnvValue(raw) : true;
}
export function matchesCliBackendReply(text: string, expected: string): boolean {
const normalized = text.trim();
const target = expected.trim();

View File

@@ -16,8 +16,10 @@ import {
matchesCliBackendReply,
parseImageMode,
parseJsonStringArray,
resolveCliModelSwitchProbeTarget,
restoreCliBackendLiveEnv,
shouldRunCliImageProbe,
shouldRunCliModelSwitchProbe,
shouldRunCliMcpProbe,
snapshotCliBackendLiveEnv,
type SystemPromptReport,
@@ -81,11 +83,17 @@ describeLive("gateway live (cli backend)", () => {
const backendResolved = resolveCliBackendConfig(providerId);
const enableCliImageProbe = shouldRunCliImageProbe(providerId);
const enableCliMcpProbe = shouldRunCliMcpProbe(providerId);
const enableCliModelSwitchProbe = shouldRunCliModelSwitchProbe(providerId, modelKey);
const modelSwitchTarget = enableCliModelSwitchProbe
? resolveCliModelSwitchProbeTarget(providerId, modelKey)
: undefined;
logCliBackendLiveStep("model-selected", {
providerId,
modelKey,
enableCliImageProbe,
enableCliMcpProbe,
enableCliModelSwitchProbe,
modelSwitchTarget,
});
const providerDefaults = backendResolved?.config;
@@ -173,7 +181,10 @@ describeLive("gateway live (cli backend)", () => {
...cfg.agents?.defaults,
...(bootstrapWorkspace ? { workspace: bootstrapWorkspace.workspaceRootDir } : {}),
model: { primary: modelKey },
models: { [modelKey]: {} },
models: {
[modelKey]: {},
...(modelSwitchTarget ? { [modelSwitchTarget]: {} } : {}),
},
cliBackends: {
...existingBackends,
[providerId]: {
@@ -216,6 +227,8 @@ describeLive("gateway live (cli backend)", () => {
try {
const sessionKey = "agent:dev:live-cli-backend";
const nonce = randomBytes(3).toString("hex").toUpperCase();
const memoryNonce = randomBytes(3).toString("hex").toUpperCase();
const memoryToken = `CLI-MEM-${memoryNonce}`;
logCliBackendLiveStep("agent-request:start", { sessionKey, nonce });
const payload = await client.request(
"agent",
@@ -225,7 +238,11 @@ describeLive("gateway live (cli backend)", () => {
message:
providerId === "codex-cli"
? `Please include the token CLI-BACKEND-${nonce} in your reply.`
: `Reply with exactly: CLI backend OK ${nonce}.`,
: enableCliModelSwitchProbe
? `Reply with exactly: CLI backend OK ${nonce}.` +
` Also remember this session note for later: ${memoryToken}.` +
" Do not include the note in your reply."
: `Reply with exactly: CLI backend OK ${nonce}.`,
deliver: false,
},
{ expectFinal: true },
@@ -250,7 +267,49 @@ describeLive("gateway live (cli backend)", () => {
).toEqual(expect.arrayContaining(bootstrapWorkspace?.expectedInjectedFiles ?? []));
}
if (CLI_RESUME) {
if (modelSwitchTarget) {
const switchNonce = randomBytes(3).toString("hex").toUpperCase();
logCliBackendLiveStep("agent-switch:start", {
sessionKey,
fromModel: modelKey,
toModel: modelSwitchTarget,
switchNonce,
memoryToken,
});
const patchPayload = await client.request("sessions.patch", {
key: sessionKey,
model: modelSwitchTarget,
});
if (!patchPayload || typeof patchPayload !== "object" || !("ok" in patchPayload)) {
throw new Error(
`sessions.patch failed for model switch: ${JSON.stringify(patchPayload)}`,
);
}
const switchPayload = await client.request(
"agent",
{
sessionKey,
idempotencyKey: `idem-${randomUUID()}`,
message:
"We just switched from Claude Sonnet to Claude Opus in the same session. " +
`What session note did I ask you to remember earlier? ` +
`Reply with exactly: CLI backend SWITCH OK ${switchNonce} <remembered-note>.`,
deliver: false,
},
{ expectFinal: true },
);
if (switchPayload?.status !== "ok") {
throw new Error(`switch status=${String(switchPayload?.status)}`);
}
logCliBackendLiveStep("agent-switch:done", { status: switchPayload?.status });
const switchText = extractPayloadText(switchPayload?.result);
expect(
matchesCliBackendReply(
switchText,
`CLI backend SWITCH OK ${switchNonce} ${memoryToken}.`,
),
).toBe(true);
} else if (CLI_RESUME) {
const resumeNonce = randomBytes(3).toString("hex").toUpperCase();
logCliBackendLiveStep("agent-resume:start", { sessionKey, resumeNonce });
const resumePayload = await client.request(