diff --git a/docs/help/testing.md b/docs/help/testing.md index 141a887a162..5923f0ab9ae 100644 --- a/docs/help/testing.md +++ b/docs/help/testing.md @@ -435,6 +435,8 @@ These run `pnpm test:live` inside the repo Docker image, mounting your local con The live-model Docker runners also bind-mount the current checkout read-only and stage it into a temporary workdir inside the container. This keeps the runtime image slim while still running Vitest against your exact local source/config. +They also set `OPENCLAW_SKIP_CHANNELS=1` so gateway live probes do not start +real Telegram/Discord/etc. channel workers inside the container. `test:docker:live-models` still runs `pnpm test:live`, so pass through `OPENCLAW_LIVE_GATEWAY_*` as well when you need to narrow or exclude gateway live coverage from that Docker lane. diff --git a/scripts/test-live-gateway-models-docker.sh b/scripts/test-live-gateway-models-docker.sh index 051808acfe6..43bf0a67c4a 100755 --- a/scripts/test-live-gateway-models-docker.sh +++ b/scripts/test-live-gateway-models-docker.sh @@ -86,6 +86,7 @@ docker run --rm -t \ -e COREPACK_ENABLE_DOWNLOAD_PROMPT=0 \ -e HOME=/home/node \ -e NODE_OPTIONS=--disable-warning=ExperimentalWarning \ + -e OPENCLAW_SKIP_CHANNELS=1 \ -e OPENCLAW_DOCKER_AUTH_DIRS_RESOLVED="$AUTH_DIRS_CSV" \ -e OPENCLAW_LIVE_TEST=1 \ -e OPENCLAW_LIVE_GATEWAY_MODELS="${OPENCLAW_LIVE_GATEWAY_MODELS:-modern}" \ diff --git a/scripts/test-live-models-docker.sh b/scripts/test-live-models-docker.sh index 56c9eddca60..928f8e5b602 100755 --- a/scripts/test-live-models-docker.sh +++ b/scripts/test-live-models-docker.sh @@ -91,6 +91,7 @@ docker run --rm -t \ -e COREPACK_ENABLE_DOWNLOAD_PROMPT=0 \ -e HOME=/home/node \ -e NODE_OPTIONS=--disable-warning=ExperimentalWarning \ + -e OPENCLAW_SKIP_CHANNELS=1 \ -e OPENCLAW_DOCKER_AUTH_DIRS_RESOLVED="$AUTH_DIRS_CSV" \ -e OPENCLAW_LIVE_TEST=1 \ -e OPENCLAW_LIVE_MODELS="${OPENCLAW_LIVE_MODELS:-modern}" \ diff --git a/src/agents/live-model-errors.test.ts b/src/agents/live-model-errors.test.ts index ec9440fbe57..1449164d532 100644 --- a/src/agents/live-model-errors.test.ts +++ b/src/agents/live-model-errors.test.ts @@ -8,6 +8,11 @@ describe("live model error helpers", () => { it("detects generic model-not-found messages", () => { expect(isModelNotFoundErrorMessage('{"code":404,"message":"model not found"}')).toBe(true); expect(isModelNotFoundErrorMessage("model: MiniMax-M2.7-highspeed not found")).toBe(true); + expect( + isModelNotFoundErrorMessage( + "HTTP 400 not_found_error: model: claude-3-5-haiku-20241022 (request_id: req_123)", + ), + ).toBe(true); expect(isModelNotFoundErrorMessage("request ended without sending any chunks")).toBe(false); }); diff --git a/src/gateway/gateway-models.profiles.live.test.ts b/src/gateway/gateway-models.profiles.live.test.ts index bb3327680cb..85ea692c098 100644 --- a/src/gateway/gateway-models.profiles.live.test.ts +++ b/src/gateway/gateway-models.profiles.live.test.ts @@ -4,7 +4,7 @@ import { createServer } from "node:net"; import os from "node:os"; import path from "node:path"; import type { Api, Model } from "@mariozechner/pi-ai"; -import { describe, it } from "vitest"; +import { describe, expect, it } from "vitest"; import { resolveOpenClawAgentDir } from "../agents/agent-paths.js"; import { resolveAgentWorkspaceDir } from "../agents/agent-scope.js"; import { @@ -17,6 +17,7 @@ import { isAnthropicBillingError, isAnthropicRateLimitError, } from "../agents/live-auth-keys.js"; +import { isModelNotFoundErrorMessage } from "../agents/live-model-errors.js"; import { isModernModelRef } from "../agents/live-model-filter.js"; import { isLiveTestEnabled } from "../agents/live-test-helpers.js"; import { getApiKeyForModel } from "../agents/model-auth.js"; @@ -28,6 +29,7 @@ import { clearRuntimeConfigSnapshot, loadConfig } from "../config/config.js"; import type { ModelsConfig, OpenClawConfig, ModelProviderConfig } from "../config/types.js"; import { isTruthyEnvValue } from "../infra/env.js"; import { DEFAULT_AGENT_ID } from "../routing/session-key.js"; +import { stripAssistantInternalScaffolding } from "../shared/text/assistant-visible-text.js"; import { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES } from "../utils/message-channel.js"; import { GatewayClient } from "./client.js"; import { renderCatNoncePngBase64 } from "./live-image-probe.js"; @@ -58,6 +60,7 @@ const GATEWAY_LIVE_HEARTBEAT_MS = Math.max( 1_000, toInt(process.env.OPENCLAW_LIVE_GATEWAY_HEARTBEAT_MS, 30_000), ); +const GATEWAY_LIVE_STRIP_SCAFFOLDING_MODEL_KEYS = new Set(["google/gemini-3-flash-preview"]); const GATEWAY_LIVE_MAX_MODELS = resolveGatewayLiveMaxModels(); const GATEWAY_LIVE_SUITE_TIMEOUT_MS = resolveGatewayLiveSuiteTimeoutMs(GATEWAY_LIVE_MAX_MODELS); @@ -267,6 +270,34 @@ function isMeaningful(text: string): boolean { return true; } +function shouldStripAssistantScaffoldingForLiveModel(modelKey?: string): boolean { + return !!modelKey && GATEWAY_LIVE_STRIP_SCAFFOLDING_MODEL_KEYS.has(modelKey); +} + +function maybeStripAssistantScaffoldingForLiveModel(text: string, modelKey?: string): string { + if (!shouldStripAssistantScaffoldingForLiveModel(modelKey)) { + return text; + } + return stripAssistantInternalScaffolding(text).trim(); +} + +describe("maybeStripAssistantScaffoldingForLiveModel", () => { + it("strips scaffolding only for the targeted live model", () => { + expect( + maybeStripAssistantScaffoldingForLiveModel( + "hiddenVisible", + "google/gemini-3-flash-preview", + ), + ).toBe("Visible"); + expect( + maybeStripAssistantScaffoldingForLiveModel( + "hiddenVisible", + "google/gemini-3-pro-preview", + ), + ).toBe("hiddenVisible"); + }); +}); + function isGoogleModelNotFoundText(text: string): boolean { const trimmed = text.trim(); if (!trimmed) { @@ -370,6 +401,7 @@ async function runAnthropicRefusalProbe(params: { message: `Reply with the single word ok. Test token: ${magic}`, thinkingLevel: params.thinkingLevel, context: `${params.label}: refusal-probe`, + modelKey: params.modelKey, }); assertNoReasoningTags({ text: probeText, @@ -388,6 +420,7 @@ async function runAnthropicRefusalProbe(params: { message: "Now reply with exactly: still ok.", thinkingLevel: params.thinkingLevel, context: `${params.label}: refusal-followup`, + modelKey: params.modelKey, }); assertNoReasoningTags({ text: followupText, @@ -560,7 +593,7 @@ function extractTranscriptMessageText(message: unknown): string { .trim(); } -function readSessionAssistantTexts(sessionKey: string): string[] { +function readSessionAssistantTexts(sessionKey: string, modelKey?: string): string[] { const { storePath, entry } = loadSessionEntry(sessionKey); if (!entry?.sessionId) { return []; @@ -575,7 +608,9 @@ function readSessionAssistantTexts(sessionKey: string): string[] { if (role !== "assistant") { continue; } - assistantTexts.push(extractTranscriptMessageText(message)); + assistantTexts.push( + maybeStripAssistantScaffoldingForLiveModel(extractTranscriptMessageText(message), modelKey), + ); } return assistantTexts; } @@ -584,12 +619,13 @@ async function waitForSessionAssistantText(params: { sessionKey: string; baselineAssistantCount: number; context: string; + modelKey?: string; }) { const startedAt = Date.now(); let lastHeartbeatAt = startedAt; let delayMs = 50; while (Date.now() - startedAt < GATEWAY_LIVE_PROBE_TIMEOUT_MS) { - const assistantTexts = readSessionAssistantTexts(params.sessionKey); + const assistantTexts = readSessionAssistantTexts(params.sessionKey, params.modelKey); if (assistantTexts.length > params.baselineAssistantCount) { const freshText = assistantTexts .slice(params.baselineAssistantCount) @@ -618,13 +654,17 @@ async function requestGatewayAgentText(params: { thinkingLevel: string; context: string; idempotencyKey: string; + modelKey?: string; attachments?: Array<{ mimeType: string; fileName: string; content: string; }>; }) { - const baselineAssistantCount = readSessionAssistantTexts(params.sessionKey).length; + const baselineAssistantCount = readSessionAssistantTexts( + params.sessionKey, + params.modelKey, + ).length; const accepted = await withGatewayLiveProbeTimeout( params.client.request<{ runId?: unknown; status?: unknown }>("agent", { sessionKey: params.sessionKey, @@ -643,6 +683,7 @@ async function requestGatewayAgentText(params: { sessionKey: params.sessionKey, baselineAssistantCount, context: `${params.context}: transcript-final`, + modelKey: params.modelKey, }); } @@ -650,6 +691,7 @@ type GatewayModelSuiteParams = { label: string; cfg: OpenClawConfig; candidates: Array>; + allowNotFoundSkip: boolean; extraToolProbes: boolean; extraImageProbes: boolean; thinkingLevel: string; @@ -935,6 +977,7 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { client, sessionKey, idempotencyKey: `idem-${randomUUID()}`, + modelKey, message: "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.", thinkingLevel: params.thinkingLevel, @@ -946,6 +989,7 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { client, sessionKey, idempotencyKey: `idem-${randomUUID()}-retry`, + modelKey, message: "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.", thinkingLevel: params.thinkingLevel, @@ -969,6 +1013,10 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { logProgress(`${progressLabel}: skip (google model not found)`); break; } + if (params.allowNotFoundSkip && isModelNotFoundErrorMessage(text)) { + logProgress(`${progressLabel}: skip (model not found)`); + break; + } assertNoReasoningTags({ text, model: modelKey, @@ -1001,6 +1049,7 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { client, sessionKey, idempotencyKey: `idem-${runIdTool}-tool-${toolReadAttempt + 1}`, + modelKey, message: strictReply ? "OpenClaw live tool probe (local, safe): " + `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` + @@ -1064,6 +1113,7 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { client, sessionKey, idempotencyKey: `idem-${runIdTool}-exec-read-${execReadAttempt + 1}`, + modelKey, message: strictReply ? "OpenClaw live tool probe (local, safe): " + "use the tool named `exec` (or `Exec`) to run this command: " + @@ -1128,6 +1178,7 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { client, sessionKey, idempotencyKey: `idem-${runIdImage}-image`, + modelKey, message: "Look at the attached image. Reply with exactly two tokens separated by a single space: " + "(1) the animal shown or written in the image, lowercase; " + @@ -1185,6 +1236,7 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { client, sessionKey, idempotencyKey: `idem-${runId2}-1`, + modelKey, message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`, thinkingLevel: params.thinkingLevel, context: `${progressLabel}: tool-only-regression-first`, @@ -1200,6 +1252,7 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { client, sessionKey, idempotencyKey: `idem-${runId2}-2`, + modelKey, message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`, thinkingLevel: params.thinkingLevel, context: `${progressLabel}: tool-only-regression-second`, @@ -1268,11 +1321,27 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { logProgress(`${progressLabel}: skip (google rate limit)`); break; } + if ( + (model.provider === "minimax" || + model.provider === "opencode" || + model.provider === "opencode-go" || + model.provider === "zai") && + isRateLimitErrorMessage(message) + ) { + skippedCount += 1; + logProgress(`${progressLabel}: skip (rate limit)`); + break; + } if (isProviderUnavailableErrorMessage(message)) { skippedCount += 1; logProgress(`${progressLabel}: skip (provider unavailable)`); break; } + if (params.allowNotFoundSkip && isModelNotFoundErrorMessage(message)) { + skippedCount += 1; + logProgress(`${progressLabel}: skip (model not found)`); + break; + } if ( model.provider === "anthropic" && isGatewayLiveProbeTimeout(message) && @@ -1448,6 +1517,7 @@ describeLive("gateway live (dev agent, profile keys)", () => { label: "all-models", cfg, candidates: selectedCandidates, + allowNotFoundSkip: useModern, extraToolProbes: true, extraImageProbes: true, thinkingLevel: THINKING_LEVEL, @@ -1469,6 +1539,7 @@ describeLive("gateway live (dev agent, profile keys)", () => { label: "minimax-anthropic", cfg, candidates: minimaxCandidates, + allowNotFoundSkip: useModern, extraToolProbes: true, extraImageProbes: true, thinkingLevel: THINKING_LEVEL, @@ -1589,6 +1660,7 @@ describeLive("gateway live (dev agent, profile keys)", () => { client, sessionKey, idempotencyKey: `idem-${randomUUID()}-tool`, + modelKey: "anthropic/claude-opus-4-5", message: `Call the tool named \`read\` (or \`Read\` if \`read\` is unavailable) with JSON arguments {"path":"${toolProbePath}"}. ` + `Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`, @@ -1617,6 +1689,7 @@ describeLive("gateway live (dev agent, profile keys)", () => { client, sessionKey, idempotencyKey: `idem-${randomUUID()}-followup`, + modelKey: "zai/glm-4.7", message: `What are the values of nonceA and nonceB in "${toolProbePath}"? ` + `Reply with exactly: ${nonceA} ${nonceB}.`, diff --git a/src/gateway/session-utils.fs.test.ts b/src/gateway/session-utils.fs.test.ts index ca95b86aca1..711948f5b6f 100644 --- a/src/gateway/session-utils.fs.test.ts +++ b/src/gateway/session-utils.fs.test.ts @@ -556,6 +556,39 @@ describe("readSessionMessages", () => { expect((out[0] as { __openclaw?: { seq?: number } }).__openclaw?.seq).toBe(1); } }); + + test("preserves raw assistant transcript content on disk reads", () => { + const sessionId = "assistant-scaffolding"; + const transcriptPath = path.join(tmpDir, `${sessionId}.jsonl`); + fs.writeFileSync( + transcriptPath, + [ + JSON.stringify({ type: "session", version: 1, id: sessionId }), + JSON.stringify({ + message: { + role: "assistant", + text: "hiddenVisible top-level", + content: [ + { type: "text", text: "secretVisible content" }, + { type: "tool_result", text: "keep?Visible tool text" }, + ], + }, + }), + ].join("\n"), + "utf-8", + ); + + const out = readSessionMessages(sessionId, storePath); + expect(out).toHaveLength(1); + expect(out[0]).toMatchObject({ + role: "assistant", + text: "hiddenVisible top-level", + content: [ + { type: "text", text: "secretVisible content" }, + { type: "tool_result", text: "keep?Visible tool text" }, + ], + }); + }); }); describe("readSessionPreviewItemsFromTranscript", () => {