From be81fa4424abb2a18d0fecb501c2cc290a803b09 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Thu, 23 Apr 2026 14:29:50 +0100 Subject: [PATCH] test: stabilize live docker probes --- src/gateway/gateway-acp-bind.live.test.ts | 21 +++++++------------ .../gateway-cli-backend.live-probe-helpers.ts | 4 ++-- ...gateway-codex-harness.live-helpers.test.ts | 18 ++++++++++++++++ .../gateway-codex-harness.live-helpers.ts | 7 ++++++- .../gateway-codex-harness.live.test.ts | 4 ++-- .../gateway-models.profiles.live.test.ts | 14 ++++++++++++- 6 files changed, 48 insertions(+), 20 deletions(-) diff --git a/src/gateway/gateway-acp-bind.live.test.ts b/src/gateway/gateway-acp-bind.live.test.ts index 3ed7c7c1aac..bd9f55cb712 100644 --- a/src/gateway/gateway-acp-bind.live.test.ts +++ b/src/gateway/gateway-acp-bind.live.test.ts @@ -223,16 +223,6 @@ function formatAssistantTextPreview(texts: string[], maxChars = 600): string { return combined.slice(-maxChars); } -function findAssistantTextContaining(texts: string[], needle: string): string | null { - for (let i = texts.length - 1; i >= 0; i -= 1) { - const text = texts[i]; - if (text?.includes(needle)) { - return text; - } - } - return null; -} - async function bindConversationAndWait(params: { client: GatewayClient; sessionKey: string; @@ -376,8 +366,11 @@ async function waitForAssistantText(params: { const messages = history.messages ?? []; const assistantTexts = extractAssistantTexts(messages); const lastAssistantText = assistantTexts.at(-1) ?? ""; - const matchedAssistantText = findAssistantTextContaining(assistantTexts, params.contains); - if (assistantTexts.length >= (params.minAssistantCount ?? 1) && matchedAssistantText) { + const minAssistantCount = params.minAssistantCount ?? 1; + const matchedAssistantText = assistantTexts + .slice(Math.max(0, minAssistantCount - 1)) + .find((text) => text.includes(params.contains)); + if (assistantTexts.length >= minAssistantCount && matchedAssistantText) { return { messages, lastAssistantText, matchedAssistantText }; } await sleep(500); @@ -715,8 +708,8 @@ describeLive("gateway live (ACP bind)", () => { sessionKey: originalSessionKey, idempotencyKey: `idem-image-${attempt}-${randomUUID()}`, message: - "Best match for the attached image: lobster, mouse, cat, horse. " + - "Reply with one lowercase word only.", + "Read the large word printed at the bottom of the attached image. " + + "Reply with that word in lowercase and nothing else.", originatingChannel: "slack", originatingTo: conversationId, originatingAccountId: accountId, diff --git a/src/gateway/gateway-cli-backend.live-probe-helpers.ts b/src/gateway/gateway-cli-backend.live-probe-helpers.ts index f28544bc743..2ca33999364 100644 --- a/src/gateway/gateway-cli-backend.live-probe-helpers.ts +++ b/src/gateway/gateway-cli-backend.live-probe-helpers.ts @@ -267,8 +267,8 @@ export async function verifyCliBackendImageProbe(params: { // still receives a local file path, but now via the runner code we // actually want to validate instead of an ad hoc prompt-only shortcut. message: - "Best match for the image: lobster, mouse, cat, horse. " + - "Reply with one lowercase word only.", + "Read the large word printed at the bottom of the attached image. " + + "Reply with that word in lowercase and nothing else.", attachments: [ { mimeType: "image/png", diff --git a/src/gateway/gateway-codex-harness.live-helpers.test.ts b/src/gateway/gateway-codex-harness.live-helpers.test.ts index be18fedabc0..965fd934bea 100644 --- a/src/gateway/gateway-codex-harness.live-helpers.test.ts +++ b/src/gateway/gateway-codex-harness.live-helpers.test.ts @@ -32,6 +32,24 @@ describe("gateway codex harness live helpers", () => { expect(isExpectedCodexModelsCommandText(text)).toBe(true); }); + it("accepts the agent-id summary with active Codex model", () => { + const text = [ + "Available agent IDs in this session:", + "", + "- `dev`", + "", + "Current active model:", + "- `codex/gpt-5.4`", + "", + "I couldn’t get a fuller model catalog from the local `codex` CLI here.", + ].join("\n"); + + expect( + EXPECTED_CODEX_MODELS_COMMAND_TEXT.some((expectedText) => text.includes(expectedText)), + ).toBe(true); + expect(isExpectedCodexModelsCommandText(text)).toBe(true); + }); + it("accepts sandbox namespace failures with current-session model fallback", () => { const text = [ "I can’t enumerate `/codex models` from this sandbox because the local `codex` CLI fails to start here with a user-namespace restriction (`bwrap: No permissions to create a new namespace`).", diff --git a/src/gateway/gateway-codex-harness.live-helpers.ts b/src/gateway/gateway-codex-harness.live-helpers.ts index bc8a338fbb8..bcc44a3e6b8 100644 --- a/src/gateway/gateway-codex-harness.live-helpers.ts +++ b/src/gateway/gateway-codex-harness.live-helpers.ts @@ -4,6 +4,7 @@ export const EXPECTED_CODEX_MODELS_COMMAND_TEXT = [ "Available models, local cache:", "Available agent target:", "Available agent targets:", + "Available agent IDs in this session:", "opened an interactive trust prompt", "opened an interactive model-selection prompt", "running as Codex on `codex/", @@ -98,7 +99,8 @@ export function isExpectedCodexModelsCommandText(text: string): boolean { normalized.includes("interactive tui"); const mentionsVisibleOptions = normalized.includes("visible options in this session:") || - normalized.includes("visible options:"); + normalized.includes("visible options:") || + normalized.includes("available agent ids in this session:"); const mentionsCurrentActiveModel = normalized.includes("current active model is `codex/") || normalized.includes("current active model is codex/"); @@ -110,6 +112,8 @@ export function isExpectedCodexModelsCommandText(text: string): boolean { mentionsInteractiveSelection && mentionsVisibleOptions && mentionsCurrentActiveModel; + const isAgentIdModelSummary = + normalized.includes("available agent ids in this session:") && text.includes("`codex/"); const isInteractiveTuiSummary = mentionsCodexModelsCommand && mentionsInteractiveSelection && @@ -120,6 +124,7 @@ export function isExpectedCodexModelsCommandText(text: string): boolean { isSandboxFallback || isSessionConfigFallback || isInteractiveSelectionSummary || + isAgentIdModelSummary || isInteractiveTuiSummary ); } diff --git a/src/gateway/gateway-codex-harness.live.test.ts b/src/gateway/gateway-codex-harness.live.test.ts index 8c35fe8076a..bbda3e5cdf0 100644 --- a/src/gateway/gateway-codex-harness.live.test.ts +++ b/src/gateway/gateway-codex-harness.live.test.ts @@ -375,8 +375,8 @@ async function verifyCodexImageProbe(params: { sessionKey: params.sessionKey, idempotencyKey: `idem-${runId}-image`, message: - "Best match for the image: lobster, mouse, cat, horse. " + - "Reply with one lowercase word only.", + "Read the large word printed at the bottom of the attached image. " + + "Reply with that word in lowercase and nothing else.", attachments: [ { mimeType: "image/png", diff --git a/src/gateway/gateway-models.profiles.live.test.ts b/src/gateway/gateway-models.profiles.live.test.ts index dad70ddb8e8..a3afb1f967d 100644 --- a/src/gateway/gateway-models.profiles.live.test.ts +++ b/src/gateway/gateway-models.profiles.live.test.ts @@ -57,7 +57,7 @@ const GATEWAY_LIVE_SMOKE = isTruthyEnvValue(process.env.OPENCLAW_LIVE_GATEWAY_SM const THINKING_LEVEL = GATEWAY_LIVE_SMOKE ? "low" : "high"; const ENABLE_EXTRA_TOOL_PROBES = !GATEWAY_LIVE_SMOKE; const ENABLE_EXTRA_IMAGE_PROBES = !GATEWAY_LIVE_SMOKE; -const THINKING_TAG_RE = /<\s*\/?\s*(?:think(?:ing)?|thought|antthinking)\s*>/i; +const THINKING_TAG_RE = /<\s*\/?\s*(?:(?:antml:)?(?:think(?:ing)?|thought)|antthinking)\s*>/i; const FINAL_TAG_RE = /<\s*\/?\s*final\s*>/i; const ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL = "ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL"; const GATEWAY_LIVE_DEFAULT_TIMEOUT_MS = 20 * 60 * 1000; @@ -331,6 +331,9 @@ function shouldStripAssistantScaffoldingForLiveModel(modelKey?: string): boolean } const [provider, ...rest] = modelKey.split("/"); const modelId = rest.join("/"); + if (provider === "anthropic") { + return true; + } if (provider === "minimax" || provider === "minimax-portal") { // MiniMax transcript persistence can mirror our wrapper style even // though user-visible surfaces already strip it. Keep the live reader @@ -433,6 +436,15 @@ describe("maybeStripAssistantScaffoldingForLiveModel", () => { ).toBe("Visible"); }); + it("strips Anthropic antml transcript wrappers", () => { + expect( + maybeStripAssistantScaffoldingForLiveModel( + "hiddenVisible", + "anthropic/claude-opus-4-6", + ), + ).toBe("Visible"); + }); + it("strips scaffolding for MiniMax transcript wrappers", () => { expect( maybeStripAssistantScaffoldingForLiveModel(