test: stabilize live docker probes

2026-05-06 06:50:43 +00:00 · 2026-04-23 14:29:50 +01:00
parent e98331b0be
commit be81fa4424
6 changed files with 48 additions and 20 deletions
--- a/src/gateway/gateway-acp-bind.live.test.ts
+++ b/src/gateway/gateway-acp-bind.live.test.ts
@@ -223,16 +223,6 @@ function formatAssistantTextPreview(texts: string[], maxChars = 600): string {
  return combined.slice(-maxChars);
 }

-function findAssistantTextContaining(texts: string[], needle: string): string | null {
-  for (let i = texts.length - 1; i >= 0; i -= 1) {
-    const text = texts[i];
-    if (text?.includes(needle)) {
-      return text;
-    }
-  }
-  return null;
-}
-
 async function bindConversationAndWait(params: {
  client: GatewayClient;
  sessionKey: string;
@@ -376,8 +366,11 @@ async function waitForAssistantText(params: {
    const messages = history.messages ?? [];
    const assistantTexts = extractAssistantTexts(messages);
    const lastAssistantText = assistantTexts.at(-1) ?? "";
-    const matchedAssistantText = findAssistantTextContaining(assistantTexts, params.contains);
-    if (assistantTexts.length >= (params.minAssistantCount ?? 1) && matchedAssistantText) {
+    const minAssistantCount = params.minAssistantCount ?? 1;
+    const matchedAssistantText = assistantTexts
+      .slice(Math.max(0, minAssistantCount - 1))
+      .find((text) => text.includes(params.contains));
+    if (assistantTexts.length >= minAssistantCount && matchedAssistantText) {
      return { messages, lastAssistantText, matchedAssistantText };
    }
    await sleep(500);
@@ -715,8 +708,8 @@ describeLive("gateway live (ACP bind)", () => {
            sessionKey: originalSessionKey,
            idempotencyKey: `idem-image-${attempt}-${randomUUID()}`,
            message:
-              "Best match for the attached image: lobster, mouse, cat, horse. " +
-              "Reply with one lowercase word only.",
+              "Read the large word printed at the bottom of the attached image. " +
+              "Reply with that word in lowercase and nothing else.",
            originatingChannel: "slack",
            originatingTo: conversationId,
            originatingAccountId: accountId,
--- a/src/gateway/gateway-cli-backend.live-probe-helpers.ts
+++ b/src/gateway/gateway-cli-backend.live-probe-helpers.ts
@@ -267,8 +267,8 @@ export async function verifyCliBackendImageProbe(params: {
      // still receives a local file path, but now via the runner code we
      // actually want to validate instead of an ad hoc prompt-only shortcut.
      message:
-        "Best match for the image: lobster, mouse, cat, horse. " +
-        "Reply with one lowercase word only.",
+        "Read the large word printed at the bottom of the attached image. " +
+        "Reply with that word in lowercase and nothing else.",
      attachments: [
        {
          mimeType: "image/png",
--- a/src/gateway/gateway-codex-harness.live-helpers.test.ts
+++ b/src/gateway/gateway-codex-harness.live-helpers.test.ts
@@ -32,6 +32,24 @@ describe("gateway codex harness live helpers", () => {
    expect(isExpectedCodexModelsCommandText(text)).toBe(true);
  });

+  it("accepts the agent-id summary with active Codex model", () => {
+    const text = [
+      "Available agent IDs in this session:",
+      "",
+      "- `dev`",
+      "",
+      "Current active model:",
+      "- `codex/gpt-5.4`",
+      "",
+      "I couldn’t get a fuller model catalog from the local `codex` CLI here.",
+    ].join("\n");
+
+    expect(
+      EXPECTED_CODEX_MODELS_COMMAND_TEXT.some((expectedText) => text.includes(expectedText)),
+    ).toBe(true);
+    expect(isExpectedCodexModelsCommandText(text)).toBe(true);
+  });
+
  it("accepts sandbox namespace failures with current-session model fallback", () => {
    const text = [
      "I can’t enumerate `/codex models` from this sandbox because the local `codex` CLI fails to start here with a user-namespace restriction (`bwrap: No permissions to create a new namespace`).",
--- a/src/gateway/gateway-codex-harness.live-helpers.ts
+++ b/src/gateway/gateway-codex-harness.live-helpers.ts
@@ -4,6 +4,7 @@ export const EXPECTED_CODEX_MODELS_COMMAND_TEXT = [
  "Available models, local cache:",
  "Available agent target:",
  "Available agent targets:",
+  "Available agent IDs in this session:",
  "opened an interactive trust prompt",
  "opened an interactive model-selection prompt",
  "running as Codex on `codex/",
@@ -98,7 +99,8 @@ export function isExpectedCodexModelsCommandText(text: string): boolean {
    normalized.includes("interactive tui");
  const mentionsVisibleOptions =
    normalized.includes("visible options in this session:") ||
-    normalized.includes("visible options:");
+    normalized.includes("visible options:") ||
+    normalized.includes("available agent ids in this session:");
  const mentionsCurrentActiveModel =
    normalized.includes("current active model is `codex/") ||
    normalized.includes("current active model is codex/");
@@ -110,6 +112,8 @@ export function isExpectedCodexModelsCommandText(text: string): boolean {
    mentionsInteractiveSelection &&
    mentionsVisibleOptions &&
    mentionsCurrentActiveModel;
+  const isAgentIdModelSummary =
+    normalized.includes("available agent ids in this session:") && text.includes("`codex/");
  const isInteractiveTuiSummary =
    mentionsCodexModelsCommand &&
    mentionsInteractiveSelection &&
@@ -120,6 +124,7 @@ export function isExpectedCodexModelsCommandText(text: string): boolean {
    isSandboxFallback ||
    isSessionConfigFallback ||
    isInteractiveSelectionSummary ||
+    isAgentIdModelSummary ||
    isInteractiveTuiSummary
  );
 }
--- a/src/gateway/gateway-codex-harness.live.test.ts
+++ b/src/gateway/gateway-codex-harness.live.test.ts
@@ -375,8 +375,8 @@ async function verifyCodexImageProbe(params: {
      sessionKey: params.sessionKey,
      idempotencyKey: `idem-${runId}-image`,
      message:
-        "Best match for the image: lobster, mouse, cat, horse. " +
-        "Reply with one lowercase word only.",
+        "Read the large word printed at the bottom of the attached image. " +
+        "Reply with that word in lowercase and nothing else.",
      attachments: [
        {
          mimeType: "image/png",
--- a/src/gateway/gateway-models.profiles.live.test.ts
+++ b/src/gateway/gateway-models.profiles.live.test.ts
@@ -57,7 +57,7 @@ const GATEWAY_LIVE_SMOKE = isTruthyEnvValue(process.env.OPENCLAW_LIVE_GATEWAY_SM
 const THINKING_LEVEL = GATEWAY_LIVE_SMOKE ? "low" : "high";
 const ENABLE_EXTRA_TOOL_PROBES = !GATEWAY_LIVE_SMOKE;
 const ENABLE_EXTRA_IMAGE_PROBES = !GATEWAY_LIVE_SMOKE;
-const THINKING_TAG_RE = /<\s*\/?\s*(?:think(?:ing)?|thought|antthinking)\s*>/i;
+const THINKING_TAG_RE = /<\s*\/?\s*(?:(?:antml:)?(?:think(?:ing)?|thought)|antthinking)\s*>/i;
 const FINAL_TAG_RE = /<\s*\/?\s*final\s*>/i;
 const ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL = "ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL";
 const GATEWAY_LIVE_DEFAULT_TIMEOUT_MS = 20 * 60 * 1000;
@@ -331,6 +331,9 @@ function shouldStripAssistantScaffoldingForLiveModel(modelKey?: string): boolean
  }
  const [provider, ...rest] = modelKey.split("/");
  const modelId = rest.join("/");
+  if (provider === "anthropic") {
+    return true;
+  }
  if (provider === "minimax" || provider === "minimax-portal") {
    // MiniMax transcript persistence can mirror our <final> wrapper style even
    // though user-visible surfaces already strip it. Keep the live reader
@@ -433,6 +436,15 @@ describe("maybeStripAssistantScaffoldingForLiveModel", () => {
    ).toBe("<final>Visible</final>");
  });

+  it("strips Anthropic antml transcript wrappers", () => {
+    expect(
+      maybeStripAssistantScaffoldingForLiveModel(
+        "<antml:thinking>hidden</thinking>Visible",
+        "anthropic/claude-opus-4-6",
+      ),
+    ).toBe("Visible");
+  });
+
  it("strips scaffolding for MiniMax transcript wrappers", () => {
    expect(
      maybeStripAssistantScaffoldingForLiveModel(