test: stabilize live docker probes

This commit is contained in:
Peter Steinberger
2026-04-23 14:29:50 +01:00
parent e98331b0be
commit be81fa4424
6 changed files with 48 additions and 20 deletions

View File

@@ -223,16 +223,6 @@ function formatAssistantTextPreview(texts: string[], maxChars = 600): string {
return combined.slice(-maxChars);
}
function findAssistantTextContaining(texts: string[], needle: string): string | null {
for (let i = texts.length - 1; i >= 0; i -= 1) {
const text = texts[i];
if (text?.includes(needle)) {
return text;
}
}
return null;
}
async function bindConversationAndWait(params: {
client: GatewayClient;
sessionKey: string;
@@ -376,8 +366,11 @@ async function waitForAssistantText(params: {
const messages = history.messages ?? [];
const assistantTexts = extractAssistantTexts(messages);
const lastAssistantText = assistantTexts.at(-1) ?? "";
const matchedAssistantText = findAssistantTextContaining(assistantTexts, params.contains);
if (assistantTexts.length >= (params.minAssistantCount ?? 1) && matchedAssistantText) {
const minAssistantCount = params.minAssistantCount ?? 1;
const matchedAssistantText = assistantTexts
.slice(Math.max(0, minAssistantCount - 1))
.find((text) => text.includes(params.contains));
if (assistantTexts.length >= minAssistantCount && matchedAssistantText) {
return { messages, lastAssistantText, matchedAssistantText };
}
await sleep(500);
@@ -715,8 +708,8 @@ describeLive("gateway live (ACP bind)", () => {
sessionKey: originalSessionKey,
idempotencyKey: `idem-image-${attempt}-${randomUUID()}`,
message:
"Best match for the attached image: lobster, mouse, cat, horse. " +
"Reply with one lowercase word only.",
"Read the large word printed at the bottom of the attached image. " +
"Reply with that word in lowercase and nothing else.",
originatingChannel: "slack",
originatingTo: conversationId,
originatingAccountId: accountId,

View File

@@ -267,8 +267,8 @@ export async function verifyCliBackendImageProbe(params: {
// still receives a local file path, but now via the runner code we
// actually want to validate instead of an ad hoc prompt-only shortcut.
message:
"Best match for the image: lobster, mouse, cat, horse. " +
"Reply with one lowercase word only.",
"Read the large word printed at the bottom of the attached image. " +
"Reply with that word in lowercase and nothing else.",
attachments: [
{
mimeType: "image/png",

View File

@@ -32,6 +32,24 @@ describe("gateway codex harness live helpers", () => {
expect(isExpectedCodexModelsCommandText(text)).toBe(true);
});
it("accepts the agent-id summary with active Codex model", () => {
const text = [
"Available agent IDs in this session:",
"",
"- `dev`",
"",
"Current active model:",
"- `codex/gpt-5.4`",
"",
"I couldnt get a fuller model catalog from the local `codex` CLI here.",
].join("\n");
expect(
EXPECTED_CODEX_MODELS_COMMAND_TEXT.some((expectedText) => text.includes(expectedText)),
).toBe(true);
expect(isExpectedCodexModelsCommandText(text)).toBe(true);
});
it("accepts sandbox namespace failures with current-session model fallback", () => {
const text = [
"I cant enumerate `/codex models` from this sandbox because the local `codex` CLI fails to start here with a user-namespace restriction (`bwrap: No permissions to create a new namespace`).",

View File

@@ -4,6 +4,7 @@ export const EXPECTED_CODEX_MODELS_COMMAND_TEXT = [
"Available models, local cache:",
"Available agent target:",
"Available agent targets:",
"Available agent IDs in this session:",
"opened an interactive trust prompt",
"opened an interactive model-selection prompt",
"running as Codex on `codex/",
@@ -98,7 +99,8 @@ export function isExpectedCodexModelsCommandText(text: string): boolean {
normalized.includes("interactive tui");
const mentionsVisibleOptions =
normalized.includes("visible options in this session:") ||
normalized.includes("visible options:");
normalized.includes("visible options:") ||
normalized.includes("available agent ids in this session:");
const mentionsCurrentActiveModel =
normalized.includes("current active model is `codex/") ||
normalized.includes("current active model is codex/");
@@ -110,6 +112,8 @@ export function isExpectedCodexModelsCommandText(text: string): boolean {
mentionsInteractiveSelection &&
mentionsVisibleOptions &&
mentionsCurrentActiveModel;
const isAgentIdModelSummary =
normalized.includes("available agent ids in this session:") && text.includes("`codex/");
const isInteractiveTuiSummary =
mentionsCodexModelsCommand &&
mentionsInteractiveSelection &&
@@ -120,6 +124,7 @@ export function isExpectedCodexModelsCommandText(text: string): boolean {
isSandboxFallback ||
isSessionConfigFallback ||
isInteractiveSelectionSummary ||
isAgentIdModelSummary ||
isInteractiveTuiSummary
);
}

View File

@@ -375,8 +375,8 @@ async function verifyCodexImageProbe(params: {
sessionKey: params.sessionKey,
idempotencyKey: `idem-${runId}-image`,
message:
"Best match for the image: lobster, mouse, cat, horse. " +
"Reply with one lowercase word only.",
"Read the large word printed at the bottom of the attached image. " +
"Reply with that word in lowercase and nothing else.",
attachments: [
{
mimeType: "image/png",

View File

@@ -57,7 +57,7 @@ const GATEWAY_LIVE_SMOKE = isTruthyEnvValue(process.env.OPENCLAW_LIVE_GATEWAY_SM
const THINKING_LEVEL = GATEWAY_LIVE_SMOKE ? "low" : "high";
const ENABLE_EXTRA_TOOL_PROBES = !GATEWAY_LIVE_SMOKE;
const ENABLE_EXTRA_IMAGE_PROBES = !GATEWAY_LIVE_SMOKE;
const THINKING_TAG_RE = /<\s*\/?\s*(?:think(?:ing)?|thought|antthinking)\s*>/i;
const THINKING_TAG_RE = /<\s*\/?\s*(?:(?:antml:)?(?:think(?:ing)?|thought)|antthinking)\s*>/i;
const FINAL_TAG_RE = /<\s*\/?\s*final\s*>/i;
const ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL = "ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL";
const GATEWAY_LIVE_DEFAULT_TIMEOUT_MS = 20 * 60 * 1000;
@@ -331,6 +331,9 @@ function shouldStripAssistantScaffoldingForLiveModel(modelKey?: string): boolean
}
const [provider, ...rest] = modelKey.split("/");
const modelId = rest.join("/");
if (provider === "anthropic") {
return true;
}
if (provider === "minimax" || provider === "minimax-portal") {
// MiniMax transcript persistence can mirror our <final> wrapper style even
// though user-visible surfaces already strip it. Keep the live reader
@@ -433,6 +436,15 @@ describe("maybeStripAssistantScaffoldingForLiveModel", () => {
).toBe("<final>Visible</final>");
});
it("strips Anthropic antml transcript wrappers", () => {
expect(
maybeStripAssistantScaffoldingForLiveModel(
"<antml:thinking>hidden</thinking>Visible",
"anthropic/claude-opus-4-6",
),
).toBe("Visible");
});
it("strips scaffolding for MiniMax transcript wrappers", () => {
expect(
maybeStripAssistantScaffoldingForLiveModel(