From 1e1aaa51e146dd9085d506f1d2ddb0cb71a048df Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Thu, 23 Apr 2026 15:27:22 +0100 Subject: [PATCH] test: harden live model extra probes --- docs/help/testing.md | 8 ++-- src/agents/live-model-turn-probes.test.ts | 37 +++++++++++++++---- src/agents/live-model-turn-probes.ts | 45 ++++++++++++++++------- src/agents/models.profiles.live.test.ts | 28 +++++++++++++- 4 files changed, 91 insertions(+), 27 deletions(-) diff --git a/docs/help/testing.md b/docs/help/testing.md index 55d5bfa59a3..cd6f09cc775 100644 --- a/docs/help/testing.md +++ b/docs/help/testing.md @@ -40,10 +40,10 @@ When debugging real providers/models (requires real creds): - Live suite (models + gateway tool/image probes): `pnpm test:live` - Target one live file quietly: `pnpm test:live -- src/agents/models.profiles.live.test.ts` - Docker live model sweep: `pnpm test:docker:live-models` - - Each selected model now runs a text turn plus a small file-read-style text - block probe. Models whose metadata advertises `image` input also run a tiny - image turn. Disable the extra probes with `OPENCLAW_LIVE_MODEL_FILE_PROBE=0` - or `OPENCLAW_LIVE_MODEL_IMAGE_PROBE=0` when isolating provider failures. + - Each selected model now runs a text turn plus a small file-read-style probe. + Models whose metadata advertises `image` input also run a tiny image turn. + Disable the extra probes with `OPENCLAW_LIVE_MODEL_FILE_PROBE=0` or + `OPENCLAW_LIVE_MODEL_IMAGE_PROBE=0` when isolating provider failures. - CI coverage: daily `OpenClaw Scheduled Live And E2E Checks` and manual `OpenClaw Release Checks` both call the reusable live/E2E workflow with `include_live_suites: true`, which includes separate Docker live model diff --git a/src/agents/live-model-turn-probes.test.ts b/src/agents/live-model-turn-probes.test.ts index 17c910049bb..b403a0bd3e3 100644 --- a/src/agents/live-model-turn-probes.test.ts +++ b/src/agents/live-model-turn-probes.test.ts @@ -1,6 +1,7 @@ import { describe, expect, it } from "vitest"; import { buildLiveModelFileProbeContext, + buildLiveModelFileProbeRetryContext, buildLiveModelImageProbeContext, extractAssistantText, fileProbeTextMatches, @@ -8,6 +9,7 @@ import { isLiveModelProbeEnabled, LIVE_MODEL_FILE_PROBE_TOKEN, modelSupportsImageInput, + shouldSkipLiveModelExtraProbes, } from "./live-model-turn-probes.js"; describe("live model turn probes", () => { @@ -27,15 +29,19 @@ describe("live model turn probes", () => { ).toBe(true); }); - it("builds a text-block file read probe", () => { + it("builds a text file read probe", () => { const context = buildLiveModelFileProbeContext({ systemPrompt: "sys" }); expect(context.systemPrompt).toBe("sys"); - expect(context.messages[0]?.content).toEqual([ - expect.objectContaining({ - type: "text", - text: expect.stringContaining(`LIVE_FILE_TOKEN=${LIVE_MODEL_FILE_PROBE_TOKEN}`), - }), - ]); + expect(context.messages[0]?.content).toEqual( + expect.stringContaining(`LIVE_FILE_TOKEN=${LIVE_MODEL_FILE_PROBE_TOKEN}`), + ); + }); + + it("builds a stricter file read retry probe", () => { + const context = buildLiveModelFileProbeRetryContext({}); + expect(context.messages[0]?.content).toEqual( + expect.stringContaining(`Reply with exactly ${LIVE_MODEL_FILE_PROBE_TOKEN}`), + ); }); it("builds an image probe with native image content", () => { @@ -63,9 +69,24 @@ describe("live model turn probes", () => { expect(modelSupportsImageInput({ input: ["text"] })).toBe(false); }); + it("skips known stale extra probe routes", () => { + expect( + shouldSkipLiveModelExtraProbes({ + provider: "openrouter", + id: "amazon/nova-2-lite-v1", + }), + ).toBe(true); + expect( + shouldSkipLiveModelExtraProbes({ + provider: "openrouter", + id: "amazon/nova-lite-v1", + }), + ).toBe(false); + }); + it("matches expected probe replies", () => { expect(fileProbeTextMatches(`The value is ${LIVE_MODEL_FILE_PROBE_TOKEN}.`)).toBe(true); - expect(fileProbeTextMatches("OPAL-731")).toBe(false); + expect(fileProbeTextMatches("amber")).toBe(false); expect(imageProbeTextMatches("OK")).toBe(true); expect(imageProbeTextMatches("blue")).toBe(false); }); diff --git a/src/agents/live-model-turn-probes.ts b/src/agents/live-model-turn-probes.ts index 92642dd9fb4..f47eb4fb149 100644 --- a/src/agents/live-model-turn-probes.ts +++ b/src/agents/live-model-turn-probes.ts @@ -1,12 +1,14 @@ import type { Api, AssistantMessage, Context, Model } from "@mariozechner/pi-ai"; -export const LIVE_MODEL_FILE_PROBE_TOKEN = "OPAL_731"; +export const LIVE_MODEL_FILE_PROBE_TOKEN = "opal"; export const LIVE_MODEL_FILE_PROBE_ENV = "OPENCLAW_LIVE_MODEL_FILE_PROBE"; export const LIVE_MODEL_IMAGE_PROBE_ENV = "OPENCLAW_LIVE_MODEL_IMAGE_PROBE"; const PROBE_PNG_BASE64 = - "iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAALklEQVR4nO3OoQEAAAyDsP7/9HYGJgJNdtuVDQAAAAAAACAHxH8AAAAAAACAHvBX0fhq85dN7QAAAABJRU5ErkJggg=="; + "iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAALUlEQVR4nO3OIQEAAAwCMPrnod8fAzMxv7S9pQgICAgICAgICAgICAgICKwDD+yWbLXSniMNAAAAAElFTkSuQmCC"; + +const KNOWN_EMPTY_EXTRA_PROBE_MODELS = new Set(["openrouter/amazon/nova-2-lite-v1"]); export function isLiveModelProbeEnabled( env: Record, @@ -31,22 +33,39 @@ export function modelSupportsImageInput(model: Pick, "input">): boole return model.input.includes("image"); } +export function shouldSkipLiveModelExtraProbes( + model: Pick, "id" | "provider">, +): boolean { + return KNOWN_EMPTY_EXTRA_PROBE_MODELS.has(`${model.provider}/${model.id}`); +} + export function buildLiveModelFileProbeContext(params: { systemPrompt?: string }): Context { return { systemPrompt: params.systemPrompt, messages: [ { role: "user", - content: [ - { - type: "text", - text: - "Read this file excerpt and reply with only the value after LIVE_FILE_TOKEN.\n\n" + - '\n' + - `LIVE_FILE_TOKEN=${LIVE_MODEL_FILE_PROBE_TOKEN}\n` + - "", - }, - ], + content: + "Read this file excerpt and reply with only the value after LIVE_FILE_TOKEN.\n\n" + + "File: live-model-probe.txt\n" + + "MIME: text/plain\n\n" + + `LIVE_FILE_TOKEN=${LIVE_MODEL_FILE_PROBE_TOKEN}`, + timestamp: Date.now(), + }, + ], + }; +} + +export function buildLiveModelFileProbeRetryContext(params: { systemPrompt?: string }): Context { + return { + systemPrompt: params.systemPrompt, + messages: [ + { + role: "user", + content: + "The file live-model-probe.txt contains exactly this token:\n\n" + + `${LIVE_MODEL_FILE_PROBE_TOKEN}\n\n` + + `Reply with exactly ${LIVE_MODEL_FILE_PROBE_TOKEN}.`, timestamp: Date.now(), }, ], @@ -77,7 +96,7 @@ export function buildLiveModelImageProbeContext(params: { systemPrompt?: string } export function fileProbeTextMatches(text: string): boolean { - return text.toUpperCase().includes(LIVE_MODEL_FILE_PROBE_TOKEN); + return text.toLowerCase().includes(LIVE_MODEL_FILE_PROBE_TOKEN.toLowerCase()); } export function imageProbeTextMatches(text: string): boolean { diff --git a/src/agents/models.profiles.live.test.ts b/src/agents/models.profiles.live.test.ts index 1a2c0111276..d827f658a6d 100644 --- a/src/agents/models.profiles.live.test.ts +++ b/src/agents/models.profiles.live.test.ts @@ -18,6 +18,7 @@ import { } from "./live-model-filter.js"; import { buildLiveModelFileProbeContext, + buildLiveModelFileProbeRetryContext, buildLiveModelImageProbeContext, extractAssistantText, fileProbeTextMatches, @@ -27,6 +28,7 @@ import { LIVE_MODEL_FILE_PROBE_TOKEN, LIVE_MODEL_IMAGE_PROBE_ENV, modelSupportsImageInput, + shouldSkipLiveModelExtraProbes, } from "./live-model-turn-probes.js"; import { createLiveTargetMatcher } from "./live-target-matcher.js"; import { isLiveProfileKeyModeEnabled, isLiveTestEnabled } from "./live-test-helpers.js"; @@ -452,10 +454,14 @@ async function runExtraTurnProbes(params: { timeoutMs: number; progressLabel: string; }) { + if (shouldSkipLiveModelExtraProbes(params.model)) { + logProgress(`${params.progressLabel}: extra probes skipped (known empty route)`); + return; + } const options = { apiKey: params.apiKey, reasoning: resolveTestReasoning(params.model), - maxTokens: 64, + maxTokens: 128, }; if (LIVE_FILE_PROBE_ENABLED) { logProgress(`${params.progressLabel}: file-read probe`); @@ -469,7 +475,25 @@ async function runExtraTurnProbes(params: { if (file.stopReason === "error") { throw new Error(file.errorMessage || "file-read probe returned error with no message"); } - const fileText = extractAssistantText(file); + let fileText = extractAssistantText(file); + if (!fileProbeTextMatches(fileText)) { + logProgress(`${params.progressLabel}: file-read probe retry`); + const retry = await completeSimpleWithTimeout( + params.model, + buildLiveModelFileProbeRetryContext({ + systemPrompt: resolveLiveSystemPrompt(params.model), + }), + options, + params.timeoutMs, + `${params.progressLabel}: file-read probe retry`, + ); + if (retry.stopReason === "error") { + throw new Error( + retry.errorMessage || "file-read probe retry returned error with no message", + ); + } + fileText = extractAssistantText(retry); + } if (!fileProbeTextMatches(fileText)) { throw new Error(`file-read probe did not return ${LIVE_MODEL_FILE_PROBE_TOKEN}: ${fileText}`); }