test(deepseek): add live v4 model coverage

This commit is contained in:
Peter Steinberger
2026-04-24 16:27:41 +01:00
parent 6b618f0635
commit aef0bb4915
5 changed files with 168 additions and 9 deletions

View File

@@ -48,7 +48,7 @@ Live tests are split into two layers so we can isolate failures:
- `pnpm test:live` (or `OPENCLAW_LIVE_TEST=1` if invoking Vitest directly)
- Set `OPENCLAW_LIVE_MODELS=modern` (or `all`, alias for modern) to actually run this suite; otherwise it skips to keep `pnpm test:live` focused on gateway smoke
- How to select models:
- `OPENCLAW_LIVE_MODELS=modern` to run the modern allowlist (Opus/Sonnet 4.6+, GPT-5.2 + Codex, Gemini 3, GLM 4.7, MiniMax M2.7, Grok 4)
- `OPENCLAW_LIVE_MODELS=modern` to run the modern allowlist (Opus/Sonnet 4.6+, GPT-5.2 + Codex, Gemini 3, DeepSeek V4, GLM 4.7, MiniMax M2.7, Grok 4)
- `OPENCLAW_LIVE_MODELS=all` is an alias for the modern allowlist
- or `OPENCLAW_LIVE_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-6,..."` (comma allowlist)
- Modern/all sweeps default to a curated high-signal cap; set `OPENCLAW_LIVE_MAX_MODELS=0` for an exhaustive modern sweep or a positive number for a smaller cap.
@@ -82,7 +82,7 @@ Live tests are split into two layers so we can isolate failures:
- How to enable:
- `pnpm test:live` (or `OPENCLAW_LIVE_TEST=1` if invoking Vitest directly)
- How to select models:
- Default: modern allowlist (Opus/Sonnet 4.6+, GPT-5.2 + Codex, Gemini 3, GLM 4.7, MiniMax M2.7, Grok 4)
- Default: modern allowlist (Opus/Sonnet 4.6+, GPT-5.2 + Codex, Gemini 3, DeepSeek V4, GLM 4.7, MiniMax M2.7, Grok 4)
- `OPENCLAW_LIVE_GATEWAY_MODELS=all` is an alias for the modern allowlist
- Or set `OPENCLAW_LIVE_GATEWAY_MODELS="provider/model"` (or comma list) to narrow
- Modern/all gateway sweeps default to a curated high-signal cap; set `OPENCLAW_LIVE_GATEWAY_MAX_MODELS=0` for an exhaustive modern sweep or a positive number for a smaller cap.
@@ -287,7 +287,7 @@ Narrow, explicit allowlists are fastest and least flaky:
- `OPENCLAW_LIVE_GATEWAY_MODELS="openai/gpt-5.2" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
- Tool calling across several providers:
- `OPENCLAW_LIVE_GATEWAY_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-6,google/gemini-3-flash-preview,zai/glm-4.7,minimax/MiniMax-M2.7" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
- `OPENCLAW_LIVE_GATEWAY_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-6,google/gemini-3-flash-preview,deepseek/deepseek-v4-flash,zai/glm-4.7,minimax/MiniMax-M2.7" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
- Google focus (Gemini API key + Antigravity):
- Gemini (API key): `OPENCLAW_LIVE_GATEWAY_MODELS="google/gemini-3-flash-preview" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
@@ -315,11 +315,12 @@ This is the “common models” run we expect to keep working:
- Anthropic: `anthropic/claude-opus-4-6` (or `anthropic/claude-sonnet-4-6`)
- Google (Gemini API): `google/gemini-3.1-pro-preview` and `google/gemini-3-flash-preview` (avoid older Gemini 2.x models)
- Google (Antigravity): `google-antigravity/claude-opus-4-6-thinking` and `google-antigravity/gemini-3-flash`
- DeepSeek: `deepseek/deepseek-v4-flash` and `deepseek/deepseek-v4-pro`
- Z.AI (GLM): `zai/glm-4.7`
- MiniMax: `minimax/MiniMax-M2.7`
Run gateway smoke with tools + image:
`OPENCLAW_LIVE_GATEWAY_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-6,google/gemini-3.1-pro-preview,google/gemini-3-flash-preview,google-antigravity/claude-opus-4-6-thinking,google-antigravity/gemini-3-flash,zai/glm-4.7,minimax/MiniMax-M2.7" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
`OPENCLAW_LIVE_GATEWAY_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-6,google/gemini-3.1-pro-preview,google/gemini-3-flash-preview,google-antigravity/claude-opus-4-6-thinking,google-antigravity/gemini-3-flash,deepseek/deepseek-v4-flash,zai/glm-4.7,minimax/MiniMax-M2.7" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
### Baseline: tool calling (Read + optional Exec)
@@ -328,6 +329,7 @@ Pick at least one per provider family:
- OpenAI: `openai/gpt-5.2`
- Anthropic: `anthropic/claude-opus-4-6` (or `anthropic/claude-sonnet-4-6`)
- Google: `google/gemini-3-flash-preview` (or `google/gemini-3.1-pro-preview`)
- DeepSeek: `deepseek/deepseek-v4-flash`
- Z.AI (GLM): `zai/glm-4.7`
- MiniMax: `minimax/MiniMax-M2.7`

View File

@@ -46,5 +46,9 @@ export default defineSingleProviderPluginEntry({
/\bdeepseek\b.*(?:input.*too long|context.*exceed)/i.test(errorMessage),
...buildProviderReplayFamilyHooks({ family: "openai-compatible" }),
wrapStreamFn: (ctx) => createDeepSeekV4ThinkingWrapper(ctx.streamFn, ctx.thinkingLevel),
// Marks which DeepSeek model ids count as "modern" for live-test model
// selection; only the two V4 ids qualify (comparison is case-insensitive).
isModernModelRef: ({ modelId }) => {
  const lower = modelId.toLowerCase();
  return lower === "deepseek-v4-flash" || lower === "deepseek-v4-pro";
},
},
});

View File

@@ -15,6 +15,8 @@ const HIGH_SIGNAL_LIVE_MODEL_PRIORITY = [
"anthropic/claude-sonnet-4-6",
"google/gemini-3.1-pro-preview",
"google/gemini-3-flash-preview",
"deepseek/deepseek-v4-flash",
"deepseek/deepseek-v4-pro",
"minimax/minimax-m2.7",
"openai/gpt-5.2",
"openai-codex/gpt-5.2",

View File

@@ -519,6 +519,16 @@ describe("isHighSignalLiveModelRef", () => {
true,
);
});
// Verifies that the high-signal filter consults the provider's modern-model
// resolver: when the mocked resolver marks deepseek-v4* ids as modern they
// stay in the default live matrix, while older ids (deepseek-chat) drop out.
it("keeps DeepSeek V4 models in the default live matrix when the provider marks them modern", () => {
  // Mocked resolver: DeepSeek V4 ids => true, everything else => undefined
  // (i.e. no provider opinion, fall back to default behavior).
  providerRuntimeMocks.resolveProviderModernModelRef.mockImplementation(({ provider, context }) =>
    provider === "deepseek" && context.modelId.startsWith("deepseek-v4") ? true : undefined,
  );
  expect(isHighSignalLiveModelRef({ provider: "deepseek", id: "deepseek-v4-flash" })).toBe(true);
  expect(isHighSignalLiveModelRef({ provider: "deepseek", id: "deepseek-v4-pro" })).toBe(true);
  expect(isHighSignalLiveModelRef({ provider: "deepseek", id: "deepseek-chat" })).toBe(false);
});
});
describe("selectHighSignalLiveItems", () => {
@@ -528,6 +538,7 @@ describe("selectHighSignalLiveItems", () => {
{ provider: "anthropic", id: "claude-opus-4-6" },
{ provider: "google", id: "gemini-3.1-pro-preview" },
{ provider: "google", id: "gemini-3-flash-preview" },
{ provider: "deepseek", id: "deepseek-v4-flash" },
{ provider: "openai", id: "gpt-5.2" },
{ provider: "opencode", id: "big-pickle" },
];
@@ -546,6 +557,28 @@ describe("selectHighSignalLiveItems", () => {
{ provider: "google", id: "gemini-3-flash-preview" },
]);
});
// Verifies the curated priority ordering: with a cap of 3, both DeepSeek V4
// entries are selected ahead of MiniMax, and the OpenAI entry is the one
// dropped — matching HIGH_SIGNAL_LIVE_MODEL_PRIORITY, where openai/gpt-5.2
// ranks below minimax. NOTE(review): assumes selection order follows that
// priority list rather than input order — confirm against the implementation.
it("prioritizes DeepSeek V4 before later fallback providers", () => {
  const items = [
    { provider: "openai", id: "gpt-5.2" },
    { provider: "deepseek", id: "deepseek-v4-flash" },
    { provider: "deepseek", id: "deepseek-v4-pro" },
    { provider: "minimax", id: "minimax-m2.7" },
  ];
  expect(
    selectHighSignalLiveItems(
      items,
      3, // cap: forces one of the four candidates to be dropped
      (item) => item,
      (item) => item.provider,
    ),
  ).toEqual([
    { provider: "deepseek", id: "deepseek-v4-flash" },
    { provider: "deepseek", id: "deepseek-v4-pro" },
    { provider: "minimax", id: "minimax-m2.7" },
  ]);
});
});
describe("resolveHighSignalLiveModelLimit", () => {

View File

@@ -58,8 +58,12 @@ const LIVE_TEST_TIMEOUT_MS = Math.max(
toInt(process.env.OPENCLAW_LIVE_TEST_TIMEOUT_MS, 60 * 60 * 1000),
);
const DEFAULT_LIVE_MODEL_CONCURRENCY = 20;
const LIVE_MODEL_CONCURRENCY = resolveLiveModelConcurrency();
const LIVE_MODELS_JSON_TIMEOUT_MS = resolveLiveModelsJsonTimeoutMs();
const LIVE_MODEL_CONCURRENCY = resolveLiveModelConcurrency(
process.env.OPENCLAW_LIVE_MODEL_CONCURRENCY,
);
const LIVE_MODELS_JSON_TIMEOUT_MS = resolveLiveModelsJsonTimeoutMs(
process.env.OPENCLAW_LIVE_MODELS_JSON_TIMEOUT_MS,
);
const LIVE_FILE_PROBE_ENABLED = isLiveModelProbeEnabled(process.env, LIVE_MODEL_FILE_PROBE_ENV);
const LIVE_IMAGE_PROBE_ENABLED = isLiveModelProbeEnabled(process.env, LIVE_MODEL_IMAGE_PROBE_ENV);
@@ -318,13 +322,13 @@ function toInt(value: string | undefined, fallback: number): number {
return Number.isFinite(parsed) ? parsed : fallback;
}
function resolveLiveModelConcurrency(raw = process.env.OPENCLAW_LIVE_MODEL_CONCURRENCY): number {
/**
 * Resolves how many direct-model live probes may run in parallel.
 *
 * @param raw - Optional override string (callers pass
 *   `OPENCLAW_LIVE_MODEL_CONCURRENCY`); non-numeric values fall back to the
 *   default via `toInt`.
 * @returns The parsed concurrency, clamped to a minimum of 1.
 */
function resolveLiveModelConcurrency(raw?: string): number {
  const requested = toInt(raw, DEFAULT_LIVE_MODEL_CONCURRENCY);
  return requested < 1 ? 1 : requested;
}
describe("resolveLiveModelConcurrency", () => {
it("defaults direct-model probes to 20-way concurrency", () => {
expect(resolveLiveModelConcurrency(undefined)).toBe(20);
expect(resolveLiveModelConcurrency()).toBe(20);
});
it("accepts explicit concurrency overrides", () => {
@@ -334,7 +338,7 @@ describe("resolveLiveModelConcurrency", () => {
});
function resolveLiveModelsJsonTimeoutMs(
modelsJsonTimeoutRaw = process.env.OPENCLAW_LIVE_MODELS_JSON_TIMEOUT_MS,
modelsJsonTimeoutRaw?: string,
setupTimeoutMs = LIVE_SETUP_TIMEOUT_MS,
): number {
return Math.max(setupTimeoutMs, toInt(modelsJsonTimeoutRaw, 120_000));
@@ -491,6 +495,102 @@ async function completeOkWithRetry(params: {
return await runOnce(256);
}
/**
 * Narrow check for the two DeepSeek V4 model ids that receive the dedicated
 * replay-regression probe in the live suite.
 *
 * @param model - Any object exposing `provider` and `id`.
 * @returns true only for `deepseek/deepseek-v4-flash` and
 *   `deepseek/deepseek-v4-pro` (exact, case-sensitive match).
 */
function isDeepSeekV4Model(model: Pick<Model<Api>, "id" | "provider">): boolean {
  if (model.provider !== "deepseek") {
    return false;
  }
  return ["deepseek-v4-flash", "deepseek-v4-pro"].includes(model.id);
}
/**
 * Replay-regression probe for DeepSeek V4 models.
 *
 * Drives a two-turn conversation against the live model: (1) coax the model
 * into calling a no-op tool, retrying up to two times with a firmer prompt if
 * it answers with text instead; (2) replay the full transcript — user prompt,
 * assistant tool call, synthetic tool result, follow-up user message — and
 * assert the follow-up turn yields assistant text rather than an error.
 *
 * Throws (failing the enclosing test) when no tool call is produced after the
 * retries or when the follow-up completion reports `stopReason === "error"`.
 */
async function runDeepSeekV4ReplayRegression(params: {
  model: Model<Api>;
  apiKey: string;
  timeoutMs: number;
  progressLabel: string;
}) {
  // Minimal zero-argument tool: the probe only cares that the model emits a
  // tool call, not what the tool does.
  const noopTool = {
    name: "noop",
    description: "Return ok.",
    parameters: Type.Object({}, { additionalProperties: false }),
  };
  // `let`, not `const`: the retry loop below replaces this message, and the
  // replayed transcript must reference whichever prompt actually succeeded.
  let firstUser = {
    role: "user" as const,
    content: "Call the tool `noop` with {}. Do not write any other text.",
    timestamp: Date.now(),
  };
  let first = await completeSimpleWithTimeout(
    params.model,
    { messages: [firstUser], tools: [noopTool] },
    {
      apiKey: params.apiKey,
      reasoning: resolveTestReasoning(params.model),
      maxTokens: 256,
    },
    params.timeoutMs,
    `${params.progressLabel}: DeepSeek V4 replay first call`,
  );
  let toolCall = first.content.find((block) => block.type === "toolCall");
  // Up to two retries with a more insistent prompt when the model answered
  // in prose instead of emitting the tool call.
  for (let i = 0; i < 2 && !toolCall; i += 1) {
    firstUser = {
      role: "user" as const,
      content: "Call the tool `noop` with {}. IMPORTANT: respond with the tool call.",
      timestamp: Date.now(),
    };
    first = await completeSimpleWithTimeout(
      params.model,
      { messages: [firstUser], tools: [noopTool] },
      {
        apiKey: params.apiKey,
        reasoning: resolveTestReasoning(params.model),
        maxTokens: 256,
      },
      params.timeoutMs,
      `${params.progressLabel}: DeepSeek V4 replay retry ${i + 1}`,
    );
    toolCall = first.content.find((block) => block.type === "toolCall");
  }
  expect(toolCall).toBeTruthy();
  // Redundant with the expect above at runtime, but narrows `toolCall` for
  // the type checker so `.id` is safe below.
  if (!toolCall || toolCall.type !== "toolCall") {
    throw new Error("expected DeepSeek V4 tool call");
  }
  // Replay the whole transcript (prompt, assistant turn, tool result) plus a
  // trivial follow-up; this is the path the regression guards.
  const second = await completeSimpleWithTimeout(
    params.model,
    {
      messages: [
        firstUser,
        first,
        {
          role: "toolResult",
          toolCallId: toolCall.id,
          toolName: "noop",
          content: [{ type: "text", text: "ok" }],
          isError: false,
          timestamp: Date.now(),
        },
        {
          role: "user",
          content: "Reply with the word ok.",
          timestamp: Date.now(),
        },
      ],
    },
    {
      apiKey: params.apiKey,
      reasoning: resolveTestReasoning(params.model),
      maxTokens: 256,
    },
    params.timeoutMs,
    `${params.progressLabel}: DeepSeek V4 replay followup`,
  );
  if (second.stopReason === "error") {
    throw new Error(second.errorMessage || "DeepSeek V4 replay followup returned error");
  }
  // Any non-empty assistant text counts as success; exact wording is not
  // asserted to keep the probe robust across model phrasing.
  expect(extractAssistantText(second).length).toBeGreaterThan(0);
}
async function runExtraTurnProbes(params: {
model: Model<Api>;
apiKey: string;
@@ -849,6 +949,24 @@ describeLive("live models (profile keys)", () => {
break;
}
if (isDeepSeekV4Model(model)) {
logProgress(`${progressLabel}: DeepSeek V4 replay regression`);
await runDeepSeekV4ReplayRegression({
model,
apiKey,
timeoutMs: perModelTimeoutMs,
progressLabel,
});
await runExtraTurnProbes({
model,
apiKey,
timeoutMs: perModelTimeoutMs,
progressLabel,
});
logProgress(`${progressLabel}: done`);
break;
}
logProgress(`${progressLabel}: prompt`);
const ok = await completeOkWithRetry({
model,