From aef0bb49158225a598e48805c5cf1881814ee3fc Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Fri, 24 Apr 2026 16:27:41 +0100 Subject: [PATCH] test(deepseek): add live v4 model coverage --- docs/help/testing-live.md | 10 +- extensions/deepseek/index.ts | 4 + src/agents/live-model-filter.ts | 2 + src/agents/model-compat.test.ts | 33 ++++++ src/agents/models.profiles.live.test.ts | 128 +++++++++++++++++++++++- 5 files changed, 168 insertions(+), 9 deletions(-) diff --git a/docs/help/testing-live.md b/docs/help/testing-live.md index 51ec6f89917..f8fb22f8c27 100644 --- a/docs/help/testing-live.md +++ b/docs/help/testing-live.md @@ -48,7 +48,7 @@ Live tests are split into two layers so we can isolate failures: - `pnpm test:live` (or `OPENCLAW_LIVE_TEST=1` if invoking Vitest directly) - Set `OPENCLAW_LIVE_MODELS=modern` (or `all`, alias for modern) to actually run this suite; otherwise it skips to keep `pnpm test:live` focused on gateway smoke - How to select models: - - `OPENCLAW_LIVE_MODELS=modern` to run the modern allowlist (Opus/Sonnet 4.6+, GPT-5.2 + Codex, Gemini 3, GLM 4.7, MiniMax M2.7, Grok 4) + - `OPENCLAW_LIVE_MODELS=modern` to run the modern allowlist (Opus/Sonnet 4.6+, GPT-5.2 + Codex, Gemini 3, DeepSeek V4, GLM 4.7, MiniMax M2.7, Grok 4) - `OPENCLAW_LIVE_MODELS=all` is an alias for the modern allowlist - or `OPENCLAW_LIVE_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-6,..."` (comma allowlist) - Modern/all sweeps default to a curated high-signal cap; set `OPENCLAW_LIVE_MAX_MODELS=0` for an exhaustive modern sweep or a positive number for a smaller cap. 
@@ -82,7 +82,7 @@ Live tests are split into two layers so we can isolate failures: - How to enable: - `pnpm test:live` (or `OPENCLAW_LIVE_TEST=1` if invoking Vitest directly) - How to select models: - - Default: modern allowlist (Opus/Sonnet 4.6+, GPT-5.2 + Codex, Gemini 3, GLM 4.7, MiniMax M2.7, Grok 4) + - Default: modern allowlist (Opus/Sonnet 4.6+, GPT-5.2 + Codex, Gemini 3, DeepSeek V4, GLM 4.7, MiniMax M2.7, Grok 4) - `OPENCLAW_LIVE_GATEWAY_MODELS=all` is an alias for the modern allowlist - Or set `OPENCLAW_LIVE_GATEWAY_MODELS="provider/model"` (or comma list) to narrow - Modern/all gateway sweeps default to a curated high-signal cap; set `OPENCLAW_LIVE_GATEWAY_MAX_MODELS=0` for an exhaustive modern sweep or a positive number for a smaller cap. @@ -287,7 +287,7 @@ Narrow, explicit allowlists are fastest and least flaky: - `OPENCLAW_LIVE_GATEWAY_MODELS="openai/gpt-5.2" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` - Tool calling across several providers: - - `OPENCLAW_LIVE_GATEWAY_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-6,google/gemini-3-flash-preview,zai/glm-4.7,minimax/MiniMax-M2.7" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` + - `OPENCLAW_LIVE_GATEWAY_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-6,google/gemini-3-flash-preview,deepseek/deepseek-v4-flash,zai/glm-4.7,minimax/MiniMax-M2.7" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` - Google focus (Gemini API key + Antigravity): - Gemini (API key): `OPENCLAW_LIVE_GATEWAY_MODELS="google/gemini-3-flash-preview" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` @@ -315,11 +315,12 @@ This is the “common models” run we expect to keep working: - Anthropic: `anthropic/claude-opus-4-6` (or `anthropic/claude-sonnet-4-6`) - Google (Gemini API): `google/gemini-3.1-pro-preview` and `google/gemini-3-flash-preview` (avoid older Gemini 2.x models) - Google (Antigravity): 
`google-antigravity/claude-opus-4-6-thinking` and `google-antigravity/gemini-3-flash` +- DeepSeek: `deepseek/deepseek-v4-flash` and `deepseek/deepseek-v4-pro` - Z.AI (GLM): `zai/glm-4.7` - MiniMax: `minimax/MiniMax-M2.7` Run gateway smoke with tools + image: -`OPENCLAW_LIVE_GATEWAY_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-6,google/gemini-3.1-pro-preview,google/gemini-3-flash-preview,google-antigravity/claude-opus-4-6-thinking,google-antigravity/gemini-3-flash,zai/glm-4.7,minimax/MiniMax-M2.7" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` +`OPENCLAW_LIVE_GATEWAY_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-6,google/gemini-3.1-pro-preview,google/gemini-3-flash-preview,google-antigravity/claude-opus-4-6-thinking,google-antigravity/gemini-3-flash,deepseek/deepseek-v4-flash,zai/glm-4.7,minimax/MiniMax-M2.7" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` ### Baseline: tool calling (Read + optional Exec) @@ -328,6 +329,7 @@ Pick at least one per provider family: - OpenAI: `openai/gpt-5.2` - Anthropic: `anthropic/claude-opus-4-6` (or `anthropic/claude-sonnet-4-6`) - Google: `google/gemini-3-flash-preview` (or `google/gemini-3.1-pro-preview`) +- DeepSeek: `deepseek/deepseek-v4-flash` - Z.AI (GLM): `zai/glm-4.7` - MiniMax: `minimax/MiniMax-M2.7` diff --git a/extensions/deepseek/index.ts b/extensions/deepseek/index.ts index e8923781752..ba9e7007803 100644 --- a/extensions/deepseek/index.ts +++ b/extensions/deepseek/index.ts @@ -46,5 +46,9 @@ export default defineSingleProviderPluginEntry({ /\bdeepseek\b.*(?:input.*too long|context.*exceed)/i.test(errorMessage), ...buildProviderReplayFamilyHooks({ family: "openai-compatible" }), wrapStreamFn: (ctx) => createDeepSeekV4ThinkingWrapper(ctx.streamFn, ctx.thinkingLevel), + isModernModelRef: ({ modelId }) => { + const lower = modelId.toLowerCase(); + return lower === "deepseek-v4-flash" || lower === "deepseek-v4-pro"; + }, }, }); diff --git 
a/src/agents/live-model-filter.ts b/src/agents/live-model-filter.ts index c61774b5665..7e4921bec1f 100644 --- a/src/agents/live-model-filter.ts +++ b/src/agents/live-model-filter.ts @@ -15,6 +15,8 @@ const HIGH_SIGNAL_LIVE_MODEL_PRIORITY = [ "anthropic/claude-sonnet-4-6", "google/gemini-3.1-pro-preview", "google/gemini-3-flash-preview", + "deepseek/deepseek-v4-flash", + "deepseek/deepseek-v4-pro", "minimax/minimax-m2.7", "openai/gpt-5.2", "openai-codex/gpt-5.2", diff --git a/src/agents/model-compat.test.ts b/src/agents/model-compat.test.ts index a81b8641f61..4c08f56117a 100644 --- a/src/agents/model-compat.test.ts +++ b/src/agents/model-compat.test.ts @@ -519,6 +519,16 @@ describe("isHighSignalLiveModelRef", () => { true, ); }); + + it("keeps DeepSeek V4 models in the default live matrix when the provider marks them modern", () => { + providerRuntimeMocks.resolveProviderModernModelRef.mockImplementation(({ provider, context }) => + provider === "deepseek" && context.modelId.startsWith("deepseek-v4") ? 
true : undefined, + ); + + expect(isHighSignalLiveModelRef({ provider: "deepseek", id: "deepseek-v4-flash" })).toBe(true); + expect(isHighSignalLiveModelRef({ provider: "deepseek", id: "deepseek-v4-pro" })).toBe(true); + expect(isHighSignalLiveModelRef({ provider: "deepseek", id: "deepseek-chat" })).toBe(false); + }); }); describe("selectHighSignalLiveItems", () => { @@ -528,6 +538,7 @@ describe("selectHighSignalLiveItems", () => { { provider: "anthropic", id: "claude-opus-4-6" }, { provider: "google", id: "gemini-3.1-pro-preview" }, { provider: "google", id: "gemini-3-flash-preview" }, + { provider: "deepseek", id: "deepseek-v4-flash" }, { provider: "openai", id: "gpt-5.2" }, { provider: "opencode", id: "big-pickle" }, ]; @@ -546,6 +557,28 @@ describe("selectHighSignalLiveItems", () => { { provider: "google", id: "gemini-3-flash-preview" }, ]); }); + + it("prioritizes DeepSeek V4 before later fallback providers", () => { + const items = [ + { provider: "openai", id: "gpt-5.2" }, + { provider: "deepseek", id: "deepseek-v4-flash" }, + { provider: "deepseek", id: "deepseek-v4-pro" }, + { provider: "minimax", id: "minimax-m2.7" }, + ]; + + expect( + selectHighSignalLiveItems( + items, + 3, + (item) => item, + (item) => item.provider, + ), + ).toEqual([ + { provider: "deepseek", id: "deepseek-v4-flash" }, + { provider: "deepseek", id: "deepseek-v4-pro" }, + { provider: "minimax", id: "minimax-m2.7" }, + ]); + }); }); describe("resolveHighSignalLiveModelLimit", () => { diff --git a/src/agents/models.profiles.live.test.ts b/src/agents/models.profiles.live.test.ts index 4b9e9bb1594..ecaba327051 100644 --- a/src/agents/models.profiles.live.test.ts +++ b/src/agents/models.profiles.live.test.ts @@ -58,8 +58,12 @@ const LIVE_TEST_TIMEOUT_MS = Math.max( toInt(process.env.OPENCLAW_LIVE_TEST_TIMEOUT_MS, 60 * 60 * 1000), ); const DEFAULT_LIVE_MODEL_CONCURRENCY = 20; -const LIVE_MODEL_CONCURRENCY = resolveLiveModelConcurrency(); -const LIVE_MODELS_JSON_TIMEOUT_MS = 
resolveLiveModelsJsonTimeoutMs(); +const LIVE_MODEL_CONCURRENCY = resolveLiveModelConcurrency( + process.env.OPENCLAW_LIVE_MODEL_CONCURRENCY, +); +const LIVE_MODELS_JSON_TIMEOUT_MS = resolveLiveModelsJsonTimeoutMs( + process.env.OPENCLAW_LIVE_MODELS_JSON_TIMEOUT_MS, +); const LIVE_FILE_PROBE_ENABLED = isLiveModelProbeEnabled(process.env, LIVE_MODEL_FILE_PROBE_ENV); const LIVE_IMAGE_PROBE_ENABLED = isLiveModelProbeEnabled(process.env, LIVE_MODEL_IMAGE_PROBE_ENV); @@ -318,13 +322,13 @@ function toInt(value: string | undefined, fallback: number): number { return Number.isFinite(parsed) ? parsed : fallback; } -function resolveLiveModelConcurrency(raw = process.env.OPENCLAW_LIVE_MODEL_CONCURRENCY): number { +function resolveLiveModelConcurrency(raw?: string): number { return Math.max(1, toInt(raw, DEFAULT_LIVE_MODEL_CONCURRENCY)); } describe("resolveLiveModelConcurrency", () => { it("defaults direct-model probes to 20-way concurrency", () => { - expect(resolveLiveModelConcurrency(undefined)).toBe(20); + expect(resolveLiveModelConcurrency()).toBe(20); }); it("accepts explicit concurrency overrides", () => { @@ -334,7 +338,7 @@ }); function resolveLiveModelsJsonTimeoutMs( - modelsJsonTimeoutRaw = process.env.OPENCLAW_LIVE_MODELS_JSON_TIMEOUT_MS, + modelsJsonTimeoutRaw?: string, setupTimeoutMs = LIVE_SETUP_TIMEOUT_MS, ): number { return Math.max(setupTimeoutMs, toInt(modelsJsonTimeoutRaw, 120_000)); @@ -491,6 +495,102 @@ async function completeOkWithRetry(params: { return await runOnce(256); } +function isDeepSeekV4Model(model: Pick<Model, "id" | "provider">): boolean { + return ( + model.provider === "deepseek" && + (model.id === "deepseek-v4-flash" || model.id === "deepseek-v4-pro") + ); +} + +async function runDeepSeekV4ReplayRegression(params: { + model: Model; + apiKey: string; + timeoutMs: number; + progressLabel: string; +}) { + const noopTool = { + name: "noop", + description: "Return ok.", + parameters: Type.Object({}, { 
additionalProperties: false }), + }; + let firstUser = { + role: "user" as const, + content: "Call the tool `noop` with {}. Do not write any other text.", + timestamp: Date.now(), + }; + let first = await completeSimpleWithTimeout( + params.model, + { messages: [firstUser], tools: [noopTool] }, + { + apiKey: params.apiKey, + reasoning: resolveTestReasoning(params.model), + maxTokens: 256, + }, + params.timeoutMs, + `${params.progressLabel}: DeepSeek V4 replay first call`, + ); + let toolCall = first.content.find((block) => block.type === "toolCall"); + + for (let i = 0; i < 2 && !toolCall; i += 1) { + firstUser = { + role: "user" as const, + content: "Call the tool `noop` with {}. IMPORTANT: respond with the tool call.", + timestamp: Date.now(), + }; + first = await completeSimpleWithTimeout( + params.model, + { messages: [firstUser], tools: [noopTool] }, + { + apiKey: params.apiKey, + reasoning: resolveTestReasoning(params.model), + maxTokens: 256, + }, + params.timeoutMs, + `${params.progressLabel}: DeepSeek V4 replay retry ${i + 1}`, + ); + toolCall = first.content.find((block) => block.type === "toolCall"); + } + + expect(toolCall).toBeTruthy(); + if (!toolCall || toolCall.type !== "toolCall") { + throw new Error("expected DeepSeek V4 tool call"); + } + + const second = await completeSimpleWithTimeout( + params.model, + { + messages: [ + firstUser, + first, + { + role: "toolResult", + toolCallId: toolCall.id, + toolName: "noop", + content: [{ type: "text", text: "ok" }], + isError: false, + timestamp: Date.now(), + }, + { + role: "user", + content: "Reply with the word ok.", + timestamp: Date.now(), + }, + ], + }, + { + apiKey: params.apiKey, + reasoning: resolveTestReasoning(params.model), + maxTokens: 256, + }, + params.timeoutMs, + `${params.progressLabel}: DeepSeek V4 replay followup`, + ); + if (second.stopReason === "error") { + throw new Error(second.errorMessage || "DeepSeek V4 replay followup returned error"); + } + 
expect(extractAssistantText(second).length).toBeGreaterThan(0); +} + async function runExtraTurnProbes(params: { model: Model; apiKey: string; @@ -849,6 +949,24 @@ describeLive("live models (profile keys)", () => { break; } + if (isDeepSeekV4Model(model)) { + logProgress(`${progressLabel}: DeepSeek V4 replay regression`); + await runDeepSeekV4ReplayRegression({ + model, + apiKey, + timeoutMs: perModelTimeoutMs, + progressLabel, + }); + await runExtraTurnProbes({ + model, + apiKey, + timeoutMs: perModelTimeoutMs, + progressLabel, + }); + logProgress(`${progressLabel}: done`); + break; + } + logProgress(`${progressLabel}: prompt`); const ok = await completeOkWithRetry({ model,