test(deepseek): add live v4 model coverage
@@ -48,7 +48,7 @@ Live tests are split into two layers so we can isolate failures:
   - `pnpm test:live` (or `OPENCLAW_LIVE_TEST=1` if invoking Vitest directly)
   - Set `OPENCLAW_LIVE_MODELS=modern` (or `all`, alias for modern) to actually run this suite; otherwise it skips to keep `pnpm test:live` focused on gateway smoke
 - How to select models:
-  - `OPENCLAW_LIVE_MODELS=modern` to run the modern allowlist (Opus/Sonnet 4.6+, GPT-5.2 + Codex, Gemini 3, GLM 4.7, MiniMax M2.7, Grok 4)
+  - `OPENCLAW_LIVE_MODELS=modern` to run the modern allowlist (Opus/Sonnet 4.6+, GPT-5.2 + Codex, Gemini 3, DeepSeek V4, GLM 4.7, MiniMax M2.7, Grok 4)
   - `OPENCLAW_LIVE_MODELS=all` is an alias for the modern allowlist
   - or `OPENCLAW_LIVE_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-6,..."` (comma allowlist)
   - Modern/all sweeps default to a curated high-signal cap; set `OPENCLAW_LIVE_MAX_MODELS=0` for an exhaustive modern sweep or a positive number for a smaller cap.
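As background for the selection knobs documented in the hunk above, here is a minimal sketch of how a value like `OPENCLAW_LIVE_MODELS` can be parsed. The helper name `parseLiveModelAllowlist` and the return type are illustrative assumptions, not the repository's implementation; only the `modern`/`all` aliasing, the comma allowlist, and the skip-when-unset behavior come from the doc.

```ts
// Hypothetical sketch, not the repository's code. Grounded facts from the doc:
// unset -> suite skips; "modern"/"all" -> modern allowlist; otherwise a
// comma-separated list of provider/model refs.
type LiveModelSelection =
  | { kind: "modern" }
  | { kind: "allowlist"; refs: string[] };

function parseLiveModelAllowlist(raw: string | undefined): LiveModelSelection | undefined {
  if (!raw) return undefined; // unset: the live suite skips itself
  const value = raw.trim();
  if (value === "modern" || value === "all") return { kind: "modern" }; // "all" is an alias
  const refs = value
    .split(",")
    .map((ref) => ref.trim())
    .filter((ref) => ref.length > 0);
  return refs.length > 0 ? { kind: "allowlist", refs } : undefined;
}

// parseLiveModelAllowlist("openai/gpt-5.2, anthropic/claude-opus-4-6")
// -> { kind: "allowlist", refs: ["openai/gpt-5.2", "anthropic/claude-opus-4-6"] }
```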
@@ -82,7 +82,7 @@ Live tests are split into two layers so we can isolate failures:
 - How to enable:
   - `pnpm test:live` (or `OPENCLAW_LIVE_TEST=1` if invoking Vitest directly)
 - How to select models:
-  - Default: modern allowlist (Opus/Sonnet 4.6+, GPT-5.2 + Codex, Gemini 3, GLM 4.7, MiniMax M2.7, Grok 4)
+  - Default: modern allowlist (Opus/Sonnet 4.6+, GPT-5.2 + Codex, Gemini 3, DeepSeek V4, GLM 4.7, MiniMax M2.7, Grok 4)
   - `OPENCLAW_LIVE_GATEWAY_MODELS=all` is an alias for the modern allowlist
   - Or set `OPENCLAW_LIVE_GATEWAY_MODELS="provider/model"` (or comma list) to narrow
   - Modern/all gateway sweeps default to a curated high-signal cap; set `OPENCLAW_LIVE_GATEWAY_MAX_MODELS=0` for an exhaustive modern sweep or a positive number for a smaller cap.
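Both layers document the same cap semantics for `OPENCLAW_LIVE_MAX_MODELS` / `OPENCLAW_LIVE_GATEWAY_MAX_MODELS`: unset means the curated high-signal cap, `0` means an exhaustive modern sweep, and a positive number is an explicit cap. Below is a sketch of that contract, assuming a placeholder `DEFAULT_CAP`; the actual default and the real resolver (`resolveHighSignalLiveModelLimit`, only named later in this commit) are not shown in the diff.

```ts
// Sketch of the documented cap contract; DEFAULT_CAP is an assumed placeholder,
// not the repository's curated value.
const DEFAULT_CAP = 8;

function resolveModelCap(raw: string | undefined): number | undefined {
  if (raw === undefined || raw.trim() === "") return DEFAULT_CAP; // unset: curated cap
  const parsed = Number.parseInt(raw, 10);
  if (!Number.isFinite(parsed) || parsed < 0) return DEFAULT_CAP; // junk input: fall back
  return parsed === 0 ? undefined : parsed; // 0: uncapped sweep; positive: explicit cap
}
```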
@@ -287,7 +287,7 @@ Narrow, explicit allowlists are fastest and least flaky:
 - `OPENCLAW_LIVE_GATEWAY_MODELS="openai/gpt-5.2" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`

 - Tool calling across several providers:
-  - `OPENCLAW_LIVE_GATEWAY_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-6,google/gemini-3-flash-preview,zai/glm-4.7,minimax/MiniMax-M2.7" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
+  - `OPENCLAW_LIVE_GATEWAY_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-6,google/gemini-3-flash-preview,deepseek/deepseek-v4-flash,zai/glm-4.7,minimax/MiniMax-M2.7" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`

 - Google focus (Gemini API key + Antigravity):
   - Gemini (API key): `OPENCLAW_LIVE_GATEWAY_MODELS="google/gemini-3-flash-preview" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
@@ -315,11 +315,12 @@ This is the “common models” run we expect to keep working:
 - Anthropic: `anthropic/claude-opus-4-6` (or `anthropic/claude-sonnet-4-6`)
 - Google (Gemini API): `google/gemini-3.1-pro-preview` and `google/gemini-3-flash-preview` (avoid older Gemini 2.x models)
 - Google (Antigravity): `google-antigravity/claude-opus-4-6-thinking` and `google-antigravity/gemini-3-flash`
+- DeepSeek: `deepseek/deepseek-v4-flash` and `deepseek/deepseek-v4-pro`
 - Z.AI (GLM): `zai/glm-4.7`
 - MiniMax: `minimax/MiniMax-M2.7`

 Run gateway smoke with tools + image:
-`OPENCLAW_LIVE_GATEWAY_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-6,google/gemini-3.1-pro-preview,google/gemini-3-flash-preview,google-antigravity/claude-opus-4-6-thinking,google-antigravity/gemini-3-flash,zai/glm-4.7,minimax/MiniMax-M2.7" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
+`OPENCLAW_LIVE_GATEWAY_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-6,google/gemini-3.1-pro-preview,google/gemini-3-flash-preview,google-antigravity/claude-opus-4-6-thinking,google-antigravity/gemini-3-flash,deepseek/deepseek-v4-flash,zai/glm-4.7,minimax/MiniMax-M2.7" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`

 ### Baseline: tool calling (Read + optional Exec)

@@ -328,6 +329,7 @@ Pick at least one per provider family:
 - OpenAI: `openai/gpt-5.2`
 - Anthropic: `anthropic/claude-opus-4-6` (or `anthropic/claude-sonnet-4-6`)
 - Google: `google/gemini-3-flash-preview` (or `google/gemini-3.1-pro-preview`)
+- DeepSeek: `deepseek/deepseek-v4-flash`
 - Z.AI (GLM): `zai/glm-4.7`
 - MiniMax: `minimax/MiniMax-M2.7`

@@ -46,5 +46,9 @@ export default defineSingleProviderPluginEntry({
       /\bdeepseek\b.*(?:input.*too long|context.*exceed)/i.test(errorMessage),
     ...buildProviderReplayFamilyHooks({ family: "openai-compatible" }),
     wrapStreamFn: (ctx) => createDeepSeekV4ThinkingWrapper(ctx.streamFn, ctx.thinkingLevel),
+    isModernModelRef: ({ modelId }) => {
+      const lower = modelId.toLowerCase();
+      return lower === "deepseek-v4-flash" || lower === "deepseek-v4-pro";
+    },
   },
 });
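The `isModernModelRef` hook added in the hunk above is how the DeepSeek plugin marks `deepseek-v4-flash` and `deepseek-v4-pro` as modern; the live-test selector consults it through `resolveProviderModernModelRef` (seen mocked in the tests later in this commit, taking a `{ provider, context }` argument). A plausible shape for that dispatch is sketched below, assuming a plugin registry keyed by provider id; the registry and the function body are illustrative, not the actual runtime code.

```ts
// Illustrative dispatch for per-provider "modern model" checks. The hook
// signature ({ modelId }) => boolean | undefined mirrors the plugin entry
// above; the registry and lookup here are assumptions.
type ModernModelHook = (ctx: { modelId: string }) => boolean | undefined;

const providerHooks = new Map<string, { isModernModelRef?: ModernModelHook }>();

function resolveProviderModernModelRef(params: {
  provider: string;
  context: { modelId: string };
}): boolean | undefined {
  const hook = providerHooks.get(params.provider)?.isModernModelRef;
  return hook?.(params.context); // undefined = provider has no opinion; caller falls back
}
```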
@@ -15,6 +15,8 @@ const HIGH_SIGNAL_LIVE_MODEL_PRIORITY = [
   "anthropic/claude-sonnet-4-6",
   "google/gemini-3.1-pro-preview",
   "google/gemini-3-flash-preview",
+  "deepseek/deepseek-v4-flash",
+  "deepseek/deepseek-v4-pro",
   "minimax/minimax-m2.7",
   "openai/gpt-5.2",
   "openai-codex/gpt-5.2",
@@ -519,6 +519,16 @@ describe("isHighSignalLiveModelRef", () => {
       true,
     );
   });
+
+  it("keeps DeepSeek V4 models in the default live matrix when the provider marks them modern", () => {
+    providerRuntimeMocks.resolveProviderModernModelRef.mockImplementation(({ provider, context }) =>
+      provider === "deepseek" && context.modelId.startsWith("deepseek-v4") ? true : undefined,
+    );
+
+    expect(isHighSignalLiveModelRef({ provider: "deepseek", id: "deepseek-v4-flash" })).toBe(true);
+    expect(isHighSignalLiveModelRef({ provider: "deepseek", id: "deepseek-v4-pro" })).toBe(true);
+    expect(isHighSignalLiveModelRef({ provider: "deepseek", id: "deepseek-chat" })).toBe(false);
+  });
 });

 describe("selectHighSignalLiveItems", () => {
@@ -528,6 +538,7 @@ describe("selectHighSignalLiveItems", () => {
     { provider: "anthropic", id: "claude-opus-4-6" },
     { provider: "google", id: "gemini-3.1-pro-preview" },
     { provider: "google", id: "gemini-3-flash-preview" },
+    { provider: "deepseek", id: "deepseek-v4-flash" },
     { provider: "openai", id: "gpt-5.2" },
     { provider: "opencode", id: "big-pickle" },
   ];
@@ -546,6 +557,28 @@ describe("selectHighSignalLiveItems", () => {
       { provider: "google", id: "gemini-3-flash-preview" },
     ]);
   });
+
+  it("prioritizes DeepSeek V4 before later fallback providers", () => {
+    const items = [
+      { provider: "openai", id: "gpt-5.2" },
+      { provider: "deepseek", id: "deepseek-v4-flash" },
+      { provider: "deepseek", id: "deepseek-v4-pro" },
+      { provider: "minimax", id: "minimax-m2.7" },
+    ];
+
+    expect(
+      selectHighSignalLiveItems(
+        items,
+        3,
+        (item) => item,
+        (item) => item.provider,
+      ),
+    ).toEqual([
+      { provider: "deepseek", id: "deepseek-v4-flash" },
+      { provider: "deepseek", id: "deepseek-v4-pro" },
+      { provider: "minimax", id: "minimax-m2.7" },
+    ]);
+  });
 });

 describe("resolveHighSignalLiveModelLimit", () => {
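Taken together, the priority list diffed earlier and the two tests above pin down the selection contract: candidates are ranked by their position in `HIGH_SIGNAL_LIVE_MODEL_PRIORITY`, unknown refs sort last, and the first `limit` survive. A minimal sketch consistent with that contract follows; the real `selectHighSignalLiveItems` is not shown in this diff and presumably also uses the provider-key accessor for per-provider rules that this sketch omits.

```ts
// Sketch inferred from HIGH_SIGNAL_LIVE_MODEL_PRIORITY and the tests above,
// not the actual source. The priority array is the one diffed earlier.
declare const HIGH_SIGNAL_LIVE_MODEL_PRIORITY: readonly string[];

function selectHighSignalLiveItems<T>(
  items: T[],
  limit: number,
  toRef: (item: T) => { provider: string; id: string },
  toProviderKey: (item: T) => string, // unused here; hints at per-provider rules
): T[] {
  const priorityIndex = (item: T): number => {
    const ref = toRef(item);
    const idx = HIGH_SIGNAL_LIVE_MODEL_PRIORITY.indexOf(
      `${ref.provider}/${ref.id}`.toLowerCase(),
    );
    return idx === -1 ? Number.MAX_SAFE_INTEGER : idx; // unknown models sort last
  };
  return [...items].sort((a, b) => priorityIndex(a) - priorityIndex(b)).slice(0, limit);
}
```

On the test data above this yields `deepseek-v4-flash`, `deepseek-v4-pro`, `minimax-m2.7` for a cap of 3, matching the expected ordering.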
@@ -58,8 +58,12 @@ const LIVE_TEST_TIMEOUT_MS = Math.max(
   toInt(process.env.OPENCLAW_LIVE_TEST_TIMEOUT_MS, 60 * 60 * 1000),
 );
 const DEFAULT_LIVE_MODEL_CONCURRENCY = 20;
-const LIVE_MODEL_CONCURRENCY = resolveLiveModelConcurrency();
-const LIVE_MODELS_JSON_TIMEOUT_MS = resolveLiveModelsJsonTimeoutMs();
+const LIVE_MODEL_CONCURRENCY = resolveLiveModelConcurrency(
+  process.env.OPENCLAW_LIVE_MODEL_CONCURRENCY,
+);
+const LIVE_MODELS_JSON_TIMEOUT_MS = resolveLiveModelsJsonTimeoutMs(
+  process.env.OPENCLAW_LIVE_MODELS_JSON_TIMEOUT_MS,
+);
 const LIVE_FILE_PROBE_ENABLED = isLiveModelProbeEnabled(process.env, LIVE_MODEL_FILE_PROBE_ENV);
 const LIVE_IMAGE_PROBE_ENABLED = isLiveModelProbeEnabled(process.env, LIVE_MODEL_IMAGE_PROBE_ENV);

@@ -318,13 +322,13 @@ function toInt(value: string | undefined, fallback: number): number {
   return Number.isFinite(parsed) ? parsed : fallback;
 }

-function resolveLiveModelConcurrency(raw = process.env.OPENCLAW_LIVE_MODEL_CONCURRENCY): number {
+function resolveLiveModelConcurrency(raw?: string): number {
   return Math.max(1, toInt(raw, DEFAULT_LIVE_MODEL_CONCURRENCY));
 }

 describe("resolveLiveModelConcurrency", () => {
   it("defaults direct-model probes to 20-way concurrency", () => {
-    expect(resolveLiveModelConcurrency(undefined)).toBe(20);
+    expect(resolveLiveModelConcurrency()).toBe(20);
   });

   it("accepts explicit concurrency overrides", () => {
@@ -334,7 +338,7 @@ describe("resolveLiveModelConcurrency", () => {
 });

 function resolveLiveModelsJsonTimeoutMs(
-  modelsJsonTimeoutRaw = process.env.OPENCLAW_LIVE_MODELS_JSON_TIMEOUT_MS,
+  modelsJsonTimeoutRaw?: string,
   setupTimeoutMs = LIVE_SETUP_TIMEOUT_MS,
 ): number {
   return Math.max(setupTimeoutMs, toInt(modelsJsonTimeoutRaw, 120_000));
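Threading `process.env` through at the call sites keeps both resolvers pure, which is what lets the unit tests pass explicit strings instead of mutating the environment. Given the bodies shown above, the behavior is fully determined by the argument; a few illustrative calls (assuming the two helpers from the hunks above are in scope):

```ts
// Behavior implied directly by the function bodies in this diff:
resolveLiveModelConcurrency(undefined); // -> 20 (DEFAULT_LIVE_MODEL_CONCURRENCY)
resolveLiveModelConcurrency("8");       // -> 8
resolveLiveModelConcurrency("0");       // -> 1 (clamped by Math.max(1, ...))
resolveLiveModelConcurrency("oops");    // -> 20 (toInt falls back on NaN)

// The models.json timeout never drops below the setup timeout:
resolveLiveModelsJsonTimeoutMs(undefined, 30_000); // -> 120_000 (default wins)
resolveLiveModelsJsonTimeoutMs("5000", 30_000);    // -> 30_000 (floored at setup timeout)
```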
@@ -491,6 +495,102 @@ async function completeOkWithRetry(params: {
   return await runOnce(256);
 }

+function isDeepSeekV4Model(model: Pick<Model<Api>, "id" | "provider">): boolean {
+  return (
+    model.provider === "deepseek" &&
+    (model.id === "deepseek-v4-flash" || model.id === "deepseek-v4-pro")
+  );
+}
+
+async function runDeepSeekV4ReplayRegression(params: {
+  model: Model<Api>;
+  apiKey: string;
+  timeoutMs: number;
+  progressLabel: string;
+}) {
+  const noopTool = {
+    name: "noop",
+    description: "Return ok.",
+    parameters: Type.Object({}, { additionalProperties: false }),
+  };
+  let firstUser = {
+    role: "user" as const,
+    content: "Call the tool `noop` with {}. Do not write any other text.",
+    timestamp: Date.now(),
+  };
+  let first = await completeSimpleWithTimeout(
+    params.model,
+    { messages: [firstUser], tools: [noopTool] },
+    {
+      apiKey: params.apiKey,
+      reasoning: resolveTestReasoning(params.model),
+      maxTokens: 256,
+    },
+    params.timeoutMs,
+    `${params.progressLabel}: DeepSeek V4 replay first call`,
+  );
+  let toolCall = first.content.find((block) => block.type === "toolCall");
+
+  for (let i = 0; i < 2 && !toolCall; i += 1) {
+    firstUser = {
+      role: "user" as const,
+      content: "Call the tool `noop` with {}. IMPORTANT: respond with the tool call.",
+      timestamp: Date.now(),
+    };
+    first = await completeSimpleWithTimeout(
+      params.model,
+      { messages: [firstUser], tools: [noopTool] },
+      {
+        apiKey: params.apiKey,
+        reasoning: resolveTestReasoning(params.model),
+        maxTokens: 256,
+      },
+      params.timeoutMs,
+      `${params.progressLabel}: DeepSeek V4 replay retry ${i + 1}`,
+    );
+    toolCall = first.content.find((block) => block.type === "toolCall");
+  }
+
+  expect(toolCall).toBeTruthy();
+  if (!toolCall || toolCall.type !== "toolCall") {
+    throw new Error("expected DeepSeek V4 tool call");
+  }
+
+  const second = await completeSimpleWithTimeout(
+    params.model,
+    {
+      messages: [
+        firstUser,
+        first,
+        {
+          role: "toolResult",
+          toolCallId: toolCall.id,
+          toolName: "noop",
+          content: [{ type: "text", text: "ok" }],
+          isError: false,
+          timestamp: Date.now(),
+        },
+        {
+          role: "user",
+          content: "Reply with the word ok.",
+          timestamp: Date.now(),
+        },
+      ],
+    },
+    {
+      apiKey: params.apiKey,
+      reasoning: resolveTestReasoning(params.model),
+      maxTokens: 256,
+    },
+    params.timeoutMs,
+    `${params.progressLabel}: DeepSeek V4 replay followup`,
+  );
+  if (second.stopReason === "error") {
+    throw new Error(second.errorMessage || "DeepSeek V4 replay followup returned error");
+  }
+  expect(extractAssistantText(second).length).toBeGreaterThan(0);
+}
+
 async function runExtraTurnProbes(params: {
   model: Model<Api>;
   apiKey: string;
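The regression above targets the replay path that the plugin's `buildProviderReplayFamilyHooks({ family: "openai-compatible" })` wiring has to survive: an assistant message containing a completed tool call is fed back verbatim in the next request. Schematically, the transcript it replays looks like this (shapes abbreviated relative to the codebase's real message types; this restates the code above rather than adding any API surface):

```ts
// Abbreviated schematic of the transcript runDeepSeekV4ReplayRegression replays:
const replayedTranscript = [
  { role: "user", content: "Call the tool `noop` with {}. Do not write any other text." },
  { role: "assistant", content: [{ type: "toolCall", name: "noop", arguments: {} }] },
  { role: "toolResult", toolName: "noop", content: [{ type: "text", text: "ok" }], isError: false },
  { role: "user", content: "Reply with the word ok." },
];
// The run passes when the completion after this history is not an error and
// contains assistant text, i.e. the provider accepts its own earlier
// tool-call message on replay instead of rejecting the conversation.
```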
@@ -849,6 +949,24 @@ describeLive("live models (profile keys)", () => {
         break;
       }

+      if (isDeepSeekV4Model(model)) {
+        logProgress(`${progressLabel}: DeepSeek V4 replay regression`);
+        await runDeepSeekV4ReplayRegression({
+          model,
+          apiKey,
+          timeoutMs: perModelTimeoutMs,
+          progressLabel,
+        });
+        await runExtraTurnProbes({
+          model,
+          apiKey,
+          timeoutMs: perModelTimeoutMs,
+          progressLabel,
+        });
+        logProgress(`${progressLabel}: done`);
+        break;
+      }
+
       logProgress(`${progressLabel}: prompt`);
       const ok = await completeOkWithRetry({
         model,