mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 04:50:44 +00:00
refactor(vllm): own qwen thinking payloads
This commit is contained in:
@@ -30,7 +30,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Git hooks: skip ignored staged paths when formatting and restaging pre-commit files, so merge commits no longer abort when `.gitignore` newly ignores staged merged content. Fixes #72744. Thanks @100yenadmin.
|
||||
- Memory-core/dreaming: add a supported `dreaming.model` knob for Dream Diary narrative subagents, wired through phase config and the existing plugin subagent model-override trust gate. Refs #65963. Thanks @esqandil and @mjamiv.
|
||||
- Agents/Anthropic: remove trailing assistant prefill payloads when extended thinking is enabled, so Opus 4.7/Sonnet 4.6 requests do not fail Anthropic's user-final-turn validation. Fixes #72739. Thanks @superandylin.
|
||||
- Agents/vLLM: honor `compat.thinkingFormat: "qwen-chat-template"` by sending Qwen chat-template thinking kwargs, including preserved thinking for agent loops, and support DashScope-style `qwen` top-level thinking flags. Fixes #72329. Thanks @stavrostzagadouris.
|
||||
- Agents/vLLM/Qwen: add plugin-owned Qwen thinking controls for vLLM chat-template kwargs and DashScope-style top-level `enable_thinking` flags, including preserved thinking for agent loops. Fixes #72329. Thanks @stavrostzagadouris.
|
||||
- Memory-core/dreaming: treat request-scoped narrative fallback as expected, skip session cleanup when no subagent run was created, and remove duplicate phase-level cleanup so fallback no longer emits warning noise. Fixes #67152. Thanks @jsompis.
|
||||
- Agents/exec: apply configured `tools.exec.timeoutSec` to background, `yieldMs`, and node `system.run` commands when no per-call timeout is set, preventing auto-backgrounded and remote node commands from running indefinitely. Fixes #67600; supersedes #67603. Thanks @dlmpx and @kagura-agent.
|
||||
- Config/doctor: stop masking unknown-key validation diagnostics such as `agents.defaults.llm`, and have `openclaw doctor --fix` remove the retired `agents.defaults.llm` timeout block. Thanks @aidiffuser.
|
||||
|
||||
@@ -371,7 +371,7 @@ Time format in system prompt. Default: `auto` (OS preference).
|
||||
- `params`: global default provider parameters applied to all models. Set at `agents.defaults.params` (e.g. `{ cacheRetention: "long" }`).
|
||||
- `params` merge precedence (config): `agents.defaults.params` (global base) is overridden by `agents.defaults.models["provider/model"].params` (per-model), then `agents.list[].params` (matching agent id) overrides by key. See [Prompt Caching](/reference/prompt-caching) for details.
|
||||
- `params.extra_body`/`params.extraBody`: advanced pass-through JSON merged into `api: "openai-completions"` request bodies for OpenAI-compatible proxies. If it collides with generated request keys, the extra body wins; non-native completions routes still strip OpenAI-only `store` afterward.
|
||||
- `params.chat_template_kwargs`: vLLM/OpenAI-compatible chat-template arguments merged into top-level `api: "openai-completions"` request bodies. For `vllm/nemotron-3-*` with thinking off, OpenClaw automatically sends `enable_thinking: false` and `force_nonempty_content: true`; models with `compat.thinkingFormat: "qwen-chat-template"` map OpenClaw thinking controls to `chat_template_kwargs.enable_thinking` plus `preserve_thinking: true`; explicit `chat_template_kwargs` override generated defaults, and `extra_body.chat_template_kwargs` still has final precedence.
|
||||
- `params.chat_template_kwargs`: vLLM/OpenAI-compatible chat-template arguments merged into top-level `api: "openai-completions"` request bodies. For `vllm/nemotron-3-*` with thinking off, OpenClaw automatically sends `enable_thinking: false` and `force_nonempty_content: true`; explicit `chat_template_kwargs` override generated defaults, and `extra_body.chat_template_kwargs` still has final precedence. For vLLM Qwen thinking controls, set `params.qwenThinkingFormat` to `"chat-template"` or `"top-level"` on that model entry.
|
||||
- `params.preserveThinking`: Z.AI-only opt-in for preserved thinking. When enabled and thinking is on, OpenClaw sends `thinking.clear_thinking: false` and replays prior `reasoning_content`; see [Z.AI thinking and preserved thinking](/providers/zai#thinking-and-preserved-thinking).
|
||||
- `agentRuntime`: default low-level agent runtime policy. Omitted id defaults to OpenClaw Pi. Use `id: "pi"` to force the built-in PI harness, `id: "auto"` to let registered plugin harnesses claim supported models, a registered harness id such as `id: "codex"`, or a supported CLI backend alias such as `id: "claude-cli"`. Set `fallback: "none"` to disable automatic PI fallback. Explicit plugin runtimes such as `codex` fail closed by default unless you set `fallback: "pi"` in the same override scope. Keep model refs canonical as `provider/model`; select Codex, Claude CLI, Gemini CLI, and other execution backends through runtime config instead of legacy runtime provider prefixes. See [Agent runtimes](/concepts/agent-runtimes) for how this differs from provider/model selection.
|
||||
- Config writers that mutate these fields (for example `/models set`, `/models set-image`, and fallback add/remove commands) save canonical object form and preserve existing fallback lists when possible.
|
||||
|
||||
@@ -169,6 +169,13 @@ Availability can still vary by endpoint and billing plan even when a model is
|
||||
present in the bundled catalog.
|
||||
</Note>
|
||||
|
||||
## Thinking Controls
|
||||
|
||||
For reasoning-enabled Qwen Cloud models, the bundled provider maps OpenClaw
|
||||
thinking levels to DashScope's top-level `enable_thinking` request flag. Disabled
|
||||
thinking sends `enable_thinking: false`; other thinking levels send
|
||||
`enable_thinking: true`.
|
||||
|
||||
## Multimodal add-ons
|
||||
|
||||
The `qwen` plugin also exposes multimodal capabilities on the **Standard**
|
||||
|
||||
@@ -131,7 +131,7 @@ Use explicit config when:
|
||||
|
||||
<Accordion title="Qwen thinking controls">
|
||||
For Qwen models served through vLLM, set
|
||||
`compat.thinkingFormat: "qwen-chat-template"` on the model entry when the
|
||||
`params.qwenThinkingFormat: "chat-template"` on the model entry when the
|
||||
server expects Qwen chat-template kwargs. OpenClaw maps `/think off` to:
|
||||
|
||||
```json
|
||||
@@ -145,8 +145,8 @@ Use explicit config when:
|
||||
|
||||
Non-`off` thinking levels send `enable_thinking: true`. If your endpoint
|
||||
expects DashScope-style top-level flags instead, use
|
||||
`compat.thinkingFormat: "qwen"` to send `enable_thinking` at the request
|
||||
root.
|
||||
`params.qwenThinkingFormat: "top-level"` to send `enable_thinking` at the
|
||||
request root. Snake-case `params.qwen_thinking_format` is also accepted.
|
||||
|
||||
</Accordion>
|
||||
|
||||
|
||||
@@ -31,3 +31,4 @@ export {
|
||||
MODELSTUDIO_MODEL_CATALOG,
|
||||
} from "./models.js";
|
||||
export { buildModelStudioProvider, buildQwenProvider } from "./provider-catalog.js";
|
||||
export { createQwenThinkingWrapper, wrapQwenProviderStream } from "./stream.js";
|
||||
|
||||
@@ -10,6 +10,7 @@ import {
|
||||
QWEN_DEFAULT_MODEL_REF,
|
||||
} from "./onboard.js";
|
||||
import { buildQwenProvider } from "./provider-catalog.js";
|
||||
import { wrapQwenProviderStream } from "./stream.js";
|
||||
import { buildQwenVideoGenerationProvider } from "./video-generation-provider.js";
|
||||
|
||||
const PROVIDER_ID = "qwen";
|
||||
@@ -165,6 +166,7 @@ export default defineSingleProviderPluginEntry({
|
||||
},
|
||||
applyNativeStreamingUsageCompat: ({ providerConfig }) =>
|
||||
applyQwenNativeStreamingUsageCompat(providerConfig),
|
||||
wrapStreamFn: wrapQwenProviderStream,
|
||||
normalizeConfig: ({ providerConfig }) => {
|
||||
if (!isQwenCodingPlanBaseUrl(providerConfig.baseUrl)) {
|
||||
return undefined;
|
||||
|
||||
94
extensions/qwen/stream.test.ts
Normal file
94
extensions/qwen/stream.test.ts
Normal file
@@ -0,0 +1,94 @@
|
||||
import type { StreamFn } from "@mariozechner/pi-agent-core";
|
||||
import type { Context, Model } from "@mariozechner/pi-ai";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { createQwenThinkingWrapper, wrapQwenProviderStream } from "./stream.js";
|
||||
|
||||
/**
 * Drives createQwenThinkingWrapper with a stub StreamFn and returns the
 * request payload after the wrapper's onPayload patch has run.
 */
function capturePayload(params: {
  // Session-level thinking setting handed to the wrapper (defaults to "high").
  thinkingLevel?: "off" | "low" | "medium" | "high" | "xhigh" | "max";
  // Per-call reasoning option forwarded as StreamFn options.reasoning.
  reasoning?: unknown;
  // Pre-existing request payload fields the wrapper should patch in place.
  initialPayload?: Record<string, unknown>;
  // Overrides merged over the default Qwen completions model stub.
  model?: Partial<Model<"openai-completions">>;
}): Record<string, unknown> {
  let captured: Record<string, unknown> = {};
  // Stub stream fn: builds the payload, lets the wrapper patch it via
  // onPayload, and records the result instead of performing any I/O.
  const baseStreamFn: StreamFn = (_model, _context, options) => {
    const payload = { ...params.initialPayload };
    options?.onPayload?.(payload, _model);
    captured = payload;
    return {} as ReturnType<StreamFn>;
  };

  const wrapped = createQwenThinkingWrapper(baseStreamFn, params.thinkingLevel ?? "high");
  // The returned stream is irrelevant here; only the patched payload matters.
  void wrapped(
    {
      api: "openai-completions",
      provider: "qwen",
      id: "qwen3.6-plus",
      reasoning: true,
      ...params.model,
    } as Model<"openai-completions">,
    { messages: [] } as Context,
    params.reasoning === undefined ? {} : ({ reasoning: params.reasoning } as never),
  );

  return captured;
}
|
||||
|
||||
// Payload-level behavior of the Qwen thinking wrapper.
describe("createQwenThinkingWrapper", () => {
  it("maps disabled thinking to Qwen top-level enable_thinking", () => {
    const payload = capturePayload({
      reasoning: "none",
      // OpenAI-style reasoning fields must be stripped, not forwarded.
      initialPayload: {
        reasoning_effort: "high",
        reasoning: { effort: "high" },
        reasoningEffort: "high",
      },
    });

    expect(payload).toEqual({ enable_thinking: false });
  });

  it("maps enabled thinking to Qwen top-level enable_thinking", () => {
    expect(capturePayload({ reasoning: "medium" })).toEqual({ enable_thinking: true });
  });

  it("falls back to the session thinking level", () => {
    // No per-call reasoning option: the wrapper uses thinkingLevel instead.
    expect(capturePayload({ thinkingLevel: "off" })).toEqual({ enable_thinking: false });
    expect(capturePayload({ thinkingLevel: "high" })).toEqual({ enable_thinking: true });
  });

  it("skips non-reasoning and non-completions models", () => {
    // Payload stays untouched when the wrapper does not own the model.
    expect(capturePayload({ model: { reasoning: false } })).toEqual({});
    expect(capturePayload({ model: { api: "openai-responses" as never } })).toEqual({});
  });
});
|
||||
|
||||
// Registration predicate of the provider-level wrap hook.
describe("wrapQwenProviderStream", () => {
  it("only registers for Qwen-family OpenAI-compatible providers", () => {
    // Qwen-family provider id + completions model: wrapper is returned.
    expect(
      wrapQwenProviderStream({
        provider: "qwencloud",
        modelId: "qwen3.6-plus",
        model: {
          api: "openai-completions",
          provider: "qwen",
          id: "qwen3.6-plus",
          reasoning: true,
        } as Model<"openai-completions">,
        streamFn: undefined,
      } as never),
    ).toBeTypeOf("function");

    // Non-Qwen provider: hook declines so other providers are untouched.
    expect(
      wrapQwenProviderStream({
        provider: "openai",
        modelId: "gpt-5.4",
        model: {
          api: "openai-completions",
          provider: "openai",
          id: "gpt-5.4",
        } as Model<"openai-completions">,
        streamFn: undefined,
      } as never),
    ).toBeUndefined();
  });
});
|
||||
56
extensions/qwen/stream.ts
Normal file
56
extensions/qwen/stream.ts
Normal file
@@ -0,0 +1,56 @@
|
||||
import type { StreamFn } from "@mariozechner/pi-agent-core";
|
||||
import { streamSimple } from "@mariozechner/pi-ai";
|
||||
import type { ProviderWrapStreamFnContext } from "openclaw/plugin-sdk/plugin-entry";
|
||||
import { normalizeProviderId } from "openclaw/plugin-sdk/provider-model-shared";
|
||||
import { streamWithPayloadPatch } from "openclaw/plugin-sdk/provider-stream-shared";
|
||||
|
||||
type QwenThinkingLevel = ProviderWrapStreamFnContext["thinkingLevel"];
|
||||
|
||||
function isQwenProviderId(providerId: string): boolean {
|
||||
const normalized = normalizeProviderId(providerId);
|
||||
return (
|
||||
normalized === "qwen" ||
|
||||
normalized === "modelstudio" ||
|
||||
normalized === "qwencloud" ||
|
||||
normalized === "dashscope"
|
||||
);
|
||||
}
|
||||
|
||||
function resolveOpenAICompatibleThinkingEnabled(params: {
|
||||
thinkingLevel: QwenThinkingLevel;
|
||||
options: Parameters<StreamFn>[2];
|
||||
}): boolean {
|
||||
const options = (params.options ?? {}) as { reasoningEffort?: unknown; reasoning?: unknown };
|
||||
const raw = options.reasoningEffort ?? options.reasoning ?? params.thinkingLevel ?? "high";
|
||||
if (typeof raw !== "string") {
|
||||
return true;
|
||||
}
|
||||
const normalized = raw.trim().toLowerCase();
|
||||
return normalized !== "off" && normalized !== "none";
|
||||
}
|
||||
|
||||
export function createQwenThinkingWrapper(
|
||||
baseStreamFn: StreamFn | undefined,
|
||||
thinkingLevel: QwenThinkingLevel,
|
||||
): StreamFn {
|
||||
const underlying = baseStreamFn ?? streamSimple;
|
||||
return (model, context, options) => {
|
||||
if (model.api !== "openai-completions" || !model.reasoning) {
|
||||
return underlying(model, context, options);
|
||||
}
|
||||
const enableThinking = resolveOpenAICompatibleThinkingEnabled({ thinkingLevel, options });
|
||||
return streamWithPayloadPatch(underlying, model, context, options, (payloadObj) => {
|
||||
payloadObj.enable_thinking = enableThinking;
|
||||
delete payloadObj.reasoning_effort;
|
||||
delete payloadObj.reasoningEffort;
|
||||
delete payloadObj.reasoning;
|
||||
});
|
||||
};
|
||||
}
|
||||
|
||||
export function wrapQwenProviderStream(ctx: ProviderWrapStreamFnContext): StreamFn | undefined {
|
||||
if (!isQwenProviderId(ctx.provider) || (ctx.model && ctx.model.api !== "openai-completions")) {
|
||||
return undefined;
|
||||
}
|
||||
return createQwenThinkingWrapper(ctx.streamFn, ctx.thinkingLevel);
|
||||
}
|
||||
@@ -5,3 +5,4 @@ export {
|
||||
VLLM_PROVIDER_LABEL,
|
||||
} from "./defaults.js";
|
||||
export { buildVllmProvider } from "./models.js";
|
||||
export { createVllmQwenThinkingWrapper, wrapVllmProviderStream } from "./stream.js";
|
||||
|
||||
@@ -10,6 +10,7 @@ import {
|
||||
VLLM_MODEL_PLACEHOLDER,
|
||||
VLLM_PROVIDER_LABEL,
|
||||
} from "./api.js";
|
||||
import { wrapVllmProviderStream } from "./stream.js";
|
||||
|
||||
const PROVIDER_ID = "vllm";
|
||||
|
||||
@@ -89,6 +90,7 @@ export default definePluginEntry({
|
||||
"vLLM requires authentication to be registered as a provider. " +
|
||||
'Set VLLM_API_KEY (any value works) or run "openclaw configure". ' +
|
||||
"See: https://docs.openclaw.ai/providers/vllm",
|
||||
wrapStreamFn: wrapVllmProviderStream,
|
||||
});
|
||||
},
|
||||
});
|
||||
|
||||
170
extensions/vllm/stream.test.ts
Normal file
170
extensions/vllm/stream.test.ts
Normal file
@@ -0,0 +1,170 @@
|
||||
import type { StreamFn } from "@mariozechner/pi-agent-core";
|
||||
import type { Context, Model } from "@mariozechner/pi-ai";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { createVllmQwenThinkingWrapper, wrapVllmProviderStream } from "./stream.js";
|
||||
|
||||
/**
 * Drives createVllmQwenThinkingWrapper with a stub StreamFn and returns the
 * request payload after the wrapper's onPayload patch has run.
 */
function capturePayload(params: {
  // Which Qwen thinking format the wrapper should emit.
  format: "chat-template" | "top-level";
  // Session-level thinking setting handed to the wrapper (defaults to "high").
  thinkingLevel?: "off" | "low" | "medium" | "high" | "xhigh" | "max";
  // Per-call reasoning option forwarded as StreamFn options.reasoning.
  reasoning?: unknown;
  // Pre-existing request payload fields the wrapper should patch in place.
  initialPayload?: Record<string, unknown>;
  // Overrides merged over the default vLLM Qwen completions model stub.
  model?: Partial<Model<"openai-completions">>;
}): Record<string, unknown> {
  let captured: Record<string, unknown> = {};
  // Stub stream fn: builds the payload, lets the wrapper patch it via
  // onPayload, and records the result instead of performing any I/O.
  const baseStreamFn: StreamFn = (_model, _context, options) => {
    const payload = { ...params.initialPayload };
    options?.onPayload?.(payload, _model);
    captured = payload;
    return {} as ReturnType<StreamFn>;
  };

  const wrapped = createVllmQwenThinkingWrapper({
    baseStreamFn,
    format: params.format,
    thinkingLevel: params.thinkingLevel ?? "high",
  });
  // The returned stream is irrelevant here; only the patched payload matters.
  void wrapped(
    {
      api: "openai-completions",
      provider: "vllm",
      id: "Qwen/Qwen3-8B",
      reasoning: true,
      ...params.model,
    } as Model<"openai-completions">,
    { messages: [] } as Context,
    params.reasoning === undefined ? {} : ({ reasoning: params.reasoning } as never),
  );

  return captured;
}
|
||||
|
||||
// Payload-level behavior of the vLLM Qwen thinking wrapper in both formats.
describe("createVllmQwenThinkingWrapper", () => {
  it("maps Qwen chat-template thinking off to chat_template_kwargs", () => {
    const payload = capturePayload({
      format: "chat-template",
      reasoning: "none",
      // OpenAI-style reasoning fields must be stripped, not forwarded.
      initialPayload: {
        reasoning_effort: "high",
        reasoning: { effort: "high" },
        reasoningEffort: "high",
      },
    });

    expect(payload).toEqual({
      chat_template_kwargs: {
        enable_thinking: false,
        preserve_thinking: true,
      },
    });
  });

  it("maps Qwen chat-template thinking on to chat_template_kwargs", () => {
    expect(capturePayload({ format: "chat-template", reasoning: "medium" })).toEqual({
      chat_template_kwargs: {
        enable_thinking: true,
        preserve_thinking: true,
      },
    });
  });

  it("preserves explicit chat-template kwargs while setting enable_thinking", () => {
    // User-supplied kwargs (including preserve_thinking: false) survive the
    // merge; only enable_thinking is forced by the wrapper.
    expect(
      capturePayload({
        format: "chat-template",
        thinkingLevel: "off",
        initialPayload: {
          chat_template_kwargs: {
            preserve_thinking: false,
            force_nonempty_content: true,
          },
        },
      }),
    ).toEqual({
      chat_template_kwargs: {
        enable_thinking: false,
        preserve_thinking: false,
        force_nonempty_content: true,
      },
    });
  });

  it("maps Qwen top-level thinking format to enable_thinking", () => {
    expect(capturePayload({ format: "top-level", thinkingLevel: "off" })).toEqual({
      enable_thinking: false,
    });
    expect(capturePayload({ format: "top-level", thinkingLevel: "high" })).toEqual({
      enable_thinking: true,
    });
  });

  it("skips non-reasoning and non-completions models", () => {
    // Payload stays untouched when the wrapper does not own the model.
    expect(capturePayload({ format: "chat-template", model: { reasoning: false } })).toEqual({});
    expect(
      capturePayload({ format: "chat-template", model: { api: "openai-responses" as never } }),
    ).toEqual({});
  });
});
|
||||
|
||||
// Registration predicate of the provider-level wrap hook.
describe("wrapVllmProviderStream", () => {
  it("registers when vLLM Qwen thinking format params are configured", () => {
    // camelCase param spelling opts in.
    expect(
      wrapVllmProviderStream({
        provider: "vllm",
        modelId: "Qwen/Qwen3-8B",
        extraParams: { qwenThinkingFormat: "chat-template" },
        model: {
          api: "openai-completions",
          provider: "vllm",
          id: "Qwen/Qwen3-8B",
          reasoning: true,
        } as Model<"openai-completions">,
        streamFn: undefined,
      } as never),
    ).toBeTypeOf("function");

    // snake_case spelling and the "enable_thinking" alias also opt in.
    expect(
      wrapVllmProviderStream({
        provider: "vllm",
        modelId: "Qwen/Qwen3-8B",
        extraParams: { qwen_thinking_format: "enable_thinking" },
        model: {
          api: "openai-completions",
          provider: "vllm",
          id: "Qwen/Qwen3-8B",
          reasoning: true,
        } as Model<"openai-completions">,
        streamFn: undefined,
      } as never),
    ).toBeTypeOf("function");
  });

  it("skips unconfigured vLLM and non-vLLM providers", () => {
    // No qwenThinkingFormat param: hook declines even for vLLM.
    expect(
      wrapVllmProviderStream({
        provider: "vllm",
        modelId: "Qwen/Qwen3-8B",
        extraParams: {},
        model: {
          api: "openai-completions",
          provider: "vllm",
          id: "Qwen/Qwen3-8B",
        } as Model<"openai-completions">,
        streamFn: undefined,
      } as never),
    ).toBeUndefined();

    // Configured param on a non-vLLM provider is ignored.
    expect(
      wrapVllmProviderStream({
        provider: "openai",
        modelId: "gpt-5.4",
        extraParams: { qwenThinkingFormat: "chat-template" },
        model: {
          api: "openai-completions",
          provider: "openai",
          id: "gpt-5.4",
        } as Model<"openai-completions">,
        streamFn: undefined,
      } as never),
    ).toBeUndefined();
  });
});
|
||||
117
extensions/vllm/stream.ts
Normal file
117
extensions/vllm/stream.ts
Normal file
@@ -0,0 +1,117 @@
|
||||
import type { StreamFn } from "@mariozechner/pi-agent-core";
|
||||
import { streamSimple } from "@mariozechner/pi-ai";
|
||||
import type { ProviderWrapStreamFnContext } from "openclaw/plugin-sdk/plugin-entry";
|
||||
import { normalizeProviderId } from "openclaw/plugin-sdk/provider-model-shared";
|
||||
import { streamWithPayloadPatch } from "openclaw/plugin-sdk/provider-stream-shared";
|
||||
|
||||
type VllmThinkingLevel = ProviderWrapStreamFnContext["thinkingLevel"];
|
||||
type VllmQwenThinkingFormat = "chat-template" | "top-level";
|
||||
|
||||
function isVllmProviderId(providerId: string): boolean {
|
||||
return normalizeProviderId(providerId) === "vllm";
|
||||
}
|
||||
|
||||
function normalizeQwenThinkingFormat(value: unknown): VllmQwenThinkingFormat | undefined {
|
||||
if (typeof value !== "string") {
|
||||
return undefined;
|
||||
}
|
||||
const normalized = value.trim().toLowerCase().replace(/_/g, "-");
|
||||
if (
|
||||
normalized === "chat-template" ||
|
||||
normalized === "chat-template-kwargs" ||
|
||||
normalized === "chat-template-kwarg" ||
|
||||
normalized === "chat-template-arguments"
|
||||
) {
|
||||
return "chat-template";
|
||||
}
|
||||
if (
|
||||
normalized === "top-level" ||
|
||||
normalized === "enable-thinking" ||
|
||||
normalized === "request-body"
|
||||
) {
|
||||
return "top-level";
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function resolveVllmQwenThinkingFormat(
|
||||
extraParams: ProviderWrapStreamFnContext["extraParams"],
|
||||
): VllmQwenThinkingFormat | undefined {
|
||||
return normalizeQwenThinkingFormat(
|
||||
extraParams?.qwenThinkingFormat ?? extraParams?.qwen_thinking_format,
|
||||
);
|
||||
}
|
||||
|
||||
function resolveOpenAICompatibleThinkingEnabled(params: {
|
||||
thinkingLevel: VllmThinkingLevel;
|
||||
options: Parameters<StreamFn>[2];
|
||||
}): boolean {
|
||||
const options = (params.options ?? {}) as { reasoningEffort?: unknown; reasoning?: unknown };
|
||||
const raw = options.reasoningEffort ?? options.reasoning ?? params.thinkingLevel ?? "high";
|
||||
if (typeof raw !== "string") {
|
||||
return true;
|
||||
}
|
||||
const normalized = raw.trim().toLowerCase();
|
||||
return normalized !== "off" && normalized !== "none";
|
||||
}
|
||||
|
||||
function setQwenChatTemplateThinking(payload: Record<string, unknown>, enabled: boolean): void {
|
||||
const existing = payload.chat_template_kwargs;
|
||||
if (existing && typeof existing === "object" && !Array.isArray(existing)) {
|
||||
const next: Record<string, unknown> = {
|
||||
...(existing as Record<string, unknown>),
|
||||
enable_thinking: enabled,
|
||||
};
|
||||
if (!Object.hasOwn(next, "preserve_thinking")) {
|
||||
next.preserve_thinking = true;
|
||||
}
|
||||
payload.chat_template_kwargs = next;
|
||||
return;
|
||||
}
|
||||
payload.chat_template_kwargs = {
|
||||
enable_thinking: enabled,
|
||||
preserve_thinking: true,
|
||||
};
|
||||
}
|
||||
|
||||
export function createVllmQwenThinkingWrapper(params: {
|
||||
baseStreamFn: StreamFn | undefined;
|
||||
format: VllmQwenThinkingFormat;
|
||||
thinkingLevel: VllmThinkingLevel;
|
||||
}): StreamFn {
|
||||
const underlying = params.baseStreamFn ?? streamSimple;
|
||||
return (model, context, options) => {
|
||||
if (model.api !== "openai-completions" || !model.reasoning) {
|
||||
return underlying(model, context, options);
|
||||
}
|
||||
const enableThinking = resolveOpenAICompatibleThinkingEnabled({
|
||||
thinkingLevel: params.thinkingLevel,
|
||||
options,
|
||||
});
|
||||
return streamWithPayloadPatch(underlying, model, context, options, (payloadObj) => {
|
||||
if (params.format === "chat-template") {
|
||||
setQwenChatTemplateThinking(payloadObj, enableThinking);
|
||||
} else {
|
||||
payloadObj.enable_thinking = enableThinking;
|
||||
}
|
||||
delete payloadObj.reasoning_effort;
|
||||
delete payloadObj.reasoningEffort;
|
||||
delete payloadObj.reasoning;
|
||||
});
|
||||
};
|
||||
}
|
||||
|
||||
export function wrapVllmProviderStream(ctx: ProviderWrapStreamFnContext): StreamFn | undefined {
|
||||
if (!isVllmProviderId(ctx.provider) || (ctx.model && ctx.model.api !== "openai-completions")) {
|
||||
return undefined;
|
||||
}
|
||||
const format = resolveVllmQwenThinkingFormat(ctx.extraParams);
|
||||
if (!format) {
|
||||
return undefined;
|
||||
}
|
||||
return createVllmQwenThinkingWrapper({
|
||||
baseStreamFn: ctx.streamFn,
|
||||
format,
|
||||
thinkingLevel: ctx.thinkingLevel,
|
||||
});
|
||||
}
|
||||
@@ -17,7 +17,7 @@ export type OpenAICompletionsCompatDefaults = {
|
||||
supportsReasoningEffort: boolean;
|
||||
supportsUsageInStreaming: boolean;
|
||||
maxTokensField: "max_completion_tokens" | "max_tokens";
|
||||
thinkingFormat: "openai" | "openrouter" | "deepseek" | "zai" | "qwen" | "qwen-chat-template";
|
||||
thinkingFormat: "openai" | "openrouter" | "deepseek" | "zai";
|
||||
visibleReasoningDetailTypes: string[];
|
||||
supportsStrictMode: boolean;
|
||||
};
|
||||
|
||||
@@ -1816,78 +1816,6 @@ describe("openai transport stream", () => {
|
||||
expect(params.stream_options).toMatchObject({ include_usage: true });
|
||||
});
|
||||
|
||||
it("maps qwen-chat-template thinking compat to vLLM chat template kwargs", () => {
|
||||
const baseModel = {
|
||||
id: "Qwen/Qwen3-8B",
|
||||
name: "Qwen3 8B",
|
||||
api: "openai-completions",
|
||||
provider: "vllm",
|
||||
baseUrl: "http://127.0.0.1:8000/v1",
|
||||
reasoning: true,
|
||||
input: ["text"],
|
||||
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: 32768,
|
||||
maxTokens: 8192,
|
||||
compat: { thinkingFormat: "qwen-chat-template" },
|
||||
} as unknown as Model<"openai-completions">;
|
||||
const context = {
|
||||
systemPrompt: "system",
|
||||
messages: [],
|
||||
tools: [],
|
||||
} as never;
|
||||
|
||||
const disabled = buildOpenAICompletionsParams(baseModel, context, {
|
||||
reasoning: "none",
|
||||
} as never) as {
|
||||
chat_template_kwargs?: { enable_thinking?: unknown; preserve_thinking?: unknown };
|
||||
};
|
||||
const enabled = buildOpenAICompletionsParams(baseModel, context, {
|
||||
reasoning: "medium",
|
||||
} as never) as {
|
||||
chat_template_kwargs?: { enable_thinking?: unknown; preserve_thinking?: unknown };
|
||||
};
|
||||
|
||||
expect(disabled.chat_template_kwargs).toEqual({
|
||||
enable_thinking: false,
|
||||
preserve_thinking: true,
|
||||
});
|
||||
expect(disabled).not.toHaveProperty("reasoning_effort");
|
||||
expect(enabled.chat_template_kwargs).toEqual({
|
||||
enable_thinking: true,
|
||||
preserve_thinking: true,
|
||||
});
|
||||
expect(enabled).not.toHaveProperty("reasoning_effort");
|
||||
});
|
||||
|
||||
it("maps qwen thinking compat to top-level enable_thinking", () => {
|
||||
const params = buildOpenAICompletionsParams(
|
||||
{
|
||||
id: "qwen3.6-plus",
|
||||
name: "Qwen 3.6 Plus",
|
||||
api: "openai-completions",
|
||||
provider: "qwen-custom",
|
||||
baseUrl: "https://example.com/v1",
|
||||
reasoning: true,
|
||||
input: ["text"],
|
||||
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: 32768,
|
||||
maxTokens: 8192,
|
||||
compat: { thinkingFormat: "qwen" },
|
||||
} as unknown as Model<"openai-completions">,
|
||||
{
|
||||
systemPrompt: "system",
|
||||
messages: [],
|
||||
tools: [],
|
||||
} as never,
|
||||
{
|
||||
reasoning: "none",
|
||||
} as never,
|
||||
) as { enable_thinking?: unknown; reasoning_effort?: unknown };
|
||||
|
||||
expect(params.enable_thinking).toBe(false);
|
||||
expect(params).not.toHaveProperty("reasoning_effort");
|
||||
});
|
||||
|
||||
it("enables streaming usage compat for generic providers on native DashScope endpoints", () => {
|
||||
const params = buildOpenAICompletionsParams(
|
||||
{
|
||||
|
||||
@@ -87,8 +87,12 @@ type OpenAICompletionsOptions = BaseStreamOptions & {
|
||||
reasoningEffort?: OpenAIReasoningEffort;
|
||||
};
|
||||
|
||||
type OpenAIModeCompatInput = Omit<ModelCompatConfig, "thinkingFormat"> & {
|
||||
thinkingFormat?: string;
|
||||
};
|
||||
|
||||
type OpenAIModeModel = Omit<Model<Api>, "compat"> & {
|
||||
compat?: ModelCompatConfig;
|
||||
compat?: OpenAIModeCompatInput | null;
|
||||
};
|
||||
|
||||
type MutableAssistantOutput = {
|
||||
@@ -1592,7 +1596,7 @@ function getCompat(model: OpenAIModeModel): {
|
||||
requiresAssistantAfterToolResult:
|
||||
compat.requiresAssistantAfterToolResult ?? detected.requiresAssistantAfterToolResult,
|
||||
requiresThinkingAsText: compat.requiresThinkingAsText ?? detected.requiresThinkingAsText,
|
||||
thinkingFormat: (compat.thinkingFormat as string | undefined) ?? detected.thinkingFormat,
|
||||
thinkingFormat: compat.thinkingFormat ?? detected.thinkingFormat,
|
||||
openRouterRouting: (compat.openRouterRouting as Record<string, unknown> | undefined) ?? {},
|
||||
vercelGatewayRouting:
|
||||
(compat.vercelGatewayRouting as Record<string, unknown> | undefined) ??
|
||||
@@ -1631,29 +1635,6 @@ function resolveOpenAICompletionsReasoningEffort(options: OpenAICompletionsOptio
|
||||
return options?.reasoningEffort ?? options?.reasoning ?? "high";
|
||||
}
|
||||
|
||||
function isCompletionsThinkingEnabled(effort: string): boolean {
|
||||
return normalizeOpenAIReasoningEffort(effort) !== "none";
|
||||
}
|
||||
|
||||
function setChatTemplateThinking(params: Record<string, unknown>, enabled: boolean): void {
|
||||
const existing = params.chat_template_kwargs;
|
||||
if (existing && typeof existing === "object" && !Array.isArray(existing)) {
|
||||
const next: Record<string, unknown> = {
|
||||
...(existing as Record<string, unknown>),
|
||||
enable_thinking: enabled,
|
||||
};
|
||||
if (!Object.hasOwn(next, "preserve_thinking")) {
|
||||
next.preserve_thinking = true;
|
||||
}
|
||||
params.chat_template_kwargs = next;
|
||||
return;
|
||||
}
|
||||
params.chat_template_kwargs = {
|
||||
enable_thinking: enabled,
|
||||
preserve_thinking: true,
|
||||
};
|
||||
}
|
||||
|
||||
function convertTools(
|
||||
tools: NonNullable<Context["tools"]>,
|
||||
compat: ReturnType<typeof getCompat>,
|
||||
@@ -1837,15 +1818,7 @@ export function buildOpenAICompletionsParams(
|
||||
fallbackMap: compat.reasoningEffortMap,
|
||||
})
|
||||
: undefined;
|
||||
if (compat.thinkingFormat === "qwen" && model.reasoning && completionsReasoningEffort) {
|
||||
params.enable_thinking = isCompletionsThinkingEnabled(completionsReasoningEffort);
|
||||
} else if (
|
||||
compat.thinkingFormat === "qwen-chat-template" &&
|
||||
model.reasoning &&
|
||||
completionsReasoningEffort
|
||||
) {
|
||||
setChatTemplateThinking(params, isCompletionsThinkingEnabled(completionsReasoningEffort));
|
||||
} else if (
|
||||
if (
|
||||
compat.thinkingFormat === "openrouter" &&
|
||||
model.reasoning &&
|
||||
resolvedCompletionsReasoningEffort
|
||||
|
||||
@@ -642,7 +642,7 @@ describe("model compat config schema", () => {
|
||||
supportsUsageInStreaming: true,
|
||||
supportsStrictMode: false,
|
||||
requiresStringContent: true,
|
||||
thinkingFormat: "qwen",
|
||||
thinkingFormat: "zai",
|
||||
requiresToolResultName: true,
|
||||
requiresAssistantAfterToolResult: false,
|
||||
requiresThinkingAsText: false,
|
||||
|
||||
@@ -3105,14 +3105,6 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
|
||||
type: "string",
|
||||
const: "zai",
|
||||
},
|
||||
{
|
||||
type: "string",
|
||||
const: "qwen",
|
||||
},
|
||||
{
|
||||
type: "string",
|
||||
const: "qwen-chat-template",
|
||||
},
|
||||
],
|
||||
},
|
||||
requiresToolResultName: {
|
||||
|
||||
@@ -50,10 +50,9 @@ type SupportedAnthropicMessagesCompatFields = Pick<
|
||||
>;
|
||||
|
||||
type SupportedThinkingFormat =
|
||||
| NonNullable<OpenAICompletionsCompat["thinkingFormat"]>
|
||||
| Exclude<NonNullable<OpenAICompletionsCompat["thinkingFormat"]>, "qwen" | "qwen-chat-template">
|
||||
| "deepseek"
|
||||
| "openrouter"
|
||||
| "qwen-chat-template";
|
||||
| "openrouter";
|
||||
|
||||
export type ModelCompatConfig = SupportedOpenAICompatFields &
|
||||
SupportedOpenAIResponsesCompatFields &
|
||||
|
||||
@@ -204,8 +204,6 @@ export const ModelCompatSchema = z
|
||||
z.literal("openrouter"),
|
||||
z.literal("deepseek"),
|
||||
z.literal("zai"),
|
||||
z.literal("qwen"),
|
||||
z.literal("qwen-chat-template"),
|
||||
])
|
||||
.optional(),
|
||||
requiresToolResultName: z.boolean().optional(),
|
||||
|
||||
@@ -212,9 +212,7 @@ function normalizeModelCatalogCompat(value: unknown): ModelCompatConfig | undefi
|
||||
thinkingFormat === "openai" ||
|
||||
thinkingFormat === "openrouter" ||
|
||||
thinkingFormat === "deepseek" ||
|
||||
thinkingFormat === "zai" ||
|
||||
thinkingFormat === "qwen" ||
|
||||
thinkingFormat === "qwen-chat-template"
|
||||
thinkingFormat === "zai"
|
||||
) {
|
||||
compat.thinkingFormat = thinkingFormat;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user