From 75c8c1bebee1119c5641d0d8f90bf1b6c23c3892 Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Mon, 27 Apr 2026 11:26:23 +0100
Subject: [PATCH] fix(agents): honor qwen chat-template thinking compat

---
 CHANGELOG.md                               |  1 +
 docs/gateway/config-agents.md              |  2 +-
 docs/providers/vllm.md                     | 21 +++++++
 src/agents/openai-completions-compat.ts    |  2 +-
 src/agents/openai-transport-stream.test.ts | 72 ++++++++++++++++++++++
 src/agents/openai-transport-stream.ts      | 33 +++++++++-
 6 files changed, 128 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 484055d8ba6..fa742119a64 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -29,6 +29,7 @@ Docs: https://docs.openclaw.ai
 - Agents/OpenAI-compatible: retry replay-safe empty `stop` turns once for `openai-completions` endpoints, so transient empty local backend responses no longer surface as “Agent couldn't generate a response” when a continuation succeeds, and restore `openclaw agent --model` for one-shot CLI runs. Fixes #72751. Thanks @moooV252.
 - Git hooks: skip ignored staged paths when formatting and restaging pre-commit files, so merge commits no longer abort when `.gitignore` newly ignores staged merged content. Fixes #72744. Thanks @100yenadmin.
 - Memory-core/dreaming: add a supported `dreaming.model` knob for Dream Diary narrative subagents, wired through phase config and the existing plugin subagent model-override trust gate. Refs #65963. Thanks @esqandil and @mjamiv.
+- Agents/vLLM: honor `compat.thinkingFormat: "qwen-chat-template"` by sending Qwen chat-template thinking kwargs, including preserved thinking for agent loops, and support DashScope-style `qwen` top-level thinking flags. Fixes #72329. Thanks @stavrostzagadouris.
 - Memory-core/dreaming: treat request-scoped narrative fallback as expected, skip session cleanup when no subagent run was created, and remove duplicate phase-level cleanup so fallback no longer emits warning noise. Fixes #67152. Thanks @jsompis.
 - Agents/exec: apply configured `tools.exec.timeoutSec` to background, `yieldMs`, and node `system.run` commands when no per-call timeout is set, preventing auto-backgrounded and remote node commands from running indefinitely. Fixes #67600; supersedes #67603. Thanks @dlmpx and @kagura-agent.
 - Config/doctor: stop masking unknown-key validation diagnostics such as `agents.defaults.llm`, and have `openclaw doctor --fix` remove the retired `agents.defaults.llm` timeout block. Thanks @aidiffuser.
diff --git a/docs/gateway/config-agents.md b/docs/gateway/config-agents.md
index 4c36bf8e8d1..b6a2616c0fa 100644
--- a/docs/gateway/config-agents.md
+++ b/docs/gateway/config-agents.md
@@ -371,7 +371,7 @@ Time format in system prompt. Default: `auto` (OS preference).
 - `params`: global default provider parameters applied to all models. Set at `agents.defaults.params` (e.g. `{ cacheRetention: "long" }`).
 - `params` merge precedence (config): `agents.defaults.params` (global base) is overridden by `agents.defaults.models["provider/model"].params` (per-model), then `agents.list[].params` (matching agent id) overrides by key. See [Prompt Caching](/reference/prompt-caching) for details.
 - `params.extra_body`/`params.extraBody`: advanced pass-through JSON merged into `api: "openai-completions"` request bodies for OpenAI-compatible proxies. If it collides with generated request keys, the extra body wins; non-native completions routes still strip OpenAI-only `store` afterward.
-- `params.chat_template_kwargs`: vLLM/OpenAI-compatible chat-template arguments merged into top-level `api: "openai-completions"` request bodies. For `vllm/nemotron-3-*` with thinking off, OpenClaw automatically sends `enable_thinking: false` and `force_nonempty_content: true`; explicit `chat_template_kwargs` override those defaults, and `extra_body.chat_template_kwargs` still has final precedence.
+- `params.chat_template_kwargs`: vLLM/OpenAI-compatible chat-template arguments merged into top-level `api: "openai-completions"` request bodies. For `vllm/nemotron-3-*` with thinking off, OpenClaw automatically sends `enable_thinking: false` and `force_nonempty_content: true`; models with `compat.thinkingFormat: "qwen-chat-template"` map OpenClaw thinking controls to `chat_template_kwargs.enable_thinking` plus `preserve_thinking: true`; explicit `chat_template_kwargs` override generated defaults, and `extra_body.chat_template_kwargs` still has final precedence.
 - `params.preserveThinking`: Z.AI-only opt-in for preserved thinking. When enabled and thinking is on, OpenClaw sends `thinking.clear_thinking: false` and replays prior `reasoning_content`; see [Z.AI thinking and preserved thinking](/providers/zai#thinking-and-preserved-thinking).
 - `agentRuntime`: default low-level agent runtime policy. Omitted id defaults to OpenClaw Pi. Use `id: "pi"` to force the built-in PI harness, `id: "auto"` to let registered plugin harnesses claim supported models, a registered harness id such as `id: "codex"`, or a supported CLI backend alias such as `id: "claude-cli"`. Set `fallback: "none"` to disable automatic PI fallback. Explicit plugin runtimes such as `codex` fail closed by default unless you set `fallback: "pi"` in the same override scope. Keep model refs canonical as `provider/model`; select Codex, Claude CLI, Gemini CLI, and other execution backends through runtime config instead of legacy runtime provider prefixes. See [Agent runtimes](/concepts/agent-runtimes) for how this differs from provider/model selection.
 - Config writers that mutate these fields (for example `/models set`, `/models set-image`, and fallback add/remove commands) save canonical object form and preserve existing fallback lists when possible.
diff --git a/docs/providers/vllm.md b/docs/providers/vllm.md
index 48dc533a676..d1efe226fed 100644
--- a/docs/providers/vllm.md
+++ b/docs/providers/vllm.md
@@ -129,6 +129,27 @@ Use explicit config when:
+
+For Qwen models served through vLLM, set
+`compat.thinkingFormat: "qwen-chat-template"` on the model entry when the
+server expects Qwen chat-template kwargs. OpenClaw maps `/think off` to:
+
+```json
+{
+  "chat_template_kwargs": {
+    "enable_thinking": false,
+    "preserve_thinking": true
+  }
+}
+```
+
+Non-`off` thinking levels send `enable_thinking: true`. If your endpoint
+expects DashScope-style top-level flags instead, use
+`compat.thinkingFormat: "qwen"` to send `enable_thinking` at the request
+root.
+
+
+vLLM/Nemotron 3 can use chat-template kwargs to control whether reasoning is returned as hidden reasoning or visible answer text.
 When an OpenClaw session
diff --git a/src/agents/openai-completions-compat.ts b/src/agents/openai-completions-compat.ts
index e81fcfa9c87..b0619f5f25c 100644
--- a/src/agents/openai-completions-compat.ts
+++ b/src/agents/openai-completions-compat.ts
@@ -17,7 +17,7 @@ export type OpenAICompletionsCompatDefaults = {
   supportsReasoningEffort: boolean;
   supportsUsageInStreaming: boolean;
   maxTokensField: "max_completion_tokens" | "max_tokens";
-  thinkingFormat: "openai" | "openrouter" | "deepseek" | "zai";
+  thinkingFormat: "openai" | "openrouter" | "deepseek" | "zai" | "qwen" | "qwen-chat-template";
   visibleReasoningDetailTypes: string[];
   supportsStrictMode: boolean;
 };
diff --git a/src/agents/openai-transport-stream.test.ts b/src/agents/openai-transport-stream.test.ts
index e961c131352..bb44991b6fa 100644
--- a/src/agents/openai-transport-stream.test.ts
+++ b/src/agents/openai-transport-stream.test.ts
@@ -1816,6 +1816,78 @@ describe("openai transport stream", () => {
     expect(params.stream_options).toMatchObject({ include_usage: true });
   });
 
+  it("maps qwen-chat-template thinking compat to vLLM chat template kwargs", () => {
+    const baseModel = {
+      id: "Qwen/Qwen3-8B",
+      name: "Qwen3 8B",
+      api: "openai-completions",
+      provider: "vllm",
+      baseUrl: "http://127.0.0.1:8000/v1",
+      reasoning: true,
+      input: ["text"],
+      cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
+      contextWindow: 32768,
+      maxTokens: 8192,
+      compat: { thinkingFormat: "qwen-chat-template" },
+    } as unknown as Model<"openai-completions">;
+    const context = {
+      systemPrompt: "system",
+      messages: [],
+      tools: [],
+    } as never;
+
+    const disabled = buildOpenAICompletionsParams(baseModel, context, {
+      reasoning: "none",
+    } as never) as {
+      chat_template_kwargs?: { enable_thinking?: unknown; preserve_thinking?: unknown };
+    };
+    const enabled = buildOpenAICompletionsParams(baseModel, context, {
+      reasoning: "medium",
+    } as never) as {
+      chat_template_kwargs?: { enable_thinking?: unknown; preserve_thinking?: unknown };
+    };
+
+    expect(disabled.chat_template_kwargs).toEqual({
+      enable_thinking: false,
+      preserve_thinking: true,
+    });
+    expect(disabled).not.toHaveProperty("reasoning_effort");
+    expect(enabled.chat_template_kwargs).toEqual({
+      enable_thinking: true,
+      preserve_thinking: true,
+    });
+    expect(enabled).not.toHaveProperty("reasoning_effort");
+  });
+
+  it("maps qwen thinking compat to top-level enable_thinking", () => {
+    const params = buildOpenAICompletionsParams(
+      {
+        id: "qwen3.6-plus",
+        name: "Qwen 3.6 Plus",
+        api: "openai-completions",
+        provider: "qwen-custom",
+        baseUrl: "https://example.com/v1",
+        reasoning: true,
+        input: ["text"],
+        cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
+        contextWindow: 32768,
+        maxTokens: 8192,
+        compat: { thinkingFormat: "qwen" },
+      } as unknown as Model<"openai-completions">,
+      {
+        systemPrompt: "system",
+        messages: [],
+        tools: [],
+      } as never,
+      {
+        reasoning: "none",
+      } as never,
+    ) as { enable_thinking?: unknown; reasoning_effort?: unknown };
+
+    expect(params.enable_thinking).toBe(false);
+    expect(params).not.toHaveProperty("reasoning_effort");
+  });
+
   it("enables streaming usage compat for generic providers on native DashScope endpoints", () => {
     const params = buildOpenAICompletionsParams(
       {
diff --git a/src/agents/openai-transport-stream.ts b/src/agents/openai-transport-stream.ts
index 67045555539..6238ff7699b 100644
--- a/src/agents/openai-transport-stream.ts
+++ b/src/agents/openai-transport-stream.ts
@@ -1631,6 +1631,29 @@ function resolveOpenAICompletionsReasoningEffort(options: OpenAICompletionsOptio
   return options?.reasoningEffort ?? options?.reasoning ?? "high";
 }
 
+function isCompletionsThinkingEnabled(effort: string): boolean {
+  return normalizeOpenAIReasoningEffort(effort) !== "none";
+}
+
+function setChatTemplateThinking(params: Record<string, unknown>, enabled: boolean): void {
+  const existing = params.chat_template_kwargs;
+  if (existing && typeof existing === "object" && !Array.isArray(existing)) {
+    const next: Record<string, unknown> = {
+      ...(existing as Record<string, unknown>),
+      enable_thinking: enabled,
+    };
+    if (!Object.hasOwn(next, "preserve_thinking")) {
+      next.preserve_thinking = true;
+    }
+    params.chat_template_kwargs = next;
+    return;
+  }
+  params.chat_template_kwargs = {
+    enable_thinking: enabled,
+    preserve_thinking: true,
+  };
+}
+
 function convertTools(
   tools: NonNullable,
   compat: ReturnType,
@@ -1814,7 +1837,15 @@ export function buildOpenAICompletionsParams(
         fallbackMap: compat.reasoningEffortMap,
       })
     : undefined;
-  if (
+  if (compat.thinkingFormat === "qwen" && model.reasoning && completionsReasoningEffort) {
+    params.enable_thinking = isCompletionsThinkingEnabled(completionsReasoningEffort);
+  } else if (
+    compat.thinkingFormat === "qwen-chat-template" &&
+    model.reasoning &&
+    completionsReasoningEffort
+  ) {
+    setChatTemplateThinking(params, isCompletionsThinkingEnabled(completionsReasoningEffort));
+  } else if (
     compat.thinkingFormat === "openrouter" &&
     model.reasoning &&
     resolvedCompletionsReasoningEffort
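
Reviewer sketch (not part of the patch): the following self-contained TypeScript demonstrates the merge semantics of `setChatTemplateThinking` as added above. The helper body is copied verbatim from the diff; the demo `params` object and its `custom_flag` key are illustrative only.

```ts
// Copied from src/agents/openai-transport-stream.ts in the diff above.
function setChatTemplateThinking(params: Record<string, unknown>, enabled: boolean): void {
  const existing = params.chat_template_kwargs;
  if (existing && typeof existing === "object" && !Array.isArray(existing)) {
    // Re-spread so pre-existing kwargs keep their values: enable_thinking is
    // forced, preserve_thinking is only defaulted when absent.
    const next: Record<string, unknown> = {
      ...(existing as Record<string, unknown>),
      enable_thinking: enabled,
    };
    if (!Object.hasOwn(next, "preserve_thinking")) {
      next.preserve_thinking = true;
    }
    params.chat_template_kwargs = next;
    return;
  }
  params.chat_template_kwargs = {
    enable_thinking: enabled,
    preserve_thinking: true,
  };
}

// Illustrative input: an explicit preserve_thinking plus an unrelated key.
const params: Record<string, unknown> = {
  chat_template_kwargs: { preserve_thinking: false, custom_flag: 1 },
};
setChatTemplateThinking(params, true);
console.log(params.chat_template_kwargs);
// -> { preserve_thinking: false, custom_flag: 1, enable_thinking: true }
```

Note that the helper itself force-overwrites `enable_thinking` on whatever kwargs are already present at this point in request building; the documented precedence of user-level `params.chat_template_kwargs` and `extra_body.chat_template_kwargs` is presumably enforced by later merge steps outside this hunk.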