fix(agents): honor qwen chat-template thinking compat

This commit is contained in:
Peter Steinberger
2026-04-27 11:26:23 +01:00
parent 3db407da40
commit 75c8c1bebe
6 changed files with 128 additions and 3 deletions

View File

@@ -29,6 +29,7 @@ Docs: https://docs.openclaw.ai
- Agents/OpenAI-compatible: retry replay-safe empty `stop` turns once for `openai-completions` endpoints, so transient empty local backend responses no longer surface as “Agent couldn't generate a response” when a continuation succeeds, and restore `openclaw agent --model` for one-shot CLI runs. Fixes #72751. Thanks @moooV252.
- Git hooks: skip ignored staged paths when formatting and restaging pre-commit files, so merge commits no longer abort when `.gitignore` newly ignores staged merged content. Fixes #72744. Thanks @100yenadmin.
- Memory-core/dreaming: add a supported `dreaming.model` knob for Dream Diary narrative subagents, wired through phase config and the existing plugin subagent model-override trust gate. Refs #65963. Thanks @esqandil and @mjamiv.
- Agents/vLLM: honor `compat.thinkingFormat: "qwen-chat-template"` by sending Qwen chat-template thinking kwargs, including preserved thinking for agent loops, and support DashScope-style `qwen` top-level thinking flags. Fixes #72329. Thanks @stavrostzagadouris.
- Memory-core/dreaming: treat request-scoped narrative fallback as expected, skip session cleanup when no subagent run was created, and remove duplicate phase-level cleanup so fallback no longer emits warning noise. Fixes #67152. Thanks @jsompis.
- Agents/exec: apply configured `tools.exec.timeoutSec` to background, `yieldMs`, and node `system.run` commands when no per-call timeout is set, preventing auto-backgrounded and remote node commands from running indefinitely. Fixes #67600; supersedes #67603. Thanks @dlmpx and @kagura-agent.
- Config/doctor: stop masking unknown-key validation diagnostics such as `agents.defaults.llm`, and have `openclaw doctor --fix` remove the retired `agents.defaults.llm` timeout block. Thanks @aidiffuser.

View File

@@ -371,7 +371,7 @@ Time format in system prompt. Default: `auto` (OS preference).
- `params`: global default provider parameters applied to all models. Set at `agents.defaults.params` (e.g. `{ cacheRetention: "long" }`).
- `params` merge precedence (config): `agents.defaults.params` (global base) is overridden by `agents.defaults.models["provider/model"].params` (per-model), then `agents.list[].params` (matching agent id) overrides by key. See [Prompt Caching](/reference/prompt-caching) for details.
- `params.extra_body`/`params.extraBody`: advanced pass-through JSON merged into `api: "openai-completions"` request bodies for OpenAI-compatible proxies. If it collides with generated request keys, the extra body wins; non-native completions routes still strip OpenAI-only `store` afterward.
- `params.chat_template_kwargs`: vLLM/OpenAI-compatible chat-template arguments merged into top-level `api: "openai-completions"` request bodies. For `vllm/nemotron-3-*` with thinking off, OpenClaw automatically sends `enable_thinking: false` and `force_nonempty_content: true`; explicit `chat_template_kwargs` override those defaults, and `extra_body.chat_template_kwargs` still has final precedence.
- `params.chat_template_kwargs`: vLLM/OpenAI-compatible chat-template arguments merged into top-level `api: "openai-completions"` request bodies. For `vllm/nemotron-3-*` with thinking off, OpenClaw automatically sends `enable_thinking: false` and `force_nonempty_content: true`; models with `compat.thinkingFormat: "qwen-chat-template"` map OpenClaw thinking controls to `chat_template_kwargs.enable_thinking` plus `preserve_thinking: true`; explicit `chat_template_kwargs` override generated defaults, and `extra_body.chat_template_kwargs` still has final precedence.
- `params.preserveThinking`: Z.AI-only opt-in for preserved thinking. When enabled and thinking is on, OpenClaw sends `thinking.clear_thinking: false` and replays prior `reasoning_content`; see [Z.AI thinking and preserved thinking](/providers/zai#thinking-and-preserved-thinking).
- `agentRuntime`: default low-level agent runtime policy. Omitted id defaults to OpenClaw Pi. Use `id: "pi"` to force the built-in PI harness, `id: "auto"` to let registered plugin harnesses claim supported models, a registered harness id such as `id: "codex"`, or a supported CLI backend alias such as `id: "claude-cli"`. Set `fallback: "none"` to disable automatic PI fallback. Explicit plugin runtimes such as `codex` fail closed by default unless you set `fallback: "pi"` in the same override scope. Keep model refs canonical as `provider/model`; select Codex, Claude CLI, Gemini CLI, and other execution backends through runtime config instead of legacy runtime provider prefixes. See [Agent runtimes](/concepts/agent-runtimes) for how this differs from provider/model selection.
- Config writers that mutate these fields (for example `/models set`, `/models set-image`, and fallback add/remove commands) save canonical object form and preserve existing fallback lists when possible.

View File

@@ -129,6 +129,27 @@ Use explicit config when:
</Accordion>
<Accordion title="Qwen thinking controls">
For Qwen models served through vLLM, set
`compat.thinkingFormat: "qwen-chat-template"` on the model entry when the
server expects Qwen chat-template kwargs. OpenClaw maps `/think off` to:
```json
{
"chat_template_kwargs": {
"enable_thinking": false,
"preserve_thinking": true
}
}
```
Non-`off` thinking levels send `enable_thinking: true`. If your endpoint
expects DashScope-style top-level flags instead, use
`compat.thinkingFormat: "qwen"` to send `enable_thinking` at the request
root.
</Accordion>
<Accordion title="Nemotron 3 thinking controls">
vLLM/Nemotron 3 can use chat-template kwargs to control whether reasoning is
returned as hidden reasoning or visible answer text. When an OpenClaw session

View File

@@ -17,7 +17,7 @@ export type OpenAICompletionsCompatDefaults = {
supportsReasoningEffort: boolean;
supportsUsageInStreaming: boolean;
maxTokensField: "max_completion_tokens" | "max_tokens";
thinkingFormat: "openai" | "openrouter" | "deepseek" | "zai";
thinkingFormat: "openai" | "openrouter" | "deepseek" | "zai" | "qwen" | "qwen-chat-template";
visibleReasoningDetailTypes: string[];
supportsStrictMode: boolean;
};

View File

@@ -1816,6 +1816,78 @@ describe("openai transport stream", () => {
expect(params.stream_options).toMatchObject({ include_usage: true });
});
it("maps qwen-chat-template thinking compat to vLLM chat template kwargs", () => {
  // Minimal vLLM-served Qwen model whose compat flag opts into chat-template thinking kwargs.
  const model = {
    id: "Qwen/Qwen3-8B",
    name: "Qwen3 8B",
    api: "openai-completions",
    provider: "vllm",
    baseUrl: "http://127.0.0.1:8000/v1",
    reasoning: true,
    input: ["text"],
    cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
    contextWindow: 32768,
    maxTokens: 8192,
    compat: { thinkingFormat: "qwen-chat-template" },
  } as unknown as Model<"openai-completions">;
  const context = {
    systemPrompt: "system",
    messages: [],
    tools: [],
  } as never;
  type KwargsParams = {
    chat_template_kwargs?: { enable_thinking?: unknown; preserve_thinking?: unknown };
  };
  // Build once with thinking off ("none") and once with a non-off level ("medium").
  const buildWithReasoning = (reasoning: string): KwargsParams =>
    buildOpenAICompletionsParams(model, context, { reasoning } as never) as KwargsParams;
  const off = buildWithReasoning("none");
  const on = buildWithReasoning("medium");
  // Thinking off must still send kwargs, with enable_thinking: false.
  expect(off.chat_template_kwargs).toEqual({
    enable_thinking: false,
    preserve_thinking: true,
  });
  expect(off).not.toHaveProperty("reasoning_effort");
  // Any non-off level flips enable_thinking on; preserved thinking stays on for agent loops.
  expect(on.chat_template_kwargs).toEqual({
    enable_thinking: true,
    preserve_thinking: true,
  });
  expect(on).not.toHaveProperty("reasoning_effort");
});
it("maps qwen thinking compat to top-level enable_thinking", () => {
  // DashScope-style endpoints expect enable_thinking at the request root,
  // not nested inside chat_template_kwargs.
  const model = {
    id: "qwen3.6-plus",
    name: "Qwen 3.6 Plus",
    api: "openai-completions",
    provider: "qwen-custom",
    baseUrl: "https://example.com/v1",
    reasoning: true,
    input: ["text"],
    cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
    contextWindow: 32768,
    maxTokens: 8192,
    compat: { thinkingFormat: "qwen" },
  } as unknown as Model<"openai-completions">;
  const context = {
    systemPrompt: "system",
    messages: [],
    tools: [],
  } as never;
  const options = { reasoning: "none" } as never;
  const params = buildOpenAICompletionsParams(model, context, options) as {
    enable_thinking?: unknown;
    reasoning_effort?: unknown;
  };
  expect(params.enable_thinking).toBe(false);
  expect(params).not.toHaveProperty("reasoning_effort");
});
it("enables streaming usage compat for generic providers on native DashScope endpoints", () => {
const params = buildOpenAICompletionsParams(
{

View File

@@ -1631,6 +1631,29 @@ function resolveOpenAICompletionsReasoningEffort(options: OpenAICompletionsOptio
return options?.reasoningEffort ?? options?.reasoning ?? "high";
}
/**
 * Whether the given reasoning effort requests any thinking at all.
 * Normalizes the effort string first so aliases collapse to canonical levels;
 * only the canonical "none" level counts as thinking-disabled.
 */
function isCompletionsThinkingEnabled(effort: string): boolean {
  const normalized = normalizeOpenAIReasoningEffort(effort);
  return normalized !== "none";
}
/**
 * Write Qwen chat-template thinking kwargs onto an outgoing request body.
 *
 * Merges into an existing plain-object `chat_template_kwargs` (arrays and
 * non-objects are discarded), always setting `enable_thinking` to `enabled`
 * while defaulting `preserve_thinking` to `true` only when the caller has not
 * already provided a value for it.
 *
 * @param params  Mutable request body; `params.chat_template_kwargs` is replaced.
 * @param enabled Whether thinking is on for this request.
 */
function setChatTemplateThinking(params: Record<string, unknown>, enabled: boolean): void {
  const current = params.chat_template_kwargs;
  const mergeable =
    typeof current === "object" && current !== null && !Array.isArray(current);
  // Start from a shallow copy of any caller-supplied kwargs so we never mutate input.
  const kwargs: Record<string, unknown> = mergeable
    ? { ...(current as Record<string, unknown>) }
    : {};
  kwargs.enable_thinking = enabled;
  // Respect an explicit caller value (even `false`); otherwise preserve thinking
  // across agent-loop turns by default.
  if (!("preserve_thinking" in kwargs)) {
    kwargs.preserve_thinking = true;
  }
  params.chat_template_kwargs = kwargs;
}
function convertTools(
tools: NonNullable<Context["tools"]>,
compat: ReturnType<typeof getCompat>,
@@ -1814,7 +1837,15 @@ export function buildOpenAICompletionsParams(
fallbackMap: compat.reasoningEffortMap,
})
: undefined;
if (
if (compat.thinkingFormat === "qwen" && model.reasoning && completionsReasoningEffort) {
params.enable_thinking = isCompletionsThinkingEnabled(completionsReasoningEffort);
} else if (
compat.thinkingFormat === "qwen-chat-template" &&
model.reasoning &&
completionsReasoningEffort
) {
setChatTemplateThinking(params, isCompletionsThinkingEnabled(completionsReasoningEffort));
} else if (
compat.thinkingFormat === "openrouter" &&
model.reasoning &&
resolvedCompletionsReasoningEffort