Mirror of https://github.com/openclaw/openclaw.git, synced 2026-05-06 06:40:44 +00:00.
fix(ollama): forward think:false for qwen3 chat requests (#69967)
Forward top-level Ollama think flags on native /api/chat requests so --thinking off sends think:false.

Thanks @WZH8898.
@@ -14,6 +14,7 @@ Docs: https://docs.openclaw.ai

### Fixes

- Ollama: forward OpenClaw thinking control to native `/api/chat` requests as top-level `think`, so `/think off` and `openclaw agent --thinking off` suppress thinking on models such as qwen3 instead of idling until the watchdog fires (a request-body sketch follows this hunk). Fixes #69902. (#69967) Thanks @WZH8898.
- Memory-core/dreaming: suppress the startup-only "managed dreaming cron unavailable" warning while the cron service is still attaching, but preserve the runtime warning if cron genuinely remains unavailable. Fixes #69939. (#69941) Thanks @Sanjays2402.
- Mattermost: suppress reasoning-only payloads even when they arrive as blockquoted `> Reasoning:` text, preventing `/reasoning on` from leaking thinking into channel posts. (#69927) Thanks @lawrence3699.
- Discord: read `channel.parentId` through a safe accessor in the slash-command, reaction, and model-picker paths so partial `GuildThreadChannel` prototype getters no longer throw `Cannot access rawData on partial Channel` when commands like `/new` run from inside a thread. Fixes #69861. (#69908) Thanks @neeravmakwana.
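For context on the Ollama entry above, the change is about where the flag lives in the request body. Here is a minimal TypeScript sketch of the native `/api/chat` payload with thinking suppressed; the concrete values and the `stream` field are illustrative assumptions — the only thing the fix itself guarantees is that `think` sits at the top level rather than inside `options`:

```ts
// Illustrative request body for Ollama's native /api/chat endpoint.
// Values are hypothetical; the fixed behavior is `think` at top level.
const chatRequest = {
  model: "qwen3:32b",
  messages: [{ role: "user", content: "hello" }],
  stream: true, // assumed default; not part of this fix
  think: false, // `--thinking off` now lands here, not under `options`
  options: { num_ctx: 131072 },
};
```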
@@ -463,6 +463,8 @@ For the full setup and behavior details, see [Ollama Web Search](/tools/ollama-s

<Accordion title="Streaming configuration">

OpenClaw's Ollama integration uses the **native Ollama API** (`/api/chat`) by default, which fully supports streaming and tool calling simultaneously. No special configuration is needed.

For native `/api/chat` requests, OpenClaw also forwards thinking control directly to Ollama: `/think off` and `openclaw agent --thinking off` send top-level `think: false`, while non-`off` thinking levels send `think: true`.

<Tip>
If you need to use the OpenAI-compatible endpoint, see the "Legacy OpenAI-compatible mode" section above. Streaming and tool calling may not work simultaneously in that mode.
</Tip>
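The docs paragraph above maps OpenClaw thinking levels onto a single boolean. A hedged sketch of that mapping, with the level names taken from the code comment later in this commit; the real resolution logic in OpenClaw may differ:

```ts
// Sketch only: every non-"off" level enables Ollama's native thinking.
type ThinkLevel =
  | "off" | "minimal" | "low" | "medium"
  | "high" | "xhigh" | "adaptive" | "max";

function thinkFlagFor(level?: ThinkLevel): boolean | undefined {
  if (level === undefined) return undefined; // no flag forwarded at all
  return level !== "off"; // "off" -> think:false, otherwise think:true
}
```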
@@ -96,6 +96,112 @@ describe("createConfiguredOllamaCompatStreamWrapper", () => {
      options: { num_ctx: 262144 },
    });
  });

  it("forwards think=false on native Ollama chat requests when thinking is off", async () => {
    await withMockNdjsonFetch(
      [
        '{"model":"m","created_at":"t","message":{"role":"assistant","content":"ok"},"done":false}',
        '{"model":"m","created_at":"t","message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":1,"eval_count":1}',
      ],
      async (fetchMock) => {
        const baseStreamFn = createOllamaStreamFn("http://ollama-host:11434");
        const model = {
          api: "ollama",
          provider: "ollama",
          id: "qwen3:32b",
          contextWindow: 131072,
        };

        const wrapped = createConfiguredOllamaCompatStreamWrapper({
          provider: "ollama",
          modelId: "qwen3:32b",
          model,
          streamFn: baseStreamFn,
          thinkingLevel: "off",
        } as never);
        if (!wrapped) {
          throw new Error("Expected wrapped Ollama stream function");
        }

        const stream = await Promise.resolve(
          wrapped(
            model as never,
            {
              messages: [{ role: "user", content: "hello" }],
            } as never,
            {} as never,
          ),
        );

        await collectStreamEvents(stream);

        const requestInit = getGuardedFetchCall(fetchMock).init ?? {};
        if (typeof requestInit.body !== "string") {
          throw new Error("Expected string request body");
        }
        const requestBody = JSON.parse(requestInit.body) as {
          think?: boolean;
          options?: { think?: boolean; num_ctx?: number };
        };
        expect(requestBody.think).toBe(false);
        expect(requestBody.options?.think).toBeUndefined();
        expect(requestBody.options?.num_ctx).toBe(131072);
      },
    );
  });

  it("forwards think=true on native Ollama chat requests when thinking is enabled", async () => {
    await withMockNdjsonFetch(
      [
        '{"model":"m","created_at":"t","message":{"role":"assistant","content":"ok"},"done":false}',
        '{"model":"m","created_at":"t","message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":1,"eval_count":1}',
      ],
      async (fetchMock) => {
        const baseStreamFn = createOllamaStreamFn("http://ollama-host:11434");
        const model = {
          api: "ollama",
          provider: "ollama",
          id: "qwen3:32b",
          contextWindow: 131072,
        };

        const wrapped = createConfiguredOllamaCompatStreamWrapper({
          provider: "ollama",
          modelId: "qwen3:32b",
          model,
          streamFn: baseStreamFn,
          thinkingLevel: "low",
        } as never);
        if (!wrapped) {
          throw new Error("Expected wrapped Ollama stream function");
        }

        const stream = await Promise.resolve(
          wrapped(
            model as never,
            {
              messages: [{ role: "user", content: "hello" }],
            } as never,
            {} as never,
          ),
        );

        await collectStreamEvents(stream);

        const requestInit = getGuardedFetchCall(fetchMock).init ?? {};
        if (typeof requestInit.body !== "string") {
          throw new Error("Expected string request body");
        }
        const requestBody = JSON.parse(requestInit.body) as {
          think?: boolean;
          options?: { think?: boolean; num_ctx?: number };
        };
        expect(requestBody.think).toBe(true);
        expect(requestBody.options?.think).toBeUndefined();
        expect(requestBody.options?.num_ctx).toBe(131072);
      },
    );
  });
});

describe("convertToOllamaMessages", () => {
@@ -153,14 +153,10 @@ export function wrapOllamaCompatNumCtx(baseFn: StreamFn | undefined, numCtx: num
 
 function createOllamaThinkingWrapper(baseFn: StreamFn | undefined, think: boolean): StreamFn {
   const streamFn = baseFn ?? streamSimple;
-  return (model, context, options) => {
-    if (model.api !== "ollama") {
-      return streamFn(model, context, options);
-    }
-    return streamWithPayloadPatch(streamFn, model, context, options, (payloadRecord) => {
+  return (model, context, options) =>
+    streamWithPayloadPatch(streamFn, model, context, options, (payloadRecord) => {
       payloadRecord.think = think;
     });
-  };
 }
 
 function resolveOllamaCompatNumCtx(model: ProviderRuntimeModel): number {
@@ -178,6 +174,7 @@ export function createConfiguredOllamaCompatStreamWrapper(
   let streamFn = ctx.streamFn;
   const model = ctx.model;
   let injectNumCtx = false;
+  const isNativeOllamaTransport = model?.api === "ollama";
 
   if (model) {
     const providerId =
@@ -199,9 +196,9 @@ export function createConfiguredOllamaCompatStreamWrapper(
     streamFn = wrapOllamaCompatNumCtx(streamFn, resolveOllamaCompatNumCtx(model));
   }
 
-  if (ctx.thinkingLevel === "off") {
+  if (isNativeOllamaTransport && ctx.thinkingLevel === "off") {
     streamFn = createOllamaThinkingWrapper(streamFn, false);
-  } else if (ctx.thinkingLevel) {
+  } else if (isNativeOllamaTransport && ctx.thinkingLevel) {
     // Any non-off ThinkLevel (minimal, low, medium, high, xhigh, adaptive, max)
     // should enable Ollama's native thinking mode.
     streamFn = createOllamaThinkingWrapper(streamFn, true);
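`streamWithPayloadPatch` is used but not defined in this diff. A hedged sketch of the payload-patching idea it appears to implement — an assumed shape, not OpenClaw's actual helper:

```ts
// Hypothetical sketch of the payload-patching idea behind
// streamWithPayloadPatch; NOT OpenClaw's actual helper.
type PayloadPatch = (payload: Record<string, unknown>) => void;

function patchJsonBody(init: RequestInit, patch: PayloadPatch): RequestInit {
  if (typeof init.body !== "string") {
    return init; // only JSON string bodies are patched
  }
  const payload = JSON.parse(init.body) as Record<string, unknown>;
  patch(payload); // e.g. (p) => { p.think = false; }
  return { ...init, body: JSON.stringify(payload) };
}
```

Patching at this layer is what puts `think` at the top level of the JSON body, next to `model` and `messages`, rather than nested under `options` — exactly what the tests above assert.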