fix: enable thinking support for the ollama api (#62712)

Merged via squash.

Prepared head SHA: c0b995035e
Co-authored-by: hoyyeva <63033505+hoyyeva@users.noreply.github.com>
Co-authored-by: BruceMacD <5853428+BruceMacD@users.noreply.github.com>
Reviewed-by: @BruceMacD
This commit is contained in:
Eva H
2026-04-08 13:26:18 -07:00
committed by GitHub
parent 37fb1eb9ad
commit d7bf97adb3
4 changed files with 457 additions and 16 deletions

View File

@@ -31,6 +31,7 @@ Docs: https://docs.openclaw.ai
- Codex CLI: pass OpenClaw's system prompt through Codex's `model_instructions_file` config override so fresh Codex CLI sessions receive the same prompt guidance as Claude CLI sessions.
- Matrix/gateway: wait for Matrix sync readiness before marking startup successful, keep Matrix background handler failures contained, and route fatal Matrix sync stops through channel-level restart handling instead of crashing the whole gateway. (#62779) Thanks @gumadeiras.
- Browser/security: re-run blocked-destination safety checks after interaction-driven main-frame navigations from click, evaluate, hook-triggered click, and batched action flows, so browser interactions cannot bypass the SSRF quarantine when they land on forbidden URLs. (#63226) Thanks @eleqtrizit.
- Providers/Ollama: allow Ollama models using the native `api: "ollama"` path to optionally display thinking output when `/think` is set to a non-off level. (#62712) Thanks @hoyyeva.
## 2026.4.8

View File

@@ -445,4 +445,111 @@ describe("ollama plugin", () => {
expect(payloadSeen?.think).toBe(false);
expect((payloadSeen?.options as Record<string, unknown> | undefined)?.think).toBeUndefined();
});
it("wraps native Ollama payloads with top-level think=true when thinking is enabled", () => {
  const provider = registerProvider();
  let capturedPayload: Record<string, unknown> | undefined;
  // Fake downstream stream fn: hands a representative payload to onPayload so
  // the wrapper's patching can be observed, then records the patched result.
  const innerStreamFn = vi.fn((_model, _context, options) => {
    const payload: Record<string, unknown> = {
      messages: [],
      options: { num_ctx: 65536 },
      stream: true,
    };
    options?.onPayload?.(payload, _model);
    capturedPayload = payload;
    return {} as never;
  });
  const wrapped = provider.wrapStreamFn?.({
    config: {
      models: {
        providers: {
          ollama: { api: "ollama", baseUrl: "http://127.0.0.1:11434", models: [] },
        },
      },
    },
    provider: "ollama",
    modelId: "qwen3.5:9b",
    thinkingLevel: "low",
    model: {
      api: "ollama",
      provider: "ollama",
      id: "qwen3.5:9b",
      baseUrl: "http://127.0.0.1:11434",
      contextWindow: 131_072,
    },
    streamFn: innerStreamFn,
  });
  expect(typeof wrapped).toBe("function");
  void wrapped?.(
    { api: "ollama", provider: "ollama", id: "qwen3.5:9b" } as never,
    {} as never,
    {},
  );
  expect(innerStreamFn).toHaveBeenCalledTimes(1);
  // A non-"off" thinking level must surface as a top-level think=true,
  // never nested inside the options object.
  expect(capturedPayload?.think).toBe(true);
  expect((capturedPayload?.options as Record<string, unknown> | undefined)?.think).toBeUndefined();
});
it("does not set think param when thinkingLevel is undefined", () => {
  const provider = registerProvider();
  let capturedPayload: Record<string, unknown> | undefined;
  // Stub downstream stream fn; surfaces the payload through onPayload so the
  // wrapper's (absence of) patching can be inspected afterwards.
  const innerStreamFn = vi.fn((_model, _context, options) => {
    const payload: Record<string, unknown> = {
      messages: [],
      options: { num_ctx: 65536 },
      stream: true,
    };
    options?.onPayload?.(payload, _model);
    capturedPayload = payload;
    return {} as never;
  });
  const wrapped = provider.wrapStreamFn?.({
    config: {
      models: {
        providers: {
          ollama: { api: "ollama", baseUrl: "http://127.0.0.1:11434", models: [] },
        },
      },
    },
    provider: "ollama",
    modelId: "qwen3.5:9b",
    thinkingLevel: undefined,
    model: {
      api: "ollama",
      provider: "ollama",
      id: "qwen3.5:9b",
      baseUrl: "http://127.0.0.1:11434",
      contextWindow: 131_072,
    },
    streamFn: innerStreamFn,
  });
  expect(typeof wrapped).toBe("function");
  void wrapped?.(
    { api: "ollama", provider: "ollama", id: "qwen3.5:9b" } as never,
    {} as never,
    {},
  );
  expect(innerStreamFn).toHaveBeenCalledTimes(1);
  // With no thinking level configured, the payload must be left untouched.
  expect(capturedPayload?.think).toBeUndefined();
});
});

View File

@@ -0,0 +1,228 @@
import { afterEach, describe, expect, it, vi } from "vitest";
import { buildAssistantMessage, createOllamaStreamFn } from "./stream.js";
/**
 * Builds a minimal Ollama /api/chat response object for tests.
 * Optional `thinking`, `reasoning`, and `tool_calls` keys are only present on
 * the message when the corresponding param is supplied, mirroring how the
 * real API omits absent fields.
 */
function makeOllamaResponse(params: {
  content?: string;
  thinking?: string;
  reasoning?: string;
  tool_calls?: Array<{ function: { name: string; arguments: Record<string, unknown> } }>;
}) {
  const message: {
    role: "assistant";
    content: string;
    thinking?: string;
    reasoning?: string;
    tool_calls?: Array<{ function: { name: string; arguments: Record<string, unknown> } }>;
  } = {
    role: "assistant",
    content: params.content ?? "",
  };
  if (params.thinking != null) {
    message.thinking = params.thinking;
  }
  if (params.reasoning != null) {
    message.reasoning = params.reasoning;
  }
  if (params.tool_calls) {
    message.tool_calls = params.tool_calls;
  }
  return {
    model: "qwen3.5",
    created_at: new Date().toISOString(),
    message,
    done: true,
    prompt_eval_count: 100,
    eval_count: 50,
  };
}
// Shared model descriptor passed to buildAssistantMessage in the tests below.
const MODEL_INFO = { api: "ollama", provider: "ollama", id: "qwen3.5" };
// buildAssistantMessage must surface Ollama's thinking/reasoning text as a
// leading "thinking" content block, and degrade cleanly when it is absent.
describe("buildAssistantMessage", () => {
  it("includes thinking block when response has thinking field", () => {
    const resp = makeOllamaResponse({ thinking: "Let me think about this", content: "The answer is 42" });
    const assistant = buildAssistantMessage(resp, MODEL_INFO);
    expect(assistant.content).toHaveLength(2);
    expect(assistant.content[0]).toEqual({ type: "thinking", thinking: "Let me think about this" });
    expect(assistant.content[1]).toEqual({ type: "text", text: "The answer is 42" });
  });
  it("includes thinking block when response has reasoning field", () => {
    const resp = makeOllamaResponse({ reasoning: "Step by step analysis", content: "Result is 7" });
    const assistant = buildAssistantMessage(resp, MODEL_INFO);
    expect(assistant.content).toHaveLength(2);
    expect(assistant.content[0]).toEqual({ type: "thinking", thinking: "Step by step analysis" });
    expect(assistant.content[1]).toEqual({ type: "text", text: "Result is 7" });
  });
  it("prefers thinking over reasoning when both are present", () => {
    const resp = makeOllamaResponse({
      thinking: "From thinking field",
      reasoning: "From reasoning field",
      content: "Answer",
    });
    // `thinking` wins when both fields arrive on the same message.
    expect(buildAssistantMessage(resp, MODEL_INFO).content[0]).toEqual({
      type: "thinking",
      thinking: "From thinking field",
    });
  });
  it("omits thinking block when no thinking or reasoning field", () => {
    const assistant = buildAssistantMessage(makeOllamaResponse({ content: "Just text" }), MODEL_INFO);
    expect(assistant.content).toHaveLength(1);
    expect(assistant.content[0]).toEqual({ type: "text", text: "Just text" });
  });
  it("omits thinking block when thinking field is empty", () => {
    // An empty-string thinking field must not produce an empty thinking block.
    const assistant = buildAssistantMessage(makeOllamaResponse({ thinking: "", content: "Just text" }), MODEL_INFO);
    expect(assistant.content).toHaveLength(1);
    expect(assistant.content[0]).toEqual({ type: "text", text: "Just text" });
  });
});
// Verifies that createOllamaStreamFn translates Ollama's native `thinking`
// chunk field into thinking_start / thinking_delta / thinking_end events,
// interleaved correctly with the text block events.
describe("createOllamaStreamFn thinking events", () => {
// Restore the real global fetch after each test (stubbed via vi.stubGlobal).
afterEach(() => vi.unstubAllGlobals());
// Encodes the chunks as newline-delimited JSON and serves them in a single
// ReadableStream enqueue, mimicking an /api/chat streaming response body.
function makeNdjsonBody(chunks: Array<Record<string, unknown>>): ReadableStream<Uint8Array> {
const encoder = new TextEncoder();
const lines = chunks.map((c) => JSON.stringify(c) + "\n").join("");
return new ReadableStream({
start(controller) {
controller.enqueue(encoder.encode(lines));
controller.close();
},
});
}
it("emits thinking_start, thinking_delta, and thinking_end events for thinking content", async () => {
// Fixture: two thinking-only chunks, one text chunk, then the terminal
// done chunk carrying usage counts.
const thinkingChunks = [
{
model: "qwen3.5",
created_at: "2026-01-01T00:00:00Z",
message: { role: "assistant", content: "", thinking: "Step 1" },
done: false,
},
{
model: "qwen3.5",
created_at: "2026-01-01T00:00:01Z",
message: { role: "assistant", content: "", thinking: " and step 2" },
done: false,
},
{
model: "qwen3.5",
created_at: "2026-01-01T00:00:02Z",
message: { role: "assistant", content: "The answer", thinking: "" },
done: false,
},
{
model: "qwen3.5",
created_at: "2026-01-01T00:00:03Z",
message: { role: "assistant", content: "" },
done: true,
done_reason: "stop",
prompt_eval_count: 10,
eval_count: 5,
},
];
const body = makeNdjsonBody(thinkingChunks);
// Stub fetch so the stream fn consumes the fixture instead of the network.
const fetchMock = vi.fn().mockResolvedValue({
ok: true,
body,
});
vi.stubGlobal("fetch", fetchMock);
const streamFn = createOllamaStreamFn("http://localhost:11434");
const stream = streamFn(
{ api: "ollama", provider: "ollama", id: "qwen3.5", contextWindow: 65536 } as never,
{ messages: [{ role: "user", content: "test" }] } as never,
{},
);
// Drain the async event stream so all assertions run on the full sequence.
const events: Array<{ type: string; [key: string]: unknown }> = [];
for await (const event of stream as AsyncIterable<{ type: string; [key: string]: unknown }>) {
events.push(event);
}
const eventTypes = events.map((e) => e.type);
expect(eventTypes).toContain("thinking_start");
expect(eventTypes).toContain("thinking_delta");
expect(eventTypes).toContain("thinking_end");
expect(eventTypes).toContain("text_start");
expect(eventTypes).toContain("text_delta");
expect(eventTypes).toContain("done");
// thinking_start comes before text_start
const thinkingStartIndex = eventTypes.indexOf("thinking_start");
const textStartIndex = eventTypes.indexOf("text_start");
expect(thinkingStartIndex).toBeLessThan(textStartIndex);
// thinking_end comes before text_start
const thinkingEndIndex = eventTypes.indexOf("thinking_end");
expect(thinkingEndIndex).toBeLessThan(textStartIndex);
// Thinking deltas have correct content
const thinkingDeltas = events.filter((e) => e.type === "thinking_delta");
expect(thinkingDeltas).toHaveLength(2);
expect(thinkingDeltas[0].delta).toBe("Step 1");
expect(thinkingDeltas[1].delta).toBe(" and step 2");
// Content index: thinking at 0, text at 1
const thinkingStart = events.find((e) => e.type === "thinking_start");
expect(thinkingStart?.contentIndex).toBe(0);
const textStart = events.find((e) => e.type === "text_start");
expect(textStart?.contentIndex).toBe(1);
// Final message has thinking block
const done = events.find((e) => e.type === "done") as { message?: { content: unknown[] } };
const content = done?.message?.content ?? [];
expect(content[0]).toMatchObject({ type: "thinking", thinking: "Step 1 and step 2" });
expect(content[1]).toMatchObject({ type: "text", text: "The answer" });
});
// Regression guard: a plain text-only stream must not grow thinking events,
// and its text block must keep content index 0.
it("streams without thinking events when no thinking content is present", async () => {
const chunks = [
{
model: "qwen3.5",
created_at: "2026-01-01T00:00:00Z",
message: { role: "assistant", content: "Hello" },
done: false,
},
{
model: "qwen3.5",
created_at: "2026-01-01T00:00:01Z",
message: { role: "assistant", content: "" },
done: true,
done_reason: "stop",
prompt_eval_count: 10,
eval_count: 5,
},
];
const body = makeNdjsonBody(chunks);
vi.stubGlobal("fetch", vi.fn().mockResolvedValue({ ok: true, body }));
const streamFn = createOllamaStreamFn("http://localhost:11434");
const stream = streamFn(
{ api: "ollama", provider: "ollama", id: "qwen3.5", contextWindow: 65536 } as never,
{ messages: [{ role: "user", content: "test" }] } as never,
{},
);
const events: Array<{ type: string }> = [];
for await (const event of stream as AsyncIterable<{ type: string }>) {
events.push(event);
}
const eventTypes = events.map((e) => e.type);
expect(eventTypes).not.toContain("thinking_start");
expect(eventTypes).not.toContain("thinking_delta");
expect(eventTypes).not.toContain("thinking_end");
expect(eventTypes).toContain("text_start");
expect(eventTypes).toContain("text_delta");
expect(eventTypes).toContain("done");
// Text content index should be 0 (no thinking block)
const textStart = events.find((e) => e.type === "text_start") as { contentIndex?: number };
expect(textStart?.contentIndex).toBe(0);
});
});

View File

@@ -4,6 +4,7 @@ import type {
AssistantMessage,
StopReason,
TextContent,
ThinkingContent,
ToolCall,
Tool,
Usage,
@@ -148,14 +149,14 @@ export function wrapOllamaCompatNumCtx(baseFn: StreamFn | undefined, numCtx: num
});
}
function createOllamaThinkingOffWrapper(baseFn: StreamFn | undefined): StreamFn {
function createOllamaThinkingWrapper(baseFn: StreamFn | undefined, think: boolean): StreamFn {
const streamFn = baseFn ?? streamSimple;
return (model, context, options) => {
if (model.api !== "ollama") {
return streamFn(model, context, options);
}
return streamWithPayloadPatch(streamFn, model, context, options, (payloadRecord) => {
payloadRecord.think = false;
payloadRecord.think = think;
});
};
}
@@ -197,7 +198,11 @@ export function createConfiguredOllamaCompatStreamWrapper(
}
if (ctx.thinkingLevel === "off") {
streamFn = createOllamaThinkingOffWrapper(streamFn);
streamFn = createOllamaThinkingWrapper(streamFn, false);
} else if (ctx.thinkingLevel) {
// Any non-off ThinkLevel (minimal, low, medium, high, xhigh, adaptive)
// should enable Ollama's native thinking mode.
streamFn = createOllamaThinkingWrapper(streamFn, true);
}
if (normalizeProviderId(ctx.provider) === "ollama" && isOllamaCloudKimiModelRef(ctx.modelId)) {
@@ -511,7 +516,11 @@ export function buildAssistantMessage(
response: OllamaChatResponse,
modelInfo: StreamModelDescriptor,
): AssistantMessage {
const content: (TextContent | ToolCall)[] = [];
const content: (TextContent | ThinkingContent | ToolCall)[] = [];
const thinking = response.message.thinking ?? response.message.reasoning ?? "";
if (thinking) {
content.push({ type: "thinking", thinking });
}
const text = response.message.content || "";
if (text) {
content.push({ type: "text", text });
@@ -654,39 +663,78 @@ export function createOllamaStreamFn(
const reader = response.body.getReader();
let accumulatedContent = "";
let accumulatedThinking = "";
const accumulatedToolCalls: OllamaToolCall[] = [];
let finalResponse: OllamaChatResponse | undefined;
const modelInfo = { api: model.api, provider: model.provider, id: model.id };
let streamStarted = false;
let thinkingStarted = false;
let thinkingEnded = false;
let textBlockStarted = false;
let textBlockClosed = false;
// Content index tracking: thinking block (if present) is index 0,
// text block follows at index 1 (or 0 when no thinking).
const textContentIndex = () => (thinkingStarted ? 1 : 0);
const buildCurrentContent = (): (TextContent | ThinkingContent | ToolCall)[] => {
const parts: (TextContent | ThinkingContent | ToolCall)[] = [];
if (accumulatedThinking) {
parts.push({
type: "thinking",
thinking: accumulatedThinking,
});
}
if (accumulatedContent) {
parts.push({ type: "text", text: accumulatedContent });
}
return parts;
};
const closeThinkingBlock = () => {
if (!thinkingStarted || thinkingEnded) {
return;
}
thinkingEnded = true;
const partial = buildStreamAssistantMessage({
model: modelInfo,
content: buildCurrentContent(),
stopReason: "stop",
usage: buildUsageWithNoCost({}),
});
stream.push({
type: "thinking_end",
contentIndex: 0,
content: accumulatedThinking,
partial,
});
};
const closeTextBlock = () => {
if (!streamStarted || textBlockClosed) {
if (!textBlockStarted || textBlockClosed) {
return;
}
textBlockClosed = true;
const partial = buildStreamAssistantMessage({
model: modelInfo,
content: [{ type: "text", text: accumulatedContent }],
content: buildCurrentContent(),
stopReason: "stop",
usage: buildUsageWithNoCost({}),
});
stream.push({
type: "text_end",
contentIndex: 0,
contentIndex: textContentIndex(),
content: accumulatedContent,
partial,
});
};
for await (const chunk of parseNdjsonStream(reader)) {
if (chunk.message?.content) {
const delta = chunk.message.content;
// Handle thinking/reasoning deltas from Ollama's native think mode.
const thinkingDelta = chunk.message?.thinking ?? chunk.message?.reasoning;
if (thinkingDelta) {
if (!streamStarted) {
streamStarted = true;
// Emit start/text_start with an empty partial before accumulating
// the first delta, matching the Anthropic/OpenAI provider contract.
const emptyPartial = buildStreamAssistantMessage({
model: modelInfo,
content: [],
@@ -694,19 +742,72 @@ export function createOllamaStreamFn(
usage: buildUsageWithNoCost({}),
});
stream.push({ type: "start", partial: emptyPartial });
stream.push({ type: "text_start", contentIndex: 0, partial: emptyPartial });
}
if (!thinkingStarted) {
thinkingStarted = true;
const partial = buildStreamAssistantMessage({
model: modelInfo,
content: buildCurrentContent(),
stopReason: "stop",
usage: buildUsageWithNoCost({}),
});
stream.push({ type: "thinking_start", contentIndex: 0, partial });
}
accumulatedThinking += thinkingDelta;
const partial = buildStreamAssistantMessage({
model: modelInfo,
content: buildCurrentContent(),
stopReason: "stop",
usage: buildUsageWithNoCost({}),
});
stream.push({
type: "thinking_delta",
contentIndex: 0,
delta: thinkingDelta,
partial,
});
}
if (chunk.message?.content) {
const delta = chunk.message.content;
// Transition from thinking to text: close the thinking block first.
if (thinkingStarted && !thinkingEnded) {
closeThinkingBlock();
}
if (!streamStarted) {
streamStarted = true;
const emptyPartial = buildStreamAssistantMessage({
model: modelInfo,
content: [],
stopReason: "stop",
usage: buildUsageWithNoCost({}),
});
stream.push({ type: "start", partial: emptyPartial });
}
if (!textBlockStarted) {
textBlockStarted = true;
const partial = buildStreamAssistantMessage({
model: modelInfo,
content: buildCurrentContent(),
stopReason: "stop",
usage: buildUsageWithNoCost({}),
});
stream.push({ type: "text_start", contentIndex: textContentIndex(), partial });
}
accumulatedContent += delta;
const partial = buildStreamAssistantMessage({
model: modelInfo,
content: [{ type: "text", text: accumulatedContent }],
content: buildCurrentContent(),
stopReason: "stop",
usage: buildUsageWithNoCost({}),
});
stream.push({ type: "text_delta", contentIndex: 0, delta, partial });
stream.push({ type: "text_delta", contentIndex: textContentIndex(), delta, partial });
}
if (chunk.message?.tool_calls) {
closeThinkingBlock();
closeTextBlock();
accumulatedToolCalls.push(...chunk.message.tool_calls);
}
@@ -721,13 +822,17 @@ export function createOllamaStreamFn(
}
finalResponse.message.content = accumulatedContent;
if (accumulatedThinking) {
finalResponse.message.thinking = accumulatedThinking;
}
if (accumulatedToolCalls.length > 0) {
finalResponse.message.tool_calls = accumulatedToolCalls;
}
const assistantMessage = buildAssistantMessage(finalResponse, modelInfo);
// Close the text block if we emitted any text_delta events.
// Close any open blocks before emitting the done event.
closeThinkingBlock();
closeTextBlock();
stream.push({