From d6d04f361e1c49cdac81e8371ece13238da74133 Mon Sep 17 00:00:00 2001
From: Vincent Koc <vincentkoc@ieee.org>
Date: Sat, 7 Mar 2026 19:53:02 -0500
Subject: [PATCH] fix(ollama): preserve local limits and native thinking
 fallback (#39292)

* fix(ollama): support thinking field fallback in native stream

* fix(models): honor explicit lower token limits in merge mode

* fix(ollama): prefer streamed content over fallback thinking

* changelog: note Ollama local model fixes
---
 CHANGELOG.md                                  |  1 +
 ...ssing-provider-apikey-from-env-var.test.ts | 31 ++++++-
 src/agents/models-config.ts                   | 38 +++++++--
 src/agents/ollama-stream.test.ts              | 85 ++++++++++++++++++-
 src/agents/ollama-stream.ts                   | 22 +++--
 5 files changed, 155 insertions(+), 22 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 770f9a8b86b..04248fe0d4a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -220,6 +220,7 @@ Docs: https://docs.openclaw.ai
 - Plugins/HTTP route migration diagnostics: rewrite legacy `api.registerHttpHandler(...)` loader failures into actionable migration guidance so doctor/plugin diagnostics point operators to `api.registerHttpRoute(...)` or `registerPluginHttpRoute(...)`. (#36794) Thanks @vincentkoc
 - Doctor/Heartbeat upgrade diagnostics: warn when heartbeat delivery is configured with an implicit `directPolicy` so upgrades pin direct/DM behavior explicitly instead of relying on the current default. (#36789) Thanks @vincentkoc.
 - Agents/current-time UTC anchor: append a machine-readable UTC suffix alongside local `Current time:` lines in shared cron-style prompt contexts so agents can compare UTC-stamped workspace timestamps without doing timezone math. (#32423) thanks @jriff.
+- Ollama/local model handling: preserve explicit lower `contextWindow` / `maxTokens` overrides during merge refresh, fall back to the native Ollama `thinking` field (then `reasoning`) when `content` is empty, and keep that fallback `thinking` / `reasoning` text out of native Ollama streamed replies once real content starts streaming. (#39292) Thanks @vincentkoc.
 - TUI/webchat command-owner scope alignment: treat internal-channel gateway sessions with `operator.admin` as owner-authorized in command auth, restoring cron/gateway/connector tool access for affected TUI/webchat sessions while keeping external channels on identity-based owner checks. (from #35666, #35673, #35704) Thanks @Naylenv, @Octane0411, and @Sid-Qin.
 - Discord/inbound timeout isolation: separate inbound worker timeout tracking from listener timeout budgets so queued Discord replies are no longer dropped when listener watchdog windows expire mid-run. (#36602) Thanks @dutifulbob.
 - Memory/doctor SecretRef handling: treat SecretRef-backed memory-search API keys as configured, and fail embedding setup with explicit unresolved-secret errors instead of crashing. (#36835) Thanks @joshavant.
diff --git a/src/agents/models-config.fills-missing-provider-apikey-from-env-var.test.ts b/src/agents/models-config.fills-missing-provider-apikey-from-env-var.test.ts
index 997ff28065c..75d140867c2 100644
--- a/src/agents/models-config.fills-missing-provider-apikey-from-env-var.test.ts
+++ b/src/agents/models-config.fills-missing-provider-apikey-from-env-var.test.ts
@@ -372,7 +372,7 @@ describe("models-config", () => {
     });
   });
 
-  it("refreshes stale explicit moonshot model capabilities from implicit catalog", async () => {
+  it("refreshes moonshot capabilities while preserving explicit token limits", async () => {
     await withTempHome(async () => {
       await withEnvVar("MOONSHOT_API_KEY", "sk-moonshot-test", async () => {
         const cfg = createMoonshotConfig({ contextWindow: 1024, maxTokens: 256 });
@@ -397,8 +397,8 @@ describe("models-config", () => {
         const kimi = parsed.providers.moonshot?.models?.find((model) => model.id === "kimi-k2.5");
         expect(kimi?.input).toEqual(["text", "image"]);
         expect(kimi?.reasoning).toBe(false);
-        expect(kimi?.contextWindow).toBe(256000);
-        expect(kimi?.maxTokens).toBe(8192);
+        expect(kimi?.contextWindow).toBe(1024);
+        expect(kimi?.maxTokens).toBe(256);
         // Preserve explicit user pricing overrides when refreshing capabilities.
         expect(kimi?.cost?.input).toBe(123);
         expect(kimi?.cost?.output).toBe(456);
@@ -464,4 +464,29 @@ describe("models-config", () => {
       });
     });
   });
+
+  it("falls back to implicit token limits when explicit values are invalid", async () => {
+    await withTempHome(async () => {
+      await withEnvVar("MOONSHOT_API_KEY", "sk-moonshot-test", async () => {
+        const cfg = createMoonshotConfig({ contextWindow: 0, maxTokens: -1 });
+
+        await ensureOpenClawModelsJson(cfg);
+        const parsed = await readGeneratedModelsJson<{
+          providers: Record<
+            string,
+            {
+              models?: Array<{
+                id: string;
+                contextWindow?: number;
+                maxTokens?: number;
+              }>;
+            }
+          >;
+        }>();
+        const kimi = parsed.providers.moonshot?.models?.find((model) => model.id === "kimi-k2.5");
+        expect(kimi?.contextWindow).toBe(256000);
+        expect(kimi?.maxTokens).toBe(8192);
+      });
+    });
+  });
 });
diff --git a/src/agents/models-config.ts b/src/agents/models-config.ts
index a3f1fd19ff3..cb4c76cfe56 100644
--- a/src/agents/models-config.ts
+++ b/src/agents/models-config.ts
@@ -23,10 +23,22 @@ type ModelsConfig = NonNullable<OpenClawConfig["models"]>;
 const DEFAULT_MODE: NonNullable<ModelsConfig["mode"]> = "merge";
 const MODELS_JSON_WRITE_LOCKS = new Map<string, Promise<void>>();
 
-function resolvePreferredTokenLimit(explicitValue: number, implicitValue: number): number {
-  // Keep catalog refresh behavior for stale low values while preserving
-  // intentional larger user overrides (for example Ollama >128k contexts).
-  return explicitValue > implicitValue ? explicitValue : implicitValue;
+function isPositiveFiniteTokenLimit(value: unknown): value is number {
+  return typeof value === "number" && Number.isFinite(value) && value > 0;
+}
+
+function resolvePreferredTokenLimit(params: {
+  explicitPresent: boolean;
+  explicitValue: unknown;
+  implicitValue: unknown;
+}): number | undefined {
+  if (params.explicitPresent && isPositiveFiniteTokenLimit(params.explicitValue)) {
+    return params.explicitValue;
+  }
+  if (isPositiveFiniteTokenLimit(params.implicitValue)) {
+    return params.implicitValue;
+  }
+  return isPositiveFiniteTokenLimit(params.explicitValue) ? params.explicitValue : undefined;
 }
 
 function mergeProviderModels(implicit: ProviderConfig, explicit: ProviderConfig): ProviderConfig {
@@ -65,15 +77,23 @@ function mergeProviderModels(implicit: ProviderConfig, explicit: ProviderConfig)
     // it in their config (key present), honour that value; otherwise fall back
     // to the built-in catalog default so new reasoning models work out of the
     // box without requiring every user to configure it.
+    const contextWindow = resolvePreferredTokenLimit({
+      explicitPresent: "contextWindow" in explicitModel,
+      explicitValue: explicitModel.contextWindow,
+      implicitValue: implicitModel.contextWindow,
+    });
+    const maxTokens = resolvePreferredTokenLimit({
+      explicitPresent: "maxTokens" in explicitModel,
+      explicitValue: explicitModel.maxTokens,
+      implicitValue: implicitModel.maxTokens,
+    });
+
     return {
       ...explicitModel,
       input: implicitModel.input,
       reasoning: "reasoning" in explicitModel ? explicitModel.reasoning : implicitModel.reasoning,
-      contextWindow: resolvePreferredTokenLimit(
-        explicitModel.contextWindow,
-        implicitModel.contextWindow,
-      ),
-      maxTokens: resolvePreferredTokenLimit(explicitModel.maxTokens, implicitModel.maxTokens),
+      ...(contextWindow === undefined ? {} : { contextWindow }),
+      ...(maxTokens === undefined ? {} : { maxTokens }),
     };
   });
 
diff --git a/src/agents/ollama-stream.test.ts b/src/agents/ollama-stream.test.ts
index 79dd8d4a90d..813381b35b1 100644
--- a/src/agents/ollama-stream.test.ts
+++ b/src/agents/ollama-stream.test.ts
@@ -104,7 +104,23 @@ describe("buildAssistantMessage", () => {
     expect(result.usage.totalTokens).toBe(15);
   });
 
-  it("falls back to reasoning when content is empty", () => {
+  it("falls back to thinking when content is empty", () => {
+    const response = {
+      model: "qwen3:32b",
+      created_at: "2026-01-01T00:00:00Z",
+      message: {
+        role: "assistant" as const,
+        content: "",
+        thinking: "Thinking output",
+      },
+      done: true,
+    };
+    const result = buildAssistantMessage(response, modelInfo);
+    expect(result.stopReason).toBe("stop");
+    expect(result.content).toEqual([{ type: "text", text: "Thinking output" }]);
+  });
+
+  it("falls back to reasoning when content and thinking are empty", () => {
     const response = {
       model: "qwen3:32b",
       created_at: "2026-01-01T00:00:00Z",
@@ -397,7 +413,50 @@ describe("createOllamaStreamFn", () => {
     );
   });
 
-  it("accumulates reasoning chunks when content is empty", async () => {
+  it("accumulates thinking chunks when content is empty", async () => {
+    await withMockNdjsonFetch(
+      [
+        '{"model":"m","created_at":"t","message":{"role":"assistant","content":"","thinking":"reasoned"},"done":false}',
+        '{"model":"m","created_at":"t","message":{"role":"assistant","content":"","thinking":" output"},"done":false}',
+        '{"model":"m","created_at":"t","message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":1,"eval_count":2}',
+      ],
+      async () => {
+        const stream = await createOllamaTestStream({ baseUrl: "http://ollama-host:11434" });
+        const events = await collectStreamEvents(stream);
+
+        const doneEvent = events.at(-1);
+        if (!doneEvent || doneEvent.type !== "done") {
+          throw new Error("Expected done event");
+        }
+
+        expect(doneEvent.message.content).toEqual([{ type: "text", text: "reasoned output" }]);
+      },
+    );
+  });
+
+  it("prefers streamed content over earlier thinking chunks", async () => {
+    await withMockNdjsonFetch(
+      [
+        '{"model":"m","created_at":"t","message":{"role":"assistant","content":"","thinking":"internal"},"done":false}',
+        '{"model":"m","created_at":"t","message":{"role":"assistant","content":"final"},"done":false}',
+        '{"model":"m","created_at":"t","message":{"role":"assistant","content":" answer"},"done":false}',
+        '{"model":"m","created_at":"t","message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":1,"eval_count":2}',
+      ],
+      async () => {
+        const stream = await createOllamaTestStream({ baseUrl: "http://ollama-host:11434" });
+        const events = await collectStreamEvents(stream);
+
+        const doneEvent = events.at(-1);
+        if (!doneEvent || doneEvent.type !== "done") {
+          throw new Error("Expected done event");
+        }
+
+        expect(doneEvent.message.content).toEqual([{ type: "text", text: "final answer" }]);
+      },
+    );
+  });
+
+  it("accumulates reasoning chunks when thinking is absent", async () => {
     await withMockNdjsonFetch(
       [
         '{"model":"m","created_at":"t","message":{"role":"assistant","content":"","reasoning":"reasoned"},"done":false}',
@@ -417,4 +476,26 @@ describe("createOllamaStreamFn", () => {
       },
     );
   });
+
+  it("prefers streamed content over earlier reasoning chunks", async () => {
+    await withMockNdjsonFetch(
+      [
+        '{"model":"m","created_at":"t","message":{"role":"assistant","content":"","reasoning":"internal"},"done":false}',
+        '{"model":"m","created_at":"t","message":{"role":"assistant","content":"final"},"done":false}',
+        '{"model":"m","created_at":"t","message":{"role":"assistant","content":" answer"},"done":false}',
+        '{"model":"m","created_at":"t","message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":1,"eval_count":2}',
+      ],
+      async () => {
+        const stream = await createOllamaTestStream({ baseUrl: "http://ollama-host:11434" });
+        const events = await collectStreamEvents(stream);
+
+        const doneEvent = events.at(-1);
+        if (!doneEvent || doneEvent.type !== "done") {
+          throw new Error("Expected done event");
+        }
+
+        expect(doneEvent.message.content).toEqual([{ type: "text", text: "final answer" }]);
+      },
+    );
+  });
 });
diff --git a/src/agents/ollama-stream.ts b/src/agents/ollama-stream.ts
index fdff0b2ae65..4446b03acdf 100644
--- a/src/agents/ollama-stream.ts
+++ b/src/agents/ollama-stream.ts
@@ -185,6 +185,7 @@ interface OllamaChatResponse {
   message: {
     role: "assistant";
     content: string;
+    thinking?: string;
     reasoning?: string;
     tool_calls?: OllamaToolCall[];
   };
@@ -323,10 +324,10 @@ export function buildAssistantMessage(
 ): AssistantMessage {
   const content: (TextContent | ToolCall)[] = [];
 
-  // Qwen 3 (and potentially other reasoning models) may return their final
-  // answer in a `reasoning` field with an empty `content`. Fall back to
-  // `reasoning` so the response isn't silently dropped.
-  const text = response.message.content || response.message.reasoning || "";
+  // Ollama-native reasoning models may leave `content` empty and emit their answer in
+  // `thinking` or `reasoning`. Fall back in that order so replies are not dropped.
+  const text =
+    response.message.content || response.message.thinking || response.message.reasoning || "";
   if (text) {
     content.push({ type: "text", text });
   }
@@ -468,15 +469,20 @@ export function createOllamaStreamFn(
 
         const reader = response.body.getReader();
         let accumulatedContent = "";
+        let fallbackContent = "";
+        let sawContent = false;
         const accumulatedToolCalls: OllamaToolCall[] = [];
         let finalResponse: OllamaChatResponse | undefined;
 
         for await (const chunk of parseNdjsonStream(reader)) {
           if (chunk.message?.content) {
+            sawContent = true;
             accumulatedContent += chunk.message.content;
-          } else if (chunk.message?.reasoning) {
-            // Qwen 3 reasoning mode: content may be empty, output in reasoning
-            accumulatedContent += chunk.message.reasoning;
+          } else if (!sawContent && chunk.message?.thinking) {
+            fallbackContent += chunk.message.thinking;
+          } else if (!sawContent && chunk.message?.reasoning) {
+            // Backward compatibility for older/native variants that still use reasoning.
+            fallbackContent += chunk.message.reasoning;
           }
 
           // Ollama sends tool_calls in intermediate (done:false) chunks,
@@ -495,7 +501,7 @@ export function createOllamaStreamFn(
           throw new Error("Ollama API stream ended without a final response");
         }
 
-        finalResponse.message.content = accumulatedContent;
+        finalResponse.message.content = accumulatedContent || fallbackContent;
         if (accumulatedToolCalls.length > 0) {
           finalResponse.message.tool_calls = accumulatedToolCalls;
         }