fix(ollama): preserve local limits and native thinking fallback (#39292)

* fix(ollama): support thinking field fallback in native stream

* fix(models): honor explicit lower token limits in merge mode

* fix(ollama): prefer streamed content over fallback thinking

* changelog: note Ollama local model fixes
This commit is contained in:
Vincent Koc
2026-03-07 19:53:02 -05:00
committed by GitHub
parent 5edcab2eee
commit d6d04f361e
5 changed files with 155 additions and 22 deletions

View File

@@ -220,6 +220,7 @@ Docs: https://docs.openclaw.ai
- Plugins/HTTP route migration diagnostics: rewrite legacy `api.registerHttpHandler(...)` loader failures into actionable migration guidance so doctor/plugin diagnostics point operators to `api.registerHttpRoute(...)` or `registerPluginHttpRoute(...)`. (#36794) Thanks @vincentkoc.
- Doctor/Heartbeat upgrade diagnostics: warn when heartbeat delivery is configured with an implicit `directPolicy` so upgrades pin direct/DM behavior explicitly instead of relying on the current default. (#36789) Thanks @vincentkoc.
- Agents/current-time UTC anchor: append a machine-readable UTC suffix alongside local `Current time:` lines in shared cron-style prompt contexts so agents can compare UTC-stamped workspace timestamps without doing timezone math. (#32423) Thanks @jriff.
- Ollama/local model handling: preserve explicit lower `contextWindow` / `maxTokens` overrides during merge refresh, and keep native Ollama streamed replies from surfacing fallback `thinking` / `reasoning` text once real content starts streaming. (#39292) Thanks @vincentkoc.
- TUI/webchat command-owner scope alignment: treat internal-channel gateway sessions with `operator.admin` as owner-authorized in command auth, restoring cron/gateway/connector tool access for affected TUI/webchat sessions while keeping external channels on identity-based owner checks. (from #35666, #35673, #35704) Thanks @Naylenv, @Octane0411, and @Sid-Qin.
- Discord/inbound timeout isolation: separate inbound worker timeout tracking from listener timeout budgets so queued Discord replies are no longer dropped when listener watchdog windows expire mid-run. (#36602) Thanks @dutifulbob.
- Memory/doctor SecretRef handling: treat SecretRef-backed memory-search API keys as configured, and fail embedding setup with explicit unresolved-secret errors instead of crashing. (#36835) Thanks @joshavant.

View File

@@ -372,7 +372,7 @@ describe("models-config", () => {
});
});
it("refreshes stale explicit moonshot model capabilities from implicit catalog", async () => {
it("refreshes moonshot capabilities while preserving explicit token limits", async () => {
await withTempHome(async () => {
await withEnvVar("MOONSHOT_API_KEY", "sk-moonshot-test", async () => {
const cfg = createMoonshotConfig({ contextWindow: 1024, maxTokens: 256 });
@@ -397,8 +397,8 @@ describe("models-config", () => {
const kimi = parsed.providers.moonshot?.models?.find((model) => model.id === "kimi-k2.5");
expect(kimi?.input).toEqual(["text", "image"]);
expect(kimi?.reasoning).toBe(false);
expect(kimi?.contextWindow).toBe(256000);
expect(kimi?.maxTokens).toBe(8192);
expect(kimi?.contextWindow).toBe(1024);
expect(kimi?.maxTokens).toBe(256);
// Preserve explicit user pricing overrides when refreshing capabilities.
expect(kimi?.cost?.input).toBe(123);
expect(kimi?.cost?.output).toBe(456);
@@ -464,4 +464,29 @@ describe("models-config", () => {
});
});
});
// Non-positive explicit limits (contextWindow: 0, maxTokens: -1) do not count as
// intentional overrides, so the merge falls back to the implicit catalog values.
// NOTE(review): withTempHome / withEnvVar / createMoonshotConfig are project test
// helpers — presumably they isolate HOME and env state; verify in their module.
it("falls back to implicit token limits when explicit values are invalid", async () => {
await withTempHome(async () => {
await withEnvVar("MOONSHOT_API_KEY", "sk-moonshot-test", async () => {
// Explicit config carries invalid (non-positive) token limits.
const cfg = createMoonshotConfig({ contextWindow: 0, maxTokens: -1 });
await ensureOpenClawModelsJson(cfg);
const parsed = await readGeneratedModelsJson<{
providers: Record<
string,
{
models?: Array<{
id: string;
contextWindow?: number;
maxTokens?: number;
}>;
}
>;
}>();
const kimi = parsed.providers.moonshot?.models?.find((model) => model.id === "kimi-k2.5");
// Catalog defaults win because the explicit values are not positive finite numbers.
expect(kimi?.contextWindow).toBe(256000);
expect(kimi?.maxTokens).toBe(8192);
});
});
});
});

View File

@@ -23,10 +23,22 @@ type ModelsConfig = NonNullable<OpenClawConfig["models"]>;
const DEFAULT_MODE: NonNullable<ModelsConfig["mode"]> = "merge";
const MODELS_JSON_WRITE_LOCKS = new Map<string, Promise<void>>();
function resolvePreferredTokenLimit(explicitValue: number, implicitValue: number): number {
// Keep catalog refresh behavior for stale low values while preserving
// intentional larger user overrides (for example Ollama >128k contexts).
return explicitValue > implicitValue ? explicitValue : implicitValue;
/**
 * Type guard: true only for real numbers that are finite and strictly positive.
 * Rejects NaN, ±Infinity, zero, negatives, and every non-number value, so it is
 * safe to feed raw (unknown) config values straight into this check.
 */
function isPositiveFiniteTokenLimit(value: unknown): value is number {
  if (typeof value !== "number") {
    return false;
  }
  return Number.isFinite(value) && value > 0;
}
/**
 * Pick the token limit (contextWindow / maxTokens) to keep after a merge.
 *
 * Precedence:
 *   1. an explicitly configured usable value (key present AND positive finite),
 *   2. otherwise the usable implicit catalog value,
 *   3. otherwise a usable explicit value even when the key was not marked present,
 *   4. otherwise undefined (caller omits the field entirely).
 *
 * "Usable" means a finite number strictly greater than zero.
 */
function resolvePreferredTokenLimit(params: {
  explicitPresent: boolean;
  explicitValue: unknown;
  implicitValue: unknown;
}): number | undefined {
  // Local usability guard so each precedence branch reads uniformly.
  const usable = (candidate: unknown): candidate is number =>
    typeof candidate === "number" && Number.isFinite(candidate) && candidate > 0;

  const { explicitPresent, explicitValue, implicitValue } = params;
  if (explicitPresent && usable(explicitValue)) {
    return explicitValue;
  }
  if (usable(implicitValue)) {
    return implicitValue;
  }
  return usable(explicitValue) ? explicitValue : undefined;
}
function mergeProviderModels(implicit: ProviderConfig, explicit: ProviderConfig): ProviderConfig {
@@ -65,15 +77,23 @@ function mergeProviderModels(implicit: ProviderConfig, explicit: ProviderConfig)
// it in their config (key present), honour that value; otherwise fall back
// to the built-in catalog default so new reasoning models work out of the
// box without requiring every user to configure it.
const contextWindow = resolvePreferredTokenLimit({
explicitPresent: "contextWindow" in explicitModel,
explicitValue: explicitModel.contextWindow,
implicitValue: implicitModel.contextWindow,
});
const maxTokens = resolvePreferredTokenLimit({
explicitPresent: "maxTokens" in explicitModel,
explicitValue: explicitModel.maxTokens,
implicitValue: implicitModel.maxTokens,
});
return {
...explicitModel,
input: implicitModel.input,
reasoning: "reasoning" in explicitModel ? explicitModel.reasoning : implicitModel.reasoning,
contextWindow: resolvePreferredTokenLimit(
explicitModel.contextWindow,
implicitModel.contextWindow,
),
maxTokens: resolvePreferredTokenLimit(explicitModel.maxTokens, implicitModel.maxTokens),
...(contextWindow === undefined ? {} : { contextWindow }),
...(maxTokens === undefined ? {} : { maxTokens }),
};
});

View File

@@ -104,7 +104,23 @@ describe("buildAssistantMessage", () => {
expect(result.usage.totalTokens).toBe(15);
});
it("falls back to reasoning when content is empty", () => {
// A non-streaming response with empty `content` but a populated `thinking`
// field must surface the thinking text as the assistant reply instead of
// producing an empty message.
it("falls back to thinking when content is empty", () => {
const response = {
model: "qwen3:32b",
created_at: "2026-01-01T00:00:00Z",
message: {
role: "assistant" as const,
content: "",
thinking: "Thinking output",
},
done: true,
};
const result = buildAssistantMessage(response, modelInfo);
expect(result.stopReason).toBe("stop");
// The thinking text is promoted into the text content block.
expect(result.content).toEqual([{ type: "text", text: "Thinking output" }]);
});
it("falls back to reasoning when content and thinking are empty", () => {
const response = {
model: "qwen3:32b",
created_at: "2026-01-01T00:00:00Z",
@@ -397,7 +413,50 @@ describe("createOllamaStreamFn", () => {
);
});
it("accumulates reasoning chunks when content is empty", async () => {
// Streaming case: every chunk carries text only in `thinking` and `content`
// stays empty, so the concatenated thinking chunks become the final message.
// NOTE(review): withMockNdjsonFetch / createOllamaTestStream / collectStreamEvents
// are project test helpers — presumably they stub fetch with the NDJSON lines
// below; verify in their module.
it("accumulates thinking chunks when content is empty", async () => {
await withMockNdjsonFetch(
[
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"","thinking":"reasoned"},"done":false}',
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"","thinking":" output"},"done":false}',
'{"model":"m","created_at":"t","message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":1,"eval_count":2}',
],
async () => {
const stream = await createOllamaTestStream({ baseUrl: "http://ollama-host:11434" });
const events = await collectStreamEvents(stream);
const doneEvent = events.at(-1);
if (!doneEvent || doneEvent.type !== "done") {
throw new Error("Expected done event");
}
// "reasoned" + " output" concatenated in stream order.
expect(doneEvent.message.content).toEqual([{ type: "text", text: "reasoned output" }]);
},
);
});
// Once real `content` starts streaming, earlier `thinking` chunks are treated
// as internal reasoning and must not leak into the final reply.
it("prefers streamed content over earlier thinking chunks", async () => {
await withMockNdjsonFetch(
[
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"","thinking":"internal"},"done":false}',
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"final"},"done":false}',
'{"model":"m","created_at":"t","message":{"role":"assistant","content":" answer"},"done":false}',
'{"model":"m","created_at":"t","message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":1,"eval_count":2}',
],
async () => {
const stream = await createOllamaTestStream({ baseUrl: "http://ollama-host:11434" });
const events = await collectStreamEvents(stream);
const doneEvent = events.at(-1);
if (!doneEvent || doneEvent.type !== "done") {
throw new Error("Expected done event");
}
// Only the real content survives; "internal" thinking is dropped.
expect(doneEvent.message.content).toEqual([{ type: "text", text: "final answer" }]);
},
);
});
it("accumulates reasoning chunks when thinking is absent", async () => {
await withMockNdjsonFetch(
[
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"","reasoning":"reasoned"},"done":false}',
@@ -417,4 +476,26 @@ describe("createOllamaStreamFn", () => {
},
);
});
// Same guarantee for the legacy `reasoning` field: once real `content` streams,
// earlier reasoning chunks must not leak into the final reply.
it("prefers streamed content over earlier reasoning chunks", async () => {
await withMockNdjsonFetch(
[
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"","reasoning":"internal"},"done":false}',
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"final"},"done":false}',
'{"model":"m","created_at":"t","message":{"role":"assistant","content":" answer"},"done":false}',
'{"model":"m","created_at":"t","message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":1,"eval_count":2}',
],
async () => {
const stream = await createOllamaTestStream({ baseUrl: "http://ollama-host:11434" });
const events = await collectStreamEvents(stream);
const doneEvent = events.at(-1);
if (!doneEvent || doneEvent.type !== "done") {
throw new Error("Expected done event");
}
// Only the real content survives; "internal" reasoning is dropped.
expect(doneEvent.message.content).toEqual([{ type: "text", text: "final answer" }]);
},
);
});
});

View File

@@ -185,6 +185,7 @@ interface OllamaChatResponse {
message: {
role: "assistant";
content: string;
thinking?: string;
reasoning?: string;
tool_calls?: OllamaToolCall[];
};
@@ -323,10 +324,10 @@ export function buildAssistantMessage(
): AssistantMessage {
const content: (TextContent | ToolCall)[] = [];
// Qwen 3 (and potentially other reasoning models) may return their final
// answer in a `reasoning` field with an empty `content`. Fall back to
// `reasoning` so the response isn't silently dropped.
const text = response.message.content || response.message.reasoning || "";
// Ollama-native reasoning models may emit their answer in `thinking` or
// `reasoning` with an empty `content`. Fall back so replies are not dropped.
const text =
response.message.content || response.message.thinking || response.message.reasoning || "";
if (text) {
content.push({ type: "text", text });
}
@@ -468,15 +469,20 @@ export function createOllamaStreamFn(
const reader = response.body.getReader();
let accumulatedContent = "";
let fallbackContent = "";
let sawContent = false;
const accumulatedToolCalls: OllamaToolCall[] = [];
let finalResponse: OllamaChatResponse | undefined;
for await (const chunk of parseNdjsonStream(reader)) {
if (chunk.message?.content) {
sawContent = true;
accumulatedContent += chunk.message.content;
} else if (chunk.message?.reasoning) {
// Qwen 3 reasoning mode: content may be empty, output in reasoning
accumulatedContent += chunk.message.reasoning;
} else if (!sawContent && chunk.message?.thinking) {
fallbackContent += chunk.message.thinking;
} else if (!sawContent && chunk.message?.reasoning) {
// Backward compatibility for older/native variants that still use reasoning.
fallbackContent += chunk.message.reasoning;
}
// Ollama sends tool_calls in intermediate (done:false) chunks,
@@ -495,7 +501,7 @@ export function createOllamaStreamFn(
throw new Error("Ollama API stream ended without a final response");
}
finalResponse.message.content = accumulatedContent;
finalResponse.message.content = accumulatedContent || fallbackContent;
if (accumulatedToolCalls.length > 0) {
finalResponse.message.tool_calls = accumulatedToolCalls;
}