From d6d04f361e1c49cdac81e8371ece13238da74133 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 7 Mar 2026 19:53:02 -0500 Subject: [PATCH] fix(ollama): preserve local limits and native thinking fallback (#39292) * fix(ollama): support thinking field fallback in native stream * fix(models): honor explicit lower token limits in merge mode * fix(ollama): prefer streamed content over fallback thinking * changelog: note Ollama local model fixes --- CHANGELOG.md | 1 + ...ssing-provider-apikey-from-env-var.test.ts | 31 ++++++- src/agents/models-config.ts | 38 +++++++-- src/agents/ollama-stream.test.ts | 85 ++++++++++++++++++- src/agents/ollama-stream.ts | 22 +++-- 5 files changed, 155 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 770f9a8b86b..04248fe0d4a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -220,6 +220,7 @@ Docs: https://docs.openclaw.ai - Plugins/HTTP route migration diagnostics: rewrite legacy `api.registerHttpHandler(...)` loader failures into actionable migration guidance so doctor/plugin diagnostics point operators to `api.registerHttpRoute(...)` or `registerPluginHttpRoute(...)`. (#36794) Thanks @vincentkoc - Doctor/Heartbeat upgrade diagnostics: warn when heartbeat delivery is configured with an implicit `directPolicy` so upgrades pin direct/DM behavior explicitly instead of relying on the current default. (#36789) Thanks @vincentkoc. - Agents/current-time UTC anchor: append a machine-readable UTC suffix alongside local `Current time:` lines in shared cron-style prompt contexts so agents can compare UTC-stamped workspace timestamps without doing timezone math. (#32423) thanks @jriff. +- Ollama/local model handling: preserve explicit lower `contextWindow` / `maxTokens` overrides during merge refresh, and keep native Ollama streamed replies from surfacing fallback `thinking` / `reasoning` text once real content starts streaming. (#39292) Thanks @vincentkoc. - TUI/webchat command-owner scope alignment: treat internal-channel gateway sessions with `operator.admin` as owner-authorized in command auth, restoring cron/gateway/connector tool access for affected TUI/webchat sessions while keeping external channels on identity-based owner checks. (from #35666, #35673, #35704) Thanks @Naylenv, @Octane0411, and @Sid-Qin. - Discord/inbound timeout isolation: separate inbound worker timeout tracking from listener timeout budgets so queued Discord replies are no longer dropped when listener watchdog windows expire mid-run. (#36602) Thanks @dutifulbob. - Memory/doctor SecretRef handling: treat SecretRef-backed memory-search API keys as configured, and fail embedding setup with explicit unresolved-secret errors instead of crashing. (#36835) Thanks @joshavant. diff --git a/src/agents/models-config.fills-missing-provider-apikey-from-env-var.test.ts b/src/agents/models-config.fills-missing-provider-apikey-from-env-var.test.ts index 997ff28065c..75d140867c2 100644 --- a/src/agents/models-config.fills-missing-provider-apikey-from-env-var.test.ts +++ b/src/agents/models-config.fills-missing-provider-apikey-from-env-var.test.ts @@ -372,7 +372,7 @@ describe("models-config", () => { }); }); - it("refreshes stale explicit moonshot model capabilities from implicit catalog", async () => { + it("refreshes moonshot capabilities while preserving explicit token limits", async () => { await withTempHome(async () => { await withEnvVar("MOONSHOT_API_KEY", "sk-moonshot-test", async () => { const cfg = createMoonshotConfig({ contextWindow: 1024, maxTokens: 256 }); @@ -397,8 +397,8 @@ describe("models-config", () => { const kimi = parsed.providers.moonshot?.models?.find((model) => model.id === "kimi-k2.5"); expect(kimi?.input).toEqual(["text", "image"]); expect(kimi?.reasoning).toBe(false); - expect(kimi?.contextWindow).toBe(256000); - expect(kimi?.maxTokens).toBe(8192); + expect(kimi?.contextWindow).toBe(1024); + expect(kimi?.maxTokens).toBe(256); // Preserve explicit user pricing overrides when refreshing capabilities. expect(kimi?.cost?.input).toBe(123); expect(kimi?.cost?.output).toBe(456); @@ -464,4 +464,29 @@ describe("models-config", () => { }); }); }); + + it("falls back to implicit token limits when explicit values are invalid", async () => { + await withTempHome(async () => { + await withEnvVar("MOONSHOT_API_KEY", "sk-moonshot-test", async () => { + const cfg = createMoonshotConfig({ contextWindow: 0, maxTokens: -1 }); + + await ensureOpenClawModelsJson(cfg); + const parsed = await readGeneratedModelsJson<{ + providers: Record< + string, + { + models?: Array<{ + id: string; + contextWindow?: number; + maxTokens?: number; + }>; + } + >; + }>(); + const kimi = parsed.providers.moonshot?.models?.find((model) => model.id === "kimi-k2.5"); + expect(kimi?.contextWindow).toBe(256000); + expect(kimi?.maxTokens).toBe(8192); + }); + }); + }); }); diff --git a/src/agents/models-config.ts b/src/agents/models-config.ts index a3f1fd19ff3..cb4c76cfe56 100644 --- a/src/agents/models-config.ts +++ b/src/agents/models-config.ts @@ -23,10 +23,22 @@ type ModelsConfig = NonNullable; const DEFAULT_MODE: NonNullable = "merge"; const MODELS_JSON_WRITE_LOCKS = new Map>(); -function resolvePreferredTokenLimit(explicitValue: number, implicitValue: number): number { - // Keep catalog refresh behavior for stale low values while preserving - // intentional larger user overrides (for example Ollama >128k contexts). - return explicitValue > implicitValue ? explicitValue : implicitValue; +function isPositiveFiniteTokenLimit(value: unknown): value is number { + return typeof value === "number" && Number.isFinite(value) && value > 0; +} + +function resolvePreferredTokenLimit(params: { + explicitPresent: boolean; + explicitValue: unknown; + implicitValue: unknown; +}): number | undefined { + if (params.explicitPresent && isPositiveFiniteTokenLimit(params.explicitValue)) { + return params.explicitValue; + } + if (isPositiveFiniteTokenLimit(params.implicitValue)) { + return params.implicitValue; + } + return isPositiveFiniteTokenLimit(params.explicitValue) ? params.explicitValue : undefined; } function mergeProviderModels(implicit: ProviderConfig, explicit: ProviderConfig): ProviderConfig { @@ -65,15 +77,23 @@ function mergeProviderModels(implicit: ProviderConfig, explicit: ProviderConfig) // it in their config (key present), honour that value; otherwise fall back // to the built-in catalog default so new reasoning models work out of the // box without requiring every user to configure it. + const contextWindow = resolvePreferredTokenLimit({ + explicitPresent: "contextWindow" in explicitModel, + explicitValue: explicitModel.contextWindow, + implicitValue: implicitModel.contextWindow, + }); + const maxTokens = resolvePreferredTokenLimit({ + explicitPresent: "maxTokens" in explicitModel, + explicitValue: explicitModel.maxTokens, + implicitValue: implicitModel.maxTokens, + }); + return { ...explicitModel, input: implicitModel.input, reasoning: "reasoning" in explicitModel ? explicitModel.reasoning : implicitModel.reasoning, - contextWindow: resolvePreferredTokenLimit( - explicitModel.contextWindow, - implicitModel.contextWindow, - ), - maxTokens: resolvePreferredTokenLimit(explicitModel.maxTokens, implicitModel.maxTokens), + ...(contextWindow === undefined ? {} : { contextWindow }), + ...(maxTokens === undefined ? {} : { maxTokens }), }; }); diff --git a/src/agents/ollama-stream.test.ts b/src/agents/ollama-stream.test.ts index 79dd8d4a90d..813381b35b1 100644 --- a/src/agents/ollama-stream.test.ts +++ b/src/agents/ollama-stream.test.ts @@ -104,7 +104,23 @@ describe("buildAssistantMessage", () => { expect(result.usage.totalTokens).toBe(15); }); - it("falls back to reasoning when content is empty", () => { + it("falls back to thinking when content is empty", () => { + const response = { + model: "qwen3:32b", + created_at: "2026-01-01T00:00:00Z", + message: { + role: "assistant" as const, + content: "", + thinking: "Thinking output", + }, + done: true, + }; + const result = buildAssistantMessage(response, modelInfo); + expect(result.stopReason).toBe("stop"); + expect(result.content).toEqual([{ type: "text", text: "Thinking output" }]); + }); + + it("falls back to reasoning when content and thinking are empty", () => { const response = { model: "qwen3:32b", created_at: "2026-01-01T00:00:00Z", @@ -397,7 +413,50 @@ describe("createOllamaStreamFn", () => { ); }); - it("accumulates reasoning chunks when content is empty", async () => { + it("accumulates thinking chunks when content is empty", async () => { + await withMockNdjsonFetch( + [ + '{"model":"m","created_at":"t","message":{"role":"assistant","content":"","thinking":"reasoned"},"done":false}', + '{"model":"m","created_at":"t","message":{"role":"assistant","content":"","thinking":" output"},"done":false}', + '{"model":"m","created_at":"t","message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":1,"eval_count":2}', + ], + async () => { + const stream = await createOllamaTestStream({ baseUrl: "http://ollama-host:11434" }); + const events = await collectStreamEvents(stream); + + const doneEvent = events.at(-1); + if (!doneEvent || doneEvent.type !== "done") { + throw new Error("Expected done event"); + } + + expect(doneEvent.message.content).toEqual([{ type: "text", text: "reasoned output" }]); + }, + ); + }); + + it("prefers streamed content over earlier thinking chunks", async () => { + await withMockNdjsonFetch( + [ + '{"model":"m","created_at":"t","message":{"role":"assistant","content":"","thinking":"internal"},"done":false}', + '{"model":"m","created_at":"t","message":{"role":"assistant","content":"final"},"done":false}', + '{"model":"m","created_at":"t","message":{"role":"assistant","content":" answer"},"done":false}', + '{"model":"m","created_at":"t","message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":1,"eval_count":2}', + ], + async () => { + const stream = await createOllamaTestStream({ baseUrl: "http://ollama-host:11434" }); + const events = await collectStreamEvents(stream); + + const doneEvent = events.at(-1); + if (!doneEvent || doneEvent.type !== "done") { + throw new Error("Expected done event"); + } + + expect(doneEvent.message.content).toEqual([{ type: "text", text: "final answer" }]); + }, + ); + }); + + it("accumulates reasoning chunks when thinking is absent", async () => { await withMockNdjsonFetch( [ '{"model":"m","created_at":"t","message":{"role":"assistant","content":"","reasoning":"reasoned"},"done":false}', @@ -417,4 +476,26 @@ describe("createOllamaStreamFn", () => { }, ); }); + + it("prefers streamed content over earlier reasoning chunks", async () => { + await withMockNdjsonFetch( + [ + '{"model":"m","created_at":"t","message":{"role":"assistant","content":"","reasoning":"internal"},"done":false}', + '{"model":"m","created_at":"t","message":{"role":"assistant","content":"final"},"done":false}', + '{"model":"m","created_at":"t","message":{"role":"assistant","content":" answer"},"done":false}', + '{"model":"m","created_at":"t","message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":1,"eval_count":2}', + ], + async () => { + const stream = await createOllamaTestStream({ baseUrl: "http://ollama-host:11434" }); + const events = await collectStreamEvents(stream); + + const doneEvent = events.at(-1); + if (!doneEvent || doneEvent.type !== "done") { + throw new Error("Expected done event"); + } + + expect(doneEvent.message.content).toEqual([{ type: "text", text: "final answer" }]); + }, + ); + }); }); diff --git a/src/agents/ollama-stream.ts b/src/agents/ollama-stream.ts index fdff0b2ae65..4446b03acdf 100644 --- a/src/agents/ollama-stream.ts +++ b/src/agents/ollama-stream.ts @@ -185,6 +185,7 @@ interface OllamaChatResponse { message: { role: "assistant"; content: string; + thinking?: string; reasoning?: string; tool_calls?: OllamaToolCall[]; }; @@ -323,10 +324,10 @@ export function buildAssistantMessage( ): AssistantMessage { const content: (TextContent | ToolCall)[] = []; - // Qwen 3 (and potentially other reasoning models) may return their final - // answer in a `reasoning` field with an empty `content`. Fall back to - // `reasoning` so the response isn't silently dropped. - const text = response.message.content || response.message.reasoning || ""; + // Ollama-native reasoning models may emit their answer in `thinking` or + // `reasoning` with an empty `content`. Fall back so replies are not dropped. + const text = + response.message.content || response.message.thinking || response.message.reasoning || ""; if (text) { content.push({ type: "text", text }); } @@ -468,15 +469,20 @@ export function createOllamaStreamFn( const reader = response.body.getReader(); let accumulatedContent = ""; + let fallbackContent = ""; + let sawContent = false; const accumulatedToolCalls: OllamaToolCall[] = []; let finalResponse: OllamaChatResponse | undefined; for await (const chunk of parseNdjsonStream(reader)) { if (chunk.message?.content) { + sawContent = true; accumulatedContent += chunk.message.content; - } else if (chunk.message?.reasoning) { - // Qwen 3 reasoning mode: content may be empty, output in reasoning - accumulatedContent += chunk.message.reasoning; + } else if (!sawContent && chunk.message?.thinking) { + fallbackContent += chunk.message.thinking; + } else if (!sawContent && chunk.message?.reasoning) { + // Backward compatibility for older/native variants that still use reasoning. + fallbackContent += chunk.message.reasoning; } // Ollama sends tool_calls in intermediate (done:false) chunks, @@ -495,7 +501,7 @@ export function createOllamaStreamFn( throw new Error("Ollama API stream ended without a final response"); } - finalResponse.message.content = accumulatedContent; + finalResponse.message.content = accumulatedContent || fallbackContent; if (accumulatedToolCalls.length > 0) { finalResponse.message.tool_calls = accumulatedToolCalls; }