diff --git a/CHANGELOG.md b/CHANGELOG.md index 1eb09206e87..9f0349bc8ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,7 @@ Docs: https://docs.openclaw.ai - Codex CLI: pass OpenClaw's system prompt through Codex's `model_instructions_file` config override so fresh Codex CLI sessions receive the same prompt guidance as Claude CLI sessions. - Matrix/gateway: wait for Matrix sync readiness before marking startup successful, keep Matrix background handler failures contained, and route fatal Matrix sync stops through channel-level restart handling instead of crashing the whole gateway. (#62779) Thanks @gumadeiras. - Browser/security: re-run blocked-destination safety checks after interaction-driven main-frame navigations from click, evaluate, hook-triggered click, and batched action flows, so browser interactions cannot bypass the SSRF quarantine when they land on forbidden URLs. (#63226) Thanks @eleqtrizit. +- Providers/Ollama: enable Ollama's native thinking mode (top-level `think: true`) for models using the native `api: "ollama"` path when `/think` is set to a non-off level, and stream the resulting thinking output as thinking blocks. (#62712) Thanks @hoyyeva. 
## 2026.4.8 diff --git a/extensions/ollama/index.test.ts b/extensions/ollama/index.test.ts index bbaec49bf0a..7555cc1b930 100644 --- a/extensions/ollama/index.test.ts +++ b/extensions/ollama/index.test.ts @@ -445,4 +445,111 @@ describe("ollama plugin", () => { expect(payloadSeen?.think).toBe(false); expect((payloadSeen?.options as Record | undefined)?.think).toBeUndefined(); }); + + it("wraps native Ollama payloads with top-level think=true when thinking is enabled", () => { + const provider = registerProvider(); + let payloadSeen: Record | undefined; + const baseStreamFn = vi.fn((_model, _context, options) => { + const payload: Record = { + messages: [], + options: { num_ctx: 65536 }, + stream: true, + }; + options?.onPayload?.(payload, _model); + payloadSeen = payload; + return {} as never; + }); + + const wrapped = provider.wrapStreamFn?.({ + config: { + models: { + providers: { + ollama: { + api: "ollama", + baseUrl: "http://127.0.0.1:11434", + models: [], + }, + }, + }, + }, + provider: "ollama", + modelId: "qwen3.5:9b", + thinkingLevel: "low", + model: { + api: "ollama", + provider: "ollama", + id: "qwen3.5:9b", + baseUrl: "http://127.0.0.1:11434", + contextWindow: 131_072, + }, + streamFn: baseStreamFn, + }); + + expect(typeof wrapped).toBe("function"); + void wrapped?.( + { + api: "ollama", + provider: "ollama", + id: "qwen3.5:9b", + } as never, + {} as never, + {}, + ); + expect(baseStreamFn).toHaveBeenCalledTimes(1); + expect(payloadSeen?.think).toBe(true); + expect((payloadSeen?.options as Record | undefined)?.think).toBeUndefined(); + }); + + it("does not set think param when thinkingLevel is undefined", () => { + const provider = registerProvider(); + let payloadSeen: Record | undefined; + const baseStreamFn = vi.fn((_model, _context, options) => { + const payload: Record = { + messages: [], + options: { num_ctx: 65536 }, + stream: true, + }; + options?.onPayload?.(payload, _model); + payloadSeen = payload; + return {} as never; + }); + + const wrapped 
= provider.wrapStreamFn?.({ + config: { + models: { + providers: { + ollama: { + api: "ollama", + baseUrl: "http://127.0.0.1:11434", + models: [], + }, + }, + }, + }, + provider: "ollama", + modelId: "qwen3.5:9b", + thinkingLevel: undefined, + model: { + api: "ollama", + provider: "ollama", + id: "qwen3.5:9b", + baseUrl: "http://127.0.0.1:11434", + contextWindow: 131_072, + }, + streamFn: baseStreamFn, + }); + + expect(typeof wrapped).toBe("function"); + void wrapped?.( + { + api: "ollama", + provider: "ollama", + id: "qwen3.5:9b", + } as never, + {} as never, + {}, + ); + expect(baseStreamFn).toHaveBeenCalledTimes(1); + expect(payloadSeen?.think).toBeUndefined(); + }); }); diff --git a/extensions/ollama/src/stream.test.ts b/extensions/ollama/src/stream.test.ts new file mode 100644 index 00000000000..b3d57e9bb49 --- /dev/null +++ b/extensions/ollama/src/stream.test.ts @@ -0,0 +1,228 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; +import { buildAssistantMessage, createOllamaStreamFn } from "./stream.js"; + +function makeOllamaResponse(params: { + content?: string; + thinking?: string; + reasoning?: string; + tool_calls?: Array<{ function: { name: string; arguments: Record } }>; +}) { + return { + model: "qwen3.5", + created_at: new Date().toISOString(), + message: { + role: "assistant" as const, + content: params.content ?? "", + ...(params.thinking != null ? { thinking: params.thinking } : {}), + ...(params.reasoning != null ? { reasoning: params.reasoning } : {}), + ...(params.tool_calls ? 
{ tool_calls: params.tool_calls } : {}), + }, + done: true, + prompt_eval_count: 100, + eval_count: 50, + }; +} + +const MODEL_INFO = { api: "ollama", provider: "ollama", id: "qwen3.5" }; + +describe("buildAssistantMessage", () => { + it("includes thinking block when response has thinking field", () => { + const response = makeOllamaResponse({ + thinking: "Let me think about this", + content: "The answer is 42", + }); + const msg = buildAssistantMessage(response, MODEL_INFO); + expect(msg.content).toHaveLength(2); + expect(msg.content[0]).toEqual({ type: "thinking", thinking: "Let me think about this" }); + expect(msg.content[1]).toEqual({ type: "text", text: "The answer is 42" }); + }); + + it("includes thinking block when response has reasoning field", () => { + const response = makeOllamaResponse({ + reasoning: "Step by step analysis", + content: "Result is 7", + }); + const msg = buildAssistantMessage(response, MODEL_INFO); + expect(msg.content).toHaveLength(2); + expect(msg.content[0]).toEqual({ type: "thinking", thinking: "Step by step analysis" }); + expect(msg.content[1]).toEqual({ type: "text", text: "Result is 7" }); + }); + + it("prefers thinking over reasoning when both are present", () => { + const response = makeOllamaResponse({ + thinking: "From thinking field", + reasoning: "From reasoning field", + content: "Answer", + }); + const msg = buildAssistantMessage(response, MODEL_INFO); + expect(msg.content[0]).toEqual({ type: "thinking", thinking: "From thinking field" }); + }); + + it("omits thinking block when no thinking or reasoning field", () => { + const response = makeOllamaResponse({ + content: "Just text", + }); + const msg = buildAssistantMessage(response, MODEL_INFO); + expect(msg.content).toHaveLength(1); + expect(msg.content[0]).toEqual({ type: "text", text: "Just text" }); + }); + + it("omits thinking block when thinking field is empty", () => { + const response = makeOllamaResponse({ + thinking: "", + content: "Just text", + }); + const 
msg = buildAssistantMessage(response, MODEL_INFO); + expect(msg.content).toHaveLength(1); + expect(msg.content[0]).toEqual({ type: "text", text: "Just text" }); + }); +}); + +describe("createOllamaStreamFn thinking events", () => { + afterEach(() => vi.unstubAllGlobals()); + + function makeNdjsonBody(chunks: Array>): ReadableStream { + const encoder = new TextEncoder(); + const lines = chunks.map((c) => JSON.stringify(c) + "\n").join(""); + return new ReadableStream({ + start(controller) { + controller.enqueue(encoder.encode(lines)); + controller.close(); + }, + }); + } + + it("emits thinking_start, thinking_delta, and thinking_end events for thinking content", async () => { + const thinkingChunks = [ + { + model: "qwen3.5", + created_at: "2026-01-01T00:00:00Z", + message: { role: "assistant", content: "", thinking: "Step 1" }, + done: false, + }, + { + model: "qwen3.5", + created_at: "2026-01-01T00:00:01Z", + message: { role: "assistant", content: "", thinking: " and step 2" }, + done: false, + }, + { + model: "qwen3.5", + created_at: "2026-01-01T00:00:02Z", + message: { role: "assistant", content: "The answer", thinking: "" }, + done: false, + }, + { + model: "qwen3.5", + created_at: "2026-01-01T00:00:03Z", + message: { role: "assistant", content: "" }, + done: true, + done_reason: "stop", + prompt_eval_count: 10, + eval_count: 5, + }, + ]; + + const body = makeNdjsonBody(thinkingChunks); + const fetchMock = vi.fn().mockResolvedValue({ + ok: true, + body, + }); + vi.stubGlobal("fetch", fetchMock); + + const streamFn = createOllamaStreamFn("http://localhost:11434"); + const stream = streamFn( + { api: "ollama", provider: "ollama", id: "qwen3.5", contextWindow: 65536 } as never, + { messages: [{ role: "user", content: "test" }] } as never, + {}, + ); + + const events: Array<{ type: string; [key: string]: unknown }> = []; + for await (const event of stream as AsyncIterable<{ type: string; [key: string]: unknown }>) { + events.push(event); + } + + const eventTypes = 
events.map((e) => e.type); + + expect(eventTypes).toContain("thinking_start"); + expect(eventTypes).toContain("thinking_delta"); + expect(eventTypes).toContain("thinking_end"); + expect(eventTypes).toContain("text_start"); + expect(eventTypes).toContain("text_delta"); + expect(eventTypes).toContain("done"); + + // thinking_start comes before text_start + const thinkingStartIndex = eventTypes.indexOf("thinking_start"); + const textStartIndex = eventTypes.indexOf("text_start"); + expect(thinkingStartIndex).toBeLessThan(textStartIndex); + + // thinking_end comes before text_start + const thinkingEndIndex = eventTypes.indexOf("thinking_end"); + expect(thinkingEndIndex).toBeLessThan(textStartIndex); + + // Thinking deltas have correct content + const thinkingDeltas = events.filter((e) => e.type === "thinking_delta"); + expect(thinkingDeltas).toHaveLength(2); + expect(thinkingDeltas[0].delta).toBe("Step 1"); + expect(thinkingDeltas[1].delta).toBe(" and step 2"); + + // Content index: thinking at 0, text at 1 + const thinkingStart = events.find((e) => e.type === "thinking_start"); + expect(thinkingStart?.contentIndex).toBe(0); + const textStart = events.find((e) => e.type === "text_start"); + expect(textStart?.contentIndex).toBe(1); + + // Final message has thinking block + const done = events.find((e) => e.type === "done") as { message?: { content: unknown[] } }; + const content = done?.message?.content ?? 
[]; + expect(content[0]).toMatchObject({ type: "thinking", thinking: "Step 1 and step 2" }); + expect(content[1]).toMatchObject({ type: "text", text: "The answer" }); + }); + + it("streams without thinking events when no thinking content is present", async () => { + const chunks = [ + { + model: "qwen3.5", + created_at: "2026-01-01T00:00:00Z", + message: { role: "assistant", content: "Hello" }, + done: false, + }, + { + model: "qwen3.5", + created_at: "2026-01-01T00:00:01Z", + message: { role: "assistant", content: "" }, + done: true, + done_reason: "stop", + prompt_eval_count: 10, + eval_count: 5, + }, + ]; + + const body = makeNdjsonBody(chunks); + vi.stubGlobal("fetch", vi.fn().mockResolvedValue({ ok: true, body })); + + const streamFn = createOllamaStreamFn("http://localhost:11434"); + const stream = streamFn( + { api: "ollama", provider: "ollama", id: "qwen3.5", contextWindow: 65536 } as never, + { messages: [{ role: "user", content: "test" }] } as never, + {}, + ); + + const events: Array<{ type: string }> = []; + for await (const event of stream as AsyncIterable<{ type: string }>) { + events.push(event); + } + + const eventTypes = events.map((e) => e.type); + expect(eventTypes).not.toContain("thinking_start"); + expect(eventTypes).not.toContain("thinking_delta"); + expect(eventTypes).not.toContain("thinking_end"); + expect(eventTypes).toContain("text_start"); + expect(eventTypes).toContain("text_delta"); + expect(eventTypes).toContain("done"); + + // Text content index should be 0 (no thinking block) + const textStart = events.find((e) => e.type === "text_start") as { contentIndex?: number }; + expect(textStart?.contentIndex).toBe(0); + }); +}); diff --git a/extensions/ollama/src/stream.ts b/extensions/ollama/src/stream.ts index 35c945ea325..49eaa1d7667 100644 --- a/extensions/ollama/src/stream.ts +++ b/extensions/ollama/src/stream.ts @@ -4,6 +4,7 @@ import type { AssistantMessage, StopReason, TextContent, + ThinkingContent, ToolCall, Tool, Usage, @@ -148,14 
+149,14 @@ export function wrapOllamaCompatNumCtx(baseFn: StreamFn | undefined, numCtx: num }); } -function createOllamaThinkingOffWrapper(baseFn: StreamFn | undefined): StreamFn { +function createOllamaThinkingWrapper(baseFn: StreamFn | undefined, think: boolean): StreamFn { const streamFn = baseFn ?? streamSimple; return (model, context, options) => { if (model.api !== "ollama") { return streamFn(model, context, options); } return streamWithPayloadPatch(streamFn, model, context, options, (payloadRecord) => { - payloadRecord.think = false; + payloadRecord.think = think; }); }; } @@ -197,7 +198,11 @@ export function createConfiguredOllamaCompatStreamWrapper( } if (ctx.thinkingLevel === "off") { - streamFn = createOllamaThinkingOffWrapper(streamFn); + streamFn = createOllamaThinkingWrapper(streamFn, false); + } else if (ctx.thinkingLevel) { + // Any non-off ThinkLevel (minimal, low, medium, high, xhigh, adaptive) + // should enable Ollama's native thinking mode. + streamFn = createOllamaThinkingWrapper(streamFn, true); } if (normalizeProviderId(ctx.provider) === "ollama" && isOllamaCloudKimiModelRef(ctx.modelId)) { @@ -511,7 +516,11 @@ export function buildAssistantMessage( response: OllamaChatResponse, modelInfo: StreamModelDescriptor, ): AssistantMessage { - const content: (TextContent | ToolCall)[] = []; + const content: (TextContent | ThinkingContent | ToolCall)[] = []; + const thinking = response.message.thinking ?? response.message.reasoning ?? 
""; + if (thinking) { + content.push({ type: "thinking", thinking }); + } const text = response.message.content || ""; if (text) { content.push({ type: "text", text }); @@ -654,39 +663,78 @@ export function createOllamaStreamFn( const reader = response.body.getReader(); let accumulatedContent = ""; + let accumulatedThinking = ""; const accumulatedToolCalls: OllamaToolCall[] = []; let finalResponse: OllamaChatResponse | undefined; const modelInfo = { api: model.api, provider: model.provider, id: model.id }; let streamStarted = false; + let thinkingStarted = false; + let thinkingEnded = false; + let textBlockStarted = false; let textBlockClosed = false; + // Content index tracking: thinking block (if present) is index 0, + // text block follows at index 1 (or 0 when no thinking). + const textContentIndex = () => (thinkingStarted ? 1 : 0); + + const buildCurrentContent = (): (TextContent | ThinkingContent | ToolCall)[] => { + const parts: (TextContent | ThinkingContent | ToolCall)[] = []; + if (accumulatedThinking) { + parts.push({ + type: "thinking", + thinking: accumulatedThinking, + }); + } + if (accumulatedContent) { + parts.push({ type: "text", text: accumulatedContent }); + } + return parts; + }; + + const closeThinkingBlock = () => { + if (!thinkingStarted || thinkingEnded) { + return; + } + thinkingEnded = true; + const partial = buildStreamAssistantMessage({ + model: modelInfo, + content: buildCurrentContent(), + stopReason: "stop", + usage: buildUsageWithNoCost({}), + }); + stream.push({ + type: "thinking_end", + contentIndex: 0, + content: accumulatedThinking, + partial, + }); + }; + const closeTextBlock = () => { - if (!streamStarted || textBlockClosed) { + if (!textBlockStarted || textBlockClosed) { return; } textBlockClosed = true; const partial = buildStreamAssistantMessage({ model: modelInfo, - content: [{ type: "text", text: accumulatedContent }], + content: buildCurrentContent(), stopReason: "stop", usage: buildUsageWithNoCost({}), }); stream.push({ 
type: "text_end", - contentIndex: 0, + contentIndex: textContentIndex(), content: accumulatedContent, partial, }); }; for await (const chunk of parseNdjsonStream(reader)) { - if (chunk.message?.content) { - const delta = chunk.message.content; - + // Handle thinking/reasoning deltas from Ollama's native think mode. + const thinkingDelta = chunk.message?.thinking ?? chunk.message?.reasoning; + if (thinkingDelta) { if (!streamStarted) { streamStarted = true; - // Emit start/text_start with an empty partial before accumulating - // the first delta, matching the Anthropic/OpenAI provider contract. const emptyPartial = buildStreamAssistantMessage({ model: modelInfo, content: [], @@ -694,19 +742,72 @@ export function createOllamaStreamFn( usage: buildUsageWithNoCost({}), }); stream.push({ type: "start", partial: emptyPartial }); - stream.push({ type: "text_start", contentIndex: 0, partial: emptyPartial }); + } + if (!thinkingStarted) { + thinkingStarted = true; + const partial = buildStreamAssistantMessage({ + model: modelInfo, + content: buildCurrentContent(), + stopReason: "stop", + usage: buildUsageWithNoCost({}), + }); + stream.push({ type: "thinking_start", contentIndex: 0, partial }); + } + accumulatedThinking += thinkingDelta; + const partial = buildStreamAssistantMessage({ + model: modelInfo, + content: buildCurrentContent(), + stopReason: "stop", + usage: buildUsageWithNoCost({}), + }); + stream.push({ + type: "thinking_delta", + contentIndex: 0, + delta: thinkingDelta, + partial, + }); + } + + if (chunk.message?.content) { + const delta = chunk.message.content; + + // Transition from thinking to text: close the thinking block first. 
+ if (thinkingStarted && !thinkingEnded) { + closeThinkingBlock(); + } + + if (!streamStarted) { + streamStarted = true; + const emptyPartial = buildStreamAssistantMessage({ + model: modelInfo, + content: [], + stopReason: "stop", + usage: buildUsageWithNoCost({}), + }); + stream.push({ type: "start", partial: emptyPartial }); + } + if (!textBlockStarted) { + textBlockStarted = true; + const partial = buildStreamAssistantMessage({ + model: modelInfo, + content: buildCurrentContent(), + stopReason: "stop", + usage: buildUsageWithNoCost({}), + }); + stream.push({ type: "text_start", contentIndex: textContentIndex(), partial }); } accumulatedContent += delta; const partial = buildStreamAssistantMessage({ model: modelInfo, - content: [{ type: "text", text: accumulatedContent }], + content: buildCurrentContent(), stopReason: "stop", usage: buildUsageWithNoCost({}), }); - stream.push({ type: "text_delta", contentIndex: 0, delta, partial }); + stream.push({ type: "text_delta", contentIndex: textContentIndex(), delta, partial }); } if (chunk.message?.tool_calls) { + closeThinkingBlock(); closeTextBlock(); accumulatedToolCalls.push(...chunk.message.tool_calls); } @@ -721,13 +822,17 @@ export function createOllamaStreamFn( } finalResponse.message.content = accumulatedContent; + if (accumulatedThinking) { + finalResponse.message.thinking = accumulatedThinking; + } if (accumulatedToolCalls.length > 0) { finalResponse.message.tool_calls = accumulatedToolCalls; } const assistantMessage = buildAssistantMessage(finalResponse, modelInfo); - // Close the text block if we emitted any text_delta events. + // Close any open blocks before emitting the done event. + closeThinkingBlock(); closeTextBlock(); stream.push({