fix(ollama): expose native thinking efforts

Peter Steinberger
2026-04-26 22:49:06 +01:00
parent 2cd23957c0
commit ff570f3a61
7 changed files with 107 additions and 20 deletions

View File

@@ -12,6 +12,7 @@ Docs: https://docs.openclaw.ai
- Logging: write validated diagnostic trace context as top-level `traceId`, `spanId`, `parentSpanId`, and `traceFlags` fields in file-log JSONL records so traced requests and model calls are easier to correlate in log processors. Refs #40353. Thanks @liangruochong44-ui.
- Logging/sessions: apply configured redaction patterns to persisted session transcript text and accept escaped character classes in safe custom redaction regexes, so transcript JSONL no longer keeps matching sensitive text in the clear. Fixes #42982. Thanks @panpan0000.
- Providers/Ollama: honor `/api/show` capabilities when registering local models so non-tool Ollama models no longer receive the agent tool surface, and keep native Ollama thinking opt-in instead of enabling it by default. Fixes #64710 and duplicate #65343. Thanks @yuan-b, @netherby, @xilopaint, and @Diyforfun2026.
- Providers/Ollama: expose native Ollama thinking effort levels so `/think max` is accepted for reasoning-capable Ollama models and maps to Ollama's highest supported `think` effort. Fixes #71584. Thanks @g0st1n.
- Auto-reply: poison inbound message dedupe after replay-unsafe provider/runtime failures so retries stay safe before visible progress but cannot duplicate messages after block output, tool side effects, or session progress. Fixes #69303; keeps #58549 and #64606 as duplicate validation. Thanks @martingarramon, @NikolaFC, and @zeroth-blip.
- Agents/model fallback: jump directly to a known later live-session model redirect instead of walking unrelated fallback candidates, while preserving the already-landed live-session/fallback loop guard. Fixes #57471; related loop family already closed via #58496. Thanks @yuxiaoyang2007-prog.
- Gateway/Bonjour: keep @homebridge/ciao cancellation handlers registered across advertiser restarts so late probing cancellations cannot crash Linux and other mDNS-churned gateways. Thanks @codex.

View File

@@ -461,7 +461,7 @@ For the full setup and behavior details, see [Ollama Web Search](/tools/ollama-s
<Accordion title="Streaming configuration">
OpenClaw's Ollama integration uses the **native Ollama API** (`/api/chat`) by default, which fully supports streaming and tool calling simultaneously. No special configuration is needed.
For native `/api/chat` requests, OpenClaw also forwards thinking control directly to Ollama: `/think off` and `openclaw agent --thinking off` send top-level `think: false`, while non-`off` thinking levels send `think: true`.
For native `/api/chat` requests, OpenClaw also forwards thinking control directly to Ollama: `/think off` and `openclaw agent --thinking off` send top-level `think: false`, while `/think low|medium|high` sends the matching top-level `think` effort string. `/think max` maps to Ollama's highest native effort, `think: "high"`.
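As a concrete illustration, the native requests look roughly like this — a minimal sketch, not OpenClaw's actual request builder; the model name and default localhost endpoint are assumptions:

```ts
// Base native /api/chat payload (model name and endpoint are assumed).
const base = {
  model: "gpt-oss:20b",
  messages: [{ role: "user", content: "hello" }],
  stream: true,
};

// /think off              → think: false
// /think low|medium|high  → think: "low" | "medium" | "high"
// /think max              → think: "high" (Ollama's highest native effort)
await fetch("http://localhost:11434/api/chat", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({ ...base, think: "high" }),
});
```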
<Tip>
If you need to use the OpenAI-compatible endpoint, see the "Legacy OpenAI-compatible mode" section above. Streaming and tool calling may not work simultaneously in that mode.

View File

@@ -15,7 +15,7 @@ title: "Thinking levels"
- high → “ultrathink” (max budget)
- xhigh → “ultrathink+” (GPT-5.2+ and Codex models, plus Anthropic Claude Opus 4.7 effort)
- adaptive → provider-managed adaptive thinking (supported for Claude 4.6 on Anthropic/Bedrock, Anthropic Claude Opus 4.7, and Google Gemini dynamic thinking)
- max → provider max reasoning (currently Anthropic Claude Opus 4.7)
- max → provider max reasoning (Anthropic Claude Opus 4.7; Ollama maps this to its highest native `think` effort)
- `x-high`, `x_high`, `extra-high`, `extra high`, and `extra_high` map to `xhigh`.
- `highest` maps to `high`.
- Provider notes:
@@ -26,6 +26,7 @@ title: "Thinking levels"
- Anthropic Claude Opus 4.7 does not default to adaptive thinking. Its API effort default remains provider-owned unless you explicitly set a thinking level.
- Anthropic Claude Opus 4.7 maps `/think xhigh` to adaptive thinking plus `output_config.effort: "xhigh"`, because `/think` is a thinking directive and `xhigh` is the Opus 4.7 effort setting.
- Anthropic Claude Opus 4.7 also exposes `/think max`; it maps to the same provider-owned max effort path.
- Ollama thinking-capable models expose `/think low|medium|high|max`; `max` maps to native `think: "high"` because Ollama's native API accepts `low`, `medium`, and `high` effort strings (see the sketch after these notes).
- OpenAI GPT models map `/think` through model-specific Responses API effort support. `/think off` sends `reasoning.effort: "none"` only when the target model supports it; otherwise OpenClaw omits the disabled reasoning payload instead of sending an unsupported value.
- Google Gemini maps `/think adaptive` to Gemini's provider-owned dynamic thinking. Gemini 3 requests omit a fixed `thinkingLevel`, while Gemini 2.5 requests send `thinkingBudget: -1`; fixed levels still map to the closest Gemini `thinkingLevel` or budget for that model family.
- MiniMax (`minimax/*`) on the Anthropic-compatible streaming path defaults to `thinking: { type: "disabled" }` unless you explicitly set thinking in model params or request params. This avoids leaked `reasoning_content` deltas from MiniMax's non-native Anthropic stream format.
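The Ollama note above boils down to a small level-to-effort mapping. The following is a minimal sketch of that mapping (the helper name is illustrative, not OpenClaw's actual resolver):

```ts
type ThinkLevel =
  | "off" | "minimal" | "low" | "medium" | "high" | "xhigh" | "adaptive" | "max";

// Illustrative mapping onto Ollama's native `think` field.
function toOllamaThink(level: ThinkLevel): false | "low" | "medium" | "high" {
  if (level === "off") return false;      // disable native thinking
  if (level === "minimal") return "low";   // below Ollama's lowest supported effort
  if (level === "low" || level === "medium" || level === "high") return level;
  return "high";                           // xhigh, adaptive, and max collapse to Ollama's max
}
```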

View File

@@ -69,7 +69,9 @@ function registerProviderWithPluginConfig(pluginConfig: Record<string, unknown>)
return registerProviderMock.mock.calls[0]?.[0];
}
function captureWrappedOllamaPayload(thinkingLevel: "off" | "low" | undefined) {
function captureWrappedOllamaPayload(
thinkingLevel: "off" | "minimal" | "low" | "medium" | "high" | "max" | undefined,
) {
const provider = registerProvider();
let payloadSeen: Record<string, unknown> | undefined;
const baseStreamFn = vi.fn((_model, _context, options) => {
@@ -528,7 +530,7 @@ describe("ollama plugin", () => {
expect((payloadSeen?.options as Record<string, unknown> | undefined)?.think).toBeUndefined();
});
it("keeps native Ollama thinking off by default while exposing an opt-in toggle", () => {
it("keeps native Ollama thinking off by default while exposing opt-in effort levels", () => {
const provider = registerProvider();
expect(
@@ -549,15 +551,22 @@ describe("ollama plugin", () => {
reasoning: true,
}),
).toEqual({
levels: [{ id: "off" }, { id: "low", label: "on" }],
levels: [{ id: "off" }, { id: "low" }, { id: "medium" }, { id: "high" }, { id: "max" }],
defaultLevel: "off",
});
});
it("wraps native Ollama payloads with top-level think=true when thinking is enabled", () => {
it("wraps native Ollama payloads with top-level think effort when thinking is enabled", () => {
const { baseStreamFn, payloadSeen } = captureWrappedOllamaPayload("low");
expect(baseStreamFn).toHaveBeenCalledTimes(1);
expect(payloadSeen?.think).toBe(true);
expect(payloadSeen?.think).toBe("low");
expect((payloadSeen?.options as Record<string, unknown> | undefined)?.think).toBeUndefined();
});
it("maps native Ollama max thinking to the highest supported wire effort", () => {
const { baseStreamFn, payloadSeen } = captureWrappedOllamaPayload("max");
expect(baseStreamFn).toHaveBeenCalledTimes(1);
expect(payloadSeen?.think).toBe("high");
expect((payloadSeen?.options as Record<string, unknown> | undefined)?.think).toBeUndefined();
});

View File

@@ -167,7 +167,10 @@ export default definePluginEntry({
usesOllamaOpenAICompatTransport(model) ? { supportsUsageInStreaming: true } : undefined,
resolveReasoningOutputMode: () => "native",
resolveThinkingProfile: ({ reasoning }) => ({
levels: reasoning === true ? [{ id: "off" }, { id: "low", label: "on" }] : [{ id: "off" }],
levels:
reasoning === true
? [{ id: "off" }, { id: "low" }, { id: "medium" }, { id: "high" }, { id: "max" }]
: [{ id: "off" }],
defaultLevel: "off",
}),
wrapStreamFn: createConfiguredOllamaCompatStreamWrapper,

View File

@@ -150,7 +150,7 @@ describe("createConfiguredOllamaCompatStreamWrapper", () => {
);
});
it("forwards think=true on native Ollama chat requests when thinking is enabled", async () => {
it("forwards the native think effort on native Ollama chat requests when thinking is enabled", async () => {
await withMockNdjsonFetch(
[
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"ok"},"done":false}',
@@ -193,10 +193,63 @@ describe("createConfiguredOllamaCompatStreamWrapper", () => {
throw new Error("Expected string request body");
}
const requestBody = JSON.parse(requestInit.body) as {
think?: boolean;
options?: { think?: boolean; num_ctx?: number };
think?: boolean | string;
options?: { think?: boolean | string; num_ctx?: number };
};
expect(requestBody.think).toBe(true);
expect(requestBody.think).toBe("low");
expect(requestBody.options?.think).toBeUndefined();
expect(requestBody.options?.num_ctx).toBe(131072);
},
);
});
it("maps native Ollama max thinking to think=high on the wire", async () => {
await withMockNdjsonFetch(
[
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"ok"},"done":false}',
'{"model":"m","created_at":"t","message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":1,"eval_count":1}',
],
async (fetchMock) => {
const baseStreamFn = createOllamaStreamFn("http://ollama-host:11434");
const model = {
api: "ollama",
provider: "ollama",
id: "gpt-oss:20b",
contextWindow: 131072,
};
const wrapped = createConfiguredOllamaCompatStreamWrapper({
provider: "ollama",
modelId: "gpt-oss:20b",
model,
streamFn: baseStreamFn,
thinkingLevel: "max",
} as never);
if (!wrapped) {
throw new Error("Expected wrapped Ollama stream function");
}
const stream = await Promise.resolve(
wrapped(
model as never,
{
messages: [{ role: "user", content: "hello" }],
} as never,
{} as never,
),
);
await collectStreamEvents(stream);
const requestInit = getGuardedFetchCall(fetchMock).init ?? {};
if (typeof requestInit.body !== "string") {
throw new Error("Expected string request body");
}
const requestBody = JSON.parse(requestInit.body) as {
think?: boolean | string;
options?: { think?: boolean | string; num_ctx?: number };
};
expect(requestBody.think).toBe("high");
expect(requestBody.options?.think).toBeUndefined();
expect(requestBody.options?.num_ctx).toBe(131072);
},

View File

@@ -151,7 +151,12 @@ export function wrapOllamaCompatNumCtx(baseFn: StreamFn | undefined, numCtx: num
});
}
function createOllamaThinkingWrapper(baseFn: StreamFn | undefined, think: boolean): StreamFn {
type OllamaThinkValue = boolean | "low" | "medium" | "high";
function createOllamaThinkingWrapper(
baseFn: StreamFn | undefined,
think: OllamaThinkValue,
): StreamFn {
const streamFn = baseFn ?? streamSimple;
return (model, context, options) =>
streamWithPayloadPatch(streamFn, model, context, options, (payloadRecord) => {
@@ -159,6 +164,22 @@ function createOllamaThinkingWrapper(baseFn: StreamFn | undefined, think: boolea
});
}
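// Collapse OpenClaw ThinkLevel values onto Ollama's native `think` field:
// "off" disables thinking, low/medium/high pass through unchanged, "minimal"
// rounds up to "low", and xhigh/adaptive/max collapse to "high" (Ollama's
// highest supported effort). Unknown levels resolve to undefined so the
// payload is left untouched.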
function resolveOllamaThinkValue(thinkingLevel: unknown): OllamaThinkValue | undefined {
if (thinkingLevel === "off") {
return false;
}
if (thinkingLevel === "low" || thinkingLevel === "medium" || thinkingLevel === "high") {
return thinkingLevel;
}
if (thinkingLevel === "minimal") {
return "low";
}
if (thinkingLevel === "xhigh" || thinkingLevel === "adaptive" || thinkingLevel === "max") {
return "high";
}
return undefined;
}
function resolveOllamaCompatNumCtx(model: ProviderRuntimeModel): number {
return Math.max(1, Math.floor(model.contextWindow ?? model.maxTokens ?? DEFAULT_CONTEXT_TOKENS));
}
@@ -196,12 +217,11 @@ export function createConfiguredOllamaCompatStreamWrapper(
streamFn = wrapOllamaCompatNumCtx(streamFn, resolveOllamaCompatNumCtx(model));
}
if (isNativeOllamaTransport && ctx.thinkingLevel === "off") {
streamFn = createOllamaThinkingWrapper(streamFn, false);
} else if (isNativeOllamaTransport && ctx.thinkingLevel) {
// Any non-off ThinkLevel (minimal, low, medium, high, xhigh, adaptive, max)
// should enable Ollama's native thinking mode.
streamFn = createOllamaThinkingWrapper(streamFn, true);
const ollamaThinkValue = isNativeOllamaTransport
? resolveOllamaThinkValue(ctx.thinkingLevel)
: undefined;
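// An unset or unrecognized thinking level, or a non-native (OpenAI-compatible)
// transport, resolves to undefined and leaves the outgoing payload untouched.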
if (ollamaThinkValue !== undefined) {
streamFn = createOllamaThinkingWrapper(streamFn, ollamaThinkValue);
}
if (normalizeProviderId(ctx.provider) === "ollama" && isOllamaCloudKimiModelRef(ctx.modelId)) {
@@ -310,7 +330,7 @@ interface OllamaChatRequest {
stream: boolean;
tools?: OllamaTool[];
options?: Record<string, unknown>;
think?: boolean;
think?: OllamaThinkValue;
}
interface OllamaChatMessage {