fix(ollama): preserve local limits and native thinking fallback (#39292)

* fix(ollama): support thinking field fallback in native stream

* fix(models): honor explicit lower token limits in merge mode

* fix(ollama): prefer streamed content over fallback thinking

* changelog: note Ollama local model fixes
This commit is contained in:
Vincent Koc
2026-03-07 19:53:02 -05:00
committed by GitHub
parent 5edcab2eee
commit d6d04f361e
5 changed files with 155 additions and 22 deletions

View File

@@ -220,6 +220,7 @@ Docs: https://docs.openclaw.ai
- Plugins/HTTP route migration diagnostics: rewrite legacy `api.registerHttpHandler(...)` loader failures into actionable migration guidance so doctor/plugin diagnostics point operators to `api.registerHttpRoute(...)` or `registerPluginHttpRoute(...)`. (#36794) Thanks @vincentkoc.
- Doctor/Heartbeat upgrade diagnostics: warn when heartbeat delivery is configured with an implicit `directPolicy` so upgrades pin direct/DM behavior explicitly instead of relying on the current default. (#36789) Thanks @vincentkoc.
- Agents/current-time UTC anchor: append a machine-readable UTC suffix alongside local `Current time:` lines in shared cron-style prompt contexts so agents can compare UTC-stamped workspace timestamps without doing timezone math. (#32423) Thanks @jriff.
- Ollama/local model handling: preserve explicit lower `contextWindow` / `maxTokens` overrides during merge refresh, and keep native Ollama streamed replies from surfacing fallback `thinking` / `reasoning` text once real content starts streaming. (#39292) Thanks @vincentkoc.
- TUI/webchat command-owner scope alignment: treat internal-channel gateway sessions with `operator.admin` as owner-authorized in command auth, restoring cron/gateway/connector tool access for affected TUI/webchat sessions while keeping external channels on identity-based owner checks. (from #35666, #35673, #35704) Thanks @Naylenv, @Octane0411, and @Sid-Qin.
- Discord/inbound timeout isolation: separate inbound worker timeout tracking from listener timeout budgets so queued Discord replies are no longer dropped when listener watchdog windows expire mid-run. (#36602) Thanks @dutifulbob.
- Memory/doctor SecretRef handling: treat SecretRef-backed memory-search API keys as configured, and fail embedding setup with explicit unresolved-secret errors instead of crashing. (#36835) Thanks @joshavant.

View File

@@ -372,7 +372,7 @@ describe("models-config", () => {
});
});
it("refreshes stale explicit moonshot model capabilities from implicit catalog", async () => {
it("refreshes moonshot capabilities while preserving explicit token limits", async () => {
await withTempHome(async () => {
await withEnvVar("MOONSHOT_API_KEY", "sk-moonshot-test", async () => {
const cfg = createMoonshotConfig({ contextWindow: 1024, maxTokens: 256 });
@@ -397,8 +397,8 @@ describe("models-config", () => {
const kimi = parsed.providers.moonshot?.models?.find((model) => model.id === "kimi-k2.5");
expect(kimi?.input).toEqual(["text", "image"]);
expect(kimi?.reasoning).toBe(false);
expect(kimi?.contextWindow).toBe(256000);
expect(kimi?.maxTokens).toBe(8192);
expect(kimi?.contextWindow).toBe(1024);
expect(kimi?.maxTokens).toBe(256);
// Preserve explicit user pricing overrides when refreshing capabilities.
expect(kimi?.cost?.input).toBe(123);
expect(kimi?.cost?.output).toBe(456);
@@ -464,4 +464,29 @@ describe("models-config", () => {
});
});
});
// Non-positive explicit limits (contextWindow: 0, maxTokens: -1) do not count as
// intentional overrides, so the merge falls back to the implicit catalog values.
// NOTE(review): withTempHome / withEnvVar / createMoonshotConfig are project test
// helpers — presumably they isolate HOME and env state; verify in their module.
it("falls back to implicit token limits when explicit values are invalid", async () => {
await withTempHome(async () => {
await withEnvVar("MOONSHOT_API_KEY", "sk-moonshot-test", async () => {
// Explicit config carries invalid (non-positive) token limits.
const cfg = createMoonshotConfig({ contextWindow: 0, maxTokens: -1 });
await ensureOpenClawModelsJson(cfg);
const parsed = await readGeneratedModelsJson<{
providers: Record<
string,
{
models?: Array<{
id: string;
contextWindow?: number;
maxTokens?: number;
}>;
}
>;
}>();
const kimi = parsed.providers.moonshot?.models?.find((model) => model.id === "kimi-k2.5");
// Catalog defaults win because the explicit values are not positive finite numbers.
expect(kimi?.contextWindow).toBe(256000);
expect(kimi?.maxTokens).toBe(8192);
});
});
});
});

View File

@@ -23,10 +23,22 @@ type ModelsConfig = NonNullable<OpenClawConfig["models"]>;
const DEFAULT_MODE: NonNullable<ModelsConfig["mode"]> = "merge";
const MODELS_JSON_WRITE_LOCKS = new Map<string, Promise<void>>();
function resolvePreferredTokenLimit(explicitValue: number, implicitValue: number): number {
// Keep catalog refresh behavior for stale low values while preserving
// intentional larger user overrides (for example Ollama >128k contexts).
return explicitValue > implicitValue ? explicitValue : implicitValue;
/**
 * Type guard: true only for real numbers that are finite and strictly positive.
 * Rejects NaN, ±Infinity, zero, negatives, and every non-number value, so it is
 * safe to feed raw (unknown) config values straight into this check.
 */
function isPositiveFiniteTokenLimit(value: unknown): value is number {
  if (typeof value !== "number") {
    return false;
  }
  return Number.isFinite(value) && value > 0;
}
/**
 * Pick the token limit (contextWindow / maxTokens) to keep after a merge.
 *
 * Precedence:
 *   1. an explicitly configured usable value (key present AND positive finite),
 *   2. otherwise the usable implicit catalog value,
 *   3. otherwise a usable explicit value even when the key was not marked present,
 *   4. otherwise undefined (caller omits the field entirely).
 *
 * "Usable" means a finite number strictly greater than zero.
 */
function resolvePreferredTokenLimit(params: {
  explicitPresent: boolean;
  explicitValue: unknown;
  implicitValue: unknown;
}): number | undefined {
  // Local usability guard so each precedence branch reads uniformly.
  const usable = (candidate: unknown): candidate is number =>
    typeof candidate === "number" && Number.isFinite(candidate) && candidate > 0;

  const { explicitPresent, explicitValue, implicitValue } = params;
  if (explicitPresent && usable(explicitValue)) {
    return explicitValue;
  }
  if (usable(implicitValue)) {
    return implicitValue;
  }
  return usable(explicitValue) ? explicitValue : undefined;
}
function mergeProviderModels(implicit: ProviderConfig, explicit: ProviderConfig): ProviderConfig {
@@ -65,15 +77,23 @@ function mergeProviderModels(implicit: ProviderConfig, explicit: ProviderConfig)
// it in their config (key present), honour that value; otherwise fall back
// to the built-in catalog default so new reasoning models work out of the
// box without requiring every user to configure it.
const contextWindow = resolvePreferredTokenLimit({
explicitPresent: "contextWindow" in explicitModel,
explicitValue: explicitModel.contextWindow,
implicitValue: implicitModel.contextWindow,
});
const maxTokens = resolvePreferredTokenLimit({
explicitPresent: "maxTokens" in explicitModel,
explicitValue: explicitModel.maxTokens,
implicitValue: implicitModel.maxTokens,
});
return {
...explicitModel,
input: implicitModel.input,
reasoning: "reasoning" in explicitModel ? explicitModel.reasoning : implicitModel.reasoning,
contextWindow: resolvePreferredTokenLimit(
explicitModel.contextWindow,
implicitModel.contextWindow,
),
maxTokens: resolvePreferredTokenLimit(explicitModel.maxTokens, implicitModel.maxTokens),
...(contextWindow === undefined ? {} : { contextWindow }),
...(maxTokens === undefined ? {} : { maxTokens }),
};
});

View File

@@ -104,7 +104,23 @@ describe("buildAssistantMessage", () => {
expect(result.usage.totalTokens).toBe(15);
});
it("falls back to reasoning when content is empty", () => {
// A non-streaming response with empty `content` but a populated `thinking`
// field must surface the thinking text as the assistant reply instead of
// producing an empty message.
it("falls back to thinking when content is empty", () => {
const response = {
model: "qwen3:32b",
created_at: "2026-01-01T00:00:00Z",
message: {
role: "assistant" as const,
content: "",
thinking: "Thinking output",
},
done: true,
};
const result = buildAssistantMessage(response, modelInfo);
expect(result.stopReason).toBe("stop");
// The thinking text is promoted into the text content block.
expect(result.content).toEqual([{ type: "text", text: "Thinking output" }]);
});
it("falls back to reasoning when content and thinking are empty", () => {
const response = {
model: "qwen3:32b",
created_at: "2026-01-01T00:00:00Z",
@@ -397,7 +413,50 @@ describe("createOllamaStreamFn", () => {
);
});
it("accumulates reasoning chunks when content is empty", async () => {
// Streaming case: every chunk carries text only in `thinking` and `content`
// stays empty, so the concatenated thinking chunks become the final message.
// NOTE(review): withMockNdjsonFetch / createOllamaTestStream / collectStreamEvents
// are project test helpers — presumably they stub fetch with the NDJSON lines
// below; verify in their module.
it("accumulates thinking chunks when content is empty", async () => {
await withMockNdjsonFetch(
[
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"","thinking":"reasoned"},"done":false}',
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"","thinking":" output"},"done":false}',
'{"model":"m","created_at":"t","message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":1,"eval_count":2}',
],
async () => {
const stream = await createOllamaTestStream({ baseUrl: "http://ollama-host:11434" });
const events = await collectStreamEvents(stream);
const doneEvent = events.at(-1);
if (!doneEvent || doneEvent.type !== "done") {
throw new Error("Expected done event");
}
// "reasoned" + " output" concatenated in stream order.
expect(doneEvent.message.content).toEqual([{ type: "text", text: "reasoned output" }]);
},
);
});
// Once real `content` starts streaming, earlier `thinking` chunks are treated
// as internal reasoning and must not leak into the final reply.
it("prefers streamed content over earlier thinking chunks", async () => {
await withMockNdjsonFetch(
[
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"","thinking":"internal"},"done":false}',
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"final"},"done":false}',
'{"model":"m","created_at":"t","message":{"role":"assistant","content":" answer"},"done":false}',
'{"model":"m","created_at":"t","message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":1,"eval_count":2}',
],
async () => {
const stream = await createOllamaTestStream({ baseUrl: "http://ollama-host:11434" });
const events = await collectStreamEvents(stream);
const doneEvent = events.at(-1);
if (!doneEvent || doneEvent.type !== "done") {
throw new Error("Expected done event");
}
// Only the real content survives; "internal" thinking is dropped.
expect(doneEvent.message.content).toEqual([{ type: "text", text: "final answer" }]);
},
);
});
it("accumulates reasoning chunks when thinking is absent", async () => {
await withMockNdjsonFetch(
[
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"","reasoning":"reasoned"},"done":false}',
@@ -417,4 +476,26 @@ describe("createOllamaStreamFn", () => {
},
);
});
// Same guarantee for the legacy `reasoning` field: once real `content` streams,
// earlier reasoning chunks must not leak into the final reply.
it("prefers streamed content over earlier reasoning chunks", async () => {
await withMockNdjsonFetch(
[
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"","reasoning":"internal"},"done":false}',
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"final"},"done":false}',
'{"model":"m","created_at":"t","message":{"role":"assistant","content":" answer"},"done":false}',
'{"model":"m","created_at":"t","message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":1,"eval_count":2}',
],
async () => {
const stream = await createOllamaTestStream({ baseUrl: "http://ollama-host:11434" });
const events = await collectStreamEvents(stream);
const doneEvent = events.at(-1);
if (!doneEvent || doneEvent.type !== "done") {
throw new Error("Expected done event");
}
// Only the real content survives; "internal" reasoning is dropped.
expect(doneEvent.message.content).toEqual([{ type: "text", text: "final answer" }]);
},
);
});
});

View File

@@ -185,6 +185,7 @@ interface OllamaChatResponse {
message: {
role: "assistant";
content: string;
thinking?: string;
reasoning?: string;
tool_calls?: OllamaToolCall[];
};
@@ -323,10 +324,10 @@ export function buildAssistantMessage(
): AssistantMessage {
const content: (TextContent | ToolCall)[] = [];
// Qwen 3 (and potentially other reasoning models) may return their final
// answer in a `reasoning` field with an empty `content`. Fall back to
// `reasoning` so the response isn't silently dropped.
const text = response.message.content || response.message.reasoning || "";
// Ollama-native reasoning models may emit their answer in `thinking` or
// `reasoning` with an empty `content`. Fall back so replies are not dropped.
const text =
response.message.content || response.message.thinking || response.message.reasoning || "";
if (text) {
content.push({ type: "text", text });
}
@@ -468,15 +469,20 @@ export function createOllamaStreamFn(
const reader = response.body.getReader();
let accumulatedContent = "";
let fallbackContent = "";
let sawContent = false;
const accumulatedToolCalls: OllamaToolCall[] = [];
let finalResponse: OllamaChatResponse | undefined;
for await (const chunk of parseNdjsonStream(reader)) {
if (chunk.message?.content) {
sawContent = true;
accumulatedContent += chunk.message.content;
} else if (chunk.message?.reasoning) {
// Qwen 3 reasoning mode: content may be empty, output in reasoning
accumulatedContent += chunk.message.reasoning;
} else if (!sawContent && chunk.message?.thinking) {
fallbackContent += chunk.message.thinking;
} else if (!sawContent && chunk.message?.reasoning) {
// Backward compatibility for older/native variants that still use reasoning.
fallbackContent += chunk.message.reasoning;
}
// Ollama sends tool_calls in intermediate (done:false) chunks,
@@ -495,7 +501,7 @@ export function createOllamaStreamFn(
throw new Error("Ollama API stream ended without a final response");
}
finalResponse.message.content = accumulatedContent;
finalResponse.message.content = accumulatedContent || fallbackContent;
if (accumulatedToolCalls.length > 0) {
finalResponse.message.tool_calls = accumulatedToolCalls;
}