mirror of
https://github.com/openclaw/openclaw.git
synced 2026-03-12 15:30:39 +00:00
fix(ollama): preserve local limits and native thinking fallback (#39292)
* fix(ollama): support thinking field fallback in native stream * fix(models): honor explicit lower token limits in merge mode * fix(ollama): prefer streamed content over fallback thinking * changelog: note Ollama local model fixes
This commit is contained in:
@@ -220,6 +220,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Plugins/HTTP route migration diagnostics: rewrite legacy `api.registerHttpHandler(...)` loader failures into actionable migration guidance so doctor/plugin diagnostics point operators to `api.registerHttpRoute(...)` or `registerPluginHttpRoute(...)`. (#36794) Thanks @vincentkoc.
|
||||
- Doctor/Heartbeat upgrade diagnostics: warn when heartbeat delivery is configured with an implicit `directPolicy` so upgrades pin direct/DM behavior explicitly instead of relying on the current default. (#36789) Thanks @vincentkoc.
|
||||
- Agents/current-time UTC anchor: append a machine-readable UTC suffix alongside local `Current time:` lines in shared cron-style prompt contexts so agents can compare UTC-stamped workspace timestamps without doing timezone math. (#32423) Thanks @jriff.
|
||||
- Ollama/local model handling: preserve explicit lower `contextWindow` / `maxTokens` overrides during merge refresh, and keep native Ollama streamed replies from surfacing fallback `thinking` / `reasoning` text once real content starts streaming. (#39292) Thanks @vincentkoc.
|
||||
- TUI/webchat command-owner scope alignment: treat internal-channel gateway sessions with `operator.admin` as owner-authorized in command auth, restoring cron/gateway/connector tool access for affected TUI/webchat sessions while keeping external channels on identity-based owner checks. (from #35666, #35673, #35704) Thanks @Naylenv, @Octane0411, and @Sid-Qin.
|
||||
- Discord/inbound timeout isolation: separate inbound worker timeout tracking from listener timeout budgets so queued Discord replies are no longer dropped when listener watchdog windows expire mid-run. (#36602) Thanks @dutifulbob.
|
||||
- Memory/doctor SecretRef handling: treat SecretRef-backed memory-search API keys as configured, and fail embedding setup with explicit unresolved-secret errors instead of crashing. (#36835) Thanks @joshavant.
|
||||
|
||||
@@ -372,7 +372,7 @@ describe("models-config", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("refreshes stale explicit moonshot model capabilities from implicit catalog", async () => {
|
||||
it("refreshes moonshot capabilities while preserving explicit token limits", async () => {
|
||||
await withTempHome(async () => {
|
||||
await withEnvVar("MOONSHOT_API_KEY", "sk-moonshot-test", async () => {
|
||||
const cfg = createMoonshotConfig({ contextWindow: 1024, maxTokens: 256 });
|
||||
@@ -397,8 +397,8 @@ describe("models-config", () => {
|
||||
const kimi = parsed.providers.moonshot?.models?.find((model) => model.id === "kimi-k2.5");
|
||||
expect(kimi?.input).toEqual(["text", "image"]);
|
||||
expect(kimi?.reasoning).toBe(false);
|
||||
expect(kimi?.contextWindow).toBe(256000);
|
||||
expect(kimi?.maxTokens).toBe(8192);
|
||||
expect(kimi?.contextWindow).toBe(1024);
|
||||
expect(kimi?.maxTokens).toBe(256);
|
||||
// Preserve explicit user pricing overrides when refreshing capabilities.
|
||||
expect(kimi?.cost?.input).toBe(123);
|
||||
expect(kimi?.cost?.output).toBe(456);
|
||||
@@ -464,4 +464,29 @@ describe("models-config", () => {
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
it("falls back to implicit token limits when explicit values are invalid", async () => {
|
||||
await withTempHome(async () => {
|
||||
await withEnvVar("MOONSHOT_API_KEY", "sk-moonshot-test", async () => {
|
||||
const cfg = createMoonshotConfig({ contextWindow: 0, maxTokens: -1 });
|
||||
|
||||
await ensureOpenClawModelsJson(cfg);
|
||||
const parsed = await readGeneratedModelsJson<{
|
||||
providers: Record<
|
||||
string,
|
||||
{
|
||||
models?: Array<{
|
||||
id: string;
|
||||
contextWindow?: number;
|
||||
maxTokens?: number;
|
||||
}>;
|
||||
}
|
||||
>;
|
||||
}>();
|
||||
const kimi = parsed.providers.moonshot?.models?.find((model) => model.id === "kimi-k2.5");
|
||||
expect(kimi?.contextWindow).toBe(256000);
|
||||
expect(kimi?.maxTokens).toBe(8192);
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -23,10 +23,22 @@ type ModelsConfig = NonNullable<OpenClawConfig["models"]>;
|
||||
const DEFAULT_MODE: NonNullable<ModelsConfig["mode"]> = "merge";
|
||||
const MODELS_JSON_WRITE_LOCKS = new Map<string, Promise<void>>();
|
||||
|
||||
function resolvePreferredTokenLimit(explicitValue: number, implicitValue: number): number {
|
||||
// Keep catalog refresh behavior for stale low values while preserving
|
||||
// intentional larger user overrides (for example Ollama >128k contexts).
|
||||
return explicitValue > implicitValue ? explicitValue : implicitValue;
|
||||
function isPositiveFiniteTokenLimit(value: unknown): value is number {
|
||||
return typeof value === "number" && Number.isFinite(value) && value > 0;
|
||||
}
|
||||
|
||||
function resolvePreferredTokenLimit(params: {
|
||||
explicitPresent: boolean;
|
||||
explicitValue: unknown;
|
||||
implicitValue: unknown;
|
||||
}): number | undefined {
|
||||
if (params.explicitPresent && isPositiveFiniteTokenLimit(params.explicitValue)) {
|
||||
return params.explicitValue;
|
||||
}
|
||||
if (isPositiveFiniteTokenLimit(params.implicitValue)) {
|
||||
return params.implicitValue;
|
||||
}
|
||||
return isPositiveFiniteTokenLimit(params.explicitValue) ? params.explicitValue : undefined;
|
||||
}
|
||||
|
||||
function mergeProviderModels(implicit: ProviderConfig, explicit: ProviderConfig): ProviderConfig {
|
||||
@@ -65,15 +77,23 @@ function mergeProviderModels(implicit: ProviderConfig, explicit: ProviderConfig)
|
||||
// it in their config (key present), honour that value; otherwise fall back
|
||||
// to the built-in catalog default so new reasoning models work out of the
|
||||
// box without requiring every user to configure it.
|
||||
const contextWindow = resolvePreferredTokenLimit({
|
||||
explicitPresent: "contextWindow" in explicitModel,
|
||||
explicitValue: explicitModel.contextWindow,
|
||||
implicitValue: implicitModel.contextWindow,
|
||||
});
|
||||
const maxTokens = resolvePreferredTokenLimit({
|
||||
explicitPresent: "maxTokens" in explicitModel,
|
||||
explicitValue: explicitModel.maxTokens,
|
||||
implicitValue: implicitModel.maxTokens,
|
||||
});
|
||||
|
||||
return {
|
||||
...explicitModel,
|
||||
input: implicitModel.input,
|
||||
reasoning: "reasoning" in explicitModel ? explicitModel.reasoning : implicitModel.reasoning,
|
||||
contextWindow: resolvePreferredTokenLimit(
|
||||
explicitModel.contextWindow,
|
||||
implicitModel.contextWindow,
|
||||
),
|
||||
maxTokens: resolvePreferredTokenLimit(explicitModel.maxTokens, implicitModel.maxTokens),
|
||||
...(contextWindow === undefined ? {} : { contextWindow }),
|
||||
...(maxTokens === undefined ? {} : { maxTokens }),
|
||||
};
|
||||
});
|
||||
|
||||
|
||||
@@ -104,7 +104,23 @@ describe("buildAssistantMessage", () => {
|
||||
expect(result.usage.totalTokens).toBe(15);
|
||||
});
|
||||
|
||||
it("falls back to reasoning when content is empty", () => {
|
||||
it("falls back to thinking when content is empty", () => {
|
||||
const response = {
|
||||
model: "qwen3:32b",
|
||||
created_at: "2026-01-01T00:00:00Z",
|
||||
message: {
|
||||
role: "assistant" as const,
|
||||
content: "",
|
||||
thinking: "Thinking output",
|
||||
},
|
||||
done: true,
|
||||
};
|
||||
const result = buildAssistantMessage(response, modelInfo);
|
||||
expect(result.stopReason).toBe("stop");
|
||||
expect(result.content).toEqual([{ type: "text", text: "Thinking output" }]);
|
||||
});
|
||||
|
||||
it("falls back to reasoning when content and thinking are empty", () => {
|
||||
const response = {
|
||||
model: "qwen3:32b",
|
||||
created_at: "2026-01-01T00:00:00Z",
|
||||
@@ -397,7 +413,50 @@ describe("createOllamaStreamFn", () => {
|
||||
);
|
||||
});
|
||||
|
||||
it("accumulates reasoning chunks when content is empty", async () => {
|
||||
it("accumulates thinking chunks when content is empty", async () => {
|
||||
await withMockNdjsonFetch(
|
||||
[
|
||||
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"","thinking":"reasoned"},"done":false}',
|
||||
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"","thinking":" output"},"done":false}',
|
||||
'{"model":"m","created_at":"t","message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":1,"eval_count":2}',
|
||||
],
|
||||
async () => {
|
||||
const stream = await createOllamaTestStream({ baseUrl: "http://ollama-host:11434" });
|
||||
const events = await collectStreamEvents(stream);
|
||||
|
||||
const doneEvent = events.at(-1);
|
||||
if (!doneEvent || doneEvent.type !== "done") {
|
||||
throw new Error("Expected done event");
|
||||
}
|
||||
|
||||
expect(doneEvent.message.content).toEqual([{ type: "text", text: "reasoned output" }]);
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
it("prefers streamed content over earlier thinking chunks", async () => {
|
||||
await withMockNdjsonFetch(
|
||||
[
|
||||
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"","thinking":"internal"},"done":false}',
|
||||
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"final"},"done":false}',
|
||||
'{"model":"m","created_at":"t","message":{"role":"assistant","content":" answer"},"done":false}',
|
||||
'{"model":"m","created_at":"t","message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":1,"eval_count":2}',
|
||||
],
|
||||
async () => {
|
||||
const stream = await createOllamaTestStream({ baseUrl: "http://ollama-host:11434" });
|
||||
const events = await collectStreamEvents(stream);
|
||||
|
||||
const doneEvent = events.at(-1);
|
||||
if (!doneEvent || doneEvent.type !== "done") {
|
||||
throw new Error("Expected done event");
|
||||
}
|
||||
|
||||
expect(doneEvent.message.content).toEqual([{ type: "text", text: "final answer" }]);
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
it("accumulates reasoning chunks when thinking is absent", async () => {
|
||||
await withMockNdjsonFetch(
|
||||
[
|
||||
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"","reasoning":"reasoned"},"done":false}',
|
||||
@@ -417,4 +476,26 @@ describe("createOllamaStreamFn", () => {
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
it("prefers streamed content over earlier reasoning chunks", async () => {
|
||||
await withMockNdjsonFetch(
|
||||
[
|
||||
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"","reasoning":"internal"},"done":false}',
|
||||
'{"model":"m","created_at":"t","message":{"role":"assistant","content":"final"},"done":false}',
|
||||
'{"model":"m","created_at":"t","message":{"role":"assistant","content":" answer"},"done":false}',
|
||||
'{"model":"m","created_at":"t","message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":1,"eval_count":2}',
|
||||
],
|
||||
async () => {
|
||||
const stream = await createOllamaTestStream({ baseUrl: "http://ollama-host:11434" });
|
||||
const events = await collectStreamEvents(stream);
|
||||
|
||||
const doneEvent = events.at(-1);
|
||||
if (!doneEvent || doneEvent.type !== "done") {
|
||||
throw new Error("Expected done event");
|
||||
}
|
||||
|
||||
expect(doneEvent.message.content).toEqual([{ type: "text", text: "final answer" }]);
|
||||
},
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -185,6 +185,7 @@ interface OllamaChatResponse {
|
||||
message: {
|
||||
role: "assistant";
|
||||
content: string;
|
||||
thinking?: string;
|
||||
reasoning?: string;
|
||||
tool_calls?: OllamaToolCall[];
|
||||
};
|
||||
@@ -323,10 +324,10 @@ export function buildAssistantMessage(
|
||||
): AssistantMessage {
|
||||
const content: (TextContent | ToolCall)[] = [];
|
||||
|
||||
// Qwen 3 (and potentially other reasoning models) may return their final
|
||||
// answer in a `reasoning` field with an empty `content`. Fall back to
|
||||
// `reasoning` so the response isn't silently dropped.
|
||||
const text = response.message.content || response.message.reasoning || "";
|
||||
// Ollama-native reasoning models may emit their answer in `thinking` or
|
||||
// `reasoning` with an empty `content`. Fall back so replies are not dropped.
|
||||
const text =
|
||||
response.message.content || response.message.thinking || response.message.reasoning || "";
|
||||
if (text) {
|
||||
content.push({ type: "text", text });
|
||||
}
|
||||
@@ -468,15 +469,20 @@ export function createOllamaStreamFn(
|
||||
|
||||
const reader = response.body.getReader();
|
||||
let accumulatedContent = "";
|
||||
let fallbackContent = "";
|
||||
let sawContent = false;
|
||||
const accumulatedToolCalls: OllamaToolCall[] = [];
|
||||
let finalResponse: OllamaChatResponse | undefined;
|
||||
|
||||
for await (const chunk of parseNdjsonStream(reader)) {
|
||||
if (chunk.message?.content) {
|
||||
sawContent = true;
|
||||
accumulatedContent += chunk.message.content;
|
||||
} else if (chunk.message?.reasoning) {
|
||||
// Qwen 3 reasoning mode: content may be empty, output in reasoning
|
||||
accumulatedContent += chunk.message.reasoning;
|
||||
} else if (!sawContent && chunk.message?.thinking) {
|
||||
fallbackContent += chunk.message.thinking;
|
||||
} else if (!sawContent && chunk.message?.reasoning) {
|
||||
// Backward compatibility for older/native variants that still use reasoning.
|
||||
fallbackContent += chunk.message.reasoning;
|
||||
}
|
||||
|
||||
// Ollama sends tool_calls in intermediate (done:false) chunks,
|
||||
@@ -495,7 +501,7 @@ export function createOllamaStreamFn(
|
||||
throw new Error("Ollama API stream ended without a final response");
|
||||
}
|
||||
|
||||
finalResponse.message.content = accumulatedContent;
|
||||
finalResponse.message.content = accumulatedContent || fallbackContent;
|
||||
if (accumulatedToolCalls.length > 0) {
|
||||
finalResponse.message.tool_calls = accumulatedToolCalls;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user