diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a49b02b375..70379cbc890 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -51,6 +51,7 @@ Docs: https://docs.openclaw.ai - Logging: write validated diagnostic trace context as top-level `traceId`, `spanId`, `parentSpanId`, and `traceFlags` fields in file-log JSONL records so traced requests and model calls are easier to correlate in log processors. Refs #40353. Thanks @liangruochong44-ui. - Logging/sessions: apply configured redaction patterns to persisted session transcript text and accept escaped character classes in safe custom redaction regexes, so transcript JSONL no longer keeps matching sensitive text in the clear. Fixes #42982. Thanks @panpan0000. - Providers/Ollama: honor `/api/show` capabilities when registering local models so non-tool Ollama models no longer receive the agent tool surface, and keep native Ollama thinking opt-in instead of enabling it by default. Fixes #64710 and duplicate #65343. Thanks @yuan-b, @netherby, @xilopaint, and @Diyforfun2026. +- Image tool/media: honor `tools.media.image.timeoutSeconds` and matching per-model image timeouts in explicit image analysis, including the MiniMax VLM fallback path, so slow local vision models are not capped by hardcoded 30s/60s aborts. Fixes #67889; supersedes #67929. Thanks @AllenT22 and @alchip. - Providers/Ollama: read larger custom Modelfile `PARAMETER num_ctx` values from `/api/show` so auto-discovered Ollama models with expanded context no longer stay pinned to the base model context. Fixes #68344. Thanks @neeravmakwana. - Providers/Ollama: honor configured model `params.num_ctx` in native and OpenAI-compatible Ollama requests so local models can cap runtime context without rebuilding Modelfiles. Fixes #44550 and #52206; supersedes #69464. Thanks @taitruong, @armi0024, and @LokiCode404. - Providers/Ollama: forward whitelisted native Ollama model params such as `temperature`, `top_p`, and top-level `think` so users can disable API-level thinking or tune local models from config without proxy shims. Fixes #48010. Thanks @tangzhi, @pandego, @maweibin, @Adam-Researchh, and @EmpireCreator. diff --git a/docs/gateway/config-tools.md b/docs/gateway/config-tools.md index 8504d83ad9b..f0149cc7392 100644 --- a/docs/gateway/config-tools.md +++ b/docs/gateway/config-tools.md @@ -215,6 +215,11 @@ Configures inbound media understanding (image/audio/video): { type: "cli", command: "whisper", args: ["--model", "base", "{{MediaPath}}"] }, ], }, + image: { + enabled: true, + timeoutSeconds: 180, + models: [{ provider: "ollama", model: "gemma4:26b", timeoutSeconds: 300 }], + }, video: { enabled: true, maxBytes: 52428800, @@ -242,6 +247,7 @@ Configures inbound media understanding (image/audio/video): - `capabilities`: optional list (`image`, `audio`, `video`). Defaults: `openai`/`anthropic`/`minimax` → image, `google` → image+audio+video, `groq` → audio. - `prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`: per-entry overrides. + - `tools.media.image.timeoutSeconds` and matching image model `timeoutSeconds` entries also apply when the agent calls the explicit `image` tool. - Failures fall back to the next entry. Provider auth follows standard order: `auth-profiles.json` → env vars → `models.providers.*.apiKey`. diff --git a/docs/providers/ollama.md b/docs/providers/ollama.md index ee5a81c35ad..693c56129d9 100644 --- a/docs/providers/ollama.md +++ b/docs/providers/ollama.md @@ -241,6 +241,44 @@ To make Ollama the default image-understanding model for inbound media, configur } ``` +Slow local vision models can need a longer image-understanding timeout than cloud models. They can also crash or stop when Ollama tries to allocate the full advertised vision context on constrained hardware. Set a capability timeout, and cap `num_ctx` on the model entry when you only need a normal image-description turn: + +```json5 +{ + models: { + providers: { + ollama: { + models: [ + { + id: "qwen2.5vl:7b", + name: "qwen2.5vl:7b", + input: ["text", "image"], + params: { num_ctx: 2048, keep_alive: "1m" }, + }, + ], + }, + }, + }, + tools: { + media: { + image: { + timeoutSeconds: 180, + models: [{ provider: "ollama", model: "qwen2.5vl:7b", timeoutSeconds: 300 }], + }, + }, + }, +} +``` + +This timeout applies to inbound image understanding and to the explicit `image` tool the agent can call during a turn. Provider-level `models.providers.ollama.timeoutSeconds` still controls the underlying Ollama HTTP request guard for normal model calls. + +Live-verify the explicit image tool against local Ollama with: + +```bash +OPENCLAW_LIVE_TEST=1 OPENCLAW_LIVE_OLLAMA_IMAGE=1 \ + pnpm test:live -- src/agents/tools/image-tool.ollama.live.test.ts +``` + If you define `models.providers.ollama.models` manually, mark vision models with image input support: ```json5 diff --git a/src/agents/minimax-vlm.normalizes-api-key.test.ts b/src/agents/minimax-vlm.normalizes-api-key.test.ts index ac413368b8c..3e377a932b1 100644 --- a/src/agents/minimax-vlm.normalizes-api-key.test.ts +++ b/src/agents/minimax-vlm.normalizes-api-key.test.ts @@ -80,6 +80,29 @@ describe("minimaxUnderstandImage apiKey normalization", () => { expect(fetchSpy).toHaveBeenCalledOnce(); }); + + it("uses the caller-provided request timeout", async () => { + const timeoutSpy = vi.spyOn(AbortSignal, "timeout"); + const fetchSpy = vi.fn(async () => { + return new Response(apiResponse, { + status: 200, + headers: { "Content-Type": "application/json" }, + }); + }); + global.fetch = withFetchPreconnect(fetchSpy); + + await expect( + minimaxUnderstandImage({ + apiKey: "minimax-test-key", + prompt: "hi", + imageDataUrl: "data:image/png;base64,AAAA", + apiHost: "https://api.minimax.io", + timeoutMs: 180_000, + }), + ).resolves.toBe("ok"); + + expect(timeoutSpy).toHaveBeenCalledWith(180_000); + }); }); describe("isMinimaxVlmModel", () => { diff --git a/src/agents/minimax-vlm.ts b/src/agents/minimax-vlm.ts index 1b8d825c14d..bde911a17e0 100644 --- a/src/agents/minimax-vlm.ts +++ b/src/agents/minimax-vlm.ts @@ -51,6 +51,7 @@ export async function minimaxUnderstandImage(params: { imageDataUrl: string; apiHost?: string; modelBaseUrl?: string; + timeoutMs?: number; }): Promise { const apiKey = normalizeSecretInput(params.apiKey); if (!apiKey) { @@ -78,6 +79,13 @@ export async function minimaxUnderstandImage(params: { // Without this, HTTP_PROXY/HTTPS_PROXY env vars are silently ignored (#51619). ensureGlobalUndiciEnvProxyDispatcher(); + const timeoutMs = + typeof params.timeoutMs === "number" && + Number.isFinite(params.timeoutMs) && + params.timeoutMs > 0 + ? Math.floor(params.timeoutMs) + : 60_000; + const res = await fetch(url, { method: "POST", headers: { @@ -85,7 +93,7 @@ export async function minimaxUnderstandImage(params: { "Content-Type": "application/json", "MM-API-Source": "OpenClaw", }, - signal: AbortSignal.timeout(60_000), + signal: AbortSignal.timeout(timeoutMs), body: JSON.stringify({ prompt, image_url: imageDataUrl, diff --git a/src/agents/tools/image-tool.ollama.live.test.ts b/src/agents/tools/image-tool.ollama.live.test.ts new file mode 100644 index 00000000000..6cb41b81880 --- /dev/null +++ b/src/agents/tools/image-tool.ollama.live.test.ts @@ -0,0 +1,99 @@ +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { describe, expect, it } from "vitest"; +import type { OpenClawConfig } from "../../config/types.openclaw.js"; +import { createImageTool } from "./image-tool.js"; + +const LIVE = + process.env.OPENCLAW_LIVE_TEST === "1" && process.env.OPENCLAW_LIVE_OLLAMA_IMAGE === "1"; +const OLLAMA_BASE_URL = + process.env.OPENCLAW_LIVE_OLLAMA_BASE_URL?.trim() || "http://127.0.0.1:11434"; +const OLLAMA_IMAGE_MODEL = process.env.OPENCLAW_LIVE_OLLAMA_IMAGE_MODEL?.trim() || "qwen2.5vl:7b"; + +function resolveLiveNumCtx(): number { + const parsed = Number.parseInt(process.env.OPENCLAW_LIVE_OLLAMA_IMAGE_NUM_CTX ?? "2048", 10); + return Number.isFinite(parsed) ? Math.max(512, parsed) : 2048; +} + +const OLLAMA_IMAGE_NUM_CTX = resolveLiveNumCtx(); + +const VALID_RED_PNG_B64 = + "iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAIGNIUk0AAHomAACAhAAA+gAAAIDoAAB1MAAA6mAAADqYAAAXcJy6UTwAAAAGYktHRAD/AP8A/6C9p5MAAAAHdElNRQfqBBsGAQr00ED3AAAAJXRFWHRkYXRlOmNyZWF0ZQAyMDI2LTA0LTI3VDA2OjAxOjEwKzAwOjAwPU3tXwAAACV0RVh0ZGF0ZTptb2RpZnkAMjAyNi0wNC0yN1QwNjowMToxMCswMDowMEwQVeMAAAAodEVYdGRhdGU6dGltZXN0YW1wADIwMjYtMDQtMjdUMDY6MDE6MTArMDA6MDAbBXQ8AAAAeElEQVRo3u3awQnDQBAEwT2Q8w/YAikIP5rF1RFMca+FO8/s7rrnqjcA1BsA6g0A9QaAesOfA77zqTf8Blj/AgAAAAAAAJsDqAOoA6gDqAOoc9TXAdQB1AHUAdQB1AHUAdQB1AHU7Qc46gEAAAAANrcecGZ2f8B/ASYSQPlKoEJ/AAAAAElFTkSuQmCC"; + +async function withLiveImageWorkspace( + run: (ctx: { agentDir: string; workspaceDir: string; imagePath: string }) => Promise, +) { + const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-ollama-image-live-")); + try { + const agentDir = path.join(root, "agent"); + const workspaceDir = path.join(root, "workspace"); + await fs.mkdir(agentDir, { recursive: true }); + await fs.mkdir(workspaceDir, { recursive: true }); + const imagePath = path.join(workspaceDir, "red.png"); + await fs.writeFile(imagePath, Buffer.from(VALID_RED_PNG_B64, "base64")); + return await run({ agentDir, workspaceDir, imagePath }); + } finally { + await fs.rm(root, { recursive: true, force: true }); + } +} + +describe.skipIf(!LIVE)("image tool Ollama live", () => { + it("describes a local image through the explicit image tool", async () => { + process.env.OLLAMA_API_KEY ||= "ollama-local"; + await withLiveImageWorkspace(async ({ agentDir, workspaceDir, imagePath }) => { + const cfg: OpenClawConfig = { + agents: { + defaults: { + imageModel: { primary: `ollama/${OLLAMA_IMAGE_MODEL}` }, + }, + }, + models: { + providers: { + ollama: { + api: "ollama", + baseUrl: OLLAMA_BASE_URL, + apiKey: "ollama-local", + timeoutSeconds: 300, + models: [ + { + id: OLLAMA_IMAGE_MODEL, + name: OLLAMA_IMAGE_MODEL, + input: ["text", "image"], + reasoning: false, + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, + contextWindow: 128_000, + maxTokens: 512, + params: { num_ctx: OLLAMA_IMAGE_NUM_CTX, keep_alive: "1m" }, + }, + ], + }, + }, + }, + tools: { + media: { + image: { + timeoutSeconds: 180, + models: [{ provider: "ollama", model: OLLAMA_IMAGE_MODEL, timeoutSeconds: 300 }], + }, + }, + }, + }; + const tool = createImageTool({ config: cfg, agentDir, workspaceDir }); + expect(tool).not.toBeNull(); + + const result = await tool!.execute("live-ollama-image", { + prompt: "Describe this image in one short sentence.", + image: imagePath, + }); + + expect(result).toMatchObject({ + content: [expect.objectContaining({ type: "text" })], + }); + const text = ( + result as { content?: Array<{ type?: string; text?: string }> } + ).content?.[0]?.text?.trim(); + expect(text?.length ?? 0).toBeGreaterThan(0); + }); + }, 180_000); +}); diff --git a/src/agents/tools/image-tool.test.ts b/src/agents/tools/image-tool.test.ts index 2009e890b0b..30117854ca6 100644 --- a/src/agents/tools/image-tool.test.ts +++ b/src/agents/tools/image-tool.test.ts @@ -213,7 +213,7 @@ async function withTempAgentDir(run: (agentDir: string) => Promise): Promi } const ONE_PIXEL_PNG_B64 = - "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII="; + "iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAIGNIUk0AAHomAACAhAAA+gAAAIDoAAB1MAAA6mAAADqYAAAXcJy6UTwAAAAGYktHRAD/AP8A/6C9p5MAAAAHdElNRQfqBBsGAQr00ED3AAAAJXRFWHRkYXRlOmNyZWF0ZQAyMDI2LTA0LTI3VDA2OjAxOjEwKzAwOjAwPU3tXwAAACV0RVh0ZGF0ZTptb2RpZnkAMjAyNi0wNC0yN1QwNjowMToxMCswMDowMEwQVeMAAAAodEVYdGRhdGU6dGltZXN0YW1wADIwMjYtMDQtMjdUMDY6MDE6MTArMDA6MDAbBXQ8AAAAeElEQVRo3u3awQnDQBAEwT2Q8w/YAikIP5rF1RFMca+FO8/s7rrnqjcA1BsA6g0A9QaAesOfA77zqTf8Blj/AgAAAAAAAJsDqAOoA6gDqAOoc9TXAdQB1AHUAdQB1AHUAdQB1AHU7Qc46gEAAAAANrcecGZ2f8B/ASYSQPlKoEJ/AAAAAElFTkSuQmCC"; const ONE_PIXEL_GIF_B64 = "R0lGODlhAQABAIABAP///wAAACwAAAAAAQABAAACAkQBADs="; const ONE_PIXEL_JPEG_B64 = "QUJDRA=="; @@ -671,6 +671,81 @@ describe("image tool implicit imageModel config", () => { }); }); + it("passes the configured image timeout to provider calls", async () => { + await withTempWorkspacePng(async ({ workspaceDir, imagePath }) => { + await withTempAgentDir(async (agentDir) => { + const describeImage = vi.fn(async (params: ImageDescriptionRequest) => ({ + text: "ok", + model: params.model, + })); + installImageUnderstandingProviderStubs({ + id: "ollama", + capabilities: ["image"], + describeImage, + }); + const cfg: OpenClawConfig = { + agents: { + defaults: { + imageModel: { primary: "ollama/gemma4:26b-a4b-it-q4_K_M" }, + }, + }, + tools: { + media: { + image: { timeoutSeconds: 180 }, + }, + }, + }; + const tool = createRequiredImageTool({ config: cfg, agentDir, workspaceDir }); + + await expectImageToolExecOk(tool, imagePath); + + expect(describeImage).toHaveBeenCalledWith(expect.objectContaining({ timeoutMs: 180_000 })); + }); + }); + }); + + it("prefers a matching per-image-model timeout over the capability timeout", async () => { + await withTempWorkspacePng(async ({ workspaceDir, imagePath }) => { + await withTempAgentDir(async (agentDir) => { + const describeImage = vi.fn(async (params: ImageDescriptionRequest) => ({ + text: "ok", + model: params.model, + })); + installImageUnderstandingProviderStubs({ + id: "ollama", + capabilities: ["image"], + describeImage, + }); + const cfg: OpenClawConfig = { + agents: { + defaults: { + imageModel: { primary: "ollama/gemma4:26b-a4b-it-q4_K_M" }, + }, + }, + tools: { + media: { + image: { + timeoutSeconds: 180, + models: [ + { + provider: "ollama", + model: "gemma4:26b-a4b-it-q4_K_M", + timeoutSeconds: 300, + }, + ], + }, + }, + }, + }; + const tool = createRequiredImageTool({ config: cfg, agentDir, workspaceDir }); + + await expectImageToolExecOk(tool, imagePath); + + expect(describeImage).toHaveBeenCalledWith(expect.objectContaining({ timeoutMs: 300_000 })); + }); + }); + }); + it("pairs minimax-portal primary with MiniMax-VL-01 (and fallbacks) when auth exists", async () => { await withTempAgentDir(async (agentDir) => { await writeAuthProfiles(agentDir, { diff --git a/src/agents/tools/image-tool.ts b/src/agents/tools/image-tool.ts index a9ef33e4f7b..4dfcc521b44 100644 --- a/src/agents/tools/image-tool.ts +++ b/src/agents/tools/image-tool.ts @@ -1,11 +1,16 @@ import { resolve, isAbsolute } from "node:path"; import { Type } from "typebox"; import type { OpenClawConfig } from "../../config/types.openclaw.js"; +import type { MediaUnderstandingModelConfig } from "../../config/types.tools.js"; import { + DEFAULT_TIMEOUT_SECONDS, resolveAutoMediaKeyProviders, resolveDefaultMediaModel, } from "../../media-understanding/defaults.js"; +import { matchesMediaEntryCapability } from "../../media-understanding/entry-capabilities.js"; +import { normalizeMediaProviderId } from "../../media-understanding/provider-id.js"; import { getMediaUnderstandingProvider } from "../../media-understanding/provider-registry.js"; +import { resolveTimeoutMs } from "../../media-understanding/resolve.js"; import { buildProviderRegistry } from "../../media-understanding/runner.js"; import { classifyMediaReferenceSource, @@ -177,6 +182,70 @@ function pickMaxBytes(cfg?: OpenClawConfig, maxBytesMb?: number): number | undef return undefined; } +function matchesImageTimeoutEntry(params: { + entry: MediaUnderstandingModelConfig; + source: "capability" | "shared"; + provider: string; + model: string; + providerRegistry: Map; +}): boolean { + const configuredProvider = normalizeMediaProviderId(params.entry.provider ?? ""); + const selectedProvider = normalizeMediaProviderId(params.provider); + if (!configuredProvider || configuredProvider !== selectedProvider) { + return false; + } + if ( + !matchesMediaEntryCapability({ + entry: params.entry, + source: params.source, + capability: "image", + providerRegistry: params.providerRegistry, + }) + ) { + return false; + } + const configuredModel = params.entry.model?.trim(); + if (!configuredModel) { + return true; + } + const providerPrefix = `${selectedProvider}/`; + const normalizedConfiguredModel = configuredModel.startsWith(providerPrefix) + ? configuredModel.slice(providerPrefix.length) + : configuredModel; + return normalizedConfiguredModel === params.model; +} + +function resolveImageToolTimeoutMs(params: { + cfg: OpenClawConfig; + provider: string; + model: string; + providerRegistry: Map; +}): number { + const imageConfig = params.cfg.tools?.media?.image; + const capabilityEntry = imageConfig?.models?.find((entry) => + matchesImageTimeoutEntry({ + entry, + source: "capability", + provider: params.provider, + model: params.model, + providerRegistry: params.providerRegistry, + }), + ); + const sharedEntry = params.cfg.tools?.media?.models?.find((entry) => + matchesImageTimeoutEntry({ + entry, + source: "shared", + provider: params.provider, + model: params.model, + providerRegistry: params.providerRegistry, + }), + ); + return resolveTimeoutMs( + capabilityEntry?.timeoutSeconds ?? sharedEntry?.timeoutSeconds ?? imageConfig?.timeoutSeconds, + DEFAULT_TIMEOUT_SECONDS.image, + ); +} + type ImageSandboxConfig = { root: string; bridge: SandboxFsBridge; @@ -203,6 +272,12 @@ async function runImagePrompt(params: { cfg: effectiveCfg, modelOverride: params.modelOverride, run: async (provider, modelId) => { + const timeoutMs = resolveImageToolTimeoutMs({ + cfg: providerCfg, + provider, + model: modelId, + providerRegistry: providerRegistry as Map, + }); const imageProvider = imageToolProviderDeps.getMediaUnderstandingProvider( provider, providerRegistry as Map, @@ -223,7 +298,7 @@ async function runImagePrompt(params: { model: modelId, prompt: params.prompt, maxTokens: resolveImageToolMaxTokens(undefined), - timeoutMs: 30_000, + timeoutMs, cfg: providerCfg, agentDir: params.agentDir, }); @@ -241,7 +316,7 @@ async function runImagePrompt(params: { model: modelId, prompt: params.prompt, maxTokens: resolveImageToolMaxTokens(undefined), - timeoutMs: 30_000, + timeoutMs, cfg: providerCfg, agentDir: params.agentDir, }); @@ -258,7 +333,7 @@ async function runImagePrompt(params: { model: modelId, prompt: `${params.prompt}\n\nDescribe image ${index + 1} of ${params.images.length}.`, maxTokens: resolveImageToolMaxTokens(undefined), - timeoutMs: 30_000, + timeoutMs, cfg: providerCfg, agentDir: params.agentDir, }); diff --git a/src/media-understanding/image.test.ts b/src/media-understanding/image.test.ts index dee9ec0384c..c2aa7187b9e 100644 --- a/src/media-understanding/image.test.ts +++ b/src/media-understanding/image.test.ts @@ -123,6 +123,7 @@ describe("describeImageWithModel", () => { }); it("routes minimax-portal image models through the MiniMax VLM endpoint", async () => { + const timeoutSpy = vi.spyOn(AbortSignal, "timeout"); const authStore = { version: 1, profiles: {} }; const result = await describeImageWithModel({ cfg: {}, @@ -163,6 +164,7 @@ describe("describeImageWithModel", () => { signal: expect.any(AbortSignal), }), ); + expect(timeoutSpy).toHaveBeenCalledWith(1000); expect(completeMock).not.toHaveBeenCalled(); }); diff --git a/src/media-understanding/image.ts b/src/media-understanding/image.ts index 02d7624285c..572ceba650a 100644 --- a/src/media-understanding/image.ts +++ b/src/media-understanding/image.ts @@ -252,6 +252,7 @@ async function describeImagesWithMinimax(params: { modelId: string; modelBaseUrl?: string; prompt: string; + timeoutMs?: number; images: Array<{ buffer: Buffer; mime?: string }>; }): Promise { const responses: string[] = []; @@ -265,6 +266,7 @@ async function describeImagesWithMinimax(params: { prompt, imageDataUrl: `data:${image.mime ?? "image/jpeg"};base64,${image.buffer.toString("base64")}`, modelBaseUrl: params.modelBaseUrl, + timeoutMs: params.timeoutMs, }); responses.push(params.images.length > 1 ? `Image ${index + 1}:\n${text.trim()}` : text.trim()); } @@ -331,6 +333,7 @@ async function describeImagesWithModelInternal( modelId: params.model, modelBaseUrl: fallback.modelBaseUrl, prompt, + timeoutMs: params.timeoutMs, images: params.images, }); } @@ -341,6 +344,7 @@ async function describeImagesWithModelInternal( modelId: model.id, modelBaseUrl: model.baseUrl, prompt, + timeoutMs: params.timeoutMs, images: params.images, }); }