fix(image): honor media timeouts

Author: Peter Steinberger
Date: 2026-04-27 07:09:25 +01:00
Parent: 19cb9ca6bf
Commit: ca67762b88
10 changed files with 336 additions and 5 deletions

View File

@@ -51,6 +51,7 @@ Docs: https://docs.openclaw.ai
- Logging: write validated diagnostic trace context as top-level `traceId`, `spanId`, `parentSpanId`, and `traceFlags` fields in file-log JSONL records so traced requests and model calls are easier to correlate in log processors (sketched after this list). Refs #40353. Thanks @liangruochong44-ui.
- Logging/sessions: apply configured redaction patterns to persisted session transcript text and accept escaped character classes in safe custom redaction regexes, so transcript JSONL no longer keeps matching sensitive text in the clear. Fixes #42982. Thanks @panpan0000.
- Providers/Ollama: honor `/api/show` capabilities when registering local models so non-tool Ollama models no longer receive the agent tool surface, and keep native Ollama thinking opt-in instead of enabling it by default. Fixes #64710 and duplicate #65343. Thanks @yuan-b, @netherby, @xilopaint, and @Diyforfun2026.
- Image tool/media: honor `tools.media.image.timeoutSeconds` and matching per-model image timeouts in explicit image analysis, including the MiniMax VLM fallback path, so slow local vision models are not capped by hardcoded 30s/60s aborts. Fixes #67889; supersedes #67929. Thanks @AllenT22 and @alchip.
- Providers/Ollama: read larger custom Modelfile `PARAMETER num_ctx` values from `/api/show` so auto-discovered Ollama models with expanded context no longer stay pinned to the base model context. Fixes #68344. Thanks @neeravmakwana.
- Providers/Ollama: honor configured model `params.num_ctx` in native and OpenAI-compatible Ollama requests so local models can cap runtime context without rebuilding Modelfiles. Fixes #44550 and #52206; supersedes #69464. Thanks @taitruong, @armi0024, and @LokiCode404.
- Providers/Ollama: forward whitelisted native Ollama model params such as `temperature`, `top_p`, and top-level `think` so users can disable API-level thinking or tune local models from config without proxy shims. Fixes #48010. Thanks @tangzhi, @pandego, @maweibin, @Adam-Researchh, and @EmpireCreator.
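For the native-param forwarding entry above, the configuration surface is a per-model `params` block. A minimal sketch — the model id is illustrative, and placing `think` inside `params` is an assumption; the entry only states that `think` is forwarded top-level in the Ollama request:
```json5
{
  models: {
    providers: {
      ollama: {
        models: [
          {
            id: "qwen3:8b", // illustrative model id
            // whitelisted native params; `think` ends up top-level in the request
            params: { temperature: 0.2, top_p: 0.9, think: false },
          },
        ],
      },
    },
  },
}
```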
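And for the trace-context logging entry, a file-log JSONL record carries the four validated trace fields at the top level. Only those four field names come from the entry; the surrounding fields in this sketch (`time`, `level`, `msg`) are placeholders, not the actual file-log schema:
```json
{"time":"2026-04-27T07:09:25.000Z","level":"info","msg":"model call completed","traceId":"4bf92f3577b34da6a3ce929d0e0e4736","spanId":"00f067aa0ba902b7","parentSpanId":"b7ad6b7169203331","traceFlags":"01"}
```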

View File

@@ -215,6 +215,11 @@ Configures inbound media understanding (image/audio/video):
{ type: "cli", command: "whisper", args: ["--model", "base", "{{MediaPath}}"] },
],
},
image: {
enabled: true,
timeoutSeconds: 180,
models: [{ provider: "ollama", model: "gemma4:26b", timeoutSeconds: 300 }],
},
video: {
enabled: true,
maxBytes: 52428800,
@@ -242,6 +247,7 @@ Configures inbound media understanding (image/audio/video):
- `capabilities`: optional list (`image`, `audio`, `video`). Defaults: `openai`/`anthropic`/`minimax` → image, `google` → image+audio+video, `groq` → audio.
- `prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`: per-entry overrides.
- `tools.media.image.timeoutSeconds` and matching image model `timeoutSeconds` entries also apply when the agent calls the explicit `image` tool.
- Failures fall back to the next entry (see the combined sketch below).
Provider auth follows standard order: `auth-profiles.json` → env vars → `models.providers.*.apiKey`.
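Putting these options together, a combined sketch — provider and model names are illustrative, and every value is an example rather than a default. The capability-level `timeoutSeconds` applies unless a matching entry overrides it, and entries are tried in order on failure:
```json5
{
  tools: {
    media: {
      image: {
        enabled: true,
        timeoutSeconds: 180, // capability-level default
        models: [
          // per-entry overrides beat the capability-level settings
          { provider: "ollama", model: "qwen2.5vl:7b", timeoutSeconds: 300 },
          // tried next if the entry above fails
          { provider: "openai", model: "gpt-4o-mini", maxChars: 2000 },
        ],
      },
    },
  },
}
```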

View File

@@ -241,6 +241,44 @@ To make Ollama the default image-understanding model for inbound media, configur
}
```
Slow local vision models may need a longer image-understanding timeout than cloud models, and they can crash or stop responding when Ollama tries to allocate the full advertised vision context on constrained hardware. Set a capability timeout, and cap `num_ctx` on the model entry when you only need a normal image-description turn:
```json5
{
  models: {
    providers: {
      ollama: {
        models: [
          {
            id: "qwen2.5vl:7b",
            name: "qwen2.5vl:7b",
            input: ["text", "image"],
            params: { num_ctx: 2048, keep_alive: "1m" },
          },
        ],
      },
    },
  },
  tools: {
    media: {
      image: {
        timeoutSeconds: 180,
        models: [{ provider: "ollama", model: "qwen2.5vl:7b", timeoutSeconds: 300 }],
      },
    },
  },
}
```
This timeout applies to inbound image understanding and to the explicit `image` tool the agent can call during a turn. Provider-level `models.providers.ollama.timeoutSeconds` still controls the underlying Ollama HTTP request guard for normal model calls.
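As a sketch of the two knobs side by side (values illustrative): the media-image timeout bounds image understanding and the explicit `image` tool, while the provider-level timeout guards the Ollama HTTP request itself.
```json5
{
  models: {
    providers: {
      ollama: {
        timeoutSeconds: 300, // HTTP request guard for calls to this provider
      },
    },
  },
  tools: {
    media: {
      image: { timeoutSeconds: 180 }, // inbound image understanding + explicit image tool
    },
  },
}
```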
Live-verify the explicit image tool against local Ollama with:
```bash
OPENCLAW_LIVE_TEST=1 OPENCLAW_LIVE_OLLAMA_IMAGE=1 \
pnpm test:live -- src/agents/tools/image-tool.ollama.live.test.ts
```
If you define `models.providers.ollama.models` manually, mark vision models with image input support:
```json5
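{
  // sketch: mark vision models with image input, as in the earlier example
  models: {
    providers: {
      ollama: {
        models: [
          { id: "qwen2.5vl:7b", name: "qwen2.5vl:7b", input: ["text", "image"] },
        ],
      },
    },
  },
}
```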

View File

@@ -80,6 +80,29 @@ describe("minimaxUnderstandImage apiKey normalization", () => {
    expect(fetchSpy).toHaveBeenCalledOnce();
  });

  it("uses the caller-provided request timeout", async () => {
    const timeoutSpy = vi.spyOn(AbortSignal, "timeout");
    const fetchSpy = vi.fn(async () => {
      return new Response(apiResponse, {
        status: 200,
        headers: { "Content-Type": "application/json" },
      });
    });
    global.fetch = withFetchPreconnect(fetchSpy);

    await expect(
      minimaxUnderstandImage({
        apiKey: "minimax-test-key",
        prompt: "hi",
        imageDataUrl: "data:image/png;base64,AAAA",
        apiHost: "https://api.minimax.io",
        timeoutMs: 180_000,
      }),
    ).resolves.toBe("ok");

    expect(timeoutSpy).toHaveBeenCalledWith(180_000);
  });
});
describe("isMinimaxVlmModel", () => {

View File

@@ -51,6 +51,7 @@ export async function minimaxUnderstandImage(params: {
  imageDataUrl: string;
  apiHost?: string;
  modelBaseUrl?: string;
  timeoutMs?: number;
}): Promise<string> {
  const apiKey = normalizeSecretInput(params.apiKey);
  if (!apiKey) {
@@ -78,6 +79,13 @@ export async function minimaxUnderstandImage(params: {
  // Without this, HTTP_PROXY/HTTPS_PROXY env vars are silently ignored (#51619).
  ensureGlobalUndiciEnvProxyDispatcher();

  const timeoutMs =
    typeof params.timeoutMs === "number" &&
    Number.isFinite(params.timeoutMs) &&
    params.timeoutMs > 0
      ? Math.floor(params.timeoutMs)
      : 60_000;

  const res = await fetch(url, {
    method: "POST",
    headers: {
@@ -85,7 +93,7 @@ export async function minimaxUnderstandImage(params: {
"Content-Type": "application/json",
"MM-API-Source": "OpenClaw",
},
signal: AbortSignal.timeout(60_000),
signal: AbortSignal.timeout(timeoutMs),
body: JSON.stringify({
prompt,
image_url: imageDataUrl,

View File

@@ -0,0 +1,99 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { describe, expect, it } from "vitest";
import type { OpenClawConfig } from "../../config/types.openclaw.js";
import { createImageTool } from "./image-tool.js";

const LIVE =
  process.env.OPENCLAW_LIVE_TEST === "1" && process.env.OPENCLAW_LIVE_OLLAMA_IMAGE === "1";
const OLLAMA_BASE_URL =
  process.env.OPENCLAW_LIVE_OLLAMA_BASE_URL?.trim() || "http://127.0.0.1:11434";
const OLLAMA_IMAGE_MODEL = process.env.OPENCLAW_LIVE_OLLAMA_IMAGE_MODEL?.trim() || "qwen2.5vl:7b";

function resolveLiveNumCtx(): number {
  const parsed = Number.parseInt(process.env.OPENCLAW_LIVE_OLLAMA_IMAGE_NUM_CTX ?? "2048", 10);
  return Number.isFinite(parsed) ? Math.max(512, parsed) : 2048;
}

const OLLAMA_IMAGE_NUM_CTX = resolveLiveNumCtx();

const VALID_RED_PNG_B64 =
  "iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAIGNIUk0AAHomAACAhAAA+gAAAIDoAAB1MAAA6mAAADqYAAAXcJy6UTwAAAAGYktHRAD/AP8A/6C9p5MAAAAHdElNRQfqBBsGAQr00ED3AAAAJXRFWHRkYXRlOmNyZWF0ZQAyMDI2LTA0LTI3VDA2OjAxOjEwKzAwOjAwPU3tXwAAACV0RVh0ZGF0ZTptb2RpZnkAMjAyNi0wNC0yN1QwNjowMToxMCswMDowMEwQVeMAAAAodEVYdGRhdGU6dGltZXN0YW1wADIwMjYtMDQtMjdUMDY6MDE6MTArMDA6MDAbBXQ8AAAAeElEQVRo3u3awQnDQBAEwT2Q8w/YAikIP5rF1RFMca+FO8/s7rrnqjcA1BsA6g0A9QaAesOfA77zqTf8Blj/AgAAAAAAAJsDqAOoA6gDqAOoc9TXAdQB1AHUAdQB1AHUAdQB1AHU7Qc46gEAAAAANrcecGZ2f8B/ASYSQPlKoEJ/AAAAAElFTkSuQmCC";

async function withLiveImageWorkspace<T>(
  run: (ctx: { agentDir: string; workspaceDir: string; imagePath: string }) => Promise<T>,
) {
  const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-ollama-image-live-"));
  try {
    const agentDir = path.join(root, "agent");
    const workspaceDir = path.join(root, "workspace");
    await fs.mkdir(agentDir, { recursive: true });
    await fs.mkdir(workspaceDir, { recursive: true });
    const imagePath = path.join(workspaceDir, "red.png");
    await fs.writeFile(imagePath, Buffer.from(VALID_RED_PNG_B64, "base64"));
    return await run({ agentDir, workspaceDir, imagePath });
  } finally {
    await fs.rm(root, { recursive: true, force: true });
  }
}

describe.skipIf(!LIVE)("image tool Ollama live", () => {
  it("describes a local image through the explicit image tool", async () => {
    process.env.OLLAMA_API_KEY ||= "ollama-local";
    await withLiveImageWorkspace(async ({ agentDir, workspaceDir, imagePath }) => {
      const cfg: OpenClawConfig = {
        agents: {
          defaults: {
            imageModel: { primary: `ollama/${OLLAMA_IMAGE_MODEL}` },
          },
        },
        models: {
          providers: {
            ollama: {
              api: "ollama",
              baseUrl: OLLAMA_BASE_URL,
              apiKey: "ollama-local",
              timeoutSeconds: 300,
              models: [
                {
                  id: OLLAMA_IMAGE_MODEL,
                  name: OLLAMA_IMAGE_MODEL,
                  input: ["text", "image"],
                  reasoning: false,
                  cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
                  contextWindow: 128_000,
                  maxTokens: 512,
                  params: { num_ctx: OLLAMA_IMAGE_NUM_CTX, keep_alive: "1m" },
                },
              ],
            },
          },
        },
        tools: {
          media: {
            image: {
              timeoutSeconds: 180,
              models: [{ provider: "ollama", model: OLLAMA_IMAGE_MODEL, timeoutSeconds: 300 }],
            },
          },
        },
      };
      const tool = createImageTool({ config: cfg, agentDir, workspaceDir });
      expect(tool).not.toBeNull();
      const result = await tool!.execute("live-ollama-image", {
        prompt: "Describe this image in one short sentence.",
        image: imagePath,
      });
      expect(result).toMatchObject({
        content: [expect.objectContaining({ type: "text" })],
      });
      const text = (
        result as { content?: Array<{ type?: string; text?: string }> }
      ).content?.[0]?.text?.trim();
      expect(text?.length ?? 0).toBeGreaterThan(0);
    });
  }, 180_000);
});

View File

@@ -213,7 +213,7 @@ async function withTempAgentDir<T>(run: (agentDir: string) => Promise<T>): Promi
}
const ONE_PIXEL_PNG_B64 =
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII=";
"iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAIGNIUk0AAHomAACAhAAA+gAAAIDoAAB1MAAA6mAAADqYAAAXcJy6UTwAAAAGYktHRAD/AP8A/6C9p5MAAAAHdElNRQfqBBsGAQr00ED3AAAAJXRFWHRkYXRlOmNyZWF0ZQAyMDI2LTA0LTI3VDA2OjAxOjEwKzAwOjAwPU3tXwAAACV0RVh0ZGF0ZTptb2RpZnkAMjAyNi0wNC0yN1QwNjowMToxMCswMDowMEwQVeMAAAAodEVYdGRhdGU6dGltZXN0YW1wADIwMjYtMDQtMjdUMDY6MDE6MTArMDA6MDAbBXQ8AAAAeElEQVRo3u3awQnDQBAEwT2Q8w/YAikIP5rF1RFMca+FO8/s7rrnqjcA1BsA6g0A9QaAesOfA77zqTf8Blj/AgAAAAAAAJsDqAOoA6gDqAOoc9TXAdQB1AHUAdQB1AHUAdQB1AHU7Qc46gEAAAAANrcecGZ2f8B/ASYSQPlKoEJ/AAAAAElFTkSuQmCC";
const ONE_PIXEL_GIF_B64 = "R0lGODlhAQABAIABAP///wAAACwAAAAAAQABAAACAkQBADs=";
const ONE_PIXEL_JPEG_B64 = "QUJDRA==";
@@ -671,6 +671,81 @@ describe("image tool implicit imageModel config", () => {
    });
  });

  it("passes the configured image timeout to provider calls", async () => {
    await withTempWorkspacePng(async ({ workspaceDir, imagePath }) => {
      await withTempAgentDir(async (agentDir) => {
        const describeImage = vi.fn(async (params: ImageDescriptionRequest) => ({
          text: "ok",
          model: params.model,
        }));
        installImageUnderstandingProviderStubs({
          id: "ollama",
          capabilities: ["image"],
          describeImage,
        });
        const cfg: OpenClawConfig = {
          agents: {
            defaults: {
              imageModel: { primary: "ollama/gemma4:26b-a4b-it-q4_K_M" },
            },
          },
          tools: {
            media: {
              image: { timeoutSeconds: 180 },
            },
          },
        };
        const tool = createRequiredImageTool({ config: cfg, agentDir, workspaceDir });
        await expectImageToolExecOk(tool, imagePath);
        expect(describeImage).toHaveBeenCalledWith(expect.objectContaining({ timeoutMs: 180_000 }));
      });
    });
  });

  it("prefers a matching per-image-model timeout over the capability timeout", async () => {
    await withTempWorkspacePng(async ({ workspaceDir, imagePath }) => {
      await withTempAgentDir(async (agentDir) => {
        const describeImage = vi.fn(async (params: ImageDescriptionRequest) => ({
          text: "ok",
          model: params.model,
        }));
        installImageUnderstandingProviderStubs({
          id: "ollama",
          capabilities: ["image"],
          describeImage,
        });
        const cfg: OpenClawConfig = {
          agents: {
            defaults: {
              imageModel: { primary: "ollama/gemma4:26b-a4b-it-q4_K_M" },
            },
          },
          tools: {
            media: {
              image: {
                timeoutSeconds: 180,
                models: [
                  {
                    provider: "ollama",
                    model: "gemma4:26b-a4b-it-q4_K_M",
                    timeoutSeconds: 300,
                  },
                ],
              },
            },
          },
        };
        const tool = createRequiredImageTool({ config: cfg, agentDir, workspaceDir });
        await expectImageToolExecOk(tool, imagePath);
        expect(describeImage).toHaveBeenCalledWith(expect.objectContaining({ timeoutMs: 300_000 }));
      });
    });
  });

  it("pairs minimax-portal primary with MiniMax-VL-01 (and fallbacks) when auth exists", async () => {
    await withTempAgentDir(async (agentDir) => {
      await writeAuthProfiles(agentDir, {
View File

@@ -1,11 +1,16 @@
import { resolve, isAbsolute } from "node:path";
import { Type } from "typebox";
import type { OpenClawConfig } from "../../config/types.openclaw.js";
import type { MediaUnderstandingModelConfig } from "../../config/types.tools.js";
import {
  DEFAULT_TIMEOUT_SECONDS,
  resolveAutoMediaKeyProviders,
  resolveDefaultMediaModel,
} from "../../media-understanding/defaults.js";
import { matchesMediaEntryCapability } from "../../media-understanding/entry-capabilities.js";
import { normalizeMediaProviderId } from "../../media-understanding/provider-id.js";
import { getMediaUnderstandingProvider } from "../../media-understanding/provider-registry.js";
import { resolveTimeoutMs } from "../../media-understanding/resolve.js";
import { buildProviderRegistry } from "../../media-understanding/runner.js";
import {
  classifyMediaReferenceSource,
@@ -177,6 +182,70 @@ function pickMaxBytes(cfg?: OpenClawConfig, maxBytesMb?: number): number | undef
  return undefined;
}

function matchesImageTimeoutEntry(params: {
  entry: MediaUnderstandingModelConfig;
  source: "capability" | "shared";
  provider: string;
  model: string;
  providerRegistry: Map<string, MediaUnderstandingProvider>;
}): boolean {
  const configuredProvider = normalizeMediaProviderId(params.entry.provider ?? "");
  const selectedProvider = normalizeMediaProviderId(params.provider);
  if (!configuredProvider || configuredProvider !== selectedProvider) {
    return false;
  }
  if (
    !matchesMediaEntryCapability({
      entry: params.entry,
      source: params.source,
      capability: "image",
      providerRegistry: params.providerRegistry,
    })
  ) {
    return false;
  }
  const configuredModel = params.entry.model?.trim();
  if (!configuredModel) {
    return true;
  }
  const providerPrefix = `${selectedProvider}/`;
  const normalizedConfiguredModel = configuredModel.startsWith(providerPrefix)
    ? configuredModel.slice(providerPrefix.length)
    : configuredModel;
  return normalizedConfiguredModel === params.model;
}

function resolveImageToolTimeoutMs(params: {
  cfg: OpenClawConfig;
  provider: string;
  model: string;
  providerRegistry: Map<string, MediaUnderstandingProvider>;
}): number {
  const imageConfig = params.cfg.tools?.media?.image;
  const capabilityEntry = imageConfig?.models?.find((entry) =>
    matchesImageTimeoutEntry({
      entry,
      source: "capability",
      provider: params.provider,
      model: params.model,
      providerRegistry: params.providerRegistry,
    }),
  );
  const sharedEntry = params.cfg.tools?.media?.models?.find((entry) =>
    matchesImageTimeoutEntry({
      entry,
      source: "shared",
      provider: params.provider,
      model: params.model,
      providerRegistry: params.providerRegistry,
    }),
  );
  return resolveTimeoutMs(
    capabilityEntry?.timeoutSeconds ?? sharedEntry?.timeoutSeconds ?? imageConfig?.timeoutSeconds,
    DEFAULT_TIMEOUT_SECONDS.image,
  );
}

type ImageSandboxConfig = {
  root: string;
  bridge: SandboxFsBridge;
@@ -203,6 +272,12 @@ async function runImagePrompt(params: {
    cfg: effectiveCfg,
    modelOverride: params.modelOverride,
    run: async (provider, modelId) => {
      const timeoutMs = resolveImageToolTimeoutMs({
        cfg: providerCfg,
        provider,
        model: modelId,
        providerRegistry: providerRegistry as Map<string, MediaUnderstandingProvider>,
      });
      const imageProvider = imageToolProviderDeps.getMediaUnderstandingProvider(
        provider,
        providerRegistry as Map<string, MediaUnderstandingProvider>,
@@ -223,7 +298,7 @@ async function runImagePrompt(params: {
        model: modelId,
        prompt: params.prompt,
        maxTokens: resolveImageToolMaxTokens(undefined),
-        timeoutMs: 30_000,
+        timeoutMs,
        cfg: providerCfg,
        agentDir: params.agentDir,
      });
@@ -241,7 +316,7 @@ async function runImagePrompt(params: {
        model: modelId,
        prompt: params.prompt,
        maxTokens: resolveImageToolMaxTokens(undefined),
-        timeoutMs: 30_000,
+        timeoutMs,
        cfg: providerCfg,
        agentDir: params.agentDir,
      });
@@ -258,7 +333,7 @@ async function runImagePrompt(params: {
        model: modelId,
        prompt: `${params.prompt}\n\nDescribe image ${index + 1} of ${params.images.length}.`,
        maxTokens: resolveImageToolMaxTokens(undefined),
-        timeoutMs: 30_000,
+        timeoutMs,
        cfg: providerCfg,
        agentDir: params.agentDir,
      });

View File

@@ -123,6 +123,7 @@ describe("describeImageWithModel", () => {
  });

  it("routes minimax-portal image models through the MiniMax VLM endpoint", async () => {
    const timeoutSpy = vi.spyOn(AbortSignal, "timeout");
    const authStore = { version: 1, profiles: {} };
    const result = await describeImageWithModel({
      cfg: {},
@@ -163,6 +164,7 @@ describe("describeImageWithModel", () => {
        signal: expect.any(AbortSignal),
      }),
    );

    expect(timeoutSpy).toHaveBeenCalledWith(1000);
    expect(completeMock).not.toHaveBeenCalled();
  });

View File

@@ -252,6 +252,7 @@ async function describeImagesWithMinimax(params: {
  modelId: string;
  modelBaseUrl?: string;
  prompt: string;
  timeoutMs?: number;
  images: Array<{ buffer: Buffer; mime?: string }>;
}): Promise<ImagesDescriptionResult> {
  const responses: string[] = [];
@@ -265,6 +266,7 @@ async function describeImagesWithMinimax(params: {
      prompt,
      imageDataUrl: `data:${image.mime ?? "image/jpeg"};base64,${image.buffer.toString("base64")}`,
      modelBaseUrl: params.modelBaseUrl,
      timeoutMs: params.timeoutMs,
    });
    responses.push(params.images.length > 1 ? `Image ${index + 1}:\n${text.trim()}` : text.trim());
  }
@@ -331,6 +333,7 @@ async function describeImagesWithModelInternal(
      modelId: params.model,
      modelBaseUrl: fallback.modelBaseUrl,
      prompt,
      timeoutMs: params.timeoutMs,
      images: params.images,
    });
  }
@@ -341,6 +344,7 @@ async function describeImagesWithModelInternal(
      modelId: model.id,
      modelBaseUrl: model.baseUrl,
      prompt,
      timeoutMs: params.timeoutMs,
      images: params.images,
    });
  }