diff --git a/CHANGELOG.md b/CHANGELOG.md index b92975f1054..625bbddb9c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ Docs: https://docs.openclaw.ai ### Fixes - Cron/Telegram: preserve explicit `:topic:` delivery targets over stale session-derived thread IDs when isolated cron announces to Telegram forum topics. Carries forward #59069; refs #49704 and #43808. Thanks @roytong9. +- CLI/onboarding: infer image input for common custom-provider vision model IDs, ask only for unknown models, and keep `--custom-image-input`/`--custom-text-input` overrides so vision-capable proxies do not get saved as text-only configs. Fixes #51869. Thanks @Antsoldier1974. - Memory/Dreaming: retry Dream Diary once with the session default when a configured dreaming model is unavailable, while leaving subagent trust and allowlist errors visible instead of silently masking configuration problems. Refs #67409 and #69209. Thanks @Ghiggins18 and @everySympathy. - Feishu/inbound files: recover CJK filenames from plain `Content-Disposition: filename=` download headers when Feishu exposes UTF-8 bytes through Latin-1 header decoding, while leaving valid Latin-1 and JSON-derived names unchanged. (#48578, #50435, #59431) Thanks @alex-xuweilong, @lishuaigit, and @DoChaoing. diff --git a/docs/cli/onboard.md b/docs/cli/onboard.md index e0c715034f2..bb15823dd39 100644 --- a/docs/cli/onboard.md +++ b/docs/cli/onboard.md @@ -61,10 +61,12 @@ openclaw onboard --non-interactive \ --custom-model-id "foo-large" \ --custom-api-key "$CUSTOM_API_KEY" \ --secret-input-mode plaintext \ - --custom-compatibility openai + --custom-compatibility openai \ + --custom-image-input ``` `--custom-api-key` is optional in non-interactive mode. If omitted, onboarding checks `CUSTOM_API_KEY`. +OpenClaw marks common vision model IDs as image-capable automatically. Pass `--custom-image-input` for unknown custom vision IDs, or `--custom-text-input` to force text-only metadata. LM Studio also supports a provider-specific key flag in non-interactive mode: diff --git a/docs/gateway/config-tools.md b/docs/gateway/config-tools.md index 979863b03e6..660656b0106 100644 --- a/docs/gateway/config-tools.md +++ b/docs/gateway/config-tools.md @@ -456,6 +456,7 @@ OpenClaw uses the built-in model catalog. Add custom providers via `models.provi - `models.providers.*.models`: explicit provider model catalog entries. + - `models.providers.*.models.*.input`: model input modalities. Use `["text"]` for text-only models and `["text", "image"]` for native image/vision models. Image attachments are only injected into agent turns when the selected model is marked image-capable. - `models.providers.*.models.*.contextWindow`: native model context window metadata. This overrides provider-level `contextWindow` for that model. - `models.providers.*.models.*.contextTokens`: optional runtime context cap. This overrides provider-level `contextTokens`; use it when you want a smaller effective context budget than the model's native `contextWindow`; `openclaw models list` shows both values when they differ. - `models.providers.*.models.*.compat.supportsDeveloperRole`: optional compatibility hint. For `api: "openai-completions"` with a non-empty non-native `baseUrl` (host not `api.openai.com`), OpenClaw forces this to `false` at runtime. Empty/omitted `baseUrl` keeps default OpenAI behavior. @@ -472,6 +473,8 @@ OpenClaw uses the built-in model catalog. 
Add custom providers via `models.provi +Interactive custom-provider onboarding infers image input for common vision model IDs such as GPT-4o, Claude, Gemini, Qwen-VL, LLaVA, Pixtral, InternVL, Mllama, MiniCPM-V, and GLM-4V, and skips the extra question for known text-only families. Unknown model IDs still prompt for image support. Non-interactive onboarding uses the same inference; pass `--custom-image-input` to force image-capable metadata or `--custom-text-input` to force text-only metadata. + ### Provider examples diff --git a/docs/gateway/local-models.md b/docs/gateway/local-models.md index a5480dd1279..9eaf268608e 100644 --- a/docs/gateway/local-models.md +++ b/docs/gateway/local-models.md @@ -168,6 +168,13 @@ catalog id and model ref: - `models.providers.mlx.models[].id: "mlx-community/Qwen3-30B-A3B-6bit"` - `agents.defaults.model.primary: "mlx/mlx-community/Qwen3-30B-A3B-6bit"` +Set `input: ["text", "image"]` on local or proxied vision models so image +attachments are injected into agent turns. Interactive custom-provider +onboarding infers common vision model IDs and asks only for unknown names. +Non-interactive onboarding uses the same inference; use `--custom-image-input` +for unknown vision IDs or `--custom-text-input` when a known-looking model is +text-only behind your endpoint. + Keep `models.mode: "merge"` so hosted models stay available as fallbacks. Use `models.providers.<id>.timeoutSeconds` for slow local or remote model servers before raising `agents.defaults.timeoutSeconds`. The provider timeout diff --git a/docs/start/wizard-cli-automation.md b/docs/start/wizard-cli-automation.md index 055fb1ef0bb..61b342eeb89 100644 --- a/docs/start/wizard-cli-automation.md +++ b/docs/start/wizard-cli-automation.md @@ -166,11 +166,13 @@ openclaw onboard --non-interactive \ --custom-api-key "$CUSTOM_API_KEY" \ --custom-provider-id "my-custom" \ --custom-compatibility anthropic \ + --custom-image-input \ --gateway-port 18789 \ --gateway-bind loopback ``` `--custom-api-key` is optional. If omitted, onboarding checks `CUSTOM_API_KEY`. + OpenClaw marks common vision model IDs as image-capable automatically. Add `--custom-image-input` for unknown custom vision IDs, or `--custom-text-input` to force text-only metadata. Ref-mode variant: @@ -184,6 +186,7 @@ openclaw onboard --non-interactive \ --secret-input-mode ref \ --custom-provider-id "my-custom" \ --custom-compatibility anthropic \ + --custom-image-input \ --gateway-port 18789 \ --gateway-bind loopback ``` diff --git a/docs/start/wizard-cli-reference.md b/docs/start/wizard-cli-reference.md index 7ff2efdb9b9..9ffca43ea2f 100644 --- a/docs/start/wizard-cli-reference.md +++ b/docs/start/wizard-cli-reference.md @@ -202,6 +202,7 @@ What you set: - `--custom-api-key` (optional; falls back to `CUSTOM_API_KEY`) - `--custom-provider-id` (optional) - `--custom-compatibility <compat>` (optional; default `openai`) + - `--custom-image-input` / `--custom-text-input` (optional; override inferred model input capability) @@ -212,6 +213,7 @@ What you set: Model behavior: - Pick default model from detected options, or enter provider and model manually. +- Custom-provider onboarding infers image support for common model IDs and asks only when the model name is unknown. - When onboarding starts from a provider auth choice, the model picker prefers that provider automatically.
For Volcengine and BytePlus, the same preference also matches their coding-plan variants (`volcengine-plan/*`, `byteplus-plan/*`). diff --git a/src/cli/program/register.onboard.ts b/src/cli/program/register.onboard.ts index 9f032358e07..a20b0f344d9 100644 --- a/src/cli/program/register.onboard.ts +++ b/src/cli/program/register.onboard.ts @@ -144,6 +144,8 @@ export function registerOnboardCommand(program: Command) { "--custom-compatibility <compat>", "Custom provider API compatibility: openai|anthropic (default: openai)", ) + .option("--custom-image-input", "Mark the custom provider model as image-capable") + .option("--custom-text-input", "Mark the custom provider model as text-only") .option("--gateway-port <port>", "Gateway port") .option("--gateway-bind <bind>", "Gateway bind: loopback|tailnet|lan|auto|custom") .option("--gateway-auth <auth>", "Gateway auth: token|password") @@ -214,6 +216,12 @@ export function registerOnboardCommand(program: Command) { customModelId: opts.customModelId as string | undefined, customProviderId: opts.customProviderId as string | undefined, customCompatibility: opts.customCompatibility as "openai" | "anthropic" | undefined, + customImageInput: + opts.customTextInput === true + ? false + : opts.customImageInput === true + ? true + : undefined, gatewayPort: typeof gatewayPort === "number" && Number.isFinite(gatewayPort) ? gatewayPort diff --git a/src/commands/onboard-custom-config.test.ts b/src/commands/onboard-custom-config.test.ts index bbea0ee2bac..f4e94404c76 100644 --- a/src/commands/onboard-custom-config.test.ts +++ b/src/commands/onboard-custom-config.test.ts @@ -5,7 +5,9 @@ import { applyCustomApiConfig, buildAnthropicVerificationProbeRequest, buildOpenAiVerificationProbeRequest, + inferCustomModelSupportsImageInput, parseNonInteractiveCustomApiFlags, + resolveCustomModelImageInputInference, } from "./onboard-custom-config.js"; function buildCustomProviderConfig(contextWindow?: number) { @@ -311,6 +313,60 @@ describe("applyCustomApiConfig", () => { ).toBeUndefined(); }); + it("adds image input for new non-azure custom models when requested", () => { + const result = applyCustomApiConfig({ + config: {}, + baseUrl: "https://llm.example.com/v1", + modelId: "gpt-4o", + compatibility: "openai", + providerId: "custom", + supportsImageInput: true, + }); + + expect(result.config.models?.providers?.custom?.models?.[0]?.input).toEqual(["text", "image"]); + }); + + it("infers image input for known non-azure custom vision models", () => { + const result = applyCustomApiConfig({ + config: {}, + baseUrl: "https://llm.example.com/v1", + modelId: "gpt-4o", + compatibility: "openai", + providerId: "custom", + }); + + expect(result.config.models?.providers?.custom?.models?.[0]?.input).toEqual(["text", "image"]); + }); + + it("lets explicit text input override known non-azure custom vision inference", () => { + const result = applyCustomApiConfig({ + config: {}, + baseUrl: "https://llm.example.com/v1", + modelId: "gpt-4o", + compatibility: "openai", + providerId: "custom", + supportsImageInput: false, + }); + + expect(result.config.models?.providers?.custom?.models?.[0]?.input).toEqual(["text"]); + }); + + it("updates existing non-azure custom model input when image support is explicitly requested", () => { + const result = applyCustomApiConfig({ + config: buildCustomProviderConfig(CONTEXT_WINDOW_HARD_MIN_TOKENS), + baseUrl: "https://llm.example.com/v1", + modelId: "foo-large", + compatibility: "openai", + providerId: "custom", + supportsImageInput: true, + }); + const model = 
result.config.models?.providers?.custom?.models?.find( + (entry) => entry.id === "foo-large", + ); + + expect(model?.input).toEqual(["text", "image"]); + }); + it("re-onboard preserves user-customized fields for non-azure models", () => { const result = applyCustomApiConfig({ config: { @@ -391,6 +447,16 @@ describe("parseNonInteractiveCustomApiFlags", () => { }); }); + it("parses custom image input opt-in", () => { + const result = parseNonInteractiveCustomApiFlags({ + baseUrl: "https://llm.example.com/v1", + modelId: "foo-large", + supportsImageInput: true, + }); + + expect(result.supportsImageInput).toBe(true); + }); + it.each([ { name: "missing required flags", @@ -419,3 +485,30 @@ describe("parseNonInteractiveCustomApiFlags", () => { expect(() => parseNonInteractiveCustomApiFlags(flags)).toThrow(expectedMessage); }); }); + +describe("inferCustomModelSupportsImageInput", () => { + it.each(["gpt-4o", "claude-sonnet-4-6", "gemini-3-flash", "qwen2.5-vl", "llava"])( + "detects likely vision model %s", + (modelId) => { + expect(inferCustomModelSupportsImageInput(modelId)).toBe(true); + }, + ); + + it.each(["llama3", "deepseek-v3", "evolvable-text-model"])( + "does not over-match text model %s", + (modelId) => { + expect(inferCustomModelSupportsImageInput(modelId)).toBe(false); + }, + ); + + it("reports confidence for known text and unknown custom models", () => { + expect(resolveCustomModelImageInputInference("llama3")).toEqual({ + supportsImageInput: false, + confidence: "known", + }); + expect(resolveCustomModelImageInputInference("my-private-model")).toEqual({ + supportsImageInput: false, + confidence: "unknown", + }); + }); +}); diff --git a/src/commands/onboard-custom-config.ts b/src/commands/onboard-custom-config.ts index f907309fd71..210d4ad5c18 100644 --- a/src/commands/onboard-custom-config.ts +++ b/src/commands/onboard-custom-config.ts @@ -18,12 +18,72 @@ const DEFAULT_MAX_TOKENS = 4096; // Azure OpenAI uses the Responses API which supports larger defaults const AZURE_DEFAULT_CONTEXT_WINDOW = 400_000; const AZURE_DEFAULT_MAX_TOKENS = 16_384; +type CustomModelInput = "text" | "image"; +export type CustomModelImageInputInference = { + supportsImageInput: boolean; + confidence: "known" | "unknown"; +}; function normalizeContextWindowForCustomModel(value: unknown): number { const parsed = typeof value === "number" && Number.isFinite(value) ? Math.floor(value) : 0; return parsed >= CONTEXT_WINDOW_HARD_MIN_TOKENS ? parsed : CONTEXT_WINDOW_HARD_MIN_TOKENS; } +function customModelInputs(supportsImageInput: boolean): CustomModelInput[] { + return supportsImageInput ? 
["text", "image"] : ["text"]; +} + +export function resolveCustomModelImageInputInference( + modelId: string, +): CustomModelImageInputInference { + const normalized = normalizeLowercaseStringOrEmpty(modelId); + if (!normalized) { + return { supportsImageInput: false, confidence: "unknown" }; + } + const matchesKnownVision = + /\b(?:gpt-4o|gpt-4\.1|gpt-[5-9]|o[134])\b/.test(normalized) || + /\bclaude-(?:3|4|sonnet|opus|haiku)\b/.test(normalized) || + /\bgemini\b/.test(normalized) || + /\b(?:qwen[\w.-]*-?vl|qwen-vl)\b/.test(normalized) || + /\b(?:vision|llava|pixtral|internvl|mllama|minicpm-v|glm-4v)\b/.test(normalized) || + /(?:^|[-_/])vl(?:[-_/]|$)/.test(normalized); + if (matchesKnownVision) { + return { supportsImageInput: true, confidence: "known" }; + } + + const matchesKnownText = + /\b(?:llama\d*|deepseek|mistral|mixtral|kimi|moonshot|codestral|devstral|phi|qwq|codellama)\b/.test( + normalized, + ) || /\bqwen(?!.*(?:vl|vision))/.test(normalized); + if (matchesKnownText) { + return { supportsImageInput: false, confidence: "known" }; + } + + return { supportsImageInput: false, confidence: "unknown" }; +} + +export function inferCustomModelSupportsImageInput(modelId: string): boolean { + return resolveCustomModelImageInputInference(modelId).supportsImageInput; +} + +function resolveCustomModelSupportsImageInput(params: { + modelId: string; + explicit?: boolean; + fallback: boolean; + inferKnownModels: boolean; +}): boolean { + return ( + params.explicit ?? + ((): boolean => { + if (!params.inferKnownModels) { + return params.fallback; + } + const inference = resolveCustomModelImageInputInference(params.modelId); + return inference.confidence === "known" ? inference.supportsImageInput : params.fallback; + })() + ); +} + function isAzureFoundryUrl(baseUrl: string): boolean { try { const url = new URL(baseUrl); @@ -112,6 +172,7 @@ export type ApplyCustomApiConfigParams = { apiKey?: SecretInput; providerId?: string; alias?: string; + supportsImageInput?: boolean; }; export type ParseNonInteractiveCustomApiFlagsParams = { @@ -120,6 +181,7 @@ export type ParseNonInteractiveCustomApiFlagsParams = { compatibility?: string; apiKey?: string; providerId?: string; + supportsImageInput?: boolean; }; export type ParsedNonInteractiveCustomApiFlags = { @@ -128,6 +190,7 @@ export type ParsedNonInteractiveCustomApiFlags = { compatibility: CustomApiCompatibility; apiKey?: string; providerId?: string; + supportsImageInput?: boolean; }; export type CustomApiErrorCode = @@ -439,6 +502,9 @@ export function parseNonInteractiveCustomApiFlags( compatibility: parseCustomApiCompatibility(params.compatibility), ...(apiKey ? { apiKey } : {}), ...(providerId ? { providerId } : {}), + ...(params.supportsImageInput === undefined + ? {} + : { supportsImageInput: params.supportsImageInput }), }; } @@ -487,15 +553,25 @@ export function applyCustomApiConfig(params: ApplyCustomApiConfigParams): Custom const existingModels = Array.isArray(existingProvider?.models) ? existingProvider.models : []; const hasModel = existingModels.some((model) => model.id === modelId); const isLikelyReasoningModel = isAzure && /\b(o[134]|gpt-([5-9]|\d{2,}))\b/i.test(modelId); + const explicitInput = + params.supportsImageInput === undefined + ? 
undefined + : customModelInputs(params.supportsImageInput); + const generatedInput = customModelInputs( + resolveCustomModelSupportsImageInput({ + modelId, + explicit: params.supportsImageInput, + fallback: isAzure && isLikelyReasoningModel, + inferKnownModels: !isAzure, + }), + ); const nextModel = isAzure ? { id: modelId, name: `${modelId} (Custom Provider)`, contextWindow: AZURE_DEFAULT_CONTEXT_WINDOW, maxTokens: AZURE_DEFAULT_MAX_TOKENS, - input: isLikelyReasoningModel - ? (["text", "image"] as Array<"text" | "image">) - : (["text"] as ["text"]), + input: generatedInput, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, reasoning: isLikelyReasoningModel, compat: { supportsStore: false }, @@ -505,7 +581,7 @@ export function applyCustomApiConfig(params: ApplyCustomApiConfigParams): Custom name: `${modelId} (Custom Provider)`, contextWindow: DEFAULT_CONTEXT_WINDOW, maxTokens: DEFAULT_MAX_TOKENS, - input: ["text"] as ["text"], + input: generatedInput, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, reasoning: false, }; @@ -515,6 +591,7 @@ export function applyCustomApiConfig(params: ApplyCustomApiConfigParams): Custom ? { ...model, ...(isAzure ? nextModel : {}), + ...(explicitInput ? { input: explicitInput } : {}), name: model.name ?? nextModel.name, cost: model.cost ?? nextModel.cost, contextWindow: normalizeContextWindowForCustomModel(model.contextWindow), diff --git a/src/commands/onboard-custom.test.ts b/src/commands/onboard-custom.test.ts index c7d5a2379e2..ab56ff99f3d 100644 --- a/src/commands/onboard-custom.test.ts +++ b/src/commands/onboard-custom.test.ts @@ -19,7 +19,7 @@ vi.mock("../plugins/provider-auth-input.js", () => ({ ), })); -function createTestPrompter(params: { text: string[]; select?: string[] }): { +function createTestPrompter(params: { text: string[]; select?: string[]; confirm?: boolean[] }): { text: ReturnType<typeof vi.fn>; select: ReturnType<typeof vi.fn>; confirm: ReturnType<typeof vi.fn>; @@ -34,6 +34,10 @@ function createTestPrompter(params: { text: string[]; select?: string[] }): { for (const answer of params.select ?? []) { select.mockResolvedValueOnce(answer); } + const confirm = vi.fn(async () => false); + for (const answer of params.confirm ?? 
[]) { + confirm.mockResolvedValueOnce(answer); + } return { text, progress: vi.fn(() => ({ @@ -41,7 +45,7 @@ function createTestPrompter(params: { text: string[]; select?: string[] }): { stop: vi.fn(), })), select, - confirm: vi.fn(), + confirm, note: vi.fn(), }; } @@ -100,6 +104,38 @@ describe("promptCustomApiConfig", () => { expectOpenAiCompatResult({ prompter, textCalls: 5, selectCalls: 2, result }); expect(result.config.agents?.defaults?.models?.["custom/llama3"]?.alias).toBe("local"); + expect(result.config.models?.providers?.custom?.models?.[0]?.input).toEqual(["text"]); + expect(prompter.confirm).not.toHaveBeenCalled(); + }); + + it("skips the image-input prompt for known custom vision models", async () => { + const prompter = createTestPrompter({ + text: ["https://proxy.example.com/v1", "test-key", "gpt-4o", "custom", ""], + select: ["plaintext", "openai"], + }); + stubFetchSequence([{ ok: true }]); + + const result = await runPromptCustomApi(prompter); + + expect(result.config.models?.providers?.custom?.models?.[0]?.input).toEqual(["text", "image"]); + expect(prompter.confirm).not.toHaveBeenCalled(); + }); + + it("prompts for custom model image support when the model is unknown", async () => { + const prompter = createTestPrompter({ + text: ["https://proxy.example.com/v1", "test-key", "private-model", "custom", ""], + select: ["plaintext", "openai"], + confirm: [true], + }); + stubFetchSequence([{ ok: true }]); + + const result = await runPromptCustomApi(prompter); + + expect(result.config.models?.providers?.custom?.models?.[0]?.input).toEqual(["text", "image"]); + expect(prompter.confirm).toHaveBeenCalledWith({ + message: "Does this model support image input?", + initialValue: false, + }); }); it("defaults custom setup to the native Ollama base URL", async () => { diff --git a/src/commands/onboard-custom.ts b/src/commands/onboard-custom.ts index f19dcd94169..24927eaa294 100644 --- a/src/commands/onboard-custom.ts +++ b/src/commands/onboard-custom.ts @@ -15,6 +15,7 @@ import { normalizeEndpointId, normalizeOptionalProviderApiKey, resolveCustomModelAliasError, + resolveCustomModelImageInputInference, resolveCustomProviderId, type CustomApiCompatibility, type CustomApiResult, @@ -24,11 +25,14 @@ export { buildAnthropicVerificationProbeRequest, buildOpenAiVerificationProbeRequest, CustomApiError, + inferCustomModelSupportsImageInput, parseNonInteractiveCustomApiFlags, + resolveCustomModelImageInputInference, resolveCustomProviderId, type ApplyCustomApiConfigParams, type CustomApiCompatibility, type CustomApiErrorCode, + type CustomModelImageInputInference, type CustomApiResult, type ParseNonInteractiveCustomApiFlagsParams, type ParsedNonInteractiveCustomApiFlags, @@ -341,6 +345,14 @@ export async function promptCustomApiConfig(params: { return resolveCustomModelAliasError({ raw: value, cfg: config, modelRef }); }, }); + const imageInputInference = resolveCustomModelImageInputInference(modelId); + const supportsImageInput = + imageInputInference.confidence === "known" + ? imageInputInference.supportsImageInput + : await prompter.confirm({ + message: "Does this model support image input?", + initialValue: imageInputInference.supportsImageInput, + }); const resolvedCompatibility = compatibility ?? 
"openai"; const result = applyCustomApiConfig({ config, @@ -350,6 +362,7 @@ export async function promptCustomApiConfig(params: { apiKey, providerId: providerIdInput, alias: aliasInput, + supportsImageInput, }); if (result.providerIdRenamedFrom && result.providerId) { diff --git a/src/commands/onboard-non-interactive/local/auth-choice.test.ts b/src/commands/onboard-non-interactive/local/auth-choice.test.ts index b60f6a80a0a..e4b2e647e1b 100644 --- a/src/commands/onboard-non-interactive/local/auth-choice.test.ts +++ b/src/commands/onboard-non-interactive/local/auth-choice.test.ts @@ -136,4 +136,71 @@ describe("applyNonInteractiveAuthChoice", () => { }), ); }); + + it("marks non-interactive custom provider models as image-capable when requested", async () => { + const runtime = createRuntime(); + const nextConfig = { agents: { defaults: {} } } as OpenClawConfig; + resolveNonInteractiveApiKey.mockResolvedValueOnce(undefined); + + const result = await applyNonInteractiveAuthChoice({ + nextConfig, + authChoice: "custom-api-key", + opts: { + customBaseUrl: "https://models.custom.local/v1", + customModelId: "gpt-4o", + customImageInput: true, + } as never, + runtime: runtime as never, + baseConfig: nextConfig, + }); + + expect(result?.models?.providers?.["custom-models-custom-local"]?.models?.[0]?.input).toEqual([ + "text", + "image", + ]); + }); + + it("infers image-capable non-interactive custom provider models by known model id", async () => { + const runtime = createRuntime(); + const nextConfig = { agents: { defaults: {} } } as OpenClawConfig; + resolveNonInteractiveApiKey.mockResolvedValueOnce(undefined); + + const result = await applyNonInteractiveAuthChoice({ + nextConfig, + authChoice: "custom-api-key", + opts: { + customBaseUrl: "https://models.custom.local/v1", + customModelId: "gpt-4o", + } as never, + runtime: runtime as never, + baseConfig: nextConfig, + }); + + expect(result?.models?.providers?.["custom-models-custom-local"]?.models?.[0]?.input).toEqual([ + "text", + "image", + ]); + }); + + it("honors explicit text-only override for known custom vision models", async () => { + const runtime = createRuntime(); + const nextConfig = { agents: { defaults: {} } } as OpenClawConfig; + resolveNonInteractiveApiKey.mockResolvedValueOnce(undefined); + + const result = await applyNonInteractiveAuthChoice({ + nextConfig, + authChoice: "custom-api-key", + opts: { + customBaseUrl: "https://models.custom.local/v1", + customModelId: "gpt-4o", + customImageInput: false, + } as never, + runtime: runtime as never, + baseConfig: nextConfig, + }); + + expect(result?.models?.providers?.["custom-models-custom-local"]?.models?.[0]?.input).toEqual([ + "text", + ]); + }); }); diff --git a/src/commands/onboard-non-interactive/local/auth-choice.ts b/src/commands/onboard-non-interactive/local/auth-choice.ts index ef558fd03d1..f56d7995798 100644 --- a/src/commands/onboard-non-interactive/local/auth-choice.ts +++ b/src/commands/onboard-non-interactive/local/auth-choice.ts @@ -177,6 +177,7 @@ export async function applyNonInteractiveAuthChoice(params: { compatibility: opts.customCompatibility, apiKey: opts.customApiKey, providerId: opts.customProviderId, + supportsImageInput: opts.customImageInput, }); const resolvedProviderId = resolveCustomProviderId({ config: nextConfig, @@ -213,6 +214,7 @@ export async function applyNonInteractiveAuthChoice(params: { compatibility: customAuth.compatibility, apiKey: customApiKeyInput, providerId: customAuth.providerId, + supportsImageInput: customAuth.supportsImageInput, }); 
if (result.providerIdRenamedFrom && result.providerId) { runtime.log( diff --git a/src/commands/onboard-types.ts b/src/commands/onboard-types.ts index c1ee73a8ea6..4c40a9b43db 100644 --- a/src/commands/onboard-types.ts +++ b/src/commands/onboard-types.ts @@ -62,6 +62,7 @@ export type OnboardOptions = OnboardDynamicProviderOptions & { customModelId?: string; customProviderId?: string; customCompatibility?: "openai" | "anthropic"; + customImageInput?: boolean; gatewayPort?: number; gatewayBind?: GatewayBind; gatewayAuth?: GatewayAuthChoice;
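Reviewer sketch: a minimal standalone illustration of the resolution order this patch implements — explicit `--custom-image-input`/`--custom-text-input` first, then known-model inference, then the prompt (interactive) or text-only fallback (non-interactive). The helper names match the exports added above; the import path and sample model IDs are illustrative only, not part of the patch.

```ts
// Sketch only — assumes the exports from src/commands/onboard-custom.ts above;
// the relative path below is hypothetical.
import {
  inferCustomModelSupportsImageInput,
  resolveCustomModelImageInputInference,
} from "./src/commands/onboard-custom.js";

// Mirrors the flag mapping in register.onboard.ts: --custom-text-input forces
// text-only, --custom-image-input forces image-capable, and neither leaves
// `undefined` so inference can decide.
function imageInputFromFlags(opts: {
  customImageInput?: boolean;
  customTextInput?: boolean;
}): boolean | undefined {
  return opts.customTextInput === true ? false : opts.customImageInput === true ? true : undefined;
}

// Known vision IDs are marked image-capable without prompting.
console.log(inferCustomModelSupportsImageInput("gpt-4o")); // true

// Known text-only families also skip the prompt.
console.log(resolveCustomModelImageInputInference("llama3"));
// -> { supportsImageInput: false, confidence: "known" }

// Unknown IDs report "unknown": interactive onboarding then asks
// "Does this model support image input?"; non-interactive saves text-only.
console.log(resolveCustomModelImageInputInference("my-private-model"));
// -> { supportsImageInput: false, confidence: "unknown" }

// Explicit flags always win over inference.
console.log(imageInputFromFlags({ customTextInput: true })); // false
```

The expected outputs above follow directly from the `inferCustomModelSupportsImageInput` and `resolveCustomModelImageInputInference` unit tests in this diff.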