diff --git a/CHANGELOG.md b/CHANGELOG.md index b92975f1054..625bbddb9c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ Docs: https://docs.openclaw.ai ### Fixes - Cron/Telegram: preserve explicit `:topic:` delivery targets over stale session-derived thread IDs when isolated cron announces to Telegram forum topics. Carries forward #59069; refs #49704 and #43808. Thanks @roytong9. +- CLI/onboarding: infer image input for common custom-provider vision model IDs, ask only for unknown models, and keep `--custom-image-input`/`--custom-text-input` overrides so vision-capable proxies do not get saved as text-only configs. Fixes #51869. Thanks @Antsoldier1974. - Memory/Dreaming: retry Dream Diary once with the session default when a configured dreaming model is unavailable, while leaving subagent trust and allowlist errors visible instead of silently masking configuration problems. Refs #67409 and #69209. Thanks @Ghiggins18 and @everySympathy. - Feishu/inbound files: recover CJK filenames from plain `Content-Disposition: filename=` download headers when Feishu exposes UTF-8 bytes through Latin-1 header decoding, while leaving valid Latin-1 and JSON-derived names unchanged. (#48578, #50435, #59431) Thanks @alex-xuweilong, @lishuaigit, and @DoChaoing. diff --git a/docs/cli/onboard.md b/docs/cli/onboard.md index e0c715034f2..bb15823dd39 100644 --- a/docs/cli/onboard.md +++ b/docs/cli/onboard.md @@ -61,10 +61,12 @@ openclaw onboard --non-interactive \ --custom-model-id "foo-large" \ --custom-api-key "$CUSTOM_API_KEY" \ --secret-input-mode plaintext \ - --custom-compatibility openai + --custom-compatibility openai \ + --custom-image-input ``` `--custom-api-key` is optional in non-interactive mode. If omitted, onboarding checks `CUSTOM_API_KEY`. +OpenClaw marks common vision model IDs as image-capable automatically. Pass `--custom-image-input` for unknown custom vision IDs, or `--custom-text-input` to force text-only metadata. LM Studio also supports a provider-specific key flag in non-interactive mode: diff --git a/docs/gateway/config-tools.md b/docs/gateway/config-tools.md index 979863b03e6..660656b0106 100644 --- a/docs/gateway/config-tools.md +++ b/docs/gateway/config-tools.md @@ -456,6 +456,7 @@ OpenClaw uses the built-in model catalog. Add custom providers via `models.provi - `models.providers.*.models`: explicit provider model catalog entries. + - `models.providers.*.models.*.input`: model input modalities. Use `["text"]` for text-only models and `["text", "image"]` for native image/vision models. Image attachments are only injected into agent turns when the selected model is marked image-capable. - `models.providers.*.models.*.contextWindow`: native model context window metadata. This overrides provider-level `contextWindow` for that model. - `models.providers.*.models.*.contextTokens`: optional runtime context cap. This overrides provider-level `contextTokens`; use it when you want a smaller effective context budget than the model's native `contextWindow`; `openclaw models list` shows both values when they differ. - `models.providers.*.models.*.compat.supportsDeveloperRole`: optional compatibility hint. For `api: "openai-completions"` with a non-empty non-native `baseUrl` (host not `api.openai.com`), OpenClaw forces this to `false` at runtime. Empty/omitted `baseUrl` keeps default OpenAI behavior. @@ -472,6 +473,8 @@ OpenClaw uses the built-in model catalog. 
Add custom providers via `models.provi +Interactive custom-provider onboarding infers image input for common vision model IDs such as GPT-4o, Claude, Gemini, Qwen-VL, LLaVA, Pixtral, InternVL, Mllama, MiniCPM-V, and GLM-4V, and skips the extra question for known text-only families. Unknown model IDs still prompt for image support. Non-interactive onboarding uses the same inference; pass `--custom-image-input` to force image-capable metadata or `--custom-text-input` to force text-only metadata. + ### Provider examples diff --git a/docs/gateway/local-models.md b/docs/gateway/local-models.md index a5480dd1279..9eaf268608e 100644 --- a/docs/gateway/local-models.md +++ b/docs/gateway/local-models.md @@ -168,6 +168,13 @@ catalog id and model ref: - `models.providers.mlx.models[].id: "mlx-community/Qwen3-30B-A3B-6bit"` - `agents.defaults.model.primary: "mlx/mlx-community/Qwen3-30B-A3B-6bit"` +Set `input: ["text", "image"]` on local or proxied vision models so image +attachments are injected into agent turns. Interactive custom-provider +onboarding infers common vision model IDs and asks only for unknown names. +Non-interactive onboarding uses the same inference; use `--custom-image-input` +for unknown vision IDs or `--custom-text-input` when a known-looking model is +text-only behind your endpoint. + Keep `models.mode: "merge"` so hosted models stay available as fallbacks. Use `models.providers.<id>.timeoutSeconds` for slow local or remote model servers before raising `agents.defaults.timeoutSeconds`. The provider timeout diff --git a/docs/start/wizard-cli-automation.md b/docs/start/wizard-cli-automation.md index 055fb1ef0bb..61b342eeb89 100644 --- a/docs/start/wizard-cli-automation.md +++ b/docs/start/wizard-cli-automation.md @@ -166,11 +166,13 @@ openclaw onboard --non-interactive \ --custom-api-key "$CUSTOM_API_KEY" \ --custom-provider-id "my-custom" \ --custom-compatibility anthropic \ + --custom-image-input \ --gateway-port 18789 \ --gateway-bind loopback ``` `--custom-api-key` is optional. If omitted, onboarding checks `CUSTOM_API_KEY`. + OpenClaw marks common vision model IDs as image-capable automatically. Add `--custom-image-input` for unknown custom vision IDs, or `--custom-text-input` to force text-only metadata. Ref-mode variant: @@ -184,6 +186,7 @@ openclaw onboard --non-interactive \ --secret-input-mode ref \ --custom-provider-id "my-custom" \ --custom-compatibility anthropic \ + --custom-image-input \ --gateway-port 18789 \ --gateway-bind loopback ``` diff --git a/docs/start/wizard-cli-reference.md b/docs/start/wizard-cli-reference.md index 7ff2efdb9b9..9ffca43ea2f 100644 --- a/docs/start/wizard-cli-reference.md +++ b/docs/start/wizard-cli-reference.md @@ -202,6 +202,7 @@ What you set: - `--custom-api-key` (optional; falls back to `CUSTOM_API_KEY`) - `--custom-provider-id` (optional) - `--custom-compatibility <compat>` (optional; default `openai`) + - `--custom-image-input` / `--custom-text-input` (optional; override inferred model input capability) @@ -212,6 +213,7 @@ What you set: Model behavior: - Pick default model from detected options, or enter provider and model manually. +- Custom-provider onboarding infers image support for common model IDs and asks only when the model name is unknown. - When onboarding starts from a provider auth choice, the model picker prefers that provider automatically.
For Volcengine and BytePlus, the same preference also matches their coding-plan variants (`volcengine-plan/*`, `byteplus-plan/*`). diff --git a/src/cli/program/register.onboard.ts b/src/cli/program/register.onboard.ts index 9f032358e07..a20b0f344d9 100644 --- a/src/cli/program/register.onboard.ts +++ b/src/cli/program/register.onboard.ts @@ -144,6 +144,8 @@ export function registerOnboardCommand(program: Command) { "--custom-compatibility <compat>", "Custom provider API compatibility: openai|anthropic (default: openai)", ) + .option("--custom-image-input", "Mark the custom provider model as image-capable") + .option("--custom-text-input", "Mark the custom provider model as text-only") .option("--gateway-port <port>", "Gateway port") .option("--gateway-bind <bind>", "Gateway bind: loopback|tailnet|lan|auto|custom") .option("--gateway-auth <auth>", "Gateway auth: token|password") @@ -214,6 +216,12 @@ export function registerOnboardCommand(program: Command) { customModelId: opts.customModelId as string | undefined, customProviderId: opts.customProviderId as string | undefined, customCompatibility: opts.customCompatibility as "openai" | "anthropic" | undefined, + customImageInput: + opts.customTextInput === true + ? false + : opts.customImageInput === true + ? true + : undefined, gatewayPort: typeof gatewayPort === "number" && Number.isFinite(gatewayPort) ? gatewayPort diff --git a/src/commands/onboard-custom-config.test.ts b/src/commands/onboard-custom-config.test.ts index bbea0ee2bac..f4e94404c76 100644 --- a/src/commands/onboard-custom-config.test.ts +++ b/src/commands/onboard-custom-config.test.ts @@ -5,7 +5,9 @@ import { applyCustomApiConfig, buildAnthropicVerificationProbeRequest, buildOpenAiVerificationProbeRequest, + inferCustomModelSupportsImageInput, parseNonInteractiveCustomApiFlags, + resolveCustomModelImageInputInference, } from "./onboard-custom-config.js"; function buildCustomProviderConfig(contextWindow?: number) { @@ -311,6 +313,60 @@ describe("applyCustomApiConfig", () => { ).toBeUndefined(); }); + it("adds image input for new non-azure custom models when requested", () => { + const result = applyCustomApiConfig({ + config: {}, + baseUrl: "https://llm.example.com/v1", + modelId: "gpt-4o", + compatibility: "openai", + providerId: "custom", + supportsImageInput: true, + }); + + expect(result.config.models?.providers?.custom?.models?.[0]?.input).toEqual(["text", "image"]); + }); + + it("infers image input for known non-azure custom vision models", () => { + const result = applyCustomApiConfig({ + config: {}, + baseUrl: "https://llm.example.com/v1", + modelId: "gpt-4o", + compatibility: "openai", + providerId: "custom", + }); + + expect(result.config.models?.providers?.custom?.models?.[0]?.input).toEqual(["text", "image"]); + }); + + it("lets explicit text input override known non-azure custom vision inference", () => { + const result = applyCustomApiConfig({ + config: {}, + baseUrl: "https://llm.example.com/v1", + modelId: "gpt-4o", + compatibility: "openai", + providerId: "custom", + supportsImageInput: false, + }); + + expect(result.config.models?.providers?.custom?.models?.[0]?.input).toEqual(["text"]); + }); + + it("updates existing non-azure custom model input when image support is explicitly requested", () => { + const result = applyCustomApiConfig({ + config: buildCustomProviderConfig(CONTEXT_WINDOW_HARD_MIN_TOKENS), + baseUrl: "https://llm.example.com/v1", + modelId: "foo-large", + compatibility: "openai", + providerId: "custom", + supportsImageInput: true, + }); + const model = 
result.config.models?.providers?.custom?.models?.find( + (entry) => entry.id === "foo-large", + ); + + expect(model?.input).toEqual(["text", "image"]); + }); + it("re-onboard preserves user-customized fields for non-azure models", () => { const result = applyCustomApiConfig({ config: { @@ -391,6 +447,16 @@ describe("parseNonInteractiveCustomApiFlags", () => { }); }); + it("parses custom image input opt-in", () => { + const result = parseNonInteractiveCustomApiFlags({ + baseUrl: "https://llm.example.com/v1", + modelId: "foo-large", + supportsImageInput: true, + }); + + expect(result.supportsImageInput).toBe(true); + }); + it.each([ { name: "missing required flags", @@ -419,3 +485,30 @@ describe("parseNonInteractiveCustomApiFlags", () => { expect(() => parseNonInteractiveCustomApiFlags(flags)).toThrow(expectedMessage); }); }); + +describe("inferCustomModelSupportsImageInput", () => { + it.each(["gpt-4o", "claude-sonnet-4-6", "gemini-3-flash", "qwen2.5-vl", "llava"])( + "detects likely vision model %s", + (modelId) => { + expect(inferCustomModelSupportsImageInput(modelId)).toBe(true); + }, + ); + + it.each(["llama3", "deepseek-v3", "evolvable-text-model"])( + "does not over-match text model %s", + (modelId) => { + expect(inferCustomModelSupportsImageInput(modelId)).toBe(false); + }, + ); + + it("reports confidence for known text and unknown custom models", () => { + expect(resolveCustomModelImageInputInference("llama3")).toEqual({ + supportsImageInput: false, + confidence: "known", + }); + expect(resolveCustomModelImageInputInference("my-private-model")).toEqual({ + supportsImageInput: false, + confidence: "unknown", + }); + }); +}); diff --git a/src/commands/onboard-custom-config.ts b/src/commands/onboard-custom-config.ts index f907309fd71..210d4ad5c18 100644 --- a/src/commands/onboard-custom-config.ts +++ b/src/commands/onboard-custom-config.ts @@ -18,12 +18,72 @@ const DEFAULT_MAX_TOKENS = 4096; // Azure OpenAI uses the Responses API which supports larger defaults const AZURE_DEFAULT_CONTEXT_WINDOW = 400_000; const AZURE_DEFAULT_MAX_TOKENS = 16_384; +type CustomModelInput = "text" | "image"; +export type CustomModelImageInputInference = { + supportsImageInput: boolean; + confidence: "known" | "unknown"; +}; function normalizeContextWindowForCustomModel(value: unknown): number { const parsed = typeof value === "number" && Number.isFinite(value) ? Math.floor(value) : 0; return parsed >= CONTEXT_WINDOW_HARD_MIN_TOKENS ? parsed : CONTEXT_WINDOW_HARD_MIN_TOKENS; } +function customModelInputs(supportsImageInput: boolean): CustomModelInput[] { + return supportsImageInput ? 
["text", "image"] : ["text"]; +} + +export function resolveCustomModelImageInputInference( + modelId: string, +): CustomModelImageInputInference { + const normalized = normalizeLowercaseStringOrEmpty(modelId); + if (!normalized) { + return { supportsImageInput: false, confidence: "unknown" }; + } + const matchesKnownVision = + /\b(?:gpt-4o|gpt-4\.1|gpt-[5-9]|o[134])\b/.test(normalized) || + /\bclaude-(?:3|4|sonnet|opus|haiku)\b/.test(normalized) || + /\bgemini\b/.test(normalized) || + /\b(?:qwen[\w.-]*-?vl|qwen-vl)\b/.test(normalized) || + /\b(?:vision|llava|pixtral|internvl|mllama|minicpm-v|glm-4v)\b/.test(normalized) || + /(?:^|[-_/])vl(?:[-_/]|$)/.test(normalized); + if (matchesKnownVision) { + return { supportsImageInput: true, confidence: "known" }; + } + + const matchesKnownText = + /\b(?:llama\d*|deepseek|mistral|mixtral|kimi|moonshot|codestral|devstral|phi|qwq|codellama)\b/.test( + normalized, + ) || /\bqwen(?!.*(?:vl|vision))/.test(normalized); + if (matchesKnownText) { + return { supportsImageInput: false, confidence: "known" }; + } + + return { supportsImageInput: false, confidence: "unknown" }; +} + +export function inferCustomModelSupportsImageInput(modelId: string): boolean { + return resolveCustomModelImageInputInference(modelId).supportsImageInput; +} + +function resolveCustomModelSupportsImageInput(params: { + modelId: string; + explicit?: boolean; + fallback: boolean; + inferKnownModels: boolean; +}): boolean { + return ( + params.explicit ?? + ((): boolean => { + if (!params.inferKnownModels) { + return params.fallback; + } + const inference = resolveCustomModelImageInputInference(params.modelId); + return inference.confidence === "known" ? inference.supportsImageInput : params.fallback; + })() + ); +} + function isAzureFoundryUrl(baseUrl: string): boolean { try { const url = new URL(baseUrl); @@ -112,6 +172,7 @@ export type ApplyCustomApiConfigParams = { apiKey?: SecretInput; providerId?: string; alias?: string; + supportsImageInput?: boolean; }; export type ParseNonInteractiveCustomApiFlagsParams = { @@ -120,6 +181,7 @@ export type ParseNonInteractiveCustomApiFlagsParams = { compatibility?: string; apiKey?: string; providerId?: string; + supportsImageInput?: boolean; }; export type ParsedNonInteractiveCustomApiFlags = { @@ -128,6 +190,7 @@ export type ParsedNonInteractiveCustomApiFlags = { compatibility: CustomApiCompatibility; apiKey?: string; providerId?: string; + supportsImageInput?: boolean; }; export type CustomApiErrorCode = @@ -439,6 +502,9 @@ export function parseNonInteractiveCustomApiFlags( compatibility: parseCustomApiCompatibility(params.compatibility), ...(apiKey ? { apiKey } : {}), ...(providerId ? { providerId } : {}), + ...(params.supportsImageInput === undefined + ? {} + : { supportsImageInput: params.supportsImageInput }), }; } @@ -487,15 +553,25 @@ export function applyCustomApiConfig(params: ApplyCustomApiConfigParams): Custom const existingModels = Array.isArray(existingProvider?.models) ? existingProvider.models : []; const hasModel = existingModels.some((model) => model.id === modelId); const isLikelyReasoningModel = isAzure && /\b(o[134]|gpt-([5-9]|\d{2,}))\b/i.test(modelId); + const explicitInput = + params.supportsImageInput === undefined + ? 
undefined + : customModelInputs(params.supportsImageInput); + const generatedInput = customModelInputs( + resolveCustomModelSupportsImageInput({ + modelId, + explicit: params.supportsImageInput, + fallback: isAzure && isLikelyReasoningModel, + inferKnownModels: !isAzure, + }), + ); const nextModel = isAzure ? { id: modelId, name: `${modelId} (Custom Provider)`, contextWindow: AZURE_DEFAULT_CONTEXT_WINDOW, maxTokens: AZURE_DEFAULT_MAX_TOKENS, - input: isLikelyReasoningModel - ? (["text", "image"] as Array<"text" | "image">) - : (["text"] as ["text"]), + input: generatedInput, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, reasoning: isLikelyReasoningModel, compat: { supportsStore: false }, @@ -505,7 +581,7 @@ export function applyCustomApiConfig(params: ApplyCustomApiConfigParams): Custom name: `${modelId} (Custom Provider)`, contextWindow: DEFAULT_CONTEXT_WINDOW, maxTokens: DEFAULT_MAX_TOKENS, - input: ["text"] as ["text"], + input: generatedInput, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, reasoning: false, }; @@ -515,6 +591,7 @@ export function applyCustomApiConfig(params: ApplyCustomApiConfigParams): Custom ? { ...model, ...(isAzure ? nextModel : {}), + ...(explicitInput ? { input: explicitInput } : {}), name: model.name ?? nextModel.name, cost: model.cost ?? nextModel.cost, contextWindow: normalizeContextWindowForCustomModel(model.contextWindow), diff --git a/src/commands/onboard-custom.test.ts b/src/commands/onboard-custom.test.ts index c7d5a2379e2..ab56ff99f3d 100644 --- a/src/commands/onboard-custom.test.ts +++ b/src/commands/onboard-custom.test.ts @@ -19,7 +19,7 @@ vi.mock("../plugins/provider-auth-input.js", () => ({ ), })); -function createTestPrompter(params: { text: string[]; select?: string[] }): { +function createTestPrompter(params: { text: string[]; select?: string[]; confirm?: boolean[] }): { text: ReturnType<typeof vi.fn>; select: ReturnType<typeof vi.fn>; confirm: ReturnType<typeof vi.fn>; @@ -34,6 +34,10 @@ function createTestPrompter(params: { text: string[]; select?: string[] }): { for (const answer of params.select ?? []) { select.mockResolvedValueOnce(answer); } + const confirm = vi.fn(async () => false); + for (const answer of params.confirm ?? 
[]) { + confirm.mockResolvedValueOnce(answer); + } return { text, progress: vi.fn(() => ({ @@ -41,7 +45,7 @@ function createTestPrompter(params: { text: string[]; select?: string[] }): { stop: vi.fn(), })), select, - confirm: vi.fn(), + confirm, note: vi.fn(), }; } @@ -100,6 +104,38 @@ describe("promptCustomApiConfig", () => { expectOpenAiCompatResult({ prompter, textCalls: 5, selectCalls: 2, result }); expect(result.config.agents?.defaults?.models?.["custom/llama3"]?.alias).toBe("local"); + expect(result.config.models?.providers?.custom?.models?.[0]?.input).toEqual(["text"]); + expect(prompter.confirm).not.toHaveBeenCalled(); + }); + + it("skips the image-input prompt for known custom vision models", async () => { + const prompter = createTestPrompter({ + text: ["https://proxy.example.com/v1", "test-key", "gpt-4o", "custom", ""], + select: ["plaintext", "openai"], + }); + stubFetchSequence([{ ok: true }]); + + const result = await runPromptCustomApi(prompter); + + expect(result.config.models?.providers?.custom?.models?.[0]?.input).toEqual(["text", "image"]); + expect(prompter.confirm).not.toHaveBeenCalled(); + }); + + it("prompts for custom model image support when the model is unknown", async () => { + const prompter = createTestPrompter({ + text: ["https://proxy.example.com/v1", "test-key", "private-model", "custom", ""], + select: ["plaintext", "openai"], + confirm: [true], + }); + stubFetchSequence([{ ok: true }]); + + const result = await runPromptCustomApi(prompter); + + expect(result.config.models?.providers?.custom?.models?.[0]?.input).toEqual(["text", "image"]); + expect(prompter.confirm).toHaveBeenCalledWith({ + message: "Does this model support image input?", + initialValue: false, + }); }); it("defaults custom setup to the native Ollama base URL", async () => { diff --git a/src/commands/onboard-custom.ts b/src/commands/onboard-custom.ts index f19dcd94169..24927eaa294 100644 --- a/src/commands/onboard-custom.ts +++ b/src/commands/onboard-custom.ts @@ -15,6 +15,7 @@ import { normalizeEndpointId, normalizeOptionalProviderApiKey, resolveCustomModelAliasError, + resolveCustomModelImageInputInference, resolveCustomProviderId, type CustomApiCompatibility, type CustomApiResult, @@ -24,11 +25,14 @@ export { buildAnthropicVerificationProbeRequest, buildOpenAiVerificationProbeRequest, CustomApiError, + inferCustomModelSupportsImageInput, parseNonInteractiveCustomApiFlags, + resolveCustomModelImageInputInference, resolveCustomProviderId, type ApplyCustomApiConfigParams, type CustomApiCompatibility, type CustomApiErrorCode, + type CustomModelImageInputInference, type CustomApiResult, type ParseNonInteractiveCustomApiFlagsParams, type ParsedNonInteractiveCustomApiFlags, @@ -341,6 +345,14 @@ export async function promptCustomApiConfig(params: { return resolveCustomModelAliasError({ raw: value, cfg: config, modelRef }); }, }); + const imageInputInference = resolveCustomModelImageInputInference(modelId); + const supportsImageInput = + imageInputInference.confidence === "known" + ? imageInputInference.supportsImageInput + : await prompter.confirm({ + message: "Does this model support image input?", + initialValue: imageInputInference.supportsImageInput, + }); const resolvedCompatibility = compatibility ?? 
"openai"; const result = applyCustomApiConfig({ config, @@ -350,6 +362,7 @@ export async function promptCustomApiConfig(params: { apiKey, providerId: providerIdInput, alias: aliasInput, + supportsImageInput, }); if (result.providerIdRenamedFrom && result.providerId) { diff --git a/src/commands/onboard-non-interactive/local/auth-choice.test.ts b/src/commands/onboard-non-interactive/local/auth-choice.test.ts index b60f6a80a0a..e4b2e647e1b 100644 --- a/src/commands/onboard-non-interactive/local/auth-choice.test.ts +++ b/src/commands/onboard-non-interactive/local/auth-choice.test.ts @@ -136,4 +136,71 @@ describe("applyNonInteractiveAuthChoice", () => { }), ); }); + + it("marks non-interactive custom provider models as image-capable when requested", async () => { + const runtime = createRuntime(); + const nextConfig = { agents: { defaults: {} } } as OpenClawConfig; + resolveNonInteractiveApiKey.mockResolvedValueOnce(undefined); + + const result = await applyNonInteractiveAuthChoice({ + nextConfig, + authChoice: "custom-api-key", + opts: { + customBaseUrl: "https://models.custom.local/v1", + customModelId: "gpt-4o", + customImageInput: true, + } as never, + runtime: runtime as never, + baseConfig: nextConfig, + }); + + expect(result?.models?.providers?.["custom-models-custom-local"]?.models?.[0]?.input).toEqual([ + "text", + "image", + ]); + }); + + it("infers image-capable non-interactive custom provider models by known model id", async () => { + const runtime = createRuntime(); + const nextConfig = { agents: { defaults: {} } } as OpenClawConfig; + resolveNonInteractiveApiKey.mockResolvedValueOnce(undefined); + + const result = await applyNonInteractiveAuthChoice({ + nextConfig, + authChoice: "custom-api-key", + opts: { + customBaseUrl: "https://models.custom.local/v1", + customModelId: "gpt-4o", + } as never, + runtime: runtime as never, + baseConfig: nextConfig, + }); + + expect(result?.models?.providers?.["custom-models-custom-local"]?.models?.[0]?.input).toEqual([ + "text", + "image", + ]); + }); + + it("honors explicit text-only override for known custom vision models", async () => { + const runtime = createRuntime(); + const nextConfig = { agents: { defaults: {} } } as OpenClawConfig; + resolveNonInteractiveApiKey.mockResolvedValueOnce(undefined); + + const result = await applyNonInteractiveAuthChoice({ + nextConfig, + authChoice: "custom-api-key", + opts: { + customBaseUrl: "https://models.custom.local/v1", + customModelId: "gpt-4o", + customImageInput: false, + } as never, + runtime: runtime as never, + baseConfig: nextConfig, + }); + + expect(result?.models?.providers?.["custom-models-custom-local"]?.models?.[0]?.input).toEqual([ + "text", + ]); + }); }); diff --git a/src/commands/onboard-non-interactive/local/auth-choice.ts b/src/commands/onboard-non-interactive/local/auth-choice.ts index ef558fd03d1..f56d7995798 100644 --- a/src/commands/onboard-non-interactive/local/auth-choice.ts +++ b/src/commands/onboard-non-interactive/local/auth-choice.ts @@ -177,6 +177,7 @@ export async function applyNonInteractiveAuthChoice(params: { compatibility: opts.customCompatibility, apiKey: opts.customApiKey, providerId: opts.customProviderId, + supportsImageInput: opts.customImageInput, }); const resolvedProviderId = resolveCustomProviderId({ config: nextConfig, @@ -213,6 +214,7 @@ export async function applyNonInteractiveAuthChoice(params: { compatibility: customAuth.compatibility, apiKey: customApiKeyInput, providerId: customAuth.providerId, + supportsImageInput: customAuth.supportsImageInput, }); 
if (result.providerIdRenamedFrom && result.providerId) { runtime.log( diff --git a/src/commands/onboard-types.ts b/src/commands/onboard-types.ts index c1ee73a8ea6..4c40a9b43db 100644 --- a/src/commands/onboard-types.ts +++ b/src/commands/onboard-types.ts @@ -62,6 +62,7 @@ export type OnboardOptions = OnboardDynamicProviderOptions & { customModelId?: string; customProviderId?: string; customCompatibility?: "openai" | "anthropic"; + customImageInput?: boolean; gatewayPort?: number; gatewayBind?: GatewayBind; gatewayAuth?: GatewayAuthChoice;
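Reviewer sketch: a minimal standalone illustration of the resolution order this patch implements — explicit `--custom-image-input`/`--custom-text-input` first, then known-model inference, then the prompt (interactive) or text-only fallback (non-interactive). The helper names match the exports added above; the import path and sample model IDs are illustrative only, not part of the patch.

```ts
// Sketch only — assumes the exports from src/commands/onboard-custom.ts above;
// the relative path below is hypothetical.
import {
  inferCustomModelSupportsImageInput,
  resolveCustomModelImageInputInference,
} from "./src/commands/onboard-custom.js";

// Mirrors the flag mapping in register.onboard.ts: --custom-text-input forces
// text-only, --custom-image-input forces image-capable, and neither leaves
// `undefined` so inference can decide.
function imageInputFromFlags(opts: {
  customImageInput?: boolean;
  customTextInput?: boolean;
}): boolean | undefined {
  return opts.customTextInput === true ? false : opts.customImageInput === true ? true : undefined;
}

// Known vision IDs are marked image-capable without prompting.
console.log(inferCustomModelSupportsImageInput("gpt-4o")); // true

// Known text-only families also skip the prompt.
console.log(resolveCustomModelImageInputInference("llama3"));
// -> { supportsImageInput: false, confidence: "known" }

// Unknown IDs report "unknown": interactive onboarding then asks
// "Does this model support image input?"; non-interactive saves text-only.
console.log(resolveCustomModelImageInputInference("my-private-model"));
// -> { supportsImageInput: false, confidence: "unknown" }

// Explicit flags always win over inference.
console.log(imageInputFromFlags({ customTextInput: true })); // false
```

The expected outputs above follow directly from the `inferCustomModelSupportsImageInput` and `resolveCustomModelImageInputInference` unit tests in this diff.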