refactor(media): move provider defaults into media metadata

This commit is contained in:
Peter Steinberger
2026-04-04 07:00:35 +01:00
parent fca80d2ee2
commit 3a3f88a80a
26 changed files with 308 additions and 148 deletions

View File

@@ -126,11 +126,12 @@ function resolveImageGenerationModelCandidates(
providerDefaults.set(providerId, `${providerId}/${modelId}`);
}
const primaryProvider = resolveDefaultModelRef(cfg).provider;
const orderedProviders = [
resolveDefaultModelRef(cfg).provider,
"openai",
"google",
...providerDefaults.keys(),
primaryProvider,
...[...providerDefaults.keys()]
.filter((providerId) => providerId !== primaryProvider)
.toSorted(),
];
const orderedRefs: string[] = [];
const seen = new Set<string>();

View File

@@ -274,7 +274,7 @@ function createMinimaxImageConfig(): OpenClawConfig {
/**
 * Builds the expected image-model config object for a given primary model ref.
 *
 * @param primary - Model ref placed in the `primary` slot of the result.
 * @returns An object with `primary` and a fixed two-entry `fallbacks` list.
 */
function createDefaultImageFallbackExpectation(primary: string) {
  return {
    primary,
    // Exactly one `fallbacks` key: a duplicate key is a TS1117 error and the
    // earlier value would be silently overwritten at runtime anyway.
    fallbacks: ["openai/gpt-5.4-mini", "anthropic/claude-opus-4-6"],
  };
}
@@ -618,12 +618,12 @@ describe("image tool implicit imageModel config", () => {
agents: {
defaults: {
model: { primary: "minimax/MiniMax-M2.7" },
imageModel: { primary: "openai/gpt-5-mini" },
imageModel: { primary: "openai/gpt-5.4-mini" },
},
},
};
expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({
primary: "openai/gpt-5-mini",
primary: "openai/gpt-5.4-mini",
});
});
});
@@ -638,7 +638,7 @@ describe("image tool implicit imageModel config", () => {
agents: {
defaults: {
model: { primary: "acme/vision-1" },
imageModel: { primary: "openai/gpt-5-mini" },
imageModel: { primary: "openai/gpt-5.4-mini" },
},
},
models: {
@@ -652,7 +652,7 @@ describe("image tool implicit imageModel config", () => {
};
// Tool should still be available for explicit image analysis requests
expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({
primary: "openai/gpt-5-mini",
primary: "openai/gpt-5.4-mini",
});
const tool = createImageTool({ config: cfg, agentDir, modelHasVision: true });
expect(tool).not.toBeNull();
@@ -1229,7 +1229,7 @@ describe("image tool response validation", () => {
role: "assistant",
api: "openai-responses",
provider: "openai",
model: "gpt-5-mini",
model: "gpt-5.4-mini",
stopReason: "stop",
timestamp: Date.now(),
usage: makeZeroUsageSnapshot(),
@@ -1278,7 +1278,7 @@ describe("image tool response validation", () => {
expect(() =>
__testing.coerceImageAssistantText({
provider: "openai",
model: "gpt-5-mini",
model: "gpt-5.4-mini",
message,
}),
).toThrow(expectedError);

View File

@@ -1,6 +1,10 @@
import { resolve, isAbsolute } from "node:path";
import { Type } from "@sinclair/typebox";
import type { OpenClawConfig } from "../../config/config.js";
import {
resolveAutoMediaKeyProviders,
resolveDefaultMediaModel,
} from "../../media-understanding/defaults.js";
import { getMediaUnderstandingProvider } from "../../media-understanding/provider-registry.js";
import { buildProviderRegistry } from "../../media-understanding/runner.js";
import { loadWebMedia } from "../../media/web-media.js";
@@ -40,8 +44,6 @@ import {
} from "./tool-runtime.helpers.js";
const DEFAULT_PROMPT = "Describe the image.";
const ANTHROPIC_IMAGE_PRIMARY = "anthropic/claude-opus-4-6";
const ANTHROPIC_IMAGE_FALLBACK = "anthropic/claude-opus-4-5";
const DEFAULT_MAX_IMAGES = 20;
const imageToolProviderDeps = {
@@ -103,28 +105,39 @@ export function resolveImageModelConfigForTool(params: {
provider: primary.provider,
});
const primaryCandidates = (() => {
if (isMinimaxVlmProvider(primary.provider)) {
return [`${primary.provider}/MiniMax-VL-01`];
}
if (providerVisionFromConfig) {
return [providerVisionFromConfig];
}
if (primary.provider === "zai") {
return ["zai/glm-4.6v"];
const providerDefault = resolveDefaultMediaModel({
cfg: params.cfg,
providerId: primary.provider,
capability: "image",
});
if (providerDefault) {
return [`${primary.provider}/${providerDefault}`];
}
if (primary.provider === "openai") {
return ["openai/gpt-5-mini"];
}
if (primary.provider === "anthropic") {
return [ANTHROPIC_IMAGE_PRIMARY];
if (isMinimaxVlmProvider(primary.provider)) {
return [`${primary.provider}/MiniMax-VL-01`];
}
return [];
})();
const autoCandidates = resolveAutoMediaKeyProviders({
cfg: params.cfg,
capability: "image",
}).map((providerId) => {
const modelId = resolveDefaultMediaModel({
cfg: params.cfg,
providerId,
capability: "image",
});
return modelId ? `${providerId}/${modelId}` : null;
});
return buildToolModelConfigFromCandidates({
explicit,
agentDir: params.agentDir,
candidates: [...primaryCandidates, "openai/gpt-5-mini", ANTHROPIC_IMAGE_FALLBACK],
candidates: [...primaryCandidates, ...autoCandidates],
});
}

View File

@@ -4,22 +4,16 @@ import {
resolveAgentModelFallbackValues,
resolveAgentModelPrimaryValue,
} from "../../config/model-input.js";
import { providerSupportsNativePdfDocument } from "../../media-understanding/defaults.js";
import { extractAssistantText } from "../pi-embedded-utils.js";
export type PdfModelConfig = { primary?: string; fallbacks?: string[] };
/**
* Providers known to support native PDF document input.
* When the model's provider is in this set, the tool sends raw PDF bytes
* via provider-specific API calls instead of extracting text/images first.
*/
export const NATIVE_PDF_PROVIDERS = new Set(["anthropic", "google"]);
/**
 * Check whether a provider supports native PDF document input.
 *
 * @param provider - Provider id to look up.
 * @returns The result of the first `return` below: whether the lowercased,
 * trimmed id is a member of `NATIVE_PDF_PROVIDERS`.
 */
export function providerSupportsNativePdf(provider: string): boolean {
// Normalizes case and surrounding whitespace before the set lookup.
return NATIVE_PDF_PROVIDERS.has(provider.toLowerCase().trim());
// NOTE(review): unreachable as written. This looks like the replacement line
// (delegating to providerSupportsNativePdfDocument) sitting next to the line it
// supersedes; confirm which return is intended and drop the other.
return providerSupportsNativePdfDocument({ providerId: provider });
}
/**

View File

@@ -46,7 +46,7 @@ async function withTempAgentDir<T>(run: (agentDir: string) => Promise<T>): Promi
}
const ANTHROPIC_PDF_MODEL = "anthropic/claude-opus-4-6";
const OPENAI_PDF_MODEL = "openai/gpt-5-mini";
const OPENAI_PDF_MODEL = "openai/gpt-5.4-mini";
const TEST_PDF_INPUT = { base64: "dGVzdA==", filename: "doc.pdf" } as const;
const FAKE_PDF_MEDIA = {
kind: "document",
@@ -295,12 +295,12 @@ describe("resolvePdfModelConfigForTool", () => {
agents: {
defaults: {
model: { primary: "openai/gpt-5.4" },
imageModel: { primary: "openai/gpt-5-mini" },
imageModel: { primary: "openai/gpt-5.4-mini" },
},
},
};
expect(resolvePdfModelConfigForTool({ cfg, agentDir })).toEqual({
primary: "openai/gpt-5-mini",
primary: "openai/gpt-5.4-mini",
});
});
});

View File

@@ -1,6 +1,11 @@
import { type Context, complete } from "@mariozechner/pi-ai";
import { Type } from "@sinclair/typebox";
import type { OpenClawConfig } from "../../config/config.js";
import {
providerSupportsNativePdfDocument,
resolveAutoMediaKeyProviders,
resolveDefaultMediaModel,
} from "../../media-understanding/defaults.js";
import { extractPdfContent, type PdfExtractedContent } from "../../media/pdf-extract.js";
import { loadWebMediaRaw } from "../../media/web-media.js";
import { resolveUserPath } from "../../utils.js";
@@ -43,8 +48,6 @@ const DEFAULT_PROMPT = "Analyze this PDF document.";
const DEFAULT_MAX_PDFS = 10;
const DEFAULT_MAX_BYTES_MB = 10;
const DEFAULT_MAX_PAGES = 20;
const ANTHROPIC_PDF_PRIMARY = "anthropic/claude-opus-4-6";
const ANTHROPIC_PDF_FALLBACK = "anthropic/claude-opus-4-5";
const PDF_MIN_TEXT_CHARS = 200;
const PDF_MAX_PIXELS = 4_000_000;
@@ -75,9 +78,7 @@ export function resolvePdfModelConfigForTool(params: {
// Auto-detect from available providers
const primary = resolveDefaultModelRef(params.cfg);
const anthropicOk = hasAuthForProvider({ provider: "anthropic", agentDir: params.agentDir });
const googleOk = hasAuthForProvider({ provider: "google", agentDir: params.agentDir });
const openaiOk = hasAuthForProvider({ provider: "openai", agentDir: params.agentDir });
const fallbacks: string[] = [];
const addFallback = (ref: string) => {
@@ -95,30 +96,54 @@ export function resolvePdfModelConfigForTool(params: {
cfg: params.cfg,
provider: primary.provider,
});
const providerDefault = resolveDefaultMediaModel({
cfg: params.cfg,
providerId: primary.provider,
capability: "image",
});
const nativePdfCandidates = resolveAutoMediaKeyProviders({
cfg: params.cfg,
capability: "image",
})
.filter((providerId) => providerSupportsNativePdfDocument({ cfg: params.cfg, providerId }))
.filter((providerId) => hasAuthForProvider({ provider: providerId, agentDir: params.agentDir }))
.map((providerId) => {
const modelId = resolveDefaultMediaModel({
cfg: params.cfg,
providerId,
capability: "image",
});
return modelId ? `${providerId}/${modelId}` : null;
})
.filter((value): value is string => Boolean(value));
const genericImageCandidates = resolveAutoMediaKeyProviders({
cfg: params.cfg,
capability: "image",
})
.filter((providerId) => hasAuthForProvider({ provider: providerId, agentDir: params.agentDir }))
.map((providerId) => {
const modelId = resolveDefaultMediaModel({
cfg: params.cfg,
providerId,
capability: "image",
});
return modelId ? `${providerId}/${modelId}` : null;
})
.filter((value): value is string => Boolean(value));
if (primary.provider === "anthropic" && anthropicOk) {
preferred = ANTHROPIC_PDF_PRIMARY;
} else if (primary.provider === "google" && googleOk && providerVision) {
if (primary.provider === "google" && googleOk && providerVision) {
preferred = providerVision;
} else if (providerOk && providerVision) {
preferred = providerVision;
} else if (anthropicOk) {
preferred = ANTHROPIC_PDF_PRIMARY;
} else if (googleOk) {
preferred = "google/gemini-2.5-pro";
} else if (openaiOk) {
preferred = "openai/gpt-5-mini";
} else if (providerOk && (providerVision || providerDefault)) {
preferred = providerVision ?? `${primary.provider}/${providerDefault}`;
} else {
preferred = nativePdfCandidates[0] ?? genericImageCandidates[0] ?? null;
}
if (preferred?.trim()) {
if (anthropicOk && preferred !== ANTHROPIC_PDF_PRIMARY) {
addFallback(ANTHROPIC_PDF_PRIMARY);
}
if (anthropicOk) {
addFallback(ANTHROPIC_PDF_FALLBACK);
}
if (openaiOk) {
addFallback("openai/gpt-5-mini");
for (const candidate of [...nativePdfCandidates, ...genericImageCandidates]) {
if (candidate !== preferred) {
addFallback(candidate);
}
}
const pruned = fallbacks.filter((ref) => ref !== preferred);
return { primary: preferred, ...(pruned.length > 0 ? { fallbacks: pruned } : {}) };