diff --git a/src/agents/auth-profiles.ts b/src/agents/auth-profiles.ts index dda2e41256f..174951ce0ae 100644 --- a/src/agents/auth-profiles.ts +++ b/src/agents/auth-profiles.ts @@ -27,6 +27,7 @@ export { export { clearRuntimeAuthProfileStoreSnapshots, ensureAuthProfileStore, + hasAnyAuthProfileStoreSource, loadAuthProfileStoreForSecretsRuntime, loadAuthProfileStoreForRuntime, replaceRuntimeAuthProfileStoreSnapshots, diff --git a/src/agents/tools/model-config.helpers.ts b/src/agents/tools/model-config.helpers.ts index 8ba5b8ea283..b1ad3068028 100644 --- a/src/agents/tools/model-config.helpers.ts +++ b/src/agents/tools/model-config.helpers.ts @@ -4,7 +4,11 @@ import { resolveAgentModelPrimaryValue, } from "../../config/model-input.js"; import type { AgentModelConfig } from "../../config/types.agents-shared.js"; -import { ensureAuthProfileStore, listProfilesForProvider } from "../auth-profiles.js"; +import { + ensureAuthProfileStore, + hasAnyAuthProfileStoreSource, + listProfilesForProvider, +} from "../auth-profiles.js"; import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "../defaults.js"; import { resolveEnvApiKey } from "../model-auth.js"; import { resolveConfiguredModelRef } from "../model-selection.js"; @@ -37,6 +41,9 @@ export function hasAuthForProvider(params: { provider: string; agentDir?: string if (!agentDir) { return false; } + if (!hasAnyAuthProfileStoreSource(agentDir)) { + return false; + } const store = ensureAuthProfileStore(agentDir, { allowKeychainPrompt: false, }); diff --git a/src/agents/tools/pdf-tool.helpers.test.ts b/src/agents/tools/pdf-tool.helpers.test.ts new file mode 100644 index 00000000000..de16c38470e --- /dev/null +++ b/src/agents/tools/pdf-tool.helpers.test.ts @@ -0,0 +1,130 @@ +import type { OpenClawConfig } from "../../config/config.js"; +import { describe, expect, it } from "vitest"; +import { + coercePdfAssistantText, + coercePdfModelConfig, + parsePageRange, + providerSupportsNativePdf, + resolvePdfToolMaxTokens, +} from 
"./pdf-tool.helpers.js"; + +const ANTHROPIC_PDF_MODEL = "anthropic/claude-opus-4-6"; + +describe("parsePageRange", () => { + it("parses a single page number", () => { + expect(parsePageRange("3", 20)).toEqual([3]); + }); + + it("parses a page range", () => { + expect(parsePageRange("1-5", 20)).toEqual([1, 2, 3, 4, 5]); + }); + + it("parses comma-separated pages and ranges", () => { + expect(parsePageRange("1,3,5-7", 20)).toEqual([1, 3, 5, 6, 7]); + }); + + it("clamps to maxPages", () => { + expect(parsePageRange("1-100", 5)).toEqual([1, 2, 3, 4, 5]); + }); + + it("deduplicates and sorts", () => { + expect(parsePageRange("5,3,1,3,5", 20)).toEqual([1, 3, 5]); + }); + + it("throws on invalid page number", () => { + expect(() => parsePageRange("abc", 20)).toThrow("Invalid page number"); + }); + + it("throws on invalid range (start > end)", () => { + expect(() => parsePageRange("5-3", 20)).toThrow("Invalid page range"); + }); + + it("throws on zero page number", () => { + expect(() => parsePageRange("0", 20)).toThrow("Invalid page number"); + }); + + it("throws on negative page number", () => { + expect(() => parsePageRange("-1", 20)).toThrow("Invalid page number"); + }); + + it("handles empty parts gracefully", () => { + expect(parsePageRange("1,,3", 20)).toEqual([1, 3]); + }); +}); + +describe("providerSupportsNativePdf", () => { + it("returns true for anthropic", () => { + expect(providerSupportsNativePdf("anthropic")).toBe(true); + }); + + it("returns true for google", () => { + expect(providerSupportsNativePdf("google")).toBe(true); + }); + + it("returns false for openai", () => { + expect(providerSupportsNativePdf("openai")).toBe(false); + }); + + it("returns false for minimax", () => { + expect(providerSupportsNativePdf("minimax")).toBe(false); + }); + + it("is case-insensitive", () => { + expect(providerSupportsNativePdf("Anthropic")).toBe(true); + expect(providerSupportsNativePdf("GOOGLE")).toBe(true); + }); +}); + +describe("pdf-tool.helpers", () => { + 
it("resolvePdfToolMaxTokens respects model limit", () => { + expect(resolvePdfToolMaxTokens(2048, 4096)).toBe(2048); + expect(resolvePdfToolMaxTokens(8192, 4096)).toBe(4096); + expect(resolvePdfToolMaxTokens(undefined, 4096)).toBe(4096); + }); + + it("coercePdfModelConfig reads primary and fallbacks", () => { + const cfg = { + agents: { + defaults: { + pdfModel: { + primary: ANTHROPIC_PDF_MODEL, + fallbacks: ["google/gemini-2.5-pro"], + }, + }, + }, + } as OpenClawConfig; + expect(coercePdfModelConfig(cfg)).toEqual({ + primary: ANTHROPIC_PDF_MODEL, + fallbacks: ["google/gemini-2.5-pro"], + }); + }); + + it("coercePdfAssistantText returns trimmed text", () => { + expect( + coercePdfAssistantText({ + provider: "anthropic", + model: "claude-opus-4-6", + message: { + role: "assistant", + stopReason: "stop", + content: [{ type: "text", text: " summary " }], + } as never, + }), + ).toBe("summary"); + }); + + it("coercePdfAssistantText throws clear error for failed model output", () => { + expect(() => + coercePdfAssistantText({ + provider: "google", + model: "gemini-2.5-pro", + message: { + role: "assistant", + stopReason: "error", + errorMessage: "bad request", + content: [], + } as never, + }), + ).toThrow("PDF model failed (google/gemini-2.5-pro): bad request"); + }); +}); diff --git a/src/agents/tools/pdf-tool.helpers.ts b/src/agents/tools/pdf-tool.helpers.ts index e259b79e11e..c6b6243fd0e 100644 --- a/src/agents/tools/pdf-tool.helpers.ts +++ b/src/agents/tools/pdf-tool.helpers.ts @@ -4,7 +4,7 @@ import { resolveAgentModelFallbackValues, resolveAgentModelPrimaryValue, } from "../../config/model-input.js"; -import { providerSupportsNativePdfDocument } from "../../media-understanding/defaults.js"; +import { bundledProviderSupportsNativePdfDocument } from "../../media-understanding/bundled-defaults.js"; import { extractAssistantText } from "../pi-embedded-utils.js"; export type PdfModelConfig = { primary?: string; fallbacks?: string[] }; @@ -13,7 +13,7 @@ export type 
PdfModelConfig = { primary?: string; fallbacks?: string[] }; * Check whether a provider supports native PDF document input. */ export function providerSupportsNativePdf(provider: string): boolean { - return providerSupportsNativePdfDocument({ providerId: provider }); + return bundledProviderSupportsNativePdfDocument(provider); } /** diff --git a/src/agents/tools/pdf-tool.model-config.test.ts b/src/agents/tools/pdf-tool.model-config.test.ts new file mode 100644 index 00000000000..b6f24287763 --- /dev/null +++ b/src/agents/tools/pdf-tool.model-config.test.ts @@ -0,0 +1,103 @@ +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import type { OpenClawConfig } from "../../config/config.js"; +import { resolvePdfModelConfigForTool } from "./pdf-tool.model-config.js"; + +const ANTHROPIC_PDF_MODEL = "anthropic/claude-opus-4-6"; + +async function withTempAgentDir(run: (agentDir: string) => Promise): Promise { + const agentDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-pdf-")); + try { + return await run(agentDir); + } finally { + await fs.rm(agentDir, { recursive: true, force: true }); + } +} + +function resetAuthEnv() { + vi.stubEnv("OPENAI_API_KEY", ""); + vi.stubEnv("ANTHROPIC_API_KEY", ""); + vi.stubEnv("ANTHROPIC_OAUTH_TOKEN", ""); + vi.stubEnv("GEMINI_API_KEY", ""); + vi.stubEnv("GOOGLE_API_KEY", ""); + vi.stubEnv("MINIMAX_API_KEY", ""); + vi.stubEnv("ZAI_API_KEY", ""); + vi.stubEnv("Z_AI_API_KEY", ""); + vi.stubEnv("COPILOT_GITHUB_TOKEN", ""); + vi.stubEnv("GH_TOKEN", ""); + vi.stubEnv("GITHUB_TOKEN", ""); +} + +function withDefaultModel(primary: string): OpenClawConfig { + return { + agents: { defaults: { model: { primary } } }, + } as OpenClawConfig; +} + +describe("resolvePdfModelConfigForTool", () => { + beforeEach(() => { + resetAuthEnv(); + }); + + afterEach(() => { + vi.unstubAllEnvs(); + }); + + it("returns null without any auth", 
async () => { + await withTempAgentDir(async (agentDir) => { + const cfg = withDefaultModel("openai/gpt-5.4"); + expect(resolvePdfModelConfigForTool({ cfg, agentDir })).toBeNull(); + }); + }); + + it("prefers explicit pdfModel config", async () => { + await withTempAgentDir(async (agentDir) => { + const cfg = { + agents: { + defaults: { + model: { primary: "openai/gpt-5.4" }, + pdfModel: { primary: ANTHROPIC_PDF_MODEL }, + }, + }, + } as OpenClawConfig; + expect(resolvePdfModelConfigForTool({ cfg, agentDir })).toEqual({ + primary: ANTHROPIC_PDF_MODEL, + }); + }); + }); + + it("falls back to imageModel config when no pdfModel set", async () => { + await withTempAgentDir(async (agentDir) => { + const cfg = { + agents: { + defaults: { + model: { primary: "openai/gpt-5.4" }, + imageModel: { primary: "openai/gpt-5.4-mini" }, + }, + }, + } as OpenClawConfig; + expect(resolvePdfModelConfigForTool({ cfg, agentDir })).toEqual({ + primary: "openai/gpt-5.4-mini", + }); + }); + }); + + it("prefers anthropic when available for native PDF support", async () => { + await withTempAgentDir(async (agentDir) => { + vi.stubEnv("ANTHROPIC_API_KEY", "anthropic-test"); + vi.stubEnv("OPENAI_API_KEY", "openai-test"); + const cfg = withDefaultModel("openai/gpt-5.4"); + expect(resolvePdfModelConfigForTool({ cfg, agentDir })?.primary).toBe(ANTHROPIC_PDF_MODEL); + }); + }); + + it("uses anthropic primary when provider is anthropic", async () => { + await withTempAgentDir(async (agentDir) => { + vi.stubEnv("ANTHROPIC_API_KEY", "anthropic-test"); + const cfg = withDefaultModel(ANTHROPIC_PDF_MODEL); + expect(resolvePdfModelConfigForTool({ cfg, agentDir })?.primary).toBe(ANTHROPIC_PDF_MODEL); + }); + }); +}); diff --git a/src/agents/tools/pdf-tool.model-config.ts b/src/agents/tools/pdf-tool.model-config.ts new file mode 100644 index 00000000000..3ffff898e22 --- /dev/null +++ b/src/agents/tools/pdf-tool.model-config.ts @@ -0,0 +1,125 @@ +import type { OpenClawConfig } from "../../config/config.js"; 
import {
  bundledProviderSupportsNativePdfDocument,
  resolveBundledAutoMediaKeyProviders,
  resolveBundledDefaultMediaModel,
} from "../../media-understanding/bundled-defaults.js";
import {
  coerceImageModelConfig,
  type ImageModelConfig,
  resolveProviderVisionModelFromConfig,
} from "./image-tool.helpers.js";
import { hasAuthForProvider, resolveDefaultModelRef } from "./model-config.helpers.js";
import { coercePdfModelConfig } from "./pdf-tool.helpers.js";

/**
 * Return everything after the first "/" of a model ref, or `undefined` when the
 * ref is missing or contains no "/".
 *
 * Slicing at the first separator (instead of `split("/")[1]`) keeps model ids
 * that themselves contain "/" intact: `a/b/c` yields `b/c`, not `b`.
 */
function modelIdFromRef(ref: string | null | undefined): string | undefined {
  if (ref == null) {
    return undefined;
  }
  const separator = ref.indexOf("/");
  return separator === -1 ? undefined : ref.slice(separator + 1);
}

/**
 * Resolve the effective model config for the PDF tool.
 *
 * Resolution order:
 * 1. The explicit PDF model config, when it has a primary or any fallbacks.
 * 2. The explicit image model config, when it has a primary or any fallbacks.
 * 3. Auto-detection: the default model's provider is used when it has auth,
 *    supports native PDF documents and an image-capable model can be named for
 *    it; otherwise the first authenticated native-PDF candidate, then the first
 *    authenticated image candidate. Every other candidate becomes a fallback.
 *
 * @param params.cfg - Optional config used for explicit settings, vision-model
 *   lookup and the `models.providers` scan.
 * @param params.agentDir - Agent directory forwarded to the auth lookups.
 * @returns The resolved config, or `null` when no usable model was found.
 */
export function resolvePdfModelConfigForTool(params: {
  cfg?: OpenClawConfig;
  agentDir: string;
}): ImageModelConfig | null {
  const { cfg, agentDir } = params;

  const explicitPdf = coercePdfModelConfig(cfg);
  if (explicitPdf.primary?.trim() || (explicitPdf.fallbacks?.length ?? 0) > 0) {
    return explicitPdf;
  }

  const explicitImage = coerceImageModelConfig(cfg);
  if (explicitImage.primary?.trim() || (explicitImage.fallbacks?.length ?? 0) > 0) {
    return explicitImage;
  }

  // Each provider is consulted from several places below; remember the answer
  // so the auth lookup runs at most once per provider within this call.
  const authByProvider = new Map<string, boolean>();
  const hasAuth = (provider: string): boolean => {
    let ok = authByProvider.get(provider);
    if (ok === undefined) {
      ok = hasAuthForProvider({ provider, agentDir });
      authByProvider.set(provider, ok);
    }
    return ok;
  };

  // Config-selected vision model first, bundled image default second.
  // NOTE(review): the vision ref is assumed to be "provider/model" shaped,
  // matching how the rest of this function builds refs; confirm against
  // resolveProviderVisionModelFromConfig.
  const resolveImageRef = (providerId: string): string | null => {
    const modelId =
      modelIdFromRef(resolveProviderVisionModelFromConfig({ cfg, provider: providerId })) ??
      resolveBundledDefaultMediaModel({ providerId, capability: "image" });
    return modelId ? `${providerId}/${modelId}` : null;
  };

  // Single pass over the bundled auto providers: every authenticated provider
  // with a resolvable model is an image candidate, and the native-PDF capable
  // ones are additionally collected (in the same order) as preferred candidates.
  const nativePdfCandidates: string[] = [];
  const genericImageCandidates: string[] = [];
  for (const providerId of resolveBundledAutoMediaKeyProviders("image")) {
    if (!hasAuth(providerId)) {
      continue;
    }
    const ref = resolveImageRef(providerId);
    if (!ref) {
      continue;
    }
    genericImageCandidates.push(ref);
    if (bundledProviderSupportsNativePdfDocument(providerId)) {
      nativePdfCandidates.push(ref);
    }
  }

  // Providers declared under `models.providers` contribute their first model
  // that lists "image" among its inputs.
  const configuredProviders = cfg?.models?.providers;
  if (configuredProviders && typeof configuredProviders === "object") {
    for (const [providerKey, providerCfg] of Object.entries(configuredProviders)) {
      const providerId = providerKey.trim();
      if (!providerId || !hasAuth(providerId)) {
        continue;
      }
      const modelId = (providerCfg?.models ?? [])
        .find(
          (model) =>
            Boolean(model?.id?.trim()) &&
            Array.isArray(model?.input) &&
            model.input.includes("image"),
        )
        ?.id?.trim();
      if (!modelId) {
        continue;
      }
      const ref = `${providerId}/${modelId}`;
      if (!genericImageCandidates.includes(ref)) {
        genericImageCandidates.push(ref);
      }
    }
  }

  const primary = resolveDefaultModelRef(cfg);
  const primaryVision = resolveProviderVisionModelFromConfig({
    cfg,
    provider: primary.provider,
  });
  const primaryDefault =
    modelIdFromRef(primaryVision) ??
    resolveBundledDefaultMediaModel({ providerId: primary.provider, capability: "image" });

  // The former google-only branch required the same three conditions and chose
  // the same ref, so this single check covers it.
  const primaryUsable =
    hasAuth(primary.provider) &&
    bundledProviderSupportsNativePdfDocument(primary.provider) &&
    Boolean(primaryVision || primaryDefault);

  // `||` (not `??`) so an empty vision ref falls through to the default,
  // matching the truthiness test in the guard above.
  const preferred: string | null = primaryUsable
    ? primaryVision || `${primary.provider}/${primaryDefault}`
    : (nativePdfCandidates[0] ?? genericImageCandidates[0] ?? null);

  if (!preferred?.trim()) {
    return null;
  }

  // Fallbacks: every other candidate, trimmed, de-duplicated, native-PDF first.
  const fallbacks: string[] = [];
  for (const candidate of [...nativePdfCandidates, ...genericImageCandidates]) {
    const trimmed = candidate.trim();
    if (
      !trimmed ||
      candidate === preferred ||
      trimmed === preferred ||
      fallbacks.includes(trimmed)
    ) {
      continue;
    }
    fallbacks.push(trimmed);
  }

  return { primary: preferred, ...(fallbacks.length > 0 ? { fallbacks } : {}) };
}
Awaited> extends (...args: any[]) => infer R ? R : never) { expect(tool).not.toBeNull(); if (!tool) { throw new Error("expected pdf tool"); @@ -71,7 +66,7 @@ async function withAnthropicPdfTool( await withTempAgentDir(async (agentDir) => { vi.stubEnv("ANTHROPIC_API_KEY", "anthropic-test"); const cfg = withDefaultModel(ANTHROPIC_PDF_MODEL); - const tool = requirePdfTool(createPdfTool({ config: cfg, agentDir })); + const tool = requirePdfTool((await loadCreatePdfTool())({ config: cfg, agentDir })); await run(tool, agentDir); }); } @@ -87,7 +82,7 @@ function makeAnthropicAnalyzeParams( }> = {}, ) { return { - apiKey: "test-key", // pragma: allowlist secret + apiKey: "test-key", modelId: "claude-opus-4-6", prompt: "test", pdfs: [TEST_PDF_INPUT], @@ -105,7 +100,7 @@ function makeGeminiAnalyzeParams( }> = {}, ) { return { - apiKey: "test-key", // pragma: allowlist secret + apiKey: "test-key", modelId: "gemini-2.5-pro", prompt: "test", pdfs: [TEST_PDF_INPUT], @@ -168,169 +163,12 @@ async function stubPdfToolInfra( wrote: false, }); - vi.spyOn(modelAuth, "getApiKeyForModel").mockResolvedValue({ apiKey: "test-key" } as never); // pragma: allowlist secret + vi.spyOn(modelAuth, "getApiKeyForModel").mockResolvedValue({ apiKey: "test-key" } as never); vi.spyOn(modelAuth, "requireApiKey").mockReturnValue("test-key"); return { loadSpy }; } -// --------------------------------------------------------------------------- -// parsePageRange tests -// --------------------------------------------------------------------------- - -describe("parsePageRange", () => { - it("parses a single page number", () => { - expect(parsePageRange("3", 20)).toEqual([3]); - }); - - it("parses a page range", () => { - expect(parsePageRange("1-5", 20)).toEqual([1, 2, 3, 4, 5]); - }); - - it("parses comma-separated pages and ranges", () => { - expect(parsePageRange("1,3,5-7", 20)).toEqual([1, 3, 5, 6, 7]); - }); - - it("clamps to maxPages", () => { - expect(parsePageRange("1-100", 5)).toEqual([1, 2, 3, 4, 
5]); - }); - - it("deduplicates and sorts", () => { - expect(parsePageRange("5,3,1,3,5", 20)).toEqual([1, 3, 5]); - }); - - it("throws on invalid page number", () => { - expect(() => parsePageRange("abc", 20)).toThrow("Invalid page number"); - }); - - it("throws on invalid range (start > end)", () => { - expect(() => parsePageRange("5-3", 20)).toThrow("Invalid page range"); - }); - - it("throws on zero page number", () => { - expect(() => parsePageRange("0", 20)).toThrow("Invalid page number"); - }); - - it("throws on negative page number", () => { - expect(() => parsePageRange("-1", 20)).toThrow("Invalid page number"); - }); - - it("handles empty parts gracefully", () => { - expect(parsePageRange("1,,3", 20)).toEqual([1, 3]); - }); -}); - -// --------------------------------------------------------------------------- -// providerSupportsNativePdf tests -// --------------------------------------------------------------------------- - -describe("providerSupportsNativePdf", () => { - it("returns true for anthropic", () => { - expect(providerSupportsNativePdf("anthropic")).toBe(true); - }); - - it("returns true for google", () => { - expect(providerSupportsNativePdf("google")).toBe(true); - }); - - it("returns false for openai", () => { - expect(providerSupportsNativePdf("openai")).toBe(false); - }); - - it("returns false for minimax", () => { - expect(providerSupportsNativePdf("minimax")).toBe(false); - }); - - it("is case-insensitive", () => { - expect(providerSupportsNativePdf("Anthropic")).toBe(true); - expect(providerSupportsNativePdf("GOOGLE")).toBe(true); - }); -}); - -// --------------------------------------------------------------------------- -// PDF model config resolution -// --------------------------------------------------------------------------- - -describe("resolvePdfModelConfigForTool", () => { - const priorFetch = global.fetch; - - beforeEach(() => { - resetAuthEnv(); - completeMock.mockReset(); - }); - - afterEach(() => { - vi.unstubAllEnvs(); - 
global.fetch = priorFetch; - }); - - it("returns null without any auth", async () => { - await withTempAgentDir(async (agentDir) => { - const cfg: OpenClawConfig = { - agents: { defaults: { model: { primary: "openai/gpt-5.4" } } }, - }; - expect(resolvePdfModelConfigForTool({ cfg, agentDir })).toBeNull(); - }); - }); - - it("prefers explicit pdfModel config", async () => { - await withTempAgentDir(async (agentDir) => { - const cfg: OpenClawConfig = { - agents: { - defaults: { - model: { primary: "openai/gpt-5.4" }, - pdfModel: { primary: "anthropic/claude-opus-4-6" }, - }, - }, - } as OpenClawConfig; - expect(resolvePdfModelConfigForTool({ cfg, agentDir })).toEqual({ - primary: "anthropic/claude-opus-4-6", - }); - }); - }); - - it("falls back to imageModel config when no pdfModel set", async () => { - await withTempAgentDir(async (agentDir) => { - const cfg: OpenClawConfig = { - agents: { - defaults: { - model: { primary: "openai/gpt-5.4" }, - imageModel: { primary: "openai/gpt-5.4-mini" }, - }, - }, - }; - expect(resolvePdfModelConfigForTool({ cfg, agentDir })).toEqual({ - primary: "openai/gpt-5.4-mini", - }); - }); - }); - - it("prefers anthropic when available for native PDF support", async () => { - await withTempAgentDir(async (agentDir) => { - vi.stubEnv("ANTHROPIC_API_KEY", "anthropic-test"); - vi.stubEnv("OPENAI_API_KEY", "openai-test"); - const cfg = withDefaultModel("openai/gpt-5.4"); - const config = resolvePdfModelConfigForTool({ cfg, agentDir }); - expect(config).not.toBeNull(); - // Should prefer anthropic for native PDF - expect(config?.primary).toBe(ANTHROPIC_PDF_MODEL); - }); - }); - - it("uses anthropic primary when provider is anthropic", async () => { - await withTempAgentDir(async (agentDir) => { - vi.stubEnv("ANTHROPIC_API_KEY", "anthropic-test"); - const cfg = withDefaultModel(ANTHROPIC_PDF_MODEL); - const config = resolvePdfModelConfigForTool({ cfg, agentDir }); - expect(config?.primary).toBe(ANTHROPIC_PDF_MODEL); - }); - }); -}); - -// 
--------------------------------------------------------------------------- -// createPdfTool -// --------------------------------------------------------------------------- - describe("createPdfTool", () => { const priorFetch = global.fetch; @@ -345,22 +183,14 @@ describe("createPdfTool", () => { global.fetch = priorFetch; }); - it("returns null without agentDir and no explicit config", () => { - expect(createPdfTool()).toBeNull(); + it("returns null without agentDir and no explicit config", async () => { + expect((await loadCreatePdfTool())()).toBeNull(); }); - it("returns null without any auth configured", async () => { - await withTempAgentDir(async (agentDir) => { - const cfg: OpenClawConfig = { - agents: { defaults: { model: { primary: "openai/gpt-5.4" } } }, - }; - expect(createPdfTool({ config: cfg, agentDir })).toBeNull(); - }); - }); - - it("throws when agentDir missing but explicit config present", () => { + it("throws when agentDir missing but explicit config present", async () => { const cfg = withPdfModel(ANTHROPIC_PDF_MODEL); - expect(() => createPdfTool({ config: cfg })).toThrow("requires agentDir"); + const createTool = await loadCreatePdfTool(); + expect(() => createTool({ config: cfg })).toThrow("requires agentDir"); }); it("creates tool when auth is available", async () => { @@ -395,7 +225,7 @@ describe("createPdfTool", () => { try { const cfg = withDefaultModel(ANTHROPIC_PDF_MODEL); const tool = requirePdfTool( - createPdfTool({ + (await loadCreatePdfTool())({ config: cfg, agentDir, workspaceDir, @@ -432,7 +262,7 @@ describe("createPdfTool", () => { await withTempAgentDir(async (agentDir) => { const { loadSpy } = await stubPdfToolInfra(agentDir, { modelFound: false }); const cfg = withPdfModel(ANTHROPIC_PDF_MODEL); - const tool = requirePdfTool(createPdfTool({ config: cfg, agentDir })); + const tool = requirePdfTool((await loadCreatePdfTool())({ config: cfg, agentDir })); await expect( tool.execute("t1", { @@ -449,13 +279,10 @@ 
describe("createPdfTool", () => { it("uses native PDF path without eager extraction", async () => { await withTempAgentDir(async (agentDir) => { await stubPdfToolInfra(agentDir, { provider: "anthropic", input: ["text", "document"] }); - vi.spyOn(pdfNativeProviders, "anthropicAnalyzePdf").mockResolvedValue("native summary"); - const extractSpy = vi.spyOn(pdfExtractModule, "extractPdfContent"); - const cfg = withPdfModel(ANTHROPIC_PDF_MODEL); - const tool = requirePdfTool(createPdfTool({ config: cfg, agentDir })); + const tool = requirePdfTool((await loadCreatePdfTool())({ config: cfg, agentDir })); const result = await tool.execute("t1", { prompt: "summarize", @@ -474,7 +301,7 @@ describe("createPdfTool", () => { await withTempAgentDir(async (agentDir) => { await stubPdfToolInfra(agentDir, { provider: "anthropic", input: ["text", "document"] }); const cfg = withPdfModel(ANTHROPIC_PDF_MODEL); - const tool = requirePdfTool(createPdfTool({ config: cfg, agentDir })); + const tool = requirePdfTool((await loadCreatePdfTool())({ config: cfg, agentDir })); await expect( tool.execute("t1", { @@ -489,12 +316,10 @@ describe("createPdfTool", () => { it("uses extraction fallback for non-native models", async () => { await withTempAgentDir(async (agentDir) => { await stubPdfToolInfra(agentDir, { provider: "openai", input: ["text"] }); - const extractSpy = vi.spyOn(pdfExtractModule, "extractPdfContent").mockResolvedValue({ text: "Extracted content", images: [], }); - completeMock.mockResolvedValue({ role: "assistant", stopReason: "stop", @@ -502,8 +327,7 @@ describe("createPdfTool", () => { } as never); const cfg = withPdfModel(OPENAI_PDF_MODEL); - - const tool = requirePdfTool(createPdfTool({ config: cfg, agentDir })); + const tool = requirePdfTool((await loadCreatePdfTool())({ config: cfg, agentDir })); const result = await tool.execute("t1", { prompt: "summarize", @@ -534,12 +358,9 @@ describe("createPdfTool", () => { }); }); -// 
--------------------------------------------------------------------------- -// Native provider detection -// --------------------------------------------------------------------------- - describe("native PDF provider API calls", () => { const priorFetch = global.fetch; + const mockFetchResponse = (response: unknown) => { const fetchMock = vi.fn().mockResolvedValue(response); global.fetch = Object.assign(fetchMock, { preconnect: vi.fn() }) as typeof global.fetch; @@ -558,13 +379,13 @@ describe("native PDF provider API calls", () => { }), }); - const result = await pdfNativeProviders.anthropicAnalyzePdf({ - ...makeAnthropicAnalyzeParams({ + const result = await pdfNativeProviders.anthropicAnalyzePdf( + makeAnthropicAnalyzeParams({ modelId: "claude-opus-4-6", prompt: "Summarize this document", maxTokens: 4096, }), - }); + ); expect(result).toBe("Analysis of PDF"); expect(fetchMock).toHaveBeenCalledTimes(1); @@ -608,20 +429,16 @@ describe("native PDF provider API calls", () => { const fetchMock = mockFetchResponse({ ok: true, json: async () => ({ - candidates: [ - { - content: { parts: [{ text: "Gemini PDF analysis" }] }, - }, - ], + candidates: [{ content: { parts: [{ text: "Gemini PDF analysis" }] } }], }), }); - const result = await pdfNativeProviders.geminiAnalyzePdf({ - ...makeGeminiAnalyzeParams({ + const result = await pdfNativeProviders.geminiAnalyzePdf( + makeGeminiAnalyzeParams({ modelId: "gemini-2.5-pro", prompt: "Summarize this", }), - }); + ); expect(result).toBe("Gemini PDF analysis"); expect(fetchMock).toHaveBeenCalledTimes(1); @@ -666,8 +483,8 @@ describe("native PDF provider API calls", () => { }), }); - await pdfNativeProviders.anthropicAnalyzePdf({ - ...makeAnthropicAnalyzeParams({ + await pdfNativeProviders.anthropicAnalyzePdf( + makeAnthropicAnalyzeParams({ modelId: "claude-opus-4-6", prompt: "Compare these documents", pdfs: [ @@ -675,10 +492,9 @@ describe("native PDF provider API calls", () => { { base64: "cGRmMg==", filename: "doc2.pdf" }, ], 
}), - }); + ); const body = JSON.parse(fetchMock.mock.calls[0][1].body); - // 2 document blocks + 1 text block expect(body.messages[0].content).toHaveLength(3); expect(body.messages[0].content[0].type).toBe("document"); expect(body.messages[0].content[1].type).toBe("document"); @@ -693,9 +509,9 @@ describe("native PDF provider API calls", () => { }), }); - await pdfNativeProviders.anthropicAnalyzePdf({ - ...makeAnthropicAnalyzeParams({ baseUrl: "https://custom.example.com" }), - }); + await pdfNativeProviders.anthropicAnalyzePdf( + makeAnthropicAnalyzeParams({ baseUrl: "https://custom.example.com" }), + ); expect(fetchMock.mock.calls[0][0]).toContain("https://custom.example.com/v1/messages"); }); @@ -751,67 +567,6 @@ describe("native PDF provider API calls", () => { }); }); -// --------------------------------------------------------------------------- -// PDF tool helpers -// --------------------------------------------------------------------------- - -describe("pdf-tool.helpers", () => { - it("resolvePdfToolMaxTokens respects model limit", () => { - expect(resolvePdfToolMaxTokens(2048, 4096)).toBe(2048); - expect(resolvePdfToolMaxTokens(8192, 4096)).toBe(4096); - expect(resolvePdfToolMaxTokens(undefined, 4096)).toBe(4096); - }); - - it("coercePdfModelConfig reads primary and fallbacks", () => { - const cfg: OpenClawConfig = { - agents: { - defaults: { - pdfModel: { - primary: "anthropic/claude-opus-4-6", - fallbacks: ["google/gemini-2.5-pro"], - }, - }, - }, - }; - expect(coercePdfModelConfig(cfg)).toEqual({ - primary: "anthropic/claude-opus-4-6", - fallbacks: ["google/gemini-2.5-pro"], - }); - }); - - it("coercePdfAssistantText returns trimmed text", () => { - const text = coercePdfAssistantText({ - provider: "anthropic", - model: "claude-opus-4-6", - message: { - role: "assistant", - stopReason: "stop", - content: [{ type: "text", text: " summary " }], - } as never, - }); - expect(text).toBe("summary"); - }); - - it("coercePdfAssistantText throws clear error 
for failed model output", () => { - expect(() => - coercePdfAssistantText({ - provider: "google", - model: "gemini-2.5-pro", - message: { - role: "assistant", - stopReason: "error", - errorMessage: "bad request", - content: [], - } as never, - }), - ).toThrow("PDF model failed (google/gemini-2.5-pro): bad request"); - }); -}); - -// --------------------------------------------------------------------------- -// Model catalog document support -// --------------------------------------------------------------------------- - describe("model catalog document support", () => { it("modelSupportsDocument returns true when input includes document", () => { expect( diff --git a/src/agents/tools/pdf-tool.ts b/src/agents/tools/pdf-tool.ts index 56d15d602f9..f42ce6dbb8c 100644 --- a/src/agents/tools/pdf-tool.ts +++ b/src/agents/tools/pdf-tool.ts @@ -1,19 +1,11 @@ import { type Context, complete } from "@mariozechner/pi-ai"; import { Type } from "@sinclair/typebox"; import type { OpenClawConfig } from "../../config/config.js"; -import { - providerSupportsNativePdfDocument, - resolveAutoMediaKeyProviders, - resolveDefaultMediaModel, -} from "../../media-understanding/defaults.js"; import { extractPdfContent, type PdfExtractedContent } from "../../media/pdf-extract.js"; import { loadWebMediaRaw } from "../../media/web-media.js"; import { resolveUserPath } from "../../utils.js"; -import { - coerceImageModelConfig, - type ImageModelConfig, - resolveProviderVisionModelFromConfig, -} from "./image-tool.helpers.js"; +import { type ImageModelConfig } from "./image-tool.helpers.js"; +import { resolvePdfModelConfigForTool } from "./pdf-tool.model-config.js"; import { applyImageModelConfigDefaults, buildTextToolResult, @@ -22,7 +14,6 @@ import { resolveModelRuntimeApiKey, resolvePromptAndModelOverride, } from "./media-tool-shared.js"; -import { hasAuthForProvider, resolveDefaultModelRef } from "./model-config.helpers.js"; import { anthropicAnalyzePdf, geminiAnalyzePdf } from 
"./pdf-native-providers.js"; import { coercePdfAssistantText, @@ -56,105 +47,7 @@ const PDF_MAX_PIXELS = 4_000_000; // Model resolution (mirrors image tool pattern) // --------------------------------------------------------------------------- -/** - * Resolve the effective PDF model config. - * Falls back to the image model config, then to provider-specific defaults. - */ -export function resolvePdfModelConfigForTool(params: { - cfg?: OpenClawConfig; - agentDir: string; -}): ImageModelConfig | null { - // Check for explicit PDF model config first - const explicitPdf = coercePdfModelConfig(params.cfg); - if (explicitPdf.primary?.trim() || (explicitPdf.fallbacks?.length ?? 0) > 0) { - return explicitPdf; - } - - // Fall back to the image model config - const explicitImage = coerceImageModelConfig(params.cfg); - if (explicitImage.primary?.trim() || (explicitImage.fallbacks?.length ?? 0) > 0) { - return explicitImage; - } - - // Auto-detect from available providers - const primary = resolveDefaultModelRef(params.cfg); - const googleOk = hasAuthForProvider({ provider: "google", agentDir: params.agentDir }); - - const fallbacks: string[] = []; - const addFallback = (ref: string) => { - const trimmed = ref.trim(); - if (trimmed && !fallbacks.includes(trimmed)) { - fallbacks.push(trimmed); - } - }; - - // Prefer providers with native PDF support - let preferred: string | null = null; - - const providerOk = hasAuthForProvider({ provider: primary.provider, agentDir: params.agentDir }); - const providerVision = resolveProviderVisionModelFromConfig({ - cfg: params.cfg, - provider: primary.provider, - }); - const providerDefault = resolveDefaultMediaModel({ - cfg: params.cfg, - providerId: primary.provider, - capability: "image", - }); - const primarySupportsNativePdf = providerSupportsNativePdfDocument({ - cfg: params.cfg, - providerId: primary.provider, - }); - const nativePdfCandidates = resolveAutoMediaKeyProviders({ - cfg: params.cfg, - capability: "image", - }) - 
.filter((providerId) => providerSupportsNativePdfDocument({ cfg: params.cfg, providerId })) - .filter((providerId) => hasAuthForProvider({ provider: providerId, agentDir: params.agentDir })) - .map((providerId) => { - const modelId = resolveDefaultMediaModel({ - cfg: params.cfg, - providerId, - capability: "image", - }); - return modelId ? `${providerId}/${modelId}` : null; - }) - .filter((value): value is string => Boolean(value)); - const genericImageCandidates = resolveAutoMediaKeyProviders({ - cfg: params.cfg, - capability: "image", - }) - .filter((providerId) => hasAuthForProvider({ provider: providerId, agentDir: params.agentDir })) - .map((providerId) => { - const modelId = resolveDefaultMediaModel({ - cfg: params.cfg, - providerId, - capability: "image", - }); - return modelId ? `${providerId}/${modelId}` : null; - }) - .filter((value): value is string => Boolean(value)); - - if (primary.provider === "google" && googleOk && providerVision && primarySupportsNativePdf) { - preferred = providerVision; - } else if (providerOk && primarySupportsNativePdf && (providerVision || providerDefault)) { - preferred = providerVision ?? `${primary.provider}/${providerDefault}`; - } else { - preferred = nativePdfCandidates[0] ?? genericImageCandidates[0] ?? null; - } - - if (preferred?.trim()) { - for (const candidate of [...nativePdfCandidates, ...genericImageCandidates]) { - if (candidate !== preferred) { - addFallback(candidate); - } - } - const pruned = fallbacks.filter((ref) => ref !== preferred); - return { primary: preferred, ...(pruned.length > 0 ? 
{ fallbacks: pruned } : {}) }; - } - - return null; -} +export { resolvePdfModelConfigForTool } from "./pdf-tool.model-config.js"; // --------------------------------------------------------------------------- // Build context for extraction fallback path diff --git a/src/media-understanding/bundled-defaults.ts b/src/media-understanding/bundled-defaults.ts new file mode 100644 index 00000000000..0dc5f7e9bbe --- /dev/null +++ b/src/media-understanding/bundled-defaults.ts @@ -0,0 +1,98 @@ +import type { MediaUnderstandingCapability } from "./types.js"; +import { normalizeMediaProviderId } from "./provider-id.js"; + +type BundledMediaProviderDefaults = { + defaultModels?: Partial>; + autoPriority?: Partial>; + nativeDocumentInputs?: Array<"pdf">; +}; + +const BUNDLED_MEDIA_PROVIDER_DEFAULTS: Record = { + openai: { + defaultModels: { image: "gpt-5.4-mini", audio: "gpt-4o-transcribe" }, + autoPriority: { image: 10, audio: 10 }, + }, + "openai-codex": { + defaultModels: { image: "gpt-5.4" }, + }, + anthropic: { + defaultModels: { image: "claude-opus-4-6" }, + autoPriority: { image: 20 }, + nativeDocumentInputs: ["pdf"], + }, + google: { + defaultModels: { + image: "gemini-3-flash-preview", + audio: "gemini-3-flash-preview", + video: "gemini-3-flash-preview", + }, + autoPriority: { image: 30, audio: 40, video: 10 }, + nativeDocumentInputs: ["pdf"], + }, + groq: { + defaultModels: { audio: "whisper-large-v3-turbo" }, + autoPriority: { audio: 20 }, + }, + deepgram: { + defaultModels: { audio: "nova-3" }, + autoPriority: { audio: 30 }, + }, + mistral: { + defaultModels: { audio: "voxtral-mini-latest" }, + autoPriority: { audio: 50 }, + }, + minimax: { + defaultModels: { image: "MiniMax-VL-01" }, + autoPriority: { image: 40 }, + }, + "minimax-portal": { + defaultModels: { image: "MiniMax-VL-01" }, + autoPriority: { image: 50 }, + }, + zai: { + defaultModels: { image: "glm-4.6v" }, + autoPriority: { image: 60 }, + }, + qwen: { + defaultModels: { image: "qwen-vl-max-latest", 
video: "qwen-vl-max-latest" }, + autoPriority: { video: 15 }, + }, + moonshot: { + defaultModels: { image: "kimi-k2.5", video: "kimi-k2.5" }, + autoPriority: { video: 20 }, + }, + openrouter: { + defaultModels: { image: "auto" }, + }, +}; + +export function getBundledMediaProviderDefaults(providerId: string): BundledMediaProviderDefaults | null { + return BUNDLED_MEDIA_PROVIDER_DEFAULTS[normalizeMediaProviderId(providerId)] ?? null; +} + +export function resolveBundledDefaultMediaModel(params: { + providerId: string; + capability: MediaUnderstandingCapability; +}): string | undefined { + return getBundledMediaProviderDefaults(params.providerId)?.defaultModels?.[params.capability]?.trim(); +} + +export function resolveBundledAutoMediaKeyProviders(capability: MediaUnderstandingCapability): string[] { + return Object.entries(BUNDLED_MEDIA_PROVIDER_DEFAULTS) + .map(([providerId, defaults]) => ({ + providerId, + priority: defaults.autoPriority?.[capability], + })) + .filter((entry): entry is { providerId: string; priority: number } => typeof entry.priority === "number") + .toSorted((left, right) => { + if (left.priority !== right.priority) { + return left.priority - right.priority; + } + return left.providerId.localeCompare(right.providerId); + }) + .map((entry) => entry.providerId); +} + +export function bundledProviderSupportsNativePdfDocument(providerId: string): boolean { + return getBundledMediaProviderDefaults(providerId)?.nativeDocumentInputs?.includes("pdf") ?? 
false; +} diff --git a/src/media-understanding/defaults.ts b/src/media-understanding/defaults.ts index 723d545a44d..fac094ffee3 100644 --- a/src/media-understanding/defaults.ts +++ b/src/media-understanding/defaults.ts @@ -1,4 +1,9 @@ import type { OpenClawConfig } from "../config/config.js"; +import { + bundledProviderSupportsNativePdfDocument, + resolveBundledAutoMediaKeyProviders, + resolveBundledDefaultMediaModel, +} from "./bundled-defaults.js"; import { buildMediaUnderstandingRegistry, normalizeMediaProviderId } from "./provider-registry.js"; import type { MediaUnderstandingCapability, MediaUnderstandingProvider } from "./types.js"; @@ -52,12 +57,53 @@ function resolveDefaultRegistry(cfg?: OpenClawConfig) { return buildMediaUnderstandingRegistry(undefined, cfg ?? ({} as OpenClawConfig)); } +function resolveConfiguredImageProviderModel(params: { + cfg?: OpenClawConfig; + providerId: string; +}): string | undefined { + const providers = params.cfg?.models?.providers; + if (!providers || typeof providers !== "object") { + return undefined; + } + const normalizedProviderId = normalizeMediaProviderId(params.providerId); + for (const [providerKey, providerCfg] of Object.entries(providers)) { + if (normalizeMediaProviderId(providerKey) !== normalizedProviderId) { + continue; + } + const models = providerCfg?.models ?? []; + const match = models.find( + (model) => Boolean(model?.id?.trim()) && Array.isArray(model?.input) && model.input.includes("image"), + ); + return match?.id?.trim() || undefined; + } + return undefined; +} + export function resolveDefaultMediaModel(params: { providerId: string; capability: MediaUnderstandingCapability; cfg?: OpenClawConfig; providerRegistry?: Map; }): string | undefined { + if (!params.providerRegistry) { + const configuredImageModel = + params.capability === "image" + ? 
resolveConfiguredImageProviderModel({ + cfg: params.cfg, + providerId: params.providerId, + }) + : undefined; + if (configuredImageModel) { + return configuredImageModel; + } + const bundledDefault = resolveBundledDefaultMediaModel({ + providerId: params.providerId, + capability: params.capability, + }); + if (bundledDefault) { + return bundledDefault; + } + } const registry = params.providerRegistry ?? resolveDefaultRegistry(params.cfg); const provider = registry.get(normalizeMediaProviderId(params.providerId)); return provider?.defaultModels?.[params.capability]?.trim() || undefined; @@ -68,6 +114,28 @@ export function resolveAutoMediaKeyProviders(params: { cfg?: OpenClawConfig; providerRegistry?: Map; }): string[] { + if (!params.providerRegistry) { + const bundledProviders = resolveBundledAutoMediaKeyProviders(params.capability); + if (params.capability !== "image") { + return bundledProviders; + } + const configProviders = params.cfg?.models?.providers; + if (!configProviders || typeof configProviders !== "object") { + return bundledProviders; + } + const merged = [...bundledProviders]; + for (const [providerKey, providerCfg] of Object.entries(configProviders)) { + const normalizedProviderId = normalizeMediaProviderId(providerKey); + const models = providerCfg?.models ?? []; + const hasImageModel = models.some( + (model) => Array.isArray(model?.input) && model.input.includes("image"), + ); + if (hasImageModel && !merged.includes(normalizedProviderId)) { + merged.push(normalizedProviderId); + } + } + return merged; + } const registry = params.providerRegistry ?? resolveDefaultRegistry(params.cfg); type AutoProviderEntry = { provider: MediaUnderstandingProvider; @@ -97,6 +165,9 @@ export function providerSupportsNativePdfDocument(params: { cfg?: OpenClawConfig; providerRegistry?: Map; }): boolean { + if (!params.providerRegistry && bundledProviderSupportsNativePdfDocument(params.providerId)) { + return true; + } const registry = params.providerRegistry ?? 
resolveDefaultRegistry(params.cfg); const provider = registry.get(normalizeMediaProviderId(params.providerId)); return provider?.nativeDocumentInputs?.includes("pdf") ?? false; diff --git a/src/media-understanding/provider-id.ts b/src/media-understanding/provider-id.ts index 89667e7641a..777fbeab7ba 100644 --- a/src/media-understanding/provider-id.ts +++ b/src/media-understanding/provider-id.ts @@ -1,4 +1,4 @@ -import { normalizeProviderId } from "../agents/model-selection.js"; +import { normalizeProviderId } from "../agents/provider-id.js"; export function normalizeMediaProviderId(id: string): string { const normalized = normalizeProviderId(id);