perf(pdf): remove media/runtime lookup overhead

This commit is contained in:
Peter Steinberger
2026-04-07 05:58:11 +01:00
parent a1e0090fe4
commit 998cc02af4
11 changed files with 578 additions and 395 deletions

View File

@@ -27,6 +27,7 @@ export {
export {
clearRuntimeAuthProfileStoreSnapshots,
ensureAuthProfileStore,
hasAnyAuthProfileStoreSource,
loadAuthProfileStoreForSecretsRuntime,
loadAuthProfileStoreForRuntime,
replaceRuntimeAuthProfileStoreSnapshots,

View File

@@ -4,7 +4,11 @@ import {
resolveAgentModelPrimaryValue,
} from "../../config/model-input.js";
import type { AgentModelConfig } from "../../config/types.agents-shared.js";
import { ensureAuthProfileStore, listProfilesForProvider } from "../auth-profiles.js";
import {
ensureAuthProfileStore,
hasAnyAuthProfileStoreSource,
listProfilesForProvider,
} from "../auth-profiles.js";
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "../defaults.js";
import { resolveEnvApiKey } from "../model-auth.js";
import { resolveConfiguredModelRef } from "../model-selection.js";
@@ -37,6 +41,9 @@ export function hasAuthForProvider(params: { provider: string; agentDir?: string
if (!agentDir) {
return false;
}
if (!hasAnyAuthProfileStoreSource(agentDir)) {
return false;
}
const store = ensureAuthProfileStore(agentDir, {
allowKeychainPrompt: false,
});

View File

@@ -0,0 +1,130 @@
import type { OpenClawConfig } from "../../config/config.js";
import { describe, expect, it } from "vitest";
import {
coercePdfAssistantText,
coercePdfModelConfig,
parsePageRange,
providerSupportsNativePdf,
resolvePdfToolMaxTokens,
} from "./pdf-tool.helpers.js";
const ANTHROPIC_PDF_MODEL = "anthropic/claude-opus-4-6";
describe("parsePageRange", () => {
  // One row per scenario, registered in the order listed. Rows carrying `pages`
  // must parse to exactly that list; rows carrying `error` must throw an error
  // whose message contains that text.
  type Scenario = { title: string; spec: string; maxPages: number } & (
    | { pages: number[] }
    | { error: string }
  );

  const scenarios: Scenario[] = [
    { title: "parses a single page number", spec: "3", maxPages: 20, pages: [3] },
    { title: "parses a page range", spec: "1-5", maxPages: 20, pages: [1, 2, 3, 4, 5] },
    {
      title: "parses comma-separated pages and ranges",
      spec: "1,3,5-7",
      maxPages: 20,
      pages: [1, 3, 5, 6, 7],
    },
    { title: "clamps to maxPages", spec: "1-100", maxPages: 5, pages: [1, 2, 3, 4, 5] },
    { title: "deduplicates and sorts", spec: "5,3,1,3,5", maxPages: 20, pages: [1, 3, 5] },
    { title: "throws on invalid page number", spec: "abc", maxPages: 20, error: "Invalid page number" },
    {
      title: "throws on invalid range (start > end)",
      spec: "5-3",
      maxPages: 20,
      error: "Invalid page range",
    },
    { title: "throws on zero page number", spec: "0", maxPages: 20, error: "Invalid page number" },
    { title: "throws on negative page number", spec: "-1", maxPages: 20, error: "Invalid page number" },
    { title: "handles empty parts gracefully", spec: "1,,3", maxPages: 20, pages: [1, 3] },
  ];

  for (const scenario of scenarios) {
    it(scenario.title, () => {
      const parse = () => parsePageRange(scenario.spec, scenario.maxPages);
      if ("error" in scenario) {
        expect(parse).toThrow(scenario.error);
      } else {
        expect(parse()).toEqual(scenario.pages);
      }
    });
  }
});
describe("providerSupportsNativePdf", () => {
  // Lower-case provider ids paired with the expected native-PDF verdict.
  const verdicts = [
    ["anthropic", true],
    ["google", true],
    ["openai", false],
    ["minimax", false],
  ] as const;

  for (const [provider, supported] of verdicts) {
    it(`returns ${supported} for ${provider}`, () => {
      expect(providerSupportsNativePdf(provider)).toBe(supported);
    });
  }

  it("is case-insensitive", () => {
    // Mixed-case and upper-case spellings must resolve like their lower-case ids.
    for (const provider of ["Anthropic", "GOOGLE"]) {
      expect(providerSupportsNativePdf(provider)).toBe(true);
    }
  });
});
describe("pdf-tool.helpers", () => {
  it("resolvePdfToolMaxTokens respects model limit", () => {
    const modelLimit = 4096;
    // A request below the limit is kept; a larger or missing request resolves
    // to the limit itself.
    const expectations = [
      [2048, 2048],
      [8192, 4096],
      [undefined, 4096],
    ] as const;
    for (const [requested, resolved] of expectations) {
      expect(resolvePdfToolMaxTokens(requested, modelLimit)).toBe(resolved);
    }
  });

  it("coercePdfModelConfig reads primary and fallbacks", () => {
    const pdfModel = {
      primary: ANTHROPIC_PDF_MODEL,
      fallbacks: ["google/gemini-2.5-pro"],
    };
    const cfg = { agents: { defaults: { pdfModel } } } as OpenClawConfig;

    const coerced = coercePdfModelConfig(cfg);

    expect(coerced).toEqual({
      primary: ANTHROPIC_PDF_MODEL,
      fallbacks: ["google/gemini-2.5-pro"],
    });
  });

  it("coercePdfAssistantText returns trimmed text", () => {
    // Successful assistant reply whose only text block is padded with spaces.
    const paddedReply = {
      role: "assistant",
      stopReason: "stop",
      content: [{ type: "text", text: " summary " }],
    } as never;

    const text = coercePdfAssistantText({
      provider: "anthropic",
      model: "claude-opus-4-6",
      message: paddedReply,
    });

    expect(text).toBe("summary");
  });

  it("coercePdfAssistantText throws clear error for failed model output", () => {
    // Failed assistant reply: error stop reason, an error message, no content.
    const failedReply = {
      role: "assistant",
      stopReason: "error",
      errorMessage: "bad request",
      content: [],
    } as never;

    const coerce = () =>
      coercePdfAssistantText({
        provider: "google",
        model: "gemini-2.5-pro",
        message: failedReply,
      });

    expect(coerce).toThrow("PDF model failed (google/gemini-2.5-pro): bad request");
  });
});

View File

@@ -4,7 +4,7 @@ import {
resolveAgentModelFallbackValues,
resolveAgentModelPrimaryValue,
} from "../../config/model-input.js";
import { providerSupportsNativePdfDocument } from "../../media-understanding/defaults.js";
import { bundledProviderSupportsNativePdfDocument } from "../../media-understanding/bundled-defaults.js";
import { extractAssistantText } from "../pi-embedded-utils.js";
export type PdfModelConfig = { primary?: string; fallbacks?: string[] };
@@ -13,7 +13,7 @@ export type PdfModelConfig = { primary?: string; fallbacks?: string[] };
* Check whether a provider supports native PDF document input.
*/
export function providerSupportsNativePdf(provider: string): boolean {
return providerSupportsNativePdfDocument({ providerId: provider });
return bundledProviderSupportsNativePdfDocument(provider);
}
/**

View File

@@ -0,0 +1,103 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import type { OpenClawConfig } from "../../config/config.js";
import { resolvePdfModelConfigForTool } from "./pdf-tool.model-config.js";
const ANTHROPIC_PDF_MODEL = "anthropic/claude-opus-4-6";
/**
 * Run `run` against a freshly created temporary directory and return its result.
 * The directory is created under the OS temp dir with the `openclaw-pdf-` prefix
 * and is removed recursively afterwards, whether `run` resolves or rejects.
 */
async function withTempAgentDir<T>(run: (agentDir: string) => Promise<T>): Promise<T> {
  const prefix = path.join(os.tmpdir(), "openclaw-pdf-");
  const scratchDir = await fs.mkdtemp(prefix);
  let outcome: T;
  try {
    outcome = await run(scratchDir);
  } finally {
    // `force` keeps cleanup from failing if the callback already removed the directory.
    await fs.rm(scratchDir, { recursive: true, force: true });
  }
  return outcome;
}
/**
 * Stub every provider credential environment variable used by these tests to
 * the empty string, so ambient credentials cannot influence a test.
 */
function resetAuthEnv() {
  const credentialEnvNames = [
    "OPENAI_API_KEY",
    "ANTHROPIC_API_KEY",
    "ANTHROPIC_OAUTH_TOKEN",
    "GEMINI_API_KEY",
    "GOOGLE_API_KEY",
    "MINIMAX_API_KEY",
    "ZAI_API_KEY",
    "Z_AI_API_KEY",
    "COPILOT_GITHUB_TOKEN",
    "GH_TOKEN",
    "GITHUB_TOKEN",
  ];
  for (const envName of credentialEnvNames) {
    vi.stubEnv(envName, "");
  }
}
/** Build a minimal config whose only setting is the default agent model `primary`. */
function withDefaultModel(primary: string): OpenClawConfig {
  const model = { primary };
  const defaults = { model };
  return { agents: { defaults } } as OpenClawConfig;
}
describe("resolvePdfModelConfigForTool", () => {
  /**
   * Resolve the PDF model config for `cfg` inside a throwaway agent directory,
   * stubbing the given environment variables first.
   */
  const resolveIn = (cfg: OpenClawConfig, env: Record<string, string> = {}) =>
    withTempAgentDir(async (agentDir) => {
      for (const [name, value] of Object.entries(env)) {
        vi.stubEnv(name, value);
      }
      return resolvePdfModelConfigForTool({ cfg, agentDir });
    });

  beforeEach(() => {
    resetAuthEnv();
  });

  afterEach(() => {
    vi.unstubAllEnvs();
  });

  it("returns null without any auth", async () => {
    const resolved = await resolveIn(withDefaultModel("openai/gpt-5.4"));
    expect(resolved).toBeNull();
  });

  it("prefers explicit pdfModel config", async () => {
    const defaults = {
      model: { primary: "openai/gpt-5.4" },
      pdfModel: { primary: ANTHROPIC_PDF_MODEL },
    };
    const resolved = await resolveIn({ agents: { defaults } } as OpenClawConfig);
    expect(resolved).toEqual({ primary: ANTHROPIC_PDF_MODEL });
  });

  it("falls back to imageModel config when no pdfModel set", async () => {
    const defaults = {
      model: { primary: "openai/gpt-5.4" },
      imageModel: { primary: "openai/gpt-5.4-mini" },
    };
    const resolved = await resolveIn({ agents: { defaults } } as OpenClawConfig);
    expect(resolved).toEqual({ primary: "openai/gpt-5.4-mini" });
  });

  it("prefers anthropic when available for native PDF support", async () => {
    const resolved = await resolveIn(withDefaultModel("openai/gpt-5.4"), {
      ANTHROPIC_API_KEY: "anthropic-test",
      OPENAI_API_KEY: "openai-test",
    });
    expect(resolved?.primary).toBe(ANTHROPIC_PDF_MODEL);
  });

  it("uses anthropic primary when provider is anthropic", async () => {
    const resolved = await resolveIn(withDefaultModel(ANTHROPIC_PDF_MODEL), {
      ANTHROPIC_API_KEY: "anthropic-test",
    });
    expect(resolved?.primary).toBe(ANTHROPIC_PDF_MODEL);
  });
});

View File

@@ -0,0 +1,125 @@
import type { OpenClawConfig } from "../../config/config.js";
import {
bundledProviderSupportsNativePdfDocument,
resolveBundledAutoMediaKeyProviders,
resolveBundledDefaultMediaModel,
} from "../../media-understanding/bundled-defaults.js";
import {
coerceImageModelConfig,
type ImageModelConfig,
resolveProviderVisionModelFromConfig,
} from "./image-tool.helpers.js";
import { hasAuthForProvider, resolveDefaultModelRef } from "./model-config.helpers.js";
import { coercePdfModelConfig } from "./pdf-tool.helpers.js";
/**
 * Return the model-id portion of a `provider/model` reference.
 *
 * Everything after the FIRST slash is kept, so model ids that themselves
 * contain slashes are not truncated. Yields `undefined` when there is no ref
 * or the ref contains no slash.
 */
function modelIdFromRef(ref: string | null | undefined): string | undefined {
  if (!ref) {
    return undefined;
  }
  const separatorIndex = ref.indexOf("/");
  return separatorIndex === -1 ? undefined : ref.slice(separatorIndex + 1);
}

/**
 * Build the `provider/model` image-capable ref for a provider: the vision model
 * from config when one resolves, otherwise the bundled default image model.
 * Returns `null` when neither source yields a model id.
 */
function resolveImageModelRef(cfg: OpenClawConfig | undefined, providerId: string): string | null {
  const modelId =
    modelIdFromRef(resolveProviderVisionModelFromConfig({ cfg, provider: providerId })) ??
    resolveBundledDefaultMediaModel({ providerId, capability: "image" });
  return modelId ? `${providerId}/${modelId}` : null;
}

/**
 * Resolve the effective model config for the PDF tool.
 *
 * Resolution order:
 * 1. An explicit PDF model config (primary or fallbacks) is returned as-is.
 * 2. Otherwise an explicit image model config is returned as-is.
 * 3. Otherwise a config is auto-detected from providers that have auth:
 *    the default model's provider wins when it has auth and supports native
 *    PDF input; if not, the first authenticated native-PDF candidate, then the
 *    first authenticated image-capable candidate. All remaining candidates
 *    become fallbacks (native-PDF candidates first).
 *
 * @param params.cfg - Optional application config.
 * @param params.agentDir - Agent directory used for the auth lookups.
 * @returns The resolved config, or `null` when nothing usable is available.
 */
export function resolvePdfModelConfigForTool(params: {
  cfg?: OpenClawConfig;
  agentDir: string;
}): ImageModelConfig | null {
  const { cfg, agentDir } = params;

  const explicitPdf = coercePdfModelConfig(cfg);
  if (explicitPdf.primary?.trim() || (explicitPdf.fallbacks?.length ?? 0) > 0) {
    return explicitPdf;
  }
  const explicitImage = coerceImageModelConfig(cfg);
  if (explicitImage.primary?.trim() || (explicitImage.fallbacks?.length ?? 0) > 0) {
    return explicitImage;
  }

  // The same provider is consulted from several places below; remember each
  // answer so the auth lookup runs at most once per provider per call.
  const authByProvider = new Map<string, boolean>();
  const hasAuth = (provider: string): boolean => {
    const known = authByProvider.get(provider);
    if (known !== undefined) {
      return known;
    }
    const ok = hasAuthForProvider({ provider, agentDir });
    authByProvider.set(provider, ok);
    return ok;
  };

  const primary = resolveDefaultModelRef(cfg);
  const providerOk = hasAuth(primary.provider);
  const providerVision = resolveProviderVisionModelFromConfig({
    cfg,
    provider: primary.provider,
  });
  const providerDefault =
    modelIdFromRef(providerVision) ??
    resolveBundledDefaultMediaModel({
      providerId: primary.provider,
      capability: "image",
    });
  const primarySupportsNativePdf = bundledProviderSupportsNativePdfDocument(primary.provider);

  // Resolve each authenticated auto provider's ref once, then derive both
  // candidate lists from that single pass (original provider order is kept).
  const authedCandidates = resolveBundledAutoMediaKeyProviders("image")
    .filter((providerId) => hasAuth(providerId))
    .flatMap((providerId) => {
      const ref = resolveImageModelRef(cfg, providerId);
      return ref ? [{ providerId, ref }] : [];
    });
  const nativePdfCandidates = authedCandidates
    .filter((candidate) => bundledProviderSupportsNativePdfDocument(candidate.providerId))
    .map((candidate) => candidate.ref);
  const genericImageCandidates = authedCandidates.map((candidate) => candidate.ref);

  // Providers declared in config contribute their first image-capable model.
  if (cfg?.models?.providers && typeof cfg.models.providers === "object") {
    for (const [providerKey, providerCfg] of Object.entries(cfg.models.providers)) {
      const providerId = providerKey.trim();
      if (!providerId || !hasAuth(providerId)) {
        continue;
      }
      const models = providerCfg?.models ?? [];
      const modelId = models.find(
        (model) => Boolean(model?.id?.trim()) && Array.isArray(model?.input) && model.input.includes("image"),
      )?.id?.trim();
      if (!modelId) {
        continue;
      }
      const ref = `${providerId}/${modelId}`;
      if (!genericImageCandidates.includes(ref)) {
        genericImageCandidates.push(ref);
      }
    }
  }

  // No provider-specific branch is needed here: an authenticated native-PDF
  // primary with a configured vision model already selects that vision model.
  let preferred: string | null;
  if (providerOk && primarySupportsNativePdf && (providerVision || providerDefault)) {
    preferred = providerVision ?? `${primary.provider}/${providerDefault}`;
  } else {
    preferred = nativePdfCandidates[0] ?? genericImageCandidates[0] ?? null;
  }
  if (!preferred?.trim()) {
    return null;
  }

  // Fallbacks: trimmed, de-duplicated, in candidate order, never the primary.
  const fallbacks: string[] = [];
  for (const candidate of [...nativePdfCandidates, ...genericImageCandidates]) {
    if (candidate === preferred) {
      continue;
    }
    const ref = candidate.trim();
    if (ref && ref !== preferred && !fallbacks.includes(ref)) {
      fallbacks.push(ref);
    }
  }
  return { primary: preferred, ...(fallbacks.length > 0 ? { fallbacks } : {}) };
}

View File

@@ -1,7 +1,7 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import type { OpenClawConfig } from "../../config/config.js";
import * as pdfExtractModule from "../../media/pdf-extract.js";
import * as webMedia from "../../media/web-media.js";
@@ -10,13 +10,6 @@ import { modelSupportsDocument } from "../model-catalog.js";
import * as modelsConfig from "../models-config.js";
import * as modelDiscovery from "../pi-model-discovery.js";
import * as pdfNativeProviders from "./pdf-native-providers.js";
import {
coercePdfAssistantText,
coercePdfModelConfig,
parsePageRange,
providerSupportsNativePdf,
resolvePdfToolMaxTokens,
} from "./pdf-tool.helpers.js";
const completeMock = vi.hoisted(() => vi.fn());
@@ -30,11 +23,13 @@ vi.mock("@mariozechner/pi-ai", async () => {
type PdfToolModule = typeof import("./pdf-tool.js");
let createPdfTool: PdfToolModule["createPdfTool"];
let resolvePdfModelConfigForTool: PdfToolModule["resolvePdfModelConfigForTool"];
beforeAll(async () => {
({ createPdfTool, resolvePdfModelConfigForTool } = await import("./pdf-tool.js"));
});
/**
 * Return `createPdfTool`, importing `./pdf-tool.js` on first use and reusing
 * the stored reference on later calls.
 */
async function loadCreatePdfTool() {
  if (createPdfTool) {
    return createPdfTool;
  }
  const pdfToolModule = await import("./pdf-tool.js");
  createPdfTool = pdfToolModule.createPdfTool;
  return createPdfTool;
}
async function withTempAgentDir<T>(run: (agentDir: string) => Promise<T>): Promise<T> {
const agentDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-pdf-"));
@@ -55,7 +50,7 @@ const FAKE_PDF_MEDIA = {
fileName: "doc.pdf",
} as const;
function requirePdfTool(tool: ReturnType<typeof createPdfTool>) {
function requirePdfTool(tool: Awaited<ReturnType<typeof loadCreatePdfTool>> extends (...args: any[]) => infer R ? R : never) {
expect(tool).not.toBeNull();
if (!tool) {
throw new Error("expected pdf tool");
@@ -71,7 +66,7 @@ async function withAnthropicPdfTool(
await withTempAgentDir(async (agentDir) => {
vi.stubEnv("ANTHROPIC_API_KEY", "anthropic-test");
const cfg = withDefaultModel(ANTHROPIC_PDF_MODEL);
const tool = requirePdfTool(createPdfTool({ config: cfg, agentDir }));
const tool = requirePdfTool((await loadCreatePdfTool())({ config: cfg, agentDir }));
await run(tool, agentDir);
});
}
@@ -87,7 +82,7 @@ function makeAnthropicAnalyzeParams(
}> = {},
) {
return {
apiKey: "test-key", // pragma: allowlist secret
apiKey: "test-key",
modelId: "claude-opus-4-6",
prompt: "test",
pdfs: [TEST_PDF_INPUT],
@@ -105,7 +100,7 @@ function makeGeminiAnalyzeParams(
}> = {},
) {
return {
apiKey: "test-key", // pragma: allowlist secret
apiKey: "test-key",
modelId: "gemini-2.5-pro",
prompt: "test",
pdfs: [TEST_PDF_INPUT],
@@ -168,169 +163,12 @@ async function stubPdfToolInfra(
wrote: false,
});
vi.spyOn(modelAuth, "getApiKeyForModel").mockResolvedValue({ apiKey: "test-key" } as never); // pragma: allowlist secret
vi.spyOn(modelAuth, "getApiKeyForModel").mockResolvedValue({ apiKey: "test-key" } as never);
vi.spyOn(modelAuth, "requireApiKey").mockReturnValue("test-key");
return { loadSpy };
}
// ---------------------------------------------------------------------------
// parsePageRange tests
// ---------------------------------------------------------------------------
describe("parsePageRange", () => {
it("parses a single page number", () => {
expect(parsePageRange("3", 20)).toEqual([3]);
});
it("parses a page range", () => {
expect(parsePageRange("1-5", 20)).toEqual([1, 2, 3, 4, 5]);
});
it("parses comma-separated pages and ranges", () => {
expect(parsePageRange("1,3,5-7", 20)).toEqual([1, 3, 5, 6, 7]);
});
it("clamps to maxPages", () => {
expect(parsePageRange("1-100", 5)).toEqual([1, 2, 3, 4, 5]);
});
it("deduplicates and sorts", () => {
expect(parsePageRange("5,3,1,3,5", 20)).toEqual([1, 3, 5]);
});
it("throws on invalid page number", () => {
expect(() => parsePageRange("abc", 20)).toThrow("Invalid page number");
});
it("throws on invalid range (start > end)", () => {
expect(() => parsePageRange("5-3", 20)).toThrow("Invalid page range");
});
it("throws on zero page number", () => {
expect(() => parsePageRange("0", 20)).toThrow("Invalid page number");
});
it("throws on negative page number", () => {
expect(() => parsePageRange("-1", 20)).toThrow("Invalid page number");
});
it("handles empty parts gracefully", () => {
expect(parsePageRange("1,,3", 20)).toEqual([1, 3]);
});
});
// ---------------------------------------------------------------------------
// providerSupportsNativePdf tests
// ---------------------------------------------------------------------------
describe("providerSupportsNativePdf", () => {
it("returns true for anthropic", () => {
expect(providerSupportsNativePdf("anthropic")).toBe(true);
});
it("returns true for google", () => {
expect(providerSupportsNativePdf("google")).toBe(true);
});
it("returns false for openai", () => {
expect(providerSupportsNativePdf("openai")).toBe(false);
});
it("returns false for minimax", () => {
expect(providerSupportsNativePdf("minimax")).toBe(false);
});
it("is case-insensitive", () => {
expect(providerSupportsNativePdf("Anthropic")).toBe(true);
expect(providerSupportsNativePdf("GOOGLE")).toBe(true);
});
});
// ---------------------------------------------------------------------------
// PDF model config resolution
// ---------------------------------------------------------------------------
describe("resolvePdfModelConfigForTool", () => {
const priorFetch = global.fetch;
beforeEach(() => {
resetAuthEnv();
completeMock.mockReset();
});
afterEach(() => {
vi.unstubAllEnvs();
global.fetch = priorFetch;
});
it("returns null without any auth", async () => {
await withTempAgentDir(async (agentDir) => {
const cfg: OpenClawConfig = {
agents: { defaults: { model: { primary: "openai/gpt-5.4" } } },
};
expect(resolvePdfModelConfigForTool({ cfg, agentDir })).toBeNull();
});
});
it("prefers explicit pdfModel config", async () => {
await withTempAgentDir(async (agentDir) => {
const cfg: OpenClawConfig = {
agents: {
defaults: {
model: { primary: "openai/gpt-5.4" },
pdfModel: { primary: "anthropic/claude-opus-4-6" },
},
},
} as OpenClawConfig;
expect(resolvePdfModelConfigForTool({ cfg, agentDir })).toEqual({
primary: "anthropic/claude-opus-4-6",
});
});
});
it("falls back to imageModel config when no pdfModel set", async () => {
await withTempAgentDir(async (agentDir) => {
const cfg: OpenClawConfig = {
agents: {
defaults: {
model: { primary: "openai/gpt-5.4" },
imageModel: { primary: "openai/gpt-5.4-mini" },
},
},
};
expect(resolvePdfModelConfigForTool({ cfg, agentDir })).toEqual({
primary: "openai/gpt-5.4-mini",
});
});
});
it("prefers anthropic when available for native PDF support", async () => {
await withTempAgentDir(async (agentDir) => {
vi.stubEnv("ANTHROPIC_API_KEY", "anthropic-test");
vi.stubEnv("OPENAI_API_KEY", "openai-test");
const cfg = withDefaultModel("openai/gpt-5.4");
const config = resolvePdfModelConfigForTool({ cfg, agentDir });
expect(config).not.toBeNull();
// Should prefer anthropic for native PDF
expect(config?.primary).toBe(ANTHROPIC_PDF_MODEL);
});
});
it("uses anthropic primary when provider is anthropic", async () => {
await withTempAgentDir(async (agentDir) => {
vi.stubEnv("ANTHROPIC_API_KEY", "anthropic-test");
const cfg = withDefaultModel(ANTHROPIC_PDF_MODEL);
const config = resolvePdfModelConfigForTool({ cfg, agentDir });
expect(config?.primary).toBe(ANTHROPIC_PDF_MODEL);
});
});
});
// ---------------------------------------------------------------------------
// createPdfTool
// ---------------------------------------------------------------------------
describe("createPdfTool", () => {
const priorFetch = global.fetch;
@@ -345,22 +183,14 @@ describe("createPdfTool", () => {
global.fetch = priorFetch;
});
it("returns null without agentDir and no explicit config", () => {
expect(createPdfTool()).toBeNull();
it("returns null without agentDir and no explicit config", async () => {
expect((await loadCreatePdfTool())()).toBeNull();
});
it("returns null without any auth configured", async () => {
await withTempAgentDir(async (agentDir) => {
const cfg: OpenClawConfig = {
agents: { defaults: { model: { primary: "openai/gpt-5.4" } } },
};
expect(createPdfTool({ config: cfg, agentDir })).toBeNull();
});
});
it("throws when agentDir missing but explicit config present", () => {
it("throws when agentDir missing but explicit config present", async () => {
const cfg = withPdfModel(ANTHROPIC_PDF_MODEL);
expect(() => createPdfTool({ config: cfg })).toThrow("requires agentDir");
const createTool = await loadCreatePdfTool();
expect(() => createTool({ config: cfg })).toThrow("requires agentDir");
});
it("creates tool when auth is available", async () => {
@@ -395,7 +225,7 @@ describe("createPdfTool", () => {
try {
const cfg = withDefaultModel(ANTHROPIC_PDF_MODEL);
const tool = requirePdfTool(
createPdfTool({
(await loadCreatePdfTool())({
config: cfg,
agentDir,
workspaceDir,
@@ -432,7 +262,7 @@ describe("createPdfTool", () => {
await withTempAgentDir(async (agentDir) => {
const { loadSpy } = await stubPdfToolInfra(agentDir, { modelFound: false });
const cfg = withPdfModel(ANTHROPIC_PDF_MODEL);
const tool = requirePdfTool(createPdfTool({ config: cfg, agentDir }));
const tool = requirePdfTool((await loadCreatePdfTool())({ config: cfg, agentDir }));
await expect(
tool.execute("t1", {
@@ -449,13 +279,10 @@ describe("createPdfTool", () => {
it("uses native PDF path without eager extraction", async () => {
await withTempAgentDir(async (agentDir) => {
await stubPdfToolInfra(agentDir, { provider: "anthropic", input: ["text", "document"] });
vi.spyOn(pdfNativeProviders, "anthropicAnalyzePdf").mockResolvedValue("native summary");
const extractSpy = vi.spyOn(pdfExtractModule, "extractPdfContent");
const cfg = withPdfModel(ANTHROPIC_PDF_MODEL);
const tool = requirePdfTool(createPdfTool({ config: cfg, agentDir }));
const tool = requirePdfTool((await loadCreatePdfTool())({ config: cfg, agentDir }));
const result = await tool.execute("t1", {
prompt: "summarize",
@@ -474,7 +301,7 @@ describe("createPdfTool", () => {
await withTempAgentDir(async (agentDir) => {
await stubPdfToolInfra(agentDir, { provider: "anthropic", input: ["text", "document"] });
const cfg = withPdfModel(ANTHROPIC_PDF_MODEL);
const tool = requirePdfTool(createPdfTool({ config: cfg, agentDir }));
const tool = requirePdfTool((await loadCreatePdfTool())({ config: cfg, agentDir }));
await expect(
tool.execute("t1", {
@@ -489,12 +316,10 @@ describe("createPdfTool", () => {
it("uses extraction fallback for non-native models", async () => {
await withTempAgentDir(async (agentDir) => {
await stubPdfToolInfra(agentDir, { provider: "openai", input: ["text"] });
const extractSpy = vi.spyOn(pdfExtractModule, "extractPdfContent").mockResolvedValue({
text: "Extracted content",
images: [],
});
completeMock.mockResolvedValue({
role: "assistant",
stopReason: "stop",
@@ -502,8 +327,7 @@ describe("createPdfTool", () => {
} as never);
const cfg = withPdfModel(OPENAI_PDF_MODEL);
const tool = requirePdfTool(createPdfTool({ config: cfg, agentDir }));
const tool = requirePdfTool((await loadCreatePdfTool())({ config: cfg, agentDir }));
const result = await tool.execute("t1", {
prompt: "summarize",
@@ -534,12 +358,9 @@ describe("createPdfTool", () => {
});
});
// ---------------------------------------------------------------------------
// Native provider detection
// ---------------------------------------------------------------------------
describe("native PDF provider API calls", () => {
const priorFetch = global.fetch;
const mockFetchResponse = (response: unknown) => {
const fetchMock = vi.fn().mockResolvedValue(response);
global.fetch = Object.assign(fetchMock, { preconnect: vi.fn() }) as typeof global.fetch;
@@ -558,13 +379,13 @@ describe("native PDF provider API calls", () => {
}),
});
const result = await pdfNativeProviders.anthropicAnalyzePdf({
...makeAnthropicAnalyzeParams({
const result = await pdfNativeProviders.anthropicAnalyzePdf(
makeAnthropicAnalyzeParams({
modelId: "claude-opus-4-6",
prompt: "Summarize this document",
maxTokens: 4096,
}),
});
);
expect(result).toBe("Analysis of PDF");
expect(fetchMock).toHaveBeenCalledTimes(1);
@@ -608,20 +429,16 @@ describe("native PDF provider API calls", () => {
const fetchMock = mockFetchResponse({
ok: true,
json: async () => ({
candidates: [
{
content: { parts: [{ text: "Gemini PDF analysis" }] },
},
],
candidates: [{ content: { parts: [{ text: "Gemini PDF analysis" }] } }],
}),
});
const result = await pdfNativeProviders.geminiAnalyzePdf({
...makeGeminiAnalyzeParams({
const result = await pdfNativeProviders.geminiAnalyzePdf(
makeGeminiAnalyzeParams({
modelId: "gemini-2.5-pro",
prompt: "Summarize this",
}),
});
);
expect(result).toBe("Gemini PDF analysis");
expect(fetchMock).toHaveBeenCalledTimes(1);
@@ -666,8 +483,8 @@ describe("native PDF provider API calls", () => {
}),
});
await pdfNativeProviders.anthropicAnalyzePdf({
...makeAnthropicAnalyzeParams({
await pdfNativeProviders.anthropicAnalyzePdf(
makeAnthropicAnalyzeParams({
modelId: "claude-opus-4-6",
prompt: "Compare these documents",
pdfs: [
@@ -675,10 +492,9 @@ describe("native PDF provider API calls", () => {
{ base64: "cGRmMg==", filename: "doc2.pdf" },
],
}),
});
);
const body = JSON.parse(fetchMock.mock.calls[0][1].body);
// 2 document blocks + 1 text block
expect(body.messages[0].content).toHaveLength(3);
expect(body.messages[0].content[0].type).toBe("document");
expect(body.messages[0].content[1].type).toBe("document");
@@ -693,9 +509,9 @@ describe("native PDF provider API calls", () => {
}),
});
await pdfNativeProviders.anthropicAnalyzePdf({
...makeAnthropicAnalyzeParams({ baseUrl: "https://custom.example.com" }),
});
await pdfNativeProviders.anthropicAnalyzePdf(
makeAnthropicAnalyzeParams({ baseUrl: "https://custom.example.com" }),
);
expect(fetchMock.mock.calls[0][0]).toContain("https://custom.example.com/v1/messages");
});
@@ -751,67 +567,6 @@ describe("native PDF provider API calls", () => {
});
});
// ---------------------------------------------------------------------------
// PDF tool helpers
// ---------------------------------------------------------------------------
describe("pdf-tool.helpers", () => {
it("resolvePdfToolMaxTokens respects model limit", () => {
expect(resolvePdfToolMaxTokens(2048, 4096)).toBe(2048);
expect(resolvePdfToolMaxTokens(8192, 4096)).toBe(4096);
expect(resolvePdfToolMaxTokens(undefined, 4096)).toBe(4096);
});
it("coercePdfModelConfig reads primary and fallbacks", () => {
const cfg: OpenClawConfig = {
agents: {
defaults: {
pdfModel: {
primary: "anthropic/claude-opus-4-6",
fallbacks: ["google/gemini-2.5-pro"],
},
},
},
};
expect(coercePdfModelConfig(cfg)).toEqual({
primary: "anthropic/claude-opus-4-6",
fallbacks: ["google/gemini-2.5-pro"],
});
});
it("coercePdfAssistantText returns trimmed text", () => {
const text = coercePdfAssistantText({
provider: "anthropic",
model: "claude-opus-4-6",
message: {
role: "assistant",
stopReason: "stop",
content: [{ type: "text", text: " summary " }],
} as never,
});
expect(text).toBe("summary");
});
it("coercePdfAssistantText throws clear error for failed model output", () => {
expect(() =>
coercePdfAssistantText({
provider: "google",
model: "gemini-2.5-pro",
message: {
role: "assistant",
stopReason: "error",
errorMessage: "bad request",
content: [],
} as never,
}),
).toThrow("PDF model failed (google/gemini-2.5-pro): bad request");
});
});
// ---------------------------------------------------------------------------
// Model catalog document support
// ---------------------------------------------------------------------------
describe("model catalog document support", () => {
it("modelSupportsDocument returns true when input includes document", () => {
expect(

View File

@@ -1,19 +1,11 @@
import { type Context, complete } from "@mariozechner/pi-ai";
import { Type } from "@sinclair/typebox";
import type { OpenClawConfig } from "../../config/config.js";
import {
providerSupportsNativePdfDocument,
resolveAutoMediaKeyProviders,
resolveDefaultMediaModel,
} from "../../media-understanding/defaults.js";
import { extractPdfContent, type PdfExtractedContent } from "../../media/pdf-extract.js";
import { loadWebMediaRaw } from "../../media/web-media.js";
import { resolveUserPath } from "../../utils.js";
import {
coerceImageModelConfig,
type ImageModelConfig,
resolveProviderVisionModelFromConfig,
} from "./image-tool.helpers.js";
import { type ImageModelConfig } from "./image-tool.helpers.js";
import { resolvePdfModelConfigForTool } from "./pdf-tool.model-config.js";
import {
applyImageModelConfigDefaults,
buildTextToolResult,
@@ -22,7 +14,6 @@ import {
resolveModelRuntimeApiKey,
resolvePromptAndModelOverride,
} from "./media-tool-shared.js";
import { hasAuthForProvider, resolveDefaultModelRef } from "./model-config.helpers.js";
import { anthropicAnalyzePdf, geminiAnalyzePdf } from "./pdf-native-providers.js";
import {
coercePdfAssistantText,
@@ -56,105 +47,7 @@ const PDF_MAX_PIXELS = 4_000_000;
// Model resolution (mirrors image tool pattern)
// ---------------------------------------------------------------------------
/**
* Resolve the effective PDF model config.
* Falls back to the image model config, then to provider-specific defaults.
*/
export function resolvePdfModelConfigForTool(params: {
cfg?: OpenClawConfig;
agentDir: string;
}): ImageModelConfig | null {
// Check for explicit PDF model config first
const explicitPdf = coercePdfModelConfig(params.cfg);
if (explicitPdf.primary?.trim() || (explicitPdf.fallbacks?.length ?? 0) > 0) {
return explicitPdf;
}
// Fall back to the image model config
const explicitImage = coerceImageModelConfig(params.cfg);
if (explicitImage.primary?.trim() || (explicitImage.fallbacks?.length ?? 0) > 0) {
return explicitImage;
}
// Auto-detect from available providers
const primary = resolveDefaultModelRef(params.cfg);
const googleOk = hasAuthForProvider({ provider: "google", agentDir: params.agentDir });
const fallbacks: string[] = [];
const addFallback = (ref: string) => {
const trimmed = ref.trim();
if (trimmed && !fallbacks.includes(trimmed)) {
fallbacks.push(trimmed);
}
};
// Prefer providers with native PDF support
let preferred: string | null = null;
const providerOk = hasAuthForProvider({ provider: primary.provider, agentDir: params.agentDir });
const providerVision = resolveProviderVisionModelFromConfig({
cfg: params.cfg,
provider: primary.provider,
});
const providerDefault = resolveDefaultMediaModel({
cfg: params.cfg,
providerId: primary.provider,
capability: "image",
});
const primarySupportsNativePdf = providerSupportsNativePdfDocument({
cfg: params.cfg,
providerId: primary.provider,
});
const nativePdfCandidates = resolveAutoMediaKeyProviders({
cfg: params.cfg,
capability: "image",
})
.filter((providerId) => providerSupportsNativePdfDocument({ cfg: params.cfg, providerId }))
.filter((providerId) => hasAuthForProvider({ provider: providerId, agentDir: params.agentDir }))
.map((providerId) => {
const modelId = resolveDefaultMediaModel({
cfg: params.cfg,
providerId,
capability: "image",
});
return modelId ? `${providerId}/${modelId}` : null;
})
.filter((value): value is string => Boolean(value));
const genericImageCandidates = resolveAutoMediaKeyProviders({
cfg: params.cfg,
capability: "image",
})
.filter((providerId) => hasAuthForProvider({ provider: providerId, agentDir: params.agentDir }))
.map((providerId) => {
const modelId = resolveDefaultMediaModel({
cfg: params.cfg,
providerId,
capability: "image",
});
return modelId ? `${providerId}/${modelId}` : null;
})
.filter((value): value is string => Boolean(value));
if (primary.provider === "google" && googleOk && providerVision && primarySupportsNativePdf) {
preferred = providerVision;
} else if (providerOk && primarySupportsNativePdf && (providerVision || providerDefault)) {
preferred = providerVision ?? `${primary.provider}/${providerDefault}`;
} else {
preferred = nativePdfCandidates[0] ?? genericImageCandidates[0] ?? null;
}
if (preferred?.trim()) {
for (const candidate of [...nativePdfCandidates, ...genericImageCandidates]) {
if (candidate !== preferred) {
addFallback(candidate);
}
}
const pruned = fallbacks.filter((ref) => ref !== preferred);
return { primary: preferred, ...(pruned.length > 0 ? { fallbacks: pruned } : {}) };
}
return null;
}
export { resolvePdfModelConfigForTool } from "./pdf-tool.model-config.js";
// ---------------------------------------------------------------------------
// Build context for extraction fallback path

View File

@@ -0,0 +1,98 @@
import type { MediaUnderstandingCapability } from "./types.js";
import { normalizeMediaProviderId } from "./provider-id.js";
/**
 * Static, per-provider media-understanding defaults that ship with the code.
 * Every field is optional; a provider only lists what applies to it.
 */
type BundledMediaProviderDefaults = {
// Default model id to use for each capability the provider covers.
defaultModels?: Partial<Record<MediaUnderstandingCapability, string>>;
// Ordering key for auto-selection per capability. Lower numbers sort first in
// resolveBundledAutoMediaKeyProviders; a provider with no number for a
// capability is left out of that capability's auto list.
autoPriority?: Partial<Record<MediaUnderstandingCapability, number>>;
// Document kinds listed for the provider; bundledProviderSupportsNativePdfDocument
// checks this list for "pdf".
nativeDocumentInputs?: Array<"pdf">;
};
/**
 * Bundled defaults table, keyed by provider id.
 *
 * Lookups go through getBundledMediaProviderDefaults, which normalizes the
 * requested id with normalizeMediaProviderId before indexing this object.
 *
 * `autoPriority` values are sorted ascending per capability (ties broken by
 * provider id), so e.g. for "image" the auto order starts openai (10),
 * anthropic (20), google (30). Providers that have a default model but no
 * `autoPriority` for a capability (such as "openai-codex" or "openrouter" for
 * "image") can still resolve a default model but are not part of the bundled
 * auto-selection list for that capability.
 */
const BUNDLED_MEDIA_PROVIDER_DEFAULTS: Record<string, BundledMediaProviderDefaults> = {
openai: {
defaultModels: { image: "gpt-5.4-mini", audio: "gpt-4o-transcribe" },
autoPriority: { image: 10, audio: 10 },
},
"openai-codex": {
defaultModels: { image: "gpt-5.4" },
},
anthropic: {
defaultModels: { image: "claude-opus-4-6" },
autoPriority: { image: 20 },
nativeDocumentInputs: ["pdf"],
},
google: {
defaultModels: {
image: "gemini-3-flash-preview",
audio: "gemini-3-flash-preview",
video: "gemini-3-flash-preview",
},
autoPriority: { image: 30, audio: 40, video: 10 },
nativeDocumentInputs: ["pdf"],
},
groq: {
defaultModels: { audio: "whisper-large-v3-turbo" },
autoPriority: { audio: 20 },
},
deepgram: {
defaultModels: { audio: "nova-3" },
autoPriority: { audio: 30 },
},
mistral: {
defaultModels: { audio: "voxtral-mini-latest" },
autoPriority: { audio: 50 },
},
minimax: {
defaultModels: { image: "MiniMax-VL-01" },
autoPriority: { image: 40 },
},
"minimax-portal": {
defaultModels: { image: "MiniMax-VL-01" },
autoPriority: { image: 50 },
},
zai: {
defaultModels: { image: "glm-4.6v" },
autoPriority: { image: 60 },
},
qwen: {
defaultModels: { image: "qwen-vl-max-latest", video: "qwen-vl-max-latest" },
autoPriority: { video: 15 },
},
moonshot: {
defaultModels: { image: "kimi-k2.5", video: "kimi-k2.5" },
autoPriority: { video: 20 },
},
openrouter: {
defaultModels: { image: "auto" },
},
};
/**
 * Looks up the bundled media defaults for a provider.
 *
 * The id is normalized with `normalizeMediaProviderId` before the lookup.
 * Only own keys of the defaults table count as a hit: the table is a plain
 * object literal, so an unguarded index would also resolve inherited
 * `Object.prototype` members (for ids such as "constructor" or "__proto__")
 * and hand callers a non-null value that is not a defaults entry.
 *
 * @param providerId - Provider id to look up.
 * @returns The bundled defaults entry, or `null` when the provider has none.
 */
export function getBundledMediaProviderDefaults(providerId: string): BundledMediaProviderDefaults | null {
  const key = normalizeMediaProviderId(providerId);
  if (!Object.prototype.hasOwnProperty.call(BUNDLED_MEDIA_PROVIDER_DEFAULTS, key)) {
    return null;
  }
  return BUNDLED_MEDIA_PROVIDER_DEFAULTS[key] ?? null;
}
/**
 * Returns the bundled default model id for a provider/capability pair.
 *
 * @param params.providerId - Provider whose bundled defaults are consulted.
 * @param params.capability - Media capability to resolve a model for.
 * @returns The trimmed model id, or `undefined` when the provider has no
 *   bundled entry or no default model for that capability.
 */
export function resolveBundledDefaultMediaModel(params: {
  providerId: string;
  capability: MediaUnderstandingCapability;
}): string | undefined {
  const { providerId, capability } = params;
  const bundled = getBundledMediaProviderDefaults(providerId);
  const modelId = bundled?.defaultModels?.[capability];
  return modelId?.trim();
}
/**
 * Lists the bundled providers eligible for auto-selection for a capability.
 *
 * Providers without a numeric `autoPriority` for the capability are skipped.
 * The rest are ordered by ascending priority, with ties broken by
 * `localeCompare` on the provider id.
 *
 * @param capability - Media capability to rank providers for.
 * @returns Provider ids in auto-selection order.
 */
export function resolveBundledAutoMediaKeyProviders(capability: MediaUnderstandingCapability): string[] {
  const ranked: Array<{ providerId: string; priority: number }> = [];
  for (const [providerId, defaults] of Object.entries(BUNDLED_MEDIA_PROVIDER_DEFAULTS)) {
    const priority = defaults.autoPriority?.[capability];
    if (typeof priority === "number") {
      ranked.push({ providerId, priority });
    }
  }
  // `ranked` is a fresh local array, so sorting it in place is safe.
  ranked.sort((a, b) =>
    a.priority !== b.priority ? a.priority - b.priority : a.providerId.localeCompare(b.providerId),
  );
  return ranked.map(({ providerId }) => providerId);
}
/**
 * Reports whether the bundled defaults list "pdf" among a provider's native
 * document inputs.
 *
 * @param providerId - Provider to check.
 * @returns `true` only when the provider has a bundled entry whose
 *   `nativeDocumentInputs` includes "pdf"; `false` otherwise.
 */
export function bundledProviderSupportsNativePdfDocument(providerId: string): boolean {
  const documentInputs = getBundledMediaProviderDefaults(providerId)?.nativeDocumentInputs;
  if (!documentInputs) {
    return false;
  }
  return documentInputs.includes("pdf");
}

View File

@@ -1,4 +1,9 @@
import type { OpenClawConfig } from "../config/config.js";
import {
bundledProviderSupportsNativePdfDocument,
resolveBundledAutoMediaKeyProviders,
resolveBundledDefaultMediaModel,
} from "./bundled-defaults.js";
import { buildMediaUnderstandingRegistry, normalizeMediaProviderId } from "./provider-registry.js";
import type { MediaUnderstandingCapability, MediaUnderstandingProvider } from "./types.js";
@@ -52,12 +57,53 @@ function resolveDefaultRegistry(cfg?: OpenClawConfig) {
return buildMediaUnderstandingRegistry(undefined, cfg ?? ({} as OpenClawConfig));
}
/**
 * Finds an image-capable model configured for a provider under
 * `cfg.models.providers`.
 *
 * Config keys are compared after `normalizeMediaProviderId`. Only the first
 * key that matches the requested provider is consulted; within it, the first
 * model with a non-blank id whose `input` array contains "image" wins.
 *
 * @param params.cfg - Optional config to read provider model lists from.
 * @param params.providerId - Provider to resolve an image model for.
 * @returns The trimmed model id, or `undefined` when there is no provider
 *   config, no matching key, or no image-capable model under that key.
 */
function resolveConfiguredImageProviderModel(params: {
  cfg?: OpenClawConfig;
  providerId: string;
}): string | undefined {
  const configuredProviders = params.cfg?.models?.providers;
  if (!configuredProviders || typeof configuredProviders !== "object") {
    return undefined;
  }
  const wantedId = normalizeMediaProviderId(params.providerId);
  const matchedEntry = Object.entries(configuredProviders).find(
    ([providerKey]) => normalizeMediaProviderId(providerKey) === wantedId,
  );
  if (!matchedEntry) {
    return undefined;
  }
  const [, providerCfg] = matchedEntry;
  for (const model of providerCfg?.models ?? []) {
    const hasId = Boolean(model?.id?.trim());
    if (hasId && Array.isArray(model?.input) && model.input.includes("image")) {
      return model?.id?.trim() || undefined;
    }
  }
  return undefined;
}
export function resolveDefaultMediaModel(params: {
providerId: string;
capability: MediaUnderstandingCapability;
cfg?: OpenClawConfig;
providerRegistry?: Map<string, MediaUnderstandingProvider>;
}): string | undefined {
if (!params.providerRegistry) {
const configuredImageModel =
params.capability === "image"
? resolveConfiguredImageProviderModel({
cfg: params.cfg,
providerId: params.providerId,
})
: undefined;
if (configuredImageModel) {
return configuredImageModel;
}
const bundledDefault = resolveBundledDefaultMediaModel({
providerId: params.providerId,
capability: params.capability,
});
if (bundledDefault) {
return bundledDefault;
}
}
const registry = params.providerRegistry ?? resolveDefaultRegistry(params.cfg);
const provider = registry.get(normalizeMediaProviderId(params.providerId));
return provider?.defaultModels?.[params.capability]?.trim() || undefined;
@@ -68,6 +114,28 @@ export function resolveAutoMediaKeyProviders(params: {
cfg?: OpenClawConfig;
providerRegistry?: Map<string, MediaUnderstandingProvider>;
}): string[] {
if (!params.providerRegistry) {
const bundledProviders = resolveBundledAutoMediaKeyProviders(params.capability);
if (params.capability !== "image") {
return bundledProviders;
}
const configProviders = params.cfg?.models?.providers;
if (!configProviders || typeof configProviders !== "object") {
return bundledProviders;
}
const merged = [...bundledProviders];
for (const [providerKey, providerCfg] of Object.entries(configProviders)) {
const normalizedProviderId = normalizeMediaProviderId(providerKey);
const models = providerCfg?.models ?? [];
const hasImageModel = models.some(
(model) => Array.isArray(model?.input) && model.input.includes("image"),
);
if (hasImageModel && !merged.includes(normalizedProviderId)) {
merged.push(normalizedProviderId);
}
}
return merged;
}
const registry = params.providerRegistry ?? resolveDefaultRegistry(params.cfg);
type AutoProviderEntry = {
provider: MediaUnderstandingProvider;
@@ -97,6 +165,9 @@ export function providerSupportsNativePdfDocument(params: {
cfg?: OpenClawConfig;
providerRegistry?: Map<string, MediaUnderstandingProvider>;
}): boolean {
if (!params.providerRegistry && bundledProviderSupportsNativePdfDocument(params.providerId)) {
return true;
}
const registry = params.providerRegistry ?? resolveDefaultRegistry(params.cfg);
const provider = registry.get(normalizeMediaProviderId(params.providerId));
return provider?.nativeDocumentInputs?.includes("pdf") ?? false;

View File

@@ -1,4 +1,4 @@
import { normalizeProviderId } from "../agents/model-selection.js";
import { normalizeProviderId } from "../agents/provider-id.js";
export function normalizeMediaProviderId(id: string): string {
const normalized = normalizeProviderId(id);