Files
openclaw/src/media/pdf-extract.ts
Tyler Yust d0ac1b0195 feat: add PDF analysis tool with native provider support (#31319)
* feat: add PDF analysis tool with native provider support

New `pdf` tool for analyzing PDF documents with model-powered analysis.

Architecture:
- Native PDF path: sends raw PDF bytes directly to providers that support
  inline document input (Anthropic via DocumentBlockParam, Google Gemini
  via inlineData with application/pdf MIME type)
- Extraction fallback: for providers without native PDF support, extracts
  text via pdfjs-dist and rasterizes pages to images via @napi-rs/canvas,
  then sends through the standard vision/text completion path

Key features:
- Single PDF (`pdf` param) or multiple PDFs (`pdfs` array, up to 10)
- Page range selection (`pages` param, e.g. "1-5", "1,3,7-9")
- Model override (`model` param) and file size limits (`maxBytesMb`)
- Auto-detects provider capability and falls back gracefully
- Same security patterns as image tool (SSRF guards, sandbox support,
  local path roots, workspace-only policy)

Config (agents.defaults):
- pdfModel: primary/fallbacks (defaults to imageModel, then session model)
- pdfMaxBytesMb: max PDF file size (default: 10)
- pdfMaxPages: max pages to process (default: 20)

Model catalog:
- Extended ModelInputType to include "document" alongside "text"/"image"
- Added modelSupportsDocument() capability check

Files:
- src/agents/tools/pdf-tool.ts - main tool factory
- src/agents/tools/pdf-tool.helpers.ts - helpers (page range, config, etc.)
- src/agents/tools/pdf-native-providers.ts - direct API calls for Anthropic/Google
- src/agents/tools/pdf-tool.test.ts - 43 tests covering all paths
- Modified: model-catalog.ts, openclaw-tools.ts, config schema/types/labels/help

* fix: prepare pdf tool for merge (#31319) (thanks @tyler6204)
2026-03-01 22:39:12 -08:00

105 lines
3.3 KiB
TypeScript

// Module namespace types of the two dynamically imported dependencies. These are
// type-only references; the actual imports happen inside the loader functions.
type CanvasModule = typeof import("@napi-rs/canvas");
type PdfJsModule = typeof import("pdfjs-dist/legacy/build/pdf.mjs");
// Cached dynamic-import promises. `null` means either no import has been started
// yet or the previous attempt failed and was cleared so it can be retried.
let canvasModulePromise: Promise<CanvasModule> | null = null;
let pdfJsModulePromise: Promise<PdfJsModule> | null = null;
/**
 * Lazily imports `@napi-rs/canvas`, sharing a single cached import promise
 * between callers.
 *
 * When the import fails, the cached promise is cleared so a later call starts a
 * fresh import, and the failure is rethrown as an `Error` that names the missing
 * optional dependency.
 *
 * @returns The `@napi-rs/canvas` module namespace.
 * @throws Error if the module cannot be imported.
 */
async function loadCanvasModule(): Promise<CanvasModule> {
  // Reuse an import that is already in flight or has already resolved.
  if (canvasModulePromise) {
    return canvasModulePromise;
  }
  const pendingImport = import("@napi-rs/canvas").catch((cause) => {
    // Drop the failed attempt so the next caller retries the import.
    canvasModulePromise = null;
    throw new Error(
      `Optional dependency @napi-rs/canvas is required for PDF image extraction: ${String(cause)}`,
    );
  });
  canvasModulePromise = pendingImport;
  return pendingImport;
}
/**
 * Lazily imports the pdfjs-dist legacy build, sharing a single cached import
 * promise between callers.
 *
 * When the import fails, the cached promise is cleared so a later call starts a
 * fresh import, and the failure is rethrown as an `Error` that names the missing
 * optional dependency.
 *
 * @returns The `pdfjs-dist/legacy/build/pdf.mjs` module namespace.
 * @throws Error if the module cannot be imported.
 */
async function loadPdfJsModule(): Promise<PdfJsModule> {
  // Reuse an import that is already in flight or has already resolved.
  if (pdfJsModulePromise) {
    return pdfJsModulePromise;
  }
  const pendingImport = import("pdfjs-dist/legacy/build/pdf.mjs").catch((cause) => {
    // Drop the failed attempt so the next caller retries the import.
    pdfJsModulePromise = null;
    throw new Error(
      `Optional dependency pdfjs-dist is required for PDF extraction: ${String(cause)}`,
    );
  });
  pdfJsModulePromise = pendingImport;
  return pendingImport;
}
/** A single rendered PDF page image, as carried in `PdfExtractedContent.images`. */
export type PdfExtractedImage = {
// Discriminator tag; always the literal "image".
type: "image";
// Image payload as a string; extractPdfContent fills it with base64-encoded PNG bytes.
data: string;
// MIME type describing `data`; extractPdfContent always sets "image/png".
mimeType: string;
};
/** Result of `extractPdfContent`: the extracted text plus any rendered page images. */
export type PdfExtractedContent = {
// Text of the processed pages; each non-empty page's text is joined with a blank line ("\n\n").
text: string;
// Rendered page images. Empty when the extracted text met the minimum length
// threshold, or when the canvas dependency could not be loaded.
images: PdfExtractedImage[];
};
/**
 * Extracts text from a PDF and, when that text is too sparse, renders the
 * selected pages to PNG images instead.
 *
 * Flow:
 * 1. Text is read from every selected page via pdfjs-dist; the text of non-empty
 *    pages is joined with a blank line.
 * 2. If the trimmed text has at least `minTextChars` characters it is returned
 *    with an empty `images` array.
 * 3. Otherwise every selected page is rasterized with `@napi-rs/canvas` and
 *    returned as a base64 PNG next to whatever text was found.
 *
 * The pdf.js loading task is always destroyed before returning or throwing, so
 * the parsed document does not stay alive after the call.
 *
 * @param params.buffer - Raw PDF bytes; copied into a fresh `Uint8Array` before
 *   being handed to pdfjs-dist.
 * @param params.maxPages - Maximum number of pages to process. Values that are
 *   negative or `NaN` are treated as 0.
 * @param params.maxPixels - Per-page pixel budget used to pick the render scale.
 *   Pages are never upscaled (scale <= 1) and never scaled below 0.1. Values
 *   below 1 or `NaN` are treated as 1.
 * @param params.minTextChars - Minimum trimmed text length that makes the
 *   text-only result sufficient.
 * @param params.pageNumbers - Optional 1-based page numbers to process, in the
 *   given order. Entries that are not integers or fall outside the document are
 *   skipped; the list is then truncated to `maxPages`. When omitted, the first
 *   `maxPages` pages are used.
 * @param params.onImageExtractionError - Invoked with the load error when
 *   `@napi-rs/canvas` cannot be imported; the function then returns the text
 *   with no images instead of throwing.
 * @returns The extracted text and any rendered page images.
 * @throws Error if pdfjs-dist cannot be loaded. Errors raised while parsing the
 *   document or rendering a page propagate to the caller.
 */
export async function extractPdfContent(params: {
  buffer: Buffer;
  maxPages: number;
  maxPixels: number;
  minTextChars: number;
  pageNumbers?: number[];
  onImageExtractionError?: (error: unknown) => void;
}): Promise<PdfExtractedContent> {
  const { buffer, maxPages, maxPixels, minTextChars, pageNumbers, onImageExtractionError } = params;
  const { getDocument } = await loadPdfJsModule();
  const loadingTask = getDocument({ data: new Uint8Array(buffer), disableWorker: true });
  try {
    const pdf = await loadingTask.promise;

    // Treat negative/NaN limits as "no pages" so both selection paths agree
    // (a negative end index would otherwise make slice() count from the end).
    const pageLimit = maxPages > 0 ? maxPages : 0;
    const effectivePages: number[] = pageNumbers
      ? pageNumbers
          // Page numbers must be whole numbers inside the document.
          .filter((p) => Number.isInteger(p) && p >= 1 && p <= pdf.numPages)
          .slice(0, pageLimit)
      : Array.from({ length: Math.min(pdf.numPages, pageLimit) }, (_, i) => i + 1);

    // Pass 1: plain text extraction.
    const textParts: string[] = [];
    for (const pageNum of effectivePages) {
      const page = await pdf.getPage(pageNum);
      const textContent = await page.getTextContent();
      const pageText = textContent.items
        // Only items carrying a `str` field contribute text.
        .map((item) => ("str" in item ? String(item.str) : ""))
        .filter(Boolean)
        .join(" ");
      if (pageText) {
        textParts.push(pageText);
      }
    }
    const text = textParts.join("\n\n");
    if (text.trim().length >= minTextChars) {
      // Enough text was found; skip the rasterization pass entirely.
      return { text, images: [] };
    }

    // Pass 2: rasterize pages. The canvas dependency is optional, so a load
    // failure degrades to a text-only result rather than an exception.
    let canvasModule: CanvasModule;
    try {
      canvasModule = await loadCanvasModule();
    } catch (err) {
      onImageExtractionError?.(err);
      return { text, images: [] };
    }
    const { createCanvas } = canvasModule;
    const images: PdfExtractedImage[] = [];
    // Guard against NaN and sub-pixel budgets, which would yield a NaN scale.
    const pixelBudget = maxPixels >= 1 ? maxPixels : 1;
    for (const pageNum of effectivePages) {
      const page = await pdf.getPage(pageNum);
      const viewport = page.getViewport({ scale: 1 });
      const pagePixels = viewport.width * viewport.height;
      // Shrink the page to fit the pixel budget, but never upscale it.
      const scale = Math.min(1, Math.sqrt(pixelBudget / Math.max(1, pagePixels)));
      // Floor the scale at 0.1 so very large pages do not collapse to nothing.
      const scaled = page.getViewport({ scale: Math.max(0.1, scale) });
      const canvas = createCanvas(Math.ceil(scaled.width), Math.ceil(scaled.height));
      await page.render({
        canvas: canvas as unknown as HTMLCanvasElement,
        viewport: scaled,
      }).promise;
      const png = canvas.toBuffer("image/png");
      images.push({ type: "image", data: png.toString("base64"), mimeType: "image/png" });
    }
    return { text, images };
  } finally {
    // Best-effort cleanup: release the document held by pdf.js. A cleanup
    // failure must not mask the extraction result or the original error.
    try {
      await loadingTask.destroy();
    } catch {
      // Intentionally ignored; nothing useful can be done at this point.
    }
  }
}