Files
openclaw/extensions/document-extract/document-extractor.ts
Peter Steinberger 75c3b53038 [codex] Use clawpdf for PDF extraction (#87670)
* feat: use clawpdf for PDF extraction

* fix: align approval action prompt typing

* chore: use clawpdf 0.2.0

* fix: lazily load clawpdf backend
2026-05-28 17:35:39 +01:00

89 lines
2.3 KiB
TypeScript

import type { PdfEngine, PdfImage } from "clawpdf";
import type {
DocumentExtractedImage,
DocumentExtractionRequest,
DocumentExtractionResult,
DocumentExtractorPlugin,
} from "openclaw/plugin-sdk/document-extractor";
/** Cap handed to the PDF backend as `maxTextChars` when extracting in text mode. */
const MAX_EXTRACTED_TEXT_CHARS = 200_000;
/**
 * Cap handed to the PDF backend as `image.maxDimension` when extracting images.
 * NOTE(review): presumably measured in pixels per side — confirm against clawpdf.
 */
const MAX_RENDER_DIMENSION = 10_000;
/** In-flight or settled engine load shared by all callers; `null` when no load is cached. */
let pdfEnginePromise: Promise<PdfEngine> | null = null;

/**
 * Lazily imports `clawpdf` and creates its engine, reusing the same promise
 * for every subsequent call.
 *
 * @returns The engine produced by `createEngine()`.
 * @throws Error with message "Dependency clawpdf is required for PDF extraction"
 *   (original failure attached as `cause`) when the import or engine creation fails.
 */
async function loadPdfEngine(): Promise<PdfEngine> {
  if (pdfEnginePromise === null) {
    const pendingEngine = (async () => {
      try {
        const clawpdf = await import("clawpdf");
        return await clawpdf.createEngine();
      } catch (err) {
        // Drop the cached promise so a later call attempts the load again.
        pdfEnginePromise = null;
        throw new Error("Dependency clawpdf is required for PDF extraction", {
          cause: err,
        });
      }
    })();
    pdfEnginePromise = pendingEngine;
  }
  return pdfEnginePromise;
}
/**
 * Converts an image produced by the PDF backend into the plugin SDK image shape.
 *
 * @param image Backend image; its `bytes` are base64-encoded and its `mimeType` is passed through.
 * @returns An object tagged `type: "image"` carrying the base64 payload and MIME type.
 */
function toDocumentImage(image: PdfImage): DocumentExtractedImage {
  const { bytes, mimeType } = image;
  const data = Buffer.from(bytes).toString("base64");
  return { type: "image", data, mimeType };
}
/**
 * Extracts text from a PDF and, when the text is too short, falls back to
 * extracting images from the same pages.
 *
 * @param request Extraction request; reads `buffer`, `pageNumbers`, `maxPages`,
 *   `minTextChars`, `maxPixels` and the optional `onImageExtractionError` callback.
 * @returns The extracted text together with any images (empty when the text
 *   meets `minTextChars` or when image extraction fails).
 * @throws Whatever `loadPdfEngine`, `engine.open` or the text extraction call throws;
 *   image extraction failures are reported to `onImageExtractionError` instead.
 */
async function extractPdfContent(
  request: DocumentExtractionRequest,
): Promise<DocumentExtractionResult> {
  const pdfEngine = await loadPdfEngine();
  const openedPdf = await pdfEngine.open(new Uint8Array(request.buffer));

  // Explicit page numbers win over the plain page limit: keep only integer
  // pages inside the document and truncate the list to `maxPages` entries.
  const choosePages = () => {
    if (!request.pageNumbers) {
      return { maxPages: request.maxPages };
    }
    return {
      pages: request.pageNumbers
        .filter(
          (pageNumber) =>
            Number.isInteger(pageNumber) && pageNumber >= 1 && pageNumber <= openedPdf.pageCount,
        )
        .slice(0, request.maxPages),
    };
  };

  try {
    const selection = choosePages();

    const { text } = await openedPdf.extract({
      mode: "text",
      ...selection,
      maxTextChars: MAX_EXTRACTED_TEXT_CHARS,
    });

    // Enough non-whitespace-trimmed text means no image fallback is needed.
    const hasEnoughText = text.trim().length >= request.minTextChars;
    if (hasEnoughText) {
      return { text, images: [] };
    }

    let images: DocumentExtractedImage[] = [];
    try {
      const rendered = await openedPdf.extract({
        mode: "images",
        ...selection,
        image: {
          maxDimension: MAX_RENDER_DIMENSION,
          maxPixels: request.maxPixels,
          forms: true,
        },
      });
      images = rendered.images.map((image) => toDocumentImage(image));
    } catch (err) {
      // Best effort: report the failure and still return the text we have.
      request.onImageExtractionError?.(err);
    }
    return { text, images };
  } finally {
    // Always release the opened document, on success and on failure alike.
    openedPdf.destroy();
  }
}
/**
 * Creates the document extractor plugin descriptor for PDF files.
 *
 * @returns A plugin with id `"pdf"` that declares the `application/pdf` MIME type
 *   and uses `extractPdfContent` as its `extract` handler.
 */
export function createPdfDocumentExtractor(): DocumentExtractorPlugin {
  const pdfExtractor: DocumentExtractorPlugin = {
    id: "pdf",
    label: "PDF",
    mimeTypes: ["application/pdf"],
    // NOTE(review): presumably the priority used when auto-detecting an extractor — confirm in the SDK.
    autoDetectOrder: 10,
    extract: extractPdfContent,
  };
  return pdfExtractor;
}