mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-01 11:53:31 +00:00
* feat: use clawpdf for PDF extraction * fix: align approval action prompt typing * chore: use clawpdf 0.2.0 * fix: lazily load clawpdf backend
89 lines
2.3 KiB
TypeScript
89 lines
2.3 KiB
TypeScript
import type { PdfEngine, PdfImage } from "clawpdf";
|
|
import type {
|
|
DocumentExtractedImage,
|
|
DocumentExtractionRequest,
|
|
DocumentExtractionResult,
|
|
DocumentExtractorPlugin,
|
|
} from "openclaw/plugin-sdk/document-extractor";
|
|
|
|
/** Upper bound passed as `maxTextChars` when extracting text from a PDF. */
const MAX_EXTRACTED_TEXT_CHARS = 200_000;

/** Upper bound passed as the image `maxDimension` option when extracting images from a PDF. */
const MAX_RENDER_DIMENSION = 10_000;
|
|
|
|
/**
 * Cached promise for the clawpdf engine.
 *
 * `null` until `loadPdfEngine()` is first called, and reset to `null` whenever
 * loading fails so that the next call starts a fresh attempt.
 */
let pdfEnginePromise: Promise<PdfEngine> | null = null;

/**
 * Lazily imports clawpdf and creates its engine, sharing one promise across callers.
 *
 * The module is loaded with a dynamic `import()` on first use rather than at
 * module load time. While an attempt is pending or has succeeded, every call
 * resolves from the same cached promise.
 *
 * @returns The engine produced by clawpdf's `createEngine()`.
 * @throws Error with the message "Dependency clawpdf is required for PDF extraction"
 *   and the underlying failure attached as `cause`, when either the import or
 *   `createEngine()` rejects.
 */
async function loadPdfEngine(): Promise<PdfEngine> {
  if (!pdfEnginePromise) {
    pdfEnginePromise = import("clawpdf")
      .then(({ createEngine }) => createEngine())
      .catch((err: unknown) => {
        // Drop the rejected promise so a later call retries instead of
        // replaying the same failure forever.
        pdfEnginePromise = null;
        throw new Error("Dependency clawpdf is required for PDF extraction", {
          cause: err,
        });
      });
  }
  return pdfEnginePromise;
}
|
|
|
|
/**
 * Converts an image returned by clawpdf into the document-extractor image shape.
 *
 * @param image - Source image; its `bytes` and `mimeType` are read.
 * @returns An entry tagged `type: "image"` whose `data` is the base64 encoding
 *   of `image.bytes` and whose `mimeType` is passed through unchanged.
 */
function toDocumentImage(image: PdfImage): DocumentExtractedImage {
  return {
    type: "image",
    data: Buffer.from(image.bytes).toString("base64"),
    mimeType: image.mimeType,
  };
}
|
|
|
|
/**
 * Extracts text from a PDF and falls back to image extraction when the text is too short.
 *
 * Flow:
 * 1. Load the clawpdf engine and open the document from `request.buffer`.
 * 2. Select pages. When `request.pageNumbers` is provided, keep only integer
 *    entries within `1..pdf.pageCount` and truncate the list to
 *    `request.maxPages`. Otherwise pass `request.maxPages` through as `maxPages`.
 * 3. Extract text, capped at `MAX_EXTRACTED_TEXT_CHARS`. If the trimmed text
 *    has at least `request.minTextChars` characters, return it with no images.
 * 4. Otherwise extract images for the same page selection. If that fails, the
 *    error is handed to `request.onImageExtractionError` (when present) and
 *    the text is returned with no images.
 *
 * The opened document is destroyed on every exit path.
 *
 * @param request - Extraction request; reads `buffer`, `pageNumbers`,
 *   `maxPages`, `minTextChars`, `maxPixels`, and `onImageExtractionError`.
 * @returns The extracted text together with any extracted images.
 * @throws Propagates failures from loading the engine, opening the document,
 *   and text extraction. Image-extraction failures are not rethrown.
 */
async function extractPdfContent(
  request: DocumentExtractionRequest,
): Promise<DocumentExtractionResult> {
  const engine = await loadPdfEngine();
  const pdf = await engine.open(new Uint8Array(request.buffer));
  try {
    // An explicit page list takes precedence. Note that a list filtered down
    // to empty is still passed as `pages: []` rather than falling back to
    // `maxPages`.
    const pages = request.pageNumbers
      ? request.pageNumbers
          .filter((p) => Number.isInteger(p) && p >= 1 && p <= pdf.pageCount)
          .slice(0, request.maxPages)
      : undefined;
    const pageSelection = pages ? { pages } : { maxPages: request.maxPages };

    const textResult = await pdf.extract({
      mode: "text",
      ...pageSelection,
      maxTextChars: MAX_EXTRACTED_TEXT_CHARS,
    });
    const text = textResult.text;

    // Enough text was found: skip image extraction entirely.
    if (text.trim().length >= request.minTextChars) {
      return { text, images: [] };
    }

    try {
      const imageResult = await pdf.extract({
        mode: "images",
        ...pageSelection,
        image: {
          maxDimension: MAX_RENDER_DIMENSION,
          maxPixels: request.maxPixels,
          forms: true,
        },
      });
      return { text, images: imageResult.images.map(toDocumentImage) };
    } catch (err) {
      // Best effort: report the failure and still return whatever text was found.
      request.onImageExtractionError?.(err);
      return { text, images: [] };
    }
  } finally {
    // Release the opened document whether extraction succeeded or threw.
    pdf.destroy();
  }
}
|
|
|
|
/**
 * Builds the document-extractor plugin descriptor for PDF files.
 *
 * @returns A plugin with id `"pdf"` and label `"PDF"` that declares the
 *   `application/pdf` MIME type, uses `autoDetectOrder` 10, and delegates
 *   extraction to `extractPdfContent`.
 */
export function createPdfDocumentExtractor(): DocumentExtractorPlugin {
  return {
    id: "pdf",
    label: "PDF",
    mimeTypes: ["application/pdf"],
    autoDetectOrder: 10,
    extract: extractPdfContent,
  };
}
|