mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 17:50:45 +00:00
refactor(pdf): move document extraction to plugin
* refactor(pdf): move document extraction to plugin * fix(deps): sync document extract lockfile * fix(pdf): harden document extraction plugin
This commit is contained in:
216
extensions/document-extract/document-extractor.ts
Normal file
216
extensions/document-extract/document-extractor.ts
Normal file
@@ -0,0 +1,216 @@
|
||||
import type {
|
||||
DocumentExtractedImage,
|
||||
DocumentExtractionRequest,
|
||||
DocumentExtractionResult,
|
||||
DocumentExtractorPlugin,
|
||||
} from "openclaw/plugin-sdk/document-extractor";
|
||||
|
||||
/**
 * Minimal canvas surface this file relies on: the only operation used is
 * encoding the rendered page as a PNG buffer.
 */
type CanvasLike = {
  toBuffer(type: "image/png"): Buffer;
};

/**
 * Subset of the dynamically imported canvas module that this file calls:
 * a factory producing a canvas of the given pixel size.
 */
type CanvasModule = {
  createCanvas(width: number, height: number): CanvasLike;
};
|
||||
|
||||
/** A text-content entry that carries a string fragment in `str`. */
type PdfTextItem = {
  str: string;
};

/**
 * Result of `PdfPage.getTextContent()`. Entries without a `str` property are
 * allowed and are skipped by the text extraction loop in this file.
 */
type PdfTextContent = {
  items: (PdfTextItem | object)[];
};

/** Page dimensions returned by `PdfPage.getViewport()` for a given scale. */
type PdfViewport = {
  width: number;
  height: number;
};

/** Subset of a PDF page used here: text extraction, sizing and rendering. */
type PdfPage = {
  getTextContent(): Promise<PdfTextContent>;
  getViewport(params: { scale: number }): PdfViewport;
  // `canvas` is left as `unknown` so any canvas implementation can be passed.
  render(params: { canvas: unknown; viewport: PdfViewport }): { promise: Promise<void> };
};

/** Subset of a loaded PDF document used here: page count and page lookup. */
type PdfDocument = {
  numPages: number;
  getPage(pageNumber: number): Promise<PdfPage>;
};

/**
 * Subset of the dynamically imported PDF module that this file calls:
 * `getDocument` returns a task object whose `promise` resolves to the document.
 */
type PdfJsModule = {
  getDocument(params: { data: Uint8Array; disableWorker?: boolean }): {
    promise: Promise<PdfDocument>;
  };
};
|
||||
|
||||
// Module specifiers handed to dynamic `import()` by the lazy loaders below.
const CANVAS_MODULE = "@napi-rs/canvas";
const PDFJS_MODULE = "pdfjs-dist/legacy/build/pdf.mjs";

/** Upper bound on the extracted text length enforced by `appendTextWithinLimit`. */
const MAX_EXTRACTED_TEXT_CHARS = 200_000;

/** Largest width or height, in pixels, that `resolveRenderPlan` will plan for a page. */
const MAX_RENDER_DIMENSION = 10_000;

// Cached in-flight/resolved imports; each loader resets its entry to `null`
// when the import fails so a later call can try again.
let canvasModulePromise: Promise<CanvasModule> | null = null;
let pdfJsModulePromise: Promise<PdfJsModule> | null = null;
|
||||
|
||||
/**
 * Lazily imports the canvas module and memoizes the import promise.
 *
 * A failed import clears the memoized promise, so the next call retries
 * instead of replaying the cached rejection.
 *
 * @returns The loaded canvas module.
 * @throws Error (with the import failure as `cause`) when the module cannot be loaded.
 */
async function loadCanvasModule(): Promise<CanvasModule> {
  canvasModulePromise ??= (import(CANVAS_MODULE) as Promise<CanvasModule>).catch(
    (cause: unknown) => {
      // Forget the rejected promise so a later call attempts the import again.
      canvasModulePromise = null;
      throw new Error("Optional dependency @napi-rs/canvas is required for PDF image extraction", {
        cause,
      });
    },
  );
  return canvasModulePromise;
}
|
||||
|
||||
/**
 * Lazily imports the PDF module and memoizes the import promise.
 *
 * A failed import clears the memoized promise, so the next call retries
 * instead of replaying the cached rejection.
 *
 * @returns The loaded PDF module.
 * @throws Error (with the import failure as `cause`) when the module cannot be loaded.
 */
async function loadPdfJsModule(): Promise<PdfJsModule> {
  pdfJsModulePromise ??= (import(PDFJS_MODULE) as Promise<PdfJsModule>).catch(
    (cause: unknown) => {
      // Forget the rejected promise so a later call attempts the import again.
      pdfJsModulePromise = null;
      throw new Error("Optional dependency pdfjs-dist is required for PDF extraction", {
        cause,
      });
    },
  );
  return pdfJsModulePromise;
}
|
||||
|
||||
/**
 * Appends `pageText` to `parts` without letting the running total exceed the
 * character limit, truncating the text when it does not fit entirely.
 *
 * Truncation never leaves half of a UTF-16 surrogate pair at the end of the
 * appended text: if the cut would fall between a high and a low surrogate, the
 * dangling high surrogate is dropped as well.
 *
 * @param parts Accumulator that receives the (possibly truncated) text.
 * @param pageText Text to append; an empty string is a no-op.
 * @param currentLength Number of characters already accumulated.
 * @param maxChars Character limit; defaults to `MAX_EXTRACTED_TEXT_CHARS`.
 * @returns The updated running total. When the text had to be truncated the
 *   limit itself is returned (even if one code unit was dropped to keep a
 *   surrogate pair intact), so a caller's `>= limit` check reliably stops.
 */
function appendTextWithinLimit(
  parts: string[],
  pageText: string,
  currentLength: number,
  maxChars: number = MAX_EXTRACTED_TEXT_CHARS,
): number {
  if (!pageText) {
    return currentLength;
  }
  const remaining = maxChars - currentLength;
  if (remaining <= 0) {
    return currentLength;
  }
  if (pageText.length <= remaining) {
    parts.push(pageText);
    return currentLength + pageText.length;
  }

  // The text must be cut. `pageText.length > remaining`, so both indices below are valid.
  let cut = remaining;
  const lastKept = pageText.charCodeAt(cut - 1);
  const firstDropped = pageText.charCodeAt(cut);
  const splitsSurrogatePair =
    lastKept >= 0xd800 && lastKept <= 0xdbff && firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
  if (splitsSurrogatePair) {
    // Drop the orphaned high surrogate rather than emit malformed UTF-16.
    cut -= 1;
  }
  if (cut > 0) {
    parts.push(pageText.slice(0, cut));
  }
  // The budget is treated as fully consumed once truncation happened.
  return currentLength + remaining;
}
|
||||
|
||||
/**
 * Chooses the largest render scale (at most 1) for a page such that the
 * resulting canvas respects both the per-side dimension cap and the remaining
 * pixel budget.
 *
 * Canvas sizes are rounded up to whole pixels, so the analytically derived
 * upper bound may overshoot the budget. The bound itself is tried first (the
 * common case of a page that simply fits); otherwise a bisection over
 * `(0, maxScale)` finds the largest scale that still fits.
 *
 * @param viewport Page size at scale 1.
 * @param remainingPixels Pixel budget still available for rendering.
 * @param maxDimension Largest allowed canvas width/height in pixels; defaults
 *   to `MAX_RENDER_DIMENSION`.
 * @returns The chosen scale with the resulting canvas width, height and pixel
 *   count, or `null` when the inputs are unusable (non-positive budget,
 *   non-finite or non-positive viewport size) or no scale fits.
 */
function resolveRenderPlan(
  viewport: PdfViewport,
  remainingPixels: number,
  maxDimension: number = MAX_RENDER_DIMENSION,
): { scale: number; width: number; height: number; pixels: number } | null {
  if (
    remainingPixels <= 0 ||
    !Number.isFinite(viewport.width) ||
    !Number.isFinite(viewport.height) ||
    viewport.width <= 0 ||
    viewport.height <= 0
  ) {
    return null;
  }

  const pagePixels = Math.max(1, viewport.width * viewport.height);
  const maxScale = Math.min(
    1,
    Math.sqrt(remainingPixels / pagePixels),
    maxDimension / viewport.width,
    maxDimension / viewport.height,
  );
  if (!Number.isFinite(maxScale) || maxScale <= 0) {
    return null;
  }

  // Builds the plan for one scale, or null when the rounded-up canvas breaks a limit.
  const planAt = (
    scale: number,
  ): { scale: number; width: number; height: number; pixels: number } | null => {
    const width = Math.max(1, Math.ceil(viewport.width * scale));
    const height = Math.max(1, Math.ceil(viewport.height * scale));
    const pixels = width * height;
    const fits = width <= maxDimension && height <= maxDimension && pixels <= remainingPixels;
    return fits ? { scale, width, height, pixels } : null;
  };

  // Fast path: the upper bound itself fits, so no search is needed and the
  // page is rendered at exactly that scale rather than a value just below it.
  const atMaxScale = planAt(maxScale);
  if (atMaxScale) {
    return atMaxScale;
  }

  // Rounding up pushed the upper bound over a limit; bisect for the largest fitting scale.
  let best: { scale: number; width: number; height: number; pixels: number } | null = null;
  let low = 0;
  let high = maxScale;
  for (let i = 0; i < 32; i += 1) {
    const mid = (low + high) / 2;
    const candidate = planAt(mid);
    if (candidate) {
      best = candidate;
      low = mid;
    } else {
      high = mid;
    }
  }
  return best;
}
|
||||
|
||||
/**
 * Best-effort release of a loaded PDF document.
 *
 * `destroy` is not part of the minimal `PdfDocument` shape declared in this
 * file, so it is probed for structurally and only called when present.
 * Failures are ignored on purpose: cleanup must never replace the extraction
 * result or mask the original error.
 */
async function destroyPdfDocument(pdf: PdfDocument): Promise<void> {
  const disposable = pdf as PdfDocument & { destroy?: () => Promise<void> };
  try {
    await disposable.destroy?.();
  } catch {
    // Intentionally ignored; see the function comment.
  }
}

/**
 * Extracts content from a PDF: text first, page images as a fallback.
 *
 * Steps:
 * 1. Load the document from `request.buffer`.
 * 2. Select pages: the in-range integer entries of `request.pageNumbers` when
 *    given, otherwise pages 1..N; in both cases at most `request.maxPages`.
 * 3. Collect the text of the selected pages, capped via `appendTextWithinLimit`.
 * 4. If the trimmed text has at least `request.minTextChars` characters,
 *    return it without images.
 * 5. Otherwise render the selected pages to PNG while the `request.maxPixels`
 *    budget lasts. If the canvas module cannot be loaded, the error is passed
 *    to `request.onImageExtractionError` and only the text is returned.
 *
 * The document is released when extraction finishes, whether it succeeded or threw.
 *
 * @param request Extraction request (buffer, page selection and limits).
 * @returns The extracted text and any rendered page images (base64 PNG).
 * @throws When the PDF module cannot be loaded, the document cannot be parsed,
 *   or reading/rendering a page fails.
 */
async function extractPdfContent(
  request: DocumentExtractionRequest,
): Promise<DocumentExtractionResult> {
  const pdfJsModule = await loadPdfJsModule();
  const pdf = await pdfJsModule.getDocument({
    data: new Uint8Array(request.buffer),
    disableWorker: true,
  }).promise;

  try {
    // Non-integer page numbers are dropped along with out-of-range ones so
    // they never reach `getPage`.
    const effectivePages: number[] = request.pageNumbers
      ? request.pageNumbers
          .filter((p) => Number.isInteger(p) && p >= 1 && p <= pdf.numPages)
          .slice(0, request.maxPages)
      : Array.from({ length: Math.min(pdf.numPages, request.maxPages) }, (_, i) => i + 1);

    const textParts: string[] = [];
    let extractedTextLength = 0;
    for (const pageNum of effectivePages) {
      const page = await pdf.getPage(pageNum);
      const textContent = await page.getTextContent();
      const pageText = textContent.items
        .map((item) => ("str" in item ? item.str : ""))
        .filter(Boolean)
        .join(" ");
      if (pageText) {
        extractedTextLength = appendTextWithinLimit(textParts, pageText, extractedTextLength);
        if (extractedTextLength >= MAX_EXTRACTED_TEXT_CHARS) {
          // Text budget exhausted; remaining pages are not read.
          break;
        }
      }
    }

    const text = textParts.join("\n\n");
    if (text.trim().length >= request.minTextChars) {
      return { text, images: [] };
    }

    // Too little text: fall back to rendering pages as images.
    let canvasModule: CanvasModule;
    try {
      canvasModule = await loadCanvasModule();
    } catch (err) {
      request.onImageExtractionError?.(err);
      return { text, images: [] };
    }

    const images: DocumentExtractedImage[] = [];
    let remainingPixels = Math.max(1, Math.floor(request.maxPixels));

    for (const pageNum of effectivePages) {
      if (remainingPixels <= 0) {
        break;
      }
      const page = await pdf.getPage(pageNum);
      const viewport = page.getViewport({ scale: 1 });
      const plan = resolveRenderPlan(viewport, remainingPixels);
      if (!plan) {
        break;
      }
      const scaled = page.getViewport({ scale: plan.scale });
      const canvas = canvasModule.createCanvas(plan.width, plan.height);
      // `render` accepts `canvas: unknown`, so the canvas is passed as-is.
      await page.render({ canvas, viewport: scaled }).promise;
      const png = canvas.toBuffer("image/png");
      images.push({ type: "image", data: png.toString("base64"), mimeType: "image/png" });
      remainingPixels -= plan.pixels;
    }

    return { text, images };
  } finally {
    await destroyPdfDocument(pdf);
  }
}
|
||||
|
||||
/**
 * Builds the PDF document-extractor plugin descriptor.
 *
 * @returns A plugin with id `"pdf"` that handles `application/pdf` and
 *   delegates extraction to `extractPdfContent`.
 */
export function createPdfDocumentExtractor(): DocumentExtractorPlugin {
  const pdfExtractor: DocumentExtractorPlugin = {
    id: "pdf",
    label: "PDF",
    mimeTypes: ["application/pdf"],
    autoDetectOrder: 10,
    extract: extractPdfContent,
  };
  return pdfExtractor;
}
|
||||
Reference in New Issue
Block a user