refactor(pdf): move document extraction to plugin

* refactor(pdf): move document extraction to plugin
* fix(deps): sync document extract lockfile
* fix(pdf): harden document extraction plugin
2026-05-06 17:40:44 +00:00 · 2026-04-24 17:15:05 -07:00
parent 915931aa38
commit e3cba98f39
34 changed files with 1023 additions and 321 deletions
--- a/extensions/document-extract/document-extractor.test.ts
+++ b/extensions/document-extract/document-extractor.test.ts
@@ -0,0 +1,62 @@
+import { beforeEach, describe, expect, it, vi } from "vitest";
+
+const { canvasSizes, pdfDocument } = vi.hoisted(() => ({ // Shared mock state, built via vi.hoisted so the vi.mock factories below can reference it.
+  canvasSizes: [] as Array<{ width: number; height: number }>, // one entry is pushed per mocked createCanvas(width, height) call
+  pdfDocument: {
+    numPages: 2, // the fake document reports two pages
+    getPage: vi.fn(async () => ({ // every page request resolves to an identical fake page
+      getTextContent: vi.fn(async () => ({ items: [] })), // pages carry no text items
+      getViewport: vi.fn(({ scale }: { scale: number }) => ({ // 1000x1000 at scale 1, growing linearly with scale
+        width: 1000 * scale,
+        height: 1000 * scale,
+      })),
+      render: vi.fn(() => ({ promise: Promise.resolve() })), // resolves immediately; nothing is drawn
+    })),
+  },
+}));
+
+vi.mock("pdfjs-dist/legacy/build/pdf.mjs", () => ({ // Replace the pdf.js module with a stub for this test file.
+  getDocument: vi.fn(() => ({ promise: Promise.resolve(pdfDocument) })), // returns an object whose `promise` resolves to the hoisted fake document
+}));
+
+vi.mock("@napi-rs/canvas", () => ({ // Replace the canvas module with a stub that records requested sizes.
+  createCanvas: vi.fn((width: number, height: number) => {
+    canvasSizes.push({ width, height }); // remember the requested dimensions for later assertions
+    return {
+      toBuffer: vi.fn(() => Buffer.from("png")), // the stub exposes only toBuffer; it yields the bytes of the string "png", not a real image
+    };
+  }),
+}));
+
+import { createPdfDocumentExtractor } from "./document-extractor.js";
+
+describe("PDF document extractor", () => { // Exercises createPdfDocumentExtractor against the mocked pdf.js and canvas modules.
+  beforeEach(() => { // reset shared mock state before each test
+    canvasSizes.length = 0; // empty the array in place; the canvas mock pushes into this same array
+    pdfDocument.getPage.mockClear(); // forget getPage calls recorded by earlier tests
+  });
+
+  it("declares PDF support", () => {
+    const extractor = createPdfDocumentExtractor();
+    expect(extractor).toMatchObject({ // subset match: only the listed properties are checked
+      id: "pdf",
+      label: "PDF",
+      mimeTypes: ["application/pdf"],
+    });
+  });
+
+  it("treats maxPixels as a hard total image rendering budget", async () => {
+    const extractor = createPdfDocumentExtractor();
+
+    const result = await extractor.extract({
+      buffer: Buffer.from("%PDF-1.4"), // placeholder bytes; the pdf.js mock ignores its input
+      mimeType: "application/pdf",
+      maxPages: 2, // equals the mocked numPages
+      maxPixels: 100, // far below the mocked 1000x1000 viewport at scale 1
+      minTextChars: 10, // mocked pages yield no text items; presumably this triggers image rendering (confirm in extractor)
+    });
+
+    expect(result?.images).toHaveLength(1); // only one image comes back for the two-page document
+    expect(canvasSizes).toEqual([{ width: 10, height: 10 }]); // a single 10x10 canvas = 100 px, the entire maxPixels value
+  });
+});