diff --git a/CHANGELOG.md b/CHANGELOG.md index 89e4db7d102..deff027be42 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,7 @@ Docs: https://docs.openclaw.ai - Security/QQBot: sanitize debug log arguments before writing to `console.*`, so gateway payload fields cannot forge extra log lines when debug logging is enabled. Thanks @vincentkoc. - CLI/agents/status: keep `openclaw agents`, text `agents list`, and plain text `status` on read-only metadata paths so human output no longer preloads plugin runtimes or live channel scans before printing. Fixes #74195. Thanks @NianJiuZst. - Agents/local models: derive context-window guard thresholds from the effective model window with 4k/8k safety floors, so small local models are no longer rejected by fixed 16k/32k preflight cutoffs. Fixes #42999. Thanks @chengjialu8888. +- PDF extraction: resolve PDF.js standard fonts from the installed package root and pass a filesystem path to the Node fallback extractor, so built-in font PDFs render without `file://` URL lookup failures. Fixes #51455; carries forward #70936, #54447, and #62175. Thanks @anyech, @JuanRdBO, and @solomonneas. - Media: treat legacy Word/OLE attachments with `application/msword` or `application/x-cfb` MIME as binary so printable-looking `.doc` files are not embedded into prompts as text. Fixes #54176; carries forward #54380. Thanks @andyliu. - Config: accept documented `browser.tabCleanup` keys in strict root config validation, so configured tab cleanup no longer fails before runtime reads it. Fixes #74577. Thanks @lonexreb and @ezdlp. - Cron: validate disabled job schedule edits before persisting updates, so invalid cron changes no longer partially mutate stored jobs. Fixes #74459. Thanks @yfge. diff --git a/extensions/document-extract/document-extractor.test.ts b/extensions/document-extract/document-extractor.test.ts index d65168d3234..d8b4e3d87a3 100644 --- a/extensions/document-extract/document-extractor.test.ts +++ b/extensions/document-extract/document-extractor.test.ts @@ -1,7 +1,11 @@ +import { existsSync } from "node:fs"; +import { createRequire } from "node:module"; +import path from "node:path"; import { beforeEach, describe, expect, it, vi } from "vitest"; -const { canvasSizes, pdfDocument } = vi.hoisted(() => ({ +const { canvasSizes, getDocumentMock, pdfDocument } = vi.hoisted(() => ({ canvasSizes: [] as Array<{ width: number; height: number }>, + getDocumentMock: vi.fn(), pdfDocument: { numPages: 2, getPage: vi.fn(async () => ({ @@ -16,7 +20,7 @@ const { canvasSizes, pdfDocument } = vi.hoisted(() => ({ })); vi.mock("pdfjs-dist/legacy/build/pdf.mjs", () => ({ - getDocument: vi.fn(() => ({ promise: Promise.resolve(pdfDocument) })), + getDocument: getDocumentMock, })); vi.mock("@napi-rs/canvas", () => ({ @@ -30,9 +34,13 @@ vi.mock("@napi-rs/canvas", () => ({ import { createPdfDocumentExtractor } from "./document-extractor.js"; +const require = createRequire(import.meta.url); + describe("PDF document extractor", () => { beforeEach(() => { canvasSizes.length = 0; + getDocumentMock.mockReset(); + getDocumentMock.mockReturnValue({ promise: Promise.resolve(pdfDocument) }); pdfDocument.getPage.mockClear(); }); @@ -59,4 +67,34 @@ describe("PDF document extractor", () => { expect(result?.images).toHaveLength(1); expect(canvasSizes).toEqual([{ width: 10, height: 10 }]); }); + + it("passes standardFontDataUrl to pdfjs getDocument as a package-root filesystem path", async () => { + const extractor = createPdfDocumentExtractor(); + + await extractor.extract({ + buffer: Buffer.from("%PDF-1.4"), + mimeType: "application/pdf", + maxPages: 1, + maxPixels: 4_000_000, + minTextChars: 200, + }); + + expect(getDocumentMock).toHaveBeenCalledTimes(1); + const [params] = getDocumentMock.mock.calls[0] ?? []; + expect(params).toMatchObject({ + disableWorker: true, + }); + expect(typeof params.standardFontDataUrl).toBe("string"); + + const expectedStandardFontDataUrl = + path.join(path.dirname(require.resolve("pdfjs-dist/package.json")), "standard_fonts") + "/"; + expect(params.standardFontDataUrl).toBe(expectedStandardFontDataUrl); + expect(path.isAbsolute(params.standardFontDataUrl)).toBe(true); + expect(params.standardFontDataUrl.endsWith("/")).toBe(true); + expect(params.standardFontDataUrl.startsWith("file://")).toBe(false); + expect(existsSync(params.standardFontDataUrl)).toBe(true); + expect(existsSync(path.join(params.standardFontDataUrl, "LiberationSans-Regular.ttf"))).toBe( + true, + ); + }); }); diff --git a/extensions/document-extract/document-extractor.ts b/extensions/document-extract/document-extractor.ts index 4bea3b566ef..37cb7b03bec 100644 --- a/extensions/document-extract/document-extractor.ts +++ b/extensions/document-extract/document-extractor.ts @@ -1,9 +1,12 @@ +import { createRequire } from "node:module"; +import path from "node:path"; import type { DocumentExtractedImage, DocumentExtractionRequest, DocumentExtractionResult, DocumentExtractorPlugin, } from "openclaw/plugin-sdk/document-extractor"; +import type * as PdfJsLegacy from "pdfjs-dist/legacy/build/pdf.mjs"; type CanvasLike = { toBuffer(type: "image/png"): Buffer; @@ -37,19 +40,17 @@ type PdfDocument = { getPage(pageNumber: number): Promise; }; -type PdfJsModule = { - getDocument(params: { data: Uint8Array; disableWorker?: boolean }): { - promise: Promise; - }; -}; +type PdfJsModule = typeof PdfJsLegacy; const CANVAS_MODULE = "@napi-rs/canvas"; const PDFJS_MODULE = "pdfjs-dist/legacy/build/pdf.mjs"; const MAX_EXTRACTED_TEXT_CHARS = 200_000; const MAX_RENDER_DIMENSION = 10_000; +const require = createRequire(import.meta.url); let canvasModulePromise: Promise | null = null; let pdfJsModulePromise: Promise | null = null; +let pdfJsStandardFontDataPath: string | null = null; async function loadCanvasModule(): Promise { if (!canvasModulePromise) { @@ -75,6 +76,15 @@ async function loadPdfJsModule(): Promise { return pdfJsModulePromise; } +function resolvePdfJsStandardFontDataPath(): string { + if (!pdfJsStandardFontDataPath) { + const pdfJsPackageJsonPath = require.resolve("pdfjs-dist/package.json"); + pdfJsStandardFontDataPath = + path.join(path.dirname(pdfJsPackageJsonPath), "standard_fonts") + "/"; + } + return pdfJsStandardFontDataPath; +} + function appendTextWithinLimit(parts: string[], pageText: string, currentLength: number): number { if (!pageText) { return currentLength; @@ -139,10 +149,11 @@ async function extractPdfContent( request: DocumentExtractionRequest, ): Promise { const pdfJsModule = await loadPdfJsModule(); - const pdf = await pdfJsModule.getDocument({ + const pdf = (await pdfJsModule.getDocument({ data: new Uint8Array(request.buffer), disableWorker: true, - }).promise; + standardFontDataUrl: resolvePdfJsStandardFontDataPath(), + }).promise) as PdfDocument; const effectivePages: number[] = request.pageNumbers ? request.pageNumbers.filter((p) => p >= 1 && p <= pdf.numPages).slice(0, request.maxPages) diff --git a/src/types/pdfjs-dist-legacy.d.ts b/src/types/pdfjs-dist-legacy.d.ts new file mode 100644 index 00000000000..1657b3246c2 --- /dev/null +++ b/src/types/pdfjs-dist-legacy.d.ts @@ -0,0 +1,26 @@ +declare module "pdfjs-dist/legacy/build/pdf.mjs" { + import type { + DocumentInitParameters, + PDFDocumentLoadingTask, + TypedArray, + } from "pdfjs-dist/types/src/display/api.js"; + + export type LegacyDocumentInitParameters = DocumentInitParameters & { + disableWorker?: boolean; + }; + + export function getDocument( + src?: string | URL | TypedArray | ArrayBuffer | LegacyDocumentInitParameters, + ): PDFDocumentLoadingTask; + + export type { + DocumentInitParameters, + PDFDocumentLoadingTask, + PDFDocumentProxy, + PDFPageProxy, + TextContent, + TextItem, + TypedArray, + } from "pdfjs-dist/types/src/display/api.js"; + export type { PageViewport } from "pdfjs-dist/types/src/display/display_utils.js"; +}