fix(pdf): resolve standard fonts from pdfjs package root (#70936)

* fix(pdf): resolve standard fonts from pdfjs package root

Resolve PDF.js standard fonts via pdfjs-dist/package.json instead of a
relative ../../node_modules path so the fallback renderer does not depend
on emitted dist chunk layout.

Add focused regression coverage that asserts the forwarded
standardFontDataUrl matches the installed pdfjs-dist package root and
exists on disk.

* fix(pdf): resolve pdfjs standard fonts from package root

* fix(pdf): use PDF.js font URL separator

---------

Co-authored-by: Dr JCai <jingxiao.cai@gmail.com>
Co-authored-by: vincentkoc <25068+vincentkoc@users.noreply.github.com>
Co-authored-by: Vincent Koc <vincentkoc@ieee.org>
This commit is contained in:
JC
2026-04-30 00:38:48 -07:00
committed by GitHub
parent 2d748e4ac1
commit 83753535eb
4 changed files with 85 additions and 9 deletions

View File

@@ -45,6 +45,7 @@ Docs: https://docs.openclaw.ai
- Security/QQBot: sanitize debug log arguments before writing to `console.*`, so gateway payload fields cannot forge extra log lines when debug logging is enabled. Thanks @vincentkoc.
- CLI/agents/status: keep `openclaw agents`, text `agents list`, and plain text `status` on read-only metadata paths so human output no longer preloads plugin runtimes or live channel scans before printing. Fixes #74195. Thanks @NianJiuZst.
- Agents/local models: derive context-window guard thresholds from the effective model window with 4k/8k safety floors, so small local models are no longer rejected by fixed 16k/32k preflight cutoffs. Fixes #42999. Thanks @chengjialu8888.
- PDF extraction: resolve PDF.js standard fonts from the installed package root and pass a filesystem path to the Node fallback extractor, so built-in font PDFs render without `file://` URL lookup failures. Fixes #51455; carries forward #70936, #54447, and #62175. Thanks @anyech, @JuanRdBO, and @solomonneas.
- Media: treat legacy Word/OLE attachments with `application/msword` or `application/x-cfb` MIME as binary so printable-looking `.doc` files are not embedded into prompts as text. Fixes #54176; carries forward #54380. Thanks @andyliu.
- Config: accept documented `browser.tabCleanup` keys in strict root config validation, so configured tab cleanup no longer fails before runtime reads it. Fixes #74577. Thanks @lonexreb and @ezdlp.
- Cron: validate disabled job schedule edits before persisting updates, so invalid cron changes no longer partially mutate stored jobs. Fixes #74459. Thanks @yfge.

View File

@@ -1,7 +1,11 @@
import { existsSync } from "node:fs";
import { createRequire } from "node:module";
import path from "node:path";
import { beforeEach, describe, expect, it, vi } from "vitest";
const { canvasSizes, pdfDocument } = vi.hoisted(() => ({
const { canvasSizes, getDocumentMock, pdfDocument } = vi.hoisted(() => ({
canvasSizes: [] as Array<{ width: number; height: number }>,
getDocumentMock: vi.fn(),
pdfDocument: {
numPages: 2,
getPage: vi.fn(async () => ({
@@ -16,7 +20,7 @@ const { canvasSizes, pdfDocument } = vi.hoisted(() => ({
}));
vi.mock("pdfjs-dist/legacy/build/pdf.mjs", () => ({
getDocument: vi.fn(() => ({ promise: Promise.resolve(pdfDocument) })),
getDocument: getDocumentMock,
}));
vi.mock("@napi-rs/canvas", () => ({
@@ -30,9 +34,13 @@ vi.mock("@napi-rs/canvas", () => ({
import { createPdfDocumentExtractor } from "./document-extractor.js";
const require = createRequire(import.meta.url);
describe("PDF document extractor", () => {
beforeEach(() => {
canvasSizes.length = 0;
getDocumentMock.mockReset();
getDocumentMock.mockReturnValue({ promise: Promise.resolve(pdfDocument) });
pdfDocument.getPage.mockClear();
});
@@ -59,4 +67,34 @@ describe("PDF document extractor", () => {
expect(result?.images).toHaveLength(1);
expect(canvasSizes).toEqual([{ width: 10, height: 10 }]);
});
// Regression coverage: the extractor must hand PDF.js a plain filesystem
// directory (with a trailing "/") located inside the installed pdfjs-dist
// package, not a file:// URL or a path derived from the build output layout.
it("passes standardFontDataUrl to pdfjs getDocument as a package-root filesystem path", async () => {
  await createPdfDocumentExtractor().extract({
    buffer: Buffer.from("%PDF-1.4"),
    mimeType: "application/pdf",
    maxPages: 1,
    maxPixels: 4_000_000,
    minTextChars: 200,
  });

  // Exactly one document load is expected, with the worker disabled.
  expect(getDocumentMock).toHaveBeenCalledTimes(1);
  const firstCallArgs = getDocumentMock.mock.calls[0] ?? [];
  const documentInit = firstCallArgs[0];
  expect(documentInit).toMatchObject({
    disableWorker: true,
  });
  expect(typeof documentInit.standardFontDataUrl).toBe("string");

  // Recompute the expected directory independently from the package manifest.
  const pdfJsPackageRoot = path.dirname(require.resolve("pdfjs-dist/package.json"));
  const expectedStandardFontDataUrl = path.join(pdfJsPackageRoot, "standard_fonts") + "/";
  expect(documentInit.standardFontDataUrl).toBe(expectedStandardFontDataUrl);

  // Shape of the value: absolute path, "/" terminated, never a file:// URL.
  expect(path.isAbsolute(documentInit.standardFontDataUrl)).toBe(true);
  expect(documentInit.standardFontDataUrl.endsWith("/")).toBe(true);
  expect(documentInit.standardFontDataUrl.startsWith("file://")).toBe(false);

  // The directory and one known font file must really exist on disk.
  expect(existsSync(documentInit.standardFontDataUrl)).toBe(true);
  const sampleFontPath = path.join(
    documentInit.standardFontDataUrl,
    "LiberationSans-Regular.ttf",
  );
  expect(existsSync(sampleFontPath)).toBe(true);
});
});

View File

@@ -1,9 +1,12 @@
import { createRequire } from "node:module";
import path from "node:path";
import type {
DocumentExtractedImage,
DocumentExtractionRequest,
DocumentExtractionResult,
DocumentExtractorPlugin,
} from "openclaw/plugin-sdk/document-extractor";
import type * as PdfJsLegacy from "pdfjs-dist/legacy/build/pdf.mjs";
type CanvasLike = {
toBuffer(type: "image/png"): Buffer;
@@ -37,19 +40,17 @@ type PdfDocument = {
getPage(pageNumber: number): Promise<PdfPage>;
};
type PdfJsModule = {
getDocument(params: { data: Uint8Array; disableWorker?: boolean }): {
promise: Promise<PdfDocument>;
};
};
type PdfJsModule = typeof PdfJsLegacy;
// Module specifiers for the optional canvas backend and the PDF.js legacy build.
// NOTE(review): presumably consumed by loadCanvasModule / loadPdfJsModule via
// dynamic import — the loader bodies are not fully visible here; confirm.
const CANVAS_MODULE = "@napi-rs/canvas";
const PDFJS_MODULE = "pdfjs-dist/legacy/build/pdf.mjs";
// Upper bounds for extraction output.
// NOTE(review): presumably the text cap is enforced by appendTextWithinLimit and
// the dimension cap by the page renderer — confirm against their bodies.
const MAX_EXTRACTED_TEXT_CHARS = 200_000;
const MAX_RENDER_DIMENSION = 10_000;
// CommonJS-style `require` bound to this module's URL; used by
// resolvePdfJsStandardFontDataPath to call require.resolve on pdfjs-dist.
const require = createRequire(import.meta.url);
// Lazily populated module-level caches; each starts as null until first use.
let canvasModulePromise: Promise<CanvasModule> | null = null;
let pdfJsModulePromise: Promise<PdfJsModule> | null = null;
// Memoized standard-font directory, filled on the first call to
// resolvePdfJsStandardFontDataPath.
let pdfJsStandardFontDataPath: string | null = null;
async function loadCanvasModule(): Promise<CanvasModule> {
if (!canvasModulePromise) {
@@ -75,6 +76,15 @@ async function loadPdfJsModule(): Promise<PdfJsModule> {
return pdfJsModulePromise;
}
/**
 * Returns the directory holding the PDF.js standard fonts, as a filesystem
 * path terminated by "/".
 *
 * The location is derived from the resolved `pdfjs-dist/package.json`, so it
 * follows the installed package rather than a path relative to this file.
 * The result is memoized in `pdfJsStandardFontDataPath` after the first call.
 *
 * @returns The `standard_fonts` directory path with a trailing "/".
 * @throws Whatever `require.resolve` throws when `pdfjs-dist/package.json`
 *   cannot be resolved.
 */
function resolvePdfJsStandardFontDataPath(): string {
  // Fast path: reuse the value computed by an earlier call.
  if (pdfJsStandardFontDataPath) {
    return pdfJsStandardFontDataPath;
  }

  const packageRoot = path.dirname(require.resolve("pdfjs-dist/package.json"));
  // The trailing "/" is appended literally rather than via path.sep.
  const fontDirectory = path.join(packageRoot, "standard_fonts") + "/";

  pdfJsStandardFontDataPath = fontDirectory;
  return fontDirectory;
}
function appendTextWithinLimit(parts: string[], pageText: string, currentLength: number): number {
if (!pageText) {
return currentLength;
@@ -139,10 +149,11 @@ async function extractPdfContent(
request: DocumentExtractionRequest,
): Promise<DocumentExtractionResult> {
const pdfJsModule = await loadPdfJsModule();
const pdf = await pdfJsModule.getDocument({
const pdf = (await pdfJsModule.getDocument({
data: new Uint8Array(request.buffer),
disableWorker: true,
}).promise;
standardFontDataUrl: resolvePdfJsStandardFontDataPath(),
}).promise) as PdfDocument;
const effectivePages: number[] = request.pageNumbers
? request.pageNumbers.filter((p) => p >= 1 && p <= pdf.numPages).slice(0, request.maxPages)

26
src/types/pdfjs-dist-legacy.d.ts vendored Normal file
View File

@@ -0,0 +1,26 @@
/**
 * Ambient typings for the PDF.js legacy build entry point.
 *
 * Declares `getDocument` and re-exports a set of types from the pdfjs-dist
 * type definitions so that importers of "pdfjs-dist/legacy/build/pdf.mjs"
 * get typed access to them.
 */
declare module "pdfjs-dist/legacy/build/pdf.mjs" {
import type {
DocumentInitParameters,
PDFDocumentLoadingTask,
TypedArray,
} from "pdfjs-dist/types/src/display/api.js";
/**
 * `DocumentInitParameters` extended with an optional `disableWorker` flag.
 */
export type LegacyDocumentInitParameters = DocumentInitParameters & {
disableWorker?: boolean;
};
/**
 * Starts loading a PDF document.
 *
 * @param src - A string, URL, typed array, ArrayBuffer, or an init-parameters
 *   object (including the optional `disableWorker` flag declared above).
 * @returns The loading task for the document.
 */
export function getDocument(
src?: string | URL | TypedArray | ArrayBuffer | LegacyDocumentInitParameters,
): PDFDocumentLoadingTask;
// Type-only re-exports from the display API typings.
export type {
DocumentInitParameters,
PDFDocumentLoadingTask,
PDFDocumentProxy,
PDFPageProxy,
TextContent,
TextItem,
TypedArray,
} from "pdfjs-dist/types/src/display/api.js";
// Type-only re-export of the viewport type from the display utilities typings.
export type { PageViewport } from "pdfjs-dist/types/src/display/display_utils.js";
}