mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:20:43 +00:00
fix(pdf): resolve standard fonts from pdfjs package root (#70936)
* fix(pdf): resolve standard fonts from pdfjs package root Resolve PDF.js standard fonts via pdfjs-dist/package.json instead of a relative ../../node_modules path so the fallback renderer does not depend on emitted dist chunk layout. Add focused regression coverage that asserts the forwarded standardFontDataUrl matches the installed pdfjs-dist package root and exists on disk. * fix(pdf): resolve pdfjs standard fonts from package root * fix(pdf): use PDF.js font URL separator --------- Co-authored-by: Dr JCai <jingxiao.cai@gmail.com> Co-authored-by: vincentkoc <25068+vincentkoc@users.noreply.github.com> Co-authored-by: Vincent Koc <vincentkoc@ieee.org>
This commit is contained in:
@@ -45,6 +45,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Security/QQBot: sanitize debug log arguments before writing to `console.*`, so gateway payload fields cannot forge extra log lines when debug logging is enabled. Thanks @vincentkoc.
|
||||
- CLI/agents/status: keep `openclaw agents`, text `agents list`, and plain text `status` on read-only metadata paths so human output no longer preloads plugin runtimes or live channel scans before printing. Fixes #74195. Thanks @NianJiuZst.
|
||||
- Agents/local models: derive context-window guard thresholds from the effective model window with 4k/8k safety floors, so small local models are no longer rejected by fixed 16k/32k preflight cutoffs. Fixes #42999. Thanks @chengjialu8888.
|
||||
- PDF extraction: resolve PDF.js standard fonts from the installed package root and pass a filesystem path to the Node fallback extractor, so built-in font PDFs render without `file://` URL lookup failures. Fixes #51455; carries forward #70936, #54447, and #62175. Thanks @anyech, @JuanRdBO, and @solomonneas.
|
||||
- Media: treat legacy Word/OLE attachments with `application/msword` or `application/x-cfb` MIME as binary so printable-looking `.doc` files are not embedded into prompts as text. Fixes #54176; carries forward #54380. Thanks @andyliu.
|
||||
- Config: accept documented `browser.tabCleanup` keys in strict root config validation, so configured tab cleanup no longer fails before runtime reads it. Fixes #74577. Thanks @lonexreb and @ezdlp.
|
||||
- Cron: validate disabled job schedule edits before persisting updates, so invalid cron changes no longer partially mutate stored jobs. Fixes #74459. Thanks @yfge.
|
||||
|
||||
@@ -1,7 +1,11 @@
|
||||
import { existsSync } from "node:fs";
|
||||
import { createRequire } from "node:module";
|
||||
import path from "node:path";
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
const { canvasSizes, pdfDocument } = vi.hoisted(() => ({
|
||||
const { canvasSizes, getDocumentMock, pdfDocument } = vi.hoisted(() => ({
|
||||
canvasSizes: [] as Array<{ width: number; height: number }>,
|
||||
getDocumentMock: vi.fn(),
|
||||
pdfDocument: {
|
||||
numPages: 2,
|
||||
getPage: vi.fn(async () => ({
|
||||
@@ -16,7 +20,7 @@ const { canvasSizes, pdfDocument } = vi.hoisted(() => ({
|
||||
}));
|
||||
|
||||
vi.mock("pdfjs-dist/legacy/build/pdf.mjs", () => ({
|
||||
getDocument: vi.fn(() => ({ promise: Promise.resolve(pdfDocument) })),
|
||||
getDocument: getDocumentMock,
|
||||
}));
|
||||
|
||||
vi.mock("@napi-rs/canvas", () => ({
|
||||
@@ -30,9 +34,13 @@ vi.mock("@napi-rs/canvas", () => ({
|
||||
|
||||
import { createPdfDocumentExtractor } from "./document-extractor.js";
|
||||
|
||||
const require = createRequire(import.meta.url);
|
||||
|
||||
describe("PDF document extractor", () => {
|
||||
beforeEach(() => {
|
||||
canvasSizes.length = 0;
|
||||
getDocumentMock.mockReset();
|
||||
getDocumentMock.mockReturnValue({ promise: Promise.resolve(pdfDocument) });
|
||||
pdfDocument.getPage.mockClear();
|
||||
});
|
||||
|
||||
@@ -59,4 +67,34 @@ describe("PDF document extractor", () => {
|
||||
expect(result?.images).toHaveLength(1);
|
||||
expect(canvasSizes).toEqual([{ width: 10, height: 10 }]);
|
||||
});
|
||||
|
||||
it("passes standardFontDataUrl to pdfjs getDocument as a package-root filesystem path", async () => {
  // Run one extraction so the mocked getDocument records the parameters it was given.
  const request = {
    buffer: Buffer.from("%PDF-1.4"),
    mimeType: "application/pdf",
    maxPages: 1,
    maxPixels: 4_000_000,
    minTextChars: 200,
  };
  const extractor = createPdfDocumentExtractor();
  await extractor.extract(request);

  expect(getDocumentMock).toHaveBeenCalledTimes(1);
  const firstCall = getDocumentMock.mock.calls[0] ?? [];
  const [params] = firstCall;
  expect(params).toMatchObject({ disableWorker: true });
  expect(typeof params.standardFontDataUrl).toBe("string");

  // The forwarded value must equal "<pdfjs-dist package root>/standard_fonts/".
  const pdfJsPackageRoot = path.dirname(require.resolve("pdfjs-dist/package.json"));
  const expectedStandardFontDataUrl = path.join(pdfJsPackageRoot, "standard_fonts") + "/";
  expect(params.standardFontDataUrl).toBe(expectedStandardFontDataUrl);

  // Shape checks: an absolute filesystem path with a trailing "/", not a file:// URL.
  expect(path.isAbsolute(params.standardFontDataUrl)).toBe(true);
  expect(params.standardFontDataUrl.endsWith("/")).toBe(true);
  expect(params.standardFontDataUrl.startsWith("file://")).toBe(false);

  // The directory and one known font file must exist in the installed package.
  expect(existsSync(params.standardFontDataUrl)).toBe(true);
  const sampleFontPath = path.join(params.standardFontDataUrl, "LiberationSans-Regular.ttf");
  expect(existsSync(sampleFontPath)).toBe(true);
});
|
||||
});
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
import { createRequire } from "node:module";
|
||||
import path from "node:path";
|
||||
import type {
|
||||
DocumentExtractedImage,
|
||||
DocumentExtractionRequest,
|
||||
DocumentExtractionResult,
|
||||
DocumentExtractorPlugin,
|
||||
} from "openclaw/plugin-sdk/document-extractor";
|
||||
import type * as PdfJsLegacy from "pdfjs-dist/legacy/build/pdf.mjs";
|
||||
|
||||
type CanvasLike = {
|
||||
toBuffer(type: "image/png"): Buffer;
|
||||
@@ -37,19 +40,17 @@ type PdfDocument = {
|
||||
getPage(pageNumber: number): Promise<PdfPage>;
|
||||
};
|
||||
|
||||
type PdfJsModule = {
|
||||
getDocument(params: { data: Uint8Array; disableWorker?: boolean }): {
|
||||
promise: Promise<PdfDocument>;
|
||||
};
|
||||
};
|
||||
type PdfJsModule = typeof PdfJsLegacy;
|
||||
|
||||
const CANVAS_MODULE = "@napi-rs/canvas";
|
||||
const PDFJS_MODULE = "pdfjs-dist/legacy/build/pdf.mjs";
|
||||
const MAX_EXTRACTED_TEXT_CHARS = 200_000;
|
||||
const MAX_RENDER_DIMENSION = 10_000;
|
||||
const require = createRequire(import.meta.url);
|
||||
|
||||
let canvasModulePromise: Promise<CanvasModule> | null = null;
|
||||
let pdfJsModulePromise: Promise<PdfJsModule> | null = null;
|
||||
let pdfJsStandardFontDataPath: string | null = null;
|
||||
|
||||
async function loadCanvasModule(): Promise<CanvasModule> {
|
||||
if (!canvasModulePromise) {
|
||||
@@ -75,6 +76,15 @@ async function loadPdfJsModule(): Promise<PdfJsModule> {
|
||||
return pdfJsModulePromise;
|
||||
}
|
||||
|
||||
/**
 * Returns the `standard_fonts` directory of the installed `pdfjs-dist` package.
 *
 * The package root is located by resolving `pdfjs-dist/package.json`, and a
 * literal "/" is appended after the joined directory path. The computed value
 * is stored in the module-level `pdfJsStandardFontDataPath` and reused on
 * later calls.
 *
 * @returns The font directory path, always ending in "/".
 */
function resolvePdfJsStandardFontDataPath(): string {
  if (pdfJsStandardFontDataPath) {
    return pdfJsStandardFontDataPath;
  }
  const packageRoot = path.dirname(require.resolve("pdfjs-dist/package.json"));
  const fontDirectory = `${path.join(packageRoot, "standard_fonts")}/`;
  pdfJsStandardFontDataPath = fontDirectory;
  return fontDirectory;
}
|
||||
|
||||
function appendTextWithinLimit(parts: string[], pageText: string, currentLength: number): number {
|
||||
if (!pageText) {
|
||||
return currentLength;
|
||||
@@ -139,10 +149,11 @@ async function extractPdfContent(
|
||||
request: DocumentExtractionRequest,
|
||||
): Promise<DocumentExtractionResult> {
|
||||
const pdfJsModule = await loadPdfJsModule();
|
||||
const pdf = await pdfJsModule.getDocument({
|
||||
const pdf = (await pdfJsModule.getDocument({
|
||||
data: new Uint8Array(request.buffer),
|
||||
disableWorker: true,
|
||||
}).promise;
|
||||
standardFontDataUrl: resolvePdfJsStandardFontDataPath(),
|
||||
}).promise) as PdfDocument;
|
||||
|
||||
const effectivePages: number[] = request.pageNumbers
|
||||
? request.pageNumbers.filter((p) => p >= 1 && p <= pdf.numPages).slice(0, request.maxPages)
|
||||
|
||||
26
src/types/pdfjs-dist-legacy.d.ts
vendored
Normal file
26
src/types/pdfjs-dist-legacy.d.ts
vendored
Normal file
@@ -0,0 +1,26 @@
|
||||
/**
 * Ambient typings for the `pdfjs-dist/legacy/build/pdf.mjs` entry point.
 *
 * Re-exports a subset of the types declared under `pdfjs-dist/types` and
 * declares `getDocument` with an init-parameter type that additionally
 * accepts an optional `disableWorker` flag.
 */
declare module "pdfjs-dist/legacy/build/pdf.mjs" {
  import type {
    DocumentInitParameters,
    PDFDocumentLoadingTask,
    TypedArray,
  } from "pdfjs-dist/types/src/display/api.js";

  // Types surfaced unchanged from the package's own declaration files.
  export type { PageViewport } from "pdfjs-dist/types/src/display/display_utils.js";
  export type {
    DocumentInitParameters,
    PDFDocumentLoadingTask,
    PDFDocumentProxy,
    PDFPageProxy,
    TextContent,
    TextItem,
    TypedArray,
  } from "pdfjs-dist/types/src/display/api.js";

  /** `DocumentInitParameters` plus an optional `disableWorker` flag. */
  export type LegacyDocumentInitParameters = DocumentInitParameters & { disableWorker?: boolean };

  /** Accepts a URL, raw bytes, or an init-parameter object and returns a loading task. */
  export function getDocument(
    src?: string | URL | TypedArray | ArrayBuffer | LegacyDocumentInitParameters,
  ): PDFDocumentLoadingTask;
}
|
||||
Reference in New Issue
Block a user