diff --git a/src/agents/tools/web-fetch-utils.ts b/src/agents/tools/web-fetch-utils.ts index 01fc1beada8..0df64b531a3 100644 --- a/src/agents/tools/web-fetch-utils.ts +++ b/src/agents/tools/web-fetch-utils.ts @@ -6,24 +6,57 @@ export type ExtractMode = "markdown" | "text"; const READABILITY_MAX_HTML_CHARS = 1_000_000; const READABILITY_MAX_ESTIMATED_NESTING_DEPTH = 3_000; +type ParsedHtml = { + document: Document; +}; + +type ParseHtml = (html: string) => ParsedHtml; + +type ReadabilityResult = { + content?: string; + textContent?: string | null; + title?: string | null; +}; + +type ReadabilityInstance = { + parse(): ReadabilityResult | null; +}; + +type ReadabilityConstructor = new ( + document: Document, + options: { charThreshold: number }, +) => ReadabilityInstance; + +type ReadabilityModule = { + Readability: ReadabilityConstructor; +}; + +type LinkedomModule = { + parseHTML: ParseHtml; +}; + +const READABILITY_MODULE = "@mozilla/readability"; +const LINKEDOM_MODULE = "linkedom"; + let readabilityDepsPromise: | Promise<{ - Readability: typeof import("@mozilla/readability").Readability; - parseHTML: typeof import("linkedom").parseHTML; + Readability: ReadabilityConstructor; + parseHTML: ParseHtml; }> | undefined; async function loadReadabilityDeps(): Promise<{ - Readability: typeof import("@mozilla/readability").Readability; - parseHTML: typeof import("linkedom").parseHTML; + Readability: ReadabilityConstructor; + parseHTML: ParseHtml; }> { if (!readabilityDepsPromise) { - readabilityDepsPromise = Promise.all([import("@mozilla/readability"), import("linkedom")]).then( - ([readability, linkedom]) => ({ - Readability: readability.Readability, - parseHTML: linkedom.parseHTML, - }), - ); + readabilityDepsPromise = Promise.all([ + import(READABILITY_MODULE) as Promise, + import(LINKEDOM_MODULE) as Promise, + ]).then(([readability, linkedom]) => ({ + Readability: readability.Readability, + parseHTML: linkedom.parseHTML, + })); } try { return await readabilityDepsPromise; diff --git a/src/agents/tools/web-fetch-visibility.ts b/src/agents/tools/web-fetch-visibility.ts index 70ce89bdfc9..ad1a3a77696 100644 --- a/src/agents/tools/web-fetch-visibility.ts +++ b/src/agents/tools/web-fetch-visibility.ts @@ -26,10 +26,24 @@ const HIDDEN_CLASS_NAMES = new Set([ "offscreen", ]); -let parseHtmlPromise: Promise | null = null; +type ParsedHtml = { + document: Document; +}; -async function loadParseHTML(): Promise { - parseHtmlPromise ??= import("linkedom").then(({ parseHTML }) => parseHTML); +type ParseHtml = (html: string) => ParsedHtml; + +type LinkedomModule = { + parseHTML: ParseHtml; +}; + +const LINKEDOM_MODULE = "linkedom"; + +let parseHtmlPromise: Promise | null = null; + +async function loadParseHTML(): Promise { + parseHtmlPromise ??= (import(LINKEDOM_MODULE) as Promise).then( + ({ parseHTML }) => parseHTML, + ); return parseHtmlPromise; } diff --git a/src/auto-reply/reply/export-html/template.security.test.ts b/src/auto-reply/reply/export-html/template.security.test.ts index 9a42fd22337..8a2844af32d 100644 --- a/src/auto-reply/reply/export-html/template.security.test.ts +++ b/src/auto-reply/reply/export-html/template.security.test.ts @@ -3,7 +3,6 @@ import path from "node:path"; import vm from "node:vm"; import { fileURLToPath } from "node:url"; import { describe, expect, it } from "vitest"; -import { parseHTML } from "linkedom"; type SessionEntry = { id: string; @@ -28,13 +27,39 @@ type SessionData = { tools: unknown[]; }; +type ParsedHtml = { + document: Document; + window: { + HTMLElement?: { + prototype?: { + scrollIntoView?: () => void; + }; + }; + }; +}; + +type LinkedomModule = { + parseHTML(html: string): ParsedHtml; +}; + +const LINKEDOM_MODULE = "linkedom"; + const exportHtmlDir = path.dirname(fileURLToPath(import.meta.url)); const templateHtml = fs.readFileSync(path.join(exportHtmlDir, "template.html"), "utf8"); const templateJs = fs.readFileSync(path.join(exportHtmlDir, "template.js"), "utf8"); const markedJs = fs.readFileSync(path.join(exportHtmlDir, "vendor", "marked.min.js"), "utf8"); const highlightJs = fs.readFileSync(path.join(exportHtmlDir, "vendor", "highlight.min.js"), "utf8"); -function renderTemplate(sessionData: SessionData) { +let parseHtmlPromise: Promise | null = null; + +async function loadParseHTML(): Promise { + parseHtmlPromise ??= (import(LINKEDOM_MODULE) as Promise).then( + ({ parseHTML }) => parseHTML, + ); + return parseHtmlPromise; +} + +async function renderTemplate(sessionData: SessionData) { const html = templateHtml .replace("{{CSS}}", "") .replace("{{SESSION_DATA}}", Buffer.from(JSON.stringify(sessionData), "utf8").toString("base64")) @@ -42,6 +67,7 @@ function renderTemplate(sessionData: SessionData) { .replace("{{HIGHLIGHT_JS}}", "") .replace("{{JS}}", ""); + const parseHTML = await loadParseHTML(); const { document, window } = parseHTML(html); if (window.HTMLElement?.prototype) { window.HTMLElement.prototype.scrollIntoView = () => {}; @@ -80,7 +106,7 @@ function now() { } describe("export html security hardening", () => { - it("escapes raw HTML from markdown blocks", () => { + it("escapes raw HTML from markdown blocks", async () => { const attack = ""; const session: SessionData = { header: { id: "session-1", timestamp: now() }, @@ -114,14 +140,14 @@ describe("export html security hardening", () => { tools: [], }; - const { document } = renderTemplate(session); + const { document } = await renderTemplate(session); const messages = document.getElementById("messages"); expect(messages).toBeTruthy(); expect(messages?.querySelector("img[onerror]")).toBeNull(); expect(messages?.innerHTML).toContain("<img src=x onerror=alert(1)>"); }); - it("escapes tree and header metadata fields", () => { + it("escapes tree and header metadata fields", async () => { const attack = ""; const baseEntries: SessionEntry[] = [ { @@ -181,7 +207,7 @@ describe("export html security hardening", () => { tools: [], }; - const { document } = renderTemplate(headerSession); + const { document } = await renderTemplate(headerSession); const tree = document.getElementById("tree-container"); const header = document.getElementById("header-container"); expect(tree).toBeTruthy(); @@ -198,7 +224,7 @@ describe("export html security hardening", () => { systemPrompt: "", tools: [], }; - const modelLeaf = renderTemplate(modelLeafSession).document; + const modelLeaf = (await renderTemplate(modelLeafSession)).document; expect(modelLeaf.getElementById("tree-container")?.querySelector("img[onerror]")).toBeNull(); expect(modelLeaf.getElementById("tree-container")?.innerHTML).toContain( "<img src=x onerror=alert(9)>", @@ -211,14 +237,14 @@ describe("export html security hardening", () => { systemPrompt: "", tools: [], }; - const thinkingLeaf = renderTemplate(thinkingLeafSession).document; + const thinkingLeaf = (await renderTemplate(thinkingLeafSession)).document; expect(thinkingLeaf.getElementById("tree-container")?.querySelector("img[onerror]")).toBeNull(); expect(thinkingLeaf.getElementById("tree-container")?.innerHTML).toContain( "<img src=x onerror=alert(9)>", ); }); - it("sanitizes image MIME types used in data URLs", () => { + it("sanitizes image MIME types used in data URLs", async () => { const session: SessionData = { header: { id: "session-3", timestamp: now() }, entries: [ @@ -244,14 +270,14 @@ describe("export html security hardening", () => { tools: [], }; - const { document } = renderTemplate(session); + const { document } = await renderTemplate(session); const img = document.querySelector("#messages .message-image"); expect(img).toBeTruthy(); expect(img?.getAttribute("onerror")).toBeNull(); expect(img?.getAttribute("src")).toBe("data:application/octet-stream;base64,AAAA"); }); - it("flattens remote markdown images but keeps data-image markdown", () => { + it("flattens remote markdown images but keeps data-image markdown", async () => { const dataImage = "data:image/png;base64,AAAA"; const session: SessionData = { header: { id: "session-4", timestamp: now() }, @@ -277,7 +303,7 @@ describe("export html security hardening", () => { tools: [], }; - const { document } = renderTemplate(session); + const { document } = await renderTemplate(session); const messages = document.getElementById("messages"); expect(messages).toBeTruthy(); expect(messages?.querySelector('img[src^="https://"]')).toBeNull(); @@ -285,7 +311,7 @@ describe("export html security hardening", () => { expect(messages?.querySelector(`img[src="${dataImage}"]`)).toBeTruthy(); }); - it("escapes markdown data-image attributes", () => { + it("escapes markdown data-image attributes", async () => { const dataImage = "data:image/png;base64,AAAA"; const session: SessionData = { header: { id: "session-5", timestamp: now() }, @@ -311,7 +337,7 @@ describe("export html security hardening", () => { tools: [], }; - const { document } = renderTemplate(session); + const { document } = await renderTemplate(session); const img = document.querySelector("#messages img"); expect(img).toBeTruthy(); expect(img?.getAttribute("onerror")).toBeNull();