perf: narrow HTML parser type surface

This commit is contained in:
Peter Steinberger
2026-04-19 01:31:21 +01:00
parent 6f076dcde7
commit 346aa0ed47
3 changed files with 100 additions and 27 deletions

View File

@@ -3,7 +3,6 @@ import path from "node:path";
import vm from "node:vm";
import { fileURLToPath } from "node:url";
import { describe, expect, it } from "vitest";
import { parseHTML } from "linkedom";
type SessionEntry = {
id: string;
@@ -28,13 +27,39 @@ type SessionData = {
tools: unknown[];
};
/** Prototype slice of the parsed window's `HTMLElement`; only `scrollIntoView` is modelled. */
type ScrollIntoViewHost = {
  scrollIntoView?: () => void;
};

/** Constructor-like holder whose `prototype` may be absent on the parsed window. */
type HtmlElementConstructorLike = {
  prototype?: ScrollIntoViewHost;
};

/**
 * Hand-written structural shape of the value returned by `parseHTML`:
 * a `document` plus a `window` exposing an optional `HTMLElement` constructor.
 */
type ParsedHtml = {
  document: Document;
  window: {
    HTMLElement?: HtmlElementConstructorLike;
  };
};
// Structural stand-in for the "linkedom" module namespace: only the
// `parseHTML` export is described, typed against the local `ParsedHtml` shape.
type LinkedomModule = {
parseHTML(html: string): ParsedHtml;
};
// Module specifier passed to the dynamic `import()` in `loadParseHTML`.
// NOTE(review): presumably kept in a constant (rather than an inline string
// literal) so TypeScript does not resolve linkedom's own type declarations —
// confirm against the commit's intent ("narrow HTML parser type surface").
const LINKEDOM_MODULE = "linkedom";
// Directory that contains this module; every asset below is resolved relative to it.
const exportHtmlDir = path.dirname(fileURLToPath(import.meta.url));

// Synchronously reads a UTF-8 text asset located under `exportHtmlDir`.
const readExportAsset = (...segments: string[]): string =>
  fs.readFileSync(path.join(exportHtmlDir, ...segments), "utf8");

// Template sources and vendored scripts, loaded once at module evaluation time.
const templateHtml = readExportAsset("template.html");
const templateJs = readExportAsset("template.js");
const markedJs = readExportAsset("vendor", "marked.min.js");
const highlightJs = readExportAsset("vendor", "highlight.min.js");
function renderTemplate(sessionData: SessionData) {
// Memoized load of linkedom's `parseHTML`; stays `null` until the first request.
let parseHtmlPromise: Promise<LinkedomModule["parseHTML"]> | null = null;

/**
 * Lazily imports the module named by `LINKEDOM_MODULE` and resolves to its
 * `parseHTML` export.
 *
 * The import is started at most once: the first call stores the resulting
 * promise and every later call returns that same promise (including a
 * rejected one, if the import failed).
 *
 * @returns A promise for the module's `parseHTML` function.
 */
async function loadParseHTML(): Promise<LinkedomModule["parseHTML"]> {
  if (parseHtmlPromise === null) {
    // The dynamic import is untyped here, so it is viewed through the local structural type.
    const pendingModule = import(LINKEDOM_MODULE) as Promise<LinkedomModule>;
    parseHtmlPromise = pendingModule.then((linkedom) => linkedom.parseHTML);
  }
  return parseHtmlPromise;
}
async function renderTemplate(sessionData: SessionData) {
const html = templateHtml
.replace("{{CSS}}", "")
.replace("{{SESSION_DATA}}", Buffer.from(JSON.stringify(sessionData), "utf8").toString("base64"))
@@ -42,6 +67,7 @@ function renderTemplate(sessionData: SessionData) {
.replace("{{HIGHLIGHT_JS}}", "")
.replace("{{JS}}", "");
const parseHTML = await loadParseHTML();
const { document, window } = parseHTML(html);
if (window.HTMLElement?.prototype) {
window.HTMLElement.prototype.scrollIntoView = () => {};
@@ -80,7 +106,7 @@ function now() {
}
describe("export html security hardening", () => {
it("escapes raw HTML from markdown blocks", () => {
it("escapes raw HTML from markdown blocks", async () => {
const attack = "<img src=x onerror=alert(1)>";
const session: SessionData = {
header: { id: "session-1", timestamp: now() },
@@ -114,14 +140,14 @@ describe("export html security hardening", () => {
tools: [],
};
const { document } = renderTemplate(session);
const { document } = await renderTemplate(session);
const messages = document.getElementById("messages");
expect(messages).toBeTruthy();
expect(messages?.querySelector("img[onerror]")).toBeNull();
expect(messages?.innerHTML).toContain("&lt;img src=x onerror=alert(1)&gt;");
});
it("escapes tree and header metadata fields", () => {
it("escapes tree and header metadata fields", async () => {
const attack = "<img src=x onerror=alert(9)>";
const baseEntries: SessionEntry[] = [
{
@@ -181,7 +207,7 @@ describe("export html security hardening", () => {
tools: [],
};
const { document } = renderTemplate(headerSession);
const { document } = await renderTemplate(headerSession);
const tree = document.getElementById("tree-container");
const header = document.getElementById("header-container");
expect(tree).toBeTruthy();
@@ -198,7 +224,7 @@ describe("export html security hardening", () => {
systemPrompt: "",
tools: [],
};
const modelLeaf = renderTemplate(modelLeafSession).document;
const modelLeaf = (await renderTemplate(modelLeafSession)).document;
expect(modelLeaf.getElementById("tree-container")?.querySelector("img[onerror]")).toBeNull();
expect(modelLeaf.getElementById("tree-container")?.innerHTML).toContain(
"&lt;img src=x onerror=alert(9)&gt;",
@@ -211,14 +237,14 @@ describe("export html security hardening", () => {
systemPrompt: "",
tools: [],
};
const thinkingLeaf = renderTemplate(thinkingLeafSession).document;
const thinkingLeaf = (await renderTemplate(thinkingLeafSession)).document;
expect(thinkingLeaf.getElementById("tree-container")?.querySelector("img[onerror]")).toBeNull();
expect(thinkingLeaf.getElementById("tree-container")?.innerHTML).toContain(
"&lt;img src=x onerror=alert(9)&gt;",
);
});
it("sanitizes image MIME types used in data URLs", () => {
it("sanitizes image MIME types used in data URLs", async () => {
const session: SessionData = {
header: { id: "session-3", timestamp: now() },
entries: [
@@ -244,14 +270,14 @@ describe("export html security hardening", () => {
tools: [],
};
const { document } = renderTemplate(session);
const { document } = await renderTemplate(session);
const img = document.querySelector("#messages .message-image");
expect(img).toBeTruthy();
expect(img?.getAttribute("onerror")).toBeNull();
expect(img?.getAttribute("src")).toBe("data:application/octet-stream;base64,AAAA");
});
it("flattens remote markdown images but keeps data-image markdown", () => {
it("flattens remote markdown images but keeps data-image markdown", async () => {
const dataImage = "data:image/png;base64,AAAA";
const session: SessionData = {
header: { id: "session-4", timestamp: now() },
@@ -277,7 +303,7 @@ describe("export html security hardening", () => {
tools: [],
};
const { document } = renderTemplate(session);
const { document } = await renderTemplate(session);
const messages = document.getElementById("messages");
expect(messages).toBeTruthy();
expect(messages?.querySelector('img[src^="https://"]')).toBeNull();
@@ -285,7 +311,7 @@ describe("export html security hardening", () => {
expect(messages?.querySelector(`img[src="${dataImage}"]`)).toBeTruthy();
});
it("escapes markdown data-image attributes", () => {
it("escapes markdown data-image attributes", async () => {
const dataImage = "data:image/png;base64,AAAA";
const session: SessionData = {
header: { id: "session-5", timestamp: now() },
@@ -311,7 +337,7 @@ describe("export html security hardening", () => {
tools: [],
};
const { document } = renderTemplate(session);
const { document } = await renderTemplate(session);
const img = document.querySelector("#messages img");
expect(img).toBeTruthy();
expect(img?.getAttribute("onerror")).toBeNull();