mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 11:30:43 +00:00
perf: narrow HTML parser type surface
This commit is contained in:
@@ -6,24 +6,57 @@ export type ExtractMode = "markdown" | "text";
|
||||
// Thresholds for Readability input; the code that enforces them is outside this view.
const READABILITY_MAX_HTML_CHARS = 1_000_000;
const READABILITY_MAX_ESTIMATED_NESTING_DEPTH = 3_000;

/** The only part of the HTML parser's return value that this module declares. */
type ParsedHtml = {
  document: Document;
};

/** Narrow signature for the HTML parser: an HTML string in, a parsed document out. */
type ParseHtml = (html: string) => ParsedHtml;

/** Fields of a Readability parse result declared for this module; every field is optional. */
type ReadabilityResult = {
  content?: string;
  textContent?: string | null;
  title?: string | null;
};

/** Minimal instance shape: a single `parse()` that may return `null`. */
type ReadabilityInstance = {
  parse(): ReadabilityResult | null;
};

/** Constructor shape for Readability, limited to the `charThreshold` option. */
type ReadabilityConstructor = new (
  document: Document,
  options: { charThreshold: number },
) => ReadabilityInstance;

/** Narrow view of the `@mozilla/readability` module namespace. */
type ReadabilityModule = {
  Readability: ReadabilityConstructor;
};

/** Narrow view of the `linkedom` module namespace. */
type LinkedomModule = {
  parseHTML: ParseHtml;
};

// Module specifiers are held in constants and passed to dynamic `import()`.
// Because the specifier is not a string literal, TypeScript does not pull in
// the packages' own typings; the narrow module types above are applied instead.
const READABILITY_MODULE = "@mozilla/readability";
const LINKEDOM_MODULE = "linkedom";

/** Cached promise for the lazily loaded dependencies; `undefined` until first requested. */
let readabilityDepsPromise:
  | Promise<{
      Readability: ReadabilityConstructor;
      parseHTML: ParseHtml;
    }>
  | undefined;
|
||||
|
||||
async function loadReadabilityDeps(): Promise<{
|
||||
Readability: typeof import("@mozilla/readability").Readability;
|
||||
parseHTML: typeof import("linkedom").parseHTML;
|
||||
Readability: ReadabilityConstructor;
|
||||
parseHTML: ParseHtml;
|
||||
}> {
|
||||
if (!readabilityDepsPromise) {
|
||||
readabilityDepsPromise = Promise.all([import("@mozilla/readability"), import("linkedom")]).then(
|
||||
([readability, linkedom]) => ({
|
||||
Readability: readability.Readability,
|
||||
parseHTML: linkedom.parseHTML,
|
||||
}),
|
||||
);
|
||||
readabilityDepsPromise = Promise.all([
|
||||
import(READABILITY_MODULE) as Promise<ReadabilityModule>,
|
||||
import(LINKEDOM_MODULE) as Promise<LinkedomModule>,
|
||||
]).then(([readability, linkedom]) => ({
|
||||
Readability: readability.Readability,
|
||||
parseHTML: linkedom.parseHTML,
|
||||
}));
|
||||
}
|
||||
try {
|
||||
return await readabilityDepsPromise;
|
||||
|
||||
@@ -26,10 +26,24 @@ const HIDDEN_CLASS_NAMES = new Set([
|
||||
"offscreen",
|
||||
]);
|
||||
|
||||
/** The only part of the HTML parser's return value that this module declares. */
type ParsedHtml = {
  document: Document;
};

/** Narrow signature for the HTML parser: an HTML string in, a parsed document out. */
type ParseHtml = (html: string) => ParsedHtml;

/** Narrow view of the `linkedom` module namespace. */
type LinkedomModule = {
  parseHTML: ParseHtml;
};

// Held in a constant so the dynamic `import()` below receives a non-literal
// specifier; TypeScript then does not load linkedom's own typings and the
// narrow `LinkedomModule` cast is used instead.
const LINKEDOM_MODULE = "linkedom";

/** Cached promise for the parser; `null` until the first `loadParseHTML()` call. */
let parseHtmlPromise: Promise<ParseHtml> | null = null;

/**
 * Lazily imports linkedom and resolves to its `parseHTML` function.
 *
 * The import promise is stored on first use and returned on every later call,
 * so the dynamic import is started at most once. A rejected import is cached
 * as well, since the stored promise is never cleared here.
 *
 * @returns A promise for the parser function.
 */
async function loadParseHTML(): Promise<ParseHtml> {
  parseHtmlPromise ??= (import(LINKEDOM_MODULE) as Promise<LinkedomModule>).then(
    ({ parseHTML }) => parseHTML,
  );
  return parseHtmlPromise;
}
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@ import path from "node:path";
|
||||
import vm from "node:vm";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { parseHTML } from "linkedom";
|
||||
|
||||
type SessionEntry = {
|
||||
id: string;
|
||||
@@ -28,13 +27,39 @@ type SessionData = {
|
||||
tools: unknown[];
|
||||
};
|
||||
|
||||
/**
 * Subset of the HTML parser's return value declared for these tests: the
 * parsed document, plus a window whose optional `HTMLElement.prototype` can
 * carry a `scrollIntoView` method.
 */
type ParsedHtml = {
  document: Document;
  window: {
    HTMLElement?: {
      prototype?: {
        scrollIntoView?: () => void;
      };
    };
  };
};

/** Narrow view of the `linkedom` module namespace. */
type LinkedomModule = {
  parseHTML(html: string): ParsedHtml;
};

// Module specifier held in a constant for use with dynamic `import()`.
const LINKEDOM_MODULE = "linkedom";
|
||||
|
||||
// Directory containing this module, derived from its file URL.
const exportHtmlDir = path.dirname(fileURLToPath(import.meta.url));

// Template and vendor sources, read synchronously as UTF-8 when the module loads.
const templateHtml = fs.readFileSync(path.join(exportHtmlDir, "template.html"), "utf8");
const templateJs = fs.readFileSync(path.join(exportHtmlDir, "template.js"), "utf8");
const markedJs = fs.readFileSync(path.join(exportHtmlDir, "vendor", "marked.min.js"), "utf8");
const highlightJs = fs.readFileSync(path.join(exportHtmlDir, "vendor", "highlight.min.js"), "utf8");
|
||||
|
||||
function renderTemplate(sessionData: SessionData) {
|
||||
/** Cached promise for the parser; `null` until the first `loadParseHTML()` call. */
let parseHtmlPromise: Promise<LinkedomModule["parseHTML"]> | null = null;

/**
 * Lazily imports linkedom and resolves to its `parseHTML` function.
 *
 * The import promise is stored on first use and returned on every later call,
 * so the dynamic import is started at most once.
 *
 * @returns A promise for the parser function.
 */
async function loadParseHTML(): Promise<LinkedomModule["parseHTML"]> {
  parseHtmlPromise ??= (import(LINKEDOM_MODULE) as Promise<LinkedomModule>).then(
    ({ parseHTML }) => parseHTML,
  );
  return parseHtmlPromise;
}
|
||||
|
||||
async function renderTemplate(sessionData: SessionData) {
|
||||
const html = templateHtml
|
||||
.replace("{{CSS}}", "")
|
||||
.replace("{{SESSION_DATA}}", Buffer.from(JSON.stringify(sessionData), "utf8").toString("base64"))
|
||||
@@ -42,6 +67,7 @@ function renderTemplate(sessionData: SessionData) {
|
||||
.replace("{{HIGHLIGHT_JS}}", "")
|
||||
.replace("{{JS}}", "");
|
||||
|
||||
const parseHTML = await loadParseHTML();
|
||||
const { document, window } = parseHTML(html);
|
||||
if (window.HTMLElement?.prototype) {
|
||||
window.HTMLElement.prototype.scrollIntoView = () => {};
|
||||
@@ -80,7 +106,7 @@ function now() {
|
||||
}
|
||||
|
||||
describe("export html security hardening", () => {
|
||||
it("escapes raw HTML from markdown blocks", () => {
|
||||
it("escapes raw HTML from markdown blocks", async () => {
|
||||
const attack = "<img src=x onerror=alert(1)>";
|
||||
const session: SessionData = {
|
||||
header: { id: "session-1", timestamp: now() },
|
||||
@@ -114,14 +140,14 @@ describe("export html security hardening", () => {
|
||||
tools: [],
|
||||
};
|
||||
|
||||
const { document } = renderTemplate(session);
|
||||
const { document } = await renderTemplate(session);
|
||||
const messages = document.getElementById("messages");
|
||||
expect(messages).toBeTruthy();
|
||||
expect(messages?.querySelector("img[onerror]")).toBeNull();
|
||||
expect(messages?.innerHTML).toContain("<img src=x onerror=alert(1)>");
|
||||
});
|
||||
|
||||
it("escapes tree and header metadata fields", () => {
|
||||
it("escapes tree and header metadata fields", async () => {
|
||||
const attack = "<img src=x onerror=alert(9)>";
|
||||
const baseEntries: SessionEntry[] = [
|
||||
{
|
||||
@@ -181,7 +207,7 @@ describe("export html security hardening", () => {
|
||||
tools: [],
|
||||
};
|
||||
|
||||
const { document } = renderTemplate(headerSession);
|
||||
const { document } = await renderTemplate(headerSession);
|
||||
const tree = document.getElementById("tree-container");
|
||||
const header = document.getElementById("header-container");
|
||||
expect(tree).toBeTruthy();
|
||||
@@ -198,7 +224,7 @@ describe("export html security hardening", () => {
|
||||
systemPrompt: "",
|
||||
tools: [],
|
||||
};
|
||||
const modelLeaf = renderTemplate(modelLeafSession).document;
|
||||
const modelLeaf = (await renderTemplate(modelLeafSession)).document;
|
||||
expect(modelLeaf.getElementById("tree-container")?.querySelector("img[onerror]")).toBeNull();
|
||||
expect(modelLeaf.getElementById("tree-container")?.innerHTML).toContain(
|
||||
"<img src=x onerror=alert(9)>",
|
||||
@@ -211,14 +237,14 @@ describe("export html security hardening", () => {
|
||||
systemPrompt: "",
|
||||
tools: [],
|
||||
};
|
||||
const thinkingLeaf = renderTemplate(thinkingLeafSession).document;
|
||||
const thinkingLeaf = (await renderTemplate(thinkingLeafSession)).document;
|
||||
expect(thinkingLeaf.getElementById("tree-container")?.querySelector("img[onerror]")).toBeNull();
|
||||
expect(thinkingLeaf.getElementById("tree-container")?.innerHTML).toContain(
|
||||
"<img src=x onerror=alert(9)>",
|
||||
);
|
||||
});
|
||||
|
||||
it("sanitizes image MIME types used in data URLs", () => {
|
||||
it("sanitizes image MIME types used in data URLs", async () => {
|
||||
const session: SessionData = {
|
||||
header: { id: "session-3", timestamp: now() },
|
||||
entries: [
|
||||
@@ -244,14 +270,14 @@ describe("export html security hardening", () => {
|
||||
tools: [],
|
||||
};
|
||||
|
||||
const { document } = renderTemplate(session);
|
||||
const { document } = await renderTemplate(session);
|
||||
const img = document.querySelector("#messages .message-image");
|
||||
expect(img).toBeTruthy();
|
||||
expect(img?.getAttribute("onerror")).toBeNull();
|
||||
expect(img?.getAttribute("src")).toBe("data:application/octet-stream;base64,AAAA");
|
||||
});
|
||||
|
||||
it("flattens remote markdown images but keeps data-image markdown", () => {
|
||||
it("flattens remote markdown images but keeps data-image markdown", async () => {
|
||||
const dataImage = "data:image/png;base64,AAAA";
|
||||
const session: SessionData = {
|
||||
header: { id: "session-4", timestamp: now() },
|
||||
@@ -277,7 +303,7 @@ describe("export html security hardening", () => {
|
||||
tools: [],
|
||||
};
|
||||
|
||||
const { document } = renderTemplate(session);
|
||||
const { document } = await renderTemplate(session);
|
||||
const messages = document.getElementById("messages");
|
||||
expect(messages).toBeTruthy();
|
||||
expect(messages?.querySelector('img[src^="https://"]')).toBeNull();
|
||||
@@ -285,7 +311,7 @@ describe("export html security hardening", () => {
|
||||
expect(messages?.querySelector(`img[src="${dataImage}"]`)).toBeTruthy();
|
||||
});
|
||||
|
||||
it("escapes markdown data-image attributes", () => {
|
||||
it("escapes markdown data-image attributes", async () => {
|
||||
const dataImage = "data:image/png;base64,AAAA";
|
||||
const session: SessionData = {
|
||||
header: { id: "session-5", timestamp: now() },
|
||||
@@ -311,7 +337,7 @@ describe("export html security hardening", () => {
|
||||
tools: [],
|
||||
};
|
||||
|
||||
const { document } = renderTemplate(session);
|
||||
const { document } = await renderTemplate(session);
|
||||
const img = document.querySelector("#messages img");
|
||||
expect(img).toBeTruthy();
|
||||
expect(img?.getAttribute("onerror")).toBeNull();
|
||||
|
||||
Reference in New Issue
Block a user