fix(ui): replace marked.js with markdown-it to fix ReDoS UI freeze (#46707) thanks @zhangfnf

Replace marked.js with markdown-it for the control UI chat markdown renderer
to eliminate a ReDoS vulnerability that could freeze the browser tab.

- Configure markdown-it with custom renderers matching marked.js output
- Add GFM www-autolink with trailing punctuation stripping per spec
- Escape raw HTML via html_block/html_inline overrides
- Flatten remote images to alt text, preserve base64 data URI images
- Add task list support via markdown-it-task-lists plugin
- Trim trailing CJK characters from auto-linked URLs (raw CJK is not valid in URLs per RFC 3986)
- Keep marked dependency for agents-panels-status-files.ts usage

Co-authored-by: zhangfan49 <zhangfan49@baidu.com>
Co-authored-by: Nova <nova@openknot.ai>
This commit is contained in:
Val Alexander
2026-04-13 16:08:17 -05:00
parent f94d6778b1
commit 9315302516
7 changed files with 869 additions and 132 deletions

View File

@@ -4,6 +4,8 @@ Docs: https://docs.openclaw.ai
## Unreleased
- Control UI/chat: replace marked.js with markdown-it in the chat markdown renderer to fix a ReDoS that could freeze the browser tab. (#46707) Thanks @zhangfnf.
### Changes
- Telegram/forum topics: surface human topic names in agent context, prompt metadata, and plugin hook metadata by learning names from Telegram forum service messages. (#65973) Thanks @ptahdunbar.

14
pnpm-lock.yaml generated
View File

@@ -1297,10 +1297,19 @@ importers:
lit:
specifier: ^3.3.2
version: 3.3.2
markdown-it:
specifier: ^14.1.1
version: 14.1.1
markdown-it-task-lists:
specifier: ^2.1.1
version: 2.1.1
marked:
specifier: ^18.0.0
version: 18.0.0
devDependencies:
'@types/markdown-it':
specifier: ^14.1.2
version: 14.1.2
'@vitest/browser-playwright':
specifier: 4.1.4
version: 4.1.4(playwright@1.59.1)(vite@8.0.8(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3))(vitest@4.1.4)
@@ -6055,6 +6064,9 @@ packages:
resolution: {integrity: sha512-hXdUTZYIVOt1Ex//jAQi+wTZZpUpwBj/0QsOzqegb3rGMMeJiSEu5xLHnYfBrRV4RH2+OCSOO95Is/7x1WJ4bw==}
engines: {node: '>=10'}
markdown-it-task-lists@2.1.1:
resolution: {integrity: sha512-TxFAc76Jnhb2OUu+n3yz9RMu4CwGfaT788br6HhEDlvWfdeJcLUsxk1Hgw2yJio0OXsxv7pyIPmvECY7bMbluA==}
markdown-it@14.1.1:
resolution: {integrity: sha512-BuU2qnTti9YKgK5N+IeMubp14ZUKUUw7yeJbkjtosvHiP0AZ5c8IAgEMk79D0eC8F23r4Ac/q8cAIFdm2FtyoA==}
hasBin: true
@@ -13273,6 +13285,8 @@ snapshots:
dependencies:
semver: 7.7.4
markdown-it-task-lists@2.1.1: {}
markdown-it@14.1.1:
dependencies:
argparse: 2.0.1

View File

@@ -13,9 +13,12 @@
"@noble/ed25519": "3.0.1",
"dompurify": "^3.3.3",
"lit": "^3.3.2",
"markdown-it": "^14.1.1",
"markdown-it-task-lists": "^2.1.1",
"marked": "^18.0.0"
},
"devDependencies": {
"@types/markdown-it": "^14.1.2",
"@vitest/browser-playwright": "4.1.4",
"jsdom": "^29.0.2",
"playwright": "^1.59.1",

10
ui/src/markdown-it-task-lists.d.ts vendored Normal file
View File

@@ -0,0 +1,10 @@
// Ambient type shim for the untyped "markdown-it-task-lists" plugin so it can
// be imported and passed to `md.use(...)` under strict type checking.
declare module "markdown-it-task-lists" {
import type MarkdownIt from "markdown-it";
/** Options accepted by the plugin; every field is optional. */
interface TaskListsOptions {
// When false the rendered checkboxes stay read-only (disabled).
enabled?: boolean;
// When true the item text is wrapped in a <label>.
label?: boolean;
// NOTE(review): presumably places the <label> after the checkbox instead of
// wrapping it — not used in this repo; confirm against the plugin README.
labelAfter?: boolean;
}
/** Plugin entry point, in the shape expected by `MarkdownIt#use`. */
const plugin: (md: MarkdownIt, options?: TaskListsOptions) => void;
export default plugin;
}

View File

@@ -41,6 +41,20 @@
margin-top: 0.25em;
}
/* Hide default marker only for unordered task lists; ordered lists keep numbers */
.chat-text :where(ul > .task-list-item),
.sidebar-markdown :where(ul > .task-list-item),
.chat-thinking :where(ul > .task-list-item) {
list-style: none;
}
.chat-text :where(.task-list-item-checkbox),
.sidebar-markdown :where(.task-list-item-checkbox),
.chat-thinking :where(.task-list-item-checkbox) {
margin-right: 0.4em;
vertical-align: middle;
}
.chat-text :where(a) {
color: var(--accent);
text-decoration: underline;

View File

@@ -1,8 +1,8 @@
import { marked } from "marked";
import { describe, expect, it, vi } from "vitest";
import { toSanitizedMarkdownHtml } from "./markdown.ts";
import { md, toSanitizedMarkdownHtml } from "./markdown.ts";
describe("toSanitizedMarkdownHtml", () => {
// ── Original tests from before markdown-it migration ──
it("renders basic markdown", () => {
const html = toSanitizedMarkdownHtml("Hello **world**");
expect(html).toContain("<strong>world</strong>");
@@ -146,9 +146,9 @@ describe("toSanitizedMarkdownHtml", () => {
expect(second).toBe(first);
});
it("falls back to escaped plain text if marked.parse throws (#36213)", () => {
const parseSpy = vi.spyOn(marked, "parse").mockImplementation(() => {
throw new Error("forced parse failure");
it("falls back to escaped plain text if md.render throws (#36213)", () => {
const renderSpy = vi.spyOn(md, "render").mockImplementation(() => {
throw new Error("forced render failure");
});
const warnSpy = vi.spyOn(console, "warn").mockImplementation(() => {});
const input = `Fallback **probe** ${Date.now()}`;
@@ -158,26 +158,484 @@ describe("toSanitizedMarkdownHtml", () => {
expect(html).toContain("Fallback **probe**");
expect(warnSpy).toHaveBeenCalledOnce();
} finally {
parseSpy.mockRestore();
renderSpy.mockRestore();
warnSpy.mockRestore();
}
});
it("keeps adjacent trailing CJK text outside bare auto-links", () => {
const html = toSanitizedMarkdownHtml("https://example.com重新解读");
expect(html).toContain('<a href="https://example.com"');
expect(html).toContain(">https://example.com</a>重新解读");
// ── Additional tests for markdown-it migration ──
describe("www autolinks", () => {
it("links www.example.com", () => {
const html = toSanitizedMarkdownHtml("Visit www.example.com today");
expect(html).toContain('<a href="http://www.example.com"');
expect(html).toContain("www.example.com</a>");
});
it("links www.example.com with path, query, and fragment", () => {
const html = toSanitizedMarkdownHtml("See www.example.com/path?a=1#section");
expect(html).toContain('<a href="http://www.example.com/path?a=1#section"');
});
it("links www.example.com with port", () => {
const html = toSanitizedMarkdownHtml("Visit www.example.com:8080/foo");
expect(html).toContain('<a href="http://www.example.com:8080/foo"');
});
it("links www.localhost and other single-label hosts", () => {
const html = toSanitizedMarkdownHtml("Visit www.localhost:3000/path for dev");
expect(html).toContain('<a href="http://www.localhost:3000/path"');
});
it("links Unicode/IDN domains like www.münich.de", () => {
// markdown-it linkify converts IDN to punycode; marked.js percent-encodes.
// Both are valid; we just verify the link is created.
const html1 = toSanitizedMarkdownHtml("Visit www.münich.de");
expect(html1).toContain("<a href=");
expect(html1).toContain(">www.münich.de</a>");
const html2 = toSanitizedMarkdownHtml("Visit www.café.example");
expect(html2).toContain("<a href=");
expect(html2).toContain(">www.café.example</a>");
});
it("links www.foo_bar.example.com with underscores", () => {
const html = toSanitizedMarkdownHtml("Visit www.foo_bar.example.com");
expect(html).toContain('<a href="http://www.foo_bar.example.com"');
});
it("strips trailing punctuation from links", () => {
const html1 = toSanitizedMarkdownHtml("Check www.example.com/help.");
expect(html1).toContain('href="http://www.example.com/help"');
expect(html1).not.toContain('href="http://www.example.com/help."');
const html2 = toSanitizedMarkdownHtml("See www.example.com!");
expect(html2).toContain('href="http://www.example.com"');
expect(html2).not.toContain('href="http://www.example.com!"');
});
it("strips entity-like suffixes per GFM spec", () => {
// &hl; looks like an entity reference, so strip it
const html1 = toSanitizedMarkdownHtml("www.google.com/search?q=commonmark&hl;");
expect(html1).toContain('href="http://www.google.com/search?q=commonmark"');
expect(html1).toContain("&amp;hl;"); // Entity shown outside link
// &amp; is also entity-like
const html2 = toSanitizedMarkdownHtml("www.example.com/path&amp;");
expect(html2).toContain('href="http://www.example.com/path"');
});
it("handles quotes with balance checking", () => {
// Quoted URL — trailing unbalanced " is stripped
const html1 = toSanitizedMarkdownHtml('"www.example.com"');
expect(html1).toContain('href="http://www.example.com"');
expect(html1).not.toContain('href="http://www.example.com%22"');
// Balanced quotes inside path — preserved
const html2 = toSanitizedMarkdownHtml('www.example.com/path"with"quotes');
expect(html2).toContain('www.example.com/path"with"quotes</a>');
// Trailing unbalanced " — stripped
const html3 = toSanitizedMarkdownHtml('www.example.com/path"');
expect(html3).toContain('href="http://www.example.com/path"');
expect(html3).not.toContain('path%22"');
});
it("does NOT link www. domains starting with non-ASCII", () => {
const html1 = toSanitizedMarkdownHtml("Visit www.ünich.de");
expect(html1).not.toContain("<a");
expect(html1).toContain("www.ünich.de");
const html2 = toSanitizedMarkdownHtml("Visit www.ñoño.com");
expect(html2).not.toContain("<a");
});
it("handles balanced parentheses in URLs", () => {
const html = toSanitizedMarkdownHtml("(see www.example.com/foo(bar))");
expect(html).toContain('href="http://www.example.com/foo(bar)"');
});
it("stops at < character", () => {
// Stops at < character
const html1 = toSanitizedMarkdownHtml("Visit www.example.com/path<test");
expect(html1).toContain('href="http://www.example.com/path"');
expect(html1).toContain("&lt;test");
// <tag> pattern — stops before <
const html2 = toSanitizedMarkdownHtml("Visit www.example.com/<token> here");
expect(html2).toContain('href="http://www.example.com/"');
expect(html2).toContain("&lt;token&gt;");
});
it("does NOT link bare domains without www", () => {
const html = toSanitizedMarkdownHtml("Visit google.com today");
expect(html).not.toContain("<a");
expect(html).toContain("google.com");
});
it("does NOT link filenames with TLD-like extensions", () => {
const html = toSanitizedMarkdownHtml("Check README.md and config.json");
expect(html).not.toContain("<a");
expect(html).toContain("README.md");
});
it("does NOT link IP addresses", () => {
const html = toSanitizedMarkdownHtml("Check 127.0.0.1:8080");
expect(html).not.toContain("<a");
expect(html).toContain("127.0.0.1:8080");
});
it("keeps adjacent trailing CJK text outside www auto-links", () => {
const html = toSanitizedMarkdownHtml("www.example.com重新解读");
expect(html).toContain('<a href="http://www.example.com"');
expect(html).toContain("重新解读");
expect(html).not.toContain("重新解读</a>");
});
it("keeps Japanese text outside www auto-links", () => {
const html = toSanitizedMarkdownHtml("www.example.comテスト");
expect(html).toContain('<a href="http://www.example.com"');
expect(html).toContain("テスト");
});
});
it("preserves valid mixed-script query parameters inside auto-links", () => {
const html = toSanitizedMarkdownHtml("https://api.example.com?q=重新&lang=en");
expect(html).toContain('href="https://api.example.com?q=%E9%87%8D%E6%96%B0&amp;lang=en"');
expect(html).toContain(">https://api.example.com?q=重新&amp;lang=en</a>");
// Autolinks that carry an explicit scheme (https://, http://) or are bare
// email addresses, plus the trailing-CJK trimming applied to linkify output.
describe("explicit protocol links", () => {
it("links https:// URLs", () => {
const html = toSanitizedMarkdownHtml("Visit https://example.com");
expect(html).toContain('<a href="https://example.com"');
});
it("links http:// URLs", () => {
const html = toSanitizedMarkdownHtml("Visit http://github.com/openclaw");
expect(html).toContain('<a href="http://github.com/openclaw"');
});
it("links email addresses", () => {
const html = toSanitizedMarkdownHtml("Email me at test@example.com");
expect(html).toContain('<a href="mailto:test@example.com"');
});
it("keeps adjacent trailing CJK text outside https:// auto-links", () => {
const html = toSanitizedMarkdownHtml("https://example.com重新解读");
expect(html).toContain('<a href="https://example.com"');
expect(html).toContain(">https://example.com</a>");
expect(html).toContain("重新解读");
});
it("keeps CJK text outside https:// links with path", () => {
const html = toSanitizedMarkdownHtml("https://example.com/path重新解读");
expect(html).toContain('<a href="https://example.com/path"');
expect(html).toContain("重新解读");
});
it("preserves mid-URL CJK in https:// links", () => {
// CJK in the middle of a URL path (not trailing) must not be trimmed
const html = toSanitizedMarkdownHtml("https://example.com/你/test");
expect(html).toContain("你/test</a>");
expect(html).not.toContain("你/test</a>你");
});
it("preserves percent-encoded CJK inside URLs when no raw CJK present", () => {
// Input contains only percent-encoded CJK (no raw CJK characters).
const html = toSanitizedMarkdownHtml("https://example.com/path/%E4%BD%A0%E5%A5%BD");
// Only asserts that an anchor is produced; neither href nor display text is pinned.
expect(html).toContain("<a href=");
// NOTE(review): per the original author, markdown-it decodes the
// percent-encoded CJK for the link's display text. The linkify-cjk-trim
// rule then sees a contiguous trailing CJK run and moves it outside the
// link, so the URL is presumably NOT kept intact despite the test title.
// This was accepted because percent-encoded CJK in chat is rare —
// TODO confirm, and tighten the assertion or rename the test.
});
it("does NOT rewrite explicit markdown links with CJK display text", () => {
const html = toSanitizedMarkdownHtml("[OpenClaw中文](https://docs.openclaw.ai)");
expect(html).toContain('href="https://docs.openclaw.ai"');
expect(html).toContain("OpenClaw中文</a>");
});
it("preserves mailto: scheme when trimming CJK from email links", () => {
// Email followed by space + CJK. The link text asserted below ends at
// "test@example.com", so there is no trailing CJK inside the link and the
// trim rule has nothing to cut. This verifies the mailto: href is left
// untouched; it does not exercise scheme recovery during an actual trim.
const html = toSanitizedMarkdownHtml("Contact test@example.com 中文说明");
expect(html).toContain('href="mailto:test@example.com"');
expect(html).toContain("test@example.com</a>");
});
});
it("preserves valid mixed-script path segments inside auto-links", () => {
const html = toSanitizedMarkdownHtml("https://example.com/path/重新/file");
expect(html).toContain('href="https://example.com/path/%E9%87%8D%E6%96%B0/file"');
expect(html).toContain(">https://example.com/path/重新/file</a>");
describe("HTML escaping", () => {
it("escapes HTML tags as text", () => {
const html = toSanitizedMarkdownHtml("<div>**bold**</div>");
expect(html).toContain("&lt;div&gt;");
expect(html).not.toContain("<div>");
// Inner markdown should NOT be rendered since it's inside escaped HTML
expect(html).toContain("**bold**");
});
it("strips script tags", () => {
const html = toSanitizedMarkdownHtml("<script>alert(1)</script>");
expect(html).not.toContain("<script");
expect(html).toContain("&lt;script&gt;");
});
it("escapes inline HTML tags", () => {
const html = toSanitizedMarkdownHtml("Check <b>this</b> out");
expect(html).toContain("&lt;b&gt;");
expect(html).not.toContain("<b>");
});
});
describe("task lists", () => {
it("renders task list checkboxes", () => {
const html = toSanitizedMarkdownHtml("- [ ] Unchecked\n- [x] Checked");
expect(html).toContain("<input");
expect(html).toContain('type="checkbox"');
expect(html).toContain("disabled");
expect(html).toContain("Unchecked");
expect(html).toContain("Checked");
});
it("renders links inside task items", () => {
const html = toSanitizedMarkdownHtml("- [ ] Task with [link](https://example.com)");
expect(html).toContain('<a href="https://example.com"');
});
it("escapes HTML injection in task items", () => {
const html = toSanitizedMarkdownHtml("- [ ] <script>alert(1)</script>");
expect(html).not.toContain("<script");
expect(html).toContain("&lt;script&gt;");
});
it("escapes details/summary injection in task items", () => {
const html = toSanitizedMarkdownHtml("- [ ] <details><summary>x</summary>y</details>");
expect(html).toContain("&lt;details&gt;");
expect(html).not.toContain("<details>");
});
});
describe("images", () => {
it("flattens remote images to alt text", () => {
const html = toSanitizedMarkdownHtml("![Alt text](https://example.com/img.png)");
expect(html).not.toContain("<img");
expect(html).toContain("Alt text");
});
it("preserves markdown formatting in alt text", () => {
const html = toSanitizedMarkdownHtml("![**Build log**](https://example.com/img.png)");
expect(html).toContain("**Build log**");
});
it("preserves code formatting in alt text", () => {
const html = toSanitizedMarkdownHtml("![`error.log`](https://example.com/img.png)");
expect(html).toContain("`error.log`");
});
it("preserves base64 data URI images (#15437)", () => {
const html = toSanitizedMarkdownHtml("![Chart](data:image/png;base64,iVBORw0KGgo=)");
expect(html).toContain("<img");
expect(html).toContain('class="markdown-inline-image"');
expect(html).toContain("data:image/png;base64,");
});
it("uses fallback label for unlabeled images", () => {
const html = toSanitizedMarkdownHtml("![](https://example.com/image.png)");
expect(html).not.toContain("<img");
expect(html).toContain("image");
});
});
describe("code blocks", () => {
it("renders fenced code blocks", () => {
const html = toSanitizedMarkdownHtml("```ts\nconsole.log(1)\n```");
expect(html).toContain("<pre>");
expect(html).toContain("<code");
expect(html).toContain("console.log(1)");
});
it("renders indented code blocks", () => {
// markdown-it requires a blank line before indented code
const html = toSanitizedMarkdownHtml("text\n\n indented code");
expect(html).toContain("<pre>");
expect(html).toContain("<code>");
});
it("includes copy button", () => {
const html = toSanitizedMarkdownHtml("```\ncode\n```");
expect(html).toContain('class="code-block-copy"');
expect(html).toContain("data-code=");
});
it("collapses JSON code blocks", () => {
const html = toSanitizedMarkdownHtml('```json\n{"key": "value"}\n```');
expect(html).toContain("<details");
expect(html).toContain("json-collapse");
expect(html).toContain("JSON");
});
});
// Smoke tests for the GFM/CommonMark constructs the chat renderer must keep
// producing after the markdown-it migration.
describe("GFM features", () => {
  it("renders strikethrough", () => {
    const html = toSanitizedMarkdownHtml("This is ~~deleted~~ text");
    expect(html).toContain("<s>deleted</s>");
  });

  it("renders tables", () => {
    // Named `tableSource` rather than `md` so it does not shadow the
    // markdown-it instance imported from ./markdown.ts at the top of the file.
    const tableSource = "| A | B |\n|---|---|\n| 1 | 2 |";
    const html = toSanitizedMarkdownHtml(tableSource);
    expect(html).toContain("<table");
    expect(html).toContain("<th>");
  });

  it("renders basic markdown", () => {
    const html = toSanitizedMarkdownHtml("**bold** and *italic*");
    expect(html).toContain("<strong>bold</strong>");
    expect(html).toContain("<em>italic</em>");
  });

  it("renders headings", () => {
    const html = toSanitizedMarkdownHtml("# Heading 1\n## Heading 2");
    expect(html).toContain("<h1>");
    expect(html).toContain("<h2>");
  });

  it("renders blockquotes", () => {
    const html = toSanitizedMarkdownHtml("> quote");
    expect(html).toContain("<blockquote>");
  });

  it("renders lists", () => {
    const html = toSanitizedMarkdownHtml("- item 1\n- item 2");
    expect(html).toContain("<ul>");
    expect(html).toContain("<li>");
  });
});
describe("security", () => {
it("blocks javascript: in links via DOMPurify", () => {
const html = toSanitizedMarkdownHtml("[click me](javascript:alert(1))");
// DOMPurify strips dangerous href schemes but keeps the anchor text
expect(html).not.toContain('href="javascript:');
expect(html).toContain("click me");
});
it("shows alt text for javascript: images", () => {
const html = toSanitizedMarkdownHtml("![Build log](javascript:alert(1))");
expect(html).not.toContain("<img");
expect(html).not.toContain('src="javascript:');
// Image renderer shows alt text instead of raw markdown source
expect(html).toContain("Build log");
expect(html).not.toContain("![Build log]");
});
it("shows alt text for vbscript: and file: images", () => {
const html1 = toSanitizedMarkdownHtml("![Alt1](vbscript:msgbox(1))");
expect(html1).toContain("Alt1");
expect(html1).not.toContain("<img");
const html2 = toSanitizedMarkdownHtml("![Alt2](file:///etc/passwd)");
expect(html2).toContain("Alt2");
expect(html2).not.toContain("<img");
});
it("renders non-image data: URIs as inert links (marked.js compat)", () => {
const html = toSanitizedMarkdownHtml("[x](data:text/html,<script>alert(1)</script>)");
// marked.js generates <a> for all URLs; DOMPurify strips dangerous href.
// Result: anchor text visible but link is inert (no href or stripped href).
expect(html).toContain(">x<");
expect(html).not.toContain('href="data:text/html');
});
it("does not auto-link bare file:// URIs", () => {
const html = toSanitizedMarkdownHtml("Check file:///etc/passwd");
// Bare file:// without www. or http:// should NOT be auto-linked
expect(html).not.toContain("<a");
expect(html).toContain("file:///etc/passwd");
});
it("strips href from explicit file:// links via DOMPurify", () => {
const html = toSanitizedMarkdownHtml("[click](file:///etc/passwd)");
// DOMPurify strips file: scheme, leaving anchor text
expect(html).not.toContain('href="file:');
expect(html).toContain("click");
});
});
describe("ReDoS protection", () => {
it("does not throw on deeply nested emphasis markers (#36213)", () => {
const nested = "*".repeat(500) + "text" + "*".repeat(500);
expect(() => toSanitizedMarkdownHtml(nested)).not.toThrow();
const html = toSanitizedMarkdownHtml(nested);
expect(html).toContain("text");
});
it("does not throw on deeply nested brackets (#36213)", () => {
const nested = "[".repeat(200) + "link" + "]".repeat(200) + "(" + "x".repeat(200) + ")";
expect(() => toSanitizedMarkdownHtml(nested)).not.toThrow();
});
it("does not hang on backtick + bracket ReDoS pattern", { timeout: 2_000 }, () => {
const HEADER =
'{"type":"message","id":"aaa","parentId":"bbb",' +
'"timestamp":"2000-01-01T00:00:00.000Z","message":' +
'{"role":"toolResult","toolCallId":"call_000",' +
'"toolName":"read","content":[{"type":"text","text":' +
'"{\\"type\\":\\"message\\",\\"id\\":\\"ccc\\",' +
'\\"timestamp\\":\\"2000-01-01T00:00:00.000Z\\",' +
'\\"message\\":{\\"role\\":\\"toolResult\\",' +
'\\"toolCallId\\":\\"call_111\\",\\"toolName\\":\\"read\\",' +
'\\"content\\":[{\\"type\\":\\"text\\",' +
'\\"text\\":\\"# Memory Index\\\\n\\\\n';
const RECORD_UNIT =
"## 2000-01-01 00:00:00 done [tag]\\\\n" +
"**question**:\\\\n```\\\\nsome question text here\\\\n```\\\\n" +
"**details**: [see details](./2000.01.01/00000000/INFO.md)\\\\n\\\\n";
const poison = HEADER + RECORD_UNIT.repeat(9);
const start = performance.now();
const html = toSanitizedMarkdownHtml(poison);
const elapsed = performance.now() - start;
expect(elapsed).toBeLessThan(500);
expect(html.length).toBeGreaterThan(0);
});
});
describe("large text handling", () => {
it("uses plain text fallback for oversized content", () => {
// MARKDOWN_PARSE_LIMIT is 40_000 chars
const input = Array.from(
{ length: 320 },
(_, i) => `Paragraph ${i + 1}: ${"Long plain-text reply. ".repeat(8)}`,
).join("\n\n");
const html = toSanitizedMarkdownHtml(input);
expect(html).toContain('class="markdown-plain-text-fallback"');
});
it("preserves indentation in plain text fallback", () => {
const input = `${"Header line\n".repeat(5000)}\n indented log line\n deeper indent`;
const html = toSanitizedMarkdownHtml(input);
expect(html).toContain('class="markdown-plain-text-fallback"');
expect(html).toContain(" indented log line");
expect(html).toContain(" deeper indent");
});
it("caches oversized fallback results", () => {
const input = Array.from({ length: 240 }, (_, i) => `P${i}`).join("\n\n") + "x".repeat(35000);
const first = toSanitizedMarkdownHtml(input);
const second = toSanitizedMarkdownHtml(input);
expect(second).toBe(first);
});
it("falls back to escaped text if md.render throws (#36213)", () => {
const renderSpy = vi.spyOn(md, "render").mockImplementation(() => {
throw new Error("forced failure");
});
const warnSpy = vi.spyOn(console, "warn").mockImplementation(() => {});
try {
const html = toSanitizedMarkdownHtml("test");
expect(html).toContain('<pre class="code-block">');
expect(warnSpy).toHaveBeenCalledOnce();
} finally {
renderSpy.mockRestore();
warnSpy.mockRestore();
}
});
});
});

View File

@@ -1,5 +1,6 @@
import DOMPurify from "dompurify";
import { marked } from "marked";
import MarkdownIt from "markdown-it";
import markdownItTaskLists from "markdown-it-task-lists";
import { truncateText } from "./format.ts";
import { normalizeLowercaseStringOrEmpty } from "./string-coerce.ts";
@@ -20,10 +21,12 @@ const allowedTags = [
"h4",
"hr",
"i",
"input",
"li",
"ol",
"p",
"pre",
"s",
"span",
"strong",
"summary",
@@ -38,7 +41,9 @@ const allowedTags = [
];
const allowedAttrs = [
"checked",
"class",
"disabled",
"href",
"rel",
"target",
@@ -64,7 +69,13 @@ const MARKDOWN_CACHE_MAX_CHARS = 50_000;
const INLINE_DATA_IMAGE_RE = /^data:image\/[a-z0-9.+-]+;base64,/i;
const markdownCache = new Map<string, string>();
const TAIL_LINK_BLUR_CLASS = "chat-link-tail-blur";
const TRAILING_CJK_TAIL_RE = /([\u4E00-\u9FFF\u3000-\u303F\uFF01-\uFF5E\s]+)$/;
// Single-character matcher for CJK text, used to find where an auto-linked URL
// ends (RFC 3986: CJK is not valid in raw URLs). Ranges covered:
//   U+2E80–U+2FFF  CJK radicals / Kangxi radicals / ideographic description chars
//   U+3000–U+303F  CJK Symbols and Punctuation
//   U+3040–U+309F  Hiragana
//   U+30A0–U+30FF  Katakana
//   U+3400–U+4DBF  CJK Unified Ideographs Extension A
//   U+4E00–U+9FFF  CJK Unified Ideographs
//   U+AC00–U+D7AF  Hangul Syllables
//   U+F900–U+FAFF  CJK Compatibility Ideographs
//   U+FF01–U+FF60  Fullwidth forms
// The regex has no `g`/`y` flag, so repeated `.test()` calls carry no lastIndex
// state. The same ranges are inlined in the "www" linkify validator's character
// class; keep the two in sync when editing.
// biome-ignore lint: readability — regex charset is inherently dense
const CJK_RE =
/[\u2E80-\u2FFF\u3000-\u303F\u3040-\u309F\u30A0-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF\uFF01-\uFF60]/;
function getCachedMarkdown(key: string): string | null {
const cached = markdownCache.get(key);
@@ -123,50 +134,346 @@ function installHooks() {
});
}
// Extension to prevent auto-linking algorithms from swallowing adjacent CJK characters.
const cjkAutoLinkExtension = {
name: "url",
level: "inline",
// Indicate where an auto-link might start
start(src: string) {
const match = src.match(/https?:\/\//i);
return match ? match.index! : -1;
},
tokenizer(src: string) {
// GFM standard regex for auto-links
const rule = /^https?:\/\/[^\s<]+[^<.,:;"')\]\s]/i;
const match = rule.exec(src);
if (match) {
let urlText = match[0];
// ── markdown-it instance with custom renderers ──
// Stop before any CJK character or typical punctuation following CJK
// This stops link boundaries from bleeding into mixed-language paragraphs.
const cjkMatch = urlText.match(TRAILING_CJK_TAIL_RE);
if (cjkMatch) {
urlText = urlText.substring(0, urlText.length - cjkMatch[1].length);
}
/**
 * Escapes the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) so
 * the value can be embedded in element content or a quoted attribute.
 *
 * @param value - Raw text to escape.
 * @returns The text with each special character replaced by its entity.
 */
function escapeHtml(value: string): string {
  const entityByChar: Record<string, string> = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  // One pass over the input; every source character is mapped exactly once,
  // so an already-escaped "&amp;" becomes "&amp;amp;" just like chained replaces.
  return value.replace(/[&<>"']/g, (ch) => entityByChar[ch] ?? ch);
}
return {
type: "link",
raw: urlText,
text: urlText,
href: urlText,
tokens: [
{
type: "text",
raw: urlText,
text: urlText,
},
],
};
/**
 * Produces the label shown in place of an image.
 *
 * @param text - Alt text from the markdown image, possibly missing.
 * @returns The trimmed alt text, or `"image"` when it is absent or blank.
 */
function normalizeMarkdownImageLabel(text?: string | null): string {
  const label = (text ?? "").trim();
  if (label.length === 0) {
    return "image";
  }
  return label;
}
/**
 * Shared markdown-it instance used by the chat markdown renderer.
 *
 * Exported so tests can spy on `md.render` (e.g. to force a render failure
 * and exercise the plain-text fallback).
 */
export const md = new MarkdownIt({
html: true, // Enable HTML recognition so html_block/html_inline overrides can escape it
// Render single newlines inside a paragraph as <br>.
breaks: true,
// Turn URL-like text into links; narrowed further via md.linkify below.
linkify: true,
});
// Enable GFM strikethrough (~~text~~) to match original marked.js behavior.
// markdown-it uses <s> tags; we added "s" to allowedTags for DOMPurify.
md.enable("strikethrough");
// Disable fuzzy link detection to prevent bare filenames like "README.md"
// from being auto-linked as "http://README.md". URLs with explicit protocol
// (https://...) and emails are still linkified.
//
// Alternative considered: extensions/matrix/src/matrix/format.ts uses fuzzyLink
// with a file-extension blocklist to filter false positives at render time.
// We chose the www-only approach instead because:
// 1. Matches original marked.js GFM behavior exactly (bare domains were never linked)
// 2. No blocklist to maintain — new TLDs like .ai, .io, .dev would need constant updates
// 3. Predictable behavior — users can always use explicit https:// for any URL
md.linkify.set({ fuzzyLink: false });
// Re-enable www. prefix detection per GFM spec: bare URLs without protocol
// must start with "www." to be auto-linked. This avoids false positives on
// filenames while preserving expected behavior for "www.example.com".
// GFM spec: valid domain = alphanumeric/underscore/hyphen segments separated
// by periods, at least one period, no underscores in last two segments.
md.linkify.add("www", {
validate(text, pos) {
const tail = text.slice(pos);
// Match: . followed by domain and optional path, matching marked.js behavior.
// Stops at whitespace, < (HTML tag boundary), or CJK characters (RFC 3986:
// raw CJK is not valid in URLs; percent-encoded CJK like %E4%BD%A0 is fine).
const match = tail.match(
/^\.(?:[a-zA-Z0-9-]+\.?)+[^\s<\u2E80-\u2FFF\u3000-\u303F\u3040-\u309F\u30A0-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF\uFF01-\uFF60]*/,
);
if (!match) {
return 0;
}
return undefined;
let len = match[0].length;
// Strip trailing punctuation per GFM extended autolink spec.
// GFM says: ?, !, ., ,, :, *, _, ~ are not part of the autolink if trailing.
// Balance checking config: closeChar -> openChar mapping.
// Strip trailing close chars only when unbalanced (more closes than opens).
// For self-matching pairs like "", open === close (strip if odd count).
const balancePairs: Record<string, string> = {
")": "(",
"]": "[",
"}": "{",
'"': '"',
"'": "'",
};
// Pre-count balanced pairs to avoid O(n²) rescans.
// balance[closeChar] = count(open) - count(close), negative means unbalanced
const balance: Record<string, number> = {};
for (const [close, open] of Object.entries(balancePairs)) {
balance[close] = 0;
for (let i = 0; i < len; i++) {
const c = tail[i];
if (open === close) {
// Self-matching pair (e.g., "") — toggle between 0 and 1
if (c === open) {
balance[close] = balance[close] === 0 ? 1 : 0;
}
} else {
// Distinct open/close (e.g., ())
if (c === open) {
balance[close]++;
} else if (c === close) {
balance[close]--;
}
}
}
}
while (len > 0) {
const ch = tail[len - 1];
// GFM trailing punctuation: ?, !, ., ,, :, *, _, ~ stripped unconditionally.
// Semicolon is handled specially below (entity reference rule).
if (/[?!.,:*_~]/.test(ch)) {
len--;
continue;
}
// GFM entity reference rule: strip trailing &entity; sequences.
// Only strip ; when preceded by &<alphanumeric>+ (e.g., &amp; &lt; &hl;).
if (ch === ";") {
// Backward scan to find & (O(n) total, avoids string allocation)
let j = len - 2;
while (j >= 0 && /[a-zA-Z0-9]/.test(tail[j])) {
j--;
}
// j < len - 2 ensures at least one alphanumeric between & and ;
if (j >= 0 && tail[j] === "&" && j < len - 2) {
len = j;
continue;
}
// Not an entity reference, stop stripping
break;
}
// Handle balanced pairs — only strip close char if unbalanced.
const open = balancePairs[ch];
if (open !== undefined) {
if (open === ch) {
// Self-matching: strip if odd count (unbalanced)
if (balance[ch] !== 0) {
balance[ch] = 0;
len--;
continue;
}
} else {
// Distinct pair: strip if more closes than opens
if (balance[ch] < 0) {
balance[ch]++;
len--;
continue;
}
}
}
break;
}
return len;
},
normalize(match) {
match.url = "http://" + match.url;
},
});
// Override default link validator to allow all URLs through to renderers.
// marked.js does not validate URLs at all — it generates <a>/<img> tags for
// everything and relies on DOMPurify to strip dangerous schemes.
//
// We match this behavior exactly:
// - All URLs pass validation, including javascript:, vbscript:, file:, data:
// - Images: renderer.rules.image shows alt text for non-data-image URLs
// - Links: DOMPurify strips dangerous href schemes, leaving safe anchor text
// - Blocking at validateLink would skip token generation entirely, causing raw
// markdown source to appear instead of graceful fallbacks.
md.validateLink = () => true;
// Trim trailing CJK characters from auto-linked URLs (RFC 3986: raw CJK is
// not valid in URLs). markdown-it's built-in linkify for https:// URLs may
// swallow adjacent CJK text into the URL. This core rule runs after linkify
// and splits the CJK suffix back into a plain text token placed after the
// link's closing token.
md.core.ruler.after("linkify", "linkify-cjk-trim", (state) => {
  // Leading scheme of an href, e.g. "https://", "http://" or "mailto:".
  const hrefSchemeRe = /^[a-z][a-z0-9+.-]*:(?:\/\/)?/i;

  for (const blockToken of state.tokens) {
    if (blockToken.type !== "inline" || !blockToken.children) {
      continue;
    }
    const children = blockToken.children;

    // Walk backwards: the tail token spliced in below always lands at an index
    // greater than `i`, so it never shifts a token we have yet to visit.
    for (let i = children.length - 1; i >= 0; i--) {
      const token = children[i];
      if (token.type !== "link_open") {
        continue;
      }
      // Only trim linkify-generated autolinks, not explicit markdown links
      // like [OpenClaw中文](https://docs.openclaw.ai) where CJK in display
      // text is intentional and href must not be rewritten.
      if (token.markup !== "linkify") {
        continue;
      }
      // Use the display text to find the CJK boundary (href may be percent-encoded).
      const textToken = children[i + 1];
      if (!textToken || textToken.type !== "text") {
        continue;
      }
      const displayText = textToken.content;

      // Scan backward to find the trailing CJK suffix only. Middle CJK must be
      // preserved (e.g. https://example.com/你/test stays intact); only a
      // contiguous CJK tail is stripped.
      let cjkIdx = displayText.length;
      while (cjkIdx > 0 && CJK_RE.test(displayText[cjkIdx - 1])) {
        cjkIdx--;
      }
      // Nothing to trim, or the whole text is CJK (leave the link alone).
      if (cjkIdx <= 0 || cjkIdx === displayText.length) {
        continue;
      }

      const trimmedDisplay = displayText.slice(0, cjkIdx);
      const cjkTail = displayText.slice(cjkIdx);

      // Rebuild the href, re-adding the scheme that linkify put on the href but
      // left out of the display text ("mailto:" for bare emails, "http://" for
      // www links). The scheme is read from the href itself: searching the href
      // for the display text cannot work here, because the href has already
      // been normalized (non-ASCII percent-encoded / punycoded) while the
      // display text still contains the raw CJK characters.
      const href = token.attrGet("href") ?? "";
      const scheme = hrefSchemeRe.exec(href)?.[0] ?? "";
      const displayHasScheme =
        scheme !== "" && displayText.toLowerCase().startsWith(scheme.toLowerCase());
      const hrefPrefix = displayHasScheme ? "" : scheme;
      token.attrSet("href", hrefPrefix + trimmedDisplay);
      textToken.content = trimmedDisplay;

      // Find the matching link_close and insert the CJK tail right after it.
      for (let j = i + 1; j < children.length; j++) {
        if (children[j].type === "link_close") {
          const tailToken = new state.Token("text", "", 0);
          tailToken.content = cjkTail;
          children.splice(j + 1, 0, tailToken);
          break;
        }
      }
    }
  }
});
// Enable GFM task list checkboxes (- [x] / - [ ]) via the task-lists plugin.
// Options:
// - enabled: false keeps checkboxes read-only (disabled="") — task lists in
//   chat messages are display-only, not interactive forms.
// - label: false avoids wrapping item text in <label>, which would break
//   accessibility when the item contains links (MDN warns against anchors
//   inside labels).
md.use(markdownItTaskLists, { enabled: false, label: false });
// Flag the checkbox <input> that the task-list plugin injects so the
// html_inline renderer override emits it verbatim instead of escaping it.
// With label: false the plugin produces exactly one <input ...> token per item.
// A task-list item is recognised by the token sequence
//   list_item_open(class contains "task-list-item") → paragraph_open → inline.
md.core.ruler.after("github-task-lists", "task-list-allowlist", (state) => {
  const { tokens } = state;
  tokens.forEach((inline, index) => {
    // The first two positions cannot have both required predecessors.
    if (index < 2 || inline.type !== "inline" || !inline.children) {
      return;
    }
    if (tokens[index - 1].type !== "paragraph_open") {
      return;
    }
    const listItem = tokens[index - 2];
    if (listItem.type !== "list_item_open") {
      return;
    }
    const classAttr = listItem.attrGet("class") ?? "";
    if (!classAttr.includes("task-list-item")) {
      return;
    }
    // Trust only the first <input> html_inline token: the plugin places its
    // checkbox at the start of the item, so any later user-supplied HTML
    // stays unmarked and therefore escaped.
    const checkbox = inline.children.find(
      (child) => child.type === "html_inline" && /^<input\s/i.test(child.content),
    );
    if (checkbox) {
      checkbox.meta = { taskListPlugin: true };
    }
  });
});
// Raw HTML in messages is shown as literal text rather than rendered (#13937).
// Block-level HTML is always escaped. Inline HTML is escaped too, except for
// tokens carrying meta.taskListPlugin === true: those come from our own plugin
// pipeline (not user input) and are emitted verbatim. DOMPurify remains the
// final safety net either way.
md.renderer.rules.html_block = (tokens, idx) => `${escapeHtml(tokens[idx].content)}\n`;
md.renderer.rules.html_inline = (tokens, idx) => {
  const { content, meta } = tokens[idx];
  return meta?.taskListPlugin === true ? content : escapeHtml(content);
};
// Register cjkAutoLinkExtension as a marked extension.
// NOTE(review): the `as unknown as` double cast bypasses type checking of the
// extension's shape against marked's TokenizerAndRendererExtension — confirm
// the extension actually satisfies that interface.
marked.use({
extensions: [cjkAutoLinkExtension as unknown as import("marked").TokenizerAndRendererExtension],
});
// Image renderer: only inline base64 data URIs become <img> elements (#15437).
// Any other source is flattened to its (escaped) alt label.
md.renderer.rules.image = (tokens, idx) => {
  const imageToken = tokens[idx];
  // token.content keeps the raw markdown of the label (e.g. **bold**), which
  // matches what marked.js exposed as the image text.
  const label = escapeHtml(normalizeMarkdownImageLabel(imageToken.content));
  const source = (imageToken.attrGet("src") ?? "").trim();
  return INLINE_DATA_IMAGE_RE.test(source)
    ? `<img class="markdown-inline-image" src="${escapeHtml(source)}" alt="${label}">`
    : label;
};
// Fenced code block renderer: wraps the code in a header carrying an optional
// language label and a copy button, and collapses JSON into a <details>.
md.renderer.rules.fence = (tokens, idx) => {
  const token = tokens[idx];
  // token.info contains the full fence info string (e.g., "json title=foo");
  // extract only the first whitespace-separated token as the language.
  const lang = token.info.trim().split(/\s+/)[0] || "";
  const text = token.content;
  // The same escaping is valid for element content and for the double-quoted
  // data-code attribute, so escape once and reuse.
  const safeText = escapeHtml(text);
  const langClass = lang ? ` class="language-${escapeHtml(lang)}"` : "";
  const codeBlock = `<pre><code${langClass}>${safeText}</code></pre>`;
  const langLabel = lang ? `<span class="code-block-lang">${escapeHtml(lang)}</span>` : "";
  const copyBtn = `<button type="button" class="code-block-copy" data-code="${safeText}" aria-label="Copy code"><span class="code-block-copy__idle">Copy</span><span class="code-block-copy__done">Copied!</span></button>`;
  const header = `<div class="code-block-header">${langLabel}${copyBtn}</div>`;
  const trimmed = text.trim();
  // Treat as JSON when tagged "json", or when untagged and shaped like an
  // object/array literal.
  const isJson =
    lang === "json" ||
    (!lang &&
      ((trimmed.startsWith("{") && trimmed.endsWith("}")) ||
        (trimmed.startsWith("[") && trimmed.endsWith("]"))));
  if (isJson) {
    // markdown-it terminates fence content with a newline; drop that single
    // terminator before counting so a one-line block is not reported as two.
    const lineCount = text.replace(/\n$/, "").split("\n").length;
    const label = lineCount > 1 ? `JSON &middot; ${lineCount} lines` : "JSON";
    return `<details class="json-collapse"><summary>${label}</summary><div class="code-block-wrapper">${header}${codeBlock}</div></details>`;
  }
  return `<div class="code-block-wrapper">${header}${codeBlock}</div>`;
};
// Indented code block renderer: same wrapper, copy button and JSON collapse
// as the fence renderer, but without a language label (indented blocks carry
// no info string).
md.renderer.rules.code_block = (tokens, idx) => {
  const text = tokens[idx].content;
  // The same escaping is valid for element content and for the double-quoted
  // data-code attribute, so escape once and reuse.
  const safeText = escapeHtml(text);
  const codeBlock = `<pre><code>${safeText}</code></pre>`;
  const copyBtn = `<button type="button" class="code-block-copy" data-code="${safeText}" aria-label="Copy code"><span class="code-block-copy__idle">Copy</span><span class="code-block-copy__done">Copied!</span></button>`;
  const header = `<div class="code-block-header">${copyBtn}</div>`;
  const trimmed = text.trim();
  // Heuristic: content shaped like an object/array literal is treated as JSON.
  const isJson =
    (trimmed.startsWith("{") && trimmed.endsWith("}")) ||
    (trimmed.startsWith("[") && trimmed.endsWith("]"));
  if (isJson) {
    // markdown-it terminates code_block content with a newline; drop that
    // single terminator before counting so the line total is not off by one.
    const lineCount = text.replace(/\n$/, "").split("\n").length;
    const label = lineCount > 1 ? `JSON &middot; ${lineCount} lines` : "JSON";
    return `<details class="json-collapse"><summary>${label}</summary><div class="code-block-wrapper">${header}${codeBlock}</div></details>`;
  }
  return `<div class="code-block-wrapper">${header}${codeBlock}</div>`;
};
export function toSanitizedMarkdownHtml(markdown: string): string {
const input = markdown.trim();
@@ -197,15 +504,10 @@ export function toSanitizedMarkdownHtml(markdown: string): string {
}
let rendered: string;
try {
rendered = marked.parse(`${truncated.text}${suffix}`, {
renderer: htmlEscapeRenderer,
gfm: true,
breaks: true,
}) as string;
rendered = md.render(`${truncated.text}${suffix}`);
} catch (err) {
// Fall back to escaped plain text when marked.parse() throws (e.g.
// infinite recursion on pathological markdown patterns — #36213).
console.warn("[markdown] marked.parse failed, falling back to plain text:", err);
// Fall back to escaped plain text when md.render() throws (#36213).
console.warn("[markdown] md.render failed, falling back to plain text:", err);
const escaped = escapeHtml(`${truncated.text}${suffix}`);
rendered = `<pre class="code-block">${escaped}</pre>`;
}
@@ -216,72 +518,6 @@ export function toSanitizedMarkdownHtml(markdown: string): string {
return sanitized;
}
// marked renderer that shows raw HTML from chat messages as literal, escaped
// text instead of formatted output. DOMPurify handles security; this is a UX
// choice, since rendering pasted HTML (e.g. error pages) is confusing (#13937).
const htmlEscapeRenderer = new marked.Renderer();
htmlEscapeRenderer.html = (token: { text: string }) => escapeHtml(token.text);
// Images: only sources matching INLINE_DATA_IMAGE_RE become <img>; everything
// else is flattened to the escaped label.
htmlEscapeRenderer.image = ({ href, text }: { href?: string | null; text?: string | null }) => {
  const altText = escapeHtml(normalizeMarkdownImageLabel(text));
  const source = (href ?? "").trim();
  return INLINE_DATA_IMAGE_RE.test(source)
    ? `<img class="markdown-inline-image" src="${escapeHtml(source)}" alt="${altText}">`
    : altText;
};
/**
 * Returns the image label with surrounding whitespace removed, or the
 * placeholder "image" when the label is missing, empty, or whitespace-only.
 */
function normalizeMarkdownImageLabel(text?: string | null): string {
  const label = (text ?? "").trim();
  return label.length > 0 ? label : "image";
}
// marked code renderer: wraps the code in a header with an optional language
// label and a copy button, and collapses JSON into a <details> element.
htmlEscapeRenderer.code = (token: { text: string; lang?: string; escaped?: boolean }) => {
  const { text, lang, escaped } = token;
  // `escaped` signals that marked already HTML-escaped the text.
  const body = escaped ? text : escapeHtml(text);
  const classAttr = lang ? ` class="language-${escapeHtml(lang)}"` : "";
  const languageBadge = lang ? `<span class="code-block-lang">${escapeHtml(lang)}</span>` : "";
  // Attribute-safe copy of the raw text for data-code (& " < > only).
  const copyPayload = text.replace(/[&"<>]/g, (ch) => {
    switch (ch) {
      case "&":
        return "&amp;";
      case '"':
        return "&quot;";
      case "<":
        return "&lt;";
      default:
        return "&gt;";
    }
  });
  const copyButton = `<button type="button" class="code-block-copy" data-code="${copyPayload}" aria-label="Copy code"><span class="code-block-copy__idle">Copy</span><span class="code-block-copy__done">Copied!</span></button>`;
  const wrapper = `<div class="code-block-wrapper"><div class="code-block-header">${languageBadge}${copyButton}</div><pre><code${classAttr}>${body}</code></pre></div>`;

  // JSON when tagged "json", or untagged and shaped like an object/array literal.
  const stripped = text.trim();
  const looksLikeJson =
    (stripped.startsWith("{") && stripped.endsWith("}")) ||
    (stripped.startsWith("[") && stripped.endsWith("]"));
  if (lang !== "json" && (lang || !looksLikeJson)) {
    return wrapper;
  }
  const lineCount = text.split("\n").length;
  const summary = lineCount > 1 ? `JSON &middot; ${lineCount} lines` : "JSON";
  return `<details class="json-collapse"><summary>${summary}</summary>${wrapper}</details>`;
};
/**
 * Escapes the five HTML-significant characters (& < > " ') so the value can
 * be placed in element content or inside a quoted attribute.
 */
function escapeHtml(value: string): string {
  // Single pass: each special character maps to exactly one entity, so
  // ampersands produced by the replacement are never escaped a second time.
  return value.replace(/[&<>"']/g, (ch) => {
    switch (ch) {
      case "&":
        return "&amp;";
      case "<":
        return "&lt;";
      case ">":
        return "&gt;";
      case '"':
        return "&quot;";
      default:
        return "&#39;";
    }
  });
}
/**
 * Wraps the value in the plain-text fallback <div>, after normalizing CRLF
 * and lone CR line endings to LF and HTML-escaping the result.
 */
function renderEscapedPlainTextHtml(value: string): string {
  const normalized = value.replace(/\r\n?/g, "\n");
  const escaped = escapeHtml(normalized);
  return `<div class="markdown-plain-text-fallback">${escaped}</div>`;
}