From 604a0240677c24261bd3ca842af535d60fa9e65b Mon Sep 17 00:00:00 2001
From: Frank Yang
Date: Wed, 15 Apr 2026 16:09:19 +0800
Subject: [PATCH] fix: harden host-read text fallback

---
Review amendments (this section is ignored by `git am`):
- decodeHostReadTextSample decodes the UTF-8 sample in streaming mode when the file is
  longer than the sample window, so a multi-byte character straddling byte 8192 no longer
  makes the fatal decoder reject a valid CSV/Markdown file.
- The it.each title uses $label instead of %s so the row label is what gets printed.
- TSDoc added to the new helpers; hunk counts and diffstat updated accordingly. The blob
  ids on the index lines and the commit id above predate these amendments.

 src/media/web-media.test.ts |  22 +++++++
 src/media/web-media.ts      | 142 +++++++++++++++++++++++++++++++-----
 2 files changed, 149 insertions(+), 15 deletions(-)

diff --git a/src/media/web-media.test.ts b/src/media/web-media.test.ts
index 213c43354e7..fbfd6d062f9 100644
--- a/src/media/web-media.test.ts
+++ b/src/media/web-media.test.ts
@@ -228,6 +228,28 @@ describe("loadWebMedia", () => {
     });
   });
 
+  it.each([
+    { label: "CSV", fileName: "opaque.csv" },
+    { label: "Markdown", fileName: "opaque.md" },
+  ])("rejects opaque non-NUL binary data disguised as $label", async ({ fileName }) => {
+    const fakeTextFile = path.join(fixtureRoot, fileName);
+    const opaqueBinary = Buffer.alloc(9000);
+    for (let i = 0; i < opaqueBinary.length; i += 1) {
+      opaqueBinary[i] = (i % 255) + 1;
+    }
+    await fs.writeFile(fakeTextFile, opaqueBinary);
+    await expect(
+      loadWebMedia(fakeTextFile, {
+        maxBytes: 1024 * 1024,
+        localRoots: "any",
+        readFile: async (filePath) => await fs.readFile(filePath),
+        hostReadCapability: true,
+      }),
+    ).rejects.toMatchObject({
+      code: "path-not-allowed",
+    });
+  });
+
   it("rejects traversal-style canvas media paths before filesystem access", async () => {
     await expect(
       loadWebMedia(`${CANVAS_HOST_PATH}/documents/../collection.media/tiny.png`),
diff --git a/src/media/web-media.ts b/src/media/web-media.ts
index feb2a75e631..9fa01e942e1 100644
--- a/src/media/web-media.ts
+++ b/src/media/web-media.ts
@@ -87,24 +87,136 @@ const HOST_READ_ALLOWED_DOCUMENT_MIMES = new Set([
   "text/markdown",
 ]);
 // file-type returns undefined (no magic bytes) for plain-text formats like CSV and
-// Markdown. These MIME types are allowed via extension + null-byte check only.
+// Markdown, so host-read needs an explicit "this really decodes as text" fallback.
 const HOST_READ_TEXT_PLAIN_ALIASES = new Set(["text/csv", "text/markdown"]);
+const HOST_READ_TEXT_SAMPLE_BYTES = 8192;
 const MB = 1024 * 1024;
+const WORDISH_CHAR = /[\p{L}\p{N}]/u;
 
-// Returns true only if every byte in the buffer is text-safe: no null bytes and no C0
-// control characters other than the standard whitespace group (tab 0x09, LF 0x0A,
-// VT 0x0B, FF 0x0C, CR 0x0D). This is the same heuristic used by `git` and `file` to
-// distinguish text from binary. Bytes ≄ 0x80 are allowed so that UTF-8, Latin-1, and
-// Windows-1252 encoded files all pass.
-function looksLikeText(buffer: Buffer): boolean {
-  for (let i = 0; i < buffer.length; i++) {
-    const b = buffer[i];
-    // Reject null (0x00–0x08) and remaining C0 controls (0x0E–0x1F) and DEL (0x7F).
-    if (b < 0x09 || (b >= 0x0e && b <= 0x1f) || b === 0x7f) {
-      return false;
+/**
+ * Guesses whether `buffer` holds UTF-16 text. A byte-order mark decides directly; otherwise
+ * more than 20% NUL bytes in the first 2048 bytes is treated as UTF-16, with the byte order
+ * chosen by whether the NULs sit mostly at odd (LE) or even (BE) offsets.
+ */
+function resolveUtf16Charset(buffer?: Buffer): "utf-16le" | "utf-16be" | undefined {
+  if (!buffer || buffer.length < 2) {
+    return undefined;
+  }
+  const b0 = buffer[0];
+  const b1 = buffer[1];
+  if (b0 === 0xff && b1 === 0xfe) {
+    return "utf-16le";
+  }
+  if (b0 === 0xfe && b1 === 0xff) {
+    return "utf-16be";
+  }
+  const sampleLen = Math.min(buffer.length, 2048);
+  let zeroEven = 0;
+  let zeroOdd = 0;
+  for (let i = 0; i < sampleLen; i += 1) {
+    if (buffer[i] !== 0) {
+      continue;
+    }
+    if (i % 2 === 0) {
+      zeroEven += 1;
+    } else {
+      zeroOdd += 1;
     }
   }
-  return true;
+  const zeroCount = zeroEven + zeroOdd;
+  if (sampleLen > 0 && zeroCount / sampleLen > 0.2) {
+    return zeroOdd >= zeroEven ? "utf-16le" : "utf-16be";
+  }
+  return undefined;
+}
+
+/**
+ * Returns the share of code points in `text` that are printable and the share that are
+ * "wordish" (letters, digits, or tab/LF/CR/space). Control characters count as neither.
+ */
+function getTextStats(text: string): { printableRatio: number; wordishRatio: number } {
+  if (!text) {
+    return { printableRatio: 0, wordishRatio: 0 };
+  }
+  let printable = 0;
+  let control = 0;
+  let wordish = 0;
+  for (const char of text) {
+    const code = char.codePointAt(0) ?? 0;
+    if (code === 9 || code === 10 || code === 13 || code === 32) {
+      printable += 1;
+      wordish += 1;
+      continue;
+    }
+    if (code < 32 || (code >= 0x7f && code <= 0x9f)) {
+      control += 1;
+      continue;
+    }
+    printable += 1;
+    if (WORDISH_CHAR.test(char)) {
+      wordish += 1;
+    }
+  }
+  const total = printable + control;
+  if (total === 0) {
+    return { printableRatio: 0, wordishRatio: 0 };
+  }
+  return { printableRatio: printable / total, wordishRatio: wordish / total };
+}
+
+/**
+ * Decodes the first HOST_READ_TEXT_SAMPLE_BYTES of `buffer` as UTF-16 (when detected) or
+ * strict UTF-8. Returns undefined when the strict UTF-8 decode fails.
+ */
+function decodeHostReadTextSample(buffer: Buffer): string | undefined {
+  const sample = buffer.subarray(0, Math.min(buffer.length, HOST_READ_TEXT_SAMPLE_BYTES));
+  if (sample.length === 0) {
+    return "";
+  }
+  const utf16Charset = resolveUtf16Charset(sample);
+  try {
+    if (utf16Charset === "utf-16be") {
+      const evenSample = sample.length % 2 === 0 ? sample : sample.subarray(0, sample.length - 1);
+      if (evenSample.length === 0) {
+        return "";
+      }
+      const swapped = Buffer.alloc(evenSample.length);
+      for (let i = 0; i + 1 < evenSample.length; i += 2) {
+        swapped[i] = evenSample[i + 1];
+        swapped[i + 1] = evenSample[i];
+      }
+      return new TextDecoder("utf-16le").decode(swapped);
+    }
+    if (utf16Charset === "utf-16le") {
+      const evenSample = sample.length % 2 === 0 ? sample : sample.subarray(0, sample.length - 1);
+      return new TextDecoder("utf-16le").decode(evenSample);
+    }
+    // A sample cut from a longer file may end mid-character; streaming mode defers that
+    // incomplete tail instead of letting the fatal decoder reject valid UTF-8 text.
+    const truncated = sample.length < buffer.length;
+    return new TextDecoder("utf-8", { fatal: true }).decode(sample, { stream: truncated });
+  } catch {
+    return undefined;
+  }
+}
+
+/**
+ * Accepts an empty buffer, or one whose decoded sample is more than 95% printable and more
+ * than 20% wordish. Rejects a missing buffer or a sample that cannot be decoded.
+ */
+function isValidatedHostReadText(buffer?: Buffer): boolean {
+  if (!buffer) {
+    return false;
+  }
+  if (buffer.length === 0) {
+    return true;
+  }
+  const text = decodeHostReadTextSample(buffer);
+  if (text === undefined) {
+    return false;
+  }
+  const { printableRatio, wordishRatio } = getTextStats(text);
+  return printableRatio > 0.95 && wordishRatio > 0.2;
 }
 
 function formatMb(bytes: number, digits = 2): string {
@@ -159,13 +271,13 @@ function assertHostReadMediaAllowed(params: {
   // plain-text buffers that have no binary magic bytes. Allow these formats when:
   // - sniffedMime is undefined (no binary signature detected by file-type)
   // - The extension-derived MIME is text/csv or text/markdown (operator intent)
-  // - Every byte in the buffer passes the text-safety check (no binary control chars)
+  // - The buffer decodes as actual text instead of opaque binary bytes
   if (
     !sniffedMime &&
     normalizedMime &&
     HOST_READ_TEXT_PLAIN_ALIASES.has(normalizedMime) &&
     params.buffer &&
-    looksLikeText(params.buffer)
+    isValidatedHostReadText(params.buffer)
   ) {
     return;
   }