From 2c236a9f4c9c83d2bccd67e5a450d6f8759f7acd Mon Sep 17 00:00:00 2001 From: Chen Chia Yang Date: Wed, 15 Apr 2026 19:04:41 +0800 Subject: [PATCH] fix(media): remove UTF-16 decoding from host-read text validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TextDecoder("utf-16le/be") never throws on arbitrary byte pairs — every pair of bytes is a valid Unicode scalar, so an attacker can prepend a UTF-16 BOM (0xFF 0xFE) to binary garbage, give the file a .csv/.md extension, and pass getTextStats with printableRatio≈1.0, bypassing the host-read security boundary. Remove resolveUtf16Charset and the UTF-16 branches from decodeHostReadText. The Latin-1 fallback (gated by hasSingleByteTextShape) already covers the most common non-UTF-8 real-world case: Excel CSV exports with accented characters like é, ñ. UTF-16 CSVs are extremely rare and users can trivially re-save as UTF-8. Adds two regression tests: - NUL-padded (0x00/0xFF) must be rejected - BOM-prefixed (0xFF 0xFE + 0xFF garbage) must be rejected Co-Authored-By: Claude Sonnet 4.6 --- src/media/web-media.test.ts | 34 +++++++++++++++++++++++------- src/media/web-media.ts | 41 ++++++------------------------------- 2 files changed, 33 insertions(+), 42 deletions(-) diff --git a/src/media/web-media.test.ts b/src/media/web-media.test.ts index 5398e67ad02..a320bef04a7 100644 --- a/src/media/web-media.test.ts +++ b/src/media/web-media.test.ts @@ -335,14 +335,10 @@ describe("loadWebMedia", () => { it.each([ { label: "CSV", fileName: "nul-padded.csv" }, { label: "Markdown", fileName: "nul-padded.md" }, - ])("rejects NUL-padded binary data disguised as %s (UTF-16 heuristic bypass)", async ({ fileName }) => { + ])("rejects NUL-padded binary data disguised as %s", async ({ fileName }) => { const fakeTextFile = path.join(fixtureRoot, fileName); - // Alternating 0x00/0xFF — no BOM, but old NUL-heuristic would classify as UTF-16. - // Decoded as UTF-16 every pair becomes a printable code point (e.g. U+FF00), - // so getTextStats returns printableRatio=1.0 and the file would have been allowed. - // After requiring a BOM for UTF-16, decodeHostReadText falls through to the UTF-8 - // strict path (throws on 0xFF), then hasSingleByteTextShape rejects due to control - // bytes (0x00 < 0x20), so the upload is correctly rejected. + // Alternating 0x00/0xFF — UTF-8 decode fails (0xFF is invalid UTF-8), then + // hasSingleByteTextShape rejects because 0x00 bytes are control chars (< 0x20). const nulPadded = Buffer.alloc(9000); for (let i = 0; i < nulPadded.length; i += 1) { nulPadded[i] = i % 2 === 0 ? 0x00 : 0xff; @@ -360,6 +356,30 @@ describe("loadWebMedia", () => { }); }); + it.each([ + { label: "CSV", fileName: "bom-binary.csv" }, + { label: "Markdown", fileName: "bom-binary.md" }, + ])("rejects UTF-16 BOM-prefixed binary data disguised as %s", async ({ fileName }) => { + const fakeTextFile = path.join(fixtureRoot, fileName); + // UTF-16LE BOM + repeating 0xFF bytes: if UTF-16 decoding were attempted, + // every byte pair would produce a printable code point and pass getTextStats. + // With UTF-16 decoding removed, falls through to UTF-8 strict decode (throws + // on 0xFF), then hasSingleByteTextShape rejects due to high-byte ratio > 30%. + const bom = Buffer.from([0xff, 0xfe]); + const garbage = Buffer.alloc(9000, 0xff); + await fs.writeFile(fakeTextFile, Buffer.concat([bom, garbage])); + await expect( + loadWebMedia(fakeTextFile, { + maxBytes: 1024 * 1024, + localRoots: "any", + readFile: async (filePath) => await fs.readFile(filePath), + hostReadCapability: true, + }), + ).rejects.toMatchObject({ + code: "path-not-allowed", + }); + }); + it.each([ { label: "CSV", fileName: "alternating-high.csv" }, { label: "Markdown", fileName: "alternating-high.md" }, diff --git a/src/media/web-media.ts b/src/media/web-media.ts index 5e42227248e..cdcb5596510 100644 --- a/src/media/web-media.ts +++ b/src/media/web-media.ts @@ -91,24 +91,6 @@ const HOST_READ_ALLOWED_DOCUMENT_MIMES = new Set([ const HOST_READ_TEXT_PLAIN_ALIASES = new Set(["text/csv", "text/markdown"]); const MB = 1024 * 1024; -function resolveUtf16Charset(buffer?: Buffer): "utf-16le" | "utf-16be" | undefined { - if (!buffer || buffer.length < 2) { - return undefined; - } - const b0 = buffer[0]; - const b1 = buffer[1]; - // Only trust a BOM — the NUL-heavy heuristic cannot be used as a security gate - // because TextDecoder("utf-16le") never throws and all byte pairs produce printable - // code points, allowing opaque NUL-padded binaries to pass the text-stats check. - if (b0 === 0xff && b1 === 0xfe) { - return "utf-16le"; - } - if (b0 === 0xfe && b1 === 0xff) { - return "utf-16be"; - } - return undefined; -} - function getTextStats(text: string): { printableRatio: number } { if (!text) { return { printableRatio: 0 }; @@ -158,24 +140,13 @@ function decodeHostReadText(buffer: Buffer): string | undefined { if (buffer.length === 0) { return ""; } - const utf16Charset = resolveUtf16Charset(buffer); + // UTF-16 decoding is intentionally omitted: TextDecoder("utf-16le/be") never throws on + // arbitrary byte pairs, so every byte pair is a valid (if meaningless) Unicode scalar — + // an attacker can prepend a BOM and pass getTextStats with printableRatio≈1.0 on pure + // binary garbage. The Latin-1 path below already covers the most common non-UTF-8 + // real-world case (Excel CSV exports with accented chars like é, ñ) while remaining + // safe because hasSingleByteTextShape gates on byte shape *before* any decode. try { - if (utf16Charset === "utf-16be") { - const evenBuffer = buffer.length % 2 === 0 ? buffer : buffer.subarray(0, buffer.length - 1); - if (evenBuffer.length === 0) { - return ""; - } - const swapped = Buffer.alloc(evenBuffer.length); - for (let i = 0; i + 1 < evenBuffer.length; i += 2) { - swapped[i] = evenBuffer[i + 1]; - swapped[i + 1] = evenBuffer[i]; - } - return new TextDecoder("utf-16le").decode(swapped); - } - if (utf16Charset === "utf-16le") { - const evenBuffer = buffer.length % 2 === 0 ? buffer : buffer.subarray(0, buffer.length - 1); - return new TextDecoder("utf-16le").decode(evenBuffer); - } return new TextDecoder("utf-8", { fatal: true }).decode(buffer); } catch { if (!hasSingleByteTextShape(buffer)) {