From 3ab1088fc2fbc94fe51082969b3e8e6745421507 Mon Sep 17 00:00:00 2001 From: Chen Chia Yang Date: Wed, 15 Apr 2026 18:56:54 +0800 Subject: [PATCH] fix(media): require BOM for UTF-16 detection to close NUL-padded bypass The NUL-heavy heuristic in resolveUtf16Charset was unsafe as a security gate: TextDecoder("utf-16le") never throws, so every byte pair in an opaque binary (e.g. repeating 0x00/0xFF) decodes to a printable code point and passes the text-stats check, allowing the upload. Remove the heuristic; only a leading BOM (0xFF 0xFE / 0xFE 0xFF) now triggers UTF-16 decoding. Without a BOM the strict UTF-8 path runs first, and NUL-padded binaries are then rejected by hasSingleByteTextShape (0x00 bytes are control bytes). Adds a regression test: 9000-byte alternating-NUL/0xFF buffer must be rejected as path-not-allowed. Co-Authored-By: Claude Sonnet 4.6 --- src/media/web-media.test.ts | 28 ++++++++++++++++++++++++++++ src/media/web-media.ts | 20 +++----------------- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/src/media/web-media.test.ts b/src/media/web-media.test.ts index f46dca2136e..5398e67ad02 100644 --- a/src/media/web-media.test.ts +++ b/src/media/web-media.test.ts @@ -332,6 +332,34 @@ describe("loadWebMedia", () => { }, ); + it.each([ + { label: "CSV", fileName: "nul-padded.csv" }, + { label: "Markdown", fileName: "nul-padded.md" }, + ])("rejects NUL-padded binary data disguised as %s (UTF-16 heuristic bypass)", async ({ fileName }) => { + const fakeTextFile = path.join(fixtureRoot, fileName); + // Alternating 0x00/0xFF — no BOM, but old NUL-heuristic would classify as UTF-16. + // Decoded as UTF-16 every pair becomes a printable code point (e.g. U+FF00), + // so getTextStats returns printableRatio=1.0 and the file would have been allowed. + // After requiring a BOM for UTF-16, decodeHostReadText falls through to the UTF-8 + // strict path (throws on 0xFF), then hasSingleByteTextShape rejects due to control + // bytes (0x00 < 0x20), so the upload is correctly rejected. + const nulPadded = Buffer.alloc(9000); + for (let i = 0; i < nulPadded.length; i += 1) { + nulPadded[i] = i % 2 === 0 ? 0x00 : 0xff; + } + await fs.writeFile(fakeTextFile, nulPadded); + await expect( + loadWebMedia(fakeTextFile, { + maxBytes: 1024 * 1024, + localRoots: "any", + readFile: async (filePath) => await fs.readFile(filePath), + hostReadCapability: true, + }), + ).rejects.toMatchObject({ + code: "path-not-allowed", + }); + }); + it.each([ { label: "CSV", fileName: "alternating-high.csv" }, { label: "Markdown", fileName: "alternating-high.md" }, diff --git a/src/media/web-media.ts b/src/media/web-media.ts index c42256ac8e3..5e42227248e 100644 --- a/src/media/web-media.ts +++ b/src/media/web-media.ts @@ -97,29 +97,15 @@ function resolveUtf16Charset(buffer?: Buffer): "utf-16le" | "utf-16be" | undefin } const b0 = buffer[0]; const b1 = buffer[1]; + // Only trust a BOM — the NUL-heavy heuristic cannot be used as a security gate + // because TextDecoder("utf-16le") never throws and all byte pairs produce printable + // code points, allowing opaque NUL-padded binaries to pass the text-stats check. if (b0 === 0xff && b1 === 0xfe) { return "utf-16le"; } if (b0 === 0xfe && b1 === 0xff) { return "utf-16be"; } - const sampleLen = Math.min(buffer.length, 2048); - let zeroEven = 0; - let zeroOdd = 0; - for (let i = 0; i < sampleLen; i += 1) { - if (buffer[i] !== 0) { - continue; - } - if (i % 2 === 0) { - zeroEven += 1; - } else { - zeroOdd += 1; - } - } - const zeroCount = zeroEven + zeroOdd; - if (sampleLen > 0 && zeroCount / sampleLen > 0.2) { - return zeroOdd >= zeroEven ? "utf-16le" : "utf-16be"; - } return undefined; }