From 3ab1088fc2fbc94fe51082969b3e8e6745421507 Mon Sep 17 00:00:00 2001
From: Chen Chia Yang <unayung@gmail.com>
Date: Wed, 15 Apr 2026 18:56:54 +0800
Subject: [PATCH] fix(media): require BOM for UTF-16 detection to close
 NUL-padded bypass

The NUL-heavy heuristic in resolveUtf16Charset was unsafe as a security
gate: TextDecoder("utf-16le") never throws, so every byte pair in an
opaque binary (e.g. repeating 0x00/0xFF) decodes to a printable code
point and passes the text-stats check, allowing the upload.

Remove the heuristic; only a leading BOM (0xFF 0xFE / 0xFE 0xFF) now
triggers UTF-16 decoding. Without a BOM the strict UTF-8 path runs
first, and NUL-padded binaries are then rejected by hasSingleByteTextShape
(0x00 bytes are control bytes).

Adds a regression test: 9000-byte alternating-NUL/0xFF buffer must be
rejected as path-not-allowed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/media/web-media.test.ts | 28 ++++++++++++++++++++++++++++
 src/media/web-media.ts      | 20 +++-----------------
 2 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/src/media/web-media.test.ts b/src/media/web-media.test.ts
index f46dca2136e..5398e67ad02 100644
--- a/src/media/web-media.test.ts
+++ b/src/media/web-media.test.ts
@@ -332,6 +332,34 @@ describe("loadWebMedia", () => {
     },
   );
 
+  it.each([
+    { label: "CSV", fileName: "nul-padded.csv" },
+    { label: "Markdown", fileName: "nul-padded.md" },
+  ])("rejects NUL-padded binary data disguised as %s (UTF-16 heuristic bypass)", async ({ fileName }) => {
+    const fakeTextFile = path.join(fixtureRoot, fileName);
+    // Alternating 0x00/0xFF — no BOM, but old NUL-heuristic would classify as UTF-16.
+    // Decoded as UTF-16 every pair becomes a printable code point (e.g. U+FF00),
+    // so getTextStats returns printableRatio=1.0 and the file would have been allowed.
+    // After requiring a BOM for UTF-16, decodeHostReadText falls through to the UTF-8
+    // strict path (throws on 0xFF), then hasSingleByteTextShape rejects due to control
+    // bytes (0x00 < 0x20), so the upload is correctly rejected.
+    const nulPadded = Buffer.alloc(9000);
+    for (let i = 0; i < nulPadded.length; i += 1) {
+      nulPadded[i] = i % 2 === 0 ? 0x00 : 0xff;
+    }
+    await fs.writeFile(fakeTextFile, nulPadded);
+    await expect(
+      loadWebMedia(fakeTextFile, {
+        maxBytes: 1024 * 1024,
+        localRoots: "any",
+        readFile: async (filePath) => await fs.readFile(filePath),
+        hostReadCapability: true,
+      }),
+    ).rejects.toMatchObject({
+      code: "path-not-allowed",
+    });
+  });
+
   it.each([
     { label: "CSV", fileName: "alternating-high.csv" },
     { label: "Markdown", fileName: "alternating-high.md" },
diff --git a/src/media/web-media.ts b/src/media/web-media.ts
index c42256ac8e3..5e42227248e 100644
--- a/src/media/web-media.ts
+++ b/src/media/web-media.ts
@@ -97,29 +97,15 @@ function resolveUtf16Charset(buffer?: Buffer): "utf-16le" | "utf-16be" | undefin
   }
   const b0 = buffer[0];
   const b1 = buffer[1];
+  // Only trust a BOM — the NUL-heavy heuristic cannot be used as a security gate
+  // because TextDecoder("utf-16le") never throws and all byte pairs produce printable
+  // code points, allowing opaque NUL-padded binaries to pass the text-stats check.
   if (b0 === 0xff && b1 === 0xfe) {
     return "utf-16le";
   }
   if (b0 === 0xfe && b1 === 0xff) {
     return "utf-16be";
   }
-  const sampleLen = Math.min(buffer.length, 2048);
-  let zeroEven = 0;
-  let zeroOdd = 0;
-  for (let i = 0; i < sampleLen; i += 1) {
-    if (buffer[i] !== 0) {
-      continue;
-    }
-    if (i % 2 === 0) {
-      zeroEven += 1;
-    } else {
-      zeroOdd += 1;
-    }
-  }
-  const zeroCount = zeroEven + zeroOdd;
-  if (sampleLen > 0 && zeroCount / sampleLen > 0.2) {
-    return zeroOdd >= zeroEven ? "utf-16le" : "utf-16be";
-  }
   return undefined;
 }