From 2c236a9f4c9c83d2bccd67e5a450d6f8759f7acd Mon Sep 17 00:00:00 2001
From: Chen Chia Yang <unayung@gmail.com>
Date: Wed, 15 Apr 2026 19:04:41 +0800
Subject: [PATCH] fix(media): remove UTF-16 decoding from host-read text
 validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TextDecoder("utf-16le/be") never throws on arbitrary byte pairs —
every pair of bytes is a valid Unicode scalar, so an attacker can
prepend a UTF-16 BOM (0xFF 0xFE) to binary garbage, give the file a
.csv/.md extension, and pass getTextStats with printableRatio≈1.0,
bypassing the host-read security boundary.

Remove resolveUtf16Charset and the UTF-16 branches from decodeHostReadText.
The Latin-1 fallback (gated by hasSingleByteTextShape) already covers
the most common non-UTF-8 real-world case: Excel CSV exports with
accented characters like é, ñ. UTF-16 CSVs are extremely rare and
users can trivially re-save as UTF-8.

Adds two regression tests:
- NUL-padded (0x00/0xFF) must be rejected
- BOM-prefixed (0xFF 0xFE + 0xFF garbage) must be rejected

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/media/web-media.test.ts | 34 +++++++++++++++++++++++-------
 src/media/web-media.ts      | 41 ++++++-------------------------------
 2 files changed, 33 insertions(+), 42 deletions(-)

diff --git a/src/media/web-media.test.ts b/src/media/web-media.test.ts
index 5398e67ad02..a320bef04a7 100644
--- a/src/media/web-media.test.ts
+++ b/src/media/web-media.test.ts
@@ -335,14 +335,10 @@ describe("loadWebMedia", () => {
   it.each([
     { label: "CSV", fileName: "nul-padded.csv" },
     { label: "Markdown", fileName: "nul-padded.md" },
-  ])("rejects NUL-padded binary data disguised as %s (UTF-16 heuristic bypass)", async ({ fileName }) => {
+  ])("rejects NUL-padded binary data disguised as %s", async ({ fileName }) => {
     const fakeTextFile = path.join(fixtureRoot, fileName);
-    // Alternating 0x00/0xFF — no BOM, but old NUL-heuristic would classify as UTF-16.
-    // Decoded as UTF-16 every pair becomes a printable code point (e.g. U+FF00),
-    // so getTextStats returns printableRatio=1.0 and the file would have been allowed.
-    // After requiring a BOM for UTF-16, decodeHostReadText falls through to the UTF-8
-    // strict path (throws on 0xFF), then hasSingleByteTextShape rejects due to control
-    // bytes (0x00 < 0x20), so the upload is correctly rejected.
+    // Alternating 0x00/0xFF — UTF-8 decode fails (0xFF is invalid UTF-8), then
+    // hasSingleByteTextShape rejects because 0x00 bytes are control chars (< 0x20).
     const nulPadded = Buffer.alloc(9000);
     for (let i = 0; i < nulPadded.length; i += 1) {
       nulPadded[i] = i % 2 === 0 ? 0x00 : 0xff;
@@ -360,6 +356,30 @@ describe("loadWebMedia", () => {
     });
   });
 
+  it.each([
+    { label: "CSV", fileName: "bom-binary.csv" },
+    { label: "Markdown", fileName: "bom-binary.md" },
+  ])("rejects UTF-16 BOM-prefixed binary data disguised as %s", async ({ fileName }) => {
+    const fakeTextFile = path.join(fixtureRoot, fileName);
+    // UTF-16LE BOM + repeating 0xFF bytes: if UTF-16 decoding were attempted,
+    // every byte pair would produce a printable code point and pass getTextStats.
+    // With UTF-16 decoding removed, falls through to UTF-8 strict decode (throws
+    // on 0xFF), then hasSingleByteTextShape rejects due to high-byte ratio > 30%.
+    const bom = Buffer.from([0xff, 0xfe]);
+    const garbage = Buffer.alloc(9000, 0xff);
+    await fs.writeFile(fakeTextFile, Buffer.concat([bom, garbage]));
+    await expect(
+      loadWebMedia(fakeTextFile, {
+        maxBytes: 1024 * 1024,
+        localRoots: "any",
+        readFile: async (filePath) => await fs.readFile(filePath),
+        hostReadCapability: true,
+      }),
+    ).rejects.toMatchObject({
+      code: "path-not-allowed",
+    });
+  });
+
   it.each([
     { label: "CSV", fileName: "alternating-high.csv" },
     { label: "Markdown", fileName: "alternating-high.md" },
diff --git a/src/media/web-media.ts b/src/media/web-media.ts
index 5e42227248e..cdcb5596510 100644
--- a/src/media/web-media.ts
+++ b/src/media/web-media.ts
@@ -91,24 +91,6 @@ const HOST_READ_ALLOWED_DOCUMENT_MIMES = new Set([
 const HOST_READ_TEXT_PLAIN_ALIASES = new Set(["text/csv", "text/markdown"]);
 const MB = 1024 * 1024;
 
-function resolveUtf16Charset(buffer?: Buffer): "utf-16le" | "utf-16be" | undefined {
-  if (!buffer || buffer.length < 2) {
-    return undefined;
-  }
-  const b0 = buffer[0];
-  const b1 = buffer[1];
-  // Only trust a BOM — the NUL-heavy heuristic cannot be used as a security gate
-  // because TextDecoder("utf-16le") never throws and all byte pairs produce printable
-  // code points, allowing opaque NUL-padded binaries to pass the text-stats check.
-  if (b0 === 0xff && b1 === 0xfe) {
-    return "utf-16le";
-  }
-  if (b0 === 0xfe && b1 === 0xff) {
-    return "utf-16be";
-  }
-  return undefined;
-}
-
 function getTextStats(text: string): { printableRatio: number } {
   if (!text) {
     return { printableRatio: 0 };
@@ -158,24 +140,13 @@ function decodeHostReadText(buffer: Buffer): string | undefined {
   if (buffer.length === 0) {
     return "";
   }
-  const utf16Charset = resolveUtf16Charset(buffer);
+  // UTF-16 decoding is intentionally omitted: TextDecoder("utf-16le/be") never throws on
+  // arbitrary byte pairs, so every byte pair is a valid (if meaningless) Unicode scalar —
+  // an attacker can prepend a BOM and pass getTextStats with printableRatio≈1.0 on pure
+  // binary garbage. The Latin-1 path below already covers the most common non-UTF-8
+  // real-world case (Excel CSV exports with accented chars like é, ñ) while remaining
+  // safe because hasSingleByteTextShape gates on byte shape *before* any decode.
   try {
-    if (utf16Charset === "utf-16be") {
-      const evenBuffer = buffer.length % 2 === 0 ? buffer : buffer.subarray(0, buffer.length - 1);
-      if (evenBuffer.length === 0) {
-        return "";
-      }
-      const swapped = Buffer.alloc(evenBuffer.length);
-      for (let i = 0; i + 1 < evenBuffer.length; i += 2) {
-        swapped[i] = evenBuffer[i + 1];
-        swapped[i + 1] = evenBuffer[i];
-      }
-      return new TextDecoder("utf-16le").decode(swapped);
-    }
-    if (utf16Charset === "utf-16le") {
-      const evenBuffer = buffer.length % 2 === 0 ? buffer : buffer.subarray(0, buffer.length - 1);
-      return new TextDecoder("utf-16le").decode(evenBuffer);
-    }
     return new TextDecoder("utf-8", { fatal: true }).decode(buffer);
   } catch {
     if (!hasSingleByteTextShape(buffer)) {