fix(media): remove UTF-16 decoding from host-read text validation

TextDecoder("utf-16le/be") never throws on arbitrary byte pairs —
every byte pair decodes to some code point (malformed sequences such
as lone surrogates are replaced with U+FFFD rather than raising), so
an attacker can prepend a UTF-16LE BOM (0xFF 0xFE) to binary garbage,
give the file a .csv/.md extension, and pass getTextStats with
printableRatio≈1.0, bypassing the host-read security boundary.

Remove resolveUtf16Charset and the UTF-16 branches from decodeHostReadText.
The Latin-1 fallback (gated by hasSingleByteTextShape) already covers
the most common non-UTF-8 real-world case: Excel CSV exports with
accented characters like é, ñ. UTF-16 CSVs are extremely rare and
users can trivially re-save as UTF-8.

Adds two regression tests:
- NUL-padded (0x00/0xFF) must be rejected
- BOM-prefixed (0xFF 0xFE + 0xFF garbage) must be rejected

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Chen Chia Yang
2026-04-15 19:04:41 +08:00
committed by Frank Yang
parent 3ab1088fc2
commit 2c236a9f4c
2 changed files with 33 additions and 42 deletions

View File

@@ -335,14 +335,10 @@ describe("loadWebMedia", () => {
it.each([
{ label: "CSV", fileName: "nul-padded.csv" },
{ label: "Markdown", fileName: "nul-padded.md" },
])("rejects NUL-padded binary data disguised as %s (UTF-16 heuristic bypass)", async ({ fileName }) => {
])("rejects NUL-padded binary data disguised as %s", async ({ fileName }) => {
const fakeTextFile = path.join(fixtureRoot, fileName);
// Alternating 0x00/0xFF — no BOM, but old NUL-heuristic would classify as UTF-16.
// Decoded as UTF-16 every pair becomes a printable code point (e.g. U+FF00),
// so getTextStats returns printableRatio=1.0 and the file would have been allowed.
// After requiring a BOM for UTF-16, decodeHostReadText falls through to the UTF-8
// strict path (throws on 0xFF), then hasSingleByteTextShape rejects due to control
// bytes (0x00 < 0x20), so the upload is correctly rejected.
// Alternating 0x00/0xFF — UTF-8 decode fails (0xFF is invalid UTF-8), then
// hasSingleByteTextShape rejects because 0x00 bytes are control chars (< 0x20).
const nulPadded = Buffer.alloc(9000);
for (let i = 0; i < nulPadded.length; i += 1) {
nulPadded[i] = i % 2 === 0 ? 0x00 : 0xff;
@@ -360,6 +356,30 @@ describe("loadWebMedia", () => {
});
});
it.each([
{ label: "CSV", fileName: "bom-binary.csv" },
{ label: "Markdown", fileName: "bom-binary.md" },
])("rejects UTF-16 BOM-prefixed binary data disguised as %s", async ({ fileName }) => {
const fakeTextFile = path.join(fixtureRoot, fileName);
// UTF-16LE BOM + repeating 0xFF bytes: if UTF-16 decoding were attempted,
// every byte pair would produce a printable code point and pass getTextStats.
// With UTF-16 decoding removed, falls through to UTF-8 strict decode (throws
// on 0xFF), then hasSingleByteTextShape rejects due to high-byte ratio > 30%.
const bom = Buffer.from([0xff, 0xfe]);
const garbage = Buffer.alloc(9000, 0xff);
await fs.writeFile(fakeTextFile, Buffer.concat([bom, garbage]));
await expect(
loadWebMedia(fakeTextFile, {
maxBytes: 1024 * 1024,
localRoots: "any",
readFile: async (filePath) => await fs.readFile(filePath),
hostReadCapability: true,
}),
).rejects.toMatchObject({
code: "path-not-allowed",
});
});
it.each([
{ label: "CSV", fileName: "alternating-high.csv" },
{ label: "Markdown", fileName: "alternating-high.md" },

View File

@@ -91,24 +91,6 @@ const HOST_READ_ALLOWED_DOCUMENT_MIMES = new Set([
const HOST_READ_TEXT_PLAIN_ALIASES = new Set(["text/csv", "text/markdown"]);
const MB = 1024 * 1024;
function resolveUtf16Charset(buffer?: Buffer): "utf-16le" | "utf-16be" | undefined {
if (!buffer || buffer.length < 2) {
return undefined;
}
const b0 = buffer[0];
const b1 = buffer[1];
// Only trust a BOM — the NUL-heavy heuristic cannot be used as a security gate
// because TextDecoder("utf-16le") never throws and all byte pairs produce printable
// code points, allowing opaque NUL-padded binaries to pass the text-stats check.
if (b0 === 0xff && b1 === 0xfe) {
return "utf-16le";
}
if (b0 === 0xfe && b1 === 0xff) {
return "utf-16be";
}
return undefined;
}
function getTextStats(text: string): { printableRatio: number } {
if (!text) {
return { printableRatio: 0 };
@@ -158,24 +140,13 @@ function decodeHostReadText(buffer: Buffer): string | undefined {
if (buffer.length === 0) {
return "";
}
const utf16Charset = resolveUtf16Charset(buffer);
// UTF-16 decoding is intentionally omitted: TextDecoder("utf-16le/be") never throws on
// arbitrary byte pairs, so every byte pair is a valid (if meaningless) Unicode scalar —
// an attacker can prepend a BOM and pass getTextStats with printableRatio≈1.0 on pure
// binary garbage. The Latin-1 path below already covers the most common non-UTF-8
// real-world case (Excel CSV exports with accented chars like é, ñ) while remaining
// safe because hasSingleByteTextShape gates on byte shape *before* any decode.
try {
if (utf16Charset === "utf-16be") {
const evenBuffer = buffer.length % 2 === 0 ? buffer : buffer.subarray(0, buffer.length - 1);
if (evenBuffer.length === 0) {
return "";
}
const swapped = Buffer.alloc(evenBuffer.length);
for (let i = 0; i + 1 < evenBuffer.length; i += 2) {
swapped[i] = evenBuffer[i + 1];
swapped[i + 1] = evenBuffer[i];
}
return new TextDecoder("utf-16le").decode(swapped);
}
if (utf16Charset === "utf-16le") {
const evenBuffer = buffer.length % 2 === 0 ? buffer : buffer.subarray(0, buffer.length - 1);
return new TextDecoder("utf-16le").decode(evenBuffer);
}
return new TextDecoder("utf-8", { fatal: true }).decode(buffer);
} catch {
if (!hasSingleByteTextShape(buffer)) {