fix(media): remove UTF-16 decoding from host-read text validation

TextDecoder("utf-16le/be") never throws on arbitrary byte pairs —
every byte pair decodes to some code point (malformed sequences such
as lone surrogates are replaced with U+FFFD rather than raising), so
an attacker can prepend a UTF-16LE BOM (0xFF 0xFE) to binary garbage,
give the file a .csv/.md extension, and pass getTextStats with
printableRatio≈1.0, bypassing the host-read security boundary.

Remove resolveUtf16Charset and the UTF-16 branches from decodeHostReadText.
The Latin-1 fallback (gated by hasSingleByteTextShape) already covers
the most common non-UTF-8 real-world case: Excel CSV exports with
accented characters like é, ñ. UTF-16 CSVs are extremely rare and
users can trivially re-save as UTF-8.

Adds two regression tests:
- NUL-padded (0x00/0xFF) must be rejected
- BOM-prefixed (0xFF 0xFE + 0xFF garbage) must be rejected

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Chen Chia Yang
2026-04-15 19:04:41 +08:00
committed by Frank Yang
parent 3ab1088fc2
commit 2c236a9f4c
2 changed files with 33 additions and 42 deletions

View File

@@ -335,14 +335,10 @@ describe("loadWebMedia", () => {
it.each([
{ label: "CSV", fileName: "nul-padded.csv" },
{ label: "Markdown", fileName: "nul-padded.md" },
])("rejects NUL-padded binary data disguised as %s (UTF-16 heuristic bypass)", async ({ fileName }) => {
])("rejects NUL-padded binary data disguised as %s", async ({ fileName }) => {
const fakeTextFile = path.join(fixtureRoot, fileName);
// Alternating 0x00/0xFF — no BOM, but old NUL-heuristic would classify as UTF-16.
// Decoded as UTF-16 every pair becomes a printable code point (e.g. U+FF00),
// so getTextStats returns printableRatio=1.0 and the file would have been allowed.
// After requiring a BOM for UTF-16, decodeHostReadText falls through to the UTF-8
// strict path (throws on 0xFF), then hasSingleByteTextShape rejects due to control
// bytes (0x00 < 0x20), so the upload is correctly rejected.
// Alternating 0x00/0xFF — UTF-8 decode fails (0xFF is invalid UTF-8), then
// hasSingleByteTextShape rejects because 0x00 bytes are control chars (< 0x20).
const nulPadded = Buffer.alloc(9000);
for (let i = 0; i < nulPadded.length; i += 1) {
nulPadded[i] = i % 2 === 0 ? 0x00 : 0xff;
@@ -360,6 +356,30 @@ describe("loadWebMedia", () => {
});
});
it.each([
{ label: "CSV", fileName: "bom-binary.csv" },
{ label: "Markdown", fileName: "bom-binary.md" },
])("rejects UTF-16 BOM-prefixed binary data disguised as %s", async ({ fileName }) => {
const fakeTextFile = path.join(fixtureRoot, fileName);
// UTF-16LE BOM + repeating 0xFF bytes: if UTF-16 decoding were attempted,
// every byte pair would produce a printable code point and pass getTextStats.
// With UTF-16 decoding removed, falls through to UTF-8 strict decode (throws
// on 0xFF), then hasSingleByteTextShape rejects due to high-byte ratio > 30%.
const bom = Buffer.from([0xff, 0xfe]);
const garbage = Buffer.alloc(9000, 0xff);
await fs.writeFile(fakeTextFile, Buffer.concat([bom, garbage]));
await expect(
loadWebMedia(fakeTextFile, {
maxBytes: 1024 * 1024,
localRoots: "any",
readFile: async (filePath) => await fs.readFile(filePath),
hostReadCapability: true,
}),
).rejects.toMatchObject({
code: "path-not-allowed",
});
});
it.each([
{ label: "CSV", fileName: "alternating-high.csv" },
{ label: "Markdown", fileName: "alternating-high.md" },

View File

@@ -91,24 +91,6 @@ const HOST_READ_ALLOWED_DOCUMENT_MIMES = new Set([
const HOST_READ_TEXT_PLAIN_ALIASES = new Set(["text/csv", "text/markdown"]);
const MB = 1024 * 1024;
function resolveUtf16Charset(buffer?: Buffer): "utf-16le" | "utf-16be" | undefined {
if (!buffer || buffer.length < 2) {
return undefined;
}
const b0 = buffer[0];
const b1 = buffer[1];
// Only trust a BOM — the NUL-heavy heuristic cannot be used as a security gate
// because TextDecoder("utf-16le") never throws and all byte pairs produce printable
// code points, allowing opaque NUL-padded binaries to pass the text-stats check.
if (b0 === 0xff && b1 === 0xfe) {
return "utf-16le";
}
if (b0 === 0xfe && b1 === 0xff) {
return "utf-16be";
}
return undefined;
}
function getTextStats(text: string): { printableRatio: number } {
if (!text) {
return { printableRatio: 0 };
@@ -158,24 +140,13 @@ function decodeHostReadText(buffer: Buffer): string | undefined {
if (buffer.length === 0) {
return "";
}
const utf16Charset = resolveUtf16Charset(buffer);
// UTF-16 decoding is intentionally omitted: TextDecoder("utf-16le/be") never throws on
// arbitrary byte pairs, so every byte pair is a valid (if meaningless) Unicode scalar —
// an attacker can prepend a BOM and pass getTextStats with printableRatio≈1.0 on pure
// binary garbage. The Latin-1 path below already covers the most common non-UTF-8
// real-world case (Excel CSV exports with accented chars like é, ñ) while remaining
// safe because hasSingleByteTextShape gates on byte shape *before* any decode.
try {
if (utf16Charset === "utf-16be") {
const evenBuffer = buffer.length % 2 === 0 ? buffer : buffer.subarray(0, buffer.length - 1);
if (evenBuffer.length === 0) {
return "";
}
const swapped = Buffer.alloc(evenBuffer.length);
for (let i = 0; i + 1 < evenBuffer.length; i += 2) {
swapped[i] = evenBuffer[i + 1];
swapped[i + 1] = evenBuffer[i];
}
return new TextDecoder("utf-16le").decode(swapped);
}
if (utf16Charset === "utf-16le") {
const evenBuffer = buffer.length % 2 === 0 ? buffer : buffer.subarray(0, buffer.length - 1);
return new TextDecoder("utf-16le").decode(evenBuffer);
}
return new TextDecoder("utf-8", { fatal: true }).decode(buffer);
} catch {
if (!hasSingleByteTextShape(buffer)) {