mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 07:30:43 +00:00
fix(media): remove UTF-16 decoding from host-read text validation
TextDecoder("utf-16le/be") never throws on arbitrary byte pairs —
every pair of bytes is a valid Unicode scalar, so an attacker can
prepend a UTF-16 BOM (0xFF 0xFE) to binary garbage, give the file a
.csv/.md extension, and pass getTextStats with printableRatio≈1.0,
bypassing the host-read security boundary.
Remove resolveUtf16Charset and the UTF-16 branches from decodeHostReadText.
The Latin-1 fallback (gated by hasSingleByteTextShape) already covers
the most common non-UTF-8 real-world case: Excel CSV exports with
accented characters like é, ñ. UTF-16 CSVs are extremely rare and
users can trivially re-save as UTF-8.
Adds two regression tests:
- NUL-padded (0x00/0xFF) must be rejected
- BOM-prefixed (0xFF 0xFE + 0xFF garbage) must be rejected
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
committed by
Frank Yang
parent
3ab1088fc2
commit
2c236a9f4c
@@ -335,14 +335,10 @@ describe("loadWebMedia", () => {
|
||||
it.each([
|
||||
{ label: "CSV", fileName: "nul-padded.csv" },
|
||||
{ label: "Markdown", fileName: "nul-padded.md" },
|
||||
])("rejects NUL-padded binary data disguised as %s (UTF-16 heuristic bypass)", async ({ fileName }) => {
|
||||
])("rejects NUL-padded binary data disguised as %s", async ({ fileName }) => {
|
||||
const fakeTextFile = path.join(fixtureRoot, fileName);
|
||||
// Alternating 0x00/0xFF — no BOM, but old NUL-heuristic would classify as UTF-16.
|
||||
// Decoded as UTF-16 every pair becomes a printable code point (e.g. U+FF00),
|
||||
// so getTextStats returns printableRatio=1.0 and the file would have been allowed.
|
||||
// After requiring a BOM for UTF-16, decodeHostReadText falls through to the UTF-8
|
||||
// strict path (throws on 0xFF), then hasSingleByteTextShape rejects due to control
|
||||
// bytes (0x00 < 0x20), so the upload is correctly rejected.
|
||||
// Alternating 0x00/0xFF — UTF-8 decode fails (0xFF is invalid UTF-8), then
|
||||
// hasSingleByteTextShape rejects because 0x00 bytes are control chars (< 0x20).
|
||||
const nulPadded = Buffer.alloc(9000);
|
||||
for (let i = 0; i < nulPadded.length; i += 1) {
|
||||
nulPadded[i] = i % 2 === 0 ? 0x00 : 0xff;
|
||||
@@ -360,6 +356,30 @@ describe("loadWebMedia", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it.each([
|
||||
{ label: "CSV", fileName: "bom-binary.csv" },
|
||||
{ label: "Markdown", fileName: "bom-binary.md" },
|
||||
])("rejects UTF-16 BOM-prefixed binary data disguised as %s", async ({ fileName }) => {
|
||||
const fakeTextFile = path.join(fixtureRoot, fileName);
|
||||
// UTF-16LE BOM + repeating 0xFF bytes: if UTF-16 decoding were attempted,
|
||||
// every byte pair would produce a printable code point and pass getTextStats.
|
||||
// With UTF-16 decoding removed, falls through to UTF-8 strict decode (throws
|
||||
// on 0xFF), then hasSingleByteTextShape rejects due to high-byte ratio > 30%.
|
||||
const bom = Buffer.from([0xff, 0xfe]);
|
||||
const garbage = Buffer.alloc(9000, 0xff);
|
||||
await fs.writeFile(fakeTextFile, Buffer.concat([bom, garbage]));
|
||||
await expect(
|
||||
loadWebMedia(fakeTextFile, {
|
||||
maxBytes: 1024 * 1024,
|
||||
localRoots: "any",
|
||||
readFile: async (filePath) => await fs.readFile(filePath),
|
||||
hostReadCapability: true,
|
||||
}),
|
||||
).rejects.toMatchObject({
|
||||
code: "path-not-allowed",
|
||||
});
|
||||
});
|
||||
|
||||
it.each([
|
||||
{ label: "CSV", fileName: "alternating-high.csv" },
|
||||
{ label: "Markdown", fileName: "alternating-high.md" },
|
||||
|
||||
@@ -91,24 +91,6 @@ const HOST_READ_ALLOWED_DOCUMENT_MIMES = new Set([
|
||||
const HOST_READ_TEXT_PLAIN_ALIASES = new Set(["text/csv", "text/markdown"]);
|
||||
const MB = 1024 * 1024;
|
||||
|
||||
function resolveUtf16Charset(buffer?: Buffer): "utf-16le" | "utf-16be" | undefined {
|
||||
if (!buffer || buffer.length < 2) {
|
||||
return undefined;
|
||||
}
|
||||
const b0 = buffer[0];
|
||||
const b1 = buffer[1];
|
||||
// Only trust a BOM — the NUL-heavy heuristic cannot be used as a security gate
|
||||
// because TextDecoder("utf-16le") never throws and all byte pairs produce printable
|
||||
// code points, allowing opaque NUL-padded binaries to pass the text-stats check.
|
||||
if (b0 === 0xff && b1 === 0xfe) {
|
||||
return "utf-16le";
|
||||
}
|
||||
if (b0 === 0xfe && b1 === 0xff) {
|
||||
return "utf-16be";
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function getTextStats(text: string): { printableRatio: number } {
|
||||
if (!text) {
|
||||
return { printableRatio: 0 };
|
||||
@@ -158,24 +140,13 @@ function decodeHostReadText(buffer: Buffer): string | undefined {
|
||||
if (buffer.length === 0) {
|
||||
return "";
|
||||
}
|
||||
const utf16Charset = resolveUtf16Charset(buffer);
|
||||
// UTF-16 decoding is intentionally omitted: TextDecoder("utf-16le/be") never throws on
|
||||
// arbitrary byte pairs, so every byte pair is a valid (if meaningless) Unicode scalar —
|
||||
// an attacker can prepend a BOM and pass getTextStats with printableRatio≈1.0 on pure
|
||||
// binary garbage. The Latin-1 path below already covers the most common non-UTF-8
|
||||
// real-world case (Excel CSV exports with accented chars like é, ñ) while remaining
|
||||
// safe because hasSingleByteTextShape gates on byte shape *before* any decode.
|
||||
try {
|
||||
if (utf16Charset === "utf-16be") {
|
||||
const evenBuffer = buffer.length % 2 === 0 ? buffer : buffer.subarray(0, buffer.length - 1);
|
||||
if (evenBuffer.length === 0) {
|
||||
return "";
|
||||
}
|
||||
const swapped = Buffer.alloc(evenBuffer.length);
|
||||
for (let i = 0; i + 1 < evenBuffer.length; i += 2) {
|
||||
swapped[i] = evenBuffer[i + 1];
|
||||
swapped[i + 1] = evenBuffer[i];
|
||||
}
|
||||
return new TextDecoder("utf-16le").decode(swapped);
|
||||
}
|
||||
if (utf16Charset === "utf-16le") {
|
||||
const evenBuffer = buffer.length % 2 === 0 ? buffer : buffer.subarray(0, buffer.length - 1);
|
||||
return new TextDecoder("utf-16le").decode(evenBuffer);
|
||||
}
|
||||
return new TextDecoder("utf-8", { fatal: true }).decode(buffer);
|
||||
} catch {
|
||||
if (!hasSingleByteTextShape(buffer)) {
|
||||
|
||||
Reference in New Issue
Block a user