mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 11:40:42 +00:00
fix(media): require BOM for UTF-16 detection to close NUL-padded bypass
The NUL-heavy heuristic in resolveUtf16Charset was unsafe as a security
gate: TextDecoder("utf-16le") never throws, so every byte pair in an
opaque binary (e.g. repeating 0x00/0xFF) decodes to a printable code
point and passes the text-stats check, allowing the upload.
Remove the heuristic; only a leading BOM (0xFF 0xFE / 0xFE 0xFF) now
triggers UTF-16 decoding. Without a BOM the strict UTF-8 path runs
first, and NUL-padded binaries are then rejected by hasSingleByteTextShape
(0x00 bytes are control bytes).
Adds a regression test: a 9000-byte buffer of alternating NUL/0xFF bytes
must be rejected with the error code path-not-allowed.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
committed by
Frank Yang
parent
a55d38ed6c
commit
3ab1088fc2
@@ -332,6 +332,34 @@ describe("loadWebMedia", () => {
|
||||
},
|
||||
);
|
||||
|
||||
// Regression test for the UTF-16 heuristic bypass: a BOM-less buffer of
// alternating 0x00/0xFF bytes saved under a text-like extension must be rejected.
it.each([
  { label: "CSV", fileName: "nul-padded.csv" },
  { label: "Markdown", fileName: "nul-padded.md" },
])(
  // `$label` interpolates the row's `label` property into the test name; a printf
  // `%s` placeholder would format the entire row object instead.
  "rejects NUL-padded binary data disguised as $label (UTF-16 heuristic bypass)",
  async ({ fileName }) => {
    const fakeTextFile = path.join(fixtureRoot, fileName);
    // Alternating 0x00/0xFF — no BOM, but the old NUL-heuristic would classify it as UTF-16.
    // Decoded as UTF-16, every pair becomes a printable code point (e.g. U+FF00),
    // so getTextStats returns printableRatio=1.0 and the file would have been allowed.
    // After requiring a BOM for UTF-16, decodeHostReadText falls through to the UTF-8
    // strict path (throws on 0xFF), then hasSingleByteTextShape rejects due to control
    // bytes (0x00 < 0x20), so the upload is correctly rejected.
    // Buffer.alloc repeats the two-byte fill pattern across all 9000 bytes:
    // even offsets hold 0x00, odd offsets hold 0xFF.
    const nulPadded = Buffer.alloc(9000, Buffer.from([0x00, 0xff]));
    await fs.writeFile(fakeTextFile, nulPadded);

    const loadAttempt = loadWebMedia(fakeTextFile, {
      maxBytes: 1024 * 1024,
      localRoots: "any",
      readFile: async (filePath) => await fs.readFile(filePath),
      hostReadCapability: true,
    });

    await expect(loadAttempt).rejects.toMatchObject({
      code: "path-not-allowed",
    });
  },
);
|
||||
|
||||
it.each([
|
||||
{ label: "CSV", fileName: "alternating-high.csv" },
|
||||
{ label: "Markdown", fileName: "alternating-high.md" },
|
||||
|
||||
@@ -97,29 +97,15 @@ function resolveUtf16Charset(buffer?: Buffer): "utf-16le" | "utf-16be" | undefin
|
||||
}
|
||||
const b0 = buffer[0];
|
||||
const b1 = buffer[1];
|
||||
// Only trust a BOM — the NUL-heavy heuristic cannot be used as a security gate
|
||||
// because TextDecoder("utf-16le") never throws and all byte pairs produce printable
|
||||
// code points, allowing opaque NUL-padded binaries to pass the text-stats check.
|
||||
if (b0 === 0xff && b1 === 0xfe) {
|
||||
return "utf-16le";
|
||||
}
|
||||
if (b0 === 0xfe && b1 === 0xff) {
|
||||
return "utf-16be";
|
||||
}
|
||||
const sampleLen = Math.min(buffer.length, 2048);
|
||||
let zeroEven = 0;
|
||||
let zeroOdd = 0;
|
||||
for (let i = 0; i < sampleLen; i += 1) {
|
||||
if (buffer[i] !== 0) {
|
||||
continue;
|
||||
}
|
||||
if (i % 2 === 0) {
|
||||
zeroEven += 1;
|
||||
} else {
|
||||
zeroOdd += 1;
|
||||
}
|
||||
}
|
||||
const zeroCount = zeroEven + zeroOdd;
|
||||
if (sampleLen > 0 && zeroCount / sampleLen > 0.2) {
|
||||
return zeroOdd >= zeroEven ? "utf-16le" : "utf-16be";
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user