fix(media): replace null-byte heuristic with full C0 control-char check

The previous null-byte check was too narrow — binary payloads with no 0x00
bytes (e.g. short/unsupported formats) could still pass. Replace it with
looksLikeText(), which rejects any byte in the C0 control range (0x00–0x08,
0x0E–0x1F, 0x7F), matching the same heuristic used by git and the file
command to distinguish text from binary. Bytes ≥ 0x80 are kept so UTF-8,
Latin-1, and Windows-1252 encoded files continue to pass.
This commit is contained in:
Chen Chia Yang
2026-04-15 16:07:59 +08:00
committed by Frank Yang
parent f66e08a23f
commit 9aaf63ef4c

View File

@@ -91,6 +91,22 @@ const HOST_READ_ALLOWED_DOCUMENT_MIMES = new Set([
const HOST_READ_TEXT_PLAIN_ALIASES = new Set(["text/csv", "text/markdown"]);
const MB = 1024 * 1024;
// Returns true only if every byte in the buffer is text-safe: no null bytes and no C0
// control characters other than the standard whitespace group (tab 0x09, LF 0x0A,
// VT 0x0B, FF 0x0C, CR 0x0D). This is the same heuristic used by `git` and `file` to
// distinguish text from binary. Bytes ≥ 0x80 are allowed so that UTF-8, Latin-1, and
// Windows-1252 encoded files all pass.
function looksLikeText(buffer: Buffer): boolean {
for (let i = 0; i < buffer.length; i++) {
const b = buffer[i]!;
// Reject null (0x000x08) and remaining C0 controls (0x0E0x1F) and DEL (0x7F).
if (b < 0x09 || (b >= 0x0e && b <= 0x1f) || b === 0x7f) {
return false;
}
}
return true;
}
function formatMb(bytes: number, digits = 2): string {
return (bytes / MB).toFixed(digits);
}
@@ -143,13 +159,13 @@ function assertHostReadMediaAllowed(params: {
// plain-text buffers that have no binary magic bytes. Allow these formats when:
// - sniffedMime is undefined (no binary signature detected by file-type)
// - The extension-derived MIME is text/csv or text/markdown (operator intent)
// - The full buffer contains no null bytes (rules out binary data with no known signature)
// - Every byte in the buffer passes the text-safety check (no binary control chars)
if (
!sniffedMime &&
normalizedMime &&
HOST_READ_TEXT_PLAIN_ALIASES.has(normalizedMime) &&
params.buffer &&
!params.buffer.includes(0x00)
looksLikeText(params.buffer)
) {
return;
}