From 9aaf63ef4cfc6479b31b7cac2550e513e5f53e44 Mon Sep 17 00:00:00 2001 From: Chen Chia Yang Date: Wed, 15 Apr 2026 16:07:59 +0800 Subject: [PATCH] fix(media): replace null-byte heuristic with full C0 control-char check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous null-byte check was too narrow — binary payloads with no 0x00 bytes (e.g. short/unsupported formats) could still pass. Replace it with looksLikeText(), which rejects NUL and every other C0 control byte except the whitespace group 0x09–0x0D (i.e. it rejects 0x00–0x08 and 0x0E–0x1F), plus DEL (0x7F). This is similar to, but stricter than, the heuristics used by git and the file command to distinguish text from binary. Bytes ≥ 0x80 are kept so UTF-8, Latin-1, and Windows-1252 encoded files continue to pass. --- src/media/web-media.ts | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/media/web-media.ts b/src/media/web-media.ts index d1009e529c5..4cfde9acd34 100644 --- a/src/media/web-media.ts +++ b/src/media/web-media.ts @@ -91,6 +91,22 @@ const HOST_READ_ALLOWED_DOCUMENT_MIMES = new Set([ const HOST_READ_TEXT_PLAIN_ALIASES = new Set(["text/csv", "text/markdown"]); const MB = 1024 * 1024; +// Returns true only if every byte in the buffer is text-safe: no null bytes and no C0 +// control characters other than the standard whitespace group (tab 0x09, LF 0x0A, +// VT 0x0B, FF 0x0C, CR 0x0D). This is similar to, but stricter than, what `git` and `file` do to +// distinguish text from binary. Bytes ≥ 0x80 are allowed so that UTF-8, Latin-1, and +// Windows-1252 encoded files all pass. +function looksLikeText(buffer: Buffer): boolean { + for (let i = 0; i < buffer.length; i++) { + const b = buffer[i]!; + // Reject NUL and the low C0 controls (0x00–0x08), the remaining C0 controls (0x0E–0x1F), and DEL (0x7F). 
+ if (b < 0x09 || (b >= 0x0e && b <= 0x1f) || b === 0x7f) { + return false; + } + } + return true; +} + function formatMb(bytes: number, digits = 2): string { return (bytes / MB).toFixed(digits); } @@ -143,13 +159,13 @@ function assertHostReadMediaAllowed(params: { // plain-text buffers that have no binary magic bytes. Allow these formats when: // - sniffedMime is undefined (no binary signature detected by file-type) // - The extension-derived MIME is text/csv or text/markdown (operator intent) - // - The full buffer contains no null bytes (rules out binary data with no known signature) + // - Every byte in the buffer passes the text-safety check (no binary control chars) if ( !sniffedMime && normalizedMime && HOST_READ_TEXT_PLAIN_ALIASES.has(normalizedMime) && params.buffer && - !params.buffer.includes(0x00) + looksLikeText(params.buffer) ) { return; }