fix(media): tighten hasSingleByteTextShape to reject mixed ASCII/high-byte blobs

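Raise the ASCII floor from 50% to 70% and add an explicit 30% high-byte cap.
Because the check already requires zero control bytes, the cap is the
arithmetic complement of the floor; it restates the limit explicitly.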
The previous 50% threshold accepted alternating 0x41/0xFF buffers
(50% ASCII, 0 control bytes), which decoded through Latin-1 and passed
the printable-ratio gate — allowing opaque binary data to slip through
as a CSV or Markdown document.

Real single-byte text exports (e.g. Excel Latin-1 CSVs with accented
chars like é, ñ) rarely exceed 20-25% high bytes, so the tighter
thresholds do not regress legitimate input.

Adds a regression test: 9000 bytes alternating 'A'/0xFF must be
rejected as path-not-allowed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Chen Chia Yang
2026-04-15 18:48:32 +08:00
committed by Frank Yang
parent ac89e9d964
commit a55d38ed6c
2 changed files with 27 additions and 1 deletions

View File

@@ -332,6 +332,31 @@ describe("loadWebMedia", () => {
},
);
it.each([
  { label: "CSV", fileName: "alternating-high.csv" },
  { label: "Markdown", fileName: "alternating-high.md" },
])("rejects alternating ASCII/high-byte data disguised as %s", async ({ fileName }) => {
  // Regression fixture: 9000 bytes built by repeating the pair 0x41 ('A'), 0xFF,
  // so even offsets are ASCII and odd offsets are high bytes (a 50/50 split).
  // A 50% ASCII floor in hasSingleByteTextShape lets this shape through; the
  // 70% ASCII / 30% high-byte thresholds are expected to reject it.
  const payload = Buffer.alloc(9000, Buffer.from([0x41, 0xff]));
  const disguisedPath = path.join(fixtureRoot, fileName);
  await fs.writeFile(disguisedPath, payload);

  // Load the file through the host-read path and expect a policy rejection.
  const pendingLoad = loadWebMedia(disguisedPath, {
    maxBytes: 1024 * 1024,
    localRoots: "any",
    readFile: async (filePath) => await fs.readFile(filePath),
    hostReadCapability: true,
  });
  await expect(pendingLoad).rejects.toMatchObject({ code: "path-not-allowed" });
});
it.each([
{ label: "CSV", fileName: "high-bytes.csv" },
{ label: "Markdown", fileName: "high-bytes.md" },

View File

@@ -164,7 +164,8 @@ function hasSingleByteTextShape(buffer: Buffer): boolean {
}
}
const total = buffer.length;
return control === 0 && asciiText / total >= 0.5;
const highBytes = total - asciiText - control;
return control === 0 && asciiText / total >= 0.7 && highBytes / total <= 0.3;
}
function decodeHostReadText(buffer: Buffer): string | undefined {