fix: validate full host-read text payload

This commit is contained in:
Frank Yang
2026-04-15 17:07:39 +08:00
parent 604a024067
commit 856c88f25f
2 changed files with 37 additions and 15 deletions

View File

@@ -250,6 +250,30 @@ describe("loadWebMedia", () => {
});
});
// Regression test: host-read text validation must inspect the whole payload,
// not only a leading sample. Each case writes a file whose first 8192+ bytes
// are clean UTF-8 text and whose final bytes are binary, then expects the
// load to be rejected.
//
// The rows are objects, so the title uses `$label` interpolation; a printf
// placeholder such as `%s` would format the entire row object instead of the
// human-readable label.
it.each([
  { label: "CSV", fileName: "prefix-tail.csv" },
  { label: "Markdown", fileName: "prefix-tail.md" },
])(
  "rejects $label files with a text prefix and binary tail after the old sample window",
  async ({ fileName }) => {
    const fakeTextFile = path.join(fixtureRoot, fileName);
    const textPrefix = Buffer.from(`name,value\n${"row,1\n".repeat(1400)}`, "utf8");
    // Guard the fixture itself: the text prefix must be longer than the former
    // 8192-byte sample window, otherwise the binary tail would have been caught
    // by the old sampling logic too and the test would prove nothing.
    expect(textPrefix.length).toBeGreaterThan(8192);
    // 0x00 plus bytes that do not form valid UTF-8 sequences.
    const binaryTail = Buffer.from([0x00, 0xff, 0x10, 0x80]);
    await fs.writeFile(fakeTextFile, Buffer.concat([textPrefix, binaryTail]));
    await expect(
      loadWebMedia(fakeTextFile, {
        maxBytes: 1024 * 1024,
        localRoots: "any",
        readFile: async (filePath) => await fs.readFile(filePath),
        hostReadCapability: true,
      }),
    ).rejects.toMatchObject({
      code: "path-not-allowed",
    });
  },
);
it("rejects traversal-style canvas media paths before filesystem access", async () => {
await expect(
loadWebMedia(`${CANVAS_HOST_PATH}/documents/../collection.media/tiny.png`),

View File

@@ -89,7 +89,6 @@ const HOST_READ_ALLOWED_DOCUMENT_MIMES = new Set([
// file-type returns undefined (no magic bytes) for plain-text formats like CSV and
// Markdown, so host-read needs an explicit "this really decodes as text" fallback.
const HOST_READ_TEXT_PLAIN_ALIASES = new Set(["text/csv", "text/markdown"]);
// Upper bound on the number of leading bytes taken from a buffer when only a
// sample of it is decoded as text (see decodeHostReadTextSample).
const HOST_READ_TEXT_SAMPLE_BYTES = 8192;
// 1024 * 1024; presumably a byte multiplier for size limits expressed in
// mebibytes — usage is not visible in this section, confirm against callers.
const MB = 1024 * 1024;
// Matches a single Unicode letter (\p{L}) or number (\p{N}); the `u` flag is
// required for the \p{…} property escapes.
const WORDISH_CHAR = /[\p{L}\p{N}]/u;
@@ -155,30 +154,29 @@ function getTextStats(text: string): { printableRatio: number; wordishRatio: num
return { printableRatio: printable / total, wordishRatio: wordish / total };
}
function decodeHostReadTextSample(buffer: Buffer): string | undefined {
const sample = buffer.subarray(0, Math.min(buffer.length, HOST_READ_TEXT_SAMPLE_BYTES));
if (sample.length === 0) {
function decodeHostReadText(buffer: Buffer): string | undefined {
if (buffer.length === 0) {
return "";
}
const utf16Charset = resolveUtf16Charset(sample);
const utf16Charset = resolveUtf16Charset(buffer);
try {
if (utf16Charset === "utf-16be") {
const evenSample = sample.length % 2 === 0 ? sample : sample.subarray(0, sample.length - 1);
if (evenSample.length === 0) {
const evenBuffer = buffer.length % 2 === 0 ? buffer : buffer.subarray(0, buffer.length - 1);
if (evenBuffer.length === 0) {
return "";
}
const swapped = Buffer.alloc(evenSample.length);
for (let i = 0; i + 1 < evenSample.length; i += 2) {
swapped[i] = evenSample[i + 1];
swapped[i + 1] = evenSample[i];
const swapped = Buffer.alloc(evenBuffer.length);
for (let i = 0; i + 1 < evenBuffer.length; i += 2) {
swapped[i] = evenBuffer[i + 1];
swapped[i + 1] = evenBuffer[i];
}
return new TextDecoder("utf-16le").decode(swapped);
}
if (utf16Charset === "utf-16le") {
const evenSample = sample.length % 2 === 0 ? sample : sample.subarray(0, sample.length - 1);
return new TextDecoder("utf-16le").decode(evenSample);
const evenBuffer = buffer.length % 2 === 0 ? buffer : buffer.subarray(0, buffer.length - 1);
return new TextDecoder("utf-16le").decode(evenBuffer);
}
return new TextDecoder("utf-8", { fatal: true }).decode(sample);
return new TextDecoder("utf-8", { fatal: true }).decode(buffer);
} catch {
return undefined;
}
@@ -191,7 +189,7 @@ function isValidatedHostReadText(buffer?: Buffer): boolean {
if (buffer.length === 0) {
return true;
}
const text = decodeHostReadTextSample(buffer);
const text = decodeHostReadText(buffer);
if (text === undefined) {
return false;
}