fix: allow punctuation-heavy host-read text

This commit is contained in:
Frank Yang
2026-04-15 18:17:09 +08:00
parent 856c88f25f
commit cff3445a6f
2 changed files with 35 additions and 12 deletions

View File

@@ -274,6 +274,35 @@ describe("loadWebMedia", () => {
},
);
// Regression coverage: CSV and Markdown bodies that consist almost entirely of
// punctuation must still be loaded as documents, with their own content type,
// when the host-read capability is enabled.
it.each([
  {
    label: "CSV",
    fileName: "punctuation.csv",
    contentType: "text/csv",
    body: ",,,,,,,,,,\n",
  },
  {
    label: "Markdown",
    fileName: "punctuation.md",
    contentType: "text/markdown",
    body: "---\n***\n> > >\n",
  },
])(
  // `$label` interpolates each case's `label` property into the title; a `%s`
  // placeholder would format the whole case object into the test name instead.
  "loads valid punctuation-heavy $label files when host-read capability is enabled",
  async ({ fileName, contentType, body }) => {
    // Write the fixture to disk so the loader reads it through the supplied readFile hook.
    const textFile = path.join(fixtureRoot, fileName);
    await fs.writeFile(textFile, Buffer.from(body, "utf8"));
    const result = await loadWebMedia(textFile, {
      maxBytes: 1024 * 1024,
      localRoots: "any",
      readFile: async (filePath) => await fs.readFile(filePath),
      hostReadCapability: true,
    });
    expect(result.kind).toBe("document");
    expect(result.contentType).toBe(contentType);
  },
);
it("rejects traversal-style canvas media paths before filesystem access", async () => {
await expect(
loadWebMedia(`${CANVAS_HOST_PATH}/documents/../collection.media/tiny.png`),

View File

@@ -90,7 +90,6 @@ const HOST_READ_ALLOWED_DOCUMENT_MIMES = new Set([
// Markdown, so host-read needs an explicit "this really decodes as text" fallback.
const HOST_READ_TEXT_PLAIN_ALIASES = new Set(["text/csv", "text/markdown"]);
const MB = 1024 * 1024;
const WORDISH_CHAR = /[\p{L}\p{N}]/u;
function resolveUtf16Charset(buffer?: Buffer): "utf-16le" | "utf-16be" | undefined {
if (!buffer || buffer.length < 2) {
@@ -124,18 +123,16 @@ function resolveUtf16Charset(buffer?: Buffer): "utf-16le" | "utf-16be" | undefin
return undefined;
}
function getTextStats(text: string): { printableRatio: number; wordishRatio: number } {
function getTextStats(text: string): { printableRatio: number } {
if (!text) {
return { printableRatio: 0, wordishRatio: 0 };
return { printableRatio: 0 };
}
let printable = 0;
let control = 0;
let wordish = 0;
for (const char of text) {
const code = char.codePointAt(0) ?? 0;
if (code === 9 || code === 10 || code === 13 || code === 32) {
printable += 1;
wordish += 1;
continue;
}
if (code < 32 || (code >= 0x7f && code <= 0x9f)) {
@@ -143,15 +140,12 @@ function getTextStats(text: string): { printableRatio: number; wordishRatio: num
continue;
}
printable += 1;
if (WORDISH_CHAR.test(char)) {
wordish += 1;
}
}
const total = printable + control;
if (total === 0) {
return { printableRatio: 0, wordishRatio: 0 };
return { printableRatio: 0 };
}
return { printableRatio: printable / total, wordishRatio: wordish / total };
return { printableRatio: printable / total };
}
function decodeHostReadText(buffer: Buffer): string | undefined {
@@ -193,8 +187,8 @@ function isValidatedHostReadText(buffer?: Buffer): boolean {
if (text === undefined) {
return false;
}
const { printableRatio, wordishRatio } = getTextStats(text);
return printableRatio > 0.95 && wordishRatio > 0.2;
const { printableRatio } = getTextStats(text);
return printableRatio > 0.95;
}
function formatMb(bytes: number, digits = 2): string {