mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 22:10:43 +00:00
Telegram/documents: sanitize binary payloads to prevent prompt input inflation (#66877)
Merged via squash.
Prepared head SHA: 09a87c184f
Co-authored-by: martinfrancois <14319020+martinfrancois@users.noreply.github.com>
Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com>
Reviewed-by: @gumadeiras
This commit is contained in:
@@ -1092,6 +1092,125 @@ describe("applyMediaUnderstanding", () => {
|
||||
expectFileNotApplied({ ctx, result, body: "<media:audio>" });
|
||||
});
|
||||
|
||||
it("skips archive container attachments with +zip MIME types", async () => {
  // EPUB-shaped bytes: ZIP local-header magic followed by the EPUB mimetype entry.
  const epubLikeBytes = Buffer.from(
    "PK\u0003\u0004mimetypeapplication/epub+zipMETA-INF/container",
    "utf8",
  );
  const attachmentPath = await createTempMediaFile({
    fileName: "book.epub",
    content: epubLikeBytes,
  });

  // The attachment is submitted with an explicit +zip MIME type.
  const { ctx: context, result: outcome } = await applyWithDisabledMedia({
    body: "<media:file>",
    mediaPath: attachmentPath,
    mediaType: "application/epub+zip",
  });

  expectFileNotApplied({ ctx: context, result: outcome, body: "<media:file>" });
});
|
||||
|
||||
it("does not coerce binary control-byte payloads into text/plain", async () => {
  // ZIP magic (including the \u0003\u0004 control bytes) stored under a neutral .bin name.
  const zipLikeBytes = Buffer.from("PK\u0003\u0004mimetypeapplication/epub+zipcontent.opf", "utf8");
  const attachmentPath = await createTempMediaFile({
    fileName: "payload.bin",
    content: zipLikeBytes,
  });

  // No mediaType is supplied for this attachment.
  const { ctx: context, result: outcome } = await applyWithDisabledMedia({
    body: "<media:file>",
    mediaPath: attachmentPath,
  });

  expectFileNotApplied({ ctx: context, result: outcome, body: "<media:file>" });
});
|
||||
|
||||
it("does not trust text file extensions when the buffer starts with a ZIP signature", async () => {
  // Same ZIP-prefixed bytes as a real archive, but stored under a .txt name.
  const zipBytesNamedAsText = Buffer.from(
    "PK\u0003\u0004mimetypeapplication/epub+zipcontent.opf",
    "utf8",
  );
  const attachmentPath = await createTempMediaFile({
    fileName: "payload.txt",
    content: zipBytesNamedAsText,
  });

  // No mediaType is supplied for this attachment.
  const { ctx: context, result: outcome } = await applyWithDisabledMedia({
    body: "<media:file>",
    mediaPath: attachmentPath,
  });

  expectFileNotApplied({ ctx: context, result: outcome, body: "<media:file>" });
});
|
||||
|
||||
it("does not coerce real ZIP local headers into text/plain when UTF-16 guessing misfires", async () => {
  // Raw bytes starting with the PK\x03\x04 local-header signature and ending in
  // the ASCII name "foo.txt"; the many 0x00 bytes are what the test title refers
  // to as a UTF-16 guessing misfire.
  const localHeaderBytes = Buffer.from([
    0x50, 0x4b, 0x03, 0x04, 0x14, 0x00, 0x00, 0x00, 0x08, 0x00, 0x08, 0x29, 0xb9, 0x5a, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x66, 0x6f,
    0x6f, 0x2e, 0x74, 0x78, 0x74,
  ]);
  const attachmentPath = await createTempMediaFile({
    fileName: "archive.bin",
    content: localHeaderBytes,
  });

  const { ctx: context, result: outcome } = await applyWithDisabledMedia({
    body: "<media:file>",
    mediaPath: attachmentPath,
  });

  expectFileNotApplied({ ctx: context, result: outcome, body: "<media:file>" });
});
|
||||
|
||||
it("does not coerce ZIP central-directory headers into text/plain", async () => {
  // Raw bytes starting with the PK\x01\x02 central-directory signature.
  const centralDirectoryBytes = Buffer.from([
    0x50, 0x4b, 0x01, 0x02, 0x14, 0x00, 0x14, 0x00, 0x00, 0x00, 0x08, 0x00, 0x08, 0x29, 0xb9,
    0x5a, 0x00, 0x00, 0x00, 0x00,
  ]);
  const attachmentPath = await createTempMediaFile({
    fileName: "central-directory.bin",
    content: centralDirectoryBytes,
  });

  const { ctx: context, result: outcome } = await applyWithDisabledMedia({
    body: "<media:file>",
    mediaPath: attachmentPath,
  });

  expectFileNotApplied({ ctx: context, result: outcome, body: "<media:file>" });
});
|
||||
|
||||
it("does not coerce empty ZIP end-of-central-directory headers into text/plain", async () => {
  // Raw bytes starting with the PK\x05\x06 end-of-central-directory signature,
  // followed only by zero bytes.
  const emptyArchiveBytes = Buffer.from([
    0x50, 0x4b, 0x05, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  ]);
  const attachmentPath = await createTempMediaFile({
    fileName: "empty-archive.bin",
    content: emptyArchiveBytes,
  });

  const { ctx: context, result: outcome } = await applyWithDisabledMedia({
    body: "<media:file>",
    mediaPath: attachmentPath,
  });

  expectFileNotApplied({ ctx: context, result: outcome, body: "<media:file>" });
});
|
||||
|
||||
it("keeps utf16 text attachments eligible for extraction", async () => {
  // Counterpart to the ZIP rejection cases: UTF-16LE text stored under a .bin
  // name must still be extracted.
  const utf16Bytes = Buffer.from("hello from utf16 text", "utf16le");
  const attachmentPath = await createTempMediaFile({
    fileName: "notes.bin",
    content: utf16Bytes,
  });

  const { ctx: context, result: outcome } = await applyWithDisabledMedia({
    body: "<media:file>",
    mediaPath: attachmentPath,
  });

  // The file must be applied and its decoded text must appear in the body.
  expect(outcome.appliedFile).toBe(true);
  expect(context.Body).toContain("hello from utf16 text");
});
|
||||
|
||||
it("does not reclassify PDF attachments as text/plain", async () => {
|
||||
const pseudoPdf = Buffer.from("%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\nendobj\n", "utf8");
|
||||
const filePath = await createTempMediaFile({
|
||||
|
||||
@@ -248,6 +248,20 @@ function looksLikeUtf8Text(buffer?: Buffer): boolean {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Reports whether a buffer opens with a ZIP container signature.
 *
 * Only the first four bytes are inspected: the ASCII pair "PK" (0x50 0x4b)
 * followed by a two-byte record marker. The recognized markers are the local
 * file header, the central-directory header, the end-of-central-directory
 * record (the first bytes of an empty archive), and the spanning marker that
 * precedes the first local header in a split/spanned archive.
 *
 * @param buffer - Raw attachment bytes; may be undefined or empty.
 * @returns `true` when the buffer starts with one of the recognized ZIP
 *   signatures, `false` otherwise (including undefined, empty, or buffers
 *   shorter than four bytes).
 */
function hasSuspiciousBinarySignal(buffer?: Buffer): boolean {
  // Every signature checked here is exactly four bytes long, so anything
  // shorter cannot match.
  if (!buffer || buffer.length < 4) {
    return false;
  }
  // All ZIP record signatures begin with "PK".
  if (buffer[0] !== 0x50 || buffer[1] !== 0x4b) {
    return false;
  }
  // Fold bytes 2 and 3 into one big-endian value for a single comparison.
  const marker = (buffer[2] << 8) | buffer[3];
  // Cover the ZIP markers that can legitimately appear at offset 0 so archive
  // payloads cannot slip past text coercion when MIME detection is weak.
  switch (marker) {
    case 0x0304: // local file header
    case 0x0102: // central-directory file header
    case 0x0506: // end of central directory (empty archive)
    case 0x0708: // spanning marker at the start of a split/spanned archive
      return true;
    default:
      return false;
  }
}
|
||||
|
||||
function decodeTextSample(buffer?: Buffer): string {
|
||||
if (!buffer || buffer.length === 0) {
|
||||
return "";
|
||||
@@ -312,6 +326,9 @@ function isBinaryMediaMime(mime?: string): boolean {
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
if (mime.endsWith("+zip")) {
|
||||
return true;
|
||||
}
|
||||
if (mime.startsWith("application/vnd.")) {
|
||||
// Keep vendor +json/+xml payloads eligible for text extraction while
|
||||
// treating the common binary vendor family (Office, archives, etc.) as binary.
|
||||
@@ -372,6 +389,9 @@ async function extractFileBlocks(params: {
|
||||
if (!forcedTextMimeResolved && isBinaryMediaMime(normalizedRawMime)) {
|
||||
continue;
|
||||
}
|
||||
if (hasSuspiciousBinarySignal(bufferResult?.buffer)) {
|
||||
continue;
|
||||
}
|
||||
const utf16Charset = resolveUtf16Charset(bufferResult?.buffer);
|
||||
const textSample = decodeTextSample(bufferResult?.buffer);
|
||||
// Do not coerce real PDFs into text/plain via printable-byte heuristics.
|
||||
|
||||
Reference in New Issue
Block a user