fix(whatsapp): honor forceDocument flag end-to-end (#79272)

Merged via squash.

Prepared head SHA: faaff35f1e
Co-authored-by: itsuzef <53057646+itsuzef@users.noreply.github.com>
Co-authored-by: mcaxtr <7562095+mcaxtr@users.noreply.github.com>
Reviewed-by: @mcaxtr
This commit is contained in:
Youssef Hemimy
2026-05-16 22:29:01 -04:00
committed by GitHub
parent c1bc6adfaa
commit 94ed68bc76
20 changed files with 230 additions and 36 deletions

View File

@@ -2,6 +2,12 @@
Docs: https://docs.openclaw.ai
## Unreleased
### Fixes
- WhatsApp: honor forced document delivery for outbound image, GIF, and video media so `forceDocument`/`asDocument` sends preserve original media bytes instead of using compressed media payloads. (#79272) Thanks @itsuzef.
## 2026.5.17
### Changes

View File

@@ -393,6 +393,7 @@ When the linked self number is also present in `allowFrom`, WhatsApp self-chat s
- non-Ogg audio, including Microsoft Edge TTS MP3/WebM output, is transcoded with `ffmpeg` to 48 kHz mono Ogg/Opus before PTT delivery
- `/tts latest` sends the latest assistant reply as one voice note and suppresses repeat sends for the same reply; `/tts chat on|off|default` controls auto-TTS for the current WhatsApp chat
- animated GIF playback is supported via `gifPlayback: true` on video sends
- `forceDocument` / `asDocument` sends outbound images, GIFs, and videos through the Baileys document payload to avoid WhatsApp media compression while preserving the resolved filename and MIME type
- captions are applied to the first media item when sending multi-media reply payloads, except PTT voice notes send the audio first and visible text separately because WhatsApp clients do not render voice-note captions consistently
- media source can be HTTP(S), `file://`, or local paths
@@ -402,7 +403,7 @@ When the linked self number is also present in `allowFrom`, WhatsApp self-chat s
- inbound media save cap: `channels.whatsapp.mediaMaxMb` (default `50`)
- outbound media send cap: `channels.whatsapp.mediaMaxMb` (default `50`)
- per-account overrides use `channels.whatsapp.accounts.<accountId>.mediaMaxMb`
- images are auto-optimized (resize/quality sweep) to fit limits
- images are auto-optimized (resize/quality sweep) to fit limits unless `forceDocument` / `asDocument` requests document delivery
- on media send failure, first-item fallback sends text warning instead of dropping the response silently
</Accordion>

View File

@@ -72,7 +72,7 @@ Name lookup:
- Optional: `--media`, `--presentation`, `--delivery`, `--pin`, `--reply-to`, `--thread-id`, `--gif-playback`, `--force-document`, `--silent`
- Shared presentation payloads: `--presentation` sends semantic blocks (`text`, `context`, `divider`, `buttons`, `select`) that core renders through the selected channel's declared capabilities. See [Message Presentation](/plugins/message-presentation).
- Generic delivery preferences: `--delivery` accepts delivery hints such as `{ "pin": true }`; `--pin` is shorthand for pinned delivery when the channel supports it.
- Telegram only: `--force-document` (send images, GIFs, and videos as documents to avoid Telegram compression)
- Telegram + WhatsApp: `--force-document` (send images, GIFs, and videos as documents to avoid channel compression)
- Telegram only: `--thread-id` (forum topic id)
- Slack only: `--thread-id` (thread timestamp; `--reply-to` uses the same field)
- Telegram + Discord: `--silent`
@@ -302,7 +302,7 @@ openclaw message send --channel msteams \
--presentation '{"title":"Status update","blocks":[{"type":"text","text":"Build completed"}]}'
```
Send a Telegram image as a document to avoid compression:
Send a Telegram or WhatsApp image as a document to avoid compression:
```bash
openclaw message send --channel telegram --target @mychat \

View File

@@ -123,6 +123,37 @@ describe("createWebSendApi", () => {
});
});
// An image buffer sent with `asDocument: true` is expected to reach the socket as a
// `document` payload carrying the supplied fileName, the caption text, and the
// original MIME type.
it("sends visual media as document when sendOptions.asDocument is true", async () => {
const payload = Buffer.from("img");
await api.sendMessage("+1555", "promo", payload, "image/png", {
asDocument: true,
fileName: "promo.png",
});
// objectContaining: only the listed fields are pinned; extra fields are tolerated.
expect(sendMessage).toHaveBeenCalledWith(
"1555@s.whatsapp.net",
expect.objectContaining({
document: payload,
fileName: "promo.png",
caption: "promo",
mimetype: "image/png",
}),
);
});
// Audio is not eligible for forced document delivery: even with `asDocument: true`
// the socket is expected to receive the push-to-talk audio payload.
it("does not force audio media onto the document branch", async () => {
const payload = Buffer.from("aud");
await api.sendMessage("+1555", "voice", payload, "audio/ogg", {
asDocument: true,
fileName: "voice.ogg",
});
// Exact object match: no `document`, `fileName`, or `caption` keys are expected here.
expect(sendMessage).toHaveBeenCalledWith("1555@s.whatsapp.net", {
audio: payload,
ptt: true,
mimetype: "audio/ogg",
});
});
it("sends plain text messages", async () => {
const res = await api.sendMessage("+1555", "hello");
expect(sendMessage).toHaveBeenCalledWith("1555@s.whatsapp.net", { text: "hello" });
@@ -199,6 +230,39 @@ describe("createWebSendApi", () => {
});
});
// Forced-document sends must use the mention-resolved caption: the `@+<phone>` token in
// the input text is expected to appear as the participant's LID user part in the
// caption, with the matching `@lid` JID listed in `mentions`.
it("uses resolved mention caption text for forced-document media", async () => {
// Rebuild the API with a mention resolver backed by a single known group participant.
api = createWebSendApi({
sock: { sendMessage, sendPresenceUpdate },
defaultAccountId: "main",
resolveOutboundMentions: ({ jid, text }) =>
resolveWhatsAppOutboundMentions({
chatJid: jid,
text,
participants: [
{
id: "277038292303944:4@lid",
phoneNumber: "5511976136970@s.whatsapp.net",
},
],
}),
});
const payload = Buffer.from("img");
await api.sendMessage("120363000000000000@g.us", "cap @+5511976136970", payload, "image/jpeg", {
asDocument: true,
fileName: "promo.jpg",
});
expectFirstSendJid("120363000000000000@g.us");
expectSendContentFields(0, {
document: payload,
fileName: "promo.jpg",
caption: "cap @277038292303944",
mimetype: "image/jpeg",
mentions: ["277038292303944@lid"],
});
});
it("supports audio as push-to-talk voice note", async () => {
const payload = Buffer.from("aud");
await api.sendMessage("+1555", "", payload, "audio/ogg", { accountId: "alt" });

View File

@@ -27,6 +27,10 @@ function recordWhatsAppOutbound(accountId: string) {
});
}
/**
 * Reports whether a MIME type is eligible for forced document delivery.
 *
 * Only visual media qualifies: the type must begin with `image/` or `video/`.
 * The check is a case-sensitive prefix match on the string exactly as given.
 *
 * @param mediaType - MIME type of the outbound media buffer.
 * @returns `true` for image and video types, `false` for everything else.
 */
function supportsForcedDocumentMediaType(mediaType: string): boolean {
  const visualPrefixes = ["image/", "video/"];
  return visualPrefixes.some((prefix) => mediaType.startsWith(prefix));
}
export function createWebSendApi(params: {
sock: {
sendMessage: (
@@ -79,7 +83,15 @@ export function createWebSendApi(params: {
? { text, mentionedJids: [] }
: await resolveMentions(jid, text);
if (mediaBuffer && mediaType) {
if (mediaType.startsWith("image/")) {
if (sendOptions?.asDocument === true && supportsForcedDocumentMediaType(mediaType)) {
const fileName = sendOptions?.fileName?.trim() || "file";
payload = {
document: mediaBuffer,
fileName,
caption: resolvedPayloadText.text || undefined,
mimetype: mediaType,
};
} else if (mediaType.startsWith("image/")) {
payload = {
image: mediaBuffer,
caption: resolvedPayloadText.text || undefined,

View File

@@ -21,6 +21,7 @@ export type ActiveWebSendOptions = {
gifPlayback?: boolean;
accountId?: string;
fileName?: string;
asDocument?: boolean;
};
export type ActiveWebListener = {

View File

@@ -32,6 +32,7 @@ type WhatsAppSendTextOptions = {
mediaReadFile?: (filePath: string) => Promise<Buffer>;
gifPlayback?: boolean;
audioAsVoice?: boolean;
forceDocument?: boolean;
accountId?: string;
quotedMessageKey?: {
id: string;
@@ -192,6 +193,7 @@ export function createWhatsAppOutboundBase({
accountId,
deps,
gifPlayback,
forceDocument,
replyToId,
}) => {
const send =
@@ -214,6 +216,7 @@ export function createWhatsAppOutboundBase({
...(audioAsVoice === undefined ? {} : { audioAsVoice }),
accountId: accountId ?? undefined,
gifPlayback,
forceDocument,
quotedMessageKey,
});
},

View File

@@ -124,7 +124,7 @@ function normalizeWhatsAppLoadedMedia(
const fileName =
kind === "document"
? (media.fileName ?? deriveWhatsAppDocumentFileName(mediaUrl) ?? "file")
: undefined;
: media.fileName;
return {
buffer: media.buffer,
kind,

View File

@@ -10,6 +10,7 @@ export async function loadOutboundMediaFromUrl(
};
mediaLocalRoots?: readonly string[];
mediaReadFile?: (filePath: string) => Promise<Buffer>;
optimizeImages?: boolean;
} = {},
) {
const readFile = options.mediaAccess?.readFile ?? options.mediaReadFile;
@@ -19,17 +20,21 @@ export async function loadOutboundMediaFromUrl(
: options.mediaLocalRoots && options.mediaLocalRoots.length > 0
? options.mediaLocalRoots
: undefined;
const sharedOptions = {
...(options.maxBytes !== undefined ? { maxBytes: options.maxBytes } : {}),
...(options.optimizeImages !== undefined ? { optimizeImages: options.optimizeImages } : {}),
};
return await loadWebMedia(
mediaUrl,
readFile
? {
...(options.maxBytes !== undefined ? { maxBytes: options.maxBytes } : {}),
...sharedOptions,
localRoots: "any",
readFile,
hostReadCapability: true,
}
: {
...(options.maxBytes !== undefined ? { maxBytes: options.maxBytes } : {}),
...sharedOptions,
...(localRoots ? { localRoots } : {}),
},
);

View File

@@ -107,6 +107,7 @@ describe("web outbound", () => {
};
mediaLocalRoots?: readonly string[];
mediaReadFile?: (filePath: string) => Promise<Buffer>;
optimizeImages?: boolean;
},
) =>
await loadWebMediaMock(mediaUrl, {
@@ -451,6 +452,89 @@ describe("web outbound", () => {
});
});
// With `forceDocument: true` and image media, the active listener's sendMessage is
// expected to receive `asDocument: true` plus the loaded fileName, and the media loader
// is expected to be called with image optimization switched off.
it("forces document branch when forceDocument is true with image media", async () => {
const buf = Buffer.from("img");
// Queue the loader result before the send so this call consumes it.
loadWebMediaMock.mockResolvedValueOnce({
buffer: buf,
contentType: "image/jpeg",
kind: "image",
fileName: "promo.jpg",
});
await sendMessageWhatsApp("+1555", "look", {
verbose: false,
cfg: WHATSAPP_TEST_CFG,
mediaUrl: "/tmp/pic.jpg",
forceDocument: true,
});
expect(sendMessage).toHaveBeenLastCalledWith("+1555", "look", buf, "image/jpeg", {
asDocument: true,
fileName: "promo.jpg",
});
// The loader must be told not to optimize the image.
expect(hoisted.loadOutboundMediaFromUrl).toHaveBeenCalledWith(
"/tmp/pic.jpg",
expect.objectContaining({ optimizeImages: false }),
);
});
// Video media follows the same forced-document route as images: the send options are
// expected to carry `asDocument: true` and the loaded fileName.
it("forces document branch when forceDocument is true with video media", async () => {
const buf = Buffer.from("video");
// Queue the loader result before the send so this call consumes it.
loadWebMediaMock.mockResolvedValueOnce({
buffer: buf,
contentType: "video/mp4",
kind: "video",
fileName: "clip.mp4",
});
await sendMessageWhatsApp("+1555", "watch", {
verbose: false,
cfg: WHATSAPP_TEST_CFG,
mediaUrl: "/tmp/clip.mp4",
forceDocument: true,
});
expect(sendMessage).toHaveBeenLastCalledWith("+1555", "watch", buf, "video/mp4", {
asDocument: true,
fileName: "clip.mp4",
});
});
// When the loaded media has no fileName, a forced-document send is expected to use the
// literal default name "file".
it("falls back to a default filename when forceDocument media has no fileName", async () => {
const buf = Buffer.from("img");
// The mocked loader result deliberately omits `fileName`.
loadWebMediaMock.mockResolvedValueOnce({
buffer: buf,
contentType: "image/png",
kind: "image",
});
await sendMessageWhatsApp("+1555", "promo", {
verbose: false,
cfg: WHATSAPP_TEST_CFG,
mediaUrl: "/tmp/pic.png",
forceDocument: true,
});
expect(sendMessage).toHaveBeenLastCalledWith("+1555", "promo", buf, "image/png", {
asDocument: true,
fileName: "file",
});
});
// `forceDocument` must not affect audio: the voice note is expected to go out first with
// empty text, followed by a separate text-only send carrying the caption. Neither call
// is expected to receive a send-options argument.
it("keeps audio on the voice-note path when forceDocument is true", async () => {
const buf = Buffer.from("audio");
// Queue the loader result before the send so this call consumes it.
loadWebMediaMock.mockResolvedValueOnce({
buffer: buf,
contentType: "audio/ogg",
kind: "audio",
fileName: "voice.ogg",
});
await sendMessageWhatsApp("+1555", "voice note", {
verbose: false,
cfg: WHATSAPP_TEST_CFG,
mediaUrl: "/tmp/voice.ogg",
forceDocument: true,
});
// First call: audio only, sent with the "audio/ogg; codecs=opus" MIME type.
expect(sendMessage).toHaveBeenNthCalledWith(1, "+1555", "", buf, "audio/ogg; codecs=opus");
// Second call: the visible text, with no media attached.
expect(sendMessage).toHaveBeenNthCalledWith(2, "+1555", "voice note", undefined, undefined);
});
it("uses account-aware WhatsApp media caps for outbound uploads", async () => {
hoisted.controllerListeners.set("work", {
sendComposingTo,

View File

@@ -27,6 +27,10 @@ import { markdownToWhatsApp, toWhatsappJid } from "./text-runtime.js";
const outboundLog = createSubsystemLogger("gateway/channels/whatsapp").child("outbound");
/**
 * Reports whether a resolved media kind may be rerouted to document delivery.
 *
 * @param kind - Media kind of the prepared outbound media.
 * @returns `true` for `"image"` and `"video"`; `false` for `"audio"` and `"document"`.
 */
function supportsForcedDocumentDelivery(kind: "image" | "audio" | "video" | "document"): boolean {
  switch (kind) {
    case "image":
    case "video":
      return true;
    default:
      return false;
  }
}
function resolveOutboundWhatsAppAccountId(params: {
cfg: OpenClawConfig;
accountId?: string;
@@ -70,6 +74,7 @@ export async function sendMessageWhatsApp(
mediaReadFile?: (filePath: string) => Promise<Buffer>;
gifPlayback?: boolean;
audioAsVoice?: boolean;
forceDocument?: boolean;
accountId?: string;
quotedMessageKey?: {
id: string;
@@ -118,10 +123,12 @@ export async function sendMessageWhatsApp(
let mediaType: string | undefined;
let documentFileName: string | undefined;
let visibleTextAfterVoice: string | undefined;
let forceDocumentDelivery = false;
if (primaryMediaUrl) {
const media = await prepareWhatsAppOutboundMedia(
await loadOutboundMediaFromUrl(primaryMediaUrl, {
maxBytes: resolveWhatsAppMediaMaxBytes(account),
optimizeImages: options.forceDocument ? false : undefined,
mediaAccess: options.mediaAccess,
mediaLocalRoots: options.mediaLocalRoots,
mediaReadFile: options.mediaReadFile,
@@ -131,6 +138,9 @@ export async function sendMessageWhatsApp(
const caption = text || undefined;
mediaBuffer = media.buffer;
mediaType = media.mimetype;
forceDocumentDelivery = Boolean(
options.forceDocument && supportsForcedDocumentDelivery(media.kind),
);
if (media.kind === "audio" && caption) {
visibleTextAfterVoice = caption;
text = "";
@@ -140,6 +150,9 @@ export async function sendMessageWhatsApp(
} else {
text = caption ?? "";
}
if (forceDocumentDelivery) {
documentFileName ??= media.fileName ?? "file";
}
}
outboundLog.info(`Sending message -> ${redactedJid}${primaryMediaUrl ? " (media)" : ""}`);
logger.info({ jid: redactedJid, hasMedia: Boolean(primaryMediaUrl) }, "sending message");
@@ -149,9 +162,14 @@ export async function sendMessageWhatsApp(
const hasExplicitAccountId = Boolean(options.accountId?.trim());
const accountId = hasExplicitAccountId ? resolvedAccountId : undefined;
const sendOptions: ActiveWebSendOptions | undefined =
options.gifPlayback || accountId || documentFileName || options.quotedMessageKey
options.gifPlayback ||
forceDocumentDelivery ||
accountId ||
documentFileName ||
options.quotedMessageKey
? {
...(options.gifPlayback ? { gifPlayback: true } : {}),
...(forceDocumentDelivery ? { asDocument: true } : {}),
...(documentFileName ? { fileName: documentFileName } : {}),
...(options.quotedMessageKey ? { quotedMessageKey: options.quotedMessageKey } : {}),
accountId,

View File

@@ -219,13 +219,13 @@ function buildSendSchema(options: { includePresentation: boolean; includeDeliver
gifPlayback: Type.Optional(Type.Boolean()),
forceDocument: Type.Optional(
Type.Boolean({
description: "Send image/GIF as document to avoid Telegram compression (Telegram only).",
description: "Send image/GIF/video as document to avoid channel compression.",
}),
),
asDocument: Type.Optional(
Type.Boolean({
description:
"Send image/GIF as document to avoid Telegram compression. Alias for forceDocument (Telegram only).",
"Send image/GIF/video as document to avoid channel compression. Alias for forceDocument.",
}),
),
};

View File

@@ -23,7 +23,7 @@ export type ChannelOutboundContext = {
mediaLocalRoots?: readonly string[];
mediaReadFile?: (filePath: string) => Promise<Buffer>;
gifPlayback?: boolean;
/** Send image as document to avoid Telegram compression. */
/** Send image, GIF, or video as document to avoid channel compression. */
forceDocument?: boolean;
replyToId?: string | null;
replyToIdSource?: "explicit" | "implicit";

View File

@@ -26,7 +26,7 @@ export function registerMessageSendCommand(message: Command, helpers: MessageCli
.option("--gif-playback", "Treat video media as GIF playback (WhatsApp only).", false)
.option(
"--force-document",
"Send media as document to avoid Telegram compression (Telegram only). Applies to images and GIFs.",
"Send media as document to avoid channel compression (Telegram, WhatsApp). Applies to images, GIFs, and videos.",
false,
)
.option(

View File

@@ -625,7 +625,7 @@
"type": "string"
},
"asDocument": {
"description": "Send image/GIF as document to avoid Telegram compression. Alias for forceDocument (Telegram only).",
"description": "Send image/GIF/video as document to avoid channel compression. Alias for forceDocument.",
"type": "boolean"
},
"asVoice": {
@@ -702,7 +702,7 @@
"type": "string"
},
"forceDocument": {
"description": "Send image/GIF as document to avoid Telegram compression (Telegram only).",
"description": "Send image/GIF/video as document to avoid channel compression.",
"type": "boolean"
},
"gatewayToken": {

View File

@@ -625,7 +625,7 @@
"type": "string"
},
"asDocument": {
"description": "Send image/GIF as document to avoid Telegram compression. Alias for forceDocument (Telegram only).",
"description": "Send image/GIF/video as document to avoid channel compression. Alias for forceDocument.",
"type": "boolean"
},
"asVoice": {
@@ -702,7 +702,7 @@
"type": "string"
},
"forceDocument": {
"description": "Send image/GIF as document to avoid Telegram compression (Telegram only).",
"description": "Send image/GIF/video as document to avoid channel compression.",
"type": "boolean"
},
"gatewayToken": {

View File

@@ -625,7 +625,7 @@
"type": "string"
},
"asDocument": {
"description": "Send image/GIF as document to avoid Telegram compression. Alias for forceDocument (Telegram only).",
"description": "Send image/GIF/video as document to avoid channel compression. Alias for forceDocument.",
"type": "boolean"
},
"asVoice": {
@@ -702,7 +702,7 @@
"type": "string"
},
"forceDocument": {
"description": "Send image/GIF as document to avoid Telegram compression (Telegram only).",
"description": "Send image/GIF/video as document to avoid channel compression.",
"type": "boolean"
},
"gatewayToken": {

View File

@@ -217,8 +217,8 @@ This is the deterministic model-bound layer stack OpenClaw can snapshot for the
"roughTokens": 140
},
"dynamicToolsJson": {
"chars": 44373,
"roughTokens": 11094
"chars": 44351,
"roughTokens": 11088
},
"openClawDeveloperInstructions": {
"chars": 5436,
@@ -229,8 +229,8 @@ This is the deterministic model-bound layer stack OpenClaw can snapshot for the
"roughTokens": 7129
},
"totalWithDynamicToolsJson": {
"chars": 72891,
"roughTokens": 18223
"chars": 72869,
"roughTokens": 18218
},
"userInputText": {
"chars": 870,
@@ -602,7 +602,7 @@ Full JSON: `codex-dynamic-tools.discord-group.json`
"type": "string"
},
"asDocument": {
"description": "Send image/GIF as document to avoid Telegram compression. Alias for forceDocument (Telegram only).",
"description": "Send image/GIF/video as document to avoid channel compression. Alias for forceDocument.",
"type": "boolean"
},
"asVoice": {
@@ -679,7 +679,7 @@ Full JSON: `codex-dynamic-tools.discord-group.json`
"type": "string"
},
"forceDocument": {
"description": "Send image/GIF as document to avoid Telegram compression (Telegram only).",
"description": "Send image/GIF/video as document to avoid channel compression.",
"type": "boolean"
},
"gatewayToken": {

View File

@@ -217,8 +217,8 @@ This is the deterministic model-bound layer stack OpenClaw can snapshot for the
"roughTokens": 140
},
"dynamicToolsJson": {
"chars": 44064,
"roughTokens": 11016
"chars": 44042,
"roughTokens": 11011
},
"openClawDeveloperInstructions": {
"chars": 4412,
@@ -229,8 +229,8 @@ This is the deterministic model-bound layer stack OpenClaw can snapshot for the
"roughTokens": 6748
},
"totalWithDynamicToolsJson": {
"chars": 71058,
"roughTokens": 17765
"chars": 71036,
"roughTokens": 17759
},
"userInputText": {
"chars": 370,
@@ -579,7 +579,7 @@ Full JSON: `codex-dynamic-tools.telegram-direct.json`
"type": "string"
},
"asDocument": {
"description": "Send image/GIF as document to avoid Telegram compression. Alias for forceDocument (Telegram only).",
"description": "Send image/GIF/video as document to avoid channel compression. Alias for forceDocument.",
"type": "boolean"
},
"asVoice": {
@@ -656,7 +656,7 @@ Full JSON: `codex-dynamic-tools.telegram-direct.json`
"type": "string"
},
"forceDocument": {
"description": "Send image/GIF as document to avoid Telegram compression (Telegram only).",
"description": "Send image/GIF/video as document to avoid channel compression.",
"type": "boolean"
},
"gatewayToken": {

View File

@@ -218,8 +218,8 @@ This is the deterministic model-bound layer stack OpenClaw can snapshot for the
"roughTokens": 140
},
"dynamicToolsJson": {
"chars": 45242,
"roughTokens": 11311
"chars": 45220,
"roughTokens": 11305
},
"openClawDeveloperInstructions": {
"chars": 4412,
@@ -230,8 +230,8 @@ This is the deterministic model-bound layer stack OpenClaw can snapshot for the
"roughTokens": 7155
},
"totalWithDynamicToolsJson": {
"chars": 73863,
"roughTokens": 18466
"chars": 73841,
"roughTokens": 18461
},
"userInputText": {
"chars": 608,
@@ -596,7 +596,7 @@ Full JSON: `codex-dynamic-tools.heartbeat-turn.json`
"type": "string"
},
"asDocument": {
"description": "Send image/GIF as document to avoid Telegram compression. Alias for forceDocument (Telegram only).",
"description": "Send image/GIF/video as document to avoid channel compression. Alias for forceDocument.",
"type": "boolean"
},
"asVoice": {
@@ -673,7 +673,7 @@ Full JSON: `codex-dynamic-tools.heartbeat-turn.json`
"type": "string"
},
"forceDocument": {
"description": "Send image/GIF as document to avoid Telegram compression (Telegram only).",
"description": "Send image/GIF/video as document to avoid channel compression.",
"type": "boolean"
},
"gatewayToken": {