fix: complete whatsapp forced document delivery

This commit is contained in:
Marcus Castro
2026-05-16 19:56:58 -03:00
parent e37460474d
commit faaff35f1e
16 changed files with 150 additions and 39 deletions

View File

@@ -393,6 +393,7 @@ When the linked self number is also present in `allowFrom`, WhatsApp self-chat s
- non-Ogg audio, including Microsoft Edge TTS MP3/WebM output, is transcoded with `ffmpeg` to 48 kHz mono Ogg/Opus before PTT delivery
- `/tts latest` sends the latest assistant reply as one voice note and suppresses repeat sends for the same reply; `/tts chat on|off|default` controls auto-TTS for the current WhatsApp chat
- animated GIF playback is supported via `gifPlayback: true` on video sends
- `forceDocument` / `asDocument` sends outbound images, GIFs, and videos through the Baileys document payload to avoid WhatsApp media compression while preserving the resolved filename and MIME type
- captions are applied to the first media item when sending multi-media reply payloads, except that PTT voice notes send the audio first and the visible text separately, because WhatsApp clients do not render voice-note captions consistently
- media source can be HTTP(S), `file://`, or local paths
@@ -402,7 +403,7 @@ When the linked self number is also present in `allowFrom`, WhatsApp self-chat s
- inbound media save cap: `channels.whatsapp.mediaMaxMb` (default `50`)
- outbound media send cap: `channels.whatsapp.mediaMaxMb` (default `50`)
- per-account overrides use `channels.whatsapp.accounts.<accountId>.mediaMaxMb`
- images are auto-optimized (resize/quality sweep) to fit limits
- images are auto-optimized (resize/quality sweep) to fit limits unless `forceDocument` / `asDocument` requests document delivery
- on media send failure, the first-item fallback sends a text warning instead of dropping the response silently
</Accordion>

View File

@@ -302,7 +302,7 @@ openclaw message send --channel msteams \
--presentation '{"title":"Status update","blocks":[{"type":"text","text":"Build completed"}]}'
```
Send a Telegram image as a document to avoid compression:
Send a Telegram or WhatsApp image as a document to avoid compression:
```bash
openclaw message send --channel telegram --target @mychat \

View File

@@ -123,7 +123,7 @@ describe("createWebSendApi", () => {
});
});
it("sends as document when sendOptions.asDocument is true regardless of MIME", async () => {
it("sends visual media as document when sendOptions.asDocument is true", async () => {
const payload = Buffer.from("img");
await api.sendMessage("+1555", "promo", payload, "image/png", {
asDocument: true,
@@ -140,6 +140,20 @@ describe("createWebSendApi", () => {
);
});
it("does not force audio media onto the document branch", async () => {
  // asDocument is requested here, yet the expected content is still a PTT audio message.
  const audioBytes = Buffer.from("aud");

  await api.sendMessage("+1555", "voice", audioBytes, "audio/ogg", {
    asDocument: true,
    fileName: "voice.ogg",
  });

  const expectedJid = "1555@s.whatsapp.net";
  const expectedContent = {
    audio: audioBytes,
    ptt: true,
    mimetype: "audio/ogg",
  };
  expect(sendMessage).toHaveBeenCalledWith(expectedJid, expectedContent);
});
it("sends plain text messages", async () => {
const res = await api.sendMessage("+1555", "hello");
expect(sendMessage).toHaveBeenCalledWith("1555@s.whatsapp.net", { text: "hello" });
@@ -216,6 +230,39 @@ describe("createWebSendApi", () => {
});
});
it("uses resolved mention caption text for forced-document media", async () => {
  const groupJid = "120363000000000000@g.us";

  // Rebuild the API with a mention resolver that knows a single group participant.
  api = createWebSendApi({
    sock: { sendMessage, sendPresenceUpdate },
    defaultAccountId: "main",
    resolveOutboundMentions: ({ jid, text }) => {
      return resolveWhatsAppOutboundMentions({
        chatJid: jid,
        text,
        participants: [
          {
            id: "277038292303944:4@lid",
            phoneNumber: "5511976136970@s.whatsapp.net",
          },
        ],
      });
    },
  });

  const imageBytes = Buffer.from("img");
  await api.sendMessage(groupJid, "cap @+5511976136970", imageBytes, "image/jpeg", {
    asDocument: true,
    fileName: "promo.jpg",
  });

  // The document payload is expected to carry the resolved caption and its mentions.
  expectFirstSendJid(groupJid);
  expectSendContentFields(0, {
    document: imageBytes,
    fileName: "promo.jpg",
    caption: "cap @277038292303944",
    mimetype: "image/jpeg",
    mentions: ["277038292303944@lid"],
  });
});
it("supports audio as push-to-talk voice note", async () => {
const payload = Buffer.from("aud");
await api.sendMessage("+1555", "", payload, "audio/ogg", { accountId: "alt" });

View File

@@ -27,6 +27,10 @@ function recordWhatsAppOutbound(accountId: string) {
});
}
/**
 * Reports whether a MIME type belongs to one of the visual media families
 * (image or video).
 *
 * @param mediaType - MIME type string of the media being sent.
 * @returns `true` when the string starts with `image/` or `video/`; the
 *   comparison is case-sensitive.
 */
function supportsForcedDocumentMediaType(mediaType: string): boolean {
  const visualPrefixes = ["image/", "video/"];
  return visualPrefixes.some((prefix) => mediaType.startsWith(prefix));
}
export function createWebSendApi(params: {
sock: {
sendMessage: (
@@ -79,12 +83,12 @@ export function createWebSendApi(params: {
? { text, mentionedJids: [] }
: await resolveMentions(jid, text);
if (mediaBuffer && mediaType) {
if (sendOptions?.asDocument === true) {
if (sendOptions?.asDocument === true && supportsForcedDocumentMediaType(mediaType)) {
const fileName = sendOptions?.fileName?.trim() || "file";
payload = {
document: mediaBuffer,
fileName,
caption: text || undefined,
caption: resolvedPayloadText.text || undefined,
mimetype: mediaType,
};
} else if (mediaType.startsWith("image/")) {

View File

@@ -10,6 +10,7 @@ export async function loadOutboundMediaFromUrl(
};
mediaLocalRoots?: readonly string[];
mediaReadFile?: (filePath: string) => Promise<Buffer>;
optimizeImages?: boolean;
} = {},
) {
const readFile = options.mediaAccess?.readFile ?? options.mediaReadFile;
@@ -19,17 +20,21 @@ export async function loadOutboundMediaFromUrl(
: options.mediaLocalRoots && options.mediaLocalRoots.length > 0
? options.mediaLocalRoots
: undefined;
const sharedOptions = {
...(options.maxBytes !== undefined ? { maxBytes: options.maxBytes } : {}),
...(options.optimizeImages !== undefined ? { optimizeImages: options.optimizeImages } : {}),
};
return await loadWebMedia(
mediaUrl,
readFile
? {
...(options.maxBytes !== undefined ? { maxBytes: options.maxBytes } : {}),
...sharedOptions,
localRoots: "any",
readFile,
hostReadCapability: true,
}
: {
...(options.maxBytes !== undefined ? { maxBytes: options.maxBytes } : {}),
...sharedOptions,
...(localRoots ? { localRoots } : {}),
},
);

View File

@@ -107,6 +107,7 @@ describe("web outbound", () => {
};
mediaLocalRoots?: readonly string[];
mediaReadFile?: (filePath: string) => Promise<Buffer>;
optimizeImages?: boolean;
},
) =>
await loadWebMediaMock(mediaUrl, {
@@ -469,6 +470,30 @@ describe("web outbound", () => {
asDocument: true,
fileName: "promo.jpg",
});
expect(hoisted.loadOutboundMediaFromUrl).toHaveBeenCalledWith(
"/tmp/pic.jpg",
expect.objectContaining({ optimizeImages: false }),
);
});
it("forces document branch when forceDocument is true with video media", async () => {
  const videoBytes = Buffer.from("video");
  const recipient = "+1555";

  // The next media load resolves to an MP4 video with a known file name.
  loadWebMediaMock.mockResolvedValueOnce({
    buffer: videoBytes,
    contentType: "video/mp4",
    kind: "video",
    fileName: "clip.mp4",
  });

  await sendMessageWhatsApp(recipient, "watch", {
    verbose: false,
    cfg: WHATSAPP_TEST_CFG,
    mediaUrl: "/tmp/clip.mp4",
    forceDocument: true,
  });

  // The send call is expected to carry asDocument together with the loaded file name.
  expect(sendMessage).toHaveBeenLastCalledWith(recipient, "watch", videoBytes, "video/mp4", {
    asDocument: true,
    fileName: "clip.mp4",
  });
});
it("falls back to a default filename when forceDocument media has no fileName", async () => {
@@ -490,6 +515,26 @@ describe("web outbound", () => {
});
});
it("keeps audio on the voice-note path when forceDocument is true", async () => {
  const voiceBytes = Buffer.from("audio");
  const recipient = "+1555";

  // The next media load resolves to Ogg audio.
  loadWebMediaMock.mockResolvedValueOnce({
    buffer: voiceBytes,
    contentType: "audio/ogg",
    kind: "audio",
    fileName: "voice.ogg",
  });

  await sendMessageWhatsApp(recipient, "voice note", {
    verbose: false,
    cfg: WHATSAPP_TEST_CFG,
    mediaUrl: "/tmp/voice.ogg",
    forceDocument: true,
  });

  // First call: audio with empty text and no send options. Second call: the visible text alone.
  expect(sendMessage).toHaveBeenNthCalledWith(
    1,
    recipient,
    "",
    voiceBytes,
    "audio/ogg; codecs=opus",
  );
  expect(sendMessage).toHaveBeenNthCalledWith(2, recipient, "voice note", undefined, undefined);
});
it("uses account-aware WhatsApp media caps for outbound uploads", async () => {
hoisted.controllerListeners.set("work", {
sendComposingTo,

View File

@@ -27,6 +27,10 @@ import { markdownToWhatsApp, toWhatsappJid } from "./text-runtime.js";
const outboundLog = createSubsystemLogger("gateway/channels/whatsapp").child("outbound");
/**
 * Reports whether a media kind is one of the visual kinds (image or video).
 *
 * @param kind - Media kind classification.
 * @returns `true` for `"image"` and `"video"`, `false` for `"audio"` and `"document"`.
 */
function supportsForcedDocumentDelivery(kind: "image" | "audio" | "video" | "document"): boolean {
  switch (kind) {
    case "image":
    case "video":
      return true;
    default:
      return false;
  }
}
function resolveOutboundWhatsAppAccountId(params: {
cfg: OpenClawConfig;
accountId?: string;
@@ -119,10 +123,12 @@ export async function sendMessageWhatsApp(
let mediaType: string | undefined;
let documentFileName: string | undefined;
let visibleTextAfterVoice: string | undefined;
let forceDocumentDelivery = false;
if (primaryMediaUrl) {
const media = await prepareWhatsAppOutboundMedia(
await loadOutboundMediaFromUrl(primaryMediaUrl, {
maxBytes: resolveWhatsAppMediaMaxBytes(account),
optimizeImages: options.forceDocument ? false : undefined,
mediaAccess: options.mediaAccess,
mediaLocalRoots: options.mediaLocalRoots,
mediaReadFile: options.mediaReadFile,
@@ -132,7 +138,10 @@ export async function sendMessageWhatsApp(
const caption = text || undefined;
mediaBuffer = media.buffer;
mediaType = media.mimetype;
if (media.kind === "audio" && caption && !options.forceDocument) {
forceDocumentDelivery = Boolean(
options.forceDocument && supportsForcedDocumentDelivery(media.kind),
);
if (media.kind === "audio" && caption) {
visibleTextAfterVoice = caption;
text = "";
} else if (media.kind === "document") {
@@ -141,7 +150,7 @@ export async function sendMessageWhatsApp(
} else {
text = caption ?? "";
}
if (options.forceDocument) {
if (forceDocumentDelivery) {
documentFileName ??= media.fileName ?? "file";
}
}
@@ -154,13 +163,13 @@ export async function sendMessageWhatsApp(
const accountId = hasExplicitAccountId ? resolvedAccountId : undefined;
const sendOptions: ActiveWebSendOptions | undefined =
options.gifPlayback ||
options.forceDocument ||
forceDocumentDelivery ||
accountId ||
documentFileName ||
options.quotedMessageKey
? {
...(options.gifPlayback ? { gifPlayback: true } : {}),
...(options.forceDocument ? { asDocument: true } : {}),
...(forceDocumentDelivery ? { asDocument: true } : {}),
...(documentFileName ? { fileName: documentFileName } : {}),
...(options.quotedMessageKey ? { quotedMessageKey: options.quotedMessageKey } : {}),
accountId,

View File

@@ -219,13 +219,13 @@ function buildSendSchema(options: { includePresentation: boolean; includeDeliver
gifPlayback: Type.Optional(Type.Boolean()),
forceDocument: Type.Optional(
Type.Boolean({
description: "Send image/GIF as document to avoid channel compression.",
description: "Send image/GIF/video as document to avoid channel compression.",
}),
),
asDocument: Type.Optional(
Type.Boolean({
description:
"Send image/GIF as document to avoid channel compression. Alias for forceDocument.",
"Send image/GIF/video as document to avoid channel compression. Alias for forceDocument.",
}),
),
};

View File

@@ -23,7 +23,7 @@ export type ChannelOutboundContext = {
mediaLocalRoots?: readonly string[];
mediaReadFile?: (filePath: string) => Promise<Buffer>;
gifPlayback?: boolean;
/** Send image as document to avoid channel compression. */
/** Send image, GIF, or video as document to avoid channel compression. */
forceDocument?: boolean;
replyToId?: string | null;
replyToIdSource?: "explicit" | "implicit";

View File

@@ -26,7 +26,7 @@ export function registerMessageSendCommand(message: Command, helpers: MessageCli
.option("--gif-playback", "Treat video media as GIF playback (WhatsApp only).", false)
.option(
"--force-document",
"Send media as document to avoid channel compression (Telegram, WhatsApp). Applies to images and GIFs.",
"Send media as document to avoid channel compression (Telegram, WhatsApp). Applies to images, GIFs, and videos.",
false,
)
.option(

View File

@@ -625,7 +625,7 @@
"type": "string"
},
"asDocument": {
"description": "Send image/GIF as document to avoid channel compression. Alias for forceDocument.",
"description": "Send image/GIF/video as document to avoid channel compression. Alias for forceDocument.",
"type": "boolean"
},
"asVoice": {
@@ -702,7 +702,7 @@
"type": "string"
},
"forceDocument": {
"description": "Send image/GIF as document to avoid channel compression.",
"description": "Send image/GIF/video as document to avoid channel compression.",
"type": "boolean"
},
"gatewayToken": {

View File

@@ -625,7 +625,7 @@
"type": "string"
},
"asDocument": {
"description": "Send image/GIF as document to avoid channel compression. Alias for forceDocument.",
"description": "Send image/GIF/video as document to avoid channel compression. Alias for forceDocument.",
"type": "boolean"
},
"asVoice": {
@@ -702,7 +702,7 @@
"type": "string"
},
"forceDocument": {
"description": "Send image/GIF as document to avoid channel compression.",
"description": "Send image/GIF/video as document to avoid channel compression.",
"type": "boolean"
},
"gatewayToken": {

View File

@@ -625,7 +625,7 @@
"type": "string"
},
"asDocument": {
"description": "Send image/GIF as document to avoid channel compression. Alias for forceDocument.",
"description": "Send image/GIF/video as document to avoid channel compression. Alias for forceDocument.",
"type": "boolean"
},
"asVoice": {
@@ -702,7 +702,7 @@
"type": "string"
},
"forceDocument": {
"description": "Send image/GIF as document to avoid channel compression.",
"description": "Send image/GIF/video as document to avoid channel compression.",
"type": "boolean"
},
"gatewayToken": {

View File

@@ -217,8 +217,8 @@ This is the deterministic model-bound layer stack OpenClaw can snapshot for the
"roughTokens": 140
},
"dynamicToolsJson": {
"chars": 44373,
"roughTokens": 11094
"chars": 44351,
"roughTokens": 11088
},
"openClawDeveloperInstructions": {
"chars": 5436,
@@ -229,8 +229,8 @@ This is the deterministic model-bound layer stack OpenClaw can snapshot for the
"roughTokens": 7129
},
"totalWithDynamicToolsJson": {
"chars": 72891,
"roughTokens": 18223
"chars": 72869,
"roughTokens": 18218
},
"userInputText": {
"chars": 870,
@@ -602,7 +602,7 @@ Full JSON: `codex-dynamic-tools.discord-group.json`
"type": "string"
},
"asDocument": {
"description": "Send image/GIF as document to avoid channel compression. Alias for forceDocument.",
"description": "Send image/GIF/video as document to avoid channel compression. Alias for forceDocument.",
"type": "boolean"
},
"asVoice": {
@@ -679,7 +679,7 @@ Full JSON: `codex-dynamic-tools.discord-group.json`
"type": "string"
},
"forceDocument": {
"description": "Send image/GIF as document to avoid channel compression.",
"description": "Send image/GIF/video as document to avoid channel compression.",
"type": "boolean"
},
"gatewayToken": {

View File

@@ -217,8 +217,8 @@ This is the deterministic model-bound layer stack OpenClaw can snapshot for the
"roughTokens": 140
},
"dynamicToolsJson": {
"chars": 44064,
"roughTokens": 11016
"chars": 44042,
"roughTokens": 11011
},
"openClawDeveloperInstructions": {
"chars": 4412,
@@ -229,8 +229,8 @@ This is the deterministic model-bound layer stack OpenClaw can snapshot for the
"roughTokens": 6748
},
"totalWithDynamicToolsJson": {
"chars": 71058,
"roughTokens": 17765
"chars": 71036,
"roughTokens": 17759
},
"userInputText": {
"chars": 370,
@@ -579,7 +579,7 @@ Full JSON: `codex-dynamic-tools.telegram-direct.json`
"type": "string"
},
"asDocument": {
"description": "Send image/GIF as document to avoid channel compression. Alias for forceDocument.",
"description": "Send image/GIF/video as document to avoid channel compression. Alias for forceDocument.",
"type": "boolean"
},
"asVoice": {
@@ -656,7 +656,7 @@ Full JSON: `codex-dynamic-tools.telegram-direct.json`
"type": "string"
},
"forceDocument": {
"description": "Send image/GIF as document to avoid channel compression.",
"description": "Send image/GIF/video as document to avoid channel compression.",
"type": "boolean"
},
"gatewayToken": {

View File

@@ -218,8 +218,8 @@ This is the deterministic model-bound layer stack OpenClaw can snapshot for the
"roughTokens": 140
},
"dynamicToolsJson": {
"chars": 45242,
"roughTokens": 11311
"chars": 45220,
"roughTokens": 11305
},
"openClawDeveloperInstructions": {
"chars": 4412,
@@ -230,8 +230,8 @@ This is the deterministic model-bound layer stack OpenClaw can snapshot for the
"roughTokens": 7155
},
"totalWithDynamicToolsJson": {
"chars": 73863,
"roughTokens": 18466
"chars": 73841,
"roughTokens": 18461
},
"userInputText": {
"chars": 608,
@@ -596,7 +596,7 @@ Full JSON: `codex-dynamic-tools.heartbeat-turn.json`
"type": "string"
},
"asDocument": {
"description": "Send image/GIF as document to avoid channel compression. Alias for forceDocument.",
"description": "Send image/GIF/video as document to avoid channel compression. Alias for forceDocument.",
"type": "boolean"
},
"asVoice": {
@@ -673,7 +673,7 @@ Full JSON: `codex-dynamic-tools.heartbeat-turn.json`
"type": "string"
},
"forceDocument": {
"description": "Send image/GIF as document to avoid channel compression.",
"description": "Send image/GIF/video as document to avoid channel compression.",
"type": "boolean"
},
"gatewayToken": {