From 06be5eee6a23af8617353924b5da59efa0df21c5 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 2 May 2026 02:19:16 +0100 Subject: [PATCH] fix: include quoted WhatsApp media in inbound context --- CHANGELOG.md | 1 + docs/channels/whatsapp.md | 4 ++ extensions/whatsapp/src/inbound.media.test.ts | 48 +++++++++++++++++++ extensions/whatsapp/src/inbound/extract.ts | 4 +- extensions/whatsapp/src/inbound/media.ts | 26 ++++++++++ extensions/whatsapp/src/inbound/monitor.ts | 41 +++++++++------- 6 files changed, 107 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 36e3ff43d59..bf2a534ff8e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ Docs: https://docs.openclaw.ai - Gateway/sessions: move hot transcript reads and mirror appends onto async bounded IO with serialized parent-linked writes, keeping large session histories from stalling Gateway requests and channel replies. Fixes #75656. Thanks @DerFlash. - Cron/TTS: run cron announce payloads through the normal TTS directive transform before outbound delivery, so scheduled `[[tts]]` replies generate voice payloads instead of leaking raw tags. Fixes #52125. Thanks @kenchen3000. +- WhatsApp: save downloadable quoted image media from reply context as inbound media, so agents can inspect an image that a user replied to instead of only seeing `<media:image>`. Fixes #59174. Thanks @gaffner. - Doctor/WhatsApp: warn when Linux crontabs still run the legacy `ensure-whatsapp.sh` health check, which can misreport `Gateway inactive` when cron lacks the systemd user-bus environment. Fixes #60204. Thanks @mySebbe. - Slack/setup: print the generated app manifest as plain JSON instead of embedding it inside the framed setup note, so it can be copied into Slack without deleting border characters. Fixes #65751. Thanks @theDanielJLewis. 
- Channels/WhatsApp: route CLI logout through the live Gateway and stop runtime-backed listeners before channel removal, so removing a WhatsApp account does not leave the old socket replying until restart. Fixes #67746. Thanks @123Mismail. diff --git a/docs/channels/whatsapp.md b/docs/channels/whatsapp.md index 5a9662bd3a6..79f72a59d4f 100644 --- a/docs/channels/whatsapp.md +++ b/docs/channels/whatsapp.md @@ -293,6 +293,10 @@ When the linked self number is also present in `allowFrom`, WhatsApp self-chat s ``` Reply metadata fields are also populated when available (`ReplyToId`, `ReplyToBody`, `ReplyToSender`, sender JID/E.164). + When the quoted reply target is downloadable media, OpenClaw saves it through + the normal inbound media store and exposes it as `MediaPath`/`MediaType` so + the agent can inspect the referenced image instead of only seeing + `<media:image>`. diff --git a/extensions/whatsapp/src/inbound.media.test.ts b/extensions/whatsapp/src/inbound.media.test.ts index 89d03359358..9ab47cca7ba 100644 --- a/extensions/whatsapp/src/inbound.media.test.ts +++ b/extensions/whatsapp/src/inbound.media.test.ts @@ -234,6 +234,54 @@ describe("web inbound media saves with extension", () => { await listener.close(); }); + it("stores quoted image media from reply context", async () => { + const onMessage = vi.fn(); + const listener = await monitorWebInbox({ + cfg: { + channels: { whatsapp: { allowFrom: ["*"] } }, + messages: { messagePrefix: undefined, responsePrefix: undefined }, + } as never, + verbose: false, + onMessage, + accountId: "default", + authDir: path.join(HOME, "wa-auth"), + }); + const realSock = await getMockSocket(); + + realSock.ev.emit("messages.upsert", { + type: "notify", + messages: [ + { + key: { id: "quote-img-reply", fromMe: false, remoteJid: "111@g.us" }, + message: { + extendedTextMessage: { + text: "@bot what is this?", + contextInfo: { + stanzaId: "quoted-image", + participant: "222@s.whatsapp.net", + mentionedJid: ["me@s.whatsapp.net"], + 
quotedMessage: { + imageMessage: { mimetype: "image/jpeg" }, + }, + }, + }, + }, + messageTimestamp: 1_700_000_005, + }, + ], + }); + + const inbound = await waitForMessage(onMessage); + expect(inbound.replyToBody).toBe("<media:image>"); + expect(inbound.mediaPath).toBeDefined(); + expect(path.extname(inbound.mediaPath as string)).toBe(".jpg"); + expect(saveMediaBufferSpy).toHaveBeenCalled(); + const lastCall = saveMediaBufferSpy.mock.calls.at(-1); + expect(lastCall?.[1]).toBe("image/jpeg"); + + await listener.close(); + }); + it("passes mediaMaxMb to saveMediaBuffer", async () => { const onMessage = vi.fn(); const listener = await monitorWebInbox({ diff --git a/extensions/whatsapp/src/inbound/extract.ts b/extensions/whatsapp/src/inbound/extract.ts index 043fdab81ff..ec2677b5c2f 100644 --- a/extensions/whatsapp/src/inbound/extract.ts +++ b/extensions/whatsapp/src/inbound/extract.ts @@ -197,7 +197,9 @@ function extractContextInfoFromMessage(message: proto.IMessage): proto.IContextI return undefined; } -function extractContextInfo(message: proto.IMessage | undefined): proto.IContextInfo | undefined { +export function extractContextInfo( + message: proto.IMessage | undefined, +): proto.IContextInfo | undefined { for (const candidate of buildMessageChain(message)) { const contextInfo = extractContextInfoFromMessage(candidate); if (contextInfo) { diff --git a/extensions/whatsapp/src/inbound/media.ts b/extensions/whatsapp/src/inbound/media.ts index bdddf59975c..aee8cce81a5 100644 --- a/extensions/whatsapp/src/inbound/media.ts +++ b/extensions/whatsapp/src/inbound/media.ts @@ -1,6 +1,7 @@ import type { proto, WAMessage } from "@whiskeysockets/baileys"; import { logVerbose } from "openclaw/plugin-sdk/runtime-env"; import type { createWaSocket } from "../session.js"; +import { extractContextInfo } from "./extract.js"; import { downloadMediaMessage, normalizeMessageContent } from "./runtime-api.js"; function unwrapMessage(message: proto.IMessage | undefined): proto.IMessage | undefined { 
@@ -74,3 +75,28 @@ export async function downloadInboundMedia( return undefined; } } + +export async function downloadQuotedInboundMedia( + msg: proto.IWebMessageInfo, + sock: Awaited<ReturnType<typeof createWaSocket>>, +): Promise<{ buffer: Buffer; mimetype?: string; fileName?: string } | undefined> { + const message = unwrapMessage(msg.message as proto.IMessage | undefined); + const contextInfo = extractContextInfo(message); + if (!contextInfo?.quotedMessage) { + return undefined; + } + const quotedMessage = contextInfo.quotedMessage; + return downloadInboundMedia( + { + key: { + id: contextInfo?.stanzaId || undefined, + remoteJid: contextInfo.remoteJid ?? msg.key?.remoteJid ?? undefined, + participant: contextInfo?.participant ?? undefined, + fromMe: false, + }, + message: quotedMessage, + messageTimestamp: msg.messageTimestamp, + }, + sock, + ); +} diff --git a/extensions/whatsapp/src/inbound/monitor.ts b/extensions/whatsapp/src/inbound/monitor.ts index 42cc04db93a..826143e274e 100644 --- a/extensions/whatsapp/src/inbound/monitor.ts +++ b/extensions/whatsapp/src/inbound/monitor.ts @@ -39,7 +39,7 @@ import { hasInboundUserContent, } from "./extract.js"; import { attachEmitterListener, closeInboundMonitorSocket } from "./lifecycle.js"; -import { downloadInboundMedia } from "./media.js"; +import { downloadInboundMedia, downloadQuotedInboundMedia } from "./media.js"; import { DisconnectReason, isJidGroup, saveMediaBuffer } from "./runtime-api.js"; import { createWebSendApi } from "./send-api.js"; import { normalizeWhatsAppSendResult } from "./send-result.js"; @@ -571,24 +571,33 @@ export async function attachWebInboxToSocket( let mediaPath: string | undefined; let mediaType: string | undefined; let mediaFileName: string | undefined; + const saveInboundMedia = async ( + inboundMedia: Awaited<ReturnType<typeof downloadInboundMedia>>, + ) => { + if (!inboundMedia) { + return; + } + const maxMb = + typeof options.mediaMaxMb === "number" && options.mediaMaxMb > 0 ? 
options.mediaMaxMb : 50; + const maxBytes = maxMb * 1024 * 1024; + const saved = await saveMediaBuffer( + inboundMedia.buffer, + inboundMedia.mimetype, + "inbound", + maxBytes, + inboundMedia.fileName, + ); + mediaPath = saved.path; + mediaType = inboundMedia.mimetype; + mediaFileName = inboundMedia.fileName; + }; try { const inboundMedia = await downloadInboundMedia(msg as proto.IWebMessageInfo, sock); - if (inboundMedia) { - const maxMb = - typeof options.mediaMaxMb === "number" && options.mediaMaxMb > 0 - ? options.mediaMaxMb - : 50; - const maxBytes = maxMb * 1024 * 1024; - const saved = await saveMediaBuffer( - inboundMedia.buffer, - inboundMedia.mimetype, - "inbound", - maxBytes, - inboundMedia.fileName, + await saveInboundMedia(inboundMedia); + if (!mediaPath && replyContext) { + await saveInboundMedia( + await downloadQuotedInboundMedia(msg as proto.IWebMessageInfo, sock), ); - mediaPath = saved.path; - mediaType = inboundMedia.mimetype; - mediaFileName = inboundMedia.fileName; } } catch (err) { logWhatsAppVerbose(options.verbose, `Inbound media download failed: ${String(err)}`);