fix(whatsapp): canonicalize outbound media delivery (#69813)

* fix(whatsapp): normalize outbound media payloads

* fix(embedded-runner): preserve final media directives

* fix(auto-reply): keep non-streaming media on final path

* fix(auto-reply): warn when reply media is dropped

* fix(whatsapp): align auto-reply media delivery

* docs(changelog): note whatsapp media normalization
This commit is contained in:
Marcus Castro
2026-04-24 01:04:28 -03:00
committed by GitHub
parent 5b34082106
commit 18c98316f7
24 changed files with 1056 additions and 200 deletions

View File

@@ -29,6 +29,7 @@ Docs: https://docs.openclaw.ai
- Agents/replay: stop OpenAI/Codex transcript replay from synthesizing missing tool results while still preserving synthetic repair on Anthropic, Gemini, and Bedrock transport-owned sessions. (#61556) Thanks @VictorJeon and @vincentkoc.
- Telegram/media replies: parse remote markdown image syntax into outbound media payloads on the final reply path, so Telegram group chats stop falling back to plain-text image URLs when the model or a tool emits `![...](...)` instead of a `MEDIA:` token. (#66191) Thanks @apezam and @vincentkoc.
- Agents/WebChat: surface non-retryable provider failures such as billing, auth, and rate-limit errors from the embedded runner instead of logging `surface_error` and leaving webchat with no rendered error. Fixes #70124. (#70848) Thanks @truffle-dev.
- WhatsApp: unify outbound media normalization across direct sends and auto-replies. Thanks @mcaxtr.
- Memory/CLI: declare the built-in `local` embedding provider in the memory-core manifest, so standalone `openclaw memory status`, `index`, and `search` can resolve local embeddings just like the gateway runtime. Fixes #70836. (#70873) Thanks @mattznojassist.
- Gateway/WebChat: preserve image attachments for text-only primary models by offloading them as media refs instead of dropping them, so configured image tools can still inspect the original file. Fixes #68513, #44276, #51656, #70212.
- Plugins/Google Meet: hang up delegated Twilio calls on leave, clean up Chrome realtime audio bridges when launch fails, and use a flat provider-safe tool schema.

View File

@@ -379,7 +379,7 @@ describe("web auto-reply", () => {
const fallback = reply.mock.calls[0]?.[0] as string;
expect(fallback).toContain("caption");
expect(fallback).toContain("Media failed");
expect(fallback).toContain("404");
expect(fallback).not.toContain("404");
fetchMock.mockRestore();
});

View File

@@ -31,6 +31,7 @@ vi.mock("../media.js", () => ({
}));
let deliverWebReply: typeof import("./deliver-reply.js").deliverWebReply;
let whatsappOutbound: typeof import("../outbound-adapter.js").whatsappOutbound;
function makeMsg(): WebInboundMsg {
return {
@@ -69,6 +70,12 @@ function mockFirstReplyFailure(msg: WebInboundMsg, message: string) {
);
}
function mockFirstReplyFailureWithWrappedError(msg: WebInboundMsg, message: string) {
(msg.reply as unknown as { mockRejectedValueOnce: (v: unknown) => void }).mockRejectedValueOnce({
error: { message },
});
}
function mockSecondReplySuccess(msg: WebInboundMsg) {
(msg.reply as unknown as { mockResolvedValueOnce: (v: unknown) => void }).mockResolvedValueOnce(
undefined,
@@ -97,6 +104,7 @@ async function expectReplySuppressed(replyResult: { text: string; isReasoning?:
describe("deliverWebReply", () => {
beforeAll(async () => {
({ deliverWebReply } = await import("./deliver-reply.js"));
({ whatsappOutbound } = await import("../outbound-adapter.js"));
});
it("suppresses payloads flagged as reasoning", async () => {
@@ -217,6 +225,24 @@ describe("deliverWebReply", () => {
},
);
it("retries text send on wrapped transient failure", async () => {
const msg = makeMsg();
mockFirstReplyFailureWithWrappedError(msg, "connection closed");
mockSecondReplySuccess(msg);
await deliverWebReply({
replyResult: { text: "hi" },
msg,
maxMediaBytes: 1024 * 1024,
textLimit: 200,
replyLogger,
skipLog: true,
});
expect(msg.reply).toHaveBeenCalledTimes(2);
expect(sleep).toHaveBeenCalledWith(500);
});
it("sends image media with caption and then remaining text", async () => {
const msg = makeMsg();
const mediaLocalRoots = ["/tmp/workspace-work"];
@@ -250,6 +276,22 @@ describe("deliverWebReply", () => {
expect(logVerbose).toHaveBeenCalled();
});
it("preserves leading indentation after trimming only leading blank lines", async () => {
const msg = makeMsg();
await deliverWebReply({
replyResult: { text: "\n \n indented block" },
msg,
maxMediaBytes: 1024 * 1024,
textLimit: 200,
replyLogger,
skipLog: true,
});
expect(msg.reply).toHaveBeenCalledTimes(1);
expect(msg.reply).toHaveBeenCalledWith(" indented block", undefined);
});
it("keeps quote threading on media and trailing text chunks for a threaded reply", async () => {
const msg = makeMsg();
mockLoadedImageMedia();
@@ -343,12 +385,137 @@ describe("deliverWebReply", () => {
expect(
String((msg.reply as unknown as { mock: { calls: unknown[][] } }).mock.calls[0]?.[0]),
).toContain("⚠️ Media failed");
expect(
String((msg.reply as unknown as { mock: { calls: unknown[][] } }).mock.calls[0]?.[0]),
).not.toContain("boom");
expect(replyLogger.warn).toHaveBeenCalledWith(
expect.objectContaining({ mediaUrl: "http://example.com/img.jpg" }),
"failed to send web media reply",
);
});
it("still attempts later media after the first media fails", async () => {
vi.clearAllMocks();
const msg = makeMsg();
(
loadWebMedia as unknown as { mockResolvedValueOnce: (v: unknown) => void }
).mockResolvedValueOnce({
buffer: Buffer.from("bad"),
contentType: "image/jpeg",
kind: "image",
});
(
loadWebMedia as unknown as { mockResolvedValueOnce: (v: unknown) => void }
).mockResolvedValueOnce({
buffer: Buffer.from("good"),
contentType: "application/pdf",
kind: "file",
fileName: "good.pdf",
});
mockFirstSendMediaFailure(msg, "boom");
(
msg.sendMedia as unknown as { mockResolvedValueOnce: (v: unknown) => void }
).mockResolvedValueOnce(undefined);
await deliverWebReply({
replyResult: {
text: "caption",
mediaUrls: ["http://example.com/bad.jpg", "http://example.com/good.pdf"],
},
msg,
maxMediaBytes: 1024 * 1024,
textLimit: 200,
replyLogger,
skipLog: true,
});
expect(loadWebMedia).toHaveBeenNthCalledWith(1, "http://example.com/bad.jpg", {
maxBytes: 1024 * 1024,
localRoots: undefined,
});
expect(loadWebMedia).toHaveBeenNthCalledWith(2, "http://example.com/good.pdf", {
maxBytes: 1024 * 1024,
localRoots: undefined,
});
expect(msg.sendMedia).toHaveBeenCalledTimes(2);
expect(msg.sendMedia).toHaveBeenNthCalledWith(
2,
expect.objectContaining({
document: expect.any(Buffer),
fileName: "good.pdf",
caption: undefined,
mimetype: "application/pdf",
}),
undefined,
);
expect(msg.reply).toHaveBeenCalledTimes(1);
expect(
String((msg.reply as unknown as { mock: { calls: unknown[][] } }).mock.calls[0]?.[0]),
).toContain("⚠️ Media failed");
expect(
String((msg.reply as unknown as { mock: { calls: unknown[][] } }).mock.calls[0]?.[0]),
).not.toContain("boom");
});
it("keeps payload and auto-reply media normalization in parity", async () => {
const payload = {
text: "\n\ncaption",
mediaUrls: [" ", " /tmp/voice.ogg "],
};
const sendWhatsApp = vi.fn(async () => ({ messageId: "wa-1", toJid: "jid" }));
await whatsappOutbound.sendPayload!({
cfg: {},
to: "5511999999999@c.us",
text: "",
payload,
deps: { sendWhatsApp },
});
const msg = makeMsg();
(
loadWebMedia as unknown as { mockResolvedValueOnce: (v: unknown) => void }
).mockResolvedValueOnce({
buffer: Buffer.from("aud"),
contentType: "audio/ogg",
kind: "audio",
});
await deliverWebReply({
replyResult: payload,
msg,
maxMediaBytes: 1024 * 1024,
textLimit: 200,
replyLogger,
skipLog: true,
});
expect(sendWhatsApp).toHaveBeenCalledTimes(1);
expect(sendWhatsApp).toHaveBeenCalledWith("5511999999999@c.us", "caption", {
verbose: false,
cfg: {},
mediaUrl: "/tmp/voice.ogg",
mediaLocalRoots: undefined,
accountId: undefined,
gifPlayback: undefined,
});
expect(loadWebMedia).toHaveBeenCalledWith("/tmp/voice.ogg", {
maxBytes: 1024 * 1024,
localRoots: undefined,
});
expect(msg.sendMedia).toHaveBeenCalledTimes(1);
expect(msg.sendMedia).toHaveBeenCalledWith(
expect.objectContaining({
audio: expect.any(Buffer),
ptt: true,
mimetype: "audio/ogg; codecs=opus",
caption: "caption",
}),
undefined,
);
expect(msg.reply).not.toHaveBeenCalled();
});
it("sends audio media as ptt voice note", async () => {
const msg = makeMsg();
(
@@ -372,7 +539,7 @@ describe("deliverWebReply", () => {
expect.objectContaining({
audio: expect.any(Buffer),
ptt: true,
mimetype: "audio/ogg",
mimetype: "audio/ogg; codecs=opus",
caption: "cap",
}),
undefined,
@@ -438,4 +605,37 @@ describe("deliverWebReply", () => {
undefined,
);
});
it("strips URL query and fragment data from derived document file names", async () => {
const msg = makeMsg();
(
loadWebMedia as unknown as { mockResolvedValueOnce: (v: unknown) => void }
).mockResolvedValueOnce({
buffer: Buffer.from("pdf"),
contentType: "application/pdf",
kind: "file",
});
await deliverWebReply({
replyResult: {
text: "cap",
mediaUrl: "https://example.com/report.pdf?X-Amz-Signature=secret#frag",
},
msg,
maxMediaBytes: 1024 * 1024,
textLimit: 200,
replyLogger,
skipLog: true,
});
expect(msg.sendMedia).toHaveBeenCalledWith(
expect.objectContaining({
document: expect.any(Buffer),
fileName: "report.pdf",
caption: "cap",
mimetype: "application/pdf",
}),
undefined,
);
});
});

View File

@@ -3,15 +3,20 @@ import { chunkMarkdownTextWithMode, type ChunkMode } from "openclaw/plugin-sdk/r
import type { ReplyPayload } from "openclaw/plugin-sdk/reply-chunking";
import {
isReasoningReplyPayload,
resolveOutboundMediaUrls,
sendMediaWithLeadingCaption,
} from "openclaw/plugin-sdk/reply-payload";
import { logVerbose, shouldLogVerbose } from "openclaw/plugin-sdk/runtime-env";
import { loadWebMedia } from "../media.js";
import {
normalizeWhatsAppLoadedMedia,
normalizeWhatsAppOutboundPayload,
normalizeWhatsAppPayloadTextPreservingIndentation,
sendWhatsAppOutboundWithRetry,
} from "../outbound-media-contract.js";
import { buildQuotedMessageOptions, lookupInboundMessageMeta } from "../quoted-message.js";
import { newConnectionId } from "../reconnect.js";
import { formatError } from "../session.js";
import { convertMarkdownTables, sleep } from "../text-runtime.js";
import { convertMarkdownTables } from "../text-runtime.js";
import { markdownToWhatsApp } from "../text-runtime.js";
import { whatsappOutboundLog } from "./loggers.js";
import type { WebInboundMsg } from "./types.js";
@@ -40,11 +45,12 @@ export async function deliverWebReply(params: {
}
const tableMode = params.tableMode ?? "code";
const chunkMode = params.chunkMode ?? "length";
const convertedText = markdownToWhatsApp(
convertMarkdownTables(replyResult.text || "", tableMode),
);
const normalizedReply = normalizeWhatsAppOutboundPayload(replyResult, {
normalizeText: normalizeWhatsAppPayloadTextPreservingIndentation,
});
const convertedText = markdownToWhatsApp(convertMarkdownTables(normalizedReply.text, tableMode));
const textChunks = chunkMarkdownTextWithMode(convertedText, textLimit, chunkMode);
const mediaList = resolveOutboundMediaUrls(replyResult);
const mediaList = normalizedReply.mediaUrls ?? [];
const getQuote = () => {
if (!replyResult.replyToId) {
@@ -64,26 +70,15 @@ export async function deliverWebReply(params: {
};
const sendWithRetry = async (fn: () => Promise<unknown>, label: string, maxAttempts = 3) => {
let lastErr: unknown;
for (let attempt = 1; attempt <= maxAttempts; attempt++) {
try {
return await fn();
} catch (err) {
lastErr = err;
const errText = formatError(err);
const isLast = attempt === maxAttempts;
const shouldRetry = /closed|reset|timed\s*out|disconnect/i.test(errText);
if (!shouldRetry || isLast) {
throw err;
}
const backoffMs = 500 * attempt;
return await sendWhatsAppOutboundWithRetry({
send: fn,
maxAttempts,
onRetry: ({ attempt, maxAttempts: retryMaxAttempts, backoffMs, errorText }) => {
logVerbose(
`Retrying ${label} to ${msg.from} after failure (${attempt}/${maxAttempts - 1}) in ${backoffMs}ms: ${errText}`,
`Retrying ${label} to ${msg.from} after failure (${attempt}/${retryMaxAttempts - 1}) in ${backoffMs}ms: ${errorText}`,
);
await sleep(backoffMs);
}
}
throw lastErr;
},
});
};
// Text-only replies
@@ -125,10 +120,13 @@ export async function deliverWebReply(params: {
mediaUrls: mediaList,
caption: leadingCaption,
send: async ({ mediaUrl, caption }) => {
const media = await loadWebMedia(mediaUrl, {
maxBytes: maxMediaBytes,
localRoots: params.mediaLocalRoots,
});
const media = normalizeWhatsAppLoadedMedia(
await loadWebMedia(mediaUrl, {
maxBytes: maxMediaBytes,
localRoots: params.mediaLocalRoots,
}),
mediaUrl,
);
if (shouldLogVerbose()) {
logVerbose(
`Web auto-reply media size: ${(media.buffer.length / (1024 * 1024)).toFixed(2)}MB`,
@@ -143,7 +141,7 @@ export async function deliverWebReply(params: {
{
image: media.buffer,
caption,
mimetype: media.contentType,
mimetype: media.mimetype,
},
quote,
),
@@ -157,7 +155,7 @@ export async function deliverWebReply(params: {
{
audio: media.buffer,
ptt: true,
mimetype: media.contentType,
mimetype: media.mimetype,
caption,
},
quote,
@@ -172,24 +170,22 @@ export async function deliverWebReply(params: {
{
video: media.buffer,
caption,
mimetype: media.contentType,
mimetype: media.mimetype,
},
quote,
),
"media:video",
);
} else {
const fileName = media.fileName ?? mediaUrl.split("/").pop() ?? "file";
const mimetype = media.contentType ?? "application/octet-stream";
const quote = getQuote();
await sendWithRetry(
() =>
msg.sendMedia(
{
document: media.buffer,
fileName,
fileName: media.fileName,
caption,
mimetype,
mimetype: media.mimetype,
},
quote,
),
@@ -220,8 +216,7 @@ export async function deliverWebReply(params: {
if (!isFirst) {
return;
}
const warning =
error instanceof Error ? `⚠️ Media failed: ${error.message}` : "⚠️ Media failed.";
const warning = "⚠️ Media failed.";
const fallbackTextParts = [remainingText.shift() ?? caption ?? "", warning].filter(Boolean);
const fallbackText = fallbackTextParts.join("\n");
if (!fallbackText) {

View File

@@ -0,0 +1,74 @@
import { beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
const hoisted = vi.hoisted(() => ({
sendMessageWhatsApp: vi.fn(async () => ({ messageId: "wa-1", toJid: "jid" })),
sendPollWhatsApp: vi.fn(async () => ({ messageId: "poll-1", toJid: "jid" })),
}));
vi.mock("./send.js", () => ({
sendMessageWhatsApp: hoisted.sendMessageWhatsApp,
sendPollWhatsApp: hoisted.sendPollWhatsApp,
}));
vi.mock("./runtime.js", () => ({
getWhatsAppRuntime: () => ({
logging: {
shouldLogVerbose: () => false,
},
}),
}));
let whatsappChannelOutbound: typeof import("./channel-outbound.js").whatsappChannelOutbound;
describe("whatsappChannelOutbound", () => {
beforeAll(async () => {
({ whatsappChannelOutbound } = await import("./channel-outbound.js"));
});
beforeEach(() => {
vi.clearAllMocks();
});
it("drops leading blank lines but preserves intentional indentation", () => {
expect(
whatsappChannelOutbound.normalizePayload?.({
payload: { text: "\n \n indented" },
}),
).toEqual({
text: " indented",
});
});
it("preserves indentation for live text sends", async () => {
await whatsappChannelOutbound.sendText!({
cfg: {},
to: "5511999999999@c.us",
text: "\n \n indented",
});
expect(hoisted.sendMessageWhatsApp).toHaveBeenCalledWith("5511999999999@c.us", " indented", {
verbose: false,
cfg: {},
accountId: undefined,
gifPlayback: undefined,
preserveLeadingWhitespace: true,
});
});
it("preserves indentation for payload delivery", async () => {
await whatsappChannelOutbound.sendPayload!({
cfg: {},
to: "5511999999999@c.us",
text: "",
payload: { text: "\n \n indented" },
});
expect(hoisted.sendMessageWhatsApp).toHaveBeenCalledWith("5511999999999@c.us", " indented", {
verbose: false,
cfg: {},
accountId: undefined,
gifPlayback: undefined,
preserveLeadingWhitespace: true,
});
});
});

View File

@@ -4,21 +4,31 @@ import { resolveWhatsAppOutboundTarget } from "./resolve-outbound-target.js";
import { getWhatsAppRuntime } from "./runtime.js";
import { sendMessageWhatsApp, sendPollWhatsApp } from "./send.js";
export function normalizeWhatsAppPayloadText(text: string | undefined): string {
export function normalizeWhatsAppChannelPayloadText(text: string | undefined): string {
return (text ?? "").replace(/^(?:[ \t]*\r?\n)+/, "");
}
function normalizeWhatsAppChannelSendText(text: string | undefined): string {
const normalized = normalizeWhatsAppChannelPayloadText(text);
return normalized.trim() ? normalized : "";
}
export const whatsappChannelOutbound = {
...createWhatsAppOutboundBase({
chunker: chunkText,
sendMessageWhatsApp,
sendMessageWhatsApp: async (to, text, options) =>
await sendMessageWhatsApp(to, text, {
...options,
preserveLeadingWhitespace: true,
}),
sendPollWhatsApp,
shouldLogVerbose: () => getWhatsAppRuntime().logging.shouldLogVerbose(),
resolveTarget: ({ to, allowFrom, mode }) =>
resolveWhatsAppOutboundTarget({ to, allowFrom, mode }),
normalizeText: normalizeWhatsAppChannelSendText,
}),
normalizePayload: ({ payload }: { payload: { text?: string } }) => ({
...payload,
text: normalizeWhatsAppPayloadText(payload.text),
text: normalizeWhatsAppChannelPayloadText(payload.text),
}),
};

View File

@@ -75,6 +75,31 @@ describe("whatsappOutbound sendPayload", () => {
});
});
it("drops blank mediaUrls before sending payload media", async () => {
const sendWhatsApp = vi.fn(async () => ({ messageId: "wa-1", toJid: "jid" }));
await whatsappOutbound.sendPayload!({
cfg: {},
to: "5511999999999@c.us",
text: "",
payload: {
text: "\n\ncaption",
mediaUrls: [" ", " /tmp/voice.ogg "],
},
deps: { sendWhatsApp },
});
expect(sendWhatsApp).toHaveBeenCalledTimes(1);
expect(sendWhatsApp).toHaveBeenCalledWith("5511999999999@c.us", "caption", {
verbose: false,
cfg: {},
mediaUrl: "/tmp/voice.ogg",
mediaLocalRoots: undefined,
accountId: undefined,
gifPlayback: undefined,
});
});
it("skips whitespace-only text payloads", async () => {
const sendWhatsApp = vi.fn();

View File

@@ -1,16 +1,7 @@
import {
type ChannelOutboundAdapter,
createAttachedChannelResultAdapter,
createEmptyChannelResult,
} from "openclaw/plugin-sdk/channel-send-result";
import { resolveOutboundSendDep, sanitizeForPlainText } from "openclaw/plugin-sdk/outbound-runtime";
import {
resolveSendableOutboundReplyParts,
sendTextMediaPayload,
} from "openclaw/plugin-sdk/reply-payload";
import { type ChannelOutboundAdapter } from "openclaw/plugin-sdk/channel-send-result";
import { chunkText } from "openclaw/plugin-sdk/reply-runtime";
import { shouldLogVerbose } from "openclaw/plugin-sdk/runtime-env";
import { WHATSAPP_LEGACY_OUTBOUND_SEND_DEP_KEYS } from "./outbound-send-deps.js";
import { createWhatsAppOutboundBase } from "./outbound-base.js";
import { resolveWhatsAppOutboundTarget } from "./resolve-outbound-target.js";
type WhatsAppSendModule = typeof import("./send.js");
@@ -26,84 +17,19 @@ function trimLeadingWhitespace(text: string | undefined): string {
return text?.trimStart() ?? "";
}
export const whatsappOutbound: ChannelOutboundAdapter = {
deliveryMode: "gateway",
export const whatsappOutbound: ChannelOutboundAdapter = createWhatsAppOutboundBase({
chunker: chunkText,
chunkerMode: "text",
textChunkLimit: 4000,
sanitizeText: ({ text }) => sanitizeForPlainText(text),
pollMaxOptions: 12,
sendMessageWhatsApp: async (to, text, options) =>
await (
await loadWhatsAppSendModule()
).sendMessageWhatsApp(to, trimLeadingWhitespace(text), {
...options,
}),
sendPollWhatsApp: async (to, poll, options) =>
await (await loadWhatsAppSendModule()).sendPollWhatsApp(to, poll, options),
shouldLogVerbose: () => shouldLogVerbose(),
resolveTarget: ({ to, allowFrom, mode }) =>
resolveWhatsAppOutboundTarget({ to, allowFrom, mode }),
sendPayload: async (ctx) => {
const text = trimLeadingWhitespace(ctx.payload.text);
const hasMedia = resolveSendableOutboundReplyParts(ctx.payload).hasMedia;
if (!text && !hasMedia) {
return createEmptyChannelResult("whatsapp");
}
return await sendTextMediaPayload({
channel: "whatsapp",
ctx: {
...ctx,
payload: {
...ctx.payload,
text,
},
},
adapter: whatsappOutbound,
});
},
...createAttachedChannelResultAdapter({
channel: "whatsapp",
sendText: async ({ cfg, to, text, accountId, deps, gifPlayback }) => {
const normalizedText = trimLeadingWhitespace(text);
if (!normalizedText) {
return createEmptyChannelResult("whatsapp");
}
const send =
resolveOutboundSendDep<typeof import("./send.js").sendMessageWhatsApp>(deps, "whatsapp", {
legacyKeys: WHATSAPP_LEGACY_OUTBOUND_SEND_DEP_KEYS,
}) ?? (await loadWhatsAppSendModule()).sendMessageWhatsApp;
return await send(to, normalizedText, {
verbose: false,
cfg,
accountId: accountId ?? undefined,
gifPlayback,
});
},
sendMedia: async ({
cfg,
to,
text,
mediaUrl,
mediaLocalRoots,
mediaReadFile,
accountId,
deps,
gifPlayback,
}) => {
const normalizedText = trimLeadingWhitespace(text);
const send =
resolveOutboundSendDep<typeof import("./send.js").sendMessageWhatsApp>(deps, "whatsapp", {
legacyKeys: WHATSAPP_LEGACY_OUTBOUND_SEND_DEP_KEYS,
}) ?? (await loadWhatsAppSendModule()).sendMessageWhatsApp;
return await send(to, normalizedText, {
verbose: false,
cfg,
mediaUrl,
mediaLocalRoots,
mediaReadFile,
accountId: accountId ?? undefined,
gifPlayback,
});
},
sendPoll: async ({ cfg, to, poll, accountId }) =>
await (
await loadWhatsAppSendModule()
).sendPollWhatsApp(to, poll, {
verbose: shouldLogVerbose(),
accountId: accountId ?? undefined,
cfg,
}),
}),
};
normalizeText: trimLeadingWhitespace,
skipEmptyText: true,
});

View File

@@ -351,6 +351,146 @@ describe("createWhatsAppOutboundBase", () => {
);
});
it("normalizes mediaUrls before payload delivery", async () => {
const sendMessageWhatsApp = vi.fn(async () => ({
messageId: "msg-1",
toJid: "15551234567@s.whatsapp.net",
}));
const outbound = createWhatsAppOutboundBase({
chunker: (text) => [text],
sendMessageWhatsApp,
sendPollWhatsApp: vi.fn(),
shouldLogVerbose: () => false,
resolveTarget: ({ to }) => ({ ok: true as const, to: to ?? "" }),
});
await outbound.sendPayload!({
cfg: {} as never,
to: "whatsapp:+15551234567",
text: "",
payload: {
text: "\n\ncaption",
mediaUrls: [" ", " /tmp/voice.ogg "],
},
deps: { sendWhatsApp: sendMessageWhatsApp },
});
expect(sendMessageWhatsApp).toHaveBeenCalledTimes(1);
expect(sendMessageWhatsApp).toHaveBeenCalledWith(
"whatsapp:+15551234567",
"caption",
expect.objectContaining({
verbose: false,
mediaUrl: "/tmp/voice.ogg",
}),
);
});
it("keeps explicit mediaUrl first when payload also includes mediaUrls", async () => {
const sendMessageWhatsApp = vi.fn(async () => ({
messageId: "msg-1",
toJid: "15551234567@s.whatsapp.net",
}));
const outbound = createWhatsAppOutboundBase({
chunker: (text) => [text],
sendMessageWhatsApp,
sendPollWhatsApp: vi.fn(),
shouldLogVerbose: () => false,
resolveTarget: ({ to }) => ({ ok: true as const, to: to ?? "" }),
});
await outbound.sendPayload!({
cfg: {} as never,
to: "whatsapp:+15551234567",
text: "",
payload: {
text: "\n\ncaption",
mediaUrl: "/tmp/primary.ogg",
mediaUrls: [" /tmp/secondary.ogg "],
},
deps: { sendWhatsApp: sendMessageWhatsApp },
});
expect(sendMessageWhatsApp).toHaveBeenNthCalledWith(
1,
"whatsapp:+15551234567",
"caption",
expect.objectContaining({
mediaUrl: "/tmp/primary.ogg",
}),
);
expect(sendMessageWhatsApp).toHaveBeenNthCalledWith(
2,
"whatsapp:+15551234567",
"",
expect.objectContaining({
mediaUrl: "/tmp/secondary.ogg",
}),
);
});
it("uses the caller-provided text normalization for payload delivery", async () => {
const sendMessageWhatsApp = vi.fn(async () => ({
messageId: "msg-1",
toJid: "15551234567@s.whatsapp.net",
}));
const outbound = createWhatsAppOutboundBase({
chunker: (text) => [text],
sendMessageWhatsApp,
sendPollWhatsApp: vi.fn(),
shouldLogVerbose: () => false,
resolveTarget: ({ to }) => ({ ok: true as const, to: to ?? "" }),
normalizeText: (text) => (text ?? "").replace(/^(?:[ \t]*\r?\n)+/, ""),
});
await outbound.sendPayload!({
cfg: {} as never,
to: "whatsapp:+15551234567",
text: "",
payload: {
text: "\n \n indented",
},
deps: { sendWhatsApp: sendMessageWhatsApp },
});
expect(sendMessageWhatsApp).toHaveBeenCalledWith(
"whatsapp:+15551234567",
" indented",
expect.objectContaining({
verbose: false,
}),
);
});
it("rejects structured-only payloads instead of reporting an empty successful send", async () => {
const sendMessageWhatsApp = vi.fn(async () => ({
messageId: "msg-1",
toJid: "15551234567@s.whatsapp.net",
}));
const outbound = createWhatsAppOutboundBase({
chunker: (text) => [text],
sendMessageWhatsApp,
sendPollWhatsApp: vi.fn(),
shouldLogVerbose: () => false,
resolveTarget: ({ to }) => ({ ok: true as const, to: to ?? "" }),
});
await expect(
outbound.sendPayload!({
cfg: {} as never,
to: "whatsapp:+15551234567",
text: "",
payload: {
channelData: { kind: "structured-only" },
},
deps: { sendWhatsApp: sendMessageWhatsApp },
}),
).rejects.toThrow(
"WhatsApp sendPayload does not support structured-only payloads without text or media.",
);
expect(sendMessageWhatsApp).not.toHaveBeenCalled();
});
it("threads cfg into sendPollWhatsApp call", async () => {
const sendPollWhatsApp = vi.fn(async () => ({
messageId: "wa-poll-1",

View File

@@ -10,6 +10,11 @@ import {
} from "openclaw/plugin-sdk/channel-send-result";
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime";
import { resolveOutboundSendDep, sanitizeForPlainText } from "openclaw/plugin-sdk/infra-runtime";
import { sendTextMediaPayload } from "openclaw/plugin-sdk/reply-payload";
import {
normalizeWhatsAppOutboundPayload,
normalizeWhatsAppPayloadText,
} from "./outbound-media-contract.js";
import { WHATSAPP_LEGACY_OUTBOUND_SEND_DEP_KEYS } from "./outbound-send-deps.js";
import { lookupInboundMessageMetaForTarget } from "./quoted-message.js";
import { toWhatsappJid } from "./text-runtime.js";
@@ -34,6 +39,7 @@ type WhatsAppSendTextOptions = {
participant?: string;
messageText?: string;
};
preserveLeadingWhitespace?: boolean;
};
type WhatsAppSendMessage = (
to: string,
@@ -75,14 +81,28 @@ function resolveQuoteLookupAccountId(cfg?: OpenClawConfig, accountId?: string |
});
}
type WhatsAppOutboundBaseCore = Pick<
ChannelOutboundAdapter,
| "deliveryMode"
| "chunker"
| "chunkerMode"
| "textChunkLimit"
| "sanitizeText"
| "pollMaxOptions"
| "resolveTarget"
| "sendText"
| "sendMedia"
| "sendPoll"
>;
export function createWhatsAppOutboundBase({
chunker,
sendMessageWhatsApp,
sendPollWhatsApp,
shouldLogVerbose,
resolveTarget,
normalizeText = (text) => text ?? "",
skipEmptyText = false,
normalizeText = normalizeWhatsAppPayloadText,
skipEmptyText = true,
}: CreateWhatsAppOutboundBaseParams): Pick<
ChannelOutboundAdapter,
| "deliveryMode"
@@ -92,6 +112,7 @@ export function createWhatsAppOutboundBase({
| "sanitizeText"
| "pollMaxOptions"
| "resolveTarget"
| "sendPayload"
| "sendText"
| "sendMedia"
| "sendPoll"
@@ -116,7 +137,7 @@ export function createWhatsAppOutboundBase({
};
};
return {
const outbound: WhatsAppOutboundBaseCore = {
deliveryMode: "gateway",
chunker,
chunkerMode: "text",
@@ -192,4 +213,26 @@ export function createWhatsAppOutboundBase({
}),
}),
};
return {
...outbound,
sendPayload: async (ctx) => {
const payload = normalizeWhatsAppOutboundPayload(ctx.payload, { normalizeText });
if (!payload.text && !(payload.mediaUrl || payload.mediaUrls?.length)) {
if (ctx.payload.interactive || ctx.payload.presentation || ctx.payload.channelData) {
throw new Error(
"WhatsApp sendPayload does not support structured-only payloads without text or media.",
);
}
return { channel: "whatsapp", messageId: "" };
}
return await sendTextMediaPayload({
channel: "whatsapp",
ctx: {
...ctx,
payload,
},
adapter: outbound,
});
},
};
}

View File

@@ -0,0 +1,147 @@
import path from "node:path";
import { formatError } from "./session-errors.js";
import { sleep } from "./text-runtime.js";
type WhatsAppOutboundPayloadLike = {
text?: string;
mediaUrl?: string;
mediaUrls?: readonly string[];
};
type WhatsAppLoadedMediaLike = {
buffer: Buffer;
contentType?: string;
kind?: string;
fileName?: string;
};
export type CanonicalWhatsAppLoadedMedia = {
buffer: Buffer;
kind: "image" | "audio" | "video" | "document";
mimetype: string;
fileName?: string;
};
export function normalizeWhatsAppPayloadText(text: string | undefined): string {
return text?.trimStart() ?? "";
}
export function normalizeWhatsAppPayloadTextPreservingIndentation(
text: string | undefined,
): string {
return (text ?? "").replace(/^(?:[ \t]*\r?\n)+/, "");
}
export function resolveWhatsAppOutboundMediaUrls(
payload: Pick<WhatsAppOutboundPayloadLike, "mediaUrl" | "mediaUrls">,
): string[] {
const primaryMediaUrl = payload.mediaUrl?.trim();
const mediaUrls = (payload.mediaUrls ? [...payload.mediaUrls] : [])
.map((entry) => entry.trim())
.filter((entry): entry is string => Boolean(entry));
const orderedMediaUrls = [primaryMediaUrl, ...mediaUrls].filter((entry): entry is string =>
Boolean(entry),
);
return Array.from(new Set(orderedMediaUrls));
}
// Keep new WhatsApp outbound-media behavior in this helper so payload, gateway, and auto-reply paths stay aligned.
export function normalizeWhatsAppOutboundPayload<T extends WhatsAppOutboundPayloadLike>(
payload: T,
options?: {
normalizeText?: (text: string | undefined) => string;
},
): Omit<T, "text" | "mediaUrl" | "mediaUrls"> & {
text: string;
mediaUrl?: string;
mediaUrls?: string[];
} {
const mediaUrls = resolveWhatsAppOutboundMediaUrls(payload);
const normalizeText = options?.normalizeText ?? normalizeWhatsAppPayloadText;
return {
...payload,
text: normalizeText(payload.text),
mediaUrl: mediaUrls[0],
mediaUrls: mediaUrls.length > 0 ? mediaUrls : undefined,
};
}
export function normalizeWhatsAppLoadedMedia(
media: WhatsAppLoadedMediaLike,
mediaUrl?: string,
): CanonicalWhatsAppLoadedMedia {
const kind =
media.kind === "image" || media.kind === "audio" || media.kind === "video"
? media.kind
: "document";
const mimetype =
kind === "audio" && media.contentType === "audio/ogg"
? "audio/ogg; codecs=opus"
: (media.contentType ?? "application/octet-stream");
const fileName =
kind === "document"
? (media.fileName ?? deriveWhatsAppDocumentFileName(mediaUrl) ?? "file")
: undefined;
return {
buffer: media.buffer,
kind,
mimetype,
...(fileName ? { fileName } : {}),
};
}
function deriveWhatsAppDocumentFileName(mediaUrl: string | undefined): string | undefined {
if (!mediaUrl) {
return undefined;
}
try {
const parsed = new URL(mediaUrl);
const fileName = path.posix.basename(parsed.pathname);
return fileName ? decodeURIComponent(fileName) : undefined;
} catch {
const withoutQueryOrFragment = mediaUrl.split(/[?#]/, 1)[0] ?? "";
const fileName = withoutQueryOrFragment.split(/[\\/]/).pop();
return fileName || undefined;
}
}
export function isRetryableWhatsAppOutboundError(error: unknown): boolean {
return /closed|reset|timed\s*out|disconnect/i.test(formatError(error));
}
export async function sendWhatsAppOutboundWithRetry<T>(params: {
send: () => Promise<T>;
onRetry?: (params: {
attempt: number;
maxAttempts: number;
backoffMs: number;
error: unknown;
errorText: string;
}) => Promise<void> | void;
maxAttempts?: number;
}): Promise<T> {
const maxAttempts = params.maxAttempts ?? 3;
let lastError: unknown;
for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
try {
return await params.send();
} catch (error) {
lastError = error;
const errorText = formatError(error);
const isLastAttempt = attempt === maxAttempts;
if (!isRetryableWhatsAppOutboundError(error) || isLastAttempt) {
throw error;
}
const backoffMs = 500 * attempt;
await params.onRetry?.({
attempt,
maxAttempts,
backoffMs,
error,
errorText,
});
await sleep(backoffMs);
}
}
throw lastError;
}

View File

@@ -3,7 +3,7 @@ import {
primeChannelOutboundSendMock,
type OutboundPayloadHarnessParams,
} from "openclaw/plugin-sdk/testing";
import { describe, vi } from "vitest";
import { describe, expect, it, vi } from "vitest";
import { whatsappOutbound } from "./outbound-adapter.js";
function createWhatsAppHarness(params: OutboundPayloadHarnessParams) {
@@ -31,4 +31,31 @@ describe("WhatsApp outbound payload contract", () => {
chunking: { mode: "split", longTextLength: 5000, maxChunkLength: 4000 },
createHarness: createWhatsAppHarness,
});
it("normalizes blank mediaUrls before contract delivery", async () => {
const sendWhatsApp = vi.fn();
primeChannelOutboundSendMock(sendWhatsApp, { messageId: "wa-1" });
await whatsappOutbound.sendPayload!({
cfg: {},
to: "5511999999999@c.us",
text: "",
payload: {
text: "\n\ncaption",
mediaUrls: [" ", " /tmp/voice.ogg "],
},
deps: {
whatsapp: sendWhatsApp,
},
});
expect(sendWhatsApp).toHaveBeenCalledTimes(1);
expect(sendWhatsApp).toHaveBeenCalledWith(
"5511999999999@c.us",
"caption",
expect.objectContaining({
mediaUrl: "/tmp/voice.ogg",
}),
);
});
});

View File

@@ -49,6 +49,14 @@ vi.mock("./outbound-media.runtime.js", async () => {
};
});
vi.mock("./text-runtime.js", async () => {
const actual = await vi.importActual<typeof import("./text-runtime.js")>("./text-runtime.js");
return {
...actual,
sleep: vi.fn(async () => {}),
};
});
describe("web outbound", () => {
const sendComposingTo = vi.fn(async () => {});
const sendMessage = vi.fn(async () => ({ messageId: "msg123" }));
@@ -161,6 +169,16 @@ describe("web outbound", () => {
expect(sendMessage).toHaveBeenLastCalledWith("+1555", "caption", buf, "image/jpeg");
});
it("preserves intentional indentation when the caller opts out of transport trimming", async () => {
await sendMessageWhatsApp("+1555", " indented", {
verbose: false,
cfg: WHATSAPP_TEST_CFG,
preserveLeadingWhitespace: true,
});
expect(sendMessage).toHaveBeenLastCalledWith("+1555", " indented", undefined, undefined);
});
it("skips whitespace-only text sends without media", async () => {
const result = await sendMessageWhatsApp("+1555", "\n \t", {
verbose: false,
@@ -268,6 +286,39 @@ describe("web outbound", () => {
expect(sendMessage).toHaveBeenLastCalledWith("+1555", "pic", buf, "image/jpeg");
});
it("does not retry transient outbound send failures to avoid duplicate sends", async () => {
sendMessage.mockRejectedValueOnce({ error: { message: "connection closed" } });
await expect(
sendMessageWhatsApp("+1555", "hi", { verbose: false, cfg: WHATSAPP_TEST_CFG }),
).rejects.toEqual({ error: { message: "connection closed" } });
expect(sendMessage).toHaveBeenCalledTimes(1);
});
it("prefers explicit mediaUrl over mediaUrls when both are present", async () => {
const buf = Buffer.from("img");
loadWebMediaMock.mockResolvedValueOnce({
buffer: buf,
contentType: "image/jpeg",
kind: "image",
});
await sendMessageWhatsApp("+1555", "pic", {
verbose: false,
cfg: WHATSAPP_TEST_CFG,
mediaUrl: "/tmp/primary.jpg",
mediaUrls: [" /tmp/secondary.jpg "],
});
expect(loadWebMediaMock).toHaveBeenCalledWith(
"/tmp/primary.jpg",
expect.objectContaining({
hostReadCapability: false,
}),
);
expect(sendMessage).toHaveBeenLastCalledWith("+1555", "pic", buf, "image/jpeg");
});
it("falls back to the first mediaUrls entry when mediaUrl is omitted", async () => {
const buf = Buffer.from("img");
loadWebMediaMock.mockResolvedValueOnce({

View File

@@ -14,6 +14,11 @@ import {
} from "./accounts.js";
import { getRegisteredWhatsAppConnectionController } from "./connection-controller-registry.js";
import type { ActiveWebListener, ActiveWebSendOptions } from "./inbound/types.js";
import {
normalizeWhatsAppLoadedMedia,
normalizeWhatsAppPayloadText,
resolveWhatsAppOutboundMediaUrls,
} from "./outbound-media-contract.js";
import { loadOutboundMediaFromUrl } from "./outbound-media.runtime.js";
import { markdownToWhatsApp, toWhatsappJid } from "./text-runtime.js";
@@ -69,16 +74,13 @@ export async function sendMessageWhatsApp(
participant?: string;
messageText?: string;
};
preserveLeadingWhitespace?: boolean;
},
): Promise<{ messageId: string; toJid: string }> {
let text = body.trimStart();
let text = options.preserveLeadingWhitespace ? body : normalizeWhatsAppPayloadText(body);
const jid = toWhatsappJid(to);
const mediaUrls = Array.isArray(options.mediaUrls)
? options.mediaUrls
.map((entry) => (typeof entry === "string" ? entry.trim() : ""))
.filter(Boolean)
: [];
const primaryMediaUrl = options.mediaUrl?.trim() || mediaUrls[0];
const mediaUrls = resolveWhatsAppOutboundMediaUrls(options);
const primaryMediaUrl = mediaUrls[0];
if (!text && !primaryMediaUrl) {
return { messageId: "", toJid: jid };
}
@@ -112,28 +114,23 @@ export async function sendMessageWhatsApp(
let mediaType: string | undefined;
let documentFileName: string | undefined;
if (primaryMediaUrl) {
const media = await loadOutboundMediaFromUrl(primaryMediaUrl, {
maxBytes: resolveWhatsAppMediaMaxBytes(account),
mediaAccess: options.mediaAccess,
mediaLocalRoots: options.mediaLocalRoots,
mediaReadFile: options.mediaReadFile,
});
const media = normalizeWhatsAppLoadedMedia(
await loadOutboundMediaFromUrl(primaryMediaUrl, {
maxBytes: resolveWhatsAppMediaMaxBytes(account),
mediaAccess: options.mediaAccess,
mediaLocalRoots: options.mediaLocalRoots,
mediaReadFile: options.mediaReadFile,
}),
primaryMediaUrl,
);
const caption = text || undefined;
mediaBuffer = media.buffer;
mediaType = media.contentType ?? "application/octet-stream";
if (media.kind === "audio") {
// WhatsApp expects explicit opus codec for PTT voice notes.
mediaType =
media.contentType === "audio/ogg"
? "audio/ogg; codecs=opus"
: (media.contentType ?? "application/octet-stream");
} else if (media.kind === "video") {
text = caption ?? "";
} else if (media.kind === "image") {
text = caption ?? "";
} else {
mediaType = media.mimetype;
if (media.kind === "document") {
text = caption ?? "";
documentFileName = media.fileName;
} else {
text = caption ?? "";
}
}
outboundLog.info(`Sending message -> ${redactedJid}${primaryMediaUrl ? " (media)" : ""}`);

View File

@@ -199,4 +199,59 @@ describe("buildEmbeddedRunPayloads tool-error warnings", () => {
assistantTexts: ['{"action":"NO_REPLY"}'],
});
});
it("preserves media directives when stored assistant text was reduced to visible text only", () => {
const payloads = buildPayloads({
assistantTexts: ["Attached image"],
lastAssistant: {
role: "assistant",
stopReason: "stop",
content: [
{
type: "text",
text: "MEDIA:/tmp/reply-image.png\nAttached image",
textSignature: JSON.stringify({
v: 1,
id: "item_final",
phase: "final_answer",
}),
},
],
} as AssistantMessage,
});
expect(payloads).toHaveLength(1);
expect(payloads[0]).toMatchObject({
text: "Attached image",
mediaUrl: "/tmp/reply-image.png",
mediaUrls: ["/tmp/reply-image.png"],
});
});
it("uses raw final assistant text when visible-text extraction removed a media-only directive line", () => {
const payloads = buildPayloads({
lastAssistant: {
role: "assistant",
stopReason: "stop",
content: [
{
type: "text",
text: "MEDIA:/tmp/reply-image.png\nAttached image",
textSignature: JSON.stringify({
v: 1,
id: "item_final",
phase: "final_answer",
}),
},
],
} as AssistantMessage,
});
expect(payloads).toHaveLength(1);
expect(payloads[0]).toMatchObject({
text: "Attached image",
mediaUrl: "/tmp/reply-image.png",
mediaUrls: ["/tmp/reply-image.png"],
});
});
});

View File

@@ -6,6 +6,7 @@ import { isSilentReplyPayloadText, SILENT_REPLY_TOKEN } from "../../../auto-repl
import { formatToolAggregate } from "../../../auto-reply/tool-meta.js";
import type { OpenClawConfig } from "../../../config/types.openclaw.js";
import { isCronSessionKey } from "../../../routing/session-key.js";
import { extractAssistantTextForPhase } from "../../../shared/chat-message-content.js";
import {
normalizeOptionalLowercaseString,
normalizeOptionalString,
@@ -52,6 +53,18 @@ function isVerboseToolDetailEnabled(level?: VerboseLevel): boolean {
return level === "on" || level === "full";
}
function resolveRawAssistantAnswerText(lastAssistant: AssistantMessage | undefined): string {
if (!lastAssistant) {
return "";
}
return (
normalizeOptionalString(
extractAssistantTextForPhase(lastAssistant, { phase: "final_answer" }) ??
extractAssistantTextForPhase(lastAssistant),
) ?? ""
);
}
function shouldIncludeToolErrorDetails(params: {
lastToolError: ToolErrorSummary;
isCronTrigger?: boolean;
@@ -220,6 +233,7 @@ export function buildEmbeddedRunPayloads(params: {
const fallbackAnswerText = params.lastAssistant
? extractAssistantVisibleText(params.lastAssistant)
: "";
const fallbackRawAnswerText = resolveRawAssistantAnswerText(params.lastAssistant);
const shouldSuppressRawErrorText = (text: string) => {
if (!lastAssistantErrored) {
return false;
@@ -270,13 +284,32 @@ export function buildEmbeddedRunPayloads(params: {
}
return isRawApiErrorPayload(trimmed);
};
const rawAnswerDirectiveState = fallbackRawAnswerText
? parseReplyDirectives(fallbackRawAnswerText)
: null;
const rawAnswerHasMedia =
(rawAnswerDirectiveState?.mediaUrls?.length ?? 0) > 0 || rawAnswerDirectiveState?.audioAsVoice;
const assistantTextsHaveMedia = params.assistantTexts.some((text) => {
const parsed = parseReplyDirectives(text);
return (parsed.mediaUrls?.length ?? 0) > 0 || parsed.audioAsVoice;
});
const normalizedAssistantTexts = normalizeTextForComparison(params.assistantTexts.join("\n\n"));
const normalizedRawAnswerText = normalizeTextForComparison(rawAnswerDirectiveState?.text ?? "");
const shouldPreferRawAnswerText =
rawAnswerHasMedia &&
(!params.assistantTexts.length ||
(!assistantTextsHaveMedia &&
normalizedAssistantTexts.length > 0 &&
normalizedAssistantTexts === normalizedRawAnswerText));
const answerTexts = suppressAssistantArtifacts
? []
: (params.assistantTexts.length
? params.assistantTexts
: fallbackAnswerText
? [fallbackAnswerText]
: []
: (shouldPreferRawAnswerText && fallbackRawAnswerText
? [fallbackRawAnswerText]
: params.assistantTexts.length
? params.assistantTexts
: fallbackAnswerText
? [fallbackAnswerText]
: []
).filter((text) => !shouldSuppressRawErrorText(text));
let hasUserFacingAssistantReply = false;

View File

@@ -103,7 +103,7 @@ describe("buildReplyPayloads media filter integration", () => {
});
});
it("applies media filter after text filter", async () => {
it("drops duplicate caption text after matching media is stripped", async () => {
const { replyPayloads } = await buildReplyPayloads({
...baseParams,
payloads: [{ text: "hello world!", mediaUrl: "file:///tmp/photo.jpg" }],
@@ -111,10 +111,24 @@ describe("buildReplyPayloads media filter integration", () => {
messagingToolSentMediaUrls: ["file:///tmp/photo.jpg"],
});
// Text filter removes the payload entirely (text matched), so nothing remains.
expect(replyPayloads).toHaveLength(0);
});
it("keeps captioned media when only the caption matches a messaging tool send", async () => {
const { replyPayloads } = await buildReplyPayloads({
...baseParams,
payloads: [{ text: "hello world!", mediaUrl: "file:///tmp/photo.jpg" }],
messagingToolSentTexts: ["hello world!"],
messagingToolSentMediaUrls: ["file:///tmp/other.jpg"],
});
expect(replyPayloads).toHaveLength(1);
expect(replyPayloads[0]).toMatchObject({
text: "hello world!",
mediaUrl: "file:///tmp/photo.jpg",
});
});
it("does not dedupe text for cross-target messaging sends", async () => {
const { replyPayloads } = await buildReplyPayloads({
...baseParams,
@@ -246,6 +260,20 @@ describe("buildReplyPayloads media filter integration", () => {
expect(replyPayloads).toHaveLength(0);
});
it("suppresses warning text when silent media payloads fail normalization", async () => {
const normalizeMediaPaths = async () => {
throw new Error("file not found");
};
const { replyPayloads } = await buildReplyPayloads({
...baseParams,
payloads: [{ text: "NO_REPLY\nMEDIA: ./missing.png" }],
normalizeMediaPaths,
});
expect(replyPayloads).toHaveLength(0);
});
it("extracts markdown image replies into final payload media urls", async () => {
const { replyPayloads } = await buildReplyPayloads({
...baseParams,
@@ -310,6 +338,25 @@ describe("buildReplyPayloads media filter integration", () => {
expect(replyPayloads).toHaveLength(0);
});
it("deduplicates final payloads against directly sent block keys when streaming is enabled without a pipeline", async () => {
const { createBlockReplyContentKey } = await import("./block-reply-pipeline.js");
const directlySentBlockKeys = new Set<string>();
directlySentBlockKeys.add(
createBlockReplyContentKey({ text: "response", replyToId: "post-1" }),
);
const { replyPayloads } = await buildReplyPayloads({
...baseParams,
blockStreamingEnabled: true,
blockReplyPipeline: null,
directlySentBlockKeys,
replyToMode: "off",
payloads: [{ text: "response" }],
});
expect(replyPayloads).toHaveLength(0);
});
it("does not suppress same-target replies when accountId differs", async () => {
const { replyPayloads } = await buildReplyPayloads({
...baseParams,

View File

@@ -148,11 +148,18 @@ export async function buildReplyPayloads(params: {
currentMessageId: params.currentMessageId,
silentToken: SILENT_REPLY_TOKEN,
parseMode: "always",
}).payload;
return await normalizeReplyPayloadMedia({
payload: parsed,
});
const mediaNormalizedPayload = await normalizeReplyPayloadMedia({
payload: parsed.payload,
normalizeMediaPaths: params.normalizeMediaPaths,
});
if (
parsed.isSilent &&
!resolveSendableOutboundReplyParts(mediaNormalizedPayload).hasMedia
) {
mediaNormalizedPayload.text = undefined;
}
return mediaNormalizedPayload;
}),
)
).filter(isRenderablePayload);
@@ -199,32 +206,36 @@ export async function buildReplyPayloads(params: {
normalizeMediaPaths: params.normalizeMediaPaths,
})
: (params.messagingToolSentMediaUrls ?? []);
const dedupedPayloads = dedupeMessagingToolPayloads
? (dedupeRuntime ?? (await loadReplyPayloadsDedupeRuntime())).filterMessagingToolDuplicates({
payloads: silentFilteredPayloads,
sentTexts: messagingToolSentTexts,
})
: silentFilteredPayloads;
const mediaFilteredPayloads = dedupeMessagingToolPayloads
? (
dedupeRuntime ?? (await loadReplyPayloadsDedupeRuntime())
).filterMessagingToolMediaDuplicates({
payloads: dedupedPayloads,
payloads: silentFilteredPayloads,
sentMediaUrls: messagingToolSentMediaUrls,
})
: dedupedPayloads;
: silentFilteredPayloads;
const dedupedPayloads = dedupeMessagingToolPayloads
? (dedupeRuntime ?? (await loadReplyPayloadsDedupeRuntime())).filterMessagingToolDuplicates({
payloads: mediaFilteredPayloads,
sentTexts: messagingToolSentTexts,
})
: mediaFilteredPayloads;
const isDirectlySentBlockPayload = (payload: ReplyPayload) =>
Boolean(params.directlySentBlockKeys?.has(createBlockReplyContentKey(payload)));
// Filter out payloads already sent via pipeline or directly during tool flush.
const filteredPayloads = shouldDropFinalPayloads
? mediaFilteredPayloads.filter((payload) => payload.isError)
? dedupedPayloads.filter((payload) => payload.isError)
: params.blockStreamingEnabled
? mediaFilteredPayloads.filter(
(payload) => !params.blockReplyPipeline?.hasSentPayload(payload),
? dedupedPayloads.filter(
(payload) =>
!params.blockReplyPipeline?.hasSentPayload(payload) &&
!isDirectlySentBlockPayload(payload),
)
: params.directlySentBlockKeys?.size
? mediaFilteredPayloads.filter(
? dedupedPayloads.filter(
(payload) => !params.directlySentBlockKeys!.has(createBlockReplyContentKey(payload)),
)
: mediaFilteredPayloads;
: dedupedPayloads;
const replyPayloads = suppressMessagingToolReplies ? [] : filteredPayloads;
return {

View File

@@ -196,7 +196,7 @@ describe("runReplyAgent media path normalization", () => {
);
});
it("shares one media cache between direct block media and final payload filtering", async () => {
it("shares one media cache between block accumulation and final payload delivery", async () => {
let stagedIndex = 0;
resolveOutboundAttachmentFromUrlMock.mockImplementation(async (mediaUrl: string) => {
stagedIndex += 1;
@@ -233,18 +233,16 @@ describe("runReplyAgent media path normalization", () => {
}),
);
expect(result).toBeUndefined();
expect(resolveOutboundAttachmentFromUrlMock).toHaveBeenCalledTimes(1);
expect(onBlockReply).toHaveBeenCalledTimes(1);
expect(onBlockReply).toHaveBeenCalledWith({
text: undefined,
expect(result).toMatchObject({
text: "here is the chart",
mediaUrl: "/tmp/outbound-media/1-chart.png",
mediaUrls: ["/tmp/outbound-media/1-chart.png"],
replyToCurrent: undefined,
replyToId: "msg-1",
replyToTag: false,
audioAsVoice: false,
});
expect(resolveOutboundAttachmentFromUrlMock).toHaveBeenCalledTimes(1);
expect(onBlockReply).not.toHaveBeenCalled();
});
it("does not create a second media context inside runAgentTurnWithFallback when onBlockReply is provided", async () => {

View File

@@ -13,7 +13,7 @@ type BlockReplyPipelineLike = NonNullable<
>;
describe("createBlockReplyDeliveryHandler", () => {
it("sends media-bearing block replies even when block streaming is disabled", async () => {
it("keeps captioned media-bearing block replies buffered when block streaming is disabled", async () => {
const onBlockReply = vi.fn(async () => {});
const normalizeStreamingText = vi.fn((payload: { text?: string }) => ({
text: payload.text,
@@ -40,25 +40,49 @@ describe("createBlockReplyDeliveryHandler", () => {
replyToCurrent: true,
});
expect(onBlockReply).not.toHaveBeenCalled();
expect(directlySentBlockKeys).toEqual(new Set());
expect(typingSignals.signalTextDelta).toHaveBeenCalledWith("here's the vibe");
});
it("sends media-only block replies when block streaming is disabled", async () => {
const onBlockReply = vi.fn(async () => {});
const directlySentBlockKeys = new Set<string>();
const handler = createBlockReplyDeliveryHandler({
onBlockReply,
normalizeStreamingText: (payload) => ({ text: payload.text, skip: false }),
applyReplyToMode: (payload) => payload,
typingSignals: {
signalTextDelta: vi.fn(async () => {}),
} as unknown as TypingSignaler,
blockStreamingEnabled: false,
blockReplyPipeline: null,
directlySentBlockKeys,
});
await handler({
mediaUrls: ["/tmp/generated.png"],
replyToCurrent: true,
});
expect(onBlockReply).toHaveBeenCalledWith({
text: undefined,
mediaUrl: "/tmp/generated.png",
mediaUrls: ["/tmp/generated.png"],
replyToCurrent: true,
replyToId: undefined,
replyToTag: undefined,
audioAsVoice: false,
text: undefined,
});
expect(directlySentBlockKeys).toEqual(
new Set([
createBlockReplyContentKey({
text: "here's the vibe",
mediaUrls: ["/tmp/generated.png"],
replyToCurrent: true,
}),
]),
);
expect(typingSignals.signalTextDelta).toHaveBeenCalledWith("here's the vibe");
});
it("keeps text-only block replies buffered when block streaming is disabled", async () => {

View File

@@ -159,15 +159,14 @@ export function createBlockReplyDeliveryHandler(params: {
trackingPayload: blockPayload,
payload: blockPayload,
});
} else if (blockHasMedia) {
// When block streaming is disabled, text-only block replies are accumulated into the
// final response. Media cannot be reconstructed later, so send it immediately and let
// the assistant's final text arrive through the normal final-reply path.
} else if (blockHasMedia && !blockPayload.text) {
// Media-only block replies (for example orphaned tool attachments) are not reconstructible
// from the assistant's final text, so they still need a direct fallback when streaming is off.
await sendDirectBlockReply({
onBlockReply: params.onBlockReply,
directlySentBlockKeys: params.directlySentBlockKeys,
trackingPayload: blockPayload,
payload: { ...blockPayload, text: undefined },
payload: blockPayload,
});
}
// When streaming is disabled entirely, text-only blocks are accumulated in final text.

View File

@@ -120,6 +120,7 @@ describe("createReplyMediaPathNormalizer", () => {
5 * 1024 * 1024,
expect.any(Object),
);
expect(result.text).toBe("⚠️ Media failed.");
});
it("drops host file URLs when no sandbox mapping applies", async () => {
@@ -319,6 +320,45 @@ describe("createReplyMediaPathNormalizer", () => {
});
});
it("keeps reply text and appends a warning when all reply media is dropped", async () => {
resolveOutboundAttachmentFromUrl.mockRejectedValueOnce(new Error("file not found"));
const normalize = createReplyMediaPathNormalizer({
cfg: {},
sessionKey: "session-key",
workspaceDir: "/tmp/agent-workspace",
});
const result = await normalize({
text: "WA_MEDIA_DM_07",
mediaUrls: ["./out/missing.png"],
});
expect(result).toMatchObject({
text: "WA_MEDIA_DM_07\n⚠ Media failed.",
mediaUrl: undefined,
mediaUrls: undefined,
});
});
it("returns a warning-only text reply when media-only output is dropped upstream", async () => {
resolveOutboundAttachmentFromUrl.mockRejectedValueOnce(new Error("file not found"));
const normalize = createReplyMediaPathNormalizer({
cfg: {},
sessionKey: "session-key",
workspaceDir: "/tmp/agent-workspace",
});
const result = await normalize({
mediaUrls: ["./out/missing.png"],
});
expect(result).toMatchObject({
text: "⚠️ Media failed.",
mediaUrl: undefined,
mediaUrls: undefined,
});
});
it("threads requester context into shared outbound media access", async () => {
const normalize = createReplyMediaPathNormalizer({
cfg: {},

View File

@@ -60,6 +60,10 @@ function resolveReplyMediaMaxBytes(params: {
: MEDIA_MAX_BYTES;
}
function formatBlockedReplyMediaWarning(): string {
return "⚠️ Media failed.";
}
export function createReplyMediaPathNormalizer(params: {
cfg: OpenClawConfig;
sessionKey?: string;
@@ -206,11 +210,13 @@ export function createReplyMediaPathNormalizer(params: {
const normalizedMedia: string[] = [];
const seen = new Set<string>();
let firstMediaDropError: unknown;
for (const media of mediaList) {
let normalized: string;
try {
normalized = await normalizeMediaSource(media);
} catch (err) {
firstMediaDropError ??= err;
logVerbose(`dropping blocked reply media ${media}: ${String(err)}`);
continue;
}
@@ -222,8 +228,10 @@ export function createReplyMediaPathNormalizer(params: {
}
if (normalizedMedia.length === 0) {
const warning = firstMediaDropError ? formatBlockedReplyMediaWarning() : undefined;
return {
...payload,
text: warning ? (payload.text ? `${payload.text}\n${warning}` : warning) : payload.text,
mediaUrl: undefined,
mediaUrls: undefined,
};

View File

@@ -18,7 +18,12 @@ export function filterMessagingToolDuplicates(params: {
if (sentTexts.length === 0) {
return payloads;
}
return payloads.filter((payload) => !isMessagingToolDuplicate(payload.text ?? "", sentTexts));
return payloads.filter((payload) => {
if (payload.mediaUrl || payload.mediaUrls?.length) {
return true;
}
return !isMessagingToolDuplicate(payload.text ?? "", sentTexts);
});
}
export function filterMessagingToolMediaDuplicates(params: {