From 5b3fce4c855ed890a00cd2c28feba9fdb4456d29 Mon Sep 17 00:00:00 2001 From: Gustavo Madeira Santana Date: Sat, 21 Mar 2026 15:14:03 -0700 Subject: [PATCH] Outbound: preserve routed audioAsVoice delivery --- extensions/matrix/src/outbound.test.ts | 2 ++ extensions/matrix/src/outbound.ts | 5 ++- src/auto-reply/reply/route-reply.test.ts | 25 ++++++++++++++ src/channels/plugins/types.adapters.ts | 1 + src/commands/agent.delivery.test.ts | 37 +++++++++++++++++++- src/infra/outbound/deliver.test.ts | 43 ++++++++++++++++++++++++ src/infra/outbound/deliver.ts | 15 ++++++++- src/infra/outbound/outbound.test.ts | 17 ++++++++-- src/infra/outbound/payloads.test.ts | 17 ++++++++-- src/infra/outbound/payloads.ts | 4 +++ 10 files changed, 157 insertions(+), 9 deletions(-) diff --git a/extensions/matrix/src/outbound.test.ts b/extensions/matrix/src/outbound.test.ts index 29de2346868..4516671b614 100644 --- a/extensions/matrix/src/outbound.test.ts +++ b/extensions/matrix/src/outbound.test.ts @@ -77,6 +77,7 @@ describe("matrixOutbound cfg threading", () => { mediaUrl: "file:///tmp/cat.png", mediaLocalRoots: ["/tmp/openclaw"], accountId: "default", + audioAsVoice: true, }); expect(mocks.sendMessageMatrix).toHaveBeenCalledWith( @@ -86,6 +87,7 @@ describe("matrixOutbound cfg threading", () => { cfg, mediaUrl: "file:///tmp/cat.png", mediaLocalRoots: ["/tmp/openclaw"], + audioAsVoice: true, }), ); }); diff --git a/extensions/matrix/src/outbound.ts b/extensions/matrix/src/outbound.ts index 5a715c54a1d..7fa4bda4570 100644 --- a/extensions/matrix/src/outbound.ts +++ b/extensions/matrix/src/outbound.ts @@ -7,7 +7,7 @@ export const matrixOutbound: ChannelOutboundAdapter = { chunker: (text, limit) => getMatrixRuntime().channel.text.chunkMarkdownText(text, limit), chunkerMode: "markdown", textChunkLimit: 4000, - sendText: async ({ cfg, to, text, deps, replyToId, threadId, accountId }) => { + sendText: async ({ cfg, to, text, deps, replyToId, threadId, accountId, audioAsVoice }) => { const send = resolveOutboundSendDep(deps, "matrix") ?? sendMessageMatrix; const resolvedThreadId = @@ -17,6 +17,7 @@ export const matrixOutbound: ChannelOutboundAdapter = { replyToId: replyToId ?? undefined, threadId: resolvedThreadId, accountId: accountId ?? undefined, + audioAsVoice, }); return { channel: "matrix", @@ -34,6 +35,7 @@ export const matrixOutbound: ChannelOutboundAdapter = { replyToId, threadId, accountId, + audioAsVoice, }) => { const send = resolveOutboundSendDep(deps, "matrix") ?? sendMessageMatrix; @@ -46,6 +48,7 @@ export const matrixOutbound: ChannelOutboundAdapter = { replyToId: replyToId ?? undefined, threadId: resolvedThreadId, accountId: accountId ?? undefined, + audioAsVoice, }); return { channel: "matrix", diff --git a/src/auto-reply/reply/route-reply.test.ts b/src/auto-reply/reply/route-reply.test.ts index c0eca8d6996..d590abe5844 100644 --- a/src/auto-reply/reply/route-reply.test.ts +++ b/src/auto-reply/reply/route-reply.test.ts @@ -414,6 +414,31 @@ describe("routeReply", () => { ); }); + it("preserves audioAsVoice on routed outbound payloads", async () => { + mocks.deliverOutboundPayloads.mockClear(); + mocks.deliverOutboundPayloads.mockResolvedValue([]); + await routeReply({ + payload: { text: "voice caption", mediaUrl: "file:///tmp/clip.mp3", audioAsVoice: true }, + channel: "slack", + to: "channel:C123", + cfg: {} as never, + }); + expect(mocks.deliverOutboundPayloads).toHaveBeenCalledTimes(1); + expect(mocks.deliverOutboundPayloads).toHaveBeenCalledWith( + expect.objectContaining({ + channel: "slack", + to: "channel:C123", + payloads: [ + expect.objectContaining({ + text: "voice caption", + mediaUrl: "file:///tmp/clip.mp3", + audioAsVoice: true, + }), + ], + }), + ); + }); + it("uses replyToId as threadTs for Slack", async () => { mocks.sendMessageSlack.mockClear(); await routeReply({ diff --git a/src/channels/plugins/types.adapters.ts b/src/channels/plugins/types.adapters.ts index 14a7ab10b8e..a5adf62c280 100644 --- a/src/channels/plugins/types.adapters.ts +++ b/src/channels/plugins/types.adapters.ts @@ -130,6 +130,7 @@ export type ChannelOutboundContext = { to: string; text: string; mediaUrl?: string; + audioAsVoice?: boolean; mediaLocalRoots?: readonly string[]; gifPlayback?: boolean; /** Send image as document to avoid Telegram compression. */ diff --git a/src/commands/agent.delivery.test.ts b/src/commands/agent.delivery.test.ts index e13cf219966..9e4bc0693e3 100644 --- a/src/commands/agent.delivery.test.ts +++ b/src/commands/agent.delivery.test.ts @@ -1,4 +1,5 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; +import type { ReplyPayload } from "../auto-reply/types.js"; import type { CliDeps } from "../cli/deps.js"; import type { OpenClawConfig } from "../config/config.js"; import type { SessionEntry } from "../config/sessions.js"; @@ -52,11 +53,17 @@ describe("deliverAgentCommandResult", () => { sessionEntry?: SessionEntry; runtime?: RuntimeEnv; resultText?: string; + payloads?: ReplyPayload[]; }) { const cfg = {} as OpenClawConfig; const deps = {} as CliDeps; const runtime = params.runtime ?? createRuntime(); - const result = createResult(params.resultText); + const result = params.payloads + ? { + payloads: params.payloads, + meta: { durationMs: 1 }, + } + : createResult(params.resultText); await deliverAgentCommandResult({ cfg, @@ -284,4 +291,32 @@ describe("deliverAgentCommandResult", () => { expect(line).toContain("channel=webchat"); expect(line).toContain("ANNOUNCE_SKIP"); }); + + it("preserves audioAsVoice in JSON output envelopes", async () => { + const runtime = createRuntime(); + await runDelivery({ + runtime, + payloads: [{ text: "voice caption", mediaUrl: "file:///tmp/clip.mp3", audioAsVoice: true }], + opts: { + message: "hello", + deliver: false, + json: true, + }, + }); + + expect(runtime.log).toHaveBeenCalledTimes(1); + expect( + JSON.parse(String((runtime.log as ReturnType).mock.calls[0]?.[0])), + ).toEqual({ + payloads: [ + { + text: "voice caption", + mediaUrl: "file:///tmp/clip.mp3", + mediaUrls: ["file:///tmp/clip.mp3"], + audioAsVoice: true, + }, + ], + meta: { durationMs: 1 }, + }); + }); }); diff --git a/src/infra/outbound/deliver.test.ts b/src/infra/outbound/deliver.test.ts index 6bf69a519f8..2f97906e892 100644 --- a/src/infra/outbound/deliver.test.ts +++ b/src/infra/outbound/deliver.test.ts @@ -501,6 +501,49 @@ describe("deliverOutboundPayloads", () => { ); }); + it("forwards audioAsVoice through generic plugin media delivery", async () => { + const sendMedia = vi.fn(async () => ({ + channel: "matrix" as const, + messageId: "mx-1", + roomId: "!room:example", + })); + setActivePluginRegistry( + createTestRegistry([ + { + pluginId: "matrix", + source: "test", + plugin: createOutboundTestPlugin({ + id: "matrix", + outbound: { + deliveryMode: "direct", + sendText: async ({ to, text }) => ({ + channel: "matrix", + messageId: `${to}:${text}`, + }), + sendMedia, + }, + }), + }, + ]), + ); + + await deliverOutboundPayloads({ + cfg: { channels: { matrix: {} } } as OpenClawConfig, + channel: "matrix", + to: "room:!room:example", + payloads: [{ text: "voice caption", mediaUrl: "file:///tmp/clip.mp3", audioAsVoice: true }], + }); + + expect(sendMedia).toHaveBeenCalledWith( + expect.objectContaining({ + to: "room:!room:example", + text: "voice caption", + mediaUrl: "file:///tmp/clip.mp3", + audioAsVoice: true, + }), + ); + }); + it("includes OpenClaw tmp root in whatsapp mediaLocalRoots", async () => { const sendWhatsApp = vi.fn().mockResolvedValue({ messageId: "w1", toJid: "jid" }); diff --git a/src/infra/outbound/deliver.ts b/src/infra/outbound/deliver.ts index e1be816c910..707477b8339 100644 --- a/src/infra/outbound/deliver.ts +++ b/src/infra/outbound/deliver.ts @@ -78,6 +78,7 @@ type ChannelHandler = { overrides?: { replyToId?: string | null; threadId?: string | number | null; + audioAsVoice?: boolean; }, ) => Promise; sendFormattedText?: ( @@ -85,6 +86,7 @@ type ChannelHandler = { overrides?: { replyToId?: string | null; threadId?: string | number | null; + audioAsVoice?: boolean; }, ) => Promise; sendFormattedMedia?: ( @@ -93,6 +95,7 @@ type ChannelHandler = { overrides?: { replyToId?: string | null; threadId?: string | number | null; + audioAsVoice?: boolean; }, ) => Promise; sendText: ( @@ -100,6 +103,7 @@ type ChannelHandler = { overrides?: { replyToId?: string | null; threadId?: string | number | null; + audioAsVoice?: boolean; }, ) => Promise; sendMedia: ( @@ -108,6 +112,7 @@ type ChannelHandler = { overrides?: { replyToId?: string | null; threadId?: string | number | null; + audioAsVoice?: boolean; }, ) => Promise; }; @@ -159,10 +164,12 @@ function createPluginHandler( const resolveCtx = (overrides?: { replyToId?: string | null; threadId?: string | number | null; + audioAsVoice?: boolean; }): Omit => ({ ...baseCtx, replyToId: overrides?.replyToId ?? baseCtx.replyToId, threadId: overrides?.threadId ?? baseCtx.threadId, + audioAsVoice: overrides?.audioAsVoice, }); return { chunker, @@ -335,6 +342,7 @@ function buildPayloadSummary(payload: ReplyPayload): NormalizedOutboundPayload { return { text: parts.text, mediaUrls: parts.mediaUrls, + audioAsVoice: payload.audioAsVoice === true ? true : undefined, interactive: payload.interactive, channelData: payload.channelData, }; @@ -572,7 +580,11 @@ async function deliverOutboundPayloadsCore( const sendTextChunks = async ( text: string, - overrides?: { replyToId?: string | null; threadId?: string | number | null }, + overrides?: { + replyToId?: string | null; + threadId?: string | number | null; + audioAsVoice?: boolean; + }, ) => { throwIfAborted(abortSignal); if (!handler.chunker || textLimit === undefined) { @@ -657,6 +669,7 @@ async function deliverOutboundPayloadsCore( const sendOverrides = { replyToId: effectivePayload.replyToId ?? params.replyToId ?? undefined, threadId: params.threadId ?? undefined, + audioAsVoice: effectivePayload.audioAsVoice === true ? true : undefined, forceDocument: params.forceDocument, }; if ( diff --git a/src/infra/outbound/outbound.test.ts b/src/infra/outbound/outbound.test.ts index 006a160e6ab..481fc8c7888 100644 --- a/src/infra/outbound/outbound.test.ts +++ b/src/infra/outbound/outbound.test.ts @@ -1308,21 +1308,29 @@ describe("normalizeOutboundPayloadsForJson", () => { { input: [ { text: "hi" }, - { text: "photo", mediaUrl: "https://x.test/a.jpg" }, + { text: "photo", mediaUrl: "https://x.test/a.jpg", audioAsVoice: true }, { text: "multi", mediaUrls: ["https://x.test/1.png"] }, ], expected: [ - { text: "hi", mediaUrl: null, mediaUrls: undefined, channelData: undefined }, + { + text: "hi", + mediaUrl: null, + mediaUrls: undefined, + audioAsVoice: undefined, + channelData: undefined, + }, { text: "photo", mediaUrl: "https://x.test/a.jpg", mediaUrls: ["https://x.test/a.jpg"], + audioAsVoice: true, channelData: undefined, }, { text: "multi", mediaUrl: null, mediaUrls: ["https://x.test/1.png"], + audioAsVoice: undefined, channelData: undefined, }, ], @@ -1338,6 +1346,7 @@ describe("normalizeOutboundPayloadsForJson", () => { text: "", mediaUrl: null, mediaUrls: ["https://x.test/a.png", "https://x.test/b.png"], + audioAsVoice: undefined, channelData: undefined, }, ], @@ -1362,7 +1371,9 @@ describe("normalizeOutboundPayloadsForJson", () => { { text: "Reasoning:\n_step_", isReasoning: true }, { text: "final answer" }, ]); - expect(normalized).toEqual([{ text: "final answer", mediaUrl: null, mediaUrls: undefined }]); + expect(normalized).toEqual([ + { text: "final answer", mediaUrl: null, mediaUrls: undefined, audioAsVoice: undefined }, + ]); }); }); diff --git a/src/infra/outbound/payloads.test.ts b/src/infra/outbound/payloads.test.ts index ef5ccbced53..3aaf1a2f61e 100644 --- a/src/infra/outbound/payloads.test.ts +++ b/src/infra/outbound/payloads.test.ts @@ -83,21 +83,29 @@ describe("normalizeOutboundPayloadsForJson", () => { { input: [ { text: "hi" }, - { text: "photo", mediaUrl: "https://x.test/a.jpg" }, + { text: "photo", mediaUrl: "https://x.test/a.jpg", audioAsVoice: true }, { text: "multi", mediaUrls: ["https://x.test/1.png"] }, ], expected: [ - { text: "hi", mediaUrl: null, mediaUrls: undefined, channelData: undefined }, + { + text: "hi", + mediaUrl: null, + mediaUrls: undefined, + audioAsVoice: undefined, + channelData: undefined, + }, { text: "photo", mediaUrl: "https://x.test/a.jpg", mediaUrls: ["https://x.test/a.jpg"], + audioAsVoice: true, channelData: undefined, }, { text: "multi", mediaUrl: null, mediaUrls: ["https://x.test/1.png"], + audioAsVoice: undefined, channelData: undefined, }, ], @@ -113,6 +121,7 @@ describe("normalizeOutboundPayloadsForJson", () => { text: "", mediaUrl: null, mediaUrls: ["https://x.test/a.png", "https://x.test/b.png"], + audioAsVoice: undefined, channelData: undefined, }, ], @@ -138,7 +147,9 @@ describe("normalizeOutboundPayloadsForJson", () => { { text: "Reasoning:\n_step_", isReasoning: true }, { text: "final answer" }, ]), - ).toEqual([{ text: "final answer", mediaUrl: null, mediaUrls: undefined }]); + ).toEqual([ + { text: "final answer", mediaUrl: null, mediaUrls: undefined, audioAsVoice: undefined }, + ]); }); }); diff --git a/src/infra/outbound/payloads.ts b/src/infra/outbound/payloads.ts index 39da3d2fdcb..285dd99c0be 100644 --- a/src/infra/outbound/payloads.ts +++ b/src/infra/outbound/payloads.ts @@ -16,6 +16,7 @@ import { export type NormalizedOutboundPayload = { text: string; mediaUrls: string[]; + audioAsVoice?: boolean; interactive?: InteractiveReply; channelData?: Record; }; @@ -24,6 +25,7 @@ export type OutboundPayloadJson = { text: string; mediaUrl: string | null; mediaUrls?: string[]; + audioAsVoice?: boolean; interactive?: InteractiveReply; channelData?: Record; }; @@ -111,6 +113,7 @@ export function normalizeOutboundPayloads( normalizedPayloads.push({ text, mediaUrls: parts.mediaUrls, + audioAsVoice: payload.audioAsVoice === true ? true : undefined, ...(hasInteractive ? { interactive } : {}), ...(hasChannelData ? { channelData } : {}), }); @@ -128,6 +131,7 @@ export function normalizeOutboundPayloadsForJson( text: parts.text, mediaUrl: payload.mediaUrl ?? null, mediaUrls: parts.mediaUrls.length ? parts.mediaUrls : undefined, + audioAsVoice: payload.audioAsVoice === true ? true : undefined, interactive: payload.interactive, channelData: payload.channelData, });