From b0c55eb65948b633f61fe04e04c91d7da02cf4f8 Mon Sep 17 00:00:00 2001
From: Peter Steinberger <steipete@gmail.com>
Date: Sat, 25 Apr 2026 09:26:08 +0100
Subject: [PATCH] fix(feishu): transcode voice TTS audio

---
 CHANGELOG.md                                  |   1 +
 docs/channels/feishu.md                       |   8 ++
 docs/tools/tts.md                             |   8 +-
 extensions/feishu/src/channel.test.ts         |  28 ++++
 extensions/feishu/src/channel.ts              |  12 ++
 extensions/feishu/src/media.test.ts           | 111 ++++++++++++++++
 extensions/feishu/src/media.ts                | 120 +++++++++++++++++-
 extensions/feishu/src/outbound.test.ts        |  18 +++
 extensions/feishu/src/outbound.ts             |   2 +
 .../feishu/src/reply-dispatcher.test.ts       |  15 +++
 extensions/feishu/src/reply-dispatcher.ts     |   1 +
 extensions/speech-core/src/tts.test.ts        |  30 +++++
 extensions/speech-core/src/tts.ts             |  33 ++++-
 src/agents/tools/tts-tool.test.ts             |  22 ++++
 src/agents/tools/tts-tool.ts                  |   2 +-
 src/plugin-sdk/tts-runtime.types.ts           |  11 ++
 16 files changed, 416 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 12f087a5f0d..a39003f644e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,7 @@ Docs: https://docs.openclaw.ai
 - Control UI: make `/usage` use the fresh context snapshot for context percentage, and include cache-write tokens in the Usage overview cache-hit denominator. Fixes #47885. Thanks @imwyvern and @Ante042.
 - GitHub Copilot: preserve encrypted Responses reasoning item IDs during replay so Copilot can validate encrypted reasoning payloads across requests. (#71448) Thanks @a410979729-sys.
 - Agents/replies: recover final-answer text when streamed assistant chunks contain only whitespace, preventing completed turns from surfacing as empty-payload errors. Fixes #71454. (#71467) Thanks @Sanjays2402.
+- Feishu/TTS: transcode voice-intent MP3 and other audio replies to Ogg/Opus before sending native Feishu audio bubbles, while keeping ordinary MP3 attachments as files. Fixes #61249 and #37868.
 - Telegram/webhook: acknowledge validated webhook updates before running bot middleware, keeping slow agent turns from tripping Telegram delivery retries while preserving per-chat processing lanes. Fixes #71392. Thanks @joelforsberg46-source.
 - MCP: retire one-shot embedded bundled MCP runtimes at run end, skip bundle-MCP startup when a runtime tool allowlist cannot reach bundle-MCP tools, and add `mcp.sessionIdleTtlMs` idle eviction for leaked session runtimes. Fixes #71106, #71110, #70389, and #70808.
 - MCP/config reload: hot-apply `mcp.*` changes by disposing cached session MCP runtimes, and dispose bundled MCP runtimes during gateway shutdown so removed `mcp.servers` entries reap child processes promptly. Fixes #60656.
diff --git a/docs/channels/feishu.md b/docs/channels/feishu.md
index 50571935ae4..c839818a492 100644
--- a/docs/channels/feishu.md
+++ b/docs/channels/feishu.md
@@ -424,6 +424,14 @@ Full configuration: [Gateway configuration](/gateway/configuration)
 - ✅ Interactive cards (including streaming updates)
 - ⚠️ Rich text (post-style formatting; doesn't support full Feishu/Lark authoring capabilities)
 
+Native Feishu/Lark audio bubbles use the Feishu `audio` message type and require
+Ogg/Opus upload media (`file_type: "opus"`). Existing `.opus` and `.ogg` media
+is sent directly as native audio. MP3/WAV/M4A and other likely audio formats are
+transcoded to 48kHz Ogg/Opus with `ffmpeg` only when the reply requests voice
+delivery (`audioAsVoice` / message tool `asVoice`, including TTS voice-note
+replies). Ordinary MP3 attachments stay regular files. If `ffmpeg` is missing or
+conversion fails, OpenClaw falls back to a file attachment and logs the reason.
+
 ### Threads and replies
 
 - ✅ Inline replies
diff --git a/docs/tools/tts.md b/docs/tools/tts.md
index 16780a098f0..2d7c6cb9c99 100644
--- a/docs/tools/tts.md
+++ b/docs/tools/tts.md
@@ -489,8 +489,12 @@ These override `messages.tts.*` for that host.
 
 ## Output formats (fixed)
 
-- **Feishu / Matrix / Telegram / WhatsApp**: Opus voice message (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
+- **Feishu / Matrix / Telegram / WhatsApp**: voice-note replies prefer Opus (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
   - 48kHz / 64kbps is a good voice message tradeoff.
+- **Feishu**: when a voice-note reply is produced as MP3/WAV/M4A or another
+  likely audio file, the Feishu plugin transcodes it to 48kHz Ogg/Opus with
+  `ffmpeg` before sending the native `audio` bubble. If conversion fails, Feishu
+  receives the original file as an attachment.
 - **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI).
   - 44.1kHz / 128kbps is the default balance for speech clarity.
 - **MiniMax**: MP3 (`speech-2.8-hd` model, 32kHz sample rate) for normal audio attachments. For voice-note targets such as Feishu and Telegram, OpenClaw transcodes the MiniMax MP3 to 48kHz Opus with `ffmpeg` before delivery.
@@ -572,6 +576,8 @@ Notes:
 The `tts` tool converts text to speech and returns an audio attachment for
 reply delivery. When the channel is Feishu, Matrix, Telegram, or WhatsApp,
 the audio is delivered as a voice message rather than a file attachment.
+Feishu can transcode non-Opus TTS output on this path when `ffmpeg` is
+available.
 It accepts optional `channel` and `timeoutMs` fields; `timeoutMs` is a
 per-call provider request timeout in milliseconds.
 
diff --git a/extensions/feishu/src/channel.test.ts b/extensions/feishu/src/channel.test.ts
index 2811b7fcfa4..a6e7af830ce 100644
--- a/extensions/feishu/src/channel.test.ts
+++ b/extensions/feishu/src/channel.test.ts
@@ -461,6 +461,34 @@ describe("feishuPlugin actions", () => {
     expect(result?.details).toMatchObject({ messageId: "om_media" });
   });
 
+  it("passes asVoice through media sends", async () => {
+    feishuOutboundSendMediaMock.mockResolvedValueOnce({
+      channel: "feishu",
+      messageId: "om_voice",
+      details: { messageId: "om_voice", chatId: "oc_group_1" },
+    });
+
+    await feishuPlugin.actions?.handleAction?.({
+      action: "send",
+      params: {
+        to: "chat:oc_group_1",
+        media: "https://example.com/reply.mp3",
+        asVoice: true,
+      },
+      cfg,
+      accountId: undefined,
+      toolContext: {},
+      mediaLocalRoots: [],
+    } as never);
+
+    expect(feishuOutboundSendMediaMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        mediaUrl: "https://example.com/reply.mp3",
+        audioAsVoice: true,
+      }),
+    );
+  });
+
   it("reads messages", async () => {
     getMessageFeishuMock.mockResolvedValueOnce({
       messageId: "om_1",
diff --git a/extensions/feishu/src/channel.ts b/extensions/feishu/src/channel.ts
index 55ef53d8e42..07844d57a0c 100644
--- a/extensions/feishu/src/channel.ts
+++ b/extensions/feishu/src/channel.ts
@@ -81,6 +81,16 @@ function readFeishuMediaParam(params: Record<string, unknown>): string | undefin
   return media.trim() ? media : undefined;
 }
 
+/**
+ * Returns the first value under `keys` (scanned in order) that is a real boolean.
+ * Non-boolean values such as "true" or 1 are skipped rather than coerced.
+ */
+function readBooleanParam(params: Record<string, unknown>, keys: string[]): boolean | undefined {
+  return keys
+    .map((key) => params[key])
+    .find((value): value is boolean => typeof value === "boolean");
+}
+
 function hasLegacyFeishuCardCommandValue(actionValue: unknown): boolean {
   return (
     isRecord(actionValue) &&
@@ -695,6 +705,7 @@ export const feishuPlugin: ChannelPlugin<ResolvedFeishuAccount, FeishuProbeResul
             const presentation = normalizeMessagePresentation(ctx.params.presentation);
             const text = readFirstString(ctx.params, ["text", "message"]);
             const mediaUrl = readFeishuMediaParam(ctx.params);
+            const audioAsVoice = readBooleanParam(ctx.params, ["asVoice", "audioAsVoice"]);
             const card = presentation
               ? buildFeishuPresentationCard({ presentation, fallbackText: text })
               : undefined;
@@ -734,6 +745,7 @@ export const feishuPlugin: ChannelPlugin<ResolvedFeishuAccount, FeishuProbeResul
                 accountId: ctx.accountId ?? undefined,
                 mediaLocalRoots: ctx.mediaLocalRoots,
                 replyToId: replyToMessageId,
+                ...(audioAsVoice === true ? { audioAsVoice: true } : {}),
               });
             } else {
               result = await runtime.sendMessageFeishu({
diff --git a/extensions/feishu/src/media.test.ts b/extensions/feishu/src/media.test.ts
index a6ddc834c67..0ed50ae6c8c 100644
--- a/extensions/feishu/src/media.test.ts
+++ b/extensions/feishu/src/media.test.ts
@@ -9,6 +9,7 @@ const resolveFeishuAccountMock = vi.hoisted(() => vi.fn());
 const normalizeFeishuTargetMock = vi.hoisted(() => vi.fn());
 const resolveReceiveIdTypeMock = vi.hoisted(() => vi.fn());
 const loadWebMediaMock = vi.hoisted(() => vi.fn());
+const runFfmpegMock = vi.hoisted(() => vi.fn());
 
 const fileCreateMock = vi.hoisted(() => vi.fn());
 const imageCreateMock = vi.hoisted(() => vi.fn());
@@ -42,6 +43,14 @@ vi.mock("./runtime.js", () => ({
   }),
 }));
 
+vi.mock("openclaw/plugin-sdk/media-runtime", async (importOriginal) => {
+  const actual = await importOriginal<typeof import("openclaw/plugin-sdk/media-runtime")>();
+  return {
+    ...actual,
+    runFfmpeg: runFfmpegMock,
+  };
+});
+
 vi.mock("../../../src/channels/plugins/bundled.js", () => ({
   bundledChannelPlugins: [],
   bundledChannelSetupPlugins: [],
@@ -145,6 +154,10 @@ describe("sendMediaFeishu msg_type routing", () => {
 
     imageGetMock.mockResolvedValue(Buffer.from("image-bytes"));
     messageResourceGetMock.mockResolvedValue(Buffer.from("resource-bytes"));
+    runFfmpegMock.mockImplementation(async (args: string[]) => {
+      await fs.writeFile(args.at(-1) ?? "", Buffer.from("opus-output"));
+      return "";
+    });
   });
 
   it("uses msg_type=media for mp4 video", async () => {
@@ -260,6 +273,104 @@ describe("sendMediaFeishu msg_type routing", () => {
         data: expect.objectContaining({ msg_type: "file" }),
       }),
     );
+    expect(runFfmpegMock).not.toHaveBeenCalled();
+  });
+
+  it("transcodes voice-intent mp3 to msg_type=audio", async () => {
+    loadWebMediaMock.mockResolvedValueOnce({
+      buffer: Buffer.from("remote-mp3"),
+      fileName: "reply.mp3",
+      kind: "audio",
+      contentType: "audio/mpeg",
+    });
+
+    await sendMediaFeishu({
+      cfg: emptyConfig,
+      to: "user:ou_target",
+      mediaUrl: "https://example.com/reply.mp3",
+      audioAsVoice: true,
+    });
+
+    expect(runFfmpegMock).toHaveBeenCalledWith(
+      expect.arrayContaining(["-c:a", "libopus", "-ar", "48000", "-b:a", "64k"]),
+    );
+    expect(fileCreateMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        data: expect.objectContaining({
+          file_type: "opus",
+          file_name: "voice.ogg",
+          file: Buffer.from("opus-output"),
+        }),
+      }),
+    );
+    expect(messageCreateMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        data: expect.objectContaining({ msg_type: "audio" }),
+      }),
+    );
+  });
+
+  it("leaves native voice audio unchanged when audioAsVoice is true", async () => {
+    await sendMediaFeishu({
+      cfg: emptyConfig,
+      to: "user:ou_target",
+      mediaBuffer: Buffer.from("opus"),
+      fileName: "reply.ogg",
+      audioAsVoice: true,
+    });
+
+    expect(runFfmpegMock).not.toHaveBeenCalled();
+    expect(fileCreateMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        data: expect.objectContaining({
+          file_type: "opus",
+          file_name: "reply.ogg",
+        }),
+      }),
+    );
+    expect(messageCreateMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        data: expect.objectContaining({ msg_type: "audio" }),
+      }),
+    );
+  });
+
+  it("falls back to file when voice-intent audio cannot be transcoded", async () => {
+    const warnSpy = vi.spyOn(console, "warn").mockImplementation(() => undefined);
+    runFfmpegMock.mockRejectedValueOnce(new Error("ffmpeg missing"));
+    loadWebMediaMock.mockResolvedValueOnce({
+      buffer: Buffer.from("remote-mp3"),
+      fileName: "reply.mp3",
+      kind: "audio",
+      contentType: "audio/mpeg",
+    });
+
+    await sendMediaFeishu({
+      cfg: emptyConfig,
+      to: "user:ou_target",
+      mediaUrl: "https://example.com/reply.mp3",
+      audioAsVoice: true,
+    });
+
+    expect(fileCreateMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        data: expect.objectContaining({
+          file_type: "stream",
+          file_name: "reply.mp3",
+          file: Buffer.from("remote-mp3"),
+        }),
+      }),
+    );
+    expect(messageCreateMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        data: expect.objectContaining({ msg_type: "file" }),
+      }),
+    );
+    expect(warnSpy).toHaveBeenCalledWith(
+      expect.stringContaining("audioAsVoice transcode failed"),
+      expect.any(Error),
+    );
+    warnSpy.mockRestore();
   });
 
   it("configures the media client timeout for image uploads", async () => {
diff --git a/extensions/feishu/src/media.ts b/extensions/feishu/src/media.ts
index 6497db99955..529dba5b616 100644
--- a/extensions/feishu/src/media.ts
+++ b/extensions/feishu/src/media.ts
@@ -3,7 +3,11 @@ import path from "node:path";
 import { Readable } from "node:stream";
 import type * as Lark from "@larksuiteoapi/node-sdk";
 import { mediaKindFromMime } from "openclaw/plugin-sdk/media-mime";
-import { withTempDownloadPath } from "openclaw/plugin-sdk/temp-path";
+import { MEDIA_FFMPEG_MAX_AUDIO_DURATION_SECS, runFfmpeg } from "openclaw/plugin-sdk/media-runtime";
+import {
+  resolvePreferredOpenClawTmpDir,
+  withTempDownloadPath,
+} from "openclaw/plugin-sdk/temp-path";
 import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtime";
 import type { ClawdbotConfig } from "../runtime-api.js";
 import { resolveFeishuRuntimeAccount } from "./accounts.js";
@@ -14,6 +18,24 @@ import { assertFeishuMessageApiSuccess, toFeishuSendResult } from "./send-result
 import { resolveFeishuSendTarget } from "./send-target.js";
 
 const FEISHU_MEDIA_HTTP_TIMEOUT_MS = 120_000;
+const FEISHU_VOICE_FILE_NAME = "voice.ogg"; // file name used for the transcoded voice output
+const FEISHU_VOICE_SAMPLE_RATE_HZ = 48_000; // handed to ffmpeg as the -ar value
+const FEISHU_VOICE_BITRATE = "64k"; // handed to ffmpeg as the -b:a value
+
+const FEISHU_TRANSCODABLE_AUDIO_EXTS = new Set([
+  ".aac",
+  ".aiff",
+  ".alac",
+  ".amr",
+  ".caf",
+  ".flac",
+  ".m4a",
+  ".mp3",
+  ".oga",
+  ".wav",
+  ".webm",
+  ".wma",
+]); // lowercase, dot-prefixed extensions that isLikelyTranscodableAudio accepts by name alone
 
 export type DownloadImageResult = {
   buffer: Buffer;
@@ -568,6 +590,89 @@ function resolveFeishuOutboundMediaKind(params: { fileName: string; contentType?
   };
 }
 
+/** Reports whether the extension or MIME type already identifies Ogg/Opus media. */
+function isFeishuNativeVoiceAudio(params: { fileName: string; contentType?: string }): boolean {
+  const extension = normalizeLowercaseStringOrEmpty(path.extname(params.fileName));
+  const mimeType = normalizeLowercaseStringOrEmpty(params.contentType);
+  return [".opus", ".ogg"].includes(extension) || ["audio/ogg", "audio/opus"].includes(mimeType);
+}
+
+/** Accepts a listed audio extension, or a MIME type whose media kind is "audio". */
+function isLikelyTranscodableAudio(params: { fileName: string; contentType?: string }): boolean {
+  const extension = normalizeLowercaseStringOrEmpty(path.extname(params.fileName));
+  const mimeType = normalizeLowercaseStringOrEmpty(params.contentType);
+  return FEISHU_TRANSCODABLE_AUDIO_EXTS.has(extension) || mediaKindFromMime(mimeType) === "audio";
+}
+
+/**
+ * Re-encodes audio bytes to mono 48kHz Ogg/Opus with ffmpeg for Feishu voice bubbles.
+ *
+ * @param params.buffer Source audio bytes, written to a temp directory that is removed afterwards.
+ * @param params.fileName Original name; only its extension is reused for the staged input.
+ * @param params.contentType Not read here.
+ * @returns The encoded bytes as `voice.ogg` with content type `audio/ogg`.
+ * @throws When staging the input, running ffmpeg, or reading the output fails.
+ */
+async function transcodeToFeishuVoiceOpus(params: {
+  buffer: Buffer;
+  fileName: string;
+  contentType?: string;
+}): Promise<{ buffer: Buffer; fileName: string; contentType: string }> {
+  const tempRoot = resolvePreferredOpenClawTmpDir();
+  await fs.promises.mkdir(tempRoot, { recursive: true, mode: 0o700 });
+  const tempDir = await fs.promises.mkdtemp(path.join(tempRoot, "feishu-voice-"));
+  try {
+    const ext = normalizeLowercaseStringOrEmpty(path.extname(params.fileName));
+    // Reuse a short extension for the staged input; anything else gets a neutral one.
+    const inputPath = path.join(tempDir, `input${ext && ext.length <= 12 ? ext : ".audio"}`);
+    const outputPath = path.join(tempDir, FEISHU_VOICE_FILE_NAME);
+    await fs.promises.writeFile(inputPath, params.buffer, { mode: 0o600 });
+    await runFfmpeg([
+      ...["-hide_banner", "-loglevel", "error", "-y"],
+      ...["-i", inputPath],
+      ...["-vn", "-sn", "-dn"], // audio only: no video, subtitle, or data streams
+      ...["-t", String(MEDIA_FFMPEG_MAX_AUDIO_DURATION_SECS)],
+      ...["-ar", String(FEISHU_VOICE_SAMPLE_RATE_HZ), "-ac", "1"],
+      ...["-c:a", "libopus", "-b:a", FEISHU_VOICE_BITRATE],
+      outputPath,
+    ]);
+    return {
+      buffer: await fs.promises.readFile(outputPath),
+      fileName: FEISHU_VOICE_FILE_NAME,
+      contentType: "audio/ogg",
+    };
+  } finally {
+    // Cleanup is best-effort: a rejected rm() here would otherwise discard a finished
+    // transcode or replace the real ffmpeg error, so log it instead of throwing.
+    await fs.promises.rm(tempDir, { recursive: true, force: true }).catch((err: unknown) => {
+      console.warn(`[feishu] failed to remove voice transcode temp dir ${tempDir}:`, err);
+    });
+  }
+}
+
+/**
+ * Transcodes voice-intent audio to Ogg/Opus; on failure, logs and returns the input unchanged.
+ */
+async function prepareFeishuVoiceMedia(params: {
+  buffer: Buffer;
+  fileName: string;
+  contentType?: string;
+  audioAsVoice?: boolean;
+}): Promise<{ buffer: Buffer; fileName: string; contentType?: string }> {
+  const wantsVoice = params.audioAsVoice === true;
+  // Native audio, non-voice sends, and non-audio media all pass through untouched.
+  if (isFeishuNativeVoiceAudio(params) || !wantsVoice || !isLikelyTranscodableAudio(params)) {
+    return params;
+  }
+  return transcodeToFeishuVoiceOpus(params).catch((err: unknown) => {
+    console.warn(
+      `[feishu] audioAsVoice transcode failed; sending ${params.fileName} as a file attachment:`,
+      err,
+    );
+    return params;
+  });
+}
+
 /**
  * Upload and send media (image or file) from URL, local path, or buffer.
  * When mediaUrl is a local path, mediaLocalRoots (from core outbound context)
@@ -584,6 +689,8 @@ export async function sendMediaFeishu(params: {
   accountId?: string;
   /** Allowed roots for local path reads; required for local filePath to work. */
   mediaLocalRoots?: readonly string[];
+  /** When true, transcode compatible audio to Feishu native Ogg/Opus voice bubbles. */
+  audioAsVoice?: boolean;
 }): Promise<SendMediaResult> {
   const {
     cfg,
@@ -595,6 +702,7 @@ export async function sendMediaFeishu(params: {
     replyInThread,
     accountId,
     mediaLocalRoots,
+    audioAsVoice,
   } = params;
   const account = resolveFeishuRuntimeAccount({ cfg, accountId });
   if (!account.configured) {
@@ -622,6 +730,16 @@ export async function sendMediaFeishu(params: {
     throw new Error("Either mediaUrl or mediaBuffer must be provided");
   }
 
+  const prepared = await prepareFeishuVoiceMedia({
+    buffer,
+    fileName: name,
+    contentType,
+    audioAsVoice,
+  });
+  buffer = prepared.buffer;
+  name = prepared.fileName;
+  contentType = prepared.contentType;
+
   const routing = resolveFeishuOutboundMediaKind({ fileName: name, contentType });
 
   if (routing.msgType === "image") {
diff --git a/extensions/feishu/src/outbound.test.ts b/extensions/feishu/src/outbound.test.ts
index 0c070dd6d38..db939731277 100644
--- a/extensions/feishu/src/outbound.test.ts
+++ b/extensions/feishu/src/outbound.test.ts
@@ -457,6 +457,24 @@ describe("feishuOutbound.sendMedia replyToId forwarding", () => {
     );
   });
 
+  it("forwards audioAsVoice to sendMediaFeishu", async () => {
+    await feishuOutbound.sendMedia?.({
+      cfg: emptyConfig,
+      to: "chat_1",
+      text: "",
+      mediaUrl: "https://example.com/reply.mp3",
+      audioAsVoice: true,
+      accountId: "main",
+    });
+
+    expect(sendMediaFeishuMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        mediaUrl: "https://example.com/reply.mp3",
+        audioAsVoice: true,
+      }),
+    );
+  });
+
   it("forwards replyToId to text caption send", async () => {
     await feishuOutbound.sendMedia?.({
       cfg: emptyConfig,
diff --git a/extensions/feishu/src/outbound.ts b/extensions/feishu/src/outbound.ts
index d451786730b..43493490ccd 100644
--- a/extensions/feishu/src/outbound.ts
+++ b/extensions/feishu/src/outbound.ts
@@ -232,6 +232,7 @@ export const feishuOutbound: ChannelOutboundAdapter = {
       to,
       text,
       mediaUrl,
+      audioAsVoice,
       accountId,
       mediaLocalRoots,
       replyToId,
@@ -271,6 +272,7 @@ export const feishuOutbound: ChannelOutboundAdapter = {
             accountId: accountId ?? undefined,
             mediaLocalRoots,
             replyToMessageId,
+            ...(audioAsVoice === true ? { audioAsVoice: true } : {}),
           });
         } catch (err) {
           // Log the error for debugging
diff --git a/extensions/feishu/src/reply-dispatcher.test.ts b/extensions/feishu/src/reply-dispatcher.test.ts
index 16411a9ed51..9dbc60b3f6a 100644
--- a/extensions/feishu/src/reply-dispatcher.test.ts
+++ b/extensions/feishu/src/reply-dispatcher.test.ts
@@ -469,6 +469,21 @@ describe("createFeishuReplyDispatcher streaming behavior", () => {
     expect(sendMarkdownCardFeishuMock).not.toHaveBeenCalled();
   });
 
+  it("passes audioAsVoice to media attachments", async () => {
+    const { options } = createDispatcherHarness();
+    await options.deliver(
+      { mediaUrl: "https://example.com/reply.mp3", audioAsVoice: true },
+      { kind: "final" },
+    );
+
+    expect(sendMediaFeishuMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        mediaUrl: "https://example.com/reply.mp3",
+        audioAsVoice: true,
+      }),
+    );
+  });
+
   it("falls back to legacy mediaUrl when mediaUrls is an empty array", async () => {
     const { options } = createDispatcherHarness();
     await options.deliver(
diff --git a/extensions/feishu/src/reply-dispatcher.ts b/extensions/feishu/src/reply-dispatcher.ts
index 460c0413137..dd4857a9cc2 100644
--- a/extensions/feishu/src/reply-dispatcher.ts
+++ b/extensions/feishu/src/reply-dispatcher.ts
@@ -396,6 +396,7 @@ export function createFeishuReplyDispatcher(params: CreateFeishuReplyDispatcherP
           replyToMessageId: sendReplyToMessageId,
           replyInThread: effectiveReplyInThread,
           accountId,
+          ...(payload.audioAsVoice === true ? { audioAsVoice: true } : {}),
         });
       },
     });
diff --git a/extensions/speech-core/src/tts.test.ts b/extensions/speech-core/src/tts.test.ts
index d6fb3b164a8..b13aff709c2 100644
--- a/extensions/speech-core/src/tts.test.ts
+++ b/extensions/speech-core/src/tts.test.ts
@@ -118,6 +118,36 @@ describe("speech-core native voice-note routing", () => {
     });
   });
 
+  it("marks Feishu voice-note TTS for channel-side transcoding when provider returns mp3", async () => {
+    synthesizeMock.mockResolvedValueOnce({
+      audioBuffer: Buffer.from("mp3"),
+      outputFormat: "mp3",
+      fileExtension: ".mp3",
+      voiceCompatible: false,
+    });
+    const cfg = createTtsConfig("openclaw-speech-core-tts-feishu-mp3-test");
+    let mediaDir: string | undefined;
+    try {
+      const result = await maybeApplyTtsToPayload({
+        payload: { text: "This Feishu reply should be transcoded by the channel." },
+        cfg,
+        channel: "feishu",
+        kind: "final",
+      });
+      // Record the output dir before asserting so a failed expectation still cleans it up.
+      mediaDir = result.mediaUrl ? path.dirname(result.mediaUrl) : undefined;
+      expect(synthesizeMock).toHaveBeenCalledWith(
+        expect.objectContaining({ target: "voice-note" }),
+      );
+      expect(result.audioAsVoice).toBe(true);
+      expect(result.mediaUrl).toMatch(/voice-\d+\.mp3$/);
+    } finally {
+      if (mediaDir) {
+        rmSync(mediaDir, { recursive: true, force: true });
+      }
+    }
+  });
+
   it("keeps non-native voice-note channels as regular audio files", async () => {
     await expectTtsPayloadResult({
       channel: "slack",
diff --git a/extensions/speech-core/src/tts.ts b/extensions/speech-core/src/tts.ts
index 4eccca12e19..47f78df91dc 100644
--- a/extensions/speech-core/src/tts.ts
+++ b/extensions/speech-core/src/tts.ts
@@ -100,6 +100,8 @@ export type TtsResult = {
   attempts?: TtsProviderAttempt[];
   outputFormat?: string;
   voiceCompatible?: boolean;
+  audioAsVoice?: boolean;
+  target?: "audio-file" | "voice-note";
 };
 
 export type TtsSynthesisResult = {
@@ -114,6 +116,7 @@ export type TtsSynthesisResult = {
   outputFormat?: string;
   voiceCompatible?: boolean;
   fileExtension?: string;
+  target?: "audio-file" | "voice-note";
 };
 
 export type TtsTelephonyResult = {
@@ -586,6 +589,7 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void {
 }
 
 const OPUS_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix", "discord"]);
+const TRANSCODED_VOICE_NOTE_CHANNELS = new Set(["feishu"]);
 
 function resolveChannelId(channel: string | undefined): ChannelId | null {
   return channel ? normalizeChannelId(channel) : null;
@@ -596,6 +600,22 @@ function supportsNativeVoiceNoteTts(channel: string | undefined): boolean {
   return channelId !== null && OPUS_CHANNELS.has(channelId);
 }
 
+function supportsTranscodedVoiceNoteTts(channel: string | undefined): boolean {
+  const id = resolveChannelId(channel); // null means there is no channel id to look up
+  return id === null ? false : TRANSCODED_VOICE_NOTE_CHANNELS.has(id);
+}
+
+/** Decides whether synthesized speech is flagged for voice-note delivery on the channel. */
+function shouldDeliverTtsAsVoice(params: {
+  channel: string | undefined;
+  target: "audio-file" | "voice-note" | undefined;
+  voiceCompatible: boolean | undefined;
+}): boolean {
+  const { channel, target, voiceCompatible } = params;
+  const eligible = supportsNativeVoiceNoteTts(channel) && target === "voice-note";
+  return eligible && (voiceCompatible === true || supportsTranscodedVoiceNoteTts(channel));
+}
+
 export function resolveTtsProviderOrder(primary: TtsProvider, cfg?: OpenClawConfig): TtsProvider[] {
   const normalizedPrimary = canonicalizeSpeechProviderId(primary, cfg) ?? primary;
   const ordered = new Set<TtsProvider>([normalizedPrimary]);
@@ -782,6 +802,12 @@ export async function textToSpeech(params: {
     attempts: synthesis.attempts,
     outputFormat: synthesis.outputFormat,
     voiceCompatible: synthesis.voiceCompatible,
+    audioAsVoice: shouldDeliverTtsAsVoice({
+      channel: params.channel,
+      target: synthesis.target,
+      voiceCompatible: synthesis.voiceCompatible,
+    }),
+    target: synthesis.target,
   };
 }
 
@@ -863,6 +889,7 @@ export async function synthesizeSpeech(params: {
         outputFormat: synthesis.outputFormat,
         voiceCompatible: synthesis.voiceCompatible,
         fileExtension: synthesis.fileExtension,
+        target,
       };
     } catch (err) {
       const errorMsg = formatTtsProviderError(provider, err);
@@ -1171,12 +1198,10 @@ export async function maybeApplyTtsToPayload(params: {
       latencyMs: result.latencyMs,
     };
 
-    const shouldVoice =
-      supportsNativeVoiceNoteTts(params.channel) && result.voiceCompatible === true;
     return {
       ...nextPayload,
       mediaUrl: result.audioPath,
-      audioAsVoice: shouldVoice || params.payload.audioAsVoice,
+      audioAsVoice: result.audioAsVoice || params.payload.audioAsVoice,
     };
   }
 
@@ -1199,6 +1224,8 @@ export const _test = {
   parseTtsDirectives,
   resolveModelOverridePolicy,
   supportsNativeVoiceNoteTts,
+  supportsTranscodedVoiceNoteTts,
+  shouldDeliverTtsAsVoice,
   summarizeText,
   getResolvedSpeechProviderConfig,
   formatTtsProviderError,
diff --git a/src/agents/tools/tts-tool.test.ts b/src/agents/tools/tts-tool.test.ts
index e833d5eb6df..412e42b8667 100644
--- a/src/agents/tools/tts-tool.test.ts
+++ b/src/agents/tools/tts-tool.test.ts
@@ -43,6 +43,28 @@ describe("createTtsTool", () => {
     expect(JSON.stringify(result.content)).not.toContain("MEDIA:");
   });
 
+  it("uses audioAsVoice from the TTS runtime even when the provider output is not native", async () => {
+    textToSpeechSpy.mockResolvedValue({
+      success: true,
+      audioPath: "/tmp/reply.mp3",
+      provider: "test",
+      voiceCompatible: false,
+      audioAsVoice: true,
+    });
+
+    const tool = createTtsTool();
+    const result = await tool.execute("call-1", { text: "hello", channel: "feishu" });
+
+    expect(result).toMatchObject({
+      details: {
+        media: {
+          mediaUrl: "/tmp/reply.mp3",
+          audioAsVoice: true,
+        },
+      },
+    });
+  });
+
   it("passes an optional timeout to speech generation", async () => {
     textToSpeechSpy.mockResolvedValue({
       success: true,
diff --git a/src/agents/tools/tts-tool.ts b/src/agents/tools/tts-tool.ts
index a3db27121cb..8861cf3ad0b 100644
--- a/src/agents/tools/tts-tool.ts
+++ b/src/agents/tools/tts-tool.ts
@@ -92,7 +92,7 @@ export function createTtsTool(opts?: {
             media: {
               mediaUrl: result.audioPath,
               trustedLocalMedia: true,
-              ...(result.voiceCompatible ? { audioAsVoice: true } : {}),
+              ...(result.audioAsVoice || result.voiceCompatible ? { audioAsVoice: true } : {}),
             },
           },
         };
diff --git a/src/plugin-sdk/tts-runtime.types.ts b/src/plugin-sdk/tts-runtime.types.ts
index 0e43dba237a..09a8cac2ea1 100644
--- a/src/plugin-sdk/tts-runtime.types.ts
+++ b/src/plugin-sdk/tts-runtime.types.ts
@@ -41,6 +41,8 @@ export type TtsStatusEntry = {
   error?: string;
 };
 
+export type TtsSpeechTarget = "audio-file" | "voice-note";
+
 export type SummarizeResult = {
   summary: string;
   latencyMs: number;
@@ -99,6 +101,12 @@ export type TtsTestFacade = {
   parseTtsDirectives: (...args: unknown[]) => TtsDirectiveParseResult;
   resolveModelOverridePolicy: (...args: unknown[]) => ResolvedTtsModelOverrides;
   supportsNativeVoiceNoteTts: (channel: string | undefined) => boolean;
+  supportsTranscodedVoiceNoteTts: (channel: string | undefined) => boolean;
+  shouldDeliverTtsAsVoice: (params: {
+    channel: string | undefined;
+    target: TtsSpeechTarget | undefined;
+    voiceCompatible: boolean | undefined;
+  }) => boolean;
   summarizeText: (...args: unknown[]) => Promise<SummarizeResult>;
   getResolvedSpeechProviderConfig: (
     config: ResolvedTtsConfig,
@@ -120,6 +128,8 @@ export type TtsResult = {
   attempts?: TtsProviderAttempt[];
   outputFormat?: string;
   voiceCompatible?: boolean;
+  audioAsVoice?: boolean;
+  target?: TtsSpeechTarget;
 };
 
 export type TtsSynthesisResult = {
@@ -134,6 +144,7 @@ export type TtsSynthesisResult = {
   outputFormat?: string;
   voiceCompatible?: boolean;
   fileExtension?: string;
+  target?: TtsSpeechTarget;
 };
 
 export type TtsTelephonyResult = {