mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 11:20:43 +00:00
fix(feishu): transcode voice TTS audio
This commit is contained in:
@@ -21,6 +21,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Control UI: make `/usage` use the fresh context snapshot for context percentage, and include cache-write tokens in the Usage overview cache-hit denominator. Fixes #47885. Thanks @imwyvern and @Ante042.
|
||||
- GitHub Copilot: preserve encrypted Responses reasoning item IDs during replay so Copilot can validate encrypted reasoning payloads across requests. (#71448) Thanks @a410979729-sys.
|
||||
- Agents/replies: recover final-answer text when streamed assistant chunks contain only whitespace, preventing completed turns from surfacing as empty-payload errors. Fixes #71454. (#71467) Thanks @Sanjays2402.
|
||||
- Feishu/TTS: transcode voice-intent MP3 and other audio replies to Ogg/Opus before sending native Feishu audio bubbles, while keeping ordinary MP3 attachments as files. Fixes #61249 and #37868.
|
||||
- Telegram/webhook: acknowledge validated webhook updates before running bot middleware, keeping slow agent turns from tripping Telegram delivery retries while preserving per-chat processing lanes. Fixes #71392. Thanks @joelforsberg46-source.
|
||||
- MCP: retire one-shot embedded bundled MCP runtimes at run end, skip bundle-MCP startup when a runtime tool allowlist cannot reach bundle-MCP tools, and add `mcp.sessionIdleTtlMs` idle eviction for leaked session runtimes. Fixes #71106, #71110, #70389, and #70808.
|
||||
- MCP/config reload: hot-apply `mcp.*` changes by disposing cached session MCP runtimes, and dispose bundled MCP runtimes during gateway shutdown so removed `mcp.servers` entries reap child processes promptly. Fixes #60656.
|
||||
|
||||
@@ -424,6 +424,14 @@ Full configuration: [Gateway configuration](/gateway/configuration)
|
||||
- ✅ Interactive cards (including streaming updates)
|
||||
- ⚠️ Rich text (post-style formatting; doesn't support full Feishu/Lark authoring capabilities)
|
||||
|
||||
Native Feishu/Lark audio bubbles use the Feishu `audio` message type and require
|
||||
Ogg/Opus upload media (`file_type: "opus"`). Existing `.opus` and `.ogg` media
|
||||
is sent directly as native audio. MP3/WAV/M4A and other likely audio formats are
|
||||
transcoded to 48kHz Ogg/Opus with `ffmpeg` only when the reply requests voice
|
||||
delivery (`audioAsVoice` / message tool `asVoice`, including TTS voice-note
|
||||
replies). Ordinary MP3 attachments stay regular files. If `ffmpeg` is missing or
|
||||
conversion fails, OpenClaw falls back to a file attachment and logs the reason.
|
||||
|
||||
### Threads and replies
|
||||
|
||||
- ✅ Inline replies
|
||||
|
||||
@@ -489,8 +489,12 @@ These override `messages.tts.*` for that host.
|
||||
|
||||
## Output formats (fixed)
|
||||
|
||||
- **Feishu / Matrix / Telegram / WhatsApp**: Opus voice message (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
|
||||
- **Feishu / Matrix / Telegram / WhatsApp**: voice-note replies prefer Opus (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
|
||||
- 48kHz / 64kbps is a good voice message tradeoff.
|
||||
- **Feishu**: when a voice-note reply is produced as MP3/WAV/M4A or another
|
||||
likely audio file, the Feishu plugin transcodes it to 48kHz Ogg/Opus with
|
||||
`ffmpeg` before sending the native `audio` bubble. If conversion fails, Feishu
|
||||
receives the original file as an attachment.
|
||||
- **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI).
|
||||
- 44.1kHz / 128kbps is the default balance for speech clarity.
|
||||
- **MiniMax**: MP3 (`speech-2.8-hd` model, 32kHz sample rate) for normal audio attachments. For voice-note targets such as Feishu and Telegram, OpenClaw transcodes the MiniMax MP3 to 48kHz Opus with `ffmpeg` before delivery.
|
||||
@@ -572,6 +576,8 @@ Notes:
|
||||
The `tts` tool converts text to speech and returns an audio attachment for
|
||||
reply delivery. When the channel is Feishu, Matrix, Telegram, or WhatsApp,
|
||||
the audio is delivered as a voice message rather than a file attachment.
|
||||
Feishu can transcode non-Opus TTS output on this path when `ffmpeg` is
|
||||
available.
|
||||
It accepts optional `channel` and `timeoutMs` fields; `timeoutMs` is a
|
||||
per-call provider request timeout in milliseconds.
|
||||
|
||||
|
||||
@@ -461,6 +461,34 @@ describe("feishuPlugin actions", () => {
|
||||
expect(result?.details).toMatchObject({ messageId: "om_media" });
|
||||
});
|
||||
|
||||
it("passes asVoice through media sends", async () => {
|
||||
feishuOutboundSendMediaMock.mockResolvedValueOnce({
|
||||
channel: "feishu",
|
||||
messageId: "om_voice",
|
||||
details: { messageId: "om_voice", chatId: "oc_group_1" },
|
||||
});
|
||||
|
||||
await feishuPlugin.actions?.handleAction?.({
|
||||
action: "send",
|
||||
params: {
|
||||
to: "chat:oc_group_1",
|
||||
media: "https://example.com/reply.mp3",
|
||||
asVoice: true,
|
||||
},
|
||||
cfg,
|
||||
accountId: undefined,
|
||||
toolContext: {},
|
||||
mediaLocalRoots: [],
|
||||
} as never);
|
||||
|
||||
expect(feishuOutboundSendMediaMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
mediaUrl: "https://example.com/reply.mp3",
|
||||
audioAsVoice: true,
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("reads messages", async () => {
|
||||
getMessageFeishuMock.mockResolvedValueOnce({
|
||||
messageId: "om_1",
|
||||
|
||||
@@ -81,6 +81,16 @@ function readFeishuMediaParam(params: Record<string, unknown>): string | undefin
|
||||
return media.trim() ? media : undefined;
|
||||
}
|
||||
|
||||
function readBooleanParam(params: Record<string, unknown>, keys: string[]): boolean | undefined {
|
||||
for (const key of keys) {
|
||||
const value = params[key];
|
||||
if (typeof value === "boolean") {
|
||||
return value;
|
||||
}
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function hasLegacyFeishuCardCommandValue(actionValue: unknown): boolean {
|
||||
return (
|
||||
isRecord(actionValue) &&
|
||||
@@ -695,6 +705,7 @@ export const feishuPlugin: ChannelPlugin<ResolvedFeishuAccount, FeishuProbeResul
|
||||
const presentation = normalizeMessagePresentation(ctx.params.presentation);
|
||||
const text = readFirstString(ctx.params, ["text", "message"]);
|
||||
const mediaUrl = readFeishuMediaParam(ctx.params);
|
||||
const audioAsVoice = readBooleanParam(ctx.params, ["asVoice", "audioAsVoice"]);
|
||||
const card = presentation
|
||||
? buildFeishuPresentationCard({ presentation, fallbackText: text })
|
||||
: undefined;
|
||||
@@ -734,6 +745,7 @@ export const feishuPlugin: ChannelPlugin<ResolvedFeishuAccount, FeishuProbeResul
|
||||
accountId: ctx.accountId ?? undefined,
|
||||
mediaLocalRoots: ctx.mediaLocalRoots,
|
||||
replyToId: replyToMessageId,
|
||||
...(audioAsVoice === true ? { audioAsVoice: true } : {}),
|
||||
});
|
||||
} else {
|
||||
result = await runtime.sendMessageFeishu({
|
||||
|
||||
@@ -9,6 +9,7 @@ const resolveFeishuAccountMock = vi.hoisted(() => vi.fn());
|
||||
// vi.hoisted guarantees these mock functions exist before the vi.mock
// factories below are evaluated (vi.mock calls are hoisted by vitest).
const normalizeFeishuTargetMock = vi.hoisted(() => vi.fn());
const resolveReceiveIdTypeMock = vi.hoisted(() => vi.fn());
const loadWebMediaMock = vi.hoisted(() => vi.fn());
const runFfmpegMock = vi.hoisted(() => vi.fn());

// Lark SDK client surface mocks.
const fileCreateMock = vi.hoisted(() => vi.fn());
const imageCreateMock = vi.hoisted(() => vi.fn());
|
||||
@@ -42,6 +43,14 @@ vi.mock("./runtime.js", () => ({
|
||||
}),
|
||||
}));
|
||||
|
||||
vi.mock("openclaw/plugin-sdk/media-runtime", async (importOriginal) => {
|
||||
const actual = await importOriginal<typeof import("openclaw/plugin-sdk/media-runtime")>();
|
||||
return {
|
||||
...actual,
|
||||
runFfmpeg: runFfmpegMock,
|
||||
};
|
||||
});
|
||||
|
||||
vi.mock("../../../src/channels/plugins/bundled.js", () => ({
|
||||
bundledChannelPlugins: [],
|
||||
bundledChannelSetupPlugins: [],
|
||||
@@ -145,6 +154,10 @@ describe("sendMediaFeishu msg_type routing", () => {
|
||||
|
||||
imageGetMock.mockResolvedValue(Buffer.from("image-bytes"));
|
||||
messageResourceGetMock.mockResolvedValue(Buffer.from("resource-bytes"));
|
||||
runFfmpegMock.mockImplementation(async (args: string[]) => {
|
||||
await fs.writeFile(args.at(-1) ?? "", Buffer.from("opus-output"));
|
||||
return "";
|
||||
});
|
||||
});
|
||||
|
||||
it("uses msg_type=media for mp4 video", async () => {
|
||||
@@ -260,6 +273,104 @@ describe("sendMediaFeishu msg_type routing", () => {
|
||||
data: expect.objectContaining({ msg_type: "file" }),
|
||||
}),
|
||||
);
|
||||
expect(runFfmpegMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("transcodes voice-intent mp3 to msg_type=audio", async () => {
|
||||
loadWebMediaMock.mockResolvedValueOnce({
|
||||
buffer: Buffer.from("remote-mp3"),
|
||||
fileName: "reply.mp3",
|
||||
kind: "audio",
|
||||
contentType: "audio/mpeg",
|
||||
});
|
||||
|
||||
await sendMediaFeishu({
|
||||
cfg: emptyConfig,
|
||||
to: "user:ou_target",
|
||||
mediaUrl: "https://example.com/reply.mp3",
|
||||
audioAsVoice: true,
|
||||
});
|
||||
|
||||
expect(runFfmpegMock).toHaveBeenCalledWith(
|
||||
expect.arrayContaining(["-c:a", "libopus", "-ar", "48000", "-b:a", "64k"]),
|
||||
);
|
||||
expect(fileCreateMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
data: expect.objectContaining({
|
||||
file_type: "opus",
|
||||
file_name: "voice.ogg",
|
||||
file: Buffer.from("opus-output"),
|
||||
}),
|
||||
}),
|
||||
);
|
||||
expect(messageCreateMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
data: expect.objectContaining({ msg_type: "audio" }),
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("leaves native voice audio unchanged when audioAsVoice is true", async () => {
|
||||
await sendMediaFeishu({
|
||||
cfg: emptyConfig,
|
||||
to: "user:ou_target",
|
||||
mediaBuffer: Buffer.from("opus"),
|
||||
fileName: "reply.ogg",
|
||||
audioAsVoice: true,
|
||||
});
|
||||
|
||||
expect(runFfmpegMock).not.toHaveBeenCalled();
|
||||
expect(fileCreateMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
data: expect.objectContaining({
|
||||
file_type: "opus",
|
||||
file_name: "reply.ogg",
|
||||
}),
|
||||
}),
|
||||
);
|
||||
expect(messageCreateMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
data: expect.objectContaining({ msg_type: "audio" }),
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("falls back to file when voice-intent audio cannot be transcoded", async () => {
|
||||
const warnSpy = vi.spyOn(console, "warn").mockImplementation(() => undefined);
|
||||
runFfmpegMock.mockRejectedValueOnce(new Error("ffmpeg missing"));
|
||||
loadWebMediaMock.mockResolvedValueOnce({
|
||||
buffer: Buffer.from("remote-mp3"),
|
||||
fileName: "reply.mp3",
|
||||
kind: "audio",
|
||||
contentType: "audio/mpeg",
|
||||
});
|
||||
|
||||
await sendMediaFeishu({
|
||||
cfg: emptyConfig,
|
||||
to: "user:ou_target",
|
||||
mediaUrl: "https://example.com/reply.mp3",
|
||||
audioAsVoice: true,
|
||||
});
|
||||
|
||||
expect(fileCreateMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
data: expect.objectContaining({
|
||||
file_type: "stream",
|
||||
file_name: "reply.mp3",
|
||||
file: Buffer.from("remote-mp3"),
|
||||
}),
|
||||
}),
|
||||
);
|
||||
expect(messageCreateMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
data: expect.objectContaining({ msg_type: "file" }),
|
||||
}),
|
||||
);
|
||||
expect(warnSpy).toHaveBeenCalledWith(
|
||||
expect.stringContaining("audioAsVoice transcode failed"),
|
||||
expect.any(Error),
|
||||
);
|
||||
warnSpy.mockRestore();
|
||||
});
|
||||
|
||||
it("configures the media client timeout for image uploads", async () => {
|
||||
|
||||
@@ -3,7 +3,11 @@ import path from "node:path";
|
||||
import { Readable } from "node:stream";
|
||||
import type * as Lark from "@larksuiteoapi/node-sdk";
|
||||
import { mediaKindFromMime } from "openclaw/plugin-sdk/media-mime";
|
||||
import { withTempDownloadPath } from "openclaw/plugin-sdk/temp-path";
|
||||
import { MEDIA_FFMPEG_MAX_AUDIO_DURATION_SECS, runFfmpeg } from "openclaw/plugin-sdk/media-runtime";
|
||||
import {
|
||||
resolvePreferredOpenClawTmpDir,
|
||||
withTempDownloadPath,
|
||||
} from "openclaw/plugin-sdk/temp-path";
|
||||
import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtime";
|
||||
import type { ClawdbotConfig } from "../runtime-api.js";
|
||||
import { resolveFeishuRuntimeAccount } from "./accounts.js";
|
||||
@@ -14,6 +18,24 @@ import { assertFeishuMessageApiSuccess, toFeishuSendResult } from "./send-result
|
||||
import { resolveFeishuSendTarget } from "./send-target.js";
|
||||
|
||||
// HTTP timeout for Feishu media upload/download calls (2 minutes).
const FEISHU_MEDIA_HTTP_TIMEOUT_MS = 120_000;
// Canonical output file name for transcoded voice bubbles; Feishu native
// audio requires Ogg/Opus uploads (file_type: "opus").
const FEISHU_VOICE_FILE_NAME = "voice.ogg";
// 48 kHz / 64 kbps is the voice-message tradeoff documented for this plugin.
const FEISHU_VOICE_SAMPLE_RATE_HZ = 48_000;
const FEISHU_VOICE_BITRATE = "64k";

// File extensions treated as "likely audio" worth attempting an ffmpeg
// transcode to Opus. Note .opus/.ogg are deliberately absent: those are
// already native and are sent without conversion.
const FEISHU_TRANSCODABLE_AUDIO_EXTS = new Set([
  ".aac",
  ".aiff",
  ".alac",
  ".amr",
  ".caf",
  ".flac",
  ".m4a",
  ".mp3",
  ".oga",
  ".wav",
  ".webm",
  ".wma",
]);
|
||||
|
||||
export type DownloadImageResult = {
|
||||
buffer: Buffer;
|
||||
@@ -568,6 +590,89 @@ function resolveFeishuOutboundMediaKind(params: { fileName: string; contentType?
|
||||
};
|
||||
}
|
||||
|
||||
function isFeishuNativeVoiceAudio(params: { fileName: string; contentType?: string }): boolean {
|
||||
const ext = normalizeLowercaseStringOrEmpty(path.extname(params.fileName));
|
||||
const contentType = normalizeLowercaseStringOrEmpty(params.contentType);
|
||||
return (
|
||||
ext === ".opus" || ext === ".ogg" || contentType === "audio/ogg" || contentType === "audio/opus"
|
||||
);
|
||||
}
|
||||
|
||||
function isLikelyTranscodableAudio(params: { fileName: string; contentType?: string }): boolean {
|
||||
const ext = normalizeLowercaseStringOrEmpty(path.extname(params.fileName));
|
||||
const contentType = normalizeLowercaseStringOrEmpty(params.contentType);
|
||||
return FEISHU_TRANSCODABLE_AUDIO_EXTS.has(ext) || mediaKindFromMime(contentType) === "audio";
|
||||
}
|
||||
|
||||
/**
 * Transcode an arbitrary audio buffer to 48 kHz mono Ogg/Opus suitable for a
 * native Feishu voice bubble.
 *
 * Writes the input into a freshly created private temp directory, shells out
 * to ffmpeg, reads the converted file back, and always removes the temp
 * directory afterwards. Throws when ffmpeg is unavailable or conversion
 * fails; the caller degrades to a plain file attachment in that case.
 */
async function transcodeToFeishuVoiceOpus(params: {
  buffer: Buffer;
  fileName: string;
  contentType?: string;
}): Promise<{ buffer: Buffer; fileName: string; contentType: string }> {
  const tempRoot = resolvePreferredOpenClawTmpDir();
  // 0o700 keeps the scratch root private to the current user.
  await fs.promises.mkdir(tempRoot, { recursive: true, mode: 0o700 });
  const tempDir = await fs.promises.mkdtemp(path.join(tempRoot, "feishu-voice-"));
  try {
    const ext = normalizeLowercaseStringOrEmpty(path.extname(params.fileName));
    // Keep the original extension as a container hint for ffmpeg, but guard
    // against oddly long extensions with a neutral fallback.
    const inputExt = ext && ext.length <= 12 ? ext : ".audio";
    const inputPath = path.join(tempDir, `input${inputExt}`);
    const outputPath = path.join(tempDir, FEISHU_VOICE_FILE_NAME);
    await fs.promises.writeFile(inputPath, params.buffer, { mode: 0o600 });
    await runFfmpeg([
      "-hide_banner",
      "-loglevel",
      "error",
      "-y",
      "-i",
      inputPath,
      // Drop video/subtitle/data streams: voice bubbles are audio-only.
      "-vn",
      "-sn",
      "-dn",
      // Bound the output length to the configured maximum audio duration.
      "-t",
      String(MEDIA_FFMPEG_MAX_AUDIO_DURATION_SECS),
      "-ar",
      String(FEISHU_VOICE_SAMPLE_RATE_HZ),
      // Downmix to mono before encoding with libopus at the voice bitrate.
      "-ac",
      "1",
      "-c:a",
      "libopus",
      "-b:a",
      FEISHU_VOICE_BITRATE,
      outputPath,
    ]);
    return {
      buffer: await fs.promises.readFile(outputPath),
      fileName: FEISHU_VOICE_FILE_NAME,
      contentType: "audio/ogg",
    };
  } finally {
    // Best-effort cleanup of the scratch directory on success or failure.
    await fs.promises.rm(tempDir, { recursive: true, force: true });
  }
}
|
||||
|
||||
async function prepareFeishuVoiceMedia(params: {
|
||||
buffer: Buffer;
|
||||
fileName: string;
|
||||
contentType?: string;
|
||||
audioAsVoice?: boolean;
|
||||
}): Promise<{ buffer: Buffer; fileName: string; contentType?: string }> {
|
||||
if (isFeishuNativeVoiceAudio(params)) {
|
||||
return params;
|
||||
}
|
||||
if (params.audioAsVoice !== true || !isLikelyTranscodableAudio(params)) {
|
||||
return params;
|
||||
}
|
||||
try {
|
||||
return await transcodeToFeishuVoiceOpus(params);
|
||||
} catch (err) {
|
||||
console.warn(
|
||||
`[feishu] audioAsVoice transcode failed; sending ${params.fileName} as a file attachment:`,
|
||||
err,
|
||||
);
|
||||
return params;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Upload and send media (image or file) from URL, local path, or buffer.
|
||||
* When mediaUrl is a local path, mediaLocalRoots (from core outbound context)
|
||||
@@ -584,6 +689,8 @@ export async function sendMediaFeishu(params: {
|
||||
accountId?: string;
|
||||
/** Allowed roots for local path reads; required for local filePath to work. */
|
||||
mediaLocalRoots?: readonly string[];
|
||||
/** When true, transcode compatible audio to Feishu native Ogg/Opus voice bubbles. */
|
||||
audioAsVoice?: boolean;
|
||||
}): Promise<SendMediaResult> {
|
||||
const {
|
||||
cfg,
|
||||
@@ -595,6 +702,7 @@ export async function sendMediaFeishu(params: {
|
||||
replyInThread,
|
||||
accountId,
|
||||
mediaLocalRoots,
|
||||
audioAsVoice,
|
||||
} = params;
|
||||
const account = resolveFeishuRuntimeAccount({ cfg, accountId });
|
||||
if (!account.configured) {
|
||||
@@ -622,6 +730,16 @@ export async function sendMediaFeishu(params: {
|
||||
throw new Error("Either mediaUrl or mediaBuffer must be provided");
|
||||
}
|
||||
|
||||
const prepared = await prepareFeishuVoiceMedia({
|
||||
buffer,
|
||||
fileName: name,
|
||||
contentType,
|
||||
audioAsVoice,
|
||||
});
|
||||
buffer = prepared.buffer;
|
||||
name = prepared.fileName;
|
||||
contentType = prepared.contentType;
|
||||
|
||||
const routing = resolveFeishuOutboundMediaKind({ fileName: name, contentType });
|
||||
|
||||
if (routing.msgType === "image") {
|
||||
|
||||
@@ -457,6 +457,24 @@ describe("feishuOutbound.sendMedia replyToId forwarding", () => {
|
||||
);
|
||||
});
|
||||
|
||||
it("forwards audioAsVoice to sendMediaFeishu", async () => {
|
||||
await feishuOutbound.sendMedia?.({
|
||||
cfg: emptyConfig,
|
||||
to: "chat_1",
|
||||
text: "",
|
||||
mediaUrl: "https://example.com/reply.mp3",
|
||||
audioAsVoice: true,
|
||||
accountId: "main",
|
||||
});
|
||||
|
||||
expect(sendMediaFeishuMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
mediaUrl: "https://example.com/reply.mp3",
|
||||
audioAsVoice: true,
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("forwards replyToId to text caption send", async () => {
|
||||
await feishuOutbound.sendMedia?.({
|
||||
cfg: emptyConfig,
|
||||
|
||||
@@ -232,6 +232,7 @@ export const feishuOutbound: ChannelOutboundAdapter = {
|
||||
to,
|
||||
text,
|
||||
mediaUrl,
|
||||
audioAsVoice,
|
||||
accountId,
|
||||
mediaLocalRoots,
|
||||
replyToId,
|
||||
@@ -271,6 +272,7 @@ export const feishuOutbound: ChannelOutboundAdapter = {
|
||||
accountId: accountId ?? undefined,
|
||||
mediaLocalRoots,
|
||||
replyToMessageId,
|
||||
...(audioAsVoice === true ? { audioAsVoice: true } : {}),
|
||||
});
|
||||
} catch (err) {
|
||||
// Log the error for debugging
|
||||
|
||||
@@ -469,6 +469,21 @@ describe("createFeishuReplyDispatcher streaming behavior", () => {
|
||||
expect(sendMarkdownCardFeishuMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("passes audioAsVoice to media attachments", async () => {
|
||||
const { options } = createDispatcherHarness();
|
||||
await options.deliver(
|
||||
{ mediaUrl: "https://example.com/reply.mp3", audioAsVoice: true },
|
||||
{ kind: "final" },
|
||||
);
|
||||
|
||||
expect(sendMediaFeishuMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
mediaUrl: "https://example.com/reply.mp3",
|
||||
audioAsVoice: true,
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("falls back to legacy mediaUrl when mediaUrls is an empty array", async () => {
|
||||
const { options } = createDispatcherHarness();
|
||||
await options.deliver(
|
||||
|
||||
@@ -396,6 +396,7 @@ export function createFeishuReplyDispatcher(params: CreateFeishuReplyDispatcherP
|
||||
replyToMessageId: sendReplyToMessageId,
|
||||
replyInThread: effectiveReplyInThread,
|
||||
accountId,
|
||||
...(payload.audioAsVoice === true ? { audioAsVoice: true } : {}),
|
||||
});
|
||||
},
|
||||
});
|
||||
|
||||
@@ -118,6 +118,36 @@ describe("speech-core native voice-note routing", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("marks Feishu voice-note TTS for channel-side transcoding when provider returns mp3", async () => {
|
||||
synthesizeMock.mockResolvedValueOnce({
|
||||
audioBuffer: Buffer.from("mp3"),
|
||||
outputFormat: "mp3",
|
||||
fileExtension: ".mp3",
|
||||
voiceCompatible: false,
|
||||
});
|
||||
const cfg = createTtsConfig("openclaw-speech-core-tts-feishu-mp3-test");
|
||||
let mediaDir: string | undefined;
|
||||
try {
|
||||
const result = await maybeApplyTtsToPayload({
|
||||
payload: { text: "This Feishu reply should be transcoded by the channel." },
|
||||
cfg,
|
||||
channel: "feishu",
|
||||
kind: "final",
|
||||
});
|
||||
|
||||
expect(synthesizeMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({ target: "voice-note" }),
|
||||
);
|
||||
expect(result.audioAsVoice).toBe(true);
|
||||
expect(result.mediaUrl).toMatch(/voice-\d+\.mp3$/);
|
||||
mediaDir = result.mediaUrl ? path.dirname(result.mediaUrl) : undefined;
|
||||
} finally {
|
||||
if (mediaDir) {
|
||||
rmSync(mediaDir, { recursive: true, force: true });
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
it("keeps non-native voice-note channels as regular audio files", async () => {
|
||||
await expectTtsPayloadResult({
|
||||
channel: "slack",
|
||||
|
||||
@@ -100,6 +100,8 @@ export type TtsResult = {
|
||||
attempts?: TtsProviderAttempt[];
|
||||
outputFormat?: string;
|
||||
voiceCompatible?: boolean;
|
||||
audioAsVoice?: boolean;
|
||||
target?: "audio-file" | "voice-note";
|
||||
};
|
||||
|
||||
export type TtsSynthesisResult = {
|
||||
@@ -114,6 +116,7 @@ export type TtsSynthesisResult = {
|
||||
outputFormat?: string;
|
||||
voiceCompatible?: boolean;
|
||||
fileExtension?: string;
|
||||
target?: "audio-file" | "voice-note";
|
||||
};
|
||||
|
||||
export type TtsTelephonyResult = {
|
||||
@@ -586,6 +589,7 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void {
|
||||
}
|
||||
|
||||
const OPUS_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix", "discord"]);
|
||||
const TRANSCODED_VOICE_NOTE_CHANNELS = new Set(["feishu"]);
|
||||
|
||||
function resolveChannelId(channel: string | undefined): ChannelId | null {
|
||||
return channel ? normalizeChannelId(channel) : null;
|
||||
@@ -596,6 +600,22 @@ function supportsNativeVoiceNoteTts(channel: string | undefined): boolean {
|
||||
return channelId !== null && OPUS_CHANNELS.has(channelId);
|
||||
}
|
||||
|
||||
function supportsTranscodedVoiceNoteTts(channel: string | undefined): boolean {
|
||||
const channelId = resolveChannelId(channel);
|
||||
return channelId !== null && TRANSCODED_VOICE_NOTE_CHANNELS.has(channelId);
|
||||
}
|
||||
|
||||
function shouldDeliverTtsAsVoice(params: {
|
||||
channel: string | undefined;
|
||||
target: "audio-file" | "voice-note" | undefined;
|
||||
voiceCompatible: boolean | undefined;
|
||||
}): boolean {
|
||||
if (!supportsNativeVoiceNoteTts(params.channel) || params.target !== "voice-note") {
|
||||
return false;
|
||||
}
|
||||
return params.voiceCompatible === true || supportsTranscodedVoiceNoteTts(params.channel);
|
||||
}
|
||||
|
||||
export function resolveTtsProviderOrder(primary: TtsProvider, cfg?: OpenClawConfig): TtsProvider[] {
|
||||
const normalizedPrimary = canonicalizeSpeechProviderId(primary, cfg) ?? primary;
|
||||
const ordered = new Set<TtsProvider>([normalizedPrimary]);
|
||||
@@ -782,6 +802,12 @@ export async function textToSpeech(params: {
|
||||
attempts: synthesis.attempts,
|
||||
outputFormat: synthesis.outputFormat,
|
||||
voiceCompatible: synthesis.voiceCompatible,
|
||||
audioAsVoice: shouldDeliverTtsAsVoice({
|
||||
channel: params.channel,
|
||||
target: synthesis.target,
|
||||
voiceCompatible: synthesis.voiceCompatible,
|
||||
}),
|
||||
target: synthesis.target,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -863,6 +889,7 @@ export async function synthesizeSpeech(params: {
|
||||
outputFormat: synthesis.outputFormat,
|
||||
voiceCompatible: synthesis.voiceCompatible,
|
||||
fileExtension: synthesis.fileExtension,
|
||||
target,
|
||||
};
|
||||
} catch (err) {
|
||||
const errorMsg = formatTtsProviderError(provider, err);
|
||||
@@ -1171,12 +1198,10 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
latencyMs: result.latencyMs,
|
||||
};
|
||||
|
||||
const shouldVoice =
|
||||
supportsNativeVoiceNoteTts(params.channel) && result.voiceCompatible === true;
|
||||
return {
|
||||
...nextPayload,
|
||||
mediaUrl: result.audioPath,
|
||||
audioAsVoice: shouldVoice || params.payload.audioAsVoice,
|
||||
audioAsVoice: result.audioAsVoice || params.payload.audioAsVoice,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -1199,6 +1224,8 @@ export const _test = {
|
||||
parseTtsDirectives,
|
||||
resolveModelOverridePolicy,
|
||||
supportsNativeVoiceNoteTts,
|
||||
supportsTranscodedVoiceNoteTts,
|
||||
shouldDeliverTtsAsVoice,
|
||||
summarizeText,
|
||||
getResolvedSpeechProviderConfig,
|
||||
formatTtsProviderError,
|
||||
|
||||
@@ -43,6 +43,28 @@ describe("createTtsTool", () => {
|
||||
expect(JSON.stringify(result.content)).not.toContain("MEDIA:");
|
||||
});
|
||||
|
||||
it("uses audioAsVoice from the TTS runtime even when the provider output is not native", async () => {
|
||||
textToSpeechSpy.mockResolvedValue({
|
||||
success: true,
|
||||
audioPath: "/tmp/reply.mp3",
|
||||
provider: "test",
|
||||
voiceCompatible: false,
|
||||
audioAsVoice: true,
|
||||
});
|
||||
|
||||
const tool = createTtsTool();
|
||||
const result = await tool.execute("call-1", { text: "hello", channel: "feishu" });
|
||||
|
||||
expect(result).toMatchObject({
|
||||
details: {
|
||||
media: {
|
||||
mediaUrl: "/tmp/reply.mp3",
|
||||
audioAsVoice: true,
|
||||
},
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("passes an optional timeout to speech generation", async () => {
|
||||
textToSpeechSpy.mockResolvedValue({
|
||||
success: true,
|
||||
|
||||
@@ -92,7 +92,7 @@ export function createTtsTool(opts?: {
|
||||
media: {
|
||||
mediaUrl: result.audioPath,
|
||||
trustedLocalMedia: true,
|
||||
...(result.voiceCompatible ? { audioAsVoice: true } : {}),
|
||||
...(result.audioAsVoice || result.voiceCompatible ? { audioAsVoice: true } : {}),
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
@@ -41,6 +41,8 @@ export type TtsStatusEntry = {
|
||||
error?: string;
|
||||
};
|
||||
|
||||
export type TtsSpeechTarget = "audio-file" | "voice-note";
|
||||
|
||||
export type SummarizeResult = {
|
||||
summary: string;
|
||||
latencyMs: number;
|
||||
@@ -99,6 +101,12 @@ export type TtsTestFacade = {
|
||||
parseTtsDirectives: (...args: unknown[]) => TtsDirectiveParseResult;
|
||||
resolveModelOverridePolicy: (...args: unknown[]) => ResolvedTtsModelOverrides;
|
||||
supportsNativeVoiceNoteTts: (channel: string | undefined) => boolean;
|
||||
supportsTranscodedVoiceNoteTts: (channel: string | undefined) => boolean;
|
||||
shouldDeliverTtsAsVoice: (params: {
|
||||
channel: string | undefined;
|
||||
target: TtsSpeechTarget | undefined;
|
||||
voiceCompatible: boolean | undefined;
|
||||
}) => boolean;
|
||||
summarizeText: (...args: unknown[]) => Promise<SummarizeResult>;
|
||||
getResolvedSpeechProviderConfig: (
|
||||
config: ResolvedTtsConfig,
|
||||
@@ -120,6 +128,8 @@ export type TtsResult = {
|
||||
attempts?: TtsProviderAttempt[];
|
||||
outputFormat?: string;
|
||||
voiceCompatible?: boolean;
|
||||
audioAsVoice?: boolean;
|
||||
target?: TtsSpeechTarget;
|
||||
};
|
||||
|
||||
export type TtsSynthesisResult = {
|
||||
@@ -134,6 +144,7 @@ export type TtsSynthesisResult = {
|
||||
outputFormat?: string;
|
||||
voiceCompatible?: boolean;
|
||||
fileExtension?: string;
|
||||
target?: TtsSpeechTarget;
|
||||
};
|
||||
|
||||
export type TtsTelephonyResult = {
|
||||
|
||||
Reference in New Issue
Block a user