fix(tts): route WhatsApp MP3 TTS as voice notes

2026-05-06 15:40:44 +00:00 · 2026-04-26 03:25:55 +01:00
parent 90cd9fce85
commit 9b91040053
4 changed files with 34 additions and 36 deletions
--- a/extensions/speech-core/src/tts.test.ts
+++ b/extensions/speech-core/src/tts.test.ts
@@ -71,7 +71,12 @@ async function expectTtsPayloadResult(params: {
  text: string;
  target: "voice-note" | "audio-file";
  audioAsVoice: true | undefined;
+  providerResult?: MockSpeechSynthesisResult;
+  mediaExtension?: string;
 }) {
+  if (params.providerResult) {
+    synthesizeMock.mockResolvedValueOnce(params.providerResult);
+  }
  const cfg = createTtsConfig(params.prefsName);
  let mediaDir: string | undefined;
  try {
@@ -84,7 +89,7 @@ async function expectTtsPayloadResult(params: {

    expect(synthesizeMock).toHaveBeenCalledWith(expect.objectContaining({ target: params.target }));
    expect(result.audioAsVoice).toBe(params.audioAsVoice);
-    expect(result.mediaUrl).toMatch(/voice-\d+\.ogg$/);
+    expect(result.mediaUrl).toMatch(new RegExp(`voice-\\d+\\.${params.mediaExtension ?? "ogg"}$`));

    mediaDir = result.mediaUrl ? path.dirname(result.mediaUrl) : undefined;
  } finally {
@@ -118,35 +123,26 @@ describe("speech-core native voice-note routing", () => {
    });
  });

-  it("marks Feishu voice-note TTS for channel-side transcoding when provider returns mp3", async () => {
-    synthesizeMock.mockResolvedValueOnce({
-      audioBuffer: Buffer.from("mp3"),
-      outputFormat: "mp3",
-      fileExtension: ".mp3",
-      voiceCompatible: false,
-    });
-    const cfg = createTtsConfig("openclaw-speech-core-tts-feishu-mp3-test");
-    let mediaDir: string | undefined;
-    try {
-      const result = await maybeApplyTtsToPayload({
-        payload: { text: "This Feishu reply should be transcoded by the channel." },
-        cfg,
-        channel: "feishu",
-        kind: "final",
+  it.each(["feishu", "whatsapp"] as const)(
+    "marks %s voice-note TTS for channel-side transcoding when provider returns mp3",
+    async (channel) => {
+      expect(_test.supportsTranscodedVoiceNoteTts(channel)).toBe(true);
+      await expectTtsPayloadResult({
+        channel,
+        prefsName: `openclaw-speech-core-tts-${channel}-mp3-test`,
+        text: `This ${channel} reply should be transcoded by the channel.`,
+        target: "voice-note",
+        audioAsVoice: true,
+        mediaExtension: "mp3",
+        providerResult: {
+          audioBuffer: Buffer.from("mp3"),
+          outputFormat: "mp3",
+          fileExtension: ".mp3",
+          voiceCompatible: false,
+        },
      });
-
-      expect(synthesizeMock).toHaveBeenCalledWith(
-        expect.objectContaining({ target: "voice-note" }),
-      );
-      expect(result.audioAsVoice).toBe(true);
-      expect(result.mediaUrl).toMatch(/voice-\d+\.mp3$/);
-      mediaDir = result.mediaUrl ? path.dirname(result.mediaUrl) : undefined;
-    } finally {
-      if (mediaDir) {
-        rmSync(mediaDir, { recursive: true, force: true });
-      }
-    }
-  });
+    },
+  );

  it("keeps non-native voice-note channels as regular audio files", async () => {
    await expectTtsPayloadResult({
--- a/extensions/speech-core/src/tts.ts
+++ b/extensions/speech-core/src/tts.ts
@@ -640,7 +640,7 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void {
 }

 const OPUS_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix", "discord"]);
-const TRANSCODED_VOICE_NOTE_CHANNELS = new Set(["feishu"]);
+const TRANSCODED_VOICE_NOTE_CHANNELS = new Set(["feishu", "whatsapp"]);

 function resolveChannelId(channel: string | undefined): ChannelId | null {
  return channel ? normalizeChannelId(channel) : null;