fix(tts): pre-transcode synthesized audio to opus-in-CAF for native iMessage voice-memo bubbles via BlueBubbles (#72586)

End-to-end testing on macOS + BlueBubbles + ElevenLabs walked through three CAF flavors before landing on the format Apple's Messages.app actually emits when a user records a native iMessage voice memo:

- PCM int16 @ 44.1 kHz CAF: BlueBubbles' internal `afconvert -f m4af -d aac` conversion fails; the original CAF reaches iMessage but renders with 0 s duration.
- AAC @ 22.05 kHz mono CAF: BlueBubbles' conversion succeeds and the server silently downgrades the delivery, sending the converted MP3 as a generic audio attachment.
- **Opus @ 24 kHz mono CAF**: byte-identical to the descriptor block Apple's Messages.app produces; BlueBubbles passes it through unchanged and iMessage renders a native voice-memo bubble with proper duration and waveform UI.

Adds an opt-in `tts.voice.preferAudioFileFormat` channel capability and a macOS `afconvert`-backed pre-transcode in the speech-core pipeline. BlueBubbles declares `preferAudioFileFormat: "caf"`. Other channels are unaffected. Falls back to the original buffer when the host platform, the source/target pair, or the transcoder process can't produce the preferred container — so non-Darwin hosts and unsupported provider combinations are unchanged.

Also adds a `caff` magic-byte sniff in `src/media/mime.ts` so the auto-reply host-local-media validator (which uses `file-type` and didn't recognize CAF natively) accepts the buffer instead of dropping it as "⚠️ Media failed."

Fixes #72506.

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-06 14:10:51 +00:00 · 2026-04-27 14:15:16 -07:00
parent fb4d9fc4fb
commit da3d17e1ca
9 changed files with 409 additions and 7 deletions
--- a/extensions/bluebubbles/src/channel-shared.ts
+++ b/extensions/bluebubbles/src/channel-shared.ts
@@ -35,6 +35,13 @@ export const bluebubblesCapabilities: ChannelPlugin<ResolvedBlueBubblesAccount>[
    voice: {
      synthesisTarget: "audio-file",
      audioFileFormats: ["mp3", "caf", "audio/mpeg", "audio/x-caf"],
+      // Prefer CAF when the host can pre-transcode (afconvert on macOS).
+      // The BlueBubbles server otherwise races a CAF→MP3 conversion against
+      // the upload write completing and silently falls back to a generic
+      // attachment send when its conversion fails. Pre-encoding to CAF
+      // bypasses that race so iMessage renders the result as a native voice
+      // memo bubble (waveform UI) instead of a plain audio attachment.
+      preferAudioFileFormat: "caf",
    },
  },
  reactions: true,
--- a/extensions/speech-core/src/audio-transcode.test.ts
+++ b/extensions/speech-core/src/audio-transcode.test.ts
@@ -0,0 +1,64 @@
+import { describe, expect, it } from "vitest";
+import { transcodeAudioBuffer } from "./audio-transcode.js";
+
+// Covers the outcomes that resolve before `afconvert` would be spawned:
+// extension validation, the same-container no-op, a missing recipe, and the
+// non-macOS platform gate.
+describe("transcodeAudioBuffer", () => {
+  it("returns noop-same-container when source and target containers match", async () => {
+    const result = await transcodeAudioBuffer({
+      audioBuffer: Buffer.from("payload"),
+      sourceExtension: "mp3",
+      targetExtension: ".mp3",
+    });
+    expect(result).toEqual({ ok: false, reason: "noop-same-container" });
+  });
+
+  it("returns no-recipe when no afconvert recipe is defined for the requested pair", async () => {
+    const result = await transcodeAudioBuffer({
+      audioBuffer: Buffer.from("payload"),
+      sourceExtension: "mp3",
+      targetExtension: "flac",
+    });
+    expect(result).toEqual({ ok: false, reason: "no-recipe" });
+  });
+
+  it("returns invalid-extension for an empty source extension", async () => {
+    const result = await transcodeAudioBuffer({
+      audioBuffer: Buffer.from("payload"),
+      sourceExtension: "",
+      targetExtension: "caf",
+    });
+    expect(result).toEqual({ ok: false, reason: "invalid-extension" });
+  });
+
+  it("returns invalid-extension for an empty target extension", async () => {
+    const result = await transcodeAudioBuffer({
+      audioBuffer: Buffer.from("payload"),
+      sourceExtension: "mp3",
+      targetExtension: "",
+    });
+    expect(result).toEqual({ ok: false, reason: "invalid-extension" });
+  });
+
+  it("rejects path-traversal style extensions", async () => {
+    const result = await transcodeAudioBuffer({
+      audioBuffer: Buffer.from("payload"),
+      sourceExtension: "../etc/passwd",
+      targetExtension: "caf",
+    });
+    expect(result).toEqual({ ok: false, reason: "invalid-extension" });
+  });
+
+  // macOS: a valid mp3→caf request would proceed to spawn `afconvert`,
+  // which we don't want to run from a unit test. Skip the case there so it is
+  // reported as skipped instead of passing vacuously. The Darwin happy path
+  // is exercised end-to-end via the BlueBubbles voice-memo flow.
+  it.skipIf(process.platform === "darwin")(
+    "returns platform-unsupported off-Darwin without invoking afconvert",
+    async () => {
+      const result = await transcodeAudioBuffer({
+        audioBuffer: Buffer.from("payload"),
+        sourceExtension: "mp3",
+        targetExtension: "caf",
+      });
+      expect(result).toEqual({ ok: false, reason: "platform-unsupported" });
+    },
+  );
+});
--- a/extensions/speech-core/src/audio-transcode.ts
+++ b/extensions/speech-core/src/audio-transcode.ts
@@ -0,0 +1,134 @@
+import { spawn } from "node:child_process";
+import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs";
+import { join } from "node:path";
+import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/sandbox";
+
+/** Container token (file-extension shape, no leading dot) the host knows how
+ * to pre-transcode into. Update in lockstep with `pickAfconvertRecipe`. */
+export type HostTranscodableContainer = "caf";
+
+/**
+ * Result of {@link transcodeAudioBuffer}.
+ *
+ * - `ok: true` — `buffer` holds the bytes read back from the transcoder's
+ *   output file.
+ * - `ok: false` — `reason` says why no transcoded buffer was produced:
+ *   - `"invalid-extension"`: the source or target extension failed validation.
+ *   - `"noop-same-container"`: source and target normalize to the same token.
+ *   - `"no-recipe"`: no `afconvert` argument recipe exists for the pair.
+ *   - `"platform-unsupported"`: the host is not macOS
+ *     (`process.platform !== "darwin"`).
+ *   - `"transcoder-failed"`: the transcode was attempted and failed; `detail`
+ *     carries the failure description (exit code, timeout, or error message).
+ */
+export type TranscodeOutcome =
+  | { ok: true; buffer: Buffer }
+  | {
+      ok: false;
+      reason:
+        | "platform-unsupported"
+        | "invalid-extension"
+        | "noop-same-container"
+        | "no-recipe"
+        | "transcoder-failed";
+      detail?: string;
+    };
+
+/**
+ * Best-effort audio container transcode using macOS `afconvert`.
+ *
+ * Used by the TTS pipeline to pre-encode synthesized audio into a channel's
+ * preferred container (see `ChannelTtsVoiceDeliveryCapabilities.preferAudioFileFormat`)
+ * so the channel's downstream does not have to perform a container
+ * conversion of its own. Returns a discriminated outcome so callers can
+ * distinguish "we didn't try" (platform/recipe/noop) from "we tried and the
+ * transcoder failed", which is the case worth logging.
+ *
+ * Currently only macOS is supported because `afconvert` is the only widely
+ * available encoder we ship a recipe for.
+ *
+ * Filesystem failures while preparing the scratch directory (creating the tmp
+ * root, `mkdtemp`, writing the input file) are reported as
+ * `"transcoder-failed"` rather than thrown, so a caller that falls back to the
+ * original buffer on any non-ok outcome keeps working when tmp space is
+ * unavailable.
+ *
+ * @param params.audioBuffer - Encoded audio bytes to transcode.
+ * @param params.sourceExtension - Container of `audioBuffer`; a leading dot is
+ *   accepted.
+ * @param params.targetExtension - Desired container; a leading dot is
+ *   accepted.
+ * @param params.timeoutMs - Upper bound for the `afconvert` run; defaults to
+ *   5000 ms.
+ * @returns The transcoded bytes on success, otherwise the reason no transcode
+ *   was produced.
+ */
+export async function transcodeAudioBuffer(params: {
+  audioBuffer: Buffer;
+  sourceExtension: string;
+  targetExtension: string;
+  timeoutMs?: number;
+}): Promise<TranscodeOutcome> {
+  // Validate inputs first so callers get a specific reason regardless of
+  // host platform. Platform-unsupported is the gate immediately before the
+  // actual `afconvert` invocation.
+  const source = normalizeExt(params.sourceExtension);
+  const target = normalizeExt(params.targetExtension);
+  if (!source || !target) {
+    return { ok: false, reason: "invalid-extension" };
+  }
+  if (source === target) {
+    return { ok: false, reason: "noop-same-container" };
+  }
+  const recipe = pickAfconvertRecipe(source, target);
+  if (!recipe) {
+    return { ok: false, reason: "no-recipe" };
+  }
+  if (process.platform !== "darwin") {
+    return { ok: false, reason: "platform-unsupported" };
+  }
+
+  // Assigned inside the `try` so that a failure to create the scratch
+  // directory is turned into an outcome instead of escaping as a rejection.
+  let tmpDir: string | undefined;
+  try {
+    const tmpRoot = resolvePreferredOpenClawTmpDir();
+    mkdirSync(tmpRoot, { recursive: true, mode: 0o700 });
+    tmpDir = mkdtempSync(join(tmpRoot, "tts-transcode-"));
+    const inPath = join(tmpDir, `in.${source}`);
+    const outPath = join(tmpDir, `out.${target}`);
+    writeFileSync(inPath, params.audioBuffer, { mode: 0o600 });
+    const result = await runAfconvert({
+      args: [...recipe, inPath, outPath],
+      timeoutMs: params.timeoutMs ?? 5000,
+    });
+    if (!result.ok) {
+      return { ok: false, reason: "transcoder-failed", detail: result.detail };
+    }
+    return { ok: true, buffer: readFileSync(outPath) };
+  } catch (err) {
+    // Thrown values are not guaranteed to be `Error` instances; narrow before
+    // reading `.message`.
+    const detail = err instanceof Error ? err.message : String(err);
+    return { ok: false, reason: "transcoder-failed", detail };
+  } finally {
+    if (tmpDir !== undefined) {
+      try {
+        rmSync(tmpDir, { recursive: true, force: true });
+      } catch {
+        // best-effort cleanup
+      }
+    }
+  }
+}
+
+/**
+ * Normalizes a file-extension token for comparison and tmp-file naming.
+ *
+ * Trims surrounding whitespace, lowercases, and strips one leading dot.
+ *
+ * @param ext - Raw extension, with or without a leading dot.
+ * @returns The normalized token when it is 1–12 characters of `[a-z0-9]`,
+ *   otherwise `undefined`.
+ */
+function normalizeExt(ext: string): string | undefined {
+  // Pattern matches the sibling helper in src/media/audio-transcode.ts: a short
+  // alphanumeric extension token. Keeps the value safe to interpolate into
+  // tmp-file names below without introducing a path-traversal surface.
+  const trimmed = ext.trim().toLowerCase().replace(/^\./, "");
+  return /^[a-z0-9]{1,12}$/.test(trimmed) ? trimmed : undefined;
+}
+
+/**
+ * Returns the `afconvert` arguments (excluding the input and output paths)
+ * for a source→target container pair, or `undefined` when no recipe exists.
+ *
+ * NOTE(review): only `target` is inspected; `source` is currently unused, so
+ * every validated source extension paired with `"caf"` receives the recipe.
+ */
+function pickAfconvertRecipe(source: string, target: string): string[] | undefined {
+  // Added for the MP3→CAF path used by BlueBubbles voice memos; the check
+  // below keys on the target container alone. Keep this in lockstep with
+  // `HostTranscodableContainer` above so a typo at the channel-capability
+  // declaration site is a compile-time error.
+  if (target === "caf") {
+    // Opus-in-CAF, mono, 24 kHz. Validated against macOS 15.x Messages.app's
+    // native voice-memo CAF descriptor (1 ch, 24000 Hz, opus); other CAF
+    // flavors (PCM, AAC) get downgraded to plain audio attachments along the
+    // BlueBubbles → Messages.app path. If iMessage stops rendering the result
+    // as a voice memo after a system update, try forcing frames-per-packet
+    // explicitly via `opus@24000#480` and re-validate. See #72506.
+    return ["-f", "caff", "-d", "opus@24000", "-c", "1"];
+  }
+  return undefined;
+}
+
+/**
+ * Runs `/usr/bin/afconvert` with the given arguments and reports how it ended.
+ *
+ * The child's stdio is ignored, so only the exit status is observed.
+ *
+ * @param params.args - Full argument list, including input and output paths.
+ * @param params.timeoutMs - Time allowed before the child is sent `SIGKILL`.
+ * @returns `{ ok: true }` on exit code 0; otherwise `{ ok: false }` with
+ *   `detail` set to `exit-<code>` (`exit-unknown` when the code is null),
+ *   `timeout-<ms>ms`, or the spawn error's message.
+ */
+function runAfconvert(params: {
+  args: string[];
+  timeoutMs: number;
+}): Promise<{ ok: true } | { ok: false; detail: string }> {
+  return new Promise((resolve) => {
+    const child = spawn("/usr/bin/afconvert", params.args, { stdio: "ignore" });
+    // On timeout the promise is settled here; the `exit` event that follows
+    // the kill calls `resolve` again, which has no effect on a settled promise.
+    const timer = setTimeout(() => {
+      child.kill("SIGKILL");
+      resolve({ ok: false, detail: `timeout-${params.timeoutMs}ms` });
+    }, params.timeoutMs);
+    // Emitted when the process could not be spawned or killed.
+    child.once("error", (err) => {
+      clearTimeout(timer);
+      resolve({ ok: false, detail: err.message });
+    });
+    child.once("exit", (code) => {
+      clearTimeout(timer);
+      if (code === 0) {
+        resolve({ ok: true });
+      } else {
+        resolve({ ok: false, detail: `exit-${code ?? "unknown"}` });
+      }
+    });
+  });
+}
--- a/extensions/speech-core/src/tts.test.ts
+++ b/extensions/speech-core/src/tts.test.ts
@@ -31,6 +31,32 @@ const prepareSynthesisMock = vi.hoisted(() =>

 const listSpeechProvidersMock = vi.hoisted(() => vi.fn());
 const getSpeechProviderMock = vi.hoisted(() => vi.fn());
+const transcodeAudioBufferMock = vi.hoisted(() =>
+  // Default off: most tests rely on the synthesized buffer reaching the
+  // channel unchanged. Tests that exercise the pre-transcode branch override
+  // per-call via `transcodeAudioBufferMock.mockResolvedValueOnce(...)`.
+  // Typed as the helper's full return shape so per-call overrides aren't
+  // narrowed to the default's literal.
+  // The inline union below duplicates `TranscodeOutcome` from
+  // ./audio-transcode.ts; update both together when a reason is added.
+  vi.fn<
+    () => Promise<
+      | { ok: true; buffer: Buffer }
+      | {
+          ok: false;
+          reason:
+            | "platform-unsupported"
+            | "invalid-extension"
+            | "noop-same-container"
+            | "no-recipe"
+            | "transcoder-failed";
+          detail?: string;
+        }
+    >
+  >(async () => ({ ok: false, reason: "platform-unsupported" })),
+);
+
+vi.mock("./audio-transcode.js", () => ({
+  transcodeAudioBuffer: transcodeAudioBufferMock,
+}));

 vi.mock("openclaw/plugin-sdk/channel-targets", () => ({
  normalizeChannelId: (channel: string | undefined) => channel?.trim().toLowerCase() ?? null,
@@ -40,6 +66,7 @@ vi.mock("openclaw/plugin-sdk/channel-targets", () => ({
      return {
        synthesisTarget: "audio-file",
        audioFileFormats: ["mp3", "caf", "audio/mpeg", "audio/x-caf"],
+        preferAudioFileFormat: "caf",
      };
    }
    if (normalized === "feishu" || normalized === "whatsapp") {
@@ -171,6 +198,7 @@ describe("speech-core native voice-note routing", () => {
    clearRuntimeConfigSnapshot();
    synthesizeMock.mockClear();
    prepareSynthesisMock.mockClear();
+    transcodeAudioBufferMock.mockClear();
    installSpeechProviders([createMockSpeechProvider()]);
  });

@@ -220,6 +248,57 @@ describe("speech-core native voice-note routing", () => {
    });
  });

+  it("pre-transcodes BlueBubbles synthesized mp3 to opus-in-CAF when the host can satisfy preferAudioFileFormat", async () => {
+    transcodeAudioBufferMock.mockResolvedValueOnce({
+      ok: true,
+      buffer: Buffer.from("transcoded-caf"),
+    });
+    await expectTtsPayloadResult({
+      channel: "bluebubbles",
+      prefsName: "openclaw-speech-core-tts-bluebubbles-caf-transcode-test",
+      text: "This BlueBubbles reply should be pre-transcoded to a native voice-memo CAF.",
+      target: "audio-file",
+      audioAsVoice: true,
+      mediaExtension: "caf",
+      providerResult: {
+        audioBuffer: Buffer.from("mp3"),
+        outputFormat: "mp3",
+        fileExtension: ".mp3",
+        voiceCompatible: false,
+      },
+    });
+    expect(transcodeAudioBufferMock).toHaveBeenCalledWith(
+      expect.objectContaining({ sourceExtension: "mp3", targetExtension: "caf" }),
+    );
+  });
+
+  it("falls back to the original mp3 buffer when the host transcoder fails", async () => {
+    transcodeAudioBufferMock.mockResolvedValueOnce({
+      ok: false,
+      reason: "transcoder-failed",
+      detail: "exit-1",
+    });
+    // Even though the transcode failed, the original mp3 still satisfies
+    // BlueBubbles' audioFileFormats list, so the channel still flips
+    // audioAsVoice. The user gets the v2026.4.26 PCM-CAF behavior (a voice
+    // memo bubble, possibly with bad duration) instead of a regression — and
+    // the failure is logged via the call site in tts.ts so it isn't silent.
+    await expectTtsPayloadResult({
+      channel: "bluebubbles",
+      prefsName: "openclaw-speech-core-tts-bluebubbles-caf-fallback-test",
+      text: "This BlueBubbles reply should fall back to the original mp3.",
+      target: "audio-file",
+      audioAsVoice: true,
+      mediaExtension: "mp3",
+      providerResult: {
+        audioBuffer: Buffer.from("mp3"),
+        outputFormat: "mp3",
+        fileExtension: ".mp3",
+        voiceCompatible: false,
+      },
+    });
+  });
+
  it("uses the active runtime snapshot when source config still contains TTS SecretRefs", async () => {
    const sourceConfig = {
      messages: {
--- a/extensions/speech-core/src/tts.ts
+++ b/extensions/speech-core/src/tts.ts
@@ -57,6 +57,7 @@ import {
  type TtsDirectiveParseResult,
  type TtsConfigResolutionContext,
 } from "../api.js";
+import { transcodeAudioBuffer } from "./audio-transcode.js";

 export type {
  ResolvedTtsConfig,
@@ -1111,11 +1112,27 @@ export async function textToSpeech(params: {
    };
  }

+  let audioBuffer = synthesis.audioBuffer;
+  let fileExtension = synthesis.fileExtension;
+  let outputFormat = synthesis.outputFormat;
+  const transcoded = await maybePreTranscodeForVoiceDelivery({
+    channel: params.channel,
+    target: synthesis.target,
+    audioBuffer,
+    fileExtension,
+    outputFormat,
+  });
+  if (transcoded) {
+    audioBuffer = transcoded.audioBuffer;
+    fileExtension = transcoded.fileExtension;
+    outputFormat = transcoded.outputFormat;
+  }
+
  const tempRoot = resolvePreferredOpenClawTmpDir();
  mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
  const tempDir = mkdtempSync(path.join(tempRoot, "tts-"));
-  const audioPath = path.join(tempDir, `voice-${Date.now()}${synthesis.fileExtension}`);
-  writeFileSync(audioPath, synthesis.audioBuffer);
+  const audioPath = path.join(tempDir, `voice-${Date.now()}${fileExtension}`);
+  writeFileSync(audioPath, audioBuffer);
  scheduleCleanup(tempDir);

  return {
@@ -1127,19 +1144,64 @@ export async function textToSpeech(params: {
    fallbackFrom: synthesis.fallbackFrom,
    attemptedProviders: synthesis.attemptedProviders,
    attempts: synthesis.attempts,
-    outputFormat: synthesis.outputFormat,
+    outputFormat,
    voiceCompatible: synthesis.voiceCompatible,
    audioAsVoice: shouldDeliverTtsAsVoice({
      channel: params.channel,
      target: synthesis.target,
      voiceCompatible: synthesis.voiceCompatible,
-      fileExtension: synthesis.fileExtension,
-      outputFormat: synthesis.outputFormat,
+      fileExtension,
+      outputFormat,
    }),
    target: synthesis.target,
  };
 }

+/**
+ * Pre-transcodes synthesized audio into the channel's preferred audio-file
+ * container when the channel declares one and the host can produce it.
+ *
+ * @param params.channel - Channel id used to look up voice-delivery
+ *   capabilities.
+ * @param params.target - Synthesis target; anything other than `"audio-file"`
+ *   is left untouched.
+ * @param params.audioBuffer - Synthesized audio bytes.
+ * @param params.fileExtension - Extension of the synthesized audio; a leading
+ *   dot is accepted.
+ * @param params.outputFormat - Provider output format; not read by this
+ *   function.
+ * @returns The transcoded buffer with `fileExtension` (dot-prefixed) and
+ *   `outputFormat` set to the preferred container, or `undefined` when the
+ *   caller should keep the original audio: wrong target, no declared
+ *   preference, source already in the preferred container, or no transcoded
+ *   buffer was produced.
+ */
+async function maybePreTranscodeForVoiceDelivery(params: {
+  channel: string | undefined;
+  target: "audio-file" | "voice-note" | undefined;
+  audioBuffer: Buffer;
+  fileExtension: string;
+  outputFormat?: string;
+}): Promise<{ audioBuffer: Buffer; fileExtension: string; outputFormat?: string } | undefined> {
+  if (params.target !== "audio-file") {
+    return undefined;
+  }
+  const delivery = resolveChannelTtsVoiceDelivery(params.channel);
+  const preferred = delivery?.preferAudioFileFormat?.trim().toLowerCase();
+  if (!preferred) {
+    return undefined;
+  }
+  // Same normalization shape as the transcoder's own extension handling:
+  // trimmed, lowercased, no leading dot.
+  const sourceExt = params.fileExtension.trim().toLowerCase().replace(/^\./, "");
+  if (sourceExt === preferred) {
+    return undefined;
+  }
+  const outcome = await transcodeAudioBuffer({
+    audioBuffer: params.audioBuffer,
+    sourceExtension: sourceExt,
+    targetExtension: preferred,
+  });
+  if (!outcome.ok) {
+    if (outcome.reason === "transcoder-failed") {
+      // Surface only the case where the host actually attempted the transcode
+      // and it broke. The other reasons ("no-recipe", "noop-same-container",
+      // "platform-unsupported", "invalid-extension") are by-design skips and
+      // would just be log noise. This is the line that tells you "the channel
+      // asked for a pre-encode, the host had a recipe for it, and it failed"
+      // — i.e. the case where #72506 silently regresses.
+      logVerbose(
+        `TTS: pre-transcode ${sourceExt}->${preferred} for channel=${params.channel ?? "?"} failed: ${outcome.detail ?? "unknown"}`,
+      );
+    }
+    return undefined;
+  }
+  return {
+    audioBuffer: outcome.buffer,
+    fileExtension: `.${preferred}`,
+    outputFormat: preferred,
+  };
+}
+
 export async function synthesizeSpeech(params: {
  text: string;
  cfg: OpenClawConfig;