From da3d17e1ca3c1f3ba831feb36ee4ffe5c4c8dbf9 Mon Sep 17 00:00:00 2001 From: Omar Shahine Date: Mon, 27 Apr 2026 14:15:16 -0700 Subject: [PATCH] fix(tts): pre-transcode synthesized audio to opus-in-CAF for native iMessage voice-memo bubbles via BlueBubbles (#72586) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit End-to-end testing on macOS + BlueBubbles + ElevenLabs walked through three CAF flavors before landing on the format Apple's Messages.app actually emits when a user records a native iMessage voice memo: - PCM int16 @ 44.1 kHz CAF: BlueBubbles' internal `afconvert -f m4af -d aac` conversion fails; the original CAF reaches iMessage but renders with 0 s duration. - AAC @ 22.05 kHz mono CAF: BlueBubbles' conversion succeeds and the server silently downgrades the delivery, sending the converted MP3 as a generic audio attachment. - **Opus @ 24 kHz mono CAF**: byte-identical to the descriptor block Apple's Messages.app produces; BlueBubbles passes it through unchanged and iMessage renders a native voice-memo bubble with proper duration and waveform UI. Adds an opt-in `tts.voice.preferAudioFileFormat` channel capability and a macOS `afconvert`-backed pre-transcode in the speech-core pipeline. BlueBubbles declares `preferAudioFileFormat: "caf"`. Other channels are unaffected. Falls back to the original buffer when the host platform, the source/target pair, or the transcoder process can't produce the preferred container — so non-Darwin hosts and unsupported provider combinations are unchanged. Also adds a `caff` magic-byte sniff in `src/media/mime.ts` so the auto-reply host-local-media validator (which uses `file-type` and didn't recognize CAF natively) accepts the buffer instead of dropping it as "⚠️ Media failed." Fixes #72506. 
Co-authored-by: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 4 + extensions/bluebubbles/src/channel-shared.ts | 7 + .../speech-core/src/audio-transcode.test.ts | 64 +++++++++ extensions/speech-core/src/audio-transcode.ts | 134 ++++++++++++++++++ extensions/speech-core/src/tts.test.ts | 79 +++++++++++ extensions/speech-core/src/tts.ts | 72 +++++++++- src/channels/plugins/types.core.ts | 19 +++ src/media/mime.test.ts | 17 +++ src/media/mime.ts | 20 ++- 9 files changed, 409 insertions(+), 7 deletions(-) create mode 100644 extensions/speech-core/src/audio-transcode.test.ts create mode 100644 extensions/speech-core/src/audio-transcode.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 124d75d84aa..68e4c79fdc7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -71,6 +71,10 @@ Docs: https://docs.openclaw.ai - WhatsApp/Web: keep quiet but healthy linked-device sessions connected by basing the watchdog on WhatsApp Web transport activity, while retaining a longer app-silence cap so frame activity cannot mask a stuck session forever. Fixes #70678; carries forward the focused #71466 approach and keeps #63939 as related configurable-timeout follow-up. Thanks @vincentkoc and @oromeis. - Discord/gateway: count failed health-monitor restart attempts toward cooldown and hourly caps, and evict stale account lifecycle state during channel reloads so repeated Discord gateway recovery cannot loop on old status. Fixes #38596. (#40413) Thanks @jellyAI-dev and @vashquez. +### Fixes + +- TTS/BlueBubbles: pre-transcode synthesized MP3 audio to opus-in-CAF (mono, 24 kHz — validated against macOS 15.x Messages.app's native voice-memo CAF descriptor) on macOS hosts before handing the file to BlueBubbles, so iMessage renders the result as a native voice-memo bubble with proper duration and waveform UI instead of a plain file attachment. 
Adds an opt-in `tts.voice.preferAudioFileFormat` channel capability and a magic-byte sniff for the CAF container so the host-local-media validator (which uses `file-type` and didn't recognize CAF natively) can verify the pre-transcoded buffer. Channels that don't opt in are unaffected. (#72586) Fixes #72506. Thanks @omarshahine. + ## 2026.4.26 ### Changes diff --git a/extensions/bluebubbles/src/channel-shared.ts b/extensions/bluebubbles/src/channel-shared.ts index fc260c55610..9ed9bb4587d 100644 --- a/extensions/bluebubbles/src/channel-shared.ts +++ b/extensions/bluebubbles/src/channel-shared.ts @@ -35,6 +35,13 @@ export const bluebubblesCapabilities: ChannelPlugin[ voice: { synthesisTarget: "audio-file", audioFileFormats: ["mp3", "caf", "audio/mpeg", "audio/x-caf"], + // Prefer CAF when the host can pre-transcode (afconvert on macOS). + // The BlueBubbles server otherwise races a CAF→MP3 conversion against + // the upload write completing and silently falls back to a generic + // attachment send when its conversion fails. Pre-encoding to CAF + // bypasses that race so iMessage renders the result as a native voice + // memo bubble (waveform UI) instead of a plain audio attachment. 
+ preferAudioFileFormat: "caf", }, }, reactions: true, diff --git a/extensions/speech-core/src/audio-transcode.test.ts b/extensions/speech-core/src/audio-transcode.test.ts new file mode 100644 index 00000000000..4814c19f05b --- /dev/null +++ b/extensions/speech-core/src/audio-transcode.test.ts @@ -0,0 +1,64 @@ +import { describe, expect, it } from "vitest"; +import { transcodeAudioBuffer } from "./audio-transcode.js"; + +describe("transcodeAudioBuffer", () => { + it("returns noop-same-container when source and target containers match", async () => { + const result = await transcodeAudioBuffer({ + audioBuffer: Buffer.from("payload"), + sourceExtension: "mp3", + targetExtension: ".mp3", + }); + expect(result).toEqual({ ok: false, reason: "noop-same-container" }); + }); + + it("returns no-recipe when no afconvert recipe is defined for the requested pair", async () => { + const result = await transcodeAudioBuffer({ + audioBuffer: Buffer.from("payload"), + sourceExtension: "mp3", + targetExtension: "flac", + }); + expect(result).toEqual({ ok: false, reason: "no-recipe" }); + }); + + it("returns invalid-extension for an empty source extension", async () => { + const result = await transcodeAudioBuffer({ + audioBuffer: Buffer.from("payload"), + sourceExtension: "", + targetExtension: "caf", + }); + expect(result).toEqual({ ok: false, reason: "invalid-extension" }); + }); + + it("returns invalid-extension for an empty target extension", async () => { + const result = await transcodeAudioBuffer({ + audioBuffer: Buffer.from("payload"), + sourceExtension: "mp3", + targetExtension: "", + }); + expect(result).toEqual({ ok: false, reason: "invalid-extension" }); + }); + + it("rejects path-traversal style extensions", async () => { + const result = await transcodeAudioBuffer({ + audioBuffer: Buffer.from("payload"), + sourceExtension: "../etc/passwd", + targetExtension: "caf", + }); + expect(result).toEqual({ ok: false, reason: "invalid-extension" }); + }); + + it("returns 
platform-unsupported off-Darwin without invoking afconvert", async () => { + if (process.platform === "darwin") { + // macOS: a valid mp3→caf request would proceed to spawn `afconvert`, + // which we don't want to run from a unit test. The Darwin happy path + // is exercised end-to-end via the BlueBubbles voice-memo flow. + return; + } + const result = await transcodeAudioBuffer({ + audioBuffer: Buffer.from("payload"), + sourceExtension: "mp3", + targetExtension: "caf", + }); + expect(result).toEqual({ ok: false, reason: "platform-unsupported" }); + }); +}); diff --git a/extensions/speech-core/src/audio-transcode.ts b/extensions/speech-core/src/audio-transcode.ts new file mode 100644 index 00000000000..6627204bbf6 --- /dev/null +++ b/extensions/speech-core/src/audio-transcode.ts @@ -0,0 +1,134 @@ +import { spawn } from "node:child_process"; +import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs"; +import { join } from "node:path"; +import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/sandbox"; + +/** Container token (file-extension shape, no leading dot) the host knows how + * to pre-transcode into. Update in lockstep with `pickAfconvertRecipe`. */ +export type HostTranscodableContainer = "caf"; + +export type TranscodeOutcome = + | { ok: true; buffer: Buffer } + | { + ok: false; + reason: + | "platform-unsupported" + | "invalid-extension" + | "noop-same-container" + | "no-recipe" + | "transcoder-failed"; + detail?: string; + }; + +/** + * Best-effort audio container transcode using macOS `afconvert`. + * + * Used by the TTS pipeline to pre-encode synthesized audio into a channel's + * preferred container (see `ChannelTtsVoiceDeliveryCapabilities.preferAudioFileFormat`) + * so the channel's downstream does not have to perform a container + * conversion of its own. 
Returns a discriminated outcome so callers can + * distinguish "we didn't try" (platform/recipe/noop) from "we tried and the + * transcoder failed", which is the case worth logging. + * + * Currently only macOS is supported because `afconvert` is the only widely + * available encoder we ship a recipe for. + */ +export async function transcodeAudioBuffer(params: { + audioBuffer: Buffer; + sourceExtension: string; + targetExtension: string; + timeoutMs?: number; +}): Promise { + // Validate inputs first so callers get a specific reason regardless of + // host platform. Platform-unsupported is the gate immediately before the + // actual `afconvert` invocation. + const source = normalizeExt(params.sourceExtension); + const target = normalizeExt(params.targetExtension); + if (!source || !target) { + return { ok: false, reason: "invalid-extension" }; + } + if (source === target) { + return { ok: false, reason: "noop-same-container" }; + } + const recipe = pickAfconvertRecipe(source, target); + if (!recipe) { + return { ok: false, reason: "no-recipe" }; + } + if (process.platform !== "darwin") { + return { ok: false, reason: "platform-unsupported" }; + } + + const tmpRoot = resolvePreferredOpenClawTmpDir(); + mkdirSync(tmpRoot, { recursive: true, mode: 0o700 }); + const tmpDir = mkdtempSync(join(tmpRoot, "tts-transcode-")); + const inPath = join(tmpDir, `in.${source}`); + const outPath = join(tmpDir, `out.${target}`); + try { + writeFileSync(inPath, params.audioBuffer, { mode: 0o600 }); + const result = await runAfconvert({ + args: [...recipe, inPath, outPath], + timeoutMs: params.timeoutMs ?? 
5000, + }); + if (!result.ok) { + return { ok: false, reason: "transcoder-failed", detail: result.detail }; + } + return { ok: true, buffer: readFileSync(outPath) }; + } catch (err) { + return { ok: false, reason: "transcoder-failed", detail: (err as Error).message }; + } finally { + try { + rmSync(tmpDir, { recursive: true, force: true }); + } catch { + // best-effort cleanup + } + } +} + +function normalizeExt(ext: string): string | undefined { + // Pattern matches the sibling helper in src/media/audio-transcode.ts: a short + // alphanumeric extension token. Keeps the value safe to interpolate into + // tmp-file names below without introducing a path-traversal surface. + const trimmed = ext.trim().toLowerCase().replace(/^\./, ""); + return /^[a-z0-9]{1,12}$/.test(trimmed) ? trimmed : undefined; +} + +function pickAfconvertRecipe(source: string, target: string): string[] | undefined { + // Only a CAF target has a recipe, and `source` is not inspected (MP3→CAF for BlueBubbles voice memos today). Keep + // this in lockstep with `HostTranscodableContainer` above and `PreferredAudioFileFormat` in + // src/channels/plugins/types.core.ts so a typed channel declaration cannot silently no-op. + if (target === "caf") { + // Opus-in-CAF, mono, 24 kHz. Validated against macOS 15.x Messages.app's + // native voice-memo CAF descriptor (1 ch, 24000 Hz, opus); other CAF + // flavors (PCM, AAC) get downgraded to plain audio attachments along the + // BlueBubbles → Messages.app path. If iMessage stops rendering the result + // as a voice memo after a system update, try forcing frames-per-packet + // explicitly via `opus@24000#480` and re-validate. See #72506. 
+ return ["-f", "caff", "-d", "opus@24000", "-c", "1"]; + } + return undefined; +} + +function runAfconvert(params: { + args: string[]; + timeoutMs: number; +}): Promise<{ ok: true } | { ok: false; detail: string }> { + return new Promise((resolve) => { + const child = spawn("/usr/bin/afconvert", params.args, { stdio: "ignore" }); + const timer = setTimeout(() => { + child.kill("SIGKILL"); + resolve({ ok: false, detail: `timeout-${params.timeoutMs}ms` }); + }, params.timeoutMs); + child.once("error", (err) => { + clearTimeout(timer); + resolve({ ok: false, detail: err.message }); + }); + child.once("exit", (code) => { + clearTimeout(timer); + if (code === 0) { + resolve({ ok: true }); + } else { + resolve({ ok: false, detail: `exit-${code ?? "unknown"}` }); + } + }); + }); +} diff --git a/extensions/speech-core/src/tts.test.ts b/extensions/speech-core/src/tts.test.ts index d998dd56dde..6f9eca6a6be 100644 --- a/extensions/speech-core/src/tts.test.ts +++ b/extensions/speech-core/src/tts.test.ts @@ -31,6 +31,32 @@ const prepareSynthesisMock = vi.hoisted(() => const listSpeechProvidersMock = vi.hoisted(() => vi.fn()); const getSpeechProviderMock = vi.hoisted(() => vi.fn()); +const transcodeAudioBufferMock = vi.hoisted(() => + // Default off: most tests rely on the synthesized buffer reaching the + // channel unchanged. Tests that exercise the pre-transcode branch override + // per-call via `transcodeAudioBufferMock.mockResolvedValueOnce(...)`. + // Typed as the helper's full return shape so per-call overrides aren't + // narrowed to the default's literal. 
+ vi.fn< + () => Promise< + | { ok: true; buffer: Buffer } + | { + ok: false; + reason: + | "platform-unsupported" + | "invalid-extension" + | "noop-same-container" + | "no-recipe" + | "transcoder-failed"; + detail?: string; + } + > + >(async () => ({ ok: false, reason: "platform-unsupported" })), +); + +vi.mock("./audio-transcode.js", () => ({ + transcodeAudioBuffer: transcodeAudioBufferMock, +})); vi.mock("openclaw/plugin-sdk/channel-targets", () => ({ normalizeChannelId: (channel: string | undefined) => channel?.trim().toLowerCase() ?? null, @@ -40,6 +66,7 @@ vi.mock("openclaw/plugin-sdk/channel-targets", () => ({ return { synthesisTarget: "audio-file", audioFileFormats: ["mp3", "caf", "audio/mpeg", "audio/x-caf"], + preferAudioFileFormat: "caf", }; } if (normalized === "feishu" || normalized === "whatsapp") { @@ -171,6 +198,7 @@ describe("speech-core native voice-note routing", () => { clearRuntimeConfigSnapshot(); synthesizeMock.mockClear(); prepareSynthesisMock.mockClear(); + transcodeAudioBufferMock.mockClear(); installSpeechProviders([createMockSpeechProvider()]); }); @@ -220,6 +248,57 @@ describe("speech-core native voice-note routing", () => { }); }); + it("pre-transcodes BlueBubbles synthesized mp3 to opus-in-CAF when the host can satisfy preferAudioFileFormat", async () => { + transcodeAudioBufferMock.mockResolvedValueOnce({ + ok: true, + buffer: Buffer.from("transcoded-caf"), + }); + await expectTtsPayloadResult({ + channel: "bluebubbles", + prefsName: "openclaw-speech-core-tts-bluebubbles-caf-transcode-test", + text: "This BlueBubbles reply should be pre-transcoded to a native voice-memo CAF.", + target: "audio-file", + audioAsVoice: true, + mediaExtension: "caf", + providerResult: { + audioBuffer: Buffer.from("mp3"), + outputFormat: "mp3", + fileExtension: ".mp3", + voiceCompatible: false, + }, + }); + expect(transcodeAudioBufferMock).toHaveBeenCalledWith( + expect.objectContaining({ sourceExtension: "mp3", targetExtension: "caf" }), + ); + }); + + 
it("falls back to the original mp3 buffer when the host transcoder fails", async () => { + transcodeAudioBufferMock.mockResolvedValueOnce({ + ok: false, + reason: "transcoder-failed", + detail: "exit-1", + }); + // Even though the transcode failed, the original mp3 still satisfies + // BlueBubbles' audioFileFormats list, so the channel still flips + // audioAsVoice. The user gets the v2026.4.26 PCM-CAF behavior (a voice + // memo bubble, possibly with bad duration) instead of a regression — and + // the failure is logged via the call site in tts.ts so it isn't silent. + await expectTtsPayloadResult({ + channel: "bluebubbles", + prefsName: "openclaw-speech-core-tts-bluebubbles-caf-fallback-test", + text: "This BlueBubbles reply should fall back to the original mp3.", + target: "audio-file", + audioAsVoice: true, + mediaExtension: "mp3", + providerResult: { + audioBuffer: Buffer.from("mp3"), + outputFormat: "mp3", + fileExtension: ".mp3", + voiceCompatible: false, + }, + }); + }); + it("uses the active runtime snapshot when source config still contains TTS SecretRefs", async () => { const sourceConfig = { messages: { diff --git a/extensions/speech-core/src/tts.ts b/extensions/speech-core/src/tts.ts index 2a3a883ecb4..eb821d2f015 100644 --- a/extensions/speech-core/src/tts.ts +++ b/extensions/speech-core/src/tts.ts @@ -57,6 +57,7 @@ import { type TtsDirectiveParseResult, type TtsConfigResolutionContext, } from "../api.js"; +import { transcodeAudioBuffer } from "./audio-transcode.js"; export type { ResolvedTtsConfig, @@ -1111,11 +1112,27 @@ export async function textToSpeech(params: { }; } + let audioBuffer = synthesis.audioBuffer; + let fileExtension = synthesis.fileExtension; + let outputFormat = synthesis.outputFormat; + const transcoded = await maybePreTranscodeForVoiceDelivery({ + channel: params.channel, + target: synthesis.target, + audioBuffer, + fileExtension, + outputFormat, + }); + if (transcoded) { + audioBuffer = transcoded.audioBuffer; + fileExtension = 
transcoded.fileExtension; + outputFormat = transcoded.outputFormat; + } + const tempRoot = resolvePreferredOpenClawTmpDir(); mkdirSync(tempRoot, { recursive: true, mode: 0o700 }); const tempDir = mkdtempSync(path.join(tempRoot, "tts-")); - const audioPath = path.join(tempDir, `voice-${Date.now()}${synthesis.fileExtension}`); - writeFileSync(audioPath, synthesis.audioBuffer); + const audioPath = path.join(tempDir, `voice-${Date.now()}${fileExtension}`); + writeFileSync(audioPath, audioBuffer); scheduleCleanup(tempDir); return { @@ -1127,19 +1144,64 @@ export async function textToSpeech(params: { fallbackFrom: synthesis.fallbackFrom, attemptedProviders: synthesis.attemptedProviders, attempts: synthesis.attempts, - outputFormat: synthesis.outputFormat, + outputFormat, voiceCompatible: synthesis.voiceCompatible, audioAsVoice: shouldDeliverTtsAsVoice({ channel: params.channel, target: synthesis.target, voiceCompatible: synthesis.voiceCompatible, - fileExtension: synthesis.fileExtension, - outputFormat: synthesis.outputFormat, + fileExtension, + outputFormat, }), target: synthesis.target, }; } +async function maybePreTranscodeForVoiceDelivery(params: { + channel: string | undefined; + target: "audio-file" | "voice-note" | undefined; + audioBuffer: Buffer; + fileExtension: string; + outputFormat?: string; +}): Promise<{ audioBuffer: Buffer; fileExtension: string; outputFormat?: string } | undefined> { + if (params.target !== "audio-file") { + return undefined; + } + const delivery = resolveChannelTtsVoiceDelivery(params.channel); + const preferred = delivery?.preferAudioFileFormat?.trim().toLowerCase(); + if (!preferred) { + return undefined; + } + const sourceExt = params.fileExtension.trim().toLowerCase().replace(/^\./, ""); + if (sourceExt === preferred) { + return undefined; + } + const outcome = await transcodeAudioBuffer({ + audioBuffer: params.audioBuffer, + sourceExtension: sourceExt, + targetExtension: preferred, + }); + if (!outcome.ok) { + if (outcome.reason 
=== "transcoder-failed") { + // Surface only the case where the host actually attempted the transcode + // and it broke. The other reasons ("no-recipe", "noop-same-container", + // "platform-unsupported", "invalid-extension") are by-design skips and + // would just be log noise. This is the line that tells you "the channel + // asked for a pre-encode, the host had a recipe for it, and it failed" + // — i.e. the case where #72506 silently regresses. + logVerbose( + `TTS: pre-transcode ${sourceExt}->${preferred} for channel=${params.channel ?? "?"} failed: ${outcome.detail ?? "unknown"}`, + ); + } + return undefined; + } + return { + audioBuffer: outcome.buffer, + fileExtension: `.${preferred}`, + outputFormat: preferred, + }; +} + export async function synthesizeSpeech(params: { text: string; cfg: OpenClawConfig; diff --git a/src/channels/plugins/types.core.ts b/src/channels/plugins/types.core.ts index 8433d65163b..d99b371c411 100644 --- a/src/channels/plugins/types.core.ts +++ b/src/channels/plugins/types.core.ts @@ -273,10 +273,29 @@ export type ChannelGroupContext = { }; +/** + * Container tokens (file-extension shape, no leading dot) that the host + * speech-core pipeline knows how to pre-transcode synthesized audio into. + * Channels that benefit from a specific container — currently only + * BlueBubbles, which needs Apple's native voice-memo CAF descriptor — name + * one here. Adding a new entry requires extending the host transcoder + * recipe table in lockstep so a typed declaration cannot silently no-op. + */ +export type PreferredAudioFileFormat = "caf"; + /** TTS voice delivery behavior advertised by a channel plugin. */ export type ChannelTtsVoiceDeliveryCapabilities = { synthesisTarget: "audio-file" | "voice-note"; transcodesAudio?: boolean; audioFileFormats?: readonly string[]; + /** + * Optional preferred audio container the channel wants for voice-memo + * delivery. When set and the host can transcode (e.g. 
`afconvert` on + * macOS), the TTS pipeline pre-encodes synthesized audio to this format + * before handing it to the channel. Useful for channels (such as + * BlueBubbles) whose downstream attempts its own container conversion + * that races against the upload write and fails. + */ + preferAudioFileFormat?: PreferredAudioFileFormat; }; /** Static capability flags advertised by a channel plugin. */ diff --git a/src/media/mime.test.ts b/src/media/mime.test.ts index 40fa2783941..192abff1727 100644 --- a/src/media/mime.test.ts +++ b/src/media/mime.test.ts @@ -118,6 +118,23 @@ describe("mime detection", () => { expect(mime).toBe("audio/aac"); }); + it("detects Apple CAF audio by magic bytes when file-type does not recognize the container", async () => { + // CAF files start with the four-byte ASCII tag "caff". `file-type` v22 has + // no native CAF detector, so without the manual magic-byte fallback the + // host-local-media validator drops `afconvert`-produced voice-memo CAFs as + // unknown binary blobs. Regression guard for the BlueBubbles voice-memo + // pre-transcode path. 
+ const buf = Buffer.concat([Buffer.from("caff", "ascii"), Buffer.alloc(60)]); + const mime = await detectMime({ buffer: buf }); + expect(mime).toBe("audio/x-caf"); + }); + + it("returns audio/x-caf when extension and CAF magic bytes both agree", async () => { + const buf = Buffer.concat([Buffer.from("caff", "ascii"), Buffer.alloc(60)]); + const mime = await detectMime({ buffer: buf, filePath: "/tmp/voice.caf" }); + expect(mime).toBe("audio/x-caf"); + }); + it("caps dependency sniffing to a bounded prefix", () => { const small = Buffer.alloc(32); const large = Buffer.alloc(FILE_TYPE_SNIFF_MAX_BYTES + 16); diff --git a/src/media/mime.ts b/src/media/mime.ts index 31fc3016ce0..861a582607d 100644 --- a/src/media/mime.ts +++ b/src/media/mime.ts @@ -20,6 +20,7 @@ const EXT_BY_MIME: Record = { "audio/opus": ".opus", "audio/x-m4a": ".m4a", "audio/mp4": ".m4a", + "audio/x-caf": ".caf", "video/mp4": ".mp4", "video/quicktime": ".mov", "application/pdf": ".pdf", @@ -91,10 +92,25 @@ async function sniffMime(buffer?: Buffer): Promise { fileTypeModulePromise ??= import("file-type"); const { fileTypeFromBuffer } = await fileTypeModulePromise; const type = await fileTypeFromBuffer(sliceMimeSniffBuffer(buffer)); - return type?.mime ?? undefined; + if (type?.mime) { + return type.mime; + } } catch { - return undefined; + // fall through to manual magic-byte sniffs } + return sniffKnownAudioMagic(buffer); +} + +// Fallbacks for audio containers `file-type` doesn't recognize natively (e.g. +// Apple's CAF, used by iMessage voice memos when produced by `afconvert`). +// Without this the host-local-media validator drops these buffers as unknown +// binary blobs because the sniff returns undefined, even though the file is +// a valid audio container. 
+function sniffKnownAudioMagic(buffer: Buffer): string | undefined { + if (buffer.byteLength >= 4 && buffer.toString("ascii", 0, 4) === "caff") { + return "audio/x-caf"; + } + return undefined; } export function getFileExtension(filePath?: string | null): string | undefined {