From fbc26ef9f3d5e7ac63f64e15eaa46f9d51e746e5 Mon Sep 17 00:00:00 2001 From: libokai Date: Mon, 2 Mar 2026 02:27:59 +0800 Subject: [PATCH] fix(ios): auto-fallback from PCM to MP3 for ElevenLabs TTS The default output format pcm_44100 requires an ElevenLabs Pro tier subscription. Users on free or starter plans get a silent 403 failure and hear no audio. Instead of hardcoding mp3, keep pcm_44100 as the default (better quality for Pro users) but remember the failure: when PCM playback fails without being interrupted (e.g. because the PCM request is rejected), set pcmFormatUnavailable and use mp3_44100_128 as the default for subsequent requests in the session that do not specify an output format. The flag resets on config reload so it re-probes after reconnection. Also standardize the MP3 fallback format from mp3_44100 to mp3_44100_128 for consistent bitrate. Co-Authored-By: Claude Opus 4.6 --- apps/ios/Sources/Voice/TalkModeManager.swift | 23 +++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/apps/ios/Sources/Voice/TalkModeManager.swift b/apps/ios/Sources/Voice/TalkModeManager.swift index 0f8a7e6461b..943bea38078 100644 --- a/apps/ios/Sources/Voice/TalkModeManager.swift +++ b/apps/ios/Sources/Voice/TalkModeManager.swift @@ -72,6 +72,9 @@ final class TalkModeManager: NSObject { private var mainSessionKey: String = "main" private var fallbackVoiceId: String? private var lastPlaybackWasPCM: Bool = false + /// Set when the ElevenLabs API rejects PCM format (e.g. 403 subscription_required). + /// Once set, all subsequent requests in this session use MP3 instead of re-trying PCM. + private var pcmFormatUnavailable: Bool = false var pcmPlayer: PCMStreamingAudioPlaying = PCMStreamingAudioPlayer.shared var mp3Player: StreamingAudioPlaying = StreamingAudioPlayer.shared @@ -1004,7 +1007,8 @@ final class TalkModeManager: NSObject { let desiredOutputFormat = (directive?.outputFormat ?? self.defaultOutputFormat)? .trimmingCharacters(in: .whitespacesAndNewlines) let requestedOutputFormat = (desiredOutputFormat?.isEmpty == false) ? 
desiredOutputFormat : nil - let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(requestedOutputFormat ?? "pcm_44100") + let outputFormat = ElevenLabsTTSClient.validatedOutputFormat( + requestedOutputFormat ?? self.effectiveDefaultOutputFormat) if outputFormat == nil, let requestedOutputFormat { self.logger.warning( "talk output_format unsupported for local playback: \(requestedOutputFormat, privacy: .public)") @@ -1051,8 +1055,9 @@ final class TalkModeManager: NSObject { self.lastPlaybackWasPCM = true var playback = await self.pcmPlayer.play(stream: stream, sampleRate: sampleRate) if !playback.finished, playback.interruptedAt == nil { - let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100") + let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128") self.logger.warning("pcm playback failed; retrying mp3") + self.pcmFormatUnavailable = true self.lastPlaybackWasPCM = false let mp3Stream = client.streamSynthesize( voiceId: voiceId, @@ -1388,7 +1393,7 @@ final class TalkModeManager: NSObject { private func resolveIncrementalPrefetchOutputFormat(context: IncrementalSpeechContext) -> String? { if TalkTTSValidation.pcmSampleRate(from: context.outputFormat) != nil { - return ElevenLabsTTSClient.validatedOutputFormat("mp3_44100") + return ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128") } return context.outputFormat } @@ -1474,7 +1479,8 @@ final class TalkModeManager: NSObject { let desiredOutputFormat = (directive?.outputFormat ?? self.defaultOutputFormat)? .trimmingCharacters(in: .whitespacesAndNewlines) let requestedOutputFormat = (desiredOutputFormat?.isEmpty == false) ? desiredOutputFormat : nil - let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(requestedOutputFormat ?? "pcm_44100") + let outputFormat = ElevenLabsTTSClient.validatedOutputFormat( + requestedOutputFormat ?? 
self.effectiveDefaultOutputFormat) if outputFormat == nil, let requestedOutputFormat { self.logger.warning( "talk output_format unsupported for local playback: \(requestedOutputFormat, privacy: .public)") @@ -1525,6 +1531,11 @@ final class TalkModeManager: NSObject { latencyTier: TalkTTSValidation.validatedLatencyTier(context.directive?.latencyTier)) } + /// Returns `mp3_44100_128` when the API has already rejected PCM, otherwise `pcm_44100`. + private var effectiveDefaultOutputFormat: String { + self.pcmFormatUnavailable ? "mp3_44100_128" : "pcm_44100" + } + private static func makeBufferedAudioStream(chunks: [Data]) -> AsyncThrowingStream { AsyncThrowingStream { continuation in for chunk in chunks { @@ -1580,8 +1591,9 @@ final class TalkModeManager: NSObject { var playback = await self.pcmPlayer.play(stream: stream, sampleRate: sampleRate) if !playback.finished, playback.interruptedAt == nil { self.logger.warning("pcm playback failed; retrying mp3") + self.pcmFormatUnavailable = true self.lastPlaybackWasPCM = false - let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100") + let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128") let mp3Stream = client.streamSynthesize( voiceId: voiceId, request: self.makeIncrementalTTSRequest( @@ -1991,6 +2003,7 @@ extension TalkModeManager { self.gatewayTalkDefaultModelId = nil self.gatewayTalkApiKeyConfigured = false self.gatewayTalkConfigLoaded = false + self.pcmFormatUnavailable = false } }