fix(ios): auto-fallback from PCM to MP3 for ElevenLabs TTS

The default output format pcm_44100 requires an ElevenLabs Pro tier
subscription. Users on free or starter plans get a silent 403 failure
and hear no audio.

Instead of hardcoding MP3, keep pcm_44100 as the default (better
quality for Pro users) but remember the failure: when PCM playback
fails (e.g. because the PCM request is rejected), set
pcmFormatUnavailable and default to mp3_44100_128 for all subsequent
requests in the session that do not specify an output format. The flag
resets on config reload, so PCM is re-probed after reconnection.

Also standardize the MP3 fallback format from mp3_44100 to
mp3_44100_128 for consistent bitrate.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
libokai
2026-03-02 02:27:59 +08:00
parent da0ba1b73a
commit fbc26ef9f3

View File

@@ -72,6 +72,9 @@ final class TalkModeManager: NSObject {
private var mainSessionKey: String = "main"
private var fallbackVoiceId: String?
private var lastPlaybackWasPCM: Bool = false
/// Remembers that PCM output did not work in this session.
/// Set to `true` when a PCM playback attempt ends without finishing and without being
/// interrupted, which is how a rejected PCM request (e.g. 403 subscription_required)
/// surfaces here. While set, `effectiveDefaultOutputFormat` yields `mp3_44100_128`
/// instead of `pcm_44100`, so requests with no explicit output format skip PCM.
/// Reset to `false` together with the gateway talk config state.
/// NOTE(review): any non-interrupted PCM playback failure sets this flag, not only an
/// API rejection — confirm that is intended.
private var pcmFormatUnavailable: Bool = false
var pcmPlayer: PCMStreamingAudioPlaying = PCMStreamingAudioPlayer.shared
var mp3Player: StreamingAudioPlaying = StreamingAudioPlayer.shared
@@ -1004,7 +1007,8 @@ final class TalkModeManager: NSObject {
let desiredOutputFormat = (directive?.outputFormat ?? self.defaultOutputFormat)?
.trimmingCharacters(in: .whitespacesAndNewlines)
let requestedOutputFormat = (desiredOutputFormat?.isEmpty == false) ? desiredOutputFormat : nil
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(requestedOutputFormat ?? "pcm_44100")
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(
requestedOutputFormat ?? self.effectiveDefaultOutputFormat)
if outputFormat == nil, let requestedOutputFormat {
self.logger.warning(
"talk output_format unsupported for local playback: \(requestedOutputFormat, privacy: .public)")
@@ -1051,8 +1055,9 @@ final class TalkModeManager: NSObject {
self.lastPlaybackWasPCM = true
var playback = await self.pcmPlayer.play(stream: stream, sampleRate: sampleRate)
if !playback.finished, playback.interruptedAt == nil {
let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100")
let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128")
self.logger.warning("pcm playback failed; retrying mp3")
self.pcmFormatUnavailable = true
self.lastPlaybackWasPCM = false
let mp3Stream = client.streamSynthesize(
voiceId: voiceId,
@@ -1388,7 +1393,7 @@ final class TalkModeManager: NSObject {
/// Picks the output format for an incremental prefetch request.
/// When `context.outputFormat` is recognized as PCM (`TalkTTSValidation.pcmSampleRate(from:)`
/// returns non-nil), the result of validating an MP3 format string is returned in its
/// place; any other format is passed through unchanged.
/// - Parameter context: Incremental speech context whose `outputFormat` is inspected.
/// - Returns: The format to use for prefetching; optional because both the validator
///   result and `context.outputFormat` are returned as-is.
private func resolveIncrementalPrefetchOutputFormat(context: IncrementalSpeechContext) -> String? {
if TalkTTSValidation.pcmSampleRate(from: context.outputFormat) != nil {
// NOTE(review): this view lists both the previous `mp3_44100` return and its
// replacement `mp3_44100_128`; as written only the first is reachable. Presumably
// only the `mp3_44100_128` line remains in the actual file — confirm.
return ElevenLabsTTSClient.validatedOutputFormat("mp3_44100")
return ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128")
}
return context.outputFormat
}
@@ -1474,7 +1479,8 @@ final class TalkModeManager: NSObject {
let desiredOutputFormat = (directive?.outputFormat ?? self.defaultOutputFormat)?
.trimmingCharacters(in: .whitespacesAndNewlines)
let requestedOutputFormat = (desiredOutputFormat?.isEmpty == false) ? desiredOutputFormat : nil
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(requestedOutputFormat ?? "pcm_44100")
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(
requestedOutputFormat ?? self.effectiveDefaultOutputFormat)
if outputFormat == nil, let requestedOutputFormat {
self.logger.warning(
"talk output_format unsupported for local playback: \(requestedOutputFormat, privacy: .public)")
@@ -1525,6 +1531,11 @@ final class TalkModeManager: NSObject {
latencyTier: TalkTTSValidation.validatedLatencyTier(context.directive?.latencyTier))
}
/// Default output format for requests that do not name one themselves:
/// `pcm_44100` until `pcmFormatUnavailable` has been set, `mp3_44100_128` afterwards.
private var effectiveDefaultOutputFormat: String {
    guard self.pcmFormatUnavailable else {
        return "pcm_44100"
    }
    return "mp3_44100_128"
}
private static func makeBufferedAudioStream(chunks: [Data]) -> AsyncThrowingStream<Data, Error> {
AsyncThrowingStream { continuation in
for chunk in chunks {
@@ -1580,8 +1591,9 @@ final class TalkModeManager: NSObject {
var playback = await self.pcmPlayer.play(stream: stream, sampleRate: sampleRate)
if !playback.finished, playback.interruptedAt == nil {
self.logger.warning("pcm playback failed; retrying mp3")
self.pcmFormatUnavailable = true
self.lastPlaybackWasPCM = false
let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100")
let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128")
let mp3Stream = client.streamSynthesize(
voiceId: voiceId,
request: self.makeIncrementalTTSRequest(
@@ -1991,6 +2003,7 @@ extension TalkModeManager {
self.gatewayTalkDefaultModelId = nil
self.gatewayTalkApiKeyConfigured = false
self.gatewayTalkConfigLoaded = false
self.pcmFormatUnavailable = false
}
}