mirror of
https://github.com/openclaw/openclaw.git
synced 2026-03-23 07:51:33 +00:00
fix(ios): auto-fallback from PCM to MP3 for ElevenLabs TTS
The default output format pcm_44100 requires an ElevenLabs Pro tier subscription. Users on free or starter plans get a silent 403 failure and hear no audio. Instead of hardcoding mp3, keep pcm_44100 as the default (better quality for Pro users) but remember the failure: when a PCM request is rejected, set pcmFormatUnavailable and use mp3_44100_128 for all subsequent requests in the session. The flag resets on config reload so it re-probes after reconnection. Also standardize the MP3 fallback format from mp3_44100 to mp3_44100_128 for consistent bitrate. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -72,6 +72,9 @@ final class TalkModeManager: NSObject {
|
||||
private var mainSessionKey: String = "main"
|
||||
private var fallbackVoiceId: String?
|
||||
private var lastPlaybackWasPCM: Bool = false
|
||||
/// Set when PCM playback fails without being interrupted (e.g. the ElevenLabs API rejects PCM with 403 subscription_required).
|
||||
/// Once set, requests that do not specify an explicit output format default to MP3 instead of re-trying PCM; cleared again when the gateway talk config is reset.
|
||||
private var pcmFormatUnavailable: Bool = false
|
||||
var pcmPlayer: PCMStreamingAudioPlaying = PCMStreamingAudioPlayer.shared
|
||||
var mp3Player: StreamingAudioPlaying = StreamingAudioPlayer.shared
|
||||
|
||||
@@ -1004,7 +1007,8 @@ final class TalkModeManager: NSObject {
|
||||
let desiredOutputFormat = (directive?.outputFormat ?? self.defaultOutputFormat)?
|
||||
.trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
let requestedOutputFormat = (desiredOutputFormat?.isEmpty == false) ? desiredOutputFormat : nil
|
||||
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(requestedOutputFormat ?? "pcm_44100")
|
||||
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(
|
||||
requestedOutputFormat ?? self.effectiveDefaultOutputFormat)
|
||||
if outputFormat == nil, let requestedOutputFormat {
|
||||
self.logger.warning(
|
||||
"talk output_format unsupported for local playback: \(requestedOutputFormat, privacy: .public)")
|
||||
@@ -1051,8 +1055,9 @@ final class TalkModeManager: NSObject {
|
||||
self.lastPlaybackWasPCM = true
|
||||
var playback = await self.pcmPlayer.play(stream: stream, sampleRate: sampleRate)
|
||||
if !playback.finished, playback.interruptedAt == nil {
|
||||
let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100")
|
||||
let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128")
|
||||
self.logger.warning("pcm playback failed; retrying mp3")
|
||||
self.pcmFormatUnavailable = true
|
||||
self.lastPlaybackWasPCM = false
|
||||
let mp3Stream = client.streamSynthesize(
|
||||
voiceId: voiceId,
|
||||
@@ -1388,7 +1393,7 @@ final class TalkModeManager: NSObject {
|
||||
|
||||
/// Chooses the output format for an incremental prefetch request.
///
/// When `context.outputFormat` parses as a PCM format (that is,
/// `TalkTTSValidation.pcmSampleRate(from:)` yields a sample rate), an MP3 format
/// validated by `ElevenLabsTTSClient.validatedOutputFormat` is returned instead;
/// any other format is passed through unchanged.
///
/// NOTE(review): two consecutive `return` statements appear in the PCM branch
/// (`mp3_44100` then `mp3_44100_128`). This looks like the removed and added lines
/// of a diff shown together — confirm that only the `mp3_44100_128` line is live.
///
/// - Parameter context: The incremental speech context whose `outputFormat` is inspected.
/// - Returns: The format to use for prefetching, or `nil` if validation rejects it
///   or the context carries no format.
private func resolveIncrementalPrefetchOutputFormat(context: IncrementalSpeechContext) -> String? {
|
||||
if TalkTTSValidation.pcmSampleRate(from: context.outputFormat) != nil {
|
||||
return ElevenLabsTTSClient.validatedOutputFormat("mp3_44100")
|
||||
return ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128")
|
||||
}
|
||||
return context.outputFormat
|
||||
}
|
||||
@@ -1474,7 +1479,8 @@ final class TalkModeManager: NSObject {
|
||||
let desiredOutputFormat = (directive?.outputFormat ?? self.defaultOutputFormat)?
|
||||
.trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
let requestedOutputFormat = (desiredOutputFormat?.isEmpty == false) ? desiredOutputFormat : nil
|
||||
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(requestedOutputFormat ?? "pcm_44100")
|
||||
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(
|
||||
requestedOutputFormat ?? self.effectiveDefaultOutputFormat)
|
||||
if outputFormat == nil, let requestedOutputFormat {
|
||||
self.logger.warning(
|
||||
"talk output_format unsupported for local playback: \(requestedOutputFormat, privacy: .public)")
|
||||
@@ -1525,6 +1531,11 @@ final class TalkModeManager: NSObject {
|
||||
latencyTier: TalkTTSValidation.validatedLatencyTier(context.directive?.latencyTier))
|
||||
}
|
||||
|
||||
/// Default output format applied when a request does not name one explicitly.
/// Evaluates to `mp3_44100_128` once `pcmFormatUnavailable` has been set, otherwise `pcm_44100`.
|
||||
private var effectiveDefaultOutputFormat: String {
|
||||
self.pcmFormatUnavailable ? "mp3_44100_128" : "pcm_44100"
|
||||
}
|
||||
|
||||
private static func makeBufferedAudioStream(chunks: [Data]) -> AsyncThrowingStream<Data, Error> {
|
||||
AsyncThrowingStream { continuation in
|
||||
for chunk in chunks {
|
||||
@@ -1580,8 +1591,9 @@ final class TalkModeManager: NSObject {
|
||||
var playback = await self.pcmPlayer.play(stream: stream, sampleRate: sampleRate)
|
||||
if !playback.finished, playback.interruptedAt == nil {
|
||||
self.logger.warning("pcm playback failed; retrying mp3")
|
||||
self.pcmFormatUnavailable = true
|
||||
self.lastPlaybackWasPCM = false
|
||||
let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100")
|
||||
let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128")
|
||||
let mp3Stream = client.streamSynthesize(
|
||||
voiceId: voiceId,
|
||||
request: self.makeIncrementalTTSRequest(
|
||||
@@ -1991,6 +2003,7 @@ extension TalkModeManager {
|
||||
self.gatewayTalkDefaultModelId = nil
|
||||
self.gatewayTalkApiKeyConfigured = false
|
||||
self.gatewayTalkConfigLoaded = false
|
||||
self.pcmFormatUnavailable = false
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user