From ccd55d148016783da84cc4f1cdf0950acfcdb1f4 Mon Sep 17 00:00:00 2001 From: Mariano Belinky Date: Tue, 3 Mar 2026 14:35:15 +0100 Subject: [PATCH] fix(iOS): persist PCM fallback only on explicit API format rejection --- apps/ios/Sources/Voice/TalkModeManager.swift | 78 +++++++++++++++++-- .../Tests/TalkModeConfigParsingTests.swift | 19 +++++ 2 files changed, 89 insertions(+), 8 deletions(-) diff --git a/apps/ios/Sources/Voice/TalkModeManager.swift b/apps/ios/Sources/Voice/TalkModeManager.swift index 44ba5c86f19..01670d12980 100644 --- a/apps/ios/Sources/Voice/TalkModeManager.swift +++ b/apps/ios/Sources/Voice/TalkModeManager.swift @@ -7,6 +7,23 @@ import Observation import OSLog import Speech +private final class StreamFailureBox: @unchecked Sendable { + private let lock = NSLock() + private var valueInternal: Error? + + func set(_ error: Error) { + self.lock.lock() + self.valueInternal = error + self.lock.unlock() + } + + var value: Error? { + self.lock.lock() + defer { self.lock.unlock() } + return self.valueInternal + } +} + // This file intentionally centralizes talk mode state + behavior. // It's large, and splitting would force `private` -> `fileprivate` across many members. // We'll refactor into smaller files when the surface stabilizes. 
@@ -1040,7 +1057,7 @@ final class TalkModeManager: NSObject { let request = makeRequest(outputFormat: outputFormat) let client = ElevenLabsTTSClient(apiKey: apiKey) - let stream = client.streamSynthesize(voiceId: voiceId, request: request) + let rawStream = client.streamSynthesize(voiceId: voiceId, request: request) if self.interruptOnSpeech { do { @@ -1055,12 +1072,16 @@ final class TalkModeManager: NSObject { let sampleRate = TalkTTSValidation.pcmSampleRate(from: outputFormat) let result: StreamingPlaybackResult if let sampleRate { + let streamFailure = StreamFailureBox() + let stream = Self.monitorStreamFailures(rawStream, failureBox: streamFailure) self.lastPlaybackWasPCM = true var playback = await self.pcmPlayer.play(stream: stream, sampleRate: sampleRate) if !playback.finished, playback.interruptedAt == nil { let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128") self.logger.warning("pcm playback failed; retrying mp3") - self.pcmFormatUnavailable = true + if Self.isPCMFormatRejectedByAPI(streamFailure.value) { + self.pcmFormatUnavailable = true + } self.lastPlaybackWasPCM = false let mp3Stream = client.streamSynthesize( voiceId: voiceId, @@ -1070,7 +1091,7 @@ final class TalkModeManager: NSObject { result = playback } else { self.lastPlaybackWasPCM = false - result = await self.mp3Player.play(stream: stream) + result = await self.mp3Player.play(stream: rawStream) } let duration = Date().timeIntervalSince(started) self.logger.info("elevenlabs stream finished=\(result.finished, privacy: .public) dur=\(duration, privacy: .public)s") @@ -1545,6 +1566,39 @@ final class TalkModeManager: NSObject { self.pcmFormatUnavailable ? 
"mp3_44100_128" : "pcm_44100" } + private static func monitorStreamFailures( + _ stream: AsyncThrowingStream<Data, Error>, + failureBox: StreamFailureBox + ) -> AsyncThrowingStream<Data, Error> + { + AsyncThrowingStream { continuation in + let task = Task { + do { + for try await chunk in stream { + continuation.yield(chunk) + } + continuation.finish() + } catch { + failureBox.set(error) + continuation.finish(throwing: error) + } + } + continuation.onTermination = { _ in + task.cancel() + } + } + } + + private static func isPCMFormatRejectedByAPI(_ error: Error?) -> Bool { + guard let error = error as NSError? else { return false } + guard error.domain == "ElevenLabsTTS", error.code >= 400 else { return false } + let message = (error.userInfo[NSLocalizedDescriptionKey] as? String ?? error.localizedDescription).lowercased() + return message.contains("output_format") + || message.contains("pcm_") + || message.contains("pcm ") + || message.contains("subscription_required") + } + private static func makeBufferedAudioStream(chunks: [Data]) -> AsyncThrowingStream<Data, Error> { AsyncThrowingStream { continuation in for chunk in chunks { @@ -1586,21 +1640,25 @@ final class TalkModeManager: NSObject { text: text, context: context, outputFormat: context.outputFormat) - let stream: AsyncThrowingStream<Data, Error> + let rawStream: AsyncThrowingStream<Data, Error> if let prefetchedAudio, !prefetchedAudio.chunks.isEmpty { - stream = Self.makeBufferedAudioStream(chunks: prefetchedAudio.chunks) + rawStream = Self.makeBufferedAudioStream(chunks: prefetchedAudio.chunks) } else { - stream = client.streamSynthesize(voiceId: voiceId, request: request) + rawStream = client.streamSynthesize(voiceId: voiceId, request: request) } let playbackFormat = prefetchedAudio?.outputFormat ?? 
context.outputFormat let sampleRate = TalkTTSValidation.pcmSampleRate(from: playbackFormat) let result: StreamingPlaybackResult if let sampleRate { + let streamFailure = StreamFailureBox() + let stream = Self.monitorStreamFailures(rawStream, failureBox: streamFailure) self.lastPlaybackWasPCM = true var playback = await self.pcmPlayer.play(stream: stream, sampleRate: sampleRate) if !playback.finished, playback.interruptedAt == nil { self.logger.warning("pcm playback failed; retrying mp3") - self.pcmFormatUnavailable = true + if Self.isPCMFormatRejectedByAPI(streamFailure.value) { + self.pcmFormatUnavailable = true + } self.lastPlaybackWasPCM = false let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128") let mp3Stream = client.streamSynthesize( @@ -1614,7 +1672,7 @@ final class TalkModeManager: NSObject { result = playback } else { self.lastPlaybackWasPCM = false - result = await self.mp3Player.play(stream: stream) + result = await self.mp3Player.play(stream: rawStream) } if !result.finished, let interruptedAt = result.interruptedAt { self.lastInterruptedAtSeconds = interruptedAt @@ -2118,6 +2176,10 @@ private final class AudioTapDiagnostics: @unchecked Sendable { #if DEBUG extension TalkModeManager { + static func _test_isPCMFormatRejectedByAPI(_ error: Error?) 
-> Bool { + self.isPCMFormatRejectedByAPI(error) + } + func _test_seedTranscript(_ transcript: String) { self.lastTranscript = transcript self.lastHeard = Date() diff --git a/apps/ios/Tests/TalkModeConfigParsingTests.swift b/apps/ios/Tests/TalkModeConfigParsingTests.swift index fd6b535f8a3..a09f095a233 100644 --- a/apps/ios/Tests/TalkModeConfigParsingTests.swift +++ b/apps/ios/Tests/TalkModeConfigParsingTests.swift @@ -1,3 +1,4 @@ +import Foundation import Testing @testable import OpenClaw @@ -28,4 +29,22 @@ import Testing let selection = TalkModeManager.selectTalkProviderConfig(talk) #expect(selection == nil) } + + @Test func detectsPCMFormatRejectionFromElevenLabsError() { + let error = NSError( + domain: "ElevenLabsTTS", + code: 403, + userInfo: [ + NSLocalizedDescriptionKey: "ElevenLabs failed: 403 subscription_required output_format=pcm_44100", + ]) + #expect(TalkModeManager._test_isPCMFormatRejectedByAPI(error)) + } + + @Test func ignoresGenericPlaybackFailuresForPCMFormatRejection() { + let error = NSError( + domain: "StreamingAudio", + code: -1, + userInfo: [NSLocalizedDescriptionKey: "queue enqueue failed"]) + #expect(TalkModeManager._test_isPCMFormatRejectedByAPI(error) == false) + } }