diff --git a/CHANGELOG.md b/CHANGELOG.md index 0759ac07689..899064a4889 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ Docs: https://docs.openclaw.ai - iOS/Gateway keychain hardening: move gateway metadata and TLS fingerprints to device keychain storage with safer migration behavior and rollback-safe writes to reduce credential loss risk during upgrades. (#33029) thanks @mbelinky. - iOS/Concurrency stability: replace risky shared-state access in camera and gateway connection paths with lock-protected access patterns to reduce crash risk under load. (#33241) thanks @mbelinky. - iOS/Security guardrails: limit production API-key sourcing to app config and make deep-link confirmation prompts safer by coalescing queued requests instead of silently dropping them. (#33031) thanks @mbelinky. +- iOS/TTS playback fallback: keep voice playback resilient by switching from PCM to MP3 when provider format support is unavailable, while avoiding sticky fallback on generic local playback errors. (#33032) thanks @mbelinky. - Telegram/multi-account default routing clarity: warn only for ambiguous (2+) account setups without an explicit default, add `openclaw doctor` warnings for missing/invalid multi-account defaults across channels, and document explicit-default guidance for channel routing and Telegram config. (#32544) thanks @Sid-Qin. - Telegram/plugin outbound hook parity: run `message_sending` + `message_sent` in Telegram reply delivery, include reply-path hook metadata (`mediaUrls`, `threadId`), and report `message_sent.success=false` when hooks blank text and no outbound message is delivered. (#32649) Thanks @KimGLee. - Agents/Skills runtime loading: propagate run config into embedded attempt and compaction skill-entry loading so explicitly enabled bundled companion skills are discovered consistently when skill snapshots do not already provide resolved entries. Thanks @gumadeiras. 
diff --git a/apps/ios/Sources/Voice/TalkModeManager.swift b/apps/ios/Sources/Voice/TalkModeManager.swift index 859c9e43566..01670d12980 100644 --- a/apps/ios/Sources/Voice/TalkModeManager.swift +++ b/apps/ios/Sources/Voice/TalkModeManager.swift @@ -7,6 +7,23 @@ import Observation import OSLog import Speech +private final class StreamFailureBox: @unchecked Sendable { + private let lock = NSLock() + private var valueInternal: Error? + + func set(_ error: Error) { + self.lock.lock() + self.valueInternal = error + self.lock.unlock() + } + + var value: Error? { + self.lock.lock() + defer { self.lock.unlock() } + return self.valueInternal + } +} + // This file intentionally centralizes talk mode state + behavior. // It's large, and splitting would force `private` -> `fileprivate` across many members. // We'll refactor into smaller files when the surface stabilizes. @@ -72,6 +89,9 @@ final class TalkModeManager: NSObject { private var mainSessionKey: String = "main" private var fallbackVoiceId: String? private var lastPlaybackWasPCM: Bool = false + /// Set when the ElevenLabs API rejects PCM format (e.g. 403 subscription_required). + /// Once set, requests that do not name an explicit output format default to MP3 instead of retrying PCM; `reloadConfig()` clears the flag. + private var pcmFormatUnavailable: Bool = false var pcmPlayer: PCMStreamingAudioPlaying = PCMStreamingAudioPlayer.shared var mp3Player: StreamingAudioPlaying = StreamingAudioPlayer.shared @@ -1007,7 +1027,8 @@ final class TalkModeManager: NSObject { let desiredOutputFormat = (directive?.outputFormat ?? self.defaultOutputFormat)? .trimmingCharacters(in: .whitespacesAndNewlines) let requestedOutputFormat = (desiredOutputFormat?.isEmpty == false) ? desiredOutputFormat : nil - let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(requestedOutputFormat ?? "pcm_44100") + let outputFormat = ElevenLabsTTSClient.validatedOutputFormat( + requestedOutputFormat ?? 
self.effectiveDefaultOutputFormat) if outputFormat == nil, let requestedOutputFormat { self.logger.warning( "talk output_format unsupported for local playback: \(requestedOutputFormat, privacy: .public)") @@ -1036,7 +1057,7 @@ final class TalkModeManager: NSObject { let request = makeRequest(outputFormat: outputFormat) let client = ElevenLabsTTSClient(apiKey: apiKey) - let stream = client.streamSynthesize(voiceId: voiceId, request: request) + let rawStream = client.streamSynthesize(voiceId: voiceId, request: request) if self.interruptOnSpeech { do { @@ -1051,11 +1072,16 @@ final class TalkModeManager: NSObject { let sampleRate = TalkTTSValidation.pcmSampleRate(from: outputFormat) let result: StreamingPlaybackResult if let sampleRate { + let streamFailure = StreamFailureBox() + let stream = Self.monitorStreamFailures(rawStream, failureBox: streamFailure) self.lastPlaybackWasPCM = true var playback = await self.pcmPlayer.play(stream: stream, sampleRate: sampleRate) if !playback.finished, playback.interruptedAt == nil { - let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100") + let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128") self.logger.warning("pcm playback failed; retrying mp3") + if Self.isPCMFormatRejectedByAPI(streamFailure.value) { + self.pcmFormatUnavailable = true + } self.lastPlaybackWasPCM = false let mp3Stream = client.streamSynthesize( voiceId: voiceId, @@ -1065,7 +1091,7 @@ final class TalkModeManager: NSObject { result = playback } else { self.lastPlaybackWasPCM = false - result = await self.mp3Player.play(stream: stream) + result = await self.mp3Player.play(stream: rawStream) } let duration = Date().timeIntervalSince(started) self.logger.info("elevenlabs stream finished=\(result.finished, privacy: .public) dur=\(duration, privacy: .public)s") @@ -1391,7 +1417,7 @@ final class TalkModeManager: NSObject { private func resolveIncrementalPrefetchOutputFormat(context: IncrementalSpeechContext) -> String? 
{ if TalkTTSValidation.pcmSampleRate(from: context.outputFormat) != nil { - return ElevenLabsTTSClient.validatedOutputFormat("mp3_44100") + return ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128") } return context.outputFormat } @@ -1480,7 +1506,8 @@ final class TalkModeManager: NSObject { let desiredOutputFormat = (directive?.outputFormat ?? self.defaultOutputFormat)? .trimmingCharacters(in: .whitespacesAndNewlines) let requestedOutputFormat = (desiredOutputFormat?.isEmpty == false) ? desiredOutputFormat : nil - let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(requestedOutputFormat ?? "pcm_44100") + let outputFormat = ElevenLabsTTSClient.validatedOutputFormat( + requestedOutputFormat ?? self.effectiveDefaultOutputFormat) if outputFormat == nil, let requestedOutputFormat { self.logger.warning( "talk output_format unsupported for local playback: \(requestedOutputFormat, privacy: .public)") @@ -1534,6 +1561,44 @@ final class TalkModeManager: NSObject { latencyTier: TalkTTSValidation.validatedLatencyTier(context.directive?.latencyTier)) } + /// Returns `mp3_44100_128` when the API has already rejected PCM, otherwise `pcm_44100`. + private var effectiveDefaultOutputFormat: String { + self.pcmFormatUnavailable ? "mp3_44100_128" : "pcm_44100" + } + + private static func monitorStreamFailures( + _ stream: AsyncThrowingStream<Data, Error>, + failureBox: StreamFailureBox + ) -> AsyncThrowingStream<Data, Error> + { + AsyncThrowingStream { continuation in + let task = Task { + do { + for try await chunk in stream { + continuation.yield(chunk) + } + continuation.finish() + } catch { + failureBox.set(error) + continuation.finish(throwing: error) + } + } + continuation.onTermination = { _ in + task.cancel() + } + } + } + + private static func isPCMFormatRejectedByAPI(_ error: Error?) -> Bool { + guard let error = error as NSError? 
else { return false } + guard error.domain == "ElevenLabsTTS", error.code >= 400 else { return false } + let message = (error.userInfo[NSLocalizedDescriptionKey] as? String ?? error.localizedDescription).lowercased() + return message.contains("output_format") + || message.contains("pcm_") + || message.contains("pcm ") + || message.contains("subscription_required") + } + private static func makeBufferedAudioStream(chunks: [Data]) -> AsyncThrowingStream<Data, Error> { AsyncThrowingStream { continuation in for chunk in chunks { @@ -1575,22 +1640,27 @@ final class TalkModeManager: NSObject { text: text, context: context, outputFormat: context.outputFormat) - let stream: AsyncThrowingStream<Data, Error> + let rawStream: AsyncThrowingStream<Data, Error> if let prefetchedAudio, !prefetchedAudio.chunks.isEmpty { - stream = Self.makeBufferedAudioStream(chunks: prefetchedAudio.chunks) + rawStream = Self.makeBufferedAudioStream(chunks: prefetchedAudio.chunks) } else { - stream = client.streamSynthesize(voiceId: voiceId, request: request) + rawStream = client.streamSynthesize(voiceId: voiceId, request: request) } let playbackFormat = prefetchedAudio?.outputFormat ?? 
context.outputFormat let sampleRate = TalkTTSValidation.pcmSampleRate(from: playbackFormat) let result: StreamingPlaybackResult if let sampleRate { + let streamFailure = StreamFailureBox() + let stream = Self.monitorStreamFailures(rawStream, failureBox: streamFailure) self.lastPlaybackWasPCM = true var playback = await self.pcmPlayer.play(stream: stream, sampleRate: sampleRate) if !playback.finished, playback.interruptedAt == nil { self.logger.warning("pcm playback failed; retrying mp3") + if Self.isPCMFormatRejectedByAPI(streamFailure.value) { + self.pcmFormatUnavailable = true + } self.lastPlaybackWasPCM = false - let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100") + let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128") let mp3Stream = client.streamSynthesize( voiceId: voiceId, request: self.makeIncrementalTTSRequest( @@ -1602,7 +1672,7 @@ final class TalkModeManager: NSObject { result = playback } else { self.lastPlaybackWasPCM = false - result = await self.mp3Player.play(stream: stream) + result = await self.mp3Player.play(stream: rawStream) } if !result.finished, let interruptedAt = result.interruptedAt { self.lastInterruptedAtSeconds = interruptedAt @@ -1926,6 +1996,7 @@ extension TalkModeManager { func reloadConfig() async { guard let gateway else { return } + self.pcmFormatUnavailable = false do { let res = try await gateway.request( method: "talk.config", @@ -2105,6 +2176,10 @@ private final class AudioTapDiagnostics: @unchecked Sendable { #if DEBUG extension TalkModeManager { + static func _test_isPCMFormatRejectedByAPI(_ error: Error?) 
-> Bool { + self.isPCMFormatRejectedByAPI(error) + } + func _test_seedTranscript(_ transcript: String) { self.lastTranscript = transcript self.lastHeard = Date() diff --git a/apps/ios/Tests/TalkModeConfigParsingTests.swift b/apps/ios/Tests/TalkModeConfigParsingTests.swift index fd6b535f8a3..a09f095a233 100644 --- a/apps/ios/Tests/TalkModeConfigParsingTests.swift +++ b/apps/ios/Tests/TalkModeConfigParsingTests.swift @@ -1,3 +1,4 @@ +import Foundation import Testing @testable import OpenClaw @@ -28,4 +29,22 @@ import Testing let selection = TalkModeManager.selectTalkProviderConfig(talk) #expect(selection == nil) } + + @Test func detectsPCMFormatRejectionFromElevenLabsError() { + let error = NSError( + domain: "ElevenLabsTTS", + code: 403, + userInfo: [ + NSLocalizedDescriptionKey: "ElevenLabs failed: 403 subscription_required output_format=pcm_44100", + ]) + #expect(TalkModeManager._test_isPCMFormatRejectedByAPI(error)) + } + + @Test func ignoresGenericPlaybackFailuresForPCMFormatRejection() { + let error = NSError( + domain: "StreamingAudio", + code: -1, + userInfo: [NSLocalizedDescriptionKey: "queue enqueue failed"]) + #expect(TalkModeManager._test_isPCMFormatRejectedByAPI(error) == false) + } }