iOS Security Stack 4/5: TTS PCM->MP3 Fallback (#30885) (#33032)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: f77e3d7644
Co-authored-by: mbelinky <132747814+mbelinky@users.noreply.github.com>
Co-authored-by: mbelinky <132747814+mbelinky@users.noreply.github.com>
Reviewed-by: @mbelinky
This commit is contained in:
Mariano
2026-03-03 16:33:55 +00:00
committed by GitHub
parent d493861c16
commit bf7061092a
3 changed files with 106 additions and 11 deletions

View File

@@ -26,6 +26,7 @@ Docs: https://docs.openclaw.ai
- iOS/Gateway keychain hardening: move gateway metadata and TLS fingerprints to device keychain storage with safer migration behavior and rollback-safe writes to reduce credential loss risk during upgrades. (#33029) thanks @mbelinky.
- iOS/Concurrency stability: replace risky shared-state access in camera and gateway connection paths with lock-protected access patterns to reduce crash risk under load. (#33241) thanks @mbelinky.
- iOS/Security guardrails: limit production API-key sourcing to app config and make deep-link confirmation prompts safer by coalescing queued requests instead of silently dropping them. (#33031) thanks @mbelinky.
- iOS/TTS playback fallback: keep voice playback resilient by switching from PCM to MP3 when provider format support is unavailable, while avoiding sticky fallback on generic local playback errors. (#33032) thanks @mbelinky.
- Telegram/multi-account default routing clarity: warn only for ambiguous (2+) account setups without an explicit default, add `openclaw doctor` warnings for missing/invalid multi-account defaults across channels, and document explicit-default guidance for channel routing and Telegram config. (#32544) thanks @Sid-Qin.
- Telegram/plugin outbound hook parity: run `message_sending` + `message_sent` in Telegram reply delivery, include reply-path hook metadata (`mediaUrls`, `threadId`), and report `message_sent.success=false` when hooks blank text and no outbound message is delivered. (#32649) thanks @KimGLee.
- Agents/Skills runtime loading: propagate run config into embedded attempt and compaction skill-entry loading so explicitly enabled bundled companion skills are discovered consistently when skill snapshots do not already provide resolved entries. thanks @gumadeiras.

View File

@@ -7,6 +7,23 @@ import Observation
import OSLog
import Speech
/// Thread-safe container for the terminal error of an audio stream, so the
/// failure reason can be inspected after playback ends.
/// `@unchecked Sendable` is justified because every access to the stored
/// error goes through the NSLock.
private final class StreamFailureBox: @unchecked Sendable {
    private let mutex = NSLock()
    private var storedError: Error?

    /// Records `error`, replacing any previously stored one.
    func set(_ error: Error) {
        self.mutex.lock()
        defer { self.mutex.unlock() }
        self.storedError = error
    }

    /// The most recently recorded error, or `nil` if none was set.
    var value: Error? {
        self.mutex.lock()
        defer { self.mutex.unlock() }
        return self.storedError
    }
}
// This file intentionally centralizes talk mode state + behavior.
// It's large, and splitting would force `private` -> `fileprivate` across many members.
// We'll refactor into smaller files when the surface stabilizes.
@@ -72,6 +89,9 @@ final class TalkModeManager: NSObject {
private var mainSessionKey: String = "main"
private var fallbackVoiceId: String?
private var lastPlaybackWasPCM: Bool = false
/// Set when the ElevenLabs API rejects PCM format (e.g. 403 subscription_required).
/// Once set, all subsequent requests in this session use MP3 instead of re-trying PCM.
private var pcmFormatUnavailable: Bool = false
var pcmPlayer: PCMStreamingAudioPlaying = PCMStreamingAudioPlayer.shared
var mp3Player: StreamingAudioPlaying = StreamingAudioPlayer.shared
@@ -1007,7 +1027,8 @@ final class TalkModeManager: NSObject {
let desiredOutputFormat = (directive?.outputFormat ?? self.defaultOutputFormat)?
.trimmingCharacters(in: .whitespacesAndNewlines)
let requestedOutputFormat = (desiredOutputFormat?.isEmpty == false) ? desiredOutputFormat : nil
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(requestedOutputFormat ?? "pcm_44100")
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(
requestedOutputFormat ?? self.effectiveDefaultOutputFormat)
if outputFormat == nil, let requestedOutputFormat {
self.logger.warning(
"talk output_format unsupported for local playback: \(requestedOutputFormat, privacy: .public)")
@@ -1036,7 +1057,7 @@ final class TalkModeManager: NSObject {
let request = makeRequest(outputFormat: outputFormat)
let client = ElevenLabsTTSClient(apiKey: apiKey)
let stream = client.streamSynthesize(voiceId: voiceId, request: request)
let rawStream = client.streamSynthesize(voiceId: voiceId, request: request)
if self.interruptOnSpeech {
do {
@@ -1051,11 +1072,16 @@ final class TalkModeManager: NSObject {
let sampleRate = TalkTTSValidation.pcmSampleRate(from: outputFormat)
let result: StreamingPlaybackResult
if let sampleRate {
let streamFailure = StreamFailureBox()
let stream = Self.monitorStreamFailures(rawStream, failureBox: streamFailure)
self.lastPlaybackWasPCM = true
var playback = await self.pcmPlayer.play(stream: stream, sampleRate: sampleRate)
if !playback.finished, playback.interruptedAt == nil {
let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100")
let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128")
self.logger.warning("pcm playback failed; retrying mp3")
if Self.isPCMFormatRejectedByAPI(streamFailure.value) {
self.pcmFormatUnavailable = true
}
self.lastPlaybackWasPCM = false
let mp3Stream = client.streamSynthesize(
voiceId: voiceId,
@@ -1065,7 +1091,7 @@ final class TalkModeManager: NSObject {
result = playback
} else {
self.lastPlaybackWasPCM = false
result = await self.mp3Player.play(stream: stream)
result = await self.mp3Player.play(stream: rawStream)
}
let duration = Date().timeIntervalSince(started)
self.logger.info("elevenlabs stream finished=\(result.finished, privacy: .public) dur=\(duration, privacy: .public)s")
@@ -1391,7 +1417,7 @@ final class TalkModeManager: NSObject {
private func resolveIncrementalPrefetchOutputFormat(context: IncrementalSpeechContext) -> String? {
if TalkTTSValidation.pcmSampleRate(from: context.outputFormat) != nil {
return ElevenLabsTTSClient.validatedOutputFormat("mp3_44100")
return ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128")
}
return context.outputFormat
}
@@ -1480,7 +1506,8 @@ final class TalkModeManager: NSObject {
let desiredOutputFormat = (directive?.outputFormat ?? self.defaultOutputFormat)?
.trimmingCharacters(in: .whitespacesAndNewlines)
let requestedOutputFormat = (desiredOutputFormat?.isEmpty == false) ? desiredOutputFormat : nil
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(requestedOutputFormat ?? "pcm_44100")
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(
requestedOutputFormat ?? self.effectiveDefaultOutputFormat)
if outputFormat == nil, let requestedOutputFormat {
self.logger.warning(
"talk output_format unsupported for local playback: \(requestedOutputFormat, privacy: .public)")
@@ -1534,6 +1561,44 @@ final class TalkModeManager: NSObject {
latencyTier: TalkTTSValidation.validatedLatencyTier(context.directive?.latencyTier))
}
/// Session-wide default TTS output format.
/// Prefers `pcm_44100`; once the API has rejected PCM for this session
/// (`pcmFormatUnavailable`), falls back to `mp3_44100_128` so later requests
/// do not keep re-trying an unsupported format.
private var effectiveDefaultOutputFormat: String {
    if self.pcmFormatUnavailable {
        return "mp3_44100_128"
    }
    return "pcm_44100"
}
/// Re-emits `stream` unchanged while capturing any terminal error into
/// `failureBox`, so the caller can later distinguish an API-side failure
/// (e.g. a rejected output format) from a local playback error.
/// - Parameters:
///   - stream: The upstream audio-chunk stream to forward.
///   - failureBox: Receives the error if the upstream stream throws.
/// - Returns: A stream yielding the same chunks and terminating the same way.
private static func monitorStreamFailures(
_ stream: AsyncThrowingStream<Data, Error>,
failureBox: StreamFailureBox
) -> AsyncThrowingStream<Data, Error>
{
AsyncThrowingStream { continuation in
let task = Task {
do {
// Forward every chunk verbatim to the downstream consumer.
for try await chunk in stream {
continuation.yield(chunk)
}
continuation.finish()
} catch {
// Record the failure before propagating it, so it is visible
// even after the stream has terminated.
failureBox.set(error)
continuation.finish(throwing: error)
}
}
// Stop consuming the upstream if the downstream consumer goes away.
continuation.onTermination = { _ in
task.cancel()
}
}
}
/// Heuristically classifies a streaming failure as the ElevenLabs API
/// rejecting the PCM output format (as opposed to a local playback error).
/// Matches only HTTP-level errors (code >= 400) in the "ElevenLabsTTS"
/// domain whose message mentions the output format, PCM, or a
/// subscription restriction.
private static func isPCMFormatRejectedByAPI(_ error: Error?) -> Bool {
    guard let nsError = error as NSError? else { return false }
    guard nsError.domain == "ElevenLabsTTS", nsError.code >= 400 else { return false }
    let description = nsError.userInfo[NSLocalizedDescriptionKey] as? String
        ?? nsError.localizedDescription
    let message = description.lowercased()
    // Substrings that indicate the API rejected the requested PCM format.
    let rejectionMarkers = ["output_format", "pcm_", "pcm ", "subscription_required"]
    return rejectionMarkers.contains { message.contains($0) }
}
private static func makeBufferedAudioStream(chunks: [Data]) -> AsyncThrowingStream<Data, Error> {
AsyncThrowingStream { continuation in
for chunk in chunks {
@@ -1575,22 +1640,27 @@ final class TalkModeManager: NSObject {
text: text,
context: context,
outputFormat: context.outputFormat)
let stream: AsyncThrowingStream<Data, Error>
let rawStream: AsyncThrowingStream<Data, Error>
if let prefetchedAudio, !prefetchedAudio.chunks.isEmpty {
stream = Self.makeBufferedAudioStream(chunks: prefetchedAudio.chunks)
rawStream = Self.makeBufferedAudioStream(chunks: prefetchedAudio.chunks)
} else {
stream = client.streamSynthesize(voiceId: voiceId, request: request)
rawStream = client.streamSynthesize(voiceId: voiceId, request: request)
}
let playbackFormat = prefetchedAudio?.outputFormat ?? context.outputFormat
let sampleRate = TalkTTSValidation.pcmSampleRate(from: playbackFormat)
let result: StreamingPlaybackResult
if let sampleRate {
let streamFailure = StreamFailureBox()
let stream = Self.monitorStreamFailures(rawStream, failureBox: streamFailure)
self.lastPlaybackWasPCM = true
var playback = await self.pcmPlayer.play(stream: stream, sampleRate: sampleRate)
if !playback.finished, playback.interruptedAt == nil {
self.logger.warning("pcm playback failed; retrying mp3")
if Self.isPCMFormatRejectedByAPI(streamFailure.value) {
self.pcmFormatUnavailable = true
}
self.lastPlaybackWasPCM = false
let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100")
let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128")
let mp3Stream = client.streamSynthesize(
voiceId: voiceId,
request: self.makeIncrementalTTSRequest(
@@ -1602,7 +1672,7 @@ final class TalkModeManager: NSObject {
result = playback
} else {
self.lastPlaybackWasPCM = false
result = await self.mp3Player.play(stream: stream)
result = await self.mp3Player.play(stream: rawStream)
}
if !result.finished, let interruptedAt = result.interruptedAt {
self.lastInterruptedAtSeconds = interruptedAt
@@ -1926,6 +1996,7 @@ extension TalkModeManager {
func reloadConfig() async {
guard let gateway else { return }
self.pcmFormatUnavailable = false
do {
let res = try await gateway.request(
method: "talk.config",
@@ -2105,6 +2176,10 @@ private final class AudioTapDiagnostics: @unchecked Sendable {
#if DEBUG
extension TalkModeManager {
/// DEBUG-only shim exposing the private PCM-rejection classifier to tests.
static func _test_isPCMFormatRejectedByAPI(_ error: Error?) -> Bool {
self.isPCMFormatRejectedByAPI(error)
}
func _test_seedTranscript(_ transcript: String) {
self.lastTranscript = transcript
self.lastHeard = Date()

View File

@@ -1,3 +1,4 @@
import Foundation
import Testing
@testable import OpenClaw
@@ -28,4 +29,22 @@ import Testing
let selection = TalkModeManager.selectTalkProviderConfig(talk)
#expect(selection == nil)
}
// A 4xx ElevenLabs error mentioning subscription/output format must be
// classified as an API-side PCM rejection.
@Test func detectsPCMFormatRejectionFromElevenLabsError() {
    let apiError = NSError(
        domain: "ElevenLabsTTS",
        code: 403,
        userInfo: [
            NSLocalizedDescriptionKey: "ElevenLabs failed: 403 subscription_required output_format=pcm_44100",
        ])
    #expect(TalkModeManager._test_isPCMFormatRejectedByAPI(apiError) == true)
}
// A local playback error (wrong domain, non-HTTP code) must NOT trigger the
// sticky MP3 fallback.
@Test func ignoresGenericPlaybackFailuresForPCMFormatRejection() {
    let playbackError = NSError(
        domain: "StreamingAudio",
        code: -1,
        userInfo: [NSLocalizedDescriptionKey: "queue enqueue failed"])
    let rejected = TalkModeManager._test_isPCMFormatRejectedByAPI(playbackError)
    #expect(rejected == false)
}
}