macOS: add MLX Talk provider MVP (#63539)

Merged via squash. Prepared head SHA: da43563513 Co-authored-by: ImLukeF <92253590+ImLukeF@users.noreply.github.com> Co-authored-by: ImLukeF <92253590+ImLukeF@users.noreply.github.com> Reviewed-by: @ImLukeF
2026-04-17 12:11:20 +00:00 · 2026-04-09 17:13:34 +10:00
parent 2729c91ad5
commit 7c72b694f1
8 changed files with 509 additions and 22 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,8 @@ Docs: https://docs.openclaw.ai

 ### Changes

+- macOS/Talk: add an experimental local MLX speech provider for Talk Mode, with explicit provider selection, local utterance playback, interruption handling, and system-voice fallback. (#63539) Thanks @ImLukeF.
+
 ### Fixes

 - fix(browser): auto-generate browser control auth token for none/trusted-proxy modes [AI]. (#63280) Thanks @pgondhi987.
--- a/apps/macos/Package.resolved
+++ b/apps/macos/Package.resolved
@@ -1,5 +1,5 @@
 {
-  "originHash" : "fb90e7b1977f43661ac91681d16da11f9ddd85630407ef170eaada0a6ee39972",
+  "originHash" : "31972864afdac74537794e1a3b7bd22484c09ec1be8e3624fb9ea582e9222ad9",
  "pins" : [
    {
      "identity" : "axorcist",
@@ -28,6 +28,15 @@
        "version" : "0.1.0"
      }
    },
+    {
+      "identity" : "eventsource",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/mattt/EventSource.git",
+      "state" : {
+        "revision" : "a3a85a85214caf642abaa96ae664e4c772a59f6e",
+        "version" : "1.4.1"
+      }
+    },
    {
      "identity" : "menubarextraaccess",
      "kind" : "remoteSourceControl",
@@ -37,6 +46,33 @@
        "version" : "1.2.2"
      }
    },
+    {
+      "identity" : "mlx-audio-swift",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/Blaizzy/mlx-audio-swift",
+      "state" : {
+        "revision" : "fcbd04daa1bfebe881932f630af2ba6ce9af3274",
+        "version" : "0.1.2"
+      }
+    },
+    {
+      "identity" : "mlx-swift",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/ml-explore/mlx-swift.git",
+      "state" : {
+        "revision" : "61b9e011e09a62b489f6bd647958f1555bdf2896",
+        "version" : "0.31.3"
+      }
+    },
+    {
+      "identity" : "mlx-swift-lm",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/ml-explore/mlx-swift-lm.git",
+      "state" : {
+        "revision" : "25b00d4e22e61ec9c41efda47990cd2084ec87ff",
+        "version" : "2.31.3"
+      }
+    },
    {
      "identity" : "peekaboo",
      "kind" : "remoteSourceControl",
@@ -64,6 +100,33 @@
        "version" : "1.2.1"
      }
    },
+    {
+      "identity" : "swift-asn1",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-asn1.git",
+      "state" : {
+        "revision" : "9f542610331815e29cc3821d3b6f488db8715517",
+        "version" : "1.6.0"
+      }
+    },
+    {
+      "identity" : "swift-atomics",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-atomics.git",
+      "state" : {
+        "revision" : "b601256eab081c0f92f059e12818ac1d4f178ff7",
+        "version" : "1.3.0"
+      }
+    },
+    {
+      "identity" : "swift-collections",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-collections.git",
+      "state" : {
+        "revision" : "6675bc0ff86e61436e615df6fc5174e043e57924",
+        "version" : "1.4.1"
+      }
+    },
    {
      "identity" : "swift-concurrency-extras",
      "kind" : "remoteSourceControl",
@@ -73,6 +136,33 @@
        "version" : "1.3.2"
      }
    },
+    {
+      "identity" : "swift-crypto",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-crypto.git",
+      "state" : {
+        "revision" : "bb4ba815dab96d4edc1e0b86d7b9acf9ff973a84",
+        "version" : "4.3.1"
+      }
+    },
+    {
+      "identity" : "swift-huggingface",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/huggingface/swift-huggingface.git",
+      "state" : {
+        "revision" : "b721959445b617d0bf03910b2b4aced345fd93bf",
+        "version" : "0.9.0"
+      }
+    },
+    {
+      "identity" : "swift-jinja",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/huggingface/swift-jinja.git",
+      "state" : {
+        "revision" : "0aeefadec459ce8e11a333769950fb86183aca43",
+        "version" : "2.3.5"
+      }
+    },
    {
      "identity" : "swift-log",
      "kind" : "remoteSourceControl",
@@ -82,6 +172,15 @@
        "version" : "1.10.1"
      }
    },
+    {
+      "identity" : "swift-nio",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-nio.git",
+      "state" : {
+        "revision" : "558f24a4647193b5a0e2104031b71c55d31ff83a",
+        "version" : "2.97.1"
+      }
+    },
    {
      "identity" : "swift-numerics",
      "kind" : "remoteSourceControl",
@@ -109,6 +208,15 @@
        "version" : "1.6.4"
      }
    },
+    {
+      "identity" : "swift-transformers",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/huggingface/swift-transformers.git",
+      "state" : {
+        "revision" : "58c4bc11963a140358d791f678a60a2745a23146",
+        "version" : "1.2.1"
+      }
+    },
    {
      "identity" : "swiftui-math",
      "kind" : "remoteSourceControl",
@@ -126,6 +234,15 @@
        "revision" : "5b06b811c0f5313b6b84bbef98c635a630638c38",
        "version" : "0.3.1"
      }
+    },
+    {
+      "identity" : "yyjson",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/ibireme/yyjson.git",
+      "state" : {
+        "revision" : "8b4a38dc994a110abaec8a400615567bd996105f",
+        "version" : "0.12.0"
+      }
    }
  ],
  "version" : 3
--- a/apps/macos/Package.swift
+++ b/apps/macos/Package.swift
@@ -20,6 +20,7 @@ let package = Package(
        .package(url: "https://github.com/apple/swift-log.git", from: "1.10.1"),
        .package(url: "https://github.com/sparkle-project/Sparkle", from: "2.9.0"),
        .package(url: "https://github.com/steipete/Peekaboo.git", branch: "main"),
+        .package(url: "https://github.com/Blaizzy/mlx-audio-swift", exact: "0.1.2"),
        .package(path: "../shared/OpenClawKit"),
        .package(path: "../../Swabble"),
    ],
@@ -54,6 +55,7 @@ let package = Package(
                .product(name: "Sparkle", package: "Sparkle"),
                .product(name: "PeekabooBridge", package: "Peekaboo"),
                .product(name: "PeekabooAutomationKit", package: "Peekaboo"),
+                .product(name: "MLXAudioTTS", package: "mlx-audio-swift"),
            ],
            exclude: [
                "Resources/Info.plist",
--- a/apps/macos/Sources/OpenClaw/TalkMLXSpeechSynthesizer.swift
+++ b/apps/macos/Sources/OpenClaw/TalkMLXSpeechSynthesizer.swift
@@ -0,0 +1,178 @@
+import Foundation
+import MLXAudioTTS
+import OSLog
+
+// swiftformat:disable wrap wrapMultilineStatementBraces trailingCommas redundantSelf extensionAccessControl
+/// Local MLX text-to-speech backend for Talk Mode; renders an utterance to in-memory WAV data.
+/// Runtime access is expected to stay serialized through `TalkModeRuntime` actor helper methods.
+/// NOTE(review): `synthesize` is a nonisolated async method; confirm it cannot race `stop()` on `currentToken`.
+final class TalkMLXSpeechSynthesizer {
+    /// Failures reported by this type; `.canceled` means the request was superseded or cancelled.
+    enum SynthesizeError: Error {
+        case canceled
+        case modelLoadFailed(String)
+        case audioGenerationFailed
+        case audioPlaybackFailed
+        case timedOut
+    }
+
+    static let shared = TalkMLXSpeechSynthesizer()
+    /// Model repository used when the caller passes no (or a blank) `modelRepo`.
+    static let defaultModelRepo = "mlx-community/Soprano-80M-bf16"
+
+    private let logger = Logger(subsystem: "ai.openclaw", category: "talk.mlx")
+    /// Identity of the most recent request; replacing it invalidates any in-flight synthesis.
+    private var currentToken = UUID()
+    /// Repository that the cached `model` was loaded from, if any.
+    private var modelRepo: String?
+    private var model: (any SpeechGenerationModel)?
+
+    private init() {}
+
+    /// Invalidates the in-flight request, if any, so that it ends with `.canceled`.
+    func stop() {
+        self.currentToken = UUID()
+    }
+
+    /// Generates speech for `text` and returns it as 16-bit mono PCM WAV data.
+    /// Starting a new request supersedes the previous one; blank `text` yields empty `Data`.
+    /// - Parameters:
+    ///   - modelRepo: Model repository to load; nil or blank selects `defaultModelRepo`.
+    ///   - language: Passed through to the model's `generate` call.
+    ///   - voicePreset: Passed through to the model's `generate` call as `voice`.
+    /// - Throws: `SynthesizeError` (`canceled`, `modelLoadFailed`, or `audioGenerationFailed`).
+    func synthesize(text: String, modelRepo: String?, language: String?, voicePreset: String?) async throws -> Data {
+        let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
+        guard !trimmed.isEmpty else { return Data() }
+
+        // Installing a fresh token is what supersedes the previous request; no separate stop() is needed.
+        let token = UUID()
+        self.currentToken = token
+
+        let repo = Self.resolvedModelRepo(modelRepo)
+        let rawModel = try await self.loadModel(modelRepo: repo, token: token)
+        let model = UncheckedSpeechModel(raw: rawModel)
+        guard self.currentToken == token else { throw SynthesizeError.canceled }
+
+        let audioData: Data
+        do {
+            let audio = try await model.generateAudio(text: trimmed, voice: voicePreset, language: language)
+            audioData = Self.makeWavData(samples: audio, sampleRate: Double(model.sampleRateValue()))
+        } catch is CancellationError {
+            // Same mapping as loadModel: task cancellation is a cancel, not a generation failure.
+            throw SynthesizeError.canceled
+        } catch {
+            self.logger.error("talk mlx generation failed: \(error.localizedDescription, privacy: .public)")
+            throw SynthesizeError.audioGenerationFailed
+        }
+
+        guard self.currentToken == token else { throw SynthesizeError.canceled }
+        return audioData
+    }
+
+    /// Returns the cached model for `modelRepo`, loading and caching it first when necessary.
+    /// - Throws: `.canceled` if the request was superseded or the task was cancelled while loading,
+    ///   otherwise `.modelLoadFailed(modelRepo)`.
+    private func loadModel(modelRepo: String, token: UUID) async throws -> any SpeechGenerationModel {
+        if let model = self.model, self.modelRepo == modelRepo { return model }
+
+        self.logger.info("talk mlx loading modelRepo=\(modelRepo, privacy: .public)")
+        let loaded: any SpeechGenerationModel
+        do {
+            loaded = try await TTS.loadModel(modelRepo: modelRepo)
+        } catch is CancellationError {
+            throw SynthesizeError.canceled
+        } catch {
+            self.logger.error("talk mlx load failed: \(error.localizedDescription, privacy: .public)")
+            throw SynthesizeError.modelLoadFailed(modelRepo)
+        }
+        // Checked outside the do/catch: thrown inside it, `.canceled` would be caught by the
+        // catch-all above and misreported to the caller as `.modelLoadFailed`.
+        guard self.currentToken == token else { throw SynthesizeError.canceled }
+        self.model = loaded
+        self.modelRepo = modelRepo
+        return loaded
+    }
+
+    /// Trims `modelRepo` and substitutes `defaultModelRepo` when it is nil or blank.
+    private static func resolvedModelRepo(_ modelRepo: String?) -> String {
+        let trimmed = modelRepo?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
+        return trimmed.isEmpty ? Self.defaultModelRepo : trimmed
+    }
+
+    /// Encodes float samples as a mono 16-bit PCM WAV file (44-byte RIFF header, then sample data).
+    /// Samples are clamped to [-1, 1]; NaN samples are written as silence.
+    private static func makeWavData(samples: [Float], sampleRate: Double) -> Data {
+        let channels: UInt16 = 1
+        let bitsPerSample: UInt16 = 16
+        let blockAlign = channels * (bitsPerSample / 8)
+        let sampleRateInt = UInt32(sampleRate.rounded())
+        let byteRate = sampleRateInt * UInt32(blockAlign)
+        let dataSize = UInt32(samples.count) * UInt32(blockAlign)
+
+        var data = Data(capacity: Int(44 + dataSize))
+        data.append(contentsOf: [0x52, 0x49, 0x46, 0x46]) // RIFF
+        data.appendLEUInt32(36 + dataSize)
+        data.append(contentsOf: [0x57, 0x41, 0x56, 0x45]) // WAVE
+        data.append(contentsOf: [0x66, 0x6D, 0x74, 0x20]) // fmt
+        data.appendLEUInt32(16) // fmt chunk size
+        data.appendLEUInt16(1) // audio format 1 = PCM
+        data.appendLEUInt16(channels)
+        data.appendLEUInt32(sampleRateInt)
+        data.appendLEUInt32(byteRate)
+        data.appendLEUInt16(blockAlign)
+        data.appendLEUInt16(bitsPerSample)
+        data.append(contentsOf: [0x64, 0x61, 0x74, 0x61]) // data
+        data.appendLEUInt32(dataSize)
+        for sample in samples {
+            // min/max would turn NaN into +1.0 (a full-scale pop); emit silence instead.
+            let clamped = sample.isNaN ? 0 : max(-1.0, min(1.0, sample))
+            data.appendLEInt16(Int16((clamped * Float(Int16.max)).rounded()))
+        }
+        return data
+    }
+}
+
+extension TalkMLXSpeechSynthesizer: @unchecked Sendable {}
+
+/// Thin wrapper around a loaded `SpeechGenerationModel` existential.
+/// An extension in this file marks it `@unchecked Sendable`.
+private struct UncheckedSpeechModel {
+    let raw: any SpeechGenerationModel
+
+    /// The wrapped model's `sampleRate`.
+    func sampleRateValue() -> Int {
+        return self.raw.sampleRate
+    }
+
+    /// Runs the wrapped model's `generate` for `text` and returns the output
+    /// converted to a `[Float]` array.
+    func generateAudio(text: String, voice: String?, language: String?) async throws -> [Float] {
+        // No reference audio or reference text is supplied.
+        let output = try await self.raw.generate(
+            text: text, voice: voice, refAudio: nil, refText: nil, language: language)
+        let floatSamples = output.asArray(Float.self)
+        return floatSamples
+    }
+}
+
+extension UncheckedSpeechModel: @unchecked Sendable {}
+
+/// Little-endian integer appenders used by the WAV writer in this file.
+extension Data {
+    /// Appends the in-memory bytes of `value` after converting it to little-endian order.
+    private mutating func appendLittleEndian<T: FixedWidthInteger>(_ value: T) {
+        Swift.withUnsafeBytes(of: value.littleEndian) { self.append(contentsOf: $0) }
+    }
+
+    /// Appends `value` as two little-endian bytes.
+    fileprivate mutating func appendLEUInt16(_ value: UInt16) { self.appendLittleEndian(value) }
+
+    /// Appends `value` as four little-endian bytes.
+    fileprivate mutating func appendLEUInt32(_ value: UInt32) { self.appendLittleEndian(value) }
+
+    /// Appends `value` as two little-endian bytes (two's complement).
+    fileprivate mutating func appendLEInt16(_ value: Int16) { self.appendLittleEndian(value) }
+}
+
+// swiftformat:enable wrap wrapMultilineStatementBraces trailingCommas redundantSelf extensionAccessControl
--- a/apps/macos/Sources/OpenClaw/TalkModeGatewayConfig.swift
+++ b/apps/macos/Sources/OpenClaw/TalkModeGatewayConfig.swift
@@ -44,7 +44,13 @@ enum TalkModeGatewayConfigParser {
                acc[key] = value
            } ?? [:]
        let model = activeConfig?["modelId"]?.stringValue?.trimmingCharacters(in: .whitespacesAndNewlines)
-        let resolvedModel = (model?.isEmpty == false) ? model! : defaultModelIdFallback
+        let resolvedModel: String? = if model?.isEmpty == false {
+            model!
+        } else if activeProvider == defaultProvider {
+            defaultModelIdFallback
+        } else {
+            nil
+        }
        let outputFormat = activeConfig?["outputFormat"]?.stringValue
        let interrupt = talk?["interruptOnSpeech"]?.boolValue
        let apiKey = activeConfig?["apiKey"]?.stringValue
--- a/apps/macos/Sources/OpenClaw/TalkModeRuntime.swift
+++ b/apps/macos/Sources/OpenClaw/TalkModeRuntime.swift
@@ -10,6 +10,7 @@ actor TalkModeRuntime {

    enum PlaybackPlan: Equatable {
        case elevenLabsThenSystemVoice(apiKey: String, voiceId: String)
+        case mlxThenSystemVoice
        case systemVoiceOnly
    }

@@ -17,6 +18,8 @@ actor TalkModeRuntime {
    private let ttsLogger = Logger(subsystem: "ai.openclaw", category: "talk.tts")
    private static let defaultModelIdFallback = "eleven_v3"
    private static let defaultTalkProvider = "elevenlabs"
+    private static let mlxTalkProvider = "mlx"
+    private static let systemTalkProvider = "system"
    private static let defaultSilenceTimeoutMs = TalkDefaults.silenceTimeoutMs

    private final class RMSMeter: @unchecked Sendable {
@@ -65,6 +68,7 @@ actor TalkModeRuntime {
    private var modelOverrideActive = false
    private var defaultOutputFormat: String?
    private var interruptOnSpeech: Bool = true
+    private var activeTalkProvider = TalkModeRuntime.defaultTalkProvider
    private var lastInterruptedAtSeconds: Double?
    private var voiceAliases: [String: String] = [:]
    private var lastSpokenText: String?
@@ -462,7 +466,7 @@ actor TalkModeRuntime {
    private func playAssistant(text: String) async {
        guard let input = await self.preparePlaybackInput(text: text) else { return }

-        switch Self.playbackPlan(apiKey: input.apiKey, voiceId: input.voiceId) {
+        switch Self.playbackPlan(provider: input.provider, apiKey: input.apiKey, voiceId: input.voiceId) {
        case let .elevenLabsThenSystemVoice(apiKey, voiceId):
            do {
                try await self.playElevenLabs(input: input, apiKey: apiKey, voiceId: voiceId)
@@ -477,6 +481,23 @@ actor TalkModeRuntime {
                    self.ttsLogger.error("talk system voice failed: \(error.localizedDescription, privacy: .public)")
                }
            }
+        case .mlxThenSystemVoice:
+            do {
+                try await self.playMLX(input: input)
+            } catch TalkMLXSpeechSynthesizer.SynthesizeError.canceled {
+                self.ttsLogger.info("talk mlx canceled")
+                return
+            } catch {
+                self.ttsLogger
+                    .error(
+                        "talk MLX failed: \(error.localizedDescription, privacy: .public); " +
+                            "falling back to system voice")
+                do {
+                    try await self.playSystemVoice(input: input)
+                } catch {
+                    self.ttsLogger.error("talk system voice failed: \(error.localizedDescription, privacy: .public)")
+                }
+            }
        case .systemVoiceOnly:
            do {
                try await self.playSystemVoice(input: input)
@@ -491,19 +512,30 @@ actor TalkModeRuntime {
        }
    }

-    static func playbackPlan(apiKey: String?, voiceId: String?) -> PlaybackPlan {
-        guard let apiKey, !apiKey.isEmpty, let voiceId else {
+    static func playbackPlan(provider: String, apiKey: String?, voiceId: String?) -> PlaybackPlan {
+        switch provider {
+        case self.defaultTalkProvider:
+            guard let apiKey, !apiKey.isEmpty, let voiceId else {
+                return .systemVoiceOnly
+            }
+            return .elevenLabsThenSystemVoice(apiKey: apiKey, voiceId: voiceId)
+        case self.mlxTalkProvider:
+            return .mlxThenSystemVoice
+        case self.systemTalkProvider:
+            return .systemVoiceOnly
+        default:
            return .systemVoiceOnly
        }
-        return .elevenLabsThenSystemVoice(apiKey: apiKey, voiceId: voiceId)
    }

    private struct TalkPlaybackInput {
        let generation: Int
+        let provider: String
        let cleanedText: String
        let directive: TalkDirective?
        let apiKey: String?
        let voiceId: String?
+        let voicePreset: String?
        let language: String?
        let synthTimeoutSeconds: Double
    }
@@ -552,18 +584,20 @@ actor TalkModeRuntime {
            resolvedVoice ??
            self.currentVoiceId ??
            self.defaultVoiceId
+        let voicePreset = preferredVoice
+        let provider = self.activeTalkProvider

        let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)

-        let voiceId: String? = if let apiKey, !apiKey.isEmpty {
+        let voiceId: String? = if provider == Self.defaultTalkProvider, let apiKey, !apiKey.isEmpty {
            await self.resolveVoiceId(preferred: preferredVoice, apiKey: apiKey)
        } else {
            nil
        }

-        if apiKey?.isEmpty != false {
+        if provider == Self.defaultTalkProvider, apiKey?.isEmpty != false {
            self.ttsLogger.warning("talk missing ELEVENLABS_API_KEY; falling back to system voice")
-        } else if voiceId == nil {
+        } else if provider == Self.defaultTalkProvider, voiceId == nil {
            self.ttsLogger.warning("talk missing voiceId; falling back to system voice")
        } else if let voiceId {
            self.ttsLogger
@@ -579,15 +613,21 @@ actor TalkModeRuntime {

        return TalkPlaybackInput(
            generation: gen,
+            provider: provider,
            cleanedText: cleaned,
            directive: directive,
            apiKey: apiKey,
            voiceId: voiceId,
+            voicePreset: voicePreset,
            language: language,
            synthTimeoutSeconds: synthTimeoutSeconds)
    }

-    private func playElevenLabs(input: TalkPlaybackInput, apiKey: String, voiceId: String) async throws {
+    private func playElevenLabs(
+        input: TalkPlaybackInput,
+        apiKey: String,
+        voiceId: String) async throws
+    {
        let desiredOutputFormat = input.directive?.outputFormat ?? self.defaultOutputFormat ?? "pcm_44100"
        let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat)
        if outputFormat == nil, !desiredOutputFormat.isEmpty {
@@ -696,6 +736,39 @@ actor TalkModeRuntime {
        self.ttsLogger.info("talk system voice done")
    }

+    /// Synthesizes `input.cleanedText` with the MLX synthesizer and plays the resulting audio data.
+    /// Synthesis runs under `AsyncTimeout.withTimeout`; playback that neither finishes nor is interrupted throws.
+    private func playMLX(input: TalkPlaybackInput) async throws {
+        self.ttsLogger.info("talk mlx start chars=\(input.cleanedText.count, privacy: .public)")
+        if self.interruptOnSpeech, !(await self.prepareForPlayback(generation: input.generation)) {
+            return
+        }
+        await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
+        self.phase = .speaking
+        // The directive's model id, when present, wins over `currentModelId`.
+        let modelRepo = input.directive?.modelId ?? self.currentModelId
+        let synthesized: Data
+        do {
+            synthesized = try await AsyncTimeout.withTimeout(
+                seconds: input.synthTimeoutSeconds,
+                onTimeout: { TalkMLXSpeechSynthesizer.SynthesizeError.timedOut },
+                operation: { [self] in
+                    try await self.synthesizeMLXVoice(
+                        text: input.cleanedText, modelRepo: modelRepo,
+                        language: input.language, voicePreset: input.voicePreset)
+                })
+        } catch TalkMLXSpeechSynthesizer.SynthesizeError.timedOut {
+            // Invalidate the synthesizer's current request before propagating the timeout.
+            self.stopMLXVoice()
+            throw TalkMLXSpeechSynthesizer.SynthesizeError.timedOut
+        }
+        let playback = await self.playTalkAudio(data: synthesized)
+        guard playback.finished || playback.interruptedAt != nil else {
+            throw TalkMLXSpeechSynthesizer.SynthesizeError.audioPlaybackFailed
+        }
+        self.ttsLogger.info("talk mlx done")
+    }
+
    private func prepareForPlayback(generation: Int) async -> Bool {
        await self.startRecognition()
        return self.isCurrent(generation)
@@ -750,10 +823,13 @@ actor TalkModeRuntime {

    func stopSpeaking(reason: TalkStopReason) async {
        let usePCM = self.lastPlaybackWasPCM
-        let interruptedAt = usePCM ? await self.stopPCM() : await self.stopMP3()
+        let remoteInterruptedAt = usePCM ? await self.stopPCM() : await self.stopMP3()
        _ = usePCM ? await self.stopMP3() : await self.stopPCM()
+        let localInterruptedAt = await self.stopTalkAudio()
        await TalkSystemSpeechSynthesizer.shared.stop()
+        self.stopMLXVoice()
        guard self.phase == .speaking else { return }
+        let interruptedAt = remoteInterruptedAt ?? localInterruptedAt
        if reason == .speech, let interruptedAt {
            self.lastInterruptedAtSeconds = interruptedAt
        }
@@ -795,6 +871,33 @@ extension TalkModeRuntime {
        StreamingAudioPlayer.shared.stop()
    }

+    /// Plays `data` with the shared `TalkAudioPlayer` on the main actor and returns its result.
+    @MainActor private func playTalkAudio(data: Data) async -> TalkPlaybackResult {
+        return await TalkAudioPlayer.shared.play(data: data)
+    }
+
+    /// Stops the shared `TalkAudioPlayer` on the main actor and returns the value its `stop()` reports.
+    @MainActor private func stopTalkAudio() -> Double? {
+        return TalkAudioPlayer.shared.stop()
+    }
+
+    /// Forwards a synthesis request to the shared `TalkMLXSpeechSynthesizer`.
+    /// All parameters are passed to `synthesize` unchanged.
+    /// - Returns: The audio data produced by the synthesizer.
+    /// - Throws: Whatever `synthesize` throws.
+    private func synthesizeMLXVoice(
+        text: String, modelRepo: String?,
+        language: String?, voicePreset: String?) async throws -> Data
+    {
+        let synthesizer = TalkMLXSpeechSynthesizer.shared
+        return try await synthesizer.synthesize(
+            text: text, modelRepo: modelRepo, language: language, voicePreset: voicePreset)
+    }
+
+    /// Calls `stop()` on the shared `TalkMLXSpeechSynthesizer`.
+    /// Synchronous, so actor code can call it without awaiting.
+    private func stopMLXVoice() { TalkMLXSpeechSynthesizer.shared.stop() }
+
    // MARK: - Config

    private func reloadConfig() async {
@@ -810,6 +913,7 @@ extension TalkModeRuntime {
        }
        self.defaultOutputFormat = cfg.outputFormat
        self.interruptOnSpeech = cfg.interruptOnSpeech
+        self.activeTalkProvider = cfg.activeProvider
        self.silenceWindow = TimeInterval(cfg.silenceTimeoutMs) / 1000
        self.apiKey = cfg.apiKey
        let hasApiKey = (cfg.apiKey?.isEmpty == false)
@@ -817,7 +921,8 @@ extension TalkModeRuntime {
        let modelLabel = (cfg.modelId?.isEmpty == false) ? cfg.modelId! : "none"
        self.logger
            .info(
-                "talk config voiceId=\(voiceLabel, privacy: .public) " +
+                "talk config provider=\(cfg.activeProvider, privacy: .public) " +
+                    "talk config voiceId=\(voiceLabel, privacy: .public) " +
                    "modelId=\(modelLabel, privacy: .public) " +
                    "apiKey=\(hasApiKey, privacy: .public) " +
                    "interrupt=\(cfg.interruptOnSpeech, privacy: .public) " +
@@ -859,11 +964,17 @@ extension TalkModeRuntime {
            await MainActor.run {
                AppStateStore.shared.seamColorHex = parsed.seamColorHex
            }
-            if parsed.activeProvider != Self.defaultTalkProvider {
-                self.ttsLogger
-                    .info("talk provider \(parsed.activeProvider, privacy: .public) unsupported; using system voice")
-            } else if parsed.normalizedPayload {
+            if parsed.activeProvider == Self.defaultTalkProvider {
                self.ttsLogger.info("talk config provider from talk.resolved")
+            } else if parsed.activeProvider == Self.mlxTalkProvider ||
+                parsed.activeProvider == Self.systemTalkProvider
+            {
+                self.ttsLogger.info(
+                    "talk provider \(parsed.activeProvider, privacy: .public) active")
+            } else {
+                self.ttsLogger
+                    .info(
+                        "talk provider \(parsed.activeProvider, privacy: .public) unsupported; using system voice")
            }
            return parsed
        } catch {
--- a/apps/macos/Tests/OpenClawIPCTests/TalkModeGatewayConfigTests.swift
+++ b/apps/macos/Tests/OpenClawIPCTests/TalkModeGatewayConfigTests.swift
@@ -0,0 +1,48 @@
+import OpenClawProtocol
+import Testing
+@testable import OpenClaw
+
+/// Parser tests for `TalkModeGatewayConfigParser` with a non-default talk provider.
+struct TalkModeGatewayConfigTests {
+    /// With `mlx` active, the ElevenLabs model fallback and the env API key must not appear in the result.
+    @Test func `mlx provider does not inherit elevenlabs defaults`() {
+        let mlxSnapshot = ConfigSnapshot(
+            path: nil,
+            exists: true,
+            raw: nil,
+            hash: nil,
+            parsed: nil,
+            valid: true,
+            config: [
+                "talk": AnyCodable([
+                    "provider": "mlx",
+                    "providers": [
+                        "mlx": [
+                            "voiceId": "unused-voice",
+                        ],
+                    ],
+                    "resolved": [
+                        "provider": "mlx",
+                        "config": [
+                            "voiceId": "unused-voice",
+                        ],
+                    ],
+                ]),
+            ],
+            issues: nil)
+
+        let result = TalkModeGatewayConfigParser.parse(
+            snapshot: mlxSnapshot,
+            defaultProvider: "elevenlabs",
+            defaultModelIdFallback: "eleven_v3",
+            defaultSilenceTimeoutMs: TalkDefaults.silenceTimeoutMs,
+            envVoice: "env-voice",
+            sagVoice: "sag-voice",
+            envApiKey: "env-key")
+
+        #expect(result.activeProvider == "mlx")
+        #expect(result.modelId == nil)
+        #expect(result.apiKey == nil)
+        #expect(result.voiceId == "unused-voice")
+    }
+}
--- a/apps/macos/Tests/OpenClawIPCTests/TalkModeRuntimeSpeechTests.swift
+++ b/apps/macos/Tests/OpenClawIPCTests/TalkModeRuntimeSpeechTests.swift
@@ -13,11 +13,34 @@ struct TalkModeRuntimeSpeechTests {
    }

    @Test func `playback plan falls back only from elevenlabs`() {
-        #expect(
-            TalkModeRuntime.playbackPlan(apiKey: "key", voiceId: "voice")
-                == .elevenLabsThenSystemVoice(apiKey: "key", voiceId: "voice"))
-        #expect(TalkModeRuntime.playbackPlan(apiKey: nil, voiceId: "voice") == .systemVoiceOnly)
-        #expect(TalkModeRuntime.playbackPlan(apiKey: "key", voiceId: nil) == .systemVoiceOnly)
-        #expect(TalkModeRuntime.playbackPlan(apiKey: "", voiceId: "voice") == .systemVoiceOnly)
+        let elevenLabsPlan = TalkModeRuntime.playbackPlan(
+            provider: "elevenlabs",
+            apiKey: "key",
+            voiceId: "voice"
+        )
+        let missingKeyPlan = TalkModeRuntime.playbackPlan(
+            provider: "elevenlabs",
+            apiKey: nil,
+            voiceId: "voice"
+        )
+        let missingVoicePlan = TalkModeRuntime.playbackPlan(
+            provider: "elevenlabs",
+            apiKey: "key",
+            voiceId: nil
+        )
+        let blankKeyPlan = TalkModeRuntime.playbackPlan(
+            provider: "elevenlabs",
+            apiKey: "",
+            voiceId: "voice"
+        )
+        let mlxPlan = TalkModeRuntime.playbackPlan(provider: "mlx", apiKey: nil, voiceId: nil)
+        let systemPlan = TalkModeRuntime.playbackPlan(provider: "system", apiKey: nil, voiceId: nil)
+
+        #expect(elevenLabsPlan == .elevenLabsThenSystemVoice(apiKey: "key", voiceId: "voice"))
+        #expect(missingKeyPlan == .systemVoiceOnly)
+        #expect(missingVoicePlan == .systemVoiceOnly)
+        #expect(blankKeyPlan == .systemVoiceOnly)
+        #expect(mlxPlan == .mlxThenSystemVoice)
+        #expect(systemPlan == .systemVoiceOnly)
    }
 }