mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-17 12:11:20 +00:00
macOS: add MLX Talk provider MVP (#63539)
Merged via squash.
Prepared head SHA: da43563513
Co-authored-by: ImLukeF <92253590+ImLukeF@users.noreply.github.com>
Co-authored-by: ImLukeF <92253590+ImLukeF@users.noreply.github.com>
Reviewed-by: @ImLukeF
This commit is contained in:
@@ -6,6 +6,8 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
### Changes
|
||||
|
||||
- macOS/Talk: add an experimental local MLX speech provider for Talk Mode, with explicit provider selection, local utterance playback, interruption handling, and system-voice fallback. (#63539) Thanks @ImLukeF.
|
||||
|
||||
### Fixes
|
||||
|
||||
- fix(browser): auto-generate browser control auth token for none/trusted-proxy modes [AI]. (#63280) Thanks @pgondhi987.
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"originHash" : "fb90e7b1977f43661ac91681d16da11f9ddd85630407ef170eaada0a6ee39972",
|
||||
"originHash" : "31972864afdac74537794e1a3b7bd22484c09ec1be8e3624fb9ea582e9222ad9",
|
||||
"pins" : [
|
||||
{
|
||||
"identity" : "axorcist",
|
||||
@@ -28,6 +28,15 @@
|
||||
"version" : "0.1.0"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "eventsource",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/mattt/EventSource.git",
|
||||
"state" : {
|
||||
"revision" : "a3a85a85214caf642abaa96ae664e4c772a59f6e",
|
||||
"version" : "1.4.1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "menubarextraaccess",
|
||||
"kind" : "remoteSourceControl",
|
||||
@@ -37,6 +46,33 @@
|
||||
"version" : "1.2.2"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "mlx-audio-swift",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/Blaizzy/mlx-audio-swift",
|
||||
"state" : {
|
||||
"revision" : "fcbd04daa1bfebe881932f630af2ba6ce9af3274",
|
||||
"version" : "0.1.2"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "mlx-swift",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/ml-explore/mlx-swift.git",
|
||||
"state" : {
|
||||
"revision" : "61b9e011e09a62b489f6bd647958f1555bdf2896",
|
||||
"version" : "0.31.3"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "mlx-swift-lm",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/ml-explore/mlx-swift-lm.git",
|
||||
"state" : {
|
||||
"revision" : "25b00d4e22e61ec9c41efda47990cd2084ec87ff",
|
||||
"version" : "2.31.3"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "peekaboo",
|
||||
"kind" : "remoteSourceControl",
|
||||
@@ -64,6 +100,33 @@
|
||||
"version" : "1.2.1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "swift-asn1",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/apple/swift-asn1.git",
|
||||
"state" : {
|
||||
"revision" : "9f542610331815e29cc3821d3b6f488db8715517",
|
||||
"version" : "1.6.0"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "swift-atomics",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/apple/swift-atomics.git",
|
||||
"state" : {
|
||||
"revision" : "b601256eab081c0f92f059e12818ac1d4f178ff7",
|
||||
"version" : "1.3.0"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "swift-collections",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/apple/swift-collections.git",
|
||||
"state" : {
|
||||
"revision" : "6675bc0ff86e61436e615df6fc5174e043e57924",
|
||||
"version" : "1.4.1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "swift-concurrency-extras",
|
||||
"kind" : "remoteSourceControl",
|
||||
@@ -73,6 +136,33 @@
|
||||
"version" : "1.3.2"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "swift-crypto",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/apple/swift-crypto.git",
|
||||
"state" : {
|
||||
"revision" : "bb4ba815dab96d4edc1e0b86d7b9acf9ff973a84",
|
||||
"version" : "4.3.1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "swift-huggingface",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/huggingface/swift-huggingface.git",
|
||||
"state" : {
|
||||
"revision" : "b721959445b617d0bf03910b2b4aced345fd93bf",
|
||||
"version" : "0.9.0"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "swift-jinja",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/huggingface/swift-jinja.git",
|
||||
"state" : {
|
||||
"revision" : "0aeefadec459ce8e11a333769950fb86183aca43",
|
||||
"version" : "2.3.5"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "swift-log",
|
||||
"kind" : "remoteSourceControl",
|
||||
@@ -82,6 +172,15 @@
|
||||
"version" : "1.10.1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "swift-nio",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/apple/swift-nio.git",
|
||||
"state" : {
|
||||
"revision" : "558f24a4647193b5a0e2104031b71c55d31ff83a",
|
||||
"version" : "2.97.1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "swift-numerics",
|
||||
"kind" : "remoteSourceControl",
|
||||
@@ -109,6 +208,15 @@
|
||||
"version" : "1.6.4"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "swift-transformers",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/huggingface/swift-transformers.git",
|
||||
"state" : {
|
||||
"revision" : "58c4bc11963a140358d791f678a60a2745a23146",
|
||||
"version" : "1.2.1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "swiftui-math",
|
||||
"kind" : "remoteSourceControl",
|
||||
@@ -126,6 +234,15 @@
|
||||
"revision" : "5b06b811c0f5313b6b84bbef98c635a630638c38",
|
||||
"version" : "0.3.1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "yyjson",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/ibireme/yyjson.git",
|
||||
"state" : {
|
||||
"revision" : "8b4a38dc994a110abaec8a400615567bd996105f",
|
||||
"version" : "0.12.0"
|
||||
}
|
||||
}
|
||||
],
|
||||
"version" : 3
|
||||
|
||||
@@ -20,6 +20,7 @@ let package = Package(
|
||||
.package(url: "https://github.com/apple/swift-log.git", from: "1.10.1"),
|
||||
.package(url: "https://github.com/sparkle-project/Sparkle", from: "2.9.0"),
|
||||
.package(url: "https://github.com/steipete/Peekaboo.git", branch: "main"),
|
||||
.package(url: "https://github.com/Blaizzy/mlx-audio-swift", exact: "0.1.2"),
|
||||
.package(path: "../shared/OpenClawKit"),
|
||||
.package(path: "../../Swabble"),
|
||||
],
|
||||
@@ -54,6 +55,7 @@ let package = Package(
|
||||
.product(name: "Sparkle", package: "Sparkle"),
|
||||
.product(name: "PeekabooBridge", package: "Peekaboo"),
|
||||
.product(name: "PeekabooAutomationKit", package: "Peekaboo"),
|
||||
.product(name: "MLXAudioTTS", package: "mlx-audio-swift"),
|
||||
],
|
||||
exclude: [
|
||||
"Resources/Info.plist",
|
||||
|
||||
178
apps/macos/Sources/OpenClaw/TalkMLXSpeechSynthesizer.swift
Normal file
178
apps/macos/Sources/OpenClaw/TalkMLXSpeechSynthesizer.swift
Normal file
@@ -0,0 +1,178 @@
|
||||
import Foundation
|
||||
import MLXAudioTTS
|
||||
import OSLog
|
||||
|
||||
// swiftformat:disable wrap wrapMultilineStatementBraces trailingCommas redundantSelf extensionAccessControl
|
||||
/// Local text-to-speech backend for Talk Mode built on `MLXAudioTTS`.
///
/// Loads a speech model by repository id, keeps the most recently loaded model cached, and renders
/// text into an in-memory 16-bit mono PCM WAV file. Cancellation is token based: every request
/// installs a fresh `currentToken` and `stop()` replaces it, so a request that observes a different
/// token after a suspension point reports `SynthesizeError.canceled`.
///
/// Runtime access stays serialized through `TalkModeRuntime` actor helper methods.
final class TalkMLXSpeechSynthesizer {
    /// Failures reported by the MLX speech path.
    enum SynthesizeError: Error {
        /// The request was superseded (`stop()` or a newer `synthesize` call) or its task was cancelled.
        case canceled
        /// Loading the model failed; the payload is the repository id that was requested.
        case modelLoadFailed(String)
        /// The model threw while generating audio.
        case audioGenerationFailed
        /// Not thrown by this class; available to callers that play the returned audio.
        case audioPlaybackFailed
        /// Not thrown by this class; available to callers that bound synthesis with a timeout.
        case timedOut
    }

    static let shared = TalkMLXSpeechSynthesizer()
    /// Repository used when the caller passes no (or a blank) model repo.
    static let defaultModelRepo = "mlx-community/Soprano-80M-bf16"

    private let logger = Logger(subsystem: "ai.openclaw", category: "talk.mlx")
    /// Identity of the request currently allowed to complete.
    private var currentToken = UUID()
    /// Repository id of the cached `model`, if any.
    private var modelRepo: String?
    /// Most recently loaded model, reused while the requested repository stays the same.
    private var model: (any SpeechGenerationModel)?

    private init() {}

    /// Invalidates the in-flight request, if any, by replacing the current token.
    ///
    /// Work that is already running is not interrupted; it notices the changed token at its next
    /// check and throws `SynthesizeError.canceled`.
    func stop() {
        self.currentToken = UUID()
    }

    /// Renders `text` to WAV data using the requested (or default) model.
    ///
    /// - Parameters:
    ///   - text: Text to speak. Surrounding whitespace is trimmed; blank text yields empty `Data`.
    ///   - modelRepo: Model repository id; `nil` or blank selects `defaultModelRepo`.
    ///   - language: Passed through to the model's `generate` call.
    ///   - voicePreset: Passed through to the model's `generate` call as the voice.
    /// - Returns: A complete WAV file (16-bit mono PCM) held in memory.
    /// - Throws: `SynthesizeError.canceled` when superseded or cancelled,
    ///   `SynthesizeError.modelLoadFailed` when the model cannot be loaded, and
    ///   `SynthesizeError.audioGenerationFailed` when generation throws.
    func synthesize(
        text: String,
        modelRepo: String?,
        language: String?,
        voicePreset: String?) async throws -> Data {
        let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
        guard !trimmed.isEmpty else { return Data() }

        // Installing a fresh token supersedes any request that is still in flight, so a separate
        // `stop()` call beforehand is unnecessary.
        let token = UUID()
        self.currentToken = token

        let resolvedRepo = Self.resolvedModelRepo(modelRepo)
        let rawModel = try await self.loadModel(
            modelRepo: resolvedRepo,
            token: token)
        let model = UncheckedSpeechModel(raw: rawModel)
        guard self.currentToken == token else {
            throw SynthesizeError.canceled
        }

        let audioData: Data
        do {
            let audio = try await model.generateAudio(
                text: trimmed,
                voice: voicePreset,
                language: language)
            audioData = Self.makeWavData(
                samples: audio,
                sampleRate: Double(model.sampleRateValue()))
        } catch is CancellationError {
            // Same mapping as `loadModel`: task cancellation is a cancel, not a generation failure.
            throw SynthesizeError.canceled
        } catch {
            self.logger.error(
                "talk mlx generation failed: \(error.localizedDescription, privacy: .public)")
            throw SynthesizeError.audioGenerationFailed
        }

        guard self.currentToken == token else {
            throw SynthesizeError.canceled
        }
        return audioData
    }

    /// Returns the cached model when it matches `modelRepo`, otherwise loads and caches it.
    ///
    /// - Parameters:
    ///   - modelRepo: Already-resolved repository id.
    ///   - token: Token of the calling request; a mismatch after loading means it was superseded.
    /// - Throws: `SynthesizeError.canceled` or `SynthesizeError.modelLoadFailed`.
    private func loadModel(
        modelRepo: String,
        token: UUID) async throws -> any SpeechGenerationModel {
        if let model = self.model, self.modelRepo == modelRepo {
            return model
        }

        self.logger.info("talk mlx loading modelRepo=\(modelRepo, privacy: .public)")
        let loaded: any SpeechGenerationModel
        do {
            loaded = try await TTS.loadModel(modelRepo: modelRepo)
        } catch is CancellationError {
            throw SynthesizeError.canceled
        } catch {
            self.logger.error(
                "talk mlx load failed: \(error.localizedDescription, privacy: .public)")
            throw SynthesizeError.modelLoadFailed(modelRepo)
        }

        // Checked outside the do/catch so this `.canceled` cannot be caught by the catch-all above
        // and rewritten into `.modelLoadFailed`.
        guard self.currentToken == token else {
            throw SynthesizeError.canceled
        }
        self.model = loaded
        self.modelRepo = modelRepo
        return loaded
    }

    /// Trims `modelRepo` and substitutes `defaultModelRepo` when nothing usable remains.
    private static func resolvedModelRepo(_ modelRepo: String?) -> String {
        let trimmed = modelRepo?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
        return trimmed.isEmpty ? Self.defaultModelRepo : trimmed
    }

    /// Encodes floating-point samples as a mono, 16-bit PCM WAV file.
    ///
    /// - Parameters:
    ///   - samples: Amplitudes; values outside `-1...1` are clamped, NaN is written as silence.
    ///   - sampleRate: Samples per second, rounded to the nearest integer for the header.
    /// - Returns: A 44-byte RIFF/WAVE header followed by little-endian sample data.
    /// - Note: The integer conversions trap if the sample rate or the data size does not fit in `UInt32`.
    private static func makeWavData(samples: [Float], sampleRate: Double) -> Data {
        let channels: UInt16 = 1
        let bitsPerSample: UInt16 = 16
        let blockAlign = channels * (bitsPerSample / 8)
        let sampleRateInt = UInt32(sampleRate.rounded())
        let byteRate = sampleRateInt * UInt32(blockAlign)
        let dataSize = UInt32(samples.count) * UInt32(blockAlign)

        var data = Data(capacity: Int(44 + dataSize))
        data.append(contentsOf: [0x52, 0x49, 0x46, 0x46]) // RIFF
        data.appendLEUInt32(36 + dataSize) // bytes remaining after this size field
        data.append(contentsOf: [0x57, 0x41, 0x56, 0x45]) // WAVE

        data.append(contentsOf: [0x66, 0x6D, 0x74, 0x20]) // fmt
        data.appendLEUInt32(16) // size of the fmt chunk body
        data.appendLEUInt16(1) // format tag 1 = PCM
        data.appendLEUInt16(channels)
        data.appendLEUInt32(sampleRateInt)
        data.appendLEUInt32(byteRate)
        data.appendLEUInt16(blockAlign)
        data.appendLEUInt16(bitsPerSample)

        data.append(contentsOf: [0x64, 0x61, 0x74, 0x61]) // data
        data.appendLEUInt32(dataSize)

        for sample in samples {
            // NaN carries no usable amplitude, so it is written as silence rather than clamped.
            let clamped: Float = sample.isNaN ? 0 : max(-1.0, min(1.0, sample))
            let scaled = Int16((clamped * Float(Int16.max)).rounded())
            data.appendLEInt16(scaled)
        }
        return data
    }
}
|
||||
|
||||
// `@unchecked`: the class keeps mutable state (`currentToken`, `model`, `modelRepo`) without locking,
// so the compiler cannot verify this conformance; it relies on callers serializing access.
extension TalkMLXSpeechSynthesizer: @unchecked Sendable {}
|
||||
|
||||
/// Wrapper around a loaded speech model that exposes only the two operations the synthesizer uses.
private struct UncheckedSpeechModel {
    /// The wrapped model instance.
    let raw: any SpeechGenerationModel

    /// Returns the wrapped model's `sampleRate`.
    func sampleRateValue() -> Int {
        return self.raw.sampleRate
    }

    /// Generates speech for `text` and returns the result converted to `Float` samples.
    ///
    /// No reference audio or reference text is supplied to the model.
    ///
    /// - Parameters:
    ///   - text: Text to speak.
    ///   - voice: Voice passed through to the model; may be `nil`.
    ///   - language: Language passed through to the model; may be `nil`.
    /// - Throws: Whatever the model's `generate` call throws.
    func generateAudio(
        text: String,
        voice: String?,
        language: String?) async throws -> [Float] {
        let samples = try await self.raw
            .generate(
                text: text,
                voice: voice,
                refAudio: nil,
                refText: nil,
                language: language)
            .asArray(Float.self)
        return samples
    }
}
|
||||
|
||||
// NOTE(review): `@unchecked` asserts that sharing the wrapped model across concurrency domains is safe
// without compiler verification — presumably fine because the owner serializes calls; confirm.
extension UncheckedSpeechModel: @unchecked Sendable {}
|
||||
|
||||
extension Data {
    /// Appends `value` as two bytes, least-significant byte first.
    fileprivate mutating func appendLEUInt16(_ value: UInt16) {
        self.appendLittleEndianBytes(of: value)
    }

    /// Appends `value` as four bytes, least-significant byte first.
    fileprivate mutating func appendLEUInt32(_ value: UInt32) {
        self.appendLittleEndianBytes(of: value)
    }

    /// Appends `value` as two bytes (two's complement), least-significant byte first.
    fileprivate mutating func appendLEInt16(_ value: Int16) {
        self.appendLittleEndianBytes(of: value)
    }

    /// Shared implementation: appends the in-memory bytes of `value.littleEndian`.
    private mutating func appendLittleEndianBytes<Value: FixedWidthInteger>(of value: Value) {
        // Module-qualified so the global function is used rather than `Data.withUnsafeBytes`.
        Swift.withUnsafeBytes(of: value.littleEndian) { rawBytes in
            self.append(contentsOf: rawBytes)
        }
    }
}
|
||||
|
||||
// swiftformat:enable wrap wrapMultilineStatementBraces trailingCommas redundantSelf extensionAccessControl
|
||||
@@ -44,7 +44,13 @@ enum TalkModeGatewayConfigParser {
|
||||
acc[key] = value
|
||||
} ?? [:]
|
||||
let model = activeConfig?["modelId"]?.stringValue?.trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
let resolvedModel = (model?.isEmpty == false) ? model! : defaultModelIdFallback
|
||||
let resolvedModel: String? = if model?.isEmpty == false {
|
||||
model!
|
||||
} else if activeProvider == defaultProvider {
|
||||
defaultModelIdFallback
|
||||
} else {
|
||||
nil
|
||||
}
|
||||
let outputFormat = activeConfig?["outputFormat"]?.stringValue
|
||||
let interrupt = talk?["interruptOnSpeech"]?.boolValue
|
||||
let apiKey = activeConfig?["apiKey"]?.stringValue
|
||||
|
||||
@@ -10,6 +10,7 @@ actor TalkModeRuntime {
|
||||
|
||||
/// Which speech backends `playAssistant` tries for one utterance.
enum PlaybackPlan: Equatable {
|
||||
/// Try ElevenLabs with the given credentials and voice.
case elevenLabsThenSystemVoice(apiKey: String, voiceId: String)
|
||||
/// Try the local MLX synthesizer; on a non-cancel failure, fall back to the system voice.
case mlxThenSystemVoice
|
||||
/// Use the system voice directly.
case systemVoiceOnly
|
||||
}
|
||||
|
||||
@@ -17,6 +18,8 @@ actor TalkModeRuntime {
|
||||
private let ttsLogger = Logger(subsystem: "ai.openclaw", category: "talk.tts")
|
||||
private static let defaultModelIdFallback = "eleven_v3"
|
||||
private static let defaultTalkProvider = "elevenlabs"
|
||||
private static let mlxTalkProvider = "mlx"
|
||||
private static let systemTalkProvider = "system"
|
||||
private static let defaultSilenceTimeoutMs = TalkDefaults.silenceTimeoutMs
|
||||
|
||||
private final class RMSMeter: @unchecked Sendable {
|
||||
@@ -65,6 +68,7 @@ actor TalkModeRuntime {
|
||||
private var modelOverrideActive = false
|
||||
private var defaultOutputFormat: String?
|
||||
private var interruptOnSpeech: Bool = true
|
||||
private var activeTalkProvider = TalkModeRuntime.defaultTalkProvider
|
||||
private var lastInterruptedAtSeconds: Double?
|
||||
private var voiceAliases: [String: String] = [:]
|
||||
private var lastSpokenText: String?
|
||||
@@ -462,7 +466,7 @@ actor TalkModeRuntime {
|
||||
private func playAssistant(text: String) async {
|
||||
guard let input = await self.preparePlaybackInput(text: text) else { return }
|
||||
|
||||
switch Self.playbackPlan(apiKey: input.apiKey, voiceId: input.voiceId) {
|
||||
switch Self.playbackPlan(provider: input.provider, apiKey: input.apiKey, voiceId: input.voiceId) {
|
||||
case let .elevenLabsThenSystemVoice(apiKey, voiceId):
|
||||
do {
|
||||
try await self.playElevenLabs(input: input, apiKey: apiKey, voiceId: voiceId)
|
||||
@@ -477,6 +481,23 @@ actor TalkModeRuntime {
|
||||
self.ttsLogger.error("talk system voice failed: \(error.localizedDescription, privacy: .public)")
|
||||
}
|
||||
}
|
||||
case .mlxThenSystemVoice:
|
||||
do {
|
||||
try await self.playMLX(input: input)
|
||||
} catch TalkMLXSpeechSynthesizer.SynthesizeError.canceled {
|
||||
self.ttsLogger.info("talk mlx canceled")
|
||||
return
|
||||
} catch {
|
||||
self.ttsLogger
|
||||
.error(
|
||||
"talk MLX failed: \(error.localizedDescription, privacy: .public); " +
|
||||
"falling back to system voice")
|
||||
do {
|
||||
try await self.playSystemVoice(input: input)
|
||||
} catch {
|
||||
self.ttsLogger.error("talk system voice failed: \(error.localizedDescription, privacy: .public)")
|
||||
}
|
||||
}
|
||||
case .systemVoiceOnly:
|
||||
do {
|
||||
try await self.playSystemVoice(input: input)
|
||||
@@ -491,19 +512,30 @@ actor TalkModeRuntime {
|
||||
}
|
||||
}
|
||||
|
||||
static func playbackPlan(apiKey: String?, voiceId: String?) -> PlaybackPlan {
|
||||
guard let apiKey, !apiKey.isEmpty, let voiceId else {
|
||||
static func playbackPlan(provider: String, apiKey: String?, voiceId: String?) -> PlaybackPlan {
|
||||
switch provider {
|
||||
case self.defaultTalkProvider:
|
||||
guard let apiKey, !apiKey.isEmpty, let voiceId else {
|
||||
return .systemVoiceOnly
|
||||
}
|
||||
return .elevenLabsThenSystemVoice(apiKey: apiKey, voiceId: voiceId)
|
||||
case self.mlxTalkProvider:
|
||||
return .mlxThenSystemVoice
|
||||
case self.systemTalkProvider:
|
||||
return .systemVoiceOnly
|
||||
default:
|
||||
return .systemVoiceOnly
|
||||
}
|
||||
return .elevenLabsThenSystemVoice(apiKey: apiKey, voiceId: voiceId)
|
||||
}
|
||||
|
||||
private struct TalkPlaybackInput {
|
||||
let generation: Int
|
||||
let provider: String
|
||||
let cleanedText: String
|
||||
let directive: TalkDirective?
|
||||
let apiKey: String?
|
||||
let voiceId: String?
|
||||
let voicePreset: String?
|
||||
let language: String?
|
||||
let synthTimeoutSeconds: Double
|
||||
}
|
||||
@@ -552,18 +584,20 @@ actor TalkModeRuntime {
|
||||
resolvedVoice ??
|
||||
self.currentVoiceId ??
|
||||
self.defaultVoiceId
|
||||
let voicePreset = preferredVoice
|
||||
let provider = self.activeTalkProvider
|
||||
|
||||
let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)
|
||||
|
||||
let voiceId: String? = if let apiKey, !apiKey.isEmpty {
|
||||
let voiceId: String? = if provider == Self.defaultTalkProvider, let apiKey, !apiKey.isEmpty {
|
||||
await self.resolveVoiceId(preferred: preferredVoice, apiKey: apiKey)
|
||||
} else {
|
||||
nil
|
||||
}
|
||||
|
||||
if apiKey?.isEmpty != false {
|
||||
if provider == Self.defaultTalkProvider, apiKey?.isEmpty != false {
|
||||
self.ttsLogger.warning("talk missing ELEVENLABS_API_KEY; falling back to system voice")
|
||||
} else if voiceId == nil {
|
||||
} else if provider == Self.defaultTalkProvider, voiceId == nil {
|
||||
self.ttsLogger.warning("talk missing voiceId; falling back to system voice")
|
||||
} else if let voiceId {
|
||||
self.ttsLogger
|
||||
@@ -579,15 +613,21 @@ actor TalkModeRuntime {
|
||||
|
||||
return TalkPlaybackInput(
|
||||
generation: gen,
|
||||
provider: provider,
|
||||
cleanedText: cleaned,
|
||||
directive: directive,
|
||||
apiKey: apiKey,
|
||||
voiceId: voiceId,
|
||||
voicePreset: voicePreset,
|
||||
language: language,
|
||||
synthTimeoutSeconds: synthTimeoutSeconds)
|
||||
}
|
||||
|
||||
private func playElevenLabs(input: TalkPlaybackInput, apiKey: String, voiceId: String) async throws {
|
||||
private func playElevenLabs(
|
||||
input: TalkPlaybackInput,
|
||||
apiKey: String,
|
||||
voiceId: String) async throws
|
||||
{
|
||||
let desiredOutputFormat = input.directive?.outputFormat ?? self.defaultOutputFormat ?? "pcm_44100"
|
||||
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(desiredOutputFormat)
|
||||
if outputFormat == nil, !desiredOutputFormat.isEmpty {
|
||||
@@ -696,6 +736,39 @@ actor TalkModeRuntime {
|
||||
self.ttsLogger.info("talk system voice done")
|
||||
}
|
||||
|
||||
/// Synthesizes `input.cleanedText` with the MLX backend and plays the resulting audio.
///
/// Synthesis is bounded by `input.synthTimeoutSeconds`; on timeout the MLX request is invalidated and
/// `SynthesizeError.timedOut` is rethrown. Returns without speaking when `prepareForPlayback`
/// reports that this generation is no longer current.
///
/// - Throws: Errors from synthesis, `SynthesizeError.timedOut`, or
///   `SynthesizeError.audioPlaybackFailed` when playback neither finished nor was interrupted.
private func playMLX(input: TalkPlaybackInput) async throws {
    typealias MLXError = TalkMLXSpeechSynthesizer.SynthesizeError

    self.ttsLogger.info("talk mlx start chars=\(input.cleanedText.count, privacy: .public)")
    if self.interruptOnSpeech {
        let stillCurrent = await self.prepareForPlayback(generation: input.generation)
        guard stillCurrent else { return }
    }
    await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
    self.phase = .speaking

    // A per-utterance directive takes precedence over the runtime's current model id.
    let requestedRepo = input.directive?.modelId ?? self.currentModelId
    let wavData: Data
    do {
        wavData = try await AsyncTimeout.withTimeout(
            seconds: input.synthTimeoutSeconds,
            onTimeout: { MLXError.timedOut },
            operation: { [self] in
                try await self.synthesizeMLXVoice(
                    text: input.cleanedText,
                    modelRepo: requestedRepo,
                    language: input.language,
                    voicePreset: input.voicePreset)
            })
    } catch MLXError.timedOut {
        // Invalidate the synthesizer's token so the abandoned request reports itself as canceled.
        self.stopMLXVoice()
        throw MLXError.timedOut
    }

    let outcome = await self.playTalkAudio(data: wavData)
    // Playback that ended without finishing and without an interruption time counts as a failure.
    guard outcome.finished || outcome.interruptedAt != nil else {
        throw MLXError.audioPlaybackFailed
    }
    self.ttsLogger.info("talk mlx done")
}
|
||||
|
||||
private func prepareForPlayback(generation: Int) async -> Bool {
|
||||
await self.startRecognition()
|
||||
return self.isCurrent(generation)
|
||||
@@ -750,10 +823,13 @@ actor TalkModeRuntime {
|
||||
|
||||
func stopSpeaking(reason: TalkStopReason) async {
|
||||
let usePCM = self.lastPlaybackWasPCM
|
||||
let interruptedAt = usePCM ? await self.stopPCM() : await self.stopMP3()
|
||||
let remoteInterruptedAt = usePCM ? await self.stopPCM() : await self.stopMP3()
|
||||
_ = usePCM ? await self.stopMP3() : await self.stopPCM()
|
||||
let localInterruptedAt = await self.stopTalkAudio()
|
||||
await TalkSystemSpeechSynthesizer.shared.stop()
|
||||
self.stopMLXVoice()
|
||||
guard self.phase == .speaking else { return }
|
||||
let interruptedAt = remoteInterruptedAt ?? localInterruptedAt
|
||||
if reason == .speech, let interruptedAt {
|
||||
self.lastInterruptedAtSeconds = interruptedAt
|
||||
}
|
||||
@@ -795,6 +871,33 @@ extension TalkModeRuntime {
|
||||
StreamingAudioPlayer.shared.stop()
|
||||
}
|
||||
|
||||
/// Plays `data` through the shared `TalkAudioPlayer` on the main actor and returns its result.
@MainActor
private func playTalkAudio(data: Data) async -> TalkPlaybackResult {
    let outcome = await TalkAudioPlayer.shared.play(data: data)
    return outcome
}
|
||||
|
||||
/// Stops the shared `TalkAudioPlayer` on the main actor and returns the value its `stop()` reports.
@MainActor
private func stopTalkAudio() -> Double? {
    let interruptedAt = TalkAudioPlayer.shared.stop()
    return interruptedAt
}
|
||||
|
||||
/// Forwards a synthesis request to the shared `TalkMLXSpeechSynthesizer` and returns its audio data.
///
/// - Throws: Whatever `TalkMLXSpeechSynthesizer.synthesize` throws.
private func synthesizeMLXVoice(
    text: String,
    modelRepo: String?,
    language: String?,
    voicePreset: String?) async throws -> Data
{
    let synthesizer = TalkMLXSpeechSynthesizer.shared
    return try await synthesizer.synthesize(
        text: text,
        modelRepo: modelRepo,
        language: language,
        voicePreset: voicePreset)
}
|
||||
|
||||
/// Asks the shared `TalkMLXSpeechSynthesizer` to invalidate its in-flight request.
private func stopMLXVoice() {
    let synthesizer = TalkMLXSpeechSynthesizer.shared
    synthesizer.stop()
}
|
||||
|
||||
// MARK: - Config
|
||||
|
||||
private func reloadConfig() async {
|
||||
@@ -810,6 +913,7 @@ extension TalkModeRuntime {
|
||||
}
|
||||
self.defaultOutputFormat = cfg.outputFormat
|
||||
self.interruptOnSpeech = cfg.interruptOnSpeech
|
||||
self.activeTalkProvider = cfg.activeProvider
|
||||
self.silenceWindow = TimeInterval(cfg.silenceTimeoutMs) / 1000
|
||||
self.apiKey = cfg.apiKey
|
||||
let hasApiKey = (cfg.apiKey?.isEmpty == false)
|
||||
@@ -817,7 +921,8 @@ extension TalkModeRuntime {
|
||||
let modelLabel = (cfg.modelId?.isEmpty == false) ? cfg.modelId! : "none"
|
||||
self.logger
|
||||
.info(
|
||||
"talk config voiceId=\(voiceLabel, privacy: .public) " +
|
||||
"talk config provider=\(cfg.activeProvider, privacy: .public) " +
|
||||
"talk config voiceId=\(voiceLabel, privacy: .public) " +
|
||||
"modelId=\(modelLabel, privacy: .public) " +
|
||||
"apiKey=\(hasApiKey, privacy: .public) " +
|
||||
"interrupt=\(cfg.interruptOnSpeech, privacy: .public) " +
|
||||
@@ -859,11 +964,17 @@ extension TalkModeRuntime {
|
||||
await MainActor.run {
|
||||
AppStateStore.shared.seamColorHex = parsed.seamColorHex
|
||||
}
|
||||
if parsed.activeProvider != Self.defaultTalkProvider {
|
||||
self.ttsLogger
|
||||
.info("talk provider \(parsed.activeProvider, privacy: .public) unsupported; using system voice")
|
||||
} else if parsed.normalizedPayload {
|
||||
if parsed.activeProvider == Self.defaultTalkProvider {
|
||||
self.ttsLogger.info("talk config provider from talk.resolved")
|
||||
} else if parsed.activeProvider == Self.mlxTalkProvider ||
|
||||
parsed.activeProvider == Self.systemTalkProvider
|
||||
{
|
||||
self.ttsLogger.info(
|
||||
"talk provider \(parsed.activeProvider, privacy: .public) active")
|
||||
} else {
|
||||
self.ttsLogger
|
||||
.info(
|
||||
"talk provider \(parsed.activeProvider, privacy: .public) unsupported; using system voice")
|
||||
}
|
||||
return parsed
|
||||
} catch {
|
||||
|
||||
@@ -0,0 +1,48 @@
|
||||
import OpenClawProtocol
|
||||
import Testing
|
||||
@testable import OpenClaw
|
||||
|
||||
/// Parser tests for Talk configuration delivered in a gateway config snapshot.
struct TalkModeGatewayConfigTests {
    /// With `mlx` selected, the parser must not fill in the default model id or the environment
    /// API key, while the provider's own `voiceId` is still reported.
    @Test func `mlx provider does not inherit elevenlabs defaults`() {
        let mlxSnapshot = ConfigSnapshot(
            path: nil,
            exists: true,
            raw: nil,
            hash: nil,
            parsed: nil,
            valid: true,
            config: [
                "talk": AnyCodable([
                    "provider": "mlx",
                    "providers": ["mlx": ["voiceId": "unused-voice"]],
                    "resolved": ["provider": "mlx", "config": ["voiceId": "unused-voice"]],
                ]),
            ],
            issues: nil)

        // Defaults and environment values below describe the default provider only.
        let talkConfig = TalkModeGatewayConfigParser.parse(
            snapshot: mlxSnapshot,
            defaultProvider: "elevenlabs",
            defaultModelIdFallback: "eleven_v3",
            defaultSilenceTimeoutMs: TalkDefaults.silenceTimeoutMs,
            envVoice: "env-voice",
            sagVoice: "sag-voice",
            envApiKey: "env-key")

        #expect(talkConfig.activeProvider == "mlx")
        #expect(talkConfig.modelId == nil)
        #expect(talkConfig.apiKey == nil)
        #expect(talkConfig.voiceId == "unused-voice")
    }
}
|
||||
@@ -13,11 +13,34 @@ struct TalkModeRuntimeSpeechTests {
|
||||
}
|
||||
|
||||
@Test func `playback plan falls back only from elevenlabs`() {
|
||||
#expect(
|
||||
TalkModeRuntime.playbackPlan(apiKey: "key", voiceId: "voice")
|
||||
== .elevenLabsThenSystemVoice(apiKey: "key", voiceId: "voice"))
|
||||
#expect(TalkModeRuntime.playbackPlan(apiKey: nil, voiceId: "voice") == .systemVoiceOnly)
|
||||
#expect(TalkModeRuntime.playbackPlan(apiKey: "key", voiceId: nil) == .systemVoiceOnly)
|
||||
#expect(TalkModeRuntime.playbackPlan(apiKey: "", voiceId: "voice") == .systemVoiceOnly)
|
||||
let elevenLabsPlan = TalkModeRuntime.playbackPlan(
|
||||
provider: "elevenlabs",
|
||||
apiKey: "key",
|
||||
voiceId: "voice"
|
||||
)
|
||||
let missingKeyPlan = TalkModeRuntime.playbackPlan(
|
||||
provider: "elevenlabs",
|
||||
apiKey: nil,
|
||||
voiceId: "voice"
|
||||
)
|
||||
let missingVoicePlan = TalkModeRuntime.playbackPlan(
|
||||
provider: "elevenlabs",
|
||||
apiKey: "key",
|
||||
voiceId: nil
|
||||
)
|
||||
let blankKeyPlan = TalkModeRuntime.playbackPlan(
|
||||
provider: "elevenlabs",
|
||||
apiKey: "",
|
||||
voiceId: "voice"
|
||||
)
|
||||
let mlxPlan = TalkModeRuntime.playbackPlan(provider: "mlx", apiKey: nil, voiceId: nil)
|
||||
let systemPlan = TalkModeRuntime.playbackPlan(provider: "system", apiKey: nil, voiceId: nil)
|
||||
|
||||
#expect(elevenLabsPlan == .elevenLabsThenSystemVoice(apiKey: "key", voiceId: "voice"))
|
||||
#expect(missingKeyPlan == .systemVoiceOnly)
|
||||
#expect(missingVoicePlan == .systemVoiceOnly)
|
||||
#expect(blankKeyPlan == .systemVoiceOnly)
|
||||
#expect(mlxPlan == .mlxThenSystemVoice)
|
||||
#expect(systemPlan == .systemVoiceOnly)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user