Files
openclaw/apps/ios/Sources/Voice/TalkModeGatewayConfig.swift
Nimrod Gutman 19e4c37c37 feat(ios): show Talk voice mode (#86798)
Merged via squash.

Prepared head SHA: bd24da3f3b
Co-authored-by: ngutman <1540134+ngutman@users.noreply.github.com>
Co-authored-by: ngutman <1540134+ngutman@users.noreply.github.com>
Reviewed-by: @ngutman
2026-05-26 17:31:31 +03:00

298 lines
11 KiB
Swift

import Foundation
import OpenClawKit
/// How a Talk session is executed, as resolved by `TalkModeGatewayConfigParser`
/// from the gateway's `talk.realtime` settings.
///
/// - `native`: chosen when there is no realtime section, the mode is not
///   `"realtime"`, the brain is unsupported, or the transport is unrecognised.
/// - `realtimeClient`: realtime mode with no transport given or `"webrtc"`.
/// - `realtimeRelay`: realtime mode with the `"gateway-relay"` transport.
enum TalkModeExecutionMode {
    case native, realtimeClient, realtimeRelay
}
/// Display-oriented description of the voice mode currently used by Talk.
///
/// Instances built by `TalkVoiceModeDescriptorBuilder` carry trimmed values,
/// with blank inputs collapsed to `nil`.
struct TalkVoiceModeDescriptor: Equatable {
    /// Primary, human-readable name of the voice mode.
    let title: String
    /// Optional secondary line with extra details (transport, model, voice, ...).
    let subtitle: String?
    /// Provider identifier, or `nil` when none is known.
    let providerId: String?
    /// Model identifier, or `nil` when none is known.
    let modelId: String?
    /// Voice identifier, or `nil` when none is known.
    let voiceId: String?
    /// Transport identifier, or `nil` when none is known.
    let transport: String?
    /// Whether this descriptor represents a realtime voice mode.
    let isRealtime: Bool

    /// Text for accessibility: the title alone, or `"title, subtitle"` when a
    /// non-empty subtitle is present.
    var accessibilityValue: String {
        guard let subtitle = self.subtitle, !subtitle.isEmpty else {
            return self.title
        }
        return [self.title, subtitle].joined(separator: ", ")
    }
}
/// Builds the `TalkVoiceModeDescriptor` shown for the active Talk voice mode.
enum TalkVoiceModeDescriptorBuilder {
    /// Separator placed between the individual subtitle details so that
    /// adjacent labels (transport, model, voice, ...) do not run together.
    private static let detailSeparator = " · "

    /// Creates a descriptor from raw provider/model/voice/transport values.
    ///
    /// Title selection:
    /// - realtime + provider `openai` + model `gpt-realtime-2` -> "GPT Realtime 2.0"
    /// - realtime + provider `openai` -> "OpenAI Realtime"
    /// - any other realtime provider -> `providerLabel`, or "Realtime Voice" when empty
    /// - provider `system` -> "iOS System Voice"
    /// - otherwise -> `providerLabel`, or "Talk Voice" when empty
    ///
    /// The subtitle lists, in order, the provider label (when not already the
    /// title), the transport label, the model id (unless the title already
    /// names it) and the voice label. It is `nil` when there are no details.
    ///
    /// - Parameters:
    ///   - providerId: Provider identifier; compared after trimming and lowercasing.
    ///   - providerLabel: Human-readable provider name; used as given (not trimmed).
    ///   - modelId: Optional model identifier; blank values are treated as absent.
    ///   - voiceId: Optional voice identifier; blank values are treated as absent.
    ///   - transport: Optional transport identifier; blank values are treated as absent.
    ///   - isRealtime: Whether the mode is a realtime voice mode.
    /// - Returns: The descriptor with normalised identifiers and display strings.
    static func build(
        providerId: String,
        providerLabel: String,
        modelId: String?,
        voiceId: String?,
        transport: String?,
        isRealtime: Bool) -> TalkVoiceModeDescriptor
    {
        let normalizedProvider = providerId.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
        let trimmedModel = Self.trimmed(modelId)
        let trimmedVoice = Self.trimmed(voiceId)
        let trimmedTransport = Self.trimmed(transport)

        let title = if isRealtime, normalizedProvider == "openai", trimmedModel == "gpt-realtime-2" {
            "GPT Realtime 2.0"
        } else if isRealtime, normalizedProvider == "openai" {
            "OpenAI Realtime"
        } else if isRealtime {
            providerLabel.isEmpty ? "Realtime Voice" : providerLabel
        } else if normalizedProvider == "system" {
            "iOS System Voice"
        } else {
            providerLabel.isEmpty ? "Talk Voice" : providerLabel
        }

        var details: [String] = []
        // With the title rules above a non-empty provider label of a non-OpenAI
        // realtime provider is always the title itself, so this only adds the
        // label if the title rules change to something other than the label.
        if isRealtime, normalizedProvider != "openai", !providerLabel.isEmpty, providerLabel != title {
            details.append(providerLabel)
        }
        if let trimmedTransport {
            details.append(Self.transportLabel(trimmedTransport))
        }
        // Skip the model when the title already spells it out.
        if let trimmedModel, title != "GPT Realtime 2.0" || trimmedModel != "gpt-realtime-2" {
            details.append(trimmedModel)
        }
        if let trimmedVoice {
            details.append(Self.voiceLabel(trimmedVoice))
        }

        return TalkVoiceModeDescriptor(
            title: title,
            // Join with a visible separator; an empty separator would glue the
            // labels together into one unreadable word.
            subtitle: details.isEmpty ? nil : details.joined(separator: Self.detailSeparator),
            providerId: normalizedProvider.isEmpty ? nil : normalizedProvider,
            modelId: trimmedModel,
            voiceId: trimmedVoice,
            transport: trimmedTransport,
            isRealtime: isRealtime)
    }

    /// Trims surrounding whitespace/newlines and maps blank or `nil` input to `nil`.
    private static func trimmed(_ value: String?) -> String? {
        let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
        return trimmed.isEmpty ? nil : trimmed
    }

    /// Returns the display label for a voice listed in
    /// `TalkModeRealtimeVoiceSelection.voices`; any other voice id is shown verbatim.
    private static func voiceLabel(_ voice: String) -> String {
        TalkModeRealtimeVoiceSelection.voices.contains(voice)
            ? TalkModeRealtimeVoiceSelection.label(for: voice)
            : voice
    }

    /// Maps a transport identifier (case-insensitive, trimmed) to its display label.
    ///
    /// Unknown non-empty identifiers are returned in their trimmed, lowercased
    /// form; an empty identifier falls back to "Native".
    private static func transportLabel(_ transport: String) -> String {
        switch transport.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() {
        case "webrtc":
            "Native WebRTC"
        case "gateway-relay":
            "Gateway Relay"
        case "provider-websocket":
            "Provider WebSocket"
        case "managed-room":
            "Managed Room"
        case "native":
            "Native"
        case let value where !value.isEmpty:
            value
        default:
            "Native"
        }
    }
}
/// Talk provider choices; each case's raw value is its string representation.
enum TalkModeProviderSelection: String, CaseIterable, Identifiable {
    case gatewayDefault = "gateway"
    case nativeElevenLabs = "elevenlabs"
    case openAIRealtime = "openai-realtime"

    /// Storage key for the selection (where it is persisted is not visible here).
    static let storageKey = "talk.providerSelection"

    /// `Identifiable` conformance: the raw value doubles as the identity.
    var id: String {
        return self.rawValue
    }

    /// Human-readable name of the selection.
    var label: String {
        switch self {
        case .gatewayDefault:
            return "Gateway Default"
        case .nativeElevenLabs:
            return "ElevenLabs"
        case .openAIRealtime:
            return "Realtime-2 (OpenAI)"
        }
    }

    /// Resolves a stored raw string to a selection.
    ///
    /// The value is trimmed of whitespace/newlines and matched exactly
    /// (case-sensitively) against the raw values; `nil`, blank or unknown
    /// input yields `.gatewayDefault`.
    static func resolved(_ raw: String?) -> TalkModeProviderSelection {
        guard let raw else { return .gatewayDefault }
        let candidate = raw.trimmingCharacters(in: .whitespacesAndNewlines)
        return Self(rawValue: candidate) ?? .gatewayDefault
    }
}
/// Namespace for the realtime voice override: the accepted voice identifiers
/// plus helpers to validate and display them.
enum TalkModeRealtimeVoiceSelection {
    /// Storage key for the voice override (where it is persisted is not visible here).
    static let storageKey = "talk.realtime.voiceSelection"

    /// Voice identifiers accepted by `resolvedOverride(_:)`, all lowercase.
    static let voices = [
        "alloy",
        "ash",
        "ballad",
        "coral",
        "echo",
        "sage",
        "shimmer",
        "verse",
        "marin",
        "cedar",
    ]

    /// Validates a stored override.
    ///
    /// - Parameter raw: The stored value; trimmed and lowercased before matching.
    /// - Returns: The normalised voice id when it is listed in `voices`,
    ///   otherwise `nil` (also for `nil` or blank input).
    static func resolvedOverride(_ raw: String?) -> String? {
        guard let raw else { return nil }
        let candidate = raw.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
        if candidate.isEmpty || !Self.voices.contains(candidate) {
            return nil
        }
        return candidate
    }

    /// Display label for a voice id: the first character uppercased, the rest
    /// left untouched. An empty id yields an empty label.
    static func label(for voice: String) -> String {
        guard let initial = voice.first else { return "" }
        return String(initial).uppercased() + voice.dropFirst()
    }
}
/// Snapshot of the Talk settings extracted from a gateway config by
/// `TalkModeGatewayConfigParser.parse`. Property order defines the memberwise
/// initializer, so it must not be rearranged.
struct TalkModeGatewayConfigState {
/// Provider of the selected provider config, or the caller's default provider
/// when no selection was resolved.
let activeProvider: String
/// `true` only when the resolved selection reports a normalized payload.
let normalizedPayload: Bool
/// `true` when a `talk` section exists but no provider config could be selected.
let missingResolvedPayload: Bool
/// Execution mode derived from the `talk.realtime` section.
let executionMode: TalkModeExecutionMode
/// First non-blank of `voiceId` / `voice` in the active provider config.
let defaultVoiceId: String?
/// Voice aliases from the active provider config: trimmed, lowercased alias
/// mapped to trimmed voice id; blank entries are dropped.
let voiceAliases: [String: String]
/// First non-blank of `modelId` / `model` in the active provider config,
/// otherwise the caller-supplied fallback.
let defaultModelId: String
/// Non-blank `outputFormat` of the active provider config, if any.
let defaultOutputFormat: String?
/// `talk.realtime.provider`, or the only key of `talk.realtime.providers`
/// when exactly one provider is listed.
let realtimeProvider: String?
/// Realtime model: `talk.realtime.model`, then the realtime provider's
/// `model`, then the caller-supplied fallback (the parser always sets a value).
let realtimeModelId: String?
/// Realtime voice: `talk.realtime.voice`, then the realtime provider's `voice`.
let realtimeVoiceId: String?
/// Trimmed `apiKey` of the active provider config. NOTE: the parser does not
/// collapse a blank key to `nil`, so this may be an empty string.
let rawConfigApiKey: String?
/// `talk.interruptOnSpeech`, when present as a boolean.
let interruptOnSpeech: Bool?
/// Silence timeout resolved by `TalkConfigParsing.resolvedSilenceTimeoutMs`.
let silenceTimeoutMs: Int
/// Speech locale resolved by `TalkConfigParsing.resolvedSpeechLocaleID`.
let speechLocaleID: String?
}
/// Turns the gateway's raw configuration dictionary into a
/// `TalkModeGatewayConfigState`.
enum TalkModeGatewayConfigParser {
    /// Parses the `talk` section of `config`.
    ///
    /// - Parameters:
    ///   - config: Raw gateway configuration; only the `talk` entry is read.
    ///   - defaultProvider: Provider used for selection and as the active
    ///     provider when no provider config is resolved.
    ///   - defaultModelIdFallback: Model id used when the active provider
    ///     config names no model.
    ///   - defaultRealtimeModelIdFallback: Realtime model id used when neither
    ///     the realtime section nor its provider config names a model.
    ///   - defaultSilenceTimeoutMs: Fallback handed to
    ///     `TalkConfigParsing.resolvedSilenceTimeoutMs`.
    /// - Returns: The resolved Talk configuration state.
    static func parse(
        config: [String: Any],
        defaultProvider: String,
        defaultModelIdFallback: String,
        defaultRealtimeModelIdFallback: String,
        defaultSilenceTimeoutMs: Int) -> TalkModeGatewayConfigState
    {
        let talkSection = TalkConfigParsing.bridgeFoundationDictionary(config["talk"] as? [String: Any])
        let providerSelection = TalkConfigParsing.selectProviderConfig(
            talkSection,
            defaultProvider: defaultProvider,
            allowLegacyFallback: false)
        let providerConfig = providerSelection?.config

        // Realtime settings live under `talk.realtime`; per-provider values
        // under `talk.realtime.providers` act as a second-level fallback.
        let realtimeSection = talkSection?["realtime"]?.dictionaryValue
        let realtimeProviders = realtimeSection?["providers"]?.dictionaryValue
        let realtimeProviderId = Self.firstString(realtimeSection, keys: ["provider"])
            ?? Self.singleRealtimeProviderId(realtimeProviders)
        let realtimeProviderSection = Self.realtimeProviderConfig(
            providers: realtimeProviders,
            provider: realtimeProviderId)
        let resolvedRealtimeModelId = Self.firstString(realtimeSection, keys: ["model"])
            ?? Self.firstString(realtimeProviderSection, keys: ["model"])
            ?? defaultRealtimeModelIdFallback
        let resolvedRealtimeVoiceId = Self.firstString(realtimeSection, keys: ["voice"])
            ?? Self.firstString(realtimeProviderSection, keys: ["voice"])

        // `firstString` never returns a blank value, so nil-coalescing alone
        // is enough to apply the model fallback.
        let resolvedModelId = Self.firstString(providerConfig, keys: ["modelId", "model"])
            ?? defaultModelIdFallback
        let trimmedApiKey = providerConfig?["apiKey"]?.stringValue?
            .trimmingCharacters(in: .whitespacesAndNewlines)

        return TalkModeGatewayConfigState(
            activeProvider: providerSelection?.provider ?? defaultProvider,
            normalizedPayload: providerSelection?.normalizedPayload == true,
            missingResolvedPayload: talkSection != nil && providerSelection == nil,
            executionMode: Self.resolvedExecutionMode(realtimeSection),
            defaultVoiceId: Self.firstString(providerConfig, keys: ["voiceId", "voice"]),
            voiceAliases: Self.normalizedVoiceAliases(providerConfig),
            defaultModelId: resolvedModelId,
            defaultOutputFormat: Self.firstString(providerConfig, keys: ["outputFormat"]),
            realtimeProvider: realtimeProviderId,
            realtimeModelId: resolvedRealtimeModelId,
            realtimeVoiceId: resolvedRealtimeVoiceId,
            rawConfigApiKey: trimmedApiKey,
            interruptOnSpeech: talkSection?["interruptOnSpeech"]?.boolValue,
            silenceTimeoutMs: TalkConfigParsing.resolvedSilenceTimeoutMs(
                talkSection,
                fallback: defaultSilenceTimeoutMs),
            speechLocaleID: TalkConfigParsing.resolvedSpeechLocaleID(talkSection))
    }

    /// Reads `voiceAliases` from a provider config, mapping each trimmed,
    /// lowercased alias to its trimmed voice id. Entries with a non-string
    /// value, a blank alias or a blank voice id are skipped.
    private static func normalizedVoiceAliases(_ config: [String: AnyCodable]?) -> [String: String] {
        guard let rawAliases = config?["voiceAliases"]?.dictionaryValue else { return [:] }
        return rawAliases.reduce(into: [String: String]()) { aliases, entry in
            guard let rawVoiceId = entry.value.stringValue else { return }
            let alias = entry.key.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
            let voiceId = rawVoiceId.trimmingCharacters(in: .whitespacesAndNewlines)
            guard !alias.isEmpty, !voiceId.isEmpty else { return }
            aliases[alias] = voiceId
        }
    }

    /// Returns the trimmed string stored under the first of `keys` whose value
    /// is a non-blank string, or `nil` when there is no such key (or no config).
    private static func firstString(_ config: [String: AnyCodable]?, keys: [String]) -> String? {
        guard let config else { return nil }
        return keys.lazy
            .compactMap { config[$0]?.stringValue?.trimmingCharacters(in: .whitespacesAndNewlines) }
            .first(where: { !$0.isEmpty })
    }

    /// Derives the execution mode from the realtime section.
    ///
    /// Realtime execution requires `mode == "realtime"` and a brain that is
    /// either absent or `"agent-consult"`; all comparisons are case-insensitive.
    /// The transport then picks relay (`"gateway-relay"`) or client (absent or
    /// `"webrtc"`); anything else falls back to native.
    private static func resolvedExecutionMode(_ realtime: [String: AnyCodable]?) -> TalkModeExecutionMode {
        guard let realtime else { return .native }
        let requestedMode = Self.firstString(realtime, keys: ["mode"])?.lowercased()
        let requestedBrain = Self.firstString(realtime, keys: ["brain"])?.lowercased()
        let brainIsSupported = requestedBrain == nil || requestedBrain == "agent-consult"
        guard requestedMode == "realtime", brainIsSupported else {
            return .native
        }
        switch Self.firstString(realtime, keys: ["transport"])?.lowercased() {
        case .some("gateway-relay"):
            return .realtimeRelay
        case .none, .some("webrtc"):
            return .realtimeClient
        default:
            return .native
        }
    }

    /// Returns the trimmed key of `providers` when it holds exactly one entry
    /// with a non-blank key; otherwise `nil`.
    private static func singleRealtimeProviderId(_ providers: [String: AnyCodable]?) -> String? {
        guard let providers, providers.count == 1, let onlyKey = providers.keys.first else {
            return nil
        }
        let providerId = onlyKey.trimmingCharacters(in: .whitespacesAndNewlines)
        return providerId.isEmpty ? nil : providerId
    }

    /// Looks up the config dictionary of a realtime provider.
    ///
    /// With an explicit `provider` only that exact key is consulted. Without
    /// one, the single entry of `providers` is used when there is exactly one.
    private static func realtimeProviderConfig(
        providers: [String: AnyCodable]?,
        provider: String?) -> [String: AnyCodable]?
    {
        guard let providers else { return nil }
        if let provider {
            return providers[provider]?.dictionaryValue
        }
        return providers.count == 1 ? providers.values.first?.dictionaryValue : nil
    }
}