talk: add configurable silence timeout

This commit is contained in:
dano does design
2026-03-08 17:58:15 +11:00
committed by Peter Steinberger
parent 097c588a6b
commit 6ff7e8f42e
18 changed files with 162 additions and 9 deletions

View File

@@ -59,8 +59,8 @@ class TalkModeManager(
private const val tag = "TalkMode"
private const val defaultModelIdFallback = "eleven_v3"
private const val defaultOutputFormatFallback = "pcm_24000"
private const val defaultTalkProvider = "elevenlabs"
private const val silenceWindowMs = 500L
private const val defaultTalkProvider = "elevenlabs"
private const val defaultSilenceTimeoutMs = 700L
private const val listenWatchdogMs = 12_000L
private const val chatFinalWaitWithSubscribeMs = 45_000L
private const val chatFinalWaitWithoutSubscribeMs = 6_000L
@@ -105,6 +105,14 @@ private const val defaultTalkProvider = "elevenlabs"
normalizedPayload = false,
)
}
internal fun resolvedSilenceTimeoutMs(talk: JsonObject?): Long {
val timeout = talk?.get("silenceTimeoutMs").asDoubleOrNull() ?: return defaultSilenceTimeoutMs
if (timeout <= 0 || timeout % 1.0 != 0.0 || timeout > Long.MAX_VALUE.toDouble()) {
return defaultSilenceTimeoutMs
}
return timeout.toLong()
}
}
private val mainHandler = Handler(Looper.getMainLooper())
@@ -134,7 +142,7 @@ private const val defaultTalkProvider = "elevenlabs"
private var listeningMode = false
private var silenceJob: Job? = null
private val silenceWindowMs = 700L
private var silenceWindowMs = defaultSilenceTimeoutMs
private var lastTranscript: String = ""
private var lastHeardAtMs: Long? = null
private var lastSpokenText: String? = null
@@ -1411,6 +1419,7 @@ private const val defaultTalkProvider = "elevenlabs"
activeConfig?.get("outputFormat")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
val key = activeConfig?.get("apiKey")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
val interrupt = talk?.get("interruptOnSpeech")?.asBooleanOrNull()
val silenceTimeoutMs = resolvedSilenceTimeoutMs(talk)
if (!isCanonicalMainSessionKey(mainSessionKey)) {
mainSessionKey = mainKey
@@ -1427,7 +1436,11 @@ private const val defaultTalkProvider = "elevenlabs"
if (!modelOverrideActive) currentModelId = defaultModelId
defaultOutputFormat = outputFormat ?: defaultOutputFormatFallback
apiKey = key ?: envKey?.takeIf { it.isNotEmpty() }
Log.d(tag, "reloadConfig apiKey=${if (apiKey != null) "set" else "null"} voiceId=$defaultVoiceId")
silenceWindowMs = silenceTimeoutMs
Log.d(
tag,
"reloadConfig apiKey=${if (apiKey != null) "set" else "null"} voiceId=$defaultVoiceId silenceTimeoutMs=$silenceTimeoutMs",
)
if (interrupt != null) interruptOnSpeech = interrupt
activeProviderIsElevenLabs = activeProvider == defaultTalkProvider
if (!activeProviderIsElevenLabs) {
@@ -1441,6 +1454,7 @@ private const val defaultTalkProvider = "elevenlabs"
}
configLoaded = true
} catch (_: Throwable) {
silenceWindowMs = defaultSilenceTimeoutMs
defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
defaultModelId = defaultModelIdFallback
if (!modelOverrideActive) currentModelId = defaultModelId

View File

@@ -54,4 +54,23 @@ class TalkModeConfigParsingTest {
assertEquals("voice-legacy", selection?.config?.get("voiceId")?.jsonPrimitive?.content)
assertEquals("legacy-key", selection?.config?.get("apiKey")?.jsonPrimitive?.content)
}
@Test
fun readsConfiguredSilenceTimeoutMs() {
val talk = buildJsonObject { put("silenceTimeoutMs", 1500) }
assertEquals(1500L, TalkModeManager.resolvedSilenceTimeoutMs(talk))
}
@Test
fun defaultsSilenceTimeoutMsWhenMissing() {
assertEquals(700L, TalkModeManager.resolvedSilenceTimeoutMs(null))
}
@Test
fun defaultsSilenceTimeoutMsWhenInvalid() {
val talk = buildJsonObject { put("silenceTimeoutMs", 0) }
assertEquals(700L, TalkModeManager.resolvedSilenceTimeoutMs(talk))
}
}

View File

@@ -34,6 +34,7 @@ final class TalkModeManager: NSObject {
private typealias SpeechRequest = SFSpeechAudioBufferRecognitionRequest
private static let defaultModelIdFallback = "eleven_v3"
private static let defaultTalkProvider = "elevenlabs"
private static let defaultSilenceTimeoutMs = 900
private static let redactedConfigSentinel = "__OPENCLAW_REDACTED__"
var isEnabled: Bool = false
var isListening: Bool = false
@@ -97,7 +98,7 @@ final class TalkModeManager: NSObject {
private var gateway: GatewayNodeSession?
private var gatewayConnected = false
private let silenceWindow: TimeInterval = 0.9
private var silenceWindow: TimeInterval = TimeInterval(Self.defaultSilenceTimeoutMs) / 1000
private var lastAudioActivity: Date?
private var noiseFloorSamples: [Double] = []
private var noiseFloor: Double?
@@ -2001,6 +2002,24 @@ extension TalkModeManager {
config: normalizedProviders[providerID] ?? [:])
}
static func resolvedSilenceTimeoutMs(_ talk: [String: Any]?) -> Int {
switch talk?["silenceTimeoutMs"] {
case let timeout as Int where timeout > 0:
return timeout
case let timeout as Double
where timeout > 0 && timeout.rounded(.towardZero) == timeout && timeout <= Double(Int.max):
return Int(timeout)
case let timeout as NSNumber:
let value = timeout.doubleValue
if value > 0 && value.rounded(.towardZero) == value && value <= Double(Int.max) {
return Int(value)
}
return Self.defaultSilenceTimeoutMs
default:
return Self.defaultSilenceTimeoutMs
}
}
func reloadConfig() async {
guard let gateway else { return }
self.pcmFormatUnavailable = false
@@ -2020,6 +2039,7 @@ extension TalkModeManager {
}
let activeProvider = selection?.provider ?? Self.defaultTalkProvider
let activeConfig = selection?.config
let silenceTimeoutMs = Self.resolvedSilenceTimeoutMs(talk)
self.defaultVoiceId = (activeConfig?["voiceId"] as? String)?
.trimmingCharacters(in: .whitespacesAndNewlines)
if let aliases = activeConfig?["voiceAliases"] as? [String: Any] {
@@ -2067,8 +2087,9 @@ extension TalkModeManager {
if let interrupt = talk?["interruptOnSpeech"] as? Bool {
self.interruptOnSpeech = interrupt
}
self.silenceWindow = TimeInterval(silenceTimeoutMs) / 1000
if selection != nil {
GatewayDiagnostics.log("talk config provider=\(activeProvider)")
GatewayDiagnostics.log("talk config provider=\(activeProvider) silenceTimeoutMs=\(silenceTimeoutMs)")
}
} catch {
self.defaultModelId = Self.defaultModelIdFallback
@@ -2079,6 +2100,7 @@ extension TalkModeManager {
self.gatewayTalkDefaultModelId = nil
self.gatewayTalkApiKeyConfigured = false
self.gatewayTalkConfigLoaded = false
self.silenceWindow = TimeInterval(Self.defaultSilenceTimeoutMs) / 1000
}
}

View File

@@ -47,4 +47,24 @@ import Testing
userInfo: [NSLocalizedDescriptionKey: "queue enqueue failed"])
#expect(TalkModeManager._test_isPCMFormatRejectedByAPI(error) == false)
}
@Test func readsConfiguredSilenceTimeoutMs() {
let talk: [String: Any] = [
"silenceTimeoutMs": 1500,
]
#expect(TalkModeManager.resolvedSilenceTimeoutMs(talk) == 1500)
}
@Test func defaultsSilenceTimeoutMsWhenMissing() {
#expect(TalkModeManager.resolvedSilenceTimeoutMs(nil) == 900)
}
@Test func defaultsSilenceTimeoutMsWhenInvalid() {
let talk: [String: Any] = [
"silenceTimeoutMs": 0,
]
#expect(TalkModeManager.resolvedSilenceTimeoutMs(talk) == 900)
}
}

View File

@@ -12,6 +12,7 @@ actor TalkModeRuntime {
private let ttsLogger = Logger(subsystem: "ai.openclaw", category: "talk.tts")
private static let defaultModelIdFallback = "eleven_v3"
private static let defaultTalkProvider = "elevenlabs"
private static let defaultSilenceTimeoutMs = 700
private final class RMSMeter: @unchecked Sendable {
private let lock = NSLock()
@@ -66,7 +67,7 @@ actor TalkModeRuntime {
private var fallbackVoiceId: String?
private var lastPlaybackWasPCM: Bool = false
private let silenceWindow: TimeInterval = 0.7
private var silenceWindow: TimeInterval = TimeInterval(TalkModeRuntime.defaultSilenceTimeoutMs) / 1000
private let minSpeechRMS: Double = 1e-3
private let speechBoostFactor: Double = 6.0
@@ -783,6 +784,7 @@ extension TalkModeRuntime {
}
self.defaultOutputFormat = cfg.outputFormat
self.interruptOnSpeech = cfg.interruptOnSpeech
self.silenceWindow = TimeInterval(cfg.silenceTimeoutMs) / 1000
self.apiKey = cfg.apiKey
let hasApiKey = (cfg.apiKey?.isEmpty == false)
let voiceLabel = (cfg.voiceId?.isEmpty == false) ? cfg.voiceId! : "none"
@@ -792,7 +794,8 @@ extension TalkModeRuntime {
"talk config voiceId=\(voiceLabel, privacy: .public) " +
"modelId=\(modelLabel, privacy: .public) " +
"apiKey=\(hasApiKey, privacy: .public) " +
"interrupt=\(cfg.interruptOnSpeech, privacy: .public)")
"interrupt=\(cfg.interruptOnSpeech, privacy: .public) " +
"silenceTimeoutMs=\(cfg.silenceTimeoutMs, privacy: .public)")
}
private struct TalkRuntimeConfig {
@@ -801,6 +804,7 @@ extension TalkModeRuntime {
let modelId: String?
let outputFormat: String?
let interruptOnSpeech: Bool
let silenceTimeoutMs: Int
let apiKey: String?
}
@@ -880,6 +884,21 @@ extension TalkModeRuntime {
normalizedPayload: false)
}
static func resolvedSilenceTimeoutMs(_ talk: [String: AnyCodable]?) -> Int {
if let timeout = talk?["silenceTimeoutMs"]?.intValue, timeout > 0 {
return timeout
}
if
let timeout = talk?["silenceTimeoutMs"]?.doubleValue,
timeout > 0,
timeout.rounded(.towardZero) == timeout,
timeout <= Double(Int.max)
{
return Int(timeout)
}
return Self.defaultSilenceTimeoutMs
}
private func fetchTalkConfig() async -> TalkRuntimeConfig {
let env = ProcessInfo.processInfo.environment
let envVoice = env["ELEVENLABS_VOICE_ID"]?.trimmingCharacters(in: .whitespacesAndNewlines)
@@ -895,6 +914,7 @@ extension TalkModeRuntime {
let selection = Self.selectTalkProviderConfig(talk)
let activeProvider = selection?.provider ?? Self.defaultTalkProvider
let activeConfig = selection?.config
let silenceTimeoutMs = Self.resolvedSilenceTimeoutMs(talk)
let ui = snap.config?["ui"]?.dictionaryValue
let rawSeam = ui?["seamColor"]?.stringValue?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
await MainActor.run {
@@ -939,6 +959,7 @@ extension TalkModeRuntime {
modelId: resolvedModel,
outputFormat: outputFormat,
interruptOnSpeech: interrupt ?? true,
silenceTimeoutMs: silenceTimeoutMs,
apiKey: resolvedApiKey)
} catch {
let resolvedVoice =
@@ -951,6 +972,7 @@ extension TalkModeRuntime {
modelId: Self.defaultModelIdFallback,
outputFormat: nil,
interruptOnSpeech: true,
silenceTimeoutMs: Self.defaultSilenceTimeoutMs,
apiKey: resolvedApiKey)
}
}

View File

@@ -32,4 +32,24 @@ struct TalkModeConfigParsingTests {
#expect(selection?.config["voiceId"]?.stringValue == "voice-legacy")
#expect(selection?.config["apiKey"]?.stringValue == "legacy-key")
}
@Test func readsConfiguredSilenceTimeoutMs() {
let talk: [String: AnyCodable] = [
"silenceTimeoutMs": AnyCodable(1500),
]
#expect(TalkModeRuntime.resolvedSilenceTimeoutMs(talk) == 1500)
}
@Test func defaultsSilenceTimeoutMsWhenMissing() {
#expect(TalkModeRuntime.resolvedSilenceTimeoutMs(nil) == 700)
}
@Test func defaultsSilenceTimeoutMsWhenInvalid() {
let talk: [String: AnyCodable] = [
"silenceTimeoutMs": AnyCodable(0),
]
#expect(TalkModeRuntime.resolvedSilenceTimeoutMs(talk) == 700)
}
}