openclaw/apps/macos/Sources/OpenClaw/VoiceWakeRuntime.swift
Phillip Hampton 350fe63bbf feat(macos): Voice Wake option to trigger Talk Mode (#58490)
* feat(macos): add "Trigger Talk Mode" option to Voice Wake settings

When enabled, detecting a wake phrase activates Talk Mode (full voice
conversation: STT -> LLM -> TTS playback) instead of sending a text
message to the chat. This enables a hands-free voice-assistant UX.

Implementation:
- Constants.swift: new `openclaw.voiceWakeTriggersTalkMode` defaults key
- AppState.swift: new property with UserDefaults persistence + runtime
  refresh on change so the toggle takes effect immediately (see the
  sketch after this list)
- VoiceWakeSettings.swift: "Trigger Talk Mode" toggle under Voice Wake,
  disabled when Voice Wake is off
- VoiceWakeRuntime.swift: `beginCapture` checks `triggersTalkMode` and
  activates Talk Mode directly, skipping the capture/overlay/forward
  flow. Pauses the wake listener via `pauseForPushToTalk()` to keep
  two audio pipelines from competing for the mic.
- TalkModeController.swift: resumes voice wake listener when Talk Mode
  exits by calling `VoiceWakeRuntime.refresh`, mirroring PTT teardown.
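
A minimal sketch of the settings plumbing described above; aside from the
defaults key, the type and property names here are assumptions for
illustration, not the actual Constants.swift / AppState.swift diff:

    import Foundation

    enum VoiceWakeDefaults {
        // The defaults key added in Constants.swift.
        static let triggersTalkMode = "openclaw.voiceWakeTriggersTalkMode"
    }

    final class AppStateSketch {
        // Persisted toggle; writing it should also re-sync the runtime so the
        // change takes effect immediately (the real code does this by calling
        // VoiceWakeRuntime.shared.refresh(state:)).
        var voiceWakeTriggersTalkMode: Bool {
            get { UserDefaults.standard.bool(forKey: VoiceWakeDefaults.triggersTalkMode) }
            set { UserDefaults.standard.set(newValue, forKey: VoiceWakeDefaults.triggersTalkMode) }
        }
    }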

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: address review feedback

- TalkModeController: move VoiceWakeRuntime.refresh() after
  TalkModeRuntime.setEnabled(false) completes, preventing a mic
  contention race where the wake listener restarts before Talk Mode
  finishes tearing down its audio engine (P1)
- VoiceWakeRuntime: remove redundant haltRecognitionPipeline()
  before pauseForPushToTalk() which already calls it via stop() (P2)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: resume wake listener based on enabled state, not toggle value

Check swabbleEnabled instead of voiceWakeTriggersTalkMode when deciding
whether to resume the wake listener after Talk Mode exits. This ensures
the paused listener resumes even if the user toggled "Trigger Talk Mode"
off during an active session. (P2)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: play trigger chime on Talk Mode activation + send chime before TTS

- VoiceWakeRuntime: play the configured trigger chime in the
  triggersTalkMode branch before pausing the wake listener. The early
  return was skipping the chime that plays in the normal capture flow.
- TalkModeRuntime: play the Voice Wake "send" chime before TTS playback
  starts, giving audible feedback that the AI is about to respond. Talk
  Mode never used this chime despite it being configurable in settings.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: move send chime to transcript finalization (on send, not on reply)

The send chime now plays when the user's speech is finalized and about
to be sent to the AI, not when the TTS response starts. This matches
the semantics of "send sound" -- it confirms your input was captured.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-31 21:09:36 -04:00

794 lines
32 KiB
Swift

import AVFoundation
import Foundation
import OSLog
import Speech
import SwabbleKit
#if canImport(AppKit)
import AppKit
#endif
/// Background listener that keeps the voice-wake pipeline alive outside the settings test view.
actor VoiceWakeRuntime {
    static let shared = VoiceWakeRuntime()

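    /// Coarse ownership of the audio input: idle, listening for the wake word,
    /// or paused while push-to-talk / Talk Mode owns the mic.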
    enum ListeningState { case idle, voiceWake, pushToTalk }

    private let logger = Logger(subsystem: "ai.openclaw", category: "voicewake.runtime")
    private var recognizer: SFSpeechRecognizer?
    // Lazily created on start to avoid creating an AVAudioEngine at app launch, which can switch Bluetooth
    // headphones into the low-quality headset profile even if Voice Wake is disabled.
    private var audioEngine: AVAudioEngine?
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
    private var recognitionTask: SFSpeechRecognitionTask?
    private var recognitionGeneration: Int = 0 // drop stale callbacks after restarts
    private var lastHeard: Date?
    private var noiseFloorRMS: Double = 1e-4
    private var captureStartedAt: Date?
    private var captureTask: Task<Void, Never>?
    private var capturedTranscript: String = ""
    private var isCapturing: Bool = false
    private var heardBeyondTrigger: Bool = false
    private var triggerChimePlayed: Bool = false
    private var committedTranscript: String = ""
    private var volatileTranscript: String = ""
    private var cooldownUntil: Date?
    private var currentConfig: RuntimeConfig?
    private var listeningState: ListeningState = .idle
    private var overlayToken: UUID?
    private var activeTriggerEndTime: TimeInterval?
    private var scheduledRestartTask: Task<Void, Never>?
    private var lastLoggedText: String?
    private var lastLoggedAt: Date?
    private var lastTapLogAt: Date?
    private var lastCallbackLogAt: Date?
    private var lastTranscript: String?
    private var lastTranscriptAt: Date?
    private var preDetectTask: Task<Void, Never>?
    private var isStarting: Bool = false
    private var triggerOnlyTask: Task<Void, Never>?

    // MARK: - Tunables

    /// Silence threshold once we've captured user speech (post-trigger).
    private let silenceWindow: TimeInterval = 2.0
    /// Silence threshold when we only heard the trigger but no post-trigger speech yet.
    private let triggerOnlySilenceWindow: TimeInterval = 5.0
    // Maximum capture duration from trigger until we force-send, to avoid runaway sessions.
    private let captureHardStop: TimeInterval = 120.0
    private let debounceAfterSend: TimeInterval = 0.35
    // Voice activity detection parameters (RMS-based).
    private let minSpeechRMS: Double = 1e-3
    private let speechBoostFactor: Double = 6.0 // how far above noise floor we require to mark speech
    private let preDetectSilenceWindow: TimeInterval = 1.0
    private let triggerPauseWindow: TimeInterval = 0.55

    /// Stops the active Speech pipeline without clearing the stored config, so we can restart cleanly.
    private func haltRecognitionPipeline() {
        // Bump generation first so any in-flight callbacks from the cancelled task get dropped.
        self.recognitionGeneration &+= 1
        self.recognitionTask?.cancel()
        self.recognitionTask = nil
        self.recognitionRequest?.endAudio()
        self.recognitionRequest = nil
        self.audioEngine?.inputNode.removeTap(onBus: 0)
        self.audioEngine?.stop()
        // Release the engine so we also release any audio session/resources when Voice Wake is idle.
        self.audioEngine = nil
    }

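    /// Immutable snapshot of the user's Voice Wake settings, captured when the runtime (re)starts.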
    struct RuntimeConfig: Equatable {
        let triggers: [String]
        let micID: String?
        let localeID: String?
        let triggerChime: VoiceWakeChime
        let sendChime: VoiceWakeChime
        let triggersTalkMode: Bool
    }

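    /// One recognizer callback, tagged with the generation that produced it so stale updates can be dropped.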
    private struct RecognitionUpdate {
        let transcript: String?
        let segments: [WakeWordSegment]
        let isFinal: Bool
        let error: Error?
        let generation: Int
    }

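    /// Reconciles the runtime with the current app state: starts, stops, or restarts the
    /// listener so it matches the enabled flag, permissions, and the latest settings snapshot.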
    func refresh(state: AppState) async {
        let snapshot = await MainActor.run { () -> (Bool, RuntimeConfig) in
            let enabled = state.swabbleEnabled
            let config = RuntimeConfig(
                triggers: sanitizeVoiceWakeTriggers(state.swabbleTriggerWords),
                micID: state.voiceWakeMicID.isEmpty ? nil : state.voiceWakeMicID,
                localeID: state.voiceWakeLocaleID.isEmpty ? nil : state.voiceWakeLocaleID,
                triggerChime: state.voiceWakeTriggerChime,
                sendChime: state.voiceWakeSendChime,
                triggersTalkMode: state.voiceWakeTriggersTalkMode)
            return (enabled, config)
        }
        guard voiceWakeSupported, snapshot.0 else {
            self.stop()
            return
        }
        guard PermissionManager.voiceWakePermissionsGranted() else {
            self.logger.debug("voicewake runtime not starting: permissions missing")
            self.stop()
            return
        }
        let config = snapshot.1
        if self.isStarting {
            return
        }
        if self.scheduledRestartTask != nil, config == self.currentConfig, self.recognitionTask == nil {
            return
        }
        if self.scheduledRestartTask != nil {
            self.scheduledRestartTask?.cancel()
            self.scheduledRestartTask = nil
        }
        if config == self.currentConfig, self.recognitionTask != nil {
            return
        }
        self.stop()
        await self.start(with: config)
    }

    private func start(with config: RuntimeConfig) async {
        if self.isStarting {
            return
        }
        self.isStarting = true
        defer { self.isStarting = false }
        do {
            self.recognitionGeneration &+= 1
            let generation = self.recognitionGeneration
            self.configureSession(localeID: config.localeID)
            guard let recognizer, recognizer.isAvailable else {
                self.logger.error("voicewake runtime: speech recognizer unavailable")
                return
            }
            self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
            self.recognitionRequest?.shouldReportPartialResults = true
            self.recognitionRequest?.taskHint = .dictation
            guard let request = self.recognitionRequest else { return }
            // Lazily create the engine here so app launch doesn't grab audio resources / trigger Bluetooth HFP.
            if self.audioEngine == nil {
                self.audioEngine = AVAudioEngine()
            }
            guard let audioEngine = self.audioEngine else { return }
            guard AudioInputDeviceObserver.hasUsableDefaultInputDevice() else {
                self.audioEngine = nil
                throw NSError(
                    domain: "VoiceWakeRuntime",
                    code: 1,
                    userInfo: [NSLocalizedDescriptionKey: "No usable audio input device available"])
            }
            let input = audioEngine.inputNode
            let format = input.outputFormat(forBus: 0)
            guard format.channelCount > 0, format.sampleRate > 0 else {
                throw NSError(
                    domain: "VoiceWakeRuntime",
                    code: 1,
                    userInfo: [NSLocalizedDescriptionKey: "No audio input available"])
            }
            input.removeTap(onBus: 0)
            input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak self, weak request] buffer, _ in
                request?.append(buffer)
                guard let rms = Self.rmsLevel(buffer: buffer) else { return }
                Task.detached { [weak self] in
                    await self?.noteAudioLevel(rms: rms)
                    await self?.noteAudioTap(rms: rms)
                }
            }
            audioEngine.prepare()
            try audioEngine.start()
            self.currentConfig = config
            self.lastHeard = Date()
            // Preserve any existing cooldownUntil so the debounce after send isn't wiped by a restart.
            self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self, generation] result, error in
                guard let self else { return }
                let transcript = result?.bestTranscription.formattedString
                let segments = result.flatMap { result in
                    transcript
                        .map { WakeWordSpeechSegments.from(transcription: result.bestTranscription, transcript: $0) }
                } ?? []
                let isFinal = result?.isFinal ?? false
                Task { await self.noteRecognitionCallback(transcript: transcript, isFinal: isFinal, error: error) }
                let update = RecognitionUpdate(
                    transcript: transcript,
                    segments: segments,
                    isFinal: isFinal,
                    error: error,
                    generation: generation)
                Task { await self.handleRecognition(update, config: config) }
            }
            let preferred = config.micID?.isEmpty == false ? config.micID! : "system-default"
            self.logger.info(
                "voicewake runtime input preferred=\(preferred, privacy: .public) " +
                    "\(AudioInputDeviceObserver.defaultInputDeviceSummary(), privacy: .public)")
            self.logger.info("voicewake runtime started")
            DiagnosticsFileLog.shared.log(category: "voicewake.runtime", event: "started", fields: [
                "locale": config.localeID ?? "",
                "micID": config.micID ?? "",
            ])
        } catch {
            self.logger.error("voicewake runtime failed to start: \(error.localizedDescription, privacy: .public)")
            self.stop()
        }
    }

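    /// Tears down capture state and the recognition pipeline. Optionally keeps the overlay up
    /// (for handoff) or leaves a scheduled restart pending.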
    private func stop(dismissOverlay: Bool = true, cancelScheduledRestart: Bool = true) {
        if cancelScheduledRestart {
            self.scheduledRestartTask?.cancel()
            self.scheduledRestartTask = nil
        }
        self.captureTask?.cancel()
        self.captureTask = nil
        self.isCapturing = false
        self.capturedTranscript = ""
        self.captureStartedAt = nil
        self.triggerChimePlayed = false
        self.lastTranscript = nil
        self.lastTranscriptAt = nil
        self.preDetectTask?.cancel()
        self.preDetectTask = nil
        self.triggerOnlyTask?.cancel()
        self.triggerOnlyTask = nil
        self.haltRecognitionPipeline()
        self.recognizer = nil
        self.currentConfig = nil
        self.listeningState = .idle
        self.activeTriggerEndTime = nil
        self.logger.debug("voicewake runtime stopped")
        DiagnosticsFileLog.shared.log(category: "voicewake.runtime", event: "stopped")
        let token = self.overlayToken
        self.overlayToken = nil
        guard dismissOverlay else { return }
        Task { @MainActor in
            if let token {
                VoiceSessionCoordinator.shared.dismiss(token: token, reason: .explicit, outcome: .empty)
            } else {
                VoiceWakeOverlayController.shared.dismiss()
            }
        }
    }

    private func configureSession(localeID: String?) {
        let locale = localeID.flatMap { Locale(identifier: $0) } ?? Locale(identifier: Locale.current.identifier)
        self.recognizer = SFSpeechRecognizer(locale: locale)
        self.recognizer?.defaultTaskHint = .dictation
    }

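    /// Central recognition callback: updates an in-flight capture, or gates the transcript
    /// against the wake-word triggers and kicks off capture on a match.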
    private func handleRecognition(_ update: RecognitionUpdate, config: RuntimeConfig) async {
        if update.generation != self.recognitionGeneration {
            return // stale callback from a superseded recognizer session
        }
        if let error = update.error {
            self.logger.debug("voicewake recognition error: \(error.localizedDescription, privacy: .public)")
        }
        guard let transcript = update.transcript else { return }
        let now = Date()
        if !transcript.isEmpty {
            self.lastHeard = now
            if !self.isCapturing {
                self.lastTranscript = transcript
                self.lastTranscriptAt = now
            }
            if self.isCapturing {
                self.maybeLogRecognition(
                    transcript: transcript,
                    segments: update.segments,
                    triggers: config.triggers,
                    isFinal: update.isFinal,
                    match: nil,
                    usedFallback: false,
                    capturing: true)
                let trimmed = Self.commandAfterTrigger(
                    transcript: transcript,
                    segments: update.segments,
                    triggerEndTime: self.activeTriggerEndTime,
                    triggers: config.triggers)
                self.capturedTranscript = trimmed
                self.updateHeardBeyondTrigger(withTrimmed: trimmed)
                if update.isFinal {
                    self.committedTranscript = trimmed
                    self.volatileTranscript = ""
                } else {
                    self.volatileTranscript = VoiceOverlayTextFormatting.delta(
                        after: self.committedTranscript,
                        current: trimmed)
                }
                let attributed = VoiceOverlayTextFormatting.makeAttributed(
                    committed: self.committedTranscript,
                    volatile: self.volatileTranscript,
                    isFinal: update.isFinal)
                let snapshot = self.committedTranscript + self.volatileTranscript
                if let token = self.overlayToken {
                    await MainActor.run {
                        VoiceSessionCoordinator.shared.updatePartial(
                            token: token,
                            text: snapshot,
                            attributed: attributed)
                    }
                }
            }
        }
        if self.isCapturing { return }
        let gateConfig = WakeWordGateConfig(triggers: config.triggers)
        var usedFallback = false
        var match = WakeWordGate.match(transcript: transcript, segments: update.segments, config: gateConfig)
        if match == nil, update.isFinal {
            match = VoiceWakeRecognitionDebugSupport.textOnlyFallbackMatch(
                transcript: transcript,
                triggers: config.triggers,
                config: gateConfig,
                trimWake: Self.trimmedAfterTrigger)
            usedFallback = match != nil
        }
        self.maybeLogRecognition(
            transcript: transcript,
            segments: update.segments,
            triggers: config.triggers,
            isFinal: update.isFinal,
            match: match,
            usedFallback: usedFallback,
            capturing: false)
        if let match {
            if let cooldown = cooldownUntil, now < cooldown {
                return
            }
            if usedFallback {
                self.logger.info("voicewake runtime detected (text-only fallback) len=\(match.command.count)")
            } else {
                self.logger.info("voicewake runtime detected len=\(match.command.count)")
            }
            await self.beginCapture(command: match.command, triggerEndTime: match.triggerEndTime, config: config)
        } else if !transcript.isEmpty, update.error == nil {
            if self.isTriggerOnly(transcript: transcript, triggers: config.triggers) {
                self.preDetectTask?.cancel()
                self.preDetectTask = nil
                self.scheduleTriggerOnlyPauseCheck(triggers: config.triggers, config: config)
            } else {
                self.triggerOnlyTask?.cancel()
                self.triggerOnlyTask = nil
                self.schedulePreDetectSilenceCheck(
                    triggers: config.triggers,
                    gateConfig: gateConfig,
                    config: config)
            }
        }
    }

    private func maybeLogRecognition(
        transcript: String,
        segments: [WakeWordSegment],
        triggers: [String],
        isFinal: Bool,
        match: WakeWordGateMatch?,
        usedFallback: Bool,
        capturing: Bool)
    {
        guard VoiceWakeRecognitionDebugSupport.shouldLogTranscript(
            transcript: transcript,
            isFinal: isFinal,
            loggerLevel: self.logger.logLevel,
            lastLoggedText: &self.lastLoggedText,
            lastLoggedAt: &self.lastLoggedAt)
        else { return }
        let summary = VoiceWakeRecognitionDebugSupport.transcriptSummary(
            transcript: transcript,
            triggers: triggers,
            segments: segments)
        let matchSummary = VoiceWakeRecognitionDebugSupport.matchSummary(match)
        let segmentSummary = segments.map { seg in
            let start = String(format: "%.2f", seg.start)
            let end = String(format: "%.2f", seg.end)
            return "\(seg.text)@\(start)-\(end)"
        }.joined(separator: ", ")
        self.logger.debug(
            "voicewake runtime transcript='\(transcript, privacy: .private)' textOnly=\(summary.textOnly) " +
                "isFinal=\(isFinal) timing=\(summary.timingCount)/\(segments.count) " +
                "capturing=\(capturing) fallback=\(usedFallback) " +
                "\(matchSummary) segments=[\(segmentSummary, privacy: .private)]")
    }

    private func noteAudioTap(rms: Double) {
        let now = Date()
        if let last = self.lastTapLogAt, now.timeIntervalSince(last) < 1.0 {
            return
        }
        self.lastTapLogAt = now
        let db = 20 * log10(max(rms, 1e-7))
        self.logger.debug(
            "voicewake runtime audio tap rms=\(String(format: "%.6f", rms)) " +
                "db=\(String(format: "%.1f", db)) capturing=\(self.isCapturing)")
    }

    private func noteRecognitionCallback(transcript: String?, isFinal: Bool, error: Error?) {
        guard transcript?.isEmpty ?? true else { return }
        let now = Date()
        if let last = self.lastCallbackLogAt, now.timeIntervalSince(last) < 1.0 {
            return
        }
        self.lastCallbackLogAt = now
        let errorSummary = error?.localizedDescription ?? "none"
        self.logger.debug(
            "voicewake runtime callback empty transcript isFinal=\(isFinal) error=\(errorSummary, privacy: .public)")
    }

    private func scheduleTriggerOnlyPauseCheck(triggers: [String], config: RuntimeConfig) {
        self.triggerOnlyTask?.cancel()
        let lastSeenAt = self.lastTranscriptAt
        let lastText = self.lastTranscript
        let windowNanos = UInt64(self.triggerPauseWindow * 1_000_000_000)
        self.triggerOnlyTask = Task { [weak self, lastSeenAt, lastText] in
            try? await Task.sleep(nanoseconds: windowNanos)
            guard let self else { return }
            await self.triggerOnlyPauseCheck(
                lastSeenAt: lastSeenAt,
                lastText: lastText,
                triggers: triggers,
                config: config)
        }
    }

    private func schedulePreDetectSilenceCheck(
        triggers: [String],
        gateConfig: WakeWordGateConfig,
        config: RuntimeConfig)
    {
        self.preDetectTask?.cancel()
        let lastSeenAt = self.lastTranscriptAt
        let lastText = self.lastTranscript
        let windowNanos = UInt64(self.preDetectSilenceWindow * 1_000_000_000)
        self.preDetectTask = Task { [weak self, lastSeenAt, lastText] in
            try? await Task.sleep(nanoseconds: windowNanos)
            guard let self else { return }
            await self.preDetectSilenceCheck(
                lastSeenAt: lastSeenAt,
                lastText: lastText,
                triggers: triggers,
                gateConfig: gateConfig,
                config: config)
        }
    }

    private func triggerOnlyPauseCheck(
        lastSeenAt: Date?,
        lastText: String?,
        triggers: [String],
        config: RuntimeConfig) async
    {
        guard !Task.isCancelled else { return }
        guard !self.isCapturing else { return }
        guard let lastSeenAt, let lastText else { return }
        guard self.lastTranscriptAt == lastSeenAt, self.lastTranscript == lastText else { return }
        guard self.isTriggerOnly(transcript: lastText, triggers: triggers) else { return }
        if let cooldown = self.cooldownUntil, Date() < cooldown {
            return
        }
        self.logger.info("voicewake runtime detected (trigger-only pause)")
        await self.beginCapture(command: "", triggerEndTime: nil, config: config)
    }

    private func isTriggerOnly(transcript: String, triggers: [String]) -> Bool {
        guard WakeWordGate.matchesTextOnly(text: transcript, triggers: triggers) else { return false }
        guard VoiceWakeTextUtils.startsWithTrigger(transcript: transcript, triggers: triggers) else { return false }
        return Self.trimmedAfterTrigger(transcript, triggers: triggers).isEmpty
    }

    private func preDetectSilenceCheck(
        lastSeenAt: Date?,
        lastText: String?,
        triggers: [String],
        gateConfig: WakeWordGateConfig,
        config: RuntimeConfig) async
    {
        guard !Task.isCancelled else { return }
        guard !self.isCapturing else { return }
        guard let lastSeenAt, let lastText else { return }
        guard self.lastTranscriptAt == lastSeenAt, self.lastTranscript == lastText else { return }
        guard let match = VoiceWakeRecognitionDebugSupport.textOnlyFallbackMatch(
            transcript: lastText,
            triggers: triggers,
            config: gateConfig,
            trimWake: Self.trimmedAfterTrigger)
        else { return }
        if let cooldown = self.cooldownUntil, Date() < cooldown {
            return
        }
        self.logger.info("voicewake runtime detected (silence fallback) len=\(match.command.count)")
        await self.beginCapture(
            command: match.command,
            triggerEndTime: match.triggerEndTime,
            config: config)
    }

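    /// Entry point after a wake-word match: either hands the session to Talk Mode
    /// (when configured) or starts the transcript capture + overlay flow.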
    private func beginCapture(command: String, triggerEndTime: TimeInterval?, config: RuntimeConfig) async {
        // When "Trigger Talk Mode" is enabled, skip the capture/overlay flow entirely
        // and activate Talk Mode immediately. Talk Mode handles its own STT pipeline.
        // Pause the wake listener to avoid two audio pipelines competing on the mic
        // (mirrors the push-to-talk coordination pattern).
        if config.triggersTalkMode {
            self.logger.info("voicewake trigger -> activating Talk Mode (skipping capture)")
            DiagnosticsFileLog.shared.log(category: "voicewake.runtime", event: "triggerTalkMode")
            if config.triggerChime != .none {
                await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime, reason: "voicewake.trigger") }
            }
            self.pauseForPushToTalk()
            await AppStateStore.shared.setTalkEnabled(true)
            return
        }
        self.listeningState = .voiceWake
        self.isCapturing = true
        DiagnosticsFileLog.shared.log(category: "voicewake.runtime", event: "beginCapture")
        self.capturedTranscript = command
        self.committedTranscript = ""
        self.volatileTranscript = command
        self.captureStartedAt = Date()
        self.cooldownUntil = nil
        self.heardBeyondTrigger = !command.isEmpty
        self.triggerChimePlayed = false
        self.activeTriggerEndTime = triggerEndTime
        self.preDetectTask?.cancel()
        self.preDetectTask = nil
        self.triggerOnlyTask?.cancel()
        self.triggerOnlyTask = nil
        if config.triggerChime != .none, !self.triggerChimePlayed {
            self.triggerChimePlayed = true
            await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime, reason: "voicewake.trigger") }
        }
        let snapshot = self.committedTranscript + self.volatileTranscript
        let attributed = VoiceOverlayTextFormatting.makeAttributed(
            committed: self.committedTranscript,
            volatile: self.volatileTranscript,
            isFinal: false)
        self.overlayToken = await MainActor.run {
            VoiceSessionCoordinator.shared.startSession(
                source: .wakeWord,
                text: snapshot,
                attributed: attributed,
                forwardEnabled: true)
        }
        // Keep the "ears" boosted for the capture window so the status icon animates while recording.
        await MainActor.run { AppStateStore.shared.triggerVoiceEars(ttl: nil) }
        self.captureTask?.cancel()
        self.captureTask = Task { [weak self] in
            guard let self else { return }
            await self.monitorCapture(config: config)
        }
    }

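    /// Polls while a capture is live, finalizing on sustained silence or at the hard-stop deadline.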
    private func monitorCapture(config: RuntimeConfig) async {
        let start = self.captureStartedAt ?? Date()
        let hardStop = start.addingTimeInterval(self.captureHardStop)
        while self.isCapturing {
            let now = Date()
            if now >= hardStop {
                // Hard-stop after a maximum duration so we never leave the recognizer pinned open.
                await self.finalizeCapture(config: config)
                return
            }
            let silenceThreshold = self.heardBeyondTrigger ? self.silenceWindow : self.triggerOnlySilenceWindow
            if let last = self.lastHeard, now.timeIntervalSince(last) >= silenceThreshold {
                await self.finalizeCapture(config: config)
                return
            }
            try? await Task.sleep(nanoseconds: 200_000_000)
        }
    }

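    /// Ends the capture: hands the transcript to the overlay or forwarder (with the send
    /// chime when configured) and schedules a recognizer restart for the next trigger.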
    private func finalizeCapture(config: RuntimeConfig) async {
        guard self.isCapturing else { return }
        self.isCapturing = false
        // Disarm trigger matching immediately (before halting recognition) to avoid double-trigger
        // races from late callbacks that arrive after isCapturing is cleared.
        self.cooldownUntil = Date().addingTimeInterval(self.debounceAfterSend)
        self.captureTask?.cancel()
        self.captureTask = nil
        let finalTranscript = self.capturedTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
        DiagnosticsFileLog.shared.log(category: "voicewake.runtime", event: "finalizeCapture", fields: [
            "finalLen": "\(finalTranscript.count)",
        ])
        // Stop further recognition events so we don't retrigger immediately with buffered audio.
        self.haltRecognitionPipeline()
        self.capturedTranscript = ""
        self.captureStartedAt = nil
        self.lastHeard = nil
        self.heardBeyondTrigger = false
        self.triggerChimePlayed = false
        self.activeTriggerEndTime = nil
        self.lastTranscript = nil
        self.lastTranscriptAt = nil
        self.preDetectTask?.cancel()
        self.preDetectTask = nil
        self.triggerOnlyTask?.cancel()
        self.triggerOnlyTask = nil
        await MainActor.run { AppStateStore.shared.stopVoiceEars() }
        if let token = self.overlayToken {
            await MainActor.run { VoiceSessionCoordinator.shared.updateLevel(token: token, 0) }
        }
        let delay: TimeInterval = 0.0
        let sendChime = finalTranscript.isEmpty ? .none : config.sendChime
        if let token = self.overlayToken {
            await MainActor.run {
                VoiceSessionCoordinator.shared.finalize(
                    token: token,
                    text: finalTranscript,
                    sendChime: sendChime,
                    autoSendAfter: delay)
            }
        } else if !finalTranscript.isEmpty {
            if sendChime != .none {
                await MainActor.run { VoiceWakeChimePlayer.play(sendChime, reason: "voicewake.send") }
            }
            Task.detached {
                await VoiceWakeForwarder.forward(transcript: finalTranscript)
            }
        }
        self.overlayToken = nil
        self.scheduleRestartRecognizer()
    }

    // MARK: - Audio level handling

    private func noteAudioLevel(rms: Double) {
        guard self.isCapturing else { return }
        // Update adaptive noise floor: faster when lower energy (quiet), slower when loud.
        let alpha: Double = rms < self.noiseFloorRMS ? 0.08 : 0.01
        self.noiseFloorRMS = max(1e-7, self.noiseFloorRMS + (rms - self.noiseFloorRMS) * alpha)
        let threshold = max(self.minSpeechRMS, self.noiseFloorRMS * self.speechBoostFactor)
        if rms >= threshold {
            self.lastHeard = Date()
        }
        // Normalize against the adaptive threshold so the UI meter stays roughly 0...1 across devices.
        let clamped = min(1.0, max(0.0, rms / max(self.minSpeechRMS, threshold)))
        if let token = self.overlayToken {
            Task { @MainActor in
                VoiceSessionCoordinator.shared.updateLevel(token: token, clamped)
            }
        }
    }

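    /// Root-mean-square level of the first channel of a PCM buffer; nil when the buffer is empty.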
    private static func rmsLevel(buffer: AVAudioPCMBuffer) -> Double? {
        guard let channelData = buffer.floatChannelData?.pointee else { return nil }
        let frameCount = Int(buffer.frameLength)
        guard frameCount > 0 else { return nil }
        var sum: Double = 0
        for i in 0..<frameCount {
            let sample = Double(channelData[i])
            sum += sample * sample
        }
        return sqrt(sum / Double(frameCount))
    }

    private func restartRecognizer() {
        // Restart the recognizer so we listen for the next trigger with a clean buffer.
        let current = self.currentConfig
        self.stop(dismissOverlay: false, cancelScheduledRestart: false)
        if let current {
            Task { await self.start(with: current) }
        }
    }

    private func restartRecognizerIfIdleAndOverlayHidden() async {
        if self.isCapturing { return }
        self.restartRecognizer()
    }

    private func scheduleRestartRecognizer(delay: TimeInterval = 0.7) {
        self.scheduledRestartTask?.cancel()
        self.scheduledRestartTask = Task { [weak self] in
            let nanos = UInt64(max(0, delay) * 1_000_000_000)
            try? await Task.sleep(nanoseconds: nanos)
            guard let self else { return }
            await self.consumeScheduledRestart()
            await self.restartRecognizerIfIdleAndOverlayHidden()
        }
    }

    private func consumeScheduledRestart() {
        self.scheduledRestartTask = nil
    }

    func applyPushToTalkCooldown() {
        self.cooldownUntil = Date().addingTimeInterval(self.debounceAfterSend)
    }

    /// Stops the wake listener so push-to-talk (or Talk Mode) can own the mic; callers
    /// resume it later via `refresh(state:)`.
    func pauseForPushToTalk() {
        self.listeningState = .pushToTalk
        self.stop(dismissOverlay: false)
    }

    private func updateHeardBeyondTrigger(withTrimmed trimmed: String) {
        if !self.heardBeyondTrigger, !trimmed.isEmpty {
            self.heardBeyondTrigger = true
        }
    }

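    /// Text-only fallback trim: drops everything up to and including the first trigger occurrence.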
    private static func trimmedAfterTrigger(_ text: String, triggers: [String]) -> String {
        for trigger in triggers {
            let token = trigger.trimmingCharacters(in: .whitespacesAndNewlines)
            guard !token.isEmpty else { continue }
            guard let range = text.range(
                of: token,
                options: [.caseInsensitive, .diacriticInsensitive, .widthInsensitive]) else { continue }
            let trimmed = text[range.upperBound...].trimmingCharacters(in: .whitespacesAndNewlines)
            return String(trimmed)
        }
        return text
    }

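    /// Prefers segment timing to isolate the command after the trigger; falls back to
    /// text trimming when timing is unavailable or yields nothing.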
    private static func commandAfterTrigger(
        transcript: String,
        segments: [WakeWordSegment],
        triggerEndTime: TimeInterval?,
        triggers: [String]) -> String
    {
        guard let triggerEndTime else {
            return self.trimmedAfterTrigger(transcript, triggers: triggers)
        }
        let trimmed = WakeWordGate.commandText(
            transcript: transcript,
            segments: segments,
            triggerEndTime: triggerEndTime)
        return trimmed.isEmpty ? self.trimmedAfterTrigger(transcript, triggers: triggers) : trimmed
    }

    #if DEBUG
    static func _testTrimmedAfterTrigger(_ text: String, triggers: [String]) -> String {
        self.trimmedAfterTrigger(text, triggers: triggers)
    }

    static func _testHasContentAfterTrigger(_ text: String, triggers: [String]) -> Bool {
        !self.trimmedAfterTrigger(text, triggers: triggers).isEmpty
    }

    static func _testAttributedColor(isFinal: Bool) -> NSColor {
        VoiceOverlayTextFormatting.makeAttributed(committed: "sample", volatile: "", isFinal: isFinal)
            .attribute(.foregroundColor, at: 0, effectiveRange: nil) as? NSColor ?? .clear
    }
    #endif
}