Files
openclaw/apps/macos/Sources/OpenClaw/VoiceWakeSettings.swift
Phillip Hampton 350fe63bbf feat(macos): Voice Wake option to trigger Talk Mode (#58490)
* feat(macos): add "Trigger Talk Mode" option to Voice Wake settings

When enabled, detecting a wake phrase activates Talk Mode (full voice
conversation: STT -> LLM -> TTS playback) instead of sending a text
message to the chat. Enables hands-free voice assistant UX.

Implementation:
- Constants.swift: new `openclaw.voiceWakeTriggersTalkMode` defaults key
- AppState.swift: new property with UserDefaults persistence + runtime
  refresh on change so the toggle takes effect immediately
- VoiceWakeSettings.swift: "Trigger Talk Mode" toggle under Voice Wake,
  disabled when Voice Wake is off
- VoiceWakeRuntime.swift: `beginCapture` checks `triggersTalkMode` and
  activates Talk Mode directly, skipping the capture/overlay/forward
  flow. Pauses the wake listener via `pauseForPushToTalk()` to prevent
  two audio pipelines competing on the mic.
- TalkModeController.swift: resumes voice wake listener when Talk Mode
  exits by calling `VoiceWakeRuntime.refresh`, mirroring PTT teardown.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: address review feedback

- TalkModeController: move VoiceWakeRuntime.refresh() after
  TalkModeRuntime.setEnabled(false) completes, preventing mic
  contention race where wake listener restarts before Talk Mode
  finishes tearing down its audio engine (P1)
- VoiceWakeRuntime: remove redundant haltRecognitionPipeline()
  before pauseForPushToTalk() which already calls it via stop() (P2)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: resume wake listener based on enabled state, not toggle value

Check swabbleEnabled instead of voiceWakeTriggersTalkMode when deciding
whether to resume the wake listener after Talk Mode exits. This ensures
the paused listener resumes even if the user toggled "Trigger Talk Mode"
off during an active session. (P2)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: play trigger chime on Talk Mode activation + send chime before TTS

- VoiceWakeRuntime: play the configured trigger chime in the
  triggersTalkMode branch before pausing the wake listener. The early
  return was skipping the chime that plays in the normal capture flow.
- TalkModeRuntime: play the Voice Wake "send" chime before TTS playback
  starts, giving audible feedback that the AI is about to respond. Talk
  Mode never used this chime despite it being configurable in settings.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: move send chime to transcript finalization (on send, not on reply)

The send chime now plays when the user's speech is finalized and about
to be sent to the AI, not when the TTS response starts. This matches
the semantics of "send sound" -- it confirms your input was captured.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-31 21:09:36 -04:00

674 lines
26 KiB
Swift

import AppKit
import AVFoundation
import Observation
import Speech
import SwabbleKit
import SwiftUI
import UniformTypeIdentifiers
struct VoiceWakeSettings: View {
@Bindable var state: AppState
let isActive: Bool
@State private var testState: VoiceWakeTestState = .idle
@State private var tester = VoiceWakeTester()
@State private var isTesting = false
@State private var testTimeoutTask: Task<Void, Never>?
@State private var availableMics: [AudioInputDevice] = []
@State private var loadingMics = false
@State private var meterLevel: Double = 0
@State private var meterError: String?
private let meter = MicLevelMonitor()
@State private var micObserver = AudioInputDeviceObserver()
@State private var micRefreshTask: Task<Void, Never>?
@State private var availableLocales: [Locale] = []
@State private var triggerEntries: [TriggerEntry] = []
private let fieldLabelWidth: CGFloat = 140
private let controlWidth: CGFloat = 240
private let isPreview = ProcessInfo.processInfo.isPreview
private struct AudioInputDevice: Identifiable, Equatable {
let uid: String
let name: String
var id: String {
self.uid
}
}
private struct TriggerEntry: Identifiable {
let id: UUID
var value: String
}
private var voiceWakeBinding: Binding<Bool> {
MicRefreshSupport.voiceWakeBinding(for: self.state)
}
var body: some View {
ScrollView(.vertical) {
VStack(alignment: .leading, spacing: 14) {
SettingsToggleRow(
title: "Enable Voice Wake",
subtitle: "Listen for a wake phrase (e.g. \"Claude\") before running voice commands. "
+ "Voice recognition runs fully on-device.",
binding: self.voiceWakeBinding)
.disabled(!voiceWakeSupported)
SettingsToggleRow(
title: "Trigger Talk Mode",
subtitle: """
When a wake phrase is detected, activate Talk Mode for a full voice \
conversation (STT, LLM response, TTS playback) instead of sending a \
text message to the chat.
""",
binding: self.$state.voiceWakeTriggersTalkMode)
.disabled(!self.state.swabbleEnabled)
SettingsToggleRow(
title: "Hold Right Option to talk",
subtitle: """
Push-to-talk mode that starts listening while you hold the key
and shows the preview overlay.
""",
binding: self.$state.voicePushToTalkEnabled)
.disabled(!voiceWakeSupported)
if !voiceWakeSupported {
Label("Voice Wake requires macOS 26 or newer.", systemImage: "exclamationmark.triangle.fill")
.font(.callout)
.foregroundStyle(.yellow)
.padding(8)
.background(Color.secondary.opacity(0.15))
.clipShape(RoundedRectangle(cornerRadius: 8))
}
self.localePicker
self.micPicker
self.levelMeter
VoiceWakeTestCard(
testState: self.$testState,
isTesting: self.$isTesting,
onToggle: self.toggleTest)
self.chimeSection
self.triggerTable
Spacer(minLength: 8)
}
.frame(maxWidth: .infinity, alignment: .leading)
.padding(.horizontal, 12)
}
.task {
guard !self.isPreview else { return }
await self.loadMicsIfNeeded()
}
.task {
guard !self.isPreview else { return }
await self.loadLocalesIfNeeded()
}
.task {
guard !self.isPreview else { return }
await self.restartMeter()
}
.onAppear {
guard !self.isPreview else { return }
self.startMicObserver()
self.loadTriggerEntries()
}
.onChange(of: self.state.voiceWakeMicID) { _, _ in
guard !self.isPreview else { return }
self.updateSelectedMicName()
Task { await self.restartMeter() }
}
.onChange(of: self.isActive) { _, active in
guard !self.isPreview else { return }
if !active {
self.tester.stop()
self.isTesting = false
self.testState = .idle
self.testTimeoutTask?.cancel()
self.micRefreshTask?.cancel()
self.micRefreshTask = nil
Task { await self.meter.stop() }
self.micObserver.stop()
self.syncTriggerEntriesToState()
} else {
self.startMicObserver()
self.loadTriggerEntries()
}
}
.onDisappear {
guard !self.isPreview else { return }
self.tester.stop()
self.isTesting = false
self.testState = .idle
self.testTimeoutTask?.cancel()
self.micRefreshTask?.cancel()
self.micRefreshTask = nil
self.micObserver.stop()
Task { await self.meter.stop() }
self.syncTriggerEntriesToState()
}
}
private func loadTriggerEntries() {
self.triggerEntries = self.state.swabbleTriggerWords.map { TriggerEntry(id: UUID(), value: $0) }
}
private func syncTriggerEntriesToState() {
self.state.swabbleTriggerWords = self.triggerEntries.map(\.value)
}
private var triggerTable: some View {
VStack(alignment: .leading, spacing: 8) {
HStack {
Text("Trigger words")
.font(.callout.weight(.semibold))
Spacer()
Button {
self.addWord()
} label: {
Label("Add word", systemImage: "plus")
}
.disabled(self.triggerEntries
.contains(where: { $0.value.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty }))
Button("Reset defaults") {
self.triggerEntries = defaultVoiceWakeTriggers.map { TriggerEntry(id: UUID(), value: $0) }
self.syncTriggerEntriesToState()
}
}
VStack(spacing: 0) {
ForEach(self.$triggerEntries) { $entry in
HStack(spacing: 8) {
TextField("Wake word", text: $entry.value)
.textFieldStyle(.roundedBorder)
.onSubmit {
self.syncTriggerEntriesToState()
}
Button {
self.removeWord(id: entry.id)
} label: {
Image(systemName: "trash")
}
.buttonStyle(.borderless)
.help("Remove trigger word")
.frame(width: 24)
}
.padding(8)
if entry.id != self.triggerEntries.last?.id {
Divider()
}
}
}
.frame(maxWidth: .infinity, minHeight: 180, alignment: .topLeading)
.background(Color(nsColor: .textBackgroundColor))
.clipShape(RoundedRectangle(cornerRadius: 6))
.overlay(
RoundedRectangle(cornerRadius: 6)
.stroke(Color.secondary.opacity(0.25), lineWidth: 1))
Text(
"OpenClaw reacts when any trigger appears in a transcription. "
+ "Keep them short to avoid false positives.")
.font(.footnote)
.foregroundStyle(.secondary)
.fixedSize(horizontal: false, vertical: true)
}
}
private var chimeSection: some View {
VStack(alignment: .leading, spacing: 10) {
HStack(alignment: .firstTextBaseline, spacing: 10) {
Text("Sounds")
.font(.callout.weight(.semibold))
Spacer()
}
self.chimeRow(
title: "Trigger sound",
selection: self.$state.voiceWakeTriggerChime)
self.chimeRow(
title: "Send sound",
selection: self.$state.voiceWakeSendChime)
}
.padding(.top, 4)
}
private func addWord() {
self.triggerEntries.append(TriggerEntry(id: UUID(), value: ""))
}
private func removeWord(id: UUID) {
self.triggerEntries.removeAll { $0.id == id }
self.syncTriggerEntriesToState()
}
private func toggleTest() {
guard voiceWakeSupported else {
self.testState = .failed("Voice Wake requires macOS 26 or newer.")
return
}
if self.isTesting {
self.tester.finalize()
self.isTesting = false
self.testState = .finalizing
Task { @MainActor in
try? await Task.sleep(nanoseconds: 2_000_000_000)
if self.testState == .finalizing {
self.tester.stop()
self.testState = .failed("Stopped")
}
}
self.testTimeoutTask?.cancel()
return
}
let triggers = self.sanitizedTriggers()
self.tester.stop()
self.testTimeoutTask?.cancel()
self.isTesting = true
self.testState = .requesting
Task { @MainActor in
do {
try await self.tester.start(
triggers: triggers,
micID: self.state.voiceWakeMicID.isEmpty ? nil : self.state.voiceWakeMicID,
localeID: self.state.voiceWakeLocaleID,
onUpdate: { newState in
DispatchQueue.main.async { [self] in
self.testState = newState
if case .detected = newState { self.isTesting = false }
if case .failed = newState { self.isTesting = false }
if case .detected = newState { self.testTimeoutTask?.cancel() }
if case .failed = newState { self.testTimeoutTask?.cancel() }
}
})
self.testTimeoutTask?.cancel()
self.testTimeoutTask = Task { @MainActor in
try? await Task.sleep(nanoseconds: 10 * 1_000_000_000)
guard !Task.isCancelled else { return }
if self.isTesting {
self.tester.stop()
if case let .hearing(text) = self.testState,
let command = Self.textOnlyCommand(from: text, triggers: triggers)
{
self.testState = .detected(command)
} else {
self.testState = .failed("Timeout: no trigger heard")
}
self.isTesting = false
}
}
} catch {
self.tester.stop()
self.testState = .failed(error.localizedDescription)
self.isTesting = false
self.testTimeoutTask?.cancel()
}
}
}
private func chimeRow(title: String, selection: Binding<VoiceWakeChime>) -> some View {
HStack(alignment: .center, spacing: 10) {
Text(title)
.font(.callout.weight(.semibold))
.frame(width: self.fieldLabelWidth, alignment: .leading)
Menu {
Button("No Sound") { self.selectChime(.none, binding: selection) }
Divider()
ForEach(VoiceWakeChimeCatalog.systemOptions, id: \.self) { option in
Button(VoiceWakeChimeCatalog.displayName(for: option)) {
self.selectChime(.system(name: option), binding: selection)
}
}
Divider()
Button("Choose file…") { self.chooseCustomChime(for: selection) }
} label: {
HStack(spacing: 6) {
Text(selection.wrappedValue.displayLabel)
.lineLimit(1)
.truncationMode(.middle)
Spacer()
Image(systemName: "chevron.down")
.font(.caption)
.foregroundStyle(.secondary)
}
.padding(6)
.frame(minWidth: self.controlWidth, maxWidth: .infinity, alignment: .leading)
.background(Color(nsColor: .windowBackgroundColor))
.overlay(
RoundedRectangle(cornerRadius: 6)
.stroke(Color.secondary.opacity(0.25), lineWidth: 1))
.clipShape(RoundedRectangle(cornerRadius: 6))
}
Button("Play") {
VoiceWakeChimePlayer.play(selection.wrappedValue)
}
.keyboardShortcut(.space, modifiers: [.command])
}
}
private func chooseCustomChime(for selection: Binding<VoiceWakeChime>) {
let panel = NSOpenPanel()
panel.allowedContentTypes = [.audio]
panel.allowsMultipleSelection = false
panel.canChooseDirectories = false
panel.resolvesAliases = true
panel.begin { response in
guard response == .OK, let url = panel.url else { return }
do {
let bookmark = try url.bookmarkData(
options: [.withSecurityScope],
includingResourceValuesForKeys: nil,
relativeTo: nil)
let chosen = VoiceWakeChime.custom(displayName: url.lastPathComponent, bookmark: bookmark)
selection.wrappedValue = chosen
VoiceWakeChimePlayer.play(chosen)
} catch {
// Ignore failures; user can retry.
}
}
}
private func selectChime(_ chime: VoiceWakeChime, binding: Binding<VoiceWakeChime>) {
binding.wrappedValue = chime
VoiceWakeChimePlayer.play(chime)
}
private func sanitizedTriggers() -> [String] {
sanitizeVoiceWakeTriggers(self.state.swabbleTriggerWords)
}
private static func textOnlyCommand(from transcript: String, triggers: [String]) -> String? {
VoiceWakeTextUtils.textOnlyCommand(
transcript: transcript,
triggers: triggers,
minCommandLength: 1,
trimWake: { WakeWordGate.stripWake(text: $0, triggers: $1) })
}
private var micPicker: some View {
VStack(alignment: .leading, spacing: 6) {
HStack(alignment: .firstTextBaseline, spacing: 10) {
Text("Microphone")
.font(.callout.weight(.semibold))
.frame(width: self.fieldLabelWidth, alignment: .leading)
Picker("Microphone", selection: self.$state.voiceWakeMicID) {
Text("System default").tag("")
if self.isSelectedMicUnavailable {
Text(self.state.voiceWakeMicName.isEmpty ? "Unavailable" : self.state.voiceWakeMicName)
.tag(self.state.voiceWakeMicID)
}
ForEach(self.availableMics) { mic in
Text(mic.name).tag(mic.uid)
}
}
.labelsHidden()
.frame(width: self.controlWidth)
}
if self.isSelectedMicUnavailable {
HStack(spacing: 10) {
Color.clear.frame(width: self.fieldLabelWidth, height: 1)
Text("Disconnected (using System default)")
.font(.caption)
.foregroundStyle(.secondary)
.lineLimit(1)
}
}
if self.loadingMics {
ProgressView().controlSize(.small)
}
}
}
private var localePicker: some View {
VStack(alignment: .leading, spacing: 6) {
HStack(alignment: .firstTextBaseline, spacing: 10) {
Text("Recognition language")
.font(.callout.weight(.semibold))
.frame(width: self.fieldLabelWidth, alignment: .leading)
Picker("Language", selection: self.$state.voiceWakeLocaleID) {
let current = Locale(identifier: Locale.current.identifier)
Text("\(self.friendlyName(for: current)) (System)").tag(Locale.current.identifier)
ForEach(self.availableLocales.map(\.identifier), id: \.self) { id in
if id != Locale.current.identifier {
Text(self.friendlyName(for: Locale(identifier: id))).tag(id)
}
}
}
.labelsHidden()
.frame(width: self.controlWidth)
}
if !self.state.voiceWakeAdditionalLocaleIDs.isEmpty {
VStack(alignment: .leading, spacing: 8) {
Text("Additional languages")
.font(.footnote.weight(.semibold))
ForEach(
Array(self.state.voiceWakeAdditionalLocaleIDs.enumerated()),
id: \.offset)
{ idx, localeID in
HStack(spacing: 8) {
Picker("Extra \(idx + 1)", selection: Binding(
get: { localeID },
set: { newValue in
guard self.state
.voiceWakeAdditionalLocaleIDs.indices
.contains(idx) else { return }
self.state
.voiceWakeAdditionalLocaleIDs[idx] =
newValue
})) {
ForEach(self.availableLocales.map(\.identifier), id: \.self) { id in
Text(self.friendlyName(for: Locale(identifier: id))).tag(id)
}
}
.labelsHidden()
.frame(width: 220)
Button {
guard self.state.voiceWakeAdditionalLocaleIDs.indices.contains(idx) else { return }
self.state.voiceWakeAdditionalLocaleIDs.remove(at: idx)
} label: {
Image(systemName: "trash")
}
.buttonStyle(.borderless)
.help("Remove language")
}
}
Button {
if let first = availableLocales.first {
self.state.voiceWakeAdditionalLocaleIDs.append(first.identifier)
}
} label: {
Label("Add language", systemImage: "plus")
}
.disabled(self.availableLocales.isEmpty)
}
.padding(.top, 4)
} else {
Button {
if let first = availableLocales.first {
self.state.voiceWakeAdditionalLocaleIDs.append(first.identifier)
}
} label: {
Label("Add additional language", systemImage: "plus")
}
.buttonStyle(.link)
.disabled(self.availableLocales.isEmpty)
.padding(.top, 4)
}
Text("Languages are tried in order. Models may need a first-use download on macOS 26.")
.font(.caption)
.foregroundStyle(.secondary)
}
}
@MainActor
private func loadMicsIfNeeded(force: Bool = false) async {
guard force || self.availableMics.isEmpty, !self.loadingMics else { return }
self.loadingMics = true
let discovery = AVCaptureDevice.DiscoverySession(
deviceTypes: [.external, .microphone],
mediaType: .audio,
position: .unspecified)
let aliveUIDs = AudioInputDeviceObserver.aliveInputDeviceUIDs()
let connectedDevices = discovery.devices.filter(\.isConnected)
let devices = aliveUIDs.isEmpty
? connectedDevices
: connectedDevices.filter { aliveUIDs.contains($0.uniqueID) }
self.availableMics = devices.map { AudioInputDevice(uid: $0.uniqueID, name: $0.localizedName) }
self.updateSelectedMicName()
self.loadingMics = false
}
private var isSelectedMicUnavailable: Bool {
let selected = self.state.voiceWakeMicID
guard !selected.isEmpty else { return false }
return !self.availableMics.contains(where: { $0.uid == selected })
}
@MainActor
private func updateSelectedMicName() {
self.state.voiceWakeMicName = MicRefreshSupport.selectedMicName(
selectedID: self.state.voiceWakeMicID,
in: self.availableMics,
uid: \.uid,
name: \.name)
}
private func startMicObserver() {
MicRefreshSupport.startObserver(self.micObserver) {
self.scheduleMicRefresh()
}
}
@MainActor
private func scheduleMicRefresh() {
MicRefreshSupport.schedule(refreshTask: &self.micRefreshTask) {
await self.loadMicsIfNeeded(force: true)
await self.restartMeter()
}
}
@MainActor
private func loadLocalesIfNeeded() async {
guard self.availableLocales.isEmpty else { return }
self.availableLocales = Array(SFSpeechRecognizer.supportedLocales()).sorted { lhs, rhs in
self.friendlyName(for: lhs)
.localizedCaseInsensitiveCompare(self.friendlyName(for: rhs)) == .orderedAscending
}
}
private func friendlyName(for locale: Locale) -> String {
let cleanedID = normalizeLocaleIdentifier(locale.identifier)
let cleanLocale = Locale(identifier: cleanedID)
if let langCode = cleanLocale.language.languageCode?.identifier,
let lang = cleanLocale.localizedString(forLanguageCode: langCode),
let regionCode = cleanLocale.region?.identifier,
let region = cleanLocale.localizedString(forRegionCode: regionCode)
{
return "\(lang) (\(region))"
}
if let langCode = cleanLocale.language.languageCode?.identifier,
let lang = cleanLocale.localizedString(forLanguageCode: langCode)
{
return lang
}
return cleanLocale.localizedString(forIdentifier: cleanedID) ?? cleanedID
}
private var levelMeter: some View {
VStack(alignment: .leading, spacing: 6) {
HStack(alignment: .center, spacing: 10) {
Text("Live level")
.font(.callout.weight(.semibold))
.frame(width: self.fieldLabelWidth, alignment: .leading)
MicLevelBar(level: self.meterLevel)
.frame(width: self.controlWidth, alignment: .leading)
Text(self.levelLabel)
.font(.callout.monospacedDigit())
.foregroundStyle(.secondary)
.frame(width: 60, alignment: .trailing)
}
if let meterError {
Text(meterError)
.font(.footnote)
.foregroundStyle(.secondary)
}
}
}
private var levelLabel: String {
let db = (meterLevel * 50) - 50
return String(format: "%.0f dB", db)
}
@MainActor
private func restartMeter() async {
self.meterError = nil
await self.meter.stop()
do {
try await self.meter.start { [weak state] level in
Task { @MainActor in
guard state != nil else { return }
self.meterLevel = level
}
}
} catch {
self.meterError = error.localizedDescription
}
}
}
#if DEBUG
struct VoiceWakeSettings_Previews: PreviewProvider {
static var previews: some View {
VoiceWakeSettings(state: .preview, isActive: true)
.frame(width: SettingsTab.windowWidth, height: SettingsTab.windowHeight)
}
}
@MainActor
extension VoiceWakeSettings {
static func exerciseForTesting() {
let state = AppState(preview: true)
state.swabbleEnabled = true
state.voicePushToTalkEnabled = true
state.swabbleTriggerWords = ["Claude", "Hey"]
let view = VoiceWakeSettings(state: state, isActive: true)
view.availableMics = [AudioInputDevice(uid: "mic-1", name: "Built-in")]
view.availableLocales = [Locale(identifier: "en_US")]
view.meterLevel = 0.42
view.meterError = "No input"
view.testState = .detected("ok")
view.isTesting = true
view.triggerEntries = [TriggerEntry(id: UUID(), value: "Claude")]
_ = view.body
_ = view.localePicker
_ = view.micPicker
_ = view.levelMeter
_ = view.triggerTable
_ = view.chimeSection
view.addWord()
if let entryId = view.triggerEntries.first?.id {
view.removeWord(id: entryId)
}
}
}
#endif