Files
openclaw/apps/macos/Sources/OpenClaw/VoiceWakeSettings.swift
Seungwoo hong 63fac653ed fix(talk): Talk Mode TTS improvements for CJK languages (#53553)
* feat(talk): add distinct system sounds for each Talk Mode phase

Play a short system sound on phase transitions to give the user
audible feedback:
- thinking: Tink
- speaking: Pop
- listening (after speech interrupted): Bottle
- listening (after thinking): Submarine

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* feat(talk): add right Shift key to interrupt Talk Mode speech

Add TalkSpeechInterruptMonitor — a dedicated global key monitor that
listens for right Shift (keyCode 60) to interrupt Talk Mode speech.
Independent of Push-to-Talk, so it works even when PTT is disabled.

Stops only the current response; the next conversation cycle
continues normally via sendAndSpeak's resumeListeningIfNeeded flow.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* feat(talk): increase silence detection timeout for CJK locales

Korean, Japanese, and Chinese speakers need longer pauses between
phrases. When the app locale is CJK, enforce a minimum 2000ms
silence window (vs the default 1500ms) to avoid premature
transcript submission.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix(talk): remove force-unwraps and log CJK silence clamp in reloadConfig

Replace non-idiomatic force-unwraps (cfg.voiceId!, cfg.modelId!) with
safe flatMap unwrapping, and add an info log when CJK locale clamps the
silence timeout so the override is observable in diagnostics.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* feat(talk): add settings toggle to mute phase-transition sounds

Add a "Play phase-transition sounds" checkbox to Voice Wake settings.
When disabled, Talk Mode phase transitions (Tink/Pop/Bottle/Submarine)
are silent. Defaults to enabled to preserve existing behavior.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* feat(talk): add toggle for Right Option speech interrupt

Add a "Press Right Option to stop speech" checkbox to Voice Wake
settings. Also change the interrupt key from right Shift to right
Option (keyCode 61) to avoid conflicts with typing.
Defaults to enabled to preserve existing behavior.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix(talk): disable Push-to-Talk while Talk Mode is active

Talk Mode and Push-to-Talk both use the right Option key (keyCode 61).
Disable PTT when Talk Mode is enabled to prevent conflicting handlers,
and restore PTT when Talk Mode is disabled.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix(talk): show info when PTT is paused during Talk Mode

Display a footnote under the Push-to-Talk toggle when both PTT and
Talk Mode are enabled, explaining that PTT is paused while Talk Mode
is active and resumes when Talk Mode is turned off.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fixup: SwiftFormat lint on TalkModeController phase sound switch

Resolves macos-swift CI lint failures introduced by Korean
comment formatting in 'feat(talk): add distinct system sounds for each Talk Mode phase'.
- Collapse consecutive spaces between sound name and comment
- Move floating comments above the listening case expression so
  they're at the correct indent level

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: hongsw <hongsw@hongswui-Macmini.local>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Co-authored-by: Fabian Williams <fabian@adotob.com>
2026-04-25 17:05:51 -04:00

699 lines
27 KiB
Swift

import AppKit
import AVFoundation
import Observation
import Speech
import SwabbleKit
import SwiftUI
import UniformTypeIdentifiers
struct VoiceWakeSettings: View {
@Bindable var state: AppState
let isActive: Bool
@State private var testState: VoiceWakeTestState = .idle
@State private var tester = VoiceWakeTester()
@State private var isTesting = false
@State private var testTimeoutTask: Task<Void, Never>?
@State private var availableMics: [AudioInputDevice] = []
@State private var loadingMics = false
@State private var meterLevel: Double = 0
@State private var meterError: String?
private let meter = MicLevelMonitor()
@State private var micObserver = AudioInputDeviceObserver()
@State private var micRefreshTask: Task<Void, Never>?
@State private var availableLocales: [Locale] = []
@State private var triggerEntries: [TriggerEntry] = []
private let fieldLabelWidth: CGFloat = 140
private let controlWidth: CGFloat = 240
private let isPreview = ProcessInfo.processInfo.isPreview
private struct AudioInputDevice: Identifiable, Equatable {
let uid: String
let name: String
var id: String {
self.uid
}
}
private struct TriggerEntry: Identifiable {
let id: UUID
var value: String
}
private var voiceWakeBinding: Binding<Bool> {
MicRefreshSupport.voiceWakeBinding(for: self.state)
}
var body: some View {
ScrollView(.vertical) {
VStack(alignment: .leading, spacing: 14) {
SettingsToggleRow(
title: "Enable Voice Wake",
subtitle: "Listen for a wake phrase (e.g. \"Claude\") before running voice commands. "
+ "Voice recognition runs fully on-device.",
binding: self.voiceWakeBinding)
.disabled(!voiceWakeSupported)
SettingsToggleRow(
title: "Trigger Talk Mode",
subtitle: """
When a wake phrase is detected, activate Talk Mode for a full voice \
conversation (STT, LLM response, TTS playback) instead of sending a \
text message to the chat.
""",
binding: self.$state.voiceWakeTriggersTalkMode)
.disabled(!self.state.swabbleEnabled)
SettingsToggleRow(
title: "Hold Right Option to talk",
subtitle: """
Push-to-talk mode that starts listening while you hold the key
and shows the preview overlay.
""",
binding: self.$state.voicePushToTalkEnabled)
.disabled(!voiceWakeSupported)
if self.state.voicePushToTalkEnabled, self.state.talkEnabled {
Text("Push-to-Talk is paused while Talk Mode is active. It resumes when Talk Mode is turned off.")
.font(.footnote)
.foregroundStyle(.secondary)
.padding(.leading, 20)
}
SettingsToggleRow(
title: "Play phase-transition sounds",
subtitle: """
Play short system sounds when Talk Mode switches between
listening, thinking, and speaking.
""",
binding: self.$state.talkPhaseSoundsEnabled)
.disabled(!voiceWakeSupported)
SettingsToggleRow(
title: "Press Right Option to stop speech",
subtitle: """
Tap the right Option key to interrupt the assistant while it is
speaking and return to listening.
""",
binding: self.$state.talkShiftToStopEnabled)
.disabled(!voiceWakeSupported)
if !voiceWakeSupported {
Label("Voice Wake requires macOS 26 or newer.", systemImage: "exclamationmark.triangle.fill")
.font(.callout)
.foregroundStyle(.yellow)
.padding(8)
.background(Color.secondary.opacity(0.15))
.clipShape(RoundedRectangle(cornerRadius: 8))
}
self.localePicker
self.micPicker
self.levelMeter
VoiceWakeTestCard(
testState: self.$testState,
isTesting: self.$isTesting,
onToggle: self.toggleTest)
self.chimeSection
self.triggerTable
Spacer(minLength: 8)
}
.frame(maxWidth: .infinity, alignment: .leading)
.padding(.horizontal, 12)
}
.task {
guard !self.isPreview else { return }
await self.loadMicsIfNeeded()
}
.task {
guard !self.isPreview else { return }
await self.loadLocalesIfNeeded()
}
.task {
guard !self.isPreview else { return }
await self.restartMeter()
}
.onAppear {
guard !self.isPreview else { return }
self.startMicObserver()
self.loadTriggerEntries()
}
.onChange(of: self.state.voiceWakeMicID) { _, _ in
guard !self.isPreview else { return }
self.updateSelectedMicName()
Task { await self.restartMeter() }
}
.onChange(of: self.isActive) { _, active in
guard !self.isPreview else { return }
if !active {
self.tester.stop()
self.isTesting = false
self.testState = .idle
self.testTimeoutTask?.cancel()
self.micRefreshTask?.cancel()
self.micRefreshTask = nil
Task { await self.meter.stop() }
self.micObserver.stop()
self.syncTriggerEntriesToState()
} else {
self.startMicObserver()
self.loadTriggerEntries()
}
}
.onDisappear {
guard !self.isPreview else { return }
self.tester.stop()
self.isTesting = false
self.testState = .idle
self.testTimeoutTask?.cancel()
self.micRefreshTask?.cancel()
self.micRefreshTask = nil
self.micObserver.stop()
Task { await self.meter.stop() }
self.syncTriggerEntriesToState()
}
}
private func loadTriggerEntries() {
self.triggerEntries = self.state.swabbleTriggerWords.map { TriggerEntry(id: UUID(), value: $0) }
}
private func syncTriggerEntriesToState() {
self.state.swabbleTriggerWords = self.triggerEntries.map(\.value)
}
private var triggerTable: some View {
VStack(alignment: .leading, spacing: 8) {
HStack {
Text("Trigger words")
.font(.callout.weight(.semibold))
Spacer()
Button {
self.addWord()
} label: {
Label("Add word", systemImage: "plus")
}
.disabled(self.triggerEntries
.contains(where: { $0.value.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty }))
Button("Reset defaults") {
self.triggerEntries = defaultVoiceWakeTriggers.map { TriggerEntry(id: UUID(), value: $0) }
self.syncTriggerEntriesToState()
}
}
VStack(spacing: 0) {
ForEach(self.$triggerEntries) { $entry in
HStack(spacing: 8) {
TextField("Wake word", text: $entry.value)
.textFieldStyle(.roundedBorder)
.onSubmit {
self.syncTriggerEntriesToState()
}
Button {
self.removeWord(id: entry.id)
} label: {
Image(systemName: "trash")
}
.buttonStyle(.borderless)
.help("Remove trigger word")
.frame(width: 24)
}
.padding(8)
if entry.id != self.triggerEntries.last?.id {
Divider()
}
}
}
.frame(maxWidth: .infinity, minHeight: 180, alignment: .topLeading)
.background(Color(nsColor: .textBackgroundColor))
.clipShape(RoundedRectangle(cornerRadius: 6))
.overlay(
RoundedRectangle(cornerRadius: 6)
.stroke(Color.secondary.opacity(0.25), lineWidth: 1))
Text(
"OpenClaw reacts when any trigger appears in a transcription. "
+ "Keep them short to avoid false positives.")
.font(.footnote)
.foregroundStyle(.secondary)
.fixedSize(horizontal: false, vertical: true)
}
}
private var chimeSection: some View {
VStack(alignment: .leading, spacing: 10) {
HStack(alignment: .firstTextBaseline, spacing: 10) {
Text("Sounds")
.font(.callout.weight(.semibold))
Spacer()
}
self.chimeRow(
title: "Trigger sound",
selection: self.$state.voiceWakeTriggerChime)
self.chimeRow(
title: "Send sound",
selection: self.$state.voiceWakeSendChime)
}
.padding(.top, 4)
}
private func addWord() {
self.triggerEntries.append(TriggerEntry(id: UUID(), value: ""))
}
private func removeWord(id: UUID) {
self.triggerEntries.removeAll { $0.id == id }
self.syncTriggerEntriesToState()
}
private func toggleTest() {
guard voiceWakeSupported else {
self.testState = .failed("Voice Wake requires macOS 26 or newer.")
return
}
if self.isTesting {
self.tester.finalize()
self.isTesting = false
self.testState = .finalizing
Task { @MainActor in
try? await Task.sleep(nanoseconds: 2_000_000_000)
if self.testState == .finalizing {
self.tester.stop()
self.testState = .failed("Stopped")
}
}
self.testTimeoutTask?.cancel()
return
}
let triggers = self.sanitizedTriggers()
self.tester.stop()
self.testTimeoutTask?.cancel()
self.isTesting = true
self.testState = .requesting
Task { @MainActor in
do {
try await self.tester.start(
triggers: triggers,
micID: self.state.voiceWakeMicID.isEmpty ? nil : self.state.voiceWakeMicID,
localeID: self.state.voiceWakeLocaleID,
onUpdate: { newState in
DispatchQueue.main.async { [self] in
self.testState = newState
if case .detected = newState { self.isTesting = false }
if case .failed = newState { self.isTesting = false }
if case .detected = newState { self.testTimeoutTask?.cancel() }
if case .failed = newState { self.testTimeoutTask?.cancel() }
}
})
self.testTimeoutTask?.cancel()
self.testTimeoutTask = Task { @MainActor in
try? await Task.sleep(nanoseconds: 10 * 1_000_000_000)
guard !Task.isCancelled else { return }
if self.isTesting {
self.tester.stop()
if case let .hearing(text) = self.testState,
let command = Self.textOnlyCommand(from: text, triggers: triggers)
{
self.testState = .detected(command)
} else {
self.testState = .failed("Timeout: no trigger heard")
}
self.isTesting = false
}
}
} catch {
self.tester.stop()
self.testState = .failed(error.localizedDescription)
self.isTesting = false
self.testTimeoutTask?.cancel()
}
}
}
private func chimeRow(title: String, selection: Binding<VoiceWakeChime>) -> some View {
HStack(alignment: .center, spacing: 10) {
Text(title)
.font(.callout.weight(.semibold))
.frame(width: self.fieldLabelWidth, alignment: .leading)
Menu {
Button("No Sound") { self.selectChime(.none, binding: selection) }
Divider()
ForEach(VoiceWakeChimeCatalog.systemOptions, id: \.self) { option in
Button(VoiceWakeChimeCatalog.displayName(for: option)) {
self.selectChime(.system(name: option), binding: selection)
}
}
Divider()
Button("Choose file…") { self.chooseCustomChime(for: selection) }
} label: {
HStack(spacing: 6) {
Text(selection.wrappedValue.displayLabel)
.lineLimit(1)
.truncationMode(.middle)
Spacer()
Image(systemName: "chevron.down")
.font(.caption)
.foregroundStyle(.secondary)
}
.padding(6)
.frame(minWidth: self.controlWidth, maxWidth: .infinity, alignment: .leading)
.background(Color(nsColor: .windowBackgroundColor))
.overlay(
RoundedRectangle(cornerRadius: 6)
.stroke(Color.secondary.opacity(0.25), lineWidth: 1))
.clipShape(RoundedRectangle(cornerRadius: 6))
}
Button("Play") {
VoiceWakeChimePlayer.play(selection.wrappedValue)
}
.keyboardShortcut(.space, modifiers: [.command])
}
}
private func chooseCustomChime(for selection: Binding<VoiceWakeChime>) {
let panel = NSOpenPanel()
panel.allowedContentTypes = [.audio]
panel.allowsMultipleSelection = false
panel.canChooseDirectories = false
panel.resolvesAliases = true
panel.begin { response in
guard response == .OK, let url = panel.url else { return }
do {
let bookmark = try url.bookmarkData(
options: [.withSecurityScope],
includingResourceValuesForKeys: nil,
relativeTo: nil)
let chosen = VoiceWakeChime.custom(displayName: url.lastPathComponent, bookmark: bookmark)
selection.wrappedValue = chosen
VoiceWakeChimePlayer.play(chosen)
} catch {
// Ignore failures; user can retry.
}
}
}
private func selectChime(_ chime: VoiceWakeChime, binding: Binding<VoiceWakeChime>) {
binding.wrappedValue = chime
VoiceWakeChimePlayer.play(chime)
}
private func sanitizedTriggers() -> [String] {
sanitizeVoiceWakeTriggers(self.state.swabbleTriggerWords)
}
private static func textOnlyCommand(from transcript: String, triggers: [String]) -> String? {
VoiceWakeTextUtils.textOnlyCommand(
transcript: transcript,
triggers: triggers,
minCommandLength: 1,
trimWake: { WakeWordGate.stripWake(text: $0, triggers: $1) })
}
private var micPicker: some View {
VStack(alignment: .leading, spacing: 6) {
HStack(alignment: .firstTextBaseline, spacing: 10) {
Text("Microphone")
.font(.callout.weight(.semibold))
.frame(width: self.fieldLabelWidth, alignment: .leading)
Picker("Microphone", selection: self.$state.voiceWakeMicID) {
Text("System default").tag("")
if self.isSelectedMicUnavailable {
Text(self.state.voiceWakeMicName.isEmpty ? "Unavailable" : self.state.voiceWakeMicName)
.tag(self.state.voiceWakeMicID)
}
ForEach(self.availableMics) { mic in
Text(mic.name).tag(mic.uid)
}
}
.labelsHidden()
.frame(width: self.controlWidth)
}
if self.isSelectedMicUnavailable {
HStack(spacing: 10) {
Color.clear.frame(width: self.fieldLabelWidth, height: 1)
Text("Disconnected (using System default)")
.font(.caption)
.foregroundStyle(.secondary)
.lineLimit(1)
}
}
if self.loadingMics {
ProgressView().controlSize(.small)
}
}
}
private var localePicker: some View {
VStack(alignment: .leading, spacing: 6) {
HStack(alignment: .firstTextBaseline, spacing: 10) {
Text("Recognition language")
.font(.callout.weight(.semibold))
.frame(width: self.fieldLabelWidth, alignment: .leading)
Picker("Language", selection: self.$state.voiceWakeLocaleID) {
let current = Locale(identifier: Locale.current.identifier)
Text("\(self.friendlyName(for: current)) (System)").tag(Locale.current.identifier)
ForEach(self.availableLocales.map(\.identifier), id: \.self) { id in
if id != Locale.current.identifier {
Text(self.friendlyName(for: Locale(identifier: id))).tag(id)
}
}
}
.labelsHidden()
.frame(width: self.controlWidth)
}
if !self.state.voiceWakeAdditionalLocaleIDs.isEmpty {
VStack(alignment: .leading, spacing: 8) {
Text("Additional languages")
.font(.footnote.weight(.semibold))
ForEach(
Array(self.state.voiceWakeAdditionalLocaleIDs.enumerated()),
id: \.offset)
{ idx, localeID in
HStack(spacing: 8) {
Picker("Extra \(idx + 1)", selection: Binding(
get: { localeID },
set: { newValue in
guard self.state
.voiceWakeAdditionalLocaleIDs.indices
.contains(idx) else { return }
self.state
.voiceWakeAdditionalLocaleIDs[idx] =
newValue
})) {
ForEach(self.availableLocales.map(\.identifier), id: \.self) { id in
Text(self.friendlyName(for: Locale(identifier: id))).tag(id)
}
}
.labelsHidden()
.frame(width: 220)
Button {
guard self.state.voiceWakeAdditionalLocaleIDs.indices.contains(idx) else { return }
self.state.voiceWakeAdditionalLocaleIDs.remove(at: idx)
} label: {
Image(systemName: "trash")
}
.buttonStyle(.borderless)
.help("Remove language")
}
}
Button {
if let first = availableLocales.first {
self.state.voiceWakeAdditionalLocaleIDs.append(first.identifier)
}
} label: {
Label("Add language", systemImage: "plus")
}
.disabled(self.availableLocales.isEmpty)
}
.padding(.top, 4)
} else {
Button {
if let first = availableLocales.first {
self.state.voiceWakeAdditionalLocaleIDs.append(first.identifier)
}
} label: {
Label("Add additional language", systemImage: "plus")
}
.buttonStyle(.link)
.disabled(self.availableLocales.isEmpty)
.padding(.top, 4)
}
Text("Languages are tried in order. Models may need a first-use download on macOS 26.")
.font(.caption)
.foregroundStyle(.secondary)
}
}
@MainActor
private func loadMicsIfNeeded(force: Bool = false) async {
guard force || self.availableMics.isEmpty, !self.loadingMics else { return }
self.loadingMics = true
let discovery = AVCaptureDevice.DiscoverySession(
deviceTypes: [.external, .microphone],
mediaType: .audio,
position: .unspecified)
let aliveUIDs = AudioInputDeviceObserver.aliveInputDeviceUIDs()
let connectedDevices = discovery.devices.filter(\.isConnected)
let devices = aliveUIDs.isEmpty
? connectedDevices
: connectedDevices.filter { aliveUIDs.contains($0.uniqueID) }
self.availableMics = devices.map { AudioInputDevice(uid: $0.uniqueID, name: $0.localizedName) }
self.updateSelectedMicName()
self.loadingMics = false
}
private var isSelectedMicUnavailable: Bool {
let selected = self.state.voiceWakeMicID
guard !selected.isEmpty else { return false }
return !self.availableMics.contains(where: { $0.uid == selected })
}
@MainActor
private func updateSelectedMicName() {
self.state.voiceWakeMicName = MicRefreshSupport.selectedMicName(
selectedID: self.state.voiceWakeMicID,
in: self.availableMics,
uid: \.uid,
name: \.name)
}
private func startMicObserver() {
MicRefreshSupport.startObserver(self.micObserver) {
self.scheduleMicRefresh()
}
}
@MainActor
private func scheduleMicRefresh() {
MicRefreshSupport.schedule(refreshTask: &self.micRefreshTask) {
await self.loadMicsIfNeeded(force: true)
await self.restartMeter()
}
}
@MainActor
private func loadLocalesIfNeeded() async {
guard self.availableLocales.isEmpty else { return }
self.availableLocales = Array(SFSpeechRecognizer.supportedLocales()).sorted { lhs, rhs in
self.friendlyName(for: lhs)
.localizedCaseInsensitiveCompare(self.friendlyName(for: rhs)) == .orderedAscending
}
}
private func friendlyName(for locale: Locale) -> String {
let cleanedID = normalizeLocaleIdentifier(locale.identifier)
let cleanLocale = Locale(identifier: cleanedID)
if let langCode = cleanLocale.language.languageCode?.identifier,
let lang = cleanLocale.localizedString(forLanguageCode: langCode),
let regionCode = cleanLocale.region?.identifier,
let region = cleanLocale.localizedString(forRegionCode: regionCode)
{
return "\(lang) (\(region))"
}
if let langCode = cleanLocale.language.languageCode?.identifier,
let lang = cleanLocale.localizedString(forLanguageCode: langCode)
{
return lang
}
return cleanLocale.localizedString(forIdentifier: cleanedID) ?? cleanedID
}
private var levelMeter: some View {
VStack(alignment: .leading, spacing: 6) {
HStack(alignment: .center, spacing: 10) {
Text("Live level")
.font(.callout.weight(.semibold))
.frame(width: self.fieldLabelWidth, alignment: .leading)
MicLevelBar(level: self.meterLevel)
.frame(width: self.controlWidth, alignment: .leading)
Text(self.levelLabel)
.font(.callout.monospacedDigit())
.foregroundStyle(.secondary)
.frame(width: 60, alignment: .trailing)
}
if let meterError {
Text(meterError)
.font(.footnote)
.foregroundStyle(.secondary)
}
}
}
private var levelLabel: String {
let db = (meterLevel * 50) - 50
return String(format: "%.0f dB", db)
}
@MainActor
private func restartMeter() async {
self.meterError = nil
await self.meter.stop()
do {
try await self.meter.start { [weak state] level in
Task { @MainActor in
guard state != nil else { return }
self.meterLevel = level
}
}
} catch {
self.meterError = error.localizedDescription
}
}
}
#if DEBUG
struct VoiceWakeSettings_Previews: PreviewProvider {
static var previews: some View {
VoiceWakeSettings(state: .preview, isActive: true)
.frame(width: SettingsTab.windowWidth, height: SettingsTab.windowHeight)
}
}
@MainActor
extension VoiceWakeSettings {
static func exerciseForTesting() {
let state = AppState(preview: true)
state.swabbleEnabled = true
state.voicePushToTalkEnabled = true
state.swabbleTriggerWords = ["Claude", "Hey"]
let view = VoiceWakeSettings(state: state, isActive: true)
view.availableMics = [AudioInputDevice(uid: "mic-1", name: "Built-in")]
view.availableLocales = [Locale(identifier: "en_US")]
view.meterLevel = 0.42
view.meterError = "No input"
view.testState = .detected("ok")
view.isTesting = true
view.triggerEntries = [TriggerEntry(id: UUID(), value: "Claude")]
_ = view.body
_ = view.localePicker
_ = view.micPicker
_ = view.levelMeter
_ = view.triggerTable
_ = view.chimeSection
view.addWord()
if let entryId = view.triggerEntries.first?.id {
view.removeWord(id: entryId)
}
}
}
#endif