mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-07 22:32:51 +00:00
* feat(talk): add distinct system sounds for each Talk Mode phase Play a short system sound on phase transitions to give the user audible feedback: - thinking: Tink - speaking: Pop - listening (after speech interrupted): Bottle - listening (after thinking): Submarine Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat(talk): add right Shift key to interrupt Talk Mode speech Add TalkSpeechInterruptMonitor — a dedicated global key monitor that listens for right Shift (keyCode 60) to interrupt Talk Mode speech. Independent of Push-to-Talk, so it works even when PTT is disabled. Stops only the current response; the next conversation cycle continues normally via sendAndSpeak's resumeListeningIfNeeded flow. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat(talk): increase silence detection timeout for CJK locales Korean, Japanese, and Chinese speakers need longer pauses between phrases. When the app locale is CJK, enforce a minimum 2000ms silence window (vs the default 1500ms) to avoid premature transcript submission. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix(talk): remove force-unwraps and log CJK silence clamp in reloadConfig Replace non-idiomatic force-unwraps (cfg.voiceId!, cfg.modelId!) with safe flatMap unwrapping, and add an info log when CJK locale clamps the silence timeout so the override is observable in diagnostics. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat(talk): add settings toggle to mute phase-transition sounds Add a "Play phase-transition sounds" checkbox to Voice Wake settings. When disabled, Talk Mode phase transitions (Tink/Pop/Bottle/Submarine) are silent. Defaults to enabled to preserve existing behavior. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat(talk): add toggle for Right Option speech interrupt Add a "Press Right Option to stop speech" checkbox to Voice Wake settings. Also change the interrupt key from right Shift to right Option (keyCode 61) to avoid conflicts with typing. Defaults to enabled to preserve existing behavior. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix(talk): disable Push-to-Talk while Talk Mode is active Talk Mode and Push-to-Talk both use the right Option key (keyCode 61). Disable PTT when Talk Mode is enabled to prevent conflicting handlers, and restore PTT when Talk Mode is disabled. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix(talk): show info when PTT is paused during Talk Mode Display a footnote under the Push-to-Talk toggle when both PTT and Talk Mode are enabled, explaining that PTT is paused while Talk Mode is active and resumes when Talk Mode is turned off. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fixup: SwiftFormat lint on TalkModeController phase sound switch Resolves macos-swift CI lint failures introduced by Korean comment formatting in 'feat(talk): add distinct system sounds for each Talk Mode phase'. - Collapse consecutive spaces between sound name and comment - Move floating comments above the listening case expression so they're at the correct indent level Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --------- Co-authored-by: hongsw <hongsw@hongswui-Macmini.local> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Co-authored-by: Fabian Williams <fabian@adotob.com>
411 lines
16 KiB
Swift
411 lines
16 KiB
Swift
import AppKit
|
|
import AVFoundation
|
|
import Dispatch
|
|
import OSLog
|
|
import Speech
|
|
|
|
/// Observes right Option and starts a push-to-talk capture while it is held.
|
|
final class VoicePushToTalkHotkey: @unchecked Sendable {
|
|
static let shared = VoicePushToTalkHotkey()
|
|
|
|
private var globalMonitor: Any?
|
|
private var localMonitor: Any?
|
|
private var optionDown = false // right option only
|
|
private var active = false
|
|
|
|
private let beginAction: @Sendable () async -> Void
|
|
private let endAction: @Sendable () async -> Void
|
|
|
|
init(
|
|
beginAction: @escaping @Sendable () async -> Void = { await VoicePushToTalk.shared.begin() },
|
|
endAction: @escaping @Sendable () async -> Void = { await VoicePushToTalk.shared.end() })
|
|
{
|
|
self.beginAction = beginAction
|
|
self.endAction = endAction
|
|
}
|
|
|
|
func setEnabled(_ enabled: Bool) {
|
|
if ProcessInfo.processInfo.isRunningTests { return }
|
|
self.withMainThread { [weak self] in
|
|
guard let self else { return }
|
|
if enabled {
|
|
self.startMonitoring()
|
|
} else {
|
|
self.stopMonitoring()
|
|
}
|
|
}
|
|
}
|
|
|
|
private func startMonitoring() {
|
|
// assert(Thread.isMainThread) - Removed for Swift 6
|
|
guard self.globalMonitor == nil, self.localMonitor == nil else { return }
|
|
// Listen-only global monitor; we rely on Input Monitoring permission to receive events.
|
|
self.globalMonitor = NSEvent.addGlobalMonitorForEvents(matching: .flagsChanged) { [weak self] event in
|
|
let keyCode = event.keyCode
|
|
let flags = event.modifierFlags
|
|
self?.handleFlagsChanged(keyCode: keyCode, modifierFlags: flags)
|
|
}
|
|
// Also listen locally so we still catch events when the app is active/focused.
|
|
self.localMonitor = NSEvent.addLocalMonitorForEvents(matching: .flagsChanged) { [weak self] event in
|
|
let keyCode = event.keyCode
|
|
let flags = event.modifierFlags
|
|
self?.handleFlagsChanged(keyCode: keyCode, modifierFlags: flags)
|
|
return event
|
|
}
|
|
}
|
|
|
|
private func stopMonitoring() {
|
|
// assert(Thread.isMainThread) - Removed for Swift 6
|
|
if let globalMonitor {
|
|
NSEvent.removeMonitor(globalMonitor)
|
|
self.globalMonitor = nil
|
|
}
|
|
if let localMonitor {
|
|
NSEvent.removeMonitor(localMonitor)
|
|
self.localMonitor = nil
|
|
}
|
|
self.optionDown = false
|
|
self.active = false
|
|
}
|
|
|
|
private func handleFlagsChanged(keyCode: UInt16, modifierFlags: NSEvent.ModifierFlags) {
|
|
self.withMainThread { [weak self] in
|
|
self?.updateModifierState(keyCode: keyCode, modifierFlags: modifierFlags)
|
|
}
|
|
}
|
|
|
|
private func withMainThread(_ block: @escaping @Sendable () -> Void) {
|
|
DispatchQueue.main.async(execute: block)
|
|
}
|
|
|
|
private func updateModifierState(keyCode: UInt16, modifierFlags: NSEvent.ModifierFlags) {
|
|
// assert(Thread.isMainThread) - Removed for Swift 6
|
|
|
|
// Right Option (keyCode 61) acts as a hold-to-talk modifier.
|
|
if keyCode == 61 {
|
|
self.optionDown = modifierFlags.contains(.option)
|
|
}
|
|
|
|
let chordActive = self.optionDown
|
|
if chordActive, !self.active {
|
|
self.active = true
|
|
Task {
|
|
Logger(subsystem: "ai.openclaw", category: "voicewake.ptt")
|
|
.info("ptt hotkey down")
|
|
await self.beginAction()
|
|
}
|
|
} else if !chordActive, self.active {
|
|
self.active = false
|
|
Task {
|
|
Logger(subsystem: "ai.openclaw", category: "voicewake.ptt")
|
|
.info("ptt hotkey up")
|
|
await self.endAction()
|
|
}
|
|
}
|
|
}
|
|
|
|
func _testUpdateModifierState(keyCode: UInt16, modifierFlags: NSEvent.ModifierFlags) {
|
|
self.updateModifierState(keyCode: keyCode, modifierFlags: modifierFlags)
|
|
}
|
|
}
|
|
|
|
/// Short-lived speech recognizer that records while the hotkey is held.
|
|
actor VoicePushToTalk {
|
|
static let shared = VoicePushToTalk()
|
|
|
|
private let logger = Logger(subsystem: "ai.openclaw", category: "voicewake.ptt")
|
|
|
|
private var recognizer: SFSpeechRecognizer?
|
|
// Lazily created on begin() to avoid creating an AVAudioEngine at app launch, which can switch Bluetooth
|
|
// headphones into the low-quality headset profile even if push-to-talk is never used.
|
|
private var audioEngine: AVAudioEngine?
|
|
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
|
|
private var recognitionTask: SFSpeechRecognitionTask?
|
|
private var tapInstalled = false
|
|
|
|
/// Session token used to drop stale callbacks when a new capture starts.
|
|
private var sessionID = UUID()
|
|
|
|
private var committed: String = ""
|
|
private var volatile: String = ""
|
|
private var activeConfig: Config?
|
|
private var isCapturing = false
|
|
private var triggerChimePlayed = false
|
|
private var finalized = false
|
|
private var timeoutTask: Task<Void, Never>?
|
|
private var overlayToken: UUID?
|
|
private var adoptedPrefix: String = ""
|
|
|
|
private struct Config {
|
|
let micID: String?
|
|
let localeID: String?
|
|
let triggerChime: VoiceWakeChime
|
|
let sendChime: VoiceWakeChime
|
|
}
|
|
|
|
func begin() async {
|
|
guard voiceWakeSupported else { return }
|
|
guard !self.isCapturing else { return }
|
|
|
|
// Start a fresh session and invalidate any in-flight callbacks tied to an older one.
|
|
let sessionID = UUID()
|
|
self.sessionID = sessionID
|
|
|
|
// Ensure permissions up front.
|
|
let granted = await PermissionManager.ensureVoiceWakePermissions(interactive: true)
|
|
guard granted else { return }
|
|
|
|
let config = await MainActor.run { self.makeConfig() }
|
|
self.activeConfig = config
|
|
self.isCapturing = true
|
|
self.triggerChimePlayed = false
|
|
self.finalized = false
|
|
self.timeoutTask?.cancel(); self.timeoutTask = nil
|
|
let snapshot = await MainActor.run { VoiceSessionCoordinator.shared.snapshot() }
|
|
self.adoptedPrefix = snapshot.visible ? snapshot.text.trimmingCharacters(in: .whitespacesAndNewlines) : ""
|
|
self.logger.info("ptt begin adopted_prefix_len=\(self.adoptedPrefix.count, privacy: .public)")
|
|
if config.triggerChime != .none {
|
|
self.triggerChimePlayed = true
|
|
await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime, reason: "ptt.trigger") }
|
|
}
|
|
// Pause the always-on wake word recognizer so both pipelines don't fight over the mic tap.
|
|
await VoiceWakeRuntime.shared.pauseForPushToTalk()
|
|
let adoptedPrefix = self.adoptedPrefix
|
|
let adoptedAttributed: NSAttributedString? = adoptedPrefix.isEmpty ? nil : VoiceOverlayTextFormatting
|
|
.makeAttributed(
|
|
committed: adoptedPrefix,
|
|
volatile: "",
|
|
isFinal: false)
|
|
self.overlayToken = await MainActor.run {
|
|
VoiceSessionCoordinator.shared.startSession(
|
|
source: .pushToTalk,
|
|
text: adoptedPrefix,
|
|
attributed: adoptedAttributed,
|
|
forwardEnabled: true)
|
|
}
|
|
|
|
do {
|
|
try await self.startRecognition(localeID: config.localeID, sessionID: sessionID)
|
|
} catch {
|
|
await MainActor.run {
|
|
VoiceWakeOverlayController.shared.dismiss()
|
|
}
|
|
self.isCapturing = false
|
|
// If push-to-talk fails to start after pausing wake-word, ensure we resume listening.
|
|
await VoiceWakeRuntime.shared.applyPushToTalkCooldown()
|
|
await VoiceWakeRuntime.shared.refresh(state: AppStateStore.shared)
|
|
}
|
|
}
|
|
|
|
func end() async {
|
|
guard self.isCapturing else { return }
|
|
self.isCapturing = false
|
|
let sessionID = self.sessionID
|
|
|
|
// Stop feeding Speech buffers first, then end the request. Stopping the engine here can race with
|
|
// Speech draining its converter chain (and we already stop/cancel in finalize).
|
|
if self.tapInstalled {
|
|
self.audioEngine?.inputNode.removeTap(onBus: 0)
|
|
self.tapInstalled = false
|
|
}
|
|
self.recognitionRequest?.endAudio()
|
|
|
|
// If we captured nothing, dismiss immediately when the user lets go.
|
|
if self.committed.isEmpty, self.volatile.isEmpty, self.adoptedPrefix.isEmpty {
|
|
await self.finalize(transcriptOverride: "", reason: "emptyOnRelease", sessionID: sessionID)
|
|
return
|
|
}
|
|
|
|
// Otherwise, give Speech a brief window to deliver the final result; then fall back.
|
|
self.timeoutTask?.cancel()
|
|
self.timeoutTask = Task { [weak self] in
|
|
try? await Task.sleep(nanoseconds: 1_500_000_000) // 1.5s grace period to await final result
|
|
await self?.finalize(transcriptOverride: nil, reason: "timeout", sessionID: sessionID)
|
|
}
|
|
}
|
|
|
|
// MARK: - Private
|
|
|
|
private func startRecognition(localeID: String?, sessionID: UUID) async throws {
|
|
let locale = localeID.flatMap { Locale(identifier: $0) } ?? Locale(identifier: Locale.current.identifier)
|
|
self.recognizer = SFSpeechRecognizer(locale: locale)
|
|
guard let recognizer, recognizer.isAvailable else {
|
|
throw NSError(
|
|
domain: "VoicePushToTalk",
|
|
code: 1,
|
|
userInfo: [NSLocalizedDescriptionKey: "Recognizer unavailable"])
|
|
}
|
|
|
|
self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
|
|
self.recognitionRequest?.shouldReportPartialResults = true
|
|
guard let request = self.recognitionRequest else { return }
|
|
|
|
// Lazily create the engine here so app launch doesn't grab audio resources / trigger Bluetooth HFP.
|
|
if self.audioEngine == nil {
|
|
self.audioEngine = AVAudioEngine()
|
|
}
|
|
guard let audioEngine = self.audioEngine else { return }
|
|
|
|
guard AudioInputDeviceObserver.hasUsableDefaultInputDevice() else {
|
|
self.audioEngine = nil
|
|
throw NSError(
|
|
domain: "VoicePushToTalk",
|
|
code: 1,
|
|
userInfo: [NSLocalizedDescriptionKey: "No usable audio input device available"])
|
|
}
|
|
|
|
let input = audioEngine.inputNode
|
|
let format = input.outputFormat(forBus: 0)
|
|
if self.tapInstalled {
|
|
input.removeTap(onBus: 0)
|
|
self.tapInstalled = false
|
|
}
|
|
// Pipe raw mic buffers into the Speech request while the chord is held.
|
|
input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request] buffer, _ in
|
|
request?.append(buffer)
|
|
}
|
|
self.tapInstalled = true
|
|
|
|
audioEngine.prepare()
|
|
try audioEngine.start()
|
|
|
|
self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in
|
|
guard let self else { return }
|
|
if let error {
|
|
self.logger.debug("push-to-talk error: \(error.localizedDescription, privacy: .public)")
|
|
}
|
|
let transcript = result?.bestTranscription.formattedString
|
|
let isFinal = result?.isFinal ?? false
|
|
// Hop to a Task so UI updates stay off the Speech callback thread.
|
|
Task.detached { [weak self, transcript, isFinal, sessionID] in
|
|
guard let self else { return }
|
|
await self.handle(transcript: transcript, isFinal: isFinal, sessionID: sessionID)
|
|
}
|
|
}
|
|
}
|
|
|
|
private func handle(transcript: String?, isFinal: Bool, sessionID: UUID) async {
|
|
guard sessionID == self.sessionID else {
|
|
self.logger.debug("push-to-talk drop transcript for stale session")
|
|
return
|
|
}
|
|
guard let transcript else { return }
|
|
if isFinal {
|
|
self.committed = transcript
|
|
self.volatile = ""
|
|
} else {
|
|
self.volatile = VoiceOverlayTextFormatting.delta(after: self.committed, current: transcript)
|
|
}
|
|
|
|
let committedWithPrefix = Self.join(self.adoptedPrefix, self.committed)
|
|
let snapshot = Self.join(committedWithPrefix, self.volatile)
|
|
let attributed = VoiceOverlayTextFormatting.makeAttributed(
|
|
committed: committedWithPrefix,
|
|
volatile: self.volatile,
|
|
isFinal: isFinal)
|
|
if let token = self.overlayToken {
|
|
await MainActor.run {
|
|
VoiceSessionCoordinator.shared.updatePartial(
|
|
token: token,
|
|
text: snapshot,
|
|
attributed: attributed)
|
|
}
|
|
}
|
|
}
|
|
|
|
private func finalize(transcriptOverride: String?, reason: String, sessionID: UUID?) async {
|
|
if self.finalized { return }
|
|
if let sessionID, sessionID != self.sessionID {
|
|
self.logger.debug("push-to-talk drop finalize for stale session")
|
|
return
|
|
}
|
|
self.finalized = true
|
|
self.isCapturing = false
|
|
self.timeoutTask?.cancel(); self.timeoutTask = nil
|
|
|
|
let finalRecognized: String = {
|
|
if let override = transcriptOverride?.trimmingCharacters(in: .whitespacesAndNewlines) {
|
|
return override
|
|
}
|
|
return (self.committed + self.volatile).trimmingCharacters(in: .whitespacesAndNewlines)
|
|
}()
|
|
let finalText = Self.join(self.adoptedPrefix, finalRecognized)
|
|
let chime = finalText.isEmpty ? .none : (self.activeConfig?.sendChime ?? .none)
|
|
|
|
let token = self.overlayToken
|
|
let logger = self.logger
|
|
await MainActor.run {
|
|
logger.info("ptt finalize reason=\(reason, privacy: .public) len=\(finalText.count, privacy: .public)")
|
|
if let token {
|
|
VoiceSessionCoordinator.shared.finalize(
|
|
token: token,
|
|
text: finalText,
|
|
sendChime: chime,
|
|
autoSendAfter: nil)
|
|
VoiceSessionCoordinator.shared.sendNow(token: token, reason: reason)
|
|
} else if !finalText.isEmpty {
|
|
if chime != .none {
|
|
VoiceWakeChimePlayer.play(chime, reason: "ptt.fallback_send")
|
|
}
|
|
Task.detached {
|
|
await VoiceWakeForwarder.forward(transcript: finalText)
|
|
}
|
|
}
|
|
}
|
|
|
|
self.recognitionTask?.cancel()
|
|
self.recognitionRequest = nil
|
|
self.recognitionTask = nil
|
|
if self.tapInstalled {
|
|
self.audioEngine?.inputNode.removeTap(onBus: 0)
|
|
self.tapInstalled = false
|
|
}
|
|
if self.audioEngine?.isRunning == true {
|
|
self.audioEngine?.stop()
|
|
self.audioEngine?.reset()
|
|
}
|
|
// Release the engine so we also release any audio session/resources when push-to-talk ends.
|
|
self.audioEngine = nil
|
|
|
|
self.committed = ""
|
|
self.volatile = ""
|
|
self.activeConfig = nil
|
|
self.triggerChimePlayed = false
|
|
self.overlayToken = nil
|
|
self.adoptedPrefix = ""
|
|
|
|
// Resume the wake-word runtime after push-to-talk finishes.
|
|
await VoiceWakeRuntime.shared.applyPushToTalkCooldown()
|
|
_ = await MainActor.run { Task { await VoiceWakeRuntime.shared.refresh(state: AppStateStore.shared) } }
|
|
}
|
|
|
|
@MainActor
|
|
private func makeConfig() -> Config {
|
|
let state = AppStateStore.shared
|
|
return Config(
|
|
micID: state.voiceWakeMicID.isEmpty ? nil : state.voiceWakeMicID,
|
|
localeID: state.voiceWakeLocaleID,
|
|
triggerChime: state.voiceWakeTriggerChime,
|
|
sendChime: state.voiceWakeSendChime)
|
|
}
|
|
|
|
// MARK: - Test helpers
|
|
|
|
static func _testDelta(committed: String, current: String) -> String {
|
|
VoiceOverlayTextFormatting.delta(after: committed, current: current)
|
|
}
|
|
|
|
static func _testAttributedColors(isFinal: Bool) -> (NSColor, NSColor) {
|
|
let sample = VoiceOverlayTextFormatting.makeAttributed(committed: "a", volatile: "b", isFinal: isFinal)
|
|
let committedColor = sample.attribute(.foregroundColor, at: 0, effectiveRange: nil) as? NSColor ?? .clear
|
|
let volatileColor = sample.attribute(.foregroundColor, at: 1, effectiveRange: nil) as? NSColor ?? .clear
|
|
return (committedColor, volatileColor)
|
|
}
|
|
|
|
private static func join(_ prefix: String, _ suffix: String) -> String {
|
|
if prefix.isEmpty { return suffix }
|
|
if suffix.isEmpty { return prefix }
|
|
return "\(prefix) \(suffix)"
|
|
}
|
|
}
|