diff --git a/CHANGELOG.md b/CHANGELOG.md index d5099a84edf..c006332a433 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ Docs: https://docs.openclaw.ai - WhatsApp: close long-lived web sockets through Baileys `end(error)` before falling back to raw websocket close, so listener teardown runs Baileys cleanup instead of leaving zombie sockets. Fixes #52442. Thanks @essendigitalgroup-cyber. - Twitch/plugins: emit a flat JSON Schema for Twitch channel config so single-account and multi-account configs validate before runtime load, and add source-checkout diagnostics for missing pnpm workspace dependencies. Thanks @vincentkoc. - Gateway/sessions: move hot transcript reads and mirror appends onto async bounded IO with serialized parent-linked writes, keeping large session histories from stalling Gateway requests and channel replies. Fixes #75656. Thanks @DerFlash. +- macOS/Talk Mode: downmix multi-channel microphone buffers before handing them to Apple Speech across Push-to-Talk, Talk Mode, Voice Wake, and the wake-word tester, so pro audio interfaces no longer produce empty transcripts. Fixes #42533. Thanks @jbuecker. - macOS/Talk Mode: subscribe native WebChat to active-session transcript updates and render external spoken user turns in the chat thread instead of only showing assistant replies. Fixes #75155. Thanks @SledderBling. - macOS/Voice Wake: accept trigger-only phrases in the built-in Voice Wake test, matching the settings UI and runtime trigger-only path instead of requiring extra command text after the wake word. Fixes #64986. Thanks @zoiks65. - Cron/TTS: run cron announce payloads through the normal TTS directive transform before outbound delivery, so scheduled `[[tts]]` replies generate voice payloads instead of leaking raw tags. Fixes #52125. Thanks @kenchen3000. 
diff --git a/apps/macos/Sources/OpenClaw/SpeechAudioBufferNormalizer.swift b/apps/macos/Sources/OpenClaw/SpeechAudioBufferNormalizer.swift new file mode 100644 index 00000000000..ac3ed4d36fd --- /dev/null +++ b/apps/macos/Sources/OpenClaw/SpeechAudioBufferNormalizer.swift @@ -0,0 +1,86 @@ +@preconcurrency import AVFoundation + +enum SpeechAudioBufferNormalizer { + static func speechCompatibleBuffer(from buffer: AVAudioPCMBuffer) -> AVAudioPCMBuffer { + let format = buffer.format + guard format.channelCount > 2, format.sampleRate > 0 else { + return buffer + } + return self.downmixFloatBuffer(buffer) ?? self.convertBuffer(buffer) ?? buffer + } + + private static func downmixFloatBuffer(_ buffer: AVAudioPCMBuffer) -> AVAudioPCMBuffer? { + let format = buffer.format + guard format.commonFormat == .pcmFormatFloat32, + !format.isInterleaved, + let source = buffer.floatChannelData, + let targetFormat = AVAudioFormat( + commonFormat: .pcmFormatFloat32, + sampleRate: format.sampleRate, + channels: 1, + interleaved: false), + let output = AVAudioPCMBuffer( + pcmFormat: targetFormat, + frameCapacity: buffer.frameCapacity), + let target = output.floatChannelData?[0] + else { + return nil + } + + output.frameLength = buffer.frameLength + let channelCount = Int(format.channelCount) + let frameCount = Int(buffer.frameLength) + guard channelCount > 0, frameCount > 0 else { return output } + + let scale = 1.0 / Float(channelCount) + for frame in 0..<frameCount { + var sum: Float = 0 + for channel in 0..<channelCount { + sum += source[channel][frame] + } + target[frame] = sum * scale + } + return output + } + + private static func convertBuffer(_ buffer: AVAudioPCMBuffer) -> AVAudioPCMBuffer?
{ + guard let targetFormat = AVAudioFormat( + commonFormat: .pcmFormatFloat32, + sampleRate: buffer.format.sampleRate, + channels: 1, + interleaved: false), + let converter = AVAudioConverter(from: buffer.format, to: targetFormat) + else { + return nil + } + + let frameCapacity = AVAudioFrameCount( + max(1, ceil(Double(buffer.frameLength) * targetFormat.sampleRate / buffer.format.sampleRate))) + guard let output = AVAudioPCMBuffer(pcmFormat: targetFormat, frameCapacity: frameCapacity) else { + return nil + } + + let input = ConverterInput(buffer) + var error: NSError? + let status = converter.convert(to: output, error: &error) { _, outStatus in + if input.didProvide { + outStatus.pointee = .noDataNow + return nil + } + input.didProvide = true + outStatus.pointee = .haveData + return input.buffer + } + guard status != .error else { return nil } + return output + } + + private final class ConverterInput: @unchecked Sendable { + let buffer: AVAudioPCMBuffer + var didProvide = false + + init(_ buffer: AVAudioPCMBuffer) { + self.buffer = buffer + } + } +} diff --git a/apps/macos/Sources/OpenClaw/TalkModeRuntime.swift b/apps/macos/Sources/OpenClaw/TalkModeRuntime.swift index 8ec32302138..d61e0f1b034 100644 --- a/apps/macos/Sources/OpenClaw/TalkModeRuntime.swift +++ b/apps/macos/Sources/OpenClaw/TalkModeRuntime.swift @@ -225,7 +225,7 @@ actor TalkModeRuntime { input.removeTap(onBus: 0) let meter = self.rmsMeter input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request, meter] buffer, _ in - request?.append(buffer) + request?.append(SpeechAudioBufferNormalizer.speechCompatibleBuffer(from: buffer)) if let rms = Self.rmsLevel(buffer: buffer) { meter.set(rms) } diff --git a/apps/macos/Sources/OpenClaw/VoicePushToTalk.swift b/apps/macos/Sources/OpenClaw/VoicePushToTalk.swift index 872dcc224a6..efc6eaa7eec 100644 --- a/apps/macos/Sources/OpenClaw/VoicePushToTalk.swift +++ b/apps/macos/Sources/OpenClaw/VoicePushToTalk.swift @@ -260,9 +260,9 @@ actor 
VoicePushToTalk { input.removeTap(onBus: 0) self.tapInstalled = false } - // Pipe raw mic buffers into the Speech request while the chord is held. + // Pipe Speech-compatible mic buffers into the request while the chord is held. input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request] buffer, _ in - request?.append(buffer) + request?.append(SpeechAudioBufferNormalizer.speechCompatibleBuffer(from: buffer)) } self.tapInstalled = true diff --git a/apps/macos/Sources/OpenClaw/VoiceWakeRuntime.swift b/apps/macos/Sources/OpenClaw/VoiceWakeRuntime.swift index 3db6a92146f..644a9bd0717 100644 --- a/apps/macos/Sources/OpenClaw/VoiceWakeRuntime.swift +++ b/apps/macos/Sources/OpenClaw/VoiceWakeRuntime.swift @@ -187,7 +187,7 @@ actor VoiceWakeRuntime { } input.removeTap(onBus: 0) input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak self, weak request] buffer, _ in - request?.append(buffer) + request?.append(SpeechAudioBufferNormalizer.speechCompatibleBuffer(from: buffer)) guard let rms = Self.rmsLevel(buffer: buffer) else { return } Task.detached { [weak self] in await self?.noteAudioLevel(rms: rms) diff --git a/apps/macos/Sources/OpenClaw/VoiceWakeTester.swift b/apps/macos/Sources/OpenClaw/VoiceWakeTester.swift index 2eff26b7765..9971b0b9cd1 100644 --- a/apps/macos/Sources/OpenClaw/VoiceWakeTester.swift +++ b/apps/macos/Sources/OpenClaw/VoiceWakeTester.swift @@ -116,7 +116,7 @@ final class VoiceWakeTester { } inputNode.removeTap(onBus: 0) inputNode.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request] buffer, _ in - request?.append(buffer) + request?.append(SpeechAudioBufferNormalizer.speechCompatibleBuffer(from: buffer)) } engine.prepare() diff --git a/apps/macos/Sources/OpenClaw/VoiceWakeTextUtils.swift b/apps/macos/Sources/OpenClaw/VoiceWakeTextUtils.swift index d2f2aaf017b..89652cd52bf 100644 --- a/apps/macos/Sources/OpenClaw/VoiceWakeTextUtils.swift +++ b/apps/macos/Sources/OpenClaw/VoiceWakeTextUtils.swift @@ -145,6 
+145,7 @@ enum VoiceWakeTextUtils { || self.hasOnlyFillerBeforeTrigger(transcript: transcript, triggers: triggers) else { return nil } let trimmed = trimWake(transcript, triggers) + guard !self.isFillerOnly(trimmed) else { return nil } guard trimmed.count >= minCommandLength else { return nil } return trimmed } @@ -159,7 +160,8 @@ enum VoiceWakeTextUtils { self.startsWithTrigger(transcript: transcript, triggers: triggers) || self.hasOnlyFillerBeforeTrigger(transcript: transcript, triggers: triggers) else { return false } - return trimWake(transcript, triggers).isEmpty + let trimmed = trimWake(transcript, triggers) + return trimmed.isEmpty || self.isFillerOnly(trimmed) } static func hasOnlyFillerBeforeTrigger(transcript: String, triggers: [String]) -> Bool { @@ -173,6 +175,16 @@ enum VoiceWakeTextUtils { return prefixTokens.allSatisfy { self.wakePrefixFillers.contains($0) } } + private static func isFillerOnly(_ text: String) -> Bool { + let tokens = text + .split(whereSeparator: { + $0.isWhitespace || self.whitespaceAndPunctuation.contains($0.unicodeScalars.first!) + }) + .map { self.normalizeToken(String($0)) } + .filter { !$0.isEmpty } + return !tokens.isEmpty && tokens.allSatisfy { self.wakePrefixFillers.contains($0) } + } + static func matchedTriggerWord(transcript: String, triggers: [String]) -> String? 
{ if let rawMatch = self.bestRawTriggerMatch(transcript: transcript, triggers: triggers) { return rawMatch.normalizedTrigger diff --git a/apps/macos/Tests/OpenClawIPCTests/VoicePushToTalkTests.swift b/apps/macos/Tests/OpenClawIPCTests/VoicePushToTalkTests.swift index aeb1d700474..7784b009a40 100644 --- a/apps/macos/Tests/OpenClawIPCTests/VoicePushToTalkTests.swift +++ b/apps/macos/Tests/OpenClawIPCTests/VoicePushToTalkTests.swift @@ -1,7 +1,50 @@ +import AVFoundation import Testing @testable import OpenClaw struct VoicePushToTalkTests { + @Test func `speech normalizer passes through mono buffers`() throws { + let format = try #require(AVAudioFormat( + commonFormat: .pcmFormatFloat32, + sampleRate: 16_000, + channels: 1, + interleaved: false)) + let buffer = try #require(AVAudioPCMBuffer(pcmFormat: format, frameCapacity: 4)) + buffer.frameLength = 4 + + let normalized = SpeechAudioBufferNormalizer.speechCompatibleBuffer(from: buffer) + + #expect(normalized === buffer) + } + + @Test func `speech normalizer downmixes multichannel float buffers to mono`() throws { + var layout = AudioChannelLayout() + layout.mChannelLayoutTag = kAudioChannelLayoutTag_Quadraphonic + let channelLayout = AVAudioChannelLayout(layout: &layout) + let format = AVAudioFormat( + commonFormat: .pcmFormatFloat32, + sampleRate: 16_000, + interleaved: false, + channelLayout: channelLayout) + let buffer = try #require(AVAudioPCMBuffer(pcmFormat: format, frameCapacity: 2)) + buffer.frameLength = 2 + let channels = try #require(buffer.floatChannelData) + for frame in 0..<2 { + channels[0][frame] = 1 + channels[1][frame] = 3 + channels[2][frame] = 5 + channels[3][frame] = 7 + } + + let normalized = SpeechAudioBufferNormalizer.speechCompatibleBuffer(from: buffer) + + #expect(normalized.format.channelCount == 1) + #expect(normalized.frameLength == 2) + let output = try #require(normalized.floatChannelData?[0]) + #expect(output[0] == 4) + #expect(output[1] == 4) + } + @Test func `delta trims committed 
prefix`() { let delta = VoicePushToTalk._testDelta(committed: "hello ", current: "hello world again") #expect(delta == "world again")