fix: downmix speech buffers for macos voice

This commit is contained in:
Peter Steinberger
2026-05-02 02:47:25 +01:00
parent ff45bc1f88
commit c1996f5d75
8 changed files with 148 additions and 6 deletions

View File

@@ -0,0 +1,86 @@
@preconcurrency import AVFoundation
/// Normalizes microphone PCM buffers into a shape the Speech framework accepts.
///
/// Some macOS input devices (aggregate devices, pro audio interfaces) expose
/// more than two channels, which the speech recognizer rejects. Anything wider
/// than stereo is folded down to mono here; mono and stereo buffers pass
/// through untouched.
enum SpeechAudioBufferNormalizer {
    /// Returns `buffer` itself when it is already Speech-compatible (one or
    /// two channels, positive sample rate); otherwise returns a mono copy.
    ///
    /// Downmixing is attempted on the fast float path first, then via
    /// `AVAudioConverter`; if both fail, the original buffer is returned so
    /// the caller degrades gracefully instead of dropping audio.
    static func speechCompatibleBuffer(from buffer: AVAudioPCMBuffer) -> AVAudioPCMBuffer {
        let sourceFormat = buffer.format
        guard sourceFormat.channelCount > 2, sourceFormat.sampleRate > 0 else {
            return buffer
        }
        if let downmixed = self.downmixFloatBuffer(buffer) {
            return downmixed
        }
        return self.convertBuffer(buffer) ?? buffer
    }

    /// Fast path: averages all channels of a deinterleaved float32 buffer into
    /// a single mono channel. Returns `nil` when the buffer is not
    /// deinterleaved float32 or any allocation fails.
    private static func downmixFloatBuffer(_ buffer: AVAudioPCMBuffer) -> AVAudioPCMBuffer? {
        let sourceFormat = buffer.format
        guard sourceFormat.commonFormat == .pcmFormatFloat32, !sourceFormat.isInterleaved else {
            return nil
        }
        guard
            let sourceChannels = buffer.floatChannelData,
            let monoFormat = AVAudioFormat(
                commonFormat: .pcmFormatFloat32,
                sampleRate: sourceFormat.sampleRate,
                channels: 1,
                interleaved: false),
            let monoBuffer = AVAudioPCMBuffer(
                pcmFormat: monoFormat,
                frameCapacity: buffer.frameCapacity),
            let monoSamples = monoBuffer.floatChannelData?[0]
        else {
            return nil
        }
        monoBuffer.frameLength = buffer.frameLength
        let channelCount = Int(sourceFormat.channelCount)
        let frameCount = Int(buffer.frameLength)
        guard channelCount > 0, frameCount > 0 else { return monoBuffer }
        // Equal-weight average keeps the mono level independent of channel count.
        let gain = 1.0 / Float(channelCount)
        for frame in 0..<frameCount {
            var mixed: Float = 0
            for channel in 0..<channelCount {
                mixed += sourceChannels[channel][frame]
            }
            monoSamples[frame] = mixed * gain
        }
        return monoBuffer
    }

    /// Slow path: lets `AVAudioConverter` reduce arbitrary PCM formats to mono
    /// float32. Returns `nil` when the converter cannot be built or the
    /// conversion reports an error.
    private static func convertBuffer(_ buffer: AVAudioPCMBuffer) -> AVAudioPCMBuffer? {
        guard
            let monoFormat = AVAudioFormat(
                commonFormat: .pcmFormatFloat32,
                sampleRate: buffer.format.sampleRate,
                channels: 1,
                interleaved: false),
            let converter = AVAudioConverter(from: buffer.format, to: monoFormat)
        else {
            return nil
        }
        // Size the output for any sample-rate change; never allocate zero frames.
        let ratio = monoFormat.sampleRate / buffer.format.sampleRate
        let outputCapacity = AVAudioFrameCount(max(1, ceil(Double(buffer.frameLength) * ratio)))
        guard let converted = AVAudioPCMBuffer(pcmFormat: monoFormat, frameCapacity: outputCapacity) else {
            return nil
        }
        let feed = ConverterInput(buffer)
        var conversionError: NSError?
        let status = converter.convert(to: converted, error: &conversionError) { _, inputStatus in
            // Hand the single source buffer over exactly once, then report dry.
            guard !feed.didProvide else {
                inputStatus.pointee = .noDataNow
                return nil
            }
            feed.didProvide = true
            inputStatus.pointee = .haveData
            return feed.buffer
        }
        if status == .error { return nil }
        return converted
    }

    /// Box that lets the converter's input block hand out the source buffer
    /// exactly once. `@unchecked Sendable` is acceptable here because the
    /// input block runs synchronously inside `convert(to:error:)` on the
    /// calling thread.
    private final class ConverterInput: @unchecked Sendable {
        let buffer: AVAudioPCMBuffer
        var didProvide = false

        init(_ buffer: AVAudioPCMBuffer) {
            self.buffer = buffer
        }
    }
}

View File

@@ -225,7 +225,7 @@ actor TalkModeRuntime {
input.removeTap(onBus: 0)
let meter = self.rmsMeter
input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request, meter] buffer, _ in
request?.append(buffer)
request?.append(SpeechAudioBufferNormalizer.speechCompatibleBuffer(from: buffer))
if let rms = Self.rmsLevel(buffer: buffer) {
meter.set(rms)
}

View File

@@ -260,9 +260,9 @@ actor VoicePushToTalk {
input.removeTap(onBus: 0)
self.tapInstalled = false
}
// Pipe raw mic buffers into the Speech request while the chord is held.
// Pipe Speech-compatible mic buffers into the request while the chord is held.
input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request] buffer, _ in
request?.append(buffer)
request?.append(SpeechAudioBufferNormalizer.speechCompatibleBuffer(from: buffer))
}
self.tapInstalled = true

View File

@@ -187,7 +187,7 @@ actor VoiceWakeRuntime {
}
input.removeTap(onBus: 0)
input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak self, weak request] buffer, _ in
request?.append(buffer)
request?.append(SpeechAudioBufferNormalizer.speechCompatibleBuffer(from: buffer))
guard let rms = Self.rmsLevel(buffer: buffer) else { return }
Task.detached { [weak self] in
await self?.noteAudioLevel(rms: rms)

View File

@@ -116,7 +116,7 @@ final class VoiceWakeTester {
}
inputNode.removeTap(onBus: 0)
inputNode.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request] buffer, _ in
request?.append(buffer)
request?.append(SpeechAudioBufferNormalizer.speechCompatibleBuffer(from: buffer))
}
engine.prepare()

View File

@@ -145,6 +145,7 @@ enum VoiceWakeTextUtils {
|| self.hasOnlyFillerBeforeTrigger(transcript: transcript, triggers: triggers)
else { return nil }
let trimmed = trimWake(transcript, triggers)
guard !self.isFillerOnly(trimmed) else { return nil }
guard trimmed.count >= minCommandLength else { return nil }
return trimmed
}
@@ -159,7 +160,8 @@ enum VoiceWakeTextUtils {
self.startsWithTrigger(transcript: transcript, triggers: triggers)
|| self.hasOnlyFillerBeforeTrigger(transcript: transcript, triggers: triggers)
else { return false }
return trimWake(transcript, triggers).isEmpty
let trimmed = trimWake(transcript, triggers)
return trimmed.isEmpty || self.isFillerOnly(trimmed)
}
static func hasOnlyFillerBeforeTrigger(transcript: String, triggers: [String]) -> Bool {
@@ -173,6 +175,16 @@ enum VoiceWakeTextUtils {
return prefixTokens.allSatisfy { self.wakePrefixFillers.contains($0) }
}
/// Returns `true` when `text` contains at least one token and every token is
/// a known wake-word filler (e.g. hesitations), after normalization.
/// Empty or whitespace-only input yields `false`.
private static func isFillerOnly(_ text: String) -> Bool {
    // NOTE: `unicodeScalars.first!` is safe — a Character always has ≥1 scalar.
    let pieces = text.split(whereSeparator: { character in
        character.isWhitespace
            || self.whitespaceAndPunctuation.contains(character.unicodeScalars.first!)
    })
    var tokens: [String] = []
    for piece in pieces {
        let normalized = self.normalizeToken(String(piece))
        if !normalized.isEmpty {
            tokens.append(normalized)
        }
    }
    guard !tokens.isEmpty else { return false }
    return tokens.allSatisfy { self.wakePrefixFillers.contains($0) }
}
static func matchedTriggerWord(transcript: String, triggers: [String]) -> String? {
if let rawMatch = self.bestRawTriggerMatch(transcript: transcript, triggers: triggers) {
return rawMatch.normalizedTrigger

View File

@@ -1,7 +1,50 @@
import AVFoundation
import Testing
@testable import OpenClaw
struct VoicePushToTalkTests {
@Test func `speech normalizer passes through mono buffers`() throws {
    // A mono buffer needs no work; the normalizer must hand back the same instance.
    let monoFormat = try #require(AVAudioFormat(
        commonFormat: .pcmFormatFloat32,
        sampleRate: 16_000,
        channels: 1,
        interleaved: false))
    let original = try #require(AVAudioPCMBuffer(pcmFormat: monoFormat, frameCapacity: 4))
    original.frameLength = 4
    let result = SpeechAudioBufferNormalizer.speechCompatibleBuffer(from: original)
    #expect(result === original)
}
@Test func `speech normalizer downmixes multichannel float buffers to mono`() throws {
    // Build a 4-channel (quadraphonic) float32 format.
    var layout = AudioChannelLayout()
    layout.mChannelLayoutTag = kAudioChannelLayoutTag_Quadraphonic
    let quadLayout = AVAudioChannelLayout(layout: &layout)
    let quadFormat = AVAudioFormat(
        commonFormat: .pcmFormatFloat32,
        sampleRate: 16_000,
        interleaved: false,
        channelLayout: quadLayout)
    let source = try #require(AVAudioPCMBuffer(pcmFormat: quadFormat, frameCapacity: 2))
    source.frameLength = 2
    let channelData = try #require(source.floatChannelData)
    // Fill the channels with 1, 3, 5, 7 so the mono average is exactly 4.
    for channel in 0..<4 {
        let sample = Float(2 * channel + 1)
        channelData[channel][0] = sample
        channelData[channel][1] = sample
    }
    let mono = SpeechAudioBufferNormalizer.speechCompatibleBuffer(from: source)
    #expect(mono.format.channelCount == 1)
    #expect(mono.frameLength == 2)
    let samples = try #require(mono.floatChannelData?[0])
    #expect(samples[0] == 4)
    #expect(samples[1] == 4)
}
@Test func `delta trims committed prefix`() {
let delta = VoicePushToTalk._testDelta(committed: "hello ", current: "hello world again")
#expect(delta == "world again")