mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 17:31:06 +00:00
fix: downmix speech buffers for macos voice
This commit is contained in:
@@ -0,0 +1,86 @@
|
||||
@preconcurrency import AVFoundation
|
||||
|
||||
/// Normalizes microphone tap buffers before they are appended to a Speech
/// recognition request.
///
/// Buffers with more than two channels are downmixed to a single mono
/// channel; mono and stereo buffers pass through untouched.
enum SpeechAudioBufferNormalizer {

    /// Returns a Speech-compatible version of `buffer`.
    ///
    /// Mono/stereo buffers — and buffers reporting a non-positive sample
    /// rate — are returned as-is. Multichannel buffers are downmixed via the
    /// fast float path first, then via `AVAudioConverter`; if both fail, the
    /// original buffer is returned unchanged (best effort).
    static func speechCompatibleBuffer(from buffer: AVAudioPCMBuffer) -> AVAudioPCMBuffer {
        let sourceFormat = buffer.format
        guard sourceFormat.channelCount > 2, sourceFormat.sampleRate > 0 else {
            return buffer
        }
        if let downmixed = self.downmixFloatBuffer(buffer) {
            return downmixed
        }
        return self.convertBuffer(buffer) ?? buffer
    }

    /// Fast path: averages all channels of a non-interleaved Float32 buffer
    /// into a freshly allocated mono buffer. Returns `nil` when the source is
    /// not deinterleaved Float32 or when any allocation fails.
    private static func downmixFloatBuffer(_ buffer: AVAudioPCMBuffer) -> AVAudioPCMBuffer? {
        let sourceFormat = buffer.format
        guard sourceFormat.commonFormat == .pcmFormatFloat32, !sourceFormat.isInterleaved else {
            return nil
        }
        guard
            let channels = buffer.floatChannelData,
            let monoFormat = AVAudioFormat(
                commonFormat: .pcmFormatFloat32,
                sampleRate: sourceFormat.sampleRate,
                channels: 1,
                interleaved: false),
            let mono = AVAudioPCMBuffer(pcmFormat: monoFormat, frameCapacity: buffer.frameCapacity),
            let monoSamples = mono.floatChannelData?[0]
        else {
            return nil
        }

        mono.frameLength = buffer.frameLength
        let channelCount = Int(sourceFormat.channelCount)
        let frameCount = Int(buffer.frameLength)
        guard channelCount > 0, frameCount > 0 else { return mono }

        // Equal-weight average of all channels into the single output channel.
        let gain = 1.0 / Float(channelCount)
        for frame in 0..<frameCount {
            var mixed: Float = 0
            for channel in 0..<channelCount {
                mixed += channels[channel][frame]
            }
            monoSamples[frame] = mixed * gain
        }
        return mono
    }

    /// Slow path: lets `AVAudioConverter` reduce any PCM format to mono
    /// Float32 at the source sample rate. Returns `nil` on any failure.
    private static func convertBuffer(_ buffer: AVAudioPCMBuffer) -> AVAudioPCMBuffer? {
        guard
            let monoFormat = AVAudioFormat(
                commonFormat: .pcmFormatFloat32,
                sampleRate: buffer.format.sampleRate,
                channels: 1,
                interleaved: false),
            let converter = AVAudioConverter(from: buffer.format, to: monoFormat)
        else {
            return nil
        }

        // Size the output for a possible rate change. The target keeps the
        // source rate, so this resolves to the input frame length; the
        // general formula is kept for safety.
        let capacity = AVAudioFrameCount(
            max(1, ceil(Double(buffer.frameLength) * monoFormat.sampleRate / buffer.format.sampleRate)))
        guard let converted = AVAudioPCMBuffer(pcmFormat: monoFormat, frameCapacity: capacity) else {
            return nil
        }

        let feed = ConverterInput(buffer)
        var conversionError: NSError?
        let status = converter.convert(to: converted, error: &conversionError) { _, statusPointer in
            guard !feed.didProvide else {
                // Single-shot source: nothing further to hand out.
                statusPointer.pointee = .noDataNow
                return nil
            }
            feed.didProvide = true
            statusPointer.pointee = .haveData
            return feed.buffer
        }
        return status == .error ? nil : converted
    }

    /// One-shot input box for `AVAudioConverter`. The converter invokes the
    /// input block serially during `convert`, so the unsynchronized flag is
    /// safe in practice despite the `@unchecked Sendable` annotation.
    private final class ConverterInput: @unchecked Sendable {
        let buffer: AVAudioPCMBuffer
        var didProvide = false

        init(_ buffer: AVAudioPCMBuffer) {
            self.buffer = buffer
        }
    }
}
|
||||
@@ -225,7 +225,7 @@ actor TalkModeRuntime {
|
||||
input.removeTap(onBus: 0)
|
||||
let meter = self.rmsMeter
|
||||
input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request, meter] buffer, _ in
|
||||
request?.append(buffer)
|
||||
request?.append(SpeechAudioBufferNormalizer.speechCompatibleBuffer(from: buffer))
|
||||
if let rms = Self.rmsLevel(buffer: buffer) {
|
||||
meter.set(rms)
|
||||
}
|
||||
|
||||
@@ -260,9 +260,9 @@ actor VoicePushToTalk {
|
||||
input.removeTap(onBus: 0)
|
||||
self.tapInstalled = false
|
||||
}
|
||||
// Pipe raw mic buffers into the Speech request while the chord is held.
|
||||
// Pipe Speech-compatible mic buffers into the request while the chord is held.
|
||||
input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request] buffer, _ in
|
||||
request?.append(buffer)
|
||||
request?.append(SpeechAudioBufferNormalizer.speechCompatibleBuffer(from: buffer))
|
||||
}
|
||||
self.tapInstalled = true
|
||||
|
||||
|
||||
@@ -187,7 +187,7 @@ actor VoiceWakeRuntime {
|
||||
}
|
||||
input.removeTap(onBus: 0)
|
||||
input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak self, weak request] buffer, _ in
|
||||
request?.append(buffer)
|
||||
request?.append(SpeechAudioBufferNormalizer.speechCompatibleBuffer(from: buffer))
|
||||
guard let rms = Self.rmsLevel(buffer: buffer) else { return }
|
||||
Task.detached { [weak self] in
|
||||
await self?.noteAudioLevel(rms: rms)
|
||||
|
||||
@@ -116,7 +116,7 @@ final class VoiceWakeTester {
|
||||
}
|
||||
inputNode.removeTap(onBus: 0)
|
||||
inputNode.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request] buffer, _ in
|
||||
request?.append(buffer)
|
||||
request?.append(SpeechAudioBufferNormalizer.speechCompatibleBuffer(from: buffer))
|
||||
}
|
||||
|
||||
engine.prepare()
|
||||
|
||||
@@ -145,6 +145,7 @@ enum VoiceWakeTextUtils {
|
||||
|| self.hasOnlyFillerBeforeTrigger(transcript: transcript, triggers: triggers)
|
||||
else { return nil }
|
||||
let trimmed = trimWake(transcript, triggers)
|
||||
guard !self.isFillerOnly(trimmed) else { return nil }
|
||||
guard trimmed.count >= minCommandLength else { return nil }
|
||||
return trimmed
|
||||
}
|
||||
@@ -159,7 +160,8 @@ enum VoiceWakeTextUtils {
|
||||
self.startsWithTrigger(transcript: transcript, triggers: triggers)
|
||||
|| self.hasOnlyFillerBeforeTrigger(transcript: transcript, triggers: triggers)
|
||||
else { return false }
|
||||
return trimWake(transcript, triggers).isEmpty
|
||||
let trimmed = trimWake(transcript, triggers)
|
||||
return trimmed.isEmpty || self.isFillerOnly(trimmed)
|
||||
}
|
||||
|
||||
static func hasOnlyFillerBeforeTrigger(transcript: String, triggers: [String]) -> Bool {
|
||||
@@ -173,6 +175,16 @@ enum VoiceWakeTextUtils {
|
||||
return prefixTokens.allSatisfy { self.wakePrefixFillers.contains($0) }
|
||||
}
|
||||
|
||||
/// Returns `true` when `text` contains at least one token and every token,
/// after normalization, is a recognized wake-word filler (i.e. the text
/// carries no actual command).
private static func isFillerOnly(_ text: String) -> Bool {
    // A Character always owns at least one unicode scalar, so the
    // force-unwrap below cannot trap.
    let isSeparator: (Character) -> Bool = {
        $0.isWhitespace || self.whitespaceAndPunctuation.contains($0.unicodeScalars.first!)
    }
    let normalizedTokens = text
        .split(whereSeparator: isSeparator)
        .map { self.normalizeToken(String($0)) }
        .filter { !$0.isEmpty }
    guard !normalizedTokens.isEmpty else { return false }
    return normalizedTokens.allSatisfy { self.wakePrefixFillers.contains($0) }
}
|
||||
|
||||
static func matchedTriggerWord(transcript: String, triggers: [String]) -> String? {
|
||||
if let rawMatch = self.bestRawTriggerMatch(transcript: transcript, triggers: triggers) {
|
||||
return rawMatch.normalizedTrigger
|
||||
|
||||
@@ -1,7 +1,50 @@
|
||||
import AVFoundation
|
||||
import Testing
|
||||
@testable import OpenClaw
|
||||
|
||||
struct VoicePushToTalkTests {
|
||||
@Test func `speech normalizer passes through mono buffers`() throws {
|
||||
let format = try #require(AVAudioFormat(
|
||||
commonFormat: .pcmFormatFloat32,
|
||||
sampleRate: 16_000,
|
||||
channels: 1,
|
||||
interleaved: false))
|
||||
let buffer = try #require(AVAudioPCMBuffer(pcmFormat: format, frameCapacity: 4))
|
||||
buffer.frameLength = 4
|
||||
|
||||
let normalized = SpeechAudioBufferNormalizer.speechCompatibleBuffer(from: buffer)
|
||||
|
||||
#expect(normalized === buffer)
|
||||
}
|
||||
|
||||
@Test func `speech normalizer downmixes multichannel float buffers to mono`() throws {
|
||||
var layout = AudioChannelLayout()
|
||||
layout.mChannelLayoutTag = kAudioChannelLayoutTag_Quadraphonic
|
||||
let channelLayout = AVAudioChannelLayout(layout: &layout)
|
||||
let format = AVAudioFormat(
|
||||
commonFormat: .pcmFormatFloat32,
|
||||
sampleRate: 16_000,
|
||||
interleaved: false,
|
||||
channelLayout: channelLayout)
|
||||
let buffer = try #require(AVAudioPCMBuffer(pcmFormat: format, frameCapacity: 2))
|
||||
buffer.frameLength = 2
|
||||
let channels = try #require(buffer.floatChannelData)
|
||||
for frame in 0..<2 {
|
||||
channels[0][frame] = 1
|
||||
channels[1][frame] = 3
|
||||
channels[2][frame] = 5
|
||||
channels[3][frame] = 7
|
||||
}
|
||||
|
||||
let normalized = SpeechAudioBufferNormalizer.speechCompatibleBuffer(from: buffer)
|
||||
|
||||
#expect(normalized.format.channelCount == 1)
|
||||
#expect(normalized.frameLength == 2)
|
||||
let output = try #require(normalized.floatChannelData?[0])
|
||||
#expect(output[0] == 4)
|
||||
#expect(output[1] == 4)
|
||||
}
|
||||
|
||||
@Test func `delta trims committed prefix`() {
|
||||
let delta = VoicePushToTalk._testDelta(committed: "hello ", current: "hello world again")
|
||||
#expect(delta == "world again")
|
||||
|
||||
Reference in New Issue
Block a user