mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 16:20:43 +00:00
perf(voice-call): trim realtime audio copies
This commit is contained in:
committed by
Peter Steinberger
parent
7fc9a82dca
commit
309ff6bada
@@ -411,6 +411,32 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
|
||||
expect(session.sendRealtimeInput).toHaveBeenCalledWith({ audioStreamEnd: true });
|
||||
});
|
||||
|
||||
it("fuses telephony mu-law conversion into the Gemini 16 kHz PCM input frame", async () => {
|
||||
const provider = buildGoogleRealtimeVoiceProvider();
|
||||
const bridge = provider.createBridge({
|
||||
providerConfig: { apiKey: "gemini-key" },
|
||||
onAudio: vi.fn(),
|
||||
onClearAudio: vi.fn(),
|
||||
});
|
||||
|
||||
await bridge.connect();
|
||||
lastConnectParams().callbacks.onopen();
|
||||
lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });
|
||||
|
||||
bridge.sendAudio(Buffer.from([0xff, 0x00]));
|
||||
|
||||
expect(session.sendRealtimeInput).toHaveBeenCalledWith({
|
||||
audio: {
|
||||
data: expect.any(String),
|
||||
mimeType: "audio/pcm;rate=16000",
|
||||
},
|
||||
});
|
||||
const sent = Buffer.from(session.sendRealtimeInput.mock.calls[0]?.[0].audio.data, "base64");
|
||||
expect(Array.from({ length: sent.length / 2 }, (_, i) => sent.readInt16LE(i * 2))).toEqual([
|
||||
0, -16062, -32124, -32124,
|
||||
]);
|
||||
});
|
||||
|
||||
it("accepts PCM16 24 kHz audio without the telephony mu-law hop", async () => {
|
||||
const provider = buildGoogleRealtimeVoiceProvider();
|
||||
const bridge = provider.createBridge({
|
||||
|
||||
@@ -50,6 +50,11 @@ const MAX_PENDING_AUDIO_CHUNKS = 320;
|
||||
const DEFAULT_AUDIO_STREAM_END_SILENCE_MS = 500;
|
||||
const GOOGLE_REALTIME_BROWSER_SESSION_TTL_MS = 30 * 60 * 1000;
|
||||
const GOOGLE_REALTIME_BROWSER_NEW_SESSION_TTL_MS = 60 * 1000;
|
||||
const MULAW_LINEAR_SAMPLES = new Int16Array(256);
|
||||
|
||||
for (let i = 0; i < MULAW_LINEAR_SAMPLES.length; i += 1) {
|
||||
MULAW_LINEAR_SAMPLES[i] = decodeMulawSample(i);
|
||||
}
|
||||
|
||||
type GoogleRealtimeSensitivity = "low" | "high";
|
||||
type GoogleRealtimeThinkingLevel = "minimal" | "low" | "medium" | "high";
|
||||
@@ -330,6 +335,8 @@ function buildFunctionDeclarations(tools: RealtimeVoiceTool[] | undefined): Func
|
||||
|
||||
function buildGoogleLiveConnectConfig(config: GoogleRealtimeLiveConfig): LiveConnectConfig {
|
||||
const functionDeclarations = buildFunctionDeclarations(config.tools);
|
||||
const realtimeInputConfig = buildRealtimeInputConfig(config);
|
||||
const thinkingConfig = buildThinkingConfig(config);
|
||||
return {
|
||||
responseModalities: ["AUDIO" as Modality],
|
||||
...(typeof config.temperature === "number" && config.temperature > 0
|
||||
@@ -344,15 +351,13 @@ function buildGoogleLiveConnectConfig(config: GoogleRealtimeLiveConfig): LiveCon
|
||||
},
|
||||
systemInstruction: config.instructions,
|
||||
...(functionDeclarations.length > 0 ? { tools: [{ functionDeclarations }] } : {}),
|
||||
...(buildRealtimeInputConfig(config)
|
||||
? { realtimeInputConfig: buildRealtimeInputConfig(config) }
|
||||
: {}),
|
||||
...(realtimeInputConfig ? { realtimeInputConfig } : {}),
|
||||
inputAudioTranscription: {},
|
||||
outputAudioTranscription: {},
|
||||
...(typeof config.enableAffectiveDialog === "boolean"
|
||||
? { enableAffectiveDialog: config.enableAffectiveDialog }
|
||||
: {}),
|
||||
...(buildThinkingConfig(config) ? { thinkingConfig: buildThinkingConfig(config) } : {}),
|
||||
...(thinkingConfig ? { thinkingConfig } : {}),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -487,12 +492,7 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
this.audioStreamEnded = false;
|
||||
}
|
||||
|
||||
const pcm = this.toInputPcm(audio);
|
||||
const pcm16k = resamplePcm(
|
||||
pcm,
|
||||
this.audioFormat.sampleRateHz,
|
||||
GOOGLE_REALTIME_INPUT_SAMPLE_RATE,
|
||||
);
|
||||
const pcm16k = this.toGoogleInputPcm16k(audio);
|
||||
this.session.sendRealtimeInput({
|
||||
audio: {
|
||||
data: pcm16k.toString("base64"),
|
||||
@@ -617,6 +617,21 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
return this.audioFormat.encoding === "pcm16" ? audio : mulawToPcm(audio);
|
||||
}
|
||||
|
||||
private toGoogleInputPcm16k(audio: Buffer): Buffer {
|
||||
if (
|
||||
this.audioFormat.encoding === "g711_ulaw" &&
|
||||
this.audioFormat.sampleRateHz === 8_000 &&
|
||||
GOOGLE_REALTIME_INPUT_SAMPLE_RATE === 16_000
|
||||
) {
|
||||
return convertMulaw8kToPcm16k(audio);
|
||||
}
|
||||
return resamplePcm(
|
||||
this.toInputPcm(audio),
|
||||
this.audioFormat.sampleRateHz,
|
||||
GOOGLE_REALTIME_INPUT_SAMPLE_RATE,
|
||||
);
|
||||
}
|
||||
|
||||
private toOutputAudio(pcm: Buffer, sampleRate: number): Buffer {
|
||||
return this.audioFormat.encoding === "pcm16"
|
||||
? resamplePcm(pcm, sampleRate, this.audioFormat.sampleRateHz)
|
||||
@@ -726,6 +741,30 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
}
|
||||
}
|
||||
|
||||
function convertMulaw8kToPcm16k(muLaw: Buffer): Buffer {
|
||||
if (muLaw.length === 0) {
|
||||
return Buffer.alloc(0);
|
||||
}
|
||||
const pcm = Buffer.alloc(muLaw.length * 4);
|
||||
for (let i = 0; i < muLaw.length; i += 1) {
|
||||
const current = MULAW_LINEAR_SAMPLES[muLaw[i] ?? 0] ?? 0;
|
||||
const next = MULAW_LINEAR_SAMPLES[muLaw[i + 1] ?? muLaw[i] ?? 0] ?? current;
|
||||
pcm.writeInt16LE(current, i * 4);
|
||||
pcm.writeInt16LE(Math.round((current + next) / 2), i * 4 + 2);
|
||||
}
|
||||
return pcm;
|
||||
}
|
||||
|
||||
function decodeMulawSample(value: number): number {
|
||||
const muLaw = ~value & 0xff;
|
||||
const sign = muLaw & 0x80;
|
||||
const exponent = (muLaw >> 4) & 0x07;
|
||||
const mantissa = muLaw & 0x0f;
|
||||
let sample = ((mantissa << 3) + 132) << exponent;
|
||||
sample -= 132;
|
||||
return sign ? -sample : sample;
|
||||
}
|
||||
|
||||
async function createGoogleRealtimeBrowserSession(
|
||||
req: RealtimeVoiceBrowserSessionCreateRequest,
|
||||
): Promise<RealtimeVoiceBrowserSession> {
|
||||
|
||||
Reference in New Issue
Block a user