From 7225a2678e8c3fa155f6bd94f77280c12ac65381 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 5 May 2026 20:59:23 +0100 Subject: [PATCH] feat: expose talk-capable realtime providers --- .../google/realtime-voice-provider.test.ts | 23 ++++++++++++++++++- extensions/google/realtime-voice-provider.ts | 19 ++++++++++++++- .../openai/realtime-voice-provider.test.ts | 21 ++++++++++++++++- extensions/openai/realtime-voice-provider.ts | 21 +++++++++++++++-- 4 files changed, 79 insertions(+), 5 deletions(-) diff --git a/extensions/google/realtime-voice-provider.test.ts b/extensions/google/realtime-voice-provider.test.ts index 2b8a78fc7ea..4c32f423fc8 100644 --- a/extensions/google/realtime-voice-provider.test.ts +++ b/extensions/google/realtime-voice-provider.test.ts @@ -65,6 +65,27 @@ describe("buildGoogleRealtimeVoiceProvider", () => { delete process.env.GOOGLE_API_KEY; }); + it("declares realtime Talk capabilities for catalog selection", () => { + const provider = buildGoogleRealtimeVoiceProvider(); + + expect(provider.capabilities).toEqual({ + transports: ["provider-websocket", "gateway-relay"], + inputAudioFormats: [ + { encoding: "g711_ulaw", sampleRateHz: 8000, channels: 1 }, + { encoding: "pcm16", sampleRateHz: 24000, channels: 1 }, + ], + outputAudioFormats: [ + { encoding: "g711_ulaw", sampleRateHz: 8000, channels: 1 }, + { encoding: "pcm16", sampleRateHz: 24000, channels: 1 }, + ], + supportsBrowserSession: true, + supportsBargeIn: true, + supportsToolCalls: true, + supportsVideoFrames: true, + supportsSessionResumption: true, + }); + }); + it("normalizes provider config and cfg model-provider key fallback", () => { const provider = buildGoogleRealtimeVoiceProvider(); const resolved = provider.resolveConfig?.({ @@ -294,7 +315,7 @@ describe("buildGoogleRealtimeVoiceProvider", () => { }); expect(session).toMatchObject({ provider: "google", - transport: "json-pcm-websocket", + transport: "provider-websocket", protocol: "google-live-bidi", clientSecret: "auth_tokens/browser-session", websocketUrl: diff --git a/extensions/google/realtime-voice-provider.ts b/extensions/google/realtime-voice-provider.ts index 68c90529671..9fedbd3d3ba 100644 --- a/extensions/google/realtime-voice-provider.ts +++ b/extensions/google/realtime-voice-provider.ts @@ -32,6 +32,7 @@ import { convertPcmToMulaw8k, mulawToPcm, REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ, + REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ, REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME, resamplePcm, } from "openclaw/plugin-sdk/realtime-voice"; @@ -877,7 +878,7 @@ async function createGoogleRealtimeBrowserSession( return { provider: "google", - transport: "json-pcm-websocket", + transport: "provider-websocket", protocol: "google-live-bidi", clientSecret, websocketUrl: GOOGLE_REALTIME_BROWSER_WEBSOCKET_URL, @@ -900,6 +901,22 @@ export function buildGoogleRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin label: "Google Live Voice", defaultModel: GOOGLE_REALTIME_DEFAULT_MODEL, autoSelectOrder: 20, + capabilities: { + transports: ["provider-websocket", "gateway-relay"], + inputAudioFormats: [ + REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ, + REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ, + ], + outputAudioFormats: [ + REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ, + REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ, + ], + supportsBrowserSession: true, + supportsBargeIn: true, + supportsToolCalls: true, + supportsVideoFrames: true, + supportsSessionResumption: true, + }, resolveConfig: ({ cfg, rawConfig }) => normalizeProviderConfig(rawConfig, cfg), isConfigured: ({ providerConfig }) => Boolean(normalizeProviderConfig(providerConfig).apiKey || resolveEnvApiKey()), diff --git a/extensions/openai/realtime-voice-provider.test.ts b/extensions/openai/realtime-voice-provider.test.ts index 810c1058c20..467ab65494a 100644 --- a/extensions/openai/realtime-voice-provider.test.ts +++ b/extensions/openai/realtime-voice-provider.test.ts @@ -114,6 +114,25 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { vi.unstubAllEnvs(); }); + it("declares realtime Talk capabilities for catalog selection", () => { + const provider = buildOpenAIRealtimeVoiceProvider(); + + expect(provider.capabilities).toEqual({ + transports: ["webrtc", "gateway-relay"], + inputAudioFormats: [ + { encoding: "g711_ulaw", sampleRateHz: 8000, channels: 1 }, + { encoding: "pcm16", sampleRateHz: 24000, channels: 1 }, + ], + outputAudioFormats: [ + { encoding: "g711_ulaw", sampleRateHz: 8000, channels: 1 }, + { encoding: "pcm16", sampleRateHz: 24000, channels: 1 }, + ], + supportsBrowserSession: true, + supportsBargeIn: true, + supportsToolCalls: true, + }); + }); + it("adds OpenClaw attribution headers to native realtime websocket requests", () => { vi.stubEnv("OPENCLAW_VERSION", "2026.3.22"); const provider = buildOpenAIRealtimeVoiceProvider(); @@ -192,7 +211,7 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { }); expect(session).toMatchObject({ provider: "openai", - transport: "webrtc-sdp", + transport: "webrtc", clientSecret: "client-secret-123", offerUrl: "https://api.openai.com/v1/realtime/calls", }); diff --git a/extensions/openai/realtime-voice-provider.ts b/extensions/openai/realtime-voice-provider.ts index e4f23275168..674dbdc5bc0 100644 --- a/extensions/openai/realtime-voice-provider.ts +++ b/extensions/openai/realtime-voice-provider.ts @@ -20,7 +20,10 @@ import type { RealtimeVoiceProviderPlugin, RealtimeVoiceTool, } from "openclaw/plugin-sdk/realtime-voice"; -import { REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ } from "openclaw/plugin-sdk/realtime-voice"; +import { + REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ, + REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ, +} from "openclaw/plugin-sdk/realtime-voice"; import { normalizeResolvedSecretInputString, normalizeSecretInputString, @@ -857,7 +860,7 @@ async function createOpenAIRealtimeBrowserSession( const offerHeaders = resolveOpenAIRealtimeBrowserOfferHeaders(); return { provider: "openai", - transport: "webrtc-sdp", + transport: "webrtc", clientSecret, offerUrl: "https://api.openai.com/v1/realtime/calls", ...(offerHeaders ? { offerHeaders } : {}), @@ -873,6 +876,20 @@ export function buildOpenAIRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin label: "OpenAI Realtime Voice", defaultModel: OPENAI_REALTIME_DEFAULT_MODEL, autoSelectOrder: 10, + capabilities: { + transports: ["webrtc", "gateway-relay"], + inputAudioFormats: [ + REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ, + REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ, + ], + outputAudioFormats: [ + REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ, + REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ, + ], + supportsBrowserSession: true, + supportsBargeIn: true, + supportsToolCalls: true, + }, resolveConfig: ({ rawConfig }) => normalizeProviderConfig(rawConfig), isConfigured: ({ providerConfig }) => hasOpenAIRealtimeApiKeyInput(normalizeProviderConfig(providerConfig).apiKey),