From 4dc2aedb767d81928a37da8a1d76585429cdd048 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sun, 3 May 2026 17:05:59 -0700 Subject: [PATCH] fix(openai): flatten realtime transcription session update --- .../realtime-transcription-provider.test.ts | 78 ++++++++----------- .../openai/realtime-transcription-provider.ts | 28 +++---- 2 files changed, 44 insertions(+), 62 deletions(-) diff --git a/extensions/openai/realtime-transcription-provider.test.ts b/extensions/openai/realtime-transcription-provider.test.ts index 4e5eb9a37d1..a2b4585ea4c 100644 --- a/extensions/openai/realtime-transcription-provider.test.ts +++ b/extensions/openai/realtime-transcription-provider.test.ts @@ -54,21 +54,17 @@ type SentRealtimeEvent = { type: string; audio?: string; session?: { - audio?: { - input?: { - format?: { type?: string }; - transcription?: { - model?: string; - language?: string; - prompt?: string; - }; - turn_detection?: { - type?: string; - threshold?: number; - prefix_padding_ms?: number; - silence_duration_ms?: number; - }; - }; + input_audio_format?: string; + input_audio_transcription?: { + model?: string; + language?: string; + prompt?: string; + }; + turn_detection?: { + type?: string; + threshold?: number; + prefix_padding_ms?: number; + silence_duration_ms?: number; }; }; }; @@ -179,21 +175,17 @@ describe("buildOpenAIRealtimeTranscriptionProvider", () => { { type: "transcription_session.update", session: { - audio: { - input: { - format: { type: "audio/pcmu" }, - transcription: { - model: "gpt-4o-transcribe", - language: "en", - prompt: "expect OpenClaw product names", - }, - turn_detection: { - type: "server_vad", - threshold: 0.45, - prefix_padding_ms: 300, - silence_duration_ms: 900, - }, - }, + input_audio_format: "g711_ulaw", + input_audio_transcription: { + model: "gpt-4o-transcribe", + language: "en", + prompt: "expect OpenClaw product names", + }, + turn_detection: { + type: "server_vad", + threshold: 0.45, + prefix_padding_ms: 300, + silence_duration_ms: 900, }, }, }, @@ -207,21 +199,17 @@ describe("buildOpenAIRealtimeTranscriptionProvider", () => { { type: "transcription_session.update", session: { - audio: { - input: { - format: { type: "audio/pcmu" }, - transcription: { - model: "gpt-4o-transcribe", - language: "en", - prompt: "expect OpenClaw product names", - }, - turn_detection: { - type: "server_vad", - threshold: 0.45, - prefix_padding_ms: 300, - silence_duration_ms: 900, - }, - }, + input_audio_format: "g711_ulaw", + input_audio_transcription: { + model: "gpt-4o-transcribe", + language: "en", + prompt: "expect OpenClaw product names", + }, + turn_detection: { + type: "server_vad", + threshold: 0.45, + prefix_padding_ms: 300, + silence_duration_ms: 900, }, }, }, diff --git a/extensions/openai/realtime-transcription-provider.ts b/extensions/openai/realtime-transcription-provider.ts index 16fd9e575a7..a49169c850f 100644 --- a/extensions/openai/realtime-transcription-provider.ts +++ b/extensions/openai/realtime-transcription-provider.ts @@ -150,23 +150,17 @@ function createOpenAIRealtimeTranscriptionSession( transport.sendJson({ type: "transcription_session.update", session: { - audio: { - input: { - format: { - type: "audio/pcmu", - }, - transcription: { - model: config.model, - ...(config.language ? { language: config.language } : {}), - ...(config.prompt ? { prompt: config.prompt } : {}), - }, - turn_detection: { - type: "server_vad", - threshold: config.vadThreshold, - prefix_padding_ms: 300, - silence_duration_ms: config.silenceDurationMs, - }, - }, + input_audio_format: "g711_ulaw", + input_audio_transcription: { + model: config.model, + ...(config.language ? { language: config.language } : {}), + ...(config.prompt ? { prompt: config.prompt } : {}), + }, + turn_detection: { + type: "server_vad", + threshold: config.vadThreshold, + prefix_padding_ms: 300, + silence_duration_ms: config.silenceDurationMs, }, }, });