diff --git a/extensions/openai/realtime-transcription-provider.test.ts b/extensions/openai/realtime-transcription-provider.test.ts index 214b4908cf5..b947ce87608 100644 --- a/extensions/openai/realtime-transcription-provider.test.ts +++ b/extensions/openai/realtime-transcription-provider.test.ts @@ -1,7 +1,13 @@ -import { describe, expect, it } from "vitest"; +import { afterEach, describe, expect, it } from "vitest"; import { buildOpenAIRealtimeTranscriptionProvider } from "./realtime-transcription-provider.js"; describe("buildOpenAIRealtimeTranscriptionProvider", () => { + const originalEnv = { ...process.env }; + + afterEach(() => { + process.env = { ...originalEnv }; + }); + it("normalizes OpenAI config defaults", () => { const provider = buildOpenAIRealtimeTranscriptionProvider(); const resolved = provider.resolveConfig?.({ @@ -20,6 +26,24 @@ describe("buildOpenAIRealtimeTranscriptionProvider", () => { }); }); + it("reads provider-owned env fallbacks", () => { + process.env.REALTIME_TRANSCRIPTION_MODEL = "gpt-4o-transcribe"; + process.env.SILENCE_DURATION_MS = "900"; + process.env.VAD_THRESHOLD = "0.45"; + + const provider = buildOpenAIRealtimeTranscriptionProvider(); + const resolved = provider.resolveConfig?.({ + cfg: {} as never, + rawConfig: {}, + }); + + expect(resolved).toEqual({ + model: "gpt-4o-transcribe", + silenceDurationMs: 900, + vadThreshold: 0.45, + }); + }); + it("accepts the legacy openai-realtime alias", () => { const provider = buildOpenAIRealtimeTranscriptionProvider(); expect(provider.aliases).toContain("openai-realtime"); diff --git a/extensions/openai/realtime-transcription-provider.ts b/extensions/openai/realtime-transcription-provider.ts index d4fd8d09350..984f0dcc847 100644 --- a/extensions/openai/realtime-transcription-provider.ts +++ b/extensions/openai/realtime-transcription-provider.ts @@ -57,9 +57,21 @@ function normalizeProviderConfig( value: raw?.openaiApiKey, path: "plugins.entries.voice-call.config.streaming.openaiApiKey", }), - model: trimToUndefined(raw?.model) ?? trimToUndefined(raw?.sttModel), - silenceDurationMs: asNumber(raw?.silenceDurationMs), - vadThreshold: asNumber(raw?.vadThreshold), + model: + trimToUndefined(raw?.model) ?? + trimToUndefined(raw?.sttModel) ?? + trimToUndefined(process.env.REALTIME_TRANSCRIPTION_MODEL) ?? + trimToUndefined(process.env.STREAMING_STT_MODEL), + silenceDurationMs: + asNumber(raw?.silenceDurationMs) ?? + (typeof process.env.SILENCE_DURATION_MS === "string" + ? Number.parseInt(process.env.SILENCE_DURATION_MS, 10) + : undefined), + vadThreshold: + asNumber(raw?.vadThreshold) ?? + (typeof process.env.VAD_THRESHOLD === "string" + ? 
Number.parseFloat(process.env.VAD_THRESHOLD) + : undefined), }; } diff --git a/extensions/openai/realtime-voice-provider.test.ts b/extensions/openai/realtime-voice-provider.test.ts new file mode 100644 index 00000000000..92803299695 --- /dev/null +++ b/extensions/openai/realtime-voice-provider.test.ts @@ -0,0 +1,32 @@ +import { afterEach, describe, expect, it } from "vitest"; +import { buildOpenAIRealtimeVoiceProvider } from "./realtime-voice-provider.js"; + +describe("buildOpenAIRealtimeVoiceProvider", () => { + const originalEnv = { ...process.env }; + + afterEach(() => { + process.env = { ...originalEnv }; + }); + + it("normalizes provider-owned env fallbacks", () => { + process.env.REALTIME_VOICE_MODEL = "gpt-realtime"; + process.env.REALTIME_VOICE_VOICE = "verse"; + process.env.REALTIME_VOICE_TEMPERATURE = "0.6"; + process.env.SILENCE_DURATION_MS = "850"; + process.env.VAD_THRESHOLD = "0.35"; + + const provider = buildOpenAIRealtimeVoiceProvider(); + const resolved = provider.resolveConfig?.({ + cfg: {} as never, + rawConfig: {}, + }); + + expect(resolved).toEqual({ + model: "gpt-realtime", + voice: "verse", + temperature: 0.6, + silenceDurationMs: 850, + vadThreshold: 0.35, + }); + }); +}); diff --git a/extensions/openai/realtime-voice-provider.ts b/extensions/openai/realtime-voice-provider.ts index 2afb8dcfc40..687b5098a89 100644 --- a/extensions/openai/realtime-voice-provider.ts +++ b/extensions/openai/realtime-voice-provider.ts @@ -103,11 +103,25 @@ function normalizeProviderConfig( value: raw?.apiKey, path: "plugins.entries.voice-call.config.realtime.providers.openai.apiKey", }), - model: trimToUndefined(raw?.model), - voice: raw?.voice as OpenAIRealtimeVoice | undefined, - temperature: asNumber(raw?.temperature), - vadThreshold: asNumber(raw?.vadThreshold), - silenceDurationMs: asNumber(raw?.silenceDurationMs), + model: trimToUndefined(raw?.model) ?? trimToUndefined(process.env.REALTIME_VOICE_MODEL), + voice: (trimToUndefined(raw?.voice) ?? trimToUndefined(process.env.REALTIME_VOICE_VOICE)) as + | OpenAIRealtimeVoice + | undefined, + temperature: + asNumber(raw?.temperature) ?? + (typeof process.env.REALTIME_VOICE_TEMPERATURE === "string" + ? Number.parseFloat(process.env.REALTIME_VOICE_TEMPERATURE) + : undefined), + vadThreshold: + asNumber(raw?.vadThreshold) ?? + (typeof process.env.VAD_THRESHOLD === "string" + ? Number.parseFloat(process.env.VAD_THRESHOLD) + : undefined), + silenceDurationMs: + asNumber(raw?.silenceDurationMs) ?? + (typeof process.env.SILENCE_DURATION_MS === "string" + ? Number.parseInt(process.env.SILENCE_DURATION_MS, 10) + : undefined), prefixPaddingMs: asNumber(raw?.prefixPaddingMs), azureEndpoint: trimToUndefined(raw?.azureEndpoint), azureDeployment: trimToUndefined(raw?.azureDeployment), diff --git a/extensions/voice-call/README.md b/extensions/voice-call/README.md index a2dd7eba40a..b97f679a64e 100644 --- a/extensions/voice-call/README.md +++ b/extensions/voice-call/README.md @@ -76,7 +76,15 @@ Put under `plugins.entries.voice-call.config`: streaming: { enabled: true, + // optional; if omitted, Voice Call picks the first registered + // realtime-transcription provider by autoSelectOrder + provider: "openai", streamPath: "/voice/stream", + providers: { + openai: { + model: "gpt-4o-transcribe", + }, + }, preStartTimeoutMs: 5000, maxPendingConnections: 32, maxPendingConnectionsPerIp: 4, @@ -145,4 +153,4 @@ Actions: - While a Twilio stream is active, playback does not fall back to TwiML `<Say>`; stream-TTS failures fail the playback request.
- Outbound conversation calls suppress barge-in only while the initial greeting is actively speaking, then re-enable normal interruption. - Twilio stream disconnect auto-end uses a short grace window so quick reconnects do not end the call. -- Media streaming requires `ws` plus a configured realtime-transcription provider. The bundled provider today is OpenAI. +- Realtime provider selection is generic. Configure `streaming.provider` / `realtime.provider` and put provider-owned options under `providers.<id>`. diff --git a/extensions/voice-call/index.ts b/extensions/voice-call/index.ts index fc667698586..a932ca95912 100644 --- a/extensions/voice-call/index.ts +++ b/extensions/voice-call/index.ts @@ -72,45 +72,28 @@ const voiceCallConfigSchema = { advanced: true, }, "streaming.enabled": { label: "Enable Streaming", advanced: true }, - "streaming.provider": { label: "Streaming Provider", advanced: true }, - "streaming.providers.openai.apiKey": { - label: "OpenAI Realtime API Key", - sensitive: true, + "streaming.provider": { + label: "Streaming Provider", + help: "Uses the first registered realtime transcription provider when unset.", advanced: true, }, - "streaming.providers.openai.model": { label: "Realtime STT Model", advanced: true }, + "streaming.providers": { label: "Streaming Provider Config", advanced: true }, "streaming.streamPath": { label: "Media Stream Path", advanced: true }, "realtime.enabled": { label: "Enable Realtime Voice", advanced: true }, - "realtime.provider": { label: "Realtime Voice Provider", advanced: true }, - "realtime.streamPath": { label: "Realtime Stream Path", advanced: true }, - "realtime.instructions": { label: "Realtime Instructions", advanced: true }, - "realtime.providers.openai.apiKey": { - label: "OpenAI Realtime API Key", - sensitive: true, + "realtime.provider": { + label: "Realtime Voice Provider", + help: "Uses the first registered realtime voice provider when unset.", advanced: true, }, - "realtime.providers.openai.model": { label: "OpenAI Realtime Model", advanced: true }, - "realtime.providers.openai.voice": { label: "OpenAI Realtime Voice", advanced: true }, + "realtime.streamPath": { label: "Realtime Stream Path", advanced: true }, + "realtime.instructions": { label: "Realtime Instructions", advanced: true }, + "realtime.providers": { label: "Realtime Provider Config", advanced: true }, "tts.provider": { label: "TTS Provider Override", help: "Deep-merges with messages.tts (Microsoft is ignored for calls).", advanced: true, }, - "tts.providers.openai.model": { label: "OpenAI TTS Model", advanced: true }, - "tts.providers.openai.voice": { label: "OpenAI TTS Voice", advanced: true }, - "tts.providers.openai.apiKey": { - label: "OpenAI API Key", - sensitive: true, - advanced: true, - }, - "tts.providers.elevenlabs.modelId": { label: "ElevenLabs Model ID", advanced: true }, - "tts.providers.elevenlabs.voiceId": { label: "ElevenLabs Voice ID", advanced: true }, - "tts.providers.elevenlabs.apiKey": { - label: "ElevenLabs API Key", - sensitive: true, - advanced: true, - }, - "tts.providers.elevenlabs.baseUrl": { label: "ElevenLabs Base URL", advanced: true }, + "tts.providers": { label: "TTS Provider Config", advanced: true }, publicUrl: { label: "Public Webhook URL", advanced: true }, skipSignatureVerification: { label: "Skip Signature Verification", diff --git a/extensions/voice-call/openclaw.plugin.json b/extensions/voice-call/openclaw.plugin.json index f0700789bb3..e70c5d5aeb5 100644 --- a/extensions/voice-call/openclaw.plugin.json +++ 
b/extensions/voice-call/openclaw.plugin.json @@ -88,54 +88,45 @@ }, "streaming.provider": { "label": "Streaming Provider", + "help": "Uses the first registered realtime transcription provider when unset.", "advanced": true }, - "streaming.providers.openai.apiKey": { - "label": "OpenAI Realtime API Key", - "sensitive": true, - "advanced": true - }, - "streaming.providers.openai.model": { - "label": "Realtime STT Model", + "streaming.providers": { + "label": "Streaming Provider Config", "advanced": true }, "streaming.streamPath": { "label": "Media Stream Path", "advanced": true }, + "realtime.enabled": { + "label": "Enable Realtime Voice", + "advanced": true + }, + "realtime.provider": { + "label": "Realtime Voice Provider", + "help": "Uses the first registered realtime voice provider when unset.", + "advanced": true + }, + "realtime.streamPath": { + "label": "Realtime Stream Path", + "advanced": true + }, + "realtime.instructions": { + "label": "Realtime Instructions", + "advanced": true + }, + "realtime.providers": { + "label": "Realtime Provider Config", + "advanced": true + }, "tts.provider": { "label": "TTS Provider Override", "help": "Deep-merges with messages.tts (Microsoft is ignored for calls).", "advanced": true }, - "tts.providers.openai.model": { - "label": "OpenAI TTS Model", - "advanced": true - }, - "tts.providers.openai.voice": { - "label": "OpenAI TTS Voice", - "advanced": true - }, - "tts.providers.openai.apiKey": { - "label": "OpenAI API Key", - "sensitive": true, - "advanced": true - }, - "tts.providers.elevenlabs.modelId": { - "label": "ElevenLabs Model ID", - "advanced": true - }, - "tts.providers.elevenlabs.voiceId": { - "label": "ElevenLabs Voice ID", - "advanced": true - }, - "tts.providers.elevenlabs.apiKey": { - "label": "ElevenLabs API Key", - "sensitive": true, - "advanced": true - }, - "tts.providers.elevenlabs.baseUrl": { - "label": "ElevenLabs Base URL", + "tts.providers": { + "label": "TTS Provider Config", "advanced": true }, "publicUrl": { @@ -470,19 +461,6 @@ "skipSignatureVerification": { "type": "boolean" }, - "stt": { - "type": "object", - "additionalProperties": false, - "properties": { - "provider": { - "type": "string", - "enum": ["openai"] - }, - "model": { - "type": "string" - } - } - }, "tts": { "type": "object", "additionalProperties": false, diff --git a/extensions/voice-call/src/config.test.ts b/extensions/voice-call/src/config.test.ts index ec268b1c3ca..966ef8cdf97 100644 --- a/extensions/voice-call/src/config.test.ts +++ b/extensions/voice-call/src/config.test.ts @@ -223,8 +223,8 @@ describe("normalizeVoiceCallConfig", () => { expect(normalized.serve.path).toBe("/voice/webhook"); expect(normalized.streaming.streamPath).toBe("/custom-stream"); - expect(normalized.streaming.provider).toBe("openai"); - expect(normalized.streaming.providers.openai).toEqual({}); + expect(normalized.streaming.provider).toBeUndefined(); + expect(normalized.streaming.providers).toEqual({}); expect(normalized.realtime.streamPath).toBe("/voice/stream/realtime"); expect(normalized.tunnel.provider).toBe("none"); expect(normalized.webhookSecurity.allowedHosts).toEqual([]); @@ -271,3 +271,48 @@ describe("normalizeVoiceCallConfig", () => { expect(elevenlabs.voiceSettings).toEqual({ speed: 1.1 }); }); }); + +describe("resolveVoiceCallConfig", () => { + const originalEnv = { ...process.env }; + + afterEach(() => { + process.env = { ...originalEnv }; + }); + + it("keeps legacy streaming OpenAI fields inside providers.openai without forcing provider selection", () => { + const 
resolved = resolveVoiceCallConfig({ + enabled: true, + provider: "twilio", + streaming: { + enabled: true, + openaiApiKey: "sk-test", // pragma: allowlist secret + sttModel: "gpt-4o-transcribe", + silenceDurationMs: 700, + vadThreshold: 0.4, + }, + }); + + expect(resolved.streaming.provider).toBeUndefined(); + expect(resolved.streaming.providers.openai).toEqual({ + apiKey: "sk-test", + model: "gpt-4o-transcribe", + silenceDurationMs: 700, + vadThreshold: 0.4, + }); + }); + + it("maps realtime instructions from the legacy env hook without altering provider selection", () => { + process.env.REALTIME_VOICE_INSTRUCTIONS = "Stay concise."; + + const resolved = resolveVoiceCallConfig({ + enabled: true, + provider: "twilio", + realtime: { + enabled: true, + }, + }); + + expect(resolved.realtime.instructions).toBe("Stay concise."); + expect(resolved.realtime.provider).toBeUndefined(); + }); +}); diff --git a/extensions/voice-call/src/config.ts b/extensions/voice-call/src/config.ts index 7e2c519a7be..021b7042cf0 100644 --- a/extensions/voice-call/src/config.ts +++ b/extensions/voice-call/src/config.ts @@ -64,21 +64,6 @@ export const PlivoConfigSchema = z .strict(); export type PlivoConfig = z.infer<typeof PlivoConfigSchema>; -// ----------------------------------------------------------------------------- -// STT/TTS Configuration -// ----------------------------------------------------------------------------- - -export const SttConfigSchema = z - .object({ - /** One-shot STT provider for non-streaming paths. */ - provider: z.literal("openai").default("openai"), - /** Whisper model to use */ - model: z.string().min(1).default("whisper-1"), - }) - .strict() - .default({ provider: "openai", model: "whisper-1" }); -export type SttConfig = z.infer<typeof SttConfigSchema>; - export { TtsAutoSchema, TtsConfigSchema, TtsModeSchema, TtsProviderSchema }; export type VoiceCallTtsConfig = z.infer<typeof TtsConfigSchema>; @@ -255,7 +240,7 @@ export const VoiceCallStreamingConfigSchema = z /** Enable real-time audio streaming (requires WebSocket support) */ enabled: z.boolean().default(false), /** Provider id from registered realtime transcription providers. */ - provider: z.string().min(1).default("openai"), + provider: z.string().min(1).optional(), /** @deprecated Legacy alias for provider. */ sttProvider: z.string().min(1).optional(), /** @deprecated Legacy OpenAI-specific API key field. */ @@ -285,7 +270,6 @@ export const VoiceCallStreamingConfigSchema = z .strict() .default({ enabled: false, - provider: "openai", streamPath: "/voice/stream", providers: {}, preStartTimeoutMs: 5000, @@ -381,9 +365,6 @@ export const VoiceCallConfigSchema = z /** Skip webhook signature verification (development only, NOT for production) */ skipSignatureVerification: z.boolean().default(false), - /** STT configuration */ - stt: SttConfigSchema, - /** TTS override (deep-merges with core messages.tts) */ tts: TtsConfigSchema, @@ -467,36 +448,73 @@ function sanitizeVoiceCallProviderConfigs( ); } +function mergeLegacyStreamingOpenAICompat( + streaming: VoiceCallStreamingConfig, +): VoiceCallStreamingConfig { + const providers = { ...(streaming.providers ?? {}) }; + const legacyStreamingRaw = streaming as Record<string, unknown>; + const openaiRaw = + providers.openai && typeof providers.openai === "object" + ? 
{ ...(providers.openai as Record<string, unknown>) } + : {}; + + if (typeof openaiRaw.apiKey !== "string" && typeof legacyStreamingRaw.openaiApiKey === "string") { + openaiRaw.apiKey = legacyStreamingRaw.openaiApiKey; + } + if (typeof openaiRaw.model !== "string" && typeof legacyStreamingRaw.sttModel === "string") { + openaiRaw.model = legacyStreamingRaw.sttModel; + } + if ( + openaiRaw.silenceDurationMs == null && + typeof legacyStreamingRaw.silenceDurationMs === "number" + ) { + openaiRaw.silenceDurationMs = legacyStreamingRaw.silenceDurationMs; + } + if (openaiRaw.vadThreshold == null && typeof legacyStreamingRaw.vadThreshold === "number") { + openaiRaw.vadThreshold = legacyStreamingRaw.vadThreshold; + } + if (Object.keys(openaiRaw).length > 0) { + providers.openai = openaiRaw; + } + + return { + ...streaming, + providers, + }; +} + +function mergeLegacyRealtimeOpenAICompat( + realtime: VoiceCallRealtimeConfig, +): VoiceCallRealtimeConfig { + const providers = { ...(realtime.providers ?? {}) }; + const openaiRaw = + providers.openai && typeof providers.openai === "object" + ? { ...(providers.openai as Record<string, unknown>) } + : {}; + + if (Object.keys(openaiRaw).length > 0) { + providers.openai = openaiRaw; + } + + return { + ...realtime, + providers, + }; +} + export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallConfig { const defaults = cloneDefaultVoiceCallConfig(); const serve = { ...defaults.serve, ...config.serve }; const streamingProvider = config.streaming?.provider ?? - (typeof config.streaming?.sttProvider === "string" - ? config.streaming.sttProvider - : undefined) ?? - defaults.streaming.provider; + (typeof config.streaming?.sttProvider === "string" ? config.streaming.sttProvider : undefined); const streamingProviders = sanitizeVoiceCallProviderConfigs( config.streaming?.providers ?? defaults.streaming.providers, ); - if ( - typeof streamingProvider === "string" && - streamingProvider.trim() && - !(streamingProvider in streamingProviders) - ) { - streamingProviders[streamingProvider] = {}; - } const realtimeProvider = config.realtime?.provider ?? defaults.realtime.provider; const realtimeProviders = sanitizeVoiceCallProviderConfigs( config.realtime?.providers ?? defaults.realtime.providers, ); - if ( - typeof realtimeProvider === "string" && - realtimeProvider.trim() && - !(realtimeProvider in realtimeProviders) - ) { - realtimeProviders[realtimeProvider] = {}; - } return { ...defaults, ...config, @@ -529,7 +547,6 @@ export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCal (config.realtime?.tools as RealtimeToolConfig[] | undefined) ?? defaults.realtime.tools, providers: realtimeProviders, }, - stt: { ...defaults.stt, ...config.stt }, tts: normalizeVoiceCallTtsConfig(defaults.tts, config.tts), }; } @@ -584,132 +601,16 @@ export function resolveVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallC resolved.webhookSecurity.trustForwardingHeaders ?? false; resolved.webhookSecurity.trustedProxyIPs = resolved.webhookSecurity.trustedProxyIPs ?? []; - resolved.streaming = { - ...resolved.streaming, - providers: { ...(resolved.streaming.providers ?? {}) }, - }; - const legacyStreamingRaw = resolved.streaming as Record<string, unknown>; - const openaiStreamingRaw = - resolved.streaming.providers.openai && typeof resolved.streaming.providers.openai === "object" - ? 
{ ...(resolved.streaming.providers.openai as Record<string, unknown>) } - : {}; - if ( - typeof openaiStreamingRaw.apiKey !== "string" && - typeof legacyStreamingRaw.openaiApiKey === "string" - ) { - openaiStreamingRaw.apiKey = legacyStreamingRaw.openaiApiKey; - } - if ( - typeof openaiStreamingRaw.model !== "string" && - typeof legacyStreamingRaw.sttModel === "string" - ) { - openaiStreamingRaw.model = legacyStreamingRaw.sttModel; - } - if ( - openaiStreamingRaw.silenceDurationMs == null && - typeof legacyStreamingRaw.silenceDurationMs === "number" - ) { - openaiStreamingRaw.silenceDurationMs = legacyStreamingRaw.silenceDurationMs; - } - if ( - openaiStreamingRaw.vadThreshold == null && - typeof legacyStreamingRaw.vadThreshold === "number" - ) { - openaiStreamingRaw.vadThreshold = legacyStreamingRaw.vadThreshold; - } - if (typeof openaiStreamingRaw.apiKey !== "string" || !openaiStreamingRaw.apiKey.trim()) { - if (process.env.OPENAI_API_KEY) { - openaiStreamingRaw.apiKey = process.env.OPENAI_API_KEY; - } - } - if ( - typeof openaiStreamingRaw.model !== "string" && - typeof process.env.REALTIME_TRANSCRIPTION_MODEL === "string" - ) { - openaiStreamingRaw.model = process.env.REALTIME_TRANSCRIPTION_MODEL; - } - if ( - typeof openaiStreamingRaw.model !== "string" && - typeof process.env.STREAMING_STT_MODEL === "string" - ) { - openaiStreamingRaw.model = process.env.STREAMING_STT_MODEL; - } - if (openaiStreamingRaw.vadThreshold == null && typeof process.env.VAD_THRESHOLD === "string") { - openaiStreamingRaw.vadThreshold = Number.parseFloat(process.env.VAD_THRESHOLD); - } - if ( - openaiStreamingRaw.silenceDurationMs == null && - typeof process.env.SILENCE_DURATION_MS === "string" - ) { - openaiStreamingRaw.silenceDurationMs = Number.parseInt(process.env.SILENCE_DURATION_MS, 10); - } - if (Object.keys(openaiStreamingRaw).length > 0) { - resolved.streaming.providers.openai = openaiStreamingRaw; - } - if ( - typeof resolved.streaming.provider === "string" && - resolved.streaming.provider.trim() && - !(resolved.streaming.provider in resolved.streaming.providers) - ) { - resolved.streaming.providers[resolved.streaming.provider] = {}; - } + // Keep parsing legacy OpenAI-shaped fields, but isolate them to the OpenAI provider blob. + resolved.streaming = mergeLegacyStreamingOpenAICompat(resolved.streaming); - resolved.realtime = { - ...resolved.realtime, - providers: { ...(resolved.realtime.providers ?? {}) }, - }; - const openaiRealtimeRaw = - resolved.realtime.providers.openai && typeof resolved.realtime.providers.openai === "object" - ? 
{ ...(resolved.realtime.providers.openai as Record<string, unknown>) } - : {}; - if (typeof openaiRealtimeRaw.apiKey !== "string" || !openaiRealtimeRaw.apiKey.trim()) { - if (process.env.OPENAI_API_KEY) { - openaiRealtimeRaw.apiKey = process.env.OPENAI_API_KEY; - } - } - if ( - typeof openaiRealtimeRaw.model !== "string" && - typeof process.env.REALTIME_VOICE_MODEL === "string" - ) { - openaiRealtimeRaw.model = process.env.REALTIME_VOICE_MODEL; - } - if ( - typeof openaiRealtimeRaw.voice !== "string" && - typeof process.env.REALTIME_VOICE_VOICE === "string" - ) { - openaiRealtimeRaw.voice = process.env.REALTIME_VOICE_VOICE; - } + resolved.realtime = mergeLegacyRealtimeOpenAICompat(resolved.realtime); if ( typeof resolved.realtime.instructions !== "string" && typeof process.env.REALTIME_VOICE_INSTRUCTIONS === "string" ) { resolved.realtime.instructions = process.env.REALTIME_VOICE_INSTRUCTIONS; } - if ( - openaiRealtimeRaw.temperature == null && - typeof process.env.REALTIME_VOICE_TEMPERATURE === "string" - ) { - openaiRealtimeRaw.temperature = Number.parseFloat(process.env.REALTIME_VOICE_TEMPERATURE); - } - if (openaiRealtimeRaw.vadThreshold == null && typeof process.env.VAD_THRESHOLD === "string") { - openaiRealtimeRaw.vadThreshold = Number.parseFloat(process.env.VAD_THRESHOLD); - } - if ( - openaiRealtimeRaw.silenceDurationMs == null && - typeof process.env.SILENCE_DURATION_MS === "string" - ) { - openaiRealtimeRaw.silenceDurationMs = Number.parseInt(process.env.SILENCE_DURATION_MS, 10); - } - if (Object.keys(openaiRealtimeRaw).length > 0) { - resolved.realtime.providers.openai = openaiRealtimeRaw; - } - if ( - typeof resolved.realtime.provider === "string" && - resolved.realtime.provider.trim() && - !(resolved.realtime.provider in resolved.realtime.providers) - ) { - resolved.realtime.providers[resolved.realtime.provider] = {}; - } return normalizeVoiceCallConfig(resolved); } diff --git a/extensions/voice-call/src/test-fixtures.ts b/extensions/voice-call/src/test-fixtures.ts index bb05a6e4bc6..b1cdb515ecc 100644 --- a/extensions/voice-call/src/test-fixtures.ts +++ b/extensions/voice-call/src/test-fixtures.ts @@ -30,7 +30,6 @@ export function createVoiceCallBaseConfig(params?: { }, streaming: { enabled: false, - provider: "openai", providers: { openai: { model: "gpt-4o-transcribe", @@ -51,7 +50,6 @@ export function createVoiceCallBaseConfig(params?: { providers: {}, }, skipSignatureVerification: false, - stt: { provider: "openai", model: "whisper-1" }, tts: { provider: "openai", providers: { diff --git a/extensions/voice-call/src/webhook.test.ts b/extensions/voice-call/src/webhook.test.ts index cb15233f856..e914b27b8a1 100644 --- a/extensions/voice-call/src/webhook.test.ts +++ b/extensions/voice-call/src/webhook.test.ts @@ -24,12 +24,16 @@ const mocks = vi.hoisted(() => { }; return { - getRealtimeTranscriptionProvider: vi.fn(() => realtimeTranscriptionProvider), + getRealtimeTranscriptionProvider: vi.fn< + (...args: unknown[]) => RealtimeTranscriptionProviderPlugin | undefined + >(() => realtimeTranscriptionProvider), + listRealtimeTranscriptionProviders: vi.fn(() => [realtimeTranscriptionProvider]), }; }); vi.mock("./realtime-transcription.runtime.js", () => ({ getRealtimeTranscriptionProvider: mocks.getRealtimeTranscriptionProvider, + listRealtimeTranscriptionProviders: mocks.listRealtimeTranscriptionProviders, })); const provider: VoiceCallProvider = { @@ -110,6 +114,48 @@ function expectWebhookUrl(url: string, expectedPath: string) { expect(parsed.port).not.toBe("0"); } 
+describe("VoiceCallWebhookServer realtime transcription provider selection", () => { + it("auto-selects the first registered provider when streaming.provider is unset", async () => { + const { manager } = createManager([]); + const config = createConfig({ + streaming: { + ...createConfig().streaming, + enabled: true, + providers: { + openai: { + apiKey: "sk-test", // pragma: allowlist secret + }, + }, + }, + }); + const autoSelectedProvider: RealtimeTranscriptionProviderPlugin = { + id: "openai", + label: "OpenAI", + autoSelectOrder: 5, + isConfigured: () => true, + resolveConfig: ({ rawConfig }) => rawConfig, + createSession: () => ({ + connect: async () => {}, + sendAudio: () => {}, + close: () => {}, + isConnected: () => true, + }), + }; + mocks.getRealtimeTranscriptionProvider.mockReturnValueOnce(undefined); + mocks.listRealtimeTranscriptionProviders.mockReturnValueOnce([autoSelectedProvider]); + + const server = new VoiceCallWebhookServer(config, manager, provider); + try { + await server.start(); + expect(mocks.getRealtimeTranscriptionProvider).toHaveBeenCalledWith(undefined, undefined); + expect(mocks.listRealtimeTranscriptionProviders).toHaveBeenCalledWith(undefined); + expect(server.getMediaStreamHandler()).toBeTruthy(); + } finally { + await server.stop(); + } + }); +}); + async function runStaleCallReaperCase(params: { callAgeMs: number; staleCallReaperSeconds: number; diff --git a/extensions/voice-call/src/webhook.ts b/extensions/voice-call/src/webhook.ts index e9f36a61ad5..86287d7f73b 100644 --- a/extensions/voice-call/src/webhook.ts +++ b/extensions/voice-call/src/webhook.ts @@ -158,18 +158,32 @@ export class VoiceCallWebhookServer { */ private async initializeMediaStreaming(): Promise { const streaming = this.config.streaming; - const selectedProviderId = streaming.provider; const pluginConfig = this.coreConfig as unknown as OpenClawConfig | undefined; - const { getRealtimeTranscriptionProvider } = + const { getRealtimeTranscriptionProvider, listRealtimeTranscriptionProviders } = await import("./realtime-transcription.runtime.js"); - const provider = getRealtimeTranscriptionProvider(selectedProviderId, pluginConfig); - if (!provider) { + const selectedProviderId = streaming.provider?.trim(); + const configuredProvider = getRealtimeTranscriptionProvider(selectedProviderId, pluginConfig); + if (selectedProviderId && !configuredProvider) { console.warn( `[voice-call] Streaming enabled but realtime transcription provider "${selectedProviderId}" is not registered`, ); return; } + const provider = + configuredProvider ?? + [...listRealtimeTranscriptionProviders(pluginConfig)].sort( + (left, right) => + (left.autoSelectOrder ?? Number.MAX_SAFE_INTEGER) - + (right.autoSelectOrder ?? Number.MAX_SAFE_INTEGER), + )[0]; + if (!provider) { + console.warn( + "[voice-call] Streaming enabled but no realtime transcription provider is registered", + ); + return; + } const selectedProviderConfig = + selectedProviderId && streaming.providers[selectedProviderId] && typeof streaming.providers[selectedProviderId] === "object" ? (streaming.providers[selectedProviderId] as Record)