refactor(voice-call): clean provider boundaries

This commit is contained in:
Peter Steinberger
2026-04-04 12:33:35 +09:00
parent 6964e4acf7
commit 7cd40ad565
12 changed files with 308 additions and 253 deletions

View File

@@ -1,7 +1,13 @@
import { describe, expect, it } from "vitest";
import { afterEach, describe, expect, it } from "vitest";
import { buildOpenAIRealtimeTranscriptionProvider } from "./realtime-transcription-provider.js";
describe("buildOpenAIRealtimeTranscriptionProvider", () => {
const originalEnv = { ...process.env };
afterEach(() => {
process.env = { ...originalEnv };
});
it("normalizes OpenAI config defaults", () => {
const provider = buildOpenAIRealtimeTranscriptionProvider();
const resolved = provider.resolveConfig?.({
@@ -20,6 +26,24 @@ describe("buildOpenAIRealtimeTranscriptionProvider", () => {
});
});
it("reads provider-owned env fallbacks", () => {
process.env.REALTIME_TRANSCRIPTION_MODEL = "gpt-4o-transcribe";
process.env.SILENCE_DURATION_MS = "900";
process.env.VAD_THRESHOLD = "0.45";
const provider = buildOpenAIRealtimeTranscriptionProvider();
const resolved = provider.resolveConfig?.({
cfg: {} as never,
rawConfig: {},
});
expect(resolved).toEqual({
model: "gpt-4o-transcribe",
silenceDurationMs: 900,
vadThreshold: 0.45,
});
});
it("accepts the legacy openai-realtime alias", () => {
const provider = buildOpenAIRealtimeTranscriptionProvider();
expect(provider.aliases).toContain("openai-realtime");

View File

@@ -57,9 +57,21 @@ function normalizeProviderConfig(
value: raw?.openaiApiKey,
path: "plugins.entries.voice-call.config.streaming.openaiApiKey",
}),
model: trimToUndefined(raw?.model) ?? trimToUndefined(raw?.sttModel),
silenceDurationMs: asNumber(raw?.silenceDurationMs),
vadThreshold: asNumber(raw?.vadThreshold),
model:
trimToUndefined(raw?.model) ??
trimToUndefined(raw?.sttModel) ??
trimToUndefined(process.env.REALTIME_TRANSCRIPTION_MODEL) ??
trimToUndefined(process.env.STREAMING_STT_MODEL),
silenceDurationMs:
asNumber(raw?.silenceDurationMs) ??
(typeof process.env.SILENCE_DURATION_MS === "string"
? Number.parseInt(process.env.SILENCE_DURATION_MS, 10)
: undefined),
vadThreshold:
asNumber(raw?.vadThreshold) ??
(typeof process.env.VAD_THRESHOLD === "string"
? Number.parseFloat(process.env.VAD_THRESHOLD)
: undefined),
};
}

View File

@@ -0,0 +1,32 @@
import { afterEach, describe, expect, it } from "vitest";
import { buildOpenAIRealtimeVoiceProvider } from "./realtime-voice-provider.js";
describe("buildOpenAIRealtimeVoiceProvider", () => {
  // Snapshot the environment once so every test starts from a clean slate.
  const savedEnv = { ...process.env };

  afterEach(() => {
    process.env = { ...savedEnv };
  });

  it("normalizes provider-owned env fallbacks", () => {
    // Provider-owned env hooks should flow into the resolved config when the
    // raw config leaves the corresponding fields unset.
    Object.assign(process.env, {
      REALTIME_VOICE_MODEL: "gpt-realtime",
      REALTIME_VOICE_VOICE: "verse",
      REALTIME_VOICE_TEMPERATURE: "0.6",
      SILENCE_DURATION_MS: "850",
      VAD_THRESHOLD: "0.35",
    });

    const resolved = buildOpenAIRealtimeVoiceProvider().resolveConfig?.({
      cfg: {} as never,
      rawConfig: {},
    });

    expect(resolved).toEqual({
      model: "gpt-realtime",
      voice: "verse",
      temperature: 0.6,
      silenceDurationMs: 850,
      vadThreshold: 0.35,
    });
  });
});

View File

@@ -103,11 +103,25 @@ function normalizeProviderConfig(
value: raw?.apiKey,
path: "plugins.entries.voice-call.config.realtime.providers.openai.apiKey",
}),
model: trimToUndefined(raw?.model),
voice: raw?.voice as OpenAIRealtimeVoice | undefined,
temperature: asNumber(raw?.temperature),
vadThreshold: asNumber(raw?.vadThreshold),
silenceDurationMs: asNumber(raw?.silenceDurationMs),
model: trimToUndefined(raw?.model) ?? trimToUndefined(process.env.REALTIME_VOICE_MODEL),
voice: (trimToUndefined(raw?.voice) ?? trimToUndefined(process.env.REALTIME_VOICE_VOICE)) as
| OpenAIRealtimeVoice
| undefined,
temperature:
asNumber(raw?.temperature) ??
(typeof process.env.REALTIME_VOICE_TEMPERATURE === "string"
? Number.parseFloat(process.env.REALTIME_VOICE_TEMPERATURE)
: undefined),
vadThreshold:
asNumber(raw?.vadThreshold) ??
(typeof process.env.VAD_THRESHOLD === "string"
? Number.parseFloat(process.env.VAD_THRESHOLD)
: undefined),
silenceDurationMs:
asNumber(raw?.silenceDurationMs) ??
(typeof process.env.SILENCE_DURATION_MS === "string"
? Number.parseInt(process.env.SILENCE_DURATION_MS, 10)
: undefined),
prefixPaddingMs: asNumber(raw?.prefixPaddingMs),
azureEndpoint: trimToUndefined(raw?.azureEndpoint),
azureDeployment: trimToUndefined(raw?.azureDeployment),

View File

@@ -76,7 +76,15 @@ Put under `plugins.entries.voice-call.config`:
streaming: {
enabled: true,
// optional; if omitted, Voice Call picks the first registered
// realtime-transcription provider by autoSelectOrder
provider: "openai",
streamPath: "/voice/stream",
providers: {
openai: {
model: "gpt-4o-transcribe",
},
},
preStartTimeoutMs: 5000,
maxPendingConnections: 32,
maxPendingConnectionsPerIp: 4,
@@ -145,4 +153,4 @@ Actions:
- While a Twilio stream is active, playback does not fall back to TwiML `<Say>`; stream-TTS failures fail the playback request.
- Outbound conversation calls suppress barge-in only while the initial greeting is actively speaking, then re-enable normal interruption.
- Twilio stream disconnect auto-end uses a short grace window so quick reconnects do not end the call.
- Media streaming requires `ws` plus a configured realtime-transcription provider. The bundled provider today is OpenAI.
- Realtime provider selection is generic. Configure `streaming.provider` / `realtime.provider` and put provider-owned options under `providers.<id>`.

View File

@@ -72,45 +72,28 @@ const voiceCallConfigSchema = {
advanced: true,
},
"streaming.enabled": { label: "Enable Streaming", advanced: true },
"streaming.provider": { label: "Streaming Provider", advanced: true },
"streaming.providers.openai.apiKey": {
label: "OpenAI Realtime API Key",
sensitive: true,
"streaming.provider": {
label: "Streaming Provider",
help: "Uses the first registered realtime transcription provider when unset.",
advanced: true,
},
"streaming.providers.openai.model": { label: "Realtime STT Model", advanced: true },
"streaming.providers": { label: "Streaming Provider Config", advanced: true },
"streaming.streamPath": { label: "Media Stream Path", advanced: true },
"realtime.enabled": { label: "Enable Realtime Voice", advanced: true },
"realtime.provider": { label: "Realtime Voice Provider", advanced: true },
"realtime.streamPath": { label: "Realtime Stream Path", advanced: true },
"realtime.instructions": { label: "Realtime Instructions", advanced: true },
"realtime.providers.openai.apiKey": {
label: "OpenAI Realtime API Key",
sensitive: true,
"realtime.provider": {
label: "Realtime Voice Provider",
help: "Uses the first registered realtime voice provider when unset.",
advanced: true,
},
"realtime.providers.openai.model": { label: "OpenAI Realtime Model", advanced: true },
"realtime.providers.openai.voice": { label: "OpenAI Realtime Voice", advanced: true },
"realtime.streamPath": { label: "Realtime Stream Path", advanced: true },
"realtime.instructions": { label: "Realtime Instructions", advanced: true },
"realtime.providers": { label: "Realtime Provider Config", advanced: true },
"tts.provider": {
label: "TTS Provider Override",
help: "Deep-merges with messages.tts (Microsoft is ignored for calls).",
advanced: true,
},
"tts.providers.openai.model": { label: "OpenAI TTS Model", advanced: true },
"tts.providers.openai.voice": { label: "OpenAI TTS Voice", advanced: true },
"tts.providers.openai.apiKey": {
label: "OpenAI API Key",
sensitive: true,
advanced: true,
},
"tts.providers.elevenlabs.modelId": { label: "ElevenLabs Model ID", advanced: true },
"tts.providers.elevenlabs.voiceId": { label: "ElevenLabs Voice ID", advanced: true },
"tts.providers.elevenlabs.apiKey": {
label: "ElevenLabs API Key",
sensitive: true,
advanced: true,
},
"tts.providers.elevenlabs.baseUrl": { label: "ElevenLabs Base URL", advanced: true },
"tts.providers": { label: "TTS Provider Config", advanced: true },
publicUrl: { label: "Public Webhook URL", advanced: true },
skipSignatureVerification: {
label: "Skip Signature Verification",

View File

@@ -88,54 +88,45 @@
},
"streaming.provider": {
"label": "Streaming Provider",
"help": "Uses the first registered realtime transcription provider when unset.",
"advanced": true
},
"streaming.providers.openai.apiKey": {
"label": "OpenAI Realtime API Key",
"sensitive": true,
"advanced": true
},
"streaming.providers.openai.model": {
"label": "Realtime STT Model",
"streaming.providers": {
"label": "Streaming Provider Config",
"advanced": true
},
"streaming.streamPath": {
"label": "Media Stream Path",
"advanced": true
},
"realtime.enabled": {
"label": "Enable Realtime Voice",
"advanced": true
},
"realtime.provider": {
"label": "Realtime Voice Provider",
"help": "Uses the first registered realtime voice provider when unset.",
"advanced": true
},
"realtime.streamPath": {
"label": "Realtime Stream Path",
"advanced": true
},
"realtime.instructions": {
"label": "Realtime Instructions",
"advanced": true
},
"realtime.providers": {
"label": "Realtime Provider Config",
"advanced": true
},
"tts.provider": {
"label": "TTS Provider Override",
"help": "Deep-merges with messages.tts (Microsoft is ignored for calls).",
"advanced": true
},
"tts.providers.openai.model": {
"label": "OpenAI TTS Model",
"advanced": true
},
"tts.providers.openai.voice": {
"label": "OpenAI TTS Voice",
"advanced": true
},
"tts.providers.openai.apiKey": {
"label": "OpenAI API Key",
"sensitive": true,
"advanced": true
},
"tts.providers.elevenlabs.modelId": {
"label": "ElevenLabs Model ID",
"advanced": true
},
"tts.providers.elevenlabs.voiceId": {
"label": "ElevenLabs Voice ID",
"advanced": true
},
"tts.providers.elevenlabs.apiKey": {
"label": "ElevenLabs API Key",
"sensitive": true,
"advanced": true
},
"tts.providers.elevenlabs.baseUrl": {
"label": "ElevenLabs Base URL",
"tts.providers": {
"label": "TTS Provider Config",
"advanced": true
},
"publicUrl": {
@@ -470,19 +461,6 @@
"skipSignatureVerification": {
"type": "boolean"
},
"stt": {
"type": "object",
"additionalProperties": false,
"properties": {
"provider": {
"type": "string",
"enum": ["openai"]
},
"model": {
"type": "string"
}
}
},
"tts": {
"type": "object",
"additionalProperties": false,

View File

@@ -223,8 +223,8 @@ describe("normalizeVoiceCallConfig", () => {
expect(normalized.serve.path).toBe("/voice/webhook");
expect(normalized.streaming.streamPath).toBe("/custom-stream");
expect(normalized.streaming.provider).toBe("openai");
expect(normalized.streaming.providers.openai).toEqual({});
expect(normalized.streaming.provider).toBeUndefined();
expect(normalized.streaming.providers).toEqual({});
expect(normalized.realtime.streamPath).toBe("/voice/stream/realtime");
expect(normalized.tunnel.provider).toBe("none");
expect(normalized.webhookSecurity.allowedHosts).toEqual([]);
@@ -271,3 +271,48 @@ describe("normalizeVoiceCallConfig", () => {
expect(elevenlabs.voiceSettings).toEqual({ speed: 1.1 });
});
});
describe("resolveVoiceCallConfig", () => {
const originalEnv = { ...process.env };
afterEach(() => {
process.env = { ...originalEnv };
});
it("keeps legacy streaming OpenAI fields inside providers.openai without forcing provider selection", () => {
const resolved = resolveVoiceCallConfig({
enabled: true,
provider: "twilio",
streaming: {
enabled: true,
openaiApiKey: "sk-test", // pragma: allowlist secret
sttModel: "gpt-4o-transcribe",
silenceDurationMs: 700,
vadThreshold: 0.4,
},
});
expect(resolved.streaming.provider).toBeUndefined();
expect(resolved.streaming.providers.openai).toEqual({
apiKey: "sk-test",
model: "gpt-4o-transcribe",
silenceDurationMs: 700,
vadThreshold: 0.4,
});
});
it("maps realtime instructions from the legacy env hook without altering provider selection", () => {
process.env.REALTIME_VOICE_INSTRUCTIONS = "Stay concise.";
const resolved = resolveVoiceCallConfig({
enabled: true,
provider: "twilio",
realtime: {
enabled: true,
},
});
expect(resolved.realtime.instructions).toBe("Stay concise.");
expect(resolved.realtime.provider).toBeUndefined();
});
});

View File

@@ -64,21 +64,6 @@ export const PlivoConfigSchema = z
.strict();
export type PlivoConfig = z.infer<typeof PlivoConfigSchema>;
// -----------------------------------------------------------------------------
// STT/TTS Configuration
// -----------------------------------------------------------------------------
export const SttConfigSchema = z
.object({
/** One-shot STT provider for non-streaming paths. */
provider: z.literal("openai").default("openai"),
/** Whisper model to use */
model: z.string().min(1).default("whisper-1"),
})
.strict()
.default({ provider: "openai", model: "whisper-1" });
export type SttConfig = z.infer<typeof SttConfigSchema>;
export { TtsAutoSchema, TtsConfigSchema, TtsModeSchema, TtsProviderSchema };
export type VoiceCallTtsConfig = z.infer<typeof TtsConfigSchema>;
@@ -255,7 +240,7 @@ export const VoiceCallStreamingConfigSchema = z
/** Enable real-time audio streaming (requires WebSocket support) */
enabled: z.boolean().default(false),
/** Provider id from registered realtime transcription providers. */
provider: z.string().min(1).default("openai"),
provider: z.string().min(1).optional(),
/** @deprecated Legacy alias for provider. */
sttProvider: z.string().min(1).optional(),
/** @deprecated Legacy OpenAI-specific API key field. */
@@ -285,7 +270,6 @@ export const VoiceCallStreamingConfigSchema = z
.strict()
.default({
enabled: false,
provider: "openai",
streamPath: "/voice/stream",
providers: {},
preStartTimeoutMs: 5000,
@@ -381,9 +365,6 @@ export const VoiceCallConfigSchema = z
/** Skip webhook signature verification (development only, NOT for production) */
skipSignatureVerification: z.boolean().default(false),
/** STT configuration */
stt: SttConfigSchema,
/** TTS override (deep-merges with core messages.tts) */
tts: TtsConfigSchema,
@@ -467,36 +448,73 @@ function sanitizeVoiceCallProviderConfigs(
);
}
/**
 * Migrates legacy top-level streaming fields (openaiApiKey, sttModel,
 * silenceDurationMs, vadThreshold) into `providers.openai`, never
 * overriding values the user already placed there. Returns a new
 * streaming config; the input is not mutated.
 */
function mergeLegacyStreamingOpenAICompat(
  streaming: VoiceCallStreamingConfig,
): VoiceCallStreamingConfig {
  const legacy = streaming as Record<string, unknown>;
  const providers = { ...(streaming.providers ?? {}) };
  const existing =
    providers.openai && typeof providers.openai === "object"
      ? (providers.openai as Record<string, unknown>)
      : {};
  // Work on a shallow copy so the incoming provider blob stays untouched.
  const merged: Record<string, unknown> = { ...existing };

  if (typeof legacy.openaiApiKey === "string" && typeof merged.apiKey !== "string") {
    merged.apiKey = legacy.openaiApiKey;
  }
  if (typeof legacy.sttModel === "string" && typeof merged.model !== "string") {
    merged.model = legacy.sttModel;
  }
  if (typeof legacy.silenceDurationMs === "number" && merged.silenceDurationMs == null) {
    merged.silenceDurationMs = legacy.silenceDurationMs;
  }
  if (typeof legacy.vadThreshold === "number" && merged.vadThreshold == null) {
    merged.vadThreshold = legacy.vadThreshold;
  }

  // Only materialize providers.openai when there is something to carry over.
  if (Object.keys(merged).length > 0) {
    providers.openai = merged;
  }

  return { ...streaming, providers };
}
/**
 * Returns a realtime config with a shallow-copied `providers` map; a
 * non-empty `providers.openai` object is also replaced by a fresh copy so
 * later in-place edits cannot leak back into the caller's object.
 */
function mergeLegacyRealtimeOpenAICompat(
  realtime: VoiceCallRealtimeConfig,
): VoiceCallRealtimeConfig {
  const providers = { ...(realtime.providers ?? {}) };
  const openaiEntry = providers.openai;

  if (openaiEntry && typeof openaiEntry === "object") {
    const copy = { ...(openaiEntry as Record<string, unknown>) };
    // Preserve the original entry untouched when the blob is empty.
    if (Object.keys(copy).length > 0) {
      providers.openai = copy;
    }
  }

  return { ...realtime, providers };
}
export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallConfig {
const defaults = cloneDefaultVoiceCallConfig();
const serve = { ...defaults.serve, ...config.serve };
const streamingProvider =
config.streaming?.provider ??
(typeof config.streaming?.sttProvider === "string"
? config.streaming.sttProvider
: undefined) ??
defaults.streaming.provider;
(typeof config.streaming?.sttProvider === "string" ? config.streaming.sttProvider : undefined);
const streamingProviders = sanitizeVoiceCallProviderConfigs(
config.streaming?.providers ?? defaults.streaming.providers,
);
if (
typeof streamingProvider === "string" &&
streamingProvider.trim() &&
!(streamingProvider in streamingProviders)
) {
streamingProviders[streamingProvider] = {};
}
const realtimeProvider = config.realtime?.provider ?? defaults.realtime.provider;
const realtimeProviders = sanitizeVoiceCallProviderConfigs(
config.realtime?.providers ?? defaults.realtime.providers,
);
if (
typeof realtimeProvider === "string" &&
realtimeProvider.trim() &&
!(realtimeProvider in realtimeProviders)
) {
realtimeProviders[realtimeProvider] = {};
}
return {
...defaults,
...config,
@@ -529,7 +547,6 @@ export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCal
(config.realtime?.tools as RealtimeToolConfig[] | undefined) ?? defaults.realtime.tools,
providers: realtimeProviders,
},
stt: { ...defaults.stt, ...config.stt },
tts: normalizeVoiceCallTtsConfig(defaults.tts, config.tts),
};
}
@@ -584,132 +601,16 @@ export function resolveVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallC
resolved.webhookSecurity.trustForwardingHeaders ?? false;
resolved.webhookSecurity.trustedProxyIPs = resolved.webhookSecurity.trustedProxyIPs ?? [];
resolved.streaming = {
...resolved.streaming,
providers: { ...(resolved.streaming.providers ?? {}) },
};
const legacyStreamingRaw = resolved.streaming as Record<string, unknown>;
const openaiStreamingRaw =
resolved.streaming.providers.openai && typeof resolved.streaming.providers.openai === "object"
? { ...(resolved.streaming.providers.openai as Record<string, unknown>) }
: {};
if (
typeof openaiStreamingRaw.apiKey !== "string" &&
typeof legacyStreamingRaw.openaiApiKey === "string"
) {
openaiStreamingRaw.apiKey = legacyStreamingRaw.openaiApiKey;
}
if (
typeof openaiStreamingRaw.model !== "string" &&
typeof legacyStreamingRaw.sttModel === "string"
) {
openaiStreamingRaw.model = legacyStreamingRaw.sttModel;
}
if (
openaiStreamingRaw.silenceDurationMs == null &&
typeof legacyStreamingRaw.silenceDurationMs === "number"
) {
openaiStreamingRaw.silenceDurationMs = legacyStreamingRaw.silenceDurationMs;
}
if (
openaiStreamingRaw.vadThreshold == null &&
typeof legacyStreamingRaw.vadThreshold === "number"
) {
openaiStreamingRaw.vadThreshold = legacyStreamingRaw.vadThreshold;
}
if (typeof openaiStreamingRaw.apiKey !== "string" || !openaiStreamingRaw.apiKey.trim()) {
if (process.env.OPENAI_API_KEY) {
openaiStreamingRaw.apiKey = process.env.OPENAI_API_KEY;
}
}
if (
typeof openaiStreamingRaw.model !== "string" &&
typeof process.env.REALTIME_TRANSCRIPTION_MODEL === "string"
) {
openaiStreamingRaw.model = process.env.REALTIME_TRANSCRIPTION_MODEL;
}
if (
typeof openaiStreamingRaw.model !== "string" &&
typeof process.env.STREAMING_STT_MODEL === "string"
) {
openaiStreamingRaw.model = process.env.STREAMING_STT_MODEL;
}
if (openaiStreamingRaw.vadThreshold == null && typeof process.env.VAD_THRESHOLD === "string") {
openaiStreamingRaw.vadThreshold = Number.parseFloat(process.env.VAD_THRESHOLD);
}
if (
openaiStreamingRaw.silenceDurationMs == null &&
typeof process.env.SILENCE_DURATION_MS === "string"
) {
openaiStreamingRaw.silenceDurationMs = Number.parseInt(process.env.SILENCE_DURATION_MS, 10);
}
if (Object.keys(openaiStreamingRaw).length > 0) {
resolved.streaming.providers.openai = openaiStreamingRaw;
}
if (
typeof resolved.streaming.provider === "string" &&
resolved.streaming.provider.trim() &&
!(resolved.streaming.provider in resolved.streaming.providers)
) {
resolved.streaming.providers[resolved.streaming.provider] = {};
}
// Keep parsing legacy OpenAI-shaped fields, but isolate them to the OpenAI provider blob.
resolved.streaming = mergeLegacyStreamingOpenAICompat(resolved.streaming);
resolved.realtime = {
...resolved.realtime,
providers: { ...(resolved.realtime.providers ?? {}) },
};
const openaiRealtimeRaw =
resolved.realtime.providers.openai && typeof resolved.realtime.providers.openai === "object"
? { ...(resolved.realtime.providers.openai as Record<string, unknown>) }
: {};
if (typeof openaiRealtimeRaw.apiKey !== "string" || !openaiRealtimeRaw.apiKey.trim()) {
if (process.env.OPENAI_API_KEY) {
openaiRealtimeRaw.apiKey = process.env.OPENAI_API_KEY;
}
}
if (
typeof openaiRealtimeRaw.model !== "string" &&
typeof process.env.REALTIME_VOICE_MODEL === "string"
) {
openaiRealtimeRaw.model = process.env.REALTIME_VOICE_MODEL;
}
if (
typeof openaiRealtimeRaw.voice !== "string" &&
typeof process.env.REALTIME_VOICE_VOICE === "string"
) {
openaiRealtimeRaw.voice = process.env.REALTIME_VOICE_VOICE;
}
resolved.realtime = mergeLegacyRealtimeOpenAICompat(resolved.realtime);
if (
typeof resolved.realtime.instructions !== "string" &&
typeof process.env.REALTIME_VOICE_INSTRUCTIONS === "string"
) {
resolved.realtime.instructions = process.env.REALTIME_VOICE_INSTRUCTIONS;
}
if (
openaiRealtimeRaw.temperature == null &&
typeof process.env.REALTIME_VOICE_TEMPERATURE === "string"
) {
openaiRealtimeRaw.temperature = Number.parseFloat(process.env.REALTIME_VOICE_TEMPERATURE);
}
if (openaiRealtimeRaw.vadThreshold == null && typeof process.env.VAD_THRESHOLD === "string") {
openaiRealtimeRaw.vadThreshold = Number.parseFloat(process.env.VAD_THRESHOLD);
}
if (
openaiRealtimeRaw.silenceDurationMs == null &&
typeof process.env.SILENCE_DURATION_MS === "string"
) {
openaiRealtimeRaw.silenceDurationMs = Number.parseInt(process.env.SILENCE_DURATION_MS, 10);
}
if (Object.keys(openaiRealtimeRaw).length > 0) {
resolved.realtime.providers.openai = openaiRealtimeRaw;
}
if (
typeof resolved.realtime.provider === "string" &&
resolved.realtime.provider.trim() &&
!(resolved.realtime.provider in resolved.realtime.providers)
) {
resolved.realtime.providers[resolved.realtime.provider] = {};
}
return normalizeVoiceCallConfig(resolved);
}

View File

@@ -30,7 +30,6 @@ export function createVoiceCallBaseConfig(params?: {
},
streaming: {
enabled: false,
provider: "openai",
providers: {
openai: {
model: "gpt-4o-transcribe",
@@ -51,7 +50,6 @@ export function createVoiceCallBaseConfig(params?: {
providers: {},
},
skipSignatureVerification: false,
stt: { provider: "openai", model: "whisper-1" },
tts: {
provider: "openai",
providers: {

View File

@@ -24,12 +24,16 @@ const mocks = vi.hoisted(() => {
};
return {
getRealtimeTranscriptionProvider: vi.fn(() => realtimeTranscriptionProvider),
getRealtimeTranscriptionProvider: vi.fn<
(...args: unknown[]) => RealtimeTranscriptionProviderPlugin | undefined
>(() => realtimeTranscriptionProvider),
listRealtimeTranscriptionProviders: vi.fn(() => [realtimeTranscriptionProvider]),
};
});
vi.mock("./realtime-transcription.runtime.js", () => ({
getRealtimeTranscriptionProvider: mocks.getRealtimeTranscriptionProvider,
listRealtimeTranscriptionProviders: mocks.listRealtimeTranscriptionProviders,
}));
const provider: VoiceCallProvider = {
@@ -110,6 +114,48 @@ function expectWebhookUrl(url: string, expectedPath: string) {
expect(parsed.port).not.toBe("0");
}
describe("VoiceCallWebhookServer realtime transcription provider selection", () => {
it("auto-selects the first registered provider when streaming.provider is unset", async () => {
const { manager } = createManager([]);
const config = createConfig({
streaming: {
...createConfig().streaming,
enabled: true,
providers: {
openai: {
apiKey: "sk-test", // pragma: allowlist secret
},
},
},
});
const autoSelectedProvider: RealtimeTranscriptionProviderPlugin = {
id: "openai",
label: "OpenAI",
autoSelectOrder: 5,
isConfigured: () => true,
resolveConfig: ({ rawConfig }) => rawConfig,
createSession: () => ({
connect: async () => {},
sendAudio: () => {},
close: () => {},
isConnected: () => true,
}),
};
mocks.getRealtimeTranscriptionProvider.mockReturnValueOnce(undefined);
mocks.listRealtimeTranscriptionProviders.mockReturnValueOnce([autoSelectedProvider]);
const server = new VoiceCallWebhookServer(config, manager, provider);
try {
await server.start();
expect(mocks.getRealtimeTranscriptionProvider).toHaveBeenCalledWith(undefined, undefined);
expect(mocks.listRealtimeTranscriptionProviders).toHaveBeenCalledWith(undefined);
expect(server.getMediaStreamHandler()).toBeTruthy();
} finally {
await server.stop();
}
});
});
async function runStaleCallReaperCase(params: {
callAgeMs: number;
staleCallReaperSeconds: number;

View File

@@ -158,18 +158,32 @@ export class VoiceCallWebhookServer {
*/
private async initializeMediaStreaming(): Promise<void> {
const streaming = this.config.streaming;
const selectedProviderId = streaming.provider;
const pluginConfig = this.coreConfig as unknown as OpenClawConfig | undefined;
const { getRealtimeTranscriptionProvider } =
const { getRealtimeTranscriptionProvider, listRealtimeTranscriptionProviders } =
await import("./realtime-transcription.runtime.js");
const provider = getRealtimeTranscriptionProvider(selectedProviderId, pluginConfig);
if (!provider) {
const selectedProviderId = streaming.provider?.trim();
const configuredProvider = getRealtimeTranscriptionProvider(selectedProviderId, pluginConfig);
if (selectedProviderId && !configuredProvider) {
console.warn(
`[voice-call] Streaming enabled but realtime transcription provider "${selectedProviderId}" is not registered`,
);
return;
}
const provider =
configuredProvider ??
[...listRealtimeTranscriptionProviders(pluginConfig)].sort(
(left, right) =>
(left.autoSelectOrder ?? Number.MAX_SAFE_INTEGER) -
(right.autoSelectOrder ?? Number.MAX_SAFE_INTEGER),
)[0];
if (!provider) {
console.warn(
"[voice-call] Streaming enabled but no realtime transcription provider is registered",
);
return;
}
const selectedProviderConfig =
selectedProviderId &&
streaming.providers[selectedProviderId] &&
typeof streaming.providers[selectedProviderId] === "object"
? (streaming.providers[selectedProviderId] as Record<string, unknown>)