refactor(voice-call): clean provider boundaries
@@ -1,7 +1,13 @@
import { describe, expect, it } from "vitest";
import { afterEach, describe, expect, it } from "vitest";
import { buildOpenAIRealtimeTranscriptionProvider } from "./realtime-transcription-provider.js";

describe("buildOpenAIRealtimeTranscriptionProvider", () => {
  const originalEnv = { ...process.env };

  afterEach(() => {
    process.env = { ...originalEnv };
  });

  it("normalizes OpenAI config defaults", () => {
    const provider = buildOpenAIRealtimeTranscriptionProvider();
    const resolved = provider.resolveConfig?.({
@@ -20,6 +26,24 @@ describe("buildOpenAIRealtimeTranscriptionProvider", () => {
    });
  });

  it("reads provider-owned env fallbacks", () => {
    process.env.REALTIME_TRANSCRIPTION_MODEL = "gpt-4o-transcribe";
    process.env.SILENCE_DURATION_MS = "900";
    process.env.VAD_THRESHOLD = "0.45";

    const provider = buildOpenAIRealtimeTranscriptionProvider();
    const resolved = provider.resolveConfig?.({
      cfg: {} as never,
      rawConfig: {},
    });

    expect(resolved).toEqual({
      model: "gpt-4o-transcribe",
      silenceDurationMs: 900,
      vadThreshold: 0.45,
    });
  });

  it("accepts the legacy openai-realtime alias", () => {
    const provider = buildOpenAIRealtimeTranscriptionProvider();
    expect(provider.aliases).toContain("openai-realtime");

@@ -57,9 +57,21 @@ function normalizeProviderConfig(
      value: raw?.openaiApiKey,
      path: "plugins.entries.voice-call.config.streaming.openaiApiKey",
    }),
    model: trimToUndefined(raw?.model) ?? trimToUndefined(raw?.sttModel),
    silenceDurationMs: asNumber(raw?.silenceDurationMs),
    vadThreshold: asNumber(raw?.vadThreshold),
    model:
      trimToUndefined(raw?.model) ??
      trimToUndefined(raw?.sttModel) ??
      trimToUndefined(process.env.REALTIME_TRANSCRIPTION_MODEL) ??
      trimToUndefined(process.env.STREAMING_STT_MODEL),
    silenceDurationMs:
      asNumber(raw?.silenceDurationMs) ??
      (typeof process.env.SILENCE_DURATION_MS === "string"
        ? Number.parseInt(process.env.SILENCE_DURATION_MS, 10)
        : undefined),
    vadThreshold:
      asNumber(raw?.vadThreshold) ??
      (typeof process.env.VAD_THRESHOLD === "string"
        ? Number.parseFloat(process.env.VAD_THRESHOLD)
        : undefined),
  };
}

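The new resolveConfig above prefers explicit config over provider-owned env vars. A minimal standalone sketch of that precedence chain, assuming trimToUndefined collapses empty or whitespace-only strings to undefined (the helper below is a stand-in for illustration, not the plugin's export):

// Stand-in helper: empty or whitespace-only strings become undefined.
const trimToUndefined = (value: unknown): string | undefined =>
  typeof value === "string" && value.trim() ? value.trim() : undefined;

// Resolution order: config field, legacy alias, provider env var, legacy env var.
function resolveModel(raw: { model?: string; sttModel?: string }): string | undefined {
  return (
    trimToUndefined(raw.model) ??
    trimToUndefined(raw.sttModel) ??
    trimToUndefined(process.env.REALTIME_TRANSCRIPTION_MODEL) ??
    trimToUndefined(process.env.STREAMING_STT_MODEL)
  );
}

// resolveModel({ model: " gpt-4o-transcribe " })          -> "gpt-4o-transcribe"
// resolveModel({}) with REALTIME_TRANSCRIPTION_MODEL set  -> that env value
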
extensions/openai/realtime-voice-provider.test.ts (new file, 32 lines)
@@ -0,0 +1,32 @@
import { afterEach, describe, expect, it } from "vitest";
import { buildOpenAIRealtimeVoiceProvider } from "./realtime-voice-provider.js";

describe("buildOpenAIRealtimeVoiceProvider", () => {
  const originalEnv = { ...process.env };

  afterEach(() => {
    process.env = { ...originalEnv };
  });

  it("normalizes provider-owned env fallbacks", () => {
    process.env.REALTIME_VOICE_MODEL = "gpt-realtime";
    process.env.REALTIME_VOICE_VOICE = "verse";
    process.env.REALTIME_VOICE_TEMPERATURE = "0.6";
    process.env.SILENCE_DURATION_MS = "850";
    process.env.VAD_THRESHOLD = "0.35";

    const provider = buildOpenAIRealtimeVoiceProvider();
    const resolved = provider.resolveConfig?.({
      cfg: {} as never,
      rawConfig: {},
    });

    expect(resolved).toEqual({
      model: "gpt-realtime",
      voice: "verse",
      temperature: 0.6,
      silenceDurationMs: 850,
      vadThreshold: 0.35,
    });
  });
});

@@ -103,11 +103,25 @@ function normalizeProviderConfig(
      value: raw?.apiKey,
      path: "plugins.entries.voice-call.config.realtime.providers.openai.apiKey",
    }),
    model: trimToUndefined(raw?.model),
    voice: raw?.voice as OpenAIRealtimeVoice | undefined,
    temperature: asNumber(raw?.temperature),
    vadThreshold: asNumber(raw?.vadThreshold),
    silenceDurationMs: asNumber(raw?.silenceDurationMs),
    model: trimToUndefined(raw?.model) ?? trimToUndefined(process.env.REALTIME_VOICE_MODEL),
    voice: (trimToUndefined(raw?.voice) ?? trimToUndefined(process.env.REALTIME_VOICE_VOICE)) as
      | OpenAIRealtimeVoice
      | undefined,
    temperature:
      asNumber(raw?.temperature) ??
      (typeof process.env.REALTIME_VOICE_TEMPERATURE === "string"
        ? Number.parseFloat(process.env.REALTIME_VOICE_TEMPERATURE)
        : undefined),
    vadThreshold:
      asNumber(raw?.vadThreshold) ??
      (typeof process.env.VAD_THRESHOLD === "string"
        ? Number.parseFloat(process.env.VAD_THRESHOLD)
        : undefined),
    silenceDurationMs:
      asNumber(raw?.silenceDurationMs) ??
      (typeof process.env.SILENCE_DURATION_MS === "string"
        ? Number.parseInt(process.env.SILENCE_DURATION_MS, 10)
        : undefined),
    prefixPaddingMs: asNumber(raw?.prefixPaddingMs),
    azureEndpoint: trimToUndefined(raw?.azureEndpoint),
    azureDeployment: trimToUndefined(raw?.azureDeployment),

@@ -76,7 +76,15 @@ Put under `plugins.entries.voice-call.config`:

streaming: {
  enabled: true,
  // optional; if omitted, Voice Call picks the first registered
  // realtime-transcription provider by autoSelectOrder
  provider: "openai",
  streamPath: "/voice/stream",
  providers: {
    openai: {
      model: "gpt-4o-transcribe",
    },
  },
  preStartTimeoutMs: 5000,
  maxPendingConnections: 32,
  maxPendingConnectionsPerIp: 4,

@@ -145,4 +153,4 @@ Actions:

- While a Twilio stream is active, playback does not fall back to TwiML `<Say>`; stream-TTS failures fail the playback request.
- Outbound conversation calls suppress barge-in only while the initial greeting is actively speaking, then re-enable normal interruption.
- Twilio stream disconnect auto-end uses a short grace window so quick reconnects do not end the call.
- Media streaming requires `ws` plus a configured realtime-transcription provider. The bundled provider today is OpenAI.
- Realtime provider selection is generic. Configure `streaming.provider` / `realtime.provider` and put provider-owned options under `providers.<id>` (see the sketch just below).

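For reference, a hedged sketch of the provider-neutral layout the last bullet describes; the `deepgram` id and its options are purely illustrative, not a bundled provider:

streaming: {
  enabled: true,
  provider: "deepgram", // optional; omit to auto-select by autoSelectOrder
  providers: {
    deepgram: { model: "nova-2" }, // hypothetical third-party provider options
    openai: { model: "gpt-4o-transcribe" },
  },
},
realtime: {
  enabled: true,
  provider: "openai",
  providers: {
    openai: { model: "gpt-realtime", voice: "verse" },
  },
},
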
@@ -72,45 +72,28 @@ const voiceCallConfigSchema = {
    advanced: true,
  },
  "streaming.enabled": { label: "Enable Streaming", advanced: true },
  "streaming.provider": { label: "Streaming Provider", advanced: true },
  "streaming.providers.openai.apiKey": {
    label: "OpenAI Realtime API Key",
    sensitive: true,
  "streaming.provider": {
    label: "Streaming Provider",
    help: "Uses the first registered realtime transcription provider when unset.",
    advanced: true,
  },
  "streaming.providers.openai.model": { label: "Realtime STT Model", advanced: true },
  "streaming.providers": { label: "Streaming Provider Config", advanced: true },
  "streaming.streamPath": { label: "Media Stream Path", advanced: true },
  "realtime.enabled": { label: "Enable Realtime Voice", advanced: true },
  "realtime.provider": { label: "Realtime Voice Provider", advanced: true },
  "realtime.streamPath": { label: "Realtime Stream Path", advanced: true },
  "realtime.instructions": { label: "Realtime Instructions", advanced: true },
  "realtime.providers.openai.apiKey": {
    label: "OpenAI Realtime API Key",
    sensitive: true,
  "realtime.provider": {
    label: "Realtime Voice Provider",
    help: "Uses the first registered realtime voice provider when unset.",
    advanced: true,
  },
  "realtime.providers.openai.model": { label: "OpenAI Realtime Model", advanced: true },
  "realtime.providers.openai.voice": { label: "OpenAI Realtime Voice", advanced: true },
  "realtime.streamPath": { label: "Realtime Stream Path", advanced: true },
  "realtime.instructions": { label: "Realtime Instructions", advanced: true },
  "realtime.providers": { label: "Realtime Provider Config", advanced: true },
  "tts.provider": {
    label: "TTS Provider Override",
    help: "Deep-merges with messages.tts (Microsoft is ignored for calls).",
    advanced: true,
  },
  "tts.providers.openai.model": { label: "OpenAI TTS Model", advanced: true },
  "tts.providers.openai.voice": { label: "OpenAI TTS Voice", advanced: true },
  "tts.providers.openai.apiKey": {
    label: "OpenAI API Key",
    sensitive: true,
    advanced: true,
  },
  "tts.providers.elevenlabs.modelId": { label: "ElevenLabs Model ID", advanced: true },
  "tts.providers.elevenlabs.voiceId": { label: "ElevenLabs Voice ID", advanced: true },
  "tts.providers.elevenlabs.apiKey": {
    label: "ElevenLabs API Key",
    sensitive: true,
    advanced: true,
  },
  "tts.providers.elevenlabs.baseUrl": { label: "ElevenLabs Base URL", advanced: true },
  "tts.providers": { label: "TTS Provider Config", advanced: true },
  publicUrl: { label: "Public Webhook URL", advanced: true },
  skipSignatureVerification: {
    label: "Skip Signature Verification",

@@ -88,54 +88,45 @@
  },
  "streaming.provider": {
    "label": "Streaming Provider",
    "help": "Uses the first registered realtime transcription provider when unset.",
    "advanced": true
  },
  "streaming.providers.openai.apiKey": {
    "label": "OpenAI Realtime API Key",
    "sensitive": true,
    "advanced": true
  },
  "streaming.providers.openai.model": {
    "label": "Realtime STT Model",
  "streaming.providers": {
    "label": "Streaming Provider Config",
    "advanced": true
  },
  "streaming.streamPath": {
    "label": "Media Stream Path",
    "advanced": true
  },
  "realtime.enabled": {
    "label": "Enable Realtime Voice",
    "advanced": true
  },
  "realtime.provider": {
    "label": "Realtime Voice Provider",
    "help": "Uses the first registered realtime voice provider when unset.",
    "advanced": true
  },
  "realtime.streamPath": {
    "label": "Realtime Stream Path",
    "advanced": true
  },
  "realtime.instructions": {
    "label": "Realtime Instructions",
    "advanced": true
  },
  "realtime.providers": {
    "label": "Realtime Provider Config",
    "advanced": true
  },
  "tts.provider": {
    "label": "TTS Provider Override",
    "help": "Deep-merges with messages.tts (Microsoft is ignored for calls).",
    "advanced": true
  },
  "tts.providers.openai.model": {
    "label": "OpenAI TTS Model",
    "advanced": true
  },
  "tts.providers.openai.voice": {
    "label": "OpenAI TTS Voice",
    "advanced": true
  },
  "tts.providers.openai.apiKey": {
    "label": "OpenAI API Key",
    "sensitive": true,
    "advanced": true
  },
  "tts.providers.elevenlabs.modelId": {
    "label": "ElevenLabs Model ID",
    "advanced": true
  },
  "tts.providers.elevenlabs.voiceId": {
    "label": "ElevenLabs Voice ID",
    "advanced": true
  },
  "tts.providers.elevenlabs.apiKey": {
    "label": "ElevenLabs API Key",
    "sensitive": true,
    "advanced": true
  },
  "tts.providers.elevenlabs.baseUrl": {
    "label": "ElevenLabs Base URL",
  "tts.providers": {
    "label": "TTS Provider Config",
    "advanced": true
  },
  "publicUrl": {
@@ -470,19 +461,6 @@
  "skipSignatureVerification": {
    "type": "boolean"
  },
  "stt": {
    "type": "object",
    "additionalProperties": false,
    "properties": {
      "provider": {
        "type": "string",
        "enum": ["openai"]
      },
      "model": {
        "type": "string"
      }
    }
  },
  "tts": {
    "type": "object",
    "additionalProperties": false,

@@ -223,8 +223,8 @@ describe("normalizeVoiceCallConfig", () => {

    expect(normalized.serve.path).toBe("/voice/webhook");
    expect(normalized.streaming.streamPath).toBe("/custom-stream");
    expect(normalized.streaming.provider).toBe("openai");
    expect(normalized.streaming.providers.openai).toEqual({});
    expect(normalized.streaming.provider).toBeUndefined();
    expect(normalized.streaming.providers).toEqual({});
    expect(normalized.realtime.streamPath).toBe("/voice/stream/realtime");
    expect(normalized.tunnel.provider).toBe("none");
    expect(normalized.webhookSecurity.allowedHosts).toEqual([]);

@@ -271,3 +271,48 @@ describe("normalizeVoiceCallConfig", () => {
    expect(elevenlabs.voiceSettings).toEqual({ speed: 1.1 });
  });
});

describe("resolveVoiceCallConfig", () => {
  const originalEnv = { ...process.env };

  afterEach(() => {
    process.env = { ...originalEnv };
  });

  it("keeps legacy streaming OpenAI fields inside providers.openai without forcing provider selection", () => {
    const resolved = resolveVoiceCallConfig({
      enabled: true,
      provider: "twilio",
      streaming: {
        enabled: true,
        openaiApiKey: "sk-test", // pragma: allowlist secret
        sttModel: "gpt-4o-transcribe",
        silenceDurationMs: 700,
        vadThreshold: 0.4,
      },
    });

    expect(resolved.streaming.provider).toBeUndefined();
    expect(resolved.streaming.providers.openai).toEqual({
      apiKey: "sk-test",
      model: "gpt-4o-transcribe",
      silenceDurationMs: 700,
      vadThreshold: 0.4,
    });
  });

  it("maps realtime instructions from the legacy env hook without altering provider selection", () => {
    process.env.REALTIME_VOICE_INSTRUCTIONS = "Stay concise.";

    const resolved = resolveVoiceCallConfig({
      enabled: true,
      provider: "twilio",
      realtime: {
        enabled: true,
      },
    });

    expect(resolved.realtime.instructions).toBe("Stay concise.");
    expect(resolved.realtime.provider).toBeUndefined();
  });
});

@@ -64,21 +64,6 @@ export const PlivoConfigSchema = z
  .strict();
export type PlivoConfig = z.infer<typeof PlivoConfigSchema>;

// -----------------------------------------------------------------------------
// STT/TTS Configuration
// -----------------------------------------------------------------------------

export const SttConfigSchema = z
  .object({
    /** One-shot STT provider for non-streaming paths. */
    provider: z.literal("openai").default("openai"),
    /** Whisper model to use */
    model: z.string().min(1).default("whisper-1"),
  })
  .strict()
  .default({ provider: "openai", model: "whisper-1" });
export type SttConfig = z.infer<typeof SttConfigSchema>;

export { TtsAutoSchema, TtsConfigSchema, TtsModeSchema, TtsProviderSchema };
export type VoiceCallTtsConfig = z.infer<typeof TtsConfigSchema>;

@@ -255,7 +240,7 @@ export const VoiceCallStreamingConfigSchema = z
    /** Enable real-time audio streaming (requires WebSocket support) */
    enabled: z.boolean().default(false),
    /** Provider id from registered realtime transcription providers. */
    provider: z.string().min(1).default("openai"),
    provider: z.string().min(1).optional(),
    /** @deprecated Legacy alias for provider. */
    sttProvider: z.string().min(1).optional(),
    /** @deprecated Legacy OpenAI-specific API key field. */
@@ -285,7 +270,6 @@ export const VoiceCallStreamingConfigSchema = z
  .strict()
  .default({
    enabled: false,
    provider: "openai",
    streamPath: "/voice/stream",
    providers: {},
    preStartTimeoutMs: 5000,
@@ -381,9 +365,6 @@ export const VoiceCallConfigSchema = z
    /** Skip webhook signature verification (development only, NOT for production) */
    skipSignatureVerification: z.boolean().default(false),

    /** STT configuration */
    stt: SttConfigSchema,

    /** TTS override (deep-merges with core messages.tts) */
    tts: TtsConfigSchema,

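The schema change above is what removes the implicit OpenAI pin. A minimal zod illustration of the before/after parse behavior, with the provider field isolated from the real schema:

import { z } from "zod";

// Before: provider defaulted to "openai", so an empty block forced selection.
const before = z.object({ provider: z.string().min(1).default("openai") });
// After: provider is optional, deferring selection to autoSelectOrder at runtime.
const after = z.object({ provider: z.string().min(1).optional() });

before.parse({}); // => { provider: "openai" }
after.parse({});  // => {}
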
@@ -467,36 +448,73 @@ function sanitizeVoiceCallProviderConfigs(
  );
}

function mergeLegacyStreamingOpenAICompat(
  streaming: VoiceCallStreamingConfig,
): VoiceCallStreamingConfig {
  const providers = { ...(streaming.providers ?? {}) };
  const legacyStreamingRaw = streaming as Record<string, unknown>;
  const openaiRaw =
    providers.openai && typeof providers.openai === "object"
      ? { ...(providers.openai as Record<string, unknown>) }
      : {};

  if (typeof openaiRaw.apiKey !== "string" && typeof legacyStreamingRaw.openaiApiKey === "string") {
    openaiRaw.apiKey = legacyStreamingRaw.openaiApiKey;
  }
  if (typeof openaiRaw.model !== "string" && typeof legacyStreamingRaw.sttModel === "string") {
    openaiRaw.model = legacyStreamingRaw.sttModel;
  }
  if (
    openaiRaw.silenceDurationMs == null &&
    typeof legacyStreamingRaw.silenceDurationMs === "number"
  ) {
    openaiRaw.silenceDurationMs = legacyStreamingRaw.silenceDurationMs;
  }
  if (openaiRaw.vadThreshold == null && typeof legacyStreamingRaw.vadThreshold === "number") {
    openaiRaw.vadThreshold = legacyStreamingRaw.vadThreshold;
  }
  if (Object.keys(openaiRaw).length > 0) {
    providers.openai = openaiRaw;
  }

  return {
    ...streaming,
    providers,
  };
}

function mergeLegacyRealtimeOpenAICompat(
  realtime: VoiceCallRealtimeConfig,
): VoiceCallRealtimeConfig {
  const providers = { ...(realtime.providers ?? {}) };
  const openaiRaw =
    providers.openai && typeof providers.openai === "object"
      ? { ...(providers.openai as Record<string, unknown>) }
      : {};

  if (Object.keys(openaiRaw).length > 0) {
    providers.openai = openaiRaw;
  }

  return {
    ...realtime,
    providers,
  };
}

export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallConfig {
  const defaults = cloneDefaultVoiceCallConfig();
  const serve = { ...defaults.serve, ...config.serve };
  const streamingProvider =
    config.streaming?.provider ??
    (typeof config.streaming?.sttProvider === "string"
      ? config.streaming.sttProvider
      : undefined) ??
    defaults.streaming.provider;
    (typeof config.streaming?.sttProvider === "string" ? config.streaming.sttProvider : undefined);
  const streamingProviders = sanitizeVoiceCallProviderConfigs(
    config.streaming?.providers ?? defaults.streaming.providers,
  );
  if (
    typeof streamingProvider === "string" &&
    streamingProvider.trim() &&
    !(streamingProvider in streamingProviders)
  ) {
    streamingProviders[streamingProvider] = {};
  }
  const realtimeProvider = config.realtime?.provider ?? defaults.realtime.provider;
  const realtimeProviders = sanitizeVoiceCallProviderConfigs(
    config.realtime?.providers ?? defaults.realtime.providers,
  );
  if (
    typeof realtimeProvider === "string" &&
    realtimeProvider.trim() &&
    !(realtimeProvider in realtimeProviders)
  ) {
    realtimeProviders[realtimeProvider] = {};
  }
  return {
    ...defaults,
    ...config,
@@ -529,7 +547,6 @@ export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallConfig {
      (config.realtime?.tools as RealtimeToolConfig[] | undefined) ?? defaults.realtime.tools,
      providers: realtimeProviders,
    },
    stt: { ...defaults.stt, ...config.stt },
    tts: normalizeVoiceCallTtsConfig(defaults.tts, config.tts),
  };
}

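A hedged standalone illustration of the streaming merge rule above, using simplified types rather than the plugin's exports: explicit providers.openai fields win, and legacy flat fields only fill gaps.

type StreamingLike = {
  openaiApiKey?: string;
  sttModel?: string;
  providers?: { openai?: { apiKey?: string; model?: string } };
};

function mergeLegacy(streaming: StreamingLike): StreamingLike {
  const openai = { ...(streaming.providers?.openai ?? {}) };
  if (openai.apiKey === undefined && typeof streaming.openaiApiKey === "string") {
    openai.apiKey = streaming.openaiApiKey; // legacy field fills the gap
  }
  if (openai.model === undefined && typeof streaming.sttModel === "string") {
    openai.model = streaming.sttModel;
  }
  return { ...streaming, providers: { ...streaming.providers, openai } };
}

// mergeLegacy({ openaiApiKey: "sk-legacy", providers: { openai: { model: "gpt-4o-transcribe" } } })
//   -> providers.openai === { model: "gpt-4o-transcribe", apiKey: "sk-legacy" }
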
@@ -584,132 +601,16 @@ export function resolveVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallConfig {
    resolved.webhookSecurity.trustForwardingHeaders ?? false;
  resolved.webhookSecurity.trustedProxyIPs = resolved.webhookSecurity.trustedProxyIPs ?? [];

  resolved.streaming = {
    ...resolved.streaming,
    providers: { ...(resolved.streaming.providers ?? {}) },
  };
  const legacyStreamingRaw = resolved.streaming as Record<string, unknown>;
  const openaiStreamingRaw =
    resolved.streaming.providers.openai && typeof resolved.streaming.providers.openai === "object"
      ? { ...(resolved.streaming.providers.openai as Record<string, unknown>) }
      : {};
  if (
    typeof openaiStreamingRaw.apiKey !== "string" &&
    typeof legacyStreamingRaw.openaiApiKey === "string"
  ) {
    openaiStreamingRaw.apiKey = legacyStreamingRaw.openaiApiKey;
  }
  if (
    typeof openaiStreamingRaw.model !== "string" &&
    typeof legacyStreamingRaw.sttModel === "string"
  ) {
    openaiStreamingRaw.model = legacyStreamingRaw.sttModel;
  }
  if (
    openaiStreamingRaw.silenceDurationMs == null &&
    typeof legacyStreamingRaw.silenceDurationMs === "number"
  ) {
    openaiStreamingRaw.silenceDurationMs = legacyStreamingRaw.silenceDurationMs;
  }
  if (
    openaiStreamingRaw.vadThreshold == null &&
    typeof legacyStreamingRaw.vadThreshold === "number"
  ) {
    openaiStreamingRaw.vadThreshold = legacyStreamingRaw.vadThreshold;
  }
  if (typeof openaiStreamingRaw.apiKey !== "string" || !openaiStreamingRaw.apiKey.trim()) {
    if (process.env.OPENAI_API_KEY) {
      openaiStreamingRaw.apiKey = process.env.OPENAI_API_KEY;
    }
  }
  if (
    typeof openaiStreamingRaw.model !== "string" &&
    typeof process.env.REALTIME_TRANSCRIPTION_MODEL === "string"
  ) {
    openaiStreamingRaw.model = process.env.REALTIME_TRANSCRIPTION_MODEL;
  }
  if (
    typeof openaiStreamingRaw.model !== "string" &&
    typeof process.env.STREAMING_STT_MODEL === "string"
  ) {
    openaiStreamingRaw.model = process.env.STREAMING_STT_MODEL;
  }
  if (openaiStreamingRaw.vadThreshold == null && typeof process.env.VAD_THRESHOLD === "string") {
    openaiStreamingRaw.vadThreshold = Number.parseFloat(process.env.VAD_THRESHOLD);
  }
  if (
    openaiStreamingRaw.silenceDurationMs == null &&
    typeof process.env.SILENCE_DURATION_MS === "string"
  ) {
    openaiStreamingRaw.silenceDurationMs = Number.parseInt(process.env.SILENCE_DURATION_MS, 10);
  }
  if (Object.keys(openaiStreamingRaw).length > 0) {
    resolved.streaming.providers.openai = openaiStreamingRaw;
  }
  if (
    typeof resolved.streaming.provider === "string" &&
    resolved.streaming.provider.trim() &&
    !(resolved.streaming.provider in resolved.streaming.providers)
  ) {
    resolved.streaming.providers[resolved.streaming.provider] = {};
  }
  // Keep parsing legacy OpenAI-shaped fields, but isolate them to the OpenAI provider blob.
  resolved.streaming = mergeLegacyStreamingOpenAICompat(resolved.streaming);

  resolved.realtime = {
    ...resolved.realtime,
    providers: { ...(resolved.realtime.providers ?? {}) },
  };
  const openaiRealtimeRaw =
    resolved.realtime.providers.openai && typeof resolved.realtime.providers.openai === "object"
      ? { ...(resolved.realtime.providers.openai as Record<string, unknown>) }
      : {};
  if (typeof openaiRealtimeRaw.apiKey !== "string" || !openaiRealtimeRaw.apiKey.trim()) {
    if (process.env.OPENAI_API_KEY) {
      openaiRealtimeRaw.apiKey = process.env.OPENAI_API_KEY;
    }
  }
  if (
    typeof openaiRealtimeRaw.model !== "string" &&
    typeof process.env.REALTIME_VOICE_MODEL === "string"
  ) {
    openaiRealtimeRaw.model = process.env.REALTIME_VOICE_MODEL;
  }
  if (
    typeof openaiRealtimeRaw.voice !== "string" &&
    typeof process.env.REALTIME_VOICE_VOICE === "string"
  ) {
    openaiRealtimeRaw.voice = process.env.REALTIME_VOICE_VOICE;
  }
  resolved.realtime = mergeLegacyRealtimeOpenAICompat(resolved.realtime);
  if (
    typeof resolved.realtime.instructions !== "string" &&
    typeof process.env.REALTIME_VOICE_INSTRUCTIONS === "string"
  ) {
    resolved.realtime.instructions = process.env.REALTIME_VOICE_INSTRUCTIONS;
  }
  if (
    openaiRealtimeRaw.temperature == null &&
    typeof process.env.REALTIME_VOICE_TEMPERATURE === "string"
  ) {
    openaiRealtimeRaw.temperature = Number.parseFloat(process.env.REALTIME_VOICE_TEMPERATURE);
  }
  if (openaiRealtimeRaw.vadThreshold == null && typeof process.env.VAD_THRESHOLD === "string") {
    openaiRealtimeRaw.vadThreshold = Number.parseFloat(process.env.VAD_THRESHOLD);
  }
  if (
    openaiRealtimeRaw.silenceDurationMs == null &&
    typeof process.env.SILENCE_DURATION_MS === "string"
  ) {
    openaiRealtimeRaw.silenceDurationMs = Number.parseInt(process.env.SILENCE_DURATION_MS, 10);
  }
  if (Object.keys(openaiRealtimeRaw).length > 0) {
    resolved.realtime.providers.openai = openaiRealtimeRaw;
  }
  if (
    typeof resolved.realtime.provider === "string" &&
    resolved.realtime.provider.trim() &&
    !(resolved.realtime.provider in resolved.realtime.providers)
  ) {
    resolved.realtime.providers[resolved.realtime.provider] = {};
  }

  return normalizeVoiceCallConfig(resolved);
}

@@ -30,7 +30,6 @@ export function createVoiceCallBaseConfig(params?: {
    },
    streaming: {
      enabled: false,
      provider: "openai",
      providers: {
        openai: {
          model: "gpt-4o-transcribe",
@@ -51,7 +50,6 @@ export function createVoiceCallBaseConfig(params?: {
      providers: {},
    },
    skipSignatureVerification: false,
    stt: { provider: "openai", model: "whisper-1" },
    tts: {
      provider: "openai",
      providers: {

@@ -24,12 +24,16 @@ const mocks = vi.hoisted(() => {
  };

  return {
    getRealtimeTranscriptionProvider: vi.fn(() => realtimeTranscriptionProvider),
    getRealtimeTranscriptionProvider: vi.fn<
      (...args: unknown[]) => RealtimeTranscriptionProviderPlugin | undefined
    >(() => realtimeTranscriptionProvider),
    listRealtimeTranscriptionProviders: vi.fn(() => [realtimeTranscriptionProvider]),
  };
});

vi.mock("./realtime-transcription.runtime.js", () => ({
  getRealtimeTranscriptionProvider: mocks.getRealtimeTranscriptionProvider,
  listRealtimeTranscriptionProviders: mocks.listRealtimeTranscriptionProviders,
}));

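The widened vi.fn generic above matters for the selection test below: an inferred mock type would reject mockReturnValueOnce(undefined). A minimal sketch, assuming the vitest version in use accepts a function-type generic as the hunk does:

import { vi } from "vitest";

// With `| undefined` in the generic, both return shapes type-check.
const lookup = vi.fn<(...args: unknown[]) => { id: string } | undefined>(() => ({ id: "openai" }));
lookup.mockReturnValueOnce(undefined); // simulate "no provider registered" for one call
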
const provider: VoiceCallProvider = {

@@ -110,6 +114,48 @@ function expectWebhookUrl(url: string, expectedPath: string) {
  expect(parsed.port).not.toBe("0");
}

describe("VoiceCallWebhookServer realtime transcription provider selection", () => {
  it("auto-selects the first registered provider when streaming.provider is unset", async () => {
    const { manager } = createManager([]);
    const config = createConfig({
      streaming: {
        ...createConfig().streaming,
        enabled: true,
        providers: {
          openai: {
            apiKey: "sk-test", // pragma: allowlist secret
          },
        },
      },
    });
    const autoSelectedProvider: RealtimeTranscriptionProviderPlugin = {
      id: "openai",
      label: "OpenAI",
      autoSelectOrder: 5,
      isConfigured: () => true,
      resolveConfig: ({ rawConfig }) => rawConfig,
      createSession: () => ({
        connect: async () => {},
        sendAudio: () => {},
        close: () => {},
        isConnected: () => true,
      }),
    };
    mocks.getRealtimeTranscriptionProvider.mockReturnValueOnce(undefined);
    mocks.listRealtimeTranscriptionProviders.mockReturnValueOnce([autoSelectedProvider]);

    const server = new VoiceCallWebhookServer(config, manager, provider);
    try {
      await server.start();
      expect(mocks.getRealtimeTranscriptionProvider).toHaveBeenCalledWith(undefined, undefined);
      expect(mocks.listRealtimeTranscriptionProviders).toHaveBeenCalledWith(undefined);
      expect(server.getMediaStreamHandler()).toBeTruthy();
    } finally {
      await server.stop();
    }
  });
});

async function runStaleCallReaperCase(params: {
  callAgeMs: number;
  staleCallReaperSeconds: number;

@@ -158,18 +158,32 @@ export class VoiceCallWebhookServer {
   */
  private async initializeMediaStreaming(): Promise<void> {
    const streaming = this.config.streaming;
    const selectedProviderId = streaming.provider;
    const pluginConfig = this.coreConfig as unknown as OpenClawConfig | undefined;
    const { getRealtimeTranscriptionProvider } =
    const { getRealtimeTranscriptionProvider, listRealtimeTranscriptionProviders } =
      await import("./realtime-transcription.runtime.js");
    const provider = getRealtimeTranscriptionProvider(selectedProviderId, pluginConfig);
    if (!provider) {
    const selectedProviderId = streaming.provider?.trim();
    const configuredProvider = getRealtimeTranscriptionProvider(selectedProviderId, pluginConfig);
    if (selectedProviderId && !configuredProvider) {
      console.warn(
        `[voice-call] Streaming enabled but realtime transcription provider "${selectedProviderId}" is not registered`,
      );
      return;
    }
    const provider =
      configuredProvider ??
      [...listRealtimeTranscriptionProviders(pluginConfig)].sort(
        (left, right) =>
          (left.autoSelectOrder ?? Number.MAX_SAFE_INTEGER) -
          (right.autoSelectOrder ?? Number.MAX_SAFE_INTEGER),
      )[0];
    if (!provider) {
      console.warn(
        "[voice-call] Streaming enabled but no realtime transcription provider is registered",
      );
      return;
    }
    const selectedProviderConfig =
      selectedProviderId &&
      streaming.providers[selectedProviderId] &&
      typeof streaming.providers[selectedProviderId] === "object"
        ? (streaming.providers[selectedProviderId] as Record<string, unknown>)

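The auto-select fallback above orders registered providers by autoSelectOrder, with providers that omit the field sorting last. A small standalone check of the comparator (provider ids are illustrative):

type ProviderStub = { id: string; autoSelectOrder?: number };

const registered: ProviderStub[] = [
  { id: "community-stt" },                 // no order: treated as MAX_SAFE_INTEGER, sorts last
  { id: "deepgram", autoSelectOrder: 10 }, // hypothetical third-party provider
  { id: "openai", autoSelectOrder: 5 },
];

const first = [...registered].sort(
  (left, right) =>
    (left.autoSelectOrder ?? Number.MAX_SAFE_INTEGER) -
    (right.autoSelectOrder ?? Number.MAX_SAFE_INTEGER),
)[0];
// first.id === "openai"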