refactor(voice-call): use config for realtime tuning

This commit is contained in:
Peter Steinberger
2026-04-04 12:43:08 +09:00
parent e636ba6ab0
commit ed0cbcba2f
6 changed files with 34 additions and 77 deletions

View File

@@ -1,13 +1,7 @@
import { afterEach, describe, expect, it } from "vitest";
import { describe, expect, it } from "vitest";
import { buildOpenAIRealtimeTranscriptionProvider } from "./realtime-transcription-provider.js";
describe("buildOpenAIRealtimeTranscriptionProvider", () => {
const originalEnv = { ...process.env };
afterEach(() => {
process.env = { ...originalEnv };
});
it("normalizes OpenAI config defaults", () => {
const provider = buildOpenAIRealtimeTranscriptionProvider();
const resolved = provider.resolveConfig?.({
@@ -26,15 +20,19 @@ describe("buildOpenAIRealtimeTranscriptionProvider", () => {
});
});
it("reads provider-owned env fallbacks", () => {
process.env.REALTIME_TRANSCRIPTION_MODEL = "gpt-4o-transcribe";
process.env.SILENCE_DURATION_MS = "900";
process.env.VAD_THRESHOLD = "0.45";
it("keeps provider-owned transcription settings configurable via raw provider config", () => {
const provider = buildOpenAIRealtimeTranscriptionProvider();
const resolved = provider.resolveConfig?.({
cfg: {} as never,
rawConfig: {},
rawConfig: {
providers: {
openai: {
model: "gpt-4o-transcribe",
silenceDurationMs: 900,
vadThreshold: 0.45,
},
},
},
});
expect(resolved).toEqual({

View File

@@ -57,21 +57,9 @@ function normalizeProviderConfig(
value: raw?.openaiApiKey,
path: "plugins.entries.voice-call.config.streaming.openaiApiKey",
}),
model:
trimToUndefined(raw?.model) ??
trimToUndefined(raw?.sttModel) ??
trimToUndefined(process.env.REALTIME_TRANSCRIPTION_MODEL) ??
trimToUndefined(process.env.STREAMING_STT_MODEL),
silenceDurationMs:
asNumber(raw?.silenceDurationMs) ??
(typeof process.env.SILENCE_DURATION_MS === "string"
? Number.parseInt(process.env.SILENCE_DURATION_MS, 10)
: undefined),
vadThreshold:
asNumber(raw?.vadThreshold) ??
(typeof process.env.VAD_THRESHOLD === "string"
? Number.parseFloat(process.env.VAD_THRESHOLD)
: undefined),
model: trimToUndefined(raw?.model) ?? trimToUndefined(raw?.sttModel),
silenceDurationMs: asNumber(raw?.silenceDurationMs),
vadThreshold: asNumber(raw?.vadThreshold),
};
}

View File

@@ -1,24 +1,22 @@
import { afterEach, describe, expect, it } from "vitest";
import { describe, expect, it } from "vitest";
import { buildOpenAIRealtimeVoiceProvider } from "./realtime-voice-provider.js";
describe("buildOpenAIRealtimeVoiceProvider", () => {
const originalEnv = { ...process.env };
afterEach(() => {
process.env = { ...originalEnv };
});
it("normalizes provider-owned env fallbacks", () => {
process.env.REALTIME_VOICE_MODEL = "gpt-realtime";
process.env.REALTIME_VOICE_VOICE = "verse";
process.env.REALTIME_VOICE_TEMPERATURE = "0.6";
process.env.SILENCE_DURATION_MS = "850";
process.env.VAD_THRESHOLD = "0.35";
it("normalizes provider-owned voice settings from raw provider config", () => {
const provider = buildOpenAIRealtimeVoiceProvider();
const resolved = provider.resolveConfig?.({
cfg: {} as never,
rawConfig: {},
rawConfig: {
providers: {
openai: {
model: "gpt-realtime",
voice: "verse",
temperature: 0.6,
silenceDurationMs: 850,
vadThreshold: 0.35,
},
},
},
});
expect(resolved).toEqual({

View File

@@ -103,25 +103,11 @@ function normalizeProviderConfig(
value: raw?.apiKey,
path: "plugins.entries.voice-call.config.realtime.providers.openai.apiKey",
}),
model: trimToUndefined(raw?.model) ?? trimToUndefined(process.env.REALTIME_VOICE_MODEL),
voice: (trimToUndefined(raw?.voice) ?? trimToUndefined(process.env.REALTIME_VOICE_VOICE)) as
| OpenAIRealtimeVoice
| undefined,
temperature:
asNumber(raw?.temperature) ??
(typeof process.env.REALTIME_VOICE_TEMPERATURE === "string"
? Number.parseFloat(process.env.REALTIME_VOICE_TEMPERATURE)
: undefined),
vadThreshold:
asNumber(raw?.vadThreshold) ??
(typeof process.env.VAD_THRESHOLD === "string"
? Number.parseFloat(process.env.VAD_THRESHOLD)
: undefined),
silenceDurationMs:
asNumber(raw?.silenceDurationMs) ??
(typeof process.env.SILENCE_DURATION_MS === "string"
? Number.parseInt(process.env.SILENCE_DURATION_MS, 10)
: undefined),
model: trimToUndefined(raw?.model),
voice: trimToUndefined(raw?.voice) as OpenAIRealtimeVoice | undefined,
temperature: asNumber(raw?.temperature),
vadThreshold: asNumber(raw?.vadThreshold),
silenceDurationMs: asNumber(raw?.silenceDurationMs),
prefixPaddingMs: asNumber(raw?.prefixPaddingMs),
azureEndpoint: trimToUndefined(raw?.azureEndpoint),
azureDeployment: trimToUndefined(raw?.azureDeployment),

View File

@@ -273,12 +273,6 @@ describe("normalizeVoiceCallConfig", () => {
});
describe("resolveVoiceCallConfig", () => {
const originalEnv = { ...process.env };
afterEach(() => {
process.env = { ...originalEnv };
});
it("keeps legacy streaming OpenAI fields inside providers.openai without forcing provider selection", () => {
const resolved = resolveVoiceCallConfig({
enabled: true,
@@ -301,14 +295,13 @@ describe("resolveVoiceCallConfig", () => {
});
});
it("maps realtime instructions from the legacy env hook without altering provider selection", () => {
process.env.REALTIME_VOICE_INSTRUCTIONS = "Stay concise.";
it("preserves configured realtime instructions without env indirection", () => {
const resolved = resolveVoiceCallConfig({
enabled: true,
provider: "twilio",
realtime: {
enabled: true,
instructions: "Stay concise.",
},
});

View File

@@ -605,12 +605,6 @@ export function resolveVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallC
resolved.streaming = mergeLegacyStreamingOpenAICompat(resolved.streaming);
resolved.realtime = mergeLegacyRealtimeOpenAICompat(resolved.realtime);
if (
typeof resolved.realtime.instructions !== "string" &&
typeof process.env.REALTIME_VOICE_INSTRUCTIONS === "string"
) {
resolved.realtime.instructions = process.env.REALTIME_VOICE_INSTRUCTIONS;
}
return normalizeVoiceCallConfig(resolved);
}