fix(openai): harden realtime stt

This commit is contained in:
Peter Steinberger
2026-04-23 02:21:42 +01:00
parent 26bf916382
commit 4ff720a837
6 changed files with 208 additions and 5 deletions

View File

@@ -140,6 +140,56 @@ async function createTempAgentDir(): Promise<string> {
return await fs.mkdtemp(path.join(os.tmpdir(), "openai-plugin-live-"));
}
/**
 * Polls `expectation` every 100 ms until it stops throwing or `timeoutMs`
 * elapses. Resolves on the first success; otherwise rejects with the error
 * from a final attempt made at (or past) the deadline.
 *
 * Fixes two issues with the previous implementation:
 * - with `timeoutMs <= 0` the loop body never ran and the function executed
 *   `throw undefined`, producing a useless rejection;
 * - the stored error could be up to one poll interval stale, and a success
 *   landing during the final sleep was still reported as a failure.
 */
async function waitForLiveExpectation(expectation: () => void, timeoutMs = 30_000): Promise<void> {
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    try {
      expectation();
      return;
    } catch {
      await new Promise((resolve) => setTimeout(resolve, 100));
    }
  }
  // One last check at the deadline: a just-in-time success still passes, and
  // a failure surfaces the *current* error rather than a stale one.
  try {
    expectation();
  } catch (error) {
    throw error instanceof Error
      ? error
      : new Error(`expectation not met within ${timeoutMs}ms: ${String(error)}`);
  }
}
// Collapses a transcript to lowercase ASCII alphanumerics so that matching
// ignores case, whitespace, and punctuation differences.
function normalizeTranscriptForMatch(value: string): string {
  let normalized = "";
  for (const ch of value.toLowerCase()) {
    const isAlnum = (ch >= "a" && ch <= "z") || (ch >= "0" && ch <= "9");
    if (isAlnum) {
      normalized += ch;
    }
  }
  return normalized;
}
// Encodes one 16-bit linear PCM sample as a G.711 mu-law byte.
// Classic companding: clamp, add bias, locate the top set bit to pick the
// segment (exponent), take a 4-bit mantissa, then invert all bits.
function linearToMulaw(sample: number): number {
  const BIAS = 132;
  const CLIP = 32635;
  let magnitude = sample;
  if (magnitude > CLIP) {
    magnitude = CLIP;
  } else if (magnitude < -CLIP) {
    magnitude = -CLIP;
  }
  const signBit = magnitude < 0 ? 0x80 : 0x00;
  magnitude = Math.abs(magnitude) + BIAS;
  let segment = 7;
  let probe = 0x4000;
  while (segment > 0 && (magnitude & probe) === 0) {
    probe >>= 1;
    segment -= 1;
  }
  const mantissa = (magnitude >> (segment + 3)) & 0x0f;
  // mu-law transmits the complemented code word.
  return ~(signBit | (segment << 4) | mantissa) & 0xff;
}
// Converts 16-bit little-endian PCM at 24 kHz to 8 kHz mu-law by naive
// decimation: keep every third sample and mu-law-encode it. (No anti-alias
// filtering — adequate for generating test fixtures, not production audio.)
function convertPcm24kToMulaw8k(pcm: Buffer): Buffer {
  const totalSamples = Math.floor(pcm.length / 2);
  const outputCount = Math.floor(totalSamples / 3);
  const encoded = Buffer.alloc(outputCount);
  let readOffset = 0;
  for (let i = 0; i < outputCount; i += 1) {
    encoded[i] = linearToMulaw(pcm.readInt16LE(readOffset));
    readOffset += 6; // skip 3 samples x 2 bytes each
  }
  return encoded;
}
describeLive("openai plugin live", () => {
it("registers an OpenAI provider that can complete a live request", async () => {
const { providers } = await registerOpenAIPlugin();
@@ -247,6 +297,89 @@ describeLive("openai plugin live", () => {
expect(text).toMatch(/\bok\b/);
}, 45_000);
it("opens OpenAI realtime STT before sending audio", async () => {
  const { realtimeTranscriptionProviders } = await registerOpenAIPlugin();
  const provider = requireRegisteredProvider(realtimeTranscriptionProviders, "openai");
  const reportedErrors: Error[] = [];
  const sttSession = provider.createSession({
    providerConfig: {
      apiKey: OPENAI_API_KEY,
      language: "en",
    },
    onError: (error) => {
      reportedErrors.push(error);
    },
  });
  try {
    await sttSession.connect();
    // Give the connection a moment to surface any asynchronous failures
    // before asserting that none occurred.
    await new Promise((resolve) => setTimeout(resolve, 1_000));
    expect(reportedErrors).toEqual([]);
    expect(sttSession.isConnected()).toBe(true);
  } finally {
    sttSession.close();
  }
}, 30_000);
// Live end-to-end flow: synthesize a known phrase via the OpenAI speech
// provider, downsample it to 8 kHz mu-law telephony audio, stream it into a
// realtime STT session in small paced chunks, and verify the phrase comes
// back in the transcript.
it("streams realtime STT through the registered transcription provider", async () => {
  const { realtimeTranscriptionProviders, speechProviders } = await registerOpenAIPlugin();
  const realtimeProvider = requireRegisteredProvider(realtimeTranscriptionProviders, "openai");
  const speechProvider = requireRegisteredProvider(speechProviders, "openai");
  const cfg = createLiveConfig();
  const ttsConfig = createLiveTtsConfig();
  // The phrase carries the distinctive marker "OpenClaw" matched below.
  const phrase = "Testing OpenClaw OpenAI realtime transcription integration test OK.";
  // synthesizeTelephony is optional on the provider, hence ?. plus the
  // explicit undefined check that follows.
  const telephony = await speechProvider.synthesizeTelephony?.({
    text: phrase,
    cfg,
    providerConfig: ttsConfig.providerConfigs.openai ?? {},
    timeoutMs: ttsConfig.timeoutMs,
  });
  if (!telephony) {
    throw new Error("OpenAI telephony synthesis did not return audio");
  }
  // The downsampler below assumes 16-bit PCM at 24 kHz, so pin the format.
  expect(telephony.outputFormat).toBe("pcm");
  expect(telephony.sampleRate).toBe(24_000);
  const transcripts: string[] = [];
  const partials: string[] = [];
  const errors: Error[] = [];
  const session = realtimeProvider.createSession({
    providerConfig: {
      apiKey: OPENAI_API_KEY,
      language: "en",
      // Short silence window so the server finalizes the turn quickly.
      silenceDurationMs: 500,
    },
    onPartial: (partial) => partials.push(partial),
    onTranscript: (transcript) => transcripts.push(transcript),
    onError: (error) => errors.push(error),
  });
  try {
    await session.connect();
    const speech = convertPcm24kToMulaw8k(telephony.audioBuffer);
    // 0xff is the mu-law encoding of zero amplitude (see linearToMulaw), so
    // this buffer is one second of silence; padding the speech with leading
    // and trailing silence gives the server-side VAD clear turn boundaries.
    const silence = Buffer.alloc(8_000, 0xff);
    const audio = Buffer.concat([silence.subarray(0, 4_000), speech, silence]);
    // Stream in 160-byte chunks (20 ms at 8 kHz, one byte per sample) with a
    // small delay to approximate real-time telephony pacing.
    for (let offset = 0; offset < audio.byteLength; offset += 160) {
      session.sendAudio(audio.subarray(offset, offset + 160));
      await new Promise((resolve) => setTimeout(resolve, 5));
    }
    // Poll until a transcript containing the marker arrives; any session
    // error is surfaced immediately instead of waiting out the timeout.
    await waitForLiveExpectation(() => {
      if (errors[0]) {
        throw errors[0];
      }
      expect(normalizeTranscriptForMatch(transcripts.join(" "))).toContain("openclaw");
    }, 60_000);
  } finally {
    session.close();
  }
  // Final assertions run after close so they reflect the complete session.
  const normalized = transcripts.join(" ").toLowerCase();
  const compact = normalizeTranscriptForMatch(normalized);
  expect(compact).toContain("openclaw");
  expect(normalized).toContain("transcription");
  expect(partials.length + transcripts.length).toBeGreaterThan(0);
}, 180_000);
it("generates an image through the registered image provider", async () => {
const { imageProviders } = await registerOpenAIPlugin();
const imageProvider = requireRegisteredProvider(imageProviders, "openai");

View File

@@ -27,7 +27,9 @@ describe("buildOpenAIRealtimeTranscriptionProvider", () => {
rawConfig: {
providers: {
openai: {
language: "en",
model: "gpt-4o-transcribe",
prompt: "expect OpenClaw product names",
silenceDurationMs: 900,
vadThreshold: 0.45,
},
@@ -36,7 +38,9 @@ describe("buildOpenAIRealtimeTranscriptionProvider", () => {
});
expect(resolved).toEqual({
language: "en",
model: "gpt-4o-transcribe",
prompt: "expect OpenClaw product names",
silenceDurationMs: 900,
vadThreshold: 0.45,
});

View File

@@ -22,14 +22,18 @@ import {
/**
 * Raw, user-supplied configuration for the OpenAI realtime transcription
 * provider. Every field is optional; session creation applies defaults
 * (model "gpt-4o-transcribe", silenceDurationMs 800, vadThreshold 0.5).
 */
type OpenAIRealtimeTranscriptionProviderConfig = {
  apiKey?: string; // falls back to the streaming openaiApiKey config when absent
  language?: string; // forwarded to input_audio_transcription when set
  model?: string; // transcription model; legacy alias `sttModel` also accepted
  prompt?: string; // forwarded to input_audio_transcription to bias vocabulary
  silenceDurationMs?: number; // presumably wired into server_vad turn detection — confirm
  vadThreshold?: number; // presumably wired into server_vad turn detection — confirm
};
/**
 * Fully-resolved session configuration: the realtime session create request
 * plus provider settings with defaults already applied, so the session class
 * never has to re-derive them.
 */
type OpenAIRealtimeTranscriptionSessionConfig = RealtimeTranscriptionSessionCreateRequest & {
  apiKey: string; // required by the time a session is constructed
  language?: string; // still optional after resolution
  model: string; // defaulted to "gpt-4o-transcribe"
  prompt?: string; // optional transcription prompt
  silenceDurationMs: number; // defaulted to 800
  vadThreshold: number; // defaulted to 0.5
};
@@ -55,7 +59,9 @@ function normalizeProviderConfig(
value: raw?.openaiApiKey,
path: "plugins.entries.voice-call.config.streaming.openaiApiKey",
}),
language: trimToUndefined(raw?.language),
model: trimToUndefined(raw?.model) ?? trimToUndefined(raw?.sttModel),
prompt: trimToUndefined(raw?.prompt),
silenceDurationMs: asFiniteNumber(raw?.silenceDurationMs),
vadThreshold: asFiniteNumber(raw?.vadThreshold),
};
@@ -141,6 +147,8 @@ class OpenAIRealtimeTranscriptionSession implements RealtimeTranscriptionSession
input_audio_format: "g711_ulaw",
input_audio_transcription: {
model: this.config.model,
...(this.config.language ? { language: this.config.language } : {}),
...(this.config.prompt ? { prompt: this.config.prompt } : {}),
},
turn_detection: {
type: "server_vad",
@@ -301,7 +309,9 @@ export function buildOpenAIRealtimeTranscriptionProvider(): RealtimeTranscriptio
return new OpenAIRealtimeTranscriptionSession({
...req,
apiKey,
language: config.language,
model: config.model ?? "gpt-4o-transcribe",
prompt: config.prompt,
silenceDurationMs: config.silenceDurationMs ?? 800,
vadThreshold: config.vadThreshold ?? 0.5,
});