mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 16:01:01 +00:00
fix(openai): harden realtime stt
This commit is contained in:
@@ -140,6 +140,56 @@ async function createTempAgentDir(): Promise<string> {
|
||||
return await fs.mkdtemp(path.join(os.tmpdir(), "openai-plugin-live-"));
|
||||
}
|
||||
|
||||
/**
 * Polls `expectation` until it stops throwing or the time budget is spent.
 *
 * The expectation is always evaluated at least once, even when `timeoutMs` is
 * zero, negative or NaN, so a caller never gets a failure without the check
 * having actually run.
 *
 * @param expectation - Synchronous assertion callback; throwing means "not yet".
 * @param timeoutMs - Time budget in milliseconds measured from the first attempt.
 * @throws The error raised by the most recent failed attempt.
 */
async function waitForLiveExpectation(
  expectation: () => void,
  timeoutMs = 30_000,
): Promise<void> {
  const pollIntervalMs = 100;
  const started = Date.now();
  let lastError: unknown;

  for (;;) {
    try {
      expectation();
      return;
    } catch (error) {
      lastError = error;
    }

    // Same comparison as the loop guard it replaces; written as a negation so
    // a NaN budget ends the loop instead of spinning forever.
    const withinBudget = Date.now() - started < timeoutMs;
    if (!withinBudget) {
      break;
    }
    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
  }

  // `lastError` is only nullish if the expectation itself threw null/undefined;
  // never surface a bare `undefined` to the test runner.
  throw lastError ?? new Error(`Live expectation was not met within ${timeoutMs}ms`);
}
|
||||
|
||||
/**
 * Reduces a transcript to a compact form for fuzzy substring checks.
 *
 * The text is lowercased and every character outside `a-z` / `0-9` is removed,
 * so spacing, punctuation and casing differences (for example "Open Claw" vs
 * "OpenClaw") do not affect a match. Non-ASCII letters are dropped, not folded.
 *
 * @param value - Raw transcript text.
 * @returns The lowercase ASCII-alphanumeric characters of `value`, concatenated.
 */
function normalizeTranscriptForMatch(value: string): string {
  return value.toLowerCase().replace(/[^a-z0-9]+/g, "");
}
|
||||
|
||||
/**
 * Encodes one signed linear PCM sample as an 8-bit mu-law byte.
 *
 * @param sample - Linear sample; values outside +/-32635 are clipped.
 * @returns The complemented mu-law byte (0-255): sign bit, 3-bit exponent and
 *   4-bit mantissa. A zero sample encodes to `0xff`.
 */
function linearToMulaw(sample: number): number {
  const bias = 132;
  // 32635 + 132 === 32767, so the biased magnitude always fits in 15 bits.
  const clip = 32635;

  const clamped = Math.max(-clip, Math.min(clip, sample));
  const sign = clamped < 0 ? 0x80 : 0;
  const magnitude = Math.abs(clamped) + bias;

  // Find the highest set bit between bit 14 (exponent 7) and bit 8 (exponent 1);
  // anything below that falls into exponent 0.
  let exponent = 7;
  let expMask = 0x4000;
  while (exponent > 0 && (magnitude & expMask) === 0) {
    exponent -= 1;
    expMask >>= 1;
  }

  const mantissa = (magnitude >> (exponent + 3)) & 0x0f;
  // The assembled byte is bit-inverted before being returned.
  return ~(sign | (exponent << 4) | mantissa) & 0xff;
}
|
||||
|
||||
/**
 * Converts 16-bit little-endian PCM into mu-law bytes at one third of the
 * input sample rate (24 kHz -> 8 kHz, going by the function name).
 *
 * Downsampling is plain decimation: every third input sample is kept and the
 * other two are discarded, with no low-pass filtering. A trailing odd byte or
 * an incomplete group of three samples is ignored.
 *
 * @param pcm - Buffer of signed 16-bit little-endian samples.
 * @returns A new buffer holding one mu-law byte per kept sample.
 */
function convertPcm24kToMulaw8k(pcm: Buffer): Buffer {
  const bytesPerSample = 2;
  const decimation = 3;
  const strideBytes = bytesPerSample * decimation;

  const inputSamples = Math.floor(pcm.length / bytesPerSample);
  const outputSamples = Math.floor(inputSamples / decimation);
  const mulaw = Buffer.alloc(outputSamples);

  let readOffset = 0;
  for (let index = 0; index < outputSamples; index += 1) {
    mulaw[index] = linearToMulaw(pcm.readInt16LE(readOffset));
    readOffset += strideBytes;
  }

  return mulaw;
}
|
||||
|
||||
describeLive("openai plugin live", () => {
|
||||
it("registers an OpenAI provider that can complete a live request", async () => {
|
||||
const { providers } = await registerOpenAIPlugin();
|
||||
@@ -247,6 +297,89 @@ describeLive("openai plugin live", () => {
|
||||
expect(text).toMatch(/\bok\b/);
|
||||
}, 45_000);
|
||||
|
||||
// Live smoke test: a realtime transcription session must connect and stay
// connected, without reporting errors, before any audio has been sent.
it("opens OpenAI realtime STT before sending audio", async () => {
  const { realtimeTranscriptionProviders } = await registerOpenAIPlugin();
  const realtimeProvider = requireRegisteredProvider(realtimeTranscriptionProviders, "openai");
  const errors: Error[] = [];
  const session = realtimeProvider.createSession({
    providerConfig: {
      apiKey: OPENAI_API_KEY,
      language: "en",
    },
    onError: (error) => errors.push(error),
  });

  try {
    await session.connect();
    // Stay idle for a second so a failure reported shortly after connect()
    // resolves still lands in `errors` before the assertions run.
    await new Promise((resolve) => setTimeout(resolve, 1_000));
    expect(errors).toEqual([]);
    expect(session.isConnected()).toBe(true);
  } finally {
    // Always release the session, even when an assertion fails.
    session.close();
  }
}, 30_000);
|
||||
|
||||
// Live round trip: synthesize a known phrase with the OpenAI speech provider,
// stream it into the realtime transcription provider as mu-law audio, and check
// that the transcript contains the expected words.
it("streams realtime STT through the registered transcription provider", async () => {
  const { realtimeTranscriptionProviders, speechProviders } = await registerOpenAIPlugin();
  const realtimeProvider = requireRegisteredProvider(realtimeTranscriptionProviders, "openai");
  const speechProvider = requireRegisteredProvider(speechProviders, "openai");
  const cfg = createLiveConfig();
  const ttsConfig = createLiveTtsConfig();
  const phrase = "Testing OpenClaw OpenAI realtime transcription integration test OK.";

  const telephony = await speechProvider.synthesizeTelephony?.({
    text: phrase,
    cfg,
    providerConfig: ttsConfig.providerConfigs.openai ?? {},
    timeoutMs: ttsConfig.timeoutMs,
  });
  if (!telephony) {
    throw new Error("OpenAI telephony synthesis did not return audio");
  }
  // The conversion below assumes 24 kHz PCM input, so pin the format first.
  expect(telephony.outputFormat).toBe("pcm");
  expect(telephony.sampleRate).toBe(24_000);

  const transcripts: string[] = [];
  const partials: string[] = [];
  const errors: Error[] = [];
  const session = realtimeProvider.createSession({
    providerConfig: {
      apiKey: OPENAI_API_KEY,
      language: "en",
      silenceDurationMs: 500,
    },
    onPartial: (partial) => partials.push(partial),
    onTranscript: (transcript) => transcripts.push(transcript),
    onError: (error) => errors.push(error),
  });

  // 160 mu-law bytes are 20 ms of 8 kHz audio; one chunk every 5 ms streams the
  // clip faster than real time.
  const chunkBytes = 160;
  const chunkIntervalMs = 5;

  try {
    await session.connect();
    const speech = convertPcm24kToMulaw8k(telephony.audioBuffer);
    // 0xff is the mu-law code for a zero sample, i.e. digital silence.
    const silence = Buffer.alloc(8_000, 0xff);
    // Half a second of lead-in silence, the speech, then a full second of
    // trailing silence, which is longer than the configured 500 ms silence
    // duration.
    const audio = Buffer.concat([silence.subarray(0, 4_000), speech, silence]);
    for (let offset = 0; offset < audio.byteLength; offset += chunkBytes) {
      session.sendAudio(audio.subarray(offset, offset + chunkBytes));
      await new Promise((resolve) => setTimeout(resolve, chunkIntervalMs));
    }

    await waitForLiveExpectation(() => {
      // Report a provider error in preference to a generic "not found" failure.
      if (errors[0]) {
        throw errors[0];
      }
      expect(normalizeTranscriptForMatch(transcripts.join(" "))).toContain("openclaw");
    }, 60_000);
  } finally {
    session.close();
  }

  const normalized = transcripts.join(" ").toLowerCase();
  const compact = normalizeTranscriptForMatch(normalized);
  expect(compact).toContain("openclaw");
  expect(normalized).toContain("transcription");
  expect(partials.length + transcripts.length).toBeGreaterThan(0);
}, 180_000);
|
||||
|
||||
it("generates an image through the registered image provider", async () => {
|
||||
const { imageProviders } = await registerOpenAIPlugin();
|
||||
const imageProvider = requireRegisteredProvider(imageProviders, "openai");
|
||||
|
||||
@@ -27,7 +27,9 @@ describe("buildOpenAIRealtimeTranscriptionProvider", () => {
|
||||
rawConfig: {
|
||||
providers: {
|
||||
openai: {
|
||||
language: "en",
|
||||
model: "gpt-4o-transcribe",
|
||||
prompt: "expect OpenClaw product names",
|
||||
silenceDurationMs: 900,
|
||||
vadThreshold: 0.45,
|
||||
},
|
||||
@@ -36,7 +38,9 @@ describe("buildOpenAIRealtimeTranscriptionProvider", () => {
|
||||
});
|
||||
|
||||
expect(resolved).toEqual({
|
||||
language: "en",
|
||||
model: "gpt-4o-transcribe",
|
||||
prompt: "expect OpenClaw product names",
|
||||
silenceDurationMs: 900,
|
||||
vadThreshold: 0.45,
|
||||
});
|
||||
|
||||
@@ -22,14 +22,18 @@ import {
|
||||
|
||||
/**
 * Normalized provider settings for OpenAI realtime transcription.
 *
 * Every field is optional here; defaults are applied when a session is built.
 */
type OpenAIRealtimeTranscriptionProviderConfig = {
  /** OpenAI API key used for the session. */
  apiKey?: string;
  /** Language hint forwarded with the transcription settings when set. */
  language?: string;
  /** Transcription model; the session falls back to "gpt-4o-transcribe". */
  model?: string;
  /** Prompt forwarded with the transcription settings when set. */
  prompt?: string;
  /** Silence duration in milliseconds; the session falls back to 800. */
  silenceDurationMs?: number;
  /** Voice-activity threshold; the session falls back to 0.5. */
  vadThreshold?: number;
};
|
||||
|
||||
/**
 * Fully resolved settings for a single realtime transcription session: the
 * generic session-create request plus the OpenAI-specific values.
 *
 * Unlike the provider config, `apiKey`, `model`, `silenceDurationMs` and
 * `vadThreshold` are required because defaults have already been applied.
 */
type OpenAIRealtimeTranscriptionSessionConfig = RealtimeTranscriptionSessionCreateRequest & {
  /** OpenAI API key used for the session. */
  apiKey: string;
  /** Optional language hint; included in the transcription settings only when set. */
  language?: string;
  /** Transcription model sent as `input_audio_transcription.model`. */
  model: string;
  /** Optional prompt; included in the transcription settings only when set. */
  prompt?: string;
  /** Silence duration in milliseconds (presumably used for server VAD turn detection). */
  silenceDurationMs: number;
  /** Voice-activity threshold (presumably used for server VAD turn detection). */
  vadThreshold: number;
};
|
||||
@@ -55,7 +59,9 @@ function normalizeProviderConfig(
|
||||
value: raw?.openaiApiKey,
|
||||
path: "plugins.entries.voice-call.config.streaming.openaiApiKey",
|
||||
}),
|
||||
language: trimToUndefined(raw?.language),
|
||||
model: trimToUndefined(raw?.model) ?? trimToUndefined(raw?.sttModel),
|
||||
prompt: trimToUndefined(raw?.prompt),
|
||||
silenceDurationMs: asFiniteNumber(raw?.silenceDurationMs),
|
||||
vadThreshold: asFiniteNumber(raw?.vadThreshold),
|
||||
};
|
||||
@@ -141,6 +147,8 @@ class OpenAIRealtimeTranscriptionSession implements RealtimeTranscriptionSession
|
||||
input_audio_format: "g711_ulaw",
|
||||
input_audio_transcription: {
|
||||
model: this.config.model,
|
||||
...(this.config.language ? { language: this.config.language } : {}),
|
||||
...(this.config.prompt ? { prompt: this.config.prompt } : {}),
|
||||
},
|
||||
turn_detection: {
|
||||
type: "server_vad",
|
||||
@@ -301,7 +309,9 @@ export function buildOpenAIRealtimeTranscriptionProvider(): RealtimeTranscriptio
|
||||
return new OpenAIRealtimeTranscriptionSession({
|
||||
...req,
|
||||
apiKey,
|
||||
language: config.language,
|
||||
model: config.model ?? "gpt-4o-transcribe",
|
||||
prompt: config.prompt,
|
||||
silenceDurationMs: config.silenceDurationMs ?? 800,
|
||||
vadThreshold: config.vadThreshold ?? 0.5,
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user