mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-14 10:41:23 +00:00
515 lines
15 KiB
TypeScript
515 lines
15 KiB
TypeScript
import type {
|
|
RealtimeVoiceBridge,
|
|
RealtimeVoiceBridgeCreateRequest,
|
|
RealtimeVoiceProviderConfig,
|
|
RealtimeVoiceProviderPlugin,
|
|
RealtimeVoiceTool,
|
|
} from "openclaw/plugin-sdk/realtime-voice";
|
|
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
|
|
import WebSocket from "ws";
|
|
import {
|
|
asFiniteNumber,
|
|
readRealtimeErrorDetail,
|
|
resolveOpenAIProviderConfigRecord,
|
|
trimToUndefined,
|
|
} from "./realtime-provider-shared.js";
|
|
|
|
/**
 * Voice identifiers this provider accepts for the realtime session.
 * Forwarded verbatim in the `session.update` payload's `voice` field;
 * "alloy" is used as the default when none is configured.
 */
export type OpenAIRealtimeVoice =
  | "alloy"
  | "ash"
  | "ballad"
  | "cedar"
  | "coral"
  | "echo"
  | "marin"
  | "sage"
  | "shimmer"
  | "verse";
|
|
|
|
/**
 * Normalized plugin configuration for the OpenAI realtime voice provider.
 * Produced by `normalizeProviderConfig`; all fields are optional because the
 * raw config may omit any of them (defaults are applied at session setup).
 */
type OpenAIRealtimeVoiceProviderConfig = {
  // Secret API key; falls back to OPENAI_API_KEY env var at bridge creation.
  apiKey?: string;
  // Realtime model name; defaults to "gpt-realtime" when unset.
  model?: string;
  voice?: OpenAIRealtimeVoice;
  // Sampling temperature for the realtime session (default 0.8).
  temperature?: number;
  // Server-VAD speech detection threshold (default 0.5).
  vadThreshold?: number;
  // Silence duration before a turn is considered finished (default 500 ms).
  silenceDurationMs?: number;
  // Audio padding kept before detected speech (default 300 ms).
  prefixPaddingMs?: number;
  // When set (with azureDeployment), the bridge connects to Azure OpenAI
  // instead of api.openai.com.
  azureEndpoint?: string;
  azureDeployment?: string;
  // Azure api-version query parameter; defaults to "2024-10-01-preview".
  azureApiVersion?: string;
};
|
|
|
|
/**
 * Constructor config for `OpenAIRealtimeVoiceBridge`: the SDK create request
 * (callbacks, instructions, tools, …) merged with the resolved provider
 * settings. Same fields as `OpenAIRealtimeVoiceProviderConfig`, except
 * `apiKey` is required here — `createBridge` guarantees it is present.
 */
type OpenAIRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & {
  apiKey: string;
  model?: string;
  voice?: OpenAIRealtimeVoice;
  temperature?: number;
  vadThreshold?: number;
  silenceDurationMs?: number;
  prefixPaddingMs?: number;
  azureEndpoint?: string;
  azureDeployment?: string;
  azureApiVersion?: string;
};
|
|
|
|
/**
 * Loose shape of an inbound realtime server event. Only the fields this
 * bridge actually reads are declared; everything is optional because the
 * populated fields vary by event `type`.
 */
type RealtimeEvent = {
  type: string;
  // Incremental payload (audio base64, transcript text, or tool-call args).
  delta?: string;
  transcript?: string;
  item_id?: string;
  call_id?: string;
  // Function/tool name on function_call_arguments events.
  name?: string;
  // Opaque error detail on "error" events; decoded via readRealtimeErrorDetail.
  error?: unknown;
};
|
|
|
|
/**
 * Wire format of the `session.update` event this bridge sends right after
 * the socket opens (see `sendSessionUpdate`). Field names use snake_case to
 * match the realtime API's JSON schema.
 */
type RealtimeSessionUpdate = {
  type: "session.update";
  session: {
    modalities: string[];
    instructions?: string;
    voice: OpenAIRealtimeVoice;
    // Audio codec identifiers (this bridge uses "g711_ulaw" for telephony).
    input_audio_format: string;
    output_audio_format: string;
    // Server-side voice activity detection parameters.
    turn_detection: {
      type: "server_vad";
      threshold: number;
      prefix_padding_ms: number;
      silence_duration_ms: number;
      create_response: boolean;
    };
    temperature: number;
    input_audio_transcription?: { model: string };
    tools?: RealtimeVoiceTool[];
    tool_choice?: string;
  };
};
|
|
|
|
function normalizeProviderConfig(
|
|
config: RealtimeVoiceProviderConfig,
|
|
): OpenAIRealtimeVoiceProviderConfig {
|
|
const raw = resolveOpenAIProviderConfigRecord(config);
|
|
return {
|
|
apiKey: normalizeResolvedSecretInputString({
|
|
value: raw?.apiKey,
|
|
path: "plugins.entries.voice-call.config.realtime.providers.openai.apiKey",
|
|
}),
|
|
model: trimToUndefined(raw?.model),
|
|
voice: trimToUndefined(raw?.voice) as OpenAIRealtimeVoice | undefined,
|
|
temperature: asFiniteNumber(raw?.temperature),
|
|
vadThreshold: asFiniteNumber(raw?.vadThreshold),
|
|
silenceDurationMs: asFiniteNumber(raw?.silenceDurationMs),
|
|
prefixPaddingMs: asFiniteNumber(raw?.prefixPaddingMs),
|
|
azureEndpoint: trimToUndefined(raw?.azureEndpoint),
|
|
azureDeployment: trimToUndefined(raw?.azureDeployment),
|
|
azureApiVersion: trimToUndefined(raw?.azureApiVersion),
|
|
};
|
|
}
|
|
|
|
function base64ToBuffer(b64: string): Buffer {
|
|
return Buffer.from(b64, "base64");
|
|
}
|
|
|
|
class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
|
private static readonly DEFAULT_MODEL = "gpt-realtime";
|
|
private static readonly MAX_RECONNECT_ATTEMPTS = 5;
|
|
private static readonly BASE_RECONNECT_DELAY_MS = 1000;
|
|
private static readonly CONNECT_TIMEOUT_MS = 10_000;
|
|
|
|
private ws: WebSocket | null = null;
|
|
private connected = false;
|
|
private intentionallyClosed = false;
|
|
private reconnectAttempts = 0;
|
|
private pendingAudio: Buffer[] = [];
|
|
private markQueue: string[] = [];
|
|
private responseStartTimestamp: number | null = null;
|
|
private latestMediaTimestamp = 0;
|
|
private lastAssistantItemId: string | null = null;
|
|
private toolCallBuffers = new Map<string, { name: string; callId: string; args: string }>();
|
|
|
|
constructor(private readonly config: OpenAIRealtimeVoiceBridgeConfig) {}
|
|
|
|
async connect(): Promise<void> {
|
|
this.intentionallyClosed = false;
|
|
this.reconnectAttempts = 0;
|
|
await this.doConnect();
|
|
}
|
|
|
|
sendAudio(audio: Buffer): void {
|
|
if (!this.connected || this.ws?.readyState !== WebSocket.OPEN) {
|
|
if (this.pendingAudio.length < 320) {
|
|
this.pendingAudio.push(audio);
|
|
}
|
|
return;
|
|
}
|
|
this.sendEvent({
|
|
type: "input_audio_buffer.append",
|
|
audio: audio.toString("base64"),
|
|
});
|
|
}
|
|
|
|
setMediaTimestamp(ts: number): void {
|
|
this.latestMediaTimestamp = ts;
|
|
}
|
|
|
|
sendUserMessage(text: string): void {
|
|
this.sendEvent({
|
|
type: "conversation.item.create",
|
|
item: {
|
|
type: "message",
|
|
role: "user",
|
|
content: [{ type: "input_text", text }],
|
|
},
|
|
});
|
|
this.sendEvent({ type: "response.create" });
|
|
}
|
|
|
|
triggerGreeting(instructions?: string): void {
|
|
if (!this.connected || !this.ws) {
|
|
return;
|
|
}
|
|
this.sendEvent({
|
|
type: "response.create",
|
|
response: {
|
|
instructions: instructions ?? this.config.instructions,
|
|
},
|
|
});
|
|
}
|
|
|
|
submitToolResult(callId: string, result: unknown): void {
|
|
this.sendEvent({
|
|
type: "conversation.item.create",
|
|
item: {
|
|
type: "function_call_output",
|
|
call_id: callId,
|
|
output: JSON.stringify(result),
|
|
},
|
|
});
|
|
this.sendEvent({ type: "response.create" });
|
|
}
|
|
|
|
acknowledgeMark(): void {
|
|
if (this.markQueue.length === 0) {
|
|
return;
|
|
}
|
|
this.markQueue.shift();
|
|
if (this.markQueue.length === 0) {
|
|
this.responseStartTimestamp = null;
|
|
this.lastAssistantItemId = null;
|
|
}
|
|
}
|
|
|
|
close(): void {
|
|
this.intentionallyClosed = true;
|
|
this.connected = false;
|
|
if (this.ws) {
|
|
this.ws.close(1000, "Bridge closed");
|
|
this.ws = null;
|
|
}
|
|
}
|
|
|
|
isConnected(): boolean {
|
|
return this.connected;
|
|
}
|
|
|
|
private async doConnect(): Promise<void> {
|
|
await new Promise<void>((resolve, reject) => {
|
|
const { url, headers } = this.resolveConnectionParams();
|
|
this.ws = new WebSocket(url, { headers });
|
|
|
|
const connectTimeout = setTimeout(() => {
|
|
reject(new Error("OpenAI realtime connection timeout"));
|
|
}, OpenAIRealtimeVoiceBridge.CONNECT_TIMEOUT_MS);
|
|
|
|
this.ws.on("open", () => {
|
|
clearTimeout(connectTimeout);
|
|
this.connected = true;
|
|
this.reconnectAttempts = 0;
|
|
this.sendSessionUpdate();
|
|
for (const chunk of this.pendingAudio.splice(0)) {
|
|
this.sendAudio(chunk);
|
|
}
|
|
this.config.onReady?.();
|
|
resolve();
|
|
});
|
|
|
|
this.ws.on("message", (data: Buffer) => {
|
|
try {
|
|
this.handleEvent(JSON.parse(data.toString()) as RealtimeEvent);
|
|
} catch (error) {
|
|
console.error("[openai] realtime event parse failed:", error);
|
|
}
|
|
});
|
|
|
|
this.ws.on("error", (error) => {
|
|
if (!this.connected) {
|
|
clearTimeout(connectTimeout);
|
|
reject(error);
|
|
}
|
|
this.config.onError?.(error instanceof Error ? error : new Error(String(error)));
|
|
});
|
|
|
|
this.ws.on("close", () => {
|
|
this.connected = false;
|
|
if (this.intentionallyClosed) {
|
|
this.config.onClose?.("completed");
|
|
return;
|
|
}
|
|
void this.attemptReconnect();
|
|
});
|
|
});
|
|
}
|
|
|
|
private resolveConnectionParams(): { url: string; headers: Record<string, string> } {
|
|
const cfg = this.config;
|
|
if (cfg.azureEndpoint && cfg.azureDeployment) {
|
|
const base = cfg.azureEndpoint
|
|
.replace(/\/$/, "")
|
|
.replace(/^http(s?):/, (_, secure: string) => `ws${secure}:`);
|
|
const apiVersion = cfg.azureApiVersion ?? "2024-10-01-preview";
|
|
return {
|
|
url: `${base}/openai/realtime?api-version=${apiVersion}&deployment=${encodeURIComponent(
|
|
cfg.azureDeployment,
|
|
)}`,
|
|
headers: { "api-key": cfg.apiKey },
|
|
};
|
|
}
|
|
|
|
if (cfg.azureEndpoint) {
|
|
const base = cfg.azureEndpoint
|
|
.replace(/\/$/, "")
|
|
.replace(/^http(s?):/, (_, secure: string) => `ws${secure}:`);
|
|
return {
|
|
url: `${base}/v1/realtime?model=${encodeURIComponent(
|
|
cfg.model ?? OpenAIRealtimeVoiceBridge.DEFAULT_MODEL,
|
|
)}`,
|
|
headers: { Authorization: `Bearer ${cfg.apiKey}` },
|
|
};
|
|
}
|
|
|
|
return {
|
|
url: `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(
|
|
cfg.model ?? OpenAIRealtimeVoiceBridge.DEFAULT_MODEL,
|
|
)}`,
|
|
headers: {
|
|
Authorization: `Bearer ${cfg.apiKey}`,
|
|
"OpenAI-Beta": "realtime=v1",
|
|
},
|
|
};
|
|
}
|
|
|
|
private async attemptReconnect(): Promise<void> {
|
|
if (this.intentionallyClosed) {
|
|
return;
|
|
}
|
|
if (this.reconnectAttempts >= OpenAIRealtimeVoiceBridge.MAX_RECONNECT_ATTEMPTS) {
|
|
this.config.onClose?.("error");
|
|
return;
|
|
}
|
|
this.reconnectAttempts += 1;
|
|
const delay =
|
|
OpenAIRealtimeVoiceBridge.BASE_RECONNECT_DELAY_MS * 2 ** (this.reconnectAttempts - 1);
|
|
await new Promise((resolve) => setTimeout(resolve, delay));
|
|
if (this.intentionallyClosed) {
|
|
return;
|
|
}
|
|
try {
|
|
await this.doConnect();
|
|
} catch (error) {
|
|
this.config.onError?.(error instanceof Error ? error : new Error(String(error)));
|
|
await this.attemptReconnect();
|
|
}
|
|
}
|
|
|
|
private sendSessionUpdate(): void {
|
|
const cfg = this.config;
|
|
const sessionUpdate: RealtimeSessionUpdate = {
|
|
type: "session.update",
|
|
session: {
|
|
modalities: ["text", "audio"],
|
|
instructions: cfg.instructions,
|
|
voice: cfg.voice ?? "alloy",
|
|
input_audio_format: "g711_ulaw",
|
|
output_audio_format: "g711_ulaw",
|
|
input_audio_transcription: {
|
|
model: "whisper-1",
|
|
},
|
|
turn_detection: {
|
|
type: "server_vad",
|
|
threshold: cfg.vadThreshold ?? 0.5,
|
|
prefix_padding_ms: cfg.prefixPaddingMs ?? 300,
|
|
silence_duration_ms: cfg.silenceDurationMs ?? 500,
|
|
create_response: true,
|
|
},
|
|
temperature: cfg.temperature ?? 0.8,
|
|
...(cfg.tools && cfg.tools.length > 0
|
|
? {
|
|
tools: cfg.tools,
|
|
tool_choice: "auto",
|
|
}
|
|
: {}),
|
|
},
|
|
};
|
|
this.sendEvent(sessionUpdate);
|
|
}
|
|
|
|
private handleEvent(event: RealtimeEvent): void {
|
|
switch (event.type) {
|
|
case "response.audio.delta": {
|
|
if (!event.delta) {
|
|
return;
|
|
}
|
|
const audio = base64ToBuffer(event.delta);
|
|
this.config.onAudio(audio);
|
|
if (this.responseStartTimestamp === null) {
|
|
this.responseStartTimestamp = this.latestMediaTimestamp;
|
|
}
|
|
if (event.item_id) {
|
|
this.lastAssistantItemId = event.item_id;
|
|
}
|
|
this.sendMark();
|
|
return;
|
|
}
|
|
|
|
case "input_audio_buffer.speech_started":
|
|
this.handleBargeIn();
|
|
return;
|
|
|
|
case "response.audio_transcript.delta":
|
|
if (event.delta) {
|
|
this.config.onTranscript?.("assistant", event.delta, false);
|
|
}
|
|
return;
|
|
|
|
case "response.audio_transcript.done":
|
|
if (event.transcript) {
|
|
this.config.onTranscript?.("assistant", event.transcript, true);
|
|
}
|
|
return;
|
|
|
|
case "conversation.item.input_audio_transcription.completed":
|
|
if (event.transcript) {
|
|
this.config.onTranscript?.("user", event.transcript, true);
|
|
}
|
|
return;
|
|
|
|
case "conversation.item.input_audio_transcription.delta":
|
|
if (event.delta) {
|
|
this.config.onTranscript?.("user", event.delta, false);
|
|
}
|
|
return;
|
|
|
|
case "response.function_call_arguments.delta": {
|
|
const key = event.item_id ?? "unknown";
|
|
const existing = this.toolCallBuffers.get(key);
|
|
if (existing && event.delta) {
|
|
existing.args += event.delta;
|
|
} else if (event.item_id) {
|
|
this.toolCallBuffers.set(event.item_id, {
|
|
name: event.name ?? "",
|
|
callId: event.call_id ?? "",
|
|
args: event.delta ?? "",
|
|
});
|
|
}
|
|
return;
|
|
}
|
|
|
|
case "response.function_call_arguments.done": {
|
|
const key = event.item_id ?? "unknown";
|
|
const buffered = this.toolCallBuffers.get(key);
|
|
if (this.config.onToolCall) {
|
|
const rawArgs =
|
|
buffered?.args ||
|
|
((event as unknown as Record<string, unknown>).arguments as string) ||
|
|
"{}";
|
|
let args: unknown = {};
|
|
try {
|
|
args = JSON.parse(rawArgs);
|
|
} catch {}
|
|
this.config.onToolCall({
|
|
itemId: key,
|
|
callId: buffered?.callId || event.call_id || "",
|
|
name: buffered?.name || event.name || "",
|
|
args,
|
|
});
|
|
}
|
|
this.toolCallBuffers.delete(key);
|
|
return;
|
|
}
|
|
|
|
case "error": {
|
|
const detail = readRealtimeErrorDetail(event.error);
|
|
this.config.onError?.(new Error(detail));
|
|
return;
|
|
}
|
|
|
|
default:
|
|
return;
|
|
}
|
|
}
|
|
|
|
private handleBargeIn(): void {
|
|
if (this.markQueue.length > 0 && this.responseStartTimestamp !== null) {
|
|
const elapsedMs = this.latestMediaTimestamp - this.responseStartTimestamp;
|
|
if (this.lastAssistantItemId) {
|
|
this.sendEvent({
|
|
type: "conversation.item.truncate",
|
|
item_id: this.lastAssistantItemId,
|
|
content_index: 0,
|
|
audio_end_ms: Math.max(0, elapsedMs),
|
|
});
|
|
}
|
|
this.config.onClearAudio();
|
|
this.markQueue = [];
|
|
this.lastAssistantItemId = null;
|
|
this.responseStartTimestamp = null;
|
|
return;
|
|
}
|
|
this.config.onClearAudio();
|
|
}
|
|
|
|
private sendMark(): void {
|
|
const markName = `audio-${Date.now()}`;
|
|
this.markQueue.push(markName);
|
|
this.config.onMark?.(markName);
|
|
}
|
|
|
|
private sendEvent(event: unknown): void {
|
|
if (this.ws?.readyState === WebSocket.OPEN) {
|
|
this.ws.send(JSON.stringify(event));
|
|
}
|
|
}
|
|
}
|
|
|
|
export function buildOpenAIRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin {
|
|
return {
|
|
id: "openai",
|
|
label: "OpenAI Realtime Voice",
|
|
autoSelectOrder: 10,
|
|
resolveConfig: ({ rawConfig }) => normalizeProviderConfig(rawConfig),
|
|
isConfigured: ({ providerConfig }) =>
|
|
Boolean(normalizeProviderConfig(providerConfig).apiKey || process.env.OPENAI_API_KEY),
|
|
createBridge: (req) => {
|
|
const config = normalizeProviderConfig(req.providerConfig);
|
|
const apiKey = config.apiKey || process.env.OPENAI_API_KEY;
|
|
if (!apiKey) {
|
|
throw new Error("OpenAI API key missing");
|
|
}
|
|
return new OpenAIRealtimeVoiceBridge({
|
|
...req,
|
|
apiKey,
|
|
model: config.model,
|
|
voice: config.voice,
|
|
temperature: config.temperature,
|
|
vadThreshold: config.vadThreshold,
|
|
silenceDurationMs: config.silenceDurationMs,
|
|
prefixPaddingMs: config.prefixPaddingMs,
|
|
azureEndpoint: config.azureEndpoint,
|
|
azureDeployment: config.azureDeployment,
|
|
azureApiVersion: config.azureApiVersion,
|
|
});
|
|
},
|
|
};
|
|
}
|
|
|
|
// Expose the normalized provider config type to external modules.
export type { OpenAIRealtimeVoiceProviderConfig };
|