feat: add realtime voice OAuth controls

This commit is contained in:
Colin
2026-05-07 19:40:09 -04:00
committed by Peter Steinberger
parent cd58e46223
commit 7a2a31dede
43 changed files with 2020 additions and 498 deletions

View File

@@ -99,6 +99,7 @@ Docs: https://docs.openclaw.ai
- Agents/compaction: keep contributor diagnostics to a bounded top-three selection without sorting the full history. Thanks @shakkernerd.
- Sessions/UI: avoid full-array sorting while selecting ACPX leases, Google Meet calendar events, and latest chat sessions. Thanks @shakkernerd.
- Plugin SDK: mark direct `deliverOutboundPayloads` and legacy reply-dispatch bridges as deprecated compatibility substrate, enrich `sendDurableMessageBatch` with explicit durable send outcomes, migrate bundled send/turn paths off deprecated APIs, and enforce the split with `check:deprecated-api-usage`.
- OpenAI/Talk: let browser realtime Talk, Gateway relay/Voice Call realtime bridges, and OpenAI realtime transcription use `openai-codex` OAuth when no direct API key is configured, make Google Meet `test_speech` honor `mode: "bidi"`, expose Control UI launch options for provider/model/voice/transport/VAD/reasoning, and update the default OpenAI realtime voice model to `gpt-realtime-2`.
- Telegram: preserve the channel-specific 10-option poll cap in the unified outbound adapter so over-limit polls are rejected before send. (#78762) Thanks @obviyus.
- Telegram/streaming: continue over-limit draft previews in a new message instead of stopping when rendered preview text crosses Telegram's message limit. (#74508) Thanks @anagnorisis2peripeteia.
- Slack: route handled top-level channel turns in implicit-conversation channels to thread-scoped sessions when Slack reply threading is enabled, keeping the root turn and later thread replies on one OpenClaw session. (#78522) Thanks @zeroth-blip.

View File

@@ -2983,6 +2983,10 @@ public struct TalkClientCreateParams: Codable, Sendable {
public let provider: String?
public let model: String?
public let voice: String?
public let vadthreshold: Double?
public let silencedurationms: Int?
public let prefixpaddingms: Int?
public let reasoningeffort: String?
public let mode: AnyCodable?
public let transport: AnyCodable?
public let brain: AnyCodable?
@@ -2992,6 +2996,10 @@ public struct TalkClientCreateParams: Codable, Sendable {
provider: String?,
model: String?,
voice: String?,
vadthreshold: Double?,
silencedurationms: Int?,
prefixpaddingms: Int?,
reasoningeffort: String?,
mode: AnyCodable?,
transport: AnyCodable?,
brain: AnyCodable?)
@@ -3000,6 +3008,10 @@ public struct TalkClientCreateParams: Codable, Sendable {
self.provider = provider
self.model = model
self.voice = voice
self.vadthreshold = vadthreshold
self.silencedurationms = silencedurationms
self.prefixpaddingms = prefixpaddingms
self.reasoningeffort = reasoningeffort
self.mode = mode
self.transport = transport
self.brain = brain
@@ -3010,6 +3022,10 @@ public struct TalkClientCreateParams: Codable, Sendable {
case provider
case model
case voice
case vadthreshold = "vadThreshold"
case silencedurationms = "silenceDurationMs"
case prefixpaddingms = "prefixPaddingMs"
case reasoningeffort = "reasoningEffort"
case mode
case transport
case brain
@@ -3163,6 +3179,10 @@ public struct TalkSessionCreateParams: Codable, Sendable {
public let provider: String?
public let model: String?
public let voice: String?
public let vadthreshold: Double?
public let silencedurationms: Int?
public let prefixpaddingms: Int?
public let reasoningeffort: String?
public let mode: AnyCodable?
public let transport: AnyCodable?
public let brain: AnyCodable?
@@ -3173,6 +3193,10 @@ public struct TalkSessionCreateParams: Codable, Sendable {
provider: String?,
model: String?,
voice: String?,
vadthreshold: Double?,
silencedurationms: Int?,
prefixpaddingms: Int?,
reasoningeffort: String?,
mode: AnyCodable?,
transport: AnyCodable?,
brain: AnyCodable?,
@@ -3182,6 +3206,10 @@ public struct TalkSessionCreateParams: Codable, Sendable {
self.provider = provider
self.model = model
self.voice = voice
self.vadthreshold = vadthreshold
self.silencedurationms = silencedurationms
self.prefixpaddingms = prefixpaddingms
self.reasoningeffort = reasoningeffort
self.mode = mode
self.transport = transport
self.brain = brain
@@ -3193,6 +3221,10 @@ public struct TalkSessionCreateParams: Codable, Sendable {
case provider
case model
case voice
case vadthreshold = "vadThreshold"
case silencedurationms = "silenceDurationMs"
case prefixpaddingms = "prefixPaddingMs"
case reasoningeffort = "reasoningEffort"
case mode
case transport
case brain

View File

@@ -576,7 +576,7 @@ Legacy `plugins.entries.openai.config.personality` is still read as a compatibil
```
<Note>
Set `OPENAI_TTS_BASE_URL` to override the TTS base URL without affecting the chat API endpoint.
Set `OPENAI_TTS_BASE_URL` to override the TTS base URL without affecting the chat API endpoint. OpenAI TTS is still configured through an API key; for OAuth-only live talk-back, use the Realtime voice path instead of agent-mode STT -> TTS speech.
</Note>
</Accordion>
@@ -627,10 +627,10 @@ Legacy `plugins.entries.openai.config.personality` is still read as a compatibil
| Prompt | `...openai.prompt` | (unset) |
| Silence duration | `...openai.silenceDurationMs` | `800` |
| VAD threshold | `...openai.vadThreshold` | `0.5` |
| API key | `...openai.apiKey` | Falls back to `OPENAI_API_KEY` |
| Auth | `...openai.apiKey`, `OPENAI_API_KEY`, or `openai-codex` OAuth | API keys connect directly; OAuth mints a Realtime transcription client secret |
<Note>
Uses a WebSocket connection to `wss://api.openai.com/v1/realtime` with G.711 u-law (`g711_ulaw` / `audio/pcmu`) audio. This streaming provider is for Voice Call's realtime transcription path; Discord voice currently records short segments and uses the batch `tools.media.audio` transcription path instead.
Uses a WebSocket connection to `wss://api.openai.com/v1/realtime` with G.711 u-law (`g711_ulaw` / `audio/pcmu`) audio. When only `openai-codex` OAuth is configured, the Gateway mints an ephemeral Realtime transcription client secret before opening the WebSocket. This streaming provider is for Voice Call's realtime transcription path; Discord voice currently records short segments and uses the batch `tools.media.audio` transcription path instead.
</Note>
</Accordion>
@@ -645,7 +645,9 @@ Legacy `plugins.entries.openai.config.personality` is still read as a compatibil
| Temperature (Azure deployment bridge) | `...openai.temperature` | `0.8` |
| VAD threshold | `...openai.vadThreshold` | `0.5` |
| Silence duration | `...openai.silenceDurationMs` | `500` |
| API key | `...openai.apiKey` | Falls back to `OPENAI_API_KEY` |
| Prefix padding | `...openai.prefixPaddingMs` | `300` |
| Reasoning effort | `...openai.reasoningEffort` | (unset) |
| Auth | `...openai.apiKey`, `OPENAI_API_KEY`, or `openai-codex` OAuth | Browser Talk and non-Azure backend bridges can use Codex OAuth |
Available built-in Realtime voices for `gpt-realtime-2`: `alloy`, `ash`,
`ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, `cedar`.
@@ -667,7 +669,11 @@ Legacy `plugins.entries.openai.config.personality` is still read as a compatibil
<Note>
Control UI Talk uses OpenAI browser realtime sessions with a Gateway-minted
ephemeral client secret and a direct browser WebRTC SDP exchange against the
OpenAI Realtime API. Maintainer live verification is available with
OpenAI Realtime API. When no direct OpenAI API key is configured, the
Gateway can mint that client secret with the selected `openai-codex` OAuth
profile. Gateway relay and Voice Call backend realtime WebSocket bridges use
the same OAuth fallback for native OpenAI endpoints. Maintainer live
verification is available with
`OPENAI_API_KEY=... GEMINI_API_KEY=... node --import tsx scripts/dev/realtime-talk-live-smoke.ts`;
the OpenAI legs verify both the backend WebSocket bridge and the browser
WebRTC SDP exchange without logging secrets.

View File

@@ -171,7 +171,9 @@ Imported themes are stored only in the current browser profile. They are not wri
</Accordion>
<Accordion title="Talk mode (browser realtime)">
Talk mode uses a registered realtime voice provider. Configure OpenAI with `talk.realtime.provider: "openai"` plus `talk.realtime.providers.openai.apiKey`, or configure Google with `talk.realtime.provider: "google"` plus `talk.realtime.providers.google.apiKey`. The browser never receives a standard provider API key. OpenAI receives an ephemeral Realtime client secret for WebRTC. Google Live receives a one-use constrained Live API auth token for a browser WebSocket session, with instructions and tool declarations locked into the token by the Gateway. Providers that only expose a backend realtime bridge run through the Gateway relay transport, so credentials and vendor sockets stay server-side while browser audio moves through authenticated Gateway RPCs. The Realtime session prompt is assembled by the Gateway; `talk.client.create` does not accept caller-provided instruction overrides.
Talk mode uses a registered realtime voice provider. Configure OpenAI with `talk.realtime.provider: "openai"` plus one of `talk.realtime.providers.openai.apiKey`, `OPENAI_API_KEY`, or an `openai-codex` OAuth profile; configure Google with `talk.realtime.provider: "google"` plus `talk.realtime.providers.google.apiKey`. The browser never receives a standard provider API key. For OpenAI, the browser receives an ephemeral Realtime client secret for WebRTC. For Google Live, the browser receives a one-use constrained Live API auth token for a browser WebSocket session, with instructions and tool declarations locked into the token by the Gateway. Providers that only expose a backend realtime bridge run through the Gateway relay transport, so credentials and vendor sockets stay server-side while browser audio moves through authenticated Gateway RPCs. The Realtime session prompt is assembled by the Gateway; `talk.client.create` does not accept caller-provided instruction overrides.
The Chat composer includes a Talk options button next to the Talk start/stop button. The options apply to the next Talk session and can override provider, transport, model, voice, reasoning effort, VAD threshold, silence duration, and prefix padding. When an option is blank, the Gateway uses configured defaults where available or the provider default. Selecting Gateway relay forces the backend relay path; selecting WebRTC keeps the session client-owned and fails instead of silently falling back to relay if the provider cannot create a browser session.
In the Chat composer, the Talk control is the waves button next to the microphone dictation button. When Talk starts, the composer status row shows `Connecting Talk...`, then `Talk live` while audio is connected, or `Asking OpenClaw...` while a realtime tool call is consulting the configured larger model through `talk.client.toolCall`.

View File

@@ -2562,7 +2562,7 @@ describe("google-meet plugin", () => {
expect(focusCall[3]).toEqual({ progress: false });
});
it("does not mutate realtime browser prompts when status is requested", async () => {
it("refreshes blocked realtime browser health read-only when status is requested", async () => {
let openedTab = false;
const { methods, nodesInvoke } = setup(
{
@@ -2574,7 +2574,22 @@ describe("google-meet plugin", () => {
const raw = params as { path?: string; body?: { url?: string; targetId?: string } };
if (command === "browser.proxy") {
if (raw.path === "/tabs") {
return { payload: { result: { running: true, tabs: [] } } };
return {
payload: {
result: {
running: true,
tabs: openedTab
? [
{
targetId: "tab-1",
title: "Meet",
url: "https://meet.google.com/abc-defg-hij",
},
]
: [],
},
},
};
}
if (raw.path === "/tabs/open") {
openedTab = true;
@@ -2621,6 +2636,7 @@ describe("google-meet plugin", () => {
const join = (await invokeGoogleMeetGatewayMethodForTest(methods, "googlemeet.join", {
url: "https://meet.google.com/abc-defg-hij",
})) as { session: { id: string } };
openedTab = true;
nodesInvoke.mockClear();
const status = (await invokeGoogleMeetGatewayMethodForTest(methods, "googlemeet.status", {
@@ -2628,11 +2644,23 @@ describe("google-meet plugin", () => {
})) as { session?: { chrome?: { health?: { manualActionRequired?: boolean } } } };
expect(status.session?.chrome?.health?.manualActionRequired).toBe(true);
expect(
nodesInvoke.mock.calls.some(
([params]) => requireRecord(params, "node invoke").command === "browser.proxy",
),
).toBe(false);
expect(nodesInvoke).toHaveBeenCalledWith(
expect.objectContaining({
command: "browser.proxy",
params: expect.objectContaining({
path: "/act",
body: expect.objectContaining({ targetId: "tab-1" }),
}),
}),
);
expect(nodesInvoke).not.toHaveBeenCalledWith(
expect.objectContaining({
command: "browser.proxy",
params: expect.objectContaining({
path: "/permissions/grant",
}),
}),
);
});
it("retries caption enable until the captions button is available", async () => {
@@ -3573,6 +3601,52 @@ describe("google-meet plugin", () => {
expect(result.speechOutputTimedOut).toBe(false);
});
it("uses the requested bidirectional realtime mode for test speech", async () => {
const runtime = new GoogleMeetRuntime({
config: resolveGoogleMeetConfig({ defaultMode: "agent" }),
fullConfig: {} as never,
runtime: {} as never,
logger: noopLogger,
});
const session: GoogleMeetSession = {
id: "meet_1",
url: "https://meet.google.com/abc-defg-hij",
transport: "chrome",
mode: "bidi",
state: "active",
createdAt: "2026-04-27T00:00:00.000Z",
updatedAt: "2026-04-27T00:00:00.000Z",
participantIdentity: "signed-in Google Chrome profile",
realtime: {
enabled: true,
strategy: "bidi",
provider: "openai",
toolPolicy: "safe-read-only",
},
chrome: {
audioBackend: "blackhole-2ch",
launched: true,
health: { audioOutputActive: true, lastOutputBytes: 10 },
},
notes: [],
};
vi.spyOn(runtime, "list").mockReturnValue([]);
const join = vi.spyOn(runtime, "join").mockResolvedValue({ session, spoken: true });
await runtime.testSpeech({
url: "https://meet.google.com/abc-defg-hij",
mode: "bidi",
message: "Say exactly: hello.",
});
expect(join).toHaveBeenCalledWith(
expect.objectContaining({
message: "Say exactly: hello.",
mode: "bidi",
}),
);
});
it("rejects observe-only mode for test speech", async () => {
const runtime = new GoogleMeetRuntime({
config: resolveGoogleMeetConfig({}),

View File

@@ -812,6 +812,69 @@ describe("google-meet CLI", () => {
}
});
it("delegates test speech mode to the gateway-owned runtime", async () => {
const callGatewayFromCli = vi.fn(async () => ({
createdSession: true,
spoken: true,
speechOutputVerified: true,
speechOutputTimedOut: false,
session: {
id: "meet_gateway",
url: "https://meet.google.com/abc-defg-hij",
state: "active",
transport: "chrome",
mode: "bidi",
participantIdentity: "signed-in Google Chrome profile",
createdAt: "2026-04-25T00:00:00.000Z",
updatedAt: "2026-04-25T00:00:01.000Z",
realtime: { enabled: true, strategy: "bidi", provider: "openai" },
notes: [],
},
}));
const ensureRuntime = vi.fn(async () => {
throw new Error("local runtime should not be loaded");
});
const stdout = captureStdout();
try {
await setupCli({
callGatewayFromCli,
ensureRuntime: ensureRuntime as unknown as () => Promise<GoogleMeetRuntime>,
}).parseAsync(
[
"googlemeet",
"test-speech",
"https://meet.google.com/abc-defg-hij",
"--transport",
"chrome",
"--mode",
"bidi",
"--message",
"Hello meeting",
],
{ from: "user" },
);
expect(callGatewayFromCli).toHaveBeenCalledWith(
"googlemeet.testSpeech",
{ json: true, timeout: expect.any(String) },
{
url: "https://meet.google.com/abc-defg-hij",
transport: "chrome",
mode: "bidi",
message: "Hello meeting",
},
{ progress: false },
);
expect(ensureRuntime).not.toHaveBeenCalled();
expect(JSON.parse(stdout.output())).toMatchObject({
createdSession: true,
session: { mode: "bidi" },
});
} finally {
stdout.restore();
}
});
it("runs a listen-first health probe", async () => {
const testListen = vi.fn(async () => ({
createdSession: true,

View File

@@ -229,6 +229,7 @@ export async function startNodeAgentAudioBridge(params: {
});
sttSession = resolved.provider.createSession({
cfg: params.fullConfig,
providerConfig: resolved.providerConfig,
onTranscript: (text) => {
const trimmed = text.trim();

View File

@@ -704,6 +704,7 @@ export async function startCommandAgentAudioBridge(params: {
});
sttSession = resolved.provider.createSession({
cfg: params.fullConfig,
providerConfig: resolved.providerConfig,
onTranscript: (text) => {
const trimmed = text.trim();

View File

@@ -674,6 +674,13 @@ export class GoogleMeetRuntime {
"test_speech requires mode: agent or bidi; use join mode: transcribe for observe-only sessions.",
);
}
const requestedMode = request.mode ? resolveMode(request.mode, this.params.config) : undefined;
const mode =
requestedMode && isGoogleMeetTalkBackMode(requestedMode)
? requestedMode
: isGoogleMeetTalkBackMode(this.params.config.defaultMode)
? this.params.config.defaultMode
: "agent";
const url = normalizeMeetUrl(request.url);
const transport = resolveTransport(request.transport, this.params.config);
const beforeSessions = this.list();
@@ -690,7 +697,7 @@ export class GoogleMeetRuntime {
...request,
transport,
url,
mode: "agent",
mode,
message: request.message ?? "Say exactly: Google Meet speech test complete.",
});
let health = result.session.chrome?.health;
@@ -821,10 +828,6 @@ export class GoogleMeetRuntime {
async #refreshStatusHealthForSession(session: GoogleMeetSession) {
if (session.transport === "chrome" || session.transport === "chrome-node") {
if (session.chrome?.health?.manualActionRequired) {
this.#refreshSpeechReadiness(session);
return;
}
await this.#refreshBrowserHealthForChromeSession(session, { force: true, readOnly: true });
return;
}

View File

@@ -1,4 +1,9 @@
import {
createProviderHttpError,
resolveProviderRequestHeaders,
} from "openclaw/plugin-sdk/provider-http";
import { captureWsEvent } from "openclaw/plugin-sdk/proxy-capture";
import { fetchWithSsrFGuard } from "openclaw/plugin-sdk/ssrf-runtime";
import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime";
export const trimToUndefined = normalizeOptionalString;
@@ -56,3 +61,108 @@ export function captureOpenAIRealtimeWsClose(params: {
},
});
}
/**
 * Outcome of minting an OpenAI Realtime client secret.
 */
export type OpenAIRealtimeClientSecretResult = {
/** Trimmed, non-empty secret string extracted from the mint response. */
value: string;
/**
 * Expiry reported by the mint response; omitted when the response carried no
 * numeric `expires_at`.
 * NOTE(review): presumably a Unix timestamp in seconds — confirm against the
 * OpenAI Realtime API reference before doing arithmetic on it.
 */
expiresAt?: number;
};
/**
 * Internal description of one secret-minting HTTP request, consumed by
 * `createOpenAIRealtimeSecret`.
 */
type OpenAIRealtimeSecretRequest = {
/** Credential sent as `Authorization: Bearer <authToken>`. */
authToken: string;
/** Label forwarded to `fetchWithSsrFGuard` as its `auditContext`. */
auditContext: string;
/** Endpoint that is POSTed to; also used as `baseUrl` when resolving request headers. */
url: string;
/** Request payload; serialized with `JSON.stringify` before sending. */
body: unknown;
/** Message passed to `createProviderHttpError` when the response is not OK. */
errorMessage: string;
/** Message of the `Error` thrown when the response contains no secret value. */
missingValueMessage: string;
};
/**
 * Looks up `key` on `value` and returns it as a trimmed string.
 *
 * Yields `undefined` when `value` is not a non-null object, or when the field
 * is absent, not a string, or contains only whitespace.
 */
function readStringField(value: unknown, key: string): string | undefined {
  if (typeof value !== "object" || value === null) {
    return undefined;
  }
  const candidate = (value as Record<string, unknown>)[key];
  if (typeof candidate !== "string") {
    return undefined;
  }
  const trimmed = candidate.trim();
  return trimmed.length > 0 ? trimmed : undefined;
}
/**
 * POSTs a secret-minting request to an OpenAI Realtime endpoint and extracts
 * the resulting client secret.
 *
 * The secret is accepted either at the top level of the response (`value`) or
 * nested under `client_secret.value`. The expiry (`expires_at`) is looked up
 * in the same two places so it is not lost for responses that nest the secret.
 *
 * @param params Endpoint, credential, payload and error messages for the request.
 * @returns The trimmed secret value plus `expiresAt` when the response reports one.
 * @throws The error built by `createProviderHttpError` when the response is not OK.
 * @throws Error with `params.missingValueMessage` when no secret value is present.
 */
async function createOpenAIRealtimeSecret(
  params: OpenAIRealtimeSecretRequest,
): Promise<OpenAIRealtimeClientSecretResult> {
  const defaultHeaders = {
    Authorization: `Bearer ${params.authToken}`,
    "Content-Type": "application/json",
  };
  const { response, release } = await fetchWithSsrFGuard({
    url: params.url,
    init: {
      method: "POST",
      headers:
        resolveProviderRequestHeaders({
          provider: "openai",
          baseUrl: params.url,
          capability: "audio",
          transport: "http",
          defaultHeaders,
        }) ?? defaultHeaders,
      body: JSON.stringify(params.body),
    },
    auditContext: params.auditContext,
  });

  let payload: unknown;
  try {
    if (!response.ok) {
      throw await createProviderHttpError(response, params.errorMessage);
    }
    payload = (await response.json()) as unknown;
  } finally {
    // Always hand the guarded connection back, on success and on failure.
    await release();
  }

  const nestedSecret =
    payload && typeof payload === "object"
      ? (payload as Record<string, unknown>).client_secret
      : undefined;

  const clientSecret = readStringField(payload, "value") ?? readStringField(nestedSecret, "value");
  if (!clientSecret) {
    throw new Error(params.missingValueMessage);
  }

  // Mirror the value lookup: prefer the top-level expiry, fall back to the nested one.
  const readExpiresAt = (source: unknown): number | undefined => {
    if (!source || typeof source !== "object") {
      return undefined;
    }
    const raw = (source as Record<string, unknown>).expires_at;
    return typeof raw === "number" ? raw : undefined;
  };
  const expiresAt = readExpiresAt(payload) ?? readExpiresAt(nestedSecret);

  return {
    value: clientSecret,
    ...(expiresAt !== undefined ? { expiresAt } : {}),
  };
}
/**
 * Mints an ephemeral OpenAI Realtime client secret for the given session.
 *
 * The session is sent wrapped as `{ session }` to the `client_secrets` endpoint.
 *
 * @param params Credential, audit label and session configuration to mint for.
 * @returns The minted secret as produced by `createOpenAIRealtimeSecret`.
 */
export async function createOpenAIRealtimeClientSecret(params: {
  authToken: string;
  auditContext: string;
  session: Record<string, unknown>;
}): Promise<OpenAIRealtimeClientSecretResult> {
  const request = {
    ...params,
    url: "https://api.openai.com/v1/realtime/client_secrets",
    body: { session: params.session },
    errorMessage: "OpenAI Realtime client secret failed",
    missingValueMessage: "OpenAI Realtime client secret response did not include a value",
  };
  return createOpenAIRealtimeSecret(request);
}
/**
 * Mints an ephemeral client secret for an OpenAI Realtime transcription session.
 *
 * The session configuration is sent as the request body itself (not wrapped)
 * to the `transcription_sessions` endpoint.
 *
 * @param params Credential, audit label and transcription session configuration.
 * @returns The minted secret as produced by `createOpenAIRealtimeSecret`.
 */
export async function createOpenAIRealtimeTranscriptionClientSecret(params: {
  authToken: string;
  auditContext: string;
  session: Record<string, unknown>;
}): Promise<OpenAIRealtimeClientSecretResult> {
  const request = {
    ...params,
    url: "https://api.openai.com/v1/realtime/transcription_sessions",
    body: params.session,
    errorMessage: "OpenAI Realtime transcription client secret failed",
    missingValueMessage:
      "OpenAI Realtime transcription client secret response did not include a value",
  };
  return createOpenAIRealtimeSecret(request);
}

View File

@@ -1,7 +1,7 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
import { buildOpenAIRealtimeTranscriptionProvider } from "./realtime-transcription-provider.js";
const { FakeWebSocket } = vi.hoisted(() => {
const { FakeWebSocket, providerAuthMocks, ssrfMocks } = vi.hoisted(() => {
type Listener = (...args: unknown[]) => void;
class MockWebSocket {
@@ -10,11 +10,15 @@ const { FakeWebSocket } = vi.hoisted(() => {
static instances: MockWebSocket[] = [];
readonly listeners = new Map<string, Listener[]>();
readonly headers?: Record<string, string>;
readonly url?: string;
readyState = 0;
sent: string[] = [];
closed = false;
constructor() {
constructor(url?: string, options?: { headers?: Record<string, string> }) {
this.url = url;
this.headers = options?.headers;
MockWebSocket.instances.push(this);
}
@@ -42,40 +46,59 @@ const { FakeWebSocket } = vi.hoisted(() => {
}
}
return { FakeWebSocket: MockWebSocket };
return {
FakeWebSocket: MockWebSocket,
providerAuthMocks: {
isProviderAuthProfileConfigured: vi.fn(),
resolveProviderAuthProfileApiKey: vi.fn(),
},
ssrfMocks: {
fetchWithSsrFGuard: vi.fn(),
},
};
});
vi.mock("ws", () => ({
default: FakeWebSocket,
}));
vi.mock("openclaw/plugin-sdk/provider-auth", () => ({
isProviderAuthProfileConfigured: providerAuthMocks.isProviderAuthProfileConfigured,
resolveProviderAuthProfileApiKey: providerAuthMocks.resolveProviderAuthProfileApiKey,
}));
vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({
fetchWithSsrFGuard: ssrfMocks.fetchWithSsrFGuard,
}));
type FakeWebSocketInstance = InstanceType<typeof FakeWebSocket>;
type SentRealtimeEvent = {
type: string;
audio?: string;
session?: {
input_audio_format?: string;
input_audio_transcription?: {
model?: string;
language?: string;
prompt?: string;
};
turn_detection?: {
type?: string;
threshold?: number;
prefix_padding_ms?: number;
silence_duration_ms?: number;
};
};
session?: unknown;
};
/** Decodes every JSON payload recorded in the fake socket's `sent` list, in send order. */
function parseSent(socket: FakeWebSocketInstance): SentRealtimeEvent[] {
  const events: SentRealtimeEvent[] = [];
  for (const raw of socket.sent) {
    events.push(JSON.parse(raw) as SentRealtimeEvent);
  }
  return events;
}
/**
 * Polls for the first fake WebSocket instance, yielding to the timer queue
 * between checks, and gives up after 20 checks.
 *
 * @throws Error when no socket has been created after the final check.
 */
async function waitForFakeSocket(): Promise<FakeWebSocketInstance> {
  let remainingChecks = 20;
  while (remainingChecks > 0) {
    const [firstSocket] = FakeWebSocket.instances;
    if (firstSocket) {
      return firstSocket;
    }
    remainingChecks -= 1;
    await new Promise((resolve) => setTimeout(resolve, 0));
  }
  throw new Error("expected session to create a websocket");
}
describe("buildOpenAIRealtimeTranscriptionProvider", () => {
beforeEach(() => {
FakeWebSocket.instances = [];
providerAuthMocks.isProviderAuthProfileConfigured.mockReset();
providerAuthMocks.resolveProviderAuthProfileApiKey.mockReset();
ssrfMocks.fetchWithSsrFGuard.mockReset();
});
it("normalizes OpenAI config defaults", () => {
@@ -147,6 +170,83 @@ describe("buildOpenAIRealtimeTranscriptionProvider", () => {
expect(provider.aliases).toContain("openai-realtime");
});
it("treats a Codex OAuth profile as configured when no API key is present", () => {
const provider = buildOpenAIRealtimeTranscriptionProvider();
const cfg = { auth: { order: { "openai-codex": ["openai-codex:default"] } } };
providerAuthMocks.isProviderAuthProfileConfigured.mockReturnValue(true);
expect(provider.isConfigured({ cfg: cfg as never, providerConfig: {} })).toBe(true);
expect(providerAuthMocks.isProviderAuthProfileConfigured).toHaveBeenCalledWith({
provider: "openai-codex",
cfg,
});
});
it("mints a Codex OAuth client secret for realtime transcription sockets", async () => {
const provider = buildOpenAIRealtimeTranscriptionProvider();
const release = vi.fn();
providerAuthMocks.resolveProviderAuthProfileApiKey.mockResolvedValue("oauth-token");
ssrfMocks.fetchWithSsrFGuard.mockResolvedValue({
response: new Response(JSON.stringify({ value: "ek-test" }), { status: 200 }),
release,
});
const cfg = { auth: { order: { "openai-codex": ["openai-codex:default"] } } };
const session = provider.createSession({
cfg: cfg as never,
providerConfig: {},
});
const connecting = session.connect();
const socket = await waitForFakeSocket();
expect(socket.headers).toMatchObject({ Authorization: "Bearer ek-test" });
expect(providerAuthMocks.resolveProviderAuthProfileApiKey).toHaveBeenCalledWith({
provider: "openai-codex",
cfg,
});
expect(ssrfMocks.fetchWithSsrFGuard).toHaveBeenCalledWith(
expect.objectContaining({
auditContext: "openai-realtime-transcription-session",
url: "https://api.openai.com/v1/realtime/transcription_sessions",
init: expect.objectContaining({
method: "POST",
headers: expect.objectContaining({
Authorization: "Bearer oauth-token",
"Content-Type": "application/json",
}),
body: expect.any(String),
}),
}),
);
const request = ssrfMocks.fetchWithSsrFGuard.mock.calls[0]?.[0] as
| { init?: { body?: unknown } }
| undefined;
expect(JSON.parse(String(request?.init?.body))).toMatchObject({
type: "transcription",
audio: {
input: {
format: { type: "audio/pcmu" },
transcription: { model: "gpt-4o-transcribe" },
},
},
});
socket.readyState = FakeWebSocket.OPEN;
socket.emit("open");
socket.emit("message", Buffer.from(JSON.stringify({ type: "transcription_session.updated" })));
await connecting;
expect(release).toHaveBeenCalled();
expect(parseSent(socket)[0]).toMatchObject({
type: "transcription_session.update",
session: {
input_audio_format: "g711_ulaw",
input_audio_transcription: { model: "gpt-4o-transcribe" },
},
});
session.close();
});
it("waits for the OpenAI session update before draining audio", async () => {
const provider = buildOpenAIRealtimeTranscriptionProvider();
const session = provider.createSession({
@@ -161,10 +261,7 @@ describe("buildOpenAIRealtimeTranscriptionProvider", () => {
});
const connecting = session.connect();
const socket = FakeWebSocket.instances[0];
if (!socket) {
throw new Error("expected session to create a websocket");
}
const socket = await waitForFakeSocket();
socket.readyState = FakeWebSocket.OPEN;
socket.emit("open");

View File

@@ -1,3 +1,8 @@
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types";
import {
isProviderAuthProfileConfigured,
resolveProviderAuthProfileApiKey,
} from "openclaw/plugin-sdk/provider-auth";
import { resolveProviderRequestHeaders } from "openclaw/plugin-sdk/provider-http";
import {
createRealtimeTranscriptionWebSocketSession,
@@ -10,6 +15,7 @@ import {
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import {
asFiniteNumber,
createOpenAIRealtimeTranscriptionClientSecret,
readRealtimeErrorDetail,
resolveOpenAIProviderConfigRecord,
trimToUndefined,
@@ -25,7 +31,8 @@ type OpenAIRealtimeTranscriptionProviderConfig = {
};
type OpenAIRealtimeTranscriptionSessionConfig = RealtimeTranscriptionSessionCreateRequest & {
apiKey: string;
apiKey?: string;
cfg?: OpenClawConfig;
language?: string;
model: string;
prompt?: string;
@@ -40,6 +47,41 @@ type RealtimeEvent = {
error?: unknown;
};
type OpenAIRealtimeTranscriptionSessionCreate = {
type: "transcription";
audio: {
input: {
format: { type: "audio/pcmu" };
transcription: {
model: string;
language?: string;
prompt?: string;
};
turn_detection: {
type: "server_vad";
threshold: number;
prefix_padding_ms: number;
silence_duration_ms: number;
};
};
};
};
type OpenAIRealtimeTranscriptionSessionUpdate = {
input_audio_format: "g711_ulaw";
input_audio_transcription: {
model: string;
language?: string;
prompt?: string;
};
turn_detection: {
type: "server_vad";
threshold: number;
prefix_padding_ms: number;
silence_duration_ms: number;
};
};
const OPENAI_REALTIME_TRANSCRIPTION_URL = "wss://api.openai.com/v1/realtime?intent=transcription";
const OPENAI_REALTIME_TRANSCRIPTION_CONNECT_TIMEOUT_MS = 10_000;
const OPENAI_REALTIME_TRANSCRIPTION_MAX_RECONNECT_ATTEMPTS = 5;
@@ -68,6 +110,71 @@ function normalizeProviderConfig(
};
}
/** Fixed prefix padding (ms) sent with server VAD for transcription sessions. */
const OPENAI_REALTIME_TRANSCRIPTION_PREFIX_PADDING_MS = 300;

/**
 * Builds the transcription options shared by the session-create and
 * session-update payloads. `language` and `prompt` are only emitted when set.
 */
function buildOpenAIRealtimeTranscriptionOptions(
  config: OpenAIRealtimeTranscriptionSessionConfig,
): OpenAIRealtimeTranscriptionSessionUpdate["input_audio_transcription"] {
  return {
    model: config.model,
    ...(config.language ? { language: config.language } : {}),
    ...(config.prompt ? { prompt: config.prompt } : {}),
  };
}

/** Builds the server-VAD turn detection settings shared by both payload shapes. */
function buildOpenAIRealtimeTranscriptionTurnDetection(
  config: OpenAIRealtimeTranscriptionSessionConfig,
): OpenAIRealtimeTranscriptionSessionUpdate["turn_detection"] {
  return {
    type: "server_vad",
    threshold: config.vadThreshold,
    prefix_padding_ms: OPENAI_REALTIME_TRANSCRIPTION_PREFIX_PADDING_MS,
    silence_duration_ms: config.silenceDurationMs,
  };
}

/**
 * Builds the nested (`audio.input.*`) session payload used when minting a
 * Realtime transcription client secret.
 *
 * @param config Normalized transcription session settings.
 * @returns Payload declaring `audio/pcmu` input, transcription options and server VAD.
 */
function buildOpenAIRealtimeTranscriptionSessionCreateConfig(
  config: OpenAIRealtimeTranscriptionSessionConfig,
): OpenAIRealtimeTranscriptionSessionCreate {
  return {
    type: "transcription",
    audio: {
      input: {
        format: { type: "audio/pcmu" },
        transcription: buildOpenAIRealtimeTranscriptionOptions(config),
        turn_detection: buildOpenAIRealtimeTranscriptionTurnDetection(config),
      },
    },
  };
}

/**
 * Builds the flat session payload sent in `transcription_session.update`
 * once the WebSocket is open.
 *
 * @param config Normalized transcription session settings.
 * @returns Payload declaring `g711_ulaw` input, transcription options and server VAD.
 */
function buildOpenAIRealtimeTranscriptionSessionUpdateConfig(
  config: OpenAIRealtimeTranscriptionSessionConfig,
): OpenAIRealtimeTranscriptionSessionUpdate {
  return {
    input_audio_format: "g711_ulaw",
    input_audio_transcription: buildOpenAIRealtimeTranscriptionOptions(config),
    turn_detection: buildOpenAIRealtimeTranscriptionTurnDetection(config),
  };
}
/**
 * Resolves the bearer credential for the Realtime transcription WebSocket.
 *
 * A directly configured API key (`config.apiKey`, then `OPENAI_API_KEY`) is
 * returned as-is. Otherwise the `openai-codex` auth profile token is used to
 * mint a transcription client secret, and that secret's value is returned.
 *
 * @param config Normalized transcription session settings, including `cfg` for profile lookup.
 * @returns The API key or the minted client secret value.
 * @throws Error when neither an API key nor a Codex OAuth token is available.
 */
async function resolveOpenAIRealtimeTranscriptionAuthorization(
  config: OpenAIRealtimeTranscriptionSessionConfig,
): Promise<string> {
  const directKey = config.apiKey ? config.apiKey : process.env.OPENAI_API_KEY;
  if (directKey) {
    return directKey;
  }

  const oauthToken = await resolveProviderAuthProfileApiKey({
    provider: "openai-codex",
    cfg: config.cfg,
  });
  if (!oauthToken) {
    throw new Error("OpenAI API key or Codex OAuth missing");
  }

  const sessionPayload = buildOpenAIRealtimeTranscriptionSessionCreateConfig(config);
  const { value } = await createOpenAIRealtimeTranscriptionClientSecret({
    authToken: oauthToken,
    auditContext: "openai-realtime-transcription-session",
    session: sessionPayload,
  });
  return value;
}
function createOpenAIRealtimeTranscriptionSession(
config: OpenAIRealtimeTranscriptionSessionConfig,
): RealtimeTranscriptionSession {
@@ -122,18 +229,21 @@ function createOpenAIRealtimeTranscriptionSession(
providerId: "openai",
callbacks: config,
url: OPENAI_REALTIME_TRANSCRIPTION_URL,
headers: resolveProviderRequestHeaders({
provider: "openai",
baseUrl: OPENAI_REALTIME_TRANSCRIPTION_URL,
capability: "audio",
transport: "websocket",
defaultHeaders: {
Authorization: `Bearer ${config.apiKey}`,
"OpenAI-Beta": "realtime=v1",
},
}) ?? {
Authorization: `Bearer ${config.apiKey}`,
"OpenAI-Beta": "realtime=v1",
headers: async () => {
const bearer = await resolveOpenAIRealtimeTranscriptionAuthorization(config);
return (
resolveProviderRequestHeaders({
provider: "openai",
baseUrl: OPENAI_REALTIME_TRANSCRIPTION_URL,
capability: "audio",
transport: "websocket",
defaultHeaders: {
Authorization: `Bearer ${bearer}`,
},
}) ?? {
Authorization: `Bearer ${bearer}`,
}
);
},
connectTimeoutMs: OPENAI_REALTIME_TRANSCRIPTION_CONNECT_TIMEOUT_MS,
maxReconnectAttempts: OPENAI_REALTIME_TRANSCRIPTION_MAX_RECONNECT_ATTEMPTS,
@@ -150,20 +260,7 @@ function createOpenAIRealtimeTranscriptionSession(
onOpen: (transport: RealtimeTranscriptionWebSocketTransport) => {
transport.sendJson({
type: "transcription_session.update",
session: {
input_audio_format: "g711_ulaw",
input_audio_transcription: {
model: config.model,
...(config.language ? { language: config.language } : {}),
...(config.prompt ? { prompt: config.prompt } : {}),
},
turn_detection: {
type: "server_vad",
threshold: config.vadThreshold,
prefix_padding_ms: 300,
silence_duration_ms: config.silenceDurationMs,
},
},
session: buildOpenAIRealtimeTranscriptionSessionUpdateConfig(config),
});
},
onMessage: handleEvent,
@@ -178,17 +275,17 @@ export function buildOpenAIRealtimeTranscriptionProvider(): RealtimeTranscriptio
defaultModel: OPENAI_REALTIME_TRANSCRIPTION_DEFAULT_MODEL,
autoSelectOrder: 10,
resolveConfig: ({ rawConfig }) => normalizeProviderConfig(rawConfig),
isConfigured: ({ providerConfig }) =>
Boolean(normalizeProviderConfig(providerConfig).apiKey || process.env.OPENAI_API_KEY),
isConfigured: ({ cfg, providerConfig }) =>
Boolean(
normalizeProviderConfig(providerConfig).apiKey ||
process.env.OPENAI_API_KEY ||
isProviderAuthProfileConfigured({ provider: "openai-codex", cfg }),
),
createSession: (req) => {
const config = normalizeProviderConfig(req.providerConfig);
const apiKey = config.apiKey || process.env.OPENAI_API_KEY;
if (!apiKey) {
throw new Error("OpenAI API key missing");
}
return createOpenAIRealtimeTranscriptionSession({
...req,
apiKey,
apiKey: config.apiKey,
language: config.language,
model: config.model ?? OPENAI_REALTIME_TRANSCRIPTION_DEFAULT_MODEL,
prompt: config.prompt,

View File

@@ -2,7 +2,13 @@ import { REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ } from "openclaw/plugin-sdk/rea
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { buildOpenAIRealtimeVoiceProvider } from "./realtime-voice-provider.js";
const { FakeWebSocket, execFileSyncMock, fetchWithSsrFGuardMock } = vi.hoisted(() => {
const {
FakeWebSocket,
execFileSyncMock,
fetchWithSsrFGuardMock,
isProviderAuthProfileConfiguredMock,
resolveProviderAuthProfileApiKeyMock,
} = vi.hoisted(() => {
type Listener = (...args: unknown[]) => void;
class MockWebSocket {
@@ -55,6 +61,8 @@ const { FakeWebSocket, execFileSyncMock, fetchWithSsrFGuardMock } = vi.hoisted((
FakeWebSocket: MockWebSocket,
execFileSyncMock: vi.fn(),
fetchWithSsrFGuardMock: vi.fn(),
isProviderAuthProfileConfiguredMock: vi.fn(),
resolveProviderAuthProfileApiKeyMock: vi.fn(),
};
});
@@ -74,6 +82,11 @@ vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({
fetchWithSsrFGuard: fetchWithSsrFGuardMock,
}));
vi.mock("openclaw/plugin-sdk/provider-auth", () => ({
isProviderAuthProfileConfigured: isProviderAuthProfileConfiguredMock,
resolveProviderAuthProfileApiKey: resolveProviderAuthProfileApiKeyMock,
}));
type FakeWebSocketInstance = InstanceType<typeof FakeWebSocket>;
type SentRealtimeEvent = {
type: string;
@@ -82,8 +95,14 @@ type SentRealtimeEvent = {
content_index?: number;
audio_end_ms?: number;
session?: {
type?: string;
model?: string;
modalities?: string[];
instructions?: string;
voice?: string;
input_audio_format?: string;
output_audio_format?: string;
input_audio_transcription?: Record<string, unknown>;
turn_detection?: {
create_response?: boolean;
};
@@ -100,6 +119,7 @@ type SentRealtimeEvent = {
};
output?: {
format?: Record<string, unknown>;
voice?: string;
};
};
item?: unknown;
@@ -124,6 +144,10 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
FakeWebSocket.instances = [];
execFileSyncMock.mockReset();
fetchWithSsrFGuardMock.mockReset();
isProviderAuthProfileConfiguredMock.mockReset();
isProviderAuthProfileConfiguredMock.mockReturnValue(false);
resolveProviderAuthProfileApiKeyMock.mockReset();
resolveProviderAuthProfileApiKeyMock.mockResolvedValue(undefined);
});
afterEach(() => {
@@ -184,32 +208,100 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
expect(options?.headers).not.toHaveProperty("OpenAI-Beta");
});
it("keeps Azure deployment realtime bridge requests on the deployment-compatible session shape", () => {
it("mints an ephemeral Realtime secret for native websocket bridges when using Codex OAuth", async () => {
resolveProviderAuthProfileApiKeyMock.mockResolvedValueOnce("oauth-token");
fetchWithSsrFGuardMock.mockResolvedValueOnce({
response: createJsonResponse({
client_secret: { value: "ephemeral-realtime-secret" },
}),
release: vi.fn(async () => undefined),
});
const provider = buildOpenAIRealtimeVoiceProvider();
const bridge = provider.createBridge({
providerConfig: {
apiKey: "sk-test", // pragma: allowlist secret
azureEndpoint: "https://example.openai.azure.com",
azureDeployment: "realtime-prod",
},
cfg: {} as never,
providerConfig: { model: "gpt-realtime-2" },
onAudio: vi.fn(),
onClearAudio: vi.fn(),
});
void bridge.connect();
const socket = FakeWebSocket.instances[0];
if (!socket) {
throw new Error("expected bridge to create a websocket");
}
socket.readyState = FakeWebSocket.OPEN;
socket.emit("open");
await vi.waitFor(() => expect(FakeWebSocket.instances.length).toBe(1));
bridge.close();
expect(parseSent(socket)[0]?.session).toMatchObject({
modalities: ["text", "audio"],
input_audio_format: "g711_ulaw",
output_audio_format: "g711_ulaw",
expect(resolveProviderAuthProfileApiKeyMock).toHaveBeenCalledWith({
provider: "openai-codex",
cfg: {},
});
expect(fetchWithSsrFGuardMock).toHaveBeenCalledWith(
expect.objectContaining({
url: "https://api.openai.com/v1/realtime/client_secrets",
init: expect.objectContaining({
method: "POST",
headers: expect.objectContaining({
Authorization: "Bearer oauth-token", // pragma: allowlist secret
"Content-Type": "application/json",
}),
}),
auditContext: "openai-realtime-bridge-session",
}),
);
const request = fetchWithSsrFGuardMock.mock.calls[0]?.[0] as
| { init?: { body?: string } }
| undefined;
const body = JSON.parse(request?.init?.body ?? "{}") as {
session?: {
type?: string;
model?: string;
audio?: { output?: { voice?: string } };
};
};
expect(body.session).toMatchObject({
type: "realtime",
model: "gpt-realtime-2",
audio: { output: { voice: "alloy" } },
});
const socket = FakeWebSocket.instances[0];
const options = socket?.args[1] as { headers?: Record<string, string> } | undefined;
expect(options?.headers?.Authorization).toBe("Bearer ephemeral-realtime-secret");
expect(options?.headers).not.toHaveProperty("OpenAI-Beta");
});
// Regression test: when the client-secret request is still pending at the time
// connect() rejects with a timeout, completing that request afterwards must not
// construct a websocket.
it("does not open a native websocket after slow OAuth resolution times out", async () => {
// Fake timers let the test move past the connect timeout without real waiting.
vi.useFakeTimers();
resolveProviderAuthProfileApiKeyMock.mockResolvedValueOnce("oauth-token");
// Holds the resolver of the pending client-secret request so it can be completed later.
let resolveClientSecret: (value: {
response: Response;
release: () => Promise<void>;
}) => void = () => {};
fetchWithSsrFGuardMock.mockReturnValueOnce(
new Promise((resolve) => {
resolveClientSecret = resolve;
}),
);
const provider = buildOpenAIRealtimeVoiceProvider();
const bridge = provider.createBridge({
cfg: {} as never,
providerConfig: { model: "gpt-realtime-2" },
onAudio: vi.fn(),
onClearAudio: vi.fn(),
});
// The rejection expectation is attached before timers advance, so the rejection always has a handler.
const connecting = expect(bridge.connect()).rejects.toThrow(
"OpenAI realtime connection timeout",
);
// NOTE(review): 10_000 ms presumably covers the bridge's connect timeout — confirm against CONNECT_TIMEOUT_MS.
await vi.advanceTimersByTimeAsync(10_000);
await connecting;
// Complete the client-secret request only after connect() has already rejected.
resolveClientSecret({
response: createJsonResponse({
client_secret: { value: "ephemeral-realtime-secret" },
}),
release: vi.fn(async () => undefined),
});
await vi.runAllTimersAsync();
// The late secret must not have produced a websocket instance.
expect(FakeWebSocket.instances).toHaveLength(0);
bridge.close();
});
it("returns browser-safe OpenClaw attribution headers for native WebRTC offers", async () => {
@@ -229,6 +321,7 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
const session = await provider.createBrowserSession({
providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret
instructions: "Be concise.",
voice: " Marin ",
});
expect(fetchWithSsrFGuardMock).toHaveBeenCalledWith(
@@ -257,7 +350,9 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
turn_detection?: Record<string, unknown>;
transcription?: Record<string, unknown>;
};
output?: Record<string, unknown>;
};
reasoning?: Record<string, unknown>;
};
};
expect(body.session?.model).toBe("gpt-realtime-2");
@@ -270,6 +365,8 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
},
transcription: { model: "gpt-4o-mini-transcribe" },
});
expect(body.session?.audio?.output).toEqual({ voice: "marin" });
expect(body.session).not.toHaveProperty("temperature");
expect(session).toMatchObject({
provider: "openai",
transport: "webrtc",
@@ -359,20 +456,84 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
expect(execFileSyncMock).not.toHaveBeenCalled();
});
it("fails closed when keychain refs cannot be resolved", () => {
// With no API key in providerConfig, isConfigured() should report true when the
// auth-profile check (mocked here) says an openai-codex profile exists.
it("treats OpenAI Codex OAuth profiles as configured for browser realtime sessions", () => {
isProviderAuthProfileConfiguredMock.mockReturnValue(true);
const provider = buildOpenAIRealtimeVoiceProvider();
const cfg = { agents: { defaults: {} } } as never;
expect(provider.isConfigured({ cfg, providerConfig: {} })).toBe(true);
// The lookup must target the openai-codex provider and receive the caller's cfg object.
expect(isProviderAuthProfileConfiguredMock).toHaveBeenCalledWith({
provider: "openai-codex",
cfg,
});
});
// Azure endpoint + deployment configs without an API key must stay unconfigured
// even when an openai-codex auth profile is reported as available.
it("does not use Codex OAuth to configure Azure realtime sessions", () => {
isProviderAuthProfileConfiguredMock.mockReturnValue(true);
const provider = buildOpenAIRealtimeVoiceProvider();
const cfg = { agents: { defaults: {} } } as never;
expect(
provider.isConfigured({
cfg,
providerConfig: {
azureEndpoint: "https://example.openai.azure.com",
azureDeployment: "realtime",
},
}),
).toBe(false);
// The auth-profile check should not even be consulted for Azure deployment configs.
expect(isProviderAuthProfileConfiguredMock).not.toHaveBeenCalled();
});
// With an empty providerConfig, createBrowserSession() should fetch the
// openai-codex auth profile token and send it as the Bearer credential on the
// guarded fetch request.
it("uses OpenAI Codex OAuth to mint browser realtime client secrets when no API key is set", async () => {
resolveProviderAuthProfileApiKeyMock.mockResolvedValueOnce("oauth-realtime-token");
// Stubbed response for the guarded fetch; shaped like a client-secret payload.
fetchWithSsrFGuardMock.mockResolvedValueOnce({
response: createJsonResponse({
client_secret: { value: "client-secret-123" },
}),
release: vi.fn(async () => undefined),
});
const provider = buildOpenAIRealtimeVoiceProvider();
if (!provider.createBrowserSession) {
throw new Error("expected OpenAI realtime provider to support browser sessions");
}
const cfg = { agents: { defaults: {} } } as never;
await provider.createBrowserSession({
cfg,
providerConfig: {},
instructions: "Be concise.",
});
// The token lookup must use the openai-codex provider and the caller's cfg.
expect(resolveProviderAuthProfileApiKeyMock).toHaveBeenCalledWith({
provider: "openai-codex",
cfg,
});
// The OAuth token, not an API key, is what authorizes the outgoing request.
expect(fetchWithSsrFGuardMock).toHaveBeenCalledWith(
expect.objectContaining({
init: expect.objectContaining({
headers: expect.objectContaining({
Authorization: "Bearer oauth-realtime-token", // pragma: allowlist secret
}),
}),
}),
);
});
it("fails closed when keychain refs cannot be resolved", async () => {
vi.stubEnv("OPENAI_API_KEY", "keychain:openclaw:OPENAI_REALTIME_MISSING_TEST");
execFileSyncMock.mockImplementationOnce(() => {
throw new Error("keychain unavailable");
});
const provider = buildOpenAIRealtimeVoiceProvider();
expect(() =>
provider.createBridge({
providerConfig: {},
onAudio: vi.fn(),
onClearAudio: vi.fn(),
}),
).toThrow("OpenAI API key missing");
const bridge = provider.createBridge({
providerConfig: {},
onAudio: vi.fn(),
onClearAudio: vi.fn(),
});
await expect(bridge.connect()).rejects.toThrow("OpenAI API key or Codex OAuth missing");
});
it("normalizes provider-owned voice settings from raw provider config", () => {
@@ -383,10 +544,11 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
providers: {
openai: {
model: "gpt-realtime-2",
voice: "verse",
voice: " Verse ",
temperature: 0.6,
silenceDurationMs: 850,
vadThreshold: 0.35,
reasoningEffort: "low",
},
},
},
@@ -398,6 +560,7 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
temperature: 0.6,
silenceDurationMs: 850,
vadThreshold: 0.35,
reasoningEffort: "low",
});
});
@@ -443,6 +606,7 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
},
output: {
format: { type: "audio/pcmu" },
voice: "alloy",
},
},
});
@@ -461,6 +625,53 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
expect(bridge.isConnected()).toBe(true);
});
// Bridges configured with both azureEndpoint and azureDeployment must connect to
// the deployment websocket URL and send the flat session shape (no `type`, no
// nested `audio` object) in their first sent event.
it("keeps Azure deployment bridges on deployment-compatible session payloads", async () => {
const provider = buildOpenAIRealtimeVoiceProvider();
const bridge = provider.createBridge({
providerConfig: {
apiKey: "sk-test", // pragma: allowlist secret
azureEndpoint: "https://example.openai.azure.com/",
azureDeployment: "realtime-prod",
azureApiVersion: "2024-10-01-preview",
voice: "verse",
},
audioFormat: REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
instructions: "Be helpful.",
onAudio: vi.fn(),
onClearAudio: vi.fn(),
});
const connecting = bridge.connect();
const socket = FakeWebSocket.instances[0];
if (!socket) {
throw new Error("expected bridge to create a websocket");
}
// The configured endpoint has a trailing slash and an https scheme; the expected URL has neither.
expect(socket.args[0]).toBe(
"wss://example.openai.azure.com/openai/realtime?api-version=2024-10-01-preview&deployment=realtime-prod",
);
// Simulate the socket opening so the bridge sends its first event.
socket.readyState = FakeWebSocket.OPEN;
socket.emit("open");
await Promise.resolve();
const session = parseSent(socket)[0]?.session;
expect(session).toMatchObject({
modalities: ["text", "audio"],
instructions: "Be helpful.",
voice: "verse",
input_audio_format: "pcm16",
output_audio_format: "pcm16",
input_audio_transcription: { model: "whisper-1" },
turn_detection: { create_response: true },
temperature: 0.8,
});
// Fields of the non-Azure session shape must be absent from the Azure payload.
expect(session).not.toHaveProperty("type");
expect(session).not.toHaveProperty("audio");
// Emit session.updated so the pending connect() promise can settle before the test ends.
socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" })));
await connecting;
});
it("rejects connection when session configuration fails before readiness", async () => {
const provider = buildOpenAIRealtimeVoiceProvider();
const bridge = provider.createBridge({

View File

@@ -1,9 +1,10 @@
import { execFileSync } from "node:child_process";
import { randomUUID } from "node:crypto";
import {
createProviderHttpError,
resolveProviderRequestHeaders,
} from "openclaw/plugin-sdk/provider-http";
isProviderAuthProfileConfigured,
resolveProviderAuthProfileApiKey,
} from "openclaw/plugin-sdk/provider-auth";
import { resolveProviderRequestHeaders } from "openclaw/plugin-sdk/provider-http";
import {
captureWsEvent,
createDebugProxyWebSocketAgent,
@@ -29,11 +30,11 @@ import {
normalizeResolvedSecretInputString,
normalizeSecretInputString,
} from "openclaw/plugin-sdk/secret-input";
import { fetchWithSsrFGuard } from "openclaw/plugin-sdk/ssrf-runtime";
import WebSocket from "ws";
import {
asFiniteNumber,
captureOpenAIRealtimeWsClose,
createOpenAIRealtimeClientSecret,
readRealtimeErrorDetail,
resolveOpenAIProviderConfigRecord,
trimToUndefined,
@@ -61,13 +62,14 @@ type OpenAIRealtimeVoiceProviderConfig = {
prefixPaddingMs?: number;
interruptResponseOnInputAudio?: boolean;
minBargeInAudioEndMs?: number;
reasoningEffort?: string;
azureEndpoint?: string;
azureDeployment?: string;
azureApiVersion?: string;
};
type OpenAIRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & {
apiKey: string;
apiKey?: string;
model?: string;
voice?: OpenAIRealtimeVoice;
temperature?: number;
@@ -76,6 +78,7 @@ type OpenAIRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & {
prefixPaddingMs?: number;
interruptResponseOnInputAudio?: boolean;
minBargeInAudioEndMs?: number;
reasoningEffort?: string;
azureEndpoint?: string;
azureDeployment?: string;
azureApiVersion?: string;
@@ -88,6 +91,28 @@ const OPENAI_REALTIME_ACTIVE_RESPONSE_ERROR_PREFIX =
const OPENAI_REALTIME_NO_ACTIVE_RESPONSE_CANCEL_ERROR =
"Cancellation failed: no active response found";
const OPENAI_REALTIME_DEFAULT_MIN_BARGE_IN_AUDIO_END_MS = 250;
// Voice identifiers accepted by normalizeOpenAIRealtimeVoice, in lowercase.
const OPENAI_REALTIME_VOICES = [
  "alloy",
  "ash",
  "ballad",
  "coral",
  "echo",
  "sage",
  "shimmer",
  "verse",
  "marin",
  "cedar",
] as const satisfies readonly OpenAIRealtimeVoice[];

/**
 * Normalizes a raw voice setting to one of the known realtime voices.
 *
 * The input is trimmed and lowercased before being compared against
 * `OPENAI_REALTIME_VOICES`.
 *
 * @param value Raw configuration value of any type.
 * @returns The matching voice identifier, or `undefined` when `value` is not a
 *   string or does not name a known voice.
 */
function normalizeOpenAIRealtimeVoice(value: unknown): OpenAIRealtimeVoice | undefined {
  if (typeof value === "string") {
    const candidate = value.trim().toLowerCase();
    return OPENAI_REALTIME_VOICES.find((voice) => voice === candidate);
  }
  return undefined;
}
type RealtimeEvent = {
type: string;
@@ -112,63 +137,64 @@ type RealtimeEvent = {
error?: unknown;
};
type RealtimeSessionUpdate = {
type: "session.update";
session: RealtimeSessionUpdatePayload;
type RealtimeTurnDetectionConfig = {
type: "server_vad";
threshold: number;
prefix_padding_ms: number;
silence_duration_ms: number;
create_response: boolean;
interrupt_response?: boolean;
};
type RealtimeSessionUpdatePayload =
| RealtimeSessionUpdateGaPayload
| RealtimeSessionUpdateBetaPayload;
type RealtimeSessionUpdateGaPayload = {
type: "realtime";
model: string;
instructions?: string;
output_modalities: ["audio"];
audio: {
input: {
format: RealtimeAudioFormatConfig;
transcription: { model: string };
noise_reduction?: { type: "near_field" };
turn_detection: {
type: "server_vad";
threshold: number;
prefix_padding_ms: number;
silence_duration_ms: number;
create_response: boolean;
interrupt_response: boolean;
type RealtimeGaSessionUpdate = {
type: "session.update";
session: {
type: "realtime";
model?: string;
instructions?: string;
output_modalities: string[];
audio: {
input: {
format: OpenAIRealtimeAudioFormatConfig;
turn_detection: RealtimeTurnDetectionConfig;
noise_reduction?: { type: "near_field" };
transcription?: { model: string };
};
output: {
format: OpenAIRealtimeAudioFormatConfig;
voice: OpenAIRealtimeVoice;
};
};
output: {
format: RealtimeAudioFormatConfig;
voice: OpenAIRealtimeVoice;
reasoning?: { effort: string };
tools?: RealtimeVoiceTool[];
tool_choice?: string;
};
};
type RealtimeAzureDeploymentSessionUpdate = {
type: "session.update";
session: {
modalities: string[];
instructions?: string;
voice: OpenAIRealtimeVoice;
input_audio_format: "g711_ulaw" | "pcm16";
output_audio_format: "g711_ulaw" | "pcm16";
input_audio_transcription?: { model: string };
turn_detection: RealtimeTurnDetectionConfig;
temperature: number;
tools?: RealtimeVoiceTool[];
tool_choice?: string;
};
};
type OpenAIRealtimeAudioFormatConfig =
| {
type: "audio/pcm";
rate: 24000;
}
| {
type: "audio/pcmu";
};
};
tools?: RealtimeVoiceTool[];
tool_choice?: string;
};
type RealtimeSessionUpdateBetaPayload = {
modalities: string[];
instructions?: string;
voice: OpenAIRealtimeVoice;
input_audio_format: string;
output_audio_format: string;
turn_detection: {
type: "server_vad";
threshold: number;
prefix_padding_ms: number;
silence_duration_ms: number;
create_response: boolean;
};
temperature: number;
input_audio_transcription?: { model: string };
tools?: RealtimeVoiceTool[];
tool_choice?: string;
};
type RealtimeAudioFormatConfig = { type: "audio/pcmu" } | { type: "audio/pcm"; rate: 24000 };
function normalizeProviderConfig(
config: RealtimeVoiceProviderConfig,
@@ -180,7 +206,7 @@ function normalizeProviderConfig(
path: "plugins.entries.voice-call.config.realtime.providers.openai.apiKey",
}),
model: trimToUndefined(raw?.model),
voice: trimToUndefined(raw?.voice) as OpenAIRealtimeVoice | undefined,
voice: normalizeOpenAIRealtimeVoice(raw?.voice),
temperature: asFiniteNumber(raw?.temperature),
vadThreshold: asFiniteNumber(raw?.vadThreshold),
silenceDurationMs: asFiniteNumber(raw?.silenceDurationMs),
@@ -190,6 +216,7 @@ function normalizeProviderConfig(
? raw.interruptResponseOnInputAudio
: undefined,
minBargeInAudioEndMs: asNonNegativeInteger(raw?.minBargeInAudioEndMs),
reasoningEffort: trimToUndefined(raw?.reasoningEffort),
azureEndpoint: trimToUndefined(raw?.azureEndpoint),
azureDeployment: trimToUndefined(raw?.azureDeployment),
azureApiVersion: trimToUndefined(raw?.azureApiVersion),
@@ -272,6 +299,44 @@ function hasOpenAIRealtimeApiKeyInput(configuredApiKey: string | undefined): boo
);
}
/**
 * Resolves the credential for realtime requests that may fall back to OAuth.
 *
 * When `resolveOpenAIRealtimeApiKey` reports status `"available"` its value is
 * returned. Otherwise the result of the `openai-codex` auth profile lookup is
 * returned as-is.
 *
 * @param params.configuredApiKey API key from provider configuration, if any.
 * @param params.cfg Configuration object forwarded to the auth profile lookup.
 * @returns The resolved credential, or `undefined` when none is found.
 */
async function resolveOpenAIRealtimeBrowserApiKey(params: {
  configuredApiKey: string | undefined;
  cfg: RealtimeVoiceBrowserSessionCreateRequest["cfg"] | undefined;
}): Promise<string | undefined> {
  const direct = resolveOpenAIRealtimeApiKey(params.configuredApiKey);
  if (direct.status !== "available") {
    return await resolveProviderAuthProfileApiKey({
      provider: "openai-codex",
      cfg: params.cfg,
    });
  }
  return direct.value;
}
/**
 * Same lookup as `resolveOpenAIRealtimeBrowserApiKey`, but a missing
 * credential is an error instead of `undefined`.
 *
 * @param params Forwarded unchanged to `resolveOpenAIRealtimeBrowserApiKey`.
 * @returns The resolved credential.
 * @throws Error("OpenAI API key or Codex OAuth missing") when the lookup
 *   yields a falsy value.
 */
async function requireOpenAIRealtimeBrowserApiKey(params: {
  configuredApiKey: string | undefined;
  cfg: RealtimeVoiceBrowserSessionCreateRequest["cfg"] | undefined;
}): Promise<string> {
  const credential = await resolveOpenAIRealtimeBrowserApiKey(params);
  if (!credential) {
    throw new Error("OpenAI API key or Codex OAuth missing");
  }
  return credential;
}
/**
 * Reports whether any credential source is present, without resolving it.
 *
 * The direct API key input is checked first; the `openai-codex` auth profile
 * check runs only when no API key input exists.
 *
 * @param params.configuredApiKey API key from provider configuration, if any.
 * @param params.cfg Configuration object forwarded to the auth profile check.
 * @returns `true` when an API key input or an auth profile is available.
 */
function hasOpenAIRealtimeBrowserAuthInput(params: {
  configuredApiKey: string | undefined;
  cfg: RealtimeVoiceBrowserSessionCreateRequest["cfg"] | undefined;
}): boolean {
  return (
    hasOpenAIRealtimeApiKeyInput(params.configuredApiKey) ||
    isProviderAuthProfileConfigured({
      provider: "openai-codex",
      cfg: params.cfg,
    })
  );
}
/**
 * Decodes a base64 string into a Node.js `Buffer`.
 *
 * @param b64 Base64-encoded text.
 * @returns A buffer holding the decoded bytes.
 */
function base64ToBuffer(b64: string): Buffer {
  const decoded = Buffer.from(b64, "base64");
  return decoded;
}
@@ -298,6 +363,7 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
private continuingToolCallIds = new Set<string>();
private latestMediaTimestamp = 0;
private lastAssistantItemId: string | null = null;
private connectionUrl = "";
private toolCallBuffers = new Map<string, { name: string; callId: string; args: string }>();
private deliveredToolCallKeys = new Set<string>();
private readonly flowId = randomUUID();
@@ -415,14 +481,6 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
clearTimeout(connectTimeout);
reject(error);
};
const { url, headers } = this.resolveConnectionParams();
const debugProxy = resolveDebugProxySettings();
const proxyAgent = createDebugProxyWebSocketAgent(debugProxy);
this.ws = new WebSocket(url, {
headers,
...(proxyAgent ? { agent: proxyAgent } : {}),
});
connectTimeout = setTimeout(() => {
if (!this.sessionConfigured && !this.intentionallyClosed) {
this.ws?.terminate();
@@ -430,95 +488,126 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
}
}, OpenAIRealtimeVoiceBridge.CONNECT_TIMEOUT_MS);
this.ws.on("open", () => {
this.resetRealtimeSessionState();
this.connected = true;
this.sessionConfigured = false;
this.reconnectAttempts = 0;
captureWsEvent({
url,
direction: "local",
kind: "ws-open",
flowId: this.flowId,
meta: {
provider: "openai",
capability: "realtime-voice",
},
});
this.sendSessionUpdate();
});
this.ws.on("message", (data: Buffer) => {
captureWsEvent({
url,
direction: "inbound",
kind: "ws-frame",
flowId: this.flowId,
payload: data,
meta: {
provider: "openai",
capability: "realtime-voice",
},
});
try {
const event = JSON.parse(data.toString()) as RealtimeEvent;
this.handleEvent(event);
if (event.type === "session.updated") {
settleResolve();
}
if (event.type === "error" && !this.sessionConfigured) {
settleReject(new Error(readRealtimeErrorDetail(event.error)));
}
} catch (error) {
console.error("[openai] realtime event parse failed:", error);
const openWebSocket = (connection: { url: string; headers: Record<string, string> }) => {
if (settled) {
return;
}
});
this.ws.on("error", (error) => {
captureWsEvent({
url,
direction: "local",
kind: "error",
flowId: this.flowId,
errorText: error instanceof Error ? error.message : String(error),
meta: {
provider: "openai",
capability: "realtime-voice",
},
});
if (!this.sessionConfigured) {
settleReject(error instanceof Error ? error : new Error(String(error)));
}
this.config.onError?.(error instanceof Error ? error : new Error(String(error)));
});
this.ws.on("close", (code, reasonBuffer) => {
captureOpenAIRealtimeWsClose({
url,
flowId: this.flowId,
capability: "realtime-voice",
code,
reasonBuffer,
});
this.connected = false;
this.sessionConfigured = false;
if (this.intentionallyClosed) {
settleResolve();
this.config.onClose?.("completed");
return;
}
if (!this.sessionConfigured && !settled) {
settleReject(new Error("OpenAI realtime connection closed before ready"));
return;
}
void this.attemptReconnect();
});
const url = connection.url;
this.connectionUrl = connection.url;
const debugProxy = resolveDebugProxySettings();
const proxyAgent = createDebugProxyWebSocketAgent(debugProxy);
const ws = new WebSocket(connection.url, {
headers: connection.headers,
...(proxyAgent ? { agent: proxyAgent } : {}),
});
this.ws = ws;
ws.on("open", () => {
this.resetRealtimeSessionState();
this.connected = true;
this.sessionConfigured = false;
this.reconnectAttempts = 0;
captureWsEvent({
url,
direction: "local",
kind: "ws-open",
flowId: this.flowId,
meta: {
provider: "openai",
capability: "realtime-voice",
},
});
this.sendSessionUpdate();
});
ws.on("message", (data: Buffer) => {
captureWsEvent({
url,
direction: "inbound",
kind: "ws-frame",
flowId: this.flowId,
payload: data,
meta: {
provider: "openai",
capability: "realtime-voice",
},
});
try {
const event = JSON.parse(data.toString()) as RealtimeEvent;
this.handleEvent(event);
if (event.type === "session.updated") {
settleResolve();
}
if (event.type === "error" && !this.sessionConfigured) {
settleReject(new Error(readRealtimeErrorDetail(event.error)));
}
} catch (error) {
console.error("[openai] realtime event parse failed:", error);
}
});
ws.on("error", (error) => {
captureWsEvent({
url,
direction: "local",
kind: "error",
flowId: this.flowId,
errorText: error instanceof Error ? error.message : String(error),
meta: {
provider: "openai",
capability: "realtime-voice",
},
});
if (!this.sessionConfigured) {
settleReject(error instanceof Error ? error : new Error(String(error)));
}
this.config.onError?.(error instanceof Error ? error : new Error(String(error)));
});
ws.on("close", (code, reasonBuffer) => {
captureOpenAIRealtimeWsClose({
url,
flowId: this.flowId,
capability: "realtime-voice",
code,
reasonBuffer,
});
this.connected = false;
this.sessionConfigured = false;
if (this.intentionallyClosed) {
settleResolve();
this.config.onClose?.("completed");
return;
}
if (!this.sessionConfigured && !settled) {
settleReject(new Error("OpenAI realtime connection closed before ready"));
return;
}
void this.attemptReconnect();
});
};
const connectionOrPromise = this.resolveConnectionParams();
if (connectionOrPromise instanceof Promise) {
void connectionOrPromise.then(openWebSocket).catch((error: unknown) => {
settleReject(error instanceof Error ? error : new Error(String(error)));
});
return;
}
openWebSocket(connectionOrPromise);
});
}
private resolveConnectionParams(): { url: string; headers: Record<string, string> } {
private resolveConnectionParams():
| { url: string; headers: Record<string, string> }
| Promise<{ url: string; headers: Record<string, string> }> {
const cfg = this.config;
if (cfg.azureEndpoint && cfg.azureDeployment) {
const apiKey = requireOpenAIRealtimeApiKey(cfg.apiKey);
const base = cfg.azureEndpoint
.replace(/\/$/, "")
.replace(/^http(s?):/, (_, secure: string) => `ws${secure}:`);
@@ -533,11 +622,16 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
baseUrl: url,
capability: "audio",
transport: "websocket",
defaultHeaders: { "api-key": cfg.apiKey },
}) ?? { "api-key": cfg.apiKey },
defaultHeaders: { "api-key": apiKey },
}) ?? { "api-key": apiKey },
};
}
const directApiKey = resolveOpenAIRealtimeApiKey(cfg.apiKey);
if (directApiKey.status === "missing") {
return this.resolveOAuthConnectionParams();
}
const apiKey = directApiKey.value;
if (cfg.azureEndpoint) {
const base = cfg.azureEndpoint
.replace(/\/$/, "")
@@ -552,8 +646,8 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
baseUrl: url,
capability: "audio",
transport: "websocket",
defaultHeaders: { Authorization: `Bearer ${cfg.apiKey}` },
}) ?? { Authorization: `Bearer ${cfg.apiKey}` },
defaultHeaders: { Authorization: `Bearer ${apiKey}` },
}) ?? { Authorization: `Bearer ${apiKey}` },
};
}
@@ -568,10 +662,48 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
capability: "audio",
transport: "websocket",
defaultHeaders: {
Authorization: `Bearer ${cfg.apiKey}`,
Authorization: `Bearer ${apiKey}`,
},
}) ?? {
Authorization: `Bearer ${cfg.apiKey}`,
Authorization: `Bearer ${apiKey}`,
},
};
}
/**
 * Builds websocket connection parameters for the OAuth fallback path.
 *
 * The credential from `requireOpenAIRealtimeBrowserApiKey` is passed to
 * `createOpenAIRealtimeClientSecret`, and the returned secret's `value` is
 * used as the Bearer credential for the `wss://api.openai.com/v1/realtime`
 * connection.
 *
 * @returns The websocket URL (with the model as a query parameter) and the
 *   request headers.
 * @throws Whatever the credential lookup or client-secret request throws.
 */
private async resolveOAuthConnectionParams(): Promise<{
  url: string;
  headers: Record<string, string>;
}> {
  const bridgeConfig = this.config;
  const oauthToken = await requireOpenAIRealtimeBrowserApiKey({
    configuredApiKey: bridgeConfig.apiKey,
    cfg: bridgeConfig.cfg,
  });
  const realtimeModel = bridgeConfig.model ?? OpenAIRealtimeVoiceBridge.DEFAULT_MODEL;
  const ephemeral = await createOpenAIRealtimeClientSecret({
    authToken: oauthToken,
    auditContext: "openai-realtime-bridge-session",
    session: {
      type: "realtime",
      model: realtimeModel,
      audio: {
        output: { voice: bridgeConfig.voice ?? "alloy" },
      },
    },
  });
  const url = `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(realtimeModel)}`;
  // A fresh header object is produced for each use so the default and the fallback never share a reference.
  const bearerHeaders = (): Record<string, string> => ({
    Authorization: `Bearer ${ephemeral.value}`,
  });
  const headers =
    resolveProviderRequestHeaders({
      provider: "openai",
      baseUrl: url,
      capability: "audio",
      transport: "websocket",
      defaultHeaders: bearerHeaders(),
    }) ?? bearerHeaders();
  return { url, headers };
}
@@ -600,99 +732,96 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
}
private sendSessionUpdate(): void {
this.sendEvent({
type: "session.update",
session: this.resolveSessionUpdatePayload(),
} satisfies RealtimeSessionUpdate);
if (this.usesAzureDeploymentRealtimeApi()) {
this.sendEvent(this.buildAzureDeploymentSessionUpdate());
return;
}
this.sendEvent(this.buildGaSessionUpdate());
}
private resolveSessionUpdatePayload(): RealtimeSessionUpdatePayload {
if (this.usesAzureDeploymentRealtimeApi()) {
return this.resolveBetaSessionUpdatePayload();
}
return this.resolveGaSessionUpdatePayload();
/**
 * Builds the `session.update` event for the non-Azure-deployment session shape
 * (`session.type === "realtime"` with nested `audio.input` / `audio.output`).
 *
 * Defaults applied when the bridge config omits a value: model
 * `OpenAIRealtimeVoiceBridge.DEFAULT_MODEL`, voice `"alloy"`, VAD threshold
 * `0.5`, prefix padding `300` ms, silence duration `500` ms, and
 * `create_response` `true`. `interrupt_response` defaults to the resolved
 * `create_response` value. `reasoning`, `tools` and `tool_choice` are only
 * included when configured.
 *
 * @returns The event object to send over the websocket.
 */
private buildGaSessionUpdate(): RealtimeGaSessionUpdate {
  const bridgeConfig = this.config;
  const createResponse = bridgeConfig.autoRespondToAudio ?? true;
  const interruptResponse = bridgeConfig.interruptResponseOnInputAudio ?? createResponse;
  // Optional sections are computed up front and spread into the session below.
  const reasoningFields = bridgeConfig.reasoningEffort
    ? { reasoning: { effort: bridgeConfig.reasoningEffort } }
    : {};
  const toolFields =
    bridgeConfig.tools && bridgeConfig.tools.length > 0
      ? {
          tools: bridgeConfig.tools,
          tool_choice: "auto",
        }
      : {};
  return {
    type: "session.update",
    session: {
      type: "realtime",
      model: bridgeConfig.model ?? OpenAIRealtimeVoiceBridge.DEFAULT_MODEL,
      instructions: bridgeConfig.instructions,
      output_modalities: ["audio"],
      audio: {
        input: {
          format: this.resolveRealtimeAudioFormat(),
          noise_reduction: { type: "near_field" },
          transcription: { model: OPENAI_REALTIME_INPUT_TRANSCRIPTION_MODEL },
          turn_detection: {
            type: "server_vad",
            threshold: bridgeConfig.vadThreshold ?? 0.5,
            prefix_padding_ms: bridgeConfig.prefixPaddingMs ?? 300,
            silence_duration_ms: bridgeConfig.silenceDurationMs ?? 500,
            create_response: createResponse,
            interrupt_response: interruptResponse,
          },
        },
        output: {
          format: this.resolveRealtimeAudioFormat(),
          voice: bridgeConfig.voice ?? "alloy",
        },
      },
      ...reasoningFields,
      ...toolFields,
    },
  };
}
/**
 * Reports whether the bridge is configured for an Azure deployment.
 *
 * @returns `true` only when both `azureEndpoint` and `azureDeployment` are set
 *   to truthy values in the bridge config.
 */
private usesAzureDeploymentRealtimeApi(): boolean {
  const { azureEndpoint, azureDeployment } = this.config;
  return Boolean(azureEndpoint && azureDeployment);
}
private resolveGaSessionUpdatePayload(): RealtimeSessionUpdateGaPayload {
private buildAzureDeploymentSessionUpdate(): RealtimeAzureDeploymentSessionUpdate {
const cfg = this.config;
const autoRespondToAudio = cfg.autoRespondToAudio ?? true;
const interruptResponseOnInputAudio = cfg.interruptResponseOnInputAudio ?? autoRespondToAudio;
const format = this.resolveLegacyRealtimeAudioFormat();
return {
type: "realtime",
model: cfg.model ?? OpenAIRealtimeVoiceBridge.DEFAULT_MODEL,
instructions: cfg.instructions,
output_modalities: ["audio"],
audio: {
input: {
format: this.resolveRealtimeAudioFormatConfig(),
noise_reduction: {
type: "near_field",
},
transcription: {
model: OPENAI_REALTIME_INPUT_TRANSCRIPTION_MODEL,
},
turn_detection: {
type: "server_vad",
threshold: cfg.vadThreshold ?? 0.5,
prefix_padding_ms: cfg.prefixPaddingMs ?? 300,
silence_duration_ms: cfg.silenceDurationMs ?? 500,
create_response: autoRespondToAudio,
interrupt_response: interruptResponseOnInputAudio,
},
},
output: {
format: this.resolveRealtimeAudioFormatConfig(),
voice: cfg.voice ?? "alloy",
type: "session.update",
session: {
modalities: ["text", "audio"],
instructions: cfg.instructions,
voice: cfg.voice ?? "alloy",
input_audio_format: format,
output_audio_format: format,
input_audio_transcription: { model: "whisper-1" },
turn_detection: {
type: "server_vad",
threshold: cfg.vadThreshold ?? 0.5,
prefix_padding_ms: cfg.prefixPaddingMs ?? 300,
silence_duration_ms: cfg.silenceDurationMs ?? 500,
create_response: cfg.autoRespondToAudio ?? true,
},
temperature: cfg.temperature ?? 0.8,
...(cfg.tools && cfg.tools.length > 0
? {
tools: cfg.tools,
tool_choice: "auto",
}
: {}),
},
...(cfg.tools && cfg.tools.length > 0
? {
tools: cfg.tools,
tool_choice: "auto",
}
: {}),
};
}
/**
 * Assembles the flat session fields (modalities, instructions, voice, audio
 * formats, input transcription, server VAD, temperature and optional tools)
 * from the bridge config, falling back to the defaults written inline below.
 */
private resolveBetaSessionUpdatePayload(): RealtimeSessionUpdateBetaPayload {
  const cfg = this.config;
  // Tool fields are only emitted when at least one tool is configured.
  const toolFields =
    cfg.tools && cfg.tools.length > 0
      ? { tools: cfg.tools, tool_choice: "auto" as const }
      : {};
  const turnDetection = {
    type: "server_vad" as const,
    threshold: cfg.vadThreshold ?? 0.5,
    prefix_padding_ms: cfg.prefixPaddingMs ?? 300,
    silence_duration_ms: cfg.silenceDurationMs ?? 500,
    create_response: cfg.autoRespondToAudio ?? true,
  };
  return {
    modalities: ["text", "audio"],
    instructions: cfg.instructions,
    voice: cfg.voice ?? "alloy",
    // Input and output use the same resolved audio format.
    input_audio_format: this.resolveRealtimeAudioFormat(),
    output_audio_format: this.resolveRealtimeAudioFormat(),
    input_audio_transcription: {
      model: OPENAI_REALTIME_INPUT_TRANSCRIPTION_MODEL,
    },
    turn_detection: turnDetection,
    temperature: cfg.temperature ?? 0.8,
    ...toolFields,
  };
}
private resolveRealtimeAudioFormatConfig(): RealtimeAudioFormatConfig {
private resolveRealtimeAudioFormat(): OpenAIRealtimeAudioFormatConfig {
return this.audioFormat.encoding === "pcm16"
? { type: "audio/pcm", rate: 24000 }
: { type: "audio/pcmu" };
}
private resolveRealtimeAudioFormat(): "g711_ulaw" | "pcm16" {
private resolveLegacyRealtimeAudioFormat(): "g711_ulaw" | "pcm16" {
return this.audioFormat.encoding === "pcm16" ? "pcm16" : "g711_ulaw";
}
@@ -978,7 +1107,7 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
this.config.onEvent?.({ direction: "client", type, ...(detail ? { detail } : {}) });
const payload = JSON.stringify(event);
captureWsEvent({
url: this.resolveConnectionParams().url,
url: this.connectionUrl,
direction: "outbound",
kind: "ws-frame",
flowId: this.flowId,
@@ -1018,14 +1147,6 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
}
}
/**
 * Reads `key` from `value` when `value` is a non-null object and the field is
 * a string with non-whitespace content.
 *
 * @returns the trimmed string, or `undefined` for non-objects, missing or
 *   non-string fields, and strings that are empty after trimming.
 */
function readStringField(value: unknown, key: string): string | undefined {
  if (typeof value !== "object" || value === null) {
    return undefined;
  }
  const candidate = (value as Record<string, unknown>)[key];
  if (typeof candidate !== "string") {
    return undefined;
  }
  const trimmed = candidate.trim();
  return trimmed.length > 0 ? trimmed : undefined;
}
function resolveOpenAIRealtimeBrowserOfferHeaders(): Record<string, string> | undefined {
const headers = resolveProviderRequestHeaders({
provider: "openai",
@@ -1048,13 +1169,16 @@ async function createOpenAIRealtimeBrowserSession(
req: RealtimeVoiceBrowserSessionCreateRequest,
): Promise<RealtimeVoiceBrowserSession> {
const config = normalizeProviderConfig(req.providerConfig);
const apiKey = requireOpenAIRealtimeApiKey(config.apiKey);
const apiKey = await requireOpenAIRealtimeBrowserApiKey({
configuredApiKey: config.apiKey,
cfg: req.cfg,
});
if (config.azureEndpoint || config.azureDeployment) {
throw new Error("OpenAI Realtime browser sessions do not support Azure endpoints yet");
}
const model = req.model ?? config.model ?? OPENAI_REALTIME_DEFAULT_MODEL;
const voice = (req.voice ?? config.voice ?? "alloy") as OpenAIRealtimeVoice;
const voice = normalizeOpenAIRealtimeVoice(req.voice) ?? config.voice ?? "alloy";
const session: Record<string, unknown> = {
type: "realtime",
model,
@@ -1066,6 +1190,15 @@ async function createOpenAIRealtimeBrowserSession(
type: "server_vad",
create_response: true,
interrupt_response: true,
...(typeof (req.vadThreshold ?? config.vadThreshold) === "number"
? { threshold: req.vadThreshold ?? config.vadThreshold }
: {}),
...(typeof (req.prefixPaddingMs ?? config.prefixPaddingMs) === "number"
? { prefix_padding_ms: req.prefixPaddingMs ?? config.prefixPaddingMs }
: {}),
...(typeof (req.silenceDurationMs ?? config.silenceDurationMs) === "number"
? { silence_duration_ms: req.silenceDurationMs ?? config.silenceDurationMs }
: {}),
},
transcription: { model: OPENAI_REALTIME_INPUT_TRANSCRIPTION_MODEL },
},
@@ -1076,60 +1209,26 @@ async function createOpenAIRealtimeBrowserSession(
session.tools = req.tools;
session.tool_choice = "auto";
}
const { response, release } = await fetchWithSsrFGuard({
url: "https://api.openai.com/v1/realtime/client_secrets",
init: {
method: "POST",
headers: resolveProviderRequestHeaders({
provider: "openai",
baseUrl: "https://api.openai.com/v1/realtime/client_secrets",
capability: "audio",
transport: "http",
defaultHeaders: {
Authorization: `Bearer ${apiKey}`,
"Content-Type": "application/json",
},
}) ?? {
Authorization: `Bearer ${apiKey}`,
"Content-Type": "application/json",
},
body: JSON.stringify({ session }),
},
auditContext: "openai-realtime-browser-session",
});
const payload = await (async () => {
try {
if (!response.ok) {
throw await createProviderHttpError(response, "OpenAI Realtime browser session failed");
}
return (await response.json()) as unknown;
} finally {
await release();
}
})();
const nestedSecret =
payload && typeof payload === "object"
? (payload as Record<string, unknown>).client_secret
: undefined;
const clientSecret = readStringField(payload, "value") ?? readStringField(nestedSecret, "value");
if (!clientSecret) {
throw new Error("OpenAI Realtime browser session did not return a client secret");
const reasoningEffort = trimToUndefined(req.reasoningEffort) ?? config.reasoningEffort;
if (reasoningEffort) {
session.reasoning = { effort: reasoningEffort };
}
const expiresAt =
payload && typeof payload === "object"
? (payload as Record<string, unknown>).expires_at
: undefined;
const clientSecret = await createOpenAIRealtimeClientSecret({
authToken: apiKey,
auditContext: "openai-realtime-browser-session",
session,
});
const offerHeaders = resolveOpenAIRealtimeBrowserOfferHeaders();
return {
provider: "openai",
transport: "webrtc",
clientSecret,
clientSecret: clientSecret.value,
offerUrl: "https://api.openai.com/v1/realtime/calls",
...(offerHeaders ? { offerHeaders } : {}),
model,
voice,
...(typeof expiresAt === "number" ? { expiresAt } : {}),
...(typeof clientSecret.expiresAt === "number" ? { expiresAt: clientSecret.expiresAt } : {}),
};
}
@@ -1154,14 +1253,21 @@ export function buildOpenAIRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin
supportsToolCalls: true,
},
resolveConfig: ({ rawConfig }) => normalizeProviderConfig(rawConfig),
isConfigured: ({ providerConfig }) =>
hasOpenAIRealtimeApiKeyInput(normalizeProviderConfig(providerConfig).apiKey),
isConfigured: ({ cfg, providerConfig }) => {
const config = normalizeProviderConfig(providerConfig);
if (config.azureEndpoint || config.azureDeployment) {
return hasOpenAIRealtimeApiKeyInput(config.apiKey);
}
return hasOpenAIRealtimeBrowserAuthInput({
configuredApiKey: config.apiKey,
cfg,
});
},
createBridge: (req) => {
const config = normalizeProviderConfig(req.providerConfig);
const apiKey = requireOpenAIRealtimeApiKey(config.apiKey);
return new OpenAIRealtimeVoiceBridge({
...req,
apiKey,
apiKey: config.apiKey,
model: config.model,
voice: config.voice,
temperature: config.temperature,
@@ -1171,6 +1277,7 @@ export function buildOpenAIRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin
interruptResponseOnInputAudio:
req.interruptResponseOnInputAudio ?? config.interruptResponseOnInputAudio,
minBargeInAudioEndMs: config.minBargeInAudioEndMs,
reasoningEffort: config.reasoningEffort,
azureEndpoint: config.azureEndpoint,
azureDeployment: config.azureDeployment,
azureApiVersion: config.azureApiVersion,

View File

@@ -9,6 +9,7 @@
import type { IncomingMessage } from "node:http";
import type { Duplex } from "node:stream";
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types";
import type {
RealtimeTranscriptionProviderConfig,
RealtimeTranscriptionProviderPlugin,
@@ -31,6 +32,8 @@ export interface MediaStreamConfig {
transcriptionProvider: RealtimeTranscriptionProviderPlugin;
/** Provider-owned config blob passed into the transcription session. */
providerConfig: RealtimeTranscriptionProviderConfig;
/** Full runtime config, used by providers that can resolve OAuth profiles. */
cfg?: OpenClawConfig;
/** Close sockets that never send a valid `start` frame within this window. */
preStartTimeoutMs?: number;
/** Max concurrent pre-start sockets. */
@@ -314,6 +317,7 @@ export class MediaStreamHandler {
}
const sttSession = this.config.transcriptionProvider.createSession({
cfg: this.config.cfg,
providerConfig: this.config.providerConfig,
onPartial: (partial) => {
const session = this.sessions.get(streamSid);

View File

@@ -334,6 +334,7 @@ export async function createVoiceCallRuntime(params: {
realtimeProvider.provider,
realtimeProvider.providerConfig,
config.serve.path,
cfg,
);
if (config.realtime.toolPolicy !== "none") {
realtimeHandler.registerToolHandler(

View File

@@ -346,6 +346,7 @@ export class VoiceCallWebhookServer {
const streamConfig: MediaStreamConfig = {
transcriptionProvider: provider,
providerConfig,
cfg: this.fullConfig ?? (this.coreConfig as OpenClawConfig | null) ?? undefined,
preStartTimeoutMs: streaming.preStartTimeoutMs,
maxPendingConnections: streaming.maxPendingConnections,
maxPendingConnectionsPerIp: streaming.maxPendingConnectionsPerIp,

View File

@@ -1,6 +1,7 @@
import { randomUUID } from "node:crypto";
import http from "node:http";
import type { Duplex } from "node:stream";
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types";
import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime";
import {
buildRealtimeVoiceAgentConsultWorkingResponse,
@@ -309,6 +310,7 @@ export class RealtimeCallHandler {
private readonly realtimeProvider: RealtimeVoiceProviderPlugin,
private readonly providerConfig: RealtimeVoiceProviderConfig,
private readonly servePath: string,
private readonly coreConfig?: OpenClawConfig,
) {}
setPublicUrl(url: string): void {
@@ -603,6 +605,7 @@ export class RealtimeCallHandler {
});
const session = createRealtimeVoiceBridgeSession({
provider: this.realtimeProvider,
cfg: this.coreConfig,
providerConfig: this.providerConfig,
instructions: this.config.instructions,
tools: this.config.tools,

View File

@@ -476,8 +476,8 @@ try {
const statusNames = new Set((result.statuses ?? []).map((entry) => entry.status));
const transcriptTexts = new Set((result.transcripts ?? []).map((entry) => entry.text));
const expectedMethods = [
"talk.session.appendAudio",
"talk.client.toolCall",
"talk.session.appendAudio",
"talk.session.submitToolResult",
"talk.session.close",
];

View File

@@ -163,6 +163,10 @@ export const TalkClientCreateParamsSchema = Type.Object(
provider: Type.Optional(Type.String()),
model: Type.Optional(Type.String()),
voice: Type.Optional(Type.String()),
vadThreshold: Type.Optional(Type.Number()),
silenceDurationMs: Type.Optional(Type.Integer({ minimum: 1 })),
prefixPaddingMs: Type.Optional(Type.Integer({ minimum: 0 })),
reasoningEffort: Type.Optional(Type.String()),
mode: Type.Optional(TalkModeSchema),
transport: Type.Optional(TalkTransportSchema),
brain: Type.Optional(TalkBrainSchema),
@@ -203,6 +207,10 @@ export const TalkSessionCreateParamsSchema = Type.Object(
provider: Type.Optional(Type.String()),
model: Type.Optional(Type.String()),
voice: Type.Optional(Type.String()),
vadThreshold: Type.Optional(Type.Number()),
silenceDurationMs: Type.Optional(Type.Integer({ minimum: 1 })),
prefixPaddingMs: Type.Optional(Type.Integer({ minimum: 0 })),
reasoningEffort: Type.Optional(Type.String()),
mode: Type.Optional(TalkModeSchema),
transport: Type.Optional(TalkTransportSchema),
brain: Type.Optional(TalkBrainSchema),

View File

@@ -24,6 +24,7 @@ import { chatHandlers } from "./chat.js";
import { asRecord } from "./record-shared.js";
import {
buildRealtimeInstructions,
buildRealtimeVoiceLaunchOptions,
buildTalkRealtimeConfig,
isUnsupportedBrowserWebRtcSession,
} from "./talk-shared.js";
@@ -114,6 +115,10 @@ export const talkClientHandlers: GatewayRequestHandlers = {
provider?: string;
model?: string;
voice?: string;
vadThreshold?: number;
silenceDurationMs?: number;
prefixPaddingMs?: number;
reasoningEffort?: string;
mode?: string;
transport?: string;
brain?: string;
@@ -180,13 +185,17 @@ export const talkClientHandlers: GatewayRequestHandlers = {
cfgForResolve: runtimeConfig,
noRegisteredProviderMessage: "No realtime voice provider registered",
});
const launchOptions = buildRealtimeVoiceLaunchOptions({
requested: typedParams,
defaults: realtimeConfig,
});
if (resolution.provider.createBrowserSession && transport !== "gateway-relay") {
const session = await resolution.provider.createBrowserSession({
cfg: runtimeConfig,
providerConfig: resolution.providerConfig,
instructions: buildRealtimeInstructions(realtimeConfig.instructions),
tools: [REALTIME_VOICE_AGENT_CONSULT_TOOL],
model: normalizeOptionalString(typedParams.model) ?? realtimeConfig.model,
voice: normalizeOptionalString(typedParams.voice) ?? realtimeConfig.voice,
...launchOptions,
});
if (
!isUnsupportedBrowserWebRtcSession(session) &&

View File

@@ -52,6 +52,7 @@ import { formatForLog } from "../ws-log.js";
import {
broadcastTalkRoomEvents,
buildRealtimeInstructions,
buildRealtimeVoiceLaunchOptions,
buildTalkRealtimeConfig,
buildTalkTranscriptionConfig,
canUseTalkDirectTools,
@@ -235,17 +236,20 @@ export const talkSessionHandlers: GatewayRequestHandlers = {
cfgForResolve: runtimeConfig,
noRegisteredProviderMessage: "No realtime voice provider registered",
});
const model = normalizeOptionalString(params.model) ?? realtimeConfig.model;
const voice = normalizeOptionalString(params.voice) ?? realtimeConfig.voice;
const launchOptions = buildRealtimeVoiceLaunchOptions({
requested: params,
defaults: realtimeConfig,
});
const session = createTalkRealtimeRelaySession({
context,
connId,
cfg: runtimeConfig,
provider: resolution.provider,
providerConfig: withRealtimeBrowserOverrides(resolution.providerConfig, { model, voice }),
providerConfig: withRealtimeBrowserOverrides(resolution.providerConfig, launchOptions),
instructions: buildRealtimeInstructions(realtimeConfig.instructions),
tools: [REALTIME_VOICE_AGENT_CONSULT_TOOL],
model,
voice,
model: launchOptions.model,
voice: launchOptions.voice,
});
rememberUnifiedTalkSession(session.relaySessionId, {
kind: "realtime-relay",

View File

@@ -221,19 +221,60 @@ export function buildRealtimeInstructions(configuredInstructions?: string): stri
return `${DEFAULT_REALTIME_INSTRUCTIONS}\n\nAdditional realtime instructions:\n${extra}`;
}
// Typed per-launch realtime voice options. Every field is optional.
type RealtimeVoiceLaunchOptions = {
model?: string;
voice?: string;
vadThreshold?: number;
silenceDurationMs?: number;
prefixPaddingMs?: number;
reasoningEffort?: string;
};
// Unvalidated launch options as received from a request. Fields are `unknown`
// and must be type-checked before use.
type RealtimeVoiceLaunchOptionInput = {
model?: unknown;
voice?: unknown;
vadThreshold?: unknown;
silenceDurationMs?: unknown;
prefixPaddingMs?: unknown;
reasoningEffort?: unknown;
};
/**
 * Merges requested launch options over configured defaults and returns only
 * the declared launch-option fields.
 *
 * `withRealtimeBrowserOverrides` returns the whole `defaults` object plus the
 * validated overrides. Callers spread this function's result into request
 * objects, so any extra key on `defaults` would otherwise leak through the
 * type cast and could overwrite fields the caller set earlier. Picking the
 * six known fields keeps the result limited to what the return type declares.
 *
 * @param params.requested untrusted per-request values; invalid entries are
 *   ignored by `withRealtimeBrowserOverrides`.
 * @param params.defaults configured fallbacks used when a value is not
 *   requested.
 * @returns an object with only the launch-option keys that have a value.
 */
export function buildRealtimeVoiceLaunchOptions(params: {
  requested: RealtimeVoiceLaunchOptionInput;
  defaults: RealtimeVoiceLaunchOptions;
}): RealtimeVoiceLaunchOptions {
  const merged = withRealtimeBrowserOverrides(
    params.defaults,
    params.requested,
  ) as RealtimeVoiceLaunchOptions;
  const { model, voice, vadThreshold, silenceDurationMs, prefixPaddingMs, reasoningEffort } =
    merged;
  // Omit undefined keys so spreading the result never clobbers a caller value.
  return {
    ...(model !== undefined ? { model } : {}),
    ...(voice !== undefined ? { voice } : {}),
    ...(vadThreshold !== undefined ? { vadThreshold } : {}),
    ...(silenceDurationMs !== undefined ? { silenceDurationMs } : {}),
    ...(prefixPaddingMs !== undefined ? { prefixPaddingMs } : {}),
    ...(reasoningEffort !== undefined ? { reasoningEffort } : {}),
  };
}
export function withRealtimeBrowserOverrides(
providerConfig: RealtimeVoiceProviderConfig,
params: { model?: string; voice?: string },
params: RealtimeVoiceLaunchOptionInput,
): RealtimeVoiceProviderConfig {
const overrides: RealtimeVoiceProviderConfig = {};
const model = normalizeOptionalString(params.model);
const voice = normalizeOptionalString(params.voice);
const reasoningEffort = normalizeOptionalString(params.reasoningEffort);
if (model) {
overrides.model = model;
}
if (voice) {
overrides.voice = voice;
}
if (typeof params.vadThreshold === "number" && Number.isFinite(params.vadThreshold)) {
overrides.vadThreshold = params.vadThreshold;
}
if (typeof params.silenceDurationMs === "number" && Number.isFinite(params.silenceDurationMs)) {
overrides.silenceDurationMs = params.silenceDurationMs;
}
if (typeof params.prefixPaddingMs === "number" && Number.isFinite(params.prefixPaddingMs)) {
overrides.prefixPaddingMs = params.prefixPaddingMs;
}
if (reasoningEffort) {
overrides.reasoningEffort = reasoningEffort;
}
return Object.keys(overrides).length > 0 ? { ...providerConfig, ...overrides } : providerConfig;
}

View File

@@ -1219,7 +1219,13 @@ describe("talk.client.create handler", () => {
const respond = vi.fn();
await talkHandlers["talk.client.create"]({
req: { type: "req", id: "1", method: "talk.client.create" },
params: { sessionKey: "main" },
params: {
sessionKey: "main",
vadThreshold: 0.45,
silenceDurationMs: 650,
prefixPaddingMs: 250,
reasoningEffort: "low",
},
client: { connId: "conn-1" } as never,
isWebchatConnect: () => false,
respond: respond as never,
@@ -1252,6 +1258,10 @@ describe("talk.client.create handler", () => {
model: "gpt-realtime",
voice: "alloy",
instructions: expect.stringContaining("Additional realtime instructions:\nSpeak warmly."),
vadThreshold: 0.45,
silenceDurationMs: 650,
prefixPaddingMs: 250,
reasoningEffort: "low",
}),
);
expect(respond).toHaveBeenCalledWith(

View File

@@ -1,4 +1,5 @@
import { randomUUID } from "node:crypto";
import type { OpenClawConfig } from "../config/types.js";
import type { RealtimeVoiceProviderPlugin } from "../plugins/types.js";
import { recordTalkObservabilityEvent } from "../talk/observability.js";
import {
@@ -20,6 +21,7 @@ import {
} from "../talk/talk-session-controller.js";
import { abortChatRunById } from "./chat-abort.js";
import type { GatewayRequestContext } from "./server-methods/shared-types.js";
import { forgetUnifiedTalkSession } from "./talk-session-registry.js";
const RELAY_SESSION_TTL_MS = 30 * 60 * 1000;
const MAX_AUDIO_BASE64_BYTES = 512 * 1024;
@@ -68,6 +70,7 @@ type RelaySession = {
type CreateTalkRealtimeRelaySessionParams = {
context: GatewayRequestContext;
connId: string;
cfg?: OpenClawConfig;
provider: RealtimeVoiceProviderPlugin;
providerConfig: RealtimeVoiceProviderConfig;
instructions: string;
@@ -113,6 +116,7 @@ function abortRelayAgentRuns(session: RelaySession, reason: string): void {
function closeRelaySession(session: RelaySession, reason: "completed" | "error"): void {
relaySessions.delete(session.id);
forgetUnifiedTalkSession(session.id);
clearTimeout(session.cleanupTimer);
abortRelayAgentRuns(session, reason === "error" ? "relay-error" : "relay-closed");
session.bridge.close();
@@ -180,6 +184,7 @@ export function createTalkRealtimeRelaySession(
});
const bridge = createRealtimeVoiceBridgeSession({
provider: params.provider,
cfg: params.cfg,
providerConfig: params.providerConfig,
audioFormat: REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
instructions: params.instructions,
@@ -281,6 +286,7 @@ export function createTalkRealtimeRelaySession(
return;
}
relaySessions.delete(relaySessionId);
forgetUnifiedTalkSession(relaySessionId);
clearTimeout(active.cleanupTimer);
abortRelayAgentRuns(active, "relay-closed");
emit(
@@ -449,6 +455,7 @@ export function stopTalkRealtimeRelaySession(params: {
export function clearTalkRealtimeRelaySessionsForTest(): void {
for (const session of relaySessions.values()) {
clearTimeout(session.cleanupTimer);
forgetUnifiedTalkSession(session.id);
session.bridge.close();
}
relaySessions.clear();

View File

@@ -37,6 +37,7 @@ describe("talk transcription gateway relay", () => {
};
const events: Array<{ event: string; payload: unknown; connIds: string[] }> = [];
const context = {
getRuntimeConfig: () => ({}),
broadcastToConnIds: (event: string, payload: unknown, connIds: ReadonlySet<string>) => {
events.push({ event, payload, connIds: [...connIds] });
},
@@ -170,6 +171,7 @@ describe("talk transcription gateway relay", () => {
};
const events: Array<{ event: string; payload: unknown; connIds: string[] }> = [];
const context = {
getRuntimeConfig: () => ({}),
broadcastToConnIds: (event: string, payload: unknown, connIds: ReadonlySet<string>) => {
events.push({ event, payload, connIds: [...connIds] });
},

View File

@@ -160,6 +160,7 @@ export function createTalkTranscriptionRelaySession(
return relay ? ensureTranscriptionTurn(relay) : "turn-1";
};
const sttSession = params.provider.createSession({
cfg: params.context.getRuntimeConfig(),
providerConfig: params.providerConfig,
onSpeechStart: () => {
ensureTurnId();

View File

@@ -22,6 +22,7 @@ export type RealtimeTranscriptionSessionCallbacks = {
};
export type RealtimeTranscriptionSessionCreateRequest = RealtimeTranscriptionSessionCallbacks & {
cfg?: OpenClawConfig;
providerConfig: RealtimeTranscriptionProviderConfig;
};

View File

@@ -15,6 +15,7 @@ afterEach(async () => {
async function createRealtimeServer(params?: {
closeOnConnection?: boolean;
initialEvent?: unknown;
onUpgrade?: (headers: Record<string, string | string[] | undefined>) => void;
onBinary?: (payload: Buffer) => void;
onText?: (payload: unknown) => void;
}) {
@@ -23,6 +24,7 @@ async function createRealtimeServer(params?: {
const clients = new Set<WebSocket>();
server.on("upgrade", (request, socket, head) => {
params?.onUpgrade?.(request.headers);
wss.handleUpgrade(request, socket, head, (ws) => {
clients.add(ws);
ws.on("close", () => clients.delete(ws));
@@ -139,6 +141,85 @@ describe("createRealtimeTranscriptionWebSocketSession", () => {
session.close();
});
// Both `url` and `headers` are supplied as async functions; the server-side
// upgrade hook records the Authorization header it actually receives.
it("resolves async URLs and headers before opening the socket", async () => {
const seenAuthHeaders: Array<string | string[] | undefined> = [];
const server = await createRealtimeServer({
onUpgrade: (headers) => {
seenAuthHeaders.push(headers.authorization);
},
});
const session = createRealtimeTranscriptionWebSocketSession({
providerId: "test",
callbacks: {},
url: async () => server.url,
headers: async () => ({ Authorization: "Bearer resolved-token" }),
readyOnOpen: true,
sendAudio: (audio, transport) => {
transport.sendBinary(audio);
},
});
await session.connect();
// Exactly one upgrade request, carrying the asynchronously resolved header.
expect(seenAuthHeaders).toEqual(["Bearer resolved-token"]);
session.close();
});
// The `url` resolver returns a promise that never settles, so the only way
// `connect()` can finish is through the 10 ms connect timeout.
it("applies the connect timeout while resolving async connection details", async () => {
const onError = vi.fn();
const session = createRealtimeTranscriptionWebSocketSession({
providerId: "test",
callbacks: { onError },
url: () => new Promise<string>(() => {}),
connectTimeoutMs: 10,
connectTimeoutMessage: "test realtime transcription connection timeout",
readyOnOpen: true,
sendAudio: (audio, transport) => {
transport.sendBinary(audio);
},
});
await expect(session.connect()).rejects.toThrow(
"test realtime transcription connection timeout",
);
expect(session.isConnected()).toBe(false);
// The same timeout error must also reach the onError callback.
expect(onError).toHaveBeenCalledWith(expect.any(Error));
expect(onError.mock.calls[0]?.[0]).toMatchObject({
message: "test realtime transcription connection timeout",
});
});
// The URL promise is resolved manually so the test can call `close()` while
// connection details are still pending.
it("does not open a socket when closed while async connection resolves", async () => {
const seenAuthHeaders: Array<string | string[] | undefined> = [];
let resolveUrl!: (url: string) => void;
const url = new Promise<string>((resolve) => {
resolveUrl = resolve;
});
const server = await createRealtimeServer({
onUpgrade: (headers) => {
seenAuthHeaders.push(headers.authorization);
},
});
const session = createRealtimeTranscriptionWebSocketSession({
providerId: "test",
callbacks: {},
url: () => url,
headers: async () => ({ Authorization: "Bearer resolved-token" }),
readyOnOpen: true,
sendAudio: (audio, transport) => {
transport.sendBinary(audio);
},
});
const connecting = session.connect();
// Close first, then let the URL resolve: connect() must still settle.
session.close();
resolveUrl(server.url);
await connecting;
// No upgrade request reached the server, so no socket was opened.
expect(seenAuthHeaders).toEqual([]);
expect(session.isConnected()).toBe(false);
});
it("rejects provider setup errors before ready", async () => {
const server = await createRealtimeServer({ initialEvent: { type: "error", message: "nope" } });
const onError = vi.fn();

View File

@@ -24,7 +24,9 @@ export type RealtimeTranscriptionWebSocketSessionOptions<Event = unknown> = {
connectTimeoutMessage?: string;
connectTimeoutMs?: number;
closeTimeoutMs?: number;
headers?: Record<string, string>;
headers?:
| Record<string, string>
| (() => Record<string, string> | Promise<Record<string, string>>);
maxQueuedBytes?: number;
maxReconnectAttempts?: number;
onClose?: (transport: RealtimeTranscriptionWebSocketTransport) => void;
@@ -36,7 +38,7 @@ export type RealtimeTranscriptionWebSocketSessionOptions<Event = unknown> = {
reconnectDelayMs?: number;
reconnectLimitMessage?: string;
sendAudio: (audio: Buffer, transport: RealtimeTranscriptionWebSocketTransport) => void;
url: string | (() => string);
url: string | (() => string | Promise<string>);
};
const DEFAULT_CONNECT_TIMEOUT_MS = 10_000;
@@ -157,22 +159,37 @@ class WebSocketRealtimeTranscriptionSession<Event> implements RealtimeTranscript
private async doConnect(): Promise<void> {
await new Promise<void>((resolve, reject) => {
this.ready = false;
this.currentUrl =
typeof this.options.url === "function" ? this.options.url() : this.options.url;
const debugProxy = resolveDebugProxySettings();
const proxyAgent = createDebugProxyWebSocketAgent(debugProxy);
let settled = false;
let opened = false;
let connectTimeout: ReturnType<typeof setTimeout> | undefined;
const normalizeError = (error: unknown) =>
error instanceof Error ? error : new Error(String(error));
const clearConnectTimeout = () => {
if (connectTimeout) {
clearTimeout(connectTimeout);
connectTimeout = undefined;
}
};
const finishClosedConnect = () => {
if (settled) {
return;
}
settled = true;
clearConnectTimeout();
resolve();
};
const finishConnect = () => {
if (settled) {
return;
}
settled = true;
if (connectTimeout) {
clearTimeout(connectTimeout);
}
clearConnectTimeout();
this.ready = true;
this.flushQueuedAudio();
resolve();
@@ -183,9 +200,7 @@ class WebSocketRealtimeTranscriptionSession<Event> implements RealtimeTranscript
return;
}
settled = true;
if (connectTimeout) {
clearTimeout(connectTimeout);
}
clearConnectTimeout();
this.emitError(error);
this.suppressReconnect = true;
this.forceClose();
@@ -194,10 +209,6 @@ class WebSocketRealtimeTranscriptionSession<Event> implements RealtimeTranscript
this.markReady = finishConnect;
this.failConnect = failConnect;
this.ws = new WebSocket(this.currentUrl, {
headers: this.options.headers,
...(proxyAgent ? { agent: proxyAgent } : {}),
});
connectTimeout = setTimeout(() => {
failConnect(
@@ -208,77 +219,116 @@ class WebSocketRealtimeTranscriptionSession<Event> implements RealtimeTranscript
);
}, this.connectTimeoutMs);
this.ws.on("open", () => {
opened = true;
this.connected = true;
this.reconnectAttempts = 0;
this.captureLocalOpen();
void (async () => {
let connection: { headers?: Record<string, string>; url: string };
try {
this.options.onOpen?.(this.transport);
if (this.options.readyOnOpen) {
finishConnect();
}
connection = await this.resolveConnection();
} catch (error) {
failConnect(error instanceof Error ? error : new Error(String(error)));
}
});
this.ws.on("message", (data) => {
const payload = rawWsDataToBuffer(data);
this.captureFrame("inbound", payload);
try {
if (!this.options.onMessage) {
return;
}
const parseMessage = this.options.parseMessage ?? defaultParseMessage;
this.options.onMessage(parseMessage(payload) as Event, this.transport);
} catch (error) {
this.emitError(error);
}
});
this.ws.on("error", (error) => {
const normalized = error instanceof Error ? error : new Error(String(error));
this.captureError(normalized);
if (!opened || !settled) {
failConnect(normalized);
failConnect(normalizeError(error));
return;
}
this.emitError(normalized);
});
this.ws.on("close", (code, reasonBuffer) => {
if (connectTimeout) {
clearTimeout(connectTimeout);
}
this.captureClose(code, reasonBuffer);
this.connected = false;
this.ready = false;
if (this.closeTimer) {
clearTimeout(this.closeTimer);
this.closeTimer = undefined;
if (settled) {
return;
}
if (this.closed) {
finishClosedConnect();
return;
}
if (this.suppressReconnect) {
this.suppressReconnect = false;
this.currentUrl = connection.url;
try {
this.ws = new WebSocket(this.currentUrl, {
headers: connection.headers,
...(proxyAgent ? { agent: proxyAgent } : {}),
});
} catch (error) {
failConnect(normalizeError(error));
return;
}
if (!opened || !settled) {
failConnect(
new Error(
this.options.connectClosedBeforeReadyMessage ??
`${this.options.providerId} realtime transcription connection closed before ready`,
),
);
return;
}
void this.attemptReconnect();
});
this.ws.on("open", () => {
opened = true;
this.connected = true;
this.reconnectAttempts = 0;
this.captureLocalOpen();
try {
this.options.onOpen?.(this.transport);
if (this.options.readyOnOpen) {
finishConnect();
}
} catch (error) {
failConnect(normalizeError(error));
}
});
this.ws.on("message", (data) => {
const payload = rawWsDataToBuffer(data);
this.captureFrame("inbound", payload);
try {
if (!this.options.onMessage) {
return;
}
const parseMessage = this.options.parseMessage ?? defaultParseMessage;
this.options.onMessage(parseMessage(payload) as Event, this.transport);
} catch (error) {
this.emitError(error);
}
});
this.ws.on("error", (error) => {
const normalized = normalizeError(error);
this.captureError(normalized);
if (!opened || !settled) {
failConnect(normalized);
return;
}
this.emitError(normalized);
});
this.ws.on("close", (code, reasonBuffer) => {
clearConnectTimeout();
this.captureClose(code, reasonBuffer);
this.connected = false;
this.ready = false;
if (this.closeTimer) {
clearTimeout(this.closeTimer);
this.closeTimer = undefined;
}
if (this.closed) {
return;
}
if (this.suppressReconnect) {
this.suppressReconnect = false;
return;
}
if (!opened || !settled) {
failConnect(
new Error(
this.options.connectClosedBeforeReadyMessage ??
`${this.options.providerId} realtime transcription connection closed before ready`,
),
);
return;
}
void this.attemptReconnect();
});
})();
});
}
/**
 * Resolves the connection URL and optional headers from the session options.
 * Each option may be a plain value or a (possibly async) function; the URL is
 * resolved first, then the headers.
 */
private async resolveConnection(): Promise<{
  headers?: Record<string, string>;
  url: string;
}> {
  let url: string;
  if (typeof this.options.url === "function") {
    url = await this.options.url();
  } else {
    url = await this.options.url;
  }

  let headers: Record<string, string> | undefined;
  if (typeof this.options.headers === "function") {
    headers = await this.options.headers();
  } else {
    headers = await this.options.headers;
  }

  return { url, headers };
}
private async attemptReconnect(): Promise<void> {
if (this.closed || this.reconnecting) {
return;

View File

@@ -100,6 +100,7 @@ export type RealtimeVoiceProviderConfiguredContext = {
};
export type RealtimeVoiceBridgeCreateRequest = RealtimeVoiceBridgeCallbacks & {
cfg?: OpenClawConfig;
providerConfig: RealtimeVoiceProviderConfig;
audioFormat?: RealtimeVoiceAudioFormat;
instructions?: string;
@@ -109,11 +110,16 @@ export type RealtimeVoiceBridgeCreateRequest = RealtimeVoiceBridgeCallbacks & {
};
export type RealtimeVoiceBrowserSessionCreateRequest = {
cfg?: OpenClawConfig;
providerConfig: RealtimeVoiceProviderConfig;
instructions?: string;
tools?: RealtimeVoiceTool[];
model?: string;
voice?: string;
vadThreshold?: number;
silenceDurationMs?: number;
prefixPaddingMs?: number;
reasoningEffort?: string;
};
export type RealtimeVoiceBrowserAudioContract = {

View File

@@ -48,6 +48,7 @@ describe("realtime voice bridge session runtime", () => {
createRealtimeVoiceBridgeSession({
provider,
cfg: { talk: { realtime: { provider: "test" } } } as never,
providerConfig: {},
audioSink: {
isOpen: () => true,
@@ -61,6 +62,7 @@ describe("realtime voice bridge session runtime", () => {
callbacks?.onClearAudio();
callbacks?.onMark?.("mark-1");
expect(callbacks?.cfg).toEqual({ talk: { realtime: { provider: "test" } } });
expect(sendAudio).toHaveBeenCalledWith(Buffer.from([1, 2]));
expect(clearAudio).toHaveBeenCalled();
expect(sendMark).toHaveBeenCalledWith("mark-1");

View File

@@ -1,3 +1,4 @@
import type { OpenClawConfig } from "../config/types.openclaw.js";
import type { RealtimeVoiceProviderPlugin } from "../plugins/types.js";
import type {
RealtimeVoiceBridge,
@@ -36,6 +37,7 @@ export type RealtimeVoiceBridgeSession = {
export type RealtimeVoiceBridgeSessionParams = {
provider: RealtimeVoiceProviderPlugin;
cfg?: OpenClawConfig;
providerConfig: RealtimeVoiceProviderConfig;
audioFormat?: RealtimeVoiceAudioFormat;
audioSink: RealtimeVoiceAudioSink;
@@ -81,6 +83,7 @@ export function createRealtimeVoiceBridgeSession(
};
const canSendAudio = () => params.audioSink.isOpen?.() ?? true;
bridge = params.provider.createBridge({
cfg: params.cfg,
providerConfig: params.providerConfig,
audioFormat: params.audioFormat,
instructions: params.instructions,

View File

@@ -690,6 +690,38 @@
color: var(--text);
}
/* Talk options panel: a grid whose columns auto-fit with a 132px minimum width,
   separated from the content below by a half-transparent border. */
.agent-chat__talk-options {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(132px, 1fr));
gap: 8px;
padding: 10px;
border-bottom: 1px solid color-mix(in srgb, var(--border) 50%, transparent);
background: color-mix(in srgb, var(--bg-elevated) 72%, transparent);
}
/* Each option label stacks its caption above its control; min-width: 0 lets the
   label shrink inside its grid cell. */
.agent-chat__talk-options label {
display: flex;
flex-direction: column;
gap: 4px;
min-width: 0;
font-size: 0.68rem;
color: var(--muted);
}
/* Inputs and selects share one compact look: full label width, fixed 30px height,
   inherited font family with a smaller explicit size. */
.agent-chat__talk-options input,
.agent-chat__talk-options select {
width: 100%;
min-width: 0;
height: 30px;
border-radius: var(--radius-sm);
border: 1px solid var(--border);
background: var(--bg);
color: var(--text);
font: inherit;
font-size: 0.75rem;
padding: 0 8px;
}
.agent-chat__input-divider {
width: 1px;
height: 16px;

View File

@@ -2447,6 +2447,8 @@ export function renderApp(state: AppViewState) {
realtimeTalkStatus: state.realtimeTalkStatus,
realtimeTalkDetail: state.realtimeTalkDetail,
realtimeTalkTranscript: state.realtimeTalkTranscript,
realtimeTalkOptionsOpen: state.realtimeTalkOptionsOpen,
realtimeTalkOptions: state.realtimeTalkOptions,
connected: state.connected,
canSend: state.connected,
disabledReason: chatDisabledReason,
@@ -2489,6 +2491,10 @@ export function renderApp(state: AppViewState) {
});
},
onToggleRealtimeTalk: () => state.toggleRealtimeTalk(),
onToggleRealtimeTalkOptions: () => {
state.realtimeTalkOptionsOpen = !state.realtimeTalkOptionsOpen;
},
onRealtimeTalkOptionsChange: (next) => state.updateRealtimeTalkOptions(next),
canAbort: hasAbortableSessionRun(state),
onAbort: () => void state.handleAbortChat(),
onQueueRemove: (id) => state.removeQueuedMessage(id),

View File

@@ -123,6 +123,18 @@ export type AppViewState = {
realtimeTalkStatus: RealtimeTalkStatus;
realtimeTalkDetail: string | null;
realtimeTalkTranscript: string | null;
realtimeTalkOptionsOpen: boolean;
realtimeTalkOptions: {
provider: string;
model: string;
voice: string;
transport: string;
vadThreshold: string;
silenceDurationMs: string;
prefixPaddingMs: string;
reasoningEffort: string;
};
updateRealtimeTalkOptions: (next: Partial<AppViewState["realtimeTalkOptions"]>) => void;
chatManualRefreshInFlight: boolean;
chatHeaderControlsHidden: boolean;
chatMobileControlsOpen: boolean;

View File

@@ -67,7 +67,11 @@ import {
import type { AppViewState } from "./app-view-state.ts";
import { normalizeAssistantIdentity } from "./assistant-identity.ts";
import { exportChatMarkdown } from "./chat/export.ts";
import { RealtimeTalkSession, type RealtimeTalkStatus } from "./chat/realtime-talk.ts";
import {
RealtimeTalkSession,
type RealtimeTalkLaunchOptions,
type RealtimeTalkStatus,
} from "./chat/realtime-talk.ts";
import type { ChatSideResult } from "./chat/side-result.ts";
import {
loadToolsEffective as loadToolsEffectiveInternal,
@@ -231,6 +235,17 @@ export class OpenClawApp extends LitElement {
@state() realtimeTalkStatus: RealtimeTalkStatus = "idle";
@state() realtimeTalkDetail: string | null = null;
@state() realtimeTalkTranscript: string | null = null;
@state() realtimeTalkOptionsOpen = false;
@state() realtimeTalkOptions = {
provider: "",
model: "",
voice: "",
transport: "",
vadThreshold: "",
silenceDurationMs: "",
prefixPaddingMs: "",
reasoningEffort: "",
};
private realtimeTalkSession: RealtimeTalkSession | null = null;
private nativeBridgeCleanup: (() => void) | null = null;
@state() chatManualRefreshInFlight = false;
@@ -955,6 +970,43 @@ export class OpenClawApp extends LitElement {
);
}
/**
 * Applies a partial update to the Talk options.
 * The current options are copied and overlaid with `next`, and the result is
 * stored as a new object rather than mutating the existing one.
 */
updateRealtimeTalkOptions(next: Partial<typeof this.realtimeTalkOptions>) {
  const merged = { ...this.realtimeTalkOptions, ...next };
  this.realtimeTalkOptions = merged;
}
/**
 * Converts the string-valued Talk option fields into typed launch options.
 *
 * Text fields are trimmed and become `undefined` when blank. Numeric fields are
 * trimmed and parsed with `Number`; blank or non-finite input becomes `undefined`.
 * When `realtimeTalkOptions` is nullish, an all-blank option set is used instead.
 */
private buildRealtimeTalkLaunchOptions(): RealtimeTalkLaunchOptions {
  const raw = this.realtimeTalkOptions ?? {
    provider: "",
    model: "",
    voice: "",
    transport: "",
    vadThreshold: "",
    silenceDurationMs: "",
    prefixPaddingMs: "",
    reasoningEffort: "",
  };
  // Blank-after-trim text collapses to undefined.
  const toText = (value: string) => {
    const trimmed = value.trim();
    return trimmed ? trimmed : undefined;
  };
  // Blank or non-finite numeric input collapses to undefined.
  const toNumber = (value: string) => {
    const trimmed = value.trim();
    if (!trimmed) {
      return undefined;
    }
    const parsed = Number(trimmed);
    if (!Number.isFinite(parsed)) {
      return undefined;
    }
    return parsed;
  };
  return {
    provider: toText(raw.provider),
    model: toText(raw.model),
    voice: toText(raw.voice),
    // The stored value is a free-form string; it is cast, not validated, here.
    transport: toText(raw.transport) as RealtimeTalkLaunchOptions["transport"] | undefined,
    vadThreshold: toNumber(raw.vadThreshold),
    silenceDurationMs: toNumber(raw.silenceDurationMs),
    prefixPaddingMs: toNumber(raw.prefixPaddingMs),
    reasoningEffort: toText(raw.reasoningEffort),
  };
}
async toggleRealtimeTalk() {
if (this.realtimeTalkSession) {
if (this.realtimeTalkStatus === "error") {
@@ -978,18 +1030,23 @@ export class OpenClawApp extends LitElement {
this.realtimeTalkStatus = "connecting";
this.realtimeTalkDetail = null;
this.realtimeTalkTranscript = null;
const session = new RealtimeTalkSession(this.client, this.sessionKey, {
onStatus: (status, detail) => {
this.realtimeTalkStatus = status;
this.realtimeTalkDetail = detail ?? null;
if (status === "idle" || status === "error") {
this.realtimeTalkActive = status !== "idle";
}
const session = new RealtimeTalkSession(
this.client,
this.sessionKey,
{
onStatus: (status, detail) => {
this.realtimeTalkStatus = status;
this.realtimeTalkDetail = detail ?? null;
if (status === "idle" || status === "error") {
this.realtimeTalkActive = status !== "idle";
}
},
onTranscript: (entry) => {
this.realtimeTalkTranscript = `${entry.role === "user" ? "You" : "OpenClaw"}: ${entry.text}`;
},
},
onTranscript: (entry) => {
this.realtimeTalkTranscript = `${entry.role === "user" ? "You" : "OpenClaw"}: ${entry.text}`;
},
});
this.buildRealtimeTalkLaunchOptions(),
);
this.realtimeTalkSession = session;
try {
await session.start();

View File

@@ -49,6 +49,7 @@ export class GatewayRelayRealtimeTalkTransport implements RealtimeTalkTransport
private readonly consultAbortControllers = new Set<AbortController>();
private cancelRequestedForPlayback = false;
private speechFramesDuringPlayback = 0;
private lastRelayError: string | undefined;
constructor(
private readonly session: RealtimeTalkGatewayRelaySessionResult,
@@ -85,6 +86,18 @@ export class GatewayRelayRealtimeTalkTransport implements RealtimeTalkTransport
}
/**
 * Stops the transport.
 *
 * Local teardown always runs via `stopLocal()`. The `talk.session.close` request
 * is sent only when the transport was not already closed beforehand, and a
 * rejection of that request is deliberately ignored.
 */
stop(): void {
  const alreadyClosed = this.closed;
  this.stopLocal();
  if (alreadyClosed) {
    return;
  }
  const closeRequest = this.ctx.client.request("talk.session.close", {
    sessionId: this.session.relaySessionId,
  });
  void closeRequest.catch(() => undefined);
}
private stopLocal(): void {
this.closed = true;
this.unsubscribe?.();
this.unsubscribe = null;
@@ -100,9 +113,6 @@ export class GatewayRelayRealtimeTalkTransport implements RealtimeTalkTransport
this.inputContext = null;
void this.outputContext?.close();
this.outputContext = null;
void this.ctx.client.request("talk.session.close", {
sessionId: this.session.relaySessionId,
});
}
private startMicrophonePump(): void {
@@ -120,11 +130,21 @@ export class GatewayRelayRealtimeTalkTransport implements RealtimeTalkTransport
if (this.detectBargeInSpeech(samples)) {
this.cancelOutputForBargeIn();
}
void this.ctx.client.request("talk.session.appendAudio", {
sessionId: this.session.relaySessionId,
audioBase64: bytesToBase64(pcm),
timestamp: Math.round((this.inputContext?.currentTime ?? 0) * 1000),
});
void this.ctx.client
.request("talk.session.appendAudio", {
sessionId: this.session.relaySessionId,
audioBase64: bytesToBase64(pcm),
timestamp: Math.round((this.inputContext?.currentTime ?? 0) * 1000),
})
.catch((error: unknown) => {
if (!this.closed) {
this.ctx.callbacks.onStatus?.(
"error",
error instanceof Error ? error.message : String(error),
);
this.stopLocal();
}
});
};
this.inputSource.connect(this.inputProcessor);
this.inputProcessor.connect(this.inputContext.destination);
@@ -167,15 +187,17 @@ export class GatewayRelayRealtimeTalkTransport implements RealtimeTalkTransport
void this.handleToolCall(event);
return;
case "error":
this.ctx.callbacks.onStatus?.("error", event.message ?? "Realtime relay failed");
this.lastRelayError = event.message ?? "Realtime relay failed";
this.ctx.callbacks.onStatus?.("error", this.lastRelayError);
return;
case "close":
this.abortConsults();
if (!this.closed) {
this.ctx.callbacks.onStatus?.(
event.reason === "error" ? "error" : "idle",
event.reason === "error" ? "Realtime relay closed" : undefined,
event.reason === "error" ? (this.lastRelayError ?? "Realtime relay closed") : undefined,
);
this.stopLocal();
}
return;
default:

View File

@@ -22,6 +22,17 @@ export type {
RealtimeTalkStatus,
};
/**
 * Optional per-launch overrides for a realtime Talk session.
 * Every field may be omitted; `transport` is restricted to the four string
 * literals listed below.
 */
export type RealtimeTalkLaunchOptions = {
provider?: string;
model?: string;
voice?: string;
transport?: "webrtc" | "provider-websocket" | "gateway-relay" | "managed-room";
vadThreshold?: number;
silenceDurationMs?: number;
prefixPaddingMs?: number;
reasoningEffort?: string;
};
function createTransport(
session: RealtimeTalkSessionResult,
ctx: RealtimeTalkTransportContext,
@@ -53,6 +64,12 @@ function resolveTransport(session: RealtimeTalkSessionResult): string {
return normalizeTalkTransport((session as { transport?: string }).transport) ?? "webrtc";
}
/**
 * Builds a request payload from `params`, omitting every entry whose value is
 * exactly `undefined`. Other falsy values (`0`, `""`, `null`, `false`) are kept.
 */
function compactLaunchParams(
  params: RealtimeTalkLaunchOptions & { sessionKey: string; mode?: string; brain?: string },
): Record<string, unknown> {
  const definedEntries = Object.entries(params).filter((entry) => entry[1] !== undefined);
  return Object.fromEntries(definedEntries);
}
export class RealtimeTalkSession {
private transport: RealtimeTalkTransport | null = null;
private closed = false;
@@ -61,6 +78,7 @@ export class RealtimeTalkSession {
private readonly client: GatewayBrowserClient,
private readonly sessionKey: string,
private readonly callbacks: RealtimeTalkCallbacks = {},
private readonly options: RealtimeTalkLaunchOptions = {},
) {}
async start(): Promise<void> {
@@ -82,17 +100,28 @@ export class RealtimeTalkSession {
private async createSession(): Promise<RealtimeTalkSessionResult> {
try {
return await this.client.request<RealtimeTalkSessionResult>("talk.client.create", {
sessionKey: this.sessionKey,
});
} catch (error) {
try {
return await this.client.request<RealtimeTalkSessionResult>("talk.session.create", {
return await this.client.request<RealtimeTalkSessionResult>(
"talk.client.create",
compactLaunchParams({
sessionKey: this.sessionKey,
mode: "realtime",
transport: "gateway-relay",
brain: "agent-consult",
});
...this.options,
}),
);
} catch (error) {
if (this.options.transport && this.options.transport !== "gateway-relay") {
throw error;
}
try {
return await this.client.request<RealtimeTalkSessionResult>(
"talk.session.create",
compactLaunchParams({
sessionKey: this.sessionKey,
...this.options,
mode: "realtime",
transport: this.options.transport ?? "gateway-relay",
brain: "agent-consult",
}),
);
} catch {
throw error;
}

View File

@@ -217,6 +217,103 @@ describe("GatewayRelayRealtimeTalkTransport", () => {
transport.stop();
});
// A rejected talk.session.appendAudio request must surface as an "error" status,
// stop further audio from being sent, and make a later stop() skip talk.session.close.
it("stops microphone pumping when the relay rejects appended audio", async () => {
const onStatus = vi.fn();
const client = createClient();
// Only appendAudio fails; every other request resolves with an empty object.
vi.mocked(client.request).mockImplementation(async (method) => {
if (method === "talk.session.appendAudio") {
throw new Error("Unknown realtime relay session");
}
return {};
});
const transport = new GatewayRelayRealtimeTalkTransport(createSession(), {
callbacks: { onStatus },
client,
sessionKey: "main",
});
await transport.start();
pumpMicrophone(new Float32Array(4096));
await vi.waitFor(() =>
expect(onStatus).toHaveBeenCalledWith("error", "Unknown realtime relay session"),
);
// This second frame arrives after the failure and is expected not to be forwarded.
pumpMicrophone(new Float32Array(4096));
transport.stop();
const appendCalls = vi
.mocked(client.request)
.mock.calls.filter(([method]) => method === "talk.session.appendAudio");
const closeCalls = vi
.mocked(client.request)
.mock.calls.filter(([method]) => method === "talk.session.close");
expect(appendCalls).toHaveLength(1);
expect(closeCalls).toHaveLength(0);
});
// A relay "close" event with reason "error" must report the generic error status,
// stop forwarding microphone audio, and make a later stop() skip talk.session.close.
it("treats relay close events as local shutdown", async () => {
const onStatus = vi.fn();
const client = createClient();
const transport = new GatewayRelayRealtimeTalkTransport(createSession(), {
callbacks: { onStatus },
client,
sessionKey: "main",
});
await transport.start();
// First frame is sent while the relay is still open.
pumpMicrophone(new Float32Array(4096));
emitGatewayFrame({
event: "talk.event",
payload: {
relaySessionId: "relay-1",
type: "close",
reason: "error",
},
});
// Second frame arrives after the close event and is expected not to be forwarded.
pumpMicrophone(new Float32Array(4096));
transport.stop();
const appendCalls = vi
.mocked(client.request)
.mock.calls.filter(([method]) => method === "talk.session.appendAudio");
const closeCalls = vi
.mocked(client.request)
.mock.calls.filter(([method]) => method === "talk.session.close");
expect(onStatus).toHaveBeenCalledWith("error", "Realtime relay closed");
expect(appendCalls).toHaveLength(1);
expect(closeCalls).toHaveLength(0);
});
// When an "error" event carrying a message is followed by a "close" event with
// reason "error", the last reported status must still carry the original message.
it("preserves relay error details across close events", async () => {
const onStatus = vi.fn();
const client = createClient();
const transport = new GatewayRelayRealtimeTalkTransport(createSession(), {
callbacks: { onStatus },
client,
sessionKey: "main",
});
await transport.start();
emitGatewayFrame({
event: "talk.event",
payload: {
relaySessionId: "relay-1",
type: "error",
message: "API version mismatch",
},
});
// The close event itself carries no message.
emitGatewayFrame({
event: "talk.event",
payload: {
relaySessionId: "relay-1",
type: "close",
reason: "error",
},
});
expect(onStatus).toHaveBeenCalledWith("error", "API version mismatch");
expect(onStatus).toHaveBeenLastCalledWith("error", "API version mismatch");
});
it("cancels relay playback after sustained input speech", async () => {
const client = createClient();
const transport = new GatewayRelayRealtimeTalkTransport(createSession(), {

View File

@@ -174,4 +174,41 @@ describe("RealtimeTalkSession", () => {
expect(googleCtor).not.toHaveBeenCalled();
expect(relayCtor).not.toHaveBeenCalled();
});
// Launch options given to the RealtimeTalkSession constructor must be sent, together
// with the session key, in the talk.client.create request.
it("passes launch options to client-owned realtime session creation", async () => {
const request = vi.fn(async () => ({
provider: "openai",
transport: "webrtc",
clientSecret: "secret",
}));
const session = new RealtimeTalkSession(
{ request } as never,
"main",
{},
{
provider: "openai",
model: "gpt-realtime-2",
voice: "marin",
transport: "webrtc",
vadThreshold: 0.45,
silenceDurationMs: 650,
prefixPaddingMs: 250,
reasoningEffort: "low",
},
);
await session.start();
// The expected payload contains only sessionKey plus the supplied options.
expect(request).toHaveBeenCalledWith("talk.client.create", {
sessionKey: "main",
provider: "openai",
model: "gpt-realtime-2",
voice: "marin",
transport: "webrtc",
vadThreshold: 0.45,
silenceDurationMs: 650,
prefixPaddingMs: 250,
reasoningEffort: "low",
});
});
});

View File

@@ -506,10 +506,70 @@ describe("chat voice controls", () => {
it("keeps Talk visible without the stale browser dictation button", () => {
const container = renderChatView();
expect(container.querySelectorAll('[aria-label="Start Talk"]')).toHaveLength(1);
expect(container.querySelector('[aria-label="Start Talk"]')).not.toBeNull();
expect(container.querySelector('[aria-label="Talk options"]')).not.toBeNull();
expect(container.querySelector('[aria-label="Voice input"]')).toBeNull();
});
// With the options panel open, the voice and reasoning selects must expose exactly
// the expected option values, and editing the model input must report the change.
it("renders editable Talk launch options", () => {
const onRealtimeTalkOptionsChange = vi.fn();
const container = renderChatView({
realtimeTalkOptionsOpen: true,
realtimeTalkOptions: {
provider: "openai",
model: "gpt-realtime-2",
voice: "marin",
transport: "webrtc",
vadThreshold: "0.45",
silenceDurationMs: "650",
prefixPaddingMs: "250",
reasoningEffort: "low",
},
onRealtimeTalkOptionsChange,
});
const model = container.querySelector<HTMLInputElement>(
'.agent-chat__talk-options input[placeholder="gpt-realtime-2"]',
);
// The selectors below are positional: the 4th label is looked up as Voice and the
// 5th as Reasoning, so they depend on the order in which the panel renders labels.
const voice = container.querySelector<HTMLSelectElement>(
".agent-chat__talk-options label:nth-of-type(4) select",
);
const voiceOptions = Array.from(
container.querySelectorAll<HTMLOptionElement>(
".agent-chat__talk-options label:nth-of-type(4) option",
),
).map((option) => option.value);
const reasoningOptions = Array.from(
container.querySelectorAll<HTMLOptionElement>(
".agent-chat__talk-options label:nth-of-type(5) option",
),
).map((option) => option.value);
expect(voice).not.toBeNull();
expect(voiceOptions).toEqual([
"",
"alloy",
"ash",
"ballad",
"coral",
"echo",
"sage",
"shimmer",
"verse",
"marin",
"cedar",
]);
// These three voice names must not be offered.
expect(voiceOptions).not.toContain("nova");
expect(voiceOptions).not.toContain("onyx");
expect(voiceOptions).not.toContain("fable");
expect(reasoningOptions).toEqual(["", "minimal", "low", "medium", "high"]);
expect(model).not.toBeNull();
model!.value = "gpt-realtime-mini";
model!.dispatchEvent(new Event("input", { bubbles: true }));
expect(onRealtimeTalkOptionsChange).toHaveBeenCalledWith({ model: "gpt-realtime-mini" });
});
it("lets users dismiss Talk start errors", () => {
const onDismissError = vi.fn();
const container = renderChatView({

View File

@@ -77,6 +77,17 @@ export type ChatProps = {
realtimeTalkStatus?: RealtimeTalkStatus;
realtimeTalkDetail?: string | null;
realtimeTalkTranscript?: string | null;
realtimeTalkOptionsOpen?: boolean;
realtimeTalkOptions?: {
provider: string;
model: string;
voice: string;
transport: string;
vadThreshold: string;
silenceDurationMs: string;
prefixPaddingMs: string;
reasoningEffort: string;
};
connected: boolean;
canSend: boolean;
disabledReason: string | null;
@@ -111,6 +122,10 @@ export type ChatProps = {
onCompact?: () => void | Promise<void>;
onOpenSessionCheckpoints?: () => void | Promise<void>;
onToggleRealtimeTalk?: () => void;
onToggleRealtimeTalkOptions?: () => void;
onRealtimeTalkOptionsChange?: (
next: Partial<NonNullable<ChatProps["realtimeTalkOptions"]>>,
) => void;
onDismissError?: () => void;
onAbort?: () => void;
onQueueRemove: (id: string) => void;
@@ -154,6 +169,110 @@ function getDeletedMessages(sessionKey: string): DeletedMessages {
);
}
/**
 * Renders the Talk launch-options panel.
 *
 * Returns `nothing` unless the panel is open and both the current options and a
 * change handler are provided. Each control reports edits through
 * `onRealtimeTalkOptionsChange` as a single-key partial update holding the
 * control's raw string value.
 */
function renderRealtimeTalkOptions(props: ChatProps) {
  const options = props.realtimeTalkOptions;
  const onChange = props.onRealtimeTalkOptionsChange;
  if (!props.realtimeTalkOptionsOpen || !options || !onChange) {
    return nothing;
  }
  // Builds an event handler that forwards the control's current value under `key`.
  const update = (key: keyof NonNullable<ChatProps["realtimeTalkOptions"]>) => (event: Event) => {
    const value = (event.currentTarget as HTMLInputElement | HTMLSelectElement).value;
    onChange({ [key]: value });
  };
  return html`
    <div class="agent-chat__talk-options" aria-label="Talk options">
      <label>
        <span>Provider</span>
        <select .value=${options.provider} @change=${update("provider")}>
          <option value="">Auto</option>
          <option value="openai">OpenAI</option>
          <option value="google">Google</option>
        </select>
      </label>
      <label>
        <span>Transport</span>
        <select .value=${options.transport} @change=${update("transport")}>
          <option value="">Auto</option>
          <option value="webrtc">WebRTC</option>
          <option value="gateway-relay">Gateway relay</option>
          <option value="provider-websocket">Provider WebSocket</option>
        </select>
      </label>
      <label>
        <span>Model</span>
        <input
          .value=${options.model}
          @input=${update("model")}
          placeholder="gpt-realtime-2"
          spellcheck="false"
        />
      </label>
      <label>
        <span>Voice</span>
        <select .value=${options.voice} @change=${update("voice")}>
          <option value="">Default</option>
          ${[
            "alloy",
            "ash",
            "ballad",
            "coral",
            "echo",
            "sage",
            "shimmer",
            "verse",
            "marin",
            "cedar",
          ].map(
            // These options come from a child expression that is committed after the
            // <select>'s `.value` binding, so on first render the value is assigned
            // before a matching option exists. Marking the option itself as selected
            // keeps the stored voice visible.
            (voice) =>
              html`<option value=${voice} ?selected=${voice === options.voice}>${voice}</option>`,
          )}
        </select>
      </label>
      <label>
        <span>Reasoning</span>
        <select .value=${options.reasoningEffort} @change=${update("reasoningEffort")}>
          <option value="">Default</option>
          <option value="minimal">Minimal</option>
          <option value="low">Low</option>
          <option value="medium">Medium</option>
          <option value="high">High</option>
        </select>
      </label>
      <label>
        <span>VAD</span>
        <input
          type="number"
          min="0"
          max="1"
          step="0.05"
          .value=${options.vadThreshold}
          @input=${update("vadThreshold")}
          placeholder="0.5"
        />
      </label>
      <label>
        <span>Silence ms</span>
        <input
          type="number"
          min="1"
          step="50"
          .value=${options.silenceDurationMs}
          @input=${update("silenceDurationMs")}
          placeholder="500"
        />
      </label>
      <label>
        <span>Prefix ms</span>
        <input
          type="number"
          min="0"
          step="50"
          .value=${options.prefixPaddingMs}
          @input=${update("prefixPaddingMs")}
          placeholder="300"
        />
      </label>
    </div>
  `;
}
interface ChatEphemeralState {
slashMenuOpen: boolean;
slashMenuItems: SlashCommandDef[];
@@ -1244,6 +1363,7 @@ export function renderChat(props: ChatProps) {
@change=${(e: Event) => handleFileSelect(e, props)}
/>
${renderRealtimeTalkOptions(props)}
${props.realtimeTalkActive || props.realtimeTalkDetail || props.realtimeTalkTranscript
? html`
<div class="agent-chat__stt-interim agent-chat__talk-status">
@@ -1311,6 +1431,17 @@ export function renderChat(props: ChatProps) {
>
${props.realtimeTalkActive ? icons.volume2 : icons.radio}
</button>
<button
class="agent-chat__input-btn ${props.realtimeTalkOptionsOpen
? "agent-chat__input-btn--active"
: ""}"
@click=${props.onToggleRealtimeTalkOptions}
title="Talk options"
aria-label="Talk options"
?disabled=${!props.connected || props.realtimeTalkActive}
>
${icons.settings}
</button>
`
: nothing}
${tokens ? html`<span class="agent-chat__token-count">${tokens}</span>` : nothing}