diff --git a/CHANGELOG.md b/CHANGELOG.md index 6bb60486a43..89a7e1b71c5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -99,6 +99,7 @@ Docs: https://docs.openclaw.ai - Agents/compaction: keep contributor diagnostics to a bounded top-three selection without sorting the full history. Thanks @shakkernerd. - Sessions/UI: avoid full-array sorting while selecting ACPX leases, Google Meet calendar events, and latest chat sessions. Thanks @shakkernerd. - Plugin SDK: mark direct `deliverOutboundPayloads` and legacy reply-dispatch bridges as deprecated compatibility substrate, enrich `sendDurableMessageBatch` with explicit durable send outcomes, migrate bundled send/turn paths off deprecated APIs, and enforce the split with `check:deprecated-api-usage`. +- OpenAI/Talk: let browser realtime Talk, Gateway relay/Voice Call realtime bridges, and OpenAI realtime transcription use `openai-codex` OAuth when no direct API key is configured; make Google Meet `test_speech` honor `mode: "bidi"`; expose Control UI launch options for provider/model/voice/transport/VAD/reasoning; and update the default OpenAI realtime voice model to `gpt-realtime-2`. - Telegram: preserve the channel-specific 10-option poll cap in the unified outbound adapter so over-limit polls are rejected before send. (#78762) Thanks @obviyus. - Telegram/streaming: continue over-limit draft previews in a new message instead of stopping when rendered preview text crosses Telegram's message limit. (#74508) Thanks @anagnorisis2peripeteia. - Slack: route handled top-level channel turns in implicit-conversation channels to thread-scoped sessions when Slack reply threading is enabled, keeping the root turn and later thread replies on one OpenClaw session. (#78522) Thanks @zeroth-blip. 
diff --git a/apps/shared/OpenClawKit/Sources/OpenClawProtocol/GatewayModels.swift b/apps/shared/OpenClawKit/Sources/OpenClawProtocol/GatewayModels.swift index ad1ec5bcf98..2020b92481a 100644 --- a/apps/shared/OpenClawKit/Sources/OpenClawProtocol/GatewayModels.swift +++ b/apps/shared/OpenClawKit/Sources/OpenClawProtocol/GatewayModels.swift @@ -2983,6 +2983,10 @@ public struct TalkClientCreateParams: Codable, Sendable { public let provider: String? public let model: String? public let voice: String? + public let vadthreshold: Double? + public let silencedurationms: Int? + public let prefixpaddingms: Int? + public let reasoningeffort: String? public let mode: AnyCodable? public let transport: AnyCodable? public let brain: AnyCodable? @@ -2992,6 +2996,10 @@ public struct TalkClientCreateParams: Codable, Sendable { provider: String?, model: String?, voice: String?, + vadthreshold: Double?, + silencedurationms: Int?, + prefixpaddingms: Int?, + reasoningeffort: String?, mode: AnyCodable?, transport: AnyCodable?, brain: AnyCodable?) @@ -3000,6 +3008,10 @@ public struct TalkClientCreateParams: Codable, Sendable { self.provider = provider self.model = model self.voice = voice + self.vadthreshold = vadthreshold + self.silencedurationms = silencedurationms + self.prefixpaddingms = prefixpaddingms + self.reasoningeffort = reasoningeffort self.mode = mode self.transport = transport self.brain = brain @@ -3010,6 +3022,10 @@ public struct TalkClientCreateParams: Codable, Sendable { case provider case model case voice + case vadthreshold = "vadThreshold" + case silencedurationms = "silenceDurationMs" + case prefixpaddingms = "prefixPaddingMs" + case reasoningeffort = "reasoningEffort" case mode case transport case brain @@ -3163,6 +3179,10 @@ public struct TalkSessionCreateParams: Codable, Sendable { public let provider: String? public let model: String? public let voice: String? + public let vadthreshold: Double? + public let silencedurationms: Int? 
+ public let prefixpaddingms: Int? + public let reasoningeffort: String? public let mode: AnyCodable? public let transport: AnyCodable? public let brain: AnyCodable? @@ -3173,6 +3193,10 @@ public struct TalkSessionCreateParams: Codable, Sendable { provider: String?, model: String?, voice: String?, + vadthreshold: Double?, + silencedurationms: Int?, + prefixpaddingms: Int?, + reasoningeffort: String?, mode: AnyCodable?, transport: AnyCodable?, brain: AnyCodable?, @@ -3182,6 +3206,10 @@ public struct TalkSessionCreateParams: Codable, Sendable { self.provider = provider self.model = model self.voice = voice + self.vadthreshold = vadthreshold + self.silencedurationms = silencedurationms + self.prefixpaddingms = prefixpaddingms + self.reasoningeffort = reasoningeffort self.mode = mode self.transport = transport self.brain = brain @@ -3193,6 +3221,10 @@ public struct TalkSessionCreateParams: Codable, Sendable { case provider case model case voice + case vadthreshold = "vadThreshold" + case silencedurationms = "silenceDurationMs" + case prefixpaddingms = "prefixPaddingMs" + case reasoningeffort = "reasoningEffort" case mode case transport case brain diff --git a/docs/providers/openai.md b/docs/providers/openai.md index 83370a03817..d5c34538a36 100644 --- a/docs/providers/openai.md +++ b/docs/providers/openai.md @@ -576,7 +576,7 @@ Legacy `plugins.entries.openai.config.personality` is still read as a compatibil ``` - Set `OPENAI_TTS_BASE_URL` to override the TTS base URL without affecting the chat API endpoint. + Set `OPENAI_TTS_BASE_URL` to override the TTS base URL without affecting the chat API endpoint. OpenAI TTS is still configured through an API key; for OAuth-only live talk-back, use the Realtime voice path instead of agent-mode STT -> TTS speech. 
@@ -627,10 +627,10 @@ Legacy `plugins.entries.openai.config.personality` is still read as a compatibil | Prompt | `...openai.prompt` | (unset) | | Silence duration | `...openai.silenceDurationMs` | `800` | | VAD threshold | `...openai.vadThreshold` | `0.5` | - | API key | `...openai.apiKey` | Falls back to `OPENAI_API_KEY` | + | Auth | `...openai.apiKey`, `OPENAI_API_KEY`, or `openai-codex` OAuth | API keys connect directly; OAuth mints a Realtime transcription client secret | - Uses a WebSocket connection to `wss://api.openai.com/v1/realtime` with G.711 u-law (`g711_ulaw` / `audio/pcmu`) audio. This streaming provider is for Voice Call's realtime transcription path; Discord voice currently records short segments and uses the batch `tools.media.audio` transcription path instead. + Uses a WebSocket connection to `wss://api.openai.com/v1/realtime` with G.711 u-law (`g711_ulaw` / `audio/pcmu`) audio. When only `openai-codex` OAuth is configured, the Gateway mints an ephemeral Realtime transcription client secret before opening the WebSocket. This streaming provider is for Voice Call's realtime transcription path; Discord voice currently records short segments and uses the batch `tools.media.audio` transcription path instead. 
@@ -645,7 +645,9 @@ Legacy `plugins.entries.openai.config.personality` is still read as a compatibil | Temperature (Azure deployment bridge) | `...openai.temperature` | `0.8` | | VAD threshold | `...openai.vadThreshold` | `0.5` | | Silence duration | `...openai.silenceDurationMs` | `500` | - | API key | `...openai.apiKey` | Falls back to `OPENAI_API_KEY` | + | Prefix padding | `...openai.prefixPaddingMs` | `300` | + | Reasoning effort | `...openai.reasoningEffort` | (unset) | + | Auth | `...openai.apiKey`, `OPENAI_API_KEY`, or `openai-codex` OAuth | Browser Talk and non-Azure backend bridges can use Codex OAuth | Available built-in Realtime voices for `gpt-realtime-2`: `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, `cedar`. @@ -667,7 +669,11 @@ Legacy `plugins.entries.openai.config.personality` is still read as a compatibil Control UI Talk uses OpenAI browser realtime sessions with a Gateway-minted ephemeral client secret and a direct browser WebRTC SDP exchange against the - OpenAI Realtime API. Maintainer live verification is available with + OpenAI Realtime API. When no direct OpenAI API key is configured, the + Gateway can mint that client secret with the selected `openai-codex` OAuth + profile. Gateway relay and Voice Call backend realtime WebSocket bridges use + the same OAuth fallback for native OpenAI endpoints. Maintainer live + verification is available with `OPENAI_API_KEY=... GEMINI_API_KEY=... node --import tsx scripts/dev/realtime-talk-live-smoke.ts`; the OpenAI legs verify both the backend WebSocket bridge and the browser WebRTC SDP exchange without logging secrets. diff --git a/docs/web/control-ui.md b/docs/web/control-ui.md index ce014cb893b..e4250ebdd38 100644 --- a/docs/web/control-ui.md +++ b/docs/web/control-ui.md @@ -171,7 +171,9 @@ Imported themes are stored only in the current browser profile. They are not wri - Talk mode uses a registered realtime voice provider. 
Configure OpenAI with `talk.realtime.provider: "openai"` plus `talk.realtime.providers.openai.apiKey`, or configure Google with `talk.realtime.provider: "google"` plus `talk.realtime.providers.google.apiKey`. The browser never receives a standard provider API key. OpenAI receives an ephemeral Realtime client secret for WebRTC. Google Live receives a one-use constrained Live API auth token for a browser WebSocket session, with instructions and tool declarations locked into the token by the Gateway. Providers that only expose a backend realtime bridge run through the Gateway relay transport, so credentials and vendor sockets stay server-side while browser audio moves through authenticated Gateway RPCs. The Realtime session prompt is assembled by the Gateway; `talk.client.create` does not accept caller-provided instruction overrides. + Talk mode uses a registered realtime voice provider. Configure OpenAI with `talk.realtime.provider: "openai"` plus either `talk.realtime.providers.openai.apiKey`, `OPENAI_API_KEY`, or an `openai-codex` OAuth profile; configure Google with `talk.realtime.provider: "google"` plus `talk.realtime.providers.google.apiKey`. The browser never receives a standard provider API key. OpenAI receives an ephemeral Realtime client secret for WebRTC. Google Live receives a one-use constrained Live API auth token for a browser WebSocket session, with instructions and tool declarations locked into the token by the Gateway. Providers that only expose a backend realtime bridge run through the Gateway relay transport, so credentials and vendor sockets stay server-side while browser audio moves through authenticated Gateway RPCs. The Realtime session prompt is assembled by the Gateway; `talk.client.create` does not accept caller-provided instruction overrides. + + The Chat composer includes a Talk options button next to the Talk start/stop button. 
The options apply to the next Talk session and can override provider, transport, model, voice, reasoning effort, VAD threshold, silence duration, and prefix padding. When an option is blank, the Gateway uses configured defaults where available or the provider default. Selecting Gateway relay forces the backend relay path; selecting WebRTC keeps the session client-owned and fails instead of silently falling back to relay if the provider cannot create a browser session. In the Chat composer, the Talk control is the waves button next to the microphone dictation button. When Talk starts, the composer status row shows `Connecting Talk...`, then `Talk live` while audio is connected, or `Asking OpenClaw...` while a realtime tool call is consulting the configured larger model through `talk.client.toolCall`. diff --git a/extensions/google-meet/index.test.ts b/extensions/google-meet/index.test.ts index a4090fa8492..4b89963a80b 100644 --- a/extensions/google-meet/index.test.ts +++ b/extensions/google-meet/index.test.ts @@ -2562,7 +2562,7 @@ describe("google-meet plugin", () => { expect(focusCall[3]).toEqual({ progress: false }); }); - it("does not mutate realtime browser prompts when status is requested", async () => { + it("refreshes blocked realtime browser health read-only when status is requested", async () => { let openedTab = false; const { methods, nodesInvoke } = setup( { @@ -2574,7 +2574,22 @@ describe("google-meet plugin", () => { const raw = params as { path?: string; body?: { url?: string; targetId?: string } }; if (command === "browser.proxy") { if (raw.path === "/tabs") { - return { payload: { result: { running: true, tabs: [] } } }; + return { + payload: { + result: { + running: true, + tabs: openedTab + ? 
[ + { + targetId: "tab-1", + title: "Meet", + url: "https://meet.google.com/abc-defg-hij", + }, + ] + : [], + }, + }, + }; } if (raw.path === "/tabs/open") { openedTab = true; @@ -2621,6 +2636,7 @@ describe("google-meet plugin", () => { const join = (await invokeGoogleMeetGatewayMethodForTest(methods, "googlemeet.join", { url: "https://meet.google.com/abc-defg-hij", })) as { session: { id: string } }; + openedTab = true; nodesInvoke.mockClear(); const status = (await invokeGoogleMeetGatewayMethodForTest(methods, "googlemeet.status", { @@ -2628,11 +2644,23 @@ describe("google-meet plugin", () => { })) as { session?: { chrome?: { health?: { manualActionRequired?: boolean } } } }; expect(status.session?.chrome?.health?.manualActionRequired).toBe(true); - expect( - nodesInvoke.mock.calls.some( - ([params]) => requireRecord(params, "node invoke").command === "browser.proxy", - ), - ).toBe(false); + expect(nodesInvoke).toHaveBeenCalledWith( + expect.objectContaining({ + command: "browser.proxy", + params: expect.objectContaining({ + path: "/act", + body: expect.objectContaining({ targetId: "tab-1" }), + }), + }), + ); + expect(nodesInvoke).not.toHaveBeenCalledWith( + expect.objectContaining({ + command: "browser.proxy", + params: expect.objectContaining({ + path: "/permissions/grant", + }), + }), + ); }); it("retries caption enable until the captions button is available", async () => { @@ -3573,6 +3601,52 @@ describe("google-meet plugin", () => { expect(result.speechOutputTimedOut).toBe(false); }); + it("uses the requested bidirectional realtime mode for test speech", async () => { + const runtime = new GoogleMeetRuntime({ + config: resolveGoogleMeetConfig({ defaultMode: "agent" }), + fullConfig: {} as never, + runtime: {} as never, + logger: noopLogger, + }); + const session: GoogleMeetSession = { + id: "meet_1", + url: "https://meet.google.com/abc-defg-hij", + transport: "chrome", + mode: "bidi", + state: "active", + createdAt: "2026-04-27T00:00:00.000Z", + updatedAt: 
"2026-04-27T00:00:00.000Z", + participantIdentity: "signed-in Google Chrome profile", + realtime: { + enabled: true, + strategy: "bidi", + provider: "openai", + toolPolicy: "safe-read-only", + }, + chrome: { + audioBackend: "blackhole-2ch", + launched: true, + health: { audioOutputActive: true, lastOutputBytes: 10 }, + }, + notes: [], + }; + vi.spyOn(runtime, "list").mockReturnValue([]); + const join = vi.spyOn(runtime, "join").mockResolvedValue({ session, spoken: true }); + + await runtime.testSpeech({ + url: "https://meet.google.com/abc-defg-hij", + mode: "bidi", + message: "Say exactly: hello.", + }); + + expect(join).toHaveBeenCalledWith( + expect.objectContaining({ + message: "Say exactly: hello.", + mode: "bidi", + }), + ); + }); + it("rejects observe-only mode for test speech", async () => { const runtime = new GoogleMeetRuntime({ config: resolveGoogleMeetConfig({}), diff --git a/extensions/google-meet/src/cli.test.ts b/extensions/google-meet/src/cli.test.ts index fbb7e1724ee..9b04e318e35 100644 --- a/extensions/google-meet/src/cli.test.ts +++ b/extensions/google-meet/src/cli.test.ts @@ -812,6 +812,69 @@ describe("google-meet CLI", () => { } }); + it("delegates test speech mode to the gateway-owned runtime", async () => { + const callGatewayFromCli = vi.fn(async () => ({ + createdSession: true, + spoken: true, + speechOutputVerified: true, + speechOutputTimedOut: false, + session: { + id: "meet_gateway", + url: "https://meet.google.com/abc-defg-hij", + state: "active", + transport: "chrome", + mode: "bidi", + participantIdentity: "signed-in Google Chrome profile", + createdAt: "2026-04-25T00:00:00.000Z", + updatedAt: "2026-04-25T00:00:01.000Z", + realtime: { enabled: true, strategy: "bidi", provider: "openai" }, + notes: [], + }, + })); + const ensureRuntime = vi.fn(async () => { + throw new Error("local runtime should not be loaded"); + }); + const stdout = captureStdout(); + try { + await setupCli({ + callGatewayFromCli, + ensureRuntime: ensureRuntime as 
unknown as () => Promise, + }).parseAsync( + [ + "googlemeet", + "test-speech", + "https://meet.google.com/abc-defg-hij", + "--transport", + "chrome", + "--mode", + "bidi", + "--message", + "Hello meeting", + ], + { from: "user" }, + ); + + expect(callGatewayFromCli).toHaveBeenCalledWith( + "googlemeet.testSpeech", + { json: true, timeout: expect.any(String) }, + { + url: "https://meet.google.com/abc-defg-hij", + transport: "chrome", + mode: "bidi", + message: "Hello meeting", + }, + { progress: false }, + ); + expect(ensureRuntime).not.toHaveBeenCalled(); + expect(JSON.parse(stdout.output())).toMatchObject({ + createdSession: true, + session: { mode: "bidi" }, + }); + } finally { + stdout.restore(); + } + }); + it("runs a listen-first health probe", async () => { const testListen = vi.fn(async () => ({ createdSession: true, diff --git a/extensions/google-meet/src/realtime-node.ts b/extensions/google-meet/src/realtime-node.ts index 8504263ff3b..9e647e47d64 100644 --- a/extensions/google-meet/src/realtime-node.ts +++ b/extensions/google-meet/src/realtime-node.ts @@ -229,6 +229,7 @@ export async function startNodeAgentAudioBridge(params: { }); sttSession = resolved.provider.createSession({ + cfg: params.fullConfig, providerConfig: resolved.providerConfig, onTranscript: (text) => { const trimmed = text.trim(); diff --git a/extensions/google-meet/src/realtime.ts b/extensions/google-meet/src/realtime.ts index 20055b86e21..15455fdf64a 100644 --- a/extensions/google-meet/src/realtime.ts +++ b/extensions/google-meet/src/realtime.ts @@ -704,6 +704,7 @@ export async function startCommandAgentAudioBridge(params: { }); sttSession = resolved.provider.createSession({ + cfg: params.fullConfig, providerConfig: resolved.providerConfig, onTranscript: (text) => { const trimmed = text.trim(); diff --git a/extensions/google-meet/src/runtime.ts b/extensions/google-meet/src/runtime.ts index 64b41c7e1e4..ef4d07cf67a 100644 --- a/extensions/google-meet/src/runtime.ts +++ 
b/extensions/google-meet/src/runtime.ts @@ -674,6 +674,13 @@ export class GoogleMeetRuntime { "test_speech requires mode: agent or bidi; use join mode: transcribe for observe-only sessions.", ); } + const requestedMode = request.mode ? resolveMode(request.mode, this.params.config) : undefined; + const mode = + requestedMode && isGoogleMeetTalkBackMode(requestedMode) + ? requestedMode + : isGoogleMeetTalkBackMode(this.params.config.defaultMode) + ? this.params.config.defaultMode + : "agent"; const url = normalizeMeetUrl(request.url); const transport = resolveTransport(request.transport, this.params.config); const beforeSessions = this.list(); @@ -690,7 +697,7 @@ export class GoogleMeetRuntime { ...request, transport, url, - mode: "agent", + mode, message: request.message ?? "Say exactly: Google Meet speech test complete.", }); let health = result.session.chrome?.health; @@ -821,10 +828,6 @@ export class GoogleMeetRuntime { async #refreshStatusHealthForSession(session: GoogleMeetSession) { if (session.transport === "chrome" || session.transport === "chrome-node") { - if (session.chrome?.health?.manualActionRequired) { - this.#refreshSpeechReadiness(session); - return; - } await this.#refreshBrowserHealthForChromeSession(session, { force: true, readOnly: true }); return; } diff --git a/extensions/openai/realtime-provider-shared.ts b/extensions/openai/realtime-provider-shared.ts index 478cc8f3cee..074b48993e5 100644 --- a/extensions/openai/realtime-provider-shared.ts +++ b/extensions/openai/realtime-provider-shared.ts @@ -1,4 +1,9 @@ +import { + createProviderHttpError, + resolveProviderRequestHeaders, +} from "openclaw/plugin-sdk/provider-http"; import { captureWsEvent } from "openclaw/plugin-sdk/proxy-capture"; +import { fetchWithSsrFGuard } from "openclaw/plugin-sdk/ssrf-runtime"; import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime"; export const trimToUndefined = normalizeOptionalString; @@ -56,3 +61,108 @@ export function 
captureOpenAIRealtimeWsClose(params: { }, }); } + +export type OpenAIRealtimeClientSecretResult = { + value: string; + expiresAt?: number; +}; + +type OpenAIRealtimeSecretRequest = { + authToken: string; + auditContext: string; + url: string; + body: unknown; + errorMessage: string; + missingValueMessage: string; +}; + +function readStringField(value: unknown, key: string): string | undefined { + if (!value || typeof value !== "object") { + return undefined; + } + const raw = (value as Record)[key]; + return typeof raw === "string" && raw.trim() ? raw.trim() : undefined; +} + +async function createOpenAIRealtimeSecret( + params: OpenAIRealtimeSecretRequest, +): Promise { + const { response, release } = await fetchWithSsrFGuard({ + url: params.url, + init: { + method: "POST", + headers: resolveProviderRequestHeaders({ + provider: "openai", + baseUrl: params.url, + capability: "audio", + transport: "http", + defaultHeaders: { + Authorization: `Bearer ${params.authToken}`, + "Content-Type": "application/json", + }, + }) ?? { + Authorization: `Bearer ${params.authToken}`, + "Content-Type": "application/json", + }, + body: JSON.stringify(params.body), + }, + auditContext: params.auditContext, + }); + const payload = await (async () => { + try { + if (!response.ok) { + throw await createProviderHttpError(response, params.errorMessage); + } + return (await response.json()) as unknown; + } finally { + await release(); + } + })(); + const nestedSecret = + payload && typeof payload === "object" + ? (payload as Record).client_secret + : undefined; + const clientSecret = readStringField(payload, "value") ?? readStringField(nestedSecret, "value"); + if (!clientSecret) { + throw new Error(params.missingValueMessage); + } + const expiresAt = + payload && typeof payload === "object" + ? (payload as Record).expires_at + : undefined; + return { + value: clientSecret, + ...(typeof expiresAt === "number" ? 
{ expiresAt } : {}), + }; +} + +export async function createOpenAIRealtimeClientSecret(params: { + authToken: string; + auditContext: string; + session: Record; +}): Promise { + const url = "https://api.openai.com/v1/realtime/client_secrets"; + return createOpenAIRealtimeSecret({ + ...params, + url, + body: { session: params.session }, + errorMessage: "OpenAI Realtime client secret failed", + missingValueMessage: "OpenAI Realtime client secret response did not include a value", + }); +} + +export async function createOpenAIRealtimeTranscriptionClientSecret(params: { + authToken: string; + auditContext: string; + session: Record; +}): Promise { + const url = "https://api.openai.com/v1/realtime/transcription_sessions"; + return createOpenAIRealtimeSecret({ + ...params, + url, + body: params.session, + errorMessage: "OpenAI Realtime transcription client secret failed", + missingValueMessage: + "OpenAI Realtime transcription client secret response did not include a value", + }); +} diff --git a/extensions/openai/realtime-transcription-provider.test.ts b/extensions/openai/realtime-transcription-provider.test.ts index a2b4585ea4c..e40edd063da 100644 --- a/extensions/openai/realtime-transcription-provider.test.ts +++ b/extensions/openai/realtime-transcription-provider.test.ts @@ -1,7 +1,7 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; import { buildOpenAIRealtimeTranscriptionProvider } from "./realtime-transcription-provider.js"; -const { FakeWebSocket } = vi.hoisted(() => { +const { FakeWebSocket, providerAuthMocks, ssrfMocks } = vi.hoisted(() => { type Listener = (...args: unknown[]) => void; class MockWebSocket { @@ -10,11 +10,15 @@ const { FakeWebSocket } = vi.hoisted(() => { static instances: MockWebSocket[] = []; readonly listeners = new Map(); + readonly headers?: Record; + readonly url?: string; readyState = 0; sent: string[] = []; closed = false; - constructor() { + constructor(url?: string, options?: { headers?: Record }) { + this.url = url; + 
this.headers = options?.headers; MockWebSocket.instances.push(this); } @@ -42,40 +46,59 @@ const { FakeWebSocket } = vi.hoisted(() => { } } - return { FakeWebSocket: MockWebSocket }; + return { + FakeWebSocket: MockWebSocket, + providerAuthMocks: { + isProviderAuthProfileConfigured: vi.fn(), + resolveProviderAuthProfileApiKey: vi.fn(), + }, + ssrfMocks: { + fetchWithSsrFGuard: vi.fn(), + }, + }; }); vi.mock("ws", () => ({ default: FakeWebSocket, })); +vi.mock("openclaw/plugin-sdk/provider-auth", () => ({ + isProviderAuthProfileConfigured: providerAuthMocks.isProviderAuthProfileConfigured, + resolveProviderAuthProfileApiKey: providerAuthMocks.resolveProviderAuthProfileApiKey, +})); + +vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({ + fetchWithSsrFGuard: ssrfMocks.fetchWithSsrFGuard, +})); + type FakeWebSocketInstance = InstanceType; type SentRealtimeEvent = { type: string; audio?: string; - session?: { - input_audio_format?: string; - input_audio_transcription?: { - model?: string; - language?: string; - prompt?: string; - }; - turn_detection?: { - type?: string; - threshold?: number; - prefix_padding_ms?: number; - silence_duration_ms?: number; - }; - }; + session?: unknown; }; function parseSent(socket: FakeWebSocketInstance): SentRealtimeEvent[] { return socket.sent.map((payload) => JSON.parse(payload) as SentRealtimeEvent); } +async function waitForFakeSocket(): Promise { + for (let attempt = 0; attempt < 20; attempt += 1) { + const socket = FakeWebSocket.instances[0]; + if (socket) { + return socket; + } + await new Promise((resolve) => setTimeout(resolve, 0)); + } + throw new Error("expected session to create a websocket"); +} + describe("buildOpenAIRealtimeTranscriptionProvider", () => { beforeEach(() => { FakeWebSocket.instances = []; + providerAuthMocks.isProviderAuthProfileConfigured.mockReset(); + providerAuthMocks.resolveProviderAuthProfileApiKey.mockReset(); + ssrfMocks.fetchWithSsrFGuard.mockReset(); }); it("normalizes OpenAI config defaults", () 
=> { @@ -147,6 +170,83 @@ describe("buildOpenAIRealtimeTranscriptionProvider", () => { expect(provider.aliases).toContain("openai-realtime"); }); + it("treats a Codex OAuth profile as configured when no API key is present", () => { + const provider = buildOpenAIRealtimeTranscriptionProvider(); + const cfg = { auth: { order: { "openai-codex": ["openai-codex:default"] } } }; + providerAuthMocks.isProviderAuthProfileConfigured.mockReturnValue(true); + + expect(provider.isConfigured({ cfg: cfg as never, providerConfig: {} })).toBe(true); + expect(providerAuthMocks.isProviderAuthProfileConfigured).toHaveBeenCalledWith({ + provider: "openai-codex", + cfg, + }); + }); + + it("mints a Codex OAuth client secret for realtime transcription sockets", async () => { + const provider = buildOpenAIRealtimeTranscriptionProvider(); + const release = vi.fn(); + providerAuthMocks.resolveProviderAuthProfileApiKey.mockResolvedValue("oauth-token"); + ssrfMocks.fetchWithSsrFGuard.mockResolvedValue({ + response: new Response(JSON.stringify({ value: "ek-test" }), { status: 200 }), + release, + }); + const cfg = { auth: { order: { "openai-codex": ["openai-codex:default"] } } }; + const session = provider.createSession({ + cfg: cfg as never, + providerConfig: {}, + }); + + const connecting = session.connect(); + const socket = await waitForFakeSocket(); + + expect(socket.headers).toMatchObject({ Authorization: "Bearer ek-test" }); + expect(providerAuthMocks.resolveProviderAuthProfileApiKey).toHaveBeenCalledWith({ + provider: "openai-codex", + cfg, + }); + expect(ssrfMocks.fetchWithSsrFGuard).toHaveBeenCalledWith( + expect.objectContaining({ + auditContext: "openai-realtime-transcription-session", + url: "https://api.openai.com/v1/realtime/transcription_sessions", + init: expect.objectContaining({ + method: "POST", + headers: expect.objectContaining({ + Authorization: "Bearer oauth-token", + "Content-Type": "application/json", + }), + body: expect.any(String), + }), + }), + ); + const request 
= ssrfMocks.fetchWithSsrFGuard.mock.calls[0]?.[0] as + | { init?: { body?: unknown } } + | undefined; + expect(JSON.parse(String(request?.init?.body))).toMatchObject({ + type: "transcription", + audio: { + input: { + format: { type: "audio/pcmu" }, + transcription: { model: "gpt-4o-transcribe" }, + }, + }, + }); + + socket.readyState = FakeWebSocket.OPEN; + socket.emit("open"); + socket.emit("message", Buffer.from(JSON.stringify({ type: "transcription_session.updated" }))); + await connecting; + + expect(release).toHaveBeenCalled(); + expect(parseSent(socket)[0]).toMatchObject({ + type: "transcription_session.update", + session: { + input_audio_format: "g711_ulaw", + input_audio_transcription: { model: "gpt-4o-transcribe" }, + }, + }); + session.close(); + }); + it("waits for the OpenAI session update before draining audio", async () => { const provider = buildOpenAIRealtimeTranscriptionProvider(); const session = provider.createSession({ @@ -161,10 +261,7 @@ describe("buildOpenAIRealtimeTranscriptionProvider", () => { }); const connecting = session.connect(); - const socket = FakeWebSocket.instances[0]; - if (!socket) { - throw new Error("expected session to create a websocket"); - } + const socket = await waitForFakeSocket(); socket.readyState = FakeWebSocket.OPEN; socket.emit("open"); diff --git a/extensions/openai/realtime-transcription-provider.ts b/extensions/openai/realtime-transcription-provider.ts index c1e8e9bf4b5..4c401a3999f 100644 --- a/extensions/openai/realtime-transcription-provider.ts +++ b/extensions/openai/realtime-transcription-provider.ts @@ -1,3 +1,8 @@ +import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types"; +import { + isProviderAuthProfileConfigured, + resolveProviderAuthProfileApiKey, +} from "openclaw/plugin-sdk/provider-auth"; import { resolveProviderRequestHeaders } from "openclaw/plugin-sdk/provider-http"; import { createRealtimeTranscriptionWebSocketSession, @@ -10,6 +15,7 @@ import { import { 
normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input"; import { asFiniteNumber, + createOpenAIRealtimeTranscriptionClientSecret, readRealtimeErrorDetail, resolveOpenAIProviderConfigRecord, trimToUndefined, @@ -25,7 +31,8 @@ type OpenAIRealtimeTranscriptionProviderConfig = { }; type OpenAIRealtimeTranscriptionSessionConfig = RealtimeTranscriptionSessionCreateRequest & { - apiKey: string; + apiKey?: string; + cfg?: OpenClawConfig; language?: string; model: string; prompt?: string; @@ -40,6 +47,41 @@ type RealtimeEvent = { error?: unknown; }; +type OpenAIRealtimeTranscriptionSessionCreate = { + type: "transcription"; + audio: { + input: { + format: { type: "audio/pcmu" }; + transcription: { + model: string; + language?: string; + prompt?: string; + }; + turn_detection: { + type: "server_vad"; + threshold: number; + prefix_padding_ms: number; + silence_duration_ms: number; + }; + }; + }; +}; + +type OpenAIRealtimeTranscriptionSessionUpdate = { + input_audio_format: "g711_ulaw"; + input_audio_transcription: { + model: string; + language?: string; + prompt?: string; + }; + turn_detection: { + type: "server_vad"; + threshold: number; + prefix_padding_ms: number; + silence_duration_ms: number; + }; +}; + const OPENAI_REALTIME_TRANSCRIPTION_URL = "wss://api.openai.com/v1/realtime?intent=transcription"; const OPENAI_REALTIME_TRANSCRIPTION_CONNECT_TIMEOUT_MS = 10_000; const OPENAI_REALTIME_TRANSCRIPTION_MAX_RECONNECT_ATTEMPTS = 5; @@ -68,6 +110,71 @@ function normalizeProviderConfig( }; } +function buildOpenAIRealtimeTranscriptionSessionCreateConfig( + config: OpenAIRealtimeTranscriptionSessionConfig, +): OpenAIRealtimeTranscriptionSessionCreate { + return { + type: "transcription", + audio: { + input: { + format: { type: "audio/pcmu" }, + transcription: { + model: config.model, + ...(config.language ? { language: config.language } : {}), + ...(config.prompt ? 
{ prompt: config.prompt } : {}), + }, + turn_detection: { + type: "server_vad", + threshold: config.vadThreshold, + prefix_padding_ms: 300, + silence_duration_ms: config.silenceDurationMs, + }, + }, + }, + }; +} + +function buildOpenAIRealtimeTranscriptionSessionUpdateConfig( + config: OpenAIRealtimeTranscriptionSessionConfig, +): OpenAIRealtimeTranscriptionSessionUpdate { + return { + input_audio_format: "g711_ulaw", + input_audio_transcription: { + model: config.model, + ...(config.language ? { language: config.language } : {}), + ...(config.prompt ? { prompt: config.prompt } : {}), + }, + turn_detection: { + type: "server_vad", + threshold: config.vadThreshold, + prefix_padding_ms: 300, + silence_duration_ms: config.silenceDurationMs, + }, + }; +} + +async function resolveOpenAIRealtimeTranscriptionAuthorization( + config: OpenAIRealtimeTranscriptionSessionConfig, +): Promise { + const apiKey = config.apiKey || process.env.OPENAI_API_KEY; + if (apiKey) { + return apiKey; + } + const authToken = await resolveProviderAuthProfileApiKey({ + provider: "openai-codex", + cfg: config.cfg, + }); + if (!authToken) { + throw new Error("OpenAI API key or Codex OAuth missing"); + } + const clientSecret = await createOpenAIRealtimeTranscriptionClientSecret({ + authToken, + auditContext: "openai-realtime-transcription-session", + session: buildOpenAIRealtimeTranscriptionSessionCreateConfig(config), + }); + return clientSecret.value; +} + function createOpenAIRealtimeTranscriptionSession( config: OpenAIRealtimeTranscriptionSessionConfig, ): RealtimeTranscriptionSession { @@ -122,18 +229,21 @@ function createOpenAIRealtimeTranscriptionSession( providerId: "openai", callbacks: config, url: OPENAI_REALTIME_TRANSCRIPTION_URL, - headers: resolveProviderRequestHeaders({ - provider: "openai", - baseUrl: OPENAI_REALTIME_TRANSCRIPTION_URL, - capability: "audio", - transport: "websocket", - defaultHeaders: { - Authorization: `Bearer ${config.apiKey}`, - "OpenAI-Beta": "realtime=v1", - }, 
- }) ?? { - Authorization: `Bearer ${config.apiKey}`, - "OpenAI-Beta": "realtime=v1", + headers: async () => { + const bearer = await resolveOpenAIRealtimeTranscriptionAuthorization(config); + return ( + resolveProviderRequestHeaders({ + provider: "openai", + baseUrl: OPENAI_REALTIME_TRANSCRIPTION_URL, + capability: "audio", + transport: "websocket", + defaultHeaders: { + Authorization: `Bearer ${bearer}`, + }, + }) ?? { + Authorization: `Bearer ${bearer}`, + } + ); }, connectTimeoutMs: OPENAI_REALTIME_TRANSCRIPTION_CONNECT_TIMEOUT_MS, maxReconnectAttempts: OPENAI_REALTIME_TRANSCRIPTION_MAX_RECONNECT_ATTEMPTS, @@ -150,20 +260,7 @@ function createOpenAIRealtimeTranscriptionSession( onOpen: (transport: RealtimeTranscriptionWebSocketTransport) => { transport.sendJson({ type: "transcription_session.update", - session: { - input_audio_format: "g711_ulaw", - input_audio_transcription: { - model: config.model, - ...(config.language ? { language: config.language } : {}), - ...(config.prompt ? { prompt: config.prompt } : {}), - }, - turn_detection: { - type: "server_vad", - threshold: config.vadThreshold, - prefix_padding_ms: 300, - silence_duration_ms: config.silenceDurationMs, - }, - }, + session: buildOpenAIRealtimeTranscriptionSessionUpdateConfig(config), }); }, onMessage: handleEvent, @@ -178,17 +275,17 @@ export function buildOpenAIRealtimeTranscriptionProvider(): RealtimeTranscriptio defaultModel: OPENAI_REALTIME_TRANSCRIPTION_DEFAULT_MODEL, autoSelectOrder: 10, resolveConfig: ({ rawConfig }) => normalizeProviderConfig(rawConfig), - isConfigured: ({ providerConfig }) => - Boolean(normalizeProviderConfig(providerConfig).apiKey || process.env.OPENAI_API_KEY), + isConfigured: ({ cfg, providerConfig }) => + Boolean( + normalizeProviderConfig(providerConfig).apiKey || + process.env.OPENAI_API_KEY || + isProviderAuthProfileConfigured({ provider: "openai-codex", cfg }), + ), createSession: (req) => { const config = normalizeProviderConfig(req.providerConfig); - const apiKey 
= config.apiKey || process.env.OPENAI_API_KEY; - if (!apiKey) { - throw new Error("OpenAI API key missing"); - } return createOpenAIRealtimeTranscriptionSession({ ...req, - apiKey, + apiKey: config.apiKey, language: config.language, model: config.model ?? OPENAI_REALTIME_TRANSCRIPTION_DEFAULT_MODEL, prompt: config.prompt, diff --git a/extensions/openai/realtime-voice-provider.test.ts b/extensions/openai/realtime-voice-provider.test.ts index edd34b6ae82..13dd6509ed4 100644 --- a/extensions/openai/realtime-voice-provider.test.ts +++ b/extensions/openai/realtime-voice-provider.test.ts @@ -2,7 +2,13 @@ import { REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ } from "openclaw/plugin-sdk/rea import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { buildOpenAIRealtimeVoiceProvider } from "./realtime-voice-provider.js"; -const { FakeWebSocket, execFileSyncMock, fetchWithSsrFGuardMock } = vi.hoisted(() => { +const { + FakeWebSocket, + execFileSyncMock, + fetchWithSsrFGuardMock, + isProviderAuthProfileConfiguredMock, + resolveProviderAuthProfileApiKeyMock, +} = vi.hoisted(() => { type Listener = (...args: unknown[]) => void; class MockWebSocket { @@ -55,6 +61,8 @@ const { FakeWebSocket, execFileSyncMock, fetchWithSsrFGuardMock } = vi.hoisted(( FakeWebSocket: MockWebSocket, execFileSyncMock: vi.fn(), fetchWithSsrFGuardMock: vi.fn(), + isProviderAuthProfileConfiguredMock: vi.fn(), + resolveProviderAuthProfileApiKeyMock: vi.fn(), }; }); @@ -74,6 +82,11 @@ vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({ fetchWithSsrFGuard: fetchWithSsrFGuardMock, })); +vi.mock("openclaw/plugin-sdk/provider-auth", () => ({ + isProviderAuthProfileConfigured: isProviderAuthProfileConfiguredMock, + resolveProviderAuthProfileApiKey: resolveProviderAuthProfileApiKeyMock, +})); + type FakeWebSocketInstance = InstanceType; type SentRealtimeEvent = { type: string; @@ -82,8 +95,14 @@ type SentRealtimeEvent = { content_index?: number; audio_end_ms?: number; session?: { + type?: 
string; + model?: string; + modalities?: string[]; + instructions?: string; + voice?: string; input_audio_format?: string; output_audio_format?: string; + input_audio_transcription?: Record; turn_detection?: { create_response?: boolean; }; @@ -100,6 +119,7 @@ type SentRealtimeEvent = { }; output?: { format?: Record; + voice?: string; }; }; item?: unknown; @@ -124,6 +144,10 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { FakeWebSocket.instances = []; execFileSyncMock.mockReset(); fetchWithSsrFGuardMock.mockReset(); + isProviderAuthProfileConfiguredMock.mockReset(); + isProviderAuthProfileConfiguredMock.mockReturnValue(false); + resolveProviderAuthProfileApiKeyMock.mockReset(); + resolveProviderAuthProfileApiKeyMock.mockResolvedValue(undefined); }); afterEach(() => { @@ -184,32 +208,100 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { expect(options?.headers).not.toHaveProperty("OpenAI-Beta"); }); - it("keeps Azure deployment realtime bridge requests on the deployment-compatible session shape", () => { + it("mints an ephemeral Realtime secret for native websocket bridges when using Codex OAuth", async () => { + resolveProviderAuthProfileApiKeyMock.mockResolvedValueOnce("oauth-token"); + fetchWithSsrFGuardMock.mockResolvedValueOnce({ + response: createJsonResponse({ + client_secret: { value: "ephemeral-realtime-secret" }, + }), + release: vi.fn(async () => undefined), + }); const provider = buildOpenAIRealtimeVoiceProvider(); const bridge = provider.createBridge({ - providerConfig: { - apiKey: "sk-test", // pragma: allowlist secret - azureEndpoint: "https://example.openai.azure.com", - azureDeployment: "realtime-prod", - }, + cfg: {} as never, + providerConfig: { model: "gpt-realtime-2" }, onAudio: vi.fn(), onClearAudio: vi.fn(), }); void bridge.connect(); - const socket = FakeWebSocket.instances[0]; - if (!socket) { - throw new Error("expected bridge to create a websocket"); - } - socket.readyState = FakeWebSocket.OPEN; - socket.emit("open"); + await 
vi.waitFor(() => expect(FakeWebSocket.instances.length).toBe(1)); bridge.close(); - expect(parseSent(socket)[0]?.session).toMatchObject({ - modalities: ["text", "audio"], - input_audio_format: "g711_ulaw", - output_audio_format: "g711_ulaw", + expect(resolveProviderAuthProfileApiKeyMock).toHaveBeenCalledWith({ + provider: "openai-codex", + cfg: {}, }); + expect(fetchWithSsrFGuardMock).toHaveBeenCalledWith( + expect.objectContaining({ + url: "https://api.openai.com/v1/realtime/client_secrets", + init: expect.objectContaining({ + method: "POST", + headers: expect.objectContaining({ + Authorization: "Bearer oauth-token", // pragma: allowlist secret + "Content-Type": "application/json", + }), + }), + auditContext: "openai-realtime-bridge-session", + }), + ); + const request = fetchWithSsrFGuardMock.mock.calls[0]?.[0] as + | { init?: { body?: string } } + | undefined; + const body = JSON.parse(request?.init?.body ?? "{}") as { + session?: { + type?: string; + model?: string; + audio?: { output?: { voice?: string } }; + }; + }; + expect(body.session).toMatchObject({ + type: "realtime", + model: "gpt-realtime-2", + audio: { output: { voice: "alloy" } }, + }); + const socket = FakeWebSocket.instances[0]; + const options = socket?.args[1] as { headers?: Record } | undefined; + expect(options?.headers?.Authorization).toBe("Bearer ephemeral-realtime-secret"); + expect(options?.headers).not.toHaveProperty("OpenAI-Beta"); + }); + + it("does not open a native websocket after slow OAuth resolution times out", async () => { + vi.useFakeTimers(); + resolveProviderAuthProfileApiKeyMock.mockResolvedValueOnce("oauth-token"); + let resolveClientSecret: (value: { + response: Response; + release: () => Promise; + }) => void = () => {}; + fetchWithSsrFGuardMock.mockReturnValueOnce( + new Promise((resolve) => { + resolveClientSecret = resolve; + }), + ); + const provider = buildOpenAIRealtimeVoiceProvider(); + const bridge = provider.createBridge({ + cfg: {} as never, + providerConfig: { 
model: "gpt-realtime-2" }, + onAudio: vi.fn(), + onClearAudio: vi.fn(), + }); + + const connecting = expect(bridge.connect()).rejects.toThrow( + "OpenAI realtime connection timeout", + ); + await vi.advanceTimersByTimeAsync(10_000); + await connecting; + + resolveClientSecret({ + response: createJsonResponse({ + client_secret: { value: "ephemeral-realtime-secret" }, + }), + release: vi.fn(async () => undefined), + }); + await vi.runAllTimersAsync(); + + expect(FakeWebSocket.instances).toHaveLength(0); + bridge.close(); }); it("returns browser-safe OpenClaw attribution headers for native WebRTC offers", async () => { @@ -229,6 +321,7 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { const session = await provider.createBrowserSession({ providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret instructions: "Be concise.", + voice: " Marin ", }); expect(fetchWithSsrFGuardMock).toHaveBeenCalledWith( @@ -257,7 +350,9 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { turn_detection?: Record; transcription?: Record; }; + output?: Record; }; + reasoning?: Record; }; }; expect(body.session?.model).toBe("gpt-realtime-2"); @@ -270,6 +365,8 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { }, transcription: { model: "gpt-4o-mini-transcribe" }, }); + expect(body.session?.audio?.output).toEqual({ voice: "marin" }); + expect(body.session).not.toHaveProperty("temperature"); expect(session).toMatchObject({ provider: "openai", transport: "webrtc", @@ -359,20 +456,84 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { expect(execFileSyncMock).not.toHaveBeenCalled(); }); - it("fails closed when keychain refs cannot be resolved", () => { + it("treats OpenAI Codex OAuth profiles as configured for browser realtime sessions", () => { + isProviderAuthProfileConfiguredMock.mockReturnValue(true); + const provider = buildOpenAIRealtimeVoiceProvider(); + const cfg = { agents: { defaults: {} } } as never; + + expect(provider.isConfigured({ cfg, providerConfig: {} 
})).toBe(true); + expect(isProviderAuthProfileConfiguredMock).toHaveBeenCalledWith({ + provider: "openai-codex", + cfg, + }); + }); + + it("does not use Codex OAuth to configure Azure realtime sessions", () => { + isProviderAuthProfileConfiguredMock.mockReturnValue(true); + const provider = buildOpenAIRealtimeVoiceProvider(); + const cfg = { agents: { defaults: {} } } as never; + + expect( + provider.isConfigured({ + cfg, + providerConfig: { + azureEndpoint: "https://example.openai.azure.com", + azureDeployment: "realtime", + }, + }), + ).toBe(false); + expect(isProviderAuthProfileConfiguredMock).not.toHaveBeenCalled(); + }); + + it("uses OpenAI Codex OAuth to mint browser realtime client secrets when no API key is set", async () => { + resolveProviderAuthProfileApiKeyMock.mockResolvedValueOnce("oauth-realtime-token"); + fetchWithSsrFGuardMock.mockResolvedValueOnce({ + response: createJsonResponse({ + client_secret: { value: "client-secret-123" }, + }), + release: vi.fn(async () => undefined), + }); + const provider = buildOpenAIRealtimeVoiceProvider(); + if (!provider.createBrowserSession) { + throw new Error("expected OpenAI realtime provider to support browser sessions"); + } + const cfg = { agents: { defaults: {} } } as never; + + await provider.createBrowserSession({ + cfg, + providerConfig: {}, + instructions: "Be concise.", + }); + + expect(resolveProviderAuthProfileApiKeyMock).toHaveBeenCalledWith({ + provider: "openai-codex", + cfg, + }); + expect(fetchWithSsrFGuardMock).toHaveBeenCalledWith( + expect.objectContaining({ + init: expect.objectContaining({ + headers: expect.objectContaining({ + Authorization: "Bearer oauth-realtime-token", // pragma: allowlist secret + }), + }), + }), + ); + }); + + it("fails closed when keychain refs cannot be resolved", async () => { vi.stubEnv("OPENAI_API_KEY", "keychain:openclaw:OPENAI_REALTIME_MISSING_TEST"); execFileSyncMock.mockImplementationOnce(() => { throw new Error("keychain unavailable"); }); const provider = 
buildOpenAIRealtimeVoiceProvider(); - expect(() => - provider.createBridge({ - providerConfig: {}, - onAudio: vi.fn(), - onClearAudio: vi.fn(), - }), - ).toThrow("OpenAI API key missing"); + const bridge = provider.createBridge({ + providerConfig: {}, + onAudio: vi.fn(), + onClearAudio: vi.fn(), + }); + + await expect(bridge.connect()).rejects.toThrow("OpenAI API key or Codex OAuth missing"); }); it("normalizes provider-owned voice settings from raw provider config", () => { @@ -383,10 +544,11 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { providers: { openai: { model: "gpt-realtime-2", - voice: "verse", + voice: " Verse ", temperature: 0.6, silenceDurationMs: 850, vadThreshold: 0.35, + reasoningEffort: "low", }, }, }, @@ -398,6 +560,7 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { temperature: 0.6, silenceDurationMs: 850, vadThreshold: 0.35, + reasoningEffort: "low", }); }); @@ -443,6 +606,7 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { }, output: { format: { type: "audio/pcmu" }, + voice: "alloy", }, }, }); @@ -461,6 +625,53 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { expect(bridge.isConnected()).toBe(true); }); + it("keeps Azure deployment bridges on deployment-compatible session payloads", async () => { + const provider = buildOpenAIRealtimeVoiceProvider(); + const bridge = provider.createBridge({ + providerConfig: { + apiKey: "sk-test", // pragma: allowlist secret + azureEndpoint: "https://example.openai.azure.com/", + azureDeployment: "realtime-prod", + azureApiVersion: "2024-10-01-preview", + voice: "verse", + }, + audioFormat: REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ, + instructions: "Be helpful.", + onAudio: vi.fn(), + onClearAudio: vi.fn(), + }); + const connecting = bridge.connect(); + const socket = FakeWebSocket.instances[0]; + if (!socket) { + throw new Error("expected bridge to create a websocket"); + } + + expect(socket.args[0]).toBe( + 
"wss://example.openai.azure.com/openai/realtime?api-version=2024-10-01-preview&deployment=realtime-prod", + ); + + socket.readyState = FakeWebSocket.OPEN; + socket.emit("open"); + await Promise.resolve(); + + const session = parseSent(socket)[0]?.session; + expect(session).toMatchObject({ + modalities: ["text", "audio"], + instructions: "Be helpful.", + voice: "verse", + input_audio_format: "pcm16", + output_audio_format: "pcm16", + input_audio_transcription: { model: "whisper-1" }, + turn_detection: { create_response: true }, + temperature: 0.8, + }); + expect(session).not.toHaveProperty("type"); + expect(session).not.toHaveProperty("audio"); + + socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" }))); + await connecting; + }); + it("rejects connection when session configuration fails before readiness", async () => { const provider = buildOpenAIRealtimeVoiceProvider(); const bridge = provider.createBridge({ diff --git a/extensions/openai/realtime-voice-provider.ts b/extensions/openai/realtime-voice-provider.ts index e75acfe88cb..b0fac2a4d9d 100644 --- a/extensions/openai/realtime-voice-provider.ts +++ b/extensions/openai/realtime-voice-provider.ts @@ -1,9 +1,10 @@ import { execFileSync } from "node:child_process"; import { randomUUID } from "node:crypto"; import { - createProviderHttpError, - resolveProviderRequestHeaders, -} from "openclaw/plugin-sdk/provider-http"; + isProviderAuthProfileConfigured, + resolveProviderAuthProfileApiKey, +} from "openclaw/plugin-sdk/provider-auth"; +import { resolveProviderRequestHeaders } from "openclaw/plugin-sdk/provider-http"; import { captureWsEvent, createDebugProxyWebSocketAgent, @@ -29,11 +30,11 @@ import { normalizeResolvedSecretInputString, normalizeSecretInputString, } from "openclaw/plugin-sdk/secret-input"; -import { fetchWithSsrFGuard } from "openclaw/plugin-sdk/ssrf-runtime"; import WebSocket from "ws"; import { asFiniteNumber, captureOpenAIRealtimeWsClose, + createOpenAIRealtimeClientSecret, 
readRealtimeErrorDetail, resolveOpenAIProviderConfigRecord, trimToUndefined, @@ -61,13 +62,14 @@ type OpenAIRealtimeVoiceProviderConfig = { prefixPaddingMs?: number; interruptResponseOnInputAudio?: boolean; minBargeInAudioEndMs?: number; + reasoningEffort?: string; azureEndpoint?: string; azureDeployment?: string; azureApiVersion?: string; }; type OpenAIRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & { - apiKey: string; + apiKey?: string; model?: string; voice?: OpenAIRealtimeVoice; temperature?: number; @@ -76,6 +78,7 @@ type OpenAIRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & { prefixPaddingMs?: number; interruptResponseOnInputAudio?: boolean; minBargeInAudioEndMs?: number; + reasoningEffort?: string; azureEndpoint?: string; azureDeployment?: string; azureApiVersion?: string; @@ -88,6 +91,28 @@ const OPENAI_REALTIME_ACTIVE_RESPONSE_ERROR_PREFIX = const OPENAI_REALTIME_NO_ACTIVE_RESPONSE_CANCEL_ERROR = "Cancellation failed: no active response found"; const OPENAI_REALTIME_DEFAULT_MIN_BARGE_IN_AUDIO_END_MS = 250; +const OPENAI_REALTIME_VOICES = [ + "alloy", + "ash", + "ballad", + "coral", + "echo", + "sage", + "shimmer", + "verse", + "marin", + "cedar", +] as const satisfies readonly OpenAIRealtimeVoice[]; + +function normalizeOpenAIRealtimeVoice(value: unknown): OpenAIRealtimeVoice | undefined { + if (typeof value !== "string") { + return undefined; + } + const normalized = value.trim().toLowerCase(); + return OPENAI_REALTIME_VOICES.includes(normalized as OpenAIRealtimeVoice) + ? 
(normalized as OpenAIRealtimeVoice) + : undefined; +} type RealtimeEvent = { type: string; @@ -112,63 +137,64 @@ type RealtimeEvent = { error?: unknown; }; -type RealtimeSessionUpdate = { - type: "session.update"; - session: RealtimeSessionUpdatePayload; +type RealtimeTurnDetectionConfig = { + type: "server_vad"; + threshold: number; + prefix_padding_ms: number; + silence_duration_ms: number; + create_response: boolean; + interrupt_response?: boolean; }; -type RealtimeSessionUpdatePayload = - | RealtimeSessionUpdateGaPayload - | RealtimeSessionUpdateBetaPayload; - -type RealtimeSessionUpdateGaPayload = { - type: "realtime"; - model: string; - instructions?: string; - output_modalities: ["audio"]; - audio: { - input: { - format: RealtimeAudioFormatConfig; - transcription: { model: string }; - noise_reduction?: { type: "near_field" }; - turn_detection: { - type: "server_vad"; - threshold: number; - prefix_padding_ms: number; - silence_duration_ms: number; - create_response: boolean; - interrupt_response: boolean; +type RealtimeGaSessionUpdate = { + type: "session.update"; + session: { + type: "realtime"; + model?: string; + instructions?: string; + output_modalities: string[]; + audio: { + input: { + format: OpenAIRealtimeAudioFormatConfig; + turn_detection: RealtimeTurnDetectionConfig; + noise_reduction?: { type: "near_field" }; + transcription?: { model: string }; + }; + output: { + format: OpenAIRealtimeAudioFormatConfig; + voice: OpenAIRealtimeVoice; }; }; - output: { - format: RealtimeAudioFormatConfig; - voice: OpenAIRealtimeVoice; + reasoning?: { effort: string }; + tools?: RealtimeVoiceTool[]; + tool_choice?: string; + }; +}; + +type RealtimeAzureDeploymentSessionUpdate = { + type: "session.update"; + session: { + modalities: string[]; + instructions?: string; + voice: OpenAIRealtimeVoice; + input_audio_format: "g711_ulaw" | "pcm16"; + output_audio_format: "g711_ulaw" | "pcm16"; + input_audio_transcription?: { model: string }; + turn_detection: 
RealtimeTurnDetectionConfig; + temperature: number; + tools?: RealtimeVoiceTool[]; + tool_choice?: string; + }; +}; + +type OpenAIRealtimeAudioFormatConfig = + | { + type: "audio/pcm"; + rate: 24000; + } + | { + type: "audio/pcmu"; }; - }; - tools?: RealtimeVoiceTool[]; - tool_choice?: string; -}; - -type RealtimeSessionUpdateBetaPayload = { - modalities: string[]; - instructions?: string; - voice: OpenAIRealtimeVoice; - input_audio_format: string; - output_audio_format: string; - turn_detection: { - type: "server_vad"; - threshold: number; - prefix_padding_ms: number; - silence_duration_ms: number; - create_response: boolean; - }; - temperature: number; - input_audio_transcription?: { model: string }; - tools?: RealtimeVoiceTool[]; - tool_choice?: string; -}; - -type RealtimeAudioFormatConfig = { type: "audio/pcmu" } | { type: "audio/pcm"; rate: 24000 }; function normalizeProviderConfig( config: RealtimeVoiceProviderConfig, @@ -180,7 +206,7 @@ function normalizeProviderConfig( path: "plugins.entries.voice-call.config.realtime.providers.openai.apiKey", }), model: trimToUndefined(raw?.model), - voice: trimToUndefined(raw?.voice) as OpenAIRealtimeVoice | undefined, + voice: normalizeOpenAIRealtimeVoice(raw?.voice), temperature: asFiniteNumber(raw?.temperature), vadThreshold: asFiniteNumber(raw?.vadThreshold), silenceDurationMs: asFiniteNumber(raw?.silenceDurationMs), @@ -190,6 +216,7 @@ function normalizeProviderConfig( ? 
raw.interruptResponseOnInputAudio : undefined, minBargeInAudioEndMs: asNonNegativeInteger(raw?.minBargeInAudioEndMs), + reasoningEffort: trimToUndefined(raw?.reasoningEffort), azureEndpoint: trimToUndefined(raw?.azureEndpoint), azureDeployment: trimToUndefined(raw?.azureDeployment), azureApiVersion: trimToUndefined(raw?.azureApiVersion), @@ -272,6 +299,44 @@ function hasOpenAIRealtimeApiKeyInput(configuredApiKey: string | undefined): boo ); } +async function resolveOpenAIRealtimeBrowserApiKey(params: { + configuredApiKey: string | undefined; + cfg: RealtimeVoiceBrowserSessionCreateRequest["cfg"] | undefined; +}): Promise { + const resolved = resolveOpenAIRealtimeApiKey(params.configuredApiKey); + if (resolved.status === "available") { + return resolved.value; + } + return await resolveProviderAuthProfileApiKey({ + provider: "openai-codex", + cfg: params.cfg, + }); +} + +async function requireOpenAIRealtimeBrowserApiKey(params: { + configuredApiKey: string | undefined; + cfg: RealtimeVoiceBrowserSessionCreateRequest["cfg"] | undefined; +}): Promise { + const apiKey = await resolveOpenAIRealtimeBrowserApiKey(params); + if (apiKey) { + return apiKey; + } + throw new Error("OpenAI API key or Codex OAuth missing"); +} + +function hasOpenAIRealtimeBrowserAuthInput(params: { + configuredApiKey: string | undefined; + cfg: RealtimeVoiceBrowserSessionCreateRequest["cfg"] | undefined; +}): boolean { + if (hasOpenAIRealtimeApiKeyInput(params.configuredApiKey)) { + return true; + } + return isProviderAuthProfileConfigured({ + provider: "openai-codex", + cfg: params.cfg, + }); +} + function base64ToBuffer(b64: string): Buffer { return Buffer.from(b64, "base64"); } @@ -298,6 +363,7 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { private continuingToolCallIds = new Set(); private latestMediaTimestamp = 0; private lastAssistantItemId: string | null = null; + private connectionUrl = ""; private toolCallBuffers = new Map(); private deliveredToolCallKeys = new 
Set(); private readonly flowId = randomUUID(); @@ -415,14 +481,6 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { clearTimeout(connectTimeout); reject(error); }; - const { url, headers } = this.resolveConnectionParams(); - const debugProxy = resolveDebugProxySettings(); - const proxyAgent = createDebugProxyWebSocketAgent(debugProxy); - this.ws = new WebSocket(url, { - headers, - ...(proxyAgent ? { agent: proxyAgent } : {}), - }); - connectTimeout = setTimeout(() => { if (!this.sessionConfigured && !this.intentionallyClosed) { this.ws?.terminate(); @@ -430,95 +488,126 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { } }, OpenAIRealtimeVoiceBridge.CONNECT_TIMEOUT_MS); - this.ws.on("open", () => { - this.resetRealtimeSessionState(); - this.connected = true; - this.sessionConfigured = false; - this.reconnectAttempts = 0; - captureWsEvent({ - url, - direction: "local", - kind: "ws-open", - flowId: this.flowId, - meta: { - provider: "openai", - capability: "realtime-voice", - }, - }); - this.sendSessionUpdate(); - }); - - this.ws.on("message", (data: Buffer) => { - captureWsEvent({ - url, - direction: "inbound", - kind: "ws-frame", - flowId: this.flowId, - payload: data, - meta: { - provider: "openai", - capability: "realtime-voice", - }, - }); - try { - const event = JSON.parse(data.toString()) as RealtimeEvent; - this.handleEvent(event); - if (event.type === "session.updated") { - settleResolve(); - } - if (event.type === "error" && !this.sessionConfigured) { - settleReject(new Error(readRealtimeErrorDetail(event.error))); - } - } catch (error) { - console.error("[openai] realtime event parse failed:", error); + const openWebSocket = (connection: { url: string; headers: Record }) => { + if (settled) { + return; } - }); - - this.ws.on("error", (error) => { - captureWsEvent({ - url, - direction: "local", - kind: "error", - flowId: this.flowId, - errorText: error instanceof Error ? 
error.message : String(error), - meta: { - provider: "openai", - capability: "realtime-voice", - }, - }); - if (!this.sessionConfigured) { - settleReject(error instanceof Error ? error : new Error(String(error))); - } - this.config.onError?.(error instanceof Error ? error : new Error(String(error))); - }); - - this.ws.on("close", (code, reasonBuffer) => { - captureOpenAIRealtimeWsClose({ - url, - flowId: this.flowId, - capability: "realtime-voice", - code, - reasonBuffer, - }); - this.connected = false; - this.sessionConfigured = false; if (this.intentionallyClosed) { settleResolve(); - this.config.onClose?.("completed"); return; } - if (!this.sessionConfigured && !settled) { - settleReject(new Error("OpenAI realtime connection closed before ready")); - return; - } - void this.attemptReconnect(); - }); + const url = connection.url; + this.connectionUrl = connection.url; + const debugProxy = resolveDebugProxySettings(); + const proxyAgent = createDebugProxyWebSocketAgent(debugProxy); + const ws = new WebSocket(connection.url, { + headers: connection.headers, + ...(proxyAgent ? 
{ agent: proxyAgent } : {}), + }); + this.ws = ws; + + ws.on("open", () => { + this.resetRealtimeSessionState(); + this.connected = true; + this.sessionConfigured = false; + this.reconnectAttempts = 0; + captureWsEvent({ + url, + direction: "local", + kind: "ws-open", + flowId: this.flowId, + meta: { + provider: "openai", + capability: "realtime-voice", + }, + }); + this.sendSessionUpdate(); + }); + + ws.on("message", (data: Buffer) => { + captureWsEvent({ + url, + direction: "inbound", + kind: "ws-frame", + flowId: this.flowId, + payload: data, + meta: { + provider: "openai", + capability: "realtime-voice", + }, + }); + try { + const event = JSON.parse(data.toString()) as RealtimeEvent; + this.handleEvent(event); + if (event.type === "session.updated") { + settleResolve(); + } + if (event.type === "error" && !this.sessionConfigured) { + settleReject(new Error(readRealtimeErrorDetail(event.error))); + } + } catch (error) { + console.error("[openai] realtime event parse failed:", error); + } + }); + + ws.on("error", (error) => { + captureWsEvent({ + url, + direction: "local", + kind: "error", + flowId: this.flowId, + errorText: error instanceof Error ? error.message : String(error), + meta: { + provider: "openai", + capability: "realtime-voice", + }, + }); + if (!this.sessionConfigured) { + settleReject(error instanceof Error ? error : new Error(String(error))); + } + this.config.onError?.(error instanceof Error ? 
error : new Error(String(error))); + }); + + ws.on("close", (code, reasonBuffer) => { + captureOpenAIRealtimeWsClose({ + url, + flowId: this.flowId, + capability: "realtime-voice", + code, + reasonBuffer, + }); + this.connected = false; + this.sessionConfigured = false; + if (this.intentionallyClosed) { + settleResolve(); + this.config.onClose?.("completed"); + return; + } + if (!this.sessionConfigured && !settled) { + settleReject(new Error("OpenAI realtime connection closed before ready")); + return; + } + void this.attemptReconnect(); + }); + }; + + const connectionOrPromise = this.resolveConnectionParams(); + if (connectionOrPromise instanceof Promise) { + void connectionOrPromise.then(openWebSocket).catch((error: unknown) => { + settleReject(error instanceof Error ? error : new Error(String(error))); + }); + return; + } + openWebSocket(connectionOrPromise); }); } - private resolveConnectionParams(): { url: string; headers: Record } { + private resolveConnectionParams(): + | { url: string; headers: Record } + | Promise<{ url: string; headers: Record }> { const cfg = this.config; if (cfg.azureEndpoint && cfg.azureDeployment) { + const apiKey = requireOpenAIRealtimeApiKey(cfg.apiKey); const base = cfg.azureEndpoint .replace(/\/$/, "") .replace(/^http(s?):/, (_, secure: string) => `ws${secure}:`); @@ -533,11 +622,16 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { baseUrl: url, capability: "audio", transport: "websocket", - defaultHeaders: { "api-key": cfg.apiKey }, - }) ?? { "api-key": cfg.apiKey }, + defaultHeaders: { "api-key": apiKey }, + }) ?? 
{ "api-key": apiKey }, }; } + const directApiKey = resolveOpenAIRealtimeApiKey(cfg.apiKey); + if (directApiKey.status === "missing") { + return this.resolveOAuthConnectionParams(); + } + const apiKey = directApiKey.value; if (cfg.azureEndpoint) { const base = cfg.azureEndpoint .replace(/\/$/, "") @@ -552,8 +646,8 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { baseUrl: url, capability: "audio", transport: "websocket", - defaultHeaders: { Authorization: `Bearer ${cfg.apiKey}` }, - }) ?? { Authorization: `Bearer ${cfg.apiKey}` }, + defaultHeaders: { Authorization: `Bearer ${apiKey}` }, + }) ?? { Authorization: `Bearer ${apiKey}` }, }; } @@ -568,10 +662,48 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { capability: "audio", transport: "websocket", defaultHeaders: { - Authorization: `Bearer ${cfg.apiKey}`, + Authorization: `Bearer ${apiKey}`, }, }) ?? { - Authorization: `Bearer ${cfg.apiKey}`, + Authorization: `Bearer ${apiKey}`, + }, + }; + } + + private async resolveOAuthConnectionParams(): Promise<{ + url: string; + headers: Record; + }> { + const cfg = this.config; + const authToken = await requireOpenAIRealtimeBrowserApiKey({ + configuredApiKey: cfg.apiKey, + cfg: cfg.cfg, + }); + const model = cfg.model ?? OpenAIRealtimeVoiceBridge.DEFAULT_MODEL; + const clientSecret = await createOpenAIRealtimeClientSecret({ + authToken, + auditContext: "openai-realtime-bridge-session", + session: { + type: "realtime", + model, + audio: { + output: { voice: cfg.voice ?? "alloy" }, + }, + }, + }); + const url = `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(model)}`; + return { + url, + headers: resolveProviderRequestHeaders({ + provider: "openai", + baseUrl: url, + capability: "audio", + transport: "websocket", + defaultHeaders: { + Authorization: `Bearer ${clientSecret.value}`, + }, + }) ?? 
{ + Authorization: `Bearer ${clientSecret.value}`, }, }; } @@ -600,99 +732,96 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { } private sendSessionUpdate(): void { - this.sendEvent({ - type: "session.update", - session: this.resolveSessionUpdatePayload(), - } satisfies RealtimeSessionUpdate); + if (this.usesAzureDeploymentRealtimeApi()) { + this.sendEvent(this.buildAzureDeploymentSessionUpdate()); + return; + } + + this.sendEvent(this.buildGaSessionUpdate()); } - private resolveSessionUpdatePayload(): RealtimeSessionUpdatePayload { - if (this.usesAzureDeploymentRealtimeApi()) { - return this.resolveBetaSessionUpdatePayload(); - } - return this.resolveGaSessionUpdatePayload(); + private buildGaSessionUpdate(): RealtimeGaSessionUpdate { + const cfg = this.config; + const autoRespondToAudio = cfg.autoRespondToAudio ?? true; + const interruptResponseOnInputAudio = cfg.interruptResponseOnInputAudio ?? autoRespondToAudio; + return { + type: "session.update", + session: { + type: "realtime", + model: cfg.model ?? OpenAIRealtimeVoiceBridge.DEFAULT_MODEL, + instructions: cfg.instructions, + output_modalities: ["audio"], + audio: { + input: { + format: this.resolveRealtimeAudioFormat(), + noise_reduction: { type: "near_field" }, + transcription: { model: OPENAI_REALTIME_INPUT_TRANSCRIPTION_MODEL }, + turn_detection: { + type: "server_vad", + threshold: cfg.vadThreshold ?? 0.5, + prefix_padding_ms: cfg.prefixPaddingMs ?? 300, + silence_duration_ms: cfg.silenceDurationMs ?? 500, + create_response: autoRespondToAudio, + interrupt_response: interruptResponseOnInputAudio, + }, + }, + output: { + format: this.resolveRealtimeAudioFormat(), + voice: cfg.voice ?? "alloy", + }, + }, + ...(cfg.reasoningEffort ? { reasoning: { effort: cfg.reasoningEffort } } : {}), + ...(cfg.tools && cfg.tools.length > 0 + ? 
{ + tools: cfg.tools, + tool_choice: "auto", + } + : {}), + }, + }; } private usesAzureDeploymentRealtimeApi(): boolean { return Boolean(this.config.azureEndpoint && this.config.azureDeployment); } - private resolveGaSessionUpdatePayload(): RealtimeSessionUpdateGaPayload { + private buildAzureDeploymentSessionUpdate(): RealtimeAzureDeploymentSessionUpdate { const cfg = this.config; - const autoRespondToAudio = cfg.autoRespondToAudio ?? true; - const interruptResponseOnInputAudio = cfg.interruptResponseOnInputAudio ?? autoRespondToAudio; + const format = this.resolveLegacyRealtimeAudioFormat(); return { - type: "realtime", - model: cfg.model ?? OpenAIRealtimeVoiceBridge.DEFAULT_MODEL, - instructions: cfg.instructions, - output_modalities: ["audio"], - audio: { - input: { - format: this.resolveRealtimeAudioFormatConfig(), - noise_reduction: { - type: "near_field", - }, - transcription: { - model: OPENAI_REALTIME_INPUT_TRANSCRIPTION_MODEL, - }, - turn_detection: { - type: "server_vad", - threshold: cfg.vadThreshold ?? 0.5, - prefix_padding_ms: cfg.prefixPaddingMs ?? 300, - silence_duration_ms: cfg.silenceDurationMs ?? 500, - create_response: autoRespondToAudio, - interrupt_response: interruptResponseOnInputAudio, - }, - }, - output: { - format: this.resolveRealtimeAudioFormatConfig(), - voice: cfg.voice ?? "alloy", + type: "session.update", + session: { + modalities: ["text", "audio"], + instructions: cfg.instructions, + voice: cfg.voice ?? "alloy", + input_audio_format: format, + output_audio_format: format, + input_audio_transcription: { model: "whisper-1" }, + turn_detection: { + type: "server_vad", + threshold: cfg.vadThreshold ?? 0.5, + prefix_padding_ms: cfg.prefixPaddingMs ?? 300, + silence_duration_ms: cfg.silenceDurationMs ?? 500, + create_response: cfg.autoRespondToAudio ?? true, }, + temperature: cfg.temperature ?? 0.8, + ...(cfg.tools && cfg.tools.length > 0 + ? 
{ + tools: cfg.tools, + tool_choice: "auto", + } + : {}), }, - ...(cfg.tools && cfg.tools.length > 0 - ? { - tools: cfg.tools, - tool_choice: "auto", - } - : {}), }; } - private resolveBetaSessionUpdatePayload(): RealtimeSessionUpdateBetaPayload { - const cfg = this.config; - return { - modalities: ["text", "audio"], - instructions: cfg.instructions, - voice: cfg.voice ?? "alloy", - input_audio_format: this.resolveRealtimeAudioFormat(), - output_audio_format: this.resolveRealtimeAudioFormat(), - input_audio_transcription: { - model: OPENAI_REALTIME_INPUT_TRANSCRIPTION_MODEL, - }, - turn_detection: { - type: "server_vad", - threshold: cfg.vadThreshold ?? 0.5, - prefix_padding_ms: cfg.prefixPaddingMs ?? 300, - silence_duration_ms: cfg.silenceDurationMs ?? 500, - create_response: cfg.autoRespondToAudio ?? true, - }, - temperature: cfg.temperature ?? 0.8, - ...(cfg.tools && cfg.tools.length > 0 - ? { - tools: cfg.tools, - tool_choice: "auto", - } - : {}), - }; - } - - private resolveRealtimeAudioFormatConfig(): RealtimeAudioFormatConfig { + private resolveRealtimeAudioFormat(): OpenAIRealtimeAudioFormatConfig { return this.audioFormat.encoding === "pcm16" ? { type: "audio/pcm", rate: 24000 } : { type: "audio/pcmu" }; } - private resolveRealtimeAudioFormat(): "g711_ulaw" | "pcm16" { + private resolveLegacyRealtimeAudioFormat(): "g711_ulaw" | "pcm16" { return this.audioFormat.encoding === "pcm16" ? "pcm16" : "g711_ulaw"; } @@ -978,7 +1107,7 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { this.config.onEvent?.({ direction: "client", type, ...(detail ? 
{ detail } : {}) }); const payload = JSON.stringify(event); captureWsEvent({ - url: this.resolveConnectionParams().url, + url: this.connectionUrl, direction: "outbound", kind: "ws-frame", flowId: this.flowId, @@ -1018,14 +1147,6 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { } } -function readStringField(value: unknown, key: string): string | undefined { - if (!value || typeof value !== "object") { - return undefined; - } - const raw = (value as Record)[key]; - return typeof raw === "string" && raw.trim() ? raw.trim() : undefined; -} - function resolveOpenAIRealtimeBrowserOfferHeaders(): Record | undefined { const headers = resolveProviderRequestHeaders({ provider: "openai", @@ -1048,13 +1169,16 @@ async function createOpenAIRealtimeBrowserSession( req: RealtimeVoiceBrowserSessionCreateRequest, ): Promise { const config = normalizeProviderConfig(req.providerConfig); - const apiKey = requireOpenAIRealtimeApiKey(config.apiKey); + const apiKey = await requireOpenAIRealtimeBrowserApiKey({ + configuredApiKey: config.apiKey, + cfg: req.cfg, + }); if (config.azureEndpoint || config.azureDeployment) { throw new Error("OpenAI Realtime browser sessions do not support Azure endpoints yet"); } const model = req.model ?? config.model ?? OPENAI_REALTIME_DEFAULT_MODEL; - const voice = (req.voice ?? config.voice ?? "alloy") as OpenAIRealtimeVoice; + const voice = normalizeOpenAIRealtimeVoice(req.voice) ?? config.voice ?? "alloy"; const session: Record = { type: "realtime", model, @@ -1066,6 +1190,15 @@ async function createOpenAIRealtimeBrowserSession( type: "server_vad", create_response: true, interrupt_response: true, + ...(typeof (req.vadThreshold ?? config.vadThreshold) === "number" + ? { threshold: req.vadThreshold ?? config.vadThreshold } + : {}), + ...(typeof (req.prefixPaddingMs ?? config.prefixPaddingMs) === "number" + ? { prefix_padding_ms: req.prefixPaddingMs ?? config.prefixPaddingMs } + : {}), + ...(typeof (req.silenceDurationMs ?? 
config.silenceDurationMs) === "number" + ? { silence_duration_ms: req.silenceDurationMs ?? config.silenceDurationMs } + : {}), }, transcription: { model: OPENAI_REALTIME_INPUT_TRANSCRIPTION_MODEL }, }, @@ -1076,60 +1209,26 @@ async function createOpenAIRealtimeBrowserSession( session.tools = req.tools; session.tool_choice = "auto"; } - - const { response, release } = await fetchWithSsrFGuard({ - url: "https://api.openai.com/v1/realtime/client_secrets", - init: { - method: "POST", - headers: resolveProviderRequestHeaders({ - provider: "openai", - baseUrl: "https://api.openai.com/v1/realtime/client_secrets", - capability: "audio", - transport: "http", - defaultHeaders: { - Authorization: `Bearer ${apiKey}`, - "Content-Type": "application/json", - }, - }) ?? { - Authorization: `Bearer ${apiKey}`, - "Content-Type": "application/json", - }, - body: JSON.stringify({ session }), - }, - auditContext: "openai-realtime-browser-session", - }); - const payload = await (async () => { - try { - if (!response.ok) { - throw await createProviderHttpError(response, "OpenAI Realtime browser session failed"); - } - return (await response.json()) as unknown; - } finally { - await release(); - } - })(); - const nestedSecret = - payload && typeof payload === "object" - ? (payload as Record).client_secret - : undefined; - const clientSecret = readStringField(payload, "value") ?? readStringField(nestedSecret, "value"); - if (!clientSecret) { - throw new Error("OpenAI Realtime browser session did not return a client secret"); + const reasoningEffort = trimToUndefined(req.reasoningEffort) ?? config.reasoningEffort; + if (reasoningEffort) { + session.reasoning = { effort: reasoningEffort }; } - const expiresAt = - payload && typeof payload === "object" - ? 
(payload as Record).expires_at - : undefined; + + const clientSecret = await createOpenAIRealtimeClientSecret({ + authToken: apiKey, + auditContext: "openai-realtime-browser-session", + session, + }); const offerHeaders = resolveOpenAIRealtimeBrowserOfferHeaders(); return { provider: "openai", transport: "webrtc", - clientSecret, + clientSecret: clientSecret.value, offerUrl: "https://api.openai.com/v1/realtime/calls", ...(offerHeaders ? { offerHeaders } : {}), model, voice, - ...(typeof expiresAt === "number" ? { expiresAt } : {}), + ...(typeof clientSecret.expiresAt === "number" ? { expiresAt: clientSecret.expiresAt } : {}), }; } @@ -1154,14 +1253,21 @@ export function buildOpenAIRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin supportsToolCalls: true, }, resolveConfig: ({ rawConfig }) => normalizeProviderConfig(rawConfig), - isConfigured: ({ providerConfig }) => - hasOpenAIRealtimeApiKeyInput(normalizeProviderConfig(providerConfig).apiKey), + isConfigured: ({ cfg, providerConfig }) => { + const config = normalizeProviderConfig(providerConfig); + if (config.azureEndpoint || config.azureDeployment) { + return hasOpenAIRealtimeApiKeyInput(config.apiKey); + } + return hasOpenAIRealtimeBrowserAuthInput({ + configuredApiKey: config.apiKey, + cfg, + }); + }, createBridge: (req) => { const config = normalizeProviderConfig(req.providerConfig); - const apiKey = requireOpenAIRealtimeApiKey(config.apiKey); return new OpenAIRealtimeVoiceBridge({ ...req, - apiKey, + apiKey: config.apiKey, model: config.model, voice: config.voice, temperature: config.temperature, @@ -1171,6 +1277,7 @@ export function buildOpenAIRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin interruptResponseOnInputAudio: req.interruptResponseOnInputAudio ?? 
config.interruptResponseOnInputAudio, minBargeInAudioEndMs: config.minBargeInAudioEndMs, + reasoningEffort: config.reasoningEffort, azureEndpoint: config.azureEndpoint, azureDeployment: config.azureDeployment, azureApiVersion: config.azureApiVersion, diff --git a/extensions/voice-call/src/media-stream.ts b/extensions/voice-call/src/media-stream.ts index 509dc64c361..2f3058cf286 100644 --- a/extensions/voice-call/src/media-stream.ts +++ b/extensions/voice-call/src/media-stream.ts @@ -9,6 +9,7 @@ import type { IncomingMessage } from "node:http"; import type { Duplex } from "node:stream"; +import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types"; import type { RealtimeTranscriptionProviderConfig, RealtimeTranscriptionProviderPlugin, @@ -31,6 +32,8 @@ export interface MediaStreamConfig { transcriptionProvider: RealtimeTranscriptionProviderPlugin; /** Provider-owned config blob passed into the transcription session. */ providerConfig: RealtimeTranscriptionProviderConfig; + /** Full runtime config, used by providers that can resolve OAuth profiles. */ + cfg?: OpenClawConfig; /** Close sockets that never send a valid `start` frame within this window. */ preStartTimeoutMs?: number; /** Max concurrent pre-start sockets. 
*/ @@ -314,6 +317,7 @@ export class MediaStreamHandler { } const sttSession = this.config.transcriptionProvider.createSession({ + cfg: this.config.cfg, providerConfig: this.config.providerConfig, onPartial: (partial) => { const session = this.sessions.get(streamSid); diff --git a/extensions/voice-call/src/runtime.ts b/extensions/voice-call/src/runtime.ts index 1a1557843ad..60b839ca80d 100644 --- a/extensions/voice-call/src/runtime.ts +++ b/extensions/voice-call/src/runtime.ts @@ -334,6 +334,7 @@ export async function createVoiceCallRuntime(params: { realtimeProvider.provider, realtimeProvider.providerConfig, config.serve.path, + cfg, ); if (config.realtime.toolPolicy !== "none") { realtimeHandler.registerToolHandler( diff --git a/extensions/voice-call/src/webhook.ts b/extensions/voice-call/src/webhook.ts index 3fff86cfcc6..5242ba1f192 100644 --- a/extensions/voice-call/src/webhook.ts +++ b/extensions/voice-call/src/webhook.ts @@ -346,6 +346,7 @@ export class VoiceCallWebhookServer { const streamConfig: MediaStreamConfig = { transcriptionProvider: provider, providerConfig, + cfg: this.fullConfig ?? (this.coreConfig as OpenClawConfig | null) ?? 
undefined, preStartTimeoutMs: streaming.preStartTimeoutMs, maxPendingConnections: streaming.maxPendingConnections, maxPendingConnectionsPerIp: streaming.maxPendingConnectionsPerIp, diff --git a/extensions/voice-call/src/webhook/realtime-handler.ts b/extensions/voice-call/src/webhook/realtime-handler.ts index 13c56adde44..d48b0e743fd 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.ts @@ -1,6 +1,7 @@ import { randomUUID } from "node:crypto"; import http from "node:http"; import type { Duplex } from "node:stream"; +import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types"; import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime"; import { buildRealtimeVoiceAgentConsultWorkingResponse, @@ -309,6 +310,7 @@ export class RealtimeCallHandler { private readonly realtimeProvider: RealtimeVoiceProviderPlugin, private readonly providerConfig: RealtimeVoiceProviderConfig, private readonly servePath: string, + private readonly coreConfig?: OpenClawConfig, ) {} setPublicUrl(url: string): void { @@ -603,6 +605,7 @@ export class RealtimeCallHandler { }); const session = createRealtimeVoiceBridgeSession({ provider: this.realtimeProvider, + cfg: this.coreConfig, providerConfig: this.providerConfig, instructions: this.config.instructions, tools: this.config.tools, diff --git a/scripts/dev/realtime-talk-live-smoke.ts b/scripts/dev/realtime-talk-live-smoke.ts index 7183c9fe11f..84c519fc51b 100644 --- a/scripts/dev/realtime-talk-live-smoke.ts +++ b/scripts/dev/realtime-talk-live-smoke.ts @@ -476,8 +476,8 @@ try { const statusNames = new Set((result.statuses ?? []).map((entry) => entry.status)); const transcriptTexts = new Set((result.transcripts ?? 
[]).map((entry) => entry.text)); const expectedMethods = [ - "talk.session.appendAudio", "talk.client.toolCall", + "talk.session.appendAudio", "talk.session.submitToolResult", "talk.session.close", ]; diff --git a/src/gateway/protocol/schema/channels.ts b/src/gateway/protocol/schema/channels.ts index 1f9c76a8c68..86c21322d75 100644 --- a/src/gateway/protocol/schema/channels.ts +++ b/src/gateway/protocol/schema/channels.ts @@ -163,6 +163,10 @@ export const TalkClientCreateParamsSchema = Type.Object( provider: Type.Optional(Type.String()), model: Type.Optional(Type.String()), voice: Type.Optional(Type.String()), + vadThreshold: Type.Optional(Type.Number()), + silenceDurationMs: Type.Optional(Type.Integer({ minimum: 1 })), + prefixPaddingMs: Type.Optional(Type.Integer({ minimum: 0 })), + reasoningEffort: Type.Optional(Type.String()), mode: Type.Optional(TalkModeSchema), transport: Type.Optional(TalkTransportSchema), brain: Type.Optional(TalkBrainSchema), @@ -203,6 +207,10 @@ export const TalkSessionCreateParamsSchema = Type.Object( provider: Type.Optional(Type.String()), model: Type.Optional(Type.String()), voice: Type.Optional(Type.String()), + vadThreshold: Type.Optional(Type.Number()), + silenceDurationMs: Type.Optional(Type.Integer({ minimum: 1 })), + prefixPaddingMs: Type.Optional(Type.Integer({ minimum: 0 })), + reasoningEffort: Type.Optional(Type.String()), mode: Type.Optional(TalkModeSchema), transport: Type.Optional(TalkTransportSchema), brain: Type.Optional(TalkBrainSchema), diff --git a/src/gateway/server-methods/talk-client.ts b/src/gateway/server-methods/talk-client.ts index fd6e61e9422..795875fa17c 100644 --- a/src/gateway/server-methods/talk-client.ts +++ b/src/gateway/server-methods/talk-client.ts @@ -24,6 +24,7 @@ import { chatHandlers } from "./chat.js"; import { asRecord } from "./record-shared.js"; import { buildRealtimeInstructions, + buildRealtimeVoiceLaunchOptions, buildTalkRealtimeConfig, isUnsupportedBrowserWebRtcSession, } from 
"./talk-shared.js"; @@ -114,6 +115,10 @@ export const talkClientHandlers: GatewayRequestHandlers = { provider?: string; model?: string; voice?: string; + vadThreshold?: number; + silenceDurationMs?: number; + prefixPaddingMs?: number; + reasoningEffort?: string; mode?: string; transport?: string; brain?: string; @@ -180,13 +185,17 @@ export const talkClientHandlers: GatewayRequestHandlers = { cfgForResolve: runtimeConfig, noRegisteredProviderMessage: "No realtime voice provider registered", }); + const launchOptions = buildRealtimeVoiceLaunchOptions({ + requested: typedParams, + defaults: realtimeConfig, + }); if (resolution.provider.createBrowserSession && transport !== "gateway-relay") { const session = await resolution.provider.createBrowserSession({ + cfg: runtimeConfig, providerConfig: resolution.providerConfig, instructions: buildRealtimeInstructions(realtimeConfig.instructions), tools: [REALTIME_VOICE_AGENT_CONSULT_TOOL], - model: normalizeOptionalString(typedParams.model) ?? realtimeConfig.model, - voice: normalizeOptionalString(typedParams.voice) ?? realtimeConfig.voice, + ...launchOptions, }); if ( !isUnsupportedBrowserWebRtcSession(session) && diff --git a/src/gateway/server-methods/talk-session.ts b/src/gateway/server-methods/talk-session.ts index ffd0b4464e5..7c5e7158cbb 100644 --- a/src/gateway/server-methods/talk-session.ts +++ b/src/gateway/server-methods/talk-session.ts @@ -52,6 +52,7 @@ import { formatForLog } from "../ws-log.js"; import { broadcastTalkRoomEvents, buildRealtimeInstructions, + buildRealtimeVoiceLaunchOptions, buildTalkRealtimeConfig, buildTalkTranscriptionConfig, canUseTalkDirectTools, @@ -235,17 +236,20 @@ export const talkSessionHandlers: GatewayRequestHandlers = { cfgForResolve: runtimeConfig, noRegisteredProviderMessage: "No realtime voice provider registered", }); - const model = normalizeOptionalString(params.model) ?? realtimeConfig.model; - const voice = normalizeOptionalString(params.voice) ?? 
realtimeConfig.voice; + const launchOptions = buildRealtimeVoiceLaunchOptions({ + requested: params, + defaults: realtimeConfig, + }); const session = createTalkRealtimeRelaySession({ context, connId, + cfg: runtimeConfig, provider: resolution.provider, - providerConfig: withRealtimeBrowserOverrides(resolution.providerConfig, { model, voice }), + providerConfig: withRealtimeBrowserOverrides(resolution.providerConfig, launchOptions), instructions: buildRealtimeInstructions(realtimeConfig.instructions), tools: [REALTIME_VOICE_AGENT_CONSULT_TOOL], - model, - voice, + model: launchOptions.model, + voice: launchOptions.voice, }); rememberUnifiedTalkSession(session.relaySessionId, { kind: "realtime-relay", diff --git a/src/gateway/server-methods/talk-shared.ts b/src/gateway/server-methods/talk-shared.ts index df7d4489fe7..0f4c7b709e7 100644 --- a/src/gateway/server-methods/talk-shared.ts +++ b/src/gateway/server-methods/talk-shared.ts @@ -221,19 +221,60 @@ export function buildRealtimeInstructions(configuredInstructions?: string): stri return `${DEFAULT_REALTIME_INSTRUCTIONS}\n\nAdditional realtime instructions:\n${extra}`; } +type RealtimeVoiceLaunchOptions = { + model?: string; + voice?: string; + vadThreshold?: number; + silenceDurationMs?: number; + prefixPaddingMs?: number; + reasoningEffort?: string; +}; + +type RealtimeVoiceLaunchOptionInput = { + model?: unknown; + voice?: unknown; + vadThreshold?: unknown; + silenceDurationMs?: unknown; + prefixPaddingMs?: unknown; + reasoningEffort?: unknown; +}; + +export function buildRealtimeVoiceLaunchOptions(params: { + requested: RealtimeVoiceLaunchOptionInput; + defaults: RealtimeVoiceLaunchOptions; +}): RealtimeVoiceLaunchOptions { + return withRealtimeBrowserOverrides( + params.defaults, + params.requested, + ) as RealtimeVoiceLaunchOptions; +} + export function withRealtimeBrowserOverrides( providerConfig: RealtimeVoiceProviderConfig, - params: { model?: string; voice?: string }, + params: 
RealtimeVoiceLaunchOptionInput, ): RealtimeVoiceProviderConfig { const overrides: RealtimeVoiceProviderConfig = {}; const model = normalizeOptionalString(params.model); const voice = normalizeOptionalString(params.voice); + const reasoningEffort = normalizeOptionalString(params.reasoningEffort); if (model) { overrides.model = model; } if (voice) { overrides.voice = voice; } + if (typeof params.vadThreshold === "number" && Number.isFinite(params.vadThreshold)) { + overrides.vadThreshold = params.vadThreshold; + } + if (typeof params.silenceDurationMs === "number" && Number.isFinite(params.silenceDurationMs)) { + overrides.silenceDurationMs = params.silenceDurationMs; + } + if (typeof params.prefixPaddingMs === "number" && Number.isFinite(params.prefixPaddingMs)) { + overrides.prefixPaddingMs = params.prefixPaddingMs; + } + if (reasoningEffort) { + overrides.reasoningEffort = reasoningEffort; + } return Object.keys(overrides).length > 0 ? { ...providerConfig, ...overrides } : providerConfig; } diff --git a/src/gateway/server-methods/talk.test.ts b/src/gateway/server-methods/talk.test.ts index eceb3cda3fe..d5d28caa0a4 100644 --- a/src/gateway/server-methods/talk.test.ts +++ b/src/gateway/server-methods/talk.test.ts @@ -1219,7 +1219,13 @@ describe("talk.client.create handler", () => { const respond = vi.fn(); await talkHandlers["talk.client.create"]({ req: { type: "req", id: "1", method: "talk.client.create" }, - params: { sessionKey: "main" }, + params: { + sessionKey: "main", + vadThreshold: 0.45, + silenceDurationMs: 650, + prefixPaddingMs: 250, + reasoningEffort: "low", + }, client: { connId: "conn-1" } as never, isWebchatConnect: () => false, respond: respond as never, @@ -1252,6 +1258,10 @@ describe("talk.client.create handler", () => { model: "gpt-realtime", voice: "alloy", instructions: expect.stringContaining("Additional realtime instructions:\nSpeak warmly."), + vadThreshold: 0.45, + silenceDurationMs: 650, + prefixPaddingMs: 250, + reasoningEffort: "low", 
}), ); expect(respond).toHaveBeenCalledWith( diff --git a/src/gateway/talk-realtime-relay.ts b/src/gateway/talk-realtime-relay.ts index 00f4c52a440..9282d24a5c9 100644 --- a/src/gateway/talk-realtime-relay.ts +++ b/src/gateway/talk-realtime-relay.ts @@ -1,4 +1,5 @@ import { randomUUID } from "node:crypto"; +import type { OpenClawConfig } from "../config/types.js"; import type { RealtimeVoiceProviderPlugin } from "../plugins/types.js"; import { recordTalkObservabilityEvent } from "../talk/observability.js"; import { @@ -20,6 +21,7 @@ import { } from "../talk/talk-session-controller.js"; import { abortChatRunById } from "./chat-abort.js"; import type { GatewayRequestContext } from "./server-methods/shared-types.js"; +import { forgetUnifiedTalkSession } from "./talk-session-registry.js"; const RELAY_SESSION_TTL_MS = 30 * 60 * 1000; const MAX_AUDIO_BASE64_BYTES = 512 * 1024; @@ -68,6 +70,7 @@ type RelaySession = { type CreateTalkRealtimeRelaySessionParams = { context: GatewayRequestContext; connId: string; + cfg?: OpenClawConfig; provider: RealtimeVoiceProviderPlugin; providerConfig: RealtimeVoiceProviderConfig; instructions: string; @@ -113,6 +116,7 @@ function abortRelayAgentRuns(session: RelaySession, reason: string): void { function closeRelaySession(session: RelaySession, reason: "completed" | "error"): void { relaySessions.delete(session.id); + forgetUnifiedTalkSession(session.id); clearTimeout(session.cleanupTimer); abortRelayAgentRuns(session, reason === "error" ? 
"relay-error" : "relay-closed"); session.bridge.close(); @@ -180,6 +184,7 @@ export function createTalkRealtimeRelaySession( }); const bridge = createRealtimeVoiceBridgeSession({ provider: params.provider, + cfg: params.cfg, providerConfig: params.providerConfig, audioFormat: REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ, instructions: params.instructions, @@ -281,6 +286,7 @@ export function createTalkRealtimeRelaySession( return; } relaySessions.delete(relaySessionId); + forgetUnifiedTalkSession(relaySessionId); clearTimeout(active.cleanupTimer); abortRelayAgentRuns(active, "relay-closed"); emit( @@ -449,6 +455,7 @@ export function stopTalkRealtimeRelaySession(params: { export function clearTalkRealtimeRelaySessionsForTest(): void { for (const session of relaySessions.values()) { clearTimeout(session.cleanupTimer); + forgetUnifiedTalkSession(session.id); session.bridge.close(); } relaySessions.clear(); diff --git a/src/gateway/talk-transcription-relay.test.ts b/src/gateway/talk-transcription-relay.test.ts index 541e66140e5..1297027c096 100644 --- a/src/gateway/talk-transcription-relay.test.ts +++ b/src/gateway/talk-transcription-relay.test.ts @@ -37,6 +37,7 @@ describe("talk transcription gateway relay", () => { }; const events: Array<{ event: string; payload: unknown; connIds: string[] }> = []; const context = { + getRuntimeConfig: () => ({}), broadcastToConnIds: (event: string, payload: unknown, connIds: ReadonlySet) => { events.push({ event, payload, connIds: [...connIds] }); }, @@ -170,6 +171,7 @@ describe("talk transcription gateway relay", () => { }; const events: Array<{ event: string; payload: unknown; connIds: string[] }> = []; const context = { + getRuntimeConfig: () => ({}), broadcastToConnIds: (event: string, payload: unknown, connIds: ReadonlySet) => { events.push({ event, payload, connIds: [...connIds] }); }, diff --git a/src/gateway/talk-transcription-relay.ts b/src/gateway/talk-transcription-relay.ts index 02ce2341efc..1d1c125780a 100644 --- 
a/src/gateway/talk-transcription-relay.ts +++ b/src/gateway/talk-transcription-relay.ts @@ -160,6 +160,7 @@ export function createTalkTranscriptionRelaySession( return relay ? ensureTranscriptionTurn(relay) : "turn-1"; }; const sttSession = params.provider.createSession({ + cfg: params.context.getRuntimeConfig(), providerConfig: params.providerConfig, onSpeechStart: () => { ensureTurnId(); diff --git a/src/realtime-transcription/provider-types.ts b/src/realtime-transcription/provider-types.ts index f407fa840a5..eadcb0c9c91 100644 --- a/src/realtime-transcription/provider-types.ts +++ b/src/realtime-transcription/provider-types.ts @@ -22,6 +22,7 @@ export type RealtimeTranscriptionSessionCallbacks = { }; export type RealtimeTranscriptionSessionCreateRequest = RealtimeTranscriptionSessionCallbacks & { + cfg?: OpenClawConfig; providerConfig: RealtimeTranscriptionProviderConfig; }; diff --git a/src/realtime-transcription/websocket-session.test.ts b/src/realtime-transcription/websocket-session.test.ts index 749d03a2977..ef1ddc8458d 100644 --- a/src/realtime-transcription/websocket-session.test.ts +++ b/src/realtime-transcription/websocket-session.test.ts @@ -15,6 +15,7 @@ afterEach(async () => { async function createRealtimeServer(params?: { closeOnConnection?: boolean; initialEvent?: unknown; + onUpgrade?: (headers: Record) => void; onBinary?: (payload: Buffer) => void; onText?: (payload: unknown) => void; }) { @@ -23,6 +24,7 @@ async function createRealtimeServer(params?: { const clients = new Set(); server.on("upgrade", (request, socket, head) => { + params?.onUpgrade?.(request.headers); wss.handleUpgrade(request, socket, head, (ws) => { clients.add(ws); ws.on("close", () => clients.delete(ws)); @@ -139,6 +141,85 @@ describe("createRealtimeTranscriptionWebSocketSession", () => { session.close(); }); + it("resolves async URLs and headers before opening the socket", async () => { + const seenAuthHeaders: Array = []; + const server = await createRealtimeServer({ + 
onUpgrade: (headers) => { + seenAuthHeaders.push(headers.authorization); + }, + }); + const session = createRealtimeTranscriptionWebSocketSession({ + providerId: "test", + callbacks: {}, + url: async () => server.url, + headers: async () => ({ Authorization: "Bearer resolved-token" }), + readyOnOpen: true, + sendAudio: (audio, transport) => { + transport.sendBinary(audio); + }, + }); + + await session.connect(); + + expect(seenAuthHeaders).toEqual(["Bearer resolved-token"]); + session.close(); + }); + + it("applies the connect timeout while resolving async connection details", async () => { + const onError = vi.fn(); + const session = createRealtimeTranscriptionWebSocketSession({ + providerId: "test", + callbacks: { onError }, + url: () => new Promise(() => {}), + connectTimeoutMs: 10, + connectTimeoutMessage: "test realtime transcription connection timeout", + readyOnOpen: true, + sendAudio: (audio, transport) => { + transport.sendBinary(audio); + }, + }); + + await expect(session.connect()).rejects.toThrow( + "test realtime transcription connection timeout", + ); + expect(session.isConnected()).toBe(false); + expect(onError).toHaveBeenCalledWith(expect.any(Error)); + expect(onError.mock.calls[0]?.[0]).toMatchObject({ + message: "test realtime transcription connection timeout", + }); + }); + + it("does not open a socket when closed while async connection resolves", async () => { + const seenAuthHeaders: Array = []; + let resolveUrl!: (url: string) => void; + const url = new Promise((resolve) => { + resolveUrl = resolve; + }); + const server = await createRealtimeServer({ + onUpgrade: (headers) => { + seenAuthHeaders.push(headers.authorization); + }, + }); + const session = createRealtimeTranscriptionWebSocketSession({ + providerId: "test", + callbacks: {}, + url: () => url, + headers: async () => ({ Authorization: "Bearer resolved-token" }), + readyOnOpen: true, + sendAudio: (audio, transport) => { + transport.sendBinary(audio); + }, + }); + + const connecting = 
session.connect(); + session.close(); + resolveUrl(server.url); + await connecting; + + expect(seenAuthHeaders).toEqual([]); + expect(session.isConnected()).toBe(false); + }); + it("rejects provider setup errors before ready", async () => { const server = await createRealtimeServer({ initialEvent: { type: "error", message: "nope" } }); const onError = vi.fn(); diff --git a/src/realtime-transcription/websocket-session.ts b/src/realtime-transcription/websocket-session.ts index 5510c45a937..7b16cd6932b 100644 --- a/src/realtime-transcription/websocket-session.ts +++ b/src/realtime-transcription/websocket-session.ts @@ -24,7 +24,9 @@ export type RealtimeTranscriptionWebSocketSessionOptions = { connectTimeoutMessage?: string; connectTimeoutMs?: number; closeTimeoutMs?: number; - headers?: Record; + headers?: + | Record + | (() => Record | Promise>); maxQueuedBytes?: number; maxReconnectAttempts?: number; onClose?: (transport: RealtimeTranscriptionWebSocketTransport) => void; @@ -36,7 +38,7 @@ export type RealtimeTranscriptionWebSocketSessionOptions = { reconnectDelayMs?: number; reconnectLimitMessage?: string; sendAudio: (audio: Buffer, transport: RealtimeTranscriptionWebSocketTransport) => void; - url: string | (() => string); + url: string | (() => string | Promise); }; const DEFAULT_CONNECT_TIMEOUT_MS = 10_000; @@ -157,22 +159,37 @@ class WebSocketRealtimeTranscriptionSession implements RealtimeTranscript private async doConnect(): Promise { await new Promise((resolve, reject) => { this.ready = false; - this.currentUrl = - typeof this.options.url === "function" ? this.options.url() : this.options.url; const debugProxy = resolveDebugProxySettings(); const proxyAgent = createDebugProxyWebSocketAgent(debugProxy); let settled = false; let opened = false; let connectTimeout: ReturnType | undefined; + const normalizeError = (error: unknown) => + error instanceof Error ? 
error : new Error(String(error)); + + const clearConnectTimeout = () => { + if (connectTimeout) { + clearTimeout(connectTimeout); + connectTimeout = undefined; + } + }; + + const finishClosedConnect = () => { + if (settled) { + return; + } + settled = true; + clearConnectTimeout(); + resolve(); + }; + const finishConnect = () => { if (settled) { return; } settled = true; - if (connectTimeout) { - clearTimeout(connectTimeout); - } + clearConnectTimeout(); this.ready = true; this.flushQueuedAudio(); resolve(); @@ -183,9 +200,7 @@ class WebSocketRealtimeTranscriptionSession implements RealtimeTranscript return; } settled = true; - if (connectTimeout) { - clearTimeout(connectTimeout); - } + clearConnectTimeout(); this.emitError(error); this.suppressReconnect = true; this.forceClose(); @@ -194,10 +209,6 @@ class WebSocketRealtimeTranscriptionSession implements RealtimeTranscript this.markReady = finishConnect; this.failConnect = failConnect; - this.ws = new WebSocket(this.currentUrl, { - headers: this.options.headers, - ...(proxyAgent ? { agent: proxyAgent } : {}), - }); connectTimeout = setTimeout(() => { failConnect( @@ -208,77 +219,116 @@ class WebSocketRealtimeTranscriptionSession implements RealtimeTranscript ); }, this.connectTimeoutMs); - this.ws.on("open", () => { - opened = true; - this.connected = true; - this.reconnectAttempts = 0; - this.captureLocalOpen(); + void (async () => { + let connection: { headers?: Record; url: string }; try { - this.options.onOpen?.(this.transport); - if (this.options.readyOnOpen) { - finishConnect(); - } + connection = await this.resolveConnection(); } catch (error) { - failConnect(error instanceof Error ? error : new Error(String(error))); - } - }); - - this.ws.on("message", (data) => { - const payload = rawWsDataToBuffer(data); - this.captureFrame("inbound", payload); - try { - if (!this.options.onMessage) { - return; - } - const parseMessage = this.options.parseMessage ?? 
defaultParseMessage; - this.options.onMessage(parseMessage(payload) as Event, this.transport); - } catch (error) { - this.emitError(error); - } - }); - - this.ws.on("error", (error) => { - const normalized = error instanceof Error ? error : new Error(String(error)); - this.captureError(normalized); - if (!opened || !settled) { - failConnect(normalized); + failConnect(normalizeError(error)); return; } - this.emitError(normalized); - }); - - this.ws.on("close", (code, reasonBuffer) => { - if (connectTimeout) { - clearTimeout(connectTimeout); - } - this.captureClose(code, reasonBuffer); - this.connected = false; - this.ready = false; - if (this.closeTimer) { - clearTimeout(this.closeTimer); - this.closeTimer = undefined; + if (settled) { + return; } if (this.closed) { + finishClosedConnect(); return; } - if (this.suppressReconnect) { - this.suppressReconnect = false; + + this.currentUrl = connection.url; + try { + this.ws = new WebSocket(this.currentUrl, { + headers: connection.headers, + ...(proxyAgent ? { agent: proxyAgent } : {}), + }); + } catch (error) { + failConnect(normalizeError(error)); return; } - if (!opened || !settled) { - failConnect( - new Error( - this.options.connectClosedBeforeReadyMessage ?? - `${this.options.providerId} realtime transcription connection closed before ready`, - ), - ); - return; - } - void this.attemptReconnect(); - }); + + this.ws.on("open", () => { + opened = true; + this.connected = true; + this.reconnectAttempts = 0; + this.captureLocalOpen(); + try { + this.options.onOpen?.(this.transport); + if (this.options.readyOnOpen) { + finishConnect(); + } + } catch (error) { + failConnect(normalizeError(error)); + } + }); + + this.ws.on("message", (data) => { + const payload = rawWsDataToBuffer(data); + this.captureFrame("inbound", payload); + try { + if (!this.options.onMessage) { + return; + } + const parseMessage = this.options.parseMessage ?? 
defaultParseMessage; + this.options.onMessage(parseMessage(payload) as Event, this.transport); + } catch (error) { + this.emitError(error); + } + }); + + this.ws.on("error", (error) => { + const normalized = normalizeError(error); + this.captureError(normalized); + if (!opened || !settled) { + failConnect(normalized); + return; + } + this.emitError(normalized); + }); + + this.ws.on("close", (code, reasonBuffer) => { + clearConnectTimeout(); + this.captureClose(code, reasonBuffer); + this.connected = false; + this.ready = false; + if (this.closeTimer) { + clearTimeout(this.closeTimer); + this.closeTimer = undefined; + } + if (this.closed) { + return; + } + if (this.suppressReconnect) { + this.suppressReconnect = false; + return; + } + if (!opened || !settled) { + failConnect( + new Error( + this.options.connectClosedBeforeReadyMessage ?? + `${this.options.providerId} realtime transcription connection closed before ready`, + ), + ); + return; + } + void this.attemptReconnect(); + }); + })(); }); } + private async resolveConnection(): Promise<{ + headers?: Record; + url: string; + }> { + const url = await (typeof this.options.url === "function" + ? this.options.url() + : this.options.url); + const headers = await (typeof this.options.headers === "function" + ? 
this.options.headers() + : this.options.headers); + return { url, headers }; + } + private async attemptReconnect(): Promise { if (this.closed || this.reconnecting) { return; diff --git a/src/talk/provider-types.ts b/src/talk/provider-types.ts index 98f7974bb0f..ebc4ead19c7 100644 --- a/src/talk/provider-types.ts +++ b/src/talk/provider-types.ts @@ -100,6 +100,7 @@ export type RealtimeVoiceProviderConfiguredContext = { }; export type RealtimeVoiceBridgeCreateRequest = RealtimeVoiceBridgeCallbacks & { + cfg?: OpenClawConfig; providerConfig: RealtimeVoiceProviderConfig; audioFormat?: RealtimeVoiceAudioFormat; instructions?: string; @@ -109,11 +110,16 @@ export type RealtimeVoiceBridgeCreateRequest = RealtimeVoiceBridgeCallbacks & { }; export type RealtimeVoiceBrowserSessionCreateRequest = { + cfg?: OpenClawConfig; providerConfig: RealtimeVoiceProviderConfig; instructions?: string; tools?: RealtimeVoiceTool[]; model?: string; voice?: string; + vadThreshold?: number; + silenceDurationMs?: number; + prefixPaddingMs?: number; + reasoningEffort?: string; }; export type RealtimeVoiceBrowserAudioContract = { diff --git a/src/talk/session-runtime.test.ts b/src/talk/session-runtime.test.ts index 49bba69c823..a2fac91125a 100644 --- a/src/talk/session-runtime.test.ts +++ b/src/talk/session-runtime.test.ts @@ -48,6 +48,7 @@ describe("realtime voice bridge session runtime", () => { createRealtimeVoiceBridgeSession({ provider, + cfg: { talk: { realtime: { provider: "test" } } } as never, providerConfig: {}, audioSink: { isOpen: () => true, @@ -61,6 +62,7 @@ describe("realtime voice bridge session runtime", () => { callbacks?.onClearAudio(); callbacks?.onMark?.("mark-1"); + expect(callbacks?.cfg).toEqual({ talk: { realtime: { provider: "test" } } }); expect(sendAudio).toHaveBeenCalledWith(Buffer.from([1, 2])); expect(clearAudio).toHaveBeenCalled(); expect(sendMark).toHaveBeenCalledWith("mark-1"); diff --git a/src/talk/session-runtime.ts b/src/talk/session-runtime.ts index 
a4c49989afd..e30d5123f3d 100644 --- a/src/talk/session-runtime.ts +++ b/src/talk/session-runtime.ts @@ -1,3 +1,4 @@ +import type { OpenClawConfig } from "../config/types.openclaw.js"; import type { RealtimeVoiceProviderPlugin } from "../plugins/types.js"; import type { RealtimeVoiceBridge, @@ -36,6 +37,7 @@ export type RealtimeVoiceBridgeSession = { export type RealtimeVoiceBridgeSessionParams = { provider: RealtimeVoiceProviderPlugin; + cfg?: OpenClawConfig; providerConfig: RealtimeVoiceProviderConfig; audioFormat?: RealtimeVoiceAudioFormat; audioSink: RealtimeVoiceAudioSink; @@ -81,6 +83,7 @@ export function createRealtimeVoiceBridgeSession( }; const canSendAudio = () => params.audioSink.isOpen?.() ?? true; bridge = params.provider.createBridge({ + cfg: params.cfg, providerConfig: params.providerConfig, audioFormat: params.audioFormat, instructions: params.instructions, diff --git a/ui/src/styles/chat/layout.css b/ui/src/styles/chat/layout.css index b0abb2f189f..5be268b0abe 100644 --- a/ui/src/styles/chat/layout.css +++ b/ui/src/styles/chat/layout.css @@ -690,6 +690,38 @@ color: var(--text); } +.agent-chat__talk-options { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(132px, 1fr)); + gap: 8px; + padding: 10px; + border-bottom: 1px solid color-mix(in srgb, var(--border) 50%, transparent); + background: color-mix(in srgb, var(--bg-elevated) 72%, transparent); +} + +.agent-chat__talk-options label { + display: flex; + flex-direction: column; + gap: 4px; + min-width: 0; + font-size: 0.68rem; + color: var(--muted); +} + +.agent-chat__talk-options input, +.agent-chat__talk-options select { + width: 100%; + min-width: 0; + height: 30px; + border-radius: var(--radius-sm); + border: 1px solid var(--border); + background: var(--bg); + color: var(--text); + font: inherit; + font-size: 0.75rem; + padding: 0 8px; +} + .agent-chat__input-divider { width: 1px; height: 16px; diff --git a/ui/src/ui/app-render.ts b/ui/src/ui/app-render.ts index 
98c8ce79abe..5a17c30f7f8 100644 --- a/ui/src/ui/app-render.ts +++ b/ui/src/ui/app-render.ts @@ -2447,6 +2447,8 @@ export function renderApp(state: AppViewState) { realtimeTalkStatus: state.realtimeTalkStatus, realtimeTalkDetail: state.realtimeTalkDetail, realtimeTalkTranscript: state.realtimeTalkTranscript, + realtimeTalkOptionsOpen: state.realtimeTalkOptionsOpen, + realtimeTalkOptions: state.realtimeTalkOptions, connected: state.connected, canSend: state.connected, disabledReason: chatDisabledReason, @@ -2489,6 +2491,10 @@ export function renderApp(state: AppViewState) { }); }, onToggleRealtimeTalk: () => state.toggleRealtimeTalk(), + onToggleRealtimeTalkOptions: () => { + state.realtimeTalkOptionsOpen = !state.realtimeTalkOptionsOpen; + }, + onRealtimeTalkOptionsChange: (next) => state.updateRealtimeTalkOptions(next), canAbort: hasAbortableSessionRun(state), onAbort: () => void state.handleAbortChat(), onQueueRemove: (id) => state.removeQueuedMessage(id), diff --git a/ui/src/ui/app-view-state.ts b/ui/src/ui/app-view-state.ts index 37902e88ce2..26c3202fa82 100644 --- a/ui/src/ui/app-view-state.ts +++ b/ui/src/ui/app-view-state.ts @@ -123,6 +123,18 @@ export type AppViewState = { realtimeTalkStatus: RealtimeTalkStatus; realtimeTalkDetail: string | null; realtimeTalkTranscript: string | null; + realtimeTalkOptionsOpen: boolean; + realtimeTalkOptions: { + provider: string; + model: string; + voice: string; + transport: string; + vadThreshold: string; + silenceDurationMs: string; + prefixPaddingMs: string; + reasoningEffort: string; + }; + updateRealtimeTalkOptions: (next: Partial) => void; chatManualRefreshInFlight: boolean; chatHeaderControlsHidden: boolean; chatMobileControlsOpen: boolean; diff --git a/ui/src/ui/app.ts b/ui/src/ui/app.ts index a31e51dc146..4d3f8ab2a64 100644 --- a/ui/src/ui/app.ts +++ b/ui/src/ui/app.ts @@ -67,7 +67,11 @@ import { import type { AppViewState } from "./app-view-state.ts"; import { normalizeAssistantIdentity } from 
"./assistant-identity.ts"; import { exportChatMarkdown } from "./chat/export.ts"; -import { RealtimeTalkSession, type RealtimeTalkStatus } from "./chat/realtime-talk.ts"; +import { + RealtimeTalkSession, + type RealtimeTalkLaunchOptions, + type RealtimeTalkStatus, +} from "./chat/realtime-talk.ts"; import type { ChatSideResult } from "./chat/side-result.ts"; import { loadToolsEffective as loadToolsEffectiveInternal, @@ -231,6 +235,17 @@ export class OpenClawApp extends LitElement { @state() realtimeTalkStatus: RealtimeTalkStatus = "idle"; @state() realtimeTalkDetail: string | null = null; @state() realtimeTalkTranscript: string | null = null; + @state() realtimeTalkOptionsOpen = false; + @state() realtimeTalkOptions = { + provider: "", + model: "", + voice: "", + transport: "", + vadThreshold: "", + silenceDurationMs: "", + prefixPaddingMs: "", + reasoningEffort: "", + }; private realtimeTalkSession: RealtimeTalkSession | null = null; private nativeBridgeCleanup: (() => void) | null = null; @state() chatManualRefreshInFlight = false; @@ -955,6 +970,43 @@ export class OpenClawApp extends LitElement { ); } + updateRealtimeTalkOptions(next: Partial) { + this.realtimeTalkOptions = { ...this.realtimeTalkOptions, ...next }; + } + + private buildRealtimeTalkLaunchOptions(): RealtimeTalkLaunchOptions { + const options = this.realtimeTalkOptions ?? { + provider: "", + model: "", + voice: "", + transport: "", + vadThreshold: "", + silenceDurationMs: "", + prefixPaddingMs: "", + reasoningEffort: "", + }; + const text = (value: string) => value.trim() || undefined; + const number = (value: string) => { + const trimmed = value.trim(); + if (!trimmed) { + return undefined; + } + const parsed = Number(trimmed); + return Number.isFinite(parsed) ? 
parsed : undefined; + }; + const transport = text(options.transport) as RealtimeTalkLaunchOptions["transport"] | undefined; + return { + provider: text(options.provider), + model: text(options.model), + voice: text(options.voice), + transport, + vadThreshold: number(options.vadThreshold), + silenceDurationMs: number(options.silenceDurationMs), + prefixPaddingMs: number(options.prefixPaddingMs), + reasoningEffort: text(options.reasoningEffort), + }; + } + async toggleRealtimeTalk() { if (this.realtimeTalkSession) { if (this.realtimeTalkStatus === "error") { @@ -978,18 +1030,23 @@ export class OpenClawApp extends LitElement { this.realtimeTalkStatus = "connecting"; this.realtimeTalkDetail = null; this.realtimeTalkTranscript = null; - const session = new RealtimeTalkSession(this.client, this.sessionKey, { - onStatus: (status, detail) => { - this.realtimeTalkStatus = status; - this.realtimeTalkDetail = detail ?? null; - if (status === "idle" || status === "error") { - this.realtimeTalkActive = status !== "idle"; - } + const session = new RealtimeTalkSession( + this.client, + this.sessionKey, + { + onStatus: (status, detail) => { + this.realtimeTalkStatus = status; + this.realtimeTalkDetail = detail ?? null; + if (status === "idle" || status === "error") { + this.realtimeTalkActive = status !== "idle"; + } + }, + onTranscript: (entry) => { + this.realtimeTalkTranscript = `${entry.role === "user" ? "You" : "OpenClaw"}: ${entry.text}`; + }, }, - onTranscript: (entry) => { - this.realtimeTalkTranscript = `${entry.role === "user" ? 
"You" : "OpenClaw"}: ${entry.text}`; - }, - }); + this.buildRealtimeTalkLaunchOptions(), + ); this.realtimeTalkSession = session; try { await session.start(); diff --git a/ui/src/ui/chat/realtime-talk-gateway-relay.ts b/ui/src/ui/chat/realtime-talk-gateway-relay.ts index 9a1baef9588..00f8b252a6c 100644 --- a/ui/src/ui/chat/realtime-talk-gateway-relay.ts +++ b/ui/src/ui/chat/realtime-talk-gateway-relay.ts @@ -49,6 +49,7 @@ export class GatewayRelayRealtimeTalkTransport implements RealtimeTalkTransport private readonly consultAbortControllers = new Set(); private cancelRequestedForPlayback = false; private speechFramesDuringPlayback = 0; + private lastRelayError: string | undefined; constructor( private readonly session: RealtimeTalkGatewayRelaySessionResult, @@ -85,6 +86,18 @@ export class GatewayRelayRealtimeTalkTransport implements RealtimeTalkTransport } stop(): void { + const wasClosed = this.closed; + this.stopLocal(); + if (!wasClosed) { + void this.ctx.client + .request("talk.session.close", { + sessionId: this.session.relaySessionId, + }) + .catch(() => undefined); + } + } + + private stopLocal(): void { this.closed = true; this.unsubscribe?.(); this.unsubscribe = null; @@ -100,9 +113,6 @@ export class GatewayRelayRealtimeTalkTransport implements RealtimeTalkTransport this.inputContext = null; void this.outputContext?.close(); this.outputContext = null; - void this.ctx.client.request("talk.session.close", { - sessionId: this.session.relaySessionId, - }); } private startMicrophonePump(): void { @@ -120,11 +130,21 @@ export class GatewayRelayRealtimeTalkTransport implements RealtimeTalkTransport if (this.detectBargeInSpeech(samples)) { this.cancelOutputForBargeIn(); } - void this.ctx.client.request("talk.session.appendAudio", { - sessionId: this.session.relaySessionId, - audioBase64: bytesToBase64(pcm), - timestamp: Math.round((this.inputContext?.currentTime ?? 
0) * 1000), - }); + void this.ctx.client + .request("talk.session.appendAudio", { + sessionId: this.session.relaySessionId, + audioBase64: bytesToBase64(pcm), + timestamp: Math.round((this.inputContext?.currentTime ?? 0) * 1000), + }) + .catch((error: unknown) => { + if (!this.closed) { + this.ctx.callbacks.onStatus?.( + "error", + error instanceof Error ? error.message : String(error), + ); + this.stopLocal(); + } + }); }; this.inputSource.connect(this.inputProcessor); this.inputProcessor.connect(this.inputContext.destination); @@ -167,15 +187,17 @@ export class GatewayRelayRealtimeTalkTransport implements RealtimeTalkTransport void this.handleToolCall(event); return; case "error": - this.ctx.callbacks.onStatus?.("error", event.message ?? "Realtime relay failed"); + this.lastRelayError = event.message ?? "Realtime relay failed"; + this.ctx.callbacks.onStatus?.("error", this.lastRelayError); return; case "close": this.abortConsults(); if (!this.closed) { this.ctx.callbacks.onStatus?.( event.reason === "error" ? "error" : "idle", - event.reason === "error" ? "Realtime relay closed" : undefined, + event.reason === "error" ? (this.lastRelayError ?? 
"Realtime relay closed") : undefined, ); + this.stopLocal(); } return; default: diff --git a/ui/src/ui/chat/realtime-talk.ts b/ui/src/ui/chat/realtime-talk.ts index acc92906c65..d2aedd6c57f 100644 --- a/ui/src/ui/chat/realtime-talk.ts +++ b/ui/src/ui/chat/realtime-talk.ts @@ -22,6 +22,17 @@ export type { RealtimeTalkStatus, }; +export type RealtimeTalkLaunchOptions = { + provider?: string; + model?: string; + voice?: string; + transport?: "webrtc" | "provider-websocket" | "gateway-relay" | "managed-room"; + vadThreshold?: number; + silenceDurationMs?: number; + prefixPaddingMs?: number; + reasoningEffort?: string; +}; + function createTransport( session: RealtimeTalkSessionResult, ctx: RealtimeTalkTransportContext, @@ -53,6 +64,12 @@ function resolveTransport(session: RealtimeTalkSessionResult): string { return normalizeTalkTransport((session as { transport?: string }).transport) ?? "webrtc"; } +function compactLaunchParams( + params: RealtimeTalkLaunchOptions & { sessionKey: string; mode?: string; brain?: string }, +): Record { + return Object.fromEntries(Object.entries(params).filter(([, value]) => value !== undefined)); +} + export class RealtimeTalkSession { private transport: RealtimeTalkTransport | null = null; private closed = false; @@ -61,6 +78,7 @@ export class RealtimeTalkSession { private readonly client: GatewayBrowserClient, private readonly sessionKey: string, private readonly callbacks: RealtimeTalkCallbacks = {}, + private readonly options: RealtimeTalkLaunchOptions = {}, ) {} async start(): Promise { @@ -82,17 +100,28 @@ export class RealtimeTalkSession { private async createSession(): Promise { try { - return await this.client.request("talk.client.create", { - sessionKey: this.sessionKey, - }); - } catch (error) { - try { - return await this.client.request("talk.session.create", { + return await this.client.request( + "talk.client.create", + compactLaunchParams({ sessionKey: this.sessionKey, - mode: "realtime", - transport: "gateway-relay", - 
brain: "agent-consult", - }); + ...this.options, + }), + ); + } catch (error) { + if (this.options.transport && this.options.transport !== "gateway-relay") { + throw error; + } + try { + return await this.client.request( + "talk.session.create", + compactLaunchParams({ + sessionKey: this.sessionKey, + ...this.options, + mode: "realtime", + transport: this.options.transport ?? "gateway-relay", + brain: "agent-consult", + }), + ); } catch { throw error; } diff --git a/ui/src/ui/realtime-talk-gateway-relay.test.ts b/ui/src/ui/realtime-talk-gateway-relay.test.ts index 7e4a48596ee..f434c60f437 100644 --- a/ui/src/ui/realtime-talk-gateway-relay.test.ts +++ b/ui/src/ui/realtime-talk-gateway-relay.test.ts @@ -217,6 +217,103 @@ describe("GatewayRelayRealtimeTalkTransport", () => { transport.stop(); }); + it("stops microphone pumping when the relay rejects appended audio", async () => { + const onStatus = vi.fn(); + const client = createClient(); + vi.mocked(client.request).mockImplementation(async (method) => { + if (method === "talk.session.appendAudio") { + throw new Error("Unknown realtime relay session"); + } + return {}; + }); + const transport = new GatewayRelayRealtimeTalkTransport(createSession(), { + callbacks: { onStatus }, + client, + sessionKey: "main", + }); + + await transport.start(); + pumpMicrophone(new Float32Array(4096)); + await vi.waitFor(() => + expect(onStatus).toHaveBeenCalledWith("error", "Unknown realtime relay session"), + ); + pumpMicrophone(new Float32Array(4096)); + transport.stop(); + + const appendCalls = vi + .mocked(client.request) + .mock.calls.filter(([method]) => method === "talk.session.appendAudio"); + const closeCalls = vi + .mocked(client.request) + .mock.calls.filter(([method]) => method === "talk.session.close"); + expect(appendCalls).toHaveLength(1); + expect(closeCalls).toHaveLength(0); + }); + + it("treats relay close events as local shutdown", async () => { + const onStatus = vi.fn(); + const client = createClient(); + const 
transport = new GatewayRelayRealtimeTalkTransport(createSession(), { + callbacks: { onStatus }, + client, + sessionKey: "main", + }); + + await transport.start(); + pumpMicrophone(new Float32Array(4096)); + emitGatewayFrame({ + event: "talk.event", + payload: { + relaySessionId: "relay-1", + type: "close", + reason: "error", + }, + }); + pumpMicrophone(new Float32Array(4096)); + transport.stop(); + + const appendCalls = vi + .mocked(client.request) + .mock.calls.filter(([method]) => method === "talk.session.appendAudio"); + const closeCalls = vi + .mocked(client.request) + .mock.calls.filter(([method]) => method === "talk.session.close"); + expect(onStatus).toHaveBeenCalledWith("error", "Realtime relay closed"); + expect(appendCalls).toHaveLength(1); + expect(closeCalls).toHaveLength(0); + }); + + it("preserves relay error details across close events", async () => { + const onStatus = vi.fn(); + const client = createClient(); + const transport = new GatewayRelayRealtimeTalkTransport(createSession(), { + callbacks: { onStatus }, + client, + sessionKey: "main", + }); + + await transport.start(); + emitGatewayFrame({ + event: "talk.event", + payload: { + relaySessionId: "relay-1", + type: "error", + message: "API version mismatch", + }, + }); + emitGatewayFrame({ + event: "talk.event", + payload: { + relaySessionId: "relay-1", + type: "close", + reason: "error", + }, + }); + + expect(onStatus).toHaveBeenCalledWith("error", "API version mismatch"); + expect(onStatus).toHaveBeenLastCalledWith("error", "API version mismatch"); + }); + it("cancels relay playback after sustained input speech", async () => { const client = createClient(); const transport = new GatewayRelayRealtimeTalkTransport(createSession(), { diff --git a/ui/src/ui/realtime-talk.test.ts b/ui/src/ui/realtime-talk.test.ts index af2699714fd..8e8a834386a 100644 --- a/ui/src/ui/realtime-talk.test.ts +++ b/ui/src/ui/realtime-talk.test.ts @@ -174,4 +174,41 @@ describe("RealtimeTalkSession", () => { 
expect(googleCtor).not.toHaveBeenCalled(); expect(relayCtor).not.toHaveBeenCalled(); }); + + it("passes launch options to client-owned realtime session creation", async () => { + const request = vi.fn(async () => ({ + provider: "openai", + transport: "webrtc", + clientSecret: "secret", + })); + const session = new RealtimeTalkSession( + { request } as never, + "main", + {}, + { + provider: "openai", + model: "gpt-realtime-2", + voice: "marin", + transport: "webrtc", + vadThreshold: 0.45, + silenceDurationMs: 650, + prefixPaddingMs: 250, + reasoningEffort: "low", + }, + ); + + await session.start(); + + expect(request).toHaveBeenCalledWith("talk.client.create", { + sessionKey: "main", + provider: "openai", + model: "gpt-realtime-2", + voice: "marin", + transport: "webrtc", + vadThreshold: 0.45, + silenceDurationMs: 650, + prefixPaddingMs: 250, + reasoningEffort: "low", + }); + }); }); diff --git a/ui/src/ui/views/chat.test.ts b/ui/src/ui/views/chat.test.ts index 27760a071fa..96816cbac15 100644 --- a/ui/src/ui/views/chat.test.ts +++ b/ui/src/ui/views/chat.test.ts @@ -506,10 +506,70 @@ describe("chat voice controls", () => { it("keeps Talk visible without the stale browser dictation button", () => { const container = renderChatView(); - expect(container.querySelectorAll('[aria-label="Start Talk"]')).toHaveLength(1); + expect(container.querySelector('[aria-label="Start Talk"]')).not.toBeNull(); + expect(container.querySelector('[aria-label="Talk options"]')).not.toBeNull(); expect(container.querySelector('[aria-label="Voice input"]')).toBeNull(); }); + it("renders editable Talk launch options", () => { + const onRealtimeTalkOptionsChange = vi.fn(); + const container = renderChatView({ + realtimeTalkOptionsOpen: true, + realtimeTalkOptions: { + provider: "openai", + model: "gpt-realtime-2", + voice: "marin", + transport: "webrtc", + vadThreshold: "0.45", + silenceDurationMs: "650", + prefixPaddingMs: "250", + reasoningEffort: "low", + }, + onRealtimeTalkOptionsChange, + 
}); + + const model = container.querySelector( + '.agent-chat__talk-options input[placeholder="gpt-realtime-2"]', + ); + const voice = container.querySelector( + ".agent-chat__talk-options label:nth-of-type(4) select", + ); + const voiceOptions = Array.from( + container.querySelectorAll( + ".agent-chat__talk-options label:nth-of-type(4) option", + ), + ).map((option) => option.value); + const reasoningOptions = Array.from( + container.querySelectorAll( + ".agent-chat__talk-options label:nth-of-type(5) option", + ), + ).map((option) => option.value); + + expect(voice).not.toBeNull(); + expect(voiceOptions).toEqual([ + "", + "alloy", + "ash", + "ballad", + "coral", + "echo", + "sage", + "shimmer", + "verse", + "marin", + "cedar", + ]); + expect(voiceOptions).not.toContain("nova"); + expect(voiceOptions).not.toContain("onyx"); + expect(voiceOptions).not.toContain("fable"); + expect(reasoningOptions).toEqual(["", "minimal", "low", "medium", "high"]); + expect(model).not.toBeNull(); + model!.value = "gpt-realtime-mini"; + model!.dispatchEvent(new Event("input", { bubbles: true })); + + expect(onRealtimeTalkOptionsChange).toHaveBeenCalledWith({ model: "gpt-realtime-mini" }); + }); + it("lets users dismiss Talk start errors", () => { const onDismissError = vi.fn(); const container = renderChatView({ diff --git a/ui/src/ui/views/chat.ts b/ui/src/ui/views/chat.ts index 5899d19c426..47dcb68f562 100644 --- a/ui/src/ui/views/chat.ts +++ b/ui/src/ui/views/chat.ts @@ -77,6 +77,17 @@ export type ChatProps = { realtimeTalkStatus?: RealtimeTalkStatus; realtimeTalkDetail?: string | null; realtimeTalkTranscript?: string | null; + realtimeTalkOptionsOpen?: boolean; + realtimeTalkOptions?: { + provider: string; + model: string; + voice: string; + transport: string; + vadThreshold: string; + silenceDurationMs: string; + prefixPaddingMs: string; + reasoningEffort: string; + }; connected: boolean; canSend: boolean; disabledReason: string | null; @@ -111,6 +122,10 @@ export type ChatProps 
= { onCompact?: () => void | Promise; onOpenSessionCheckpoints?: () => void | Promise; onToggleRealtimeTalk?: () => void; + onToggleRealtimeTalkOptions?: () => void; + onRealtimeTalkOptionsChange?: ( + next: Partial>, + ) => void; onDismissError?: () => void; onAbort?: () => void; onQueueRemove: (id: string) => void; @@ -154,6 +169,110 @@ function getDeletedMessages(sessionKey: string): DeletedMessages { ); } +function renderRealtimeTalkOptions(props: ChatProps) { + const options = props.realtimeTalkOptions; + const onChange = props.onRealtimeTalkOptionsChange; + if (!props.realtimeTalkOptionsOpen || !options || !onChange) { + return nothing; + } + const update = (key: keyof NonNullable) => (event: Event) => { + const value = (event.currentTarget as HTMLInputElement | HTMLSelectElement).value; + onChange({ [key]: value }); + }; + return html` +
+ + + + + + + + +
+ `; +} + interface ChatEphemeralState { slashMenuOpen: boolean; slashMenuItems: SlashCommandDef[]; @@ -1244,6 +1363,7 @@ export function renderChat(props: ChatProps) { @change=${(e: Event) => handleFileSelect(e, props)} /> + ${renderRealtimeTalkOptions(props)} ${props.realtimeTalkActive || props.realtimeTalkDetail || props.realtimeTalkTranscript ? html`
@@ -1311,6 +1431,17 @@ export function renderChat(props: ChatProps) { > ${props.realtimeTalkActive ? icons.volume2 : icons.radio} + ` : nothing} ${tokens ? html`${tokens}` : nothing}