From 04066d246abc4e13a9507e1a93e12be75ae41753 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Fri, 24 Apr 2026 03:33:29 +0100 Subject: [PATCH] feat: add browser realtime talk --- CHANGELOG.md | 1 + .../.generated/plugin-sdk-api-baseline.sha256 | 4 +- docs/providers/openai.md | 26 +- docs/web/control-ui.md | 9 + extensions/google-meet/src/agent-consult.ts | 34 +- extensions/google-meet/src/config.ts | 4 +- extensions/openai/realtime-voice-provider.ts | 78 ++++- src/gateway/method-scopes.ts | 1 + src/gateway/protocol/index.ts | 14 + src/gateway/protocol/schema/channels.ts | 22 ++ .../protocol/schema/protocol-schemas.ts | 4 + src/gateway/protocol/schema/types.ts | 2 + src/gateway/server-methods-list.ts | 1 + src/gateway/server-methods/talk.ts | 126 ++++++++ src/plugin-sdk/realtime-voice.ts | 6 + src/plugins/types.ts | 5 + src/realtime-voice/agent-consult-tool.ts | 28 ++ src/realtime-voice/provider-types.ts | 16 + ui/src/styles/chat/layout.css | 9 + ui/src/ui/app-lifecycle.ts | 11 + ui/src/ui/app-render.ts | 5 + ui/src/ui/app-view-state.ts | 6 + ui/src/ui/app.ts | 51 +++ ui/src/ui/chat/realtime-talk.ts | 300 ++++++++++++++++++ ui/src/ui/gateway.ts | 13 + ui/src/ui/views/chat.ts | 34 ++ 26 files changed, 765 insertions(+), 45 deletions(-) create mode 100644 src/realtime-voice/agent-consult-tool.ts create mode 100644 ui/src/ui/chat/realtime-talk.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index ebda08f4c06..4cf86d6fdf6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ Docs: https://docs.openclaw.ai ### Changes - Control UI/chat: add a Steer action on queued messages so a browser follow-up can be injected into the active run without retyping it. +- Control UI/Talk: add browser WebRTC realtime voice sessions backed by OpenAI Realtime, with Gateway-minted ephemeral client secrets and `openclaw_agent_consult` handoff to the full OpenClaw agent. - Agents/tools: add optional per-call `timeoutMs` support for image, video, music, and TTS generation tools so agents can extend provider request timeouts only when a specific generation needs it. - Agents/subagents: add optional forked context for native `sessions_spawn` runs so agents can let a child inherit the requester transcript when needed, while keeping clean isolated sessions as the default; includes prompt guidance, context-engine hook metadata, docs, and QA coverage. - Codex harness: add structured debug logging for embedded harness selection decisions so `/status` stays simple while gateway logs explain auto-selection and Pi fallback reasons. (#70760) Thanks @100yenadmin. diff --git a/docs/.generated/plugin-sdk-api-baseline.sha256 b/docs/.generated/plugin-sdk-api-baseline.sha256 index 9a694d66467..627b848d38c 100644 --- a/docs/.generated/plugin-sdk-api-baseline.sha256 +++ b/docs/.generated/plugin-sdk-api-baseline.sha256 @@ -1,2 +1,2 @@ -793ed905cb0ba93b9a2f8c2c85c3cfb4d194dd9263353e74952bf9e382b03dc2 plugin-sdk-api-baseline.json -032e7fd6f48344c9b3b98fd3e877e6d30cab92ed9a39dd309796cf1f0220820f plugin-sdk-api-baseline.jsonl +96905c33f4498446f612ae17dee6affdf84ef0e2e5a0f25bf7191c315f5b826f plugin-sdk-api-baseline.json +d8eb6331562fde29531eaac18409bb7fabcc70623bf25395f8e5710a49765f0f plugin-sdk-api-baseline.jsonl diff --git a/docs/providers/openai.md b/docs/providers/openai.md index c79a26d0f3f..ce1c6b7e0d4 100644 --- a/docs/providers/openai.md +++ b/docs/providers/openai.md @@ -25,19 +25,19 @@ API-enabled model such as `openai/gpt-5.4` for `OPENAI_API_KEY` setups. ## OpenClaw feature coverage -| OpenAI capability | OpenClaw surface | Status | -| ------------------------- | ------------------------------------------------------ | ------------------------------------------------------ | -| Chat / Responses | `openai/` model provider | Yes | -| Codex subscription models | `openai-codex/` with `openai-codex` OAuth | Yes | -| Codex app-server harness | `openai/` with `embeddedHarness.runtime: codex` | Yes | -| Server-side web search | Native OpenAI Responses tool | Yes, when web search is enabled and no provider pinned | -| Images | `image_generate` | Yes | -| Videos | `video_generate` | Yes | -| Text-to-speech | `messages.tts.provider: "openai"` / `tts` | Yes | -| Batch speech-to-text | `tools.media.audio` / media understanding | Yes | -| Streaming speech-to-text | Voice Call `streaming.provider: "openai"` | Yes | -| Realtime voice | Voice Call `realtime.provider: "openai"` | Yes | -| Embeddings | memory embedding provider | Yes | +| OpenAI capability | OpenClaw surface | Status | +| ------------------------- | ---------------------------------------------------------- | ------------------------------------------------------ | +| Chat / Responses | `openai/` model provider | Yes | +| Codex subscription models | `openai-codex/` with `openai-codex` OAuth | Yes | +| Codex app-server harness | `openai/` with `embeddedHarness.runtime: codex` | Yes | +| Server-side web search | Native OpenAI Responses tool | Yes, when web search is enabled and no provider pinned | +| Images | `image_generate` | Yes | +| Videos | `video_generate` | Yes | +| Text-to-speech | `messages.tts.provider: "openai"` / `tts` | Yes | +| Batch speech-to-text | `tools.media.audio` / media understanding | Yes | +| Streaming speech-to-text | Voice Call `streaming.provider: "openai"` | Yes | +| Realtime voice | Voice Call `realtime.provider: "openai"` / Control UI Talk | Yes | +| Embeddings | memory embedding provider | Yes | ## Getting started diff --git a/docs/web/control-ui.md b/docs/web/control-ui.md index b0a3da3c4a3..213f2e65fa7 100644 --- a/docs/web/control-ui.md +++ b/docs/web/control-ui.md @@ -105,6 +105,11 @@ locale picker lives in the Gateway Access card, not under Appearance. ## What it can do (today) - Chat with the model via Gateway WS (`chat.history`, `chat.send`, `chat.abort`, `chat.inject`) +- Talk to OpenAI Realtime directly from the browser via WebRTC. The Gateway + mints a short-lived Realtime client secret with `talk.realtime.session`; the + browser sends microphone audio directly to OpenAI and relays + `openclaw_agent_consult` tool calls back through `chat.send` for the larger + configured OpenClaw model. - Stream tool calls + live tool output cards in Chat (agent events) - Channels: built-in plus bundled/external plugin channels status, QR login, and per-channel config (`channels.status`, `web.login.*`, `config.patch`) - Instances: presence list + refresh (`system-presence`) @@ -151,6 +156,10 @@ Cron jobs panel notes: - `chat.history` also strips display-only inline directive tags from visible assistant text (for example `[[reply_to_*]]` and `[[audio_as_voice]]`), plain-text tool-call XML payloads (including `...`, `...`, `...`, `...`, and truncated tool-call blocks), and leaked ASCII/full-width model control tokens, and omits assistant entries whose whole visible text is only the exact silent token `NO_REPLY` / `no_reply`. - `chat.inject` appends an assistant note to the session transcript and broadcasts a `chat` event for UI-only updates (no agent run, no channel delivery). - The chat header model and thinking pickers patch the active session immediately through `sessions.patch`; they are persistent session overrides, not one-turn-only send options. +- Talk mode uses the registered realtime voice provider. Configure OpenAI with + `talk.provider: "openai"` plus `talk.providers.openai.apiKey`, or reuse the + Voice Call realtime provider config. The browser never receives the standard + OpenAI API key; it receives only the ephemeral Realtime client secret. - Stop: - Click **Stop** (calls `chat.abort`) - While a run is active, normal follow-ups queue. Click **Steer** on a queued message to inject that follow-up into the running turn. diff --git a/extensions/google-meet/src/agent-consult.ts b/extensions/google-meet/src/agent-consult.ts index 1e606768372..1229a6a5ca3 100644 --- a/extensions/google-meet/src/agent-consult.ts +++ b/extensions/google-meet/src/agent-consult.ts @@ -1,7 +1,11 @@ import { randomUUID } from "node:crypto"; import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime"; import type { PluginRuntime, RuntimeLogger } from "openclaw/plugin-sdk/plugin-runtime"; -import type { RealtimeVoiceTool } from "openclaw/plugin-sdk/realtime-voice"; +import { + REALTIME_VOICE_AGENT_CONSULT_TOOL, + REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME, + type RealtimeVoiceTool, +} from "openclaw/plugin-sdk/realtime-voice"; import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime"; import type { GoogleMeetConfig, GoogleMeetToolPolicy } from "./config.js"; @@ -11,32 +15,8 @@ type AgentPayload = { isReasoning?: boolean; }; -export const GOOGLE_MEET_AGENT_CONSULT_TOOL_NAME = "openclaw_agent_consult"; - -export const GOOGLE_MEET_AGENT_CONSULT_TOOL: RealtimeVoiceTool = { - type: "function", - name: GOOGLE_MEET_AGENT_CONSULT_TOOL_NAME, - description: - "Ask the full OpenClaw agent for deeper reasoning, current information, or tool-backed help before speaking in the meeting.", - parameters: { - type: "object", - properties: { - question: { - type: "string", - description: "The concrete question or task the meeting participant asked.", - }, - context: { - type: "string", - description: "Optional relevant meeting context or transcript summary.", - }, - responseStyle: { - type: "string", - description: "Optional style hint for the spoken answer.", - }, - }, - required: ["question"], - }, -}; +export const GOOGLE_MEET_AGENT_CONSULT_TOOL_NAME = REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME; +export const GOOGLE_MEET_AGENT_CONSULT_TOOL = REALTIME_VOICE_AGENT_CONSULT_TOOL; export function resolveGoogleMeetRealtimeTools(policy: GoogleMeetToolPolicy): RealtimeVoiceTool[] { return policy === "none" ? [] : [GOOGLE_MEET_AGENT_CONSULT_TOOL]; diff --git a/extensions/google-meet/src/config.ts b/extensions/google-meet/src/config.ts index aa12162fd90..c18e341f835 100644 --- a/extensions/google-meet/src/config.ts +++ b/extensions/google-meet/src/config.ts @@ -1,3 +1,4 @@ +import { REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME } from "openclaw/plugin-sdk/realtime-voice"; import { normalizeOptionalLowercaseString, normalizeOptionalString, @@ -94,8 +95,7 @@ export const DEFAULT_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND = [ "-", ] as const; -export const DEFAULT_GOOGLE_MEET_REALTIME_INSTRUCTIONS = - "You are joining a private Google Meet as an OpenClaw agent. Keep spoken replies brief and natural. When a question needs deeper reasoning, current information, or tools, call openclaw_agent_consult before answering."; +export const DEFAULT_GOOGLE_MEET_REALTIME_INSTRUCTIONS = `You are joining a private Google Meet as an OpenClaw agent. Keep spoken replies brief and natural. When a question needs deeper reasoning, current information, or tools, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} before answering.`; export const DEFAULT_GOOGLE_MEET_CONFIG: GoogleMeetConfig = { enabled: true, diff --git a/extensions/openai/realtime-voice-provider.ts b/extensions/openai/realtime-voice-provider.ts index 8d37f5f7f55..eb17661952e 100644 --- a/extensions/openai/realtime-voice-provider.ts +++ b/extensions/openai/realtime-voice-provider.ts @@ -6,6 +6,8 @@ import { } from "openclaw/plugin-sdk/proxy-capture"; import type { RealtimeVoiceBridge, + RealtimeVoiceBrowserSession, + RealtimeVoiceBrowserSessionCreateRequest, RealtimeVoiceBridgeCreateRequest, RealtimeVoiceProviderConfig, RealtimeVoiceProviderPlugin, @@ -59,6 +61,8 @@ type OpenAIRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & { azureApiVersion?: string; }; +const OPENAI_REALTIME_DEFAULT_MODEL = "gpt-realtime-1.5"; + type RealtimeEvent = { type: string; delta?: string; @@ -117,7 +121,7 @@ function base64ToBuffer(b64: string): Buffer { } class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { - private static readonly DEFAULT_MODEL = "gpt-realtime-1.5"; + private static readonly DEFAULT_MODEL = OPENAI_REALTIME_DEFAULT_MODEL; private static readonly MAX_RECONNECT_ATTEMPTS = 5; private static readonly BASE_RECONNECT_DELAY_MS = 1000; private static readonly CONNECT_TIMEOUT_MS = 10_000; @@ -579,6 +583,77 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { } } +function readStringField(value: unknown, key: string): string | undefined { + if (!value || typeof value !== "object") { + return undefined; + } + const raw = (value as Record)[key]; + return typeof raw === "string" && raw.trim() ? raw.trim() : undefined; +} + +async function createOpenAIRealtimeBrowserSession( + req: RealtimeVoiceBrowserSessionCreateRequest, +): Promise { + const config = normalizeProviderConfig(req.providerConfig); + const apiKey = config.apiKey || process.env.OPENAI_API_KEY; + if (!apiKey) { + throw new Error("OpenAI API key missing"); + } + if (config.azureEndpoint || config.azureDeployment) { + throw new Error("OpenAI Realtime browser sessions do not support Azure endpoints yet"); + } + + const model = req.model ?? config.model ?? OPENAI_REALTIME_DEFAULT_MODEL; + const voice = (req.voice ?? config.voice ?? "alloy") as OpenAIRealtimeVoice; + const session: Record = { + type: "realtime", + model, + instructions: req.instructions, + audio: { + output: { voice }, + }, + }; + if (req.tools && req.tools.length > 0) { + session.tools = req.tools; + session.tool_choice = "auto"; + } + + const response = await fetch("https://api.openai.com/v1/realtime/client_secrets", { + method: "POST", + headers: { + Authorization: `Bearer ${apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ session }), + }); + if (!response.ok) { + const detail = await response.text().catch(() => ""); + throw new Error( + `OpenAI Realtime browser session failed (${response.status}): ${detail || response.statusText}`, + ); + } + const payload = (await response.json()) as unknown; + const nestedSecret = + payload && typeof payload === "object" + ? (payload as Record).client_secret + : undefined; + const clientSecret = readStringField(payload, "value") ?? readStringField(nestedSecret, "value"); + if (!clientSecret) { + throw new Error("OpenAI Realtime browser session did not return a client secret"); + } + const expiresAt = + payload && typeof payload === "object" + ? (payload as Record).expires_at + : undefined; + return { + provider: "openai", + clientSecret, + model, + voice, + ...(typeof expiresAt === "number" ? { expiresAt } : {}), + }; +} + export function buildOpenAIRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin { return { id: "openai", @@ -607,6 +682,7 @@ export function buildOpenAIRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin azureApiVersion: config.azureApiVersion, }); }, + createBrowserSession: createOpenAIRealtimeBrowserSession, }; } diff --git a/src/gateway/method-scopes.ts b/src/gateway/method-scopes.ts index 19664b9606c..834be3dea7d 100644 --- a/src/gateway/method-scopes.ts +++ b/src/gateway/method-scopes.ts @@ -125,6 +125,7 @@ const METHOD_SCOPE_GROUPS: Record = { "agent.wait", "wake", "talk.mode", + "talk.realtime.session", "talk.speak", "tts.enable", "tts.disable", diff --git a/src/gateway/protocol/index.ts b/src/gateway/protocol/index.ts index 7945e0fb677..908b95363a9 100644 --- a/src/gateway/protocol/index.ts +++ b/src/gateway/protocol/index.ts @@ -52,6 +52,10 @@ import { TalkConfigParamsSchema, type TalkConfigResult, TalkConfigResultSchema, + type TalkRealtimeSessionParams, + TalkRealtimeSessionParamsSchema, + type TalkRealtimeSessionResult, + TalkRealtimeSessionResultSchema, type TalkSpeakParams, TalkSpeakParamsSchema, type TalkSpeakResult, @@ -428,6 +432,12 @@ export const validateWizardStatusParams = ajv.compile(Wizard export const validateTalkModeParams = ajv.compile(TalkModeParamsSchema); export const validateTalkConfigParams = ajv.compile(TalkConfigParamsSchema); export const validateTalkConfigResult = ajv.compile(TalkConfigResultSchema); +export const validateTalkRealtimeSessionParams = ajv.compile( + TalkRealtimeSessionParamsSchema, +); +export const validateTalkRealtimeSessionResult = ajv.compile( + TalkRealtimeSessionResultSchema, +); export const validateTalkSpeakParams = ajv.compile(TalkSpeakParamsSchema); export const validateTalkSpeakResult = ajv.compile(TalkSpeakResultSchema); export const validateChannelsStatusParams = ajv.compile( @@ -616,6 +626,8 @@ export { WizardStatusResultSchema, TalkConfigParamsSchema, TalkConfigResultSchema, + TalkRealtimeSessionParamsSchema, + TalkRealtimeSessionResultSchema, TalkSpeakParamsSchema, TalkSpeakResultSchema, ChannelsStatusParamsSchema, @@ -720,6 +732,8 @@ export type { WizardStatusResult, TalkConfigParams, TalkConfigResult, + TalkRealtimeSessionParams, + TalkRealtimeSessionResult, TalkSpeakParams, TalkSpeakResult, TalkModeParams, diff --git a/src/gateway/protocol/schema/channels.ts b/src/gateway/protocol/schema/channels.ts index 5e134af1a27..20ce30eadf1 100644 --- a/src/gateway/protocol/schema/channels.ts +++ b/src/gateway/protocol/schema/channels.ts @@ -36,6 +36,28 @@ export const TalkSpeakParamsSchema = Type.Object( { additionalProperties: false }, ); +export const TalkRealtimeSessionParamsSchema = Type.Object( + { + sessionKey: Type.Optional(Type.String()), + provider: Type.Optional(Type.String()), + model: Type.Optional(Type.String()), + voice: Type.Optional(Type.String()), + instructions: Type.Optional(Type.String()), + }, + { additionalProperties: false }, +); + +export const TalkRealtimeSessionResultSchema = Type.Object( + { + provider: NonEmptyString, + clientSecret: NonEmptyString, + model: Type.Optional(Type.String()), + voice: Type.Optional(Type.String()), + expiresAt: Type.Optional(Type.Number()), + }, + { additionalProperties: false }, +); + const talkProviderFieldSchemas = { apiKey: Type.Optional(SecretInputSchema), }; diff --git a/src/gateway/protocol/schema/protocol-schemas.ts b/src/gateway/protocol/schema/protocol-schemas.ts index 0f9c443740e..c3972fbf764 100644 --- a/src/gateway/protocol/schema/protocol-schemas.ts +++ b/src/gateway/protocol/schema/protocol-schemas.ts @@ -54,6 +54,8 @@ import { ChannelsLogoutParamsSchema, TalkConfigParamsSchema, TalkConfigResultSchema, + TalkRealtimeSessionParamsSchema, + TalkRealtimeSessionResultSchema, TalkSpeakParamsSchema, TalkSpeakResultSchema, ChannelsStatusParamsSchema, @@ -279,6 +281,8 @@ export const ProtocolSchemas = { TalkModeParams: TalkModeParamsSchema, TalkConfigParams: TalkConfigParamsSchema, TalkConfigResult: TalkConfigResultSchema, + TalkRealtimeSessionParams: TalkRealtimeSessionParamsSchema, + TalkRealtimeSessionResult: TalkRealtimeSessionResultSchema, TalkSpeakParams: TalkSpeakParamsSchema, TalkSpeakResult: TalkSpeakResultSchema, ChannelsStatusParams: ChannelsStatusParamsSchema, diff --git a/src/gateway/protocol/schema/types.ts b/src/gateway/protocol/schema/types.ts index 9a1ef1ddfd1..fb4da4202b7 100644 --- a/src/gateway/protocol/schema/types.ts +++ b/src/gateway/protocol/schema/types.ts @@ -80,6 +80,8 @@ export type WizardStatusResult = SchemaType<"WizardStatusResult">; export type TalkModeParams = SchemaType<"TalkModeParams">; export type TalkConfigParams = SchemaType<"TalkConfigParams">; export type TalkConfigResult = SchemaType<"TalkConfigResult">; +export type TalkRealtimeSessionParams = SchemaType<"TalkRealtimeSessionParams">; +export type TalkRealtimeSessionResult = SchemaType<"TalkRealtimeSessionResult">; export type TalkSpeakParams = SchemaType<"TalkSpeakParams">; export type TalkSpeakResult = SchemaType<"TalkSpeakResult">; export type ChannelsStatusParams = SchemaType<"ChannelsStatusParams">; diff --git a/src/gateway/server-methods-list.ts b/src/gateway/server-methods-list.ts index 6506d48a23f..352ca7bcd29 100644 --- a/src/gateway/server-methods-list.ts +++ b/src/gateway/server-methods-list.ts @@ -48,6 +48,7 @@ const BASE_METHODS = [ "wizard.cancel", "wizard.status", "talk.config", + "talk.realtime.session", "talk.speak", "talk.mode", "commands.list", diff --git a/src/gateway/server-methods/talk.ts b/src/gateway/server-methods/talk.ts index 8aea28f2090..e419b73ac06 100644 --- a/src/gateway/server-methods/talk.ts +++ b/src/gateway/server-methods/talk.ts @@ -7,6 +7,13 @@ import { } from "../../config/talk.js"; import type { TalkConfigResponse, TalkProviderConfig } from "../../config/types.gateway.js"; import type { OpenClawConfig, TtsConfig, TtsProviderConfigMap } from "../../config/types.js"; +import { + REALTIME_VOICE_AGENT_CONSULT_TOOL, + REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME, +} from "../../realtime-voice/agent-consult-tool.js"; +import { getRealtimeVoiceProvider } from "../../realtime-voice/provider-registry.js"; +import { resolveConfiguredRealtimeVoiceProvider } from "../../realtime-voice/provider-resolver.js"; +import type { RealtimeVoiceProviderConfig } from "../../realtime-voice/provider-types.js"; import { normalizeLowercaseStringOrEmpty, normalizeOptionalLowercaseString, @@ -22,6 +29,7 @@ import { type TalkSpeakParams, validateTalkConfigParams, validateTalkModeParams, + validateTalkRealtimeSessionParams, validateTalkSpeakParams, } from "../protocol/index.js"; import { formatForLog } from "../ws-log.js"; @@ -136,6 +144,63 @@ function buildTalkTtsConfig( }; } +function getRecord(value: unknown): Record | undefined { + return asRecord(value) ?? undefined; +} + +function getVoiceCallRealtimeConfig(config: OpenClawConfig): { + provider?: string; + providers?: Record; +} { + const plugins = getRecord(config.plugins); + const entries = getRecord(plugins?.entries); + const voiceCall = getRecord(entries?.["voice-call"]); + const pluginConfig = getRecord(voiceCall?.config); + const realtime = getRecord(pluginConfig?.realtime); + const providersRaw = getRecord(realtime?.providers); + const providers: Record = {}; + if (providersRaw) { + for (const [providerId, providerConfig] of Object.entries(providersRaw)) { + const record = getRecord(providerConfig); + if (record) { + providers[providerId] = record; + } + } + } + return { + provider: normalizeOptionalString(realtime?.provider), + providers: Object.keys(providers).length > 0 ? providers : undefined, + }; +} + +function buildTalkRealtimeConfig(config: OpenClawConfig, requestedProvider?: string) { + const voiceCallRealtime = getVoiceCallRealtimeConfig(config); + const talkProviderConfigs = config.talk?.providers as + | Record + | undefined; + const talkProvider = normalizeOptionalString(config.talk?.provider); + const talkProviderSupportsRealtime = talkProvider + ? Boolean(getRealtimeVoiceProvider(talkProvider, config)) + : false; + const provider = + normalizeOptionalString(requestedProvider) ?? + (talkProviderSupportsRealtime ? talkProvider : undefined) ?? + voiceCallRealtime.provider; + return { + provider, + providers: { + ...voiceCallRealtime.providers, + ...talkProviderConfigs, + }, + }; +} + +function buildRealtimeInstructions(extra: string | undefined): string { + const base = `You are OpenClaw's realtime voice interface. Keep spoken replies concise. If the user asks for code, repository state, tools, files, current OpenClaw context, or deeper reasoning, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} and then summarize the result naturally.`; + const trimmed = normalizeOptionalString(extra); + return trimmed ? `${base}\n\n${trimmed}` : base; +} + function isFallbackEligibleTalkReason(reason: TalkSpeakReason): boolean { return ( reason === "talk_unconfigured" || @@ -334,6 +399,67 @@ export const talkHandlers: GatewayRequestHandlers = { respond(true, { config: configPayload }, undefined); }, + "talk.realtime.session": async ({ params, respond }) => { + if (!validateTalkRealtimeSessionParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.realtime.session params: ${formatValidationErrors(validateTalkRealtimeSessionParams.errors)}`, + ), + ); + return; + } + const typedParams = params as { + provider?: string; + model?: string; + voice?: string; + instructions?: string; + }; + try { + const runtimeConfig = loadConfig(); + const realtimeConfig = buildTalkRealtimeConfig(runtimeConfig, typedParams.provider); + const resolution = resolveConfiguredRealtimeVoiceProvider({ + configuredProviderId: realtimeConfig.provider, + providerConfigs: realtimeConfig.providers, + cfg: runtimeConfig, + cfgForResolve: runtimeConfig, + noRegisteredProviderMessage: "No realtime voice provider registered", + }); + if (!resolution.provider.createBrowserSession) { + respond( + false, + undefined, + errorShape( + ErrorCodes.UNAVAILABLE, + `Realtime voice provider "${resolution.provider.id}" does not support browser WebRTC sessions`, + ), + ); + return; + } + const session = await resolution.provider.createBrowserSession({ + providerConfig: resolution.providerConfig, + instructions: buildRealtimeInstructions(typedParams.instructions), + tools: [REALTIME_VOICE_AGENT_CONSULT_TOOL], + model: normalizeOptionalString(typedParams.model), + voice: normalizeOptionalString(typedParams.voice), + }); + respond( + true, + { + provider: session.provider, + clientSecret: session.clientSecret, + ...(session.model ? { model: session.model } : {}), + ...(session.voice ? { voice: session.voice } : {}), + ...(typeof session.expiresAt === "number" ? { expiresAt: session.expiresAt } : {}), + }, + undefined, + ); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, "talk.speak": async ({ params, respond }) => { if (!validateTalkSpeakParams(params)) { respond( diff --git a/src/plugin-sdk/realtime-voice.ts b/src/plugin-sdk/realtime-voice.ts index 4dcfb7c9c76..07aedef6299 100644 --- a/src/plugin-sdk/realtime-voice.ts +++ b/src/plugin-sdk/realtime-voice.ts @@ -2,6 +2,8 @@ export type { RealtimeVoiceProviderPlugin } from "../plugins/types.js"; export type { RealtimeVoiceBridge, RealtimeVoiceBridgeCallbacks, + RealtimeVoiceBrowserSession, + RealtimeVoiceBrowserSessionCreateRequest, RealtimeVoiceBridgeCreateRequest, RealtimeVoiceCloseReason, RealtimeVoiceProviderConfig, @@ -12,6 +14,10 @@ export type { RealtimeVoiceTool, RealtimeVoiceToolCallEvent, } from "../realtime-voice/provider-types.js"; +export { + REALTIME_VOICE_AGENT_CONSULT_TOOL, + REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME, +} from "../realtime-voice/agent-consult-tool.js"; export { canonicalizeRealtimeVoiceProviderId, getRealtimeVoiceProvider, diff --git a/src/plugins/types.ts b/src/plugins/types.ts index 47b9bcca476..203d654330a 100644 --- a/src/plugins/types.ts +++ b/src/plugins/types.ts @@ -40,6 +40,8 @@ import type { } from "../realtime-transcription/provider-types.js"; import type { RealtimeVoiceBridge, + RealtimeVoiceBrowserSession, + RealtimeVoiceBrowserSessionCreateRequest, RealtimeVoiceBridgeCreateRequest, RealtimeVoiceProviderConfig, RealtimeVoiceProviderConfiguredContext, @@ -1661,6 +1663,9 @@ export type RealtimeVoiceProviderPlugin = { resolveConfig?: (ctx: RealtimeVoiceProviderResolveConfigContext) => RealtimeVoiceProviderConfig; isConfigured: (ctx: RealtimeVoiceProviderConfiguredContext) => boolean; createBridge: (req: RealtimeVoiceBridgeCreateRequest) => RealtimeVoiceBridge; + createBrowserSession?: ( + req: RealtimeVoiceBrowserSessionCreateRequest, + ) => Promise; }; export type PluginRealtimeVoiceProviderEntry = RealtimeVoiceProviderPlugin & { diff --git a/src/realtime-voice/agent-consult-tool.ts b/src/realtime-voice/agent-consult-tool.ts new file mode 100644 index 00000000000..bdafb384581 --- /dev/null +++ b/src/realtime-voice/agent-consult-tool.ts @@ -0,0 +1,28 @@ +import type { RealtimeVoiceTool } from "./provider-types.js"; + +export const REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME = "openclaw_agent_consult"; + +export const REALTIME_VOICE_AGENT_CONSULT_TOOL: RealtimeVoiceTool = { + type: "function", + name: REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME, + description: + "Ask the full OpenClaw agent for deeper reasoning, current information, or tool-backed help before speaking.", + parameters: { + type: "object", + properties: { + question: { + type: "string", + description: "The concrete question or task the user asked.", + }, + context: { + type: "string", + description: "Optional relevant context or transcript summary.", + }, + responseStyle: { + type: "string", + description: "Optional style hint for the spoken answer.", + }, + }, + required: ["question"], + }, +}; diff --git a/src/realtime-voice/provider-types.ts b/src/realtime-voice/provider-types.ts index 8f317e93298..ba72ff89640 100644 --- a/src/realtime-voice/provider-types.ts +++ b/src/realtime-voice/provider-types.ts @@ -53,6 +53,22 @@ export type RealtimeVoiceBridgeCreateRequest = RealtimeVoiceBridgeCallbacks & { tools?: RealtimeVoiceTool[]; }; +export type RealtimeVoiceBrowserSessionCreateRequest = { + providerConfig: RealtimeVoiceProviderConfig; + instructions?: string; + tools?: RealtimeVoiceTool[]; + model?: string; + voice?: string; +}; + +export type RealtimeVoiceBrowserSession = { + provider: RealtimeVoiceProviderId; + clientSecret: string; + model?: string; + voice?: string; + expiresAt?: number; +}; + export type RealtimeVoiceBridge = { connect(): Promise; sendAudio(audio: Buffer): void; diff --git a/ui/src/styles/chat/layout.css b/ui/src/styles/chat/layout.css index d4bade8e9db..e26f1d2f7b6 100644 --- a/ui/src/styles/chat/layout.css +++ b/ui/src/styles/chat/layout.css @@ -584,6 +584,15 @@ background: color-mix(in srgb, var(--accent) 12%, transparent); } +.agent-chat__input-btn--talk { + color: var(--danger, #ef4444); + background: color-mix(in srgb, var(--danger, #ef4444) 14%, transparent); +} + +.agent-chat__talk-status { + color: var(--text); +} + .agent-chat__input-divider { width: 1px; height: 16px; diff --git a/ui/src/ui/app-lifecycle.ts b/ui/src/ui/app-lifecycle.ts index 8fd8ecdda39..67e25af88f3 100644 --- a/ui/src/ui/app-lifecycle.ts +++ b/ui/src/ui/app-lifecycle.ts @@ -33,6 +33,11 @@ type LifecycleHost = { allowExternalEmbedUrls: boolean; chatHasAutoScrolled: boolean; chatManualRefreshInFlight: boolean; + realtimeTalkSession?: { stop: () => void } | null; + realtimeTalkActive?: boolean; + realtimeTalkStatus?: string; + realtimeTalkDetail?: string | null; + realtimeTalkTranscript?: string | null; chatLoading: boolean; chatMessages: unknown[]; chatToolMessages: unknown[]; @@ -77,6 +82,12 @@ export function handleDisconnected(host: LifecycleHost) { stopNodesPolling(host as unknown as Parameters[0]); stopLogsPolling(host as unknown as Parameters[0]); stopDebugPolling(host as unknown as Parameters[0]); + host.realtimeTalkSession?.stop(); + host.realtimeTalkSession = null; + host.realtimeTalkActive = false; + host.realtimeTalkStatus = "idle"; + host.realtimeTalkDetail = null; + host.realtimeTalkTranscript = null; host.client?.stop(); host.client = null; host.connected = false; diff --git a/ui/src/ui/app-render.ts b/ui/src/ui/app-render.ts index 2cf5e69bc1f..c12675947e0 100644 --- a/ui/src/ui/app-render.ts +++ b/ui/src/ui/app-render.ts @@ -2228,6 +2228,10 @@ export function renderApp(state: AppViewState) { streamStartedAt: state.chatStreamStartedAt, draft: state.chatMessage, queue: state.chatQueue, + realtimeTalkActive: state.realtimeTalkActive, + realtimeTalkStatus: state.realtimeTalkStatus, + realtimeTalkDetail: state.realtimeTalkDetail, + realtimeTalkTranscript: state.realtimeTalkTranscript, connected: state.connected, canSend: state.connected, disabledReason: chatDisabledReason, @@ -2256,6 +2260,7 @@ export function renderApp(state: AppViewState) { attachments: state.chatAttachments, onAttachmentsChange: (next) => (state.chatAttachments = next), onSend: () => state.handleSendChat(), + onToggleRealtimeTalk: () => state.toggleRealtimeTalk(), canAbort: Boolean(state.chatRunId), onAbort: () => void state.handleAbortChat(), onQueueRemove: (id) => state.removeQueuedMessage(id), diff --git a/ui/src/ui/app-view-state.ts b/ui/src/ui/app-view-state.ts index 528b9363ee4..73cf7aaf370 100644 --- a/ui/src/ui/app-view-state.ts +++ b/ui/src/ui/app-view-state.ts @@ -1,5 +1,6 @@ import type { EventLogEntry } from "./app-events.ts"; import type { CompactionStatus, FallbackStatus } from "./app-tool-stream.ts"; +import type { RealtimeTalkStatus } from "./chat/realtime-talk.ts"; import type { ChatSideResult } from "./chat/side-result.ts"; import type { CronModelSuggestionsState, CronState } from "./controllers/cron.ts"; import type { DevicePairingList } from "./controllers/devices.ts"; @@ -92,6 +93,10 @@ export type AppViewState = { chatModelsLoading: boolean; chatModelCatalog: ModelCatalogEntry[]; chatQueue: ChatQueueItem[]; + realtimeTalkActive: boolean; + realtimeTalkStatus: RealtimeTalkStatus; + realtimeTalkDetail: string | null; + realtimeTalkTranscript: string | null; chatManualRefreshInFlight: boolean; nodesLoading: boolean; nodes: Array>; @@ -425,6 +430,7 @@ export type AppViewState = { setPassword: (next: string) => void; setChatMessage: (next: string) => void; handleSendChat: (messageOverride?: string, opts?: { restoreDraft?: boolean }) => Promise; + toggleRealtimeTalk: () => Promise; steerQueuedChatMessage: (id: string) => Promise; handleAbortChat: () => Promise; removeQueuedMessage: (id: string) => void; diff --git a/ui/src/ui/app.ts b/ui/src/ui/app.ts index d955a73423d..e8ffedd38bd 100644 --- a/ui/src/ui/app.ts +++ b/ui/src/ui/app.ts @@ -57,6 +57,7 @@ import { import type { AppViewState } from "./app-view-state.ts"; import { normalizeAssistantIdentity } from "./assistant-identity.ts"; import { exportChatMarkdown } from "./chat/export.ts"; +import { RealtimeTalkSession, type RealtimeTalkStatus } from "./chat/realtime-talk.ts"; import type { ChatSideResult } from "./chat/side-result.ts"; import { loadToolsEffective as loadToolsEffectiveInternal, @@ -192,6 +193,11 @@ export class OpenClawApp extends LitElement { @state() chatModelCatalog: ModelCatalogEntry[] = []; @state() chatQueue: ChatQueueItem[] = []; @state() chatAttachments: ChatAttachment[] = []; + @state() realtimeTalkActive = false; + @state() realtimeTalkStatus: RealtimeTalkStatus = "idle"; + @state() realtimeTalkDetail: string | null = null; + @state() realtimeTalkTranscript: string | null = null; + private realtimeTalkSession: RealtimeTalkSession | null = null; @state() chatManualRefreshInFlight = false; @state() navDrawerOpen = false; @@ -710,6 +716,51 @@ export class OpenClawApp extends LitElement { ); } + async toggleRealtimeTalk() { + if (this.realtimeTalkSession) { + this.realtimeTalkSession.stop(); + this.realtimeTalkSession = null; + this.realtimeTalkActive = false; + this.realtimeTalkStatus = "idle"; + this.realtimeTalkDetail = null; + this.realtimeTalkTranscript = null; + return; + } + if (!this.client || !this.connected) { + this.lastError = "Gateway not connected"; + return; + } + this.realtimeTalkActive = true; + this.realtimeTalkStatus = "connecting"; + this.realtimeTalkDetail = null; + this.realtimeTalkTranscript = null; + const session = new RealtimeTalkSession(this.client, this.sessionKey, { + onStatus: (status, detail) => { + this.realtimeTalkStatus = status; + this.realtimeTalkDetail = detail ?? null; + if (status === "idle" || status === "error") { + this.realtimeTalkActive = status !== "idle"; + } + }, + onTranscript: (entry) => { + this.realtimeTalkTranscript = `${entry.role === "user" ? "You" : "OpenClaw"}: ${entry.text}`; + }, + }); + this.realtimeTalkSession = session; + try { + await session.start(); + } catch (error) { + session.stop(); + if (this.realtimeTalkSession === session) { + this.realtimeTalkSession = null; + } + this.realtimeTalkActive = false; + this.realtimeTalkStatus = "error"; + this.realtimeTalkDetail = error instanceof Error ? error.message : String(error); + this.lastError = this.realtimeTalkDetail; + } + } + async steerQueuedChatMessage(id: string) { await steerQueuedChatMessageInternal( this as unknown as Parameters[0], diff --git a/ui/src/ui/chat/realtime-talk.ts b/ui/src/ui/chat/realtime-talk.ts new file mode 100644 index 00000000000..bb980bf2775 --- /dev/null +++ b/ui/src/ui/chat/realtime-talk.ts @@ -0,0 +1,300 @@ +import { REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME } from "../../../../src/realtime-voice/agent-consult-tool.js"; +import type { GatewayBrowserClient, GatewayEventFrame } from "../gateway.ts"; +import { generateUUID } from "../uuid.ts"; + +export type RealtimeTalkStatus = "idle" | "connecting" | "listening" | "thinking" | "error"; + +export type RealtimeTalkCallbacks = { + onStatus?: (status: RealtimeTalkStatus, detail?: string) => void; + onTranscript?: (entry: { role: "user" | "assistant"; text: string; final: boolean }) => void; +}; + +export type RealtimeTalkSessionResult = { + provider: string; + clientSecret: string; + model?: string; + voice?: string; + expiresAt?: number; +}; + +type RealtimeServerEvent = { + type?: string; + item_id?: string; + call_id?: string; + name?: string; + delta?: string; + transcript?: string; + arguments?: string; +}; + +type ToolBuffer = { + name: string; + callId: string; + args: string; +}; + +type ChatPayload = { + runId?: string; + state?: string; + errorMessage?: string; + message?: unknown; +}; + +function extractTextFromMessage(message: unknown): string { + if (!message || typeof message !== "object") { + return ""; + } + const record = message as Record; + if (typeof record.text === "string") { + return record.text; + } + const content = Array.isArray(record.content) ? record.content : []; + const parts = content + .map((block) => { + if (!block || typeof block !== "object") { + return ""; + } + const entry = block as Record; + return entry.type === "text" && typeof entry.text === "string" ? entry.text : ""; + }) + .filter(Boolean); + return parts.join("\n\n").trim(); +} + +function waitForChatResult(params: { + client: GatewayBrowserClient; + runId: string; + timeoutMs: number; +}): Promise { + return new Promise((resolve, reject) => { + const timer = window.setTimeout(() => { + unsubscribe(); + reject(new Error("OpenClaw tool call timed out")); + }, params.timeoutMs); + const unsubscribe = params.client.addEventListener((evt: GatewayEventFrame) => { + if (evt.event !== "chat") { + return; + } + const payload = evt.payload as ChatPayload | undefined; + if (!payload || payload.runId !== params.runId) { + return; + } + if (payload.state === "final") { + window.clearTimeout(timer); + unsubscribe(); + resolve(extractTextFromMessage(payload.message) || "OpenClaw finished with no text."); + } else if (payload.state === "error") { + window.clearTimeout(timer); + unsubscribe(); + reject(new Error(payload.errorMessage ?? "OpenClaw tool call failed")); + } + }); + }); +} + +export class RealtimeTalkSession { + private peer: RTCPeerConnection | null = null; + private channel: RTCDataChannel | null = null; + private media: MediaStream | null = null; + private audio: HTMLAudioElement | null = null; + private closed = false; + private toolBuffers = new Map(); + + constructor( + private readonly client: GatewayBrowserClient, + private readonly sessionKey: string, + private readonly callbacks: RealtimeTalkCallbacks = {}, + ) {} + + async start(): Promise { + if (!navigator.mediaDevices?.getUserMedia || typeof RTCPeerConnection === "undefined") { + throw new Error("Realtime Talk requires browser WebRTC and microphone access"); + } + this.closed = false; + this.callbacks.onStatus?.("connecting"); + const session = await this.client.request("talk.realtime.session", { + sessionKey: this.sessionKey, + }); + this.peer = new RTCPeerConnection(); + this.audio = document.createElement("audio"); + this.audio.autoplay = true; + this.audio.style.display = "none"; + document.body.append(this.audio); + this.peer.addEventListener("track", (event) => { + if (this.audio) { + this.audio.srcObject = event.streams[0]; + } + }); + this.media = await navigator.mediaDevices.getUserMedia({ audio: true }); + for (const track of this.media.getAudioTracks()) { + this.peer.addTrack(track, this.media); + } + this.channel = this.peer.createDataChannel("oai-events"); + this.channel.addEventListener("open", () => this.callbacks.onStatus?.("listening")); + this.channel.addEventListener("message", (event) => this.handleRealtimeEvent(event.data)); + this.peer.addEventListener("connectionstatechange", () => { + if (this.closed) { + return; + } + if (this.peer?.connectionState === "failed" || this.peer?.connectionState === "closed") { + this.callbacks.onStatus?.("error", "Realtime connection closed"); + } + }); + + const offer = await this.peer.createOffer(); + await this.peer.setLocalDescription(offer); + const sdp = await fetch("https://api.openai.com/v1/realtime/calls", { + method: "POST", + body: offer.sdp, + headers: { + Authorization: `Bearer ${session.clientSecret}`, + "Content-Type": "application/sdp", + }, + }); + if (!sdp.ok) { + throw new Error(`Realtime WebRTC setup failed (${sdp.status})`); + } + await this.peer.setRemoteDescription({ + type: "answer", + sdp: await sdp.text(), + }); + } + + stop(): void { + this.closed = true; + this.callbacks.onStatus?.("idle"); + this.channel?.close(); + this.channel = null; + this.peer?.close(); + this.peer = null; + this.media?.getTracks().forEach((track) => track.stop()); + this.media = null; + this.audio?.remove(); + this.audio = null; + this.toolBuffers.clear(); + } + + private send(event: unknown): void { + if (this.channel?.readyState === "open") { + this.channel.send(JSON.stringify(event)); + } + } + + private handleRealtimeEvent(data: unknown): void { + let event: RealtimeServerEvent; + try { + event = JSON.parse(String(data)) as RealtimeServerEvent; + } catch { + return; + } + switch (event.type) { + case "conversation.item.input_audio_transcription.completed": + if (event.transcript) { + this.callbacks.onTranscript?.({ role: "user", text: event.transcript, final: true }); + } + return; + case "response.audio_transcript.done": + if (event.transcript) { + this.callbacks.onTranscript?.({ + role: "assistant", + text: event.transcript, + final: true, + }); + } + return; + case "response.function_call_arguments.delta": + this.bufferToolDelta(event); + return; + case "response.function_call_arguments.done": + void this.handleToolCall(event); + return; + default: + return; + } + } + + private bufferToolDelta(event: RealtimeServerEvent): void { + const key = event.item_id ?? "unknown"; + const existing = this.toolBuffers.get(key); + if (existing) { + existing.args += event.delta ?? ""; + return; + } + this.toolBuffers.set(key, { + name: event.name ?? "", + callId: event.call_id ?? "", + args: event.delta ?? "", + }); + } + + private async handleToolCall(event: RealtimeServerEvent): Promise { + const key = event.item_id ?? "unknown"; + const buffered = this.toolBuffers.get(key); + this.toolBuffers.delete(key); + const name = buffered?.name || event.name || ""; + const callId = buffered?.callId || event.call_id || ""; + if (name !== REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME || !callId) { + return; + } + this.callbacks.onStatus?.("thinking"); + let question = ""; + try { + const args = JSON.parse(buffered?.args || event.arguments || "{}") as { + question?: unknown; + context?: unknown; + responseStyle?: unknown; + }; + question = typeof args.question === "string" ? args.question.trim() : ""; + const context = typeof args.context === "string" ? args.context.trim() : ""; + const responseStyle = typeof args.responseStyle === "string" ? args.responseStyle.trim() : ""; + if (context || responseStyle) { + question = [ + question, + context ? `Context:\n${context}` : undefined, + responseStyle ? `Spoken style:\n${responseStyle}` : undefined, + ] + .filter(Boolean) + .join("\n\n"); + } + } catch {} + if (!question) { + this.submitToolResult(callId, { + error: `${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} requires a question`, + }); + this.callbacks.onStatus?.("listening"); + return; + } + try { + const idempotencyKey = generateUUID(); + const response = await this.client.request<{ runId?: string }>("chat.send", { + sessionKey: this.sessionKey, + message: question, + idempotencyKey, + }); + const result = await waitForChatResult({ + client: this.client, + runId: response.runId ?? idempotencyKey, + timeoutMs: 120_000, + }); + this.submitToolResult(callId, { result }); + } catch (error) { + this.submitToolResult(callId, { + error: error instanceof Error ? error.message : String(error), + }); + } finally { + this.callbacks.onStatus?.("listening"); + } + } + + private submitToolResult(callId: string, result: unknown): void { + this.send({ + type: "conversation.item.create", + item: { + type: "function_call_output", + call_id: callId, + output: JSON.stringify(result), + }, + }); + this.send({ type: "response.create" }); + } +} diff --git a/ui/src/ui/gateway.ts b/ui/src/ui/gateway.ts index dc34e84aeff..9e3a4990281 100644 --- a/ui/src/ui/gateway.ts +++ b/ui/src/ui/gateway.ts @@ -223,6 +223,8 @@ export type GatewayBrowserClientOptions = { onGap?: (info: { expected: number; received: number }) => void; }; +export type GatewayEventListener = (evt: GatewayEventFrame) => void; + // 4008 = application-defined code (browser rejects 1008 "Policy Violation") const CONNECT_FAILED_CLOSE_CODE = 4008; @@ -298,6 +300,7 @@ export class GatewayBrowserClient { private pendingConnectError: GatewayErrorInfo | undefined; private pendingDeviceTokenRetry = false; private deviceTokenRetryBudgetUsed = false; + private eventListeners = new Set(); constructor(private opts: GatewayBrowserClientOptions) {} @@ -549,6 +552,9 @@ export class GatewayBrowserClient { } try { this.opts.onEvent?.(evt); + for (const listener of this.eventListeners) { + listener(evt); + } } catch (err) { console.error("[gateway] event handler error:", err); } @@ -625,6 +631,13 @@ export class GatewayBrowserClient { return p; } + addEventListener(listener: GatewayEventListener): () => void { + this.eventListeners.add(listener); + return () => { + this.eventListeners.delete(listener); + }; + } + private queueConnect() { this.connectNonce = null; this.connectSent = false; diff --git a/ui/src/ui/views/chat.ts b/ui/src/ui/views/chat.ts index 896b5254239..ae8621787f3 100644 --- a/ui/src/ui/views/chat.ts +++ b/ui/src/ui/views/chat.ts @@ -18,6 +18,7 @@ import { import { InputHistory } from "../chat/input-history.ts"; import { PinnedMessages } from "../chat/pinned-messages.ts"; import { getPinnedMessageSummary } from "../chat/pinned-summary.ts"; +import type { RealtimeTalkStatus } from "../chat/realtime-talk.ts"; import { renderChatRunControls } from "../chat/run-controls.ts"; import { getOrCreateSessionCacheValue } from "../chat/session-cache.ts"; import { renderSideResult } from "../chat/side-result-render.ts"; @@ -65,6 +66,10 @@ export type ChatProps = { assistantAvatarUrl?: string | null; draft: string; queue: ChatQueueItem[]; + realtimeTalkActive?: boolean; + realtimeTalkStatus?: RealtimeTalkStatus; + realtimeTalkDetail?: string | null; + realtimeTalkTranscript?: string | null; connected: boolean; canSend: boolean; disabledReason: string | null; @@ -95,6 +100,7 @@ export type ChatProps = { onDraftChange: (next: string) => void; onRequestUpdate?: () => void; onSend: () => void; + onToggleRealtimeTalk?: () => void; onAbort?: () => void; onQueueRemove: (id: string) => void; onQueueSteer?: (id: string) => void; @@ -1207,6 +1213,19 @@ export function renderChat(props: ChatProps) { ${vs.sttRecording && vs.sttInterimText ? html`
${vs.sttInterimText}
` : nothing} + ${props.realtimeTalkActive || props.realtimeTalkDetail || props.realtimeTalkTranscript + ? html` +
+ ${props.realtimeTalkDetail ?? + props.realtimeTalkTranscript ?? + (props.realtimeTalkStatus === "thinking" + ? "Asking OpenClaw..." + : props.realtimeTalkStatus === "connecting" + ? "Connecting Talk..." + : "Talk live")} +
+ ` + : nothing}