feat: add browser realtime talk

This commit is contained in:
Peter Steinberger
2026-04-24 03:33:29 +01:00
parent d42069b11e
commit 04066d246a
26 changed files with 765 additions and 45 deletions

View File

@@ -7,6 +7,7 @@ Docs: https://docs.openclaw.ai
### Changes
- Control UI/chat: add a Steer action on queued messages so a browser follow-up can be injected into the active run without retyping it.
- Control UI/Talk: add browser WebRTC realtime voice sessions backed by OpenAI Realtime, with Gateway-minted ephemeral client secrets and `openclaw_agent_consult` handoff to the full OpenClaw agent.
- Agents/tools: add optional per-call `timeoutMs` support for image, video, music, and TTS generation tools so agents can extend provider request timeouts only when a specific generation needs it.
- Agents/subagents: add optional forked context for native `sessions_spawn` runs so agents can let a child inherit the requester transcript when needed, while keeping clean isolated sessions as the default; includes prompt guidance, context-engine hook metadata, docs, and QA coverage.
- Codex harness: add structured debug logging for embedded harness selection decisions so `/status` stays simple while gateway logs explain auto-selection and Pi fallback reasons. (#70760) Thanks @100yenadmin.

View File

@@ -1,2 +1,2 @@
793ed905cb0ba93b9a2f8c2c85c3cfb4d194dd9263353e74952bf9e382b03dc2 plugin-sdk-api-baseline.json
032e7fd6f48344c9b3b98fd3e877e6d30cab92ed9a39dd309796cf1f0220820f plugin-sdk-api-baseline.jsonl
96905c33f4498446f612ae17dee6affdf84ef0e2e5a0f25bf7191c315f5b826f plugin-sdk-api-baseline.json
d8eb6331562fde29531eaac18409bb7fabcc70623bf25395f8e5710a49765f0f plugin-sdk-api-baseline.jsonl

View File

@@ -25,19 +25,19 @@ API-enabled model such as `openai/gpt-5.4` for `OPENAI_API_KEY` setups.
## OpenClaw feature coverage
| OpenAI capability | OpenClaw surface | Status |
| ------------------------- | ------------------------------------------------------ | ------------------------------------------------------ |
| Chat / Responses | `openai/<model>` model provider | Yes |
| Codex subscription models | `openai-codex/<model>` with `openai-codex` OAuth | Yes |
| Codex app-server harness | `openai/<model>` with `embeddedHarness.runtime: codex` | Yes |
| Server-side web search | Native OpenAI Responses tool | Yes, when web search is enabled and no provider pinned |
| Images | `image_generate` | Yes |
| Videos | `video_generate` | Yes |
| Text-to-speech | `messages.tts.provider: "openai"` / `tts` | Yes |
| Batch speech-to-text | `tools.media.audio` / media understanding | Yes |
| Streaming speech-to-text | Voice Call `streaming.provider: "openai"` | Yes |
| Realtime voice | Voice Call `realtime.provider: "openai"` | Yes |
| Embeddings | memory embedding provider | Yes |
| OpenAI capability | OpenClaw surface | Status |
| ------------------------- | ---------------------------------------------------------- | ------------------------------------------------------ |
| Chat / Responses | `openai/<model>` model provider | Yes |
| Codex subscription models | `openai-codex/<model>` with `openai-codex` OAuth | Yes |
| Codex app-server harness | `openai/<model>` with `embeddedHarness.runtime: codex` | Yes |
| Server-side web search | Native OpenAI Responses tool | Yes, when web search is enabled and no provider pinned |
| Images | `image_generate` | Yes |
| Videos | `video_generate` | Yes |
| Text-to-speech | `messages.tts.provider: "openai"` / `tts` | Yes |
| Batch speech-to-text | `tools.media.audio` / media understanding | Yes |
| Streaming speech-to-text | Voice Call `streaming.provider: "openai"` | Yes |
| Realtime voice | Voice Call `realtime.provider: "openai"` / Control UI Talk | Yes |
| Embeddings | memory embedding provider | Yes |
## Getting started

View File

@@ -105,6 +105,11 @@ locale picker lives in the Gateway Access card, not under Appearance.
## What it can do (today)
- Chat with the model via Gateway WS (`chat.history`, `chat.send`, `chat.abort`, `chat.inject`)
- Talk to OpenAI Realtime directly from the browser via WebRTC. The Gateway
mints a short-lived Realtime client secret with `talk.realtime.session`; the
browser sends microphone audio directly to OpenAI and relays
`openclaw_agent_consult` tool calls back through `chat.send` for the larger
configured OpenClaw model.
- Stream tool calls + live tool output cards in Chat (agent events)
- Channels: built-in plus bundled/external plugin channels status, QR login, and per-channel config (`channels.status`, `web.login.*`, `config.patch`)
- Instances: presence list + refresh (`system-presence`)
@@ -151,6 +156,10 @@ Cron jobs panel notes:
- `chat.history` also strips display-only inline directive tags from visible assistant text (for example `[[reply_to_*]]` and `[[audio_as_voice]]`), plain-text tool-call XML payloads (including `<tool_call>...</tool_call>`, `<function_call>...</function_call>`, `<tool_calls>...</tool_calls>`, `<function_calls>...</function_calls>`, and truncated tool-call blocks), and leaked ASCII/full-width model control tokens, and omits assistant entries whose whole visible text is only the exact silent token `NO_REPLY` / `no_reply`.
- `chat.inject` appends an assistant note to the session transcript and broadcasts a `chat` event for UI-only updates (no agent run, no channel delivery).
- The chat header model and thinking pickers patch the active session immediately through `sessions.patch`; they are persistent session overrides, not one-turn-only send options.
- Talk mode uses the registered realtime voice provider. Configure OpenAI with
`talk.provider: "openai"` plus `talk.providers.openai.apiKey`, or reuse the
Voice Call realtime provider config. The browser never receives the standard
OpenAI API key; it receives only the ephemeral Realtime client secret.
- Stop:
- Click **Stop** (calls `chat.abort`)
- While a run is active, normal follow-ups queue. Click **Steer** on a queued message to inject that follow-up into the running turn.

View File

@@ -1,7 +1,11 @@
import { randomUUID } from "node:crypto";
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime";
import type { PluginRuntime, RuntimeLogger } from "openclaw/plugin-sdk/plugin-runtime";
import type { RealtimeVoiceTool } from "openclaw/plugin-sdk/realtime-voice";
import {
REALTIME_VOICE_AGENT_CONSULT_TOOL,
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
type RealtimeVoiceTool,
} from "openclaw/plugin-sdk/realtime-voice";
import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime";
import type { GoogleMeetConfig, GoogleMeetToolPolicy } from "./config.js";
@@ -11,32 +15,8 @@ type AgentPayload = {
isReasoning?: boolean;
};
export const GOOGLE_MEET_AGENT_CONSULT_TOOL_NAME = "openclaw_agent_consult";
export const GOOGLE_MEET_AGENT_CONSULT_TOOL: RealtimeVoiceTool = {
type: "function",
name: GOOGLE_MEET_AGENT_CONSULT_TOOL_NAME,
description:
"Ask the full OpenClaw agent for deeper reasoning, current information, or tool-backed help before speaking in the meeting.",
parameters: {
type: "object",
properties: {
question: {
type: "string",
description: "The concrete question or task the meeting participant asked.",
},
context: {
type: "string",
description: "Optional relevant meeting context or transcript summary.",
},
responseStyle: {
type: "string",
description: "Optional style hint for the spoken answer.",
},
},
required: ["question"],
},
};
export const GOOGLE_MEET_AGENT_CONSULT_TOOL_NAME = REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME;
export const GOOGLE_MEET_AGENT_CONSULT_TOOL = REALTIME_VOICE_AGENT_CONSULT_TOOL;
export function resolveGoogleMeetRealtimeTools(policy: GoogleMeetToolPolicy): RealtimeVoiceTool[] {
return policy === "none" ? [] : [GOOGLE_MEET_AGENT_CONSULT_TOOL];

View File

@@ -1,3 +1,4 @@
import { REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME } from "openclaw/plugin-sdk/realtime-voice";
import {
normalizeOptionalLowercaseString,
normalizeOptionalString,
@@ -94,8 +95,7 @@ export const DEFAULT_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND = [
"-",
] as const;
export const DEFAULT_GOOGLE_MEET_REALTIME_INSTRUCTIONS =
"You are joining a private Google Meet as an OpenClaw agent. Keep spoken replies brief and natural. When a question needs deeper reasoning, current information, or tools, call openclaw_agent_consult before answering.";
export const DEFAULT_GOOGLE_MEET_REALTIME_INSTRUCTIONS = `You are joining a private Google Meet as an OpenClaw agent. Keep spoken replies brief and natural. When a question needs deeper reasoning, current information, or tools, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} before answering.`;
export const DEFAULT_GOOGLE_MEET_CONFIG: GoogleMeetConfig = {
enabled: true,

View File

@@ -6,6 +6,8 @@ import {
} from "openclaw/plugin-sdk/proxy-capture";
import type {
RealtimeVoiceBridge,
RealtimeVoiceBrowserSession,
RealtimeVoiceBrowserSessionCreateRequest,
RealtimeVoiceBridgeCreateRequest,
RealtimeVoiceProviderConfig,
RealtimeVoiceProviderPlugin,
@@ -59,6 +61,8 @@ type OpenAIRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & {
azureApiVersion?: string;
};
const OPENAI_REALTIME_DEFAULT_MODEL = "gpt-realtime-1.5";
type RealtimeEvent = {
type: string;
delta?: string;
@@ -117,7 +121,7 @@ function base64ToBuffer(b64: string): Buffer {
}
class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
private static readonly DEFAULT_MODEL = "gpt-realtime-1.5";
private static readonly DEFAULT_MODEL = OPENAI_REALTIME_DEFAULT_MODEL;
private static readonly MAX_RECONNECT_ATTEMPTS = 5;
private static readonly BASE_RECONNECT_DELAY_MS = 1000;
private static readonly CONNECT_TIMEOUT_MS = 10_000;
@@ -579,6 +583,77 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
}
}
/**
 * Read a named property from an unknown value and return it as a trimmed,
 * non-empty string — or `undefined` when the value is not an object, the
 * property is not a string, or it is blank after trimming.
 */
function readStringField(value: unknown, key: string): string | undefined {
  if (typeof value !== "object" || value === null) {
    return undefined;
  }
  const candidate = (value as Record<string, unknown>)[key];
  if (typeof candidate !== "string") {
    return undefined;
  }
  const trimmed = candidate.trim();
  return trimmed ? trimmed : undefined;
}
/**
 * Mint a short-lived OpenAI Realtime client secret for a browser WebRTC
 * session via `POST /v1/realtime/client_secrets`.
 *
 * The API key comes from the provider config, falling back to the
 * `OPENAI_API_KEY` environment variable. Azure-configured providers are
 * rejected (not supported for browser sessions yet).
 *
 * @throws Error when the key is missing, Azure config is present, the HTTP
 *   request fails, or the response carries no client secret.
 */
async function createOpenAIRealtimeBrowserSession(
  req: RealtimeVoiceBrowserSessionCreateRequest,
): Promise<RealtimeVoiceBrowserSession> {
  const config = normalizeProviderConfig(req.providerConfig);
  const apiKey = config.apiKey || process.env.OPENAI_API_KEY;
  if (!apiKey) {
    throw new Error("OpenAI API key missing");
  }
  if (config.azureEndpoint || config.azureDeployment) {
    throw new Error("OpenAI Realtime browser sessions do not support Azure endpoints yet");
  }
  // Request overrides win over provider config, which wins over the default.
  const model = req.model ?? config.model ?? OPENAI_REALTIME_DEFAULT_MODEL;
  const voice = (req.voice ?? config.voice ?? "alloy") as OpenAIRealtimeVoice;
  const sessionBody: Record<string, unknown> = {
    type: "realtime",
    model,
    instructions: req.instructions,
    audio: { output: { voice } },
  };
  // Only attach tools (and auto tool choice) when the caller supplied some.
  if (req.tools && req.tools.length > 0) {
    sessionBody.tools = req.tools;
    sessionBody.tool_choice = "auto";
  }
  const response = await fetch("https://api.openai.com/v1/realtime/client_secrets", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({ session: sessionBody }),
  });
  if (!response.ok) {
    // Best-effort body read for diagnostics; fall back to the status text.
    const detail = await response.text().catch(() => "");
    throw new Error(
      `OpenAI Realtime browser session failed (${response.status}): ${detail || response.statusText}`,
    );
  }
  const payload = (await response.json()) as unknown;
  const payloadRecord =
    payload && typeof payload === "object" ? (payload as Record<string, unknown>) : undefined;
  // The secret may appear at the top level or nested under `client_secret`.
  const clientSecret =
    readStringField(payload, "value") ?? readStringField(payloadRecord?.client_secret, "value");
  if (!clientSecret) {
    throw new Error("OpenAI Realtime browser session did not return a client secret");
  }
  const expiresAt = payloadRecord?.expires_at;
  return {
    provider: "openai",
    clientSecret,
    model,
    voice,
    // Only forward expiry when the provider reported a numeric value.
    ...(typeof expiresAt === "number" ? { expiresAt } : {}),
  };
}
export function buildOpenAIRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin {
return {
id: "openai",
@@ -607,6 +682,7 @@ export function buildOpenAIRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin
azureApiVersion: config.azureApiVersion,
});
},
createBrowserSession: createOpenAIRealtimeBrowserSession,
};
}

View File

@@ -125,6 +125,7 @@ const METHOD_SCOPE_GROUPS: Record<OperatorScope, readonly string[]> = {
"agent.wait",
"wake",
"talk.mode",
"talk.realtime.session",
"talk.speak",
"tts.enable",
"tts.disable",

View File

@@ -52,6 +52,10 @@ import {
TalkConfigParamsSchema,
type TalkConfigResult,
TalkConfigResultSchema,
type TalkRealtimeSessionParams,
TalkRealtimeSessionParamsSchema,
type TalkRealtimeSessionResult,
TalkRealtimeSessionResultSchema,
type TalkSpeakParams,
TalkSpeakParamsSchema,
type TalkSpeakResult,
@@ -428,6 +432,12 @@ export const validateWizardStatusParams = ajv.compile<WizardStatusParams>(Wizard
export const validateTalkModeParams = ajv.compile<TalkModeParams>(TalkModeParamsSchema);
export const validateTalkConfigParams = ajv.compile<TalkConfigParams>(TalkConfigParamsSchema);
export const validateTalkConfigResult = ajv.compile<TalkConfigResult>(TalkConfigResultSchema);
export const validateTalkRealtimeSessionParams = ajv.compile<TalkRealtimeSessionParams>(
TalkRealtimeSessionParamsSchema,
);
export const validateTalkRealtimeSessionResult = ajv.compile<TalkRealtimeSessionResult>(
TalkRealtimeSessionResultSchema,
);
export const validateTalkSpeakParams = ajv.compile<TalkSpeakParams>(TalkSpeakParamsSchema);
export const validateTalkSpeakResult = ajv.compile<TalkSpeakResult>(TalkSpeakResultSchema);
export const validateChannelsStatusParams = ajv.compile<ChannelsStatusParams>(
@@ -616,6 +626,8 @@ export {
WizardStatusResultSchema,
TalkConfigParamsSchema,
TalkConfigResultSchema,
TalkRealtimeSessionParamsSchema,
TalkRealtimeSessionResultSchema,
TalkSpeakParamsSchema,
TalkSpeakResultSchema,
ChannelsStatusParamsSchema,
@@ -720,6 +732,8 @@ export type {
WizardStatusResult,
TalkConfigParams,
TalkConfigResult,
TalkRealtimeSessionParams,
TalkRealtimeSessionResult,
TalkSpeakParams,
TalkSpeakResult,
TalkModeParams,

View File

@@ -36,6 +36,28 @@ export const TalkSpeakParamsSchema = Type.Object(
{ additionalProperties: false },
);
// Params for `talk.realtime.session`. Every field is optional; omitted values
// fall back to the Gateway's configured talk / realtime provider settings.
export const TalkRealtimeSessionParamsSchema = Type.Object(
  {
    sessionKey: Type.Optional(Type.String()),
    provider: Type.Optional(Type.String()),
    model: Type.Optional(Type.String()),
    voice: Type.Optional(Type.String()),
    instructions: Type.Optional(Type.String()),
  },
  { additionalProperties: false },
);
// Result for `talk.realtime.session`: ephemeral browser credentials.
// `clientSecret` is the short-lived realtime secret the browser uses directly;
// `expiresAt` is provider-reported (presumably a Unix timestamp — confirm
// against the provider implementation).
export const TalkRealtimeSessionResultSchema = Type.Object(
  {
    provider: NonEmptyString,
    clientSecret: NonEmptyString,
    model: Type.Optional(Type.String()),
    voice: Type.Optional(Type.String()),
    expiresAt: Type.Optional(Type.Number()),
  },
  { additionalProperties: false },
);
const talkProviderFieldSchemas = {
apiKey: Type.Optional(SecretInputSchema),
};

View File

@@ -54,6 +54,8 @@ import {
ChannelsLogoutParamsSchema,
TalkConfigParamsSchema,
TalkConfigResultSchema,
TalkRealtimeSessionParamsSchema,
TalkRealtimeSessionResultSchema,
TalkSpeakParamsSchema,
TalkSpeakResultSchema,
ChannelsStatusParamsSchema,
@@ -279,6 +281,8 @@ export const ProtocolSchemas = {
TalkModeParams: TalkModeParamsSchema,
TalkConfigParams: TalkConfigParamsSchema,
TalkConfigResult: TalkConfigResultSchema,
TalkRealtimeSessionParams: TalkRealtimeSessionParamsSchema,
TalkRealtimeSessionResult: TalkRealtimeSessionResultSchema,
TalkSpeakParams: TalkSpeakParamsSchema,
TalkSpeakResult: TalkSpeakResultSchema,
ChannelsStatusParams: ChannelsStatusParamsSchema,

View File

@@ -80,6 +80,8 @@ export type WizardStatusResult = SchemaType<"WizardStatusResult">;
export type TalkModeParams = SchemaType<"TalkModeParams">;
export type TalkConfigParams = SchemaType<"TalkConfigParams">;
export type TalkConfigResult = SchemaType<"TalkConfigResult">;
export type TalkRealtimeSessionParams = SchemaType<"TalkRealtimeSessionParams">;
export type TalkRealtimeSessionResult = SchemaType<"TalkRealtimeSessionResult">;
export type TalkSpeakParams = SchemaType<"TalkSpeakParams">;
export type TalkSpeakResult = SchemaType<"TalkSpeakResult">;
export type ChannelsStatusParams = SchemaType<"ChannelsStatusParams">;

View File

@@ -48,6 +48,7 @@ const BASE_METHODS = [
"wizard.cancel",
"wizard.status",
"talk.config",
"talk.realtime.session",
"talk.speak",
"talk.mode",
"commands.list",

View File

@@ -7,6 +7,13 @@ import {
} from "../../config/talk.js";
import type { TalkConfigResponse, TalkProviderConfig } from "../../config/types.gateway.js";
import type { OpenClawConfig, TtsConfig, TtsProviderConfigMap } from "../../config/types.js";
import {
REALTIME_VOICE_AGENT_CONSULT_TOOL,
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
} from "../../realtime-voice/agent-consult-tool.js";
import { getRealtimeVoiceProvider } from "../../realtime-voice/provider-registry.js";
import { resolveConfiguredRealtimeVoiceProvider } from "../../realtime-voice/provider-resolver.js";
import type { RealtimeVoiceProviderConfig } from "../../realtime-voice/provider-types.js";
import {
normalizeLowercaseStringOrEmpty,
normalizeOptionalLowercaseString,
@@ -22,6 +29,7 @@ import {
type TalkSpeakParams,
validateTalkConfigParams,
validateTalkModeParams,
validateTalkRealtimeSessionParams,
validateTalkSpeakParams,
} from "../protocol/index.js";
import { formatForLog } from "../ws-log.js";
@@ -136,6 +144,63 @@ function buildTalkTtsConfig(
};
}
function getRecord(value: unknown): Record<string, unknown> | undefined {
return asRecord(value) ?? undefined;
}
/**
 * Extract the realtime-voice provider selection from the voice-call plugin's
 * config (`plugins.entries["voice-call"].config.realtime`), if present.
 *
 * Each traversal step is defensive: any missing or non-object level yields an
 * empty result rather than throwing. Provider entries that are not objects
 * are dropped; `providers` is `undefined` when none survive.
 */
function getVoiceCallRealtimeConfig(config: OpenClawConfig): {
  provider?: string;
  providers?: Record<string, RealtimeVoiceProviderConfig>;
} {
  const pluginEntries = getRecord(getRecord(config.plugins)?.entries);
  const voiceCallConfig = getRecord(getRecord(pluginEntries?.["voice-call"])?.config);
  const realtime = getRecord(voiceCallConfig?.realtime);
  const normalized: Record<string, RealtimeVoiceProviderConfig> = {};
  const rawProviders = getRecord(realtime?.providers);
  for (const [providerId, entry] of Object.entries(rawProviders ?? {})) {
    const record = getRecord(entry);
    if (record) {
      normalized[providerId] = record;
    }
  }
  const hasProviders = Object.keys(normalized).length > 0;
  return {
    provider: normalizeOptionalString(realtime?.provider),
    providers: hasProviders ? normalized : undefined,
  };
}
/**
 * Merge realtime provider settings for a Talk session.
 *
 * Provider id precedence: explicit request param, then `talk.provider` (only
 * when a realtime provider is actually registered under that id), then the
 * voice-call plugin's realtime provider. Provider config maps are merged with
 * `talk.providers` entries overriding voice-call entries of the same id.
 */
function buildTalkRealtimeConfig(config: OpenClawConfig, requestedProvider?: string) {
  const fromVoiceCall = getVoiceCallRealtimeConfig(config);
  const talkProviderConfigs = config.talk?.providers as
    | Record<string, RealtimeVoiceProviderConfig>
    | undefined;
  const configuredTalkProvider = normalizeOptionalString(config.talk?.provider);
  // `talk.provider` may name a TTS-only provider; only use it as a realtime
  // fallback when a realtime plugin is registered for that id.
  let talkFallback: string | undefined;
  if (configuredTalkProvider && getRealtimeVoiceProvider(configuredTalkProvider, config)) {
    talkFallback = configuredTalkProvider;
  }
  return {
    provider:
      normalizeOptionalString(requestedProvider) ?? talkFallback ?? fromVoiceCall.provider,
    providers: {
      ...fromVoiceCall.providers,
      ...talkProviderConfigs,
    },
  };
}
/**
 * Build the system instructions for a browser realtime session: the fixed
 * base prompt (which tells the model when to call the agent-consult tool),
 * optionally followed by caller-supplied extra instructions.
 */
function buildRealtimeInstructions(extra: string | undefined): string {
  const base = `You are OpenClaw's realtime voice interface. Keep spoken replies concise. If the user asks for code, repository state, tools, files, current OpenClaw context, or deeper reasoning, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} and then summarize the result naturally.`;
  const extraText = normalizeOptionalString(extra);
  if (!extraText) {
    return base;
  }
  return `${base}\n\n${extraText}`;
}
function isFallbackEligibleTalkReason(reason: TalkSpeakReason): boolean {
return (
reason === "talk_unconfigured" ||
@@ -334,6 +399,67 @@ export const talkHandlers: GatewayRequestHandlers = {
respond(true, { config: configPayload }, undefined);
},
// Mint an ephemeral realtime client secret for a browser Talk session.
// Validates params, resolves the configured realtime provider, requires
// browser-session support on it, and responds with the provider's ephemeral
// credentials. NOTE(review): `sessionKey` is accepted by the params schema
// but not read here — confirm whether it is intentionally unused.
"talk.realtime.session": async ({ params, respond }) => {
  if (!validateTalkRealtimeSessionParams(params)) {
    respond(
      false,
      undefined,
      errorShape(
        ErrorCodes.INVALID_REQUEST,
        `invalid talk.realtime.session params: ${formatValidationErrors(validateTalkRealtimeSessionParams.errors)}`,
      ),
    );
    return;
  }
  const typedParams = params as {
    provider?: string;
    model?: string;
    voice?: string;
    instructions?: string;
  };
  try {
    const runtimeConfig = loadConfig();
    // Merge talk + voice-call realtime settings; the request may pin a provider.
    const realtimeConfig = buildTalkRealtimeConfig(runtimeConfig, typedParams.provider);
    const resolution = resolveConfiguredRealtimeVoiceProvider({
      configuredProviderId: realtimeConfig.provider,
      providerConfigs: realtimeConfig.providers,
      cfg: runtimeConfig,
      cfgForResolve: runtimeConfig,
      noRegisteredProviderMessage: "No realtime voice provider registered",
    });
    // Browser sessions are optional on the provider plugin interface.
    if (!resolution.provider.createBrowserSession) {
      respond(
        false,
        undefined,
        errorShape(
          ErrorCodes.UNAVAILABLE,
          `Realtime voice provider "${resolution.provider.id}" does not support browser WebRTC sessions`,
        ),
      );
      return;
    }
    const session = await resolution.provider.createBrowserSession({
      providerConfig: resolution.providerConfig,
      instructions: buildRealtimeInstructions(typedParams.instructions),
      // Always expose the agent-consult handoff tool to the realtime model.
      tools: [REALTIME_VOICE_AGENT_CONSULT_TOOL],
      model: normalizeOptionalString(typedParams.model),
      voice: normalizeOptionalString(typedParams.voice),
    });
    // Only echo back optional fields the provider actually populated.
    respond(
      true,
      {
        provider: session.provider,
        clientSecret: session.clientSecret,
        ...(session.model ? { model: session.model } : {}),
        ...(session.voice ? { voice: session.voice } : {}),
        ...(typeof session.expiresAt === "number" ? { expiresAt: session.expiresAt } : {}),
      },
      undefined,
    );
  } catch (err) {
    respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
  }
},
"talk.speak": async ({ params, respond }) => {
if (!validateTalkSpeakParams(params)) {
respond(

View File

@@ -2,6 +2,8 @@ export type { RealtimeVoiceProviderPlugin } from "../plugins/types.js";
export type {
RealtimeVoiceBridge,
RealtimeVoiceBridgeCallbacks,
RealtimeVoiceBrowserSession,
RealtimeVoiceBrowserSessionCreateRequest,
RealtimeVoiceBridgeCreateRequest,
RealtimeVoiceCloseReason,
RealtimeVoiceProviderConfig,
@@ -12,6 +14,10 @@ export type {
RealtimeVoiceTool,
RealtimeVoiceToolCallEvent,
} from "../realtime-voice/provider-types.js";
export {
REALTIME_VOICE_AGENT_CONSULT_TOOL,
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
} from "../realtime-voice/agent-consult-tool.js";
export {
canonicalizeRealtimeVoiceProviderId,
getRealtimeVoiceProvider,

View File

@@ -40,6 +40,8 @@ import type {
} from "../realtime-transcription/provider-types.js";
import type {
RealtimeVoiceBridge,
RealtimeVoiceBrowserSession,
RealtimeVoiceBrowserSessionCreateRequest,
RealtimeVoiceBridgeCreateRequest,
RealtimeVoiceProviderConfig,
RealtimeVoiceProviderConfiguredContext,
@@ -1661,6 +1663,9 @@ export type RealtimeVoiceProviderPlugin = {
resolveConfig?: (ctx: RealtimeVoiceProviderResolveConfigContext) => RealtimeVoiceProviderConfig;
isConfigured: (ctx: RealtimeVoiceProviderConfiguredContext) => boolean;
createBridge: (req: RealtimeVoiceBridgeCreateRequest) => RealtimeVoiceBridge;
createBrowserSession?: (
req: RealtimeVoiceBrowserSessionCreateRequest,
) => Promise<RealtimeVoiceBrowserSession>;
};
export type PluginRealtimeVoiceProviderEntry = RealtimeVoiceProviderPlugin & {

View File

@@ -0,0 +1,28 @@
import type { RealtimeVoiceTool } from "./provider-types.js";
export const REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME = "openclaw_agent_consult";
export const REALTIME_VOICE_AGENT_CONSULT_TOOL: RealtimeVoiceTool = {
type: "function",
name: REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
description:
"Ask the full OpenClaw agent for deeper reasoning, current information, or tool-backed help before speaking.",
parameters: {
type: "object",
properties: {
question: {
type: "string",
description: "The concrete question or task the user asked.",
},
context: {
type: "string",
description: "Optional relevant context or transcript summary.",
},
responseStyle: {
type: "string",
description: "Optional style hint for the spoken answer.",
},
},
required: ["question"],
},
};

View File

@@ -53,6 +53,22 @@ export type RealtimeVoiceBridgeCreateRequest = RealtimeVoiceBridgeCallbacks & {
tools?: RealtimeVoiceTool[];
};
/**
 * Request for a provider to mint an ephemeral browser realtime session.
 * `providerConfig` is required; the optional fields fall back to provider
 * defaults when omitted.
 */
export type RealtimeVoiceBrowserSessionCreateRequest = {
  providerConfig: RealtimeVoiceProviderConfig;
  // System-style instructions for the realtime model.
  instructions?: string;
  // Tools the realtime model may call (e.g. the agent-consult handoff tool).
  tools?: RealtimeVoiceTool[];
  model?: string;
  voice?: string;
};
/**
 * Ephemeral credentials handed to the browser. `clientSecret` is the
 * short-lived secret the browser uses to reach the provider directly;
 * `expiresAt` is provider-reported (presumably a Unix timestamp — confirm
 * against each provider implementation).
 */
export type RealtimeVoiceBrowserSession = {
  provider: RealtimeVoiceProviderId;
  clientSecret: string;
  model?: string;
  voice?: string;
  expiresAt?: number;
};
export type RealtimeVoiceBridge = {
connect(): Promise<void>;
sendAudio(audio: Buffer): void;

View File

@@ -584,6 +584,15 @@
background: color-mix(in srgb, var(--accent) 12%, transparent);
}
/* Talk button variant: tinted with the danger accent (hardcoded #ef4444
   fallback when --danger is unset). */
.agent-chat__input-btn--talk {
  color: var(--danger, #ef4444);
  background: color-mix(in srgb, var(--danger, #ef4444) 14%, transparent);
}
/* Talk status text uses the standard foreground color. */
.agent-chat__talk-status {
  color: var(--text);
}
.agent-chat__input-divider {
width: 1px;
height: 16px;

View File

@@ -33,6 +33,11 @@ type LifecycleHost = {
allowExternalEmbedUrls: boolean;
chatHasAutoScrolled: boolean;
chatManualRefreshInFlight: boolean;
realtimeTalkSession?: { stop: () => void } | null;
realtimeTalkActive?: boolean;
realtimeTalkStatus?: string;
realtimeTalkDetail?: string | null;
realtimeTalkTranscript?: string | null;
chatLoading: boolean;
chatMessages: unknown[];
chatToolMessages: unknown[];
@@ -77,6 +82,12 @@ export function handleDisconnected(host: LifecycleHost) {
stopNodesPolling(host as unknown as Parameters<typeof stopNodesPolling>[0]);
stopLogsPolling(host as unknown as Parameters<typeof stopLogsPolling>[0]);
stopDebugPolling(host as unknown as Parameters<typeof stopDebugPolling>[0]);
host.realtimeTalkSession?.stop();
host.realtimeTalkSession = null;
host.realtimeTalkActive = false;
host.realtimeTalkStatus = "idle";
host.realtimeTalkDetail = null;
host.realtimeTalkTranscript = null;
host.client?.stop();
host.client = null;
host.connected = false;

View File

@@ -2228,6 +2228,10 @@ export function renderApp(state: AppViewState) {
streamStartedAt: state.chatStreamStartedAt,
draft: state.chatMessage,
queue: state.chatQueue,
realtimeTalkActive: state.realtimeTalkActive,
realtimeTalkStatus: state.realtimeTalkStatus,
realtimeTalkDetail: state.realtimeTalkDetail,
realtimeTalkTranscript: state.realtimeTalkTranscript,
connected: state.connected,
canSend: state.connected,
disabledReason: chatDisabledReason,
@@ -2256,6 +2260,7 @@ export function renderApp(state: AppViewState) {
attachments: state.chatAttachments,
onAttachmentsChange: (next) => (state.chatAttachments = next),
onSend: () => state.handleSendChat(),
onToggleRealtimeTalk: () => state.toggleRealtimeTalk(),
canAbort: Boolean(state.chatRunId),
onAbort: () => void state.handleAbortChat(),
onQueueRemove: (id) => state.removeQueuedMessage(id),

View File

@@ -1,5 +1,6 @@
import type { EventLogEntry } from "./app-events.ts";
import type { CompactionStatus, FallbackStatus } from "./app-tool-stream.ts";
import type { RealtimeTalkStatus } from "./chat/realtime-talk.ts";
import type { ChatSideResult } from "./chat/side-result.ts";
import type { CronModelSuggestionsState, CronState } from "./controllers/cron.ts";
import type { DevicePairingList } from "./controllers/devices.ts";
@@ -92,6 +93,10 @@ export type AppViewState = {
chatModelsLoading: boolean;
chatModelCatalog: ModelCatalogEntry[];
chatQueue: ChatQueueItem[];
realtimeTalkActive: boolean;
realtimeTalkStatus: RealtimeTalkStatus;
realtimeTalkDetail: string | null;
realtimeTalkTranscript: string | null;
chatManualRefreshInFlight: boolean;
nodesLoading: boolean;
nodes: Array<Record<string, unknown>>;
@@ -425,6 +430,7 @@ export type AppViewState = {
setPassword: (next: string) => void;
setChatMessage: (next: string) => void;
handleSendChat: (messageOverride?: string, opts?: { restoreDraft?: boolean }) => Promise<void>;
toggleRealtimeTalk: () => Promise<void>;
steerQueuedChatMessage: (id: string) => Promise<void>;
handleAbortChat: () => Promise<void>;
removeQueuedMessage: (id: string) => void;

View File

@@ -57,6 +57,7 @@ import {
import type { AppViewState } from "./app-view-state.ts";
import { normalizeAssistantIdentity } from "./assistant-identity.ts";
import { exportChatMarkdown } from "./chat/export.ts";
import { RealtimeTalkSession, type RealtimeTalkStatus } from "./chat/realtime-talk.ts";
import type { ChatSideResult } from "./chat/side-result.ts";
import {
loadToolsEffective as loadToolsEffectiveInternal,
@@ -192,6 +193,11 @@ export class OpenClawApp extends LitElement {
@state() chatModelCatalog: ModelCatalogEntry[] = [];
@state() chatQueue: ChatQueueItem[] = [];
@state() chatAttachments: ChatAttachment[] = [];
@state() realtimeTalkActive = false;
@state() realtimeTalkStatus: RealtimeTalkStatus = "idle";
@state() realtimeTalkDetail: string | null = null;
@state() realtimeTalkTranscript: string | null = null;
private realtimeTalkSession: RealtimeTalkSession | null = null;
@state() chatManualRefreshInFlight = false;
@state() navDrawerOpen = false;
@@ -710,6 +716,51 @@ export class OpenClawApp extends LitElement {
);
}
// Toggle the browser realtime Talk session: stop and reset state if one is
// active, otherwise start a new RealtimeTalkSession wired to state updates.
async toggleRealtimeTalk() {
  // Active session: stop it and reset all talk-related UI state.
  if (this.realtimeTalkSession) {
    this.realtimeTalkSession.stop();
    this.realtimeTalkSession = null;
    this.realtimeTalkActive = false;
    this.realtimeTalkStatus = "idle";
    this.realtimeTalkDetail = null;
    this.realtimeTalkTranscript = null;
    return;
  }
  if (!this.client || !this.connected) {
    this.lastError = "Gateway not connected";
    return;
  }
  // Optimistically flip to "connecting" before the session handshake.
  this.realtimeTalkActive = true;
  this.realtimeTalkStatus = "connecting";
  this.realtimeTalkDetail = null;
  this.realtimeTalkTranscript = null;
  const session = new RealtimeTalkSession(this.client, this.sessionKey, {
    onStatus: (status, detail) => {
      this.realtimeTalkStatus = status;
      this.realtimeTalkDetail = detail ?? null;
      // "idle" deactivates the toggle; "error" keeps it active so the error
      // state stays visible. NOTE(review): on a callback-driven "idle" the
      // session object is not cleared here — confirm that is intentional.
      if (status === "idle" || status === "error") {
        this.realtimeTalkActive = status !== "idle";
      }
    },
    onTranscript: (entry) => {
      // Show only the most recent transcript line, prefixed by speaker.
      this.realtimeTalkTranscript = `${entry.role === "user" ? "You" : "OpenClaw"}: ${entry.text}`;
    },
  });
  this.realtimeTalkSession = session;
  try {
    await session.start();
  } catch (error) {
    // Tear down the failed session; guard against a racing toggle having
    // already replaced it.
    session.stop();
    if (this.realtimeTalkSession === session) {
      this.realtimeTalkSession = null;
    }
    this.realtimeTalkActive = false;
    this.realtimeTalkStatus = "error";
    this.realtimeTalkDetail = error instanceof Error ? error.message : String(error);
    this.lastError = this.realtimeTalkDetail;
  }
}
async steerQueuedChatMessage(id: string) {
await steerQueuedChatMessageInternal(
this as unknown as Parameters<typeof steerQueuedChatMessageInternal>[0],

View File

@@ -0,0 +1,300 @@
import { REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME } from "../../../../src/realtime-voice/agent-consult-tool.js";
import type { GatewayBrowserClient, GatewayEventFrame } from "../gateway.ts";
import { generateUUID } from "../uuid.ts";
export type RealtimeTalkStatus = "idle" | "connecting" | "listening" | "thinking" | "error";
export type RealtimeTalkCallbacks = {
onStatus?: (status: RealtimeTalkStatus, detail?: string) => void;
onTranscript?: (entry: { role: "user" | "assistant"; text: string; final: boolean }) => void;
};
export type RealtimeTalkSessionResult = {
provider: string;
clientSecret: string;
model?: string;
voice?: string;
expiresAt?: number;
};
type RealtimeServerEvent = {
type?: string;
item_id?: string;
call_id?: string;
name?: string;
delta?: string;
transcript?: string;
arguments?: string;
};
type ToolBuffer = {
name: string;
callId: string;
args: string;
};
type ChatPayload = {
runId?: string;
state?: string;
errorMessage?: string;
message?: unknown;
};
/**
 * Pull visible text out of a chat message payload. A top-level string `text`
 * field wins; otherwise text-type content blocks are joined with blank lines
 * and trimmed. Anything else yields the empty string.
 */
function extractTextFromMessage(message: unknown): string {
  if (typeof message !== "object" || message === null) {
    return "";
  }
  const record = message as Record<string, unknown>;
  if (typeof record.text === "string") {
    return record.text;
  }
  const blocks = Array.isArray(record.content) ? record.content : [];
  const texts: string[] = [];
  for (const block of blocks) {
    if (!block || typeof block !== "object") {
      continue;
    }
    const entry = block as Record<string, unknown>;
    if (entry.type === "text" && typeof entry.text === "string" && entry.text) {
      texts.push(entry.text);
    }
  }
  return texts.join("\n\n").trim();
}
/**
 * Wait for the chat run identified by `runId` to finish and resolve with its
 * final visible text (or a placeholder when the run produced none). Rejects
 * on a run error or after `timeoutMs`. The gateway event listener and the
 * timeout are always cleaned up before the promise settles.
 */
function waitForChatResult(params: {
  client: GatewayBrowserClient;
  runId: string;
  timeoutMs: number;
}): Promise<string> {
  return new Promise((resolve, reject) => {
    const timer = window.setTimeout(() => {
      unsubscribe();
      reject(new Error("OpenClaw tool call timed out"));
    }, params.timeoutMs);
    // Common teardown path: cancel the timer, detach, then settle.
    const settle = (outcome: () => void) => {
      window.clearTimeout(timer);
      unsubscribe();
      outcome();
    };
    const unsubscribe = params.client.addEventListener((evt: GatewayEventFrame) => {
      if (evt.event !== "chat") {
        return;
      }
      const payload = evt.payload as ChatPayload | undefined;
      if (!payload || payload.runId !== params.runId) {
        return;
      }
      if (payload.state === "final") {
        settle(() =>
          resolve(extractTextFromMessage(payload.message) || "OpenClaw finished with no text."),
        );
      } else if (payload.state === "error") {
        settle(() => reject(new Error(payload.errorMessage ?? "OpenClaw tool call failed")));
      }
    });
  });
}
/**
 * Browser-side WebRTC voice session against the OpenAI Realtime API.
 *
 * `start()` mints an ephemeral client secret via the gateway
 * (`talk.realtime.session`), opens an RTCPeerConnection carrying the mic
 * track plus an "oai-events" data channel, and completes the SDP
 * offer/answer exchange with api.openai.com. Server events arrive on the
 * data channel; the `openclaw_agent_consult` tool call is bridged into a
 * regular `chat.send` run whose final text is returned to the voice model
 * as a `function_call_output`.
 */
export class RealtimeTalkSession {
  private peer: RTCPeerConnection | null = null;
  private channel: RTCDataChannel | null = null;
  private media: MediaStream | null = null;
  // Hidden sink element that plays the remote audio track.
  private audio: HTMLAudioElement | null = null;
  private closed = false;
  // Streamed tool-call arguments, keyed by realtime item_id.
  private toolBuffers = new Map<string, ToolBuffer>();

  constructor(
    private readonly client: GatewayBrowserClient,
    private readonly sessionKey: string,
    private readonly callbacks: RealtimeTalkCallbacks = {},
  ) {}

  /**
   * Connects the realtime session.
   *
   * Rejects when the browser lacks WebRTC/microphone support, the gateway
   * refuses a session, or the SDP exchange fails. On any failure — or when
   * `stop()` runs while setup is still awaiting — every partially acquired
   * resource (peer connection, mic tracks, audio element) is released, so
   * the microphone is never left open after a failed start.
   */
  async start(): Promise<void> {
    if (!navigator.mediaDevices?.getUserMedia || typeof RTCPeerConnection === "undefined") {
      throw new Error("Realtime Talk requires browser WebRTC and microphone access");
    }
    this.closed = false;
    this.callbacks.onStatus?.("connecting");
    try {
      const session = await this.client.request<RealtimeTalkSessionResult>(
        "talk.realtime.session",
        { sessionKey: this.sessionKey },
      );
      if (this.closed) {
        return;
      }
      const peer = new RTCPeerConnection();
      this.peer = peer;
      this.audio = document.createElement("audio");
      this.audio.autoplay = true;
      this.audio.style.display = "none";
      document.body.append(this.audio);
      peer.addEventListener("track", (event) => {
        if (this.audio) {
          this.audio.srcObject = event.streams[0];
        }
      });
      this.media = await navigator.mediaDevices.getUserMedia({ audio: true });
      if (this.closed) {
        // stop() ran while we were waiting for the mic; release what we grabbed.
        this.teardown();
        return;
      }
      for (const track of this.media.getAudioTracks()) {
        peer.addTrack(track, this.media);
      }
      this.channel = peer.createDataChannel("oai-events");
      this.channel.addEventListener("open", () => this.callbacks.onStatus?.("listening"));
      this.channel.addEventListener("message", (event) => this.handleRealtimeEvent(event.data));
      peer.addEventListener("connectionstatechange", () => {
        if (this.closed) {
          return;
        }
        if (peer.connectionState === "failed" || peer.connectionState === "closed") {
          this.callbacks.onStatus?.("error", "Realtime connection closed");
        }
      });
      const offer = await peer.createOffer();
      await peer.setLocalDescription(offer);
      // The ephemeral client secret authorizes exactly this SDP exchange.
      const sdp = await fetch("https://api.openai.com/v1/realtime/calls", {
        method: "POST",
        body: offer.sdp,
        headers: {
          Authorization: `Bearer ${session.clientSecret}`,
          "Content-Type": "application/sdp",
        },
      });
      if (this.closed) {
        this.teardown();
        return;
      }
      if (!sdp.ok) {
        throw new Error(`Realtime WebRTC setup failed (${sdp.status})`);
      }
      await peer.setRemoteDescription({
        type: "answer",
        sdp: await sdp.text(),
      });
    } catch (error) {
      // Never leave the mic live / peer connected after a failed start.
      this.teardown();
      throw error;
    }
  }

  /** Ends the session, reports "idle", and releases all resources. */
  stop(): void {
    this.closed = true;
    this.callbacks.onStatus?.("idle");
    this.teardown();
  }

  /** Releases channel, peer, mic tracks, audio sink, and buffers; idempotent. */
  private teardown(): void {
    this.channel?.close();
    this.channel = null;
    this.peer?.close();
    this.peer = null;
    this.media?.getTracks().forEach((track) => track.stop());
    this.media = null;
    this.audio?.remove();
    this.audio = null;
    this.toolBuffers.clear();
  }

  /** Serializes and sends a client event if the data channel is open. */
  private send(event: unknown): void {
    if (this.channel?.readyState === "open") {
      this.channel.send(JSON.stringify(event));
    }
  }

  /** Parses one data-channel frame and dispatches the events we care about. */
  private handleRealtimeEvent(data: unknown): void {
    let event: RealtimeServerEvent;
    try {
      event = JSON.parse(String(data)) as RealtimeServerEvent;
    } catch {
      // Non-JSON frames are ignored.
      return;
    }
    switch (event.type) {
      case "conversation.item.input_audio_transcription.completed":
        // Final transcript of the user's speech.
        if (event.transcript) {
          this.callbacks.onTranscript?.({ role: "user", text: event.transcript, final: true });
        }
        return;
      case "response.audio_transcript.done":
        // Final transcript of the assistant's spoken reply.
        if (event.transcript) {
          this.callbacks.onTranscript?.({
            role: "assistant",
            text: event.transcript,
            final: true,
          });
        }
        return;
      case "response.function_call_arguments.delta":
        this.bufferToolDelta(event);
        return;
      case "response.function_call_arguments.done":
        void this.handleToolCall(event);
        return;
      default:
        return;
    }
  }

  /** Accumulates a streamed argument chunk for the tool call on this item. */
  private bufferToolDelta(event: RealtimeServerEvent): void {
    const key = event.item_id ?? "unknown";
    const existing = this.toolBuffers.get(key);
    if (existing) {
      existing.args += event.delta ?? "";
      return;
    }
    this.toolBuffers.set(key, {
      name: event.name ?? "",
      callId: event.call_id ?? "",
      args: event.delta ?? "",
    });
  }

  /**
   * Completes a tool call: for `openclaw_agent_consult`, forwards the
   * question to the OpenClaw agent via `chat.send`, waits for the run's
   * final text, and hands it back to the realtime model. All other tool
   * names are ignored. Status is restored to "listening" either way.
   */
  private async handleToolCall(event: RealtimeServerEvent): Promise<void> {
    const key = event.item_id ?? "unknown";
    const buffered = this.toolBuffers.get(key);
    this.toolBuffers.delete(key);
    // The done event may repeat name/call_id; prefer the buffered values.
    const name = buffered?.name || event.name || "";
    const callId = buffered?.callId || event.call_id || "";
    if (name !== REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME || !callId) {
      return;
    }
    this.callbacks.onStatus?.("thinking");
    let question = "";
    try {
      const args = JSON.parse(buffered?.args || event.arguments || "{}") as {
        question?: unknown;
        context?: unknown;
        responseStyle?: unknown;
      };
      question = typeof args.question === "string" ? args.question.trim() : "";
      const context = typeof args.context === "string" ? args.context.trim() : "";
      const responseStyle = typeof args.responseStyle === "string" ? args.responseStyle.trim() : "";
      if (context || responseStyle) {
        question = [
          question,
          context ? `Context:\n${context}` : undefined,
          responseStyle ? `Spoken style:\n${responseStyle}` : undefined,
        ]
          .filter(Boolean)
          .join("\n\n");
      }
    } catch {
      // Malformed arguments fall through to the "requires a question" error.
    }
    if (!question) {
      this.submitToolResult(callId, {
        error: `${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} requires a question`,
      });
      this.callbacks.onStatus?.("listening");
      return;
    }
    try {
      const idempotencyKey = generateUUID();
      const response = await this.client.request<{ runId?: string }>("chat.send", {
        sessionKey: this.sessionKey,
        message: question,
        idempotencyKey,
      });
      const result = await waitForChatResult({
        client: this.client,
        // Fall back to the idempotency key when the gateway omits runId.
        runId: response.runId ?? idempotencyKey,
        timeoutMs: 120_000,
      });
      this.submitToolResult(callId, { result });
    } catch (error) {
      this.submitToolResult(callId, {
        error: error instanceof Error ? error.message : String(error),
      });
    } finally {
      this.callbacks.onStatus?.("listening");
    }
  }

  /** Returns a tool result to the model and asks it to speak a response. */
  private submitToolResult(callId: string, result: unknown): void {
    this.send({
      type: "conversation.item.create",
      item: {
        type: "function_call_output",
        call_id: callId,
        output: JSON.stringify(result),
      },
    });
    this.send({ type: "response.create" });
  }
}

View File

@@ -223,6 +223,8 @@ export type GatewayBrowserClientOptions = {
onGap?: (info: { expected: number; received: number }) => void;
};
export type GatewayEventListener = (evt: GatewayEventFrame) => void;
// 4008 = application-defined code (browser rejects 1008 "Policy Violation")
const CONNECT_FAILED_CLOSE_CODE = 4008;
@@ -298,6 +300,7 @@ export class GatewayBrowserClient {
private pendingConnectError: GatewayErrorInfo | undefined;
private pendingDeviceTokenRetry = false;
private deviceTokenRetryBudgetUsed = false;
private eventListeners = new Set<GatewayEventListener>();
constructor(private opts: GatewayBrowserClientOptions) {}
@@ -549,6 +552,9 @@ export class GatewayBrowserClient {
}
try {
this.opts.onEvent?.(evt);
for (const listener of this.eventListeners) {
listener(evt);
}
} catch (err) {
console.error("[gateway] event handler error:", err);
}
@@ -625,6 +631,13 @@ export class GatewayBrowserClient {
return p;
}
/** Adds `listener` to the client's event-listener set; the returned function removes it. */
addEventListener(listener: GatewayEventListener): () => void {
this.eventListeners.add(listener);
return () => {
this.eventListeners.delete(listener);
};
}
private queueConnect() {
this.connectNonce = null;
this.connectSent = false;

View File

@@ -18,6 +18,7 @@ import {
import { InputHistory } from "../chat/input-history.ts";
import { PinnedMessages } from "../chat/pinned-messages.ts";
import { getPinnedMessageSummary } from "../chat/pinned-summary.ts";
import type { RealtimeTalkStatus } from "../chat/realtime-talk.ts";
import { renderChatRunControls } from "../chat/run-controls.ts";
import { getOrCreateSessionCacheValue } from "../chat/session-cache.ts";
import { renderSideResult } from "../chat/side-result-render.ts";
@@ -65,6 +66,10 @@ export type ChatProps = {
assistantAvatarUrl?: string | null;
draft: string;
queue: ChatQueueItem[];
realtimeTalkActive?: boolean;
realtimeTalkStatus?: RealtimeTalkStatus;
realtimeTalkDetail?: string | null;
realtimeTalkTranscript?: string | null;
connected: boolean;
canSend: boolean;
disabledReason: string | null;
@@ -95,6 +100,7 @@ export type ChatProps = {
onDraftChange: (next: string) => void;
onRequestUpdate?: () => void;
onSend: () => void;
onToggleRealtimeTalk?: () => void;
onAbort?: () => void;
onQueueRemove: (id: string) => void;
onQueueSteer?: (id: string) => void;
@@ -1207,6 +1213,19 @@ export function renderChat(props: ChatProps) {
${vs.sttRecording && vs.sttInterimText
? html`<div class="agent-chat__stt-interim">${vs.sttInterimText}</div>`
: nothing}
${props.realtimeTalkActive || props.realtimeTalkDetail || props.realtimeTalkTranscript
? html`
<div class="agent-chat__stt-interim agent-chat__talk-status">
${props.realtimeTalkDetail ??
props.realtimeTalkTranscript ??
(props.realtimeTalkStatus === "thinking"
? "Asking OpenClaw..."
: props.realtimeTalkStatus === "connecting"
? "Connecting Talk..."
: "Talk live")}
</div>
`
: nothing}
<textarea
${ref((el) => el && adjustTextareaHeight(el as HTMLTextAreaElement))}
@@ -1288,6 +1307,21 @@ export function renderChat(props: ChatProps) {
</button>
`
: nothing}
${props.onToggleRealtimeTalk
? html`
<button
class="agent-chat__input-btn ${props.realtimeTalkActive
? "agent-chat__input-btn--talk"
: ""}"
@click=${props.onToggleRealtimeTalk}
title=${props.realtimeTalkActive ? "Stop Talk" : "Start Talk"}
aria-label=${props.realtimeTalkActive ? "Stop Talk" : "Start Talk"}
?disabled=${!props.connected}
>
${props.realtimeTalkActive ? icons.volume2 : icons.radio}
</button>
`
: nothing}
${tokens ? html`<span class="agent-chat__token-count">${tokens}</span>` : nothing}
</div>