mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:40:44 +00:00
feat: add browser realtime talk
This commit is contained in:
@@ -7,6 +7,7 @@ Docs: https://docs.openclaw.ai
|
||||
### Changes
|
||||
|
||||
- Control UI/chat: add a Steer action on queued messages so a browser follow-up can be injected into the active run without retyping it.
|
||||
- Control UI/Talk: add browser WebRTC realtime voice sessions backed by OpenAI Realtime, with Gateway-minted ephemeral client secrets and `openclaw_agent_consult` handoff to the full OpenClaw agent.
|
||||
- Agents/tools: add optional per-call `timeoutMs` support for image, video, music, and TTS generation tools so agents can extend provider request timeouts only when a specific generation needs it.
|
||||
- Agents/subagents: add optional forked context for native `sessions_spawn` runs so agents can let a child inherit the requester transcript when needed, while keeping clean isolated sessions as the default; includes prompt guidance, context-engine hook metadata, docs, and QA coverage.
|
||||
- Codex harness: add structured debug logging for embedded harness selection decisions so `/status` stays simple while gateway logs explain auto-selection and Pi fallback reasons. (#70760) Thanks @100yenadmin.
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
793ed905cb0ba93b9a2f8c2c85c3cfb4d194dd9263353e74952bf9e382b03dc2 plugin-sdk-api-baseline.json
|
||||
032e7fd6f48344c9b3b98fd3e877e6d30cab92ed9a39dd309796cf1f0220820f plugin-sdk-api-baseline.jsonl
|
||||
96905c33f4498446f612ae17dee6affdf84ef0e2e5a0f25bf7191c315f5b826f plugin-sdk-api-baseline.json
|
||||
d8eb6331562fde29531eaac18409bb7fabcc70623bf25395f8e5710a49765f0f plugin-sdk-api-baseline.jsonl
|
||||
|
||||
@@ -25,19 +25,19 @@ API-enabled model such as `openai/gpt-5.4` for `OPENAI_API_KEY` setups.
|
||||
|
||||
## OpenClaw feature coverage
|
||||
|
||||
| OpenAI capability | OpenClaw surface | Status |
|
||||
| ------------------------- | ------------------------------------------------------ | ------------------------------------------------------ |
|
||||
| Chat / Responses | `openai/<model>` model provider | Yes |
|
||||
| Codex subscription models | `openai-codex/<model>` with `openai-codex` OAuth | Yes |
|
||||
| Codex app-server harness | `openai/<model>` with `embeddedHarness.runtime: codex` | Yes |
|
||||
| Server-side web search | Native OpenAI Responses tool | Yes, when web search is enabled and no provider pinned |
|
||||
| Images | `image_generate` | Yes |
|
||||
| Videos | `video_generate` | Yes |
|
||||
| Text-to-speech | `messages.tts.provider: "openai"` / `tts` | Yes |
|
||||
| Batch speech-to-text | `tools.media.audio` / media understanding | Yes |
|
||||
| Streaming speech-to-text | Voice Call `streaming.provider: "openai"` | Yes |
|
||||
| Realtime voice | Voice Call `realtime.provider: "openai"` | Yes |
|
||||
| Embeddings | memory embedding provider | Yes |
|
||||
| OpenAI capability | OpenClaw surface | Status |
|
||||
| ------------------------- | ---------------------------------------------------------- | ------------------------------------------------------ |
|
||||
| Chat / Responses | `openai/<model>` model provider | Yes |
|
||||
| Codex subscription models | `openai-codex/<model>` with `openai-codex` OAuth | Yes |
|
||||
| Codex app-server harness | `openai/<model>` with `embeddedHarness.runtime: codex` | Yes |
|
||||
| Server-side web search | Native OpenAI Responses tool | Yes, when web search is enabled and no provider pinned |
|
||||
| Images | `image_generate` | Yes |
|
||||
| Videos | `video_generate` | Yes |
|
||||
| Text-to-speech | `messages.tts.provider: "openai"` / `tts` | Yes |
|
||||
| Batch speech-to-text | `tools.media.audio` / media understanding | Yes |
|
||||
| Streaming speech-to-text | Voice Call `streaming.provider: "openai"` | Yes |
|
||||
| Realtime voice | Voice Call `realtime.provider: "openai"` / Control UI Talk | Yes |
|
||||
| Embeddings | memory embedding provider | Yes |
|
||||
|
||||
## Getting started
|
||||
|
||||
|
||||
@@ -105,6 +105,11 @@ locale picker lives in the Gateway Access card, not under Appearance.
|
||||
## What it can do (today)
|
||||
|
||||
- Chat with the model via Gateway WS (`chat.history`, `chat.send`, `chat.abort`, `chat.inject`)
|
||||
- Talk to OpenAI Realtime directly from the browser via WebRTC. The Gateway
|
||||
mints a short-lived Realtime client secret with `talk.realtime.session`; the
|
||||
browser sends microphone audio directly to OpenAI and relays
|
||||
`openclaw_agent_consult` tool calls back through `chat.send` for the larger
|
||||
configured OpenClaw model.
|
||||
- Stream tool calls + live tool output cards in Chat (agent events)
|
||||
- Channels: built-in plus bundled/external plugin channels status, QR login, and per-channel config (`channels.status`, `web.login.*`, `config.patch`)
|
||||
- Instances: presence list + refresh (`system-presence`)
|
||||
@@ -151,6 +156,10 @@ Cron jobs panel notes:
|
||||
- `chat.history` also strips display-only inline directive tags from visible assistant text (for example `[[reply_to_*]]` and `[[audio_as_voice]]`), plain-text tool-call XML payloads (including `<tool_call>...</tool_call>`, `<function_call>...</function_call>`, `<tool_calls>...</tool_calls>`, `<function_calls>...</function_calls>`, and truncated tool-call blocks), and leaked ASCII/full-width model control tokens, and omits assistant entries whose whole visible text is only the exact silent token `NO_REPLY` / `no_reply`.
|
||||
- `chat.inject` appends an assistant note to the session transcript and broadcasts a `chat` event for UI-only updates (no agent run, no channel delivery).
|
||||
- The chat header model and thinking pickers patch the active session immediately through `sessions.patch`; they are persistent session overrides, not one-turn-only send options.
|
||||
- Talk mode uses the registered realtime voice provider. Configure OpenAI with
|
||||
`talk.provider: "openai"` plus `talk.providers.openai.apiKey`, or reuse the
|
||||
Voice Call realtime provider config. The browser never receives the standard
|
||||
OpenAI API key; it receives only the ephemeral Realtime client secret.
|
||||
- Stop:
|
||||
- Click **Stop** (calls `chat.abort`)
|
||||
- While a run is active, normal follow-ups queue. Click **Steer** on a queued message to inject that follow-up into the running turn.
|
||||
|
||||
@@ -1,7 +1,11 @@
|
||||
import { randomUUID } from "node:crypto";
|
||||
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime";
|
||||
import type { PluginRuntime, RuntimeLogger } from "openclaw/plugin-sdk/plugin-runtime";
|
||||
import type { RealtimeVoiceTool } from "openclaw/plugin-sdk/realtime-voice";
|
||||
import {
|
||||
REALTIME_VOICE_AGENT_CONSULT_TOOL,
|
||||
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
|
||||
type RealtimeVoiceTool,
|
||||
} from "openclaw/plugin-sdk/realtime-voice";
|
||||
import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime";
|
||||
import type { GoogleMeetConfig, GoogleMeetToolPolicy } from "./config.js";
|
||||
|
||||
@@ -11,32 +15,8 @@ type AgentPayload = {
|
||||
isReasoning?: boolean;
|
||||
};
|
||||
|
||||
export const GOOGLE_MEET_AGENT_CONSULT_TOOL_NAME = "openclaw_agent_consult";
|
||||
|
||||
export const GOOGLE_MEET_AGENT_CONSULT_TOOL: RealtimeVoiceTool = {
|
||||
type: "function",
|
||||
name: GOOGLE_MEET_AGENT_CONSULT_TOOL_NAME,
|
||||
description:
|
||||
"Ask the full OpenClaw agent for deeper reasoning, current information, or tool-backed help before speaking in the meeting.",
|
||||
parameters: {
|
||||
type: "object",
|
||||
properties: {
|
||||
question: {
|
||||
type: "string",
|
||||
description: "The concrete question or task the meeting participant asked.",
|
||||
},
|
||||
context: {
|
||||
type: "string",
|
||||
description: "Optional relevant meeting context or transcript summary.",
|
||||
},
|
||||
responseStyle: {
|
||||
type: "string",
|
||||
description: "Optional style hint for the spoken answer.",
|
||||
},
|
||||
},
|
||||
required: ["question"],
|
||||
},
|
||||
};
|
||||
export const GOOGLE_MEET_AGENT_CONSULT_TOOL_NAME = REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME;
|
||||
export const GOOGLE_MEET_AGENT_CONSULT_TOOL = REALTIME_VOICE_AGENT_CONSULT_TOOL;
|
||||
|
||||
export function resolveGoogleMeetRealtimeTools(policy: GoogleMeetToolPolicy): RealtimeVoiceTool[] {
|
||||
return policy === "none" ? [] : [GOOGLE_MEET_AGENT_CONSULT_TOOL];
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import { REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME } from "openclaw/plugin-sdk/realtime-voice";
|
||||
import {
|
||||
normalizeOptionalLowercaseString,
|
||||
normalizeOptionalString,
|
||||
@@ -94,8 +95,7 @@ export const DEFAULT_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND = [
|
||||
"-",
|
||||
] as const;
|
||||
|
||||
export const DEFAULT_GOOGLE_MEET_REALTIME_INSTRUCTIONS =
|
||||
"You are joining a private Google Meet as an OpenClaw agent. Keep spoken replies brief and natural. When a question needs deeper reasoning, current information, or tools, call openclaw_agent_consult before answering.";
|
||||
export const DEFAULT_GOOGLE_MEET_REALTIME_INSTRUCTIONS = `You are joining a private Google Meet as an OpenClaw agent. Keep spoken replies brief and natural. When a question needs deeper reasoning, current information, or tools, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} before answering.`;
|
||||
|
||||
export const DEFAULT_GOOGLE_MEET_CONFIG: GoogleMeetConfig = {
|
||||
enabled: true,
|
||||
|
||||
@@ -6,6 +6,8 @@ import {
|
||||
} from "openclaw/plugin-sdk/proxy-capture";
|
||||
import type {
|
||||
RealtimeVoiceBridge,
|
||||
RealtimeVoiceBrowserSession,
|
||||
RealtimeVoiceBrowserSessionCreateRequest,
|
||||
RealtimeVoiceBridgeCreateRequest,
|
||||
RealtimeVoiceProviderConfig,
|
||||
RealtimeVoiceProviderPlugin,
|
||||
@@ -59,6 +61,8 @@ type OpenAIRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & {
|
||||
azureApiVersion?: string;
|
||||
};
|
||||
|
||||
const OPENAI_REALTIME_DEFAULT_MODEL = "gpt-realtime-1.5";
|
||||
|
||||
type RealtimeEvent = {
|
||||
type: string;
|
||||
delta?: string;
|
||||
@@ -117,7 +121,7 @@ function base64ToBuffer(b64: string): Buffer {
|
||||
}
|
||||
|
||||
class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
private static readonly DEFAULT_MODEL = "gpt-realtime-1.5";
|
||||
private static readonly DEFAULT_MODEL = OPENAI_REALTIME_DEFAULT_MODEL;
|
||||
private static readonly MAX_RECONNECT_ATTEMPTS = 5;
|
||||
private static readonly BASE_RECONNECT_DELAY_MS = 1000;
|
||||
private static readonly CONNECT_TIMEOUT_MS = 10_000;
|
||||
@@ -579,6 +583,77 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
}
|
||||
}
|
||||
|
||||
function readStringField(value: unknown, key: string): string | undefined {
|
||||
if (!value || typeof value !== "object") {
|
||||
return undefined;
|
||||
}
|
||||
const raw = (value as Record<string, unknown>)[key];
|
||||
return typeof raw === "string" && raw.trim() ? raw.trim() : undefined;
|
||||
}
|
||||
|
||||
async function createOpenAIRealtimeBrowserSession(
|
||||
req: RealtimeVoiceBrowserSessionCreateRequest,
|
||||
): Promise<RealtimeVoiceBrowserSession> {
|
||||
const config = normalizeProviderConfig(req.providerConfig);
|
||||
const apiKey = config.apiKey || process.env.OPENAI_API_KEY;
|
||||
if (!apiKey) {
|
||||
throw new Error("OpenAI API key missing");
|
||||
}
|
||||
if (config.azureEndpoint || config.azureDeployment) {
|
||||
throw new Error("OpenAI Realtime browser sessions do not support Azure endpoints yet");
|
||||
}
|
||||
|
||||
const model = req.model ?? config.model ?? OPENAI_REALTIME_DEFAULT_MODEL;
|
||||
const voice = (req.voice ?? config.voice ?? "alloy") as OpenAIRealtimeVoice;
|
||||
const session: Record<string, unknown> = {
|
||||
type: "realtime",
|
||||
model,
|
||||
instructions: req.instructions,
|
||||
audio: {
|
||||
output: { voice },
|
||||
},
|
||||
};
|
||||
if (req.tools && req.tools.length > 0) {
|
||||
session.tools = req.tools;
|
||||
session.tool_choice = "auto";
|
||||
}
|
||||
|
||||
const response = await fetch("https://api.openai.com/v1/realtime/client_secrets", {
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Bearer ${apiKey}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({ session }),
|
||||
});
|
||||
if (!response.ok) {
|
||||
const detail = await response.text().catch(() => "");
|
||||
throw new Error(
|
||||
`OpenAI Realtime browser session failed (${response.status}): ${detail || response.statusText}`,
|
||||
);
|
||||
}
|
||||
const payload = (await response.json()) as unknown;
|
||||
const nestedSecret =
|
||||
payload && typeof payload === "object"
|
||||
? (payload as Record<string, unknown>).client_secret
|
||||
: undefined;
|
||||
const clientSecret = readStringField(payload, "value") ?? readStringField(nestedSecret, "value");
|
||||
if (!clientSecret) {
|
||||
throw new Error("OpenAI Realtime browser session did not return a client secret");
|
||||
}
|
||||
const expiresAt =
|
||||
payload && typeof payload === "object"
|
||||
? (payload as Record<string, unknown>).expires_at
|
||||
: undefined;
|
||||
return {
|
||||
provider: "openai",
|
||||
clientSecret,
|
||||
model,
|
||||
voice,
|
||||
...(typeof expiresAt === "number" ? { expiresAt } : {}),
|
||||
};
|
||||
}
|
||||
|
||||
export function buildOpenAIRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin {
|
||||
return {
|
||||
id: "openai",
|
||||
@@ -607,6 +682,7 @@ export function buildOpenAIRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin
|
||||
azureApiVersion: config.azureApiVersion,
|
||||
});
|
||||
},
|
||||
createBrowserSession: createOpenAIRealtimeBrowserSession,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -125,6 +125,7 @@ const METHOD_SCOPE_GROUPS: Record<OperatorScope, readonly string[]> = {
|
||||
"agent.wait",
|
||||
"wake",
|
||||
"talk.mode",
|
||||
"talk.realtime.session",
|
||||
"talk.speak",
|
||||
"tts.enable",
|
||||
"tts.disable",
|
||||
|
||||
@@ -52,6 +52,10 @@ import {
|
||||
TalkConfigParamsSchema,
|
||||
type TalkConfigResult,
|
||||
TalkConfigResultSchema,
|
||||
type TalkRealtimeSessionParams,
|
||||
TalkRealtimeSessionParamsSchema,
|
||||
type TalkRealtimeSessionResult,
|
||||
TalkRealtimeSessionResultSchema,
|
||||
type TalkSpeakParams,
|
||||
TalkSpeakParamsSchema,
|
||||
type TalkSpeakResult,
|
||||
@@ -428,6 +432,12 @@ export const validateWizardStatusParams = ajv.compile<WizardStatusParams>(Wizard
|
||||
export const validateTalkModeParams = ajv.compile<TalkModeParams>(TalkModeParamsSchema);
|
||||
export const validateTalkConfigParams = ajv.compile<TalkConfigParams>(TalkConfigParamsSchema);
|
||||
export const validateTalkConfigResult = ajv.compile<TalkConfigResult>(TalkConfigResultSchema);
|
||||
export const validateTalkRealtimeSessionParams = ajv.compile<TalkRealtimeSessionParams>(
|
||||
TalkRealtimeSessionParamsSchema,
|
||||
);
|
||||
export const validateTalkRealtimeSessionResult = ajv.compile<TalkRealtimeSessionResult>(
|
||||
TalkRealtimeSessionResultSchema,
|
||||
);
|
||||
export const validateTalkSpeakParams = ajv.compile<TalkSpeakParams>(TalkSpeakParamsSchema);
|
||||
export const validateTalkSpeakResult = ajv.compile<TalkSpeakResult>(TalkSpeakResultSchema);
|
||||
export const validateChannelsStatusParams = ajv.compile<ChannelsStatusParams>(
|
||||
@@ -616,6 +626,8 @@ export {
|
||||
WizardStatusResultSchema,
|
||||
TalkConfigParamsSchema,
|
||||
TalkConfigResultSchema,
|
||||
TalkRealtimeSessionParamsSchema,
|
||||
TalkRealtimeSessionResultSchema,
|
||||
TalkSpeakParamsSchema,
|
||||
TalkSpeakResultSchema,
|
||||
ChannelsStatusParamsSchema,
|
||||
@@ -720,6 +732,8 @@ export type {
|
||||
WizardStatusResult,
|
||||
TalkConfigParams,
|
||||
TalkConfigResult,
|
||||
TalkRealtimeSessionParams,
|
||||
TalkRealtimeSessionResult,
|
||||
TalkSpeakParams,
|
||||
TalkSpeakResult,
|
||||
TalkModeParams,
|
||||
|
||||
@@ -36,6 +36,28 @@ export const TalkSpeakParamsSchema = Type.Object(
|
||||
{ additionalProperties: false },
|
||||
);
|
||||
|
||||
export const TalkRealtimeSessionParamsSchema = Type.Object(
|
||||
{
|
||||
sessionKey: Type.Optional(Type.String()),
|
||||
provider: Type.Optional(Type.String()),
|
||||
model: Type.Optional(Type.String()),
|
||||
voice: Type.Optional(Type.String()),
|
||||
instructions: Type.Optional(Type.String()),
|
||||
},
|
||||
{ additionalProperties: false },
|
||||
);
|
||||
|
||||
export const TalkRealtimeSessionResultSchema = Type.Object(
|
||||
{
|
||||
provider: NonEmptyString,
|
||||
clientSecret: NonEmptyString,
|
||||
model: Type.Optional(Type.String()),
|
||||
voice: Type.Optional(Type.String()),
|
||||
expiresAt: Type.Optional(Type.Number()),
|
||||
},
|
||||
{ additionalProperties: false },
|
||||
);
|
||||
|
||||
const talkProviderFieldSchemas = {
|
||||
apiKey: Type.Optional(SecretInputSchema),
|
||||
};
|
||||
|
||||
@@ -54,6 +54,8 @@ import {
|
||||
ChannelsLogoutParamsSchema,
|
||||
TalkConfigParamsSchema,
|
||||
TalkConfigResultSchema,
|
||||
TalkRealtimeSessionParamsSchema,
|
||||
TalkRealtimeSessionResultSchema,
|
||||
TalkSpeakParamsSchema,
|
||||
TalkSpeakResultSchema,
|
||||
ChannelsStatusParamsSchema,
|
||||
@@ -279,6 +281,8 @@ export const ProtocolSchemas = {
|
||||
TalkModeParams: TalkModeParamsSchema,
|
||||
TalkConfigParams: TalkConfigParamsSchema,
|
||||
TalkConfigResult: TalkConfigResultSchema,
|
||||
TalkRealtimeSessionParams: TalkRealtimeSessionParamsSchema,
|
||||
TalkRealtimeSessionResult: TalkRealtimeSessionResultSchema,
|
||||
TalkSpeakParams: TalkSpeakParamsSchema,
|
||||
TalkSpeakResult: TalkSpeakResultSchema,
|
||||
ChannelsStatusParams: ChannelsStatusParamsSchema,
|
||||
|
||||
@@ -80,6 +80,8 @@ export type WizardStatusResult = SchemaType<"WizardStatusResult">;
|
||||
export type TalkModeParams = SchemaType<"TalkModeParams">;
|
||||
export type TalkConfigParams = SchemaType<"TalkConfigParams">;
|
||||
export type TalkConfigResult = SchemaType<"TalkConfigResult">;
|
||||
export type TalkRealtimeSessionParams = SchemaType<"TalkRealtimeSessionParams">;
|
||||
export type TalkRealtimeSessionResult = SchemaType<"TalkRealtimeSessionResult">;
|
||||
export type TalkSpeakParams = SchemaType<"TalkSpeakParams">;
|
||||
export type TalkSpeakResult = SchemaType<"TalkSpeakResult">;
|
||||
export type ChannelsStatusParams = SchemaType<"ChannelsStatusParams">;
|
||||
|
||||
@@ -48,6 +48,7 @@ const BASE_METHODS = [
|
||||
"wizard.cancel",
|
||||
"wizard.status",
|
||||
"talk.config",
|
||||
"talk.realtime.session",
|
||||
"talk.speak",
|
||||
"talk.mode",
|
||||
"commands.list",
|
||||
|
||||
@@ -7,6 +7,13 @@ import {
|
||||
} from "../../config/talk.js";
|
||||
import type { TalkConfigResponse, TalkProviderConfig } from "../../config/types.gateway.js";
|
||||
import type { OpenClawConfig, TtsConfig, TtsProviderConfigMap } from "../../config/types.js";
|
||||
import {
|
||||
REALTIME_VOICE_AGENT_CONSULT_TOOL,
|
||||
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
|
||||
} from "../../realtime-voice/agent-consult-tool.js";
|
||||
import { getRealtimeVoiceProvider } from "../../realtime-voice/provider-registry.js";
|
||||
import { resolveConfiguredRealtimeVoiceProvider } from "../../realtime-voice/provider-resolver.js";
|
||||
import type { RealtimeVoiceProviderConfig } from "../../realtime-voice/provider-types.js";
|
||||
import {
|
||||
normalizeLowercaseStringOrEmpty,
|
||||
normalizeOptionalLowercaseString,
|
||||
@@ -22,6 +29,7 @@ import {
|
||||
type TalkSpeakParams,
|
||||
validateTalkConfigParams,
|
||||
validateTalkModeParams,
|
||||
validateTalkRealtimeSessionParams,
|
||||
validateTalkSpeakParams,
|
||||
} from "../protocol/index.js";
|
||||
import { formatForLog } from "../ws-log.js";
|
||||
@@ -136,6 +144,63 @@ function buildTalkTtsConfig(
|
||||
};
|
||||
}
|
||||
|
||||
function getRecord(value: unknown): Record<string, unknown> | undefined {
|
||||
return asRecord(value) ?? undefined;
|
||||
}
|
||||
|
||||
function getVoiceCallRealtimeConfig(config: OpenClawConfig): {
|
||||
provider?: string;
|
||||
providers?: Record<string, RealtimeVoiceProviderConfig>;
|
||||
} {
|
||||
const plugins = getRecord(config.plugins);
|
||||
const entries = getRecord(plugins?.entries);
|
||||
const voiceCall = getRecord(entries?.["voice-call"]);
|
||||
const pluginConfig = getRecord(voiceCall?.config);
|
||||
const realtime = getRecord(pluginConfig?.realtime);
|
||||
const providersRaw = getRecord(realtime?.providers);
|
||||
const providers: Record<string, RealtimeVoiceProviderConfig> = {};
|
||||
if (providersRaw) {
|
||||
for (const [providerId, providerConfig] of Object.entries(providersRaw)) {
|
||||
const record = getRecord(providerConfig);
|
||||
if (record) {
|
||||
providers[providerId] = record;
|
||||
}
|
||||
}
|
||||
}
|
||||
return {
|
||||
provider: normalizeOptionalString(realtime?.provider),
|
||||
providers: Object.keys(providers).length > 0 ? providers : undefined,
|
||||
};
|
||||
}
|
||||
|
||||
function buildTalkRealtimeConfig(config: OpenClawConfig, requestedProvider?: string) {
|
||||
const voiceCallRealtime = getVoiceCallRealtimeConfig(config);
|
||||
const talkProviderConfigs = config.talk?.providers as
|
||||
| Record<string, RealtimeVoiceProviderConfig>
|
||||
| undefined;
|
||||
const talkProvider = normalizeOptionalString(config.talk?.provider);
|
||||
const talkProviderSupportsRealtime = talkProvider
|
||||
? Boolean(getRealtimeVoiceProvider(talkProvider, config))
|
||||
: false;
|
||||
const provider =
|
||||
normalizeOptionalString(requestedProvider) ??
|
||||
(talkProviderSupportsRealtime ? talkProvider : undefined) ??
|
||||
voiceCallRealtime.provider;
|
||||
return {
|
||||
provider,
|
||||
providers: {
|
||||
...voiceCallRealtime.providers,
|
||||
...talkProviderConfigs,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
function buildRealtimeInstructions(extra: string | undefined): string {
|
||||
const base = `You are OpenClaw's realtime voice interface. Keep spoken replies concise. If the user asks for code, repository state, tools, files, current OpenClaw context, or deeper reasoning, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} and then summarize the result naturally.`;
|
||||
const trimmed = normalizeOptionalString(extra);
|
||||
return trimmed ? `${base}\n\n${trimmed}` : base;
|
||||
}
|
||||
|
||||
function isFallbackEligibleTalkReason(reason: TalkSpeakReason): boolean {
|
||||
return (
|
||||
reason === "talk_unconfigured" ||
|
||||
@@ -334,6 +399,67 @@ export const talkHandlers: GatewayRequestHandlers = {
|
||||
|
||||
respond(true, { config: configPayload }, undefined);
|
||||
},
|
||||
"talk.realtime.session": async ({ params, respond }) => {
|
||||
if (!validateTalkRealtimeSessionParams(params)) {
|
||||
respond(
|
||||
false,
|
||||
undefined,
|
||||
errorShape(
|
||||
ErrorCodes.INVALID_REQUEST,
|
||||
`invalid talk.realtime.session params: ${formatValidationErrors(validateTalkRealtimeSessionParams.errors)}`,
|
||||
),
|
||||
);
|
||||
return;
|
||||
}
|
||||
const typedParams = params as {
|
||||
provider?: string;
|
||||
model?: string;
|
||||
voice?: string;
|
||||
instructions?: string;
|
||||
};
|
||||
try {
|
||||
const runtimeConfig = loadConfig();
|
||||
const realtimeConfig = buildTalkRealtimeConfig(runtimeConfig, typedParams.provider);
|
||||
const resolution = resolveConfiguredRealtimeVoiceProvider({
|
||||
configuredProviderId: realtimeConfig.provider,
|
||||
providerConfigs: realtimeConfig.providers,
|
||||
cfg: runtimeConfig,
|
||||
cfgForResolve: runtimeConfig,
|
||||
noRegisteredProviderMessage: "No realtime voice provider registered",
|
||||
});
|
||||
if (!resolution.provider.createBrowserSession) {
|
||||
respond(
|
||||
false,
|
||||
undefined,
|
||||
errorShape(
|
||||
ErrorCodes.UNAVAILABLE,
|
||||
`Realtime voice provider "${resolution.provider.id}" does not support browser WebRTC sessions`,
|
||||
),
|
||||
);
|
||||
return;
|
||||
}
|
||||
const session = await resolution.provider.createBrowserSession({
|
||||
providerConfig: resolution.providerConfig,
|
||||
instructions: buildRealtimeInstructions(typedParams.instructions),
|
||||
tools: [REALTIME_VOICE_AGENT_CONSULT_TOOL],
|
||||
model: normalizeOptionalString(typedParams.model),
|
||||
voice: normalizeOptionalString(typedParams.voice),
|
||||
});
|
||||
respond(
|
||||
true,
|
||||
{
|
||||
provider: session.provider,
|
||||
clientSecret: session.clientSecret,
|
||||
...(session.model ? { model: session.model } : {}),
|
||||
...(session.voice ? { voice: session.voice } : {}),
|
||||
...(typeof session.expiresAt === "number" ? { expiresAt: session.expiresAt } : {}),
|
||||
},
|
||||
undefined,
|
||||
);
|
||||
} catch (err) {
|
||||
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
|
||||
}
|
||||
},
|
||||
"talk.speak": async ({ params, respond }) => {
|
||||
if (!validateTalkSpeakParams(params)) {
|
||||
respond(
|
||||
|
||||
@@ -2,6 +2,8 @@ export type { RealtimeVoiceProviderPlugin } from "../plugins/types.js";
|
||||
export type {
|
||||
RealtimeVoiceBridge,
|
||||
RealtimeVoiceBridgeCallbacks,
|
||||
RealtimeVoiceBrowserSession,
|
||||
RealtimeVoiceBrowserSessionCreateRequest,
|
||||
RealtimeVoiceBridgeCreateRequest,
|
||||
RealtimeVoiceCloseReason,
|
||||
RealtimeVoiceProviderConfig,
|
||||
@@ -12,6 +14,10 @@ export type {
|
||||
RealtimeVoiceTool,
|
||||
RealtimeVoiceToolCallEvent,
|
||||
} from "../realtime-voice/provider-types.js";
|
||||
export {
|
||||
REALTIME_VOICE_AGENT_CONSULT_TOOL,
|
||||
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
|
||||
} from "../realtime-voice/agent-consult-tool.js";
|
||||
export {
|
||||
canonicalizeRealtimeVoiceProviderId,
|
||||
getRealtimeVoiceProvider,
|
||||
|
||||
@@ -40,6 +40,8 @@ import type {
|
||||
} from "../realtime-transcription/provider-types.js";
|
||||
import type {
|
||||
RealtimeVoiceBridge,
|
||||
RealtimeVoiceBrowserSession,
|
||||
RealtimeVoiceBrowserSessionCreateRequest,
|
||||
RealtimeVoiceBridgeCreateRequest,
|
||||
RealtimeVoiceProviderConfig,
|
||||
RealtimeVoiceProviderConfiguredContext,
|
||||
@@ -1661,6 +1663,9 @@ export type RealtimeVoiceProviderPlugin = {
|
||||
resolveConfig?: (ctx: RealtimeVoiceProviderResolveConfigContext) => RealtimeVoiceProviderConfig;
|
||||
isConfigured: (ctx: RealtimeVoiceProviderConfiguredContext) => boolean;
|
||||
createBridge: (req: RealtimeVoiceBridgeCreateRequest) => RealtimeVoiceBridge;
|
||||
createBrowserSession?: (
|
||||
req: RealtimeVoiceBrowserSessionCreateRequest,
|
||||
) => Promise<RealtimeVoiceBrowserSession>;
|
||||
};
|
||||
|
||||
export type PluginRealtimeVoiceProviderEntry = RealtimeVoiceProviderPlugin & {
|
||||
|
||||
28
src/realtime-voice/agent-consult-tool.ts
Normal file
28
src/realtime-voice/agent-consult-tool.ts
Normal file
@@ -0,0 +1,28 @@
|
||||
import type { RealtimeVoiceTool } from "./provider-types.js";
|
||||
|
||||
export const REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME = "openclaw_agent_consult";
|
||||
|
||||
export const REALTIME_VOICE_AGENT_CONSULT_TOOL: RealtimeVoiceTool = {
|
||||
type: "function",
|
||||
name: REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
|
||||
description:
|
||||
"Ask the full OpenClaw agent for deeper reasoning, current information, or tool-backed help before speaking.",
|
||||
parameters: {
|
||||
type: "object",
|
||||
properties: {
|
||||
question: {
|
||||
type: "string",
|
||||
description: "The concrete question or task the user asked.",
|
||||
},
|
||||
context: {
|
||||
type: "string",
|
||||
description: "Optional relevant context or transcript summary.",
|
||||
},
|
||||
responseStyle: {
|
||||
type: "string",
|
||||
description: "Optional style hint for the spoken answer.",
|
||||
},
|
||||
},
|
||||
required: ["question"],
|
||||
},
|
||||
};
|
||||
@@ -53,6 +53,22 @@ export type RealtimeVoiceBridgeCreateRequest = RealtimeVoiceBridgeCallbacks & {
|
||||
tools?: RealtimeVoiceTool[];
|
||||
};
|
||||
|
||||
export type RealtimeVoiceBrowserSessionCreateRequest = {
|
||||
providerConfig: RealtimeVoiceProviderConfig;
|
||||
instructions?: string;
|
||||
tools?: RealtimeVoiceTool[];
|
||||
model?: string;
|
||||
voice?: string;
|
||||
};
|
||||
|
||||
export type RealtimeVoiceBrowserSession = {
|
||||
provider: RealtimeVoiceProviderId;
|
||||
clientSecret: string;
|
||||
model?: string;
|
||||
voice?: string;
|
||||
expiresAt?: number;
|
||||
};
|
||||
|
||||
export type RealtimeVoiceBridge = {
|
||||
connect(): Promise<void>;
|
||||
sendAudio(audio: Buffer): void;
|
||||
|
||||
@@ -584,6 +584,15 @@
|
||||
background: color-mix(in srgb, var(--accent) 12%, transparent);
|
||||
}
|
||||
|
||||
.agent-chat__input-btn--talk {
|
||||
color: var(--danger, #ef4444);
|
||||
background: color-mix(in srgb, var(--danger, #ef4444) 14%, transparent);
|
||||
}
|
||||
|
||||
.agent-chat__talk-status {
|
||||
color: var(--text);
|
||||
}
|
||||
|
||||
.agent-chat__input-divider {
|
||||
width: 1px;
|
||||
height: 16px;
|
||||
|
||||
@@ -33,6 +33,11 @@ type LifecycleHost = {
|
||||
allowExternalEmbedUrls: boolean;
|
||||
chatHasAutoScrolled: boolean;
|
||||
chatManualRefreshInFlight: boolean;
|
||||
realtimeTalkSession?: { stop: () => void } | null;
|
||||
realtimeTalkActive?: boolean;
|
||||
realtimeTalkStatus?: string;
|
||||
realtimeTalkDetail?: string | null;
|
||||
realtimeTalkTranscript?: string | null;
|
||||
chatLoading: boolean;
|
||||
chatMessages: unknown[];
|
||||
chatToolMessages: unknown[];
|
||||
@@ -77,6 +82,12 @@ export function handleDisconnected(host: LifecycleHost) {
|
||||
stopNodesPolling(host as unknown as Parameters<typeof stopNodesPolling>[0]);
|
||||
stopLogsPolling(host as unknown as Parameters<typeof stopLogsPolling>[0]);
|
||||
stopDebugPolling(host as unknown as Parameters<typeof stopDebugPolling>[0]);
|
||||
host.realtimeTalkSession?.stop();
|
||||
host.realtimeTalkSession = null;
|
||||
host.realtimeTalkActive = false;
|
||||
host.realtimeTalkStatus = "idle";
|
||||
host.realtimeTalkDetail = null;
|
||||
host.realtimeTalkTranscript = null;
|
||||
host.client?.stop();
|
||||
host.client = null;
|
||||
host.connected = false;
|
||||
|
||||
@@ -2228,6 +2228,10 @@ export function renderApp(state: AppViewState) {
|
||||
streamStartedAt: state.chatStreamStartedAt,
|
||||
draft: state.chatMessage,
|
||||
queue: state.chatQueue,
|
||||
realtimeTalkActive: state.realtimeTalkActive,
|
||||
realtimeTalkStatus: state.realtimeTalkStatus,
|
||||
realtimeTalkDetail: state.realtimeTalkDetail,
|
||||
realtimeTalkTranscript: state.realtimeTalkTranscript,
|
||||
connected: state.connected,
|
||||
canSend: state.connected,
|
||||
disabledReason: chatDisabledReason,
|
||||
@@ -2256,6 +2260,7 @@ export function renderApp(state: AppViewState) {
|
||||
attachments: state.chatAttachments,
|
||||
onAttachmentsChange: (next) => (state.chatAttachments = next),
|
||||
onSend: () => state.handleSendChat(),
|
||||
onToggleRealtimeTalk: () => state.toggleRealtimeTalk(),
|
||||
canAbort: Boolean(state.chatRunId),
|
||||
onAbort: () => void state.handleAbortChat(),
|
||||
onQueueRemove: (id) => state.removeQueuedMessage(id),
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import type { EventLogEntry } from "./app-events.ts";
|
||||
import type { CompactionStatus, FallbackStatus } from "./app-tool-stream.ts";
|
||||
import type { RealtimeTalkStatus } from "./chat/realtime-talk.ts";
|
||||
import type { ChatSideResult } from "./chat/side-result.ts";
|
||||
import type { CronModelSuggestionsState, CronState } from "./controllers/cron.ts";
|
||||
import type { DevicePairingList } from "./controllers/devices.ts";
|
||||
@@ -92,6 +93,10 @@ export type AppViewState = {
|
||||
chatModelsLoading: boolean;
|
||||
chatModelCatalog: ModelCatalogEntry[];
|
||||
chatQueue: ChatQueueItem[];
|
||||
realtimeTalkActive: boolean;
|
||||
realtimeTalkStatus: RealtimeTalkStatus;
|
||||
realtimeTalkDetail: string | null;
|
||||
realtimeTalkTranscript: string | null;
|
||||
chatManualRefreshInFlight: boolean;
|
||||
nodesLoading: boolean;
|
||||
nodes: Array<Record<string, unknown>>;
|
||||
@@ -425,6 +430,7 @@ export type AppViewState = {
|
||||
setPassword: (next: string) => void;
|
||||
setChatMessage: (next: string) => void;
|
||||
handleSendChat: (messageOverride?: string, opts?: { restoreDraft?: boolean }) => Promise<void>;
|
||||
toggleRealtimeTalk: () => Promise<void>;
|
||||
steerQueuedChatMessage: (id: string) => Promise<void>;
|
||||
handleAbortChat: () => Promise<void>;
|
||||
removeQueuedMessage: (id: string) => void;
|
||||
|
||||
@@ -57,6 +57,7 @@ import {
|
||||
import type { AppViewState } from "./app-view-state.ts";
|
||||
import { normalizeAssistantIdentity } from "./assistant-identity.ts";
|
||||
import { exportChatMarkdown } from "./chat/export.ts";
|
||||
import { RealtimeTalkSession, type RealtimeTalkStatus } from "./chat/realtime-talk.ts";
|
||||
import type { ChatSideResult } from "./chat/side-result.ts";
|
||||
import {
|
||||
loadToolsEffective as loadToolsEffectiveInternal,
|
||||
@@ -192,6 +193,11 @@ export class OpenClawApp extends LitElement {
|
||||
@state() chatModelCatalog: ModelCatalogEntry[] = [];
|
||||
@state() chatQueue: ChatQueueItem[] = [];
|
||||
@state() chatAttachments: ChatAttachment[] = [];
|
||||
@state() realtimeTalkActive = false;
|
||||
@state() realtimeTalkStatus: RealtimeTalkStatus = "idle";
|
||||
@state() realtimeTalkDetail: string | null = null;
|
||||
@state() realtimeTalkTranscript: string | null = null;
|
||||
private realtimeTalkSession: RealtimeTalkSession | null = null;
|
||||
@state() chatManualRefreshInFlight = false;
|
||||
@state() navDrawerOpen = false;
|
||||
|
||||
@@ -710,6 +716,51 @@ export class OpenClawApp extends LitElement {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Start or stop the browser realtime voice ("Talk") session.
 *
 * Acts as a toggle:
 *  - If a session already exists, stop it and reset all Talk-related UI state.
 *  - Otherwise, require a connected gateway client, then create and start a
 *    new RealtimeTalkSession, mirroring its status/transcript callbacks into
 *    reactive component state.
 */
async toggleRealtimeTalk() {
  // Toggle off: tear down the active session and clear all Talk UI state.
  if (this.realtimeTalkSession) {
    this.realtimeTalkSession.stop();
    this.realtimeTalkSession = null;
    this.realtimeTalkActive = false;
    this.realtimeTalkStatus = "idle";
    this.realtimeTalkDetail = null;
    this.realtimeTalkTranscript = null;
    return;
  }
  // Talk depends on the gateway (session minting + agent-consult handoff),
  // so refuse to start while disconnected.
  if (!this.client || !this.connected) {
    this.lastError = "Gateway not connected";
    return;
  }
  // Flip UI into "connecting" before the async start so feedback is immediate.
  this.realtimeTalkActive = true;
  this.realtimeTalkStatus = "connecting";
  this.realtimeTalkDetail = null;
  this.realtimeTalkTranscript = null;
  const session = new RealtimeTalkSession(this.client, this.sessionKey, {
    onStatus: (status, detail) => {
      this.realtimeTalkStatus = status;
      this.realtimeTalkDetail = detail ?? null;
      // "idle" deactivates the Talk UI; "error" keeps it active (the
      // expression is true for "error") so the failure stays visible until
      // the user toggles Talk off. NOTE(review): on an async "error" the
      // session object itself is not cleared here — the next toggle click
      // runs the teardown branch above; confirm that is the intended UX.
      if (status === "idle" || status === "error") {
        this.realtimeTalkActive = status !== "idle";
      }
    },
    onTranscript: (entry) => {
      // Only the most recent transcript line is shown, prefixed by speaker.
      this.realtimeTalkTranscript = `${entry.role === "user" ? "You" : "OpenClaw"}: ${entry.text}`;
    },
  });
  this.realtimeTalkSession = session;
  try {
    await session.start();
  } catch (error) {
    // start() does not clean up after itself on failure; stop() releases
    // any peer/mic/audio resources it managed to create.
    session.stop();
    // Guard against a concurrent toggle having replaced the session while
    // start() was awaiting.
    if (this.realtimeTalkSession === session) {
      this.realtimeTalkSession = null;
    }
    this.realtimeTalkActive = false;
    this.realtimeTalkStatus = "error";
    this.realtimeTalkDetail = error instanceof Error ? error.message : String(error);
    this.lastError = this.realtimeTalkDetail;
  }
}
|
||||
|
||||
async steerQueuedChatMessage(id: string) {
|
||||
await steerQueuedChatMessageInternal(
|
||||
this as unknown as Parameters<typeof steerQueuedChatMessageInternal>[0],
|
||||
|
||||
300
ui/src/ui/chat/realtime-talk.ts
Normal file
300
ui/src/ui/chat/realtime-talk.ts
Normal file
@@ -0,0 +1,300 @@
|
||||
import { REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME } from "../../../../src/realtime-voice/agent-consult-tool.js";
|
||||
import type { GatewayBrowserClient, GatewayEventFrame } from "../gateway.ts";
|
||||
import { generateUUID } from "../uuid.ts";
|
||||
|
||||
/** UI lifecycle states for a browser realtime voice ("Talk") session. */
export type RealtimeTalkStatus = "idle" | "connecting" | "listening" | "thinking" | "error";

/** Observer hooks a host component can pass to mirror session activity into UI state. */
export type RealtimeTalkCallbacks = {
  /** Invoked on status transitions; `detail` optionally carries a human-readable reason. */
  onStatus?: (status: RealtimeTalkStatus, detail?: string) => void;
  /** Invoked with completed speech transcripts for either side of the conversation. */
  onTranscript?: (entry: { role: "user" | "assistant"; text: string; final: boolean }) => void;
};

/** Gateway response for `talk.realtime.session`: an ephemeral provider credential. */
export type RealtimeTalkSessionResult = {
  provider: string;
  /** Short-lived client secret used as the Bearer token in the WebRTC SDP exchange. */
  clientSecret: string;
  model?: string;
  voice?: string;
  // Expiry for the client secret — presumably an epoch timestamp; units (s vs ms)
  // not established here, TODO confirm against the Gateway implementation.
  expiresAt?: number;
};

/** Loosely-typed subset of OpenAI Realtime data-channel server events this client inspects. */
type RealtimeServerEvent = {
  type?: string;
  item_id?: string;
  call_id?: string;
  name?: string;
  /** Incremental fragment of streamed function-call arguments (JSON text). */
  delta?: string;
  transcript?: string;
  /** Complete function-call arguments as a JSON string. */
  arguments?: string;
};

/** Accumulator for a streamed function call, keyed by the event's `item_id`. */
type ToolBuffer = {
  name: string;
  callId: string;
  /** Concatenation of argument JSON fragments received so far. */
  args: string;
};

/** Subset of the gateway "chat" event payload used when awaiting a run result. */
type ChatPayload = {
  runId?: string;
  state?: string;
  errorMessage?: string;
  message?: unknown;
};
|
||||
|
||||
function extractTextFromMessage(message: unknown): string {
|
||||
if (!message || typeof message !== "object") {
|
||||
return "";
|
||||
}
|
||||
const record = message as Record<string, unknown>;
|
||||
if (typeof record.text === "string") {
|
||||
return record.text;
|
||||
}
|
||||
const content = Array.isArray(record.content) ? record.content : [];
|
||||
const parts = content
|
||||
.map((block) => {
|
||||
if (!block || typeof block !== "object") {
|
||||
return "";
|
||||
}
|
||||
const entry = block as Record<string, unknown>;
|
||||
return entry.type === "text" && typeof entry.text === "string" ? entry.text : "";
|
||||
})
|
||||
.filter(Boolean);
|
||||
return parts.join("\n\n").trim();
|
||||
}
|
||||
|
||||
function waitForChatResult(params: {
|
||||
client: GatewayBrowserClient;
|
||||
runId: string;
|
||||
timeoutMs: number;
|
||||
}): Promise<string> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const timer = window.setTimeout(() => {
|
||||
unsubscribe();
|
||||
reject(new Error("OpenClaw tool call timed out"));
|
||||
}, params.timeoutMs);
|
||||
const unsubscribe = params.client.addEventListener((evt: GatewayEventFrame) => {
|
||||
if (evt.event !== "chat") {
|
||||
return;
|
||||
}
|
||||
const payload = evt.payload as ChatPayload | undefined;
|
||||
if (!payload || payload.runId !== params.runId) {
|
||||
return;
|
||||
}
|
||||
if (payload.state === "final") {
|
||||
window.clearTimeout(timer);
|
||||
unsubscribe();
|
||||
resolve(extractTextFromMessage(payload.message) || "OpenClaw finished with no text.");
|
||||
} else if (payload.state === "error") {
|
||||
window.clearTimeout(timer);
|
||||
unsubscribe();
|
||||
reject(new Error(payload.errorMessage ?? "OpenClaw tool call failed"));
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Browser-side realtime voice session against the OpenAI Realtime API.
 *
 * Flow: mint an ephemeral client secret via the gateway
 * (`talk.realtime.session`), open a WebRTC peer connection carrying
 * microphone audio up and model audio down, and exchange JSON events over
 * the "oai-events" data channel. When the model invokes the agent-consult
 * tool, the question is forwarded to the full OpenClaw agent via
 * `chat.send` and the final answer is returned as the tool output.
 */
export class RealtimeTalkSession {
  private peer: RTCPeerConnection | null = null;
  // Data channel carrying Realtime API JSON events in both directions.
  private channel: RTCDataChannel | null = null;
  // Local microphone capture stream.
  private media: MediaStream | null = null;
  // Hidden <audio> element that plays the remote (model) audio track.
  private audio: HTMLAudioElement | null = null;
  // Set by stop(); suppresses late connection-state error callbacks.
  private closed = false;
  // In-flight streamed function calls, keyed by Realtime `item_id`.
  private toolBuffers = new Map<string, ToolBuffer>();

  constructor(
    private readonly client: GatewayBrowserClient,
    private readonly sessionKey: string,
    private readonly callbacks: RealtimeTalkCallbacks = {},
  ) {}

  /**
   * Establish the realtime session: fetch an ephemeral credential from the
   * gateway, wire up WebRTC media/data channels, then perform the SDP
   * offer/answer exchange with OpenAI.
   *
   * Throws when the browser lacks WebRTC/microphone APIs, the gateway
   * request fails, or the SDP POST is rejected.
   * NOTE(review): resources created here are not torn down on failure —
   * the caller is expected to invoke stop() when start() rejects.
   */
  async start(): Promise<void> {
    if (!navigator.mediaDevices?.getUserMedia || typeof RTCPeerConnection === "undefined") {
      throw new Error("Realtime Talk requires browser WebRTC and microphone access");
    }
    // Allow restarting a previously stopped instance.
    this.closed = false;
    this.callbacks.onStatus?.("connecting");
    // Gateway mints the short-lived provider client secret for this session.
    const session = await this.client.request<RealtimeTalkSessionResult>("talk.realtime.session", {
      sessionKey: this.sessionKey,
    });
    this.peer = new RTCPeerConnection();
    // Hidden autoplay element to render the remote audio track.
    this.audio = document.createElement("audio");
    this.audio.autoplay = true;
    this.audio.style.display = "none";
    document.body.append(this.audio);
    this.peer.addEventListener("track", (event) => {
      if (this.audio) {
        this.audio.srcObject = event.streams[0];
      }
    });
    // Capture the microphone and send every audio track upstream.
    this.media = await navigator.mediaDevices.getUserMedia({ audio: true });
    for (const track of this.media.getAudioTracks()) {
      this.peer.addTrack(track, this.media);
    }
    // "oai-events" is the Realtime API's JSON event channel.
    this.channel = this.peer.createDataChannel("oai-events");
    this.channel.addEventListener("open", () => this.callbacks.onStatus?.("listening"));
    this.channel.addEventListener("message", (event) => this.handleRealtimeEvent(event.data));
    this.peer.addEventListener("connectionstatechange", () => {
      // Ignore state changes caused by our own stop().
      if (this.closed) {
        return;
      }
      if (this.peer?.connectionState === "failed" || this.peer?.connectionState === "closed") {
        this.callbacks.onStatus?.("error", "Realtime connection closed");
      }
    });

    // Standard WebRTC SDP exchange: POST our offer, apply the answer.
    const offer = await this.peer.createOffer();
    await this.peer.setLocalDescription(offer);
    const sdp = await fetch("https://api.openai.com/v1/realtime/calls", {
      method: "POST",
      body: offer.sdp,
      headers: {
        // Ephemeral secret minted by the gateway, not a long-lived API key.
        Authorization: `Bearer ${session.clientSecret}`,
        "Content-Type": "application/sdp",
      },
    });
    if (!sdp.ok) {
      throw new Error(`Realtime WebRTC setup failed (${sdp.status})`);
    }
    await this.peer.setRemoteDescription({
      type: "answer",
      sdp: await sdp.text(),
    });
  }

  /**
   * Tear down the session: close the data channel and peer connection,
   * stop microphone tracks, remove the hidden audio element, and discard
   * any partially buffered tool calls. Safe to call multiple times.
   */
  stop(): void {
    this.closed = true;
    this.callbacks.onStatus?.("idle");
    this.channel?.close();
    this.channel = null;
    this.peer?.close();
    this.peer = null;
    this.media?.getTracks().forEach((track) => track.stop());
    this.media = null;
    this.audio?.remove();
    this.audio = null;
    this.toolBuffers.clear();
  }

  // Serialize and send an event over the data channel; silently dropped
  // when the channel is not open.
  private send(event: unknown): void {
    if (this.channel?.readyState === "open") {
      this.channel.send(JSON.stringify(event));
    }
  }

  /**
   * Dispatch a raw data-channel message from the Realtime API.
   * Non-JSON payloads and unrecognized event types are ignored.
   */
  private handleRealtimeEvent(data: unknown): void {
    let event: RealtimeServerEvent;
    try {
      event = JSON.parse(String(data)) as RealtimeServerEvent;
    } catch {
      return;
    }
    switch (event.type) {
      // Final transcript of what the user said.
      case "conversation.item.input_audio_transcription.completed":
        if (event.transcript) {
          this.callbacks.onTranscript?.({ role: "user", text: event.transcript, final: true });
        }
        return;
      // Final transcript of the model's spoken response.
      case "response.audio_transcript.done":
        if (event.transcript) {
          this.callbacks.onTranscript?.({
            role: "assistant",
            text: event.transcript,
            final: true,
          });
        }
        return;
      // Function-call arguments stream in fragments; buffer them by item_id.
      case "response.function_call_arguments.delta":
        this.bufferToolDelta(event);
        return;
      // Arguments complete: run the tool (fire-and-forget; errors are
      // reported back to the model inside handleToolCall).
      case "response.function_call_arguments.done":
        void this.handleToolCall(event);
        return;
      default:
        return;
    }
  }

  // Accumulate one streamed argument fragment for the call identified by
  // item_id, creating the buffer on first sight.
  private bufferToolDelta(event: RealtimeServerEvent): void {
    const key = event.item_id ?? "unknown";
    const existing = this.toolBuffers.get(key);
    if (existing) {
      existing.args += event.delta ?? "";
      return;
    }
    this.toolBuffers.set(key, {
      name: event.name ?? "",
      callId: event.call_id ?? "",
      args: event.delta ?? "",
    });
  }

  /**
   * Execute a completed function call from the model.
   *
   * Only the agent-consult tool is supported: the parsed question (plus
   * optional context / spoken-style hints) is sent to the OpenClaw agent
   * via `chat.send`, and the final answer — or the error — is submitted
   * back to the model as the tool output.
   */
  private async handleToolCall(event: RealtimeServerEvent): Promise<void> {
    const key = event.item_id ?? "unknown";
    const buffered = this.toolBuffers.get(key);
    this.toolBuffers.delete(key);
    // Prefer buffered metadata; the "done" event is the fallback source.
    const name = buffered?.name || event.name || "";
    const callId = buffered?.callId || event.call_id || "";
    if (name !== REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME || !callId) {
      return;
    }
    this.callbacks.onStatus?.("thinking");
    let question = "";
    try {
      const args = JSON.parse(buffered?.args || event.arguments || "{}") as {
        question?: unknown;
        context?: unknown;
        responseStyle?: unknown;
      };
      question = typeof args.question === "string" ? args.question.trim() : "";
      const context = typeof args.context === "string" ? args.context.trim() : "";
      const responseStyle = typeof args.responseStyle === "string" ? args.responseStyle.trim() : "";
      if (context || responseStyle) {
        // NOTE(review): if `question` is empty here, filter(Boolean) drops
        // it and the consult proceeds with context/style only — confirm
        // that is intended rather than hitting the "requires a question"
        // branch below.
        question = [
          question,
          context ? `Context:\n${context}` : undefined,
          responseStyle ? `Spoken style:\n${responseStyle}` : undefined,
        ]
          .filter(Boolean)
          .join("\n\n");
      }
    } catch {}
    if (!question) {
      this.submitToolResult(callId, {
        error: `${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} requires a question`,
      });
      this.callbacks.onStatus?.("listening");
      return;
    }
    try {
      const idempotencyKey = generateUUID();
      const response = await this.client.request<{ runId?: string }>("chat.send", {
        sessionKey: this.sessionKey,
        message: question,
        idempotencyKey,
      });
      const result = await waitForChatResult({
        client: this.client,
        // Fall back to the idempotency key when the gateway response does
        // not echo a run id.
        runId: response.runId ?? idempotencyKey,
        timeoutMs: 120_000,
      });
      this.submitToolResult(callId, { result });
    } catch (error) {
      this.submitToolResult(callId, {
        error: error instanceof Error ? error.message : String(error),
      });
    } finally {
      // Always return the UI to the listening state once the consult ends.
      this.callbacks.onStatus?.("listening");
    }
  }

  // Send the function-call output back to the model, then ask it to
  // generate a (spoken) response incorporating that output.
  private submitToolResult(callId: string, result: unknown): void {
    this.send({
      type: "conversation.item.create",
      item: {
        type: "function_call_output",
        call_id: callId,
        output: JSON.stringify(result),
      },
    });
    this.send({ type: "response.create" });
  }
}
|
||||
@@ -223,6 +223,8 @@ export type GatewayBrowserClientOptions = {
|
||||
onGap?: (info: { expected: number; received: number }) => void;
|
||||
};
|
||||
|
||||
export type GatewayEventListener = (evt: GatewayEventFrame) => void;
|
||||
|
||||
// 4008 = application-defined code (browser rejects 1008 "Policy Violation")
|
||||
const CONNECT_FAILED_CLOSE_CODE = 4008;
|
||||
|
||||
@@ -298,6 +300,7 @@ export class GatewayBrowserClient {
|
||||
private pendingConnectError: GatewayErrorInfo | undefined;
|
||||
private pendingDeviceTokenRetry = false;
|
||||
private deviceTokenRetryBudgetUsed = false;
|
||||
private eventListeners = new Set<GatewayEventListener>();
|
||||
|
||||
constructor(private opts: GatewayBrowserClientOptions) {}
|
||||
|
||||
@@ -549,6 +552,9 @@ export class GatewayBrowserClient {
|
||||
}
|
||||
try {
|
||||
this.opts.onEvent?.(evt);
|
||||
for (const listener of this.eventListeners) {
|
||||
listener(evt);
|
||||
}
|
||||
} catch (err) {
|
||||
console.error("[gateway] event handler error:", err);
|
||||
}
|
||||
@@ -625,6 +631,13 @@ export class GatewayBrowserClient {
|
||||
return p;
|
||||
}
|
||||
|
||||
addEventListener(listener: GatewayEventListener): () => void {
|
||||
this.eventListeners.add(listener);
|
||||
return () => {
|
||||
this.eventListeners.delete(listener);
|
||||
};
|
||||
}
|
||||
|
||||
private queueConnect() {
|
||||
this.connectNonce = null;
|
||||
this.connectSent = false;
|
||||
|
||||
@@ -18,6 +18,7 @@ import {
|
||||
import { InputHistory } from "../chat/input-history.ts";
|
||||
import { PinnedMessages } from "../chat/pinned-messages.ts";
|
||||
import { getPinnedMessageSummary } from "../chat/pinned-summary.ts";
|
||||
import type { RealtimeTalkStatus } from "../chat/realtime-talk.ts";
|
||||
import { renderChatRunControls } from "../chat/run-controls.ts";
|
||||
import { getOrCreateSessionCacheValue } from "../chat/session-cache.ts";
|
||||
import { renderSideResult } from "../chat/side-result-render.ts";
|
||||
@@ -65,6 +66,10 @@ export type ChatProps = {
|
||||
assistantAvatarUrl?: string | null;
|
||||
draft: string;
|
||||
queue: ChatQueueItem[];
|
||||
realtimeTalkActive?: boolean;
|
||||
realtimeTalkStatus?: RealtimeTalkStatus;
|
||||
realtimeTalkDetail?: string | null;
|
||||
realtimeTalkTranscript?: string | null;
|
||||
connected: boolean;
|
||||
canSend: boolean;
|
||||
disabledReason: string | null;
|
||||
@@ -95,6 +100,7 @@ export type ChatProps = {
|
||||
onDraftChange: (next: string) => void;
|
||||
onRequestUpdate?: () => void;
|
||||
onSend: () => void;
|
||||
onToggleRealtimeTalk?: () => void;
|
||||
onAbort?: () => void;
|
||||
onQueueRemove: (id: string) => void;
|
||||
onQueueSteer?: (id: string) => void;
|
||||
@@ -1207,6 +1213,19 @@ export function renderChat(props: ChatProps) {
|
||||
${vs.sttRecording && vs.sttInterimText
|
||||
? html`<div class="agent-chat__stt-interim">${vs.sttInterimText}</div>`
|
||||
: nothing}
|
||||
${props.realtimeTalkActive || props.realtimeTalkDetail || props.realtimeTalkTranscript
|
||||
? html`
|
||||
<div class="agent-chat__stt-interim agent-chat__talk-status">
|
||||
${props.realtimeTalkDetail ??
|
||||
props.realtimeTalkTranscript ??
|
||||
(props.realtimeTalkStatus === "thinking"
|
||||
? "Asking OpenClaw..."
|
||||
: props.realtimeTalkStatus === "connecting"
|
||||
? "Connecting Talk..."
|
||||
: "Talk live")}
|
||||
</div>
|
||||
`
|
||||
: nothing}
|
||||
|
||||
<textarea
|
||||
${ref((el) => el && adjustTextareaHeight(el as HTMLTextAreaElement))}
|
||||
@@ -1288,6 +1307,21 @@ export function renderChat(props: ChatProps) {
|
||||
</button>
|
||||
`
|
||||
: nothing}
|
||||
${props.onToggleRealtimeTalk
|
||||
? html`
|
||||
<button
|
||||
class="agent-chat__input-btn ${props.realtimeTalkActive
|
||||
? "agent-chat__input-btn--talk"
|
||||
: ""}"
|
||||
@click=${props.onToggleRealtimeTalk}
|
||||
title=${props.realtimeTalkActive ? "Stop Talk" : "Start Talk"}
|
||||
aria-label=${props.realtimeTalkActive ? "Stop Talk" : "Start Talk"}
|
||||
?disabled=${!props.connected}
|
||||
>
|
||||
${props.realtimeTalkActive ? icons.volume2 : icons.radio}
|
||||
</button>
|
||||
`
|
||||
: nothing}
|
||||
${tokens ? html`<span class="agent-chat__token-count">${tokens}</span>` : nothing}
|
||||
</div>
|
||||
|
||||
|
||||
Reference in New Issue
Block a user