feat: add unified talk gateway sessions

This commit is contained in:
Peter Steinberger
2026-05-05 20:59:34 +01:00
parent 7225a2678e
commit c434d7720b
40 changed files with 7015 additions and 164 deletions

View File

@@ -1,6 +1,7 @@
import { describe, expect, it } from "vitest";
import type { OpenClawConfig } from "../../../config/types.js";
import { LEGACY_CONFIG_MIGRATIONS_RUNTIME_TTS } from "./legacy-config-migrations.runtime.tts.js";
import { normalizeLegacyTalkConfig } from "./legacy-talk-config-normalizer.js";
function migrateLegacyConfig(raw: unknown): {
config: OpenClawConfig | null;
@@ -21,6 +22,83 @@ function migrateLegacyConfig(raw: unknown): {
}
describe("legacy migrate provider-shaped config", () => {
it("moves legacy realtime Talk selectors into talk.realtime without treating speech config as runtime fallback", () => {
  // Legacy shape: realtime selectors sit directly on `talk`, next to the speech provider config.
  const legacyTalk = {
    provider: "openai",
    providers: {
      openai: {
        apiKey: "test-key",
        custom: true,
      },
    },
    mode: "realtime",
    transport: "gateway-relay",
    brain: "agent-consult",
    model: "gpt-realtime",
    voice: "alloy",
  } as never;
  const recordedChanges: string[] = [];

  const { talk } = normalizeLegacyTalkConfig({ talk: legacyTalk }, recordedChanges);

  expect(recordedChanges).toContain(
    "Moved legacy realtime Talk provider/model fields into talk.realtime.",
  );
  // The speech provider config stays at the top level; the selectors end up under `realtime`.
  expect(talk).toEqual({
    provider: "openai",
    providers: {
      openai: {
        apiKey: "test-key",
        custom: true,
      },
    },
    realtime: {
      provider: "openai",
      providers: {
        openai: {
          apiKey: "test-key",
          custom: true,
        },
      },
      mode: "realtime",
      transport: "gateway-relay",
      brain: "agent-consult",
      model: "gpt-realtime",
      voice: "alloy",
    },
  });
});
it("does not copy plain Talk speech provider config into talk.realtime", () => {
  // Speech-only config: no model/voice/mode/transport/brain selectors at the top level.
  const speechOnlyConfig = {
    talk: {
      provider: "elevenlabs",
      providers: {
        elevenlabs: {
          voiceId: "voice-1",
        },
      },
    },
  };
  const recordedChanges: string[] = [];

  const { talk } = normalizeLegacyTalkConfig(speechOnlyConfig, recordedChanges);

  // Nothing to migrate: no change entries and no synthesized `realtime` section.
  expect(recordedChanges).toEqual([]);
  expect(talk).toEqual({
    provider: "elevenlabs",
    providers: {
      elevenlabs: {
        voiceId: "voice-1",
      },
    },
  });
});
it("moves messages.tts.<provider> keys into messages.tts.providers", () => {
const res = migrateLegacyConfig({
messages: {

View File

@@ -14,6 +14,31 @@ function buildLegacyTalkProviderCompat(
return Object.keys(compat).length > 0 ? compat : undefined;
}
/**
 * Builds a `talk.realtime` candidate from legacy top-level realtime selectors
 * (`talk.model`, `talk.voice`, `talk.mode`, `talk.transport`, `talk.brain`).
 *
 * Returns `undefined` when:
 * - `talk.realtime` is already present in the raw config (an explicit section always wins),
 * - none of the legacy selectors is set, or
 * - none of the legacy selectors survives normalization (for example an empty `model`
 *   or an unknown `mode`).
 *
 * The last rule keeps plain speech provider config from being copied into `talk.realtime`
 * when the config holds no usable realtime selector.
 *
 * @param talk Raw `talk` section as found in the config.
 * @param normalizedTalk Normalized `talk` section; supplies `provider` and `providers`.
 * @returns The normalized realtime section derived from the legacy fields, or `undefined`.
 */
function buildLegacyRealtimeTalkCompat(
  talk: Record<string, unknown>,
  normalizedTalk: NonNullable<OpenClawConfig["talk"]>,
): Record<string, unknown> | undefined {
  if (talk.realtime !== undefined) {
    return undefined;
  }
  const legacySelectors: Record<string, unknown> = {};
  for (const key of ["model", "voice", "mode", "transport", "brain"] as const) {
    if (talk[key] !== undefined) {
      legacySelectors[key] = talk[key];
    }
  }
  if (Object.keys(legacySelectors).length === 0) {
    return undefined;
  }
  // Normalize the selectors on their own first. If nothing valid remains, this is not a
  // realtime config, so the speech provider/providers must not be pulled into talk.realtime.
  const validSelectors = normalizeTalkSection({
    realtime: legacySelectors,
  } as OpenClawConfig["talk"])?.realtime;
  if (!validSelectors) {
    return undefined;
  }
  const compat: Record<string, unknown> = { ...legacySelectors };
  if (normalizedTalk.provider !== undefined) {
    compat.provider = normalizedTalk.provider;
  }
  if (normalizedTalk.providers !== undefined) {
    compat.providers = normalizedTalk.providers;
  }
  return normalizeTalkSection({ realtime: compat } as OpenClawConfig["talk"])?.realtime;
}
/** Type guard: true only for non-null objects that are not arrays. */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (typeof value !== "object" || value === null) {
    return false;
  }
  return !Array.isArray(value);
}
@@ -35,6 +60,13 @@ export function normalizeLegacyTalkConfig(cfg: OpenClawConfig, changes: string[]
},
};
}
const legacyRealtimeCompat = buildLegacyRealtimeTalkCompat(rawTalk, normalizedTalk);
if (legacyRealtimeCompat) {
normalizedTalk.realtime = {
...legacyRealtimeCompat,
...normalizedTalk.realtime,
};
}
if (Object.keys(normalizedTalk).length === 0 || isDeepStrictEqual(normalizedTalk, rawTalk)) {
return cfg;
}
@@ -42,6 +74,9 @@ export function normalizeLegacyTalkConfig(cfg: OpenClawConfig, changes: string[]
changes.push(
"Normalized talk.provider/providers shape (trimmed provider ids and merged missing compatibility fields).",
);
if (legacyRealtimeCompat) {
changes.push("Moved legacy realtime Talk provider/model fields into talk.realtime.");
}
return {
...cfg,
talk: normalizedTalk,

View File

@@ -150,6 +150,21 @@ export const FIELD_HELP: Record<string, string> = {
"Provider-specific Talk settings keyed by provider id. During migration, prefer this over legacy talk.* keys.",
"talk.providers.*": "Provider-owned Talk config fields for the matching provider id.",
"talk.providers.*.apiKey": "Provider API key for Talk mode.", // pragma: allowlist secret
"talk.realtime":
"Realtime Talk provider, model, voice, mode, transport, and brain strategy. Keep speech/TTS provider config in talk.provider and talk.providers.",
"talk.realtime.provider": "Active realtime voice provider id, such as openai or google.",
"talk.realtime.providers": "Provider-specific realtime voice settings keyed by provider id.",
"talk.realtime.providers.*": "Provider-owned realtime voice config for the matching provider id.",
"talk.realtime.providers.*.apiKey": "Provider API key for realtime Talk.", // pragma: allowlist secret
"talk.realtime.model":
"Realtime provider model id override for browser or Gateway-owned Talk sessions.",
"talk.realtime.voice":
"Realtime provider voice id override for browser or Gateway-owned Talk sessions.",
"talk.realtime.mode": "Talk execution mode: realtime, stt-tts, or transcription.",
"talk.realtime.transport":
"Talk byte/session transport: webrtc, provider-websocket, gateway-relay, or managed-room.",
"talk.realtime.brain":
"Talk reasoning strategy: agent-consult for Gateway-mediated agent help, direct-tools for owner-only tool calls, or none.",
"talk.speechLocale":
'BCP 47 locale id for Talk speech recognition on device nodes, for example "ru-RU". Leave unset to use each device default.',
"talk.interruptOnSpeech":

View File

@@ -875,6 +875,16 @@ export const FIELD_LABELS: Record<string, string> = {
"talk.providers": "Talk Provider Settings",
"talk.providers.*": "Talk Provider Config",
"talk.providers.*.apiKey": "Talk Provider API Key", // pragma: allowlist secret
"talk.realtime": "Talk Realtime",
"talk.realtime.provider": "Talk Realtime Provider",
"talk.realtime.providers": "Talk Realtime Provider Settings",
"talk.realtime.providers.*": "Talk Realtime Provider Config",
"talk.realtime.providers.*.apiKey": "Talk Realtime Provider API Key", // pragma: allowlist secret
"talk.realtime.model": "Talk Realtime Model",
"talk.realtime.voice": "Talk Realtime Voice",
"talk.realtime.mode": "Talk Realtime Mode",
"talk.realtime.transport": "Talk Realtime Transport",
"talk.realtime.brain": "Talk Realtime Brain",
channels: "Channels",
"channels.defaults": "Channel Defaults",
"channels.defaults.groupPolicy": "Default Group Policy",

View File

@@ -31,6 +31,19 @@ describe("talk normalization", () => {
custom: true,
},
},
realtime: {
provider: "openai",
providers: {
openai: {
model: "gpt-realtime",
},
},
model: "gpt-realtime",
voice: "alloy",
mode: "realtime",
transport: "webrtc",
brain: "agent-consult",
},
interruptOnSpeech: true,
});
@@ -42,6 +55,19 @@ describe("talk normalization", () => {
custom: true,
},
},
realtime: {
provider: "openai",
providers: {
openai: {
model: "gpt-realtime",
},
},
model: "gpt-realtime",
voice: "alloy",
mode: "realtime",
transport: "webrtc",
brain: "agent-consult",
},
interruptOnSpeech: true,
});
});

View File

@@ -5,6 +5,7 @@ import type {
TalkConfig,
TalkConfigResponse,
TalkProviderConfig,
TalkRealtimeConfig,
} from "./types.gateway.js";
import type { OpenClawConfig } from "./types.openclaw.js";
import { coerceSecretRef } from "./types.secrets.js";
@@ -85,6 +86,50 @@ function normalizeTalkProviders(value: unknown): Record<string, TalkProviderConf
return Object.keys(providers).length > 0 ? providers : undefined;
}
/**
 * Normalizes an untrusted `talk.realtime` value into a `TalkRealtimeConfig`.
 *
 * String fields go through `normalizeOptionalString`, `providers` through
 * `normalizeTalkProviders`, and `mode` / `transport` / `brain` are kept only when they
 * equal one of the known literals. Anything else is dropped.
 *
 * @param value Raw value read from config.
 * @returns The normalized section, or `undefined` when `value` is not a record or no
 *   field survives normalization.
 */
function normalizeTalkRealtimeConfig(value: unknown): TalkRealtimeConfig | undefined {
  if (!isRecord(value)) {
    return undefined;
  }
  const result: TalkRealtimeConfig = {};

  const providerId = normalizeOptionalString(value.provider);
  if (providerId) {
    result.provider = providerId;
  }

  const providerEntries = normalizeTalkProviders(value.providers);
  if (providerEntries) {
    result.providers = providerEntries;
  }

  const modelId = normalizeOptionalString(value.model);
  if (modelId) {
    result.model = modelId;
  }

  const voiceId = normalizeOptionalString(value.voice);
  if (voiceId) {
    result.voice = voiceId;
  }

  const modeCandidate = value.mode;
  if (
    modeCandidate === "realtime" ||
    modeCandidate === "stt-tts" ||
    modeCandidate === "transcription"
  ) {
    result.mode = modeCandidate;
  }

  const transportCandidate = value.transport;
  if (
    transportCandidate === "webrtc" ||
    transportCandidate === "provider-websocket" ||
    transportCandidate === "gateway-relay" ||
    transportCandidate === "managed-room"
  ) {
    result.transport = transportCandidate;
  }

  const brainCandidate = value.brain;
  if (
    brainCandidate === "agent-consult" ||
    brainCandidate === "direct-tools" ||
    brainCandidate === "none"
  ) {
    result.brain = brainCandidate;
  }

  if (Object.keys(result).length === 0) {
    return undefined;
  }
  return result;
}
function activeProviderFromTalk(talk: TalkConfig): string | undefined {
const provider = normalizeOptionalString(talk.provider);
const providers = talk.providers;
@@ -118,10 +163,14 @@ export function normalizeTalkSection(value: TalkConfig | undefined): TalkConfig
}
const providers = normalizeTalkProviders(source.providers);
const realtime = normalizeTalkRealtimeConfig(source.realtime);
const provider = normalizeOptionalString(source.provider);
if (providers) {
normalized.providers = providers;
}
if (realtime) {
normalized.realtime = realtime;
}
if (provider) {
normalized.provider = provider;
}
@@ -182,6 +231,9 @@ export function buildTalkConfigResponse(value: unknown): TalkConfigResponse | un
if (normalized?.providers && Object.keys(normalized.providers).length > 0) {
payload.providers = normalized.providers;
}
if (normalized?.realtime && Object.keys(normalized.realtime).length > 0) {
payload.realtime = normalized.realtime;
}
const resolved =
resolveActiveTalkProviderConfig(normalized) ??

View File

@@ -55,6 +55,23 @@ export type TalkProviderConfig = {
[key: string]: unknown;
};
/**
 * Realtime Talk settings, stored under `talk.realtime`.
 *
 * Every field is optional. `mode`, `transport`, and `brain` are closed sets of string
 * literals; `provider`, `model`, and `voice` are free-form strings.
 */
export type TalkRealtimeConfig = {
/** Active realtime voice provider. */
provider?: string;
/** Provider-specific realtime voice config keyed by provider id. */
providers?: Record<string, TalkProviderConfig>;
/** Provider model override for realtime sessions. */
model?: string;
/** Provider voice override for realtime sessions. */
voice?: string;
/** Realtime execution mode. */
mode?: "realtime" | "stt-tts" | "transcription";
/** Byte/session transport. */
transport?: "webrtc" | "provider-websocket" | "gateway-relay" | "managed-room";
/** Tool/agent strategy for realtime sessions. */
brain?: "agent-consult" | "direct-tools" | "none";
};
export type ResolvedTalkConfig = {
/** Active Talk TTS provider resolved from the current config payload. */
provider: string;
@@ -67,6 +84,8 @@ export type TalkConfig = {
provider?: string;
/** Provider-specific Talk config keyed by provider id. */
providers?: Record<string, TalkProviderConfig>;
/** Realtime Talk provider, model, voice, mode, transport, and brain config. */
realtime?: TalkRealtimeConfig;
/** BCP 47 locale id used for Talk speech recognition on device nodes. */
speechLocale?: string;
/** Stop speaking when user starts talking (default: true). */

View File

@@ -212,10 +212,44 @@ const TalkProviderEntrySchema = z
})
.catchall(z.unknown());
/**
 * Zod schema for the `talk.realtime` config section.
 *
 * Unknown keys are rejected (`.strict()`). Two cross-field rules are enforced:
 * - when `provider` is set and `providers` is non-empty, the lowercased `provider` must be
 *   one of the keys of `providers`;
 * - when `providers` defines more than one entry, `provider` is required.
 */
const TalkRealtimeSchema = z
  .object({
    provider: z.string().optional(),
    providers: z.record(z.string(), TalkProviderEntrySchema).optional(),
    model: z.string().optional(),
    voice: z.string().optional(),
    mode: z.enum(["realtime", "stt-tts", "transcription"]).optional(),
    transport: z.enum(["webrtc", "provider-websocket", "gateway-relay", "managed-room"]).optional(),
    brain: z.enum(["agent-consult", "direct-tools", "none"]).optional(),
  })
  .strict()
  .superRefine((realtime, ctx) => {
    const provider = normalizeLowercaseStringOrEmpty(realtime.provider ?? "");
    const providerIds = Object.keys(realtime.providers ?? {});
    // Compare against the map's own keys only. The `in` operator also matches inherited
    // names such as "constructor" or "__proto__", which would let an unconfigured provider
    // id pass this check.
    const isConfiguredProvider = providerIds.includes(provider);
    if (provider && providerIds.length > 0 && !isConfiguredProvider) {
      ctx.addIssue({
        code: z.ZodIssueCode.custom,
        path: ["provider"],
        message: `talk.realtime.provider must match a key in talk.realtime.providers (missing "${provider}")`,
      });
    }
    if (!provider && providerIds.length > 1) {
      ctx.addIssue({
        code: z.ZodIssueCode.custom,
        path: ["provider"],
        message:
          "talk.realtime.provider is required when talk.realtime.providers defines multiple providers",
      });
    }
  });
const TalkSchema = z
.object({
provider: z.string().optional(),
providers: z.record(z.string(), TalkProviderEntrySchema).optional(),
realtime: TalkRealtimeSchema.optional(),
speechLocale: z.string().optional(),
interruptOnSpeech: z.boolean().optional(),
silenceTimeoutMs: z.number().int().positive().optional(),

View File

@@ -322,7 +322,9 @@ describe("gateway broadcaster", () => {
expect(readSocket.send).toHaveBeenCalledTimes(0);
broadcastToConnIds("tick", { ts: 1 }, new Set(["c-read"]));
expect(readSocket.send).toHaveBeenCalledTimes(1);
broadcastToConnIds("talk.realtime.relay", { type: "ready" }, new Set(["c-read"]));
broadcastToConnIds("talk.transcription.relay", { type: "session.ready" }, new Set(["c-read"]));
expect(readSocket.send).toHaveBeenCalledTimes(3);
expect(approvalsSocket.send).toHaveBeenCalledTimes(1);
expect(pairingSocket.send).toHaveBeenCalledTimes(1);
});

View File

@@ -41,6 +41,11 @@ describe("method scope resolution", () => {
["diagnostics.stability", ["operator.read"]],
["node.pair.approve", ["operator.pairing"]],
["poll", ["operator.write"]],
["talk.session.create", ["operator.write"]],
["talk.session.inputAudio", ["operator.write"]],
["talk.session.control", ["operator.write"]],
["talk.session.toolResult", ["operator.write"]],
["talk.session.close", ["operator.write"]],
["update.status", ["operator.admin"]],
["config.patch", ["operator.admin"]],
["nativeHook.invoke", ["operator.admin"]],
@@ -96,6 +101,24 @@ describe("operator scope authorization", () => {
});
});
it("allows operator.write clients to use unified Talk sessions", () => {
  const talkSessionMethods = [
    "talk.session.create",
    "talk.session.inputAudio",
    "talk.session.control",
    "talk.session.toolResult",
    "talk.session.close",
  ];
  talkSessionMethods.forEach((method) => {
    // operator.write is sufficient for every unified Talk session method.
    const writeDecision = authorizeOperatorScopesForMethod(method, ["operator.write"]);
    expect(writeDecision).toEqual({
      allowed: true,
    });
    // operator.read alone is rejected, and the decision names the scope that is missing.
    const readDecision = authorizeOperatorScopesForMethod(method, ["operator.read"]);
    expect(readDecision).toEqual({
      allowed: false,
      missingScope: "operator.write",
    });
  });
});
it("requires admin for browser.request", () => {
setPluginGatewayMethodScope("browser.request", "operator.admin");

View File

@@ -122,7 +122,9 @@ const METHOD_SCOPE_GROUPS: Record<OperatorScope, readonly string[]> = {
"chat.history",
"config.get",
"config.schema.lookup",
"talk.catalog",
"talk.config",
"talk.handoff.join",
"agents.files.list",
"agents.files.get",
"artifacts.list",
@@ -137,11 +139,27 @@ const METHOD_SCOPE_GROUPS: Record<OperatorScope, readonly string[]> = {
"agent.wait",
"wake",
"talk.mode",
"talk.session.create",
"talk.session.inputAudio",
"talk.session.control",
"talk.session.toolResult",
"talk.session.close",
"talk.handoff.create",
"talk.handoff.revoke",
"talk.handoff.turnStart",
"talk.handoff.turnEnd",
"talk.handoff.turnCancel",
"talk.realtime.session",
"talk.realtime.toolCall",
"talk.realtime.relayAudio",
"talk.realtime.relayCancel",
"talk.realtime.relayMark",
"talk.realtime.relayStop",
"talk.realtime.relayToolResult",
"talk.transcription.session",
"talk.transcription.relayAudio",
"talk.transcription.relayCancel",
"talk.transcription.relayStop",
"talk.speak",
"tts.enable",
"tts.disable",

View File

@@ -7,7 +7,21 @@ import {
validateNodeEventResult,
validateNodePresenceAlivePayload,
validateTalkConfigResult,
validateTalkEvent,
validateTalkHandoffCreateParams,
validateTalkHandoffCreateResult,
validateTalkHandoffJoinResult,
validateTalkRealtimeRelayAudioParams,
validateTalkRealtimeRelayCancelParams,
validateTalkHandoffTurnCancelParams,
validateTalkHandoffTurnEndParams,
validateTalkHandoffTurnResult,
validateTalkHandoffTurnStartParams,
validateTalkRealtimeSessionParams,
validateTalkRealtimeToolCallParams,
validateTalkTranscriptionRelayCancelParams,
validateTalkTranscriptionRelayAudioParams,
validateTalkTranscriptionSessionParams,
validateWakeParams,
} from "./index.js";
@@ -104,7 +118,7 @@ describe("validateTalkConfigResult", () => {
).toBe(true);
});
it("rejects normalized talk payloads without talk.resolved", () => {
it("accepts normalized talk payloads without resolved provider materialization", () => {
expect(
validateTalkConfigResult({
config: {
@@ -118,18 +132,50 @@ describe("validateTalkConfigResult", () => {
},
},
}),
).toBe(false);
).toBe(true);
});
it("accepts realtime Talk defaults without requiring a speech provider", () => {
expect(
validateTalkConfigResult({
config: {
talk: {
realtime: {
provider: "openai",
providers: {
openai: {
apiKey: {
source: "env",
provider: "default",
id: "OPENAI_API_KEY",
},
model: "gpt-realtime",
},
},
model: "gpt-realtime",
voice: "alloy",
mode: "realtime",
transport: "gateway-relay",
brain: "agent-consult",
},
},
},
}),
).toBe(true);
});
});
describe("validateTalkRealtimeSessionParams", () => {
it("accepts provider, model, and voice overrides", () => {
it("accepts provider, model, voice, mode, transport, and brain overrides", () => {
expect(
validateTalkRealtimeSessionParams({
sessionKey: "agent:main:main",
provider: "openai",
model: "gpt-realtime-1.5",
voice: "alloy",
mode: "realtime",
transport: "webrtc",
brain: "agent-consult",
}),
).toBe(true);
});
@@ -147,6 +193,294 @@ describe("validateTalkRealtimeSessionParams", () => {
});
});
// Contract tests for the Talk event envelope validator.
// NOTE: `validateTalkEvent.errors` is read immediately after each failing call; it reflects
// the most recent validation only, so the statement order in these tests matters.
describe("validateTalkEvent", () => {
// Positive case: an envelope with every field populated at once, including the
// turn/capture/call/item/parent correlation ids.
it("pins the common Talk event envelope used by relay and surface adapters", () => {
expect(
validateTalkEvent({
id: "talk-session:1",
type: "capture.started",
sessionId: "talk-session",
turnId: "turn-1",
captureId: "capture-1",
seq: 1,
timestamp: "2026-05-05T12:00:00.000Z",
mode: "stt-tts",
transport: "managed-room",
brain: "agent-consult",
provider: "openai",
final: false,
callId: "call-1",
itemId: "item-1",
parentId: "parent-1",
payload: { source: "ptt" },
}),
).toBe(true);
});
// Negative case: the payload has no `id`, and its `mode` / `transport` strings
// ("realtime-duplex", "webrtc-sdp") are not among the mode/transport literals used elsewhere
// in these tests. Only the "must have required" error text is asserted.
it("rejects stale or vendor-shaped event payloads without required correlation", () => {
expect(
validateTalkEvent({
type: "output.audio.delta",
sessionId: "talk-session",
seq: 0,
timestamp: "2026-05-05T12:00:00.000Z",
mode: "realtime-duplex",
transport: "webrtc-sdp",
brain: "agent-consult",
payload: { byteLength: 12 },
}),
).toBe(false);
expect(formatValidationErrors(validateTalkEvent.errors)).toContain("must have required");
});
it("requires turnId and captureId for scoped Talk events", () => {
// First payload: a `turn.started` event with no `turnId`.
expect(
validateTalkEvent({
id: "talk-session:1",
type: "turn.started",
sessionId: "talk-session",
seq: 1,
timestamp: "2026-05-05T12:00:00.000Z",
mode: "stt-tts",
transport: "managed-room",
brain: "agent-consult",
payload: {},
}),
).toBe(false);
expect(formatValidationErrors(validateTalkEvent.errors)).toContain("must have required");
// Second payload: a `capture.started` event that has a `turnId` but no `captureId`.
expect(
validateTalkEvent({
id: "talk-session:2",
type: "capture.started",
sessionId: "talk-session",
turnId: "turn-1",
seq: 2,
timestamp: "2026-05-05T12:00:01.000Z",
mode: "stt-tts",
transport: "managed-room",
brain: "agent-consult",
payload: {},
}),
).toBe(false);
expect(formatValidationErrors(validateTalkEvent.errors)).toContain("must have required");
});
});
describe("validateTalkHandoff", () => {
it("accepts session-scoped provider, model, and voice selection", () => {
expect(
validateTalkHandoffCreateParams({
sessionKey: "agent:main:main",
provider: "openai",
model: "gpt-realtime-1.5",
voice: "alloy",
mode: "realtime",
transport: "managed-room",
brain: "agent-consult",
}),
).toBe(true);
expect(
validateTalkHandoffCreateResult({
id: "handoff-1",
roomId: "talk_handoff-1",
roomUrl: "/talk/rooms/talk_handoff-1",
token: "token-1",
sessionKey: "agent:main:main",
provider: "openai",
model: "gpt-realtime-1.5",
voice: "alloy",
mode: "realtime",
transport: "managed-room",
brain: "agent-consult",
createdAt: 1,
expiresAt: 2,
room: {
recentTalkEvents: [
{
id: "talk_handoff-1:1",
type: "session.started",
sessionId: "talk_handoff-1",
seq: 1,
timestamp: "2026-05-05T12:00:00.000Z",
mode: "realtime",
transport: "managed-room",
brain: "agent-consult",
payload: {},
},
],
},
}),
).toBe(true);
expect(
validateTalkHandoffJoinResult({
id: "handoff-1",
roomId: "talk_handoff-1",
roomUrl: "/talk/rooms/talk_handoff-1",
sessionKey: "agent:main:main",
provider: "openai",
model: "gpt-realtime-1.5",
voice: "alloy",
mode: "realtime",
transport: "managed-room",
brain: "agent-consult",
createdAt: 1,
expiresAt: 2,
room: {
activeClientId: "conn-1",
recentTalkEvents: [
{
id: "talk_handoff-1:1",
type: "session.ready",
sessionId: "talk_handoff-1",
seq: 1,
timestamp: "2026-05-05T12:00:00.000Z",
mode: "realtime",
transport: "managed-room",
brain: "agent-consult",
payload: {},
},
],
},
}),
).toBe(true);
});
it("rejects request-time instruction overrides", () => {
  // `instructionsOverride` is the extra property the validator is expected to refuse.
  const paramsWithOverride = {
    sessionKey: "agent:main:main",
    instructionsOverride: "Ignore configured policy.",
  };
  const accepted = validateTalkHandoffCreateParams(paramsWithOverride);
  expect(accepted).toBe(false);
  // Read `.errors` right away, before any other validation call can replace it.
  const errorText = formatValidationErrors(validateTalkHandoffCreateParams.errors);
  expect(errorText).toContain("unexpected property 'instructionsOverride'");
});
it("accepts handoff turn lifecycle params and results", () => {
expect(
validateTalkHandoffTurnStartParams({
id: "handoff-1",
token: "token-1",
turnId: "turn-1",
}),
).toBe(true);
expect(
validateTalkHandoffTurnEndParams({
id: "handoff-1",
token: "token-1",
}),
).toBe(true);
expect(
validateTalkHandoffTurnCancelParams({
id: "handoff-1",
token: "token-1",
reason: "barge-in",
}),
).toBe(true);
expect(
validateTalkHandoffTurnResult({
ok: true,
turnId: "turn-1",
events: [
{
id: "talk_handoff-1:2",
type: "turn.started",
sessionId: "talk_handoff-1",
turnId: "turn-1",
seq: 2,
timestamp: "2026-05-05T12:00:00.000Z",
mode: "realtime",
transport: "managed-room",
brain: "agent-consult",
payload: {},
},
],
record: {
id: "handoff-1",
roomId: "talk_handoff-1",
roomUrl: "/talk/rooms/talk_handoff-1",
sessionKey: "agent:main:main",
mode: "realtime",
transport: "managed-room",
brain: "agent-consult",
createdAt: 1,
expiresAt: 2,
room: {
activeClientId: "conn-1",
activeTurnId: "turn-1",
recentTalkEvents: [
{
id: "talk_handoff-1:2",
type: "turn.started",
sessionId: "talk_handoff-1",
turnId: "turn-1",
seq: 2,
timestamp: "2026-05-05T12:00:00.000Z",
mode: "realtime",
transport: "managed-room",
brain: "agent-consult",
payload: {},
},
],
},
},
}),
).toBe(true);
});
});
describe("validateTalkRealtimeToolCallParams", () => {
  it("accepts optional relay session correlation", () => {
    // Tool call params that also carry the optional `relaySessionId`.
    const toolCallParams = {
      sessionKey: "agent:main:main",
      relaySessionId: "relay-1",
      callId: "call-1",
      name: "openclaw_agent_consult",
      args: { question: "what now" },
    };
    const accepted = validateTalkRealtimeToolCallParams(toolCallParams);
    expect(accepted).toBe(true);
  });
});
describe("validateTalkRealtimeRelayParams", () => {
  it("accepts relay audio and cancel params", () => {
    // Audio frame: base64 payload plus a numeric timestamp.
    const audioParams = {
      relaySessionId: "relay-1",
      audioBase64: "aGVsbG8=",
      timestamp: 123,
    };
    expect(validateTalkRealtimeRelayAudioParams(audioParams)).toBe(true);

    // Cancel request with a free-form reason.
    const cancelParams = {
      relaySessionId: "relay-1",
      reason: "barge-in",
    };
    expect(validateTalkRealtimeRelayCancelParams(cancelParams)).toBe(true);
  });
});
describe("validateTalkTranscriptionParams", () => {
  it("accepts transcription session, relay audio, and cancel params", () => {
    // Session creation with only a provider selected.
    const sessionAccepted = validateTalkTranscriptionSessionParams({ provider: "openai" });
    expect(sessionAccepted).toBe(true);

    // Audio frame addressed to an existing transcription session.
    const audioParams = {
      transcriptionSessionId: "stt-1",
      audioBase64: "aGVsbG8=",
    };
    expect(validateTalkTranscriptionRelayAudioParams(audioParams)).toBe(true);

    // Cancel request for the same session.
    const cancelParams = {
      transcriptionSessionId: "stt-1",
      reason: "barge-in",
    };
    expect(validateTalkTranscriptionRelayCancelParams(cancelParams)).toBe(true);
  });
});
describe("validateWakeParams", () => {
it("accepts valid wake params", () => {
expect(validateWakeParams({ mode: "now", text: "hello" })).toBe(true);

View File

@@ -61,12 +61,40 @@ import {
ChannelsStopParamsSchema,
type ChannelsLogoutParams,
ChannelsLogoutParamsSchema,
type TalkEvent,
TalkEventSchema,
type TalkCatalogParams,
TalkCatalogParamsSchema,
type TalkCatalogResult,
TalkCatalogResultSchema,
type TalkConfigParams,
TalkConfigParamsSchema,
type TalkConfigResult,
TalkConfigResultSchema,
type TalkHandoffCreateParams,
TalkHandoffCreateParamsSchema,
type TalkHandoffCreateResult,
TalkHandoffCreateResultSchema,
type TalkHandoffJoinParams,
TalkHandoffJoinParamsSchema,
type TalkHandoffJoinResult,
TalkHandoffJoinResultSchema,
type TalkHandoffRevokeParams,
TalkHandoffRevokeParamsSchema,
type TalkHandoffRevokeResult,
TalkHandoffRevokeResultSchema,
type TalkHandoffTurnCancelParams,
TalkHandoffTurnCancelParamsSchema,
type TalkHandoffTurnEndParams,
TalkHandoffTurnEndParamsSchema,
type TalkHandoffTurnResult,
TalkHandoffTurnResultSchema,
type TalkHandoffTurnStartParams,
TalkHandoffTurnStartParamsSchema,
type TalkRealtimeRelayAudioParams,
TalkRealtimeRelayAudioParamsSchema,
type TalkRealtimeRelayCancelParams,
TalkRealtimeRelayCancelParamsSchema,
type TalkRealtimeRelayMarkParams,
TalkRealtimeRelayMarkParamsSchema,
type TalkRealtimeRelayOkResult,
@@ -79,6 +107,38 @@ import {
TalkRealtimeSessionParamsSchema,
type TalkRealtimeSessionResult,
TalkRealtimeSessionResultSchema,
type TalkRealtimeToolCallParams,
TalkRealtimeToolCallParamsSchema,
type TalkRealtimeToolCallResult,
TalkRealtimeToolCallResultSchema,
type TalkSessionCloseParams,
TalkSessionCloseParamsSchema,
type TalkSessionControlParams,
TalkSessionControlParamsSchema,
type TalkSessionControlResult,
TalkSessionControlResultSchema,
type TalkSessionCreateParams,
TalkSessionCreateParamsSchema,
type TalkSessionCreateResult,
TalkSessionCreateResultSchema,
type TalkSessionInputAudioParams,
TalkSessionInputAudioParamsSchema,
type TalkSessionOkResult,
TalkSessionOkResultSchema,
type TalkSessionToolResultParams,
TalkSessionToolResultParamsSchema,
type TalkTranscriptionRelayAudioParams,
TalkTranscriptionRelayAudioParamsSchema,
type TalkTranscriptionRelayCancelParams,
TalkTranscriptionRelayCancelParamsSchema,
type TalkTranscriptionRelayOkResult,
TalkTranscriptionRelayOkResultSchema,
type TalkTranscriptionRelayStopParams,
TalkTranscriptionRelayStopParamsSchema,
type TalkTranscriptionSessionParams,
TalkTranscriptionSessionParamsSchema,
type TalkTranscriptionSessionResult,
TalkTranscriptionSessionResultSchema,
type TalkSpeakParams,
TalkSpeakParamsSchema,
type TalkSpeakResult,
@@ -532,17 +592,82 @@ export const validateWizardNextParams = ajv.compile<WizardNextParams>(WizardNext
export const validateWizardCancelParams = ajv.compile<WizardCancelParams>(WizardCancelParamsSchema);
export const validateWizardStatusParams = ajv.compile<WizardStatusParams>(WizardStatusParamsSchema);
export const validateTalkModeParams = ajv.compile<TalkModeParams>(TalkModeParamsSchema);
export const validateTalkEvent = ajv.compile<TalkEvent>(TalkEventSchema);
export const validateTalkCatalogParams = ajv.compile<TalkCatalogParams>(TalkCatalogParamsSchema);
export const validateTalkCatalogResult = ajv.compile<TalkCatalogResult>(TalkCatalogResultSchema);
export const validateTalkConfigParams = ajv.compile<TalkConfigParams>(TalkConfigParamsSchema);
export const validateTalkConfigResult = ajv.compile<TalkConfigResult>(TalkConfigResultSchema);
// Talk handoff validators: one compiled validator per params/result schema for handoff
// create, join, revoke, and the turn start/end/cancel lifecycle. The three turn param
// schemas share a single result validator (`validateTalkHandoffTurnResult`).
export const validateTalkHandoffCreateParams = ajv.compile<TalkHandoffCreateParams>(
TalkHandoffCreateParamsSchema,
);
export const validateTalkHandoffCreateResult = ajv.compile<TalkHandoffCreateResult>(
TalkHandoffCreateResultSchema,
);
export const validateTalkHandoffJoinParams = ajv.compile<TalkHandoffJoinParams>(
TalkHandoffJoinParamsSchema,
);
export const validateTalkHandoffJoinResult = ajv.compile<TalkHandoffJoinResult>(
TalkHandoffJoinResultSchema,
);
export const validateTalkHandoffRevokeParams = ajv.compile<TalkHandoffRevokeParams>(
TalkHandoffRevokeParamsSchema,
);
export const validateTalkHandoffRevokeResult = ajv.compile<TalkHandoffRevokeResult>(
TalkHandoffRevokeResultSchema,
);
export const validateTalkHandoffTurnStartParams = ajv.compile<TalkHandoffTurnStartParams>(
TalkHandoffTurnStartParamsSchema,
);
export const validateTalkHandoffTurnEndParams = ajv.compile<TalkHandoffTurnEndParams>(
TalkHandoffTurnEndParamsSchema,
);
export const validateTalkHandoffTurnCancelParams = ajv.compile<TalkHandoffTurnCancelParams>(
TalkHandoffTurnCancelParamsSchema,
);
export const validateTalkHandoffTurnResult = ajv.compile<TalkHandoffTurnResult>(
TalkHandoffTurnResultSchema,
);
export const validateTalkRealtimeSessionParams = ajv.compile<TalkRealtimeSessionParams>(
TalkRealtimeSessionParamsSchema,
);
export const validateTalkRealtimeSessionResult = ajv.compile<TalkRealtimeSessionResult>(
TalkRealtimeSessionResultSchema,
);
export const validateTalkRealtimeToolCallParams = ajv.compile<TalkRealtimeToolCallParams>(
TalkRealtimeToolCallParamsSchema,
);
export const validateTalkRealtimeToolCallResult = ajv.compile<TalkRealtimeToolCallResult>(
TalkRealtimeToolCallResultSchema,
);
// Unified Talk session validators: compiled validators for the create, inputAudio, control,
// toolResult, and close param schemas. Create and control each have a dedicated result
// validator; `validateTalkSessionOkResult` validates the separate TalkSessionOkResult schema.
export const validateTalkSessionCreateParams = ajv.compile<TalkSessionCreateParams>(
TalkSessionCreateParamsSchema,
);
export const validateTalkSessionCreateResult = ajv.compile<TalkSessionCreateResult>(
TalkSessionCreateResultSchema,
);
export const validateTalkSessionInputAudioParams = ajv.compile<TalkSessionInputAudioParams>(
TalkSessionInputAudioParamsSchema,
);
export const validateTalkSessionControlParams = ajv.compile<TalkSessionControlParams>(
TalkSessionControlParamsSchema,
);
export const validateTalkSessionControlResult = ajv.compile<TalkSessionControlResult>(
TalkSessionControlResultSchema,
);
export const validateTalkSessionToolResultParams = ajv.compile<TalkSessionToolResultParams>(
TalkSessionToolResultParamsSchema,
);
export const validateTalkSessionCloseParams = ajv.compile<TalkSessionCloseParams>(
TalkSessionCloseParamsSchema,
);
export const validateTalkSessionOkResult =
ajv.compile<TalkSessionOkResult>(TalkSessionOkResultSchema);
export const validateTalkRealtimeRelayAudioParams = ajv.compile<TalkRealtimeRelayAudioParams>(
TalkRealtimeRelayAudioParamsSchema,
);
export const validateTalkRealtimeRelayCancelParams = ajv.compile<TalkRealtimeRelayCancelParams>(
TalkRealtimeRelayCancelParamsSchema,
);
export const validateTalkRealtimeRelayMarkParams = ajv.compile<TalkRealtimeRelayMarkParams>(
TalkRealtimeRelayMarkParamsSchema,
);
@@ -551,6 +676,21 @@ export const validateTalkRealtimeRelayStopParams = ajv.compile<TalkRealtimeRelay
);
export const validateTalkRealtimeRelayToolResultParams =
ajv.compile<TalkRealtimeRelayToolResultParams>(TalkRealtimeRelayToolResultParamsSchema);
// Talk transcription validators: compiled validators for the transcription session
// params/result schemas and for the relay audio, cancel, and stop param schemas, plus the
// TalkTranscriptionRelayOkResult schema.
export const validateTalkTranscriptionSessionParams = ajv.compile<TalkTranscriptionSessionParams>(
TalkTranscriptionSessionParamsSchema,
);
export const validateTalkTranscriptionSessionResult = ajv.compile<TalkTranscriptionSessionResult>(
TalkTranscriptionSessionResultSchema,
);
export const validateTalkTranscriptionRelayAudioParams =
ajv.compile<TalkTranscriptionRelayAudioParams>(TalkTranscriptionRelayAudioParamsSchema);
export const validateTalkTranscriptionRelayCancelParams =
ajv.compile<TalkTranscriptionRelayCancelParams>(TalkTranscriptionRelayCancelParamsSchema);
export const validateTalkTranscriptionRelayStopParams =
ajv.compile<TalkTranscriptionRelayStopParams>(TalkTranscriptionRelayStopParamsSchema);
export const validateTalkTranscriptionRelayOkResult = ajv.compile<TalkTranscriptionRelayOkResult>(
TalkTranscriptionRelayOkResultSchema,
);
export const validateTalkSpeakParams = ajv.compile<TalkSpeakParams>(TalkSpeakParamsSchema);
export const validateTalkSpeakResult = ajv.compile<TalkSpeakResult>(TalkSpeakResultSchema);
export const validateChannelsStatusParams = ajv.compile<ChannelsStatusParams>(
@@ -765,15 +905,45 @@ export {
WizardNextResultSchema,
WizardStartResultSchema,
WizardStatusResultSchema,
TalkEventSchema,
TalkCatalogParamsSchema,
TalkCatalogResultSchema,
TalkConfigParamsSchema,
TalkConfigResultSchema,
TalkHandoffCreateParamsSchema,
TalkHandoffCreateResultSchema,
TalkHandoffJoinParamsSchema,
TalkHandoffJoinResultSchema,
TalkHandoffRevokeParamsSchema,
TalkHandoffRevokeResultSchema,
TalkHandoffTurnStartParamsSchema,
TalkHandoffTurnEndParamsSchema,
TalkHandoffTurnCancelParamsSchema,
TalkHandoffTurnResultSchema,
TalkRealtimeSessionParamsSchema,
TalkRealtimeSessionResultSchema,
TalkRealtimeToolCallParamsSchema,
TalkRealtimeToolCallResultSchema,
TalkSessionCreateParamsSchema,
TalkSessionCreateResultSchema,
TalkSessionInputAudioParamsSchema,
TalkSessionControlParamsSchema,
TalkSessionControlResultSchema,
TalkSessionToolResultParamsSchema,
TalkSessionCloseParamsSchema,
TalkSessionOkResultSchema,
TalkRealtimeRelayAudioParamsSchema,
TalkRealtimeRelayCancelParamsSchema,
TalkRealtimeRelayMarkParamsSchema,
TalkRealtimeRelayStopParamsSchema,
TalkRealtimeRelayToolResultParamsSchema,
TalkRealtimeRelayOkResultSchema,
TalkTranscriptionSessionParamsSchema,
TalkTranscriptionSessionResultSchema,
TalkTranscriptionRelayAudioParamsSchema,
TalkTranscriptionRelayCancelParamsSchema,
TalkTranscriptionRelayStopParamsSchema,
TalkTranscriptionRelayOkResultSchema,
TalkSpeakParamsSchema,
TalkSpeakResultSchema,
ChannelsStatusParamsSchema,
@@ -879,15 +1049,44 @@ export type {
WizardNextResult,
WizardStartResult,
WizardStatusResult,
TalkCatalogParams,
TalkCatalogResult,
TalkConfigParams,
TalkConfigResult,
TalkHandoffCreateParams,
TalkHandoffCreateResult,
TalkHandoffJoinParams,
TalkHandoffJoinResult,
TalkHandoffRevokeParams,
TalkHandoffRevokeResult,
TalkHandoffTurnStartParams,
TalkHandoffTurnEndParams,
TalkHandoffTurnCancelParams,
TalkHandoffTurnResult,
TalkRealtimeSessionParams,
TalkRealtimeSessionResult,
TalkRealtimeToolCallParams,
TalkRealtimeToolCallResult,
TalkSessionCreateParams,
TalkSessionCreateResult,
TalkSessionInputAudioParams,
TalkSessionControlParams,
TalkSessionControlResult,
TalkSessionToolResultParams,
TalkSessionCloseParams,
TalkSessionOkResult,
TalkRealtimeRelayAudioParams,
TalkRealtimeRelayCancelParams,
TalkRealtimeRelayMarkParams,
TalkRealtimeRelayStopParams,
TalkRealtimeRelayToolResultParams,
TalkRealtimeRelayOkResult,
TalkTranscriptionSessionParams,
TalkTranscriptionSessionResult,
TalkTranscriptionRelayAudioParams,
TalkTranscriptionRelayCancelParams,
TalkTranscriptionRelayStopParams,
TalkTranscriptionRelayOkResult,
TalkSpeakParams,
TalkSpeakResult,
TalkModeParams,

View File

@@ -36,12 +36,408 @@ export const TalkSpeakParamsSchema = Type.Object(
{ additionalProperties: false },
);
// Literal union of the three Talk modes: "realtime", "stt-tts", "transcription".
const TalkModeSchema = Type.Union([
Type.Literal("realtime"),
Type.Literal("stt-tts"),
Type.Literal("transcription"),
]);
// Literal union of the four Talk transports: "webrtc", "provider-websocket",
// "gateway-relay", "managed-room".
const TalkTransportSchema = Type.Union([
Type.Literal("webrtc"),
Type.Literal("provider-websocket"),
Type.Literal("gateway-relay"),
Type.Literal("managed-room"),
]);
// Literal union of the Talk "brain" selectors: "agent-consult", "direct-tools",
// or "none".
const TalkBrainSchema = Type.Union([
Type.Literal("agent-consult"),
Type.Literal("direct-tools"),
Type.Literal("none"),
]);
// Literal union of every Talk event type name accepted in TalkEventSchema.type.
// Names are dotted and fall into these families: session.*, turn.*, capture.*,
// input.audio.*, transcript.*, output.text.*, output.audio.*, tool.*, plus
// usage.metrics, latency.metrics and health.changed.
const TalkEventTypeSchema = Type.Union([
Type.Literal("session.started"),
Type.Literal("session.ready"),
Type.Literal("session.closed"),
Type.Literal("session.error"),
Type.Literal("session.replaced"),
Type.Literal("turn.started"),
Type.Literal("turn.ended"),
Type.Literal("turn.cancelled"),
Type.Literal("capture.started"),
Type.Literal("capture.stopped"),
Type.Literal("capture.cancelled"),
Type.Literal("capture.once"),
Type.Literal("input.audio.delta"),
Type.Literal("input.audio.committed"),
Type.Literal("transcript.delta"),
Type.Literal("transcript.done"),
Type.Literal("output.text.delta"),
Type.Literal("output.text.done"),
Type.Literal("output.audio.started"),
Type.Literal("output.audio.delta"),
Type.Literal("output.audio.done"),
Type.Literal("tool.call"),
Type.Literal("tool.progress"),
Type.Literal("tool.result"),
Type.Literal("tool.error"),
Type.Literal("usage.metrics"),
Type.Literal("latency.metrics"),
Type.Literal("health.changed"),
]);
// Event types for which TalkEventSchema's conditional rule requires `turnId`.
// Covers the turn.*, input.audio.*, transcript.*, output.text.*,
// output.audio.* and tool.* event names.
const TURN_SCOPED_TALK_EVENT_TYPES = [
"turn.started",
"turn.ended",
"turn.cancelled",
"input.audio.delta",
"input.audio.committed",
"transcript.delta",
"transcript.done",
"output.text.delta",
"output.text.done",
"output.audio.started",
"output.audio.delta",
"output.audio.done",
"tool.call",
"tool.progress",
"tool.result",
"tool.error",
];
// Event types for which TalkEventSchema's conditional rule requires `captureId`.
const CAPTURE_SCOPED_TALK_EVENT_TYPES = [
"capture.started",
"capture.stopped",
"capture.cancelled",
"capture.once",
];
/**
 * Builds the consequent half of a JSON Schema conditional: a single-key object
 * whose `then` entry marks the given property names as required.
 *
 * @param properties Property names to list under `required`; the array is
 *   stored by reference, not copied.
 * @returns An object of the form `{ then: { required: properties } }`, meant to
 *   be spread next to an `if` clause.
 */
function requireJsonSchemaProperties(properties: string[]): Record<string, { required: string[] }> {
  // The keyword is assembled at runtime instead of being written as a literal key.
  // NOTE(review): presumably this keeps thenable-object lint rules quiet — confirm.
  const keyword = ["th", "en"].join("");
  const clause: Record<string, { required: string[] }> = {};
  clause[keyword] = { required: properties };
  return clause;
}
/**
 * Envelope schema for a single Talk event.
 *
 * Required fields are `id`, `type`, `sessionId`, `seq` (integer >= 1),
 * `timestamp`, `mode`, `transport`, `brain` and `payload` (any value). The
 * remaining fields are optional correlation ids and flags. Unknown properties
 * are rejected.
 *
 * On top of the property list, two JSON Schema conditionals in `allOf` make
 * `turnId` mandatory for turn-scoped event types and `captureId` mandatory for
 * capture-scoped event types.
 */
export const TalkEventSchema = Type.Object(
{
id: NonEmptyString,
type: TalkEventTypeSchema,
sessionId: NonEmptyString,
turnId: Type.Optional(Type.String()),
captureId: Type.Optional(Type.String()),
seq: Type.Integer({ minimum: 1 }),
timestamp: NonEmptyString,
mode: TalkModeSchema,
transport: TalkTransportSchema,
brain: TalkBrainSchema,
provider: Type.Optional(Type.String()),
final: Type.Optional(Type.Boolean()),
callId: Type.Optional(Type.String()),
itemId: Type.Optional(Type.String()),
parentId: Type.Optional(Type.String()),
payload: Type.Unknown(),
},
{
additionalProperties: false,
allOf: [
{
// if `type` is one of the turn-scoped names, then `turnId` is required.
if: {
properties: { type: { enum: TURN_SCOPED_TALK_EVENT_TYPES } },
required: ["type"],
},
...requireJsonSchemaProperties(["turnId"]),
},
{
// if `type` is one of the capture-scoped names, then `captureId` is required.
if: {
properties: { type: { enum: CAPTURE_SCOPED_TALK_EVENT_TYPES } },
required: ["type"],
},
...requireJsonSchemaProperties(["captureId"]),
},
],
},
);
/**
 * Params schema for a realtime Talk session request. Every field is optional:
 * free-form `sessionKey`, `provider`, `model`, `voice`, plus the `mode`,
 * `transport` and `brain` selectors. Unknown properties are rejected.
 */
export const TalkRealtimeSessionParamsSchema = Type.Object(
{
sessionKey: Type.Optional(Type.String()),
provider: Type.Optional(Type.String()),
model: Type.Optional(Type.String()),
voice: Type.Optional(Type.String()),
mode: Type.Optional(TalkModeSchema),
transport: Type.Optional(TalkTransportSchema),
brain: Type.Optional(TalkBrainSchema),
},
{ additionalProperties: false },
);
/**
 * Params schema for a realtime tool call. `sessionKey`, `callId` and `name`
 * are required; `args` may be any value and `relaySessionId` is optional.
 * Unknown properties are rejected.
 */
export const TalkRealtimeToolCallParamsSchema = Type.Object(
{
sessionKey: NonEmptyString,
callId: NonEmptyString,
name: NonEmptyString,
args: Type.Optional(Type.Unknown()),
relaySessionId: Type.Optional(NonEmptyString),
},
{ additionalProperties: false },
);
/**
 * Result schema for a realtime tool call: a required `runId` and
 * `idempotencyKey`, nothing else.
 */
export const TalkRealtimeToolCallResultSchema = Type.Object(
{
runId: NonEmptyString,
idempotencyKey: NonEmptyString,
},
{ additionalProperties: false },
);
/**
 * Params schema for creating a unified Talk session. All fields are optional.
 * `ttlMs`, when present, must be an integer between 1000 and 3600000
 * inclusive (one second to one hour, given the millisecond naming).
 * Unknown properties are rejected.
 */
export const TalkSessionCreateParamsSchema = Type.Object(
{
sessionKey: Type.Optional(Type.String()),
provider: Type.Optional(Type.String()),
model: Type.Optional(Type.String()),
voice: Type.Optional(Type.String()),
mode: Type.Optional(TalkModeSchema),
transport: Type.Optional(TalkTransportSchema),
brain: Type.Optional(TalkBrainSchema),
ttlMs: Type.Optional(Type.Integer({ minimum: 1000, maximum: 3600000 })),
},
{ additionalProperties: false },
);
/**
 * Params schema for pushing input audio into a Talk session: the target
 * `sessionId`, the audio as `audioBase64`, and an optional numeric
 * `timestamp`. Unknown properties are rejected.
 */
export const TalkSessionInputAudioParamsSchema = Type.Object(
{
sessionId: NonEmptyString,
audioBase64: NonEmptyString,
timestamp: Type.Optional(Type.Number()),
},
{ additionalProperties: false },
);
/**
 * Params schema for a Talk session control message. `type` selects one of
 * "turn.start", "turn.end" or "turn.cancel"; `turnId` and `reason` are
 * optional strings. Unknown properties are rejected.
 */
export const TalkSessionControlParamsSchema = Type.Object(
{
sessionId: NonEmptyString,
type: Type.Union([
Type.Literal("turn.start"),
Type.Literal("turn.end"),
Type.Literal("turn.cancel"),
]),
turnId: Type.Optional(Type.String()),
reason: Type.Optional(Type.String()),
},
{ additionalProperties: false },
);
/**
 * Params schema for submitting a tool result to a Talk session. `result` is
 * required but may hold any value; it is paired with `sessionId` and `callId`.
 */
export const TalkSessionToolResultParamsSchema = Type.Object(
{
sessionId: NonEmptyString,
callId: NonEmptyString,
result: Type.Unknown(),
},
{ additionalProperties: false },
);
/** Params schema for closing a Talk session: only the required `sessionId`. */
export const TalkSessionCloseParamsSchema = Type.Object(
{
sessionId: NonEmptyString,
},
{ additionalProperties: false },
);
/**
 * Params schema for creating a Talk handoff. Only `sessionKey` is required;
 * the other fields are optional. `ttlMs`, when present, must be an integer
 * between 1000 and 3600000 inclusive. Unknown properties are rejected.
 */
export const TalkHandoffCreateParamsSchema = Type.Object(
{
sessionKey: NonEmptyString,
sessionId: Type.Optional(Type.String()),
channel: Type.Optional(Type.String()),
target: Type.Optional(Type.String()),
provider: Type.Optional(Type.String()),
model: Type.Optional(Type.String()),
voice: Type.Optional(Type.String()),
mode: Type.Optional(TalkModeSchema),
transport: Type.Optional(TalkTransportSchema),
brain: Type.Optional(TalkBrainSchema),
ttlMs: Type.Optional(Type.Integer({ minimum: 1000, maximum: 3600000 })),
},
{ additionalProperties: false },
);
// Room state embedded in handoff records: optional `activeClientId` and
// `activeTurnId`, plus a required `recentTalkEvents` array of Talk events.
const TalkHandoffRoomSchema = Type.Object(
{
activeClientId: Type.Optional(Type.String()),
activeTurnId: Type.Optional(Type.String()),
recentTalkEvents: Type.Array(TalkEventSchema),
},
{ additionalProperties: false },
);
/**
 * Result schema for handoff creation. It has the same fields as
 * TalkHandoffPublicRecordSchema plus a required `token`. `mode`, `transport`
 * and `brain` are required here, unlike in the create params.
 */
export const TalkHandoffCreateResultSchema = Type.Object(
{
id: NonEmptyString,
roomId: NonEmptyString,
roomUrl: NonEmptyString,
token: NonEmptyString,
sessionKey: NonEmptyString,
sessionId: Type.Optional(Type.String()),
channel: Type.Optional(Type.String()),
target: Type.Optional(Type.String()),
provider: Type.Optional(Type.String()),
model: Type.Optional(Type.String()),
voice: Type.Optional(Type.String()),
mode: TalkModeSchema,
transport: TalkTransportSchema,
brain: TalkBrainSchema,
createdAt: Type.Number(),
expiresAt: Type.Number(),
room: TalkHandoffRoomSchema,
},
{ additionalProperties: false },
);
// Handoff record shape without the `token` field. It is otherwise
// field-for-field identical to TalkHandoffCreateResultSchema, and is reused as
// the join result and inside turn results.
const TalkHandoffPublicRecordSchema = Type.Object(
{
id: NonEmptyString,
roomId: NonEmptyString,
roomUrl: NonEmptyString,
sessionKey: NonEmptyString,
sessionId: Type.Optional(Type.String()),
channel: Type.Optional(Type.String()),
target: Type.Optional(Type.String()),
provider: Type.Optional(Type.String()),
model: Type.Optional(Type.String()),
voice: Type.Optional(Type.String()),
mode: TalkModeSchema,
transport: TalkTransportSchema,
brain: TalkBrainSchema,
createdAt: Type.Number(),
expiresAt: Type.Number(),
room: TalkHandoffRoomSchema,
},
{ additionalProperties: false },
);
/** Params schema for joining a handoff: the handoff `id` and its `token`, both required. */
export const TalkHandoffJoinParamsSchema = Type.Object(
{
id: NonEmptyString,
token: NonEmptyString,
},
{ additionalProperties: false },
);
/** Join result is the public handoff record; this is an alias of the same schema object. */
export const TalkHandoffJoinResultSchema = TalkHandoffPublicRecordSchema;
/** Params schema for revoking a handoff: only the required handoff `id`. */
export const TalkHandoffRevokeParamsSchema = Type.Object(
{
id: NonEmptyString,
},
{ additionalProperties: false },
);
/** Result schema for a revoke call: two required booleans, `ok` and `revoked`. */
export const TalkHandoffRevokeResultSchema = Type.Object(
{
ok: Type.Boolean(),
revoked: Type.Boolean(),
},
{ additionalProperties: false },
);
/**
 * Params schema for starting a handoff turn: required `id` and `token`, with
 * an optional `turnId`. Same shape as TalkHandoffTurnEndParamsSchema.
 */
export const TalkHandoffTurnStartParamsSchema = Type.Object(
{
id: NonEmptyString,
token: NonEmptyString,
turnId: Type.Optional(Type.String()),
},
{ additionalProperties: false },
);
/**
 * Params schema for ending a handoff turn: required `id` and `token`, with an
 * optional `turnId`. Same shape as TalkHandoffTurnStartParamsSchema.
 */
export const TalkHandoffTurnEndParamsSchema = Type.Object(
{
id: NonEmptyString,
token: NonEmptyString,
turnId: Type.Optional(Type.String()),
},
{ additionalProperties: false },
);
/**
 * Params schema for cancelling a handoff turn: required `id` and `token`, with
 * optional `turnId` and free-form `reason`.
 */
export const TalkHandoffTurnCancelParamsSchema = Type.Object(
{
id: NonEmptyString,
token: NonEmptyString,
turnId: Type.Optional(Type.String()),
reason: Type.Optional(Type.String()),
},
{ additionalProperties: false },
);
/**
 * Result schema shared by the handoff turn operations: an `ok` flag, the
 * public handoff `record` (no token), a required `turnId`, and the `events`
 * array of Talk events.
 */
export const TalkHandoffTurnResultSchema = Type.Object(
{
ok: Type.Boolean(),
record: TalkHandoffPublicRecordSchema,
turnId: NonEmptyString,
events: Type.Array(TalkEventSchema),
},
{ additionalProperties: false },
);
/** Params schema for the Talk catalog request: an object with no properties allowed. */
export const TalkCatalogParamsSchema = Type.Object({}, { additionalProperties: false });
// One audio format descriptor: encoding ("pcm16" or "g711_ulaw"), sample rate
// and channel count (both integers >= 1). Shared by the catalog's input and
// output format lists so the two cannot drift apart.
const TalkCatalogAudioFormatSchema = Type.Object(
  {
    encoding: Type.Union([Type.Literal("pcm16"), Type.Literal("g711_ulaw")]),
    sampleRateHz: Type.Integer({ minimum: 1 }),
    channels: Type.Integer({ minimum: 1 }),
  },
  { additionalProperties: false },
);
// One provider entry in the Talk catalog. `id`, `label` and `configured` are
// required; everything else is an optional capability description (models,
// voices, supported modes/transports/brains, audio formats, feature flags).
// Unknown properties are rejected.
const TalkCatalogProviderSchema = Type.Object(
  {
    id: NonEmptyString,
    label: NonEmptyString,
    configured: Type.Boolean(),
    models: Type.Optional(Type.Array(Type.String())),
    voices: Type.Optional(Type.Array(Type.String())),
    defaultModel: Type.Optional(Type.String()),
    modes: Type.Optional(Type.Array(TalkModeSchema)),
    transports: Type.Optional(Type.Array(TalkTransportSchema)),
    brains: Type.Optional(Type.Array(TalkBrainSchema)),
    inputAudioFormats: Type.Optional(Type.Array(TalkCatalogAudioFormatSchema)),
    outputAudioFormats: Type.Optional(Type.Array(TalkCatalogAudioFormatSchema)),
    supportsBrowserSession: Type.Optional(Type.Boolean()),
    supportsBargeIn: Type.Optional(Type.Boolean()),
    supportsToolCalls: Type.Optional(Type.Boolean()),
    supportsVideoFrames: Type.Optional(Type.Boolean()),
    supportsSessionResumption: Type.Optional(Type.Boolean()),
  },
  { additionalProperties: false },
);
// A group of catalog providers: the required `providers` list and an optional
// `activeProvider` string.
const TalkCatalogProviderGroupSchema = Type.Object(
{
activeProvider: Type.Optional(Type.String()),
providers: Type.Array(TalkCatalogProviderSchema),
},
{ additionalProperties: false },
);
/**
 * Result schema for the Talk catalog. It lists the `modes`, `transports` and
 * `brains` values, followed by one provider group each for `speech`,
 * `transcription` and `realtime`. All six fields are required.
 */
export const TalkCatalogResultSchema = Type.Object(
{
modes: Type.Array(TalkModeSchema),
transports: Type.Array(TalkTransportSchema),
brains: Type.Array(TalkBrainSchema),
speech: TalkCatalogProviderGroupSchema,
transcription: TalkCatalogProviderGroupSchema,
realtime: TalkCatalogProviderGroupSchema,
},
{ additionalProperties: false },
);
@@ -70,6 +466,14 @@ export const TalkRealtimeRelayStopParamsSchema = Type.Object(
{ additionalProperties: false },
);
/**
 * Params schema for cancelling on a realtime relay session: the required
 * `relaySessionId` and an optional free-form `reason`.
 */
export const TalkRealtimeRelayCancelParamsSchema = Type.Object(
{
relaySessionId: NonEmptyString,
reason: Type.Optional(Type.String()),
},
{ additionalProperties: false },
);
export const TalkRealtimeRelayToolResultParamsSchema = Type.Object(
{
relaySessionId: NonEmptyString,
@@ -86,6 +490,61 @@ export const TalkRealtimeRelayOkResultSchema = Type.Object(
{ additionalProperties: false },
);
/** Params schema for a transcription session request: only an optional `provider`. */
export const TalkTranscriptionSessionParamsSchema = Type.Object(
{
provider: Type.Optional(Type.String()),
},
{ additionalProperties: false },
);
/**
 * Result schema for a transcription session. `mode` is pinned to
 * "transcription" and `transport` to "gateway-relay". The `audio` contract
 * fixes the input encoding to "pcm16" and carries the input sample rate
 * (integer >= 1). `expiresAt` is a required number.
 */
export const TalkTranscriptionSessionResultSchema = Type.Object(
{
provider: NonEmptyString,
mode: Type.Literal("transcription"),
transport: Type.Literal("gateway-relay"),
transcriptionSessionId: NonEmptyString,
audio: Type.Object(
{
inputEncoding: Type.Literal("pcm16"),
inputSampleRateHz: Type.Integer({ minimum: 1 }),
},
{ additionalProperties: false },
),
expiresAt: Type.Number(),
},
{ additionalProperties: false },
);
/**
 * Params schema for sending audio to a transcription relay session:
 * `transcriptionSessionId` and `audioBase64`, both required.
 */
export const TalkTranscriptionRelayAudioParamsSchema = Type.Object(
{
transcriptionSessionId: NonEmptyString,
audioBase64: NonEmptyString,
},
{ additionalProperties: false },
);
/** Params schema for stopping a transcription relay session: only `transcriptionSessionId`. */
export const TalkTranscriptionRelayStopParamsSchema = Type.Object(
{
transcriptionSessionId: NonEmptyString,
},
{ additionalProperties: false },
);
/**
 * Params schema for cancelling on a transcription relay session: the required
 * `transcriptionSessionId` and an optional free-form `reason`.
 */
export const TalkTranscriptionRelayCancelParamsSchema = Type.Object(
{
transcriptionSessionId: NonEmptyString,
reason: Type.Optional(Type.String()),
},
{ additionalProperties: false },
);
/** Acknowledgement result for transcription relay calls: a single required `ok` boolean. */
export const TalkTranscriptionRelayOkResultSchema = Type.Object(
{
ok: Type.Boolean(),
},
{ additionalProperties: false },
);
const BrowserRealtimeAudioContractSchema = Type.Object(
{
inputEncoding: Type.Union([Type.Literal("pcm16"), Type.Literal("g711_ulaw")]),
@@ -96,10 +555,47 @@ const BrowserRealtimeAudioContractSchema = Type.Object(
{ additionalProperties: false },
);
/**
 * Result schema for unified Talk session creation. `sessionId`, `mode`,
 * `transport` and `brain` are always present. The optional identifiers cover
 * the different session kinds: `relaySessionId`, `transcriptionSessionId`, or
 * the handoff/room fields (`handoffId`, `roomId`, `roomUrl`, `token`).
 * `audio` is left unconstrained here. Unknown properties are rejected.
 */
export const TalkSessionCreateResultSchema = Type.Object(
{
sessionId: NonEmptyString,
provider: Type.Optional(Type.String()),
mode: TalkModeSchema,
transport: TalkTransportSchema,
brain: TalkBrainSchema,
relaySessionId: Type.Optional(NonEmptyString),
transcriptionSessionId: Type.Optional(NonEmptyString),
handoffId: Type.Optional(NonEmptyString),
roomId: Type.Optional(NonEmptyString),
roomUrl: Type.Optional(NonEmptyString),
token: Type.Optional(NonEmptyString),
audio: Type.Optional(Type.Unknown()),
model: Type.Optional(Type.String()),
voice: Type.Optional(Type.String()),
expiresAt: Type.Optional(Type.Number()),
},
{ additionalProperties: false },
);
/**
 * Result schema for a Talk session control call: a required `ok` flag, with an
 * optional `turnId` and an optional `events` array of Talk events.
 */
export const TalkSessionControlResultSchema = Type.Object(
{
ok: Type.Boolean(),
turnId: Type.Optional(Type.String()),
events: Type.Optional(Type.Array(TalkEventSchema)),
},
{ additionalProperties: false },
);
/** Acknowledgement result for Talk session calls: a single required `ok` boolean. */
export const TalkSessionOkResultSchema = Type.Object(
{
ok: Type.Boolean(),
},
{ additionalProperties: false },
);
const BrowserRealtimeWebRtcSdpSessionSchema = Type.Object(
{
provider: NonEmptyString,
transport: Type.Optional(Type.Literal("webrtc-sdp")),
transport: Type.Literal("webrtc"),
clientSecret: NonEmptyString,
offerUrl: Type.Optional(Type.String()),
offerHeaders: Type.Optional(Type.Record(Type.String(), Type.String())),
@@ -113,7 +609,7 @@ const BrowserRealtimeWebRtcSdpSessionSchema = Type.Object(
const BrowserRealtimeJsonPcmWebSocketSessionSchema = Type.Object(
{
provider: NonEmptyString,
transport: Type.Literal("json-pcm-websocket"),
transport: Type.Literal("provider-websocket"),
protocol: NonEmptyString,
clientSecret: NonEmptyString,
websocketUrl: NonEmptyString,
@@ -167,6 +663,19 @@ const TalkProviderConfigSchema = Type.Object(talkProviderFieldSchemas, {
additionalProperties: true,
});
// Shape of the optional `realtime` section of the Talk config. All fields are
// optional: provider selection, a per-provider config map, and model, voice,
// mode, transport and brain. Unlike the provider config objects it contains,
// unknown properties at this level are rejected.
const TalkRealtimeConfigSchema = Type.Object(
{
provider: Type.Optional(Type.String()),
providers: Type.Optional(Type.Record(Type.String(), TalkProviderConfigSchema)),
model: Type.Optional(Type.String()),
voice: Type.Optional(Type.String()),
mode: Type.Optional(TalkModeSchema),
transport: Type.Optional(TalkTransportSchema),
brain: Type.Optional(TalkBrainSchema),
},
{ additionalProperties: false },
);
const ResolvedTalkConfigSchema = Type.Object(
{
provider: Type.String(),
@@ -179,7 +688,8 @@ const TalkConfigSchema = Type.Object(
{
provider: Type.Optional(Type.String()),
providers: Type.Optional(Type.Record(Type.String(), TalkProviderConfigSchema)),
resolved: ResolvedTalkConfigSchema,
realtime: Type.Optional(TalkRealtimeConfigSchema),
resolved: Type.Optional(ResolvedTalkConfigSchema),
speechLocale: Type.Optional(Type.String()),
interruptOnSpeech: Type.Optional(Type.Boolean()),
silenceTimeoutMs: Type.Optional(Type.Integer({ minimum: 1 })),

View File

@@ -65,15 +65,45 @@ import {
ChannelsStartParamsSchema,
ChannelsStopParamsSchema,
ChannelsLogoutParamsSchema,
TalkEventSchema,
TalkCatalogParamsSchema,
TalkCatalogResultSchema,
TalkConfigParamsSchema,
TalkConfigResultSchema,
TalkHandoffCreateParamsSchema,
TalkHandoffCreateResultSchema,
TalkHandoffJoinParamsSchema,
TalkHandoffJoinResultSchema,
TalkHandoffRevokeParamsSchema,
TalkHandoffRevokeResultSchema,
TalkHandoffTurnCancelParamsSchema,
TalkHandoffTurnEndParamsSchema,
TalkHandoffTurnResultSchema,
TalkHandoffTurnStartParamsSchema,
TalkRealtimeRelayAudioParamsSchema,
TalkRealtimeRelayCancelParamsSchema,
TalkRealtimeRelayMarkParamsSchema,
TalkRealtimeRelayOkResultSchema,
TalkRealtimeRelayStopParamsSchema,
TalkRealtimeRelayToolResultParamsSchema,
TalkRealtimeSessionParamsSchema,
TalkRealtimeSessionResultSchema,
TalkRealtimeToolCallParamsSchema,
TalkRealtimeToolCallResultSchema,
TalkSessionCloseParamsSchema,
TalkSessionControlParamsSchema,
TalkSessionControlResultSchema,
TalkSessionCreateParamsSchema,
TalkSessionCreateResultSchema,
TalkSessionInputAudioParamsSchema,
TalkSessionOkResultSchema,
TalkSessionToolResultParamsSchema,
TalkTranscriptionRelayAudioParamsSchema,
TalkTranscriptionRelayCancelParamsSchema,
TalkTranscriptionRelayOkResultSchema,
TalkTranscriptionRelayStopParamsSchema,
TalkTranscriptionSessionParamsSchema,
TalkTranscriptionSessionResultSchema,
TalkSpeakParamsSchema,
TalkSpeakResultSchema,
ChannelsStatusParamsSchema,
@@ -333,15 +363,45 @@ export const ProtocolSchemas = {
WizardStartResult: WizardStartResultSchema,
WizardStatusResult: WizardStatusResultSchema,
TalkModeParams: TalkModeParamsSchema,
TalkEvent: TalkEventSchema,
TalkCatalogParams: TalkCatalogParamsSchema,
TalkCatalogResult: TalkCatalogResultSchema,
TalkConfigParams: TalkConfigParamsSchema,
TalkConfigResult: TalkConfigResultSchema,
TalkHandoffCreateParams: TalkHandoffCreateParamsSchema,
TalkHandoffCreateResult: TalkHandoffCreateResultSchema,
TalkHandoffJoinParams: TalkHandoffJoinParamsSchema,
TalkHandoffJoinResult: TalkHandoffJoinResultSchema,
TalkHandoffRevokeParams: TalkHandoffRevokeParamsSchema,
TalkHandoffRevokeResult: TalkHandoffRevokeResultSchema,
TalkHandoffTurnStartParams: TalkHandoffTurnStartParamsSchema,
TalkHandoffTurnEndParams: TalkHandoffTurnEndParamsSchema,
TalkHandoffTurnCancelParams: TalkHandoffTurnCancelParamsSchema,
TalkHandoffTurnResult: TalkHandoffTurnResultSchema,
TalkRealtimeSessionParams: TalkRealtimeSessionParamsSchema,
TalkRealtimeSessionResult: TalkRealtimeSessionResultSchema,
TalkRealtimeRelayAudioParams: TalkRealtimeRelayAudioParamsSchema,
TalkRealtimeRelayCancelParams: TalkRealtimeRelayCancelParamsSchema,
TalkRealtimeRelayMarkParams: TalkRealtimeRelayMarkParamsSchema,
TalkRealtimeRelayStopParams: TalkRealtimeRelayStopParamsSchema,
TalkRealtimeRelayToolResultParams: TalkRealtimeRelayToolResultParamsSchema,
TalkRealtimeRelayOkResult: TalkRealtimeRelayOkResultSchema,
TalkRealtimeToolCallParams: TalkRealtimeToolCallParamsSchema,
TalkRealtimeToolCallResult: TalkRealtimeToolCallResultSchema,
TalkSessionCreateParams: TalkSessionCreateParamsSchema,
TalkSessionCreateResult: TalkSessionCreateResultSchema,
TalkSessionInputAudioParams: TalkSessionInputAudioParamsSchema,
TalkSessionControlParams: TalkSessionControlParamsSchema,
TalkSessionControlResult: TalkSessionControlResultSchema,
TalkSessionToolResultParams: TalkSessionToolResultParamsSchema,
TalkSessionCloseParams: TalkSessionCloseParamsSchema,
TalkSessionOkResult: TalkSessionOkResultSchema,
TalkTranscriptionSessionParams: TalkTranscriptionSessionParamsSchema,
TalkTranscriptionSessionResult: TalkTranscriptionSessionResultSchema,
TalkTranscriptionRelayAudioParams: TalkTranscriptionRelayAudioParamsSchema,
TalkTranscriptionRelayCancelParams: TalkTranscriptionRelayCancelParamsSchema,
TalkTranscriptionRelayStopParams: TalkTranscriptionRelayStopParamsSchema,
TalkTranscriptionRelayOkResult: TalkTranscriptionRelayOkResultSchema,
TalkSpeakParams: TalkSpeakParamsSchema,
TalkSpeakResult: TalkSpeakResultSchema,
ChannelsStatusParams: ChannelsStatusParamsSchema,

View File

@@ -92,16 +92,46 @@ export type WizardStep = SchemaType<"WizardStep">;
export type WizardNextResult = SchemaType<"WizardNextResult">;
export type WizardStartResult = SchemaType<"WizardStartResult">;
export type WizardStatusResult = SchemaType<"WizardStatusResult">;
// Talk protocol types. Each alias is resolved through `SchemaType` using the
// string name that matches the alias itself.
// NOTE(review): presumably these names index the ProtocolSchemas registry — confirm.
export type TalkEvent = SchemaType<"TalkEvent">;
export type TalkModeParams = SchemaType<"TalkModeParams">;
export type TalkCatalogParams = SchemaType<"TalkCatalogParams">;
export type TalkCatalogResult = SchemaType<"TalkCatalogResult">;
export type TalkConfigParams = SchemaType<"TalkConfigParams">;
export type TalkConfigResult = SchemaType<"TalkConfigResult">;
export type TalkHandoffCreateParams = SchemaType<"TalkHandoffCreateParams">;
export type TalkHandoffCreateResult = SchemaType<"TalkHandoffCreateResult">;
export type TalkHandoffJoinParams = SchemaType<"TalkHandoffJoinParams">;
export type TalkHandoffJoinResult = SchemaType<"TalkHandoffJoinResult">;
export type TalkHandoffRevokeParams = SchemaType<"TalkHandoffRevokeParams">;
export type TalkHandoffRevokeResult = SchemaType<"TalkHandoffRevokeResult">;
export type TalkHandoffTurnStartParams = SchemaType<"TalkHandoffTurnStartParams">;
export type TalkHandoffTurnEndParams = SchemaType<"TalkHandoffTurnEndParams">;
export type TalkHandoffTurnCancelParams = SchemaType<"TalkHandoffTurnCancelParams">;
export type TalkHandoffTurnResult = SchemaType<"TalkHandoffTurnResult">;
export type TalkRealtimeSessionParams = SchemaType<"TalkRealtimeSessionParams">;
export type TalkRealtimeSessionResult = SchemaType<"TalkRealtimeSessionResult">;
export type TalkRealtimeRelayAudioParams = SchemaType<"TalkRealtimeRelayAudioParams">;
export type TalkRealtimeRelayCancelParams = SchemaType<"TalkRealtimeRelayCancelParams">;
export type TalkRealtimeRelayMarkParams = SchemaType<"TalkRealtimeRelayMarkParams">;
export type TalkRealtimeRelayStopParams = SchemaType<"TalkRealtimeRelayStopParams">;
export type TalkRealtimeRelayToolResultParams = SchemaType<"TalkRealtimeRelayToolResultParams">;
export type TalkRealtimeRelayOkResult = SchemaType<"TalkRealtimeRelayOkResult">;
export type TalkRealtimeToolCallParams = SchemaType<"TalkRealtimeToolCallParams">;
export type TalkRealtimeToolCallResult = SchemaType<"TalkRealtimeToolCallResult">;
export type TalkSessionCreateParams = SchemaType<"TalkSessionCreateParams">;
export type TalkSessionCreateResult = SchemaType<"TalkSessionCreateResult">;
export type TalkSessionInputAudioParams = SchemaType<"TalkSessionInputAudioParams">;
export type TalkSessionControlParams = SchemaType<"TalkSessionControlParams">;
export type TalkSessionControlResult = SchemaType<"TalkSessionControlResult">;
export type TalkSessionToolResultParams = SchemaType<"TalkSessionToolResultParams">;
export type TalkSessionCloseParams = SchemaType<"TalkSessionCloseParams">;
export type TalkSessionOkResult = SchemaType<"TalkSessionOkResult">;
export type TalkTranscriptionSessionParams = SchemaType<"TalkTranscriptionSessionParams">;
export type TalkTranscriptionSessionResult = SchemaType<"TalkTranscriptionSessionResult">;
export type TalkTranscriptionRelayAudioParams = SchemaType<"TalkTranscriptionRelayAudioParams">;
export type TalkTranscriptionRelayCancelParams = SchemaType<"TalkTranscriptionRelayCancelParams">;
export type TalkTranscriptionRelayStopParams = SchemaType<"TalkTranscriptionRelayStopParams">;
export type TalkTranscriptionRelayOkResult = SchemaType<"TalkTranscriptionRelayOkResult">;
export type TalkSpeakParams = SchemaType<"TalkSpeakParams">;
export type TalkSpeakResult = SchemaType<"TalkSpeakResult">;
export type ChannelsStatusParams = SchemaType<"ChannelsStatusParams">;

View File

@@ -32,6 +32,9 @@ const EVENT_SCOPE_GUARDS: Record<string, string[]> = {
presence: [],
shutdown: [],
tick: [],
"talk.event": [READ_SCOPE],
"talk.realtime.relay": [READ_SCOPE],
"talk.transcription.relay": [READ_SCOPE],
"talk.mode": [WRITE_SCOPE],
"update.available": [],
"voicewake.changed": [READ_SCOPE],

View File

@@ -0,0 +1,24 @@
import { describe, expect, it } from "vitest";
import { GATEWAY_EVENTS, listGatewayMethods } from "./server-methods-list.js";
// Checks that the advertised gateway event list includes the three Talk event
// stream names.
describe("GATEWAY_EVENTS", () => {
  it("advertises Talk event streams in hello features", () => {
    const talkEventStreams = ["talk.event", "talk.realtime.relay", "talk.transcription.relay"];
    const containsTalkStreams = expect.arrayContaining(talkEventStreams);
    expect(GATEWAY_EVENTS).toEqual(containsTalkStreams);
  });
});
// Checks that the gateway method listing includes every unified
// `talk.session.*` RPC name.
describe("listGatewayMethods", () => {
  it("advertises the versioned Talk session RPCs", () => {
    const talkSessionRpcs = [
      "talk.session.create",
      "talk.session.inputAudio",
      "talk.session.control",
      "talk.session.toolResult",
      "talk.session.close",
    ];
    const advertisedMethods = listGatewayMethods();
    expect(advertisedMethods).toEqual(expect.arrayContaining(talkSessionRpcs));
  });
});

View File

@@ -56,12 +56,30 @@ const BASE_METHODS = [
"wizard.next",
"wizard.cancel",
"wizard.status",
"talk.catalog",
"talk.config",
"talk.session.create",
"talk.session.inputAudio",
"talk.session.control",
"talk.session.toolResult",
"talk.session.close",
"talk.handoff.create",
"talk.handoff.join",
"talk.handoff.revoke",
"talk.handoff.turnStart",
"talk.handoff.turnEnd",
"talk.handoff.turnCancel",
"talk.realtime.session",
"talk.realtime.toolCall",
"talk.realtime.relayAudio",
"talk.realtime.relayCancel",
"talk.realtime.relayMark",
"talk.realtime.relayStop",
"talk.realtime.relayToolResult",
"talk.transcription.session",
"talk.transcription.relayAudio",
"talk.transcription.relayCancel",
"talk.transcription.relayStop",
"talk.speak",
"talk.mode",
"commands.list",
@@ -182,6 +200,9 @@ export const GATEWAY_EVENTS = [
"presence",
"tick",
"talk.mode",
"talk.event",
"talk.realtime.relay",
"talk.transcription.relay",
"shutdown",
"health",
"heartbeat",

View File

@@ -62,7 +62,7 @@ export type GatewayRequestContext = {
nodeSubscribe: (nodeId: string, sessionKey: string) => void;
nodeUnsubscribe: (nodeId: string, sessionKey: string) => void;
nodeUnsubscribeAll: (nodeId: string) => void;
hasConnectedMobileNode: () => boolean;
hasConnectedTalkNode: () => boolean;
hasExecApprovalClients?: (excludeConnId?: string) => boolean;
disconnectClientsForDevice?: (deviceId: string, opts?: { role?: string }) => void;
disconnectClientsUsingSharedGatewayAuth?: () => void;

View File

@@ -0,0 +1,497 @@
import { REALTIME_VOICE_AGENT_CONSULT_TOOL } from "../../realtime-voice/agent-consult-tool.js";
import { resolveConfiguredRealtimeVoiceProvider } from "../../realtime-voice/provider-resolver.js";
import type { TalkBrain, TalkMode, TalkTransport } from "../../realtime-voice/talk-events.js";
import {
normalizeOptionalLowercaseString,
normalizeOptionalString,
} from "../../shared/string-coerce.js";
import { ADMIN_SCOPE } from "../operator-scopes.js";
import {
ErrorCodes,
errorShape,
formatValidationErrors,
validateTalkSessionCloseParams,
validateTalkSessionControlParams,
validateTalkSessionCreateParams,
validateTalkSessionInputAudioParams,
validateTalkSessionToolResultParams,
} from "../protocol/index.js";
import { resolveSessionKeyFromResolveParams } from "../sessions-resolve.js";
import {
cancelTalkHandoffTurn,
createTalkHandoff,
endTalkHandoffTurn,
revokeTalkHandoff,
startTalkHandoffTurn,
} from "../talk-handoff.js";
import {
cancelTalkRealtimeRelayTurn,
createTalkRealtimeRelaySession,
sendTalkRealtimeRelayAudio,
stopTalkRealtimeRelaySession,
submitTalkRealtimeRelayToolResult,
} from "../talk-realtime-relay.js";
import {
forgetUnifiedTalkSession,
getUnifiedTalkSession,
rememberUnifiedTalkSession,
requireUnifiedTalkSessionConn,
} from "../talk-session-registry.js";
import {
cancelTalkTranscriptionRelayTurn,
createTalkTranscriptionRelaySession,
sendTalkTranscriptionRelayAudio,
stopTalkTranscriptionRelaySession,
} from "../talk-transcription-relay.js";
import { formatForLog } from "../ws-log.js";
import {
broadcastTalkRoomEvents,
buildRealtimeInstructions,
buildTalkRealtimeConfig,
buildTalkTranscriptionConfig,
canUseTalkDirectTools,
resolveConfiguredRealtimeTranscriptionProvider,
talkHandoffErrorCode,
withRealtimeBrowserOverrides,
} from "./talk-shared.js";
import type { GatewayRequestHandlers } from "./types.js";
/**
 * Picks the Talk mode for a session request.
 *
 * An explicit `mode` (after `normalizeOptionalLowercaseString`) wins. Without
 * one, the mode is derived from the requested transport: "managed-room"
 * implies "stt-tts", anything else (including no transport) implies
 * "realtime".
 *
 * @param params Raw `mode` / `transport` strings from the request.
 * @returns The resolved Talk mode. An explicit mode is cast without being
 *   checked against the known modes here.
 */
function normalizeTalkSessionMode(params: { mode?: string; transport?: string }): TalkMode {
  const requestedMode = normalizeOptionalLowercaseString(params.mode) as TalkMode | undefined;
  if (!requestedMode) {
    const requestedTransport = normalizeOptionalLowercaseString(params.transport);
    if (requestedTransport === "managed-room") {
      return "stt-tts";
    }
    return "realtime";
  }
  return requestedMode;
}
/**
 * Picks the Talk transport for a session request.
 *
 * An explicit `transport` (after `normalizeOptionalLowercaseString`) wins.
 * Without one, the default follows the already-resolved mode: "stt-tts" maps
 * to "managed-room", every other mode maps to "gateway-relay".
 *
 * @param params The resolved `mode` and the raw `transport` string.
 * @returns The resolved Talk transport. An explicit transport is cast without
 *   being checked against the known transports here.
 */
function normalizeTalkSessionTransport(params: {
  mode: TalkMode;
  transport?: string;
}): TalkTransport {
  const requestedTransport = normalizeOptionalLowercaseString(params.transport) as
    | TalkTransport
    | undefined;
  if (!requestedTransport) {
    if (params.mode === "stt-tts") {
      return "managed-room";
    }
    return "gateway-relay";
  }
  return requestedTransport;
}
/**
 * Picks the Talk brain for a session request.
 *
 * An explicit `brain` (after `normalizeOptionalLowercaseString`) wins. Without
 * one, "transcription" mode defaults to "none" and every other mode defaults
 * to "agent-consult".
 *
 * @param params The resolved `mode` and the raw `brain` string.
 * @returns The resolved Talk brain. An explicit brain is cast without being
 *   checked against the known brains here.
 */
function normalizeTalkSessionBrain(params: { mode: TalkMode; brain?: string }): TalkBrain {
  const requestedBrain = normalizeOptionalLowercaseString(params.brain) as TalkBrain | undefined;
  if (!requestedBrain) {
    if (params.mode === "transcription") {
      return "none";
    }
    return "agent-consult";
  }
  return requestedBrain;
}
export const talkSessionHandlers: GatewayRequestHandlers = {
"talk.session.create": async ({ params, respond, context, client }) => {
if (!validateTalkSessionCreateParams(params)) {
respond(
false,
undefined,
errorShape(
ErrorCodes.INVALID_REQUEST,
`invalid talk.session.create params: ${formatValidationErrors(validateTalkSessionCreateParams.errors)}`,
),
);
return;
}
const mode = normalizeTalkSessionMode(params);
const transport = normalizeTalkSessionTransport({ mode, transport: params.transport });
const brain = normalizeTalkSessionBrain({ mode, brain: params.brain });
if (transport === "webrtc" || transport === "provider-websocket") {
respond(
false,
undefined,
errorShape(
ErrorCodes.INVALID_REQUEST,
`talk.session.create is Gateway-managed; use talk.realtime.session for browser transport "${transport}"`,
),
);
return;
}
try {
if (transport === "managed-room") {
if (brain === "direct-tools" && !canUseTalkDirectTools(client)) {
respond(
false,
undefined,
errorShape(
ErrorCodes.INVALID_REQUEST,
`talk.session.create brain="direct-tools" requires gateway scope: ${ADMIN_SCOPE}`,
),
);
return;
}
const resolvedSession = await resolveSessionKeyFromResolveParams({
cfg: context.getRuntimeConfig(),
p: {
key: params.sessionKey,
includeGlobal: true,
includeUnknown: true,
},
});
if (!resolvedSession.ok) {
respond(false, undefined, resolvedSession.error);
return;
}
const handoff = createTalkHandoff({
sessionKey: resolvedSession.key,
provider: normalizeOptionalString(params.provider),
model: normalizeOptionalString(params.model),
voice: normalizeOptionalString(params.voice),
mode,
transport,
brain,
ttlMs: params.ttlMs,
});
rememberUnifiedTalkSession(handoff.id, {
kind: "managed-room",
handoffId: handoff.id,
token: handoff.token,
roomId: handoff.roomId,
});
respond(
true,
{
sessionId: handoff.id,
provider: handoff.provider,
mode: handoff.mode,
transport: handoff.transport,
brain: handoff.brain,
handoffId: handoff.id,
roomId: handoff.roomId,
roomUrl: handoff.roomUrl,
token: handoff.token,
model: handoff.model,
voice: handoff.voice,
expiresAt: handoff.expiresAt,
},
undefined,
);
return;
}
const connId = client?.connId;
if (!connId) {
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, "Talk session unavailable"));
return;
}
if (mode === "realtime") {
if (transport !== "gateway-relay" || brain !== "agent-consult") {
respond(
false,
undefined,
errorShape(
ErrorCodes.INVALID_REQUEST,
`realtime talk.session.create requires transport="gateway-relay" and brain="agent-consult"`,
),
);
return;
}
const runtimeConfig = context.getRuntimeConfig();
const realtimeConfig = buildTalkRealtimeConfig(runtimeConfig, params.provider);
const resolution = resolveConfiguredRealtimeVoiceProvider({
configuredProviderId: realtimeConfig.provider,
providerConfigs: realtimeConfig.providers,
cfg: runtimeConfig,
cfgForResolve: runtimeConfig,
noRegisteredProviderMessage: "No realtime voice provider registered",
});
const model = normalizeOptionalString(params.model) ?? realtimeConfig.model;
const voice = normalizeOptionalString(params.voice) ?? realtimeConfig.voice;
const session = createTalkRealtimeRelaySession({
context,
connId,
provider: resolution.provider,
providerConfig: withRealtimeBrowserOverrides(resolution.providerConfig, { model, voice }),
instructions: buildRealtimeInstructions(),
tools: [REALTIME_VOICE_AGENT_CONSULT_TOOL],
model,
voice,
});
rememberUnifiedTalkSession(session.relaySessionId, {
kind: "realtime-relay",
connId,
relaySessionId: session.relaySessionId,
});
respond(
true,
{
...session,
sessionId: session.relaySessionId,
mode,
brain,
},
undefined,
);
return;
}
if (mode === "transcription") {
if (transport !== "gateway-relay" || brain !== "none") {
respond(
false,
undefined,
errorShape(
ErrorCodes.INVALID_REQUEST,
`transcription talk.session.create requires transport="gateway-relay" and brain="none"`,
),
);
return;
}
const runtimeConfig = context.getRuntimeConfig();
const transcriptionConfig = buildTalkTranscriptionConfig(runtimeConfig, params.provider);
const resolution = resolveConfiguredRealtimeTranscriptionProvider({
config: runtimeConfig,
configuredProviderId: transcriptionConfig.provider,
providerConfigs: transcriptionConfig.providers,
});
const session = createTalkTranscriptionRelaySession({
context,
connId,
provider: resolution.provider,
providerConfig: resolution.providerConfig,
});
rememberUnifiedTalkSession(session.transcriptionSessionId, {
kind: "transcription-relay",
connId,
transcriptionSessionId: session.transcriptionSessionId,
});
respond(
true,
{
...session,
sessionId: session.transcriptionSessionId,
brain,
},
undefined,
);
return;
}
respond(
false,
undefined,
errorShape(
ErrorCodes.INVALID_REQUEST,
`stt-tts talk.session.create requires transport="managed-room"`,
),
);
} catch (err) {
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
}
},
// Forwards one base64 audio chunk to the relay behind a unified Talk session.
// Realtime relays receive the audio plus its timestamp, transcription relays
// receive the audio only, and every other session kind is rejected. Anything
// thrown while looking up or feeding the session is reported as UNAVAILABLE.
"talk.session.inputAudio": async ({ params, respond, client }) => {
  if (!validateTalkSessionInputAudioParams(params)) {
    const details = formatValidationErrors(validateTalkSessionInputAudioParams.errors);
    respond(
      false,
      undefined,
      errorShape(ErrorCodes.INVALID_REQUEST, `invalid talk.session.inputAudio params: ${details}`),
    );
    return;
  }
  try {
    const session = getUnifiedTalkSession(params.sessionId);
    switch (session.kind) {
      case "realtime-relay": {
        // The connection id is resolved before any audio is handed to the relay.
        const relayConnId = requireUnifiedTalkSessionConn(session, client?.connId);
        sendTalkRealtimeRelayAudio({
          relaySessionId: session.relaySessionId,
          connId: relayConnId,
          audioBase64: params.audioBase64,
          timestamp: params.timestamp,
        });
        respond(true, { ok: true }, undefined);
        return;
      }
      case "transcription-relay": {
        const relayConnId = requireUnifiedTalkSessionConn(session, client?.connId);
        sendTalkTranscriptionRelayAudio({
          transcriptionSessionId: session.transcriptionSessionId,
          connId: relayConnId,
          audioBase64: params.audioBase64,
        });
        respond(true, { ok: true }, undefined);
        return;
      }
      default:
        respond(
          false,
          undefined,
          errorShape(
            ErrorCodes.INVALID_REQUEST,
            "talk.session.inputAudio is not supported for managed-room sessions",
          ),
        );
    }
  } catch (err) {
    respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
  }
},
// Applies a turn-control request to a unified Talk session.
// Relay-backed sessions (realtime and transcription) accept only
// type="turn.cancel". Any other session kind is driven through the handoff turn
// functions: "turn.start" starts a turn, "turn.end" ends it, and every remaining
// type cancels. The resulting events are broadcast to the room's active client.
"talk.session.control": async ({ params, respond, client, context }) => {
  const rejectRequest = (message: string): void => {
    respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, message));
  };
  if (!validateTalkSessionControlParams(params)) {
    rejectRequest(
      `invalid talk.session.control params: ${formatValidationErrors(validateTalkSessionControlParams.errors)}`,
    );
    return;
  }
  try {
    const session = getUnifiedTalkSession(params.sessionId);
    switch (session.kind) {
      case "realtime-relay": {
        if (params.type !== "turn.cancel") {
          rejectRequest(
            `realtime relay sessions only support talk.session.control type="turn.cancel"`,
          );
          return;
        }
        const relayConnId = requireUnifiedTalkSessionConn(session, client?.connId);
        cancelTalkRealtimeRelayTurn({
          relaySessionId: session.relaySessionId,
          connId: relayConnId,
          reason: normalizeOptionalString(params.reason),
        });
        respond(true, { ok: true }, undefined);
        return;
      }
      case "transcription-relay": {
        if (params.type !== "turn.cancel") {
          rejectRequest(
            `transcription relay sessions only support talk.session.control type="turn.cancel"`,
          );
          return;
        }
        const relayConnId = requireUnifiedTalkSessionConn(session, client?.connId);
        cancelTalkTranscriptionRelayTurn({
          transcriptionSessionId: session.transcriptionSessionId,
          connId: relayConnId,
          reason: normalizeOptionalString(params.reason),
        });
        respond(true, { ok: true }, undefined);
        return;
      }
      default:
        break;
    }

    // Remaining session kind: route the request to the matching handoff turn call.
    let outcome: ReturnType<
      typeof startTalkHandoffTurn | typeof endTalkHandoffTurn | typeof cancelTalkHandoffTurn
    >;
    switch (params.type) {
      case "turn.start":
        outcome = startTalkHandoffTurn(session.handoffId, session.token, {
          turnId: params.turnId,
          clientId: client?.connId,
        });
        break;
      case "turn.end":
        outcome = endTalkHandoffTurn(session.handoffId, session.token, {
          turnId: params.turnId,
        });
        break;
      default:
        // The reason is passed through as received on this path (not normalized).
        outcome = cancelTalkHandoffTurn(session.handoffId, session.token, {
          turnId: params.turnId,
          reason: params.reason,
        });
        break;
    }
    if (!outcome.ok) {
      respond(
        false,
        undefined,
        errorShape(
          talkHandoffErrorCode(outcome.reason),
          `talk session control failed: ${outcome.reason}`,
        ),
      );
      return;
    }
    const { record, events, turnId } = outcome;
    broadcastTalkRoomEvents(context, record.room.activeClientId, {
      handoffId: record.id,
      roomId: record.roomId,
      events,
    });
    respond(true, { ok: true, turnId, events }, undefined);
  } catch (err) {
    respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
  }
},
// Submits the result of a tool call back to a realtime relay session.
// Only sessions of kind "realtime-relay" are accepted; every other kind gets an
// INVALID_REQUEST response. Thrown errors are reported as UNAVAILABLE.
"talk.session.toolResult": async ({ params, respond, client }) => {
  if (!validateTalkSessionToolResultParams(params)) {
    const details = formatValidationErrors(validateTalkSessionToolResultParams.errors);
    respond(
      false,
      undefined,
      errorShape(ErrorCodes.INVALID_REQUEST, `invalid talk.session.toolResult params: ${details}`),
    );
    return;
  }
  try {
    const session = getUnifiedTalkSession(params.sessionId);
    if (session.kind === "realtime-relay") {
      const relayConnId = requireUnifiedTalkSessionConn(session, client?.connId);
      submitTalkRealtimeRelayToolResult({
        relaySessionId: session.relaySessionId,
        connId: relayConnId,
        callId: params.callId,
        result: params.result,
      });
      respond(true, { ok: true }, undefined);
      return;
    }
    respond(
      false,
      undefined,
      errorShape(
        ErrorCodes.INVALID_REQUEST,
        "talk.session.toolResult is only supported for realtime relay sessions",
      ),
    );
  } catch (err) {
    respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
  }
},
// Closes a unified Talk session and forgets its session-id mapping.
// - realtime-relay / transcription-relay: stops the relay session using the
//   connection id resolved by requireUnifiedTalkSessionConn.
// - any other kind (managed room): revokes the handoff and broadcasts the
//   resulting room events to the room's active client, matching what
//   "talk.handoff.revoke" does. Previously the revoke result was discarded, so
//   the active client was never sent those events.
// Thrown errors are reported as UNAVAILABLE; in that case the mapping is kept.
"talk.session.close": async ({ params, respond, client, context }) => {
  if (!validateTalkSessionCloseParams(params)) {
    respond(
      false,
      undefined,
      errorShape(
        ErrorCodes.INVALID_REQUEST,
        `invalid talk.session.close params: ${formatValidationErrors(validateTalkSessionCloseParams.errors)}`,
      ),
    );
    return;
  }
  try {
    const session = getUnifiedTalkSession(params.sessionId);
    if (session.kind === "realtime-relay") {
      const connId = requireUnifiedTalkSessionConn(session, client?.connId);
      stopTalkRealtimeRelaySession({ relaySessionId: session.relaySessionId, connId });
    } else if (session.kind === "transcription-relay") {
      const connId = requireUnifiedTalkSessionConn(session, client?.connId);
      stopTalkTranscriptionRelaySession({
        transcriptionSessionId: session.transcriptionSessionId,
        connId,
      });
    } else {
      const revocation = revokeTalkHandoff(session.handoffId);
      // broadcastTalkRoomEvents is a no-op when there is no active client or no
      // events, so this is safe for handoffs that were never joined.
      broadcastTalkRoomEvents(context, revocation.activeClientId, {
        handoffId: session.handoffId,
        roomId: revocation.roomId ?? session.roomId,
        events: revocation.events,
      });
    }
    forgetUnifiedTalkSession(params.sessionId);
    respond(true, { ok: true }, undefined);
  } catch (err) {
    respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
  }
},
};

View File

@@ -0,0 +1,237 @@
import type { OpenClawConfig } from "../../config/types.js";
import { listRealtimeTranscriptionProviders } from "../../realtime-transcription/provider-registry.js";
import type { RealtimeTranscriptionProviderConfig } from "../../realtime-transcription/provider-types.js";
import { REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME } from "../../realtime-voice/agent-consult-tool.js";
import type {
RealtimeVoiceBrowserSession,
RealtimeVoiceProviderConfig,
} from "../../realtime-voice/provider-types.js";
import type { TalkEvent } from "../../realtime-voice/talk-events.js";
import {
normalizeLowercaseStringOrEmpty,
normalizeOptionalLowercaseString,
normalizeOptionalString,
} from "../../shared/string-coerce.js";
import { ADMIN_SCOPE } from "../operator-scopes.js";
import { ErrorCodes } from "../protocol/index.js";
import type { TalkHandoffTurnResult } from "../talk-handoff.js";
import { asRecord } from "./record-shared.js";
/**
 * Reports whether the client's connect scopes include the admin scope.
 *
 * @param client - Connected client (or `null`); only `connect.scopes` is read.
 * @returns `true` only when `connect.scopes` is an array containing `ADMIN_SCOPE`.
 */
export function canUseTalkDirectTools(client: { connect?: { scopes?: string[] } } | null): boolean {
  const grantedScopes = client?.connect?.scopes;
  if (!Array.isArray(grantedScopes)) {
    // A missing or malformed scope list grants nothing.
    return false;
  }
  return grantedScopes.includes(ADMIN_SCOPE);
}
/**
 * Sends each Talk room event to a single connection as its own "talk.event" message.
 *
 * Does nothing when `connId` is missing/empty or when there are no events.
 * Every message is sent with `dropIfSlow: true` and a fresh one-element
 * connection-id set.
 *
 * @param context - Provides the `broadcastToConnIds` transport function.
 * @param connId - Target connection id; falsy values suppress the broadcast.
 * @param params - Handoff id, room id and the events to deliver, in order.
 */
export function broadcastTalkRoomEvents(
  context: {
    broadcastToConnIds: (
      event: string,
      payload: unknown,
      connIds: Set<string>,
      opts?: { dropIfSlow?: boolean },
    ) => void;
  },
  connId: string | undefined,
  params: { handoffId: string; roomId: string; events: TalkEvent[] },
): void {
  const { handoffId, roomId, events } = params;
  const hasTarget = Boolean(connId) && events.length > 0;
  if (!hasTarget || !connId) {
    return;
  }
  events.forEach((talkEvent) => {
    const recipients = new Set([connId]);
    context.broadcastToConnIds("talk.event", { handoffId, roomId, talkEvent }, recipients, {
      dropIfSlow: true,
    });
  });
}
/** Failure reasons carried by the non-ok variant of a handoff turn result. */
type TalkHandoffFailureReason = Extract<TalkHandoffTurnResult, { ok: false }>["reason"];
/**
 * Maps a handoff turn failure reason to a gateway error code.
 *
 * @param reason - Failure reason from a handoff turn operation.
 * @returns `INVALID_REQUEST` for "invalid_token", "no_active_turn" and
 *   "stale_turn"; `UNAVAILABLE` for every other reason.
 */
export function talkHandoffErrorCode(reason: TalkHandoffFailureReason) {
  switch (reason) {
    case "invalid_token":
    case "no_active_turn":
    case "stale_turn":
      return ErrorCodes.INVALID_REQUEST;
    default:
      return ErrorCodes.UNAVAILABLE;
  }
}
/** Wraps `asRecord`, turning its nullish result into `undefined`. */
function getRecord(value: unknown): Record<string, unknown> | undefined {
  const record = asRecord(value);
  return record ?? undefined;
}
/**
 * Reads the realtime section of the "voice-call" plugin entry
 * (`plugins.entries["voice-call"].config.realtime`).
 *
 * @returns The normalized `provider` string and the provider configs that are
 *   records; `providers` is `undefined` when none qualify.
 */
function getVoiceCallRealtimeConfig(config: OpenClawConfig): {
  provider?: string;
  providers?: Record<string, RealtimeVoiceProviderConfig>;
} {
  // Walk the nested path one segment at a time; a non-record at any step
  // yields undefined for everything below it.
  let realtime = getRecord(config.plugins);
  for (const segment of ["entries", "voice-call", "config", "realtime"]) {
    realtime = getRecord(realtime?.[segment]);
  }

  const providers: Record<string, RealtimeVoiceProviderConfig> = {};
  for (const [providerId, rawEntry] of Object.entries(getRecord(realtime?.providers) ?? {})) {
    const entry = getRecord(rawEntry);
    if (entry) {
      providers[providerId] = entry;
    }
  }

  const hasProviders = Object.keys(providers).length > 0;
  return {
    provider: normalizeOptionalString(realtime?.provider),
    providers: hasProviders ? providers : undefined,
  };
}
/**
 * Reads the streaming section of the "voice-call" plugin entry
 * (`plugins.entries["voice-call"].config.streaming`).
 *
 * @returns The normalized `provider` string and the provider configs that are
 *   records; `providers` is `undefined` when none qualify.
 */
export function getVoiceCallStreamingConfig(config: OpenClawConfig): {
  provider?: string;
  providers?: Record<string, RealtimeTranscriptionProviderConfig>;
} {
  // Walk the nested path one segment at a time; a non-record at any step
  // yields undefined for everything below it.
  let streaming = getRecord(config.plugins);
  for (const segment of ["entries", "voice-call", "config", "streaming"]) {
    streaming = getRecord(streaming?.[segment]);
  }

  const providers: Record<string, RealtimeTranscriptionProviderConfig> = {};
  for (const [providerId, rawEntry] of Object.entries(getRecord(streaming?.providers) ?? {})) {
    const entry = getRecord(rawEntry);
    if (entry) {
      providers[providerId] = entry;
    }
  }

  const hasProviders = Object.keys(providers).length > 0;
  return {
    provider: normalizeOptionalString(streaming?.provider),
    providers: hasProviders ? providers : undefined,
  };
}
/**
 * Builds the effective realtime Talk configuration.
 *
 * Provider selection order: the explicitly requested provider, then
 * `talk.realtime.provider`, then the voice-call plugin's realtime provider.
 * Provider configs from the voice-call plugin are overlaid by those under
 * `talk.realtime.providers` (same key: the Talk entry wins).
 *
 * `talk.realtime.providers` is read as untyped data, so it is validated at
 * runtime: a non-record value contributes nothing and entries that are not
 * records are dropped, the same way the voice-call config readers treat their
 * entries. Previously the value was cast and spread unchecked.
 *
 * @param config - Runtime configuration.
 * @param requestedProvider - Optional provider id requested by the caller.
 * @returns Selected provider, merged provider configs, and the normalized
 *   `model`, `voice`, `mode`, `transport` and `brain` from `talk.realtime`.
 */
export function buildTalkRealtimeConfig(config: OpenClawConfig, requestedProvider?: string) {
  const voiceCallRealtime = getVoiceCallRealtimeConfig(config);
  const talkRealtime = getRecord(config.talk?.realtime);
  const talkRealtimeProviderConfigs: Record<string, RealtimeVoiceProviderConfig> = {};
  for (const [providerId, providerConfig] of Object.entries(
    getRecord(talkRealtime?.providers) ?? {},
  )) {
    const record = getRecord(providerConfig);
    if (record) {
      talkRealtimeProviderConfigs[providerId] = record;
    }
  }
  const provider =
    normalizeOptionalString(requestedProvider) ??
    normalizeOptionalString(talkRealtime?.provider) ??
    voiceCallRealtime.provider;
  return {
    provider,
    providers: {
      ...voiceCallRealtime.providers,
      ...talkRealtimeProviderConfigs,
    },
    model: normalizeOptionalString(talkRealtime?.model),
    voice: normalizeOptionalString(talkRealtime?.voice),
    mode: normalizeOptionalLowercaseString(talkRealtime?.mode),
    transport: normalizeOptionalLowercaseString(talkRealtime?.transport),
    brain: normalizeOptionalLowercaseString(talkRealtime?.brain),
  };
}
/**
 * Builds the effective transcription Talk configuration from the voice-call
 * plugin's streaming section.
 *
 * @param config - Runtime configuration.
 * @param requestedProvider - Optional provider id; takes precedence over the
 *   configured streaming provider when it normalizes to a value.
 * @returns Selected provider and the provider configs (empty object when none).
 */
export function buildTalkTranscriptionConfig(config: OpenClawConfig, requestedProvider?: string) {
  const { provider: configuredProvider, providers } = getVoiceCallStreamingConfig(config);
  const provider = normalizeOptionalString(requestedProvider) ?? configuredProvider;
  return { provider, providers: providers ?? {} };
}
/**
 * Finds the raw config entry for a transcription provider.
 *
 * Lookup names are tried in order: the configured provider id, the provider's
 * own id, then its aliases. Each name is matched first as an exact own key,
 * then by comparing lowercase-normalized forms against the configured keys.
 *
 * @returns The first matching entry (or `{}` if that entry is nullish), or `{}`
 *   when nothing matches.
 */
function getRealtimeTranscriptionProviderConfig(params: {
  providerConfigs: Record<string, RealtimeTranscriptionProviderConfig>;
  provider: { id: string; aliases?: readonly string[] };
  configuredProviderId?: string;
}): RealtimeTranscriptionProviderConfig {
  const { providerConfigs, provider } = params;
  const lookupNames = [
    normalizeOptionalString(params.configuredProviderId),
    provider.id,
    ...(provider.aliases ?? []),
  ].filter((name): name is string => Boolean(name));
  const availableKeys = Object.keys(providerConfigs);

  for (const lookupName of lookupNames) {
    let resolvedKey: string | undefined;
    if (Object.hasOwn(providerConfigs, lookupName)) {
      resolvedKey = lookupName;
    } else {
      // Fall back to a case-insensitive comparison against the configured keys.
      const wanted = normalizeOptionalLowercaseString(lookupName);
      resolvedKey = availableKeys.find((key) => normalizeOptionalLowercaseString(key) === wanted);
    }
    if (resolvedKey) {
      return providerConfigs[resolvedKey] ?? {};
    }
  }
  return {};
}
/**
 * Runs a configuration probe and treats any thrown error as "not configured".
 *
 * @param callback - Probe to evaluate.
 * @returns The probe's result, or `false` if the probe throws.
 */
export function configuredOrFalse(callback: () => boolean): boolean {
  let configured = false;
  try {
    configured = callback();
  } catch {
    // Deliberate best-effort: a throwing probe counts as unconfigured.
    configured = false;
  }
  return configured;
}
/**
 * Picks the realtime transcription provider to use, with its resolved config.
 *
 * When a provider id is configured, only registered providers whose id or alias
 * matches it (compared in lowercase-normalized form) are considered. Otherwise
 * all registered providers are tried in ascending `autoSelectOrder` (missing
 * values count as 1000). The first candidate whose `isConfigured` check passes
 * is returned; a throwing check counts as not configured.
 *
 * @returns The chosen provider and its config (`resolveConfig` output, or the
 *   raw config when the provider has no `resolveConfig` or it returns nullish).
 * @throws Error when no candidate passes the configuration check.
 */
export function resolveConfiguredRealtimeTranscriptionProvider(params: {
  config: OpenClawConfig;
  configuredProviderId?: string;
  providerConfigs: Record<string, RealtimeTranscriptionProviderConfig>;
}) {
  const { config, configuredProviderId, providerConfigs } = params;
  const registered = listRealtimeTranscriptionProviders(config);
  const wantedId = normalizeOptionalLowercaseString(configuredProviderId);

  const matchesWantedId = (provider: { id: string; aliases?: readonly string[] }): boolean =>
    [provider.id, ...(provider.aliases ?? [])].some(
      (name) => normalizeOptionalLowercaseString(name) === wantedId,
    );

  const candidates = wantedId
    ? registered.filter(matchesWantedId)
    : [...registered].sort(
        (left, right) => (left.autoSelectOrder ?? 1000) - (right.autoSelectOrder ?? 1000),
      );

  for (const provider of candidates) {
    const rawConfig = getRealtimeTranscriptionProviderConfig({
      providerConfigs,
      provider,
      configuredProviderId,
    });
    const providerConfig = provider.resolveConfig?.({ cfg: config, rawConfig }) ?? rawConfig;
    const usable = configuredOrFalse(() => provider.isConfigured({ cfg: config, providerConfig }));
    if (usable) {
      return { provider, providerConfig };
    }
  }

  const failure = wantedId
    ? `Realtime transcription provider "${configuredProviderId}" is not configured`
    : "No realtime transcription provider registered";
  throw new Error(failure);
}
/**
 * Returns the instruction prompt for the realtime voice session. The prompt
 * names the agent-consult tool the model should call.
 */
export function buildRealtimeInstructions(): string {
  const sentences = [
    "You are OpenClaw's realtime voice interface.",
    "Keep spoken replies concise.",
    `If the user asks for code, repository state, tools, files, current OpenClaw context, or deeper reasoning, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} and then summarize the result naturally.`,
  ];
  return sentences.join(" ");
}
/**
 * Overlays request-level `model` / `voice` overrides on a provider config.
 *
 * @param providerConfig - Base provider configuration (never mutated).
 * @param params - Optional overrides; each is normalized and ignored when empty.
 * @returns The same `providerConfig` object when no override applies, otherwise
 *   a shallow copy with the overrides set.
 */
export function withRealtimeBrowserOverrides(
  providerConfig: RealtimeVoiceProviderConfig,
  params: { model?: string; voice?: string },
): RealtimeVoiceProviderConfig {
  const model = normalizeOptionalString(params.model);
  const voice = normalizeOptionalString(params.voice);
  if (!model && !voice) {
    return providerConfig;
  }
  return {
    ...providerConfig,
    ...(model ? { model } : {}),
    ...(voice ? { voice } : {}),
  };
}
/**
 * Flags browser sessions from the "google" provider that use the "webrtc"
 * transport. A session without a `transport` field is treated as "webrtc".
 */
export function isUnsupportedBrowserWebRtcSession(session: RealtimeVoiceBrowserSession): boolean {
  const providerId = normalizeLowercaseStringOrEmpty(session.provider);
  const declaredTransport = (session as { transport?: string }).transport;
  const effectiveTransport = declaredTransport ?? "webrtc";
  if (providerId !== "google") {
    return false;
  }
  return effectiveTransport === "webrtc";
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,3 +1,4 @@
import { randomUUID } from "node:crypto";
import { readConfigFileSnapshot } from "../../config/config.js";
import { redactConfigObject } from "../../config/redact-snapshot.js";
import {
@@ -7,47 +8,103 @@ import {
} from "../../config/talk.js";
import type { TalkConfigResponse, TalkProviderConfig } from "../../config/types.gateway.js";
import type { OpenClawConfig, TtsConfig, TtsProviderConfigMap } from "../../config/types.js";
import { listRealtimeTranscriptionProviders } from "../../realtime-transcription/provider-registry.js";
import {
REALTIME_VOICE_AGENT_CONSULT_TOOL,
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
buildRealtimeVoiceAgentConsultChatMessage,
} from "../../realtime-voice/agent-consult-tool.js";
import { getRealtimeVoiceProvider } from "../../realtime-voice/provider-registry.js";
import {
canonicalizeRealtimeVoiceProviderId,
listRealtimeVoiceProviders,
} from "../../realtime-voice/provider-registry.js";
import { resolveConfiguredRealtimeVoiceProvider } from "../../realtime-voice/provider-resolver.js";
import type {
RealtimeVoiceBrowserSession,
RealtimeVoiceProviderConfig,
} from "../../realtime-voice/provider-types.js";
import {
normalizeLowercaseStringOrEmpty,
normalizeOptionalLowercaseString,
normalizeOptionalString,
} from "../../shared/string-coerce.js";
import { canonicalizeSpeechProviderId, getSpeechProvider } from "../../tts/provider-registry.js";
import { synthesizeSpeech, type TtsDirectiveOverrides } from "../../tts/tts.js";
import {
canonicalizeSpeechProviderId,
getSpeechProvider,
listSpeechProviders,
} from "../../tts/provider-registry.js";
import {
getResolvedSpeechProviderConfig,
resolveTtsConfig,
synthesizeSpeech,
type TtsDirectiveOverrides,
} from "../../tts/tts.js";
import { ADMIN_SCOPE, TALK_SECRETS_SCOPE } from "../operator-scopes.js";
import {
ErrorCodes,
errorShape,
formatValidationErrors,
type ErrorShape,
type TalkSpeakParams,
validateTalkCatalogParams,
validateTalkConfigParams,
validateTalkHandoffCreateParams,
validateTalkHandoffJoinParams,
validateTalkHandoffRevokeParams,
validateTalkHandoffTurnCancelParams,
validateTalkHandoffTurnEndParams,
validateTalkHandoffTurnStartParams,
validateTalkModeParams,
validateTalkRealtimeRelayAudioParams,
validateTalkRealtimeRelayCancelParams,
validateTalkRealtimeRelayMarkParams,
validateTalkRealtimeRelayStopParams,
validateTalkRealtimeRelayToolResultParams,
validateTalkRealtimeSessionParams,
validateTalkRealtimeToolCallParams,
validateTalkTranscriptionRelayAudioParams,
validateTalkTranscriptionRelayCancelParams,
validateTalkTranscriptionRelayStopParams,
validateTalkTranscriptionSessionParams,
validateTalkSpeakParams,
} from "../protocol/index.js";
import { resolveSessionKeyFromResolveParams } from "../sessions-resolve.js";
import {
cancelTalkHandoffTurn,
createTalkHandoff,
endTalkHandoffTurn,
joinTalkHandoff,
revokeTalkHandoff,
startTalkHandoffTurn,
} from "../talk-handoff.js";
import {
acknowledgeTalkRealtimeRelayMark,
cancelTalkRealtimeRelayTurn,
createTalkRealtimeRelaySession,
registerTalkRealtimeRelayAgentRun,
sendTalkRealtimeRelayAudio,
stopTalkRealtimeRelaySession,
submitTalkRealtimeRelayToolResult,
} from "../talk-realtime-relay.js";
import {
cancelTalkTranscriptionRelayTurn,
createTalkTranscriptionRelaySession,
sendTalkTranscriptionRelayAudio,
stopTalkTranscriptionRelaySession,
} from "../talk-transcription-relay.js";
import { formatForLog } from "../ws-log.js";
import { chatHandlers } from "./chat.js";
import { asRecord } from "./record-shared.js";
import { talkSessionHandlers } from "./talk-session.js";
import {
broadcastTalkRoomEvents,
buildRealtimeInstructions,
buildTalkRealtimeConfig,
buildTalkTranscriptionConfig,
canUseTalkDirectTools,
configuredOrFalse,
getVoiceCallStreamingConfig,
isUnsupportedBrowserWebRtcSession,
resolveConfiguredRealtimeTranscriptionProvider,
talkHandoffErrorCode,
withRealtimeBrowserOverrides,
} from "./talk-shared.js";
import type { GatewayRequestHandlers } from "./types.js";
type TalkSpeakReason =
@@ -158,83 +215,117 @@ function buildTalkTtsConfig(
};
}
/** Wraps `asRecord`, turning its nullish result into `undefined`. */
function getRecord(value: unknown): Record<string, unknown> | undefined {
  const record = asRecord(value);
  return record ?? undefined;
}
function buildTalkCatalog(config: OpenClawConfig) {
const ttsConfig = resolveTtsConfig(config);
const talkResolved = resolveActiveTalkProviderConfig(config.talk);
const activeSpeechProvider = canonicalizeSpeechProviderId(talkResolved?.provider, config);
const streamingConfig = getVoiceCallStreamingConfig(config);
const realtimeConfig = buildTalkRealtimeConfig(config);
const activeRealtimeProvider = canonicalizeRealtimeVoiceProviderId(
realtimeConfig.provider,
config,
);
function getVoiceCallRealtimeConfig(config: OpenClawConfig): {
provider?: string;
providers?: Record<string, RealtimeVoiceProviderConfig>;
} {
const plugins = getRecord(config.plugins);
const entries = getRecord(plugins?.entries);
const voiceCall = getRecord(entries?.["voice-call"]);
const pluginConfig = getRecord(voiceCall?.config);
const realtime = getRecord(pluginConfig?.realtime);
const providersRaw = getRecord(realtime?.providers);
const providers: Record<string, RealtimeVoiceProviderConfig> = {};
if (providersRaw) {
for (const [providerId, providerConfig] of Object.entries(providersRaw)) {
const record = getRecord(providerConfig);
if (record) {
providers[providerId] = record;
}
}
}
return {
provider: normalizeOptionalString(realtime?.provider),
providers: Object.keys(providers).length > 0 ? providers : undefined,
};
}
function buildTalkRealtimeConfig(config: OpenClawConfig, requestedProvider?: string) {
const voiceCallRealtime = getVoiceCallRealtimeConfig(config);
const talkProviderConfigs = config.talk?.providers as
| Record<string, RealtimeVoiceProviderConfig>
| undefined;
const talkProvider = normalizeOptionalString(config.talk?.provider);
const talkProviderSupportsRealtime = talkProvider
? Boolean(getRealtimeVoiceProvider(talkProvider, config))
: false;
const provider =
normalizeOptionalString(requestedProvider) ??
(talkProviderSupportsRealtime ? talkProvider : undefined) ??
voiceCallRealtime.provider;
return {
provider,
providers: {
...voiceCallRealtime.providers,
...talkProviderConfigs,
modes: ["realtime", "stt-tts", "transcription"],
transports: ["webrtc", "provider-websocket", "gateway-relay", "managed-room"],
brains: ["agent-consult", "direct-tools", "none"],
speech: {
...(activeSpeechProvider ? { activeProvider: activeSpeechProvider } : {}),
providers: listSpeechProviders(config).map((provider) => {
const entry: Record<string, unknown> = {
id: provider.id,
label: provider.label,
configured: configuredOrFalse(() =>
provider.isConfigured({
cfg: config,
providerConfig: getResolvedSpeechProviderConfig(ttsConfig, provider.id, config),
timeoutMs: ttsConfig.timeoutMs,
}),
),
modes: ["stt-tts"],
brains: ["agent-consult"],
};
if (provider.models) {
entry.models = [...provider.models];
}
if (provider.voices) {
entry.voices = [...provider.voices];
}
return entry;
}),
},
transcription: {
...(streamingConfig.provider ? { activeProvider: streamingConfig.provider } : {}),
providers: listRealtimeTranscriptionProviders(config).map((provider) => {
const rawConfig = streamingConfig.providers?.[provider.id] ?? {};
const providerConfig = provider.resolveConfig?.({ cfg: config, rawConfig }) ?? rawConfig;
const entry: Record<string, unknown> = {
id: provider.id,
label: provider.label,
configured: configuredOrFalse(() =>
provider.isConfigured({ cfg: config, providerConfig }),
),
modes: ["transcription"],
transports: ["gateway-relay"],
brains: ["none"],
};
if (provider.defaultModel) {
entry.defaultModel = provider.defaultModel;
}
return entry;
}),
},
realtime: {
...(activeRealtimeProvider ? { activeProvider: activeRealtimeProvider } : {}),
providers: listRealtimeVoiceProviders(config).map((provider) => {
const rawConfig = realtimeConfig.providers?.[provider.id] ?? {};
const providerConfig = provider.resolveConfig?.({ cfg: config, rawConfig }) ?? rawConfig;
const capabilities = provider.capabilities;
const entry: Record<string, unknown> = {
id: provider.id,
label: provider.label,
configured: configuredOrFalse(() =>
provider.isConfigured({ cfg: config, providerConfig }),
),
modes: ["realtime"],
brains: capabilities?.supportsToolCalls === false ? ["none"] : ["agent-consult"],
supportsBrowserSession: Boolean(
capabilities?.supportsBrowserSession ?? provider.createBrowserSession,
),
};
if (provider.defaultModel) {
entry.defaultModel = provider.defaultModel;
}
if (capabilities?.transports) {
entry.transports = [...capabilities.transports];
}
if (capabilities?.inputAudioFormats) {
entry.inputAudioFormats = capabilities.inputAudioFormats.map((format) => ({ ...format }));
}
if (capabilities?.outputAudioFormats) {
entry.outputAudioFormats = capabilities.outputAudioFormats.map((format) => ({
...format,
}));
}
if (capabilities?.supportsBargeIn !== undefined) {
entry.supportsBargeIn = capabilities.supportsBargeIn;
}
if (capabilities?.supportsToolCalls !== undefined) {
entry.supportsToolCalls = capabilities.supportsToolCalls;
}
if (capabilities?.supportsVideoFrames !== undefined) {
entry.supportsVideoFrames = capabilities.supportsVideoFrames;
}
if (capabilities?.supportsSessionResumption !== undefined) {
entry.supportsSessionResumption = capabilities.supportsSessionResumption;
}
return entry;
}),
},
};
}
/**
 * Returns the instruction prompt for the realtime voice session. The prompt
 * names the agent-consult tool the model should call.
 */
function buildRealtimeInstructions(): string {
  const sentences = [
    "You are OpenClaw's realtime voice interface.",
    "Keep spoken replies concise.",
    `If the user asks for code, repository state, tools, files, current OpenClaw context, or deeper reasoning, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} and then summarize the result naturally.`,
  ];
  return sentences.join(" ");
}
/**
 * Overlays request-level `model` / `voice` overrides on a provider config.
 * Returns the same `providerConfig` object when no override applies, otherwise
 * a shallow copy with the overrides set.
 */
function withRealtimeBrowserOverrides(
  providerConfig: RealtimeVoiceProviderConfig,
  params: { model?: string; voice?: string },
): RealtimeVoiceProviderConfig {
  const model = normalizeOptionalString(params.model);
  const voice = normalizeOptionalString(params.voice);
  if (!model && !voice) {
    return providerConfig;
  }
  return {
    ...providerConfig,
    ...(model ? { model } : {}),
    ...(voice ? { voice } : {}),
  };
}
/**
 * Flags browser sessions from the "google" provider that use the "webrtc-sdp"
 * transport. A session without a `transport` field is treated as "webrtc-sdp".
 */
function isUnsupportedBrowserWebRtcSession(session: RealtimeVoiceBrowserSession): boolean {
  const providerId = normalizeLowercaseStringOrEmpty(session.provider);
  const declaredTransport = (session as { transport?: string }).transport;
  const effectiveTransport = declaredTransport ?? "webrtc-sdp";
  if (providerId !== "google") {
    return false;
  }
  return effectiveTransport === "webrtc-sdp";
}
function isFallbackEligibleTalkReason(reason: TalkSpeakReason): boolean {
return (
reason === "talk_unconfigured" ||
@@ -443,7 +534,89 @@ function stripUnresolvedSecretApiKeyFromRecord(
return rest;
}
/**
 * Starts an agent consult for a realtime tool call by invoking the "chat.send"
 * handler in-process and capturing what it passes to `respond`.
 *
 * @param params.sessionKey - Chat session to send the consult message to.
 * @param params.callId - Tool call id; embedded in the generated idempotency key.
 * @param params.args - Raw tool arguments, converted into the chat message.
 * @param params.relaySessionId - With `connId`, the relay session the run is registered on.
 * @param params.connId - Connection id used for that registration.
 * @param params.request - Original handler request, reused as the base of the chat.send call.
 * @returns `{ ok: true, runId, idempotencyKey }` on success. The run id comes
 *   from the chat.send result and falls back to the idempotency key. Otherwise
 *   `{ ok: false, error }`: INVALID_REQUEST when the message cannot be built,
 *   UNAVAILABLE when chat.send never responds, or chat.send's own error.
 */
async function startRealtimeToolCallAgentConsult(params: {
  sessionKey: string;
  callId: string;
  args: unknown;
  relaySessionId?: string;
  connId?: string;
  request: Parameters<GatewayRequestHandlers[string]>[0];
}): Promise<
  { ok: true; runId: string; idempotencyKey: string } | { ok: false; error: ErrorShape }
> {
  type ChatSendOutcome = { ok: true; result: unknown } | { ok: false; error: ErrorShape };
  const { sessionKey, callId, relaySessionId, connId, request } = params;

  let message: string;
  try {
    message = buildRealtimeVoiceAgentConsultChatMessage(params.args);
  } catch (err) {
    return { ok: false, error: errorShape(ErrorCodes.INVALID_REQUEST, formatForLog(err)) };
  }

  const idempotencyKey = `talk-${callId}-${randomUUID()}`;
  // Holder object so the value written by the respond callback is visible after the await.
  const captured: { outcome?: ChatSendOutcome } = {};
  const captureResponse = (ok: boolean, result?: unknown, error?: ErrorShape): void => {
    if (ok) {
      captured.outcome = { ok: true, result };
      return;
    }
    captured.outcome = {
      ok: false,
      error: error ?? errorShape(ErrorCodes.UNAVAILABLE, "chat.send failed without error"),
    };
  };

  await chatHandlers["chat.send"]({
    ...request,
    req: {
      type: "req",
      id: `${request.req.id}:talk-tool-call`,
      method: "chat.send",
    },
    params: { sessionKey, message, idempotencyKey },
    respond: captureResponse,
  } as never);

  const chatResponse = captured.outcome;
  if (!chatResponse) {
    return {
      ok: false,
      error: errorShape(ErrorCodes.UNAVAILABLE, "chat.send did not return a realtime tool result"),
    };
  }
  if (!chatResponse.ok) {
    return { ok: false, error: chatResponse.error };
  }

  const runId = normalizeOptionalString(asRecord(chatResponse.result)?.runId) ?? idempotencyKey;
  // The run is registered on the relay only when both identifiers are present.
  if (relaySessionId && connId) {
    registerTalkRealtimeRelayAgentRun({ relaySessionId, connId, sessionKey, runId });
  }
  return { ok: true, runId, idempotencyKey };
}
export const talkHandlers: GatewayRequestHandlers = {
...talkSessionHandlers,
// Returns the Talk catalog built from the current runtime config.
// Missing params are treated as an empty object before validation; a failure
// while building the catalog is reported as UNAVAILABLE.
"talk.catalog": async ({ params, respond, context }) => {
  const catalogParams = params ?? {};
  if (validateTalkCatalogParams(catalogParams)) {
    try {
      const catalog = buildTalkCatalog(context.getRuntimeConfig());
      respond(true, catalog, undefined);
    } catch (err) {
      respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
    }
    return;
  }
  const details = formatValidationErrors(validateTalkCatalogParams.errors);
  respond(
    false,
    undefined,
    errorShape(ErrorCodes.INVALID_REQUEST, `invalid talk.catalog params: ${details}`),
  );
},
"talk.config": async ({ params, respond, client, context }) => {
if (!validateTalkConfigParams(params)) {
respond(
@@ -492,6 +665,200 @@ export const talkHandlers: GatewayRequestHandlers = {
respond(true, { config: configPayload }, undefined);
},
// Creates a Talk handoff for a resolved session key.
// - Rejects invalid params with INVALID_REQUEST.
// - brain="direct-tools" is allowed only when canUseTalkDirectTools(client) passes.
// - The session key is resolved with includeGlobal/includeUnknown enabled; a
//   failed resolution is returned to the caller as-is.
// Resolution and handoff creation run inside try/catch so a thrown error is
// reported as UNAVAILABLE. Previously a throw from the awaited resolver or from
// createTalkHandoff escaped the handler without any response, unlike the same
// path in talk.session.create.
"talk.handoff.create": async ({ params, respond, client, context }) => {
  if (!validateTalkHandoffCreateParams(params)) {
    respond(
      false,
      undefined,
      errorShape(
        ErrorCodes.INVALID_REQUEST,
        `invalid talk.handoff.create params: ${formatValidationErrors(validateTalkHandoffCreateParams.errors)}`,
      ),
    );
    return;
  }
  if (params.brain === "direct-tools" && !canUseTalkDirectTools(client)) {
    respond(
      false,
      undefined,
      errorShape(
        ErrorCodes.INVALID_REQUEST,
        `talk.handoff.create brain="direct-tools" requires gateway scope: ${ADMIN_SCOPE}`,
      ),
    );
    return;
  }
  try {
    const resolvedSession = await resolveSessionKeyFromResolveParams({
      cfg: context.getRuntimeConfig(),
      p: {
        key: params.sessionKey,
        includeGlobal: true,
        includeUnknown: true,
      },
    });
    if (!resolvedSession.ok) {
      respond(false, undefined, resolvedSession.error);
      return;
    }
    respond(true, createTalkHandoff({ ...params, sessionKey: resolvedSession.key }), undefined);
  } catch (err) {
    respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
  }
},
// Joins the calling connection to a Talk handoff room.
// On success, replacement events go to the client that was replaced (if any),
// then the active-client events go to the joining connection, and the handoff
// record is returned. An "invalid_token" failure maps to INVALID_REQUEST; every
// other failure reason maps to UNAVAILABLE.
"talk.handoff.join": async ({ params, respond, client, context }) => {
  if (!validateTalkHandoffJoinParams(params)) {
    const details = formatValidationErrors(validateTalkHandoffJoinParams.errors);
    respond(
      false,
      undefined,
      errorShape(ErrorCodes.INVALID_REQUEST, `invalid talk.handoff.join params: ${details}`),
    );
    return;
  }
  const joinerConnId = client?.connId;
  const joined = joinTalkHandoff(params.id, params.token, { clientId: joinerConnId });
  if (!joined.ok) {
    const code =
      joined.reason === "invalid_token" ? ErrorCodes.INVALID_REQUEST : ErrorCodes.UNAVAILABLE;
    respond(false, undefined, errorShape(code, `talk handoff join failed: ${joined.reason}`));
    return;
  }
  const { record } = joined;
  const room = { handoffId: record.id, roomId: record.roomId };
  broadcastTalkRoomEvents(context, joined.replacedClientId, {
    ...room,
    events: joined.replacementEvents,
  });
  broadcastTalkRoomEvents(context, joinerConnId, {
    ...room,
    events: joined.activeClientEvents,
  });
  respond(true, record, undefined);
},
// Revokes a Talk handoff by id and forwards the resulting room events to the
// room's active client. The response always reports ok and carries the
// `revoked` flag from the revoke result. A missing room id is broadcast as "".
"talk.handoff.revoke": async ({ params, respond, context }) => {
  if (!validateTalkHandoffRevokeParams(params)) {
    const details = formatValidationErrors(validateTalkHandoffRevokeParams.errors);
    respond(
      false,
      undefined,
      errorShape(ErrorCodes.INVALID_REQUEST, `invalid talk.handoff.revoke params: ${details}`),
    );
    return;
  }
  const handoffId = params.id;
  const revocation = revokeTalkHandoff(handoffId);
  broadcastTalkRoomEvents(context, revocation.activeClientId, {
    handoffId,
    roomId: revocation.roomId ?? "",
    events: revocation.events,
  });
  respond(true, { ok: true, revoked: revocation.revoked }, undefined);
},
// Starts a turn in a Talk handoff room on behalf of the calling connection.
// Failures are mapped to an error code by talkHandoffErrorCode; on success the
// turn events are broadcast to the room's active client and the full result is
// returned.
"talk.handoff.turnStart": async ({ params, respond, client, context }) => {
  if (!validateTalkHandoffTurnStartParams(params)) {
    const details = formatValidationErrors(validateTalkHandoffTurnStartParams.errors);
    respond(
      false,
      undefined,
      errorShape(ErrorCodes.INVALID_REQUEST, `invalid talk.handoff.turnStart params: ${details}`),
    );
    return;
  }
  const started = startTalkHandoffTurn(params.id, params.token, {
    turnId: params.turnId,
    clientId: client?.connId,
  });
  if (!started.ok) {
    const code = talkHandoffErrorCode(started.reason);
    respond(
      false,
      undefined,
      errorShape(code, `talk handoff turn start failed: ${started.reason}`),
    );
    return;
  }
  const { record, events } = started;
  broadcastTalkRoomEvents(context, record.room.activeClientId, {
    handoffId: record.id,
    roomId: record.roomId,
    events,
  });
  respond(true, started, undefined);
},
// Ends the active turn of a handoff room (optionally pinned to a specific
// turn id), forwards the resulting events to the room's active client and
// returns the full turn result to the caller.
"talk.handoff.turnEnd": async ({ params, respond, context }) => {
  if (!validateTalkHandoffTurnEndParams(params)) {
    const detail = formatValidationErrors(validateTalkHandoffTurnEndParams.errors);
    const invalid = errorShape(
      ErrorCodes.INVALID_REQUEST,
      `invalid talk.handoff.turnEnd params: ${detail}`,
    );
    respond(false, undefined, invalid);
    return;
  }

  const ended = endTalkHandoffTurn(params.id, params.token, { turnId: params.turnId });
  if (!ended.ok) {
    const failure = errorShape(
      talkHandoffErrorCode(ended.reason),
      `talk handoff turn end failed: ${ended.reason}`,
    );
    respond(false, undefined, failure);
    return;
  }

  const { record, events } = ended;
  broadcastTalkRoomEvents(context, record.room.activeClientId, {
    handoffId: record.id,
    roomId: record.roomId,
    events,
  });
  respond(true, ended, undefined);
},
// Cancels the active turn of a handoff room, passing along the caller's
// optional turn id and reason, forwards the resulting events to the room's
// active client and returns the full turn result to the caller.
"talk.handoff.turnCancel": async ({ params, respond, context }) => {
  if (!validateTalkHandoffTurnCancelParams(params)) {
    const detail = formatValidationErrors(validateTalkHandoffTurnCancelParams.errors);
    const invalid = errorShape(
      ErrorCodes.INVALID_REQUEST,
      `invalid talk.handoff.turnCancel params: ${detail}`,
    );
    respond(false, undefined, invalid);
    return;
  }

  const cancelled = cancelTalkHandoffTurn(params.id, params.token, {
    turnId: params.turnId,
    reason: params.reason,
  });
  if (!cancelled.ok) {
    const failure = errorShape(
      talkHandoffErrorCode(cancelled.reason),
      `talk handoff turn cancel failed: ${cancelled.reason}`,
    );
    respond(false, undefined, failure);
    return;
  }

  const { record, events } = cancelled;
  broadcastTalkRoomEvents(context, record.room.activeClientId, {
    handoffId: record.id,
    roomId: record.roomId,
    events,
  });
  respond(true, cancelled, undefined);
},
"talk.realtime.session": async ({ params, respond, context, client }) => {
if (!validateTalkRealtimeSessionParams(params)) {
respond(
@@ -508,10 +875,54 @@ export const talkHandlers: GatewayRequestHandlers = {
provider?: string;
model?: string;
voice?: string;
mode?: string;
transport?: string;
brain?: string;
};
try {
const runtimeConfig = context.getRuntimeConfig();
const realtimeConfig = buildTalkRealtimeConfig(runtimeConfig, typedParams.provider);
const mode =
normalizeOptionalLowercaseString(typedParams.mode) ?? realtimeConfig.mode ?? "realtime";
if (mode !== "realtime") {
respond(
false,
undefined,
errorShape(
ErrorCodes.INVALID_REQUEST,
`talk.realtime.session only supports mode="realtime"; use talk.catalog for ${mode} provider discovery`,
),
);
return;
}
const brain =
normalizeOptionalLowercaseString(typedParams.brain) ??
realtimeConfig.brain ??
"agent-consult";
if (brain !== "agent-consult") {
respond(
false,
undefined,
errorShape(
ErrorCodes.INVALID_REQUEST,
`talk.realtime.session only supports brain="agent-consult"`,
),
);
return;
}
const transport =
normalizeOptionalLowercaseString(typedParams.transport) ?? realtimeConfig.transport;
if (transport === "managed-room") {
respond(
false,
undefined,
errorShape(
ErrorCodes.UNAVAILABLE,
"managed-room realtime Talk sessions are not available in the browser UI yet",
),
);
return;
}
const resolution = resolveConfiguredRealtimeVoiceProvider({
configuredProviderId: realtimeConfig.provider,
providerConfigs: realtimeConfig.providers,
@@ -519,18 +930,32 @@ export const talkHandlers: GatewayRequestHandlers = {
cfgForResolve: runtimeConfig,
noRegisteredProviderMessage: "No realtime voice provider registered",
});
if (resolution.provider.createBrowserSession) {
if (resolution.provider.createBrowserSession && transport !== "gateway-relay") {
const session = await resolution.provider.createBrowserSession({
providerConfig: resolution.providerConfig,
instructions: buildRealtimeInstructions(),
tools: [REALTIME_VOICE_AGENT_CONSULT_TOOL],
model: normalizeOptionalString(typedParams.model),
voice: normalizeOptionalString(typedParams.voice),
model: normalizeOptionalString(typedParams.model) ?? realtimeConfig.model,
voice: normalizeOptionalString(typedParams.voice) ?? realtimeConfig.voice,
});
if (!isUnsupportedBrowserWebRtcSession(session)) {
if (
!isUnsupportedBrowserWebRtcSession(session) &&
(!transport || session.transport === transport)
) {
respond(true, session, undefined);
return;
}
if (transport) {
respond(
false,
undefined,
errorShape(
ErrorCodes.UNAVAILABLE,
`Realtime provider "${resolution.provider.id}" does not support requested browser transport "${transport}"`,
),
);
return;
}
}
const connId = client?.connId;
@@ -542,8 +967,8 @@ export const talkHandlers: GatewayRequestHandlers = {
);
return;
}
const model = normalizeOptionalString(typedParams.model);
const voice = normalizeOptionalString(typedParams.voice);
const model = normalizeOptionalString(typedParams.model) ?? realtimeConfig.model;
const voice = normalizeOptionalString(typedParams.voice) ?? realtimeConfig.voice;
const session = createTalkRealtimeRelaySession({
context,
connId,
@@ -559,6 +984,49 @@ export const talkHandlers: GatewayRequestHandlers = {
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
}
},
// Handles a tool call raised by a realtime Talk session. Only the agent-consult
// tool is accepted; any other tool name is rejected as an invalid request.
// On success the caller receives the run id and idempotency key of the
// consult that was started.
"talk.realtime.toolCall": async (request) => {
  const { params, respond } = request;
  if (!validateTalkRealtimeToolCallParams(params)) {
    const detail = formatValidationErrors(validateTalkRealtimeToolCallParams.errors);
    const invalid = errorShape(
      ErrorCodes.INVALID_REQUEST,
      `invalid talk.realtime.toolCall params: ${detail}`,
    );
    respond(false, undefined, invalid);
    return;
  }

  if (params.name !== REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME) {
    const unsupported = errorShape(
      ErrorCodes.INVALID_REQUEST,
      `unsupported realtime Talk tool: ${params.name}`,
    );
    respond(false, undefined, unsupported);
    return;
  }

  const consult = await startRealtimeToolCallAgentConsult({
    sessionKey: params.sessionKey,
    callId: params.callId,
    // Missing tool arguments are passed on as an empty object.
    args: params.args ?? {},
    relaySessionId: normalizeOptionalString(params.relaySessionId),
    connId: normalizeOptionalString(request.client?.connId),
    request,
  });
  if (consult.ok) {
    const { runId, idempotencyKey } = consult;
    respond(true, { runId, idempotencyKey }, undefined);
    return;
  }
  respond(false, undefined, consult.error);
},
"talk.realtime.relayAudio": async ({ params, respond, client }) => {
if (!validateTalkRealtimeRelayAudioParams(params)) {
respond(
@@ -612,6 +1080,34 @@ export const talkHandlers: GatewayRequestHandlers = {
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
}
},
// Cancels the current turn of a realtime relay session owned by the calling
// connection. Requires a connection id; any error thrown by the relay is
// reported to the caller as UNAVAILABLE.
"talk.realtime.relayCancel": async ({ params, respond, client }) => {
  if (!validateTalkRealtimeRelayCancelParams(params)) {
    const detail = formatValidationErrors(validateTalkRealtimeRelayCancelParams.errors);
    const invalid = errorShape(
      ErrorCodes.INVALID_REQUEST,
      `invalid talk.realtime.relayCancel params: ${detail}`,
    );
    respond(false, undefined, invalid);
    return;
  }

  const connId = client?.connId;
  if (!connId) {
    const unavailable = errorShape(ErrorCodes.UNAVAILABLE, "realtime relay unavailable");
    respond(false, undefined, unavailable);
    return;
  }

  try {
    const reason = normalizeOptionalString(params.reason);
    cancelTalkRealtimeRelayTurn({ relaySessionId: params.relaySessionId, connId, reason });
    respond(true, { ok: true }, undefined);
  } catch (err) {
    respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
  }
},
"talk.realtime.relayStop": async ({ params, respond, client }) => {
if (!validateTalkRealtimeRelayStopParams(params)) {
respond(
@@ -665,6 +1161,141 @@ export const talkHandlers: GatewayRequestHandlers = {
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
}
},
// Opens a transcription relay session for the calling connection.
// The transcription provider is resolved from the runtime config (optionally
// narrowed by `params.provider`); any failure while resolving or creating the
// session is reported to the caller as UNAVAILABLE.
"talk.transcription.session": async ({ params, respond, context, client }) => {
  if (!validateTalkTranscriptionSessionParams(params)) {
    const detail = formatValidationErrors(validateTalkTranscriptionSessionParams.errors);
    const invalid = errorShape(
      ErrorCodes.INVALID_REQUEST,
      `invalid talk.transcription.session params: ${detail}`,
    );
    respond(false, undefined, invalid);
    return;
  }

  const connId = client?.connId;
  if (!connId) {
    const unavailable = errorShape(
      ErrorCodes.UNAVAILABLE,
      "transcription relay requires a connected client",
    );
    respond(false, undefined, unavailable);
    return;
  }

  try {
    const runtimeConfig = context.getRuntimeConfig();
    const { provider: configuredProviderId, providers: providerConfigs } =
      buildTalkTranscriptionConfig(runtimeConfig, params.provider);
    const { provider, providerConfig } = resolveConfiguredRealtimeTranscriptionProvider({
      config: runtimeConfig,
      configuredProviderId,
      providerConfigs,
    });
    const session = createTalkTranscriptionRelaySession({
      context,
      connId,
      provider,
      providerConfig,
    });
    respond(true, session, undefined);
  } catch (err) {
    respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
  }
},
// Forwards a base64 audio chunk from the calling connection to its
// transcription relay session. Requires a connection id; any error thrown by
// the relay is reported to the caller as UNAVAILABLE.
"talk.transcription.relayAudio": async ({ params, respond, client }) => {
  if (!validateTalkTranscriptionRelayAudioParams(params)) {
    const detail = formatValidationErrors(validateTalkTranscriptionRelayAudioParams.errors);
    const invalid = errorShape(
      ErrorCodes.INVALID_REQUEST,
      `invalid talk.transcription.relayAudio params: ${detail}`,
    );
    respond(false, undefined, invalid);
    return;
  }

  const connId = client?.connId;
  if (!connId) {
    const unavailable = errorShape(ErrorCodes.UNAVAILABLE, "transcription relay unavailable");
    respond(false, undefined, unavailable);
    return;
  }

  try {
    const { transcriptionSessionId, audioBase64 } = params;
    sendTalkTranscriptionRelayAudio({ transcriptionSessionId, connId, audioBase64 });
    respond(true, { ok: true }, undefined);
  } catch (err) {
    respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
  }
},
// Cancels the current turn of the calling connection's transcription relay
// session, passing along the optional (normalized) reason. Requires a
// connection id; any error thrown by the relay is reported as UNAVAILABLE.
"talk.transcription.relayCancel": async ({ params, respond, client }) => {
  if (!validateTalkTranscriptionRelayCancelParams(params)) {
    const detail = formatValidationErrors(validateTalkTranscriptionRelayCancelParams.errors);
    const invalid = errorShape(
      ErrorCodes.INVALID_REQUEST,
      `invalid talk.transcription.relayCancel params: ${detail}`,
    );
    respond(false, undefined, invalid);
    return;
  }

  const connId = client?.connId;
  if (!connId) {
    const unavailable = errorShape(ErrorCodes.UNAVAILABLE, "transcription relay unavailable");
    respond(false, undefined, unavailable);
    return;
  }

  try {
    const reason = normalizeOptionalString(params.reason);
    cancelTalkTranscriptionRelayTurn({
      transcriptionSessionId: params.transcriptionSessionId,
      connId,
      reason,
    });
    respond(true, { ok: true }, undefined);
  } catch (err) {
    respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
  }
},
// Stops the calling connection's transcription relay session. Requires a
// connection id; any error thrown by the relay is reported as UNAVAILABLE.
"talk.transcription.relayStop": async ({ params, respond, client }) => {
  if (!validateTalkTranscriptionRelayStopParams(params)) {
    const detail = formatValidationErrors(validateTalkTranscriptionRelayStopParams.errors);
    const invalid = errorShape(
      ErrorCodes.INVALID_REQUEST,
      `invalid talk.transcription.relayStop params: ${detail}`,
    );
    respond(false, undefined, invalid);
    return;
  }

  const connId = client?.connId;
  if (!connId) {
    const unavailable = errorShape(ErrorCodes.UNAVAILABLE, "transcription relay unavailable");
    respond(false, undefined, unavailable);
    return;
  }

  try {
    const { transcriptionSessionId } = params;
    stopTalkTranscriptionRelaySession({ transcriptionSessionId, connId });
    respond(true, { ok: true }, undefined);
  } catch (err) {
    respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
  }
},
"talk.speak": async ({ params, respond, context }) => {
if (!validateTalkSpeakParams(params)) {
respond(
@@ -763,11 +1394,11 @@ export const talkHandlers: GatewayRequestHandlers = {
}
},
"talk.mode": ({ params, respond, context, client, isWebchatConnect }) => {
if (client && isWebchatConnect(client.connect) && !context.hasConnectedMobileNode()) {
if (client && isWebchatConnect(client.connect) && !context.hasConnectedTalkNode()) {
respond(
false,
undefined,
errorShape(ErrorCodes.UNAVAILABLE, "talk disabled: no connected iOS/Android nodes"),
errorShape(ErrorCodes.UNAVAILABLE, "talk disabled: no connected Talk-capable nodes"),
);
return;
}

View File

@@ -1,12 +0,0 @@
import { normalizeOptionalLowercaseString } from "../shared/string-coerce.js";
import type { NodeRegistry } from "./node-registry.js";
/**
 * Reports whether at least one connected node in the registry identifies its
 * platform with an `ios`, `ipados` or `android` prefix (case-insensitive via
 * the shared lowercase normalizer). Nodes without a platform never match.
 */
export function hasConnectedMobileNode(registry: NodeRegistry): boolean {
  const mobilePrefixes = ["ios", "ipados", "android"];
  return registry.listConnected().some((node) => {
    const platform = normalizeOptionalLowercaseString(node.platform) ?? "";
    return mobilePrefixes.some((prefix) => platform.startsWith(prefix));
  });
}

View File

@@ -4,8 +4,8 @@ import {
createSessionMessageSubscriberRegistry,
} from "./server-chat-state.js";
import { safeParseJson } from "./server-json.js";
import { hasConnectedMobileNode } from "./server-mobile-nodes.js";
import { createNodeSubscriptionManager } from "./server-node-subscriptions.js";
import { hasConnectedTalkNode } from "./server-talk-nodes.js";
export function createGatewayNodeSessionRuntime(params: {
broadcast: (event: string, payload: unknown, opts?: { dropIfSlow?: boolean }) => void;
@@ -26,7 +26,7 @@ export function createGatewayNodeSessionRuntime(params: {
const broadcastVoiceWakeChanged = (triggers: string[]) => {
params.broadcast("voicewake.changed", { triggers }, { dropIfSlow: true });
};
const hasMobileNodeConnected = () => hasConnectedMobileNode(nodeRegistry);
const hasTalkNodeConnected = () => hasConnectedTalkNode(nodeRegistry);
return {
nodeRegistry,
@@ -39,6 +39,6 @@ export function createGatewayNodeSessionRuntime(params: {
nodeUnsubscribe: nodeSubscriptions.unsubscribe,
nodeUnsubscribeAll: nodeSubscriptions.unsubscribeAll,
broadcastVoiceWakeChanged,
hasMobileNodeConnected,
hasTalkNodeConnected,
};
}

View File

@@ -33,7 +33,7 @@ describe("createGatewayRequestContext", () => {
nodeSubscribe: vi.fn(),
nodeUnsubscribe: vi.fn(),
nodeUnsubscribeAll: vi.fn(),
hasConnectedMobileNode: vi.fn(() => false),
hasConnectedTalkNode: vi.fn(() => false),
clients: new Set(),
enforceSharedGatewayAuthGenerationForConfigWrite: vi.fn(),
nodeRegistry: {} as never,

View File

@@ -28,7 +28,7 @@ type GatewayRequestContextParams = {
nodeSubscribe: GatewayRequestContext["nodeSubscribe"];
nodeUnsubscribe: GatewayRequestContext["nodeUnsubscribe"];
nodeUnsubscribeAll: GatewayRequestContext["nodeUnsubscribeAll"];
hasConnectedMobileNode: GatewayRequestContext["hasConnectedMobileNode"];
hasConnectedTalkNode: GatewayRequestContext["hasConnectedTalkNode"];
clients: Set<GatewayRequestContextClient>;
enforceSharedGatewayAuthGenerationForConfigWrite: (nextConfig: OpenClawConfig) => void;
nodeRegistry: GatewayRequestContext["nodeRegistry"];
@@ -92,7 +92,7 @@ export function createGatewayRequestContext(
nodeSubscribe: params.nodeSubscribe,
nodeUnsubscribe: params.nodeUnsubscribe,
nodeUnsubscribeAll: params.nodeUnsubscribeAll,
hasConnectedMobileNode: params.hasConnectedMobileNode,
hasConnectedTalkNode: params.hasConnectedTalkNode,
hasExecApprovalClients: (excludeConnId?: string) => {
for (const gatewayClient of params.clients) {
if (excludeConnId && gatewayClient.connId === excludeConnId) {

View File

@@ -884,7 +884,7 @@ export async function startGatewayServer(
nodeUnsubscribe,
nodeUnsubscribeAll,
broadcastVoiceWakeChanged,
hasMobileNodeConnected,
hasTalkNodeConnected,
} = createGatewayNodeSessionRuntime({ broadcast });
applyGatewayLaneConcurrency(cfgAtStart);
@@ -1261,7 +1261,7 @@ export async function startGatewayServer(
nodeSubscribe,
nodeUnsubscribe,
nodeUnsubscribeAll,
hasConnectedMobileNode: hasMobileNodeConnected,
hasConnectedTalkNode: hasTalkNodeConnected,
clients,
enforceSharedGatewayAuthGenerationForConfigWrite: (nextConfig: OpenClawConfig) => {
enforceSharedGatewaySessionGenerationForConfigWrite({

View File

@@ -0,0 +1,286 @@
import { describe, expect, it, vi } from "vitest";
import {
cancelTalkHandoffTurn,
clearTalkHandoffsForTest,
createTalkHandoff,
endTalkHandoffTurn,
getTalkHandoff,
joinTalkHandoff,
revokeTalkHandoff,
startTalkHandoffTurn,
verifyTalkHandoffToken,
} from "./talk-handoff.js";
describe("talk handoff store", () => {
it("creates an expiring managed-room handoff without storing the plaintext token", () => {
  vi.useFakeTimers();
  // Restore real timers in `finally`: if any assertion below throws, the fake
  // clock must not leak into the remaining tests of this file.
  try {
    vi.setSystemTime(new Date("2026-05-05T12:00:00.000Z"));
    clearTalkHandoffsForTest();
    const handoff = createTalkHandoff({
      sessionKey: "session:main",
      sessionId: "session-id",
      channel: "discord",
      target: "dm:123",
      provider: "openai",
      model: "gpt-realtime-1.5",
      voice: "alloy",
      ttlMs: 5000,
    });
    const record = getTalkHandoff(handoff.id);
    // The public result carries the defaults (stt-tts / managed-room /
    // agent-consult) and the initial "session.started" room event.
    expect(handoff).toMatchObject({
      roomId: `talk_${handoff.id}`,
      roomUrl: `/talk/rooms/talk_${handoff.id}`,
      sessionKey: "session:main",
      sessionId: "session-id",
      channel: "discord",
      target: "dm:123",
      provider: "openai",
      model: "gpt-realtime-1.5",
      voice: "alloy",
      mode: "stt-tts",
      transport: "managed-room",
      brain: "agent-consult",
      createdAt: Date.parse("2026-05-05T12:00:00.000Z"),
      expiresAt: Date.parse("2026-05-05T12:00:05.000Z"),
      room: {
        activeClientId: undefined,
        recentTalkEvents: [
          expect.objectContaining({
            type: "session.started",
            sessionId: `talk_${handoff.id}`,
            transport: "managed-room",
          }),
        ],
      },
    });
    // Only the hash is stored; the plaintext token exists solely in the create result.
    expect(handoff).not.toHaveProperty("tokenHash");
    expect(record?.tokenHash).toBeTruthy();
    expect(record?.tokenHash).not.toBe(handoff.token);
    expect(record && verifyTalkHandoffToken(record, handoff.token)).toBe(true);
    // One millisecond past the 5s TTL the record must be pruned on lookup.
    vi.advanceTimersByTime(5001);
    expect(getTalkHandoff(handoff.id)).toBeUndefined();
  } finally {
    vi.useRealTimers();
  }
});
// Covers the token gate on join and the effect of revocation: a wrong token is
// rejected, the issued token joins, and after revoke the same token no longer
// finds the handoff.
it("joins and revokes handoffs with only the bearer token", () => {
clearTalkHandoffsForTest();
const handoff = createTalkHandoff({ sessionKey: "session:main" });
// A token that does not hash to the stored value must be rejected.
expect(joinTalkHandoff(handoff.id, "wrong")).toEqual({
ok: false,
reason: "invalid_token",
});
expect(joinTalkHandoff(handoff.id, handoff.token)).toMatchObject({
ok: true,
events: [expect.objectContaining({ type: "session.ready" })],
record: expect.objectContaining({
id: handoff.id,
roomId: handoff.roomId,
sessionKey: "session:main",
}),
});
// Revocation takes only the id; afterwards the record is gone from the store.
expect(revokeTalkHandoff(handoff.id)).toMatchObject({ revoked: true });
expect(joinTalkHandoff(handoff.id, handoff.token)).toEqual({
ok: false,
reason: "not_found",
});
});
// Walks one room through its lifecycle: first join (ready), second join from a
// different client (replaced + ready), then revoke (closed).
it("records managed-room ready, replacement, and close lifecycle events", () => {
clearTalkHandoffsForTest();
const handoff = createTalkHandoff({ sessionKey: "session:main" });
// First join: no previous client, so only "session.ready" is emitted.
const firstJoin = joinTalkHandoff(handoff.id, handoff.token, { clientId: "conn-1" });
expect(firstJoin).toMatchObject({
ok: true,
events: [
expect.objectContaining({
type: "session.ready",
sessionId: handoff.roomId,
payload: expect.objectContaining({ clientId: "conn-1" }),
}),
],
record: {
room: expect.objectContaining({
activeClientId: "conn-1",
}),
},
});
// Second join from another client: "session.replaced" precedes "session.ready".
const secondJoin = joinTalkHandoff(handoff.id, handoff.token, { clientId: "conn-2" });
expect(secondJoin).toMatchObject({
ok: true,
events: [
expect.objectContaining({
type: "session.replaced",
sessionId: handoff.roomId,
payload: expect.objectContaining({
previousClientId: "conn-1",
nextClientId: "conn-2",
}),
}),
expect.objectContaining({
type: "session.ready",
sessionId: handoff.roomId,
payload: expect.objectContaining({ clientId: "conn-2" }),
}),
],
record: {
room: expect.objectContaining({
activeClientId: "conn-2",
}),
},
});
// Revoke reports the last active client and a final "session.closed" event.
expect(revokeTalkHandoff(handoff.id)).toMatchObject({
revoked: true,
activeClientId: "conn-2",
events: [
expect.objectContaining({
type: "session.closed",
sessionId: handoff.roomId,
payload: expect.objectContaining({ reason: "revoked" }),
final: true,
}),
],
});
});
// Exercises the turn lifecycle on a joined room: start, end, cancel with no
// active turn, then start again and cancel with an explicit reason.
it("records managed-room turn start, end, and cancellation events", () => {
clearTalkHandoffsForTest();
const handoff = createTalkHandoff({ sessionKey: "session:main" });
joinTalkHandoff(handoff.id, handoff.token, { clientId: "conn-1" });
const start = startTalkHandoffTurn(handoff.id, handoff.token, {
clientId: "conn-1",
turnId: "turn-1",
});
expect(start).toMatchObject({
ok: true,
turnId: "turn-1",
events: [expect.objectContaining({ type: "turn.started", turnId: "turn-1" })],
record: {
room: expect.objectContaining({
activeClientId: "conn-1",
activeTurnId: "turn-1",
}),
},
});
// Ending without a turn id targets the active turn and clears it from the room.
expect(endTalkHandoffTurn(handoff.id, handoff.token)).toMatchObject({
ok: true,
turnId: "turn-1",
events: [
expect.objectContaining({
type: "turn.ended",
turnId: "turn-1",
final: true,
}),
],
record: {
room: expect.not.objectContaining({
activeTurnId: expect.any(String),
}),
},
});
// With no turn in progress, cancel must fail rather than emit an event.
expect(cancelTalkHandoffTurn(handoff.id, handoff.token)).toEqual({
ok: false,
reason: "no_active_turn",
});
startTalkHandoffTurn(handoff.id, handoff.token, { turnId: "turn-2" });
// The caller-supplied reason is carried in the cancellation event payload.
expect(cancelTalkHandoffTurn(handoff.id, handoff.token, { reason: "barge-in" })).toMatchObject({
ok: true,
turnId: "turn-2",
events: [
expect.objectContaining({
type: "turn.cancelled",
turnId: "turn-2",
final: true,
payload: expect.objectContaining({ reason: "barge-in" }),
}),
],
});
});
// Ending or cancelling with the id of a superseded turn must fail with
// "stale_turn" and leave the current turn active.
it("rejects stale managed-room turn completion without clearing the active turn", () => {
clearTalkHandoffsForTest();
const handoff = createTalkHandoff({ sessionKey: "session:main" });
// Starting a second turn supersedes the first without an explicit end.
startTalkHandoffTurn(handoff.id, handoff.token, { turnId: "turn-old" });
startTalkHandoffTurn(handoff.id, handoff.token, { turnId: "turn-current" });
expect(endTalkHandoffTurn(handoff.id, handoff.token, { turnId: "turn-old" })).toEqual({
ok: false,
reason: "stale_turn",
});
expect(getTalkHandoff(handoff.id)?.room.talk.activeTurnId).toBe("turn-current");
expect(cancelTalkHandoffTurn(handoff.id, handoff.token, { turnId: "turn-old" })).toEqual({
ok: false,
reason: "stale_turn",
});
expect(getTalkHandoff(handoff.id)?.room.talk.activeTurnId).toBe("turn-current");
// The current turn can still be ended normally afterwards.
expect(endTalkHandoffTurn(handoff.id, handoff.token, { turnId: "turn-current" })).toMatchObject(
{
ok: true,
turnId: "turn-current",
},
);
});
// Two handoffs that share channel and target must still get distinct ids,
// rooms and tokens, and a token must only open the handoff it was issued for.
it("isolates simultaneous handoffs for different sessions on the same host", () => {
clearTalkHandoffsForTest();
const first = createTalkHandoff({
sessionKey: "agent:main:first",
channel: "browser",
target: "host:local",
provider: "openai",
});
const second = createTalkHandoff({
sessionKey: "agent:main:second",
channel: "browser",
target: "host:local",
});
expect(first.id).not.toBe(second.id);
expect(first.roomId).not.toBe(second.roomId);
expect(first.token).not.toBe(second.token);
// Cross-using tokens between the two handoffs is rejected in both directions.
expect(joinTalkHandoff(first.id, second.token)).toEqual({
ok: false,
reason: "invalid_token",
});
expect(joinTalkHandoff(second.id, first.token)).toEqual({
ok: false,
reason: "invalid_token",
});
// Each handoff joins with its own token and keeps its own session metadata.
expect(joinTalkHandoff(first.id, first.token)).toMatchObject({
ok: true,
events: [expect.objectContaining({ type: "session.ready" })],
record: expect.objectContaining({
roomId: first.roomId,
sessionKey: "agent:main:first",
channel: "browser",
target: "host:local",
provider: "openai",
}),
});
expect(joinTalkHandoff(second.id, second.token)).toMatchObject({
ok: true,
events: [expect.objectContaining({ type: "session.ready" })],
record: expect.objectContaining({
roomId: second.roomId,
sessionKey: "agent:main:second",
channel: "browser",
target: "host:local",
}),
});
});
});

389
src/gateway/talk-handoff.ts Normal file
View File

@@ -0,0 +1,389 @@
import { createHash, randomBytes, randomUUID, timingSafeEqual } from "node:crypto";
import {
createTalkSessionController,
type TalkBrain,
type TalkEvent,
type TalkEventInput,
type TalkMode,
type TalkSessionController,
type TalkTransport,
} from "../realtime-voice/talk-session-controller.js";
/** Lifetime given to a handoff when the caller supplies no finite `ttlMs` (10 minutes). */
const DEFAULT_TALK_HANDOFF_TTL_MS = 10 * 60 * 1000;
/** Upper bound applied to a caller-supplied `ttlMs` (1 hour). */
const MAX_TALK_HANDOFF_TTL_MS = 60 * 60 * 1000;
/** Input for `createTalkHandoff`; only `sessionKey` is required. */
export type TalkHandoffCreateParams = {
sessionKey: string;
sessionId?: string;
channel?: string;
target?: string;
provider?: string;
model?: string;
voice?: string;
// Defaults to "stt-tts" when omitted.
mode?: TalkMode;
// Defaults to "managed-room" when omitted.
transport?: TalkTransport;
// Defaults to "agent-consult" when omitted.
brain?: TalkBrain;
// Requested lifetime in milliseconds; clamped to [1000, MAX_TALK_HANDOFF_TTL_MS].
ttlMs?: number;
};
/** Internal record held in the in-memory store. Never returned to callers as-is. */
export type TalkHandoffRecord = {
id: string;
// `talk_` followed by the handoff id.
roomId: string;
// `/talk/rooms/` followed by the room id.
roomUrl: string;
// base64url SHA-256 digest of the bearer token; the plaintext token is not stored.
tokenHash: string;
sessionKey: string;
sessionId?: string;
channel?: string;
target?: string;
provider?: string;
model?: string;
voice?: string;
mode: TalkMode;
transport: TalkTransport;
brain: TalkBrain;
// Epoch milliseconds (Date.now()) at creation.
createdAt: number;
// Epoch milliseconds after which the handoff is treated as expired.
expiresAt: number;
room: TalkHandoffRoomState;
};
/** Caller-facing view of a record: no token hash, and the room reduced to a plain snapshot. */
export type TalkHandoffPublicRecord = Omit<TalkHandoffRecord, "tokenHash" | "room"> & {
room: {
activeClientId?: string;
activeTurnId?: string;
recentTalkEvents: TalkEvent[];
};
};
/** Result of `createTalkHandoff`: the public record plus the plaintext bearer token. */
export type TalkHandoffCreateResult = TalkHandoffPublicRecord & {
token: string;
};
/** Outcome of `joinTalkHandoff`. */
export type TalkHandoffJoinResult =
| {
ok: true;
record: TalkHandoffPublicRecord;
// All events emitted by this join, in emission order.
events: TalkEvent[];
// Previously active client, set only when a different client took over the room.
replacedClientId?: string;
// The "session.replaced" events from `events` (empty when nobody was replaced).
replacementEvents: TalkEvent[];
// The events from `events` that are not "session.replaced".
activeClientEvents: TalkEvent[];
}
| { ok: false; reason: "not_found" | "expired" | "invalid_token" };
/** Outcome of `revokeTalkHandoff`; `roomId`/`activeClientId` are set only when a record was removed. */
export type TalkHandoffRevokeResult = {
revoked: boolean;
roomId?: string;
activeClientId?: string;
events: TalkEvent[];
};
/** Outcome of the turn start/end/cancel operations. */
export type TalkHandoffTurnResult =
| {
ok: true;
record: TalkHandoffPublicRecord;
turnId: string;
events: TalkEvent[];
}
| {
ok: false;
reason: "not_found" | "expired" | "invalid_token" | "no_active_turn" | "stale_turn";
};
/** Mutable per-room state: the currently active client and the room's session controller. */
type TalkHandoffRoomState = {
activeClientId?: string;
talk: TalkSessionController;
};
// Module-level in-memory store of handoff records, keyed by handoff id.
const handoffs = new Map<string, TalkHandoffRecord>();
/**
 * Creates and stores a new handoff together with its room.
 *
 * Expired handoffs are pruned first. A random id and a random 32-byte
 * base64url bearer token are generated; only the token's hash is kept in the
 * stored record. A "session.started" event is emitted on the room before the
 * record is stored.
 *
 * @param params Session metadata plus optional mode/transport/brain/TTL overrides.
 * @returns The public view of the new record together with the plaintext token.
 */
export function createTalkHandoff(params: TalkHandoffCreateParams): TalkHandoffCreateResult {
  pruneExpiredTalkHandoffs();

  const createdAt = Date.now();
  const expiresAt = createdAt + normalizeTtlMs(params.ttlMs);
  const id = randomUUID();
  const roomId = `talk_${id}`;
  const token = randomBytes(32).toString("base64url");

  // Resolve the defaults once so the room and the record cannot disagree.
  const mode = params.mode ?? "stt-tts";
  const transport = params.transport ?? "managed-room";
  const brain = params.brain ?? "agent-consult";
  const { sessionKey, sessionId, channel, target, provider, model, voice } = params;

  const room = createTalkHandoffRoom({ roomId, mode, transport, brain, provider });
  const record: TalkHandoffRecord = {
    id,
    roomId,
    roomUrl: `/talk/rooms/${roomId}`,
    tokenHash: hashTalkHandoffToken(token),
    sessionKey,
    sessionId,
    channel,
    target,
    provider,
    model,
    voice,
    mode,
    transport,
    brain,
    createdAt,
    expiresAt,
    room,
  };

  appendTalkHandoffRoomEvent(record, {
    type: "session.started",
    payload: { handoffId: id, roomId },
  });
  handoffs.set(id, record);

  const publicRecord = toPublicTalkHandoffRecord(record);
  return { ...publicRecord, token };
}
/**
 * Looks up the internal record for a handoff id.
 * Expired handoffs are pruned before the lookup, so an expired id yields `undefined`.
 */
export function getTalkHandoff(id: string): TalkHandoffRecord | undefined {
  pruneExpiredTalkHandoffs();
  const record = handoffs.get(id);
  return record;
}
/**
 * Joins a handoff room as `opts.clientId` after checking id, expiry and token.
 *
 * When a different client was active before the join, it is reported as
 * `replacedClientId` and the emitted events are split into the
 * "session.replaced" events (`replacementEvents`) and everything else
 * (`activeClientEvents`). Otherwise `activeClientEvents` is the full event list.
 *
 * @returns The access failure, or the public record plus the emitted events.
 */
export function joinTalkHandoff(
  id: string,
  token: string,
  opts: { clientId?: string } = {},
): TalkHandoffJoinResult {
  const access = resolveTalkHandoffAccess(id, token);
  if (!access.ok) {
    return access;
  }

  const { record } = access;
  const { clientId } = opts;
  // Capture the previous owner before the join overwrites it.
  const previousClientId = record.room.activeClientId;
  const events = joinTalkHandoffRoom(record, clientId);

  let replacedClientId: string | undefined;
  let replacementEvents: TalkEvent[] = [];
  let activeClientEvents: TalkEvent[] = events;
  if (previousClientId && previousClientId !== clientId) {
    replacedClientId = previousClientId;
    activeClientEvents = [];
    for (const event of events) {
      const bucket = event.type === "session.replaced" ? replacementEvents : activeClientEvents;
      bucket.push(event);
    }
  }

  return {
    ok: true,
    record: toPublicTalkHandoffRecord(record),
    events,
    replacedClientId,
    replacementEvents,
    activeClientEvents,
  };
}
/**
 * Starts a turn in a handoff room after checking id, expiry and token.
 *
 * A non-empty `opts.clientId` becomes the room's active client. The turn id is
 * the trimmed `opts.turnId`, or a fresh random UUID when none is given.
 *
 * @returns The access failure, or the public record, the turn id and the
 *          event produced by the controller (if it produced one).
 */
export function startTalkHandoffTurn(
  id: string,
  token: string,
  opts: { turnId?: string; clientId?: string } = {},
): TalkHandoffTurnResult {
  const access = resolveTalkHandoffAccess(id, token);
  if (!access.ok) {
    return access;
  }

  const { record } = access;
  const { room } = record;
  if (opts.clientId) {
    room.activeClientId = opts.clientId;
  }

  const turnId = normalizeOptionalString(opts.turnId) ?? randomUUID();
  const started = room.talk.startTurn({
    turnId,
    payload: { handoffId: id, roomId: record.roomId, clientId: room.activeClientId },
  });

  const publicRecord = toPublicTalkHandoffRecord(record);
  const events = started.event ? [started.event] : [];
  return { ok: true, record: publicRecord, turnId, events };
}
/**
 * Ends a turn in a handoff room after checking id, expiry and token.
 *
 * The optional `opts.turnId` is trimmed and handed to the room's controller;
 * a failure reported by the controller is returned unchanged.
 *
 * @returns The access or controller failure, or the public record, the ended
 *          turn id and the single event emitted for it.
 */
export function endTalkHandoffTurn(
  id: string,
  token: string,
  opts: { turnId?: string } = {},
): TalkHandoffTurnResult {
  const access = resolveTalkHandoffAccess(id, token);
  if (!access.ok) {
    return access;
  }

  const { record } = access;
  const ended = record.room.talk.endTurn({
    turnId: normalizeOptionalString(opts.turnId),
    payload: { handoffId: id, roomId: record.roomId },
  });
  if (!ended.ok) {
    return ended;
  }

  const publicRecord = toPublicTalkHandoffRecord(record);
  return { ok: true, record: publicRecord, turnId: ended.turnId, events: [ended.event] };
}
/**
 * Cancels a turn in a handoff room after checking id, expiry and token.
 *
 * The optional `opts.turnId` is trimmed and handed to the room's controller.
 * The event payload carries `opts.reason`, or "client-cancelled" when no
 * reason is given. A failure reported by the controller is returned unchanged.
 *
 * @returns The access or controller failure, or the public record, the
 *          cancelled turn id and the single event emitted for it.
 */
export function cancelTalkHandoffTurn(
  id: string,
  token: string,
  opts: { reason?: string; turnId?: string } = {},
): TalkHandoffTurnResult {
  const access = resolveTalkHandoffAccess(id, token);
  if (!access.ok) {
    return access;
  }

  const { record } = access;
  const reason = opts.reason ?? "client-cancelled";
  const cancelled = record.room.talk.cancelTurn({
    turnId: normalizeOptionalString(opts.turnId),
    payload: { handoffId: id, roomId: record.roomId, reason },
  });
  if (!cancelled.ok) {
    return cancelled;
  }

  const publicRecord = toPublicTalkHandoffRecord(record);
  return { ok: true, record: publicRecord, turnId: cancelled.turnId, events: [cancelled.event] };
}
/**
 * Removes a handoff from the store by id. No token is required.
 *
 * Expired handoffs are pruned first. When the id is still present, a final
 * "session.closed" event with reason "revoked" is emitted on its room before
 * the record is deleted.
 *
 * @returns `revoked: false` with no events when the id is unknown; otherwise
 *          the room id, the room's active client and the closing event.
 */
export function revokeTalkHandoff(id: string): TalkHandoffRevokeResult {
  pruneExpiredTalkHandoffs();

  const record = handoffs.get(id);
  if (record === undefined) {
    return { revoked: false, events: [] };
  }

  const { roomId } = record;
  const closed = appendTalkHandoffRoomEvent(record, {
    type: "session.closed",
    payload: { reason: "revoked", handoffId: id, roomId },
    final: true,
  });
  handoffs.delete(id);

  return {
    revoked: true,
    roomId,
    activeClientId: record.room.activeClientId,
    events: [closed],
  };
}
/**
 * Checks whether `token` is the bearer token that belongs to `record`.
 *
 * The candidate token is hashed and compared against the stored hash with
 * `crypto.timingSafeEqual`, so the comparison time does not depend on how many
 * leading characters of the two digests agree.
 *
 * @param record Stored handoff record holding the expected token hash.
 * @param token Plaintext bearer token presented by the caller.
 * @returns `true` only when the hash of `token` equals `record.tokenHash`.
 */
export function verifyTalkHandoffToken(record: TalkHandoffRecord, token: string): boolean {
  const expected = Buffer.from(record.tokenHash, "utf8");
  const presented = Buffer.from(hashTalkHandoffToken(token), "utf8");
  // timingSafeEqual throws on length mismatch, so reject differing lengths up front.
  return expected.length === presented.length && timingSafeEqual(expected, presented);
}
/**
 * Test helper: empties the in-memory handoff store.
 * Records are dropped directly; no room events are emitted for them.
 */
export function clearTalkHandoffsForTest(): void {
handoffs.clear();
}
/**
 * Turns a caller-supplied TTL into a usable lifetime in milliseconds.
 *
 * Missing or non-finite values (NaN, ±Infinity) fall back to the default TTL.
 * Finite values are truncated toward zero and clamped to the range
 * [1000, MAX_TALK_HANDOFF_TTL_MS].
 */
function normalizeTtlMs(value: number | undefined): number {
  if (value === undefined || !Number.isFinite(value)) {
    return DEFAULT_TALK_HANDOFF_TTL_MS;
  }
  const wholeMs = Math.trunc(value);
  const atLeastOneSecond = Math.max(wholeMs, 1000);
  return Math.min(atLeastOneSecond, MAX_TALK_HANDOFF_TTL_MS);
}
/**
 * Removes every handoff whose `expiresAt` is at or before `now`.
 * Each removed room first receives a final "session.closed" event with reason
 * "expired".
 *
 * @param now Reference time in epoch milliseconds; defaults to the current time.
 */
function pruneExpiredTalkHandoffs(now = Date.now()): void {
  // Deleting the entry currently being visited is safe during Map iteration.
  handoffs.forEach((record, id) => {
    if (record.expiresAt > now) {
      return;
    }
    appendTalkHandoffRoomEvent(record, {
      type: "session.closed",
      payload: { reason: "expired", handoffId: id, roomId: record.roomId },
      final: true,
    });
    handoffs.delete(id);
  });
}
/** Returns the SHA-256 digest of `token`, encoded as base64url. */
function hashTalkHandoffToken(token: string): string {
  const sha256 = createHash("sha256");
  sha256.update(token);
  return sha256.digest("base64url");
}
/**
 * Builds the caller-facing view of a stored record: the token hash is dropped
 * and the live room state is replaced by a plain snapshot (active client,
 * active turn and a copy of the controller's recent events).
 */
function toPublicTalkHandoffRecord(record: TalkHandoffRecord): TalkHandoffPublicRecord {
  const { tokenHash: _omittedHash, room: liveRoom, ...rest } = record;
  const roomSnapshot = {
    activeClientId: liveRoom.activeClientId,
    activeTurnId: liveRoom.talk.activeTurnId,
    // Copy so later controller activity does not mutate what the caller holds.
    recentTalkEvents: [...liveRoom.talk.recentEvents],
  };
  return { ...rest, room: roomSnapshot };
}
/**
 * Creates the initial state for a handoff room: a new session controller keyed
 * by the room id, with no active client assigned yet.
 */
function createTalkHandoffRoom(params: {
  roomId: string;
  mode: TalkMode;
  transport: TalkTransport;
  brain: TalkBrain;
  provider?: string;
}): TalkHandoffRoomState {
  const { roomId, mode, transport, brain, provider } = params;
  const talk = createTalkSessionController({
    sessionId: roomId,
    mode,
    transport,
    brain,
    provider,
  });
  return { talk };
}
/**
 * Shared gate for token-protected operations.
 *
 * Checks, in order: the id exists, the handoff has not expired, and the token
 * matches. An expired handoff is closed (final "session.closed" event with
 * reason "expired") and removed from the store as a side effect.
 *
 * @returns The stored record on success, otherwise the reason access was denied.
 */
function resolveTalkHandoffAccess(
  id: string,
  token: string,
):
  | { ok: true; record: TalkHandoffRecord }
  | { ok: false; reason: "not_found" | "expired" | "invalid_token" } {
  const record = handoffs.get(id);
  if (record === undefined) {
    return { ok: false, reason: "not_found" };
  }

  const hasExpired = record.expiresAt <= Date.now();
  if (hasExpired) {
    appendTalkHandoffRoomEvent(record, {
      type: "session.closed",
      payload: { reason: "expired", handoffId: id, roomId: record.roomId },
      final: true,
    });
    handoffs.delete(id);
    return { ok: false, reason: "expired" };
  }

  return verifyTalkHandoffToken(record, token)
    ? { ok: true, record }
    : { ok: false, reason: "invalid_token" };
}
/**
 * Emits `input` through the room's session controller and returns the event
 * the controller produced for it.
 */
function appendTalkHandoffRoomEvent(record: TalkHandoffRecord, input: TalkEventInput): TalkEvent {
return record.room.talk.emit(input);
}
/**
 * Makes `clientId` the active client of the record's room.
 *
 * When a different client was active, a "session.replaced" event is emitted
 * first. A "session.ready" event is always emitted last.
 *
 * @returns The emitted events in emission order.
 */
function joinTalkHandoffRoom(record: TalkHandoffRecord, clientId: string | undefined): TalkEvent[] {
  const { room, id: handoffId, roomId } = record;
  const previousClientId = room.activeClientId;
  const emitted: TalkEvent[] = [];

  if (previousClientId && previousClientId !== clientId) {
    const replaced = appendTalkHandoffRoomEvent(record, {
      type: "session.replaced",
      payload: { handoffId, roomId, previousClientId, nextClientId: clientId },
    });
    emitted.push(replaced);
  }

  room.activeClientId = clientId;
  const ready = appendTalkHandoffRoomEvent(record, {
    type: "session.ready",
    payload: { handoffId, roomId, clientId },
  });
  emitted.push(ready);

  return emitted;
}
/**
 * Trims an optional string.
 *
 * @returns The trimmed text, or `undefined` when the input is missing or
 * contains only whitespace.
 */
function normalizeOptionalString(value: string | undefined): string | undefined {
  if (value == null) {
    return undefined;
  }
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : undefined;
}

View File

@@ -3,8 +3,10 @@ import type { RealtimeVoiceProviderPlugin } from "../plugins/types.js";
import type { RealtimeVoiceBridgeCreateRequest } from "../realtime-voice/provider-types.js";
import {
acknowledgeTalkRealtimeRelayMark,
cancelTalkRealtimeRelayTurn,
clearTalkRealtimeRelaySessionsForTest,
createTalkRealtimeRelaySession,
registerTalkRealtimeRelayAgentRun,
sendTalkRealtimeRelayAudio,
stopTalkRealtimeRelaySession,
submitTalkRealtimeRelayToolResult,
@@ -24,6 +26,7 @@ describe("talk realtime gateway relay", () => {
bridgeRequest?.onAudio(Buffer.from("audio-out"));
bridgeRequest?.onMark?.("mark-1");
bridgeRequest?.onTranscript?.("user", "hello", true);
bridgeRequest?.onTranscript?.("assistant", "hi there", true);
bridgeRequest?.onToolCall?.({
itemId: "item-1",
callId: "call-1",
@@ -35,6 +38,7 @@ describe("talk realtime gateway relay", () => {
setMediaTimestamp: vi.fn(),
sendUserMessage: vi.fn(),
triggerGreeting: vi.fn(),
handleBargeIn: vi.fn(),
submitToolResult: vi.fn(),
acknowledgeMark: vi.fn(),
close: vi.fn(),
@@ -90,36 +94,74 @@ describe("talk realtime gateway relay", () => {
expect.objectContaining({
event: "talk.realtime.relay",
connIds: ["conn-1"],
payload: { relaySessionId: session.relaySessionId, type: "ready" },
payload: expect.objectContaining({
relaySessionId: session.relaySessionId,
type: "ready",
talkEvent: expect.objectContaining({
sessionId: session.relaySessionId,
type: "session.ready",
seq: 1,
mode: "realtime",
transport: "gateway-relay",
brain: "agent-consult",
provider: "relay-test",
}),
}),
}),
expect.objectContaining({
payload: {
payload: expect.objectContaining({
relaySessionId: session.relaySessionId,
type: "audio",
audioBase64: Buffer.from("audio-out").toString("base64"),
},
talkEvent: expect.objectContaining({ type: "output.audio.delta" }),
}),
}),
expect.objectContaining({
payload: { relaySessionId: session.relaySessionId, type: "mark", markName: "mark-1" },
payload: expect.objectContaining({
relaySessionId: session.relaySessionId,
type: "mark",
markName: "mark-1",
talkEvent: expect.objectContaining({ type: "output.audio.done", final: true }),
}),
}),
expect.objectContaining({
payload: {
payload: expect.objectContaining({
relaySessionId: session.relaySessionId,
type: "transcript",
role: "user",
text: "hello",
final: true,
},
talkEvent: expect.objectContaining({ type: "transcript.done", final: true }),
}),
}),
expect.objectContaining({
payload: {
payload: expect.objectContaining({
relaySessionId: session.relaySessionId,
type: "transcript",
role: "assistant",
text: "hi there",
final: true,
talkEvent: expect.objectContaining({
type: "output.text.done",
final: true,
payload: { text: "hi there" },
}),
}),
}),
expect.objectContaining({
payload: expect.objectContaining({
relaySessionId: session.relaySessionId,
type: "toolCall",
itemId: "item-1",
callId: "call-1",
name: "openclaw_agent_consult",
args: { question: "what now" },
},
talkEvent: expect.objectContaining({
type: "tool.call",
itemId: "item-1",
callId: "call-1",
}),
}),
}),
]),
);
@@ -137,13 +179,66 @@ describe("talk realtime gateway relay", () => {
callId: "call-1",
result: { ok: true },
});
cancelTalkRealtimeRelayTurn({
relaySessionId: session.relaySessionId,
connId: "conn-1",
reason: "barge-in",
});
stopTalkRealtimeRelaySession({ relaySessionId: session.relaySessionId, connId: "conn-1" });
expect(bridge.sendAudio).toHaveBeenCalledWith(Buffer.from("audio-in"));
expect(bridge.setMediaTimestamp).toHaveBeenCalledWith(123);
expect(bridge.acknowledgeMark).toHaveBeenCalled();
expect(bridge.submitToolResult).toHaveBeenCalledWith("call-1", { ok: true }, undefined);
expect(bridge.handleBargeIn).toHaveBeenCalledWith({ audioPlaybackActive: true });
expect(bridge.close).toHaveBeenCalled();
expect(events).toEqual(
expect.arrayContaining([
expect.objectContaining({
payload: expect.objectContaining({
relaySessionId: session.relaySessionId,
type: "inputAudio",
byteLength: Buffer.from("audio-in").byteLength,
talkEvent: expect.objectContaining({ type: "input.audio.delta" }),
}),
}),
expect.objectContaining({
payload: expect.objectContaining({
relaySessionId: session.relaySessionId,
type: "clear",
talkEvent: expect.objectContaining({
type: "turn.cancelled",
payload: { reason: "barge-in" },
final: true,
}),
}),
}),
expect.objectContaining({
payload: expect.objectContaining({
relaySessionId: session.relaySessionId,
type: "toolResult",
callId: "call-1",
talkEvent: expect.objectContaining({
type: "tool.result",
callId: "call-1",
final: true,
}),
}),
}),
]),
);
expect(events).toEqual(
expect.arrayContaining([
expect.objectContaining({
payload: expect.objectContaining({
relaySessionId: session.relaySessionId,
type: "close",
reason: "completed",
talkEvent: expect.objectContaining({ type: "session.closed", final: true }),
}),
}),
]),
);
});
it("rejects relay control from a different connection", () => {
@@ -155,6 +250,7 @@ describe("talk realtime gateway relay", () => {
connect: vi.fn(async () => undefined),
sendAudio: vi.fn(),
setMediaTimestamp: vi.fn(),
handleBargeIn: vi.fn(),
submitToolResult: vi.fn(),
acknowledgeMark: vi.fn(),
close: vi.fn(),
@@ -179,6 +275,303 @@ describe("talk realtime gateway relay", () => {
).toThrow("Unknown realtime relay session");
});
it("correlates output audio with the active relay turn", () => {
  type RelayBroadcastPayload = { talkEvent?: { type?: string; turnId?: string } };

  // Every broadcast sent to the owning connection is recorded here.
  const broadcasts: Array<{ event: string; payload: RelayBroadcastPayload }> = [];
  const context = {
    broadcastToConnIds: (event: string, payload: RelayBroadcastPayload) => {
      broadcasts.push({ event, payload });
    },
  } as never;

  // The provider captures the bridge request so the test can drive provider callbacks.
  let bridgeRequest: RealtimeVoiceBridgeCreateRequest | undefined;
  const provider: RealtimeVoiceProviderPlugin = {
    id: "relay-test",
    label: "Relay Test",
    isConfigured: () => true,
    createBridge: (req) => {
      bridgeRequest = req;
      return {
        connect: vi.fn(async () => undefined),
        sendAudio: vi.fn(),
        setMediaTimestamp: vi.fn(),
        handleBargeIn: vi.fn(),
        submitToolResult: vi.fn(),
        acknowledgeMark: vi.fn(),
        close: vi.fn(),
        isConnected: vi.fn(() => true),
      };
    },
  };

  const session = createTalkRealtimeRelaySession({
    context,
    connId: "conn-1",
    provider,
    providerConfig: {},
    instructions: "brief",
    tools: [],
  });

  // Input audio goes first, then the provider replies with output audio.
  sendTalkRealtimeRelayAudio({
    relaySessionId: session.relaySessionId,
    connId: "conn-1",
    audioBase64: Buffer.from("audio").toString("base64"),
  });
  bridgeRequest?.onAudio(Buffer.from("reply"));

  const sawCorrelatedOutput = broadcasts.some(
    ({ payload }) =>
      payload.talkEvent?.type === "output.audio.delta" && payload.talkEvent.turnId === "turn-1",
  );
  expect(sawCorrelatedOutput).toBe(true);
});
it("aborts linked agent consult runs when the relay turn is cancelled", () => {
  // Gateway-side spies and chat-run bookkeeping for a single in-flight run ("run-1").
  const broadcast = vi.fn();
  const nodeSendToSession = vi.fn();
  const removeChatRun = vi.fn(() => ({ sessionKey: "main", clientRunId: "run-1" }));
  const abortController = new AbortController();
  const chatAbortControllers = new Map([
    [
      "run-1",
      {
        controller: abortController,
        sessionId: "run-1",
        sessionKey: "main",
        startedAtMs: 1,
        expiresAtMs: Date.now() + 60_000,
      },
    ],
  ]);
  const chatRunBuffers = new Map([["run-1", "partial answer"]]);
  const context = {
    broadcastToConnIds: vi.fn(),
    broadcast,
    nodeSendToSession,
    chatAbortControllers,
    chatRunBuffers,
    chatDeltaSentAt: new Map(),
    chatDeltaLastBroadcastLen: new Map(),
    chatAbortedRuns: new Map(),
    removeChatRun,
    agentRunSeq: new Map(),
  } as never;

  const provider: RealtimeVoiceProviderPlugin = {
    id: "relay-test",
    label: "Relay Test",
    isConfigured: () => true,
    createBridge: () => ({
      connect: vi.fn(async () => undefined),
      sendAudio: vi.fn(),
      setMediaTimestamp: vi.fn(),
      handleBargeIn: vi.fn(),
      submitToolResult: vi.fn(),
      acknowledgeMark: vi.fn(),
      close: vi.fn(),
      isConnected: vi.fn(() => true),
    }),
  };

  const { relaySessionId } = createTalkRealtimeRelaySession({
    context,
    connId: "conn-1",
    provider,
    providerConfig: {},
    instructions: "brief",
    tools: [],
  });
  registerTalkRealtimeRelayAgentRun({
    relaySessionId,
    connId: "conn-1",
    sessionKey: "main",
    runId: "run-1",
  });

  cancelTalkRealtimeRelayTurn({
    relaySessionId,
    connId: "conn-1",
    reason: "barge-in",
  });

  // The linked run is aborted with the cancel reason and announced on both channels.
  expect(abortController.signal.aborted).toBe(true);
  expect(removeChatRun).toHaveBeenCalledWith("run-1", "run-1", "main");
  expect(broadcast).toHaveBeenCalledWith(
    "chat",
    expect.objectContaining({
      runId: "run-1",
      sessionKey: "main",
      state: "aborted",
      stopReason: "barge-in",
    }),
  );
  expect(nodeSendToSession).toHaveBeenCalledWith(
    "main",
    "chat",
    expect.objectContaining({ runId: "run-1", state: "aborted" }),
  );
});
it("aborts linked agent consult runs when the relay session closes", () => {
  // Gateway-side spies and chat-run bookkeeping for a single in-flight run ("run-1").
  const broadcast = vi.fn();
  const nodeSendToSession = vi.fn();
  const removeChatRun = vi.fn(() => ({ sessionKey: "main", clientRunId: "run-1" }));
  const abortController = new AbortController();
  const chatAbortControllers = new Map([
    [
      "run-1",
      {
        controller: abortController,
        sessionId: "run-1",
        sessionKey: "main",
        startedAtMs: 1,
        expiresAtMs: Date.now() + 60_000,
      },
    ],
  ]);
  const chatRunBuffers = new Map([["run-1", "partial answer"]]);
  const context = {
    broadcastToConnIds: vi.fn(),
    broadcast,
    nodeSendToSession,
    chatAbortControllers,
    chatRunBuffers,
    chatDeltaSentAt: new Map(),
    chatDeltaLastBroadcastLen: new Map(),
    chatAbortedRuns: new Map(),
    removeChatRun,
    agentRunSeq: new Map(),
  } as never;

  const provider: RealtimeVoiceProviderPlugin = {
    id: "relay-test",
    label: "Relay Test",
    isConfigured: () => true,
    createBridge: () => ({
      connect: vi.fn(async () => undefined),
      sendAudio: vi.fn(),
      setMediaTimestamp: vi.fn(),
      handleBargeIn: vi.fn(),
      submitToolResult: vi.fn(),
      acknowledgeMark: vi.fn(),
      close: vi.fn(),
      isConnected: vi.fn(() => true),
    }),
  };

  const { relaySessionId } = createTalkRealtimeRelaySession({
    context,
    connId: "conn-1",
    provider,
    providerConfig: {},
    instructions: "brief",
    tools: [],
  });
  registerTalkRealtimeRelayAgentRun({
    relaySessionId,
    connId: "conn-1",
    sessionKey: "main",
    runId: "run-1",
  });

  // An explicit stop from the owning connection closes the relay.
  stopTalkRealtimeRelaySession({ relaySessionId, connId: "conn-1" });

  expect(abortController.signal.aborted).toBe(true);
  expect(broadcast).toHaveBeenCalledWith(
    "chat",
    expect.objectContaining({
      runId: "run-1",
      sessionKey: "main",
      state: "aborted",
      stopReason: "relay-closed",
    }),
  );
  expect(nodeSendToSession).toHaveBeenCalledWith(
    "main",
    "chat",
    expect.objectContaining({ runId: "run-1", state: "aborted" }),
  );
});
it("aborts linked agent consult runs when the provider closes the relay", () => {
  // Gateway-side spies and chat-run bookkeeping for a single in-flight run ("run-1").
  const broadcast = vi.fn();
  const nodeSendToSession = vi.fn();
  const removeChatRun = vi.fn(() => ({ sessionKey: "main", clientRunId: "run-1" }));
  const abortController = new AbortController();
  const chatAbortControllers = new Map([
    [
      "run-1",
      {
        controller: abortController,
        sessionId: "run-1",
        sessionKey: "main",
        startedAtMs: 1,
        expiresAtMs: Date.now() + 60_000,
      },
    ],
  ]);
  const chatRunBuffers = new Map([["run-1", "partial answer"]]);
  const context = {
    broadcastToConnIds: vi.fn(),
    broadcast,
    nodeSendToSession,
    chatAbortControllers,
    chatRunBuffers,
    chatDeltaSentAt: new Map(),
    chatDeltaLastBroadcastLen: new Map(),
    chatAbortedRuns: new Map(),
    removeChatRun,
    agentRunSeq: new Map(),
  } as never;

  // The provider captures the bridge request so the test can fire its close callback.
  let bridgeRequest: RealtimeVoiceBridgeCreateRequest | undefined;
  const provider: RealtimeVoiceProviderPlugin = {
    id: "relay-test",
    label: "Relay Test",
    isConfigured: () => true,
    createBridge: (req) => {
      bridgeRequest = req;
      return {
        connect: vi.fn(async () => undefined),
        sendAudio: vi.fn(),
        setMediaTimestamp: vi.fn(),
        handleBargeIn: vi.fn(),
        submitToolResult: vi.fn(),
        acknowledgeMark: vi.fn(),
        close: vi.fn(),
        isConnected: vi.fn(() => true),
      };
    },
  };

  const { relaySessionId } = createTalkRealtimeRelaySession({
    context,
    connId: "conn-1",
    provider,
    providerConfig: {},
    instructions: "brief",
    tools: [],
  });
  registerTalkRealtimeRelayAgentRun({
    relaySessionId,
    connId: "conn-1",
    sessionKey: "main",
    runId: "run-1",
  });

  // Provider-initiated close with an error reason; the run is still stopped as "relay-closed".
  bridgeRequest?.onClose?.("error");

  expect(abortController.signal.aborted).toBe(true);
  expect(broadcast).toHaveBeenCalledWith(
    "chat",
    expect.objectContaining({
      runId: "run-1",
      sessionKey: "main",
      state: "aborted",
      stopReason: "relay-closed",
    }),
  );
  expect(nodeSendToSession).toHaveBeenCalledWith(
    "main",
    "chat",
    expect.objectContaining({ runId: "run-1", state: "aborted" }),
  );
});
it("caps active relay sessions per browser connection", () => {
const provider: RealtimeVoiceProviderPlugin = {
id: "relay-test",
@@ -188,6 +581,7 @@ describe("talk realtime gateway relay", () => {
connect: vi.fn(async () => undefined),
sendAudio: vi.fn(),
setMediaTimestamp: vi.fn(),
handleBargeIn: vi.fn(),
submitToolResult: vi.fn(),
acknowledgeMark: vi.fn(),
close: vi.fn(),

View File

@@ -10,6 +10,13 @@ import {
createRealtimeVoiceBridgeSession,
type RealtimeVoiceBridgeSession,
} from "../realtime-voice/session-runtime.js";
import {
type TalkEvent,
type TalkEventInput,
type TalkSessionController,
createTalkSessionController,
} from "../realtime-voice/talk-session-controller.js";
import { abortChatRunById } from "./chat-abort.js";
import type { GatewayRequestContext } from "./server-methods/shared-types.js";
const RELAY_SESSION_TTL_MS = 30 * 60 * 1000;
@@ -18,8 +25,9 @@ const MAX_RELAY_SESSIONS_PER_CONN = 2;
const MAX_RELAY_SESSIONS_GLOBAL = 64;
const RELAY_EVENT = "talk.realtime.relay";
type TalkRealtimeRelayEvent =
type TalkRealtimeRelayEventPayload =
| { relaySessionId: string; type: "ready" }
| { relaySessionId: string; type: "inputAudio"; byteLength: number }
| { relaySessionId: string; type: "audio"; audioBase64: string }
| { relaySessionId: string; type: "clear" }
| { relaySessionId: string; type: "mark"; markName: string }
@@ -38,16 +46,21 @@ type TalkRealtimeRelayEvent =
name: string;
args: unknown;
}
| { relaySessionId: string; type: "toolResult"; callId: string }
| { relaySessionId: string; type: "error"; message: string }
| { relaySessionId: string; type: "close"; reason: "completed" | "error" };
type TalkRealtimeRelayEvent = TalkRealtimeRelayEventPayload & { talkEvent?: TalkEvent };
type RelaySession = {
id: string;
connId: string;
context: GatewayRequestContext;
bridge: RealtimeVoiceBridgeSession;
talk: TalkSessionController;
expiresAtMs: number;
cleanupTimer: ReturnType<typeof setTimeout>;
activeAgentRuns: Map<string, string>;
};
type CreateTalkRealtimeRelaySessionParams = {
@@ -85,14 +98,31 @@ function broadcastToOwner(
context.broadcastToConnIds(RELAY_EVENT, event, new Set([connId]), { dropIfSlow: true });
}
/**
 * Aborts every agent run linked to the relay session, passing `reason` as the
 * stop reason, then forgets all of them.
 */
function abortRelayAgentRuns(session: RelaySession, reason: string): void {
  const { context, activeAgentRuns } = session;
  // The map is keyed by run id; the value is the chat session key the run belongs to.
  activeAgentRuns.forEach((sessionKey, runId) => {
    abortChatRunById(context, { runId, sessionKey, stopReason: reason });
  });
  activeAgentRuns.clear();
}
/**
 * Tears down a relay session and tells the owning connection it has closed.
 *
 * The session is removed from the registry and its cleanup timer is cleared
 * before anything else runs. Linked agent runs are then aborted with stop
 * reason "relay-error" when `reason` is "error" and "relay-closed" otherwise.
 * Finally the provider bridge is closed and a `close` relay event carrying a
 * final `session.closed` Talk event is broadcast to the owner.
 *
 * The teardown steps are chained with `finally` so that a failure while
 * aborting runs cannot leave the provider bridge open, and a failure while
 * closing the bridge cannot suppress the close notification. Any error still
 * propagates to the caller afterwards.
 */
function closeRelaySession(session: RelaySession, reason: "completed" | "error"): void {
  relaySessions.delete(session.id);
  clearTimeout(session.cleanupTimer);
  try {
    abortRelayAgentRuns(session, reason === "error" ? "relay-error" : "relay-closed");
  } finally {
    try {
      session.bridge.close();
    } finally {
      broadcastToOwner(session.context, session.connId, {
        relaySessionId: session.id,
        type: "close",
        reason,
        talkEvent: session.talk.emit({
          type: "session.closed",
          payload: { reason },
          final: true,
        }),
      });
    }
  }
}
@@ -130,9 +160,19 @@ export function createTalkRealtimeRelaySession(
enforceRelaySessionLimits(params.connId);
const relaySessionId = randomUUID();
const expiresAtMs = Date.now() + RELAY_SESSION_TTL_MS;
const talk = createTalkSessionController({
sessionId: relaySessionId,
mode: "realtime",
transport: "gateway-relay",
brain: "agent-consult",
provider: params.provider.id,
});
let relay: RelaySession | undefined;
const emit = (event: TalkRealtimeRelayEvent) =>
broadcastToOwner(params.context, params.connId, event);
const emit = (event: TalkRealtimeRelayEventPayload, talkEvent?: TalkEventInput) =>
broadcastToOwner(params.context, params.connId, {
...event,
...(talkEvent ? { talkEvent: talk.emit(talkEvent) } : {}),
});
const bridge = createRealtimeVoiceBridgeSession({
provider: params.provider,
providerConfig: params.providerConfig,
@@ -142,30 +182,94 @@ export function createTalkRealtimeRelaySession(
markStrategy: "transport",
audioSink: {
isOpen: () => Boolean(relay && relaySessions.has(relay.id)),
sendAudio: (audio) =>
emit({
relaySessionId,
type: "audio",
audioBase64: audio.toString("base64"),
}),
clearAudio: () => emit({ relaySessionId, type: "clear" }),
sendMark: (markName) => emit({ relaySessionId, type: "mark", markName }),
sendAudio: (audio) => {
const turnId = relay ? ensureRelayTurn(relay) : undefined;
emit(
{
relaySessionId,
type: "audio",
audioBase64: audio.toString("base64"),
},
{
type: "output.audio.delta",
turnId,
payload: { byteLength: audio.length },
},
);
},
clearAudio: () => {
const turnId = relay ? ensureRelayTurn(relay) : undefined;
emit(
{ relaySessionId, type: "clear" },
{
type: "output.audio.done",
turnId,
payload: { reason: "clear" },
final: true,
},
);
},
sendMark: (markName) => {
const turnId = relay ? ensureRelayTurn(relay) : undefined;
emit(
{ relaySessionId, type: "mark", markName },
{
type: "output.audio.done",
turnId,
payload: { markName },
final: true,
},
);
},
},
onTranscript: (role, text, final) => {
emit({ relaySessionId, type: "transcript", role, text, final });
const turnId = relay ? ensureRelayTurn(relay) : undefined;
const eventType =
role === "assistant"
? final
? "output.text.done"
: "output.text.delta"
: final
? "transcript.done"
: "transcript.delta";
const payload = role === "assistant" ? { text } : { role, text };
emit(
{ relaySessionId, type: "transcript", role, text, final },
{
type: eventType,
turnId,
payload,
final,
},
);
},
onToolCall: (toolCall) => {
emit({
relaySessionId,
type: "toolCall",
itemId: toolCall.itemId,
callId: toolCall.callId,
name: toolCall.name,
args: toolCall.args,
});
const turnId = relay ? ensureRelayTurn(relay) : undefined;
emit(
{
relaySessionId,
type: "toolCall",
itemId: toolCall.itemId,
callId: toolCall.callId,
name: toolCall.name,
args: toolCall.args,
},
{
type: "tool.call",
itemId: toolCall.itemId,
callId: toolCall.callId,
turnId,
payload: { name: toolCall.name, args: toolCall.args },
},
);
},
onReady: () => emit({ relaySessionId, type: "ready" }),
onError: (error) => emit({ relaySessionId, type: "error", message: error.message }),
onReady: () =>
emit({ relaySessionId, type: "ready" }, { type: "session.ready", payload: null }),
onError: (error) =>
emit(
{ relaySessionId, type: "error", message: error.message },
{ type: "session.error", payload: { message: error.message }, final: true },
),
onClose: (reason) => {
const active = relaySessions.get(relaySessionId);
if (!active) {
@@ -173,7 +277,11 @@ export function createTalkRealtimeRelaySession(
}
relaySessions.delete(relaySessionId);
clearTimeout(active.cleanupTimer);
emit({ relaySessionId, type: "close", reason });
abortRelayAgentRuns(active, "relay-closed");
emit(
{ relaySessionId, type: "close", reason },
{ type: "session.closed", payload: { reason }, final: true },
);
},
});
relay = {
@@ -181,6 +289,7 @@ export function createTalkRealtimeRelaySession(
connId: params.connId,
context: params.context,
bridge,
talk,
expiresAtMs,
cleanupTimer: setTimeout(() => {
const active = relaySessions.get(relaySessionId);
@@ -188,6 +297,7 @@ export function createTalkRealtimeRelaySession(
closeRelaySession(active, "completed");
}
}, RELAY_SESSION_TTL_MS),
activeAgentRuns: new Map(),
};
relay.cleanupTimer.unref?.();
relaySessions.set(relaySessionId, relay);
@@ -215,6 +325,19 @@ export function createTalkRealtimeRelaySession(
};
}
/**
 * Returns the id of the relay session's current turn, as reported by the Talk
 * controller's `ensureTurn()`.
 *
 * When `ensureTurn()` also yields an event, that event is forwarded to the
 * owning connection on a zero-length `inputAudio` relay payload.
 */
function ensureRelayTurn(session: RelaySession): string {
  const { turnId, event } = session.talk.ensureTurn();
  if (!event) {
    return turnId;
  }
  broadcastToOwner(session.context, session.connId, {
    relaySessionId: session.id,
    type: "inputAudio",
    byteLength: 0,
    talkEvent: event,
  });
  return turnId;
}
function getRelaySession(relaySessionId: string, connId: string): RelaySession {
const session = relaySessions.get(relaySessionId);
if (!session || session.connId !== connId || Date.now() > session.expiresAtMs) {
@@ -236,8 +359,19 @@ export function sendTalkRealtimeRelayAudio(params: {
throw new Error("Realtime relay audio frame is too large");
}
const session = getRelaySession(params.relaySessionId, params.connId);
const turnId = ensureRelayTurn(session);
const audio = Buffer.from(params.audioBase64, "base64");
session.bridge.sendAudio(audio);
broadcastToOwner(session.context, session.connId, {
relaySessionId: session.id,
type: "inputAudio",
byteLength: audio.byteLength,
talkEvent: session.talk.emit({
type: "input.audio.delta",
turnId,
payload: { byteLength: audio.byteLength },
}),
});
if (typeof params.timestamp === "number" && Number.isFinite(params.timestamp)) {
session.bridge.setMediaTimestamp(params.timestamp);
}
@@ -256,10 +390,52 @@ export function submitTalkRealtimeRelayToolResult(params: {
callId: string;
result: unknown;
}): void {
getRelaySession(params.relaySessionId, params.connId).bridge.submitToolResult(
params.callId,
params.result,
);
const session = getRelaySession(params.relaySessionId, params.connId);
session.bridge.submitToolResult(params.callId, params.result);
const turnId = ensureRelayTurn(session);
broadcastToOwner(session.context, session.connId, {
relaySessionId: session.id,
type: "toolResult",
callId: params.callId,
talkEvent: session.talk.emit({
type: "tool.result",
callId: params.callId,
turnId,
payload: { result: params.result },
final: true,
}),
});
}
/**
 * Links an agent run to a relay session so the run is aborted when the relay
 * turn is cancelled or the relay closes.
 *
 * The session is resolved through `getRelaySession`, which rejects unknown
 * sessions and sessions owned by another connection.
 */
export function registerTalkRealtimeRelayAgentRun(params: {
  relaySessionId: string;
  connId: string;
  sessionKey: string;
  runId: string;
}): void {
  const { relaySessionId, connId, sessionKey, runId } = params;
  const { activeAgentRuns } = getRelaySession(relaySessionId, connId);
  activeAgentRuns.set(runId, sessionKey);
}
/**
 * Cancels the relay session's current turn on behalf of the owning connection.
 *
 * In order: resolves the turn id, signals barge-in to the provider bridge,
 * aborts linked agent runs with the cancel reason, cancels the turn on the
 * Talk controller, and broadcasts a `clear` relay event to the owner. The
 * broadcast carries the cancellation Talk event only when the controller
 * reports the cancellation as successful.
 *
 * @param params.reason - Stop reason; defaults to "client-cancelled".
 */
export function cancelTalkRealtimeRelayTurn(params: {
  relaySessionId: string;
  connId: string;
  reason?: string;
}): void {
  const session = getRelaySession(params.relaySessionId, params.connId);
  const { bridge, talk, context, connId } = session;
  const turnId = ensureRelayTurn(session);
  const reason = params.reason ?? "client-cancelled";

  bridge.handleBargeIn({ audioPlaybackActive: true });
  abortRelayAgentRuns(session, reason);

  const outcome = talk.cancelTurn({ turnId, payload: { reason } });
  const talkEvent = outcome.ok ? outcome.event : undefined;
  broadcastToOwner(context, connId, {
    relaySessionId: session.id,
    type: "clear",
    talkEvent,
  });
}
export function stopTalkRealtimeRelaySession(params: {

View File

@@ -0,0 +1,52 @@
/**
 * Describes the backing session behind a unified Talk session id.
 *
 * The two relay kinds are bound to a gateway connection id; a managed room is
 * identified by its handoff id, token and room id instead.
 */
export type UnifiedTalkSessionRecord =
  | {
      kind: "realtime-relay";
      connId: string;
      relaySessionId: string;
    }
  | {
      kind: "transcription-relay";
      connId: string;
      transcriptionSessionId: string;
    }
  | {
      kind: "managed-room";
      handoffId: string;
      token: string;
      roomId: string;
    };

/** In-memory registry of unified Talk sessions, keyed by session id. */
const unifiedTalkSessions = new Map<string, UnifiedTalkSessionRecord>();

/** Stores (or overwrites) the record registered under `sessionId`. */
export function rememberUnifiedTalkSession(
  sessionId: string,
  session: UnifiedTalkSessionRecord,
): void {
  unifiedTalkSessions.set(sessionId, session);
}

/**
 * Returns the record registered under `sessionId`.
 *
 * @throws Error("Unknown Talk session") when nothing is registered for the id.
 */
export function getUnifiedTalkSession(sessionId: string): UnifiedTalkSessionRecord {
  const known = unifiedTalkSessions.get(sessionId);
  if (known) {
    return known;
  }
  throw new Error("Unknown Talk session");
}

/** Removes the record registered under `sessionId`, if any. */
export function forgetUnifiedTalkSession(sessionId: string): void {
  unifiedTalkSessions.delete(sessionId);
}

/**
 * Asserts that a connection-bound session belongs to `connId`.
 *
 * @returns The validated connection id.
 * @throws Error when `connId` is missing/empty or differs from the session's owner.
 */
export function requireUnifiedTalkSessionConn(
  session: Extract<UnifiedTalkSessionRecord, { connId: string }>,
  connId: string | undefined,
): string {
  if (connId && session.connId === connId) {
    return connId;
  }
  throw new Error("Talk session is not owned by this connection");
}

/** Test helper: drops every registered session. */
export function clearUnifiedTalkSessionsForTest(): void {
  unifiedTalkSessions.clear();
}

View File

@@ -0,0 +1,216 @@
import { afterEach, describe, expect, it, vi } from "vitest";
import type { RealtimeTranscriptionProviderPlugin } from "../plugins/types.js";
import type { RealtimeTranscriptionSessionCreateRequest } from "../realtime-transcription/provider-types.js";
import {
cancelTalkTranscriptionRelayTurn,
clearTalkTranscriptionRelaySessionsForTest,
createTalkTranscriptionRelaySession,
sendTalkTranscriptionRelayAudio,
stopTalkTranscriptionRelaySession,
} from "./talk-transcription-relay.js";
// Exercises the transcription-only gateway relay against a fake STT provider,
// checking both the legacy relay payload fields and the attached `talkEvent`.
describe("talk transcription gateway relay", () => {
// Relay sessions live in module-level state; reset it between tests.
afterEach(() => {
clearTalkTranscriptionRelaySessionsForTest();
});
it("bridges browser audio into a transcription-only Talk event stream", async () => {
let sttRequest: RealtimeTranscriptionSessionCreateRequest | undefined;
// Fake provider session: connecting immediately reports speech start, one
// partial transcript and one final transcript through the captured request.
const sttSession = {
connect: vi.fn(async () => {
sttRequest?.onSpeechStart?.();
sttRequest?.onPartial?.("hel");
sttRequest?.onTranscript?.("hello world");
}),
sendAudio: vi.fn(),
close: vi.fn(),
isConnected: vi.fn(() => true),
};
const provider: RealtimeTranscriptionProviderPlugin = {
id: "stt-test",
label: "STT Test",
isConfigured: () => true,
createSession: (req) => {
sttRequest = req;
return sttSession;
},
};
// Records every owner-targeted broadcast together with its recipients.
const events: Array<{ event: string; payload: unknown; connIds: string[] }> = [];
const context = {
broadcastToConnIds: (event: string, payload: unknown, connIds: ReadonlySet<string>) => {
events.push({ event, payload, connIds: [...connIds] });
},
} as never;
const session = createTalkTranscriptionRelaySession({
context,
connId: "conn-1",
provider,
providerConfig: { model: "stt-model" },
});
// Yield one microtask tick before asserting.
// NOTE(review): presumably this lets the relay's connect handling settle; the
// implementation of session creation is not visible from this test.
await Promise.resolve();
expect(session).toMatchObject({
provider: "stt-test",
mode: "transcription",
transport: "gateway-relay",
audio: {
inputEncoding: "pcm16",
inputSampleRateHz: 24000,
},
});
expect(sttRequest).toMatchObject({
providerConfig: { model: "stt-model" },
});
sendTalkTranscriptionRelayAudio({
transcriptionSessionId: session.transcriptionSessionId,
connId: "conn-1",
audioBase64: Buffer.from("audio-in").toString("base64"),
});
stopTalkTranscriptionRelaySession({
transcriptionSessionId: session.transcriptionSessionId,
connId: "conn-1",
});
expect(sttSession.sendAudio).toHaveBeenCalledWith(Buffer.from("audio-in"));
expect(sttSession.close).toHaveBeenCalledOnce();
// `arrayContaining` checks that each expected event is present; it does not
// assert the relative order of the events.
expect(events).toEqual(
expect.arrayContaining([
expect.objectContaining({
event: "talk.transcription.relay",
connIds: ["conn-1"],
payload: expect.objectContaining({
transcriptionSessionId: session.transcriptionSessionId,
type: "ready",
talkEvent: expect.objectContaining({
sessionId: session.transcriptionSessionId,
type: "session.ready",
mode: "transcription",
transport: "gateway-relay",
brain: "none",
provider: "stt-test",
}),
}),
}),
expect.objectContaining({
payload: expect.objectContaining({
transcriptionSessionId: session.transcriptionSessionId,
type: "speechStart",
talkEvent: expect.objectContaining({ type: "turn.started", turnId: "turn-1" }),
}),
}),
expect.objectContaining({
payload: expect.objectContaining({
transcriptionSessionId: session.transcriptionSessionId,
type: "partial",
text: "hel",
talkEvent: expect.objectContaining({
type: "transcript.delta",
turnId: "turn-1",
payload: { text: "hel" },
}),
}),
}),
expect.objectContaining({
payload: expect.objectContaining({
transcriptionSessionId: session.transcriptionSessionId,
type: "transcript",
text: "hello world",
final: true,
talkEvent: expect.objectContaining({
type: "transcript.done",
turnId: "turn-1",
final: true,
payload: { text: "hello world" },
}),
}),
}),
expect.objectContaining({
payload: expect.objectContaining({
transcriptionSessionId: session.transcriptionSessionId,
type: "inputAudio",
// "audio-in" is 8 bytes once decoded from base64.
byteLength: 8,
talkEvent: expect.objectContaining({ type: "input.audio.delta" }),
}),
}),
expect.objectContaining({
payload: expect.objectContaining({
transcriptionSessionId: session.transcriptionSessionId,
type: "close",
reason: "completed",
talkEvent: expect.objectContaining({
type: "session.closed",
final: true,
}),
}),
}),
]),
);
});
it("cancels an active transcription turn and closes the provider session", async () => {
let sttRequest: RealtimeTranscriptionSessionCreateRequest | undefined;
// Fake provider session: connecting only reports speech start, which leaves
// a turn open for the cancel call below.
const sttSession = {
connect: vi.fn(async () => {
sttRequest?.onSpeechStart?.();
}),
sendAudio: vi.fn(),
close: vi.fn(),
isConnected: vi.fn(() => true),
};
const provider: RealtimeTranscriptionProviderPlugin = {
id: "stt-test",
label: "STT Test",
isConfigured: () => true,
createSession: (req) => {
sttRequest = req;
return sttSession;
},
};
const events: Array<{ event: string; payload: unknown; connIds: string[] }> = [];
const context = {
broadcastToConnIds: (event: string, payload: unknown, connIds: ReadonlySet<string>) => {
events.push({ event, payload, connIds: [...connIds] });
},
} as never;
const session = createTalkTranscriptionRelaySession({
context,
connId: "conn-1",
provider,
providerConfig: {},
});
// Yield one microtask tick before cancelling (see the note in the first test).
await Promise.resolve();
cancelTalkTranscriptionRelayTurn({
transcriptionSessionId: session.transcriptionSessionId,
connId: "conn-1",
reason: "barge-in",
});
// Cancelling is expected to close the provider session exactly once and to
// emit both a `turn.cancelled` Talk event and a `close` relay event.
expect(sttSession.close).toHaveBeenCalledOnce();
expect(events).toEqual(
expect.arrayContaining([
expect.objectContaining({
payload: expect.objectContaining({
transcriptionSessionId: session.transcriptionSessionId,
talkEvent: expect.objectContaining({
type: "turn.cancelled",
turnId: "turn-1",
payload: { reason: "barge-in" },
final: true,
}),
}),
}),
expect.objectContaining({
payload: expect.objectContaining({
transcriptionSessionId: session.transcriptionSessionId,
type: "close",
reason: "completed",
}),
}),
]),
);
});
});

View File

@@ -0,0 +1,354 @@
import { randomUUID } from "node:crypto";
import type { RealtimeTranscriptionProviderPlugin } from "../plugins/types.js";
import type { RealtimeTranscriptionProviderConfig } from "../realtime-transcription/provider-types.js";
import {
type TalkEvent,
type TalkEventInput,
type TalkSessionController,
createTalkSessionController,
} from "../realtime-voice/talk-session-controller.js";
import type { GatewayRequestContext } from "./server-methods/shared-types.js";
// Lifetime of a transcription relay session (30 minutes); added to the
// creation time to compute the session's `expiresAtMs`.
const TRANSCRIPTION_SESSION_TTL_MS = 30 * 60 * 1000;
// 512 KiB.
// NOTE(review): presumably the cap on a single base64 audio frame; the check
// that uses it is outside this part of the file — confirm.
const MAX_AUDIO_BASE64_BYTES = 512 * 1024;
// Limits checked by enforceTranscriptionSessionLimits before a session is created.
const MAX_TRANSCRIPTION_SESSIONS_PER_CONN = 2;
const MAX_TRANSCRIPTION_SESSIONS_GLOBAL = 64;
// Gateway event name used for every transcription relay broadcast.
const TRANSCRIPTION_EVENT = "talk.transcription.relay";
// Relay payload variants, discriminated by `type`; every variant carries the
// id of the transcription session it belongs to.
type TalkTranscriptionRelayEventPayload =
| { transcriptionSessionId: string; type: "ready" }
| { transcriptionSessionId: string; type: "inputAudio"; byteLength: number }
| { transcriptionSessionId: string; type: "partial"; text: string }
| { transcriptionSessionId: string; type: "transcript"; text: string; final: true }
| { transcriptionSessionId: string; type: "speechStart" }
| { transcriptionSessionId: string; type: "error"; message: string }
| { transcriptionSessionId: string; type: "close"; reason: "completed" | "error" };
// A relay payload optionally accompanied by the unified Talk event for the same occurrence.
type TalkTranscriptionRelayEvent = TalkTranscriptionRelayEventPayload & {
talkEvent?: TalkEvent;
};
// Gateway-side state kept for one active transcription relay session.
type TranscriptionRelaySession = {
id: string;
// Connection that owns the session; relay events are broadcast only to it.
connId: string;
context: GatewayRequestContext;
provider: RealtimeTranscriptionProviderPlugin;
// Provider session returned by the plugin's createSession().
sttSession: ReturnType<RealtimeTranscriptionProviderPlugin["createSession"]>;
talk: TalkSessionController;
expiresAtMs: number;
cleanupTimer: ReturnType<typeof setTimeout>;
// Set by closeTranscriptionSession so that closing is performed only once.
closed: boolean;
};
// Inputs for createTalkTranscriptionRelaySession.
type CreateTalkTranscriptionRelaySessionParams = {
context: GatewayRequestContext;
connId: string;
provider: RealtimeTranscriptionProviderPlugin;
providerConfig: RealtimeTranscriptionProviderConfig;
};
// Description of a created session returned to the caller, including the
// audio format (PCM16 at 24 kHz) the relay expects as input.
type TalkTranscriptionRelaySessionResult = {
provider: string;
mode: "transcription";
transport: "gateway-relay";
transcriptionSessionId: string;
audio: {
inputEncoding: "pcm16";
inputSampleRateHz: 24000;
};
expiresAt: number;
};
// Active transcription relay sessions, keyed by transcription session id.
const transcriptionSessions = new Map<string, TranscriptionRelaySession>();
/**
 * Sends a transcription relay event to the single connection that owns the
 * session, allowing the gateway to drop it for a slow consumer.
 */
function broadcastToOwner(
  context: GatewayRequestContext,
  connId: string,
  event: TalkTranscriptionRelayEvent,
): void {
  const ownerOnly = new Set([connId]);
  context.broadcastToConnIds(TRANSCRIPTION_EVENT, event, ownerOnly, { dropIfSlow: true });
}
/**
 * Returns the id of the session's current turn, as reported by the Talk
 * controller's `ensureTurn()`.
 *
 * When `ensureTurn()` also yields an event, that event is forwarded to the
 * owning connection on a `speechStart` relay payload.
 */
function ensureTranscriptionTurn(session: TranscriptionRelaySession): string {
  const { turnId, event } = session.talk.ensureTurn();
  if (!event) {
    return turnId;
  }
  broadcastToOwner(session.context, session.connId, {
    transcriptionSessionId: session.id,
    type: "speechStart",
    talkEvent: event,
  });
  return turnId;
}
/**
 * Tears down a relay session and tells its owner that it closed.
 *
 * The call is idempotent: once `closed` is set, later calls return
 * immediately. The session is unregistered and its TTL timer cancelled before
 * the provider session is closed.
 *
 * The terminal `close` event (with its `session.closed` Talk envelope) is
 * broadcast from a `finally` block, so the owner is still notified when the
 * provider's `close()` throws. Without that, the session would already be
 * unregistered but the client would never learn it ended. The provider error
 * still propagates to the caller.
 *
 * @param session - Session to close.
 * @param reason - Why the session ended; forwarded in the close event.
 */
function closeTranscriptionSession(
  session: TranscriptionRelaySession,
  reason: "completed" | "error",
): void {
  if (session.closed) {
    return;
  }
  session.closed = true;
  transcriptionSessions.delete(session.id);
  clearTimeout(session.cleanupTimer);
  try {
    session.sttSession.close();
  } finally {
    broadcastToOwner(session.context, session.connId, {
      transcriptionSessionId: session.id,
      type: "close",
      reason,
      talkEvent: session.talk.emit({
        type: "session.closed",
        payload: { reason },
        final: true,
      }),
    });
  }
}
/**
 * Closes every registered session whose expiry lies strictly before `nowMs`.
 *
 * @param nowMs - Reference time in epoch milliseconds; defaults to the current time.
 */
function pruneExpiredTranscriptionSessions(nowMs = Date.now()): void {
  transcriptionSessions.forEach((candidate) => {
    const expired = candidate.expiresAtMs < nowMs;
    if (expired) {
      closeTranscriptionSession(candidate, "completed");
    }
  });
}
/**
 * Counts the registered sessions owned by one connection.
 *
 * @param connId - Connection id to match.
 * @returns Number of sessions whose `connId` equals the argument.
 */
function countTranscriptionSessionsForConn(connId: string): number {
  const owned = Array.from(transcriptionSessions.values()).filter(
    (candidate) => candidate.connId === connId,
  );
  return owned.length;
}
/**
 * Rejects a new session when either capacity limit is already reached.
 *
 * Expired sessions are pruned first. The global limit is checked before the
 * per-connection limit.
 *
 * @param connId - Connection requesting a new session.
 * @throws Error when the global or the per-connection limit is reached.
 */
function enforceTranscriptionSessionLimits(connId: string): void {
  pruneExpiredTranscriptionSessions();
  const globalLimitReached = transcriptionSessions.size >= MAX_TRANSCRIPTION_SESSIONS_GLOBAL;
  if (globalLimitReached) {
    throw new Error("Too many active transcription Talk sessions");
  }
  const ownedByConn = countTranscriptionSessionsForConn(connId);
  if (ownedByConn >= MAX_TRANSCRIPTION_SESSIONS_PER_CONN) {
    throw new Error("Too many active transcription Talk sessions for this connection");
  }
}
/**
 * Opens a gateway-relayed transcription session for one connection.
 *
 * Steps, in order:
 * 1. Enforce the global and per-connection session limits (throws when exceeded).
 * 2. Create the Talk controller and the provider session, wiring provider
 *    callbacks to relay events plus their Talk envelopes.
 * 3. Register the session and arm an unref'ed timer that closes it with
 *    reason "completed" once the TTL elapses.
 * 4. Start connecting to the provider without awaiting it. `ready` is emitted
 *    on success; on failure an `error` event is emitted and the session is
 *    closed with reason "error".
 *
 * Once the session is closed, late provider callbacks and a late `connect()`
 * resolution or rejection are ignored, so no relay or Talk event follows the
 * terminal `close` / `session.closed` event.
 *
 * @param params - Owning connection, gateway context, provider and provider config.
 * @returns Session descriptor; `expiresAt` is the expiry in whole seconds.
 * @throws Error when a session limit is reached.
 */
export function createTalkTranscriptionRelaySession(
  params: CreateTalkTranscriptionRelaySessionParams,
): TalkTranscriptionRelaySessionResult {
  enforceTranscriptionSessionLimits(params.connId);
  const transcriptionSessionId = randomUUID();
  const expiresAtMs = Date.now() + TRANSCRIPTION_SESSION_TTL_MS;
  const talk = createTalkSessionController({
    sessionId: transcriptionSessionId,
    mode: "transcription",
    transport: "gateway-relay",
    brain: "none",
    provider: params.provider.id,
  });
  let relay: TranscriptionRelaySession | undefined;
  // True only after closeTranscriptionSession ran; false while `relay` is still unset.
  const isClosed = (): boolean => relay?.closed === true;
  const emit = (event: TalkTranscriptionRelayEventPayload, talkEvent?: TalkEventInput): void => {
    if (isClosed()) {
      // Nothing may follow the terminal close event.
      return;
    }
    broadcastToOwner(params.context, params.connId, {
      ...event,
      ...(talkEvent ? { talkEvent: talk.emit(talkEvent) } : {}),
    });
  };
  const ensureTurnId = (): string => {
    // Placeholder id for callbacks fired before `relay` is assigned below.
    return relay ? ensureTranscriptionTurn(relay) : "turn-1";
  };
  const sttSession = params.provider.createSession({
    providerConfig: params.providerConfig,
    onSpeechStart: () => {
      if (isClosed()) {
        return;
      }
      ensureTurnId();
    },
    onPartial: (text) => {
      if (isClosed()) {
        return;
      }
      const turnId = ensureTurnId();
      emit(
        { transcriptionSessionId, type: "partial", text },
        {
          type: "transcript.delta",
          turnId,
          payload: { text },
        },
      );
    },
    onTranscript: (text) => {
      if (isClosed()) {
        return;
      }
      const turnId = ensureTurnId();
      emit(
        { transcriptionSessionId, type: "transcript", text, final: true },
        {
          type: "transcript.done",
          turnId,
          payload: { text },
          final: true,
        },
      );
      if (relay) {
        // A final transcript ends the turn; the turn-ended envelope travels on
        // an empty final transcript event.
        const ended = relay.talk.endTurn({ turnId, payload: {} });
        if (ended.ok) {
          broadcastToOwner(relay.context, relay.connId, {
            transcriptionSessionId,
            type: "transcript",
            text: "",
            final: true,
            talkEvent: ended.event,
          });
        }
      }
    },
    onError: (error) => {
      if (isClosed()) {
        return;
      }
      emit(
        { transcriptionSessionId, type: "error", message: error.message },
        {
          type: "session.error",
          payload: { message: error.message },
          final: true,
        },
      );
      if (relay) {
        closeTranscriptionSession(relay, "error");
      }
    },
  });
  relay = {
    id: transcriptionSessionId,
    connId: params.connId,
    context: params.context,
    provider: params.provider,
    sttSession,
    talk,
    expiresAtMs,
    cleanupTimer: setTimeout(() => {
      const active = transcriptionSessions.get(transcriptionSessionId);
      if (active) {
        closeTranscriptionSession(active, "completed");
      }
    }, TRANSCRIPTION_SESSION_TTL_MS),
    closed: false,
  };
  // Do not let the TTL timer keep the process alive.
  relay.cleanupTimer.unref?.();
  transcriptionSessions.set(transcriptionSessionId, relay);
  sttSession
    .connect()
    .then(() => {
      emit({ transcriptionSessionId, type: "ready" }, { type: "session.ready", payload: null });
    })
    .catch((error: unknown) => {
      const message = error instanceof Error ? error.message : String(error);
      emit(
        { transcriptionSessionId, type: "error", message },
        {
          type: "session.error",
          payload: { message },
          final: true,
        },
      );
      const active = transcriptionSessions.get(transcriptionSessionId);
      if (active) {
        closeTranscriptionSession(active, "error");
      }
    });
  return {
    provider: params.provider.id,
    mode: "transcription",
    transport: "gateway-relay",
    transcriptionSessionId,
    audio: {
      inputEncoding: "pcm16",
      inputSampleRateHz: 24000,
    },
    expiresAt: Math.floor(expiresAtMs / 1000),
  };
}
/**
 * Resolves a session id to the live session owned by the calling connection.
 *
 * A session that is missing or owned by a different connection is reported as
 * unknown and left untouched, so a foreign connection cannot terminate someone
 * else's session just by presenting its id. A session that the owner looks up
 * after its expiry is closed with reason "completed" before the error is
 * raised.
 *
 * @param transcriptionSessionId - Id returned when the session was created.
 * @param connId - Connection making the request.
 * @returns The matching, unexpired session.
 * @throws Error "Unknown transcription Talk session" when the session is
 *   missing, owned by another connection, or expired.
 */
function getTranscriptionSession(
  transcriptionSessionId: string,
  connId: string,
): TranscriptionRelaySession {
  const session = transcriptionSessions.get(transcriptionSessionId);
  if (!session || session.connId !== connId) {
    // Same error as for an expired session, so callers cannot tell which ids exist.
    throw new Error("Unknown transcription Talk session");
  }
  if (Date.now() > session.expiresAtMs) {
    closeTranscriptionSession(session, "completed");
    throw new Error("Unknown transcription Talk session");
  }
  return session;
}
/**
 * Forwards one base64-encoded audio frame from the client to the provider session.
 *
 * The frame is size-checked before the session lookup. After decoding, a turn
 * is ensured, the bytes are handed to the provider, and the owner receives an
 * `inputAudio` event whose Talk envelope reports only the decoded byte length.
 *
 * @param params - Session id, calling connection id and the base64 audio frame.
 * @throws Error when the frame exceeds the size limit or the session lookup fails.
 */
export function sendTalkTranscriptionRelayAudio(params: {
  transcriptionSessionId: string;
  connId: string;
  audioBase64: string;
}): void {
  const { audioBase64 } = params;
  const oversized = audioBase64.length > MAX_AUDIO_BASE64_BYTES;
  if (oversized) {
    throw new Error("Transcription Talk audio frame is too large");
  }
  const relay = getTranscriptionSession(params.transcriptionSessionId, params.connId);
  const frame = Buffer.from(audioBase64, "base64");
  const activeTurnId = ensureTranscriptionTurn(relay);
  relay.sttSession.sendAudio(frame);
  const byteLength = frame.byteLength;
  const talkEvent = relay.talk.emit({
    type: "input.audio.delta",
    turnId: activeTurnId,
    payload: { byteLength },
  });
  broadcastToOwner(relay.context, relay.connId, {
    transcriptionSessionId: relay.id,
    type: "inputAudio",
    byteLength,
    talkEvent,
  });
}
/**
 * Ends a transcription session at the client's request.
 *
 * If a turn is active, the owner first receives an empty final `transcript`
 * event whose Talk envelope is `input.audio.committed` for that turn. The
 * session is then closed with reason "completed".
 *
 * @param params - Session id and calling connection id.
 * @throws Error when the session lookup fails.
 */
export function stopTalkTranscriptionRelaySession(params: {
  transcriptionSessionId: string;
  connId: string;
}): void {
  const relay = getTranscriptionSession(params.transcriptionSessionId, params.connId);
  const openTurnId = relay.talk.activeTurnId;
  if (openTurnId) {
    const committed = relay.talk.emit({
      type: "input.audio.committed",
      turnId: openTurnId,
      payload: {},
      final: true,
    });
    broadcastToOwner(relay.context, relay.connId, {
      transcriptionSessionId: relay.id,
      type: "transcript",
      text: "",
      final: true,
      talkEvent: committed,
    });
  }
  closeTranscriptionSession(relay, "completed");
}
/**
 * Cancels the session's current turn and then closes the session.
 *
 * A turn is ensured first, so there is always a turn id to cancel. The owner
 * receives an empty final `transcript` event; its Talk envelope is the
 * cancellation event when the controller accepted the cancel and `undefined`
 * otherwise. The session is closed with reason "completed".
 *
 * @param params - Session id, calling connection id and an optional reason
 *   (defaults to "client-cancelled").
 * @throws Error when the session lookup fails.
 */
export function cancelTalkTranscriptionRelayTurn(params: {
  transcriptionSessionId: string;
  connId: string;
  reason?: string;
}): void {
  const relay = getTranscriptionSession(params.transcriptionSessionId, params.connId);
  const targetTurnId = ensureTranscriptionTurn(relay);
  const cancelReason = params.reason ?? "client-cancelled";
  const outcome = relay.talk.cancelTurn({
    turnId: targetTurnId,
    payload: { reason: cancelReason },
  });
  const envelope = outcome.ok ? outcome.event : undefined;
  broadcastToOwner(relay.context, relay.connId, {
    transcriptionSessionId: relay.id,
    type: "transcript",
    text: "",
    final: true,
    talkEvent: envelope,
  });
  closeTranscriptionSession(relay, "completed");
}
/**
 * Test helper that drops every registered session.
 *
 * Each session's TTL timer is cancelled and its provider session closed, then
 * the registry is emptied. No close events are broadcast.
 */
export function clearTalkTranscriptionRelaySessionsForTest(): void {
  transcriptionSessions.forEach((entry) => {
    clearTimeout(entry.cleanupTimer);
    entry.sttSession.close();
  });
  transcriptionSessions.clear();
}

View File

@@ -55,10 +55,6 @@ export function buildInstructions(config: VoiceClawSessionConfigEvent): string {
parts.push(deviceContext);
}
if (config.instructionsOverride?.trim()) {
parts.push(`## About The User\n${config.instructionsOverride.trim()}`);
}
if (config.conversationHistory && config.conversationHistory.length > 0) {
parts.push(buildConversationHistory(config.conversationHistory));
}

View File

@@ -3,6 +3,8 @@ import type { IncomingMessage } from "node:http";
import { describe, expect, it, vi } from "vitest";
import WebSocket from "ws";
import type { OpenClawConfig } from "../../config/types.openclaw.js";
import type { TalkEvent } from "../../realtime-voice/talk-events.js";
import { createTalkSessionController } from "../../realtime-voice/talk-session-controller.js";
import type { ResolvedGatewayAuth } from "../auth.js";
import { resolveRealtimeSenderIsOwner, VoiceClawRealtimeSession } from "./session.js";
import type {
@@ -60,6 +62,45 @@ function makeAdapter(): VoiceClawRealtimeAdapter {
}
describe("VoiceClawRealtimeSession lifecycle", () => {
// A session.config message carrying a non-empty instructionsOverride must be
// answered with a 400 error and a 1008 close, without the adapter ever connecting.
it("rejects request-time instructionsOverride", async () => {
const ws = new FakeWebSocket();
const adapter = makeAdapter();
const releasePreauthBudget = vi.fn();
const session = new VoiceClawRealtimeSession({
ws: ws as unknown as WebSocket,
req: {} as IncomingMessage,
auth: { mode: "none" } as ResolvedGatewayAuth,
config: {} as OpenClawConfig,
trustedProxies: [],
allowRealIpFallback: false,
releasePreauthBudget,
adapterFactory: () => adapter,
});
session.attach();
ws.emit(
"message",
JSON.stringify({
type: "session.config",
brainAgent: "none",
instructionsOverride: "custom request-time instructions",
}),
);
// Yield one macrotask before asserting on what the session sent.
await new Promise((resolve) => setImmediate(resolve));
expect(ws.sent).toEqual([
{
type: "error",
message: "request-time instructionsOverride is not supported",
code: 400,
},
]);
expect(ws.closeCode).toBe(1008);
expect(ws.closeReason).toBe("unsupported instruction override");
expect(adapter.connect).not.toHaveBeenCalled();
expect(releasePreauthBudget).toHaveBeenCalledOnce();
});
it("sends session summary before closing after terminal adapter errors", () => {
const ws = new FakeWebSocket();
const adapter = makeAdapter();
@@ -102,4 +143,199 @@ describe("VoiceClawRealtimeSession lifecycle", () => {
expect(adapter.disconnect).toHaveBeenCalledOnce();
expect(releasePreauthBudget).toHaveBeenCalledOnce();
});
// An adapter event forwarded to the socket must carry a `talkEvent` envelope
// stamped with the Talk controller's session metadata.
it("adds common Talk event envelopes to configured server events", () => {
const ws = new FakeWebSocket();
const adapter = makeAdapter();
const session = new VoiceClawRealtimeSession({
ws: ws as unknown as WebSocket,
req: {} as IncomingMessage,
auth: { mode: "none" } as ResolvedGatewayAuth,
config: {} as OpenClawConfig,
trustedProxies: [],
allowRealIpFallback: false,
releasePreauthBudget: vi.fn(),
adapterFactory: () => adapter,
});
// Private state is set directly so the test can skip the config handshake.
const internals = session as unknown as {
config: VoiceClawSessionConfigEvent;
talk: unknown;
handleAdapterEvent(event: VoiceClawServerEvent): void;
};
internals.config = { type: "session.config", brainAgent: "none", provider: "gemini" };
internals.talk = createTalkSessionController({
sessionId: "voice-session",
mode: "realtime",
transport: "gateway-relay",
brain: "direct-tools",
provider: "gemini",
});
internals.handleAdapterEvent({
type: "transcript.done",
role: "assistant",
text: "hello",
});
// The server event keeps its own type (transcript.done) while the envelope
// for assistant text is output.text.done.
expect(ws.sent).toEqual([
expect.objectContaining({
type: "transcript.done",
talkEvent: expect.objectContaining({
type: "output.text.done",
sessionId: "voice-session",
mode: "realtime",
transport: "gateway-relay",
brain: "direct-tools",
provider: "gemini",
final: true,
payload: { role: "assistant", text: "hello" },
}),
}),
]);
});
// The base64 audio stays on the server event itself; the Talk envelope payload
// only reports the decoded byte length.
it("keeps streamed output audio out of common Talk event payloads", () => {
const ws = new FakeWebSocket();
const adapter = makeAdapter();
const session = new VoiceClawRealtimeSession({
ws: ws as unknown as WebSocket,
req: {} as IncomingMessage,
auth: { mode: "none" } as ResolvedGatewayAuth,
config: {} as OpenClawConfig,
trustedProxies: [],
allowRealIpFallback: false,
releasePreauthBudget: vi.fn(),
adapterFactory: () => adapter,
});
// Private state is set directly so the test can skip the config handshake.
const internals = session as unknown as {
config: VoiceClawSessionConfigEvent;
talk: unknown;
handleAdapterEvent(event: VoiceClawServerEvent): void;
};
// "hello" is 5 bytes, which is the byteLength expected below.
const audioData = Buffer.from("hello").toString("base64");
internals.config = { type: "session.config", brainAgent: "none", provider: "gemini" };
internals.talk = createTalkSessionController({
sessionId: "voice-session",
mode: "realtime",
transport: "gateway-relay",
brain: "direct-tools",
provider: "gemini",
});
internals.handleAdapterEvent({
type: "audio.delta",
data: audioData,
});
expect(ws.sent).toEqual([
expect.objectContaining({
type: "audio.delta",
data: audioData,
talkEvent: expect.objectContaining({
type: "output.audio.delta",
payload: { byteLength: 5 },
}),
}),
]);
expect(
(ws.sent[0] as { talkEvent?: { payload?: Record<string, unknown> } }).talkEvent?.payload,
).not.toHaveProperty("data");
});
// Drives the client-to-server message handler directly and checks both the
// adapter calls and the Talk events delivered through the onTalkEvent hook.
it("emits common Talk events for client audio, video, cancellation, and tool results", async () => {
const ws = new FakeWebSocket();
const adapter = makeAdapter();
const talkEvents: TalkEvent[] = [];
const session = new VoiceClawRealtimeSession({
ws: ws as unknown as WebSocket,
req: {} as IncomingMessage,
auth: { mode: "none" } as ResolvedGatewayAuth,
config: {} as OpenClawConfig,
trustedProxies: [],
allowRealIpFallback: false,
releasePreauthBudget: vi.fn(),
adapterFactory: () => adapter,
onTalkEvent: (event) => talkEvents.push(event),
});
// Private state is set directly so the test can skip the config handshake.
const internals = session as unknown as {
adapter: VoiceClawRealtimeAdapter;
config: VoiceClawSessionConfigEvent;
talk: ReturnType<typeof createTalkSessionController>;
handleRawMessage(raw: string): Promise<void>;
};
internals.adapter = adapter;
internals.config = { type: "session.config", brainAgent: "none", provider: "gemini" };
internals.talk = createTalkSessionController({
sessionId: "voice-session",
mode: "realtime",
transport: "gateway-relay",
brain: "direct-tools",
provider: "gemini",
});
// Started on the controller directly, so this turn.started event is not
// among the events captured by onTalkEvent.
internals.talk.startTurn({ turnId: "turn-client" });
await internals.handleRawMessage(
JSON.stringify({ type: "audio.append", data: Buffer.from("hello").toString("base64") }),
);
await internals.handleRawMessage(JSON.stringify({ type: "audio.commit" }));
await internals.handleRawMessage(
JSON.stringify({
type: "frame.append",
data: Buffer.from("frame").toString("base64"),
mimeType: "image/jpeg",
}),
);
await internals.handleRawMessage(JSON.stringify({ type: "response.cancel" }));
await internals.handleRawMessage(
JSON.stringify({ type: "tool.result", callId: "call-1", output: "done" }),
);
expect(adapter.sendAudio).toHaveBeenCalledWith(Buffer.from("hello").toString("base64"));
expect(adapter.commitAudio).toHaveBeenCalledOnce();
expect(adapter.sendFrame).toHaveBeenCalledWith(
Buffer.from("frame").toString("base64"),
"image/jpeg",
);
expect(adapter.cancelResponse).toHaveBeenCalledOnce();
expect(adapter.sendToolResult).toHaveBeenCalledWith("call-1", "done");
// The tool result arrives after the cancellation, so an implicit
// turn.started is expected right before tool.result.
expect(talkEvents.map((event) => event.type)).toEqual([
"input.audio.delta",
"input.audio.committed",
"health.changed",
"turn.cancelled",
"turn.started",
"tool.result",
]);
expect(talkEvents).toEqual([
expect.objectContaining({
type: "input.audio.delta",
turnId: "turn-client",
payload: { byteLength: 5 },
}),
expect.objectContaining({
type: "input.audio.committed",
turnId: "turn-client",
final: true,
}),
// Video frames are expected as health.changed events; "frame" is 5 bytes.
expect.objectContaining({
type: "health.changed",
payload: { inputVideoFrame: true, mimeType: "image/jpeg", byteLength: 5 },
}),
expect.objectContaining({
type: "turn.cancelled",
payload: { reason: "client-cancelled" },
final: true,
}),
expect.objectContaining({
type: "turn.started",
payload: { source: "implicit" },
}),
expect.objectContaining({
type: "tool.result",
callId: "call-1",
payload: { output: "done" },
final: true,
}),
]);
});
});

View File

@@ -3,6 +3,12 @@ import type { IncomingMessage } from "node:http";
import WebSocket, { type RawData } from "ws";
import type { OpenClawConfig } from "../../config/types.openclaw.js";
import { createSubsystemLogger } from "../../logging/subsystem.js";
import {
type TalkEvent,
type TalkEventInput,
type TalkSessionController,
createTalkSessionController,
} from "../../realtime-voice/talk-session-controller.js";
import type { AuthRateLimiter } from "../auth-rate-limit.js";
import {
authorizeHttpGatewayConnect,
@@ -36,6 +42,7 @@ type VoiceClawRealtimeSessionOptions = {
rateLimiter?: AuthRateLimiter;
releasePreauthBudget: () => void;
adapterFactory?: () => VoiceClawRealtimeAdapter;
onTalkEvent?: (event: TalkEvent) => void;
};
export class VoiceClawRealtimeSession {
@@ -50,8 +57,10 @@ export class VoiceClawRealtimeSession {
private readonly rateLimiter: AuthRateLimiter | undefined;
private readonly releasePreauthBudget: () => void;
private readonly adapterFactory: () => VoiceClawRealtimeAdapter;
private readonly onTalkEvent: ((event: TalkEvent) => void) | undefined;
private adapter: VoiceClawRealtimeAdapter | null = null;
private toolRuntime: VoiceClawRealtimeToolRuntime | null = null;
private talk: TalkSessionController | null = null;
private config: VoiceClawSessionConfigEvent | null = null;
private handshakeTimer: ReturnType<typeof setTimeout> | null = null;
private closed = false;
@@ -67,6 +76,7 @@ export class VoiceClawRealtimeSession {
this.rateLimiter = opts.rateLimiter;
this.releasePreauthBudget = once(opts.releasePreauthBudget);
this.adapterFactory = opts.adapterFactory ?? (() => new VoiceClawGeminiLiveAdapter());
this.onTalkEvent = opts.onTalkEvent;
}
attach(): void {
@@ -113,24 +123,66 @@ export class VoiceClawRealtimeSession {
}
switch (event.type) {
case "audio.append":
case "audio.append": {
const audioTurnId = this.ensureActiveTurnId();
this.adapter?.sendAudio(event.data);
this.emitTalkEvent({
type: "input.audio.delta",
payload: { byteLength: base64ByteLength(event.data) },
turnId: audioTurnId,
});
break;
case "audio.commit":
}
case "audio.commit": {
const commitTurnId = this.ensureActiveTurnId();
this.adapter?.commitAudio();
this.emitTalkEvent({
type: "input.audio.committed",
payload: {},
turnId: commitTurnId,
final: true,
});
break;
}
case "frame.append":
this.adapter?.sendFrame(event.data, event.mimeType);
this.emitTalkEvent({
type: "health.changed",
payload: {
inputVideoFrame: true,
mimeType: event.mimeType,
byteLength: base64ByteLength(event.data),
},
turnId: this.talk?.activeTurnId,
});
break;
case "response.create":
this.adapter?.createResponse();
break;
case "response.cancel":
case "response.cancel": {
const cancelTurnId = this.ensureActiveTurnId();
this.adapter?.cancelResponse();
const cancelled = this.talk?.cancelTurn({
turnId: cancelTurnId,
payload: { reason: "client-cancelled" },
});
if (cancelled?.ok) {
this.onTalkEvent?.(cancelled.event);
}
break;
case "tool.result":
}
case "tool.result": {
const toolTurnId = this.ensureActiveTurnId();
this.adapter?.sendToolResult(event.callId, event.output);
this.emitTalkEvent({
type: "tool.result",
payload: { output: event.output },
turnId: toolTurnId,
callId: event.callId,
final: true,
});
break;
}
case "session.config":
this.send({ type: "error", message: "session already configured", code: 400 });
break;
@@ -144,6 +196,16 @@ export class VoiceClawRealtimeSession {
this.configStarted = true;
this.clearHandshakeTimer();
if (hasInstructionsOverride(config)) {
this.send({
type: "error",
message: "request-time instructionsOverride is not supported",
code: 400,
});
this.ws.close(1008, "unsupported instruction override");
return;
}
const authResult = await authorizeHttpGatewayConnect({
auth: this.auth,
connectAuth: config.apiKey ? { token: config.apiKey, password: config.apiKey } : null,
@@ -190,6 +252,13 @@ export class VoiceClawRealtimeSession {
voice: config.voice || "Zephyr",
brainAgent: config.brainAgent ?? "enabled",
};
this.talk = createTalkSessionController({
sessionId: this.id,
mode: "realtime",
transport: "gateway-relay",
brain: this.config.brainAgent === "none" ? "none" : "direct-tools",
provider: this.config.provider,
});
this.adapter = this.adapterFactory();
try {
@@ -270,7 +339,134 @@ export class VoiceClawRealtimeSession {
if (this.closed || this.ws.readyState !== WebSocket.OPEN) {
return;
}
this.ws.send(JSON.stringify(event));
this.ws.send(JSON.stringify(this.withTalkEvent(event)));
}
/**
 * Attaches a common Talk event envelope to an outgoing server event.
 *
 * The event is returned unchanged when it has no Talk mapping or when no Talk
 * controller exists yet.
 *
 * @param event - Server event about to be sent to the client.
 * @returns The event, with a `talkEvent` property added when it could be mapped.
 */
private withTalkEvent(
  event: VoiceClawServerEvent,
): VoiceClawServerEvent & { talkEvent?: TalkEvent } {
  const mapped = this.toTalkEventInput(event);
  if (mapped && this.talk) {
    const talkEvent = this.emitTalkEvent(mapped);
    return { ...event, talkEvent };
  }
  return event;
}
/**
 * Produces a Talk event from the given input and reports it to `onTalkEvent`.
 *
 * Turn lifecycle inputs go through the controller's `startTurn`, `endTurn`
 * and `cancelTurn`; every other input goes through `emit`. When `endTurn` or
 * `cancelTurn` reports failure, no event is produced.
 *
 * @param input - Talk event input to emit.
 * @returns The produced event, or `undefined` when there is no controller or
 *   the controller produced none.
 */
private emitTalkEvent(input: TalkEventInput): TalkEvent | undefined {
  const controller = this.talk;
  if (!controller) {
    return undefined;
  }
  const produce = (): TalkEvent | undefined => {
    switch (input.type) {
      case "turn.started":
        return controller.startTurn({ turnId: input.turnId, payload: input.payload }).event;
      case "turn.ended": {
        const outcome = controller.endTurn({ turnId: input.turnId, payload: input.payload });
        return outcome.ok ? outcome.event : undefined;
      }
      case "turn.cancelled": {
        const outcome = controller.cancelTurn({ turnId: input.turnId, payload: input.payload });
        return outcome.ok ? outcome.event : undefined;
      }
      default:
        return controller.emit(input);
    }
  };
  const produced = produce();
  if (produced) {
    this.onTalkEvent?.(produced);
  }
  return produced;
}
/**
 * Returns the id of the active turn, starting an implicit one when none is active.
 *
 * An implicit turn gets a fresh random UUID and payload `{ source: "implicit" }`;
 * its start event, if any, is reported to `onTalkEvent`. Without a Talk
 * controller a fresh UUID is still returned but no turn is started.
 *
 * @returns The active or newly generated turn id.
 */
private ensureActiveTurnId(): string {
  const currentTurnId = this.talk?.activeTurnId;
  if (currentTurnId) {
    return currentTurnId;
  }
  const implicitTurnId = randomUUID();
  const started = this.talk?.startTurn({
    turnId: implicitTurnId,
    payload: { source: "implicit" },
  });
  const startedEvent = started?.event;
  if (startedEvent) {
    this.onTalkEvent?.(startedEvent);
  }
  return implicitTurnId;
}
/**
 * Maps an outgoing VoiceClaw server event to the input for its common Talk
 * event, or returns null when the event type has no case below.
 *
 * Cases that attach a turnId call ensureActiveTurnId(), which may start an
 * implicit turn as a side effect.
 */
private toTalkEventInput(event: VoiceClawServerEvent): TalkEventInput | null {
switch (event.type) {
case "session.ready":
return { type: "session.ready", payload: { sessionId: event.sessionId } };
case "audio.delta":
// Only the decoded byte length is reported; the audio data is not copied into the payload.
return {
type: "output.audio.delta",
payload: { byteLength: base64ByteLength(event.data) },
turnId: this.ensureActiveTurnId(),
};
case "transcript.delta":
// Assistant text maps to output.text.delta; any other role stays transcript.delta.
return {
type: event.role === "assistant" ? "output.text.delta" : "transcript.delta",
payload: { role: event.role, text: event.text },
turnId: this.ensureActiveTurnId(),
};
case "transcript.done":
// Same role split as transcript.delta, marked final.
return {
type: event.role === "assistant" ? "output.text.done" : "transcript.done",
payload: { role: event.role, text: event.text },
turnId: this.ensureActiveTurnId(),
final: true,
};
case "tool.call":
return {
type: "tool.call",
payload: { name: event.name, arguments: event.arguments },
turnId: this.ensureActiveTurnId(),
callId: event.callId,
};
case "tool.progress":
return {
type: "tool.progress",
payload: { summary: event.summary },
turnId: this.ensureActiveTurnId(),
callId: event.callId,
};
case "turn.started": {
// Use the event's turn id when it has one, otherwise generate a fresh UUID.
const turnId = event.turnId || randomUUID();
return { type: "turn.started", payload: {}, turnId };
}
case "turn.ended": {
const turnId = this.ensureActiveTurnId();
return { type: "turn.ended", payload: {}, turnId, final: true };
}
case "session.ended":
// The session summary is carried on the final session.closed Talk event.
return {
type: "session.closed",
payload: {
summary: event.summary,
durationSec: event.durationSec,
turnCount: event.turnCount,
},
final: true,
};
case "session.rotating":
return { type: "health.changed", payload: { status: "rotating" } };
case "session.rotated":
return { type: "session.replaced", payload: { sessionId: event.sessionId } };
case "usage.metrics":
// Metrics events are passed through whole as the payload.
return { type: "usage.metrics", payload: event };
case "latency.metrics":
return { type: "latency.metrics", payload: event };
case "tool.cancelled":
// Cancelled tool calls are reported as a final tool.error flagged `cancelled: true`.
return {
type: "tool.error",
payload: { callIds: event.callIds, cancelled: true },
turnId: this.ensureActiveTurnId(),
final: true,
};
case "error":
return {
type: "session.error",
payload: { message: event.message, code: event.code },
final: true,
};
}
// Reached only for event types without a case above.
return null;
}
private clearHandshakeTimer(): void {
@@ -330,6 +526,11 @@ function parseClientEvent(raw: RawData): VoiceClawClientEvent | null {
}
}
/**
 * Reports whether a session config carries a non-blank `instructionsOverride`.
 *
 * The property is read through a cast rather than from the config type, so
 * non-string values are treated as absent.
 *
 * @param config - Session config received from the client.
 * @returns `true` only for a string that is non-empty after trimming.
 */
function hasInstructionsOverride(config: VoiceClawSessionConfigEvent): boolean {
  const { instructionsOverride } = config as { instructionsOverride?: unknown };
  if (typeof instructionsOverride !== "string") {
    return false;
  }
  return instructionsOverride.trim() !== "";
}
function sanitizeSessionKey(value: string | undefined): string | null {
const trimmed = value?.trim();
if (!trimmed) {
@@ -353,6 +554,18 @@ function sanitizeErrorMessage(message: string): string {
return message.replace(/([?&]key=)[^&\s]+/g, "$1***");
}
/**
 * Returns the number of bytes a base64 string decodes to.
 *
 * Surrounding whitespace is trimmed first and a blank string counts as zero
 * bytes. If decoding throws, the trimmed string's character count is used as
 * a best-effort fallback.
 *
 * @param value - Base64-encoded data.
 * @returns Decoded byte length, or the fallback described above.
 */
function base64ByteLength(value: string): number {
  const encoded = value.trim();
  if (encoded.length === 0) {
    return 0;
  }
  let byteCount: number;
  try {
    byteCount = Buffer.from(encoded, "base64").byteLength;
  } catch {
    byteCount = encoded.length;
  }
  return byteCount;
}
function once(fn: () => void): () => void {
let called = false;
return () => {

View File

@@ -23,7 +23,6 @@ export type VoiceClawSessionConfigEvent = {
location?: string;
};
watchdog?: "enabled" | "disabled";
instructionsOverride?: string;
conversationHistory?: { role: "user" | "assistant"; text: string }[];
};