From c434d7720b2e2f0cd2a9e1c256aafe56f33ec7eb Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 5 May 2026 20:59:34 +0100 Subject: [PATCH] feat: add unified talk gateway sessions --- ...acy-config-migrate.provider-shapes.test.ts | 78 + .../shared/legacy-talk-config-normalizer.ts | 35 + src/config/schema.help.ts | 15 + src/config/schema.labels.ts | 10 + src/config/talk.normalize.test.ts | 26 + src/config/talk.ts | 52 + src/config/types.gateway.ts | 19 + src/config/zod-schema.ts | 34 + src/gateway/gateway-misc.test.ts | 4 +- src/gateway/method-scopes.test.ts | 23 + src/gateway/method-scopes.ts | 18 + src/gateway/protocol/index.test.ts | 340 +++- src/gateway/protocol/index.ts | 199 ++ src/gateway/protocol/schema/channels.ts | 516 ++++- .../protocol/schema/protocol-schemas.ts | 60 + src/gateway/protocol/schema/types.ts | 30 + src/gateway/server-broadcast.ts | 3 + src/gateway/server-methods-list.test.ts | 24 + src/gateway/server-methods-list.ts | 21 + src/gateway/server-methods/shared-types.ts | 2 +- src/gateway/server-methods/talk-session.ts | 497 +++++ src/gateway/server-methods/talk-shared.ts | 237 +++ src/gateway/server-methods/talk.test.ts | 1704 ++++++++++++++++- src/gateway/server-methods/talk.ts | 803 +++++++- src/gateway/server-mobile-nodes.ts | 12 - src/gateway/server-node-session-runtime.ts | 6 +- src/gateway/server-request-context.test.ts | 2 +- src/gateway/server-request-context.ts | 4 +- src/gateway/server.impl.ts | 4 +- src/gateway/talk-handoff.test.ts | 286 +++ src/gateway/talk-handoff.ts | 389 ++++ src/gateway/talk-realtime-relay.test.ts | 410 +++- src/gateway/talk-realtime-relay.ts | 230 ++- src/gateway/talk-session-registry.ts | 52 + src/gateway/talk-transcription-relay.test.ts | 216 +++ src/gateway/talk-transcription-relay.ts | 354 ++++ .../voiceclaw-realtime/instructions.ts | 4 - .../voiceclaw-realtime/session.test.ts | 236 +++ src/gateway/voiceclaw-realtime/session.ts | 223 ++- src/gateway/voiceclaw-realtime/types.ts | 1 - 40 files 
changed, 7015 insertions(+), 164 deletions(-) create mode 100644 src/gateway/server-methods-list.test.ts create mode 100644 src/gateway/server-methods/talk-session.ts create mode 100644 src/gateway/server-methods/talk-shared.ts delete mode 100644 src/gateway/server-mobile-nodes.ts create mode 100644 src/gateway/talk-handoff.test.ts create mode 100644 src/gateway/talk-handoff.ts create mode 100644 src/gateway/talk-session-registry.ts create mode 100644 src/gateway/talk-transcription-relay.test.ts create mode 100644 src/gateway/talk-transcription-relay.ts diff --git a/src/commands/doctor/shared/legacy-config-migrate.provider-shapes.test.ts b/src/commands/doctor/shared/legacy-config-migrate.provider-shapes.test.ts index fbe13c7c04d..a808074eabc 100644 --- a/src/commands/doctor/shared/legacy-config-migrate.provider-shapes.test.ts +++ b/src/commands/doctor/shared/legacy-config-migrate.provider-shapes.test.ts @@ -1,6 +1,7 @@ import { describe, expect, it } from "vitest"; import type { OpenClawConfig } from "../../../config/types.js"; import { LEGACY_CONFIG_MIGRATIONS_RUNTIME_TTS } from "./legacy-config-migrations.runtime.tts.js"; +import { normalizeLegacyTalkConfig } from "./legacy-talk-config-normalizer.js"; function migrateLegacyConfig(raw: unknown): { config: OpenClawConfig | null; @@ -21,6 +22,83 @@ function migrateLegacyConfig(raw: unknown): { } describe("legacy migrate provider-shaped config", () => { + it("moves legacy realtime Talk selectors into talk.realtime without treating speech config as runtime fallback", () => { + const changes: string[] = []; + const migrated = normalizeLegacyTalkConfig( + { + talk: { + provider: "openai", + providers: { + openai: { + apiKey: "test-key", + custom: true, + }, + }, + mode: "realtime", + transport: "gateway-relay", + brain: "agent-consult", + model: "gpt-realtime", + voice: "alloy", + } as never, + }, + changes, + ); + + expect(changes).toContain( + "Moved legacy realtime Talk provider/model fields into talk.realtime.", + 
); + expect(migrated.talk).toEqual({ + provider: "openai", + providers: { + openai: { + apiKey: "test-key", + custom: true, + }, + }, + realtime: { + provider: "openai", + providers: { + openai: { + apiKey: "test-key", + custom: true, + }, + }, + mode: "realtime", + transport: "gateway-relay", + brain: "agent-consult", + model: "gpt-realtime", + voice: "alloy", + }, + }); + }); + + it("does not copy plain Talk speech provider config into talk.realtime", () => { + const changes: string[] = []; + const migrated = normalizeLegacyTalkConfig( + { + talk: { + provider: "elevenlabs", + providers: { + elevenlabs: { + voiceId: "voice-1", + }, + }, + }, + }, + changes, + ); + + expect(changes).toEqual([]); + expect(migrated.talk).toEqual({ + provider: "elevenlabs", + providers: { + elevenlabs: { + voiceId: "voice-1", + }, + }, + }); + }); + it("moves messages.tts. keys into messages.tts.providers", () => { const res = migrateLegacyConfig({ messages: { diff --git a/src/commands/doctor/shared/legacy-talk-config-normalizer.ts b/src/commands/doctor/shared/legacy-talk-config-normalizer.ts index dd4c5665483..a91fdc44ca3 100644 --- a/src/commands/doctor/shared/legacy-talk-config-normalizer.ts +++ b/src/commands/doctor/shared/legacy-talk-config-normalizer.ts @@ -14,6 +14,31 @@ function buildLegacyTalkProviderCompat( return Object.keys(compat).length > 0 ? 
compat : undefined; } +function buildLegacyRealtimeTalkCompat( + talk: Record, + normalizedTalk: NonNullable, +): Record | undefined { + if (talk.realtime !== undefined) { + return undefined; + } + const compat: Record = {}; + for (const key of ["model", "voice", "mode", "transport", "brain"] as const) { + if (talk[key] !== undefined) { + compat[key] = talk[key]; + } + } + if (Object.keys(compat).length === 0) { + return undefined; + } + if (normalizedTalk.provider !== undefined) { + compat.provider = normalizedTalk.provider; + } + if (normalizedTalk.providers !== undefined) { + compat.providers = normalizedTalk.providers; + } + return normalizeTalkSection({ realtime: compat } as OpenClawConfig["talk"])?.realtime; +} + function isRecord(value: unknown): value is Record { return Boolean(value && typeof value === "object" && !Array.isArray(value)); } @@ -35,6 +60,13 @@ export function normalizeLegacyTalkConfig(cfg: OpenClawConfig, changes: string[] }, }; } + const legacyRealtimeCompat = buildLegacyRealtimeTalkCompat(rawTalk, normalizedTalk); + if (legacyRealtimeCompat) { + normalizedTalk.realtime = { + ...legacyRealtimeCompat, + ...normalizedTalk.realtime, + }; + } if (Object.keys(normalizedTalk).length === 0 || isDeepStrictEqual(normalizedTalk, rawTalk)) { return cfg; } @@ -42,6 +74,9 @@ export function normalizeLegacyTalkConfig(cfg: OpenClawConfig, changes: string[] changes.push( "Normalized talk.provider/providers shape (trimmed provider ids and merged missing compatibility fields).", ); + if (legacyRealtimeCompat) { + changes.push("Moved legacy realtime Talk provider/model fields into talk.realtime."); + } return { ...cfg, talk: normalizedTalk, diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts index 7c24c86c427..ba24a6faa4d 100644 --- a/src/config/schema.help.ts +++ b/src/config/schema.help.ts @@ -150,6 +150,21 @@ export const FIELD_HELP: Record = { "Provider-specific Talk settings keyed by provider id. 
During migration, prefer this over legacy talk.* keys.", "talk.providers.*": "Provider-owned Talk config fields for the matching provider id.", "talk.providers.*.apiKey": "Provider API key for Talk mode.", // pragma: allowlist secret + "talk.realtime": + "Realtime Talk provider, model, voice, mode, transport, and brain strategy. Keep speech/TTS provider config in talk.provider and talk.providers.", + "talk.realtime.provider": "Active realtime voice provider id, such as openai or google.", + "talk.realtime.providers": "Provider-specific realtime voice settings keyed by provider id.", + "talk.realtime.providers.*": "Provider-owned realtime voice config for the matching provider id.", + "talk.realtime.providers.*.apiKey": "Provider API key for realtime Talk.", // pragma: allowlist secret + "talk.realtime.model": + "Realtime provider model id override for browser or Gateway-owned Talk sessions.", + "talk.realtime.voice": + "Realtime provider voice id override for browser or Gateway-owned Talk sessions.", + "talk.realtime.mode": "Talk execution mode: realtime, stt-tts, or transcription.", + "talk.realtime.transport": + "Talk byte/session transport: webrtc, provider-websocket, gateway-relay, or managed-room.", + "talk.realtime.brain": + "Talk reasoning strategy: agent-consult for Gateway-mediated agent help, direct-tools for owner-only tool calls, or none.", "talk.speechLocale": 'BCP 47 locale id for Talk speech recognition on device nodes, for example "ru-RU". 
Leave unset to use each device default.', "talk.interruptOnSpeech": diff --git a/src/config/schema.labels.ts b/src/config/schema.labels.ts index ccd5068b338..667ccfe9b75 100644 --- a/src/config/schema.labels.ts +++ b/src/config/schema.labels.ts @@ -875,6 +875,16 @@ export const FIELD_LABELS: Record = { "talk.providers": "Talk Provider Settings", "talk.providers.*": "Talk Provider Config", "talk.providers.*.apiKey": "Talk Provider API Key", // pragma: allowlist secret + "talk.realtime": "Talk Realtime", + "talk.realtime.provider": "Talk Realtime Provider", + "talk.realtime.providers": "Talk Realtime Provider Settings", + "talk.realtime.providers.*": "Talk Realtime Provider Config", + "talk.realtime.providers.*.apiKey": "Talk Realtime Provider API Key", // pragma: allowlist secret + "talk.realtime.model": "Talk Realtime Model", + "talk.realtime.voice": "Talk Realtime Voice", + "talk.realtime.mode": "Talk Realtime Mode", + "talk.realtime.transport": "Talk Realtime Transport", + "talk.realtime.brain": "Talk Realtime Brain", channels: "Channels", "channels.defaults": "Channel Defaults", "channels.defaults.groupPolicy": "Default Group Policy", diff --git a/src/config/talk.normalize.test.ts b/src/config/talk.normalize.test.ts index 54e6d3b72e0..1f2f36819a7 100644 --- a/src/config/talk.normalize.test.ts +++ b/src/config/talk.normalize.test.ts @@ -31,6 +31,19 @@ describe("talk normalization", () => { custom: true, }, }, + realtime: { + provider: "openai", + providers: { + openai: { + model: "gpt-realtime", + }, + }, + model: "gpt-realtime", + voice: "alloy", + mode: "realtime", + transport: "webrtc", + brain: "agent-consult", + }, interruptOnSpeech: true, }); @@ -42,6 +55,19 @@ describe("talk normalization", () => { custom: true, }, }, + realtime: { + provider: "openai", + providers: { + openai: { + model: "gpt-realtime", + }, + }, + model: "gpt-realtime", + voice: "alloy", + mode: "realtime", + transport: "webrtc", + brain: "agent-consult", + }, interruptOnSpeech: true, 
}); }); diff --git a/src/config/talk.ts b/src/config/talk.ts index a47f13d7358..644b17d0a64 100644 --- a/src/config/talk.ts +++ b/src/config/talk.ts @@ -5,6 +5,7 @@ import type { TalkConfig, TalkConfigResponse, TalkProviderConfig, + TalkRealtimeConfig, } from "./types.gateway.js"; import type { OpenClawConfig } from "./types.openclaw.js"; import { coerceSecretRef } from "./types.secrets.js"; @@ -85,6 +86,50 @@ function normalizeTalkProviders(value: unknown): Record 0 ? providers : undefined; } +function normalizeTalkRealtimeConfig(value: unknown): TalkRealtimeConfig | undefined { + if (!isRecord(value)) { + return undefined; + } + const source = value; + const normalized: TalkRealtimeConfig = {}; + + const provider = normalizeOptionalString(source.provider); + if (provider) { + normalized.provider = provider; + } + const providers = normalizeTalkProviders(source.providers); + if (providers) { + normalized.providers = providers; + } + const model = normalizeOptionalString(source.model); + if (model) { + normalized.model = model; + } + const voice = normalizeOptionalString(source.voice); + if (voice) { + normalized.voice = voice; + } + if (source.mode === "realtime" || source.mode === "stt-tts" || source.mode === "transcription") { + normalized.mode = source.mode; + } + if ( + source.transport === "webrtc" || + source.transport === "provider-websocket" || + source.transport === "gateway-relay" || + source.transport === "managed-room" + ) { + normalized.transport = source.transport; + } + if ( + source.brain === "agent-consult" || + source.brain === "direct-tools" || + source.brain === "none" + ) { + normalized.brain = source.brain; + } + return Object.keys(normalized).length > 0 ? 
normalized : undefined; +} + function activeProviderFromTalk(talk: TalkConfig): string | undefined { const provider = normalizeOptionalString(talk.provider); const providers = talk.providers; @@ -118,10 +163,14 @@ export function normalizeTalkSection(value: TalkConfig | undefined): TalkConfig } const providers = normalizeTalkProviders(source.providers); + const realtime = normalizeTalkRealtimeConfig(source.realtime); const provider = normalizeOptionalString(source.provider); if (providers) { normalized.providers = providers; } + if (realtime) { + normalized.realtime = realtime; + } if (provider) { normalized.provider = provider; } @@ -182,6 +231,9 @@ export function buildTalkConfigResponse(value: unknown): TalkConfigResponse | un if (normalized?.providers && Object.keys(normalized.providers).length > 0) { payload.providers = normalized.providers; } + if (normalized?.realtime && Object.keys(normalized.realtime).length > 0) { + payload.realtime = normalized.realtime; + } const resolved = resolveActiveTalkProviderConfig(normalized) ?? diff --git a/src/config/types.gateway.ts b/src/config/types.gateway.ts index b0420387032..4afe9552945 100644 --- a/src/config/types.gateway.ts +++ b/src/config/types.gateway.ts @@ -55,6 +55,23 @@ export type TalkProviderConfig = { [key: string]: unknown; }; +export type TalkRealtimeConfig = { + /** Active realtime voice provider. */ + provider?: string; + /** Provider-specific realtime voice config keyed by provider id. */ + providers?: Record; + /** Provider model override for realtime sessions. */ + model?: string; + /** Provider voice override for realtime sessions. */ + voice?: string; + /** Realtime execution mode. */ + mode?: "realtime" | "stt-tts" | "transcription"; + /** Byte/session transport. */ + transport?: "webrtc" | "provider-websocket" | "gateway-relay" | "managed-room"; + /** Tool/agent strategy for realtime sessions. 
*/ + brain?: "agent-consult" | "direct-tools" | "none"; +}; + export type ResolvedTalkConfig = { /** Active Talk TTS provider resolved from the current config payload. */ provider: string; @@ -67,6 +84,8 @@ export type TalkConfig = { provider?: string; /** Provider-specific Talk config keyed by provider id. */ providers?: Record; + /** Realtime Talk provider, model, voice, mode, transport, and brain config. */ + realtime?: TalkRealtimeConfig; /** BCP 47 locale id used for Talk speech recognition on device nodes. */ speechLocale?: string; /** Stop speaking when user starts talking (default: true). */ diff --git a/src/config/zod-schema.ts b/src/config/zod-schema.ts index d3a61e86e12..9a848290b20 100644 --- a/src/config/zod-schema.ts +++ b/src/config/zod-schema.ts @@ -212,10 +212,44 @@ const TalkProviderEntrySchema = z }) .catchall(z.unknown()); +const TalkRealtimeSchema = z + .object({ + provider: z.string().optional(), + providers: z.record(z.string(), TalkProviderEntrySchema).optional(), + model: z.string().optional(), + voice: z.string().optional(), + mode: z.enum(["realtime", "stt-tts", "transcription"]).optional(), + transport: z.enum(["webrtc", "provider-websocket", "gateway-relay", "managed-room"]).optional(), + brain: z.enum(["agent-consult", "direct-tools", "none"]).optional(), + }) + .strict() + .superRefine((realtime, ctx) => { + const provider = normalizeLowercaseStringOrEmpty(realtime.provider ?? ""); + const providers = realtime.providers ? 
Object.keys(realtime.providers) : []; + + if (provider && providers.length > 0 && !(provider in realtime.providers!)) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + path: ["provider"], + message: `talk.realtime.provider must match a key in talk.realtime.providers (missing "${provider}")`, + }); + } + + if (!provider && providers.length > 1) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + path: ["provider"], + message: + "talk.realtime.provider is required when talk.realtime.providers defines multiple providers", + }); + } + }); + const TalkSchema = z .object({ provider: z.string().optional(), providers: z.record(z.string(), TalkProviderEntrySchema).optional(), + realtime: TalkRealtimeSchema.optional(), speechLocale: z.string().optional(), interruptOnSpeech: z.boolean().optional(), silenceTimeoutMs: z.number().int().positive().optional(), diff --git a/src/gateway/gateway-misc.test.ts b/src/gateway/gateway-misc.test.ts index 2a89aa6356d..d7563af9a49 100644 --- a/src/gateway/gateway-misc.test.ts +++ b/src/gateway/gateway-misc.test.ts @@ -322,7 +322,9 @@ describe("gateway broadcaster", () => { expect(readSocket.send).toHaveBeenCalledTimes(0); broadcastToConnIds("tick", { ts: 1 }, new Set(["c-read"])); - expect(readSocket.send).toHaveBeenCalledTimes(1); + broadcastToConnIds("talk.realtime.relay", { type: "ready" }, new Set(["c-read"])); + broadcastToConnIds("talk.transcription.relay", { type: "session.ready" }, new Set(["c-read"])); + expect(readSocket.send).toHaveBeenCalledTimes(3); expect(approvalsSocket.send).toHaveBeenCalledTimes(1); expect(pairingSocket.send).toHaveBeenCalledTimes(1); }); diff --git a/src/gateway/method-scopes.test.ts b/src/gateway/method-scopes.test.ts index c7a1fc041ba..e1d7be541e4 100644 --- a/src/gateway/method-scopes.test.ts +++ b/src/gateway/method-scopes.test.ts @@ -41,6 +41,11 @@ describe("method scope resolution", () => { ["diagnostics.stability", ["operator.read"]], ["node.pair.approve", ["operator.pairing"]], ["poll", 
["operator.write"]], + ["talk.session.create", ["operator.write"]], + ["talk.session.inputAudio", ["operator.write"]], + ["talk.session.control", ["operator.write"]], + ["talk.session.toolResult", ["operator.write"]], + ["talk.session.close", ["operator.write"]], ["update.status", ["operator.admin"]], ["config.patch", ["operator.admin"]], ["nativeHook.invoke", ["operator.admin"]], @@ -96,6 +101,24 @@ describe("operator scope authorization", () => { }); }); + it("allows operator.write clients to use unified Talk sessions", () => { + for (const method of [ + "talk.session.create", + "talk.session.inputAudio", + "talk.session.control", + "talk.session.toolResult", + "talk.session.close", + ]) { + expect(authorizeOperatorScopesForMethod(method, ["operator.write"])).toEqual({ + allowed: true, + }); + expect(authorizeOperatorScopesForMethod(method, ["operator.read"])).toEqual({ + allowed: false, + missingScope: "operator.write", + }); + } + }); + it("requires admin for browser.request", () => { setPluginGatewayMethodScope("browser.request", "operator.admin"); diff --git a/src/gateway/method-scopes.ts b/src/gateway/method-scopes.ts index 968d3f2189d..7135c4b0738 100644 --- a/src/gateway/method-scopes.ts +++ b/src/gateway/method-scopes.ts @@ -122,7 +122,9 @@ const METHOD_SCOPE_GROUPS: Record = { "chat.history", "config.get", "config.schema.lookup", + "talk.catalog", "talk.config", + "talk.handoff.join", "agents.files.list", "agents.files.get", "artifacts.list", @@ -137,11 +139,27 @@ const METHOD_SCOPE_GROUPS: Record = { "agent.wait", "wake", "talk.mode", + "talk.session.create", + "talk.session.inputAudio", + "talk.session.control", + "talk.session.toolResult", + "talk.session.close", + "talk.handoff.create", + "talk.handoff.revoke", + "talk.handoff.turnStart", + "talk.handoff.turnEnd", + "talk.handoff.turnCancel", "talk.realtime.session", + "talk.realtime.toolCall", "talk.realtime.relayAudio", + "talk.realtime.relayCancel", "talk.realtime.relayMark", 
"talk.realtime.relayStop", "talk.realtime.relayToolResult", + "talk.transcription.session", + "talk.transcription.relayAudio", + "talk.transcription.relayCancel", + "talk.transcription.relayStop", "talk.speak", "tts.enable", "tts.disable", diff --git a/src/gateway/protocol/index.test.ts b/src/gateway/protocol/index.test.ts index 34b325f5604..1f2a567a991 100644 --- a/src/gateway/protocol/index.test.ts +++ b/src/gateway/protocol/index.test.ts @@ -7,7 +7,21 @@ import { validateNodeEventResult, validateNodePresenceAlivePayload, validateTalkConfigResult, + validateTalkEvent, + validateTalkHandoffCreateParams, + validateTalkHandoffCreateResult, + validateTalkHandoffJoinResult, + validateTalkRealtimeRelayAudioParams, + validateTalkRealtimeRelayCancelParams, + validateTalkHandoffTurnCancelParams, + validateTalkHandoffTurnEndParams, + validateTalkHandoffTurnResult, + validateTalkHandoffTurnStartParams, validateTalkRealtimeSessionParams, + validateTalkRealtimeToolCallParams, + validateTalkTranscriptionRelayCancelParams, + validateTalkTranscriptionRelayAudioParams, + validateTalkTranscriptionSessionParams, validateWakeParams, } from "./index.js"; @@ -104,7 +118,7 @@ describe("validateTalkConfigResult", () => { ).toBe(true); }); - it("rejects normalized talk payloads without talk.resolved", () => { + it("accepts normalized talk payloads without resolved provider materialization", () => { expect( validateTalkConfigResult({ config: { @@ -118,18 +132,50 @@ describe("validateTalkConfigResult", () => { }, }, }), - ).toBe(false); + ).toBe(true); + }); + + it("accepts realtime Talk defaults without requiring a speech provider", () => { + expect( + validateTalkConfigResult({ + config: { + talk: { + realtime: { + provider: "openai", + providers: { + openai: { + apiKey: { + source: "env", + provider: "default", + id: "OPENAI_API_KEY", + }, + model: "gpt-realtime", + }, + }, + model: "gpt-realtime", + voice: "alloy", + mode: "realtime", + transport: "gateway-relay", + brain: 
"agent-consult", + }, + }, + }, + }), + ).toBe(true); }); }); describe("validateTalkRealtimeSessionParams", () => { - it("accepts provider, model, and voice overrides", () => { + it("accepts provider, model, voice, mode, transport, and brain overrides", () => { expect( validateTalkRealtimeSessionParams({ sessionKey: "agent:main:main", provider: "openai", model: "gpt-realtime-1.5", voice: "alloy", + mode: "realtime", + transport: "webrtc", + brain: "agent-consult", }), ).toBe(true); }); @@ -147,6 +193,294 @@ describe("validateTalkRealtimeSessionParams", () => { }); }); +describe("validateTalkEvent", () => { + it("pins the common Talk event envelope used by relay and surface adapters", () => { + expect( + validateTalkEvent({ + id: "talk-session:1", + type: "capture.started", + sessionId: "talk-session", + turnId: "turn-1", + captureId: "capture-1", + seq: 1, + timestamp: "2026-05-05T12:00:00.000Z", + mode: "stt-tts", + transport: "managed-room", + brain: "agent-consult", + provider: "openai", + final: false, + callId: "call-1", + itemId: "item-1", + parentId: "parent-1", + payload: { source: "ptt" }, + }), + ).toBe(true); + }); + + it("rejects stale or vendor-shaped event payloads without required correlation", () => { + expect( + validateTalkEvent({ + type: "output.audio.delta", + sessionId: "talk-session", + seq: 0, + timestamp: "2026-05-05T12:00:00.000Z", + mode: "realtime-duplex", + transport: "webrtc-sdp", + brain: "agent-consult", + payload: { byteLength: 12 }, + }), + ).toBe(false); + expect(formatValidationErrors(validateTalkEvent.errors)).toContain("must have required"); + }); + + it("requires turnId and captureId for scoped Talk events", () => { + expect( + validateTalkEvent({ + id: "talk-session:1", + type: "turn.started", + sessionId: "talk-session", + seq: 1, + timestamp: "2026-05-05T12:00:00.000Z", + mode: "stt-tts", + transport: "managed-room", + brain: "agent-consult", + payload: {}, + }), + ).toBe(false); + 
expect(formatValidationErrors(validateTalkEvent.errors)).toContain("must have required"); + + expect( + validateTalkEvent({ + id: "talk-session:2", + type: "capture.started", + sessionId: "talk-session", + turnId: "turn-1", + seq: 2, + timestamp: "2026-05-05T12:00:01.000Z", + mode: "stt-tts", + transport: "managed-room", + brain: "agent-consult", + payload: {}, + }), + ).toBe(false); + expect(formatValidationErrors(validateTalkEvent.errors)).toContain("must have required"); + }); +}); + +describe("validateTalkHandoff", () => { + it("accepts session-scoped provider, model, and voice selection", () => { + expect( + validateTalkHandoffCreateParams({ + sessionKey: "agent:main:main", + provider: "openai", + model: "gpt-realtime-1.5", + voice: "alloy", + mode: "realtime", + transport: "managed-room", + brain: "agent-consult", + }), + ).toBe(true); + expect( + validateTalkHandoffCreateResult({ + id: "handoff-1", + roomId: "talk_handoff-1", + roomUrl: "/talk/rooms/talk_handoff-1", + token: "token-1", + sessionKey: "agent:main:main", + provider: "openai", + model: "gpt-realtime-1.5", + voice: "alloy", + mode: "realtime", + transport: "managed-room", + brain: "agent-consult", + createdAt: 1, + expiresAt: 2, + room: { + recentTalkEvents: [ + { + id: "talk_handoff-1:1", + type: "session.started", + sessionId: "talk_handoff-1", + seq: 1, + timestamp: "2026-05-05T12:00:00.000Z", + mode: "realtime", + transport: "managed-room", + brain: "agent-consult", + payload: {}, + }, + ], + }, + }), + ).toBe(true); + expect( + validateTalkHandoffJoinResult({ + id: "handoff-1", + roomId: "talk_handoff-1", + roomUrl: "/talk/rooms/talk_handoff-1", + sessionKey: "agent:main:main", + provider: "openai", + model: "gpt-realtime-1.5", + voice: "alloy", + mode: "realtime", + transport: "managed-room", + brain: "agent-consult", + createdAt: 1, + expiresAt: 2, + room: { + activeClientId: "conn-1", + recentTalkEvents: [ + { + id: "talk_handoff-1:1", + type: "session.ready", + sessionId: 
"talk_handoff-1", + seq: 1, + timestamp: "2026-05-05T12:00:00.000Z", + mode: "realtime", + transport: "managed-room", + brain: "agent-consult", + payload: {}, + }, + ], + }, + }), + ).toBe(true); + }); + + it("rejects request-time instruction overrides", () => { + expect( + validateTalkHandoffCreateParams({ + sessionKey: "agent:main:main", + instructionsOverride: "Ignore configured policy.", + }), + ).toBe(false); + expect(formatValidationErrors(validateTalkHandoffCreateParams.errors)).toContain( + "unexpected property 'instructionsOverride'", + ); + }); + + it("accepts handoff turn lifecycle params and results", () => { + expect( + validateTalkHandoffTurnStartParams({ + id: "handoff-1", + token: "token-1", + turnId: "turn-1", + }), + ).toBe(true); + expect( + validateTalkHandoffTurnEndParams({ + id: "handoff-1", + token: "token-1", + }), + ).toBe(true); + expect( + validateTalkHandoffTurnCancelParams({ + id: "handoff-1", + token: "token-1", + reason: "barge-in", + }), + ).toBe(true); + expect( + validateTalkHandoffTurnResult({ + ok: true, + turnId: "turn-1", + events: [ + { + id: "talk_handoff-1:2", + type: "turn.started", + sessionId: "talk_handoff-1", + turnId: "turn-1", + seq: 2, + timestamp: "2026-05-05T12:00:00.000Z", + mode: "realtime", + transport: "managed-room", + brain: "agent-consult", + payload: {}, + }, + ], + record: { + id: "handoff-1", + roomId: "talk_handoff-1", + roomUrl: "/talk/rooms/talk_handoff-1", + sessionKey: "agent:main:main", + mode: "realtime", + transport: "managed-room", + brain: "agent-consult", + createdAt: 1, + expiresAt: 2, + room: { + activeClientId: "conn-1", + activeTurnId: "turn-1", + recentTalkEvents: [ + { + id: "talk_handoff-1:2", + type: "turn.started", + sessionId: "talk_handoff-1", + turnId: "turn-1", + seq: 2, + timestamp: "2026-05-05T12:00:00.000Z", + mode: "realtime", + transport: "managed-room", + brain: "agent-consult", + payload: {}, + }, + ], + }, + }, + }), + ).toBe(true); + }); +}); + 
+describe("validateTalkRealtimeToolCallParams", () => { + it("accepts optional relay session correlation", () => { + expect( + validateTalkRealtimeToolCallParams({ + sessionKey: "agent:main:main", + relaySessionId: "relay-1", + callId: "call-1", + name: "openclaw_agent_consult", + args: { question: "what now" }, + }), + ).toBe(true); + }); +}); + +describe("validateTalkRealtimeRelayParams", () => { + it("accepts relay audio and cancel params", () => { + expect( + validateTalkRealtimeRelayAudioParams({ + relaySessionId: "relay-1", + audioBase64: "aGVsbG8=", + timestamp: 123, + }), + ).toBe(true); + expect( + validateTalkRealtimeRelayCancelParams({ + relaySessionId: "relay-1", + reason: "barge-in", + }), + ).toBe(true); + }); +}); + +describe("validateTalkTranscriptionParams", () => { + it("accepts transcription session, relay audio, and cancel params", () => { + expect(validateTalkTranscriptionSessionParams({ provider: "openai" })).toBe(true); + expect( + validateTalkTranscriptionRelayAudioParams({ + transcriptionSessionId: "stt-1", + audioBase64: "aGVsbG8=", + }), + ).toBe(true); + expect( + validateTalkTranscriptionRelayCancelParams({ + transcriptionSessionId: "stt-1", + reason: "barge-in", + }), + ).toBe(true); + }); +}); + describe("validateWakeParams", () => { it("accepts valid wake params", () => { expect(validateWakeParams({ mode: "now", text: "hello" })).toBe(true); diff --git a/src/gateway/protocol/index.ts b/src/gateway/protocol/index.ts index a3cbd494b9c..9fd896b5771 100644 --- a/src/gateway/protocol/index.ts +++ b/src/gateway/protocol/index.ts @@ -61,12 +61,40 @@ import { ChannelsStopParamsSchema, type ChannelsLogoutParams, ChannelsLogoutParamsSchema, + type TalkEvent, + TalkEventSchema, + type TalkCatalogParams, + TalkCatalogParamsSchema, + type TalkCatalogResult, + TalkCatalogResultSchema, type TalkConfigParams, TalkConfigParamsSchema, type TalkConfigResult, TalkConfigResultSchema, + type TalkHandoffCreateParams, + TalkHandoffCreateParamsSchema, + type 
TalkHandoffCreateResult, + TalkHandoffCreateResultSchema, + type TalkHandoffJoinParams, + TalkHandoffJoinParamsSchema, + type TalkHandoffJoinResult, + TalkHandoffJoinResultSchema, + type TalkHandoffRevokeParams, + TalkHandoffRevokeParamsSchema, + type TalkHandoffRevokeResult, + TalkHandoffRevokeResultSchema, + type TalkHandoffTurnCancelParams, + TalkHandoffTurnCancelParamsSchema, + type TalkHandoffTurnEndParams, + TalkHandoffTurnEndParamsSchema, + type TalkHandoffTurnResult, + TalkHandoffTurnResultSchema, + type TalkHandoffTurnStartParams, + TalkHandoffTurnStartParamsSchema, type TalkRealtimeRelayAudioParams, TalkRealtimeRelayAudioParamsSchema, + type TalkRealtimeRelayCancelParams, + TalkRealtimeRelayCancelParamsSchema, type TalkRealtimeRelayMarkParams, TalkRealtimeRelayMarkParamsSchema, type TalkRealtimeRelayOkResult, @@ -79,6 +107,38 @@ import { TalkRealtimeSessionParamsSchema, type TalkRealtimeSessionResult, TalkRealtimeSessionResultSchema, + type TalkRealtimeToolCallParams, + TalkRealtimeToolCallParamsSchema, + type TalkRealtimeToolCallResult, + TalkRealtimeToolCallResultSchema, + type TalkSessionCloseParams, + TalkSessionCloseParamsSchema, + type TalkSessionControlParams, + TalkSessionControlParamsSchema, + type TalkSessionControlResult, + TalkSessionControlResultSchema, + type TalkSessionCreateParams, + TalkSessionCreateParamsSchema, + type TalkSessionCreateResult, + TalkSessionCreateResultSchema, + type TalkSessionInputAudioParams, + TalkSessionInputAudioParamsSchema, + type TalkSessionOkResult, + TalkSessionOkResultSchema, + type TalkSessionToolResultParams, + TalkSessionToolResultParamsSchema, + type TalkTranscriptionRelayAudioParams, + TalkTranscriptionRelayAudioParamsSchema, + type TalkTranscriptionRelayCancelParams, + TalkTranscriptionRelayCancelParamsSchema, + type TalkTranscriptionRelayOkResult, + TalkTranscriptionRelayOkResultSchema, + type TalkTranscriptionRelayStopParams, + TalkTranscriptionRelayStopParamsSchema, + type 
TalkTranscriptionSessionParams, + TalkTranscriptionSessionParamsSchema, + type TalkTranscriptionSessionResult, + TalkTranscriptionSessionResultSchema, type TalkSpeakParams, TalkSpeakParamsSchema, type TalkSpeakResult, @@ -532,17 +592,82 @@ export const validateWizardNextParams = ajv.compile(WizardNext export const validateWizardCancelParams = ajv.compile(WizardCancelParamsSchema); export const validateWizardStatusParams = ajv.compile(WizardStatusParamsSchema); export const validateTalkModeParams = ajv.compile(TalkModeParamsSchema); +export const validateTalkEvent = ajv.compile(TalkEventSchema); +export const validateTalkCatalogParams = ajv.compile(TalkCatalogParamsSchema); +export const validateTalkCatalogResult = ajv.compile(TalkCatalogResultSchema); export const validateTalkConfigParams = ajv.compile(TalkConfigParamsSchema); export const validateTalkConfigResult = ajv.compile(TalkConfigResultSchema); +export const validateTalkHandoffCreateParams = ajv.compile( + TalkHandoffCreateParamsSchema, +); +export const validateTalkHandoffCreateResult = ajv.compile( + TalkHandoffCreateResultSchema, +); +export const validateTalkHandoffJoinParams = ajv.compile( + TalkHandoffJoinParamsSchema, +); +export const validateTalkHandoffJoinResult = ajv.compile( + TalkHandoffJoinResultSchema, +); +export const validateTalkHandoffRevokeParams = ajv.compile( + TalkHandoffRevokeParamsSchema, +); +export const validateTalkHandoffRevokeResult = ajv.compile( + TalkHandoffRevokeResultSchema, +); +export const validateTalkHandoffTurnStartParams = ajv.compile( + TalkHandoffTurnStartParamsSchema, +); +export const validateTalkHandoffTurnEndParams = ajv.compile( + TalkHandoffTurnEndParamsSchema, +); +export const validateTalkHandoffTurnCancelParams = ajv.compile( + TalkHandoffTurnCancelParamsSchema, +); +export const validateTalkHandoffTurnResult = ajv.compile( + TalkHandoffTurnResultSchema, +); export const validateTalkRealtimeSessionParams = ajv.compile( TalkRealtimeSessionParamsSchema, ); 
export const validateTalkRealtimeSessionResult = ajv.compile( TalkRealtimeSessionResultSchema, ); +export const validateTalkRealtimeToolCallParams = ajv.compile( + TalkRealtimeToolCallParamsSchema, +); +export const validateTalkRealtimeToolCallResult = ajv.compile( + TalkRealtimeToolCallResultSchema, +); +export const validateTalkSessionCreateParams = ajv.compile( + TalkSessionCreateParamsSchema, +); +export const validateTalkSessionCreateResult = ajv.compile( + TalkSessionCreateResultSchema, +); +export const validateTalkSessionInputAudioParams = ajv.compile( + TalkSessionInputAudioParamsSchema, +); +export const validateTalkSessionControlParams = ajv.compile( + TalkSessionControlParamsSchema, +); +export const validateTalkSessionControlResult = ajv.compile( + TalkSessionControlResultSchema, +); +export const validateTalkSessionToolResultParams = ajv.compile( + TalkSessionToolResultParamsSchema, +); +export const validateTalkSessionCloseParams = ajv.compile( + TalkSessionCloseParamsSchema, +); +export const validateTalkSessionOkResult = + ajv.compile(TalkSessionOkResultSchema); export const validateTalkRealtimeRelayAudioParams = ajv.compile( TalkRealtimeRelayAudioParamsSchema, ); +export const validateTalkRealtimeRelayCancelParams = ajv.compile( + TalkRealtimeRelayCancelParamsSchema, +); export const validateTalkRealtimeRelayMarkParams = ajv.compile( TalkRealtimeRelayMarkParamsSchema, ); @@ -551,6 +676,21 @@ export const validateTalkRealtimeRelayStopParams = ajv.compile(TalkRealtimeRelayToolResultParamsSchema); +export const validateTalkTranscriptionSessionParams = ajv.compile( + TalkTranscriptionSessionParamsSchema, +); +export const validateTalkTranscriptionSessionResult = ajv.compile( + TalkTranscriptionSessionResultSchema, +); +export const validateTalkTranscriptionRelayAudioParams = + ajv.compile(TalkTranscriptionRelayAudioParamsSchema); +export const validateTalkTranscriptionRelayCancelParams = + ajv.compile(TalkTranscriptionRelayCancelParamsSchema); +export 
const validateTalkTranscriptionRelayStopParams = + ajv.compile(TalkTranscriptionRelayStopParamsSchema); +export const validateTalkTranscriptionRelayOkResult = ajv.compile( + TalkTranscriptionRelayOkResultSchema, +); export const validateTalkSpeakParams = ajv.compile(TalkSpeakParamsSchema); export const validateTalkSpeakResult = ajv.compile(TalkSpeakResultSchema); export const validateChannelsStatusParams = ajv.compile( @@ -765,15 +905,45 @@ export { WizardNextResultSchema, WizardStartResultSchema, WizardStatusResultSchema, + TalkEventSchema, + TalkCatalogParamsSchema, + TalkCatalogResultSchema, TalkConfigParamsSchema, TalkConfigResultSchema, + TalkHandoffCreateParamsSchema, + TalkHandoffCreateResultSchema, + TalkHandoffJoinParamsSchema, + TalkHandoffJoinResultSchema, + TalkHandoffRevokeParamsSchema, + TalkHandoffRevokeResultSchema, + TalkHandoffTurnStartParamsSchema, + TalkHandoffTurnEndParamsSchema, + TalkHandoffTurnCancelParamsSchema, + TalkHandoffTurnResultSchema, TalkRealtimeSessionParamsSchema, TalkRealtimeSessionResultSchema, + TalkRealtimeToolCallParamsSchema, + TalkRealtimeToolCallResultSchema, + TalkSessionCreateParamsSchema, + TalkSessionCreateResultSchema, + TalkSessionInputAudioParamsSchema, + TalkSessionControlParamsSchema, + TalkSessionControlResultSchema, + TalkSessionToolResultParamsSchema, + TalkSessionCloseParamsSchema, + TalkSessionOkResultSchema, TalkRealtimeRelayAudioParamsSchema, + TalkRealtimeRelayCancelParamsSchema, TalkRealtimeRelayMarkParamsSchema, TalkRealtimeRelayStopParamsSchema, TalkRealtimeRelayToolResultParamsSchema, TalkRealtimeRelayOkResultSchema, + TalkTranscriptionSessionParamsSchema, + TalkTranscriptionSessionResultSchema, + TalkTranscriptionRelayAudioParamsSchema, + TalkTranscriptionRelayCancelParamsSchema, + TalkTranscriptionRelayStopParamsSchema, + TalkTranscriptionRelayOkResultSchema, TalkSpeakParamsSchema, TalkSpeakResultSchema, ChannelsStatusParamsSchema, @@ -879,15 +1049,44 @@ export type { WizardNextResult, 
WizardStartResult, WizardStatusResult, + TalkCatalogParams, + TalkCatalogResult, TalkConfigParams, TalkConfigResult, + TalkHandoffCreateParams, + TalkHandoffCreateResult, + TalkHandoffJoinParams, + TalkHandoffJoinResult, + TalkHandoffRevokeParams, + TalkHandoffRevokeResult, + TalkHandoffTurnStartParams, + TalkHandoffTurnEndParams, + TalkHandoffTurnCancelParams, + TalkHandoffTurnResult, TalkRealtimeSessionParams, TalkRealtimeSessionResult, + TalkRealtimeToolCallParams, + TalkRealtimeToolCallResult, + TalkSessionCreateParams, + TalkSessionCreateResult, + TalkSessionInputAudioParams, + TalkSessionControlParams, + TalkSessionControlResult, + TalkSessionToolResultParams, + TalkSessionCloseParams, + TalkSessionOkResult, TalkRealtimeRelayAudioParams, + TalkRealtimeRelayCancelParams, TalkRealtimeRelayMarkParams, TalkRealtimeRelayStopParams, TalkRealtimeRelayToolResultParams, TalkRealtimeRelayOkResult, + TalkTranscriptionSessionParams, + TalkTranscriptionSessionResult, + TalkTranscriptionRelayAudioParams, + TalkTranscriptionRelayCancelParams, + TalkTranscriptionRelayStopParams, + TalkTranscriptionRelayOkResult, TalkSpeakParams, TalkSpeakResult, TalkModeParams, diff --git a/src/gateway/protocol/schema/channels.ts b/src/gateway/protocol/schema/channels.ts index 87d95f609c5..9094999a4e1 100644 --- a/src/gateway/protocol/schema/channels.ts +++ b/src/gateway/protocol/schema/channels.ts @@ -36,12 +36,408 @@ export const TalkSpeakParamsSchema = Type.Object( { additionalProperties: false }, ); +const TalkModeSchema = Type.Union([ + Type.Literal("realtime"), + Type.Literal("stt-tts"), + Type.Literal("transcription"), +]); + +const TalkTransportSchema = Type.Union([ + Type.Literal("webrtc"), + Type.Literal("provider-websocket"), + Type.Literal("gateway-relay"), + Type.Literal("managed-room"), +]); + +const TalkBrainSchema = Type.Union([ + Type.Literal("agent-consult"), + Type.Literal("direct-tools"), + Type.Literal("none"), +]); + +const TalkEventTypeSchema = Type.Union([ + 
Type.Literal("session.started"), + Type.Literal("session.ready"), + Type.Literal("session.closed"), + Type.Literal("session.error"), + Type.Literal("session.replaced"), + Type.Literal("turn.started"), + Type.Literal("turn.ended"), + Type.Literal("turn.cancelled"), + Type.Literal("capture.started"), + Type.Literal("capture.stopped"), + Type.Literal("capture.cancelled"), + Type.Literal("capture.once"), + Type.Literal("input.audio.delta"), + Type.Literal("input.audio.committed"), + Type.Literal("transcript.delta"), + Type.Literal("transcript.done"), + Type.Literal("output.text.delta"), + Type.Literal("output.text.done"), + Type.Literal("output.audio.started"), + Type.Literal("output.audio.delta"), + Type.Literal("output.audio.done"), + Type.Literal("tool.call"), + Type.Literal("tool.progress"), + Type.Literal("tool.result"), + Type.Literal("tool.error"), + Type.Literal("usage.metrics"), + Type.Literal("latency.metrics"), + Type.Literal("health.changed"), +]); + +const TURN_SCOPED_TALK_EVENT_TYPES = [ + "turn.started", + "turn.ended", + "turn.cancelled", + "input.audio.delta", + "input.audio.committed", + "transcript.delta", + "transcript.done", + "output.text.delta", + "output.text.done", + "output.audio.started", + "output.audio.delta", + "output.audio.done", + "tool.call", + "tool.progress", + "tool.result", + "tool.error", +]; + +const CAPTURE_SCOPED_TALK_EVENT_TYPES = [ + "capture.started", + "capture.stopped", + "capture.cancelled", + "capture.once", +]; + +function requireJsonSchemaProperties(properties: string[]): Record { + const conditionalRequirementKey = ["th", "en"].join(""); + return Object.fromEntries([[conditionalRequirementKey, { required: properties }]]); +} + +export const TalkEventSchema = Type.Object( + { + id: NonEmptyString, + type: TalkEventTypeSchema, + sessionId: NonEmptyString, + turnId: Type.Optional(Type.String()), + captureId: Type.Optional(Type.String()), + seq: Type.Integer({ minimum: 1 }), + timestamp: NonEmptyString, + mode: 
TalkModeSchema, + transport: TalkTransportSchema, + brain: TalkBrainSchema, + provider: Type.Optional(Type.String()), + final: Type.Optional(Type.Boolean()), + callId: Type.Optional(Type.String()), + itemId: Type.Optional(Type.String()), + parentId: Type.Optional(Type.String()), + payload: Type.Unknown(), + }, + { + additionalProperties: false, + allOf: [ + { + if: { + properties: { type: { enum: TURN_SCOPED_TALK_EVENT_TYPES } }, + required: ["type"], + }, + ...requireJsonSchemaProperties(["turnId"]), + }, + { + if: { + properties: { type: { enum: CAPTURE_SCOPED_TALK_EVENT_TYPES } }, + required: ["type"], + }, + ...requireJsonSchemaProperties(["captureId"]), + }, + ], + }, +); + export const TalkRealtimeSessionParamsSchema = Type.Object( { sessionKey: Type.Optional(Type.String()), provider: Type.Optional(Type.String()), model: Type.Optional(Type.String()), voice: Type.Optional(Type.String()), + mode: Type.Optional(TalkModeSchema), + transport: Type.Optional(TalkTransportSchema), + brain: Type.Optional(TalkBrainSchema), + }, + { additionalProperties: false }, +); + +export const TalkRealtimeToolCallParamsSchema = Type.Object( + { + sessionKey: NonEmptyString, + callId: NonEmptyString, + name: NonEmptyString, + args: Type.Optional(Type.Unknown()), + relaySessionId: Type.Optional(NonEmptyString), + }, + { additionalProperties: false }, +); + +export const TalkRealtimeToolCallResultSchema = Type.Object( + { + runId: NonEmptyString, + idempotencyKey: NonEmptyString, + }, + { additionalProperties: false }, +); + +export const TalkSessionCreateParamsSchema = Type.Object( + { + sessionKey: Type.Optional(Type.String()), + provider: Type.Optional(Type.String()), + model: Type.Optional(Type.String()), + voice: Type.Optional(Type.String()), + mode: Type.Optional(TalkModeSchema), + transport: Type.Optional(TalkTransportSchema), + brain: Type.Optional(TalkBrainSchema), + ttlMs: Type.Optional(Type.Integer({ minimum: 1000, maximum: 3600000 })), + }, + { additionalProperties: false 
}, +); + +export const TalkSessionInputAudioParamsSchema = Type.Object( + { + sessionId: NonEmptyString, + audioBase64: NonEmptyString, + timestamp: Type.Optional(Type.Number()), + }, + { additionalProperties: false }, +); + +export const TalkSessionControlParamsSchema = Type.Object( + { + sessionId: NonEmptyString, + type: Type.Union([ + Type.Literal("turn.start"), + Type.Literal("turn.end"), + Type.Literal("turn.cancel"), + ]), + turnId: Type.Optional(Type.String()), + reason: Type.Optional(Type.String()), + }, + { additionalProperties: false }, +); + +export const TalkSessionToolResultParamsSchema = Type.Object( + { + sessionId: NonEmptyString, + callId: NonEmptyString, + result: Type.Unknown(), + }, + { additionalProperties: false }, +); + +export const TalkSessionCloseParamsSchema = Type.Object( + { + sessionId: NonEmptyString, + }, + { additionalProperties: false }, +); + +export const TalkHandoffCreateParamsSchema = Type.Object( + { + sessionKey: NonEmptyString, + sessionId: Type.Optional(Type.String()), + channel: Type.Optional(Type.String()), + target: Type.Optional(Type.String()), + provider: Type.Optional(Type.String()), + model: Type.Optional(Type.String()), + voice: Type.Optional(Type.String()), + mode: Type.Optional(TalkModeSchema), + transport: Type.Optional(TalkTransportSchema), + brain: Type.Optional(TalkBrainSchema), + ttlMs: Type.Optional(Type.Integer({ minimum: 1000, maximum: 3600000 })), + }, + { additionalProperties: false }, +); + +const TalkHandoffRoomSchema = Type.Object( + { + activeClientId: Type.Optional(Type.String()), + activeTurnId: Type.Optional(Type.String()), + recentTalkEvents: Type.Array(TalkEventSchema), + }, + { additionalProperties: false }, +); + +export const TalkHandoffCreateResultSchema = Type.Object( + { + id: NonEmptyString, + roomId: NonEmptyString, + roomUrl: NonEmptyString, + token: NonEmptyString, + sessionKey: NonEmptyString, + sessionId: Type.Optional(Type.String()), + channel: Type.Optional(Type.String()), + 
target: Type.Optional(Type.String()), + provider: Type.Optional(Type.String()), + model: Type.Optional(Type.String()), + voice: Type.Optional(Type.String()), + mode: TalkModeSchema, + transport: TalkTransportSchema, + brain: TalkBrainSchema, + createdAt: Type.Number(), + expiresAt: Type.Number(), + room: TalkHandoffRoomSchema, + }, + { additionalProperties: false }, +); + +const TalkHandoffPublicRecordSchema = Type.Object( + { + id: NonEmptyString, + roomId: NonEmptyString, + roomUrl: NonEmptyString, + sessionKey: NonEmptyString, + sessionId: Type.Optional(Type.String()), + channel: Type.Optional(Type.String()), + target: Type.Optional(Type.String()), + provider: Type.Optional(Type.String()), + model: Type.Optional(Type.String()), + voice: Type.Optional(Type.String()), + mode: TalkModeSchema, + transport: TalkTransportSchema, + brain: TalkBrainSchema, + createdAt: Type.Number(), + expiresAt: Type.Number(), + room: TalkHandoffRoomSchema, + }, + { additionalProperties: false }, +); + +export const TalkHandoffJoinParamsSchema = Type.Object( + { + id: NonEmptyString, + token: NonEmptyString, + }, + { additionalProperties: false }, +); + +export const TalkHandoffJoinResultSchema = TalkHandoffPublicRecordSchema; + +export const TalkHandoffRevokeParamsSchema = Type.Object( + { + id: NonEmptyString, + }, + { additionalProperties: false }, +); + +export const TalkHandoffRevokeResultSchema = Type.Object( + { + ok: Type.Boolean(), + revoked: Type.Boolean(), + }, + { additionalProperties: false }, +); + +export const TalkHandoffTurnStartParamsSchema = Type.Object( + { + id: NonEmptyString, + token: NonEmptyString, + turnId: Type.Optional(Type.String()), + }, + { additionalProperties: false }, +); + +export const TalkHandoffTurnEndParamsSchema = Type.Object( + { + id: NonEmptyString, + token: NonEmptyString, + turnId: Type.Optional(Type.String()), + }, + { additionalProperties: false }, +); + +export const TalkHandoffTurnCancelParamsSchema = Type.Object( + { + id: 
NonEmptyString, + token: NonEmptyString, + turnId: Type.Optional(Type.String()), + reason: Type.Optional(Type.String()), + }, + { additionalProperties: false }, +); + +export const TalkHandoffTurnResultSchema = Type.Object( + { + ok: Type.Boolean(), + record: TalkHandoffPublicRecordSchema, + turnId: NonEmptyString, + events: Type.Array(TalkEventSchema), + }, + { additionalProperties: false }, +); + +export const TalkCatalogParamsSchema = Type.Object({}, { additionalProperties: false }); + +const TalkCatalogProviderSchema = Type.Object( + { + id: NonEmptyString, + label: NonEmptyString, + configured: Type.Boolean(), + models: Type.Optional(Type.Array(Type.String())), + voices: Type.Optional(Type.Array(Type.String())), + defaultModel: Type.Optional(Type.String()), + modes: Type.Optional(Type.Array(TalkModeSchema)), + transports: Type.Optional(Type.Array(TalkTransportSchema)), + brains: Type.Optional(Type.Array(TalkBrainSchema)), + inputAudioFormats: Type.Optional( + Type.Array( + Type.Object( + { + encoding: Type.Union([Type.Literal("pcm16"), Type.Literal("g711_ulaw")]), + sampleRateHz: Type.Integer({ minimum: 1 }), + channels: Type.Integer({ minimum: 1 }), + }, + { additionalProperties: false }, + ), + ), + ), + outputAudioFormats: Type.Optional( + Type.Array( + Type.Object( + { + encoding: Type.Union([Type.Literal("pcm16"), Type.Literal("g711_ulaw")]), + sampleRateHz: Type.Integer({ minimum: 1 }), + channels: Type.Integer({ minimum: 1 }), + }, + { additionalProperties: false }, + ), + ), + ), + supportsBrowserSession: Type.Optional(Type.Boolean()), + supportsBargeIn: Type.Optional(Type.Boolean()), + supportsToolCalls: Type.Optional(Type.Boolean()), + supportsVideoFrames: Type.Optional(Type.Boolean()), + supportsSessionResumption: Type.Optional(Type.Boolean()), + }, + { additionalProperties: false }, +); + +const TalkCatalogProviderGroupSchema = Type.Object( + { + activeProvider: Type.Optional(Type.String()), + providers: Type.Array(TalkCatalogProviderSchema), + }, 
+ { additionalProperties: false }, +); + +export const TalkCatalogResultSchema = Type.Object( + { + modes: Type.Array(TalkModeSchema), + transports: Type.Array(TalkTransportSchema), + brains: Type.Array(TalkBrainSchema), + speech: TalkCatalogProviderGroupSchema, + transcription: TalkCatalogProviderGroupSchema, + realtime: TalkCatalogProviderGroupSchema, }, { additionalProperties: false }, ); @@ -70,6 +466,14 @@ export const TalkRealtimeRelayStopParamsSchema = Type.Object( { additionalProperties: false }, ); +export const TalkRealtimeRelayCancelParamsSchema = Type.Object( + { + relaySessionId: NonEmptyString, + reason: Type.Optional(Type.String()), + }, + { additionalProperties: false }, +); + export const TalkRealtimeRelayToolResultParamsSchema = Type.Object( { relaySessionId: NonEmptyString, @@ -86,6 +490,61 @@ export const TalkRealtimeRelayOkResultSchema = Type.Object( { additionalProperties: false }, ); +export const TalkTranscriptionSessionParamsSchema = Type.Object( + { + provider: Type.Optional(Type.String()), + }, + { additionalProperties: false }, +); + +export const TalkTranscriptionSessionResultSchema = Type.Object( + { + provider: NonEmptyString, + mode: Type.Literal("transcription"), + transport: Type.Literal("gateway-relay"), + transcriptionSessionId: NonEmptyString, + audio: Type.Object( + { + inputEncoding: Type.Literal("pcm16"), + inputSampleRateHz: Type.Integer({ minimum: 1 }), + }, + { additionalProperties: false }, + ), + expiresAt: Type.Number(), + }, + { additionalProperties: false }, +); + +export const TalkTranscriptionRelayAudioParamsSchema = Type.Object( + { + transcriptionSessionId: NonEmptyString, + audioBase64: NonEmptyString, + }, + { additionalProperties: false }, +); + +export const TalkTranscriptionRelayStopParamsSchema = Type.Object( + { + transcriptionSessionId: NonEmptyString, + }, + { additionalProperties: false }, +); + +export const TalkTranscriptionRelayCancelParamsSchema = Type.Object( + { + transcriptionSessionId: 
NonEmptyString, + reason: Type.Optional(Type.String()), + }, + { additionalProperties: false }, +); + +export const TalkTranscriptionRelayOkResultSchema = Type.Object( + { + ok: Type.Boolean(), + }, + { additionalProperties: false }, +); + const BrowserRealtimeAudioContractSchema = Type.Object( { inputEncoding: Type.Union([Type.Literal("pcm16"), Type.Literal("g711_ulaw")]), @@ -96,10 +555,47 @@ const BrowserRealtimeAudioContractSchema = Type.Object( { additionalProperties: false }, ); +export const TalkSessionCreateResultSchema = Type.Object( + { + sessionId: NonEmptyString, + provider: Type.Optional(Type.String()), + mode: TalkModeSchema, + transport: TalkTransportSchema, + brain: TalkBrainSchema, + relaySessionId: Type.Optional(NonEmptyString), + transcriptionSessionId: Type.Optional(NonEmptyString), + handoffId: Type.Optional(NonEmptyString), + roomId: Type.Optional(NonEmptyString), + roomUrl: Type.Optional(NonEmptyString), + token: Type.Optional(NonEmptyString), + audio: Type.Optional(Type.Unknown()), + model: Type.Optional(Type.String()), + voice: Type.Optional(Type.String()), + expiresAt: Type.Optional(Type.Number()), + }, + { additionalProperties: false }, +); + +export const TalkSessionControlResultSchema = Type.Object( + { + ok: Type.Boolean(), + turnId: Type.Optional(Type.String()), + events: Type.Optional(Type.Array(TalkEventSchema)), + }, + { additionalProperties: false }, +); + +export const TalkSessionOkResultSchema = Type.Object( + { + ok: Type.Boolean(), + }, + { additionalProperties: false }, +); + const BrowserRealtimeWebRtcSdpSessionSchema = Type.Object( { provider: NonEmptyString, - transport: Type.Optional(Type.Literal("webrtc-sdp")), + transport: Type.Literal("webrtc"), clientSecret: NonEmptyString, offerUrl: Type.Optional(Type.String()), offerHeaders: Type.Optional(Type.Record(Type.String(), Type.String())), @@ -113,7 +609,7 @@ const BrowserRealtimeWebRtcSdpSessionSchema = Type.Object( const BrowserRealtimeJsonPcmWebSocketSessionSchema = 
Type.Object( { provider: NonEmptyString, - transport: Type.Literal("json-pcm-websocket"), + transport: Type.Literal("provider-websocket"), protocol: NonEmptyString, clientSecret: NonEmptyString, websocketUrl: NonEmptyString, @@ -167,6 +663,19 @@ const TalkProviderConfigSchema = Type.Object(talkProviderFieldSchemas, { additionalProperties: true, }); +const TalkRealtimeConfigSchema = Type.Object( + { + provider: Type.Optional(Type.String()), + providers: Type.Optional(Type.Record(Type.String(), TalkProviderConfigSchema)), + model: Type.Optional(Type.String()), + voice: Type.Optional(Type.String()), + mode: Type.Optional(TalkModeSchema), + transport: Type.Optional(TalkTransportSchema), + brain: Type.Optional(TalkBrainSchema), + }, + { additionalProperties: false }, +); + const ResolvedTalkConfigSchema = Type.Object( { provider: Type.String(), @@ -179,7 +688,8 @@ const TalkConfigSchema = Type.Object( { provider: Type.Optional(Type.String()), providers: Type.Optional(Type.Record(Type.String(), TalkProviderConfigSchema)), - resolved: ResolvedTalkConfigSchema, + realtime: Type.Optional(TalkRealtimeConfigSchema), + resolved: Type.Optional(ResolvedTalkConfigSchema), speechLocale: Type.Optional(Type.String()), interruptOnSpeech: Type.Optional(Type.Boolean()), silenceTimeoutMs: Type.Optional(Type.Integer({ minimum: 1 })), diff --git a/src/gateway/protocol/schema/protocol-schemas.ts b/src/gateway/protocol/schema/protocol-schemas.ts index 94c1a60bb12..d2191c5499e 100644 --- a/src/gateway/protocol/schema/protocol-schemas.ts +++ b/src/gateway/protocol/schema/protocol-schemas.ts @@ -65,15 +65,45 @@ import { ChannelsStartParamsSchema, ChannelsStopParamsSchema, ChannelsLogoutParamsSchema, + TalkEventSchema, + TalkCatalogParamsSchema, + TalkCatalogResultSchema, TalkConfigParamsSchema, TalkConfigResultSchema, + TalkHandoffCreateParamsSchema, + TalkHandoffCreateResultSchema, + TalkHandoffJoinParamsSchema, + TalkHandoffJoinResultSchema, + TalkHandoffRevokeParamsSchema, + 
TalkHandoffRevokeResultSchema, + TalkHandoffTurnCancelParamsSchema, + TalkHandoffTurnEndParamsSchema, + TalkHandoffTurnResultSchema, + TalkHandoffTurnStartParamsSchema, TalkRealtimeRelayAudioParamsSchema, + TalkRealtimeRelayCancelParamsSchema, TalkRealtimeRelayMarkParamsSchema, TalkRealtimeRelayOkResultSchema, TalkRealtimeRelayStopParamsSchema, TalkRealtimeRelayToolResultParamsSchema, TalkRealtimeSessionParamsSchema, TalkRealtimeSessionResultSchema, + TalkRealtimeToolCallParamsSchema, + TalkRealtimeToolCallResultSchema, + TalkSessionCloseParamsSchema, + TalkSessionControlParamsSchema, + TalkSessionControlResultSchema, + TalkSessionCreateParamsSchema, + TalkSessionCreateResultSchema, + TalkSessionInputAudioParamsSchema, + TalkSessionOkResultSchema, + TalkSessionToolResultParamsSchema, + TalkTranscriptionRelayAudioParamsSchema, + TalkTranscriptionRelayCancelParamsSchema, + TalkTranscriptionRelayOkResultSchema, + TalkTranscriptionRelayStopParamsSchema, + TalkTranscriptionSessionParamsSchema, + TalkTranscriptionSessionResultSchema, TalkSpeakParamsSchema, TalkSpeakResultSchema, ChannelsStatusParamsSchema, @@ -333,15 +363,45 @@ export const ProtocolSchemas = { WizardStartResult: WizardStartResultSchema, WizardStatusResult: WizardStatusResultSchema, TalkModeParams: TalkModeParamsSchema, + TalkEvent: TalkEventSchema, + TalkCatalogParams: TalkCatalogParamsSchema, + TalkCatalogResult: TalkCatalogResultSchema, TalkConfigParams: TalkConfigParamsSchema, TalkConfigResult: TalkConfigResultSchema, + TalkHandoffCreateParams: TalkHandoffCreateParamsSchema, + TalkHandoffCreateResult: TalkHandoffCreateResultSchema, + TalkHandoffJoinParams: TalkHandoffJoinParamsSchema, + TalkHandoffJoinResult: TalkHandoffJoinResultSchema, + TalkHandoffRevokeParams: TalkHandoffRevokeParamsSchema, + TalkHandoffRevokeResult: TalkHandoffRevokeResultSchema, + TalkHandoffTurnStartParams: TalkHandoffTurnStartParamsSchema, + TalkHandoffTurnEndParams: TalkHandoffTurnEndParamsSchema, + 
TalkHandoffTurnCancelParams: TalkHandoffTurnCancelParamsSchema, + TalkHandoffTurnResult: TalkHandoffTurnResultSchema, TalkRealtimeSessionParams: TalkRealtimeSessionParamsSchema, TalkRealtimeSessionResult: TalkRealtimeSessionResultSchema, TalkRealtimeRelayAudioParams: TalkRealtimeRelayAudioParamsSchema, + TalkRealtimeRelayCancelParams: TalkRealtimeRelayCancelParamsSchema, TalkRealtimeRelayMarkParams: TalkRealtimeRelayMarkParamsSchema, TalkRealtimeRelayStopParams: TalkRealtimeRelayStopParamsSchema, TalkRealtimeRelayToolResultParams: TalkRealtimeRelayToolResultParamsSchema, TalkRealtimeRelayOkResult: TalkRealtimeRelayOkResultSchema, + TalkRealtimeToolCallParams: TalkRealtimeToolCallParamsSchema, + TalkRealtimeToolCallResult: TalkRealtimeToolCallResultSchema, + TalkSessionCreateParams: TalkSessionCreateParamsSchema, + TalkSessionCreateResult: TalkSessionCreateResultSchema, + TalkSessionInputAudioParams: TalkSessionInputAudioParamsSchema, + TalkSessionControlParams: TalkSessionControlParamsSchema, + TalkSessionControlResult: TalkSessionControlResultSchema, + TalkSessionToolResultParams: TalkSessionToolResultParamsSchema, + TalkSessionCloseParams: TalkSessionCloseParamsSchema, + TalkSessionOkResult: TalkSessionOkResultSchema, + TalkTranscriptionSessionParams: TalkTranscriptionSessionParamsSchema, + TalkTranscriptionSessionResult: TalkTranscriptionSessionResultSchema, + TalkTranscriptionRelayAudioParams: TalkTranscriptionRelayAudioParamsSchema, + TalkTranscriptionRelayCancelParams: TalkTranscriptionRelayCancelParamsSchema, + TalkTranscriptionRelayStopParams: TalkTranscriptionRelayStopParamsSchema, + TalkTranscriptionRelayOkResult: TalkTranscriptionRelayOkResultSchema, TalkSpeakParams: TalkSpeakParamsSchema, TalkSpeakResult: TalkSpeakResultSchema, ChannelsStatusParams: ChannelsStatusParamsSchema, diff --git a/src/gateway/protocol/schema/types.ts b/src/gateway/protocol/schema/types.ts index 9062f4c98f5..e577dc0e824 100644 --- a/src/gateway/protocol/schema/types.ts +++ 
b/src/gateway/protocol/schema/types.ts @@ -92,16 +92,46 @@ export type WizardStep = SchemaType<"WizardStep">; export type WizardNextResult = SchemaType<"WizardNextResult">; export type WizardStartResult = SchemaType<"WizardStartResult">; export type WizardStatusResult = SchemaType<"WizardStatusResult">; +export type TalkEvent = SchemaType<"TalkEvent">; export type TalkModeParams = SchemaType<"TalkModeParams">; +export type TalkCatalogParams = SchemaType<"TalkCatalogParams">; +export type TalkCatalogResult = SchemaType<"TalkCatalogResult">; export type TalkConfigParams = SchemaType<"TalkConfigParams">; export type TalkConfigResult = SchemaType<"TalkConfigResult">; +export type TalkHandoffCreateParams = SchemaType<"TalkHandoffCreateParams">; +export type TalkHandoffCreateResult = SchemaType<"TalkHandoffCreateResult">; +export type TalkHandoffJoinParams = SchemaType<"TalkHandoffJoinParams">; +export type TalkHandoffJoinResult = SchemaType<"TalkHandoffJoinResult">; +export type TalkHandoffRevokeParams = SchemaType<"TalkHandoffRevokeParams">; +export type TalkHandoffRevokeResult = SchemaType<"TalkHandoffRevokeResult">; +export type TalkHandoffTurnStartParams = SchemaType<"TalkHandoffTurnStartParams">; +export type TalkHandoffTurnEndParams = SchemaType<"TalkHandoffTurnEndParams">; +export type TalkHandoffTurnCancelParams = SchemaType<"TalkHandoffTurnCancelParams">; +export type TalkHandoffTurnResult = SchemaType<"TalkHandoffTurnResult">; export type TalkRealtimeSessionParams = SchemaType<"TalkRealtimeSessionParams">; export type TalkRealtimeSessionResult = SchemaType<"TalkRealtimeSessionResult">; export type TalkRealtimeRelayAudioParams = SchemaType<"TalkRealtimeRelayAudioParams">; +export type TalkRealtimeRelayCancelParams = SchemaType<"TalkRealtimeRelayCancelParams">; export type TalkRealtimeRelayMarkParams = SchemaType<"TalkRealtimeRelayMarkParams">; export type TalkRealtimeRelayStopParams = SchemaType<"TalkRealtimeRelayStopParams">; export type 
TalkRealtimeRelayToolResultParams = SchemaType<"TalkRealtimeRelayToolResultParams">; export type TalkRealtimeRelayOkResult = SchemaType<"TalkRealtimeRelayOkResult">; +export type TalkRealtimeToolCallParams = SchemaType<"TalkRealtimeToolCallParams">; +export type TalkRealtimeToolCallResult = SchemaType<"TalkRealtimeToolCallResult">; +export type TalkSessionCreateParams = SchemaType<"TalkSessionCreateParams">; +export type TalkSessionCreateResult = SchemaType<"TalkSessionCreateResult">; +export type TalkSessionInputAudioParams = SchemaType<"TalkSessionInputAudioParams">; +export type TalkSessionControlParams = SchemaType<"TalkSessionControlParams">; +export type TalkSessionControlResult = SchemaType<"TalkSessionControlResult">; +export type TalkSessionToolResultParams = SchemaType<"TalkSessionToolResultParams">; +export type TalkSessionCloseParams = SchemaType<"TalkSessionCloseParams">; +export type TalkSessionOkResult = SchemaType<"TalkSessionOkResult">; +export type TalkTranscriptionSessionParams = SchemaType<"TalkTranscriptionSessionParams">; +export type TalkTranscriptionSessionResult = SchemaType<"TalkTranscriptionSessionResult">; +export type TalkTranscriptionRelayAudioParams = SchemaType<"TalkTranscriptionRelayAudioParams">; +export type TalkTranscriptionRelayCancelParams = SchemaType<"TalkTranscriptionRelayCancelParams">; +export type TalkTranscriptionRelayStopParams = SchemaType<"TalkTranscriptionRelayStopParams">; +export type TalkTranscriptionRelayOkResult = SchemaType<"TalkTranscriptionRelayOkResult">; export type TalkSpeakParams = SchemaType<"TalkSpeakParams">; export type TalkSpeakResult = SchemaType<"TalkSpeakResult">; export type ChannelsStatusParams = SchemaType<"ChannelsStatusParams">; diff --git a/src/gateway/server-broadcast.ts b/src/gateway/server-broadcast.ts index 8e0c5a2a325..861e8687ab4 100644 --- a/src/gateway/server-broadcast.ts +++ b/src/gateway/server-broadcast.ts @@ -32,6 +32,9 @@ const EVENT_SCOPE_GUARDS: Record = { presence: [], 
shutdown: [], tick: [], + "talk.event": [READ_SCOPE], + "talk.realtime.relay": [READ_SCOPE], + "talk.transcription.relay": [READ_SCOPE], "talk.mode": [WRITE_SCOPE], "update.available": [], "voicewake.changed": [READ_SCOPE], diff --git a/src/gateway/server-methods-list.test.ts b/src/gateway/server-methods-list.test.ts new file mode 100644 index 00000000000..b8955479efc --- /dev/null +++ b/src/gateway/server-methods-list.test.ts @@ -0,0 +1,24 @@ +import { describe, expect, it } from "vitest"; +import { GATEWAY_EVENTS, listGatewayMethods } from "./server-methods-list.js"; + +describe("GATEWAY_EVENTS", () => { + it("advertises Talk event streams in hello features", () => { + expect(GATEWAY_EVENTS).toEqual( + expect.arrayContaining(["talk.event", "talk.realtime.relay", "talk.transcription.relay"]), + ); + }); +}); + +describe("listGatewayMethods", () => { + it("advertises the versioned Talk session RPCs", () => { + expect(listGatewayMethods()).toEqual( + expect.arrayContaining([ + "talk.session.create", + "talk.session.inputAudio", + "talk.session.control", + "talk.session.toolResult", + "talk.session.close", + ]), + ); + }); +}); diff --git a/src/gateway/server-methods-list.ts b/src/gateway/server-methods-list.ts index eb23232aa0d..0efa346f36e 100644 --- a/src/gateway/server-methods-list.ts +++ b/src/gateway/server-methods-list.ts @@ -56,12 +56,30 @@ const BASE_METHODS = [ "wizard.next", "wizard.cancel", "wizard.status", + "talk.catalog", "talk.config", + "talk.session.create", + "talk.session.inputAudio", + "talk.session.control", + "talk.session.toolResult", + "talk.session.close", + "talk.handoff.create", + "talk.handoff.join", + "talk.handoff.revoke", + "talk.handoff.turnStart", + "talk.handoff.turnEnd", + "talk.handoff.turnCancel", "talk.realtime.session", + "talk.realtime.toolCall", "talk.realtime.relayAudio", + "talk.realtime.relayCancel", "talk.realtime.relayMark", "talk.realtime.relayStop", "talk.realtime.relayToolResult", + "talk.transcription.session", + 
"talk.transcription.relayAudio", + "talk.transcription.relayCancel", + "talk.transcription.relayStop", "talk.speak", "talk.mode", "commands.list", @@ -182,6 +200,9 @@ export const GATEWAY_EVENTS = [ "presence", "tick", "talk.mode", + "talk.event", + "talk.realtime.relay", + "talk.transcription.relay", "shutdown", "health", "heartbeat", diff --git a/src/gateway/server-methods/shared-types.ts b/src/gateway/server-methods/shared-types.ts index 1bf326f3ca7..59d754b7543 100644 --- a/src/gateway/server-methods/shared-types.ts +++ b/src/gateway/server-methods/shared-types.ts @@ -62,7 +62,7 @@ export type GatewayRequestContext = { nodeSubscribe: (nodeId: string, sessionKey: string) => void; nodeUnsubscribe: (nodeId: string, sessionKey: string) => void; nodeUnsubscribeAll: (nodeId: string) => void; - hasConnectedMobileNode: () => boolean; + hasConnectedTalkNode: () => boolean; hasExecApprovalClients?: (excludeConnId?: string) => boolean; disconnectClientsForDevice?: (deviceId: string, opts?: { role?: string }) => void; disconnectClientsUsingSharedGatewayAuth?: () => void; diff --git a/src/gateway/server-methods/talk-session.ts b/src/gateway/server-methods/talk-session.ts new file mode 100644 index 00000000000..0284bafe04f --- /dev/null +++ b/src/gateway/server-methods/talk-session.ts @@ -0,0 +1,497 @@ +import { REALTIME_VOICE_AGENT_CONSULT_TOOL } from "../../realtime-voice/agent-consult-tool.js"; +import { resolveConfiguredRealtimeVoiceProvider } from "../../realtime-voice/provider-resolver.js"; +import type { TalkBrain, TalkMode, TalkTransport } from "../../realtime-voice/talk-events.js"; +import { + normalizeOptionalLowercaseString, + normalizeOptionalString, +} from "../../shared/string-coerce.js"; +import { ADMIN_SCOPE } from "../operator-scopes.js"; +import { + ErrorCodes, + errorShape, + formatValidationErrors, + validateTalkSessionCloseParams, + validateTalkSessionControlParams, + validateTalkSessionCreateParams, + validateTalkSessionInputAudioParams, + 
validateTalkSessionToolResultParams, +} from "../protocol/index.js"; +import { resolveSessionKeyFromResolveParams } from "../sessions-resolve.js"; +import { + cancelTalkHandoffTurn, + createTalkHandoff, + endTalkHandoffTurn, + revokeTalkHandoff, + startTalkHandoffTurn, +} from "../talk-handoff.js"; +import { + cancelTalkRealtimeRelayTurn, + createTalkRealtimeRelaySession, + sendTalkRealtimeRelayAudio, + stopTalkRealtimeRelaySession, + submitTalkRealtimeRelayToolResult, +} from "../talk-realtime-relay.js"; +import { + forgetUnifiedTalkSession, + getUnifiedTalkSession, + rememberUnifiedTalkSession, + requireUnifiedTalkSessionConn, +} from "../talk-session-registry.js"; +import { + cancelTalkTranscriptionRelayTurn, + createTalkTranscriptionRelaySession, + sendTalkTranscriptionRelayAudio, + stopTalkTranscriptionRelaySession, +} from "../talk-transcription-relay.js"; +import { formatForLog } from "../ws-log.js"; +import { + broadcastTalkRoomEvents, + buildRealtimeInstructions, + buildTalkRealtimeConfig, + buildTalkTranscriptionConfig, + canUseTalkDirectTools, + resolveConfiguredRealtimeTranscriptionProvider, + talkHandoffErrorCode, + withRealtimeBrowserOverrides, +} from "./talk-shared.js"; +import type { GatewayRequestHandlers } from "./types.js"; + +/** Resolves the Talk mode of a request. An explicit mode wins; otherwise a "managed-room" transport implies "stt-tts" and anything else implies "realtime". The value is cast to TalkMode without a membership check here (presumably the params validator restricts it; confirm). */ function normalizeTalkSessionMode(params: { mode?: string; transport?: string }): TalkMode { + const mode = normalizeOptionalLowercaseString(params.mode) as TalkMode | undefined; + if (mode) { + return mode; + } + return normalizeOptionalLowercaseString(params.transport) === "managed-room" + ? "stt-tts" + : "realtime"; +} + +/** Resolves the Talk transport. An explicit transport wins; otherwise "stt-tts" mode defaults to "managed-room" and every other mode to "gateway-relay". */ function normalizeTalkSessionTransport(params: { + mode: TalkMode; + transport?: string; +}): TalkTransport { + const transport = normalizeOptionalLowercaseString(params.transport) as TalkTransport | undefined; + if (transport) { + return transport; + } + return params.mode === "stt-tts" ?
"managed-room" : "gateway-relay"; +} + +/** Resolves the Talk brain. An explicit brain wins; otherwise "transcription" mode defaults to "none" and every other mode to "agent-consult". */ function normalizeTalkSessionBrain(params: { mode: TalkMode; brain?: string }): TalkBrain { + const brain = normalizeOptionalLowercaseString(params.brain) as TalkBrain | undefined; + if (brain) { + return brain; + } + return params.mode === "transcription" ? "none" : "agent-consult"; +} + +/** Gateway request handlers for the talk.session.* RPCs (create, inputAudio, control, toolResult, close). Each one validates its params first, answers through respond(ok, payload, error), and reports errors thrown after validation as UNAVAILABLE. */ export const talkSessionHandlers: GatewayRequestHandlers = { + /* talk.session.create: fills in mode/transport/brain defaults, then creates a managed-room handoff, a realtime relay session, or a transcription relay session. */ "talk.session.create": async ({ params, respond, context, client }) => { + if (!validateTalkSessionCreateParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.session.create params: ${formatValidationErrors(validateTalkSessionCreateParams.errors)}`, + ), + ); + return; + } + + const mode = normalizeTalkSessionMode(params); + const transport = normalizeTalkSessionTransport({ mode, transport: params.transport }); + const brain = normalizeTalkSessionBrain({ mode, brain: params.brain }); + + /* Browser-side transports are rejected here; the error message points callers at talk.realtime.session. */ if (transport === "webrtc" || transport === "provider-websocket") { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `talk.session.create is Gateway-managed; use talk.realtime.session for browser transport "${transport}"`, + ), + ); + return; + } + + try { + /* Managed-room path: it returns before the connId check further down, so it does not read client.connId. */ if (transport === "managed-room") { + /* direct-tools is only allowed for clients that pass canUseTalkDirectTools (ADMIN_SCOPE per the error message). */ if (brain === "direct-tools" && !canUseTalkDirectTools(client)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `talk.session.create brain="direct-tools" requires gateway scope: ${ADMIN_SCOPE}`, + ), + ); + return; + } + const resolvedSession = await resolveSessionKeyFromResolveParams({ + cfg: context.getRuntimeConfig(), + p: { + key: params.sessionKey, + includeGlobal: true, + includeUnknown: true, + }, + }); + /* A failed resolution is forwarded to the caller with the resolver's own error object. */ if (!resolvedSession.ok) { + respond(false, undefined, resolvedSession.error); + return; + } + const handoff = createTalkHandoff({ + sessionKey: resolvedSession.key, + provider: normalizeOptionalString(params.provider), + model: normalizeOptionalString(params.model), + voice:
normalizeOptionalString(params.voice), + mode, + transport, + brain, + ttlMs: params.ttlMs, + }); + /* Registered under the handoff id, which is also returned as sessionId below; the stored token is what talk.session.control later hands to the handoff turn helpers. */ rememberUnifiedTalkSession(handoff.id, { + kind: "managed-room", + handoffId: handoff.id, + token: handoff.token, + roomId: handoff.roomId, + }); + respond( + true, + { + sessionId: handoff.id, + provider: handoff.provider, + mode: handoff.mode, + transport: handoff.transport, + brain: handoff.brain, + handoffId: handoff.id, + roomId: handoff.roomId, + roomUrl: handoff.roomUrl, + token: handoff.token, + model: handoff.model, + voice: handoff.voice, + expiresAt: handoff.expiresAt, + }, + undefined, + ); + return; + } + + /* Relay sessions are created for, and registered with, the calling connection; without a connId the request fails as UNAVAILABLE. */ const connId = client?.connId; + if (!connId) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, "Talk session unavailable")); + return; + } + + if (mode === "realtime") { + /* Realtime sessions accept exactly one combination: gateway-relay transport with the agent-consult brain. */ if (transport !== "gateway-relay" || brain !== "agent-consult") { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `realtime talk.session.create requires transport="gateway-relay" and brain="agent-consult"`, + ), + ); + return; + } + const runtimeConfig = context.getRuntimeConfig(); + const realtimeConfig = buildTalkRealtimeConfig(runtimeConfig, params.provider); + const resolution = resolveConfiguredRealtimeVoiceProvider({ + configuredProviderId: realtimeConfig.provider, + providerConfigs: realtimeConfig.providers, + cfg: runtimeConfig, + cfgForResolve: runtimeConfig, + noRegisteredProviderMessage: "No realtime voice provider registered", + }); + /* Request-level model and voice take precedence over the values in realtimeConfig. */ const model = normalizeOptionalString(params.model) ?? realtimeConfig.model; + const voice = normalizeOptionalString(params.voice) ??
realtimeConfig.voice; + /* The consult tool is the only tool offered to the realtime relay session. */ const session = createTalkRealtimeRelaySession({ + context, + connId, + provider: resolution.provider, + providerConfig: withRealtimeBrowserOverrides(resolution.providerConfig, { model, voice }), + instructions: buildRealtimeInstructions(), + tools: [REALTIME_VOICE_AGENT_CONSULT_TOOL], + model, + voice, + }); + /* The relay session id doubles as the unified sessionId; connId is stored with the registry entry. */ rememberUnifiedTalkSession(session.relaySessionId, { + kind: "realtime-relay", + connId, + relaySessionId: session.relaySessionId, + }); + respond( + true, + { + ...session, + sessionId: session.relaySessionId, + mode, + brain, + }, + undefined, + ); + return; + } + + if (mode === "transcription") { + /* Transcription sessions accept exactly one combination: gateway-relay transport with brain "none". */ if (transport !== "gateway-relay" || brain !== "none") { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `transcription talk.session.create requires transport="gateway-relay" and brain="none"`, + ), + ); + return; + } + const runtimeConfig = context.getRuntimeConfig(); + const transcriptionConfig = buildTalkTranscriptionConfig(runtimeConfig, params.provider); + const resolution = resolveConfiguredRealtimeTranscriptionProvider({ + config: runtimeConfig, + configuredProviderId: transcriptionConfig.provider, + providerConfigs: transcriptionConfig.providers, + }); + const session = createTalkTranscriptionRelaySession({ + context, + connId, + provider: resolution.provider, + providerConfig: resolution.providerConfig, + }); + rememberUnifiedTalkSession(session.transcriptionSessionId, { + kind: "transcription-relay", + connId, + transcriptionSessionId: session.transcriptionSessionId, + }); + respond( + true, + { + /* NOTE(review): unlike the realtime branch, no explicit mode field is added to this payload; confirm whether session already carries it. */ ...session, + sessionId: session.transcriptionSessionId, + brain, + }, + undefined, + ); + return; + } + + /* Reached when mode is neither "realtime" nor "transcription" and transport is not "managed-room". */ respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `stt-tts talk.session.create requires transport="managed-room"`, + ), + ); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, + /* talk.session.inputAudio: forwards audioBase64 to the realtime or transcription relay session registered under sessionId; managed-room sessions are rejected. */ "talk.session.inputAudio": async ({
params, respond, client }) => { + if (!validateTalkSessionInputAudioParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.session.inputAudio params: ${formatValidationErrors(validateTalkSessionInputAudioParams.errors)}`, + ), + ); + return; + } + try { + /* NOTE(review): presumably getUnifiedTalkSession throws for an unknown id, which the catch below would report as UNAVAILABLE; confirm in talk-session-registry. */ const session = getUnifiedTalkSession(params.sessionId); + if (session.kind === "realtime-relay") { + const connId = requireUnifiedTalkSessionConn(session, client?.connId); + sendTalkRealtimeRelayAudio({ + relaySessionId: session.relaySessionId, + connId, + audioBase64: params.audioBase64, + timestamp: params.timestamp, + }); + respond(true, { ok: true }, undefined); + return; + } + if (session.kind === "transcription-relay") { + const connId = requireUnifiedTalkSessionConn(session, client?.connId); + /* Unlike the realtime call above, no timestamp is passed to the transcription relay. */ sendTalkTranscriptionRelayAudio({ + transcriptionSessionId: session.transcriptionSessionId, + connId, + audioBase64: params.audioBase64, + }); + respond(true, { ok: true }, undefined); + return; + } + /* Remaining kind: managed-room, which has no audio input through this RPC. */ respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + "talk.session.inputAudio is not supported for managed-room sessions", + ), + ); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, + /* talk.session.control: relay sessions accept only type="turn.cancel"; managed-room sessions dispatch turn.start, turn.end, or (for any other type) cancel to the handoff turn helpers. */ "talk.session.control": async ({ params, respond, client, context }) => { + if (!validateTalkSessionControlParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.session.control params: ${formatValidationErrors(validateTalkSessionControlParams.errors)}`, + ), + ); + return; + } + try { + const session = getUnifiedTalkSession(params.sessionId); + if (session.kind === "realtime-relay") { + if (params.type !== "turn.cancel") { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `realtime relay sessions only support talk.session.control type="turn.cancel"`, + ), + ); + return; + } + const connId =
requireUnifiedTalkSessionConn(session, client?.connId); + cancelTalkRealtimeRelayTurn({ + relaySessionId: session.relaySessionId, + connId, + reason: normalizeOptionalString(params.reason), + }); + respond(true, { ok: true }, undefined); + return; + } + if (session.kind === "transcription-relay") { + /* Same restriction as the realtime relay branch: only turn.cancel is accepted. */ if (params.type !== "turn.cancel") { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `transcription relay sessions only support talk.session.control type="turn.cancel"`, + ), + ); + return; + } + const connId = requireUnifiedTalkSessionConn(session, client?.connId); + cancelTalkTranscriptionRelayTurn({ + transcriptionSessionId: session.transcriptionSessionId, + connId, + reason: normalizeOptionalString(params.reason), + }); + respond(true, { ok: true }, undefined); + return; + } + + /* Managed-room session: uses the handoff id and token stored in the registry entry; any type other than "turn.start" or "turn.end" falls through to the cancel helper. */ const result = + params.type === "turn.start" + ? startTalkHandoffTurn(session.handoffId, session.token, { + turnId: params.turnId, + clientId: client?.connId, + }) + : params.type === "turn.end" + ?
endTalkHandoffTurn(session.handoffId, session.token, { turnId: params.turnId }) + : cancelTalkHandoffTurn(session.handoffId, session.token, { + turnId: params.turnId, + /* NOTE(review): reason is passed through as-is here, whereas the relay branches wrap it in normalizeOptionalString; confirm this is intended. */ reason: params.reason, + }); + /* A failed turn operation is mapped to an error code by talkHandoffErrorCode and the reason is echoed in the message. */ if (!result.ok) { + respond( + false, + undefined, + errorShape( + talkHandoffErrorCode(result.reason), + `talk session control failed: ${result.reason}`, + ), + ); + return; + } + /* Turn events are broadcast only to the room's activeClientId; the same events are also returned to the caller in the response. */ broadcastTalkRoomEvents(context, result.record.room.activeClientId, { + handoffId: result.record.id, + roomId: result.record.roomId, + events: result.events, + }); + respond(true, { ok: true, turnId: result.turnId, events: result.events }, undefined); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, + /* talk.session.toolResult: submits a tool call result (callId, result) to a realtime relay session; every other session kind is rejected. */ "talk.session.toolResult": async ({ params, respond, client }) => { + if (!validateTalkSessionToolResultParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.session.toolResult params: ${formatValidationErrors(validateTalkSessionToolResultParams.errors)}`, + ), + ); + return; + } + try { + const session = getUnifiedTalkSession(params.sessionId); + if (session.kind !== "realtime-relay") { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + "talk.session.toolResult is only supported for realtime relay sessions", + ), + ); + return; + } + const connId = requireUnifiedTalkSessionConn(session, client?.connId); + submitTalkRealtimeRelayToolResult({ + relaySessionId: session.relaySessionId, + connId, + callId: params.callId, + result: params.result, + }); + respond(true, { ok: true }, undefined); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, + /* talk.session.close: stops the relay session or revokes the managed-room handoff, then removes the registry entry for sessionId. */ "talk.session.close": async ({ params, respond, client }) => { + if (!validateTalkSessionCloseParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.session.close params:
${formatValidationErrors(validateTalkSessionCloseParams.errors)}`, + ), + ); + return; + } + try { + const session = getUnifiedTalkSession(params.sessionId); + if (session.kind === "realtime-relay") { + const connId = requireUnifiedTalkSessionConn(session, client?.connId); + stopTalkRealtimeRelaySession({ relaySessionId: session.relaySessionId, connId }); + } else if (session.kind === "transcription-relay") { + const connId = requireUnifiedTalkSessionConn(session, client?.connId); + stopTalkTranscriptionRelaySession({ + transcriptionSessionId: session.transcriptionSessionId, + connId, + }); + } else { + /* Managed-room: revoked by handoff id alone; this branch does not call requireUnifiedTalkSessionConn. */ revokeTalkHandoff(session.handoffId); + } + /* Reached only when the stop/revoke call above did not throw. */ forgetUnifiedTalkSession(params.sessionId); + respond(true, { ok: true }, undefined); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, +}; diff --git a/src/gateway/server-methods/talk-shared.ts b/src/gateway/server-methods/talk-shared.ts new file mode 100644 index 00000000000..c512e18b8c3 --- /dev/null +++ b/src/gateway/server-methods/talk-shared.ts @@ -0,0 +1,237 @@ +import type { OpenClawConfig } from "../../config/types.js"; +import { listRealtimeTranscriptionProviders } from "../../realtime-transcription/provider-registry.js"; +import type { RealtimeTranscriptionProviderConfig } from "../../realtime-transcription/provider-types.js"; +import { REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME } from "../../realtime-voice/agent-consult-tool.js"; +import type { + RealtimeVoiceBrowserSession, + RealtimeVoiceProviderConfig, +} from "../../realtime-voice/provider-types.js"; +import type { TalkEvent } from "../../realtime-voice/talk-events.js"; +import { + normalizeLowercaseStringOrEmpty, + normalizeOptionalLowercaseString, + normalizeOptionalString, +} from "../../shared/string-coerce.js"; +import { ADMIN_SCOPE } from "../operator-scopes.js"; +import { ErrorCodes } from "../protocol/index.js"; +import type { TalkHandoffTurnResult } from "../talk-handoff.js"; +import {
asRecord } from "./record-shared.js"; + +/** Returns true when client.connect.scopes is an array that contains ADMIN_SCOPE; a null client or a missing scopes array yields false. */ export function canUseTalkDirectTools(client: { connect?: { scopes?: string[] } } | null): boolean { + const scopes = Array.isArray(client?.connect?.scopes) ? client.connect.scopes : []; + return scopes.includes(ADMIN_SCOPE); +} + +/** Sends one "talk.event" message per entry of params.events to the single connection connId, with dropIfSlow enabled. Does nothing when connId is falsy or the events array is empty. */ export function broadcastTalkRoomEvents( + context: { + broadcastToConnIds: ( + event: string, + payload: unknown, + connIds: Set, + opts?: { dropIfSlow?: boolean }, + ) => void; + }, + connId: string | undefined, + params: { handoffId: string; roomId: string; events: TalkEvent[] }, +): void { + if (!connId || params.events.length === 0) { + return; + } + for (const talkEvent of params.events) { + context.broadcastToConnIds( + "talk.event", + { handoffId: params.handoffId, roomId: params.roomId, talkEvent }, + /* A fresh single-member Set is built for every event. */ new Set([connId]), + { dropIfSlow: true }, + ); + } +} + +/** The "reason" member type derived from TalkHandoffTurnResult. NOTE(review): the type arguments of Extract (and of Set / Record elsewhere in this file) are not visible in this text; confirm against the original source. */ type TalkHandoffFailureReason = Extract["reason"]; + +/** Maps a handoff failure reason to an error code: "invalid_token", "no_active_turn" and "stale_turn" become INVALID_REQUEST; every other reason becomes UNAVAILABLE. */ export function talkHandoffErrorCode(reason: TalkHandoffFailureReason) { + return reason === "invalid_token" || reason === "no_active_turn" || reason === "stale_turn" + ? ErrorCodes.INVALID_REQUEST + : ErrorCodes.UNAVAILABLE; +} + +/** Thin wrapper over asRecord that turns a nullish result into undefined. */ function getRecord(value: unknown): Record | undefined { + return asRecord(value) ??
undefined; +} + +/** Reads plugins.entries["voice-call"].config.realtime from the config. Returns its provider id (via normalizeOptionalString) and those provider entries for which getRecord yields a record; providers is undefined when none qualify. */ function getVoiceCallRealtimeConfig(config: OpenClawConfig): { + provider?: string; + providers?: Record; +} { + /* Each level is passed through getRecord, so a missing or non-record level makes the remaining lookups resolve to undefined. */ const plugins = getRecord(config.plugins); + const entries = getRecord(plugins?.entries); + const voiceCall = getRecord(entries?.["voice-call"]); + const pluginConfig = getRecord(voiceCall?.config); + const realtime = getRecord(pluginConfig?.realtime); + const providersRaw = getRecord(realtime?.providers); + const providers: Record = {}; + if (providersRaw) { + for (const [providerId, providerConfig] of Object.entries(providersRaw)) { + const record = getRecord(providerConfig); + /* Entries for which getRecord returns nothing are skipped. */ if (record) { + providers[providerId] = record; + } + } + } + return { + provider: normalizeOptionalString(realtime?.provider), + providers: Object.keys(providers).length > 0 ? providers : undefined, + }; +} + +/** Same traversal as getVoiceCallRealtimeConfig, but reads the "streaming" section of the voice-call plugin config. Returns its provider id and usable provider entries; providers is undefined when none qualify. */ export function getVoiceCallStreamingConfig(config: OpenClawConfig): { + provider?: string; + providers?: Record; +} { + const plugins = getRecord(config.plugins); + const entries = getRecord(plugins?.entries); + const voiceCall = getRecord(entries?.["voice-call"]); + const pluginConfig = getRecord(voiceCall?.config); + const streaming = getRecord(pluginConfig?.streaming); + const providersRaw = getRecord(streaming?.providers); + const providers: Record = {}; + if (providersRaw) { + for (const [providerId, providerConfig] of Object.entries(providersRaw)) { + const record = getRecord(providerConfig); + if (record) { + providers[providerId] = record; + } + } + } + return { + provider: normalizeOptionalString(streaming?.provider), + providers: Object.keys(providers).length > 0 ?
providers : undefined, + }; +} + +/** Builds the realtime Talk settings. Provider precedence: requestedProvider, then talk.realtime.provider, then the voice-call plugin's realtime provider. Provider configs from talk.realtime override voice-call entries with the same key (the later spread wins). model, voice, mode, transport and brain are read from talk.realtime only. */ export function buildTalkRealtimeConfig(config: OpenClawConfig, requestedProvider?: string) { + const voiceCallRealtime = getVoiceCallRealtimeConfig(config); + const talkRealtime = getRecord(config.talk?.realtime); + /* Cast only: unlike the voice-call providers, these entries are not filtered through getRecord. */ const talkRealtimeProviderConfigs = talkRealtime?.providers as + | Record + | undefined; + const provider = + normalizeOptionalString(requestedProvider) ?? + normalizeOptionalString(talkRealtime?.provider) ?? + voiceCallRealtime.provider; + return { + provider, + providers: { + ...voiceCallRealtime.providers, + ...talkRealtimeProviderConfigs, + }, + model: normalizeOptionalString(talkRealtime?.model), + voice: normalizeOptionalString(talkRealtime?.voice), + mode: normalizeOptionalLowercaseString(talkRealtime?.mode), + transport: normalizeOptionalLowercaseString(talkRealtime?.transport), + brain: normalizeOptionalLowercaseString(talkRealtime?.brain), + }; +} + +/** Builds the transcription settings from the voice-call plugin's streaming config. requestedProvider overrides the configured provider, and providers falls back to an empty object. */ export function buildTalkTranscriptionConfig(config: OpenClawConfig, requestedProvider?: string) { + const streamingConfig = getVoiceCallStreamingConfig(config); + return { + provider: normalizeOptionalString(requestedProvider) ?? streamingConfig.provider, + providers: streamingConfig.providers ?? {}, + }; +} + +/** Finds the raw config for a provider. Candidate keys are tried in order (the configured provider id, provider.id, then each alias), first as an exact own key of providerConfigs and then by comparing normalizeOptionalLowercaseString of the candidate with that of each configured key. Returns {} when nothing matches. */ function getRealtimeTranscriptionProviderConfig(params: { + providerConfigs: Record; + provider: { id: string; aliases?: readonly string[] }; + configuredProviderId?: string; +}): RealtimeTranscriptionProviderConfig { + /* The filter removes an absent configuredProviderId and any other falsy candidate. */ const candidates = [ + normalizeOptionalString(params.configuredProviderId), + params.provider.id, + ...(params.provider.aliases ?? []), + ].filter((key): key is string => Boolean(key)); + const configuredKeys = Object.keys(params.providerConfigs); + for (const candidate of candidates) { + if (Object.hasOwn(params.providerConfigs, candidate)) { + return params.providerConfigs[candidate] ??
{}; + } + const normalizedCandidate = normalizeOptionalLowercaseString(candidate); + const matchingKey = configuredKeys.find( + (key) => normalizeOptionalLowercaseString(key) === normalizedCandidate, + ); + if (matchingKey) { + return params.providerConfigs[matchingKey] ?? {}; + } + } + return {}; +} + +/** Runs callback and returns its result; returns false if callback throws. */ export function configuredOrFalse(callback: () => boolean): boolean { + try { + return callback(); + } catch { + return false; + } +} + +/** Picks the first realtime transcription provider whose isConfigured check passes and returns it with its resolved config. When a provider id is configured, only providers whose id or alias matches it (compared via normalizeOptionalLowercaseString) are considered; otherwise all listed providers are tried in ascending autoSelectOrder, with a missing value counted as 1000. Throws an Error when no candidate passes. */ export function resolveConfiguredRealtimeTranscriptionProvider(params: { + config: OpenClawConfig; + configuredProviderId?: string; + providerConfigs: Record; +}) { + const providers = listRealtimeTranscriptionProviders(params.config); + const normalizedConfigured = normalizeOptionalLowercaseString(params.configuredProviderId); + const orderedProviders = normalizedConfigured + ? providers.filter( + (provider) => + normalizeOptionalLowercaseString(provider.id) === normalizedConfigured || + (provider.aliases ?? []).some( + (alias) => normalizeOptionalLowercaseString(alias) === normalizedConfigured, + ), + ) + : providers.toSorted((a, b) => (a.autoSelectOrder ?? 1000) - (b.autoSelectOrder ?? 1000)); + for (const provider of orderedProviders) { + const rawConfig = getRealtimeTranscriptionProviderConfig({ + providerConfigs: params.providerConfigs, + provider, + configuredProviderId: params.configuredProviderId, + }); + /* A provider without resolveConfig, or one whose resolveConfig returns a nullish value, keeps the raw config. */ const providerConfig = provider.resolveConfig?.({ cfg: params.config, rawConfig }) ?? rawConfig; + /* An isConfigured call that throws is treated as "not configured", so the loop moves on to the next candidate. */ if (configuredOrFalse(() => provider.isConfigured({ cfg: params.config, providerConfig }))) { + return { provider, providerConfig }; + } + } + if (normalizedConfigured) { + throw new Error( + `Realtime transcription provider "${params.configuredProviderId}" is not configured`, + ); + } + /* Also reached when providers are listed but none passed the isConfigured check. */ throw new Error("No realtime transcription provider registered"); +} + +/** Returns the fixed instruction text for the realtime voice interface; the text names REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME as the tool to call. */ export function buildRealtimeInstructions(): string { + return `You are OpenClaw's realtime voice interface. Keep spoken replies concise.
If the user asks for code, repository state, tools, files, current OpenClaw context, or deeper reasoning, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} and then summarize the result naturally.`; +} + +/** Returns providerConfig with model and/or voice overridden by whichever of params.model and params.voice normalizeOptionalString yields a truthy value for. When neither applies, the original providerConfig object itself is returned rather than a copy. */ export function withRealtimeBrowserOverrides( + providerConfig: RealtimeVoiceProviderConfig, + params: { model?: string; voice?: string }, +): RealtimeVoiceProviderConfig { + const overrides: RealtimeVoiceProviderConfig = {}; + const model = normalizeOptionalString(params.model); + const voice = normalizeOptionalString(params.voice); + if (model) { + overrides.model = model; + } + if (voice) { + overrides.voice = voice; + } + return Object.keys(overrides).length > 0 ? { ...providerConfig, ...overrides } : providerConfig; +} + +/** True when the session's provider normalizes to "google" and its transport is "webrtc"; a session without a transport field is treated as "webrtc". */ export function isUnsupportedBrowserWebRtcSession(session: RealtimeVoiceBrowserSession): boolean { + const provider = normalizeLowercaseStringOrEmpty(session.provider); + const transport = (session as { transport?: string }).transport ?? "webrtc"; + return provider === "google" && transport === "webrtc"; +} diff --git a/src/gateway/server-methods/talk.test.ts b/src/gateway/server-methods/talk.test.ts index bd00b45d90c..36f37da9d49 100644 --- a/src/gateway/server-methods/talk.test.ts +++ b/src/gateway/server-methods/talk.test.ts @@ -1,6 +1,7 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; import type { OpenClawConfig } from "../../config/config.js"; import { normalizeResolvedSecretInputString } from "../../config/types.secrets.js"; +import { ErrorCodes } from "../protocol/index.js"; import { talkHandlers } from "./talk.js"; const mocks = vi.hoisted(() => ({ @@ -8,10 +9,27 @@ const mocks = vi.hoisted(() => ({ readConfigFileSnapshot: vi.fn(), canonicalizeSpeechProviderId: vi.fn((providerId: string | undefined) => providerId), getSpeechProvider: vi.fn(), + listSpeechProviders: vi.fn(() => []), + getResolvedSpeechProviderConfig: vi.fn(() => ({})), + resolveTtsConfig: vi.fn(() => ({ timeoutMs: 30_000 })), synthesizeSpeech: vi.fn(),
- getRealtimeVoiceProvider: vi.fn(), + canonicalizeRealtimeVoiceProviderId: vi.fn((providerId: string | undefined) => providerId), + listRealtimeVoiceProviders: vi.fn(() => []), + listRealtimeTranscriptionProviders: vi.fn(() => []), resolveConfiguredRealtimeVoiceProvider: vi.fn(), createTalkRealtimeRelaySession: vi.fn(), + sendTalkRealtimeRelayAudio: vi.fn(), + acknowledgeTalkRealtimeRelayMark: vi.fn(), + cancelTalkRealtimeRelayTurn: vi.fn(), + stopTalkRealtimeRelaySession: vi.fn(), + registerTalkRealtimeRelayAgentRun: vi.fn(), + submitTalkRealtimeRelayToolResult: vi.fn(), + createTalkTranscriptionRelaySession: vi.fn(), + sendTalkTranscriptionRelayAudio: vi.fn(), + cancelTalkTranscriptionRelayTurn: vi.fn(), + stopTalkTranscriptionRelaySession: vi.fn(), + chatSend: vi.fn(), + resolveSessionKeyFromResolveParams: vi.fn(), })); vi.mock("../../config/config.js", () => ({ @@ -21,25 +39,60 @@ vi.mock("../../config/config.js", () => ({ vi.mock("../../tts/provider-registry.js", () => ({ canonicalizeSpeechProviderId: mocks.canonicalizeSpeechProviderId, getSpeechProvider: mocks.getSpeechProvider, + listSpeechProviders: mocks.listSpeechProviders, })); vi.mock("../../tts/tts.js", () => ({ + getResolvedSpeechProviderConfig: mocks.getResolvedSpeechProviderConfig, + resolveTtsConfig: mocks.resolveTtsConfig, synthesizeSpeech: mocks.synthesizeSpeech, })); vi.mock("../../realtime-voice/provider-registry.js", () => ({ - getRealtimeVoiceProvider: mocks.getRealtimeVoiceProvider, + canonicalizeRealtimeVoiceProviderId: mocks.canonicalizeRealtimeVoiceProviderId, + listRealtimeVoiceProviders: mocks.listRealtimeVoiceProviders, +})); + +vi.mock("../../realtime-transcription/provider-registry.js", () => ({ + listRealtimeTranscriptionProviders: mocks.listRealtimeTranscriptionProviders, })); vi.mock("../../realtime-voice/provider-resolver.js", () => ({ resolveConfiguredRealtimeVoiceProvider: mocks.resolveConfiguredRealtimeVoiceProvider, })); +vi.mock("./chat.js", () => ({ + chatHandlers: { + 
"chat.send": mocks.chatSend, + }, +})); + +vi.mock("../sessions-resolve.js", () => ({ + resolveSessionKeyFromResolveParams: mocks.resolveSessionKeyFromResolveParams, +})); + vi.mock("../talk-realtime-relay.js", async (importOriginal) => { const actual = await importOriginal(); return { ...actual, + acknowledgeTalkRealtimeRelayMark: mocks.acknowledgeTalkRealtimeRelayMark, + cancelTalkRealtimeRelayTurn: mocks.cancelTalkRealtimeRelayTurn, createTalkRealtimeRelaySession: mocks.createTalkRealtimeRelaySession, + registerTalkRealtimeRelayAgentRun: mocks.registerTalkRealtimeRelayAgentRun, + sendTalkRealtimeRelayAudio: mocks.sendTalkRealtimeRelayAudio, + stopTalkRealtimeRelaySession: mocks.stopTalkRealtimeRelaySession, + submitTalkRealtimeRelayToolResult: mocks.submitTalkRealtimeRelayToolResult, + }; +}); + +vi.mock("../talk-transcription-relay.js", async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + cancelTalkTranscriptionRelayTurn: mocks.cancelTalkTranscriptionRelayTurn, + createTalkTranscriptionRelaySession: mocks.createTalkTranscriptionRelaySession, + sendTalkTranscriptionRelayAudio: mocks.sendTalkTranscriptionRelayAudio, + stopTalkTranscriptionRelaySession: mocks.stopTalkTranscriptionRelaySession, }; }); @@ -57,6 +110,156 @@ function createTalkConfig(apiKey: unknown): OpenClawConfig { } as OpenClawConfig; } +describe("talk.catalog handler", () => { + beforeEach(() => { + vi.clearAllMocks(); + mocks.listSpeechProviders.mockReturnValue([]); + mocks.listRealtimeTranscriptionProviders.mockReturnValue([]); + mocks.listRealtimeVoiceProviders.mockReturnValue([]); + mocks.getResolvedSpeechProviderConfig.mockReturnValue({}); + mocks.resolveTtsConfig.mockReturnValue({ timeoutMs: 30_000 }); + }); + + it("returns safe speech, transcription, and realtime catalogs without provider secrets", async () => { + mocks.listSpeechProviders.mockReturnValue([ + { + id: "elevenlabs", + label: "ElevenLabs", + models: ["eleven_flash_v2_5"], + voices: 
["voice-1"], + isConfigured: vi.fn(() => true), + } as never, + ]); + mocks.getResolvedSpeechProviderConfig.mockReturnValue({ apiKey: "speech-key" }); + mocks.listRealtimeTranscriptionProviders.mockReturnValue([ + { + id: "openai", + label: "OpenAI Realtime Transcription", + defaultModel: "gpt-4o-transcribe", + resolveConfig: vi.fn(({ rawConfig }) => rawConfig), + isConfigured: vi.fn(({ providerConfig }) => providerConfig.apiKey === "stt-key"), + } as never, + ]); + mocks.listRealtimeVoiceProviders.mockReturnValue([ + { + id: "google", + label: "Google Live Voice", + defaultModel: "gemini-live", + resolveConfig: vi.fn(({ rawConfig }) => rawConfig), + isConfigured: vi.fn(({ providerConfig }) => providerConfig.apiKey === "live-key"), + capabilities: { + transports: ["provider-websocket", "gateway-relay"], + inputAudioFormats: [{ encoding: "pcm16", sampleRateHz: 24000, channels: 1 }], + outputAudioFormats: [{ encoding: "pcm16", sampleRateHz: 24000, channels: 1 }], + supportsBrowserSession: true, + supportsBargeIn: true, + supportsToolCalls: true, + supportsVideoFrames: true, + supportsSessionResumption: true, + }, + createBrowserSession: vi.fn(), + createBridge: vi.fn(), + } as never, + ]); + + const respond = vi.fn(); + await talkHandlers["talk.catalog"]({ + req: { type: "req", id: "1", method: "talk.catalog" }, + params: {}, + client: { connect: { scopes: ["operator.read"] } } as never, + isWebchatConnect: () => false, + respond: respond as never, + context: { + getRuntimeConfig: () => + ({ + talk: { + provider: "elevenlabs", + providers: { elevenlabs: { apiKey: "speech-key" } }, + realtime: { + provider: "google", + providers: { google: { apiKey: "live-key" } }, + }, + }, + plugins: { + entries: { + "voice-call": { + config: { + streaming: { + provider: "openai", + providers: { openai: { apiKey: "stt-key" } }, + }, + }, + }, + }, + }, + }) as OpenClawConfig, + } as never, + }); + + expect(respond).toHaveBeenCalledWith( + true, + { + modes: ["realtime", "stt-tts", 
"transcription"], + transports: ["webrtc", "provider-websocket", "gateway-relay", "managed-room"], + brains: ["agent-consult", "direct-tools", "none"], + speech: { + activeProvider: "elevenlabs", + providers: [ + { + id: "elevenlabs", + label: "ElevenLabs", + configured: true, + modes: ["stt-tts"], + brains: ["agent-consult"], + models: ["eleven_flash_v2_5"], + voices: ["voice-1"], + }, + ], + }, + transcription: { + activeProvider: "openai", + providers: [ + { + id: "openai", + label: "OpenAI Realtime Transcription", + configured: true, + modes: ["transcription"], + transports: ["gateway-relay"], + brains: ["none"], + defaultModel: "gpt-4o-transcribe", + }, + ], + }, + realtime: { + activeProvider: "google", + providers: [ + { + id: "google", + label: "Google Live Voice", + configured: true, + defaultModel: "gemini-live", + modes: ["realtime"], + transports: ["provider-websocket", "gateway-relay"], + brains: ["agent-consult"], + inputAudioFormats: [{ encoding: "pcm16", sampleRateHz: 24000, channels: 1 }], + outputAudioFormats: [{ encoding: "pcm16", sampleRateHz: 24000, channels: 1 }], + supportsBrowserSession: true, + supportsBargeIn: true, + supportsToolCalls: true, + supportsVideoFrames: true, + supportsSessionResumption: true, + }, + ], + }, + }, + undefined, + ); + expect(JSON.stringify(respond.mock.calls[0]?.[1])).not.toContain("speech-key"); + expect(JSON.stringify(respond.mock.calls[0]?.[1])).not.toContain("stt-key"); + expect(JSON.stringify(respond.mock.calls[0]?.[1])).not.toContain("live-key"); + }); +}); + describe("talk.speak handler", () => { beforeEach(() => { vi.clearAllMocks(); @@ -238,6 +441,967 @@ describe("talk.config handler", () => { }); }); +describe("talk.handoff.create handler", () => { + beforeEach(() => { + vi.clearAllMocks(); + mocks.resolveSessionKeyFromResolveParams.mockImplementation(async ({ p }) => ({ + ok: true, + key: String((p as { key?: unknown }).key), + })); + }); + + it("creates an expiring managed-room handoff for an existing 
session key", async () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date("2026-05-05T12:00:00.000Z")); + const respond = vi.fn(); + + await talkHandlers["talk.handoff.create"]({ + req: { type: "req", id: "1", method: "talk.handoff.create" }, + params: { + sessionKey: "session:main", + sessionId: "session-id", + channel: "discord", + target: "dm:123", + provider: "openai", + model: "gpt-realtime-1.5", + voice: "alloy", + ttlMs: 5000, + }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: respond as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + } as never, + }); + + expect(respond).toHaveBeenCalledWith( + true, + expect.objectContaining({ + id: expect.any(String), + roomId: expect.stringMatching(/^talk_/), + roomUrl: expect.stringMatching(/^\/talk\/rooms\/talk_/), + token: expect.any(String), + sessionKey: "session:main", + sessionId: "session-id", + channel: "discord", + target: "dm:123", + provider: "openai", + model: "gpt-realtime-1.5", + voice: "alloy", + mode: "stt-tts", + transport: "managed-room", + brain: "agent-consult", + createdAt: Date.parse("2026-05-05T12:00:00.000Z"), + expiresAt: Date.parse("2026-05-05T12:00:05.000Z"), + }), + undefined, + ); + expect(mocks.resolveSessionKeyFromResolveParams).toHaveBeenCalledWith({ + cfg: {}, + p: { + key: "session:main", + includeGlobal: true, + includeUnknown: true, + }, + }); + expect(respond.mock.calls[0]?.[1]).not.toHaveProperty("tokenHash"); + vi.useRealTimers(); + }); + + it("rejects handoff creation when the session key cannot resolve", async () => { + const respond = vi.fn(); + mocks.resolveSessionKeyFromResolveParams.mockResolvedValueOnce({ + ok: false, + error: { + code: ErrorCodes.INVALID_REQUEST, + message: "No session found: missing", + }, + }); + + await talkHandlers["talk.handoff.create"]({ + req: { type: "req", id: "1", method: "talk.handoff.create" }, + params: { sessionKey: "missing" }, + client: { connId: "conn-1" } as never, + 
isWebchatConnect: () => false, + respond: respond as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + } as never, + }); + + expect(respond).toHaveBeenCalledWith( + false, + undefined, + expect.objectContaining({ + code: ErrorCodes.INVALID_REQUEST, + message: "No session found: missing", + }), + ); + }); + + it("rejects invalid handoff params", async () => { + const respond = vi.fn(); + + await talkHandlers["talk.handoff.create"]({ + req: { type: "req", id: "1", method: "talk.handoff.create" }, + params: { sessionKey: "" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: respond as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + } as never, + }); + + expect(respond).toHaveBeenCalledWith( + false, + undefined, + expect.objectContaining({ + code: ErrorCodes.INVALID_REQUEST, + message: expect.stringContaining("invalid talk.handoff.create params"), + }), + ); + }); + + it("requires owner scope for direct-tools handoffs", async () => { + const rejectedRespond = vi.fn(); + + await talkHandlers["talk.handoff.create"]({ + req: { type: "req", id: "1", method: "talk.handoff.create" }, + params: { sessionKey: "session:main", brain: "direct-tools" }, + client: { connId: "conn-1", connect: { scopes: ["operator.write"] } } as never, + isWebchatConnect: () => false, + respond: rejectedRespond as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + } as never, + }); + + expect(rejectedRespond).toHaveBeenCalledWith( + false, + undefined, + expect.objectContaining({ + code: ErrorCodes.INVALID_REQUEST, + message: 'talk.handoff.create brain="direct-tools" requires gateway scope: operator.admin', + }), + ); + + const ownerRespond = vi.fn(); + await talkHandlers["talk.handoff.create"]({ + req: { type: "req", id: "2", method: "talk.handoff.create" }, + params: { sessionKey: "session:main", brain: "direct-tools" }, + client: { connId: "conn-1", connect: { scopes: ["operator.admin"] } } as 
never, + isWebchatConnect: () => false, + respond: ownerRespond as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + } as never, + }); + + expect(ownerRespond).toHaveBeenCalledWith( + true, + expect.objectContaining({ + sessionKey: "session:main", + brain: "direct-tools", + }), + undefined, + ); + }); + + it("joins and revokes a handoff without exposing the token hash", async () => { + const broadcastToConnIds = vi.fn(); + const createRespond = vi.fn(); + await talkHandlers["talk.handoff.create"]({ + req: { type: "req", id: "1", method: "talk.handoff.create" }, + params: { sessionKey: "session:main" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: createRespond as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + } as never, + }); + const handoff = createRespond.mock.calls[0]?.[1] as { id: string; token: string }; + + const joinRespond = vi.fn(); + await talkHandlers["talk.handoff.join"]({ + req: { type: "req", id: "2", method: "talk.handoff.join" }, + params: { id: handoff.id, token: handoff.token }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: joinRespond as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + broadcastToConnIds, + } as never, + }); + + expect(joinRespond).toHaveBeenCalledWith( + true, + expect.objectContaining({ + id: handoff.id, + sessionKey: "session:main", + transport: "managed-room", + }), + undefined, + ); + expect(joinRespond.mock.calls[0]?.[1]).not.toHaveProperty("tokenHash"); + expect(joinRespond.mock.calls[0]?.[1]).not.toHaveProperty("token"); + expect(broadcastToConnIds).toHaveBeenCalledWith( + "talk.event", + expect.objectContaining({ + handoffId: handoff.id, + talkEvent: expect.objectContaining({ type: "session.ready" }), + }), + new Set(["conn-1"]), + { dropIfSlow: true }, + ); + + const revokeRespond = vi.fn(); + await talkHandlers["talk.handoff.revoke"]({ + req: { type: "req", id: "3", 
method: "talk.handoff.revoke" }, + params: { id: handoff.id }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: revokeRespond as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + broadcastToConnIds, + } as never, + }); + + expect(revokeRespond).toHaveBeenCalledWith(true, { ok: true, revoked: true }, undefined); + + const rejectedJoinRespond = vi.fn(); + await talkHandlers["talk.handoff.join"]({ + req: { type: "req", id: "4", method: "talk.handoff.join" }, + params: { id: handoff.id, token: handoff.token }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: rejectedJoinRespond as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + broadcastToConnIds, + } as never, + }); + + expect(rejectedJoinRespond).toHaveBeenCalledWith( + false, + undefined, + expect.objectContaining({ + code: ErrorCodes.UNAVAILABLE, + message: "talk handoff join failed: not_found", + }), + ); + }); + + it("notifies the displaced handoff client when a new client joins", async () => { + const broadcastToConnIds = vi.fn(); + const createRespond = vi.fn(); + await talkHandlers["talk.handoff.create"]({ + req: { type: "req", id: "1", method: "talk.handoff.create" }, + params: { sessionKey: "session:main" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: createRespond as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + } as never, + }); + const handoff = createRespond.mock.calls[0]?.[1] as { id: string; token: string }; + + await talkHandlers["talk.handoff.join"]({ + req: { type: "req", id: "2", method: "talk.handoff.join" }, + params: { id: handoff.id, token: handoff.token }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: vi.fn() as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + broadcastToConnIds, + } as never, + }); + broadcastToConnIds.mockClear(); + + const 
joinRespond = vi.fn(); + await talkHandlers["talk.handoff.join"]({ + req: { type: "req", id: "3", method: "talk.handoff.join" }, + params: { id: handoff.id, token: handoff.token }, + client: { connId: "conn-2" } as never, + isWebchatConnect: () => false, + respond: joinRespond as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + broadcastToConnIds, + } as never, + }); + + expect(joinRespond).toHaveBeenCalledWith( + true, + expect.objectContaining({ + room: expect.objectContaining({ activeClientId: "conn-2" }), + }), + undefined, + ); + expect(broadcastToConnIds).toHaveBeenCalledWith( + "talk.event", + expect.objectContaining({ + handoffId: handoff.id, + talkEvent: expect.objectContaining({ + type: "session.replaced", + payload: expect.objectContaining({ + previousClientId: "conn-1", + nextClientId: "conn-2", + }), + }), + }), + new Set(["conn-1"]), + { dropIfSlow: true }, + ); + expect(broadcastToConnIds).toHaveBeenCalledWith( + "talk.event", + expect.objectContaining({ + handoffId: handoff.id, + talkEvent: expect.objectContaining({ + type: "session.ready", + payload: expect.objectContaining({ clientId: "conn-2" }), + }), + }), + new Set(["conn-2"]), + { dropIfSlow: true }, + ); + expect( + broadcastToConnIds.mock.calls.some( + ([, payload, connIds]) => + (payload as { talkEvent?: { type?: string } }).talkEvent?.type === "session.replaced" && + connIds instanceof Set && + connIds.has("conn-2"), + ), + ).toBe(false); + }); + + it("drives managed-room turn lifecycle through handoff RPCs", async () => { + const broadcastToConnIds = vi.fn(); + const createRespond = vi.fn(); + await talkHandlers["talk.handoff.create"]({ + req: { type: "req", id: "1", method: "talk.handoff.create" }, + params: { sessionKey: "session:main" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: createRespond as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + } as never, + }); + const handoff = 
createRespond.mock.calls[0]?.[1] as { id: string; token: string }; + + const startRespond = vi.fn(); + await talkHandlers["talk.handoff.turnStart"]({ + req: { type: "req", id: "2", method: "talk.handoff.turnStart" }, + params: { id: handoff.id, token: handoff.token, turnId: "turn-1" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: startRespond as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + broadcastToConnIds, + } as never, + }); + + expect(startRespond).toHaveBeenCalledWith( + true, + expect.objectContaining({ + ok: true, + turnId: "turn-1", + events: [expect.objectContaining({ type: "turn.started", turnId: "turn-1" })], + }), + undefined, + ); + expect(broadcastToConnIds).toHaveBeenCalledWith( + "talk.event", + expect.objectContaining({ + handoffId: handoff.id, + talkEvent: expect.objectContaining({ type: "turn.started", turnId: "turn-1" }), + }), + new Set(["conn-1"]), + { dropIfSlow: true }, + ); + + const cancelRespond = vi.fn(); + await talkHandlers["talk.handoff.turnCancel"]({ + req: { type: "req", id: "3", method: "talk.handoff.turnCancel" }, + params: { id: handoff.id, token: handoff.token, reason: "barge-in" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: cancelRespond as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + broadcastToConnIds, + } as never, + }); + + expect(cancelRespond).toHaveBeenCalledWith( + true, + expect.objectContaining({ + ok: true, + turnId: "turn-1", + events: [expect.objectContaining({ type: "turn.cancelled", turnId: "turn-1" })], + }), + undefined, + ); + + const endRespond = vi.fn(); + await talkHandlers["talk.handoff.turnEnd"]({ + req: { type: "req", id: "4", method: "talk.handoff.turnEnd" }, + params: { id: handoff.id, token: handoff.token }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: endRespond as never, + context: { + getRuntimeConfig: () => ({}) as 
OpenClawConfig, + broadcastToConnIds, + } as never, + }); + + expect(endRespond).toHaveBeenCalledWith( + false, + undefined, + expect.objectContaining({ + code: ErrorCodes.INVALID_REQUEST, + message: "talk handoff turn end failed: no_active_turn", + }), + ); + }); +}); + +describe("talk.session unified handlers", () => { + beforeEach(() => { + vi.clearAllMocks(); + mocks.resolveSessionKeyFromResolveParams.mockImplementation(async ({ p }) => { + const key = (p as { key?: unknown }).key; + return { + ok: true, + key: typeof key === "string" ? key : "session:main", + }; + }); + }); + + it("creates and drives a realtime gateway-relay session through the unified API", async () => { + const provider = { + id: "openai", + label: "OpenAI Realtime", + isConfigured: () => true, + createBridge: vi.fn(), + }; + mocks.resolveConfiguredRealtimeVoiceProvider.mockReturnValue({ + provider, + providerConfig: { apiKey: "openai-key" }, + }); + mocks.createTalkRealtimeRelaySession.mockReturnValue({ + provider: "openai", + transport: "gateway-relay", + relaySessionId: "relay-unified-1", + audio: { + inputEncoding: "pcm16", + inputSampleRateHz: 24000, + outputEncoding: "pcm16", + outputSampleRateHz: 24000, + }, + model: "gpt-realtime", + voice: "alloy", + expiresAt: 1_797_986_400, + }); + + const createRespond = vi.fn(); + await talkHandlers["talk.session.create"]({ + req: { type: "req", id: "1", method: "talk.session.create" }, + params: { + mode: "realtime", + transport: "gateway-relay", + brain: "agent-consult", + provider: "openai", + model: "gpt-realtime", + voice: "alloy", + }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: createRespond as never, + context: { + getRuntimeConfig: () => + ({ + talk: { + realtime: { + provider: "openai", + providers: { openai: { apiKey: "openai-key" } }, + }, + }, + }) as OpenClawConfig, + } as never, + }); + + expect(mocks.createTalkRealtimeRelaySession).toHaveBeenCalledWith( + expect.objectContaining({ + 
connId: "conn-1", + provider, + providerConfig: expect.objectContaining({ + apiKey: "openai-key", + model: "gpt-realtime", + voice: "alloy", + }), + }), + ); + expect(createRespond).toHaveBeenCalledWith( + true, + expect.objectContaining({ + sessionId: "relay-unified-1", + relaySessionId: "relay-unified-1", + mode: "realtime", + transport: "gateway-relay", + brain: "agent-consult", + }), + undefined, + ); + + const inputRespond = vi.fn(); + await talkHandlers["talk.session.inputAudio"]({ + req: { type: "req", id: "2", method: "talk.session.inputAudio" }, + params: { sessionId: "relay-unified-1", audioBase64: "aGVsbG8=", timestamp: 42 }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: inputRespond as never, + context: {} as never, + }); + expect(mocks.sendTalkRealtimeRelayAudio).toHaveBeenCalledWith({ + relaySessionId: "relay-unified-1", + connId: "conn-1", + audioBase64: "aGVsbG8=", + timestamp: 42, + }); + + const cancelRespond = vi.fn(); + await talkHandlers["talk.session.control"]({ + req: { type: "req", id: "3", method: "talk.session.control" }, + params: { sessionId: "relay-unified-1", type: "turn.cancel", reason: "barge-in" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: cancelRespond as never, + context: {} as never, + }); + expect(mocks.cancelTalkRealtimeRelayTurn).toHaveBeenCalledWith({ + relaySessionId: "relay-unified-1", + connId: "conn-1", + reason: "barge-in", + }); + + const toolRespond = vi.fn(); + await talkHandlers["talk.session.toolResult"]({ + req: { type: "req", id: "4", method: "talk.session.toolResult" }, + params: { sessionId: "relay-unified-1", callId: "call-1", result: { ok: true } }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: toolRespond as never, + context: {} as never, + }); + expect(mocks.submitTalkRealtimeRelayToolResult).toHaveBeenCalledWith({ + relaySessionId: "relay-unified-1", + connId: "conn-1", + callId: 
"call-1", + result: { ok: true }, + }); + + const closeRespond = vi.fn(); + await talkHandlers["talk.session.close"]({ + req: { type: "req", id: "5", method: "talk.session.close" }, + params: { sessionId: "relay-unified-1" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: closeRespond as never, + context: {} as never, + }); + expect(mocks.stopTalkRealtimeRelaySession).toHaveBeenCalledWith({ + relaySessionId: "relay-unified-1", + connId: "conn-1", + }); + expect(closeRespond).toHaveBeenCalledWith(true, { ok: true }, undefined); + }); + + it("creates transcription gateway-relay sessions through the unified API", async () => { + const provider = { + id: "openai", + label: "OpenAI Realtime Transcription", + autoSelectOrder: 1, + resolveConfig: vi.fn(({ rawConfig }) => rawConfig), + isConfigured: vi.fn(({ providerConfig }) => providerConfig.apiKey === "stt-key"), + createSession: vi.fn(), + }; + mocks.listRealtimeTranscriptionProviders.mockReturnValue([provider] as never); + mocks.createTalkTranscriptionRelaySession.mockReturnValue({ + provider: "openai", + mode: "transcription", + transport: "gateway-relay", + transcriptionSessionId: "stt-unified-1", + audio: { inputEncoding: "pcm16", inputSampleRateHz: 24000 }, + expiresAt: 1_797_986_400, + }); + + const createRespond = vi.fn(); + await talkHandlers["talk.session.create"]({ + req: { type: "req", id: "1", method: "talk.session.create" }, + params: { mode: "transcription", provider: "openai" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: createRespond as never, + context: { + getRuntimeConfig: () => + ({ + plugins: { + entries: { + "voice-call": { + config: { + streaming: { + provider: "openai", + providers: { openai: { apiKey: "stt-key" } }, + }, + }, + }, + }, + }, + }) as OpenClawConfig, + } as never, + }); + + expect(createRespond).toHaveBeenCalledWith( + true, + expect.objectContaining({ + sessionId: "stt-unified-1", + 
transcriptionSessionId: "stt-unified-1", + mode: "transcription", + transport: "gateway-relay", + brain: "none", + }), + undefined, + ); + + const inputRespond = vi.fn(); + await talkHandlers["talk.session.inputAudio"]({ + req: { type: "req", id: "2", method: "talk.session.inputAudio" }, + params: { sessionId: "stt-unified-1", audioBase64: "aGVsbG8=" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: inputRespond as never, + context: {} as never, + }); + expect(mocks.sendTalkTranscriptionRelayAudio).toHaveBeenCalledWith({ + transcriptionSessionId: "stt-unified-1", + connId: "conn-1", + audioBase64: "aGVsbG8=", + }); + + const closeRespond = vi.fn(); + await talkHandlers["talk.session.close"]({ + req: { type: "req", id: "3", method: "talk.session.close" }, + params: { sessionId: "stt-unified-1" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: closeRespond as never, + context: {} as never, + }); + expect(mocks.stopTalkTranscriptionRelaySession).toHaveBeenCalledWith({ + transcriptionSessionId: "stt-unified-1", + connId: "conn-1", + }); + }); + + it("creates and controls managed-room sessions through the unified API", async () => { + const broadcastToConnIds = vi.fn(); + const createRespond = vi.fn(); + await talkHandlers["talk.session.create"]({ + req: { type: "req", id: "1", method: "talk.session.create" }, + params: { + mode: "stt-tts", + transport: "managed-room", + sessionKey: "session:main", + ttlMs: 5000, + }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: createRespond as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + } as never, + }); + const session = createRespond.mock.calls[0]?.[1] as { sessionId: string; token: string }; + + expect(createRespond).toHaveBeenCalledWith( + true, + expect.objectContaining({ + sessionId: expect.any(String), + handoffId: expect.any(String), + roomId: expect.stringMatching(/^talk_/), + 
transport: "managed-room", + brain: "agent-consult", + token: expect.any(String), + }), + undefined, + ); + expect(mocks.resolveSessionKeyFromResolveParams).toHaveBeenCalledWith({ + cfg: {}, + p: { + key: "session:main", + includeGlobal: true, + includeUnknown: true, + }, + }); + + const startRespond = vi.fn(); + await talkHandlers["talk.session.control"]({ + req: { type: "req", id: "2", method: "talk.session.control" }, + params: { sessionId: session.sessionId, type: "turn.start", turnId: "turn-1" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: startRespond as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + broadcastToConnIds, + } as never, + }); + + expect(startRespond).toHaveBeenCalledWith( + true, + expect.objectContaining({ + ok: true, + turnId: "turn-1", + events: [expect.objectContaining({ type: "turn.started", turnId: "turn-1" })], + }), + undefined, + ); + expect(broadcastToConnIds).toHaveBeenCalledWith( + "talk.event", + expect.objectContaining({ + handoffId: session.sessionId, + talkEvent: expect.objectContaining({ type: "turn.started", turnId: "turn-1" }), + }), + new Set(["conn-1"]), + { dropIfSlow: true }, + ); + + const closeRespond = vi.fn(); + await talkHandlers["talk.session.close"]({ + req: { type: "req", id: "3", method: "talk.session.close" }, + params: { sessionId: session.sessionId }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: closeRespond as never, + context: {} as never, + }); + expect(closeRespond).toHaveBeenCalledWith(true, { ok: true }, undefined); + }); + + it("keeps direct-tools managed-room sessions behind admin scope", async () => { + const rejectedRespond = vi.fn(); + await talkHandlers["talk.session.create"]({ + req: { type: "req", id: "1", method: "talk.session.create" }, + params: { + mode: "stt-tts", + transport: "managed-room", + brain: "direct-tools", + sessionKey: "session:main", + }, + client: { connId: "conn-1", 
connect: { scopes: ["operator.write"] } } as never, + isWebchatConnect: () => false, + respond: rejectedRespond as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + } as never, + }); + + expect(rejectedRespond).toHaveBeenCalledWith( + false, + undefined, + expect.objectContaining({ + code: ErrorCodes.INVALID_REQUEST, + message: 'talk.session.create brain="direct-tools" requires gateway scope: operator.admin', + }), + ); + expect(mocks.resolveSessionKeyFromResolveParams).not.toHaveBeenCalled(); + + const createRespond = vi.fn(); + await talkHandlers["talk.session.create"]({ + req: { type: "req", id: "2", method: "talk.session.create" }, + params: { + mode: "stt-tts", + transport: "managed-room", + brain: "direct-tools", + sessionKey: "session:main", + }, + client: { connId: "conn-1", connect: { scopes: ["operator.admin"] } } as never, + isWebchatConnect: () => false, + respond: createRespond as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + } as never, + }); + + const session = createRespond.mock.calls[0]?.[1] as { sessionId: string }; + expect(createRespond).toHaveBeenCalledWith( + true, + expect.objectContaining({ + sessionId: expect.any(String), + transport: "managed-room", + brain: "direct-tools", + }), + undefined, + ); + + await talkHandlers["talk.session.close"]({ + req: { type: "req", id: "3", method: "talk.session.close" }, + params: { sessionId: session.sessionId }, + client: { connId: "conn-1", connect: { scopes: ["operator.admin"] } } as never, + isWebchatConnect: () => false, + respond: vi.fn() as never, + context: {} as never, + }); + }); + + it("keeps browser-owned transports on the existing realtime endpoint", async () => { + const respond = vi.fn(); + await talkHandlers["talk.session.create"]({ + req: { type: "req", id: "1", method: "talk.session.create" }, + params: { mode: "realtime", transport: "webrtc" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: respond as 
never, + context: { getRuntimeConfig: () => ({}) as OpenClawConfig } as never, + }); + + expect(respond).toHaveBeenCalledWith( + false, + undefined, + expect.objectContaining({ + code: ErrorCodes.INVALID_REQUEST, + message: expect.stringContaining("use talk.realtime.session"), + }), + ); + }); +}); + +describe("talk.realtime.toolCall handler", () => { + beforeEach(() => { + vi.clearAllMocks(); + mocks.chatSend.mockImplementation( + async ({ + respond, + }: { + respond: (ok: boolean, result?: unknown, error?: unknown) => void; + }) => { + respond(true, { runId: "run-voice-1" }, undefined); + }, + ); + }); + + it("starts agent consult through gateway policy instead of exposing chat.send to browser clients", async () => { + const respond = vi.fn(); + + await talkHandlers["talk.realtime.toolCall"]({ + req: { type: "req", id: "1", method: "talk.realtime.toolCall" }, + params: { + sessionKey: "main", + callId: "call-1", + name: "openclaw_agent_consult", + args: { question: "What is in this repo?", responseStyle: "one sentence" }, + }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: respond as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + } as never, + }); + + expect(mocks.chatSend).toHaveBeenCalledWith( + expect.objectContaining({ + req: expect.objectContaining({ method: "chat.send" }), + params: expect.objectContaining({ + sessionKey: "main", + message: expect.stringContaining("What is in this repo?"), + idempotencyKey: expect.stringMatching(/^talk-call-1-/), + }), + }), + ); + expect(respond).toHaveBeenCalledWith( + true, + { + runId: "run-voice-1", + idempotencyKey: expect.stringMatching(/^talk-call-1-/), + }, + undefined, + ); + }); + + it("links relay-owned agent consult runs so relay cancellation can abort them", async () => { + const respond = vi.fn(); + + await talkHandlers["talk.realtime.toolCall"]({ + req: { type: "req", id: "1", method: "talk.realtime.toolCall" }, + params: { + sessionKey: "main", 
+ relaySessionId: "relay-1", + callId: "call-1", + name: "openclaw_agent_consult", + args: { question: "What now?" }, + }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: respond as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + } as never, + }); + + expect(mocks.registerTalkRealtimeRelayAgentRun).toHaveBeenCalledWith({ + relaySessionId: "relay-1", + connId: "conn-1", + sessionKey: "main", + runId: "run-voice-1", + }); + expect(respond).toHaveBeenCalledWith( + true, + expect.objectContaining({ runId: "run-voice-1" }), + undefined, + ); + }); + + it("rejects realtime tool calls that are not the agent consult tool", async () => { + const respond = vi.fn(); + + await talkHandlers["talk.realtime.toolCall"]({ + req: { type: "req", id: "1", method: "talk.realtime.toolCall" }, + params: { + sessionKey: "main", + callId: "call-1", + name: "unknown_tool", + }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: respond as never, + context: { + getRuntimeConfig: () => ({}) as OpenClawConfig, + } as never, + }); + + expect(mocks.chatSend).not.toHaveBeenCalled(); + expect(respond).toHaveBeenCalledWith( + false, + undefined, + expect.objectContaining({ + code: ErrorCodes.INVALID_REQUEST, + message: "unsupported realtime Talk tool: unknown_tool", + }), + ); + }); +}); + describe("talk.realtime.session handler", () => { beforeEach(() => { vi.clearAllMocks(); @@ -256,7 +1420,6 @@ describe("talk.realtime.session handler", () => { createBrowserSession, createBridge, }; - mocks.getRealtimeVoiceProvider.mockReturnValue(provider); mocks.resolveConfiguredRealtimeVoiceProvider.mockReturnValue({ provider, providerConfig: { apiKey: "gemini-key" }, @@ -284,8 +1447,10 @@ describe("talk.realtime.session handler", () => { getRuntimeConfig: () => ({ talk: { - provider: "google", - providers: { google: { apiKey: "gemini-key" } }, + realtime: { + provider: "google", + providers: { google: { apiKey: 
"gemini-key" } }, + }, }, }) as OpenClawConfig, } as never, @@ -309,4 +1474,533 @@ describe("talk.realtime.session handler", () => { undefined, ); }); + + it("uses talk.realtime provider, model, and voice without reading speech provider config", async () => { + const createBrowserSession = vi.fn(async () => ({ + provider: "openai", + transport: "webrtc" as const, + clientSecret: "secret", + })); + const provider = { + id: "openai", + label: "OpenAI Realtime", + isConfigured: () => true, + createBrowserSession, + createBridge: vi.fn(), + }; + mocks.resolveConfiguredRealtimeVoiceProvider.mockReturnValue({ + provider, + providerConfig: { apiKey: "openai-key", model: "gpt-realtime" }, + }); + + const respond = vi.fn(); + await talkHandlers["talk.realtime.session"]({ + req: { type: "req", id: "1", method: "talk.realtime.session" }, + params: { sessionKey: "main" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: respond as never, + context: { + getRuntimeConfig: () => + ({ + talk: { + provider: "elevenlabs", + providers: { elevenlabs: { apiKey: "speech-key" } }, + realtime: { + provider: "openai", + providers: { openai: { apiKey: "openai-key" } }, + model: "gpt-realtime", + voice: "alloy", + }, + }, + }) as OpenClawConfig, + } as never, + }); + + expect(mocks.resolveConfiguredRealtimeVoiceProvider).toHaveBeenCalledWith( + expect.objectContaining({ + configuredProviderId: "openai", + providerConfigs: { openai: { apiKey: "openai-key" } }, + }), + ); + expect(createBrowserSession).toHaveBeenCalledWith( + expect.objectContaining({ + model: "gpt-realtime", + voice: "alloy", + }), + ); + expect(respond).toHaveBeenCalledWith( + true, + expect.objectContaining({ provider: "openai" }), + undefined, + ); + }); + + it("rejects managed-room browser sessions until a real room client exists", async () => { + const respond = vi.fn(); + await talkHandlers["talk.realtime.session"]({ + req: { type: "req", id: "1", method: "talk.realtime.session" }, + 
params: { sessionKey: "main", mode: "realtime", transport: "managed-room" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: respond as never, + context: { getRuntimeConfig: () => ({}) as OpenClawConfig } as never, + }); + + expect(respond).toHaveBeenCalledWith( + false, + undefined, + expect.objectContaining({ + message: "managed-room realtime Talk sessions are not available in the browser UI yet", + }), + ); + expect(mocks.resolveConfiguredRealtimeVoiceProvider).not.toHaveBeenCalled(); + }); + + it("uses the gateway relay when requested instead of creating a browser-owned provider session", async () => { + const createBrowserSession = vi.fn(); + const createBridge = vi.fn(); + const provider = { + id: "openai", + label: "OpenAI Realtime", + isConfigured: () => true, + createBrowserSession, + createBridge, + }; + mocks.resolveConfiguredRealtimeVoiceProvider.mockReturnValue({ + provider, + providerConfig: { apiKey: "openai-key" }, + }); + mocks.createTalkRealtimeRelaySession.mockReturnValue({ + provider: "openai", + transport: "gateway-relay", + relaySessionId: "relay-1", + audio: { + inputEncoding: "pcm16", + inputSampleRateHz: 24000, + outputEncoding: "pcm16", + outputSampleRateHz: 24000, + }, + }); + + const respond = vi.fn(); + await talkHandlers["talk.realtime.session"]({ + req: { type: "req", id: "1", method: "talk.realtime.session" }, + params: { sessionKey: "main", transport: "gateway-relay", brain: "agent-consult" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: respond as never, + context: { + getRuntimeConfig: () => + ({ + talk: { + realtime: { + provider: "openai", + providers: { openai: { apiKey: "openai-key" } }, + }, + }, + }) as OpenClawConfig, + } as never, + }); + + expect(createBrowserSession).not.toHaveBeenCalled(); + expect(mocks.createTalkRealtimeRelaySession).toHaveBeenCalledWith( + expect.objectContaining({ + connId: "conn-1", + provider, + }), + ); + 
expect(respond).toHaveBeenCalledWith( + true, + expect.objectContaining({ transport: "gateway-relay" }), + undefined, + ); + }); + + it("uses the configured gateway relay transport when request params omit transport", async () => { + const createBrowserSession = vi.fn(); + const provider = { + id: "openai", + label: "OpenAI Realtime", + isConfigured: () => true, + createBrowserSession, + createBridge: vi.fn(), + }; + mocks.resolveConfiguredRealtimeVoiceProvider.mockReturnValue({ + provider, + providerConfig: { apiKey: "openai-key" }, + }); + mocks.createTalkRealtimeRelaySession.mockReturnValue({ + provider: "openai", + transport: "gateway-relay", + relaySessionId: "relay-from-config", + audio: { + inputEncoding: "pcm16", + inputSampleRateHz: 24000, + outputEncoding: "pcm16", + outputSampleRateHz: 24000, + }, + }); + + const respond = vi.fn(); + await talkHandlers["talk.realtime.session"]({ + req: { type: "req", id: "1", method: "talk.realtime.session" }, + params: { sessionKey: "main" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: respond as never, + context: { + getRuntimeConfig: () => + ({ + talk: { + realtime: { + provider: "openai", + providers: { openai: { apiKey: "openai-key" } }, + transport: "gateway-relay", + brain: "agent-consult", + }, + }, + }) as OpenClawConfig, + } as never, + }); + + expect(createBrowserSession).not.toHaveBeenCalled(); + expect(mocks.createTalkRealtimeRelaySession).toHaveBeenCalledWith( + expect.objectContaining({ connId: "conn-1", provider }), + ); + expect(respond).toHaveBeenCalledWith( + true, + expect.objectContaining({ relaySessionId: "relay-from-config" }), + undefined, + ); + }); + + it("rejects configured realtime brains the browser endpoint cannot run", async () => { + const respond = vi.fn(); + await talkHandlers["talk.realtime.session"]({ + req: { type: "req", id: "1", method: "talk.realtime.session" }, + params: { sessionKey: "main" }, + client: { connId: "conn-1" } as never, + 
isWebchatConnect: () => false, + respond: respond as never, + context: { + getRuntimeConfig: () => + ({ + talk: { + realtime: { + brain: "direct-tools", + }, + }, + }) as OpenClawConfig, + } as never, + }); + + expect(mocks.resolveConfiguredRealtimeVoiceProvider).not.toHaveBeenCalled(); + expect(respond).toHaveBeenCalledWith( + false, + undefined, + expect.objectContaining({ + message: 'talk.realtime.session only supports brain="agent-consult"', + }), + ); + }); + + it("forwards realtime relay control requests by connection id", async () => { + const respondAudio = vi.fn(); + await talkHandlers["talk.realtime.relayAudio"]({ + req: { type: "req", id: "1", method: "talk.realtime.relayAudio" }, + params: { relaySessionId: "relay-1", audioBase64: "aGVsbG8=", timestamp: 123 }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: respondAudio as never, + context: {} as never, + }); + + expect(mocks.sendTalkRealtimeRelayAudio).toHaveBeenCalledWith({ + relaySessionId: "relay-1", + connId: "conn-1", + audioBase64: "aGVsbG8=", + timestamp: 123, + }); + expect(respondAudio).toHaveBeenCalledWith(true, { ok: true }, undefined); + + const respondMark = vi.fn(); + await talkHandlers["talk.realtime.relayMark"]({ + req: { type: "req", id: "2", method: "talk.realtime.relayMark" }, + params: { relaySessionId: "relay-1", markName: "mark-1" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: respondMark as never, + context: {} as never, + }); + + expect(mocks.acknowledgeTalkRealtimeRelayMark).toHaveBeenCalledWith({ + relaySessionId: "relay-1", + connId: "conn-1", + }); + expect(respondMark).toHaveBeenCalledWith(true, { ok: true }, undefined); + + const respondCancel = vi.fn(); + await talkHandlers["talk.realtime.relayCancel"]({ + req: { type: "req", id: "3", method: "talk.realtime.relayCancel" }, + params: { relaySessionId: "relay-1", reason: "barge-in" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () 
=> false, + respond: respondCancel as never, + context: {} as never, + }); + + expect(mocks.cancelTalkRealtimeRelayTurn).toHaveBeenCalledWith({ + relaySessionId: "relay-1", + connId: "conn-1", + reason: "barge-in", + }); + expect(respondCancel).toHaveBeenCalledWith(true, { ok: true }, undefined); + + const respondToolResult = vi.fn(); + await talkHandlers["talk.realtime.relayToolResult"]({ + req: { type: "req", id: "4", method: "talk.realtime.relayToolResult" }, + params: { relaySessionId: "relay-1", callId: "call-1", result: { ok: true } }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: respondToolResult as never, + context: {} as never, + }); + + expect(mocks.submitTalkRealtimeRelayToolResult).toHaveBeenCalledWith({ + relaySessionId: "relay-1", + connId: "conn-1", + callId: "call-1", + result: { ok: true }, + }); + expect(respondToolResult).toHaveBeenCalledWith(true, { ok: true }, undefined); + + const respondStop = vi.fn(); + await talkHandlers["talk.realtime.relayStop"]({ + req: { type: "req", id: "5", method: "talk.realtime.relayStop" }, + params: { relaySessionId: "relay-1" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: respondStop as never, + context: {} as never, + }); + + expect(mocks.stopTalkRealtimeRelaySession).toHaveBeenCalledWith({ + relaySessionId: "relay-1", + connId: "conn-1", + }); + expect(respondStop).toHaveBeenCalledWith(true, { ok: true }, undefined); + }); +}); + +describe("talk.transcription relay handlers", () => { + beforeEach(() => { + vi.clearAllMocks(); + mocks.listRealtimeTranscriptionProviders.mockReturnValue([]); + }); + + it("creates a transcription-only gateway relay session without mutating global config", async () => { + const sttSession = { + connect: vi.fn(), + sendAudio: vi.fn(), + close: vi.fn(), + isConnected: vi.fn(() => true), + }; + const provider = { + id: "openai", + label: "OpenAI Realtime Transcription", + autoSelectOrder: 1, + 
resolveConfig: vi.fn(({ rawConfig }) => rawConfig), + isConfigured: vi.fn(({ providerConfig }) => providerConfig.apiKey === "stt-key"), + createSession: vi.fn(() => sttSession), + }; + mocks.listRealtimeTranscriptionProviders.mockReturnValue([provider as never]); + mocks.createTalkTranscriptionRelaySession.mockReturnValue({ + provider: "openai", + mode: "transcription", + transport: "gateway-relay", + transcriptionSessionId: "stt-1", + audio: { inputEncoding: "pcm16", inputSampleRateHz: 24000 }, + expiresAt: 123, + }); + const runtimeConfig = { + plugins: { + entries: { + "voice-call": { + config: { + streaming: { + provider: "openai", + providers: { openai: { apiKey: "stt-key" } }, + }, + }, + }, + }, + }, + talk: { + provider: "elevenlabs", + providers: { elevenlabs: { apiKey: "speech-key" } }, + }, + } as OpenClawConfig; + const respond = vi.fn(); + + await talkHandlers["talk.transcription.session"]({ + req: { type: "req", id: "1", method: "talk.transcription.session" }, + params: {}, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: respond as never, + context: { + getRuntimeConfig: () => runtimeConfig, + } as never, + }); + + expect(provider.resolveConfig).toHaveBeenCalledWith({ + cfg: runtimeConfig, + rawConfig: { apiKey: "stt-key" }, + }); + expect(provider.isConfigured).toHaveBeenCalledWith({ + cfg: runtimeConfig, + providerConfig: { apiKey: "stt-key" }, + }); + expect(mocks.createTalkTranscriptionRelaySession).toHaveBeenCalledWith( + expect.objectContaining({ + connId: "conn-1", + provider, + providerConfig: { apiKey: "stt-key" }, + }), + ); + expect(respond).toHaveBeenCalledWith( + true, + expect.objectContaining({ + mode: "transcription", + transport: "gateway-relay", + transcriptionSessionId: "stt-1", + }), + undefined, + ); + expect(runtimeConfig.talk?.provider).toBe("elevenlabs"); + }); + + it("resolves transcription provider config keyed by requested alias", async () => { + const sttSession = { + connect: vi.fn(), + 
sendAudio: vi.fn(), + close: vi.fn(), + isConnected: vi.fn(() => true), + }; + const provider = { + id: "openai", + aliases: ["openai-realtime"], + label: "OpenAI Realtime Transcription", + autoSelectOrder: 1, + resolveConfig: vi.fn(({ rawConfig }) => rawConfig), + isConfigured: vi.fn(({ providerConfig }) => providerConfig.apiKey === "alias-key"), + createSession: vi.fn(() => sttSession), + }; + mocks.listRealtimeTranscriptionProviders.mockReturnValue([provider as never]); + mocks.createTalkTranscriptionRelaySession.mockReturnValue({ + provider: "openai", + mode: "transcription", + transport: "gateway-relay", + transcriptionSessionId: "stt-alias", + audio: { inputEncoding: "pcm16", inputSampleRateHz: 24000 }, + expiresAt: 123, + }); + const runtimeConfig = { + plugins: { + entries: { + "voice-call": { + config: { + streaming: { + provider: "openai-realtime", + providers: { "openai-realtime": { apiKey: "alias-key" } }, + }, + }, + }, + }, + }, + } as OpenClawConfig; + const respond = vi.fn(); + + await talkHandlers["talk.transcription.session"]({ + req: { type: "req", id: "1", method: "talk.transcription.session" }, + params: {}, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: respond as never, + context: { + getRuntimeConfig: () => runtimeConfig, + } as never, + }); + + expect(provider.resolveConfig).toHaveBeenCalledWith({ + cfg: runtimeConfig, + rawConfig: { apiKey: "alias-key" }, + }); + expect(mocks.createTalkTranscriptionRelaySession).toHaveBeenCalledWith( + expect.objectContaining({ + provider, + providerConfig: { apiKey: "alias-key" }, + }), + ); + expect(respond).toHaveBeenCalledWith( + true, + expect.objectContaining({ + transcriptionSessionId: "stt-alias", + }), + undefined, + ); + }); + + it("forwards transcription relay audio, cancel, and stop requests by connection id", async () => { + const respondAudio = vi.fn(); + await talkHandlers["talk.transcription.relayAudio"]({ + req: { type: "req", id: "1", method: 
"talk.transcription.relayAudio" }, + params: { transcriptionSessionId: "stt-1", audioBase64: "aGVsbG8=" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: respondAudio as never, + context: {} as never, + }); + + expect(mocks.sendTalkTranscriptionRelayAudio).toHaveBeenCalledWith({ + transcriptionSessionId: "stt-1", + connId: "conn-1", + audioBase64: "aGVsbG8=", + }); + expect(respondAudio).toHaveBeenCalledWith(true, { ok: true }, undefined); + + const respondCancel = vi.fn(); + await talkHandlers["talk.transcription.relayCancel"]({ + req: { type: "req", id: "2", method: "talk.transcription.relayCancel" }, + params: { transcriptionSessionId: "stt-1", reason: "barge-in" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: respondCancel as never, + context: {} as never, + }); + + expect(mocks.cancelTalkTranscriptionRelayTurn).toHaveBeenCalledWith({ + transcriptionSessionId: "stt-1", + connId: "conn-1", + reason: "barge-in", + }); + expect(respondCancel).toHaveBeenCalledWith(true, { ok: true }, undefined); + + const respondStop = vi.fn(); + await talkHandlers["talk.transcription.relayStop"]({ + req: { type: "req", id: "3", method: "talk.transcription.relayStop" }, + params: { transcriptionSessionId: "stt-1" }, + client: { connId: "conn-1" } as never, + isWebchatConnect: () => false, + respond: respondStop as never, + context: {} as never, + }); + + expect(mocks.stopTalkTranscriptionRelaySession).toHaveBeenCalledWith({ + transcriptionSessionId: "stt-1", + connId: "conn-1", + }); + expect(respondStop).toHaveBeenCalledWith(true, { ok: true }, undefined); + }); }); diff --git a/src/gateway/server-methods/talk.ts b/src/gateway/server-methods/talk.ts index b036166a0d2..35a91224c8c 100644 --- a/src/gateway/server-methods/talk.ts +++ b/src/gateway/server-methods/talk.ts @@ -1,3 +1,4 @@ +import { randomUUID } from "node:crypto"; import { readConfigFileSnapshot } from "../../config/config.js"; import { 
redactConfigObject } from "../../config/redact-snapshot.js"; import { @@ -7,47 +8,103 @@ import { } from "../../config/talk.js"; import type { TalkConfigResponse, TalkProviderConfig } from "../../config/types.gateway.js"; import type { OpenClawConfig, TtsConfig, TtsProviderConfigMap } from "../../config/types.js"; +import { listRealtimeTranscriptionProviders } from "../../realtime-transcription/provider-registry.js"; import { REALTIME_VOICE_AGENT_CONSULT_TOOL, REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME, + buildRealtimeVoiceAgentConsultChatMessage, } from "../../realtime-voice/agent-consult-tool.js"; -import { getRealtimeVoiceProvider } from "../../realtime-voice/provider-registry.js"; +import { + canonicalizeRealtimeVoiceProviderId, + listRealtimeVoiceProviders, +} from "../../realtime-voice/provider-registry.js"; import { resolveConfiguredRealtimeVoiceProvider } from "../../realtime-voice/provider-resolver.js"; -import type { - RealtimeVoiceBrowserSession, - RealtimeVoiceProviderConfig, -} from "../../realtime-voice/provider-types.js"; import { normalizeLowercaseStringOrEmpty, normalizeOptionalLowercaseString, normalizeOptionalString, } from "../../shared/string-coerce.js"; -import { canonicalizeSpeechProviderId, getSpeechProvider } from "../../tts/provider-registry.js"; -import { synthesizeSpeech, type TtsDirectiveOverrides } from "../../tts/tts.js"; +import { + canonicalizeSpeechProviderId, + getSpeechProvider, + listSpeechProviders, +} from "../../tts/provider-registry.js"; +import { + getResolvedSpeechProviderConfig, + resolveTtsConfig, + synthesizeSpeech, + type TtsDirectiveOverrides, +} from "../../tts/tts.js"; import { ADMIN_SCOPE, TALK_SECRETS_SCOPE } from "../operator-scopes.js"; import { ErrorCodes, errorShape, formatValidationErrors, + type ErrorShape, type TalkSpeakParams, + validateTalkCatalogParams, validateTalkConfigParams, + validateTalkHandoffCreateParams, + validateTalkHandoffJoinParams, + validateTalkHandoffRevokeParams, + 
validateTalkHandoffTurnCancelParams, + validateTalkHandoffTurnEndParams, + validateTalkHandoffTurnStartParams, validateTalkModeParams, validateTalkRealtimeRelayAudioParams, + validateTalkRealtimeRelayCancelParams, validateTalkRealtimeRelayMarkParams, validateTalkRealtimeRelayStopParams, validateTalkRealtimeRelayToolResultParams, validateTalkRealtimeSessionParams, + validateTalkRealtimeToolCallParams, + validateTalkTranscriptionRelayAudioParams, + validateTalkTranscriptionRelayCancelParams, + validateTalkTranscriptionRelayStopParams, + validateTalkTranscriptionSessionParams, validateTalkSpeakParams, } from "../protocol/index.js"; +import { resolveSessionKeyFromResolveParams } from "../sessions-resolve.js"; +import { + cancelTalkHandoffTurn, + createTalkHandoff, + endTalkHandoffTurn, + joinTalkHandoff, + revokeTalkHandoff, + startTalkHandoffTurn, +} from "../talk-handoff.js"; import { acknowledgeTalkRealtimeRelayMark, + cancelTalkRealtimeRelayTurn, createTalkRealtimeRelaySession, + registerTalkRealtimeRelayAgentRun, sendTalkRealtimeRelayAudio, stopTalkRealtimeRelaySession, submitTalkRealtimeRelayToolResult, } from "../talk-realtime-relay.js"; +import { + cancelTalkTranscriptionRelayTurn, + createTalkTranscriptionRelaySession, + sendTalkTranscriptionRelayAudio, + stopTalkTranscriptionRelaySession, +} from "../talk-transcription-relay.js"; import { formatForLog } from "../ws-log.js"; +import { chatHandlers } from "./chat.js"; import { asRecord } from "./record-shared.js"; +import { talkSessionHandlers } from "./talk-session.js"; +import { + broadcastTalkRoomEvents, + buildRealtimeInstructions, + buildTalkRealtimeConfig, + buildTalkTranscriptionConfig, + canUseTalkDirectTools, + configuredOrFalse, + getVoiceCallStreamingConfig, + isUnsupportedBrowserWebRtcSession, + resolveConfiguredRealtimeTranscriptionProvider, + talkHandoffErrorCode, + withRealtimeBrowserOverrides, +} from "./talk-shared.js"; import type { GatewayRequestHandlers } from "./types.js"; type 
TalkSpeakReason = @@ -158,83 +215,117 @@ function buildTalkTtsConfig( }; } -function getRecord(value: unknown): Record | undefined { - return asRecord(value) ?? undefined; -} +function buildTalkCatalog(config: OpenClawConfig) { + const ttsConfig = resolveTtsConfig(config); + const talkResolved = resolveActiveTalkProviderConfig(config.talk); + const activeSpeechProvider = canonicalizeSpeechProviderId(talkResolved?.provider, config); + const streamingConfig = getVoiceCallStreamingConfig(config); + const realtimeConfig = buildTalkRealtimeConfig(config); + const activeRealtimeProvider = canonicalizeRealtimeVoiceProviderId( + realtimeConfig.provider, + config, + ); -function getVoiceCallRealtimeConfig(config: OpenClawConfig): { - provider?: string; - providers?: Record; -} { - const plugins = getRecord(config.plugins); - const entries = getRecord(plugins?.entries); - const voiceCall = getRecord(entries?.["voice-call"]); - const pluginConfig = getRecord(voiceCall?.config); - const realtime = getRecord(pluginConfig?.realtime); - const providersRaw = getRecord(realtime?.providers); - const providers: Record = {}; - if (providersRaw) { - for (const [providerId, providerConfig] of Object.entries(providersRaw)) { - const record = getRecord(providerConfig); - if (record) { - providers[providerId] = record; - } - } - } return { - provider: normalizeOptionalString(realtime?.provider), - providers: Object.keys(providers).length > 0 ? providers : undefined, - }; -} - -function buildTalkRealtimeConfig(config: OpenClawConfig, requestedProvider?: string) { - const voiceCallRealtime = getVoiceCallRealtimeConfig(config); - const talkProviderConfigs = config.talk?.providers as - | Record - | undefined; - const talkProvider = normalizeOptionalString(config.talk?.provider); - const talkProviderSupportsRealtime = talkProvider - ? Boolean(getRealtimeVoiceProvider(talkProvider, config)) - : false; - const provider = - normalizeOptionalString(requestedProvider) ?? 
- (talkProviderSupportsRealtime ? talkProvider : undefined) ?? - voiceCallRealtime.provider; - return { - provider, - providers: { - ...voiceCallRealtime.providers, - ...talkProviderConfigs, + modes: ["realtime", "stt-tts", "transcription"], + transports: ["webrtc", "provider-websocket", "gateway-relay", "managed-room"], + brains: ["agent-consult", "direct-tools", "none"], + speech: { + ...(activeSpeechProvider ? { activeProvider: activeSpeechProvider } : {}), + providers: listSpeechProviders(config).map((provider) => { + const entry: Record = { + id: provider.id, + label: provider.label, + configured: configuredOrFalse(() => + provider.isConfigured({ + cfg: config, + providerConfig: getResolvedSpeechProviderConfig(ttsConfig, provider.id, config), + timeoutMs: ttsConfig.timeoutMs, + }), + ), + modes: ["stt-tts"], + brains: ["agent-consult"], + }; + if (provider.models) { + entry.models = [...provider.models]; + } + if (provider.voices) { + entry.voices = [...provider.voices]; + } + return entry; + }), + }, + transcription: { + ...(streamingConfig.provider ? { activeProvider: streamingConfig.provider } : {}), + providers: listRealtimeTranscriptionProviders(config).map((provider) => { + const rawConfig = streamingConfig.providers?.[provider.id] ?? {}; + const providerConfig = provider.resolveConfig?.({ cfg: config, rawConfig }) ?? rawConfig; + const entry: Record = { + id: provider.id, + label: provider.label, + configured: configuredOrFalse(() => + provider.isConfigured({ cfg: config, providerConfig }), + ), + modes: ["transcription"], + transports: ["gateway-relay"], + brains: ["none"], + }; + if (provider.defaultModel) { + entry.defaultModel = provider.defaultModel; + } + return entry; + }), + }, + realtime: { + ...(activeRealtimeProvider ? { activeProvider: activeRealtimeProvider } : {}), + providers: listRealtimeVoiceProviders(config).map((provider) => { + const rawConfig = realtimeConfig.providers?.[provider.id] ?? 
{}; + const providerConfig = provider.resolveConfig?.({ cfg: config, rawConfig }) ?? rawConfig; + const capabilities = provider.capabilities; + const entry: Record = { + id: provider.id, + label: provider.label, + configured: configuredOrFalse(() => + provider.isConfigured({ cfg: config, providerConfig }), + ), + modes: ["realtime"], + brains: capabilities?.supportsToolCalls === false ? ["none"] : ["agent-consult"], + supportsBrowserSession: Boolean( + capabilities?.supportsBrowserSession ?? provider.createBrowserSession, + ), + }; + if (provider.defaultModel) { + entry.defaultModel = provider.defaultModel; + } + if (capabilities?.transports) { + entry.transports = [...capabilities.transports]; + } + if (capabilities?.inputAudioFormats) { + entry.inputAudioFormats = capabilities.inputAudioFormats.map((format) => ({ ...format })); + } + if (capabilities?.outputAudioFormats) { + entry.outputAudioFormats = capabilities.outputAudioFormats.map((format) => ({ + ...format, + })); + } + if (capabilities?.supportsBargeIn !== undefined) { + entry.supportsBargeIn = capabilities.supportsBargeIn; + } + if (capabilities?.supportsToolCalls !== undefined) { + entry.supportsToolCalls = capabilities.supportsToolCalls; + } + if (capabilities?.supportsVideoFrames !== undefined) { + entry.supportsVideoFrames = capabilities.supportsVideoFrames; + } + if (capabilities?.supportsSessionResumption !== undefined) { + entry.supportsSessionResumption = capabilities.supportsSessionResumption; + } + return entry; + }), }, }; } -function buildRealtimeInstructions(): string { - return `You are OpenClaw's realtime voice interface. Keep spoken replies concise. 
If the user asks for code, repository state, tools, files, current OpenClaw context, or deeper reasoning, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} and then summarize the result naturally.`; -} - -function withRealtimeBrowserOverrides( - providerConfig: RealtimeVoiceProviderConfig, - params: { model?: string; voice?: string }, -): RealtimeVoiceProviderConfig { - const overrides: RealtimeVoiceProviderConfig = {}; - const model = normalizeOptionalString(params.model); - const voice = normalizeOptionalString(params.voice); - if (model) { - overrides.model = model; - } - if (voice) { - overrides.voice = voice; - } - return Object.keys(overrides).length > 0 ? { ...providerConfig, ...overrides } : providerConfig; -} - -function isUnsupportedBrowserWebRtcSession(session: RealtimeVoiceBrowserSession): boolean { - const provider = normalizeLowercaseStringOrEmpty(session.provider); - const transport = (session as { transport?: string }).transport ?? "webrtc-sdp"; - return provider === "google" && transport === "webrtc-sdp"; -} - function isFallbackEligibleTalkReason(reason: TalkSpeakReason): boolean { return ( reason === "talk_unconfigured" || @@ -443,7 +534,89 @@ function stripUnresolvedSecretApiKeyFromRecord( return rest; } +async function startRealtimeToolCallAgentConsult(params: { + sessionKey: string; + callId: string; + args: unknown; + relaySessionId?: string; + connId?: string; + request: Parameters[0]; +}): Promise< + { ok: true; runId: string; idempotencyKey: string } | { ok: false; error: ErrorShape } +> { + let message: string; + try { + message = buildRealtimeVoiceAgentConsultChatMessage(params.args); + } catch (err) { + return { ok: false, error: errorShape(ErrorCodes.INVALID_REQUEST, formatForLog(err)) }; + } + const idempotencyKey = `talk-${params.callId}-${randomUUID()}`; + let chatResponse: { ok: true; result: unknown } | { ok: false; error: ErrorShape } | undefined; + await chatHandlers["chat.send"]({ + ...params.request, + req: { + type: "req", + id: 
`${params.request.req.id}:talk-tool-call`, + method: "chat.send", + }, + params: { + sessionKey: params.sessionKey, + message, + idempotencyKey, + }, + respond: (ok: boolean, result?: unknown, error?: ErrorShape) => { + chatResponse = ok + ? { ok: true, result } + : { + ok: false, + error: error ?? errorShape(ErrorCodes.UNAVAILABLE, "chat.send failed without error"), + }; + }, + } as never); + + if (!chatResponse) { + return { + ok: false, + error: errorShape(ErrorCodes.UNAVAILABLE, "chat.send did not return a realtime tool result"), + }; + } + if (!chatResponse.ok) { + return { ok: false, error: chatResponse.error }; + } + const runId = normalizeOptionalString(asRecord(chatResponse.result)?.runId) ?? idempotencyKey; + if (params.relaySessionId && params.connId) { + registerTalkRealtimeRelayAgentRun({ + relaySessionId: params.relaySessionId, + connId: params.connId, + sessionKey: params.sessionKey, + runId, + }); + } + return { ok: true, runId, idempotencyKey }; +} + export const talkHandlers: GatewayRequestHandlers = { + ...talkSessionHandlers, + "talk.catalog": async ({ params, respond, context }) => { + const catalogParams = params ?? 
{}; + if (!validateTalkCatalogParams(catalogParams)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.catalog params: ${formatValidationErrors(validateTalkCatalogParams.errors)}`, + ), + ); + return; + } + + try { + respond(true, buildTalkCatalog(context.getRuntimeConfig()), undefined); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, "talk.config": async ({ params, respond, client, context }) => { if (!validateTalkConfigParams(params)) { respond( @@ -492,6 +665,200 @@ export const talkHandlers: GatewayRequestHandlers = { respond(true, { config: configPayload }, undefined); }, + "talk.handoff.create": async ({ params, respond, client, context }) => { + if (!validateTalkHandoffCreateParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.handoff.create params: ${formatValidationErrors(validateTalkHandoffCreateParams.errors)}`, + ), + ); + return; + } + if (params.brain === "direct-tools" && !canUseTalkDirectTools(client)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `talk.handoff.create brain="direct-tools" requires gateway scope: ${ADMIN_SCOPE}`, + ), + ); + return; + } + const resolvedSession = await resolveSessionKeyFromResolveParams({ + cfg: context.getRuntimeConfig(), + p: { + key: params.sessionKey, + includeGlobal: true, + includeUnknown: true, + }, + }); + if (!resolvedSession.ok) { + respond(false, undefined, resolvedSession.error); + return; + } + respond(true, createTalkHandoff({ ...params, sessionKey: resolvedSession.key }), undefined); + }, + "talk.handoff.join": async ({ params, respond, client, context }) => { + if (!validateTalkHandoffJoinParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.handoff.join params: ${formatValidationErrors(validateTalkHandoffJoinParams.errors)}`, + ), + ); + return; + 
} + const result = joinTalkHandoff(params.id, params.token, { clientId: client?.connId }); + if (!result.ok) { + respond( + false, + undefined, + errorShape( + result.reason === "invalid_token" ? ErrorCodes.INVALID_REQUEST : ErrorCodes.UNAVAILABLE, + `talk handoff join failed: ${result.reason}`, + ), + ); + return; + } + broadcastTalkRoomEvents(context, result.replacedClientId, { + handoffId: result.record.id, + roomId: result.record.roomId, + events: result.replacementEvents, + }); + broadcastTalkRoomEvents(context, client?.connId, { + handoffId: result.record.id, + roomId: result.record.roomId, + events: result.activeClientEvents, + }); + respond(true, result.record, undefined); + }, + "talk.handoff.revoke": async ({ params, respond, context }) => { + if (!validateTalkHandoffRevokeParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.handoff.revoke params: ${formatValidationErrors(validateTalkHandoffRevokeParams.errors)}`, + ), + ); + return; + } + const result = revokeTalkHandoff(params.id); + broadcastTalkRoomEvents(context, result.activeClientId, { + handoffId: params.id, + roomId: result.roomId ?? 
"", + events: result.events, + }); + respond(true, { ok: true, revoked: result.revoked }, undefined); + }, + "talk.handoff.turnStart": async ({ params, respond, client, context }) => { + if (!validateTalkHandoffTurnStartParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.handoff.turnStart params: ${formatValidationErrors(validateTalkHandoffTurnStartParams.errors)}`, + ), + ); + return; + } + const result = startTalkHandoffTurn(params.id, params.token, { + turnId: params.turnId, + clientId: client?.connId, + }); + if (!result.ok) { + respond( + false, + undefined, + errorShape( + talkHandoffErrorCode(result.reason), + `talk handoff turn start failed: ${result.reason}`, + ), + ); + return; + } + broadcastTalkRoomEvents(context, result.record.room.activeClientId, { + handoffId: result.record.id, + roomId: result.record.roomId, + events: result.events, + }); + respond(true, result, undefined); + }, + "talk.handoff.turnEnd": async ({ params, respond, context }) => { + if (!validateTalkHandoffTurnEndParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.handoff.turnEnd params: ${formatValidationErrors(validateTalkHandoffTurnEndParams.errors)}`, + ), + ); + return; + } + const result = endTalkHandoffTurn(params.id, params.token, { + turnId: params.turnId, + }); + if (!result.ok) { + respond( + false, + undefined, + errorShape( + talkHandoffErrorCode(result.reason), + `talk handoff turn end failed: ${result.reason}`, + ), + ); + return; + } + broadcastTalkRoomEvents(context, result.record.room.activeClientId, { + handoffId: result.record.id, + roomId: result.record.roomId, + events: result.events, + }); + respond(true, result, undefined); + }, + "talk.handoff.turnCancel": async ({ params, respond, context }) => { + if (!validateTalkHandoffTurnCancelParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid 
talk.handoff.turnCancel params: ${formatValidationErrors(validateTalkHandoffTurnCancelParams.errors)}`, + ), + ); + return; + } + const result = cancelTalkHandoffTurn(params.id, params.token, { + turnId: params.turnId, + reason: params.reason, + }); + if (!result.ok) { + respond( + false, + undefined, + errorShape( + talkHandoffErrorCode(result.reason), + `talk handoff turn cancel failed: ${result.reason}`, + ), + ); + return; + } + broadcastTalkRoomEvents(context, result.record.room.activeClientId, { + handoffId: result.record.id, + roomId: result.record.roomId, + events: result.events, + }); + respond(true, result, undefined); + }, "talk.realtime.session": async ({ params, respond, context, client }) => { if (!validateTalkRealtimeSessionParams(params)) { respond( @@ -508,10 +875,54 @@ export const talkHandlers: GatewayRequestHandlers = { provider?: string; model?: string; voice?: string; + mode?: string; + transport?: string; + brain?: string; }; try { const runtimeConfig = context.getRuntimeConfig(); const realtimeConfig = buildTalkRealtimeConfig(runtimeConfig, typedParams.provider); + const mode = + normalizeOptionalLowercaseString(typedParams.mode) ?? realtimeConfig.mode ?? "realtime"; + if (mode !== "realtime") { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `talk.realtime.session only supports mode="realtime"; use talk.catalog for ${mode} provider discovery`, + ), + ); + return; + } + const brain = + normalizeOptionalLowercaseString(typedParams.brain) ?? + realtimeConfig.brain ?? + "agent-consult"; + if (brain !== "agent-consult") { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `talk.realtime.session only supports brain="agent-consult"`, + ), + ); + return; + } + const transport = + normalizeOptionalLowercaseString(typedParams.transport) ?? 
realtimeConfig.transport; + if (transport === "managed-room") { + respond( + false, + undefined, + errorShape( + ErrorCodes.UNAVAILABLE, + "managed-room realtime Talk sessions are not available in the browser UI yet", + ), + ); + return; + } const resolution = resolveConfiguredRealtimeVoiceProvider({ configuredProviderId: realtimeConfig.provider, providerConfigs: realtimeConfig.providers, @@ -519,18 +930,32 @@ export const talkHandlers: GatewayRequestHandlers = { cfgForResolve: runtimeConfig, noRegisteredProviderMessage: "No realtime voice provider registered", }); - if (resolution.provider.createBrowserSession) { + if (resolution.provider.createBrowserSession && transport !== "gateway-relay") { const session = await resolution.provider.createBrowserSession({ providerConfig: resolution.providerConfig, instructions: buildRealtimeInstructions(), tools: [REALTIME_VOICE_AGENT_CONSULT_TOOL], - model: normalizeOptionalString(typedParams.model), - voice: normalizeOptionalString(typedParams.voice), + model: normalizeOptionalString(typedParams.model) ?? realtimeConfig.model, + voice: normalizeOptionalString(typedParams.voice) ?? realtimeConfig.voice, }); - if (!isUnsupportedBrowserWebRtcSession(session)) { + if ( + !isUnsupportedBrowserWebRtcSession(session) && + (!transport || session.transport === transport) + ) { respond(true, session, undefined); return; } + if (transport) { + respond( + false, + undefined, + errorShape( + ErrorCodes.UNAVAILABLE, + `Realtime provider "${resolution.provider.id}" does not support requested browser transport "${transport}"`, + ), + ); + return; + } } const connId = client?.connId; @@ -542,8 +967,8 @@ export const talkHandlers: GatewayRequestHandlers = { ); return; } - const model = normalizeOptionalString(typedParams.model); - const voice = normalizeOptionalString(typedParams.voice); + const model = normalizeOptionalString(typedParams.model) ?? realtimeConfig.model; + const voice = normalizeOptionalString(typedParams.voice) ?? 
realtimeConfig.voice; const session = createTalkRealtimeRelaySession({ context, connId, @@ -559,6 +984,49 @@ export const talkHandlers: GatewayRequestHandlers = { respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); } }, + "talk.realtime.toolCall": async (request) => { + const { params, respond } = request; + if (!validateTalkRealtimeToolCallParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.realtime.toolCall params: ${formatValidationErrors(validateTalkRealtimeToolCallParams.errors)}`, + ), + ); + return; + } + if (params.name !== REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME) { + respond( + false, + undefined, + errorShape(ErrorCodes.INVALID_REQUEST, `unsupported realtime Talk tool: ${params.name}`), + ); + return; + } + + const result = await startRealtimeToolCallAgentConsult({ + sessionKey: params.sessionKey, + callId: params.callId, + args: params.args ?? {}, + relaySessionId: normalizeOptionalString(params.relaySessionId), + connId: normalizeOptionalString(request.client?.connId), + request, + }); + if (!result.ok) { + respond(false, undefined, result.error); + return; + } + respond( + true, + { + runId: result.runId, + idempotencyKey: result.idempotencyKey, + }, + undefined, + ); + }, "talk.realtime.relayAudio": async ({ params, respond, client }) => { if (!validateTalkRealtimeRelayAudioParams(params)) { respond( @@ -612,6 +1080,34 @@ export const talkHandlers: GatewayRequestHandlers = { respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); } }, + "talk.realtime.relayCancel": async ({ params, respond, client }) => { + if (!validateTalkRealtimeRelayCancelParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.realtime.relayCancel params: ${formatValidationErrors(validateTalkRealtimeRelayCancelParams.errors)}`, + ), + ); + return; + } + const connId = client?.connId; + if (!connId) { + 
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, "realtime relay unavailable")); + return; + } + try { + cancelTalkRealtimeRelayTurn({ + relaySessionId: params.relaySessionId, + connId, + reason: normalizeOptionalString(params.reason), + }); + respond(true, { ok: true }, undefined); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, "talk.realtime.relayStop": async ({ params, respond, client }) => { if (!validateTalkRealtimeRelayStopParams(params)) { respond( @@ -665,6 +1161,141 @@ export const talkHandlers: GatewayRequestHandlers = { respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); } }, + "talk.transcription.session": async ({ params, respond, context, client }) => { + if (!validateTalkTranscriptionSessionParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.transcription.session params: ${formatValidationErrors(validateTalkTranscriptionSessionParams.errors)}`, + ), + ); + return; + } + const connId = client?.connId; + if (!connId) { + respond( + false, + undefined, + errorShape(ErrorCodes.UNAVAILABLE, "transcription relay requires a connected client"), + ); + return; + } + try { + const runtimeConfig = context.getRuntimeConfig(); + const transcriptionConfig = buildTalkTranscriptionConfig(runtimeConfig, params.provider); + const resolution = resolveConfiguredRealtimeTranscriptionProvider({ + config: runtimeConfig, + configuredProviderId: transcriptionConfig.provider, + providerConfigs: transcriptionConfig.providers, + }); + const session = createTalkTranscriptionRelaySession({ + context, + connId, + provider: resolution.provider, + providerConfig: resolution.providerConfig, + }); + respond(true, session, undefined); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, + "talk.transcription.relayAudio": async ({ params, respond, client }) => { + if 
(!validateTalkTranscriptionRelayAudioParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.transcription.relayAudio params: ${formatValidationErrors(validateTalkTranscriptionRelayAudioParams.errors)}`, + ), + ); + return; + } + const connId = client?.connId; + if (!connId) { + respond( + false, + undefined, + errorShape(ErrorCodes.UNAVAILABLE, "transcription relay unavailable"), + ); + return; + } + try { + sendTalkTranscriptionRelayAudio({ + transcriptionSessionId: params.transcriptionSessionId, + connId, + audioBase64: params.audioBase64, + }); + respond(true, { ok: true }, undefined); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, + "talk.transcription.relayCancel": async ({ params, respond, client }) => { + if (!validateTalkTranscriptionRelayCancelParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.transcription.relayCancel params: ${formatValidationErrors(validateTalkTranscriptionRelayCancelParams.errors)}`, + ), + ); + return; + } + const connId = client?.connId; + if (!connId) { + respond( + false, + undefined, + errorShape(ErrorCodes.UNAVAILABLE, "transcription relay unavailable"), + ); + return; + } + try { + cancelTalkTranscriptionRelayTurn({ + transcriptionSessionId: params.transcriptionSessionId, + connId, + reason: normalizeOptionalString(params.reason), + }); + respond(true, { ok: true }, undefined); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, + "talk.transcription.relayStop": async ({ params, respond, client }) => { + if (!validateTalkTranscriptionRelayStopParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.transcription.relayStop params: ${formatValidationErrors(validateTalkTranscriptionRelayStopParams.errors)}`, + ), + ); + return; + } + const 
connId = client?.connId; + if (!connId) { + respond( + false, + undefined, + errorShape(ErrorCodes.UNAVAILABLE, "transcription relay unavailable"), + ); + return; + } + try { + stopTalkTranscriptionRelaySession({ + transcriptionSessionId: params.transcriptionSessionId, + connId, + }); + respond(true, { ok: true }, undefined); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, "talk.speak": async ({ params, respond, context }) => { if (!validateTalkSpeakParams(params)) { respond( @@ -763,11 +1394,11 @@ export const talkHandlers: GatewayRequestHandlers = { } }, "talk.mode": ({ params, respond, context, client, isWebchatConnect }) => { - if (client && isWebchatConnect(client.connect) && !context.hasConnectedMobileNode()) { + if (client && isWebchatConnect(client.connect) && !context.hasConnectedTalkNode()) { respond( false, undefined, - errorShape(ErrorCodes.UNAVAILABLE, "talk disabled: no connected iOS/Android nodes"), + errorShape(ErrorCodes.UNAVAILABLE, "talk disabled: no connected Talk-capable nodes"), ); return; } diff --git a/src/gateway/server-mobile-nodes.ts b/src/gateway/server-mobile-nodes.ts deleted file mode 100644 index 12535fc0176..00000000000 --- a/src/gateway/server-mobile-nodes.ts +++ /dev/null @@ -1,12 +0,0 @@ -import { normalizeOptionalLowercaseString } from "../shared/string-coerce.js"; -import type { NodeRegistry } from "./node-registry.js"; - -export function hasConnectedMobileNode(registry: NodeRegistry): boolean { - const connected = registry.listConnected(); - return connected.some((n) => { - const platform = normalizeOptionalLowercaseString(n.platform) ?? 
""; - return ( - platform.startsWith("ios") || platform.startsWith("ipados") || platform.startsWith("android") - ); - }); -} diff --git a/src/gateway/server-node-session-runtime.ts b/src/gateway/server-node-session-runtime.ts index dc8bd815780..ffdee8f9782 100644 --- a/src/gateway/server-node-session-runtime.ts +++ b/src/gateway/server-node-session-runtime.ts @@ -4,8 +4,8 @@ import { createSessionMessageSubscriberRegistry, } from "./server-chat-state.js"; import { safeParseJson } from "./server-json.js"; -import { hasConnectedMobileNode } from "./server-mobile-nodes.js"; import { createNodeSubscriptionManager } from "./server-node-subscriptions.js"; +import { hasConnectedTalkNode } from "./server-talk-nodes.js"; export function createGatewayNodeSessionRuntime(params: { broadcast: (event: string, payload: unknown, opts?: { dropIfSlow?: boolean }) => void; @@ -26,7 +26,7 @@ export function createGatewayNodeSessionRuntime(params: { const broadcastVoiceWakeChanged = (triggers: string[]) => { params.broadcast("voicewake.changed", { triggers }, { dropIfSlow: true }); }; - const hasMobileNodeConnected = () => hasConnectedMobileNode(nodeRegistry); + const hasTalkNodeConnected = () => hasConnectedTalkNode(nodeRegistry); return { nodeRegistry, @@ -39,6 +39,6 @@ export function createGatewayNodeSessionRuntime(params: { nodeUnsubscribe: nodeSubscriptions.unsubscribe, nodeUnsubscribeAll: nodeSubscriptions.unsubscribeAll, broadcastVoiceWakeChanged, - hasMobileNodeConnected, + hasTalkNodeConnected, }; } diff --git a/src/gateway/server-request-context.test.ts b/src/gateway/server-request-context.test.ts index 736b7d3b61f..93afcc9e3ad 100644 --- a/src/gateway/server-request-context.test.ts +++ b/src/gateway/server-request-context.test.ts @@ -33,7 +33,7 @@ describe("createGatewayRequestContext", () => { nodeSubscribe: vi.fn(), nodeUnsubscribe: vi.fn(), nodeUnsubscribeAll: vi.fn(), - hasConnectedMobileNode: vi.fn(() => false), + hasConnectedTalkNode: vi.fn(() => false), clients: new 
Set(), enforceSharedGatewayAuthGenerationForConfigWrite: vi.fn(), nodeRegistry: {} as never, diff --git a/src/gateway/server-request-context.ts b/src/gateway/server-request-context.ts index 7fc674cc308..0a43c5d1977 100644 --- a/src/gateway/server-request-context.ts +++ b/src/gateway/server-request-context.ts @@ -28,7 +28,7 @@ type GatewayRequestContextParams = { nodeSubscribe: GatewayRequestContext["nodeSubscribe"]; nodeUnsubscribe: GatewayRequestContext["nodeUnsubscribe"]; nodeUnsubscribeAll: GatewayRequestContext["nodeUnsubscribeAll"]; - hasConnectedMobileNode: GatewayRequestContext["hasConnectedMobileNode"]; + hasConnectedTalkNode: GatewayRequestContext["hasConnectedTalkNode"]; clients: Set; enforceSharedGatewayAuthGenerationForConfigWrite: (nextConfig: OpenClawConfig) => void; nodeRegistry: GatewayRequestContext["nodeRegistry"]; @@ -92,7 +92,7 @@ export function createGatewayRequestContext( nodeSubscribe: params.nodeSubscribe, nodeUnsubscribe: params.nodeUnsubscribe, nodeUnsubscribeAll: params.nodeUnsubscribeAll, - hasConnectedMobileNode: params.hasConnectedMobileNode, + hasConnectedTalkNode: params.hasConnectedTalkNode, hasExecApprovalClients: (excludeConnId?: string) => { for (const gatewayClient of params.clients) { if (excludeConnId && gatewayClient.connId === excludeConnId) { diff --git a/src/gateway/server.impl.ts b/src/gateway/server.impl.ts index b8c870df5b9..0c7758653b3 100644 --- a/src/gateway/server.impl.ts +++ b/src/gateway/server.impl.ts @@ -884,7 +884,7 @@ export async function startGatewayServer( nodeUnsubscribe, nodeUnsubscribeAll, broadcastVoiceWakeChanged, - hasMobileNodeConnected, + hasTalkNodeConnected, } = createGatewayNodeSessionRuntime({ broadcast }); applyGatewayLaneConcurrency(cfgAtStart); @@ -1261,7 +1261,7 @@ export async function startGatewayServer( nodeSubscribe, nodeUnsubscribe, nodeUnsubscribeAll, - hasConnectedMobileNode: hasMobileNodeConnected, + hasConnectedTalkNode: hasTalkNodeConnected, clients, 
enforceSharedGatewayAuthGenerationForConfigWrite: (nextConfig: OpenClawConfig) => { enforceSharedGatewaySessionGenerationForConfigWrite({ diff --git a/src/gateway/talk-handoff.test.ts b/src/gateway/talk-handoff.test.ts new file mode 100644 index 00000000000..c17e8e746f3 --- /dev/null +++ b/src/gateway/talk-handoff.test.ts @@ -0,0 +1,286 @@ +import { describe, expect, it, vi } from "vitest"; +import { + cancelTalkHandoffTurn, + clearTalkHandoffsForTest, + createTalkHandoff, + endTalkHandoffTurn, + getTalkHandoff, + joinTalkHandoff, + revokeTalkHandoff, + startTalkHandoffTurn, + verifyTalkHandoffToken, +} from "./talk-handoff.js"; + +describe("talk handoff store", () => { + it("creates an expiring managed-room handoff without storing the plaintext token", () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date("2026-05-05T12:00:00.000Z")); + clearTalkHandoffsForTest(); + + const handoff = createTalkHandoff({ + sessionKey: "session:main", + sessionId: "session-id", + channel: "discord", + target: "dm:123", + provider: "openai", + model: "gpt-realtime-1.5", + voice: "alloy", + ttlMs: 5000, + }); + const record = getTalkHandoff(handoff.id); + + expect(handoff).toMatchObject({ + roomId: `talk_${handoff.id}`, + roomUrl: `/talk/rooms/talk_${handoff.id}`, + sessionKey: "session:main", + sessionId: "session-id", + channel: "discord", + target: "dm:123", + provider: "openai", + model: "gpt-realtime-1.5", + voice: "alloy", + mode: "stt-tts", + transport: "managed-room", + brain: "agent-consult", + createdAt: Date.parse("2026-05-05T12:00:00.000Z"), + expiresAt: Date.parse("2026-05-05T12:00:05.000Z"), + room: { + activeClientId: undefined, + recentTalkEvents: [ + expect.objectContaining({ + type: "session.started", + sessionId: `talk_${handoff.id}`, + transport: "managed-room", + }), + ], + }, + }); + expect(handoff).not.toHaveProperty("tokenHash"); + expect(record?.tokenHash).toBeTruthy(); + expect(record?.tokenHash).not.toBe(handoff.token); + expect(record && 
verifyTalkHandoffToken(record, handoff.token)).toBe(true); + + vi.advanceTimersByTime(5001); + expect(getTalkHandoff(handoff.id)).toBeUndefined(); + vi.useRealTimers(); + }); + + it("joins and revokes handoffs with only the bearer token", () => { + clearTalkHandoffsForTest(); + const handoff = createTalkHandoff({ sessionKey: "session:main" }); + + expect(joinTalkHandoff(handoff.id, "wrong")).toEqual({ + ok: false, + reason: "invalid_token", + }); + expect(joinTalkHandoff(handoff.id, handoff.token)).toMatchObject({ + ok: true, + events: [expect.objectContaining({ type: "session.ready" })], + record: expect.objectContaining({ + id: handoff.id, + roomId: handoff.roomId, + sessionKey: "session:main", + }), + }); + + expect(revokeTalkHandoff(handoff.id)).toMatchObject({ revoked: true }); + expect(joinTalkHandoff(handoff.id, handoff.token)).toEqual({ + ok: false, + reason: "not_found", + }); + }); + + it("records managed-room ready, replacement, and close lifecycle events", () => { + clearTalkHandoffsForTest(); + const handoff = createTalkHandoff({ sessionKey: "session:main" }); + + const firstJoin = joinTalkHandoff(handoff.id, handoff.token, { clientId: "conn-1" }); + expect(firstJoin).toMatchObject({ + ok: true, + events: [ + expect.objectContaining({ + type: "session.ready", + sessionId: handoff.roomId, + payload: expect.objectContaining({ clientId: "conn-1" }), + }), + ], + record: { + room: expect.objectContaining({ + activeClientId: "conn-1", + }), + }, + }); + + const secondJoin = joinTalkHandoff(handoff.id, handoff.token, { clientId: "conn-2" }); + expect(secondJoin).toMatchObject({ + ok: true, + events: [ + expect.objectContaining({ + type: "session.replaced", + sessionId: handoff.roomId, + payload: expect.objectContaining({ + previousClientId: "conn-1", + nextClientId: "conn-2", + }), + }), + expect.objectContaining({ + type: "session.ready", + sessionId: handoff.roomId, + payload: expect.objectContaining({ clientId: "conn-2" }), + }), + ], + record: { + room: 
expect.objectContaining({ + activeClientId: "conn-2", + }), + }, + }); + + expect(revokeTalkHandoff(handoff.id)).toMatchObject({ + revoked: true, + activeClientId: "conn-2", + events: [ + expect.objectContaining({ + type: "session.closed", + sessionId: handoff.roomId, + payload: expect.objectContaining({ reason: "revoked" }), + final: true, + }), + ], + }); + }); + + it("records managed-room turn start, end, and cancellation events", () => { + clearTalkHandoffsForTest(); + const handoff = createTalkHandoff({ sessionKey: "session:main" }); + joinTalkHandoff(handoff.id, handoff.token, { clientId: "conn-1" }); + + const start = startTalkHandoffTurn(handoff.id, handoff.token, { + clientId: "conn-1", + turnId: "turn-1", + }); + expect(start).toMatchObject({ + ok: true, + turnId: "turn-1", + events: [expect.objectContaining({ type: "turn.started", turnId: "turn-1" })], + record: { + room: expect.objectContaining({ + activeClientId: "conn-1", + activeTurnId: "turn-1", + }), + }, + }); + + expect(endTalkHandoffTurn(handoff.id, handoff.token)).toMatchObject({ + ok: true, + turnId: "turn-1", + events: [ + expect.objectContaining({ + type: "turn.ended", + turnId: "turn-1", + final: true, + }), + ], + record: { + room: expect.not.objectContaining({ + activeTurnId: expect.any(String), + }), + }, + }); + + expect(cancelTalkHandoffTurn(handoff.id, handoff.token)).toEqual({ + ok: false, + reason: "no_active_turn", + }); + + startTalkHandoffTurn(handoff.id, handoff.token, { turnId: "turn-2" }); + expect(cancelTalkHandoffTurn(handoff.id, handoff.token, { reason: "barge-in" })).toMatchObject({ + ok: true, + turnId: "turn-2", + events: [ + expect.objectContaining({ + type: "turn.cancelled", + turnId: "turn-2", + final: true, + payload: expect.objectContaining({ reason: "barge-in" }), + }), + ], + }); + }); + + it("rejects stale managed-room turn completion without clearing the active turn", () => { + clearTalkHandoffsForTest(); + const handoff = createTalkHandoff({ sessionKey: 
"session:main" }); + + startTalkHandoffTurn(handoff.id, handoff.token, { turnId: "turn-old" }); + startTalkHandoffTurn(handoff.id, handoff.token, { turnId: "turn-current" }); + + expect(endTalkHandoffTurn(handoff.id, handoff.token, { turnId: "turn-old" })).toEqual({ + ok: false, + reason: "stale_turn", + }); + expect(getTalkHandoff(handoff.id)?.room.talk.activeTurnId).toBe("turn-current"); + + expect(cancelTalkHandoffTurn(handoff.id, handoff.token, { turnId: "turn-old" })).toEqual({ + ok: false, + reason: "stale_turn", + }); + expect(getTalkHandoff(handoff.id)?.room.talk.activeTurnId).toBe("turn-current"); + + expect(endTalkHandoffTurn(handoff.id, handoff.token, { turnId: "turn-current" })).toMatchObject( + { + ok: true, + turnId: "turn-current", + }, + ); + }); + + it("isolates simultaneous handoffs for different sessions on the same host", () => { + clearTalkHandoffsForTest(); + + const first = createTalkHandoff({ + sessionKey: "agent:main:first", + channel: "browser", + target: "host:local", + provider: "openai", + }); + const second = createTalkHandoff({ + sessionKey: "agent:main:second", + channel: "browser", + target: "host:local", + }); + + expect(first.id).not.toBe(second.id); + expect(first.roomId).not.toBe(second.roomId); + expect(first.token).not.toBe(second.token); + expect(joinTalkHandoff(first.id, second.token)).toEqual({ + ok: false, + reason: "invalid_token", + }); + expect(joinTalkHandoff(second.id, first.token)).toEqual({ + ok: false, + reason: "invalid_token", + }); + expect(joinTalkHandoff(first.id, first.token)).toMatchObject({ + ok: true, + events: [expect.objectContaining({ type: "session.ready" })], + record: expect.objectContaining({ + roomId: first.roomId, + sessionKey: "agent:main:first", + channel: "browser", + target: "host:local", + provider: "openai", + }), + }); + expect(joinTalkHandoff(second.id, second.token)).toMatchObject({ + ok: true, + events: [expect.objectContaining({ type: "session.ready" })], + record: 
/**
 * In-memory store for Talk "handoffs": short-lived, token-protected managed
 * rooms that let a client join a Talk session for a given `sessionKey`.
 *
 * Only a SHA-256 digest of the bearer token is kept on the record; the
 * plaintext token is returned once, from {@link createTalkHandoff}.
 */
import { createHash, randomBytes, randomUUID, timingSafeEqual } from "node:crypto";
import {
  createTalkSessionController,
  type TalkBrain,
  type TalkEvent,
  type TalkEventInput,
  type TalkMode,
  type TalkSessionController,
  type TalkTransport,
} from "../realtime-voice/talk-session-controller.js";

/** TTL applied when the caller supplies none (or a non-finite value): 10 minutes. */
const DEFAULT_TALK_HANDOFF_TTL_MS = 10 * 60 * 1000;
/** Upper clamp for caller-supplied TTLs: 1 hour. */
const MAX_TALK_HANDOFF_TTL_MS = 60 * 60 * 1000;
/** Lower clamp for caller-supplied TTLs: 1 second. */
const MIN_TALK_HANDOFF_TTL_MS = 1000;

export type TalkHandoffCreateParams = {
  sessionKey: string;
  sessionId?: string;
  channel?: string;
  target?: string;
  provider?: string;
  model?: string;
  voice?: string;
  mode?: TalkMode;
  transport?: TalkTransport;
  brain?: TalkBrain;
  ttlMs?: number;
};

/** Internal record, including the token digest and the live room controller. */
export type TalkHandoffRecord = {
  id: string;
  roomId: string;
  roomUrl: string;
  tokenHash: string;
  sessionKey: string;
  sessionId?: string;
  channel?: string;
  target?: string;
  provider?: string;
  model?: string;
  voice?: string;
  mode: TalkMode;
  transport: TalkTransport;
  brain: TalkBrain;
  createdAt: number;
  expiresAt: number;
  room: TalkHandoffRoomState;
};

/** Record shape safe to hand to callers: no `tokenHash`, and a plain-data room snapshot. */
export type TalkHandoffPublicRecord = Omit<TalkHandoffRecord, "tokenHash" | "room"> & {
  room: {
    activeClientId?: string;
    activeTurnId?: string;
    recentTalkEvents: TalkEvent[];
  };
};

export type TalkHandoffCreateResult = TalkHandoffPublicRecord & {
  token: string;
};

export type TalkHandoffJoinResult =
  | {
      ok: true;
      record: TalkHandoffPublicRecord;
      events: TalkEvent[];
      replacedClientId?: string;
      replacementEvents: TalkEvent[];
      activeClientEvents: TalkEvent[];
    }
  | { ok: false; reason: "not_found" | "expired" | "invalid_token" };

export type TalkHandoffRevokeResult = {
  revoked: boolean;
  roomId?: string;
  activeClientId?: string;
  events: TalkEvent[];
};

export type TalkHandoffTurnResult =
  | {
      ok: true;
      record: TalkHandoffPublicRecord;
      turnId: string;
      events: TalkEvent[];
    }
  | {
      ok: false;
      reason: "not_found" | "expired" | "invalid_token" | "no_active_turn" | "stale_turn";
    };

type TalkHandoffRoomState = {
  activeClientId?: string;
  talk: TalkSessionController;
};

type TalkHandoffAccess =
  | { ok: true; record: TalkHandoffRecord }
  | { ok: false; reason: "not_found" | "expired" | "invalid_token" };

/** Module-level store, keyed by handoff id. */
const handoffs = new Map<string, TalkHandoffRecord>();

/**
 * Creates a handoff and its room, emits `session.started`, and stores the record.
 *
 * Unspecified `mode` / `transport` / `brain` default to `"stt-tts"`,
 * `"managed-room"` and `"agent-consult"`.
 *
 * @returns The public record plus the plaintext bearer token.
 */
export function createTalkHandoff(params: TalkHandoffCreateParams): TalkHandoffCreateResult {
  pruneExpiredTalkHandoffs();
  const createdAt = Date.now();
  const ttlMs = normalizeTtlMs(params.ttlMs);
  const id = randomUUID();
  const roomId = `talk_${id}`;
  const token = randomBytes(32).toString("base64url");
  // Resolve the defaults once so the room controller and the record always agree.
  const mode = params.mode ?? "stt-tts";
  const transport = params.transport ?? "managed-room";
  const brain = params.brain ?? "agent-consult";
  const room = createTalkHandoffRoom({
    roomId,
    mode,
    transport,
    brain,
    provider: params.provider,
  });
  const record: TalkHandoffRecord = {
    id,
    roomId,
    roomUrl: `/talk/rooms/${roomId}`,
    tokenHash: hashTalkHandoffToken(token),
    sessionKey: params.sessionKey,
    sessionId: params.sessionId,
    channel: params.channel,
    target: params.target,
    provider: params.provider,
    model: params.model,
    voice: params.voice,
    mode,
    transport,
    brain,
    createdAt,
    expiresAt: createdAt + ttlMs,
    room,
  };
  appendTalkHandoffRoomEvent(record, {
    type: "session.started",
    payload: { handoffId: id, roomId },
  });
  handoffs.set(id, record);
  return { ...toPublicTalkHandoffRecord(record), token };
}

/** Returns the internal record for `id`, after dropping every expired handoff. */
export function getTalkHandoff(id: string): TalkHandoffRecord | undefined {
  pruneExpiredTalkHandoffs();
  return handoffs.get(id);
}

/**
 * Joins the room guarded by `token`.
 *
 * When a different client was already active, a `session.replaced` event is
 * emitted before `session.ready`; the result splits those into
 * `replacementEvents` and `activeClientEvents`.
 */
export function joinTalkHandoff(
  id: string,
  token: string,
  opts: { clientId?: string } = {},
): TalkHandoffJoinResult {
  const access = resolveTalkHandoffAccess(id, token);
  if (!access.ok) {
    return access;
  }
  const record = access.record;
  // Capture before joining: joinTalkHandoffRoom overwrites activeClientId.
  const previousClientId = record.room.activeClientId;
  const events = joinTalkHandoffRoom(record, opts.clientId);
  const replacedClientId =
    previousClientId && previousClientId !== opts.clientId ? previousClientId : undefined;
  const replacementEvents = replacedClientId
    ? events.filter((event) => event.type === "session.replaced")
    : [];
  const activeClientEvents = replacedClientId
    ? events.filter((event) => event.type !== "session.replaced")
    : events;
  return {
    ok: true,
    record: toPublicTalkHandoffRecord(record),
    events,
    replacedClientId,
    replacementEvents,
    activeClientEvents,
  };
}

/**
 * Starts a turn in the room. A blank or missing `turnId` is replaced by a
 * fresh UUID; a provided `clientId` becomes the room's active client.
 */
export function startTalkHandoffTurn(
  id: string,
  token: string,
  opts: { turnId?: string; clientId?: string } = {},
): TalkHandoffTurnResult {
  const access = resolveTalkHandoffAccess(id, token);
  if (!access.ok) {
    return access;
  }
  const record = access.record;
  if (opts.clientId) {
    record.room.activeClientId = opts.clientId;
  }
  const turnId = normalizeOptionalString(opts.turnId) ?? randomUUID();
  const turn = record.room.talk.startTurn({
    turnId,
    payload: { handoffId: id, roomId: record.roomId, clientId: record.room.activeClientId },
  });
  return {
    ok: true,
    record: toPublicTalkHandoffRecord(record),
    turnId,
    events: turn.event ? [turn.event] : [],
  };
}

/** Ends a turn; the controller's failure result is returned unchanged when it rejects. */
export function endTalkHandoffTurn(
  id: string,
  token: string,
  opts: { turnId?: string } = {},
): TalkHandoffTurnResult {
  const access = resolveTalkHandoffAccess(id, token);
  if (!access.ok) {
    return access;
  }
  const record = access.record;
  const result = record.room.talk.endTurn({
    turnId: normalizeOptionalString(opts.turnId),
    payload: { handoffId: id, roomId: record.roomId },
  });
  if (!result.ok) {
    return result;
  }
  return {
    ok: true,
    record: toPublicTalkHandoffRecord(record),
    turnId: result.turnId,
    events: [result.event],
  };
}

/** Cancels a turn; the event payload's `reason` defaults to `"client-cancelled"`. */
export function cancelTalkHandoffTurn(
  id: string,
  token: string,
  opts: { reason?: string; turnId?: string } = {},
): TalkHandoffTurnResult {
  const access = resolveTalkHandoffAccess(id, token);
  if (!access.ok) {
    return access;
  }
  const record = access.record;
  const result = record.room.talk.cancelTurn({
    turnId: normalizeOptionalString(opts.turnId),
    payload: { handoffId: id, roomId: record.roomId, reason: opts.reason ?? "client-cancelled" },
  });
  if (!result.ok) {
    return result;
  }
  return {
    ok: true,
    record: toPublicTalkHandoffRecord(record),
    turnId: result.turnId,
    events: [result.event],
  };
}

/**
 * Removes a handoff without requiring its token, emitting a final
 * `session.closed` event with reason `"revoked"`.
 * Unknown or already-expired ids yield `{ revoked: false, events: [] }`.
 */
export function revokeTalkHandoff(id: string): TalkHandoffRevokeResult {
  pruneExpiredTalkHandoffs();
  const record = handoffs.get(id);
  if (!record) {
    return { revoked: false, events: [] };
  }
  const event = appendTalkHandoffRoomEvent(record, {
    type: "session.closed",
    payload: { reason: "revoked", handoffId: id, roomId: record.roomId },
    final: true,
  });
  handoffs.delete(id);
  return {
    revoked: true,
    roomId: record.roomId,
    activeClientId: record.room.activeClientId,
    events: [event],
  };
}

/**
 * Checks `token` against the digest stored on `record`.
 *
 * The digests are compared with `timingSafeEqual` rather than `===` so the
 * comparison time does not depend on how many leading characters match. The
 * length guard is needed because `timingSafeEqual` throws on unequal lengths.
 */
export function verifyTalkHandoffToken(record: TalkHandoffRecord, token: string): boolean {
  const expected = Buffer.from(record.tokenHash, "utf8");
  const actual = Buffer.from(hashTalkHandoffToken(token), "utf8");
  return expected.length === actual.length && timingSafeEqual(expected, actual);
}

/** Empties the store; intended for tests. */
export function clearTalkHandoffsForTest(): void {
  handoffs.clear();
}

/** Truncates and clamps a caller TTL; missing or non-finite values get the default. */
function normalizeTtlMs(value: number | undefined): number {
  if (value === undefined || !Number.isFinite(value)) {
    return DEFAULT_TALK_HANDOFF_TTL_MS;
  }
  return Math.min(Math.max(Math.trunc(value), MIN_TALK_HANDOFF_TTL_MS), MAX_TALK_HANDOFF_TTL_MS);
}

/** Emits the final `session.closed` (reason `"expired"`) event and drops the record. */
function expireTalkHandoff(id: string, record: TalkHandoffRecord): void {
  appendTalkHandoffRoomEvent(record, {
    type: "session.closed",
    payload: { reason: "expired", handoffId: id, roomId: record.roomId },
    final: true,
  });
  handoffs.delete(id);
}

/** Expires every record whose `expiresAt` is at or before `now`. */
function pruneExpiredTalkHandoffs(now = Date.now()): void {
  for (const [id, record] of handoffs) {
    if (record.expiresAt <= now) {
      expireTalkHandoff(id, record);
    }
  }
}

/** SHA-256 digest of the token, base64url-encoded. */
function hashTalkHandoffToken(token: string): string {
  return createHash("sha256").update(token).digest("base64url");
}

/** Strips `tokenHash` and replaces the live room with a plain-data snapshot. */
function toPublicTalkHandoffRecord(record: TalkHandoffRecord): TalkHandoffPublicRecord {
  const { tokenHash: _tokenHash, room: _room, ...publicRecord } = record;
  return {
    ...publicRecord,
    room: {
      activeClientId: record.room.activeClientId,
      activeTurnId: record.room.talk.activeTurnId,
      // Copy so callers cannot mutate the controller's own event list.
      recentTalkEvents: [...record.room.talk.recentEvents],
    },
  };
}

/** Builds the room state: a session controller whose session id is the room id. */
function createTalkHandoffRoom(params: {
  roomId: string;
  mode: TalkMode;
  transport: TalkTransport;
  brain: TalkBrain;
  provider?: string;
}): TalkHandoffRoomState {
  return {
    talk: createTalkSessionController({
      sessionId: params.roomId,
      mode: params.mode,
      transport: params.transport,
      brain: params.brain,
      provider: params.provider,
    }),
  };
}

/**
 * Looks up `id` and authorizes `token`.
 * Expiry is checked (and the record dropped) before the token is verified.
 */
function resolveTalkHandoffAccess(id: string, token: string): TalkHandoffAccess {
  const record = handoffs.get(id);
  if (!record) {
    return { ok: false, reason: "not_found" };
  }
  if (record.expiresAt <= Date.now()) {
    expireTalkHandoff(id, record);
    return { ok: false, reason: "expired" };
  }
  if (!verifyTalkHandoffToken(record, token)) {
    return { ok: false, reason: "invalid_token" };
  }
  return { ok: true, record };
}

/** Emits `input` through the room's session controller and returns the resulting event. */
function appendTalkHandoffRoomEvent(record: TalkHandoffRecord, input: TalkEventInput): TalkEvent {
  return record.room.talk.emit(input);
}

/**
 * Makes `clientId` the active client.
 * Emits `session.replaced` first when a different client was active, then `session.ready`.
 */
function joinTalkHandoffRoom(record: TalkHandoffRecord, clientId: string | undefined): TalkEvent[] {
  const events: TalkEvent[] = [];
  if (record.room.activeClientId && record.room.activeClientId !== clientId) {
    events.push(
      appendTalkHandoffRoomEvent(record, {
        type: "session.replaced",
        payload: {
          handoffId: record.id,
          roomId: record.roomId,
          previousClientId: record.room.activeClientId,
          nextClientId: clientId,
        },
      }),
    );
  }
  record.room.activeClientId = clientId;
  events.push(
    appendTalkHandoffRoomEvent(record, {
      type: "session.ready",
      payload: { handoffId: record.id, roomId: record.roomId, clientId },
    }),
  );
  return events;
}

/** Trims `value`; empty or missing input becomes `undefined`. */
function normalizeOptionalString(value: string | undefined): string | undefined {
  const trimmed = value?.trim();
  return trimmed ? trimmed : undefined;
}
session.relaySessionId, type: "audio", audioBase64: Buffer.from("audio-out").toString("base64"), - }, + talkEvent: expect.objectContaining({ type: "output.audio.delta" }), + }), }), expect.objectContaining({ - payload: { relaySessionId: session.relaySessionId, type: "mark", markName: "mark-1" }, + payload: expect.objectContaining({ + relaySessionId: session.relaySessionId, + type: "mark", + markName: "mark-1", + talkEvent: expect.objectContaining({ type: "output.audio.done", final: true }), + }), }), expect.objectContaining({ - payload: { + payload: expect.objectContaining({ relaySessionId: session.relaySessionId, type: "transcript", role: "user", text: "hello", final: true, - }, + talkEvent: expect.objectContaining({ type: "transcript.done", final: true }), + }), }), expect.objectContaining({ - payload: { + payload: expect.objectContaining({ + relaySessionId: session.relaySessionId, + type: "transcript", + role: "assistant", + text: "hi there", + final: true, + talkEvent: expect.objectContaining({ + type: "output.text.done", + final: true, + payload: { text: "hi there" }, + }), + }), + }), + expect.objectContaining({ + payload: expect.objectContaining({ relaySessionId: session.relaySessionId, type: "toolCall", itemId: "item-1", callId: "call-1", name: "openclaw_agent_consult", args: { question: "what now" }, - }, + talkEvent: expect.objectContaining({ + type: "tool.call", + itemId: "item-1", + callId: "call-1", + }), + }), }), ]), ); @@ -137,13 +179,66 @@ describe("talk realtime gateway relay", () => { callId: "call-1", result: { ok: true }, }); + cancelTalkRealtimeRelayTurn({ + relaySessionId: session.relaySessionId, + connId: "conn-1", + reason: "barge-in", + }); stopTalkRealtimeRelaySession({ relaySessionId: session.relaySessionId, connId: "conn-1" }); expect(bridge.sendAudio).toHaveBeenCalledWith(Buffer.from("audio-in")); expect(bridge.setMediaTimestamp).toHaveBeenCalledWith(123); expect(bridge.acknowledgeMark).toHaveBeenCalled(); 
expect(bridge.submitToolResult).toHaveBeenCalledWith("call-1", { ok: true }, undefined); + expect(bridge.handleBargeIn).toHaveBeenCalledWith({ audioPlaybackActive: true }); expect(bridge.close).toHaveBeenCalled(); + expect(events).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + payload: expect.objectContaining({ + relaySessionId: session.relaySessionId, + type: "inputAudio", + byteLength: Buffer.from("audio-in").byteLength, + talkEvent: expect.objectContaining({ type: "input.audio.delta" }), + }), + }), + expect.objectContaining({ + payload: expect.objectContaining({ + relaySessionId: session.relaySessionId, + type: "clear", + talkEvent: expect.objectContaining({ + type: "turn.cancelled", + payload: { reason: "barge-in" }, + final: true, + }), + }), + }), + expect.objectContaining({ + payload: expect.objectContaining({ + relaySessionId: session.relaySessionId, + type: "toolResult", + callId: "call-1", + talkEvent: expect.objectContaining({ + type: "tool.result", + callId: "call-1", + final: true, + }), + }), + }), + ]), + ); + expect(events).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + payload: expect.objectContaining({ + relaySessionId: session.relaySessionId, + type: "close", + reason: "completed", + talkEvent: expect.objectContaining({ type: "session.closed", final: true }), + }), + }), + ]), + ); }); it("rejects relay control from a different connection", () => { @@ -155,6 +250,7 @@ describe("talk realtime gateway relay", () => { connect: vi.fn(async () => undefined), sendAudio: vi.fn(), setMediaTimestamp: vi.fn(), + handleBargeIn: vi.fn(), submitToolResult: vi.fn(), acknowledgeMark: vi.fn(), close: vi.fn(), @@ -179,6 +275,303 @@ describe("talk realtime gateway relay", () => { ).toThrow("Unknown realtime relay session"); }); + it("correlates output audio with the active relay turn", () => { + let bridgeRequest: RealtimeVoiceBridgeCreateRequest | undefined; + const provider: RealtimeVoiceProviderPlugin = { + id: 
"relay-test", + label: "Relay Test", + isConfigured: () => true, + createBridge: (req) => { + bridgeRequest = req; + return { + connect: vi.fn(async () => undefined), + sendAudio: vi.fn(), + setMediaTimestamp: vi.fn(), + handleBargeIn: vi.fn(), + submitToolResult: vi.fn(), + acknowledgeMark: vi.fn(), + close: vi.fn(), + isConnected: vi.fn(() => true), + }; + }, + }; + const events: Array<{ + event: string; + payload: { talkEvent?: { type?: string; turnId?: string } }; + }> = []; + const context = { + broadcastToConnIds: ( + event: string, + payload: { talkEvent?: { type?: string; turnId?: string } }, + ) => { + events.push({ event, payload }); + }, + } as never; + const session = createTalkRealtimeRelaySession({ + context, + connId: "conn-1", + provider, + providerConfig: {}, + instructions: "brief", + tools: [], + }); + + sendTalkRealtimeRelayAudio({ + relaySessionId: session.relaySessionId, + connId: "conn-1", + audioBase64: Buffer.from("audio").toString("base64"), + }); + bridgeRequest?.onAudio(Buffer.from("reply")); + + expect( + events.some( + (entry) => + entry.payload.talkEvent?.type === "output.audio.delta" && + entry.payload.talkEvent.turnId === "turn-1", + ), + ).toBe(true); + }); + + it("aborts linked agent consult runs when the relay turn is cancelled", () => { + const abortController = new AbortController(); + const broadcast = vi.fn(); + const nodeSendToSession = vi.fn(); + const removeChatRun = vi.fn(() => ({ sessionKey: "main", clientRunId: "run-1" })); + const provider: RealtimeVoiceProviderPlugin = { + id: "relay-test", + label: "Relay Test", + isConfigured: () => true, + createBridge: () => ({ + connect: vi.fn(async () => undefined), + sendAudio: vi.fn(), + setMediaTimestamp: vi.fn(), + handleBargeIn: vi.fn(), + submitToolResult: vi.fn(), + acknowledgeMark: vi.fn(), + close: vi.fn(), + isConnected: vi.fn(() => true), + }), + }; + const context = { + broadcastToConnIds: vi.fn(), + broadcast, + nodeSendToSession, + chatAbortControllers: new Map([ + 
[ + "run-1", + { + controller: abortController, + sessionId: "run-1", + sessionKey: "main", + startedAtMs: 1, + expiresAtMs: Date.now() + 60_000, + }, + ], + ]), + chatRunBuffers: new Map([["run-1", "partial answer"]]), + chatDeltaSentAt: new Map(), + chatDeltaLastBroadcastLen: new Map(), + chatAbortedRuns: new Map(), + removeChatRun, + agentRunSeq: new Map(), + } as never; + const session = createTalkRealtimeRelaySession({ + context, + connId: "conn-1", + provider, + providerConfig: {}, + instructions: "brief", + tools: [], + }); + + registerTalkRealtimeRelayAgentRun({ + relaySessionId: session.relaySessionId, + connId: "conn-1", + sessionKey: "main", + runId: "run-1", + }); + cancelTalkRealtimeRelayTurn({ + relaySessionId: session.relaySessionId, + connId: "conn-1", + reason: "barge-in", + }); + + expect(abortController.signal.aborted).toBe(true); + expect(removeChatRun).toHaveBeenCalledWith("run-1", "run-1", "main"); + expect(broadcast).toHaveBeenCalledWith( + "chat", + expect.objectContaining({ + runId: "run-1", + sessionKey: "main", + state: "aborted", + stopReason: "barge-in", + }), + ); + expect(nodeSendToSession).toHaveBeenCalledWith( + "main", + "chat", + expect.objectContaining({ runId: "run-1", state: "aborted" }), + ); + }); + + it("aborts linked agent consult runs when the relay session closes", () => { + const abortController = new AbortController(); + const broadcast = vi.fn(); + const nodeSendToSession = vi.fn(); + const removeChatRun = vi.fn(() => ({ sessionKey: "main", clientRunId: "run-1" })); + const provider: RealtimeVoiceProviderPlugin = { + id: "relay-test", + label: "Relay Test", + isConfigured: () => true, + createBridge: () => ({ + connect: vi.fn(async () => undefined), + sendAudio: vi.fn(), + setMediaTimestamp: vi.fn(), + handleBargeIn: vi.fn(), + submitToolResult: vi.fn(), + acknowledgeMark: vi.fn(), + close: vi.fn(), + isConnected: vi.fn(() => true), + }), + }; + const context = { + broadcastToConnIds: vi.fn(), + broadcast, + 
nodeSendToSession, + chatAbortControllers: new Map([ + [ + "run-1", + { + controller: abortController, + sessionId: "run-1", + sessionKey: "main", + startedAtMs: 1, + expiresAtMs: Date.now() + 60_000, + }, + ], + ]), + chatRunBuffers: new Map([["run-1", "partial answer"]]), + chatDeltaSentAt: new Map(), + chatDeltaLastBroadcastLen: new Map(), + chatAbortedRuns: new Map(), + removeChatRun, + agentRunSeq: new Map(), + } as never; + const session = createTalkRealtimeRelaySession({ + context, + connId: "conn-1", + provider, + providerConfig: {}, + instructions: "brief", + tools: [], + }); + + registerTalkRealtimeRelayAgentRun({ + relaySessionId: session.relaySessionId, + connId: "conn-1", + sessionKey: "main", + runId: "run-1", + }); + stopTalkRealtimeRelaySession({ relaySessionId: session.relaySessionId, connId: "conn-1" }); + + expect(abortController.signal.aborted).toBe(true); + expect(broadcast).toHaveBeenCalledWith( + "chat", + expect.objectContaining({ + runId: "run-1", + sessionKey: "main", + state: "aborted", + stopReason: "relay-closed", + }), + ); + expect(nodeSendToSession).toHaveBeenCalledWith( + "main", + "chat", + expect.objectContaining({ runId: "run-1", state: "aborted" }), + ); + }); + + it("aborts linked agent consult runs when the provider closes the relay", () => { + const abortController = new AbortController(); + let bridgeRequest: RealtimeVoiceBridgeCreateRequest | undefined; + const broadcast = vi.fn(); + const nodeSendToSession = vi.fn(); + const removeChatRun = vi.fn(() => ({ sessionKey: "main", clientRunId: "run-1" })); + const provider: RealtimeVoiceProviderPlugin = { + id: "relay-test", + label: "Relay Test", + isConfigured: () => true, + createBridge: (req) => { + bridgeRequest = req; + return { + connect: vi.fn(async () => undefined), + sendAudio: vi.fn(), + setMediaTimestamp: vi.fn(), + handleBargeIn: vi.fn(), + submitToolResult: vi.fn(), + acknowledgeMark: vi.fn(), + close: vi.fn(), + isConnected: vi.fn(() => true), + }; + }, + }; + 
const context = { + broadcastToConnIds: vi.fn(), + broadcast, + nodeSendToSession, + chatAbortControllers: new Map([ + [ + "run-1", + { + controller: abortController, + sessionId: "run-1", + sessionKey: "main", + startedAtMs: 1, + expiresAtMs: Date.now() + 60_000, + }, + ], + ]), + chatRunBuffers: new Map([["run-1", "partial answer"]]), + chatDeltaSentAt: new Map(), + chatDeltaLastBroadcastLen: new Map(), + chatAbortedRuns: new Map(), + removeChatRun, + agentRunSeq: new Map(), + } as never; + const session = createTalkRealtimeRelaySession({ + context, + connId: "conn-1", + provider, + providerConfig: {}, + instructions: "brief", + tools: [], + }); + + registerTalkRealtimeRelayAgentRun({ + relaySessionId: session.relaySessionId, + connId: "conn-1", + sessionKey: "main", + runId: "run-1", + }); + bridgeRequest?.onClose?.("error"); + + expect(abortController.signal.aborted).toBe(true); + expect(broadcast).toHaveBeenCalledWith( + "chat", + expect.objectContaining({ + runId: "run-1", + sessionKey: "main", + state: "aborted", + stopReason: "relay-closed", + }), + ); + expect(nodeSendToSession).toHaveBeenCalledWith( + "main", + "chat", + expect.objectContaining({ runId: "run-1", state: "aborted" }), + ); + }); + it("caps active relay sessions per browser connection", () => { const provider: RealtimeVoiceProviderPlugin = { id: "relay-test", @@ -188,6 +581,7 @@ describe("talk realtime gateway relay", () => { connect: vi.fn(async () => undefined), sendAudio: vi.fn(), setMediaTimestamp: vi.fn(), + handleBargeIn: vi.fn(), submitToolResult: vi.fn(), acknowledgeMark: vi.fn(), close: vi.fn(), diff --git a/src/gateway/talk-realtime-relay.ts b/src/gateway/talk-realtime-relay.ts index dfad1912e9b..a0bf3597086 100644 --- a/src/gateway/talk-realtime-relay.ts +++ b/src/gateway/talk-realtime-relay.ts @@ -10,6 +10,13 @@ import { createRealtimeVoiceBridgeSession, type RealtimeVoiceBridgeSession, } from "../realtime-voice/session-runtime.js"; +import { + type TalkEvent, + type 
TalkEventInput, + type TalkSessionController, + createTalkSessionController, +} from "../realtime-voice/talk-session-controller.js"; +import { abortChatRunById } from "./chat-abort.js"; import type { GatewayRequestContext } from "./server-methods/shared-types.js"; const RELAY_SESSION_TTL_MS = 30 * 60 * 1000; @@ -18,8 +25,9 @@ const MAX_RELAY_SESSIONS_PER_CONN = 2; const MAX_RELAY_SESSIONS_GLOBAL = 64; const RELAY_EVENT = "talk.realtime.relay"; -type TalkRealtimeRelayEvent = +type TalkRealtimeRelayEventPayload = | { relaySessionId: string; type: "ready" } + | { relaySessionId: string; type: "inputAudio"; byteLength: number } | { relaySessionId: string; type: "audio"; audioBase64: string } | { relaySessionId: string; type: "clear" } | { relaySessionId: string; type: "mark"; markName: string } @@ -38,16 +46,21 @@ type TalkRealtimeRelayEvent = name: string; args: unknown; } + | { relaySessionId: string; type: "toolResult"; callId: string } | { relaySessionId: string; type: "error"; message: string } | { relaySessionId: string; type: "close"; reason: "completed" | "error" }; +type TalkRealtimeRelayEvent = TalkRealtimeRelayEventPayload & { talkEvent?: TalkEvent }; + type RelaySession = { id: string; connId: string; context: GatewayRequestContext; bridge: RealtimeVoiceBridgeSession; + talk: TalkSessionController; expiresAtMs: number; cleanupTimer: ReturnType; + activeAgentRuns: Map; }; type CreateTalkRealtimeRelaySessionParams = { @@ -85,14 +98,31 @@ function broadcastToOwner( context.broadcastToConnIds(RELAY_EVENT, event, new Set([connId]), { dropIfSlow: true }); } +function abortRelayAgentRuns(session: RelaySession, reason: string): void { + for (const [runId, sessionKey] of session.activeAgentRuns) { + abortChatRunById(session.context, { + runId, + sessionKey, + stopReason: reason, + }); + } + session.activeAgentRuns.clear(); +} + function closeRelaySession(session: RelaySession, reason: "completed" | "error"): void { relaySessions.delete(session.id); 
clearTimeout(session.cleanupTimer); + abortRelayAgentRuns(session, reason === "error" ? "relay-error" : "relay-closed"); session.bridge.close(); broadcastToOwner(session.context, session.connId, { relaySessionId: session.id, type: "close", reason, + talkEvent: session.talk.emit({ + type: "session.closed", + payload: { reason }, + final: true, + }), }); } @@ -130,9 +160,19 @@ export function createTalkRealtimeRelaySession( enforceRelaySessionLimits(params.connId); const relaySessionId = randomUUID(); const expiresAtMs = Date.now() + RELAY_SESSION_TTL_MS; + const talk = createTalkSessionController({ + sessionId: relaySessionId, + mode: "realtime", + transport: "gateway-relay", + brain: "agent-consult", + provider: params.provider.id, + }); let relay: RelaySession | undefined; - const emit = (event: TalkRealtimeRelayEvent) => - broadcastToOwner(params.context, params.connId, event); + const emit = (event: TalkRealtimeRelayEventPayload, talkEvent?: TalkEventInput) => + broadcastToOwner(params.context, params.connId, { + ...event, + ...(talkEvent ? { talkEvent: talk.emit(talkEvent) } : {}), + }); const bridge = createRealtimeVoiceBridgeSession({ provider: params.provider, providerConfig: params.providerConfig, @@ -142,30 +182,94 @@ export function createTalkRealtimeRelaySession( markStrategy: "transport", audioSink: { isOpen: () => Boolean(relay && relaySessions.has(relay.id)), - sendAudio: (audio) => - emit({ - relaySessionId, - type: "audio", - audioBase64: audio.toString("base64"), - }), - clearAudio: () => emit({ relaySessionId, type: "clear" }), - sendMark: (markName) => emit({ relaySessionId, type: "mark", markName }), + sendAudio: (audio) => { + const turnId = relay ? ensureRelayTurn(relay) : undefined; + emit( + { + relaySessionId, + type: "audio", + audioBase64: audio.toString("base64"), + }, + { + type: "output.audio.delta", + turnId, + payload: { byteLength: audio.length }, + }, + ); + }, + clearAudio: () => { + const turnId = relay ? 
ensureRelayTurn(relay) : undefined; + emit( + { relaySessionId, type: "clear" }, + { + type: "output.audio.done", + turnId, + payload: { reason: "clear" }, + final: true, + }, + ); + }, + sendMark: (markName) => { + const turnId = relay ? ensureRelayTurn(relay) : undefined; + emit( + { relaySessionId, type: "mark", markName }, + { + type: "output.audio.done", + turnId, + payload: { markName }, + final: true, + }, + ); + }, }, onTranscript: (role, text, final) => { - emit({ relaySessionId, type: "transcript", role, text, final }); + const turnId = relay ? ensureRelayTurn(relay) : undefined; + const eventType = + role === "assistant" + ? final + ? "output.text.done" + : "output.text.delta" + : final + ? "transcript.done" + : "transcript.delta"; + const payload = role === "assistant" ? { text } : { role, text }; + emit( + { relaySessionId, type: "transcript", role, text, final }, + { + type: eventType, + turnId, + payload, + final, + }, + ); }, onToolCall: (toolCall) => { - emit({ - relaySessionId, - type: "toolCall", - itemId: toolCall.itemId, - callId: toolCall.callId, - name: toolCall.name, - args: toolCall.args, - }); + const turnId = relay ? 
ensureRelayTurn(relay) : undefined; + emit( + { + relaySessionId, + type: "toolCall", + itemId: toolCall.itemId, + callId: toolCall.callId, + name: toolCall.name, + args: toolCall.args, + }, + { + type: "tool.call", + itemId: toolCall.itemId, + callId: toolCall.callId, + turnId, + payload: { name: toolCall.name, args: toolCall.args }, + }, + ); }, - onReady: () => emit({ relaySessionId, type: "ready" }), - onError: (error) => emit({ relaySessionId, type: "error", message: error.message }), + onReady: () => + emit({ relaySessionId, type: "ready" }, { type: "session.ready", payload: null }), + onError: (error) => + emit( + { relaySessionId, type: "error", message: error.message }, + { type: "session.error", payload: { message: error.message }, final: true }, + ), onClose: (reason) => { const active = relaySessions.get(relaySessionId); if (!active) { @@ -173,7 +277,11 @@ export function createTalkRealtimeRelaySession( } relaySessions.delete(relaySessionId); clearTimeout(active.cleanupTimer); - emit({ relaySessionId, type: "close", reason }); + abortRelayAgentRuns(active, "relay-closed"); + emit( + { relaySessionId, type: "close", reason }, + { type: "session.closed", payload: { reason }, final: true }, + ); }, }); relay = { @@ -181,6 +289,7 @@ export function createTalkRealtimeRelaySession( connId: params.connId, context: params.context, bridge, + talk, expiresAtMs, cleanupTimer: setTimeout(() => { const active = relaySessions.get(relaySessionId); @@ -188,6 +297,7 @@ export function createTalkRealtimeRelaySession( closeRelaySession(active, "completed"); } }, RELAY_SESSION_TTL_MS), + activeAgentRuns: new Map(), }; relay.cleanupTimer.unref?.(); relaySessions.set(relaySessionId, relay); @@ -215,6 +325,19 @@ export function createTalkRealtimeRelaySession( }; } +function ensureRelayTurn(session: RelaySession): string { + const turn = session.talk.ensureTurn(); + if (turn.event) { + broadcastToOwner(session.context, session.connId, { + relaySessionId: session.id, + type: 
"inputAudio", + byteLength: 0, + talkEvent: turn.event, + }); + } + return turn.turnId; +} + function getRelaySession(relaySessionId: string, connId: string): RelaySession { const session = relaySessions.get(relaySessionId); if (!session || session.connId !== connId || Date.now() > session.expiresAtMs) { @@ -236,8 +359,19 @@ export function sendTalkRealtimeRelayAudio(params: { throw new Error("Realtime relay audio frame is too large"); } const session = getRelaySession(params.relaySessionId, params.connId); + const turnId = ensureRelayTurn(session); const audio = Buffer.from(params.audioBase64, "base64"); session.bridge.sendAudio(audio); + broadcastToOwner(session.context, session.connId, { + relaySessionId: session.id, + type: "inputAudio", + byteLength: audio.byteLength, + talkEvent: session.talk.emit({ + type: "input.audio.delta", + turnId, + payload: { byteLength: audio.byteLength }, + }), + }); if (typeof params.timestamp === "number" && Number.isFinite(params.timestamp)) { session.bridge.setMediaTimestamp(params.timestamp); } @@ -256,10 +390,52 @@ export function submitTalkRealtimeRelayToolResult(params: { callId: string; result: unknown; }): void { - getRelaySession(params.relaySessionId, params.connId).bridge.submitToolResult( - params.callId, - params.result, - ); + const session = getRelaySession(params.relaySessionId, params.connId); + session.bridge.submitToolResult(params.callId, params.result); + const turnId = ensureRelayTurn(session); + broadcastToOwner(session.context, session.connId, { + relaySessionId: session.id, + type: "toolResult", + callId: params.callId, + talkEvent: session.talk.emit({ + type: "tool.result", + callId: params.callId, + turnId, + payload: { result: params.result }, + final: true, + }), + }); +} + +export function registerTalkRealtimeRelayAgentRun(params: { + relaySessionId: string; + connId: string; + sessionKey: string; + runId: string; +}): void { + const session = getRelaySession(params.relaySessionId, params.connId); + 
session.activeAgentRuns.set(params.runId, params.sessionKey); +} + +export function cancelTalkRealtimeRelayTurn(params: { + relaySessionId: string; + connId: string; + reason?: string; +}): void { + const session = getRelaySession(params.relaySessionId, params.connId); + const turnId = ensureRelayTurn(session); + const reason = params.reason ?? "client-cancelled"; + session.bridge.handleBargeIn({ audioPlaybackActive: true }); + abortRelayAgentRuns(session, reason); + const cancelled = session.talk.cancelTurn({ + turnId, + payload: { reason }, + }); + broadcastToOwner(session.context, session.connId, { + relaySessionId: session.id, + type: "clear", + talkEvent: cancelled.ok ? cancelled.event : undefined, + }); } export function stopTalkRealtimeRelaySession(params: { diff --git a/src/gateway/talk-session-registry.ts b/src/gateway/talk-session-registry.ts new file mode 100644 index 00000000000..5ce62638725 --- /dev/null +++ b/src/gateway/talk-session-registry.ts @@ -0,0 +1,52 @@ +export type UnifiedTalkSessionRecord = + | { + kind: "realtime-relay"; + connId: string; + relaySessionId: string; + } + | { + kind: "transcription-relay"; + connId: string; + transcriptionSessionId: string; + } + | { + kind: "managed-room"; + handoffId: string; + token: string; + roomId: string; + }; + +const unifiedTalkSessions = new Map(); + +export function rememberUnifiedTalkSession( + sessionId: string, + session: UnifiedTalkSessionRecord, +): void { + unifiedTalkSessions.set(sessionId, session); +} + +export function getUnifiedTalkSession(sessionId: string): UnifiedTalkSessionRecord { + const session = unifiedTalkSessions.get(sessionId); + if (!session) { + throw new Error("Unknown Talk session"); + } + return session; +} + +export function forgetUnifiedTalkSession(sessionId: string): void { + unifiedTalkSessions.delete(sessionId); +} + +export function requireUnifiedTalkSessionConn( + session: Extract, + connId: string | undefined, +): string { + if (!connId || session.connId !== 
connId) { + throw new Error("Talk session is not owned by this connection"); + } + return connId; +} + +export function clearUnifiedTalkSessionsForTest(): void { + unifiedTalkSessions.clear(); +} diff --git a/src/gateway/talk-transcription-relay.test.ts b/src/gateway/talk-transcription-relay.test.ts new file mode 100644 index 00000000000..f39ddac04ad --- /dev/null +++ b/src/gateway/talk-transcription-relay.test.ts @@ -0,0 +1,216 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; +import type { RealtimeTranscriptionProviderPlugin } from "../plugins/types.js"; +import type { RealtimeTranscriptionSessionCreateRequest } from "../realtime-transcription/provider-types.js"; +import { + cancelTalkTranscriptionRelayTurn, + clearTalkTranscriptionRelaySessionsForTest, + createTalkTranscriptionRelaySession, + sendTalkTranscriptionRelayAudio, + stopTalkTranscriptionRelaySession, +} from "./talk-transcription-relay.js"; + +describe("talk transcription gateway relay", () => { + afterEach(() => { + clearTalkTranscriptionRelaySessionsForTest(); + }); + + it("bridges browser audio into a transcription-only Talk event stream", async () => { + let sttRequest: RealtimeTranscriptionSessionCreateRequest | undefined; + const sttSession = { + connect: vi.fn(async () => { + sttRequest?.onSpeechStart?.(); + sttRequest?.onPartial?.("hel"); + sttRequest?.onTranscript?.("hello world"); + }), + sendAudio: vi.fn(), + close: vi.fn(), + isConnected: vi.fn(() => true), + }; + const provider: RealtimeTranscriptionProviderPlugin = { + id: "stt-test", + label: "STT Test", + isConfigured: () => true, + createSession: (req) => { + sttRequest = req; + return sttSession; + }, + }; + const events: Array<{ event: string; payload: unknown; connIds: string[] }> = []; + const context = { + broadcastToConnIds: (event: string, payload: unknown, connIds: ReadonlySet) => { + events.push({ event, payload, connIds: [...connIds] }); + }, + } as never; + + const session = 
createTalkTranscriptionRelaySession({ + context, + connId: "conn-1", + provider, + providerConfig: { model: "stt-model" }, + }); + await Promise.resolve(); + + expect(session).toMatchObject({ + provider: "stt-test", + mode: "transcription", + transport: "gateway-relay", + audio: { + inputEncoding: "pcm16", + inputSampleRateHz: 24000, + }, + }); + expect(sttRequest).toMatchObject({ + providerConfig: { model: "stt-model" }, + }); + + sendTalkTranscriptionRelayAudio({ + transcriptionSessionId: session.transcriptionSessionId, + connId: "conn-1", + audioBase64: Buffer.from("audio-in").toString("base64"), + }); + stopTalkTranscriptionRelaySession({ + transcriptionSessionId: session.transcriptionSessionId, + connId: "conn-1", + }); + + expect(sttSession.sendAudio).toHaveBeenCalledWith(Buffer.from("audio-in")); + expect(sttSession.close).toHaveBeenCalledOnce(); + expect(events).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + event: "talk.transcription.relay", + connIds: ["conn-1"], + payload: expect.objectContaining({ + transcriptionSessionId: session.transcriptionSessionId, + type: "ready", + talkEvent: expect.objectContaining({ + sessionId: session.transcriptionSessionId, + type: "session.ready", + mode: "transcription", + transport: "gateway-relay", + brain: "none", + provider: "stt-test", + }), + }), + }), + expect.objectContaining({ + payload: expect.objectContaining({ + transcriptionSessionId: session.transcriptionSessionId, + type: "speechStart", + talkEvent: expect.objectContaining({ type: "turn.started", turnId: "turn-1" }), + }), + }), + expect.objectContaining({ + payload: expect.objectContaining({ + transcriptionSessionId: session.transcriptionSessionId, + type: "partial", + text: "hel", + talkEvent: expect.objectContaining({ + type: "transcript.delta", + turnId: "turn-1", + payload: { text: "hel" }, + }), + }), + }), + expect.objectContaining({ + payload: expect.objectContaining({ + transcriptionSessionId: session.transcriptionSessionId, + 
type: "transcript", + text: "hello world", + final: true, + talkEvent: expect.objectContaining({ + type: "transcript.done", + turnId: "turn-1", + final: true, + payload: { text: "hello world" }, + }), + }), + }), + expect.objectContaining({ + payload: expect.objectContaining({ + transcriptionSessionId: session.transcriptionSessionId, + type: "inputAudio", + byteLength: 8, + talkEvent: expect.objectContaining({ type: "input.audio.delta" }), + }), + }), + expect.objectContaining({ + payload: expect.objectContaining({ + transcriptionSessionId: session.transcriptionSessionId, + type: "close", + reason: "completed", + talkEvent: expect.objectContaining({ + type: "session.closed", + final: true, + }), + }), + }), + ]), + ); + }); + + it("cancels an active transcription turn and closes the provider session", async () => { + let sttRequest: RealtimeTranscriptionSessionCreateRequest | undefined; + const sttSession = { + connect: vi.fn(async () => { + sttRequest?.onSpeechStart?.(); + }), + sendAudio: vi.fn(), + close: vi.fn(), + isConnected: vi.fn(() => true), + }; + const provider: RealtimeTranscriptionProviderPlugin = { + id: "stt-test", + label: "STT Test", + isConfigured: () => true, + createSession: (req) => { + sttRequest = req; + return sttSession; + }, + }; + const events: Array<{ event: string; payload: unknown; connIds: string[] }> = []; + const context = { + broadcastToConnIds: (event: string, payload: unknown, connIds: ReadonlySet) => { + events.push({ event, payload, connIds: [...connIds] }); + }, + } as never; + + const session = createTalkTranscriptionRelaySession({ + context, + connId: "conn-1", + provider, + providerConfig: {}, + }); + await Promise.resolve(); + + cancelTalkTranscriptionRelayTurn({ + transcriptionSessionId: session.transcriptionSessionId, + connId: "conn-1", + reason: "barge-in", + }); + + expect(sttSession.close).toHaveBeenCalledOnce(); + expect(events).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + payload: 
expect.objectContaining({ + transcriptionSessionId: session.transcriptionSessionId, + talkEvent: expect.objectContaining({ + type: "turn.cancelled", + turnId: "turn-1", + payload: { reason: "barge-in" }, + final: true, + }), + }), + }), + expect.objectContaining({ + payload: expect.objectContaining({ + transcriptionSessionId: session.transcriptionSessionId, + type: "close", + reason: "completed", + }), + }), + ]), + ); + }); +}); diff --git a/src/gateway/talk-transcription-relay.ts b/src/gateway/talk-transcription-relay.ts new file mode 100644 index 00000000000..a2c7519f4a7 --- /dev/null +++ b/src/gateway/talk-transcription-relay.ts @@ -0,0 +1,354 @@ +import { randomUUID } from "node:crypto"; +import type { RealtimeTranscriptionProviderPlugin } from "../plugins/types.js"; +import type { RealtimeTranscriptionProviderConfig } from "../realtime-transcription/provider-types.js"; +import { + type TalkEvent, + type TalkEventInput, + type TalkSessionController, + createTalkSessionController, +} from "../realtime-voice/talk-session-controller.js"; +import type { GatewayRequestContext } from "./server-methods/shared-types.js"; + +const TRANSCRIPTION_SESSION_TTL_MS = 30 * 60 * 1000; +const MAX_AUDIO_BASE64_BYTES = 512 * 1024; +const MAX_TRANSCRIPTION_SESSIONS_PER_CONN = 2; +const MAX_TRANSCRIPTION_SESSIONS_GLOBAL = 64; +const TRANSCRIPTION_EVENT = "talk.transcription.relay"; + +type TalkTranscriptionRelayEventPayload = + | { transcriptionSessionId: string; type: "ready" } + | { transcriptionSessionId: string; type: "inputAudio"; byteLength: number } + | { transcriptionSessionId: string; type: "partial"; text: string } + | { transcriptionSessionId: string; type: "transcript"; text: string; final: true } + | { transcriptionSessionId: string; type: "speechStart" } + | { transcriptionSessionId: string; type: "error"; message: string } + | { transcriptionSessionId: string; type: "close"; reason: "completed" | "error" }; + +type TalkTranscriptionRelayEvent = 
TalkTranscriptionRelayEventPayload & { + talkEvent?: TalkEvent; +}; + +type TranscriptionRelaySession = { + id: string; + connId: string; + context: GatewayRequestContext; + provider: RealtimeTranscriptionProviderPlugin; + sttSession: ReturnType; + talk: TalkSessionController; + expiresAtMs: number; + cleanupTimer: ReturnType; + closed: boolean; +}; + +type CreateTalkTranscriptionRelaySessionParams = { + context: GatewayRequestContext; + connId: string; + provider: RealtimeTranscriptionProviderPlugin; + providerConfig: RealtimeTranscriptionProviderConfig; +}; + +type TalkTranscriptionRelaySessionResult = { + provider: string; + mode: "transcription"; + transport: "gateway-relay"; + transcriptionSessionId: string; + audio: { + inputEncoding: "pcm16"; + inputSampleRateHz: 24000; + }; + expiresAt: number; +}; + +const transcriptionSessions = new Map(); + +function broadcastToOwner( + context: GatewayRequestContext, + connId: string, + event: TalkTranscriptionRelayEvent, +): void { + context.broadcastToConnIds(TRANSCRIPTION_EVENT, event, new Set([connId]), { dropIfSlow: true }); +} + +function ensureTranscriptionTurn(session: TranscriptionRelaySession): string { + const turn = session.talk.ensureTurn(); + if (turn.event) { + broadcastToOwner(session.context, session.connId, { + transcriptionSessionId: session.id, + type: "speechStart", + talkEvent: turn.event, + }); + } + return turn.turnId; +} + +function closeTranscriptionSession( + session: TranscriptionRelaySession, + reason: "completed" | "error", +): void { + if (session.closed) { + return; + } + session.closed = true; + transcriptionSessions.delete(session.id); + clearTimeout(session.cleanupTimer); + session.sttSession.close(); + broadcastToOwner(session.context, session.connId, { + transcriptionSessionId: session.id, + type: "close", + reason, + talkEvent: session.talk.emit({ + type: "session.closed", + payload: { reason }, + final: true, + }), + }); +} + +function pruneExpiredTranscriptionSessions(nowMs = 
Date.now()): void { + for (const session of transcriptionSessions.values()) { + if (nowMs > session.expiresAtMs) { + closeTranscriptionSession(session, "completed"); + } + } +} + +function countTranscriptionSessionsForConn(connId: string): number { + let count = 0; + for (const session of transcriptionSessions.values()) { + if (session.connId === connId) { + count += 1; + } + } + return count; +} + +function enforceTranscriptionSessionLimits(connId: string): void { + pruneExpiredTranscriptionSessions(); + if (transcriptionSessions.size >= MAX_TRANSCRIPTION_SESSIONS_GLOBAL) { + throw new Error("Too many active transcription Talk sessions"); + } + if (countTranscriptionSessionsForConn(connId) >= MAX_TRANSCRIPTION_SESSIONS_PER_CONN) { + throw new Error("Too many active transcription Talk sessions for this connection"); + } +} + +export function createTalkTranscriptionRelaySession( + params: CreateTalkTranscriptionRelaySessionParams, +): TalkTranscriptionRelaySessionResult { + enforceTranscriptionSessionLimits(params.connId); + const transcriptionSessionId = randomUUID(); + const expiresAtMs = Date.now() + TRANSCRIPTION_SESSION_TTL_MS; + const talk = createTalkSessionController({ + sessionId: transcriptionSessionId, + mode: "transcription", + transport: "gateway-relay", + brain: "none", + provider: params.provider.id, + }); + let relay: TranscriptionRelaySession | undefined; + const emit = (event: TalkTranscriptionRelayEventPayload, talkEvent?: TalkEventInput): void => { + broadcastToOwner(params.context, params.connId, { + ...event, + ...(talkEvent ? { talkEvent: talk.emit(talkEvent) } : {}), + }); + }; + const ensureTurnId = (): string => { + return relay ? 
ensureTranscriptionTurn(relay) : "turn-1"; + }; + const sttSession = params.provider.createSession({ + providerConfig: params.providerConfig, + onSpeechStart: () => { + ensureTurnId(); + }, + onPartial: (text) => { + const turnId = ensureTurnId(); + emit( + { transcriptionSessionId, type: "partial", text }, + { + type: "transcript.delta", + turnId, + payload: { text }, + }, + ); + }, + onTranscript: (text) => { + const turnId = ensureTurnId(); + emit( + { transcriptionSessionId, type: "transcript", text, final: true }, + { + type: "transcript.done", + turnId, + payload: { text }, + final: true, + }, + ); + if (relay) { + const ended = relay.talk.endTurn({ turnId, payload: {} }); + if (ended.ok) { + broadcastToOwner(relay.context, relay.connId, { + transcriptionSessionId, + type: "transcript", + text: "", + final: true, + talkEvent: ended.event, + }); + } + } + }, + onError: (error) => { + emit( + { transcriptionSessionId, type: "error", message: error.message }, + { + type: "session.error", + payload: { message: error.message }, + final: true, + }, + ); + if (relay) { + closeTranscriptionSession(relay, "error"); + } + }, + }); + relay = { + id: transcriptionSessionId, + connId: params.connId, + context: params.context, + provider: params.provider, + sttSession, + talk, + expiresAtMs, + cleanupTimer: setTimeout(() => { + const active = transcriptionSessions.get(transcriptionSessionId); + if (active) { + closeTranscriptionSession(active, "completed"); + } + }, TRANSCRIPTION_SESSION_TTL_MS), + closed: false, + }; + relay.cleanupTimer.unref?.(); + transcriptionSessions.set(transcriptionSessionId, relay); + sttSession + .connect() + .then(() => { + emit({ transcriptionSessionId, type: "ready" }, { type: "session.ready", payload: null }); + }) + .catch((error: unknown) => { + emit( + { + transcriptionSessionId, + type: "error", + message: error instanceof Error ? 
error.message : String(error), + }, + { + type: "session.error", + payload: { message: error instanceof Error ? error.message : String(error) }, + final: true, + }, + ); + const active = transcriptionSessions.get(transcriptionSessionId); + if (active) { + closeTranscriptionSession(active, "error"); + } + }); + + return { + provider: params.provider.id, + mode: "transcription", + transport: "gateway-relay", + transcriptionSessionId, + audio: { + inputEncoding: "pcm16", + inputSampleRateHz: 24000, + }, + expiresAt: Math.floor(expiresAtMs / 1000), + }; +} + +function getTranscriptionSession( + transcriptionSessionId: string, + connId: string, +): TranscriptionRelaySession { + const session = transcriptionSessions.get(transcriptionSessionId); + if (!session || session.connId !== connId || Date.now() > session.expiresAtMs) { + if (session) { + closeTranscriptionSession(session, "completed"); + } + throw new Error("Unknown transcription Talk session"); + } + return session; +} + +export function sendTalkTranscriptionRelayAudio(params: { + transcriptionSessionId: string; + connId: string; + audioBase64: string; +}): void { + if (params.audioBase64.length > MAX_AUDIO_BASE64_BYTES) { + throw new Error("Transcription Talk audio frame is too large"); + } + const session = getTranscriptionSession(params.transcriptionSessionId, params.connId); + const audio = Buffer.from(params.audioBase64, "base64"); + const turnId = ensureTranscriptionTurn(session); + session.sttSession.sendAudio(audio); + broadcastToOwner(session.context, session.connId, { + transcriptionSessionId: session.id, + type: "inputAudio", + byteLength: audio.byteLength, + talkEvent: session.talk.emit({ + type: "input.audio.delta", + turnId, + payload: { byteLength: audio.byteLength }, + }), + }); +} + +export function stopTalkTranscriptionRelaySession(params: { + transcriptionSessionId: string; + connId: string; +}): void { + const session = getTranscriptionSession(params.transcriptionSessionId, params.connId); + 
if (session.talk.activeTurnId) { + broadcastToOwner(session.context, session.connId, { + transcriptionSessionId: session.id, + type: "transcript", + text: "", + final: true, + talkEvent: session.talk.emit({ + type: "input.audio.committed", + turnId: session.talk.activeTurnId, + payload: {}, + final: true, + }), + }); + } + closeTranscriptionSession(session, "completed"); +} + +export function cancelTalkTranscriptionRelayTurn(params: { + transcriptionSessionId: string; + connId: string; + reason?: string; +}): void { + const session = getTranscriptionSession(params.transcriptionSessionId, params.connId); + const turnId = ensureTranscriptionTurn(session); + const cancelled = session.talk.cancelTurn({ + turnId, + payload: { reason: params.reason ?? "client-cancelled" }, + }); + broadcastToOwner(session.context, session.connId, { + transcriptionSessionId: session.id, + type: "transcript", + text: "", + final: true, + talkEvent: cancelled.ok ? cancelled.event : undefined, + }); + closeTranscriptionSession(session, "completed"); +} + +export function clearTalkTranscriptionRelaySessionsForTest(): void { + for (const session of transcriptionSessions.values()) { + clearTimeout(session.cleanupTimer); + session.sttSession.close(); + } + transcriptionSessions.clear(); +} diff --git a/src/gateway/voiceclaw-realtime/instructions.ts b/src/gateway/voiceclaw-realtime/instructions.ts index 72da3f98e82..4bab980e586 100644 --- a/src/gateway/voiceclaw-realtime/instructions.ts +++ b/src/gateway/voiceclaw-realtime/instructions.ts @@ -55,10 +55,6 @@ export function buildInstructions(config: VoiceClawSessionConfigEvent): string { parts.push(deviceContext); } - if (config.instructionsOverride?.trim()) { - parts.push(`## About The User\n${config.instructionsOverride.trim()}`); - } - if (config.conversationHistory && config.conversationHistory.length > 0) { parts.push(buildConversationHistory(config.conversationHistory)); } diff --git a/src/gateway/voiceclaw-realtime/session.test.ts 
b/src/gateway/voiceclaw-realtime/session.test.ts index ec13fd8b10a..d456cc88756 100644 --- a/src/gateway/voiceclaw-realtime/session.test.ts +++ b/src/gateway/voiceclaw-realtime/session.test.ts @@ -3,6 +3,8 @@ import type { IncomingMessage } from "node:http"; import { describe, expect, it, vi } from "vitest"; import WebSocket from "ws"; import type { OpenClawConfig } from "../../config/types.openclaw.js"; +import type { TalkEvent } from "../../realtime-voice/talk-events.js"; +import { createTalkSessionController } from "../../realtime-voice/talk-session-controller.js"; import type { ResolvedGatewayAuth } from "../auth.js"; import { resolveRealtimeSenderIsOwner, VoiceClawRealtimeSession } from "./session.js"; import type { @@ -60,6 +62,45 @@ function makeAdapter(): VoiceClawRealtimeAdapter { } describe("VoiceClawRealtimeSession lifecycle", () => { + it("rejects request-time instructionsOverride", async () => { + const ws = new FakeWebSocket(); + const adapter = makeAdapter(); + const releasePreauthBudget = vi.fn(); + const session = new VoiceClawRealtimeSession({ + ws: ws as unknown as WebSocket, + req: {} as IncomingMessage, + auth: { mode: "none" } as ResolvedGatewayAuth, + config: {} as OpenClawConfig, + trustedProxies: [], + allowRealIpFallback: false, + releasePreauthBudget, + adapterFactory: () => adapter, + }); + + session.attach(); + ws.emit( + "message", + JSON.stringify({ + type: "session.config", + brainAgent: "none", + instructionsOverride: "custom request-time instructions", + }), + ); + await new Promise((resolve) => setImmediate(resolve)); + + expect(ws.sent).toEqual([ + { + type: "error", + message: "request-time instructionsOverride is not supported", + code: 400, + }, + ]); + expect(ws.closeCode).toBe(1008); + expect(ws.closeReason).toBe("unsupported instruction override"); + expect(adapter.connect).not.toHaveBeenCalled(); + expect(releasePreauthBudget).toHaveBeenCalledOnce(); + }); + it("sends session summary before closing after terminal adapter 
errors", () => { const ws = new FakeWebSocket(); const adapter = makeAdapter(); @@ -102,4 +143,199 @@ describe("VoiceClawRealtimeSession lifecycle", () => { expect(adapter.disconnect).toHaveBeenCalledOnce(); expect(releasePreauthBudget).toHaveBeenCalledOnce(); }); + + it("adds common Talk event envelopes to configured server events", () => { + const ws = new FakeWebSocket(); + const adapter = makeAdapter(); + const session = new VoiceClawRealtimeSession({ + ws: ws as unknown as WebSocket, + req: {} as IncomingMessage, + auth: { mode: "none" } as ResolvedGatewayAuth, + config: {} as OpenClawConfig, + trustedProxies: [], + allowRealIpFallback: false, + releasePreauthBudget: vi.fn(), + adapterFactory: () => adapter, + }); + const internals = session as unknown as { + config: VoiceClawSessionConfigEvent; + talk: unknown; + handleAdapterEvent(event: VoiceClawServerEvent): void; + }; + internals.config = { type: "session.config", brainAgent: "none", provider: "gemini" }; + internals.talk = createTalkSessionController({ + sessionId: "voice-session", + mode: "realtime", + transport: "gateway-relay", + brain: "direct-tools", + provider: "gemini", + }); + + internals.handleAdapterEvent({ + type: "transcript.done", + role: "assistant", + text: "hello", + }); + + expect(ws.sent).toEqual([ + expect.objectContaining({ + type: "transcript.done", + talkEvent: expect.objectContaining({ + type: "output.text.done", + sessionId: "voice-session", + mode: "realtime", + transport: "gateway-relay", + brain: "direct-tools", + provider: "gemini", + final: true, + payload: { role: "assistant", text: "hello" }, + }), + }), + ]); + }); + + it("keeps streamed output audio out of common Talk event payloads", () => { + const ws = new FakeWebSocket(); + const adapter = makeAdapter(); + const session = new VoiceClawRealtimeSession({ + ws: ws as unknown as WebSocket, + req: {} as IncomingMessage, + auth: { mode: "none" } as ResolvedGatewayAuth, + config: {} as OpenClawConfig, + trustedProxies: [], + 
allowRealIpFallback: false, + releasePreauthBudget: vi.fn(), + adapterFactory: () => adapter, + }); + const internals = session as unknown as { + config: VoiceClawSessionConfigEvent; + talk: unknown; + handleAdapterEvent(event: VoiceClawServerEvent): void; + }; + const audioData = Buffer.from("hello").toString("base64"); + internals.config = { type: "session.config", brainAgent: "none", provider: "gemini" }; + internals.talk = createTalkSessionController({ + sessionId: "voice-session", + mode: "realtime", + transport: "gateway-relay", + brain: "direct-tools", + provider: "gemini", + }); + + internals.handleAdapterEvent({ + type: "audio.delta", + data: audioData, + }); + + expect(ws.sent).toEqual([ + expect.objectContaining({ + type: "audio.delta", + data: audioData, + talkEvent: expect.objectContaining({ + type: "output.audio.delta", + payload: { byteLength: 5 }, + }), + }), + ]); + expect( + (ws.sent[0] as { talkEvent?: { payload?: Record } }).talkEvent?.payload, + ).not.toHaveProperty("data"); + }); + + it("emits common Talk events for client audio, video, cancellation, and tool results", async () => { + const ws = new FakeWebSocket(); + const adapter = makeAdapter(); + const talkEvents: TalkEvent[] = []; + const session = new VoiceClawRealtimeSession({ + ws: ws as unknown as WebSocket, + req: {} as IncomingMessage, + auth: { mode: "none" } as ResolvedGatewayAuth, + config: {} as OpenClawConfig, + trustedProxies: [], + allowRealIpFallback: false, + releasePreauthBudget: vi.fn(), + adapterFactory: () => adapter, + onTalkEvent: (event) => talkEvents.push(event), + }); + const internals = session as unknown as { + adapter: VoiceClawRealtimeAdapter; + config: VoiceClawSessionConfigEvent; + talk: ReturnType; + handleRawMessage(raw: string): Promise; + }; + internals.adapter = adapter; + internals.config = { type: "session.config", brainAgent: "none", provider: "gemini" }; + internals.talk = createTalkSessionController({ + sessionId: "voice-session", + mode: 
"realtime", + transport: "gateway-relay", + brain: "direct-tools", + provider: "gemini", + }); + internals.talk.startTurn({ turnId: "turn-client" }); + + await internals.handleRawMessage( + JSON.stringify({ type: "audio.append", data: Buffer.from("hello").toString("base64") }), + ); + await internals.handleRawMessage(JSON.stringify({ type: "audio.commit" })); + await internals.handleRawMessage( + JSON.stringify({ + type: "frame.append", + data: Buffer.from("frame").toString("base64"), + mimeType: "image/jpeg", + }), + ); + await internals.handleRawMessage(JSON.stringify({ type: "response.cancel" })); + await internals.handleRawMessage( + JSON.stringify({ type: "tool.result", callId: "call-1", output: "done" }), + ); + + expect(adapter.sendAudio).toHaveBeenCalledWith(Buffer.from("hello").toString("base64")); + expect(adapter.commitAudio).toHaveBeenCalledOnce(); + expect(adapter.sendFrame).toHaveBeenCalledWith( + Buffer.from("frame").toString("base64"), + "image/jpeg", + ); + expect(adapter.cancelResponse).toHaveBeenCalledOnce(); + expect(adapter.sendToolResult).toHaveBeenCalledWith("call-1", "done"); + expect(talkEvents.map((event) => event.type)).toEqual([ + "input.audio.delta", + "input.audio.committed", + "health.changed", + "turn.cancelled", + "turn.started", + "tool.result", + ]); + expect(talkEvents).toEqual([ + expect.objectContaining({ + type: "input.audio.delta", + turnId: "turn-client", + payload: { byteLength: 5 }, + }), + expect.objectContaining({ + type: "input.audio.committed", + turnId: "turn-client", + final: true, + }), + expect.objectContaining({ + type: "health.changed", + payload: { inputVideoFrame: true, mimeType: "image/jpeg", byteLength: 5 }, + }), + expect.objectContaining({ + type: "turn.cancelled", + payload: { reason: "client-cancelled" }, + final: true, + }), + expect.objectContaining({ + type: "turn.started", + payload: { source: "implicit" }, + }), + expect.objectContaining({ + type: "tool.result", + callId: "call-1", + payload: { 
output: "done" }, + final: true, + }), + ]); + }); }); diff --git a/src/gateway/voiceclaw-realtime/session.ts b/src/gateway/voiceclaw-realtime/session.ts index 3823bc34270..ce7578aca8c 100644 --- a/src/gateway/voiceclaw-realtime/session.ts +++ b/src/gateway/voiceclaw-realtime/session.ts @@ -3,6 +3,12 @@ import type { IncomingMessage } from "node:http"; import WebSocket, { type RawData } from "ws"; import type { OpenClawConfig } from "../../config/types.openclaw.js"; import { createSubsystemLogger } from "../../logging/subsystem.js"; +import { + type TalkEvent, + type TalkEventInput, + type TalkSessionController, + createTalkSessionController, +} from "../../realtime-voice/talk-session-controller.js"; import type { AuthRateLimiter } from "../auth-rate-limit.js"; import { authorizeHttpGatewayConnect, @@ -36,6 +42,7 @@ type VoiceClawRealtimeSessionOptions = { rateLimiter?: AuthRateLimiter; releasePreauthBudget: () => void; adapterFactory?: () => VoiceClawRealtimeAdapter; + onTalkEvent?: (event: TalkEvent) => void; }; export class VoiceClawRealtimeSession { @@ -50,8 +57,10 @@ export class VoiceClawRealtimeSession { private readonly rateLimiter: AuthRateLimiter | undefined; private readonly releasePreauthBudget: () => void; private readonly adapterFactory: () => VoiceClawRealtimeAdapter; + private readonly onTalkEvent: ((event: TalkEvent) => void) | undefined; private adapter: VoiceClawRealtimeAdapter | null = null; private toolRuntime: VoiceClawRealtimeToolRuntime | null = null; + private talk: TalkSessionController | null = null; private config: VoiceClawSessionConfigEvent | null = null; private handshakeTimer: ReturnType | null = null; private closed = false; @@ -67,6 +76,7 @@ export class VoiceClawRealtimeSession { this.rateLimiter = opts.rateLimiter; this.releasePreauthBudget = once(opts.releasePreauthBudget); this.adapterFactory = opts.adapterFactory ?? 
(() => new VoiceClawGeminiLiveAdapter()); + this.onTalkEvent = opts.onTalkEvent; } attach(): void { @@ -113,24 +123,66 @@ export class VoiceClawRealtimeSession { } switch (event.type) { - case "audio.append": + case "audio.append": { + const audioTurnId = this.ensureActiveTurnId(); this.adapter?.sendAudio(event.data); + this.emitTalkEvent({ + type: "input.audio.delta", + payload: { byteLength: base64ByteLength(event.data) }, + turnId: audioTurnId, + }); break; - case "audio.commit": + } + case "audio.commit": { + const commitTurnId = this.ensureActiveTurnId(); this.adapter?.commitAudio(); + this.emitTalkEvent({ + type: "input.audio.committed", + payload: {}, + turnId: commitTurnId, + final: true, + }); break; + } case "frame.append": this.adapter?.sendFrame(event.data, event.mimeType); + this.emitTalkEvent({ + type: "health.changed", + payload: { + inputVideoFrame: true, + mimeType: event.mimeType, + byteLength: base64ByteLength(event.data), + }, + turnId: this.talk?.activeTurnId, + }); break; case "response.create": this.adapter?.createResponse(); break; - case "response.cancel": + case "response.cancel": { + const cancelTurnId = this.ensureActiveTurnId(); this.adapter?.cancelResponse(); + const cancelled = this.talk?.cancelTurn({ + turnId: cancelTurnId, + payload: { reason: "client-cancelled" }, + }); + if (cancelled?.ok) { + this.onTalkEvent?.(cancelled.event); + } break; - case "tool.result": + } + case "tool.result": { + const toolTurnId = this.ensureActiveTurnId(); this.adapter?.sendToolResult(event.callId, event.output); + this.emitTalkEvent({ + type: "tool.result", + payload: { output: event.output }, + turnId: toolTurnId, + callId: event.callId, + final: true, + }); break; + } case "session.config": this.send({ type: "error", message: "session already configured", code: 400 }); break; @@ -144,6 +196,16 @@ export class VoiceClawRealtimeSession { this.configStarted = true; this.clearHandshakeTimer(); + if (hasInstructionsOverride(config)) { + this.send({ + 
type: "error", + message: "request-time instructionsOverride is not supported", + code: 400, + }); + this.ws.close(1008, "unsupported instruction override"); + return; + } + const authResult = await authorizeHttpGatewayConnect({ auth: this.auth, connectAuth: config.apiKey ? { token: config.apiKey, password: config.apiKey } : null, @@ -190,6 +252,13 @@ export class VoiceClawRealtimeSession { voice: config.voice || "Zephyr", brainAgent: config.brainAgent ?? "enabled", }; + this.talk = createTalkSessionController({ + sessionId: this.id, + mode: "realtime", + transport: "gateway-relay", + brain: this.config.brainAgent === "none" ? "none" : "direct-tools", + provider: this.config.provider, + }); this.adapter = this.adapterFactory(); try { @@ -270,7 +339,134 @@ export class VoiceClawRealtimeSession { if (this.closed || this.ws.readyState !== WebSocket.OPEN) { return; } - this.ws.send(JSON.stringify(event)); + this.ws.send(JSON.stringify(this.withTalkEvent(event))); + } + + private withTalkEvent( + event: VoiceClawServerEvent, + ): VoiceClawServerEvent & { talkEvent?: TalkEvent } { + const talkInput = this.toTalkEventInput(event); + if (!talkInput || !this.talk) { + return event; + } + return { ...event, talkEvent: this.emitTalkEvent(talkInput) }; + } + + private emitTalkEvent(input: TalkEventInput): TalkEvent | undefined { + if (!this.talk) { + return undefined; + } + let event: TalkEvent | undefined; + if (input.type === "turn.started") { + event = this.talk.startTurn({ turnId: input.turnId, payload: input.payload }).event; + } else if (input.type === "turn.ended") { + const ended = this.talk.endTurn({ turnId: input.turnId, payload: input.payload }); + event = ended.ok ? ended.event : undefined; + } else if (input.type === "turn.cancelled") { + const cancelled = this.talk.cancelTurn({ turnId: input.turnId, payload: input.payload }); + event = cancelled.ok ? 
cancelled.event : undefined; + } else { + event = this.talk.emit(input); + } + if (event) { + this.onTalkEvent?.(event); + } + return event; + } + + private ensureActiveTurnId(): string { + if (this.talk?.activeTurnId) { + return this.talk.activeTurnId; + } + const turnId = randomUUID(); + const turn = this.talk?.startTurn({ + turnId, + payload: { source: "implicit" }, + }); + if (turn?.event) { + this.onTalkEvent?.(turn.event); + } + return turnId; + } + + private toTalkEventInput(event: VoiceClawServerEvent): TalkEventInput | null { + switch (event.type) { + case "session.ready": + return { type: "session.ready", payload: { sessionId: event.sessionId } }; + case "audio.delta": + return { + type: "output.audio.delta", + payload: { byteLength: base64ByteLength(event.data) }, + turnId: this.ensureActiveTurnId(), + }; + case "transcript.delta": + return { + type: event.role === "assistant" ? "output.text.delta" : "transcript.delta", + payload: { role: event.role, text: event.text }, + turnId: this.ensureActiveTurnId(), + }; + case "transcript.done": + return { + type: event.role === "assistant" ? 
"output.text.done" : "transcript.done", + payload: { role: event.role, text: event.text }, + turnId: this.ensureActiveTurnId(), + final: true, + }; + case "tool.call": + return { + type: "tool.call", + payload: { name: event.name, arguments: event.arguments }, + turnId: this.ensureActiveTurnId(), + callId: event.callId, + }; + case "tool.progress": + return { + type: "tool.progress", + payload: { summary: event.summary }, + turnId: this.ensureActiveTurnId(), + callId: event.callId, + }; + case "turn.started": { + const turnId = event.turnId || randomUUID(); + return { type: "turn.started", payload: {}, turnId }; + } + case "turn.ended": { + const turnId = this.ensureActiveTurnId(); + return { type: "turn.ended", payload: {}, turnId, final: true }; + } + case "session.ended": + return { + type: "session.closed", + payload: { + summary: event.summary, + durationSec: event.durationSec, + turnCount: event.turnCount, + }, + final: true, + }; + case "session.rotating": + return { type: "health.changed", payload: { status: "rotating" } }; + case "session.rotated": + return { type: "session.replaced", payload: { sessionId: event.sessionId } }; + case "usage.metrics": + return { type: "usage.metrics", payload: event }; + case "latency.metrics": + return { type: "latency.metrics", payload: event }; + case "tool.cancelled": + return { + type: "tool.error", + payload: { callIds: event.callIds, cancelled: true }, + turnId: this.ensureActiveTurnId(), + final: true, + }; + case "error": + return { + type: "session.error", + payload: { message: event.message, code: event.code }, + final: true, + }; + } + return null; } private clearHandshakeTimer(): void { @@ -330,6 +526,11 @@ function parseClientEvent(raw: RawData): VoiceClawClientEvent | null { } } +function hasInstructionsOverride(config: VoiceClawSessionConfigEvent): boolean { + const value = (config as { instructionsOverride?: unknown }).instructionsOverride; + return typeof value === "string" && value.trim().length > 0; +} + 
function sanitizeSessionKey(value: string | undefined): string | null { const trimmed = value?.trim(); if (!trimmed) { @@ -353,6 +554,18 @@ function sanitizeErrorMessage(message: string): string { return message.replace(/([?&]key=)[^&\s]+/g, "$1***"); } +function base64ByteLength(value: string): number { + const normalized = value.trim(); + if (!normalized) { + return 0; + } + try { + return Buffer.from(normalized, "base64").byteLength; + } catch { + return normalized.length; + } +} + function once(fn: () => void): () => void { let called = false; return () => { diff --git a/src/gateway/voiceclaw-realtime/types.ts b/src/gateway/voiceclaw-realtime/types.ts index 248b2d1b6fc..3e442f3b6e0 100644 --- a/src/gateway/voiceclaw-realtime/types.ts +++ b/src/gateway/voiceclaw-realtime/types.ts @@ -23,7 +23,6 @@ export type VoiceClawSessionConfigEvent = { location?: string; }; watchdog?: "enabled" | "disabled"; - instructionsOverride?: string; conversationHistory?: { role: "user" | "assistant"; text: string }[]; };