mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:40:44 +00:00
feat: add unified talk gateway sessions
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import type { OpenClawConfig } from "../../../config/types.js";
|
||||
import { LEGACY_CONFIG_MIGRATIONS_RUNTIME_TTS } from "./legacy-config-migrations.runtime.tts.js";
|
||||
import { normalizeLegacyTalkConfig } from "./legacy-talk-config-normalizer.js";
|
||||
|
||||
function migrateLegacyConfig(raw: unknown): {
|
||||
config: OpenClawConfig | null;
|
||||
@@ -21,6 +22,83 @@ function migrateLegacyConfig(raw: unknown): {
|
||||
}
|
||||
|
||||
describe("legacy migrate provider-shaped config", () => {
|
||||
it("moves legacy realtime Talk selectors into talk.realtime without treating speech config as runtime fallback", () => {
|
||||
const changes: string[] = [];
|
||||
const migrated = normalizeLegacyTalkConfig(
|
||||
{
|
||||
talk: {
|
||||
provider: "openai",
|
||||
providers: {
|
||||
openai: {
|
||||
apiKey: "test-key",
|
||||
custom: true,
|
||||
},
|
||||
},
|
||||
mode: "realtime",
|
||||
transport: "gateway-relay",
|
||||
brain: "agent-consult",
|
||||
model: "gpt-realtime",
|
||||
voice: "alloy",
|
||||
} as never,
|
||||
},
|
||||
changes,
|
||||
);
|
||||
|
||||
expect(changes).toContain(
|
||||
"Moved legacy realtime Talk provider/model fields into talk.realtime.",
|
||||
);
|
||||
expect(migrated.talk).toEqual({
|
||||
provider: "openai",
|
||||
providers: {
|
||||
openai: {
|
||||
apiKey: "test-key",
|
||||
custom: true,
|
||||
},
|
||||
},
|
||||
realtime: {
|
||||
provider: "openai",
|
||||
providers: {
|
||||
openai: {
|
||||
apiKey: "test-key",
|
||||
custom: true,
|
||||
},
|
||||
},
|
||||
mode: "realtime",
|
||||
transport: "gateway-relay",
|
||||
brain: "agent-consult",
|
||||
model: "gpt-realtime",
|
||||
voice: "alloy",
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("does not copy plain Talk speech provider config into talk.realtime", () => {
|
||||
const changes: string[] = [];
|
||||
const migrated = normalizeLegacyTalkConfig(
|
||||
{
|
||||
talk: {
|
||||
provider: "elevenlabs",
|
||||
providers: {
|
||||
elevenlabs: {
|
||||
voiceId: "voice-1",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
changes,
|
||||
);
|
||||
|
||||
expect(changes).toEqual([]);
|
||||
expect(migrated.talk).toEqual({
|
||||
provider: "elevenlabs",
|
||||
providers: {
|
||||
elevenlabs: {
|
||||
voiceId: "voice-1",
|
||||
},
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("moves messages.tts.<provider> keys into messages.tts.providers", () => {
|
||||
const res = migrateLegacyConfig({
|
||||
messages: {
|
||||
|
||||
@@ -14,6 +14,31 @@ function buildLegacyTalkProviderCompat(
|
||||
return Object.keys(compat).length > 0 ? compat : undefined;
|
||||
}
|
||||
|
||||
/**
 * Builds a `talk.realtime` section from legacy top-level Talk selectors
 * (`model`, `voice`, `mode`, `transport`, `brain`).
 *
 * Returns `undefined` when `talk.realtime` is already set, when no legacy
 * selector is present, or when none of the legacy selectors survives
 * normalization. In that last case the speech `provider`/`providers` are
 * deliberately not carried over, so plain speech config never ends up in
 * `talk.realtime` on its own.
 *
 * @param talk Raw `talk` section as read from the config.
 * @param normalizedTalk Normalized `talk` section; its `provider` and
 *   `providers` are copied into the realtime section when selectors exist.
 * @returns The normalized realtime section, or `undefined` when there is
 *   nothing to migrate.
 */
function buildLegacyRealtimeTalkCompat(
  talk: Record<string, unknown>,
  normalizedTalk: NonNullable<OpenClawConfig["talk"]>,
): Record<string, unknown> | undefined {
  // An explicit talk.realtime always wins over legacy top-level fields.
  if (talk.realtime !== undefined) {
    return undefined;
  }

  const legacySelectors: Record<string, unknown> = {};
  for (const key of ["model", "voice", "mode", "transport", "brain"] as const) {
    const legacyValue = talk[key];
    if (legacyValue !== undefined) {
      legacySelectors[key] = legacyValue;
    }
  }
  if (Object.keys(legacySelectors).length === 0) {
    return undefined;
  }

  // Normalize the selectors on their own first: if every legacy value is
  // dropped (blank string, unknown enum value), there is no realtime intent
  // and the speech provider config must not be copied.
  const usableSelectors = normalizeTalkSection({
    realtime: legacySelectors,
  } as OpenClawConfig["talk"])?.realtime;
  if (!usableSelectors) {
    return undefined;
  }

  const compat: Record<string, unknown> = { ...usableSelectors };
  if (normalizedTalk.provider !== undefined) {
    compat.provider = normalizedTalk.provider;
  }
  if (normalizedTalk.providers !== undefined) {
    compat.providers = normalizedTalk.providers;
  }
  return normalizeTalkSection({ realtime: compat } as OpenClawConfig["talk"])?.realtime;
}
|
||||
|
||||
/**
 * Type guard for plain object-like values.
 *
 * @param value Any value.
 * @returns `true` when `value` is a non-null object that is not an array.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}
|
||||
@@ -35,6 +60,13 @@ export function normalizeLegacyTalkConfig(cfg: OpenClawConfig, changes: string[]
|
||||
},
|
||||
};
|
||||
}
|
||||
const legacyRealtimeCompat = buildLegacyRealtimeTalkCompat(rawTalk, normalizedTalk);
|
||||
if (legacyRealtimeCompat) {
|
||||
normalizedTalk.realtime = {
|
||||
...legacyRealtimeCompat,
|
||||
...normalizedTalk.realtime,
|
||||
};
|
||||
}
|
||||
if (Object.keys(normalizedTalk).length === 0 || isDeepStrictEqual(normalizedTalk, rawTalk)) {
|
||||
return cfg;
|
||||
}
|
||||
@@ -42,6 +74,9 @@ export function normalizeLegacyTalkConfig(cfg: OpenClawConfig, changes: string[]
|
||||
changes.push(
|
||||
"Normalized talk.provider/providers shape (trimmed provider ids and merged missing compatibility fields).",
|
||||
);
|
||||
if (legacyRealtimeCompat) {
|
||||
changes.push("Moved legacy realtime Talk provider/model fields into talk.realtime.");
|
||||
}
|
||||
return {
|
||||
...cfg,
|
||||
talk: normalizedTalk,
|
||||
|
||||
@@ -150,6 +150,21 @@ export const FIELD_HELP: Record<string, string> = {
|
||||
"Provider-specific Talk settings keyed by provider id. During migration, prefer this over legacy talk.* keys.",
|
||||
"talk.providers.*": "Provider-owned Talk config fields for the matching provider id.",
|
||||
"talk.providers.*.apiKey": "Provider API key for Talk mode.", // pragma: allowlist secret
|
||||
"talk.realtime":
|
||||
"Realtime Talk provider, model, voice, mode, transport, and brain strategy. Keep speech/TTS provider config in talk.provider and talk.providers.",
|
||||
"talk.realtime.provider": "Active realtime voice provider id, such as openai or google.",
|
||||
"talk.realtime.providers": "Provider-specific realtime voice settings keyed by provider id.",
|
||||
"talk.realtime.providers.*": "Provider-owned realtime voice config for the matching provider id.",
|
||||
"talk.realtime.providers.*.apiKey": "Provider API key for realtime Talk.", // pragma: allowlist secret
|
||||
"talk.realtime.model":
|
||||
"Realtime provider model id override for browser or Gateway-owned Talk sessions.",
|
||||
"talk.realtime.voice":
|
||||
"Realtime provider voice id override for browser or Gateway-owned Talk sessions.",
|
||||
"talk.realtime.mode": "Talk execution mode: realtime, stt-tts, or transcription.",
|
||||
"talk.realtime.transport":
|
||||
"Talk byte/session transport: webrtc, provider-websocket, gateway-relay, or managed-room.",
|
||||
"talk.realtime.brain":
|
||||
"Talk reasoning strategy: agent-consult for Gateway-mediated agent help, direct-tools for owner-only tool calls, or none.",
|
||||
"talk.speechLocale":
|
||||
'BCP 47 locale id for Talk speech recognition on device nodes, for example "ru-RU". Leave unset to use each device default.',
|
||||
"talk.interruptOnSpeech":
|
||||
|
||||
@@ -875,6 +875,16 @@ export const FIELD_LABELS: Record<string, string> = {
|
||||
"talk.providers": "Talk Provider Settings",
|
||||
"talk.providers.*": "Talk Provider Config",
|
||||
"talk.providers.*.apiKey": "Talk Provider API Key", // pragma: allowlist secret
|
||||
"talk.realtime": "Talk Realtime",
|
||||
"talk.realtime.provider": "Talk Realtime Provider",
|
||||
"talk.realtime.providers": "Talk Realtime Provider Settings",
|
||||
"talk.realtime.providers.*": "Talk Realtime Provider Config",
|
||||
"talk.realtime.providers.*.apiKey": "Talk Realtime Provider API Key", // pragma: allowlist secret
|
||||
"talk.realtime.model": "Talk Realtime Model",
|
||||
"talk.realtime.voice": "Talk Realtime Voice",
|
||||
"talk.realtime.mode": "Talk Realtime Mode",
|
||||
"talk.realtime.transport": "Talk Realtime Transport",
|
||||
"talk.realtime.brain": "Talk Realtime Brain",
|
||||
channels: "Channels",
|
||||
"channels.defaults": "Channel Defaults",
|
||||
"channels.defaults.groupPolicy": "Default Group Policy",
|
||||
|
||||
@@ -31,6 +31,19 @@ describe("talk normalization", () => {
|
||||
custom: true,
|
||||
},
|
||||
},
|
||||
realtime: {
|
||||
provider: "openai",
|
||||
providers: {
|
||||
openai: {
|
||||
model: "gpt-realtime",
|
||||
},
|
||||
},
|
||||
model: "gpt-realtime",
|
||||
voice: "alloy",
|
||||
mode: "realtime",
|
||||
transport: "webrtc",
|
||||
brain: "agent-consult",
|
||||
},
|
||||
interruptOnSpeech: true,
|
||||
});
|
||||
|
||||
@@ -42,6 +55,19 @@ describe("talk normalization", () => {
|
||||
custom: true,
|
||||
},
|
||||
},
|
||||
realtime: {
|
||||
provider: "openai",
|
||||
providers: {
|
||||
openai: {
|
||||
model: "gpt-realtime",
|
||||
},
|
||||
},
|
||||
model: "gpt-realtime",
|
||||
voice: "alloy",
|
||||
mode: "realtime",
|
||||
transport: "webrtc",
|
||||
brain: "agent-consult",
|
||||
},
|
||||
interruptOnSpeech: true,
|
||||
});
|
||||
});
|
||||
|
||||
@@ -5,6 +5,7 @@ import type {
|
||||
TalkConfig,
|
||||
TalkConfigResponse,
|
||||
TalkProviderConfig,
|
||||
TalkRealtimeConfig,
|
||||
} from "./types.gateway.js";
|
||||
import type { OpenClawConfig } from "./types.openclaw.js";
|
||||
import { coerceSecretRef } from "./types.secrets.js";
|
||||
@@ -85,6 +86,50 @@ function normalizeTalkProviders(value: unknown): Record<string, TalkProviderConf
|
||||
return Object.keys(providers).length > 0 ? providers : undefined;
|
||||
}
|
||||
|
||||
/**
 * Normalizes a raw `talk.realtime` value into a `TalkRealtimeConfig`.
 *
 * String fields (`provider`, `model`, `voice`) go through
 * `normalizeOptionalString` and are kept only when the result is truthy.
 * `providers` goes through `normalizeTalkProviders`. `mode`, `transport`
 * and `brain` are kept only when they exactly match one of the supported
 * literals; any other value is silently dropped.
 *
 * @param value Raw value found at `talk.realtime`.
 * @returns The normalized config, or `undefined` when `value` is not a
 *   record or no field survives normalization.
 */
function normalizeTalkRealtimeConfig(value: unknown): TalkRealtimeConfig | undefined {
  if (!isRecord(value)) {
    return undefined;
  }

  const normalized: TalkRealtimeConfig = {};

  const provider = normalizeOptionalString(value.provider);
  if (provider) {
    normalized.provider = provider;
  }

  const providers = normalizeTalkProviders(value.providers);
  if (providers) {
    normalized.providers = providers;
  }

  const model = normalizeOptionalString(value.model);
  if (model) {
    normalized.model = model;
  }

  const voice = normalizeOptionalString(value.voice);
  if (voice) {
    normalized.voice = voice;
  }

  // Enum-like fields: accept only the exact supported literals.
  const { mode, transport, brain } = value;

  if (mode === "realtime" || mode === "stt-tts" || mode === "transcription") {
    normalized.mode = mode;
  }

  if (
    transport === "webrtc" ||
    transport === "provider-websocket" ||
    transport === "gateway-relay" ||
    transport === "managed-room"
  ) {
    normalized.transport = transport;
  }

  if (brain === "agent-consult" || brain === "direct-tools" || brain === "none") {
    normalized.brain = brain;
  }

  // An empty result is reported as "no realtime config".
  return Object.keys(normalized).length > 0 ? normalized : undefined;
}
|
||||
|
||||
function activeProviderFromTalk(talk: TalkConfig): string | undefined {
|
||||
const provider = normalizeOptionalString(talk.provider);
|
||||
const providers = talk.providers;
|
||||
@@ -118,10 +163,14 @@ export function normalizeTalkSection(value: TalkConfig | undefined): TalkConfig
|
||||
}
|
||||
|
||||
const providers = normalizeTalkProviders(source.providers);
|
||||
const realtime = normalizeTalkRealtimeConfig(source.realtime);
|
||||
const provider = normalizeOptionalString(source.provider);
|
||||
if (providers) {
|
||||
normalized.providers = providers;
|
||||
}
|
||||
if (realtime) {
|
||||
normalized.realtime = realtime;
|
||||
}
|
||||
if (provider) {
|
||||
normalized.provider = provider;
|
||||
}
|
||||
@@ -182,6 +231,9 @@ export function buildTalkConfigResponse(value: unknown): TalkConfigResponse | un
|
||||
if (normalized?.providers && Object.keys(normalized.providers).length > 0) {
|
||||
payload.providers = normalized.providers;
|
||||
}
|
||||
if (normalized?.realtime && Object.keys(normalized.realtime).length > 0) {
|
||||
payload.realtime = normalized.realtime;
|
||||
}
|
||||
|
||||
const resolved =
|
||||
resolveActiveTalkProviderConfig(normalized) ??
|
||||
|
||||
@@ -55,6 +55,23 @@ export type TalkProviderConfig = {
|
||||
[key: string]: unknown;
|
||||
};
|
||||
|
||||
/** Realtime Talk settings stored under `talk.realtime`; every field is optional. */
export type TalkRealtimeConfig = {
  /** Active realtime voice provider. */
  provider?: string;
  /** Provider-specific realtime voice config keyed by provider id. */
  providers?: Record<string, TalkProviderConfig>;
  /** Provider model override for realtime sessions. */
  model?: string;
  /** Provider voice override for realtime sessions. */
  voice?: string;
  /** Realtime execution mode. */
  mode?: "realtime" | "stt-tts" | "transcription";
  /** Byte/session transport. */
  transport?: "webrtc" | "provider-websocket" | "gateway-relay" | "managed-room";
  /** Tool/agent strategy for realtime sessions. */
  brain?: "agent-consult" | "direct-tools" | "none";
};
|
||||
|
||||
export type ResolvedTalkConfig = {
|
||||
/** Active Talk TTS provider resolved from the current config payload. */
|
||||
provider: string;
|
||||
@@ -67,6 +84,8 @@ export type TalkConfig = {
|
||||
provider?: string;
|
||||
/** Provider-specific Talk config keyed by provider id. */
|
||||
providers?: Record<string, TalkProviderConfig>;
|
||||
/** Realtime Talk provider, model, voice, mode, transport, and brain config. */
|
||||
realtime?: TalkRealtimeConfig;
|
||||
/** BCP 47 locale id used for Talk speech recognition on device nodes. */
|
||||
speechLocale?: string;
|
||||
/** Stop speaking when user starts talking (default: true). */
|
||||
|
||||
@@ -212,10 +212,44 @@ const TalkProviderEntrySchema = z
|
||||
})
|
||||
.catchall(z.unknown());
|
||||
|
||||
/**
 * Zod schema for the `talk.realtime` config section.
 *
 * Unknown keys are rejected (`.strict()`). A refinement additionally checks:
 * - when `provider` (after `normalizeLowercaseStringOrEmpty`) is non-empty and
 *   `providers` has entries, the provider id must be one of the map's own keys;
 * - when `providers` has more than one entry, `provider` must be set.
 */
const TalkRealtimeSchema = z
  .object({
    provider: z.string().optional(),
    providers: z.record(z.string(), TalkProviderEntrySchema).optional(),
    model: z.string().optional(),
    voice: z.string().optional(),
    mode: z.enum(["realtime", "stt-tts", "transcription"]).optional(),
    transport: z.enum(["webrtc", "provider-websocket", "gateway-relay", "managed-room"]).optional(),
    brain: z.enum(["agent-consult", "direct-tools", "none"]).optional(),
  })
  .strict()
  .superRefine((realtime, ctx) => {
    const provider = normalizeLowercaseStringOrEmpty(realtime.provider ?? "");
    const providerIds = Object.keys(realtime.providers ?? {});

    // Match against own keys only. The `in` operator also matches inherited
    // Object.prototype members (e.g. "constructor"), which would let a
    // provider id pass without a real entry in the map.
    if (provider && providerIds.length > 0 && !providerIds.includes(provider)) {
      ctx.addIssue({
        code: z.ZodIssueCode.custom,
        path: ["provider"],
        message: `talk.realtime.provider must match a key in talk.realtime.providers (missing "${provider}")`,
      });
    }

    if (!provider && providerIds.length > 1) {
      ctx.addIssue({
        code: z.ZodIssueCode.custom,
        path: ["provider"],
        message:
          "talk.realtime.provider is required when talk.realtime.providers defines multiple providers",
      });
    }
  });
|
||||
|
||||
const TalkSchema = z
|
||||
.object({
|
||||
provider: z.string().optional(),
|
||||
providers: z.record(z.string(), TalkProviderEntrySchema).optional(),
|
||||
realtime: TalkRealtimeSchema.optional(),
|
||||
speechLocale: z.string().optional(),
|
||||
interruptOnSpeech: z.boolean().optional(),
|
||||
silenceTimeoutMs: z.number().int().positive().optional(),
|
||||
|
||||
@@ -322,7 +322,9 @@ describe("gateway broadcaster", () => {
|
||||
expect(readSocket.send).toHaveBeenCalledTimes(0);
|
||||
|
||||
broadcastToConnIds("tick", { ts: 1 }, new Set(["c-read"]));
|
||||
expect(readSocket.send).toHaveBeenCalledTimes(1);
|
||||
broadcastToConnIds("talk.realtime.relay", { type: "ready" }, new Set(["c-read"]));
|
||||
broadcastToConnIds("talk.transcription.relay", { type: "session.ready" }, new Set(["c-read"]));
|
||||
expect(readSocket.send).toHaveBeenCalledTimes(3);
|
||||
expect(approvalsSocket.send).toHaveBeenCalledTimes(1);
|
||||
expect(pairingSocket.send).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
@@ -41,6 +41,11 @@ describe("method scope resolution", () => {
|
||||
["diagnostics.stability", ["operator.read"]],
|
||||
["node.pair.approve", ["operator.pairing"]],
|
||||
["poll", ["operator.write"]],
|
||||
["talk.session.create", ["operator.write"]],
|
||||
["talk.session.inputAudio", ["operator.write"]],
|
||||
["talk.session.control", ["operator.write"]],
|
||||
["talk.session.toolResult", ["operator.write"]],
|
||||
["talk.session.close", ["operator.write"]],
|
||||
["update.status", ["operator.admin"]],
|
||||
["config.patch", ["operator.admin"]],
|
||||
["nativeHook.invoke", ["operator.admin"]],
|
||||
@@ -96,6 +101,24 @@ describe("operator scope authorization", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("allows operator.write clients to use unified Talk sessions", () => {
|
||||
for (const method of [
|
||||
"talk.session.create",
|
||||
"talk.session.inputAudio",
|
||||
"talk.session.control",
|
||||
"talk.session.toolResult",
|
||||
"talk.session.close",
|
||||
]) {
|
||||
expect(authorizeOperatorScopesForMethod(method, ["operator.write"])).toEqual({
|
||||
allowed: true,
|
||||
});
|
||||
expect(authorizeOperatorScopesForMethod(method, ["operator.read"])).toEqual({
|
||||
allowed: false,
|
||||
missingScope: "operator.write",
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
it("requires admin for browser.request", () => {
|
||||
setPluginGatewayMethodScope("browser.request", "operator.admin");
|
||||
|
||||
|
||||
@@ -122,7 +122,9 @@ const METHOD_SCOPE_GROUPS: Record<OperatorScope, readonly string[]> = {
|
||||
"chat.history",
|
||||
"config.get",
|
||||
"config.schema.lookup",
|
||||
"talk.catalog",
|
||||
"talk.config",
|
||||
"talk.handoff.join",
|
||||
"agents.files.list",
|
||||
"agents.files.get",
|
||||
"artifacts.list",
|
||||
@@ -137,11 +139,27 @@ const METHOD_SCOPE_GROUPS: Record<OperatorScope, readonly string[]> = {
|
||||
"agent.wait",
|
||||
"wake",
|
||||
"talk.mode",
|
||||
"talk.session.create",
|
||||
"talk.session.inputAudio",
|
||||
"talk.session.control",
|
||||
"talk.session.toolResult",
|
||||
"talk.session.close",
|
||||
"talk.handoff.create",
|
||||
"talk.handoff.revoke",
|
||||
"talk.handoff.turnStart",
|
||||
"talk.handoff.turnEnd",
|
||||
"talk.handoff.turnCancel",
|
||||
"talk.realtime.session",
|
||||
"talk.realtime.toolCall",
|
||||
"talk.realtime.relayAudio",
|
||||
"talk.realtime.relayCancel",
|
||||
"talk.realtime.relayMark",
|
||||
"talk.realtime.relayStop",
|
||||
"talk.realtime.relayToolResult",
|
||||
"talk.transcription.session",
|
||||
"talk.transcription.relayAudio",
|
||||
"talk.transcription.relayCancel",
|
||||
"talk.transcription.relayStop",
|
||||
"talk.speak",
|
||||
"tts.enable",
|
||||
"tts.disable",
|
||||
|
||||
@@ -7,7 +7,21 @@ import {
|
||||
validateNodeEventResult,
|
||||
validateNodePresenceAlivePayload,
|
||||
validateTalkConfigResult,
|
||||
validateTalkEvent,
|
||||
validateTalkHandoffCreateParams,
|
||||
validateTalkHandoffCreateResult,
|
||||
validateTalkHandoffJoinResult,
|
||||
validateTalkRealtimeRelayAudioParams,
|
||||
validateTalkRealtimeRelayCancelParams,
|
||||
validateTalkHandoffTurnCancelParams,
|
||||
validateTalkHandoffTurnEndParams,
|
||||
validateTalkHandoffTurnResult,
|
||||
validateTalkHandoffTurnStartParams,
|
||||
validateTalkRealtimeSessionParams,
|
||||
validateTalkRealtimeToolCallParams,
|
||||
validateTalkTranscriptionRelayCancelParams,
|
||||
validateTalkTranscriptionRelayAudioParams,
|
||||
validateTalkTranscriptionSessionParams,
|
||||
validateWakeParams,
|
||||
} from "./index.js";
|
||||
|
||||
@@ -104,7 +118,7 @@ describe("validateTalkConfigResult", () => {
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
it("rejects normalized talk payloads without talk.resolved", () => {
|
||||
it("accepts normalized talk payloads without resolved provider materialization", () => {
|
||||
expect(
|
||||
validateTalkConfigResult({
|
||||
config: {
|
||||
@@ -118,18 +132,50 @@ describe("validateTalkConfigResult", () => {
|
||||
},
|
||||
},
|
||||
}),
|
||||
).toBe(false);
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
it("accepts realtime Talk defaults without requiring a speech provider", () => {
|
||||
expect(
|
||||
validateTalkConfigResult({
|
||||
config: {
|
||||
talk: {
|
||||
realtime: {
|
||||
provider: "openai",
|
||||
providers: {
|
||||
openai: {
|
||||
apiKey: {
|
||||
source: "env",
|
||||
provider: "default",
|
||||
id: "OPENAI_API_KEY",
|
||||
},
|
||||
model: "gpt-realtime",
|
||||
},
|
||||
},
|
||||
model: "gpt-realtime",
|
||||
voice: "alloy",
|
||||
mode: "realtime",
|
||||
transport: "gateway-relay",
|
||||
brain: "agent-consult",
|
||||
},
|
||||
},
|
||||
},
|
||||
}),
|
||||
).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
describe("validateTalkRealtimeSessionParams", () => {
|
||||
it("accepts provider, model, and voice overrides", () => {
|
||||
it("accepts provider, model, voice, mode, transport, and brain overrides", () => {
|
||||
expect(
|
||||
validateTalkRealtimeSessionParams({
|
||||
sessionKey: "agent:main:main",
|
||||
provider: "openai",
|
||||
model: "gpt-realtime-1.5",
|
||||
voice: "alloy",
|
||||
mode: "realtime",
|
||||
transport: "webrtc",
|
||||
brain: "agent-consult",
|
||||
}),
|
||||
).toBe(true);
|
||||
});
|
||||
@@ -147,6 +193,294 @@ describe("validateTalkRealtimeSessionParams", () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe("validateTalkEvent", () => {
|
||||
it("pins the common Talk event envelope used by relay and surface adapters", () => {
|
||||
expect(
|
||||
validateTalkEvent({
|
||||
id: "talk-session:1",
|
||||
type: "capture.started",
|
||||
sessionId: "talk-session",
|
||||
turnId: "turn-1",
|
||||
captureId: "capture-1",
|
||||
seq: 1,
|
||||
timestamp: "2026-05-05T12:00:00.000Z",
|
||||
mode: "stt-tts",
|
||||
transport: "managed-room",
|
||||
brain: "agent-consult",
|
||||
provider: "openai",
|
||||
final: false,
|
||||
callId: "call-1",
|
||||
itemId: "item-1",
|
||||
parentId: "parent-1",
|
||||
payload: { source: "ptt" },
|
||||
}),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
it("rejects stale or vendor-shaped event payloads without required correlation", () => {
|
||||
expect(
|
||||
validateTalkEvent({
|
||||
type: "output.audio.delta",
|
||||
sessionId: "talk-session",
|
||||
seq: 0,
|
||||
timestamp: "2026-05-05T12:00:00.000Z",
|
||||
mode: "realtime-duplex",
|
||||
transport: "webrtc-sdp",
|
||||
brain: "agent-consult",
|
||||
payload: { byteLength: 12 },
|
||||
}),
|
||||
).toBe(false);
|
||||
expect(formatValidationErrors(validateTalkEvent.errors)).toContain("must have required");
|
||||
});
|
||||
|
||||
it("requires turnId and captureId for scoped Talk events", () => {
|
||||
expect(
|
||||
validateTalkEvent({
|
||||
id: "talk-session:1",
|
||||
type: "turn.started",
|
||||
sessionId: "talk-session",
|
||||
seq: 1,
|
||||
timestamp: "2026-05-05T12:00:00.000Z",
|
||||
mode: "stt-tts",
|
||||
transport: "managed-room",
|
||||
brain: "agent-consult",
|
||||
payload: {},
|
||||
}),
|
||||
).toBe(false);
|
||||
expect(formatValidationErrors(validateTalkEvent.errors)).toContain("must have required");
|
||||
|
||||
expect(
|
||||
validateTalkEvent({
|
||||
id: "talk-session:2",
|
||||
type: "capture.started",
|
||||
sessionId: "talk-session",
|
||||
turnId: "turn-1",
|
||||
seq: 2,
|
||||
timestamp: "2026-05-05T12:00:01.000Z",
|
||||
mode: "stt-tts",
|
||||
transport: "managed-room",
|
||||
brain: "agent-consult",
|
||||
payload: {},
|
||||
}),
|
||||
).toBe(false);
|
||||
expect(formatValidationErrors(validateTalkEvent.errors)).toContain("must have required");
|
||||
});
|
||||
});
|
||||
|
||||
describe("validateTalkHandoff", () => {
|
||||
it("accepts session-scoped provider, model, and voice selection", () => {
|
||||
expect(
|
||||
validateTalkHandoffCreateParams({
|
||||
sessionKey: "agent:main:main",
|
||||
provider: "openai",
|
||||
model: "gpt-realtime-1.5",
|
||||
voice: "alloy",
|
||||
mode: "realtime",
|
||||
transport: "managed-room",
|
||||
brain: "agent-consult",
|
||||
}),
|
||||
).toBe(true);
|
||||
expect(
|
||||
validateTalkHandoffCreateResult({
|
||||
id: "handoff-1",
|
||||
roomId: "talk_handoff-1",
|
||||
roomUrl: "/talk/rooms/talk_handoff-1",
|
||||
token: "token-1",
|
||||
sessionKey: "agent:main:main",
|
||||
provider: "openai",
|
||||
model: "gpt-realtime-1.5",
|
||||
voice: "alloy",
|
||||
mode: "realtime",
|
||||
transport: "managed-room",
|
||||
brain: "agent-consult",
|
||||
createdAt: 1,
|
||||
expiresAt: 2,
|
||||
room: {
|
||||
recentTalkEvents: [
|
||||
{
|
||||
id: "talk_handoff-1:1",
|
||||
type: "session.started",
|
||||
sessionId: "talk_handoff-1",
|
||||
seq: 1,
|
||||
timestamp: "2026-05-05T12:00:00.000Z",
|
||||
mode: "realtime",
|
||||
transport: "managed-room",
|
||||
brain: "agent-consult",
|
||||
payload: {},
|
||||
},
|
||||
],
|
||||
},
|
||||
}),
|
||||
).toBe(true);
|
||||
expect(
|
||||
validateTalkHandoffJoinResult({
|
||||
id: "handoff-1",
|
||||
roomId: "talk_handoff-1",
|
||||
roomUrl: "/talk/rooms/talk_handoff-1",
|
||||
sessionKey: "agent:main:main",
|
||||
provider: "openai",
|
||||
model: "gpt-realtime-1.5",
|
||||
voice: "alloy",
|
||||
mode: "realtime",
|
||||
transport: "managed-room",
|
||||
brain: "agent-consult",
|
||||
createdAt: 1,
|
||||
expiresAt: 2,
|
||||
room: {
|
||||
activeClientId: "conn-1",
|
||||
recentTalkEvents: [
|
||||
{
|
||||
id: "talk_handoff-1:1",
|
||||
type: "session.ready",
|
||||
sessionId: "talk_handoff-1",
|
||||
seq: 1,
|
||||
timestamp: "2026-05-05T12:00:00.000Z",
|
||||
mode: "realtime",
|
||||
transport: "managed-room",
|
||||
brain: "agent-consult",
|
||||
payload: {},
|
||||
},
|
||||
],
|
||||
},
|
||||
}),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
it("rejects request-time instruction overrides", () => {
|
||||
expect(
|
||||
validateTalkHandoffCreateParams({
|
||||
sessionKey: "agent:main:main",
|
||||
instructionsOverride: "Ignore configured policy.",
|
||||
}),
|
||||
).toBe(false);
|
||||
expect(formatValidationErrors(validateTalkHandoffCreateParams.errors)).toContain(
|
||||
"unexpected property 'instructionsOverride'",
|
||||
);
|
||||
});
|
||||
|
||||
it("accepts handoff turn lifecycle params and results", () => {
|
||||
expect(
|
||||
validateTalkHandoffTurnStartParams({
|
||||
id: "handoff-1",
|
||||
token: "token-1",
|
||||
turnId: "turn-1",
|
||||
}),
|
||||
).toBe(true);
|
||||
expect(
|
||||
validateTalkHandoffTurnEndParams({
|
||||
id: "handoff-1",
|
||||
token: "token-1",
|
||||
}),
|
||||
).toBe(true);
|
||||
expect(
|
||||
validateTalkHandoffTurnCancelParams({
|
||||
id: "handoff-1",
|
||||
token: "token-1",
|
||||
reason: "barge-in",
|
||||
}),
|
||||
).toBe(true);
|
||||
expect(
|
||||
validateTalkHandoffTurnResult({
|
||||
ok: true,
|
||||
turnId: "turn-1",
|
||||
events: [
|
||||
{
|
||||
id: "talk_handoff-1:2",
|
||||
type: "turn.started",
|
||||
sessionId: "talk_handoff-1",
|
||||
turnId: "turn-1",
|
||||
seq: 2,
|
||||
timestamp: "2026-05-05T12:00:00.000Z",
|
||||
mode: "realtime",
|
||||
transport: "managed-room",
|
||||
brain: "agent-consult",
|
||||
payload: {},
|
||||
},
|
||||
],
|
||||
record: {
|
||||
id: "handoff-1",
|
||||
roomId: "talk_handoff-1",
|
||||
roomUrl: "/talk/rooms/talk_handoff-1",
|
||||
sessionKey: "agent:main:main",
|
||||
mode: "realtime",
|
||||
transport: "managed-room",
|
||||
brain: "agent-consult",
|
||||
createdAt: 1,
|
||||
expiresAt: 2,
|
||||
room: {
|
||||
activeClientId: "conn-1",
|
||||
activeTurnId: "turn-1",
|
||||
recentTalkEvents: [
|
||||
{
|
||||
id: "talk_handoff-1:2",
|
||||
type: "turn.started",
|
||||
sessionId: "talk_handoff-1",
|
||||
turnId: "turn-1",
|
||||
seq: 2,
|
||||
timestamp: "2026-05-05T12:00:00.000Z",
|
||||
mode: "realtime",
|
||||
transport: "managed-room",
|
||||
brain: "agent-consult",
|
||||
payload: {},
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
}),
|
||||
).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
describe("validateTalkRealtimeToolCallParams", () => {
|
||||
it("accepts optional relay session correlation", () => {
|
||||
expect(
|
||||
validateTalkRealtimeToolCallParams({
|
||||
sessionKey: "agent:main:main",
|
||||
relaySessionId: "relay-1",
|
||||
callId: "call-1",
|
||||
name: "openclaw_agent_consult",
|
||||
args: { question: "what now" },
|
||||
}),
|
||||
).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
describe("validateTalkRealtimeRelayParams", () => {
|
||||
it("accepts relay audio and cancel params", () => {
|
||||
expect(
|
||||
validateTalkRealtimeRelayAudioParams({
|
||||
relaySessionId: "relay-1",
|
||||
audioBase64: "aGVsbG8=",
|
||||
timestamp: 123,
|
||||
}),
|
||||
).toBe(true);
|
||||
expect(
|
||||
validateTalkRealtimeRelayCancelParams({
|
||||
relaySessionId: "relay-1",
|
||||
reason: "barge-in",
|
||||
}),
|
||||
).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
describe("validateTalkTranscriptionParams", () => {
|
||||
it("accepts transcription session, relay audio, and cancel params", () => {
|
||||
expect(validateTalkTranscriptionSessionParams({ provider: "openai" })).toBe(true);
|
||||
expect(
|
||||
validateTalkTranscriptionRelayAudioParams({
|
||||
transcriptionSessionId: "stt-1",
|
||||
audioBase64: "aGVsbG8=",
|
||||
}),
|
||||
).toBe(true);
|
||||
expect(
|
||||
validateTalkTranscriptionRelayCancelParams({
|
||||
transcriptionSessionId: "stt-1",
|
||||
reason: "barge-in",
|
||||
}),
|
||||
).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
describe("validateWakeParams", () => {
|
||||
it("accepts valid wake params", () => {
|
||||
expect(validateWakeParams({ mode: "now", text: "hello" })).toBe(true);
|
||||
|
||||
@@ -61,12 +61,40 @@ import {
|
||||
ChannelsStopParamsSchema,
|
||||
type ChannelsLogoutParams,
|
||||
ChannelsLogoutParamsSchema,
|
||||
type TalkEvent,
|
||||
TalkEventSchema,
|
||||
type TalkCatalogParams,
|
||||
TalkCatalogParamsSchema,
|
||||
type TalkCatalogResult,
|
||||
TalkCatalogResultSchema,
|
||||
type TalkConfigParams,
|
||||
TalkConfigParamsSchema,
|
||||
type TalkConfigResult,
|
||||
TalkConfigResultSchema,
|
||||
type TalkHandoffCreateParams,
|
||||
TalkHandoffCreateParamsSchema,
|
||||
type TalkHandoffCreateResult,
|
||||
TalkHandoffCreateResultSchema,
|
||||
type TalkHandoffJoinParams,
|
||||
TalkHandoffJoinParamsSchema,
|
||||
type TalkHandoffJoinResult,
|
||||
TalkHandoffJoinResultSchema,
|
||||
type TalkHandoffRevokeParams,
|
||||
TalkHandoffRevokeParamsSchema,
|
||||
type TalkHandoffRevokeResult,
|
||||
TalkHandoffRevokeResultSchema,
|
||||
type TalkHandoffTurnCancelParams,
|
||||
TalkHandoffTurnCancelParamsSchema,
|
||||
type TalkHandoffTurnEndParams,
|
||||
TalkHandoffTurnEndParamsSchema,
|
||||
type TalkHandoffTurnResult,
|
||||
TalkHandoffTurnResultSchema,
|
||||
type TalkHandoffTurnStartParams,
|
||||
TalkHandoffTurnStartParamsSchema,
|
||||
type TalkRealtimeRelayAudioParams,
|
||||
TalkRealtimeRelayAudioParamsSchema,
|
||||
type TalkRealtimeRelayCancelParams,
|
||||
TalkRealtimeRelayCancelParamsSchema,
|
||||
type TalkRealtimeRelayMarkParams,
|
||||
TalkRealtimeRelayMarkParamsSchema,
|
||||
type TalkRealtimeRelayOkResult,
|
||||
@@ -79,6 +107,38 @@ import {
|
||||
TalkRealtimeSessionParamsSchema,
|
||||
type TalkRealtimeSessionResult,
|
||||
TalkRealtimeSessionResultSchema,
|
||||
type TalkRealtimeToolCallParams,
|
||||
TalkRealtimeToolCallParamsSchema,
|
||||
type TalkRealtimeToolCallResult,
|
||||
TalkRealtimeToolCallResultSchema,
|
||||
type TalkSessionCloseParams,
|
||||
TalkSessionCloseParamsSchema,
|
||||
type TalkSessionControlParams,
|
||||
TalkSessionControlParamsSchema,
|
||||
type TalkSessionControlResult,
|
||||
TalkSessionControlResultSchema,
|
||||
type TalkSessionCreateParams,
|
||||
TalkSessionCreateParamsSchema,
|
||||
type TalkSessionCreateResult,
|
||||
TalkSessionCreateResultSchema,
|
||||
type TalkSessionInputAudioParams,
|
||||
TalkSessionInputAudioParamsSchema,
|
||||
type TalkSessionOkResult,
|
||||
TalkSessionOkResultSchema,
|
||||
type TalkSessionToolResultParams,
|
||||
TalkSessionToolResultParamsSchema,
|
||||
type TalkTranscriptionRelayAudioParams,
|
||||
TalkTranscriptionRelayAudioParamsSchema,
|
||||
type TalkTranscriptionRelayCancelParams,
|
||||
TalkTranscriptionRelayCancelParamsSchema,
|
||||
type TalkTranscriptionRelayOkResult,
|
||||
TalkTranscriptionRelayOkResultSchema,
|
||||
type TalkTranscriptionRelayStopParams,
|
||||
TalkTranscriptionRelayStopParamsSchema,
|
||||
type TalkTranscriptionSessionParams,
|
||||
TalkTranscriptionSessionParamsSchema,
|
||||
type TalkTranscriptionSessionResult,
|
||||
TalkTranscriptionSessionResultSchema,
|
||||
type TalkSpeakParams,
|
||||
TalkSpeakParamsSchema,
|
||||
type TalkSpeakResult,
|
||||
@@ -532,17 +592,82 @@ export const validateWizardNextParams = ajv.compile<WizardNextParams>(WizardNext
|
||||
// Wizard cancel/status request validators, compiled from their param schemas.
export const validateWizardCancelParams = ajv.compile<WizardCancelParams>(WizardCancelParamsSchema);
export const validateWizardStatusParams = ajv.compile<WizardStatusParams>(WizardStatusParamsSchema);

// Talk mode params and the Talk event envelope.
export const validateTalkModeParams = ajv.compile<TalkModeParams>(TalkModeParamsSchema);
export const validateTalkEvent = ajv.compile<TalkEvent>(TalkEventSchema);

// Talk catalog request/response.
export const validateTalkCatalogParams = ajv.compile<TalkCatalogParams>(TalkCatalogParamsSchema);
export const validateTalkCatalogResult = ajv.compile<TalkCatalogResult>(TalkCatalogResultSchema);

// Talk config request/response.
export const validateTalkConfigParams = ajv.compile<TalkConfigParams>(TalkConfigParamsSchema);
export const validateTalkConfigResult = ajv.compile<TalkConfigResult>(TalkConfigResultSchema);
|
||||
// Talk handoff validators: one compiled validator per handoff params/result schema.

// Create.
export const validateTalkHandoffCreateParams = ajv.compile<TalkHandoffCreateParams>(
  TalkHandoffCreateParamsSchema,
);
export const validateTalkHandoffCreateResult = ajv.compile<TalkHandoffCreateResult>(
  TalkHandoffCreateResultSchema,
);

// Join.
export const validateTalkHandoffJoinParams = ajv.compile<TalkHandoffJoinParams>(
  TalkHandoffJoinParamsSchema,
);
export const validateTalkHandoffJoinResult = ajv.compile<TalkHandoffJoinResult>(
  TalkHandoffJoinResultSchema,
);

// Revoke.
export const validateTalkHandoffRevokeParams = ajv.compile<TalkHandoffRevokeParams>(
  TalkHandoffRevokeParamsSchema,
);
export const validateTalkHandoffRevokeResult = ajv.compile<TalkHandoffRevokeResult>(
  TalkHandoffRevokeResultSchema,
);

// Turn start / end / cancel params and the turn result.
export const validateTalkHandoffTurnStartParams = ajv.compile<TalkHandoffTurnStartParams>(
  TalkHandoffTurnStartParamsSchema,
);
export const validateTalkHandoffTurnEndParams = ajv.compile<TalkHandoffTurnEndParams>(
  TalkHandoffTurnEndParamsSchema,
);
export const validateTalkHandoffTurnCancelParams = ajv.compile<TalkHandoffTurnCancelParams>(
  TalkHandoffTurnCancelParamsSchema,
);
export const validateTalkHandoffTurnResult = ajv.compile<TalkHandoffTurnResult>(
  TalkHandoffTurnResultSchema,
);
|
||||
// Realtime session request/response validators.
export const validateTalkRealtimeSessionParams = ajv.compile<TalkRealtimeSessionParams>(
  TalkRealtimeSessionParamsSchema,
);
export const validateTalkRealtimeSessionResult = ajv.compile<TalkRealtimeSessionResult>(
  TalkRealtimeSessionResultSchema,
);

// Realtime tool-call request/response validators.
export const validateTalkRealtimeToolCallParams = ajv.compile<TalkRealtimeToolCallParams>(
  TalkRealtimeToolCallParamsSchema,
);
export const validateTalkRealtimeToolCallResult = ajv.compile<TalkRealtimeToolCallResult>(
  TalkRealtimeToolCallResultSchema,
);
|
||||
// Talk session validators.

// Session creation.
export const validateTalkSessionCreateParams = ajv.compile<TalkSessionCreateParams>(
  TalkSessionCreateParamsSchema,
);
export const validateTalkSessionCreateResult = ajv.compile<TalkSessionCreateResult>(
  TalkSessionCreateResultSchema,
);

// Audio input.
export const validateTalkSessionInputAudioParams = ajv.compile<TalkSessionInputAudioParams>(
  TalkSessionInputAudioParamsSchema,
);

// Turn control.
export const validateTalkSessionControlParams = ajv.compile<TalkSessionControlParams>(
  TalkSessionControlParamsSchema,
);
export const validateTalkSessionControlResult = ajv.compile<TalkSessionControlResult>(
  TalkSessionControlResultSchema,
);

// Tool results, close, and the `{ ok }` result.
export const validateTalkSessionToolResultParams = ajv.compile<TalkSessionToolResultParams>(
  TalkSessionToolResultParamsSchema,
);
export const validateTalkSessionCloseParams = ajv.compile<TalkSessionCloseParams>(
  TalkSessionCloseParamsSchema,
);
export const validateTalkSessionOkResult =
  ajv.compile<TalkSessionOkResult>(TalkSessionOkResultSchema);
|
||||
// Realtime relay validators for the audio, cancel, and mark requests.
export const validateTalkRealtimeRelayAudioParams = ajv.compile<TalkRealtimeRelayAudioParams>(
  TalkRealtimeRelayAudioParamsSchema,
);
export const validateTalkRealtimeRelayCancelParams = ajv.compile<TalkRealtimeRelayCancelParams>(
  TalkRealtimeRelayCancelParamsSchema,
);
export const validateTalkRealtimeRelayMarkParams = ajv.compile<TalkRealtimeRelayMarkParams>(
  TalkRealtimeRelayMarkParamsSchema,
);
|
||||
@@ -551,6 +676,21 @@ export const validateTalkRealtimeRelayStopParams = ajv.compile<TalkRealtimeRelay
|
||||
);
|
||||
// Realtime relay tool-result request validator.
export const validateTalkRealtimeRelayToolResultParams =
  ajv.compile<TalkRealtimeRelayToolResultParams>(TalkRealtimeRelayToolResultParamsSchema);

// Transcription session request/response validators.
export const validateTalkTranscriptionSessionParams = ajv.compile<TalkTranscriptionSessionParams>(
  TalkTranscriptionSessionParamsSchema,
);
export const validateTalkTranscriptionSessionResult = ajv.compile<TalkTranscriptionSessionResult>(
  TalkTranscriptionSessionResultSchema,
);

// Transcription relay validators: audio / cancel / stop requests and the `{ ok }` result.
export const validateTalkTranscriptionRelayAudioParams =
  ajv.compile<TalkTranscriptionRelayAudioParams>(TalkTranscriptionRelayAudioParamsSchema);
export const validateTalkTranscriptionRelayCancelParams =
  ajv.compile<TalkTranscriptionRelayCancelParams>(TalkTranscriptionRelayCancelParamsSchema);
export const validateTalkTranscriptionRelayStopParams =
  ajv.compile<TalkTranscriptionRelayStopParams>(TalkTranscriptionRelayStopParamsSchema);
export const validateTalkTranscriptionRelayOkResult = ajv.compile<TalkTranscriptionRelayOkResult>(
  TalkTranscriptionRelayOkResultSchema,
);

// Talk speak request/response validators.
export const validateTalkSpeakParams = ajv.compile<TalkSpeakParams>(TalkSpeakParamsSchema);
export const validateTalkSpeakResult = ajv.compile<TalkSpeakResult>(TalkSpeakResultSchema);
|
||||
export const validateChannelsStatusParams = ajv.compile<ChannelsStatusParams>(
|
||||
@@ -765,15 +905,45 @@ export {
|
||||
WizardNextResultSchema,
|
||||
WizardStartResultSchema,
|
||||
WizardStatusResultSchema,
|
||||
TalkEventSchema,
|
||||
TalkCatalogParamsSchema,
|
||||
TalkCatalogResultSchema,
|
||||
TalkConfigParamsSchema,
|
||||
TalkConfigResultSchema,
|
||||
TalkHandoffCreateParamsSchema,
|
||||
TalkHandoffCreateResultSchema,
|
||||
TalkHandoffJoinParamsSchema,
|
||||
TalkHandoffJoinResultSchema,
|
||||
TalkHandoffRevokeParamsSchema,
|
||||
TalkHandoffRevokeResultSchema,
|
||||
TalkHandoffTurnStartParamsSchema,
|
||||
TalkHandoffTurnEndParamsSchema,
|
||||
TalkHandoffTurnCancelParamsSchema,
|
||||
TalkHandoffTurnResultSchema,
|
||||
TalkRealtimeSessionParamsSchema,
|
||||
TalkRealtimeSessionResultSchema,
|
||||
TalkRealtimeToolCallParamsSchema,
|
||||
TalkRealtimeToolCallResultSchema,
|
||||
TalkSessionCreateParamsSchema,
|
||||
TalkSessionCreateResultSchema,
|
||||
TalkSessionInputAudioParamsSchema,
|
||||
TalkSessionControlParamsSchema,
|
||||
TalkSessionControlResultSchema,
|
||||
TalkSessionToolResultParamsSchema,
|
||||
TalkSessionCloseParamsSchema,
|
||||
TalkSessionOkResultSchema,
|
||||
TalkRealtimeRelayAudioParamsSchema,
|
||||
TalkRealtimeRelayCancelParamsSchema,
|
||||
TalkRealtimeRelayMarkParamsSchema,
|
||||
TalkRealtimeRelayStopParamsSchema,
|
||||
TalkRealtimeRelayToolResultParamsSchema,
|
||||
TalkRealtimeRelayOkResultSchema,
|
||||
TalkTranscriptionSessionParamsSchema,
|
||||
TalkTranscriptionSessionResultSchema,
|
||||
TalkTranscriptionRelayAudioParamsSchema,
|
||||
TalkTranscriptionRelayCancelParamsSchema,
|
||||
TalkTranscriptionRelayStopParamsSchema,
|
||||
TalkTranscriptionRelayOkResultSchema,
|
||||
TalkSpeakParamsSchema,
|
||||
TalkSpeakResultSchema,
|
||||
ChannelsStatusParamsSchema,
|
||||
@@ -879,15 +1049,44 @@ export type {
|
||||
WizardNextResult,
|
||||
WizardStartResult,
|
||||
WizardStatusResult,
|
||||
TalkCatalogParams,
|
||||
TalkCatalogResult,
|
||||
TalkConfigParams,
|
||||
TalkConfigResult,
|
||||
TalkHandoffCreateParams,
|
||||
TalkHandoffCreateResult,
|
||||
TalkHandoffJoinParams,
|
||||
TalkHandoffJoinResult,
|
||||
TalkHandoffRevokeParams,
|
||||
TalkHandoffRevokeResult,
|
||||
TalkHandoffTurnStartParams,
|
||||
TalkHandoffTurnEndParams,
|
||||
TalkHandoffTurnCancelParams,
|
||||
TalkHandoffTurnResult,
|
||||
TalkRealtimeSessionParams,
|
||||
TalkRealtimeSessionResult,
|
||||
TalkRealtimeToolCallParams,
|
||||
TalkRealtimeToolCallResult,
|
||||
TalkSessionCreateParams,
|
||||
TalkSessionCreateResult,
|
||||
TalkSessionInputAudioParams,
|
||||
TalkSessionControlParams,
|
||||
TalkSessionControlResult,
|
||||
TalkSessionToolResultParams,
|
||||
TalkSessionCloseParams,
|
||||
TalkSessionOkResult,
|
||||
TalkRealtimeRelayAudioParams,
|
||||
TalkRealtimeRelayCancelParams,
|
||||
TalkRealtimeRelayMarkParams,
|
||||
TalkRealtimeRelayStopParams,
|
||||
TalkRealtimeRelayToolResultParams,
|
||||
TalkRealtimeRelayOkResult,
|
||||
TalkTranscriptionSessionParams,
|
||||
TalkTranscriptionSessionResult,
|
||||
TalkTranscriptionRelayAudioParams,
|
||||
TalkTranscriptionRelayCancelParams,
|
||||
TalkTranscriptionRelayStopParams,
|
||||
TalkTranscriptionRelayOkResult,
|
||||
TalkSpeakParams,
|
||||
TalkSpeakResult,
|
||||
TalkModeParams,
|
||||
|
||||
@@ -36,12 +36,408 @@ export const TalkSpeakParamsSchema = Type.Object(
|
||||
{ additionalProperties: false },
|
||||
);
|
||||
|
||||
/** Literal union of the accepted Talk `mode` values. */
const TalkModeSchema = Type.Union([
  Type.Literal("realtime"),
  Type.Literal("stt-tts"),
  Type.Literal("transcription"),
]);
|
||||
|
||||
/** Literal union of the accepted Talk `transport` values. */
const TalkTransportSchema = Type.Union([
  Type.Literal("webrtc"),
  Type.Literal("provider-websocket"),
  Type.Literal("gateway-relay"),
  Type.Literal("managed-room"),
]);
|
||||
|
||||
/** Literal union of the accepted Talk `brain` values. */
const TalkBrainSchema = Type.Union([
  Type.Literal("agent-consult"),
  Type.Literal("direct-tools"),
  Type.Literal("none"),
]);
|
||||
|
||||
/**
 * Literal union of every Talk event `type` string accepted by the event
 * envelope. Members are grouped by their dotted prefix; the order is kept
 * exactly as declared so the emitted schema is unchanged.
 */
const TalkEventTypeSchema = Type.Union([
  // session.*
  Type.Literal("session.started"),
  Type.Literal("session.ready"),
  Type.Literal("session.closed"),
  Type.Literal("session.error"),
  Type.Literal("session.replaced"),
  // turn.*
  Type.Literal("turn.started"),
  Type.Literal("turn.ended"),
  Type.Literal("turn.cancelled"),
  // capture.*
  Type.Literal("capture.started"),
  Type.Literal("capture.stopped"),
  Type.Literal("capture.cancelled"),
  Type.Literal("capture.once"),
  // input.audio.*
  Type.Literal("input.audio.delta"),
  Type.Literal("input.audio.committed"),
  // transcript.*
  Type.Literal("transcript.delta"),
  Type.Literal("transcript.done"),
  // output.text.* / output.audio.*
  Type.Literal("output.text.delta"),
  Type.Literal("output.text.done"),
  Type.Literal("output.audio.started"),
  Type.Literal("output.audio.delta"),
  Type.Literal("output.audio.done"),
  // tool.*
  Type.Literal("tool.call"),
  Type.Literal("tool.progress"),
  Type.Literal("tool.result"),
  Type.Literal("tool.error"),
  // metrics / health
  Type.Literal("usage.metrics"),
  Type.Literal("latency.metrics"),
  Type.Literal("health.changed"),
]);
|
||||
|
||||
/**
 * Event `type` values for which the Talk event schema additionally requires
 * a `turnId` (used as the `enum` of the first conditional rule in
 * `TalkEventSchema`).
 */
const TURN_SCOPED_TALK_EVENT_TYPES = [
  // turn lifecycle
  "turn.started",
  "turn.ended",
  "turn.cancelled",
  // audio input and transcript
  "input.audio.delta",
  "input.audio.committed",
  "transcript.delta",
  "transcript.done",
  // text and audio output
  "output.text.delta",
  "output.text.done",
  "output.audio.started",
  "output.audio.delta",
  "output.audio.done",
  // tool activity
  "tool.call",
  "tool.progress",
  "tool.result",
  "tool.error",
];
|
||||
|
||||
/**
 * Event `type` values for which the Talk event schema additionally requires
 * a `captureId` (used as the `enum` of the second conditional rule in
 * `TalkEventSchema`).
 */
const CAPTURE_SCOPED_TALK_EVENT_TYPES = [
  "capture.started",
  "capture.stopped",
  "capture.cancelled",
  "capture.once",
];
|
||||
|
||||
/**
 * Builds the consequent half of a JSON Schema conditional: an object with a
 * single `then` key whose value requires the given property names.
 *
 * @param properties Property names to list under `required`; the array is
 *   stored by reference, not copied.
 * @returns `{ then: { required: properties } }`, meant to be spread next to an
 *   `if` clause.
 */
function requireJsonSchemaProperties(properties: string[]): Record<string, { required: string[] }> {
  // NOTE(review): the key is assembled at runtime, presumably so no literal
  // `then` property appears in source (e.g. to satisfy a thenable lint rule);
  // confirm before simplifying.
  const consequentKeyword = ["th", "en"].join("");
  const clause: Record<string, { required: string[] }> = {};
  clause[consequentKeyword] = { required: properties };
  return clause;
}
|
||||
|
||||
/**
 * Builds one `allOf` member for `TalkEventSchema`: when `type` is one of
 * `eventTypes`, `property` becomes required.
 */
function talkEventScopeRule(eventTypes: string[], property: string) {
  return {
    if: {
      properties: { type: { enum: eventTypes } },
      required: ["type"],
    },
    ...requireJsonSchemaProperties([property]),
  };
}

/** Field schemas of the Talk event envelope. */
const talkEventFieldSchemas = {
  id: NonEmptyString,
  type: TalkEventTypeSchema,
  sessionId: NonEmptyString,
  turnId: Type.Optional(Type.String()),
  captureId: Type.Optional(Type.String()),
  seq: Type.Integer({ minimum: 1 }),
  timestamp: NonEmptyString,
  mode: TalkModeSchema,
  transport: TalkTransportSchema,
  brain: TalkBrainSchema,
  provider: Type.Optional(Type.String()),
  final: Type.Optional(Type.Boolean()),
  callId: Type.Optional(Type.String()),
  itemId: Type.Optional(Type.String()),
  parentId: Type.Optional(Type.String()),
  payload: Type.Unknown(),
};

/**
 * Envelope schema for a single Talk event.
 *
 * Unknown properties are rejected. Beyond the per-field schemas, two
 * conditional rules apply: turn-scoped event types require `turnId`, and
 * capture-scoped event types require `captureId`.
 */
export const TalkEventSchema = Type.Object(talkEventFieldSchemas, {
  additionalProperties: false,
  allOf: [
    talkEventScopeRule(TURN_SCOPED_TALK_EVENT_TYPES, "turnId"),
    talkEventScopeRule(CAPTURE_SCOPED_TALK_EVENT_TYPES, "captureId"),
  ],
});
|
||||
|
||||
/**
 * Request params for a realtime Talk session. Every field is optional and
 * unknown properties are rejected.
 */
export const TalkRealtimeSessionParamsSchema = Type.Object(
  {
    sessionKey: Type.Optional(Type.String()),
    // Provider selection.
    provider: Type.Optional(Type.String()),
    model: Type.Optional(Type.String()),
    voice: Type.Optional(Type.String()),
    // Session shape selectors.
    mode: Type.Optional(TalkModeSchema),
    transport: Type.Optional(TalkTransportSchema),
    brain: Type.Optional(TalkBrainSchema),
  },
  { additionalProperties: false },
);
|
||||
|
||||
/**
 * Request params for a realtime tool call. `sessionKey`, `callId` and `name`
 * must be non-empty strings; `args` may be any value and `relaySessionId` is
 * optional. Unknown properties are rejected.
 */
export const TalkRealtimeToolCallParamsSchema = Type.Object(
  {
    sessionKey: NonEmptyString,
    callId: NonEmptyString,
    name: NonEmptyString,
    args: Type.Optional(Type.Unknown()),
    relaySessionId: Type.Optional(NonEmptyString),
  },
  { additionalProperties: false },
);
|
||||
|
||||
/**
 * Result of a realtime tool call: a non-empty `runId` and a non-empty
 * `idempotencyKey`, with no other properties allowed.
 */
export const TalkRealtimeToolCallResultSchema = Type.Object(
  {
    runId: NonEmptyString,
    idempotencyKey: NonEmptyString,
  },
  { additionalProperties: false },
);
|
||||
|
||||
/**
 * Request params for creating a Talk session. Every field is optional and
 * unknown properties are rejected.
 */
export const TalkSessionCreateParamsSchema = Type.Object(
  {
    sessionKey: Type.Optional(Type.String()),
    // Provider selection.
    provider: Type.Optional(Type.String()),
    model: Type.Optional(Type.String()),
    voice: Type.Optional(Type.String()),
    // Session shape selectors.
    mode: Type.Optional(TalkModeSchema),
    transport: Type.Optional(TalkTransportSchema),
    brain: Type.Optional(TalkBrainSchema),
    // Integer bounded to 1000..3600000 (1 second to 1 hour if the unit is
    // milliseconds, as the `Ms` suffix suggests).
    ttlMs: Type.Optional(Type.Integer({ minimum: 1000, maximum: 3600000 })),
  },
  { additionalProperties: false },
);
|
||||
|
||||
/**
 * Request params for sending audio into a Talk session: a non-empty
 * `sessionId`, a non-empty `audioBase64` string, and an optional numeric
 * `timestamp`. Unknown properties are rejected.
 */
export const TalkSessionInputAudioParamsSchema = Type.Object(
  {
    sessionId: NonEmptyString,
    audioBase64: NonEmptyString,
    timestamp: Type.Optional(Type.Number()),
  },
  { additionalProperties: false },
);
|
||||
|
||||
/** Control actions accepted by `TalkSessionControlParamsSchema.type`. */
const talkSessionControlTypeSchema = Type.Union([
  Type.Literal("turn.start"),
  Type.Literal("turn.end"),
  Type.Literal("turn.cancel"),
]);

/**
 * Request params for a Talk session control action. `sessionId` and `type`
 * are required; `turnId` and `reason` are optional strings. Unknown
 * properties are rejected.
 */
export const TalkSessionControlParamsSchema = Type.Object(
  {
    sessionId: NonEmptyString,
    type: talkSessionControlTypeSchema,
    turnId: Type.Optional(Type.String()),
    reason: Type.Optional(Type.String()),
  },
  { additionalProperties: false },
);
|
||||
|
||||
/**
 * Request params for submitting a tool result to a Talk session: non-empty
 * `sessionId` and `callId`, plus a `result` of any shape. Unknown properties
 * are rejected.
 */
export const TalkSessionToolResultParamsSchema = Type.Object(
  {
    sessionId: NonEmptyString,
    callId: NonEmptyString,
    result: Type.Unknown(),
  },
  { additionalProperties: false },
);

/** Request params for closing a Talk session: only a non-empty `sessionId`. */
export const TalkSessionCloseParamsSchema = Type.Object(
  {
    sessionId: NonEmptyString,
  },
  { additionalProperties: false },
);
|
||||
|
||||
/**
 * Request params for creating a Talk handoff. Only `sessionKey` is required
 * (non-empty); every other field is optional and unknown properties are
 * rejected.
 */
export const TalkHandoffCreateParamsSchema = Type.Object(
  {
    sessionKey: NonEmptyString,
    sessionId: Type.Optional(Type.String()),
    // Routing hints.
    channel: Type.Optional(Type.String()),
    target: Type.Optional(Type.String()),
    // Provider selection.
    provider: Type.Optional(Type.String()),
    model: Type.Optional(Type.String()),
    voice: Type.Optional(Type.String()),
    // Session shape selectors.
    mode: Type.Optional(TalkModeSchema),
    transport: Type.Optional(TalkTransportSchema),
    brain: Type.Optional(TalkBrainSchema),
    // Integer bounded to 1000..3600000 (1 second to 1 hour if the unit is
    // milliseconds, as the `Ms` suffix suggests).
    ttlMs: Type.Optional(Type.Integer({ minimum: 1000, maximum: 3600000 })),
  },
  { additionalProperties: false },
);
|
||||
|
||||
/**
 * Room state embedded in handoff records: optional `activeClientId` and
 * `activeTurnId` strings plus a required `recentTalkEvents` array of Talk
 * events. Unknown properties are rejected.
 */
const TalkHandoffRoomSchema = Type.Object(
  {
    activeClientId: Type.Optional(Type.String()),
    activeTurnId: Type.Optional(Type.String()),
    recentTalkEvents: Type.Array(TalkEventSchema),
  },
  { additionalProperties: false },
);
|
||||
|
||||
/** Field schemas of the handoff-create result, in wire order. */
const talkHandoffCreateResultFieldSchemas = {
  // Identity and join credentials.
  id: NonEmptyString,
  roomId: NonEmptyString,
  roomUrl: NonEmptyString,
  token: NonEmptyString,
  sessionKey: NonEmptyString,
  sessionId: Type.Optional(Type.String()),
  // Routing hints.
  channel: Type.Optional(Type.String()),
  target: Type.Optional(Type.String()),
  // Provider selection.
  provider: Type.Optional(Type.String()),
  model: Type.Optional(Type.String()),
  voice: Type.Optional(Type.String()),
  // Session shape (required here, unlike in the create params).
  mode: TalkModeSchema,
  transport: TalkTransportSchema,
  brain: TalkBrainSchema,
  // Timing and room state.
  createdAt: Type.Number(),
  expiresAt: Type.Number(),
  room: TalkHandoffRoomSchema,
};

/**
 * Result of creating a Talk handoff. It is the only handoff record shape in
 * this file that carries the `token`. Unknown properties are rejected.
 */
export const TalkHandoffCreateResultSchema = Type.Object(talkHandoffCreateResultFieldSchemas, {
  additionalProperties: false,
});
|
||||
|
||||
/** Field schemas of the public handoff record, in wire order. */
const talkHandoffPublicRecordFieldSchemas = {
  // Identity (no `token` field).
  id: NonEmptyString,
  roomId: NonEmptyString,
  roomUrl: NonEmptyString,
  sessionKey: NonEmptyString,
  sessionId: Type.Optional(Type.String()),
  // Routing hints.
  channel: Type.Optional(Type.String()),
  target: Type.Optional(Type.String()),
  // Provider selection.
  provider: Type.Optional(Type.String()),
  model: Type.Optional(Type.String()),
  voice: Type.Optional(Type.String()),
  // Session shape.
  mode: TalkModeSchema,
  transport: TalkTransportSchema,
  brain: TalkBrainSchema,
  // Timing and room state.
  createdAt: Type.Number(),
  expiresAt: Type.Number(),
  room: TalkHandoffRoomSchema,
};

/**
 * Handoff record without the `token` field; otherwise it lists the same
 * fields as `TalkHandoffCreateResultSchema`. Unknown properties are rejected.
 */
const TalkHandoffPublicRecordSchema = Type.Object(talkHandoffPublicRecordFieldSchemas, {
  additionalProperties: false,
});
|
||||
|
||||
/**
 * Request params for joining a handoff: non-empty `id` and `token`, no other
 * properties.
 */
export const TalkHandoffJoinParamsSchema = Type.Object(
  {
    id: NonEmptyString,
    token: NonEmptyString,
  },
  { additionalProperties: false },
);

/** Join result: an alias of the public handoff record schema. */
export const TalkHandoffJoinResultSchema = TalkHandoffPublicRecordSchema;
|
||||
|
||||
/** Request params for revoking a handoff: only a non-empty `id`. */
export const TalkHandoffRevokeParamsSchema = Type.Object(
  {
    id: NonEmptyString,
  },
  { additionalProperties: false },
);

/**
 * Result of a handoff revoke: two required booleans, `ok` and `revoked`, and
 * no other properties.
 */
export const TalkHandoffRevokeResultSchema = Type.Object(
  {
    ok: Type.Boolean(),
    revoked: Type.Boolean(),
  },
  { additionalProperties: false },
);
|
||||
|
||||
/**
 * Request params for starting a handoff turn: non-empty `id` and `token`,
 * optional `turnId`. Unknown properties are rejected.
 */
export const TalkHandoffTurnStartParamsSchema = Type.Object(
  {
    id: NonEmptyString,
    token: NonEmptyString,
    turnId: Type.Optional(Type.String()),
  },
  { additionalProperties: false },
);

/**
 * Request params for ending a handoff turn; same shape as the turn-start
 * params.
 */
export const TalkHandoffTurnEndParamsSchema = Type.Object(
  {
    id: NonEmptyString,
    token: NonEmptyString,
    turnId: Type.Optional(Type.String()),
  },
  { additionalProperties: false },
);

/**
 * Request params for cancelling a handoff turn: the turn-start shape plus an
 * optional `reason` string.
 */
export const TalkHandoffTurnCancelParamsSchema = Type.Object(
  {
    id: NonEmptyString,
    token: NonEmptyString,
    turnId: Type.Optional(Type.String()),
    reason: Type.Optional(Type.String()),
  },
  { additionalProperties: false },
);
|
||||
|
||||
/**
 * Result of a handoff turn operation: an `ok` flag, the public handoff
 * `record`, a non-empty `turnId`, and an array of Talk `events`. All four
 * are required and unknown properties are rejected.
 */
export const TalkHandoffTurnResultSchema = Type.Object(
  {
    ok: Type.Boolean(),
    record: TalkHandoffPublicRecordSchema,
    turnId: NonEmptyString,
    events: Type.Array(TalkEventSchema),
  },
  { additionalProperties: false },
);
|
||||
|
||||
/** Catalog request params: an empty object with no properties allowed. */
export const TalkCatalogParamsSchema = Type.Object({}, { additionalProperties: false });

/**
 * One audio format entry in a catalog provider: `encoding` is "pcm16" or
 * "g711_ulaw"; `sampleRateHz` and `channels` are integers of at least 1.
 * Shared by the provider's input and output format lists.
 */
const TalkCatalogAudioFormatSchema = Type.Object(
  {
    encoding: Type.Union([Type.Literal("pcm16"), Type.Literal("g711_ulaw")]),
    sampleRateHz: Type.Integer({ minimum: 1 }),
    channels: Type.Integer({ minimum: 1 }),
  },
  { additionalProperties: false },
);

/**
 * One provider entry in the Talk catalog. `id`, `label` and `configured` are
 * required; the model/voice lists, supported modes/transports/brains, audio
 * formats and `supports*` flags are all optional. Unknown properties are
 * rejected.
 */
const TalkCatalogProviderSchema = Type.Object(
  {
    id: NonEmptyString,
    label: NonEmptyString,
    configured: Type.Boolean(),
    // Model and voice listings.
    models: Type.Optional(Type.Array(Type.String())),
    voices: Type.Optional(Type.Array(Type.String())),
    defaultModel: Type.Optional(Type.String()),
    // Supported session shapes.
    modes: Type.Optional(Type.Array(TalkModeSchema)),
    transports: Type.Optional(Type.Array(TalkTransportSchema)),
    brains: Type.Optional(Type.Array(TalkBrainSchema)),
    // Audio formats.
    inputAudioFormats: Type.Optional(Type.Array(TalkCatalogAudioFormatSchema)),
    outputAudioFormats: Type.Optional(Type.Array(TalkCatalogAudioFormatSchema)),
    // Capability flags.
    supportsBrowserSession: Type.Optional(Type.Boolean()),
    supportsBargeIn: Type.Optional(Type.Boolean()),
    supportsToolCalls: Type.Optional(Type.Boolean()),
    supportsVideoFrames: Type.Optional(Type.Boolean()),
    supportsSessionResumption: Type.Optional(Type.Boolean()),
  },
  { additionalProperties: false },
);
|
||||
|
||||
/**
 * A group of catalog providers: a required `providers` array and an optional
 * `activeProvider` string. Unknown properties are rejected.
 */
const TalkCatalogProviderGroupSchema = Type.Object(
  {
    activeProvider: Type.Optional(Type.String()),
    providers: Type.Array(TalkCatalogProviderSchema),
  },
  { additionalProperties: false },
);
|
||||
|
||||
/**
 * Talk catalog response: arrays of modes, transports and brains, plus one
 * provider group each for `speech`, `transcription` and `realtime`. All
 * fields are required and unknown properties are rejected.
 */
export const TalkCatalogResultSchema = Type.Object(
  {
    // Value lists.
    modes: Type.Array(TalkModeSchema),
    transports: Type.Array(TalkTransportSchema),
    brains: Type.Array(TalkBrainSchema),
    // Provider groups.
    speech: TalkCatalogProviderGroupSchema,
    transcription: TalkCatalogProviderGroupSchema,
    realtime: TalkCatalogProviderGroupSchema,
  },
  { additionalProperties: false },
);
|
||||
@@ -70,6 +466,14 @@ export const TalkRealtimeRelayStopParamsSchema = Type.Object(
|
||||
{ additionalProperties: false },
|
||||
);
|
||||
|
||||
/**
 * Request params for cancelling on a realtime relay session: a non-empty
 * `relaySessionId` and an optional `reason` string. Unknown properties are
 * rejected.
 */
export const TalkRealtimeRelayCancelParamsSchema = Type.Object(
  {
    relaySessionId: NonEmptyString,
    reason: Type.Optional(Type.String()),
  },
  { additionalProperties: false },
);
|
||||
|
||||
export const TalkRealtimeRelayToolResultParamsSchema = Type.Object(
|
||||
{
|
||||
relaySessionId: NonEmptyString,
|
||||
@@ -86,6 +490,61 @@ export const TalkRealtimeRelayOkResultSchema = Type.Object(
|
||||
{ additionalProperties: false },
|
||||
);
|
||||
|
||||
/**
 * Request params for a transcription session: only an optional `provider`
 * string, with no other properties allowed.
 */
export const TalkTranscriptionSessionParamsSchema = Type.Object(
  {
    provider: Type.Optional(Type.String()),
  },
  { additionalProperties: false },
);
|
||||
|
||||
/**
 * Audio contract of a transcription session: `inputEncoding` is fixed to
 * "pcm16" and `inputSampleRateHz` is an integer of at least 1.
 */
const talkTranscriptionSessionAudioSchema = Type.Object(
  {
    inputEncoding: Type.Literal("pcm16"),
    inputSampleRateHz: Type.Integer({ minimum: 1 }),
  },
  { additionalProperties: false },
);

/**
 * Result of creating a transcription session. `mode` is fixed to
 * "transcription" and `transport` to "gateway-relay"; all fields are required
 * and unknown properties are rejected.
 */
export const TalkTranscriptionSessionResultSchema = Type.Object(
  {
    provider: NonEmptyString,
    mode: Type.Literal("transcription"),
    transport: Type.Literal("gateway-relay"),
    transcriptionSessionId: NonEmptyString,
    audio: talkTranscriptionSessionAudioSchema,
    expiresAt: Type.Number(),
  },
  { additionalProperties: false },
);
|
||||
|
||||
/**
 * Request params for sending audio to a transcription relay: non-empty
 * `transcriptionSessionId` and `audioBase64`.
 */
export const TalkTranscriptionRelayAudioParamsSchema = Type.Object(
  {
    transcriptionSessionId: NonEmptyString,
    audioBase64: NonEmptyString,
  },
  { additionalProperties: false },
);

/**
 * Request params for stopping a transcription relay: only a non-empty
 * `transcriptionSessionId`.
 */
export const TalkTranscriptionRelayStopParamsSchema = Type.Object(
  {
    transcriptionSessionId: NonEmptyString,
  },
  { additionalProperties: false },
);

/**
 * Request params for cancelling a transcription relay: a non-empty
 * `transcriptionSessionId` and an optional `reason` string.
 */
export const TalkTranscriptionRelayCancelParamsSchema = Type.Object(
  {
    transcriptionSessionId: NonEmptyString,
    reason: Type.Optional(Type.String()),
  },
  { additionalProperties: false },
);

/** Transcription relay acknowledgement: a single required boolean `ok`. */
export const TalkTranscriptionRelayOkResultSchema = Type.Object(
  {
    ok: Type.Boolean(),
  },
  { additionalProperties: false },
);
|
||||
|
||||
const BrowserRealtimeAudioContractSchema = Type.Object(
|
||||
{
|
||||
inputEncoding: Type.Union([Type.Literal("pcm16"), Type.Literal("g711_ulaw")]),
|
||||
@@ -96,10 +555,47 @@ const BrowserRealtimeAudioContractSchema = Type.Object(
|
||||
{ additionalProperties: false },
|
||||
);
|
||||
|
||||
/** Field schemas of the session-create result, in wire order. */
const talkSessionCreateResultFieldSchemas = {
  sessionId: NonEmptyString,
  provider: Type.Optional(Type.String()),
  // Session shape.
  mode: TalkModeSchema,
  transport: TalkTransportSchema,
  brain: TalkBrainSchema,
  // Optional identifiers and room credentials.
  relaySessionId: Type.Optional(NonEmptyString),
  transcriptionSessionId: Type.Optional(NonEmptyString),
  handoffId: Type.Optional(NonEmptyString),
  roomId: Type.Optional(NonEmptyString),
  roomUrl: Type.Optional(NonEmptyString),
  token: Type.Optional(NonEmptyString),
  // `audio` is left unconstrained by this schema.
  audio: Type.Optional(Type.Unknown()),
  model: Type.Optional(Type.String()),
  voice: Type.Optional(Type.String()),
  expiresAt: Type.Optional(Type.Number()),
};

/**
 * Result of creating a Talk session. `sessionId`, `mode`, `transport` and
 * `brain` are required; everything else is optional. Unknown properties are
 * rejected.
 */
export const TalkSessionCreateResultSchema = Type.Object(talkSessionCreateResultFieldSchemas, {
  additionalProperties: false,
});
|
||||
|
||||
/**
 * Result of a Talk session control action: a required `ok` flag, an optional
 * `turnId`, and an optional array of Talk `events`. Unknown properties are
 * rejected.
 */
export const TalkSessionControlResultSchema = Type.Object(
  {
    ok: Type.Boolean(),
    turnId: Type.Optional(Type.String()),
    events: Type.Optional(Type.Array(TalkEventSchema)),
  },
  { additionalProperties: false },
);

/** Talk session acknowledgement: a single required boolean `ok`. */
export const TalkSessionOkResultSchema = Type.Object(
  {
    ok: Type.Boolean(),
  },
  { additionalProperties: false },
);
|
||||
|
||||
const BrowserRealtimeWebRtcSdpSessionSchema = Type.Object(
|
||||
{
|
||||
provider: NonEmptyString,
|
||||
transport: Type.Optional(Type.Literal("webrtc-sdp")),
|
||||
transport: Type.Literal("webrtc"),
|
||||
clientSecret: NonEmptyString,
|
||||
offerUrl: Type.Optional(Type.String()),
|
||||
offerHeaders: Type.Optional(Type.Record(Type.String(), Type.String())),
|
||||
@@ -113,7 +609,7 @@ const BrowserRealtimeWebRtcSdpSessionSchema = Type.Object(
|
||||
const BrowserRealtimeJsonPcmWebSocketSessionSchema = Type.Object(
|
||||
{
|
||||
provider: NonEmptyString,
|
||||
transport: Type.Literal("json-pcm-websocket"),
|
||||
transport: Type.Literal("provider-websocket"),
|
||||
protocol: NonEmptyString,
|
||||
clientSecret: NonEmptyString,
|
||||
websocketUrl: NonEmptyString,
|
||||
@@ -167,6 +663,19 @@ const TalkProviderConfigSchema = Type.Object(talkProviderFieldSchemas, {
|
||||
additionalProperties: true,
|
||||
});
|
||||
|
||||
/**
 * Realtime section of the Talk config. Every field is optional; `providers`
 * maps a string key to a `TalkProviderConfigSchema` entry. Unknown
 * properties are rejected.
 */
const TalkRealtimeConfigSchema = Type.Object(
  {
    // Provider selection and per-provider settings.
    provider: Type.Optional(Type.String()),
    providers: Type.Optional(Type.Record(Type.String(), TalkProviderConfigSchema)),
    model: Type.Optional(Type.String()),
    voice: Type.Optional(Type.String()),
    // Session shape selectors.
    mode: Type.Optional(TalkModeSchema),
    transport: Type.Optional(TalkTransportSchema),
    brain: Type.Optional(TalkBrainSchema),
  },
  { additionalProperties: false },
);
|
||||
|
||||
const ResolvedTalkConfigSchema = Type.Object(
|
||||
{
|
||||
provider: Type.String(),
|
||||
@@ -179,7 +688,8 @@ const TalkConfigSchema = Type.Object(
|
||||
{
|
||||
provider: Type.Optional(Type.String()),
|
||||
providers: Type.Optional(Type.Record(Type.String(), TalkProviderConfigSchema)),
|
||||
resolved: ResolvedTalkConfigSchema,
|
||||
realtime: Type.Optional(TalkRealtimeConfigSchema),
|
||||
resolved: Type.Optional(ResolvedTalkConfigSchema),
|
||||
speechLocale: Type.Optional(Type.String()),
|
||||
interruptOnSpeech: Type.Optional(Type.Boolean()),
|
||||
silenceTimeoutMs: Type.Optional(Type.Integer({ minimum: 1 })),
|
||||
|
||||
@@ -65,15 +65,45 @@ import {
|
||||
ChannelsStartParamsSchema,
|
||||
ChannelsStopParamsSchema,
|
||||
ChannelsLogoutParamsSchema,
|
||||
TalkEventSchema,
|
||||
TalkCatalogParamsSchema,
|
||||
TalkCatalogResultSchema,
|
||||
TalkConfigParamsSchema,
|
||||
TalkConfigResultSchema,
|
||||
TalkHandoffCreateParamsSchema,
|
||||
TalkHandoffCreateResultSchema,
|
||||
TalkHandoffJoinParamsSchema,
|
||||
TalkHandoffJoinResultSchema,
|
||||
TalkHandoffRevokeParamsSchema,
|
||||
TalkHandoffRevokeResultSchema,
|
||||
TalkHandoffTurnCancelParamsSchema,
|
||||
TalkHandoffTurnEndParamsSchema,
|
||||
TalkHandoffTurnResultSchema,
|
||||
TalkHandoffTurnStartParamsSchema,
|
||||
TalkRealtimeRelayAudioParamsSchema,
|
||||
TalkRealtimeRelayCancelParamsSchema,
|
||||
TalkRealtimeRelayMarkParamsSchema,
|
||||
TalkRealtimeRelayOkResultSchema,
|
||||
TalkRealtimeRelayStopParamsSchema,
|
||||
TalkRealtimeRelayToolResultParamsSchema,
|
||||
TalkRealtimeSessionParamsSchema,
|
||||
TalkRealtimeSessionResultSchema,
|
||||
TalkRealtimeToolCallParamsSchema,
|
||||
TalkRealtimeToolCallResultSchema,
|
||||
TalkSessionCloseParamsSchema,
|
||||
TalkSessionControlParamsSchema,
|
||||
TalkSessionControlResultSchema,
|
||||
TalkSessionCreateParamsSchema,
|
||||
TalkSessionCreateResultSchema,
|
||||
TalkSessionInputAudioParamsSchema,
|
||||
TalkSessionOkResultSchema,
|
||||
TalkSessionToolResultParamsSchema,
|
||||
TalkTranscriptionRelayAudioParamsSchema,
|
||||
TalkTranscriptionRelayCancelParamsSchema,
|
||||
TalkTranscriptionRelayOkResultSchema,
|
||||
TalkTranscriptionRelayStopParamsSchema,
|
||||
TalkTranscriptionSessionParamsSchema,
|
||||
TalkTranscriptionSessionResultSchema,
|
||||
TalkSpeakParamsSchema,
|
||||
TalkSpeakResultSchema,
|
||||
ChannelsStatusParamsSchema,
|
||||
@@ -333,15 +363,45 @@ export const ProtocolSchemas = {
|
||||
WizardStartResult: WizardStartResultSchema,
|
||||
WizardStatusResult: WizardStatusResultSchema,
|
||||
TalkModeParams: TalkModeParamsSchema,
|
||||
TalkEvent: TalkEventSchema,
|
||||
TalkCatalogParams: TalkCatalogParamsSchema,
|
||||
TalkCatalogResult: TalkCatalogResultSchema,
|
||||
TalkConfigParams: TalkConfigParamsSchema,
|
||||
TalkConfigResult: TalkConfigResultSchema,
|
||||
TalkHandoffCreateParams: TalkHandoffCreateParamsSchema,
|
||||
TalkHandoffCreateResult: TalkHandoffCreateResultSchema,
|
||||
TalkHandoffJoinParams: TalkHandoffJoinParamsSchema,
|
||||
TalkHandoffJoinResult: TalkHandoffJoinResultSchema,
|
||||
TalkHandoffRevokeParams: TalkHandoffRevokeParamsSchema,
|
||||
TalkHandoffRevokeResult: TalkHandoffRevokeResultSchema,
|
||||
TalkHandoffTurnStartParams: TalkHandoffTurnStartParamsSchema,
|
||||
TalkHandoffTurnEndParams: TalkHandoffTurnEndParamsSchema,
|
||||
TalkHandoffTurnCancelParams: TalkHandoffTurnCancelParamsSchema,
|
||||
TalkHandoffTurnResult: TalkHandoffTurnResultSchema,
|
||||
TalkRealtimeSessionParams: TalkRealtimeSessionParamsSchema,
|
||||
TalkRealtimeSessionResult: TalkRealtimeSessionResultSchema,
|
||||
TalkRealtimeRelayAudioParams: TalkRealtimeRelayAudioParamsSchema,
|
||||
TalkRealtimeRelayCancelParams: TalkRealtimeRelayCancelParamsSchema,
|
||||
TalkRealtimeRelayMarkParams: TalkRealtimeRelayMarkParamsSchema,
|
||||
TalkRealtimeRelayStopParams: TalkRealtimeRelayStopParamsSchema,
|
||||
TalkRealtimeRelayToolResultParams: TalkRealtimeRelayToolResultParamsSchema,
|
||||
TalkRealtimeRelayOkResult: TalkRealtimeRelayOkResultSchema,
|
||||
TalkRealtimeToolCallParams: TalkRealtimeToolCallParamsSchema,
|
||||
TalkRealtimeToolCallResult: TalkRealtimeToolCallResultSchema,
|
||||
TalkSessionCreateParams: TalkSessionCreateParamsSchema,
|
||||
TalkSessionCreateResult: TalkSessionCreateResultSchema,
|
||||
TalkSessionInputAudioParams: TalkSessionInputAudioParamsSchema,
|
||||
TalkSessionControlParams: TalkSessionControlParamsSchema,
|
||||
TalkSessionControlResult: TalkSessionControlResultSchema,
|
||||
TalkSessionToolResultParams: TalkSessionToolResultParamsSchema,
|
||||
TalkSessionCloseParams: TalkSessionCloseParamsSchema,
|
||||
TalkSessionOkResult: TalkSessionOkResultSchema,
|
||||
TalkTranscriptionSessionParams: TalkTranscriptionSessionParamsSchema,
|
||||
TalkTranscriptionSessionResult: TalkTranscriptionSessionResultSchema,
|
||||
TalkTranscriptionRelayAudioParams: TalkTranscriptionRelayAudioParamsSchema,
|
||||
TalkTranscriptionRelayCancelParams: TalkTranscriptionRelayCancelParamsSchema,
|
||||
TalkTranscriptionRelayStopParams: TalkTranscriptionRelayStopParamsSchema,
|
||||
TalkTranscriptionRelayOkResult: TalkTranscriptionRelayOkResultSchema,
|
||||
TalkSpeakParams: TalkSpeakParamsSchema,
|
||||
TalkSpeakResult: TalkSpeakResultSchema,
|
||||
ChannelsStatusParams: ChannelsStatusParamsSchema,
|
||||
|
||||
@@ -92,16 +92,46 @@ export type WizardStep = SchemaType<"WizardStep">;
|
||||
export type WizardNextResult = SchemaType<"WizardNextResult">;
|
||||
export type WizardStartResult = SchemaType<"WizardStartResult">;
|
||||
export type WizardStatusResult = SchemaType<"WizardStatusResult">;
|
||||
export type TalkEvent = SchemaType<"TalkEvent">;
|
||||
export type TalkModeParams = SchemaType<"TalkModeParams">;
|
||||
export type TalkCatalogParams = SchemaType<"TalkCatalogParams">;
|
||||
export type TalkCatalogResult = SchemaType<"TalkCatalogResult">;
|
||||
export type TalkConfigParams = SchemaType<"TalkConfigParams">;
|
||||
export type TalkConfigResult = SchemaType<"TalkConfigResult">;
|
||||
export type TalkHandoffCreateParams = SchemaType<"TalkHandoffCreateParams">;
|
||||
export type TalkHandoffCreateResult = SchemaType<"TalkHandoffCreateResult">;
|
||||
export type TalkHandoffJoinParams = SchemaType<"TalkHandoffJoinParams">;
|
||||
export type TalkHandoffJoinResult = SchemaType<"TalkHandoffJoinResult">;
|
||||
export type TalkHandoffRevokeParams = SchemaType<"TalkHandoffRevokeParams">;
|
||||
export type TalkHandoffRevokeResult = SchemaType<"TalkHandoffRevokeResult">;
|
||||
export type TalkHandoffTurnStartParams = SchemaType<"TalkHandoffTurnStartParams">;
|
||||
export type TalkHandoffTurnEndParams = SchemaType<"TalkHandoffTurnEndParams">;
|
||||
export type TalkHandoffTurnCancelParams = SchemaType<"TalkHandoffTurnCancelParams">;
|
||||
export type TalkHandoffTurnResult = SchemaType<"TalkHandoffTurnResult">;
|
||||
export type TalkRealtimeSessionParams = SchemaType<"TalkRealtimeSessionParams">;
|
||||
export type TalkRealtimeSessionResult = SchemaType<"TalkRealtimeSessionResult">;
|
||||
export type TalkRealtimeRelayAudioParams = SchemaType<"TalkRealtimeRelayAudioParams">;
|
||||
export type TalkRealtimeRelayCancelParams = SchemaType<"TalkRealtimeRelayCancelParams">;
|
||||
export type TalkRealtimeRelayMarkParams = SchemaType<"TalkRealtimeRelayMarkParams">;
|
||||
export type TalkRealtimeRelayStopParams = SchemaType<"TalkRealtimeRelayStopParams">;
|
||||
export type TalkRealtimeRelayToolResultParams = SchemaType<"TalkRealtimeRelayToolResultParams">;
|
||||
export type TalkRealtimeRelayOkResult = SchemaType<"TalkRealtimeRelayOkResult">;
|
||||
export type TalkRealtimeToolCallParams = SchemaType<"TalkRealtimeToolCallParams">;
|
||||
export type TalkRealtimeToolCallResult = SchemaType<"TalkRealtimeToolCallResult">;
|
||||
export type TalkSessionCreateParams = SchemaType<"TalkSessionCreateParams">;
|
||||
export type TalkSessionCreateResult = SchemaType<"TalkSessionCreateResult">;
|
||||
export type TalkSessionInputAudioParams = SchemaType<"TalkSessionInputAudioParams">;
|
||||
export type TalkSessionControlParams = SchemaType<"TalkSessionControlParams">;
|
||||
export type TalkSessionControlResult = SchemaType<"TalkSessionControlResult">;
|
||||
export type TalkSessionToolResultParams = SchemaType<"TalkSessionToolResultParams">;
|
||||
export type TalkSessionCloseParams = SchemaType<"TalkSessionCloseParams">;
|
||||
export type TalkSessionOkResult = SchemaType<"TalkSessionOkResult">;
|
||||
export type TalkTranscriptionSessionParams = SchemaType<"TalkTranscriptionSessionParams">;
|
||||
export type TalkTranscriptionSessionResult = SchemaType<"TalkTranscriptionSessionResult">;
|
||||
export type TalkTranscriptionRelayAudioParams = SchemaType<"TalkTranscriptionRelayAudioParams">;
|
||||
export type TalkTranscriptionRelayCancelParams = SchemaType<"TalkTranscriptionRelayCancelParams">;
|
||||
export type TalkTranscriptionRelayStopParams = SchemaType<"TalkTranscriptionRelayStopParams">;
|
||||
export type TalkTranscriptionRelayOkResult = SchemaType<"TalkTranscriptionRelayOkResult">;
|
||||
export type TalkSpeakParams = SchemaType<"TalkSpeakParams">;
|
||||
export type TalkSpeakResult = SchemaType<"TalkSpeakResult">;
|
||||
export type ChannelsStatusParams = SchemaType<"ChannelsStatusParams">;
|
||||
|
||||
@@ -32,6 +32,9 @@ const EVENT_SCOPE_GUARDS: Record<string, string[]> = {
|
||||
presence: [],
|
||||
shutdown: [],
|
||||
tick: [],
|
||||
"talk.event": [READ_SCOPE],
|
||||
"talk.realtime.relay": [READ_SCOPE],
|
||||
"talk.transcription.relay": [READ_SCOPE],
|
||||
"talk.mode": [WRITE_SCOPE],
|
||||
"update.available": [],
|
||||
"voicewake.changed": [READ_SCOPE],
|
||||
|
||||
24
src/gateway/server-methods-list.test.ts
Normal file
24
src/gateway/server-methods-list.test.ts
Normal file
@@ -0,0 +1,24 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { GATEWAY_EVENTS, listGatewayMethods } from "./server-methods-list.js";
|
||||
|
||||
// Guards the event names the gateway advertises for Talk streams.
describe("GATEWAY_EVENTS", () => {
  it("advertises Talk event streams in hello features", () => {
    const talkEventStreams = ["talk.event", "talk.realtime.relay", "talk.transcription.relay"];

    expect(GATEWAY_EVENTS).toEqual(expect.arrayContaining(talkEventStreams));
  });
});
|
||||
|
||||
// Guards the method names the gateway advertises for the unified Talk session RPCs.
describe("listGatewayMethods", () => {
  it("advertises the versioned Talk session RPCs", () => {
    const talkSessionMethods = [
      "talk.session.create",
      "talk.session.inputAudio",
      "talk.session.control",
      "talk.session.toolResult",
      "talk.session.close",
    ];

    const advertised = listGatewayMethods();

    expect(advertised).toEqual(expect.arrayContaining(talkSessionMethods));
  });
});
|
||||
@@ -56,12 +56,30 @@ const BASE_METHODS = [
|
||||
"wizard.next",
|
||||
"wizard.cancel",
|
||||
"wizard.status",
|
||||
"talk.catalog",
|
||||
"talk.config",
|
||||
"talk.session.create",
|
||||
"talk.session.inputAudio",
|
||||
"talk.session.control",
|
||||
"talk.session.toolResult",
|
||||
"talk.session.close",
|
||||
"talk.handoff.create",
|
||||
"talk.handoff.join",
|
||||
"talk.handoff.revoke",
|
||||
"talk.handoff.turnStart",
|
||||
"talk.handoff.turnEnd",
|
||||
"talk.handoff.turnCancel",
|
||||
"talk.realtime.session",
|
||||
"talk.realtime.toolCall",
|
||||
"talk.realtime.relayAudio",
|
||||
"talk.realtime.relayCancel",
|
||||
"talk.realtime.relayMark",
|
||||
"talk.realtime.relayStop",
|
||||
"talk.realtime.relayToolResult",
|
||||
"talk.transcription.session",
|
||||
"talk.transcription.relayAudio",
|
||||
"talk.transcription.relayCancel",
|
||||
"talk.transcription.relayStop",
|
||||
"talk.speak",
|
||||
"talk.mode",
|
||||
"commands.list",
|
||||
@@ -182,6 +200,9 @@ export const GATEWAY_EVENTS = [
|
||||
"presence",
|
||||
"tick",
|
||||
"talk.mode",
|
||||
"talk.event",
|
||||
"talk.realtime.relay",
|
||||
"talk.transcription.relay",
|
||||
"shutdown",
|
||||
"health",
|
||||
"heartbeat",
|
||||
|
||||
@@ -62,7 +62,7 @@ export type GatewayRequestContext = {
|
||||
nodeSubscribe: (nodeId: string, sessionKey: string) => void;
|
||||
nodeUnsubscribe: (nodeId: string, sessionKey: string) => void;
|
||||
nodeUnsubscribeAll: (nodeId: string) => void;
|
||||
hasConnectedMobileNode: () => boolean;
|
||||
hasConnectedTalkNode: () => boolean;
|
||||
hasExecApprovalClients?: (excludeConnId?: string) => boolean;
|
||||
disconnectClientsForDevice?: (deviceId: string, opts?: { role?: string }) => void;
|
||||
disconnectClientsUsingSharedGatewayAuth?: () => void;
|
||||
|
||||
497
src/gateway/server-methods/talk-session.ts
Normal file
497
src/gateway/server-methods/talk-session.ts
Normal file
@@ -0,0 +1,497 @@
|
||||
import { REALTIME_VOICE_AGENT_CONSULT_TOOL } from "../../realtime-voice/agent-consult-tool.js";
|
||||
import { resolveConfiguredRealtimeVoiceProvider } from "../../realtime-voice/provider-resolver.js";
|
||||
import type { TalkBrain, TalkMode, TalkTransport } from "../../realtime-voice/talk-events.js";
|
||||
import {
|
||||
normalizeOptionalLowercaseString,
|
||||
normalizeOptionalString,
|
||||
} from "../../shared/string-coerce.js";
|
||||
import { ADMIN_SCOPE } from "../operator-scopes.js";
|
||||
import {
|
||||
ErrorCodes,
|
||||
errorShape,
|
||||
formatValidationErrors,
|
||||
validateTalkSessionCloseParams,
|
||||
validateTalkSessionControlParams,
|
||||
validateTalkSessionCreateParams,
|
||||
validateTalkSessionInputAudioParams,
|
||||
validateTalkSessionToolResultParams,
|
||||
} from "../protocol/index.js";
|
||||
import { resolveSessionKeyFromResolveParams } from "../sessions-resolve.js";
|
||||
import {
|
||||
cancelTalkHandoffTurn,
|
||||
createTalkHandoff,
|
||||
endTalkHandoffTurn,
|
||||
revokeTalkHandoff,
|
||||
startTalkHandoffTurn,
|
||||
} from "../talk-handoff.js";
|
||||
import {
|
||||
cancelTalkRealtimeRelayTurn,
|
||||
createTalkRealtimeRelaySession,
|
||||
sendTalkRealtimeRelayAudio,
|
||||
stopTalkRealtimeRelaySession,
|
||||
submitTalkRealtimeRelayToolResult,
|
||||
} from "../talk-realtime-relay.js";
|
||||
import {
|
||||
forgetUnifiedTalkSession,
|
||||
getUnifiedTalkSession,
|
||||
rememberUnifiedTalkSession,
|
||||
requireUnifiedTalkSessionConn,
|
||||
} from "../talk-session-registry.js";
|
||||
import {
|
||||
cancelTalkTranscriptionRelayTurn,
|
||||
createTalkTranscriptionRelaySession,
|
||||
sendTalkTranscriptionRelayAudio,
|
||||
stopTalkTranscriptionRelaySession,
|
||||
} from "../talk-transcription-relay.js";
|
||||
import { formatForLog } from "../ws-log.js";
|
||||
import {
|
||||
broadcastTalkRoomEvents,
|
||||
buildRealtimeInstructions,
|
||||
buildTalkRealtimeConfig,
|
||||
buildTalkTranscriptionConfig,
|
||||
canUseTalkDirectTools,
|
||||
resolveConfiguredRealtimeTranscriptionProvider,
|
||||
talkHandoffErrorCode,
|
||||
withRealtimeBrowserOverrides,
|
||||
} from "./talk-shared.js";
|
||||
import type { GatewayRequestHandlers } from "./types.js";
|
||||
|
||||
/**
 * Picks the Talk mode for a session request.
 *
 * A non-empty `mode` (after `normalizeOptionalLowercaseString`) wins. Otherwise the mode is
 * derived from the requested transport: `"managed-room"` implies `"stt-tts"`, anything else
 * (including no transport at all) implies `"realtime"`.
 */
function normalizeTalkSessionMode(params: { mode?: string; transport?: string }): TalkMode {
  const explicitMode = normalizeOptionalLowercaseString(params.mode) as TalkMode | undefined;
  if (explicitMode) {
    return explicitMode;
  }
  const requestedTransport = normalizeOptionalLowercaseString(params.transport);
  if (requestedTransport === "managed-room") {
    return "stt-tts";
  }
  return "realtime";
}
|
||||
|
||||
/**
 * Picks the Talk transport for a session request.
 *
 * A non-empty `transport` (after `normalizeOptionalLowercaseString`) wins. Otherwise the
 * default follows the already-resolved mode: `"stt-tts"` uses `"managed-room"`, every other
 * mode uses `"gateway-relay"`.
 */
function normalizeTalkSessionTransport(params: {
  mode: TalkMode;
  transport?: string;
}): TalkTransport {
  const explicitTransport = normalizeOptionalLowercaseString(params.transport) as
    | TalkTransport
    | undefined;
  if (explicitTransport) {
    return explicitTransport;
  }
  if (params.mode === "stt-tts") {
    return "managed-room";
  }
  return "gateway-relay";
}
|
||||
|
||||
/**
 * Picks the Talk brain for a session request.
 *
 * A non-empty `brain` (after `normalizeOptionalLowercaseString`) wins. Otherwise
 * `"transcription"` mode defaults to `"none"` and every other mode to `"agent-consult"`.
 */
function normalizeTalkSessionBrain(params: { mode: TalkMode; brain?: string }): TalkBrain {
  const explicitBrain = normalizeOptionalLowercaseString(params.brain) as TalkBrain | undefined;
  if (explicitBrain) {
    return explicitBrain;
  }
  if (params.mode === "transcription") {
    return "none";
  }
  return "agent-consult";
}
|
||||
|
||||
/**
 * Gateway request handlers for the unified `talk.session.*` RPCs.
 *
 * Every handler validates its params first and answers `INVALID_REQUEST` on a schema failure.
 * Exceptions thrown while serving a request are reported as `UNAVAILABLE`, with the message
 * produced by `formatForLog`.
 */
export const talkSessionHandlers: GatewayRequestHandlers = {
  /**
   * Creates a Talk session and records it in the unified session registry.
   *
   * - transport `"managed-room"`: creates a Talk handoff, registered under the handoff id.
   * - mode `"realtime"`: requires `gateway-relay` + `agent-consult`; creates a realtime relay
   *   session bound to the caller's connection.
   * - mode `"transcription"`: requires `gateway-relay` + brain `"none"`; creates a
   *   transcription relay session bound to the caller's connection.
   * - anything else that reaches the end is rejected with the stt-tts/managed-room message.
   */
  "talk.session.create": async ({ params, respond, context, client }) => {
    // Shorthand for the failure responses this handler emits.
    const reject = (code: Parameters<typeof errorShape>[0], message: string): void => {
      respond(false, undefined, errorShape(code, message));
    };

    if (!validateTalkSessionCreateParams(params)) {
      const details = formatValidationErrors(validateTalkSessionCreateParams.errors);
      reject(ErrorCodes.INVALID_REQUEST, `invalid talk.session.create params: ${details}`);
      return;
    }

    const mode = normalizeTalkSessionMode(params);
    const transport = normalizeTalkSessionTransport({ mode, transport: params.transport });
    const brain = normalizeTalkSessionBrain({ mode, brain: params.brain });

    // Browser transports are pointed at talk.realtime.session instead of this RPC.
    if (transport === "webrtc" || transport === "provider-websocket") {
      reject(
        ErrorCodes.INVALID_REQUEST,
        `talk.session.create is Gateway-managed; use talk.realtime.session for browser transport "${transport}"`,
      );
      return;
    }

    try {
      if (transport === "managed-room") {
        // direct-tools is only granted to clients whose scopes pass canUseTalkDirectTools.
        const wantsDirectTools = brain === "direct-tools";
        if (wantsDirectTools && !canUseTalkDirectTools(client)) {
          reject(
            ErrorCodes.INVALID_REQUEST,
            `talk.session.create brain="direct-tools" requires gateway scope: ${ADMIN_SCOPE}`,
          );
          return;
        }

        const sessionLookup = await resolveSessionKeyFromResolveParams({
          cfg: context.getRuntimeConfig(),
          p: {
            key: params.sessionKey,
            includeGlobal: true,
            includeUnknown: true,
          },
        });
        if (!sessionLookup.ok) {
          respond(false, undefined, sessionLookup.error);
          return;
        }

        const handoff = createTalkHandoff({
          sessionKey: sessionLookup.key,
          provider: normalizeOptionalString(params.provider),
          model: normalizeOptionalString(params.model),
          voice: normalizeOptionalString(params.voice),
          mode,
          transport,
          brain,
          ttlMs: params.ttlMs,
        });
        // The handoff id doubles as the unified session id.
        rememberUnifiedTalkSession(handoff.id, {
          kind: "managed-room",
          handoffId: handoff.id,
          token: handoff.token,
          roomId: handoff.roomId,
        });
        respond(
          true,
          {
            sessionId: handoff.id,
            provider: handoff.provider,
            mode: handoff.mode,
            transport: handoff.transport,
            brain: handoff.brain,
            handoffId: handoff.id,
            roomId: handoff.roomId,
            roomUrl: handoff.roomUrl,
            token: handoff.token,
            model: handoff.model,
            voice: handoff.voice,
            expiresAt: handoff.expiresAt,
          },
          undefined,
        );
        return;
      }

      // Relay sessions are tied to the requesting connection.
      const connId = client?.connId;
      if (!connId) {
        reject(ErrorCodes.UNAVAILABLE, "Talk session unavailable");
        return;
      }

      if (mode === "realtime") {
        const relayShapeOk = transport === "gateway-relay" && brain === "agent-consult";
        if (!relayShapeOk) {
          reject(
            ErrorCodes.INVALID_REQUEST,
            `realtime talk.session.create requires transport="gateway-relay" and brain="agent-consult"`,
          );
          return;
        }

        const runtimeConfig = context.getRuntimeConfig();
        const realtimeConfig = buildTalkRealtimeConfig(runtimeConfig, params.provider);
        const resolution = resolveConfiguredRealtimeVoiceProvider({
          configuredProviderId: realtimeConfig.provider,
          providerConfigs: realtimeConfig.providers,
          cfg: runtimeConfig,
          cfgForResolve: runtimeConfig,
          noRegisteredProviderMessage: "No realtime voice provider registered",
        });
        // Request-level model/voice take precedence over the configured defaults.
        const model = normalizeOptionalString(params.model) ?? realtimeConfig.model;
        const voice = normalizeOptionalString(params.voice) ?? realtimeConfig.voice;
        const providerConfig = withRealtimeBrowserOverrides(resolution.providerConfig, {
          model,
          voice,
        });
        const session = createTalkRealtimeRelaySession({
          context,
          connId,
          provider: resolution.provider,
          providerConfig,
          instructions: buildRealtimeInstructions(),
          tools: [REALTIME_VOICE_AGENT_CONSULT_TOOL],
          model,
          voice,
        });
        rememberUnifiedTalkSession(session.relaySessionId, {
          kind: "realtime-relay",
          connId,
          relaySessionId: session.relaySessionId,
        });
        respond(
          true,
          {
            ...session,
            sessionId: session.relaySessionId,
            mode,
            brain,
          },
          undefined,
        );
        return;
      }

      if (mode === "transcription") {
        const relayShapeOk = transport === "gateway-relay" && brain === "none";
        if (!relayShapeOk) {
          reject(
            ErrorCodes.INVALID_REQUEST,
            `transcription talk.session.create requires transport="gateway-relay" and brain="none"`,
          );
          return;
        }

        const runtimeConfig = context.getRuntimeConfig();
        const transcriptionConfig = buildTalkTranscriptionConfig(runtimeConfig, params.provider);
        const resolution = resolveConfiguredRealtimeTranscriptionProvider({
          config: runtimeConfig,
          configuredProviderId: transcriptionConfig.provider,
          providerConfigs: transcriptionConfig.providers,
        });
        const session = createTalkTranscriptionRelaySession({
          context,
          connId,
          provider: resolution.provider,
          providerConfig: resolution.providerConfig,
        });
        rememberUnifiedTalkSession(session.transcriptionSessionId, {
          kind: "transcription-relay",
          connId,
          transcriptionSessionId: session.transcriptionSessionId,
        });
        respond(
          true,
          {
            ...session,
            sessionId: session.transcriptionSessionId,
            brain,
          },
          undefined,
        );
        return;
      }

      // Remaining case: a mode other than realtime/transcription on a non-managed-room transport.
      reject(
        ErrorCodes.INVALID_REQUEST,
        `stt-tts talk.session.create requires transport="managed-room"`,
      );
    } catch (err) {
      reject(ErrorCodes.UNAVAILABLE, formatForLog(err));
    }
  },
  /**
   * Forwards a base64 audio chunk to the relay behind a unified session.
   * Supported for realtime and transcription relay sessions; managed-room sessions are rejected.
   */
  "talk.session.inputAudio": async ({ params, respond, client }) => {
    const reject = (code: Parameters<typeof errorShape>[0], message: string): void => {
      respond(false, undefined, errorShape(code, message));
    };

    if (!validateTalkSessionInputAudioParams(params)) {
      const details = formatValidationErrors(validateTalkSessionInputAudioParams.errors);
      reject(ErrorCodes.INVALID_REQUEST, `invalid talk.session.inputAudio params: ${details}`);
      return;
    }

    try {
      const session = getUnifiedTalkSession(params.sessionId);
      switch (session.kind) {
        case "realtime-relay": {
          const connId = requireUnifiedTalkSessionConn(session, client?.connId);
          sendTalkRealtimeRelayAudio({
            relaySessionId: session.relaySessionId,
            connId,
            audioBase64: params.audioBase64,
            timestamp: params.timestamp,
          });
          respond(true, { ok: true }, undefined);
          return;
        }
        case "transcription-relay": {
          const connId = requireUnifiedTalkSessionConn(session, client?.connId);
          sendTalkTranscriptionRelayAudio({
            transcriptionSessionId: session.transcriptionSessionId,
            connId,
            audioBase64: params.audioBase64,
          });
          respond(true, { ok: true }, undefined);
          return;
        }
        default:
          reject(
            ErrorCodes.INVALID_REQUEST,
            "talk.session.inputAudio is not supported for managed-room sessions",
          );
      }
    } catch (err) {
      reject(ErrorCodes.UNAVAILABLE, formatForLog(err));
    }
  },
  /**
   * Applies a turn control to a unified session.
   *
   * Relay sessions accept only `type="turn.cancel"`. Managed-room sessions route
   * `turn.start` / `turn.end` to the matching handoff call and every other type to
   * `cancelTalkHandoffTurn`, then pass the resulting events to `broadcastTalkRoomEvents`.
   */
  "talk.session.control": async ({ params, respond, client, context }) => {
    const reject = (code: Parameters<typeof errorShape>[0], message: string): void => {
      respond(false, undefined, errorShape(code, message));
    };

    if (!validateTalkSessionControlParams(params)) {
      const details = formatValidationErrors(validateTalkSessionControlParams.errors);
      reject(ErrorCodes.INVALID_REQUEST, `invalid talk.session.control params: ${details}`);
      return;
    }

    try {
      const session = getUnifiedTalkSession(params.sessionId);
      const controlType = params.type;
      const isTurnCancel = controlType === "turn.cancel";

      if (session.kind === "realtime-relay") {
        if (!isTurnCancel) {
          reject(
            ErrorCodes.INVALID_REQUEST,
            `realtime relay sessions only support talk.session.control type="turn.cancel"`,
          );
          return;
        }
        const connId = requireUnifiedTalkSessionConn(session, client?.connId);
        cancelTalkRealtimeRelayTurn({
          relaySessionId: session.relaySessionId,
          connId,
          reason: normalizeOptionalString(params.reason),
        });
        respond(true, { ok: true }, undefined);
        return;
      }

      if (session.kind === "transcription-relay") {
        if (!isTurnCancel) {
          reject(
            ErrorCodes.INVALID_REQUEST,
            `transcription relay sessions only support talk.session.control type="turn.cancel"`,
          );
          return;
        }
        const connId = requireUnifiedTalkSessionConn(session, client?.connId);
        cancelTalkTranscriptionRelayTurn({
          transcriptionSessionId: session.transcriptionSessionId,
          connId,
          reason: normalizeOptionalString(params.reason),
        });
        respond(true, { ok: true }, undefined);
        return;
      }

      // Managed-room session: drive the handoff turn state with the token stored at creation.
      const { handoffId, token } = session;
      const turnId = params.turnId;
      const reason = params.reason;
      const runTurnControl = () => {
        if (controlType === "turn.start") {
          return startTalkHandoffTurn(handoffId, token, { turnId, clientId: client?.connId });
        }
        if (controlType === "turn.end") {
          return endTalkHandoffTurn(handoffId, token, { turnId });
        }
        return cancelTalkHandoffTurn(handoffId, token, { turnId, reason });
      };

      const result = runTurnControl();
      if (!result.ok) {
        reject(
          talkHandoffErrorCode(result.reason),
          `talk session control failed: ${result.reason}`,
        );
        return;
      }

      const { record, events } = result;
      broadcastTalkRoomEvents(context, record.room.activeClientId, {
        handoffId: record.id,
        roomId: record.roomId,
        events,
      });
      respond(true, { ok: true, turnId: result.turnId, events }, undefined);
    } catch (err) {
      reject(ErrorCodes.UNAVAILABLE, formatForLog(err));
    }
  },
  /**
   * Submits a tool-call result to a realtime relay session.
   * Any other session kind is rejected with `INVALID_REQUEST`.
   */
  "talk.session.toolResult": async ({ params, respond, client }) => {
    const reject = (code: Parameters<typeof errorShape>[0], message: string): void => {
      respond(false, undefined, errorShape(code, message));
    };

    if (!validateTalkSessionToolResultParams(params)) {
      const details = formatValidationErrors(validateTalkSessionToolResultParams.errors);
      reject(ErrorCodes.INVALID_REQUEST, `invalid talk.session.toolResult params: ${details}`);
      return;
    }

    try {
      const session = getUnifiedTalkSession(params.sessionId);
      if (session.kind !== "realtime-relay") {
        reject(
          ErrorCodes.INVALID_REQUEST,
          "talk.session.toolResult is only supported for realtime relay sessions",
        );
        return;
      }

      const connId = requireUnifiedTalkSessionConn(session, client?.connId);
      submitTalkRealtimeRelayToolResult({
        relaySessionId: session.relaySessionId,
        connId,
        callId: params.callId,
        result: params.result,
      });
      respond(true, { ok: true }, undefined);
    } catch (err) {
      reject(ErrorCodes.UNAVAILABLE, formatForLog(err));
    }
  },
  /**
   * Closes a unified session: stops the relay (realtime/transcription) or revokes the handoff
   * (managed-room), then removes the session from the unified registry.
   */
  "talk.session.close": async ({ params, respond, client }) => {
    const reject = (code: Parameters<typeof errorShape>[0], message: string): void => {
      respond(false, undefined, errorShape(code, message));
    };

    if (!validateTalkSessionCloseParams(params)) {
      const details = formatValidationErrors(validateTalkSessionCloseParams.errors);
      reject(ErrorCodes.INVALID_REQUEST, `invalid talk.session.close params: ${details}`);
      return;
    }

    try {
      const session = getUnifiedTalkSession(params.sessionId);
      switch (session.kind) {
        case "realtime-relay": {
          const connId = requireUnifiedTalkSessionConn(session, client?.connId);
          stopTalkRealtimeRelaySession({ relaySessionId: session.relaySessionId, connId });
          break;
        }
        case "transcription-relay": {
          const connId = requireUnifiedTalkSessionConn(session, client?.connId);
          stopTalkTranscriptionRelaySession({
            transcriptionSessionId: session.transcriptionSessionId,
            connId,
          });
          break;
        }
        default:
          revokeTalkHandoff(session.handoffId);
      }
      // Only reached when the stop/revoke call above did not throw.
      forgetUnifiedTalkSession(params.sessionId);
      respond(true, { ok: true }, undefined);
    } catch (err) {
      reject(ErrorCodes.UNAVAILABLE, formatForLog(err));
    }
  },
};
|
||||
237
src/gateway/server-methods/talk-shared.ts
Normal file
237
src/gateway/server-methods/talk-shared.ts
Normal file
@@ -0,0 +1,237 @@
|
||||
import type { OpenClawConfig } from "../../config/types.js";
|
||||
import { listRealtimeTranscriptionProviders } from "../../realtime-transcription/provider-registry.js";
|
||||
import type { RealtimeTranscriptionProviderConfig } from "../../realtime-transcription/provider-types.js";
|
||||
import { REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME } from "../../realtime-voice/agent-consult-tool.js";
|
||||
import type {
|
||||
RealtimeVoiceBrowserSession,
|
||||
RealtimeVoiceProviderConfig,
|
||||
} from "../../realtime-voice/provider-types.js";
|
||||
import type { TalkEvent } from "../../realtime-voice/talk-events.js";
|
||||
import {
|
||||
normalizeLowercaseStringOrEmpty,
|
||||
normalizeOptionalLowercaseString,
|
||||
normalizeOptionalString,
|
||||
} from "../../shared/string-coerce.js";
|
||||
import { ADMIN_SCOPE } from "../operator-scopes.js";
|
||||
import { ErrorCodes } from "../protocol/index.js";
|
||||
import type { TalkHandoffTurnResult } from "../talk-handoff.js";
|
||||
import { asRecord } from "./record-shared.js";
|
||||
|
||||
/**
 * Reports whether a client's connect scopes include `ADMIN_SCOPE`.
 *
 * @param client - Connected client, or `null`; a missing or non-array `connect.scopes`
 *   is treated as having no scopes.
 * @returns `true` only when the scopes array contains `ADMIN_SCOPE`.
 */
export function canUseTalkDirectTools(client: { connect?: { scopes?: string[] } } | null): boolean {
  const declaredScopes = client?.connect?.scopes;
  if (!Array.isArray(declaredScopes)) {
    return false;
  }
  return declaredScopes.includes(ADMIN_SCOPE);
}
|
||||
|
||||
/**
 * Sends each Talk event to a single connection as a `"talk.event"` broadcast.
 *
 * Does nothing when `connId` is empty/undefined or when there are no events. Each event is
 * sent in its own `broadcastToConnIds` call with `dropIfSlow: true` and a fresh one-element
 * connection-id set.
 *
 * @param context - Object exposing `broadcastToConnIds`.
 * @param connId - Target connection id.
 * @param params - Handoff id, room id and the events to deliver, in order.
 */
export function broadcastTalkRoomEvents(
  context: {
    broadcastToConnIds: (
      event: string,
      payload: unknown,
      connIds: Set<string>,
      opts?: { dropIfSlow?: boolean },
    ) => void;
  },
  connId: string | undefined,
  params: { handoffId: string; roomId: string; events: TalkEvent[] },
): void {
  if (!connId) {
    return;
  }
  if (params.events.length === 0) {
    return;
  }
  const { handoffId, roomId, events } = params;
  for (const talkEvent of events) {
    const payload = { handoffId, roomId, talkEvent };
    const recipients = new Set([connId]);
    context.broadcastToConnIds("talk.event", payload, recipients, { dropIfSlow: true });
  }
}
|
||||
|
||||
// Failure reasons carried by the `ok: false` branch of a handoff turn result.
type TalkHandoffFailureReason = Extract<TalkHandoffTurnResult, { ok: false }>["reason"];

/**
 * Maps a handoff turn failure reason to a gateway error code.
 *
 * `invalid_token`, `no_active_turn` and `stale_turn` map to `INVALID_REQUEST`;
 * every other reason maps to `UNAVAILABLE`.
 */
export function talkHandoffErrorCode(reason: TalkHandoffFailureReason) {
  switch (reason) {
    case "invalid_token":
    case "no_active_turn":
    case "stale_turn":
      return ErrorCodes.INVALID_REQUEST;
    default:
      return ErrorCodes.UNAVAILABLE;
  }
}
|
||||
|
||||
/**
 * Wraps `asRecord`, converting a nullish result into `undefined` so callers can use
 * optional chaining on the returned value.
 */
function getRecord(value: unknown): Record<string, unknown> | undefined {
  const record = asRecord(value);
  return record ?? undefined;
}
|
||||
|
||||
/**
 * Reads one provider section of the voice-call plugin config, i.e.
 * `plugins.entries["voice-call"].config[sectionKey]`.
 *
 * Every level is passed through `getRecord`, so a missing or non-record level yields an
 * empty result instead of throwing. Provider entries that are not records are skipped.
 *
 * @param config - Full OpenClaw config.
 * @param sectionKey - Which plugin section to read.
 * @returns The section's `provider` (via `normalizeOptionalString`) and its record-valued
 *   `providers`; `providers` is `undefined` when no usable entry exists.
 */
function readVoiceCallProviderSection(
  config: OpenClawConfig,
  sectionKey: "realtime" | "streaming",
): { provider?: string; providers?: Record<string, Record<string, unknown>> } {
  const plugins = getRecord(config.plugins);
  const entries = getRecord(plugins?.entries);
  const voiceCall = getRecord(entries?.["voice-call"]);
  const pluginConfig = getRecord(voiceCall?.config);
  const section = getRecord(pluginConfig?.[sectionKey]);
  const providersRaw = getRecord(section?.providers);
  const providers: Record<string, Record<string, unknown>> = {};
  if (providersRaw) {
    for (const [providerId, providerConfig] of Object.entries(providersRaw)) {
      const record = getRecord(providerConfig);
      if (record) {
        providers[providerId] = record;
      }
    }
  }
  return {
    provider: normalizeOptionalString(section?.provider),
    providers: Object.keys(providers).length > 0 ? providers : undefined,
  };
}

/**
 * Returns the `realtime` section of the voice-call plugin config: the selected provider id
 * and the per-provider configs (or `undefined` when none are present).
 */
function getVoiceCallRealtimeConfig(config: OpenClawConfig): {
  provider?: string;
  providers?: Record<string, RealtimeVoiceProviderConfig>;
} {
  return readVoiceCallProviderSection(config, "realtime");
}

/**
 * Returns the `streaming` section of the voice-call plugin config: the selected provider id
 * and the per-provider configs (or `undefined` when none are present).
 */
export function getVoiceCallStreamingConfig(config: OpenClawConfig): {
  provider?: string;
  providers?: Record<string, RealtimeTranscriptionProviderConfig>;
} {
  return readVoiceCallProviderSection(config, "streaming");
}
|
||||
|
||||
/**
 * Assembles the realtime Talk settings from `talk.realtime` and the voice-call plugin's
 * realtime section.
 *
 * Provider precedence: `requestedProvider`, then `talk.realtime.provider`, then the
 * voice-call realtime provider. Provider configs from both sources are merged, with
 * `talk.realtime.providers` overriding voice-call entries that share a key. `model`,
 * `voice`, `mode`, `transport` and `brain` come from `talk.realtime` only.
 */
export function buildTalkRealtimeConfig(config: OpenClawConfig, requestedProvider?: string) {
  const voiceCallRealtime = getVoiceCallRealtimeConfig(config);
  const talkRealtime = getRecord(config.talk?.realtime);
  const talkRealtimeProviderConfigs = talkRealtime?.providers as
    | Record<string, RealtimeVoiceProviderConfig>
    | undefined;

  // Later spreads win, so talk.realtime entries shadow voice-call ones.
  const providers = {
    ...voiceCallRealtime.providers,
    ...talkRealtimeProviderConfigs,
  };
  const provider =
    normalizeOptionalString(requestedProvider) ??
    normalizeOptionalString(talkRealtime?.provider) ??
    voiceCallRealtime.provider;
  const model = normalizeOptionalString(talkRealtime?.model);
  const voice = normalizeOptionalString(talkRealtime?.voice);
  const mode = normalizeOptionalLowercaseString(talkRealtime?.mode);
  const transport = normalizeOptionalLowercaseString(talkRealtime?.transport);
  const brain = normalizeOptionalLowercaseString(talkRealtime?.brain);

  return { provider, providers, model, voice, mode, transport, brain };
}
|
||||
|
||||
/**
 * Assembles the transcription settings from the voice-call plugin's streaming section.
 *
 * `requestedProvider` (when non-empty after `normalizeOptionalString`) overrides the
 * configured streaming provider; `providers` falls back to an empty object.
 */
export function buildTalkTranscriptionConfig(config: OpenClawConfig, requestedProvider?: string) {
  const { provider: configuredProvider, providers: configuredProviders } =
    getVoiceCallStreamingConfig(config);
  const provider = normalizeOptionalString(requestedProvider) ?? configuredProvider;
  const providers = configuredProviders ?? {};
  return { provider, providers };
}
|
||||
|
||||
/**
 * Finds the raw config entry for a transcription provider.
 *
 * Candidate keys are tried in order: the configured provider id, the provider's own id,
 * then its aliases. For each candidate an exact own-key match is preferred; failing that,
 * the first configured key that is equal after `normalizeOptionalLowercaseString` is used.
 *
 * @returns The matching config, or `{}` when nothing matches.
 */
function getRealtimeTranscriptionProviderConfig(params: {
  providerConfigs: Record<string, RealtimeTranscriptionProviderConfig>;
  provider: { id: string; aliases?: readonly string[] };
  configuredProviderId?: string;
}): RealtimeTranscriptionProviderConfig {
  const { providerConfigs, provider } = params;
  const aliases = provider.aliases ?? [];
  const lookupOrder = [
    normalizeOptionalString(params.configuredProviderId),
    provider.id,
    ...aliases,
  ].filter((key): key is string => Boolean(key));
  const configuredKeys = Object.keys(providerConfigs);

  for (const candidate of lookupOrder) {
    // Exact key first.
    if (Object.hasOwn(providerConfigs, candidate)) {
      return providerConfigs[candidate] ?? {};
    }
    // Then a key that matches once both sides are normalized to lowercase.
    const wanted = normalizeOptionalLowercaseString(candidate);
    const matchingKey = configuredKeys.find(
      (configuredKey) => normalizeOptionalLowercaseString(configuredKey) === wanted,
    );
    if (matchingKey) {
      return providerConfigs[matchingKey] ?? {};
    }
  }
  return {};
}
|
||||
|
||||
/**
 * Runs a predicate and treats any thrown error as `false`.
 *
 * @param callback - Check to run.
 * @returns The callback's result, or `false` when the callback throws.
 */
export function configuredOrFalse(callback: () => boolean): boolean {
  let outcome = false;
  try {
    outcome = callback();
  } catch {
    // A throwing check counts as "not configured".
    outcome = false;
  }
  return outcome;
}
|
||||
|
||||
/**
 * Selects a realtime transcription provider and its resolved config.
 *
 * With a configured provider id, only registered providers whose id or alias matches it
 * (compared after `normalizeOptionalLowercaseString`) are considered. Without one, all
 * registered providers are tried in ascending `autoSelectOrder` (missing values count as
 * 1000). The first candidate whose `isConfigured` check passes is returned; a check that
 * throws counts as not configured.
 *
 * @throws Error when no candidate is configured; the message names the configured id when
 *   one was supplied.
 */
export function resolveConfiguredRealtimeTranscriptionProvider(params: {
  config: OpenClawConfig;
  configuredProviderId?: string;
  providerConfigs: Record<string, RealtimeTranscriptionProviderConfig>;
}) {
  const { config, configuredProviderId, providerConfigs } = params;
  const registered = listRealtimeTranscriptionProviders(config);
  const wantedId = normalizeOptionalLowercaseString(configuredProviderId);

  const candidates = wantedId
    ? registered.filter((provider) =>
        [provider.id, ...(provider.aliases ?? [])].some(
          (name) => normalizeOptionalLowercaseString(name) === wantedId,
        ),
      )
    : registered.toSorted(
        (left, right) => (left.autoSelectOrder ?? 1000) - (right.autoSelectOrder ?? 1000),
      );

  for (const provider of candidates) {
    const rawConfig = getRealtimeTranscriptionProviderConfig({
      providerConfigs,
      provider,
      configuredProviderId,
    });
    // Providers may post-process their raw config; otherwise it is used as-is.
    const providerConfig = provider.resolveConfig?.({ cfg: config, rawConfig }) ?? rawConfig;
    const usable = configuredOrFalse(() => provider.isConfigured({ cfg: config, providerConfig }));
    if (usable) {
      return { provider, providerConfig };
    }
  }

  if (wantedId) {
    throw new Error(`Realtime transcription provider "${configuredProviderId}" is not configured`);
  }
  throw new Error("No realtime transcription provider registered");
}
|
||||
|
||||
/**
 * Builds the instruction text for the realtime voice interface.
 *
 * @returns The instruction string, with the agent-consult tool name
 * interpolated into it.
 */
export function buildRealtimeInstructions(): string {
  const consultToolName = REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME;
  const instructions = `You are OpenClaw's realtime voice interface. Keep spoken replies concise. If the user asks for code, repository state, tools, files, current OpenClaw context, or deeper reasoning, call ${consultToolName} and then summarize the result naturally.`;
  return instructions;
}
|
||||
|
||||
/**
 * Applies optional `model` / `voice` overrides on top of a provider config.
 *
 * @param providerConfig - Base provider config; never mutated.
 * @param params - Requested overrides; each is passed through
 * `normalizeOptionalString` and ignored when the result is falsy.
 * @returns The original `providerConfig` object when no override applies,
 * otherwise a shallow copy with the overrides set.
 */
export function withRealtimeBrowserOverrides(
  providerConfig: RealtimeVoiceProviderConfig,
  params: { model?: string; voice?: string },
): RealtimeVoiceProviderConfig {
  const requestedModel = normalizeOptionalString(params.model);
  const requestedVoice = normalizeOptionalString(params.voice);
  if (!requestedModel && !requestedVoice) {
    // Nothing to override: hand back the very same object.
    return providerConfig;
  }
  const merged: RealtimeVoiceProviderConfig = { ...providerConfig };
  if (requestedModel) {
    merged.model = requestedModel;
  }
  if (requestedVoice) {
    merged.voice = requestedVoice;
  }
  return merged;
}
|
||||
|
||||
/**
 * Tells whether a browser session is a "google" provider session on the
 * "webrtc" transport.
 *
 * @param session - Browser session to inspect. A missing `transport` is
 * treated as "webrtc".
 * @returns `true` only when the lowercased provider is "google" and the
 * effective transport is "webrtc".
 */
export function isUnsupportedBrowserWebRtcSession(session: RealtimeVoiceBrowserSession): boolean {
  const providerId = normalizeLowercaseStringOrEmpty(session.provider);
  const effectiveTransport = (session as { transport?: string }).transport ?? "webrtc";
  if (providerId !== "google") {
    return false;
  }
  return effectiveTransport === "webrtc";
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,3 +1,4 @@
|
||||
import { randomUUID } from "node:crypto";
|
||||
import { readConfigFileSnapshot } from "../../config/config.js";
|
||||
import { redactConfigObject } from "../../config/redact-snapshot.js";
|
||||
import {
|
||||
@@ -7,47 +8,103 @@ import {
|
||||
} from "../../config/talk.js";
|
||||
import type { TalkConfigResponse, TalkProviderConfig } from "../../config/types.gateway.js";
|
||||
import type { OpenClawConfig, TtsConfig, TtsProviderConfigMap } from "../../config/types.js";
|
||||
import { listRealtimeTranscriptionProviders } from "../../realtime-transcription/provider-registry.js";
|
||||
import {
|
||||
REALTIME_VOICE_AGENT_CONSULT_TOOL,
|
||||
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
|
||||
buildRealtimeVoiceAgentConsultChatMessage,
|
||||
} from "../../realtime-voice/agent-consult-tool.js";
|
||||
import { getRealtimeVoiceProvider } from "../../realtime-voice/provider-registry.js";
|
||||
import {
|
||||
canonicalizeRealtimeVoiceProviderId,
|
||||
listRealtimeVoiceProviders,
|
||||
} from "../../realtime-voice/provider-registry.js";
|
||||
import { resolveConfiguredRealtimeVoiceProvider } from "../../realtime-voice/provider-resolver.js";
|
||||
import type {
|
||||
RealtimeVoiceBrowserSession,
|
||||
RealtimeVoiceProviderConfig,
|
||||
} from "../../realtime-voice/provider-types.js";
|
||||
import {
|
||||
normalizeLowercaseStringOrEmpty,
|
||||
normalizeOptionalLowercaseString,
|
||||
normalizeOptionalString,
|
||||
} from "../../shared/string-coerce.js";
|
||||
import { canonicalizeSpeechProviderId, getSpeechProvider } from "../../tts/provider-registry.js";
|
||||
import { synthesizeSpeech, type TtsDirectiveOverrides } from "../../tts/tts.js";
|
||||
import {
|
||||
canonicalizeSpeechProviderId,
|
||||
getSpeechProvider,
|
||||
listSpeechProviders,
|
||||
} from "../../tts/provider-registry.js";
|
||||
import {
|
||||
getResolvedSpeechProviderConfig,
|
||||
resolveTtsConfig,
|
||||
synthesizeSpeech,
|
||||
type TtsDirectiveOverrides,
|
||||
} from "../../tts/tts.js";
|
||||
import { ADMIN_SCOPE, TALK_SECRETS_SCOPE } from "../operator-scopes.js";
|
||||
import {
|
||||
ErrorCodes,
|
||||
errorShape,
|
||||
formatValidationErrors,
|
||||
type ErrorShape,
|
||||
type TalkSpeakParams,
|
||||
validateTalkCatalogParams,
|
||||
validateTalkConfigParams,
|
||||
validateTalkHandoffCreateParams,
|
||||
validateTalkHandoffJoinParams,
|
||||
validateTalkHandoffRevokeParams,
|
||||
validateTalkHandoffTurnCancelParams,
|
||||
validateTalkHandoffTurnEndParams,
|
||||
validateTalkHandoffTurnStartParams,
|
||||
validateTalkModeParams,
|
||||
validateTalkRealtimeRelayAudioParams,
|
||||
validateTalkRealtimeRelayCancelParams,
|
||||
validateTalkRealtimeRelayMarkParams,
|
||||
validateTalkRealtimeRelayStopParams,
|
||||
validateTalkRealtimeRelayToolResultParams,
|
||||
validateTalkRealtimeSessionParams,
|
||||
validateTalkRealtimeToolCallParams,
|
||||
validateTalkTranscriptionRelayAudioParams,
|
||||
validateTalkTranscriptionRelayCancelParams,
|
||||
validateTalkTranscriptionRelayStopParams,
|
||||
validateTalkTranscriptionSessionParams,
|
||||
validateTalkSpeakParams,
|
||||
} from "../protocol/index.js";
|
||||
import { resolveSessionKeyFromResolveParams } from "../sessions-resolve.js";
|
||||
import {
|
||||
cancelTalkHandoffTurn,
|
||||
createTalkHandoff,
|
||||
endTalkHandoffTurn,
|
||||
joinTalkHandoff,
|
||||
revokeTalkHandoff,
|
||||
startTalkHandoffTurn,
|
||||
} from "../talk-handoff.js";
|
||||
import {
|
||||
acknowledgeTalkRealtimeRelayMark,
|
||||
cancelTalkRealtimeRelayTurn,
|
||||
createTalkRealtimeRelaySession,
|
||||
registerTalkRealtimeRelayAgentRun,
|
||||
sendTalkRealtimeRelayAudio,
|
||||
stopTalkRealtimeRelaySession,
|
||||
submitTalkRealtimeRelayToolResult,
|
||||
} from "../talk-realtime-relay.js";
|
||||
import {
|
||||
cancelTalkTranscriptionRelayTurn,
|
||||
createTalkTranscriptionRelaySession,
|
||||
sendTalkTranscriptionRelayAudio,
|
||||
stopTalkTranscriptionRelaySession,
|
||||
} from "../talk-transcription-relay.js";
|
||||
import { formatForLog } from "../ws-log.js";
|
||||
import { chatHandlers } from "./chat.js";
|
||||
import { asRecord } from "./record-shared.js";
|
||||
import { talkSessionHandlers } from "./talk-session.js";
|
||||
import {
|
||||
broadcastTalkRoomEvents,
|
||||
buildRealtimeInstructions,
|
||||
buildTalkRealtimeConfig,
|
||||
buildTalkTranscriptionConfig,
|
||||
canUseTalkDirectTools,
|
||||
configuredOrFalse,
|
||||
getVoiceCallStreamingConfig,
|
||||
isUnsupportedBrowserWebRtcSession,
|
||||
resolveConfiguredRealtimeTranscriptionProvider,
|
||||
talkHandoffErrorCode,
|
||||
withRealtimeBrowserOverrides,
|
||||
} from "./talk-shared.js";
|
||||
import type { GatewayRequestHandlers } from "./types.js";
|
||||
|
||||
type TalkSpeakReason =
|
||||
@@ -158,83 +215,117 @@ function buildTalkTtsConfig(
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Wraps `asRecord`, mapping a nullish result to `undefined`.
 *
 * @param value - Arbitrary value to coerce.
 * @returns The record produced by `asRecord`, or `undefined`.
 */
function getRecord(value: unknown): Record<string, unknown> | undefined {
  const record = asRecord(value);
  return record == null ? undefined : record;
}
|
||||
function buildTalkCatalog(config: OpenClawConfig) {
|
||||
const ttsConfig = resolveTtsConfig(config);
|
||||
const talkResolved = resolveActiveTalkProviderConfig(config.talk);
|
||||
const activeSpeechProvider = canonicalizeSpeechProviderId(talkResolved?.provider, config);
|
||||
const streamingConfig = getVoiceCallStreamingConfig(config);
|
||||
const realtimeConfig = buildTalkRealtimeConfig(config);
|
||||
const activeRealtimeProvider = canonicalizeRealtimeVoiceProviderId(
|
||||
realtimeConfig.provider,
|
||||
config,
|
||||
);
|
||||
|
||||
/**
 * Reads the realtime section at
 * `plugins.entries["voice-call"].config.realtime` of the given config.
 *
 * @param config - Config to read from.
 * @returns `provider`: the normalized `realtime.provider` value;
 * `providers`: the entries of `realtime.providers` that are records, or
 * `undefined` when there are none.
 */
function getVoiceCallRealtimeConfig(config: OpenClawConfig): {
  provider?: string;
  providers?: Record<string, RealtimeVoiceProviderConfig>;
} {
  // Walk plugins -> entries -> "voice-call" -> config -> realtime, coercing
  // each step through getRecord so a missing level yields undefined.
  const realtime = ["entries", "voice-call", "config", "realtime"].reduce<
    Record<string, unknown> | undefined
  >((node, key) => getRecord(node?.[key]), getRecord(config.plugins));

  const rawProviders = getRecord(realtime?.providers);
  const collected: Record<string, RealtimeVoiceProviderConfig> = {};
  for (const [id, rawEntry] of Object.entries(rawProviders ?? {})) {
    const entry = getRecord(rawEntry);
    if (!entry) {
      // Skip values that are not records.
      continue;
    }
    collected[id] = entry;
  }

  const hasProviders = Object.keys(collected).length > 0;
  return {
    provider: normalizeOptionalString(realtime?.provider),
    providers: hasProviders ? collected : undefined,
  };
}
|
||||
|
||||
function buildTalkRealtimeConfig(config: OpenClawConfig, requestedProvider?: string) {
|
||||
const voiceCallRealtime = getVoiceCallRealtimeConfig(config);
|
||||
const talkProviderConfigs = config.talk?.providers as
|
||||
| Record<string, RealtimeVoiceProviderConfig>
|
||||
| undefined;
|
||||
const talkProvider = normalizeOptionalString(config.talk?.provider);
|
||||
const talkProviderSupportsRealtime = talkProvider
|
||||
? Boolean(getRealtimeVoiceProvider(talkProvider, config))
|
||||
: false;
|
||||
const provider =
|
||||
normalizeOptionalString(requestedProvider) ??
|
||||
(talkProviderSupportsRealtime ? talkProvider : undefined) ??
|
||||
voiceCallRealtime.provider;
|
||||
return {
|
||||
provider,
|
||||
providers: {
|
||||
...voiceCallRealtime.providers,
|
||||
...talkProviderConfigs,
|
||||
modes: ["realtime", "stt-tts", "transcription"],
|
||||
transports: ["webrtc", "provider-websocket", "gateway-relay", "managed-room"],
|
||||
brains: ["agent-consult", "direct-tools", "none"],
|
||||
speech: {
|
||||
...(activeSpeechProvider ? { activeProvider: activeSpeechProvider } : {}),
|
||||
providers: listSpeechProviders(config).map((provider) => {
|
||||
const entry: Record<string, unknown> = {
|
||||
id: provider.id,
|
||||
label: provider.label,
|
||||
configured: configuredOrFalse(() =>
|
||||
provider.isConfigured({
|
||||
cfg: config,
|
||||
providerConfig: getResolvedSpeechProviderConfig(ttsConfig, provider.id, config),
|
||||
timeoutMs: ttsConfig.timeoutMs,
|
||||
}),
|
||||
),
|
||||
modes: ["stt-tts"],
|
||||
brains: ["agent-consult"],
|
||||
};
|
||||
if (provider.models) {
|
||||
entry.models = [...provider.models];
|
||||
}
|
||||
if (provider.voices) {
|
||||
entry.voices = [...provider.voices];
|
||||
}
|
||||
return entry;
|
||||
}),
|
||||
},
|
||||
transcription: {
|
||||
...(streamingConfig.provider ? { activeProvider: streamingConfig.provider } : {}),
|
||||
providers: listRealtimeTranscriptionProviders(config).map((provider) => {
|
||||
const rawConfig = streamingConfig.providers?.[provider.id] ?? {};
|
||||
const providerConfig = provider.resolveConfig?.({ cfg: config, rawConfig }) ?? rawConfig;
|
||||
const entry: Record<string, unknown> = {
|
||||
id: provider.id,
|
||||
label: provider.label,
|
||||
configured: configuredOrFalse(() =>
|
||||
provider.isConfigured({ cfg: config, providerConfig }),
|
||||
),
|
||||
modes: ["transcription"],
|
||||
transports: ["gateway-relay"],
|
||||
brains: ["none"],
|
||||
};
|
||||
if (provider.defaultModel) {
|
||||
entry.defaultModel = provider.defaultModel;
|
||||
}
|
||||
return entry;
|
||||
}),
|
||||
},
|
||||
realtime: {
|
||||
...(activeRealtimeProvider ? { activeProvider: activeRealtimeProvider } : {}),
|
||||
providers: listRealtimeVoiceProviders(config).map((provider) => {
|
||||
const rawConfig = realtimeConfig.providers?.[provider.id] ?? {};
|
||||
const providerConfig = provider.resolveConfig?.({ cfg: config, rawConfig }) ?? rawConfig;
|
||||
const capabilities = provider.capabilities;
|
||||
const entry: Record<string, unknown> = {
|
||||
id: provider.id,
|
||||
label: provider.label,
|
||||
configured: configuredOrFalse(() =>
|
||||
provider.isConfigured({ cfg: config, providerConfig }),
|
||||
),
|
||||
modes: ["realtime"],
|
||||
brains: capabilities?.supportsToolCalls === false ? ["none"] : ["agent-consult"],
|
||||
supportsBrowserSession: Boolean(
|
||||
capabilities?.supportsBrowserSession ?? provider.createBrowserSession,
|
||||
),
|
||||
};
|
||||
if (provider.defaultModel) {
|
||||
entry.defaultModel = provider.defaultModel;
|
||||
}
|
||||
if (capabilities?.transports) {
|
||||
entry.transports = [...capabilities.transports];
|
||||
}
|
||||
if (capabilities?.inputAudioFormats) {
|
||||
entry.inputAudioFormats = capabilities.inputAudioFormats.map((format) => ({ ...format }));
|
||||
}
|
||||
if (capabilities?.outputAudioFormats) {
|
||||
entry.outputAudioFormats = capabilities.outputAudioFormats.map((format) => ({
|
||||
...format,
|
||||
}));
|
||||
}
|
||||
if (capabilities?.supportsBargeIn !== undefined) {
|
||||
entry.supportsBargeIn = capabilities.supportsBargeIn;
|
||||
}
|
||||
if (capabilities?.supportsToolCalls !== undefined) {
|
||||
entry.supportsToolCalls = capabilities.supportsToolCalls;
|
||||
}
|
||||
if (capabilities?.supportsVideoFrames !== undefined) {
|
||||
entry.supportsVideoFrames = capabilities.supportsVideoFrames;
|
||||
}
|
||||
if (capabilities?.supportsSessionResumption !== undefined) {
|
||||
entry.supportsSessionResumption = capabilities.supportsSessionResumption;
|
||||
}
|
||||
return entry;
|
||||
}),
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Builds the instruction text for the realtime voice interface.
 *
 * @returns The instruction string, with the agent-consult tool name
 * interpolated into it.
 */
function buildRealtimeInstructions(): string {
  const consultToolName = REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME;
  const instructions = `You are OpenClaw's realtime voice interface. Keep spoken replies concise. If the user asks for code, repository state, tools, files, current OpenClaw context, or deeper reasoning, call ${consultToolName} and then summarize the result naturally.`;
  return instructions;
}
|
||||
|
||||
/**
 * Applies optional `model` / `voice` overrides on top of a provider config.
 *
 * @param providerConfig - Base provider config; never mutated.
 * @param params - Requested overrides; each is passed through
 * `normalizeOptionalString` and ignored when the result is falsy.
 * @returns The original `providerConfig` object when no override applies,
 * otherwise a shallow copy with the overrides set.
 */
function withRealtimeBrowserOverrides(
  providerConfig: RealtimeVoiceProviderConfig,
  params: { model?: string; voice?: string },
): RealtimeVoiceProviderConfig {
  const requestedModel = normalizeOptionalString(params.model);
  const requestedVoice = normalizeOptionalString(params.voice);
  if (!requestedModel && !requestedVoice) {
    // Nothing to override: hand back the very same object.
    return providerConfig;
  }
  const merged: RealtimeVoiceProviderConfig = { ...providerConfig };
  if (requestedModel) {
    merged.model = requestedModel;
  }
  if (requestedVoice) {
    merged.voice = requestedVoice;
  }
  return merged;
}
|
||||
|
||||
/**
 * Tells whether a browser session is a "google" provider session on the
 * "webrtc-sdp" transport.
 *
 * @param session - Browser session to inspect. A missing `transport` is
 * treated as "webrtc-sdp".
 * @returns `true` only when the lowercased provider is "google" and the
 * effective transport is "webrtc-sdp".
 */
function isUnsupportedBrowserWebRtcSession(session: RealtimeVoiceBrowserSession): boolean {
  const providerId = normalizeLowercaseStringOrEmpty(session.provider);
  const effectiveTransport = (session as { transport?: string }).transport ?? "webrtc-sdp";
  if (providerId !== "google") {
    return false;
  }
  return effectiveTransport === "webrtc-sdp";
}
|
||||
|
||||
function isFallbackEligibleTalkReason(reason: TalkSpeakReason): boolean {
|
||||
return (
|
||||
reason === "talk_unconfigured" ||
|
||||
@@ -443,7 +534,89 @@ function stripUnresolvedSecretApiKeyFromRecord(
|
||||
return rest;
|
||||
}
|
||||
|
||||
/**
 * Forwards a realtime tool call to the `chat.send` handler as a chat message
 * and reports the run it started.
 *
 * @param params.sessionKey - Session the chat message is sent to.
 * @param params.callId - Tool call id; embedded in the idempotency key.
 * @param params.args - Tool arguments turned into the chat message text.
 * @param params.relaySessionId - Optional relay session to register the run with.
 * @param params.connId - Optional connection id; registration needs both this
 * and `relaySessionId`.
 * @param params.request - Originating gateway request, reused as the base of
 * the synthetic `chat.send` request.
 * @returns `{ ok: true, runId, idempotencyKey }` on success, otherwise
 * `{ ok: false, error }`.
 */
async function startRealtimeToolCallAgentConsult(params: {
  sessionKey: string;
  callId: string;
  args: unknown;
  relaySessionId?: string;
  connId?: string;
  request: Parameters<GatewayRequestHandlers[string]>[0];
}): Promise<
  { ok: true; runId: string; idempotencyKey: string } | { ok: false; error: ErrorShape }
> {
  type ChatOutcome = { ok: true; result: unknown } | { ok: false; error: ErrorShape };
  const { sessionKey, callId, relaySessionId, connId, request } = params;

  let message: string;
  try {
    message = buildRealtimeVoiceAgentConsultChatMessage(params.args);
  } catch (err) {
    // A message-building failure is reported as an invalid request.
    return { ok: false, error: errorShape(ErrorCodes.INVALID_REQUEST, formatForLog(err)) };
  }

  const idempotencyKey = `talk-${callId}-${randomUUID()}`;

  // Filled in by the respond callback handed to chat.send; intentionally left
  // without an initializer.
  let outcome: ChatOutcome | undefined;
  const captureOutcome = (ok: boolean, result?: unknown, error?: ErrorShape): void => {
    if (ok) {
      outcome = { ok: true, result };
      return;
    }
    outcome = {
      ok: false,
      error: error ?? errorShape(ErrorCodes.UNAVAILABLE, "chat.send failed without error"),
    };
  };

  await chatHandlers["chat.send"]({
    ...request,
    req: {
      type: "req",
      id: `${request.req.id}:talk-tool-call`,
      method: "chat.send",
    },
    params: {
      sessionKey,
      message,
      idempotencyKey,
    },
    respond: captureOutcome,
  } as never);

  if (!outcome) {
    // chat.send finished without ever calling respond.
    return {
      ok: false,
      error: errorShape(ErrorCodes.UNAVAILABLE, "chat.send did not return a realtime tool result"),
    };
  }
  if (!outcome.ok) {
    return { ok: false, error: outcome.error };
  }

  // Fall back to the idempotency key when the result carries no usable runId.
  const runId = normalizeOptionalString(asRecord(outcome.result)?.runId) ?? idempotencyKey;
  if (relaySessionId && connId) {
    registerTalkRealtimeRelayAgentRun({
      relaySessionId,
      connId,
      sessionKey,
      runId,
    });
  }
  return { ok: true, runId, idempotencyKey };
}
|
||||
|
||||
export const talkHandlers: GatewayRequestHandlers = {
|
||||
...talkSessionHandlers,
|
||||
// Responds with the Talk catalog built from the current runtime config.
// Missing params are treated as an empty object before validation.
"talk.catalog": async ({ params, respond, context }) => {
  const request = params ?? {};
  const valid = validateTalkCatalogParams(request);
  if (!valid) {
    const detail = formatValidationErrors(validateTalkCatalogParams.errors);
    const failure = errorShape(
      ErrorCodes.INVALID_REQUEST,
      `invalid talk.catalog params: ${detail}`,
    );
    respond(false, undefined, failure);
    return;
  }

  try {
    const runtimeConfig = context.getRuntimeConfig();
    respond(true, buildTalkCatalog(runtimeConfig), undefined);
  } catch (err) {
    // Any failure while building or sending the catalog is reported as UNAVAILABLE.
    const failure = errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err));
    respond(false, undefined, failure);
  }
},
|
||||
"talk.config": async ({ params, respond, client, context }) => {
|
||||
if (!validateTalkConfigParams(params)) {
|
||||
respond(
|
||||
@@ -492,6 +665,200 @@ export const talkHandlers: GatewayRequestHandlers = {
|
||||
|
||||
respond(true, { config: configPayload }, undefined);
|
||||
},
|
||||
// Creates a Talk handoff for the session resolved from params.sessionKey.
"talk.handoff.create": async ({ params, respond, client, context }) => {
  if (!validateTalkHandoffCreateParams(params)) {
    const detail = formatValidationErrors(validateTalkHandoffCreateParams.errors);
    const failure = errorShape(
      ErrorCodes.INVALID_REQUEST,
      `invalid talk.handoff.create params: ${detail}`,
    );
    respond(false, undefined, failure);
    return;
  }

  // The direct-tools brain is only allowed when canUseTalkDirectTools accepts the client.
  const wantsDirectTools = params.brain === "direct-tools";
  if (wantsDirectTools && !canUseTalkDirectTools(client)) {
    const failure = errorShape(
      ErrorCodes.INVALID_REQUEST,
      `talk.handoff.create brain="direct-tools" requires gateway scope: ${ADMIN_SCOPE}`,
    );
    respond(false, undefined, failure);
    return;
  }

  const cfg = context.getRuntimeConfig();
  const lookup = await resolveSessionKeyFromResolveParams({
    cfg,
    p: {
      key: params.sessionKey,
      includeGlobal: true,
      includeUnknown: true,
    },
  });
  if (!lookup.ok) {
    respond(false, undefined, lookup.error);
    return;
  }

  // The handoff is created with the resolved key in place of the requested one.
  const handoff = createTalkHandoff({ ...params, sessionKey: lookup.key });
  respond(true, handoff, undefined);
},
|
||||
// Joins an existing Talk handoff with the given id/token and broadcasts the
// resulting room events to the replaced client and to the joining client.
"talk.handoff.join": async ({ params, respond, client, context }) => {
  if (!validateTalkHandoffJoinParams(params)) {
    const detail = formatValidationErrors(validateTalkHandoffJoinParams.errors);
    const failure = errorShape(
      ErrorCodes.INVALID_REQUEST,
      `invalid talk.handoff.join params: ${detail}`,
    );
    respond(false, undefined, failure);
    return;
  }

  const joined = joinTalkHandoff(params.id, params.token, { clientId: client?.connId });
  if (!joined.ok) {
    // Only an invalid token is reported as the caller's fault.
    const code =
      joined.reason === "invalid_token" ? ErrorCodes.INVALID_REQUEST : ErrorCodes.UNAVAILABLE;
    respond(false, undefined, errorShape(code, `talk handoff join failed: ${joined.reason}`));
    return;
  }

  const { record } = joined;
  broadcastTalkRoomEvents(context, joined.replacedClientId, {
    handoffId: record.id,
    roomId: record.roomId,
    events: joined.replacementEvents,
  });
  broadcastTalkRoomEvents(context, client?.connId, {
    handoffId: record.id,
    roomId: record.roomId,
    events: joined.activeClientEvents,
  });
  respond(true, record, undefined);
},
|
||||
// Revokes a Talk handoff by id, broadcasts the resulting room events, and
// reports whether anything was revoked.
"talk.handoff.revoke": async ({ params, respond, context }) => {
  if (!validateTalkHandoffRevokeParams(params)) {
    const detail = formatValidationErrors(validateTalkHandoffRevokeParams.errors);
    const failure = errorShape(
      ErrorCodes.INVALID_REQUEST,
      `invalid talk.handoff.revoke params: ${detail}`,
    );
    respond(false, undefined, failure);
    return;
  }

  const outcome = revokeTalkHandoff(params.id);
  const { activeClientId, events, revoked } = outcome;
  broadcastTalkRoomEvents(context, activeClientId, {
    handoffId: params.id,
    // An absent room id is broadcast as an empty string.
    roomId: outcome.roomId ?? "",
    events,
  });
  respond(true, { ok: true, revoked }, undefined);
},
|
||||
// Starts a turn on a Talk handoff and broadcasts the resulting events to the
// room's active client.
"talk.handoff.turnStart": async ({ params, respond, client, context }) => {
  if (!validateTalkHandoffTurnStartParams(params)) {
    const detail = formatValidationErrors(validateTalkHandoffTurnStartParams.errors);
    const failure = errorShape(
      ErrorCodes.INVALID_REQUEST,
      `invalid talk.handoff.turnStart params: ${detail}`,
    );
    respond(false, undefined, failure);
    return;
  }

  const turnOptions = { turnId: params.turnId, clientId: client?.connId };
  const result = startTalkHandoffTurn(params.id, params.token, turnOptions);
  if (!result.ok) {
    const failure = errorShape(
      talkHandoffErrorCode(result.reason),
      `talk handoff turn start failed: ${result.reason}`,
    );
    respond(false, undefined, failure);
    return;
  }

  const { record, events } = result;
  broadcastTalkRoomEvents(context, record.room.activeClientId, {
    handoffId: record.id,
    roomId: record.roomId,
    events,
  });
  // The whole result object is the response payload.
  respond(true, result, undefined);
},
|
||||
// Ends a turn on a Talk handoff and broadcasts the resulting events to the
// room's active client.
"talk.handoff.turnEnd": async ({ params, respond, context }) => {
  if (!validateTalkHandoffTurnEndParams(params)) {
    const detail = formatValidationErrors(validateTalkHandoffTurnEndParams.errors);
    const failure = errorShape(
      ErrorCodes.INVALID_REQUEST,
      `invalid talk.handoff.turnEnd params: ${detail}`,
    );
    respond(false, undefined, failure);
    return;
  }

  const turnOptions = { turnId: params.turnId };
  const result = endTalkHandoffTurn(params.id, params.token, turnOptions);
  if (!result.ok) {
    const failure = errorShape(
      talkHandoffErrorCode(result.reason),
      `talk handoff turn end failed: ${result.reason}`,
    );
    respond(false, undefined, failure);
    return;
  }

  const { record, events } = result;
  broadcastTalkRoomEvents(context, record.room.activeClientId, {
    handoffId: record.id,
    roomId: record.roomId,
    events,
  });
  // The whole result object is the response payload.
  respond(true, result, undefined);
},
|
||||
// Cancels a turn on a Talk handoff (optionally with a reason) and broadcasts
// the resulting events to the room's active client.
"talk.handoff.turnCancel": async ({ params, respond, context }) => {
  if (!validateTalkHandoffTurnCancelParams(params)) {
    const detail = formatValidationErrors(validateTalkHandoffTurnCancelParams.errors);
    const failure = errorShape(
      ErrorCodes.INVALID_REQUEST,
      `invalid talk.handoff.turnCancel params: ${detail}`,
    );
    respond(false, undefined, failure);
    return;
  }

  const turnOptions = { turnId: params.turnId, reason: params.reason };
  const result = cancelTalkHandoffTurn(params.id, params.token, turnOptions);
  if (!result.ok) {
    const failure = errorShape(
      talkHandoffErrorCode(result.reason),
      `talk handoff turn cancel failed: ${result.reason}`,
    );
    respond(false, undefined, failure);
    return;
  }

  const { record, events } = result;
  broadcastTalkRoomEvents(context, record.room.activeClientId, {
    handoffId: record.id,
    roomId: record.roomId,
    events,
  });
  // The whole result object is the response payload.
  respond(true, result, undefined);
},
|
||||
"talk.realtime.session": async ({ params, respond, context, client }) => {
|
||||
if (!validateTalkRealtimeSessionParams(params)) {
|
||||
respond(
|
||||
@@ -508,10 +875,54 @@ export const talkHandlers: GatewayRequestHandlers = {
|
||||
provider?: string;
|
||||
model?: string;
|
||||
voice?: string;
|
||||
mode?: string;
|
||||
transport?: string;
|
||||
brain?: string;
|
||||
};
|
||||
try {
|
||||
const runtimeConfig = context.getRuntimeConfig();
|
||||
const realtimeConfig = buildTalkRealtimeConfig(runtimeConfig, typedParams.provider);
|
||||
const mode =
|
||||
normalizeOptionalLowercaseString(typedParams.mode) ?? realtimeConfig.mode ?? "realtime";
|
||||
if (mode !== "realtime") {
|
||||
respond(
|
||||
false,
|
||||
undefined,
|
||||
errorShape(
|
||||
ErrorCodes.INVALID_REQUEST,
|
||||
`talk.realtime.session only supports mode="realtime"; use talk.catalog for ${mode} provider discovery`,
|
||||
),
|
||||
);
|
||||
return;
|
||||
}
|
||||
const brain =
|
||||
normalizeOptionalLowercaseString(typedParams.brain) ??
|
||||
realtimeConfig.brain ??
|
||||
"agent-consult";
|
||||
if (brain !== "agent-consult") {
|
||||
respond(
|
||||
false,
|
||||
undefined,
|
||||
errorShape(
|
||||
ErrorCodes.INVALID_REQUEST,
|
||||
`talk.realtime.session only supports brain="agent-consult"`,
|
||||
),
|
||||
);
|
||||
return;
|
||||
}
|
||||
const transport =
|
||||
normalizeOptionalLowercaseString(typedParams.transport) ?? realtimeConfig.transport;
|
||||
if (transport === "managed-room") {
|
||||
respond(
|
||||
false,
|
||||
undefined,
|
||||
errorShape(
|
||||
ErrorCodes.UNAVAILABLE,
|
||||
"managed-room realtime Talk sessions are not available in the browser UI yet",
|
||||
),
|
||||
);
|
||||
return;
|
||||
}
|
||||
const resolution = resolveConfiguredRealtimeVoiceProvider({
|
||||
configuredProviderId: realtimeConfig.provider,
|
||||
providerConfigs: realtimeConfig.providers,
|
||||
@@ -519,18 +930,32 @@ export const talkHandlers: GatewayRequestHandlers = {
|
||||
cfgForResolve: runtimeConfig,
|
||||
noRegisteredProviderMessage: "No realtime voice provider registered",
|
||||
});
|
||||
if (resolution.provider.createBrowserSession) {
|
||||
if (resolution.provider.createBrowserSession && transport !== "gateway-relay") {
|
||||
const session = await resolution.provider.createBrowserSession({
|
||||
providerConfig: resolution.providerConfig,
|
||||
instructions: buildRealtimeInstructions(),
|
||||
tools: [REALTIME_VOICE_AGENT_CONSULT_TOOL],
|
||||
model: normalizeOptionalString(typedParams.model),
|
||||
voice: normalizeOptionalString(typedParams.voice),
|
||||
model: normalizeOptionalString(typedParams.model) ?? realtimeConfig.model,
|
||||
voice: normalizeOptionalString(typedParams.voice) ?? realtimeConfig.voice,
|
||||
});
|
||||
if (!isUnsupportedBrowserWebRtcSession(session)) {
|
||||
if (
|
||||
!isUnsupportedBrowserWebRtcSession(session) &&
|
||||
(!transport || session.transport === transport)
|
||||
) {
|
||||
respond(true, session, undefined);
|
||||
return;
|
||||
}
|
||||
if (transport) {
|
||||
respond(
|
||||
false,
|
||||
undefined,
|
||||
errorShape(
|
||||
ErrorCodes.UNAVAILABLE,
|
||||
`Realtime provider "${resolution.provider.id}" does not support requested browser transport "${transport}"`,
|
||||
),
|
||||
);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
const connId = client?.connId;
|
||||
@@ -542,8 +967,8 @@ export const talkHandlers: GatewayRequestHandlers = {
|
||||
);
|
||||
return;
|
||||
}
|
||||
const model = normalizeOptionalString(typedParams.model);
|
||||
const voice = normalizeOptionalString(typedParams.voice);
|
||||
const model = normalizeOptionalString(typedParams.model) ?? realtimeConfig.model;
|
||||
const voice = normalizeOptionalString(typedParams.voice) ?? realtimeConfig.voice;
|
||||
const session = createTalkRealtimeRelaySession({
|
||||
context,
|
||||
connId,
|
||||
@@ -559,6 +984,49 @@ export const talkHandlers: GatewayRequestHandlers = {
|
||||
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
|
||||
}
|
||||
},
|
||||
// Handles a realtime tool call. Only the agent-consult tool is accepted; the
// call is forwarded through startRealtimeToolCallAgentConsult.
"talk.realtime.toolCall": async (request) => {
  const { params, respond } = request;
  if (!validateTalkRealtimeToolCallParams(params)) {
    const detail = formatValidationErrors(validateTalkRealtimeToolCallParams.errors);
    const failure = errorShape(
      ErrorCodes.INVALID_REQUEST,
      `invalid talk.realtime.toolCall params: ${detail}`,
    );
    respond(false, undefined, failure);
    return;
  }

  const isConsultTool = params.name === REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME;
  if (!isConsultTool) {
    const failure = errorShape(
      ErrorCodes.INVALID_REQUEST,
      `unsupported realtime Talk tool: ${params.name}`,
    );
    respond(false, undefined, failure);
    return;
  }

  const relaySessionId = normalizeOptionalString(params.relaySessionId);
  const connId = normalizeOptionalString(request.client?.connId);
  const consult = await startRealtimeToolCallAgentConsult({
    sessionKey: params.sessionKey,
    callId: params.callId,
    // Missing args are forwarded as an empty object.
    args: params.args ?? {},
    relaySessionId,
    connId,
    request,
  });
  if (!consult.ok) {
    respond(false, undefined, consult.error);
    return;
  }

  const { runId, idempotencyKey } = consult;
  respond(true, { runId, idempotencyKey }, undefined);
},
|
||||
"talk.realtime.relayAudio": async ({ params, respond, client }) => {
|
||||
if (!validateTalkRealtimeRelayAudioParams(params)) {
|
||||
respond(
|
||||
@@ -612,6 +1080,34 @@ export const talkHandlers: GatewayRequestHandlers = {
|
||||
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
|
||||
}
|
||||
},
|
||||
// Cancels the current turn of a realtime relay session for the calling
// connection.
"talk.realtime.relayCancel": async ({ params, respond, client }) => {
  if (!validateTalkRealtimeRelayCancelParams(params)) {
    const detail = formatValidationErrors(validateTalkRealtimeRelayCancelParams.errors);
    const failure = errorShape(
      ErrorCodes.INVALID_REQUEST,
      `invalid talk.realtime.relayCancel params: ${detail}`,
    );
    respond(false, undefined, failure);
    return;
  }

  const connId = client?.connId;
  if (!connId) {
    // Without a connection id the relay cannot be addressed.
    const failure = errorShape(ErrorCodes.UNAVAILABLE, "realtime relay unavailable");
    respond(false, undefined, failure);
    return;
  }

  try {
    const reason = normalizeOptionalString(params.reason);
    cancelTalkRealtimeRelayTurn({
      relaySessionId: params.relaySessionId,
      connId,
      reason,
    });
    respond(true, { ok: true }, undefined);
  } catch (err) {
    const failure = errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err));
    respond(false, undefined, failure);
  }
},
|
||||
"talk.realtime.relayStop": async ({ params, respond, client }) => {
|
||||
if (!validateTalkRealtimeRelayStopParams(params)) {
|
||||
respond(
|
||||
@@ -665,6 +1161,141 @@ export const talkHandlers: GatewayRequestHandlers = {
|
||||
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
|
||||
}
|
||||
},
|
||||
// Opens a transcription relay session for the calling connection, using the
// first realtime transcription provider that resolves as configured.
"talk.transcription.session": async ({ params, respond, context, client }) => {
  if (!validateTalkTranscriptionSessionParams(params)) {
    const detail = formatValidationErrors(validateTalkTranscriptionSessionParams.errors);
    const failure = errorShape(
      ErrorCodes.INVALID_REQUEST,
      `invalid talk.transcription.session params: ${detail}`,
    );
    respond(false, undefined, failure);
    return;
  }

  const connId = client?.connId;
  if (!connId) {
    const failure = errorShape(
      ErrorCodes.UNAVAILABLE,
      "transcription relay requires a connected client",
    );
    respond(false, undefined, failure);
    return;
  }

  try {
    const runtimeConfig = context.getRuntimeConfig();
    const transcriptionConfig = buildTalkTranscriptionConfig(runtimeConfig, params.provider);
    const { provider, providerConfig } = resolveConfiguredRealtimeTranscriptionProvider({
      config: runtimeConfig,
      configuredProviderId: transcriptionConfig.provider,
      providerConfigs: transcriptionConfig.providers,
    });
    const session = createTalkTranscriptionRelaySession({
      context,
      connId,
      provider,
      providerConfig,
    });
    respond(true, session, undefined);
  } catch (err) {
    // Resolution or session-creation failures are reported as UNAVAILABLE.
    const failure = errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err));
    respond(false, undefined, failure);
  }
},
|
||||
// Forwards one base64 audio chunk from the caller to its transcription relay session.
// Responds INVALID_REQUEST on bad params and UNAVAILABLE when the caller has no
// connection id or the relay call throws.
"talk.transcription.relayAudio": async ({ params, respond, client }) => {
  if (!validateTalkTranscriptionRelayAudioParams(params)) {
    const problems = formatValidationErrors(validateTalkTranscriptionRelayAudioParams.errors);
    respond(
      false,
      undefined,
      errorShape(
        ErrorCodes.INVALID_REQUEST,
        `invalid talk.transcription.relayAudio params: ${problems}`,
      ),
    );
    return;
  }

  const ownerConnId = client?.connId;
  if (!ownerConnId) {
    const unavailable = errorShape(ErrorCodes.UNAVAILABLE, "transcription relay unavailable");
    respond(false, undefined, unavailable);
    return;
  }

  try {
    const { transcriptionSessionId, audioBase64 } = params;
    sendTalkTranscriptionRelayAudio({
      transcriptionSessionId,
      connId: ownerConnId,
      audioBase64,
    });
    respond(true, { ok: true }, undefined);
  } catch (err) {
    respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
  }
},
|
||||
// Cancels the current turn of the caller's transcription relay session, passing along an
// optional normalized reason. Responds INVALID_REQUEST on bad params and UNAVAILABLE when
// the caller has no connection id or the relay call throws.
"talk.transcription.relayCancel": async ({ params, respond, client }) => {
  if (!validateTalkTranscriptionRelayCancelParams(params)) {
    const problems = formatValidationErrors(validateTalkTranscriptionRelayCancelParams.errors);
    respond(
      false,
      undefined,
      errorShape(
        ErrorCodes.INVALID_REQUEST,
        `invalid talk.transcription.relayCancel params: ${problems}`,
      ),
    );
    return;
  }

  const ownerConnId = client?.connId;
  if (!ownerConnId) {
    const unavailable = errorShape(ErrorCodes.UNAVAILABLE, "transcription relay unavailable");
    respond(false, undefined, unavailable);
    return;
  }

  try {
    const cancelRequest = {
      transcriptionSessionId: params.transcriptionSessionId,
      connId: ownerConnId,
      reason: normalizeOptionalString(params.reason),
    };
    cancelTalkTranscriptionRelayTurn(cancelRequest);
    respond(true, { ok: true }, undefined);
  } catch (err) {
    respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
  }
},
|
||||
// Stops the caller's transcription relay session.
// Responds INVALID_REQUEST on bad params and UNAVAILABLE when the caller has no
// connection id or the stop call throws.
"talk.transcription.relayStop": async ({ params, respond, client }) => {
  if (!validateTalkTranscriptionRelayStopParams(params)) {
    const problems = formatValidationErrors(validateTalkTranscriptionRelayStopParams.errors);
    respond(
      false,
      undefined,
      errorShape(
        ErrorCodes.INVALID_REQUEST,
        `invalid talk.transcription.relayStop params: ${problems}`,
      ),
    );
    return;
  }

  const ownerConnId = client?.connId;
  if (!ownerConnId) {
    const unavailable = errorShape(ErrorCodes.UNAVAILABLE, "transcription relay unavailable");
    respond(false, undefined, unavailable);
    return;
  }

  try {
    const { transcriptionSessionId } = params;
    stopTalkTranscriptionRelaySession({ transcriptionSessionId, connId: ownerConnId });
    respond(true, { ok: true }, undefined);
  } catch (err) {
    respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
  }
},
|
||||
"talk.speak": async ({ params, respond, context }) => {
|
||||
if (!validateTalkSpeakParams(params)) {
|
||||
respond(
|
||||
@@ -763,11 +1394,11 @@ export const talkHandlers: GatewayRequestHandlers = {
|
||||
}
|
||||
},
|
||||
"talk.mode": ({ params, respond, context, client, isWebchatConnect }) => {
|
||||
if (client && isWebchatConnect(client.connect) && !context.hasConnectedMobileNode()) {
|
||||
if (client && isWebchatConnect(client.connect) && !context.hasConnectedTalkNode()) {
|
||||
respond(
|
||||
false,
|
||||
undefined,
|
||||
errorShape(ErrorCodes.UNAVAILABLE, "talk disabled: no connected iOS/Android nodes"),
|
||||
errorShape(ErrorCodes.UNAVAILABLE, "talk disabled: no connected Talk-capable nodes"),
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1,12 +0,0 @@
|
||||
import { normalizeOptionalLowercaseString } from "../shared/string-coerce.js";
|
||||
import type { NodeRegistry } from "./node-registry.js";
|
||||
|
||||
/**
 * Reports whether any connected node in the registry has a platform string that,
 * after lowercase normalization, starts with "ios", "ipados", or "android".
 *
 * @param registry - Node registry whose connected nodes are inspected.
 * @returns `true` as soon as one matching node is found, otherwise `false`.
 */
export function hasConnectedMobileNode(registry: NodeRegistry): boolean {
  const mobilePrefixes = ["ios", "ipados", "android"];
  for (const node of registry.listConnected()) {
    // A missing platform normalizes to "" and therefore matches no prefix.
    const platform = normalizeOptionalLowercaseString(node.platform) ?? "";
    if (mobilePrefixes.some((prefix) => platform.startsWith(prefix))) {
      return true;
    }
  }
  return false;
}
|
||||
@@ -4,8 +4,8 @@ import {
|
||||
createSessionMessageSubscriberRegistry,
|
||||
} from "./server-chat-state.js";
|
||||
import { safeParseJson } from "./server-json.js";
|
||||
import { hasConnectedMobileNode } from "./server-mobile-nodes.js";
|
||||
import { createNodeSubscriptionManager } from "./server-node-subscriptions.js";
|
||||
import { hasConnectedTalkNode } from "./server-talk-nodes.js";
|
||||
|
||||
export function createGatewayNodeSessionRuntime(params: {
|
||||
broadcast: (event: string, payload: unknown, opts?: { dropIfSlow?: boolean }) => void;
|
||||
@@ -26,7 +26,7 @@ export function createGatewayNodeSessionRuntime(params: {
|
||||
const broadcastVoiceWakeChanged = (triggers: string[]) => {
|
||||
params.broadcast("voicewake.changed", { triggers }, { dropIfSlow: true });
|
||||
};
|
||||
const hasMobileNodeConnected = () => hasConnectedMobileNode(nodeRegistry);
|
||||
const hasTalkNodeConnected = () => hasConnectedTalkNode(nodeRegistry);
|
||||
|
||||
return {
|
||||
nodeRegistry,
|
||||
@@ -39,6 +39,6 @@ export function createGatewayNodeSessionRuntime(params: {
|
||||
nodeUnsubscribe: nodeSubscriptions.unsubscribe,
|
||||
nodeUnsubscribeAll: nodeSubscriptions.unsubscribeAll,
|
||||
broadcastVoiceWakeChanged,
|
||||
hasMobileNodeConnected,
|
||||
hasTalkNodeConnected,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -33,7 +33,7 @@ describe("createGatewayRequestContext", () => {
|
||||
nodeSubscribe: vi.fn(),
|
||||
nodeUnsubscribe: vi.fn(),
|
||||
nodeUnsubscribeAll: vi.fn(),
|
||||
hasConnectedMobileNode: vi.fn(() => false),
|
||||
hasConnectedTalkNode: vi.fn(() => false),
|
||||
clients: new Set(),
|
||||
enforceSharedGatewayAuthGenerationForConfigWrite: vi.fn(),
|
||||
nodeRegistry: {} as never,
|
||||
|
||||
@@ -28,7 +28,7 @@ type GatewayRequestContextParams = {
|
||||
nodeSubscribe: GatewayRequestContext["nodeSubscribe"];
|
||||
nodeUnsubscribe: GatewayRequestContext["nodeUnsubscribe"];
|
||||
nodeUnsubscribeAll: GatewayRequestContext["nodeUnsubscribeAll"];
|
||||
hasConnectedMobileNode: GatewayRequestContext["hasConnectedMobileNode"];
|
||||
hasConnectedTalkNode: GatewayRequestContext["hasConnectedTalkNode"];
|
||||
clients: Set<GatewayRequestContextClient>;
|
||||
enforceSharedGatewayAuthGenerationForConfigWrite: (nextConfig: OpenClawConfig) => void;
|
||||
nodeRegistry: GatewayRequestContext["nodeRegistry"];
|
||||
@@ -92,7 +92,7 @@ export function createGatewayRequestContext(
|
||||
nodeSubscribe: params.nodeSubscribe,
|
||||
nodeUnsubscribe: params.nodeUnsubscribe,
|
||||
nodeUnsubscribeAll: params.nodeUnsubscribeAll,
|
||||
hasConnectedMobileNode: params.hasConnectedMobileNode,
|
||||
hasConnectedTalkNode: params.hasConnectedTalkNode,
|
||||
hasExecApprovalClients: (excludeConnId?: string) => {
|
||||
for (const gatewayClient of params.clients) {
|
||||
if (excludeConnId && gatewayClient.connId === excludeConnId) {
|
||||
|
||||
@@ -884,7 +884,7 @@ export async function startGatewayServer(
|
||||
nodeUnsubscribe,
|
||||
nodeUnsubscribeAll,
|
||||
broadcastVoiceWakeChanged,
|
||||
hasMobileNodeConnected,
|
||||
hasTalkNodeConnected,
|
||||
} = createGatewayNodeSessionRuntime({ broadcast });
|
||||
applyGatewayLaneConcurrency(cfgAtStart);
|
||||
|
||||
@@ -1261,7 +1261,7 @@ export async function startGatewayServer(
|
||||
nodeSubscribe,
|
||||
nodeUnsubscribe,
|
||||
nodeUnsubscribeAll,
|
||||
hasConnectedMobileNode: hasMobileNodeConnected,
|
||||
hasConnectedTalkNode: hasTalkNodeConnected,
|
||||
clients,
|
||||
enforceSharedGatewayAuthGenerationForConfigWrite: (nextConfig: OpenClawConfig) => {
|
||||
enforceSharedGatewaySessionGenerationForConfigWrite({
|
||||
|
||||
286
src/gateway/talk-handoff.test.ts
Normal file
286
src/gateway/talk-handoff.test.ts
Normal file
@@ -0,0 +1,286 @@
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
import {
|
||||
cancelTalkHandoffTurn,
|
||||
clearTalkHandoffsForTest,
|
||||
createTalkHandoff,
|
||||
endTalkHandoffTurn,
|
||||
getTalkHandoff,
|
||||
joinTalkHandoff,
|
||||
revokeTalkHandoff,
|
||||
startTalkHandoffTurn,
|
||||
verifyTalkHandoffToken,
|
||||
} from "./talk-handoff.js";
|
||||
|
||||
describe("talk handoff store", () => {
|
||||
// Verifies the created handoff's public shape, that only a hash of the token is stored,
// and that the record disappears once its TTL has elapsed.
it("creates an expiring managed-room handoff without storing the plaintext token", () => {
  vi.useFakeTimers();
  try {
    vi.setSystemTime(new Date("2026-05-05T12:00:00.000Z"));
    clearTalkHandoffsForTest();

    const handoff = createTalkHandoff({
      sessionKey: "session:main",
      sessionId: "session-id",
      channel: "discord",
      target: "dm:123",
      provider: "openai",
      model: "gpt-realtime-1.5",
      voice: "alloy",
      ttlMs: 5000,
    });
    const record = getTalkHandoff(handoff.id);

    expect(handoff).toMatchObject({
      roomId: `talk_${handoff.id}`,
      roomUrl: `/talk/rooms/talk_${handoff.id}`,
      sessionKey: "session:main",
      sessionId: "session-id",
      channel: "discord",
      target: "dm:123",
      provider: "openai",
      model: "gpt-realtime-1.5",
      voice: "alloy",
      mode: "stt-tts",
      transport: "managed-room",
      brain: "agent-consult",
      createdAt: Date.parse("2026-05-05T12:00:00.000Z"),
      expiresAt: Date.parse("2026-05-05T12:00:05.000Z"),
      room: {
        activeClientId: undefined,
        recentTalkEvents: [
          expect.objectContaining({
            type: "session.started",
            sessionId: `talk_${handoff.id}`,
            transport: "managed-room",
          }),
        ],
      },
    });
    expect(handoff).not.toHaveProperty("tokenHash");
    expect(record?.tokenHash).toBeTruthy();
    expect(record?.tokenHash).not.toBe(handoff.token);
    expect(record && verifyTalkHandoffToken(record, handoff.token)).toBe(true);

    vi.advanceTimersByTime(5001);
    expect(getTalkHandoff(handoff.id)).toBeUndefined();
  } finally {
    // Restore real timers even when an assertion above throws, so fake timers
    // cannot leak into the tests that run after this one.
    vi.useRealTimers();
  }
});
|
||||
|
||||
// A wrong token is rejected, the issued token joins, and a revoked handoff is no longer found.
it("joins and revokes handoffs with only the bearer token", () => {
  clearTalkHandoffsForTest();
  const { id, roomId, token } = createTalkHandoff({ sessionKey: "session:main" });

  const wrongTokenJoin = joinTalkHandoff(id, "wrong");
  expect(wrongTokenJoin).toEqual({ ok: false, reason: "invalid_token" });

  const validJoin = joinTalkHandoff(id, token);
  expect(validJoin).toMatchObject({
    ok: true,
    events: [expect.objectContaining({ type: "session.ready" })],
    record: expect.objectContaining({ id, roomId, sessionKey: "session:main" }),
  });

  const revocation = revokeTalkHandoff(id);
  expect(revocation).toMatchObject({ revoked: true });

  const joinAfterRevoke = joinTalkHandoff(id, token);
  expect(joinAfterRevoke).toEqual({ ok: false, reason: "not_found" });
});
|
||||
|
||||
// First join emits session.ready; a join from a different client emits session.replaced
// followed by session.ready; revocation emits a final session.closed.
it("records managed-room ready, replacement, and close lifecycle events", () => {
  clearTalkHandoffsForTest();
  const { id, roomId, token } = createTalkHandoff({ sessionKey: "session:main" });

  const readyFor = (clientId: string) =>
    expect.objectContaining({
      type: "session.ready",
      sessionId: roomId,
      payload: expect.objectContaining({ clientId }),
    });
  const roomOwnedBy = (activeClientId: string) => ({
    room: expect.objectContaining({ activeClientId }),
  });

  const firstJoin = joinTalkHandoff(id, token, { clientId: "conn-1" });
  expect(firstJoin).toMatchObject({
    ok: true,
    events: [readyFor("conn-1")],
    record: roomOwnedBy("conn-1"),
  });

  const secondJoin = joinTalkHandoff(id, token, { clientId: "conn-2" });
  expect(secondJoin).toMatchObject({
    ok: true,
    events: [
      expect.objectContaining({
        type: "session.replaced",
        sessionId: roomId,
        payload: expect.objectContaining({
          previousClientId: "conn-1",
          nextClientId: "conn-2",
        }),
      }),
      readyFor("conn-2"),
    ],
    record: roomOwnedBy("conn-2"),
  });

  const revocation = revokeTalkHandoff(id);
  expect(revocation).toMatchObject({
    revoked: true,
    activeClientId: "conn-2",
    events: [
      expect.objectContaining({
        type: "session.closed",
        sessionId: roomId,
        payload: expect.objectContaining({ reason: "revoked" }),
        final: true,
      }),
    ],
  });
});
|
||||
|
||||
// Covers a started turn, ending it, cancelling with no active turn, and cancelling a second turn.
it("records managed-room turn start, end, and cancellation events", () => {
  clearTalkHandoffsForTest();
  const { id, token } = createTalkHandoff({ sessionKey: "session:main" });
  joinTalkHandoff(id, token, { clientId: "conn-1" });

  const started = startTalkHandoffTurn(id, token, { clientId: "conn-1", turnId: "turn-1" });
  expect(started).toMatchObject({
    ok: true,
    turnId: "turn-1",
    events: [expect.objectContaining({ type: "turn.started", turnId: "turn-1" })],
    record: {
      room: expect.objectContaining({
        activeClientId: "conn-1",
        activeTurnId: "turn-1",
      }),
    },
  });

  const ended = endTalkHandoffTurn(id, token);
  expect(ended).toMatchObject({
    ok: true,
    turnId: "turn-1",
    events: [
      expect.objectContaining({
        type: "turn.ended",
        turnId: "turn-1",
        final: true,
      }),
    ],
    record: {
      room: expect.not.objectContaining({
        activeTurnId: expect.any(String),
      }),
    },
  });

  // Nothing is active after the turn ended, so a cancel must be refused.
  const cancelWithoutTurn = cancelTalkHandoffTurn(id, token);
  expect(cancelWithoutTurn).toEqual({ ok: false, reason: "no_active_turn" });

  startTalkHandoffTurn(id, token, { turnId: "turn-2" });
  const cancelled = cancelTalkHandoffTurn(id, token, { reason: "barge-in" });
  expect(cancelled).toMatchObject({
    ok: true,
    turnId: "turn-2",
    events: [
      expect.objectContaining({
        type: "turn.cancelled",
        turnId: "turn-2",
        final: true,
        payload: expect.objectContaining({ reason: "barge-in" }),
      }),
    ],
  });
});
|
||||
|
||||
// Ending or cancelling a superseded turn id is refused and leaves the current turn active.
it("rejects stale managed-room turn completion without clearing the active turn", () => {
  clearTalkHandoffsForTest();
  const { id, token } = createTalkHandoff({ sessionKey: "session:main" });
  const staleTurn = { ok: false, reason: "stale_turn" };

  startTalkHandoffTurn(id, token, { turnId: "turn-old" });
  startTalkHandoffTurn(id, token, { turnId: "turn-current" });

  const staleEnd = endTalkHandoffTurn(id, token, { turnId: "turn-old" });
  expect(staleEnd).toEqual(staleTurn);
  expect(getTalkHandoff(id)?.room.talk.activeTurnId).toBe("turn-current");

  const staleCancel = cancelTalkHandoffTurn(id, token, { turnId: "turn-old" });
  expect(staleCancel).toEqual(staleTurn);
  expect(getTalkHandoff(id)?.room.talk.activeTurnId).toBe("turn-current");

  const currentEnd = endTalkHandoffTurn(id, token, { turnId: "turn-current" });
  expect(currentEnd).toMatchObject({ ok: true, turnId: "turn-current" });
});
|
||||
|
||||
// Two handoffs sharing a channel/target get distinct ids, rooms, and tokens, and each
// token only opens its own handoff.
it("isolates simultaneous handoffs for different sessions on the same host", () => {
  clearTalkHandoffsForTest();

  const sharedRoute = { channel: "browser", target: "host:local" };
  const first = createTalkHandoff({
    sessionKey: "agent:main:first",
    ...sharedRoute,
    provider: "openai",
  });
  const second = createTalkHandoff({
    sessionKey: "agent:main:second",
    ...sharedRoute,
  });
  const invalidToken = { ok: false, reason: "invalid_token" };
  const readyEvents = [expect.objectContaining({ type: "session.ready" })];

  expect(first.id).not.toBe(second.id);
  expect(first.roomId).not.toBe(second.roomId);
  expect(first.token).not.toBe(second.token);

  const firstWithSecondToken = joinTalkHandoff(first.id, second.token);
  expect(firstWithSecondToken).toEqual(invalidToken);
  const secondWithFirstToken = joinTalkHandoff(second.id, first.token);
  expect(secondWithFirstToken).toEqual(invalidToken);

  const firstJoin = joinTalkHandoff(first.id, first.token);
  expect(firstJoin).toMatchObject({
    ok: true,
    events: readyEvents,
    record: expect.objectContaining({
      roomId: first.roomId,
      sessionKey: "agent:main:first",
      channel: "browser",
      target: "host:local",
      provider: "openai",
    }),
  });

  const secondJoin = joinTalkHandoff(second.id, second.token);
  expect(secondJoin).toMatchObject({
    ok: true,
    events: readyEvents,
    record: expect.objectContaining({
      roomId: second.roomId,
      sessionKey: "agent:main:second",
      channel: "browser",
      target: "host:local",
    }),
  });
});
|
||||
});
|
||||
389
src/gateway/talk-handoff.ts
Normal file
389
src/gateway/talk-handoff.ts
Normal file
@@ -0,0 +1,389 @@
|
||||
import { createHash, randomBytes, randomUUID, timingSafeEqual } from "node:crypto";
|
||||
import {
|
||||
createTalkSessionController,
|
||||
type TalkBrain,
|
||||
type TalkEvent,
|
||||
type TalkEventInput,
|
||||
type TalkMode,
|
||||
type TalkSessionController,
|
||||
type TalkTransport,
|
||||
} from "../realtime-voice/talk-session-controller.js";
|
||||
|
||||
const DEFAULT_TALK_HANDOFF_TTL_MS = 10 * 60 * 1000;
|
||||
const MAX_TALK_HANDOFF_TTL_MS = 60 * 60 * 1000;
|
||||
|
||||
export type TalkHandoffCreateParams = {
|
||||
sessionKey: string;
|
||||
sessionId?: string;
|
||||
channel?: string;
|
||||
target?: string;
|
||||
provider?: string;
|
||||
model?: string;
|
||||
voice?: string;
|
||||
mode?: TalkMode;
|
||||
transport?: TalkTransport;
|
||||
brain?: TalkBrain;
|
||||
ttlMs?: number;
|
||||
};
|
||||
|
||||
export type TalkHandoffRecord = {
|
||||
id: string;
|
||||
roomId: string;
|
||||
roomUrl: string;
|
||||
tokenHash: string;
|
||||
sessionKey: string;
|
||||
sessionId?: string;
|
||||
channel?: string;
|
||||
target?: string;
|
||||
provider?: string;
|
||||
model?: string;
|
||||
voice?: string;
|
||||
mode: TalkMode;
|
||||
transport: TalkTransport;
|
||||
brain: TalkBrain;
|
||||
createdAt: number;
|
||||
expiresAt: number;
|
||||
room: TalkHandoffRoomState;
|
||||
};
|
||||
|
||||
export type TalkHandoffPublicRecord = Omit<TalkHandoffRecord, "tokenHash" | "room"> & {
|
||||
room: {
|
||||
activeClientId?: string;
|
||||
activeTurnId?: string;
|
||||
recentTalkEvents: TalkEvent[];
|
||||
};
|
||||
};
|
||||
|
||||
export type TalkHandoffCreateResult = TalkHandoffPublicRecord & {
|
||||
token: string;
|
||||
};
|
||||
|
||||
export type TalkHandoffJoinResult =
|
||||
| {
|
||||
ok: true;
|
||||
record: TalkHandoffPublicRecord;
|
||||
events: TalkEvent[];
|
||||
replacedClientId?: string;
|
||||
replacementEvents: TalkEvent[];
|
||||
activeClientEvents: TalkEvent[];
|
||||
}
|
||||
| { ok: false; reason: "not_found" | "expired" | "invalid_token" };
|
||||
|
||||
export type TalkHandoffRevokeResult = {
|
||||
revoked: boolean;
|
||||
roomId?: string;
|
||||
activeClientId?: string;
|
||||
events: TalkEvent[];
|
||||
};
|
||||
|
||||
export type TalkHandoffTurnResult =
|
||||
| {
|
||||
ok: true;
|
||||
record: TalkHandoffPublicRecord;
|
||||
turnId: string;
|
||||
events: TalkEvent[];
|
||||
}
|
||||
| {
|
||||
ok: false;
|
||||
reason: "not_found" | "expired" | "invalid_token" | "no_active_turn" | "stale_turn";
|
||||
};
|
||||
|
||||
type TalkHandoffRoomState = {
|
||||
activeClientId?: string;
|
||||
talk: TalkSessionController;
|
||||
};
|
||||
|
||||
const handoffs = new Map<string, TalkHandoffRecord>();
|
||||
|
||||
/**
 * Creates and stores a new talk handoff.
 *
 * Expired handoffs are pruned first. The handoff gets a random UUID id, a room id of the
 * form `talk_<id>`, and a random 32-byte base64url bearer token; only the token's hash is
 * kept on the stored record. A `session.started` event is emitted on the room before the
 * record is stored.
 *
 * @param params - Session routing fields plus optional mode/transport/brain (defaulting to
 *   "stt-tts" / "managed-room" / "agent-consult") and an optional TTL in milliseconds.
 * @returns The public view of the new record together with the plaintext token.
 */
export function createTalkHandoff(params: TalkHandoffCreateParams): TalkHandoffCreateResult {
  pruneExpiredTalkHandoffs();

  const createdAt = Date.now();
  const expiresAt = createdAt + normalizeTtlMs(params.ttlMs);
  const id = randomUUID();
  const roomId = `talk_${id}`;
  const token = randomBytes(32).toString("base64url");

  // Resolve the defaults once so the room controller and the record always agree.
  const mode = params.mode ?? "stt-tts";
  const transport = params.transport ?? "managed-room";
  const brain = params.brain ?? "agent-consult";
  const { sessionKey, sessionId, channel, target, provider, model, voice } = params;

  const room = createTalkHandoffRoom({ roomId, mode, transport, brain, provider });
  const record: TalkHandoffRecord = {
    id,
    roomId,
    roomUrl: `/talk/rooms/${roomId}`,
    tokenHash: hashTalkHandoffToken(token),
    sessionKey,
    sessionId,
    channel,
    target,
    provider,
    model,
    voice,
    mode,
    transport,
    brain,
    createdAt,
    expiresAt,
    room,
  };

  appendTalkHandoffRoomEvent(record, {
    type: "session.started",
    payload: { handoffId: id, roomId },
  });
  handoffs.set(id, record);

  const publicRecord = toPublicTalkHandoffRecord(record);
  return { ...publicRecord, token };
}
|
||||
|
||||
/**
 * Looks up a stored handoff by id after pruning expired handoffs.
 *
 * @param id - Handoff id.
 * @returns The stored record (including its token hash), or `undefined` when absent.
 */
export function getTalkHandoff(id: string): TalkHandoffRecord | undefined {
  pruneExpiredTalkHandoffs();
  const liveRecord = handoffs.get(id);
  return liveRecord;
}
|
||||
|
||||
/**
 * Joins a handoff room using its bearer token.
 *
 * @param id - Handoff id.
 * @param token - Plaintext bearer token issued at creation.
 * @param opts - Optional `clientId` of the joining client.
 * @returns On failure, the access error (`not_found`, `expired`, or `invalid_token`).
 *   On success, the public record, all events emitted by the join, the id of the client
 *   that was displaced (if any), and the events split into `replacementEvents`
 *   (`session.replaced`) and `activeClientEvents` (everything else).
 */
export function joinTalkHandoff(
  id: string,
  token: string,
  opts: { clientId?: string } = {},
): TalkHandoffJoinResult {
  const access = resolveTalkHandoffAccess(id, token);
  if (!access.ok) {
    return access;
  }

  const { record } = access;
  const { clientId } = opts;
  // Capture the owner before the join overwrites it.
  const priorClientId = record.room.activeClientId;
  const events = joinTalkHandoffRoom(record, clientId);

  let replacedClientId: string | undefined;
  if (priorClientId && priorClientId !== clientId) {
    replacedClientId = priorClientId;
  }

  let replacementEvents: TalkEvent[] = [];
  let activeClientEvents: TalkEvent[] = events;
  if (replacedClientId) {
    replacementEvents = events.filter((event) => event.type === "session.replaced");
    activeClientEvents = events.filter((event) => event.type !== "session.replaced");
  }

  return {
    ok: true,
    record: toPublicTalkHandoffRecord(record),
    events,
    replacedClientId,
    replacementEvents,
    activeClientEvents,
  };
}
|
||||
|
||||
/**
 * Starts a turn in a handoff room.
 *
 * When `opts.clientId` is given it becomes the room's active client. The turn id is the
 * trimmed `opts.turnId`, or a random UUID when none (or only whitespace) is supplied.
 *
 * @param id - Handoff id.
 * @param token - Plaintext bearer token.
 * @param opts - Optional turn id and client id.
 * @returns The access error on failure; otherwise the public record, the turn id, and the
 *   event reported by the room controller (an empty list when it reports none).
 */
export function startTalkHandoffTurn(
  id: string,
  token: string,
  opts: { turnId?: string; clientId?: string } = {},
): TalkHandoffTurnResult {
  const access = resolveTalkHandoffAccess(id, token);
  if (!access.ok) {
    return access;
  }

  const { record } = access;
  const { room, roomId } = record;
  if (opts.clientId) {
    room.activeClientId = opts.clientId;
  }

  const turnId = normalizeOptionalString(opts.turnId) ?? randomUUID();
  const started = room.talk.startTurn({
    turnId,
    payload: { handoffId: id, roomId, clientId: room.activeClientId },
  });

  const events: TalkEvent[] = [];
  if (started.event) {
    events.push(started.event);
  }
  return { ok: true, record: toPublicTalkHandoffRecord(record), turnId, events };
}
|
||||
|
||||
/**
 * Ends a turn in a handoff room.
 *
 * @param id - Handoff id.
 * @param token - Plaintext bearer token.
 * @param opts - Optional `turnId`; it is trimmed, and a blank value is passed on as `undefined`.
 * @returns The access error or the room controller's failure result when the turn cannot
 *   be ended; otherwise the public record, the ended turn id, and the emitted event.
 */
export function endTalkHandoffTurn(
  id: string,
  token: string,
  opts: { turnId?: string } = {},
): TalkHandoffTurnResult {
  const access = resolveTalkHandoffAccess(id, token);
  if (!access.ok) {
    return access;
  }

  const { record } = access;
  const requestedTurnId = normalizeOptionalString(opts.turnId);
  const outcome = record.room.talk.endTurn({
    turnId: requestedTurnId,
    payload: { handoffId: id, roomId: record.roomId },
  });
  if (!outcome.ok) {
    return outcome;
  }

  const publicRecord = toPublicTalkHandoffRecord(record);
  return { ok: true, record: publicRecord, turnId: outcome.turnId, events: [outcome.event] };
}
|
||||
|
||||
/**
 * Cancels a turn in a handoff room.
 *
 * @param id - Handoff id.
 * @param token - Plaintext bearer token.
 * @param opts - Optional `turnId` (trimmed; blank becomes `undefined`) and `reason`
 *   (defaults to "client-cancelled" when not provided).
 * @returns The access error or the room controller's failure result when the turn cannot
 *   be cancelled; otherwise the public record, the cancelled turn id, and the emitted event.
 */
export function cancelTalkHandoffTurn(
  id: string,
  token: string,
  opts: { reason?: string; turnId?: string } = {},
): TalkHandoffTurnResult {
  const access = resolveTalkHandoffAccess(id, token);
  if (!access.ok) {
    return access;
  }

  const { record } = access;
  const requestedTurnId = normalizeOptionalString(opts.turnId);
  const reason = opts.reason ?? "client-cancelled";
  const outcome = record.room.talk.cancelTurn({
    turnId: requestedTurnId,
    payload: { handoffId: id, roomId: record.roomId, reason },
  });
  if (!outcome.ok) {
    return outcome;
  }

  const publicRecord = toPublicTalkHandoffRecord(record);
  return { ok: true, record: publicRecord, turnId: outcome.turnId, events: [outcome.event] };
}
|
||||
|
||||
/**
 * Revokes a handoff by id. No token is required.
 *
 * Expired handoffs are pruned first. If the handoff still exists, a final
 * `session.closed` event with reason "revoked" is emitted and the record is removed.
 *
 * @param id - Handoff id.
 * @returns `{ revoked: false, events: [] }` when no such handoff exists; otherwise the
 *   room id, the client that was active at revocation, and the close event.
 */
export function revokeTalkHandoff(id: string): TalkHandoffRevokeResult {
  pruneExpiredTalkHandoffs();

  const record = handoffs.get(id);
  if (record === undefined) {
    return { revoked: false, events: [] };
  }

  const { roomId } = record;
  const closed = appendTalkHandoffRoomEvent(record, {
    type: "session.closed",
    payload: { reason: "revoked", handoffId: id, roomId },
    final: true,
  });
  handoffs.delete(id);

  return {
    revoked: true,
    roomId,
    activeClientId: record.room.activeClientId,
    events: [closed],
  };
}
|
||||
|
||||
/**
 * Checks whether a plaintext bearer token matches the hash stored on a handoff record.
 *
 * The comparison uses `timingSafeEqual` rather than `===`, so how long the check takes
 * does not depend on how many leading characters of the two hashes agree.
 *
 * @param record - Handoff record holding the stored token hash.
 * @param token - Plaintext token presented by the caller.
 * @returns `true` when the token hashes to the stored value, otherwise `false`.
 */
export function verifyTalkHandoffToken(record: TalkHandoffRecord, token: string): boolean {
  const expected = Buffer.from(record.tokenHash, "utf8");
  const presented = Buffer.from(hashTalkHandoffToken(token), "utf8");
  // timingSafeEqual throws on buffers of different length, so rule that case out first.
  return expected.length === presented.length && timingSafeEqual(expected, presented);
}
|
||||
|
||||
/**
 * Test helper: removes every stored handoff. No room events are emitted for the
 * dropped records.
 */
export function clearTalkHandoffsForTest(): void {
  handoffs.clear();
}
|
||||
|
||||
/**
 * Normalizes a requested handoff TTL.
 *
 * @param value - Requested TTL in milliseconds, if any.
 * @returns The default TTL when `value` is missing or not finite; otherwise the value
 *   truncated to an integer and clamped to the range [1000, MAX_TALK_HANDOFF_TTL_MS].
 */
function normalizeTtlMs(value: number | undefined): number {
  if (value === undefined || !Number.isFinite(value)) {
    return DEFAULT_TALK_HANDOFF_TTL_MS;
  }
  const wholeMs = Math.trunc(value);
  const atLeastOneSecond = wholeMs < 1000 ? 1000 : wholeMs;
  return atLeastOneSecond > MAX_TALK_HANDOFF_TTL_MS ? MAX_TALK_HANDOFF_TTL_MS : atLeastOneSecond;
}
|
||||
|
||||
/**
 * Removes every stored handoff whose `expiresAt` is at or before `now`, emitting a final
 * `session.closed` event with reason "expired" on each room before it is dropped.
 *
 * @param now - Reference time in epoch milliseconds; defaults to the current time.
 */
function pruneExpiredTalkHandoffs(now = Date.now()): void {
  handoffs.forEach((record, id) => {
    const isExpired = record.expiresAt <= now;
    if (!isExpired) {
      return;
    }
    appendTalkHandoffRoomEvent(record, {
      type: "session.closed",
      payload: { reason: "expired", handoffId: id, roomId: record.roomId },
      final: true,
    });
    handoffs.delete(id);
  });
}
|
||||
|
||||
/**
 * Hashes a handoff bearer token.
 *
 * @param token - Plaintext token.
 * @returns The SHA-256 digest of the token, base64url-encoded.
 */
function hashTalkHandoffToken(token: string): string {
  const hasher = createHash("sha256");
  hasher.update(token);
  return hasher.digest("base64url");
}
|
||||
|
||||
/**
 * Builds the public view of a handoff record.
 *
 * The token hash is dropped and the internal room state is replaced by a snapshot holding
 * the active client id, the controller's active turn id, and a copy of its recent events.
 *
 * @param record - Stored handoff record.
 * @returns The record without `tokenHash`, with `room` replaced by the snapshot.
 */
function toPublicTalkHandoffRecord(record: TalkHandoffRecord): TalkHandoffPublicRecord {
  const { tokenHash: _omittedHash, room, ...shared } = record;
  const roomSnapshot = {
    activeClientId: room.activeClientId,
    activeTurnId: room.talk.activeTurnId,
    // Copy the list so callers cannot mutate the controller's own event buffer.
    recentTalkEvents: Array.from(room.talk.recentEvents),
  };
  return { ...shared, room: roomSnapshot };
}
|
||||
|
||||
/**
 * Creates the initial room state for a handoff: a talk session controller whose session
 * id is the room id, with no active client yet.
 *
 * @param params - Room id plus the mode, transport, brain, and optional provider that are
 *   passed through to the controller.
 * @returns Room state containing only the new controller.
 */
function createTalkHandoffRoom(params: {
  roomId: string;
  mode: TalkMode;
  transport: TalkTransport;
  brain: TalkBrain;
  provider?: string;
}): TalkHandoffRoomState {
  const { roomId, mode, transport, brain, provider } = params;
  const talk = createTalkSessionController({
    sessionId: roomId,
    mode,
    transport,
    brain,
    provider,
  });
  return { talk };
}
|
||||
|
||||
/**
 * Resolves a handoff for a token-bearing caller.
 *
 * Checks run in this order: existence, expiry, then token. An expired record gets a final
 * `session.closed` event with reason "expired" and is removed before the error is returned.
 *
 * @param id - Handoff id.
 * @param token - Plaintext bearer token.
 * @returns The record on success, or a failure with reason `not_found`, `expired`,
 *   or `invalid_token`.
 */
function resolveTalkHandoffAccess(
  id: string,
  token: string,
):
  | { ok: true; record: TalkHandoffRecord }
  | { ok: false; reason: "not_found" | "expired" | "invalid_token" } {
  const record = handoffs.get(id);
  if (record === undefined) {
    return { ok: false, reason: "not_found" };
  }

  const isExpired = record.expiresAt <= Date.now();
  if (isExpired) {
    appendTalkHandoffRoomEvent(record, {
      type: "session.closed",
      payload: { reason: "expired", handoffId: id, roomId: record.roomId },
      final: true,
    });
    handoffs.delete(id);
    return { ok: false, reason: "expired" };
  }

  return verifyTalkHandoffToken(record, token)
    ? { ok: true, record }
    : { ok: false, reason: "invalid_token" };
}
|
||||
|
||||
/**
 * Emits an event through the record's room talk controller.
 *
 * @param record - Handoff whose room controller emits the event.
 * @param input - Event to emit.
 * @returns The event returned by the controller's `emit`.
 */
function appendTalkHandoffRoomEvent(record: TalkHandoffRecord, input: TalkEventInput): TalkEvent {
  const emitted = record.room.talk.emit(input);
  return emitted;
}
|
||||
|
||||
/**
 * Makes `clientId` the room's active client and emits the matching lifecycle events.
 *
 * If a different client was active, a `session.replaced` event is emitted first; a
 * `session.ready` event is always emitted afterwards.
 *
 * @param record - Handoff whose room is being joined.
 * @param clientId - Joining client id; may be `undefined`.
 * @returns The emitted events, in emission order.
 */
function joinTalkHandoffRoom(record: TalkHandoffRecord, clientId: string | undefined): TalkEvent[] {
  const { room, roomId } = record;
  const outgoingClientId = room.activeClientId;
  const emitted: TalkEvent[] = [];

  if (outgoingClientId && outgoingClientId !== clientId) {
    const replaced = appendTalkHandoffRoomEvent(record, {
      type: "session.replaced",
      payload: {
        handoffId: record.id,
        roomId,
        previousClientId: outgoingClientId,
        nextClientId: clientId,
      },
    });
    emitted.push(replaced);
  }

  room.activeClientId = clientId;
  const ready = appendTalkHandoffRoomEvent(record, {
    type: "session.ready",
    payload: { handoffId: record.id, roomId, clientId },
  });
  emitted.push(ready);

  return emitted;
}
|
||||
|
||||
/**
 * Trims an optional string.
 *
 * @param value - String to normalize, if any.
 * @returns The trimmed string, or `undefined` when the input is missing or blank.
 */
function normalizeOptionalString(value: string | undefined): string | undefined {
  if (value == null) {
    return undefined;
  }
  const stripped = value.trim();
  return stripped.length > 0 ? stripped : undefined;
}
|
||||
@@ -3,8 +3,10 @@ import type { RealtimeVoiceProviderPlugin } from "../plugins/types.js";
|
||||
import type { RealtimeVoiceBridgeCreateRequest } from "../realtime-voice/provider-types.js";
|
||||
import {
|
||||
acknowledgeTalkRealtimeRelayMark,
|
||||
cancelTalkRealtimeRelayTurn,
|
||||
clearTalkRealtimeRelaySessionsForTest,
|
||||
createTalkRealtimeRelaySession,
|
||||
registerTalkRealtimeRelayAgentRun,
|
||||
sendTalkRealtimeRelayAudio,
|
||||
stopTalkRealtimeRelaySession,
|
||||
submitTalkRealtimeRelayToolResult,
|
||||
@@ -24,6 +26,7 @@ describe("talk realtime gateway relay", () => {
|
||||
bridgeRequest?.onAudio(Buffer.from("audio-out"));
|
||||
bridgeRequest?.onMark?.("mark-1");
|
||||
bridgeRequest?.onTranscript?.("user", "hello", true);
|
||||
bridgeRequest?.onTranscript?.("assistant", "hi there", true);
|
||||
bridgeRequest?.onToolCall?.({
|
||||
itemId: "item-1",
|
||||
callId: "call-1",
|
||||
@@ -35,6 +38,7 @@ describe("talk realtime gateway relay", () => {
|
||||
setMediaTimestamp: vi.fn(),
|
||||
sendUserMessage: vi.fn(),
|
||||
triggerGreeting: vi.fn(),
|
||||
handleBargeIn: vi.fn(),
|
||||
submitToolResult: vi.fn(),
|
||||
acknowledgeMark: vi.fn(),
|
||||
close: vi.fn(),
|
||||
@@ -90,36 +94,74 @@ describe("talk realtime gateway relay", () => {
|
||||
expect.objectContaining({
|
||||
event: "talk.realtime.relay",
|
||||
connIds: ["conn-1"],
|
||||
payload: { relaySessionId: session.relaySessionId, type: "ready" },
|
||||
payload: expect.objectContaining({
|
||||
relaySessionId: session.relaySessionId,
|
||||
type: "ready",
|
||||
talkEvent: expect.objectContaining({
|
||||
sessionId: session.relaySessionId,
|
||||
type: "session.ready",
|
||||
seq: 1,
|
||||
mode: "realtime",
|
||||
transport: "gateway-relay",
|
||||
brain: "agent-consult",
|
||||
provider: "relay-test",
|
||||
}),
|
||||
}),
|
||||
}),
|
||||
expect.objectContaining({
|
||||
payload: {
|
||||
payload: expect.objectContaining({
|
||||
relaySessionId: session.relaySessionId,
|
||||
type: "audio",
|
||||
audioBase64: Buffer.from("audio-out").toString("base64"),
|
||||
},
|
||||
talkEvent: expect.objectContaining({ type: "output.audio.delta" }),
|
||||
}),
|
||||
}),
|
||||
expect.objectContaining({
|
||||
payload: { relaySessionId: session.relaySessionId, type: "mark", markName: "mark-1" },
|
||||
payload: expect.objectContaining({
|
||||
relaySessionId: session.relaySessionId,
|
||||
type: "mark",
|
||||
markName: "mark-1",
|
||||
talkEvent: expect.objectContaining({ type: "output.audio.done", final: true }),
|
||||
}),
|
||||
}),
|
||||
expect.objectContaining({
|
||||
payload: {
|
||||
payload: expect.objectContaining({
|
||||
relaySessionId: session.relaySessionId,
|
||||
type: "transcript",
|
||||
role: "user",
|
||||
text: "hello",
|
||||
final: true,
|
||||
},
|
||||
talkEvent: expect.objectContaining({ type: "transcript.done", final: true }),
|
||||
}),
|
||||
}),
|
||||
expect.objectContaining({
|
||||
payload: {
|
||||
payload: expect.objectContaining({
|
||||
relaySessionId: session.relaySessionId,
|
||||
type: "transcript",
|
||||
role: "assistant",
|
||||
text: "hi there",
|
||||
final: true,
|
||||
talkEvent: expect.objectContaining({
|
||||
type: "output.text.done",
|
||||
final: true,
|
||||
payload: { text: "hi there" },
|
||||
}),
|
||||
}),
|
||||
}),
|
||||
expect.objectContaining({
|
||||
payload: expect.objectContaining({
|
||||
relaySessionId: session.relaySessionId,
|
||||
type: "toolCall",
|
||||
itemId: "item-1",
|
||||
callId: "call-1",
|
||||
name: "openclaw_agent_consult",
|
||||
args: { question: "what now" },
|
||||
},
|
||||
talkEvent: expect.objectContaining({
|
||||
type: "tool.call",
|
||||
itemId: "item-1",
|
||||
callId: "call-1",
|
||||
}),
|
||||
}),
|
||||
}),
|
||||
]),
|
||||
);
|
||||
@@ -137,13 +179,66 @@ describe("talk realtime gateway relay", () => {
|
||||
callId: "call-1",
|
||||
result: { ok: true },
|
||||
});
|
||||
cancelTalkRealtimeRelayTurn({
|
||||
relaySessionId: session.relaySessionId,
|
||||
connId: "conn-1",
|
||||
reason: "barge-in",
|
||||
});
|
||||
stopTalkRealtimeRelaySession({ relaySessionId: session.relaySessionId, connId: "conn-1" });
|
||||
|
||||
expect(bridge.sendAudio).toHaveBeenCalledWith(Buffer.from("audio-in"));
|
||||
expect(bridge.setMediaTimestamp).toHaveBeenCalledWith(123);
|
||||
expect(bridge.acknowledgeMark).toHaveBeenCalled();
|
||||
expect(bridge.submitToolResult).toHaveBeenCalledWith("call-1", { ok: true }, undefined);
|
||||
expect(bridge.handleBargeIn).toHaveBeenCalledWith({ audioPlaybackActive: true });
|
||||
expect(bridge.close).toHaveBeenCalled();
|
||||
expect(events).toEqual(
|
||||
expect.arrayContaining([
|
||||
expect.objectContaining({
|
||||
payload: expect.objectContaining({
|
||||
relaySessionId: session.relaySessionId,
|
||||
type: "inputAudio",
|
||||
byteLength: Buffer.from("audio-in").byteLength,
|
||||
talkEvent: expect.objectContaining({ type: "input.audio.delta" }),
|
||||
}),
|
||||
}),
|
||||
expect.objectContaining({
|
||||
payload: expect.objectContaining({
|
||||
relaySessionId: session.relaySessionId,
|
||||
type: "clear",
|
||||
talkEvent: expect.objectContaining({
|
||||
type: "turn.cancelled",
|
||||
payload: { reason: "barge-in" },
|
||||
final: true,
|
||||
}),
|
||||
}),
|
||||
}),
|
||||
expect.objectContaining({
|
||||
payload: expect.objectContaining({
|
||||
relaySessionId: session.relaySessionId,
|
||||
type: "toolResult",
|
||||
callId: "call-1",
|
||||
talkEvent: expect.objectContaining({
|
||||
type: "tool.result",
|
||||
callId: "call-1",
|
||||
final: true,
|
||||
}),
|
||||
}),
|
||||
}),
|
||||
]),
|
||||
);
|
||||
expect(events).toEqual(
|
||||
expect.arrayContaining([
|
||||
expect.objectContaining({
|
||||
payload: expect.objectContaining({
|
||||
relaySessionId: session.relaySessionId,
|
||||
type: "close",
|
||||
reason: "completed",
|
||||
talkEvent: expect.objectContaining({ type: "session.closed", final: true }),
|
||||
}),
|
||||
}),
|
||||
]),
|
||||
);
|
||||
});
|
||||
|
||||
it("rejects relay control from a different connection", () => {
|
||||
@@ -155,6 +250,7 @@ describe("talk realtime gateway relay", () => {
|
||||
connect: vi.fn(async () => undefined),
|
||||
sendAudio: vi.fn(),
|
||||
setMediaTimestamp: vi.fn(),
|
||||
handleBargeIn: vi.fn(),
|
||||
submitToolResult: vi.fn(),
|
||||
acknowledgeMark: vi.fn(),
|
||||
close: vi.fn(),
|
||||
@@ -179,6 +275,303 @@ describe("talk realtime gateway relay", () => {
|
||||
).toThrow("Unknown realtime relay session");
|
||||
});
|
||||
|
||||
it("correlates output audio with the active relay turn", () => {
  // Captured so the test can drive the provider-side callbacks directly.
  let bridgeRequest: RealtimeVoiceBridgeCreateRequest | undefined;
  const provider: RealtimeVoiceProviderPlugin = {
    id: "relay-test",
    label: "Relay Test",
    isConfigured: () => true,
    createBridge: (req) => {
      bridgeRequest = req;
      return {
        connect: vi.fn(async () => undefined),
        sendAudio: vi.fn(),
        setMediaTimestamp: vi.fn(),
        handleBargeIn: vi.fn(),
        submitToolResult: vi.fn(),
        acknowledgeMark: vi.fn(),
        close: vi.fn(),
        isConnected: vi.fn(() => true),
      };
    },
  };
  // Every relay broadcast is recorded here for inspection.
  const events: Array<{
    event: string;
    payload: { talkEvent?: { type?: string; turnId?: string } };
  }> = [];
  const context = {
    broadcastToConnIds: (
      event: string,
      payload: { talkEvent?: { type?: string; turnId?: string } },
    ) => {
      events.push({ event, payload });
    },
  } as never;
  const session = createTalkRealtimeRelaySession({
    context,
    connId: "conn-1",
    provider,
    providerConfig: {},
    instructions: "brief",
    tools: [],
  });

  // Client audio goes in first, then the provider emits its reply audio.
  sendTalkRealtimeRelayAudio({
    relaySessionId: session.relaySessionId,
    connId: "conn-1",
    audioBase64: Buffer.from("audio").toString("base64"),
  });
  bridgeRequest?.onAudio(Buffer.from("reply"));

  // The outbound audio delta must carry the id of the first turn.
  expect(
    events.some(
      (entry) =>
        entry.payload.talkEvent?.type === "output.audio.delta" &&
        entry.payload.talkEvent.turnId === "turn-1",
    ),
  ).toBe(true);
});
|
||||
|
||||
it("aborts linked agent consult runs when the relay turn is cancelled", () => {
  const abortController = new AbortController();
  const broadcast = vi.fn();
  const nodeSendToSession = vi.fn();
  const removeChatRun = vi.fn(() => ({ sessionKey: "main", clientRunId: "run-1" }));
  const provider: RealtimeVoiceProviderPlugin = {
    id: "relay-test",
    label: "Relay Test",
    isConfigured: () => true,
    createBridge: () => ({
      connect: vi.fn(async () => undefined),
      sendAudio: vi.fn(),
      setMediaTimestamp: vi.fn(),
      handleBargeIn: vi.fn(),
      submitToolResult: vi.fn(),
      acknowledgeMark: vi.fn(),
      close: vi.fn(),
      isConnected: vi.fn(() => true),
    }),
  };
  // Gateway context stub seeded with one in-flight chat run ("run-1").
  const context = {
    broadcastToConnIds: vi.fn(),
    broadcast,
    nodeSendToSession,
    chatAbortControllers: new Map([
      [
        "run-1",
        {
          controller: abortController,
          sessionId: "run-1",
          sessionKey: "main",
          startedAtMs: 1,
          expiresAtMs: Date.now() + 60_000,
        },
      ],
    ]),
    chatRunBuffers: new Map([["run-1", "partial answer"]]),
    chatDeltaSentAt: new Map(),
    chatDeltaLastBroadcastLen: new Map(),
    chatAbortedRuns: new Map(),
    removeChatRun,
    agentRunSeq: new Map(),
  } as never;
  const session = createTalkRealtimeRelaySession({
    context,
    connId: "conn-1",
    provider,
    providerConfig: {},
    instructions: "brief",
    tools: [],
  });

  // Link the run to the relay session, then cancel the turn as a barge-in.
  registerTalkRealtimeRelayAgentRun({
    relaySessionId: session.relaySessionId,
    connId: "conn-1",
    sessionKey: "main",
    runId: "run-1",
  });
  cancelTalkRealtimeRelayTurn({
    relaySessionId: session.relaySessionId,
    connId: "conn-1",
    reason: "barge-in",
  });

  // The cancel reason is expected to surface as the chat stop reason.
  expect(abortController.signal.aborted).toBe(true);
  expect(removeChatRun).toHaveBeenCalledWith("run-1", "run-1", "main");
  expect(broadcast).toHaveBeenCalledWith(
    "chat",
    expect.objectContaining({
      runId: "run-1",
      sessionKey: "main",
      state: "aborted",
      stopReason: "barge-in",
    }),
  );
  expect(nodeSendToSession).toHaveBeenCalledWith(
    "main",
    "chat",
    expect.objectContaining({ runId: "run-1", state: "aborted" }),
  );
});
|
||||
|
||||
it("aborts linked agent consult runs when the relay session closes", () => {
  const abortController = new AbortController();
  const broadcast = vi.fn();
  const nodeSendToSession = vi.fn();
  const removeChatRun = vi.fn(() => ({ sessionKey: "main", clientRunId: "run-1" }));
  const provider: RealtimeVoiceProviderPlugin = {
    id: "relay-test",
    label: "Relay Test",
    isConfigured: () => true,
    createBridge: () => ({
      connect: vi.fn(async () => undefined),
      sendAudio: vi.fn(),
      setMediaTimestamp: vi.fn(),
      handleBargeIn: vi.fn(),
      submitToolResult: vi.fn(),
      acknowledgeMark: vi.fn(),
      close: vi.fn(),
      isConnected: vi.fn(() => true),
    }),
  };
  // Gateway context stub seeded with one in-flight chat run ("run-1").
  const context = {
    broadcastToConnIds: vi.fn(),
    broadcast,
    nodeSendToSession,
    chatAbortControllers: new Map([
      [
        "run-1",
        {
          controller: abortController,
          sessionId: "run-1",
          sessionKey: "main",
          startedAtMs: 1,
          expiresAtMs: Date.now() + 60_000,
        },
      ],
    ]),
    chatRunBuffers: new Map([["run-1", "partial answer"]]),
    chatDeltaSentAt: new Map(),
    chatDeltaLastBroadcastLen: new Map(),
    chatAbortedRuns: new Map(),
    removeChatRun,
    agentRunSeq: new Map(),
  } as never;
  const session = createTalkRealtimeRelaySession({
    context,
    connId: "conn-1",
    provider,
    providerConfig: {},
    instructions: "brief",
    tools: [],
  });

  // Link the run to the relay session, then stop the session from the client side.
  registerTalkRealtimeRelayAgentRun({
    relaySessionId: session.relaySessionId,
    connId: "conn-1",
    sessionKey: "main",
    runId: "run-1",
  });
  stopTalkRealtimeRelaySession({ relaySessionId: session.relaySessionId, connId: "conn-1" });

  // A client-initiated stop is expected to abort the run with "relay-closed".
  expect(abortController.signal.aborted).toBe(true);
  expect(broadcast).toHaveBeenCalledWith(
    "chat",
    expect.objectContaining({
      runId: "run-1",
      sessionKey: "main",
      state: "aborted",
      stopReason: "relay-closed",
    }),
  );
  expect(nodeSendToSession).toHaveBeenCalledWith(
    "main",
    "chat",
    expect.objectContaining({ runId: "run-1", state: "aborted" }),
  );
});
|
||||
|
||||
it("aborts linked agent consult runs when the provider closes the relay", () => {
  const abortController = new AbortController();
  // Captured so the test can invoke the provider-side `onClose` callback.
  let bridgeRequest: RealtimeVoiceBridgeCreateRequest | undefined;
  const broadcast = vi.fn();
  const nodeSendToSession = vi.fn();
  const removeChatRun = vi.fn(() => ({ sessionKey: "main", clientRunId: "run-1" }));
  const provider: RealtimeVoiceProviderPlugin = {
    id: "relay-test",
    label: "Relay Test",
    isConfigured: () => true,
    createBridge: (req) => {
      bridgeRequest = req;
      return {
        connect: vi.fn(async () => undefined),
        sendAudio: vi.fn(),
        setMediaTimestamp: vi.fn(),
        handleBargeIn: vi.fn(),
        submitToolResult: vi.fn(),
        acknowledgeMark: vi.fn(),
        close: vi.fn(),
        isConnected: vi.fn(() => true),
      };
    },
  };
  // Gateway context stub seeded with one in-flight chat run ("run-1").
  const context = {
    broadcastToConnIds: vi.fn(),
    broadcast,
    nodeSendToSession,
    chatAbortControllers: new Map([
      [
        "run-1",
        {
          controller: abortController,
          sessionId: "run-1",
          sessionKey: "main",
          startedAtMs: 1,
          expiresAtMs: Date.now() + 60_000,
        },
      ],
    ]),
    chatRunBuffers: new Map([["run-1", "partial answer"]]),
    chatDeltaSentAt: new Map(),
    chatDeltaLastBroadcastLen: new Map(),
    chatAbortedRuns: new Map(),
    removeChatRun,
    agentRunSeq: new Map(),
  } as never;
  const session = createTalkRealtimeRelaySession({
    context,
    connId: "conn-1",
    provider,
    providerConfig: {},
    instructions: "brief",
    tools: [],
  });

  // Link the run to the relay session, then simulate the provider closing with an error.
  registerTalkRealtimeRelayAgentRun({
    relaySessionId: session.relaySessionId,
    connId: "conn-1",
    sessionKey: "main",
    runId: "run-1",
  });
  bridgeRequest?.onClose?.("error");

  // Even for an "error" close, the expected stop reason here is "relay-closed".
  expect(abortController.signal.aborted).toBe(true);
  expect(broadcast).toHaveBeenCalledWith(
    "chat",
    expect.objectContaining({
      runId: "run-1",
      sessionKey: "main",
      state: "aborted",
      stopReason: "relay-closed",
    }),
  );
  expect(nodeSendToSession).toHaveBeenCalledWith(
    "main",
    "chat",
    expect.objectContaining({ runId: "run-1", state: "aborted" }),
  );
});
|
||||
|
||||
it("caps active relay sessions per browser connection", () => {
|
||||
const provider: RealtimeVoiceProviderPlugin = {
|
||||
id: "relay-test",
|
||||
@@ -188,6 +581,7 @@ describe("talk realtime gateway relay", () => {
|
||||
connect: vi.fn(async () => undefined),
|
||||
sendAudio: vi.fn(),
|
||||
setMediaTimestamp: vi.fn(),
|
||||
handleBargeIn: vi.fn(),
|
||||
submitToolResult: vi.fn(),
|
||||
acknowledgeMark: vi.fn(),
|
||||
close: vi.fn(),
|
||||
|
||||
@@ -10,6 +10,13 @@ import {
|
||||
createRealtimeVoiceBridgeSession,
|
||||
type RealtimeVoiceBridgeSession,
|
||||
} from "../realtime-voice/session-runtime.js";
|
||||
import {
|
||||
type TalkEvent,
|
||||
type TalkEventInput,
|
||||
type TalkSessionController,
|
||||
createTalkSessionController,
|
||||
} from "../realtime-voice/talk-session-controller.js";
|
||||
import { abortChatRunById } from "./chat-abort.js";
|
||||
import type { GatewayRequestContext } from "./server-methods/shared-types.js";
|
||||
|
||||
const RELAY_SESSION_TTL_MS = 30 * 60 * 1000;
|
||||
@@ -18,8 +25,9 @@ const MAX_RELAY_SESSIONS_PER_CONN = 2;
|
||||
const MAX_RELAY_SESSIONS_GLOBAL = 64;
|
||||
const RELAY_EVENT = "talk.realtime.relay";
|
||||
|
||||
type TalkRealtimeRelayEvent =
|
||||
type TalkRealtimeRelayEventPayload =
|
||||
| { relaySessionId: string; type: "ready" }
|
||||
| { relaySessionId: string; type: "inputAudio"; byteLength: number }
|
||||
| { relaySessionId: string; type: "audio"; audioBase64: string }
|
||||
| { relaySessionId: string; type: "clear" }
|
||||
| { relaySessionId: string; type: "mark"; markName: string }
|
||||
@@ -38,16 +46,21 @@ type TalkRealtimeRelayEvent =
|
||||
name: string;
|
||||
args: unknown;
|
||||
}
|
||||
| { relaySessionId: string; type: "toolResult"; callId: string }
|
||||
| { relaySessionId: string; type: "error"; message: string }
|
||||
| { relaySessionId: string; type: "close"; reason: "completed" | "error" };
|
||||
|
||||
type TalkRealtimeRelayEvent = TalkRealtimeRelayEventPayload & { talkEvent?: TalkEvent };
|
||||
|
||||
type RelaySession = {
|
||||
id: string;
|
||||
connId: string;
|
||||
context: GatewayRequestContext;
|
||||
bridge: RealtimeVoiceBridgeSession;
|
||||
talk: TalkSessionController;
|
||||
expiresAtMs: number;
|
||||
cleanupTimer: ReturnType<typeof setTimeout>;
|
||||
activeAgentRuns: Map<string, string>;
|
||||
};
|
||||
|
||||
type CreateTalkRealtimeRelaySessionParams = {
|
||||
@@ -85,14 +98,31 @@ function broadcastToOwner(
|
||||
context.broadcastToConnIds(RELAY_EVENT, event, new Set([connId]), { dropIfSlow: true });
|
||||
}
|
||||
|
||||
/**
 * Aborts every agent run currently linked to a relay session.
 *
 * Each `runId -> sessionKey` entry in `session.activeAgentRuns` is passed to
 * `abortChatRunById` with `reason` as the stop reason, after which the map is
 * emptied.
 *
 * @param session - Relay session whose linked runs should be aborted.
 * @param reason - Stop reason forwarded to `abortChatRunById` for each run.
 */
function abortRelayAgentRuns(session: RelaySession, reason: string): void {
  for (const [runId, sessionKey] of session.activeAgentRuns) {
    abortChatRunById(session.context, {
      runId,
      sessionKey,
      stopReason: reason,
    });
  }
  session.activeAgentRuns.clear();
}
|
||||
|
||||
/**
 * Tears down a relay session and notifies the owning connection.
 *
 * Steps, in order: remove the session from `relaySessions`, cancel its cleanup
 * timer, abort linked agent runs, close the provider bridge, and broadcast a
 * `close` payload carrying a final `session.closed` talk event.
 *
 * @param session - Relay session to close.
 * @param reason - Close reason reported to the owner. `"error"` aborts linked
 *   runs with `"relay-error"`; any other reason uses `"relay-closed"`.
 */
function closeRelaySession(session: RelaySession, reason: "completed" | "error"): void {
  relaySessions.delete(session.id);
  clearTimeout(session.cleanupTimer);
  abortRelayAgentRuns(session, reason === "error" ? "relay-error" : "relay-closed");
  session.bridge.close();
  broadcastToOwner(session.context, session.connId, {
    relaySessionId: session.id,
    type: "close",
    reason,
    talkEvent: session.talk.emit({
      type: "session.closed",
      payload: { reason },
      final: true,
    }),
  });
}
|
||||
|
||||
@@ -130,9 +160,19 @@ export function createTalkRealtimeRelaySession(
|
||||
enforceRelaySessionLimits(params.connId);
|
||||
const relaySessionId = randomUUID();
|
||||
const expiresAtMs = Date.now() + RELAY_SESSION_TTL_MS;
|
||||
const talk = createTalkSessionController({
|
||||
sessionId: relaySessionId,
|
||||
mode: "realtime",
|
||||
transport: "gateway-relay",
|
||||
brain: "agent-consult",
|
||||
provider: params.provider.id,
|
||||
});
|
||||
let relay: RelaySession | undefined;
|
||||
const emit = (event: TalkRealtimeRelayEvent) =>
|
||||
broadcastToOwner(params.context, params.connId, event);
|
||||
const emit = (event: TalkRealtimeRelayEventPayload, talkEvent?: TalkEventInput) =>
|
||||
broadcastToOwner(params.context, params.connId, {
|
||||
...event,
|
||||
...(talkEvent ? { talkEvent: talk.emit(talkEvent) } : {}),
|
||||
});
|
||||
const bridge = createRealtimeVoiceBridgeSession({
|
||||
provider: params.provider,
|
||||
providerConfig: params.providerConfig,
|
||||
@@ -142,30 +182,94 @@ export function createTalkRealtimeRelaySession(
|
||||
markStrategy: "transport",
|
||||
audioSink: {
|
||||
isOpen: () => Boolean(relay && relaySessions.has(relay.id)),
|
||||
sendAudio: (audio) =>
|
||||
emit({
|
||||
relaySessionId,
|
||||
type: "audio",
|
||||
audioBase64: audio.toString("base64"),
|
||||
}),
|
||||
clearAudio: () => emit({ relaySessionId, type: "clear" }),
|
||||
sendMark: (markName) => emit({ relaySessionId, type: "mark", markName }),
|
||||
sendAudio: (audio) => {
|
||||
const turnId = relay ? ensureRelayTurn(relay) : undefined;
|
||||
emit(
|
||||
{
|
||||
relaySessionId,
|
||||
type: "audio",
|
||||
audioBase64: audio.toString("base64"),
|
||||
},
|
||||
{
|
||||
type: "output.audio.delta",
|
||||
turnId,
|
||||
payload: { byteLength: audio.length },
|
||||
},
|
||||
);
|
||||
},
|
||||
clearAudio: () => {
|
||||
const turnId = relay ? ensureRelayTurn(relay) : undefined;
|
||||
emit(
|
||||
{ relaySessionId, type: "clear" },
|
||||
{
|
||||
type: "output.audio.done",
|
||||
turnId,
|
||||
payload: { reason: "clear" },
|
||||
final: true,
|
||||
},
|
||||
);
|
||||
},
|
||||
sendMark: (markName) => {
|
||||
const turnId = relay ? ensureRelayTurn(relay) : undefined;
|
||||
emit(
|
||||
{ relaySessionId, type: "mark", markName },
|
||||
{
|
||||
type: "output.audio.done",
|
||||
turnId,
|
||||
payload: { markName },
|
||||
final: true,
|
||||
},
|
||||
);
|
||||
},
|
||||
},
|
||||
onTranscript: (role, text, final) => {
|
||||
emit({ relaySessionId, type: "transcript", role, text, final });
|
||||
const turnId = relay ? ensureRelayTurn(relay) : undefined;
|
||||
const eventType =
|
||||
role === "assistant"
|
||||
? final
|
||||
? "output.text.done"
|
||||
: "output.text.delta"
|
||||
: final
|
||||
? "transcript.done"
|
||||
: "transcript.delta";
|
||||
const payload = role === "assistant" ? { text } : { role, text };
|
||||
emit(
|
||||
{ relaySessionId, type: "transcript", role, text, final },
|
||||
{
|
||||
type: eventType,
|
||||
turnId,
|
||||
payload,
|
||||
final,
|
||||
},
|
||||
);
|
||||
},
|
||||
onToolCall: (toolCall) => {
|
||||
emit({
|
||||
relaySessionId,
|
||||
type: "toolCall",
|
||||
itemId: toolCall.itemId,
|
||||
callId: toolCall.callId,
|
||||
name: toolCall.name,
|
||||
args: toolCall.args,
|
||||
});
|
||||
const turnId = relay ? ensureRelayTurn(relay) : undefined;
|
||||
emit(
|
||||
{
|
||||
relaySessionId,
|
||||
type: "toolCall",
|
||||
itemId: toolCall.itemId,
|
||||
callId: toolCall.callId,
|
||||
name: toolCall.name,
|
||||
args: toolCall.args,
|
||||
},
|
||||
{
|
||||
type: "tool.call",
|
||||
itemId: toolCall.itemId,
|
||||
callId: toolCall.callId,
|
||||
turnId,
|
||||
payload: { name: toolCall.name, args: toolCall.args },
|
||||
},
|
||||
);
|
||||
},
|
||||
onReady: () => emit({ relaySessionId, type: "ready" }),
|
||||
onError: (error) => emit({ relaySessionId, type: "error", message: error.message }),
|
||||
onReady: () =>
|
||||
emit({ relaySessionId, type: "ready" }, { type: "session.ready", payload: null }),
|
||||
onError: (error) =>
|
||||
emit(
|
||||
{ relaySessionId, type: "error", message: error.message },
|
||||
{ type: "session.error", payload: { message: error.message }, final: true },
|
||||
),
|
||||
onClose: (reason) => {
|
||||
const active = relaySessions.get(relaySessionId);
|
||||
if (!active) {
|
||||
@@ -173,7 +277,11 @@ export function createTalkRealtimeRelaySession(
|
||||
}
|
||||
relaySessions.delete(relaySessionId);
|
||||
clearTimeout(active.cleanupTimer);
|
||||
emit({ relaySessionId, type: "close", reason });
|
||||
abortRelayAgentRuns(active, "relay-closed");
|
||||
emit(
|
||||
{ relaySessionId, type: "close", reason },
|
||||
{ type: "session.closed", payload: { reason }, final: true },
|
||||
);
|
||||
},
|
||||
});
|
||||
relay = {
|
||||
@@ -181,6 +289,7 @@ export function createTalkRealtimeRelaySession(
|
||||
connId: params.connId,
|
||||
context: params.context,
|
||||
bridge,
|
||||
talk,
|
||||
expiresAtMs,
|
||||
cleanupTimer: setTimeout(() => {
|
||||
const active = relaySessions.get(relaySessionId);
|
||||
@@ -188,6 +297,7 @@ export function createTalkRealtimeRelaySession(
|
||||
closeRelaySession(active, "completed");
|
||||
}
|
||||
}, RELAY_SESSION_TTL_MS),
|
||||
activeAgentRuns: new Map(),
|
||||
};
|
||||
relay.cleanupTimer.unref?.();
|
||||
relaySessions.set(relaySessionId, relay);
|
||||
@@ -215,6 +325,19 @@ export function createTalkRealtimeRelaySession(
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Returns the id of the session's current turn via `session.talk.ensureTurn()`.
 *
 * When `ensureTurn()` also yields an event, that event is broadcast to the
 * owning connection on a zero-length `inputAudio` payload.
 * NOTE(review): the event is presumably only present when a new turn was just
 * started; confirm against the talk session controller.
 *
 * @param session - Relay session whose turn is required.
 * @returns The turn id reported by the talk session controller.
 */
function ensureRelayTurn(session: RelaySession): string {
  const turn = session.talk.ensureTurn();
  if (turn.event) {
    broadcastToOwner(session.context, session.connId, {
      relaySessionId: session.id,
      type: "inputAudio",
      byteLength: 0,
      talkEvent: turn.event,
    });
  }
  return turn.turnId;
}
|
||||
|
||||
function getRelaySession(relaySessionId: string, connId: string): RelaySession {
|
||||
const session = relaySessions.get(relaySessionId);
|
||||
if (!session || session.connId !== connId || Date.now() > session.expiresAtMs) {
|
||||
@@ -236,8 +359,19 @@ export function sendTalkRealtimeRelayAudio(params: {
|
||||
throw new Error("Realtime relay audio frame is too large");
|
||||
}
|
||||
const session = getRelaySession(params.relaySessionId, params.connId);
|
||||
const turnId = ensureRelayTurn(session);
|
||||
const audio = Buffer.from(params.audioBase64, "base64");
|
||||
session.bridge.sendAudio(audio);
|
||||
broadcastToOwner(session.context, session.connId, {
|
||||
relaySessionId: session.id,
|
||||
type: "inputAudio",
|
||||
byteLength: audio.byteLength,
|
||||
talkEvent: session.talk.emit({
|
||||
type: "input.audio.delta",
|
||||
turnId,
|
||||
payload: { byteLength: audio.byteLength },
|
||||
}),
|
||||
});
|
||||
if (typeof params.timestamp === "number" && Number.isFinite(params.timestamp)) {
|
||||
session.bridge.setMediaTimestamp(params.timestamp);
|
||||
}
|
||||
@@ -256,10 +390,52 @@ export function submitTalkRealtimeRelayToolResult(params: {
|
||||
callId: string;
|
||||
result: unknown;
|
||||
}): void {
|
||||
getRelaySession(params.relaySessionId, params.connId).bridge.submitToolResult(
|
||||
params.callId,
|
||||
params.result,
|
||||
);
|
||||
const session = getRelaySession(params.relaySessionId, params.connId);
|
||||
session.bridge.submitToolResult(params.callId, params.result);
|
||||
const turnId = ensureRelayTurn(session);
|
||||
broadcastToOwner(session.context, session.connId, {
|
||||
relaySessionId: session.id,
|
||||
type: "toolResult",
|
||||
callId: params.callId,
|
||||
talkEvent: session.talk.emit({
|
||||
type: "tool.result",
|
||||
callId: params.callId,
|
||||
turnId,
|
||||
payload: { result: params.result },
|
||||
final: true,
|
||||
}),
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Links an agent run to a relay session so the relay can abort it later.
 *
 * The session is resolved through `getRelaySession` from `relaySessionId` and
 * `connId`; the run is then recorded in `activeAgentRuns` as
 * `runId -> sessionKey`. Registering the same `runId` again overwrites its
 * stored session key.
 *
 * @param params.relaySessionId - Id of the relay session.
 * @param params.connId - Connection id making the request.
 * @param params.sessionKey - Chat session key the run belongs to.
 * @param params.runId - Id of the agent run to track.
 */
export function registerTalkRealtimeRelayAgentRun(params: {
  relaySessionId: string;
  connId: string;
  sessionKey: string;
  runId: string;
}): void {
  const session = getRelaySession(params.relaySessionId, params.connId);
  session.activeAgentRuns.set(params.runId, params.sessionKey);
}
|
||||
|
||||
/**
 * Cancels the current turn of a relay session.
 *
 * Signals a barge-in to the provider bridge, aborts every linked agent run
 * with the cancel reason, cancels the turn on the talk session controller,
 * and broadcasts a `clear` payload to the owning connection.
 *
 * @param params.relaySessionId - Id of the relay session.
 * @param params.connId - Connection id making the request.
 * @param params.reason - Optional cancel reason; defaults to
 *   `"client-cancelled"`.
 */
export function cancelTalkRealtimeRelayTurn(params: {
  relaySessionId: string;
  connId: string;
  reason?: string;
}): void {
  const session = getRelaySession(params.relaySessionId, params.connId);
  const turnId = ensureRelayTurn(session);
  const reason = params.reason ?? "client-cancelled";
  session.bridge.handleBargeIn({ audioPlaybackActive: true });
  abortRelayAgentRuns(session, reason);
  const cancelled = session.talk.cancelTurn({
    turnId,
    payload: { reason },
  });
  broadcastToOwner(session.context, session.connId, {
    relaySessionId: session.id,
    type: "clear",
    // The talk event is attached only when the controller accepted the cancel.
    talkEvent: cancelled.ok ? cancelled.event : undefined,
  });
}
|
||||
|
||||
export function stopTalkRealtimeRelaySession(params: {
|
||||
|
||||
52
src/gateway/talk-session-registry.ts
Normal file
52
src/gateway/talk-session-registry.ts
Normal file
@@ -0,0 +1,52 @@
|
||||
/**
 * Record stored for a unified Talk session. `kind` discriminates the three
 * variants: the two relay variants carry the owning `connId`, while the
 * managed-room variant carries handoff, token, and room identifiers instead.
 */
export type UnifiedTalkSessionRecord =
  | {
      kind: "realtime-relay";
      connId: string;
      relaySessionId: string;
    }
  | {
      kind: "transcription-relay";
      connId: string;
      transcriptionSessionId: string;
    }
  | {
      kind: "managed-room";
      handoffId: string;
      token: string;
      roomId: string;
    };

/** Module-level registry of unified Talk sessions, keyed by session id. */
const unifiedTalkSessions = new Map<string, UnifiedTalkSessionRecord>();

/**
 * Stores a record under `sessionId`, replacing any record already stored
 * under that id.
 */
export function rememberUnifiedTalkSession(
  sessionId: string,
  session: UnifiedTalkSessionRecord,
): void {
  unifiedTalkSessions.set(sessionId, session);
}

/**
 * Looks up the record stored under `sessionId`.
 *
 * @throws Error("Unknown Talk session") when no record is stored for the id.
 */
export function getUnifiedTalkSession(sessionId: string): UnifiedTalkSessionRecord {
  const session = unifiedTalkSessions.get(sessionId);
  if (!session) {
    throw new Error("Unknown Talk session");
  }
  return session;
}

/** Removes the record stored under `sessionId`; a no-op when none exists. */
export function forgetUnifiedTalkSession(sessionId: string): void {
  unifiedTalkSessions.delete(sessionId);
}

/**
 * Checks that a connection-owned session record belongs to `connId`.
 *
 * @param session - A record variant that carries a `connId`.
 * @param connId - Connection id of the caller, possibly missing.
 * @returns `connId`, narrowed to `string`, when it matches the record's owner.
 * @throws Error("Talk session is not owned by this connection") when `connId`
 *   is missing, empty, or differs from `session.connId`.
 */
export function requireUnifiedTalkSessionConn(
  session: Extract<UnifiedTalkSessionRecord, { connId: string }>,
  connId: string | undefined,
): string {
  if (!connId || session.connId !== connId) {
    throw new Error("Talk session is not owned by this connection");
  }
  return connId;
}

/** Empties the registry; the name marks it as intended for test cleanup. */
export function clearUnifiedTalkSessionsForTest(): void {
  unifiedTalkSessions.clear();
}
|
||||
216
src/gateway/talk-transcription-relay.test.ts
Normal file
216
src/gateway/talk-transcription-relay.test.ts
Normal file
@@ -0,0 +1,216 @@
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
import type { RealtimeTranscriptionProviderPlugin } from "../plugins/types.js";
|
||||
import type { RealtimeTranscriptionSessionCreateRequest } from "../realtime-transcription/provider-types.js";
|
||||
import {
|
||||
cancelTalkTranscriptionRelayTurn,
|
||||
clearTalkTranscriptionRelaySessionsForTest,
|
||||
createTalkTranscriptionRelaySession,
|
||||
sendTalkTranscriptionRelayAudio,
|
||||
stopTalkTranscriptionRelaySession,
|
||||
} from "./talk-transcription-relay.js";
|
||||
|
||||
describe("talk transcription gateway relay", () => {
  type SttRequest = RealtimeTranscriptionSessionCreateRequest;
  type BroadcastRecord = { event: string; payload: unknown; connIds: string[] };

  const CONN_ID = "conn-1";

  /**
   * Builds a fake STT provider plus a gateway context that records every
   * broadcast. `onConnect` runs inside the fake session's `connect()` and
   * receives the request the relay handed to `createSession`, so each test can
   * decide which provider callbacks fire during connect.
   */
  function createRelayHarness(onConnect: (request: SttRequest | undefined) => void) {
    let capturedRequest: SttRequest | undefined;
    const sttSession = {
      connect: vi.fn(async () => {
        onConnect(capturedRequest);
      }),
      sendAudio: vi.fn(),
      close: vi.fn(),
      isConnected: vi.fn(() => true),
    };
    const provider: RealtimeTranscriptionProviderPlugin = {
      id: "stt-test",
      label: "STT Test",
      isConfigured: () => true,
      createSession: (req) => {
        capturedRequest = req;
        return sttSession;
      },
    };
    const events: BroadcastRecord[] = [];
    const context = {
      broadcastToConnIds: (event: string, payload: unknown, connIds: ReadonlySet<string>) => {
        events.push({ event, payload, connIds: [...connIds] });
      },
    } as never;
    return { sttSession, provider, events, context, getRequest: () => capturedRequest };
  }

  /** Matcher for a recorded broadcast whose payload belongs to `sessionId` and contains `fields`. */
  function payloadMatching(sessionId: string, fields: Record<string, unknown>) {
    return expect.objectContaining({
      payload: expect.objectContaining({ transcriptionSessionId: sessionId, ...fields }),
    });
  }

  afterEach(() => {
    clearTalkTranscriptionRelaySessionsForTest();
  });

  it("bridges browser audio into a transcription-only Talk event stream", async () => {
    const { sttSession, provider, events, context, getRequest } = createRelayHarness((request) => {
      request?.onSpeechStart?.();
      request?.onPartial?.("hel");
      request?.onTranscript?.("hello world");
    });

    const session = createTalkTranscriptionRelaySession({
      context,
      connId: CONN_ID,
      provider,
      providerConfig: { model: "stt-model" },
    });
    await Promise.resolve();

    expect(session).toMatchObject({
      provider: "stt-test",
      mode: "transcription",
      transport: "gateway-relay",
      audio: {
        inputEncoding: "pcm16",
        inputSampleRateHz: 24000,
      },
    });
    expect(getRequest()).toMatchObject({
      providerConfig: { model: "stt-model" },
    });

    const sessionId = session.transcriptionSessionId;
    sendTalkTranscriptionRelayAudio({
      transcriptionSessionId: sessionId,
      connId: CONN_ID,
      audioBase64: Buffer.from("audio-in").toString("base64"),
    });
    stopTalkTranscriptionRelaySession({
      transcriptionSessionId: sessionId,
      connId: CONN_ID,
    });

    expect(sttSession.sendAudio).toHaveBeenCalledWith(Buffer.from("audio-in"));
    expect(sttSession.close).toHaveBeenCalledOnce();
    expect(events).toEqual(
      expect.arrayContaining([
        expect.objectContaining({
          event: "talk.transcription.relay",
          connIds: ["conn-1"],
          payload: expect.objectContaining({
            transcriptionSessionId: sessionId,
            type: "ready",
            talkEvent: expect.objectContaining({
              sessionId,
              type: "session.ready",
              mode: "transcription",
              transport: "gateway-relay",
              brain: "none",
              provider: "stt-test",
            }),
          }),
        }),
        payloadMatching(sessionId, {
          type: "speechStart",
          talkEvent: expect.objectContaining({ type: "turn.started", turnId: "turn-1" }),
        }),
        payloadMatching(sessionId, {
          type: "partial",
          text: "hel",
          talkEvent: expect.objectContaining({
            type: "transcript.delta",
            turnId: "turn-1",
            payload: { text: "hel" },
          }),
        }),
        payloadMatching(sessionId, {
          type: "transcript",
          text: "hello world",
          final: true,
          talkEvent: expect.objectContaining({
            type: "transcript.done",
            turnId: "turn-1",
            final: true,
            payload: { text: "hello world" },
          }),
        }),
        payloadMatching(sessionId, {
          type: "inputAudio",
          byteLength: 8,
          talkEvent: expect.objectContaining({ type: "input.audio.delta" }),
        }),
        payloadMatching(sessionId, {
          type: "close",
          reason: "completed",
          talkEvent: expect.objectContaining({
            type: "session.closed",
            final: true,
          }),
        }),
      ]),
    );
  });

  it("cancels an active transcription turn and closes the provider session", async () => {
    const { sttSession, provider, events, context } = createRelayHarness((request) => {
      request?.onSpeechStart?.();
    });

    const session = createTalkTranscriptionRelaySession({
      context,
      connId: CONN_ID,
      provider,
      providerConfig: {},
    });
    await Promise.resolve();

    const sessionId = session.transcriptionSessionId;
    cancelTalkTranscriptionRelayTurn({
      transcriptionSessionId: sessionId,
      connId: CONN_ID,
      reason: "barge-in",
    });

    expect(sttSession.close).toHaveBeenCalledOnce();
    expect(events).toEqual(
      expect.arrayContaining([
        payloadMatching(sessionId, {
          talkEvent: expect.objectContaining({
            type: "turn.cancelled",
            turnId: "turn-1",
            payload: { reason: "barge-in" },
            final: true,
          }),
        }),
        payloadMatching(sessionId, {
          type: "close",
          reason: "completed",
        }),
      ]),
    );
  });
});
|
||||
354
src/gateway/talk-transcription-relay.ts
Normal file
354
src/gateway/talk-transcription-relay.ts
Normal file
@@ -0,0 +1,354 @@
|
||||
import { randomUUID } from "node:crypto";
|
||||
import type { RealtimeTranscriptionProviderPlugin } from "../plugins/types.js";
|
||||
import type { RealtimeTranscriptionProviderConfig } from "../realtime-transcription/provider-types.js";
|
||||
import {
|
||||
type TalkEvent,
|
||||
type TalkEventInput,
|
||||
type TalkSessionController,
|
||||
createTalkSessionController,
|
||||
} from "../realtime-voice/talk-session-controller.js";
|
||||
import type { GatewayRequestContext } from "./server-methods/shared-types.js";
|
||||
|
||||
/** Lifetime of a relay session in milliseconds; used for both `expiresAtMs` and the cleanup timer. */
const TRANSCRIPTION_SESSION_TTL_MS = 30 * 60 * 1000;

/** Upper bound on the length of a single base64 audio frame string (checked before decoding). */
const MAX_AUDIO_BASE64_BYTES = 512 * 1024;

/** Maximum number of live relay sessions a single connection may own. */
const MAX_TRANSCRIPTION_SESSIONS_PER_CONN = 2;

/** Maximum number of live relay sessions across all connections. */
const MAX_TRANSCRIPTION_SESSIONS_GLOBAL = 64;

/** Gateway event name under which every relay payload is broadcast. */
const TRANSCRIPTION_EVENT = "talk.transcription.relay";
|
||||
|
||||
/**
 * Relay payloads delivered to the owning connection. Every variant carries the
 * relay session id and a `type` discriminator.
 */
type TalkTranscriptionRelayEventPayload =
  | { transcriptionSessionId: string; type: "ready" }
  | { transcriptionSessionId: string; type: "speechStart" }
  | { transcriptionSessionId: string; type: "inputAudio"; byteLength: number }
  | { transcriptionSessionId: string; type: "partial"; text: string }
  | { transcriptionSessionId: string; type: "transcript"; text: string; final: true }
  | { transcriptionSessionId: string; type: "error"; message: string }
  | { transcriptionSessionId: string; type: "close"; reason: "completed" | "error" };

/** A relay payload, optionally annotated with the common Talk event envelope. */
type TalkTranscriptionRelayEvent = TalkTranscriptionRelayEventPayload & {
  talkEvent?: TalkEvent;
};

/** In-memory bookkeeping for one live relay session. */
type TranscriptionRelaySession = {
  /** Relay session id; also the key in the session registry. */
  id: string;
  /** Connection that created the session and receives its events. */
  connId: string;
  context: GatewayRequestContext;
  provider: RealtimeTranscriptionProviderPlugin;
  /** Provider session returned by `provider.createSession`. */
  sttSession: ReturnType<RealtimeTranscriptionProviderPlugin["createSession"]>;
  /** Controller that produces the Talk events attached to relay payloads. */
  talk: TalkSessionController;
  /** Expiry instant in epoch milliseconds. */
  expiresAtMs: number;
  /** Timer that closes the session once the TTL elapses. */
  cleanupTimer: ReturnType<typeof setTimeout>;
  /** Set once the session has been closed, making close idempotent. */
  closed: boolean;
};

/** Arguments accepted by `createTalkTranscriptionRelaySession`. */
type CreateTalkTranscriptionRelaySessionParams = {
  context: GatewayRequestContext;
  connId: string;
  provider: RealtimeTranscriptionProviderPlugin;
  providerConfig: RealtimeTranscriptionProviderConfig;
};

/** Descriptor returned to the caller after a relay session has been created. */
type TalkTranscriptionRelaySessionResult = {
  provider: string;
  mode: "transcription";
  transport: "gateway-relay";
  transcriptionSessionId: string;
  audio: {
    inputEncoding: "pcm16";
    inputSampleRateHz: 24000;
  };
  /** Expiry instant in whole epoch seconds. */
  expiresAt: number;
};

/** Registry of live relay sessions keyed by relay session id. */
const transcriptionSessions = new Map<string, TranscriptionRelaySession>();
|
||||
|
||||
/**
 * Sends a relay event to exactly one connection (the session owner).
 *
 * The broadcast is issued with `dropIfSlow: true`.
 */
function broadcastToOwner(
  context: GatewayRequestContext,
  connId: string,
  event: TalkTranscriptionRelayEvent,
): void {
  const ownerOnly = new Set([connId]);
  context.broadcastToConnIds(TRANSCRIPTION_EVENT, event, ownerOnly, { dropIfSlow: true });
}
|
||||
|
||||
/**
 * Asks the session's Talk controller for a turn and returns that turn's id.
 *
 * When the controller reports an event for the call, that event is forwarded
 * to the owner as a `speechStart` relay payload.
 */
function ensureTranscriptionTurn(session: TranscriptionRelaySession): string {
  const { turnId, event: startedEvent } = session.talk.ensureTurn();
  if (startedEvent) {
    broadcastToOwner(session.context, session.connId, {
      transcriptionSessionId: session.id,
      type: "speechStart",
      talkEvent: startedEvent,
    });
  }
  return turnId;
}
|
||||
|
||||
/**
 * Tears down a relay session exactly once.
 *
 * Marks it closed, removes it from the registry, cancels the TTL timer, closes
 * the provider session, and finally notifies the owner with a `close` payload
 * carrying a final `session.closed` Talk event. Repeat calls are no-ops.
 */
function closeTranscriptionSession(
  session: TranscriptionRelaySession,
  reason: "completed" | "error",
): void {
  if (session.closed) {
    return;
  }

  // Flip the flag and unregister first so a re-entrant close short-circuits.
  session.closed = true;
  transcriptionSessions.delete(session.id);
  clearTimeout(session.cleanupTimer);
  session.sttSession.close();

  const closedEvent = session.talk.emit({
    type: "session.closed",
    payload: { reason },
    final: true,
  });
  broadcastToOwner(session.context, session.connId, {
    transcriptionSessionId: session.id,
    type: "close",
    reason,
    talkEvent: closedEvent,
  });
}
|
||||
|
||||
/**
 * Closes every registered session whose expiry lies strictly before `nowMs`.
 *
 * @param nowMs Reference instant in epoch milliseconds; defaults to the current time.
 */
function pruneExpiredTranscriptionSessions(nowMs = Date.now()): void {
  for (const [, candidate] of transcriptionSessions) {
    const expired = candidate.expiresAtMs < nowMs;
    if (expired) {
      closeTranscriptionSession(candidate, "completed");
    }
  }
}
|
||||
|
||||
/** Returns how many registered relay sessions are owned by `connId`. */
function countTranscriptionSessionsForConn(connId: string): number {
  let owned = 0;
  for (const [, candidate] of transcriptionSessions) {
    owned += candidate.connId === connId ? 1 : 0;
  }
  return owned;
}
|
||||
|
||||
/**
 * Guards session creation: prunes expired sessions, then rejects the request
 * if either the global cap or the per-connection cap is already reached.
 *
 * @throws Error when a limit is hit (the global limit is checked first).
 */
function enforceTranscriptionSessionLimits(connId: string): void {
  pruneExpiredTranscriptionSessions();

  const globalCapReached = transcriptionSessions.size >= MAX_TRANSCRIPTION_SESSIONS_GLOBAL;
  if (globalCapReached) {
    throw new Error("Too many active transcription Talk sessions");
  }

  const ownedByConn = countTranscriptionSessionsForConn(connId);
  if (ownedByConn >= MAX_TRANSCRIPTION_SESSIONS_PER_CONN) {
    throw new Error("Too many active transcription Talk sessions for this connection");
  }
}
|
||||
|
||||
/**
 * Creates a transcription-only Talk relay session for one gateway connection.
 *
 * Enforces the session limits, creates the provider session with callbacks
 * that translate provider activity into relay payloads (each annotated with a
 * Talk event), registers the session with a TTL cleanup timer, and starts the
 * provider connection in the background. A `ready` payload is emitted once
 * `connect()` resolves; a rejected `connect()` emits an `error` payload and
 * closes the session.
 *
 * @returns The session descriptor handed back to the client; `expiresAt` is in epoch seconds.
 * @throws Error when a session limit is reached.
 */
export function createTalkTranscriptionRelaySession(
  params: CreateTalkTranscriptionRelaySessionParams,
): TalkTranscriptionRelaySessionResult {
  const { context, connId, provider, providerConfig } = params;
  enforceTranscriptionSessionLimits(connId);

  const transcriptionSessionId = randomUUID();
  const expiresAtMs = Date.now() + TRANSCRIPTION_SESSION_TTL_MS;
  const talk = createTalkSessionController({
    sessionId: transcriptionSessionId,
    mode: "transcription",
    transport: "gateway-relay",
    brain: "none",
    provider: provider.id,
  });

  // Assigned after the provider session exists; the callbacks below close over it.
  let relay: TranscriptionRelaySession | undefined;

  const emit = (event: TalkTranscriptionRelayEventPayload, talkEvent?: TalkEventInput): void => {
    const outgoing: TalkTranscriptionRelayEvent = talkEvent
      ? { ...event, talkEvent: talk.emit(talkEvent) }
      : { ...event };
    broadcastToOwner(context, connId, outgoing);
  };

  const ensureTurnId = (): string => {
    // Until the relay record is assigned, callbacks fall back to the literal "turn-1".
    if (!relay) {
      return "turn-1";
    }
    return ensureTranscriptionTurn(relay);
  };

  const sttSession = provider.createSession({
    providerConfig,
    onSpeechStart: () => {
      ensureTurnId();
    },
    onPartial: (text) => {
      const turnId = ensureTurnId();
      emit(
        { transcriptionSessionId, type: "partial", text },
        { type: "transcript.delta", turnId, payload: { text } },
      );
    },
    onTranscript: (text) => {
      const turnId = ensureTurnId();
      emit(
        { transcriptionSessionId, type: "transcript", text, final: true },
        { type: "transcript.done", turnId, payload: { text }, final: true },
      );
      if (!relay) {
        return;
      }
      // A final transcript ends the turn; the turn-ended Talk event rides on an
      // empty final `transcript` payload.
      const ended = relay.talk.endTurn({ turnId, payload: {} });
      if (!ended.ok) {
        return;
      }
      broadcastToOwner(relay.context, relay.connId, {
        transcriptionSessionId,
        type: "transcript",
        text: "",
        final: true,
        talkEvent: ended.event,
      });
    },
    onError: (error) => {
      const { message } = error;
      emit(
        { transcriptionSessionId, type: "error", message },
        { type: "session.error", payload: { message }, final: true },
      );
      if (relay) {
        closeTranscriptionSession(relay, "error");
      }
    },
  });

  const cleanupTimer = setTimeout(() => {
    const active = transcriptionSessions.get(transcriptionSessionId);
    if (active) {
      closeTranscriptionSession(active, "completed");
    }
  }, TRANSCRIPTION_SESSION_TTL_MS);
  cleanupTimer.unref?.();

  relay = {
    id: transcriptionSessionId,
    connId,
    context,
    provider,
    sttSession,
    talk,
    expiresAtMs,
    cleanupTimer,
    closed: false,
  };
  transcriptionSessions.set(transcriptionSessionId, relay);

  sttSession
    .connect()
    .then(() => {
      emit({ transcriptionSessionId, type: "ready" }, { type: "session.ready", payload: null });
    })
    .catch((error: unknown) => {
      const message = error instanceof Error ? error.message : String(error);
      emit(
        { transcriptionSessionId, type: "error", message },
        { type: "session.error", payload: { message }, final: true },
      );
      const active = transcriptionSessions.get(transcriptionSessionId);
      if (active) {
        closeTranscriptionSession(active, "error");
      }
    });

  return {
    provider: provider.id,
    mode: "transcription",
    transport: "gateway-relay",
    transcriptionSessionId,
    audio: {
      inputEncoding: "pcm16",
      inputSampleRateHz: 24000,
    },
    expiresAt: Math.floor(expiresAtMs / 1000),
  };
}
|
||||
|
||||
/**
 * Looks up a live relay session and verifies that the caller owns it.
 *
 * A missing session, a session owned by another connection, and an expired
 * session all fail with the same error message, so a caller cannot tell which
 * case applied. Only an expired session owned by the caller is closed here; a
 * request from a non-owner connection never touches the owner's session.
 *
 * @param transcriptionSessionId Relay session id issued at creation time.
 * @param connId Connection making the request; must match the session owner.
 * @returns The live session owned by `connId`.
 * @throws Error when the session is unknown, owned by another connection, or expired.
 */
function getTranscriptionSession(
  transcriptionSessionId: string,
  connId: string,
): TranscriptionRelaySession {
  const session = transcriptionSessions.get(transcriptionSessionId);
  if (!session || session.connId !== connId) {
    // Do not close on an ownership mismatch: that would let any connection
    // that presents this id terminate another connection's session.
    throw new Error("Unknown transcription Talk session");
  }
  if (Date.now() > session.expiresAtMs) {
    closeTranscriptionSession(session, "completed");
    throw new Error("Unknown transcription Talk session");
  }
  return session;
}
|
||||
|
||||
/**
 * Forwards one base64-encoded audio frame from the client to the provider
 * session and reports it to the owner as an `inputAudio` payload.
 *
 * @throws Error when the base64 string exceeds `MAX_AUDIO_BASE64_BYTES`
 *   characters, or when the session lookup fails.
 */
export function sendTalkTranscriptionRelayAudio(params: {
  transcriptionSessionId: string;
  connId: string;
  audioBase64: string;
}): void {
  const { transcriptionSessionId, connId, audioBase64 } = params;
  // The size check runs on the encoded string, before any lookup or decoding.
  if (audioBase64.length > MAX_AUDIO_BASE64_BYTES) {
    throw new Error("Transcription Talk audio frame is too large");
  }

  const session = getTranscriptionSession(transcriptionSessionId, connId);
  const audio = Buffer.from(audioBase64, "base64");
  const turnId = ensureTranscriptionTurn(session);
  session.sttSession.sendAudio(audio);

  const byteLength = audio.byteLength;
  const talkEvent = session.talk.emit({
    type: "input.audio.delta",
    turnId,
    payload: { byteLength },
  });
  broadcastToOwner(session.context, session.connId, {
    transcriptionSessionId: session.id,
    type: "inputAudio",
    byteLength,
    talkEvent,
  });
}
|
||||
|
||||
/**
 * Stops a relay session at the client's request.
 *
 * If a turn is active, a final `input.audio.committed` Talk event is emitted
 * first (carried on an empty final `transcript` payload); the session is then
 * closed with reason `"completed"`.
 *
 * @throws Error when the session lookup fails.
 */
export function stopTalkTranscriptionRelaySession(params: {
  transcriptionSessionId: string;
  connId: string;
}): void {
  const session = getTranscriptionSession(params.transcriptionSessionId, params.connId);

  const activeTurnId = session.talk.activeTurnId;
  if (activeTurnId) {
    const committed = session.talk.emit({
      type: "input.audio.committed",
      turnId: activeTurnId,
      payload: {},
      final: true,
    });
    broadcastToOwner(session.context, session.connId, {
      transcriptionSessionId: session.id,
      type: "transcript",
      text: "",
      final: true,
      talkEvent: committed,
    });
  }

  closeTranscriptionSession(session, "completed");
}
|
||||
|
||||
/**
 * Cancels the session's current turn and then closes the session.
 *
 * The cancellation Talk event (when the controller accepts the cancel) is
 * carried on an empty final `transcript` payload. `reason` defaults to
 * `"client-cancelled"`.
 *
 * @throws Error when the session lookup fails.
 */
export function cancelTalkTranscriptionRelayTurn(params: {
  transcriptionSessionId: string;
  connId: string;
  reason?: string;
}): void {
  const session = getTranscriptionSession(params.transcriptionSessionId, params.connId);
  const turnId = ensureTranscriptionTurn(session);
  const reason = params.reason ?? "client-cancelled";

  const cancelled = session.talk.cancelTurn({ turnId, payload: { reason } });
  const talkEvent = cancelled.ok ? cancelled.event : undefined;
  broadcastToOwner(session.context, session.connId, {
    transcriptionSessionId: session.id,
    type: "transcript",
    text: "",
    final: true,
    talkEvent,
  });

  closeTranscriptionSession(session, "completed");
}
|
||||
|
||||
/**
 * Test-only reset hook: cancels each session's TTL timer, closes its provider
 * session, and empties the registry. No relay events are broadcast.
 */
export function clearTalkTranscriptionRelaySessionsForTest(): void {
  for (const [, leftover] of transcriptionSessions) {
    clearTimeout(leftover.cleanupTimer);
    leftover.sttSession.close();
  }
  transcriptionSessions.clear();
}
|
||||
@@ -55,10 +55,6 @@ export function buildInstructions(config: VoiceClawSessionConfigEvent): string {
|
||||
parts.push(deviceContext);
|
||||
}
|
||||
|
||||
if (config.instructionsOverride?.trim()) {
|
||||
parts.push(`## About The User\n${config.instructionsOverride.trim()}`);
|
||||
}
|
||||
|
||||
if (config.conversationHistory && config.conversationHistory.length > 0) {
|
||||
parts.push(buildConversationHistory(config.conversationHistory));
|
||||
}
|
||||
|
||||
@@ -3,6 +3,8 @@ import type { IncomingMessage } from "node:http";
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
import WebSocket from "ws";
|
||||
import type { OpenClawConfig } from "../../config/types.openclaw.js";
|
||||
import type { TalkEvent } from "../../realtime-voice/talk-events.js";
|
||||
import { createTalkSessionController } from "../../realtime-voice/talk-session-controller.js";
|
||||
import type { ResolvedGatewayAuth } from "../auth.js";
|
||||
import { resolveRealtimeSenderIsOwner, VoiceClawRealtimeSession } from "./session.js";
|
||||
import type {
|
||||
@@ -60,6 +62,45 @@ function makeAdapter(): VoiceClawRealtimeAdapter {
|
||||
}
|
||||
|
||||
describe("VoiceClawRealtimeSession lifecycle", () => {
|
||||
it("rejects request-time instructionsOverride", async () => {
|
||||
const ws = new FakeWebSocket();
|
||||
const adapter = makeAdapter();
|
||||
const releasePreauthBudget = vi.fn();
|
||||
const session = new VoiceClawRealtimeSession({
|
||||
ws: ws as unknown as WebSocket,
|
||||
req: {} as IncomingMessage,
|
||||
auth: { mode: "none" } as ResolvedGatewayAuth,
|
||||
config: {} as OpenClawConfig,
|
||||
trustedProxies: [],
|
||||
allowRealIpFallback: false,
|
||||
releasePreauthBudget,
|
||||
adapterFactory: () => adapter,
|
||||
});
|
||||
|
||||
session.attach();
|
||||
ws.emit(
|
||||
"message",
|
||||
JSON.stringify({
|
||||
type: "session.config",
|
||||
brainAgent: "none",
|
||||
instructionsOverride: "custom request-time instructions",
|
||||
}),
|
||||
);
|
||||
await new Promise((resolve) => setImmediate(resolve));
|
||||
|
||||
expect(ws.sent).toEqual([
|
||||
{
|
||||
type: "error",
|
||||
message: "request-time instructionsOverride is not supported",
|
||||
code: 400,
|
||||
},
|
||||
]);
|
||||
expect(ws.closeCode).toBe(1008);
|
||||
expect(ws.closeReason).toBe("unsupported instruction override");
|
||||
expect(adapter.connect).not.toHaveBeenCalled();
|
||||
expect(releasePreauthBudget).toHaveBeenCalledOnce();
|
||||
});
|
||||
|
||||
it("sends session summary before closing after terminal adapter errors", () => {
|
||||
const ws = new FakeWebSocket();
|
||||
const adapter = makeAdapter();
|
||||
@@ -102,4 +143,199 @@ describe("VoiceClawRealtimeSession lifecycle", () => {
|
||||
expect(adapter.disconnect).toHaveBeenCalledOnce();
|
||||
expect(releasePreauthBudget).toHaveBeenCalledOnce();
|
||||
});
|
||||
|
||||
it("adds common Talk event envelopes to configured server events", () => {
|
||||
const ws = new FakeWebSocket();
|
||||
const adapter = makeAdapter();
|
||||
const session = new VoiceClawRealtimeSession({
|
||||
ws: ws as unknown as WebSocket,
|
||||
req: {} as IncomingMessage,
|
||||
auth: { mode: "none" } as ResolvedGatewayAuth,
|
||||
config: {} as OpenClawConfig,
|
||||
trustedProxies: [],
|
||||
allowRealIpFallback: false,
|
||||
releasePreauthBudget: vi.fn(),
|
||||
adapterFactory: () => adapter,
|
||||
});
|
||||
const internals = session as unknown as {
|
||||
config: VoiceClawSessionConfigEvent;
|
||||
talk: unknown;
|
||||
handleAdapterEvent(event: VoiceClawServerEvent): void;
|
||||
};
|
||||
internals.config = { type: "session.config", brainAgent: "none", provider: "gemini" };
|
||||
internals.talk = createTalkSessionController({
|
||||
sessionId: "voice-session",
|
||||
mode: "realtime",
|
||||
transport: "gateway-relay",
|
||||
brain: "direct-tools",
|
||||
provider: "gemini",
|
||||
});
|
||||
|
||||
internals.handleAdapterEvent({
|
||||
type: "transcript.done",
|
||||
role: "assistant",
|
||||
text: "hello",
|
||||
});
|
||||
|
||||
expect(ws.sent).toEqual([
|
||||
expect.objectContaining({
|
||||
type: "transcript.done",
|
||||
talkEvent: expect.objectContaining({
|
||||
type: "output.text.done",
|
||||
sessionId: "voice-session",
|
||||
mode: "realtime",
|
||||
transport: "gateway-relay",
|
||||
brain: "direct-tools",
|
||||
provider: "gemini",
|
||||
final: true,
|
||||
payload: { role: "assistant", text: "hello" },
|
||||
}),
|
||||
}),
|
||||
]);
|
||||
});
|
||||
|
||||
it("keeps streamed output audio out of common Talk event payloads", () => {
|
||||
const ws = new FakeWebSocket();
|
||||
const adapter = makeAdapter();
|
||||
const session = new VoiceClawRealtimeSession({
|
||||
ws: ws as unknown as WebSocket,
|
||||
req: {} as IncomingMessage,
|
||||
auth: { mode: "none" } as ResolvedGatewayAuth,
|
||||
config: {} as OpenClawConfig,
|
||||
trustedProxies: [],
|
||||
allowRealIpFallback: false,
|
||||
releasePreauthBudget: vi.fn(),
|
||||
adapterFactory: () => adapter,
|
||||
});
|
||||
const internals = session as unknown as {
|
||||
config: VoiceClawSessionConfigEvent;
|
||||
talk: unknown;
|
||||
handleAdapterEvent(event: VoiceClawServerEvent): void;
|
||||
};
|
||||
const audioData = Buffer.from("hello").toString("base64");
|
||||
internals.config = { type: "session.config", brainAgent: "none", provider: "gemini" };
|
||||
internals.talk = createTalkSessionController({
|
||||
sessionId: "voice-session",
|
||||
mode: "realtime",
|
||||
transport: "gateway-relay",
|
||||
brain: "direct-tools",
|
||||
provider: "gemini",
|
||||
});
|
||||
|
||||
internals.handleAdapterEvent({
|
||||
type: "audio.delta",
|
||||
data: audioData,
|
||||
});
|
||||
|
||||
expect(ws.sent).toEqual([
|
||||
expect.objectContaining({
|
||||
type: "audio.delta",
|
||||
data: audioData,
|
||||
talkEvent: expect.objectContaining({
|
||||
type: "output.audio.delta",
|
||||
payload: { byteLength: 5 },
|
||||
}),
|
||||
}),
|
||||
]);
|
||||
expect(
|
||||
(ws.sent[0] as { talkEvent?: { payload?: Record<string, unknown> } }).talkEvent?.payload,
|
||||
).not.toHaveProperty("data");
|
||||
});
|
||||
|
||||
it("emits common Talk events for client audio, video, cancellation, and tool results", async () => {
|
||||
const ws = new FakeWebSocket();
|
||||
const adapter = makeAdapter();
|
||||
const talkEvents: TalkEvent[] = [];
|
||||
const session = new VoiceClawRealtimeSession({
|
||||
ws: ws as unknown as WebSocket,
|
||||
req: {} as IncomingMessage,
|
||||
auth: { mode: "none" } as ResolvedGatewayAuth,
|
||||
config: {} as OpenClawConfig,
|
||||
trustedProxies: [],
|
||||
allowRealIpFallback: false,
|
||||
releasePreauthBudget: vi.fn(),
|
||||
adapterFactory: () => adapter,
|
||||
onTalkEvent: (event) => talkEvents.push(event),
|
||||
});
|
||||
const internals = session as unknown as {
|
||||
adapter: VoiceClawRealtimeAdapter;
|
||||
config: VoiceClawSessionConfigEvent;
|
||||
talk: ReturnType<typeof createTalkSessionController>;
|
||||
handleRawMessage(raw: string): Promise<void>;
|
||||
};
|
||||
internals.adapter = adapter;
|
||||
internals.config = { type: "session.config", brainAgent: "none", provider: "gemini" };
|
||||
internals.talk = createTalkSessionController({
|
||||
sessionId: "voice-session",
|
||||
mode: "realtime",
|
||||
transport: "gateway-relay",
|
||||
brain: "direct-tools",
|
||||
provider: "gemini",
|
||||
});
|
||||
internals.talk.startTurn({ turnId: "turn-client" });
|
||||
|
||||
await internals.handleRawMessage(
|
||||
JSON.stringify({ type: "audio.append", data: Buffer.from("hello").toString("base64") }),
|
||||
);
|
||||
await internals.handleRawMessage(JSON.stringify({ type: "audio.commit" }));
|
||||
await internals.handleRawMessage(
|
||||
JSON.stringify({
|
||||
type: "frame.append",
|
||||
data: Buffer.from("frame").toString("base64"),
|
||||
mimeType: "image/jpeg",
|
||||
}),
|
||||
);
|
||||
await internals.handleRawMessage(JSON.stringify({ type: "response.cancel" }));
|
||||
await internals.handleRawMessage(
|
||||
JSON.stringify({ type: "tool.result", callId: "call-1", output: "done" }),
|
||||
);
|
||||
|
||||
expect(adapter.sendAudio).toHaveBeenCalledWith(Buffer.from("hello").toString("base64"));
|
||||
expect(adapter.commitAudio).toHaveBeenCalledOnce();
|
||||
expect(adapter.sendFrame).toHaveBeenCalledWith(
|
||||
Buffer.from("frame").toString("base64"),
|
||||
"image/jpeg",
|
||||
);
|
||||
expect(adapter.cancelResponse).toHaveBeenCalledOnce();
|
||||
expect(adapter.sendToolResult).toHaveBeenCalledWith("call-1", "done");
|
||||
expect(talkEvents.map((event) => event.type)).toEqual([
|
||||
"input.audio.delta",
|
||||
"input.audio.committed",
|
||||
"health.changed",
|
||||
"turn.cancelled",
|
||||
"turn.started",
|
||||
"tool.result",
|
||||
]);
|
||||
expect(talkEvents).toEqual([
|
||||
expect.objectContaining({
|
||||
type: "input.audio.delta",
|
||||
turnId: "turn-client",
|
||||
payload: { byteLength: 5 },
|
||||
}),
|
||||
expect.objectContaining({
|
||||
type: "input.audio.committed",
|
||||
turnId: "turn-client",
|
||||
final: true,
|
||||
}),
|
||||
expect.objectContaining({
|
||||
type: "health.changed",
|
||||
payload: { inputVideoFrame: true, mimeType: "image/jpeg", byteLength: 5 },
|
||||
}),
|
||||
expect.objectContaining({
|
||||
type: "turn.cancelled",
|
||||
payload: { reason: "client-cancelled" },
|
||||
final: true,
|
||||
}),
|
||||
expect.objectContaining({
|
||||
type: "turn.started",
|
||||
payload: { source: "implicit" },
|
||||
}),
|
||||
expect.objectContaining({
|
||||
type: "tool.result",
|
||||
callId: "call-1",
|
||||
payload: { output: "done" },
|
||||
final: true,
|
||||
}),
|
||||
]);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -3,6 +3,12 @@ import type { IncomingMessage } from "node:http";
|
||||
import WebSocket, { type RawData } from "ws";
|
||||
import type { OpenClawConfig } from "../../config/types.openclaw.js";
|
||||
import { createSubsystemLogger } from "../../logging/subsystem.js";
|
||||
import {
|
||||
type TalkEvent,
|
||||
type TalkEventInput,
|
||||
type TalkSessionController,
|
||||
createTalkSessionController,
|
||||
} from "../../realtime-voice/talk-session-controller.js";
|
||||
import type { AuthRateLimiter } from "../auth-rate-limit.js";
|
||||
import {
|
||||
authorizeHttpGatewayConnect,
|
||||
@@ -36,6 +42,7 @@ type VoiceClawRealtimeSessionOptions = {
|
||||
rateLimiter?: AuthRateLimiter;
|
||||
releasePreauthBudget: () => void;
|
||||
adapterFactory?: () => VoiceClawRealtimeAdapter;
|
||||
onTalkEvent?: (event: TalkEvent) => void;
|
||||
};
|
||||
|
||||
export class VoiceClawRealtimeSession {
|
||||
@@ -50,8 +57,10 @@ export class VoiceClawRealtimeSession {
|
||||
private readonly rateLimiter: AuthRateLimiter | undefined;
|
||||
private readonly releasePreauthBudget: () => void;
|
||||
private readonly adapterFactory: () => VoiceClawRealtimeAdapter;
|
||||
private readonly onTalkEvent: ((event: TalkEvent) => void) | undefined;
|
||||
private adapter: VoiceClawRealtimeAdapter | null = null;
|
||||
private toolRuntime: VoiceClawRealtimeToolRuntime | null = null;
|
||||
private talk: TalkSessionController | null = null;
|
||||
private config: VoiceClawSessionConfigEvent | null = null;
|
||||
private handshakeTimer: ReturnType<typeof setTimeout> | null = null;
|
||||
private closed = false;
|
||||
@@ -67,6 +76,7 @@ export class VoiceClawRealtimeSession {
|
||||
this.rateLimiter = opts.rateLimiter;
|
||||
this.releasePreauthBudget = once(opts.releasePreauthBudget);
|
||||
this.adapterFactory = opts.adapterFactory ?? (() => new VoiceClawGeminiLiveAdapter());
|
||||
this.onTalkEvent = opts.onTalkEvent;
|
||||
}
|
||||
|
||||
attach(): void {
|
||||
@@ -113,24 +123,66 @@ export class VoiceClawRealtimeSession {
|
||||
}
|
||||
|
||||
switch (event.type) {
|
||||
case "audio.append":
|
||||
case "audio.append": {
|
||||
const audioTurnId = this.ensureActiveTurnId();
|
||||
this.adapter?.sendAudio(event.data);
|
||||
this.emitTalkEvent({
|
||||
type: "input.audio.delta",
|
||||
payload: { byteLength: base64ByteLength(event.data) },
|
||||
turnId: audioTurnId,
|
||||
});
|
||||
break;
|
||||
case "audio.commit":
|
||||
}
|
||||
case "audio.commit": {
|
||||
const commitTurnId = this.ensureActiveTurnId();
|
||||
this.adapter?.commitAudio();
|
||||
this.emitTalkEvent({
|
||||
type: "input.audio.committed",
|
||||
payload: {},
|
||||
turnId: commitTurnId,
|
||||
final: true,
|
||||
});
|
||||
break;
|
||||
}
|
||||
case "frame.append":
|
||||
this.adapter?.sendFrame(event.data, event.mimeType);
|
||||
this.emitTalkEvent({
|
||||
type: "health.changed",
|
||||
payload: {
|
||||
inputVideoFrame: true,
|
||||
mimeType: event.mimeType,
|
||||
byteLength: base64ByteLength(event.data),
|
||||
},
|
||||
turnId: this.talk?.activeTurnId,
|
||||
});
|
||||
break;
|
||||
case "response.create":
|
||||
this.adapter?.createResponse();
|
||||
break;
|
||||
case "response.cancel":
|
||||
case "response.cancel": {
|
||||
const cancelTurnId = this.ensureActiveTurnId();
|
||||
this.adapter?.cancelResponse();
|
||||
const cancelled = this.talk?.cancelTurn({
|
||||
turnId: cancelTurnId,
|
||||
payload: { reason: "client-cancelled" },
|
||||
});
|
||||
if (cancelled?.ok) {
|
||||
this.onTalkEvent?.(cancelled.event);
|
||||
}
|
||||
break;
|
||||
case "tool.result":
|
||||
}
|
||||
case "tool.result": {
|
||||
const toolTurnId = this.ensureActiveTurnId();
|
||||
this.adapter?.sendToolResult(event.callId, event.output);
|
||||
this.emitTalkEvent({
|
||||
type: "tool.result",
|
||||
payload: { output: event.output },
|
||||
turnId: toolTurnId,
|
||||
callId: event.callId,
|
||||
final: true,
|
||||
});
|
||||
break;
|
||||
}
|
||||
case "session.config":
|
||||
this.send({ type: "error", message: "session already configured", code: 400 });
|
||||
break;
|
||||
@@ -144,6 +196,16 @@ export class VoiceClawRealtimeSession {
|
||||
this.configStarted = true;
|
||||
this.clearHandshakeTimer();
|
||||
|
||||
if (hasInstructionsOverride(config)) {
|
||||
this.send({
|
||||
type: "error",
|
||||
message: "request-time instructionsOverride is not supported",
|
||||
code: 400,
|
||||
});
|
||||
this.ws.close(1008, "unsupported instruction override");
|
||||
return;
|
||||
}
|
||||
|
||||
const authResult = await authorizeHttpGatewayConnect({
|
||||
auth: this.auth,
|
||||
connectAuth: config.apiKey ? { token: config.apiKey, password: config.apiKey } : null,
|
||||
@@ -190,6 +252,13 @@ export class VoiceClawRealtimeSession {
|
||||
voice: config.voice || "Zephyr",
|
||||
brainAgent: config.brainAgent ?? "enabled",
|
||||
};
|
||||
this.talk = createTalkSessionController({
|
||||
sessionId: this.id,
|
||||
mode: "realtime",
|
||||
transport: "gateway-relay",
|
||||
brain: this.config.brainAgent === "none" ? "none" : "direct-tools",
|
||||
provider: this.config.provider,
|
||||
});
|
||||
this.adapter = this.adapterFactory();
|
||||
|
||||
try {
|
||||
@@ -270,7 +339,134 @@ export class VoiceClawRealtimeSession {
|
||||
if (this.closed || this.ws.readyState !== WebSocket.OPEN) {
|
||||
return;
|
||||
}
|
||||
this.ws.send(JSON.stringify(event));
|
||||
this.ws.send(JSON.stringify(this.withTalkEvent(event)));
|
||||
}
|
||||
|
||||
/**
 * Decorates an outgoing VoiceClaw server event with its Talk event counterpart.
 *
 * The event is first mapped through `toTalkEventInput`. When that yields a
 * mapping and a Talk controller exists, the mapped input is emitted via
 * `emitTalkEvent` and the result is attached as `talkEvent` on a shallow copy.
 * Otherwise the original event object is returned untouched.
 *
 * @param event - Server event about to be sent to the client.
 * @returns The same event, or a copy carrying an additional `talkEvent` field.
 */
private withTalkEvent(
  event: VoiceClawServerEvent,
): VoiceClawServerEvent & { talkEvent?: TalkEvent } {
  const mapped = this.toTalkEventInput(event);
  if (mapped && this.talk) {
    const talkEvent = this.emitTalkEvent(mapped);
    return { ...event, talkEvent };
  }
  return event;
}
|
||||
|
||||
/**
 * Routes a Talk event input through the Talk session controller and notifies
 * the `onTalkEvent` listener about the resulting event, if any.
 *
 * Turn lifecycle inputs (`turn.started`, `turn.ended`, `turn.cancelled`) go
 * through the controller's dedicated turn methods; every other input is passed
 * to `emit`. An end/cancel whose result is not `ok` produces no event.
 *
 * @param input - Talk event input to emit.
 * @returns The emitted event, or `undefined` when there is no controller or
 *   the controller did not produce one.
 */
private emitTalkEvent(input: TalkEventInput): TalkEvent | undefined {
  const controller = this.talk;
  if (!controller) {
    return undefined;
  }

  let emitted: TalkEvent | undefined;
  switch (input.type) {
    case "turn.started": {
      const started = controller.startTurn({ turnId: input.turnId, payload: input.payload });
      emitted = started.event;
      break;
    }
    case "turn.ended": {
      const result = controller.endTurn({ turnId: input.turnId, payload: input.payload });
      emitted = result.ok ? result.event : undefined;
      break;
    }
    case "turn.cancelled": {
      const result = controller.cancelTurn({ turnId: input.turnId, payload: input.payload });
      emitted = result.ok ? result.event : undefined;
      break;
    }
    default:
      emitted = controller.emit(input);
      break;
  }

  if (emitted) {
    this.onTalkEvent?.(emitted);
  }
  return emitted;
}
|
||||
|
||||
/**
 * Returns the id of the turn that turn-scoped events should be attributed to.
 *
 * If the Talk controller reports an active turn, its id is reused. Otherwise a
 * fresh UUID is generated, an implicit turn (payload `{ source: "implicit" }`)
 * is started on the controller when one exists, and any resulting event is
 * forwarded to `onTalkEvent`.
 *
 * @returns The active turn id, or the newly generated one.
 */
private ensureActiveTurnId(): string {
  const current = this.talk?.activeTurnId;
  if (current) {
    return current;
  }

  const turnId = randomUUID();
  const started = this.talk?.startTurn({ turnId, payload: { source: "implicit" } });
  const startedEvent = started?.event;
  if (startedEvent) {
    this.onTalkEvent?.(startedEvent);
  }
  return turnId;
}
|
||||
|
||||
/**
 * Translates a VoiceClaw server event into the input for the corresponding
 * Talk event.
 *
 * Turn-scoped events (audio, transcripts, tool activity, turn end) resolve
 * their turn id through `ensureActiveTurnId`, which may start an implicit
 * turn. Session-level events carry no turn id.
 *
 * @param event - Server event being sent to the client.
 * @returns The Talk event input, or `null` when the event type has no mapping.
 */
private toTalkEventInput(event: VoiceClawServerEvent): TalkEventInput | null {
  // Deferred so that only the cases that need a turn id resolve (or start) one.
  const activeTurn = (): string => this.ensureActiveTurnId();

  switch (event.type) {
    case "session.ready":
      return { type: "session.ready", payload: { sessionId: event.sessionId } };

    case "audio.delta": {
      const byteLength = base64ByteLength(event.data);
      return { type: "output.audio.delta", payload: { byteLength }, turnId: activeTurn() };
    }

    case "transcript.delta": {
      const { role, text } = event;
      return {
        type: role === "assistant" ? "output.text.delta" : "transcript.delta",
        payload: { role, text },
        turnId: activeTurn(),
      };
    }

    case "transcript.done": {
      const { role, text } = event;
      return {
        type: role === "assistant" ? "output.text.done" : "transcript.done",
        payload: { role, text },
        turnId: activeTurn(),
        final: true,
      };
    }

    case "tool.call": {
      const { name, arguments: args, callId } = event;
      return {
        type: "tool.call",
        payload: { name, arguments: args },
        turnId: activeTurn(),
        callId,
      };
    }

    case "tool.progress": {
      const { summary, callId } = event;
      return { type: "tool.progress", payload: { summary }, turnId: activeTurn(), callId };
    }

    case "turn.started":
      // A started turn uses the provider's id when present, else a fresh one.
      return { type: "turn.started", payload: {}, turnId: event.turnId || randomUUID() };

    case "turn.ended":
      return { type: "turn.ended", payload: {}, turnId: activeTurn(), final: true };

    case "session.ended": {
      const { summary, durationSec, turnCount } = event;
      return {
        type: "session.closed",
        payload: { summary, durationSec, turnCount },
        final: true,
      };
    }

    case "session.rotating":
      return { type: "health.changed", payload: { status: "rotating" } };

    case "session.rotated":
      return { type: "session.replaced", payload: { sessionId: event.sessionId } };

    case "usage.metrics":
      return { type: "usage.metrics", payload: event };

    case "latency.metrics":
      return { type: "latency.metrics", payload: event };

    case "tool.cancelled":
      // Cancelled tool calls are reported as a final tool error.
      return {
        type: "tool.error",
        payload: { callIds: event.callIds, cancelled: true },
        turnId: activeTurn(),
        final: true,
      };

    case "error": {
      const { message, code } = event;
      return { type: "session.error", payload: { message, code }, final: true };
    }

    default:
      return null;
  }
}
|
||||
|
||||
private clearHandshakeTimer(): void {
|
||||
@@ -330,6 +526,11 @@ function parseClientEvent(raw: RawData): VoiceClawClientEvent | null {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Reports whether a session config carries a non-blank `instructionsOverride`.
 *
 * The field is read through a cast and checked at runtime, so non-string
 * values and whitespace-only strings both count as "no override".
 *
 * @param config - Session config event received from the client.
 * @returns `true` only for a string with at least one non-whitespace character.
 */
function hasInstructionsOverride(config: VoiceClawSessionConfigEvent): boolean {
  const { instructionsOverride } = config as { instructionsOverride?: unknown };
  if (typeof instructionsOverride !== "string") {
    return false;
  }
  return instructionsOverride.trim() !== "";
}
|
||||
|
||||
function sanitizeSessionKey(value: string | undefined): string | null {
|
||||
const trimmed = value?.trim();
|
||||
if (!trimmed) {
|
||||
@@ -353,6 +554,18 @@ function sanitizeErrorMessage(message: string): string {
|
||||
return message.replace(/([?&]key=)[^&\s]+/g, "$1***");
|
||||
}
|
||||
|
||||
/**
 * Returns the number of bytes a base64 string decodes to.
 *
 * Surrounding whitespace is trimmed first; an empty result yields 0. If
 * decoding throws, the trimmed character count is returned instead.
 *
 * @param value - Base64-encoded payload.
 * @returns Decoded byte length, or the fallback described above.
 */
function base64ByteLength(value: string): number {
  const encoded = value.trim();
  if (encoded.length === 0) {
    return 0;
  }

  let size: number;
  try {
    const decoded = Buffer.from(encoded, "base64");
    size = decoded.byteLength;
  } catch {
    size = encoded.length;
  }
  return size;
}
|
||||
|
||||
function once(fn: () => void): () => void {
|
||||
let called = false;
|
||||
return () => {
|
||||
|
||||
@@ -23,7 +23,6 @@ export type VoiceClawSessionConfigEvent = {
|
||||
location?: string;
|
||||
};
|
||||
watchdog?: "enabled" | "disabled";
|
||||
instructionsOverride?: string;
|
||||
conversationHistory?: { role: "user" | "assistant"; text: string }[];
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user