From 4ac355babbeffdf133c46f77352829ad23e38eda Mon Sep 17 00:00:00 2001 From: Ayaan Zaidi Date: Fri, 20 Mar 2026 10:27:05 +0530 Subject: [PATCH] feat(gateway): add talk speak rpc --- src/gateway/method-scopes.ts | 1 + src/gateway/protocol/index.ts | 10 + src/gateway/protocol/schema/channels.ts | 29 ++ .../protocol/schema/protocol-schemas.ts | 4 + src/gateway/protocol/schema/types.ts | 2 + src/gateway/server-methods-list.ts | 1 + src/gateway/server-methods/talk.ts | 335 +++++++++++++++++- src/gateway/server.talk-config.test.ts | 67 +++- 8 files changed, 447 insertions(+), 2 deletions(-) diff --git a/src/gateway/method-scopes.ts b/src/gateway/method-scopes.ts index c31ff30db7b..f3a969301bf 100644 --- a/src/gateway/method-scopes.ts +++ b/src/gateway/method-scopes.ts @@ -98,6 +98,7 @@ const METHOD_SCOPE_GROUPS: Record = { "agent.wait", "wake", "talk.mode", + "talk.speak", "tts.enable", "tts.disable", "tts.convert", diff --git a/src/gateway/protocol/index.ts b/src/gateway/protocol/index.ts index 408e3239cc1..408074d44e4 100644 --- a/src/gateway/protocol/index.ts +++ b/src/gateway/protocol/index.ts @@ -48,6 +48,10 @@ import { TalkConfigParamsSchema, type TalkConfigResult, TalkConfigResultSchema, + type TalkSpeakParams, + TalkSpeakParamsSchema, + type TalkSpeakResult, + TalkSpeakResultSchema, type ChannelsStatusParams, ChannelsStatusParamsSchema, type ChannelsStatusResult, @@ -375,6 +379,8 @@ export const validateWizardStatusParams = ajv.compile(Wizard export const validateTalkModeParams = ajv.compile(TalkModeParamsSchema); export const validateTalkConfigParams = ajv.compile(TalkConfigParamsSchema); export const validateTalkConfigResult = ajv.compile(TalkConfigResultSchema); +export const validateTalkSpeakParams = ajv.compile(TalkSpeakParamsSchema); +export const validateTalkSpeakResult = ajv.compile(TalkSpeakResultSchema); export const validateChannelsStatusParams = ajv.compile( ChannelsStatusParamsSchema, ); @@ -540,6 +546,8 @@ export { WizardStatusResultSchema, TalkConfigParamsSchema, TalkConfigResultSchema, + TalkSpeakParamsSchema, + TalkSpeakResultSchema, ChannelsStatusParamsSchema, ChannelsStatusResultSchema, ChannelsLogoutParamsSchema, @@ -629,6 +637,8 @@ export type { WizardStatusResult, TalkConfigParams, TalkConfigResult, + TalkSpeakParams, + TalkSpeakResult, TalkModeParams, ChannelsStatusParams, ChannelsStatusResult, diff --git a/src/gateway/protocol/schema/channels.ts b/src/gateway/protocol/schema/channels.ts index 041318897ac..923432c7ac8 100644 --- a/src/gateway/protocol/schema/channels.ts +++ b/src/gateway/protocol/schema/channels.ts @@ -16,6 +16,23 @@ export const TalkConfigParamsSchema = Type.Object( { additionalProperties: false }, ); +export const TalkSpeakParamsSchema = Type.Object( + { + text: NonEmptyString, + voiceId: Type.Optional(Type.String()), + modelId: Type.Optional(Type.String()), + speed: Type.Optional(Type.Number()), + stability: Type.Optional(Type.Number()), + similarity: Type.Optional(Type.Number()), + style: Type.Optional(Type.Number()), + speakerBoost: Type.Optional(Type.Boolean()), + seed: Type.Optional(Type.Integer({ minimum: 0 })), + normalize: Type.Optional(Type.String()), + language: Type.Optional(Type.String()), + }, + { additionalProperties: false }, +); + const talkProviderFieldSchemas = { voiceId: Type.Optional(Type.String()), voiceAliases: Type.Optional(Type.Record(Type.String(), Type.String())), @@ -85,6 +102,18 @@ export const TalkConfigResultSchema = Type.Object( { additionalProperties: false }, ); +export const TalkSpeakResultSchema = Type.Object( + { + audioBase64: NonEmptyString, + provider: NonEmptyString, + outputFormat: Type.Optional(Type.String()), + voiceCompatible: Type.Optional(Type.Boolean()), + mimeType: Type.Optional(Type.String()), + fileExtension: Type.Optional(Type.String()), + }, + { additionalProperties: false }, +); + export const ChannelsStatusParamsSchema = Type.Object( { probe: Type.Optional(Type.Boolean()), diff --git a/src/gateway/protocol/schema/protocol-schemas.ts b/src/gateway/protocol/schema/protocol-schemas.ts index 60636e3eb5f..cf14fc44610 100644 --- a/src/gateway/protocol/schema/protocol-schemas.ts +++ b/src/gateway/protocol/schema/protocol-schemas.ts @@ -44,6 +44,8 @@ import { ChannelsLogoutParamsSchema, TalkConfigParamsSchema, TalkConfigResultSchema, + TalkSpeakParamsSchema, + TalkSpeakResultSchema, ChannelsStatusParamsSchema, ChannelsStatusResultSchema, TalkModeParamsSchema, @@ -238,6 +240,8 @@ export const ProtocolSchemas = { TalkModeParams: TalkModeParamsSchema, TalkConfigParams: TalkConfigParamsSchema, TalkConfigResult: TalkConfigResultSchema, + TalkSpeakParams: TalkSpeakParamsSchema, + TalkSpeakResult: TalkSpeakResultSchema, ChannelsStatusParams: ChannelsStatusParamsSchema, ChannelsStatusResult: ChannelsStatusResultSchema, ChannelsLogoutParams: ChannelsLogoutParamsSchema, diff --git a/src/gateway/protocol/schema/types.ts b/src/gateway/protocol/schema/types.ts index 58ddb142cd5..d74c08ad10b 100644 --- a/src/gateway/protocol/schema/types.ts +++ b/src/gateway/protocol/schema/types.ts @@ -70,6 +70,8 @@ export type WizardStatusResult = SchemaType<"WizardStatusResult">; export type TalkModeParams = SchemaType<"TalkModeParams">; export type TalkConfigParams = SchemaType<"TalkConfigParams">; export type TalkConfigResult = SchemaType<"TalkConfigResult">; +export type TalkSpeakParams = SchemaType<"TalkSpeakParams">; +export type TalkSpeakResult = SchemaType<"TalkSpeakResult">; export type ChannelsStatusParams = SchemaType<"ChannelsStatusParams">; export type ChannelsStatusResult = SchemaType<"ChannelsStatusResult">; export type ChannelsLogoutParams = SchemaType<"ChannelsLogoutParams">; diff --git a/src/gateway/server-methods-list.ts b/src/gateway/server-methods-list.ts index b4de49f1198..e930f8b0517 100644 --- a/src/gateway/server-methods-list.ts +++ b/src/gateway/server-methods-list.ts @@ -34,6 +34,7 @@ const BASE_METHODS = [ "wizard.cancel", "wizard.status", "talk.config", + "talk.speak", "talk.mode", "models.list", "tools.catalog", diff --git a/src/gateway/server-methods/talk.ts b/src/gateway/server-methods/talk.ts index 693f3447537..33cb6d7f116 100644 --- a/src/gateway/server-methods/talk.ts +++ b/src/gateway/server-methods/talk.ts @@ -1,23 +1,297 @@ import { readConfigFileSnapshot } from "../../config/config.js"; import { redactConfigObject } from "../../config/redact-snapshot.js"; -import { buildTalkConfigResponse } from "../../config/talk.js"; +import { buildTalkConfigResponse, resolveActiveTalkProviderConfig } from "../../config/talk.js"; +import type { TalkProviderConfig } from "../../config/types.gateway.js"; +import type { OpenClawConfig, TtsConfig } from "../../config/types.js"; +import { normalizeSpeechProviderId } from "../../tts/provider-registry.js"; +import { synthesizeSpeech, type TtsDirectiveOverrides } from "../../tts/tts.js"; import { ErrorCodes, errorShape, formatValidationErrors, validateTalkConfigParams, validateTalkModeParams, + validateTalkSpeakParams, } from "../protocol/index.js"; +import { formatForLog } from "../ws-log.js"; import type { GatewayRequestHandlers } from "./types.js"; const ADMIN_SCOPE = "operator.admin"; const TALK_SECRETS_SCOPE = "operator.talk.secrets"; +type ElevenLabsVoiceSettings = NonNullable["voiceSettings"]>; function canReadTalkSecrets(client: { connect?: { scopes?: string[] } } | null): boolean { const scopes = Array.isArray(client?.connect?.scopes) ? client.connect.scopes : []; return scopes.includes(ADMIN_SCOPE) || scopes.includes(TALK_SECRETS_SCOPE); } +function trimString(value: unknown): string | undefined { + if (typeof value !== "string") { + return undefined; + } + const trimmed = value.trim(); + return trimmed.length > 0 ? trimmed : undefined; +} + +function finiteNumber(value: unknown): number | undefined { + return typeof value === "number" && Number.isFinite(value) ? value : undefined; +} + +function optionalBoolean(value: unknown): boolean | undefined { + return typeof value === "boolean" ? value : undefined; +} + +function plainObject(value: unknown): Record | undefined { + return typeof value === "object" && value !== null && !Array.isArray(value) + ? (value as Record) + : undefined; +} + +function normalizeTextNormalization(value: unknown): "auto" | "on" | "off" | undefined { + const normalized = trimString(value)?.toLowerCase(); + return normalized === "auto" || normalized === "on" || normalized === "off" + ? normalized + : undefined; +} + +function normalizeAliasKey(value: string): string { + return value.trim().toLowerCase(); +} + +function resolveTalkVoiceId( + providerConfig: TalkProviderConfig, + requested: string | undefined, +): string | undefined { + if (!requested) { + return undefined; + } + const aliases = providerConfig.voiceAliases; + if (!aliases) { + return requested; + } + return aliases[normalizeAliasKey(requested)] ?? requested; +} + +function readTalkVoiceSettings( + providerConfig: TalkProviderConfig, +): ElevenLabsVoiceSettings | undefined { + const source = plainObject(providerConfig.voiceSettings); + if (!source) { + return undefined; + } + const stability = finiteNumber(source.stability); + const similarityBoost = finiteNumber(source.similarityBoost); + const style = finiteNumber(source.style); + const useSpeakerBoost = optionalBoolean(source.useSpeakerBoost); + const speed = finiteNumber(source.speed); + const voiceSettings = { + ...(stability == null ? {} : { stability }), + ...(similarityBoost == null ? {} : { similarityBoost }), + ...(style == null ? {} : { style }), + ...(useSpeakerBoost == null ? {} : { useSpeakerBoost }), + ...(speed == null ? {} : { speed }), + }; + return Object.keys(voiceSettings).length > 0 ? voiceSettings : undefined; +} + +function buildTalkTtsConfig( + config: OpenClawConfig, +): + | { cfg: OpenClawConfig; provider: string; providerConfig: TalkProviderConfig } + | { error: string } { + const resolved = resolveActiveTalkProviderConfig(config.talk); + const provider = normalizeSpeechProviderId(resolved?.provider); + if (!resolved || !provider) { + return { error: "talk.speak unavailable: talk provider not configured" }; + } + + const baseTts = config.messages?.tts ?? {}; + const providerConfig = resolved.config; + const talkTts: TtsConfig = { + ...baseTts, + auto: "always", + provider, + }; + + if (provider === "elevenlabs") { + talkTts.elevenlabs = { + ...baseTts.elevenlabs, + ...(providerConfig.apiKey === undefined ? {} : { apiKey: providerConfig.apiKey }), + ...(trimString(providerConfig.baseUrl) == null + ? {} + : { baseUrl: trimString(providerConfig.baseUrl) }), + ...(trimString(providerConfig.voiceId) == null + ? {} + : { voiceId: trimString(providerConfig.voiceId) }), + ...(trimString(providerConfig.modelId) == null + ? {} + : { modelId: trimString(providerConfig.modelId) }), + ...(finiteNumber(providerConfig.seed) == null + ? {} + : { seed: finiteNumber(providerConfig.seed) }), + ...(normalizeTextNormalization(providerConfig.applyTextNormalization) == null + ? {} + : { + applyTextNormalization: normalizeTextNormalization( + providerConfig.applyTextNormalization, + ), + }), + ...(trimString(providerConfig.languageCode) == null + ? {} + : { languageCode: trimString(providerConfig.languageCode) }), + ...(readTalkVoiceSettings(providerConfig) == null + ? {} + : { voiceSettings: readTalkVoiceSettings(providerConfig) }), + }; + } else if (provider === "openai") { + talkTts.openai = { + ...baseTts.openai, + ...(providerConfig.apiKey === undefined ? {} : { apiKey: providerConfig.apiKey }), + ...(trimString(providerConfig.baseUrl) == null + ? {} + : { baseUrl: trimString(providerConfig.baseUrl) }), + ...(trimString(providerConfig.modelId) == null + ? {} + : { model: trimString(providerConfig.modelId) }), + ...(trimString(providerConfig.voiceId) == null + ? {} + : { voice: trimString(providerConfig.voiceId) }), + ...(finiteNumber(providerConfig.speed) == null + ? {} + : { speed: finiteNumber(providerConfig.speed) }), + ...(trimString(providerConfig.instructions) == null + ? {} + : { instructions: trimString(providerConfig.instructions) }), + }; + } else if (provider === "microsoft") { + talkTts.microsoft = { + ...baseTts.microsoft, + enabled: true, + ...(trimString(providerConfig.voiceId) == null + ? {} + : { voice: trimString(providerConfig.voiceId) }), + ...(trimString(providerConfig.languageCode) == null + ? {} + : { lang: trimString(providerConfig.languageCode) }), + ...(trimString(providerConfig.outputFormat) == null + ? {} + : { outputFormat: trimString(providerConfig.outputFormat) }), + ...(trimString(providerConfig.pitch) == null + ? {} + : { pitch: trimString(providerConfig.pitch) }), + ...(trimString(providerConfig.rate) == null ? {} : { rate: trimString(providerConfig.rate) }), + ...(trimString(providerConfig.volume) == null + ? {} + : { volume: trimString(providerConfig.volume) }), + ...(trimString(providerConfig.proxy) == null + ? {} + : { proxy: trimString(providerConfig.proxy) }), + ...(finiteNumber(providerConfig.timeoutMs) == null + ? {} + : { timeoutMs: finiteNumber(providerConfig.timeoutMs) }), + }; + } else { + return { error: `talk.speak unavailable: unsupported talk provider '${resolved.provider}'` }; + } + + return { + provider, + providerConfig, + cfg: { + ...config, + messages: { + ...config.messages, + tts: talkTts, + }, + }, + }; +} + +function buildTalkSpeakOverrides( + provider: string, + providerConfig: TalkProviderConfig, + params: Record, +): TtsDirectiveOverrides { + const voiceId = resolveTalkVoiceId(providerConfig, trimString(params.voiceId)); + const modelId = trimString(params.modelId); + const speed = finiteNumber(params.speed); + const seed = finiteNumber(params.seed); + const normalize = normalizeTextNormalization(params.normalize); + const language = trimString(params.language)?.toLowerCase(); + const overrides: TtsDirectiveOverrides = { provider }; + + if (provider === "elevenlabs") { + const voiceSettings = { + ...(speed == null ? {} : { speed }), + ...(finiteNumber(params.stability) == null + ? {} + : { stability: finiteNumber(params.stability) }), + ...(finiteNumber(params.similarity) == null + ? {} + : { similarityBoost: finiteNumber(params.similarity) }), + ...(finiteNumber(params.style) == null ? {} : { style: finiteNumber(params.style) }), + ...(optionalBoolean(params.speakerBoost) == null + ? {} + : { useSpeakerBoost: optionalBoolean(params.speakerBoost) }), + }; + overrides.elevenlabs = { + ...(voiceId == null ? {} : { voiceId }), + ...(modelId == null ? {} : { modelId }), + ...(seed == null ? {} : { seed }), + ...(normalize == null ? {} : { applyTextNormalization: normalize }), + ...(language == null ? {} : { languageCode: language }), + ...(Object.keys(voiceSettings).length === 0 ? {} : { voiceSettings }), + }; + return overrides; + } + + if (provider === "openai") { + overrides.openai = { + ...(voiceId == null ? {} : { voice: voiceId }), + ...(modelId == null ? {} : { model: modelId }), + ...(speed == null ? {} : { speed }), + }; + return overrides; + } + + if (provider === "microsoft") { + overrides.microsoft = voiceId == null ? undefined : { voice: voiceId }; + } + + return overrides; +} + +function inferMimeType( + outputFormat: string | undefined, + fileExtension: string | undefined, +): string | undefined { + const normalizedOutput = outputFormat?.trim().toLowerCase(); + const normalizedExtension = fileExtension?.trim().toLowerCase(); + if ( + normalizedOutput === "mp3" || + normalizedOutput?.startsWith("mp3_") || + normalizedOutput?.endsWith("-mp3") || + normalizedExtension === ".mp3" + ) { + return "audio/mpeg"; + } + if ( + normalizedOutput === "opus" || + normalizedOutput?.startsWith("opus_") || + normalizedExtension === ".opus" || + normalizedExtension === ".ogg" + ) { + return "audio/ogg"; + } + if (normalizedOutput?.endsWith("-wav") || normalizedExtension === ".wav") { + return "audio/wav"; + } + if (normalizedOutput?.endsWith("-webm") || normalizedExtension === ".webm") { + return "audio/webm"; + } + return undefined; +} + export const talkHandlers: GatewayRequestHandlers = { "talk.config": async ({ params, respond, client }) => { if (!validateTalkConfigParams(params)) { @@ -65,6 +339,65 @@ export const talkHandlers: GatewayRequestHandlers = { respond(true, { config: configPayload }, undefined); }, + "talk.speak": async ({ params, respond }) => { + if (!validateTalkSpeakParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.speak params: ${formatValidationErrors(validateTalkSpeakParams.errors)}`, + ), + ); + return; + } + + const text = trimString((params as { text?: unknown }).text); + if (!text) { + respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, "talk.speak requires text")); + return; + } + + try { + const snapshot = await readConfigFileSnapshot(); + const setup = buildTalkTtsConfig(snapshot.config); + if ("error" in setup) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, setup.error)); + return; + } + + const overrides = buildTalkSpeakOverrides(setup.provider, setup.providerConfig, params); + const result = await synthesizeSpeech({ + text, + cfg: setup.cfg, + overrides, + disableFallback: true, + }); + if (!result.success || !result.audioBuffer) { + respond( + false, + undefined, + errorShape(ErrorCodes.UNAVAILABLE, result.error ?? "talk synthesis failed"), + ); + return; + } + + respond( + true, + { + audioBase64: result.audioBuffer.toString("base64"), + provider: result.provider ?? setup.provider, + outputFormat: result.outputFormat, + voiceCompatible: result.voiceCompatible, + mimeType: inferMimeType(result.outputFormat, result.fileExtension), + fileExtension: result.fileExtension, + }, + undefined, + ); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, "talk.mode": ({ params, respond, context, client, isWebchatConnect }) => { if (client && isWebchatConnect(client.connect) && !context.hasConnectedMobileNode()) { respond( diff --git a/src/gateway/server.talk-config.test.ts b/src/gateway/server.talk-config.test.ts index a47addbb0e0..eb2925db158 100644 --- a/src/gateway/server.talk-config.test.ts +++ b/src/gateway/server.talk-config.test.ts @@ -1,6 +1,6 @@ import os from "node:os"; import path from "node:path"; -import { describe, expect, it } from "vitest"; +import { describe, expect, it, vi } from "vitest"; import { loadOrCreateDeviceIdentity, publicKeyRawBase64UrlFromPem, @@ -41,6 +41,13 @@ type TalkConfigPayload = { }; }; type TalkConfig = NonNullable["talk"]>; +type TalkSpeakPayload = { + audioBase64?: string; + provider?: string; + outputFormat?: string; + mimeType?: string; + fileExtension?: string; +}; const TALK_CONFIG_DEVICE_PATH = path.join( os.tmpdir(), `openclaw-talk-config-device-${process.pid}.json`, @@ -95,6 +102,10 @@ async function fetchTalkConfig( return rpcReq(ws, "talk.config", params ?? {}); } +async function fetchTalkSpeak(ws: GatewaySocket, params: Record) { + return rpcReq(ws, "talk.speak", params); +} + function expectElevenLabsTalkConfig( talk: TalkConfig | undefined, expected: { @@ -236,4 +247,58 @@ describe("gateway talk.config", () => { }); }); }); + + it("synthesizes talk audio via the active talk provider", async () => { + const { writeConfigFile } = await import("../config/config.js"); + await writeConfigFile({ + talk: { + provider: "openai", + providers: { + openai: { + apiKey: "openai-talk-key", // pragma: allowlist secret + voiceId: "alloy", + modelId: "gpt-4o-mini-tts", + }, + }, + }, + }); + + const originalFetch = globalThis.fetch; + const requestInits: RequestInit[] = []; + const fetchMock = vi.fn(async (_input: RequestInfo | URL, init?: RequestInit) => { + if (init) { + requestInits.push(init); + } + return new Response(new Uint8Array([1, 2, 3]), { status: 200 }); + }); + globalThis.fetch = fetchMock as typeof fetch; + + try { + await withServer(async (ws) => { + await connectOperator(ws, ["operator.read", "operator.write"]); + const res = await fetchTalkSpeak(ws, { + text: "Hello from talk mode.", + voiceId: "nova", + modelId: "tts-1", + speed: 1.25, + }); + expect(res.ok).toBe(true); + expect(res.payload?.provider).toBe("openai"); + expect(res.payload?.outputFormat).toBe("mp3"); + expect(res.payload?.mimeType).toBe("audio/mpeg"); + expect(res.payload?.fileExtension).toBe(".mp3"); + expect(res.payload?.audioBase64).toBe(Buffer.from([1, 2, 3]).toString("base64")); + }); + + expect(fetchMock).toHaveBeenCalled(); + const requestInit = requestInits.find((init) => typeof init.body === "string"); + expect(requestInit).toBeDefined(); + const body = JSON.parse(requestInit?.body as string) as Record; + expect(body.model).toBe("tts-1"); + expect(body.voice).toBe("nova"); + expect(body.speed).toBe(1.25); + } finally { + globalThis.fetch = originalFetch; + } + }); });