feat(gateway): add talk speak rpc

This commit is contained in:
Ayaan Zaidi
2026-03-20 10:27:05 +05:30
parent 84ee6fbb76
commit 4ac355babb
8 changed files with 447 additions and 2 deletions

View File

@@ -98,6 +98,7 @@ const METHOD_SCOPE_GROUPS: Record<OperatorScope, readonly string[]> = {
"agent.wait",
"wake",
"talk.mode",
"talk.speak",
"tts.enable",
"tts.disable",
"tts.convert",

View File

@@ -48,6 +48,10 @@ import {
TalkConfigParamsSchema,
type TalkConfigResult,
TalkConfigResultSchema,
type TalkSpeakParams,
TalkSpeakParamsSchema,
type TalkSpeakResult,
TalkSpeakResultSchema,
type ChannelsStatusParams,
ChannelsStatusParamsSchema,
type ChannelsStatusResult,
@@ -375,6 +379,8 @@ export const validateWizardStatusParams = ajv.compile<WizardStatusParams>(Wizard
export const validateTalkModeParams = ajv.compile<TalkModeParams>(TalkModeParamsSchema);
export const validateTalkConfigParams = ajv.compile<TalkConfigParams>(TalkConfigParamsSchema);
export const validateTalkConfigResult = ajv.compile<TalkConfigResult>(TalkConfigResultSchema);
export const validateTalkSpeakParams = ajv.compile<TalkSpeakParams>(TalkSpeakParamsSchema);
export const validateTalkSpeakResult = ajv.compile<TalkSpeakResult>(TalkSpeakResultSchema);
export const validateChannelsStatusParams = ajv.compile<ChannelsStatusParams>(
ChannelsStatusParamsSchema,
);
@@ -540,6 +546,8 @@ export {
WizardStatusResultSchema,
TalkConfigParamsSchema,
TalkConfigResultSchema,
TalkSpeakParamsSchema,
TalkSpeakResultSchema,
ChannelsStatusParamsSchema,
ChannelsStatusResultSchema,
ChannelsLogoutParamsSchema,
@@ -629,6 +637,8 @@ export type {
WizardStatusResult,
TalkConfigParams,
TalkConfigResult,
TalkSpeakParams,
TalkSpeakResult,
TalkModeParams,
ChannelsStatusParams,
ChannelsStatusResult,

View File

@@ -16,6 +16,23 @@ export const TalkConfigParamsSchema = Type.Object(
{ additionalProperties: false },
);
/**
 * Request schema for the `talk.speak` RPC.
 * `text` is the only required field; every other property is an optional
 * per-request override layered on top of the configured talk provider.
 * `additionalProperties: false` rejects unknown keys at validation time.
 */
export const TalkSpeakParamsSchema = Type.Object(
  {
    text: NonEmptyString, // utterance to synthesize (must be non-empty)
    voiceId: Type.Optional(Type.String()), // provider voice id or alias
    modelId: Type.Optional(Type.String()), // provider model override
    speed: Type.Optional(Type.Number()), // playback speed multiplier
    stability: Type.Optional(Type.Number()), // ElevenLabs voice-settings field
    similarity: Type.Optional(Type.Number()), // maps to ElevenLabs similarityBoost
    style: Type.Optional(Type.Number()), // ElevenLabs style exaggeration
    speakerBoost: Type.Optional(Type.Boolean()), // maps to ElevenLabs useSpeakerBoost
    seed: Type.Optional(Type.Integer({ minimum: 0 })), // deterministic-generation seed
    normalize: Type.Optional(Type.String()), // text normalization mode ("auto"|"on"|"off")
    language: Type.Optional(Type.String()), // language code override
  },
  { additionalProperties: false },
);
const talkProviderFieldSchemas = {
voiceId: Type.Optional(Type.String()),
voiceAliases: Type.Optional(Type.Record(Type.String(), Type.String())),
@@ -85,6 +102,18 @@ export const TalkConfigResultSchema = Type.Object(
{ additionalProperties: false },
);
/**
 * Response schema for the `talk.speak` RPC: base64-encoded audio plus
 * provider/format metadata. Optional fields are omitted when the provider
 * did not report them.
 */
export const TalkSpeakResultSchema = Type.Object(
  {
    audioBase64: NonEmptyString, // synthesized audio, base64-encoded
    provider: NonEmptyString, // provider that actually produced the audio
    outputFormat: Type.Optional(Type.String()), // provider-specific format id (e.g. "mp3")
    voiceCompatible: Type.Optional(Type.Boolean()), // whether output suits voice channels — semantics defined by synthesizeSpeech
    mimeType: Type.Optional(Type.String()), // best-effort MIME type inferred server-side
    fileExtension: Type.Optional(Type.String()), // e.g. ".mp3"
  },
  { additionalProperties: false },
);
export const ChannelsStatusParamsSchema = Type.Object(
{
probe: Type.Optional(Type.Boolean()),

View File

@@ -44,6 +44,8 @@ import {
ChannelsLogoutParamsSchema,
TalkConfigParamsSchema,
TalkConfigResultSchema,
TalkSpeakParamsSchema,
TalkSpeakResultSchema,
ChannelsStatusParamsSchema,
ChannelsStatusResultSchema,
TalkModeParamsSchema,
@@ -238,6 +240,8 @@ export const ProtocolSchemas = {
TalkModeParams: TalkModeParamsSchema,
TalkConfigParams: TalkConfigParamsSchema,
TalkConfigResult: TalkConfigResultSchema,
TalkSpeakParams: TalkSpeakParamsSchema,
TalkSpeakResult: TalkSpeakResultSchema,
ChannelsStatusParams: ChannelsStatusParamsSchema,
ChannelsStatusResult: ChannelsStatusResultSchema,
ChannelsLogoutParams: ChannelsLogoutParamsSchema,

View File

@@ -70,6 +70,8 @@ export type WizardStatusResult = SchemaType<"WizardStatusResult">;
export type TalkModeParams = SchemaType<"TalkModeParams">;
export type TalkConfigParams = SchemaType<"TalkConfigParams">;
export type TalkConfigResult = SchemaType<"TalkConfigResult">;
export type TalkSpeakParams = SchemaType<"TalkSpeakParams">;
export type TalkSpeakResult = SchemaType<"TalkSpeakResult">;
export type ChannelsStatusParams = SchemaType<"ChannelsStatusParams">;
export type ChannelsStatusResult = SchemaType<"ChannelsStatusResult">;
export type ChannelsLogoutParams = SchemaType<"ChannelsLogoutParams">;

View File

@@ -34,6 +34,7 @@ const BASE_METHODS = [
"wizard.cancel",
"wizard.status",
"talk.config",
"talk.speak",
"talk.mode",
"models.list",
"tools.catalog",

View File

@@ -1,23 +1,297 @@
import { readConfigFileSnapshot } from "../../config/config.js";
import { redactConfigObject } from "../../config/redact-snapshot.js";
import { buildTalkConfigResponse } from "../../config/talk.js";
import { buildTalkConfigResponse, resolveActiveTalkProviderConfig } from "../../config/talk.js";
import type { TalkProviderConfig } from "../../config/types.gateway.js";
import type { OpenClawConfig, TtsConfig } from "../../config/types.js";
import { normalizeSpeechProviderId } from "../../tts/provider-registry.js";
import { synthesizeSpeech, type TtsDirectiveOverrides } from "../../tts/tts.js";
import {
ErrorCodes,
errorShape,
formatValidationErrors,
validateTalkConfigParams,
validateTalkModeParams,
validateTalkSpeakParams,
} from "../protocol/index.js";
import { formatForLog } from "../ws-log.js";
import type { GatewayRequestHandlers } from "./types.js";
const ADMIN_SCOPE = "operator.admin";
const TALK_SECRETS_SCOPE = "operator.talk.secrets";
type ElevenLabsVoiceSettings = NonNullable<NonNullable<TtsConfig["elevenlabs"]>["voiceSettings"]>;

/**
 * True when the connected client holds either the admin scope or the
 * dedicated talk-secrets scope. Missing/non-array scope lists read as "no scopes".
 */
function canReadTalkSecrets(client: { connect?: { scopes?: string[] } } | null): boolean {
  const grantedScopes = client?.connect?.scopes;
  if (!Array.isArray(grantedScopes)) {
    return false;
  }
  return [ADMIN_SCOPE, TALK_SECRETS_SCOPE].some((scope) => grantedScopes.includes(scope));
}
/** Returns the trimmed string when the input is a non-blank string; otherwise undefined. */
function trimString(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const compact = value.trim();
  return compact === "" ? undefined : compact;
}
/** Passes finite numbers through; rejects NaN, ±Infinity, and non-numbers. */
function finiteNumber(value: unknown): number | undefined {
  if (typeof value !== "number" || !Number.isFinite(value)) {
    return undefined;
  }
  return value;
}
/** Passes real booleans through unchanged; any other input becomes undefined. */
function optionalBoolean(value: unknown): boolean | undefined {
  if (typeof value === "boolean") {
    return value;
  }
  return undefined;
}
/** Narrows to a plain object (non-null, non-array); everything else yields undefined. */
function plainObject(value: unknown): Record<string, unknown> | undefined {
  if (value === null || typeof value !== "object" || Array.isArray(value)) {
    return undefined;
  }
  return value as Record<string, unknown>;
}
/**
 * Accepts only the text-normalization modes "auto", "on", or "off",
 * tolerating surrounding whitespace and any letter case. Anything else → undefined.
 */
function normalizeTextNormalization(value: unknown): "auto" | "on" | "off" | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const mode = value.trim().toLowerCase();
  if (mode === "auto" || mode === "on" || mode === "off") {
    return mode;
  }
  return undefined;
}
/** Canonical voice-alias lookup key: lower-cased and trimmed. */
function normalizeAliasKey(value: string): string {
  return value.toLowerCase().trim();
}
/**
 * Maps a requested voice id through the provider's alias table.
 * No request → undefined; no alias table or no matching alias → the requested id as-is.
 * Alias keys are matched case-insensitively after trimming.
 */
function resolveTalkVoiceId(
  providerConfig: TalkProviderConfig,
  requested: string | undefined,
): string | undefined {
  if (!requested) {
    return undefined;
  }
  const aliasTable = providerConfig.voiceAliases;
  if (!aliasTable) {
    return requested;
  }
  const mapped = aliasTable[requested.trim().toLowerCase()];
  return mapped ?? requested;
}
/**
 * Extracts ElevenLabs voice-settings overrides from the talk provider config.
 * Only finite numbers and real booleans survive; returns undefined when the
 * config has no usable settings (or `voiceSettings` is not a plain object).
 */
function readTalkVoiceSettings(
  providerConfig: TalkProviderConfig,
): ElevenLabsVoiceSettings | undefined {
  const raw = providerConfig.voiceSettings;
  if (raw === null || typeof raw !== "object" || Array.isArray(raw)) {
    return undefined;
  }
  const source = raw as Record<string, unknown>;
  const settings: Record<string, number | boolean> = {};
  // Numeric fields first (same insertion order as before), then the boolean, then speed.
  for (const key of ["stability", "similarityBoost", "style"] as const) {
    const candidate = source[key];
    if (typeof candidate === "number" && Number.isFinite(candidate)) {
      settings[key] = candidate;
    }
  }
  if (typeof source.useSpeakerBoost === "boolean") {
    settings.useSpeakerBoost = source.useSpeakerBoost;
  }
  const speedCandidate = source.speed;
  if (typeof speedCandidate === "number" && Number.isFinite(speedCandidate)) {
    settings.speed = speedCandidate;
  }
  return Object.keys(settings).length > 0 ? (settings as ElevenLabsVoiceSettings) : undefined;
}
/** Builds `{ [key]: value }` when value is defined, else `{}` — for optional-field spreads. */
function definedEntry<K extends string, V>(key: K, value: V | undefined): { [P in K]?: V } {
  return value == null ? {} : ({ [key]: value } as { [P in K]?: V });
}

/**
 * Derives a complete TTS config for talk.speak from the active talk provider.
 *
 * Resolves the configured talk provider, then layers its settings over the
 * base `messages.tts` config with `auto: "always"` so synthesis runs for
 * every request. Supports "elevenlabs", "openai", and "microsoft".
 *
 * @param config full gateway config snapshot
 * @returns `{ cfg, provider, providerConfig }` on success, or `{ error }`
 *   when no provider is configured or the provider id is unsupported.
 *
 * Fix: each optional field's transform (trimString/finiteNumber/…) was
 * previously evaluated twice — once in the null check and once in the spread.
 * Values are now computed exactly once via `definedEntry`.
 */
function buildTalkTtsConfig(
  config: OpenClawConfig,
):
  | { cfg: OpenClawConfig; provider: string; providerConfig: TalkProviderConfig }
  | { error: string } {
  const resolved = resolveActiveTalkProviderConfig(config.talk);
  const provider = normalizeSpeechProviderId(resolved?.provider);
  if (!resolved || !provider) {
    return { error: "talk.speak unavailable: talk provider not configured" };
  }
  const baseTts = config.messages?.tts ?? {};
  const providerConfig = resolved.config;
  // Force synthesis on; the provider-specific block below fills in the rest.
  const talkTts: TtsConfig = {
    ...baseTts,
    auto: "always",
    provider,
  };
  if (provider === "elevenlabs") {
    talkTts.elevenlabs = {
      ...baseTts.elevenlabs,
      // Only `undefined` is skipped here (not null) — preserves an explicit null apiKey.
      ...(providerConfig.apiKey === undefined ? {} : { apiKey: providerConfig.apiKey }),
      ...definedEntry("baseUrl", trimString(providerConfig.baseUrl)),
      ...definedEntry("voiceId", trimString(providerConfig.voiceId)),
      ...definedEntry("modelId", trimString(providerConfig.modelId)),
      ...definedEntry("seed", finiteNumber(providerConfig.seed)),
      ...definedEntry(
        "applyTextNormalization",
        normalizeTextNormalization(providerConfig.applyTextNormalization),
      ),
      ...definedEntry("languageCode", trimString(providerConfig.languageCode)),
      ...definedEntry("voiceSettings", readTalkVoiceSettings(providerConfig)),
    };
  } else if (provider === "openai") {
    talkTts.openai = {
      ...baseTts.openai,
      ...(providerConfig.apiKey === undefined ? {} : { apiKey: providerConfig.apiKey }),
      ...definedEntry("baseUrl", trimString(providerConfig.baseUrl)),
      // Talk config uses modelId/voiceId; the OpenAI TTS config uses model/voice.
      ...definedEntry("model", trimString(providerConfig.modelId)),
      ...definedEntry("voice", trimString(providerConfig.voiceId)),
      ...definedEntry("speed", finiteNumber(providerConfig.speed)),
      ...definedEntry("instructions", trimString(providerConfig.instructions)),
    };
  } else if (provider === "microsoft") {
    talkTts.microsoft = {
      ...baseTts.microsoft,
      enabled: true,
      ...definedEntry("voice", trimString(providerConfig.voiceId)),
      ...definedEntry("lang", trimString(providerConfig.languageCode)),
      ...definedEntry("outputFormat", trimString(providerConfig.outputFormat)),
      ...definedEntry("pitch", trimString(providerConfig.pitch)),
      ...definedEntry("rate", trimString(providerConfig.rate)),
      ...definedEntry("volume", trimString(providerConfig.volume)),
      ...definedEntry("proxy", trimString(providerConfig.proxy)),
      ...definedEntry("timeoutMs", finiteNumber(providerConfig.timeoutMs)),
    };
  } else {
    return { error: `talk.speak unavailable: unsupported talk provider '${resolved.provider}'` };
  }
  return {
    provider,
    providerConfig,
    cfg: {
      ...config,
      messages: {
        ...config.messages,
        tts: talkTts,
      },
    },
  };
}
/**
 * Translates validated talk.speak RPC params into per-provider TTS overrides.
 * Invalid or absent values are simply dropped; the requested voice id is
 * resolved through the provider's alias table first.
 */
function buildTalkSpeakOverrides(
  provider: string,
  providerConfig: TalkProviderConfig,
  params: Record<string, unknown>,
): TtsDirectiveOverrides {
  const voiceId = resolveTalkVoiceId(providerConfig, trimString(params.voiceId));
  const modelId = trimString(params.modelId);
  const speed = finiteNumber(params.speed);
  const overrides: TtsDirectiveOverrides = { provider };
  if (provider === "elevenlabs") {
    const seed = finiteNumber(params.seed);
    const normalize = normalizeTextNormalization(params.normalize);
    const language = trimString(params.language)?.toLowerCase();
    // Assemble voice settings imperatively; key insertion order matches the prior shape.
    const voiceSettings: Record<string, number | boolean> = {};
    if (speed != null) voiceSettings.speed = speed;
    const stability = finiteNumber(params.stability);
    if (stability != null) voiceSettings.stability = stability;
    const similarityBoost = finiteNumber(params.similarity);
    if (similarityBoost != null) voiceSettings.similarityBoost = similarityBoost;
    const styleValue = finiteNumber(params.style);
    if (styleValue != null) voiceSettings.style = styleValue;
    const useSpeakerBoost = optionalBoolean(params.speakerBoost);
    if (useSpeakerBoost != null) voiceSettings.useSpeakerBoost = useSpeakerBoost;
    const elevenlabs: Record<string, unknown> = {};
    if (voiceId != null) elevenlabs.voiceId = voiceId;
    if (modelId != null) elevenlabs.modelId = modelId;
    if (seed != null) elevenlabs.seed = seed;
    if (normalize != null) elevenlabs.applyTextNormalization = normalize;
    if (language != null) elevenlabs.languageCode = language;
    if (Object.keys(voiceSettings).length > 0) elevenlabs.voiceSettings = voiceSettings;
    overrides.elevenlabs = elevenlabs as TtsDirectiveOverrides["elevenlabs"];
    return overrides;
  }
  if (provider === "openai") {
    const openai: Record<string, unknown> = {};
    if (voiceId != null) openai.voice = voiceId;
    if (modelId != null) openai.model = modelId;
    if (speed != null) openai.speed = speed;
    overrides.openai = openai as TtsDirectiveOverrides["openai"];
    return overrides;
  }
  if (provider === "microsoft") {
    // NOTE(review): when no voice resolves, `microsoft` is assigned an explicit
    // `undefined` (key present) rather than omitted — preserved as in the original;
    // confirm downstream treats the two identically.
    overrides.microsoft = voiceId == null ? undefined : { voice: voiceId };
  }
  return overrides;
}
/**
 * Best-effort MIME type from a provider output-format id and/or file extension.
 * Recognizes mp3, opus/ogg, wav, and webm shapes; anything else → undefined.
 */
function inferMimeType(
  outputFormat: string | undefined,
  fileExtension: string | undefined,
): string | undefined {
  const format = outputFormat?.trim().toLowerCase();
  const ext = fileExtension?.trim().toLowerCase();
  // Matches an exact format id, an optional prefix, and an optional suffix.
  const formatMatches = (exact: string, prefix?: string, suffix?: string): boolean => {
    if (format == null) {
      return false;
    }
    if (format === exact) {
      return true;
    }
    if (prefix !== undefined && format.startsWith(prefix)) {
      return true;
    }
    return suffix !== undefined && format.endsWith(suffix);
  };
  if (formatMatches("mp3", "mp3_", "-mp3") || ext === ".mp3") {
    return "audio/mpeg";
  }
  if (formatMatches("opus", "opus_") || ext === ".opus" || ext === ".ogg") {
    return "audio/ogg";
  }
  if (format?.endsWith("-wav") || ext === ".wav") {
    return "audio/wav";
  }
  if (format?.endsWith("-webm") || ext === ".webm") {
    return "audio/webm";
  }
  return undefined;
}
export const talkHandlers: GatewayRequestHandlers = {
"talk.config": async ({ params, respond, client }) => {
if (!validateTalkConfigParams(params)) {
@@ -65,6 +339,65 @@ export const talkHandlers: GatewayRequestHandlers = {
respond(true, { config: configPayload }, undefined);
},
"talk.speak": async ({ params, respond }) => {
if (!validateTalkSpeakParams(params)) {
respond(
false,
undefined,
errorShape(
ErrorCodes.INVALID_REQUEST,
`invalid talk.speak params: ${formatValidationErrors(validateTalkSpeakParams.errors)}`,
),
);
return;
}
const text = trimString((params as { text?: unknown }).text);
if (!text) {
respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, "talk.speak requires text"));
return;
}
try {
const snapshot = await readConfigFileSnapshot();
const setup = buildTalkTtsConfig(snapshot.config);
if ("error" in setup) {
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, setup.error));
return;
}
const overrides = buildTalkSpeakOverrides(setup.provider, setup.providerConfig, params);
const result = await synthesizeSpeech({
text,
cfg: setup.cfg,
overrides,
disableFallback: true,
});
if (!result.success || !result.audioBuffer) {
respond(
false,
undefined,
errorShape(ErrorCodes.UNAVAILABLE, result.error ?? "talk synthesis failed"),
);
return;
}
respond(
true,
{
audioBase64: result.audioBuffer.toString("base64"),
provider: result.provider ?? setup.provider,
outputFormat: result.outputFormat,
voiceCompatible: result.voiceCompatible,
mimeType: inferMimeType(result.outputFormat, result.fileExtension),
fileExtension: result.fileExtension,
},
undefined,
);
} catch (err) {
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
}
},
"talk.mode": ({ params, respond, context, client, isWebchatConnect }) => {
if (client && isWebchatConnect(client.connect) && !context.hasConnectedMobileNode()) {
respond(

View File

@@ -1,6 +1,6 @@
import os from "node:os";
import path from "node:path";
import { describe, expect, it } from "vitest";
import { describe, expect, it, vi } from "vitest";
import {
loadOrCreateDeviceIdentity,
publicKeyRawBase64UrlFromPem,
@@ -41,6 +41,13 @@ type TalkConfigPayload = {
};
};
type TalkConfig = NonNullable<NonNullable<TalkConfigPayload["config"]>["talk"]>;
// Shape of a successful talk.speak RPC payload as seen by the tests.
// All fields optional so partial/error responses can still be inspected.
type TalkSpeakPayload = {
  audioBase64?: string; // synthesized audio, base64-encoded
  provider?: string; // provider that produced the audio
  outputFormat?: string; // provider-specific format id
  mimeType?: string; // server-inferred MIME type
  fileExtension?: string; // e.g. ".mp3"
};
const TALK_CONFIG_DEVICE_PATH = path.join(
os.tmpdir(),
`openclaw-talk-config-device-${process.pid}.json`,
@@ -95,6 +102,10 @@ async function fetchTalkConfig(
return rpcReq<TalkConfigPayload>(ws, "talk.config", params ?? {});
}
// Issues a talk.speak RPC over the gateway socket and returns the typed payload.
async function fetchTalkSpeak(ws: GatewaySocket, params: Record<string, unknown>) {
  return rpcReq<TalkSpeakPayload>(ws, "talk.speak", params);
}
function expectElevenLabsTalkConfig(
talk: TalkConfig | undefined,
expected: {
@@ -236,4 +247,58 @@ describe("gateway talk.config", () => {
});
});
});
it("synthesizes talk audio via the active talk provider", async () => {
const { writeConfigFile } = await import("../config/config.js");
await writeConfigFile({
talk: {
provider: "openai",
providers: {
openai: {
apiKey: "openai-talk-key", // pragma: allowlist secret
voiceId: "alloy",
modelId: "gpt-4o-mini-tts",
},
},
},
});
const originalFetch = globalThis.fetch;
const requestInits: RequestInit[] = [];
const fetchMock = vi.fn(async (_input: RequestInfo | URL, init?: RequestInit) => {
if (init) {
requestInits.push(init);
}
return new Response(new Uint8Array([1, 2, 3]), { status: 200 });
});
globalThis.fetch = fetchMock as typeof fetch;
try {
await withServer(async (ws) => {
await connectOperator(ws, ["operator.read", "operator.write"]);
const res = await fetchTalkSpeak(ws, {
text: "Hello from talk mode.",
voiceId: "nova",
modelId: "tts-1",
speed: 1.25,
});
expect(res.ok).toBe(true);
expect(res.payload?.provider).toBe("openai");
expect(res.payload?.outputFormat).toBe("mp3");
expect(res.payload?.mimeType).toBe("audio/mpeg");
expect(res.payload?.fileExtension).toBe(".mp3");
expect(res.payload?.audioBase64).toBe(Buffer.from([1, 2, 3]).toString("base64"));
});
expect(fetchMock).toHaveBeenCalled();
const requestInit = requestInits.find((init) => typeof init.body === "string");
expect(requestInit).toBeDefined();
const body = JSON.parse(requestInit?.body as string) as Record<string, unknown>;
expect(body.model).toBe("tts-1");
expect(body.voice).toBe("nova");
expect(body.speed).toBe(1.25);
} finally {
globalThis.fetch = originalFetch;
}
});
});