mirror of
https://github.com/openclaw/openclaw.git
synced 2026-03-20 22:40:58 +00:00
feat(gateway): add talk speak rpc
This commit is contained in:
@@ -98,6 +98,7 @@ const METHOD_SCOPE_GROUPS: Record<OperatorScope, readonly string[]> = {
|
||||
"agent.wait",
|
||||
"wake",
|
||||
"talk.mode",
|
||||
"talk.speak",
|
||||
"tts.enable",
|
||||
"tts.disable",
|
||||
"tts.convert",
|
||||
|
||||
@@ -48,6 +48,10 @@ import {
|
||||
TalkConfigParamsSchema,
|
||||
type TalkConfigResult,
|
||||
TalkConfigResultSchema,
|
||||
type TalkSpeakParams,
|
||||
TalkSpeakParamsSchema,
|
||||
type TalkSpeakResult,
|
||||
TalkSpeakResultSchema,
|
||||
type ChannelsStatusParams,
|
||||
ChannelsStatusParamsSchema,
|
||||
type ChannelsStatusResult,
|
||||
@@ -375,6 +379,8 @@ export const validateWizardStatusParams = ajv.compile<WizardStatusParams>(Wizard
|
||||
export const validateTalkModeParams = ajv.compile<TalkModeParams>(TalkModeParamsSchema);
|
||||
export const validateTalkConfigParams = ajv.compile<TalkConfigParams>(TalkConfigParamsSchema);
|
||||
export const validateTalkConfigResult = ajv.compile<TalkConfigResult>(TalkConfigResultSchema);
|
||||
export const validateTalkSpeakParams = ajv.compile<TalkSpeakParams>(TalkSpeakParamsSchema);
|
||||
export const validateTalkSpeakResult = ajv.compile<TalkSpeakResult>(TalkSpeakResultSchema);
|
||||
export const validateChannelsStatusParams = ajv.compile<ChannelsStatusParams>(
|
||||
ChannelsStatusParamsSchema,
|
||||
);
|
||||
@@ -540,6 +546,8 @@ export {
|
||||
WizardStatusResultSchema,
|
||||
TalkConfigParamsSchema,
|
||||
TalkConfigResultSchema,
|
||||
TalkSpeakParamsSchema,
|
||||
TalkSpeakResultSchema,
|
||||
ChannelsStatusParamsSchema,
|
||||
ChannelsStatusResultSchema,
|
||||
ChannelsLogoutParamsSchema,
|
||||
@@ -629,6 +637,8 @@ export type {
|
||||
WizardStatusResult,
|
||||
TalkConfigParams,
|
||||
TalkConfigResult,
|
||||
TalkSpeakParams,
|
||||
TalkSpeakResult,
|
||||
TalkModeParams,
|
||||
ChannelsStatusParams,
|
||||
ChannelsStatusResult,
|
||||
|
||||
@@ -16,6 +16,23 @@ export const TalkConfigParamsSchema = Type.Object(
|
||||
{ additionalProperties: false },
|
||||
);
|
||||
|
||||
/**
 * Request parameters accepted by the `talk.speak` gateway method.
 *
 * Only `text` is required; every other field is an optional per-request
 * override. Properties not listed here are rejected
 * (`additionalProperties: false`).
 */
export const TalkSpeakParamsSchema = Type.Object(
  {
    // Text to synthesize (required).
    text: NonEmptyString,
    // Optional voice / model selection.
    voiceId: Type.Optional(Type.String()),
    modelId: Type.Optional(Type.String()),
    // Optional numeric voice tuning knobs.
    speed: Type.Optional(Type.Number()),
    stability: Type.Optional(Type.Number()),
    similarity: Type.Optional(Type.Number()),
    style: Type.Optional(Type.Number()),
    speakerBoost: Type.Optional(Type.Boolean()),
    // Non-negative integer seed.
    seed: Type.Optional(Type.Integer({ minimum: 0 })),
    // Free-form strings at the schema level; any further interpretation
    // happens outside this schema.
    normalize: Type.Optional(Type.String()),
    language: Type.Optional(Type.String()),
  },
  { additionalProperties: false },
);
|
||||
|
||||
const talkProviderFieldSchemas = {
|
||||
voiceId: Type.Optional(Type.String()),
|
||||
voiceAliases: Type.Optional(Type.Record(Type.String(), Type.String())),
|
||||
@@ -85,6 +102,18 @@ export const TalkConfigResultSchema = Type.Object(
|
||||
{ additionalProperties: false },
|
||||
);
|
||||
|
||||
/**
 * Response payload of the `talk.speak` gateway method.
 *
 * `audioBase64` and `provider` are always present; the remaining fields are
 * optional descriptors of the returned audio. Properties not listed here are
 * rejected (`additionalProperties: false`).
 */
export const TalkSpeakResultSchema = Type.Object(
  {
    // Base64-encoded audio bytes.
    audioBase64: NonEmptyString,
    // Identifier of the speech provider that produced the audio.
    provider: NonEmptyString,
    // Optional descriptors of the audio payload.
    outputFormat: Type.Optional(Type.String()),
    voiceCompatible: Type.Optional(Type.Boolean()),
    mimeType: Type.Optional(Type.String()),
    fileExtension: Type.Optional(Type.String()),
  },
  { additionalProperties: false },
);
|
||||
|
||||
export const ChannelsStatusParamsSchema = Type.Object(
|
||||
{
|
||||
probe: Type.Optional(Type.Boolean()),
|
||||
|
||||
@@ -44,6 +44,8 @@ import {
|
||||
ChannelsLogoutParamsSchema,
|
||||
TalkConfigParamsSchema,
|
||||
TalkConfigResultSchema,
|
||||
TalkSpeakParamsSchema,
|
||||
TalkSpeakResultSchema,
|
||||
ChannelsStatusParamsSchema,
|
||||
ChannelsStatusResultSchema,
|
||||
TalkModeParamsSchema,
|
||||
@@ -238,6 +240,8 @@ export const ProtocolSchemas = {
|
||||
TalkModeParams: TalkModeParamsSchema,
|
||||
TalkConfigParams: TalkConfigParamsSchema,
|
||||
TalkConfigResult: TalkConfigResultSchema,
|
||||
TalkSpeakParams: TalkSpeakParamsSchema,
|
||||
TalkSpeakResult: TalkSpeakResultSchema,
|
||||
ChannelsStatusParams: ChannelsStatusParamsSchema,
|
||||
ChannelsStatusResult: ChannelsStatusResultSchema,
|
||||
ChannelsLogoutParams: ChannelsLogoutParamsSchema,
|
||||
|
||||
@@ -70,6 +70,8 @@ export type WizardStatusResult = SchemaType<"WizardStatusResult">;
|
||||
export type TalkModeParams = SchemaType<"TalkModeParams">;
|
||||
export type TalkConfigParams = SchemaType<"TalkConfigParams">;
|
||||
export type TalkConfigResult = SchemaType<"TalkConfigResult">;
|
||||
export type TalkSpeakParams = SchemaType<"TalkSpeakParams">;
|
||||
export type TalkSpeakResult = SchemaType<"TalkSpeakResult">;
|
||||
export type ChannelsStatusParams = SchemaType<"ChannelsStatusParams">;
|
||||
export type ChannelsStatusResult = SchemaType<"ChannelsStatusResult">;
|
||||
export type ChannelsLogoutParams = SchemaType<"ChannelsLogoutParams">;
|
||||
|
||||
@@ -34,6 +34,7 @@ const BASE_METHODS = [
|
||||
"wizard.cancel",
|
||||
"wizard.status",
|
||||
"talk.config",
|
||||
"talk.speak",
|
||||
"talk.mode",
|
||||
"models.list",
|
||||
"tools.catalog",
|
||||
|
||||
@@ -1,23 +1,297 @@
|
||||
import { readConfigFileSnapshot } from "../../config/config.js";
|
||||
import { redactConfigObject } from "../../config/redact-snapshot.js";
|
||||
import { buildTalkConfigResponse } from "../../config/talk.js";
|
||||
import { buildTalkConfigResponse, resolveActiveTalkProviderConfig } from "../../config/talk.js";
|
||||
import type { TalkProviderConfig } from "../../config/types.gateway.js";
|
||||
import type { OpenClawConfig, TtsConfig } from "../../config/types.js";
|
||||
import { normalizeSpeechProviderId } from "../../tts/provider-registry.js";
|
||||
import { synthesizeSpeech, type TtsDirectiveOverrides } from "../../tts/tts.js";
|
||||
import {
|
||||
ErrorCodes,
|
||||
errorShape,
|
||||
formatValidationErrors,
|
||||
validateTalkConfigParams,
|
||||
validateTalkModeParams,
|
||||
validateTalkSpeakParams,
|
||||
} from "../protocol/index.js";
|
||||
import { formatForLog } from "../ws-log.js";
|
||||
import type { GatewayRequestHandlers } from "./types.js";
|
||||
|
||||
// Operator scopes that allow a client to read talk secrets.
const ADMIN_SCOPE = "operator.admin";
const TALK_SECRETS_SCOPE = "operator.talk.secrets";

// Shape of the ElevenLabs `voiceSettings` object as declared on `TtsConfig`.
type ElevenLabsVoiceSettings = NonNullable<NonNullable<TtsConfig["elevenlabs"]>["voiceSettings"]>;

/**
 * Reports whether the connected client holds a scope that grants access to
 * talk secrets.
 *
 * @param client - Connected client (or `null`); only `connect.scopes` is read.
 * @returns `true` when the scopes array contains the admin scope or the
 *   dedicated talk-secrets scope; `false` when the client, its connect info,
 *   or a scopes array is missing.
 */
function canReadTalkSecrets(client: { connect?: { scopes?: string[] } } | null): boolean {
  const granted = client?.connect?.scopes;
  if (!Array.isArray(granted)) {
    return false;
  }
  return granted.some((scope) => scope === ADMIN_SCOPE || scope === TALK_SECRETS_SCOPE);
}
|
||||
|
||||
/**
 * Normalizes an arbitrary value to a trimmed, non-empty string.
 *
 * @param value - Any value; only strings are considered.
 * @returns The trimmed string, or `undefined` when `value` is not a string or
 *   is empty after trimming.
 */
function trimString(value: unknown): string | undefined {
  const text = typeof value === "string" ? value.trim() : "";
  return text === "" ? undefined : text;
}
|
||||
|
||||
/**
 * Returns `value` when it is a finite number.
 *
 * @param value - Any value.
 * @returns The number itself, or `undefined` for non-numbers, `NaN`, and
 *   positive/negative infinity.
 */
function finiteNumber(value: unknown): number | undefined {
  if (typeof value !== "number") {
    return undefined;
  }
  return Number.isFinite(value) ? value : undefined;
}
|
||||
|
||||
/**
 * Returns `value` when it is a boolean.
 *
 * @param value - Any value.
 * @returns `true` or `false` unchanged; `undefined` for every other type
 *   (no truthiness coercion is applied).
 */
function optionalBoolean(value: unknown): boolean | undefined {
  if (typeof value !== "boolean") {
    return undefined;
  }
  return value;
}
|
||||
|
||||
/**
 * Narrows an arbitrary value to a string-keyed record.
 *
 * @param value - Any value.
 * @returns The same reference when `value` is a non-null, non-array object;
 *   otherwise `undefined`.
 */
function plainObject(value: unknown): Record<string, unknown> | undefined {
  if (value === null || typeof value !== "object" || Array.isArray(value)) {
    return undefined;
  }
  return value as Record<string, unknown>;
}
|
||||
|
||||
/**
 * Parses a text-normalization mode.
 *
 * Matching ignores surrounding whitespace and letter case.
 *
 * @param value - Any value; only strings are considered.
 * @returns `"auto"`, `"on"`, or `"off"`, or `undefined` for anything else.
 */
function normalizeTextNormalization(value: unknown): "auto" | "on" | "off" | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  switch (value.trim().toLowerCase()) {
    case "auto":
      return "auto";
    case "on":
      return "on";
    case "off":
      return "off";
    default:
      return undefined;
  }
}
|
||||
|
||||
/**
 * Canonical form of a voice alias name: trimmed and lower-cased.
 */
function normalizeAliasKey(value: string): string {
  return value.trim().toLowerCase();
}

/**
 * Resolves a requested voice name against the provider's `voiceAliases` map.
 *
 * Alias names are compared case-insensitively and ignoring surrounding
 * whitespace on both sides: the requested name and every configured key are
 * normalized before comparison, so an alias configured as `"Roger"` matches
 * a request for `"roger"`. A key that is already in canonical form takes
 * precedence over keys that only match after normalization.
 *
 * Only the map's own entries with non-blank string values are considered.
 * Inherited members such as `constructor` are never returned, and a blank
 * alias target is treated as if the alias were not configured.
 *
 * @param providerConfig - Active talk provider configuration.
 * @param requested - Voice id or alias supplied by the caller.
 * @returns The trimmed alias target when one matches; otherwise `requested`
 *   unchanged; `undefined` when `requested` is empty or missing.
 */
function resolveTalkVoiceId(
  providerConfig: TalkProviderConfig,
  requested: string | undefined,
): string | undefined {
  if (!requested) {
    return undefined;
  }
  const aliases = providerConfig.voiceAliases;
  if (!aliases) {
    return requested;
  }

  const wanted = normalizeAliasKey(requested);
  let normalizedMatch: string | undefined;
  for (const [alias, target] of Object.entries(aliases)) {
    const voiceId = typeof target === "string" ? target.trim() : "";
    if (voiceId === "") {
      continue;
    }
    if (alias === wanted) {
      // Exact canonical key: same entry the direct lookup used to find.
      return voiceId;
    }
    if (normalizedMatch === undefined && normalizeAliasKey(alias) === wanted) {
      normalizedMatch = voiceId;
    }
  }
  return normalizedMatch ?? requested;
}
|
||||
|
||||
/**
 * Extracts the recognized voice-setting fields from a talk provider config.
 *
 * Reads `providerConfig.voiceSettings` and keeps only `stability`,
 * `similarityBoost`, `style`, `speed` (finite numbers) and `useSpeakerBoost`
 * (boolean). Fields of any other type are dropped.
 *
 * @param providerConfig - Active talk provider configuration.
 * @returns The sanitized settings, or `undefined` when `voiceSettings` is not
 *   an object or none of the recognized fields is valid.
 */
function readTalkVoiceSettings(
  providerConfig: TalkProviderConfig,
): ElevenLabsVoiceSettings | undefined {
  const raw = plainObject(providerConfig.voiceSettings);
  if (raw === undefined) {
    return undefined;
  }

  const settings: ElevenLabsVoiceSettings = {};

  const stability = finiteNumber(raw.stability);
  if (stability !== undefined) {
    settings.stability = stability;
  }
  const similarityBoost = finiteNumber(raw.similarityBoost);
  if (similarityBoost !== undefined) {
    settings.similarityBoost = similarityBoost;
  }
  const style = finiteNumber(raw.style);
  if (style !== undefined) {
    settings.style = style;
  }
  const useSpeakerBoost = optionalBoolean(raw.useSpeakerBoost);
  if (useSpeakerBoost !== undefined) {
    settings.useSpeakerBoost = useSpeakerBoost;
  }
  const speed = finiteNumber(raw.speed);
  if (speed !== undefined) {
    settings.speed = speed;
  }

  return Object.keys(settings).length === 0 ? undefined : settings;
}
|
||||
|
||||
/**
 * Builds the configuration handed to speech synthesis for `talk.speak`.
 *
 * Starts from `config.messages.tts`, forces `auto: "always"` and the active
 * talk provider, then layers that provider's talk settings over the matching
 * provider section (`elevenlabs`, `openai`, or `microsoft`). A talk setting
 * is applied only when it normalizes to a usable value; otherwise the base
 * TTS value is left in place.
 *
 * @param config - Full gateway configuration.
 * @returns The derived config together with the normalized provider id and
 *   the raw talk provider config, or `{ error }` when no talk provider is
 *   configured or the provider is not one of the three supported ones.
 */
function buildTalkTtsConfig(
  config: OpenClawConfig,
):
  | { cfg: OpenClawConfig; provider: string; providerConfig: TalkProviderConfig }
  | { error: string } {
  const resolved = resolveActiveTalkProviderConfig(config.talk);
  const provider = normalizeSpeechProviderId(resolved?.provider);
  if (!resolved || !provider) {
    return { error: "talk.speak unavailable: talk provider not configured" };
  }

  const providerConfig = resolved.config;
  const baseTts = config.messages?.tts ?? {};
  const talkTts: TtsConfig = {
    ...baseTts,
    auto: "always",
    provider,
  };

  // apiKey is forwarded whenever it is set at all (no normalization).
  const apiKeyOverride =
    providerConfig.apiKey === undefined ? {} : { apiKey: providerConfig.apiKey };

  switch (provider) {
    case "elevenlabs": {
      const baseUrl = trimString(providerConfig.baseUrl);
      const voiceId = trimString(providerConfig.voiceId);
      const modelId = trimString(providerConfig.modelId);
      const seed = finiteNumber(providerConfig.seed);
      const applyTextNormalization = normalizeTextNormalization(
        providerConfig.applyTextNormalization,
      );
      const languageCode = trimString(providerConfig.languageCode);
      const voiceSettings = readTalkVoiceSettings(providerConfig);
      talkTts.elevenlabs = {
        ...baseTts.elevenlabs,
        ...apiKeyOverride,
        ...(baseUrl === undefined ? {} : { baseUrl }),
        ...(voiceId === undefined ? {} : { voiceId }),
        ...(modelId === undefined ? {} : { modelId }),
        ...(seed === undefined ? {} : { seed }),
        ...(applyTextNormalization === undefined ? {} : { applyTextNormalization }),
        ...(languageCode === undefined ? {} : { languageCode }),
        ...(voiceSettings === undefined ? {} : { voiceSettings }),
      };
      break;
    }
    case "openai": {
      const baseUrl = trimString(providerConfig.baseUrl);
      const model = trimString(providerConfig.modelId);
      const voice = trimString(providerConfig.voiceId);
      const speed = finiteNumber(providerConfig.speed);
      const instructions = trimString(providerConfig.instructions);
      talkTts.openai = {
        ...baseTts.openai,
        ...apiKeyOverride,
        ...(baseUrl === undefined ? {} : { baseUrl }),
        ...(model === undefined ? {} : { model }),
        ...(voice === undefined ? {} : { voice }),
        ...(speed === undefined ? {} : { speed }),
        ...(instructions === undefined ? {} : { instructions }),
      };
      break;
    }
    case "microsoft": {
      const voice = trimString(providerConfig.voiceId);
      const lang = trimString(providerConfig.languageCode);
      const outputFormat = trimString(providerConfig.outputFormat);
      const pitch = trimString(providerConfig.pitch);
      const rate = trimString(providerConfig.rate);
      const volume = trimString(providerConfig.volume);
      const proxy = trimString(providerConfig.proxy);
      const timeoutMs = finiteNumber(providerConfig.timeoutMs);
      talkTts.microsoft = {
        ...baseTts.microsoft,
        // Always switched on when microsoft is the active talk provider.
        enabled: true,
        ...(voice === undefined ? {} : { voice }),
        ...(lang === undefined ? {} : { lang }),
        ...(outputFormat === undefined ? {} : { outputFormat }),
        ...(pitch === undefined ? {} : { pitch }),
        ...(rate === undefined ? {} : { rate }),
        ...(volume === undefined ? {} : { volume }),
        ...(proxy === undefined ? {} : { proxy }),
        ...(timeoutMs === undefined ? {} : { timeoutMs }),
      };
      break;
    }
    default:
      // The message reports the provider id as configured, not the normalized one.
      return { error: `talk.speak unavailable: unsupported talk provider '${resolved.provider}'` };
  }

  return {
    provider,
    providerConfig,
    cfg: {
      ...config,
      messages: {
        ...config.messages,
        tts: talkTts,
      },
    },
  };
}
|
||||
|
||||
/**
 * Translates `talk.speak` request parameters into per-request TTS overrides.
 *
 * The result always pins `provider`. Provider-specific fields are added only
 * for values that normalize successfully:
 * - `elevenlabs`: voice, model, seed, text normalization, lower-cased
 *   language code, and voice settings (`similarity` is sent as
 *   `similarityBoost`, `speakerBoost` as `useSpeakerBoost`).
 * - `openai`: voice, model, speed.
 * - `microsoft`: voice only.
 *
 * @param provider - Normalized speech provider id.
 * @param providerConfig - Active talk provider config, used to resolve voice aliases.
 * @param params - Raw request parameters.
 * @returns Overrides for the synthesis call.
 */
function buildTalkSpeakOverrides(
  provider: string,
  providerConfig: TalkProviderConfig,
  params: Record<string, unknown>,
): TtsDirectiveOverrides {
  const overrides: TtsDirectiveOverrides = { provider };
  const voiceId = resolveTalkVoiceId(providerConfig, trimString(params.voiceId));
  const modelId = trimString(params.modelId);
  const speed = finiteNumber(params.speed);

  switch (provider) {
    case "elevenlabs": {
      const seed = finiteNumber(params.seed);
      const normalize = normalizeTextNormalization(params.normalize);
      const language = trimString(params.language)?.toLowerCase();
      const stability = finiteNumber(params.stability);
      const similarityBoost = finiteNumber(params.similarity);
      const style = finiteNumber(params.style);
      const useSpeakerBoost = optionalBoolean(params.speakerBoost);

      const voiceSettings = {
        ...(speed === undefined ? {} : { speed }),
        ...(stability === undefined ? {} : { stability }),
        ...(similarityBoost === undefined ? {} : { similarityBoost }),
        ...(style === undefined ? {} : { style }),
        ...(useSpeakerBoost === undefined ? {} : { useSpeakerBoost }),
      };
      const hasVoiceSettings = Object.keys(voiceSettings).length > 0;

      overrides.elevenlabs = {
        ...(voiceId === undefined ? {} : { voiceId }),
        ...(modelId === undefined ? {} : { modelId }),
        ...(seed === undefined ? {} : { seed }),
        ...(normalize === undefined ? {} : { applyTextNormalization: normalize }),
        ...(language === undefined ? {} : { languageCode: language }),
        ...(hasVoiceSettings ? { voiceSettings } : {}),
      };
      break;
    }
    case "openai": {
      overrides.openai = {
        ...(voiceId === undefined ? {} : { voice: voiceId }),
        ...(modelId === undefined ? {} : { model: modelId }),
        ...(speed === undefined ? {} : { speed }),
      };
      break;
    }
    case "microsoft": {
      // The key is assigned even without a voice (value `undefined`).
      overrides.microsoft = voiceId === undefined ? undefined : { voice: voiceId };
      break;
    }
    default:
      break;
  }

  return overrides;
}
|
||||
|
||||
/**
 * Best-effort mapping from a synthesis result to an audio MIME type.
 *
 * Both inputs are trimmed and lower-cased before matching. Resolution runs
 * in two passes:
 *
 * 1. The established rules, in their established order: `mp3`, `mp3_*`,
 *    `*-mp3`, `.mp3` map to `audio/mpeg`; `opus`, `opus_*`, `.opus`, `.ogg`
 *    map to `audio/ogg`; `*-wav`, `.wav` map to `audio/wav`; `*-webm`,
 *    `.webm` map to `audio/webm`.
 * 2. Only for inputs the first pass leaves unresolved: bare container names
 *    (`wav`, `ogg`, `webm`), container-prefixed formats (`webm-*`, `ogg-*`,
 *    `riff-*`), `*-opus`, and file extensions written without the leading dot.
 *
 * Keeping the first pass intact guarantees that every input that used to
 * resolve still resolves to the same type.
 *
 * @param outputFormat - Provider output format identifier, if known.
 * @param fileExtension - File extension, with or without the leading dot.
 * @returns The MIME type, or `undefined` when neither input is recognized.
 */
function inferMimeType(
  outputFormat: string | undefined,
  fileExtension: string | undefined,
): string | undefined {
  const format = outputFormat?.trim().toLowerCase() ?? "";
  const extension = fileExtension?.trim().toLowerCase() ?? "";

  // Pass 1: established rules, order preserved.
  if (
    format === "mp3" ||
    format.startsWith("mp3_") ||
    format.endsWith("-mp3") ||
    extension === ".mp3"
  ) {
    return "audio/mpeg";
  }
  if (
    format === "opus" ||
    format.startsWith("opus_") ||
    extension === ".opus" ||
    extension === ".ogg"
  ) {
    return "audio/ogg";
  }
  if (format.endsWith("-wav") || extension === ".wav") {
    return "audio/wav";
  }
  if (format.endsWith("-webm") || extension === ".webm") {
    return "audio/webm";
  }

  // Pass 2: additional spellings. The container prefix is checked before the
  // codec suffix because "webm-…-opus" is a WebM container, not Ogg.
  if (format === "webm" || format.startsWith("webm-")) {
    return "audio/webm";
  }
  if (format === "ogg" || format.startsWith("ogg-") || format.endsWith("-opus")) {
    return "audio/ogg";
  }
  if (format === "wav" || format.startsWith("riff-")) {
    // RIFF output is a WAV container.
    return "audio/wav";
  }

  // Extensions supplied without the leading dot.
  switch (extension) {
    case "mp3":
      return "audio/mpeg";
    case "opus":
    case "ogg":
      return "audio/ogg";
    case "wav":
      return "audio/wav";
    case "webm":
      return "audio/webm";
    default:
      return undefined;
  }
}
|
||||
|
||||
export const talkHandlers: GatewayRequestHandlers = {
|
||||
"talk.config": async ({ params, respond, client }) => {
|
||||
if (!validateTalkConfigParams(params)) {
|
||||
@@ -65,6 +339,65 @@ export const talkHandlers: GatewayRequestHandlers = {
|
||||
|
||||
respond(true, { config: configPayload }, undefined);
|
||||
},
|
||||
/**
 * `talk.speak`: synthesizes speech for the given text with the active talk
 * provider and responds with the audio as base64.
 *
 * Responds with INVALID_REQUEST when the params fail schema validation or
 * the text is blank after trimming. Responds with UNAVAILABLE when no
 * supported talk provider is configured, when synthesis fails or yields no
 * audio bytes, or when anything in the synthesis path throws.
 */
"talk.speak": async ({ params, respond }) => {
  if (!validateTalkSpeakParams(params)) {
    respond(
      false,
      undefined,
      errorShape(
        ErrorCodes.INVALID_REQUEST,
        `invalid talk.speak params: ${formatValidationErrors(validateTalkSpeakParams.errors)}`,
      ),
    );
    return;
  }

  // Validation only checks the raw text; whitespace-only input is rejected here.
  const text = trimString((params as { text?: unknown }).text);
  if (!text) {
    respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, "talk.speak requires text"));
    return;
  }

  try {
    const snapshot = await readConfigFileSnapshot();
    const setup = buildTalkTtsConfig(snapshot.config);
    if ("error" in setup) {
      respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, setup.error));
      return;
    }

    const overrides = buildTalkSpeakOverrides(setup.provider, setup.providerConfig, params);
    const result = await synthesizeSpeech({
      text,
      cfg: setup.cfg,
      overrides,
      // Synthesis is asked not to fall back (see `disableFallback`).
      disableFallback: true,
    });
    // A zero-length buffer is truthy, so check the length explicitly: an
    // empty `audioBase64` would contradict the result schema, which declares
    // it as NonEmptyString.
    if (!result.success || !result.audioBuffer || result.audioBuffer.length === 0) {
      respond(
        false,
        undefined,
        errorShape(ErrorCodes.UNAVAILABLE, result.error ?? "talk synthesis failed"),
      );
      return;
    }

    respond(
      true,
      {
        audioBase64: result.audioBuffer.toString("base64"),
        provider: result.provider ?? setup.provider,
        outputFormat: result.outputFormat,
        voiceCompatible: result.voiceCompatible,
        mimeType: inferMimeType(result.outputFormat, result.fileExtension),
        fileExtension: result.fileExtension,
      },
      undefined,
    );
  } catch (err) {
    respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
  }
},
|
||||
"talk.mode": ({ params, respond, context, client, isWebchatConnect }) => {
|
||||
if (client && isWebchatConnect(client.connect) && !context.hasConnectedMobileNode()) {
|
||||
respond(
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
import {
|
||||
loadOrCreateDeviceIdentity,
|
||||
publicKeyRawBase64UrlFromPem,
|
||||
@@ -41,6 +41,13 @@ type TalkConfigPayload = {
|
||||
};
|
||||
};
|
||||
type TalkConfig = NonNullable<NonNullable<TalkConfigPayload["config"]>["talk"]>;
|
||||
/**
 * Shape of the `talk.speak` response payload as seen by these tests.
 * Mirrors the fields the handler returns; all are optional here because the
 * payload is only present on successful responses.
 */
type TalkSpeakPayload = {
  audioBase64?: string;
  provider?: string;
  outputFormat?: string;
  // Returned by the handler and declared in the result schema; added so the
  // test type covers the full payload.
  voiceCompatible?: boolean;
  mimeType?: string;
  fileExtension?: string;
};
|
||||
const TALK_CONFIG_DEVICE_PATH = path.join(
|
||||
os.tmpdir(),
|
||||
`openclaw-talk-config-device-${process.pid}.json`,
|
||||
@@ -95,6 +102,10 @@ async function fetchTalkConfig(
|
||||
return rpcReq<TalkConfigPayload>(ws, "talk.config", params ?? {});
|
||||
}
|
||||
|
||||
/**
 * Issues a `talk.speak` RPC over the given gateway socket.
 *
 * @param ws - Connected gateway socket.
 * @param params - Raw request parameters, sent as-is.
 * @returns The RPC response with its payload typed as `TalkSpeakPayload`.
 */
async function fetchTalkSpeak(ws: GatewaySocket, params: Record<string, unknown>) {
  const method = "talk.speak";
  return rpcReq<TalkSpeakPayload>(ws, method, params);
}
|
||||
|
||||
function expectElevenLabsTalkConfig(
|
||||
talk: TalkConfig | undefined,
|
||||
expected: {
|
||||
@@ -236,4 +247,58 @@ describe("gateway talk.config", () => {
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
// End-to-end check of `talk.speak` with OpenAI as the active talk provider:
// global fetch is stubbed so no network call is made, and the outgoing
// request body is inspected to confirm per-request overrides win over the
// configured voice and model.
it("synthesizes talk audio via the active talk provider", async () => {
  const { writeConfigFile } = await import("../config/config.js");
  await writeConfigFile({
    talk: {
      provider: "openai",
      providers: {
        openai: {
          apiKey: "openai-talk-key", // pragma: allowlist secret
          voiceId: "alloy",
          modelId: "gpt-4o-mini-tts",
        },
      },
    },
  });

  // Swap in a fetch stub that records every request init and answers with
  // three fixed bytes; the real fetch is restored in `finally`.
  const realFetch = globalThis.fetch;
  const capturedInits: RequestInit[] = [];
  const fetchStub = vi.fn(async (_input: RequestInfo | URL, init?: RequestInit) => {
    if (init) {
      capturedInits.push(init);
    }
    return new Response(new Uint8Array([1, 2, 3]), { status: 200 });
  });
  globalThis.fetch = fetchStub as typeof fetch;

  try {
    await withServer(async (ws) => {
      await connectOperator(ws, ["operator.read", "operator.write"]);
      const res = await fetchTalkSpeak(ws, {
        text: "Hello from talk mode.",
        voiceId: "nova",
        modelId: "tts-1",
        speed: 1.25,
      });

      expect(res.ok).toBe(true);
      const payload = res.payload;
      expect(payload?.provider).toBe("openai");
      expect(payload?.outputFormat).toBe("mp3");
      expect(payload?.mimeType).toBe("audio/mpeg");
      expect(payload?.fileExtension).toBe(".mp3");
      expect(payload?.audioBase64).toBe(Buffer.from([1, 2, 3]).toString("base64"));
    });

    expect(fetchStub).toHaveBeenCalled();

    // The synthesis request is the one carrying a string (JSON) body.
    const jsonRequest = capturedInits.find((init) => typeof init.body === "string");
    expect(jsonRequest).toBeDefined();
    const sent = JSON.parse(jsonRequest?.body as string) as Record<string, unknown>;
    expect(sent.model).toBe("tts-1");
    expect(sent.voice).toBe("nova");
    expect(sent.speed).toBe(1.25);
  } finally {
    globalThis.fetch = realFetch;
  }
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user