feat(plugins): add speech provider registration

This commit is contained in:
Peter Steinberger
2026-03-16 18:49:55 -07:00
parent ad05cd9ab2
commit 662031a88e
35 changed files with 658 additions and 286 deletions

View File

@@ -0,0 +1,14 @@
import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core";
import { buildElevenLabsSpeechProvider } from "../../src/tts/providers/elevenlabs.js";
/**
 * Bundled plugin wrapper that exposes the ElevenLabs speech provider
 * through the OpenClaw plugin registry.
 */
function register(api: OpenClawPluginApi): void {
  api.registerSpeechProvider(buildElevenLabsSpeechProvider());
}

const elevenLabsPlugin = {
  id: "elevenlabs",
  name: "ElevenLabs Speech",
  description: "Bundled ElevenLabs speech provider",
  // No plugin-specific configuration; the schema is intentionally empty.
  configSchema: emptyPluginConfigSchema(),
  register,
};

export default elevenLabsPlugin;

View File

@@ -0,0 +1,8 @@
{
"id": "elevenlabs",
"configSchema": {
"type": "object",
"additionalProperties": false,
"properties": {}
}
}

View File

@@ -0,0 +1,12 @@
{
"name": "@openclaw/elevenlabs-speech",
"version": "2026.3.14",
"private": true,
"description": "OpenClaw ElevenLabs speech plugin",
"type": "module",
"openclaw": {
"extensions": [
"./index.ts"
]
}
}

View File

@@ -44,6 +44,7 @@ function fakeApi(overrides: Partial<OpenClawPluginApi> = {}): OpenClawPluginApi
registerCli() {},
registerService() {},
registerProvider() {},
registerSpeechProvider() {},
registerWebSearchProvider() {},
registerInteractiveHandler() {},
registerHook() {},

View File

@@ -0,0 +1,14 @@
import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core";
import { buildMicrosoftSpeechProvider } from "../../src/tts/providers/microsoft.js";
/**
 * Bundled plugin wrapper that exposes the Microsoft speech provider
 * through the OpenClaw plugin registry.
 */
function register(api: OpenClawPluginApi): void {
  api.registerSpeechProvider(buildMicrosoftSpeechProvider());
}

const microsoftPlugin = {
  id: "microsoft",
  name: "Microsoft Speech",
  description: "Bundled Microsoft speech provider",
  // No plugin-specific configuration; the schema is intentionally empty.
  configSchema: emptyPluginConfigSchema(),
  register,
};

export default microsoftPlugin;

View File

@@ -0,0 +1,8 @@
{
"id": "microsoft",
"configSchema": {
"type": "object",
"additionalProperties": false,
"properties": {}
}
}

View File

@@ -0,0 +1,12 @@
{
"name": "@openclaw/microsoft-speech",
"version": "2026.3.14",
"private": true,
"description": "OpenClaw Microsoft speech plugin",
"type": "module",
"openclaw": {
"extensions": [
"./index.ts"
]
}
}

View File

@@ -1,4 +1,5 @@
import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core";
import { buildOpenAISpeechProvider } from "../../src/tts/providers/openai.js";
import { buildOpenAICodexProviderPlugin } from "./openai-codex-provider.js";
import { buildOpenAIProvider } from "./openai-provider.js";
@@ -10,6 +11,7 @@ const openAIPlugin = {
register(api: OpenClawPluginApi) {
api.registerProvider(buildOpenAIProvider());
api.registerProvider(buildOpenAICodexProviderPlugin());
api.registerSpeechProvider(buildOpenAISpeechProvider());
},
};

View File

@@ -15,6 +15,7 @@ export function createTestPluginApi(api: TestPluginApiInput): OpenClawPluginApi
registerCli() {},
registerService() {},
registerProvider() {},
registerSpeechProvider() {},
registerWebSearchProvider() {},
registerInteractiveHandler() {},
registerCommand() {},

View File

@@ -80,7 +80,7 @@ const voiceCallConfigSchema = {
"streaming.streamPath": { label: "Media Stream Path", advanced: true },
"tts.provider": {
label: "TTS Provider Override",
help: "Deep-merges with messages.tts (Edge is ignored for calls).",
help: "Deep-merges with messages.tts (Microsoft is ignored for calls).",
advanced: true,
},
"tts.openai.model": { label: "OpenAI TTS Model", advanced: true },

View File

@@ -101,7 +101,7 @@
},
"tts.provider": {
"label": "TTS Provider Override",
"help": "Deep-merges with messages.tts (Edge is ignored for calls).",
"help": "Deep-merges with messages.tts (Microsoft is ignored for calls).",
"advanced": true
},
"tts.openai.model": {
@@ -420,8 +420,7 @@
"enum": ["final", "all"]
},
"provider": {
"type": "string",
"enum": ["openai", "elevenlabs", "edge"]
"type": "string"
},
"summaryModel": {
"type": "string"

View File

@@ -1,4 +1,5 @@
import { logVerbose } from "../../globals.js";
import { listSpeechProviders, normalizeSpeechProviderId } from "../../tts/provider-registry.js";
import {
getLastTtsAttempt,
getTtsMaxLength,
@@ -54,7 +55,7 @@ function ttsUsage(): ReplyPayload {
`• /tts summary [on|off] — View/change auto-summary\n` +
`• /tts audio <text> — Generate audio from text\n\n` +
`**Providers:**\n` +
`edge — Free, fast (default)\n` +
`microsoft — Microsoft Edge-backed speech (default fallback)\n` +
`• openai — High quality (requires API key)\n` +
`• elevenlabs — Premium voices (requires API key)\n\n` +
`**Text Limit (default: 1500, max: 4096):**\n` +
@@ -62,7 +63,7 @@ function ttsUsage(): ReplyPayload {
`• Summary ON: AI summarizes, then generates audio\n` +
`• Summary OFF: Truncates text, then generates audio\n\n` +
`**Examples:**\n` +
`/tts provider edge\n` +
`/tts provider microsoft\n` +
`/tts limit 2000\n` +
`/tts audio Hello, this is a test!`,
};
@@ -161,7 +162,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
if (!args.trim()) {
const hasOpenAI = Boolean(resolveTtsApiKey(config, "openai"));
const hasElevenLabs = Boolean(resolveTtsApiKey(config, "elevenlabs"));
const hasEdge = isTtsProviderConfigured(config, "edge");
const hasMicrosoft = isTtsProviderConfigured(config, "microsoft", params.cfg);
return {
shouldContinue: false,
reply: {
@@ -170,21 +171,23 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
`Primary: ${currentProvider}\n` +
`OpenAI key: ${hasOpenAI ? "✅" : "❌"}\n` +
`ElevenLabs key: ${hasElevenLabs ? "✅" : "❌"}\n` +
`Edge enabled: ${hasEdge ? "✅" : "❌"}\n` +
`Usage: /tts provider openai | elevenlabs | edge`,
`Microsoft enabled: ${hasMicrosoft ? "✅" : "❌"}\n` +
`Usage: /tts provider openai | elevenlabs | microsoft`,
},
};
}
const requested = args.trim().toLowerCase();
if (requested !== "openai" && requested !== "elevenlabs" && requested !== "edge") {
const knownProviders = new Set(listSpeechProviders(params.cfg).map((provider) => provider.id));
if (requested !== "edge" && !knownProviders.has(requested)) {
return { shouldContinue: false, reply: ttsUsage() };
}
const nextProvider = normalizeSpeechProviderId(requested) ?? requested;
setTtsProvider(prefsPath, requested);
return {
shouldContinue: false,
reply: { text: `✅ TTS provider set to ${requested}.` },
reply: { text: `✅ TTS provider set to ${nextProvider}.` },
};
}
@@ -249,7 +252,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
if (action === "status") {
const enabled = isTtsEnabled(config, prefsPath);
const provider = getTtsProvider(config, prefsPath);
const hasKey = isTtsProviderConfigured(config, provider);
const hasKey = isTtsProviderConfigured(config, provider, params.cfg);
const maxLength = getTtsMaxLength(prefsPath);
const summarize = isSummarizationEnabled(prefsPath);
const last = getLastTtsAttempt();

View File

@@ -91,6 +91,7 @@ const createRegistry = (channels: PluginRegistry["channels"]): PluginRegistry =>
enabled: true,
})),
providers: [],
speechProviders: [],
webSearchProviders: [],
gatewayHandlers: {},
httpRoutes: [],

View File

@@ -337,6 +337,7 @@ describe("ensureChannelSetupPluginInstalled", () => {
hookNames: [],
channelIds: [],
providerIds: [],
speechProviderIds: [],
webSearchProviderIds: [],
gatewayMethods: [],
cliCommands: [],

View File

@@ -1,6 +1,6 @@
import type { SecretInput } from "./types.secrets.js";
export type TtsProvider = "elevenlabs" | "openai" | "edge";
export type TtsProvider = string;
export type TtsMode = "final" | "all";
@@ -66,9 +66,22 @@ export type TtsConfig = {
/** System-level instructions for the TTS model (gpt-4o-mini-tts only). */
instructions?: string;
};
/** Microsoft Edge (node-edge-tts) configuration. */
/** Legacy alias for Microsoft speech configuration. */
edge?: {
/** Explicitly allow Edge TTS usage (no API key required). */
/** Explicitly allow Microsoft speech usage (no API key required). */
enabled?: boolean;
voice?: string;
lang?: string;
outputFormat?: string;
pitch?: string;
rate?: string;
volume?: string;
saveSubtitles?: boolean;
proxy?: string;
timeoutMs?: number;
};
/** Preferred alias for Microsoft speech configuration. */
microsoft?: {
enabled?: boolean;
voice?: string;
lang?: string;

View File

@@ -353,9 +353,24 @@ export const MarkdownConfigSchema = z
.strict()
.optional();
export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge"]);
export const TtsProviderSchema = z.string().min(1);
export const TtsModeSchema = z.enum(["final", "all"]);
export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]);
// Shared schema for Microsoft speech settings; reused for both the preferred
// `microsoft` key and the legacy `edge` alias inside TtsConfigSchema.
const TtsMicrosoftConfigSchema = z
.object({
enabled: z.boolean().optional(), // explicit opt-in; no API key is involved
voice: z.string().optional(),
lang: z.string().optional(),
outputFormat: z.string().optional(),
pitch: z.string().optional(),
rate: z.string().optional(),
volume: z.string().optional(),
saveSubtitles: z.boolean().optional(),
proxy: z.string().optional(),
timeoutMs: z.number().int().min(1000).max(120000).optional(), // 1s-120s window
})
.strict()
.optional();
export const TtsConfigSchema = z
.object({
auto: TtsAutoSchema.optional(),
@@ -409,21 +424,8 @@ export const TtsConfigSchema = z
})
.strict()
.optional(),
edge: z
.object({
enabled: z.boolean().optional(),
voice: z.string().optional(),
lang: z.string().optional(),
outputFormat: z.string().optional(),
pitch: z.string().optional(),
rate: z.string().optional(),
volume: z.string().optional(),
saveSubtitles: z.boolean().optional(),
proxy: z.string().optional(),
timeoutMs: z.number().int().min(1000).max(120000).optional(),
})
.strict()
.optional(),
edge: TtsMicrosoftConfigSchema,
microsoft: TtsMicrosoftConfigSchema,
prefsPath: z.string().optional(),
maxTextLength: z.number().int().min(1).optional(),
timeoutMs: z.number().int().min(1000).max(120000).optional(),

View File

@@ -1,4 +1,5 @@
import { loadConfig } from "../../config/config.js";
import { listSpeechProviders, normalizeSpeechProviderId } from "../../tts/provider-registry.js";
import {
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
@@ -26,9 +27,9 @@ export const ttsHandlers: GatewayRequestHandlers = {
const prefsPath = resolveTtsPrefsPath(config);
const provider = getTtsProvider(config, prefsPath);
const autoMode = resolveTtsAutoMode({ config, prefsPath });
const fallbackProviders = resolveTtsProviderOrder(provider)
const fallbackProviders = resolveTtsProviderOrder(provider, cfg)
.slice(1)
.filter((candidate) => isTtsProviderConfigured(config, candidate));
.filter((candidate) => isTtsProviderConfigured(config, candidate, cfg));
respond(true, {
enabled: isTtsEnabled(config, prefsPath),
auto: autoMode,
@@ -38,7 +39,7 @@ export const ttsHandlers: GatewayRequestHandlers = {
prefsPath,
hasOpenAIKey: Boolean(resolveTtsApiKey(config, "openai")),
hasElevenLabsKey: Boolean(resolveTtsApiKey(config, "elevenlabs")),
edgeEnabled: isTtsProviderConfigured(config, "edge"),
microsoftEnabled: isTtsProviderConfigured(config, "microsoft", cfg),
});
} catch (err) {
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
@@ -99,20 +100,23 @@ export const ttsHandlers: GatewayRequestHandlers = {
}
},
"tts.setProvider": async ({ params, respond }) => {
const provider = typeof params.provider === "string" ? params.provider.trim() : "";
if (provider !== "openai" && provider !== "elevenlabs" && provider !== "edge") {
const provider = normalizeSpeechProviderId(
typeof params.provider === "string" ? params.provider.trim() : "",
);
const cfg = loadConfig();
const knownProviders = new Set(listSpeechProviders(cfg).map((entry) => entry.id));
if (!provider || !knownProviders.has(provider)) {
respond(
false,
undefined,
errorShape(
ErrorCodes.INVALID_REQUEST,
"Invalid provider. Use openai, elevenlabs, or edge.",
"Invalid provider. Use a registered TTS provider id such as openai, elevenlabs, or microsoft.",
),
);
return;
}
try {
const cfg = loadConfig();
const config = resolveTtsConfig(cfg);
const prefsPath = resolveTtsPrefsPath(config);
setTtsProvider(prefsPath, provider);
@@ -127,27 +131,19 @@ export const ttsHandlers: GatewayRequestHandlers = {
const config = resolveTtsConfig(cfg);
const prefsPath = resolveTtsPrefsPath(config);
respond(true, {
providers: [
{
id: "openai",
name: "OpenAI",
configured: Boolean(resolveTtsApiKey(config, "openai")),
models: [...OPENAI_TTS_MODELS],
voices: [...OPENAI_TTS_VOICES],
},
{
id: "elevenlabs",
name: "ElevenLabs",
configured: Boolean(resolveTtsApiKey(config, "elevenlabs")),
models: ["eleven_multilingual_v2", "eleven_turbo_v2_5", "eleven_monolingual_v1"],
},
{
id: "edge",
name: "Edge TTS",
configured: isTtsProviderConfigured(config, "edge"),
models: [],
},
],
providers: listSpeechProviders(cfg).map((provider) => ({
id: provider.id,
name: provider.label,
configured: provider.isConfigured({ cfg, config }),
models:
provider.id === "openai" && provider.models == null
? [...OPENAI_TTS_MODELS]
: [...(provider.models ?? [])],
voices:
provider.id === "openai" && provider.voices == null
? [...OPENAI_TTS_VOICES]
: [...(provider.voices ?? [])],
})),
active: getTtsProvider(config, prefsPath),
});
} catch (err) {

View File

@@ -29,6 +29,7 @@ const createRegistry = (diagnostics: PluginDiagnostic[]): PluginRegistry => ({
channelSetups: [],
commands: [],
providers: [],
speechProviders: [],
webSearchProviders: [],
gatewayHandlers: {},
httpRoutes: [],

View File

@@ -1,25 +1,9 @@
import { vi } from "vitest";
import type { PluginRegistry } from "../plugins/registry.js";
import { createEmptyPluginRegistry, type PluginRegistry } from "../plugins/registry.js";
import { setActivePluginRegistry } from "../plugins/runtime.js";
export const registryState: { registry: PluginRegistry } = {
registry: {
plugins: [],
tools: [],
hooks: [],
typedHooks: [],
channels: [],
channelSetups: [],
providers: [],
webSearchProviders: [],
gatewayHandlers: {},
httpHandlers: [],
httpRoutes: [],
cliRegistrars: [],
services: [],
commands: [],
diagnostics: [],
} as PluginRegistry,
registry: createEmptyPluginRegistry(),
};
export function setRegistry(registry: PluginRegistry) {

View File

@@ -146,6 +146,7 @@ const createStubPluginRegistry = (): PluginRegistry => ({
],
channelSetups: [],
providers: [],
speechProviders: [],
webSearchProviders: [],
gatewayHandlers: {},
httpRoutes: [],

View File

@@ -21,6 +21,7 @@ export type {
ProviderResolveDynamicModelContext,
ProviderNormalizeResolvedModelContext,
ProviderRuntimeModel,
SpeechProviderPlugin,
ProviderThinkingPolicyContext,
ProviderWrapStreamFnContext,
OpenClawPluginService,

View File

@@ -140,6 +140,7 @@ export type {
ProviderResolveDynamicModelContext,
ProviderNormalizeResolvedModelContext,
ProviderRuntimeModel,
SpeechProviderPlugin,
ProviderThinkingPolicyContext,
ProviderWrapStreamFnContext,
} from "../plugins/types.js";

View File

@@ -494,6 +494,7 @@ function createPluginRecord(params: {
hookNames: [],
channelIds: [],
providerIds: [],
speechProviderIds: [],
webSearchProviderIds: [],
gatewayMethods: [],
cliCommands: [],

View File

@@ -46,6 +46,7 @@ import type {
PluginHookName,
PluginHookHandlerMap,
PluginHookRegistration as TypedPluginHookRegistration,
SpeechProviderPlugin,
WebSearchProviderPlugin,
} from "./types.js";
@@ -110,6 +111,14 @@ export type PluginWebSearchProviderRegistration = {
rootDir?: string;
};
export type PluginSpeechProviderRegistration = {
pluginId: string;
pluginName?: string;
provider: SpeechProviderPlugin;
source: string;
rootDir?: string;
};
export type PluginHookRegistration = {
pluginId: string;
entry: HookEntry;
@@ -154,6 +163,7 @@ export type PluginRecord = {
hookNames: string[];
channelIds: string[];
providerIds: string[];
speechProviderIds: string[];
webSearchProviderIds: string[];
gatewayMethods: string[];
cliCommands: string[];
@@ -174,6 +184,7 @@ export type PluginRegistry = {
channels: PluginChannelRegistration[];
channelSetups: PluginChannelSetupRegistration[];
providers: PluginProviderRegistration[];
speechProviders: PluginSpeechProviderRegistration[];
webSearchProviders: PluginWebSearchProviderRegistration[];
gatewayHandlers: GatewayRequestHandlers;
httpRoutes: PluginHttpRouteRegistration[];
@@ -219,6 +230,7 @@ export function createEmptyPluginRegistry(): PluginRegistry {
channels: [],
channelSetups: [],
providers: [],
speechProviders: [],
webSearchProviders: [],
gatewayHandlers: {},
httpRoutes: [],
@@ -550,6 +562,37 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
});
};
// Registers a plugin-provided speech (TTS) provider on the shared registry.
// Invalid registrations (blank id, duplicate id) emit an error diagnostic
// attributed to the registering plugin instead of throwing.
const registerSpeechProvider = (record: PluginRecord, provider: SpeechProviderPlugin) => {
const id = provider.id.trim();
if (!id) {
pushDiagnostic({
level: "error",
pluginId: record.id,
source: record.source,
message: "speech provider registration missing id",
});
return;
}
// First registration of an id wins; later plugins get a duplicate diagnostic.
const existing = registry.speechProviders.find((entry) => entry.provider.id === id);
if (existing) {
pushDiagnostic({
level: "error",
pluginId: record.id,
source: record.source,
message: `speech provider already registered: ${id} (${existing.pluginId})`,
});
return;
}
// Track the id on the plugin record (for introspection) and on the registry.
record.speechProviderIds.push(id);
registry.speechProviders.push({
pluginId: record.id,
pluginName: record.name,
provider,
source: record.source,
rootDir: record.rootDir,
});
};
const registerWebSearchProvider = (record: PluginRecord, provider: WebSearchProviderPlugin) => {
const id = provider.id.trim();
if (!id) {
@@ -789,6 +832,10 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
registerChannel: (registration) => registerChannel(record, registration, registrationMode),
registerProvider:
registrationMode === "full" ? (provider) => registerProvider(record, provider) : () => {},
registerSpeechProvider:
registrationMode === "full"
? (provider) => registerSpeechProvider(record, provider)
: () => {},
registerWebSearchProvider:
registrationMode === "full"
? (provider) => registerWebSearchProvider(record, provider)
@@ -862,6 +909,7 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
registerTool,
registerChannel,
registerProvider,
registerSpeechProvider,
registerWebSearchProvider,
registerGatewayMethod,
registerCli,

View File

@@ -27,6 +27,14 @@ import type { HookEntry } from "../hooks/types.js";
import type { ProviderUsageSnapshot } from "../infra/provider-usage.types.js";
import type { RuntimeEnv } from "../runtime.js";
import type { RuntimeWebSearchMetadata } from "../secrets/runtime-web-tools.types.js";
import type {
SpeechProviderConfiguredContext,
SpeechProviderId,
SpeechSynthesisRequest,
SpeechSynthesisResult,
SpeechTelephonySynthesisRequest,
SpeechTelephonySynthesisResult,
} from "../tts/provider-types.js";
import type { WizardPrompter } from "../wizard/prompts.js";
import type { PluginRuntime } from "./runtime/types.js";
@@ -853,6 +861,23 @@ export type PluginWebSearchProviderEntry = WebSearchProviderPlugin & {
pluginId: string;
};
/** A pluggable text-to-speech provider, registered via `registerSpeechProvider`. */
export type SpeechProviderPlugin = {
/** Canonical provider id (e.g. "openai", "elevenlabs", "microsoft"). */
id: SpeechProviderId;
/** Human-readable display name. */
label: string;
/** Alternate ids accepted for this provider (e.g. legacy "edge" for microsoft). */
aliases?: string[];
/** Supported model ids, when the provider exposes a fixed list. */
models?: readonly string[];
/** Supported voice ids, when the provider exposes a fixed list. */
voices?: readonly string[];
/** True when the provider has what it needs (API key, enabled flag, ...) to run. */
isConfigured: (ctx: SpeechProviderConfiguredContext) => boolean;
/** Synthesize speech for a chat attachment or voice note. */
synthesize: (req: SpeechSynthesisRequest) => Promise<SpeechSynthesisResult>;
/** Optional telephony synthesis (raw PCM output); absent if unsupported. */
synthesizeTelephony?: (
req: SpeechTelephonySynthesisRequest,
) => Promise<SpeechTelephonySynthesisResult>;
};
/** A registered speech provider annotated with the plugin that supplied it. */
export type PluginSpeechProviderEntry = SpeechProviderPlugin & {
pluginId: string;
};
export type OpenClawPluginGatewayMethod = {
method: string;
handler: GatewayRequestHandler;
@@ -1211,6 +1236,7 @@ export type OpenClawPluginApi = {
registerCli: (registrar: OpenClawPluginCliRegistrar, opts?: { commands?: string[] }) => void;
registerService: (service: OpenClawPluginService) => void;
registerProvider: (provider: ProviderPlugin) => void;
registerSpeechProvider: (provider: SpeechProviderPlugin) => void;
registerWebSearchProvider: (provider: WebSearchProviderPlugin) => void;
registerInteractiveHandler: (registration: PluginInteractiveHandlerRegistration) => void;
/**

View File

@@ -26,6 +26,7 @@ export const createTestRegistry = (channels: TestChannelRegistration[] = []): Pl
enabled: true,
})),
providers: [],
speechProviders: [],
webSearchProviders: [],
gatewayHandlers: {},
httpRoutes: [],

View File

@@ -2,29 +2,36 @@ import type {
AnyAgentTool,
OpenClawPluginApi,
ProviderPlugin,
SpeechProviderPlugin,
WebSearchProviderPlugin,
} from "../plugins/types.js";
export type CapturedPluginRegistration = {
api: OpenClawPluginApi;
providers: ProviderPlugin[];
speechProviders: SpeechProviderPlugin[];
webSearchProviders: WebSearchProviderPlugin[];
tools: AnyAgentTool[];
};
export function createCapturedPluginRegistration(): CapturedPluginRegistration {
const providers: ProviderPlugin[] = [];
const speechProviders: SpeechProviderPlugin[] = [];
const webSearchProviders: WebSearchProviderPlugin[] = [];
const tools: AnyAgentTool[] = [];
return {
providers,
speechProviders,
webSearchProviders,
tools,
api: {
registerProvider(provider: ProviderPlugin) {
providers.push(provider);
},
registerSpeechProvider(provider: SpeechProviderPlugin) {
speechProviders.push(provider);
},
registerWebSearchProvider(provider: WebSearchProviderPlugin) {
webSearchProviders.push(provider);
},

View File

@@ -0,0 +1,84 @@
import type { OpenClawConfig } from "../config/config.js";
import { loadOpenClawPlugins } from "../plugins/loader.js";
import { getActivePluginRegistry } from "../plugins/runtime.js";
import type { SpeechProviderPlugin } from "../plugins/types.js";
import type { SpeechProviderId } from "./provider-types.js";
import { buildElevenLabsSpeechProvider } from "./providers/elevenlabs.js";
import { buildMicrosoftSpeechProvider } from "./providers/microsoft.js";
import { buildOpenAISpeechProvider } from "./providers/openai.js";
// Providers that ship with the core and are always available, regardless of
// which plugins are installed. Plugin-registered providers are merged in later
// and may shadow these ids (see buildProviderMaps).
const BUILTIN_SPEECH_PROVIDERS: readonly SpeechProviderPlugin[] = [
buildOpenAISpeechProvider(),
buildElevenLabsSpeechProvider(),
buildMicrosoftSpeechProvider(),
];
/**
 * Normalize a raw provider id: trim whitespace, lowercase, and collapse
 * empty/absent input to `undefined`.
 */
function trimToUndefined(value: string | undefined): string | undefined {
  const normalized = value == null ? "" : value.trim().toLowerCase();
  return normalized.length > 0 ? normalized : undefined;
}
/**
 * Canonicalize a user-supplied speech provider id.
 *
 * Trims and lowercases the input, maps the legacy "edge" id to "microsoft",
 * and returns `undefined` for blank or missing input.
 */
export function normalizeSpeechProviderId(
  providerId: string | undefined,
): SpeechProviderId | undefined {
  const id = providerId?.trim().toLowerCase();
  if (!id) {
    return undefined;
  }
  // "edge" is the historical name for the Microsoft provider.
  return id === "edge" ? "microsoft" : id;
}
/**
 * Collect speech providers contributed by plugins.
 *
 * Uses the active plugin registry when it already has speech providers (or
 * when no config is available to load plugins); otherwise loads plugins
 * fresh from the given config.
 */
function resolveSpeechProviderPluginEntries(cfg?: OpenClawConfig): SpeechProviderPlugin[] {
  const active = getActivePluginRegistry();
  const activeHasSpeech = (active?.speechProviders?.length ?? 0) > 0;
  const registry = activeHasSpeech || !cfg ? active : loadOpenClawPlugins({ config: cfg });
  const entries = registry?.speechProviders ?? [];
  return entries.map((entry) => entry.provider);
}
/**
 * Build lookup maps over built-in plus plugin-contributed providers.
 *
 * `canonical` maps normalized canonical id -> provider; `aliases` additionally
 * maps each normalized alias to its provider. Later registrations (plugins)
 * overwrite earlier ones (built-ins) for the same id.
 */
function buildProviderMaps(cfg?: OpenClawConfig): {
  canonical: Map<string, SpeechProviderPlugin>;
  aliases: Map<string, SpeechProviderPlugin>;
} {
  const canonical = new Map<string, SpeechProviderPlugin>();
  const aliases = new Map<string, SpeechProviderPlugin>();
  const allProviders = [...BUILTIN_SPEECH_PROVIDERS, ...resolveSpeechProviderPluginEntries(cfg)];
  for (const provider of allProviders) {
    const id = normalizeSpeechProviderId(provider.id);
    if (!id) {
      continue;
    }
    canonical.set(id, provider);
    aliases.set(id, provider);
    for (const alias of provider.aliases ?? []) {
      const aliasId = normalizeSpeechProviderId(alias);
      if (aliasId) {
        aliases.set(aliasId, provider);
      }
    }
  }
  return { canonical, aliases };
}
/** List all known speech providers (built-ins plus plugin contributions), one per canonical id. */
export function listSpeechProviders(cfg?: OpenClawConfig): SpeechProviderPlugin[] {
  const { canonical } = buildProviderMaps(cfg);
  return Array.from(canonical.values());
}
/**
 * Look up a speech provider by id or alias (case-insensitive; "edge" resolves
 * to the Microsoft provider). Returns `undefined` for blank/unknown ids.
 */
export function getSpeechProvider(
  providerId: string | undefined,
  cfg?: OpenClawConfig,
): SpeechProviderPlugin | undefined {
  const id = normalizeSpeechProviderId(providerId);
  if (id === undefined) {
    return undefined;
  }
  const { aliases } = buildProviderMaps(cfg);
  return aliases.get(id);
}

38
src/tts/provider-types.ts Normal file
View File

@@ -0,0 +1,38 @@
import type { OpenClawConfig } from "../config/config.js";
import type { ResolvedTtsConfig, TtsDirectiveOverrides } from "./tts.js";
/** Canonical identifier for a speech provider (e.g. "openai", "elevenlabs", "microsoft"). */
export type SpeechProviderId = string;
/** Where the synthesized audio is headed: a plain file attachment or a chat voice note. */
export type SpeechSynthesisTarget = "audio-file" | "voice-note";
/** Context handed to SpeechProviderPlugin.isConfigured. */
export type SpeechProviderConfiguredContext = {
// Full app config; optional because some callers only have resolved TTS settings.
cfg?: OpenClawConfig;
// Resolved TTS settings (keys, voices, per-provider options).
config: ResolvedTtsConfig;
};
/** Request for a regular (non-telephony) synthesis. */
export type SpeechSynthesisRequest = {
text: string;
cfg: OpenClawConfig;
config: ResolvedTtsConfig;
target: SpeechSynthesisTarget;
// Per-message directive overrides (e.g. [[tts:...]] tags), if any.
overrides?: TtsDirectiveOverrides;
};
/** Result of a regular synthesis. */
export type SpeechSynthesisResult = {
audioBuffer: Buffer;
// Provider-specific format string of audioBuffer (e.g. "mp3", "opus_48000_64").
outputFormat: string;
// Extension including the dot (e.g. ".mp3"), used when writing the file.
fileExtension: string;
// True when the audio can be sent as a chat voice note.
voiceCompatible: boolean;
};
/** Request for telephony (voice-call) synthesis; no target/overrides apply. */
export type SpeechTelephonySynthesisRequest = {
text: string;
cfg: OpenClawConfig;
config: ResolvedTtsConfig;
};
/** Result of telephony synthesis: raw audio plus its sample rate. */
export type SpeechTelephonySynthesisResult = {
audioBuffer: Buffer;
outputFormat: string;
sampleRate: number;
};

View File

@@ -0,0 +1,73 @@
import type { SpeechProviderPlugin } from "../../plugins/types.js";
import { elevenLabsTTS } from "../tts-core.js";
const ELEVENLABS_TTS_MODELS = [
"eleven_multilingual_v2",
"eleven_turbo_v2_5",
"eleven_monolingual_v1",
] as const;
/**
 * Build the ElevenLabs speech provider plugin.
 *
 * Configured when an API key is present either in the resolved TTS config or
 * in the ELEVENLABS_API_KEY / XI_API_KEY environment variables. Supports both
 * regular synthesis (mp3 / opus voice notes) and telephony (raw 22.05 kHz PCM).
 */
export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
  // Resolved per call so environment changes take effect without a rebuild.
  const lookupApiKey = (configured: string | undefined) =>
    configured || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;

  return {
    id: "elevenlabs",
    label: "ElevenLabs",
    models: ELEVENLABS_TTS_MODELS,
    isConfigured: ({ config }) => Boolean(lookupApiKey(config.elevenlabs.apiKey)),
    synthesize: async (req) => {
      const settings = req.config.elevenlabs;
      const apiKey = lookupApiKey(settings.apiKey);
      if (!apiKey) {
        throw new Error("ElevenLabs API key missing");
      }
      const voiceNote = req.target === "voice-note";
      // Voice notes want opus; plain audio attachments get mp3.
      const outputFormat = voiceNote ? "opus_48000_64" : "mp3_44100_128";
      const override = req.overrides?.elevenlabs;
      const audioBuffer = await elevenLabsTTS({
        text: req.text,
        apiKey,
        baseUrl: settings.baseUrl,
        // Per-message directive overrides win over configured defaults.
        voiceId: override?.voiceId ?? settings.voiceId,
        modelId: override?.modelId ?? settings.modelId,
        outputFormat,
        seed: override?.seed ?? settings.seed,
        applyTextNormalization: override?.applyTextNormalization ?? settings.applyTextNormalization,
        languageCode: override?.languageCode ?? settings.languageCode,
        // Shallow-merge voice settings; override fields take precedence.
        voiceSettings: {
          ...settings.voiceSettings,
          ...override?.voiceSettings,
        },
        timeoutMs: req.config.timeoutMs,
      });
      return {
        audioBuffer,
        outputFormat,
        fileExtension: voiceNote ? ".opus" : ".mp3",
        voiceCompatible: voiceNote,
      };
    },
    synthesizeTelephony: async (req) => {
      const settings = req.config.elevenlabs;
      const apiKey = lookupApiKey(settings.apiKey);
      if (!apiKey) {
        throw new Error("ElevenLabs API key missing");
      }
      // Telephony consumers expect raw PCM at 22.05 kHz; no overrides apply here.
      const outputFormat = "pcm_22050";
      const sampleRate = 22_050;
      const audioBuffer = await elevenLabsTTS({
        text: req.text,
        apiKey,
        baseUrl: settings.baseUrl,
        voiceId: settings.voiceId,
        modelId: settings.modelId,
        outputFormat,
        seed: settings.seed,
        applyTextNormalization: settings.applyTextNormalization,
        languageCode: settings.languageCode,
        voiceSettings: settings.voiceSettings,
        timeoutMs: req.config.timeoutMs,
      });
      return { audioBuffer, outputFormat, sampleRate };
    },
  };
}

View File

@@ -0,0 +1,60 @@
import { mkdirSync, mkdtempSync, readFileSync, rmSync } from "node:fs";
import path from "node:path";
import { resolvePreferredOpenClawTmpDir } from "../../infra/tmp-openclaw-dir.js";
import { isVoiceCompatibleAudio } from "../../media/audio.js";
import type { SpeechProviderPlugin } from "../../plugins/types.js";
import { edgeTTS, inferEdgeExtension } from "../tts-core.js";
// Format we retry with when the configured Edge output format fails.
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
/**
 * Build the Microsoft (Edge-backed) speech provider plugin.
 *
 * Accepts "edge" as a legacy alias. Configured via the `edge`-keyed settings
 * in the resolved TTS config (presumably the `microsoft` config key is merged
 * into `edge` upstream — confirm in resolveTtsConfig). Synthesis writes to a
 * private temp dir because edgeTTS produces a file rather than a buffer, and
 * retries once with the default output format if a custom format fails.
 * No telephony support.
 */
export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin {
return {
id: "microsoft",
label: "Microsoft",
aliases: ["edge"],
// No API key needed; gated purely on the explicit enabled flag.
isConfigured: ({ config }) => config.edge.enabled,
synthesize: async (req) => {
// Private (0700) temp dir per synthesis call; removed in the finally block.
const tempRoot = resolvePreferredOpenClawTmpDir();
mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
const tempDir = mkdtempSync(path.join(tempRoot, "tts-microsoft-"));
let outputFormat = req.config.edge.outputFormat;
// Only fall back when the configured format differs from the default.
const fallbackOutputFormat =
outputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined;
try {
// Run one synthesis attempt with the given output format.
const runEdge = async (format: string) => {
const fileExtension = inferEdgeExtension(format);
const outputPath = path.join(tempDir, `speech${fileExtension}`);
await edgeTTS({
text: req.text,
outputPath,
config: {
...req.config.edge,
outputFormat: format,
},
timeoutMs: req.config.timeoutMs,
});
// Read the produced file back into memory before the temp dir is deleted.
const audioBuffer = readFileSync(outputPath);
return {
audioBuffer,
outputFormat: format,
fileExtension,
voiceCompatible: isVoiceCompatibleAudio({ fileName: outputPath }),
};
};
try {
return await runEdge(outputFormat);
} catch (err) {
// Retry once with the default format; rethrow if no distinct fallback exists.
if (!fallbackOutputFormat || fallbackOutputFormat === outputFormat) {
throw err;
}
outputFormat = fallbackOutputFormat;
return await runEdge(outputFormat);
}
} finally {
// Always clean up the temp dir, even when synthesis throws.
rmSync(tempDir, { recursive: true, force: true });
}
},
};
}

View File

@@ -0,0 +1,56 @@
import type { SpeechProviderPlugin } from "../../plugins/types.js";
import { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, openaiTTS } from "../tts-core.js";
/**
 * Build the OpenAI speech provider plugin.
 *
 * Configured when an API key is present either in the resolved TTS config or
 * in the OPENAI_API_KEY environment variable. Supports regular synthesis
 * (mp3 / opus voice notes) and telephony (raw 24 kHz PCM).
 */
export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
  // Resolved per call so environment changes take effect without a rebuild.
  const lookupApiKey = (configured: string | undefined) =>
    configured || process.env.OPENAI_API_KEY;

  return {
    id: "openai",
    label: "OpenAI",
    models: OPENAI_TTS_MODELS,
    voices: OPENAI_TTS_VOICES,
    isConfigured: ({ config }) => Boolean(lookupApiKey(config.openai.apiKey)),
    synthesize: async (req) => {
      const settings = req.config.openai;
      const apiKey = lookupApiKey(settings.apiKey);
      if (!apiKey) {
        throw new Error("OpenAI API key missing");
      }
      const voiceNote = req.target === "voice-note";
      // Voice notes want opus; plain audio attachments get mp3.
      const responseFormat = voiceNote ? "opus" : "mp3";
      const audioBuffer = await openaiTTS({
        text: req.text,
        apiKey,
        baseUrl: settings.baseUrl,
        // Per-message directive overrides win over configured defaults.
        model: req.overrides?.openai?.model ?? settings.model,
        voice: req.overrides?.openai?.voice ?? settings.voice,
        speed: settings.speed,
        instructions: settings.instructions,
        responseFormat,
        timeoutMs: req.config.timeoutMs,
      });
      return {
        audioBuffer,
        outputFormat: responseFormat,
        fileExtension: voiceNote ? ".opus" : ".mp3",
        voiceCompatible: voiceNote,
      };
    },
    synthesizeTelephony: async (req) => {
      const settings = req.config.openai;
      const apiKey = lookupApiKey(settings.apiKey);
      if (!apiKey) {
        throw new Error("OpenAI API key missing");
      }
      // Telephony consumers expect raw PCM at 24 kHz; no overrides apply here.
      const outputFormat = "pcm";
      const sampleRate = 24_000;
      const audioBuffer = await openaiTTS({
        text: req.text,
        apiKey,
        baseUrl: settings.baseUrl,
        model: settings.model,
        voice: settings.voice,
        speed: settings.speed,
        instructions: settings.instructions,
        responseFormat: outputFormat,
        timeoutMs: req.config.timeoutMs,
      });
      return { audioBuffer, outputFormat, sampleRate };
    },
  };
}

View File

@@ -156,10 +156,13 @@ export function parseTtsDirectives(
if (!policy.allowProvider) {
break;
}
if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") {
overrides.provider = rawValue;
} else {
warnings.push(`unsupported provider "${rawValue}"`);
{
const providerId = rawValue.trim().toLowerCase();
if (providerId) {
overrides.provider = providerId;
} else {
warnings.push("invalid provider id");
}
}
break;
case "voice":

View File

@@ -311,7 +311,7 @@ describe("tts", () => {
expect(result.overrides.elevenlabs?.voiceSettings?.speed).toBe(1.1);
});
it("accepts edge as provider override", () => {
it("accepts edge as a legacy microsoft provider override", () => {
const policy = resolveModelOverridePolicy({ enabled: true, allowProvider: true });
const input = "Hello [[tts:provider=edge]] world";
const result = parseTtsDirectives(input, policy);
@@ -524,8 +524,8 @@ describe("tts", () => {
ELEVENLABS_API_KEY: undefined,
XI_API_KEY: undefined,
},
prefsPath: "/tmp/tts-prefs-edge.json",
expected: "edge",
prefsPath: "/tmp/tts-prefs-microsoft.json",
expected: "microsoft",
},
] as const;
@@ -539,6 +539,25 @@ describe("tts", () => {
});
});
describe("resolveTtsConfig provider normalization", () => {
it("normalizes legacy edge provider ids to microsoft", () => {
const config = resolveTtsConfig({
agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
messages: {
tts: {
provider: "edge",
edge: {
enabled: true,
},
},
},
});
expect(config.provider).toBe("microsoft");
expect(getTtsProvider(config, "/tmp/tts-prefs-normalized.json")).toBe("microsoft");
});
});
describe("resolveTtsConfig openai.baseUrl", () => {
const baseCfg: OpenClawConfig = {
agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },

View File

@@ -5,7 +5,6 @@ import {
readFileSync,
writeFileSync,
mkdtempSync,
rmSync,
renameSync,
unlinkSync,
} from "node:fs";
@@ -25,20 +24,20 @@ import type {
import { logVerbose } from "../globals.js";
import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
import { stripMarkdown } from "../line/markdown-to-line.js";
import { isVoiceCompatibleAudio } from "../media/audio.js";
import { CONFIG_DIR, resolveUserPath } from "../utils.js";
import {
getSpeechProvider,
listSpeechProviders,
normalizeSpeechProviderId,
} from "./provider-registry.js";
import {
DEFAULT_OPENAI_BASE_URL,
edgeTTS,
elevenLabsTTS,
inferEdgeExtension,
isValidOpenAIModel,
isValidOpenAIVoice,
isValidVoiceId,
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
resolveOpenAITtsInstructions,
openaiTTS,
parseTtsDirectives,
scheduleCleanup,
summarizeText,
@@ -83,11 +82,6 @@ const DEFAULT_OUTPUT = {
voiceCompatible: false,
};
const TELEPHONY_OUTPUT = {
openai: { format: "pcm" as const, sampleRate: 24000 },
elevenlabs: { format: "pcm_22050", sampleRate: 22050 },
};
const TTS_AUTO_MODES = new Set<TtsAutoMode>(["off", "always", "inbound", "tagged"]);
export type ResolvedTtsConfig = {
@@ -261,12 +255,13 @@ function resolveModelOverridePolicy(
export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
const raw: TtsConfig = cfg.messages?.tts ?? {};
const providerSource = raw.provider ? "config" : "default";
const edgeOutputFormat = raw.edge?.outputFormat?.trim();
const rawMicrosoft = { ...raw.edge, ...raw.microsoft };
const edgeOutputFormat = rawMicrosoft.outputFormat?.trim();
const auto = normalizeTtsAutoMode(raw.auto) ?? (raw.enabled ? "always" : "off");
return {
auto,
mode: raw.mode ?? "final",
provider: raw.provider ?? "edge",
provider: normalizeSpeechProviderId(raw.provider) ?? "microsoft",
providerSource,
summaryModel: raw.summaryModel?.trim() || undefined,
modelOverrides: resolveModelOverridePolicy(raw.modelOverrides),
@@ -311,17 +306,17 @@ export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
instructions: raw.openai?.instructions?.trim() || undefined,
},
edge: {
enabled: raw.edge?.enabled ?? true,
voice: raw.edge?.voice?.trim() || DEFAULT_EDGE_VOICE,
lang: raw.edge?.lang?.trim() || DEFAULT_EDGE_LANG,
enabled: rawMicrosoft.enabled ?? true,
voice: rawMicrosoft.voice?.trim() || DEFAULT_EDGE_VOICE,
lang: rawMicrosoft.lang?.trim() || DEFAULT_EDGE_LANG,
outputFormat: edgeOutputFormat || DEFAULT_EDGE_OUTPUT_FORMAT,
outputFormatConfigured: Boolean(edgeOutputFormat),
pitch: raw.edge?.pitch?.trim() || undefined,
rate: raw.edge?.rate?.trim() || undefined,
volume: raw.edge?.volume?.trim() || undefined,
saveSubtitles: raw.edge?.saveSubtitles ?? false,
proxy: raw.edge?.proxy?.trim() || undefined,
timeoutMs: raw.edge?.timeoutMs,
pitch: rawMicrosoft.pitch?.trim() || undefined,
rate: rawMicrosoft.rate?.trim() || undefined,
volume: rawMicrosoft.volume?.trim() || undefined,
saveSubtitles: rawMicrosoft.saveSubtitles ?? false,
proxy: rawMicrosoft.proxy?.trim() || undefined,
timeoutMs: rawMicrosoft.timeoutMs,
},
prefsPath: raw.prefsPath,
maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH,
@@ -448,11 +443,12 @@ export function setTtsEnabled(prefsPath: string, enabled: boolean): void {
export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): TtsProvider {
const prefs = readPrefs(prefsPath);
if (prefs.tts?.provider) {
return prefs.tts.provider;
const prefsProvider = normalizeSpeechProviderId(prefs.tts?.provider);
if (prefsProvider) {
return prefsProvider;
}
if (config.providerSource === "config") {
return config.provider;
return normalizeSpeechProviderId(config.provider) ?? config.provider;
}
if (resolveTtsApiKey(config, "openai")) {
@@ -461,12 +457,12 @@ export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): Tt
if (resolveTtsApiKey(config, "elevenlabs")) {
return "elevenlabs";
}
return "edge";
return "microsoft";
}
export function setTtsProvider(prefsPath: string, provider: TtsProvider): void {
updatePrefs(prefsPath, (prefs) => {
prefs.tts = { ...prefs.tts, provider };
prefs.tts = { ...prefs.tts, provider: normalizeSpeechProviderId(provider) ?? provider };
});
}
@@ -522,26 +518,42 @@ export function resolveTtsApiKey(
config: ResolvedTtsConfig,
provider: TtsProvider,
): string | undefined {
if (provider === "elevenlabs") {
const normalizedProvider = normalizeSpeechProviderId(provider);
if (normalizedProvider === "elevenlabs") {
return config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
}
if (provider === "openai") {
if (normalizedProvider === "openai") {
return config.openai.apiKey || process.env.OPENAI_API_KEY;
}
return undefined;
}
export const TTS_PROVIDERS = ["openai", "elevenlabs", "edge"] as const;
export const TTS_PROVIDERS = ["openai", "elevenlabs", "microsoft"] as const;
export function resolveTtsProviderOrder(primary: TtsProvider): TtsProvider[] {
return [primary, ...TTS_PROVIDERS.filter((provider) => provider !== primary)];
export function resolveTtsProviderOrder(primary: TtsProvider, cfg?: OpenClawConfig): TtsProvider[] {
const normalizedPrimary = normalizeSpeechProviderId(primary) ?? primary;
const ordered = new Set<TtsProvider>([normalizedPrimary]);
for (const provider of TTS_PROVIDERS) {
if (provider !== normalizedPrimary) {
ordered.add(provider);
}
}
for (const provider of listSpeechProviders(cfg)) {
const normalized = normalizeSpeechProviderId(provider.id) ?? provider.id;
if (normalized !== normalizedPrimary) {
ordered.add(normalized);
}
}
return [...ordered];
}
export function isTtsProviderConfigured(config: ResolvedTtsConfig, provider: TtsProvider): boolean {
if (provider === "edge") {
return config.edge.enabled;
}
return Boolean(resolveTtsApiKey(config, provider));
export function isTtsProviderConfigured(
config: ResolvedTtsConfig,
provider: TtsProvider,
cfg?: OpenClawConfig,
): boolean {
const resolvedProvider = getSpeechProvider(provider, cfg);
return resolvedProvider?.isConfigured({ cfg, config }) ?? false;
}
function formatTtsProviderError(provider: TtsProvider, err: unknown): string {
@@ -581,10 +593,10 @@ function resolveTtsRequestSetup(params: {
}
const userProvider = getTtsProvider(config, prefsPath);
const provider = params.providerOverride ?? userProvider;
const provider = normalizeSpeechProviderId(params.providerOverride) ?? userProvider;
return {
config,
providers: resolveTtsProviderOrder(provider),
providers: resolveTtsProviderOrder(provider, params.cfg),
};
}
@@ -607,136 +619,36 @@ export async function textToSpeech(params: {
const { config, providers } = setup;
const channelId = resolveChannelId(params.channel);
const output = resolveOutputFormat(channelId);
const target = channelId && VOICE_BUBBLE_CHANNELS.has(channelId) ? "voice-note" : "audio-file";
const errors: string[] = [];
for (const provider of providers) {
const providerStart = Date.now();
try {
if (provider === "edge") {
if (!config.edge.enabled) {
errors.push("edge: disabled");
continue;
}
const tempRoot = resolvePreferredOpenClawTmpDir();
mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
const tempDir = mkdtempSync(path.join(tempRoot, "tts-"));
let edgeOutputFormat = resolveEdgeOutputFormat(config);
const fallbackEdgeOutputFormat =
edgeOutputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined;
const attemptEdgeTts = async (outputFormat: string) => {
const extension = inferEdgeExtension(outputFormat);
const audioPath = path.join(tempDir, `voice-${Date.now()}${extension}`);
await edgeTTS({
text: params.text,
outputPath: audioPath,
config: {
...config.edge,
outputFormat,
},
timeoutMs: config.timeoutMs,
});
return { audioPath, outputFormat };
};
let edgeResult: { audioPath: string; outputFormat: string };
try {
edgeResult = await attemptEdgeTts(edgeOutputFormat);
} catch (err) {
if (fallbackEdgeOutputFormat && fallbackEdgeOutputFormat !== edgeOutputFormat) {
logVerbose(
`TTS: Edge output ${edgeOutputFormat} failed; retrying with ${fallbackEdgeOutputFormat}.`,
);
edgeOutputFormat = fallbackEdgeOutputFormat;
try {
edgeResult = await attemptEdgeTts(edgeOutputFormat);
} catch (fallbackErr) {
try {
rmSync(tempDir, { recursive: true, force: true });
} catch {
// ignore cleanup errors
}
throw fallbackErr;
}
} else {
try {
rmSync(tempDir, { recursive: true, force: true });
} catch {
// ignore cleanup errors
}
throw err;
}
}
scheduleCleanup(tempDir);
const voiceCompatible = isVoiceCompatibleAudio({ fileName: edgeResult.audioPath });
return {
success: true,
audioPath: edgeResult.audioPath,
latencyMs: Date.now() - providerStart,
provider,
outputFormat: edgeResult.outputFormat,
voiceCompatible,
};
}
const apiKey = resolveTtsApiKey(config, provider);
if (!apiKey) {
errors.push(`${provider}: no API key`);
const resolvedProvider = getSpeechProvider(provider, params.cfg);
if (!resolvedProvider) {
errors.push(`${provider}: no provider registered`);
continue;
}
let audioBuffer: Buffer;
if (provider === "elevenlabs") {
const voiceIdOverride = params.overrides?.elevenlabs?.voiceId;
const modelIdOverride = params.overrides?.elevenlabs?.modelId;
const voiceSettings = {
...config.elevenlabs.voiceSettings,
...params.overrides?.elevenlabs?.voiceSettings,
};
const seedOverride = params.overrides?.elevenlabs?.seed;
const normalizationOverride = params.overrides?.elevenlabs?.applyTextNormalization;
const languageOverride = params.overrides?.elevenlabs?.languageCode;
audioBuffer = await elevenLabsTTS({
text: params.text,
apiKey,
baseUrl: config.elevenlabs.baseUrl,
voiceId: voiceIdOverride ?? config.elevenlabs.voiceId,
modelId: modelIdOverride ?? config.elevenlabs.modelId,
outputFormat: output.elevenlabs,
seed: seedOverride ?? config.elevenlabs.seed,
applyTextNormalization: normalizationOverride ?? config.elevenlabs.applyTextNormalization,
languageCode: languageOverride ?? config.elevenlabs.languageCode,
voiceSettings,
timeoutMs: config.timeoutMs,
});
} else {
const openaiModelOverride = params.overrides?.openai?.model;
const openaiVoiceOverride = params.overrides?.openai?.voice;
audioBuffer = await openaiTTS({
text: params.text,
apiKey,
baseUrl: config.openai.baseUrl,
model: openaiModelOverride ?? config.openai.model,
voice: openaiVoiceOverride ?? config.openai.voice,
speed: config.openai.speed,
instructions: config.openai.instructions,
responseFormat: output.openai,
timeoutMs: config.timeoutMs,
});
if (!resolvedProvider.isConfigured({ cfg: params.cfg, config })) {
errors.push(`${provider}: not configured`);
continue;
}
const synthesis = await resolvedProvider.synthesize({
text: params.text,
cfg: params.cfg,
config,
target,
overrides: params.overrides,
});
const latencyMs = Date.now() - providerStart;
const tempRoot = resolvePreferredOpenClawTmpDir();
mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
const tempDir = mkdtempSync(path.join(tempRoot, "tts-"));
const audioPath = path.join(tempDir, `voice-${Date.now()}${output.extension}`);
writeFileSync(audioPath, audioBuffer);
const audioPath = path.join(tempDir, `voice-${Date.now()}${synthesis.fileExtension}`);
writeFileSync(audioPath, synthesis.audioBuffer);
scheduleCleanup(tempDir);
return {
@@ -744,8 +656,8 @@ export async function textToSpeech(params: {
audioPath,
latencyMs,
provider,
outputFormat: provider === "openai" ? output.openai : output.elevenlabs,
voiceCompatible: output.voiceCompatible,
outputFormat: synthesis.outputFormat,
voiceCompatible: synthesis.voiceCompatible,
};
} catch (err) {
errors.push(formatTtsProviderError(provider, err));
@@ -776,63 +688,32 @@ export async function textToSpeechTelephony(params: {
for (const provider of providers) {
const providerStart = Date.now();
try {
if (provider === "edge") {
errors.push("edge: unsupported for telephony");
const resolvedProvider = getSpeechProvider(provider, params.cfg);
if (!resolvedProvider) {
errors.push(`${provider}: no provider registered`);
continue;
}
const apiKey = resolveTtsApiKey(config, provider);
if (!apiKey) {
errors.push(`${provider}: no API key`);
if (!resolvedProvider.isConfigured({ cfg: params.cfg, config })) {
errors.push(`${provider}: not configured`);
continue;
}
if (provider === "elevenlabs") {
const output = TELEPHONY_OUTPUT.elevenlabs;
const audioBuffer = await elevenLabsTTS({
text: params.text,
apiKey,
baseUrl: config.elevenlabs.baseUrl,
voiceId: config.elevenlabs.voiceId,
modelId: config.elevenlabs.modelId,
outputFormat: output.format,
seed: config.elevenlabs.seed,
applyTextNormalization: config.elevenlabs.applyTextNormalization,
languageCode: config.elevenlabs.languageCode,
voiceSettings: config.elevenlabs.voiceSettings,
timeoutMs: config.timeoutMs,
});
return {
success: true,
audioBuffer,
latencyMs: Date.now() - providerStart,
provider,
outputFormat: output.format,
sampleRate: output.sampleRate,
};
if (!resolvedProvider.synthesizeTelephony) {
errors.push(`${provider}: unsupported for telephony`);
continue;
}
const output = TELEPHONY_OUTPUT.openai;
const audioBuffer = await openaiTTS({
const synthesis = await resolvedProvider.synthesizeTelephony({
text: params.text,
apiKey,
baseUrl: config.openai.baseUrl,
model: config.openai.model,
voice: config.openai.voice,
speed: config.openai.speed,
instructions: config.openai.instructions,
responseFormat: output.format,
timeoutMs: config.timeoutMs,
cfg: params.cfg,
config,
});
return {
success: true,
audioBuffer,
audioBuffer: synthesis.audioBuffer,
latencyMs: Date.now() - providerStart,
provider,
outputFormat: output.format,
sampleRate: output.sampleRate,
outputFormat: synthesis.outputFormat,
sampleRate: synthesis.sampleRate,
};
} catch (err) {
errors.push(formatTtsProviderError(provider, err));