From 6ea3cddf0d5025cf6caf1eb8aed109cefc82bed4 Mon Sep 17 00:00:00 2001 From: stain lu <109842185+stainlu@users.noreply.github.com> Date: Thu, 16 Apr 2026 17:56:38 +0800 Subject: [PATCH] fix: register bundled TTS providers and route overrides correctly (#62846) (thanks @stainlu) * fix(microsoft,elevenlabs): add enabledByDefault so speech providers register at runtime * fix(tts): route generic directive tokens to the explicitly declared provider Addresses the P2 Codex review on #62846 that flagged auto-enabling ElevenLabs as a product regression for MiniMax users. Both providers claim the generic `speed` token, and parseTtsDirectives walked providers in autoSelectOrder with first-match-wins, so inputs like `[[tts:provider=minimax speed=1.2]]` silently routed speed to providerOverrides.elevenlabs once elevenlabs participated in every parse pass. The parser now pre-scans for `provider=` (honoring legacy last-wins semantics) and routes generic tokens with the declared provider tried first, falling back to autoSelectOrder when it doesn't handle the key. Token order inside the directive no longer matters: `speed=1.2` before or after `provider=minimax` both resolve to MiniMax. Adds a regression test suite covering the exact ElevenLabs/MiniMax speed collision plus fallback, mixed-token, last-wins, and allowProvider-disabled cases. parseTtsDirectives had no prior test coverage. * fix(tts): prefer active provider for generic directives * fix: register bundled TTS providers safely (#62846) (thanks @stainlu) * fix: use exported TTS SDK seam (#62846) (thanks @stainlu) --------- Co-authored-by: Ayaan Zaidi --- CHANGELOG.md | 1 + extensions/discord/src/voice/manager.ts | 12 +- extensions/elevenlabs/openclaw.plugin.json | 1 + extensions/microsoft/openclaw.plugin.json | 1 + extensions/speech-core/src/tts.ts | 7 +- src/tts/directives.test.ts | 147 +++++++++++++++++++++ src/tts/directives.ts | 62 ++++++--- 7 files changed, 210 insertions(+), 21 deletions(-) create mode 100644 src/tts/directives.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index fdd6277f6f5..d54a1148895 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ Docs: https://docs.openclaw.ai - Ollama/chat: strip the `ollama/` provider prefix from Ollama chat request model ids so configured refs like `ollama/qwen3:14b-q8_0` stop 404ing against the Ollama API. (#67457) Thanks @suboss87. - QA/Matrix: split the private QA lab runtime into smaller tested modules, add Matrix media contract coverage for image understanding and generated-image delivery, and update the memory-dreaming QA sweep to assert the separate phase-report layout. (#67430) Thanks @gumadeiras. - Agents/tools: resolve non-workspace host tilde paths against the OS home directory and keep edit recovery aligned with that same path target, so `~/...` host edit/write operations stop failing or reading back the wrong file when `OPENCLAW_HOME` differs. (#62804) Thanks @stainlu. +- Speech/TTS: auto-enable the bundled Microsoft and ElevenLabs speech providers, and route generic TTS directive tokens through the explicit or active provider first so overrides like `[[tts:speed=1.2]]` stop silently landing on the wrong provider. (#62846) Thanks @stainlu. ## 2026.4.15-beta.1 diff --git a/extensions/discord/src/voice/manager.ts b/extensions/discord/src/voice/manager.ts index a390934981d..6db4f54c5af 100644 --- a/extensions/discord/src/voice/manager.ts +++ b/extensions/discord/src/voice/manager.ts @@ -5,9 +5,14 @@ import path from "node:path"; import type { Readable } from "node:stream"; import { ChannelType, type Client, ReadyListener } from "@buape/carbon"; import type { VoicePlugin } from "@buape/carbon/voice"; -import { resolveAgentDir } from "openclaw/plugin-sdk/agent-runtime"; -import { agentCommandFromIngress } from "openclaw/plugin-sdk/agent-runtime"; -import { resolveTtsConfig, type ResolvedTtsConfig } from "openclaw/plugin-sdk/agent-runtime"; +import { + agentCommandFromIngress, + getTtsProvider, + resolveAgentDir, + resolveTtsConfig, + resolveTtsPrefsPath, + type ResolvedTtsConfig, +} from "openclaw/plugin-sdk/agent-runtime"; import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime"; import type { DiscordAccountConfig, TtsConfig } from "openclaw/plugin-sdk/config-runtime"; import { resolveAgentRoute } from "openclaw/plugin-sdk/routing"; @@ -809,6 +814,7 @@ export class DiscordVoiceManager { const directive = parseTtsDirectives(replyText, ttsConfig.modelOverrides, { cfg: ttsCfg, providerConfigs: ttsConfig.providerConfigs, + preferredProviderId: getTtsProvider(ttsConfig, resolveTtsPrefsPath(ttsConfig)), }); const rawSpeakText = directive.overrides.ttsText ?? directive.cleanedText.trim(); const speakText = sanitizeVoiceReplyTextForSpeech(rawSpeakText, speaker.label); diff --git a/extensions/elevenlabs/openclaw.plugin.json b/extensions/elevenlabs/openclaw.plugin.json index df36c9d38b7..3b7154df40e 100644 --- a/extensions/elevenlabs/openclaw.plugin.json +++ b/extensions/elevenlabs/openclaw.plugin.json @@ -1,5 +1,6 @@ { "id": "elevenlabs", + "enabledByDefault": true, "contracts": { "speechProviders": ["elevenlabs"] }, diff --git a/extensions/microsoft/openclaw.plugin.json b/extensions/microsoft/openclaw.plugin.json index 2ff93655f71..2cac16c5986 100644 --- a/extensions/microsoft/openclaw.plugin.json +++ b/extensions/microsoft/openclaw.plugin.json @@ -1,5 +1,6 @@ { "id": "microsoft", + "enabledByDefault": true, "contracts": { "speechProviders": ["microsoft"] }, diff --git a/extensions/speech-core/src/tts.ts b/extensions/speech-core/src/tts.ts index 2e3ec1d6321..bf5eda6bdf3 100644 --- a/extensions/speech-core/src/tts.ts +++ b/extensions/speech-core/src/tts.ts @@ -1036,12 +1036,14 @@ export async function maybeApplyTtsToPayload(params: { return params.payload; } const config = resolveTtsConfig(params.cfg); + const activeProvider = getTtsProvider(config, prefsPath); const reply = resolveSendableOutboundReplyParts(params.payload); const text = reply.text; const directives = parseTtsDirectives(text, config.modelOverrides, { cfg: params.cfg, providerConfigs: config.providerConfigs, + preferredProviderId: activeProvider, }); if (directives.warnings.length > 0) { logVerbose(`TTS: ignored directive overrides (${directives.warnings.join("; ")})`); @@ -1049,9 +1051,8 @@ export async function maybeApplyTtsToPayload(params: { if (isVerbose()) { const effectiveProvider = directives.overrides?.provider - ? (canonicalizeSpeechProviderId(directives.overrides.provider, params.cfg) ?? - getTtsProvider(config, prefsPath)) - : getTtsProvider(config, prefsPath); + ? (canonicalizeSpeechProviderId(directives.overrides.provider, params.cfg) ?? activeProvider) + : activeProvider; logVerbose( `TTS: auto mode enabled (${autoMode}), channel=${params.channel}, selected provider=${effectiveProvider}, config.provider=${config.provider}, config.providerSource=${config.providerSource}`, ); diff --git a/src/tts/directives.test.ts b/src/tts/directives.test.ts new file mode 100644 index 00000000000..0495d67ad27 --- /dev/null +++ b/src/tts/directives.test.ts @@ -0,0 +1,147 @@ +import { describe, expect, it } from "vitest"; +import type { SpeechProviderPlugin } from "../plugins/types.js"; +import { parseTtsDirectives } from "./directives.js"; +import type { + SpeechDirectiveTokenParseContext, + SpeechDirectiveTokenParseResult, + SpeechModelOverridePolicy, +} from "./provider-types.js"; + +function makeProvider( + id: string, + order: number, + parse: (ctx: SpeechDirectiveTokenParseContext) => SpeechDirectiveTokenParseResult | undefined, +): SpeechProviderPlugin { + return { + id, + label: id, + autoSelectOrder: order, + parseDirectiveToken: parse, + isConfigured: () => true, + synthesize: async () => ({ + audioBuffer: Buffer.alloc(0), + outputFormat: "mp3", + fileExtension: ".mp3", + voiceCompatible: false, + }), + } as SpeechProviderPlugin; +} + +const elevenlabs = makeProvider("elevenlabs", 10, ({ key, value }) => { + if (key === "speed") { + return { handled: true, overrides: { speed: Number(value) } }; + } + if (key === "style") { + return { handled: true, overrides: { style: Number(value) } }; + } + return undefined; +}); + +const minimax = makeProvider("minimax", 20, ({ key, value }) => { + if (key === "speed") { + return { handled: true, overrides: { speed: Number(value) } }; + } + return undefined; +}); + +const fullPolicy: SpeechModelOverridePolicy = { + enabled: true, + allowText: true, + allowProvider: true, + allowVoice: true, + allowModelId: true, + allowVoiceSettings: true, + allowNormalization: true, + allowSeed: true, +}; + +describe("parseTtsDirectives provider-aware routing", () => { + it("routes generic speed to the explicitly declared provider", () => { + const result = parseTtsDirectives( + "hello [[tts:provider=minimax speed=1.2]] world", + fullPolicy, + { + providers: [elevenlabs, minimax], + }, + ); + + expect(result.overrides.provider).toBe("minimax"); + expect(result.overrides.providerOverrides?.minimax).toEqual({ speed: 1.2 }); + expect(result.overrides.providerOverrides?.elevenlabs).toBeUndefined(); + }); + + it("routes correctly when provider appears after the generic token", () => { + const result = parseTtsDirectives("[[tts:speed=1.2 provider=minimax]] hi", fullPolicy, { + providers: [elevenlabs, minimax], + }); + + expect(result.overrides.provider).toBe("minimax"); + expect(result.overrides.providerOverrides?.minimax).toEqual({ speed: 1.2 }); + expect(result.overrides.providerOverrides?.elevenlabs).toBeUndefined(); + }); + + it("routes to the preferred provider when no provider token is declared", () => { + const result = parseTtsDirectives("[[tts:speed=1.5]]", fullPolicy, { + providers: [elevenlabs, minimax], + preferredProviderId: "minimax", + }); + + expect(result.overrides.provider).toBeUndefined(); + expect(result.overrides.providerOverrides?.minimax).toEqual({ speed: 1.5 }); + expect(result.overrides.providerOverrides?.elevenlabs).toBeUndefined(); + }); + + it("falls back to autoSelectOrder when no provider hint is available", () => { + const result = parseTtsDirectives("[[tts:speed=1.5]]", fullPolicy, { + providers: [elevenlabs, minimax], + }); + + expect(result.overrides.provider).toBeUndefined(); + expect(result.overrides.providerOverrides?.elevenlabs).toEqual({ speed: 1.5 }); + expect(result.overrides.providerOverrides?.minimax).toBeUndefined(); + }); + + it("falls through when the preferred provider does not handle the key", () => { + const result = parseTtsDirectives("[[tts:provider=minimax style=0.4]]", fullPolicy, { + providers: [elevenlabs, minimax], + }); + + expect(result.overrides.provider).toBe("minimax"); + expect(result.overrides.providerOverrides?.elevenlabs).toEqual({ style: 0.4 }); + expect(result.overrides.providerOverrides?.minimax).toBeUndefined(); + }); + + it("routes mixed tokens independently in the same directive", () => { + const result = parseTtsDirectives("[[tts:provider=minimax style=0.4 speed=1.2]]", fullPolicy, { + providers: [elevenlabs, minimax], + }); + + expect(result.overrides.provider).toBe("minimax"); + expect(result.overrides.providerOverrides?.minimax).toEqual({ speed: 1.2 }); + expect(result.overrides.providerOverrides?.elevenlabs).toEqual({ style: 0.4 }); + }); + + it("keeps last-wins provider semantics", () => { + const result = parseTtsDirectives( + "[[tts:provider=elevenlabs provider=minimax speed=1.1]]", + fullPolicy, + { providers: [elevenlabs, minimax] }, + ); + + expect(result.overrides.provider).toBe("minimax"); + expect(result.overrides.providerOverrides?.minimax).toEqual({ speed: 1.1 }); + expect(result.overrides.providerOverrides?.elevenlabs).toBeUndefined(); + }); + + it("ignores provider tokens when provider overrides are disabled", () => { + const policy: SpeechModelOverridePolicy = { ...fullPolicy, allowProvider: false }; + const result = parseTtsDirectives("[[tts:provider=elevenlabs speed=1.2]]", policy, { + providers: [elevenlabs, minimax], + preferredProviderId: "minimax", + }); + + expect(result.overrides.provider).toBeUndefined(); + expect(result.overrides.providerOverrides?.minimax).toEqual({ speed: 1.2 }); + expect(result.overrides.providerOverrides?.elevenlabs).toBeUndefined(); + }); +}); diff --git a/src/tts/directives.ts b/src/tts/directives.ts index 45ced71e382..80daf1e153d 100644 --- a/src/tts/directives.ts +++ b/src/tts/directives.ts @@ -13,6 +13,7 @@ type ParseTtsDirectiveOptions = { cfg?: OpenClawConfig; providers?: readonly SpeechProviderPlugin[]; providerConfigs?: Record; + preferredProviderId?: string; }; function buildProviderOrder(left: SpeechProviderPlugin, right: SpeechProviderPlugin): number { @@ -38,6 +39,20 @@ function resolveDirectiveProviderConfig( return options?.providerConfigs?.[provider.id]; } +function prioritizeProvider( + providers: readonly SpeechProviderPlugin[], + providerId: string | undefined, +): SpeechProviderPlugin[] { + if (!providerId) { + return [...providers]; + } + const preferredProvider = providers.find((provider) => provider.id === providerId); + if (!preferredProvider) { + return [...providers]; + } + return [preferredProvider, ...providers.filter((provider) => provider.id !== providerId)]; +} + export function parseTtsDirectives( text: string, policy: SpeechModelOverridePolicy, @@ -66,6 +81,37 @@ export function parseTtsDirectives( cleanedText = cleanedText.replace(directiveRegex, (_match, body: string) => { hasDirective = true; const tokens = body.split(/\s+/).filter(Boolean); + + let declaredProviderId: string | undefined; + if (policy.allowProvider) { + for (const token of tokens) { + const eqIndex = token.indexOf("="); + if (eqIndex === -1) { + continue; + } + const rawKey = token.slice(0, eqIndex).trim(); + if (!rawKey || normalizeLowercaseStringOrEmpty(rawKey) !== "provider") { + continue; + } + const rawValue = token.slice(eqIndex + 1).trim(); + if (!rawValue) { + continue; + } + const providerId = normalizeLowercaseStringOrEmpty(rawValue); + if (!providerId) { + warnings.push("invalid provider id"); + continue; + } + declaredProviderId = providerId; + overrides.provider = providerId; + } + } + + const orderedProviders = prioritizeProvider( + providers, + declaredProviderId ?? normalizeLowercaseStringOrEmpty(options?.preferredProviderId), + ); + for (const token of tokens) { const eqIndex = token.indexOf("="); if (eqIndex === -1) { @@ -78,19 +124,10 @@ export function parseTtsDirectives( } const key = normalizeLowercaseStringOrEmpty(rawKey); if (key === "provider") { - if (policy.allowProvider) { - const providerId = normalizeLowercaseStringOrEmpty(rawValue); - if (providerId) { - overrides.provider = providerId; - } else { - warnings.push("invalid provider id"); - } - } continue; } - let handled = false; - for (const provider of providers) { + for (const provider of orderedProviders) { const parsed = provider.parseDirectiveToken?.({ key, value: rawValue, @@ -101,7 +138,6 @@ export function parseTtsDirectives( if (!parsed?.handled) { continue; } - handled = true; if (parsed.overrides) { overrides.providerOverrides = { ...overrides.providerOverrides, @@ -116,10 +152,6 @@ export function parseTtsDirectives( } break; } - - if (!handled) { - continue; - } } return ""; });