fix: honor telephony tts directives

This commit is contained in:
Peter Steinberger
2026-05-01 22:48:27 +01:00
parent 236bd42bb3
commit 4389ceedac
11 changed files with 397 additions and 33 deletions

View File

@@ -13,6 +13,7 @@ Docs: https://docs.openclaw.ai
- Doctor/WhatsApp: warn when Linux crontabs still run the legacy `ensure-whatsapp.sh` health check, which can misreport `Gateway inactive` when cron lacks the systemd user-bus environment. Fixes #60204. Thanks @mySebbe.
- Slack/setup: print the generated app manifest as plain JSON instead of embedding it inside the framed setup note, so it can be copied into Slack without deleting border characters. Fixes #65751. Thanks @theDanielJLewis.
- Channels/WhatsApp: route CLI logout through the live Gateway and stop runtime-backed listeners before channel removal, so removing a WhatsApp account does not leave the old socket replying until restart. Fixes #67746. Thanks @123Mismail.
- Voice Call/Twilio: honor TTS directive text and provider voice/model overrides during telephony synthesis, so `[[tts:...]]` tags are not spoken literally and voiceId overrides reach OpenAI/ElevenLabs calls. Fixes #58114. Thanks @legonhilltech-jpg.
- Agents/Codex: stop prompting message-tool-only source turns to finish with `NO_REPLY`, so quiet turns are represented by not calling the visible message tool instead of conflicting final-text instructions. Thanks @pashpashpash.
- Gateway/config: report failed backup restores as failed in logs and config observe audit records instead of marking them valid. (#70515) Thanks @davidangularme.
- Compaction: use the active session model fallback chain for implicit summarization failures without persisting fallback model selection, so Azure content-filter 400s can recover. Fixes #64960. (#74470) Thanks @jalehman and @OpenCodeEngineer.

View File

@@ -1,7 +1,39 @@
import { describe, expect, it } from "vitest";
import { afterEach, describe, expect, it, vi } from "vitest";
import { buildElevenLabsSpeechProvider, isValidVoiceId } from "./speech-provider.js";
vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({
fetchWithSsrFGuard: async ({
url,
init,
}: {
url: string;
init?: RequestInit;
}): Promise<{ response: Response; release: () => Promise<void> }> => ({
response: await globalThis.fetch(url, init),
release: vi.fn(async () => {}),
}),
ssrfPolicyFromHttpBaseUrlAllowedHostname: () => undefined,
}));
/**
 * Extracts and validates the JSON request body captured by the fetch mock.
 * Throws when the body is not a string or does not parse to a plain object,
 * so test failures point at the malformed request rather than a later assert.
 */
function parseRequestBody(init: RequestInit | undefined): Record<string, unknown> {
  const rawBody = init?.body;
  if (typeof rawBody !== "string") {
    throw new Error("expected string request body");
  }
  const parsed: unknown = JSON.parse(rawBody);
  // A valid ElevenLabs payload is a plain JSON object — reject null, arrays, and primitives.
  const isPlainObject = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
  if (!isPlainObject) {
    throw new Error("expected ElevenLabs request body");
  }
  return parsed as Record<string, unknown>;
}
describe("elevenlabs speech provider", () => {
const originalFetch = globalThis.fetch;
afterEach(() => {
globalThis.fetch = originalFetch;
vi.restoreAllMocks();
});
it("exposes the current ElevenLabs TTS model catalog", () => {
const provider = buildElevenLabsSpeechProvider();
@@ -32,4 +64,49 @@ describe("elevenlabs speech provider", () => {
expect(isValidVoiceId(testCase.value), testCase.value).toBe(testCase.expected);
}
});
it("applies provider overrides to telephony synthesis", async () => {
const provider = buildElevenLabsSpeechProvider();
const fetchMock = vi.fn(async (url: string, init?: RequestInit) => {
expect(url).toContain("/v1/text-to-speech/21m00Tcm4TlvDq8ikWAM");
expect(url).toContain("output_format=pcm_22050");
const body = parseRequestBody(init);
expect(body).toMatchObject({
text: "hello",
model_id: "eleven_v3",
seed: 123,
apply_text_normalization: "on",
language_code: "en",
voice_settings: expect.objectContaining({
speed: 1.2,
}),
});
return new Response(new Uint8Array([1, 2, 3]), { status: 200 });
});
globalThis.fetch = fetchMock as unknown as typeof fetch;
const result = await provider.synthesizeTelephony?.({
text: "hello",
cfg: {} as never,
providerConfig: {
apiKey: "xi-test",
voiceId: "pMsXgVXv3BLzUgSXRplE",
modelId: "eleven_multilingual_v2",
},
providerOverrides: {
voiceId: "21m00Tcm4TlvDq8ikWAM",
modelId: "eleven_v3",
seed: 123,
applyTextNormalization: "on",
languageCode: "en",
voiceSettings: {
speed: 1.2,
},
},
timeoutMs: 1_000,
});
expect(result?.outputFormat).toBe("pcm_22050");
expect(fetchMock).toHaveBeenCalledTimes(1);
});
});

View File

@@ -152,6 +152,31 @@ function mergeVoiceSettingsOverride(
};
}
/**
 * Merges directive-supplied voice-setting overrides onto the configured base
 * settings. Each override field is applied only when it survives coercion
 * (finite number / boolean), so malformed directive values fall back to the
 * base configuration instead of clobbering it.
 */
function resolveVoiceSettingsOverride(
  base: ElevenLabsProviderConfig["voiceSettings"],
  overrides: unknown,
): ElevenLabsProviderConfig["voiceSettings"] {
  const raw = asObject(overrides);
  // Coerce each candidate once; null/undefined means "not overridden".
  const stability = asFiniteNumber(raw?.stability);
  const similarityBoost = asFiniteNumber(raw?.similarityBoost);
  const style = asFiniteNumber(raw?.style);
  const useSpeakerBoost = asBoolean(raw?.useSpeakerBoost);
  const speed = asFiniteNumber(raw?.speed);
  return {
    ...base,
    ...(stability == null ? {} : { stability }),
    ...(similarityBoost == null ? {} : { similarityBoost }),
    ...(style == null ? {} : { style }),
    ...(useSpeakerBoost == null ? {} : { useSpeakerBoost }),
    ...(speed == null ? {} : { speed }),
  };
}
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
try {
switch (ctx.key) {
@@ -469,7 +494,6 @@ export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
const outputFormat =
trimToUndefined(overrides.outputFormat) ??
(req.target === "voice-note" ? "opus_48000_64" : "mp3_44100_128");
const overrideVoiceSettings = asObject(overrides.voiceSettings);
const latencyTier = asFiniteNumber(overrides.latencyTier);
const audioBuffer = await elevenLabsTTS({
text: req.text,
@@ -487,24 +511,7 @@ export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
| undefined) ?? config.applyTextNormalization,
languageCode: trimToUndefined(overrides.languageCode) ?? config.languageCode,
latencyTier,
voiceSettings: {
...config.voiceSettings,
...(asFiniteNumber(overrideVoiceSettings?.stability) == null
? {}
: { stability: asFiniteNumber(overrideVoiceSettings?.stability) }),
...(asFiniteNumber(overrideVoiceSettings?.similarityBoost) == null
? {}
: { similarityBoost: asFiniteNumber(overrideVoiceSettings?.similarityBoost) }),
...(asFiniteNumber(overrideVoiceSettings?.style) == null
? {}
: { style: asFiniteNumber(overrideVoiceSettings?.style) }),
...(asBoolean(overrideVoiceSettings?.useSpeakerBoost) == null
? {}
: { useSpeakerBoost: asBoolean(overrideVoiceSettings?.useSpeakerBoost) }),
...(asFiniteNumber(overrideVoiceSettings?.speed) == null
? {}
: { speed: asFiniteNumber(overrideVoiceSettings?.speed) }),
},
voiceSettings: resolveVoiceSettingsOverride(config.voiceSettings, overrides.voiceSettings),
timeoutMs: req.timeoutMs,
});
return {
@@ -516,6 +523,7 @@ export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
},
synthesizeTelephony: async (req) => {
const config = readElevenLabsProviderConfig(req.providerConfig);
const overrides = req.providerOverrides ?? {};
const apiKey =
config.apiKey || resolveElevenLabsApiKeyWithProfileFallback() || process.env.XI_API_KEY;
if (!apiKey) {
@@ -527,13 +535,18 @@ export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
text: req.text,
apiKey,
baseUrl: config.baseUrl,
voiceId: config.voiceId,
modelId: config.modelId,
voiceId: trimToUndefined(overrides.voiceId) ?? config.voiceId,
modelId: trimToUndefined(overrides.modelId) ?? config.modelId,
outputFormat,
seed: config.seed,
applyTextNormalization: config.applyTextNormalization,
languageCode: config.languageCode,
voiceSettings: config.voiceSettings,
seed: asFiniteNumber(overrides.seed) ?? config.seed,
applyTextNormalization:
(trimToUndefined(overrides.applyTextNormalization) as
| "auto"
| "on"
| "off"
| undefined) ?? config.applyTextNormalization,
languageCode: trimToUndefined(overrides.languageCode) ?? config.languageCode,
voiceSettings: resolveVoiceSettingsOverride(config.voiceSettings, overrides.voiceSettings),
timeoutMs: req.timeoutMs,
});
return { audioBuffer, outputFormat, sampleRate };

View File

@@ -15,11 +15,21 @@ vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({
ssrfPolicyFromHttpBaseUrlAllowedHostname: () => undefined,
}));
function isSpeechRequestBody(value: unknown): value is { response_format?: string } {
function isSpeechRequestBody(value: unknown): value is {
model?: string;
voice?: string;
speed?: number;
response_format?: string;
} {
return Boolean(value) && typeof value === "object" && !Array.isArray(value);
}
function parseRequestBody(init: RequestInit | undefined): { response_format?: string } {
function parseRequestBody(init: RequestInit | undefined): {
model?: string;
voice?: string;
speed?: number;
response_format?: string;
} {
if (typeof init?.body !== "string") {
throw new Error("expected string request body");
}
@@ -218,6 +228,41 @@ describe("buildOpenAISpeechProvider", () => {
expect(result.voiceCompatible).toBe(false);
});
it("applies provider overrides to telephony synthesis", async () => {
const provider = buildOpenAISpeechProvider();
const fetchMock = vi.fn(async (_url: string, init?: RequestInit) => {
const body = parseRequestBody(init);
expect(body).toMatchObject({
model: "tts-1",
voice: "nova",
speed: 1.25,
response_format: "pcm",
});
return new Response(new Uint8Array([1, 2, 3]), { status: 200 });
});
globalThis.fetch = fetchMock as unknown as typeof fetch;
const result = await provider.synthesizeTelephony?.({
text: "hello",
cfg: {} as never,
providerConfig: {
apiKey: "sk-test",
model: "gpt-4o-mini-tts",
voice: "alloy",
speed: 1,
},
providerOverrides: {
model: "tts-1",
voice: "nova",
speed: 1.25,
},
timeoutMs: 1_000,
});
expect(result?.outputFormat).toBe("pcm");
expect(fetchMock).toHaveBeenCalledTimes(1);
});
it("honors explicit responseFormat overrides and clears voice-note compatibility when not opus", async () => {
const provider = buildOpenAISpeechProvider();
mockSpeechFetchExpectingFormat("wav");

View File

@@ -309,6 +309,7 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
},
synthesizeTelephony: async (req) => {
const config = readOpenAIProviderConfig(req.providerConfig);
const overrides = readOpenAIOverrides(req.providerOverrides);
const apiKey = config.apiKey || process.env.OPENAI_API_KEY;
if (!apiKey) {
throw new Error("OpenAI API key missing");
@@ -319,9 +320,9 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
text: req.text,
apiKey,
baseUrl: config.baseUrl,
model: config.model,
voice: config.voice,
speed: config.speed,
model: overrides.model ?? config.model,
voice: overrides.voice ?? config.voice,
speed: overrides.speed ?? config.speed,
instructions: config.instructions,
responseFormat: outputFormat,
timeoutMs: req.timeoutMs,

View File

@@ -10,6 +10,7 @@ import type {
SpeechProviderPlugin,
SpeechProviderPrepareSynthesisContext,
SpeechSynthesisRequest,
SpeechTelephonySynthesisRequest,
} from "openclaw/plugin-sdk/speech-core";
import { afterEach, describe, expect, it, vi } from "vitest";
@@ -542,6 +543,47 @@ describe("speech-core native voice-note routing", () => {
expect(result.attempts?.[0]).not.toHaveProperty("personaBinding");
});
it("passes directive overrides to telephony synthesis providers", async () => {
const synthesizeTelephony = vi.fn(async (_request: SpeechTelephonySynthesisRequest) => ({
audioBuffer: Buffer.from("voice"),
outputFormat: "pcm",
sampleRate: 24000,
}));
installSpeechProviders([
createMockSpeechProvider("mock", {
synthesizeTelephony,
}),
]);
const result = await textToSpeechTelephony({
text: "Use a directed telephony voice.",
cfg: {
messages: {
tts: {
enabled: true,
provider: "mock",
},
},
},
overrides: {
providerOverrides: {
mock: {
voice: "directed-voice",
},
},
},
});
expect(result.success).toBe(true);
expect(synthesizeTelephony).toHaveBeenCalledWith(
expect.objectContaining({
providerOverrides: {
voice: "directed-voice",
},
}),
);
});
it("uses provider defaults when fallback policy allows missing persona bindings", async () => {
await synthesizeSpeech({
text: "Use neutral provider defaults.",

View File

@@ -1318,11 +1318,13 @@ export async function textToSpeechTelephony(params: {
text: string;
cfg: OpenClawConfig;
prefsPath?: string;
overrides?: TtsDirectiveOverrides;
}): Promise<TtsTelephonyResult> {
const setup = resolveTtsRequestSetup({
text: params.text,
cfg: params.cfg,
prefsPath: params.prefsPath,
providerOverride: params.overrides?.provider,
});
if ("error" in setup) {
return { success: false, error: setup.error };
@@ -1371,6 +1373,7 @@ export async function textToSpeechTelephony(params: {
text: params.text,
cfg,
providerConfig: resolvedProvider.providerConfig,
providerOverrides: params.overrides?.providerOverrides?.[resolvedProvider.provider.id],
persona: resolvedProvider.synthesisPersona,
personaProviderConfig: resolvedProvider.personaProviderConfig,
target: "telephony",
@@ -1380,6 +1383,7 @@ export async function textToSpeechTelephony(params: {
text: prepared.text,
cfg,
providerConfig: prepared.providerConfig,
providerOverrides: prepared.providerOverrides,
timeoutMs: config.timeoutMs,
});
const latencyMs = Date.now() - providerStart;

View File

@@ -117,6 +117,53 @@ describe("createTelephonyTtsProvider deepMerge hardening", () => {
);
});
it("strips telephony TTS directive tags before synthesis", async () => {
let requestText: string | undefined;
const provider = createTelephonyTtsProvider({
coreConfig: createCoreConfig(),
runtime: {
textToSpeechTelephony: async ({ text }) => {
requestText = text;
return {
success: true,
audioBuffer: Buffer.alloc(2),
sampleRate: 8000,
};
},
},
});
await provider.synthesizeForTelephony("[[tts]]Hello caller[[/tts]]");
expect(requestText).toBe("Hello caller");
});
it("uses hidden telephony TTS directive text for synthesis", async () => {
let requestText: string | undefined;
let requestOverrides: unknown;
const provider = createTelephonyTtsProvider({
coreConfig: createCoreConfig(),
runtime: {
textToSpeechTelephony: async ({ text, overrides }) => {
requestText = text;
requestOverrides = overrides;
return {
success: true,
audioBuffer: Buffer.alloc(2),
sampleRate: 8000,
};
},
},
});
await provider.synthesizeForTelephony(
"Visible text [[tts:text]]Speak this instead[[/tts:text]]",
);
expect(requestText).toBe("Speak this instead");
expect(requestOverrides).toMatchObject({ ttsText: "Speak this instead" });
});
it("exposes configured timeoutMs as synthesisTimeoutMs", () => {
const provider = createTelephonyTtsProvider({
coreConfig: { messages: { tts: { provider: "openai", timeoutMs: 15000 } } },

View File

@@ -1,3 +1,9 @@
import {
parseTtsDirectives,
type SpeechModelOverridePolicy,
type SpeechProviderConfig,
type TtsDirectiveOverrides,
} from "openclaw/plugin-sdk/speech";
import type { VoiceCallTtsConfig } from "./config.js";
import type { CoreConfig } from "./core-bridge.js";
import { deepMergeDefined } from "./deep-merge.js";
@@ -8,6 +14,7 @@ export type TelephonyTtsRuntime = {
text: string;
cfg: CoreConfig;
prefsPath?: string;
overrides?: TtsDirectiveOverrides;
}) => Promise<{
success: boolean;
audioBuffer?: Buffer;
@@ -26,6 +33,17 @@ export type TelephonyTtsProvider = {
export const TELEPHONY_DEFAULT_TTS_TIMEOUT_MS = 8000;
type TelephonyModelOverrideConfig = {
enabled?: boolean;
allowText?: boolean;
allowProvider?: boolean;
allowVoice?: boolean;
allowModelId?: boolean;
allowVoiceSettings?: boolean;
allowNormalization?: boolean;
allowSeed?: boolean;
};
export function createTelephonyTtsProvider(params: {
coreConfig: CoreConfig;
ttsOverride?: VoiceCallTtsConfig;
@@ -36,15 +54,35 @@ export function createTelephonyTtsProvider(params: {
}): TelephonyTtsProvider {
const { coreConfig, ttsOverride, runtime, logger } = params;
const mergedConfig = applyTtsOverride(coreConfig, ttsOverride);
const ttsConfig = mergedConfig.messages?.tts;
const modelOverrides = resolveTelephonyModelOverridePolicy(
readTelephonyModelOverrides(ttsConfig),
);
const providerConfigs = collectTelephonyProviderConfigs(ttsConfig);
const activeProvider = normalizeProviderId(ttsConfig?.provider);
const synthesisTimeoutMs =
mergedConfig.messages?.tts?.timeoutMs ?? TELEPHONY_DEFAULT_TTS_TIMEOUT_MS;
return {
synthesisTimeoutMs,
synthesizeForTelephony: async (text: string) => {
const result = await runtime.textToSpeechTelephony({
text,
const directives = parseTtsDirectives(text, modelOverrides, {
cfg: mergedConfig,
providerConfigs,
preferredProviderId: activeProvider,
});
if (directives.warnings.length > 0) {
logger?.warn?.(
`[voice-call] Ignored telephony TTS directive overrides (${directives.warnings.join("; ")})`,
);
}
const cleanText = directives.hasDirective
? directives.ttsText?.trim() || directives.cleanedText.trim()
: text;
const result = await runtime.textToSpeechTelephony({
text: cleanText,
cfg: mergedConfig,
overrides: directives.overrides,
});
if (!result.success || !result.audioBuffer || !result.sampleRate) {
@@ -101,3 +139,97 @@ function mergeTtsConfig(
}
return deepMergeDefined(base, override) as VoiceCallTtsConfig;
}
/**
 * Translates the raw telephony `modelOverrides` config into a concrete
 * directive policy. When overrides are disabled everything is locked down;
 * when enabled, each capability defaults to allowed except `allowProvider`,
 * which must be opted into explicitly.
 */
function resolveTelephonyModelOverridePolicy(
  overrides: TelephonyModelOverrideConfig | undefined,
): SpeechModelOverridePolicy {
  // Absent config means "enabled with defaults".
  if (!(overrides?.enabled ?? true)) {
    return {
      enabled: false,
      allowText: false,
      allowProvider: false,
      allowVoice: false,
      allowModelId: false,
      allowVoiceSettings: false,
      allowNormalization: false,
      allowSeed: false,
    };
  }
  const flag = (value: boolean | undefined, fallback = true): boolean => value ?? fallback;
  return {
    enabled: true,
    allowText: flag(overrides?.allowText),
    // Provider switching is the riskiest override, so it is off unless opted in.
    allowProvider: flag(overrides?.allowProvider, false),
    allowVoice: flag(overrides?.allowVoice),
    allowModelId: flag(overrides?.allowModelId),
    allowVoiceSettings: flag(overrides?.allowVoiceSettings),
    allowNormalization: flag(overrides?.allowNormalization),
    allowSeed: flag(overrides?.allowSeed),
  };
}
/**
 * Reads the untyped `modelOverrides` property off the TTS config, returning it
 * only when it is a plain object (null, arrays, and primitives yield undefined).
 */
function readTelephonyModelOverrides(
  ttsConfig: VoiceCallTtsConfig | undefined,
): TelephonyModelOverrideConfig | undefined {
  const candidate = (ttsConfig as Record<string, unknown> | undefined)?.modelOverrides;
  if (!candidate || typeof candidate !== "object" || Array.isArray(candidate)) {
    return undefined;
  }
  return candidate as TelephonyModelOverrideConfig;
}
/**
 * Canonicalizes a provider id: trimmed and lowercased. Non-strings and
 * strings that are empty after trimming yield undefined.
 */
function normalizeProviderId(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const normalized = value.trim().toLowerCase();
  return normalized.length > 0 ? normalized : undefined;
}
/**
 * Narrows an unknown config value to a provider config object,
 * substituting an empty config for anything that is not a plain object.
 */
function asProviderConfig(value: unknown): SpeechProviderConfig {
  const isPlainObject = Boolean(value) && typeof value === "object" && !Array.isArray(value);
  return isPlainObject ? (value as SpeechProviderConfig) : {};
}
/**
 * Builds a map of normalized provider id -> provider config from the TTS
 * config. Entries come from two places, in precedence order:
 *   1. the explicit `providers` map (always wins), and
 *   2. top-level object-valued keys on the config itself, excluding the
 *      reserved non-provider keys (these only fill gaps, via `??=`).
 */
function collectTelephonyProviderConfigs(
  ttsConfig: VoiceCallTtsConfig | undefined,
): Record<string, SpeechProviderConfig> {
  const entries: Record<string, SpeechProviderConfig> = {};
  if (!ttsConfig) {
    return entries;
  }
  const providersValue = ttsConfig.providers;
  const providersMap =
    providersValue && typeof providersValue === "object" && !Array.isArray(providersValue)
      ? (providersValue as Record<string, unknown>)
      : {};
  for (const [providerId, value] of Object.entries(providersMap)) {
    entries[normalizeProviderId(providerId) ?? providerId] = asProviderConfig(value);
  }
  // Keys on the TTS config that are settings, not inline provider configs.
  const reservedKeys = new Set([
    "auto",
    "enabled",
    "maxTextLength",
    "mode",
    "modelOverrides",
    "persona",
    "personas",
    "prefsPath",
    "provider",
    "providers",
    "summaryModel",
    "timeoutMs",
  ]);
  for (const [key, value] of Object.entries(ttsConfig as Record<string, unknown>)) {
    if (reservedKeys.has(key)) {
      continue;
    }
    if (value === null || typeof value !== "object" || Array.isArray(value)) {
      continue;
    }
    // Explicit `providers` entries take priority over inline top-level configs.
    entries[normalizeProviderId(key) ?? key] ??= asProviderConfig(value);
  }
  return entries;
}

View File

@@ -88,6 +88,7 @@ export type TtsTelephonyRequestParams = {
text: string;
cfg: OpenClawConfig;
prefsPath?: string;
overrides?: TtsDirectiveOverrides;
};
export type ListSpeechVoicesParams = {

View File

@@ -61,6 +61,7 @@ export type SpeechTelephonySynthesisRequest = {
text: string;
cfg: OpenClawConfig;
providerConfig: SpeechProviderConfig;
providerOverrides?: SpeechProviderOverrides;
timeoutMs: number;
};