mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:10:44 +00:00
fix: honor telephony tts directives
This commit is contained in:
@@ -13,6 +13,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Doctor/WhatsApp: warn when Linux crontabs still run the legacy `ensure-whatsapp.sh` health check, which can misreport `Gateway inactive` when cron lacks the systemd user-bus environment. Fixes #60204. Thanks @mySebbe.
|
||||
- Slack/setup: print the generated app manifest as plain JSON instead of embedding it inside the framed setup note, so it can be copied into Slack without deleting border characters. Fixes #65751. Thanks @theDanielJLewis.
|
||||
- Channels/WhatsApp: route CLI logout through the live Gateway and stop runtime-backed listeners before channel removal, so removing a WhatsApp account does not leave the old socket replying until restart. Fixes #67746. Thanks @123Mismail.
|
||||
- Voice Call/Twilio: honor TTS directive text and provider voice/model overrides during telephony synthesis, so `[[tts:...]]` tags are not spoken literally and voiceId overrides reach OpenAI/ElevenLabs calls. Fixes #58114. Thanks @legonhilltech-jpg.
|
||||
- Agents/Codex: stop prompting message-tool-only source turns to finish with `NO_REPLY`, so quiet turns are represented by not calling the visible message tool instead of conflicting final-text instructions. Thanks @pashpashpash.
|
||||
- Gateway/config: report failed backup restores as failed in logs and config observe audit records instead of marking them valid. (#70515) Thanks @davidangularme.
|
||||
- Compaction: use the active session model fallback chain for implicit summarization failures without persisting fallback model selection, so Azure content-filter 400s can recover. Fixes #64960. (#74470) Thanks @jalehman and @OpenCodeEngineer.
|
||||
|
||||
@@ -1,7 +1,39 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
import { buildElevenLabsSpeechProvider, isValidVoiceId } from "./speech-provider.js";
|
||||
|
||||
vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({
|
||||
fetchWithSsrFGuard: async ({
|
||||
url,
|
||||
init,
|
||||
}: {
|
||||
url: string;
|
||||
init?: RequestInit;
|
||||
}): Promise<{ response: Response; release: () => Promise<void> }> => ({
|
||||
response: await globalThis.fetch(url, init),
|
||||
release: vi.fn(async () => {}),
|
||||
}),
|
||||
ssrfPolicyFromHttpBaseUrlAllowedHostname: () => undefined,
|
||||
}));
|
||||
|
||||
/**
 * Decodes the JSON payload that a test's mocked `fetch` received.
 *
 * @param init - The `RequestInit` passed to `fetch`; its `body` must be a string.
 * @returns The parsed payload as a plain key/value record.
 * @throws Error when `body` is not a string, or when the decoded JSON is not a
 *   non-null, non-array object.
 */
function parseRequestBody(init: RequestInit | undefined): Record<string, unknown> {
  const rawBody = init?.body;
  if (typeof rawBody !== "string") {
    throw new Error("expected string request body");
  }

  const parsed: unknown = JSON.parse(rawBody);
  // Only plain JSON objects are accepted; null, arrays and primitives are rejected.
  const isPlainObject = typeof parsed === "object" && parsed !== null && !Array.isArray(parsed);
  if (!isPlainObject) {
    throw new Error("expected ElevenLabs request body");
  }
  return parsed as Record<string, unknown>;
}
|
||||
|
||||
describe("elevenlabs speech provider", () => {
|
||||
const originalFetch = globalThis.fetch;
|
||||
|
||||
afterEach(() => {
|
||||
globalThis.fetch = originalFetch;
|
||||
vi.restoreAllMocks();
|
||||
});
|
||||
|
||||
it("exposes the current ElevenLabs TTS model catalog", () => {
|
||||
const provider = buildElevenLabsSpeechProvider();
|
||||
|
||||
@@ -32,4 +64,49 @@ describe("elevenlabs speech provider", () => {
|
||||
expect(isValidVoiceId(testCase.value), testCase.value).toBe(testCase.expected);
|
||||
}
|
||||
});
|
||||
|
||||
it("applies provider overrides to telephony synthesis", async () => {
|
||||
const provider = buildElevenLabsSpeechProvider();
|
||||
const fetchMock = vi.fn(async (url: string, init?: RequestInit) => {
|
||||
expect(url).toContain("/v1/text-to-speech/21m00Tcm4TlvDq8ikWAM");
|
||||
expect(url).toContain("output_format=pcm_22050");
|
||||
const body = parseRequestBody(init);
|
||||
expect(body).toMatchObject({
|
||||
text: "hello",
|
||||
model_id: "eleven_v3",
|
||||
seed: 123,
|
||||
apply_text_normalization: "on",
|
||||
language_code: "en",
|
||||
voice_settings: expect.objectContaining({
|
||||
speed: 1.2,
|
||||
}),
|
||||
});
|
||||
return new Response(new Uint8Array([1, 2, 3]), { status: 200 });
|
||||
});
|
||||
globalThis.fetch = fetchMock as unknown as typeof fetch;
|
||||
|
||||
const result = await provider.synthesizeTelephony?.({
|
||||
text: "hello",
|
||||
cfg: {} as never,
|
||||
providerConfig: {
|
||||
apiKey: "xi-test",
|
||||
voiceId: "pMsXgVXv3BLzUgSXRplE",
|
||||
modelId: "eleven_multilingual_v2",
|
||||
},
|
||||
providerOverrides: {
|
||||
voiceId: "21m00Tcm4TlvDq8ikWAM",
|
||||
modelId: "eleven_v3",
|
||||
seed: 123,
|
||||
applyTextNormalization: "on",
|
||||
languageCode: "en",
|
||||
voiceSettings: {
|
||||
speed: 1.2,
|
||||
},
|
||||
},
|
||||
timeoutMs: 1_000,
|
||||
});
|
||||
|
||||
expect(result?.outputFormat).toBe("pcm_22050");
|
||||
expect(fetchMock).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -152,6 +152,31 @@ function mergeVoiceSettingsOverride(
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Layers per-request voice-setting overrides on top of the configured ones.
 *
 * Each recognised field (`stability`, `similarityBoost`, `style`,
 * `useSpeakerBoost`, `speed`) replaces the base value only when the matching
 * coercion helper yields a non-null result for it; otherwise the value from
 * `base` is kept.
 *
 * @param base - Voice settings taken from the provider config.
 * @param overrides - Untyped override payload; read through `asObject`.
 * @returns A new object: `base` followed by every accepted override field.
 */
function resolveVoiceSettingsOverride(
  base: ElevenLabsProviderConfig["voiceSettings"],
  overrides: unknown,
): ElevenLabsProviderConfig["voiceSettings"] {
  const source = asObject(overrides);

  // Coerce every candidate once, in the same order the fields are merged below.
  const stability = asFiniteNumber(source?.stability);
  const similarityBoost = asFiniteNumber(source?.similarityBoost);
  const style = asFiniteNumber(source?.style);
  const useSpeakerBoost = asBoolean(source?.useSpeakerBoost);
  const speed = asFiniteNumber(source?.speed);

  return {
    ...base,
    ...(stability == null ? {} : { stability }),
    ...(similarityBoost == null ? {} : { similarityBoost }),
    ...(style == null ? {} : { style }),
    ...(useSpeakerBoost == null ? {} : { useSpeakerBoost }),
    ...(speed == null ? {} : { speed }),
  };
}
|
||||
|
||||
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
|
||||
try {
|
||||
switch (ctx.key) {
|
||||
@@ -469,7 +494,6 @@ export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
|
||||
const outputFormat =
|
||||
trimToUndefined(overrides.outputFormat) ??
|
||||
(req.target === "voice-note" ? "opus_48000_64" : "mp3_44100_128");
|
||||
const overrideVoiceSettings = asObject(overrides.voiceSettings);
|
||||
const latencyTier = asFiniteNumber(overrides.latencyTier);
|
||||
const audioBuffer = await elevenLabsTTS({
|
||||
text: req.text,
|
||||
@@ -487,24 +511,7 @@ export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
|
||||
| undefined) ?? config.applyTextNormalization,
|
||||
languageCode: trimToUndefined(overrides.languageCode) ?? config.languageCode,
|
||||
latencyTier,
|
||||
voiceSettings: {
|
||||
...config.voiceSettings,
|
||||
...(asFiniteNumber(overrideVoiceSettings?.stability) == null
|
||||
? {}
|
||||
: { stability: asFiniteNumber(overrideVoiceSettings?.stability) }),
|
||||
...(asFiniteNumber(overrideVoiceSettings?.similarityBoost) == null
|
||||
? {}
|
||||
: { similarityBoost: asFiniteNumber(overrideVoiceSettings?.similarityBoost) }),
|
||||
...(asFiniteNumber(overrideVoiceSettings?.style) == null
|
||||
? {}
|
||||
: { style: asFiniteNumber(overrideVoiceSettings?.style) }),
|
||||
...(asBoolean(overrideVoiceSettings?.useSpeakerBoost) == null
|
||||
? {}
|
||||
: { useSpeakerBoost: asBoolean(overrideVoiceSettings?.useSpeakerBoost) }),
|
||||
...(asFiniteNumber(overrideVoiceSettings?.speed) == null
|
||||
? {}
|
||||
: { speed: asFiniteNumber(overrideVoiceSettings?.speed) }),
|
||||
},
|
||||
voiceSettings: resolveVoiceSettingsOverride(config.voiceSettings, overrides.voiceSettings),
|
||||
timeoutMs: req.timeoutMs,
|
||||
});
|
||||
return {
|
||||
@@ -516,6 +523,7 @@ export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
|
||||
},
|
||||
synthesizeTelephony: async (req) => {
|
||||
const config = readElevenLabsProviderConfig(req.providerConfig);
|
||||
const overrides = req.providerOverrides ?? {};
|
||||
const apiKey =
|
||||
config.apiKey || resolveElevenLabsApiKeyWithProfileFallback() || process.env.XI_API_KEY;
|
||||
if (!apiKey) {
|
||||
@@ -527,13 +535,18 @@ export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
|
||||
text: req.text,
|
||||
apiKey,
|
||||
baseUrl: config.baseUrl,
|
||||
voiceId: config.voiceId,
|
||||
modelId: config.modelId,
|
||||
voiceId: trimToUndefined(overrides.voiceId) ?? config.voiceId,
|
||||
modelId: trimToUndefined(overrides.modelId) ?? config.modelId,
|
||||
outputFormat,
|
||||
seed: config.seed,
|
||||
applyTextNormalization: config.applyTextNormalization,
|
||||
languageCode: config.languageCode,
|
||||
voiceSettings: config.voiceSettings,
|
||||
seed: asFiniteNumber(overrides.seed) ?? config.seed,
|
||||
applyTextNormalization:
|
||||
(trimToUndefined(overrides.applyTextNormalization) as
|
||||
| "auto"
|
||||
| "on"
|
||||
| "off"
|
||||
| undefined) ?? config.applyTextNormalization,
|
||||
languageCode: trimToUndefined(overrides.languageCode) ?? config.languageCode,
|
||||
voiceSettings: resolveVoiceSettingsOverride(config.voiceSettings, overrides.voiceSettings),
|
||||
timeoutMs: req.timeoutMs,
|
||||
});
|
||||
return { audioBuffer, outputFormat, sampleRate };
|
||||
|
||||
@@ -15,11 +15,21 @@ vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({
|
||||
ssrfPolicyFromHttpBaseUrlAllowedHostname: () => undefined,
|
||||
}));
|
||||
|
||||
function isSpeechRequestBody(value: unknown): value is { response_format?: string } {
|
||||
function isSpeechRequestBody(value: unknown): value is {
|
||||
model?: string;
|
||||
voice?: string;
|
||||
speed?: number;
|
||||
response_format?: string;
|
||||
} {
|
||||
return Boolean(value) && typeof value === "object" && !Array.isArray(value);
|
||||
}
|
||||
|
||||
function parseRequestBody(init: RequestInit | undefined): { response_format?: string } {
|
||||
function parseRequestBody(init: RequestInit | undefined): {
|
||||
model?: string;
|
||||
voice?: string;
|
||||
speed?: number;
|
||||
response_format?: string;
|
||||
} {
|
||||
if (typeof init?.body !== "string") {
|
||||
throw new Error("expected string request body");
|
||||
}
|
||||
@@ -218,6 +228,41 @@ describe("buildOpenAISpeechProvider", () => {
|
||||
expect(result.voiceCompatible).toBe(false);
|
||||
});
|
||||
|
||||
it("applies provider overrides to telephony synthesis", async () => {
|
||||
const provider = buildOpenAISpeechProvider();
|
||||
const fetchMock = vi.fn(async (_url: string, init?: RequestInit) => {
|
||||
const body = parseRequestBody(init);
|
||||
expect(body).toMatchObject({
|
||||
model: "tts-1",
|
||||
voice: "nova",
|
||||
speed: 1.25,
|
||||
response_format: "pcm",
|
||||
});
|
||||
return new Response(new Uint8Array([1, 2, 3]), { status: 200 });
|
||||
});
|
||||
globalThis.fetch = fetchMock as unknown as typeof fetch;
|
||||
|
||||
const result = await provider.synthesizeTelephony?.({
|
||||
text: "hello",
|
||||
cfg: {} as never,
|
||||
providerConfig: {
|
||||
apiKey: "sk-test",
|
||||
model: "gpt-4o-mini-tts",
|
||||
voice: "alloy",
|
||||
speed: 1,
|
||||
},
|
||||
providerOverrides: {
|
||||
model: "tts-1",
|
||||
voice: "nova",
|
||||
speed: 1.25,
|
||||
},
|
||||
timeoutMs: 1_000,
|
||||
});
|
||||
|
||||
expect(result?.outputFormat).toBe("pcm");
|
||||
expect(fetchMock).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it("honors explicit responseFormat overrides and clears voice-note compatibility when not opus", async () => {
|
||||
const provider = buildOpenAISpeechProvider();
|
||||
mockSpeechFetchExpectingFormat("wav");
|
||||
|
||||
@@ -309,6 +309,7 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
|
||||
},
|
||||
synthesizeTelephony: async (req) => {
|
||||
const config = readOpenAIProviderConfig(req.providerConfig);
|
||||
const overrides = readOpenAIOverrides(req.providerOverrides);
|
||||
const apiKey = config.apiKey || process.env.OPENAI_API_KEY;
|
||||
if (!apiKey) {
|
||||
throw new Error("OpenAI API key missing");
|
||||
@@ -319,9 +320,9 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
|
||||
text: req.text,
|
||||
apiKey,
|
||||
baseUrl: config.baseUrl,
|
||||
model: config.model,
|
||||
voice: config.voice,
|
||||
speed: config.speed,
|
||||
model: overrides.model ?? config.model,
|
||||
voice: overrides.voice ?? config.voice,
|
||||
speed: overrides.speed ?? config.speed,
|
||||
instructions: config.instructions,
|
||||
responseFormat: outputFormat,
|
||||
timeoutMs: req.timeoutMs,
|
||||
|
||||
@@ -10,6 +10,7 @@ import type {
|
||||
SpeechProviderPlugin,
|
||||
SpeechProviderPrepareSynthesisContext,
|
||||
SpeechSynthesisRequest,
|
||||
SpeechTelephonySynthesisRequest,
|
||||
} from "openclaw/plugin-sdk/speech-core";
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
@@ -542,6 +543,47 @@ describe("speech-core native voice-note routing", () => {
|
||||
expect(result.attempts?.[0]).not.toHaveProperty("personaBinding");
|
||||
});
|
||||
|
||||
it("passes directive overrides to telephony synthesis providers", async () => {
|
||||
const synthesizeTelephony = vi.fn(async (_request: SpeechTelephonySynthesisRequest) => ({
|
||||
audioBuffer: Buffer.from("voice"),
|
||||
outputFormat: "pcm",
|
||||
sampleRate: 24000,
|
||||
}));
|
||||
installSpeechProviders([
|
||||
createMockSpeechProvider("mock", {
|
||||
synthesizeTelephony,
|
||||
}),
|
||||
]);
|
||||
|
||||
const result = await textToSpeechTelephony({
|
||||
text: "Use a directed telephony voice.",
|
||||
cfg: {
|
||||
messages: {
|
||||
tts: {
|
||||
enabled: true,
|
||||
provider: "mock",
|
||||
},
|
||||
},
|
||||
},
|
||||
overrides: {
|
||||
providerOverrides: {
|
||||
mock: {
|
||||
voice: "directed-voice",
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
expect(result.success).toBe(true);
|
||||
expect(synthesizeTelephony).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
providerOverrides: {
|
||||
voice: "directed-voice",
|
||||
},
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("uses provider defaults when fallback policy allows missing persona bindings", async () => {
|
||||
await synthesizeSpeech({
|
||||
text: "Use neutral provider defaults.",
|
||||
|
||||
@@ -1318,11 +1318,13 @@ export async function textToSpeechTelephony(params: {
|
||||
text: string;
|
||||
cfg: OpenClawConfig;
|
||||
prefsPath?: string;
|
||||
overrides?: TtsDirectiveOverrides;
|
||||
}): Promise<TtsTelephonyResult> {
|
||||
const setup = resolveTtsRequestSetup({
|
||||
text: params.text,
|
||||
cfg: params.cfg,
|
||||
prefsPath: params.prefsPath,
|
||||
providerOverride: params.overrides?.provider,
|
||||
});
|
||||
if ("error" in setup) {
|
||||
return { success: false, error: setup.error };
|
||||
@@ -1371,6 +1373,7 @@ export async function textToSpeechTelephony(params: {
|
||||
text: params.text,
|
||||
cfg,
|
||||
providerConfig: resolvedProvider.providerConfig,
|
||||
providerOverrides: params.overrides?.providerOverrides?.[resolvedProvider.provider.id],
|
||||
persona: resolvedProvider.synthesisPersona,
|
||||
personaProviderConfig: resolvedProvider.personaProviderConfig,
|
||||
target: "telephony",
|
||||
@@ -1380,6 +1383,7 @@ export async function textToSpeechTelephony(params: {
|
||||
text: prepared.text,
|
||||
cfg,
|
||||
providerConfig: prepared.providerConfig,
|
||||
providerOverrides: prepared.providerOverrides,
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
const latencyMs = Date.now() - providerStart;
|
||||
|
||||
@@ -117,6 +117,53 @@ describe("createTelephonyTtsProvider deepMerge hardening", () => {
|
||||
);
|
||||
});
|
||||
|
||||
it("strips telephony TTS directive tags before synthesis", async () => {
|
||||
let requestText: string | undefined;
|
||||
const provider = createTelephonyTtsProvider({
|
||||
coreConfig: createCoreConfig(),
|
||||
runtime: {
|
||||
textToSpeechTelephony: async ({ text }) => {
|
||||
requestText = text;
|
||||
return {
|
||||
success: true,
|
||||
audioBuffer: Buffer.alloc(2),
|
||||
sampleRate: 8000,
|
||||
};
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
await provider.synthesizeForTelephony("[[tts]]Hello caller[[/tts]]");
|
||||
|
||||
expect(requestText).toBe("Hello caller");
|
||||
});
|
||||
|
||||
it("uses hidden telephony TTS directive text for synthesis", async () => {
|
||||
let requestText: string | undefined;
|
||||
let requestOverrides: unknown;
|
||||
const provider = createTelephonyTtsProvider({
|
||||
coreConfig: createCoreConfig(),
|
||||
runtime: {
|
||||
textToSpeechTelephony: async ({ text, overrides }) => {
|
||||
requestText = text;
|
||||
requestOverrides = overrides;
|
||||
return {
|
||||
success: true,
|
||||
audioBuffer: Buffer.alloc(2),
|
||||
sampleRate: 8000,
|
||||
};
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
await provider.synthesizeForTelephony(
|
||||
"Visible text [[tts:text]]Speak this instead[[/tts:text]]",
|
||||
);
|
||||
|
||||
expect(requestText).toBe("Speak this instead");
|
||||
expect(requestOverrides).toMatchObject({ ttsText: "Speak this instead" });
|
||||
});
|
||||
|
||||
it("exposes configured timeoutMs as synthesisTimeoutMs", () => {
|
||||
const provider = createTelephonyTtsProvider({
|
||||
coreConfig: { messages: { tts: { provider: "openai", timeoutMs: 15000 } } },
|
||||
|
||||
@@ -1,3 +1,9 @@
|
||||
import {
|
||||
parseTtsDirectives,
|
||||
type SpeechModelOverridePolicy,
|
||||
type SpeechProviderConfig,
|
||||
type TtsDirectiveOverrides,
|
||||
} from "openclaw/plugin-sdk/speech";
|
||||
import type { VoiceCallTtsConfig } from "./config.js";
|
||||
import type { CoreConfig } from "./core-bridge.js";
|
||||
import { deepMergeDefined } from "./deep-merge.js";
|
||||
@@ -8,6 +14,7 @@ export type TelephonyTtsRuntime = {
|
||||
text: string;
|
||||
cfg: CoreConfig;
|
||||
prefsPath?: string;
|
||||
overrides?: TtsDirectiveOverrides;
|
||||
}) => Promise<{
|
||||
success: boolean;
|
||||
audioBuffer?: Buffer;
|
||||
@@ -26,6 +33,17 @@ export type TelephonyTtsProvider = {
|
||||
|
||||
export const TELEPHONY_DEFAULT_TTS_TIMEOUT_MS = 8000;
|
||||
|
||||
/**
 * Shape of the `modelOverrides` object read from the telephony TTS config.
 *
 * Every flag is optional; `resolveTelephonyModelOverridePolicy` fills in the
 * defaults for the ones that are missing.
 */
type TelephonyModelOverrideConfig = Partial<
  Record<
    | "enabled"
    | "allowText"
    | "allowProvider"
    | "allowVoice"
    | "allowModelId"
    | "allowVoiceSettings"
    | "allowNormalization"
    | "allowSeed",
    boolean
  >
>;
|
||||
|
||||
export function createTelephonyTtsProvider(params: {
|
||||
coreConfig: CoreConfig;
|
||||
ttsOverride?: VoiceCallTtsConfig;
|
||||
@@ -36,15 +54,35 @@ export function createTelephonyTtsProvider(params: {
|
||||
}): TelephonyTtsProvider {
|
||||
const { coreConfig, ttsOverride, runtime, logger } = params;
|
||||
const mergedConfig = applyTtsOverride(coreConfig, ttsOverride);
|
||||
const ttsConfig = mergedConfig.messages?.tts;
|
||||
const modelOverrides = resolveTelephonyModelOverridePolicy(
|
||||
readTelephonyModelOverrides(ttsConfig),
|
||||
);
|
||||
const providerConfigs = collectTelephonyProviderConfigs(ttsConfig);
|
||||
const activeProvider = normalizeProviderId(ttsConfig?.provider);
|
||||
const synthesisTimeoutMs =
|
||||
mergedConfig.messages?.tts?.timeoutMs ?? TELEPHONY_DEFAULT_TTS_TIMEOUT_MS;
|
||||
|
||||
return {
|
||||
synthesisTimeoutMs,
|
||||
synthesizeForTelephony: async (text: string) => {
|
||||
const result = await runtime.textToSpeechTelephony({
|
||||
text,
|
||||
const directives = parseTtsDirectives(text, modelOverrides, {
|
||||
cfg: mergedConfig,
|
||||
providerConfigs,
|
||||
preferredProviderId: activeProvider,
|
||||
});
|
||||
if (directives.warnings.length > 0) {
|
||||
logger?.warn?.(
|
||||
`[voice-call] Ignored telephony TTS directive overrides (${directives.warnings.join("; ")})`,
|
||||
);
|
||||
}
|
||||
const cleanText = directives.hasDirective
|
||||
? directives.ttsText?.trim() || directives.cleanedText.trim()
|
||||
: text;
|
||||
const result = await runtime.textToSpeechTelephony({
|
||||
text: cleanText,
|
||||
cfg: mergedConfig,
|
||||
overrides: directives.overrides,
|
||||
});
|
||||
|
||||
if (!result.success || !result.audioBuffer || !result.sampleRate) {
|
||||
@@ -101,3 +139,97 @@ function mergeTtsConfig(
|
||||
}
|
||||
return deepMergeDefined(base, override) as VoiceCallTtsConfig;
|
||||
}
|
||||
|
||||
/**
 * Expands an optional, partial override config into a fully populated policy.
 *
 * - `enabled` defaults to `true`. When it resolves to a falsy value, every
 *   permission in the returned policy is `false`.
 * - When enabled, each `allow*` flag defaults to `true`, except
 *   `allowProvider`, which defaults to `false`.
 *
 * @param overrides - Partial flags from config, or `undefined` when absent.
 * @returns A policy object with every field set.
 */
function resolveTelephonyModelOverridePolicy(
  overrides: TelephonyModelOverrideConfig | undefined,
): SpeechModelOverridePolicy {
  const isEnabled = overrides?.enabled ?? true;

  if (isEnabled) {
    return {
      enabled: true,
      allowText: overrides?.allowText ?? true,
      // Switching provider is the only permission that is opt-in.
      allowProvider: overrides?.allowProvider ?? false,
      allowVoice: overrides?.allowVoice ?? true,
      allowModelId: overrides?.allowModelId ?? true,
      allowVoiceSettings: overrides?.allowVoiceSettings ?? true,
      allowNormalization: overrides?.allowNormalization ?? true,
      allowSeed: overrides?.allowSeed ?? true,
    };
  }

  return {
    enabled: false,
    allowText: false,
    allowProvider: false,
    allowVoice: false,
    allowModelId: false,
    allowVoiceSettings: false,
    allowNormalization: false,
    allowSeed: false,
  };
}
|
||||
|
||||
/**
 * Pulls the `modelOverrides` entry out of a TTS config, if it is usable.
 *
 * @param ttsConfig - The TTS config to inspect; may be `undefined`.
 * @returns The `modelOverrides` value when it is a non-null, non-array object;
 *   otherwise `undefined`. The value is cast, not validated field by field.
 */
function readTelephonyModelOverrides(
  ttsConfig: VoiceCallTtsConfig | undefined,
): TelephonyModelOverrideConfig | undefined {
  const candidate = (ttsConfig as Record<string, unknown> | undefined)?.modelOverrides;
  if (typeof candidate !== "object" || candidate === null) {
    return undefined;
  }
  if (Array.isArray(candidate)) {
    return undefined;
  }
  return candidate as TelephonyModelOverrideConfig;
}
|
||||
|
||||
/**
 * Canonicalises a provider identifier for lookups.
 *
 * @param value - Any value; only strings are considered.
 * @returns The trimmed, lower-cased string, or `undefined` when `value` is not
 *   a string or is empty after trimming.
 */
function normalizeProviderId(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const normalized = value.trim().toLowerCase();
  return normalized === "" ? undefined : normalized;
}
|
||||
|
||||
/**
 * Coerces an arbitrary value into a provider config object.
 *
 * @param value - Candidate config value of unknown shape.
 * @returns `value` itself when it is a non-null, non-array object; otherwise a
 *   fresh empty object. The contents are not validated.
 */
function asProviderConfig(value: unknown): SpeechProviderConfig {
  if (typeof value !== "object" || value === null) {
    return {};
  }
  if (Array.isArray(value)) {
    return {};
  }
  return value as SpeechProviderConfig;
}
|
||||
|
||||
/**
 * Gathers per-provider settings from a TTS config into one lookup table.
 *
 * Two sources are read, in priority order:
 * 1. the entries of `ttsConfig.providers`, when that is a plain object;
 * 2. every other top-level key of `ttsConfig` whose value is a plain object
 *    and whose name is not one of the reserved (non-provider) settings keys.
 *
 * Keys are normalised with `normalizeProviderId`, falling back to the raw key
 * when normalisation yields nothing. A top-level entry never replaces one that
 * came from `providers`.
 *
 * Keys that would touch the result object's prototype chain (`__proto__`,
 * `constructor`, `prototype`) are ignored in both sources. Assigning
 * `__proto__` on a plain object swaps its prototype instead of adding an
 * entry, and an inherited `constructor` would otherwise make `??=` skip the
 * top-level form while the `providers` form was stored.
 *
 * @param ttsConfig - The TTS config to scan; may be `undefined`.
 * @returns A record from provider id to its config object; empty when
 *   `ttsConfig` is `undefined`.
 */
function collectTelephonyProviderConfigs(
  ttsConfig: VoiceCallTtsConfig | undefined,
): Record<string, SpeechProviderConfig> {
  if (!ttsConfig) {
    return {};
  }
  const entries: Record<string, SpeechProviderConfig> = {};
  // Provider ids originate from config keys; never let one reach the prototype chain.
  const unsafeKeys = new Set(["__proto__", "constructor", "prototype"]);

  const rawProviders =
    ttsConfig.providers &&
    typeof ttsConfig.providers === "object" &&
    !Array.isArray(ttsConfig.providers)
      ? (ttsConfig.providers as Record<string, unknown>)
      : {};
  for (const [providerId, value] of Object.entries(rawProviders)) {
    const normalized = normalizeProviderId(providerId) ?? providerId;
    if (unsafeKeys.has(normalized)) {
      continue;
    }
    entries[normalized] = asProviderConfig(value);
  }

  // Top-level keys that are general TTS settings rather than provider sections.
  const reservedKeys = new Set([
    "auto",
    "enabled",
    "maxTextLength",
    "mode",
    "modelOverrides",
    "persona",
    "personas",
    "prefsPath",
    "provider",
    "providers",
    "summaryModel",
    "timeoutMs",
  ]);
  for (const [key, value] of Object.entries(ttsConfig as Record<string, unknown>)) {
    if (
      reservedKeys.has(key) ||
      typeof value !== "object" ||
      value === null ||
      Array.isArray(value)
    ) {
      continue;
    }
    const normalized = normalizeProviderId(key) ?? key;
    if (unsafeKeys.has(normalized)) {
      continue;
    }
    // Entries from `providers` take precedence over top-level provider sections.
    entries[normalized] ??= asProviderConfig(value);
  }
  return entries;
}
|
||||
|
||||
@@ -88,6 +88,7 @@ export type TtsTelephonyRequestParams = {
|
||||
text: string;
|
||||
cfg: OpenClawConfig;
|
||||
prefsPath?: string;
|
||||
overrides?: TtsDirectiveOverrides;
|
||||
};
|
||||
|
||||
export type ListSpeechVoicesParams = {
|
||||
|
||||
@@ -61,6 +61,7 @@ export type SpeechTelephonySynthesisRequest = {
|
||||
text: string;
|
||||
cfg: OpenClawConfig;
|
||||
providerConfig: SpeechProviderConfig;
|
||||
providerOverrides?: SpeechProviderOverrides;
|
||||
timeoutMs: number;
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user