mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 12:20:44 +00:00
TTS: add provider personas
This commit is contained in:
@@ -1,5 +1,8 @@
|
||||
import * as providerHttp from "openclaw/plugin-sdk/provider-http";
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
import { afterEach, beforeAll, describe, expect, it, vi } from "vitest";
|
||||
import {
|
||||
getProviderHttpMocks,
|
||||
installProviderHttpMockCleanup,
|
||||
} from "../../test/helpers/media-generation/provider-http-mocks.js";
|
||||
|
||||
const transcodeAudioBufferToOpusMock = vi.hoisted(() => vi.fn());
|
||||
|
||||
@@ -7,10 +10,23 @@ vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
|
||||
transcodeAudioBufferToOpus: transcodeAudioBufferToOpusMock,
|
||||
}));
|
||||
|
||||
import { buildGoogleSpeechProvider, __testing } from "./speech-provider.js";
|
||||
const {
|
||||
assertOkOrThrowProviderErrorMock,
|
||||
postJsonRequestMock,
|
||||
resolveProviderHttpRequestConfigMock,
|
||||
} = getProviderHttpMocks();
|
||||
|
||||
function installGoogleTtsFetchMock(pcm = Buffer.from([1, 0, 2, 0])) {
|
||||
const fetchMock = vi.fn().mockResolvedValue({
|
||||
let buildGoogleSpeechProvider: typeof import("./speech-provider.js").buildGoogleSpeechProvider;
|
||||
let __testing: typeof import("./speech-provider.js").__testing;
|
||||
|
||||
beforeAll(async () => {
|
||||
({ buildGoogleSpeechProvider, __testing } = await import("./speech-provider.js"));
|
||||
});
|
||||
|
||||
installProviderHttpMockCleanup();
|
||||
|
||||
function googleTtsResponse(pcm = Buffer.from([1, 0, 2, 0])) {
|
||||
return {
|
||||
ok: true,
|
||||
json: async () => ({
|
||||
candidates: [
|
||||
@@ -28,21 +44,26 @@ function installGoogleTtsFetchMock(pcm = Buffer.from([1, 0, 2, 0])) {
|
||||
},
|
||||
],
|
||||
}),
|
||||
};
|
||||
}
|
||||
|
||||
function installGoogleTtsRequestMock(pcm = Buffer.from([1, 0, 2, 0])) {
|
||||
postJsonRequestMock.mockResolvedValue({
|
||||
response: googleTtsResponse(pcm),
|
||||
release: vi.fn(async () => {}),
|
||||
});
|
||||
vi.stubGlobal("fetch", fetchMock);
|
||||
return fetchMock;
|
||||
return postJsonRequestMock;
|
||||
}
|
||||
|
||||
describe("Google speech provider", () => {
|
||||
afterEach(() => {
|
||||
vi.restoreAllMocks();
|
||||
vi.unstubAllGlobals();
|
||||
vi.unstubAllEnvs();
|
||||
transcodeAudioBufferToOpusMock.mockReset();
|
||||
});
|
||||
|
||||
it("synthesizes Gemini PCM as WAV and preserves audio tags in the request text", async () => {
|
||||
const fetchMock = installGoogleTtsFetchMock();
|
||||
const requestMock = installGoogleTtsRequestMock();
|
||||
const provider = buildGoogleSpeechProvider();
|
||||
|
||||
const result = await provider.synthesize({
|
||||
@@ -57,11 +78,10 @@ describe("Google speech provider", () => {
|
||||
timeoutMs: 12_345,
|
||||
});
|
||||
|
||||
expect(fetchMock).toHaveBeenCalledWith(
|
||||
"https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent",
|
||||
expect(requestMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
method: "POST",
|
||||
body: JSON.stringify({
|
||||
url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent",
|
||||
body: {
|
||||
contents: [
|
||||
{
|
||||
role: "user",
|
||||
@@ -78,11 +98,14 @@ describe("Google speech provider", () => {
|
||||
},
|
||||
},
|
||||
},
|
||||
}),
|
||||
},
|
||||
fetchFn: fetch,
|
||||
pinDns: false,
|
||||
timeoutMs: 12_345,
|
||||
}),
|
||||
);
|
||||
const [, init] = fetchMock.mock.calls[0];
|
||||
expect(new Headers(init.headers).get("x-goog-api-key")).toBe("google-test-key");
|
||||
const request = requestMock.mock.calls[0]?.[0] as { headers?: HeadersInit };
|
||||
expect(new Headers(request.headers).get("x-goog-api-key")).toBe("google-test-key");
|
||||
expect(result.outputFormat).toBe("wav");
|
||||
expect(result.fileExtension).toBe(".wav");
|
||||
expect(result.voiceCompatible).toBe(false);
|
||||
@@ -94,7 +117,7 @@ describe("Google speech provider", () => {
|
||||
});
|
||||
|
||||
it("transcodes Gemini PCM to Opus for voice-note targets", async () => {
|
||||
installGoogleTtsFetchMock(Buffer.from([5, 0, 6, 0]));
|
||||
installGoogleTtsRequestMock(Buffer.from([5, 0, 6, 0]));
|
||||
transcodeAudioBufferToOpusMock.mockResolvedValueOnce(Buffer.from("google-opus"));
|
||||
const provider = buildGoogleSpeechProvider();
|
||||
|
||||
@@ -125,9 +148,138 @@ describe("Google speech provider", () => {
|
||||
expect(audioBuffer.subarray(8, 12).toString("ascii")).toBe("WAVE");
|
||||
});
|
||||
|
||||
it("advertises all documented Gemini TTS-capable models", () => {
|
||||
const provider = buildGoogleSpeechProvider();
|
||||
|
||||
expect(provider.models).toEqual(__testing.GOOGLE_TTS_MODELS);
|
||||
});
|
||||
|
||||
it("renders deterministic audio-profile-v1 prompts without generating tags", async () => {
|
||||
const provider = buildGoogleSpeechProvider();
|
||||
|
||||
const prepared = await provider.prepareSynthesis?.({
|
||||
text: "[whispers] The door is open.",
|
||||
cfg: {},
|
||||
providerConfig: {
|
||||
promptTemplate: "audio-profile-v1",
|
||||
personaPrompt: "Keep a close-mic feel.",
|
||||
},
|
||||
persona: {
|
||||
id: "alfred",
|
||||
label: "Alfred",
|
||||
prompt: {
|
||||
profile: "A brilliant British butler.",
|
||||
scene: "A quiet late-night study.",
|
||||
sampleContext: "The speaker is answering a trusted operator.",
|
||||
style: "Refined and lightly amused.",
|
||||
accent: "British English.",
|
||||
pacing: "Measured.",
|
||||
constraints: ["Do not read configuration values aloud."],
|
||||
},
|
||||
},
|
||||
target: "audio-file",
|
||||
timeoutMs: 1_000,
|
||||
});
|
||||
|
||||
expect(prepared?.text).toBe(
|
||||
[
|
||||
"Synthesize speech from the TRANSCRIPT section only. Use the other sections only",
|
||||
"as performance direction. Do not read section titles, notes, labels, or",
|
||||
"configuration aloud.",
|
||||
"",
|
||||
"# AUDIO PROFILE: Alfred",
|
||||
"A brilliant British butler.",
|
||||
"",
|
||||
"## THE SCENE",
|
||||
"A quiet late-night study.",
|
||||
"",
|
||||
"### DIRECTOR'S NOTES",
|
||||
"Style: Refined and lightly amused.",
|
||||
"Accent: British English.",
|
||||
"Pacing: Measured.",
|
||||
"Constraints:",
|
||||
"- Do not read configuration values aloud.",
|
||||
"Provider notes:",
|
||||
"Keep a close-mic feel.",
|
||||
"",
|
||||
"### SAMPLE CONTEXT",
|
||||
"The speaker is answering a trusted operator.",
|
||||
"",
|
||||
"### TRANSCRIPT",
|
||||
"[whispers] The door is open.",
|
||||
].join("\n"),
|
||||
);
|
||||
});
|
||||
|
||||
it("does not wrap an OpenClaw audio-profile-v1 prompt twice", async () => {
|
||||
const provider = buildGoogleSpeechProvider();
|
||||
const text = [
|
||||
"Synthesize speech from the TRANSCRIPT section only. Use the other sections only",
|
||||
"as performance direction. Do not read section titles, notes, labels, or",
|
||||
"configuration aloud.",
|
||||
"",
|
||||
"# AUDIO PROFILE: Alfred",
|
||||
"A brilliant British butler.",
|
||||
"",
|
||||
"### TRANSCRIPT",
|
||||
"Hello.",
|
||||
].join("\n");
|
||||
|
||||
const prepared = await provider.prepareSynthesis?.({
|
||||
text,
|
||||
cfg: {},
|
||||
providerConfig: {
|
||||
promptTemplate: "audio-profile-v1",
|
||||
},
|
||||
persona: {
|
||||
id: "alfred",
|
||||
label: "Alfred",
|
||||
prompt: {
|
||||
profile: "A brilliant British butler.",
|
||||
},
|
||||
},
|
||||
target: "audio-file",
|
||||
timeoutMs: 1_000,
|
||||
});
|
||||
|
||||
expect(prepared).toBeUndefined();
|
||||
});
|
||||
|
||||
it("retries once when Gemini returns no audio payload", async () => {
|
||||
const pcm = Buffer.from([5, 0, 6, 0]);
|
||||
const requestSequence = vi
|
||||
.fn()
|
||||
.mockResolvedValueOnce({
|
||||
response: {
|
||||
ok: true,
|
||||
json: async () => ({ candidates: [{ content: { parts: [{ text: "not audio" }] } }] }),
|
||||
},
|
||||
release: vi.fn(async () => {}),
|
||||
})
|
||||
.mockResolvedValueOnce({
|
||||
response: googleTtsResponse(pcm),
|
||||
release: vi.fn(async () => {}),
|
||||
});
|
||||
postJsonRequestMock.mockImplementation(requestSequence);
|
||||
const provider = buildGoogleSpeechProvider();
|
||||
|
||||
const result = await provider.synthesize({
|
||||
text: "Retry this.",
|
||||
cfg: {},
|
||||
providerConfig: {
|
||||
apiKey: "google-test-key",
|
||||
},
|
||||
target: "audio-file",
|
||||
timeoutMs: 5_000,
|
||||
});
|
||||
|
||||
expect(requestSequence).toHaveBeenCalledTimes(2);
|
||||
expect(result.audioBuffer.subarray(44)).toEqual(pcm);
|
||||
});
|
||||
|
||||
it("falls back to GEMINI_API_KEY and configured Google API base URL", async () => {
|
||||
vi.stubEnv("GEMINI_API_KEY", "env-google-key");
|
||||
const fetchMock = installGoogleTtsFetchMock();
|
||||
const requestMock = installGoogleTtsRequestMock();
|
||||
const provider = buildGoogleSpeechProvider();
|
||||
|
||||
expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 1 })).toBe(true);
|
||||
@@ -149,16 +301,17 @@ describe("Google speech provider", () => {
|
||||
timeoutMs: 10_000,
|
||||
});
|
||||
|
||||
expect(fetchMock).toHaveBeenCalledWith(
|
||||
"https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent",
|
||||
expect.any(Object),
|
||||
expect(requestMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent",
|
||||
}),
|
||||
);
|
||||
const [, init] = fetchMock.mock.calls[0];
|
||||
expect(new Headers(init.headers).get("x-goog-api-key")).toBe("env-google-key");
|
||||
const request = requestMock.mock.calls[0]?.[0] as { headers?: HeadersInit };
|
||||
expect(new Headers(request.headers).get("x-goog-api-key")).toBe("env-google-key");
|
||||
});
|
||||
|
||||
it("can reuse a configured Google model-provider API key without auth profiles", async () => {
|
||||
const fetchMock = installGoogleTtsFetchMock();
|
||||
const requestMock = installGoogleTtsRequestMock();
|
||||
const provider = buildGoogleSpeechProvider();
|
||||
const cfg = {
|
||||
models: {
|
||||
@@ -182,13 +335,13 @@ describe("Google speech provider", () => {
|
||||
timeoutMs: 10_000,
|
||||
});
|
||||
|
||||
const [, init] = fetchMock.mock.calls[0];
|
||||
expect(new Headers(init.headers).get("x-goog-api-key")).toBe("model-provider-google-key");
|
||||
const request = requestMock.mock.calls[0]?.[0] as { headers?: HeadersInit };
|
||||
expect(new Headers(request.headers).get("x-goog-api-key")).toBe("model-provider-google-key");
|
||||
});
|
||||
|
||||
it("returns Gemini PCM directly for telephony synthesis", async () => {
|
||||
const pcm = Buffer.from([3, 0, 4, 0]);
|
||||
installGoogleTtsFetchMock(pcm);
|
||||
installGoogleTtsRequestMock(pcm);
|
||||
const provider = buildGoogleSpeechProvider();
|
||||
|
||||
const result = await provider.synthesizeTelephony?.({
|
||||
@@ -209,7 +362,7 @@ describe("Google speech provider", () => {
|
||||
});
|
||||
|
||||
it("prepends configured Gemini TTS profile text", async () => {
|
||||
const fetchMock = installGoogleTtsFetchMock();
|
||||
const requestMock = installGoogleTtsRequestMock();
|
||||
const provider = buildGoogleSpeechProvider();
|
||||
|
||||
await provider.synthesize({
|
||||
@@ -224,8 +377,7 @@ describe("Google speech provider", () => {
|
||||
timeoutMs: 10_000,
|
||||
});
|
||||
|
||||
const [, init] = fetchMock.mock.calls[0];
|
||||
expect(JSON.parse(String(init.body))).toMatchObject({
|
||||
expect(requestMock.mock.calls[0]?.[0].body).toMatchObject({
|
||||
contents: [
|
||||
{
|
||||
parts: [
|
||||
@@ -326,23 +478,26 @@ describe("Google speech provider", () => {
|
||||
});
|
||||
|
||||
it("formats Google TTS HTTP errors with provider details", async () => {
|
||||
vi.stubGlobal(
|
||||
"fetch",
|
||||
vi.fn().mockResolvedValue(
|
||||
new Response(
|
||||
JSON.stringify({
|
||||
error: {
|
||||
message: "Quota exceeded",
|
||||
status: "RESOURCE_EXHAUSTED",
|
||||
},
|
||||
}),
|
||||
{
|
||||
status: 429,
|
||||
headers: { "x-request-id": "google_req_123" },
|
||||
},
|
||||
),
|
||||
assertOkOrThrowProviderErrorMock.mockRejectedValue(
|
||||
new Error(
|
||||
"Google TTS failed (429): Quota exceeded [code=RESOURCE_EXHAUSTED] [request_id=google_req_123]",
|
||||
),
|
||||
);
|
||||
postJsonRequestMock.mockResolvedValue({
|
||||
response: new Response(
|
||||
JSON.stringify({
|
||||
error: {
|
||||
message: "Quota exceeded",
|
||||
status: "RESOURCE_EXHAUSTED",
|
||||
},
|
||||
}),
|
||||
{
|
||||
status: 429,
|
||||
headers: { "x-request-id": "google_req_123" },
|
||||
},
|
||||
),
|
||||
release: vi.fn(async () => {}),
|
||||
});
|
||||
const provider = buildGoogleSpeechProvider();
|
||||
|
||||
await expect(
|
||||
@@ -359,8 +514,7 @@ describe("Google speech provider", () => {
|
||||
});
|
||||
|
||||
it("honors configured private-network opt-in for Google TTS", async () => {
|
||||
installGoogleTtsFetchMock();
|
||||
const postJsonRequestSpy = vi.spyOn(providerHttp, "postJsonRequest");
|
||||
installGoogleTtsRequestMock();
|
||||
|
||||
const provider = buildGoogleSpeechProvider();
|
||||
await provider.synthesize({
|
||||
@@ -381,14 +535,16 @@ describe("Google speech provider", () => {
|
||||
timeoutMs: 12_345,
|
||||
});
|
||||
|
||||
expect(postJsonRequestSpy).toHaveBeenCalledWith(
|
||||
expect.objectContaining({ allowPrivateNetwork: true }),
|
||||
expect(resolveProviderHttpRequestConfigMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
allowPrivateNetwork: true,
|
||||
request: expect.objectContaining({ allowPrivateNetwork: true }),
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("honors configured private-network opt-in for Google telephony TTS", async () => {
|
||||
installGoogleTtsFetchMock();
|
||||
const postJsonRequestSpy = vi.spyOn(providerHttp, "postJsonRequest");
|
||||
installGoogleTtsRequestMock();
|
||||
|
||||
const provider = buildGoogleSpeechProvider();
|
||||
await provider.synthesizeTelephony?.({
|
||||
@@ -408,8 +564,11 @@ describe("Google speech provider", () => {
|
||||
timeoutMs: 12_345,
|
||||
});
|
||||
|
||||
expect(postJsonRequestSpy).toHaveBeenCalledWith(
|
||||
expect.objectContaining({ allowPrivateNetwork: true }),
|
||||
expect(resolveProviderHttpRequestConfigMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
allowPrivateNetwork: true,
|
||||
request: expect.objectContaining({ allowPrivateNetwork: true }),
|
||||
}),
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -21,6 +21,13 @@ const DEFAULT_GOOGLE_TTS_VOICE = "Kore";
|
||||
const GOOGLE_TTS_SAMPLE_RATE = 24_000;
|
||||
const GOOGLE_TTS_CHANNELS = 1;
|
||||
const GOOGLE_TTS_BITS_PER_SAMPLE = 16;
|
||||
/** Identifier of the structured prompt template accepted in `promptTemplate` provider config. */
const GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE = "audio-profile-v1";

/** Gemini model ids the Google speech provider lists in its `models` field. */
const GOOGLE_TTS_MODELS = [
  "gemini-3.1-flash-tts-preview",
  "gemini-2.5-flash-preview-tts",
  "gemini-2.5-pro-preview-tts",
] as const;
|
||||
|
||||
const GOOGLE_TTS_VOICES = [
|
||||
"Zephyr",
|
||||
@@ -62,6 +69,8 @@ type GoogleTtsProviderConfig = {
|
||||
voiceName: string;
|
||||
audioProfile?: string;
|
||||
speakerName?: string;
|
||||
promptTemplate?: typeof GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE;
|
||||
personaPrompt?: string;
|
||||
};
|
||||
|
||||
type GoogleTtsProviderOverrides = {
|
||||
@@ -91,6 +100,13 @@ type GoogleGenerateSpeechResponse = {
|
||||
}>;
|
||||
};
|
||||
|
||||
/**
 * Error type used to flag a Google TTS failure that is worth attempting again.
 * Callers distinguish it from other failures with `instanceof`.
 */
class GoogleTtsRetryableError extends Error {
  constructor(message: string) {
    super(message);
    // Replace the inherited "Error" name so the failure kind can be read from `name`.
    this.name = "GoogleTtsRetryableError";
  }
}
|
||||
|
||||
function normalizeGoogleTtsModel(model: unknown): string {
|
||||
const trimmed = normalizeOptionalString(model);
|
||||
if (!trimmed) {
|
||||
@@ -104,6 +120,19 @@ function normalizeGoogleTtsVoiceName(voiceName: unknown): string {
|
||||
return normalizeOptionalString(voiceName) ?? DEFAULT_GOOGLE_TTS_VOICE;
|
||||
}
|
||||
|
||||
/**
 * Validates a configured `promptTemplate` value.
 *
 * @param value - Raw config value of any type.
 * @returns The template id when it matches the supported template, or
 *   `undefined` when the value normalizes to nothing.
 * @throws Error when a non-empty value names an unsupported template.
 */
function normalizeGooglePromptTemplate(
  value: unknown,
): typeof GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE | undefined {
  const candidate = normalizeOptionalString(value);
  if (candidate === GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE) {
    return candidate;
  }
  if (candidate) {
    // Anything else that is non-empty is a configuration mistake; fail loudly.
    throw new Error(`Invalid Google TTS promptTemplate: ${candidate}`);
  }
  return undefined;
}
|
||||
|
||||
function resolveGoogleTtsEnvApiKey(): string | undefined {
|
||||
return (
|
||||
normalizeOptionalString(process.env.GEMINI_API_KEY) ??
|
||||
@@ -149,6 +178,8 @@ function normalizeGoogleTtsProviderConfig(
|
||||
rawConfig: Record<string, unknown>,
|
||||
): GoogleTtsProviderConfig {
|
||||
const raw = resolveGoogleTtsConfigRecord(rawConfig);
|
||||
const promptTemplate = normalizeGooglePromptTemplate(raw?.promptTemplate);
|
||||
const personaPrompt = trimToUndefined(raw?.personaPrompt);
|
||||
return {
|
||||
apiKey: normalizeResolvedSecretInputString({
|
||||
value: raw?.apiKey,
|
||||
@@ -159,11 +190,16 @@ function normalizeGoogleTtsProviderConfig(
|
||||
voiceName: normalizeGoogleTtsVoiceName(raw?.voiceName ?? raw?.voice),
|
||||
audioProfile: trimToUndefined(raw?.audioProfile),
|
||||
speakerName: trimToUndefined(raw?.speakerName),
|
||||
...(promptTemplate ? { promptTemplate } : {}),
|
||||
...(personaPrompt ? { personaPrompt } : {}),
|
||||
};
|
||||
}
|
||||
|
||||
function readGoogleTtsProviderConfig(config: SpeechProviderConfig): GoogleTtsProviderConfig {
|
||||
const normalized = normalizeGoogleTtsProviderConfig({});
|
||||
const promptTemplate =
|
||||
normalizeGooglePromptTemplate(config.promptTemplate) ?? normalized.promptTemplate;
|
||||
const personaPrompt = trimToUndefined(config.personaPrompt) ?? normalized.personaPrompt;
|
||||
return {
|
||||
apiKey: trimToUndefined(config.apiKey) ?? normalized.apiKey,
|
||||
baseUrl: trimToUndefined(config.baseUrl) ?? normalized.baseUrl,
|
||||
@@ -173,6 +209,8 @@ function readGoogleTtsProviderConfig(config: SpeechProviderConfig): GoogleTtsPro
|
||||
),
|
||||
audioProfile: trimToUndefined(config.audioProfile) ?? normalized.audioProfile,
|
||||
speakerName: trimToUndefined(config.speakerName) ?? normalized.speakerName,
|
||||
...(promptTemplate ? { promptTemplate } : {}),
|
||||
...(personaPrompt ? { personaPrompt } : {}),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -243,6 +281,116 @@ function extractGoogleSpeechPcm(payload: GoogleGenerateSpeechResponse): Buffer {
|
||||
throw new Error("Google TTS response missing audio data");
|
||||
}
|
||||
|
||||
/**
 * Cleans one piece of free-form prompt text before it is placed in a prompt section.
 *
 * Line endings are normalized to `\n`, ASCII control characters other than
 * tab and line feed are removed, and surrounding whitespace is trimmed.
 *
 * Trimming happens after the control characters are stripped, so input that
 * consists only of control characters yields `undefined` rather than an empty
 * string, and whitespace that sat next to a stripped character does not
 * survive at the edges.
 *
 * @param value - Raw text, possibly undefined.
 * @returns The cleaned text, or `undefined` when nothing printable remains.
 */
function normalizePromptSectionText(value: string | undefined): string | undefined {
  if (value == null) {
    return undefined;
  }
  let sanitized = "";
  for (const char of value.replace(/\r\n?/g, "\n")) {
    const code = char.charCodeAt(0);
    // Keep tab (9), line feed (10) and carriage return (13, already converted above).
    const isStrippedControl =
      (code >= 0 && code <= 8) ||
      code === 11 ||
      code === 12 ||
      (code >= 14 && code <= 31) ||
      code === 127;
    if (!isStrippedControl) {
      sanitized += char;
    }
  }
  const trimmed = sanitized.trim();
  return trimmed.length > 0 ? trimmed : undefined;
}
|
||||
|
||||
/**
 * Normalizes every entry of a list of prompt strings and drops the entries
 * that normalize to nothing.
 *
 * @param values - Raw entries; `undefined` is treated as an empty list.
 * @returns A new array containing only the non-empty normalized entries, in order.
 */
function normalizePromptList(values: readonly string[] | undefined): string[] {
  const kept: string[] = [];
  for (const entry of values ?? []) {
    const normalized = normalizePromptSectionText(entry);
    if (normalized) {
      kept.push(normalized);
    }
  }
  return kept;
}
|
||||
|
||||
/**
 * Reports whether `text` already looks like a rendered audio-profile prompt,
 * so that it is not wrapped a second time.
 *
 * Detection relies only on the two parts the renderer always emits: the
 * leading instruction sentence and the `### TRANSCRIPT` heading. The
 * `# AUDIO PROFILE:` section is deliberately not required, because the
 * renderer writes it only when a persona label or profile is available;
 * prompts rendered from provider notes alone would otherwise go unrecognized.
 *
 * @param text - Candidate synthesis text.
 * @returns `true` when the text starts with the instruction sentence and
 *   contains a transcript heading.
 */
function isOpenClawGoogleAudioProfilePrompt(text: string): boolean {
  return (
    text.startsWith("Synthesize speech from the TRANSCRIPT section only.") &&
    text.includes("### TRANSCRIPT")
  );
}
|
||||
|
||||
/**
 * Builds the sectioned prompt sent for synthesis: a fixed instruction
 * preamble, optional persona sections, and finally the transcript.
 *
 * Sections are emitted in a fixed order (audio profile, scene, director's
 * notes, sample context, transcript) and separated by blank lines. A section
 * is omitted entirely when none of its inputs normalize to text.
 *
 * @param params.text - Transcript text; line endings are normalized and the result trimmed.
 * @param params.persona - Optional persona whose label/id and prompt fields feed the sections.
 * @param params.personaPrompt - Optional provider-specific notes appended to the director's notes.
 * @returns The rendered prompt string.
 */
function renderGoogleAudioProfilePrompt(params: {
  text: string;
  persona?: {
    id: string;
    label?: string;
    prompt?: {
      profile?: string;
      scene?: string;
      sampleContext?: string;
      style?: string;
      accent?: string;
      pacing?: string;
      constraints?: string[];
    };
  };
  personaPrompt?: string;
}): string {
  const { persona } = params;
  const fields = persona?.prompt;

  // The label wins over the id when both normalize to something.
  const heading =
    normalizePromptSectionText(persona?.label) ?? normalizePromptSectionText(persona?.id);
  const profileText = normalizePromptSectionText(fields?.profile);
  const sceneText = normalizePromptSectionText(fields?.scene);
  const sampleContextText = normalizePromptSectionText(fields?.sampleContext);
  const providerNotes = normalizePromptSectionText(params.personaPrompt);
  const constraintItems = normalizePromptList(fields?.constraints);

  const blocks: string[] = [
    [
      "Synthesize speech from the TRANSCRIPT section only. Use the other sections only",
      "as performance direction. Do not read section titles, notes, labels, or",
      "configuration aloud.",
    ].join("\n"),
  ];

  if (heading || profileText) {
    const title = `# AUDIO PROFILE: ${heading ?? "voice"}`;
    blocks.push(profileText ? `${title}\n${profileText}` : title);
  }

  if (sceneText) {
    blocks.push(`## THE SCENE\n${sceneText}`);
  }

  // Single-line notes first, in a fixed order, followed by the multi-line ones.
  const notes: string[] = [];
  const singleLineNotes = [
    ["Style", normalizePromptSectionText(fields?.style)],
    ["Accent", normalizePromptSectionText(fields?.accent)],
    ["Pacing", normalizePromptSectionText(fields?.pacing)],
  ] as const;
  for (const [name, value] of singleLineNotes) {
    if (value) {
      notes.push(`${name}: ${value}`);
    }
  }
  if (constraintItems.length > 0) {
    const bullets = constraintItems.map((item) => `- ${item}`).join("\n");
    notes.push(`Constraints:\n${bullets}`);
  }
  if (providerNotes) {
    notes.push(`Provider notes:\n${providerNotes}`);
  }
  if (notes.length > 0) {
    blocks.push(`### DIRECTOR'S NOTES\n${notes.join("\n")}`);
  }

  if (sampleContextText) {
    blocks.push(`### SAMPLE CONTEXT\n${sampleContextText}`);
  }

  // The transcript section is always present, even when the transcript is empty.
  const transcript = params.text.replace(/\r\n?/g, "\n").trim();
  blocks.push(`### TRANSCRIPT\n${transcript}`);

  return blocks.join("\n\n");
}
|
||||
|
||||
function wrapPcm16MonoToWav(pcm: Buffer, sampleRate = GOOGLE_TTS_SAMPLE_RATE): Buffer {
|
||||
const byteRate = sampleRate * GOOGLE_TTS_CHANNELS * (GOOGLE_TTS_BITS_PER_SAMPLE / 8);
|
||||
const blockAlign = GOOGLE_TTS_CHANNELS * (GOOGLE_TTS_BITS_PER_SAMPLE / 8);
|
||||
@@ -265,7 +413,7 @@ function wrapPcm16MonoToWav(pcm: Buffer, sampleRate = GOOGLE_TTS_SAMPLE_RATE): B
|
||||
return Buffer.concat([header, pcm]);
|
||||
}
|
||||
|
||||
async function synthesizeGoogleTtsPcm(params: {
|
||||
async function synthesizeGoogleTtsPcmOnce(params: {
|
||||
text: string;
|
||||
apiKey: string;
|
||||
baseUrl?: string;
|
||||
@@ -322,19 +470,59 @@ async function synthesizeGoogleTtsPcm(params: {
|
||||
});
|
||||
|
||||
try {
|
||||
await assertOkOrThrowProviderError(res, "Google TTS failed");
|
||||
return extractGoogleSpeechPcm((await res.json()) as GoogleGenerateSpeechResponse);
|
||||
if (!res.ok) {
|
||||
try {
|
||||
await assertOkOrThrowProviderError(res, "Google TTS failed");
|
||||
} catch (err) {
|
||||
const message = err instanceof Error ? err.message : String(err);
|
||||
if (res.status >= 500 && res.status < 600) {
|
||||
throw new GoogleTtsRetryableError(message);
|
||||
}
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
try {
|
||||
return extractGoogleSpeechPcm((await res.json()) as GoogleGenerateSpeechResponse);
|
||||
} catch (err) {
|
||||
const message = err instanceof Error ? err.message : String(err);
|
||||
throw new GoogleTtsRetryableError(message);
|
||||
}
|
||||
} finally {
|
||||
await release();
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Requests PCM audio from Google TTS, making at most two attempts.
 *
 * The second attempt happens only when the first one fails with
 * `GoogleTtsRetryableError`; any other failure is rethrown immediately, and
 * whatever the second attempt throws is propagated unchanged.
 *
 * @param params - Request parameters, forwarded as-is to each attempt.
 * @returns The PCM buffer produced by the successful attempt.
 */
async function synthesizeGoogleTtsPcm(params: {
  text: string;
  apiKey: string;
  baseUrl?: string;
  request?: ReturnType<typeof sanitizeConfiguredModelProviderRequest>;
  model: string;
  voiceName: string;
  audioProfile?: string;
  speakerName?: string;
  timeoutMs: number;
}): Promise<Buffer> {
  try {
    return await synthesizeGoogleTtsPcmOnce(params);
  } catch (firstError) {
    if (!(firstError instanceof GoogleTtsRetryableError)) {
      throw firstError;
    }
    // Retryable failure on the first attempt: fall through to the single retry.
  }
  return await synthesizeGoogleTtsPcmOnce(params);
}
|
||||
|
||||
export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
|
||||
return {
|
||||
id: "google",
|
||||
label: "Google",
|
||||
autoSelectOrder: 50,
|
||||
models: [DEFAULT_GOOGLE_TTS_MODEL],
|
||||
models: GOOGLE_TTS_MODELS,
|
||||
voices: GOOGLE_TTS_VOICES,
|
||||
resolveConfig: ({ rawConfig }) => normalizeGoogleTtsProviderConfig(rawConfig),
|
||||
parseDirectiveToken,
|
||||
@@ -372,6 +560,22 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
|
||||
listVoices: async () => GOOGLE_TTS_VOICES.map((voice) => ({ id: voice, name: voice })),
|
||||
isConfigured: ({ cfg, providerConfig }) =>
|
||||
Boolean(resolveGoogleTtsApiKey({ cfg, providerConfig })),
|
||||
// Wraps the text in the audio-profile prompt when the config asks for it
// (template selected or provider notes present) and the text is not already
// a rendered prompt; otherwise leaves the request untouched.
prepareSynthesis: (ctx) => {
  const { promptTemplate, personaPrompt } = readGoogleTtsProviderConfig(ctx.providerConfig);
  const wrappingRequested =
    promptTemplate === GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE || Boolean(personaPrompt);
  if (!wrappingRequested) {
    return undefined;
  }
  if (isOpenClawGoogleAudioProfilePrompt(ctx.text)) {
    return undefined;
  }
  const text = renderGoogleAudioProfilePrompt({
    text: ctx.text,
    persona: ctx.persona,
    personaPrompt,
  });
  return { text };
},
|
||||
synthesize: async (req) => {
|
||||
const config = readGoogleTtsProviderConfig(req.providerConfig);
|
||||
const overrides = readGoogleTtsOverrides(req.providerOverrides);
|
||||
@@ -449,7 +653,10 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
|
||||
export const __testing = {
|
||||
DEFAULT_GOOGLE_TTS_MODEL,
|
||||
DEFAULT_GOOGLE_TTS_VOICE,
|
||||
GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE,
|
||||
GOOGLE_TTS_MODELS,
|
||||
GOOGLE_TTS_SAMPLE_RATE,
|
||||
normalizeGoogleTtsModel,
|
||||
renderGoogleAudioProfilePrompt,
|
||||
wrapPcm16MonoToWav,
|
||||
};
|
||||
|
||||
@@ -134,6 +134,7 @@ function createLiveTtsConfig(): ResolvedTtsConfig {
|
||||
voice: "alloy",
|
||||
},
|
||||
},
|
||||
personas: {},
|
||||
maxTextLength: 4_000,
|
||||
timeoutMs: 30_000,
|
||||
};
|
||||
|
||||
@@ -162,6 +162,40 @@ describe("buildOpenAISpeechProvider", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("maps persona prompt fields to instructions when instructions are unset", async () => {
|
||||
const provider = buildOpenAISpeechProvider();
|
||||
|
||||
const prepared = await provider.prepareSynthesis?.({
|
||||
text: "hello",
|
||||
cfg: {} as never,
|
||||
providerConfig: {
|
||||
apiKey: "sk-test",
|
||||
model: "gpt-4o-mini-tts",
|
||||
voice: "cedar",
|
||||
},
|
||||
persona: {
|
||||
id: "alfred",
|
||||
label: "Alfred",
|
||||
prompt: {
|
||||
profile: "A brilliant British butler.",
|
||||
scene: "A quiet late-night study.",
|
||||
sampleContext: "The speaker is answering a trusted operator.",
|
||||
style: "Refined and lightly amused.",
|
||||
accent: "British English.",
|
||||
pacing: "Measured.",
|
||||
constraints: ["Do not read configuration values aloud."],
|
||||
},
|
||||
},
|
||||
target: "audio-file",
|
||||
timeoutMs: 1_000,
|
||||
});
|
||||
|
||||
expect(prepared?.providerConfig?.instructions).toContain("Persona: Alfred");
|
||||
expect(prepared?.providerConfig?.instructions).toContain(
|
||||
"Constraint: Do not read configuration values aloud.",
|
||||
);
|
||||
});
|
||||
|
||||
it("uses wav for Groq-compatible OpenAI TTS endpoints", async () => {
|
||||
const provider = buildOpenAISpeechProvider();
|
||||
mockSpeechFetchExpectingFormat("wav");
|
||||
|
||||
@@ -71,7 +71,7 @@ function isGroqSpeechBaseUrl(baseUrl: string): boolean {
|
||||
|
||||
function resolveSpeechResponseFormat(
|
||||
baseUrl: string,
|
||||
target: "audio-file" | "voice-note",
|
||||
target: "audio-file" | "voice-note" | "telephony",
|
||||
configuredFormat?: OpenAiSpeechResponseFormat,
|
||||
): OpenAiSpeechResponseFormat {
|
||||
if (configuredFormat) {
|
||||
@@ -145,6 +145,37 @@ function readOpenAIOverrides(
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Renders persona prompt fields as a newline-separated instruction string,
 * one `Name: value` line per field.
 *
 * Each value is trimmed before its line is built, and fields that are empty
 * or whitespace-only are skipped. This prevents dangling lines such as
 * `Profile:` or `Constraint:` that carry a label but no content.
 *
 * Line order is fixed: Persona, Profile, Scene, Style, Accent, Pacing,
 * Sample context, then one `Constraint` line per constraint.
 *
 * @param req.label - Display name of the persona, if any.
 * @param req.prompt - Persona prompt fields; when absent nothing is rendered.
 * @returns The instruction text, or `undefined` when there is no prompt or no
 *   field has content.
 */
function renderOpenAITtsPersonaInstructions(req: {
  label?: string;
  prompt?: {
    profile?: string;
    scene?: string;
    sampleContext?: string;
    style?: string;
    accent?: string;
    pacing?: string;
    constraints?: string[];
  };
}): string | undefined {
  const prompt = req.prompt;
  if (!prompt) {
    return undefined;
  }

  // Reduce a raw field to its trimmed content, or undefined when nothing is left.
  const clean = (value: string | undefined): string | undefined => {
    const trimmed = value?.trim();
    return trimmed ? trimmed : undefined;
  };

  const entries: Array<[string, string | undefined]> = [
    ["Persona", req.label],
    ["Profile", prompt.profile],
    ["Scene", prompt.scene],
    ["Style", prompt.style],
    ["Accent", prompt.accent],
    ["Pacing", prompt.pacing],
    ["Sample context", prompt.sampleContext],
    ...(prompt.constraints ?? []).map(
      (constraint): [string, string | undefined] => ["Constraint", constraint],
    ),
  ];

  const lines: string[] = [];
  for (const [name, raw] of entries) {
    const value = clean(raw);
    if (value) {
      lines.push(`${name}: ${value}`);
    }
  }
  return lines.length > 0 ? lines.join("\n") : undefined;
}
|
||||
|
||||
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
|
||||
handled: boolean;
|
||||
overrides?: SpeechProviderOverrides;
|
||||
@@ -229,6 +260,23 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
|
||||
listVoices: async () => OPENAI_TTS_VOICES.map((voice) => ({ id: voice, name: voice })),
|
||||
isConfigured: ({ providerConfig }) =>
|
||||
Boolean(readOpenAIProviderConfig(providerConfig).apiKey || process.env.OPENAI_API_KEY),
|
||||
// Derives `instructions` from the persona prompt, but only when the provider
// config does not already carry explicit instructions.
prepareSynthesis: (ctx) => {
  if (readOpenAIProviderConfig(ctx.providerConfig).instructions) {
    return undefined;
  }
  const persona = ctx.persona;
  const instructions = renderOpenAITtsPersonaInstructions({
    label: persona?.label ?? persona?.id,
    prompt: persona?.prompt,
  });
  if (!instructions) {
    return undefined;
  }
  return {
    providerConfig: {
      instructions,
    },
  };
},
|
||||
synthesize: async (req) => {
|
||||
const config = readOpenAIProviderConfig(req.providerConfig);
|
||||
const overrides = readOpenAIOverrides(req.providerOverrides);
|
||||
|
||||
@@ -3,11 +3,13 @@ export {
|
||||
getLastTtsAttempt,
|
||||
getResolvedSpeechProviderConfig,
|
||||
getTtsMaxLength,
|
||||
getTtsPersona,
|
||||
getTtsProvider,
|
||||
isSummarizationEnabled,
|
||||
isTtsEnabled,
|
||||
isTtsProviderConfigured,
|
||||
listSpeechVoices,
|
||||
listTtsPersonas,
|
||||
maybeApplyTtsToPayload,
|
||||
resolveExplicitTtsOverrides,
|
||||
resolveTtsAutoMode,
|
||||
@@ -19,6 +21,7 @@ export {
|
||||
setTtsAutoMode,
|
||||
setTtsEnabled,
|
||||
setTtsMaxLength,
|
||||
setTtsPersona,
|
||||
setTtsProvider,
|
||||
synthesizeSpeech,
|
||||
textToSpeech,
|
||||
|
||||
@@ -1,7 +1,12 @@
|
||||
import { rmSync } from "node:fs";
|
||||
import path from "node:path";
|
||||
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime";
|
||||
import type { SpeechProviderPlugin, SpeechSynthesisRequest } from "openclaw/plugin-sdk/speech-core";
|
||||
import type { ReplyPayload } from "openclaw/plugin-sdk/reply-payload";
|
||||
import type {
|
||||
SpeechProviderPlugin,
|
||||
SpeechProviderPrepareSynthesisContext,
|
||||
SpeechSynthesisRequest,
|
||||
} from "openclaw/plugin-sdk/speech-core";
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
type MockSpeechSynthesisResult = Awaited<ReturnType<SpeechProviderPlugin["synthesize"]>>;
|
||||
@@ -16,6 +21,9 @@ const synthesizeMock = vi.hoisted(() =>
|
||||
}),
|
||||
),
|
||||
);
|
||||
const prepareSynthesisMock = vi.hoisted(() =>
|
||||
vi.fn(async (_ctx: SpeechProviderPrepareSynthesisContext) => undefined),
|
||||
);
|
||||
|
||||
const listSpeechProvidersMock = vi.hoisted(() => vi.fn());
|
||||
const getSpeechProviderMock = vi.hoisted(() => vi.fn());
|
||||
@@ -31,6 +39,7 @@ vi.mock("../api.js", async () => {
|
||||
label: "Mock",
|
||||
autoSelectOrder: 1,
|
||||
isConfigured: () => true,
|
||||
prepareSynthesis: prepareSynthesisMock,
|
||||
synthesize: synthesizeMock,
|
||||
};
|
||||
listSpeechProvidersMock.mockImplementation(() => [mockProvider]);
|
||||
@@ -49,10 +58,40 @@ vi.mock("../api.js", async () => {
|
||||
};
|
||||
});
|
||||
|
||||
const { _test, maybeApplyTtsToPayload, resolveTtsConfig } = await import("./tts.js");
|
||||
const {
|
||||
_test,
|
||||
getTtsPersona,
|
||||
getTtsProvider,
|
||||
maybeApplyTtsToPayload,
|
||||
resolveTtsConfig,
|
||||
synthesizeSpeech,
|
||||
textToSpeechTelephony,
|
||||
} = await import("./tts.js");
|
||||
|
||||
const nativeVoiceNoteChannels = ["discord", "feishu", "matrix", "telegram", "whatsapp"] as const;
|
||||
|
||||
/**
 * Creates a speech provider stub wired to the shared synthesis mocks.
 *
 * @param id - Provider id; also used as the label. `"mock"` gets auto-select order 1, any other id gets 2.
 * @param options - Fields that override the stub defaults.
 * @returns The stub provider with `options` applied last.
 */
function createMockSpeechProvider(
  id = "mock",
  options: Partial<SpeechProviderPlugin> = {},
): SpeechProviderPlugin {
  const autoSelectOrder = id === "mock" ? 1 : 2;
  const alwaysConfigured = () => true;
  return {
    id,
    label: id,
    autoSelectOrder,
    isConfigured: alwaysConfigured,
    prepareSynthesis: prepareSynthesisMock,
    synthesize: synthesizeMock,
    ...options,
  };
}
|
||||
|
||||
/**
 * Points the provider registry mocks at the given provider list.
 *
 * @param providers - Providers returned by the list mock and searched by the lookup mock.
 */
function installSpeechProviders(providers: SpeechProviderPlugin[]): void {
  // Lookup returns the first provider with a matching id, or null when none matches.
  const lookupById = (providerId: string) => {
    const match = providers.find((candidate) => candidate.id === providerId);
    return match ?? null;
  };
  listSpeechProvidersMock.mockImplementation(() => providers);
  getSpeechProviderMock.mockImplementation(lookupById);
}
|
||||
|
||||
function createTtsConfig(prefsName: string): OpenClawConfig {
|
||||
return {
|
||||
messages: {
|
||||
@@ -102,6 +141,8 @@ async function expectTtsPayloadResult(params: {
|
||||
describe("speech-core native voice-note routing", () => {
|
||||
afterEach(() => {
|
||||
synthesizeMock.mockClear();
|
||||
prepareSynthesisMock.mockClear();
|
||||
installSpeechProviders([createMockSpeechProvider()]);
|
||||
});
|
||||
|
||||
it("keeps native voice-note channel support centralized", () => {
|
||||
@@ -153,6 +194,268 @@ describe("speech-core native voice-note routing", () => {
|
||||
audioAsVoice: undefined,
|
||||
});
|
||||
});
|
||||
|
||||
it("selects persona preferred provider before config fallback", () => {
|
||||
const cfg: OpenClawConfig = {
|
||||
messages: {
|
||||
tts: {
|
||||
enabled: true,
|
||||
provider: "other",
|
||||
persona: "alfred",
|
||||
personas: {
|
||||
alfred: {
|
||||
label: "Alfred",
|
||||
provider: "mock",
|
||||
providers: {
|
||||
mock: {
|
||||
voice: "Algieba",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
const config = resolveTtsConfig(cfg);
|
||||
const prefsPath = "/tmp/openclaw-speech-core-persona-provider.json";
|
||||
|
||||
expect(getTtsPersona(config, prefsPath)?.id).toBe("alfred");
|
||||
expect(getTtsProvider(config, prefsPath)).toBe("mock");
|
||||
});
|
||||
|
||||
it("merges active persona provider binding into synthesis config", async () => {
|
||||
const cfg: OpenClawConfig = {
|
||||
messages: {
|
||||
tts: {
|
||||
enabled: true,
|
||||
provider: "mock",
|
||||
prefsPath: "/tmp/openclaw-speech-core-persona-merge.json",
|
||||
providers: {
|
||||
mock: {
|
||||
model: "base-model",
|
||||
voice: "base-voice",
|
||||
},
|
||||
},
|
||||
persona: "alfred",
|
||||
personas: {
|
||||
alfred: {
|
||||
provider: "mock",
|
||||
providers: {
|
||||
mock: {
|
||||
voice: "persona-voice",
|
||||
style: "dry",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const payload: ReplyPayload = {
|
||||
text: "This reply should use persona-specific provider configuration.",
|
||||
};
|
||||
|
||||
let mediaDir: string | undefined;
|
||||
try {
|
||||
const result = await maybeApplyTtsToPayload({
|
||||
payload,
|
||||
cfg,
|
||||
channel: "slack",
|
||||
kind: "final",
|
||||
});
|
||||
|
||||
expect(synthesizeMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
providerConfig: expect.objectContaining({
|
||||
model: "base-model",
|
||||
voice: "persona-voice",
|
||||
style: "dry",
|
||||
}),
|
||||
}),
|
||||
);
|
||||
expect(result.mediaUrl).toMatch(/voice-\d+\.ogg$/);
|
||||
|
||||
mediaDir = result.mediaUrl ? path.dirname(result.mediaUrl) : undefined;
|
||||
} finally {
|
||||
if (mediaDir) {
|
||||
rmSync(mediaDir, { recursive: true, force: true });
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
it("does not mark skipped unregistered providers as missing persona bindings", async () => {
|
||||
const result = await synthesizeSpeech({
|
||||
text: "Use fallback provider.",
|
||||
cfg: {
|
||||
messages: {
|
||||
tts: {
|
||||
enabled: true,
|
||||
provider: "missing",
|
||||
persona: "alfred",
|
||||
personas: {
|
||||
alfred: {
|
||||
providers: {
|
||||
missing: {
|
||||
voice: "configured-but-unregistered",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
expect(result.success).toBe(true);
|
||||
expect(result.attempts?.[0]).toMatchObject({
|
||||
provider: "missing",
|
||||
outcome: "skipped",
|
||||
reasonCode: "no_provider_registered",
|
||||
persona: "alfred",
|
||||
});
|
||||
expect(result.attempts?.[0]).not.toHaveProperty("personaBinding");
|
||||
});
|
||||
|
||||
it("does not mark skipped telephony providers as missing persona bindings", async () => {
|
||||
const result = await textToSpeechTelephony({
|
||||
text: "Use telephony provider.",
|
||||
cfg: {
|
||||
messages: {
|
||||
tts: {
|
||||
enabled: true,
|
||||
provider: "mock",
|
||||
persona: "alfred",
|
||||
personas: {
|
||||
alfred: {
|
||||
providers: {
|
||||
mock: {
|
||||
voice: "persona-voice",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
expect(result.success).toBe(false);
|
||||
expect(result.attempts?.[0]).toMatchObject({
|
||||
provider: "mock",
|
||||
outcome: "skipped",
|
||||
reasonCode: "unsupported_for_telephony",
|
||||
persona: "alfred",
|
||||
});
|
||||
expect(result.attempts?.[0]).not.toHaveProperty("personaBinding");
|
||||
});
|
||||
|
||||
it("uses provider defaults when fallback policy allows missing persona bindings", async () => {
|
||||
await synthesizeSpeech({
|
||||
text: "Use neutral provider defaults.",
|
||||
cfg: {
|
||||
messages: {
|
||||
tts: {
|
||||
enabled: true,
|
||||
provider: "mock",
|
||||
persona: "alfred",
|
||||
personas: {
|
||||
alfred: {
|
||||
fallbackPolicy: "provider-defaults",
|
||||
prompt: {
|
||||
profile: "A precise butler.",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
expect(prepareSynthesisMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
persona: undefined,
|
||||
personaProviderConfig: undefined,
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("preserves persona prompts by default when provider bindings are missing", async () => {
|
||||
await synthesizeSpeech({
|
||||
text: "Use persona prompt.",
|
||||
cfg: {
|
||||
messages: {
|
||||
tts: {
|
||||
enabled: true,
|
||||
provider: "mock",
|
||||
persona: "alfred",
|
||||
personas: {
|
||||
alfred: {
|
||||
prompt: {
|
||||
profile: "A precise butler.",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
expect(prepareSynthesisMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
persona: expect.objectContaining({ id: "alfred" }),
|
||||
personaProviderConfig: undefined,
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("skips unbound providers under fail policy while allowing bound fallbacks", async () => {
|
||||
installSpeechProviders([
|
||||
createMockSpeechProvider("mock", { autoSelectOrder: 1 }),
|
||||
createMockSpeechProvider("fallback", { autoSelectOrder: 2 }),
|
||||
]);
|
||||
|
||||
const result = await synthesizeSpeech({
|
||||
text: "Use the first persona-bound provider.",
|
||||
cfg: {
|
||||
messages: {
|
||||
tts: {
|
||||
enabled: true,
|
||||
provider: "mock",
|
||||
persona: "alfred",
|
||||
personas: {
|
||||
alfred: {
|
||||
fallbackPolicy: "fail",
|
||||
providers: {
|
||||
fallback: {
|
||||
voice: "fallback-voice",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
expect(result.success).toBe(true);
|
||||
expect(result.provider).toBe("fallback");
|
||||
expect(result.fallbackFrom).toBe("mock");
|
||||
expect(result.attempts?.[0]).toMatchObject({
|
||||
provider: "mock",
|
||||
outcome: "skipped",
|
||||
reasonCode: "not_configured",
|
||||
persona: "alfred",
|
||||
personaBinding: "missing",
|
||||
error: "mock: persona alfred has no provider binding",
|
||||
});
|
||||
expect(result.attempts?.[1]).toMatchObject({
|
||||
provider: "fallback",
|
||||
outcome: "success",
|
||||
persona: "alfred",
|
||||
personaBinding: "applied",
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("speech-core per-agent TTS config", () => {
|
||||
|
||||
@@ -12,6 +12,7 @@ import path from "node:path";
|
||||
import { normalizeChannelId, type ChannelId } from "openclaw/plugin-sdk/channel-targets";
|
||||
import type {
|
||||
OpenClawConfig,
|
||||
ResolvedTtsPersona,
|
||||
TtsAutoMode,
|
||||
TtsConfig,
|
||||
TtsModelOverrideConfig,
|
||||
@@ -40,6 +41,7 @@ import {
|
||||
normalizeSpeechProviderId,
|
||||
normalizeTtsAutoMode,
|
||||
parseTtsDirectives,
|
||||
resolveEffectiveTtsConfig,
|
||||
type ResolvedTtsConfig,
|
||||
type ResolvedTtsModelOverrides,
|
||||
scheduleCleanup,
|
||||
@@ -62,13 +64,13 @@ const DEFAULT_TIMEOUT_MS = 30_000;
|
||||
const DEFAULT_TTS_MAX_LENGTH = 1500;
|
||||
const DEFAULT_TTS_SUMMARIZE = true;
|
||||
const DEFAULT_MAX_TEXT_LENGTH = 4096;
|
||||
const BLOCKED_MERGE_KEYS = new Set(["__proto__", "prototype", "constructor"]);
|
||||
|
||||
type TtsUserPrefs = {
|
||||
tts?: {
|
||||
auto?: TtsAutoMode;
|
||||
enabled?: boolean;
|
||||
provider?: TtsProvider;
|
||||
persona?: string | null;
|
||||
maxLength?: number;
|
||||
summarize?: boolean;
|
||||
};
|
||||
@@ -86,6 +88,8 @@ export type TtsProviderAttempt = {
|
||||
provider: string;
|
||||
outcome: "success" | "skipped" | "failed";
|
||||
reasonCode: TtsAttemptReasonCode;
|
||||
persona?: string;
|
||||
personaBinding?: "applied" | "missing" | "none";
|
||||
latencyMs?: number;
|
||||
error?: string;
|
||||
};
|
||||
@@ -96,6 +100,7 @@ export type TtsResult = {
|
||||
error?: string;
|
||||
latencyMs?: number;
|
||||
provider?: string;
|
||||
persona?: string;
|
||||
fallbackFrom?: string;
|
||||
attemptedProviders?: string[];
|
||||
attempts?: TtsProviderAttempt[];
|
||||
@@ -111,6 +116,7 @@ export type TtsSynthesisResult = {
|
||||
error?: string;
|
||||
latencyMs?: number;
|
||||
provider?: string;
|
||||
persona?: string;
|
||||
fallbackFrom?: string;
|
||||
attemptedProviders?: string[];
|
||||
attempts?: TtsProviderAttempt[];
|
||||
@@ -126,6 +132,7 @@ export type TtsTelephonyResult = {
|
||||
error?: string;
|
||||
latencyMs?: number;
|
||||
provider?: string;
|
||||
persona?: string;
|
||||
fallbackFrom?: string;
|
||||
attemptedProviders?: string[];
|
||||
attempts?: TtsProviderAttempt[];
|
||||
@@ -139,6 +146,7 @@ type TtsStatusEntry = {
|
||||
textLength: number;
|
||||
summarized: boolean;
|
||||
provider?: string;
|
||||
persona?: string;
|
||||
fallbackFrom?: string;
|
||||
attemptedProviders?: string[];
|
||||
attempts?: TtsProviderAttempt[];
|
||||
@@ -162,6 +170,10 @@ function normalizeConfiguredSpeechProviderId(
|
||||
return normalized === "edge" ? "microsoft" : normalized;
|
||||
}
|
||||
|
||||
/**
 * Normalizes a persona id through `normalizeOptionalLowercaseString`.
 *
 * @param personaId - Raw persona id; `null` is treated the same as `undefined`.
 * @returns Whatever the shared normalizer returns for the id.
 */
function normalizeTtsPersonaId(personaId: string | null | undefined): string | undefined {
  const candidate = personaId === null ? undefined : personaId;
  return normalizeOptionalLowercaseString(candidate);
}
|
||||
|
||||
function resolveTtsPrefsPathValue(prefsPath: string | undefined): string {
|
||||
if (prefsPath?.trim()) {
|
||||
return resolveUserPath(prefsPath.trim());
|
||||
@@ -229,6 +241,87 @@ function asProviderConfigMap(value: unknown): Record<string, unknown> {
|
||||
: {};
|
||||
}
|
||||
|
||||
/**
 * Reports whether `key` is an own (non-inherited) property of `value`.
 *
 * Calls the `Object.prototype` method directly, so it also works for objects
 * that shadow `hasOwnProperty` or have no prototype.
 */
function hasOwnProperty(value: object, key: string): boolean {
  const ownPropertyCheck = Object.prototype.hasOwnProperty;
  return ownPropertyCheck.call(value, key);
}
|
||||
|
||||
/**
 * Re-keys a raw provider-config map by normalized provider id.
 *
 * @param value - Untrusted value expected to be a map of provider id to config.
 * @returns The re-keyed map, or `undefined` when the input has no entries.
 */
function normalizeProviderConfigMap(
  value: unknown,
): Record<string, SpeechProviderConfig> | undefined {
  const rawEntries = Object.entries(asProviderConfigMap(value));
  if (rawEntries.length === 0) {
    return undefined;
  }
  const normalizedMap: Record<string, SpeechProviderConfig> = {};
  rawEntries.forEach(([rawId, rawConfig]) => {
    // Keep the raw id when it cannot be normalized.
    const key = normalizeConfiguredSpeechProviderId(rawId) ?? rawId;
    normalizedMap[key] = asProviderConfig(rawConfig);
  });
  return normalizedMap;
}
|
||||
|
||||
/**
 * Builds the resolved persona table from the raw TTS config.
 *
 * Entries whose id does not normalize, or whose value is not a non-array
 * object, are dropped. Each kept persona gets its normalized id, a normalized
 * preferred provider, and a provider-binding map keyed by normalized provider id.
 *
 * @param raw - Raw TTS config whose `personas` field is read.
 * @returns Personas keyed by normalized id.
 */
function collectTtsPersonas(raw: TtsConfig): Record<string, ResolvedTtsPersona> {
  const collected: Record<string, ResolvedTtsPersona> = {};
  for (const [rawId, entry] of Object.entries(asProviderConfigMap(raw.personas))) {
    const personaId = normalizeTtsPersonaId(rawId);
    const isPersonaObject = typeof entry === "object" && entry !== null && !Array.isArray(entry);
    if (!personaId || !isPersonaObject) {
      continue;
    }
    const definition = entry as Omit<ResolvedTtsPersona, "id">;
    const preferredProvider =
      normalizeConfiguredSpeechProviderId(definition.provider) ?? definition.provider;
    const bindings = normalizeProviderConfigMap(definition.providers);
    collected[personaId] = {
      ...definition,
      id: personaId,
      provider: preferredProvider,
      providers: bindings,
    };
  }
  return collected;
}
|
||||
|
||||
/**
 * Finds the persona's binding for a provider.
 *
 * The normalized provider id is tried first, then the id exactly as given.
 * Only own properties of the binding map are considered.
 *
 * @param persona - Active persona, if any.
 * @param providerId - Provider id to look up.
 * @returns The provider binding, or `undefined` when the persona has none.
 */
function resolvePersonaProviderConfig(
  persona: ResolvedTtsPersona | undefined,
  providerId: string,
): SpeechProviderConfig | undefined {
  const bindings = persona?.providers;
  if (!bindings) {
    return undefined;
  }
  const canonicalId = normalizeConfiguredSpeechProviderId(providerId) ?? providerId;
  const matchedKey = [canonicalId, providerId].find((key) => hasOwnProperty(bindings, key));
  return matchedKey === undefined ? undefined : bindings[matchedKey];
}
|
||||
|
||||
/**
 * Layers the persona's provider binding over the base provider config.
 *
 * @returns The effective config plus a binding status:
 *   - `"none"`: no persona is active; the base config is returned unchanged.
 *   - `"missing"`: the persona has no binding for this provider; the base config is returned unchanged.
 *   - `"applied"`: binding fields override base fields (shallow merge), and the binding is also returned separately.
 */
function mergeProviderConfigWithPersona(params: {
  providerConfig: SpeechProviderConfig;
  persona?: ResolvedTtsPersona;
  providerId: string;
}): {
  providerConfig: SpeechProviderConfig;
  personaProviderConfig?: SpeechProviderConfig;
  personaBinding: "applied" | "missing" | "none";
} {
  const { providerConfig, persona, providerId } = params;
  if (!persona) {
    return { providerConfig, personaBinding: "none" };
  }
  const binding = resolvePersonaProviderConfig(persona, providerId);
  return binding
    ? {
        providerConfig: { ...providerConfig, ...binding },
        personaProviderConfig: binding,
        personaBinding: "applied",
      }
    : { providerConfig, personaBinding: "missing" };
}
|
||||
|
||||
function resolveRawProviderConfig(
|
||||
raw: TtsConfig | undefined,
|
||||
providerId: string,
|
||||
@@ -241,48 +334,6 @@ function resolveRawProviderConfig(
|
||||
return asProviderConfig(direct);
|
||||
}
|
||||
|
||||
/**
 * Type guard for truthy, non-array values whose `typeof` is `"object"`.
 * No prototype check is made, so class instances also pass.
 */
function isPlainObject(value: unknown): value is Record<string, unknown> {
  if (!value) {
    return false;
  }
  return typeof value === "object" && !Array.isArray(value);
}
|
||||
|
||||
/**
 * Recursively merges `override` into a shallow copy of `base`.
 *
 * When either side is not a plain object the override wins, unless it is
 * `undefined`, in which case the base is kept. Within objects, `undefined`
 * override values and keys listed in `BLOCKED_MERGE_KEYS` are skipped.
 *
 * @returns The merged value; `base` itself is not mutated at the top level.
 */
function deepMergeDefined(base: unknown, override: unknown): unknown {
  if (!(isPlainObject(base) && isPlainObject(override))) {
    return override === undefined ? base : override;
  }

  const merged: Record<string, unknown> = { ...base };
  Object.entries(override)
    .filter(([key, value]) => value !== undefined && !BLOCKED_MERGE_KEYS.has(key))
    .forEach(([key, value]) => {
      // Recurse only when the key is already reachable on the accumulated result.
      merged[key] = key in merged ? deepMergeDefined(merged[key], value) : value;
    });
  return merged;
}
|
||||
|
||||
/** Normalizes an agent id for comparison by delegating to `normalizeLowercaseStringOrEmpty`. */
function normalizeAgentConfigId(value: string | undefined | null): string {
  const normalizedId = normalizeLowercaseStringOrEmpty(value);
  return normalizedId;
}
|
||||
|
||||
/**
 * Looks up the per-agent TTS override for `agentId`.
 *
 * @returns The `tts` config of the first agent whose normalized id matches, or
 *   `undefined` when no agent id is given, the agent list is not an array, or
 *   no agent matches.
 */
function resolveAgentTtsOverride(
  cfg: OpenClawConfig,
  agentId: string | undefined,
): TtsConfig | undefined {
  const agents = cfg.agents?.list;
  if (!agentId || !Array.isArray(agents)) {
    return undefined;
  }
  const targetId = normalizeAgentConfigId(agentId);
  for (const entry of agents) {
    if (normalizeAgentConfigId(entry.id) === targetId) {
      return entry.tts;
    }
  }
  return undefined;
}
|
||||
|
||||
/**
 * Produces the raw TTS config for an agent by deep-merging the agent's
 * override (if any) over the global `messages.tts` config.
 */
function resolveEffectiveTtsRawConfig(cfg: OpenClawConfig, agentId?: string): TtsConfig {
  const globalTts = cfg.messages?.tts ?? {};
  const agentTts = resolveAgentTtsOverride(cfg, agentId) ?? {};
  return deepMergeDefined(globalTts, agentTts) as TtsConfig;
}
|
||||
|
||||
function resolveLazyProviderConfig(
|
||||
config: ResolvedTtsConfig,
|
||||
providerId: string,
|
||||
@@ -325,6 +376,8 @@ function collectDirectProviderConfigEntries(raw: TtsConfig): Record<string, Spee
|
||||
"maxTextLength",
|
||||
"mode",
|
||||
"modelOverrides",
|
||||
"persona",
|
||||
"personas",
|
||||
"prefsPath",
|
||||
"provider",
|
||||
"providers",
|
||||
@@ -357,10 +410,11 @@ export function getResolvedSpeechProviderConfig(
|
||||
}
|
||||
|
||||
export function resolveTtsConfig(cfg: OpenClawConfig, agentId?: string): ResolvedTtsConfig {
|
||||
const raw: TtsConfig = resolveEffectiveTtsRawConfig(cfg, agentId);
|
||||
const raw: TtsConfig = resolveEffectiveTtsConfig(cfg, agentId);
|
||||
const providerSource = raw.provider ? "config" : "default";
|
||||
const timeoutMs = raw.timeoutMs ?? DEFAULT_TIMEOUT_MS;
|
||||
const auto = resolveConfiguredTtsAutoMode(raw);
|
||||
const persona = normalizeTtsPersonaId(raw.persona);
|
||||
return {
|
||||
auto,
|
||||
mode: raw.mode ?? "final",
|
||||
@@ -368,6 +422,8 @@ export function resolveTtsConfig(cfg: OpenClawConfig, agentId?: string): Resolve
|
||||
normalizeConfiguredSpeechProviderId(raw.provider) ??
|
||||
(providerSource === "config" ? (normalizeOptionalLowercaseString(raw.provider) ?? "") : ""),
|
||||
providerSource,
|
||||
persona,
|
||||
personas: collectTtsPersonas(raw),
|
||||
summaryModel: normalizeOptionalString(raw.summaryModel),
|
||||
modelOverrides: resolveModelOverridePolicy(raw.modelOverrides),
|
||||
providerConfigs: collectDirectProviderConfigEntries(raw),
|
||||
@@ -418,7 +474,7 @@ function resolveEffectiveTtsAutoState(params: {
|
||||
autoMode: TtsAutoMode;
|
||||
prefsPath: string;
|
||||
} {
|
||||
const raw: TtsConfig = resolveEffectiveTtsRawConfig(params.cfg, params.agentId);
|
||||
const raw: TtsConfig = resolveEffectiveTtsConfig(params.cfg, params.agentId);
|
||||
const prefsPath = resolveTtsPrefsPathValue(raw.prefsPath);
|
||||
const sessionAuto = normalizeTtsAutoMode(params.sessionAuto);
|
||||
if (sessionAuto) {
|
||||
@@ -443,6 +499,7 @@ export function buildTtsSystemPromptHint(
|
||||
return undefined;
|
||||
}
|
||||
const _config = resolveTtsConfig(cfg, agentId);
|
||||
const persona = getTtsPersona(_config, prefsPath);
|
||||
const maxLength = getTtsMaxLength(prefsPath);
|
||||
const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off";
|
||||
const autoHint =
|
||||
@@ -454,6 +511,9 @@ export function buildTtsSystemPromptHint(
|
||||
return [
|
||||
"Voice (TTS) is enabled.",
|
||||
autoHint,
|
||||
persona
|
||||
? `Active TTS persona: ${persona.label ?? persona.id}${persona.description ? ` - ${persona.description}` : ""}.`
|
||||
: undefined,
|
||||
`Keep spoken text ≤${maxLength} chars to avoid auto-summary (summary ${summarize}).`,
|
||||
"Use [[tts:...]] and optional [[tts:text]]...[[/tts:text]] to control voice/expressiveness.",
|
||||
]
|
||||
@@ -523,6 +583,13 @@ export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): Tt
|
||||
if (prefsProvider) {
|
||||
return prefsProvider;
|
||||
}
|
||||
const activePersona = resolveTtsPersonaFromPrefs(config, prefs);
|
||||
const personaProvider =
|
||||
canonicalizeSpeechProviderId(activePersona?.provider, config.sourceConfig) ??
|
||||
normalizeConfiguredSpeechProviderId(activePersona?.provider);
|
||||
if (personaProvider && getSpeechProvider(personaProvider, config.sourceConfig)) {
|
||||
return personaProvider;
|
||||
}
|
||||
if (config.providerSource === "config") {
|
||||
return normalizeConfiguredSpeechProviderId(config.provider) ?? config.provider;
|
||||
}
|
||||
@@ -542,6 +609,38 @@ export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): Tt
|
||||
return config.provider;
|
||||
}
|
||||
|
||||
/**
 * Resolves the active persona from user prefs, falling back to config.
 *
 * If the prefs object has its own `persona` key, that value decides the
 * result, even when it is `null` (which yields no persona). Otherwise the
 * configured `persona` id is used.
 *
 * The persona table is consulted with an own-property check, so ids such as
 * `"constructor"` or `"__proto__"` that are not defined personas resolve to
 * `undefined` instead of an inherited `Object.prototype` member.
 *
 * @param config - Resolved TTS config holding the default persona id and the persona table.
 * @param prefs - User preferences as read from the prefs file.
 * @returns The matching persona, or `undefined` when none is selected or defined.
 */
function resolveTtsPersonaFromPrefs(
  config: ResolvedTtsConfig,
  prefs: TtsUserPrefs,
): ResolvedTtsPersona | undefined {
  const selectedId =
    prefs.tts && hasOwnProperty(prefs.tts, "persona")
      ? normalizeTtsPersonaId(prefs.tts.persona)
      : normalizeTtsPersonaId(config.persona);
  if (!selectedId || !hasOwnProperty(config.personas, selectedId)) {
    return undefined;
  }
  return config.personas[selectedId];
}
|
||||
|
||||
/**
 * Returns the active TTS persona for the prefs file at `prefsPath`.
 *
 * @param config - Resolved TTS config.
 * @param prefsPath - Path of the prefs file to read.
 * @returns The persona selected by prefs or config, if any.
 */
export function getTtsPersona(
  config: ResolvedTtsConfig,
  prefsPath: string,
): ResolvedTtsPersona | undefined {
  const prefs = readPrefs(prefsPath);
  return resolveTtsPersonaFromPrefs(config, prefs);
}
|
||||
|
||||
/**
 * Lists all configured personas ordered by id using `localeCompare`.
 *
 * @returns A new array; the persona table in `config` is not modified.
 */
export function listTtsPersonas(config: ResolvedTtsConfig): ResolvedTtsPersona[] {
  const compareById = (left: ResolvedTtsPersona, right: ResolvedTtsPersona): number =>
    left.id.localeCompare(right.id);
  const personas = Object.values(config.personas);
  return personas.toSorted(compareById);
}
|
||||
|
||||
/**
 * Stores the persona selection in the prefs file.
 *
 * @param prefsPath - Path of the prefs file to update.
 * @param persona - Persona id to select. A value that normalizes to nothing is
 *   stored as an explicit `null` rather than removing the key.
 */
export function setTtsPersona(prefsPath: string, persona: string | null | undefined): void {
  updatePrefs(prefsPath, (prefs) => {
    prefs.tts = {
      ...prefs.tts,
      persona: normalizeTtsPersonaId(persona) ?? null,
    };
  });
}
|
||||
|
||||
export function setTtsProvider(prefsPath: string, provider: TtsProvider): void {
|
||||
updatePrefs(prefsPath, (prefs) => {
|
||||
prefs.tts = { ...prefs.tts, provider: canonicalizeSpeechProviderId(provider) ?? provider };
|
||||
@@ -714,17 +813,20 @@ function buildTtsFailureResult(
|
||||
errors: string[],
|
||||
attemptedProviders?: string[],
|
||||
attempts?: TtsProviderAttempt[],
|
||||
persona?: string,
|
||||
): {
|
||||
success: false;
|
||||
error: string;
|
||||
attemptedProviders?: string[];
|
||||
attempts?: TtsProviderAttempt[];
|
||||
persona?: string;
|
||||
} {
|
||||
return {
|
||||
success: false,
|
||||
error: `TTS conversion failed: ${errors.join("; ") || "no providers available"}`,
|
||||
attemptedProviders,
|
||||
attempts,
|
||||
persona,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -733,17 +835,22 @@ type TtsProviderReadyResolution =
|
||||
kind: "ready";
|
||||
provider: NonNullable<ReturnType<typeof getSpeechProvider>>;
|
||||
providerConfig: SpeechProviderConfig;
|
||||
personaProviderConfig?: SpeechProviderConfig;
|
||||
synthesisPersona?: ResolvedTtsPersona;
|
||||
personaBinding: "applied" | "missing" | "none";
|
||||
}
|
||||
| {
|
||||
kind: "skip";
|
||||
reasonCode: "no_provider_registered" | "not_configured" | "unsupported_for_telephony";
|
||||
message: string;
|
||||
personaBinding?: "missing";
|
||||
};
|
||||
|
||||
function resolveReadySpeechProvider(params: {
|
||||
provider: TtsProvider;
|
||||
cfg: OpenClawConfig;
|
||||
config: ResolvedTtsConfig;
|
||||
persona?: ResolvedTtsPersona;
|
||||
requireTelephony?: boolean;
|
||||
}): TtsProviderReadyResolution {
|
||||
const resolvedProvider = getSpeechProvider(params.provider, params.cfg);
|
||||
@@ -759,10 +866,23 @@ function resolveReadySpeechProvider(params: {
|
||||
resolvedProvider.id,
|
||||
params.cfg,
|
||||
);
|
||||
const merged = mergeProviderConfigWithPersona({
|
||||
providerConfig,
|
||||
persona: params.persona,
|
||||
providerId: resolvedProvider.id,
|
||||
});
|
||||
if (params.persona?.fallbackPolicy === "fail" && merged.personaBinding === "missing") {
|
||||
return {
|
||||
kind: "skip",
|
||||
reasonCode: "not_configured",
|
||||
message: `${params.provider}: persona ${params.persona.id} has no provider binding`,
|
||||
personaBinding: "missing",
|
||||
};
|
||||
}
|
||||
if (
|
||||
!resolvedProvider.isConfigured({
|
||||
cfg: params.cfg,
|
||||
providerConfig,
|
||||
providerConfig: merged.providerConfig,
|
||||
timeoutMs: params.config.timeoutMs,
|
||||
})
|
||||
) {
|
||||
@@ -782,7 +902,56 @@ function resolveReadySpeechProvider(params: {
|
||||
return {
|
||||
kind: "ready",
|
||||
provider: resolvedProvider,
|
||||
providerConfig,
|
||||
providerConfig: merged.providerConfig,
|
||||
personaProviderConfig: merged.personaProviderConfig,
|
||||
synthesisPersona:
|
||||
params.persona?.fallbackPolicy === "provider-defaults" && merged.personaBinding === "missing"
|
||||
? undefined
|
||||
: params.persona,
|
||||
personaBinding: merged.personaBinding,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Runs the provider's optional `prepareSynthesis` hook and folds its result
 * into the synthesis request.
 *
 * When the provider has no hook, or the hook returns nothing, the incoming
 * text, config and overrides are passed through unchanged. Otherwise the
 * returned text replaces the input text, and any returned `providerConfig` or
 * `providerOverrides` is shallow-merged over the incoming one.
 *
 * @returns The text, provider config and provider overrides to use for synthesis.
 */
async function prepareSpeechSynthesis(params: {
  provider: NonNullable<ReturnType<typeof getSpeechProvider>>;
  text: string;
  cfg: OpenClawConfig;
  providerConfig: SpeechProviderConfig;
  providerOverrides?: SpeechProviderOverrides;
  persona?: ResolvedTtsPersona;
  personaProviderConfig?: SpeechProviderConfig;
  target: "audio-file" | "voice-note" | "telephony";
  timeoutMs: number;
}): Promise<{
  text: string;
  providerConfig: SpeechProviderConfig;
  providerOverrides?: SpeechProviderOverrides;
}> {
  const { provider, text, providerConfig, providerOverrides } = params;
  const passthrough = { text, providerConfig, providerOverrides };
  if (!provider.prepareSynthesis) {
    return passthrough;
  }

  const prepared = await provider.prepareSynthesis({
    text,
    cfg: params.cfg,
    providerConfig,
    providerOverrides,
    persona: params.persona,
    personaProviderConfig: params.personaProviderConfig,
    target: params.target,
    timeoutMs: params.timeoutMs,
  });
  if (!prepared) {
    return passthrough;
  }

  const mergedConfig = prepared.providerConfig
    ? { ...providerConfig, ...prepared.providerConfig }
    : providerConfig;
  const mergedOverrides = prepared.providerOverrides
    ? { ...providerOverrides, ...prepared.providerOverrides }
    : providerOverrides;
  return {
    text: prepared.text ?? text,
    providerConfig: mergedConfig,
    providerOverrides: mergedOverrides,
  };
}
|
||||
|
||||
@@ -796,6 +965,7 @@ function resolveTtsRequestSetup(params: {
|
||||
}):
|
||||
| {
|
||||
config: ResolvedTtsConfig;
|
||||
persona?: ResolvedTtsPersona;
|
||||
providers: TtsProvider[];
|
||||
}
|
||||
| {
|
||||
@@ -814,6 +984,7 @@ function resolveTtsRequestSetup(params: {
|
||||
canonicalizeSpeechProviderId(params.providerOverride, params.cfg) ?? userProvider;
|
||||
return {
|
||||
config,
|
||||
persona: getTtsPersona(config, prefsPath),
|
||||
providers: params.disableFallback ? [provider] : resolveTtsProviderOrder(provider, params.cfg),
|
||||
};
|
||||
}
|
||||
@@ -833,6 +1004,7 @@ export async function textToSpeech(params: {
|
||||
return {
|
||||
success: false,
|
||||
error: synthesis.error ?? "TTS conversion failed",
|
||||
persona: synthesis.persona,
|
||||
attemptedProviders: synthesis.attemptedProviders,
|
||||
attempts: synthesis.attempts,
|
||||
};
|
||||
@@ -850,6 +1022,7 @@ export async function textToSpeech(params: {
|
||||
audioPath,
|
||||
latencyMs: synthesis.latencyMs,
|
||||
provider: synthesis.provider,
|
||||
persona: synthesis.persona,
|
||||
fallbackFrom: synthesis.fallbackFrom,
|
||||
attemptedProviders: synthesis.attemptedProviders,
|
||||
attempts: synthesis.attempts,
|
||||
@@ -886,7 +1059,7 @@ export async function synthesizeSpeech(params: {
|
||||
return { success: false, error: setup.error };
|
||||
}
|
||||
|
||||
const { config, providers } = setup;
|
||||
const { config, persona, providers } = setup;
|
||||
const timeoutMs = params.timeoutMs ?? config.timeoutMs;
|
||||
const target = supportsNativeVoiceNoteTts(params.channel) ? "voice-note" : "audio-file";
|
||||
|
||||
@@ -906,6 +1079,7 @@ export async function synthesizeSpeech(params: {
|
||||
provider,
|
||||
cfg: params.cfg,
|
||||
config,
|
||||
persona,
|
||||
});
|
||||
if (resolvedProvider.kind === "skip") {
|
||||
errors.push(resolvedProvider.message);
|
||||
@@ -913,17 +1087,32 @@ export async function synthesizeSpeech(params: {
|
||||
provider,
|
||||
outcome: "skipped",
|
||||
reasonCode: resolvedProvider.reasonCode,
|
||||
persona: persona?.id,
|
||||
...(resolvedProvider.personaBinding
|
||||
? { personaBinding: resolvedProvider.personaBinding }
|
||||
: {}),
|
||||
error: resolvedProvider.message,
|
||||
});
|
||||
logVerbose(`TTS: provider ${provider} skipped (${resolvedProvider.message})`);
|
||||
continue;
|
||||
}
|
||||
const synthesis = await resolvedProvider.provider.synthesize({
|
||||
const prepared = await prepareSpeechSynthesis({
|
||||
provider: resolvedProvider.provider,
|
||||
text: params.text,
|
||||
cfg: params.cfg,
|
||||
providerConfig: resolvedProvider.providerConfig,
|
||||
target,
|
||||
providerOverrides: params.overrides?.providerOverrides?.[resolvedProvider.provider.id],
|
||||
persona: resolvedProvider.synthesisPersona,
|
||||
personaProviderConfig: resolvedProvider.personaProviderConfig,
|
||||
target,
|
||||
timeoutMs,
|
||||
});
|
||||
const synthesis = await resolvedProvider.provider.synthesize({
|
||||
text: prepared.text,
|
||||
cfg: params.cfg,
|
||||
providerConfig: prepared.providerConfig,
|
||||
target,
|
||||
providerOverrides: prepared.providerOverrides,
|
||||
timeoutMs,
|
||||
});
|
||||
const latencyMs = Date.now() - providerStart;
|
||||
@@ -931,6 +1120,8 @@ export async function synthesizeSpeech(params: {
|
||||
provider,
|
||||
outcome: "success",
|
||||
reasonCode: "success",
|
||||
persona: persona?.id,
|
||||
personaBinding: resolvedProvider.personaBinding,
|
||||
latencyMs,
|
||||
});
|
||||
return {
|
||||
@@ -938,6 +1129,7 @@ export async function synthesizeSpeech(params: {
|
||||
audioBuffer: synthesis.audioBuffer,
|
||||
latencyMs,
|
||||
provider,
|
||||
persona: persona?.id,
|
||||
fallbackFrom: provider !== primaryProvider ? primaryProvider : undefined,
|
||||
attemptedProviders,
|
||||
attempts,
|
||||
@@ -956,6 +1148,13 @@ export async function synthesizeSpeech(params: {
|
||||
reasonCode:
|
||||
err instanceof Error && err.name === "AbortError" ? "timeout" : "provider_error",
|
||||
latencyMs,
|
||||
persona: persona?.id,
|
||||
personaBinding:
|
||||
resolvePersonaProviderConfig(persona, provider) != null
|
||||
? "applied"
|
||||
: persona
|
||||
? "missing"
|
||||
: "none",
|
||||
error: errorMsg,
|
||||
});
|
||||
const rawError = sanitizeTtsErrorForLog(err);
|
||||
@@ -970,7 +1169,7 @@ export async function synthesizeSpeech(params: {
|
||||
}
|
||||
}
|
||||
|
||||
return buildTtsFailureResult(errors, attemptedProviders, attempts);
|
||||
return buildTtsFailureResult(errors, attemptedProviders, attempts, persona?.id);
|
||||
}
|
||||
|
||||
export async function textToSpeechTelephony(params: {
|
||||
@@ -987,7 +1186,7 @@ export async function textToSpeechTelephony(params: {
|
||||
return { success: false, error: setup.error };
|
||||
}
|
||||
|
||||
const { config, providers } = setup;
|
||||
const { config, persona, providers } = setup;
|
||||
const errors: string[] = [];
|
||||
const attemptedProviders: string[] = [];
|
||||
const attempts: TtsProviderAttempt[] = [];
|
||||
@@ -1004,6 +1203,7 @@ export async function textToSpeechTelephony(params: {
|
||||
provider,
|
||||
cfg: params.cfg,
|
||||
config,
|
||||
persona,
|
||||
requireTelephony: true,
|
||||
});
|
||||
if (resolvedProvider.kind === "skip") {
|
||||
@@ -1012,28 +1212,32 @@ export async function textToSpeechTelephony(params: {
|
||||
provider,
|
||||
outcome: "skipped",
|
||||
reasonCode: resolvedProvider.reasonCode,
|
||||
persona: persona?.id,
|
||||
...(resolvedProvider.personaBinding
|
||||
? { personaBinding: resolvedProvider.personaBinding }
|
||||
: {}),
|
||||
error: resolvedProvider.message,
|
||||
});
|
||||
logVerbose(`TTS telephony: provider ${provider} skipped (${resolvedProvider.message})`);
|
||||
continue;
|
||||
}
|
||||
const synthesizeTelephony = resolvedProvider.provider.synthesizeTelephony;
|
||||
if (!synthesizeTelephony) {
|
||||
const message = `${provider}: unsupported for telephony`;
|
||||
errors.push(message);
|
||||
attempts.push({
|
||||
provider,
|
||||
outcome: "skipped",
|
||||
reasonCode: "unsupported_for_telephony",
|
||||
error: message,
|
||||
});
|
||||
logVerbose(`TTS telephony: provider ${provider} skipped (${message})`);
|
||||
continue;
|
||||
}
|
||||
const synthesis = await synthesizeTelephony({
|
||||
const synthesizeTelephony = resolvedProvider.provider.synthesizeTelephony as NonNullable<
|
||||
typeof resolvedProvider.provider.synthesizeTelephony
|
||||
>;
|
||||
const prepared = await prepareSpeechSynthesis({
|
||||
provider: resolvedProvider.provider,
|
||||
text: params.text,
|
||||
cfg: params.cfg,
|
||||
providerConfig: resolvedProvider.providerConfig,
|
||||
persona: resolvedProvider.synthesisPersona,
|
||||
personaProviderConfig: resolvedProvider.personaProviderConfig,
|
||||
target: "telephony",
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
const synthesis = await synthesizeTelephony({
|
||||
text: prepared.text,
|
||||
cfg: params.cfg,
|
||||
providerConfig: prepared.providerConfig,
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
const latencyMs = Date.now() - providerStart;
|
||||
@@ -1041,6 +1245,8 @@ export async function textToSpeechTelephony(params: {
|
||||
provider,
|
||||
outcome: "success",
|
||||
reasonCode: "success",
|
||||
persona: persona?.id,
|
||||
personaBinding: resolvedProvider.personaBinding,
|
||||
latencyMs,
|
||||
});
|
||||
|
||||
@@ -1049,6 +1255,7 @@ export async function textToSpeechTelephony(params: {
|
||||
audioBuffer: synthesis.audioBuffer,
|
||||
latencyMs,
|
||||
provider,
|
||||
persona: persona?.id,
|
||||
fallbackFrom: provider !== primaryProvider ? primaryProvider : undefined,
|
||||
attemptedProviders,
|
||||
attempts,
|
||||
@@ -1065,6 +1272,13 @@ export async function textToSpeechTelephony(params: {
|
||||
reasonCode:
|
||||
err instanceof Error && err.name === "AbortError" ? "timeout" : "provider_error",
|
||||
latencyMs,
|
||||
persona: persona?.id,
|
||||
personaBinding:
|
||||
resolvePersonaProviderConfig(persona, provider) != null
|
||||
? "applied"
|
||||
: persona
|
||||
? "missing"
|
||||
: "none",
|
||||
error: errorMsg,
|
||||
});
|
||||
const rawError = sanitizeTtsErrorForLog(err);
|
||||
@@ -1079,7 +1293,7 @@ export async function textToSpeechTelephony(params: {
|
||||
}
|
||||
}
|
||||
|
||||
return buildTtsFailureResult(errors, attemptedProviders, attempts);
|
||||
return buildTtsFailureResult(errors, attemptedProviders, attempts, persona?.id);
|
||||
}
|
||||
|
||||
export async function listSpeechVoices(params: {
|
||||
@@ -1250,6 +1464,7 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
textLength: text.length,
|
||||
summarized: wasSummarized,
|
||||
provider: result.provider,
|
||||
persona: result.persona,
|
||||
fallbackFrom: result.fallbackFrom,
|
||||
attemptedProviders: result.attemptedProviders,
|
||||
attempts: result.attempts,
|
||||
@@ -1268,6 +1483,7 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
success: false,
|
||||
textLength: text.length,
|
||||
summarized: wasSummarized,
|
||||
persona: result.persona,
|
||||
attemptedProviders: result.attemptedProviders,
|
||||
attempts: result.attempts,
|
||||
error: result.error,
|
||||
|
||||
@@ -6,6 +6,7 @@ import {
|
||||
type SpeechProviderConfig,
|
||||
type SpeechProviderOverrides,
|
||||
type SpeechProviderPlugin,
|
||||
type SpeechSynthesisTarget,
|
||||
} from "openclaw/plugin-sdk/speech";
|
||||
import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtime";
|
||||
import {
|
||||
@@ -48,7 +49,7 @@ function normalizeXaiSpeechResponseFormat(value: unknown): XaiSpeechResponseForm
|
||||
}
|
||||
|
||||
function resolveSpeechResponseFormat(
|
||||
target: "audio-file" | "voice-note",
|
||||
target: SpeechSynthesisTarget,
|
||||
configuredFormat?: XaiSpeechResponseFormat,
|
||||
): XaiSpeechResponseFormat {
|
||||
if (configuredFormat) {
|
||||
|
||||
Reference in New Issue
Block a user