diff --git a/CHANGELOG.md b/CHANGELOG.md index 4126ebae4b8..46a3a3bb6f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1048,6 +1048,7 @@ Docs: https://docs.openclaw.ai - Anthropic/models: add Claude Opus 4.7 `xhigh` reasoning effort support and keep it separate from adaptive thinking. - Control UI/settings: overhaul the settings and slash-command experience with faster presets, quick-create flows, and refreshed command discovery. (#67819) Thanks @BunsDev. - macOS/gateway: add `screen.snapshot` support for macOS app nodes, including runtime plumbing, default macOS allowlisting, and docs for monitor preview flows. (#67954) Thanks @BunsDev. +- TTS/personas: add provider-aware TTS personas with deterministic provider binding merges, `/tts persona` controls, gateway/CLI persona state, Google Gemini `audio-profile-v1` prompt wrapping, and OpenAI instruction mapping. (#68323) ### Fixes diff --git a/docs/tools/tts.md b/docs/tools/tts.md index 98eafaedcab..12bd04d7069 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -493,6 +493,110 @@ transcoded to raw 16 kHz mono PCM with `ffmpeg`. The legacy provider alias } ``` +### TTS personas + +Use `messages.tts.personas` when you want a stable spoken identity that can be +applied deterministically across providers. A persona can prefer one provider, +define provider-neutral prompt intent, and carry provider-specific bindings for +voices, models, prompt templates, seeds, and voice settings. + +```json5 +{ + messages: { + tts: { + auto: "always", + persona: "alfred", + personas: { + alfred: { + label: "Alfred", + description: "Dry, warm British butler narrator.", + provider: "google", + fallbackPolicy: "preserve-persona", + prompt: { + profile: "A brilliant British butler. Dry, witty, warm, charming, emotionally expressive, never generic.", + scene: "A quiet late-night study. 
Close-mic narration for a trusted operator.", + sampleContext: "The speaker is answering a private technical request with concise confidence and dry warmth.", + style: "Refined, understated, lightly amused.", + accent: "British English.", + pacing: "Measured, with short dramatic pauses.", + constraints: ["Do not read configuration values aloud.", "Do not explain the persona."], + }, + providers: { + google: { + model: "gemini-3.1-flash-tts-preview", + voiceName: "Algieba", + promptTemplate: "audio-profile-v1", + }, + openai: { + model: "gpt-4o-mini-tts", + voice: "cedar", + }, + elevenlabs: { + voiceId: "voice_id", + modelId: "eleven_multilingual_v2", + seed: 42, + voiceSettings: { + stability: 0.65, + similarityBoost: 0.8, + style: 0.25, + useSpeakerBoost: true, + speed: 0.95, + }, + }, + }, + }, + }, + }, + }, +} +``` + +Resolution is deterministic: + +1. `/tts persona <id>` local preference, if set. +2. `messages.tts.persona`, if set. +3. No persona. + +Provider selection is explicit-first: + +1. Direct provider overrides from CLI, gateway, Talk, or allowed TTS directives. +2. `/tts provider <provider>` local preference. +3. Active persona `provider`. +4. `messages.tts.provider`. +5. Registry auto-select. + +For each provider attempt, OpenClaw merges: + +1. `messages.tts.providers.<provider>` +2. `messages.tts.personas.<id>.providers.<provider>` +3. trusted request overrides +4. allowed model-emitted TTS directive overrides + +`fallbackPolicy` controls what happens when an active persona has no binding for +an attempted provider: + +- `preserve-persona` keeps provider-neutral persona prompt fields available to + providers. This is the default. +- `provider-defaults` omits the persona from provider prompt preparation for + that attempt, so the provider uses its neutral defaults while still allowing + fallback to continue. +- `fail` skips that provider attempt with `reasonCode: "not_configured"` and + `personaBinding: "missing"`. 
Fallback providers are still tried; the whole TTS + request fails only if every attempted provider is skipped or fails. + +Persona prompt fields are provider-neutral. Providers decide how to use them. +Google wraps them only when the effective Google provider config sets +`promptTemplate: "audio-profile-v1"` or `personaPrompt`; its older +`audioProfile` and `speakerName` fields are still prepended as Google-specific +prompt text. OpenAI maps prompt fields to `instructions` when no explicit +OpenAI `instructions` value is configured. Providers without prompt-like +controls use the provider-specific persona bindings only. + +Gemini inline audio tags are transcript content, not persona config. If the +assistant or an explicit `[[tts:text]]` block includes tags such as `[whispers]` +or `[laughs]`, OpenClaw preserves them inside the Gemini transcript. OpenClaw +does not generate configured start tags. + ### Disable Microsoft speech ```json5 @@ -565,6 +669,12 @@ Then run: - If `provider` is **unset**, OpenClaw uses the first configured speech provider in registry auto-select order. - Legacy `provider: "edge"` config is repaired by `openclaw doctor --fix` and rewritten to `provider: "microsoft"`. +- `persona`: default TTS persona id from `personas`. +- `personas.<id>`: stable spoken identity. The id is normalized to lowercase. +- `personas.<id>.provider`: preferred speech provider for the persona. Explicit provider overrides and local provider prefs still win. +- `personas.<id>.fallbackPolicy`: `preserve-persona` (default), `provider-defaults`, or `fail`; see [TTS personas](#tts-personas). +- `personas.<id>.prompt`: provider-neutral persona prompt fields (`profile`, `scene`, `sampleContext`, `style`, `accent`, `pacing`, `constraints`). +- `personas.<id>.providers.<provider>`: provider-specific persona binding merged over `providers.<provider>`. - `summaryModel`: optional cheap model for auto-summary; defaults to `agents.defaults.model.primary`. - Accepts `provider/model` or a configured model alias. 
- `modelOverrides`: allow the model to emit TTS directives (on by default). @@ -621,6 +731,8 @@ Then run: - `providers.google.voiceName`: Gemini prebuilt voice name (default `Kore`; `voice` is also accepted). - `providers.google.audioProfile`: natural-language style prompt prepended before the spoken text. - `providers.google.speakerName`: optional speaker label prepended before the spoken text when your TTS prompt uses a named speaker. +- `providers.google.promptTemplate`: set to `audio-profile-v1` to wrap active persona prompt fields in a deterministic Gemini TTS prompt structure. +- `providers.google.personaPrompt`: Google-specific extra persona prompt text appended to the template's Director's Notes. - `providers.google.baseUrl`: override the Gemini API base URL. Only `https://generativelanguage.googleapis.com` is accepted. - If `messages.tts.providers.google.apiKey` is omitted, TTS can reuse `models.providers.google.apiKey` before env fallback. - `providers.gradium.baseUrl`: override Gradium API base URL (default `https://api.gradium.ai`). @@ -750,8 +862,9 @@ Slash commands write local overrides to `prefsPath` (default: Stored fields: -- `enabled` +- `auto` - `provider` +- `persona` - `maxLength` (summary threshold; default 1500 chars) - `summarize` (default `true`) @@ -837,6 +950,7 @@ Discord note: `/tts` is a built-in Discord command, so OpenClaw registers /tts chat default /tts latest /tts provider openai +/tts persona alfred /tts limit 2000 /tts summary off /tts audio Hello from OpenClaw @@ -850,6 +964,7 @@ Notes: - `/tts on` writes the local TTS preference to `always`; `/tts off` writes it to `off`. - `/tts chat on|off|default` writes a session-scoped auto-TTS override for the current chat. - Use config when you want `inbound` or `tagged` defaults. +- `/tts persona <id>` writes the local persona preference; `/tts persona off` clears it. - `limit` and `summary` are stored in local prefs, not the main config. 
- `/tts audio` generates a one-off audio reply (does not toggle TTS on). - `/tts latest` reads the latest assistant reply from the current session transcript and sends it as audio once. It stores only a hash of that reply on the session entry to suppress duplicate voice sends. @@ -883,6 +998,7 @@ Gateway methods: - `tts.disable` - `tts.convert` - `tts.setProvider` +- `tts.setPersona` - `tts.providers` ## Related diff --git a/extensions/google/speech-provider.test.ts b/extensions/google/speech-provider.test.ts index b64cc5bcace..47d952c7ee8 100644 --- a/extensions/google/speech-provider.test.ts +++ b/extensions/google/speech-provider.test.ts @@ -1,5 +1,8 @@ -import * as providerHttp from "openclaw/plugin-sdk/provider-http"; -import { afterEach, describe, expect, it, vi } from "vitest"; +import { afterEach, beforeAll, describe, expect, it, vi } from "vitest"; +import { + getProviderHttpMocks, + installProviderHttpMockCleanup, +} from "../../test/helpers/media-generation/provider-http-mocks.js"; const transcodeAudioBufferToOpusMock = vi.hoisted(() => vi.fn()); @@ -7,10 +10,23 @@ vi.mock("openclaw/plugin-sdk/media-runtime", () => ({ transcodeAudioBufferToOpus: transcodeAudioBufferToOpusMock, })); -import { buildGoogleSpeechProvider, __testing } from "./speech-provider.js"; +const { + assertOkOrThrowProviderErrorMock, + postJsonRequestMock, + resolveProviderHttpRequestConfigMock, +} = getProviderHttpMocks(); -function installGoogleTtsFetchMock(pcm = Buffer.from([1, 0, 2, 0])) { - const fetchMock = vi.fn().mockResolvedValue({ +let buildGoogleSpeechProvider: typeof import("./speech-provider.js").buildGoogleSpeechProvider; +let __testing: typeof import("./speech-provider.js").__testing; + +beforeAll(async () => { + ({ buildGoogleSpeechProvider, __testing } = await import("./speech-provider.js")); +}); + +installProviderHttpMockCleanup(); + +function googleTtsResponse(pcm = Buffer.from([1, 0, 2, 0])) { + return { ok: true, json: async () => ({ candidates: [ @@ -28,21 +44,26 
/**
 * Primes the shared `postJsonRequest` mock so that every call resolves with a
 * response built by `googleTtsResponse(pcm)` and a no-op async `release` hook.
 *
 * @param pcm Bytes forwarded to `googleTtsResponse` when building the response.
 * @returns The shared `postJsonRequestMock`, so callers can assert on its calls.
 */
function installGoogleTtsRequestMock(pcm = Buffer.from([1, 0, 2, 0])) {
  const response = googleTtsResponse(pcm);
  const release = vi.fn(async () => {});
  postJsonRequestMock.mockResolvedValue({ response, release });
  return postJsonRequestMock;
}
voice-note targets", async () => { - installGoogleTtsFetchMock(Buffer.from([5, 0, 6, 0])); + installGoogleTtsRequestMock(Buffer.from([5, 0, 6, 0])); transcodeAudioBufferToOpusMock.mockResolvedValueOnce(Buffer.from("google-opus")); const provider = buildGoogleSpeechProvider(); @@ -125,9 +148,138 @@ describe("Google speech provider", () => { expect(audioBuffer.subarray(8, 12).toString("ascii")).toBe("WAVE"); }); + it("advertises all documented Gemini TTS-capable models", () => { + const provider = buildGoogleSpeechProvider(); + + expect(provider.models).toEqual(__testing.GOOGLE_TTS_MODELS); + }); + + it("renders deterministic audio-profile-v1 prompts without generating tags", async () => { + const provider = buildGoogleSpeechProvider(); + + const prepared = await provider.prepareSynthesis?.({ + text: "[whispers] The door is open.", + cfg: {}, + providerConfig: { + promptTemplate: "audio-profile-v1", + personaPrompt: "Keep a close-mic feel.", + }, + persona: { + id: "alfred", + label: "Alfred", + prompt: { + profile: "A brilliant British butler.", + scene: "A quiet late-night study.", + sampleContext: "The speaker is answering a trusted operator.", + style: "Refined and lightly amused.", + accent: "British English.", + pacing: "Measured.", + constraints: ["Do not read configuration values aloud."], + }, + }, + target: "audio-file", + timeoutMs: 1_000, + }); + + expect(prepared?.text).toBe( + [ + "Synthesize speech from the TRANSCRIPT section only. Use the other sections only", + "as performance direction. 
Do not read section titles, notes, labels, or", + "configuration aloud.", + "", + "# AUDIO PROFILE: Alfred", + "A brilliant British butler.", + "", + "## THE SCENE", + "A quiet late-night study.", + "", + "### DIRECTOR'S NOTES", + "Style: Refined and lightly amused.", + "Accent: British English.", + "Pacing: Measured.", + "Constraints:", + "- Do not read configuration values aloud.", + "Provider notes:", + "Keep a close-mic feel.", + "", + "### SAMPLE CONTEXT", + "The speaker is answering a trusted operator.", + "", + "### TRANSCRIPT", + "[whispers] The door is open.", + ].join("\n"), + ); + }); + + it("does not wrap an OpenClaw audio-profile-v1 prompt twice", async () => { + const provider = buildGoogleSpeechProvider(); + const text = [ + "Synthesize speech from the TRANSCRIPT section only. Use the other sections only", + "as performance direction. Do not read section titles, notes, labels, or", + "configuration aloud.", + "", + "# AUDIO PROFILE: Alfred", + "A brilliant British butler.", + "", + "### TRANSCRIPT", + "Hello.", + ].join("\n"); + + const prepared = await provider.prepareSynthesis?.({ + text, + cfg: {}, + providerConfig: { + promptTemplate: "audio-profile-v1", + }, + persona: { + id: "alfred", + label: "Alfred", + prompt: { + profile: "A brilliant British butler.", + }, + }, + target: "audio-file", + timeoutMs: 1_000, + }); + + expect(prepared).toBeUndefined(); + }); + + it("retries once when Gemini returns no audio payload", async () => { + const pcm = Buffer.from([5, 0, 6, 0]); + const requestSequence = vi + .fn() + .mockResolvedValueOnce({ + response: { + ok: true, + json: async () => ({ candidates: [{ content: { parts: [{ text: "not audio" }] } }] }), + }, + release: vi.fn(async () => {}), + }) + .mockResolvedValueOnce({ + response: googleTtsResponse(pcm), + release: vi.fn(async () => {}), + }); + postJsonRequestMock.mockImplementation(requestSequence); + const provider = buildGoogleSpeechProvider(); + + const result = await provider.synthesize({ + 
text: "Retry this.", + cfg: {}, + providerConfig: { + apiKey: "google-test-key", + }, + target: "audio-file", + timeoutMs: 5_000, + }); + + expect(requestSequence).toHaveBeenCalledTimes(2); + expect(result.audioBuffer.subarray(44)).toEqual(pcm); + }); + it("falls back to GEMINI_API_KEY and configured Google API base URL", async () => { vi.stubEnv("GEMINI_API_KEY", "env-google-key"); - const fetchMock = installGoogleTtsFetchMock(); + const requestMock = installGoogleTtsRequestMock(); const provider = buildGoogleSpeechProvider(); expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 1 })).toBe(true); @@ -149,16 +301,17 @@ describe("Google speech provider", () => { timeoutMs: 10_000, }); - expect(fetchMock).toHaveBeenCalledWith( - "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent", - expect.any(Object), + expect(requestMock).toHaveBeenCalledWith( + expect.objectContaining({ + url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent", + }), ); - const [, init] = fetchMock.mock.calls[0]; - expect(new Headers(init.headers).get("x-goog-api-key")).toBe("env-google-key"); + const request = requestMock.mock.calls[0]?.[0] as { headers?: HeadersInit }; + expect(new Headers(request.headers).get("x-goog-api-key")).toBe("env-google-key"); }); it("can reuse a configured Google model-provider API key without auth profiles", async () => { - const fetchMock = installGoogleTtsFetchMock(); + const requestMock = installGoogleTtsRequestMock(); const provider = buildGoogleSpeechProvider(); const cfg = { models: { @@ -182,13 +335,13 @@ describe("Google speech provider", () => { timeoutMs: 10_000, }); - const [, init] = fetchMock.mock.calls[0]; - expect(new Headers(init.headers).get("x-goog-api-key")).toBe("model-provider-google-key"); + const request = requestMock.mock.calls[0]?.[0] as { headers?: HeadersInit }; + expect(new 
Headers(request.headers).get("x-goog-api-key")).toBe("model-provider-google-key"); }); it("returns Gemini PCM directly for telephony synthesis", async () => { const pcm = Buffer.from([3, 0, 4, 0]); - installGoogleTtsFetchMock(pcm); + installGoogleTtsRequestMock(pcm); const provider = buildGoogleSpeechProvider(); const result = await provider.synthesizeTelephony?.({ @@ -209,7 +362,7 @@ describe("Google speech provider", () => { }); it("prepends configured Gemini TTS profile text", async () => { - const fetchMock = installGoogleTtsFetchMock(); + const requestMock = installGoogleTtsRequestMock(); const provider = buildGoogleSpeechProvider(); await provider.synthesize({ @@ -224,8 +377,7 @@ describe("Google speech provider", () => { timeoutMs: 10_000, }); - const [, init] = fetchMock.mock.calls[0]; - expect(JSON.parse(String(init.body))).toMatchObject({ + expect(requestMock.mock.calls[0]?.[0].body).toMatchObject({ contents: [ { parts: [ @@ -326,23 +478,26 @@ describe("Google speech provider", () => { }); it("formats Google TTS HTTP errors with provider details", async () => { - vi.stubGlobal( - "fetch", - vi.fn().mockResolvedValue( - new Response( - JSON.stringify({ - error: { - message: "Quota exceeded", - status: "RESOURCE_EXHAUSTED", - }, - }), - { - status: 429, - headers: { "x-request-id": "google_req_123" }, - }, - ), + assertOkOrThrowProviderErrorMock.mockRejectedValue( + new Error( + "Google TTS failed (429): Quota exceeded [code=RESOURCE_EXHAUSTED] [request_id=google_req_123]", ), ); + postJsonRequestMock.mockResolvedValue({ + response: new Response( + JSON.stringify({ + error: { + message: "Quota exceeded", + status: "RESOURCE_EXHAUSTED", + }, + }), + { + status: 429, + headers: { "x-request-id": "google_req_123" }, + }, + ), + release: vi.fn(async () => {}), + }); const provider = buildGoogleSpeechProvider(); await expect( @@ -359,8 +514,7 @@ describe("Google speech provider", () => { }); it("honors configured private-network opt-in for Google TTS", async () 
=> { - installGoogleTtsFetchMock(); - const postJsonRequestSpy = vi.spyOn(providerHttp, "postJsonRequest"); + installGoogleTtsRequestMock(); const provider = buildGoogleSpeechProvider(); await provider.synthesize({ @@ -381,14 +535,16 @@ describe("Google speech provider", () => { timeoutMs: 12_345, }); - expect(postJsonRequestSpy).toHaveBeenCalledWith( - expect.objectContaining({ allowPrivateNetwork: true }), + expect(resolveProviderHttpRequestConfigMock).toHaveBeenCalledWith( + expect.objectContaining({ + allowPrivateNetwork: true, + request: expect.objectContaining({ allowPrivateNetwork: true }), + }), ); }); it("honors configured private-network opt-in for Google telephony TTS", async () => { - installGoogleTtsFetchMock(); - const postJsonRequestSpy = vi.spyOn(providerHttp, "postJsonRequest"); + installGoogleTtsRequestMock(); const provider = buildGoogleSpeechProvider(); await provider.synthesizeTelephony?.({ @@ -408,8 +564,11 @@ describe("Google speech provider", () => { timeoutMs: 12_345, }); - expect(postJsonRequestSpy).toHaveBeenCalledWith( - expect.objectContaining({ allowPrivateNetwork: true }), + expect(resolveProviderHttpRequestConfigMock).toHaveBeenCalledWith( + expect.objectContaining({ + allowPrivateNetwork: true, + request: expect.objectContaining({ allowPrivateNetwork: true }), + }), ); }); }); diff --git a/extensions/google/speech-provider.ts b/extensions/google/speech-provider.ts index e31686cddcb..b0a3932e0a6 100644 --- a/extensions/google/speech-provider.ts +++ b/extensions/google/speech-provider.ts @@ -21,6 +21,13 @@ const DEFAULT_GOOGLE_TTS_VOICE = "Kore"; const GOOGLE_TTS_SAMPLE_RATE = 24_000; const GOOGLE_TTS_CHANNELS = 1; const GOOGLE_TTS_BITS_PER_SAMPLE = 16; +const GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE = "audio-profile-v1"; + +const GOOGLE_TTS_MODELS = [ + "gemini-3.1-flash-tts-preview", + "gemini-2.5-flash-preview-tts", + "gemini-2.5-pro-preview-tts", +] as const; const GOOGLE_TTS_VOICES = [ "Zephyr", @@ -62,6 +69,8 @@ type 
/**
 * Validates a configured Google TTS `promptTemplate` value.
 *
 * @param value Raw config value; it is first passed through `normalizeOptionalString`.
 * @returns The supported template id when the normalized value matches it, or
 *   `undefined` when nothing usable was configured.
 * @throws Error when a non-empty value is not the supported template id.
 */
function normalizeGooglePromptTemplate(
  value: unknown,
): typeof GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE | undefined {
  const candidate = normalizeOptionalString(value);
  if (candidate === GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE) {
    return candidate;
  }
  if (candidate) {
    // Anything else that was explicitly configured is a mistake worth surfacing.
    throw new Error(`Invalid Google TTS promptTemplate: ${candidate}`);
  }
  return undefined;
}
/**
 * Normalizes one free-text prompt field before it is embedded in a Gemini prompt.
 *
 * Line endings are unified to `\n`, control characters other than tab and
 * newline (plus DEL) are removed, and the result is trimmed.
 *
 * Trimming happens AFTER stripping so that a value consisting only of control
 * characters (or control characters surrounded by whitespace) collapses to
 * `undefined` rather than to `""` or a string with stray edge whitespace.
 *
 * @param value Raw field value, possibly undefined.
 * @returns The cleaned text, or `undefined` when nothing printable remains.
 */
function normalizePromptSectionText(value: string | undefined): string | undefined {
  const unified = value?.replace(/\r\n?/g, "\n");
  if (!unified) {
    return undefined;
  }
  let sanitized = "";
  for (const char of unified) {
    const code = char.charCodeAt(0);
    const isStrippedControl =
      (code >= 0 && code <= 8) ||
      code === 11 ||
      code === 12 ||
      (code >= 14 && code <= 31) ||
      code === 127;
    if (!isStrippedControl) {
      sanitized += char;
    }
  }
  const trimmed = sanitized.trim();
  return trimmed.length > 0 ? trimmed : undefined;
}

/**
 * Normalizes every entry of a prompt list and drops entries that end up empty.
 *
 * @param values Raw list (for example persona constraints); may be undefined.
 * @returns Cleaned, non-empty entries in their original order.
 */
function normalizePromptList(values: readonly string[] | undefined): string[] {
  return (values ?? [])
    .map((value) => normalizePromptSectionText(value))
    .filter((value): value is string => Boolean(value));
}

/**
 * Reports whether `text` already looks like a prompt produced by
 * `renderGoogleAudioProfilePrompt`: it starts with the renderer's opening
 * sentence and contains both the audio-profile and transcript headings.
 */
function isOpenClawGoogleAudioProfilePrompt(text: string): boolean {
  return (
    text.includes("# AUDIO PROFILE:") &&
    text.includes("### TRANSCRIPT") &&
    text.startsWith("Synthesize speech from the TRANSCRIPT section only.")
  );
}

/**
 * Renders the `audio-profile-v1` Gemini TTS prompt.
 *
 * Sections are emitted in a fixed order and separated by one blank line:
 * preamble, audio profile, scene, director's notes, sample context, transcript.
 * A section is omitted when it has no content; the preamble and the transcript
 * are always present.
 *
 * @param params.text Transcript to speak; line endings are unified and the text
 *   is trimmed, but it is otherwise passed through untouched.
 * @param params.persona Optional persona whose label (or id) and prompt fields
 *   fill the profile, scene, notes, and sample-context sections.
 * @param params.personaPrompt Optional extra text appended to the director's
 *   notes under "Provider notes:".
 * @returns The complete prompt string.
 */
function renderGoogleAudioProfilePrompt(params: {
  text: string;
  persona?: {
    id: string;
    label?: string;
    prompt?: {
      profile?: string;
      scene?: string;
      sampleContext?: string;
      style?: string;
      accent?: string;
      pacing?: string;
      constraints?: string[];
    };
  };
  personaPrompt?: string;
}): string {
  const transcript = params.text.replace(/\r\n?/g, "\n").trim();
  const prompt = params.persona?.prompt;
  const profile = normalizePromptSectionText(prompt?.profile);
  const scene = normalizePromptSectionText(prompt?.scene);
  const sampleContext = normalizePromptSectionText(prompt?.sampleContext);
  const style = normalizePromptSectionText(prompt?.style);
  const accent = normalizePromptSectionText(prompt?.accent);
  const pacing = normalizePromptSectionText(prompt?.pacing);
  const constraints = normalizePromptList(prompt?.constraints);
  const personaPrompt = normalizePromptSectionText(params.personaPrompt);
  // The sanitizer never returns "", so an unusable label reliably falls back to the id.
  const label =
    normalizePromptSectionText(params.persona?.label) ??
    normalizePromptSectionText(params.persona?.id);

  const sections = [
    [
      "Synthesize speech from the TRANSCRIPT section only. Use the other sections only",
      "as performance direction. Do not read section titles, notes, labels, or",
      "configuration aloud.",
    ].join("\n"),
  ];

  if (label || profile) {
    sections.push([`# AUDIO PROFILE: ${label ?? "voice"}`, profile].filter(Boolean).join("\n"));
  }
  if (scene) {
    sections.push(["## THE SCENE", scene].join("\n"));
  }

  const directorNotes: string[] = [];
  if (style) {
    directorNotes.push(`Style: ${style}`);
  }
  if (accent) {
    directorNotes.push(`Accent: ${accent}`);
  }
  if (pacing) {
    directorNotes.push(`Pacing: ${pacing}`);
  }
  if (constraints.length > 0) {
    directorNotes.push(["Constraints:", ...constraints.map((item) => `- ${item}`)].join("\n"));
  }
  if (personaPrompt) {
    directorNotes.push(["Provider notes:", personaPrompt].join("\n"));
  }
  if (directorNotes.length > 0) {
    sections.push(["### DIRECTOR'S NOTES", ...directorNotes].join("\n"));
  }

  if (sampleContext) {
    sections.push(["### SAMPLE CONTEXT", sampleContext].join("\n"));
  }

  // The transcript always comes last.
  sections.push(["### TRANSCRIPT", transcript].join("\n"));
  return sections.join("\n\n");
}
/**
 * Synthesizes Gemini PCM audio, retrying exactly once on a retryable failure.
 *
 * The first attempt is retried only when it rejects with
 * `GoogleTtsRetryableError`; any other error propagates immediately. Errors
 * from the second attempt, retryable or not, always propagate to the caller.
 *
 * The parameter type is derived from `synthesizeGoogleTtsPcmOnce` so the two
 * signatures cannot drift apart.
 *
 * @param params Request parameters, forwarded unchanged to each attempt.
 * @returns The PCM buffer returned by the successful attempt.
 * @throws The first non-retryable error, or whatever the second attempt throws.
 */
async function synthesizeGoogleTtsPcm(
  params: Parameters<typeof synthesizeGoogleTtsPcmOnce>[0],
): Promise<Buffer> {
  try {
    return await synthesizeGoogleTtsPcmOnce(params);
  } catch (err) {
    if (!(err instanceof GoogleTtsRetryableError)) {
      throw err;
    }
    // Retryable: fall through to the single retry below.
  }
  return await synthesizeGoogleTtsPcmOnce(params);
}
persona: ctx.persona, + personaPrompt: config.personaPrompt, + }), + }; + }, synthesize: async (req) => { const config = readGoogleTtsProviderConfig(req.providerConfig); const overrides = readGoogleTtsOverrides(req.providerOverrides); @@ -449,7 +653,10 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin { export const __testing = { DEFAULT_GOOGLE_TTS_MODEL, DEFAULT_GOOGLE_TTS_VOICE, + GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE, + GOOGLE_TTS_MODELS, GOOGLE_TTS_SAMPLE_RATE, normalizeGoogleTtsModel, + renderGoogleAudioProfilePrompt, wrapPcm16MonoToWav, }; diff --git a/extensions/openai/openai.live.test.ts b/extensions/openai/openai.live.test.ts index 220f94ed53a..8345c4409b1 100644 --- a/extensions/openai/openai.live.test.ts +++ b/extensions/openai/openai.live.test.ts @@ -134,6 +134,7 @@ function createLiveTtsConfig(): ResolvedTtsConfig { voice: "alloy", }, }, + personas: {}, maxTextLength: 4_000, timeoutMs: 30_000, }; diff --git a/extensions/openai/speech-provider.test.ts b/extensions/openai/speech-provider.test.ts index 65c09736d86..4413af440b0 100644 --- a/extensions/openai/speech-provider.test.ts +++ b/extensions/openai/speech-provider.test.ts @@ -162,6 +162,40 @@ describe("buildOpenAISpeechProvider", () => { }); }); + it("maps persona prompt fields to instructions when instructions are unset", async () => { + const provider = buildOpenAISpeechProvider(); + + const prepared = await provider.prepareSynthesis?.({ + text: "hello", + cfg: {} as never, + providerConfig: { + apiKey: "sk-test", + model: "gpt-4o-mini-tts", + voice: "cedar", + }, + persona: { + id: "alfred", + label: "Alfred", + prompt: { + profile: "A brilliant British butler.", + scene: "A quiet late-night study.", + sampleContext: "The speaker is answering a trusted operator.", + style: "Refined and lightly amused.", + accent: "British English.", + pacing: "Measured.", + constraints: ["Do not read configuration values aloud."], + }, + }, + target: "audio-file", + timeoutMs: 1_000, + }); + + 
expect(prepared?.providerConfig?.instructions).toContain("Persona: Alfred"); + expect(prepared?.providerConfig?.instructions).toContain( + "Constraint: Do not read configuration values aloud.", + ); + }); + it("uses wav for Groq-compatible OpenAI TTS endpoints", async () => { const provider = buildOpenAISpeechProvider(); mockSpeechFetchExpectingFormat("wav"); diff --git a/extensions/openai/speech-provider.ts b/extensions/openai/speech-provider.ts index 04c99e40158..7ae825f33cc 100644 --- a/extensions/openai/speech-provider.ts +++ b/extensions/openai/speech-provider.ts @@ -71,7 +71,7 @@ function isGroqSpeechBaseUrl(baseUrl: string): boolean { function resolveSpeechResponseFormat( baseUrl: string, - target: "audio-file" | "voice-note", + target: "audio-file" | "voice-note" | "telephony", configuredFormat?: OpenAiSpeechResponseFormat, ): OpenAiSpeechResponseFormat { if (configuredFormat) { @@ -145,6 +145,37 @@ function readOpenAIOverrides( }; } +function renderOpenAITtsPersonaInstructions(req: { + label?: string; + prompt?: { + profile?: string; + scene?: string; + sampleContext?: string; + style?: string; + accent?: string; + pacing?: string; + constraints?: string[]; + }; +}): string | undefined { + const prompt = req.prompt; + if (!prompt) { + return undefined; + } + const lines = [ + req.label ? `Persona: ${req.label}` : undefined, + prompt.profile ? `Profile: ${prompt.profile}` : undefined, + prompt.scene ? `Scene: ${prompt.scene}` : undefined, + prompt.style ? `Style: ${prompt.style}` : undefined, + prompt.accent ? `Accent: ${prompt.accent}` : undefined, + prompt.pacing ? `Pacing: ${prompt.pacing}` : undefined, + prompt.sampleContext ? `Sample context: ${prompt.sampleContext}` : undefined, + ...(prompt.constraints ?? []).map((constraint) => `Constraint: ${constraint}`), + ] + .map((line) => trimToUndefined(line)) + .filter((line): line is string => Boolean(line)); + return lines.length > 0 ? 
lines.join("\n") : undefined; +} + function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): { handled: boolean; overrides?: SpeechProviderOverrides; @@ -229,6 +260,23 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin { listVoices: async () => OPENAI_TTS_VOICES.map((voice) => ({ id: voice, name: voice })), isConfigured: ({ providerConfig }) => Boolean(readOpenAIProviderConfig(providerConfig).apiKey || process.env.OPENAI_API_KEY), + prepareSynthesis: (ctx) => { + const config = readOpenAIProviderConfig(ctx.providerConfig); + if (config.instructions) { + return undefined; + } + const instructions = renderOpenAITtsPersonaInstructions({ + label: ctx.persona?.label ?? ctx.persona?.id, + prompt: ctx.persona?.prompt, + }); + return instructions + ? { + providerConfig: { + instructions, + }, + } + : undefined; + }, synthesize: async (req) => { const config = readOpenAIProviderConfig(req.providerConfig); const overrides = readOpenAIOverrides(req.providerOverrides); diff --git a/extensions/speech-core/runtime-api.ts b/extensions/speech-core/runtime-api.ts index 81a6c3a83c3..2959109108e 100644 --- a/extensions/speech-core/runtime-api.ts +++ b/extensions/speech-core/runtime-api.ts @@ -3,11 +3,13 @@ export { getLastTtsAttempt, getResolvedSpeechProviderConfig, getTtsMaxLength, + getTtsPersona, getTtsProvider, isSummarizationEnabled, isTtsEnabled, isTtsProviderConfigured, listSpeechVoices, + listTtsPersonas, maybeApplyTtsToPayload, resolveExplicitTtsOverrides, resolveTtsAutoMode, @@ -19,6 +21,7 @@ export { setTtsAutoMode, setTtsEnabled, setTtsMaxLength, + setTtsPersona, setTtsProvider, synthesizeSpeech, textToSpeech, diff --git a/extensions/speech-core/src/tts.test.ts b/extensions/speech-core/src/tts.test.ts index 0249d178467..90d19a4e252 100644 --- a/extensions/speech-core/src/tts.test.ts +++ b/extensions/speech-core/src/tts.test.ts @@ -1,7 +1,12 @@ import { rmSync } from "node:fs"; import path from "node:path"; import type { OpenClawConfig } from 
"openclaw/plugin-sdk/config-runtime"; -import type { SpeechProviderPlugin, SpeechSynthesisRequest } from "openclaw/plugin-sdk/speech-core"; +import type { ReplyPayload } from "openclaw/plugin-sdk/reply-payload"; +import type { + SpeechProviderPlugin, + SpeechProviderPrepareSynthesisContext, + SpeechSynthesisRequest, +} from "openclaw/plugin-sdk/speech-core"; import { afterEach, describe, expect, it, vi } from "vitest"; type MockSpeechSynthesisResult = Awaited>; @@ -16,6 +21,9 @@ const synthesizeMock = vi.hoisted(() => }), ), ); +const prepareSynthesisMock = vi.hoisted(() => + vi.fn(async (_ctx: SpeechProviderPrepareSynthesisContext) => undefined), +); const listSpeechProvidersMock = vi.hoisted(() => vi.fn()); const getSpeechProviderMock = vi.hoisted(() => vi.fn()); @@ -31,6 +39,7 @@ vi.mock("../api.js", async () => { label: "Mock", autoSelectOrder: 1, isConfigured: () => true, + prepareSynthesis: prepareSynthesisMock, synthesize: synthesizeMock, }; listSpeechProvidersMock.mockImplementation(() => [mockProvider]); @@ -49,10 +58,40 @@ vi.mock("../api.js", async () => { }; }); -const { _test, maybeApplyTtsToPayload, resolveTtsConfig } = await import("./tts.js"); +const { + _test, + getTtsPersona, + getTtsProvider, + maybeApplyTtsToPayload, + resolveTtsConfig, + synthesizeSpeech, + textToSpeechTelephony, +} = await import("./tts.js"); const nativeVoiceNoteChannels = ["discord", "feishu", "matrix", "telegram", "whatsapp"] as const; +function createMockSpeechProvider( + id = "mock", + options: Partial = {}, +): SpeechProviderPlugin { + return { + id, + label: id, + autoSelectOrder: id === "mock" ? 
1 : 2, + isConfigured: () => true, + prepareSynthesis: prepareSynthesisMock, + synthesize: synthesizeMock, + ...options, + }; +} + +function installSpeechProviders(providers: SpeechProviderPlugin[]): void { + listSpeechProvidersMock.mockImplementation(() => providers); + getSpeechProviderMock.mockImplementation( + (providerId: string) => providers.find((provider) => provider.id === providerId) ?? null, + ); +} + function createTtsConfig(prefsName: string): OpenClawConfig { return { messages: { @@ -102,6 +141,8 @@ async function expectTtsPayloadResult(params: { describe("speech-core native voice-note routing", () => { afterEach(() => { synthesizeMock.mockClear(); + prepareSynthesisMock.mockClear(); + installSpeechProviders([createMockSpeechProvider()]); }); it("keeps native voice-note channel support centralized", () => { @@ -153,6 +194,268 @@ describe("speech-core native voice-note routing", () => { audioAsVoice: undefined, }); }); + + it("selects persona preferred provider before config fallback", () => { + const cfg: OpenClawConfig = { + messages: { + tts: { + enabled: true, + provider: "other", + persona: "alfred", + personas: { + alfred: { + label: "Alfred", + provider: "mock", + providers: { + mock: { + voice: "Algieba", + }, + }, + }, + }, + }, + }, + }; + const config = resolveTtsConfig(cfg); + const prefsPath = "/tmp/openclaw-speech-core-persona-provider.json"; + + expect(getTtsPersona(config, prefsPath)?.id).toBe("alfred"); + expect(getTtsProvider(config, prefsPath)).toBe("mock"); + }); + + it("merges active persona provider binding into synthesis config", async () => { + const cfg: OpenClawConfig = { + messages: { + tts: { + enabled: true, + provider: "mock", + prefsPath: "/tmp/openclaw-speech-core-persona-merge.json", + providers: { + mock: { + model: "base-model", + voice: "base-voice", + }, + }, + persona: "alfred", + personas: { + alfred: { + provider: "mock", + providers: { + mock: { + voice: "persona-voice", + style: "dry", + }, + }, + }, + }, + }, 
+ }, + }; + + const payload: ReplyPayload = { + text: "This reply should use persona-specific provider configuration.", + }; + + let mediaDir: string | undefined; + try { + const result = await maybeApplyTtsToPayload({ + payload, + cfg, + channel: "slack", + kind: "final", + }); + + expect(synthesizeMock).toHaveBeenCalledWith( + expect.objectContaining({ + providerConfig: expect.objectContaining({ + model: "base-model", + voice: "persona-voice", + style: "dry", + }), + }), + ); + expect(result.mediaUrl).toMatch(/voice-\d+\.ogg$/); + + mediaDir = result.mediaUrl ? path.dirname(result.mediaUrl) : undefined; + } finally { + if (mediaDir) { + rmSync(mediaDir, { recursive: true, force: true }); + } + } + }); + + it("does not mark skipped unregistered providers as missing persona bindings", async () => { + const result = await synthesizeSpeech({ + text: "Use fallback provider.", + cfg: { + messages: { + tts: { + enabled: true, + provider: "missing", + persona: "alfred", + personas: { + alfred: { + providers: { + missing: { + voice: "configured-but-unregistered", + }, + }, + }, + }, + }, + }, + }, + }); + + expect(result.success).toBe(true); + expect(result.attempts?.[0]).toMatchObject({ + provider: "missing", + outcome: "skipped", + reasonCode: "no_provider_registered", + persona: "alfred", + }); + expect(result.attempts?.[0]).not.toHaveProperty("personaBinding"); + }); + + it("does not mark skipped telephony providers as missing persona bindings", async () => { + const result = await textToSpeechTelephony({ + text: "Use telephony provider.", + cfg: { + messages: { + tts: { + enabled: true, + provider: "mock", + persona: "alfred", + personas: { + alfred: { + providers: { + mock: { + voice: "persona-voice", + }, + }, + }, + }, + }, + }, + }, + }); + + expect(result.success).toBe(false); + expect(result.attempts?.[0]).toMatchObject({ + provider: "mock", + outcome: "skipped", + reasonCode: "unsupported_for_telephony", + persona: "alfred", + }); + 
expect(result.attempts?.[0]).not.toHaveProperty("personaBinding"); + }); + + it("uses provider defaults when fallback policy allows missing persona bindings", async () => { + await synthesizeSpeech({ + text: "Use neutral provider defaults.", + cfg: { + messages: { + tts: { + enabled: true, + provider: "mock", + persona: "alfred", + personas: { + alfred: { + fallbackPolicy: "provider-defaults", + prompt: { + profile: "A precise butler.", + }, + }, + }, + }, + }, + }, + }); + + expect(prepareSynthesisMock).toHaveBeenCalledWith( + expect.objectContaining({ + persona: undefined, + personaProviderConfig: undefined, + }), + ); + }); + + it("preserves persona prompts by default when provider bindings are missing", async () => { + await synthesizeSpeech({ + text: "Use persona prompt.", + cfg: { + messages: { + tts: { + enabled: true, + provider: "mock", + persona: "alfred", + personas: { + alfred: { + prompt: { + profile: "A precise butler.", + }, + }, + }, + }, + }, + }, + }); + + expect(prepareSynthesisMock).toHaveBeenCalledWith( + expect.objectContaining({ + persona: expect.objectContaining({ id: "alfred" }), + personaProviderConfig: undefined, + }), + ); + }); + + it("skips unbound providers under fail policy while allowing bound fallbacks", async () => { + installSpeechProviders([ + createMockSpeechProvider("mock", { autoSelectOrder: 1 }), + createMockSpeechProvider("fallback", { autoSelectOrder: 2 }), + ]); + + const result = await synthesizeSpeech({ + text: "Use the first persona-bound provider.", + cfg: { + messages: { + tts: { + enabled: true, + provider: "mock", + persona: "alfred", + personas: { + alfred: { + fallbackPolicy: "fail", + providers: { + fallback: { + voice: "fallback-voice", + }, + }, + }, + }, + }, + }, + }, + }); + + expect(result.success).toBe(true); + expect(result.provider).toBe("fallback"); + expect(result.fallbackFrom).toBe("mock"); + expect(result.attempts?.[0]).toMatchObject({ + provider: "mock", + outcome: "skipped", + reasonCode: 
"not_configured", + persona: "alfred", + personaBinding: "missing", + error: "mock: persona alfred has no provider binding", + }); + expect(result.attempts?.[1]).toMatchObject({ + provider: "fallback", + outcome: "success", + persona: "alfred", + personaBinding: "applied", + }); + }); }); describe("speech-core per-agent TTS config", () => { diff --git a/extensions/speech-core/src/tts.ts b/extensions/speech-core/src/tts.ts index 9133be9d1b1..44f76e5cc54 100644 --- a/extensions/speech-core/src/tts.ts +++ b/extensions/speech-core/src/tts.ts @@ -12,6 +12,7 @@ import path from "node:path"; import { normalizeChannelId, type ChannelId } from "openclaw/plugin-sdk/channel-targets"; import type { OpenClawConfig, + ResolvedTtsPersona, TtsAutoMode, TtsConfig, TtsModelOverrideConfig, @@ -40,6 +41,7 @@ import { normalizeSpeechProviderId, normalizeTtsAutoMode, parseTtsDirectives, + resolveEffectiveTtsConfig, type ResolvedTtsConfig, type ResolvedTtsModelOverrides, scheduleCleanup, @@ -62,13 +64,13 @@ const DEFAULT_TIMEOUT_MS = 30_000; const DEFAULT_TTS_MAX_LENGTH = 1500; const DEFAULT_TTS_SUMMARIZE = true; const DEFAULT_MAX_TEXT_LENGTH = 4096; -const BLOCKED_MERGE_KEYS = new Set(["__proto__", "prototype", "constructor"]); type TtsUserPrefs = { tts?: { auto?: TtsAutoMode; enabled?: boolean; provider?: TtsProvider; + persona?: string | null; maxLength?: number; summarize?: boolean; }; @@ -86,6 +88,8 @@ export type TtsProviderAttempt = { provider: string; outcome: "success" | "skipped" | "failed"; reasonCode: TtsAttemptReasonCode; + persona?: string; + personaBinding?: "applied" | "missing" | "none"; latencyMs?: number; error?: string; }; @@ -96,6 +100,7 @@ export type TtsResult = { error?: string; latencyMs?: number; provider?: string; + persona?: string; fallbackFrom?: string; attemptedProviders?: string[]; attempts?: TtsProviderAttempt[]; @@ -111,6 +116,7 @@ export type TtsSynthesisResult = { error?: string; latencyMs?: number; provider?: string; + persona?: string; fallbackFrom?: 
string; attemptedProviders?: string[]; attempts?: TtsProviderAttempt[]; @@ -126,6 +132,7 @@ export type TtsTelephonyResult = { error?: string; latencyMs?: number; provider?: string; + persona?: string; fallbackFrom?: string; attemptedProviders?: string[]; attempts?: TtsProviderAttempt[]; @@ -139,6 +146,7 @@ type TtsStatusEntry = { textLength: number; summarized: boolean; provider?: string; + persona?: string; fallbackFrom?: string; attemptedProviders?: string[]; attempts?: TtsProviderAttempt[]; @@ -162,6 +170,10 @@ function normalizeConfiguredSpeechProviderId( return normalized === "edge" ? "microsoft" : normalized; } +function normalizeTtsPersonaId(personaId: string | null | undefined): string | undefined { + return normalizeOptionalLowercaseString(personaId ?? undefined); +} + function resolveTtsPrefsPathValue(prefsPath: string | undefined): string { if (prefsPath?.trim()) { return resolveUserPath(prefsPath.trim()); @@ -229,6 +241,87 @@ function asProviderConfigMap(value: unknown): Record { : {}; } +function hasOwnProperty(value: object, key: string): boolean { + return Object.prototype.hasOwnProperty.call(value, key); +} + +function normalizeProviderConfigMap( + value: unknown, +): Record | undefined { + const rawMap = asProviderConfigMap(value); + if (Object.keys(rawMap).length === 0) { + return undefined; + } + const next: Record = {}; + for (const [providerId, providerConfig] of Object.entries(rawMap)) { + const normalized = normalizeConfiguredSpeechProviderId(providerId) ?? 
providerId; + next[normalized] = asProviderConfig(providerConfig); + } + return next; +} + +function collectTtsPersonas(raw: TtsConfig): Record { + const rawPersonas = asProviderConfigMap(raw.personas); + const personas: Record = {}; + for (const [id, value] of Object.entries(rawPersonas)) { + const normalizedId = normalizeTtsPersonaId(id); + if (!normalizedId || typeof value !== "object" || value === null || Array.isArray(value)) { + continue; + } + const persona = value as Omit; + personas[normalizedId] = { + ...persona, + id: normalizedId, + provider: normalizeConfiguredSpeechProviderId(persona.provider) ?? persona.provider, + providers: normalizeProviderConfigMap(persona.providers), + }; + } + return personas; +} + +function resolvePersonaProviderConfig( + persona: ResolvedTtsPersona | undefined, + providerId: string, +): SpeechProviderConfig | undefined { + if (!persona?.providers) { + return undefined; + } + const normalized = normalizeConfiguredSpeechProviderId(providerId) ?? providerId; + if (hasOwnProperty(persona.providers, normalized)) { + return persona.providers[normalized]; + } + if (hasOwnProperty(persona.providers, providerId)) { + return persona.providers[providerId]; + } + return undefined; +} + +function mergeProviderConfigWithPersona(params: { + providerConfig: SpeechProviderConfig; + persona?: ResolvedTtsPersona; + providerId: string; +}): { + providerConfig: SpeechProviderConfig; + personaProviderConfig?: SpeechProviderConfig; + personaBinding: "applied" | "missing" | "none"; +} { + if (!params.persona) { + return { providerConfig: params.providerConfig, personaBinding: "none" }; + } + const personaProviderConfig = resolvePersonaProviderConfig(params.persona, params.providerId); + if (!personaProviderConfig) { + return { providerConfig: params.providerConfig, personaBinding: "missing" }; + } + return { + providerConfig: { + ...params.providerConfig, + ...personaProviderConfig, + }, + personaProviderConfig, + personaBinding: "applied", + }; +} 
+ function resolveRawProviderConfig( raw: TtsConfig | undefined, providerId: string, @@ -241,48 +334,6 @@ function resolveRawProviderConfig( return asProviderConfig(direct); } -function isPlainObject(value: unknown): value is Record { - return Boolean(value) && typeof value === "object" && !Array.isArray(value); -} - -function deepMergeDefined(base: unknown, override: unknown): unknown { - if (!isPlainObject(base) || !isPlainObject(override)) { - return override === undefined ? base : override; - } - - const result: Record = { ...base }; - for (const [key, value] of Object.entries(override)) { - if (BLOCKED_MERGE_KEYS.has(key) || value === undefined) { - continue; - } - const existing = result[key]; - result[key] = key in result ? deepMergeDefined(existing, value) : value; - } - return result; -} - -function normalizeAgentConfigId(value: string | undefined | null): string { - return normalizeLowercaseStringOrEmpty(value); -} - -function resolveAgentTtsOverride( - cfg: OpenClawConfig, - agentId: string | undefined, -): TtsConfig | undefined { - if (!agentId || !Array.isArray(cfg.agents?.list)) { - return undefined; - } - const normalized = normalizeAgentConfigId(agentId); - const agent = cfg.agents.list.find((entry) => normalizeAgentConfigId(entry.id) === normalized); - return agent?.tts; -} - -function resolveEffectiveTtsRawConfig(cfg: OpenClawConfig, agentId?: string): TtsConfig { - const base = cfg.messages?.tts ?? {}; - const override = resolveAgentTtsOverride(cfg, agentId); - return deepMergeDefined(base, override ?? 
{}) as TtsConfig; -} - function resolveLazyProviderConfig( config: ResolvedTtsConfig, providerId: string, @@ -325,6 +376,8 @@ function collectDirectProviderConfigEntries(raw: TtsConfig): Record left.id.localeCompare(right.id)); +} + +export function setTtsPersona(prefsPath: string, persona: string | null | undefined): void { + updatePrefs(prefsPath, (prefs) => { + const next = { ...prefs.tts }; + const normalized = normalizeTtsPersonaId(persona); + next.persona = normalized ?? null; + prefs.tts = next; + }); +} + export function setTtsProvider(prefsPath: string, provider: TtsProvider): void { updatePrefs(prefsPath, (prefs) => { prefs.tts = { ...prefs.tts, provider: canonicalizeSpeechProviderId(provider) ?? provider }; @@ -714,17 +813,20 @@ function buildTtsFailureResult( errors: string[], attemptedProviders?: string[], attempts?: TtsProviderAttempt[], + persona?: string, ): { success: false; error: string; attemptedProviders?: string[]; attempts?: TtsProviderAttempt[]; + persona?: string; } { return { success: false, error: `TTS conversion failed: ${errors.join("; ") || "no providers available"}`, attemptedProviders, attempts, + persona, }; } @@ -733,17 +835,22 @@ type TtsProviderReadyResolution = kind: "ready"; provider: NonNullable>; providerConfig: SpeechProviderConfig; + personaProviderConfig?: SpeechProviderConfig; + synthesisPersona?: ResolvedTtsPersona; + personaBinding: "applied" | "missing" | "none"; } | { kind: "skip"; reasonCode: "no_provider_registered" | "not_configured" | "unsupported_for_telephony"; message: string; + personaBinding?: "missing"; }; function resolveReadySpeechProvider(params: { provider: TtsProvider; cfg: OpenClawConfig; config: ResolvedTtsConfig; + persona?: ResolvedTtsPersona; requireTelephony?: boolean; }): TtsProviderReadyResolution { const resolvedProvider = getSpeechProvider(params.provider, params.cfg); @@ -759,10 +866,23 @@ function resolveReadySpeechProvider(params: { resolvedProvider.id, params.cfg, ); + const merged = 
mergeProviderConfigWithPersona({ + providerConfig, + persona: params.persona, + providerId: resolvedProvider.id, + }); + if (params.persona?.fallbackPolicy === "fail" && merged.personaBinding === "missing") { + return { + kind: "skip", + reasonCode: "not_configured", + message: `${params.provider}: persona ${params.persona.id} has no provider binding`, + personaBinding: "missing", + }; + } if ( !resolvedProvider.isConfigured({ cfg: params.cfg, - providerConfig, + providerConfig: merged.providerConfig, timeoutMs: params.config.timeoutMs, }) ) { @@ -782,7 +902,56 @@ function resolveReadySpeechProvider(params: { return { kind: "ready", provider: resolvedProvider, - providerConfig, + providerConfig: merged.providerConfig, + personaProviderConfig: merged.personaProviderConfig, + synthesisPersona: + params.persona?.fallbackPolicy === "provider-defaults" && merged.personaBinding === "missing" + ? undefined + : params.persona, + personaBinding: merged.personaBinding, + }; +} + +async function prepareSpeechSynthesis(params: { + provider: NonNullable>; + text: string; + cfg: OpenClawConfig; + providerConfig: SpeechProviderConfig; + providerOverrides?: SpeechProviderOverrides; + persona?: ResolvedTtsPersona; + personaProviderConfig?: SpeechProviderConfig; + target: "audio-file" | "voice-note" | "telephony"; + timeoutMs: number; +}): Promise<{ + text: string; + providerConfig: SpeechProviderConfig; + providerOverrides?: SpeechProviderOverrides; +}> { + if (!params.provider.prepareSynthesis) { + return { + text: params.text, + providerConfig: params.providerConfig, + providerOverrides: params.providerOverrides, + }; + } + const prepared = await params.provider.prepareSynthesis({ + text: params.text, + cfg: params.cfg, + providerConfig: params.providerConfig, + providerOverrides: params.providerOverrides, + persona: params.persona, + personaProviderConfig: params.personaProviderConfig, + target: params.target, + timeoutMs: params.timeoutMs, + }); + return { + text: 
prepared?.text ?? params.text, + providerConfig: prepared?.providerConfig + ? { ...params.providerConfig, ...prepared.providerConfig } + : params.providerConfig, + providerOverrides: prepared?.providerOverrides + ? { ...params.providerOverrides, ...prepared.providerOverrides } + : params.providerOverrides, }; } @@ -796,6 +965,7 @@ function resolveTtsRequestSetup(params: { }): | { config: ResolvedTtsConfig; + persona?: ResolvedTtsPersona; providers: TtsProvider[]; } | { @@ -814,6 +984,7 @@ function resolveTtsRequestSetup(params: { canonicalizeSpeechProviderId(params.providerOverride, params.cfg) ?? userProvider; return { config, + persona: getTtsPersona(config, prefsPath), providers: params.disableFallback ? [provider] : resolveTtsProviderOrder(provider, params.cfg), }; } @@ -833,6 +1004,7 @@ export async function textToSpeech(params: { return { success: false, error: synthesis.error ?? "TTS conversion failed", + persona: synthesis.persona, attemptedProviders: synthesis.attemptedProviders, attempts: synthesis.attempts, }; @@ -850,6 +1022,7 @@ export async function textToSpeech(params: { audioPath, latencyMs: synthesis.latencyMs, provider: synthesis.provider, + persona: synthesis.persona, fallbackFrom: synthesis.fallbackFrom, attemptedProviders: synthesis.attemptedProviders, attempts: synthesis.attempts, @@ -886,7 +1059,7 @@ export async function synthesizeSpeech(params: { return { success: false, error: setup.error }; } - const { config, providers } = setup; + const { config, persona, providers } = setup; const timeoutMs = params.timeoutMs ?? config.timeoutMs; const target = supportsNativeVoiceNoteTts(params.channel) ? 
"voice-note" : "audio-file"; @@ -906,6 +1079,7 @@ export async function synthesizeSpeech(params: { provider, cfg: params.cfg, config, + persona, }); if (resolvedProvider.kind === "skip") { errors.push(resolvedProvider.message); @@ -913,17 +1087,32 @@ export async function synthesizeSpeech(params: { provider, outcome: "skipped", reasonCode: resolvedProvider.reasonCode, + persona: persona?.id, + ...(resolvedProvider.personaBinding + ? { personaBinding: resolvedProvider.personaBinding } + : {}), error: resolvedProvider.message, }); logVerbose(`TTS: provider ${provider} skipped (${resolvedProvider.message})`); continue; } - const synthesis = await resolvedProvider.provider.synthesize({ + const prepared = await prepareSpeechSynthesis({ + provider: resolvedProvider.provider, text: params.text, cfg: params.cfg, providerConfig: resolvedProvider.providerConfig, - target, providerOverrides: params.overrides?.providerOverrides?.[resolvedProvider.provider.id], + persona: resolvedProvider.synthesisPersona, + personaProviderConfig: resolvedProvider.personaProviderConfig, + target, + timeoutMs, + }); + const synthesis = await resolvedProvider.provider.synthesize({ + text: prepared.text, + cfg: params.cfg, + providerConfig: prepared.providerConfig, + target, + providerOverrides: prepared.providerOverrides, timeoutMs, }); const latencyMs = Date.now() - providerStart; @@ -931,6 +1120,8 @@ export async function synthesizeSpeech(params: { provider, outcome: "success", reasonCode: "success", + persona: persona?.id, + personaBinding: resolvedProvider.personaBinding, latencyMs, }); return { @@ -938,6 +1129,7 @@ export async function synthesizeSpeech(params: { audioBuffer: synthesis.audioBuffer, latencyMs, provider, + persona: persona?.id, fallbackFrom: provider !== primaryProvider ? primaryProvider : undefined, attemptedProviders, attempts, @@ -956,6 +1148,13 @@ export async function synthesizeSpeech(params: { reasonCode: err instanceof Error && err.name === "AbortError" ? 
"timeout" : "provider_error", latencyMs, + persona: persona?.id, + personaBinding: + resolvePersonaProviderConfig(persona, provider) != null + ? "applied" + : persona + ? "missing" + : "none", error: errorMsg, }); const rawError = sanitizeTtsErrorForLog(err); @@ -970,7 +1169,7 @@ export async function synthesizeSpeech(params: { } } - return buildTtsFailureResult(errors, attemptedProviders, attempts); + return buildTtsFailureResult(errors, attemptedProviders, attempts, persona?.id); } export async function textToSpeechTelephony(params: { @@ -987,7 +1186,7 @@ export async function textToSpeechTelephony(params: { return { success: false, error: setup.error }; } - const { config, providers } = setup; + const { config, persona, providers } = setup; const errors: string[] = []; const attemptedProviders: string[] = []; const attempts: TtsProviderAttempt[] = []; @@ -1004,6 +1203,7 @@ export async function textToSpeechTelephony(params: { provider, cfg: params.cfg, config, + persona, requireTelephony: true, }); if (resolvedProvider.kind === "skip") { @@ -1012,28 +1212,32 @@ export async function textToSpeechTelephony(params: { provider, outcome: "skipped", reasonCode: resolvedProvider.reasonCode, + persona: persona?.id, + ...(resolvedProvider.personaBinding + ? 
{ personaBinding: resolvedProvider.personaBinding } + : {}), error: resolvedProvider.message, }); logVerbose(`TTS telephony: provider ${provider} skipped (${resolvedProvider.message})`); continue; } - const synthesizeTelephony = resolvedProvider.provider.synthesizeTelephony; - if (!synthesizeTelephony) { - const message = `${provider}: unsupported for telephony`; - errors.push(message); - attempts.push({ - provider, - outcome: "skipped", - reasonCode: "unsupported_for_telephony", - error: message, - }); - logVerbose(`TTS telephony: provider ${provider} skipped (${message})`); - continue; - } - const synthesis = await synthesizeTelephony({ + const synthesizeTelephony = resolvedProvider.provider.synthesizeTelephony as NonNullable< + typeof resolvedProvider.provider.synthesizeTelephony + >; + const prepared = await prepareSpeechSynthesis({ + provider: resolvedProvider.provider, text: params.text, cfg: params.cfg, providerConfig: resolvedProvider.providerConfig, + persona: resolvedProvider.synthesisPersona, + personaProviderConfig: resolvedProvider.personaProviderConfig, + target: "telephony", + timeoutMs: config.timeoutMs, + }); + const synthesis = await synthesizeTelephony({ + text: prepared.text, + cfg: params.cfg, + providerConfig: prepared.providerConfig, timeoutMs: config.timeoutMs, }); const latencyMs = Date.now() - providerStart; @@ -1041,6 +1245,8 @@ export async function textToSpeechTelephony(params: { provider, outcome: "success", reasonCode: "success", + persona: persona?.id, + personaBinding: resolvedProvider.personaBinding, latencyMs, }); @@ -1049,6 +1255,7 @@ export async function textToSpeechTelephony(params: { audioBuffer: synthesis.audioBuffer, latencyMs, provider, + persona: persona?.id, fallbackFrom: provider !== primaryProvider ? primaryProvider : undefined, attemptedProviders, attempts, @@ -1065,6 +1272,13 @@ export async function textToSpeechTelephony(params: { reasonCode: err instanceof Error && err.name === "AbortError" ? 
"timeout" : "provider_error", latencyMs, + persona: persona?.id, + personaBinding: + resolvePersonaProviderConfig(persona, provider) != null + ? "applied" + : persona + ? "missing" + : "none", error: errorMsg, }); const rawError = sanitizeTtsErrorForLog(err); @@ -1079,7 +1293,7 @@ export async function textToSpeechTelephony(params: { } } - return buildTtsFailureResult(errors, attemptedProviders, attempts); + return buildTtsFailureResult(errors, attemptedProviders, attempts, persona?.id); } export async function listSpeechVoices(params: { @@ -1250,6 +1464,7 @@ export async function maybeApplyTtsToPayload(params: { textLength: text.length, summarized: wasSummarized, provider: result.provider, + persona: result.persona, fallbackFrom: result.fallbackFrom, attemptedProviders: result.attemptedProviders, attempts: result.attempts, @@ -1268,6 +1483,7 @@ export async function maybeApplyTtsToPayload(params: { success: false, textLength: text.length, summarized: wasSummarized, + persona: result.persona, attemptedProviders: result.attemptedProviders, attempts: result.attempts, error: result.error, diff --git a/extensions/xai/speech-provider.ts b/extensions/xai/speech-provider.ts index 79fe3e6261d..9e5903007b2 100644 --- a/extensions/xai/speech-provider.ts +++ b/extensions/xai/speech-provider.ts @@ -6,6 +6,7 @@ import { type SpeechProviderConfig, type SpeechProviderOverrides, type SpeechProviderPlugin, + type SpeechSynthesisTarget, } from "openclaw/plugin-sdk/speech"; import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtime"; import { @@ -48,7 +49,7 @@ function normalizeXaiSpeechResponseFormat(value: unknown): XaiSpeechResponseForm } function resolveSpeechResponseFormat( - target: "audio-file" | "voice-note", + target: SpeechSynthesisTarget, configuredFormat?: XaiSpeechResponseFormat, ): XaiSpeechResponseFormat { if (configuredFormat) { diff --git a/src/auto-reply/reply/commands-tts.test.ts b/src/auto-reply/reply/commands-tts.test.ts index 
0fb6df3ffa4..611b6956d1b 100644 --- a/src/auto-reply/reply/commands-tts.test.ts +++ b/src/auto-reply/reply/commands-tts.test.ts @@ -9,16 +9,19 @@ const ttsMocks = vi.hoisted(() => ({ getResolvedSpeechProviderConfig: vi.fn(), getLastTtsAttempt: vi.fn(), getTtsMaxLength: vi.fn(), + getTtsPersona: vi.fn(), getTtsProvider: vi.fn(), isSummarizationEnabled: vi.fn(), isTtsEnabled: vi.fn(), isTtsProviderConfigured: vi.fn(), + listTtsPersonas: vi.fn(), resolveTtsConfig: vi.fn(), resolveTtsPrefsPath: vi.fn(), setLastTtsAttempt: vi.fn(), setSummarizationEnabled: vi.fn(), setTtsEnabled: vi.fn(), setTtsMaxLength: vi.fn(), + setTtsPersona: vi.fn(), setTtsProvider: vi.fn(), textToSpeech: vi.fn(), })); @@ -66,10 +69,12 @@ describe("handleTtsCommands status fallback reporting", () => { ttsMocks.resolveTtsPrefsPath.mockReturnValue("/tmp/tts-prefs.json"); ttsMocks.isTtsEnabled.mockReturnValue(true); ttsMocks.getTtsProvider.mockReturnValue(PRIMARY_TTS_PROVIDER); + ttsMocks.getTtsPersona.mockReturnValue(undefined); ttsMocks.isTtsProviderConfigured.mockReturnValue(true); ttsMocks.getTtsMaxLength.mockReturnValue(1500); ttsMocks.isSummarizationEnabled.mockReturnValue(true); ttsMocks.getLastTtsAttempt.mockReturnValue(undefined); + ttsMocks.listTtsPersonas.mockReturnValue([]); }); it("shows fallback provider details for successful attempts", async () => { @@ -234,6 +239,24 @@ describe("handleTtsCommands status fallback reporting", () => { ); }); + it("lists and sets configured TTS personas", async () => { + ttsMocks.listTtsPersonas.mockReturnValue([ + { + id: "alfred", + label: "Alfred", + provider: "google", + }, + ]); + + const listResult = await handleTtsCommands(buildTtsParams("/tts persona"), true); + expect(listResult?.shouldContinue).toBe(false); + expect(listResult?.reply?.text).toContain("alfred (Alfred) provider=google"); + + const setResult = await handleTtsCommands(buildTtsParams("/tts persona alfred"), true); + expect(setResult?.shouldContinue).toBe(false); + 
expect(ttsMocks.setTtsPersona).toHaveBeenCalledWith("/tmp/tts-prefs.json", "alfred"); + }); + it("reads the latest assistant transcript reply once", async () => { const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-tts-latest-")); const sessionFile = path.join(tempDir, "session.jsonl"); diff --git a/src/auto-reply/reply/commands-tts.ts b/src/auto-reply/reply/commands-tts.ts index 397a902aaba..bc37a91e27c 100644 --- a/src/auto-reply/reply/commands-tts.ts +++ b/src/auto-reply/reply/commands-tts.ts @@ -14,16 +14,19 @@ import { getResolvedSpeechProviderConfig, getLastTtsAttempt, getTtsMaxLength, + getTtsPersona, getTtsProvider, isSummarizationEnabled, isTtsEnabled, isTtsProviderConfigured, + listTtsPersonas, resolveTtsConfig, resolveTtsPrefsPath, setLastTtsAttempt, setSummarizationEnabled, setTtsEnabled, setTtsMaxLength, + setTtsPersona, setTtsProvider, textToSpeech, } from "../../tts/tts.js"; @@ -68,7 +71,11 @@ function formatAttemptDetails(attempts: TtsAttemptDetail[] | undefined): string .map((attempt) => { const reason = attempt.reasonCode === "success" ? "ok" : attempt.reasonCode; const latency = Number.isFinite(attempt.latencyMs) ? ` ${attempt.latencyMs}ms` : ""; - return `${attempt.provider}:${attempt.outcome}(${reason})${latency}`; + const persona = + attempt.persona && attempt.personaBinding && attempt.personaBinding !== "none" + ? 
` persona=${attempt.persona}:${attempt.personaBinding}` + : ""; + return `${attempt.provider}:${attempt.outcome}(${reason})${persona}${latency}`; }) .join(", "); } @@ -83,6 +90,7 @@ function ttsUsage(): ReplyPayload { `• /tts off — Disable TTS\n` + `• /tts status — Show current settings\n` + `• /tts provider [name] — View/change provider\n` + + `• /tts persona [id|off] — View/change persona\n` + `• /tts limit [number] — View/change text limit\n` + `• /tts summary [on|off] — View/change auto-summary\n` + `• /tts audio — Generate audio from text\n` + @@ -96,6 +104,7 @@ function ttsUsage(): ReplyPayload { `• Summary OFF: Truncates text, then generates audio\n\n` + `**Examples:**\n` + `/tts provider \n` + + `/tts persona \n` + `/tts limit 2000\n` + `/tts latest\n` + `/tts audio Hello, this is a test!`, @@ -129,6 +138,7 @@ async function buildTtsAudioReply(params: { textLength: params.text.length, summarized: false, provider: result.provider, + persona: result.persona, fallbackFrom: result.fallbackFrom, attemptedProviders: result.attemptedProviders, attempts: result.attempts, @@ -150,6 +160,7 @@ async function buildTtsAudioReply(params: { success: false, textLength: params.text.length, summarized: false, + persona: result.persona, attemptedProviders: result.attemptedProviders, attempts: result.attempts, error: result.error, @@ -349,6 +360,50 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand }; } + if (action === "persona") { + const personas = listTtsPersonas(config); + const activePersona = getTtsPersona(config, prefsPath); + if (!args.trim()) { + const lines = [ + "🎭 TTS persona", + `Active: ${activePersona?.id ?? "none"}`, + personas.length > 0 + ? personas + .map((persona) => { + const label = persona.label ? ` (${persona.label})` : ""; + const provider = persona.provider ? 
` provider=${persona.provider}` : ""; + return `${persona.id}${label}${provider}`; + }) + .join("\n") + : "No personas configured.", + "Usage: /tts persona | off", + ]; + return { shouldContinue: false, reply: { text: lines.join("\n") } }; + } + + const requested = normalizeOptionalLowercaseString(args) ?? ""; + if (requested === "off" || requested === "none" || requested === "default") { + setTtsPersona(prefsPath, null); + return { shouldContinue: false, reply: { text: "✅ TTS persona disabled." } }; + } + const persona = personas.find((entry) => entry.id === requested); + if (!persona) { + return { + shouldContinue: false, + reply: { + text: + `❌ Unknown TTS persona: ${requested || args}.\n` + + `Use /tts persona to list configured personas.`, + }, + }; + } + setTtsPersona(prefsPath, persona.id); + return { + shouldContinue: false, + reply: { text: `✅ TTS persona set to ${persona.id}.` }, + }; + } + if (action === "limit") { if (!args.trim()) { const currentLimit = getTtsMaxLength(prefsPath); @@ -410,6 +465,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand if (action === "status") { const enabled = isTtsEnabled(config, prefsPath); const provider = getTtsProvider(config, prefsPath); + const persona = getTtsPersona(config, prefsPath); const hasKey = isTtsProviderConfigured(config, provider, params.cfg); const maxLength = getTtsMaxLength(prefsPath); const summarize = isSummarizationEnabled(prefsPath); @@ -419,6 +475,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand `State: ${enabled ? "✅ enabled" : "❌ disabled"}`, `Chat override: ${params.sessionEntry?.ttsAuto ?? "default"}`, `Provider: ${provider} (${hasKey ? "✅ configured" : "❌ not configured"})`, + `Persona: ${persona?.id ?? "none"}`, `Text limit: ${maxLength} chars`, `Auto-summary: ${summarize ? 
"on" : "off"}`, ]; @@ -429,6 +486,9 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand lines.push(`Text: ${last.textLength} chars${last.summarized ? " (summarized)" : ""}`); if (last.success) { lines.push(`Provider: ${last.provider ?? "unknown"}`); + if (last.persona) { + lines.push(`Persona: ${last.persona}`); + } if (last.fallbackFrom && last.provider && last.fallbackFrom !== last.provider) { lines.push(`Fallback: ${last.fallbackFrom} -> ${last.provider}`); } diff --git a/src/cli/capability-cli.test.ts b/src/cli/capability-cli.test.ts index ddf6ba3b23e..98b569e8016 100644 --- a/src/cli/capability-cli.test.ts +++ b/src/cli/capability-cli.test.ts @@ -73,6 +73,7 @@ const mocks = vi.hoisted(() => ({ attempts: [], })), setTtsProvider: vi.fn(), + setTtsPersona: vi.fn(), resolveExplicitTtsOverrides: vi.fn( ({ provider, @@ -220,11 +221,14 @@ vi.mock("../video-generation/runtime.js", () => ({ })); vi.mock("../tts/tts.js", () => ({ + getTtsPersona: vi.fn(() => undefined), getTtsProvider: vi.fn(() => "openai"), + listTtsPersonas: vi.fn(() => []), listSpeechVoices: vi.fn(async () => []), resolveTtsConfig: vi.fn(() => ({})), resolveTtsPrefsPath: vi.fn(() => "/tmp/tts.json"), setTtsEnabled: vi.fn(), + setTtsPersona: mocks.setTtsPersona as typeof import("../tts/tts.js").setTtsPersona, setTtsProvider: mocks.setTtsProvider as typeof import("../tts/tts.js").setTtsProvider, resolveExplicitTtsOverrides: mocks.resolveExplicitTtsOverrides as typeof import("../tts/tts.js").resolveExplicitTtsOverrides, diff --git a/src/cli/capability-cli.ts b/src/cli/capability-cli.ts index acc84c89d3e..24b3e882d56 100644 --- a/src/cli/capability-cli.ts +++ b/src/cli/capability-cli.ts @@ -56,11 +56,14 @@ import { theme } from "../terminal/theme.js"; import { canonicalizeSpeechProviderId, listSpeechProviders } from "../tts/provider-registry.js"; import { getTtsProvider, + getTtsPersona, + listTtsPersonas, listSpeechVoices, resolveExplicitTtsOverrides, resolveTtsConfig, 
resolveTtsPrefsPath, setTtsEnabled, + setTtsPersona, setTtsProvider, textToSpeech, } from "../tts/tts.js"; @@ -256,6 +259,13 @@ const CAPABILITY_METADATA: CapabilityMetadata[] = [ flags: ["--local", "--gateway", "--json"], resultShape: "provider ids, configured state, models, voices", }, + { + id: "tts.personas", + description: "List TTS personas.", + transports: ["local", "gateway"], + flags: ["--local", "--gateway", "--json"], + resultShape: "persona ids, labels, providers, active persona", + }, { id: "tts.status", description: "Show gateway-managed TTS state.", @@ -284,6 +294,13 @@ const CAPABILITY_METADATA: CapabilityMetadata[] = [ flags: ["--provider", "--local", "--gateway", "--json"], resultShape: "selected provider", }, + { + id: "tts.set-persona", + description: "Set the active TTS persona.", + transports: ["local", "gateway"], + flags: ["--persona", "--off", "--local", "--gateway", "--json"], + resultShape: "selected persona", + }, { id: "video.generate", description: "Generate video files with configured video providers.", @@ -1181,6 +1198,30 @@ async function runTtsProviders(transport: CapabilityTransport) { }; } +async function runTtsPersonas(transport: CapabilityTransport) { + if (transport === "gateway") { + return await callGateway({ + method: "tts.personas", + timeoutMs: 30_000, + }); + } + const cfg = loadConfig(); + const config = resolveTtsConfig(cfg); + const prefsPath = resolveTtsPrefsPath(config); + const active = getTtsPersona(config, prefsPath); + return { + active: active?.id ?? null, + personas: listTtsPersonas(config).map((persona) => ({ + id: persona.id, + label: persona.label, + description: persona.description, + provider: persona.provider, + fallbackPolicy: persona.fallbackPolicy, + providers: Object.keys(persona.providers ?? 
{}), + })), + }; +} + async function runTtsVoices(providerRaw?: string) { const cfg = loadConfig(); const config = resolveTtsConfig(cfg); @@ -1194,9 +1235,10 @@ async function runTtsVoices(providerRaw?: string) { } async function runTtsStateMutation(params: { - capability: "tts.enable" | "tts.disable" | "tts.set-provider"; + capability: "tts.enable" | "tts.disable" | "tts.set-provider" | "tts.set-persona"; transport: CapabilityTransport; provider?: string; + persona?: string | null; }) { if (params.transport === "gateway") { const method = @@ -1204,10 +1246,17 @@ async function runTtsStateMutation(params: { ? "tts.enable" : params.capability === "tts.disable" ? "tts.disable" - : "tts.setProvider"; + : params.capability === "tts.set-provider" + ? "tts.setProvider" + : "tts.setPersona"; const payload = await callGateway({ method, - params: params.provider ? { provider: params.provider } : undefined, + params: + params.capability === "tts.set-provider" + ? { provider: params.provider } + : params.capability === "tts.set-persona" + ? { persona: params.persona ?? "off" } + : undefined, timeoutMs: 30_000, }); return payload; @@ -1224,6 +1273,20 @@ async function runTtsStateMutation(params: { setTtsEnabled(prefsPath, false); return { enabled: false }; } + if (params.capability === "tts.set-persona") { + if (!params.persona) { + setTtsPersona(prefsPath, null); + return { persona: null }; + } + const persona = listTtsPersonas(config).find( + (entry) => entry.id === normalizeLowercaseStringOrEmpty(params.persona ?? 
""), + ); + if (!persona) { + throw new Error(`Unknown TTS persona: ${params.persona}`); + } + setTtsPersona(prefsPath, persona.id); + return { persona: persona.id }; + } if (!params.provider) { throw new Error("--provider is required"); } @@ -1746,6 +1809,27 @@ export function registerCapabilityCli(program: Command) { }); }); + tts + .command("personas") + .description("List TTS personas") + .option("--local", "Force local execution", false) + .option("--gateway", "Force gateway execution", false) + .option("--json", "Output JSON", false) + .action(async (opts) => { + await runCommandWithRuntime(defaultRuntime, async () => { + const transport = resolveTransport({ + local: Boolean(opts.local), + gateway: Boolean(opts.gateway), + supported: ["local", "gateway"], + defaultTransport: "local", + }); + const result = await runTtsPersonas(transport); + emitJsonOrText(defaultRuntime, Boolean(opts.json), result, (value) => + JSON.stringify(value, null, 2), + ); + }); + }); + tts .command("status") .description("Show TTS status") @@ -1823,6 +1907,36 @@ export function registerCapabilityCli(program: Command) { }); }); + tts + .command("set-persona") + .description("Set the active TTS persona") + .option("--persona ", "TTS persona id") + .option("--off", "Disable the active TTS persona", false) + .option("--local", "Force local execution", false) + .option("--gateway", "Force gateway execution", false) + .option("--json", "Output JSON", false) + .action(async (opts) => { + await runCommandWithRuntime(defaultRuntime, async () => { + const transport = resolveTransport({ + local: Boolean(opts.local), + gateway: Boolean(opts.gateway), + supported: ["local", "gateway"], + defaultTransport: "gateway", + }); + if (!opts.off && !opts.persona) { + throw new Error("--persona is required unless --off is set"); + } + const result = await runTtsStateMutation({ + capability: "tts.set-persona", + persona: opts.off ? 
null : String(opts.persona), + transport, + }); + emitJsonOrText(defaultRuntime, Boolean(opts.json), result, (value) => + JSON.stringify(value, null, 2), + ); + }); + }); + const video = capability.command("video").description("Video generation and description"); video diff --git a/src/config/schema.base.generated.ts b/src/config/schema.base.generated.ts index 5721ac38ac4..7b1d856956f 100644 --- a/src/config/schema.base.generated.ts +++ b/src/config/schema.base.generated.ts @@ -19116,6 +19116,222 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = { type: "string", minLength: 1, }, + persona: { + type: "string", + title: "TTS Persona", + description: + "Default TTS persona id. Local TTS persona preferences can override this per host.", + }, + personas: { + type: "object", + propertyNames: { + type: "string", + }, + additionalProperties: { + type: "object", + properties: { + label: { + type: "string", + }, + description: { + type: "string", + }, + provider: { + type: "string", + minLength: 1, + }, + fallbackPolicy: { + anyOf: [ + { + type: "string", + const: "preserve-persona", + }, + { + type: "string", + const: "provider-defaults", + }, + { + type: "string", + const: "fail", + }, + ], + }, + prompt: { + type: "object", + properties: { + profile: { + type: "string", + }, + scene: { + type: "string", + }, + sampleContext: { + type: "string", + }, + style: { + type: "string", + }, + accent: { + type: "string", + }, + pacing: { + type: "string", + }, + constraints: { + type: "array", + items: { + type: "string", + }, + }, + }, + additionalProperties: false, + title: "TTS Persona Prompt", + description: + "Provider-neutral persona prompt intent. 
Providers decide whether and how to map this into request instructions.", + }, + rewrite: { + type: "object", + properties: { + enabled: { + type: "boolean", + }, + model: { + type: "string", + }, + preserveMeaning: { + type: "boolean", + }, + compressForSpeech: { + type: "boolean", + }, + inCharacter: { + type: "boolean", + }, + maxChars: { + type: "integer", + minimum: 1, + maximum: 9007199254740991, + }, + }, + additionalProperties: false, + }, + providers: { + type: "object", + propertyNames: { + type: "string", + }, + additionalProperties: { + type: "object", + properties: { + apiKey: { + anyOf: [ + { + type: "string", + }, + { + oneOf: [ + { + type: "object", + properties: { + source: { + type: "string", + const: "env", + }, + provider: { + type: "string", + pattern: "^[a-z][a-z0-9_-]{0,63}$", + }, + id: { + type: "string", + pattern: "^[A-Z][A-Z0-9_]{0,127}$", + }, + }, + required: ["source", "provider", "id"], + additionalProperties: false, + }, + { + type: "object", + properties: { + source: { + type: "string", + const: "file", + }, + provider: { + type: "string", + pattern: "^[a-z][a-z0-9_-]{0,63}$", + }, + id: { + type: "string", + }, + }, + required: ["source", "provider", "id"], + additionalProperties: false, + }, + { + type: "object", + properties: { + source: { + type: "string", + const: "exec", + }, + provider: { + type: "string", + pattern: "^[a-z][a-z0-9_-]{0,63}$", + }, + id: { + type: "string", + }, + }, + required: ["source", "provider", "id"], + additionalProperties: false, + }, + ], + }, + ], + }, + }, + additionalProperties: { + anyOf: [ + { + type: "string", + }, + { + type: "number", + }, + { + type: "boolean", + }, + { + type: "null", + }, + { + type: "array", + items: {}, + }, + { + type: "object", + propertyNames: { + type: "string", + }, + additionalProperties: {}, + }, + ], + }, + }, + title: "TTS Persona Provider Bindings", + description: + "Provider-specific TTS persona bindings keyed by speech provider id. 
These merge over messages.tts.providers for the active persona.", + }, + }, + additionalProperties: false, + title: "TTS Persona", + description: + "One TTS persona. Use provider-specific bindings for exact voices/models and prompt templates.", + }, + title: "TTS Personas", + description: + "Named TTS personas that define stable spoken identity plus provider-specific speech bindings.", + }, summaryModel: { type: "string", }, @@ -27520,6 +27736,31 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = { help: "Text-to-speech policy for reading agent replies aloud on supported voice or audio surfaces. Keep disabled unless voice playback is part of your operator/user workflow.", tags: ["media"], }, + "messages.tts.persona": { + label: "TTS Persona", + help: "Default TTS persona id. Local TTS persona preferences can override this per host.", + tags: ["media"], + }, + "messages.tts.personas": { + label: "TTS Personas", + help: "Named TTS personas that define stable spoken identity plus provider-specific speech bindings.", + tags: ["media"], + }, + "messages.tts.personas.*": { + label: "TTS Persona", + help: "One TTS persona. Use provider-specific bindings for exact voices/models and prompt templates.", + tags: ["media"], + }, + "messages.tts.personas.*.prompt": { + label: "TTS Persona Prompt", + help: "Provider-neutral persona prompt intent. Providers decide whether and how to map this into request instructions.", + tags: ["media"], + }, + "messages.tts.personas.*.providers": { + label: "TTS Persona Provider Bindings", + help: "Provider-specific TTS persona bindings keyed by speech provider id. These merge over messages.tts.providers for the active persona.", + tags: ["media"], + }, "messages.tts.providers": { label: "TTS Provider Settings", help: "Provider-specific TTS settings keyed by speech provider id. 
Use this instead of bundled provider-specific top-level keys so speech plugins stay decoupled from core config schema.", @@ -28081,6 +28322,10 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = { sensitive: true, tags: ["security", "media", "tools"], }, + "messages.tts.personas.*.providers.*.apiKey": { + sensitive: true, + tags: ["security", "auth", "media"], + }, "mcp.servers.*.headers.*": { sensitive: true, tags: ["security"], diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts index 6bae7757fc8..34ce2c75946 100644 --- a/src/config/schema.help.ts +++ b/src/config/schema.help.ts @@ -1589,6 +1589,16 @@ export const FIELD_HELP: Record = { "Removes the acknowledgment reaction after final reply delivery when enabled. Keep enabled for cleaner UX in channels where persistent ack reactions create clutter.", "messages.tts": "Text-to-speech policy for reading agent replies aloud on supported voice or audio surfaces. Keep disabled unless voice playback is part of your operator/user workflow.", + "messages.tts.persona": + "Default TTS persona id. Local TTS persona preferences can override this per host.", + "messages.tts.personas": + "Named TTS personas that define stable spoken identity plus provider-specific speech bindings.", + "messages.tts.personas.*": + "One TTS persona. Use provider-specific bindings for exact voices/models and prompt templates.", + "messages.tts.personas.*.prompt": + "Provider-neutral persona prompt intent. Providers decide whether and how to map this into request instructions.", + "messages.tts.personas.*.providers": + "Provider-specific TTS persona bindings keyed by speech provider id. These merge over messages.tts.providers for the active persona.", "messages.tts.providers": "Provider-specific TTS settings keyed by speech provider id. 
Use this instead of bundled provider-specific top-level keys so speech plugins stay decoupled from core config schema.", "messages.tts.providers.*": diff --git a/src/config/schema.labels.ts b/src/config/schema.labels.ts index fe5eb67d76c..ea3c805a066 100644 --- a/src/config/schema.labels.ts +++ b/src/config/schema.labels.ts @@ -820,6 +820,11 @@ export const FIELD_LABELS: Record = { "messages.inbound.debounceMs": "Inbound Message Debounce (ms)", "messages.inbound.byChannel": "Inbound Debounce by Channel (ms)", "messages.tts": "Message Text-to-Speech", + "messages.tts.persona": "TTS Persona", + "messages.tts.personas": "TTS Personas", + "messages.tts.personas.*": "TTS Persona", + "messages.tts.personas.*.prompt": "TTS Persona Prompt", + "messages.tts.personas.*.providers": "TTS Persona Provider Bindings", "messages.tts.providers": "TTS Provider Settings", "messages.tts.providers.*": "TTS Provider Config", "messages.tts.providers.*.apiKey": "TTS Provider API Key", // pragma: allowlist secret diff --git a/src/config/types.tts.ts b/src/config/types.tts.ts index 4c9a506cf99..40192979e6c 100644 --- a/src/config/types.tts.ts +++ b/src/config/types.tts.ts @@ -25,6 +25,43 @@ export type TtsModelOverrideConfig = { export type TtsProviderConfigMap = Record>; +export type TtsPersonaFallbackPolicy = "preserve-persona" | "provider-defaults" | "fail"; + +export type TtsPersonaPromptConfig = { + profile?: string; + scene?: string; + sampleContext?: string; + style?: string; + accent?: string; + pacing?: string; + constraints?: string[]; +}; + +export type TtsPersonaRewriteConfig = { + enabled?: boolean; + model?: string; + preserveMeaning?: boolean; + compressForSpeech?: boolean; + inCharacter?: boolean; + maxChars?: number; +}; + +export type TtsPersonaConfig = { + label?: string; + description?: string; + /** Preferred provider for this persona. Explicit provider prefs still win. 
*/ + provider?: TtsProvider; + fallbackPolicy?: TtsPersonaFallbackPolicy; + prompt?: TtsPersonaPromptConfig; + rewrite?: TtsPersonaRewriteConfig; + /** Provider-specific persona bindings keyed by speech provider id. */ + providers?: TtsProviderConfigMap; +}; + +export type ResolvedTtsPersona = TtsPersonaConfig & { + id: string; +}; + export type TtsConfig = { /** Auto-TTS mode (preferred). */ auto?: TtsAutoMode; @@ -34,6 +71,10 @@ export type TtsConfig = { mode?: TtsMode; /** Primary TTS provider (fallbacks are automatic). */ provider?: TtsProvider; + /** Active TTS persona id. */ + persona?: string; + /** Named TTS personas. */ + personas?: Record; /** Optional model override for TTS auto-summary (provider/model or alias). */ summaryModel?: string; /** Allow the model to override TTS parameters. */ diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts index 36bbf4346e5..78f5ef1561f 100644 --- a/src/config/zod-schema.core.ts +++ b/src/config/zod-schema.core.ts @@ -497,12 +497,48 @@ const TtsProviderConfigSchema = z z.record(z.string(), z.unknown()), ]), ); +const TtsPersonaPromptSchema = z + .object({ + profile: z.string().optional(), + scene: z.string().optional(), + sampleContext: z.string().optional(), + style: z.string().optional(), + accent: z.string().optional(), + pacing: z.string().optional(), + constraints: z.array(z.string()).optional(), + }) + .strict(); +const TtsPersonaRewriteSchema = z + .object({ + enabled: z.boolean().optional(), + model: z.string().optional(), + preserveMeaning: z.boolean().optional(), + compressForSpeech: z.boolean().optional(), + inCharacter: z.boolean().optional(), + maxChars: z.number().int().min(1).optional(), + }) + .strict(); +const TtsPersonaSchema = z + .object({ + label: z.string().optional(), + description: z.string().optional(), + provider: TtsProviderSchema.optional(), + fallbackPolicy: z + .union([z.literal("preserve-persona"), z.literal("provider-defaults"), z.literal("fail")]) + .optional(), + 
prompt: TtsPersonaPromptSchema.optional(), + rewrite: TtsPersonaRewriteSchema.optional(), + providers: z.record(z.string(), TtsProviderConfigSchema).optional(), + }) + .strict(); export const TtsConfigSchema = z .object({ auto: TtsAutoSchema.optional(), enabled: z.boolean().optional(), mode: TtsModeSchema.optional(), provider: TtsProviderSchema.optional(), + persona: z.string().optional(), + personas: z.record(z.string(), TtsPersonaSchema).optional(), summaryModel: z.string().optional(), modelOverrides: z .object({ diff --git a/src/config/zod-schema.tts.test.ts b/src/config/zod-schema.tts.test.ts index 1e8840a6763..99c54df6fe7 100644 --- a/src/config/zod-schema.tts.test.ts +++ b/src/config/zod-schema.tts.test.ts @@ -39,4 +39,47 @@ describe("TtsConfigSchema openai speed and instructions", () => { }), ).not.toThrow(); }); + + it("accepts provider-specific persona bindings and structured prompt fields", () => { + expect(() => + TtsConfigSchema.parse({ + persona: "alfred", + personas: { + alfred: { + label: "Alfred", + description: "Dry, warm British butler narrator.", + provider: "google", + fallbackPolicy: "preserve-persona", + prompt: { + profile: "A brilliant British butler.", + scene: "A quiet late-night study.", + sampleContext: "The speaker is answering a trusted operator.", + style: "Refined and lightly amused.", + accent: "British English.", + pacing: "Measured.", + constraints: ["Do not read configuration values aloud."], + }, + rewrite: { + enabled: false, + preserveMeaning: true, + compressForSpeech: true, + maxChars: 1500, + }, + providers: { + google: { + model: "gemini-3.1-flash-tts-preview", + voiceName: "Algieba", + promptTemplate: "audio-profile-v1", + }, + openai: { + model: "gpt-4o-mini-tts", + voice: "cedar", + instructions: "Speak with dry warmth.", + }, + }, + }, + }, + }), + ).not.toThrow(); + }); }); diff --git a/src/gateway/method-scopes.ts b/src/gateway/method-scopes.ts index 8cd32af9d32..cb0e93e429d 100644 --- a/src/gateway/method-scopes.ts 
+++ b/src/gateway/method-scopes.ts @@ -78,6 +78,7 @@ const METHOD_SCOPE_GROUPS: Record = { "usage.cost", "tts.status", "tts.providers", + "tts.personas", "commands.list", "models.list", "models.authStatus", @@ -131,6 +132,7 @@ const METHOD_SCOPE_GROUPS: Record = { "tts.disable", "tts.convert", "tts.setProvider", + "tts.setPersona", "voicewake.set", "node.invoke", "chat.send", diff --git a/src/gateway/server-methods-list.ts b/src/gateway/server-methods-list.ts index 352ca7bcd29..5ac65f2d74b 100644 --- a/src/gateway/server-methods-list.ts +++ b/src/gateway/server-methods-list.ts @@ -20,10 +20,12 @@ const BASE_METHODS = [ "usage.cost", "tts.status", "tts.providers", + "tts.personas", "tts.enable", "tts.disable", "tts.convert", "tts.setProvider", + "tts.setPersona", "config.get", "config.set", "config.apply", diff --git a/src/gateway/server-methods/tts.test.ts b/src/gateway/server-methods/tts.test.ts index f3998f6c81c..04fb44dbdfd 100644 --- a/src/gateway/server-methods/tts.test.ts +++ b/src/gateway/server-methods/tts.test.ts @@ -25,9 +25,11 @@ vi.mock("../../tts/provider-registry.js", () => ({ vi.mock("../../tts/tts.js", () => ({ getResolvedSpeechProviderConfig: vi.fn(), + getTtsPersona: vi.fn(() => undefined), getTtsProvider: vi.fn(() => "openai"), isTtsEnabled: vi.fn(() => true), isTtsProviderConfigured: vi.fn(() => true), + listTtsPersonas: vi.fn(() => []), resolveExplicitTtsOverrides: mocks.resolveExplicitTtsOverrides as typeof import("../../tts/tts.js").resolveExplicitTtsOverrides, resolveTtsAutoMode: vi.fn(() => false), @@ -35,6 +37,7 @@ vi.mock("../../tts/tts.js", () => ({ resolveTtsPrefsPath: vi.fn(() => "/tmp/tts.json"), resolveTtsProviderOrder: vi.fn(() => ["openai"]), setTtsEnabled: vi.fn(), + setTtsPersona: vi.fn(), setTtsProvider: vi.fn(), textToSpeech: mocks.textToSpeech as typeof import("../../tts/tts.js").textToSpeech, })); diff --git a/src/gateway/server-methods/tts.ts b/src/gateway/server-methods/tts.ts index e718ff0061e..8eb2d47ea4a 100644 --- 
a/src/gateway/server-methods/tts.ts +++ b/src/gateway/server-methods/tts.ts @@ -7,15 +7,18 @@ import { } from "../../tts/provider-registry.js"; import { getResolvedSpeechProviderConfig, + getTtsPersona, getTtsProvider, isTtsEnabled, isTtsProviderConfigured, + listTtsPersonas, resolveExplicitTtsOverrides, resolveTtsAutoMode, resolveTtsConfig, resolveTtsPrefsPath, resolveTtsProviderOrder, setTtsEnabled, + setTtsPersona, setTtsProvider, textToSpeech, } from "../../tts/tts.js"; @@ -30,6 +33,7 @@ export const ttsHandlers: GatewayRequestHandlers = { const config = resolveTtsConfig(cfg); const prefsPath = resolveTtsPrefsPath(config); const provider = getTtsProvider(config, prefsPath); + const persona = getTtsPersona(config, prefsPath); const autoMode = resolveTtsAutoMode({ config, prefsPath }); const fallbackProviders = resolveTtsProviderOrder(provider, cfg) .slice(1) @@ -47,6 +51,13 @@ export const ttsHandlers: GatewayRequestHandlers = { enabled: isTtsEnabled(config, prefsPath), auto: autoMode, provider, + persona: persona?.id ?? null, + personas: listTtsPersonas(config).map((entry) => ({ + id: entry.id, + label: entry.label, + description: entry.description, + provider: entry.provider, + })), fallbackProvider: fallbackProviders[0] ?? null, fallbackProviders, prefsPath, @@ -157,6 +168,58 @@ export const ttsHandlers: GatewayRequestHandlers = { respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); } }, + "tts.personas": async ({ respond }) => { + try { + const cfg = loadConfig(); + const config = resolveTtsConfig(cfg); + const prefsPath = resolveTtsPrefsPath(config); + const active = getTtsPersona(config, prefsPath); + respond(true, { + active: active?.id ?? null, + personas: listTtsPersonas(config).map((persona) => ({ + id: persona.id, + label: persona.label, + description: persona.description, + provider: persona.provider, + fallbackPolicy: persona.fallbackPolicy, + providers: Object.keys(persona.providers ?? 
{}), + })), + }); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, + "tts.setPersona": async ({ params, respond }) => { + const cfg = loadConfig(); + const rawPersona = normalizeOptionalString(params.persona); + try { + const config = resolveTtsConfig(cfg); + const prefsPath = resolveTtsPrefsPath(config); + if (!rawPersona || ["off", "none", "default"].includes(rawPersona.toLowerCase())) { + setTtsPersona(prefsPath, null); + respond(true, { persona: null }); + return; + } + const persona = listTtsPersonas(config).find( + (entry) => entry.id === rawPersona.toLowerCase(), + ); + if (!persona) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + "Invalid persona. Use a configured TTS persona id.", + ), + ); + return; + } + setTtsPersona(prefsPath, persona.id); + respond(true, { persona: persona.id }); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, "tts.providers": async ({ respond }) => { try { const cfg = loadConfig(); diff --git a/src/plugin-sdk/config-runtime.ts b/src/plugin-sdk/config-runtime.ts index 5fccecbaab3..8d975ae0a6c 100644 --- a/src/plugin-sdk/config-runtime.ts +++ b/src/plugin-sdk/config-runtime.ts @@ -133,10 +133,15 @@ export type { TelegramInlineButtonsScope, TelegramNetworkConfig, TelegramTopicConfig, + ResolvedTtsPersona, TtsAutoMode, TtsConfig, TtsMode, TtsModelOverrideConfig, + TtsPersonaConfig, + TtsPersonaFallbackPolicy, + TtsPersonaPromptConfig, + TtsPersonaRewriteConfig, TtsProvider, } from "../config/types.js"; export { diff --git a/src/plugin-sdk/speech-core.ts b/src/plugin-sdk/speech-core.ts index 1da45f6aa25..42bbc7629dc 100644 --- a/src/plugin-sdk/speech-core.ts +++ b/src/plugin-sdk/speech-core.ts @@ -9,11 +9,14 @@ export type { SpeechModelOverridePolicy, SpeechProviderConfig, SpeechProviderConfiguredContext, + SpeechProviderPreparedSynthesis, + SpeechProviderPrepareSynthesisContext, 
SpeechProviderResolveConfigContext, SpeechProviderResolveTalkConfigContext, SpeechProviderResolveTalkOverridesContext, SpeechProviderOverrides, SpeechSynthesisRequest, + SpeechSynthesisTarget, SpeechTelephonySynthesisRequest, SpeechVoiceOption, TtsDirectiveOverrides, @@ -35,6 +38,7 @@ export { listSpeechProviders, normalizeSpeechProviderId, } from "../tts/provider-registry.js"; +export { resolveEffectiveTtsConfig } from "../tts/tts-config.js"; export { normalizeTtsAutoMode, TTS_AUTO_MODES } from "../tts/tts-auto-mode.js"; export { asBoolean, diff --git a/src/plugin-sdk/speech.ts b/src/plugin-sdk/speech.ts index 504607d0120..7b7774e50e5 100644 --- a/src/plugin-sdk/speech.ts +++ b/src/plugin-sdk/speech.ts @@ -12,11 +12,14 @@ export type { SpeechModelOverridePolicy, SpeechProviderConfig, SpeechProviderConfiguredContext, + SpeechProviderPreparedSynthesis, + SpeechProviderPrepareSynthesisContext, SpeechProviderResolveConfigContext, SpeechProviderResolveTalkConfigContext, SpeechProviderResolveTalkOverridesContext, SpeechProviderOverrides, SpeechSynthesisRequest, + SpeechSynthesisTarget, SpeechTelephonySynthesisRequest, SpeechVoiceOption, TtsDirectiveOverrides, diff --git a/src/plugin-sdk/tts-runtime.ts b/src/plugin-sdk/tts-runtime.ts index b821ddd237a..c8ce751718b 100644 --- a/src/plugin-sdk/tts-runtime.ts +++ b/src/plugin-sdk/tts-runtime.ts @@ -40,6 +40,10 @@ export const getTtsMaxLength: FacadeModule["getTtsMaxLength"] = createLazyFacade loadFacadeModule, "getTtsMaxLength", ); +export const getTtsPersona: FacadeModule["getTtsPersona"] = createLazyFacadeRuntimeValue( + loadFacadeModule, + "getTtsPersona", +); export const getTtsProvider: FacadeModule["getTtsProvider"] = createLazyFacadeRuntimeValue( loadFacadeModule, "getTtsProvider", @@ -56,6 +60,10 @@ export const listSpeechVoices: FacadeModule["listSpeechVoices"] = createLazyFaca loadFacadeModule, "listSpeechVoices", ); +export const listTtsPersonas: FacadeModule["listTtsPersonas"] = createLazyFacadeRuntimeValue( + 
loadFacadeModule, + "listTtsPersonas", +); export const maybeApplyTtsToPayload: FacadeModule["maybeApplyTtsToPayload"] = createLazyFacadeRuntimeValue(loadFacadeModule, "maybeApplyTtsToPayload"); export const resolveExplicitTtsOverrides: FacadeModule["resolveExplicitTtsOverrides"] = @@ -90,6 +98,10 @@ export const setTtsMaxLength: FacadeModule["setTtsMaxLength"] = createLazyFacade loadFacadeModule, "setTtsMaxLength", ); +export const setTtsPersona: FacadeModule["setTtsPersona"] = createLazyFacadeRuntimeValue( + loadFacadeModule, + "setTtsPersona", +); export const setTtsProvider: FacadeModule["setTtsProvider"] = createLazyFacadeRuntimeValue( loadFacadeModule, "setTtsProvider", diff --git a/src/plugin-sdk/tts-runtime.types.ts b/src/plugin-sdk/tts-runtime.types.ts index 99f8b8d1207..60e0c24a4ca 100644 --- a/src/plugin-sdk/tts-runtime.types.ts +++ b/src/plugin-sdk/tts-runtime.types.ts @@ -1,5 +1,5 @@ import type { OpenClawConfig } from "../config/types.openclaw.js"; -import type { TtsAutoMode, TtsProvider } from "../config/types.tts.js"; +import type { ResolvedTtsPersona, TtsAutoMode, TtsProvider } from "../config/types.tts.js"; import type { SpeechProviderConfig, SpeechVoiceOption, @@ -24,6 +24,8 @@ export type TtsProviderAttempt = { provider: string; outcome: "success" | "skipped" | "failed"; reasonCode: TtsAttemptReasonCode; + persona?: string; + personaBinding?: "applied" | "missing" | "none"; latencyMs?: number; error?: string; }; @@ -34,6 +36,7 @@ export type TtsStatusEntry = { textLength: number; summarized: boolean; provider?: string; + persona?: string; fallbackFrom?: string; attemptedProviders?: string[]; attempts?: TtsProviderAttempt[]; @@ -126,6 +129,7 @@ export type TtsResult = { error?: string; latencyMs?: number; provider?: string; + persona?: string; fallbackFrom?: string; attemptedProviders?: string[]; attempts?: TtsProviderAttempt[]; @@ -141,6 +145,7 @@ export type TtsSynthesisResult = { error?: string; latencyMs?: number; provider?: string; + 
persona?: string; fallbackFrom?: string; attemptedProviders?: string[]; attempts?: TtsProviderAttempt[]; @@ -156,6 +161,7 @@ export type TtsTelephonyResult = { error?: string; latencyMs?: number; provider?: string; + persona?: string; fallbackFrom?: string; attemptedProviders?: string[]; attempts?: TtsProviderAttempt[]; @@ -179,6 +185,7 @@ export type TtsRuntimeFacade = { cfg?: OpenClawConfig, ) => SpeechProviderConfig; getTtsMaxLength: (prefsPath: string) => number; + getTtsPersona: (config: ResolvedTtsConfig, prefsPath: string) => ResolvedTtsPersona | undefined; getTtsProvider: (config: ResolvedTtsConfig, prefsPath: string) => TtsProvider; isSummarizationEnabled: (prefsPath: string) => boolean; isTtsEnabled: (config: ResolvedTtsConfig, prefsPath: string, sessionAuto?: string) => boolean; @@ -188,6 +195,7 @@ export type TtsRuntimeFacade = { cfg?: OpenClawConfig, ) => boolean; listSpeechVoices: ListSpeechVoices; + listTtsPersonas: (config: ResolvedTtsConfig) => ResolvedTtsPersona[]; maybeApplyTtsToPayload: (params: MaybeApplyTtsToPayloadParams) => Promise; resolveExplicitTtsOverrides: (params: ResolveExplicitTtsOverridesParams) => TtsDirectiveOverrides; resolveTtsAutoMode: (params: ResolveTtsAutoModeParams) => TtsAutoMode; @@ -199,6 +207,7 @@ export type TtsRuntimeFacade = { setTtsAutoMode: (prefsPath: string, mode: TtsAutoMode) => void; setTtsEnabled: (prefsPath: string, enabled: boolean) => void; setTtsMaxLength: (prefsPath: string, maxLength: number) => void; + setTtsPersona: (prefsPath: string, persona: string | null | undefined) => void; setTtsProvider: (prefsPath: string, provider: TtsProvider) => void; synthesizeSpeech: (params: TtsRequestParams) => Promise; textToSpeech: TextToSpeech; diff --git a/src/plugins/types.ts b/src/plugins/types.ts index 57ca7103671..2eacf837657 100644 --- a/src/plugins/types.ts +++ b/src/plugins/types.ts @@ -65,6 +65,8 @@ import type { SpeechProviderResolveTalkConfigContext, SpeechProviderResolveTalkOverridesContext, 
SpeechListVoicesRequest, + SpeechProviderPrepareSynthesisContext, + SpeechProviderPreparedSynthesis, SpeechProviderId, SpeechSynthesisRequest, SpeechSynthesisResult, @@ -1724,6 +1726,12 @@ export type SpeechProviderPlugin = { resolveTalkOverrides?: ( ctx: SpeechProviderResolveTalkOverridesContext, ) => SpeechProviderConfig | undefined; + prepareSynthesis?: ( + ctx: SpeechProviderPrepareSynthesisContext, + ) => + | SpeechProviderPreparedSynthesis + | undefined + | Promise; isConfigured: (ctx: SpeechProviderConfiguredContext) => boolean; synthesize: (req: SpeechSynthesisRequest) => Promise; synthesizeTelephony?: ( diff --git a/src/status/status-message.ts b/src/status/status-message.ts index a6181d1e609..825debe69ab 100644 --- a/src/status/status-message.ts +++ b/src/status/status-message.ts @@ -465,6 +465,9 @@ const formatVoiceModeLine = ( return null; } const parts = [`🔊 Voice: ${snapshot.autoMode}`, `provider=${snapshot.provider}`]; + if (snapshot.persona) { + parts.push(`persona=${snapshot.persona}`); + } if (snapshot.displayName) { parts.push(`name=${snapshot.displayName}`); } diff --git a/src/tts/provider-types.ts b/src/tts/provider-types.ts index 48c010e64a3..88dc9ca9c31 100644 --- a/src/tts/provider-types.ts +++ b/src/tts/provider-types.ts @@ -1,9 +1,10 @@ import type { TalkProviderConfig } from "../config/types.gateway.js"; import type { OpenClawConfig } from "../config/types.js"; +import type { ResolvedTtsPersona } from "../config/types.tts.js"; export type SpeechProviderId = string; -export type SpeechSynthesisTarget = "audio-file" | "voice-note"; +export type SpeechSynthesisTarget = "audio-file" | "voice-note" | "telephony"; export type SpeechProviderConfig = Record; @@ -69,6 +70,23 @@ export type SpeechTelephonySynthesisResult = { sampleRate: number; }; +export type SpeechProviderPrepareSynthesisContext = { + text: string; + cfg: OpenClawConfig; + providerConfig: SpeechProviderConfig; + providerOverrides?: SpeechProviderOverrides; + persona?: 
ResolvedTtsPersona; + personaProviderConfig?: SpeechProviderConfig; + target: SpeechSynthesisTarget; + timeoutMs: number; +}; + +export type SpeechProviderPreparedSynthesis = { + text?: string; + providerConfig?: SpeechProviderConfig; + providerOverrides?: SpeechProviderOverrides; +}; + export type SpeechVoiceOption = { id: string; name?: string; diff --git a/src/tts/status-config.test.ts b/src/tts/status-config.test.ts index 76a263d4c96..3111bbf0815 100644 --- a/src/tts/status-config.test.ts +++ b/src/tts/status-config.test.ts @@ -138,6 +138,44 @@ describe("resolveStatusTtsSnapshot", () => { }); }); + it("reports per-agent persona provider over global persona", async () => { + await withStatusTempHome(async () => { + expect( + resolveStatusTtsSnapshot({ + cfg: { + messages: { + tts: { + auto: "always", + persona: "alfred", + personas: { + alfred: { provider: "google" }, + jarvis: { provider: "edge" }, + }, + }, + }, + agents: { + list: [ + { + id: "reader", + tts: { + persona: "jarvis", + }, + }, + ], + }, + } as OpenClawConfig, + agentId: "reader", + }), + ).toEqual({ + autoMode: "always", + provider: "microsoft", + persona: "jarvis", + maxLength: 1500, + summarize: true, + }); + }); + }); + it("reports configured OpenAI TTS model, voice, and sanitized custom endpoint", async () => { await withStatusTempHome(async () => { expect( diff --git a/src/tts/status-config.ts b/src/tts/status-config.ts index 04b6e5b2439..0415088f08c 100644 --- a/src/tts/status-config.ts +++ b/src/tts/status-config.ts @@ -20,6 +20,7 @@ type TtsUserPrefs = { auto?: TtsAutoMode; enabled?: boolean; provider?: TtsProvider; + persona?: string | null; maxLength?: number; summarize?: boolean; }; @@ -31,6 +32,7 @@ type TtsStatusSnapshot = { displayName?: string; model?: string; voice?: string; + persona?: string; baseUrl?: string; customBaseUrl?: boolean; maxLength: number; @@ -51,6 +53,27 @@ function normalizeConfiguredSpeechProviderId( return normalized === "edge" ? 
"microsoft" : normalized; } +function normalizeTtsPersonaId(personaId: string | null | undefined): string | undefined { + return normalizeOptionalLowercaseString(personaId ?? undefined); +} + +function resolvePersonaPreferredProvider( + raw: TtsConfig, + personaId: string | undefined, +): TtsProvider | undefined { + if (!personaId || !raw.personas) { + return undefined; + } + for (const [id, persona] of Object.entries(raw.personas)) { + if (normalizeTtsPersonaId(id) !== personaId) { + continue; + } + const provider = normalizeConfiguredSpeechProviderId(persona.provider) ?? persona.provider; + return normalizeOptionalString(provider); + } + return undefined; +} + function resolveTtsPrefsPathValue(prefsPath: string | undefined): string { const configuredPath = normalizeOptionalString(prefsPath); if (configuredPath) { @@ -212,8 +235,13 @@ export function resolveStatusTtsSnapshot(params: { return null; } + const persona = + prefs.tts && Object.prototype.hasOwnProperty.call(prefs.tts, "persona") + ? normalizeTtsPersonaId(prefs.tts.persona) + : normalizeTtsPersonaId(raw.persona); const provider = normalizeConfiguredSpeechProviderId(prefs.tts?.provider) ?? + resolvePersonaPreferredProvider(raw, persona) ?? normalizeConfiguredSpeechProviderId(raw.provider) ?? "auto"; @@ -221,6 +249,7 @@ export function resolveStatusTtsSnapshot(params: { autoMode, provider, ...resolveStatusProviderDetails(raw, provider), + ...(persona ? { persona } : {}), maxLength: prefs.tts?.maxLength ?? DEFAULT_TTS_MAX_LENGTH, summarize: prefs.tts?.summarize ?? 
DEFAULT_TTS_SUMMARIZE, }; diff --git a/src/tts/tts-types.ts b/src/tts/tts-types.ts index 79803ea1e6e..8638a2a8067 100644 --- a/src/tts/tts-types.ts +++ b/src/tts/tts-types.ts @@ -1,5 +1,11 @@ import type { OpenClawConfig } from "../config/types.openclaw.js"; -import type { TtsAutoMode, TtsConfig, TtsMode, TtsProvider } from "../config/types.tts.js"; +import type { + ResolvedTtsPersona, + TtsAutoMode, + TtsConfig, + TtsMode, + TtsProvider, +} from "../config/types.tts.js"; import type { SpeechModelOverridePolicy, SpeechProviderConfig } from "./provider-types.js"; export type ResolvedTtsModelOverrides = SpeechModelOverridePolicy; @@ -9,6 +15,8 @@ export type ResolvedTtsConfig = { mode: TtsMode; provider: TtsProvider; providerSource: "config" | "default"; + persona?: string; + personas: Record; summaryModel?: string; modelOverrides: ResolvedTtsModelOverrides; providerConfigs: Record; diff --git a/src/tts/tts.ts b/src/tts/tts.ts index 5fb831e3631..dc962a8c8b8 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -4,11 +4,13 @@ export { getLastTtsAttempt, getResolvedSpeechProviderConfig, getTtsMaxLength, + getTtsPersona, getTtsProvider, isSummarizationEnabled, isTtsEnabled, isTtsProviderConfigured, listSpeechVoices, + listTtsPersonas, maybeApplyTtsToPayload, resolveExplicitTtsOverrides, resolveTtsAutoMode, @@ -20,6 +22,7 @@ export { setTtsAutoMode, setTtsEnabled, setTtsMaxLength, + setTtsPersona, setTtsProvider, synthesizeSpeech, textToSpeech, diff --git a/test/helpers/media-generation/provider-http-mocks.ts b/test/helpers/media-generation/provider-http-mocks.ts index 271e993f394..56ab02b641b 100644 --- a/test/helpers/media-generation/provider-http-mocks.ts +++ b/test/helpers/media-generation/provider-http-mocks.ts @@ -15,6 +15,7 @@ const providerHttpMocks = vi.hoisted(() => ({ fetchWithTimeoutMock: vi.fn(), pollProviderOperationJsonMock: vi.fn(), assertOkOrThrowHttpErrorMock: vi.fn(async (_response: Response, _label: string) => {}), + assertOkOrThrowProviderErrorMock: 
vi.fn(async (_response: Response, _label: string) => {}), resolveProviderHttpRequestConfigMock: vi.fn((params: ResolveProviderHttpRequestConfigParams) => ({ baseUrl: params.baseUrl ?? params.defaultBaseUrl, allowPrivateNetwork: false, @@ -55,6 +56,7 @@ vi.mock("openclaw/plugin-sdk/provider-auth-runtime", () => ({ vi.mock("openclaw/plugin-sdk/provider-http", () => ({ assertOkOrThrowHttpError: providerHttpMocks.assertOkOrThrowHttpErrorMock, + assertOkOrThrowProviderError: providerHttpMocks.assertOkOrThrowProviderErrorMock, createProviderOperationDeadline: ({ label, timeoutMs, @@ -85,6 +87,7 @@ export function installProviderHttpMockCleanup(): void { providerHttpMocks.fetchWithTimeoutMock.mockReset(); providerHttpMocks.pollProviderOperationJsonMock.mockClear(); providerHttpMocks.assertOkOrThrowHttpErrorMock.mockClear(); + providerHttpMocks.assertOkOrThrowProviderErrorMock.mockClear(); providerHttpMocks.resolveProviderHttpRequestConfigMock.mockClear(); }); } diff --git a/test/helpers/plugins/tts-contract-suites.ts b/test/helpers/plugins/tts-contract-suites.ts index e0d6ddcbbcb..6b36b5f7311 100644 --- a/test/helpers/plugins/tts-contract-suites.ts +++ b/test/helpers/plugins/tts-contract-suites.ts @@ -499,6 +499,7 @@ function createResolvedSummarizationConfig(cfg: OpenClawConfig): ResolvedTtsConf allowSeed: true, }, providerConfigs: {}, + personas: {}, prefsPath: typeof rawConfig.prefsPath === "string" ? rawConfig.prefsPath : undefined, maxTextLength: typeof rawConfig.maxTextLength === "number" ? rawConfig.maxTextLength : 4096, timeoutMs: typeof rawConfig.timeoutMs === "number" ? rawConfig.timeoutMs : 30_000, @@ -715,6 +716,7 @@ export function describeTtsConfigContract() { microsoft: {}, elevenlabs: {}, }, + personas: {}, prefsPath: undefined, maxTextLength: 4000, timeoutMs: 30_000,