TTS: add provider personas

2026-05-06 12:20:44 +00:00 · 2026-04-23 07:26:32 -07:00
parent 80219ed1b3
commit 0594fa3c4d
39 changed files with 2021 additions and 136 deletions
--- a/extensions/google/speech-provider.test.ts
+++ b/extensions/google/speech-provider.test.ts
@@ -1,5 +1,8 @@
-import * as providerHttp from "openclaw/plugin-sdk/provider-http";
-import { afterEach, describe, expect, it, vi } from "vitest";
+import { afterEach, beforeAll, describe, expect, it, vi } from "vitest";
+import {
+  getProviderHttpMocks,
+  installProviderHttpMockCleanup,
+} from "../../test/helpers/media-generation/provider-http-mocks.js";

 const transcodeAudioBufferToOpusMock = vi.hoisted(() => vi.fn());

@@ -7,10 +10,23 @@ vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
  transcodeAudioBufferToOpus: transcodeAudioBufferToOpusMock,
 }));

-import { buildGoogleSpeechProvider, __testing } from "./speech-provider.js";
+const {
+  assertOkOrThrowProviderErrorMock,
+  postJsonRequestMock,
+  resolveProviderHttpRequestConfigMock,
+} = getProviderHttpMocks();

-function installGoogleTtsFetchMock(pcm = Buffer.from([1, 0, 2, 0])) {
-  const fetchMock = vi.fn().mockResolvedValue({
+let buildGoogleSpeechProvider: typeof import("./speech-provider.js").buildGoogleSpeechProvider;
+let __testing: typeof import("./speech-provider.js").__testing;
+
+beforeAll(async () => {
+  ({ buildGoogleSpeechProvider, __testing } = await import("./speech-provider.js"));
+});
+
+installProviderHttpMockCleanup();
+
+function googleTtsResponse(pcm = Buffer.from([1, 0, 2, 0])) {
+  return {
    ok: true,
    json: async () => ({
      candidates: [
@@ -28,21 +44,26 @@ function installGoogleTtsFetchMock(pcm = Buffer.from([1, 0, 2, 0])) {
        },
      ],
    }),
+  };
+}
+
+function installGoogleTtsRequestMock(pcm = Buffer.from([1, 0, 2, 0])) {
+  postJsonRequestMock.mockResolvedValue({
+    response: googleTtsResponse(pcm),
+    release: vi.fn(async () => {}),
  });
-  vi.stubGlobal("fetch", fetchMock);
-  return fetchMock;
+  return postJsonRequestMock;
 }

 describe("Google speech provider", () => {
  afterEach(() => {
-    vi.restoreAllMocks();
    vi.unstubAllGlobals();
    vi.unstubAllEnvs();
    transcodeAudioBufferToOpusMock.mockReset();
  });

  it("synthesizes Gemini PCM as WAV and preserves audio tags in the request text", async () => {
-    const fetchMock = installGoogleTtsFetchMock();
+    const requestMock = installGoogleTtsRequestMock();
    const provider = buildGoogleSpeechProvider();

    const result = await provider.synthesize({
@@ -57,11 +78,10 @@ describe("Google speech provider", () => {
      timeoutMs: 12_345,
    });

-    expect(fetchMock).toHaveBeenCalledWith(
-      "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent",
+    expect(requestMock).toHaveBeenCalledWith(
      expect.objectContaining({
-        method: "POST",
-        body: JSON.stringify({
+        url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent",
+        body: {
          contents: [
            {
              role: "user",
@@ -78,11 +98,14 @@ describe("Google speech provider", () => {
              },
            },
          },
-        }),
+        },
+        fetchFn: fetch,
+        pinDns: false,
+        timeoutMs: 12_345,
      }),
    );
-    const [, init] = fetchMock.mock.calls[0];
-    expect(new Headers(init.headers).get("x-goog-api-key")).toBe("google-test-key");
+    const request = requestMock.mock.calls[0]?.[0] as { headers?: HeadersInit };
+    expect(new Headers(request.headers).get("x-goog-api-key")).toBe("google-test-key");
    expect(result.outputFormat).toBe("wav");
    expect(result.fileExtension).toBe(".wav");
    expect(result.voiceCompatible).toBe(false);
@@ -94,7 +117,7 @@ describe("Google speech provider", () => {
  });

  it("transcodes Gemini PCM to Opus for voice-note targets", async () => {
-    installGoogleTtsFetchMock(Buffer.from([5, 0, 6, 0]));
+    installGoogleTtsRequestMock(Buffer.from([5, 0, 6, 0]));
    transcodeAudioBufferToOpusMock.mockResolvedValueOnce(Buffer.from("google-opus"));
    const provider = buildGoogleSpeechProvider();

@@ -125,9 +148,138 @@ describe("Google speech provider", () => {
    expect(audioBuffer.subarray(8, 12).toString("ascii")).toBe("WAVE");
  });

+  it("advertises all documented Gemini TTS-capable models", () => {
+    const provider = buildGoogleSpeechProvider();
+
+    expect(provider.models).toEqual(__testing.GOOGLE_TTS_MODELS);
+  });
+
+  it("renders deterministic audio-profile-v1 prompts without generating tags", async () => {
+    const provider = buildGoogleSpeechProvider();
+
+    const prepared = await provider.prepareSynthesis?.({
+      text: "[whispers] The door is open.",
+      cfg: {},
+      providerConfig: {
+        promptTemplate: "audio-profile-v1",
+        personaPrompt: "Keep a close-mic feel.",
+      },
+      persona: {
+        id: "alfred",
+        label: "Alfred",
+        prompt: {
+          profile: "A brilliant British butler.",
+          scene: "A quiet late-night study.",
+          sampleContext: "The speaker is answering a trusted operator.",
+          style: "Refined and lightly amused.",
+          accent: "British English.",
+          pacing: "Measured.",
+          constraints: ["Do not read configuration values aloud."],
+        },
+      },
+      target: "audio-file",
+      timeoutMs: 1_000,
+    });
+
+    expect(prepared?.text).toBe(
+      [
+        "Synthesize speech from the TRANSCRIPT section only. Use the other sections only",
+        "as performance direction. Do not read section titles, notes, labels, or",
+        "configuration aloud.",
+        "",
+        "# AUDIO PROFILE: Alfred",
+        "A brilliant British butler.",
+        "",
+        "## THE SCENE",
+        "A quiet late-night study.",
+        "",
+        "### DIRECTOR'S NOTES",
+        "Style: Refined and lightly amused.",
+        "Accent: British English.",
+        "Pacing: Measured.",
+        "Constraints:",
+        "- Do not read configuration values aloud.",
+        "Provider notes:",
+        "Keep a close-mic feel.",
+        "",
+        "### SAMPLE CONTEXT",
+        "The speaker is answering a trusted operator.",
+        "",
+        "### TRANSCRIPT",
+        "[whispers] The door is open.",
+      ].join("\n"),
+    );
+  });
+
+  it("does not wrap an OpenClaw audio-profile-v1 prompt twice", async () => {
+    const provider = buildGoogleSpeechProvider();
+    const text = [
+      "Synthesize speech from the TRANSCRIPT section only. Use the other sections only",
+      "as performance direction. Do not read section titles, notes, labels, or",
+      "configuration aloud.",
+      "",
+      "# AUDIO PROFILE: Alfred",
+      "A brilliant British butler.",
+      "",
+      "### TRANSCRIPT",
+      "Hello.",
+    ].join("\n");
+
+    const prepared = await provider.prepareSynthesis?.({
+      text,
+      cfg: {},
+      providerConfig: {
+        promptTemplate: "audio-profile-v1",
+      },
+      persona: {
+        id: "alfred",
+        label: "Alfred",
+        prompt: {
+          profile: "A brilliant British butler.",
+        },
+      },
+      target: "audio-file",
+      timeoutMs: 1_000,
+    });
+
+    expect(prepared).toBeUndefined();
+  });
+
+  it("retries once when Gemini returns no audio payload", async () => {
+    const pcm = Buffer.from([5, 0, 6, 0]);
+    const requestSequence = vi
+      .fn()
+      .mockResolvedValueOnce({
+        response: {
+          ok: true,
+          json: async () => ({ candidates: [{ content: { parts: [{ text: "not audio" }] } }] }),
+        },
+        release: vi.fn(async () => {}),
+      })
+      .mockResolvedValueOnce({
+        response: googleTtsResponse(pcm),
+        release: vi.fn(async () => {}),
+      });
+    postJsonRequestMock.mockImplementation(requestSequence);
+    const provider = buildGoogleSpeechProvider();
+
+    const result = await provider.synthesize({
+      text: "Retry this.",
+      cfg: {},
+      providerConfig: {
+        apiKey: "google-test-key",
+      },
+      target: "audio-file",
+      timeoutMs: 5_000,
+    });
+
+    expect(requestSequence).toHaveBeenCalledTimes(2);
+    expect(result.audioBuffer.subarray(44)).toEqual(pcm);
+  });
+
  it("falls back to GEMINI_API_KEY and configured Google API base URL", async () => {
    vi.stubEnv("GEMINI_API_KEY", "env-google-key");
-    const fetchMock = installGoogleTtsFetchMock();
+    const requestMock = installGoogleTtsRequestMock();
    const provider = buildGoogleSpeechProvider();

    expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 1 })).toBe(true);
@@ -149,16 +301,17 @@ describe("Google speech provider", () => {
      timeoutMs: 10_000,
    });

-    expect(fetchMock).toHaveBeenCalledWith(
-      "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent",
-      expect.any(Object),
+    expect(requestMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent",
+      }),
    );
-    const [, init] = fetchMock.mock.calls[0];
-    expect(new Headers(init.headers).get("x-goog-api-key")).toBe("env-google-key");
+    const request = requestMock.mock.calls[0]?.[0] as { headers?: HeadersInit };
+    expect(new Headers(request.headers).get("x-goog-api-key")).toBe("env-google-key");
  });

  it("can reuse a configured Google model-provider API key without auth profiles", async () => {
-    const fetchMock = installGoogleTtsFetchMock();
+    const requestMock = installGoogleTtsRequestMock();
    const provider = buildGoogleSpeechProvider();
    const cfg = {
      models: {
@@ -182,13 +335,13 @@ describe("Google speech provider", () => {
      timeoutMs: 10_000,
    });

-    const [, init] = fetchMock.mock.calls[0];
-    expect(new Headers(init.headers).get("x-goog-api-key")).toBe("model-provider-google-key");
+    const request = requestMock.mock.calls[0]?.[0] as { headers?: HeadersInit };
+    expect(new Headers(request.headers).get("x-goog-api-key")).toBe("model-provider-google-key");
  });

  it("returns Gemini PCM directly for telephony synthesis", async () => {
    const pcm = Buffer.from([3, 0, 4, 0]);
-    installGoogleTtsFetchMock(pcm);
+    installGoogleTtsRequestMock(pcm);
    const provider = buildGoogleSpeechProvider();

    const result = await provider.synthesizeTelephony?.({
@@ -209,7 +362,7 @@ describe("Google speech provider", () => {
  });

  it("prepends configured Gemini TTS profile text", async () => {
-    const fetchMock = installGoogleTtsFetchMock();
+    const requestMock = installGoogleTtsRequestMock();
    const provider = buildGoogleSpeechProvider();

    await provider.synthesize({
@@ -224,8 +377,7 @@ describe("Google speech provider", () => {
      timeoutMs: 10_000,
    });

-    const [, init] = fetchMock.mock.calls[0];
-    expect(JSON.parse(String(init.body))).toMatchObject({
+    expect(requestMock.mock.calls[0]?.[0].body).toMatchObject({
      contents: [
        {
          parts: [
@@ -326,23 +478,26 @@ describe("Google speech provider", () => {
  });

  it("formats Google TTS HTTP errors with provider details", async () => {
-    vi.stubGlobal(
-      "fetch",
-      vi.fn().mockResolvedValue(
-        new Response(
-          JSON.stringify({
-            error: {
-              message: "Quota exceeded",
-              status: "RESOURCE_EXHAUSTED",
-            },
-          }),
-          {
-            status: 429,
-            headers: { "x-request-id": "google_req_123" },
-          },
-        ),
+    assertOkOrThrowProviderErrorMock.mockRejectedValue(
+      new Error(
+        "Google TTS failed (429): Quota exceeded [code=RESOURCE_EXHAUSTED] [request_id=google_req_123]",
      ),
    );
+    postJsonRequestMock.mockResolvedValue({
+      response: new Response(
+        JSON.stringify({
+          error: {
+            message: "Quota exceeded",
+            status: "RESOURCE_EXHAUSTED",
+          },
+        }),
+        {
+          status: 429,
+          headers: { "x-request-id": "google_req_123" },
+        },
+      ),
+      release: vi.fn(async () => {}),
+    });
    const provider = buildGoogleSpeechProvider();

    await expect(
@@ -359,8 +514,7 @@ describe("Google speech provider", () => {
  });

  it("honors configured private-network opt-in for Google TTS", async () => {
-    installGoogleTtsFetchMock();
-    const postJsonRequestSpy = vi.spyOn(providerHttp, "postJsonRequest");
+    installGoogleTtsRequestMock();

    const provider = buildGoogleSpeechProvider();
    await provider.synthesize({
@@ -381,14 +535,16 @@ describe("Google speech provider", () => {
      timeoutMs: 12_345,
    });

-    expect(postJsonRequestSpy).toHaveBeenCalledWith(
-      expect.objectContaining({ allowPrivateNetwork: true }),
+    expect(resolveProviderHttpRequestConfigMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        allowPrivateNetwork: true,
+        request: expect.objectContaining({ allowPrivateNetwork: true }),
+      }),
    );
  });

  it("honors configured private-network opt-in for Google telephony TTS", async () => {
-    installGoogleTtsFetchMock();
-    const postJsonRequestSpy = vi.spyOn(providerHttp, "postJsonRequest");
+    installGoogleTtsRequestMock();

    const provider = buildGoogleSpeechProvider();
    await provider.synthesizeTelephony?.({
@@ -408,8 +564,11 @@ describe("Google speech provider", () => {
      timeoutMs: 12_345,
    });

-    expect(postJsonRequestSpy).toHaveBeenCalledWith(
-      expect.objectContaining({ allowPrivateNetwork: true }),
+    expect(resolveProviderHttpRequestConfigMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        allowPrivateNetwork: true,
+        request: expect.objectContaining({ allowPrivateNetwork: true }),
+      }),
    );
  });
 });
--- a/extensions/google/speech-provider.ts
+++ b/extensions/google/speech-provider.ts
@@ -21,6 +21,13 @@ const DEFAULT_GOOGLE_TTS_VOICE = "Kore";
 const GOOGLE_TTS_SAMPLE_RATE = 24_000;
 const GOOGLE_TTS_CHANNELS = 1;
 const GOOGLE_TTS_BITS_PER_SAMPLE = 16;
+const GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE = "audio-profile-v1";
+
+const GOOGLE_TTS_MODELS = [
+  "gemini-3.1-flash-tts-preview",
+  "gemini-2.5-flash-preview-tts",
+  "gemini-2.5-pro-preview-tts",
+] as const;

 const GOOGLE_TTS_VOICES = [
  "Zephyr",
@@ -62,6 +69,8 @@ type GoogleTtsProviderConfig = {
  voiceName: string;
  audioProfile?: string;
  speakerName?: string;
+  promptTemplate?: typeof GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE;
+  personaPrompt?: string;
 };

 type GoogleTtsProviderOverrides = {
@@ -91,6 +100,13 @@ type GoogleGenerateSpeechResponse = {
  }>;
 };

+class GoogleTtsRetryableError extends Error {
+  constructor(message: string) {
+    super(message);
+    this.name = "GoogleTtsRetryableError";
+  }
+}
+
 function normalizeGoogleTtsModel(model: unknown): string {
  const trimmed = normalizeOptionalString(model);
  if (!trimmed) {
@@ -104,6 +120,19 @@ function normalizeGoogleTtsVoiceName(voiceName: unknown): string {
  return normalizeOptionalString(voiceName) ?? DEFAULT_GOOGLE_TTS_VOICE;
 }

+function normalizeGooglePromptTemplate(
+  value: unknown,
+): typeof GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE | undefined {
+  const trimmed = normalizeOptionalString(value);
+  if (!trimmed) {
+    return undefined;
+  }
+  if (trimmed === GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE) {
+    return trimmed;
+  }
+  throw new Error(`Invalid Google TTS promptTemplate: ${trimmed}`);
+}
+
 function resolveGoogleTtsEnvApiKey(): string | undefined {
  return (
    normalizeOptionalString(process.env.GEMINI_API_KEY) ??
@@ -149,6 +178,8 @@ function normalizeGoogleTtsProviderConfig(
  rawConfig: Record<string, unknown>,
 ): GoogleTtsProviderConfig {
  const raw = resolveGoogleTtsConfigRecord(rawConfig);
+  const promptTemplate = normalizeGooglePromptTemplate(raw?.promptTemplate);
+  const personaPrompt = trimToUndefined(raw?.personaPrompt);
  return {
    apiKey: normalizeResolvedSecretInputString({
      value: raw?.apiKey,
@@ -159,11 +190,16 @@ function normalizeGoogleTtsProviderConfig(
    voiceName: normalizeGoogleTtsVoiceName(raw?.voiceName ?? raw?.voice),
    audioProfile: trimToUndefined(raw?.audioProfile),
    speakerName: trimToUndefined(raw?.speakerName),
+    ...(promptTemplate ? { promptTemplate } : {}),
+    ...(personaPrompt ? { personaPrompt } : {}),
  };
 }

 function readGoogleTtsProviderConfig(config: SpeechProviderConfig): GoogleTtsProviderConfig {
  const normalized = normalizeGoogleTtsProviderConfig({});
+  const promptTemplate =
+    normalizeGooglePromptTemplate(config.promptTemplate) ?? normalized.promptTemplate;
+  const personaPrompt = trimToUndefined(config.personaPrompt) ?? normalized.personaPrompt;
  return {
    apiKey: trimToUndefined(config.apiKey) ?? normalized.apiKey,
    baseUrl: trimToUndefined(config.baseUrl) ?? normalized.baseUrl,
@@ -173,6 +209,8 @@ function readGoogleTtsProviderConfig(config: SpeechProviderConfig): GoogleTtsPro
    ),
    audioProfile: trimToUndefined(config.audioProfile) ?? normalized.audioProfile,
    speakerName: trimToUndefined(config.speakerName) ?? normalized.speakerName,
+    ...(promptTemplate ? { promptTemplate } : {}),
+    ...(personaPrompt ? { personaPrompt } : {}),
  };
 }

@@ -243,6 +281,116 @@ function extractGoogleSpeechPcm(payload: GoogleGenerateSpeechResponse): Buffer {
  throw new Error("Google TTS response missing audio data");
 }

+function normalizePromptSectionText(value: string | undefined): string | undefined {
+  const trimmed = trimToUndefined(value?.replace(/\r\n?/g, "\n")); // CRLF and lone CR become LF
+  if (!trimmed) {
+    return undefined;
+  }
+  let sanitized = ""; // `trimmed` minus C0 control characters and DEL; tab and LF are kept
+  for (const char of trimmed) {
+    const code = char.charCodeAt(0);
+    if (
+      (code >= 0 && code <= 8) ||
+      code === 11 ||
+      code === 12 ||
+      (code >= 14 && code <= 31) ||
+      code === 127
+    ) {
+      continue;
+    }
+    sanitized += char;
+  }
+  // Re-normalize: stripping can leave "" or expose edge whitespace, and "" would defeat `??` fallbacks.
+  return trimToUndefined(sanitized); // assumes trimToUndefined maps blank strings to undefined
+}
+
+function normalizePromptList(values: readonly string[] | undefined): string[] {
+  return (values ?? [])
+    .map((value) => normalizePromptSectionText(value))
+    .filter((value): value is string => Boolean(value));
+}
+
+function isOpenClawGoogleAudioProfilePrompt(text: string): boolean {
+  return (
+    // "# AUDIO PROFILE:" is only rendered when a persona label/id/profile exists, so it is not required.
+    text.includes("### TRANSCRIPT") && // the transcript section is always rendered
+    text.startsWith("Synthesize speech from the TRANSCRIPT section only.") // fixed preamble
+  );
+}
+
+function renderGoogleAudioProfilePrompt(params: {
+  text: string;
+  persona?: {
+    id: string;
+    label?: string;
+    prompt?: {
+      profile?: string;
+      scene?: string;
+      sampleContext?: string;
+      style?: string;
+      accent?: string;
+      pacing?: string;
+      constraints?: string[];
+    };
+  };
+  personaPrompt?: string;
+}): string {
+  const transcript = params.text.replace(/\r\n?/g, "\n").trim();
+  const prompt = params.persona?.prompt;
+  const profile = normalizePromptSectionText(prompt?.profile);
+  const scene = normalizePromptSectionText(prompt?.scene);
+  const sampleContext = normalizePromptSectionText(prompt?.sampleContext);
+  const style = normalizePromptSectionText(prompt?.style);
+  const accent = normalizePromptSectionText(prompt?.accent);
+  const pacing = normalizePromptSectionText(prompt?.pacing);
+  const constraints = normalizePromptList(prompt?.constraints);
+  const personaPrompt = normalizePromptSectionText(params.personaPrompt);
+  const label =
+    normalizePromptSectionText(params.persona?.label) ??
+    normalizePromptSectionText(params.persona?.id);
+
+  const sections = [
+    [
+      "Synthesize speech from the TRANSCRIPT section only. Use the other sections only",
+      "as performance direction. Do not read section titles, notes, labels, or",
+      "configuration aloud.",
+    ].join("\n"),
+  ];
+
+  if (label || profile) {
+    sections.push([`# AUDIO PROFILE: ${label ?? "voice"}`, profile].filter(Boolean).join("\n"));
+  }
+  if (scene) {
+    sections.push(["## THE SCENE", scene].join("\n"));
+  }
+
+  const directorNotes: string[] = [];
+  if (style) {
+    directorNotes.push(`Style: ${style}`);
+  }
+  if (accent) {
+    directorNotes.push(`Accent: ${accent}`);
+  }
+  if (pacing) {
+    directorNotes.push(`Pacing: ${pacing}`);
+  }
+  if (constraints.length > 0) {
+    directorNotes.push(["Constraints:", ...constraints.map((item) => `- ${item}`)].join("\n"));
+  }
+  if (personaPrompt) {
+    directorNotes.push(["Provider notes:", personaPrompt].join("\n"));
+  }
+  if (directorNotes.length > 0) {
+    sections.push(["### DIRECTOR'S NOTES", ...directorNotes].join("\n"));
+  }
+
+  if (sampleContext) {
+    sections.push(["### SAMPLE CONTEXT", sampleContext].join("\n"));
+  }
+
+  sections.push(["### TRANSCRIPT", transcript].join("\n"));
+  return sections.join("\n\n");
+}
+
 function wrapPcm16MonoToWav(pcm: Buffer, sampleRate = GOOGLE_TTS_SAMPLE_RATE): Buffer {
  const byteRate = sampleRate * GOOGLE_TTS_CHANNELS * (GOOGLE_TTS_BITS_PER_SAMPLE / 8);
  const blockAlign = GOOGLE_TTS_CHANNELS * (GOOGLE_TTS_BITS_PER_SAMPLE / 8);
@@ -265,7 +413,7 @@ function wrapPcm16MonoToWav(pcm: Buffer, sampleRate = GOOGLE_TTS_SAMPLE_RATE): B
  return Buffer.concat([header, pcm]);
 }

-async function synthesizeGoogleTtsPcm(params: {
+async function synthesizeGoogleTtsPcmOnce(params: {
  text: string;
  apiKey: string;
  baseUrl?: string;
@@ -322,19 +470,59 @@ async function synthesizeGoogleTtsPcm(params: {
  });

  try {
-    await assertOkOrThrowProviderError(res, "Google TTS failed");
-    return extractGoogleSpeechPcm((await res.json()) as GoogleGenerateSpeechResponse);
+    if (!res.ok) {
+      try {
+        await assertOkOrThrowProviderError(res, "Google TTS failed");
+      } catch (err) {
+        const message = err instanceof Error ? err.message : String(err);
+        if (res.status >= 500 && res.status < 600) {
+          throw new GoogleTtsRetryableError(message);
+        }
+        throw err;
+      }
+    }
+    try {
+      return extractGoogleSpeechPcm((await res.json()) as GoogleGenerateSpeechResponse);
+    } catch (err) {
+      const message = err instanceof Error ? err.message : String(err);
+      throw new GoogleTtsRetryableError(message);
+    }
  } finally {
    await release();
  }
 }

+async function synthesizeGoogleTtsPcm(params: {
+  text: string;
+  apiKey: string;
+  baseUrl?: string;
+  request?: ReturnType<typeof sanitizeConfiguredModelProviderRequest>;
+  model: string;
+  voiceName: string;
+  audioProfile?: string;
+  speakerName?: string;
+  timeoutMs: number;
+}): Promise<Buffer> {
+  let lastError: unknown;
+  for (let attempt = 0; attempt < 2; attempt += 1) {
+    try {
+      return await synthesizeGoogleTtsPcmOnce(params);
+    } catch (err) {
+      lastError = err;
+      if (!(err instanceof GoogleTtsRetryableError) || attempt > 0) {
+        throw err;
+      }
+    }
+  }
+  throw lastError instanceof Error ? lastError : new Error(String(lastError));
+}
+
 export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
  return {
    id: "google",
    label: "Google",
    autoSelectOrder: 50,
-    models: [DEFAULT_GOOGLE_TTS_MODEL],
+    models: GOOGLE_TTS_MODELS,
    voices: GOOGLE_TTS_VOICES,
    resolveConfig: ({ rawConfig }) => normalizeGoogleTtsProviderConfig(rawConfig),
    parseDirectiveToken,
@@ -372,6 +560,22 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
    listVoices: async () => GOOGLE_TTS_VOICES.map((voice) => ({ id: voice, name: voice })),
    isConfigured: ({ cfg, providerConfig }) =>
      Boolean(resolveGoogleTtsApiKey({ cfg, providerConfig })),
+    prepareSynthesis: (ctx) => {
+      const config = readGoogleTtsProviderConfig(ctx.providerConfig);
+      const shouldWrap =
+        config.promptTemplate === GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE ||
+        Boolean(config.personaPrompt);
+      if (!shouldWrap || isOpenClawGoogleAudioProfilePrompt(ctx.text)) {
+        return undefined;
+      }
+      return {
+        text: renderGoogleAudioProfilePrompt({
+          text: ctx.text,
+          persona: ctx.persona,
+          personaPrompt: config.personaPrompt,
+        }),
+      };
+    },
    synthesize: async (req) => {
      const config = readGoogleTtsProviderConfig(req.providerConfig);
      const overrides = readGoogleTtsOverrides(req.providerOverrides);
@@ -449,7 +653,10 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
 export const __testing = {
  DEFAULT_GOOGLE_TTS_MODEL,
  DEFAULT_GOOGLE_TTS_VOICE,
+  GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE,
+  GOOGLE_TTS_MODELS,
  GOOGLE_TTS_SAMPLE_RATE,
  normalizeGoogleTtsModel,
+  renderGoogleAudioProfilePrompt,
  wrapPcm16MonoToWav,
 };
--- a/extensions/openai/openai.live.test.ts
+++ b/extensions/openai/openai.live.test.ts
@@ -134,6 +134,7 @@ function createLiveTtsConfig(): ResolvedTtsConfig {
        voice: "alloy",
      },
    },
+    personas: {},
    maxTextLength: 4_000,
    timeoutMs: 30_000,
  };
--- a/extensions/openai/speech-provider.test.ts
+++ b/extensions/openai/speech-provider.test.ts
@@ -162,6 +162,40 @@ describe("buildOpenAISpeechProvider", () => {
    });
  });

+  it("maps persona prompt fields to instructions when instructions are unset", async () => {
+    const provider = buildOpenAISpeechProvider();
+
+    const prepared = await provider.prepareSynthesis?.({
+      text: "hello",
+      cfg: {} as never,
+      providerConfig: {
+        apiKey: "sk-test",
+        model: "gpt-4o-mini-tts",
+        voice: "cedar",
+      },
+      persona: {
+        id: "alfred",
+        label: "Alfred",
+        prompt: {
+          profile: "A brilliant British butler.",
+          scene: "A quiet late-night study.",
+          sampleContext: "The speaker is answering a trusted operator.",
+          style: "Refined and lightly amused.",
+          accent: "British English.",
+          pacing: "Measured.",
+          constraints: ["Do not read configuration values aloud."],
+        },
+      },
+      target: "audio-file",
+      timeoutMs: 1_000,
+    });
+
+    expect(prepared?.providerConfig?.instructions).toContain("Persona: Alfred");
+    expect(prepared?.providerConfig?.instructions).toContain(
+      "Constraint: Do not read configuration values aloud.",
+    );
+  });
+
  it("uses wav for Groq-compatible OpenAI TTS endpoints", async () => {
    const provider = buildOpenAISpeechProvider();
    mockSpeechFetchExpectingFormat("wav");
--- a/extensions/openai/speech-provider.ts
+++ b/extensions/openai/speech-provider.ts
@@ -71,7 +71,7 @@ function isGroqSpeechBaseUrl(baseUrl: string): boolean {

 function resolveSpeechResponseFormat(
  baseUrl: string,
-  target: "audio-file" | "voice-note",
+  target: "audio-file" | "voice-note" | "telephony",
  configuredFormat?: OpenAiSpeechResponseFormat,
 ): OpenAiSpeechResponseFormat {
  if (configuredFormat) {
@@ -145,6 +145,37 @@ function readOpenAIOverrides(
  };
 }

+function renderOpenAITtsPersonaInstructions(req: {
+  label?: string;
+  prompt?: {
+    profile?: string;
+    scene?: string;
+    sampleContext?: string;
+    style?: string;
+    accent?: string;
+    pacing?: string;
+    constraints?: string[];
+  };
+}): string | undefined {
+  const prompt = req.prompt;
+  if (!prompt) {
+    return undefined;
+  }
+  const lines = [
+    req.label ? `Persona: ${req.label}` : undefined,
+    prompt.profile ? `Profile: ${prompt.profile}` : undefined,
+    prompt.scene ? `Scene: ${prompt.scene}` : undefined,
+    prompt.style ? `Style: ${prompt.style}` : undefined,
+    prompt.accent ? `Accent: ${prompt.accent}` : undefined,
+    prompt.pacing ? `Pacing: ${prompt.pacing}` : undefined,
+    prompt.sampleContext ? `Sample context: ${prompt.sampleContext}` : undefined,
+    ...(prompt.constraints ?? []).map((constraint) => `Constraint: ${constraint}`),
+  ]
+    .map((line) => trimToUndefined(line))
+    .filter((line): line is string => Boolean(line));
+  return lines.length > 0 ? lines.join("\n") : undefined;
+}
+
 function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
  handled: boolean;
  overrides?: SpeechProviderOverrides;
@@ -229,6 +260,23 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
    listVoices: async () => OPENAI_TTS_VOICES.map((voice) => ({ id: voice, name: voice })),
    isConfigured: ({ providerConfig }) =>
      Boolean(readOpenAIProviderConfig(providerConfig).apiKey || process.env.OPENAI_API_KEY),
+    prepareSynthesis: (ctx) => {
+      const config = readOpenAIProviderConfig(ctx.providerConfig);
+      if (config.instructions) {
+        return undefined;
+      }
+      const instructions = renderOpenAITtsPersonaInstructions({
+        label: ctx.persona?.label ?? ctx.persona?.id,
+        prompt: ctx.persona?.prompt,
+      });
+      return instructions
+        ? {
+            providerConfig: {
+              instructions,
+            },
+          }
+        : undefined;
+    },
    synthesize: async (req) => {
      const config = readOpenAIProviderConfig(req.providerConfig);
      const overrides = readOpenAIOverrides(req.providerOverrides);
--- a/extensions/speech-core/runtime-api.ts
+++ b/extensions/speech-core/runtime-api.ts
@@ -3,11 +3,13 @@ export {
  getLastTtsAttempt,
  getResolvedSpeechProviderConfig,
  getTtsMaxLength,
+  getTtsPersona,
  getTtsProvider,
  isSummarizationEnabled,
  isTtsEnabled,
  isTtsProviderConfigured,
  listSpeechVoices,
+  listTtsPersonas,
  maybeApplyTtsToPayload,
  resolveExplicitTtsOverrides,
  resolveTtsAutoMode,
@@ -19,6 +21,7 @@ export {
  setTtsAutoMode,
  setTtsEnabled,
  setTtsMaxLength,
+  setTtsPersona,
  setTtsProvider,
  synthesizeSpeech,
  textToSpeech,
--- a/extensions/speech-core/src/tts.test.ts
+++ b/extensions/speech-core/src/tts.test.ts
@@ -1,7 +1,12 @@
 import { rmSync } from "node:fs";
 import path from "node:path";
 import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime";
-import type { SpeechProviderPlugin, SpeechSynthesisRequest } from "openclaw/plugin-sdk/speech-core";
+import type { ReplyPayload } from "openclaw/plugin-sdk/reply-payload";
+import type {
+  SpeechProviderPlugin,
+  SpeechProviderPrepareSynthesisContext,
+  SpeechSynthesisRequest,
+} from "openclaw/plugin-sdk/speech-core";
 import { afterEach, describe, expect, it, vi } from "vitest";

 type MockSpeechSynthesisResult = Awaited<ReturnType<SpeechProviderPlugin["synthesize"]>>;
@@ -16,6 +21,9 @@ const synthesizeMock = vi.hoisted(() =>
    }),
  ),
 );
+const prepareSynthesisMock = vi.hoisted(() =>
+  vi.fn(async (_ctx: SpeechProviderPrepareSynthesisContext) => undefined),
+);

 const listSpeechProvidersMock = vi.hoisted(() => vi.fn());
 const getSpeechProviderMock = vi.hoisted(() => vi.fn());
@@ -31,6 +39,7 @@ vi.mock("../api.js", async () => {
    label: "Mock",
    autoSelectOrder: 1,
    isConfigured: () => true,
+    prepareSynthesis: prepareSynthesisMock,
    synthesize: synthesizeMock,
  };
  listSpeechProvidersMock.mockImplementation(() => [mockProvider]);
@@ -49,10 +58,40 @@ vi.mock("../api.js", async () => {
  };
 });

-const { _test, maybeApplyTtsToPayload, resolveTtsConfig } = await import("./tts.js");
+const {
+  _test,
+  getTtsPersona,
+  getTtsProvider,
+  maybeApplyTtsToPayload,
+  resolveTtsConfig,
+  synthesizeSpeech,
+  textToSpeechTelephony,
+} = await import("./tts.js");

 const nativeVoiceNoteChannels = ["discord", "feishu", "matrix", "telegram", "whatsapp"] as const;

+function createMockSpeechProvider(
+  id = "mock",
+  options: Partial<SpeechProviderPlugin> = {},
+): SpeechProviderPlugin {
+  return {
+    id,
+    label: id,
+    autoSelectOrder: id === "mock" ? 1 : 2,
+    isConfigured: () => true,
+    prepareSynthesis: prepareSynthesisMock,
+    synthesize: synthesizeMock,
+    ...options,
+  };
+}
+
+function installSpeechProviders(providers: SpeechProviderPlugin[]): void {
+  listSpeechProvidersMock.mockImplementation(() => providers);
+  getSpeechProviderMock.mockImplementation(
+    (providerId: string) => providers.find((provider) => provider.id === providerId) ?? null,
+  );
+}
+
 function createTtsConfig(prefsName: string): OpenClawConfig {
  return {
    messages: {
@@ -102,6 +141,8 @@ async function expectTtsPayloadResult(params: {
 describe("speech-core native voice-note routing", () => {
  afterEach(() => {
    synthesizeMock.mockClear();
+    prepareSynthesisMock.mockClear();
+    installSpeechProviders([createMockSpeechProvider()]);
  });

  it("keeps native voice-note channel support centralized", () => {
@@ -153,6 +194,268 @@ describe("speech-core native voice-note routing", () => {
      audioAsVoice: undefined,
    });
  });
+
+  it("selects persona preferred provider before config fallback", () => {
+    const cfg: OpenClawConfig = {
+      messages: {
+        tts: {
+          enabled: true,
+          provider: "other",
+          persona: "alfred",
+          personas: {
+            alfred: {
+              label: "Alfred",
+              provider: "mock",
+              providers: {
+                mock: {
+                  voice: "Algieba",
+                },
+              },
+            },
+          },
+        },
+      },
+    };
+    const config = resolveTtsConfig(cfg);
+    const prefsPath = "/tmp/openclaw-speech-core-persona-provider.json";
+
+    expect(getTtsPersona(config, prefsPath)?.id).toBe("alfred");
+    expect(getTtsProvider(config, prefsPath)).toBe("mock");
+  });
+
+  it("merges active persona provider binding into synthesis config", async () => {
+    const cfg: OpenClawConfig = {
+      messages: {
+        tts: {
+          enabled: true,
+          provider: "mock",
+          prefsPath: "/tmp/openclaw-speech-core-persona-merge.json",
+          providers: {
+            mock: {
+              model: "base-model",
+              voice: "base-voice",
+            },
+          },
+          persona: "alfred",
+          personas: {
+            alfred: {
+              provider: "mock",
+              providers: {
+                mock: {
+                  voice: "persona-voice",
+                  style: "dry",
+                },
+              },
+            },
+          },
+        },
+      },
+    };
+
+    const payload: ReplyPayload = {
+      text: "This reply should use persona-specific provider configuration.",
+    };
+
+    let mediaDir: string | undefined;
+    try {
+      const result = await maybeApplyTtsToPayload({
+        payload,
+        cfg,
+        channel: "slack",
+        kind: "final",
+      });
+
+      expect(synthesizeMock).toHaveBeenCalledWith(
+        expect.objectContaining({
+          providerConfig: expect.objectContaining({
+            model: "base-model",
+            voice: "persona-voice",
+            style: "dry",
+          }),
+        }),
+      );
+      expect(result.mediaUrl).toMatch(/voice-\d+\.ogg$/);
+
+      mediaDir = result.mediaUrl ? path.dirname(result.mediaUrl) : undefined;
+    } finally {
+      if (mediaDir) {
+        rmSync(mediaDir, { recursive: true, force: true });
+      }
+    }
+  });
+
+  it("does not mark skipped unregistered providers as missing persona bindings", async () => {
+    const result = await synthesizeSpeech({
+      text: "Use fallback provider.",
+      cfg: {
+        messages: {
+          tts: {
+            enabled: true,
+            provider: "missing",
+            persona: "alfred",
+            personas: {
+              alfred: {
+                providers: {
+                  missing: {
+                    voice: "configured-but-unregistered",
+                  },
+                },
+              },
+            },
+          },
+        },
+      },
+    });
+
+    expect(result.success).toBe(true);
+    expect(result.attempts?.[0]).toMatchObject({
+      provider: "missing",
+      outcome: "skipped",
+      reasonCode: "no_provider_registered",
+      persona: "alfred",
+    });
+    expect(result.attempts?.[0]).not.toHaveProperty("personaBinding");
+  });
+
+  it("does not mark skipped telephony providers as missing persona bindings", async () => {
+    const result = await textToSpeechTelephony({
+      text: "Use telephony provider.",
+      cfg: {
+        messages: {
+          tts: {
+            enabled: true,
+            provider: "mock",
+            persona: "alfred",
+            personas: {
+              alfred: {
+                providers: {
+                  mock: {
+                    voice: "persona-voice",
+                  },
+                },
+              },
+            },
+          },
+        },
+      },
+    });
+
+    expect(result.success).toBe(false);
+    expect(result.attempts?.[0]).toMatchObject({
+      provider: "mock",
+      outcome: "skipped",
+      reasonCode: "unsupported_for_telephony",
+      persona: "alfred",
+    });
+    expect(result.attempts?.[0]).not.toHaveProperty("personaBinding");
+  });
+
+  it("uses provider defaults when fallback policy allows missing persona bindings", async () => {
+    await synthesizeSpeech({
+      text: "Use neutral provider defaults.",
+      cfg: {
+        messages: {
+          tts: {
+            enabled: true,
+            provider: "mock",
+            persona: "alfred",
+            personas: {
+              alfred: {
+                fallbackPolicy: "provider-defaults",
+                prompt: {
+                  profile: "A precise butler.",
+                },
+              },
+            },
+          },
+        },
+      },
+    });
+
+    expect(prepareSynthesisMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        persona: undefined,
+        personaProviderConfig: undefined,
+      }),
+    );
+  });
+
+  it("preserves persona prompts by default when provider bindings are missing", async () => {
+    await synthesizeSpeech({
+      text: "Use persona prompt.",
+      cfg: {
+        messages: {
+          tts: {
+            enabled: true,
+            provider: "mock",
+            persona: "alfred",
+            personas: {
+              alfred: {
+                prompt: {
+                  profile: "A precise butler.",
+                },
+              },
+            },
+          },
+        },
+      },
+    });
+
+    expect(prepareSynthesisMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        persona: expect.objectContaining({ id: "alfred" }),
+        personaProviderConfig: undefined,
+      }),
+    );
+  });
+
+  it("skips unbound providers under fail policy while allowing bound fallbacks", async () => {
+    installSpeechProviders([
+      createMockSpeechProvider("mock", { autoSelectOrder: 1 }),
+      createMockSpeechProvider("fallback", { autoSelectOrder: 2 }),
+    ]);
+
+    const result = await synthesizeSpeech({
+      text: "Use the first persona-bound provider.",
+      cfg: {
+        messages: {
+          tts: {
+            enabled: true,
+            provider: "mock",
+            persona: "alfred",
+            personas: {
+              alfred: {
+                fallbackPolicy: "fail",
+                providers: {
+                  fallback: {
+                    voice: "fallback-voice",
+                  },
+                },
+              },
+            },
+          },
+        },
+      },
+    });
+
+    expect(result.success).toBe(true);
+    expect(result.provider).toBe("fallback");
+    expect(result.fallbackFrom).toBe("mock");
+    expect(result.attempts?.[0]).toMatchObject({
+      provider: "mock",
+      outcome: "skipped",
+      reasonCode: "not_configured",
+      persona: "alfred",
+      personaBinding: "missing",
+      error: "mock: persona alfred has no provider binding",
+    });
+    expect(result.attempts?.[1]).toMatchObject({
+      provider: "fallback",
+      outcome: "success",
+      persona: "alfred",
+      personaBinding: "applied",
+    });
+  });
 });

 describe("speech-core per-agent TTS config", () => {
--- a/extensions/speech-core/src/tts.ts
+++ b/extensions/speech-core/src/tts.ts
@@ -12,6 +12,7 @@ import path from "node:path";
 import { normalizeChannelId, type ChannelId } from "openclaw/plugin-sdk/channel-targets";
 import type {
  OpenClawConfig,
+  ResolvedTtsPersona,
  TtsAutoMode,
  TtsConfig,
  TtsModelOverrideConfig,
@@ -40,6 +41,7 @@ import {
  normalizeSpeechProviderId,
  normalizeTtsAutoMode,
  parseTtsDirectives,
+  resolveEffectiveTtsConfig,
  type ResolvedTtsConfig,
  type ResolvedTtsModelOverrides,
  scheduleCleanup,
@@ -62,13 +64,13 @@ const DEFAULT_TIMEOUT_MS = 30_000;
 const DEFAULT_TTS_MAX_LENGTH = 1500;
 const DEFAULT_TTS_SUMMARIZE = true;
 const DEFAULT_MAX_TEXT_LENGTH = 4096;
-const BLOCKED_MERGE_KEYS = new Set(["__proto__", "prototype", "constructor"]);

 type TtsUserPrefs = {
  tts?: {
    auto?: TtsAutoMode;
    enabled?: boolean;
    provider?: TtsProvider;
+    persona?: string | null;
    maxLength?: number;
    summarize?: boolean;
  };
@@ -86,6 +88,8 @@ export type TtsProviderAttempt = {
  provider: string;
  outcome: "success" | "skipped" | "failed";
  reasonCode: TtsAttemptReasonCode;
+  persona?: string;
+  personaBinding?: "applied" | "missing" | "none";
  latencyMs?: number;
  error?: string;
 };
@@ -96,6 +100,7 @@ export type TtsResult = {
  error?: string;
  latencyMs?: number;
  provider?: string;
+  persona?: string;
  fallbackFrom?: string;
  attemptedProviders?: string[];
  attempts?: TtsProviderAttempt[];
@@ -111,6 +116,7 @@ export type TtsSynthesisResult = {
  error?: string;
  latencyMs?: number;
  provider?: string;
+  persona?: string;
  fallbackFrom?: string;
  attemptedProviders?: string[];
  attempts?: TtsProviderAttempt[];
@@ -126,6 +132,7 @@ export type TtsTelephonyResult = {
  error?: string;
  latencyMs?: number;
  provider?: string;
+  persona?: string;
  fallbackFrom?: string;
  attemptedProviders?: string[];
  attempts?: TtsProviderAttempt[];
@@ -139,6 +146,7 @@ type TtsStatusEntry = {
  textLength: number;
  summarized: boolean;
  provider?: string;
+  persona?: string;
  fallbackFrom?: string;
  attemptedProviders?: string[];
  attempts?: TtsProviderAttempt[];
@@ -162,6 +170,10 @@ function normalizeConfiguredSpeechProviderId(
  return normalized === "edge" ? "microsoft" : normalized;
 }

+function normalizeTtsPersonaId(personaId: string | null | undefined): string | undefined {
+  return normalizeOptionalLowercaseString(personaId ?? undefined); // null is treated as unset
+}
+
 function resolveTtsPrefsPathValue(prefsPath: string | undefined): string {
  if (prefsPath?.trim()) {
    return resolveUserPath(prefsPath.trim());
@@ -229,6 +241,87 @@ function asProviderConfigMap(value: unknown): Record<string, unknown> {
    : {};
 }

+function hasOwnProperty(value: object, key: string): boolean {
+  return Object.getOwnPropertyDescriptor(value, key) !== undefined; // own keys only
+}
+
+function normalizeProviderConfigMap(
+  value: unknown,
+): Record<string, SpeechProviderConfig> | undefined {
+  const entries = Object.entries(asProviderConfigMap(value));
+  if (entries.length === 0) {
+    return undefined; // no provider bindings present
+  }
+  const byProvider: Record<string, SpeechProviderConfig> = {};
+  // Key each binding by its normalized provider id; a later duplicate replaces an earlier one.
+  entries.forEach(([rawId, rawConfig]) => {
+    byProvider[normalizeConfiguredSpeechProviderId(rawId) ?? rawId] = asProviderConfig(rawConfig);
+  });
+  return byProvider;
+}
+
+function collectTtsPersonas(raw: TtsConfig): Record<string, ResolvedTtsPersona> {
+  const byId: Record<string, ResolvedTtsPersona> = {};
+  Object.entries(asProviderConfigMap(raw.personas)).forEach(([rawId, entry]) => {
+    const id = normalizeTtsPersonaId(rawId);
+    const isPlainRecord = typeof entry === "object" && entry !== null && !Array.isArray(entry);
+    if (!id || !isPlainRecord) {
+      return; // skip entries with an empty id or a non-object definition
+    }
+    const persona = entry as Omit<ResolvedTtsPersona, "id">;
+    byId[id] = {
+      ...persona,
+      id,
+      provider: normalizeConfiguredSpeechProviderId(persona.provider) ?? persona.provider,
+      providers: normalizeProviderConfigMap(persona.providers),
+    };
+  });
+  return byId;
+}
+
+// Returns the persona's own binding for `providerId`, or undefined when the persona is absent
+// or has no own entry for it. Lookups are own-property checks, so keys that only exist on the
+// prototype chain never match.
+function resolvePersonaProviderConfig(
+  persona: ResolvedTtsPersona | undefined,
+  providerId: string,
+): SpeechProviderConfig | undefined {
+  const bindings = persona?.providers;
+  if (!bindings) {
+    return undefined;
+  }
+  // Try the normalized provider id first, then the id exactly as the caller passed it.
+  const candidates = [normalizeConfiguredSpeechProviderId(providerId) ?? providerId, providerId];
+  const match = candidates.find((key) => hasOwnProperty(bindings, key));
+  return match === undefined ? undefined : bindings[match];
+}
+
+// Layers the active persona's provider binding (if any) over the provider's base config and
+// reports how the persona applied via `personaBinding`. The base config is returned unchanged
+// unless a binding exists.
+function mergeProviderConfigWithPersona(params: {
+  providerConfig: SpeechProviderConfig;
+  persona?: ResolvedTtsPersona;
+  providerId: string;
+}): {
+  providerConfig: SpeechProviderConfig;
+  personaProviderConfig?: SpeechProviderConfig;
+  personaBinding: "applied" | "missing" | "none";
+} {
+  const { providerConfig: baseConfig, persona, providerId } = params;
+  // "none": no active persona; "missing": persona lacks a binding for this provider.
+  const binding = persona ? resolvePersonaProviderConfig(persona, providerId) : undefined;
+  if (!persona || !binding) {
+    return { providerConfig: baseConfig, personaBinding: persona ? "missing" : "none" };
+  }
+  // "applied": the persona binding is shallow-merged over the base config, winning on conflicts.
+  return {
+    providerConfig: { ...baseConfig, ...binding },
+    personaProviderConfig: binding,
+    personaBinding: "applied",
+  };
+}
+
 function resolveRawProviderConfig(
  raw: TtsConfig | undefined,
  providerId: string,
@@ -241,48 +334,6 @@ function resolveRawProviderConfig(
  return asProviderConfig(direct);
 }

-function isPlainObject(value: unknown): value is Record<string, unknown> {
-  return Boolean(value) && typeof value === "object" && !Array.isArray(value);
-}
-
-function deepMergeDefined(base: unknown, override: unknown): unknown {
-  if (!isPlainObject(base) || !isPlainObject(override)) {
-    return override === undefined ? base : override;
-  }
-
-  const result: Record<string, unknown> = { ...base };
-  for (const [key, value] of Object.entries(override)) {
-    if (BLOCKED_MERGE_KEYS.has(key) || value === undefined) {
-      continue;
-    }
-    const existing = result[key];
-    result[key] = key in result ? deepMergeDefined(existing, value) : value;
-  }
-  return result;
-}
-
-function normalizeAgentConfigId(value: string | undefined | null): string {
-  return normalizeLowercaseStringOrEmpty(value);
-}
-
-function resolveAgentTtsOverride(
-  cfg: OpenClawConfig,
-  agentId: string | undefined,
-): TtsConfig | undefined {
-  if (!agentId || !Array.isArray(cfg.agents?.list)) {
-    return undefined;
-  }
-  const normalized = normalizeAgentConfigId(agentId);
-  const agent = cfg.agents.list.find((entry) => normalizeAgentConfigId(entry.id) === normalized);
-  return agent?.tts;
-}
-
-function resolveEffectiveTtsRawConfig(cfg: OpenClawConfig, agentId?: string): TtsConfig {
-  const base = cfg.messages?.tts ?? {};
-  const override = resolveAgentTtsOverride(cfg, agentId);
-  return deepMergeDefined(base, override ?? {}) as TtsConfig;
-}
-
 function resolveLazyProviderConfig(
  config: ResolvedTtsConfig,
  providerId: string,
@@ -325,6 +376,8 @@ function collectDirectProviderConfigEntries(raw: TtsConfig): Record<string, Spee
    "maxTextLength",
    "mode",
    "modelOverrides",
+    "persona",
+    "personas",
    "prefsPath",
    "provider",
    "providers",
@@ -357,10 +410,11 @@ export function getResolvedSpeechProviderConfig(
 }

 export function resolveTtsConfig(cfg: OpenClawConfig, agentId?: string): ResolvedTtsConfig {
-  const raw: TtsConfig = resolveEffectiveTtsRawConfig(cfg, agentId);
+  const raw: TtsConfig = resolveEffectiveTtsConfig(cfg, agentId);
  const providerSource = raw.provider ? "config" : "default";
  const timeoutMs = raw.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const auto = resolveConfiguredTtsAutoMode(raw);
+  const persona = normalizeTtsPersonaId(raw.persona);
  return {
    auto,
    mode: raw.mode ?? "final",
@@ -368,6 +422,8 @@ export function resolveTtsConfig(cfg: OpenClawConfig, agentId?: string): Resolve
      normalizeConfiguredSpeechProviderId(raw.provider) ??
      (providerSource === "config" ? (normalizeOptionalLowercaseString(raw.provider) ?? "") : ""),
    providerSource,
+    persona,
+    personas: collectTtsPersonas(raw),
    summaryModel: normalizeOptionalString(raw.summaryModel),
    modelOverrides: resolveModelOverridePolicy(raw.modelOverrides),
    providerConfigs: collectDirectProviderConfigEntries(raw),
@@ -418,7 +474,7 @@ function resolveEffectiveTtsAutoState(params: {
  autoMode: TtsAutoMode;
  prefsPath: string;
 } {
-  const raw: TtsConfig = resolveEffectiveTtsRawConfig(params.cfg, params.agentId);
+  const raw: TtsConfig = resolveEffectiveTtsConfig(params.cfg, params.agentId);
  const prefsPath = resolveTtsPrefsPathValue(raw.prefsPath);
  const sessionAuto = normalizeTtsAutoMode(params.sessionAuto);
  if (sessionAuto) {
@@ -443,6 +499,7 @@ export function buildTtsSystemPromptHint(
    return undefined;
  }
  const _config = resolveTtsConfig(cfg, agentId);
+  const persona = getTtsPersona(_config, prefsPath);
  const maxLength = getTtsMaxLength(prefsPath);
  const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off";
  const autoHint =
@@ -454,6 +511,9 @@ export function buildTtsSystemPromptHint(
  return [
    "Voice (TTS) is enabled.",
    autoHint,
+    persona
+      ? `Active TTS persona: ${persona.label ?? persona.id}${persona.description ? ` - ${persona.description}` : ""}.`
+      : undefined,
    `Keep spoken text ≤${maxLength} chars to avoid auto-summary (summary ${summarize}).`,
    "Use [[tts:...]] and optional [[tts:text]]...[[/tts:text]] to control voice/expressiveness.",
  ]
@@ -523,6 +583,13 @@ export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): Tt
  if (prefsProvider) {
    return prefsProvider;
  }
+  const activePersona = resolveTtsPersonaFromPrefs(config, prefs);
+  const personaProvider =
+    canonicalizeSpeechProviderId(activePersona?.provider, config.sourceConfig) ??
+    normalizeConfiguredSpeechProviderId(activePersona?.provider);
+  if (personaProvider && getSpeechProvider(personaProvider, config.sourceConfig)) {
+    return personaProvider;
+  }
  if (config.providerSource === "config") {
    return normalizeConfiguredSpeechProviderId(config.provider) ?? config.provider;
  }
@@ -542,6 +609,38 @@ export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): Tt
  return config.provider;
 }

+function resolveTtsPersonaFromPrefs(
+  config: ResolvedTtsConfig,
+  prefs: TtsUserPrefs,
+): ResolvedTtsPersona | undefined {
+  // A persona key stored in prefs (even null, which yields no persona) overrides the config.
+  const fromPrefs = prefs.tts ? hasOwnProperty(prefs.tts, "persona") : false;
+  const id = normalizeTtsPersonaId(fromPrefs ? prefs.tts?.persona : config.persona);
+  // Own-property lookup: ids such as "constructor" must not resolve to Object.prototype members.
+  const known = id ? hasOwnProperty(config.personas, id) : false;
+  return id && known ? config.personas[id] : undefined;
+}
+
+export function getTtsPersona(
+  config: ResolvedTtsConfig,
+  prefsPath: string,
+): ResolvedTtsPersona | undefined {
+  return resolveTtsPersonaFromPrefs(config, readPrefs(prefsPath)); // stored prefs override config
+}
+
+export function listTtsPersonas(config: ResolvedTtsConfig): ResolvedTtsPersona[] {
+  return Object.values(config.personas).sort((a, b) => a.id.localeCompare(b.id)); // fresh array
+}
+
+export function setTtsPersona(prefsPath: string, persona: string | null | undefined): void {
+  updatePrefs(prefsPath, (prefs) => {
+    // null is stored (not omitted) when the id normalizes to nothing, so the key stays present
+    // in prefs and the prefs entry keeps overriding the configured persona.
+    const stored = normalizeTtsPersonaId(persona) ?? null;
+    prefs.tts = { ...prefs.tts, persona: stored };
+  });
+}
+
 export function setTtsProvider(prefsPath: string, provider: TtsProvider): void {
  updatePrefs(prefsPath, (prefs) => {
    prefs.tts = { ...prefs.tts, provider: canonicalizeSpeechProviderId(provider) ?? provider };
@@ -714,17 +813,20 @@ function buildTtsFailureResult(
  errors: string[],
  attemptedProviders?: string[],
  attempts?: TtsProviderAttempt[],
+  persona?: string,
 ): {
  success: false;
  error: string;
  attemptedProviders?: string[];
  attempts?: TtsProviderAttempt[];
+  persona?: string;
 } {
  return {
    success: false,
    error: `TTS conversion failed: ${errors.join("; ") || "no providers available"}`,
    attemptedProviders,
    attempts,
+    persona,
  };
 }

@@ -733,17 +835,22 @@ type TtsProviderReadyResolution =
      kind: "ready";
      provider: NonNullable<ReturnType<typeof getSpeechProvider>>;
      providerConfig: SpeechProviderConfig;
+      personaProviderConfig?: SpeechProviderConfig;
+      synthesisPersona?: ResolvedTtsPersona;
+      personaBinding: "applied" | "missing" | "none";
    }
  | {
      kind: "skip";
      reasonCode: "no_provider_registered" | "not_configured" | "unsupported_for_telephony";
      message: string;
+      personaBinding?: "missing";
    };

 function resolveReadySpeechProvider(params: {
  provider: TtsProvider;
  cfg: OpenClawConfig;
  config: ResolvedTtsConfig;
+  persona?: ResolvedTtsPersona;
  requireTelephony?: boolean;
 }): TtsProviderReadyResolution {
  const resolvedProvider = getSpeechProvider(params.provider, params.cfg);
@@ -759,10 +866,23 @@ function resolveReadySpeechProvider(params: {
    resolvedProvider.id,
    params.cfg,
  );
+  const merged = mergeProviderConfigWithPersona({
+    providerConfig,
+    persona: params.persona,
+    providerId: resolvedProvider.id,
+  });
+  if (params.persona?.fallbackPolicy === "fail" && merged.personaBinding === "missing") {
+    return {
+      kind: "skip",
+      reasonCode: "not_configured",
+      message: `${params.provider}: persona ${params.persona.id} has no provider binding`,
+      personaBinding: "missing",
+    };
+  }
  if (
    !resolvedProvider.isConfigured({
      cfg: params.cfg,
-      providerConfig,
+      providerConfig: merged.providerConfig,
      timeoutMs: params.config.timeoutMs,
    })
  ) {
@@ -782,7 +902,56 @@ function resolveReadySpeechProvider(params: {
  return {
    kind: "ready",
    provider: resolvedProvider,
-    providerConfig,
+    providerConfig: merged.providerConfig,
+    personaProviderConfig: merged.personaProviderConfig,
+    synthesisPersona:
+      params.persona?.fallbackPolicy === "provider-defaults" && merged.personaBinding === "missing"
+        ? undefined
+        : params.persona,
+    personaBinding: merged.personaBinding,
+  };
+}
+
+async function prepareSpeechSynthesis(params: {
+  provider: NonNullable<ReturnType<typeof getSpeechProvider>>;
+  text: string;
+  cfg: OpenClawConfig;
+  providerConfig: SpeechProviderConfig;
+  providerOverrides?: SpeechProviderOverrides;
+  persona?: ResolvedTtsPersona;
+  personaProviderConfig?: SpeechProviderConfig;
+  target: "audio-file" | "voice-note" | "telephony";
+  timeoutMs: number;
+}): Promise<{
+  text: string;
+  providerConfig: SpeechProviderConfig;
+  providerOverrides?: SpeechProviderOverrides;
+}> {
+  const { text, providerConfig, providerOverrides } = params;
+  if (!params.provider.prepareSynthesis) {
+    return { text, providerConfig, providerOverrides };
+  }
+  const prepared = await params.provider.prepareSynthesis({
+    text,
+    cfg: params.cfg,
+    providerConfig,
+    providerOverrides,
+    persona: params.persona,
+    personaProviderConfig: params.personaProviderConfig,
+    target: params.target,
+    timeoutMs: params.timeoutMs,
+  });
+  // Hook output is layered over the inputs; config and overrides are shallow-merged when set.
+  const mergedConfig = prepared?.providerConfig
+    ? { ...providerConfig, ...prepared.providerConfig }
+    : providerConfig;
+  const mergedOverrides = prepared?.providerOverrides
+    ? { ...providerOverrides, ...prepared.providerOverrides }
+    : providerOverrides;
+  return {
+    text: prepared?.text ?? text,
+    providerConfig: mergedConfig,
+    providerOverrides: mergedOverrides,
   };
 }

@@ -796,6 +965,7 @@ function resolveTtsRequestSetup(params: {
 }):
  | {
      config: ResolvedTtsConfig;
+      persona?: ResolvedTtsPersona;
      providers: TtsProvider[];
    }
  | {
@@ -814,6 +984,7 @@ function resolveTtsRequestSetup(params: {
    canonicalizeSpeechProviderId(params.providerOverride, params.cfg) ?? userProvider;
  return {
    config,
+    persona: getTtsPersona(config, prefsPath),
    providers: params.disableFallback ? [provider] : resolveTtsProviderOrder(provider, params.cfg),
  };
 }
@@ -833,6 +1004,7 @@ export async function textToSpeech(params: {
    return {
      success: false,
      error: synthesis.error ?? "TTS conversion failed",
+      persona: synthesis.persona,
      attemptedProviders: synthesis.attemptedProviders,
      attempts: synthesis.attempts,
    };
@@ -850,6 +1022,7 @@ export async function textToSpeech(params: {
    audioPath,
    latencyMs: synthesis.latencyMs,
    provider: synthesis.provider,
+    persona: synthesis.persona,
    fallbackFrom: synthesis.fallbackFrom,
    attemptedProviders: synthesis.attemptedProviders,
    attempts: synthesis.attempts,
@@ -886,7 +1059,7 @@ export async function synthesizeSpeech(params: {
    return { success: false, error: setup.error };
  }

-  const { config, providers } = setup;
+  const { config, persona, providers } = setup;
  const timeoutMs = params.timeoutMs ?? config.timeoutMs;
  const target = supportsNativeVoiceNoteTts(params.channel) ? "voice-note" : "audio-file";

@@ -906,6 +1079,7 @@ export async function synthesizeSpeech(params: {
        provider,
        cfg: params.cfg,
        config,
+        persona,
      });
      if (resolvedProvider.kind === "skip") {
        errors.push(resolvedProvider.message);
@@ -913,17 +1087,32 @@ export async function synthesizeSpeech(params: {
          provider,
          outcome: "skipped",
          reasonCode: resolvedProvider.reasonCode,
+          persona: persona?.id,
+          ...(resolvedProvider.personaBinding
+            ? { personaBinding: resolvedProvider.personaBinding }
+            : {}),
          error: resolvedProvider.message,
        });
        logVerbose(`TTS: provider ${provider} skipped (${resolvedProvider.message})`);
        continue;
      }
-      const synthesis = await resolvedProvider.provider.synthesize({
+      const prepared = await prepareSpeechSynthesis({
+        provider: resolvedProvider.provider,
        text: params.text,
        cfg: params.cfg,
        providerConfig: resolvedProvider.providerConfig,
-        target,
        providerOverrides: params.overrides?.providerOverrides?.[resolvedProvider.provider.id],
+        persona: resolvedProvider.synthesisPersona,
+        personaProviderConfig: resolvedProvider.personaProviderConfig,
+        target,
+        timeoutMs,
+      });
+      const synthesis = await resolvedProvider.provider.synthesize({
+        text: prepared.text,
+        cfg: params.cfg,
+        providerConfig: prepared.providerConfig,
+        target,
+        providerOverrides: prepared.providerOverrides,
        timeoutMs,
      });
      const latencyMs = Date.now() - providerStart;
@@ -931,6 +1120,8 @@ export async function synthesizeSpeech(params: {
        provider,
        outcome: "success",
        reasonCode: "success",
+        persona: persona?.id,
+        personaBinding: resolvedProvider.personaBinding,
        latencyMs,
      });
      return {
@@ -938,6 +1129,7 @@ export async function synthesizeSpeech(params: {
        audioBuffer: synthesis.audioBuffer,
        latencyMs,
        provider,
+        persona: persona?.id,
        fallbackFrom: provider !== primaryProvider ? primaryProvider : undefined,
        attemptedProviders,
        attempts,
@@ -956,6 +1148,13 @@ export async function synthesizeSpeech(params: {
        reasonCode:
          err instanceof Error && err.name === "AbortError" ? "timeout" : "provider_error",
        latencyMs,
+        persona: persona?.id,
+        personaBinding:
+          resolvePersonaProviderConfig(persona, provider) != null
+            ? "applied"
+            : persona
+              ? "missing"
+              : "none",
        error: errorMsg,
      });
      const rawError = sanitizeTtsErrorForLog(err);
@@ -970,7 +1169,7 @@ export async function synthesizeSpeech(params: {
    }
  }

-  return buildTtsFailureResult(errors, attemptedProviders, attempts);
+  return buildTtsFailureResult(errors, attemptedProviders, attempts, persona?.id);
 }

 export async function textToSpeechTelephony(params: {
@@ -987,7 +1186,7 @@ export async function textToSpeechTelephony(params: {
    return { success: false, error: setup.error };
  }

-  const { config, providers } = setup;
+  const { config, persona, providers } = setup;
  const errors: string[] = [];
  const attemptedProviders: string[] = [];
  const attempts: TtsProviderAttempt[] = [];
@@ -1004,6 +1203,7 @@ export async function textToSpeechTelephony(params: {
        provider,
        cfg: params.cfg,
        config,
+        persona,
        requireTelephony: true,
      });
      if (resolvedProvider.kind === "skip") {
@@ -1012,28 +1212,32 @@ export async function textToSpeechTelephony(params: {
          provider,
          outcome: "skipped",
          reasonCode: resolvedProvider.reasonCode,
+          persona: persona?.id,
+          ...(resolvedProvider.personaBinding
+            ? { personaBinding: resolvedProvider.personaBinding }
+            : {}),
          error: resolvedProvider.message,
        });
        logVerbose(`TTS telephony: provider ${provider} skipped (${resolvedProvider.message})`);
        continue;
      }
-      const synthesizeTelephony = resolvedProvider.provider.synthesizeTelephony;
-      if (!synthesizeTelephony) {
-        const message = `${provider}: unsupported for telephony`;
-        errors.push(message);
-        attempts.push({
-          provider,
-          outcome: "skipped",
-          reasonCode: "unsupported_for_telephony",
-          error: message,
-        });
-        logVerbose(`TTS telephony: provider ${provider} skipped (${message})`);
-        continue;
-      }
-      const synthesis = await synthesizeTelephony({
+      const synthesizeTelephony = resolvedProvider.provider.synthesizeTelephony as NonNullable<
+        typeof resolvedProvider.provider.synthesizeTelephony
+      >;
+      const prepared = await prepareSpeechSynthesis({
+        provider: resolvedProvider.provider,
        text: params.text,
        cfg: params.cfg,
        providerConfig: resolvedProvider.providerConfig,
+        persona: resolvedProvider.synthesisPersona,
+        personaProviderConfig: resolvedProvider.personaProviderConfig,
+        target: "telephony",
+        timeoutMs: config.timeoutMs,
+      });
+      const synthesis = await synthesizeTelephony({
+        text: prepared.text,
+        cfg: params.cfg,
+        providerConfig: prepared.providerConfig,
        timeoutMs: config.timeoutMs,
      });
      const latencyMs = Date.now() - providerStart;
@@ -1041,6 +1245,8 @@ export async function textToSpeechTelephony(params: {
        provider,
        outcome: "success",
        reasonCode: "success",
+        persona: persona?.id,
+        personaBinding: resolvedProvider.personaBinding,
        latencyMs,
      });

@@ -1049,6 +1255,7 @@ export async function textToSpeechTelephony(params: {
        audioBuffer: synthesis.audioBuffer,
        latencyMs,
        provider,
+        persona: persona?.id,
        fallbackFrom: provider !== primaryProvider ? primaryProvider : undefined,
        attemptedProviders,
        attempts,
@@ -1065,6 +1272,13 @@ export async function textToSpeechTelephony(params: {
        reasonCode:
          err instanceof Error && err.name === "AbortError" ? "timeout" : "provider_error",
        latencyMs,
+        persona: persona?.id,
+        personaBinding:
+          resolvePersonaProviderConfig(persona, provider) != null
+            ? "applied"
+            : persona
+              ? "missing"
+              : "none",
        error: errorMsg,
      });
      const rawError = sanitizeTtsErrorForLog(err);
@@ -1079,7 +1293,7 @@ export async function textToSpeechTelephony(params: {
    }
  }

-  return buildTtsFailureResult(errors, attemptedProviders, attempts);
+  return buildTtsFailureResult(errors, attemptedProviders, attempts, persona?.id);
 }

 export async function listSpeechVoices(params: {
@@ -1250,6 +1464,7 @@ export async function maybeApplyTtsToPayload(params: {
      textLength: text.length,
      summarized: wasSummarized,
      provider: result.provider,
+      persona: result.persona,
      fallbackFrom: result.fallbackFrom,
      attemptedProviders: result.attemptedProviders,
      attempts: result.attempts,
@@ -1268,6 +1483,7 @@ export async function maybeApplyTtsToPayload(params: {
    success: false,
    textLength: text.length,
    summarized: wasSummarized,
+    persona: result.persona,
    attemptedProviders: result.attemptedProviders,
    attempts: result.attempts,
    error: result.error,
--- a/extensions/xai/speech-provider.ts
+++ b/extensions/xai/speech-provider.ts
@@ -6,6 +6,7 @@ import {
  type SpeechProviderConfig,
  type SpeechProviderOverrides,
  type SpeechProviderPlugin,
+  type SpeechSynthesisTarget,
 } from "openclaw/plugin-sdk/speech";
 import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtime";
 import {
@@ -48,7 +49,7 @@ function normalizeXaiSpeechResponseFormat(value: unknown): XaiSpeechResponseForm
 }

 function resolveSpeechResponseFormat(
-  target: "audio-file" | "voice-note",
+  target: SpeechSynthesisTarget,
  configuredFormat?: XaiSpeechResponseFormat,
 ): XaiSpeechResponseFormat {
  if (configuredFormat) {