TTS: add provider personas

2026-05-06 10:40:43 +00:00 · 2026-04-23 07:26:32 -07:00
parent 80219ed1b3
commit 0594fa3c4d
39 changed files with 2021 additions and 136 deletions
--- a/extensions/google/speech-provider.test.ts
+++ b/extensions/google/speech-provider.test.ts
@@ -1,5 +1,8 @@
-import * as providerHttp from "openclaw/plugin-sdk/provider-http";
-import { afterEach, describe, expect, it, vi } from "vitest";
+import { afterEach, beforeAll, describe, expect, it, vi } from "vitest";
+import {
+  getProviderHttpMocks,
+  installProviderHttpMockCleanup,
+} from "../../test/helpers/media-generation/provider-http-mocks.js";

 const transcodeAudioBufferToOpusMock = vi.hoisted(() => vi.fn());

@@ -7,10 +10,23 @@ vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
  transcodeAudioBufferToOpus: transcodeAudioBufferToOpusMock,
 }));

-import { buildGoogleSpeechProvider, __testing } from "./speech-provider.js";
+const {
+  assertOkOrThrowProviderErrorMock,
+  postJsonRequestMock,
+  resolveProviderHttpRequestConfigMock,
+} = getProviderHttpMocks();

-function installGoogleTtsFetchMock(pcm = Buffer.from([1, 0, 2, 0])) {
-  const fetchMock = vi.fn().mockResolvedValue({
+let buildGoogleSpeechProvider: typeof import("./speech-provider.js").buildGoogleSpeechProvider;
+let __testing: typeof import("./speech-provider.js").__testing;
+
+beforeAll(async () => {
+  ({ buildGoogleSpeechProvider, __testing } = await import("./speech-provider.js"));
+});
+
+installProviderHttpMockCleanup();
+
+function googleTtsResponse(pcm = Buffer.from([1, 0, 2, 0])) {
+  return {
    ok: true,
    json: async () => ({
      candidates: [
@@ -28,21 +44,26 @@ function installGoogleTtsFetchMock(pcm = Buffer.from([1, 0, 2, 0])) {
        },
      ],
    }),
+  };
+}
+
+function installGoogleTtsRequestMock(pcm = Buffer.from([1, 0, 2, 0])) {
+  postJsonRequestMock.mockResolvedValue({
+    response: googleTtsResponse(pcm),
+    release: vi.fn(async () => {}),
  });
-  vi.stubGlobal("fetch", fetchMock);
-  return fetchMock;
+  return postJsonRequestMock;
 }

 describe("Google speech provider", () => {
  afterEach(() => {
-    vi.restoreAllMocks();
    vi.unstubAllGlobals();
    vi.unstubAllEnvs();
    transcodeAudioBufferToOpusMock.mockReset();
  });

  it("synthesizes Gemini PCM as WAV and preserves audio tags in the request text", async () => {
-    const fetchMock = installGoogleTtsFetchMock();
+    const requestMock = installGoogleTtsRequestMock();
    const provider = buildGoogleSpeechProvider();

    const result = await provider.synthesize({
@@ -57,11 +78,10 @@ describe("Google speech provider", () => {
      timeoutMs: 12_345,
    });

-    expect(fetchMock).toHaveBeenCalledWith(
-      "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent",
+    expect(requestMock).toHaveBeenCalledWith(
      expect.objectContaining({
-        method: "POST",
-        body: JSON.stringify({
+        url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent",
+        body: {
          contents: [
            {
              role: "user",
@@ -78,11 +98,14 @@ describe("Google speech provider", () => {
              },
            },
          },
-        }),
+        },
+        fetchFn: fetch,
+        pinDns: false,
+        timeoutMs: 12_345,
      }),
    );
-    const [, init] = fetchMock.mock.calls[0];
-    expect(new Headers(init.headers).get("x-goog-api-key")).toBe("google-test-key");
+    const request = requestMock.mock.calls[0]?.[0] as { headers?: HeadersInit };
+    expect(new Headers(request.headers).get("x-goog-api-key")).toBe("google-test-key");
    expect(result.outputFormat).toBe("wav");
    expect(result.fileExtension).toBe(".wav");
    expect(result.voiceCompatible).toBe(false);
@@ -94,7 +117,7 @@ describe("Google speech provider", () => {
  });

  it("transcodes Gemini PCM to Opus for voice-note targets", async () => {
-    installGoogleTtsFetchMock(Buffer.from([5, 0, 6, 0]));
+    installGoogleTtsRequestMock(Buffer.from([5, 0, 6, 0]));
    transcodeAudioBufferToOpusMock.mockResolvedValueOnce(Buffer.from("google-opus"));
    const provider = buildGoogleSpeechProvider();

@@ -125,9 +148,138 @@ describe("Google speech provider", () => {
    expect(audioBuffer.subarray(8, 12).toString("ascii")).toBe("WAVE");
  });

+  it("advertises all documented Gemini TTS-capable models", () => {
+    const provider = buildGoogleSpeechProvider();
+
+    expect(provider.models).toEqual(__testing.GOOGLE_TTS_MODELS);
+  });
+
+  it("renders deterministic audio-profile-v1 prompts without generating tags", async () => {
+    const provider = buildGoogleSpeechProvider();
+
+    const prepared = await provider.prepareSynthesis?.({
+      text: "[whispers] The door is open.",
+      cfg: {},
+      providerConfig: {
+        promptTemplate: "audio-profile-v1",
+        personaPrompt: "Keep a close-mic feel.",
+      },
+      persona: {
+        id: "alfred",
+        label: "Alfred",
+        prompt: {
+          profile: "A brilliant British butler.",
+          scene: "A quiet late-night study.",
+          sampleContext: "The speaker is answering a trusted operator.",
+          style: "Refined and lightly amused.",
+          accent: "British English.",
+          pacing: "Measured.",
+          constraints: ["Do not read configuration values aloud."],
+        },
+      },
+      target: "audio-file",
+      timeoutMs: 1_000,
+    });
+
+    expect(prepared?.text).toBe(
+      [
+        "Synthesize speech from the TRANSCRIPT section only. Use the other sections only",
+        "as performance direction. Do not read section titles, notes, labels, or",
+        "configuration aloud.",
+        "",
+        "# AUDIO PROFILE: Alfred",
+        "A brilliant British butler.",
+        "",
+        "## THE SCENE",
+        "A quiet late-night study.",
+        "",
+        "### DIRECTOR'S NOTES",
+        "Style: Refined and lightly amused.",
+        "Accent: British English.",
+        "Pacing: Measured.",
+        "Constraints:",
+        "- Do not read configuration values aloud.",
+        "Provider notes:",
+        "Keep a close-mic feel.",
+        "",
+        "### SAMPLE CONTEXT",
+        "The speaker is answering a trusted operator.",
+        "",
+        "### TRANSCRIPT",
+        "[whispers] The door is open.",
+      ].join("\n"),
+    );
+  });
+
+  it("does not wrap an OpenClaw audio-profile-v1 prompt twice", async () => {
+    const provider = buildGoogleSpeechProvider();
+    const text = [
+      "Synthesize speech from the TRANSCRIPT section only. Use the other sections only",
+      "as performance direction. Do not read section titles, notes, labels, or",
+      "configuration aloud.",
+      "",
+      "# AUDIO PROFILE: Alfred",
+      "A brilliant British butler.",
+      "",
+      "### TRANSCRIPT",
+      "Hello.",
+    ].join("\n");
+
+    const prepared = await provider.prepareSynthesis?.({
+      text,
+      cfg: {},
+      providerConfig: {
+        promptTemplate: "audio-profile-v1",
+      },
+      persona: {
+        id: "alfred",
+        label: "Alfred",
+        prompt: {
+          profile: "A brilliant British butler.",
+        },
+      },
+      target: "audio-file",
+      timeoutMs: 1_000,
+    });
+
+    expect(prepared).toBeUndefined();
+  });
+
+  it("retries once when Gemini returns no audio payload", async () => {
+    const pcm = Buffer.from([5, 0, 6, 0]);
+    const requestSequence = vi
+      .fn()
+      .mockResolvedValueOnce({
+        response: {
+          ok: true,
+          json: async () => ({ candidates: [{ content: { parts: [{ text: "not audio" }] } }] }),
+        },
+        release: vi.fn(async () => {}),
+      })
+      .mockResolvedValueOnce({
+        response: googleTtsResponse(pcm),
+        release: vi.fn(async () => {}),
+      });
+    postJsonRequestMock.mockImplementation(requestSequence);
+    const provider = buildGoogleSpeechProvider();
+
+    const result = await provider.synthesize({
+      text: "Retry this.",
+      cfg: {},
+      providerConfig: {
+        apiKey: "google-test-key",
+      },
+      target: "audio-file",
+      timeoutMs: 5_000,
+    });
+
+    expect(requestSequence).toHaveBeenCalledTimes(2);
+    expect(result.audioBuffer.subarray(44)).toEqual(pcm);
+  });
+
  it("falls back to GEMINI_API_KEY and configured Google API base URL", async () => {
    vi.stubEnv("GEMINI_API_KEY", "env-google-key");
-    const fetchMock = installGoogleTtsFetchMock();
+    const requestMock = installGoogleTtsRequestMock();
    const provider = buildGoogleSpeechProvider();

    expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 1 })).toBe(true);
@@ -149,16 +301,17 @@ describe("Google speech provider", () => {
      timeoutMs: 10_000,
    });

-    expect(fetchMock).toHaveBeenCalledWith(
-      "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent",
-      expect.any(Object),
+    expect(requestMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent",
+      }),
    );
-    const [, init] = fetchMock.mock.calls[0];
-    expect(new Headers(init.headers).get("x-goog-api-key")).toBe("env-google-key");
+    const request = requestMock.mock.calls[0]?.[0] as { headers?: HeadersInit };
+    expect(new Headers(request.headers).get("x-goog-api-key")).toBe("env-google-key");
  });

  it("can reuse a configured Google model-provider API key without auth profiles", async () => {
-    const fetchMock = installGoogleTtsFetchMock();
+    const requestMock = installGoogleTtsRequestMock();
    const provider = buildGoogleSpeechProvider();
    const cfg = {
      models: {
@@ -182,13 +335,13 @@ describe("Google speech provider", () => {
      timeoutMs: 10_000,
    });

-    const [, init] = fetchMock.mock.calls[0];
-    expect(new Headers(init.headers).get("x-goog-api-key")).toBe("model-provider-google-key");
+    const request = requestMock.mock.calls[0]?.[0] as { headers?: HeadersInit };
+    expect(new Headers(request.headers).get("x-goog-api-key")).toBe("model-provider-google-key");
  });

  it("returns Gemini PCM directly for telephony synthesis", async () => {
    const pcm = Buffer.from([3, 0, 4, 0]);
-    installGoogleTtsFetchMock(pcm);
+    installGoogleTtsRequestMock(pcm);
    const provider = buildGoogleSpeechProvider();

    const result = await provider.synthesizeTelephony?.({
@@ -209,7 +362,7 @@ describe("Google speech provider", () => {
  });

  it("prepends configured Gemini TTS profile text", async () => {
-    const fetchMock = installGoogleTtsFetchMock();
+    const requestMock = installGoogleTtsRequestMock();
    const provider = buildGoogleSpeechProvider();

    await provider.synthesize({
@@ -224,8 +377,7 @@ describe("Google speech provider", () => {
      timeoutMs: 10_000,
    });

-    const [, init] = fetchMock.mock.calls[0];
-    expect(JSON.parse(String(init.body))).toMatchObject({
+    expect(requestMock.mock.calls[0]?.[0].body).toMatchObject({
      contents: [
        {
          parts: [
@@ -326,23 +478,26 @@ describe("Google speech provider", () => {
  });

  it("formats Google TTS HTTP errors with provider details", async () => {
-    vi.stubGlobal(
-      "fetch",
-      vi.fn().mockResolvedValue(
-        new Response(
-          JSON.stringify({
-            error: {
-              message: "Quota exceeded",
-              status: "RESOURCE_EXHAUSTED",
-            },
-          }),
-          {
-            status: 429,
-            headers: { "x-request-id": "google_req_123" },
-          },
-        ),
+    assertOkOrThrowProviderErrorMock.mockRejectedValue(
+      new Error(
+        "Google TTS failed (429): Quota exceeded [code=RESOURCE_EXHAUSTED] [request_id=google_req_123]",
      ),
    );
+    postJsonRequestMock.mockResolvedValue({
+      response: new Response(
+        JSON.stringify({
+          error: {
+            message: "Quota exceeded",
+            status: "RESOURCE_EXHAUSTED",
+          },
+        }),
+        {
+          status: 429,
+          headers: { "x-request-id": "google_req_123" },
+        },
+      ),
+      release: vi.fn(async () => {}),
+    });
    const provider = buildGoogleSpeechProvider();

    await expect(
@@ -359,8 +514,7 @@ describe("Google speech provider", () => {
  });

  it("honors configured private-network opt-in for Google TTS", async () => {
-    installGoogleTtsFetchMock();
-    const postJsonRequestSpy = vi.spyOn(providerHttp, "postJsonRequest");
+    installGoogleTtsRequestMock();

    const provider = buildGoogleSpeechProvider();
    await provider.synthesize({
@@ -381,14 +535,16 @@ describe("Google speech provider", () => {
      timeoutMs: 12_345,
    });

-    expect(postJsonRequestSpy).toHaveBeenCalledWith(
-      expect.objectContaining({ allowPrivateNetwork: true }),
+    expect(resolveProviderHttpRequestConfigMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        allowPrivateNetwork: true,
+        request: expect.objectContaining({ allowPrivateNetwork: true }),
+      }),
    );
  });

  it("honors configured private-network opt-in for Google telephony TTS", async () => {
-    installGoogleTtsFetchMock();
-    const postJsonRequestSpy = vi.spyOn(providerHttp, "postJsonRequest");
+    installGoogleTtsRequestMock();

    const provider = buildGoogleSpeechProvider();
    await provider.synthesizeTelephony?.({
@@ -408,8 +564,11 @@ describe("Google speech provider", () => {
      timeoutMs: 12_345,
    });

-    expect(postJsonRequestSpy).toHaveBeenCalledWith(
-      expect.objectContaining({ allowPrivateNetwork: true }),
+    expect(resolveProviderHttpRequestConfigMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        allowPrivateNetwork: true,
+        request: expect.objectContaining({ allowPrivateNetwork: true }),
+      }),
    );
  });
 });
--- a/extensions/google/speech-provider.ts
+++ b/extensions/google/speech-provider.ts
@@ -21,6 +21,13 @@ const DEFAULT_GOOGLE_TTS_VOICE = "Kore";
 const GOOGLE_TTS_SAMPLE_RATE = 24_000;
 const GOOGLE_TTS_CHANNELS = 1;
 const GOOGLE_TTS_BITS_PER_SAMPLE = 16;
+const GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE = "audio-profile-v1";
+
+const GOOGLE_TTS_MODELS = [
+  "gemini-3.1-flash-tts-preview",
+  "gemini-2.5-flash-preview-tts",
+  "gemini-2.5-pro-preview-tts",
+] as const;

 const GOOGLE_TTS_VOICES = [
  "Zephyr",
@@ -62,6 +69,8 @@ type GoogleTtsProviderConfig = {
  voiceName: string;
  audioProfile?: string;
  speakerName?: string;
+  promptTemplate?: typeof GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE;
+  personaPrompt?: string;
 };

 type GoogleTtsProviderOverrides = {
@@ -91,6 +100,13 @@ type GoogleGenerateSpeechResponse = {
  }>;
 };

+/**
+ * Error marking a Google TTS failure that `synthesizeGoogleTtsPcm` retries once.
+ */
+class GoogleTtsRetryableError extends Error {
+  override name = "GoogleTtsRetryableError";
+}
+
 function normalizeGoogleTtsModel(model: unknown): string {
  const trimmed = normalizeOptionalString(model);
  if (!trimmed) {
@@ -104,6 +120,19 @@ function normalizeGoogleTtsVoiceName(voiceName: unknown): string {
  return normalizeOptionalString(voiceName) ?? DEFAULT_GOOGLE_TTS_VOICE;
 }

+/**
+ * Supported template name, or undefined for a falsy normalized value; throws otherwise.
+ */
+function normalizeGooglePromptTemplate(
+  value: unknown,
+): typeof GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE | undefined {
+  const candidate = normalizeOptionalString(value);
+  if (candidate && candidate !== GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE) {
+    throw new Error(`Invalid Google TTS promptTemplate: ${candidate}`);
+  }
+  return candidate ? GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE : undefined;
+}
+
 function resolveGoogleTtsEnvApiKey(): string | undefined {
  return (
    normalizeOptionalString(process.env.GEMINI_API_KEY) ??
@@ -149,6 +178,8 @@ function normalizeGoogleTtsProviderConfig(
  rawConfig: Record<string, unknown>,
 ): GoogleTtsProviderConfig {
  const raw = resolveGoogleTtsConfigRecord(rawConfig);
+  const promptTemplate = normalizeGooglePromptTemplate(raw?.promptTemplate);
+  const personaPrompt = trimToUndefined(raw?.personaPrompt);
  return {
    apiKey: normalizeResolvedSecretInputString({
      value: raw?.apiKey,
@@ -159,11 +190,16 @@ function normalizeGoogleTtsProviderConfig(
    voiceName: normalizeGoogleTtsVoiceName(raw?.voiceName ?? raw?.voice),
    audioProfile: trimToUndefined(raw?.audioProfile),
    speakerName: trimToUndefined(raw?.speakerName),
+    ...(promptTemplate ? { promptTemplate } : {}),
+    ...(personaPrompt ? { personaPrompt } : {}),
  };
 }

 function readGoogleTtsProviderConfig(config: SpeechProviderConfig): GoogleTtsProviderConfig {
  const normalized = normalizeGoogleTtsProviderConfig({});
+  const promptTemplate =
+    normalizeGooglePromptTemplate(config.promptTemplate) ?? normalized.promptTemplate;
+  const personaPrompt = trimToUndefined(config.personaPrompt) ?? normalized.personaPrompt;
  return {
    apiKey: trimToUndefined(config.apiKey) ?? normalized.apiKey,
    baseUrl: trimToUndefined(config.baseUrl) ?? normalized.baseUrl,
@@ -173,6 +209,8 @@ function readGoogleTtsProviderConfig(config: SpeechProviderConfig): GoogleTtsPro
    ),
    audioProfile: trimToUndefined(config.audioProfile) ?? normalized.audioProfile,
    speakerName: trimToUndefined(config.speakerName) ?? normalized.speakerName,
+    ...(promptTemplate ? { promptTemplate } : {}),
+    ...(personaPrompt ? { personaPrompt } : {}),
  };
 }

@@ -243,6 +281,116 @@ function extractGoogleSpeechPcm(payload: GoogleGenerateSpeechResponse): Buffer {
  throw new Error("Google TTS response missing audio data");
 }

+/**
+ * Cleans one piece of prompt text: CR/CRLF become LF, and control characters
+ * other than tab and LF are removed.
+ *
+ * Control characters are stripped before `trimToUndefined` runs (presumably
+ * trim, then undefined when empty), so a value made up only of them yields
+ * undefined instead of "", and whitespace next to a stripped edge is trimmed.
+ * @returns The cleaned text, or undefined when no text is left.
+ */
+function normalizePromptSectionText(value: string | undefined): string | undefined {
+  const unified = value?.replace(/\r\n?/g, "\n") ?? "";
+  let sanitized = "";
+  for (const char of unified) {
+    const code = char.charCodeAt(0);
+    // Keep tab (9) and LF (10); skip the remaining C0 controls and DEL (127).
+    if (code === 9 || code === 10 || (code > 31 && code !== 127)) {
+      sanitized += char;
+    }
+  }
+  return trimToUndefined(sanitized) || undefined;
+}
+
+/** Normalizes every entry and leaves out the ones that end up empty. */
+function normalizePromptList(values: readonly string[] | undefined): string[] {
+  const entries = values ?? [];
+  return entries.flatMap((entry) => normalizePromptSectionText(entry) || []);
+}
+
+/** Detects text that is already laid out as an audio-profile prompt. */
+function isOpenClawGoogleAudioProfilePrompt(text: string): boolean {
+  if (!text.startsWith("Synthesize speech from the TRANSCRIPT section only.")) {
+    return false;
+  }
+  return ["# AUDIO PROFILE:", "### TRANSCRIPT"].every((marker) => text.includes(marker));
+}
+
+/**
+ * Renders the sectioned audio-profile prompt: a fixed preamble, the persona
+ * sections that have content, then the transcript, joined by blank lines.
+ * Persona fields pass through `normalizePromptSectionText`; the transcript is
+ * only newline-normalized and trimmed.
+ */
+function renderGoogleAudioProfilePrompt(params: {
+  text: string;
+  persona?: {
+    id: string;
+    label?: string;
+    prompt?: {
+      profile?: string;
+      scene?: string;
+      sampleContext?: string;
+      style?: string;
+      accent?: string;
+      pacing?: string;
+      constraints?: string[];
+    };
+  };
+  personaPrompt?: string;
+}): string {
+  const prompt = params.persona?.prompt;
+  const profile = normalizePromptSectionText(prompt?.profile);
+  const scene = normalizePromptSectionText(prompt?.scene);
+  const sampleContext = normalizePromptSectionText(prompt?.sampleContext);
+  const constraints = normalizePromptList(prompt?.constraints);
+  const personaPrompt = normalizePromptSectionText(params.personaPrompt);
+  // `||`, not `??`: an empty normalized label must fall back to the id, then to "voice".
+  const label =
+    normalizePromptSectionText(params.persona?.label) ||
+    normalizePromptSectionText(params.persona?.id);
+
+  const sections = [
+    [
+      "Synthesize speech from the TRANSCRIPT section only. Use the other sections only",
+      "as performance direction. Do not read section titles, notes, labels, or",
+      "configuration aloud.",
+    ].join("\n"),
+  ];
+  if (label || profile) {
+    sections.push([`# AUDIO PROFILE: ${label || "voice"}`, profile].filter(Boolean).join("\n"));
+  }
+  if (scene) {
+    sections.push(`## THE SCENE\n${scene}`);
+  }
+  const directorNotes: string[] = [];
+  const addNote = (title: string, raw: string | undefined): void => {
+    const note = normalizePromptSectionText(raw);
+    if (note) {
+      directorNotes.push(`${title}: ${note}`);
+    }
+  };
+  addNote("Style", prompt?.style);
+  addNote("Accent", prompt?.accent);
+  addNote("Pacing", prompt?.pacing);
+  if (constraints.length > 0) {
+    directorNotes.push(["Constraints:", ...constraints.map((item) => `- ${item}`)].join("\n"));
+  }
+  if (personaPrompt) {
+    directorNotes.push(`Provider notes:\n${personaPrompt}`);
+  }
+  if (directorNotes.length > 0) {
+    sections.push(["### DIRECTOR'S NOTES", ...directorNotes].join("\n"));
+  }
+  if (sampleContext) {
+    sections.push(`### SAMPLE CONTEXT\n${sampleContext}`);
+  }
+  const transcript = params.text.replace(/\r\n?/g, "\n").trim();
+  sections.push(`### TRANSCRIPT\n${transcript}`);
+  return sections.join("\n\n");
+}
+
 function wrapPcm16MonoToWav(pcm: Buffer, sampleRate = GOOGLE_TTS_SAMPLE_RATE): Buffer {
  const byteRate = sampleRate * GOOGLE_TTS_CHANNELS * (GOOGLE_TTS_BITS_PER_SAMPLE / 8);
  const blockAlign = GOOGLE_TTS_CHANNELS * (GOOGLE_TTS_BITS_PER_SAMPLE / 8);
@@ -265,7 +413,7 @@ function wrapPcm16MonoToWav(pcm: Buffer, sampleRate = GOOGLE_TTS_SAMPLE_RATE): B
  return Buffer.concat([header, pcm]);
 }

-async function synthesizeGoogleTtsPcm(params: {
+async function synthesizeGoogleTtsPcmOnce(params: {
  text: string;
  apiKey: string;
  baseUrl?: string;
@@ -322,19 +470,59 @@ async function synthesizeGoogleTtsPcm(params: {
  });

  try {
-    await assertOkOrThrowProviderError(res, "Google TTS failed");
-    return extractGoogleSpeechPcm((await res.json()) as GoogleGenerateSpeechResponse);
+    if (!res.ok) {
+      try {
+        await assertOkOrThrowProviderError(res, "Google TTS failed");
+      } catch (err) {
+        const message = err instanceof Error ? err.message : String(err);
+        if (res.status >= 500 && res.status < 600) {
+          throw new GoogleTtsRetryableError(message);
+        }
+        throw err;
+      }
+    }
+    try {
+      return extractGoogleSpeechPcm((await res.json()) as GoogleGenerateSpeechResponse);
+    } catch (err) {
+      const message = err instanceof Error ? err.message : String(err);
+      throw new GoogleTtsRetryableError(message);
+    }
  } finally {
    await release();
  }
 }

+/**
+ * Runs Google TTS synthesis, retrying exactly once when the first attempt fails
+ * with `GoogleTtsRetryableError`; every other error propagates unchanged.
+ */
+async function synthesizeGoogleTtsPcm(params: {
+  text: string;
+  apiKey: string;
+  baseUrl?: string;
+  request?: ReturnType<typeof sanitizeConfiguredModelProviderRequest>;
+  model: string;
+  voiceName: string;
+  audioProfile?: string;
+  speakerName?: string;
+  timeoutMs: number;
+}): Promise<Buffer> {
+  try {
+    return await synthesizeGoogleTtsPcmOnce(params);
+  } catch (err) {
+    if (!(err instanceof GoogleTtsRetryableError)) {
+      throw err;
+    }
+  }
+  return synthesizeGoogleTtsPcmOnce(params);
+}
+
 export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
  return {
    id: "google",
    label: "Google",
    autoSelectOrder: 50,
-    models: [DEFAULT_GOOGLE_TTS_MODEL],
+    models: GOOGLE_TTS_MODELS,
    voices: GOOGLE_TTS_VOICES,
    resolveConfig: ({ rawConfig }) => normalizeGoogleTtsProviderConfig(rawConfig),
    parseDirectiveToken,
@@ -372,6 +560,22 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
    listVoices: async () => GOOGLE_TTS_VOICES.map((voice) => ({ id: voice, name: voice })),
    isConfigured: ({ cfg, providerConfig }) =>
      Boolean(resolveGoogleTtsApiKey({ cfg, providerConfig })),
+    prepareSynthesis: (ctx) => {
+      const config = readGoogleTtsProviderConfig(ctx.providerConfig);
+      const shouldWrap =
+        config.promptTemplate === GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE ||
+        Boolean(config.personaPrompt);
+      if (!shouldWrap || isOpenClawGoogleAudioProfilePrompt(ctx.text)) {
+        return undefined;
+      }
+      return {
+        text: renderGoogleAudioProfilePrompt({
+          text: ctx.text,
+          persona: ctx.persona,
+          personaPrompt: config.personaPrompt,
+        }),
+      };
+    },
    synthesize: async (req) => {
      const config = readGoogleTtsProviderConfig(req.providerConfig);
      const overrides = readGoogleTtsOverrides(req.providerOverrides);
@@ -449,7 +653,10 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
 export const __testing = {
  DEFAULT_GOOGLE_TTS_MODEL,
  DEFAULT_GOOGLE_TTS_VOICE,
+  GOOGLE_AUDIO_PROFILE_PROMPT_TEMPLATE,
+  GOOGLE_TTS_MODELS,
  GOOGLE_TTS_SAMPLE_RATE,
  normalizeGoogleTtsModel,
+  renderGoogleAudioProfilePrompt,
  wrapPcm16MonoToWav,
 };