TTS: add provider personas

2026-05-06 16:40:49 +00:00 · 2026-04-23 07:26:32 -07:00
parent 80219ed1b3
commit 0594fa3c4d
39 changed files with 2021 additions and 136 deletions
--- a/extensions/openai/openai.live.test.ts
+++ b/extensions/openai/openai.live.test.ts
@@ -134,6 +134,7 @@ function createLiveTtsConfig(): ResolvedTtsConfig {
        voice: "alloy",
      },
    },
+    personas: {},
    maxTextLength: 4_000,
    timeoutMs: 30_000,
  };
--- a/extensions/openai/speech-provider.test.ts
+++ b/extensions/openai/speech-provider.test.ts
@@ -162,6 +162,40 @@ describe("buildOpenAISpeechProvider", () => {
    });
  });

+  it("maps persona prompt fields to instructions when instructions are unset", async () => {
+    const provider = buildOpenAISpeechProvider();
+
+    const prepared = await provider.prepareSynthesis?.({
+      text: "hello",
+      cfg: {} as never,
+      providerConfig: {
+        apiKey: "sk-test",
+        model: "gpt-4o-mini-tts",
+        voice: "cedar",
+      },
+      persona: {
+        id: "alfred",
+        label: "Alfred",
+        prompt: {
+          profile: "A brilliant British butler.",
+          scene: "A quiet late-night study.",
+          sampleContext: "The speaker is answering a trusted operator.",
+          style: "Refined and lightly amused.",
+          accent: "British English.",
+          pacing: "Measured.",
+          constraints: ["Do not read configuration values aloud."],
+        },
+      },
+      target: "audio-file",
+      timeoutMs: 1_000,
+    });
+
+    expect(prepared?.providerConfig?.instructions).toContain("Persona: Alfred");
+    expect(prepared?.providerConfig?.instructions).toContain(
+      "Constraint: Do not read configuration values aloud.",
+    );
+  });
+
  it("uses wav for Groq-compatible OpenAI TTS endpoints", async () => {
    const provider = buildOpenAISpeechProvider();
    mockSpeechFetchExpectingFormat("wav");
--- a/extensions/openai/speech-provider.ts
+++ b/extensions/openai/speech-provider.ts
@@ -71,7 +71,7 @@ function isGroqSpeechBaseUrl(baseUrl: string): boolean {

 function resolveSpeechResponseFormat(
  baseUrl: string,
-  target: "audio-file" | "voice-note",
+  target: "audio-file" | "voice-note" | "telephony",
  configuredFormat?: OpenAiSpeechResponseFormat,
 ): OpenAiSpeechResponseFormat {
  if (configuredFormat) {
@@ -145,6 +145,37 @@ function readOpenAIOverrides(
  };
 }

+function renderOpenAITtsPersonaInstructions(req: {
+  label?: string;
+  prompt?: {
+    profile?: string;
+    scene?: string;
+    sampleContext?: string;
+    style?: string;
+    accent?: string;
+    pacing?: string;
+    constraints?: string[];
+  };
+}): string | undefined {
+  const prompt = req.prompt;
+  if (!prompt) {
+    return undefined;
+  }
+  const lines = [
+    req.label ? `Persona: ${req.label}` : undefined,
+    prompt.profile ? `Profile: ${prompt.profile}` : undefined,
+    prompt.scene ? `Scene: ${prompt.scene}` : undefined,
+    prompt.style ? `Style: ${prompt.style}` : undefined,
+    prompt.accent ? `Accent: ${prompt.accent}` : undefined,
+    prompt.pacing ? `Pacing: ${prompt.pacing}` : undefined,
+    prompt.sampleContext ? `Sample context: ${prompt.sampleContext}` : undefined,
+    ...(prompt.constraints ?? []).map((constraint) => `Constraint: ${constraint}`),
+  ]
+    .map((line) => trimToUndefined(line))
+    .filter((line): line is string => Boolean(line));
+  return lines.length > 0 ? lines.join("\n") : undefined;
+}
+
 function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
  handled: boolean;
  overrides?: SpeechProviderOverrides;
@@ -229,6 +260,23 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
    listVoices: async () => OPENAI_TTS_VOICES.map((voice) => ({ id: voice, name: voice })),
    isConfigured: ({ providerConfig }) =>
      Boolean(readOpenAIProviderConfig(providerConfig).apiKey || process.env.OPENAI_API_KEY),
+    prepareSynthesis: (ctx) => {
+      const config = readOpenAIProviderConfig(ctx.providerConfig);
+      if (config.instructions) {
+        return undefined;
+      }
+      const instructions = renderOpenAITtsPersonaInstructions({
+        label: ctx.persona?.label ?? ctx.persona?.id,
+        prompt: ctx.persona?.prompt,
+      });
+      return instructions
+        ? {
+            providerConfig: {
+              instructions,
+            },
+          }
+        : undefined;
+    },
    synthesize: async (req) => {
      const config = readOpenAIProviderConfig(req.providerConfig);
      const overrides = readOpenAIOverrides(req.providerOverrides);