TTS: add provider personas

This commit is contained in:
Barron Roth
2026-04-23 07:26:32 -07:00
committed by Ayaan Zaidi
parent 80219ed1b3
commit 0594fa3c4d
39 changed files with 2021 additions and 136 deletions

View File

@@ -134,6 +134,7 @@ function createLiveTtsConfig(): ResolvedTtsConfig {
voice: "alloy",
},
},
personas: {},
maxTextLength: 4_000,
timeoutMs: 30_000,
};

View File

@@ -162,6 +162,40 @@ describe("buildOpenAISpeechProvider", () => {
});
});
// With no explicit `instructions` in the provider config, the persona prompt
// is expected to be rendered into the prepared provider config instead.
it("maps persona prompt fields to instructions when instructions are unset", async () => {
  const persona = {
    id: "alfred",
    label: "Alfred",
    prompt: {
      profile: "A brilliant British butler.",
      scene: "A quiet late-night study.",
      sampleContext: "The speaker is answering a trusted operator.",
      style: "Refined and lightly amused.",
      accent: "British English.",
      pacing: "Measured.",
      constraints: ["Do not read configuration values aloud."],
    },
  };
  const speechProvider = buildOpenAISpeechProvider();

  const result = await speechProvider.prepareSynthesis?.({
    text: "hello",
    cfg: {} as never,
    providerConfig: {
      apiKey: "sk-test",
      model: "gpt-4o-mini-tts",
      voice: "cedar",
    },
    persona,
    target: "audio-file",
    timeoutMs: 1_000,
  });

  // Spot-check the first rendered line (label) and the last one (constraint).
  const renderedInstructions = result?.providerConfig?.instructions;
  expect(renderedInstructions).toContain("Persona: Alfred");
  expect(renderedInstructions).toContain("Constraint: Do not read configuration values aloud.");
});
it("uses wav for Groq-compatible OpenAI TTS endpoints", async () => {
const provider = buildOpenAISpeechProvider();
mockSpeechFetchExpectingFormat("wav");

View File

@@ -71,7 +71,7 @@ function isGroqSpeechBaseUrl(baseUrl: string): boolean {
function resolveSpeechResponseFormat(
baseUrl: string,
target: "audio-file" | "voice-note",
target: "audio-file" | "voice-note" | "telephony",
configuredFormat?: OpenAiSpeechResponseFormat,
): OpenAiSpeechResponseFormat {
if (configuredFormat) {
@@ -145,6 +145,37 @@ function readOpenAIOverrides(
};
}
/**
 * Renders a TTS persona as newline-separated `Label: value` instruction text.
 *
 * Lines are emitted in a fixed order: persona label, profile, scene, style,
 * accent, pacing, sample context, then one `Constraint:` line per constraint.
 * Missing, empty, and whitespace-only values are skipped entirely.
 *
 * @param req - The persona label and its optional prompt fields.
 * @returns The joined instruction text, or `undefined` when `req.prompt` is
 *   absent or no field holds any non-blank text.
 */
function renderOpenAITtsPersonaInstructions(req: {
  label?: string;
  prompt?: {
    profile?: string;
    scene?: string;
    sampleContext?: string;
    style?: string;
    accent?: string;
    pacing?: string;
    constraints?: string[];
  };
}): string | undefined {
  const prompt = req.prompt;
  if (!prompt) {
    return undefined;
  }
  // Trim the value BEFORE attaching its label. Trimming the finished line
  // instead would turn a whitespace-only value into a bare "Style:" or
  // "Constraint:" line and would keep stray leading whitespace after the colon.
  const labelled = (label: string, value: string | undefined): string | undefined => {
    const trimmed = value?.trim();
    return trimmed ? `${label}: ${trimmed}` : undefined;
  };
  const lines = [
    labelled("Persona", req.label),
    labelled("Profile", prompt.profile),
    labelled("Scene", prompt.scene),
    labelled("Style", prompt.style),
    labelled("Accent", prompt.accent),
    labelled("Pacing", prompt.pacing),
    labelled("Sample context", prompt.sampleContext),
    ...(prompt.constraints ?? []).map((constraint) => labelled("Constraint", constraint)),
  ].filter((line): line is string => line !== undefined);
  return lines.length > 0 ? lines.join("\n") : undefined;
}
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
handled: boolean;
overrides?: SpeechProviderOverrides;
@@ -229,6 +260,23 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
// Exposes each entry of OPENAI_TTS_VOICES, using the voice name as both id and display name.
listVoices: async () =>
  OPENAI_TTS_VOICES.map((voiceName) => {
    return { id: voiceName, name: voiceName };
  }),
// Configured when an API key is present in the provider config or in the
// OPENAI_API_KEY environment variable.
isConfigured: ({ providerConfig }) => {
  const { apiKey } = readOpenAIProviderConfig(providerConfig);
  return Boolean(apiKey || process.env.OPENAI_API_KEY);
},
// Derives `instructions` from the active persona before synthesis.
// Returns undefined (no provider-config patch) when instructions are already
// configured or when the persona renders to nothing.
prepareSynthesis: (ctx) => {
  // Explicitly configured instructions take precedence over the persona.
  if (readOpenAIProviderConfig(ctx.providerConfig).instructions) {
    return undefined;
  }

  const persona = ctx.persona;
  const personaInstructions = renderOpenAITtsPersonaInstructions({
    // The persona id stands in as the label when no label is set.
    label: persona?.label ?? persona?.id,
    prompt: persona?.prompt,
  });
  if (!personaInstructions) {
    return undefined;
  }

  return {
    providerConfig: {
      instructions: personaInstructions,
    },
  };
},
synthesize: async (req) => {
const config = readOpenAIProviderConfig(req.providerConfig);
const overrides = readOpenAIOverrides(req.providerOverrides);