fix(tts): surface voice status and harden providers

This commit is contained in:
Peter Steinberger
2026-04-26 03:51:01 +01:00
parent 1231f21679
commit 7a85c1a822
10 changed files with 551 additions and 16 deletions

View File

@@ -68,6 +68,7 @@ Docs: https://docs.openclaw.ai
- ACP: send subagent and async-task completion wakes to external ACP harnesses as
plain prompts instead of OpenClaw internal runtime-context envelopes, while
keeping those envelopes out of ACP transcripts.
- TTS/status: show configured TTS model, voice, and sanitized custom endpoint in `/status`, preserve OpenAI-compatible TTS instructions on custom endpoints, and retry empty Microsoft/Edge TTS output once. Addresses #46602, #47232, and #43936. Thanks @leekuangtao, @Huntterxx, and @rex993.
- Agents/Claude: treat zero-token empty `stop` turns as failed provider output,
retry once, repair replay, and allow configured model fallback instead of
preserving them as successful silent replies. Fixes #71880. Thanks @MagnaAI.

View File

@@ -846,6 +846,8 @@ Notes:
- success fallback: `Fallback: <primary> -> <used>` plus `Attempts: ...`
- failure: `Error: ...` plus `Attempts: ...`
- detailed diagnostics: `Attempt details: provider:outcome(reasonCode) latency`
- `/status` shows the active TTS mode plus configured provider, model, voice,
and sanitized custom endpoint metadata when TTS is enabled.
- OpenAI and ElevenLabs API failures now include parsed provider error detail and request id (when returned by the provider), which is surfaced in TTS errors/logs.
## Agent tool

View File

@@ -1,13 +1,20 @@
import { mkdtempSync, rmSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import path from "node:path";
import { afterEach, beforeAll, describe, expect, it } from "vitest";
import { afterEach, beforeAll, describe, expect, it, vi } from "vitest";
let edgeTTS: typeof import("./tts.js").edgeTTS;
function createEdgeTTSDeps(ttsPromise: (text: string, filePath: string) => Promise<void>) {
function createEdgeTTSDeps(
ttsPromise: (text: string, filePath: string) => Promise<void>,
onConstruct?: () => void,
) {
return {
EdgeTTS: class {
constructor() {
onConstruct?.();
}
ttsPromise(text: string, filePath: string) {
return ttsPromise(text, filePath);
}
@@ -36,11 +43,35 @@ describe("edgeTTS empty audio validation", () => {
}
});
it("throws when the output file is 0 bytes", async () => {
// Blank/whitespace-only text must be rejected up front, before any Edge TTS
// client is even constructed — no work should happen for an unusable request.
it("rejects blank text before constructing Edge TTS", async () => {
  tempDir = mkdtempSync(path.join(tmpdir(), "tts-test-"));
  const outputPath = path.join(tempDir, "voice.mp3");
  const onConstruct = vi.fn();
  const deps = createEdgeTTSDeps(async (_text: string, filePath: string) => {
    writeFileSync(filePath, Buffer.from([0xff]));
  }, onConstruct);
  const attempt = edgeTTS(
    { text: " \n\t ", outputPath, config: baseEdgeConfig, timeoutMs: 10000 },
    deps,
  );
  await expect(attempt).rejects.toThrow("Microsoft TTS text cannot be empty");
  // The EdgeTTS constructor must never have been invoked.
  expect(onConstruct).not.toHaveBeenCalled();
});
it("throws after one retry when the output file stays empty", async () => {
tempDir = mkdtempSync(path.join(tmpdir(), "tts-test-"));
const outputPath = path.join(tempDir, "voice.mp3");
const calls: string[] = [];
const deps = createEdgeTTSDeps(async (text: string, filePath: string) => {
calls.push(text);
writeFileSync(filePath, "");
});
@@ -54,7 +85,8 @@ describe("edgeTTS empty audio validation", () => {
},
deps,
),
).rejects.toThrow("Edge TTS produced empty audio file");
).rejects.toThrow("Edge TTS produced empty audio file after retry");
expect(calls).toEqual(["Hello", "Hello"]);
});
it("succeeds when the output file has content", async () => {
@@ -77,4 +109,78 @@ describe("edgeTTS empty audio validation", () => {
),
).resolves.toBeUndefined();
});
// First synthesis writes an empty file; the provider is invoked a second time
// and the call resolves once real audio bytes land on disk.
it("retries once when the first output file is empty", async () => {
  tempDir = mkdtempSync(path.join(tmpdir(), "tts-test-"));
  const outputPath = path.join(tempDir, "voice.mp3");
  const observedTexts: string[] = [];
  const deps = createEdgeTTSDeps(async (text: string, filePath: string) => {
    observedTexts.push(text);
    const payload =
      observedTexts.length === 1 ? "" : Buffer.from([0xff, 0xfb, 0x90, 0x00]);
    writeFileSync(filePath, payload);
  });
  await expect(
    edgeTTS({ text: "Hello", outputPath, config: baseEdgeConfig, timeoutMs: 10000 }, deps),
  ).resolves.toBeUndefined();
  // Exactly one retry, with the same text both times.
  expect(observedTexts).toEqual(["Hello", "Hello"]);
});
// A resolved promise with no file on disk counts as empty output and triggers
// the same single retry as a zero-byte file.
it("retries once when Edge TTS resolves without creating an output file", async () => {
  tempDir = mkdtempSync(path.join(tmpdir(), "tts-test-"));
  const outputPath = path.join(tempDir, "voice.mp3");
  const observedTexts: string[] = [];
  const deps = createEdgeTTSDeps(async (text: string, filePath: string) => {
    observedTexts.push(text);
    // First call never touches the filesystem; second call produces audio.
    if (observedTexts.length === 2) {
      writeFileSync(filePath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
    }
  });
  await expect(
    edgeTTS({ text: "Hello", outputPath, config: baseEdgeConfig, timeoutMs: 10000 }, deps),
  ).resolves.toBeUndefined();
  expect(observedTexts).toEqual(["Hello", "Hello"]);
});
// Hard provider failures (thrown errors) propagate immediately; the retry is
// reserved for silently-empty output only.
it("does not retry provider errors", async () => {
  tempDir = mkdtempSync(path.join(tmpdir(), "tts-test-"));
  const outputPath = path.join(tempDir, "voice.mp3");
  const observedTexts: string[] = [];
  const deps = createEdgeTTSDeps(async (text: string) => {
    observedTexts.push(text);
    throw new Error("upstream timeout");
  });
  await expect(
    edgeTTS({ text: "Hello", outputPath, config: baseEdgeConfig, timeoutMs: 10000 }, deps),
  ).rejects.toThrow("upstream timeout");
  // A single invocation: the thrown error short-circuits the retry loop.
  expect(observedTexts).toEqual(["Hello"]);
});
});

View File

@@ -24,6 +24,26 @@ async function loadDefaultEdgeTTSDeps(): Promise<EdgeTTSDeps> {
return { EdgeTTS };
}
/**
 * True when `error` is a Node.js filesystem error signalling a missing file
 * (ENOENT), e.g. from `statSync` on a path that was never created.
 */
function isMissingOutputFileError(error: unknown): boolean {
  if (error === null || typeof error !== "object") {
    return false;
  }
  return "code" in error && (error as { code?: unknown }).code === "ENOENT";
}
/**
 * Size in bytes of the file at `outputPath`. A missing file counts as zero
 * bytes so callers can treat "never written" and "written empty" uniformly;
 * any other filesystem error is re-thrown.
 */
function readOutputSize(outputPath: string): number {
  try {
    const { size } = statSync(outputPath);
    return size;
  } catch (error) {
    if (!isMissingOutputFileError(error)) {
      throw error;
    }
    return 0;
  }
}
export function inferEdgeExtension(outputFormat: string): string {
const normalized = normalizeLowercaseStringOrEmpty(outputFormat);
if (normalized.includes("webm")) {
@@ -61,6 +81,10 @@ export async function edgeTTS(
deps?: EdgeTTSDeps,
): Promise<void> {
const { text, outputPath, config, timeoutMs } = params;
if (text.trim().length === 0) {
throw new Error("Microsoft TTS text cannot be empty");
}
const resolvedDeps = deps ?? (await loadDefaultEdgeTTSDeps());
const tts = new resolvedDeps.EdgeTTS({
voice: config.voice,
@@ -73,10 +97,12 @@ export async function edgeTTS(
volume: config.volume,
timeout: config.timeoutMs ?? timeoutMs,
});
await tts.ttsPromise(text, outputPath);
const { size } = statSync(outputPath);
if (size === 0) {
throw new Error("Edge TTS produced empty audio file");
for (let attempt = 0; attempt < 2; attempt += 1) {
await tts.ttsPromise(text, outputPath);
if (readOutputSize(outputPath) > 0) {
return;
}
}
throw new Error("Edge TTS produced empty audio file after retry");
}

View File

@@ -91,9 +91,75 @@ describe("openai tts", () => {
expect(resolveOpenAITtsInstructions("tts-1-hd", "Speak warmly")).toBeUndefined();
expect(resolveOpenAITtsInstructions("gpt-4o-mini-tts", " ")).toBeUndefined();
});
// Custom OpenAI-compatible endpoints keep (trimmed) instructions regardless
// of model; the official endpoint still gates them on model support, and
// blank instructions are dropped everywhere.
it("preserves instructions for custom OpenAI-compatible TTS endpoints", () => {
  expect(
    resolveOpenAITtsInstructions("tts-1", " Speak warmly ", "https://tts.example.com/v1"),
  ).toBe("Speak warmly");
  expect(
    resolveOpenAITtsInstructions("tts-1", " Speak warmly ", "https://api.openai.com/v1/"),
  ).toBeUndefined();
  expect(
    resolveOpenAITtsInstructions("tts-1", " ", "https://tts.example.com/v1"),
  ).toBeUndefined();
});
});
describe("openaiTTS diagnostics", () => {
// End-to-end request check: a custom-endpoint call must serialize the trimmed
// instructions (plus model/voice) into the JSON body handed to fetch.
it("sends instructions to custom OpenAI-compatible endpoints", async () => {
  const fetchMock = vi.fn(
    async (_url: string | URL, _init?: RequestInit) =>
      new Response(Buffer.from("audio-bytes"), { status: 200 }),
  );
  globalThis.fetch = fetchMock as unknown as typeof fetch;
  await openaiTTS({
    text: "hello",
    apiKey: "test-key",
    baseUrl: "https://tts.example.com/v1",
    model: "tts-1",
    voice: "custom-voice",
    instructions: " Speak warmly ",
    responseFormat: "mp3",
    timeoutMs: 5_000,
  });
  // Inspect the JSON body of the first (only) fetch call.
  const [, init] = fetchMock.mock.calls[0] ?? [];
  if (typeof init?.body !== "string") {
    throw new Error("expected JSON request body");
  }
  const body = JSON.parse(init.body) as Record<string, unknown>;
  expect(body.instructions).toBe("Speak warmly");
  expect(body.model).toBe("tts-1");
  expect(body.voice).toBe("custom-voice");
});
// Official endpoint + a model without instruction support: the instructions
// field must be absent from the outgoing request body.
it("omits instructions for unsupported models on the official OpenAI endpoint", async () => {
  const fetchMock = vi.fn(
    async (_url: string | URL, _init?: RequestInit) =>
      new Response(Buffer.from("audio-bytes"), { status: 200 }),
  );
  globalThis.fetch = fetchMock as unknown as typeof fetch;
  await openaiTTS({
    text: "hello",
    apiKey: "test-key",
    baseUrl: "https://api.openai.com/v1/",
    model: "tts-1",
    voice: "alloy",
    instructions: "Speak warmly",
    responseFormat: "mp3",
    timeoutMs: 5_000,
  });
  const [, init] = fetchMock.mock.calls[0] ?? [];
  if (typeof init?.body !== "string") {
    throw new Error("expected JSON request body");
  }
  const body = JSON.parse(init.body) as Record<string, unknown>;
  expect(body.instructions).toBeUndefined();
});
it("includes parsed provider detail and request id for JSON API errors", async () => {
const fetchMock = vi.fn(
async () =>

View File

@@ -63,9 +63,16 @@ export function isValidOpenAIVoice(voice: string, baseUrl?: string): voice is Op
/**
 * Resolve the `instructions` field for an OpenAI TTS request.
 *
 * Blank instructions are always dropped. Custom OpenAI-compatible endpoints
 * receive the trimmed instructions as-is; the official endpoint only supports
 * them on `gpt-4o-mini-tts` models, so they are omitted for anything else.
 */
export function resolveOpenAITtsInstructions(
  model: string,
  instructions?: string,
  baseUrl?: string,
): string | undefined {
  const trimmed = instructions?.trim();
  if (!trimmed) {
    return undefined;
  }
  const customEndpoint = baseUrl !== undefined && isCustomOpenAIEndpoint(baseUrl);
  if (customEndpoint) {
    return trimmed;
  }
  return model.includes("gpt-4o-mini-tts") ? trimmed : undefined;
}
export async function openaiTTS(params: {
@@ -81,7 +88,7 @@ export async function openaiTTS(params: {
}): Promise<Buffer> {
const { text, apiKey, baseUrl, model, voice, speed, instructions, responseFormat, timeoutMs } =
params;
const effectiveInstructions = resolveOpenAITtsInstructions(model, instructions);
const effectiveInstructions = resolveOpenAITtsInstructions(model, instructions, baseUrl);
if (!isValidOpenAIModel(model, baseUrl)) {
throw new Error(`Invalid model: ${model}`);

View File

@@ -103,6 +103,39 @@ describe("buildStatusMessage", () => {
expect(normalized).toContain("Queue: collect");
});
// The /status voice line must surface provider metadata while stripping
// credentials, query string, and fragment from the configured endpoint.
it("shows sanitized TTS provider details in the voice status line", async () => {
  await withTempHome(async () => {
    const text = buildStatusMessage({
      config: {
        messages: {
          tts: {
            auto: "always",
            provider: "openai",
            providers: {
              openai: {
                displayName: "NeuTTS local",
                baseUrl: "http://user:secret@127.0.0.1:18801/v1?token=hidden#fragment",
                model: "neutts-nano",
                voice: "clara",
              },
            },
          },
        },
      } as unknown as OpenClawConfig,
      agent: {},
      now: 0,
    });
    const normalized = normalizeTestText(text);
    expect(normalized).toContain(
      "Voice: always · provider=openai · name=NeuTTS local · model=neutts-nano · voice=clara · endpoint=custom(http://127.0.0.1:18801/v1)",
    );
    // None of the secret-bearing URL components may leak into status output.
    expect(normalized).not.toContain("secret");
    expect(normalized).not.toContain("token=hidden");
    expect(normalized).not.toContain("fragment");
  });
});
it("shows the model runtime for CLI-backed providers", () => {
const text = buildStatusMessage({
config: {

View File

@@ -464,7 +464,25 @@ const formatVoiceModeLine = (
if (!snapshot) {
return null;
}
return `🔊 Voice: ${snapshot.autoMode} · provider=${snapshot.provider} · limit=${snapshot.maxLength} · summary=${snapshot.summarize ? "on" : "off"}`;
const parts = [`🔊 Voice: ${snapshot.autoMode}`, `provider=${snapshot.provider}`];
if (snapshot.displayName) {
parts.push(`name=${snapshot.displayName}`);
}
if (snapshot.model) {
parts.push(`model=${snapshot.model}`);
}
if (snapshot.voice) {
parts.push(`voice=${snapshot.voice}`);
}
if (snapshot.baseUrl) {
parts.push(
snapshot.customBaseUrl
? `endpoint=custom(${snapshot.baseUrl})`
: `endpoint=${snapshot.baseUrl}`,
);
}
parts.push(`limit=${snapshot.maxLength}`, `summary=${snapshot.summarize ? "on" : "off"}`);
return parts.join(" · ");
};
export function buildStatusMessage(args: StatusArgs): string {

View File

@@ -138,6 +138,162 @@ describe("resolveStatusTtsSnapshot", () => {
});
});
// Snapshot surfaces configured displayName/model/voice plus the sanitized
// custom endpoint (credentials/query/fragment stripped, customBaseUrl set).
it("reports configured OpenAI TTS model, voice, and sanitized custom endpoint", async () => {
  await withStatusTempHome(async () => {
    expect(
      resolveStatusTtsSnapshot({
        cfg: {
          messages: {
            tts: {
              auto: "always",
              provider: "openai",
              providers: {
                openai: {
                  displayName: "NeuTTS local",
                  baseUrl: "http://user:secret@127.0.0.1:18801/v1?token=hidden#fragment",
                  model: "neutts-nano",
                  voice: "clara",
                },
              },
            },
          },
        } as OpenClawConfig,
      }),
    ).toEqual({
      autoMode: "always",
      provider: "openai",
      displayName: "NeuTTS local",
      model: "neutts-nano",
      voice: "clara",
      baseUrl: "http://127.0.0.1:18801/v1",
      customBaseUrl: true,
      maxLength: 1500,
      summarize: true,
    });
  });
});
// The default OpenAI base URL carries no information, so the snapshot must
// leave baseUrl/customBaseUrl out entirely for that endpoint.
it("omits default OpenAI endpoint details from status", async () => {
  await withStatusTempHome(async () => {
    expect(
      resolveStatusTtsSnapshot({
        cfg: {
          messages: {
            tts: {
              auto: "always",
              provider: "openai",
              providers: {
                openai: {
                  baseUrl: "https://api.openai.com/v1/",
                  model: "gpt-4o-mini-tts",
                  voice: "coral",
                },
              },
            },
          },
        } as OpenClawConfig,
      }),
    ).toEqual({
      autoMode: "always",
      provider: "openai",
      model: "gpt-4o-mini-tts",
      voice: "coral",
      maxLength: 1500,
      summarize: true,
    });
  });
});
// Per-agent TTS overrides merge over the global provider config: the agent's
// auto mode and voice win while the global model is inherited.
it("reports merged per-agent provider metadata", async () => {
  await withStatusTempHome(async () => {
    expect(
      resolveStatusTtsSnapshot({
        cfg: {
          messages: {
            tts: {
              auto: "off",
              provider: "openai",
              providers: {
                openai: {
                  model: "gpt-4o-mini-tts",
                  voice: "coral",
                },
              },
            },
          },
          agents: {
            list: [
              {
                id: "reader",
                tts: {
                  auto: "always",
                  providers: {
                    openai: {
                      voice: "nova",
                    },
                  },
                },
              },
            ],
          },
        } as OpenClawConfig,
        agentId: "reader",
      }),
    ).toEqual({
      autoMode: "always",
      provider: "openai",
      model: "gpt-4o-mini-tts",
      voice: "nova",
      maxLength: 1500,
      summarize: true,
    });
  });
});
// A local prefs file overrides the configured provider; the snapshot then
// reports metadata from the overriding provider's record (the prefs' "edge"
// value is expected to normalize to "microsoft" here).
it("uses provider metadata for local provider prefs overrides", async () => {
  await withStatusTempHome(async (home) => {
    const prefsPath = path.join(home, ".openclaw", "settings", "tts.json");
    fs.mkdirSync(path.dirname(prefsPath), { recursive: true });
    fs.writeFileSync(
      prefsPath,
      JSON.stringify({
        tts: {
          auto: "always",
          provider: "edge",
        },
      }),
    );
    expect(
      resolveStatusTtsSnapshot({
        cfg: {
          messages: {
            tts: {
              provider: "openai",
              prefsPath,
              providers: {
                microsoft: {
                  voice: "en-US-AvaMultilingualNeural",
                },
                openai: {
                  model: "gpt-4o-mini-tts",
                  voice: "coral",
                },
              },
            },
          },
        } as OpenClawConfig,
      }),
    ).toEqual({
      autoMode: "always",
      provider: "microsoft",
      voice: "en-US-AvaMultilingualNeural",
      maxLength: 1500,
      summarize: true,
    });
  });
});
it("derives the default prefs path from OPENCLAW_CONFIG_PATH when set", async () => {
await withStatusTempHome(async (home) => {
const stateDir = path.join(home, ".openclaw-dev");

View File

@@ -12,6 +12,8 @@ import { resolveEffectiveTtsConfig } from "./tts-config.js";
const DEFAULT_TTS_MAX_LENGTH = 1500;
const DEFAULT_TTS_SUMMARIZE = true;
const DEFAULT_OPENAI_TTS_BASE_URL = "https://api.openai.com/v1";
const MAX_STATUS_DETAIL_LENGTH = 96;
type TtsUserPrefs = {
tts?: {
@@ -26,6 +28,11 @@ type TtsUserPrefs = {
// Snapshot of the effective TTS configuration surfaced by `/status`.
type TtsStatusSnapshot = {
  autoMode: TtsAutoMode;
  provider: TtsProvider;
  // Optional provider metadata; only present when configured.
  displayName?: string;
  model?: string;
  voice?: string;
  // Base URL with credentials/query/fragment stripped for display.
  baseUrl?: string;
  // True when baseUrl differs from the default OpenAI endpoint.
  customBaseUrl?: boolean;
  maxLength: number;
  summarize: boolean;
};
@@ -78,6 +85,116 @@ function resolveTtsAutoModeFromPrefs(prefs: TtsUserPrefs): TtsAutoMode | undefin
return undefined;
}
/** Narrow `value` to a plain object record: non-null, object, not an array. */
function isObjectRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
/**
 * Coerce an arbitrary config value into a compact single-line status string.
 * Non-strings and blank strings yield undefined; whitespace runs collapse to
 * a single space, and overlong values are truncated with a trailing "...".
 */
function normalizeStatusDetail(
  value: unknown,
  maxLength = MAX_STATUS_DETAIL_LENGTH,
): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  // Collapsing first then trimming yields the same result as trim-then-collapse.
  const collapsed = value.replace(/\s+/g, " ").trim();
  if (collapsed.length === 0) {
    return undefined;
  }
  if (collapsed.length <= maxLength) {
    return collapsed;
  }
  return `${collapsed.slice(0, maxLength - 3)}...`;
}
/**
 * Render a configured base URL safely for status output: credentials, query
 * string, and fragment are stripped and trailing slashes removed. Returns
 * undefined for blank/non-string input and "[invalid-url]" for unparseable
 * values, so secrets embedded in a malformed URL never leak.
 */
function sanitizeBaseUrlForStatus(value: unknown): string | undefined {
  const candidate = normalizeStatusDetail(value, 180);
  if (!candidate) {
    return undefined;
  }
  let parsed: URL;
  try {
    parsed = new URL(candidate);
  } catch {
    return "[invalid-url]";
  }
  // Strip every component that could carry credentials or tokens.
  parsed.username = "";
  parsed.password = "";
  parsed.search = "";
  parsed.hash = "";
  return normalizeStatusDetail(parsed.toString().replace(/\/+$/, ""), 120);
}
/** True when `baseUrl` is set and is not the default OpenAI API endpoint. */
function isCustomOpenAiTtsBaseUrl(baseUrl: string | undefined): boolean {
  if (!baseUrl) {
    return false;
  }
  // Trailing slashes are insignificant when comparing endpoints.
  const normalized = baseUrl.replace(/\/+$/, "");
  return normalized !== DEFAULT_OPENAI_TTS_BASE_URL;
}
/**
 * Return the first normalized, non-empty string among `record[key]` for the
 * given keys, or undefined when the record is missing or no key yields one.
 */
function firstStatusDetail(
  record: Record<string, unknown> | undefined,
  keys: string[],
): string | undefined {
  if (!record) {
    return undefined;
  }
  // normalizeStatusDetail never returns "", so find-on-defined matches the
  // original first-truthy semantics.
  return keys
    .map((key) => normalizeStatusDetail(record[key]))
    .find((candidate) => candidate !== undefined);
}
/**
 * Locate the raw config record describing `provider`, merging legacy layouts.
 *
 * "microsoft" merges top-level `edge`/`microsoft` records with the
 * provider-scoped `providers.edge`/`providers.microsoft` records, later
 * entries winning. Other providers prefer `providers.<id>`, then a top-level
 * `<id>` record, then fall back to the whole raw TTS config object.
 */
function resolveProviderConfigRecord(
  raw: TtsConfig,
  provider: TtsProvider,
): Record<string, unknown> | undefined {
  const rawRecord: Record<string, unknown> = isObjectRecord(raw)
    ? (raw as Record<string, unknown>)
    : {};
  const providers: Record<string, unknown> = isObjectRecord(raw.providers) ? raw.providers : {};
  if (provider === "microsoft") {
    const merged: Record<string, unknown> = {};
    const layers = [rawRecord.edge, rawRecord.microsoft, providers.edge, providers.microsoft];
    for (const layer of layers) {
      if (isObjectRecord(layer)) {
        Object.assign(merged, layer);
      }
    }
    return merged;
  }
  const scoped = providers[provider];
  if (isObjectRecord(scoped)) {
    return scoped;
  }
  const direct = rawRecord[provider];
  return isObjectRecord(direct) ? direct : rawRecord;
}
/**
 * Collect optional display metadata (name/model/voice/endpoint) for the
 * resolved TTS provider, for the `/status` voice line. The base URL is
 * sanitized; for OpenAI it is only reported when it differs from the default
 * endpoint, and `customBaseUrl` marks that case.
 */
function resolveStatusProviderDetails(raw: TtsConfig, provider: TtsProvider) {
  // "auto" carries no concrete provider config to report.
  if (provider === "auto") {
    return {};
  }
  const record = resolveProviderConfigRecord(raw, provider);
  const details: Partial<TtsStatusSnapshot> = {};
  const displayName = firstStatusDetail(record, ["displayName"]);
  const model = firstStatusDetail(record, ["model", "modelId"]);
  const voice = firstStatusDetail(record, ["voice", "voiceId", "voiceName"]);
  if (displayName !== undefined) {
    details.displayName = displayName;
  }
  if (model !== undefined) {
    details.model = model;
  }
  if (voice !== undefined) {
    details.voice = voice;
  }
  const baseUrl = sanitizeBaseUrlForStatus(record?.baseUrl);
  const customBaseUrl = provider === "openai" && isCustomOpenAiTtsBaseUrl(baseUrl);
  const shouldReportBaseUrl = baseUrl !== undefined && (provider !== "openai" || customBaseUrl);
  if (shouldReportBaseUrl) {
    details.baseUrl = baseUrl;
    details.customBaseUrl = customBaseUrl;
  }
  return details;
}
export function resolveStatusTtsSnapshot(params: {
cfg: OpenClawConfig;
sessionAuto?: string;
@@ -95,12 +212,15 @@ export function resolveStatusTtsSnapshot(params: {
return null;
}
const provider =
normalizeConfiguredSpeechProviderId(prefs.tts?.provider) ??
normalizeConfiguredSpeechProviderId(raw.provider) ??
"auto";
return {
autoMode,
provider:
normalizeConfiguredSpeechProviderId(prefs.tts?.provider) ??
normalizeConfiguredSpeechProviderId(raw.provider) ??
"auto",
provider,
...resolveStatusProviderDetails(raw, provider),
maxLength: prefs.tts?.maxLength ?? DEFAULT_TTS_MAX_LENGTH,
summarize: prefs.tts?.summarize ?? DEFAULT_TTS_SUMMARIZE,
};