diff --git a/CHANGELOG.md b/CHANGELOG.md index ace9d907b27..467fab0f5ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -68,6 +68,7 @@ Docs: https://docs.openclaw.ai - ACP: send subagent and async-task completion wakes to external ACP harnesses as plain prompts instead of OpenClaw internal runtime-context envelopes, while keeping those envelopes out of ACP transcripts. +- TTS/status: show configured TTS model, voice, and sanitized custom endpoint in `/status`, preserve OpenAI-compatible TTS instructions on custom endpoints, and retry empty Microsoft/Edge TTS output once. Addresses #46602, #47232, and #43936. Thanks @leekuangtao, @Huntterxx, and @rex993. - Agents/Claude: treat zero-token empty `stop` turns as failed provider output, retry once, repair replay, and allow configured model fallback instead of preserving them as successful silent replies. Fixes #71880. Thanks @MagnaAI. diff --git a/docs/tools/tts.md b/docs/tools/tts.md index 7c8e879c5dc..df0ea10079e 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -846,6 +846,8 @@ Notes: - success fallback: `Fallback: -> ` plus `Attempts: ...` - failure: `Error: ...` plus `Attempts: ...` - detailed diagnostics: `Attempt details: provider:outcome(reasonCode) latency` +- `/status` shows the active TTS mode plus configured provider, model, voice, + and sanitized custom endpoint metadata when TTS is enabled. - OpenAI and ElevenLabs API failures now include parsed provider error detail and request id (when returned by the provider), which is surfaced in TTS errors/logs. 
## Agent tool diff --git a/extensions/microsoft/tts.test.ts b/extensions/microsoft/tts.test.ts index 521ef742130..4d9dccbb9d2 100644 --- a/extensions/microsoft/tts.test.ts +++ b/extensions/microsoft/tts.test.ts @@ -1,13 +1,20 @@ import { mkdtempSync, rmSync, writeFileSync } from "node:fs"; import { tmpdir } from "node:os"; import path from "node:path"; -import { afterEach, beforeAll, describe, expect, it } from "vitest"; +import { afterEach, beforeAll, describe, expect, it, vi } from "vitest"; let edgeTTS: typeof import("./tts.js").edgeTTS; -function createEdgeTTSDeps(ttsPromise: (text: string, filePath: string) => Promise<void>) { +function createEdgeTTSDeps( + ttsPromise: (text: string, filePath: string) => Promise<void>, + onConstruct?: () => void, +) { return { EdgeTTS: class { + constructor() { + onConstruct?.(); + } + ttsPromise(text: string, filePath: string) { return ttsPromise(text, filePath); } @@ -36,11 +43,35 @@ describe("edgeTTS empty audio validation", () => { } }); - it("throws when the output file is 0 bytes", async () => { + it("rejects blank text before constructing Edge TTS", async () => { tempDir = mkdtempSync(path.join(tmpdir(), "tts-test-")); const outputPath = path.join(tempDir, "voice.mp3"); - + const onConstruct = vi.fn(); const deps = createEdgeTTSDeps(async (_text: string, filePath: string) => { + writeFileSync(filePath, Buffer.from([0xff])); + }, onConstruct); + + await expect( + edgeTTS( + { + text: " \n\t ", + outputPath, + config: baseEdgeConfig, + timeoutMs: 10000, + }, + deps, + ), + ).rejects.toThrow("Microsoft TTS text cannot be empty"); + expect(onConstruct).not.toHaveBeenCalled(); + }); + + it("throws after one retry when the output file stays empty", async () => { + tempDir = mkdtempSync(path.join(tmpdir(), "tts-test-")); + const outputPath = path.join(tempDir, "voice.mp3"); + const calls: string[] = []; + + const deps = createEdgeTTSDeps(async (text: string, filePath: string) => { + calls.push(text); writeFileSync(filePath, ""); }); @@
-54,7 +85,8 @@ describe("edgeTTS empty audio validation", () => { }, deps, ), - ).rejects.toThrow("Edge TTS produced empty audio file"); + ).rejects.toThrow("Edge TTS produced empty audio file after retry"); + expect(calls).toEqual(["Hello", "Hello"]); }); it("succeeds when the output file has content", async () => { @@ -77,4 +109,78 @@ describe("edgeTTS empty audio validation", () => { ), ).resolves.toBeUndefined(); }); + + it("retries once when the first output file is empty", async () => { + tempDir = mkdtempSync(path.join(tmpdir(), "tts-test-")); + const outputPath = path.join(tempDir, "voice.mp3"); + const calls: string[] = []; + + const deps = createEdgeTTSDeps(async (text: string, filePath: string) => { + calls.push(text); + writeFileSync(filePath, calls.length === 1 ? "" : Buffer.from([0xff, 0xfb, 0x90, 0x00])); + }); + + await expect( + edgeTTS( + { + text: "Hello", + outputPath, + config: baseEdgeConfig, + timeoutMs: 10000, + }, + deps, + ), + ).resolves.toBeUndefined(); + expect(calls).toEqual(["Hello", "Hello"]); + }); + + it("retries once when Edge TTS resolves without creating an output file", async () => { + tempDir = mkdtempSync(path.join(tmpdir(), "tts-test-")); + const outputPath = path.join(tempDir, "voice.mp3"); + const calls: string[] = []; + + const deps = createEdgeTTSDeps(async (text: string, filePath: string) => { + calls.push(text); + if (calls.length === 2) { + writeFileSync(filePath, Buffer.from([0xff, 0xfb, 0x90, 0x00])); + } + }); + + await expect( + edgeTTS( + { + text: "Hello", + outputPath, + config: baseEdgeConfig, + timeoutMs: 10000, + }, + deps, + ), + ).resolves.toBeUndefined(); + expect(calls).toEqual(["Hello", "Hello"]); + }); + + it("does not retry provider errors", async () => { + tempDir = mkdtempSync(path.join(tmpdir(), "tts-test-")); + const outputPath = path.join(tempDir, "voice.mp3"); + const calls: string[] = []; + + const deps = createEdgeTTSDeps(async (text: string) => { + calls.push(text); + throw new 
Error("upstream timeout"); + }); + + await expect( + edgeTTS( + { + text: "Hello", + outputPath, + config: baseEdgeConfig, + timeoutMs: 10000, + }, + deps, + ), + ).rejects.toThrow("upstream timeout"); + expect(calls).toEqual(["Hello"]); + }); }); diff --git a/extensions/microsoft/tts.ts b/extensions/microsoft/tts.ts index bfc2dc3b549..c4521e2d943 100644 --- a/extensions/microsoft/tts.ts +++ b/extensions/microsoft/tts.ts @@ -24,6 +24,26 @@ async function loadDefaultEdgeTTSDeps(): Promise { return { EdgeTTS }; } +function isMissingOutputFileError(error: unknown): boolean { + return ( + typeof error === "object" && + error !== null && + "code" in error && + (error as { code?: unknown }).code === "ENOENT" + ); +} + +function readOutputSize(outputPath: string): number { + try { + return statSync(outputPath).size; + } catch (error) { + if (isMissingOutputFileError(error)) { + return 0; + } + throw error; + } +} + export function inferEdgeExtension(outputFormat: string): string { const normalized = normalizeLowercaseStringOrEmpty(outputFormat); if (normalized.includes("webm")) { @@ -61,6 +81,10 @@ export async function edgeTTS( deps?: EdgeTTSDeps, ): Promise { const { text, outputPath, config, timeoutMs } = params; + if (text.trim().length === 0) { + throw new Error("Microsoft TTS text cannot be empty"); + } + const resolvedDeps = deps ?? (await loadDefaultEdgeTTSDeps()); const tts = new resolvedDeps.EdgeTTS({ voice: config.voice, @@ -73,10 +97,12 @@ export async function edgeTTS( volume: config.volume, timeout: config.timeoutMs ?? 
timeoutMs, }); - await tts.ttsPromise(text, outputPath); - const { size } = statSync(outputPath); - if (size === 0) { - throw new Error("Edge TTS produced empty audio file"); + for (let attempt = 0; attempt < 2; attempt += 1) { + await tts.ttsPromise(text, outputPath); + if (readOutputSize(outputPath) > 0) { + return; + } } + throw new Error("Edge TTS produced empty audio file after retry"); } diff --git a/extensions/openai/tts.test.ts b/extensions/openai/tts.test.ts index 9e48c8fc6ca..98b609b01de 100644 --- a/extensions/openai/tts.test.ts +++ b/extensions/openai/tts.test.ts @@ -91,9 +91,75 @@ describe("openai tts", () => { expect(resolveOpenAITtsInstructions("tts-1-hd", "Speak warmly")).toBeUndefined(); expect(resolveOpenAITtsInstructions("gpt-4o-mini-tts", " ")).toBeUndefined(); }); + + it("preserves instructions for custom OpenAI-compatible TTS endpoints", () => { + expect( + resolveOpenAITtsInstructions("tts-1", " Speak warmly ", "https://tts.example.com/v1"), + ).toBe("Speak warmly"); + expect( + resolveOpenAITtsInstructions("tts-1", " Speak warmly ", "https://api.openai.com/v1/"), + ).toBeUndefined(); + expect( + resolveOpenAITtsInstructions("tts-1", " ", "https://tts.example.com/v1"), + ).toBeUndefined(); + }); }); describe("openaiTTS diagnostics", () => { + it("sends instructions to custom OpenAI-compatible endpoints", async () => { + const fetchMock = vi.fn( + async (_url: string | URL, _init?: RequestInit) => + new Response(Buffer.from("audio-bytes"), { status: 200 }), + ); + globalThis.fetch = fetchMock as unknown as typeof fetch; + + await openaiTTS({ + text: "hello", + apiKey: "test-key", + baseUrl: "https://tts.example.com/v1", + model: "tts-1", + voice: "custom-voice", + instructions: " Speak warmly ", + responseFormat: "mp3", + timeoutMs: 5_000, + }); + + const [, init] = fetchMock.mock.calls[0] ?? 
[]; + if (typeof init?.body !== "string") { + throw new Error("expected JSON request body"); + } + const body = JSON.parse(init.body) as Record<string, unknown>; + expect(body.instructions).toBe("Speak warmly"); + expect(body.model).toBe("tts-1"); + expect(body.voice).toBe("custom-voice"); + }); + + it("omits instructions for unsupported models on the official OpenAI endpoint", async () => { + const fetchMock = vi.fn( + async (_url: string | URL, _init?: RequestInit) => + new Response(Buffer.from("audio-bytes"), { status: 200 }), + ); + globalThis.fetch = fetchMock as unknown as typeof fetch; + + await openaiTTS({ + text: "hello", + apiKey: "test-key", + baseUrl: "https://api.openai.com/v1/", + model: "tts-1", + voice: "alloy", + instructions: "Speak warmly", + responseFormat: "mp3", + timeoutMs: 5_000, + }); + + const [, init] = fetchMock.mock.calls[0] ?? []; + if (typeof init?.body !== "string") { + throw new Error("expected JSON request body"); + } + const body = JSON.parse(init.body) as Record<string, unknown>; + expect(body.instructions).toBeUndefined(); + }); + it("includes parsed provider detail and request id for JSON API errors", async () => { const fetchMock = vi.fn( async () => diff --git a/extensions/openai/tts.ts b/extensions/openai/tts.ts index 5947648063b..87c30024fcf 100644 --- a/extensions/openai/tts.ts +++ b/extensions/openai/tts.ts @@ -63,9 +63,16 @@ export function isValidOpenAIVoice(voice: string, baseUrl?: string): voice is Op export function resolveOpenAITtsInstructions( model: string, instructions?: string, + baseUrl?: string, ): string | undefined { const next = instructions?.trim(); - return next && model.includes("gpt-4o-mini-tts") ? next : undefined; + if (!next) { + return undefined; + } + if (baseUrl !== undefined && isCustomOpenAIEndpoint(baseUrl)) { + return next; + } + return model.includes("gpt-4o-mini-tts") ?
next : undefined; } export async function openaiTTS(params: { @@ -81,7 +88,7 @@ export async function openaiTTS(params: { }): Promise { const { text, apiKey, baseUrl, model, voice, speed, instructions, responseFormat, timeoutMs } = params; - const effectiveInstructions = resolveOpenAITtsInstructions(model, instructions); + const effectiveInstructions = resolveOpenAITtsInstructions(model, instructions, baseUrl); if (!isValidOpenAIModel(model, baseUrl)) { throw new Error(`Invalid model: ${model}`); diff --git a/src/auto-reply/status.test.ts b/src/auto-reply/status.test.ts index 8be53dc86d1..33e35158db9 100644 --- a/src/auto-reply/status.test.ts +++ b/src/auto-reply/status.test.ts @@ -103,6 +103,39 @@ describe("buildStatusMessage", () => { expect(normalized).toContain("Queue: collect"); }); + it("shows sanitized TTS provider details in the voice status line", async () => { + await withTempHome(async () => { + const text = buildStatusMessage({ + config: { + messages: { + tts: { + auto: "always", + provider: "openai", + providers: { + openai: { + displayName: "NeuTTS local", + baseUrl: "http://user:secret@127.0.0.1:18801/v1?token=hidden#fragment", + model: "neutts-nano", + voice: "clara", + }, + }, + }, + }, + } as unknown as OpenClawConfig, + agent: {}, + now: 0, + }); + const normalized = normalizeTestText(text); + + expect(normalized).toContain( + "Voice: always · provider=openai · name=NeuTTS local · model=neutts-nano · voice=clara · endpoint=custom(http://127.0.0.1:18801/v1)", + ); + expect(normalized).not.toContain("secret"); + expect(normalized).not.toContain("token=hidden"); + expect(normalized).not.toContain("fragment"); + }); + }); + it("shows the model runtime for CLI-backed providers", () => { const text = buildStatusMessage({ config: { diff --git a/src/status/status-message.ts b/src/status/status-message.ts index 7520123c7e6..a6181d1e609 100644 --- a/src/status/status-message.ts +++ b/src/status/status-message.ts @@ -464,7 +464,25 @@ const 
formatVoiceModeLine = ( if (!snapshot) { return null; } - return `🔊 Voice: ${snapshot.autoMode} · provider=${snapshot.provider} · limit=${snapshot.maxLength} · summary=${snapshot.summarize ? "on" : "off"}`; + const parts = [`🔊 Voice: ${snapshot.autoMode}`, `provider=${snapshot.provider}`]; + if (snapshot.displayName) { + parts.push(`name=${snapshot.displayName}`); + } + if (snapshot.model) { + parts.push(`model=${snapshot.model}`); + } + if (snapshot.voice) { + parts.push(`voice=${snapshot.voice}`); + } + if (snapshot.baseUrl) { + parts.push( + snapshot.customBaseUrl + ? `endpoint=custom(${snapshot.baseUrl})` + : `endpoint=${snapshot.baseUrl}`, + ); + } + parts.push(`limit=${snapshot.maxLength}`, `summary=${snapshot.summarize ? "on" : "off"}`); + return parts.join(" · "); }; export function buildStatusMessage(args: StatusArgs): string { diff --git a/src/tts/status-config.test.ts b/src/tts/status-config.test.ts index 8ec0f33ce37..76a263d4c96 100644 --- a/src/tts/status-config.test.ts +++ b/src/tts/status-config.test.ts @@ -138,6 +138,162 @@ describe("resolveStatusTtsSnapshot", () => { }); }); + it("reports configured OpenAI TTS model, voice, and sanitized custom endpoint", async () => { + await withStatusTempHome(async () => { + expect( + resolveStatusTtsSnapshot({ + cfg: { + messages: { + tts: { + auto: "always", + provider: "openai", + providers: { + openai: { + displayName: "NeuTTS local", + baseUrl: "http://user:secret@127.0.0.1:18801/v1?token=hidden#fragment", + model: "neutts-nano", + voice: "clara", + }, + }, + }, + }, + } as OpenClawConfig, + }), + ).toEqual({ + autoMode: "always", + provider: "openai", + displayName: "NeuTTS local", + model: "neutts-nano", + voice: "clara", + baseUrl: "http://127.0.0.1:18801/v1", + customBaseUrl: true, + maxLength: 1500, + summarize: true, + }); + }); + }); + + it("omits default OpenAI endpoint details from status", async () => { + await withStatusTempHome(async () => { + expect( + resolveStatusTtsSnapshot({ + cfg: { + 
messages: { + tts: { + auto: "always", + provider: "openai", + providers: { + openai: { + baseUrl: "https://api.openai.com/v1/", + model: "gpt-4o-mini-tts", + voice: "coral", + }, + }, + }, + }, + } as OpenClawConfig, + }), + ).toEqual({ + autoMode: "always", + provider: "openai", + model: "gpt-4o-mini-tts", + voice: "coral", + maxLength: 1500, + summarize: true, + }); + }); + }); + + it("reports merged per-agent provider metadata", async () => { + await withStatusTempHome(async () => { + expect( + resolveStatusTtsSnapshot({ + cfg: { + messages: { + tts: { + auto: "off", + provider: "openai", + providers: { + openai: { + model: "gpt-4o-mini-tts", + voice: "coral", + }, + }, + }, + }, + agents: { + list: [ + { + id: "reader", + tts: { + auto: "always", + providers: { + openai: { + voice: "nova", + }, + }, + }, + }, + ], + }, + } as OpenClawConfig, + agentId: "reader", + }), + ).toEqual({ + autoMode: "always", + provider: "openai", + model: "gpt-4o-mini-tts", + voice: "nova", + maxLength: 1500, + summarize: true, + }); + }); + }); + + it("uses provider metadata for local provider prefs overrides", async () => { + await withStatusTempHome(async (home) => { + const prefsPath = path.join(home, ".openclaw", "settings", "tts.json"); + fs.mkdirSync(path.dirname(prefsPath), { recursive: true }); + fs.writeFileSync( + prefsPath, + JSON.stringify({ + tts: { + auto: "always", + provider: "edge", + }, + }), + ); + + expect( + resolveStatusTtsSnapshot({ + cfg: { + messages: { + tts: { + provider: "openai", + prefsPath, + providers: { + microsoft: { + voice: "en-US-AvaMultilingualNeural", + }, + openai: { + model: "gpt-4o-mini-tts", + voice: "coral", + }, + }, + }, + }, + } as OpenClawConfig, + }), + ).toEqual({ + autoMode: "always", + provider: "microsoft", + voice: "en-US-AvaMultilingualNeural", + maxLength: 1500, + summarize: true, + }); + }); + }); + it("derives the default prefs path from OPENCLAW_CONFIG_PATH when set", async () => { await withStatusTempHome(async (home) => 
{ const stateDir = path.join(home, ".openclaw-dev"); diff --git a/src/tts/status-config.ts b/src/tts/status-config.ts index 2d5509001ad..04b6e5b2439 100644 --- a/src/tts/status-config.ts +++ b/src/tts/status-config.ts @@ -12,6 +12,8 @@ import { resolveEffectiveTtsConfig } from "./tts-config.js"; const DEFAULT_TTS_MAX_LENGTH = 1500; const DEFAULT_TTS_SUMMARIZE = true; +const DEFAULT_OPENAI_TTS_BASE_URL = "https://api.openai.com/v1"; +const MAX_STATUS_DETAIL_LENGTH = 96; type TtsUserPrefs = { tts?: { @@ -26,6 +28,11 @@ type TtsStatusSnapshot = { autoMode: TtsAutoMode; provider: TtsProvider; + displayName?: string; + model?: string; + voice?: string; + baseUrl?: string; + customBaseUrl?: boolean; maxLength: number; summarize: boolean; }; @@ -78,6 +85,116 @@ function resolveTtsAutoModeFromPrefs(prefs: TtsUserPrefs): TtsAutoMode | undefin return undefined; } +function isObjectRecord(value: unknown): value is Record<string, unknown> { + return typeof value === "object" && value !== null && !Array.isArray(value); +} + +function normalizeStatusDetail( + value: unknown, + maxLength = MAX_STATUS_DETAIL_LENGTH, +): string | undefined { + if (typeof value !== "string") { + return undefined; + } + const normalized = value.trim().replace(/\s+/g, " "); + if (!normalized) { + return undefined; + } + return normalized.length > maxLength ? `${normalized.slice(0, maxLength - 3)}...` : normalized; +} + +function sanitizeBaseUrlForStatus(value: unknown): string | undefined { + const raw = normalizeStatusDetail(value, 180); + if (!raw) { + return undefined; + } + try { + const parsed = new URL(raw); + parsed.username = ""; + parsed.password = ""; + parsed.search = ""; + parsed.hash = ""; + const sanitized = parsed.toString().replace(/\/+$/, ""); + return normalizeStatusDetail(sanitized, 120); + } catch { + return "[invalid-url]"; + } +} + +function isCustomOpenAiTtsBaseUrl(baseUrl: string | undefined): boolean { + return baseUrl ?
baseUrl.replace(/\/+$/, "") !== DEFAULT_OPENAI_TTS_BASE_URL : false; +} + +function firstStatusDetail( + record: Record<string, unknown> | undefined, + keys: string[], +): string | undefined { + if (!record) { + return undefined; + } + for (const key of keys) { + const value = normalizeStatusDetail(record[key]); + if (value) { + return value; + } + } + return undefined; +} + +function resolveProviderConfigRecord( + raw: TtsConfig, + provider: TtsProvider, +): Record<string, unknown> | undefined { + const rawRecord: Record<string, unknown> = isObjectRecord(raw) + ? (raw as Record<string, unknown>) + : {}; + const providers: Record<string, unknown> = isObjectRecord(raw.providers) ? raw.providers : {}; + if (provider === "microsoft") { + return { + ...(isObjectRecord(rawRecord.edge) ? rawRecord.edge : {}), + ...(isObjectRecord(rawRecord.microsoft) ? rawRecord.microsoft : {}), + ...(isObjectRecord(providers.edge) ? providers.edge : {}), + ...(isObjectRecord(providers.microsoft) ? providers.microsoft : {}), + }; + } + const direct = rawRecord[provider]; + const providerScoped = providers[provider]; + if (isObjectRecord(providerScoped)) { + return providerScoped; + } + if (isObjectRecord(direct)) { + return direct; + } + return rawRecord; +} + +function resolveStatusProviderDetails(raw: TtsConfig, provider: TtsProvider) { + if (provider === "auto") { + return {}; + } + const record = resolveProviderConfigRecord(raw, provider); + const sanitizedBaseUrl = sanitizeBaseUrlForStatus(record?.baseUrl); + const customBaseUrl = provider === "openai" && isCustomOpenAiTtsBaseUrl(sanitizedBaseUrl); + const details: Partial<TtsStatusSnapshot> = {}; + const displayName = firstStatusDetail(record, ["displayName"]); + if (displayName) { + details.displayName = displayName; + } + const model = firstStatusDetail(record, ["model", "modelId"]); + if (model) { + details.model = model; + } + const voice = firstStatusDetail(record, ["voice", "voiceId", "voiceName"]); + if (voice) { + details.voice = voice; + } + if (sanitizedBaseUrl && (provider !== "openai" || customBaseUrl)) { +
details.baseUrl = sanitizedBaseUrl; + details.customBaseUrl = customBaseUrl; + } + return details; +} + export function resolveStatusTtsSnapshot(params: { cfg: OpenClawConfig; sessionAuto?: string; @@ -95,12 +212,15 @@ export function resolveStatusTtsSnapshot(params: { return null; } + const provider = + normalizeConfiguredSpeechProviderId(prefs.tts?.provider) ?? + normalizeConfiguredSpeechProviderId(raw.provider) ?? + "auto"; + return { autoMode, - provider: - normalizeConfiguredSpeechProviderId(prefs.tts?.provider) ?? - normalizeConfiguredSpeechProviderId(raw.provider) ?? - "auto", + provider, + ...resolveStatusProviderDetails(raw, provider), maxLength: prefs.tts?.maxLength ?? DEFAULT_TTS_MAX_LENGTH, summarize: prefs.tts?.summarize ?? DEFAULT_TTS_SUMMARIZE, };