From 36835592df4c2e1b48db55be772bbc18cbdc6b84 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Thu, 7 May 2026 22:17:42 +0100 Subject: [PATCH] feat: log discord voice transcripts --- CHANGELOG.md | 1 + docs/channels/discord.md | 1 + .../discord/src/voice/manager.e2e.test.ts | 41 +++++++++++++++++++ extensions/discord/src/voice/segment.ts | 12 ++++++ 4 files changed, 55 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5499fe668a2..7ad6a78d054 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ Docs: https://docs.openclaw.ai - Telegram: preserve the channel-specific 10-option poll cap in the unified outbound adapter so over-limit polls are rejected before send. (#78762) Thanks @obviyus. - Runtime/install: raise the supported Node 22 floor to `22.16+` so native SQLite query handling can rely on the `node:sqlite` statement metadata API while continuing to recommend Node 24. (#78921) +- Discord/voice: include a bounded one-line STT transcript preview in verbose voice logs so live voice debugging shows what speakers said before the agent reply. - Discord/voice: stream ElevenLabs TTS directly into Discord playback and send ElevenLabs latency optimization as the documented query parameter so spoken replies can start sooner. - Discord/voice: keep TTS playback running when another user starts speaking, ignore new capture during playback to avoid feedback loops, and downgrade expected receive-stream aborts to verbose diagnostics. - Telegram: treat successful same-chat `message` tool outbound sends during an inbound telegram turn as delivered when deciding whether to emit the rewritten silent reply fallback (#78685). Thanks @neeravmakwana. diff --git a/docs/channels/discord.md b/docs/channels/discord.md index 7ea444b865f..701a2e02df6 100644 --- a/docs/channels/discord.md +++ b/docs/channels/discord.md @@ -1211,6 +1211,7 @@ Notes: - OpenClaw also watches receive decrypt failures and auto-recovers by leaving/rejoining the voice channel after repeated failures in a short window. - If receive logs repeatedly show `DecryptionFailed(UnencryptedWhenPassthroughDisabled)` after updating, collect a dependency report and logs. The bundled `@discordjs/voice` line includes the upstream padding fix from discord.js PR #11449, which closed discord.js issue #11419. - `The operation was aborted` receive events are expected when OpenClaw finalizes a captured speaker segment; they are verbose diagnostics, not warnings. +- Verbose Discord voice logs include a bounded one-line STT transcript preview for each accepted speaker segment, so debugging shows both the user side and the agent reply side without dumping unbounded transcript text. Voice channel pipeline: diff --git a/extensions/discord/src/voice/manager.e2e.test.ts b/extensions/discord/src/voice/manager.e2e.test.ts index 18ef6ff3980..7cb21371f43 100644 --- a/extensions/discord/src/voice/manager.e2e.test.ts +++ b/extensions/discord/src/voice/manager.e2e.test.ts @@ -16,6 +16,7 @@ const { transcribeAudioFileMock, textToSpeechStreamMock, textToSpeechMock, + logVerboseMock, } = vi.hoisted(() => { type EventHandler = (...args: unknown[]) => unknown; type MockConnection = { @@ -111,6 +112,7 @@ const { async (): Promise => ({ success: false, error: "stream unavailable" }), ), textToSpeechMock: vi.fn(async () => ({ success: true, audioPath: "/tmp/voice.mp3" })), + logVerboseMock: vi.fn(), }; }); @@ -154,6 +156,16 @@ vi.mock("openclaw/plugin-sdk/agent-runtime", async () => { }; }); +vi.mock("openclaw/plugin-sdk/runtime-env", async () => { + const actual = await vi.importActual( + "openclaw/plugin-sdk/runtime-env", + ); + return { + ...actual, + logVerbose: logVerboseMock, + }; +}); + vi.mock("../runtime.js", () => ({ getDiscordRuntime: () => ({ mediaUnderstanding: { @@ -218,6 +230,7 @@ describe("DiscordVoiceManager", () => { textToSpeechStreamMock.mockResolvedValue({ success: false, error: "stream unavailable" }); textToSpeechMock.mockReset(); textToSpeechMock.mockResolvedValue({ success: true, audioPath: "/tmp/voice.mp3" }); + logVerboseMock.mockClear(); createAudioResourceMock.mockClear(); }); @@ -760,6 +773,34 @@ describe("DiscordVoiceManager", () => { ); }); + it("logs a bounded inbound transcript preview for voice debugging", async () => { + transcribeAudioFileMock.mockResolvedValueOnce({ + text: `hello from voice\n\n${"x".repeat(700)}`, + }); + const client = createClient(); + client.fetchMember.mockResolvedValue({ + nickname: "Debug Speaker", + user: { + id: "u-debug", + username: "debug", + globalName: "Debug", + discriminator: "0001", + }, + }); + const manager = createManager({ groupPolicy: "open" }, client, { + commands: { useAccessGroups: false }, + }); + + await processVoiceSegment(manager, "u-debug"); + + const transcriptLog = logVerboseMock.mock.calls + .map((call) => String(call[0])) + .find((message) => message.includes("transcript from Debug Speaker (u-debug)")); + expect(transcriptLog).toContain("hello from voice "); + expect(transcriptLog).not.toContain("\n"); + expect(transcriptLog?.length).toBeLessThan(650); + }); + it("plays streaming TTS audio before falling back to a synthesized file", async () => { const release = vi.fn(async () => undefined); textToSpeechStreamMock.mockResolvedValue({ diff --git a/extensions/discord/src/voice/segment.ts b/extensions/discord/src/voice/segment.ts index d1e61dd1464..fabefce8f34 100644 --- a/extensions/discord/src/voice/segment.ts +++ b/extensions/discord/src/voice/segment.ts @@ -21,8 +21,17 @@ import type { DiscordVoiceSpeakerContextResolver } from "./speaker-context.js"; import { synthesizeVoiceReplyAudio, transcribeVoiceAudio } from "./tts.js"; const DISCORD_VOICE_MESSAGE_PROVIDER = "discord-voice"; +const VOICE_TRANSCRIPT_LOG_PREVIEW_CHARS = 500; const logger = createSubsystemLogger("discord/voice"); +function formatVoiceTranscriptLogPreview(text: string): string { + const oneLine = text.replace(/\s+/g, " ").trim(); + if (oneLine.length <= VOICE_TRANSCRIPT_LOG_PREVIEW_CHARS) { + return oneLine; + } + return `${oneLine.slice(0, VOICE_TRANSCRIPT_LOG_PREVIEW_CHARS)}...`; +} + export async function processDiscordVoiceSegment(params: { entry: VoiceSessionEntry; wavPath: string; @@ -82,6 +91,9 @@ export async function processDiscordVoiceSegment(params: { logVoiceVerbose( `transcription ok (${transcript.length} chars): guild ${entry.guildId} channel ${entry.channelId}`, ); + logVoiceVerbose( + `transcript from ${speaker.label} (${userId}) in guild ${entry.guildId} channel ${entry.channelId}: ${formatVoiceTranscriptLogPreview(transcript)}`, + ); const prompt = formatVoiceIngressPrompt(transcript, speaker.label); const extraSystemPrompt = buildDiscordGroupSystemPrompt(access.channelConfig);