From 36835592df4c2e1b48db55be772bbc18cbdc6b84 Mon Sep 17 00:00:00 2001
From: Peter Steinberger <steipete@gmail.com>
Date: Thu, 7 May 2026 22:17:42 +0100
Subject: [PATCH] feat: log discord voice transcripts

---
 CHANGELOG.md                                  |  1 +
 docs/channels/discord.md                      |  1 +
 .../discord/src/voice/manager.e2e.test.ts     | 41 +++++++++++++++++++
 extensions/discord/src/voice/segment.ts       | 12 ++++++
 4 files changed, 55 insertions(+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5499fe668a2..7ad6a78d054 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ Docs: https://docs.openclaw.ai
 
 - Telegram: preserve the channel-specific 10-option poll cap in the unified outbound adapter so over-limit polls are rejected before send. (#78762) Thanks @obviyus.
 - Runtime/install: raise the supported Node 22 floor to `22.16+` so native SQLite query handling can rely on the `node:sqlite` statement metadata API while continuing to recommend Node 24. (#78921)
+- Discord/voice: include a bounded one-line STT transcript preview in verbose voice logs so live voice debugging shows what speakers said before the agent reply.
 - Discord/voice: stream ElevenLabs TTS directly into Discord playback and send ElevenLabs latency optimization as the documented query parameter so spoken replies can start sooner.
 - Discord/voice: keep TTS playback running when another user starts speaking, ignore new capture during playback to avoid feedback loops, and downgrade expected receive-stream aborts to verbose diagnostics.
 - Telegram: treat successful same-chat `message` tool outbound sends during an inbound telegram turn as delivered when deciding whether to emit the rewritten silent reply fallback (#78685). Thanks @neeravmakwana.
diff --git a/docs/channels/discord.md b/docs/channels/discord.md
index 7ea444b865f..701a2e02df6 100644
--- a/docs/channels/discord.md
+++ b/docs/channels/discord.md
@@ -1211,6 +1211,7 @@ Notes:
 - OpenClaw also watches receive decrypt failures and auto-recovers by leaving/rejoining the voice channel after repeated failures in a short window.
 - If receive logs repeatedly show `DecryptionFailed(UnencryptedWhenPassthroughDisabled)` after updating, collect a dependency report and logs. The bundled `@discordjs/voice` line includes the upstream padding fix from discord.js PR #11449, which closed discord.js issue #11419.
 - `The operation was aborted` receive events are expected when OpenClaw finalizes a captured speaker segment; they are verbose diagnostics, not warnings.
+- Verbose Discord voice logs include a one-line STT transcript preview for each accepted speaker segment (whitespace collapsed, truncated to at most 500 characters followed by `...`), so debugging shows both the user side and the agent reply side without dumping unbounded transcript text.
 
 Voice channel pipeline:
 
diff --git a/extensions/discord/src/voice/manager.e2e.test.ts b/extensions/discord/src/voice/manager.e2e.test.ts
index 18ef6ff3980..7cb21371f43 100644
--- a/extensions/discord/src/voice/manager.e2e.test.ts
+++ b/extensions/discord/src/voice/manager.e2e.test.ts
@@ -16,6 +16,7 @@ const {
   transcribeAudioFileMock,
   textToSpeechStreamMock,
   textToSpeechMock,
+  logVerboseMock,
 } = vi.hoisted(() => {
   type EventHandler = (...args: unknown[]) => unknown;
   type MockConnection = {
@@ -111,6 +112,7 @@ const {
       async (): Promise<unknown> => ({ success: false, error: "stream unavailable" }),
     ),
     textToSpeechMock: vi.fn(async () => ({ success: true, audioPath: "/tmp/voice.mp3" })),
+    logVerboseMock: vi.fn(),
   };
 });
 
@@ -154,6 +156,16 @@ vi.mock("openclaw/plugin-sdk/agent-runtime", async () => {
   };
 });
 
+vi.mock("openclaw/plugin-sdk/runtime-env", async () => {
+  const actual = await vi.importActual<typeof import("openclaw/plugin-sdk/runtime-env")>(
+    "openclaw/plugin-sdk/runtime-env",
+  );
+  return {
+    ...actual, // keep every real runtime-env export as-is
+    logVerbose: logVerboseMock, // only logVerbose is replaced, by the hoisted spy, so tests can inspect log lines
+  };
+});
+
 vi.mock("../runtime.js", () => ({
   getDiscordRuntime: () => ({
     mediaUnderstanding: {
@@ -218,6 +230,7 @@ describe("DiscordVoiceManager", () => {
     textToSpeechStreamMock.mockResolvedValue({ success: false, error: "stream unavailable" });
     textToSpeechMock.mockReset();
     textToSpeechMock.mockResolvedValue({ success: true, audioPath: "/tmp/voice.mp3" });
+    logVerboseMock.mockClear();
     createAudioResourceMock.mockClear();
   });
 
@@ -760,6 +773,34 @@ describe("DiscordVoiceManager", () => {
     );
   });
 
+  it("logs a bounded inbound transcript preview for voice debugging", async () => {
+    transcribeAudioFileMock.mockResolvedValueOnce({
+      text: `hello from voice\n\n${"x".repeat(700)}`, // blank-line gap plus 700 filler chars
+    });
+    const client = createClient();
+    client.fetchMember.mockResolvedValue({
+      nickname: "Debug Speaker",
+      user: {
+        id: "u-debug",
+        username: "debug",
+        globalName: "Debug",
+        discriminator: "0001",
+      },
+    });
+    const manager = createManager({ groupPolicy: "open" }, client, {
+      commands: { useAccessGroups: false },
+    });
+
+    await processVoiceSegment(manager, "u-debug");
+
+    const transcriptLog = logVerboseMock.mock.calls // first verbose message that names this speaker
+      .map((call) => String(call[0]))
+      .find((message) => message.includes("transcript from Debug Speaker (u-debug)"));
+    expect(transcriptLog).toContain("hello from voice "); // the "\n\n" gap became a single space
+    expect(transcriptLog).not.toContain("\n"); // the whole log message is one line
+    expect(transcriptLog?.length).toBeLessThan(650); // the 700+ char transcript was cut down
+  });
+
   it("plays streaming TTS audio before falling back to a synthesized file", async () => {
     const release = vi.fn(async () => undefined);
     textToSpeechStreamMock.mockResolvedValue({
diff --git a/extensions/discord/src/voice/segment.ts b/extensions/discord/src/voice/segment.ts
index d1e61dd1464..fabefce8f34 100644
--- a/extensions/discord/src/voice/segment.ts
+++ b/extensions/discord/src/voice/segment.ts
@@ -21,8 +21,17 @@ import type { DiscordVoiceSpeakerContextResolver } from "./speaker-context.js";
 import { synthesizeVoiceReplyAudio, transcribeVoiceAudio } from "./tts.js";
 
 const DISCORD_VOICE_MESSAGE_PROVIDER = "discord-voice";
+const VOICE_TRANSCRIPT_LOG_PREVIEW_CHARS = 500; // cap, in UTF-16 code units, on preview text kept before "..." is appended
 const logger = createSubsystemLogger("discord/voice");
 
+/** One-line, length-capped transcript preview for logs; a cut never ends on half a surrogate pair. */
+function formatVoiceTranscriptLogPreview(text: string): string {
+  const oneLine = text.replace(/\s+/g, " ").trim();
+  const head = oneLine.slice(0, VOICE_TRANSCRIPT_LOG_PREVIEW_CHARS);
+  // slice() counts UTF-16 units, so drop a high surrogate stranded at the cut before adding "...".
+  return head === oneLine ? oneLine : `${head.replace(/[\uD800-\uDBFF]$/, "")}...`;
+}
+
 export async function processDiscordVoiceSegment(params: {
   entry: VoiceSessionEntry;
   wavPath: string;
@@ -82,6 +91,9 @@ export async function processDiscordVoiceSegment(params: {
   logVoiceVerbose(
     `transcription ok (${transcript.length} chars): guild ${entry.guildId} channel ${entry.channelId}`,
   );
+  logVoiceVerbose(
+    `transcript from ${speaker.label} (${userId}) in guild ${entry.guildId} channel ${entry.channelId}: ${formatVoiceTranscriptLogPreview(transcript)}`,
+  );
 
   const prompt = formatVoiceIngressPrompt(transcript, speaker.label);
   const extraSystemPrompt = buildDiscordGroupSystemPrompt(access.channelConfig);