feat: log discord voice transcripts

This commit is contained in:
Peter Steinberger
2026-05-07 22:17:42 +01:00
parent 6785633d13
commit 36835592df
4 changed files with 55 additions and 0 deletions

View File

@@ -8,6 +8,7 @@ Docs: https://docs.openclaw.ai
- Telegram: preserve the channel-specific 10-option poll cap in the unified outbound adapter so over-limit polls are rejected before send. (#78762) Thanks @obviyus.
- Runtime/install: raise the supported Node 22 floor to `22.16+` so native SQLite query handling can rely on the `node:sqlite` statement metadata API while continuing to recommend Node 24. (#78921)
- Discord/voice: include a bounded one-line STT transcript preview in verbose voice logs so live voice debugging shows what speakers said before the agent reply.
- Discord/voice: stream ElevenLabs TTS directly into Discord playback and pass the ElevenLabs latency-optimization setting as its documented query parameter so spoken replies can start sooner.
- Discord/voice: keep TTS playback running when another user starts speaking, ignore new capture during playback to avoid feedback loops, and downgrade expected receive-stream aborts to verbose diagnostics.
- Telegram: treat successful same-chat `message` tool outbound sends during an inbound telegram turn as delivered when deciding whether to emit the rewritten silent reply fallback (#78685). Thanks @neeravmakwana.

View File

@@ -1211,6 +1211,7 @@ Notes:
- OpenClaw also watches receive decrypt failures and auto-recovers by leaving/rejoining the voice channel after repeated failures in a short window.
- If receive logs repeatedly show `DecryptionFailed(UnencryptedWhenPassthroughDisabled)` after updating, collect a dependency report and logs. The bundled `@discordjs/voice` line includes the upstream padding fix from discord.js PR #11449, which closed discord.js issue #11419.
- `The operation was aborted` receive events are expected when OpenClaw finalizes a captured speaker segment; they are verbose diagnostics, not warnings.
- Verbose Discord voice logs include a bounded one-line STT transcript preview for each accepted speaker segment, so debugging shows both the user side and the agent reply side without dumping unbounded transcript text.
Voice channel pipeline:

View File

@@ -16,6 +16,7 @@ const {
transcribeAudioFileMock,
textToSpeechStreamMock,
textToSpeechMock,
logVerboseMock,
} = vi.hoisted(() => {
type EventHandler = (...args: unknown[]) => unknown;
type MockConnection = {
@@ -111,6 +112,7 @@ const {
async (): Promise<unknown> => ({ success: false, error: "stream unavailable" }),
),
textToSpeechMock: vi.fn(async () => ({ success: true, audioPath: "/tmp/voice.mp3" })),
logVerboseMock: vi.fn(),
};
});
@@ -154,6 +156,16 @@ vi.mock("openclaw/plugin-sdk/agent-runtime", async () => {
};
});
// Mock the runtime-env plugin SDK module, replacing only `logVerbose` with the
// hoisted recording mock so tests can inspect verbose log output; every other
// export is passed through from the real module via `vi.importActual`.
// NOTE: vi.mock factories are hoisted, so `logVerboseMock` must come from
// `vi.hoisted` (it does) to be in scope here.
vi.mock("openclaw/plugin-sdk/runtime-env", async () => {
const actual = await vi.importActual<typeof import("openclaw/plugin-sdk/runtime-env")>(
"openclaw/plugin-sdk/runtime-env",
);
return {
...actual,
// Record verbose log calls for per-test assertions.
logVerbose: logVerboseMock,
};
});
vi.mock("../runtime.js", () => ({
getDiscordRuntime: () => ({
mediaUnderstanding: {
@@ -218,6 +230,7 @@ describe("DiscordVoiceManager", () => {
textToSpeechStreamMock.mockResolvedValue({ success: false, error: "stream unavailable" });
textToSpeechMock.mockReset();
textToSpeechMock.mockResolvedValue({ success: true, audioPath: "/tmp/voice.mp3" });
logVerboseMock.mockClear();
createAudioResourceMock.mockClear();
});
@@ -760,6 +773,34 @@ describe("DiscordVoiceManager", () => {
);
});
it("logs a bounded inbound transcript preview for voice debugging", async () => {
  // The next transcription result contains embedded newlines plus a long tail,
  // exercising both the one-line collapse and the preview length bound.
  transcribeAudioFileMock.mockResolvedValueOnce({
    text: `hello from voice\n\n${"x".repeat(700)}`,
  });

  const client = createClient();
  client.fetchMember.mockResolvedValue({
    nickname: "Debug Speaker",
    user: {
      id: "u-debug",
      username: "debug",
      globalName: "Debug",
      discriminator: "0001",
    },
  });
  const manager = createManager({ groupPolicy: "open" }, client, {
    commands: { useAccessGroups: false },
  });

  await processVoiceSegment(manager, "u-debug");

  // Scan the recorded verbose log calls for the transcript preview entry.
  let transcriptLog: string | undefined;
  for (const call of logVerboseMock.mock.calls) {
    const message = String(call[0]);
    if (message.includes("transcript from Debug Speaker (u-debug)")) {
      transcriptLog = message;
      break;
    }
  }

  expect(transcriptLog).toContain("hello from voice ");
  expect(transcriptLog).not.toContain("\n");
  expect(transcriptLog?.length).toBeLessThan(650);
});
it("plays streaming TTS audio before falling back to a synthesized file", async () => {
const release = vi.fn(async () => undefined);
textToSpeechStreamMock.mockResolvedValue({

View File

@@ -21,8 +21,17 @@ import type { DiscordVoiceSpeakerContextResolver } from "./speaker-context.js";
import { synthesizeVoiceReplyAudio, transcribeVoiceAudio } from "./tts.js";
const DISCORD_VOICE_MESSAGE_PROVIDER = "discord-voice";
const VOICE_TRANSCRIPT_LOG_PREVIEW_CHARS = 500;
const logger = createSubsystemLogger("discord/voice");
/**
 * Collapse an STT transcript into a single bounded line for verbose voice logs.
 *
 * Runs of whitespace (including newlines) become single spaces so the preview
 * never spans multiple log lines, and the result is truncated with a "..."
 * suffix once it exceeds the cap.
 *
 * @param text Raw transcript text; may contain arbitrary whitespace.
 * @param maxChars Preview cap, counted in Unicode code points. Defaults to
 *   the module-wide VOICE_TRANSCRIPT_LOG_PREVIEW_CHARS.
 * @returns A single-line preview of at most `maxChars` characters plus "...".
 */
function formatVoiceTranscriptLogPreview(
  text: string,
  maxChars: number = VOICE_TRANSCRIPT_LOG_PREVIEW_CHARS,
): string {
  const oneLine = text.replace(/\s+/g, " ").trim();
  // Measure and slice by code points rather than UTF-16 code units so
  // truncation can never split a surrogate pair (e.g. an emoji in the
  // transcript) and leak a lone surrogate into the log line. String spread
  // iterates by code point.
  const codePoints = [...oneLine];
  if (codePoints.length <= maxChars) {
    return oneLine;
  }
  return `${codePoints.slice(0, maxChars).join("")}...`;
}
export async function processDiscordVoiceSegment(params: {
entry: VoiceSessionEntry;
wavPath: string;
@@ -82,6 +91,9 @@ export async function processDiscordVoiceSegment(params: {
logVoiceVerbose(
`transcription ok (${transcript.length} chars): guild ${entry.guildId} channel ${entry.channelId}`,
);
logVoiceVerbose(
`transcript from ${speaker.label} (${userId}) in guild ${entry.guildId} channel ${entry.channelId}: ${formatVoiceTranscriptLogPreview(transcript)}`,
);
const prompt = formatVoiceIngressPrompt(transcript, speaker.label);
const extraSystemPrompt = buildDiscordGroupSystemPrompt(access.channelConfig);