feat: log discord voice transcripts

This commit is contained in:
Peter Steinberger
2026-05-07 22:17:42 +01:00
parent 6785633d13
commit 36835592df
4 changed files with 55 additions and 0 deletions

View File

@@ -8,6 +8,7 @@ Docs: https://docs.openclaw.ai
- Telegram: preserve the channel-specific 10-option poll cap in the unified outbound adapter so over-limit polls are rejected before send. (#78762) Thanks @obviyus.
- Runtime/install: raise the supported Node 22 floor to `22.16+` so native SQLite query handling can rely on the `node:sqlite` statement metadata API while continuing to recommend Node 24. (#78921)
- Discord/voice: include a bounded one-line STT transcript preview in verbose voice logs so live voice debugging shows what speakers said before the agent reply.
- Discord/voice: stream ElevenLabs TTS directly into Discord playback and pass the ElevenLabs latency-optimization setting as its documented query parameter so spoken replies can start sooner.
- Discord/voice: keep TTS playback running when another user starts speaking, ignore new capture during playback to avoid feedback loops, and downgrade expected receive-stream aborts to verbose diagnostics.
- Telegram: treat successful same-chat `message` tool outbound sends during an inbound telegram turn as delivered when deciding whether to emit the rewritten silent reply fallback (#78685). Thanks @neeravmakwana.

View File

@@ -1211,6 +1211,7 @@ Notes:
- OpenClaw also watches receive decrypt failures and auto-recovers by leaving/rejoining the voice channel after repeated failures in a short window.
- If receive logs repeatedly show `DecryptionFailed(UnencryptedWhenPassthroughDisabled)` after updating, collect a dependency report and logs. The bundled `@discordjs/voice` line includes the upstream padding fix from discord.js PR #11449, which closed discord.js issue #11419.
- `The operation was aborted` receive events are expected when OpenClaw finalizes a captured speaker segment; they are verbose diagnostics, not warnings.
- Verbose Discord voice logs include a bounded one-line STT transcript preview for each accepted speaker segment, so debugging shows both the user side and the agent reply side without dumping unbounded transcript text.
Voice channel pipeline:

View File

@@ -16,6 +16,7 @@ const {
transcribeAudioFileMock,
textToSpeechStreamMock,
textToSpeechMock,
logVerboseMock,
} = vi.hoisted(() => {
type EventHandler = (...args: unknown[]) => unknown;
type MockConnection = {
@@ -111,6 +112,7 @@ const {
async (): Promise<unknown> => ({ success: false, error: "stream unavailable" }),
),
textToSpeechMock: vi.fn(async () => ({ success: true, audioPath: "/tmp/voice.mp3" })),
logVerboseMock: vi.fn(),
};
});
@@ -154,6 +156,16 @@ vi.mock("openclaw/plugin-sdk/agent-runtime", async () => {
};
});
// Mock the runtime-env plugin SDK module, replacing only `logVerbose` with the
// hoisted recording mock so tests can inspect verbose log output; every other
// export is passed through from the real module via `vi.importActual`.
// NOTE: vi.mock factories are hoisted, so `logVerboseMock` must come from
// `vi.hoisted` (it does) to be in scope here.
vi.mock("openclaw/plugin-sdk/runtime-env", async () => {
const actual = await vi.importActual<typeof import("openclaw/plugin-sdk/runtime-env")>(
"openclaw/plugin-sdk/runtime-env",
);
return {
...actual,
// Record verbose log calls for per-test assertions.
logVerbose: logVerboseMock,
};
});
vi.mock("../runtime.js", () => ({
getDiscordRuntime: () => ({
mediaUnderstanding: {
@@ -218,6 +230,7 @@ describe("DiscordVoiceManager", () => {
textToSpeechStreamMock.mockResolvedValue({ success: false, error: "stream unavailable" });
textToSpeechMock.mockReset();
textToSpeechMock.mockResolvedValue({ success: true, audioPath: "/tmp/voice.mp3" });
logVerboseMock.mockClear();
createAudioResourceMock.mockClear();
});
@@ -760,6 +773,34 @@ describe("DiscordVoiceManager", () => {
);
});
it("logs a bounded inbound transcript preview for voice debugging", async () => {
  // The next transcription result contains embedded newlines plus a long tail,
  // exercising both the one-line collapse and the preview length bound.
  transcribeAudioFileMock.mockResolvedValueOnce({
    text: `hello from voice\n\n${"x".repeat(700)}`,
  });

  const client = createClient();
  client.fetchMember.mockResolvedValue({
    nickname: "Debug Speaker",
    user: {
      id: "u-debug",
      username: "debug",
      globalName: "Debug",
      discriminator: "0001",
    },
  });
  const manager = createManager({ groupPolicy: "open" }, client, {
    commands: { useAccessGroups: false },
  });

  await processVoiceSegment(manager, "u-debug");

  // Scan the recorded verbose log calls for the transcript preview entry.
  let transcriptLog: string | undefined;
  for (const call of logVerboseMock.mock.calls) {
    const message = String(call[0]);
    if (message.includes("transcript from Debug Speaker (u-debug)")) {
      transcriptLog = message;
      break;
    }
  }

  expect(transcriptLog).toContain("hello from voice ");
  expect(transcriptLog).not.toContain("\n");
  expect(transcriptLog?.length).toBeLessThan(650);
});
it("plays streaming TTS audio before falling back to a synthesized file", async () => {
const release = vi.fn(async () => undefined);
textToSpeechStreamMock.mockResolvedValue({

View File

@@ -21,8 +21,17 @@ import type { DiscordVoiceSpeakerContextResolver } from "./speaker-context.js";
import { synthesizeVoiceReplyAudio, transcribeVoiceAudio } from "./tts.js";
const DISCORD_VOICE_MESSAGE_PROVIDER = "discord-voice";
const VOICE_TRANSCRIPT_LOG_PREVIEW_CHARS = 500;
const logger = createSubsystemLogger("discord/voice");
/**
 * Collapse an STT transcript into a single bounded line for verbose voice logs.
 *
 * Runs of whitespace (including newlines) become single spaces so the preview
 * never spans multiple log lines, and the result is truncated with a "..."
 * suffix once it exceeds the cap.
 *
 * @param text Raw transcript text; may contain arbitrary whitespace.
 * @param maxChars Preview cap, counted in Unicode code points. Defaults to
 *   the module-wide VOICE_TRANSCRIPT_LOG_PREVIEW_CHARS.
 * @returns A single-line preview of at most `maxChars` characters plus "...".
 */
function formatVoiceTranscriptLogPreview(
  text: string,
  maxChars: number = VOICE_TRANSCRIPT_LOG_PREVIEW_CHARS,
): string {
  const oneLine = text.replace(/\s+/g, " ").trim();
  // Measure and slice by code points rather than UTF-16 code units so
  // truncation can never split a surrogate pair (e.g. an emoji in the
  // transcript) and leak a lone surrogate into the log line. String spread
  // iterates by code point.
  const codePoints = [...oneLine];
  if (codePoints.length <= maxChars) {
    return oneLine;
  }
  return `${codePoints.slice(0, maxChars).join("")}...`;
}
export async function processDiscordVoiceSegment(params: {
entry: VoiceSessionEntry;
wavPath: string;
@@ -82,6 +91,9 @@ export async function processDiscordVoiceSegment(params: {
logVoiceVerbose(
`transcription ok (${transcript.length} chars): guild ${entry.guildId} channel ${entry.channelId}`,
);
logVoiceVerbose(
`transcript from ${speaker.label} (${userId}) in guild ${entry.guildId} channel ${entry.channelId}: ${formatVoiceTranscriptLogPreview(transcript)}`,
);
const prompt = formatVoiceIngressPrompt(transcript, speaker.label);
const extraSystemPrompt = buildDiscordGroupSystemPrompt(access.channelConfig);