mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-08 23:30:42 +00:00
feat: log discord voice transcripts
This commit is contained in:
@@ -8,6 +8,7 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
- Telegram: preserve the channel-specific 10-option poll cap in the unified outbound adapter so over-limit polls are rejected before send. (#78762) Thanks @obviyus.
|
||||
- Runtime/install: raise the supported Node 22 floor to `22.16+` so native SQLite query handling can rely on the `node:sqlite` statement metadata API while continuing to recommend Node 24. (#78921)
|
||||
- Discord/voice: include a bounded one-line STT transcript preview in verbose voice logs so live voice debugging shows what speakers said before the agent reply.
|
||||
- Discord/voice: stream ElevenLabs TTS directly into Discord playback and send ElevenLabs latency optimization as the documented query parameter so spoken replies can start sooner.
|
||||
- Discord/voice: keep TTS playback running when another user starts speaking, ignore new capture during playback to avoid feedback loops, and downgrade expected receive-stream aborts to verbose diagnostics.
|
||||
- Telegram: treat successful same-chat `message` tool outbound sends during an inbound telegram turn as delivered when deciding whether to emit the rewritten silent reply fallback (#78685). Thanks @neeravmakwana.
|
||||
|
||||
@@ -1211,6 +1211,7 @@ Notes:
|
||||
- OpenClaw also watches receive decrypt failures and auto-recovers by leaving/rejoining the voice channel after repeated failures in a short window.
|
||||
- If receive logs repeatedly show `DecryptionFailed(UnencryptedWhenPassthroughDisabled)` after updating, collect a dependency report and logs. The bundled `@discordjs/voice` line includes the upstream padding fix from discord.js PR #11449, which closed discord.js issue #11419.
|
||||
- `The operation was aborted` receive events are expected when OpenClaw finalizes a captured speaker segment; they are verbose diagnostics, not warnings.
|
||||
- Verbose Discord voice logs include a bounded one-line STT transcript preview for each accepted speaker segment, so debugging shows both the user side and the agent reply side without dumping unbounded transcript text.
|
||||
|
||||
Voice channel pipeline:
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ const {
|
||||
transcribeAudioFileMock,
|
||||
textToSpeechStreamMock,
|
||||
textToSpeechMock,
|
||||
logVerboseMock,
|
||||
} = vi.hoisted(() => {
|
||||
type EventHandler = (...args: unknown[]) => unknown;
|
||||
type MockConnection = {
|
||||
@@ -111,6 +112,7 @@ const {
|
||||
async (): Promise<unknown> => ({ success: false, error: "stream unavailable" }),
|
||||
),
|
||||
textToSpeechMock: vi.fn(async () => ({ success: true, audioPath: "/tmp/voice.mp3" })),
|
||||
logVerboseMock: vi.fn(),
|
||||
};
|
||||
});
|
||||
|
||||
@@ -154,6 +156,16 @@ vi.mock("openclaw/plugin-sdk/agent-runtime", async () => {
|
||||
};
|
||||
});
|
||||
|
||||
// Partially mock runtime-env: every real export is kept, and only `logVerbose`
// is swapped for the hoisted spy so tests can inspect verbose log lines.
vi.mock("openclaw/plugin-sdk/runtime-env", async () => {
  const realRuntimeEnv = await vi.importActual<typeof import("openclaw/plugin-sdk/runtime-env")>(
    "openclaw/plugin-sdk/runtime-env",
  );
  return { ...realRuntimeEnv, logVerbose: logVerboseMock };
});
|
||||
|
||||
vi.mock("../runtime.js", () => ({
|
||||
getDiscordRuntime: () => ({
|
||||
mediaUnderstanding: {
|
||||
@@ -218,6 +230,7 @@ describe("DiscordVoiceManager", () => {
|
||||
textToSpeechStreamMock.mockResolvedValue({ success: false, error: "stream unavailable" });
|
||||
textToSpeechMock.mockReset();
|
||||
textToSpeechMock.mockResolvedValue({ success: true, audioPath: "/tmp/voice.mp3" });
|
||||
logVerboseMock.mockClear();
|
||||
createAudioResourceMock.mockClear();
|
||||
});
|
||||
|
||||
@@ -760,6 +773,34 @@ describe("DiscordVoiceManager", () => {
|
||||
);
|
||||
});
|
||||
|
||||
it("logs a bounded inbound transcript preview for voice debugging", async () => {
  // A multi-line transcript with a 700-character tail after the greeting.
  const rawTranscript = `hello from voice\n\n${"x".repeat(700)}`;
  transcribeAudioFileMock.mockResolvedValueOnce({ text: rawTranscript });

  const client = createClient();
  client.fetchMember.mockResolvedValue({
    nickname: "Debug Speaker",
    user: {
      id: "u-debug",
      username: "debug",
      globalName: "Debug",
      discriminator: "0001",
    },
  });
  const manager = createManager({ groupPolicy: "open" }, client, {
    commands: { useAccessGroups: false },
  });

  await processVoiceSegment(manager, "u-debug");

  // Find the verbose log line that carries the speaker's transcript preview.
  const verboseMessages = logVerboseMock.mock.calls.map((args) => String(args[0]));
  const transcriptLog = verboseMessages.find((message) =>
    message.includes("transcript from Debug Speaker (u-debug)"),
  );

  // Newlines are collapsed to single spaces and the overall line stays bounded.
  expect(transcriptLog).toContain("hello from voice ");
  expect(transcriptLog).not.toContain("\n");
  expect(transcriptLog?.length).toBeLessThan(650);
});
|
||||
|
||||
it("plays streaming TTS audio before falling back to a synthesized file", async () => {
|
||||
const release = vi.fn(async () => undefined);
|
||||
textToSpeechStreamMock.mockResolvedValue({
|
||||
|
||||
@@ -21,8 +21,17 @@ import type { DiscordVoiceSpeakerContextResolver } from "./speaker-context.js";
|
||||
import { synthesizeVoiceReplyAudio, transcribeVoiceAudio } from "./tts.js";
|
||||
|
||||
const DISCORD_VOICE_MESSAGE_PROVIDER = "discord-voice";
|
||||
// Maximum number of transcript characters (UTF-16 code units, as measured by
// `String.length`) kept in a verbose log preview; longer transcripts are cut
// at this length and suffixed with "...".
const VOICE_TRANSCRIPT_LOG_PREVIEW_CHARS = 500;
|
||||
const logger = createSubsystemLogger("discord/voice");
|
||||
|
||||
/**
 * Builds a bounded, single-line preview of a transcript for verbose logging.
 *
 * Every run of whitespace (including newlines) is collapsed to one space and
 * the result is trimmed. If the collapsed text is longer than
 * `VOICE_TRANSCRIPT_LOG_PREVIEW_CHARS` UTF-16 code units, it is cut at that
 * length and suffixed with "...".
 *
 * @param text - Raw transcript text; may contain newlines or repeated spaces.
 * @returns The one-line preview, at most
 *   `VOICE_TRANSCRIPT_LOG_PREVIEW_CHARS + 3` code units long.
 */
function formatVoiceTranscriptLogPreview(text: string): string {
  const oneLine = text.replace(/\s+/g, " ").trim();
  if (oneLine.length <= VOICE_TRANSCRIPT_LOG_PREVIEW_CHARS) {
    return oneLine;
  }

  let cut = VOICE_TRANSCRIPT_LOG_PREVIEW_CHARS;
  // `slice` counts UTF-16 code units, so the cut can land between the two
  // halves of a surrogate pair (e.g. an emoji). Back off by one unit in that
  // case so the log line never ends in a lone high surrogate.
  const lastKept = oneLine.charCodeAt(cut - 1);
  const firstDropped = oneLine.charCodeAt(cut);
  const splitsSurrogatePair =
    lastKept >= 0xd800 && lastKept <= 0xdbff && firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
  if (splitsSurrogatePair) {
    cut -= 1;
  }

  return `${oneLine.slice(0, cut)}...`;
}
|
||||
|
||||
export async function processDiscordVoiceSegment(params: {
|
||||
entry: VoiceSessionEntry;
|
||||
wavPath: string;
|
||||
@@ -82,6 +91,9 @@ export async function processDiscordVoiceSegment(params: {
|
||||
logVoiceVerbose(
|
||||
`transcription ok (${transcript.length} chars): guild ${entry.guildId} channel ${entry.channelId}`,
|
||||
);
|
||||
logVoiceVerbose(
|
||||
`transcript from ${speaker.label} (${userId}) in guild ${entry.guildId} channel ${entry.channelId}: ${formatVoiceTranscriptLogPreview(transcript)}`,
|
||||
);
|
||||
|
||||
const prompt = formatVoiceIngressPrompt(transcript, speaker.label);
|
||||
const extraSystemPrompt = buildDiscordGroupSystemPrompt(access.channelConfig);
|
||||
|
||||
Reference in New Issue
Block a user