fix: restore Discord voice replies

This commit is contained in:
Peter Steinberger
2026-05-01 11:04:18 +01:00
parent 9a051d2f9b
commit f9bb6e3515
13 changed files with 103 additions and 21 deletions

View File

@@ -18,6 +18,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Discord/voice: run voice-channel turns under a voice-output policy that hides the agent `tts` tool and asks for spoken reply text, so `/vc join` sessions synthesize and play agent replies instead of ending with `NO_REPLY`. Fixes #61536. Thanks @aounakram.
- Plugins/runtime-deps: prune legacy version-scoped plugin runtime-deps roots during bundled dependency repair and cover the path in Package Acceptance's upgrade-survivor matrix, so upgrades from 2026.4.x no longer leave stale per-plugin runtime trees after doctor runs. Thanks @vincentkoc.
- Plugins/runtime-deps: keep Gateway startup plugin imports and runtime plugin fallback loads verify-only after startup/config repair planning, so packaged installs no longer spawn package-manager repair from hot paths after readiness. Refs #75283 and #75069. Thanks @brokemac79 and @xiaohuaxi.
- Voice Call/realtime: add default-off fast memory/session context for `openclaw_agent_consult`, giving live calls a bounded answer-or-miss path before the full agent consult. Fixes #71849. Thanks @amzzzzzzz.

View File

@@ -1075,7 +1075,7 @@ Voice channel pipeline:
- Discord PCM capture is converted to a WAV temp file.
- `tools.media.audio` handles STT, for example `openai/gpt-4o-mini-transcribe`.
- The transcript is sent through normal Discord ingress and routing.
- The transcript is sent through Discord ingress and routing, while the response LLM runs under a voice-output policy that hides the agent `tts` tool and asks the model to return its reply as text, because Discord voice owns the final TTS playback.
- `voice.model`, when set, overrides only the response LLM for this voice-channel turn.
- `voice.tts` is merged over `messages.tts`; the resulting audio is played in the joined channel.

View File

@@ -539,6 +539,41 @@ describe("DiscordVoiceManager", () => {
expect(commandArgs?.model).toBe("openai/gpt-5.4-mini");
});
// Checks the Discord voice output policy for one voice segment: the agent run
// keeps the "discord" channel but is tagged with the "discord-voice" provider,
// the prompt carries the no-tts instruction, and the returned text is passed
// to the text-to-speech mock.
it("runs voice replies under Discord voice output policy", async () => {
// The agent returns plain reply text for this single invocation.
agentCommandMock.mockResolvedValueOnce({
payloads: [{ text: "hello back" }],
} as never);
const client = createClient();
// Member lookup resolves to a guest speaker profile.
client.fetchMember.mockResolvedValue({
nickname: "Guest Nick",
user: {
id: "u-guest",
username: "guest",
globalName: "Guest",
discriminator: "4321",
},
});
const manager = createManager({ groupPolicy: "open" }, client, {
commands: { useAccessGroups: false },
});
await processVoiceSegment(manager, "u-guest");
// Inspect the first argument of the most recent agent command call.
const commandArgs = agentCommandMock.mock.calls.at(-1)?.[0] as
| { message?: string; messageChannel?: string; messageProvider?: string }
| undefined;
// The channel stays "discord" while the provider is "discord-voice".
expect(commandArgs?.messageChannel).toBe("discord");
expect(commandArgs?.messageProvider).toBe("discord-voice");
expect(commandArgs?.message).toContain("Do not call the tts tool");
// The agent's reply text is what gets handed to text-to-speech.
expect(textToSpeechMock).toHaveBeenCalledWith(
expect.objectContaining({
channel: "discord",
text: "hello back",
}),
);
});
it("reuses speaker context cache for repeated segments from the same speaker", async () => {
const client = createClient();
client.fetchMember.mockResolvedValue({

View File

@@ -1,14 +1,16 @@
import { describe, expect, it } from "vitest";
import { formatVoiceIngressPrompt } from "./prompt.js";
import { DISCORD_VOICE_SPOKEN_OUTPUT_CONTRACT, formatVoiceIngressPrompt } from "./prompt.js";
// Pins the prompt layout: the spoken-output contract always comes first,
// separated from the voice input by a blank line.
describe("formatVoiceIngressPrompt", () => {
  // Labeled input keeps its speaker header underneath the contract.
  it("formats speaker-labeled voice input with the spoken-output contract", () => {
    expect(formatVoiceIngressPrompt("hello there", "speaker-1")).toBe(
      `${DISCORD_VOICE_SPOKEN_OUTPUT_CONTRACT}\n\nVoice transcript from speaker "speaker-1":\nhello there`,
    );
  });

  // Without a speaker label the bare transcript still follows the contract.
  it("keeps unlabeled transcripts under the spoken-output contract", () => {
    expect(formatVoiceIngressPrompt("hello there")).toBe(
      `${DISCORD_VOICE_SPOKEN_OUTPUT_CONTRACT}\n\nhello there`,
    );
  });
});

View File

@@ -1,8 +1,17 @@
/**
 * Instruction block placed at the top of every Discord voice prompt.
 *
 * It tells the model to return only the text to be spoken, not to call the
 * `tts` tool, and to avoid `NO_REPLY` unless no spoken response is appropriate.
 */
export const DISCORD_VOICE_SPOKEN_OUTPUT_CONTRACT = [
  "Discord voice reply requirements:",
  "- Return only the concise text that should be spoken aloud in the voice channel.",
  "- Do not call the tts tool; Discord voice will synthesize and play the returned text.",
  "- Do not reply with NO_REPLY unless no spoken response is appropriate.",
  "- Keep the response brief and conversational.",
].join("\n");

/**
 * Builds the agent prompt for a transcribed voice segment.
 *
 * @param transcript - Transcribed speech; surrounding whitespace is trimmed.
 * @param speakerLabel - Optional speaker name. When it is missing or blank,
 *   the transcript is used without a speaker header.
 * @returns The spoken-output contract, a blank line, then the voice input.
 */
export function formatVoiceIngressPrompt(transcript: string, speakerLabel?: string): string {
  const cleanedTranscript = transcript.trim();
  const cleanedLabel = speakerLabel?.trim();
  // No early return for unlabeled input: the contract must be prepended on
  // every path, otherwise the model never sees the spoken-reply instructions.
  const voiceInput = cleanedLabel
    ? [`Voice transcript from speaker "${cleanedLabel}":`, cleanedTranscript].join("\n")
    : cleanedTranscript;
  return [DISCORD_VOICE_SPOKEN_OUTPUT_CONTRACT, voiceInput].join("\n\n");
}

View File

@@ -18,6 +18,7 @@ import {
import type { DiscordVoiceSpeakerContextResolver } from "./speaker-context.js";
import { synthesizeVoiceReplyAudio, transcribeVoiceAudio } from "./tts.js";
const DISCORD_VOICE_MESSAGE_PROVIDER = "discord-voice";
const logger = createSubsystemLogger("discord/voice");
export async function processDiscordVoiceSegment(params: {
@@ -89,6 +90,7 @@ export async function processDiscordVoiceSegment(params: {
sessionKey: entry.route.sessionKey,
agentId: entry.route.agentId,
messageChannel: "discord",
messageProvider: DISCORD_VOICE_MESSAGE_PROVIDER,
senderIsOwner: speaker.senderIsOwner,
allowModelOverride: Boolean(modelOverride),
model: modelOverride,

View File

@@ -424,7 +424,7 @@ describe("CLI attempt execution", () => {
});
});
it("forwards user trigger and channel context to CLI runs", async () => {
it("forwards separate user trigger, channel, and provider context to CLI runs", async () => {
const sessionKey = "agent:main:direct:claude-channel-context";
const sessionEntry: SessionEntry = {
sessionId: "openclaw-session-channel",
@@ -450,10 +450,13 @@ describe("CLI attempt execution", () => {
resolvedThinkLevel: "medium",
timeoutMs: 1_000,
runId: "run-cli-channel-context",
opts: { senderIsOwner: false } as Parameters<typeof runAgentAttempt>[0]["opts"],
opts: {
senderIsOwner: false,
messageProvider: "discord-voice",
} as Parameters<typeof runAgentAttempt>[0]["opts"],
runContext: {} as Parameters<typeof runAgentAttempt>[0]["runContext"],
spawnedBy: undefined,
messageChannel: "telegram",
messageChannel: "discord",
skillsSnapshot: undefined,
resolvedVerboseLevel: undefined,
agentDir: tmpDir,
@@ -468,8 +471,8 @@ describe("CLI attempt execution", () => {
expect(runCliAgentMock).toHaveBeenCalledWith(
expect.objectContaining({
trigger: "user",
messageChannel: "telegram",
messageProvider: "telegram",
messageChannel: "discord",
messageProvider: "discord-voice",
}),
);
});
@@ -567,6 +570,7 @@ describe("CLI attempt execution", () => {
senderIsOwner: false,
modelRun: true,
promptMode: "none",
messageProvider: "discord-voice",
inputProvenance: {
kind: "inter_session",
sourceSessionKey: "agent:main:discord:source",
@@ -575,7 +579,7 @@ describe("CLI attempt execution", () => {
} as Parameters<typeof runAgentAttempt>[0]["opts"],
runContext: {} as Parameters<typeof runAgentAttempt>[0]["runContext"],
spawnedBy: undefined,
messageChannel: "telegram",
messageChannel: "discord",
skillsSnapshot: undefined,
resolvedVerboseLevel: undefined,
agentDir: tmpDir,
@@ -593,6 +597,8 @@ describe("CLI attempt execution", () => {
model: "claude-opus-4-7",
agentHarnessId: "pi",
prompt: "raw prompt",
messageChannel: "discord",
messageProvider: "discord-voice",
modelRun: true,
promptMode: "none",
disableTools: true,

View File

@@ -481,7 +481,7 @@ export function runAgentAttempt(params: {
skillsSnapshot: params.skillsSnapshot,
messageChannel: params.messageChannel,
streamParams: params.opts.streamParams,
messageProvider: params.messageChannel,
messageProvider: params.opts.messageProvider ?? params.messageChannel,
agentAccountId: params.runContext.accountId,
senderIsOwner: params.opts.senderIsOwner,
cleanupBundleMcpOnRunEnd: params.opts.cleanupBundleMcpOnRunEnd,
@@ -550,6 +550,7 @@ export function runAgentAttempt(params: {
agentId: params.sessionAgentId,
trigger: "user",
messageChannel: params.messageChannel,
messageProvider: params.opts.messageProvider ?? params.messageChannel,
agentAccountId: params.runContext.accountId,
messageTo: params.opts.replyTo ?? params.opts.to,
messageThreadId: params.opts.threadId,

View File

@@ -71,6 +71,8 @@ export type AgentCommandOpts = {
threadId?: string | number;
/** Message channel context. */
messageChannel?: string;
/** Tool-policy/output surface context. Defaults to messageChannel. */
messageProvider?: string;
/** Delivery channel. */
channel?: string;
/** Account ID for multi-account channel routing. */

View File

@@ -22,6 +22,7 @@ import {
resolveEmbeddedAgentStreamFn,
resolveUnknownToolGuardThreshold,
shouldCreateBundleMcpRuntimeForAttempt,
resolveAttemptToolPolicyMessageProvider,
resolvePromptBuildHookResult,
resolvePromptModeForSession,
shouldStripBootstrapFromEmbeddedContext,
@@ -195,6 +196,21 @@ describe("shouldCreateBundleMcpRuntimeForAttempt", () => {
});
});
// Covers provider resolution: an explicit provider wins, otherwise the
// message channel is used.
describe("resolveAttemptToolPolicyMessageProvider", () => {
  it("prefers explicit tool-policy provider over transport channel", () => {
    const resolved = resolveAttemptToolPolicyMessageProvider({
      messageChannel: "discord",
      messageProvider: "discord-voice",
    });
    expect(resolved).toBe("discord-voice");
  });

  it("falls back to message channel when provider is omitted", () => {
    const resolved = resolveAttemptToolPolicyMessageProvider({ messageChannel: "discord" });
    expect(resolved).toBe("discord");
  });
});
describe("resolvePromptBuildHookResult", () => {
function createLegacyOnlyHookRunner() {
return {

View File

@@ -566,6 +566,13 @@ export function shouldCreateBundleMcpRuntimeForAttempt(params: {
);
}
/**
 * Picks the provider value for an attempt from its params.
 *
 * @param params - Attempt fields; only `messageProvider` and `messageChannel` are read.
 * @returns `messageProvider` unless it is null or undefined, in which case
 *   `messageChannel` is returned (which may itself be undefined).
 */
export function resolveAttemptToolPolicyMessageProvider(params: {
  messageProvider?: string;
  messageChannel?: string;
}): string | undefined {
  const { messageProvider, messageChannel } = params;
  // Only a missing provider falls back; an empty string is still returned as-is.
  return messageProvider == null ? messageChannel : messageProvider;
}
function collectAttemptExplicitToolAllowlistSources(params: {
config?: EmbeddedRunAttemptParams["config"];
sessionKey?: string;
@@ -784,7 +791,7 @@ export async function runEmbeddedAttempt(
elevated: params.bashElevated,
},
sandbox,
messageProvider: params.messageChannel ?? params.messageProvider,
messageProvider: resolveAttemptToolPolicyMessageProvider(params),
agentAccountId: params.agentAccountId,
messageTo: params.messageTo,
messageThreadId: params.messageThreadId,
@@ -1003,7 +1010,7 @@ export async function runEmbeddedAttempt(
agentId: sessionAgentId,
modelProvider: params.provider,
modelId: params.modelId,
messageProvider: params.messageChannel ?? params.messageProvider,
messageProvider: resolveAttemptToolPolicyMessageProvider(params),
agentAccountId: params.agentAccountId,
groupId: params.groupId,
groupChannel: params.groupChannel,
@@ -1030,7 +1037,7 @@ export async function runEmbeddedAttempt(
agentId: sessionAgentId,
modelProvider: params.provider,
modelId: params.modelId,
messageProvider: params.messageChannel ?? params.messageProvider,
messageProvider: resolveAttemptToolPolicyMessageProvider(params),
agentAccountId: params.agentAccountId,
groupId: params.groupId,
groupChannel: params.groupChannel,

View File

@@ -4,7 +4,7 @@ import { filterToolNamesByMessageProvider } from "./pi-tools.message-provider-po
const DEFAULT_TOOL_NAMES = ["read", "write", "tts", "web_search"];
describe("createOpenClawCodingTools message provider policy", () => {
it.each(["voice", "VOICE", " Voice "])(
it.each(["voice", "VOICE", " Voice ", "discord-voice", "DISCORD-VOICE", " Discord-Voice "])(
"does not expose tts tool for normalized voice provider: %s",
(messageProvider) => {
const names = new Set(filterToolNamesByMessageProvider(DEFAULT_TOOL_NAMES, messageProvider));

View File

@@ -1,6 +1,7 @@
import { normalizeOptionalLowercaseString } from "../shared/string-coerce.js";
// Tool names denied for each message provider. Both voice providers listed
// here deny the agent `tts` tool.
// NOTE(review): keys are lowercase; presumably lookups normalize the provider
// string before indexing this table — confirm against the caller.
const TOOL_DENY_BY_MESSAGE_PROVIDER: Readonly<Record<string, readonly string[]>> = {
"discord-voice": ["tts"],
voice: ["tts"],
};