diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d1175850e4..338bb5d6162 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Discord/voice: run voice-channel turns under a voice-output policy that hides the agent `tts` tool and asks for spoken reply text, so `/vc join` sessions synthesize and play agent replies instead of ending with `NO_REPLY`. Fixes #61536. Thanks @aounakram. - Plugins/runtime-deps: prune legacy version-scoped plugin runtime-deps roots during bundled dependency repair and cover the path in Package Acceptance's upgrade-survivor matrix, so upgrades from 2026.4.x no longer leave stale per-plugin runtime trees after doctor runs. Thanks @vincentkoc. - Plugins/runtime-deps: keep Gateway startup plugin imports and runtime plugin fallback loads verify-only after startup/config repair planning, so packaged installs no longer spawn package-manager repair from hot paths after readiness. Refs #75283 and #75069. Thanks @brokemac79 and @xiaohuaxi. - Voice Call/realtime: add default-off fast memory/session context for `openclaw_agent_consult`, giving live calls a bounded answer-or-miss path before the full agent consult. Fixes #71849. Thanks @amzzzzzzz. diff --git a/docs/channels/discord.md b/docs/channels/discord.md index a3b3daa0ef9..e0883caa895 100644 --- a/docs/channels/discord.md +++ b/docs/channels/discord.md @@ -1075,7 +1075,7 @@ Voice channel pipeline: - Discord PCM capture is converted to a WAV temp file. - `tools.media.audio` handles STT, for example `openai/gpt-4o-mini-transcribe`. -- The transcript is sent through normal Discord ingress and routing. +- The transcript is sent through Discord ingress and routing while the response LLM runs with a voice-output policy that hides the agent `tts` tool and asks for returned text, because Discord voice owns final TTS playback. - `voice.model`, when set, overrides only the response LLM for this voice-channel turn. - `voice.tts` is merged over `messages.tts`; the resulting audio is played in the joined channel. diff --git a/extensions/discord/src/voice/manager.e2e.test.ts b/extensions/discord/src/voice/manager.e2e.test.ts index 02d68d33fe8..2aeeda93c69 100644 --- a/extensions/discord/src/voice/manager.e2e.test.ts +++ b/extensions/discord/src/voice/manager.e2e.test.ts @@ -539,6 +539,41 @@ describe("DiscordVoiceManager", () => { expect(commandArgs?.model).toBe("openai/gpt-5.4-mini"); }); + it("runs voice replies under Discord voice output policy", async () => { + agentCommandMock.mockResolvedValueOnce({ + payloads: [{ text: "hello back" }], + } as never); + + const client = createClient(); + client.fetchMember.mockResolvedValue({ + nickname: "Guest Nick", + user: { + id: "u-guest", + username: "guest", + globalName: "Guest", + discriminator: "4321", + }, + }); + const manager = createManager({ groupPolicy: "open" }, client, { + commands: { useAccessGroups: false }, + }); + await processVoiceSegment(manager, "u-guest"); + + const commandArgs = agentCommandMock.mock.calls.at(-1)?.[0] as + | { message?: string; messageChannel?: string; messageProvider?: string } + | undefined; + + expect(commandArgs?.messageChannel).toBe("discord"); + expect(commandArgs?.messageProvider).toBe("discord-voice"); + expect(commandArgs?.message).toContain("Do not call the tts tool"); + expect(textToSpeechMock).toHaveBeenCalledWith( + expect.objectContaining({ + channel: "discord", + text: "hello back", + }), + ); + }); + it("reuses speaker context cache for repeated segments from the same speaker", async () => { const client = createClient(); client.fetchMember.mockResolvedValue({ diff --git a/extensions/discord/src/voice/prompt.test.ts b/extensions/discord/src/voice/prompt.test.ts index babfedfae5f..f747f42ac6c 100644 --- a/extensions/discord/src/voice/prompt.test.ts +++ b/extensions/discord/src/voice/prompt.test.ts @@ -1,14 +1,16 @@ import { describe, expect, it } from "vitest"; -import { formatVoiceIngressPrompt } from "./prompt.js"; +import { DISCORD_VOICE_SPOKEN_OUTPUT_CONTRACT, formatVoiceIngressPrompt } from "./prompt.js"; describe("formatVoiceIngressPrompt", () => { - it("formats speaker-labeled voice input without imperative-looking prefixes", () => { + it("formats speaker-labeled voice input with the spoken-output contract", () => { expect(formatVoiceIngressPrompt("hello there", "speaker-1")).toBe( - 'Voice transcript from speaker "speaker-1":\nhello there', + `${DISCORD_VOICE_SPOKEN_OUTPUT_CONTRACT}\n\nVoice transcript from speaker "speaker-1":\nhello there`, ); }); - it("returns the bare transcript when no speaker label exists", () => { - expect(formatVoiceIngressPrompt("hello there")).toBe("hello there"); + it("keeps unlabeled transcripts under the spoken-output contract", () => { + expect(formatVoiceIngressPrompt("hello there")).toBe( + `${DISCORD_VOICE_SPOKEN_OUTPUT_CONTRACT}\n\nhello there`, + ); }); }); diff --git a/extensions/discord/src/voice/prompt.ts b/extensions/discord/src/voice/prompt.ts index af12e814c68..bc49e896646 100644 --- a/extensions/discord/src/voice/prompt.ts +++ b/extensions/discord/src/voice/prompt.ts @@ -1,8 +1,17 @@ +export const DISCORD_VOICE_SPOKEN_OUTPUT_CONTRACT = [ + "Discord voice reply requirements:", + "- Return only the concise text that should be spoken aloud in the voice channel.", + "- Do not call the tts tool; Discord voice will synthesize and play the returned text.", + "- Do not reply with NO_REPLY unless no spoken response is appropriate.", + "- Keep the response brief and conversational.", +].join("\n"); + export function formatVoiceIngressPrompt(transcript: string, speakerLabel?: string): string { const cleanedTranscript = transcript.trim(); const cleanedLabel = speakerLabel?.trim(); - if (!cleanedLabel) { - return cleanedTranscript; - } - return [`Voice transcript from speaker "${cleanedLabel}":`, cleanedTranscript].join("\n"); + const voiceInput = cleanedLabel + ? [`Voice transcript from speaker "${cleanedLabel}":`, cleanedTranscript].join("\n") + : cleanedTranscript; + + return [DISCORD_VOICE_SPOKEN_OUTPUT_CONTRACT, voiceInput].join("\n\n"); } diff --git a/extensions/discord/src/voice/segment.ts b/extensions/discord/src/voice/segment.ts index 5f32d22e55a..718db850690 100644 --- a/extensions/discord/src/voice/segment.ts +++ b/extensions/discord/src/voice/segment.ts @@ -18,6 +18,7 @@ import { import type { DiscordVoiceSpeakerContextResolver } from "./speaker-context.js"; import { synthesizeVoiceReplyAudio, transcribeVoiceAudio } from "./tts.js"; +const DISCORD_VOICE_MESSAGE_PROVIDER = "discord-voice"; const logger = createSubsystemLogger("discord/voice"); export async function processDiscordVoiceSegment(params: { @@ -89,6 +90,7 @@ export async function processDiscordVoiceSegment(params: { sessionKey: entry.route.sessionKey, agentId: entry.route.agentId, messageChannel: "discord", + messageProvider: DISCORD_VOICE_MESSAGE_PROVIDER, senderIsOwner: speaker.senderIsOwner, allowModelOverride: Boolean(modelOverride), model: modelOverride, diff --git a/src/agents/command/attempt-execution.cli.test.ts b/src/agents/command/attempt-execution.cli.test.ts index bc173c24f60..27e4f6d55b3 100644 --- a/src/agents/command/attempt-execution.cli.test.ts +++ b/src/agents/command/attempt-execution.cli.test.ts @@ -424,7 +424,7 @@ describe("CLI attempt execution", () => { }); }); - it("forwards user trigger and channel context to CLI runs", async () => { + it("forwards separate user trigger, channel, and provider context to CLI runs", async () => { const sessionKey = "agent:main:direct:claude-channel-context"; const sessionEntry: SessionEntry = { sessionId: "openclaw-session-channel", @@ -450,10 +450,13 @@ describe("CLI attempt execution", () => { resolvedThinkLevel: "medium", timeoutMs: 1_000, runId: "run-cli-channel-context", - opts: { senderIsOwner: false } as Parameters[0]["opts"], + opts: { + senderIsOwner: false, + messageProvider: "discord-voice", + } as Parameters[0]["opts"], runContext: {} as Parameters[0]["runContext"], spawnedBy: undefined, - messageChannel: "telegram", + messageChannel: "discord", skillsSnapshot: undefined, resolvedVerboseLevel: undefined, agentDir: tmpDir, @@ -468,8 +471,8 @@ describe("CLI attempt execution", () => { expect(runCliAgentMock).toHaveBeenCalledWith( expect.objectContaining({ trigger: "user", - messageChannel: "telegram", - messageProvider: "telegram", + messageChannel: "discord", + messageProvider: "discord-voice", }), ); }); @@ -567,6 +570,7 @@ describe("CLI attempt execution", () => { senderIsOwner: false, modelRun: true, promptMode: "none", + messageProvider: "discord-voice", inputProvenance: { kind: "inter_session", sourceSessionKey: "agent:main:discord:source", @@ -575,7 +579,7 @@ describe("CLI attempt execution", () => { } as Parameters[0]["opts"], runContext: {} as Parameters[0]["runContext"], spawnedBy: undefined, - messageChannel: "telegram", + messageChannel: "discord", skillsSnapshot: undefined, resolvedVerboseLevel: undefined, agentDir: tmpDir, @@ -593,6 +597,8 @@ describe("CLI attempt execution", () => { model: "claude-opus-4-7", agentHarnessId: "pi", prompt: "raw prompt", + messageChannel: "discord", + messageProvider: "discord-voice", modelRun: true, promptMode: "none", disableTools: true, diff --git a/src/agents/command/attempt-execution.ts b/src/agents/command/attempt-execution.ts index 71aee4a7bd7..7ad64588577 100644 --- a/src/agents/command/attempt-execution.ts +++ b/src/agents/command/attempt-execution.ts @@ -481,7 +481,7 @@ export function runAgentAttempt(params: { skillsSnapshot: params.skillsSnapshot, messageChannel: params.messageChannel, streamParams: params.opts.streamParams, - messageProvider: params.messageChannel, + messageProvider: params.opts.messageProvider ?? params.messageChannel, agentAccountId: params.runContext.accountId, senderIsOwner: params.opts.senderIsOwner, cleanupBundleMcpOnRunEnd: params.opts.cleanupBundleMcpOnRunEnd, @@ -550,6 +550,7 @@ export function runAgentAttempt(params: { agentId: params.sessionAgentId, trigger: "user", messageChannel: params.messageChannel, + messageProvider: params.opts.messageProvider ?? params.messageChannel, agentAccountId: params.runContext.accountId, messageTo: params.opts.replyTo ?? params.opts.to, messageThreadId: params.opts.threadId, diff --git a/src/agents/command/types.ts b/src/agents/command/types.ts index 2430951f7c3..7371a973bf7 100644 --- a/src/agents/command/types.ts +++ b/src/agents/command/types.ts @@ -71,6 +71,8 @@ export type AgentCommandOpts = { threadId?: string | number; /** Message channel context. */ messageChannel?: string; + /** Tool-policy/output surface context. Defaults to messageChannel. */ + messageProvider?: string; /** Delivery channel. */ channel?: string; /** Account ID for multi-account channel routing. */ diff --git a/src/agents/pi-embedded-runner/run/attempt.test.ts b/src/agents/pi-embedded-runner/run/attempt.test.ts index 31a88f23156..04ff01856e8 100644 --- a/src/agents/pi-embedded-runner/run/attempt.test.ts +++ b/src/agents/pi-embedded-runner/run/attempt.test.ts @@ -22,6 +22,7 @@ import { resolveEmbeddedAgentStreamFn, resolveUnknownToolGuardThreshold, shouldCreateBundleMcpRuntimeForAttempt, + resolveAttemptToolPolicyMessageProvider, resolvePromptBuildHookResult, resolvePromptModeForSession, shouldStripBootstrapFromEmbeddedContext, @@ -195,6 +196,21 @@ describe("shouldCreateBundleMcpRuntimeForAttempt", () => { }); }); +describe("resolveAttemptToolPolicyMessageProvider", () => { + it("prefers explicit tool-policy provider over transport channel", () => { + expect( + resolveAttemptToolPolicyMessageProvider({ + messageChannel: "discord", + messageProvider: "discord-voice", + }), + ).toBe("discord-voice"); + }); + + it("falls back to message channel when provider is omitted", () => { + expect(resolveAttemptToolPolicyMessageProvider({ messageChannel: "discord" })).toBe("discord"); + }); +}); + describe("resolvePromptBuildHookResult", () => { function createLegacyOnlyHookRunner() { return { diff --git a/src/agents/pi-embedded-runner/run/attempt.ts b/src/agents/pi-embedded-runner/run/attempt.ts index 36f482994a0..b25937eda23 100644 --- a/src/agents/pi-embedded-runner/run/attempt.ts +++ b/src/agents/pi-embedded-runner/run/attempt.ts @@ -566,6 +566,13 @@ export function shouldCreateBundleMcpRuntimeForAttempt(params: { ); } +export function resolveAttemptToolPolicyMessageProvider(params: { + messageProvider?: string; + messageChannel?: string; +}): string | undefined { + return params.messageProvider ?? params.messageChannel; +} + function collectAttemptExplicitToolAllowlistSources(params: { config?: EmbeddedRunAttemptParams["config"]; sessionKey?: string; @@ -784,7 +791,7 @@ export async function runEmbeddedAttempt( elevated: params.bashElevated, }, sandbox, - messageProvider: params.messageChannel ?? params.messageProvider, + messageProvider: resolveAttemptToolPolicyMessageProvider(params), agentAccountId: params.agentAccountId, messageTo: params.messageTo, messageThreadId: params.messageThreadId, @@ -1003,7 +1010,7 @@ export async function runEmbeddedAttempt( agentId: sessionAgentId, modelProvider: params.provider, modelId: params.modelId, - messageProvider: params.messageChannel ?? params.messageProvider, + messageProvider: resolveAttemptToolPolicyMessageProvider(params), agentAccountId: params.agentAccountId, groupId: params.groupId, groupChannel: params.groupChannel, @@ -1030,7 +1037,7 @@ export async function runEmbeddedAttempt( agentId: sessionAgentId, modelProvider: params.provider, modelId: params.modelId, - messageProvider: params.messageChannel ?? params.messageProvider, + messageProvider: resolveAttemptToolPolicyMessageProvider(params), agentAccountId: params.agentAccountId, groupId: params.groupId, groupChannel: params.groupChannel, diff --git a/src/agents/pi-tools.message-provider-policy.test.ts b/src/agents/pi-tools.message-provider-policy.test.ts index 9b921498c78..f033eddf9d2 100644 --- a/src/agents/pi-tools.message-provider-policy.test.ts +++ b/src/agents/pi-tools.message-provider-policy.test.ts @@ -4,7 +4,7 @@ import { filterToolNamesByMessageProvider } from "./pi-tools.message-provider-po const DEFAULT_TOOL_NAMES = ["read", "write", "tts", "web_search"]; describe("createOpenClawCodingTools message provider policy", () => { - it.each(["voice", "VOICE", " Voice "])( + it.each(["voice", "VOICE", " Voice ", "discord-voice", "DISCORD-VOICE", " Discord-Voice "])( "does not expose tts tool for normalized voice provider: %s", (messageProvider) => { const names = new Set(filterToolNamesByMessageProvider(DEFAULT_TOOL_NAMES, messageProvider)); diff --git a/src/agents/pi-tools.message-provider-policy.ts b/src/agents/pi-tools.message-provider-policy.ts index 62dd1de9195..9c565a09053 100644 --- a/src/agents/pi-tools.message-provider-policy.ts +++ b/src/agents/pi-tools.message-provider-policy.ts @@ -1,6 +1,7 @@ import { normalizeOptionalLowercaseString } from "../shared/string-coerce.js"; const TOOL_DENY_BY_MESSAGE_PROVIDER: Readonly> = { + "discord-voice": ["tts"], voice: ["tts"], };