mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-11 00:42:53 +00:00
fix(discord): stabilize realtime wake-name feedback
This commit is contained in:
@@ -1234,7 +1234,7 @@ Notes:
|
||||
- In `stt-tts` mode, STT uses `tools.media.audio`; `voice.model` does not affect transcription.
|
||||
- In realtime modes, `voice.realtime.provider`, `voice.realtime.model`, and `voice.realtime.voice` configure the realtime audio session. For OpenAI Realtime 2 plus the Codex brain, use `voice.realtime.model: "gpt-realtime-2"` and `voice.model: "openai-codex/gpt-5.5"`.
|
||||
- Realtime voice modes include small `IDENTITY.md`, `USER.md`, and `SOUL.md` profile files in the realtime provider instructions by default so fast direct turns keep the same identity, user grounding, and persona as the routed OpenClaw agent. Set `voice.realtime.bootstrapContextFiles` to a subset to customize this, or `[]` to disable it. The supported realtime bootstrap files are limited to those profile files; `AGENTS.md` stays in the normal agent context. The injected profile context does not replace `openclaw_agent_consult` for workspace work, current facts, memory lookup, or tool-backed actions.
|
||||
- In OpenAI `agent-proxy` realtime mode, set `voice.realtime.requireWakeName: true` to keep Discord realtime voice silent until a transcript starts or ends with a wake name. Configured wake names must be one or two words. If `voice.realtime.wakeNames` is unset, OpenClaw uses the routed agent `name` plus `OpenClaw`, falling back to the agent id plus `OpenClaw`. Wake-name gating disables realtime provider auto-response and routes accepted turns through the OpenClaw agent consult path.
|
||||
- In OpenAI `agent-proxy` realtime mode, set `voice.realtime.requireWakeName: true` to keep Discord realtime voice silent until a transcript starts or ends with a wake name. Configured wake names must be one or two words. If `voice.realtime.wakeNames` is unset, OpenClaw uses the routed agent `name` plus `OpenClaw`, falling back to the agent id plus `OpenClaw`. Wake-name gating disables realtime provider auto-response, routes accepted turns through the OpenClaw agent consult path, and gives a short spoken acknowledgement when a leading wake name is recognized from partial transcription before the final transcript arrives.
|
||||
- The OpenAI realtime provider accepts current Realtime 2 event names and legacy Codex-compatible aliases for output audio and transcript events, so compatible provider snapshots can drift without dropping assistant audio.
|
||||
- `voice.realtime.bargeIn` controls whether Discord speaker-start events interrupt active realtime playback. If unset, it follows the realtime provider's input-audio interruption setting.
|
||||
- `voice.realtime.minBargeInAudioEndMs` controls the minimum assistant playback duration before an OpenAI realtime barge-in truncates audio. Default: `250`. Set `0` for immediate interruption in low-echo rooms, or raise it for echo-heavy speaker setups.
|
||||
|
||||
@@ -2241,6 +2241,7 @@ describe("DiscordVoiceManager", () => {
|
||||
| undefined;
|
||||
expect(bridgeParams?.autoRespondToAudio).toBe(false);
|
||||
expect(bridgeParams?.instructions).toContain("same OpenClaw agent");
|
||||
expect(bridgeParams?.instructions).toContain("short natural backchannel");
|
||||
expect(bridgeParams?.tools?.map((tool) => tool.name)).toContain("openclaw_agent_consult");
|
||||
expect(bridgeParams?.tools?.map((tool) => tool.name)).toContain("openclaw_agent_control");
|
||||
const player = getLastAudioPlayer();
|
||||
@@ -2915,6 +2916,65 @@ describe("DiscordVoiceManager", () => {
|
||||
expect(lastAgentCommandArgs().message).not.toContain("Hey");
|
||||
});
|
||||
|
||||
it("acknowledges leading wake names from partial realtime transcripts", async () => {
  // Minimal structural views of the session internals this test drives.
  type SpeakerContext = {
    extraSystemPrompt?: string;
    senderIsOwner: boolean;
    speakerLabel: string;
  };
  type SpeakerTurn = { close: () => void; sendInputAudio: (audio: Buffer) => void };
  type SessionEntryView = {
    realtime?: {
      beginSpeakerTurn: (context: SpeakerContext, userId: string) => SpeakerTurn;
    };
  };
  type BridgeHooks = {
    onEvent?: (event: { direction: "server"; type: string }) => void;
    onTranscript?: (role: "user" | "assistant", text: string, isFinal: boolean) => void;
  };

  agentCommandMock.mockResolvedValueOnce({ payloads: [{ text: "wake answer" }] });
  const voiceManager = createManager(
    {
      groupPolicy: "open",
      voice: {
        enabled: true,
        mode: "agent-proxy",
        realtime: { provider: "openai", consultPolicy: "auto", requireWakeName: true },
      },
    },
    undefined,
    {
      agents: {
        list: [{ id: "agent-1", identity: { name: "Molty" } }],
      },
    },
  );

  await voiceManager.join({ guildId: "g1", channelId: "1001" });
  const sessionEntry = getSessionEntry(voiceManager) as SessionEntryView;
  const bridge = lastRealtimeBridgeParams() as BridgeHooks | undefined;

  // Phase 1: a partial transcript that starts with the wake name arrives
  // before the final transcript.
  const turn = sessionEntry.realtime?.beginSpeakerTurn(
    { extraSystemPrompt: undefined, senderIsOwner: true, speakerLabel: "Owner" },
    "u-owner",
  );
  turn?.sendInputAudio(Buffer.alloc(8));
  bridge?.onEvent?.({ direction: "server", type: "input_audio_buffer.speech_started" });
  bridge?.onTranscript?.("user", "Hey, Molty", false);

  // The spoken acknowledgement is queued, but no agent work has started yet.
  expectUserMessageIncludes('Answer: "Yeah."');
  expect(controlRealtimeVoiceAgentRunMock).not.toHaveBeenCalled();
  expect(agentCommandMock).not.toHaveBeenCalled();

  // Phase 2: the final transcript lands and is routed with the wake name stripped.
  bridge?.onEvent?.({ direction: "server", type: "response.done" });
  bridge?.onTranscript?.("user", "Hey, Molty, how is it going", true);
  await new Promise((resolve) => setTimeout(resolve, 260));

  expect(controlRealtimeVoiceAgentRunMock).toHaveBeenCalledWith({
    sessionKey: "discord:g1:c1",
    text: "how is it going",
  });
  expect(lastAgentCommandArgs().message).toContain("how is it going");
  expectUserMessageIncludes("wake answer");
});
|
||||
|
||||
it("reuses recently ignored speaker context when wake-name consult has no pending turn", async () => {
|
||||
agentCommandMock.mockResolvedValueOnce({ payloads: [{ text: "wake answer" }] });
|
||||
const manager = createManager(
|
||||
@@ -3185,16 +3245,16 @@ describe("DiscordVoiceManager", () => {
|
||||
expect(agentCommandArgsAt(6).message).toContain("can you hear me too?");
|
||||
expect(agentCommandArgsAt(6).message).not.toContain("Open Cloud");
|
||||
|
||||
const trailingMultiTurn = entry.realtime?.beginSpeakerTurn(
|
||||
const trailingMoltyTurn = entry.realtime?.beginSpeakerTurn(
|
||||
{ extraSystemPrompt: undefined, senderIsOwner: true, speakerLabel: "Owner" },
|
||||
"u-owner",
|
||||
);
|
||||
trailingMultiTurn?.sendInputAudio(Buffer.alloc(8));
|
||||
bridgeParams?.onTranscript?.("user", "Can you still hear trailing, Multi.", true);
|
||||
trailingMoltyTurn?.sendInputAudio(Buffer.alloc(8));
|
||||
bridgeParams?.onTranscript?.("user", "Can you still hear trailing, Molty.", true);
|
||||
await new Promise((resolve) => setTimeout(resolve, 260));
|
||||
|
||||
expect(agentCommandArgsAt(7).message).toContain("Can you still hear trailing");
|
||||
expect(agentCommandArgsAt(7).message).not.toContain("Multi");
|
||||
expect(agentCommandArgsAt(7).message).not.toContain("Molty");
|
||||
|
||||
const openChatTurn = entry.realtime?.beginSpeakerTurn(
|
||||
{ extraSystemPrompt: undefined, senderIsOwner: true, speakerLabel: "Owner" },
|
||||
@@ -3264,6 +3324,14 @@ describe("DiscordVoiceManager", () => {
|
||||
bridgeParams?.onTranscript?.("user", "Open law is not the wake phrase.", true);
|
||||
await new Promise((resolve) => setTimeout(resolve, 260));
|
||||
|
||||
const fuzzyTrailingTurn = entry.realtime?.beginSpeakerTurn(
|
||||
{ extraSystemPrompt: undefined, senderIsOwner: true, speakerLabel: "Owner" },
|
||||
"u-owner",
|
||||
);
|
||||
fuzzyTrailingTurn?.sendInputAudio(Buffer.alloc(8));
|
||||
bridgeParams?.onTranscript?.("user", "I miss the nonsensical German ranting from Multy.", true);
|
||||
await new Promise((resolve) => setTimeout(resolve, 260));
|
||||
|
||||
expect(agentCommandMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
@@ -4400,6 +4468,7 @@ describe("DiscordVoiceManager", () => {
|
||||
| undefined;
|
||||
expect(bridgeParams?.instructions).toContain("OpenClaw realtime voice profile context");
|
||||
expect(bridgeParams?.instructions).toContain("Name: Wilfred");
|
||||
expect(bridgeParams?.instructions).toContain("short natural backchannel");
|
||||
expect(bridgeParams?.instructions).toContain("Call openclaw_agent_consult");
|
||||
});
|
||||
|
||||
|
||||
@@ -71,6 +71,8 @@ const DISCORD_REALTIME_FORCED_CONSULT_FALLBACK_DELAY_MS = 200;
|
||||
const DISCORD_REALTIME_DUPLICATE_ERROR_SUPPRESS_MS = 60_000;
|
||||
const DISCORD_REALTIME_CONTROL_SPEECH_DEDUPE_MS = 5_000;
|
||||
const DISCORD_REALTIME_OUTPUT_PLAYBACK_WATCHDOG_MARGIN_MS = 1_500;
|
||||
const DISCORD_REALTIME_WAKE_ACKS = ["Yeah.", "Mm-hmm.", "Got it.", "One sec."];
|
||||
const DISCORD_REALTIME_PARTIAL_TRANSCRIPT_MAX_CHARS = 240;
|
||||
const REALTIME_PCM16_BYTES_PER_SAMPLE = 2;
|
||||
const DISCORD_RAW_PCM_FRAME_BYTES = 3_840;
|
||||
const DISCORD_REALTIME_OUTPUT_PREROLL_FRAMES = 25;
|
||||
@@ -314,6 +316,15 @@ function normalizeControlSpeechText(text: string): string {
|
||||
return text.toLowerCase().replace(/\s+/g, " ").trim();
|
||||
}
|
||||
|
||||
/**
 * Folds one partial (non-final) user transcript update into the text
 * accumulated so far for the current turn.
 *
 * Two update shapes are tolerated: an update that already begins with
 * `previous` is taken as a cumulative snapshot and replaces it; anything else
 * is treated as a delta and appended verbatim (untrimmed, so spacing carried
 * by the delta survives).
 *
 * The result is capped at DISCORD_REALTIME_PARTIAL_TRANSCRIPT_MAX_CHARS by
 * keeping the HEAD of the utterance. The partial-transcript handler only acts
 * on leading-edge wake-name matches, so the start of the turn is the part that
 * must be preserved. Keeping the tail instead would slide the window start
 * into mid-utterance, letting an ambient mention of the name look "leading",
 * and would also defeat the `startsWith(previous)` snapshot check once the
 * buffer had been truncated.
 *
 * @param previous Text accumulated from earlier partial updates of this turn.
 * @param next Newest partial update; whitespace-only updates are ignored.
 * @returns The merged, length-capped partial transcript.
 */
function mergeRealtimePartialTranscript(previous: string, next: string): string {
  const trimmed = next.trim();
  if (!trimmed) {
    // Nothing audible in this update; keep what we already have.
    return previous;
  }
  const merged = trimmed.startsWith(previous) ? trimmed : `${previous}${next}`;
  // Cap from the front: a capped `previous` stays a prefix of later snapshots.
  return merged.slice(0, DISCORD_REALTIME_PARTIAL_TRANSCRIPT_MAX_CHARS);
}
|
||||
|
||||
function resolveDiscordRealtimeWakeNames(params: {
|
||||
config: DiscordRealtimeVoiceConfig;
|
||||
cfg: OpenClawConfig;
|
||||
@@ -380,6 +391,9 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
|
||||
private queuedExactSpeechMessages: string[] = [];
|
||||
private exactSpeechResponseActive = false;
|
||||
private exactSpeechAudioStarted = false;
|
||||
private partialUserTranscript = "";
|
||||
private wakeNameAckedForTurn = false;
|
||||
private wakeNameAckIndex = 0;
|
||||
private lastControlSpeech:
|
||||
| { normalizedText: string; sentAt: number; assistantTranscriptCount: number }
|
||||
| undefined;
|
||||
@@ -499,7 +513,11 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
|
||||
if (isFinal && role === "assistant") {
|
||||
this.suppressDuplicateControlSpeech(text);
|
||||
}
|
||||
if (!isFinal || role !== "user") {
|
||||
if (role !== "user") {
|
||||
return;
|
||||
}
|
||||
if (!isFinal) {
|
||||
this.handlePartialUserTranscript(text);
|
||||
return;
|
||||
}
|
||||
void this.handleFinalUserTranscript(text, { usesRealtimeAgentHandoff });
|
||||
@@ -507,6 +525,9 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
|
||||
onToolCall: (event, session) => this.handleToolCall(event, session),
|
||||
onEvent: (event) => {
|
||||
const detail = event.detail ? ` ${event.detail}` : "";
|
||||
if (event.direction === "server" && event.type === "input_audio_buffer.speech_started") {
|
||||
this.resetPartialWakeNameTracking();
|
||||
}
|
||||
if (shouldLogRealtimeVerboseEvent(event)) {
|
||||
logVoiceVerbose(`realtime ${event.direction}:${event.type}${detail}`);
|
||||
}
|
||||
@@ -567,6 +588,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
|
||||
this.queuedExactSpeechMessages = [];
|
||||
this.exactSpeechResponseActive = false;
|
||||
this.exactSpeechAudioStarted = false;
|
||||
this.resetPartialWakeNameTracking();
|
||||
this.clearOutputAudio("session-close");
|
||||
this.bridge?.close();
|
||||
this.bridge = null;
|
||||
@@ -600,6 +622,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
|
||||
}
|
||||
|
||||
beginSpeakerTurn(context: VoiceRealtimeSpeakerContext, userId: string): VoiceRealtimeSpeakerTurn {
|
||||
this.resetPartialWakeNameTracking();
|
||||
const turn: PendingSpeakerTurn = {
|
||||
context: { ...context, userId },
|
||||
hasAudio: false,
|
||||
@@ -882,6 +905,25 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
|
||||
this.bridge?.sendUserMessage(buildDiscordSpeakExactUserMessage(text));
|
||||
}
|
||||
|
||||
/**
 * Speaks a short acknowledgement for an accepted wake-name match.
 *
 * Does nothing when the match was not allowed, the session is stopped, or an
 * exact-speech response is already active. When interruptible output audio is
 * present the ack is skipped and the skip is logged. Otherwise the next phrase
 * from DISCORD_REALTIME_WAKE_ACKS is chosen round-robin, logged, and sent
 * through the exact-speech path.
 */
private sendWakeNameAck(result: RealtimeVoiceActivationNameTranscriptResult): void {
  if (!result.allowed) {
    return;
  }
  if (this.stopped || this.exactSpeechResponseActive) {
    return;
  }
  const outputActive = this.hasInterruptibleOutputAudio();
  if (outputActive) {
    logger.info(
      `discord voice: realtime wake-name ack skipped outputActive=true voiceSession=${this.params.entry.voiceSessionKey} agent=${this.params.entry.route.agentId}`,
    );
    return;
  }
  // Rotate through the canned phrases; the counter advances once per spoken ack.
  const slot = this.wakeNameAckIndex % DISCORD_REALTIME_WAKE_ACKS.length;
  const phrase = DISCORD_REALTIME_WAKE_ACKS[slot];
  this.wakeNameAckIndex += 1;
  logger.info(
    `discord voice: realtime wake-name ack canonical=${result.activationName} heard=${result.heardName} match=${result.match} voiceSession=${this.params.entry.voiceSessionKey} agent=${this.params.entry.route.agentId}`,
  );
  // Fallback covers an undefined indexed read.
  this.sendExactSpeechMessage(phrase ?? "Yeah.");
}
|
||||
|
||||
private speakControlResult(text: string): void {
|
||||
const trimmed = text.trim();
|
||||
if (this.stopped || !trimmed) {
|
||||
@@ -1151,6 +1193,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
|
||||
if (!trimmed) {
|
||||
return;
|
||||
}
|
||||
this.partialUserTranscript = "";
|
||||
const meetingNotesTurn = this.peekPendingSpeakerTurn();
|
||||
this.recordMeetingNotesUtterance(trimmed, meetingNotesTurn);
|
||||
const wakeNameResult = this.resolveWakeNameTranscript(trimmed);
|
||||
@@ -1200,6 +1243,27 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
|
||||
this.talkback.enqueue(acceptedText, this.consumePendingSpeakerContext());
|
||||
}
|
||||
|
||||
/**
 * Watches partial user transcripts for a leading wake name and triggers at
 * most one acknowledgement per turn.
 *
 * Ignored entirely unless wake-name gating is on and no ack has been recorded
 * for the current turn. Each update is merged into the running partial
 * transcript, which is then matched against the configured wake names; only a
 * match on the leading edge marks the turn as acknowledged and requests the
 * spoken ack.
 */
private handlePartialUserTranscript(text: string): void {
  const tracking = this.requireWakeName && !this.wakeNameAckedForTurn;
  if (!tracking) {
    return;
  }
  const accumulated = mergeRealtimePartialTranscript(this.partialUserTranscript, text);
  this.partialUserTranscript = accumulated;
  const match = matchRealtimeVoiceActivationName(accumulated, this.wakeNames);
  if (match && match.edge === "leading") {
    // Flag first so the turn counts as acknowledged even if the ack is skipped.
    this.wakeNameAckedForTurn = true;
    this.sendWakeNameAck(match);
  }
}
|
||||
|
||||
/**
 * Clears per-turn wake-name state: re-arms the one-ack-per-turn flag and
 * drops the accumulated partial transcript.
 */
private resetPartialWakeNameTracking(): void {
  this.wakeNameAckedForTurn = false;
  this.partialUserTranscript = "";
}
|
||||
|
||||
private resolveWakeNameTranscript(text: string): RealtimeVoiceActivationNameTranscriptResult {
|
||||
if (!this.requireWakeName) {
|
||||
return {
|
||||
@@ -1672,6 +1736,7 @@ function buildDiscordRealtimeInstructions(params: {
|
||||
"Delegate substantive requests, actions, tool work, current facts, memory, workspace context, and user-specific context with openclaw_agent_consult.",
|
||||
"Do not block, refuse, or downscope at the voice layer. Delegate to OpenClaw and treat its result as authoritative.",
|
||||
"Answer directly only for greetings, acknowledgements, brief latency tests, or filler while waiting.",
|
||||
'While waiting for OpenClaw data or tool results, use at most one short natural backchannel such as "yeah", "mm-hmm", "got it", or "one sec"; vary it and do not treat it as the final answer.',
|
||||
"When OpenClaw sends an internal exact answer to speak, do not call tools. Say only that answer.",
|
||||
buildRealtimeVoiceAgentConsultPolicyInstructions({
|
||||
toolPolicy: params.toolPolicy,
|
||||
@@ -1682,6 +1747,7 @@ function buildDiscordRealtimeInstructions(params: {
|
||||
return [
|
||||
base,
|
||||
params.bootstrapContextInstructions?.trim(),
|
||||
'While waiting for OpenClaw data or tool results, use at most one short natural backchannel such as "yeah", "mm-hmm", "got it", or "one sec"; vary it and do not treat it as the final answer.',
|
||||
buildRealtimeVoiceAgentConsultPolicyInstructions({
|
||||
toolPolicy: params.toolPolicy,
|
||||
consultPolicy: params.consultPolicy,
|
||||
|
||||
@@ -50,6 +50,14 @@ describe("realtime voice activation names", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("does not accept fuzzy trailing matches in ambient speech", () => {
  // "Multy" is only a near-miss for "molty" and sits at the trailing edge of
  // ordinary conversation, so it must not activate.
  const ambientSpeech = "I miss the nonsensical German ranting from Multy.";
  const outcome = matchRealtimeVoiceActivationName(ambientSpeech, ["molty"]);
  expect(outcome).toBeUndefined();
});
|
||||
|
||||
it("does not fuzzy match inside a larger phrase without an edge boundary", () => {
|
||||
expect(matchRealtimeVoiceActivationName("maltiness is not a wake name", ["molty"])).toBe(
|
||||
undefined,
|
||||
|
||||
@@ -88,16 +88,16 @@ export function matchRealtimeVoiceActivationName(
|
||||
}
|
||||
const heardCompact = compactActivationName(candidate.heardName);
|
||||
const activationCompact = compactActivationName(normalizedActivationName);
|
||||
if (
|
||||
heardCompact === activationCompact ||
|
||||
isFuzzyActivationNameMatch(candidate, activationName)
|
||||
) {
|
||||
const exactMatch = heardCompact === activationCompact;
|
||||
const fuzzyMatch =
|
||||
candidate.edge === "leading" && isFuzzyActivationNameMatch(candidate, activationName);
|
||||
if (exactMatch || fuzzyMatch) {
|
||||
return {
|
||||
allowed: true,
|
||||
text: stripEdgeActivationNameCandidate(text, candidate),
|
||||
activationName,
|
||||
heardName: candidate.heardName,
|
||||
match: heardCompact === activationCompact ? "exact" : "fuzzy",
|
||||
match: exactMatch ? "exact" : "fuzzy",
|
||||
edge: candidate.edge,
|
||||
};
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user