From d0ab0d9922cd2e9fa30fceafef1c3e31b1f1ccd1 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 25 May 2026 20:25:17 +0100 Subject: [PATCH] refactor: share realtime voice activation helpers (#86615) --- CHANGELOG.md | 1 + .../.generated/plugin-sdk-api-baseline.sha256 | 4 +- extensions/discord/src/doctor-contract.ts | 29 +- extensions/discord/src/voice/realtime.ts | 365 ++---------------- src/plugin-sdk/realtime-voice.ts | 17 + src/talk/activation-name.test.ts | 74 ++++ src/talk/activation-name.ts | 334 ++++++++++++++++ src/talk/consult-transcript.test.ts | 35 ++ src/talk/consult-transcript.ts | 53 +++ 9 files changed, 548 insertions(+), 364 deletions(-) create mode 100644 src/talk/activation-name.test.ts create mode 100644 src/talk/activation-name.ts create mode 100644 src/talk/consult-transcript.test.ts create mode 100644 src/talk/consult-transcript.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index c2ac1c486c1..fb34fa6bc2d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ Docs: https://docs.openclaw.ai ### Changes +- Voice: share activation-name matching and consult-transcript screening through the realtime voice SDK so Discord, browser voice, and meeting surfaces can reuse one implementation. - Cron: default `cron.maxConcurrentRuns` to 8 so scheduled automations and their isolated agent turns can make progress in parallel without explicit configuration. - QA-Lab: add `qa coverage --match ` so focused proof selection can discover matching scenarios from existing metadata before running live or remote lanes. - Control UI: add an ephemeral Activity tab for sanitized live tool activity summaries without persisting raw telemetry. Fixes #12831. Thanks @BunsDev. 
diff --git a/docs/.generated/plugin-sdk-api-baseline.sha256 b/docs/.generated/plugin-sdk-api-baseline.sha256 index 18b7834b256..56b09a99ceb 100644 --- a/docs/.generated/plugin-sdk-api-baseline.sha256 +++ b/docs/.generated/plugin-sdk-api-baseline.sha256 @@ -1,2 +1,2 @@ -390681a3d97af8c004db89ead136bd6cff693af5a0ddfe86a8e3c55a29a077eb plugin-sdk-api-baseline.json -8dfaf69ee3d0a946bfdd1d8d97ef85262824d52c20854249f900db61f2a7f7b4 plugin-sdk-api-baseline.jsonl +1d3e6177eeac57fc43736f7d5f76d8f825e1859ca625d268e97dc30b5567ea34 plugin-sdk-api-baseline.json +6c093ff7c10bd81ee9d2c4fc5d07b206bc3a1f5acd0bad491cfc9e0df6689f6b plugin-sdk-api-baseline.jsonl diff --git a/extensions/discord/src/doctor-contract.ts b/extensions/discord/src/doctor-contract.ts index 7e5691b3f7c..4a9ccf29b84 100644 --- a/extensions/discord/src/doctor-contract.ts +++ b/extensions/discord/src/doctor-contract.ts @@ -3,11 +3,14 @@ import type { ChannelDoctorLegacyConfigRule, } from "openclaw/plugin-sdk/channel-contract"; import type { OpenClawConfig } from "openclaw/plugin-sdk/config-contracts"; +import { + isSupportedRealtimeVoiceActivationName, + normalizeRealtimeVoiceActivationNamePrefix, +} from "openclaw/plugin-sdk/realtime-voice"; import { asObjectRecord, normalizeLegacyChannelAliases } from "openclaw/plugin-sdk/runtime-doctor"; import { resolveDiscordPreviewStreamMode } from "./preview-streaming.js"; const LEGACY_TTS_PROVIDER_KEYS = ["openai", "elevenlabs", "microsoft", "edge"] as const; -const DISCORD_REALTIME_WAKE_NAME_MAX_WORDS = 2; type AgentBindingConfig = NonNullable[number]; function hasLegacyTtsProviderKeys(value: unknown): boolean { @@ -78,23 +81,6 @@ function hasLegacyDiscordAccountGuildChannelAgentId(value: unknown): boolean { return Object.values(accounts).some((account) => hasLegacyDiscordGuildChannelAgentId(account)); } -function realtimeWakeNameWordCount(value: string): number { - return Array.from(value.matchAll(/[a-z0-9]+/gi)).length; -} - -function normalizeRealtimeWakeName(value: 
string): string | undefined { - const words = Array.from(value.matchAll(/[a-z0-9]+/gi), (match) => match[0]); - if (words.length === 0) { - return undefined; - } - return words.slice(0, DISCORD_REALTIME_WAKE_NAME_MAX_WORDS).join(" "); -} - -function isSupportedRealtimeWakeName(value: string): boolean { - const wordCount = realtimeWakeNameWordCount(value); - return wordCount >= 1 && wordCount <= DISCORD_REALTIME_WAKE_NAME_MAX_WORDS; -} - function hasUnsupportedRealtimeWakeNamesInVoice(value: unknown): boolean { const voice = asObjectRecord(value); const realtime = asObjectRecord(voice?.realtime); @@ -102,7 +88,8 @@ function hasUnsupportedRealtimeWakeNamesInVoice(value: unknown): boolean { return Array.isArray(wakeNames) ? wakeNames.length === 0 || wakeNames.some( - (wakeName) => typeof wakeName === "string" && !isSupportedRealtimeWakeName(wakeName), + (wakeName) => + typeof wakeName === "string" && !isSupportedRealtimeVoiceActivationName(wakeName), ) : false; } @@ -231,10 +218,10 @@ function normalizeUnsupportedRealtimeWakeNames( let normalized = 0; let removed = 0; const nextWakeNames = wakeNames.flatMap((wakeName) => { - if (typeof wakeName !== "string" || isSupportedRealtimeWakeName(wakeName)) { + if (typeof wakeName !== "string" || isSupportedRealtimeVoiceActivationName(wakeName)) { return [wakeName]; } - const nextWakeName = normalizeRealtimeWakeName(wakeName); + const nextWakeName = normalizeRealtimeVoiceActivationNamePrefix(wakeName); if (!nextWakeName) { removed += 1; return []; diff --git a/extensions/discord/src/voice/realtime.ts b/extensions/discord/src/voice/realtime.ts index dd47daa0f72..afa76a4a9af 100644 --- a/extensions/discord/src/voice/realtime.ts +++ b/extensions/discord/src/voice/realtime.ts @@ -3,9 +3,12 @@ import type { DiscordAccountConfig, OpenClawConfig } from "openclaw/plugin-sdk/c import { buildRealtimeVoiceAgentConsultChatMessage, buildRealtimeVoiceAgentConsultPolicyInstructions, + classifySkippableRealtimeVoiceConsultTranscript, 
controlRealtimeVoiceAgentRun, createRealtimeVoiceAgentTalkbackQueue, createRealtimeVoiceBridgeSession, + matchRealtimeVoiceActivationName, + normalizeSupportedRealtimeVoiceActivationName, REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME, REALTIME_VOICE_AGENT_CONTROL_TOOL, REALTIME_VOICE_AGENT_CONTROL_TOOL_NAME, @@ -22,6 +25,8 @@ import { type RealtimeVoiceBridgeSession, type RealtimeVoiceProviderConfig, type RealtimeVoiceToolCallEvent, + sortRealtimeVoiceActivationNames, + type RealtimeVoiceActivationNameTranscriptResult, } from "openclaw/plugin-sdk/realtime-voice"; import { createSubsystemLogger } from "openclaw/plugin-sdk/runtime-env"; import { formatErrorMessage } from "openclaw/plugin-sdk/ssrf-runtime"; @@ -65,35 +70,11 @@ const DISCORD_REALTIME_FORCED_CONSULT_FALLBACK_DELAY_MS = 200; const DISCORD_REALTIME_DUPLICATE_ERROR_SUPPRESS_MS = 60_000; const DISCORD_REALTIME_CONTROL_SPEECH_DEDUPE_MS = 5_000; const DISCORD_REALTIME_OUTPUT_PLAYBACK_WATCHDOG_MARGIN_MS = 1_500; -const DISCORD_REALTIME_WAKE_NAME_EDGE_WORDS = 2; const REALTIME_PCM16_BYTES_PER_SAMPLE = 2; const DISCORD_RAW_PCM_FRAME_BYTES = 3_840; const DISCORD_REALTIME_OUTPUT_PREROLL_FRAMES = 25; const DISCORD_REALTIME_TRAILING_SILENCE_MIN_MS = 700; const DISCORD_REALTIME_TRAILING_SILENCE_MAX_MS = 3_000; -const DISCORD_REALTIME_FORCED_CONSULT_TRAILING_FRAGMENT_WORDS = new Set([ - "a", - "about", - "an", - "and", - "as", - "at", - "because", - "but", - "by", - "for", - "from", - "in", - "of", - "on", - "or", - "so", - "that", - "the", - "then", - "to", - "with", -]); const DISCORD_REALTIME_FORCED_CONSULT_REASON = "provider_final_transcript_without_openclaw_agent_consult"; const DISCORD_REALTIME_VERBOSE_OMITTED_EVENTS = new Set([ @@ -204,28 +185,6 @@ function shouldLogRealtimeVerboseEvent(event: RealtimeVoiceBridgeEvent): boolean return !DISCORD_REALTIME_VERBOSE_OMITTED_EVENTS.has(event.type); } -function classifySkippableForcedAgentProxyTranscript(text: string): string | undefined { - const normalized = 
text.replace(/\s+/g, " ").trim().toLowerCase(); - if (!normalized) { - return "empty"; - } - if (/(\.\.\.|…)\s*$/.test(normalized)) { - return "incomplete-transcript"; - } - const lastWord = normalized.match(/[a-z']+$/)?.[0]?.replace(/^'+|'+$/g, ""); - if (lastWord && DISCORD_REALTIME_FORCED_CONSULT_TRAILING_FRAGMENT_WORDS.has(lastWord)) { - return "trailing-fragment"; - } - if ( - !normalized.includes("?") && - (/^(i'?ll|i will) be (right )?back\b/.test(normalized) || - /\b(see you|bye(?:-bye)?|goodbye)\b/.test(normalized)) - ) { - return "non-actionable-closing"; - } - return undefined; -} - function readProviderConfigString( config: RealtimeVoiceProviderConfig, key: string, @@ -355,283 +314,6 @@ function normalizeControlSpeechText(text: string): string { return text.toLowerCase().replace(/\s+/g, " ").trim(); } -function normalizeWakeName(value: string): string | undefined { - const normalized = value.toLowerCase().replace(/\s+/g, " ").trim(); - return normalized || undefined; -} - -function normalizeSupportedWakeName(value: string | undefined): string | undefined { - if (typeof value !== "string") { - return undefined; - } - const normalized = normalizeWakeName(value); - const wordCount = normalized ? Array.from(normalized.matchAll(/[a-z0-9]+/gi)).length : 0; - return wordCount >= 1 && wordCount <= DISCORD_REALTIME_WAKE_NAME_EDGE_WORDS - ? 
normalized - : undefined; -} - -function normalizeWakeNameCandidate(value: string): string | undefined { - const normalized = value - .toLowerCase() - .replace(/[^a-z0-9]+/g, " ") - .replace(/\s+/g, " ") - .trim(); - return normalized || undefined; -} - -function compactWakeName(value: string): string { - return value.replace(/[^a-z0-9]+/g, ""); -} - -type EdgeWakeNameCandidate = { - edge: "leading" | "trailing"; - heardName: string; - startIndex: number; - endIndex: number; - strongBoundary: boolean; -}; - -type WakeNameTranscriptResult = - | { allowed: true; text: string; wakeName: string; heardName: string; match: "exact" | "fuzzy" } - | { allowed: false; text: string }; -type AllowedWakeNameTranscriptResult = Extract; - -function leadingWakeNameCandidates(text: string, maxWords: number): EdgeWakeNameCandidate[] { - const opener = /^\s*(?:(?:hey|ok|okay)(?:\s*[-,:;]+\s*|\s+))?/i.exec(text); - const nameStart = opener?.[0].length ?? 0; - const candidates: EdgeWakeNameCandidate[] = []; - const candidateStarts = nameStart > 0 ? [0, nameStart] : [0]; - - for (const startIndex of candidateStarts) { - const tokenPattern = /[a-z0-9]+/gi; - tokenPattern.lastIndex = startIndex; - const startCandidates: EdgeWakeNameCandidate[] = []; - - for (let wordCount = 0; wordCount < maxWords; wordCount += 1) { - const token = tokenPattern.exec(text); - if (!token) { - break; - } - const previousEndIndex = - wordCount === 0 ? 
startIndex : startCandidates[wordCount - 1]?.endIndex; - const between = text.slice(previousEndIndex, token.index); - if (wordCount > 0 && !/^[\s'-]+$/.test(between)) { - break; - } - const endIndex = token.index + token[0].length; - const heardName = normalizeWakeNameCandidate(text.slice(startIndex, endIndex)); - if (!heardName) { - break; - } - const boundary = text.slice(endIndex).match(/^\s*([,.:;!?-]|$)/); - startCandidates.push({ - edge: "leading", - heardName, - startIndex, - endIndex, - strongBoundary: Boolean(boundary), - }); - } - - candidates.push(...startCandidates); - } - - return candidates; -} - -function trailingWakeNameCandidates(text: string, maxWords: number): EdgeWakeNameCandidate[] { - const tokens = Array.from(text.matchAll(/[a-z0-9]+/gi)); - const candidates: EdgeWakeNameCandidate[] = []; - const tokenCount = Math.min(tokens.length, maxWords); - - for (let wordCount = 1; wordCount <= tokenCount; wordCount += 1) { - const startToken = tokens[tokens.length - wordCount]; - const endToken = tokens[tokens.length - 1]; - if (!startToken || !endToken?.[0]) { - break; - } - const startIndex = startToken.index ?? 0; - const endIndex = (endToken.index ?? 0) + endToken[0].length; - if (!/^\s*(?:[,.:;!?-]+\s*)?$/.test(text.slice(endIndex))) { - break; - } - if (!/(^|[\s,.:;!?-])$/.test(text.slice(0, startIndex))) { - break; - } - if (wordCount > 1) { - const previousToken = tokens[tokens.length - wordCount + 1]; - const between = previousToken - ? 
text.slice(startIndex + startToken[0].length, previousToken.index) - : ""; - if (!/^[\s'-]+$/.test(between)) { - break; - } - } - const heardName = normalizeWakeNameCandidate(text.slice(startIndex, endIndex)); - if (!heardName) { - break; - } - candidates.push({ - edge: "trailing", - heardName, - startIndex, - endIndex, - strongBoundary: true, - }); - } - - return candidates; -} - -function levenshteinDistance(left: string, right: string): number { - if (left === right) { - return 0; - } - if (!left) { - return right.length; - } - if (!right) { - return left.length; - } - - let previous = Array.from({ length: right.length + 1 }, (_, index) => index); - for (let leftIndex = 0; leftIndex < left.length; leftIndex += 1) { - const current = [leftIndex + 1]; - for (let rightIndex = 0; rightIndex < right.length; rightIndex += 1) { - const cost = left[leftIndex] === right[rightIndex] ? 0 : 1; - current[rightIndex + 1] = Math.min( - current[rightIndex] + 1, - previous[rightIndex + 1] + 1, - previous[rightIndex] + cost, - ); - } - previous = current; - } - return previous[right.length] ?? Math.max(left.length, right.length); -} - -function hasOnlyPhoneticSubstitutions(left: string, right: string): boolean { - if (left.length !== right.length) { - return false; - } - const vowels = new Set(["a", "e", "i", "o", "u", "y"]); - const liquids = new Set(["l", "r"]); - let substitutions = 0; - for (let index = 0; index < left.length; index += 1) { - const leftChar = left[index]; - const rightChar = right[index]; - if (leftChar === rightChar) { - continue; - } - const vowelLike = vowels.has(leftChar ?? "") && vowels.has(rightChar ?? ""); - const liquidLike = liquids.has(leftChar ?? "") && liquids.has(rightChar ?? 
""); - if (!vowelLike && !liquidLike) { - return false; - } - substitutions += 1; - } - return substitutions > 0; -} - -function commonPrefixLength(left: string, right: string): number { - const limit = Math.min(left.length, right.length); - for (let index = 0; index < limit; index += 1) { - if (left[index] !== right[index]) { - return index; - } - } - return limit; -} - -function isFuzzyWakeNameMatch(candidate: EdgeWakeNameCandidate, wakeName: string): boolean { - const normalizedWakeName = normalizeWakeNameCandidate(wakeName); - if (!normalizedWakeName) { - return false; - } - const heardCompact = compactWakeName(candidate.heardName); - const wakeCompact = compactWakeName(normalizedWakeName); - if (!heardCompact || !wakeCompact || wakeCompact.length < 5) { - return false; - } - if (!candidate.strongBoundary) { - return false; - } - if (heardCompact[0] !== wakeCompact[0]) { - return false; - } - const distance = levenshteinDistance(heardCompact, wakeCompact); - if (distance <= 1) { - return true; - } - if ( - distance === 2 && - heardCompact.length >= 4 && - wakeCompact.length >= 5 && - (heardCompact.length !== wakeCompact.length || - hasOnlyPhoneticSubstitutions(heardCompact, wakeCompact) || - commonPrefixLength(heardCompact, wakeCompact) >= 6) - ) { - return true; - } - if ( - distance === 3 && - heardCompact.length >= 7 && - wakeCompact.length >= 7 && - heardCompact.length !== wakeCompact.length && - commonPrefixLength(heardCompact, wakeCompact) >= 5 - ) { - return true; - } - return false; -} - -function stripEdgeWakeNameCandidate(text: string, candidate: EdgeWakeNameCandidate): string { - if (candidate.edge === "leading") { - return text - .slice(candidate.endIndex) - .replace(/^\s*(?:[-,:;.!?]+\s*)?/, "") - .trim(); - } - return text - .slice(0, candidate.startIndex) - .replace(/\s*(?:[-,:;.!?]+\s*)?$/, "") - .trim(); -} - -function matchEdgeWakeName( - text: string, - wakeNames: string[], -): AllowedWakeNameTranscriptResult | undefined { - const candidates 
= [ - ...leadingWakeNameCandidates(text, DISCORD_REALTIME_WAKE_NAME_EDGE_WORDS), - ...trailingWakeNameCandidates(text, DISCORD_REALTIME_WAKE_NAME_EDGE_WORDS), - ].toSorted( - (left, right) => - compactWakeName(right.heardName).length - compactWakeName(left.heardName).length, - ); - for (const candidate of candidates) { - for (const wakeName of wakeNames) { - const normalizedWakeName = normalizeWakeNameCandidate(wakeName); - if (!normalizedWakeName) { - continue; - } - const heardCompact = compactWakeName(candidate.heardName); - const wakeCompact = compactWakeName(normalizedWakeName); - if (heardCompact === wakeCompact || isFuzzyWakeNameMatch(candidate, wakeName)) { - return { - allowed: true, - text: stripEdgeWakeNameCandidate(text, candidate), - wakeName, - heardName: candidate.heardName, - match: heardCompact === wakeCompact ? "exact" : "fuzzy", - }; - } - } - } - return undefined; -} - function resolveDiscordRealtimeWakeNames(params: { config: DiscordRealtimeVoiceConfig; cfg: OpenClawConfig; @@ -640,30 +322,24 @@ function resolveDiscordRealtimeWakeNames(params: { const rawConfigured = params.config?.wakeNames; if (rawConfigured) { const configured = rawConfigured - .map((name) => normalizeSupportedWakeName(name)) + .map((name) => normalizeSupportedRealtimeVoiceActivationName(name)) .filter((name): name is string => Boolean(name)); - return sortWakeNames(Array.from(new Set(configured))); + return sortRealtimeVoiceActivationNames(Array.from(new Set(configured))); } const agent = params.cfg.agents?.list?.find((candidate) => candidate.id === params.agentId); const configuredAgentNames = [agent?.name, agent?.identity?.name] - .map((name) => normalizeSupportedWakeName(name)) + .map((name) => normalizeSupportedRealtimeVoiceActivationName(name)) .filter((name): name is string => Boolean(name)); - const productWakeNames = [normalizeSupportedWakeName("OpenClaw")].filter((name): name is string => - Boolean(name), + const productWakeNames = 
[normalizeSupportedRealtimeVoiceActivationName("OpenClaw")].filter( + (name): name is string => Boolean(name), ); const defaults = configuredAgentNames.length > 0 ? [...configuredAgentNames, ...productWakeNames] - : [normalizeSupportedWakeName(params.agentId), ...productWakeNames].filter( + : [normalizeSupportedRealtimeVoiceActivationName(params.agentId), ...productWakeNames].filter( (name): name is string => Boolean(name), ); - return sortWakeNames(Array.from(new Set(defaults))); -} - -function sortWakeNames(wakeNames: string[]): string[] { - return wakeNames.toSorted( - (left, right) => right.length - left.length || left.localeCompare(right), - ); + return sortRealtimeVoiceActivationNames(Array.from(new Set(defaults))); } function matchesPendingAgentProxyQuestion(consultMessage: string, question: string): boolean { @@ -1524,14 +1200,21 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { this.talkback.enqueue(acceptedText, this.consumePendingSpeakerContext()); } - private resolveWakeNameTranscript(text: string): WakeNameTranscriptResult { + private resolveWakeNameTranscript(text: string): RealtimeVoiceActivationNameTranscriptResult { if (!this.requireWakeName) { - return { allowed: true, text, wakeName: "", heardName: "", match: "exact" }; + return { + allowed: true, + text, + activationName: "", + heardName: "", + match: "exact", + edge: "leading", + }; } - const wakeNameResult = matchEdgeWakeName(text, this.wakeNames); + const wakeNameResult = matchRealtimeVoiceActivationName(text, this.wakeNames); if (wakeNameResult) { logger.info( - `discord voice: realtime wake-name gate matched canonical=${wakeNameResult.wakeName} heard=${wakeNameResult.heardName} match=${wakeNameResult.match} voiceSession=${this.params.entry.voiceSessionKey} agent=${this.params.entry.route.agentId}`, + `discord voice: realtime wake-name gate matched canonical=${wakeNameResult.activationName} heard=${wakeNameResult.heardName} match=${wakeNameResult.match} 
voiceSession=${this.params.entry.voiceSessionKey} agent=${this.params.entry.route.agentId}`, ); return wakeNameResult; } @@ -1585,7 +1268,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { if (!question) { return undefined; } - const skipReason = classifySkippableForcedAgentProxyTranscript(question); + const skipReason = classifySkippableRealtimeVoiceConsultTranscript(question); if (skipReason) { const context = this.consumePendingSpeakerContext(); logger.info( diff --git a/src/plugin-sdk/realtime-voice.ts b/src/plugin-sdk/realtime-voice.ts index c095cf258cc..85a1e3bb560 100644 --- a/src/plugin-sdk/realtime-voice.ts +++ b/src/plugin-sdk/realtime-voice.ts @@ -50,6 +50,23 @@ export { type TalkTurnResult, type TalkTurnSuccess, } from "../talk/talk-session-controller.js"; +export { + REALTIME_VOICE_ACTIVATION_NAME_MAX_WORDS, + isSupportedRealtimeVoiceActivationName, + matchRealtimeVoiceActivationName, + normalizeRealtimeVoiceActivationName, + normalizeRealtimeVoiceActivationNamePrefix, + normalizeSupportedRealtimeVoiceActivationName, + realtimeVoiceActivationNameWordCount, + sortRealtimeVoiceActivationNames, + type RealtimeVoiceActivationNameEdge, + type RealtimeVoiceActivationNameMatchKind, + type RealtimeVoiceActivationNameTranscriptResult, +} from "../talk/activation-name.js"; +export { + classifySkippableRealtimeVoiceConsultTranscript, + type SkippableRealtimeVoiceConsultTranscriptReason, +} from "../talk/consult-transcript.js"; export { buildRealtimeVoiceAgentConsultChatMessage, buildRealtimeVoiceAgentConsultPolicyInstructions, diff --git a/src/talk/activation-name.test.ts b/src/talk/activation-name.test.ts new file mode 100644 index 00000000000..d90792389bc --- /dev/null +++ b/src/talk/activation-name.test.ts @@ -0,0 +1,74 @@ +import { describe, expect, it } from "vitest"; +import { + isSupportedRealtimeVoiceActivationName, + matchRealtimeVoiceActivationName, + normalizeRealtimeVoiceActivationNamePrefix, + 
normalizeSupportedRealtimeVoiceActivationName, + sortRealtimeVoiceActivationNames, +} from "./activation-name.js"; + +describe("realtime voice activation names", () => { + it("normalizes and validates one- or two-word activation names", () => { + expect(normalizeSupportedRealtimeVoiceActivationName(" OpenClaw ")).toBe("openclaw"); + expect(normalizeSupportedRealtimeVoiceActivationName("Open Claw")).toBe("open claw"); + expect(normalizeSupportedRealtimeVoiceActivationName("Claw Bot Helper")).toBeUndefined(); + expect(isSupportedRealtimeVoiceActivationName("Claw Bot")).toBe(true); + expect(isSupportedRealtimeVoiceActivationName("Claw Bot Helper")).toBe(false); + expect(normalizeRealtimeVoiceActivationNamePrefix("Claw Bot Helper")).toBe("Claw Bot"); + }); + + it("matches and strips leading exact activation names", () => { + expect(matchRealtimeVoiceActivationName("Hey, Molty, ship it", ["molty"])).toEqual({ + allowed: true, + activationName: "molty", + edge: "leading", + heardName: "molty", + match: "exact", + text: "ship it", + }); + }); + + it("matches and strips trailing exact activation names", () => { + expect(matchRealtimeVoiceActivationName("ship it, Claw Bot", ["claw bot"])).toEqual({ + allowed: true, + activationName: "claw bot", + edge: "trailing", + heardName: "claw bot", + match: "exact", + text: "ship it", + }); + }); + + it("accepts bounded fuzzy matches at the transcript edge", () => { + expect(matchRealtimeVoiceActivationName("Malty, what changed?", ["molty"])).toMatchObject({ + allowed: true, + activationName: "molty", + edge: "leading", + heardName: "malty", + match: "fuzzy", + text: "what changed?", + }); + }); + + it("does not fuzzy match inside a larger phrase without an edge boundary", () => { + expect(matchRealtimeVoiceActivationName("maltiness is not a wake name", ["molty"])).toBe( + undefined, + ); + }); + + it("prefers longer activation names first", () => { + expect(sortRealtimeVoiceActivationNames(["claw", "claw bot", 
"openclaw"])).toEqual([ + "claw bot", + "openclaw", + "claw", + ]); + expect(matchRealtimeVoiceActivationName("Claw Bot, status", ["claw", "claw bot"])).toEqual({ + allowed: true, + activationName: "claw bot", + edge: "leading", + heardName: "claw bot", + match: "exact", + text: "status", + }); + }); +}); diff --git a/src/talk/activation-name.ts b/src/talk/activation-name.ts new file mode 100644 index 00000000000..66244176be3 --- /dev/null +++ b/src/talk/activation-name.ts @@ -0,0 +1,334 @@ +export const REALTIME_VOICE_ACTIVATION_NAME_MAX_WORDS = 2; + +export type RealtimeVoiceActivationNameEdge = "leading" | "trailing"; +export type RealtimeVoiceActivationNameMatchKind = "exact" | "fuzzy"; + +export type RealtimeVoiceActivationNameTranscriptResult = + | { + allowed: true; + text: string; + activationName: string; + heardName: string; + match: RealtimeVoiceActivationNameMatchKind; + edge: RealtimeVoiceActivationNameEdge; + } + | { allowed: false; text: string }; + +type EdgeActivationNameCandidate = { + edge: RealtimeVoiceActivationNameEdge; + heardName: string; + startIndex: number; + endIndex: number; + strongBoundary: boolean; +}; + +export function realtimeVoiceActivationNameWordCount(value: string): number { + return Array.from(value.matchAll(/[a-z0-9]+/gi)).length; +} + +export function normalizeRealtimeVoiceActivationName(value: string): string | undefined { + const normalized = value.toLowerCase().replace(/\s+/g, " ").trim(); + return normalized || undefined; +} + +export function normalizeRealtimeVoiceActivationNamePrefix( + value: string, + maxWords = REALTIME_VOICE_ACTIVATION_NAME_MAX_WORDS, +): string | undefined { + const words = Array.from(value.matchAll(/[a-z0-9]+/gi), (match) => match[0]); + if (words.length === 0) { + return undefined; + } + return words.slice(0, maxWords).join(" "); +} + +export function isSupportedRealtimeVoiceActivationName( + value: string, + maxWords = REALTIME_VOICE_ACTIVATION_NAME_MAX_WORDS, +): boolean { + const wordCount = 
realtimeVoiceActivationNameWordCount(value); + return wordCount >= 1 && wordCount <= maxWords; +} + +export function normalizeSupportedRealtimeVoiceActivationName( + value: string | undefined, + maxWords = REALTIME_VOICE_ACTIVATION_NAME_MAX_WORDS, +): string | undefined { + if (typeof value !== "string") { + return undefined; + } + const normalized = normalizeRealtimeVoiceActivationName(value); + return normalized && isSupportedRealtimeVoiceActivationName(normalized, maxWords) + ? normalized + : undefined; +} + +export function sortRealtimeVoiceActivationNames(names: string[]): string[] { + return names.toSorted((left, right) => right.length - left.length || left.localeCompare(right)); +} + +export function matchRealtimeVoiceActivationName( + text: string, + activationNames: string[], + maxWords = REALTIME_VOICE_ACTIVATION_NAME_MAX_WORDS, +): Extract | undefined { + const candidates = [ + ...leadingActivationNameCandidates(text, maxWords), + ...trailingActivationNameCandidates(text, maxWords), + ].toSorted( + (left, right) => + compactActivationName(right.heardName).length - compactActivationName(left.heardName).length, + ); + + for (const candidate of candidates) { + for (const activationName of activationNames) { + const normalizedActivationName = normalizeActivationNameCandidate(activationName); + if (!normalizedActivationName) { + continue; + } + const heardCompact = compactActivationName(candidate.heardName); + const activationCompact = compactActivationName(normalizedActivationName); + if ( + heardCompact === activationCompact || + isFuzzyActivationNameMatch(candidate, activationName) + ) { + return { + allowed: true, + text: stripEdgeActivationNameCandidate(text, candidate), + activationName, + heardName: candidate.heardName, + match: heardCompact === activationCompact ? 
"exact" : "fuzzy", + edge: candidate.edge, + }; + } + } + } + return undefined; +} + +function normalizeActivationNameCandidate(value: string): string | undefined { + const normalized = value + .toLowerCase() + .replace(/[^a-z0-9]+/g, " ") + .replace(/\s+/g, " ") + .trim(); + return normalized || undefined; +} + +function compactActivationName(value: string): string { + return value.replace(/[^a-z0-9]+/g, ""); +} + +function leadingActivationNameCandidates( + text: string, + maxWords: number, +): EdgeActivationNameCandidate[] { + const opener = /^\s*(?:(?:hey|ok|okay)(?:\s*[-,:;]+\s*|\s+))?/i.exec(text); + const nameStart = opener?.[0].length ?? 0; + const candidates: EdgeActivationNameCandidate[] = []; + const candidateStarts = nameStart > 0 ? [0, nameStart] : [0]; + + for (const startIndex of candidateStarts) { + const tokenPattern = /[a-z0-9]+/gi; + tokenPattern.lastIndex = startIndex; + const startCandidates: EdgeActivationNameCandidate[] = []; + + for (let wordCount = 0; wordCount < maxWords; wordCount += 1) { + const token = tokenPattern.exec(text); + if (!token) { + break; + } + const previousEndIndex = + wordCount === 0 ? 
startIndex : startCandidates[wordCount - 1]?.endIndex; + const between = text.slice(previousEndIndex, token.index); + if (wordCount > 0 && !/^[\s'-]+$/.test(between)) { + break; + } + const endIndex = token.index + token[0].length; + const heardName = normalizeActivationNameCandidate(text.slice(startIndex, endIndex)); + if (!heardName) { + break; + } + const boundary = text.slice(endIndex).match(/^\s*([,.:;!?-]|$)/); + startCandidates.push({ + edge: "leading", + heardName, + startIndex, + endIndex, + strongBoundary: Boolean(boundary), + }); + } + + candidates.push(...startCandidates); + } + + return candidates; +} + +function trailingActivationNameCandidates( + text: string, + maxWords: number, +): EdgeActivationNameCandidate[] { + const tokens = Array.from(text.matchAll(/[a-z0-9]+/gi)); + const candidates: EdgeActivationNameCandidate[] = []; + const tokenCount = Math.min(tokens.length, maxWords); + + for (let wordCount = 1; wordCount <= tokenCount; wordCount += 1) { + const startToken = tokens[tokens.length - wordCount]; + const endToken = tokens[tokens.length - 1]; + if (!startToken || !endToken?.[0]) { + break; + } + const startIndex = startToken.index ?? 0; + const endIndex = (endToken.index ?? 0) + endToken[0].length; + if (!/^\s*(?:[,.:;!?-]+\s*)?$/.test(text.slice(endIndex))) { + break; + } + if (!/(^|[\s,.:;!?-])$/.test(text.slice(0, startIndex))) { + break; + } + if (wordCount > 1) { + const previousToken = tokens[tokens.length - wordCount + 1]; + const between = previousToken + ? 
text.slice(startIndex + startToken[0].length, previousToken.index) + : ""; + if (!/^[\s'-]+$/.test(between)) { + break; + } + } + const heardName = normalizeActivationNameCandidate(text.slice(startIndex, endIndex)); + if (!heardName) { + break; + } + candidates.push({ + edge: "trailing", + heardName, + startIndex, + endIndex, + strongBoundary: true, + }); + } + + return candidates; +} + +function levenshteinDistance(left: string, right: string): number { + if (left === right) { + return 0; + } + if (!left) { + return right.length; + } + if (!right) { + return left.length; + } + + let previous = Array.from({ length: right.length + 1 }, (_, index) => index); + for (let leftIndex = 0; leftIndex < left.length; leftIndex += 1) { + const current = [leftIndex + 1]; + for (let rightIndex = 0; rightIndex < right.length; rightIndex += 1) { + const cost = left[leftIndex] === right[rightIndex] ? 0 : 1; + current[rightIndex + 1] = Math.min( + current[rightIndex] + 1, + previous[rightIndex + 1] + 1, + previous[rightIndex] + cost, + ); + } + previous = current; + } + return previous[right.length] ?? Math.max(left.length, right.length); +} + +function hasOnlyPhoneticSubstitutions(left: string, right: string): boolean { + if (left.length !== right.length) { + return false; + } + const vowels = new Set(["a", "e", "i", "o", "u", "y"]); + const liquids = new Set(["l", "r"]); + let substitutions = 0; + for (let index = 0; index < left.length; index += 1) { + const leftChar = left[index]; + const rightChar = right[index]; + if (leftChar === rightChar) { + continue; + } + const vowelLike = vowels.has(leftChar ?? "") && vowels.has(rightChar ?? ""); + const liquidLike = liquids.has(leftChar ?? "") && liquids.has(rightChar ?? 
""); + if (!vowelLike && !liquidLike) { + return false; + } + substitutions += 1; + } + return substitutions > 0; +} + +function commonPrefixLength(left: string, right: string): number { + const limit = Math.min(left.length, right.length); + for (let index = 0; index < limit; index += 1) { + if (left[index] !== right[index]) { + return index; + } + } + return limit; +} + +function isFuzzyActivationNameMatch( + candidate: EdgeActivationNameCandidate, + activationName: string, +): boolean { + const normalizedActivationName = normalizeActivationNameCandidate(activationName); + if (!normalizedActivationName) { + return false; + } + const heardCompact = compactActivationName(candidate.heardName); + const activationCompact = compactActivationName(normalizedActivationName); + if (!heardCompact || !activationCompact || activationCompact.length < 5) { + return false; + } + if (!candidate.strongBoundary) { + return false; + } + if (heardCompact[0] !== activationCompact[0]) { + return false; + } + const distance = levenshteinDistance(heardCompact, activationCompact); + if (distance <= 1) { + return true; + } + if ( + distance === 2 && + heardCompact.length >= 4 && + activationCompact.length >= 5 && + (heardCompact.length !== activationCompact.length || + hasOnlyPhoneticSubstitutions(heardCompact, activationCompact) || + commonPrefixLength(heardCompact, activationCompact) >= 6) + ) { + return true; + } + if ( + distance === 3 && + heardCompact.length >= 7 && + activationCompact.length >= 7 && + heardCompact.length !== activationCompact.length && + commonPrefixLength(heardCompact, activationCompact) >= 5 + ) { + return true; + } + return false; +} + +function stripEdgeActivationNameCandidate( + text: string, + candidate: EdgeActivationNameCandidate, +): string { + if (candidate.edge === "leading") { + return text + .slice(candidate.endIndex) + .replace(/^\s*(?:[-,:;.!?]+\s*)?/, "") + .trim(); + } + return text + .slice(0, candidate.startIndex) + 
/**
 * Words that mark a transcript as a likely unfinished fragment when they are
 * its final word (articles, prepositions and conjunctions).
 */
const REALTIME_VOICE_CONSULT_TRAILING_FRAGMENT_WORDS = new Set([
  "a",
  "about",
  "an",
  "and",
  "as",
  "at",
  "because",
  "but",
  "by",
  "for",
  "from",
  "in",
  "of",
  "on",
  "or",
  "so",
  "that",
  "the",
  "then",
  "to",
  "with",
]);

/** Why a realtime voice transcript can be skipped instead of being consulted on. */
export type SkippableRealtimeVoiceConsultTranscriptReason =
  | "empty"
  | "incomplete-transcript"
  | "trailing-fragment"
  | "non-actionable-closing";

/**
 * Classifies a realtime voice transcript that does not warrant a consult.
 *
 * The text is normalized first: typographic single quotes (U+2018/U+2019) are
 * folded to `'`, whitespace runs are collapsed, the ends are trimmed, and the
 * result is lower-cased. Checks then run in this order, first hit wins:
 *
 * 1. `"empty"` - nothing left after normalization.
 * 2. `"incomplete-transcript"` - ends with `...` or `…`.
 * 3. `"trailing-fragment"` - the final word (letters/apostrophes running to
 *    the very end of the text) is in
 *    `REALTIME_VOICE_CONSULT_TRAILING_FRAGMENT_WORDS`.
 * 4. `"non-actionable-closing"` - contains no `?` and either starts with
 *    "I'll/Ill/I will be (right) back" or mentions "see you", "bye",
 *    "bye-bye" or "goodbye".
 *
 * @param text - Raw transcript text.
 * @returns The skip reason, or `undefined` when none of the checks match.
 */
export function classifySkippableRealtimeVoiceConsultTranscript(
  text: string,
): SkippableRealtimeVoiceConsultTranscriptReason | undefined {
  const normalized = text
    // Transcribers may emit curly apostrophes ("I’ll"); without folding them
    // the apostrophe-aware patterns below silently miss those transcripts.
    .replace(/[\u2018\u2019]/g, "'")
    .replace(/\s+/g, " ")
    .trim()
    .toLowerCase();
  if (!normalized) {
    return "empty";
  }
  if (/(\.\.\.|…)\s*$/.test(normalized)) {
    return "incomplete-transcript";
  }
  // Only matches when the text ends in a word; trailing punctuation opts out.
  const lastWord = normalized.match(/[a-z']+$/)?.[0]?.replace(/^'+|'+$/g, "");
  if (lastWord && REALTIME_VOICE_CONSULT_TRAILING_FRAGMENT_WORDS.has(lastWord)) {
    return "trailing-fragment";
  }
  if (
    !normalized.includes("?") &&
    (/^(i'?ll|i will) be (right )?back\b/.test(normalized) ||
      /\b(see you|bye(?:-bye)?|goodbye)\b/.test(normalized))
  ) {
    return "non-actionable-closing";
  }
  return undefined;
}