refactor: share realtime voice activation helpers (#86615)

This commit is contained in:
Peter Steinberger
2026-05-25 20:25:17 +01:00
committed by GitHub
parent 170e0aac2a
commit d0ab0d9922
9 changed files with 548 additions and 364 deletions

View File

@@ -6,6 +6,7 @@ Docs: https://docs.openclaw.ai
### Changes
- Voice: share activation-name matching and consult-transcript screening through the realtime voice SDK so Discord, browser voice, and meeting surfaces can reuse one implementation.
- Cron: default `cron.maxConcurrentRuns` to 8 so scheduled automations and their isolated agent turns can make progress in parallel without explicit configuration.
- QA-Lab: add `qa coverage --match <query>` so focused proof selection can discover matching scenarios from existing metadata before running live or remote lanes.
- Control UI: add an ephemeral Activity tab for sanitized live tool activity summaries without persisting raw telemetry. Fixes #12831. Thanks @BunsDev.

View File

@@ -1,2 +1,2 @@
390681a3d97af8c004db89ead136bd6cff693af5a0ddfe86a8e3c55a29a077eb plugin-sdk-api-baseline.json
8dfaf69ee3d0a946bfdd1d8d97ef85262824d52c20854249f900db61f2a7f7b4 plugin-sdk-api-baseline.jsonl
1d3e6177eeac57fc43736f7d5f76d8f825e1859ca625d268e97dc30b5567ea34 plugin-sdk-api-baseline.json
6c093ff7c10bd81ee9d2c4fc5d07b206bc3a1f5acd0bad491cfc9e0df6689f6b plugin-sdk-api-baseline.jsonl

View File

@@ -3,11 +3,14 @@ import type {
ChannelDoctorLegacyConfigRule,
} from "openclaw/plugin-sdk/channel-contract";
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-contracts";
import {
isSupportedRealtimeVoiceActivationName,
normalizeRealtimeVoiceActivationNamePrefix,
} from "openclaw/plugin-sdk/realtime-voice";
import { asObjectRecord, normalizeLegacyChannelAliases } from "openclaw/plugin-sdk/runtime-doctor";
import { resolveDiscordPreviewStreamMode } from "./preview-streaming.js";
const LEGACY_TTS_PROVIDER_KEYS = ["openai", "elevenlabs", "microsoft", "edge"] as const;
const DISCORD_REALTIME_WAKE_NAME_MAX_WORDS = 2;
type AgentBindingConfig = NonNullable<OpenClawConfig["bindings"]>[number];
function hasLegacyTtsProviderKeys(value: unknown): boolean {
@@ -78,23 +81,6 @@ function hasLegacyDiscordAccountGuildChannelAgentId(value: unknown): boolean {
return Object.values(accounts).some((account) => hasLegacyDiscordGuildChannelAgentId(account));
}
function realtimeWakeNameWordCount(value: string): number {
return Array.from(value.matchAll(/[a-z0-9]+/gi)).length;
}
function normalizeRealtimeWakeName(value: string): string | undefined {
const words = Array.from(value.matchAll(/[a-z0-9]+/gi), (match) => match[0]);
if (words.length === 0) {
return undefined;
}
return words.slice(0, DISCORD_REALTIME_WAKE_NAME_MAX_WORDS).join(" ");
}
function isSupportedRealtimeWakeName(value: string): boolean {
const wordCount = realtimeWakeNameWordCount(value);
return wordCount >= 1 && wordCount <= DISCORD_REALTIME_WAKE_NAME_MAX_WORDS;
}
function hasUnsupportedRealtimeWakeNamesInVoice(value: unknown): boolean {
const voice = asObjectRecord(value);
const realtime = asObjectRecord(voice?.realtime);
@@ -102,7 +88,8 @@ function hasUnsupportedRealtimeWakeNamesInVoice(value: unknown): boolean {
return Array.isArray(wakeNames)
? wakeNames.length === 0 ||
wakeNames.some(
(wakeName) => typeof wakeName === "string" && !isSupportedRealtimeWakeName(wakeName),
(wakeName) =>
typeof wakeName === "string" && !isSupportedRealtimeVoiceActivationName(wakeName),
)
: false;
}
@@ -231,10 +218,10 @@ function normalizeUnsupportedRealtimeWakeNames(
let normalized = 0;
let removed = 0;
const nextWakeNames = wakeNames.flatMap((wakeName) => {
if (typeof wakeName !== "string" || isSupportedRealtimeWakeName(wakeName)) {
if (typeof wakeName !== "string" || isSupportedRealtimeVoiceActivationName(wakeName)) {
return [wakeName];
}
const nextWakeName = normalizeRealtimeWakeName(wakeName);
const nextWakeName = normalizeRealtimeVoiceActivationNamePrefix(wakeName);
if (!nextWakeName) {
removed += 1;
return [];

View File

@@ -3,9 +3,12 @@ import type { DiscordAccountConfig, OpenClawConfig } from "openclaw/plugin-sdk/c
import {
buildRealtimeVoiceAgentConsultChatMessage,
buildRealtimeVoiceAgentConsultPolicyInstructions,
classifySkippableRealtimeVoiceConsultTranscript,
controlRealtimeVoiceAgentRun,
createRealtimeVoiceAgentTalkbackQueue,
createRealtimeVoiceBridgeSession,
matchRealtimeVoiceActivationName,
normalizeSupportedRealtimeVoiceActivationName,
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
REALTIME_VOICE_AGENT_CONTROL_TOOL,
REALTIME_VOICE_AGENT_CONTROL_TOOL_NAME,
@@ -22,6 +25,8 @@ import {
type RealtimeVoiceBridgeSession,
type RealtimeVoiceProviderConfig,
type RealtimeVoiceToolCallEvent,
sortRealtimeVoiceActivationNames,
type RealtimeVoiceActivationNameTranscriptResult,
} from "openclaw/plugin-sdk/realtime-voice";
import { createSubsystemLogger } from "openclaw/plugin-sdk/runtime-env";
import { formatErrorMessage } from "openclaw/plugin-sdk/ssrf-runtime";
@@ -65,35 +70,11 @@ const DISCORD_REALTIME_FORCED_CONSULT_FALLBACK_DELAY_MS = 200;
const DISCORD_REALTIME_DUPLICATE_ERROR_SUPPRESS_MS = 60_000;
const DISCORD_REALTIME_CONTROL_SPEECH_DEDUPE_MS = 5_000;
const DISCORD_REALTIME_OUTPUT_PLAYBACK_WATCHDOG_MARGIN_MS = 1_500;
const DISCORD_REALTIME_WAKE_NAME_EDGE_WORDS = 2;
const REALTIME_PCM16_BYTES_PER_SAMPLE = 2;
const DISCORD_RAW_PCM_FRAME_BYTES = 3_840;
const DISCORD_REALTIME_OUTPUT_PREROLL_FRAMES = 25;
const DISCORD_REALTIME_TRAILING_SILENCE_MIN_MS = 700;
const DISCORD_REALTIME_TRAILING_SILENCE_MAX_MS = 3_000;
const DISCORD_REALTIME_FORCED_CONSULT_TRAILING_FRAGMENT_WORDS = new Set([
"a",
"about",
"an",
"and",
"as",
"at",
"because",
"but",
"by",
"for",
"from",
"in",
"of",
"on",
"or",
"so",
"that",
"the",
"then",
"to",
"with",
]);
const DISCORD_REALTIME_FORCED_CONSULT_REASON =
"provider_final_transcript_without_openclaw_agent_consult";
const DISCORD_REALTIME_VERBOSE_OMITTED_EVENTS = new Set([
@@ -204,28 +185,6 @@ function shouldLogRealtimeVerboseEvent(event: RealtimeVoiceBridgeEvent): boolean
return !DISCORD_REALTIME_VERBOSE_OMITTED_EVENTS.has(event.type);
}
function classifySkippableForcedAgentProxyTranscript(text: string): string | undefined {
const normalized = text.replace(/\s+/g, " ").trim().toLowerCase();
if (!normalized) {
return "empty";
}
if (/(\.\.\.|…)\s*$/.test(normalized)) {
return "incomplete-transcript";
}
const lastWord = normalized.match(/[a-z']+$/)?.[0]?.replace(/^'+|'+$/g, "");
if (lastWord && DISCORD_REALTIME_FORCED_CONSULT_TRAILING_FRAGMENT_WORDS.has(lastWord)) {
return "trailing-fragment";
}
if (
!normalized.includes("?") &&
(/^(i'?ll|i will) be (right )?back\b/.test(normalized) ||
/\b(see you|bye(?:-bye)?|goodbye)\b/.test(normalized))
) {
return "non-actionable-closing";
}
return undefined;
}
function readProviderConfigString(
config: RealtimeVoiceProviderConfig,
key: string,
@@ -355,283 +314,6 @@ function normalizeControlSpeechText(text: string): string {
return text.toLowerCase().replace(/\s+/g, " ").trim();
}
function normalizeWakeName(value: string): string | undefined {
const normalized = value.toLowerCase().replace(/\s+/g, " ").trim();
return normalized || undefined;
}
function normalizeSupportedWakeName(value: string | undefined): string | undefined {
if (typeof value !== "string") {
return undefined;
}
const normalized = normalizeWakeName(value);
const wordCount = normalized ? Array.from(normalized.matchAll(/[a-z0-9]+/gi)).length : 0;
return wordCount >= 1 && wordCount <= DISCORD_REALTIME_WAKE_NAME_EDGE_WORDS
? normalized
: undefined;
}
function normalizeWakeNameCandidate(value: string): string | undefined {
const normalized = value
.toLowerCase()
.replace(/[^a-z0-9]+/g, " ")
.replace(/\s+/g, " ")
.trim();
return normalized || undefined;
}
function compactWakeName(value: string): string {
return value.replace(/[^a-z0-9]+/g, "");
}
type EdgeWakeNameCandidate = {
edge: "leading" | "trailing";
heardName: string;
startIndex: number;
endIndex: number;
strongBoundary: boolean;
};
type WakeNameTranscriptResult =
| { allowed: true; text: string; wakeName: string; heardName: string; match: "exact" | "fuzzy" }
| { allowed: false; text: string };
type AllowedWakeNameTranscriptResult = Extract<WakeNameTranscriptResult, { allowed: true }>;
function leadingWakeNameCandidates(text: string, maxWords: number): EdgeWakeNameCandidate[] {
const opener = /^\s*(?:(?:hey|ok|okay)(?:\s*[-,:;]+\s*|\s+))?/i.exec(text);
const nameStart = opener?.[0].length ?? 0;
const candidates: EdgeWakeNameCandidate[] = [];
const candidateStarts = nameStart > 0 ? [0, nameStart] : [0];
for (const startIndex of candidateStarts) {
const tokenPattern = /[a-z0-9]+/gi;
tokenPattern.lastIndex = startIndex;
const startCandidates: EdgeWakeNameCandidate[] = [];
for (let wordCount = 0; wordCount < maxWords; wordCount += 1) {
const token = tokenPattern.exec(text);
if (!token) {
break;
}
const previousEndIndex =
wordCount === 0 ? startIndex : startCandidates[wordCount - 1]?.endIndex;
const between = text.slice(previousEndIndex, token.index);
if (wordCount > 0 && !/^[\s'-]+$/.test(between)) {
break;
}
const endIndex = token.index + token[0].length;
const heardName = normalizeWakeNameCandidate(text.slice(startIndex, endIndex));
if (!heardName) {
break;
}
const boundary = text.slice(endIndex).match(/^\s*([,.:;!?-]|$)/);
startCandidates.push({
edge: "leading",
heardName,
startIndex,
endIndex,
strongBoundary: Boolean(boundary),
});
}
candidates.push(...startCandidates);
}
return candidates;
}
function trailingWakeNameCandidates(text: string, maxWords: number): EdgeWakeNameCandidate[] {
const tokens = Array.from(text.matchAll(/[a-z0-9]+/gi));
const candidates: EdgeWakeNameCandidate[] = [];
const tokenCount = Math.min(tokens.length, maxWords);
for (let wordCount = 1; wordCount <= tokenCount; wordCount += 1) {
const startToken = tokens[tokens.length - wordCount];
const endToken = tokens[tokens.length - 1];
if (!startToken || !endToken?.[0]) {
break;
}
const startIndex = startToken.index ?? 0;
const endIndex = (endToken.index ?? 0) + endToken[0].length;
if (!/^\s*(?:[,.:;!?-]+\s*)?$/.test(text.slice(endIndex))) {
break;
}
if (!/(^|[\s,.:;!?-])$/.test(text.slice(0, startIndex))) {
break;
}
if (wordCount > 1) {
const previousToken = tokens[tokens.length - wordCount + 1];
const between = previousToken
? text.slice(startIndex + startToken[0].length, previousToken.index)
: "";
if (!/^[\s'-]+$/.test(between)) {
break;
}
}
const heardName = normalizeWakeNameCandidate(text.slice(startIndex, endIndex));
if (!heardName) {
break;
}
candidates.push({
edge: "trailing",
heardName,
startIndex,
endIndex,
strongBoundary: true,
});
}
return candidates;
}
function levenshteinDistance(left: string, right: string): number {
if (left === right) {
return 0;
}
if (!left) {
return right.length;
}
if (!right) {
return left.length;
}
let previous = Array.from({ length: right.length + 1 }, (_, index) => index);
for (let leftIndex = 0; leftIndex < left.length; leftIndex += 1) {
const current = [leftIndex + 1];
for (let rightIndex = 0; rightIndex < right.length; rightIndex += 1) {
const cost = left[leftIndex] === right[rightIndex] ? 0 : 1;
current[rightIndex + 1] = Math.min(
current[rightIndex] + 1,
previous[rightIndex + 1] + 1,
previous[rightIndex] + cost,
);
}
previous = current;
}
return previous[right.length] ?? Math.max(left.length, right.length);
}
function hasOnlyPhoneticSubstitutions(left: string, right: string): boolean {
if (left.length !== right.length) {
return false;
}
const vowels = new Set(["a", "e", "i", "o", "u", "y"]);
const liquids = new Set(["l", "r"]);
let substitutions = 0;
for (let index = 0; index < left.length; index += 1) {
const leftChar = left[index];
const rightChar = right[index];
if (leftChar === rightChar) {
continue;
}
const vowelLike = vowels.has(leftChar ?? "") && vowels.has(rightChar ?? "");
const liquidLike = liquids.has(leftChar ?? "") && liquids.has(rightChar ?? "");
if (!vowelLike && !liquidLike) {
return false;
}
substitutions += 1;
}
return substitutions > 0;
}
function commonPrefixLength(left: string, right: string): number {
const limit = Math.min(left.length, right.length);
for (let index = 0; index < limit; index += 1) {
if (left[index] !== right[index]) {
return index;
}
}
return limit;
}
function isFuzzyWakeNameMatch(candidate: EdgeWakeNameCandidate, wakeName: string): boolean {
const normalizedWakeName = normalizeWakeNameCandidate(wakeName);
if (!normalizedWakeName) {
return false;
}
const heardCompact = compactWakeName(candidate.heardName);
const wakeCompact = compactWakeName(normalizedWakeName);
if (!heardCompact || !wakeCompact || wakeCompact.length < 5) {
return false;
}
if (!candidate.strongBoundary) {
return false;
}
if (heardCompact[0] !== wakeCompact[0]) {
return false;
}
const distance = levenshteinDistance(heardCompact, wakeCompact);
if (distance <= 1) {
return true;
}
if (
distance === 2 &&
heardCompact.length >= 4 &&
wakeCompact.length >= 5 &&
(heardCompact.length !== wakeCompact.length ||
hasOnlyPhoneticSubstitutions(heardCompact, wakeCompact) ||
commonPrefixLength(heardCompact, wakeCompact) >= 6)
) {
return true;
}
if (
distance === 3 &&
heardCompact.length >= 7 &&
wakeCompact.length >= 7 &&
heardCompact.length !== wakeCompact.length &&
commonPrefixLength(heardCompact, wakeCompact) >= 5
) {
return true;
}
return false;
}
function stripEdgeWakeNameCandidate(text: string, candidate: EdgeWakeNameCandidate): string {
if (candidate.edge === "leading") {
return text
.slice(candidate.endIndex)
.replace(/^\s*(?:[-,:;.!?]+\s*)?/, "")
.trim();
}
return text
.slice(0, candidate.startIndex)
.replace(/\s*(?:[-,:;.!?]+\s*)?$/, "")
.trim();
}
function matchEdgeWakeName(
text: string,
wakeNames: string[],
): AllowedWakeNameTranscriptResult | undefined {
const candidates = [
...leadingWakeNameCandidates(text, DISCORD_REALTIME_WAKE_NAME_EDGE_WORDS),
...trailingWakeNameCandidates(text, DISCORD_REALTIME_WAKE_NAME_EDGE_WORDS),
].toSorted(
(left, right) =>
compactWakeName(right.heardName).length - compactWakeName(left.heardName).length,
);
for (const candidate of candidates) {
for (const wakeName of wakeNames) {
const normalizedWakeName = normalizeWakeNameCandidate(wakeName);
if (!normalizedWakeName) {
continue;
}
const heardCompact = compactWakeName(candidate.heardName);
const wakeCompact = compactWakeName(normalizedWakeName);
if (heardCompact === wakeCompact || isFuzzyWakeNameMatch(candidate, wakeName)) {
return {
allowed: true,
text: stripEdgeWakeNameCandidate(text, candidate),
wakeName,
heardName: candidate.heardName,
match: heardCompact === wakeCompact ? "exact" : "fuzzy",
};
}
}
}
return undefined;
}
function resolveDiscordRealtimeWakeNames(params: {
config: DiscordRealtimeVoiceConfig;
cfg: OpenClawConfig;
@@ -640,30 +322,24 @@ function resolveDiscordRealtimeWakeNames(params: {
const rawConfigured = params.config?.wakeNames;
if (rawConfigured) {
const configured = rawConfigured
.map((name) => normalizeSupportedWakeName(name))
.map((name) => normalizeSupportedRealtimeVoiceActivationName(name))
.filter((name): name is string => Boolean(name));
return sortWakeNames(Array.from(new Set(configured)));
return sortRealtimeVoiceActivationNames(Array.from(new Set(configured)));
}
const agent = params.cfg.agents?.list?.find((candidate) => candidate.id === params.agentId);
const configuredAgentNames = [agent?.name, agent?.identity?.name]
.map((name) => normalizeSupportedWakeName(name))
.map((name) => normalizeSupportedRealtimeVoiceActivationName(name))
.filter((name): name is string => Boolean(name));
const productWakeNames = [normalizeSupportedWakeName("OpenClaw")].filter((name): name is string =>
Boolean(name),
const productWakeNames = [normalizeSupportedRealtimeVoiceActivationName("OpenClaw")].filter(
(name): name is string => Boolean(name),
);
const defaults =
configuredAgentNames.length > 0
? [...configuredAgentNames, ...productWakeNames]
: [normalizeSupportedWakeName(params.agentId), ...productWakeNames].filter(
: [normalizeSupportedRealtimeVoiceActivationName(params.agentId), ...productWakeNames].filter(
(name): name is string => Boolean(name),
);
return sortWakeNames(Array.from(new Set(defaults)));
}
function sortWakeNames(wakeNames: string[]): string[] {
return wakeNames.toSorted(
(left, right) => right.length - left.length || left.localeCompare(right),
);
return sortRealtimeVoiceActivationNames(Array.from(new Set(defaults)));
}
function matchesPendingAgentProxyQuestion(consultMessage: string, question: string): boolean {
@@ -1524,14 +1200,21 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
this.talkback.enqueue(acceptedText, this.consumePendingSpeakerContext());
}
private resolveWakeNameTranscript(text: string): WakeNameTranscriptResult {
private resolveWakeNameTranscript(text: string): RealtimeVoiceActivationNameTranscriptResult {
if (!this.requireWakeName) {
return { allowed: true, text, wakeName: "", heardName: "", match: "exact" };
return {
allowed: true,
text,
activationName: "",
heardName: "",
match: "exact",
edge: "leading",
};
}
const wakeNameResult = matchEdgeWakeName(text, this.wakeNames);
const wakeNameResult = matchRealtimeVoiceActivationName(text, this.wakeNames);
if (wakeNameResult) {
logger.info(
`discord voice: realtime wake-name gate matched canonical=${wakeNameResult.wakeName} heard=${wakeNameResult.heardName} match=${wakeNameResult.match} voiceSession=${this.params.entry.voiceSessionKey} agent=${this.params.entry.route.agentId}`,
`discord voice: realtime wake-name gate matched canonical=${wakeNameResult.activationName} heard=${wakeNameResult.heardName} match=${wakeNameResult.match} voiceSession=${this.params.entry.voiceSessionKey} agent=${this.params.entry.route.agentId}`,
);
return wakeNameResult;
}
@@ -1585,7 +1268,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
if (!question) {
return undefined;
}
const skipReason = classifySkippableForcedAgentProxyTranscript(question);
const skipReason = classifySkippableRealtimeVoiceConsultTranscript(question);
if (skipReason) {
const context = this.consumePendingSpeakerContext();
logger.info(

View File

@@ -50,6 +50,23 @@ export {
type TalkTurnResult,
type TalkTurnSuccess,
} from "../talk/talk-session-controller.js";
export {
REALTIME_VOICE_ACTIVATION_NAME_MAX_WORDS,
isSupportedRealtimeVoiceActivationName,
matchRealtimeVoiceActivationName,
normalizeRealtimeVoiceActivationName,
normalizeRealtimeVoiceActivationNamePrefix,
normalizeSupportedRealtimeVoiceActivationName,
realtimeVoiceActivationNameWordCount,
sortRealtimeVoiceActivationNames,
type RealtimeVoiceActivationNameEdge,
type RealtimeVoiceActivationNameMatchKind,
type RealtimeVoiceActivationNameTranscriptResult,
} from "../talk/activation-name.js";
export {
classifySkippableRealtimeVoiceConsultTranscript,
type SkippableRealtimeVoiceConsultTranscriptReason,
} from "../talk/consult-transcript.js";
export {
buildRealtimeVoiceAgentConsultChatMessage,
buildRealtimeVoiceAgentConsultPolicyInstructions,

View File

@@ -0,0 +1,74 @@
import { describe, expect, it } from "vitest";
import {
isSupportedRealtimeVoiceActivationName,
matchRealtimeVoiceActivationName,
normalizeRealtimeVoiceActivationNamePrefix,
normalizeSupportedRealtimeVoiceActivationName,
sortRealtimeVoiceActivationNames,
} from "./activation-name.js";
describe("realtime voice activation names", () => {
  // Covers the normalization helpers: whitespace/case folding plus the word-count limit.
  it("normalizes and validates one- or two-word activation names", () => {
    const tooLong = "Claw Bot Helper";

    expect(normalizeSupportedRealtimeVoiceActivationName(" OpenClaw ")).toBe("openclaw");
    expect(normalizeSupportedRealtimeVoiceActivationName("Open Claw")).toBe("open claw");
    expect(normalizeSupportedRealtimeVoiceActivationName(tooLong)).toBeUndefined();

    expect(isSupportedRealtimeVoiceActivationName("Claw Bot")).toBe(true);
    expect(isSupportedRealtimeVoiceActivationName(tooLong)).toBe(false);

    expect(normalizeRealtimeVoiceActivationNamePrefix(tooLong)).toBe("Claw Bot");
  });

  // A spoken opener ("Hey,") before the name is consumed together with the name.
  it("matches and strips leading exact activation names", () => {
    const result = matchRealtimeVoiceActivationName("Hey, Molty, ship it", ["molty"]);

    expect(result).toEqual({
      allowed: true,
      activationName: "molty",
      edge: "leading",
      heardName: "molty",
      match: "exact",
      text: "ship it",
    });
  });

  it("matches and strips trailing exact activation names", () => {
    const result = matchRealtimeVoiceActivationName("ship it, Claw Bot", ["claw bot"]);

    expect(result).toEqual({
      allowed: true,
      activationName: "claw bot",
      edge: "trailing",
      heardName: "claw bot",
      match: "exact",
      text: "ship it",
    });
  });

  // "Malty" is one edit away from "molty" and is followed by a comma.
  it("accepts bounded fuzzy matches at the transcript edge", () => {
    const result = matchRealtimeVoiceActivationName("Malty, what changed?", ["molty"]);

    expect(result).toMatchObject({
      allowed: true,
      activationName: "molty",
      edge: "leading",
      heardName: "malty",
      match: "fuzzy",
      text: "what changed?",
    });
  });

  it("does not fuzzy match inside a larger phrase without an edge boundary", () => {
    const result = matchRealtimeVoiceActivationName("maltiness is not a wake name", ["molty"]);

    expect(result).toBe(undefined);
  });

  it("prefers longer activation names first", () => {
    const sorted = sortRealtimeVoiceActivationNames(["claw", "claw bot", "openclaw"]);
    expect(sorted).toEqual(["claw bot", "openclaw", "claw"]);

    const result = matchRealtimeVoiceActivationName("Claw Bot, status", ["claw", "claw bot"]);
    expect(result).toEqual({
      allowed: true,
      activationName: "claw bot",
      edge: "leading",
      heardName: "claw bot",
      match: "exact",
      text: "status",
    });
  });
});

334
src/talk/activation-name.ts Normal file
View File

@@ -0,0 +1,334 @@
/** Default word limit used for the `maxWords` parameter of the helpers in this module. */
export const REALTIME_VOICE_ACTIVATION_NAME_MAX_WORDS = 2;
/** Which end of the transcript the activation name was found at. */
export type RealtimeVoiceActivationNameEdge = "leading" | "trailing";
/**
 * How the heard text lined up with the configured name: "exact" when both are
 * equal after reduction to lowercase ASCII letters and digits, "fuzzy" when the
 * edit-distance heuristic accepted a near miss.
 */
export type RealtimeVoiceActivationNameMatchKind = "exact" | "fuzzy";
/**
 * Outcome of gating a transcript on an activation name.
 *
 * The `allowed: true` variant is what `matchRealtimeVoiceActivationName` returns.
 * NOTE(review): nothing in this module builds the `allowed: false` variant;
 * presumably callers create it when no name matched - confirm at the call sites.
 */
export type RealtimeVoiceActivationNameTranscriptResult =
| {
allowed: true;
// Transcript with the matched name (and the punctuation next to it) removed.
text: string;
// The matching entry of the supplied name list, exactly as it was passed in.
activationName: string;
// Normalized (lowercase, ASCII alphanumeric words) text found in the transcript.
heardName: string;
match: RealtimeVoiceActivationNameMatchKind;
edge: RealtimeVoiceActivationNameEdge;
}
| { allowed: false; text: string };
/** A span at one edge of the transcript that might be a spoken activation name. */
type EdgeActivationNameCandidate = {
edge: RealtimeVoiceActivationNameEdge;
// Normalized form of the span; compared against the configured names.
heardName: string;
// Offsets into the original transcript; used with `slice`, so `endIndex` is exclusive.
startIndex: number;
endIndex: number;
// Leading spans: true when followed by punctuation or end of text. Trailing spans: always true.
strongBoundary: boolean;
};
/**
 * Counts the words in `value`, where a word is a maximal run of ASCII letters
 * or digits. Any other character (whitespace, punctuation, non-ASCII letters)
 * only separates words.
 *
 * @returns The number of runs found; 0 when there are none.
 */
export function realtimeVoiceActivationNameWordCount(value: string): number {
  const runs = value.match(/[a-z0-9]+/gi);
  return runs === null ? 0 : runs.length;
}
/**
 * Lowercases `value`, collapses every run of whitespace to a single space and
 * drops leading/trailing whitespace. Punctuation is kept as written.
 *
 * @returns The normalized name, or `undefined` when nothing but whitespace was given.
 */
export function normalizeRealtimeVoiceActivationName(value: string): string | undefined {
  const pieces = value
    .toLowerCase()
    .split(/\s+/)
    .filter((piece) => piece.length > 0);
  if (pieces.length === 0) {
    return undefined;
  }
  return pieces.join(" ");
}
/**
 * Reduces `value` to its first `maxWords` words, joined by single spaces.
 *
 * A word is a maximal run of ASCII letters or digits; everything between words
 * is discarded. Letter case is preserved.
 *
 * @param value Raw activation name, e.g. an over-long configured wake name.
 * @param maxWords Upper bound on the number of words kept. Defaults to
 *   `REALTIME_VOICE_ACTIVATION_NAME_MAX_WORDS`.
 * @returns The shortened name, or `undefined` when `value` contains no word or
 *   `maxWords` leaves no room for one (less than 1, or `NaN`).
 */
export function normalizeRealtimeVoiceActivationNamePrefix(
  value: string,
  maxWords = REALTIME_VOICE_ACTIVATION_NAME_MAX_WORDS,
): string | undefined {
  // `slice(0, n)` with a negative `n` would trim words off the END instead of
  // limiting the prefix, and `n` of 0 or NaN would yield "". Neither is a usable
  // name, so report "nothing" explicitly. The negated comparison also catches NaN.
  if (!(maxWords >= 1)) {
    return undefined;
  }
  const words = value.match(/[a-z0-9]+/gi);
  if (words === null) {
    return undefined;
  }
  return words.slice(0, maxWords).join(" ");
}
/**
 * Tells whether `value` can be used as an activation name: it must contain at
 * least one word and no more than `maxWords` words, as counted by
 * `realtimeVoiceActivationNameWordCount`.
 *
 * @param value Candidate activation name.
 * @param maxWords Largest accepted word count. Defaults to
 *   `REALTIME_VOICE_ACTIVATION_NAME_MAX_WORDS`.
 */
export function isSupportedRealtimeVoiceActivationName(
  value: string,
  maxWords = REALTIME_VOICE_ACTIVATION_NAME_MAX_WORDS,
): boolean {
  const words = realtimeVoiceActivationNameWordCount(value);
  if (words < 1) {
    return false;
  }
  return words <= maxWords;
}
/**
 * Normalizes `value` with `normalizeRealtimeVoiceActivationName` and keeps the
 * result only when it passes `isSupportedRealtimeVoiceActivationName`.
 *
 * @param value Raw name; non-string input (including `undefined`) is rejected.
 * @param maxWords Largest accepted word count. Defaults to
 *   `REALTIME_VOICE_ACTIVATION_NAME_MAX_WORDS`.
 * @returns The normalized name, or `undefined` when the input is missing,
 *   blank, or has an unsupported number of words.
 */
export function normalizeSupportedRealtimeVoiceActivationName(
  value: string | undefined,
  maxWords = REALTIME_VOICE_ACTIVATION_NAME_MAX_WORDS,
): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const candidate = normalizeRealtimeVoiceActivationName(value);
  if (!candidate) {
    return undefined;
  }
  if (!isSupportedRealtimeVoiceActivationName(candidate, maxWords)) {
    return undefined;
  }
  return candidate;
}
/**
 * Returns a sorted copy of `names`: longer strings first, and strings of equal
 * length ordered by `localeCompare`. The input array is not modified.
 */
export function sortRealtimeVoiceActivationNames(names: string[]): string[] {
  const longestFirst = (a: string, b: string): number => {
    const lengthDelta = b.length - a.length;
    return lengthDelta !== 0 ? lengthDelta : a.localeCompare(b);
  };
  return names.toSorted(longestFirst);
}
/**
 * Looks for one of `activationNames` at the start or end of a transcript.
 *
 * Candidate spans of up to `maxWords` words are collected from both edges and
 * tried longest first (by letter/digit count). For each span an exact match
 * against ANY configured name is preferred; only when no name matches the span
 * exactly is a fuzzy match attempted. This keeps a near miss on an earlier
 * entry from shadowing a literal hit on a later one (e.g. hearing "Clawd" with
 * both "claude" and "clawd" configured reports "clawd").
 *
 * @param text Transcript to inspect.
 * @param activationNames Configured names, tried in the given order within each
 *   pass. Entries without any ASCII letter or digit are ignored.
 * @param maxWords Maximum words in a candidate span. Defaults to
 *   `REALTIME_VOICE_ACTIVATION_NAME_MAX_WORDS`.
 * @returns The match, including the transcript with the name stripped, or
 *   `undefined` when no name was found at either edge.
 */
export function matchRealtimeVoiceActivationName(
  text: string,
  activationNames: string[],
  maxWords = REALTIME_VOICE_ACTIVATION_NAME_MAX_WORDS,
): Extract<RealtimeVoiceActivationNameTranscriptResult, { allowed: true }> | undefined {
  const candidates = [
    ...leadingActivationNameCandidates(text, maxWords),
    ...trailingActivationNameCandidates(text, maxWords),
  ].toSorted(
    (left, right) =>
      compactActivationName(right.heardName).length - compactActivationName(left.heardName).length,
  );
  // Reduce each configured name once instead of once per candidate.
  const names = activationNames.flatMap((activationName) => {
    const normalized = normalizeActivationNameCandidate(activationName);
    return normalized ? [{ activationName, compact: compactActivationName(normalized) }] : [];
  });
  for (const candidate of candidates) {
    const heardCompact = compactActivationName(candidate.heardName);
    const exact = names.find((name) => name.compact === heardCompact);
    const matched =
      exact ?? names.find((name) => isFuzzyActivationNameMatch(candidate, name.activationName));
    if (!matched) {
      continue;
    }
    return {
      allowed: true,
      text: stripEdgeActivationNameCandidate(text, candidate),
      activationName: matched.activationName,
      heardName: candidate.heardName,
      match: exact ? "exact" : "fuzzy",
      edge: candidate.edge,
    };
  }
  return undefined;
}

/**
 * Lowercases `value` and turns every run of characters other than ASCII letters
 * and digits into one space. Returns `undefined` when nothing remains.
 */
function normalizeActivationNameCandidate(value: string): string | undefined {
  const normalized = value
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, " ")
    .replace(/\s+/g, " ")
    .trim();
  return normalized || undefined;
}

/** Removes everything except lowercase ASCII letters and digits, so "claw bot" equals "clawbot". */
function compactActivationName(value: string): string {
  return value.replace(/[^a-z0-9]+/g, "");
}

/**
 * Collects candidate spans of 1..`maxWords` words at the start of `text`.
 *
 * Spans are built from offset 0 and, when the text opens with "hey", "ok" or
 * "okay" (or leading whitespace), also from just after that opener.
 */
function leadingActivationNameCandidates(
  text: string,
  maxWords: number,
): EdgeActivationNameCandidate[] {
  const opener = /^\s*(?:(?:hey|ok|okay)(?:\s*[-,:;]+\s*|\s+))?/i.exec(text);
  const nameStart = opener?.[0].length ?? 0;
  const candidates: EdgeActivationNameCandidate[] = [];
  const candidateStarts = nameStart > 0 ? [0, nameStart] : [0];
  for (const startIndex of candidateStarts) {
    const tokenPattern = /[a-z0-9]+/gi;
    tokenPattern.lastIndex = startIndex;
    const startCandidates: EdgeActivationNameCandidate[] = [];
    for (let wordCount = 0; wordCount < maxWords; wordCount += 1) {
      const token = tokenPattern.exec(text);
      if (!token) {
        break;
      }
      const previousEndIndex =
        wordCount === 0 ? startIndex : startCandidates[wordCount - 1]?.endIndex;
      const between = text.slice(previousEndIndex, token.index);
      // Words of one name may only be joined by whitespace, apostrophes or hyphens.
      if (wordCount > 0 && !/^[\s'-]+$/.test(between)) {
        break;
      }
      const endIndex = token.index + token[0].length;
      const heardName = normalizeActivationNameCandidate(text.slice(startIndex, endIndex));
      if (!heardName) {
        break;
      }
      // Punctuation or end of text right after the span marks it as set apart
      // from the rest of the sentence; fuzzy matching requires this.
      const boundary = text.slice(endIndex).match(/^\s*([,.:;!?-]|$)/);
      startCandidates.push({
        edge: "leading",
        heardName,
        startIndex,
        endIndex,
        strongBoundary: Boolean(boundary),
      });
    }
    candidates.push(...startCandidates);
  }
  return candidates;
}

/**
 * Collects candidate spans made of the last 1..`maxWords` words of `text`.
 *
 * Only optional punctuation may follow the final word, and each span must start
 * at the beginning of the text or right after whitespace/punctuation.
 */
function trailingActivationNameCandidates(
  text: string,
  maxWords: number,
): EdgeActivationNameCandidate[] {
  const tokens = Array.from(text.matchAll(/[a-z0-9]+/gi));
  const candidates: EdgeActivationNameCandidate[] = [];
  const tokenCount = Math.min(tokens.length, maxWords);
  for (let wordCount = 1; wordCount <= tokenCount; wordCount += 1) {
    const startToken = tokens[tokens.length - wordCount];
    const endToken = tokens[tokens.length - 1];
    if (!startToken || !endToken?.[0]) {
      break;
    }
    const startIndex = startToken.index ?? 0;
    const endIndex = (endToken.index ?? 0) + endToken[0].length;
    if (!/^\s*(?:[,.:;!?-]+\s*)?$/.test(text.slice(endIndex))) {
      break;
    }
    if (!/(^|[\s,.:;!?-])$/.test(text.slice(0, startIndex))) {
      break;
    }
    if (wordCount > 1) {
      // Only the gap between the newly added word and the one after it needs
      // checking; later gaps were validated by earlier iterations.
      const previousToken = tokens[tokens.length - wordCount + 1];
      const between = previousToken
        ? text.slice(startIndex + startToken[0].length, previousToken.index)
        : "";
      if (!/^[\s'-]+$/.test(between)) {
        break;
      }
    }
    const heardName = normalizeActivationNameCandidate(text.slice(startIndex, endIndex));
    if (!heardName) {
      break;
    }
    candidates.push({
      edge: "trailing",
      heardName,
      startIndex,
      endIndex,
      strongBoundary: true,
    });
  }
  return candidates;
}

/** Edit distance (insert, delete, substitute; each costs 1) computed with two rolling rows. */
function levenshteinDistance(left: string, right: string): number {
  if (left === right) {
    return 0;
  }
  if (!left) {
    return right.length;
  }
  if (!right) {
    return left.length;
  }
  let previous = Array.from({ length: right.length + 1 }, (_, index) => index);
  for (let leftIndex = 0; leftIndex < left.length; leftIndex += 1) {
    const current = [leftIndex + 1];
    for (let rightIndex = 0; rightIndex < right.length; rightIndex += 1) {
      const cost = left[leftIndex] === right[rightIndex] ? 0 : 1;
      current[rightIndex + 1] = Math.min(
        current[rightIndex] + 1,
        previous[rightIndex + 1] + 1,
        previous[rightIndex] + cost,
      );
    }
    previous = current;
  }
  return previous[right.length] ?? Math.max(left.length, right.length);
}

/**
 * True when the strings have equal length, differ in at least one position, and
 * every differing position swaps a vowel (a, e, i, o, u, y) for another vowel
 * or an "l" for an "r" (or the reverse).
 */
function hasOnlyPhoneticSubstitutions(left: string, right: string): boolean {
  if (left.length !== right.length) {
    return false;
  }
  const vowels = new Set(["a", "e", "i", "o", "u", "y"]);
  const liquids = new Set(["l", "r"]);
  let substitutions = 0;
  for (let index = 0; index < left.length; index += 1) {
    const leftChar = left[index];
    const rightChar = right[index];
    if (leftChar === rightChar) {
      continue;
    }
    const vowelLike = vowels.has(leftChar ?? "") && vowels.has(rightChar ?? "");
    const liquidLike = liquids.has(leftChar ?? "") && liquids.has(rightChar ?? "");
    if (!vowelLike && !liquidLike) {
      return false;
    }
    substitutions += 1;
  }
  return substitutions > 0;
}

/** Number of leading characters the two strings share. */
function commonPrefixLength(left: string, right: string): number {
  const limit = Math.min(left.length, right.length);
  for (let index = 0; index < limit; index += 1) {
    if (left[index] !== right[index]) {
      return index;
    }
  }
  return limit;
}

/**
 * Decides whether a heard span is close enough to `activationName` to count as
 * a mis-transcription of it.
 *
 * Preconditions for any fuzzy match: the name has at least 5 letters/digits,
 * the span has a strong boundary, and both start with the same character.
 * Beyond that the edit distance must be:
 * - at most 1; or
 * - 2, with a span of at least 4 characters, and either different lengths,
 *   only vowel/liquid substitutions, or a shared prefix of at least 6; or
 * - 3, with both at least 7 characters long, different lengths, and a shared
 *   prefix of at least 5.
 */
function isFuzzyActivationNameMatch(
  candidate: EdgeActivationNameCandidate,
  activationName: string,
): boolean {
  const normalizedActivationName = normalizeActivationNameCandidate(activationName);
  if (!normalizedActivationName) {
    return false;
  }
  const heardCompact = compactActivationName(candidate.heardName);
  const activationCompact = compactActivationName(normalizedActivationName);
  if (!heardCompact || !activationCompact || activationCompact.length < 5) {
    return false;
  }
  if (!candidate.strongBoundary) {
    return false;
  }
  if (heardCompact[0] !== activationCompact[0]) {
    return false;
  }
  const distance = levenshteinDistance(heardCompact, activationCompact);
  if (distance <= 1) {
    return true;
  }
  if (
    distance === 2 &&
    heardCompact.length >= 4 &&
    activationCompact.length >= 5 &&
    (heardCompact.length !== activationCompact.length ||
      hasOnlyPhoneticSubstitutions(heardCompact, activationCompact) ||
      commonPrefixLength(heardCompact, activationCompact) >= 6)
  ) {
    return true;
  }
  if (
    distance === 3 &&
    heardCompact.length >= 7 &&
    activationCompact.length >= 7 &&
    heardCompact.length !== activationCompact.length &&
    commonPrefixLength(heardCompact, activationCompact) >= 5
  ) {
    return true;
  }
  return false;
}

/**
 * Returns the transcript without the candidate span.
 *
 * For a leading span everything up to and including the span is dropped (so a
 * spoken opener before it goes too); for a trailing span everything from the
 * span onward is dropped. Punctuation and whitespace adjoining the cut are
 * trimmed.
 */
function stripEdgeActivationNameCandidate(
  text: string,
  candidate: EdgeActivationNameCandidate,
): string {
  if (candidate.edge === "leading") {
    return text
      .slice(candidate.endIndex)
      .replace(/^\s*(?:[-,:;.!?]+\s*)?/, "")
      .trim();
  }
  return text
    .slice(0, candidate.startIndex)
    .replace(/\s*(?:[-,:;.!?]+\s*)?$/, "")
    .trim();
}

View File

@@ -0,0 +1,35 @@
import { describe, expect, it } from "vitest";
import { classifySkippableRealtimeVoiceConsultTranscript } from "./consult-transcript.js";
describe("realtime voice consult transcript classification", () => {
  // Short local alias; the exported name is long enough to obscure the inputs.
  const classify = classifySkippableRealtimeVoiceConsultTranscript;

  it("skips empty and incomplete transcripts", () => {
    expect(classify(" ")).toBe("empty");
    // Both the three-dot and the single-character ellipsis mark an unfinished utterance.
    for (const transcript of ["can you check...", "can you check…"]) {
      expect(classify(transcript)).toBe("incomplete-transcript");
    }
  });

  it("skips likely trailing fragments", () => {
    for (const transcript of ["tell me about", "ship it so"]) {
      expect(classify(transcript)).toBe("trailing-fragment");
    }
  });

  it("skips non-actionable closings unless phrased as a question", () => {
    for (const transcript of ["I'll be right back", "goodbye for now"]) {
      expect(classify(transcript)).toBe("non-actionable-closing");
    }
    expect(classify("can you say goodbye?")).toBeUndefined();
  });

  it("keeps actionable transcripts", () => {
    expect(classify("what changed in CI?")).toBeUndefined();
  });
});

View File

@@ -0,0 +1,53 @@
/**
 * Function words that suggest the speaker was cut off when they are the last
 * word of a transcript ("tell me about", "ship it so").
 */
const REALTIME_VOICE_CONSULT_TRAILING_FRAGMENT_WORDS = new Set([
  "a",
  "about",
  "an",
  "and",
  "as",
  "at",
  "because",
  "but",
  "by",
  "for",
  "from",
  "in",
  "of",
  "on",
  "or",
  "so",
  "that",
  "the",
  "then",
  "to",
  "with",
]);

/** Why a transcript was judged not worth sending to the agent. */
export type SkippableRealtimeVoiceConsultTranscriptReason =
  | "empty"
  | "incomplete-transcript"
  | "trailing-fragment"
  | "non-actionable-closing";

/**
 * Screens a transcript before it is used as an agent consult question.
 *
 * Checks run in this order, on the lowercased, whitespace-collapsed text:
 * 1. `"empty"`: nothing but whitespace.
 * 2. `"incomplete-transcript"`: ends with "..." or "…".
 * 3. `"trailing-fragment"`: the final word is a dangling function word from
 *    `REALTIME_VOICE_CONSULT_TRAILING_FRAGMENT_WORDS`.
 * 4. `"non-actionable-closing"`: contains no "?" and either starts with
 *    "I'll/I will be (right) back" or mentions "see you", "bye", "bye-bye" or
 *    "goodbye".
 *
 * @param text Raw transcript text.
 * @returns The skip reason, or `undefined` when the transcript should be kept.
 */
export function classifySkippableRealtimeVoiceConsultTranscript(
  text: string,
): SkippableRealtimeVoiceConsultTranscriptReason | undefined {
  const normalized = text.replace(/\s+/g, " ").trim().toLowerCase();
  if (!normalized) {
    return "empty";
  }
  if (/(\.\.\.|…)\s*$/.test(normalized)) {
    return "incomplete-transcript";
  }
  // The final word must start at a real word boundary. Without the guard the
  // tail of a hyphenated or alphanumeric token ("add-on" -> "on", "how-to" ->
  // "to", "42a" -> "a") is mistaken for a dangling function word and a complete
  // request gets skipped.
  const lastWord = normalized
    .match(/(?:^|[^\p{L}\p{N}'-])([a-z']+)$/u)?.[1]
    ?.replace(/^'+|'+$/g, "");
  if (lastWord && REALTIME_VOICE_CONSULT_TRAILING_FRAGMENT_WORDS.has(lastWord)) {
    return "trailing-fragment";
  }
  if (
    !normalized.includes("?") &&
    (/^(i'?ll|i will) be (right )?back\b/.test(normalized) ||
      /\b(see you|bye(?:-bye)?|goodbye)\b/.test(normalized))
  ) {
    return "non-actionable-closing";
  }
  return undefined;
}