voice-call: harden closed-loop turn loop and transcript routing (#19140)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: 14a3edb005
Co-authored-by: mbelinky <132747814+mbelinky@users.noreply.github.com>
Reviewed-by: @mbelinky
This commit is contained in:
Mariano
2026-02-17 13:02:38 +00:00
committed by GitHub
parent bc4038149c
commit 0c87dbdcfc
12 changed files with 672 additions and 7 deletions

View File

@@ -41,6 +41,40 @@ function resolveDefaultStorePath(config: VoiceCallConfig): string {
return path.join(base, "calls.jsonl");
}
/**
 * Nearest-rank percentile of a numeric series.
 * Returns 0 for an empty series; the input array is never mutated.
 */
function percentile(values: number[], p: number): number {
  if (values.length === 0) {
    return 0;
  }
  // Sort an ascending copy so callers keep their original ordering.
  const ascending = values.slice().sort((left, right) => left - right);
  // Nearest-rank index, clamped into [0, length - 1] for p = 0 and p = 100.
  const rank = Math.ceil((p / 100) * ascending.length) - 1;
  const index = Math.min(ascending.length - 1, Math.max(0, rank));
  return ascending[index] ?? 0;
}
/**
 * Collapse a series of millisecond samples into summary statistics.
 * An empty series yields all-zero stats rather than NaN/Infinity.
 */
function summarizeSeries(values: number[]): {
  count: number;
  minMs: number;
  maxMs: number;
  avgMs: number;
  p50Ms: number;
  p95Ms: number;
} {
  if (values.length === 0) {
    return { count: 0, minMs: 0, maxMs: 0, avgMs: 0, p50Ms: 0, p95Ms: 0 };
  }
  // Single pass for min/max/sum; percentile() sorts its own copy.
  let minMs = Number.POSITIVE_INFINITY;
  let maxMs = Number.NEGATIVE_INFINITY;
  let total = 0;
  for (const sample of values) {
    if (sample < minMs) {
      minMs = sample;
    }
    if (sample > maxMs) {
      maxMs = sample;
    }
    total += sample;
  }
  return {
    count: values.length,
    minMs,
    maxMs,
    avgMs: total / values.length,
    p50Ms: percentile(values, 50),
    p95Ms: percentile(values, 95),
  };
}
export function registerVoiceCallCli(params: {
program: Command;
config: VoiceCallConfig;
@@ -216,6 +250,57 @@ export function registerVoiceCallCli(params: {
}
});
// Offline analysis of the per-turn latency numbers written to calls.jsonl.
root
  .command("latency")
  .description("Summarize turn latency metrics from voice-call JSONL logs")
  .option("--file <path>", "Path to calls.jsonl", resolveDefaultStorePath(config))
  .option("--last <n>", "Analyze last N records", "200")
  .action(async (options: { file: string; last?: string }) => {
    const file = options.file;
    // Number("abc") is NaN, Math.max(1, NaN) is NaN, and slice(-NaN) acts
    // like slice(0) — a non-numeric --last would silently scan every record.
    // Fall back to the documented default of 200 instead.
    const parsedLast = Number(options.last ?? 200);
    const last = Number.isFinite(parsedLast) ? Math.max(1, Math.floor(parsedLast)) : 200;
    if (!fs.existsSync(file)) {
      throw new Error("No log file at " + file);
    }
    const content = fs.readFileSync(file, "utf8");
    // JSONL: one record per line; keep only the newest `last` entries.
    const lines = content.split("\n").filter(Boolean).slice(-last);
    const turnLatencyMs: number[] = [];
    const listenWaitMs: number[] = [];
    for (const line of lines) {
      try {
        const parsed = JSON.parse(line) as {
          metadata?: { lastTurnLatencyMs?: unknown; lastTurnListenWaitMs?: unknown };
        };
        const latency = parsed.metadata?.lastTurnLatencyMs;
        const listenWait = parsed.metadata?.lastTurnListenWaitMs;
        // Accept only finite numbers; older records may lack these fields.
        if (typeof latency === "number" && Number.isFinite(latency)) {
          turnLatencyMs.push(latency);
        }
        if (typeof listenWait === "number" && Number.isFinite(listenWait)) {
          listenWaitMs.push(listenWait);
        }
      } catch {
        // ignore malformed JSON lines
      }
    }
    // eslint-disable-next-line no-console
    console.log(
      JSON.stringify(
        {
          recordsScanned: lines.length,
          turnLatency: summarizeSeries(turnLatencyMs),
          listenWait: summarizeSeries(listenWaitMs),
        },
        null,
        2,
      ),
    );
  });
root
.command("expose")
.description("Enable/disable Tailscale serve/funnel for the webhook")

View File

@@ -20,6 +20,8 @@ class FakeProvider implements VoiceCallProvider {
readonly name = "plivo" as const;
readonly playTtsCalls: PlayTtsInput[] = [];
readonly hangupCalls: HangupCallInput[] = [];
readonly startListeningCalls: StartListeningInput[] = [];
readonly stopListeningCalls: StopListeningInput[] = [];
verifyWebhook(_ctx: WebhookContext): WebhookVerificationResult {
return { ok: true };
@@ -36,8 +38,12 @@ class FakeProvider implements VoiceCallProvider {
async playTts(input: PlayTtsInput): Promise<void> {
this.playTtsCalls.push(input);
}
async startListening(_input: StartListeningInput): Promise<void> {}
async stopListening(_input: StopListeningInput): Promise<void> {}
async startListening(input: StartListeningInput): Promise<void> {
this.startListeningCalls.push(input);
}
async stopListening(input: StopListeningInput): Promise<void> {
this.stopListeningCalls.push(input);
}
}
describe("CallManager", () => {
@@ -261,4 +267,219 @@ describe("CallManager", () => {
expect(manager.getCallByProviderCallId("provider-exact")).toBeDefined();
});
// Full closed-loop turn driven purely by webhook events, no live audio:
// speak prompt -> startListening -> final transcript resolves -> stopListening.
it("completes a closed-loop turn without live audio", async () => {
const config = VoiceCallConfigSchema.parse({
enabled: true,
provider: "plivo",
fromNumber: "+15550000000",
transcriptTimeoutMs: 5000,
});
// Fresh temp store per test run so persisted call records don't collide.
const storePath = path.join(os.tmpdir(), `openclaw-voice-call-test-${Date.now()}`);
const provider = new FakeProvider();
const manager = new CallManager(config, storePath);
manager.initialize(provider, "https://example.com/voice/webhook");
const started = await manager.initiateCall("+15550000003");
expect(started.success).toBe(true);
manager.processEvent({
id: "evt-closed-loop-answered",
type: "call.answered",
callId: started.callId,
providerCallId: "request-uuid",
timestamp: Date.now(),
});
// continueCall blocks until a final transcript arrives, so start it first
// and inject the speech event afterwards.
const turnPromise = manager.continueCall(started.callId, "How can I help?");
// Yield the microtask queue so continueCall reaches its listening state
// before the transcript event is delivered.
await new Promise((resolve) => setTimeout(resolve, 0));
manager.processEvent({
id: "evt-closed-loop-speech",
type: "call.speech",
callId: started.callId,
providerCallId: "request-uuid",
timestamp: Date.now(),
transcript: "Please check status",
isFinal: true,
});
const turn = await turnPromise;
expect(turn.success).toBe(true);
expect(turn.transcript).toBe("Please check status");
// Exactly one listen window was opened and closed for the single turn.
expect(provider.startListeningCalls).toHaveLength(1);
expect(provider.stopListeningCalls).toHaveLength(1);
const call = manager.getCall(started.callId);
// Transcript interleaves the assistant prompt and the caller's reply.
expect(call?.transcript.map((entry) => entry.text)).toEqual([
"How can I help?",
"Please check status",
]);
// Latency metadata is recorded on the call record after each turn.
const metadata = (call?.metadata ?? {}) as Record<string, unknown>;
expect(typeof metadata.lastTurnLatencyMs).toBe("number");
expect(typeof metadata.lastTurnListenWaitMs).toBe("number");
expect(metadata.turnCount).toBe(1);
});
// The activeTurnCalls guard must reject a second continueCall while the
// first turn is still waiting for its transcript.
it("rejects overlapping continueCall requests for the same call", async () => {
const config = VoiceCallConfigSchema.parse({
enabled: true,
provider: "plivo",
fromNumber: "+15550000000",
transcriptTimeoutMs: 5000,
});
const storePath = path.join(os.tmpdir(), `openclaw-voice-call-test-${Date.now()}`);
const provider = new FakeProvider();
const manager = new CallManager(config, storePath);
manager.initialize(provider, "https://example.com/voice/webhook");
const started = await manager.initiateCall("+15550000004");
expect(started.success).toBe(true);
manager.processEvent({
id: "evt-overlap-answered",
type: "call.answered",
callId: started.callId,
providerCallId: "request-uuid",
timestamp: Date.now(),
});
// Leave the first turn pending (no await) and overlap a second one.
const first = manager.continueCall(started.callId, "First prompt");
const second = await manager.continueCall(started.callId, "Second prompt");
expect(second.success).toBe(false);
expect(second.error).toBe("Already waiting for transcript");
// The first turn must still complete normally once speech arrives.
manager.processEvent({
id: "evt-overlap-speech",
type: "call.speech",
callId: started.callId,
providerCallId: "request-uuid",
timestamp: Date.now(),
transcript: "Done",
isFinal: true,
});
const firstResult = await first;
expect(firstResult.success).toBe(true);
expect(firstResult.transcript).toBe("Done");
// The rejected overlap must not have opened a second listen window.
expect(provider.startListeningCalls).toHaveLength(1);
expect(provider.stopListeningCalls).toHaveLength(1);
});
// Two sequential turns: metadata (turnCount, latency fields) must accumulate
// and the transcript must interleave prompts and answers in order.
it("tracks latency metadata across multiple closed-loop turns", async () => {
const config = VoiceCallConfigSchema.parse({
enabled: true,
provider: "plivo",
fromNumber: "+15550000000",
transcriptTimeoutMs: 5000,
});
const storePath = path.join(os.tmpdir(), `openclaw-voice-call-test-${Date.now()}`);
const provider = new FakeProvider();
const manager = new CallManager(config, storePath);
manager.initialize(provider, "https://example.com/voice/webhook");
const started = await manager.initiateCall("+15550000005");
expect(started.success).toBe(true);
manager.processEvent({
id: "evt-multi-answered",
type: "call.answered",
callId: started.callId,
providerCallId: "request-uuid",
timestamp: Date.now(),
});
// Turn 1: start the turn, flush microtasks, then inject the final transcript.
const firstTurn = manager.continueCall(started.callId, "First question");
await new Promise((resolve) => setTimeout(resolve, 0));
manager.processEvent({
id: "evt-multi-speech-1",
type: "call.speech",
callId: started.callId,
providerCallId: "request-uuid",
timestamp: Date.now(),
transcript: "First answer",
isFinal: true,
});
await firstTurn;
// Turn 2: same choreography; must succeed after the first turn cleaned up.
const secondTurn = manager.continueCall(started.callId, "Second question");
await new Promise((resolve) => setTimeout(resolve, 0));
manager.processEvent({
id: "evt-multi-speech-2",
type: "call.speech",
callId: started.callId,
providerCallId: "request-uuid",
timestamp: Date.now(),
transcript: "Second answer",
isFinal: true,
});
const secondResult = await secondTurn;
expect(secondResult.success).toBe(true);
const call = manager.getCall(started.callId);
expect(call?.transcript.map((entry) => entry.text)).toEqual([
"First question",
"First answer",
"Second question",
"Second answer",
]);
const metadata = (call?.metadata ?? {}) as Record<string, unknown>;
expect(metadata.turnCount).toBe(2);
expect(typeof metadata.lastTurnLatencyMs).toBe("number");
expect(typeof metadata.lastTurnListenWaitMs).toBe("number");
// One listen window per turn — no leaked or duplicated listeners.
expect(provider.startListeningCalls).toHaveLength(2);
expect(provider.stopListeningCalls).toHaveLength(2);
});
// Five back-to-back turns: each must resolve with its own transcript and
// leave no stale transcript waiter behind for the next iteration.
it("handles repeated closed-loop turns without waiter churn", async () => {
const config = VoiceCallConfigSchema.parse({
enabled: true,
provider: "plivo",
fromNumber: "+15550000000",
transcriptTimeoutMs: 5000,
});
const storePath = path.join(os.tmpdir(), `openclaw-voice-call-test-${Date.now()}`);
const provider = new FakeProvider();
const manager = new CallManager(config, storePath);
manager.initialize(provider, "https://example.com/voice/webhook");
const started = await manager.initiateCall("+15550000006");
expect(started.success).toBe(true);
manager.processEvent({
id: "evt-loop-answered",
type: "call.answered",
callId: started.callId,
providerCallId: "request-uuid",
timestamp: Date.now(),
});
for (let i = 1; i <= 5; i++) {
// Same per-turn choreography as the single-turn test: start the turn,
// flush microtasks so listening is active, then deliver final speech.
const turnPromise = manager.continueCall(started.callId, `Prompt ${i}`);
await new Promise((resolve) => setTimeout(resolve, 0));
manager.processEvent({
id: `evt-loop-speech-${i}`,
type: "call.speech",
callId: started.callId,
providerCallId: "request-uuid",
timestamp: Date.now(),
transcript: `Answer ${i}`,
isFinal: true,
});
const result = await turnPromise;
expect(result.success).toBe(true);
// Each turn must see its own answer — no cross-turn transcript leakage.
expect(result.transcript).toBe(`Answer ${i}`);
}
const call = manager.getCall(started.callId);
const metadata = (call?.metadata ?? {}) as Record<string, unknown>;
expect(metadata.turnCount).toBe(5);
// Exactly one listen start/stop pair per completed turn.
expect(provider.startListeningCalls).toHaveLength(5);
expect(provider.stopListeningCalls).toHaveLength(5);
});
});

View File

@@ -47,6 +47,7 @@ export class CallManager {
private config: VoiceCallConfig;
private storePath: string;
private webhookUrl: string | null = null;
private activeTurnCalls = new Set<CallId>();
private transcriptWaiters = new Map<
CallId,
{
@@ -137,6 +138,7 @@ export class CallManager {
config: this.config,
storePath: this.storePath,
webhookUrl: this.webhookUrl,
activeTurnCalls: this.activeTurnCalls,
transcriptWaiters: this.transcriptWaiters,
maxDurationTimers: this.maxDurationTimers,
onCallAnswered: (call) => {

View File

@@ -24,6 +24,7 @@ export type CallManagerRuntimeDeps = {
};
// In-memory, per-process bookkeeping shared between the CallManager and its
// conversation/timer helpers. None of this is persisted to the call store.
export type CallManagerTransientState = {
// Calls with a closed-loop turn currently in flight; used to reject
// overlapping continueCall requests for the same call.
activeTurnCalls: Set<CallId>;
// One pending final-transcript waiter per call (resolved/rejected by the
// transcript timer helpers).
transcriptWaiters: Map<CallId, TranscriptWaiter>;
// Per-call timers — presumably enforcing a maximum call duration; confirm
// against the timer helper that arms them.
maxDurationTimers: Map<CallId, NodeJS.Timeout>;
};

View File

@@ -24,6 +24,7 @@ function createContext(overrides: Partial<CallManagerContext> = {}): CallManager
}),
storePath,
webhookUrl: null,
activeTurnCalls: new Set(),
transcriptWaiters: new Map(),
maxDurationTimers: new Map(),
...overrides,

View File

@@ -36,6 +36,7 @@ type ConversationContext = Pick<
| "provider"
| "config"
| "storePath"
| "activeTurnCalls"
| "transcriptWaiters"
| "maxDurationTimers"
>;
@@ -158,7 +159,6 @@ export async function speak(
if (TerminalStates.has(call.state)) {
return { success: false, error: "Call has ended" };
}
try {
transitionState(call, "speaking");
persistCallRecord(ctx.storePath, call);
@@ -242,6 +242,12 @@ export async function continueCall(
if (TerminalStates.has(call.state)) {
return { success: false, error: "Call has ended" };
}
if (ctx.activeTurnCalls.has(callId) || ctx.transcriptWaiters.has(callId)) {
return { success: false, error: "Already waiting for transcript" };
}
ctx.activeTurnCalls.add(callId);
const turnStartedAt = Date.now();
try {
await speak(ctx, callId, prompt);
@@ -249,17 +255,45 @@ export async function continueCall(
transitionState(call, "listening");
persistCallRecord(ctx.storePath, call);
const listenStartedAt = Date.now();
await ctx.provider.startListening({ callId, providerCallId: call.providerCallId });
const transcript = await waitForFinalTranscript(ctx, callId);
const transcriptReceivedAt = Date.now();
// Best-effort: stop listening after final transcript.
await ctx.provider.stopListening({ callId, providerCallId: call.providerCallId });
const lastTurnLatencyMs = transcriptReceivedAt - turnStartedAt;
const lastTurnListenWaitMs = transcriptReceivedAt - listenStartedAt;
const turnCount =
call.metadata && typeof call.metadata.turnCount === "number"
? call.metadata.turnCount + 1
: 1;
call.metadata = {
...(call.metadata ?? {}),
turnCount,
lastTurnLatencyMs,
lastTurnListenWaitMs,
lastTurnCompletedAt: transcriptReceivedAt,
};
persistCallRecord(ctx.storePath, call);
console.log(
"[voice-call] continueCall latency call=" +
call.callId +
" totalMs=" +
String(lastTurnLatencyMs) +
" listenWaitMs=" +
String(lastTurnListenWaitMs),
);
return { success: true, transcript };
} catch (err) {
return { success: false, error: err instanceof Error ? err.message : String(err) };
} finally {
ctx.activeTurnCalls.delete(callId);
clearTranscriptWaiter(ctx, callId);
}
}

View File

@@ -87,8 +87,9 @@ export function resolveTranscriptWaiter(
}
export function waitForFinalTranscript(ctx: TimerContext, callId: CallId): Promise<string> {
// Only allow one in-flight waiter per call.
rejectTranscriptWaiter(ctx, callId, "Transcript waiter replaced");
if (ctx.transcriptWaiters.has(callId)) {
return Promise.reject(new Error("Already waiting for transcript"));
}
const timeoutMs = ctx.config.transcriptTimeoutMs;
return new Promise((resolve, reject) => {