fix(voice-call): support per-call session scope

This commit is contained in:
Peter Steinberger
2026-05-02 08:42:38 +01:00
parent b9096de37c
commit e4aab1419a
17 changed files with 298 additions and 17 deletions

View File

@@ -30,6 +30,7 @@ Docs: https://docs.openclaw.ai
- Providers/OpenAI: resolve `keychain:<service>:<account>` `OPENAI_API_KEY` refs before creating OpenAI Realtime browser sessions or voice bridges, with a bounded cached Keychain lookup. Fixes #72120. Thanks @ctbritt.
- Discord/gateway: reconnect when the gateway socket closes while waiting for the shared IDENTIFY concurrency window, instead of silently skipping IDENTIFY and leaving the bot online but unresponsive. Fixes #74617. Thanks @zeeskdr-ai.
- Voice Call: add `sessionScope: "per-call"` for fresh per-call agent memory while preserving the default per-phone caller history. Fixes #45280. Thanks @pondcountry.
- Telegram/startup: use the existing `getMe` request guard for the gateway bot probe instead of a fixed 2.5-second budget, and honor higher `timeoutSeconds` configs for slow Telegram API paths. Fixes #75783. Thanks @tankotan.
- Telegram/models: make model picker confirmations say selections are session-scoped and do not change the agent's persistent default. Fixes #75965. Thanks @sd1114820.
- Control UI/slash commands: keep fallback command metadata on a browser-safe registry path, so provider thinking runtime imports cannot blank the Web UI with `process is not defined`. Fixes #75987. Thanks @novkien.

View File

@@ -1,4 +1,4 @@
737056ad5544e24250ce91c000ae4a5fe0af751681a529f2e4710b383ef5d4e7 config-baseline.json
a7158716d9262edba32ef9a18ab04d9f48f83cb903444b6f87b991977b6be52f config-baseline.json
2d132b4c2e3b0e0f2524fc1cc889d3be658ad0e40c970b2d367bf27348883658 config-baseline.core.json
f42329d45c095881bd226bdb192c235980658fd250606d0c0badc2b12f12f5d3 config-baseline.channel.json
726c2fb81319f05be6977cdf5c9598884feafc600e6c76d482be626f4983bc32 config-baseline.plugin.json
de03faf42db470fe419a3f93a5777161f830f0355912603c6795945e42f39735 config-baseline.plugin.json

View File

@@ -109,6 +109,7 @@ Voice-call credentials accept SecretRefs. `plugins.entries.voice-call.config.twi
provider: "twilio", // or "telnyx" | "plivo" | "mock"
fromNumber: "+15550001234", // or TWILIO_FROM_NUMBER for Twilio
toNumber: "+15550005678",
sessionScope: "per-phone", // per-phone | per-call
twilio: {
accountSid: "ACxxxxxxxx",
@@ -192,6 +193,14 @@ Voice-call credentials accept SecretRefs. `plugins.entries.voice-call.config.twi
</Accordion>
</AccordionGroup>
## Session scope
By default, Voice Call uses `sessionScope: "per-phone"` so repeat calls from
the same caller keep conversation memory. Set `sessionScope: "per-call"` when
each carrier call should start with fresh context, for example reception,
booking, IVR, or Google Meet bridge flows where the same phone number may
represent different meetings.
## Realtime voice conversations
`realtime` selects a full-duplex realtime voice provider for live call
@@ -212,7 +221,7 @@ Current runtime behaviour:
- Voice Call exposes the shared `openclaw_agent_consult` realtime tool by default. The realtime model can call it when the caller asks for deeper reasoning, current information, or normal OpenClaw tools.
- `realtime.fastContext.enabled` is default-off. When enabled, Voice Call first searches indexed memory/session context for the consult question and returns those snippets to the realtime model within `realtime.fastContext.timeoutMs` before falling back to the full consult agent only if `realtime.fastContext.fallbackToConsult` is true.
- If `realtime.provider` points at an unregistered provider, or no realtime voice provider is registered at all, Voice Call logs a warning and skips realtime media instead of failing the whole plugin.
- Consult session keys reuse the existing voice session when available, then fall back to the caller/callee phone number so follow-up consult calls keep context during the call.
- Consult session keys reuse the call's stored session key when available; otherwise the key is derived from the configured `sessionScope` (`per-phone` by default, or `per-call` for isolated calls).
### Tool policy

View File

@@ -40,6 +40,7 @@ Put under `plugins.entries.voice-call.config`:
provider: "twilio", // or "telnyx" | "plivo" | "mock"
fromNumber: "+15550001234",
toNumber: "+15550005678",
sessionScope: "per-phone", // or "per-call"
twilio: {
accountSid: "ACxxxxxxxx",
@@ -104,6 +105,7 @@ Notes:
- If older configs still use `provider: "log"`, `twilio.from`, or legacy `streaming.*` OpenAI keys, run `openclaw doctor --fix` to rewrite them.
- advanced webhook, streaming, and tunnel notes: `https://docs.openclaw.ai/plugins/voice-call`
- `responseModel` is optional. When unset, voice responses use the runtime default model.
- `sessionScope` defaults to `per-phone`, preserving caller memory across calls. Use `per-call` for reception, booking, IVR, and bridge flows where each carrier call should start fresh.
## Stale call reaper

View File

@@ -189,6 +189,10 @@
"label": "Call Log Store Path",
"advanced": true
},
"sessionScope": {
"label": "Session Scope",
"help": "Use per-phone to preserve caller memory across calls, or per-call to isolate every call into a fresh voice session."
},
"responseModel": {
"label": "Response Model",
"help": "Optional override. Falls back to the runtime default model when unset.",
@@ -767,6 +771,10 @@
"store": {
"type": "string"
},
"sessionScope": {
"type": "string",
"enum": ["per-phone", "per-call"]
},
"responseModel": {
"type": "string"
},

View File

@@ -2,6 +2,7 @@ import { afterEach, beforeEach, describe, expect, it } from "vitest";
import {
VoiceCallConfigSchema,
resolveTwilioAuthToken,
resolveVoiceCallSessionKey,
validateProviderConfig,
normalizeVoiceCallConfig,
resolveVoiceCallConfig,
@@ -256,6 +257,53 @@ describe("resolveVoiceCallConfig", () => {
expect(config.staleCallReaperSeconds).toBe(120);
});
// Default scope: with no sessionScope supplied, the resolved config reports "per-phone" and the
// session key is "voice:" followed by only the digits of the phone number.
it("keeps voice sessions scoped by phone by default", () => {
const config = resolveVoiceCallConfig({ enabled: true, provider: "mock" });
expect(config.sessionScope).toBe("per-phone");
expect(
resolveVoiceCallSessionKey({
config,
callId: "call-123",
// Formatting characters (+, spaces, parentheses, dash) are expected to be stripped.
phone: "+1 (555) 000-1111",
}),
).toBe("voice:15550001111");
});
// Per-call scope: the key is built from the callId ("voice:call:<callId>") even though a
// phone number is also supplied.
it("can scope voice sessions to each call", () => {
const config = resolveVoiceCallConfig({
enabled: true,
provider: "mock",
sessionScope: "per-call",
});
expect(config.sessionScope).toBe("per-call");
expect(
resolveVoiceCallSessionKey({
config,
callId: "call-123",
phone: "+1 (555) 000-1111",
}),
).toBe("voice:call:call-123");
});
// An explicit session key takes precedence over the configured scope: even with
// sessionScope "per-call", the supplied key is returned unchanged.
it("preserves explicit voice session keys", () => {
const config = resolveVoiceCallConfig({
enabled: true,
provider: "mock",
sessionScope: "per-call",
});
expect(
resolveVoiceCallSessionKey({
config,
callId: "call-123",
phone: "+1 (555) 000-1111",
explicitSessionKey: "meet-room-1",
}),
).toBe("meet-room-1");
});
});
describe("normalizeVoiceCallConfig", () => {

View File

@@ -173,6 +173,9 @@ export type WebhookSecurityConfig = z.infer<typeof VoiceCallWebhookSecurityConfi
const CallModeSchema = z.enum(["notify", "conversation"]);
export type CallMode = z.infer<typeof CallModeSchema>;
/** Accepted values for the voice-call `sessionScope` setting: "per-phone" or "per-call". */
const VoiceCallSessionScopeSchema = z.enum(["per-phone", "per-call"]);
/** Union of the scope literals accepted by {@link VoiceCallSessionScopeSchema}. */
export type VoiceCallSessionScope = z.infer<typeof VoiceCallSessionScopeSchema>;
const OutboundConfigSchema = z
.object({
/** Default call mode for outbound calls */
@@ -393,6 +396,9 @@ export const VoiceCallConfigSchema = z
/** Realtime voice-to-voice configuration */
realtime: VoiceCallRealtimeConfigSchema,
/** Session memory scope for voice conversations. */
sessionScope: VoiceCallSessionScopeSchema.default("per-phone"),
/** Public webhook URL override (if set, bypasses tunnel auto-detection) */
publicUrl: z.string().url().optional(),
@@ -549,6 +555,23 @@ export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCal
};
}
/**
 * Picks the session key under which a voice call's conversation state is kept.
 *
 * Precedence, first match wins:
 * 1. `explicitSessionKey`, trimmed, when it is non-empty after trimming.
 * 2. `voice:call:<callId>` when `config.sessionScope` is `"per-call"`.
 * 3. `voice:<digits>` built from the digits contained in `phone`.
 * 4. `voice:<callId>` when `phone` is absent or contains no digits.
 *
 * @param params.config - Voice-call config; only `sessionScope` is read.
 * @param params.callId - Identifier of the call, used for per-call keys and as the last fallback.
 * @param params.phone - Phone number of the remote party, in any formatting.
 * @param params.explicitSessionKey - Caller-supplied key that overrides scope-based resolution.
 * @returns The resolved session key.
 */
export function resolveVoiceCallSessionKey(params: {
  config: Pick<VoiceCallConfig, "sessionScope">;
  callId: string;
  phone?: string;
  explicitSessionKey?: string;
}): string {
  const { config, callId, phone, explicitSessionKey } = params;

  const trimmedExplicitKey = explicitSessionKey?.trim();
  if (trimmedExplicitKey) {
    return trimmedExplicitKey;
  }

  if (config.sessionScope === "per-call") {
    return `voice:call:${callId}`;
  }

  // Keep only the digits so differently formatted numbers map to the same key.
  const phoneDigits = phone?.replace(/\D/g, "");
  if (phoneDigits) {
    return `voice:${phoneDigits}`;
  }
  return `voice:${callId}`;
}
/**
* Resolves the configuration by merging environment variables into missing fields.
* Returns a new configuration object with environment variables applied.

View File

@@ -426,6 +426,33 @@ describe("processEvent (functional)", () => {
expect(call.direction).toBe("inbound");
});
// Inbound webhook path: with sessionScope "per-call", processing a call.initiated event should
// leave the active call record with a "voice:call:<callId>" session key.
it("assigns per-call session keys to inbound calls when configured", () => {
const ctx = createContext({
config: VoiceCallConfigSchema.parse({
enabled: true,
provider: "plivo",
fromNumber: "+15550000000",
inboundPolicy: "open",
sessionScope: "per-call",
}),
});
const event: NormalizedEvent = {
id: "evt-inbound-session-scope",
type: "call.initiated",
callId: "CA-inbound-session-scope",
providerCallId: "CA-inbound-session-scope",
timestamp: Date.now(),
direction: "inbound",
from: "+15554444444",
to: "+15550000000",
};
processEvent(ctx, event);
const call = requireFirstActiveCall(ctx);
// Compared against the stored record's own callId rather than the callId on the event.
expect(call.sessionKey).toBe(`voice:call:${call.callId}`);
});
it("deduplicates by dedupeKey even when event IDs differ", () => {
const now = Date.now();
const ctx = createContext();

View File

@@ -1,6 +1,7 @@
import crypto from "node:crypto";
import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime";
import { isAllowlistedCaller, normalizePhoneNumber } from "../allowlist.js";
import { resolveVoiceCallSessionKey } from "../config.js";
import type { CallRecord, NormalizedEvent } from "../types.js";
import type { CallManagerContext } from "./context.js";
import { finalizeCall } from "./lifecycle.js";
@@ -73,6 +74,11 @@ function createWebhookCall(params: {
state: "ringing",
from: params.from,
to: params.to,
sessionKey: resolveVoiceCallSessionKey({
config: params.ctx.config,
callId,
phone: params.direction === "outbound" ? params.to : params.from,
}),
startedAt: Date.now(),
transcript: [],
processedEventIds: [],

View File

@@ -170,9 +170,35 @@ describe("voice-call outbound helpers", () => {
inlineTwiml: "<Response />",
});
expect(ctx.providerCallIdMap.get("provider-1")).toBe(callId);
expect(ctx.activeCalls.get(callId)?.sessionKey).toBe("session-1");
expect(persistCallRecordMock).toHaveBeenCalledTimes(2);
});
// Outbound path: initiateCall is invoked without an explicit session key, so with
// sessionScope "per-call" the stored call should get "voice:call:<callId>".
it("assigns per-call session keys to outbound calls when configured", async () => {
const initiateProviderCall = vi.fn(async () => ({ providerCallId: "provider-1" }));
// Hand-built partial context; it is cast with `as never` when passed to initiateCall below.
const ctx = {
activeCalls: new Map(),
providerCallIdMap: new Map(),
provider: { name: "twilio", initiateCall: initiateProviderCall },
config: {
maxConcurrentCalls: 3,
outbound: { defaultMode: "conversation" },
fromNumber: "+14155550100",
sessionScope: "per-call",
},
storePath: "/tmp/voice-call.json",
webhookUrl: "https://example.com/webhook",
};
const result = await initiateCall(ctx as never, "+14155550123");
expect(result).toEqual({
callId: expect.any(String),
success: true,
});
// The expected key is built from the callId returned by initiateCall.
expect(ctx.activeCalls.get(result.callId)?.sessionKey).toBe(`voice:call:${result.callId}`);
});
it("initiates conversation calls with pre-connect DTMF TwiML", async () => {
const initiateProviderCall = vi.fn(async () => ({ providerCallId: "provider-1" }));
const ctx = {

View File

@@ -1,6 +1,6 @@
import crypto from "node:crypto";
import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime";
import type { CallMode } from "../config.js";
import { resolveVoiceCallSessionKey, type CallMode } from "../config.js";
import { resolvePreferredTtsVoice } from "../tts-provider-voice.js";
import {
type EndReason,
@@ -162,7 +162,12 @@ export async function initiateCall(
state: "initiated",
from,
to,
sessionKey,
sessionKey: resolveVoiceCallSessionKey({
config: ctx.config,
callId,
phone: to,
explicitSessionKey: sessionKey,
}),
startedAt: Date.now(),
transcript: [],
processedEventIds: [],

View File

@@ -191,6 +191,37 @@ describe("generateVoiceResponse", () => {
);
});
// Classic (non-realtime) response path: when a sessionKey is passed in, the session store
// entry and the embedded agent run should use that key instead of a phone-derived one.
it("uses the persisted per-call session key for classic responses", async () => {
const { runtime, runEmbeddedPiAgent, sessionStore } = createAgentRuntime([
{ text: '{"spoken":"Fresh call context."}' },
]);
const voiceConfig = VoiceCallConfigSchema.parse({
sessionScope: "per-call",
responseTimeoutMs: 5000,
});
const result = await generateVoiceResponse({
voiceConfig,
coreConfig: {} as CoreConfig,
agentRuntime: runtime,
callId: "call-123",
sessionKey: "voice:call:call-123",
from: "+15550001111",
transcript: [{ speaker: "user", text: "hello there" }],
userMessage: "hello there",
});
expect(result.text).toBe("Fresh call context.");
// An entry exists under the per-call key, and none was created under the caller's phone key.
expect(sessionStore["voice:call:call-123"]).toBeDefined();
expect(sessionStore["voice:15550001111"]).toBeUndefined();
expect(runEmbeddedPiAgent).toHaveBeenCalledWith(
expect.objectContaining({
sessionKey: "voice:call:call-123",
sandboxSessionKey: "agent:main:voice:call:call-123",
}),
);
});
it("uses the main agent workspace when voice config omits agentId", async () => {
const {
runtime,

View File

@@ -7,7 +7,7 @@ import crypto from "node:crypto";
import { applyModelOverrideToSessionEntry } from "openclaw/plugin-sdk/model-session-runtime";
import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtime";
import type { SessionEntry } from "../api.js";
import type { VoiceCallConfig } from "./config.js";
import { resolveVoiceCallSessionKey, type VoiceCallConfig } from "./config.js";
import type { CoreAgentDeps, CoreConfig } from "./core-bridge.js";
import { resolveVoiceResponseModel } from "./response-model.js";
@@ -20,6 +20,8 @@ export type VoiceResponseParams = {
agentRuntime: CoreAgentDeps;
/** Call ID for session tracking */
callId: string;
/** Persisted call session key */
sessionKey?: string;
/** Caller's phone number */
from: string;
/** Conversation transcript */
@@ -187,16 +189,28 @@ function resolveVoiceSandboxSessionKey(agentId: string, sessionKey: string): str
export async function generateVoiceResponse(
params: VoiceResponseParams,
): Promise<VoiceResponseResult> {
const { voiceConfig, callId, from, transcript, userMessage, coreConfig, agentRuntime } = params;
const {
voiceConfig,
callId,
sessionKey,
from,
transcript,
userMessage,
coreConfig,
agentRuntime,
} = params;
if (!coreConfig) {
return { text: null, error: "Core config unavailable for voice response" };
}
const cfg = coreConfig;
// Build voice-specific session key based on phone number
const normalizedPhone = from.replace(/\D/g, "");
const sessionKey = `voice:${normalizedPhone}`;
const resolvedSessionKey = resolveVoiceCallSessionKey({
config: voiceConfig,
callId,
phone: from,
explicitSessionKey: sessionKey,
});
const agentId = voiceConfig.agentId ?? "main";
// Resolve paths
@@ -210,7 +224,7 @@ export async function generateVoiceResponse(
// Load or create session entry
const sessionStore = agentRuntime.session.loadSessionStore(storePath);
const now = Date.now();
let sessionEntry = sessionStore[sessionKey] as SessionEntry | undefined;
let sessionEntry = sessionStore[resolvedSessionKey] as SessionEntry | undefined;
let sessionEntryUpdated = false;
if (!sessionEntry) {
@@ -218,7 +232,7 @@ export async function generateVoiceResponse(
sessionId: crypto.randomUUID(),
updatedAt: now,
};
sessionStore[sessionKey] = sessionEntry;
sessionStore[resolvedSessionKey] = sessionEntry;
sessionEntryUpdated = true;
}
@@ -271,8 +285,8 @@ export async function generateVoiceResponse(
try {
const result = await agentRuntime.runEmbeddedPiAgent({
sessionId,
sessionKey,
sandboxSessionKey: resolveVoiceSandboxSessionKey(agentId, sessionKey),
sessionKey: resolvedSessionKey,
sandboxSessionKey: resolveVoiceSandboxSessionKey(agentId, resolvedSessionKey),
agentId,
messageProvider: "voice",
sessionFile,

View File

@@ -28,6 +28,22 @@ const mocks = vi.hoisted(() => ({
}));
vi.mock("./config.js", () => ({
resolveVoiceCallSessionKey: (params: {
config: Pick<VoiceCallConfig, "sessionScope">;
callId: string;
phone?: string;
explicitSessionKey?: string;
}) => {
const explicit = params.explicitSessionKey?.trim();
if (explicit) {
return explicit;
}
if (params.config.sessionScope === "per-call") {
return `voice:call:${params.callId}`;
}
const normalizedPhone = params.phone?.replace(/\D/g, "");
return normalizedPhone ? `voice:${normalizedPhone}` : `voice:${params.callId}`;
},
resolveVoiceCallConfig: mocks.resolveVoiceCallConfig,
resolveTwilioAuthToken: mocks.resolveTwilioAuthToken,
validateProviderConfig: mocks.validateProviderConfig,
@@ -382,6 +398,64 @@ describe("createVoiceCallRuntime lifecycle", () => {
);
});
// Realtime consult path: the registered tool handler should run the embedded agent with the
// session key stored on the call record.
// NOTE(review): the mocked call record already carries a sessionKey, so this appears to cover
// the stored-key path rather than the sessionScope fallback — confirm that is the intent.
it("uses persisted per-call session keys for realtime consults", async () => {
const config = createBaseConfig();
config.inboundPolicy = "allowlist";
config.realtime.enabled = true;
config.sessionScope = "per-call";
const runEmbeddedPiAgent = vi.fn(async () => ({
payloads: [{ text: "Per-call consult answer." }],
meta: {},
}));
const sessionStore: Record<string, unknown> = {};
// Stubbed agent runtime; every dependency is a vi.fn returning a fixed value.
const agentRuntime = {
defaults: { provider: "openai", model: "gpt-5.4" },
resolveAgentDir: vi.fn(() => "/tmp/agent"),
resolveAgentWorkspaceDir: vi.fn(() => "/tmp/workspace"),
resolveAgentIdentity: vi.fn(),
resolveThinkingDefault: vi.fn(() => "high"),
resolveAgentTimeoutMs: vi.fn(() => 30_000),
ensureAgentWorkspace: vi.fn(async () => {}),
session: {
resolveStorePath: vi.fn(() => "/tmp/sessions.json"),
loadSessionStore: vi.fn(() => sessionStore),
saveSessionStore: vi.fn(async () => {}),
resolveSessionFilePath: vi.fn(() => "/tmp/session.json"),
},
runEmbeddedPiAgent,
};
// Call record the mocked manager returns when a call is looked up.
mocks.managerGetCall.mockReturnValue({
callId: "call-1",
sessionKey: "voice:call:call-1",
direction: "inbound",
from: "+15550001234",
to: "+15550009999",
transcript: [],
});
await createVoiceCallRuntime({
config,
coreConfig: {} as CoreConfig,
agentRuntime: agentRuntime as never,
});
// Grab the handler passed as the second argument of the first registerToolHandler call.
const handler = mocks.realtimeHandlerRegisterToolHandler.mock.calls[0]?.[1] as
| ((
args: unknown,
callId: string,
context?: { partialUserTranscript?: string },
) => Promise<unknown>)
| undefined;
await expect(handler?.({ question: "What should I say?" }, "call-1")).resolves.toEqual({
text: "Per-call consult answer.",
});
expect(runEmbeddedPiAgent).toHaveBeenCalledWith(
expect.objectContaining({
sessionKey: "voice:call:call-1",
}),
);
});
it("answers realtime consults from fast memory context before starting the full agent", async () => {
const config = createBaseConfig();
config.realtime.enabled = true;

View File

@@ -10,6 +10,7 @@ import {
} from "openclaw/plugin-sdk/realtime-voice";
import type { VoiceCallConfig } from "./config.js";
import {
resolveVoiceCallSessionKey,
resolveTwilioAuthToken,
resolveVoiceCallConfig,
validateProviderConfig,
@@ -103,6 +104,7 @@ function loadRealtimeHandler(): Promise<RealtimeHandlerModule> {
}
function resolveVoiceCallConsultSessionKey(call: {
config: VoiceCallConfig;
sessionKey?: string;
from?: string;
to?: string;
@@ -113,8 +115,11 @@ function resolveVoiceCallConsultSessionKey(call: {
return call.sessionKey;
}
const phone = call.direction === "outbound" ? call.to : call.from;
const normalizedPhone = phone?.replace(/\D/g, "");
return normalizedPhone ? `voice:${normalizedPhone}` : `voice:${call.callId}`;
return resolveVoiceCallSessionKey({
config: call.config,
callId: call.callId,
phone,
});
}
function mapVoiceCallConsultTranscript(
@@ -335,7 +340,7 @@ export async function createVoiceCallRuntime(params: {
return { error: `Call "${callId}" not found` };
}
const agentId = config.agentId ?? "main";
const sessionKey = resolveVoiceCallConsultSessionKey(call);
const sessionKey = resolveVoiceCallConsultSessionKey({ ...call, config });
const fastContext = await resolveRealtimeFastContextConsult({
cfg,
agentId,

View File

@@ -18,6 +18,7 @@ export function createVoiceCallBaseConfig(params?: {
transcriptTimeoutMs: 180000,
ringTimeoutMs: 30000,
maxConcurrentCalls: 1,
sessionScope: "per-phone",
serve: { port: 3334, bind: "127.0.0.1", path: "/voice/webhook" },
tailscale: { mode: "off", path: "/voice/webhook" },
tunnel: {

View File

@@ -879,6 +879,7 @@ export class VoiceCallWebhookServer {
coreConfig: this.coreConfig,
agentRuntime: this.agentRuntime,
callId,
sessionKey: call.sessionKey,
from: call.from,
transcript: call.transcript,
userMessage,