feat(voice-call): share realtime agent consult tool

Centralize the shared realtime agent consult tool for browser Talk, Google Meet, and Voice Call.
This commit is contained in:
Peter Steinberger
2026-04-24 23:11:18 +01:00
committed by GitHub
parent 900ba7cf33
commit e2f13959d4
23 changed files with 807 additions and 159 deletions

View File

@@ -54,6 +54,7 @@ Docs: https://docs.openclaw.ai
- Plugins/Google Meet: add a bundled participant plugin with personal Google auth, explicit meeting URL joins, Chrome and Twilio transports, and realtime voice support. (#70765) Thanks @steipete.
- Plugins/Google Meet: default Chrome realtime sessions to OpenAI plus SoX `rec`/`play` audio bridge commands, so the usual setup only needs the plugin enabled and `OPENAI_API_KEY`.
- Plugins/Google Meet: add a `chrome-node` transport so a paired macOS node, such as a Parallels VM, can own Chrome, BlackHole, and SoX while the Gateway machine keeps the agent and model key.
- Plugins/Voice Call: expose the shared `openclaw_agent_consult` realtime tool so live phone calls can ask the full OpenClaw agent for deeper/tool-backed answers.
- Plugins/Bonjour: move LAN Gateway discovery advertising into a default-enabled bundled plugin with its own `@homebridge/ciao` dependency, so users can disable Bonjour without cutting wide-area discovery. Thanks @vincentkoc.
- Providers/Google: add a Gemini Live realtime voice provider for backend Voice Call and Google Meet audio bridges, with bidirectional audio and function-call support.
- Plugins/Google Meet: let realtime Meet sessions consult the full OpenClaw agent for deeper answers while staying in the live voice loop.

View File

@@ -1,4 +1,4 @@
0adf332920764704575b21d2fe9568742d977ff0169683319c168d68ea7cf143 config-baseline.json
a608561acecc7cfc5f16a31b7498d7a66001f6655f5a5960a68842c59b7dcaa8 config-baseline.json
2936d2ccf0c1e6e932a0e7c617b809e4b31dbb9a7d5afefbba29b229913b9e50 config-baseline.core.json
22d7cd6d8279146b2d79c9531a55b80b52a2c99c81338c508104729154fdd02d config-baseline.channel.json
28d874a4910174c7014ef2a267269a3327d31ff657f76d38c034ef1b86eae484 config-baseline.plugin.json
d47a574045a47356e513ab308d7dcad9fa0b389f50e93c5cf0f820fab858e70e config-baseline.plugin.json

View File

@@ -670,6 +670,7 @@ OpenClaw tools, it can call `openclaw_agent_consult`.
The consult tool runs the regular OpenClaw agent behind the scenes with recent
meeting transcript context and returns a concise spoken answer to the realtime
voice session. The voice model can then speak that answer back into the meeting.
It uses the same shared realtime consult tool as Voice Call.
`realtime.toolPolicy` controls the consult run:

View File

@@ -126,6 +126,7 @@ Set config under `plugins.entries.voice-call.config`:
realtime: {
enabled: false,
provider: "google", // optional; first registered realtime voice provider when unset
toolPolicy: "safe-read-only",
providers: {
google: {
model: "gemini-2.5-flash-native-audio-preview-12-2025",
@@ -174,6 +175,20 @@ Current runtime behavior:
- Bundled realtime voice providers include Google Gemini Live (`google`) and
OpenAI (`openai`), registered by their provider plugins.
- Provider-owned raw config lives under `realtime.providers.<providerId>`.
- Voice Call exposes the shared `openclaw_agent_consult` realtime tool by
default. The realtime model can call it when the caller asks for deeper
reasoning, current information, or normal OpenClaw tools.
- `realtime.toolPolicy` controls the consult run:
- `safe-read-only`: expose the consult tool and limit the regular agent to
`read`, `web_search`, `web_fetch`, `x_search`, `memory_search`, and
`memory_get`.
- `owner`: expose the consult tool and let the regular agent use the normal
agent tool policy.
- `none`: do not expose the consult tool. Custom `realtime.tools` are still
passed through to the realtime provider.
- Consult session keys reuse the existing voice session when available, then
fall back to the caller/callee phone number so follow-up consult calls keep
context during the call.
- If `realtime.provider` points at an unregistered provider, or no realtime
voice provider is registered at all, Voice Call logs a warning and skips
realtime media instead of failing the whole plugin.
@@ -199,7 +214,8 @@ Example:
realtime: {
enabled: true,
provider: "google",
instructions: "Speak briefly and ask before using tools.",
instructions: "Speak briefly. Call openclaw_agent_consult before using deeper tools.",
toolPolicy: "safe-read-only",
providers: {
google: {
apiKey: "${GEMINI_API_KEY}",

View File

@@ -1,84 +1,20 @@
import { randomUUID } from "node:crypto";
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime";
import type { PluginRuntime, RuntimeLogger } from "openclaw/plugin-sdk/plugin-runtime";
import {
consultRealtimeVoiceAgent,
REALTIME_VOICE_AGENT_CONSULT_TOOL,
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
resolveRealtimeVoiceAgentConsultTools,
resolveRealtimeVoiceAgentConsultToolsAllow,
type RealtimeVoiceTool,
} from "openclaw/plugin-sdk/realtime-voice";
import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime";
import type { GoogleMeetConfig, GoogleMeetToolPolicy } from "./config.js";
type AgentPayload = {
text?: string;
isError?: boolean;
isReasoning?: boolean;
};
export const GOOGLE_MEET_AGENT_CONSULT_TOOL_NAME = REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME;
export const GOOGLE_MEET_AGENT_CONSULT_TOOL = REALTIME_VOICE_AGENT_CONSULT_TOOL;
export function resolveGoogleMeetRealtimeTools(policy: GoogleMeetToolPolicy): RealtimeVoiceTool[] {
return policy === "none" ? [] : [GOOGLE_MEET_AGENT_CONSULT_TOOL];
}
function normalizeToolArgString(args: unknown, key: string): string | undefined {
if (!args || typeof args !== "object" || Array.isArray(args)) {
return undefined;
}
return normalizeOptionalString((args as Record<string, unknown>)[key]);
}
function collectVisibleText(payloads: AgentPayload[]): string | null {
const chunks: string[] = [];
for (const payload of payloads) {
if (payload.isError || payload.isReasoning) {
continue;
}
const text = normalizeOptionalString(payload.text);
if (text) {
chunks.push(text);
}
}
return chunks.length > 0 ? chunks.join("\n\n").trim() : null;
}
function resolveToolsAllow(policy: GoogleMeetToolPolicy): string[] | undefined {
if (policy === "owner") {
return undefined;
}
if (policy === "safe-read-only") {
return ["read", "web_search", "web_fetch", "x_search", "memory_search", "memory_get"];
}
return [];
}
function buildPrompt(params: {
args: unknown;
transcript: Array<{ role: "user" | "assistant"; text: string }>;
}): string {
const question = normalizeToolArgString(params.args, "question");
if (!question) {
throw new Error("question required");
}
const context = normalizeToolArgString(params.args, "context");
const responseStyle = normalizeToolArgString(params.args, "responseStyle");
const transcript = params.transcript
.slice(-12)
.map((entry) => `${entry.role === "assistant" ? "Agent" : "Participant"}: ${entry.text}`)
.join("\n");
return [
"You are helping an OpenClaw realtime voice agent during a private Google Meet.",
"Answer the participant's question with the strongest useful reasoning and available tools.",
"Return only the concise answer the realtime voice agent should speak next.",
"Do not include markdown, citations unless needed, tool logs, or private reasoning.",
responseStyle ? `Spoken style: ${responseStyle}` : undefined,
transcript ? `Recent meeting transcript:\n${transcript}` : undefined,
context ? `Additional context:\n${context}` : undefined,
`Question:\n${question}`,
]
.filter(Boolean)
.join("\n\n");
return resolveRealtimeVoiceAgentConsultTools(policy);
}
export async function consultOpenClawAgentForGoogleMeet(params: {
@@ -90,54 +26,22 @@ export async function consultOpenClawAgentForGoogleMeet(params: {
args: unknown;
transcript: Array<{ role: "user" | "assistant"; text: string }>;
}): Promise<{ text: string }> {
const agentId = "main";
const sessionKey = `google-meet:${params.meetingSessionId}`;
const cfg = params.fullConfig;
const agentDir = params.runtime.agent.resolveAgentDir(cfg, agentId);
const workspaceDir = params.runtime.agent.resolveAgentWorkspaceDir(cfg, agentId);
await params.runtime.agent.ensureAgentWorkspace({ dir: workspaceDir });
const storePath = params.runtime.agent.session.resolveStorePath(cfg.session?.store, { agentId });
const sessionStore = params.runtime.agent.session.loadSessionStore(storePath);
const now = Date.now();
const existing = sessionStore[sessionKey] as
| { sessionId?: string; updatedAt?: number }
| undefined;
const sessionId = normalizeOptionalString(existing?.sessionId) ?? randomUUID();
sessionStore[sessionKey] = { ...existing, sessionId, updatedAt: now };
await params.runtime.agent.session.saveSessionStore(storePath, sessionStore);
const sessionFile = params.runtime.agent.session.resolveSessionFilePath(
sessionId,
sessionStore[sessionKey],
{ agentId },
);
const result = await params.runtime.agent.runEmbeddedPiAgent({
sessionId,
sessionKey,
return await consultRealtimeVoiceAgent({
cfg: params.fullConfig,
agentRuntime: params.runtime.agent,
logger: params.logger,
sessionKey: `google-meet:${params.meetingSessionId}`,
messageProvider: "google-meet",
sessionFile,
workspaceDir,
config: cfg,
prompt: buildPrompt({ args: params.args, transcript: params.transcript }),
thinkLevel: "high",
verboseLevel: "off",
reasoningLevel: "off",
toolResultFormat: "plain",
toolsAllow: resolveToolsAllow(params.config.realtime.toolPolicy),
timeoutMs: params.runtime.agent.resolveAgentTimeoutMs({ cfg }),
runId: `google-meet:${params.meetingSessionId}:${Date.now()}`,
lane: "google-meet",
runIdPrefix: `google-meet:${params.meetingSessionId}`,
args: params.args,
transcript: params.transcript,
surface: "a private Google Meet",
userLabel: "Participant",
assistantLabel: "Agent",
questionSourceLabel: "participant",
toolsAllow: resolveRealtimeVoiceAgentConsultToolsAllow(params.config.realtime.toolPolicy),
extraSystemPrompt:
"You are a behind-the-scenes consultant for a live meeting voice agent. Be accurate, brief, and speakable.",
agentDir,
});
const text = collectVisibleText((result.payloads ?? []) as AgentPayload[]);
if (!text) {
const reason = result.meta?.aborted ? "agent run aborted" : "agent returned no speakable text";
params.logger.warn(`[google-meet] agent consult produced no answer: ${reason}`);
return { text: "I need a moment to verify that before answering." };
}
return { text };
}

View File

@@ -911,6 +911,7 @@ describe("matrix CLI verification commands", () => {
expect(pruneMatrixStaleGatewayDevicesMock).toHaveBeenCalledWith({
accountId: "poe",
cfg: {},
});
expect(console.log).toHaveBeenCalledWith("Deleted stale OpenClaw devices: BritdXC6iL");
expect(console.log).toHaveBeenCalledWith("Current device: A7hWrQ70ea");

View File

@@ -1,9 +1,9 @@
import { randomUUID } from "node:crypto";
import { chmod, mkdir, mkdtemp, rm, stat, writeFile } from "node:fs/promises";
import { tmpdir } from "node:os";
import path from "node:path";
import { setTimeout as sleep } from "node:timers/promises";
import type { MatrixVerificationSummary } from "@openclaw/matrix/test-api.js";
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/temp-path";
import { createMatrixQaClient } from "../../substrate/client.js";
import {
createMatrixQaE2eeScenarioClient,
@@ -391,7 +391,9 @@ async function createMatrixQaCliSelfVerificationRuntime(params: {
userId: string;
}) {
const outputDir = requireMatrixQaE2eeOutputDir(params.context);
const rootDir = await mkdtemp(path.join(tmpdir(), "openclaw-matrix-cli-qa-"));
const rootDir = await mkdtemp(
path.join(resolvePreferredOpenClawTmpDir(), "openclaw-matrix-cli-qa-"),
);
const artifactDir = path.join(
outputDir,
"cli-self-verification",

View File

@@ -82,6 +82,11 @@ const voiceCallConfigSchema = {
},
"realtime.streamPath": { label: "Realtime Stream Path", advanced: true },
"realtime.instructions": { label: "Realtime Instructions", advanced: true },
"realtime.toolPolicy": {
label: "Realtime Tool Policy",
help: "Controls the shared openclaw_agent_consult tool.",
advanced: true,
},
"realtime.providers": { label: "Realtime Provider Config", advanced: true },
"tts.provider": {
label: "TTS Provider Override",

View File

@@ -402,6 +402,10 @@
"instructions": {
"type": "string"
},
"toolPolicy": {
"type": "string",
"enum": ["safe-read-only", "owner", "none"]
},
"tools": {
"type": "array",
"items": {

View File

@@ -242,6 +242,8 @@ describe("normalizeVoiceCallConfig", () => {
expect(normalized.streaming.provider).toBeUndefined();
expect(normalized.streaming.providers).toEqual({});
expect(normalized.realtime.streamPath).toBe("/voice/stream/realtime");
expect(normalized.realtime.toolPolicy).toBe("safe-read-only");
expect(normalized.realtime.instructions).toContain("openclaw_agent_consult");
expect(normalized.tunnel.provider).toBe("none");
expect(normalized.webhookSecurity.allowedHosts).toEqual([]);
});
@@ -300,6 +302,7 @@ describe("resolveVoiceCallConfig", () => {
});
expect(resolved.realtime.instructions).toBe("Stay concise.");
expect(resolved.realtime.toolPolicy).toBe("safe-read-only");
expect(resolved.realtime.provider).toBeUndefined();
});

View File

@@ -1,3 +1,7 @@
import {
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
type RealtimeVoiceAgentConsultToolPolicy,
} from "openclaw/plugin-sdk/realtime-voice";
import { z } from "openclaw/plugin-sdk/zod";
import { TtsAutoSchema, TtsConfigSchema, TtsModeSchema, TtsProviderSchema } from "../api.js";
import { deepMergeDefined } from "./deep-merge.js";
@@ -205,6 +209,11 @@ export type VoiceCallRealtimeProvidersConfig = z.infer<
typeof VoiceCallRealtimeProvidersConfigSchema
>;
export const VoiceCallRealtimeToolPolicySchema = z.enum(["safe-read-only", "owner", "none"]);
export type VoiceCallRealtimeToolPolicy = RealtimeVoiceAgentConsultToolPolicy;
export const DEFAULT_VOICE_CALL_REALTIME_INSTRUCTIONS = `You are OpenClaw's phone-call realtime voice interface. Keep spoken replies brief and natural. When a question needs deeper reasoning, current information, or tools, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} before answering.`;
export const VoiceCallStreamingProvidersConfigSchema = z
.record(z.string(), z.record(z.string(), z.unknown()))
.default({});
@@ -221,14 +230,22 @@ export const VoiceCallRealtimeConfigSchema = z
/** Optional override for the local WebSocket route path. */
streamPath: z.string().min(1).optional(),
/** System instructions passed to the realtime provider. */
instructions: z.string().optional(),
instructions: z.string().default(DEFAULT_VOICE_CALL_REALTIME_INSTRUCTIONS),
/** Tool policy for the shared OpenClaw agent consult tool. */
toolPolicy: VoiceCallRealtimeToolPolicySchema.default("safe-read-only"),
/** Tool definitions exposed to the realtime provider. */
tools: z.array(RealtimeToolSchema).default([]),
/** Provider-owned raw config blobs keyed by provider id. */
providers: VoiceCallRealtimeProvidersConfigSchema,
})
.strict()
.default({ enabled: false, tools: [], providers: {} });
.default({
enabled: false,
instructions: DEFAULT_VOICE_CALL_REALTIME_INSTRUCTIONS,
toolPolicy: "safe-read-only",
tools: [],
providers: {},
});
export type VoiceCallRealtimeConfig = z.infer<typeof VoiceCallRealtimeConfigSchema>;
// -----------------------------------------------------------------------------

View File

@@ -8,10 +8,17 @@ const mocks = vi.hoisted(() => ({
resolveVoiceCallConfig: vi.fn(),
validateProviderConfig: vi.fn(),
managerInitialize: vi.fn(),
managerGetCall: vi.fn(),
webhookStart: vi.fn(),
webhookStop: vi.fn(),
webhookSetRealtimeHandler: vi.fn(),
webhookGetRealtimeHandler: vi.fn(),
webhookGetMediaStreamHandler: vi.fn(),
webhookCtorArgs: [] as unknown[][],
realtimeHandlerCtorArgs: [] as unknown[][],
realtimeHandlerRegisterToolHandler: vi.fn(),
realtimeHandlerSetPublicUrl: vi.fn(),
resolveConfiguredRealtimeVoiceProvider: vi.fn(),
startTunnel: vi.fn(),
setupTailscaleExposure: vi.fn(),
cleanupTailscaleExposure: vi.fn(),
@@ -25,6 +32,7 @@ vi.mock("./config.js", () => ({
vi.mock("./manager.js", () => ({
CallManager: class {
initialize = mocks.managerInitialize;
getCall = mocks.managerGetCall;
},
}));
@@ -35,10 +43,26 @@ vi.mock("./webhook.js", () => ({
}
start = mocks.webhookStart;
stop = mocks.webhookStop;
setRealtimeHandler = mocks.webhookSetRealtimeHandler;
getRealtimeHandler = mocks.webhookGetRealtimeHandler;
getMediaStreamHandler = mocks.webhookGetMediaStreamHandler;
},
}));
vi.mock("./realtime-voice.runtime.js", () => ({
resolveConfiguredRealtimeVoiceProvider: mocks.resolveConfiguredRealtimeVoiceProvider,
}));
vi.mock("./webhook/realtime-handler.js", () => ({
RealtimeCallHandler: class {
constructor(...args: unknown[]) {
mocks.realtimeHandlerCtorArgs.push(args);
}
registerToolHandler = mocks.realtimeHandlerRegisterToolHandler;
setPublicUrl = mocks.realtimeHandlerSetPublicUrl;
},
}));
vi.mock("./tunnel.js", () => ({
startTunnel: mocks.startTunnel,
}));
@@ -60,10 +84,22 @@ describe("createVoiceCallRuntime lifecycle", () => {
mocks.resolveVoiceCallConfig.mockImplementation((cfg: VoiceCallConfig) => cfg);
mocks.validateProviderConfig.mockReturnValue({ valid: true, errors: [] });
mocks.managerInitialize.mockResolvedValue(undefined);
mocks.managerGetCall.mockReset();
mocks.webhookStart.mockResolvedValue("http://127.0.0.1:3334/voice/webhook");
mocks.webhookStop.mockResolvedValue(undefined);
mocks.webhookSetRealtimeHandler.mockReset();
mocks.webhookGetRealtimeHandler.mockReturnValue({
setPublicUrl: mocks.realtimeHandlerSetPublicUrl,
});
mocks.webhookGetMediaStreamHandler.mockReturnValue(undefined);
mocks.webhookCtorArgs.length = 0;
mocks.realtimeHandlerCtorArgs.length = 0;
mocks.realtimeHandlerRegisterToolHandler.mockReset();
mocks.realtimeHandlerSetPublicUrl.mockReset();
mocks.resolveConfiguredRealtimeVoiceProvider.mockResolvedValue({
provider: { id: "openai" },
providerConfig: { model: "gpt-realtime" },
});
mocks.startTunnel.mockResolvedValue(null);
mocks.setupTailscaleExposure.mockResolvedValue(null);
mocks.cleanupTailscaleExposure.mockResolvedValue(undefined);
@@ -133,4 +169,81 @@ describe("createVoiceCallRuntime lifecycle", () => {
expect(mocks.webhookCtorArgs[0]?.[3]).toBe(coreConfig);
expect(mocks.webhookCtorArgs[0]?.[4]).toBe(fullConfig);
});
it("wires the shared realtime agent consult tool and handler", async () => {
const config = createBaseConfig();
config.inboundPolicy = "allowlist";
config.realtime.enabled = true;
config.realtime.tools = [
{
type: "function",
name: "custom_tool",
description: "Custom tool",
parameters: { type: "object", properties: {} },
},
];
const sessionStore: Record<string, unknown> = {};
const runEmbeddedPiAgent = vi.fn(async () => ({
payloads: [{ text: "Use the shipment status." }],
meta: {},
}));
const agentRuntime = {
defaults: { provider: "openai", model: "gpt-5.4" },
resolveAgentDir: vi.fn(() => "/tmp/agent"),
resolveAgentWorkspaceDir: vi.fn(() => "/tmp/workspace"),
resolveAgentIdentity: vi.fn(),
resolveThinkingDefault: vi.fn(() => "high"),
resolveAgentTimeoutMs: vi.fn(() => 30_000),
ensureAgentWorkspace: vi.fn(async () => {}),
session: {
resolveStorePath: vi.fn(() => "/tmp/sessions.json"),
loadSessionStore: vi.fn(() => sessionStore),
saveSessionStore: vi.fn(async () => {}),
resolveSessionFilePath: vi.fn(() => "/tmp/session.json"),
},
runEmbeddedPiAgent,
};
mocks.managerGetCall.mockReturnValue({
callId: "call-1",
direction: "outbound",
from: "+15550001234",
to: "+15550009999",
transcript: [{ speaker: "user", text: "Can you check shipment status?" }],
});
await createVoiceCallRuntime({
config,
coreConfig: {} as CoreConfig,
agentRuntime: agentRuntime as never,
});
expect(mocks.realtimeHandlerCtorArgs[0]?.[0]).toMatchObject({
tools: [
expect.objectContaining({ name: "openclaw_agent_consult" }),
expect.objectContaining({ name: "custom_tool" }),
],
});
expect(mocks.realtimeHandlerRegisterToolHandler).toHaveBeenCalledWith(
"openclaw_agent_consult",
expect.any(Function),
);
const handler = mocks.realtimeHandlerRegisterToolHandler.mock.calls[0]?.[1] as
| ((args: unknown, callId: string) => Promise<unknown>)
| undefined;
await expect(handler?.({ question: "What should I say?" }, "call-1")).resolves.toEqual({
text: "Use the shipment status.",
});
expect(runEmbeddedPiAgent).toHaveBeenCalledWith(
expect.objectContaining({
sessionKey: "voice:15550009999",
messageProvider: "voice",
lane: "voice",
provider: "openai",
model: "gpt-5.4",
toolsAllow: ["read", "web_search", "web_fetch", "x_search", "memory_search", "memory_get"],
prompt: expect.stringContaining("Caller: Can you check shipment status?"),
}),
);
});
});

View File

@@ -1,12 +1,21 @@
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime";
import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime";
import type { ResolvedRealtimeVoiceProvider } from "openclaw/plugin-sdk/realtime-voice";
import {
consultRealtimeVoiceAgent,
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
resolveRealtimeVoiceAgentConsultTools,
resolveRealtimeVoiceAgentConsultToolsAllow,
type RealtimeVoiceAgentConsultTranscriptEntry,
type RealtimeVoiceTool,
type ResolvedRealtimeVoiceProvider,
} from "openclaw/plugin-sdk/realtime-voice";
import type { VoiceCallConfig } from "./config.js";
import { resolveVoiceCallConfig, validateProviderConfig } from "./config.js";
import type { CoreAgentDeps, CoreConfig } from "./core-bridge.js";
import { CallManager } from "./manager.js";
import type { VoiceCallProvider } from "./providers/base.js";
import type { TwilioProvider } from "./providers/twilio.js";
import { resolveVoiceResponseModel } from "./response-model.js";
import type { TelephonyTtsRuntime } from "./telephony-tts.js";
import { createTelephonyTtsProvider } from "./telephony-tts.js";
import { startTunnel, type TunnelResult } from "./tunnel.js";
@@ -76,6 +85,43 @@ function loadRealtimeHandler(): Promise<RealtimeHandlerModule> {
return realtimeHandlerPromise;
}
/**
 * Builds the realtime tool list exposed to the voice provider: the shared
 * agent-consult tool(s) allowed by the configured policy come first, followed
 * by any custom `realtime.tools` from config that do not collide by name.
 * Consult tools win name collisions so user config cannot shadow them.
 */
function resolveRealtimeTools(config: VoiceCallConfig): RealtimeVoiceTool[] {
  const byName = new Map<string, RealtimeVoiceTool>();
  const consultTools = resolveRealtimeVoiceAgentConsultTools(config.realtime.toolPolicy);
  for (const tool of consultTools) {
    byName.set(tool.name, tool);
  }
  for (const tool of config.realtime.tools) {
    if (byName.has(tool.name)) {
      continue; // consult tool (or an earlier custom tool) already claimed this name
    }
    byName.set(tool.name, tool);
  }
  return Array.from(byName.values());
}
/**
 * Picks the agent session key for a consult run triggered from a live call.
 *
 * Preference order:
 * 1. The call's existing voice session key, so consults share context with
 *    the ongoing voice session.
 * 2. `voice:<digits>` built from the remote party's phone number (the callee
 *    for outbound calls, the caller for inbound ones), digits only.
 * 3. `voice:<callId>` when no usable phone number is present.
 */
function resolveVoiceCallConsultSessionKey(call: {
  sessionKey?: string;
  from?: string;
  to?: string;
  direction?: "inbound" | "outbound";
  callId: string;
}): string {
  if (call.sessionKey) {
    return call.sessionKey;
  }
  const remoteParty = call.direction === "outbound" ? call.to : call.from;
  const digits = remoteParty ? remoteParty.replace(/\D/g, "") : "";
  return digits.length > 0 ? `voice:${digits}` : `voice:${call.callId}`;
}
/**
 * Converts a call's speaker-tagged transcript into the role-tagged entries
 * the shared realtime consult runtime expects: `bot` lines become
 * `assistant`, everything else becomes `user`. A missing transcript yields
 * an empty list.
 */
function mapVoiceCallConsultTranscript(call: {
  transcript?: Array<{ speaker: "user" | "bot"; text: string }>;
}): RealtimeVoiceAgentConsultTranscriptEntry[] {
  const entries = call.transcript ?? [];
  const mapped: RealtimeVoiceAgentConsultTranscriptEntry[] = [];
  for (const entry of entries) {
    mapped.push({
      role: entry.speaker === "bot" ? "assistant" : "user",
      text: entry.text,
    });
  }
  return mapped;
}
function createRuntimeResourceLifecycle(params: {
config: VoiceCallConfig;
webhookServer: VoiceCallWebhookServer;
@@ -215,6 +261,7 @@ export async function createVoiceCallRuntime(params: {
};
const config = resolveVoiceCallConfig(rawConfig);
const cfg = fullConfig ?? (coreConfig as OpenClawConfig);
if (!config.enabled) {
throw new Error("Voice call disabled. Enable the plugin entry in config.");
@@ -236,7 +283,7 @@ export async function createVoiceCallRuntime(params: {
const realtimeProvider = config.realtime.enabled
? await resolveRealtimeProvider({
config,
fullConfig: fullConfig ?? (coreConfig as OpenClawConfig),
fullConfig: cfg,
})
: null;
const webhookServer = new VoiceCallWebhookServer(
@@ -249,16 +296,61 @@ export async function createVoiceCallRuntime(params: {
);
if (realtimeProvider) {
const { RealtimeCallHandler } = await loadRealtimeHandler();
webhookServer.setRealtimeHandler(
new RealtimeCallHandler(
config.realtime,
manager,
provider,
realtimeProvider.provider,
realtimeProvider.providerConfig,
config.serve.path,
),
const realtimeConfig = {
...config.realtime,
tools: resolveRealtimeTools(config),
};
const realtimeHandler = new RealtimeCallHandler(
realtimeConfig,
manager,
provider,
realtimeProvider.provider,
realtimeProvider.providerConfig,
config.serve.path,
);
if (config.realtime.toolPolicy !== "none") {
realtimeHandler.registerToolHandler(
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
async (args, callId) => {
const call = manager.getCall(callId);
if (!call) {
return { error: `Call "${callId}" not found` };
}
const { provider: agentProvider, model } = resolveVoiceResponseModel({
voiceConfig: config,
agentRuntime,
});
const thinkLevel = agentRuntime.resolveThinkingDefault({
cfg,
provider: agentProvider,
model,
});
return await consultRealtimeVoiceAgent({
cfg,
agentRuntime,
logger: log,
sessionKey: resolveVoiceCallConsultSessionKey(call),
messageProvider: "voice",
lane: "voice",
runIdPrefix: `voice-realtime-consult:${callId}`,
args,
transcript: mapVoiceCallConsultTranscript(call),
surface: "a live phone call",
userLabel: "Caller",
assistantLabel: "Agent",
questionSourceLabel: "caller",
provider: agentProvider,
model,
thinkLevel,
timeoutMs: config.responseTimeoutMs,
toolsAllow: resolveRealtimeVoiceAgentConsultToolsAllow(config.realtime.toolPolicy),
extraSystemPrompt:
"You are a behind-the-scenes consultant for a live phone voice agent. Be accurate, brief, and speakable.",
});
},
);
}
webhookServer.setRealtimeHandler(realtimeHandler);
}
const lifecycle = createRuntimeResourceLifecycle({ config, webhookServer });

View File

@@ -46,6 +46,9 @@ export function createVoiceCallBaseConfig(params?: {
realtime: {
enabled: false,
streamPath: "/voice/stream/realtime",
instructions:
"You are OpenClaw's phone-call realtime voice interface. Keep spoken replies brief and natural. When a question needs deeper reasoning, current information, or tools, call openclaw_agent_consult before answering.",
toolPolicy: "safe-read-only",
tools: [],
providers: {},
},

View File

@@ -571,6 +571,8 @@ describe("VoiceCallWebhookServer replay handling", () => {
realtime: {
enabled: true,
streamPath: "/voice/stream/realtime",
instructions: "Be helpful.",
toolPolicy: "safe-read-only",
tools: [],
providers: {},
},
@@ -628,6 +630,8 @@ describe("VoiceCallWebhookServer replay handling", () => {
realtime: {
enabled: true,
streamPath: "/voice/stream/realtime",
instructions: "Be helpful.",
toolPolicy: "safe-read-only",
tools: [],
providers: {},
},
@@ -680,6 +684,8 @@ describe("VoiceCallWebhookServer replay handling", () => {
realtime: {
enabled: true,
streamPath: "/voice/stream/realtime",
instructions: "Be helpful.",
toolPolicy: "safe-read-only",
tools: [],
providers: {},
},
@@ -730,6 +736,8 @@ describe("VoiceCallWebhookServer replay handling", () => {
realtime: {
enabled: true,
streamPath: "/voice/stream/realtime",
instructions: "Be helpful.",
toolPolicy: "safe-read-only",
tools: [],
providers: {},
},

View File

@@ -8,6 +8,7 @@ import { WebSocket } from "ws";
import type { VoiceCallRealtimeConfig } from "../config.js";
import type { CallManager } from "../manager.js";
import type { VoiceCallProvider } from "../providers/base.js";
import type { CallRecord } from "../types.js";
import { connectWs, startUpgradeWsServer, waitForClose } from "../websocket-test-support.js";
import { RealtimeCallHandler } from "./realtime-handler.js";
@@ -33,7 +34,7 @@ function makeBridge(): RealtimeVoiceBridge {
}
function makeRealtimeProvider(
createBridge: () => RealtimeVoiceBridge,
createBridge: RealtimeVoiceProviderPlugin["createBridge"],
): RealtimeVoiceProviderPlugin {
return {
id: "openai",
@@ -51,15 +52,17 @@ function makeHandler(
realtimeProvider?: RealtimeVoiceProviderPlugin;
},
) {
const config: VoiceCallRealtimeConfig = {
enabled: true,
streamPath: overrides?.streamPath ?? "/voice/stream/realtime",
instructions: overrides?.instructions ?? "Be helpful.",
toolPolicy: overrides?.toolPolicy ?? "safe-read-only",
tools: overrides?.tools ?? [],
providers: overrides?.providers ?? {},
...(overrides?.provider ? { provider: overrides.provider } : {}),
};
return new RealtimeCallHandler(
{
enabled: true,
streamPath: "/voice/stream/realtime",
instructions: "Be helpful.",
tools: [],
providers: {},
...overrides,
},
config,
{
processEvent: vi.fn(),
getCallByProviderCallId: vi.fn(),
@@ -124,6 +127,91 @@ describe("RealtimeCallHandler path routing", () => {
/wss:\/\/public\.example\/api\/custom\/stream\/realtime\/[0-9a-f-]{36}/,
);
});
it("normalizes Twilio outbound realtime directions", async () => {
let callbacks:
| {
onReady?: () => void;
}
| undefined;
const createBridge = vi.fn(
(request: Parameters<RealtimeVoiceProviderPlugin["createBridge"]>[0]) => {
callbacks = request;
return makeBridge();
},
);
const processEvent = vi.fn();
const getCallByProviderCallId = vi.fn(
(): CallRecord => ({
callId: "call-1",
providerCallId: "CA-outbound",
provider: "twilio",
direction: "outbound",
state: "ringing",
from: "+15550001234",
to: "+15550009999",
startedAt: Date.now(),
transcript: [],
processedEventIds: [],
metadata: {},
}),
);
const handler = makeHandler(undefined, {
manager: {
processEvent,
getCallByProviderCallId,
},
realtimeProvider: makeRealtimeProvider(createBridge),
});
const payload = handler.buildTwiMLPayload(
makeRequest("/voice/webhook"),
new URLSearchParams({
Direction: "outbound-dial",
From: "+15550001234",
To: "+15550009999",
}),
);
const match = payload.body.match(/wss:\/\/[^/]+(\/[^"]+)/);
if (!match) {
throw new Error("Failed to extract realtime stream path");
}
const server = await startUpgradeWsServer({
urlPath: match[1],
onUpgrade: (request, socket, head) => {
handler.handleWebSocketUpgrade(request, socket, head);
},
});
try {
const ws = await connectWs(server.url);
try {
ws.send(
JSON.stringify({
event: "start",
start: { streamSid: "MZ-outbound", callSid: "CA-outbound" },
}),
);
await vi.waitFor(() => {
expect(createBridge).toHaveBeenCalled();
});
callbacks?.onReady?.();
expect(processEvent).toHaveBeenCalledWith(
expect.objectContaining({
type: "call.initiated",
direction: "outbound",
from: "+15550001234",
to: "+15550009999",
}),
);
} finally {
if (ws.readyState !== WebSocket.CLOSED && ws.readyState !== WebSocket.CLOSING) {
ws.close();
}
}
} finally {
await server.close();
}
});
});
describe("RealtimeCallHandler websocket hardening", () => {

View File

@@ -101,7 +101,7 @@ export class RealtimeCallHandler {
const token = this.issueStreamToken({
from: params?.get("From") ?? undefined,
to: params?.get("To") ?? undefined,
direction: rawDirection === "outbound-api" ? "outbound" : "inbound",
direction: rawDirection?.startsWith("outbound") ? "outbound" : "inbound",
});
const wsUrl = `wss://${host}${this.getStreamPathPattern()}/${token}`;
const twiml = `<?xml version="1.0" encoding="UTF-8"?>

View File

@@ -15,9 +15,23 @@ export type {
RealtimeVoiceToolCallEvent,
} from "../realtime-voice/provider-types.js";
export {
buildRealtimeVoiceAgentConsultChatMessage,
buildRealtimeVoiceAgentConsultPrompt,
collectRealtimeVoiceAgentConsultVisibleText,
parseRealtimeVoiceAgentConsultArgs,
REALTIME_VOICE_AGENT_CONSULT_TOOL,
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
type RealtimeVoiceAgentConsultArgs,
type RealtimeVoiceAgentConsultToolPolicy,
type RealtimeVoiceAgentConsultTranscriptEntry,
} from "../realtime-voice/agent-consult-tool.js";
export {
consultRealtimeVoiceAgent,
resolveRealtimeVoiceAgentConsultTools,
resolveRealtimeVoiceAgentConsultToolsAllow,
type RealtimeVoiceAgentConsultResult,
type RealtimeVoiceAgentConsultRuntime,
} from "../realtime-voice/agent-consult-runtime.js";
export {
canonicalizeRealtimeVoiceProviderId,
getRealtimeVoiceProvider,

View File

@@ -0,0 +1,116 @@
import { describe, expect, it, vi } from "vitest";
import {
consultRealtimeVoiceAgent,
resolveRealtimeVoiceAgentConsultTools,
resolveRealtimeVoiceAgentConsultToolsAllow,
} from "./agent-consult-runtime.js";
import { REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME } from "./agent-consult-tool.js";
// Test helper: builds a minimal mock of the agent runtime contract that
// consultRealtimeVoiceAgent needs (dir/workspace resolution, session store
// access, and the embedded agent run). Returns the mock runtime plus direct
// handles to the run spy and the in-memory session store so tests can
// inspect what the consult runtime recorded.
// `payloads` is what the mocked embedded agent run resolves with; the
// default simulates a single speakable text payload.
function createAgentRuntime(payloads: unknown[] = [{ text: "Speak this." }]) {
// Backing store shared by loadSessionStore/saveSessionStore; mutated in
// place by the consult runtime when it persists a session id.
const sessionStore: Record<string, { sessionId?: string; updatedAt?: number }> = {};
const runEmbeddedPiAgent = vi.fn(async () => ({
payloads,
meta: {},
}));
return {
runtime: {
resolveAgentDir: vi.fn(() => "/tmp/agent"),
resolveAgentWorkspaceDir: vi.fn(() => "/tmp/workspace"),
ensureAgentWorkspace: vi.fn(async () => {}),
resolveAgentTimeoutMs: vi.fn(() => 30_000),
session: {
resolveStorePath: vi.fn(() => "/tmp/sessions.json"),
loadSessionStore: vi.fn(() => sessionStore),
saveSessionStore: vi.fn(async () => {}),
resolveSessionFilePath: vi.fn(() => "/tmp/session.json"),
},
runEmbeddedPiAgent,
},
// Exposed separately so assertions don't have to reach through `runtime`.
runEmbeddedPiAgent,
sessionStore,
};
}
// Tests for the shared consult runtime: policy-driven tool exposure and
// the embedded-agent consult flow (session reuse, prompt contract, fallback).
describe("realtime voice agent consult runtime", () => {
  it("exposes the shared consult tool based on policy", () => {
    // "safe-read-only" still advertises the consult tool to the realtime model...
    expect(resolveRealtimeVoiceAgentConsultTools("safe-read-only")).toEqual([
      expect.objectContaining({ name: REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME }),
    ]);
    // ...while "none" hides it entirely.
    expect(resolveRealtimeVoiceAgentConsultTools("none")).toEqual([]);
    // The embedded-agent allowlist mirrors the policy: read-only research
    // tools for "safe-read-only", unrestricted (undefined) for "owner",
    // and an empty allowlist for "none".
    expect(resolveRealtimeVoiceAgentConsultToolsAllow("safe-read-only")).toEqual([
      "read",
      "web_search",
      "web_fetch",
      "x_search",
      "memory_search",
      "memory_get",
    ]);
    expect(resolveRealtimeVoiceAgentConsultToolsAllow("owner")).toBeUndefined();
    expect(resolveRealtimeVoiceAgentConsultToolsAllow("none")).toEqual([]);
  });
  it("runs an embedded agent using the shared session and prompt contract", async () => {
    const { runtime, runEmbeddedPiAgent, sessionStore } = createAgentRuntime();
    const result = await consultRealtimeVoiceAgent({
      cfg: {} as never,
      agentRuntime: runtime as never,
      logger: { warn: vi.fn() },
      sessionKey: "voice:15550001234",
      messageProvider: "voice",
      lane: "voice",
      runIdPrefix: "voice-realtime-consult:call-1",
      args: { question: "What should I say?", context: "Caller asked about PR #123." },
      transcript: [{ role: "user", text: "Can you check this?" }],
      surface: "a live phone call",
      userLabel: "Caller",
      questionSourceLabel: "caller",
      toolsAllow: ["read"],
      provider: "openai",
      model: "gpt-5.4",
      thinkLevel: "high",
      timeoutMs: 10_000,
    });
    // The mock's payload text is surfaced verbatim as the spoken answer.
    expect(result).toEqual({ text: "Speak this." });
    // A session id must have been minted and persisted under the session key.
    expect(sessionStore["voice:15550001234"]?.sessionId).toBeTruthy();
    // The embedded run receives the caller's routing/config params and a
    // prompt that embeds the labelled transcript line.
    expect(runEmbeddedPiAgent).toHaveBeenCalledWith(
      expect.objectContaining({
        sessionKey: "voice:15550001234",
        messageProvider: "voice",
        lane: "voice",
        toolsAllow: ["read"],
        provider: "openai",
        model: "gpt-5.4",
        thinkLevel: "high",
        timeoutMs: 10_000,
        prompt: expect.stringContaining("Caller: Can you check this?"),
      }),
    );
  });
  it("returns a speakable fallback when the embedded agent has no visible text", async () => {
    const warn = vi.fn();
    // Reasoning-only payloads are filtered out, leaving no visible answer.
    const { runtime } = createAgentRuntime([{ text: "hidden", isReasoning: true }]);
    const result = await consultRealtimeVoiceAgent({
      cfg: {} as never,
      agentRuntime: runtime as never,
      logger: { warn },
      sessionKey: "google-meet:meet-1",
      messageProvider: "google-meet",
      lane: "google-meet",
      runIdPrefix: "google-meet:meet-1",
      args: { question: "What now?" },
      transcript: [],
      surface: "a private Google Meet",
      userLabel: "Participant",
      fallbackText: "Let me verify that first.",
    });
    // Caller-supplied fallback is spoken, and the empty answer is logged.
    expect(result).toEqual({ text: "Let me verify that first." });
    expect(warn).toHaveBeenCalledWith(
      "[realtime-voice] agent consult produced no answer: agent returned no speakable text",
    );
  });
});

View File

@@ -0,0 +1,127 @@
import { randomUUID } from "node:crypto";
import type { RunEmbeddedPiAgentParams } from "../agents/pi-embedded-runner/run/params.js";
import type { OpenClawConfig } from "../config/types.openclaw.js";
import type { RuntimeLogger, PluginRuntimeCore } from "../plugins/runtime/types-core.js";
import {
buildRealtimeVoiceAgentConsultPrompt,
collectRealtimeVoiceAgentConsultVisibleText,
REALTIME_VOICE_AGENT_CONSULT_TOOL,
type RealtimeVoiceAgentConsultToolPolicy,
type RealtimeVoiceAgentConsultTranscriptEntry,
} from "./agent-consult-tool.js";
import type { RealtimeVoiceTool } from "./provider-types.js";
export type RealtimeVoiceAgentConsultRuntime = PluginRuntimeCore["agent"];
export type RealtimeVoiceAgentConsultResult = { text: string };
// Tool ids the consult agent may use when restricted to read-only research.
const SAFE_READ_ONLY_TOOLS = [
  "read",
  "web_search",
  "web_fetch",
  "x_search",
  "memory_search",
  "memory_get",
] as const;

/**
 * Realtime tool definitions to advertise for the given consult policy:
 * the shared consult tool, unless the policy disables consulting entirely.
 */
export function resolveRealtimeVoiceAgentConsultTools(
  policy: RealtimeVoiceAgentConsultToolPolicy,
): RealtimeVoiceTool[] {
  if (policy === "none") {
    return [];
  }
  return [REALTIME_VOICE_AGENT_CONSULT_TOOL];
}

/**
 * Embedded-agent tool allowlist for the given consult policy:
 * `undefined` (no restriction) for "owner", the read-only research set for
 * "safe-read-only", and an empty allowlist otherwise.
 */
export function resolveRealtimeVoiceAgentConsultToolsAllow(
  policy: RealtimeVoiceAgentConsultToolPolicy,
): string[] | undefined {
  switch (policy) {
    case "owner":
      return undefined;
    case "safe-read-only":
      return [...SAFE_READ_ONLY_TOOLS];
    default:
      return [];
  }
}
/**
 * Consult the full OpenClaw embedded agent on behalf of a realtime voice
 * session and return a single speakable answer.
 *
 * Resolves the agent workspace, reuses (or mints) the session bound to
 * `sessionKey`, runs one embedded agent turn with the shared spoken-
 * consultant prompt, and collapses the visible output into `{ text }`.
 * When the run yields no speakable text, a warning is logged and
 * `fallbackText` (or a generic stall line) is returned instead.
 */
export async function consultRealtimeVoiceAgent(params: {
  cfg: OpenClawConfig;
  agentRuntime: RealtimeVoiceAgentConsultRuntime;
  logger: Pick<RuntimeLogger, "warn">;
  sessionKey: string;
  messageProvider: string;
  lane: string;
  runIdPrefix: string;
  args: unknown;
  transcript: RealtimeVoiceAgentConsultTranscriptEntry[];
  surface: string;
  userLabel: string;
  assistantLabel?: string;
  questionSourceLabel?: string;
  agentId?: string;
  provider?: RunEmbeddedPiAgentParams["provider"];
  model?: RunEmbeddedPiAgentParams["model"];
  thinkLevel?: RunEmbeddedPiAgentParams["thinkLevel"];
  timeoutMs?: number;
  toolsAllow?: string[];
  extraSystemPrompt?: string;
  fallbackText?: string;
}): Promise<RealtimeVoiceAgentConsultResult> {
  const { cfg, agentRuntime, logger } = params;
  const agentId = params.agentId ?? "main";

  // Resolve per-agent directories and make sure the workspace exists
  // before handing control to the embedded agent.
  const agentDir = agentRuntime.resolveAgentDir(cfg, agentId);
  const workspaceDir = agentRuntime.resolveAgentWorkspaceDir(cfg, agentId);
  await agentRuntime.ensureAgentWorkspace({ dir: workspaceDir });

  // Reuse the session already bound to this key when present; otherwise
  // mint a fresh id. The entry is persisted up front so the id survives
  // even if the agent run below fails.
  const storePath = agentRuntime.session.resolveStorePath(cfg.session?.store, { agentId });
  const store = agentRuntime.session.loadSessionStore(storePath);
  const previous = store[params.sessionKey] as
    | { sessionId?: string; updatedAt?: number }
    | undefined;
  const sessionId = previous?.sessionId?.trim() || randomUUID();
  store[params.sessionKey] = { ...previous, sessionId, updatedAt: Date.now() };
  await agentRuntime.session.saveSessionStore(storePath, store);
  const sessionFile = agentRuntime.session.resolveSessionFilePath(
    sessionId,
    store[params.sessionKey],
    { agentId },
  );

  // One embedded-agent turn using the shared spoken-consultant prompt.
  const run = await agentRuntime.runEmbeddedPiAgent({
    sessionId,
    sessionKey: params.sessionKey,
    messageProvider: params.messageProvider,
    sessionFile,
    workspaceDir,
    config: cfg,
    prompt: buildRealtimeVoiceAgentConsultPrompt({
      args: params.args,
      transcript: params.transcript,
      surface: params.surface,
      userLabel: params.userLabel,
      assistantLabel: params.assistantLabel,
      questionSourceLabel: params.questionSourceLabel,
    }),
    provider: params.provider,
    model: params.model,
    thinkLevel: params.thinkLevel ?? "high",
    verboseLevel: "off",
    reasoningLevel: "off",
    toolResultFormat: "plain",
    toolsAllow: params.toolsAllow,
    timeoutMs: params.timeoutMs ?? agentRuntime.resolveAgentTimeoutMs({ cfg }),
    runId: `${params.runIdPrefix}:${Date.now()}`,
    lane: params.lane,
    extraSystemPrompt:
      params.extraSystemPrompt ??
      "You are a behind-the-scenes consultant for a live voice agent. Be accurate, brief, and speakable.",
    agentDir,
  });

  // Strip reasoning/error payloads; fall back to a speakable stall line
  // when nothing visible remains.
  const answer = collectRealtimeVoiceAgentConsultVisibleText(run.payloads ?? []);
  if (answer) {
    return { text: answer };
  }
  const reason = run.meta?.aborted ? "agent run aborted" : "agent returned no speakable text";
  logger.warn(`[realtime-voice] agent consult produced no answer: ${reason}`);
  return { text: params.fallbackText ?? "I need a moment to verify that before answering." };
}

View File

@@ -0,0 +1,55 @@
import { describe, expect, it } from "vitest";
import {
buildRealtimeVoiceAgentConsultChatMessage,
buildRealtimeVoiceAgentConsultPrompt,
collectRealtimeVoiceAgentConsultVisibleText,
parseRealtimeVoiceAgentConsultArgs,
} from "./agent-consult-tool.js";
// Tests for the shared consult tool helpers: argument parsing/normalization,
// prompt assembly, and visible-output filtering.
describe("realtime voice agent consult tool", () => {
  it("normalizes shared tool arguments for browser chat forwarding", () => {
    // Fields are trimmed and joined into labelled sections.
    expect(
      buildRealtimeVoiceAgentConsultChatMessage({
        question: " What changed? ",
        context: " PR #123 ",
        responseStyle: " concise ",
      }),
    ).toBe("What changed?\n\nContext:\nPR #123\n\nSpoken style:\nconcise");
  });
  it("requires a non-empty question", () => {
    // `question` is the only mandatory field; context alone is rejected.
    expect(() => parseRealtimeVoiceAgentConsultArgs({ context: "missing" })).toThrow(
      "question required",
    );
  });
  it("builds a reusable spoken consultant prompt with recent transcript", () => {
    const prompt = buildRealtimeVoiceAgentConsultPrompt({
      args: { question: "Do we support realtime tools?" },
      transcript: [
        { role: "user", text: "Can you check the repo?" },
        { role: "assistant", text: "I'll verify." },
      ],
      surface: "a private Google Meet",
      userLabel: "Participant",
      assistantLabel: "Agent",
      questionSourceLabel: "participant",
    });
    // The surface description, per-role speaker labels, and the raw
    // question must all survive into the rendered prompt.
    expect(prompt).toContain("during a private Google Meet");
    expect(prompt).toContain("Participant: Can you check the repo?");
    expect(prompt).toContain("Agent: I'll verify.");
    expect(prompt).toContain("Question:\nDo we support realtime tools?");
  });
  it("filters reasoning and error payloads from visible consult output", () => {
    // Only plain payloads survive; the rest are dropped, and survivors are
    // joined with blank lines.
    expect(
      collectRealtimeVoiceAgentConsultVisibleText([
        { text: "thinking", isReasoning: true },
        { text: "first" },
        { text: "error", isError: true },
        { text: "second" },
      ]),
    ).toBe("first\n\nsecond");
  });
});

View File

@@ -1,6 +1,17 @@
import { normalizeOptionalString } from "../shared/string-coerce.js";
import type { RealtimeVoiceTool } from "./provider-types.js";
// Function-tool name the realtime voice model calls to consult the full agent.
export const REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME = "openclaw_agent_consult";
// How much tooling a consult run may use: "safe-read-only" restricts to
// read/research tools, "owner" lifts restrictions, "none" disables consulting.
export type RealtimeVoiceAgentConsultToolPolicy = "safe-read-only" | "owner" | "none";
// Normalized arguments accepted by the consult tool call.
export type RealtimeVoiceAgentConsultArgs = {
  question: string;
  // Optional extra background supplied by the realtime agent.
  context?: string;
  // Optional hint describing how the spoken answer should sound.
  responseStyle?: string;
};
// One turn of the live conversation, used to give the consult prompt context.
export type RealtimeVoiceAgentConsultTranscriptEntry = {
  role: "user" | "assistant";
  text: string;
};
export const REALTIME_VOICE_AGENT_CONSULT_TOOL: RealtimeVoiceTool = {
type: "function",
@@ -26,3 +37,81 @@ export const REALTIME_VOICE_AGENT_CONSULT_TOOL: RealtimeVoiceTool = {
required: ["question"],
},
};
/**
 * Validate raw tool-call arguments into the shared consult shape.
 * String fields are trimmed/normalized; throws when the required
 * `question` is missing or blank.
 */
export function parseRealtimeVoiceAgentConsultArgs(args: unknown): RealtimeVoiceAgentConsultArgs {
  const question = readConsultStringArg(args, "question");
  if (!question) {
    throw new Error("question required");
  }
  const context = readConsultStringArg(args, "context");
  const responseStyle = readConsultStringArg(args, "responseStyle");
  return { question, context, responseStyle };
}
/**
 * Flatten consult tool-call args into one chat message: the question,
 * followed by optional labelled context and spoken-style sections.
 */
export function buildRealtimeVoiceAgentConsultChatMessage(args: unknown): string {
  const { question, context, responseStyle } = parseRealtimeVoiceAgentConsultArgs(args);
  const sections: string[] = [question];
  if (context) {
    sections.push(`Context:\n${context}`);
  }
  if (responseStyle) {
    sections.push(`Spoken style:\n${responseStyle}`);
  }
  return sections.join("\n\n");
}
/**
 * Assemble the behind-the-scenes consultant prompt for an embedded agent
 * run: fixed instructions, optional spoken-style hint, the last 12
 * transcript turns labelled per speaker, optional extra context, and the
 * question itself — joined as blank-line-separated sections.
 */
export function buildRealtimeVoiceAgentConsultPrompt(params: {
  args: unknown;
  transcript: RealtimeVoiceAgentConsultTranscriptEntry[];
  surface: string;
  userLabel: string;
  assistantLabel?: string;
  questionSourceLabel?: string;
}): string {
  const parsed = parseRealtimeVoiceAgentConsultArgs(params.args);
  const assistantLabel = params.assistantLabel ?? "Agent";
  const questionSourceLabel = params.questionSourceLabel ?? params.userLabel.toLowerCase();

  // Keep only the most recent turns so the prompt stays compact.
  const recentLines: string[] = [];
  for (const entry of params.transcript.slice(-12)) {
    const speaker = entry.role === "assistant" ? assistantLabel : params.userLabel;
    recentLines.push(`${speaker}: ${entry.text}`);
  }
  const transcript = recentLines.join("\n");

  const sections = [
    `You are helping an OpenClaw realtime voice agent during ${params.surface}.`,
    `Answer the ${questionSourceLabel}'s question with the strongest useful reasoning and available tools.`,
    "Return only the concise answer the realtime voice agent should speak next.",
    "Do not include markdown, citations unless needed, tool logs, or private reasoning.",
    parsed.responseStyle ? `Spoken style: ${parsed.responseStyle}` : undefined,
    transcript ? `Recent transcript:\n${transcript}` : undefined,
    parsed.context ? `Additional context:\n${parsed.context}` : undefined,
    `Question:\n${parsed.question}`,
  ];
  return sections.filter(Boolean).join("\n\n");
}
/**
 * Reduce embedded-agent payloads to the text a voice agent may speak:
 * error and reasoning payloads are dropped, the remaining texts are
 * normalized, joined with blank lines, and trimmed. Returns null when
 * nothing speakable remains.
 */
export function collectRealtimeVoiceAgentConsultVisibleText(
  payloads: Array<{ text?: unknown; isError?: boolean; isReasoning?: boolean }>,
): string | null {
  const visible = payloads
    .filter((payload) => !payload.isError && !payload.isReasoning)
    .map((payload) => normalizeOptionalString(payload.text))
    .filter((text): text is string => Boolean(text));
  if (visible.length === 0) {
    return null;
  }
  return visible.join("\n\n").trim();
}
// Read one string field from an untyped args payload. Non-object payloads
// (including null and arrays) yield undefined; otherwise the value is
// normalized (trimmed, empty → undefined).
function readConsultStringArg(args: unknown, key: string): string | undefined {
  const isPlainObject = typeof args === "object" && args !== null && !Array.isArray(args);
  if (!isPlainObject) {
    return undefined;
  }
  return normalizeOptionalString((args as Record<string, unknown>)[key]);
}

View File

@@ -1,4 +1,7 @@
import { REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME } from "../../../../src/realtime-voice/agent-consult-tool.js";
import {
buildRealtimeVoiceAgentConsultChatMessage,
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
} from "../../../../src/realtime-voice/agent-consult-tool.js";
import type { GatewayBrowserClient, GatewayEventFrame } from "../gateway.ts";
import { generateUUID } from "../uuid.ts";
@@ -239,23 +242,9 @@ export class RealtimeTalkSession {
this.callbacks.onStatus?.("thinking");
let question = "";
try {
const args = JSON.parse(buffered?.args || event.arguments || "{}") as {
question?: unknown;
context?: unknown;
responseStyle?: unknown;
};
question = typeof args.question === "string" ? args.question.trim() : "";
const context = typeof args.context === "string" ? args.context.trim() : "";
const responseStyle = typeof args.responseStyle === "string" ? args.responseStyle.trim() : "";
if (context || responseStyle) {
question = [
question,
context ? `Context:\n${context}` : undefined,
responseStyle ? `Spoken style:\n${responseStyle}` : undefined,
]
.filter(Boolean)
.join("\n\n");
}
question = buildRealtimeVoiceAgentConsultChatMessage(
JSON.parse(buffered?.args || event.arguments || "{}"),
);
} catch {}
if (!question) {
this.submitToolResult(callId, {