mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 06:00:43 +00:00
feat(voice-call): share realtime agent consult tool
Centralize the shared realtime agent consult tool for browser Talk, Google Meet, and Voice Call.
This commit is contained in:
committed by
GitHub
parent
900ba7cf33
commit
e2f13959d4
@@ -54,6 +54,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Plugins/Google Meet: add a bundled participant plugin with personal Google auth, explicit meeting URL joins, Chrome and Twilio transports, and realtime voice support. (#70765) Thanks @steipete.
|
||||
- Plugins/Google Meet: default Chrome realtime sessions to OpenAI plus SoX `rec`/`play` audio bridge commands, so the usual setup only needs the plugin enabled and `OPENAI_API_KEY`.
|
||||
- Plugins/Google Meet: add a `chrome-node` transport so a paired macOS node, such as a Parallels VM, can own Chrome, BlackHole, and SoX while the Gateway machine keeps the agent and model key.
|
||||
- Plugins/Voice Call: expose the shared `openclaw_agent_consult` realtime tool so live phone calls can ask the full OpenClaw agent for deeper/tool-backed answers.
|
||||
- Plugins/Bonjour: move LAN Gateway discovery advertising into a default-enabled bundled plugin with its own `@homebridge/ciao` dependency, so users can disable Bonjour without cutting wide-area discovery. Thanks @vincentkoc.
|
||||
- Providers/Google: add a Gemini Live realtime voice provider for backend Voice Call and Google Meet audio bridges, with bidirectional audio and function-call support.
|
||||
- Plugins/Google Meet: let realtime Meet sessions consult the full OpenClaw agent for deeper answers while staying in the live voice loop.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
0adf332920764704575b21d2fe9568742d977ff0169683319c168d68ea7cf143 config-baseline.json
|
||||
a608561acecc7cfc5f16a31b7498d7a66001f6655f5a5960a68842c59b7dcaa8 config-baseline.json
|
||||
2936d2ccf0c1e6e932a0e7c617b809e4b31dbb9a7d5afefbba29b229913b9e50 config-baseline.core.json
|
||||
22d7cd6d8279146b2d79c9531a55b80b52a2c99c81338c508104729154fdd02d config-baseline.channel.json
|
||||
28d874a4910174c7014ef2a267269a3327d31ff657f76d38c034ef1b86eae484 config-baseline.plugin.json
|
||||
d47a574045a47356e513ab308d7dcad9fa0b389f50e93c5cf0f820fab858e70e config-baseline.plugin.json
|
||||
|
||||
@@ -670,6 +670,7 @@ OpenClaw tools, it can call `openclaw_agent_consult`.
|
||||
The consult tool runs the regular OpenClaw agent behind the scenes with recent
|
||||
meeting transcript context and returns a concise spoken answer to the realtime
|
||||
voice session. The voice model can then speak that answer back into the meeting.
|
||||
It uses the same shared realtime consult tool as Voice Call.
|
||||
|
||||
`realtime.toolPolicy` controls the consult run:
|
||||
|
||||
|
||||
@@ -126,6 +126,7 @@ Set config under `plugins.entries.voice-call.config`:
|
||||
realtime: {
|
||||
enabled: false,
|
||||
provider: "google", // optional; first registered realtime voice provider when unset
|
||||
toolPolicy: "safe-read-only",
|
||||
providers: {
|
||||
google: {
|
||||
model: "gemini-2.5-flash-native-audio-preview-12-2025",
|
||||
@@ -174,6 +175,20 @@ Current runtime behavior:
|
||||
- Bundled realtime voice providers include Google Gemini Live (`google`) and
|
||||
OpenAI (`openai`), registered by their provider plugins.
|
||||
- Provider-owned raw config lives under `realtime.providers.<providerId>`.
|
||||
- Voice Call exposes the shared `openclaw_agent_consult` realtime tool by
|
||||
default. The realtime model can call it when the caller asks for deeper
|
||||
reasoning, current information, or normal OpenClaw tools.
|
||||
- `realtime.toolPolicy` controls the consult run:
|
||||
- `safe-read-only`: expose the consult tool and limit the regular agent to
|
||||
`read`, `web_search`, `web_fetch`, `x_search`, `memory_search`, and
|
||||
`memory_get`.
|
||||
- `owner`: expose the consult tool and let the regular agent use the normal
|
||||
agent tool policy.
|
||||
- `none`: do not expose the consult tool. Custom `realtime.tools` are still
|
||||
passed through to the realtime provider.
|
||||
- Consult session keys reuse the existing voice session when available, then
|
||||
fall back to the caller/callee phone number so follow-up consult calls keep
|
||||
context during the call.
|
||||
- If `realtime.provider` points at an unregistered provider, or no realtime
|
||||
voice provider is registered at all, Voice Call logs a warning and skips
|
||||
realtime media instead of failing the whole plugin.
|
||||
@@ -199,7 +214,8 @@ Example:
|
||||
realtime: {
|
||||
enabled: true,
|
||||
provider: "google",
|
||||
instructions: "Speak briefly and ask before using tools.",
|
||||
instructions: "Speak briefly. Call openclaw_agent_consult before using deeper tools.",
|
||||
toolPolicy: "safe-read-only",
|
||||
providers: {
|
||||
google: {
|
||||
apiKey: "${GEMINI_API_KEY}",
|
||||
|
||||
@@ -1,84 +1,20 @@
|
||||
import { randomUUID } from "node:crypto";
|
||||
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime";
|
||||
import type { PluginRuntime, RuntimeLogger } from "openclaw/plugin-sdk/plugin-runtime";
|
||||
import {
|
||||
consultRealtimeVoiceAgent,
|
||||
REALTIME_VOICE_AGENT_CONSULT_TOOL,
|
||||
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
|
||||
resolveRealtimeVoiceAgentConsultTools,
|
||||
resolveRealtimeVoiceAgentConsultToolsAllow,
|
||||
type RealtimeVoiceTool,
|
||||
} from "openclaw/plugin-sdk/realtime-voice";
|
||||
import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime";
|
||||
import type { GoogleMeetConfig, GoogleMeetToolPolicy } from "./config.js";
|
||||
|
||||
type AgentPayload = {
|
||||
text?: string;
|
||||
isError?: boolean;
|
||||
isReasoning?: boolean;
|
||||
};
|
||||
|
||||
export const GOOGLE_MEET_AGENT_CONSULT_TOOL_NAME = REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME;
|
||||
export const GOOGLE_MEET_AGENT_CONSULT_TOOL = REALTIME_VOICE_AGENT_CONSULT_TOOL;
|
||||
|
||||
export function resolveGoogleMeetRealtimeTools(policy: GoogleMeetToolPolicy): RealtimeVoiceTool[] {
|
||||
return policy === "none" ? [] : [GOOGLE_MEET_AGENT_CONSULT_TOOL];
|
||||
}
|
||||
|
||||
function normalizeToolArgString(args: unknown, key: string): string | undefined {
|
||||
if (!args || typeof args !== "object" || Array.isArray(args)) {
|
||||
return undefined;
|
||||
}
|
||||
return normalizeOptionalString((args as Record<string, unknown>)[key]);
|
||||
}
|
||||
|
||||
function collectVisibleText(payloads: AgentPayload[]): string | null {
|
||||
const chunks: string[] = [];
|
||||
for (const payload of payloads) {
|
||||
if (payload.isError || payload.isReasoning) {
|
||||
continue;
|
||||
}
|
||||
const text = normalizeOptionalString(payload.text);
|
||||
if (text) {
|
||||
chunks.push(text);
|
||||
}
|
||||
}
|
||||
return chunks.length > 0 ? chunks.join("\n\n").trim() : null;
|
||||
}
|
||||
|
||||
function resolveToolsAllow(policy: GoogleMeetToolPolicy): string[] | undefined {
|
||||
if (policy === "owner") {
|
||||
return undefined;
|
||||
}
|
||||
if (policy === "safe-read-only") {
|
||||
return ["read", "web_search", "web_fetch", "x_search", "memory_search", "memory_get"];
|
||||
}
|
||||
return [];
|
||||
}
|
||||
|
||||
function buildPrompt(params: {
|
||||
args: unknown;
|
||||
transcript: Array<{ role: "user" | "assistant"; text: string }>;
|
||||
}): string {
|
||||
const question = normalizeToolArgString(params.args, "question");
|
||||
if (!question) {
|
||||
throw new Error("question required");
|
||||
}
|
||||
const context = normalizeToolArgString(params.args, "context");
|
||||
const responseStyle = normalizeToolArgString(params.args, "responseStyle");
|
||||
const transcript = params.transcript
|
||||
.slice(-12)
|
||||
.map((entry) => `${entry.role === "assistant" ? "Agent" : "Participant"}: ${entry.text}`)
|
||||
.join("\n");
|
||||
return [
|
||||
"You are helping an OpenClaw realtime voice agent during a private Google Meet.",
|
||||
"Answer the participant's question with the strongest useful reasoning and available tools.",
|
||||
"Return only the concise answer the realtime voice agent should speak next.",
|
||||
"Do not include markdown, citations unless needed, tool logs, or private reasoning.",
|
||||
responseStyle ? `Spoken style: ${responseStyle}` : undefined,
|
||||
transcript ? `Recent meeting transcript:\n${transcript}` : undefined,
|
||||
context ? `Additional context:\n${context}` : undefined,
|
||||
`Question:\n${question}`,
|
||||
]
|
||||
.filter(Boolean)
|
||||
.join("\n\n");
|
||||
return resolveRealtimeVoiceAgentConsultTools(policy);
|
||||
}
|
||||
|
||||
export async function consultOpenClawAgentForGoogleMeet(params: {
|
||||
@@ -90,54 +26,22 @@ export async function consultOpenClawAgentForGoogleMeet(params: {
|
||||
args: unknown;
|
||||
transcript: Array<{ role: "user" | "assistant"; text: string }>;
|
||||
}): Promise<{ text: string }> {
|
||||
const agentId = "main";
|
||||
const sessionKey = `google-meet:${params.meetingSessionId}`;
|
||||
const cfg = params.fullConfig;
|
||||
const agentDir = params.runtime.agent.resolveAgentDir(cfg, agentId);
|
||||
const workspaceDir = params.runtime.agent.resolveAgentWorkspaceDir(cfg, agentId);
|
||||
await params.runtime.agent.ensureAgentWorkspace({ dir: workspaceDir });
|
||||
|
||||
const storePath = params.runtime.agent.session.resolveStorePath(cfg.session?.store, { agentId });
|
||||
const sessionStore = params.runtime.agent.session.loadSessionStore(storePath);
|
||||
const now = Date.now();
|
||||
const existing = sessionStore[sessionKey] as
|
||||
| { sessionId?: string; updatedAt?: number }
|
||||
| undefined;
|
||||
const sessionId = normalizeOptionalString(existing?.sessionId) ?? randomUUID();
|
||||
sessionStore[sessionKey] = { ...existing, sessionId, updatedAt: now };
|
||||
await params.runtime.agent.session.saveSessionStore(storePath, sessionStore);
|
||||
|
||||
const sessionFile = params.runtime.agent.session.resolveSessionFilePath(
|
||||
sessionId,
|
||||
sessionStore[sessionKey],
|
||||
{ agentId },
|
||||
);
|
||||
const result = await params.runtime.agent.runEmbeddedPiAgent({
|
||||
sessionId,
|
||||
sessionKey,
|
||||
return await consultRealtimeVoiceAgent({
|
||||
cfg: params.fullConfig,
|
||||
agentRuntime: params.runtime.agent,
|
||||
logger: params.logger,
|
||||
sessionKey: `google-meet:${params.meetingSessionId}`,
|
||||
messageProvider: "google-meet",
|
||||
sessionFile,
|
||||
workspaceDir,
|
||||
config: cfg,
|
||||
prompt: buildPrompt({ args: params.args, transcript: params.transcript }),
|
||||
thinkLevel: "high",
|
||||
verboseLevel: "off",
|
||||
reasoningLevel: "off",
|
||||
toolResultFormat: "plain",
|
||||
toolsAllow: resolveToolsAllow(params.config.realtime.toolPolicy),
|
||||
timeoutMs: params.runtime.agent.resolveAgentTimeoutMs({ cfg }),
|
||||
runId: `google-meet:${params.meetingSessionId}:${Date.now()}`,
|
||||
lane: "google-meet",
|
||||
runIdPrefix: `google-meet:${params.meetingSessionId}`,
|
||||
args: params.args,
|
||||
transcript: params.transcript,
|
||||
surface: "a private Google Meet",
|
||||
userLabel: "Participant",
|
||||
assistantLabel: "Agent",
|
||||
questionSourceLabel: "participant",
|
||||
toolsAllow: resolveRealtimeVoiceAgentConsultToolsAllow(params.config.realtime.toolPolicy),
|
||||
extraSystemPrompt:
|
||||
"You are a behind-the-scenes consultant for a live meeting voice agent. Be accurate, brief, and speakable.",
|
||||
agentDir,
|
||||
});
|
||||
|
||||
const text = collectVisibleText((result.payloads ?? []) as AgentPayload[]);
|
||||
if (!text) {
|
||||
const reason = result.meta?.aborted ? "agent run aborted" : "agent returned no speakable text";
|
||||
params.logger.warn(`[google-meet] agent consult produced no answer: ${reason}`);
|
||||
return { text: "I need a moment to verify that before answering." };
|
||||
}
|
||||
return { text };
|
||||
}
|
||||
|
||||
@@ -911,6 +911,7 @@ describe("matrix CLI verification commands", () => {
|
||||
|
||||
expect(pruneMatrixStaleGatewayDevicesMock).toHaveBeenCalledWith({
|
||||
accountId: "poe",
|
||||
cfg: {},
|
||||
});
|
||||
expect(console.log).toHaveBeenCalledWith("Deleted stale OpenClaw devices: BritdXC6iL");
|
||||
expect(console.log).toHaveBeenCalledWith("Current device: A7hWrQ70ea");
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
import { randomUUID } from "node:crypto";
|
||||
import { chmod, mkdir, mkdtemp, rm, stat, writeFile } from "node:fs/promises";
|
||||
import { tmpdir } from "node:os";
|
||||
import path from "node:path";
|
||||
import { setTimeout as sleep } from "node:timers/promises";
|
||||
import type { MatrixVerificationSummary } from "@openclaw/matrix/test-api.js";
|
||||
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/temp-path";
|
||||
import { createMatrixQaClient } from "../../substrate/client.js";
|
||||
import {
|
||||
createMatrixQaE2eeScenarioClient,
|
||||
@@ -391,7 +391,9 @@ async function createMatrixQaCliSelfVerificationRuntime(params: {
|
||||
userId: string;
|
||||
}) {
|
||||
const outputDir = requireMatrixQaE2eeOutputDir(params.context);
|
||||
const rootDir = await mkdtemp(path.join(tmpdir(), "openclaw-matrix-cli-qa-"));
|
||||
const rootDir = await mkdtemp(
|
||||
path.join(resolvePreferredOpenClawTmpDir(), "openclaw-matrix-cli-qa-"),
|
||||
);
|
||||
const artifactDir = path.join(
|
||||
outputDir,
|
||||
"cli-self-verification",
|
||||
|
||||
@@ -82,6 +82,11 @@ const voiceCallConfigSchema = {
|
||||
},
|
||||
"realtime.streamPath": { label: "Realtime Stream Path", advanced: true },
|
||||
"realtime.instructions": { label: "Realtime Instructions", advanced: true },
|
||||
"realtime.toolPolicy": {
|
||||
label: "Realtime Tool Policy",
|
||||
help: "Controls the shared openclaw_agent_consult tool.",
|
||||
advanced: true,
|
||||
},
|
||||
"realtime.providers": { label: "Realtime Provider Config", advanced: true },
|
||||
"tts.provider": {
|
||||
label: "TTS Provider Override",
|
||||
|
||||
@@ -402,6 +402,10 @@
|
||||
"instructions": {
|
||||
"type": "string"
|
||||
},
|
||||
"toolPolicy": {
|
||||
"type": "string",
|
||||
"enum": ["safe-read-only", "owner", "none"]
|
||||
},
|
||||
"tools": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
|
||||
@@ -242,6 +242,8 @@ describe("normalizeVoiceCallConfig", () => {
|
||||
expect(normalized.streaming.provider).toBeUndefined();
|
||||
expect(normalized.streaming.providers).toEqual({});
|
||||
expect(normalized.realtime.streamPath).toBe("/voice/stream/realtime");
|
||||
expect(normalized.realtime.toolPolicy).toBe("safe-read-only");
|
||||
expect(normalized.realtime.instructions).toContain("openclaw_agent_consult");
|
||||
expect(normalized.tunnel.provider).toBe("none");
|
||||
expect(normalized.webhookSecurity.allowedHosts).toEqual([]);
|
||||
});
|
||||
@@ -300,6 +302,7 @@ describe("resolveVoiceCallConfig", () => {
|
||||
});
|
||||
|
||||
expect(resolved.realtime.instructions).toBe("Stay concise.");
|
||||
expect(resolved.realtime.toolPolicy).toBe("safe-read-only");
|
||||
expect(resolved.realtime.provider).toBeUndefined();
|
||||
});
|
||||
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
import {
|
||||
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
|
||||
type RealtimeVoiceAgentConsultToolPolicy,
|
||||
} from "openclaw/plugin-sdk/realtime-voice";
|
||||
import { z } from "openclaw/plugin-sdk/zod";
|
||||
import { TtsAutoSchema, TtsConfigSchema, TtsModeSchema, TtsProviderSchema } from "../api.js";
|
||||
import { deepMergeDefined } from "./deep-merge.js";
|
||||
@@ -205,6 +209,11 @@ export type VoiceCallRealtimeProvidersConfig = z.infer<
|
||||
typeof VoiceCallRealtimeProvidersConfigSchema
|
||||
>;
|
||||
|
||||
export const VoiceCallRealtimeToolPolicySchema = z.enum(["safe-read-only", "owner", "none"]);
|
||||
export type VoiceCallRealtimeToolPolicy = RealtimeVoiceAgentConsultToolPolicy;
|
||||
|
||||
export const DEFAULT_VOICE_CALL_REALTIME_INSTRUCTIONS = `You are OpenClaw's phone-call realtime voice interface. Keep spoken replies brief and natural. When a question needs deeper reasoning, current information, or tools, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} before answering.`;
|
||||
|
||||
export const VoiceCallStreamingProvidersConfigSchema = z
|
||||
.record(z.string(), z.record(z.string(), z.unknown()))
|
||||
.default({});
|
||||
@@ -221,14 +230,22 @@ export const VoiceCallRealtimeConfigSchema = z
|
||||
/** Optional override for the local WebSocket route path. */
|
||||
streamPath: z.string().min(1).optional(),
|
||||
/** System instructions passed to the realtime provider. */
|
||||
instructions: z.string().optional(),
|
||||
instructions: z.string().default(DEFAULT_VOICE_CALL_REALTIME_INSTRUCTIONS),
|
||||
/** Tool policy for the shared OpenClaw agent consult tool. */
|
||||
toolPolicy: VoiceCallRealtimeToolPolicySchema.default("safe-read-only"),
|
||||
/** Tool definitions exposed to the realtime provider. */
|
||||
tools: z.array(RealtimeToolSchema).default([]),
|
||||
/** Provider-owned raw config blobs keyed by provider id. */
|
||||
providers: VoiceCallRealtimeProvidersConfigSchema,
|
||||
})
|
||||
.strict()
|
||||
.default({ enabled: false, tools: [], providers: {} });
|
||||
.default({
|
||||
enabled: false,
|
||||
instructions: DEFAULT_VOICE_CALL_REALTIME_INSTRUCTIONS,
|
||||
toolPolicy: "safe-read-only",
|
||||
tools: [],
|
||||
providers: {},
|
||||
});
|
||||
export type VoiceCallRealtimeConfig = z.infer<typeof VoiceCallRealtimeConfigSchema>;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
@@ -8,10 +8,17 @@ const mocks = vi.hoisted(() => ({
|
||||
resolveVoiceCallConfig: vi.fn(),
|
||||
validateProviderConfig: vi.fn(),
|
||||
managerInitialize: vi.fn(),
|
||||
managerGetCall: vi.fn(),
|
||||
webhookStart: vi.fn(),
|
||||
webhookStop: vi.fn(),
|
||||
webhookSetRealtimeHandler: vi.fn(),
|
||||
webhookGetRealtimeHandler: vi.fn(),
|
||||
webhookGetMediaStreamHandler: vi.fn(),
|
||||
webhookCtorArgs: [] as unknown[][],
|
||||
realtimeHandlerCtorArgs: [] as unknown[][],
|
||||
realtimeHandlerRegisterToolHandler: vi.fn(),
|
||||
realtimeHandlerSetPublicUrl: vi.fn(),
|
||||
resolveConfiguredRealtimeVoiceProvider: vi.fn(),
|
||||
startTunnel: vi.fn(),
|
||||
setupTailscaleExposure: vi.fn(),
|
||||
cleanupTailscaleExposure: vi.fn(),
|
||||
@@ -25,6 +32,7 @@ vi.mock("./config.js", () => ({
|
||||
vi.mock("./manager.js", () => ({
|
||||
CallManager: class {
|
||||
initialize = mocks.managerInitialize;
|
||||
getCall = mocks.managerGetCall;
|
||||
},
|
||||
}));
|
||||
|
||||
@@ -35,10 +43,26 @@ vi.mock("./webhook.js", () => ({
|
||||
}
|
||||
start = mocks.webhookStart;
|
||||
stop = mocks.webhookStop;
|
||||
setRealtimeHandler = mocks.webhookSetRealtimeHandler;
|
||||
getRealtimeHandler = mocks.webhookGetRealtimeHandler;
|
||||
getMediaStreamHandler = mocks.webhookGetMediaStreamHandler;
|
||||
},
|
||||
}));
|
||||
|
||||
vi.mock("./realtime-voice.runtime.js", () => ({
|
||||
resolveConfiguredRealtimeVoiceProvider: mocks.resolveConfiguredRealtimeVoiceProvider,
|
||||
}));
|
||||
|
||||
vi.mock("./webhook/realtime-handler.js", () => ({
|
||||
RealtimeCallHandler: class {
|
||||
constructor(...args: unknown[]) {
|
||||
mocks.realtimeHandlerCtorArgs.push(args);
|
||||
}
|
||||
registerToolHandler = mocks.realtimeHandlerRegisterToolHandler;
|
||||
setPublicUrl = mocks.realtimeHandlerSetPublicUrl;
|
||||
},
|
||||
}));
|
||||
|
||||
vi.mock("./tunnel.js", () => ({
|
||||
startTunnel: mocks.startTunnel,
|
||||
}));
|
||||
@@ -60,10 +84,22 @@ describe("createVoiceCallRuntime lifecycle", () => {
|
||||
mocks.resolveVoiceCallConfig.mockImplementation((cfg: VoiceCallConfig) => cfg);
|
||||
mocks.validateProviderConfig.mockReturnValue({ valid: true, errors: [] });
|
||||
mocks.managerInitialize.mockResolvedValue(undefined);
|
||||
mocks.managerGetCall.mockReset();
|
||||
mocks.webhookStart.mockResolvedValue("http://127.0.0.1:3334/voice/webhook");
|
||||
mocks.webhookStop.mockResolvedValue(undefined);
|
||||
mocks.webhookSetRealtimeHandler.mockReset();
|
||||
mocks.webhookGetRealtimeHandler.mockReturnValue({
|
||||
setPublicUrl: mocks.realtimeHandlerSetPublicUrl,
|
||||
});
|
||||
mocks.webhookGetMediaStreamHandler.mockReturnValue(undefined);
|
||||
mocks.webhookCtorArgs.length = 0;
|
||||
mocks.realtimeHandlerCtorArgs.length = 0;
|
||||
mocks.realtimeHandlerRegisterToolHandler.mockReset();
|
||||
mocks.realtimeHandlerSetPublicUrl.mockReset();
|
||||
mocks.resolveConfiguredRealtimeVoiceProvider.mockResolvedValue({
|
||||
provider: { id: "openai" },
|
||||
providerConfig: { model: "gpt-realtime" },
|
||||
});
|
||||
mocks.startTunnel.mockResolvedValue(null);
|
||||
mocks.setupTailscaleExposure.mockResolvedValue(null);
|
||||
mocks.cleanupTailscaleExposure.mockResolvedValue(undefined);
|
||||
@@ -133,4 +169,81 @@ describe("createVoiceCallRuntime lifecycle", () => {
|
||||
expect(mocks.webhookCtorArgs[0]?.[3]).toBe(coreConfig);
|
||||
expect(mocks.webhookCtorArgs[0]?.[4]).toBe(fullConfig);
|
||||
});
|
||||
|
||||
it("wires the shared realtime agent consult tool and handler", async () => {
|
||||
const config = createBaseConfig();
|
||||
config.inboundPolicy = "allowlist";
|
||||
config.realtime.enabled = true;
|
||||
config.realtime.tools = [
|
||||
{
|
||||
type: "function",
|
||||
name: "custom_tool",
|
||||
description: "Custom tool",
|
||||
parameters: { type: "object", properties: {} },
|
||||
},
|
||||
];
|
||||
const sessionStore: Record<string, unknown> = {};
|
||||
const runEmbeddedPiAgent = vi.fn(async () => ({
|
||||
payloads: [{ text: "Use the shipment status." }],
|
||||
meta: {},
|
||||
}));
|
||||
const agentRuntime = {
|
||||
defaults: { provider: "openai", model: "gpt-5.4" },
|
||||
resolveAgentDir: vi.fn(() => "/tmp/agent"),
|
||||
resolveAgentWorkspaceDir: vi.fn(() => "/tmp/workspace"),
|
||||
resolveAgentIdentity: vi.fn(),
|
||||
resolveThinkingDefault: vi.fn(() => "high"),
|
||||
resolveAgentTimeoutMs: vi.fn(() => 30_000),
|
||||
ensureAgentWorkspace: vi.fn(async () => {}),
|
||||
session: {
|
||||
resolveStorePath: vi.fn(() => "/tmp/sessions.json"),
|
||||
loadSessionStore: vi.fn(() => sessionStore),
|
||||
saveSessionStore: vi.fn(async () => {}),
|
||||
resolveSessionFilePath: vi.fn(() => "/tmp/session.json"),
|
||||
},
|
||||
runEmbeddedPiAgent,
|
||||
};
|
||||
mocks.managerGetCall.mockReturnValue({
|
||||
callId: "call-1",
|
||||
direction: "outbound",
|
||||
from: "+15550001234",
|
||||
to: "+15550009999",
|
||||
transcript: [{ speaker: "user", text: "Can you check shipment status?" }],
|
||||
});
|
||||
|
||||
await createVoiceCallRuntime({
|
||||
config,
|
||||
coreConfig: {} as CoreConfig,
|
||||
agentRuntime: agentRuntime as never,
|
||||
});
|
||||
|
||||
expect(mocks.realtimeHandlerCtorArgs[0]?.[0]).toMatchObject({
|
||||
tools: [
|
||||
expect.objectContaining({ name: "openclaw_agent_consult" }),
|
||||
expect.objectContaining({ name: "custom_tool" }),
|
||||
],
|
||||
});
|
||||
expect(mocks.realtimeHandlerRegisterToolHandler).toHaveBeenCalledWith(
|
||||
"openclaw_agent_consult",
|
||||
expect.any(Function),
|
||||
);
|
||||
|
||||
const handler = mocks.realtimeHandlerRegisterToolHandler.mock.calls[0]?.[1] as
|
||||
| ((args: unknown, callId: string) => Promise<unknown>)
|
||||
| undefined;
|
||||
await expect(handler?.({ question: "What should I say?" }, "call-1")).resolves.toEqual({
|
||||
text: "Use the shipment status.",
|
||||
});
|
||||
expect(runEmbeddedPiAgent).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
sessionKey: "voice:15550009999",
|
||||
messageProvider: "voice",
|
||||
lane: "voice",
|
||||
provider: "openai",
|
||||
model: "gpt-5.4",
|
||||
toolsAllow: ["read", "web_search", "web_fetch", "x_search", "memory_search", "memory_get"],
|
||||
prompt: expect.stringContaining("Caller: Can you check shipment status?"),
|
||||
}),
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -1,12 +1,21 @@
|
||||
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime";
|
||||
import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime";
|
||||
import type { ResolvedRealtimeVoiceProvider } from "openclaw/plugin-sdk/realtime-voice";
|
||||
import {
|
||||
consultRealtimeVoiceAgent,
|
||||
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
|
||||
resolveRealtimeVoiceAgentConsultTools,
|
||||
resolveRealtimeVoiceAgentConsultToolsAllow,
|
||||
type RealtimeVoiceAgentConsultTranscriptEntry,
|
||||
type RealtimeVoiceTool,
|
||||
type ResolvedRealtimeVoiceProvider,
|
||||
} from "openclaw/plugin-sdk/realtime-voice";
|
||||
import type { VoiceCallConfig } from "./config.js";
|
||||
import { resolveVoiceCallConfig, validateProviderConfig } from "./config.js";
|
||||
import type { CoreAgentDeps, CoreConfig } from "./core-bridge.js";
|
||||
import { CallManager } from "./manager.js";
|
||||
import type { VoiceCallProvider } from "./providers/base.js";
|
||||
import type { TwilioProvider } from "./providers/twilio.js";
|
||||
import { resolveVoiceResponseModel } from "./response-model.js";
|
||||
import type { TelephonyTtsRuntime } from "./telephony-tts.js";
|
||||
import { createTelephonyTtsProvider } from "./telephony-tts.js";
|
||||
import { startTunnel, type TunnelResult } from "./tunnel.js";
|
||||
@@ -76,6 +85,43 @@ function loadRealtimeHandler(): Promise<RealtimeHandlerModule> {
|
||||
return realtimeHandlerPromise;
|
||||
}
|
||||
|
||||
function resolveRealtimeTools(config: VoiceCallConfig): RealtimeVoiceTool[] {
|
||||
const tools = new Map<string, RealtimeVoiceTool>();
|
||||
for (const tool of resolveRealtimeVoiceAgentConsultTools(config.realtime.toolPolicy)) {
|
||||
tools.set(tool.name, tool);
|
||||
}
|
||||
for (const tool of config.realtime.tools) {
|
||||
if (!tools.has(tool.name)) {
|
||||
tools.set(tool.name, tool);
|
||||
}
|
||||
}
|
||||
return [...tools.values()];
|
||||
}
|
||||
|
||||
function resolveVoiceCallConsultSessionKey(call: {
|
||||
sessionKey?: string;
|
||||
from?: string;
|
||||
to?: string;
|
||||
direction?: "inbound" | "outbound";
|
||||
callId: string;
|
||||
}): string {
|
||||
if (call.sessionKey) {
|
||||
return call.sessionKey;
|
||||
}
|
||||
const phone = call.direction === "outbound" ? call.to : call.from;
|
||||
const normalizedPhone = phone?.replace(/\D/g, "");
|
||||
return normalizedPhone ? `voice:${normalizedPhone}` : `voice:${call.callId}`;
|
||||
}
|
||||
|
||||
function mapVoiceCallConsultTranscript(call: {
|
||||
transcript?: Array<{ speaker: "user" | "bot"; text: string }>;
|
||||
}): RealtimeVoiceAgentConsultTranscriptEntry[] {
|
||||
return (call.transcript ?? []).map((entry) => ({
|
||||
role: entry.speaker === "bot" ? "assistant" : "user",
|
||||
text: entry.text,
|
||||
}));
|
||||
}
|
||||
|
||||
function createRuntimeResourceLifecycle(params: {
|
||||
config: VoiceCallConfig;
|
||||
webhookServer: VoiceCallWebhookServer;
|
||||
@@ -215,6 +261,7 @@ export async function createVoiceCallRuntime(params: {
|
||||
};
|
||||
|
||||
const config = resolveVoiceCallConfig(rawConfig);
|
||||
const cfg = fullConfig ?? (coreConfig as OpenClawConfig);
|
||||
|
||||
if (!config.enabled) {
|
||||
throw new Error("Voice call disabled. Enable the plugin entry in config.");
|
||||
@@ -236,7 +283,7 @@ export async function createVoiceCallRuntime(params: {
|
||||
const realtimeProvider = config.realtime.enabled
|
||||
? await resolveRealtimeProvider({
|
||||
config,
|
||||
fullConfig: fullConfig ?? (coreConfig as OpenClawConfig),
|
||||
fullConfig: cfg,
|
||||
})
|
||||
: null;
|
||||
const webhookServer = new VoiceCallWebhookServer(
|
||||
@@ -249,16 +296,61 @@ export async function createVoiceCallRuntime(params: {
|
||||
);
|
||||
if (realtimeProvider) {
|
||||
const { RealtimeCallHandler } = await loadRealtimeHandler();
|
||||
webhookServer.setRealtimeHandler(
|
||||
new RealtimeCallHandler(
|
||||
config.realtime,
|
||||
manager,
|
||||
provider,
|
||||
realtimeProvider.provider,
|
||||
realtimeProvider.providerConfig,
|
||||
config.serve.path,
|
||||
),
|
||||
const realtimeConfig = {
|
||||
...config.realtime,
|
||||
tools: resolveRealtimeTools(config),
|
||||
};
|
||||
const realtimeHandler = new RealtimeCallHandler(
|
||||
realtimeConfig,
|
||||
manager,
|
||||
provider,
|
||||
realtimeProvider.provider,
|
||||
realtimeProvider.providerConfig,
|
||||
config.serve.path,
|
||||
);
|
||||
if (config.realtime.toolPolicy !== "none") {
|
||||
realtimeHandler.registerToolHandler(
|
||||
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
|
||||
async (args, callId) => {
|
||||
const call = manager.getCall(callId);
|
||||
if (!call) {
|
||||
return { error: `Call "${callId}" not found` };
|
||||
}
|
||||
const { provider: agentProvider, model } = resolveVoiceResponseModel({
|
||||
voiceConfig: config,
|
||||
agentRuntime,
|
||||
});
|
||||
const thinkLevel = agentRuntime.resolveThinkingDefault({
|
||||
cfg,
|
||||
provider: agentProvider,
|
||||
model,
|
||||
});
|
||||
return await consultRealtimeVoiceAgent({
|
||||
cfg,
|
||||
agentRuntime,
|
||||
logger: log,
|
||||
sessionKey: resolveVoiceCallConsultSessionKey(call),
|
||||
messageProvider: "voice",
|
||||
lane: "voice",
|
||||
runIdPrefix: `voice-realtime-consult:${callId}`,
|
||||
args,
|
||||
transcript: mapVoiceCallConsultTranscript(call),
|
||||
surface: "a live phone call",
|
||||
userLabel: "Caller",
|
||||
assistantLabel: "Agent",
|
||||
questionSourceLabel: "caller",
|
||||
provider: agentProvider,
|
||||
model,
|
||||
thinkLevel,
|
||||
timeoutMs: config.responseTimeoutMs,
|
||||
toolsAllow: resolveRealtimeVoiceAgentConsultToolsAllow(config.realtime.toolPolicy),
|
||||
extraSystemPrompt:
|
||||
"You are a behind-the-scenes consultant for a live phone voice agent. Be accurate, brief, and speakable.",
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
webhookServer.setRealtimeHandler(realtimeHandler);
|
||||
}
|
||||
const lifecycle = createRuntimeResourceLifecycle({ config, webhookServer });
|
||||
|
||||
|
||||
@@ -46,6 +46,9 @@ export function createVoiceCallBaseConfig(params?: {
|
||||
realtime: {
|
||||
enabled: false,
|
||||
streamPath: "/voice/stream/realtime",
|
||||
instructions:
|
||||
"You are OpenClaw's phone-call realtime voice interface. Keep spoken replies brief and natural. When a question needs deeper reasoning, current information, or tools, call openclaw_agent_consult before answering.",
|
||||
toolPolicy: "safe-read-only",
|
||||
tools: [],
|
||||
providers: {},
|
||||
},
|
||||
|
||||
@@ -571,6 +571,8 @@ describe("VoiceCallWebhookServer replay handling", () => {
|
||||
realtime: {
|
||||
enabled: true,
|
||||
streamPath: "/voice/stream/realtime",
|
||||
instructions: "Be helpful.",
|
||||
toolPolicy: "safe-read-only",
|
||||
tools: [],
|
||||
providers: {},
|
||||
},
|
||||
@@ -628,6 +630,8 @@ describe("VoiceCallWebhookServer replay handling", () => {
|
||||
realtime: {
|
||||
enabled: true,
|
||||
streamPath: "/voice/stream/realtime",
|
||||
instructions: "Be helpful.",
|
||||
toolPolicy: "safe-read-only",
|
||||
tools: [],
|
||||
providers: {},
|
||||
},
|
||||
@@ -680,6 +684,8 @@ describe("VoiceCallWebhookServer replay handling", () => {
|
||||
realtime: {
|
||||
enabled: true,
|
||||
streamPath: "/voice/stream/realtime",
|
||||
instructions: "Be helpful.",
|
||||
toolPolicy: "safe-read-only",
|
||||
tools: [],
|
||||
providers: {},
|
||||
},
|
||||
@@ -730,6 +736,8 @@ describe("VoiceCallWebhookServer replay handling", () => {
|
||||
realtime: {
|
||||
enabled: true,
|
||||
streamPath: "/voice/stream/realtime",
|
||||
instructions: "Be helpful.",
|
||||
toolPolicy: "safe-read-only",
|
||||
tools: [],
|
||||
providers: {},
|
||||
},
|
||||
|
||||
@@ -8,6 +8,7 @@ import { WebSocket } from "ws";
|
||||
import type { VoiceCallRealtimeConfig } from "../config.js";
|
||||
import type { CallManager } from "../manager.js";
|
||||
import type { VoiceCallProvider } from "../providers/base.js";
|
||||
import type { CallRecord } from "../types.js";
|
||||
import { connectWs, startUpgradeWsServer, waitForClose } from "../websocket-test-support.js";
|
||||
import { RealtimeCallHandler } from "./realtime-handler.js";
|
||||
|
||||
@@ -33,7 +34,7 @@ function makeBridge(): RealtimeVoiceBridge {
|
||||
}
|
||||
|
||||
function makeRealtimeProvider(
|
||||
createBridge: () => RealtimeVoiceBridge,
|
||||
createBridge: RealtimeVoiceProviderPlugin["createBridge"],
|
||||
): RealtimeVoiceProviderPlugin {
|
||||
return {
|
||||
id: "openai",
|
||||
@@ -51,15 +52,17 @@ function makeHandler(
|
||||
realtimeProvider?: RealtimeVoiceProviderPlugin;
|
||||
},
|
||||
) {
|
||||
const config: VoiceCallRealtimeConfig = {
|
||||
enabled: true,
|
||||
streamPath: overrides?.streamPath ?? "/voice/stream/realtime",
|
||||
instructions: overrides?.instructions ?? "Be helpful.",
|
||||
toolPolicy: overrides?.toolPolicy ?? "safe-read-only",
|
||||
tools: overrides?.tools ?? [],
|
||||
providers: overrides?.providers ?? {},
|
||||
...(overrides?.provider ? { provider: overrides.provider } : {}),
|
||||
};
|
||||
return new RealtimeCallHandler(
|
||||
{
|
||||
enabled: true,
|
||||
streamPath: "/voice/stream/realtime",
|
||||
instructions: "Be helpful.",
|
||||
tools: [],
|
||||
providers: {},
|
||||
...overrides,
|
||||
},
|
||||
config,
|
||||
{
|
||||
processEvent: vi.fn(),
|
||||
getCallByProviderCallId: vi.fn(),
|
||||
@@ -124,6 +127,91 @@ describe("RealtimeCallHandler path routing", () => {
|
||||
/wss:\/\/public\.example\/api\/custom\/stream\/realtime\/[0-9a-f-]{36}/,
|
||||
);
|
||||
});
|
||||
|
||||
// Verifies the full outbound-call path: an "outbound-dial" Twilio webhook must
// produce a realtime stream URL whose token carries direction "outbound", so
// that the eventual call.initiated event is emitted with the outbound metadata.
it("normalizes Twilio outbound realtime directions", async () => {
  // Captured from createBridge so the test can fire the provider's onReady
  // callback manually after the websocket "start" frame is processed.
  let callbacks:
    | {
        onReady?: () => void;
      }
    | undefined;
  const createBridge = vi.fn(
    (request: Parameters<RealtimeVoiceProviderPlugin["createBridge"]>[0]) => {
      callbacks = request;
      return makeBridge();
    },
  );
  const processEvent = vi.fn();
  // The manager resolves the Twilio CallSid to an existing outbound record.
  const getCallByProviderCallId = vi.fn(
    (): CallRecord => ({
      callId: "call-1",
      providerCallId: "CA-outbound",
      provider: "twilio",
      direction: "outbound",
      state: "ringing",
      from: "+15550001234",
      to: "+15550009999",
      startedAt: Date.now(),
      transcript: [],
      processedEventIds: [],
      metadata: {},
    }),
  );
  const handler = makeHandler(undefined, {
    manager: {
      processEvent,
      getCallByProviderCallId,
    },
    realtimeProvider: makeRealtimeProvider(createBridge),
  });
  // "outbound-dial" is one of Twilio's several outbound-* direction strings;
  // the handler must normalize it rather than match "outbound-api" exactly.
  const payload = handler.buildTwiMLPayload(
    makeRequest("/voice/webhook"),
    new URLSearchParams({
      Direction: "outbound-dial",
      From: "+15550001234",
      To: "+15550009999",
    }),
  );
  // Pull the tokenized stream path out of the generated TwiML.
  const match = payload.body.match(/wss:\/\/[^/]+(\/[^"]+)/);
  if (!match) {
    throw new Error("Failed to extract realtime stream path");
  }
  const server = await startUpgradeWsServer({
    urlPath: match[1],
    onUpgrade: (request, socket, head) => {
      handler.handleWebSocketUpgrade(request, socket, head);
    },
  });

  try {
    const ws = await connectWs(server.url);
    try {
      // Simulate Twilio's media-stream "start" frame for the outbound call.
      ws.send(
        JSON.stringify({
          event: "start",
          start: { streamSid: "MZ-outbound", callSid: "CA-outbound" },
        }),
      );
      await vi.waitFor(() => {
        expect(createBridge).toHaveBeenCalled();
      });
      // Bridge readiness triggers call-event emission in the handler.
      callbacks?.onReady?.();
      expect(processEvent).toHaveBeenCalledWith(
        expect.objectContaining({
          type: "call.initiated",
          direction: "outbound",
          from: "+15550001234",
          to: "+15550009999",
        }),
      );
    } finally {
      if (ws.readyState !== WebSocket.CLOSED && ws.readyState !== WebSocket.CLOSING) {
        ws.close();
      }
    }
  } finally {
    await server.close();
  }
});
|
||||
});
|
||||
|
||||
describe("RealtimeCallHandler websocket hardening", () => {
|
||||
|
||||
@@ -101,7 +101,7 @@ export class RealtimeCallHandler {
|
||||
const token = this.issueStreamToken({
|
||||
from: params?.get("From") ?? undefined,
|
||||
to: params?.get("To") ?? undefined,
|
||||
direction: rawDirection === "outbound-api" ? "outbound" : "inbound",
|
||||
direction: rawDirection?.startsWith("outbound") ? "outbound" : "inbound",
|
||||
});
|
||||
const wsUrl = `wss://${host}${this.getStreamPathPattern()}/${token}`;
|
||||
const twiml = `<?xml version="1.0" encoding="UTF-8"?>
|
||||
|
||||
@@ -15,9 +15,23 @@ export type {
|
||||
RealtimeVoiceToolCallEvent,
|
||||
} from "../realtime-voice/provider-types.js";
|
||||
export {
|
||||
buildRealtimeVoiceAgentConsultChatMessage,
|
||||
buildRealtimeVoiceAgentConsultPrompt,
|
||||
collectRealtimeVoiceAgentConsultVisibleText,
|
||||
parseRealtimeVoiceAgentConsultArgs,
|
||||
REALTIME_VOICE_AGENT_CONSULT_TOOL,
|
||||
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
|
||||
type RealtimeVoiceAgentConsultArgs,
|
||||
type RealtimeVoiceAgentConsultToolPolicy,
|
||||
type RealtimeVoiceAgentConsultTranscriptEntry,
|
||||
} from "../realtime-voice/agent-consult-tool.js";
|
||||
export {
|
||||
consultRealtimeVoiceAgent,
|
||||
resolveRealtimeVoiceAgentConsultTools,
|
||||
resolveRealtimeVoiceAgentConsultToolsAllow,
|
||||
type RealtimeVoiceAgentConsultResult,
|
||||
type RealtimeVoiceAgentConsultRuntime,
|
||||
} from "../realtime-voice/agent-consult-runtime.js";
|
||||
export {
|
||||
canonicalizeRealtimeVoiceProviderId,
|
||||
getRealtimeVoiceProvider,
|
||||
|
||||
116
src/realtime-voice/agent-consult-runtime.test.ts
Normal file
116
src/realtime-voice/agent-consult-runtime.test.ts
Normal file
@@ -0,0 +1,116 @@
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
import {
|
||||
consultRealtimeVoiceAgent,
|
||||
resolveRealtimeVoiceAgentConsultTools,
|
||||
resolveRealtimeVoiceAgentConsultToolsAllow,
|
||||
} from "./agent-consult-runtime.js";
|
||||
import { REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME } from "./agent-consult-tool.js";
|
||||
|
||||
function createAgentRuntime(payloads: unknown[] = [{ text: "Speak this." }]) {
|
||||
const sessionStore: Record<string, { sessionId?: string; updatedAt?: number }> = {};
|
||||
const runEmbeddedPiAgent = vi.fn(async () => ({
|
||||
payloads,
|
||||
meta: {},
|
||||
}));
|
||||
return {
|
||||
runtime: {
|
||||
resolveAgentDir: vi.fn(() => "/tmp/agent"),
|
||||
resolveAgentWorkspaceDir: vi.fn(() => "/tmp/workspace"),
|
||||
ensureAgentWorkspace: vi.fn(async () => {}),
|
||||
resolveAgentTimeoutMs: vi.fn(() => 30_000),
|
||||
session: {
|
||||
resolveStorePath: vi.fn(() => "/tmp/sessions.json"),
|
||||
loadSessionStore: vi.fn(() => sessionStore),
|
||||
saveSessionStore: vi.fn(async () => {}),
|
||||
resolveSessionFilePath: vi.fn(() => "/tmp/session.json"),
|
||||
},
|
||||
runEmbeddedPiAgent,
|
||||
},
|
||||
runEmbeddedPiAgent,
|
||||
sessionStore,
|
||||
};
|
||||
}
|
||||
|
||||
describe("realtime voice agent consult runtime", () => {
  // Policy → tool/allow-list mapping must be stable across the voice surfaces
  // that share this runtime (browser Talk, Google Meet, Voice Call).
  it("exposes the shared consult tool based on policy", () => {
    expect(resolveRealtimeVoiceAgentConsultTools("safe-read-only")).toEqual([
      expect.objectContaining({ name: REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME }),
    ]);
    expect(resolveRealtimeVoiceAgentConsultTools("none")).toEqual([]);
    // "safe-read-only" pins the exact read-only tool allow-list.
    expect(resolveRealtimeVoiceAgentConsultToolsAllow("safe-read-only")).toEqual([
      "read",
      "web_search",
      "web_fetch",
      "x_search",
      "memory_search",
      "memory_get",
    ]);
    // "owner" means no allow-list restriction; "none" means an empty one.
    expect(resolveRealtimeVoiceAgentConsultToolsAllow("owner")).toBeUndefined();
    expect(resolveRealtimeVoiceAgentConsultToolsAllow("none")).toEqual([]);
  });

  it("runs an embedded agent using the shared session and prompt contract", async () => {
    const { runtime, runEmbeddedPiAgent, sessionStore } = createAgentRuntime();

    const result = await consultRealtimeVoiceAgent({
      cfg: {} as never,
      agentRuntime: runtime as never,
      logger: { warn: vi.fn() },
      sessionKey: "voice:15550001234",
      messageProvider: "voice",
      lane: "voice",
      runIdPrefix: "voice-realtime-consult:call-1",
      args: { question: "What should I say?", context: "Caller asked about PR #123." },
      transcript: [{ role: "user", text: "Can you check this?" }],
      surface: "a live phone call",
      userLabel: "Caller",
      questionSourceLabel: "caller",
      toolsAllow: ["read"],
      provider: "openai",
      model: "gpt-5.4",
      thinkLevel: "high",
      timeoutMs: 10_000,
    });

    // The stub runtime resolves with [{ text: "Speak this." }].
    expect(result).toEqual({ text: "Speak this." });
    // A session id must be minted and persisted under the session key.
    expect(sessionStore["voice:15550001234"]?.sessionId).toBeTruthy();
    // The embedded run must receive the caller's routing/model knobs and a
    // prompt that embeds the labeled transcript line.
    expect(runEmbeddedPiAgent).toHaveBeenCalledWith(
      expect.objectContaining({
        sessionKey: "voice:15550001234",
        messageProvider: "voice",
        lane: "voice",
        toolsAllow: ["read"],
        provider: "openai",
        model: "gpt-5.4",
        thinkLevel: "high",
        timeoutMs: 10_000,
        prompt: expect.stringContaining("Caller: Can you check this?"),
      }),
    );
  });

  it("returns a speakable fallback when the embedded agent has no visible text", async () => {
    const warn = vi.fn();
    // Only a reasoning payload — filtered out of visible consult output.
    const { runtime } = createAgentRuntime([{ text: "hidden", isReasoning: true }]);

    const result = await consultRealtimeVoiceAgent({
      cfg: {} as never,
      agentRuntime: runtime as never,
      logger: { warn },
      sessionKey: "google-meet:meet-1",
      messageProvider: "google-meet",
      lane: "google-meet",
      runIdPrefix: "google-meet:meet-1",
      args: { question: "What now?" },
      transcript: [],
      surface: "a private Google Meet",
      userLabel: "Participant",
      fallbackText: "Let me verify that first.",
    });

    // Caller-supplied fallback is spoken instead, and the miss is logged.
    expect(result).toEqual({ text: "Let me verify that first." });
    expect(warn).toHaveBeenCalledWith(
      "[realtime-voice] agent consult produced no answer: agent returned no speakable text",
    );
  });
});
|
||||
127
src/realtime-voice/agent-consult-runtime.ts
Normal file
127
src/realtime-voice/agent-consult-runtime.ts
Normal file
@@ -0,0 +1,127 @@
|
||||
import { randomUUID } from "node:crypto";
|
||||
import type { RunEmbeddedPiAgentParams } from "../agents/pi-embedded-runner/run/params.js";
|
||||
import type { OpenClawConfig } from "../config/types.openclaw.js";
|
||||
import type { RuntimeLogger, PluginRuntimeCore } from "../plugins/runtime/types-core.js";
|
||||
import {
|
||||
buildRealtimeVoiceAgentConsultPrompt,
|
||||
collectRealtimeVoiceAgentConsultVisibleText,
|
||||
REALTIME_VOICE_AGENT_CONSULT_TOOL,
|
||||
type RealtimeVoiceAgentConsultToolPolicy,
|
||||
type RealtimeVoiceAgentConsultTranscriptEntry,
|
||||
} from "./agent-consult-tool.js";
|
||||
import type { RealtimeVoiceTool } from "./provider-types.js";
|
||||
|
||||
// The subset of plugin runtime used by consult runs: the core agent surface.
export type RealtimeVoiceAgentConsultRuntime = PluginRuntimeCore["agent"];
// A consult always resolves to a single speakable string.
export type RealtimeVoiceAgentConsultResult = { text: string };
|
||||
|
||||
// Tool allow-list granted under the "safe-read-only" consult policy:
// read/search/fetch style lookups only — nothing that mutates state.
const SAFE_READ_ONLY_TOOLS = [
  "read",
  "web_search",
  "web_fetch",
  "x_search",
  "memory_search",
  "memory_get",
] as const;
|
||||
|
||||
export function resolveRealtimeVoiceAgentConsultTools(
|
||||
policy: RealtimeVoiceAgentConsultToolPolicy,
|
||||
): RealtimeVoiceTool[] {
|
||||
return policy === "none" ? [] : [REALTIME_VOICE_AGENT_CONSULT_TOOL];
|
||||
}
|
||||
|
||||
export function resolveRealtimeVoiceAgentConsultToolsAllow(
|
||||
policy: RealtimeVoiceAgentConsultToolPolicy,
|
||||
): string[] | undefined {
|
||||
if (policy === "owner") {
|
||||
return undefined;
|
||||
}
|
||||
if (policy === "safe-read-only") {
|
||||
return [...SAFE_READ_ONLY_TOOLS];
|
||||
}
|
||||
return [];
|
||||
}
|
||||
|
||||
/**
 * Runs the full embedded OpenClaw agent on behalf of a live realtime voice
 * session (phone call, Google Meet, browser Talk) and returns a single
 * speakable answer.
 *
 * Session continuity: consults for the same `sessionKey` reuse one persisted
 * session id, so the consultant keeps conversational context across calls.
 * The store is saved before the run so the id survives even if the run fails.
 *
 * @returns `{ text }` — the agent's visible output, or `fallbackText` (or a
 *   built-in stall phrase) when the run aborted or produced no visible text.
 * @throws Whatever `buildRealtimeVoiceAgentConsultPrompt` throws for invalid
 *   `args` (e.g. a missing question), and any runtime/agent errors.
 */
export async function consultRealtimeVoiceAgent(params: {
  cfg: OpenClawConfig;
  agentRuntime: RealtimeVoiceAgentConsultRuntime;
  logger: Pick<RuntimeLogger, "warn">;
  sessionKey: string;
  messageProvider: string;
  lane: string;
  runIdPrefix: string;
  args: unknown;
  transcript: RealtimeVoiceAgentConsultTranscriptEntry[];
  surface: string;
  userLabel: string;
  assistantLabel?: string;
  questionSourceLabel?: string;
  agentId?: string;
  provider?: RunEmbeddedPiAgentParams["provider"];
  model?: RunEmbeddedPiAgentParams["model"];
  thinkLevel?: RunEmbeddedPiAgentParams["thinkLevel"];
  timeoutMs?: number;
  toolsAllow?: string[];
  extraSystemPrompt?: string;
  fallbackText?: string;
}): Promise<RealtimeVoiceAgentConsultResult> {
  // Default to the main agent and make sure its workspace exists before running.
  const agentId = params.agentId ?? "main";
  const agentDir = params.agentRuntime.resolveAgentDir(params.cfg, agentId);
  const workspaceDir = params.agentRuntime.resolveAgentWorkspaceDir(params.cfg, agentId);
  await params.agentRuntime.ensureAgentWorkspace({ dir: workspaceDir });

  // Reuse the stored session id for this key when present (trimmed, non-empty),
  // otherwise mint a fresh one; persist before the run starts.
  const storePath = params.agentRuntime.session.resolveStorePath(params.cfg.session?.store, {
    agentId,
  });
  const sessionStore = params.agentRuntime.session.loadSessionStore(storePath);
  const now = Date.now();
  const existing = sessionStore[params.sessionKey] as
    | { sessionId?: string; updatedAt?: number }
    | undefined;
  const sessionId = existing?.sessionId?.trim() || randomUUID();
  sessionStore[params.sessionKey] = { ...existing, sessionId, updatedAt: now };
  await params.agentRuntime.session.saveSessionStore(storePath, sessionStore);

  const sessionFile = params.agentRuntime.session.resolveSessionFilePath(
    sessionId,
    sessionStore[params.sessionKey],
    { agentId },
  );
  const result = await params.agentRuntime.runEmbeddedPiAgent({
    sessionId,
    sessionKey: params.sessionKey,
    messageProvider: params.messageProvider,
    sessionFile,
    workspaceDir,
    config: params.cfg,
    // Shared prompt contract: question + recent transcript + surface labels.
    prompt: buildRealtimeVoiceAgentConsultPrompt({
      args: params.args,
      transcript: params.transcript,
      surface: params.surface,
      userLabel: params.userLabel,
      assistantLabel: params.assistantLabel,
      questionSourceLabel: params.questionSourceLabel,
    }),
    provider: params.provider,
    model: params.model,
    thinkLevel: params.thinkLevel ?? "high",
    // Suppress verbose/reasoning output — only the final answer is spoken.
    verboseLevel: "off",
    reasoningLevel: "off",
    toolResultFormat: "plain",
    toolsAllow: params.toolsAllow,
    timeoutMs: params.timeoutMs ?? params.agentRuntime.resolveAgentTimeoutMs({ cfg: params.cfg }),
    runId: `${params.runIdPrefix}:${Date.now()}`,
    lane: params.lane,
    extraSystemPrompt:
      params.extraSystemPrompt ??
      "You are a behind-the-scenes consultant for a live voice agent. Be accurate, brief, and speakable.",
    agentDir,
  });

  // Strip reasoning/error payloads; if nothing visible remains, log why and
  // fall back to a speakable stall so the voice loop never goes silent.
  const text = collectRealtimeVoiceAgentConsultVisibleText(result.payloads ?? []);
  if (!text) {
    const reason = result.meta?.aborted ? "agent run aborted" : "agent returned no speakable text";
    params.logger.warn(`[realtime-voice] agent consult produced no answer: ${reason}`);
    return { text: params.fallbackText ?? "I need a moment to verify that before answering." };
  }
  return { text };
}
|
||||
55
src/realtime-voice/agent-consult-tool.test.ts
Normal file
55
src/realtime-voice/agent-consult-tool.test.ts
Normal file
@@ -0,0 +1,55 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
buildRealtimeVoiceAgentConsultChatMessage,
|
||||
buildRealtimeVoiceAgentConsultPrompt,
|
||||
collectRealtimeVoiceAgentConsultVisibleText,
|
||||
parseRealtimeVoiceAgentConsultArgs,
|
||||
} from "./agent-consult-tool.js";
|
||||
|
||||
describe("realtime voice agent consult tool", () => {
  // Arguments are trimmed and joined into the browser-chat message layout:
  // question, then optional Context and Spoken style sections.
  it("normalizes shared tool arguments for browser chat forwarding", () => {
    expect(
      buildRealtimeVoiceAgentConsultChatMessage({
        question: " What changed? ",
        context: " PR #123 ",
        responseStyle: " concise ",
      }),
    ).toBe("What changed?\n\nContext:\nPR #123\n\nSpoken style:\nconcise");
  });

  it("requires a non-empty question", () => {
    expect(() => parseRealtimeVoiceAgentConsultArgs({ context: "missing" })).toThrow(
      "question required",
    );
  });

  // The prompt must mention the surface, label each transcript turn with the
  // configured speaker names, and end with the question section.
  it("builds a reusable spoken consultant prompt with recent transcript", () => {
    const prompt = buildRealtimeVoiceAgentConsultPrompt({
      args: { question: "Do we support realtime tools?" },
      transcript: [
        { role: "user", text: "Can you check the repo?" },
        { role: "assistant", text: "I'll verify." },
      ],
      surface: "a private Google Meet",
      userLabel: "Participant",
      assistantLabel: "Agent",
      questionSourceLabel: "participant",
    });

    expect(prompt).toContain("during a private Google Meet");
    expect(prompt).toContain("Participant: Can you check the repo?");
    expect(prompt).toContain("Agent: I'll verify.");
    expect(prompt).toContain("Question:\nDo we support realtime tools?");
  });

  // Reasoning and error payloads are dropped; remaining text chunks are
  // joined with blank lines.
  it("filters reasoning and error payloads from visible consult output", () => {
    expect(
      collectRealtimeVoiceAgentConsultVisibleText([
        { text: "thinking", isReasoning: true },
        { text: "first" },
        { text: "error", isError: true },
        { text: "second" },
      ]),
    ).toBe("first\n\nsecond");
  });
});
|
||||
@@ -1,6 +1,17 @@
|
||||
import { normalizeOptionalString } from "../shared/string-coerce.js";
|
||||
import type { RealtimeVoiceTool } from "./provider-types.js";
|
||||
|
||||
// Canonical function name of the shared consult tool exposed to realtime
// voice surfaces (browser Talk, Google Meet, Voice Call).
export const REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME = "openclaw_agent_consult";
// Governs what the consulted agent may do: a safe read-only tool subset,
// the owner's unrestricted tools, or no consult tool at all.
export type RealtimeVoiceAgentConsultToolPolicy = "safe-read-only" | "owner" | "none";
// Parsed arguments of an openclaw_agent_consult tool call.
export type RealtimeVoiceAgentConsultArgs = {
  question: string; // required — what to ask the full agent
  context?: string; // optional extra background from the live session
  responseStyle?: string; // optional hint for how the answer should be spoken
};
// One turn of the live conversation forwarded to the consultant for context.
export type RealtimeVoiceAgentConsultTranscriptEntry = {
  role: "user" | "assistant";
  text: string;
};
|
||||
|
||||
export const REALTIME_VOICE_AGENT_CONSULT_TOOL: RealtimeVoiceTool = {
|
||||
type: "function",
|
||||
@@ -26,3 +37,81 @@ export const REALTIME_VOICE_AGENT_CONSULT_TOOL: RealtimeVoiceTool = {
|
||||
required: ["question"],
|
||||
},
|
||||
};
|
||||
|
||||
export function parseRealtimeVoiceAgentConsultArgs(args: unknown): RealtimeVoiceAgentConsultArgs {
|
||||
const question = readConsultStringArg(args, "question");
|
||||
if (!question) {
|
||||
throw new Error("question required");
|
||||
}
|
||||
return {
|
||||
question,
|
||||
context: readConsultStringArg(args, "context"),
|
||||
responseStyle: readConsultStringArg(args, "responseStyle"),
|
||||
};
|
||||
}
|
||||
|
||||
export function buildRealtimeVoiceAgentConsultChatMessage(args: unknown): string {
|
||||
const parsed = parseRealtimeVoiceAgentConsultArgs(args);
|
||||
return [
|
||||
parsed.question,
|
||||
parsed.context ? `Context:\n${parsed.context}` : undefined,
|
||||
parsed.responseStyle ? `Spoken style:\n${parsed.responseStyle}` : undefined,
|
||||
]
|
||||
.filter(Boolean)
|
||||
.join("\n\n");
|
||||
}
|
||||
|
||||
export function buildRealtimeVoiceAgentConsultPrompt(params: {
|
||||
args: unknown;
|
||||
transcript: RealtimeVoiceAgentConsultTranscriptEntry[];
|
||||
surface: string;
|
||||
userLabel: string;
|
||||
assistantLabel?: string;
|
||||
questionSourceLabel?: string;
|
||||
}): string {
|
||||
const parsed = parseRealtimeVoiceAgentConsultArgs(params.args);
|
||||
const assistantLabel = params.assistantLabel ?? "Agent";
|
||||
const questionSourceLabel = params.questionSourceLabel ?? params.userLabel.toLowerCase();
|
||||
const transcript = params.transcript
|
||||
.slice(-12)
|
||||
.map(
|
||||
(entry) => `${entry.role === "assistant" ? assistantLabel : params.userLabel}: ${entry.text}`,
|
||||
)
|
||||
.join("\n");
|
||||
|
||||
return [
|
||||
`You are helping an OpenClaw realtime voice agent during ${params.surface}.`,
|
||||
`Answer the ${questionSourceLabel}'s question with the strongest useful reasoning and available tools.`,
|
||||
"Return only the concise answer the realtime voice agent should speak next.",
|
||||
"Do not include markdown, citations unless needed, tool logs, or private reasoning.",
|
||||
parsed.responseStyle ? `Spoken style: ${parsed.responseStyle}` : undefined,
|
||||
transcript ? `Recent transcript:\n${transcript}` : undefined,
|
||||
parsed.context ? `Additional context:\n${parsed.context}` : undefined,
|
||||
`Question:\n${parsed.question}`,
|
||||
]
|
||||
.filter(Boolean)
|
||||
.join("\n\n");
|
||||
}
|
||||
|
||||
export function collectRealtimeVoiceAgentConsultVisibleText(
|
||||
payloads: Array<{ text?: unknown; isError?: boolean; isReasoning?: boolean }>,
|
||||
): string | null {
|
||||
const chunks: string[] = [];
|
||||
for (const payload of payloads) {
|
||||
if (payload.isError || payload.isReasoning) {
|
||||
continue;
|
||||
}
|
||||
const text = normalizeOptionalString(payload.text);
|
||||
if (text) {
|
||||
chunks.push(text);
|
||||
}
|
||||
}
|
||||
return chunks.length > 0 ? chunks.join("\n\n").trim() : null;
|
||||
}
|
||||
|
||||
function readConsultStringArg(args: unknown, key: string): string | undefined {
|
||||
if (!args || typeof args !== "object" || Array.isArray(args)) {
|
||||
return undefined;
|
||||
}
|
||||
return normalizeOptionalString((args as Record<string, unknown>)[key]);
|
||||
}
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
import { REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME } from "../../../../src/realtime-voice/agent-consult-tool.js";
|
||||
import {
|
||||
buildRealtimeVoiceAgentConsultChatMessage,
|
||||
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
|
||||
} from "../../../../src/realtime-voice/agent-consult-tool.js";
|
||||
import type { GatewayBrowserClient, GatewayEventFrame } from "../gateway.ts";
|
||||
import { generateUUID } from "../uuid.ts";
|
||||
|
||||
@@ -239,23 +242,9 @@ export class RealtimeTalkSession {
|
||||
this.callbacks.onStatus?.("thinking");
|
||||
let question = "";
|
||||
try {
|
||||
const args = JSON.parse(buffered?.args || event.arguments || "{}") as {
|
||||
question?: unknown;
|
||||
context?: unknown;
|
||||
responseStyle?: unknown;
|
||||
};
|
||||
question = typeof args.question === "string" ? args.question.trim() : "";
|
||||
const context = typeof args.context === "string" ? args.context.trim() : "";
|
||||
const responseStyle = typeof args.responseStyle === "string" ? args.responseStyle.trim() : "";
|
||||
if (context || responseStyle) {
|
||||
question = [
|
||||
question,
|
||||
context ? `Context:\n${context}` : undefined,
|
||||
responseStyle ? `Spoken style:\n${responseStyle}` : undefined,
|
||||
]
|
||||
.filter(Boolean)
|
||||
.join("\n\n");
|
||||
}
|
||||
question = buildRealtimeVoiceAgentConsultChatMessage(
|
||||
JSON.parse(buffered?.args || event.arguments || "{}"),
|
||||
);
|
||||
} catch {}
|
||||
if (!question) {
|
||||
this.submitToolResult(callId, {
|
||||
|
||||
Reference in New Issue
Block a user