From e2f13959d409398908ac6899f1c7db28ec243cf0 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Fri, 24 Apr 2026 23:11:18 +0100 Subject: [PATCH] feat(voice-call): share realtime agent consult tool Centralize the shared realtime agent consult tool for browser Talk, Google Meet, and Voice Call. --- CHANGELOG.md | 1 + docs/.generated/config-baseline.sha256 | 4 +- docs/plugins/google-meet.md | 1 + docs/plugins/voice-call.md | 18 ++- extensions/google-meet/src/agent-consult.ts | 130 +++--------------- extensions/matrix/src/cli.test.ts | 1 + .../runners/contract/scenario-runtime-e2ee.ts | 6 +- extensions/voice-call/index.ts | 5 + extensions/voice-call/openclaw.plugin.json | 4 + extensions/voice-call/src/config.test.ts | 3 + extensions/voice-call/src/config.ts | 21 ++- extensions/voice-call/src/runtime.test.ts | 113 +++++++++++++++ extensions/voice-call/src/runtime.ts | 114 +++++++++++++-- extensions/voice-call/src/test-fixtures.ts | 3 + extensions/voice-call/src/webhook.test.ts | 8 ++ .../src/webhook/realtime-handler.test.ts | 106 ++++++++++++-- .../src/webhook/realtime-handler.ts | 2 +- src/plugin-sdk/realtime-voice.ts | 14 ++ .../agent-consult-runtime.test.ts | 116 ++++++++++++++++ src/realtime-voice/agent-consult-runtime.ts | 127 +++++++++++++++++ src/realtime-voice/agent-consult-tool.test.ts | 55 ++++++++ src/realtime-voice/agent-consult-tool.ts | 89 ++++++++++++ ui/src/ui/chat/realtime-talk.ts | 25 +--- 23 files changed, 807 insertions(+), 159 deletions(-) create mode 100644 src/realtime-voice/agent-consult-runtime.test.ts create mode 100644 src/realtime-voice/agent-consult-runtime.ts create mode 100644 src/realtime-voice/agent-consult-tool.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 616b7bca01f..13d97286a4c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -54,6 +54,7 @@ Docs: https://docs.openclaw.ai - Plugins/Google Meet: add a bundled participant plugin with personal Google auth, explicit meeting URL joins, Chrome and Twilio transports, and realtime voice support. (#70765) Thanks @steipete. - Plugins/Google Meet: default Chrome realtime sessions to OpenAI plus SoX `rec`/`play` audio bridge commands, so the usual setup only needs the plugin enabled and `OPENAI_API_KEY`. - Plugins/Google Meet: add a `chrome-node` transport so a paired macOS node, such as a Parallels VM, can own Chrome, BlackHole, and SoX while the Gateway machine keeps the agent and model key. +- Plugins/Voice Call: expose the shared `openclaw_agent_consult` realtime tool so live phone calls can ask the full OpenClaw agent for deeper/tool-backed answers. - Plugins/Bonjour: move LAN Gateway discovery advertising into a default-enabled bundled plugin with its own `@homebridge/ciao` dependency, so users can disable Bonjour without cutting wide-area discovery. Thanks @vincentkoc. - Providers/Google: add a Gemini Live realtime voice provider for backend Voice Call and Google Meet audio bridges, with bidirectional audio and function-call support. - Plugins/Google Meet: let realtime Meet sessions consult the full OpenClaw agent for deeper answers while staying in the live voice loop. diff --git a/docs/.generated/config-baseline.sha256 b/docs/.generated/config-baseline.sha256 index 4bf159d5fa1..5690f60a2e6 100644 --- a/docs/.generated/config-baseline.sha256 +++ b/docs/.generated/config-baseline.sha256 @@ -1,4 +1,4 @@ -0adf332920764704575b21d2fe9568742d977ff0169683319c168d68ea7cf143 config-baseline.json +a608561acecc7cfc5f16a31b7498d7a66001f6655f5a5960a68842c59b7dcaa8 config-baseline.json 2936d2ccf0c1e6e932a0e7c617b809e4b31dbb9a7d5afefbba29b229913b9e50 config-baseline.core.json 22d7cd6d8279146b2d79c9531a55b80b52a2c99c81338c508104729154fdd02d config-baseline.channel.json -28d874a4910174c7014ef2a267269a3327d31ff657f76d38c034ef1b86eae484 config-baseline.plugin.json +d47a574045a47356e513ab308d7dcad9fa0b389f50e93c5cf0f820fab858e70e config-baseline.plugin.json diff --git a/docs/plugins/google-meet.md b/docs/plugins/google-meet.md index 67d7e278daf..cec61f6f460 100644 --- a/docs/plugins/google-meet.md +++ b/docs/plugins/google-meet.md @@ -670,6 +670,7 @@ OpenClaw tools, it can call `openclaw_agent_consult`. The consult tool runs the regular OpenClaw agent behind the scenes with recent meeting transcript context and returns a concise spoken answer to the realtime voice session. The voice model can then speak that answer back into the meeting. +It uses the same shared realtime consult tool as Voice Call. `realtime.toolPolicy` controls the consult run: diff --git a/docs/plugins/voice-call.md b/docs/plugins/voice-call.md index 35a6b7514cd..92abfd5f37c 100644 --- a/docs/plugins/voice-call.md +++ b/docs/plugins/voice-call.md @@ -126,6 +126,7 @@ Set config under `plugins.entries.voice-call.config`: realtime: { enabled: false, provider: "google", // optional; first registered realtime voice provider when unset + toolPolicy: "safe-read-only", providers: { google: { model: "gemini-2.5-flash-native-audio-preview-12-2025", @@ -174,6 +175,20 @@ Current runtime behavior: - Bundled realtime voice providers include Google Gemini Live (`google`) and OpenAI (`openai`), registered by their provider plugins. - Provider-owned raw config lives under `realtime.providers.`. +- Voice Call exposes the shared `openclaw_agent_consult` realtime tool by + default. The realtime model can call it when the caller asks for deeper + reasoning, current information, or normal OpenClaw tools. +- `realtime.toolPolicy` controls the consult run: + - `safe-read-only`: expose the consult tool and limit the regular agent to + `read`, `web_search`, `web_fetch`, `x_search`, `memory_search`, and + `memory_get`. + - `owner`: expose the consult tool and let the regular agent use the normal + agent tool policy. + - `none`: do not expose the consult tool. Custom `realtime.tools` are still + passed through to the realtime provider. +- Consult session keys reuse the existing voice session when available, then + fall back to the caller/callee phone number so follow-up consult calls keep + context during the call. - If `realtime.provider` points at an unregistered provider, or no realtime voice provider is registered at all, Voice Call logs a warning and skips realtime media instead of failing the whole plugin. @@ -199,7 +214,8 @@ Example: realtime: { enabled: true, provider: "google", - instructions: "Speak briefly and ask before using tools.", + instructions: "Speak briefly. Call openclaw_agent_consult before using deeper tools.", + toolPolicy: "safe-read-only", providers: { google: { apiKey: "${GEMINI_API_KEY}", diff --git a/extensions/google-meet/src/agent-consult.ts b/extensions/google-meet/src/agent-consult.ts index 1229a6a5ca3..36bd1e3c2df 100644 --- a/extensions/google-meet/src/agent-consult.ts +++ b/extensions/google-meet/src/agent-consult.ts @@ -1,84 +1,20 @@ -import { randomUUID } from "node:crypto"; import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime"; import type { PluginRuntime, RuntimeLogger } from "openclaw/plugin-sdk/plugin-runtime"; import { + consultRealtimeVoiceAgent, REALTIME_VOICE_AGENT_CONSULT_TOOL, REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME, + resolveRealtimeVoiceAgentConsultTools, + resolveRealtimeVoiceAgentConsultToolsAllow, type RealtimeVoiceTool, } from "openclaw/plugin-sdk/realtime-voice"; -import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime"; import type { GoogleMeetConfig, GoogleMeetToolPolicy } from "./config.js"; -type AgentPayload = { - text?: string; - isError?: boolean; - isReasoning?: boolean; -}; - export const GOOGLE_MEET_AGENT_CONSULT_TOOL_NAME = REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME; export const GOOGLE_MEET_AGENT_CONSULT_TOOL = REALTIME_VOICE_AGENT_CONSULT_TOOL; export function resolveGoogleMeetRealtimeTools(policy: GoogleMeetToolPolicy): RealtimeVoiceTool[] { - return policy === "none" ? [] : [GOOGLE_MEET_AGENT_CONSULT_TOOL]; -} - -function normalizeToolArgString(args: unknown, key: string): string | undefined { - if (!args || typeof args !== "object" || Array.isArray(args)) { - return undefined; - } - return normalizeOptionalString((args as Record)[key]); -} - -function collectVisibleText(payloads: AgentPayload[]): string | null { - const chunks: string[] = []; - for (const payload of payloads) { - if (payload.isError || payload.isReasoning) { - continue; - } - const text = normalizeOptionalString(payload.text); - if (text) { - chunks.push(text); - } - } - return chunks.length > 0 ? chunks.join("\n\n").trim() : null; -} - -function resolveToolsAllow(policy: GoogleMeetToolPolicy): string[] | undefined { - if (policy === "owner") { - return undefined; - } - if (policy === "safe-read-only") { - return ["read", "web_search", "web_fetch", "x_search", "memory_search", "memory_get"]; - } - return []; -} - -function buildPrompt(params: { - args: unknown; - transcript: Array<{ role: "user" | "assistant"; text: string }>; -}): string { - const question = normalizeToolArgString(params.args, "question"); - if (!question) { - throw new Error("question required"); - } - const context = normalizeToolArgString(params.args, "context"); - const responseStyle = normalizeToolArgString(params.args, "responseStyle"); - const transcript = params.transcript - .slice(-12) - .map((entry) => `${entry.role === "assistant" ? "Agent" : "Participant"}: ${entry.text}`) - .join("\n"); - return [ - "You are helping an OpenClaw realtime voice agent during a private Google Meet.", - "Answer the participant's question with the strongest useful reasoning and available tools.", - "Return only the concise answer the realtime voice agent should speak next.", - "Do not include markdown, citations unless needed, tool logs, or private reasoning.", - responseStyle ? `Spoken style: ${responseStyle}` : undefined, - transcript ? `Recent meeting transcript:\n${transcript}` : undefined, - context ? `Additional context:\n${context}` : undefined, - `Question:\n${question}`, - ] - .filter(Boolean) - .join("\n\n"); + return resolveRealtimeVoiceAgentConsultTools(policy); } export async function consultOpenClawAgentForGoogleMeet(params: { @@ -90,54 +26,22 @@ export async function consultOpenClawAgentForGoogleMeet(params: { args: unknown; transcript: Array<{ role: "user" | "assistant"; text: string }>; }): Promise<{ text: string }> { - const agentId = "main"; - const sessionKey = `google-meet:${params.meetingSessionId}`; - const cfg = params.fullConfig; - const agentDir = params.runtime.agent.resolveAgentDir(cfg, agentId); - const workspaceDir = params.runtime.agent.resolveAgentWorkspaceDir(cfg, agentId); - await params.runtime.agent.ensureAgentWorkspace({ dir: workspaceDir }); - - const storePath = params.runtime.agent.session.resolveStorePath(cfg.session?.store, { agentId }); - const sessionStore = params.runtime.agent.session.loadSessionStore(storePath); - const now = Date.now(); - const existing = sessionStore[sessionKey] as - | { sessionId?: string; updatedAt?: number } - | undefined; - const sessionId = normalizeOptionalString(existing?.sessionId) ?? randomUUID(); - sessionStore[sessionKey] = { ...existing, sessionId, updatedAt: now }; - await params.runtime.agent.session.saveSessionStore(storePath, sessionStore); - - const sessionFile = params.runtime.agent.session.resolveSessionFilePath( - sessionId, - sessionStore[sessionKey], - { agentId }, - ); - const result = await params.runtime.agent.runEmbeddedPiAgent({ - sessionId, - sessionKey, + return await consultRealtimeVoiceAgent({ + cfg: params.fullConfig, + agentRuntime: params.runtime.agent, + logger: params.logger, + sessionKey: `google-meet:${params.meetingSessionId}`, messageProvider: "google-meet", - sessionFile, - workspaceDir, - config: cfg, - prompt: buildPrompt({ args: params.args, transcript: params.transcript }), - thinkLevel: "high", - verboseLevel: "off", - reasoningLevel: "off", - toolResultFormat: "plain", - toolsAllow: resolveToolsAllow(params.config.realtime.toolPolicy), - timeoutMs: params.runtime.agent.resolveAgentTimeoutMs({ cfg }), - runId: `google-meet:${params.meetingSessionId}:${Date.now()}`, lane: "google-meet", + runIdPrefix: `google-meet:${params.meetingSessionId}`, + args: params.args, + transcript: params.transcript, + surface: "a private Google Meet", + userLabel: "Participant", + assistantLabel: "Agent", + questionSourceLabel: "participant", + toolsAllow: resolveRealtimeVoiceAgentConsultToolsAllow(params.config.realtime.toolPolicy), extraSystemPrompt: "You are a behind-the-scenes consultant for a live meeting voice agent. Be accurate, brief, and speakable.", - agentDir, }); - - const text = collectVisibleText((result.payloads ?? []) as AgentPayload[]); - if (!text) { - const reason = result.meta?.aborted ? "agent run aborted" : "agent returned no speakable text"; - params.logger.warn(`[google-meet] agent consult produced no answer: ${reason}`); - return { text: "I need a moment to verify that before answering." }; - } - return { text }; } diff --git a/extensions/matrix/src/cli.test.ts b/extensions/matrix/src/cli.test.ts index 12fcd328304..06f2e422cdc 100644 --- a/extensions/matrix/src/cli.test.ts +++ b/extensions/matrix/src/cli.test.ts @@ -911,6 +911,7 @@ describe("matrix CLI verification commands", () => { expect(pruneMatrixStaleGatewayDevicesMock).toHaveBeenCalledWith({ accountId: "poe", + cfg: {}, }); expect(console.log).toHaveBeenCalledWith("Deleted stale OpenClaw devices: BritdXC6iL"); expect(console.log).toHaveBeenCalledWith("Current device: A7hWrQ70ea"); diff --git a/extensions/qa-matrix/src/runners/contract/scenario-runtime-e2ee.ts b/extensions/qa-matrix/src/runners/contract/scenario-runtime-e2ee.ts index ec05cccecc6..740203db535 100644 --- a/extensions/qa-matrix/src/runners/contract/scenario-runtime-e2ee.ts +++ b/extensions/qa-matrix/src/runners/contract/scenario-runtime-e2ee.ts @@ -1,9 +1,9 @@ import { randomUUID } from "node:crypto"; import { chmod, mkdir, mkdtemp, rm, stat, writeFile } from "node:fs/promises"; -import { tmpdir } from "node:os"; import path from "node:path"; import { setTimeout as sleep } from "node:timers/promises"; import type { MatrixVerificationSummary } from "@openclaw/matrix/test-api.js"; +import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/temp-path"; import { createMatrixQaClient } from "../../substrate/client.js"; import { createMatrixQaE2eeScenarioClient, @@ -391,7 +391,9 @@ async function createMatrixQaCliSelfVerificationRuntime(params: { userId: string; }) { const outputDir = requireMatrixQaE2eeOutputDir(params.context); - const rootDir = await mkdtemp(path.join(tmpdir(), "openclaw-matrix-cli-qa-")); + const rootDir = await mkdtemp( + path.join(resolvePreferredOpenClawTmpDir(), "openclaw-matrix-cli-qa-"), + ); const artifactDir = path.join( outputDir, "cli-self-verification", diff --git a/extensions/voice-call/index.ts b/extensions/voice-call/index.ts index 7ed35ed7b63..c9381f9c11b 100644 --- a/extensions/voice-call/index.ts +++ b/extensions/voice-call/index.ts @@ -82,6 +82,11 @@ const voiceCallConfigSchema = { }, "realtime.streamPath": { label: "Realtime Stream Path", advanced: true }, "realtime.instructions": { label: "Realtime Instructions", advanced: true }, + "realtime.toolPolicy": { + label: "Realtime Tool Policy", + help: "Controls the shared openclaw_agent_consult tool.", + advanced: true, + }, "realtime.providers": { label: "Realtime Provider Config", advanced: true }, "tts.provider": { label: "TTS Provider Override", diff --git a/extensions/voice-call/openclaw.plugin.json b/extensions/voice-call/openclaw.plugin.json index 4a65212dcf4..943e7017ea9 100644 --- a/extensions/voice-call/openclaw.plugin.json +++ b/extensions/voice-call/openclaw.plugin.json @@ -402,6 +402,10 @@ "instructions": { "type": "string" }, + "toolPolicy": { + "type": "string", + "enum": ["safe-read-only", "owner", "none"] + }, "tools": { "type": "array", "items": { diff --git a/extensions/voice-call/src/config.test.ts b/extensions/voice-call/src/config.test.ts index ab5aa7fe30f..0bac83934dc 100644 --- a/extensions/voice-call/src/config.test.ts +++ b/extensions/voice-call/src/config.test.ts @@ -242,6 +242,8 @@ describe("normalizeVoiceCallConfig", () => { expect(normalized.streaming.provider).toBeUndefined(); expect(normalized.streaming.providers).toEqual({}); expect(normalized.realtime.streamPath).toBe("/voice/stream/realtime"); + expect(normalized.realtime.toolPolicy).toBe("safe-read-only"); + expect(normalized.realtime.instructions).toContain("openclaw_agent_consult"); expect(normalized.tunnel.provider).toBe("none"); expect(normalized.webhookSecurity.allowedHosts).toEqual([]); }); @@ -300,6 +302,7 @@ describe("resolveVoiceCallConfig", () => { }); expect(resolved.realtime.instructions).toBe("Stay concise."); + expect(resolved.realtime.toolPolicy).toBe("safe-read-only"); expect(resolved.realtime.provider).toBeUndefined(); }); diff --git a/extensions/voice-call/src/config.ts b/extensions/voice-call/src/config.ts index c86cbf64961..fd2e07aba68 100644 --- a/extensions/voice-call/src/config.ts +++ b/extensions/voice-call/src/config.ts @@ -1,3 +1,7 @@ +import { + REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME, + type RealtimeVoiceAgentConsultToolPolicy, +} from "openclaw/plugin-sdk/realtime-voice"; import { z } from "openclaw/plugin-sdk/zod"; import { TtsAutoSchema, TtsConfigSchema, TtsModeSchema, TtsProviderSchema } from "../api.js"; import { deepMergeDefined } from "./deep-merge.js"; @@ -205,6 +209,11 @@ export type VoiceCallRealtimeProvidersConfig = z.infer< typeof VoiceCallRealtimeProvidersConfigSchema >; +export const VoiceCallRealtimeToolPolicySchema = z.enum(["safe-read-only", "owner", "none"]); +export type VoiceCallRealtimeToolPolicy = RealtimeVoiceAgentConsultToolPolicy; + +export const DEFAULT_VOICE_CALL_REALTIME_INSTRUCTIONS = `You are OpenClaw's phone-call realtime voice interface. Keep spoken replies brief and natural. When a question needs deeper reasoning, current information, or tools, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} before answering.`; + export const VoiceCallStreamingProvidersConfigSchema = z .record(z.string(), z.record(z.string(), z.unknown())) .default({}); @@ -221,14 +230,22 @@ export const VoiceCallRealtimeConfigSchema = z /** Optional override for the local WebSocket route path. */ streamPath: z.string().min(1).optional(), /** System instructions passed to the realtime provider. */ - instructions: z.string().optional(), + instructions: z.string().default(DEFAULT_VOICE_CALL_REALTIME_INSTRUCTIONS), + /** Tool policy for the shared OpenClaw agent consult tool. */ + toolPolicy: VoiceCallRealtimeToolPolicySchema.default("safe-read-only"), /** Tool definitions exposed to the realtime provider. */ tools: z.array(RealtimeToolSchema).default([]), /** Provider-owned raw config blobs keyed by provider id. */ providers: VoiceCallRealtimeProvidersConfigSchema, }) .strict() - .default({ enabled: false, tools: [], providers: {} }); + .default({ + enabled: false, + instructions: DEFAULT_VOICE_CALL_REALTIME_INSTRUCTIONS, + toolPolicy: "safe-read-only", + tools: [], + providers: {}, + }); export type VoiceCallRealtimeConfig = z.infer; // ----------------------------------------------------------------------------- diff --git a/extensions/voice-call/src/runtime.test.ts b/extensions/voice-call/src/runtime.test.ts index ecb7ab2cb07..ad9aa64b208 100644 --- a/extensions/voice-call/src/runtime.test.ts +++ b/extensions/voice-call/src/runtime.test.ts @@ -8,10 +8,17 @@ const mocks = vi.hoisted(() => ({ resolveVoiceCallConfig: vi.fn(), validateProviderConfig: vi.fn(), managerInitialize: vi.fn(), + managerGetCall: vi.fn(), webhookStart: vi.fn(), webhookStop: vi.fn(), + webhookSetRealtimeHandler: vi.fn(), + webhookGetRealtimeHandler: vi.fn(), webhookGetMediaStreamHandler: vi.fn(), webhookCtorArgs: [] as unknown[][], + realtimeHandlerCtorArgs: [] as unknown[][], + realtimeHandlerRegisterToolHandler: vi.fn(), + realtimeHandlerSetPublicUrl: vi.fn(), + resolveConfiguredRealtimeVoiceProvider: vi.fn(), startTunnel: vi.fn(), setupTailscaleExposure: vi.fn(), cleanupTailscaleExposure: vi.fn(), @@ -25,6 +32,7 @@ vi.mock("./config.js", () => ({ vi.mock("./manager.js", () => ({ CallManager: class { initialize = mocks.managerInitialize; + getCall = mocks.managerGetCall; }, })); @@ -35,10 +43,26 @@ vi.mock("./webhook.js", () => ({ } start = mocks.webhookStart; stop = mocks.webhookStop; + setRealtimeHandler = mocks.webhookSetRealtimeHandler; + getRealtimeHandler = mocks.webhookGetRealtimeHandler; getMediaStreamHandler = mocks.webhookGetMediaStreamHandler; }, })); +vi.mock("./realtime-voice.runtime.js", () => ({ + resolveConfiguredRealtimeVoiceProvider: mocks.resolveConfiguredRealtimeVoiceProvider, +})); + +vi.mock("./webhook/realtime-handler.js", () => ({ + RealtimeCallHandler: class { + constructor(...args: unknown[]) { + mocks.realtimeHandlerCtorArgs.push(args); + } + registerToolHandler = mocks.realtimeHandlerRegisterToolHandler; + setPublicUrl = mocks.realtimeHandlerSetPublicUrl; + }, +})); + vi.mock("./tunnel.js", () => ({ startTunnel: mocks.startTunnel, })); @@ -60,10 +84,22 @@ describe("createVoiceCallRuntime lifecycle", () => { mocks.resolveVoiceCallConfig.mockImplementation((cfg: VoiceCallConfig) => cfg); mocks.validateProviderConfig.mockReturnValue({ valid: true, errors: [] }); mocks.managerInitialize.mockResolvedValue(undefined); + mocks.managerGetCall.mockReset(); mocks.webhookStart.mockResolvedValue("http://127.0.0.1:3334/voice/webhook"); mocks.webhookStop.mockResolvedValue(undefined); + mocks.webhookSetRealtimeHandler.mockReset(); + mocks.webhookGetRealtimeHandler.mockReturnValue({ + setPublicUrl: mocks.realtimeHandlerSetPublicUrl, + }); mocks.webhookGetMediaStreamHandler.mockReturnValue(undefined); mocks.webhookCtorArgs.length = 0; + mocks.realtimeHandlerCtorArgs.length = 0; + mocks.realtimeHandlerRegisterToolHandler.mockReset(); + mocks.realtimeHandlerSetPublicUrl.mockReset(); + mocks.resolveConfiguredRealtimeVoiceProvider.mockResolvedValue({ + provider: { id: "openai" }, + providerConfig: { model: "gpt-realtime" }, + }); mocks.startTunnel.mockResolvedValue(null); mocks.setupTailscaleExposure.mockResolvedValue(null); mocks.cleanupTailscaleExposure.mockResolvedValue(undefined); @@ -133,4 +169,81 @@ describe("createVoiceCallRuntime lifecycle", () => { expect(mocks.webhookCtorArgs[0]?.[3]).toBe(coreConfig); expect(mocks.webhookCtorArgs[0]?.[4]).toBe(fullConfig); }); + + it("wires the shared realtime agent consult tool and handler", async () => { + const config = createBaseConfig(); + config.inboundPolicy = "allowlist"; + config.realtime.enabled = true; + config.realtime.tools = [ + { + type: "function", + name: "custom_tool", + description: "Custom tool", + parameters: { type: "object", properties: {} }, + }, + ]; + const sessionStore: Record = {}; + const runEmbeddedPiAgent = vi.fn(async () => ({ + payloads: [{ text: "Use the shipment status." }], + meta: {}, + })); + const agentRuntime = { + defaults: { provider: "openai", model: "gpt-5.4" }, + resolveAgentDir: vi.fn(() => "/tmp/agent"), + resolveAgentWorkspaceDir: vi.fn(() => "/tmp/workspace"), + resolveAgentIdentity: vi.fn(), + resolveThinkingDefault: vi.fn(() => "high"), + resolveAgentTimeoutMs: vi.fn(() => 30_000), + ensureAgentWorkspace: vi.fn(async () => {}), + session: { + resolveStorePath: vi.fn(() => "/tmp/sessions.json"), + loadSessionStore: vi.fn(() => sessionStore), + saveSessionStore: vi.fn(async () => {}), + resolveSessionFilePath: vi.fn(() => "/tmp/session.json"), + }, + runEmbeddedPiAgent, + }; + mocks.managerGetCall.mockReturnValue({ + callId: "call-1", + direction: "outbound", + from: "+15550001234", + to: "+15550009999", + transcript: [{ speaker: "user", text: "Can you check shipment status?" }], + }); + + await createVoiceCallRuntime({ + config, + coreConfig: {} as CoreConfig, + agentRuntime: agentRuntime as never, + }); + + expect(mocks.realtimeHandlerCtorArgs[0]?.[0]).toMatchObject({ + tools: [ + expect.objectContaining({ name: "openclaw_agent_consult" }), + expect.objectContaining({ name: "custom_tool" }), + ], + }); + expect(mocks.realtimeHandlerRegisterToolHandler).toHaveBeenCalledWith( + "openclaw_agent_consult", + expect.any(Function), + ); + + const handler = mocks.realtimeHandlerRegisterToolHandler.mock.calls[0]?.[1] as + | ((args: unknown, callId: string) => Promise) + | undefined; + await expect(handler?.({ question: "What should I say?" }, "call-1")).resolves.toEqual({ + text: "Use the shipment status.", + }); + expect(runEmbeddedPiAgent).toHaveBeenCalledWith( + expect.objectContaining({ + sessionKey: "voice:15550009999", + messageProvider: "voice", + lane: "voice", + provider: "openai", + model: "gpt-5.4", + toolsAllow: ["read", "web_search", "web_fetch", "x_search", "memory_search", "memory_get"], + prompt: expect.stringContaining("Caller: Can you check shipment status?"), + }), + ); + }); }); diff --git a/extensions/voice-call/src/runtime.ts b/extensions/voice-call/src/runtime.ts index 2df69c4e75f..9341d75fe6d 100644 --- a/extensions/voice-call/src/runtime.ts +++ b/extensions/voice-call/src/runtime.ts @@ -1,12 +1,21 @@ import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime"; import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime"; -import type { ResolvedRealtimeVoiceProvider } from "openclaw/plugin-sdk/realtime-voice"; +import { + consultRealtimeVoiceAgent, + REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME, + resolveRealtimeVoiceAgentConsultTools, + resolveRealtimeVoiceAgentConsultToolsAllow, + type RealtimeVoiceAgentConsultTranscriptEntry, + type RealtimeVoiceTool, + type ResolvedRealtimeVoiceProvider, +} from "openclaw/plugin-sdk/realtime-voice"; import type { VoiceCallConfig } from "./config.js"; import { resolveVoiceCallConfig, validateProviderConfig } from "./config.js"; import type { CoreAgentDeps, CoreConfig } from "./core-bridge.js"; import { CallManager } from "./manager.js"; import type { VoiceCallProvider } from "./providers/base.js"; import type { TwilioProvider } from "./providers/twilio.js"; +import { resolveVoiceResponseModel } from "./response-model.js"; import type { TelephonyTtsRuntime } from "./telephony-tts.js"; import { createTelephonyTtsProvider } from "./telephony-tts.js"; import { startTunnel, type TunnelResult } from "./tunnel.js"; @@ -76,6 +85,43 @@ function loadRealtimeHandler(): Promise { return realtimeHandlerPromise; } +function resolveRealtimeTools(config: VoiceCallConfig): RealtimeVoiceTool[] { + const tools = new Map(); + for (const tool of resolveRealtimeVoiceAgentConsultTools(config.realtime.toolPolicy)) { + tools.set(tool.name, tool); + } + for (const tool of config.realtime.tools) { + if (!tools.has(tool.name)) { + tools.set(tool.name, tool); + } + } + return [...tools.values()]; +} + +function resolveVoiceCallConsultSessionKey(call: { + sessionKey?: string; + from?: string; + to?: string; + direction?: "inbound" | "outbound"; + callId: string; +}): string { + if (call.sessionKey) { + return call.sessionKey; + } + const phone = call.direction === "outbound" ? call.to : call.from; + const normalizedPhone = phone?.replace(/\D/g, ""); + return normalizedPhone ? `voice:${normalizedPhone}` : `voice:${call.callId}`; +} + +function mapVoiceCallConsultTranscript(call: { + transcript?: Array<{ speaker: "user" | "bot"; text: string }>; +}): RealtimeVoiceAgentConsultTranscriptEntry[] { + return (call.transcript ?? []).map((entry) => ({ + role: entry.speaker === "bot" ? "assistant" : "user", + text: entry.text, + })); +} + function createRuntimeResourceLifecycle(params: { config: VoiceCallConfig; webhookServer: VoiceCallWebhookServer; @@ -215,6 +261,7 @@ export async function createVoiceCallRuntime(params: { }; const config = resolveVoiceCallConfig(rawConfig); + const cfg = fullConfig ?? (coreConfig as OpenClawConfig); if (!config.enabled) { throw new Error("Voice call disabled. Enable the plugin entry in config."); @@ -236,7 +283,7 @@ export async function createVoiceCallRuntime(params: { const realtimeProvider = config.realtime.enabled ? await resolveRealtimeProvider({ config, - fullConfig: fullConfig ?? (coreConfig as OpenClawConfig), + fullConfig: cfg, }) : null; const webhookServer = new VoiceCallWebhookServer( @@ -249,16 +296,61 @@ export async function createVoiceCallRuntime(params: { ); if (realtimeProvider) { const { RealtimeCallHandler } = await loadRealtimeHandler(); - webhookServer.setRealtimeHandler( - new RealtimeCallHandler( - config.realtime, - manager, - provider, - realtimeProvider.provider, - realtimeProvider.providerConfig, - config.serve.path, - ), + const realtimeConfig = { + ...config.realtime, + tools: resolveRealtimeTools(config), + }; + const realtimeHandler = new RealtimeCallHandler( + realtimeConfig, + manager, + provider, + realtimeProvider.provider, + realtimeProvider.providerConfig, + config.serve.path, ); + if (config.realtime.toolPolicy !== "none") { + realtimeHandler.registerToolHandler( + REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME, + async (args, callId) => { + const call = manager.getCall(callId); + if (!call) { + return { error: `Call "${callId}" not found` }; + } + const { provider: agentProvider, model } = resolveVoiceResponseModel({ + voiceConfig: config, + agentRuntime, + }); + const thinkLevel = agentRuntime.resolveThinkingDefault({ + cfg, + provider: agentProvider, + model, + }); + return await consultRealtimeVoiceAgent({ + cfg, + agentRuntime, + logger: log, + sessionKey: resolveVoiceCallConsultSessionKey(call), + messageProvider: "voice", + lane: "voice", + runIdPrefix: `voice-realtime-consult:${callId}`, + args, + transcript: mapVoiceCallConsultTranscript(call), + surface: "a live phone call", + userLabel: "Caller", + assistantLabel: "Agent", + questionSourceLabel: "caller", + provider: agentProvider, + model, + thinkLevel, + timeoutMs: config.responseTimeoutMs, + toolsAllow: resolveRealtimeVoiceAgentConsultToolsAllow(config.realtime.toolPolicy), + extraSystemPrompt: + "You are a behind-the-scenes consultant for a live phone voice agent. Be accurate, brief, and speakable.", + }); + }, + ); + } + webhookServer.setRealtimeHandler(realtimeHandler); } const lifecycle = createRuntimeResourceLifecycle({ config, webhookServer }); diff --git a/extensions/voice-call/src/test-fixtures.ts b/extensions/voice-call/src/test-fixtures.ts index 50305f7b693..da4e0965a7f 100644 --- a/extensions/voice-call/src/test-fixtures.ts +++ b/extensions/voice-call/src/test-fixtures.ts @@ -46,6 +46,9 @@ export function createVoiceCallBaseConfig(params?: { realtime: { enabled: false, streamPath: "/voice/stream/realtime", + instructions: + "You are OpenClaw's phone-call realtime voice interface. Keep spoken replies brief and natural. When a question needs deeper reasoning, current information, or tools, call openclaw_agent_consult before answering.", + toolPolicy: "safe-read-only", tools: [], providers: {}, }, diff --git a/extensions/voice-call/src/webhook.test.ts b/extensions/voice-call/src/webhook.test.ts index e97a63786af..1c3ee15df7f 100644 --- a/extensions/voice-call/src/webhook.test.ts +++ b/extensions/voice-call/src/webhook.test.ts @@ -571,6 +571,8 @@ describe("VoiceCallWebhookServer replay handling", () => { realtime: { enabled: true, streamPath: "/voice/stream/realtime", + instructions: "Be helpful.", + toolPolicy: "safe-read-only", tools: [], providers: {}, }, @@ -628,6 +630,8 @@ describe("VoiceCallWebhookServer replay handling", () => { realtime: { enabled: true, streamPath: "/voice/stream/realtime", + instructions: "Be helpful.", + toolPolicy: "safe-read-only", tools: [], providers: {}, }, @@ -680,6 +684,8 @@ describe("VoiceCallWebhookServer replay handling", () => { realtime: { enabled: true, streamPath: "/voice/stream/realtime", + instructions: "Be helpful.", + toolPolicy: "safe-read-only", tools: [], providers: {}, }, @@ -730,6 +736,8 @@ describe("VoiceCallWebhookServer replay handling", () => { realtime: { enabled: true, streamPath: "/voice/stream/realtime", + instructions: "Be helpful.", + toolPolicy: "safe-read-only", tools: [], providers: {}, }, diff --git a/extensions/voice-call/src/webhook/realtime-handler.test.ts b/extensions/voice-call/src/webhook/realtime-handler.test.ts index 7593992843c..730bf5ef18a 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.test.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.test.ts @@ -8,6 +8,7 @@ import { WebSocket } from "ws"; import type { VoiceCallRealtimeConfig } from "../config.js"; import type { CallManager } from "../manager.js"; import type { VoiceCallProvider } from "../providers/base.js"; +import type { CallRecord } from "../types.js"; import { connectWs, startUpgradeWsServer, waitForClose } from "../websocket-test-support.js"; import { RealtimeCallHandler } from "./realtime-handler.js"; @@ -33,7 +34,7 @@ function makeBridge(): RealtimeVoiceBridge { } function makeRealtimeProvider( - createBridge: () => RealtimeVoiceBridge, + createBridge: RealtimeVoiceProviderPlugin["createBridge"], ): RealtimeVoiceProviderPlugin { return { id: "openai", @@ -51,15 +52,17 @@ function makeHandler( realtimeProvider?: RealtimeVoiceProviderPlugin; }, ) { + const config: VoiceCallRealtimeConfig = { + enabled: true, + streamPath: overrides?.streamPath ?? "/voice/stream/realtime", + instructions: overrides?.instructions ?? "Be helpful.", + toolPolicy: overrides?.toolPolicy ?? "safe-read-only", + tools: overrides?.tools ?? [], + providers: overrides?.providers ?? {}, + ...(overrides?.provider ? { provider: overrides.provider } : {}), + }; return new RealtimeCallHandler( - { - enabled: true, - streamPath: "/voice/stream/realtime", - instructions: "Be helpful.", - tools: [], - providers: {}, - ...overrides, - }, + config, { processEvent: vi.fn(), getCallByProviderCallId: vi.fn(), @@ -124,6 +127,91 @@ describe("RealtimeCallHandler path routing", () => { /wss:\/\/public\.example\/api\/custom\/stream\/realtime\/[0-9a-f-]{36}/, ); }); + + it("normalizes Twilio outbound realtime directions", async () => { + let callbacks: + | { + onReady?: () => void; + } + | undefined; + const createBridge = vi.fn( + (request: Parameters[0]) => { + callbacks = request; + return makeBridge(); + }, + ); + const processEvent = vi.fn(); + const getCallByProviderCallId = vi.fn( + (): CallRecord => ({ + callId: "call-1", + providerCallId: "CA-outbound", + provider: "twilio", + direction: "outbound", + state: "ringing", + from: "+15550001234", + to: "+15550009999", + startedAt: Date.now(), + transcript: [], + processedEventIds: [], + metadata: {}, + }), + ); + const handler = makeHandler(undefined, { + manager: { + processEvent, + getCallByProviderCallId, + }, + realtimeProvider: makeRealtimeProvider(createBridge), + }); + const payload = handler.buildTwiMLPayload( + makeRequest("/voice/webhook"), + new URLSearchParams({ + Direction: "outbound-dial", + From: "+15550001234", + To: "+15550009999", + }), + ); + const match = payload.body.match(/wss:\/\/[^/]+(\/[^"]+)/); + if (!match) { + throw new Error("Failed to extract realtime stream path"); + } + const server = await startUpgradeWsServer({ + urlPath: match[1], + onUpgrade: (request, socket, head) => { + handler.handleWebSocketUpgrade(request, socket, head); + }, + }); + + try { + const ws = await connectWs(server.url); + try { + ws.send( + JSON.stringify({ + event: "start", + start: { streamSid: "MZ-outbound", callSid: "CA-outbound" }, + }), + ); + await vi.waitFor(() => { + expect(createBridge).toHaveBeenCalled(); + }); + callbacks?.onReady?.(); + expect(processEvent).toHaveBeenCalledWith( + expect.objectContaining({ + type: "call.initiated", + direction: "outbound", + from: "+15550001234", + to: "+15550009999", + }), + ); + } finally { + if (ws.readyState !== WebSocket.CLOSED && ws.readyState !== WebSocket.CLOSING) { + ws.close(); + } + } + } finally { + await server.close(); + } + }); }); describe("RealtimeCallHandler websocket hardening", () => { diff --git a/extensions/voice-call/src/webhook/realtime-handler.ts b/extensions/voice-call/src/webhook/realtime-handler.ts index 947a8b86e0b..d25122f4a3c 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.ts @@ -101,7 +101,7 @@ export class RealtimeCallHandler { const token = this.issueStreamToken({ from: params?.get("From") ?? undefined, to: params?.get("To") ?? undefined, - direction: rawDirection === "outbound-api" ? "outbound" : "inbound", + direction: rawDirection?.startsWith("outbound") ? "outbound" : "inbound", }); const wsUrl = `wss://${host}${this.getStreamPathPattern()}/${token}`; const twiml = ` diff --git a/src/plugin-sdk/realtime-voice.ts b/src/plugin-sdk/realtime-voice.ts index 45ef379b472..a8fc1d273d8 100644 --- a/src/plugin-sdk/realtime-voice.ts +++ b/src/plugin-sdk/realtime-voice.ts @@ -15,9 +15,23 @@ export type { RealtimeVoiceToolCallEvent, } from "../realtime-voice/provider-types.js"; export { + buildRealtimeVoiceAgentConsultChatMessage, + buildRealtimeVoiceAgentConsultPrompt, + collectRealtimeVoiceAgentConsultVisibleText, + parseRealtimeVoiceAgentConsultArgs, REALTIME_VOICE_AGENT_CONSULT_TOOL, REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME, + type RealtimeVoiceAgentConsultArgs, + type RealtimeVoiceAgentConsultToolPolicy, + type RealtimeVoiceAgentConsultTranscriptEntry, } from "../realtime-voice/agent-consult-tool.js"; +export { + consultRealtimeVoiceAgent, + resolveRealtimeVoiceAgentConsultTools, + resolveRealtimeVoiceAgentConsultToolsAllow, + type RealtimeVoiceAgentConsultResult, + type RealtimeVoiceAgentConsultRuntime, +} from "../realtime-voice/agent-consult-runtime.js"; export { canonicalizeRealtimeVoiceProviderId, getRealtimeVoiceProvider, diff --git a/src/realtime-voice/agent-consult-runtime.test.ts b/src/realtime-voice/agent-consult-runtime.test.ts new file mode 100644 index 00000000000..e6e3e2e98ff --- /dev/null +++ b/src/realtime-voice/agent-consult-runtime.test.ts @@ -0,0 +1,116 @@ +import { describe, expect, it, vi } from "vitest"; +import { + consultRealtimeVoiceAgent, + resolveRealtimeVoiceAgentConsultTools, + resolveRealtimeVoiceAgentConsultToolsAllow, +} from "./agent-consult-runtime.js"; +import { REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME } from "./agent-consult-tool.js"; + +function createAgentRuntime(payloads: unknown[] = [{ text: "Speak this." }]) { + const sessionStore: Record = {}; + const runEmbeddedPiAgent = vi.fn(async () => ({ + payloads, + meta: {}, + })); + return { + runtime: { + resolveAgentDir: vi.fn(() => "/tmp/agent"), + resolveAgentWorkspaceDir: vi.fn(() => "/tmp/workspace"), + ensureAgentWorkspace: vi.fn(async () => {}), + resolveAgentTimeoutMs: vi.fn(() => 30_000), + session: { + resolveStorePath: vi.fn(() => "/tmp/sessions.json"), + loadSessionStore: vi.fn(() => sessionStore), + saveSessionStore: vi.fn(async () => {}), + resolveSessionFilePath: vi.fn(() => "/tmp/session.json"), + }, + runEmbeddedPiAgent, + }, + runEmbeddedPiAgent, + sessionStore, + }; +} + +describe("realtime voice agent consult runtime", () => { + it("exposes the shared consult tool based on policy", () => { + expect(resolveRealtimeVoiceAgentConsultTools("safe-read-only")).toEqual([ + expect.objectContaining({ name: REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME }), + ]); + expect(resolveRealtimeVoiceAgentConsultTools("none")).toEqual([]); + expect(resolveRealtimeVoiceAgentConsultToolsAllow("safe-read-only")).toEqual([ + "read", + "web_search", + "web_fetch", + "x_search", + "memory_search", + "memory_get", + ]); + expect(resolveRealtimeVoiceAgentConsultToolsAllow("owner")).toBeUndefined(); + expect(resolveRealtimeVoiceAgentConsultToolsAllow("none")).toEqual([]); + }); + + it("runs an embedded agent using the shared session and prompt contract", async () => { + const { runtime, runEmbeddedPiAgent, sessionStore } = createAgentRuntime(); + + const result = await consultRealtimeVoiceAgent({ + cfg: {} as never, + agentRuntime: runtime as never, + logger: { warn: vi.fn() }, + sessionKey: "voice:15550001234", + messageProvider: "voice", + lane: "voice", + runIdPrefix: "voice-realtime-consult:call-1", + args: { question: "What should I say?", context: "Caller asked about PR #123." }, + transcript: [{ role: "user", text: "Can you check this?" }], + surface: "a live phone call", + userLabel: "Caller", + questionSourceLabel: "caller", + toolsAllow: ["read"], + provider: "openai", + model: "gpt-5.4", + thinkLevel: "high", + timeoutMs: 10_000, + }); + + expect(result).toEqual({ text: "Speak this." }); + expect(sessionStore["voice:15550001234"]?.sessionId).toBeTruthy(); + expect(runEmbeddedPiAgent).toHaveBeenCalledWith( + expect.objectContaining({ + sessionKey: "voice:15550001234", + messageProvider: "voice", + lane: "voice", + toolsAllow: ["read"], + provider: "openai", + model: "gpt-5.4", + thinkLevel: "high", + timeoutMs: 10_000, + prompt: expect.stringContaining("Caller: Can you check this?"), + }), + ); + }); + + it("returns a speakable fallback when the embedded agent has no visible text", async () => { + const warn = vi.fn(); + const { runtime } = createAgentRuntime([{ text: "hidden", isReasoning: true }]); + + const result = await consultRealtimeVoiceAgent({ + cfg: {} as never, + agentRuntime: runtime as never, + logger: { warn }, + sessionKey: "google-meet:meet-1", + messageProvider: "google-meet", + lane: "google-meet", + runIdPrefix: "google-meet:meet-1", + args: { question: "What now?" }, + transcript: [], + surface: "a private Google Meet", + userLabel: "Participant", + fallbackText: "Let me verify that first.", + }); + + expect(result).toEqual({ text: "Let me verify that first." }); + expect(warn).toHaveBeenCalledWith( + "[realtime-voice] agent consult produced no answer: agent returned no speakable text", + ); + }); +}); diff --git a/src/realtime-voice/agent-consult-runtime.ts b/src/realtime-voice/agent-consult-runtime.ts new file mode 100644 index 00000000000..cbe18b02017 --- /dev/null +++ b/src/realtime-voice/agent-consult-runtime.ts @@ -0,0 +1,127 @@ +import { randomUUID } from "node:crypto"; +import type { RunEmbeddedPiAgentParams } from "../agents/pi-embedded-runner/run/params.js"; +import type { OpenClawConfig } from "../config/types.openclaw.js"; +import type { RuntimeLogger, PluginRuntimeCore } from "../plugins/runtime/types-core.js"; +import { + buildRealtimeVoiceAgentConsultPrompt, + collectRealtimeVoiceAgentConsultVisibleText, + REALTIME_VOICE_AGENT_CONSULT_TOOL, + type RealtimeVoiceAgentConsultToolPolicy, + type RealtimeVoiceAgentConsultTranscriptEntry, +} from "./agent-consult-tool.js"; +import type { RealtimeVoiceTool } from "./provider-types.js"; + +export type RealtimeVoiceAgentConsultRuntime = PluginRuntimeCore["agent"]; +export type RealtimeVoiceAgentConsultResult = { text: string }; + +const SAFE_READ_ONLY_TOOLS = [ + "read", + "web_search", + "web_fetch", + "x_search", + "memory_search", + "memory_get", +] as const; + +export function resolveRealtimeVoiceAgentConsultTools( + policy: RealtimeVoiceAgentConsultToolPolicy, +): RealtimeVoiceTool[] { + return policy === "none" ? [] : [REALTIME_VOICE_AGENT_CONSULT_TOOL]; +} + +export function resolveRealtimeVoiceAgentConsultToolsAllow( + policy: RealtimeVoiceAgentConsultToolPolicy, +): string[] | undefined { + if (policy === "owner") { + return undefined; + } + if (policy === "safe-read-only") { + return [...SAFE_READ_ONLY_TOOLS]; + } + return []; +} + +export async function consultRealtimeVoiceAgent(params: { + cfg: OpenClawConfig; + agentRuntime: RealtimeVoiceAgentConsultRuntime; + logger: Pick; + sessionKey: string; + messageProvider: string; + lane: string; + runIdPrefix: string; + args: unknown; + transcript: RealtimeVoiceAgentConsultTranscriptEntry[]; + surface: string; + userLabel: string; + assistantLabel?: string; + questionSourceLabel?: string; + agentId?: string; + provider?: RunEmbeddedPiAgentParams["provider"]; + model?: RunEmbeddedPiAgentParams["model"]; + thinkLevel?: RunEmbeddedPiAgentParams["thinkLevel"]; + timeoutMs?: number; + toolsAllow?: string[]; + extraSystemPrompt?: string; + fallbackText?: string; +}): Promise { + const agentId = params.agentId ?? "main"; + const agentDir = params.agentRuntime.resolveAgentDir(params.cfg, agentId); + const workspaceDir = params.agentRuntime.resolveAgentWorkspaceDir(params.cfg, agentId); + await params.agentRuntime.ensureAgentWorkspace({ dir: workspaceDir }); + + const storePath = params.agentRuntime.session.resolveStorePath(params.cfg.session?.store, { + agentId, + }); + const sessionStore = params.agentRuntime.session.loadSessionStore(storePath); + const now = Date.now(); + const existing = sessionStore[params.sessionKey] as + | { sessionId?: string; updatedAt?: number } + | undefined; + const sessionId = existing?.sessionId?.trim() || randomUUID(); + sessionStore[params.sessionKey] = { ...existing, sessionId, updatedAt: now }; + await params.agentRuntime.session.saveSessionStore(storePath, sessionStore); + + const sessionFile = params.agentRuntime.session.resolveSessionFilePath( + sessionId, + sessionStore[params.sessionKey], + { agentId }, + ); + const result = await params.agentRuntime.runEmbeddedPiAgent({ + sessionId, + sessionKey: params.sessionKey, + messageProvider: params.messageProvider, + sessionFile, + workspaceDir, + config: params.cfg, + prompt: buildRealtimeVoiceAgentConsultPrompt({ + args: params.args, + transcript: params.transcript, + surface: params.surface, + userLabel: params.userLabel, + assistantLabel: params.assistantLabel, + questionSourceLabel: params.questionSourceLabel, + }), + provider: params.provider, + model: params.model, + thinkLevel: params.thinkLevel ?? "high", + verboseLevel: "off", + reasoningLevel: "off", + toolResultFormat: "plain", + toolsAllow: params.toolsAllow, + timeoutMs: params.timeoutMs ?? params.agentRuntime.resolveAgentTimeoutMs({ cfg: params.cfg }), + runId: `${params.runIdPrefix}:${Date.now()}`, + lane: params.lane, + extraSystemPrompt: + params.extraSystemPrompt ?? + "You are a behind-the-scenes consultant for a live voice agent. Be accurate, brief, and speakable.", + agentDir, + }); + + const text = collectRealtimeVoiceAgentConsultVisibleText(result.payloads ?? []); + if (!text) { + const reason = result.meta?.aborted ? "agent run aborted" : "agent returned no speakable text"; + params.logger.warn(`[realtime-voice] agent consult produced no answer: ${reason}`); + return { text: params.fallbackText ?? "I need a moment to verify that before answering." }; + } + return { text }; +} diff --git a/src/realtime-voice/agent-consult-tool.test.ts b/src/realtime-voice/agent-consult-tool.test.ts new file mode 100644 index 00000000000..0b8c62d6d05 --- /dev/null +++ b/src/realtime-voice/agent-consult-tool.test.ts @@ -0,0 +1,55 @@ +import { describe, expect, it } from "vitest"; +import { + buildRealtimeVoiceAgentConsultChatMessage, + buildRealtimeVoiceAgentConsultPrompt, + collectRealtimeVoiceAgentConsultVisibleText, + parseRealtimeVoiceAgentConsultArgs, +} from "./agent-consult-tool.js"; + +describe("realtime voice agent consult tool", () => { + it("normalizes shared tool arguments for browser chat forwarding", () => { + expect( + buildRealtimeVoiceAgentConsultChatMessage({ + question: " What changed? ", + context: " PR #123 ", + responseStyle: " concise ", + }), + ).toBe("What changed?\n\nContext:\nPR #123\n\nSpoken style:\nconcise"); + }); + + it("requires a non-empty question", () => { + expect(() => parseRealtimeVoiceAgentConsultArgs({ context: "missing" })).toThrow( + "question required", + ); + }); + + it("builds a reusable spoken consultant prompt with recent transcript", () => { + const prompt = buildRealtimeVoiceAgentConsultPrompt({ + args: { question: "Do we support realtime tools?" }, + transcript: [ + { role: "user", text: "Can you check the repo?" }, + { role: "assistant", text: "I'll verify." }, + ], + surface: "a private Google Meet", + userLabel: "Participant", + assistantLabel: "Agent", + questionSourceLabel: "participant", + }); + + expect(prompt).toContain("during a private Google Meet"); + expect(prompt).toContain("Participant: Can you check the repo?"); + expect(prompt).toContain("Agent: I'll verify."); + expect(prompt).toContain("Question:\nDo we support realtime tools?"); + }); + + it("filters reasoning and error payloads from visible consult output", () => { + expect( + collectRealtimeVoiceAgentConsultVisibleText([ + { text: "thinking", isReasoning: true }, + { text: "first" }, + { text: "error", isError: true }, + { text: "second" }, + ]), + ).toBe("first\n\nsecond"); + }); +}); diff --git a/src/realtime-voice/agent-consult-tool.ts b/src/realtime-voice/agent-consult-tool.ts index bdafb384581..1772fed1ea7 100644 --- a/src/realtime-voice/agent-consult-tool.ts +++ b/src/realtime-voice/agent-consult-tool.ts @@ -1,6 +1,17 @@ +import { normalizeOptionalString } from "../shared/string-coerce.js"; import type { RealtimeVoiceTool } from "./provider-types.js"; export const REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME = "openclaw_agent_consult"; +export type RealtimeVoiceAgentConsultToolPolicy = "safe-read-only" | "owner" | "none"; +export type RealtimeVoiceAgentConsultArgs = { + question: string; + context?: string; + responseStyle?: string; +}; +export type RealtimeVoiceAgentConsultTranscriptEntry = { + role: "user" | "assistant"; + text: string; +}; export const REALTIME_VOICE_AGENT_CONSULT_TOOL: RealtimeVoiceTool = { type: "function", @@ -26,3 +37,81 @@ export const REALTIME_VOICE_AGENT_CONSULT_TOOL: RealtimeVoiceTool = { required: ["question"], }, }; + +export function parseRealtimeVoiceAgentConsultArgs(args: unknown): RealtimeVoiceAgentConsultArgs { + const question = readConsultStringArg(args, "question"); + if (!question) { + throw new Error("question required"); + } + return { + question, + context: readConsultStringArg(args, "context"), + responseStyle: readConsultStringArg(args, "responseStyle"), + }; +} + +export function buildRealtimeVoiceAgentConsultChatMessage(args: unknown): string { + const parsed = parseRealtimeVoiceAgentConsultArgs(args); + return [ + parsed.question, + parsed.context ? `Context:\n${parsed.context}` : undefined, + parsed.responseStyle ? `Spoken style:\n${parsed.responseStyle}` : undefined, + ] + .filter(Boolean) + .join("\n\n"); +} + +export function buildRealtimeVoiceAgentConsultPrompt(params: { + args: unknown; + transcript: RealtimeVoiceAgentConsultTranscriptEntry[]; + surface: string; + userLabel: string; + assistantLabel?: string; + questionSourceLabel?: string; +}): string { + const parsed = parseRealtimeVoiceAgentConsultArgs(params.args); + const assistantLabel = params.assistantLabel ?? "Agent"; + const questionSourceLabel = params.questionSourceLabel ?? params.userLabel.toLowerCase(); + const transcript = params.transcript + .slice(-12) + .map( + (entry) => `${entry.role === "assistant" ? assistantLabel : params.userLabel}: ${entry.text}`, + ) + .join("\n"); + + return [ + `You are helping an OpenClaw realtime voice agent during ${params.surface}.`, + `Answer the ${questionSourceLabel}'s question with the strongest useful reasoning and available tools.`, + "Return only the concise answer the realtime voice agent should speak next.", + "Do not include markdown, citations unless needed, tool logs, or private reasoning.", + parsed.responseStyle ? `Spoken style: ${parsed.responseStyle}` : undefined, + transcript ? `Recent transcript:\n${transcript}` : undefined, + parsed.context ? `Additional context:\n${parsed.context}` : undefined, + `Question:\n${parsed.question}`, + ] + .filter(Boolean) + .join("\n\n"); +} + +export function collectRealtimeVoiceAgentConsultVisibleText( + payloads: Array<{ text?: unknown; isError?: boolean; isReasoning?: boolean }>, +): string | null { + const chunks: string[] = []; + for (const payload of payloads) { + if (payload.isError || payload.isReasoning) { + continue; + } + const text = normalizeOptionalString(payload.text); + if (text) { + chunks.push(text); + } + } + return chunks.length > 0 ? chunks.join("\n\n").trim() : null; +} + +function readConsultStringArg(args: unknown, key: string): string | undefined { + if (!args || typeof args !== "object" || Array.isArray(args)) { + return undefined; + } + return normalizeOptionalString((args as Record)[key]); +} diff --git a/ui/src/ui/chat/realtime-talk.ts b/ui/src/ui/chat/realtime-talk.ts index bb980bf2775..61fcef9af66 100644 --- a/ui/src/ui/chat/realtime-talk.ts +++ b/ui/src/ui/chat/realtime-talk.ts @@ -1,4 +1,7 @@ -import { REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME } from "../../../../src/realtime-voice/agent-consult-tool.js"; +import { + buildRealtimeVoiceAgentConsultChatMessage, + REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME, +} from "../../../../src/realtime-voice/agent-consult-tool.js"; import type { GatewayBrowserClient, GatewayEventFrame } from "../gateway.ts"; import { generateUUID } from "../uuid.ts"; @@ -239,23 +242,9 @@ export class RealtimeTalkSession { this.callbacks.onStatus?.("thinking"); let question = ""; try { - const args = JSON.parse(buffered?.args || event.arguments || "{}") as { - question?: unknown; - context?: unknown; - responseStyle?: unknown; - }; - question = typeof args.question === "string" ? args.question.trim() : ""; - const context = typeof args.context === "string" ? args.context.trim() : ""; - const responseStyle = typeof args.responseStyle === "string" ? args.responseStyle.trim() : ""; - if (context || responseStyle) { - question = [ - question, - context ? `Context:\n${context}` : undefined, - responseStyle ? `Spoken style:\n${responseStyle}` : undefined, - ] - .filter(Boolean) - .join("\n\n"); - } + question = buildRealtimeVoiceAgentConsultChatMessage( + JSON.parse(buffered?.args || event.arguments || "{}"), + ); } catch {} if (!question) { this.submitToolResult(callId, {