diff --git a/CHANGELOG.md b/CHANGELOG.md index e4a92c99323..d1dadf698de 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -85,6 +85,7 @@ Docs: https://docs.openclaw.ai - Google Meet: clear queued Gemini Live playback when realtime interruptions arrive, restart Chrome command-pair audio output after clears, and expose Google Live interruption/VAD config knobs for Meet and Voice Call realtime bridges. Fixes #72523. (#72524) Thanks @BsnizND. - Google Meet: add `realtime.agentId` so live meeting consults can target a named OpenClaw agent instead of always using `main`. (#72381) Thanks @BsnizND. - Google Meet: route stateful `google_meet` tool actions through the gateway-owned runtime so created or joined realtime sessions remain visible to status, speak, and leave after the agent turn ends. Fixes #72440. (#72441) Thanks @BsnizND. +- Google Meet/Voice Call: send Gemini Live a non-blocking consult continuation before long OpenClaw agent consults finish, then deliver the final result when idle so calls and meetings do not sit silent during tool-backed answers. (#72189) Thanks @VACInc. - Google Meet: preserve Gemini Live function names when replying to realtime tool calls so Google SDK validation accepts the `FunctionResponse` payload. Fixes #72425. (#72426) Thanks @BsnizND. - Matrix/E2EE: stabilize recovery and broken-device QA flows while avoiding Matrix device-cleanup sync races that could leave shutdown-time crypto work running. Thanks @gumadeiras. - Cron: apply `cron.maxConcurrentRuns` to a dedicated `cron-nested` isolated agent-turn lane as well as cron dispatch, so parallel cron jobs no longer serialize on inner LLM execution while non-cron nested flows keep their existing lane behavior. Fixes #72707. Thanks @kagura-agent. diff --git a/docs/plugins/sdk-provider-plugins.md b/docs/plugins/sdk-provider-plugins.md index c88761bd3ca..ee2fa9f59cb 100644 --- a/docs/plugins/sdk-provider-plugins.md +++ b/docs/plugins/sdk-provider-plugins.md @@ -585,6 +585,10 @@ API key auth, and dynamic model resolution. label: "Acme Realtime Voice", isConfigured: ({ providerConfig }) => Boolean(providerConfig.apiKey), createBridge: (req) => ({ + // Set this only if the provider accepts multiple tool responses for + // one call, for example an immediate "working" response followed by + // the final result. + supportsToolResultContinuation: false, connect: async () => {}, sendAudio: () => {}, setMediaTimestamp: () => {}, diff --git a/extensions/google-meet/index.test.ts b/extensions/google-meet/index.test.ts index 55c46e77425..ac6e6d77e16 100644 --- a/extensions/google-meet/index.test.ts +++ b/extensions/google-meet/index.test.ts @@ -1944,6 +1944,7 @@ describe("google-meet plugin", () => { | undefined; const sendAudio = vi.fn(); const bridge = { + supportsToolResultContinuation: true, connect: vi.fn(async () => {}), sendAudio, setMediaTimestamp: vi.fn(), @@ -2048,6 +2049,15 @@ describe("google-meet plugin", () => { name: "openclaw_agent_consult", args: { question: "What should I say about launch timing?" }, }); + expect(bridge.submitToolResult).toHaveBeenNthCalledWith( + 1, + "tool-call-1", + expect.objectContaining({ + status: "working", + tool: "openclaw_agent_consult", + }), + { willContinue: true }, + ); expect(spawnMock).toHaveBeenNthCalledWith(1, "play-meet", [], { stdio: ["pipe", "ignore", "pipe"], @@ -2082,9 +2092,13 @@ describe("google-meet plugin", () => { ], }); await vi.waitFor(() => { - expect(bridge.submitToolResult).toHaveBeenCalledWith("tool-call-1", { - text: "Use the Portugal launch data.", - }); + expect(bridge.submitToolResult).toHaveBeenLastCalledWith( + "tool-call-1", + { + text: "Use the Portugal launch data.", + }, + undefined, + ); }); expect(runtime.agent.runEmbeddedPiAgent).toHaveBeenCalledWith( expect.objectContaining({ @@ -2121,6 +2135,7 @@ describe("google-meet plugin", () => { | undefined; const sendAudio = vi.fn(); const bridge = { + supportsToolResultContinuation: true, connect: vi.fn(async () => {}), sendAudio, setMediaTimestamp: vi.fn(), @@ -2196,6 +2211,15 @@ describe("google-meet plugin", () => { name: "openclaw_agent_consult", args: { question: "What should I say?" }, }); + expect(bridge.submitToolResult).toHaveBeenNthCalledWith( + 1, + "tool-call-1", + expect.objectContaining({ + status: "working", + tool: "openclaw_agent_consult", + }), + { willContinue: true }, + ); await vi.waitFor(() => { expect(sendAudio).toHaveBeenCalledWith(Buffer.from([9, 8, 7])); @@ -2227,9 +2251,13 @@ describe("google-meet plugin", () => { ); }); await vi.waitFor(() => { - expect(bridge.submitToolResult).toHaveBeenCalledWith("tool-call-1", { - text: "Use the launch update.", - }); + expect(bridge.submitToolResult).toHaveBeenLastCalledWith( + "tool-call-1", + { + text: "Use the launch update.", + }, + undefined, + ); }); expect(bridge.triggerGreeting).not.toHaveBeenCalled(); handle.speak("Say exactly: hello from the node."); diff --git a/extensions/google-meet/src/agent-consult.ts b/extensions/google-meet/src/agent-consult.ts index 80b3e78b023..e536563efa8 100644 --- a/extensions/google-meet/src/agent-consult.ts +++ b/extensions/google-meet/src/agent-consult.ts @@ -1,11 +1,13 @@ import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime"; import type { PluginRuntime, RuntimeLogger } from "openclaw/plugin-sdk/plugin-runtime"; import { + buildRealtimeVoiceAgentConsultWorkingResponse, consultRealtimeVoiceAgent, REALTIME_VOICE_AGENT_CONSULT_TOOL, REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME, resolveRealtimeVoiceAgentConsultTools, resolveRealtimeVoiceAgentConsultToolsAllow, + type RealtimeVoiceBridgeSession, type RealtimeVoiceTool, } from "openclaw/plugin-sdk/realtime-voice"; import { normalizeAgentId } from "openclaw/plugin-sdk/routing"; @@ -14,10 +16,30 @@ import type { GoogleMeetConfig, GoogleMeetToolPolicy } from "./config.js"; export const GOOGLE_MEET_AGENT_CONSULT_TOOL_NAME = REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME; export const GOOGLE_MEET_AGENT_CONSULT_TOOL = REALTIME_VOICE_AGENT_CONSULT_TOOL; +const GOOGLE_MEET_CONSULT_SYSTEM_PROMPT = [ + "You are a behind-the-scenes consultant for a live meeting voice agent.", + "Prioritize a fast, speakable answer over exhaustive investigation.", + "For tool-backed status checks, prefer one or two bounded read-only queries before answering.", + "Do not print secret values or dump environment variables; only check whether required configuration is present.", + "Be accurate, brief, and speakable.", +].join(" "); + export function resolveGoogleMeetRealtimeTools(policy: GoogleMeetToolPolicy): RealtimeVoiceTool[] { return resolveRealtimeVoiceAgentConsultTools(policy); } +export function submitGoogleMeetConsultWorkingResponse( + session: RealtimeVoiceBridgeSession, + callId: string, +): void { + if (!session.bridge.supportsToolResultContinuation) { + return; + } + session.submitToolResult(callId, buildRealtimeVoiceAgentConsultWorkingResponse("participant"), { + willContinue: true, + }); +} + export async function consultOpenClawAgentForGoogleMeet(params: { config: GoogleMeetConfig; fullConfig: OpenClawConfig; @@ -45,7 +67,6 @@ export async function consultOpenClawAgentForGoogleMeet(params: { assistantLabel: "Agent", questionSourceLabel: "participant", toolsAllow: resolveRealtimeVoiceAgentConsultToolsAllow(params.config.realtime.toolPolicy), - extraSystemPrompt: - "You are a behind-the-scenes consultant for a live meeting voice agent. Be accurate, brief, and speakable.", + extraSystemPrompt: GOOGLE_MEET_CONSULT_SYSTEM_PROMPT, }); } diff --git a/extensions/google-meet/src/realtime-node.ts b/extensions/google-meet/src/realtime-node.ts index 1d77df3723d..ca84bff11c7 100644 --- a/extensions/google-meet/src/realtime-node.ts +++ b/extensions/google-meet/src/realtime-node.ts @@ -10,6 +10,7 @@ import { consultOpenClawAgentForGoogleMeet, GOOGLE_MEET_AGENT_CONSULT_TOOL_NAME, resolveGoogleMeetRealtimeTools, + submitGoogleMeetConsultWorkingResponse, } from "./agent-consult.js"; import type { GoogleMeetConfig } from "./config.js"; import { resolveGoogleMeetRealtimeProvider } from "./realtime.js"; @@ -157,6 +158,7 @@ export async function startNodeRealtimeAudioBridge(params: { }); return; } + submitGoogleMeetConsultWorkingResponse(session, event.callId || event.itemId); void consultOpenClawAgentForGoogleMeet({ config: params.config, fullConfig: params.fullConfig, diff --git a/extensions/google-meet/src/realtime.ts b/extensions/google-meet/src/realtime.ts index b40b820cc9c..5e3585e769f 100644 --- a/extensions/google-meet/src/realtime.ts +++ b/extensions/google-meet/src/realtime.ts @@ -14,6 +14,7 @@ import { consultOpenClawAgentForGoogleMeet, GOOGLE_MEET_AGENT_CONSULT_TOOL_NAME, resolveGoogleMeetRealtimeTools, + submitGoogleMeetConsultWorkingResponse, } from "./agent-consult.js"; import type { GoogleMeetConfig } from "./config.js"; import type { GoogleMeetChromeHealth } from "./transports/types.js"; @@ -216,6 +217,7 @@ export async function startCommandRealtimeAudioBridge(params: { }); return; } + submitGoogleMeetConsultWorkingResponse(session, event.callId || event.itemId); void consultOpenClawAgentForGoogleMeet({ config: params.config, fullConfig: params.fullConfig, diff --git a/extensions/google/realtime-voice-provider.test.ts b/extensions/google/realtime-voice-provider.test.ts index 1ffba7d1153..b481d8dc61f 100644 --- a/extensions/google/realtime-voice-provider.test.ts +++ b/extensions/google/realtime-voice-provider.test.ts @@ -131,6 +131,18 @@ describe("buildGoogleRealtimeVoiceProvider", () => { required: ["query"], }, }, + { + type: "function", + name: "openclaw_agent_consult", + description: "Ask OpenClaw", + parameters: { + type: "object", + properties: { + question: { type: "string" }, + }, + required: ["question"], + }, + }, ], onAudio: vi.fn(), onClearAudio: vi.fn(), @@ -175,6 +187,18 @@ describe("buildGoogleRealtimeVoiceProvider", () => { required: ["query"], }, }, + { + name: "openclaw_agent_consult", + description: "Ask OpenClaw", + parametersJsonSchema: { + type: "object", + properties: { + question: { type: "string" }, + }, + required: ["question"], + }, + behavior: "NON_BLOCKING", + }, ], }, ], @@ -392,6 +416,55 @@ describe("buildGoogleRealtimeVoiceProvider", () => { }); }); + it("keeps Google Live consult calls open after continuing tool responses", async () => { + const provider = buildGoogleRealtimeVoiceProvider(); + const bridge = provider.createBridge({ + providerConfig: { apiKey: "gemini-key" }, + onAudio: vi.fn(), + onClearAudio: vi.fn(), + onToolCall: vi.fn(), + }); + + await bridge.connect(); + lastConnectParams().callbacks.onmessage({ + setupComplete: { sessionId: "session-1" }, + toolCall: { + functionCalls: [ + { id: "consult-call", name: "openclaw_agent_consult", args: { prompt: "hi" } }, + ], + }, + }); + + bridge.submitToolResult( + "consult-call", + { status: "working", message: "Tell the participant you are checking." }, + { willContinue: true }, + ); + bridge.submitToolResult("consult-call", { text: "The meeting starts at 3." }); + + expect(session.sendToolResponse).toHaveBeenNthCalledWith(1, { + functionResponses: [ + { + id: "consult-call", + name: "openclaw_agent_consult", + scheduling: "WHEN_IDLE", + willContinue: true, + response: { status: "working", message: "Tell the participant you are checking." }, + }, + ], + }); + expect(session.sendToolResponse).toHaveBeenNthCalledWith(2, { + functionResponses: [ + { + id: "consult-call", + name: "openclaw_agent_consult", + scheduling: "WHEN_IDLE", + response: { text: "The meeting starts at 3." }, + }, + ], + }); + }); + it("does not send malformed Live API tool responses without a matching call name", async () => { const provider = buildGoogleRealtimeVoiceProvider(); const onError = vi.fn(); diff --git a/extensions/google/realtime-voice-provider.ts b/extensions/google/realtime-voice-provider.ts index feb73835570..29768f9d5bf 100644 --- a/extensions/google/realtime-voice-provider.ts +++ b/extensions/google/realtime-voice-provider.ts @@ -1,7 +1,9 @@ import { randomUUID } from "node:crypto"; import { ActivityHandling, + Behavior, EndSensitivity, + FunctionResponseScheduling, Modality, StartSensitivity, TurnCoverage, @@ -20,8 +22,14 @@ import type { RealtimeVoiceProviderConfig, RealtimeVoiceProviderPlugin, RealtimeVoiceTool, + RealtimeVoiceToolResultOptions, +} from "openclaw/plugin-sdk/realtime-voice"; +import { + convertPcmToMulaw8k, + mulawToPcm, + REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME, + resamplePcm, } from "openclaw/plugin-sdk/realtime-voice"; -import { convertPcmToMulaw8k, mulawToPcm, resamplePcm } from "openclaw/plugin-sdk/realtime-voice"; import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input"; import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime"; import { createGoogleGenAI } from "./google-genai-runtime.js"; @@ -288,11 +296,17 @@ function buildRealtimeInputConfig( } function buildFunctionDeclarations(tools: RealtimeVoiceTool[] | undefined): FunctionDeclaration[] { - return (tools ?? []).map((tool) => ({ - name: tool.name, - description: tool.description, - parametersJsonSchema: tool.parameters, - })); + return (tools ?? []).map((tool) => { + const declaration: FunctionDeclaration = { + name: tool.name, + description: tool.description, + parametersJsonSchema: tool.parameters, + }; + if (tool.name === REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME) { + declaration.behavior = Behavior.NON_BLOCKING; + } + return declaration; + }); } function parsePcmSampleRate(mimeType: string | undefined): number { @@ -306,6 +320,8 @@ function isMulawSilence(audio: Buffer): boolean { } class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge { + readonly supportsToolResultContinuation = true; + private session: GoogleLiveSession | null = null; private connected = false; private sessionConfigured = false; @@ -448,7 +464,11 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge { this.sendUserMessage(greetingPrompt); } - submitToolResult(callId: string, result: unknown): void { + submitToolResult( + callId: string, + result: unknown, + options?: RealtimeVoiceToolResultOptions, + ): void { if (!this.session) { return; } @@ -462,19 +482,34 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge { return; } try { + const isConsultTool = name === REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME; + const functionResponse: FunctionResponse = { + id: callId, + name, + response: + result && typeof result === "object" && !Array.isArray(result) + ? (result as Record) + : { output: result }, + }; + if (isConsultTool) { + functionResponse.scheduling = FunctionResponseScheduling.WHEN_IDLE; + if (options?.willContinue === true) { + functionResponse.willContinue = true; + } + } else if (options?.willContinue === true) { + this.config.onError?.( + new Error( + `Google Live continuation is only supported for ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME}`, + ), + ); + return; + } this.session.sendToolResponse({ - functionResponses: [ - { - id: callId, - name, - response: - result && typeof result === "object" && !Array.isArray(result) - ? (result as Record) - : { output: result }, - }, - ], + functionResponses: [functionResponse], }); - this.pendingFunctionNames.delete(callId); + if (options?.willContinue !== true) { + this.pendingFunctionNames.delete(callId); + } } catch (error) { this.config.onError?.( error instanceof Error ? error : new Error("Failed to send Google Live function response"), diff --git a/extensions/voice-call/src/runtime.test.ts b/extensions/voice-call/src/runtime.test.ts index c169b7b89c4..d1e8280b4c3 100644 --- a/extensions/voice-call/src/runtime.test.ts +++ b/extensions/voice-call/src/runtime.test.ts @@ -326,6 +326,7 @@ describe("createVoiceCallRuntime lifecycle", () => { provider: "openai", model: "gpt-5.4", toolsAllow: ["read", "web_search", "web_fetch", "x_search", "memory_search", "memory_get"], + extraSystemPrompt: expect.stringContaining("one or two bounded read-only queries"), prompt: expect.stringContaining("Caller: Can you check shipment status?"), }), ); diff --git a/extensions/voice-call/src/runtime.ts b/extensions/voice-call/src/runtime.ts index 88cdf0f0151..5ba24172fd7 100644 --- a/extensions/voice-call/src/runtime.ts +++ b/extensions/voice-call/src/runtime.ts @@ -47,6 +47,14 @@ type MockProviderModule = typeof import("./providers/mock.js"); type RealtimeVoiceRuntimeModule = typeof import("./realtime-voice.runtime.js"); type RealtimeHandlerModule = typeof import("./webhook/realtime-handler.js"); +const REALTIME_VOICE_CONSULT_SYSTEM_PROMPT = [ + "You are a behind-the-scenes consultant for a live phone voice agent.", + "Prioritize a fast, speakable answer over exhaustive investigation.", + "For tool-backed status checks, prefer one or two bounded read-only queries before answering.", + "Do not print secret values or dump environment variables; only check whether required configuration is present.", + "Be accurate, brief, and speakable.", +].join(" "); + let telnyxProviderPromise: Promise | undefined; let twilioProviderPromise: Promise | undefined; let plivoProviderPromise: Promise | undefined; @@ -368,8 +376,7 @@ export async function createVoiceCallRuntime(params: { thinkLevel, timeoutMs: config.responseTimeoutMs, toolsAllow: resolveRealtimeVoiceAgentConsultToolsAllow(config.realtime.toolPolicy), - extraSystemPrompt: - "You are a behind-the-scenes consultant for a live phone voice agent. Be accurate, brief, and speakable.", + extraSystemPrompt: REALTIME_VOICE_CONSULT_SYSTEM_PROMPT, }); }, ); diff --git a/extensions/voice-call/src/webhook/realtime-handler.test.ts b/extensions/voice-call/src/webhook/realtime-handler.test.ts index 730bf5ef18a..fb3c176ea18 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.test.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.test.ts @@ -20,16 +20,17 @@ function makeRequest(url: string, host = "gateway.ts.net"): http.IncomingMessage return req; } -function makeBridge(): RealtimeVoiceBridge { +function makeBridge(overrides: Partial = {}): RealtimeVoiceBridge { return { connect: async () => {}, sendAudio: () => {}, setMediaTimestamp: () => {}, - submitToolResult: () => {}, + submitToolResult: vi.fn(), acknowledgeMark: () => {}, close: () => {}, isConnected: () => true, triggerGreeting: () => {}, + ...overrides, }; } @@ -212,6 +213,128 @@ describe("RealtimeCallHandler path routing", () => { await server.close(); } }); + + it("submits continuing responses only for realtime agent consult calls", async () => { + let callbacks: + | { + onToolCall?: (event: { + itemId: string; + callId: string; + name: string; + args: unknown; + }) => void; + } + | undefined; + let resolveConsult: ((value: unknown) => void) | undefined; + const submitToolResult = vi.fn(); + const bridge = makeBridge({ + supportsToolResultContinuation: true, + submitToolResult, + }); + const createBridge = vi.fn( + (request: Parameters[0]) => { + callbacks = request; + return bridge; + }, + ); + const getCallByProviderCallId = vi.fn( + (): CallRecord => ({ + callId: "call-1", + providerCallId: "CA-tool", + provider: "twilio", + direction: "inbound", + state: "ringing", + from: "+15550001234", + to: "+15550009999", + startedAt: Date.now(), + transcript: [], + processedEventIds: [], + metadata: {}, + }), + ); + const handler = makeHandler(undefined, { + manager: { + getCallByProviderCallId, + }, + realtimeProvider: makeRealtimeProvider(createBridge), + }); + handler.registerToolHandler( + "openclaw_agent_consult", + () => + new Promise((resolve) => { + resolveConsult = resolve; + }), + ); + handler.registerToolHandler("custom_lookup", async () => ({ ok: true })); + const server = await startRealtimeServer(handler); + + try { + const ws = await connectWs(server.url); + try { + ws.send( + JSON.stringify({ + event: "start", + start: { streamSid: "MZ-tool", callSid: "CA-tool" }, + }), + ); + await vi.waitFor(() => { + expect(createBridge).toHaveBeenCalled(); + }); + + callbacks?.onToolCall?.({ + itemId: "item-1", + callId: "consult-call", + name: "openclaw_agent_consult", + args: { question: "Are the basement lights on?" }, + }); + + await vi.waitFor(() => { + expect(submitToolResult).toHaveBeenCalledWith( + "consult-call", + expect.objectContaining({ + status: "working", + tool: "openclaw_agent_consult", + }), + { willContinue: true }, + ); + }); + expect(submitToolResult).toHaveBeenCalledTimes(1); + + resolveConsult?.({ text: "The basement lights are on." }); + + await vi.waitFor(() => { + expect(submitToolResult).toHaveBeenLastCalledWith( + "consult-call", + { + text: "The basement lights are on.", + }, + undefined, + ); + }); + + submitToolResult.mockClear(); + callbacks?.onToolCall?.({ + itemId: "item-2", + callId: "custom-call", + name: "custom_lookup", + args: {}, + }); + + await vi.waitFor(() => { + expect(submitToolResult).toHaveBeenCalledWith("custom-call", { ok: true }, undefined); + }); + expect(submitToolResult).not.toHaveBeenCalledWith("custom-call", expect.anything(), { + willContinue: true, + }); + } finally { + if (ws.readyState !== WebSocket.CLOSED && ws.readyState !== WebSocket.CLOSING) { + ws.close(); + } + } + } finally { + await server.close(); + } + }); }); describe("RealtimeCallHandler websocket hardening", () => { diff --git a/extensions/voice-call/src/webhook/realtime-handler.ts b/extensions/voice-call/src/webhook/realtime-handler.ts index d25122f4a3c..b77ca8d0d5d 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.ts @@ -3,7 +3,9 @@ import http from "node:http"; import type { Duplex } from "node:stream"; import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime"; import { + buildRealtimeVoiceAgentConsultWorkingResponse, createRealtimeVoiceBridgeSession, + REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME, type RealtimeVoiceBridgeSession, type RealtimeVoiceProviderConfig, type RealtimeVoiceProviderPlugin, @@ -410,6 +412,17 @@ export class RealtimeCallHandler { args: unknown, ): Promise { const handler = this.toolHandlers.get(name); + if ( + handler && + name === REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME && + bridge.bridge.supportsToolResultContinuation + ) { + bridge.submitToolResult( + bridgeCallId, + buildRealtimeVoiceAgentConsultWorkingResponse("caller"), + { willContinue: true }, + ); + } const result = !handler ? { error: `Tool "${name}" not available` } : await handler(args, callId).catch((error: unknown) => ({ diff --git a/src/plugin-sdk/realtime-voice.ts b/src/plugin-sdk/realtime-voice.ts index ec9d59c600e..4ba819eda70 100644 --- a/src/plugin-sdk/realtime-voice.ts +++ b/src/plugin-sdk/realtime-voice.ts @@ -13,10 +13,12 @@ export type { RealtimeVoiceRole, RealtimeVoiceTool, RealtimeVoiceToolCallEvent, + RealtimeVoiceToolResultOptions, } from "../realtime-voice/provider-types.js"; export { buildRealtimeVoiceAgentConsultChatMessage, buildRealtimeVoiceAgentConsultPrompt, + buildRealtimeVoiceAgentConsultWorkingResponse, collectRealtimeVoiceAgentConsultVisibleText, isRealtimeVoiceAgentConsultToolPolicy, parseRealtimeVoiceAgentConsultArgs, diff --git a/src/realtime-voice/agent-consult-tool.ts b/src/realtime-voice/agent-consult-tool.ts index 4c76d014bd7..25d524f7007 100644 --- a/src/realtime-voice/agent-consult-tool.ts +++ b/src/realtime-voice/agent-consult-tool.ts @@ -47,6 +47,16 @@ export const REALTIME_VOICE_AGENT_CONSULT_TOOL: RealtimeVoiceTool = { }, }; +export function buildRealtimeVoiceAgentConsultWorkingResponse( + audienceLabel = "person", +): Record { + return { + status: "working", + tool: REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME, + message: `Tell the ${audienceLabel} briefly that you are checking, then wait for the final OpenClaw result before answering with the actual result.`, + }; +} + const SAFE_READ_ONLY_TOOLS = [ "read", "web_search", diff --git a/src/realtime-voice/provider-types.ts b/src/realtime-voice/provider-types.ts index ba72ff89640..9c59ef7b0f1 100644 --- a/src/realtime-voice/provider-types.ts +++ b/src/realtime-voice/provider-types.ts @@ -24,6 +24,10 @@ export type RealtimeVoiceToolCallEvent = { args: unknown; }; +export type RealtimeVoiceToolResultOptions = { + willContinue?: boolean; +}; + export type RealtimeVoiceBridgeCallbacks = { onAudio: (muLaw: Buffer) => void; onClearAudio: () => void; @@ -70,12 +74,13 @@ export type RealtimeVoiceBrowserSession = { }; export type RealtimeVoiceBridge = { + supportsToolResultContinuation?: boolean; connect(): Promise; sendAudio(audio: Buffer): void; setMediaTimestamp(ts: number): void; sendUserMessage?(text: string): void; triggerGreeting?(instructions?: string): void; - submitToolResult(callId: string, result: unknown): void; + submitToolResult(callId: string, result: unknown, options?: RealtimeVoiceToolResultOptions): void; acknowledgeMark(): void; close(): void; isConnected(): boolean; diff --git a/src/realtime-voice/session-runtime.test.ts b/src/realtime-voice/session-runtime.test.ts index 65b79c6764d..6bbdd89e5ac 100644 --- a/src/realtime-voice/session-runtime.test.ts +++ b/src/realtime-voice/session-runtime.test.ts @@ -144,6 +144,29 @@ describe("realtime voice bridge session runtime", () => { expect(onToolCall).toHaveBeenCalledWith(event, session); }); + it("forwards tool result continuation options to the provider bridge", () => { + const bridge = makeBridge(); + const provider: RealtimeVoiceProviderPlugin = { + id: "test", + label: "Test", + isConfigured: () => true, + createBridge: () => bridge, + }; + const session = createRealtimeVoiceBridgeSession({ + provider, + providerConfig: {}, + audioSink: { sendAudio: vi.fn() }, + }); + + session.submitToolResult("call-1", { status: "working" }, { willContinue: true }); + + expect(bridge.submitToolResult).toHaveBeenCalledWith( + "call-1", + { status: "working" }, + { willContinue: true }, + ); + }); + it("does not expose session callbacks until the provider returns its bridge", () => { let callbacks: Parameters[0] | undefined; const bridge = makeBridge(); diff --git a/src/realtime-voice/session-runtime.ts b/src/realtime-voice/session-runtime.ts index 3887e799876..0569fe292ac 100644 --- a/src/realtime-voice/session-runtime.ts +++ b/src/realtime-voice/session-runtime.ts @@ -6,6 +6,7 @@ import type { RealtimeVoiceRole, RealtimeVoiceTool, RealtimeVoiceToolCallEvent, + RealtimeVoiceToolResultOptions, } from "./provider-types.js"; export type RealtimeVoiceAudioSink = { @@ -25,7 +26,7 @@ export type RealtimeVoiceBridgeSession = { sendAudio(audio: Buffer): void; sendUserMessage(text: string): void; setMediaTimestamp(ts: number): void; - submitToolResult(callId: string, result: unknown): void; + submitToolResult(callId: string, result: unknown, options?: RealtimeVoiceToolResultOptions): void; triggerGreeting(instructions?: string): void; }; @@ -65,7 +66,8 @@ export function createRealtimeVoiceBridgeSession( sendAudio: (audio) => requireBridge().sendAudio(audio), sendUserMessage: (text) => requireBridge().sendUserMessage?.(text), setMediaTimestamp: (ts) => requireBridge().setMediaTimestamp(ts), - submitToolResult: (callId, result) => requireBridge().submitToolResult(callId, result), + submitToolResult: (callId, result, options) => + requireBridge().submitToolResult(callId, result, options), triggerGreeting: (instructions) => requireBridge().triggerGreeting?.(instructions), }; const canSendAudio = () => params.audioSink.isOpen?.() ?? true;