diff --git a/CHANGELOG.md b/CHANGELOG.md index 01ae6dfd289..a9bc5cfd35c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ Docs: https://docs.openclaw.ai - Plugins/runtime-deps: prune legacy version-scoped plugin runtime-deps roots during bundled dependency repair and cover the path in Package Acceptance's upgrade-survivor matrix, so upgrades from 2026.4.x no longer leave stale per-plugin runtime trees after doctor runs. Thanks @vincentkoc. - Plugins/runtime-deps: keep Gateway startup plugin imports and runtime plugin fallback loads verify-only after startup/config repair planning, so packaged installs no longer spawn package-manager repair from hot paths after readiness. Refs #75283 and #75069. Thanks @brokemac79 and @xiaohuaxi. +- Voice Call/realtime: add default-off fast memory/session context for `openclaw_agent_consult`, giving live calls a bounded answer-or-miss path before the full agent consult. Fixes #71849. Thanks @amzzzzzzz. - Google Meet: interrupt Realtime provider output when local barge-in clears playback, so command-pair audio stops model speech instead of only restarting Chrome playback. Fixes #73850. (#73834) Thanks @shhtheonlyperson. - Gateway/config: cap oversized plugin-owned schemas in the full `config.schema` response so large installed plugin sets cannot balloon Gateway RSS or crash schema clients. Thanks @vincentkoc. - Gateway/sessions: use bounded tail reads for sessions-list transcript usage fallbacks and cap bulk title/last-message hydration, keeping large session stores responsive when rows request derived previews. Thanks @vincentkoc. 
diff --git a/docs/.generated/config-baseline.sha256 b/docs/.generated/config-baseline.sha256 index 8fb86a221a5..8c999c3e90b 100644 --- a/docs/.generated/config-baseline.sha256 +++ b/docs/.generated/config-baseline.sha256 @@ -1,4 +1,4 @@ -13b715c3aac380161ec167bccfcfb902c3231a802a08ab7ca9ef760e0c11913a config-baseline.json +d70e31fd5f36d4b117ffa750fba88072d6714edc245a18d4b0915a2d11ce603a config-baseline.json 0a259216178a582c567d1fa48c5236bff4bbd27c3e6af838ffcd042459ffce3c config-baseline.core.json da8e055ebba0730498703d209f9e2cfaa1484a83f3240e611dcdd7280e22a525 config-baseline.channel.json -8d41287cd9cb696cf8a5e8810bd731b9eda4af9b0829c6dadae2da56e19dc644 config-baseline.plugin.json +4d017161b4dc986fdc6cc68167fedbd1d415ddbcd66125a872e18aa1769cd182 config-baseline.plugin.json diff --git a/docs/plugins/voice-call.md b/docs/plugins/voice-call.md index 5e29c05f28b..4ac92b0fd8a 100644 --- a/docs/plugins/voice-call.md +++ b/docs/plugins/voice-call.md @@ -210,6 +210,7 @@ Current runtime behaviour: - Bundled realtime voice providers: Google Gemini Live (`google`) and OpenAI (`openai`), registered by their provider plugins. - Provider-owned raw config lives under `realtime.providers.<providerId>`. - Voice Call exposes the shared `openclaw_agent_consult` realtime tool by default. The realtime model can call it when the caller asks for deeper reasoning, current information, or normal OpenClaw tools. +- `realtime.fastContext.enabled` is default-off. When enabled, Voice Call first searches indexed memory/session context for the consult question, bounded by `realtime.fastContext.timeoutMs`, and returns any matching snippets to the realtime model. If the lookup finds nothing, fails, or times out, Voice Call returns a brief no-context result unless `realtime.fastContext.fallbackToConsult` is true, in which case it falls back to the full consult agent. - If `realtime.provider` points at an unregistered provider, or no realtime voice provider is registered at all, Voice Call logs a warning and skips realtime media instead of failing the whole plugin. 
- Consult session keys reuse the existing voice session when available, then fall back to the caller/callee phone number so follow-up consult calls keep context during the call. diff --git a/extensions/voice-call/index.ts b/extensions/voice-call/index.ts index a05f8fb4267..1c4bc868ec4 100644 --- a/extensions/voice-call/index.ts +++ b/extensions/voice-call/index.ts @@ -89,6 +89,27 @@ const voiceCallConfigSchema = { help: "Controls the shared openclaw_agent_consult tool.", advanced: true, }, + "realtime.fastContext.enabled": { + label: "Enable Fast Realtime Context", + help: "Searches memory/session context before the full consult agent.", + advanced: true, + }, + "realtime.fastContext.timeoutMs": { + label: "Fast Context Timeout", + advanced: true, + }, + "realtime.fastContext.maxResults": { + label: "Fast Context Result Limit", + advanced: true, + }, + "realtime.fastContext.sources": { + label: "Fast Context Sources", + advanced: true, + }, + "realtime.fastContext.fallbackToConsult": { + label: "Fallback To Full Consult", + advanced: true, + }, "realtime.providers": { label: "Realtime Provider Config", advanced: true }, "tts.provider": { label: "TTS Provider Override", diff --git a/extensions/voice-call/openclaw.plugin.json b/extensions/voice-call/openclaw.plugin.json index 93121f536f2..36281d076b9 100644 --- a/extensions/voice-call/openclaw.plugin.json +++ b/extensions/voice-call/openclaw.plugin.json @@ -135,6 +135,32 @@ "label": "Realtime Instructions", "advanced": true }, + "realtime.toolPolicy": { + "label": "Realtime Tool Policy", + "help": "Controls the shared openclaw_agent_consult tool.", + "advanced": true + }, + "realtime.fastContext.enabled": { + "label": "Enable Fast Realtime Context", + "help": "Searches memory/session context before the full consult agent.", + "advanced": true + }, + "realtime.fastContext.timeoutMs": { + "label": "Fast Context Timeout", + "advanced": true + }, + "realtime.fastContext.maxResults": { + "label": "Fast Context Result 
Limit", + "advanced": true + }, + "realtime.fastContext.sources": { + "label": "Fast Context Sources", + "advanced": true + }, + "realtime.fastContext.fallbackToConsult": { + "label": "Fallback To Full Consult", + "advanced": true + }, "realtime.providers": { "label": "Realtime Provider Config", "advanced": true @@ -452,6 +478,34 @@ "required": ["type", "name", "description", "parameters"] } }, + "fastContext": { + "type": "object", + "additionalProperties": false, + "properties": { + "enabled": { + "type": "boolean" + }, + "timeoutMs": { + "type": "number", + "minimum": 1 + }, + "maxResults": { + "type": "number", + "minimum": 1 + }, + "sources": { + "type": "array", + "minItems": 1, + "items": { + "type": "string", + "enum": ["memory", "sessions"] + } + }, + "fallbackToConsult": { + "type": "boolean" + } + } + }, "providers": { "type": "object", "additionalProperties": { diff --git a/extensions/voice-call/src/config.test.ts b/extensions/voice-call/src/config.test.ts index a8c28ca8435..171eb7cd29d 100644 --- a/extensions/voice-call/src/config.test.ts +++ b/extensions/voice-call/src/config.test.ts @@ -275,6 +275,13 @@ describe("normalizeVoiceCallConfig", () => { expect(normalized.streaming.providers).toEqual({}); expect(normalized.realtime.streamPath).toBe("/voice/stream/realtime"); expect(normalized.realtime.toolPolicy).toBe("safe-read-only"); + expect(normalized.realtime.fastContext).toEqual({ + enabled: false, + timeoutMs: 800, + maxResults: 3, + sources: ["memory", "sessions"], + fallbackToConsult: false, + }); expect(normalized.realtime.instructions).toContain("openclaw_agent_consult"); expect(normalized.tunnel.provider).toBe("none"); expect(normalized.webhookSecurity.allowedHosts).toEqual([]); diff --git a/extensions/voice-call/src/config.ts b/extensions/voice-call/src/config.ts index 4740905c6d5..fcd522b69b5 100644 --- a/extensions/voice-call/src/config.ts +++ b/extensions/voice-call/src/config.ts @@ -225,6 +225,39 @@ export type 
VoiceCallRealtimeProvidersConfig = z.infer< export const VoiceCallRealtimeToolPolicySchema = z.enum(REALTIME_VOICE_AGENT_CONSULT_TOOL_POLICIES); export type VoiceCallRealtimeToolPolicy = RealtimeVoiceAgentConsultToolPolicy; +export const VoiceCallRealtimeFastContextSourceSchema = z.enum(["memory", "sessions"]); +export type VoiceCallRealtimeFastContextSource = z.infer< + typeof VoiceCallRealtimeFastContextSourceSchema +>; + +export const VoiceCallRealtimeFastContextConfigSchema = z + .object({ + /** Enable bounded memory/session lookup before the full consult agent. */ + enabled: z.boolean().default(false), + /** Hard deadline for the fast context lookup. */ + timeoutMs: z.number().int().positive().default(800), + /** Maximum memory/session hits to inject into the realtime tool result. */ + maxResults: z.number().int().positive().default(3), + /** Indexed sources used by the fast context lookup. */ + sources: z + .array(VoiceCallRealtimeFastContextSourceSchema) + .min(1) + .default(["memory", "sessions"]), + /** Fall back to the full agent consult when fast context has no answer. */ + fallbackToConsult: z.boolean().default(false), + }) + .strict() + .default({ + enabled: false, + timeoutMs: 800, + maxResults: 3, + sources: ["memory", "sessions"], + fallbackToConsult: false, + }); +export type VoiceCallRealtimeFastContextConfig = z.infer< + typeof VoiceCallRealtimeFastContextConfigSchema +>; + export const VoiceCallStreamingProvidersConfigSchema = z .record(z.string(), z.record(z.string(), z.unknown())) .default({}); @@ -246,6 +279,8 @@ export const VoiceCallRealtimeConfigSchema = z toolPolicy: VoiceCallRealtimeToolPolicySchema.default("safe-read-only"), /** Tool definitions exposed to the realtime provider. */ tools: z.array(RealtimeToolSchema).default([]), + /** Low-latency memory/session context for the consult tool. */ + fastContext: VoiceCallRealtimeFastContextConfigSchema, /** Provider-owned raw config blobs keyed by provider id. 
*/ providers: VoiceCallRealtimeProvidersConfigSchema, }) @@ -255,6 +290,13 @@ export const VoiceCallRealtimeConfigSchema = z instructions: DEFAULT_VOICE_CALL_REALTIME_INSTRUCTIONS, toolPolicy: "safe-read-only", tools: [], + fastContext: { + enabled: false, + timeoutMs: 800, + maxResults: 3, + sources: ["memory", "sessions"], + fallbackToConsult: false, + }, providers: {}, }); export type VoiceCallRealtimeConfig = z.infer; @@ -490,6 +532,11 @@ export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCal const realtimeProviders = sanitizeVoiceCallProviderConfigs( config.realtime?.providers ?? defaults.realtime.providers, ); + const realtimeFastContext = { + ...defaults.realtime.fastContext, + ...config.realtime?.fastContext, + sources: config.realtime?.fastContext?.sources ?? defaults.realtime.fastContext.sources, + }; return { ...defaults, ...config, @@ -520,6 +567,7 @@ export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCal defaultRealtimeStreamPathForServePath(serve.path ?? defaults.serve.path), tools: (config.realtime?.tools as RealtimeToolConfig[] | undefined) ?? 
defaults.realtime.tools, + fastContext: realtimeFastContext, providers: realtimeProviders, }, tts: normalizeVoiceCallTtsConfig(defaults.tts, config.tts), diff --git a/extensions/voice-call/src/realtime-fast-context.test.ts b/extensions/voice-call/src/realtime-fast-context.test.ts new file mode 100644 index 00000000000..597080b3d63 --- /dev/null +++ b/extensions/voice-call/src/realtime-fast-context.test.ts @@ -0,0 +1,88 @@ +import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import type { VoiceCallRealtimeFastContextConfig } from "./config.js"; + +const mocks = vi.hoisted(() => ({ + getActiveMemorySearchManager: vi.fn(), +})); + +vi.mock("openclaw/plugin-sdk/memory-host-search", () => ({ + getActiveMemorySearchManager: mocks.getActiveMemorySearchManager, +})); + +import { resolveRealtimeFastContextConsult } from "./realtime-fast-context.js"; + +const cfg = {} as OpenClawConfig; + +function createFastContextConfig( + overrides: Partial = {}, +): VoiceCallRealtimeFastContextConfig { + return { + enabled: true, + timeoutMs: 800, + maxResults: 3, + sources: ["memory", "sessions"], + fallbackToConsult: false, + ...overrides, + }; +} + +function createLogger() { + return { + debug: vi.fn(), + warn: vi.fn(), + }; +} + +describe("resolveRealtimeFastContextConsult", () => { + beforeEach(() => { + mocks.getActiveMemorySearchManager.mockReset(); + }); + + afterEach(() => { + vi.useRealTimers(); + }); + + it("falls back to the full consult when memory manager setup fails", async () => { + const logger = createLogger(); + mocks.getActiveMemorySearchManager.mockRejectedValue(new Error("memory misconfigured")); + + await expect( + resolveRealtimeFastContextConsult({ + cfg, + agentId: "main", + sessionKey: "voice:15550001234", + config: createFastContextConfig({ fallbackToConsult: true }), + args: { question: "What do you remember?" 
}, + logger, + }), + ).resolves.toEqual({ handled: false }); + + expect(logger.debug).toHaveBeenCalledWith(expect.stringContaining("memory misconfigured")); + }); + + it("returns a bounded miss when memory manager setup exceeds the fast context timeout", async () => { + vi.useFakeTimers(); + const logger = createLogger(); + mocks.getActiveMemorySearchManager.mockReturnValue(new Promise(() => {})); + + const resultPromise = resolveRealtimeFastContextConsult({ + cfg, + agentId: "main", + sessionKey: "voice:15550001234", + config: createFastContextConfig({ fallbackToConsult: false, timeoutMs: 25 }), + args: { question: "What do you remember?" }, + logger, + }); + + await vi.advanceTimersByTimeAsync(25); + + await expect(resultPromise).resolves.toEqual({ + handled: true, + result: { + text: expect.stringContaining("No relevant OpenClaw memory or session context"), + }, + }); + expect(logger.debug).toHaveBeenCalledWith(expect.stringContaining("timed out after 25ms")); + }); +}); diff --git a/extensions/voice-call/src/realtime-fast-context.ts b/extensions/voice-call/src/realtime-fast-context.ts new file mode 100644 index 00000000000..19927f2307c --- /dev/null +++ b/extensions/voice-call/src/realtime-fast-context.ts @@ -0,0 +1,165 @@ +import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types"; +import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime"; +import { getActiveMemorySearchManager } from "openclaw/plugin-sdk/memory-host-search"; +import { + parseRealtimeVoiceAgentConsultArgs, + type RealtimeVoiceAgentConsultResult, +} from "openclaw/plugin-sdk/realtime-voice"; +import type { VoiceCallRealtimeFastContextConfig } from "./config.js"; + +type Logger = { + debug?: (message: string) => void; + warn: (message: string) => void; +}; + +type MemorySearchHit = { + path: string; + startLine: number; + endLine: number; + snippet: string; + source: "memory" | "sessions"; + score: number; +}; + +type FastContextLookupResult = + | { status: 
"unavailable"; error?: string } + | { status: "hits"; hits: MemorySearchHit[] }; + +export type RealtimeFastContextConsultResult = + | { handled: false } + | { handled: true; result: RealtimeVoiceAgentConsultResult }; + +const MAX_SNIPPET_CHARS = 700; + +class RealtimeFastContextTimeoutError extends Error { + constructor(timeoutMs: number) { + super(`fast context lookup timed out after ${timeoutMs}ms`); + this.name = "RealtimeFastContextTimeoutError"; + } +} + +function normalizeSnippet(text: string): string { + const normalized = text.replace(/\s+/g, " ").trim(); + if (normalized.length <= MAX_SNIPPET_CHARS) { + return normalized; + } + return `${normalized.slice(0, MAX_SNIPPET_CHARS - 1).trimEnd()}...`; +} + +function buildSearchQuery(args: unknown): string { + const parsed = parseRealtimeVoiceAgentConsultArgs(args); + return [parsed.question, parsed.context].filter(Boolean).join("\n\n"); +} + +function buildContextText(params: { query: string; hits: MemorySearchHit[] }): string { + const hits = params.hits + .map((hit, index) => { + const location = `${hit.path}:${hit.startLine}-${hit.endLine}`; + return `${index + 1}. [${hit.source}] ${location}\n${normalizeSnippet(hit.snippet)}`; + }) + .join("\n\n"); + return [ + "Fast OpenClaw memory context found for the live caller.", + "Use this context only if it answers the caller's question. If it is not relevant, say briefly that you do not have that context handy.", + `Question:\n${params.query}`, + `Context:\n${hits}`, + ].join("\n\n"); +} + +function buildMissText(query: string): string { + return [ + "No relevant OpenClaw memory or session context was found quickly for the live caller.", + "Answer briefly that you do not have that context handy. 
Do not keep checking unless the caller asks you to.", + `Question:\n${query}`, + ].join("\n\n"); +} + +async function withTimeout(promise: Promise, timeoutMs: number): Promise { + let timer: ReturnType | undefined; + try { + return await Promise.race([ + promise, + new Promise((_resolve, reject) => { + timer = setTimeout(() => reject(new RealtimeFastContextTimeoutError(timeoutMs)), timeoutMs); + }), + ]); + } finally { + if (timer) { + clearTimeout(timer); + } + } +} + +async function lookupFastContext(params: { + cfg: OpenClawConfig; + agentId: string; + sessionKey: string; + config: VoiceCallRealtimeFastContextConfig; + query: string; +}): Promise { + const memory = await getActiveMemorySearchManager({ + cfg: params.cfg, + agentId: params.agentId, + }); + if (!memory.manager) { + return { + status: "unavailable", + error: memory.error ?? "no active memory manager", + }; + } + const hits = await memory.manager.search(params.query, { + maxResults: params.config.maxResults, + sessionKey: params.sessionKey, + sources: params.config.sources, + }); + return { status: "hits", hits }; +} + +export async function resolveRealtimeFastContextConsult(params: { + cfg: OpenClawConfig; + agentId: string; + sessionKey: string; + config: VoiceCallRealtimeFastContextConfig; + args: unknown; + logger: Logger; +}): Promise { + if (!params.config.enabled) { + return { handled: false }; + } + + const query = buildSearchQuery(params.args); + try { + const lookup = await withTimeout( + lookupFastContext({ + cfg: params.cfg, + agentId: params.agentId, + sessionKey: params.sessionKey, + config: params.config, + query, + }), + params.config.timeoutMs, + ); + if (lookup.status === "unavailable") { + params.logger.debug?.(`[voice-call] realtime fast context unavailable: ${lookup.error}`); + return params.config.fallbackToConsult + ? 
{ handled: false } + : { handled: true, result: { text: buildMissText(query) } }; + } + const { hits } = lookup; + if (hits.length === 0) { + return params.config.fallbackToConsult + ? { handled: false } + : { handled: true, result: { text: buildMissText(query) } }; + } + return { + handled: true, + result: { text: buildContextText({ query, hits }) }, + }; + } catch (error) { + const message = formatErrorMessage(error); + params.logger.debug?.(`[voice-call] realtime fast context lookup failed: ${message}`); + return params.config.fallbackToConsult + ? { handled: false } + : { handled: true, result: { text: buildMissText(query) } }; + } +} diff --git a/extensions/voice-call/src/runtime.test.ts b/extensions/voice-call/src/runtime.test.ts index 71d3fc5db8f..74f1b167164 100644 --- a/extensions/voice-call/src/runtime.test.ts +++ b/extensions/voice-call/src/runtime.test.ts @@ -20,6 +20,8 @@ const mocks = vi.hoisted(() => ({ realtimeHandlerRegisterToolHandler: vi.fn(), realtimeHandlerSetPublicUrl: vi.fn(), resolveConfiguredRealtimeVoiceProvider: vi.fn(), + getActiveMemorySearchManager: vi.fn(), + memorySearch: vi.fn(), startTunnel: vi.fn(), setupTailscaleExposure: vi.fn(), cleanupTailscaleExposure: vi.fn(), @@ -65,6 +67,10 @@ vi.mock("./webhook/realtime-handler.js", () => ({ }, })); +vi.mock("openclaw/plugin-sdk/memory-host-search", () => ({ + getActiveMemorySearchManager: mocks.getActiveMemorySearchManager, +})); + vi.mock("./tunnel.js", () => ({ startTunnel: mocks.startTunnel, })); @@ -132,6 +138,14 @@ describe("createVoiceCallRuntime lifecycle", () => { provider: { id: "openai" }, providerConfig: { model: "gpt-realtime" }, }); + mocks.getActiveMemorySearchManager.mockReset(); + mocks.memorySearch.mockReset(); + mocks.getActiveMemorySearchManager.mockResolvedValue({ + manager: { + search: mocks.memorySearch, + }, + }); + mocks.memorySearch.mockResolvedValue([]); mocks.startTunnel.mockResolvedValue(null); mocks.setupTailscaleExposure.mockResolvedValue(null); 
mocks.cleanupTailscaleExposure.mockResolvedValue(undefined); @@ -336,9 +350,17 @@ describe("createVoiceCallRuntime lifecycle", () => { ); const handler = mocks.realtimeHandlerRegisterToolHandler.mock.calls[0]?.[1] as - | ((args: unknown, callId: string) => Promise) + | (( + args: unknown, + callId: string, + context?: { partialUserTranscript?: string }, + ) => Promise) | undefined; - await expect(handler?.({ question: "What should I say?" }, "call-1")).resolves.toEqual({ + await expect( + handler?.({ question: "What should I say?" }, "call-1", { + partialUserTranscript: "Also check the ETA.", + }), + ).resolves.toEqual({ text: "Use the shipment status.", }); expect(runEmbeddedPiAgent).toHaveBeenCalledWith( @@ -353,5 +375,84 @@ describe("createVoiceCallRuntime lifecycle", () => { prompt: expect.stringContaining("Caller: Can you check shipment status?"), }), ); + expect(runEmbeddedPiAgent).toHaveBeenCalledWith( + expect.objectContaining({ + prompt: expect.stringContaining("Caller: Also check the ETA."), + }), + ); + }); + + it("answers realtime consults from fast memory context before starting the full agent", async () => { + const config = createBaseConfig(); + config.realtime.enabled = true; + config.realtime.fastContext = { + enabled: true, + timeoutMs: 800, + maxResults: 2, + sources: ["memory"], + fallbackToConsult: false, + }; + const runEmbeddedPiAgent = vi.fn(async () => ({ + payloads: [{ text: "slow answer" }], + meta: {}, + })); + const sessionStore: Record = {}; + const agentRuntime = { + resolveAgentDir: vi.fn(() => "/tmp/agent"), + resolveAgentWorkspaceDir: vi.fn(() => "/tmp/workspace"), + resolveAgentIdentity: vi.fn(), + resolveThinkingDefault: vi.fn(() => "high"), + resolveAgentTimeoutMs: vi.fn(() => 30_000), + ensureAgentWorkspace: vi.fn(async () => {}), + session: { + resolveStorePath: vi.fn(() => "/tmp/sessions.json"), + loadSessionStore: vi.fn(() => sessionStore), + saveSessionStore: vi.fn(async () => {}), + resolveSessionFilePath: vi.fn(() => 
"/tmp/session.json"), + }, + runEmbeddedPiAgent, + }; + mocks.managerGetCall.mockReturnValue({ + callId: "call-1", + direction: "inbound", + from: "+15550001234", + to: "+15550009999", + transcript: [], + }); + mocks.memorySearch.mockResolvedValue([ + { + source: "memory", + path: "MEMORY.md", + startLine: 12, + endLine: 14, + score: 0.91, + snippet: "The caller's basement lights are on.", + }, + ]); + + await createVoiceCallRuntime({ + config, + coreConfig: {} as CoreConfig, + agentRuntime: agentRuntime as never, + }); + + const handler = mocks.realtimeHandlerRegisterToolHandler.mock.calls[0]?.[1] as + | (( + args: unknown, + callId: string, + context?: { partialUserTranscript?: string }, + ) => Promise) + | undefined; + await expect(handler?.({ question: "Are the basement lights on?" }, "call-1")).resolves.toEqual( + { + text: expect.stringContaining("The caller's basement lights are on."), + }, + ); + expect(mocks.memorySearch).toHaveBeenCalledWith("Are the basement lights on?", { + maxResults: 2, + sessionKey: "voice:15550001234", + sources: ["memory"], + }); + expect(runEmbeddedPiAgent).not.toHaveBeenCalled(); }); }); diff --git a/extensions/voice-call/src/runtime.ts b/extensions/voice-call/src/runtime.ts index 23e3fab0389..9cc65799b4c 100644 --- a/extensions/voice-call/src/runtime.ts +++ b/extensions/voice-call/src/runtime.ts @@ -18,6 +18,7 @@ import type { CoreAgentDeps, CoreConfig } from "./core-bridge.js"; import { CallManager } from "./manager.js"; import type { VoiceCallProvider } from "./providers/base.js"; import type { TwilioProvider } from "./providers/twilio.js"; +import { resolveRealtimeFastContextConsult } from "./realtime-fast-context.js"; import { resolveVoiceResponseModel } from "./response-model.js"; import type { TelephonyTtsRuntime } from "./telephony-tts.js"; import { createTelephonyTtsProvider } from "./telephony-tts.js"; @@ -27,6 +28,7 @@ import { providerRequiresPublicWebhook, } from "./webhook-exposure.js"; import { 
VoiceCallWebhookServer } from "./webhook.js"; +import type { ToolHandlerContext } from "./webhook/realtime-handler.js"; import { cleanupTailscaleExposure, setupTailscaleExposure } from "./webhook/tailscale.js"; export type VoiceCallRuntime = { @@ -115,13 +117,23 @@ function resolveVoiceCallConsultSessionKey(call: { return normalizedPhone ? `voice:${normalizedPhone}` : `voice:${call.callId}`; } -function mapVoiceCallConsultTranscript(call: { - transcript?: Array<{ speaker: "user" | "bot"; text: string }>; -}): RealtimeVoiceAgentConsultTranscriptEntry[] { - return (call.transcript ?? []).map((entry) => ({ - role: entry.speaker === "bot" ? "assistant" : "user", - text: entry.text, - })); +function mapVoiceCallConsultTranscript( + call: { + transcript?: Array<{ speaker: "user" | "bot"; text: string }>; + }, + context?: ToolHandlerContext, +): RealtimeVoiceAgentConsultTranscriptEntry[] { + const transcript: RealtimeVoiceAgentConsultTranscriptEntry[] = (call.transcript ?? []).map( + (entry) => ({ + role: entry.speaker === "bot" ? "assistant" : "user", + text: entry.text, + }), + ); + const partial = context?.partialUserTranscript?.trim(); + if (partial && transcript.at(-1)?.text !== partial) { + transcript.push({ role: "user", text: partial }); + } + return transcript; } function createRuntimeResourceLifecycle(params: { @@ -316,11 +328,24 @@ export async function createVoiceCallRuntime(params: { if (config.realtime.toolPolicy !== "none") { realtimeHandler.registerToolHandler( REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME, - async (args, callId) => { + async (args, callId, handlerContext) => { const call = manager.getCall(callId); if (!call) { return { error: `Call "${callId}" not found` }; } + const agentId = config.agentId ?? 
"main"; + const sessionKey = resolveVoiceCallConsultSessionKey(call); + const fastContext = await resolveRealtimeFastContextConsult({ + cfg, + agentId, + sessionKey, + config: config.realtime.fastContext, + args, + logger: log, + }); + if (fastContext.handled) { + return fastContext.result; + } const { provider: agentProvider, model } = resolveVoiceResponseModel({ voiceConfig: config, agentRuntime, @@ -334,13 +359,13 @@ export async function createVoiceCallRuntime(params: { cfg, agentRuntime, logger: log, - agentId: config.agentId ?? "main", - sessionKey: resolveVoiceCallConsultSessionKey(call), + agentId, + sessionKey, messageProvider: "voice", lane: "voice", runIdPrefix: `voice-realtime-consult:${callId}`, args, - transcript: mapVoiceCallConsultTranscript(call), + transcript: mapVoiceCallConsultTranscript(call, handlerContext), surface: "a live phone call", userLabel: "Caller", assistantLabel: "Agent", diff --git a/extensions/voice-call/src/test-fixtures.ts b/extensions/voice-call/src/test-fixtures.ts index 4821409a44f..f382d8bd9b7 100644 --- a/extensions/voice-call/src/test-fixtures.ts +++ b/extensions/voice-call/src/test-fixtures.ts @@ -50,6 +50,13 @@ export function createVoiceCallBaseConfig(params?: { instructions: DEFAULT_VOICE_CALL_REALTIME_INSTRUCTIONS, toolPolicy: "safe-read-only", tools: [], + fastContext: { + enabled: false, + timeoutMs: 800, + maxResults: 3, + sources: ["memory", "sessions"], + fallbackToConsult: false, + }, providers: {}, }, skipSignatureVerification: false, diff --git a/extensions/voice-call/src/webhook.test.ts b/extensions/voice-call/src/webhook.test.ts index 1e321d32844..3534e77878a 100644 --- a/extensions/voice-call/src/webhook.test.ts +++ b/extensions/voice-call/src/webhook.test.ts @@ -1,7 +1,11 @@ import { request, type IncomingMessage } from "node:http"; import type { RealtimeTranscriptionProviderPlugin } from "openclaw/plugin-sdk/realtime-transcription"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; 
-import { VoiceCallConfigSchema, type VoiceCallConfig } from "./config.js"; +import { + VoiceCallConfigSchema, + type VoiceCallConfig, + type VoiceCallConfigInput, +} from "./config.js"; import type { CallManager } from "./manager.js"; import type { VoiceCallProvider } from "./providers/base.js"; import type { TwilioProvider } from "./providers/twilio.js"; @@ -59,18 +63,35 @@ type TwilioProviderTestDouble = VoiceCallProvider & | "clearTtsQueue" >; -const createConfig = (overrides: Partial = {}): VoiceCallConfig => { +const createConfig = (overrides: VoiceCallConfigInput = {}): VoiceCallConfig => { const base = VoiceCallConfigSchema.parse({}); base.serve.port = 0; - return { + const merged = { ...base, ...overrides, serve: { ...base.serve, ...overrides.serve, }, + realtime: { + ...base.realtime, + ...overrides.realtime, + tools: overrides.realtime?.tools ?? base.realtime.tools, + fastContext: { + ...base.realtime.fastContext, + ...overrides.realtime?.fastContext, + sources: overrides.realtime?.fastContext?.sources ?? base.realtime.fastContext.sources, + }, + providers: overrides.realtime?.providers ?? base.realtime.providers, + }, }; + const parsed = VoiceCallConfigSchema.parse({ + ...merged, + serve: { ...merged.serve, port: merged.serve.port === 0 ? 
1 : merged.serve.port }, + }); + parsed.serve.port = merged.serve.port; + return parsed; }; const createCall = (startedAt: number): CallRecord => ({ diff --git a/extensions/voice-call/src/webhook/realtime-handler.test.ts b/extensions/voice-call/src/webhook/realtime-handler.test.ts index 539a4bbe2c7..0c8538e4bf6 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.test.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.test.ts @@ -2,6 +2,7 @@ import http from "node:http"; import type { RealtimeVoiceBridge, RealtimeVoiceProviderPlugin, + RealtimeVoiceToolCallEvent, } from "openclaw/plugin-sdk/realtime-voice"; import { describe, expect, it, vi } from "vitest"; import { WebSocket } from "ws"; @@ -59,6 +60,13 @@ function makeHandler( instructions: overrides?.instructions ?? "Be helpful.", toolPolicy: overrides?.toolPolicy ?? "safe-read-only", tools: overrides?.tools ?? [], + fastContext: overrides?.fastContext ?? { + enabled: false, + timeoutMs: 800, + maxResults: 3, + sources: ["memory", "sessions"], + fallbackToConsult: false, + }, providers: overrides?.providers ?? {}, ...(overrides?.provider ? 
{ provider: overrides.provider } : {}), }; @@ -338,9 +346,11 @@ describe("RealtimeCallHandler path routing", () => { name: string; args: unknown; }) => void; + onTranscript?: (role: "user" | "assistant", text: string, isFinal: boolean) => void; } | undefined; let resolveConsult: ((value: unknown) => void) | undefined; + let receivedPartialTranscript: string | undefined; const submitToolResult = vi.fn(); const bridge = makeBridge({ supportsToolResultContinuation: true, @@ -373,13 +383,12 @@ describe("RealtimeCallHandler path routing", () => { }, realtimeProvider: makeRealtimeProvider(createBridge), }); - handler.registerToolHandler( - "openclaw_agent_consult", - () => - new Promise((resolve) => { - resolveConsult = resolve; - }), - ); + handler.registerToolHandler("openclaw_agent_consult", (_args, _callId, context) => { + receivedPartialTranscript = context.partialUserTranscript; + return new Promise((resolve) => { + resolveConsult = resolve; + }); + }); handler.registerToolHandler("custom_lookup", async () => ({ ok: true })); const server = await startRealtimeServer(handler); @@ -396,12 +405,14 @@ describe("RealtimeCallHandler path routing", () => { expect(createBridge).toHaveBeenCalled(); }); + callbacks?.onTranscript?.("user", "Are the basement", false); callbacks?.onToolCall?.({ itemId: "item-1", callId: "consult-call", name: "openclaw_agent_consult", args: { question: "Are the basement lights on?" 
}, }); + expect(receivedPartialTranscript).toBe("Are the basement"); await vi.waitFor(() => { expect(submitToolResult).toHaveBeenCalledWith( @@ -450,6 +461,95 @@ describe("RealtimeCallHandler path routing", () => { await server.close(); } }); + + it("does not submit an interim checking result when fast context is enabled", async () => { + let callbacks: + | { + onToolCall?: (event: RealtimeVoiceToolCallEvent) => void; + } + | undefined; + const submitToolResult = vi.fn(); + const bridge = makeBridge({ + supportsToolResultContinuation: true, + submitToolResult, + }); + const createBridge = vi.fn( + (request: Parameters[0]) => { + callbacks = request; + return bridge; + }, + ); + const handler = makeHandler( + { + fastContext: { + enabled: true, + timeoutMs: 800, + maxResults: 3, + sources: ["memory", "sessions"], + fallbackToConsult: false, + }, + }, + { + manager: { + getCallByProviderCallId: vi.fn( + (): CallRecord => ({ + callId: "call-1", + providerCallId: "CA-fast", + provider: "twilio", + direction: "inbound", + state: "ringing", + from: "+15550001234", + to: "+15550009999", + startedAt: Date.now(), + transcript: [], + processedEventIds: [], + metadata: {}, + }), + ), + }, + realtimeProvider: makeRealtimeProvider(createBridge), + }, + ); + handler.registerToolHandler("openclaw_agent_consult", async () => ({ text: "Fast context." })); + const server = await startRealtimeServer(handler); + + try { + const ws = await connectWs(server.url); + try { + ws.send( + JSON.stringify({ + event: "start", + start: { streamSid: "MZ-fast", callSid: "CA-fast" }, + }), + ); + await vi.waitFor(() => { + expect(createBridge).toHaveBeenCalled(); + }); + + callbacks?.onToolCall?.({ + itemId: "item-1", + callId: "consult-call", + name: "openclaw_agent_consult", + args: { question: "What do you remember?" }, + }); + + await vi.waitFor(() => { + expect(submitToolResult).toHaveBeenCalledWith( + "consult-call", + { text: "Fast context." 
}, + undefined, + ); + }); + expect(submitToolResult).toHaveBeenCalledTimes(1); + } finally { + if (ws.readyState !== WebSocket.CLOSED && ws.readyState !== WebSocket.CLOSING) { + ws.close(); + } + } + } finally { + await server.close(); + } + }); }); describe("RealtimeCallHandler websocket hardening", () => { diff --git a/extensions/voice-call/src/webhook/realtime-handler.ts b/extensions/voice-call/src/webhook/realtime-handler.ts index 0d20517c313..98793eeccbd 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.ts @@ -17,7 +17,14 @@ import type { VoiceCallProvider } from "../providers/base.js"; import type { CallRecord, NormalizedEvent } from "../types.js"; import type { WebhookResponsePayload } from "../webhook.types.js"; -export type ToolHandlerFn = (args: unknown, callId: string) => Promise; +export type ToolHandlerContext = { + partialUserTranscript?: string; +}; +export type ToolHandlerFn = ( + args: unknown, + callId: string, + context: ToolHandlerContext, +) => Promise; const STREAM_TOKEN_TTL_MS = 30_000; const DEFAULT_HOST = "localhost:8443"; @@ -73,6 +80,7 @@ export class RealtimeCallHandler { private readonly toolHandlers = new Map(); private readonly pendingStreamTokens = new Map(); private readonly activeBridgesByCallId = new Map(); + private readonly partialUserTranscriptsByCallId = new Map(); private publicOrigin: string | null = null; private publicPathPrefix = ""; @@ -297,9 +305,13 @@ export class RealtimeCallHandler { }, onTranscript: (role, text, isFinal) => { if (!isFinal) { + if (role === "user" && text.trim()) { + this.partialUserTranscriptsByCallId.set(callId, text); + } return; } if (role === "user") { + this.partialUserTranscriptsByCallId.delete(callId); const event: NormalizedEvent = { id: `realtime-speech-${callSid}-${Date.now()}`, type: "call.speech", @@ -336,6 +348,7 @@ export class RealtimeCallHandler { onClose: (reason) => { 
this.activeBridgesByCallId.delete(callId); this.activeBridgesByCallId.delete(callSid); + this.partialUserTranscriptsByCallId.delete(callId); if (reason !== "error") { return; } @@ -360,6 +373,7 @@ export class RealtimeCallHandler { bridge.close = () => { this.activeBridgesByCallId.delete(callId); this.activeBridgesByCallId.delete(callSid); + this.partialUserTranscriptsByCallId.delete(callId); closeBridge(); }; @@ -450,7 +464,8 @@ export class RealtimeCallHandler { if ( handler && name === REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME && - bridge.bridge.supportsToolResultContinuation + bridge.bridge.supportsToolResultContinuation && + !this.config.fastContext.enabled ) { bridge.submitToolResult( bridgeCallId, @@ -460,7 +475,9 @@ export class RealtimeCallHandler { } const result = !handler ? { error: `Tool "${name}" not available` } - : await handler(args, callId).catch((error: unknown) => ({ + : await handler(args, callId, { + partialUserTranscript: this.partialUserTranscriptsByCallId.get(callId), + }).catch((error: unknown) => ({ error: formatErrorMessage(error), })); bridge.submitToolResult(bridgeCallId, result);