From 8f11e5ad188310d9b7cb196c6fd89c980eda39e2 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 25 Apr 2026 03:42:59 +0100 Subject: [PATCH] fix(voice-call): scope sandbox session to agent --- CHANGELOG.md | 1 + .../voice-call/src/response-generator.test.ts | 4 +++ .../voice-call/src/response-generator.ts | 10 +++++++ extensions/voice-call/src/runtime.ts | 1 + .../agent-consult-runtime.test.ts | 29 +++++++++++++++++++ src/realtime-voice/agent-consult-runtime.ts | 10 +++++++ 6 files changed, 55 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 522ed54aa0d..f8d5d9e667b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -84,6 +84,7 @@ Docs: https://docs.openclaw.ai - Plugins/Voice Call: coalesce concurrent webhook server starts on the same runtime instance, avoiding a second `listen()` bind when overlapping startup paths race. Thanks @education-01. - Plugins/Voice Call: pin voice response sessions to `responseModel` before embedded agent runs, avoiding live-session model switch failures when the global default model differs. Fixes #60118. Thanks @xinbenlv. - Plugins/Voice Call: add `agentId` for voice response generation, so phone calls can use a dedicated agent workspace instead of always routing through `main`. Fixes #42155. Thanks @TheOpie. +- Plugins/Voice Call: scope embedded voice response sandbox resolution to the selected voice agent, so implicit `main` voice sessions respect `agents.defaults.sandbox.mode: "off"` even when other agents define sandboxed Docker binds. Fixes #56367. Thanks @crpol. - Media tools: honor the configured web-fetch SSRF policy for media understanding, image/music/video generation references, and PDF inputs, so explicit RFC2544 opt-ins cover WebChat OSS uploads without weakening defaults. Fixes #71300. (#71321) Thanks @neeravmakwana. - Agents/TTS: suppress successful spoken transcripts from verbose chat tool output when structured voice media is already queued, while preserving text output for non-builtin tool-name collisions. Fixes #71282. Thanks @neeravmakwana. - Plugins/Google Meet: reuse existing Meet tabs and active sessions across harmless URL query differences, avoiding duplicate Chrome windows when agents retry a join. Thanks @steipete. diff --git a/extensions/voice-call/src/response-generator.test.ts b/extensions/voice-call/src/response-generator.test.ts index b686eeab91b..2bd32bfa971 100644 --- a/extensions/voice-call/src/response-generator.test.ts +++ b/extensions/voice-call/src/response-generator.test.ts @@ -223,6 +223,8 @@ describe("generateVoiceResponse", () => { expect(runEmbeddedPiAgent).toHaveBeenCalledWith( expect.objectContaining({ agentDir: "/tmp/openclaw/agents/main", + agentId: "main", + sandboxSessionKey: "agent:main:voice:15550001111", workspaceDir: "/tmp/openclaw/workspace/main", sessionFile: "/tmp/openclaw/main/sessions/session.jsonl", }), @@ -265,6 +267,8 @@ describe("generateVoiceResponse", () => { expect(runEmbeddedPiAgent).toHaveBeenCalledWith( expect.objectContaining({ agentDir: "/tmp/openclaw/agents/voice", + agentId: "voice", + sandboxSessionKey: "agent:voice:voice:15550001111", workspaceDir: "/tmp/openclaw/workspace/voice", sessionFile: "/tmp/openclaw/voice/sessions/session.jsonl", }), diff --git a/extensions/voice-call/src/response-generator.ts b/extensions/voice-call/src/response-generator.ts index 7dd39da8ef2..249488b244e 100644 --- a/extensions/voice-call/src/response-generator.ts +++ b/extensions/voice-call/src/response-generator.ts @@ -172,6 +172,14 @@ function extractSpokenTextFromPayloads(payloads: VoiceResponsePayload[]): string return spokenSegments.length > 0 ? spokenSegments.join(" ").trim() : null; } +function resolveVoiceSandboxSessionKey(agentId: string, sessionKey: string): string { + const trimmed = sessionKey.trim(); + if (trimmed.toLowerCase().startsWith("agent:")) { + return trimmed; + } + return `agent:${agentId}:${trimmed}`; +} + /** * Generate a voice response using the embedded Pi agent with full tool support. * Uses the same agent infrastructure as messaging for consistent behavior. @@ -264,6 +272,8 @@ export async function generateVoiceResponse( const result = await agentRuntime.runEmbeddedPiAgent({ sessionId, sessionKey, + sandboxSessionKey: resolveVoiceSandboxSessionKey(agentId, sessionKey), + agentId, messageProvider: "voice", sessionFile, workspaceDir, diff --git a/extensions/voice-call/src/runtime.ts b/extensions/voice-call/src/runtime.ts index 054b7cb8a43..4b1fb181f85 100644 --- a/extensions/voice-call/src/runtime.ts +++ b/extensions/voice-call/src/runtime.ts @@ -318,6 +318,7 @@ export async function createVoiceCallRuntime(params: { cfg, agentRuntime, logger: log, + agentId: config.agentId ?? "main", sessionKey: resolveVoiceCallConsultSessionKey(call), messageProvider: "voice", lane: "voice", diff --git a/src/realtime-voice/agent-consult-runtime.test.ts b/src/realtime-voice/agent-consult-runtime.test.ts index e6e3e2e98ff..2a24ba02df2 100644 --- a/src/realtime-voice/agent-consult-runtime.test.ts +++ b/src/realtime-voice/agent-consult-runtime.test.ts @@ -77,6 +77,8 @@ describe("realtime voice agent consult runtime", () => { expect(runEmbeddedPiAgent).toHaveBeenCalledWith( expect.objectContaining({ sessionKey: "voice:15550001234", + sandboxSessionKey: "agent:main:voice:15550001234", + agentId: "main", messageProvider: "voice", lane: "voice", toolsAllow: ["read"], @@ -89,6 +91,33 @@ describe("realtime voice agent consult runtime", () => { ); }); + it("scopes sandbox resolution to the configured consult agent", async () => { + const { runtime, runEmbeddedPiAgent } = createAgentRuntime(); + + await consultRealtimeVoiceAgent({ + cfg: {} as never, + agentRuntime: runtime as never, + logger: { warn: vi.fn() }, + agentId: "voice", + sessionKey: "voice:15550001234", + messageProvider: "voice", + lane: "voice", + runIdPrefix: "voice-realtime-consult:call-1", + args: { question: "What should I say?" }, + transcript: [], + surface: "a live phone call", + userLabel: "Caller", + }); + + expect(runEmbeddedPiAgent).toHaveBeenCalledWith( + expect.objectContaining({ + sessionKey: "voice:15550001234", + sandboxSessionKey: "agent:voice:voice:15550001234", + agentId: "voice", + }), + ); + }); + it("returns a speakable fallback when the embedded agent has no visible text", async () => { const warn = vi.fn(); const { runtime } = createAgentRuntime([{ text: "hidden", isReasoning: true }]); diff --git a/src/realtime-voice/agent-consult-runtime.ts b/src/realtime-voice/agent-consult-runtime.ts index 871794cd1c5..e256c80d896 100644 --- a/src/realtime-voice/agent-consult-runtime.ts +++ b/src/realtime-voice/agent-consult-runtime.ts @@ -15,6 +15,14 @@ export { resolveRealtimeVoiceAgentConsultToolsAllow, } from "./agent-consult-tool.js"; +function resolveRealtimeVoiceAgentSandboxSessionKey(agentId: string, sessionKey: string): string { + const trimmed = sessionKey.trim(); + if (trimmed.toLowerCase().startsWith("agent:")) { + return trimmed; + } + return `agent:${agentId}:${trimmed}`; +} + export async function consultRealtimeVoiceAgent(params: { cfg: OpenClawConfig; agentRuntime: RealtimeVoiceAgentConsultRuntime; @@ -63,6 +71,8 @@ export async function consultRealtimeVoiceAgent(params: { const result = await params.agentRuntime.runEmbeddedPiAgent({ sessionId, sessionKey: params.sessionKey, + sandboxSessionKey: resolveRealtimeVoiceAgentSandboxSessionKey(agentId, params.sessionKey), + agentId, messageProvider: params.messageProvider, sessionFile, workspaceDir,