From 8f11e5ad188310d9b7cb196c6fd89c980eda39e2 Mon Sep 17 00:00:00 2001
From: Peter Steinberger <steipete@gmail.com>
Date: Sat, 25 Apr 2026 03:42:59 +0100
Subject: [PATCH] fix(voice-call): scope sandbox session to agent

---
 CHANGELOG.md                                  |  1 +
 .../voice-call/src/response-generator.test.ts |  4 +++
 .../voice-call/src/response-generator.ts      | 10 +++++++
 extensions/voice-call/src/runtime.ts          |  1 +
 .../agent-consult-runtime.test.ts             | 29 +++++++++++++++++++
 src/realtime-voice/agent-consult-runtime.ts   | 10 +++++++
 6 files changed, 55 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 522ed54aa0d..f8d5d9e667b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -84,6 +84,7 @@ Docs: https://docs.openclaw.ai
 - Plugins/Voice Call: coalesce concurrent webhook server starts on the same runtime instance, avoiding a second `listen()` bind when overlapping startup paths race. Thanks @education-01.
 - Plugins/Voice Call: pin voice response sessions to `responseModel` before embedded agent runs, avoiding live-session model switch failures when the global default model differs. Fixes #60118. Thanks @xinbenlv.
 - Plugins/Voice Call: add `agentId` for voice response generation, so phone calls can use a dedicated agent workspace instead of always routing through `main`. Fixes #42155. Thanks @TheOpie.
+- Plugins/Voice Call: scope embedded voice response sandbox resolution to the selected voice agent, so implicit `main` voice sessions respect `agents.defaults.sandbox.mode: "off"` even when other agents define sandboxed Docker binds. Fixes #56367. Thanks @crpol.
 - Media tools: honor the configured web-fetch SSRF policy for media understanding, image/music/video generation references, and PDF inputs, so explicit RFC2544 opt-ins cover WebChat OSS uploads without weakening defaults. Fixes #71300. (#71321) Thanks @neeravmakwana.
 - Agents/TTS: suppress successful spoken transcripts from verbose chat tool output when structured voice media is already queued, while preserving text output for non-builtin tool-name collisions. Fixes #71282. Thanks @neeravmakwana.
 - Plugins/Google Meet: reuse existing Meet tabs and active sessions across harmless URL query differences, avoiding duplicate Chrome windows when agents retry a join. Thanks @steipete.
diff --git a/extensions/voice-call/src/response-generator.test.ts b/extensions/voice-call/src/response-generator.test.ts
index b686eeab91b..2bd32bfa971 100644
--- a/extensions/voice-call/src/response-generator.test.ts
+++ b/extensions/voice-call/src/response-generator.test.ts
@@ -223,6 +223,8 @@ describe("generateVoiceResponse", () => {
     expect(runEmbeddedPiAgent).toHaveBeenCalledWith(
       expect.objectContaining({
         agentDir: "/tmp/openclaw/agents/main",
+        agentId: "main",
+        sandboxSessionKey: "agent:main:voice:15550001111",
         workspaceDir: "/tmp/openclaw/workspace/main",
         sessionFile: "/tmp/openclaw/main/sessions/session.jsonl",
       }),
@@ -265,6 +267,8 @@ describe("generateVoiceResponse", () => {
     expect(runEmbeddedPiAgent).toHaveBeenCalledWith(
       expect.objectContaining({
         agentDir: "/tmp/openclaw/agents/voice",
+        agentId: "voice",
+        sandboxSessionKey: "agent:voice:voice:15550001111",
         workspaceDir: "/tmp/openclaw/workspace/voice",
         sessionFile: "/tmp/openclaw/voice/sessions/session.jsonl",
       }),
diff --git a/extensions/voice-call/src/response-generator.ts b/extensions/voice-call/src/response-generator.ts
index 7dd39da8ef2..249488b244e 100644
--- a/extensions/voice-call/src/response-generator.ts
+++ b/extensions/voice-call/src/response-generator.ts
@@ -172,6 +172,14 @@ function extractSpokenTextFromPayloads(payloads: VoiceResponsePayload[]): string
   return spokenSegments.length > 0 ? spokenSegments.join(" ").trim() : null;
 }
 
+/** Returns the sandbox session key for a voice run, scoped as `agent:<agentId>:<key>`. */
+function resolveVoiceSandboxSessionKey(agentId: string, sessionKey: string): string {
+  const key = sessionKey.trim();
+  // Keys already starting with "agent:" (any letter case) pass through trimmed, not re-prefixed.
+  const alreadyScoped = key.toLowerCase().startsWith("agent:");
+  return alreadyScoped ? key : `agent:${agentId}:${key}`;
+}
+
 /**
  * Generate a voice response using the embedded Pi agent with full tool support.
  * Uses the same agent infrastructure as messaging for consistent behavior.
@@ -264,6 +272,8 @@ export async function generateVoiceResponse(
     const result = await agentRuntime.runEmbeddedPiAgent({
       sessionId,
       sessionKey,
+      sandboxSessionKey: resolveVoiceSandboxSessionKey(agentId, sessionKey),
+      agentId,
       messageProvider: "voice",
       sessionFile,
       workspaceDir,
diff --git a/extensions/voice-call/src/runtime.ts b/extensions/voice-call/src/runtime.ts
index 054b7cb8a43..4b1fb181f85 100644
--- a/extensions/voice-call/src/runtime.ts
+++ b/extensions/voice-call/src/runtime.ts
@@ -318,6 +318,7 @@ export async function createVoiceCallRuntime(params: {
             cfg,
             agentRuntime,
             logger: log,
+            agentId: config.agentId ?? "main",
             sessionKey: resolveVoiceCallConsultSessionKey(call),
             messageProvider: "voice",
             lane: "voice",
diff --git a/src/realtime-voice/agent-consult-runtime.test.ts b/src/realtime-voice/agent-consult-runtime.test.ts
index e6e3e2e98ff..2a24ba02df2 100644
--- a/src/realtime-voice/agent-consult-runtime.test.ts
+++ b/src/realtime-voice/agent-consult-runtime.test.ts
@@ -77,6 +77,8 @@ describe("realtime voice agent consult runtime", () => {
     expect(runEmbeddedPiAgent).toHaveBeenCalledWith(
       expect.objectContaining({
         sessionKey: "voice:15550001234",
+        sandboxSessionKey: "agent:main:voice:15550001234",
+        agentId: "main",
         messageProvider: "voice",
         lane: "voice",
         toolsAllow: ["read"],
@@ -89,6 +91,33 @@ describe("realtime voice agent consult runtime", () => {
     );
   });
 
+  it("scopes sandbox resolution to the configured consult agent", async () => {
+    const { runtime, runEmbeddedPiAgent } = createAgentRuntime();
+
+    await consultRealtimeVoiceAgent({
+      cfg: {} as never,
+      agentRuntime: runtime as never,
+      logger: { warn: vi.fn() },
+      agentId: "voice",
+      sessionKey: "voice:15550001234",
+      messageProvider: "voice",
+      lane: "voice",
+      runIdPrefix: "voice-realtime-consult:call-1",
+      args: { question: "What should I say?" },
+      transcript: [],
+      surface: "a live phone call",
+      userLabel: "Caller",
+    });
+
+    expect(runEmbeddedPiAgent).toHaveBeenCalledWith(
+      expect.objectContaining({
+        sessionKey: "voice:15550001234",
+        sandboxSessionKey: "agent:voice:voice:15550001234",
+        agentId: "voice",
+      }),
+    );
+  });
+
   it("returns a speakable fallback when the embedded agent has no visible text", async () => {
     const warn = vi.fn();
     const { runtime } = createAgentRuntime([{ text: "hidden", isReasoning: true }]);
diff --git a/src/realtime-voice/agent-consult-runtime.ts b/src/realtime-voice/agent-consult-runtime.ts
index 871794cd1c5..e256c80d896 100644
--- a/src/realtime-voice/agent-consult-runtime.ts
+++ b/src/realtime-voice/agent-consult-runtime.ts
@@ -15,6 +15,14 @@ export {
   resolveRealtimeVoiceAgentConsultToolsAllow,
 } from "./agent-consult-tool.js";
 
+/** Scopes a consult session key to `agentId` unless it already carries an `agent:` prefix. */
+function resolveRealtimeVoiceAgentSandboxSessionKey(agentId: string, sessionKey: string): string {
+  const key = sessionKey.trim();
+  const scopedKey = `agent:${agentId}:${key}`;
+  // The prefix check ignores letter case; a pre-scoped key is returned trimmed and unchanged.
+  return key.toLowerCase().startsWith("agent:") ? key : scopedKey;
+}
+
 export async function consultRealtimeVoiceAgent(params: {
   cfg: OpenClawConfig;
   agentRuntime: RealtimeVoiceAgentConsultRuntime;
@@ -63,6 +71,8 @@ export async function consultRealtimeVoiceAgent(params: {
   const result = await params.agentRuntime.runEmbeddedPiAgent({
     sessionId,
     sessionKey: params.sessionKey,
+    sandboxSessionKey: resolveRealtimeVoiceAgentSandboxSessionKey(agentId, params.sessionKey),
+    agentId,
     messageProvider: params.messageProvider,
     sessionFile,
     workspaceDir,