fix(voice-call): scope sandbox session to agent

2026-05-06 18:40:44 +00:00 · 2026-04-25 03:42:59 +01:00
parent 460720d0a1
commit 8f11e5ad18
6 changed files with 55 additions and 0 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -84,6 +84,7 @@ Docs: https://docs.openclaw.ai
 - Plugins/Voice Call: coalesce concurrent webhook server starts on the same runtime instance, avoiding a second `listen()` bind when overlapping startup paths race. Thanks @education-01.
 - Plugins/Voice Call: pin voice response sessions to `responseModel` before embedded agent runs, avoiding live-session model switch failures when the global default model differs. Fixes #60118. Thanks @xinbenlv.
 - Plugins/Voice Call: add `agentId` for voice response generation, so phone calls can use a dedicated agent workspace instead of always routing through `main`. Fixes #42155. Thanks @TheOpie.
+- Plugins/Voice Call: scope embedded voice response sandbox resolution to the selected voice agent, so implicit `main` voice sessions respect `agents.defaults.sandbox.mode: "off"` even when other agents define sandboxed Docker binds. Fixes #56367. Thanks @crpol.
 - Media tools: honor the configured web-fetch SSRF policy for media understanding, image/music/video generation references, and PDF inputs, so explicit RFC2544 opt-ins cover WebChat OSS uploads without weakening defaults. Fixes #71300. (#71321) Thanks @neeravmakwana.
 - Agents/TTS: suppress successful spoken transcripts from verbose chat tool output when structured voice media is already queued, while preserving text output for non-builtin tool-name collisions. Fixes #71282. Thanks @neeravmakwana.
 - Plugins/Google Meet: reuse existing Meet tabs and active sessions across harmless URL query differences, avoiding duplicate Chrome windows when agents retry a join. Thanks @steipete.
--- a/extensions/voice-call/src/response-generator.test.ts
+++ b/extensions/voice-call/src/response-generator.test.ts
@@ -223,6 +223,8 @@ describe("generateVoiceResponse", () => {
    expect(runEmbeddedPiAgent).toHaveBeenCalledWith(
      expect.objectContaining({
        agentDir: "/tmp/openclaw/agents/main",
+        agentId: "main",
+        sandboxSessionKey: "agent:main:voice:15550001111",
        workspaceDir: "/tmp/openclaw/workspace/main",
        sessionFile: "/tmp/openclaw/main/sessions/session.jsonl",
      }),
@@ -265,6 +267,8 @@ describe("generateVoiceResponse", () => {
    expect(runEmbeddedPiAgent).toHaveBeenCalledWith(
      expect.objectContaining({
        agentDir: "/tmp/openclaw/agents/voice",
+        agentId: "voice",
+        sandboxSessionKey: "agent:voice:voice:15550001111",
        workspaceDir: "/tmp/openclaw/workspace/voice",
        sessionFile: "/tmp/openclaw/voice/sessions/session.jsonl",
      }),
--- a/extensions/voice-call/src/response-generator.ts
+++ b/extensions/voice-call/src/response-generator.ts
@@ -172,6 +172,14 @@ function extractSpokenTextFromPayloads(payloads: VoiceResponsePayload[]): string
  return spokenSegments.length > 0 ? spokenSegments.join(" ").trim() : null;
 }

+function resolveVoiceSandboxSessionKey(agentId: string, sessionKey: string): string { // Returns the agent-scoped key that generateVoiceResponse passes as sandboxSessionKey.
+  const trimmed = sessionKey.trim(); // Surrounding whitespace is removed before the prefix check and before building the result.
+  if (trimmed.toLowerCase().startsWith("agent:")) { // Case-insensitive test for an existing agent: prefix.
+    return trimmed; // Already prefixed: returned with its original casing; agentId is not applied.
+  }
+  return `agent:${agentId}:${trimmed}`; // Otherwise prefix with the agent id, e.g. main + voice:15550001111 gives agent:main:voice:15550001111.
+}
+
 /**
 * Generate a voice response using the embedded Pi agent with full tool support.
 * Uses the same agent infrastructure as messaging for consistent behavior.
@@ -264,6 +272,8 @@ export async function generateVoiceResponse(
    const result = await agentRuntime.runEmbeddedPiAgent({
      sessionId,
      sessionKey,
+      sandboxSessionKey: resolveVoiceSandboxSessionKey(agentId, sessionKey),
+      agentId,
      messageProvider: "voice",
      sessionFile,
      workspaceDir,
--- a/extensions/voice-call/src/runtime.ts
+++ b/extensions/voice-call/src/runtime.ts
@@ -318,6 +318,7 @@ export async function createVoiceCallRuntime(params: {
            cfg,
            agentRuntime,
            logger: log,
+            agentId: config.agentId ?? "main",
            sessionKey: resolveVoiceCallConsultSessionKey(call),
            messageProvider: "voice",
            lane: "voice",
--- a/src/realtime-voice/agent-consult-runtime.test.ts
+++ b/src/realtime-voice/agent-consult-runtime.test.ts
@@ -77,6 +77,8 @@ describe("realtime voice agent consult runtime", () => {
    expect(runEmbeddedPiAgent).toHaveBeenCalledWith(
      expect.objectContaining({
        sessionKey: "voice:15550001234",
+        sandboxSessionKey: "agent:main:voice:15550001234",
+        agentId: "main",
        messageProvider: "voice",
        lane: "voice",
        toolsAllow: ["read"],
@@ -89,6 +91,33 @@ describe("realtime voice agent consult runtime", () => {
    );
  });

+  it("scopes sandbox resolution to the configured consult agent", async () => { // Exercises a consult call with an explicit, non-main agentId.
+    const { runtime, runEmbeddedPiAgent } = createAgentRuntime();
+
+    await consultRealtimeVoiceAgent({
+      cfg: {} as never,
+      agentRuntime: runtime as never,
+      logger: { warn: vi.fn() },
+      agentId: "voice", // Explicit agent id; the expectation below checks that it reaches the embedded run.
+      sessionKey: "voice:15550001234", // Does not start with the agent: prefix, so the sandbox key is expected to gain one.
+      messageProvider: "voice",
+      lane: "voice",
+      runIdPrefix: "voice-realtime-consult:call-1",
+      args: { question: "What should I say?" },
+      transcript: [],
+      surface: "a live phone call",
+      userLabel: "Caller",
+    });
+
+    expect(runEmbeddedPiAgent).toHaveBeenCalledWith(
+      expect.objectContaining({
+        sessionKey: "voice:15550001234", // The plain session key is forwarded unchanged.
+        sandboxSessionKey: "agent:voice:voice:15550001234", // The sandbox key carries the agent:<agentId>: prefix.
+        agentId: "voice", // The agent id itself is also passed to the embedded run.
+      }),
+    );
+  });
+
  it("returns a speakable fallback when the embedded agent has no visible text", async () => {
    const warn = vi.fn();
    const { runtime } = createAgentRuntime([{ text: "hidden", isReasoning: true }]);
--- a/src/realtime-voice/agent-consult-runtime.ts
+++ b/src/realtime-voice/agent-consult-runtime.ts
@@ -15,6 +15,14 @@ export {
  resolveRealtimeVoiceAgentConsultToolsAllow,
 } from "./agent-consult-tool.js";

+function resolveRealtimeVoiceAgentSandboxSessionKey(agentId: string, sessionKey: string): string { // Returns the agent-scoped key that consultRealtimeVoiceAgent passes as sandboxSessionKey.
+  const trimmed = sessionKey.trim(); // Surrounding whitespace is removed before the prefix check and before building the result.
+  if (trimmed.toLowerCase().startsWith("agent:")) { // Case-insensitive test for an existing agent: prefix.
+    return trimmed; // Already prefixed: returned with its original casing; agentId is not applied.
+  }
+  return `agent:${agentId}:${trimmed}`; // Otherwise prefix with the agent id, e.g. voice + voice:15550001234 gives agent:voice:voice:15550001234.
+}
+
 export async function consultRealtimeVoiceAgent(params: {
  cfg: OpenClawConfig;
  agentRuntime: RealtimeVoiceAgentConsultRuntime;
@@ -63,6 +71,8 @@ export async function consultRealtimeVoiceAgent(params: {
  const result = await params.agentRuntime.runEmbeddedPiAgent({
    sessionId,
    sessionKey: params.sessionKey,
+    sandboxSessionKey: resolveRealtimeVoiceAgentSandboxSessionKey(agentId, params.sessionKey),
+    agentId,
    messageProvider: params.messageProvider,
    sessionFile,
    workspaceDir,