fix(voice-call): scope sandbox session to agent

This commit is contained in:
Peter Steinberger
2026-04-25 03:42:59 +01:00
parent 460720d0a1
commit 8f11e5ad18
6 changed files with 55 additions and 0 deletions

View File

@@ -84,6 +84,7 @@ Docs: https://docs.openclaw.ai
- Plugins/Voice Call: coalesce concurrent webhook server starts on the same runtime instance, avoiding a second `listen()` bind when overlapping startup paths race. Thanks @education-01.
- Plugins/Voice Call: pin voice response sessions to `responseModel` before embedded agent runs, avoiding live-session model switch failures when the global default model differs. Fixes #60118. Thanks @xinbenlv.
- Plugins/Voice Call: add `agentId` for voice response generation, so phone calls can use a dedicated agent workspace instead of always routing through `main`. Fixes #42155. Thanks @TheOpie.
- Plugins/Voice Call: scope embedded voice response sandbox resolution to the selected voice agent, so implicit `main` voice sessions respect `agents.defaults.sandbox.mode: "off"` even when other agents define sandboxed Docker binds. Fixes #56367. Thanks @crpol.
- Media tools: honor the configured web-fetch SSRF policy for media understanding, image/music/video generation references, and PDF inputs, so explicit RFC2544 opt-ins cover WebChat OSS uploads without weakening defaults. Fixes #71300. (#71321) Thanks @neeravmakwana.
- Agents/TTS: suppress successful spoken transcripts from verbose chat tool output when structured voice media is already queued, while preserving text output for non-builtin tool-name collisions. Fixes #71282. Thanks @neeravmakwana.
- Plugins/Google Meet: reuse existing Meet tabs and active sessions across harmless URL query differences, avoiding duplicate Chrome windows when agents retry a join. Thanks @steipete.

View File

@@ -223,6 +223,8 @@ describe("generateVoiceResponse", () => {
expect(runEmbeddedPiAgent).toHaveBeenCalledWith(
expect.objectContaining({
agentDir: "/tmp/openclaw/agents/main",
agentId: "main",
sandboxSessionKey: "agent:main:voice:15550001111",
workspaceDir: "/tmp/openclaw/workspace/main",
sessionFile: "/tmp/openclaw/main/sessions/session.jsonl",
}),
@@ -265,6 +267,8 @@ describe("generateVoiceResponse", () => {
expect(runEmbeddedPiAgent).toHaveBeenCalledWith(
expect.objectContaining({
agentDir: "/tmp/openclaw/agents/voice",
agentId: "voice",
sandboxSessionKey: "agent:voice:voice:15550001111",
workspaceDir: "/tmp/openclaw/workspace/voice",
sessionFile: "/tmp/openclaw/voice/sessions/session.jsonl",
}),

View File

@@ -172,6 +172,14 @@ function extractSpokenTextFromPayloads(payloads: VoiceResponsePayload[]): string
return spokenSegments.length > 0 ? spokenSegments.join(" ").trim() : null;
}
/**
 * Derive the session key handed to the embedded agent run as `sandboxSessionKey`.
 *
 * The incoming key is trimmed first. A key that already starts with `agent:`
 * (compared case-insensitively) is returned in its trimmed form; any other key
 * is prefixed with `agent:<agentId>:`.
 *
 * @param agentId - Agent id inserted into the prefix when the key is not already agent-scoped.
 * @param sessionKey - Raw voice session key; surrounding whitespace is ignored.
 * @returns The agent-scoped session key.
 */
function resolveVoiceSandboxSessionKey(agentId: string, sessionKey: string): string {
  const normalizedKey = sessionKey.trim();
  const isAgentScoped = normalizedKey.toLowerCase().startsWith("agent:");
  return isAgentScoped ? normalizedKey : "agent:" + agentId + ":" + normalizedKey;
}
/**
* Generate a voice response using the embedded Pi agent with full tool support.
* Uses the same agent infrastructure as messaging for consistent behavior.
@@ -264,6 +272,8 @@ export async function generateVoiceResponse(
const result = await agentRuntime.runEmbeddedPiAgent({
sessionId,
sessionKey,
sandboxSessionKey: resolveVoiceSandboxSessionKey(agentId, sessionKey),
agentId,
messageProvider: "voice",
sessionFile,
workspaceDir,

View File

@@ -318,6 +318,7 @@ export async function createVoiceCallRuntime(params: {
cfg,
agentRuntime,
logger: log,
agentId: config.agentId ?? "main",
sessionKey: resolveVoiceCallConsultSessionKey(call),
messageProvider: "voice",
lane: "voice",

View File

@@ -77,6 +77,8 @@ describe("realtime voice agent consult runtime", () => {
expect(runEmbeddedPiAgent).toHaveBeenCalledWith(
expect.objectContaining({
sessionKey: "voice:15550001234",
sandboxSessionKey: "agent:main:voice:15550001234",
agentId: "main",
messageProvider: "voice",
lane: "voice",
toolsAllow: ["read"],
@@ -89,6 +91,33 @@ describe("realtime voice agent consult runtime", () => {
);
});
it("scopes sandbox resolution to the configured consult agent", async () => {
  // Consult with a non-default agent id: the plain session key must be passed
  // through unchanged while the sandbox session key carries the agent prefix.
  const consultAgentId = "voice";
  const consultSessionKey = "voice:15550001234";
  const harness = createAgentRuntime();

  await consultRealtimeVoiceAgent({
    cfg: {} as never,
    agentRuntime: harness.runtime as never,
    logger: { warn: vi.fn() },
    agentId: consultAgentId,
    sessionKey: consultSessionKey,
    messageProvider: "voice",
    lane: "voice",
    runIdPrefix: "voice-realtime-consult:call-1",
    args: { question: "What should I say?" },
    transcript: [],
    surface: "a live phone call",
    userLabel: "Caller",
  });

  const expectedRunParams = expect.objectContaining({
    sessionKey: consultSessionKey,
    sandboxSessionKey: "agent:voice:voice:15550001234",
    agentId: consultAgentId,
  });
  expect(harness.runEmbeddedPiAgent).toHaveBeenCalledWith(expectedRunParams);
});
it("returns a speakable fallback when the embedded agent has no visible text", async () => {
const warn = vi.fn();
const { runtime } = createAgentRuntime([{ text: "hidden", isReasoning: true }]);

View File

@@ -15,6 +15,14 @@ export {
resolveRealtimeVoiceAgentConsultToolsAllow,
} from "./agent-consult-tool.js";
/**
 * Compute the `sandboxSessionKey` for a realtime voice agent consult run.
 *
 * Whitespace around the session key is dropped. Keys whose trimmed form begins
 * with `agent:` (case-insensitive match) are returned as-is after trimming;
 * every other key gets an `agent:<agentId>:` prefix.
 *
 * @param agentId - Agent id used for the prefix when one has to be added.
 * @param sessionKey - Consult session key as supplied by the caller.
 * @returns The session key scoped to an agent.
 */
function resolveRealtimeVoiceAgentSandboxSessionKey(agentId: string, sessionKey: string): string {
  const key = sessionKey.trim();
  const needsAgentPrefix = !key.toLowerCase().startsWith("agent:");
  if (needsAgentPrefix) {
    return "agent:" + agentId + ":" + key;
  }
  return key;
}
export async function consultRealtimeVoiceAgent(params: {
cfg: OpenClawConfig;
agentRuntime: RealtimeVoiceAgentConsultRuntime;
@@ -63,6 +71,8 @@ export async function consultRealtimeVoiceAgent(params: {
const result = await params.agentRuntime.runEmbeddedPiAgent({
sessionId,
sessionKey: params.sessionKey,
sandboxSessionKey: resolveRealtimeVoiceAgentSandboxSessionKey(agentId, params.sessionKey),
agentId,
messageProvider: params.messageProvider,
sessionFile,
workspaceDir,