fix(talk): surface openai realtime browser failures

This commit is contained in:
Peter Steinberger
2026-05-02 08:47:07 +01:00
parent bf67976ea5
commit 10c8b9085a
7 changed files with 158 additions and 1 deletion

View File

@@ -28,6 +28,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Control UI/Talk: allow the OpenAI Realtime WebRTC offer endpoint through the Control UI CSP, configure browser sessions with explicit VAD/transcription input settings, and surface OpenAI realtime error/lifecycle events instead of leaving Talk stuck as live with no diagnostic. Fixes #73427.
- Providers/OpenAI: resolve `keychain:<service>:<account>` `OPENAI_API_KEY` refs before creating OpenAI Realtime browser sessions or voice bridges, with a bounded cached Keychain lookup. Fixes #72120. Thanks @ctbritt.
- Discord/gateway: reconnect when the gateway socket closes while waiting for the shared IDENTIFY concurrency window, instead of silently skipping IDENTIFY and leaving the bot online but unresponsive. Fixes #74617. Thanks @zeeskdr-ai.
- Voice Call: add `sessionScope: "per-call"` for fresh per-call agent memory while preserving the default per-phone caller history. Fixes #45280. Thanks @pondcountry.

View File

@@ -166,6 +166,27 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
}),
}),
);
const request = fetchWithSsrFGuardMock.mock.calls[0]?.[0] as
| { init?: { body?: string } }
| undefined;
const body = JSON.parse(request?.init?.body ?? "{}") as {
session?: {
audio?: {
input?: {
turn_detection?: Record<string, unknown>;
transcription?: Record<string, unknown>;
};
};
};
};
expect(body.session?.audio?.input).toEqual({
turn_detection: {
type: "server_vad",
create_response: true,
interrupt_response: true,
},
transcription: { model: "whisper-1" },
});
expect(session).toMatchObject({
provider: "openai",
transport: "webrtc-sdp",

View File

@@ -753,6 +753,14 @@ async function createOpenAIRealtimeBrowserSession(
model,
instructions: req.instructions,
audio: {
input: {
turn_detection: {
type: "server_vad",
create_response: true,
interrupt_response: true,
},
transcription: { model: "whisper-1" },
},
output: { voice },
},
};

View File

@@ -17,6 +17,18 @@ describe("buildControlUiCspHeader", () => {
expect(csp).toContain("font-src 'self' https://fonts.gstatic.com");
});
// The connect-src directive must be widened only by the OpenAI realtime offer
// endpoint — never by a blanket https: source — so we pin the exact directive.
it("allows OpenAI realtime WebRTC offer requests without allowing all HTTPS", () => {
  const header = buildControlUiCspHeader();
  const connectSrc = header
    .split("; ")
    .find((directive) => directive.startsWith("connect-src "));
  expect(connectSrc).toBe("connect-src 'self' ws: wss: https://api.openai.com");
});
it("limits image loading to same-origin, data, and managed blob URLs", () => {
const csp = buildControlUiCspHeader();
expect(csp).toContain("img-src 'self' data: blob:");

View File

@@ -47,6 +47,6 @@ export function buildControlUiCspHeader(opts?: { inlineScriptHashes?: string[] }
"img-src 'self' data: blob:",
"font-src 'self' https://fonts.gstatic.com",
"worker-src 'self'",
"connect-src 'self' ws: wss:",
"connect-src 'self' ws: wss: https://api.openai.com",
].join("; ");
}

View File

@@ -14,6 +14,11 @@ type RealtimeServerEvent = {
delta?: string;
transcript?: string;
arguments?: string;
error?: unknown;
response?: {
status?: string;
status_details?: unknown;
};
};
type ToolBuffer = {
@@ -133,11 +138,42 @@ export class WebRtcSdpRealtimeTalkTransport implements RealtimeTalkTransport {
case "response.function_call_arguments.done":
void this.handleToolCall(event);
return;
case "input_audio_buffer.speech_started":
this.ctx.callbacks.onStatus?.("listening", "Speech detected");
return;
case "input_audio_buffer.speech_stopped":
this.ctx.callbacks.onStatus?.("thinking", "Processing speech");
return;
case "response.created":
this.ctx.callbacks.onStatus?.("thinking", "Generating response");
return;
case "response.done":
this.ctx.callbacks.onStatus?.("listening", this.extractResponseStatus(event));
return;
case "error":
this.ctx.callbacks.onStatus?.("error", this.extractErrorDetail(event.error));
return;
default:
return;
}
}
/**
 * Turns a terminal `response.done` status into a short status detail.
 * Returns undefined for the normal "completed" case (and for a missing
 * status) so the UI shows no extra diagnostic on the happy path.
 */
private extractResponseStatus(event: RealtimeServerEvent): string | undefined {
  const status = event.response?.status;
  if (!status || status === "completed") {
    return undefined;
  }
  return `Response ${status}`;
}
/**
 * Derives a human-readable detail from an OpenAI realtime `error` payload.
 * Prefers `message`, then `code`, then `type` (each trimmed, non-empty
 * strings only) and falls back to a generic label for anything else.
 */
private extractErrorDetail(error: unknown): string {
  const fallback = "Realtime provider error";
  if (typeof error !== "object" || error === null) {
    return fallback;
  }
  const details = error as Record<string, unknown>;
  for (const field of ["message", "code", "type"] as const) {
    const value = details[field];
    if (typeof value === "string" && value.trim() !== "") {
      return value.trim();
    }
  }
  return fallback;
}
private bufferToolDelta(event: RealtimeServerEvent): void {
const key = event.item_id ?? "unknown";
const existing = this.toolBuffers.get(key);

View File

@@ -11,12 +11,19 @@ class FakeDataChannel extends EventTarget {
}
class FakePeerConnection extends EventTarget {
static instances: FakePeerConnection[] = [];
connectionState: RTCPeerConnectionState = "new";
readonly channel = new FakeDataChannel();
readonly addTrack = vi.fn();
localDescription: RTCSessionDescriptionInit | null = null;
remoteDescription: RTCSessionDescriptionInit | null = null;
constructor() {
super();
FakePeerConnection.instances.push(this);
}
createDataChannel(): RTCDataChannel {
return this.channel as unknown as RTCDataChannel;
}
@@ -44,6 +51,7 @@ describe("WebRtcSdpRealtimeTalkTransport", () => {
});
beforeEach(() => {
FakePeerConnection.instances = [];
const track = { stop: vi.fn() } as unknown as MediaStreamTrack;
const stream = {
getAudioTracks: () => [track],
@@ -93,4 +101,75 @@ describe("WebRtcSdpRealtimeTalkTransport", () => {
});
transport.stop();
});
// A data-channel "error" event from OpenAI must reach the Talk UI via the
// onStatus callback instead of leaving the session silently stuck as live.
it("surfaces realtime provider errors from the OpenAI data channel", async () => {
  vi.stubGlobal(
    "fetch",
    vi.fn(async () => new Response("answer-sdp")) as unknown as typeof fetch,
  );
  const statusSpy = vi.fn();
  const transport = new WebRtcSdpRealtimeTalkTransport(
    { provider: "openai", transport: "webrtc-sdp", clientSecret: "client-secret-123" },
    { client: {} as never, sessionKey: "main", callbacks: { onStatus: statusSpy } },
  );
  await transport.start();
  const errorEvent = new MessageEvent("message", {
    data: JSON.stringify({
      type: "error",
      error: { message: "Realtime model rejected the session" },
    }),
  });
  FakePeerConnection.instances[0]?.channel.dispatchEvent(errorEvent);
  expect(statusSpy).toHaveBeenCalledWith("error", "Realtime model rejected the session");
  transport.stop();
});
// Speech/VAD and response lifecycle events should drive listening/thinking
// status transitions; response.done with no status detail maps to undefined.
it("surfaces speech and response lifecycle status from the OpenAI data channel", async () => {
  vi.stubGlobal(
    "fetch",
    vi.fn(async () => new Response("answer-sdp")) as unknown as typeof fetch,
  );
  const statusSpy = vi.fn();
  const transport = new WebRtcSdpRealtimeTalkTransport(
    { provider: "openai", transport: "webrtc-sdp", clientSecret: "client-secret-123" },
    { client: {} as never, sessionKey: "main", callbacks: { onStatus: statusSpy } },
  );
  await transport.start();
  const [peer] = FakePeerConnection.instances;
  const lifecycleEvents = [
    "input_audio_buffer.speech_started",
    "input_audio_buffer.speech_stopped",
    "response.created",
    "response.done",
  ];
  for (const type of lifecycleEvents) {
    peer?.channel.dispatchEvent(new MessageEvent("message", { data: JSON.stringify({ type }) }));
  }
  expect(statusSpy).toHaveBeenCalledWith("listening", "Speech detected");
  expect(statusSpy).toHaveBeenCalledWith("thinking", "Processing speech");
  expect(statusSpy).toHaveBeenCalledWith("thinking", "Generating response");
  expect(statusSpy).toHaveBeenCalledWith("listening", undefined);
  transport.stop();
});
});