mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 08:40:44 +00:00
fix(talk): surface openai realtime browser failures
This commit is contained in:
@@ -28,6 +28,7 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
### Fixes
|
||||
|
||||
- Control UI/Talk: allow the OpenAI Realtime WebRTC offer endpoint through the Control UI CSP, configure browser sessions with explicit VAD/transcription input settings, and surface OpenAI realtime error/lifecycle events instead of leaving Talk stuck as live with no diagnostic. Fixes #73427.
|
||||
- Providers/OpenAI: resolve `keychain:<service>:<account>` `OPENAI_API_KEY` refs before creating OpenAI Realtime browser sessions or voice bridges, with a bounded cached Keychain lookup. Fixes #72120. Thanks @ctbritt.
|
||||
- Discord/gateway: reconnect when the gateway socket closes while waiting for the shared IDENTIFY concurrency window, instead of silently skipping IDENTIFY and leaving the bot online but unresponsive. Fixes #74617. Thanks @zeeskdr-ai.
|
||||
- Voice Call: add `sessionScope: "per-call"` for fresh per-call agent memory while preserving the default per-phone caller history. Fixes #45280. Thanks @pondcountry.
|
||||
|
||||
@@ -166,6 +166,27 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
|
||||
}),
|
||||
}),
|
||||
);
|
||||
const request = fetchWithSsrFGuardMock.mock.calls[0]?.[0] as
|
||||
| { init?: { body?: string } }
|
||||
| undefined;
|
||||
const body = JSON.parse(request?.init?.body ?? "{}") as {
|
||||
session?: {
|
||||
audio?: {
|
||||
input?: {
|
||||
turn_detection?: Record<string, unknown>;
|
||||
transcription?: Record<string, unknown>;
|
||||
};
|
||||
};
|
||||
};
|
||||
};
|
||||
expect(body.session?.audio?.input).toEqual({
|
||||
turn_detection: {
|
||||
type: "server_vad",
|
||||
create_response: true,
|
||||
interrupt_response: true,
|
||||
},
|
||||
transcription: { model: "whisper-1" },
|
||||
});
|
||||
expect(session).toMatchObject({
|
||||
provider: "openai",
|
||||
transport: "webrtc-sdp",
|
||||
|
||||
@@ -753,6 +753,14 @@ async function createOpenAIRealtimeBrowserSession(
|
||||
model,
|
||||
instructions: req.instructions,
|
||||
audio: {
|
||||
input: {
|
||||
turn_detection: {
|
||||
type: "server_vad",
|
||||
create_response: true,
|
||||
interrupt_response: true,
|
||||
},
|
||||
transcription: { model: "whisper-1" },
|
||||
},
|
||||
output: { voice },
|
||||
},
|
||||
};
|
||||
|
||||
@@ -17,6 +17,18 @@ describe("buildControlUiCspHeader", () => {
|
||||
expect(csp).toContain("font-src 'self' https://fonts.gstatic.com");
|
||||
});
|
||||
|
||||
// Pins the exact connect-src source list: the OpenAI API origin is allowed,
// but the assertion fails if any broader source (e.g. a bare https:) is added.
it("allows OpenAI realtime WebRTC offer requests without allowing all HTTPS", () => {
|
||||
const csp = buildControlUiCspHeader();
|
||||
// Isolate the connect-src directive by splitting the header on "; ".
const connectSrc = csp.split("; ").find((directive) => directive.startsWith("connect-src "));
|
||||
// toEqual on the split tokens checks the full list, not just containment.
expect(connectSrc?.split(" ")).toEqual([
|
||||
"connect-src",
|
||||
"'self'",
|
||||
"ws:",
|
||||
"wss:",
|
||||
"https://api.openai.com",
|
||||
]);
|
||||
});
|
||||
|
||||
it("limits image loading to same-origin, data, and managed blob URLs", () => {
|
||||
const csp = buildControlUiCspHeader();
|
||||
expect(csp).toContain("img-src 'self' data: blob:");
|
||||
|
||||
@@ -47,6 +47,6 @@ export function buildControlUiCspHeader(opts?: { inlineScriptHashes?: string[] }
|
||||
"img-src 'self' data: blob:",
|
||||
"font-src 'self' https://fonts.gstatic.com",
|
||||
"worker-src 'self'",
|
||||
"connect-src 'self' ws: wss:",
|
||||
"connect-src 'self' ws: wss: https://api.openai.com",
|
||||
].join("; ");
|
||||
}
|
||||
|
||||
@@ -14,6 +14,11 @@ type RealtimeServerEvent = {
|
||||
delta?: string;
|
||||
transcript?: string;
|
||||
arguments?: string;
|
||||
error?: unknown;
|
||||
response?: {
|
||||
status?: string;
|
||||
status_details?: unknown;
|
||||
};
|
||||
};
|
||||
|
||||
type ToolBuffer = {
|
||||
@@ -133,11 +138,42 @@ export class WebRtcSdpRealtimeTalkTransport implements RealtimeTalkTransport {
|
||||
case "response.function_call_arguments.done":
|
||||
void this.handleToolCall(event);
|
||||
return;
|
||||
case "input_audio_buffer.speech_started":
|
||||
this.ctx.callbacks.onStatus?.("listening", "Speech detected");
|
||||
return;
|
||||
case "input_audio_buffer.speech_stopped":
|
||||
this.ctx.callbacks.onStatus?.("thinking", "Processing speech");
|
||||
return;
|
||||
case "response.created":
|
||||
this.ctx.callbacks.onStatus?.("thinking", "Generating response");
|
||||
return;
|
||||
case "response.done":
|
||||
this.ctx.callbacks.onStatus?.("listening", this.extractResponseStatus(event));
|
||||
return;
|
||||
case "error":
|
||||
this.ctx.callbacks.onStatus?.("error", this.extractErrorDetail(event.error));
|
||||
return;
|
||||
default:
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Derives the status detail reported when a response finishes.
 *
 * @param event - Server event whose optional `response.status` is inspected.
 * @returns `Response <status>` when a status is present and differs from
 *   `"completed"`; `undefined` when the status is missing, empty, or `"completed"`.
 */
private extractResponseStatus(event: RealtimeServerEvent): string | undefined {
  const responseStatus = event.response?.status;
  // A missing/empty status and a normal completion both carry no extra detail.
  if (!responseStatus || responseStatus === "completed") {
    return undefined;
  }
  return `Response ${responseStatus}`;
}
|
||||
|
||||
/**
 * Picks a human-readable detail string out of an untyped error payload.
 *
 * @param error - The `error` field of a server event; may be any value.
 * @returns The first non-blank (after trimming) string among the payload's
 *   `message`, `code`, and `type` properties, in that order; otherwise the
 *   generic text `"Realtime provider error"`. Non-object and null/undefined
 *   payloads always yield the generic text.
 */
private extractErrorDetail(error: unknown): string {
  const fallback = "Realtime provider error";
  if (error === null || typeof error !== "object") {
    return fallback;
  }
  const fields = error as Record<string, unknown>;
  // Read all three candidates up front; non-string values count as blank.
  const candidates = (["message", "code", "type"] as const).map((key) => {
    const value = fields[key];
    return typeof value === "string" ? value.trim() : "";
  });
  return candidates.find((text) => text !== "") ?? fallback;
}
|
||||
|
||||
private bufferToolDelta(event: RealtimeServerEvent): void {
|
||||
const key = event.item_id ?? "unknown";
|
||||
const existing = this.toolBuffers.get(key);
|
||||
|
||||
@@ -11,12 +11,19 @@ class FakeDataChannel extends EventTarget {
|
||||
}
|
||||
|
||||
class FakePeerConnection extends EventTarget {
|
||||
static instances: FakePeerConnection[] = [];
|
||||
|
||||
connectionState: RTCPeerConnectionState = "new";
|
||||
readonly channel = new FakeDataChannel();
|
||||
readonly addTrack = vi.fn();
|
||||
localDescription: RTCSessionDescriptionInit | null = null;
|
||||
remoteDescription: RTCSessionDescriptionInit | null = null;
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
FakePeerConnection.instances.push(this);
|
||||
}
|
||||
|
||||
createDataChannel(): RTCDataChannel {
|
||||
return this.channel as unknown as RTCDataChannel;
|
||||
}
|
||||
@@ -44,6 +51,7 @@ describe("WebRtcSdpRealtimeTalkTransport", () => {
|
||||
});
|
||||
|
||||
beforeEach(() => {
|
||||
FakePeerConnection.instances = [];
|
||||
const track = { stop: vi.fn() } as unknown as MediaStreamTrack;
|
||||
const stream = {
|
||||
getAudioTracks: () => [track],
|
||||
@@ -93,4 +101,75 @@ describe("WebRtcSdpRealtimeTalkTransport", () => {
|
||||
});
|
||||
transport.stop();
|
||||
});
|
||||
|
||||
// An `error` message delivered on the data channel must be forwarded to the
// onStatus callback as ("error", <error.message>).
it("surfaces realtime provider errors from the OpenAI data channel", async () => {
|
||||
// Replace global fetch with a stub that always resolves to an "answer-sdp" body.
vi.stubGlobal(
|
||||
"fetch",
|
||||
vi.fn(async () => new Response("answer-sdp")) as unknown as typeof fetch,
|
||||
);
|
||||
const onStatus = vi.fn();
|
||||
const transport = new WebRtcSdpRealtimeTalkTransport(
|
||||
{
|
||||
provider: "openai",
|
||||
|
||||
transport: "webrtc-sdp",
|
||||
clientSecret: "client-secret-123",
|
||||
},
|
||||
{
|
||||
client: {} as never,
|
||||
sessionKey: "main",
|
||||
|
||||
callbacks: { onStatus },
|
||||
},
|
||||
);
|
||||
|
||||
await transport.start();
|
||||
// First recorded fake peer; presumably the one the transport created in start().
const peer = FakePeerConnection.instances[0];
|
||||
peer?.channel.dispatchEvent(
|
||||
new MessageEvent("message", {
|
||||
data: JSON.stringify({
|
||||
type: "error",
|
||||
error: { message: "Realtime model rejected the session" },
|
||||
}),
|
||||
}),
|
||||
);
|
||||
|
||||
expect(onStatus).toHaveBeenCalledWith("error", "Realtime model rejected the session");
|
||||
transport.stop();
|
||||
});
|
||||
|
||||
// Speech and response lifecycle events on the data channel must each be
// forwarded to the onStatus callback as a (status, detail) pair.
it("surfaces speech and response lifecycle status from the OpenAI data channel", async () => {
|
||||
// Replace global fetch with a stub that always resolves to an "answer-sdp" body.
vi.stubGlobal(
|
||||
"fetch",
|
||||
vi.fn(async () => new Response("answer-sdp")) as unknown as typeof fetch,
|
||||
);
|
||||
const onStatus = vi.fn();
|
||||
const transport = new WebRtcSdpRealtimeTalkTransport(
|
||||
{
|
||||
provider: "openai",
|
||||
|
||||
transport: "webrtc-sdp",
|
||||
clientSecret: "client-secret-123",
|
||||
},
|
||||
{
|
||||
client: {} as never,
|
||||
sessionKey: "main",
|
||||
|
||||
callbacks: { onStatus },
|
||||
},
|
||||
);
|
||||
|
||||
await transport.start();
|
||||
// First recorded fake peer; presumably the one the transport created in start().
const peer = FakePeerConnection.instances[0];
|
||||
// Each message carries only `type`, so response.done arrives without a
// response.status and is expected to produce an undefined detail.
for (const type of [
|
||||
"input_audio_buffer.speech_started",
|
||||
"input_audio_buffer.speech_stopped",
|
||||
"response.created",
|
||||
"response.done",
|
||||
]) {
|
||||
peer?.channel.dispatchEvent(new MessageEvent("message", { data: JSON.stringify({ type }) }));
|
||||
}
|
||||
|
||||
expect(onStatus).toHaveBeenCalledWith("listening", "Speech detected");
|
||||
expect(onStatus).toHaveBeenCalledWith("thinking", "Processing speech");
|
||||
expect(onStatus).toHaveBeenCalledWith("thinking", "Generating response");
|
||||
expect(onStatus).toHaveBeenCalledWith("listening", undefined);
|
||||
transport.stop();
|
||||
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user