mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 14:10:51 +00:00
feat: add browser realtime talk transports
This commit is contained in:
515
scripts/dev/realtime-talk-live-smoke.ts
Normal file
515
scripts/dev/realtime-talk-live-smoke.ts
Normal file
@@ -0,0 +1,515 @@
|
||||
import { mkdtemp, rm, writeFile } from "node:fs/promises";
|
||||
import { tmpdir } from "node:os";
|
||||
import path from "node:path";
|
||||
import { GoogleGenAI, Modality } from "@google/genai";
|
||||
import { chromium, type Browser } from "playwright";
|
||||
import { createServer, type ViteDevServer } from "vite";
|
||||
|
||||
/**
 * Reads a setting from the environment, using `fallback` when the variable is
 * unset or blank after trimming.
 */
const envOrDefault = (key: string, fallback: string): string =>
  process.env[key]?.trim() || fallback;

/** OpenAI Realtime model used when minting the client secret. */
const OPENAI_REALTIME_MODEL = envOrDefault("OPENCLAW_REALTIME_OPENAI_MODEL", "gpt-realtime-1.5");

/** Output voice requested for the OpenAI Realtime session. */
const OPENAI_REALTIME_VOICE = envOrDefault("OPENCLAW_REALTIME_OPENAI_VOICE", "alloy");

/** Google Live model used for the auth token constraints and the WebSocket setup message. */
const GOOGLE_REALTIME_MODEL = envOrDefault(
  "OPENCLAW_REALTIME_GOOGLE_MODEL",
  "gemini-2.5-flash-native-audio-preview-12-2025",
);

/** Prebuilt voice name requested for the Google Live session. */
const GOOGLE_REALTIME_VOICE = envOrDefault("OPENCLAW_REALTIME_GOOGLE_VOICE", "Kore");

/** WebSocket endpoint the browser connects to with the Google Live auth token. */
const GOOGLE_LIVE_WS_URL =
  "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContentConstrained";
|
||||
|
||||
/** Outcome of a single smoke check, as printed at the end of the run. */
type SmokeResult = {
  /** Identifier of the check, e.g. "openai-webrtc-browser". */
  name: string;
  /** Whether the check passed. */
  ok: boolean;
  /** Optional diagnostic payload (error text or observed values). */
  details?: Record<string, unknown>;
};
|
||||
|
||||
/**
 * Looks up an environment variable and trims surrounding whitespace.
 *
 * @param name - Name of the environment variable.
 * @returns The trimmed value, or `undefined` when the variable is unset or blank.
 */
function getEnv(name: string): string | undefined {
  const trimmed = process.env[name]?.trim();
  if (!trimmed) {
    return undefined;
  }
  return trimmed;
}
|
||||
|
||||
/**
 * Converts a caught value into a one-line description.
 *
 * @param error - Anything that was thrown.
 * @returns The `message` of an `Error`, otherwise the value's string form.
 */
function shortError(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}
|
||||
|
||||
/**
 * Reads a response body as text, capped so it stays usable in error messages.
 *
 * @param response - Response whose body has not been consumed yet.
 * @returns The body unchanged when it is at most 600 characters long; otherwise
 *   its first 600 characters followed by "...".
 */
async function readBoundedText(response: Response): Promise<string> {
  const limit = 600;
  const body = await response.text();
  if (body.length <= limit) {
    return body;
  }
  return `${body.slice(0, limit)}...`;
}
|
||||
|
||||
/**
 * Logs one smoke result as "<name>: ok|failed" followed by its details object
 * (an empty object when no details were recorded).
 *
 * @param result - The smoke result to report.
 */
function printResult(result: SmokeResult): void {
  const outcome = result.ok ? "ok" : "failed";
  const details = result.details ?? {};
  console.log(`${result.name}: ${outcome}`, details);
}
|
||||
|
||||
/**
 * Comparator for sorting optional strings; a missing value is treated as "".
 *
 * @param left - First value.
 * @param right - Second value.
 * @returns The `localeCompare` ordering of the two values.
 */
function compareStrings(left: string | undefined, right: string | undefined): number {
  const lhs = left ?? "";
  const rhs = right ?? "";
  return lhs.localeCompare(rhs);
}
|
||||
|
||||
/**
 * Mints an ephemeral OpenAI Realtime client secret for the configured model and voice.
 *
 * @param apiKey - OpenAI API key sent as the bearer token.
 * @returns The secret taken from the response's top-level `value`, or from
 *   `client_secret.value` when the top-level field is not a string.
 * @throws Error when the HTTP call is not OK (status plus a bounded body excerpt)
 *   or when no non-empty secret is present in the response.
 */
async function createOpenAIClientSecret(apiKey: string): Promise<string> {
  const sessionRequest = {
    session: {
      type: "realtime",
      model: OPENAI_REALTIME_MODEL,
      audio: {
        output: { voice: OPENAI_REALTIME_VOICE },
      },
    },
  };
  const response = await fetch("https://api.openai.com/v1/realtime/client_secrets", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(sessionRequest),
  });
  if (!response.ok) {
    const excerpt = await readBoundedText(response);
    throw new Error(`OpenAI Realtime client secret failed (${response.status}): ${excerpt}`);
  }

  const payload = (await response.json()) as Record<string, unknown>;
  const wrapper = payload.client_secret;
  const nested =
    wrapper && typeof wrapper === "object" ? (wrapper as Record<string, unknown>) : undefined;

  // The top-level value wins whenever it is a string; the nested form is only a fallback.
  const secret = [payload.value, nested?.value].find(
    (candidate): candidate is string => typeof candidate === "string",
  );
  if (!secret) {
    throw new Error("OpenAI Realtime client secret response did not include a value");
  }
  return secret;
}
|
||||
|
||||
/**
 * Smoke-checks the OpenAI Realtime WebRTC transport from inside a browser page.
 *
 * Mints a client secret, then in the page: obtains an audio stream, creates an
 * RTCPeerConnection with an "oai-events" data channel, posts the SDP offer to
 * OpenAI and applies the returned answer.
 *
 * @param browser - Playwright browser used to create an isolated context.
 * @param apiKey - OpenAI API key used to mint the client secret.
 * @returns A result that is `ok` when the SDP answer contains an audio section and
 *   was applied as the remote description. The observed connection state is
 *   reported in the details but does not gate `ok`. Failures are returned as
 *   `ok: false` rather than thrown.
 */
async function smokeOpenAIWebRtc(browser: Browser, apiKey: string): Promise<SmokeResult> {
  let context: Awaited<ReturnType<Browser["newContext"]>> | undefined;
  try {
    const clientSecret = await createOpenAIClientSecret(apiKey);
    context = await browser.newContext({
      permissions: ["microphone"],
    });
    const page = await context.newPage();
    const result = await page.evaluate(
      async ({ clientSecret: secret }) => {
        // Use getUserMedia when the page exposes it; otherwise synthesize an
        // audio stream from an oscillator so the offer still has an audio track.
        let media: MediaStream;
        if (navigator.mediaDevices?.getUserMedia) {
          media = await navigator.mediaDevices.getUserMedia({ audio: true });
        } else {
          const audioContext = new AudioContext();
          const destination = audioContext.createMediaStreamDestination();
          const oscillator = audioContext.createOscillator();
          oscillator.connect(destination);
          oscillator.start();
          media = destination.stream;
        }
        const peer = new RTCPeerConnection();
        for (const track of media.getAudioTracks()) {
          peer.addTrack(track, media);
        }
        const channel = peer.createDataChannel("oai-events");
        // Settles on the first of: connected/failed, data channel open, or 12 s elapsed.
        const connectionState = new Promise<string>((resolve) => {
          const timeout = window.setTimeout(() => resolve(peer.connectionState), 12_000);
          peer.addEventListener("connectionstatechange", () => {
            if (peer.connectionState === "connected" || peer.connectionState === "failed") {
              window.clearTimeout(timeout);
              resolve(peer.connectionState);
            }
          });
          channel.addEventListener("open", () => {
            window.clearTimeout(timeout);
            resolve(peer.connectionState || "data-channel-open");
          });
        });
        const offer = await peer.createOffer();
        await peer.setLocalDescription(offer);
        const response = await fetch("https://api.openai.com/v1/realtime/calls", {
          method: "POST",
          body: offer.sdp,
          headers: {
            Authorization: `Bearer ${secret}`,
            "Content-Type": "application/sdp",
          },
        });
        if (!response.ok) {
          throw new Error(`OpenAI Realtime SDP offer failed (${response.status})`);
        }
        const answer = await response.text();
        await peer.setRemoteDescription({ type: "answer", sdp: answer });
        const state = await connectionState;
        peer.close();
        media.getTracks().forEach((track) => track.stop());
        return {
          answerHasAudio: answer.includes("m=audio"),
          remoteDescriptionApplied: peer.remoteDescription?.type === "answer",
          connectionState: state,
        };
      },
      { clientSecret },
    );
    return {
      name: "openai-webrtc-browser",
      ok: result.answerHasAudio && result.remoteDescriptionApplied,
      details: {
        model: OPENAI_REALTIME_MODEL,
        answerHasAudio: result.answerHasAudio,
        remoteDescriptionApplied: result.remoteDescriptionApplied,
        connectionState: result.connectionState,
      },
    };
  } catch (error) {
    return { name: "openai-webrtc-browser", ok: false, details: { error: shortError(error) } };
  } finally {
    // Close the context on every path so a failed check does not leak it (and its
    // media/peer resources) into later checks. Cleanup is best-effort: a close
    // error must not replace the smoke result.
    await context?.close().catch(() => undefined);
  }
}
|
||||
|
||||
/**
 * Creates a single-use Google Live auth token constrained to the configured
 * model, voice and audio response modality.
 *
 * @param apiKey - Gemini/Google API key used to construct the client.
 * @returns The trimmed token name from the response.
 * @throws Error when the response carries no token name.
 */
async function createGoogleLiveToken(apiKey: string): Promise<string> {
  const client = new GoogleGenAI({
    apiKey,
    httpOptions: { apiVersion: "v1alpha" },
  });

  const issuedAt = Date.now();
  // The token expires after 30 minutes; a new session must start within 1 minute.
  const expireTime = new Date(issuedAt + 30 * 60 * 1000).toISOString();
  const newSessionExpireTime = new Date(issuedAt + 60 * 1000).toISOString();

  const token = await client.authTokens.create({
    config: {
      uses: 1,
      expireTime,
      newSessionExpireTime,
      liveConnectConstraints: {
        model: GOOGLE_REALTIME_MODEL,
        config: {
          responseModalities: [Modality.AUDIO],
          speechConfig: {
            voiceConfig: {
              prebuiltVoiceConfig: { voiceName: GOOGLE_REALTIME_VOICE },
            },
          },
          systemInstruction: "OpenClaw browser Talk live smoke.",
          inputAudioTranscription: {},
          outputAudioTranscription: {},
        },
      },
    },
  });

  const tokenName = token.name?.trim();
  if (!tokenName) {
    throw new Error("Google Live auth token response did not include a token name");
  }
  return tokenName;
}
|
||||
|
||||
/**
 * Smoke-checks the Google Live WebSocket transport from inside a browser page.
 *
 * Creates a constrained auth token, opens the Live WebSocket with it, sends the
 * setup message and waits for a `setupComplete` reply.
 *
 * @param browser - Playwright browser used to open the page.
 * @param apiKey - Gemini/Google API key used to create the auth token.
 * @returns A result that is `ok` when `setupComplete` was received. Failures
 *   (timeout after 15 s, socket error, non-1000 close, unparsable message) are
 *   returned as `ok: false` rather than thrown.
 */
async function smokeGoogleLiveBrowserWs(browser: Browser, apiKey: string): Promise<SmokeResult> {
  let page: Awaited<ReturnType<Browser["newPage"]>> | undefined;
  try {
    const token = await createGoogleLiveToken(apiKey);
    page = await browser.newPage();
    // Defines an identity `__name` in the page before the evaluated function runs.
    // NOTE(review): presumably a shim for a helper the TS runner injects around
    // named functions when serializing the callback — confirm.
    await page.evaluate("globalThis.__name = (fn) => fn");
    const result = await page.evaluate(
      async ({ model, tokenName, websocketUrl }) => {
        // Collected for diagnostics and embedded in timeout/close error messages.
        const debug: {
          opened: boolean;
          messages: string[];
          close?: { code: number; reason: string };
          error: boolean;
        } = { opened: false, messages: [], error: false };
        const dataToText = async (data: unknown): Promise<string> => {
          if (typeof data === "string") {
            return data;
          }
          if (data instanceof Blob) {
            return await data.text();
          }
          if (data instanceof ArrayBuffer) {
            return new TextDecoder().decode(data);
          }
          return String(data);
        };
        const url = new URL(websocketUrl);
        url.searchParams.set("access_token", tokenName);
        const ws = new WebSocket(url.toString());
        const done = new Promise<Record<string, unknown>>((resolve, reject) => {
          const timeout = window.setTimeout(
            () => reject(new Error(`Google Live setup timed out: ${JSON.stringify(debug)}`)),
            15_000,
          );
          ws.addEventListener("open", () => {
            debug.opened = true;
            ws.send(
              JSON.stringify({
                setup: {
                  model: model.startsWith("models/") ? model : `models/${model}`,
                  generationConfig: { responseModalities: ["AUDIO"] },
                  inputAudioTranscription: {},
                  outputAudioTranscription: {},
                },
              }),
            );
          });
          ws.addEventListener("message", (event) => {
            void (async () => {
              const text = await dataToText(event.data);
              debug.messages.push(text.slice(0, 300));
              const message = JSON.parse(text) as { setupComplete?: unknown };
              if (!message.setupComplete) {
                return;
              }
              window.clearTimeout(timeout);
              resolve({ setupComplete: true, readyState: ws.readyState });
            })().catch((error) => {
              window.clearTimeout(timeout);
              reject(error);
            });
          });
          ws.addEventListener("error", () => {
            debug.error = true;
            window.clearTimeout(timeout);
            reject(new Error("Google Live browser WebSocket errored"));
          });
          ws.addEventListener("close", (event) => {
            debug.close = { code: event.code, reason: event.reason };
            if (event.code !== 1000) {
              window.clearTimeout(timeout);
              reject(new Error(`Google Live browser WebSocket closed: ${JSON.stringify(debug)}`));
            }
          });
        });
        const value = await done;
        ws.close(1000);
        return value;
      },
      {
        model: GOOGLE_REALTIME_MODEL,
        tokenName: token,
        websocketUrl: GOOGLE_LIVE_WS_URL,
      },
    );
    return {
      name: "google-live-browser-ws",
      ok: result.setupComplete === true,
      details: { model: GOOGLE_REALTIME_MODEL, setupComplete: result.setupComplete === true },
    };
  } catch (error) {
    return { name: "google-live-browser-ws", ok: false, details: { error: shortError(error) } };
  } finally {
    // Close the page on every path so a failed check does not leave it (and any
    // still-open WebSocket) alive. Cleanup is best-effort: a close error must not
    // replace the smoke result.
    await page?.close().catch(() => undefined);
  }
}
|
||||
|
||||
/**
 * Smoke-checks the gateway-relay Talk transport adapter inside a browser page.
 *
 * Writes a throwaway Vite project to a temp directory whose entry module imports
 * `GatewayRelayRealtimeTalkTransport` from the repository (resolved from
 * `process.cwd()`), drives it with a stub gateway client and synthetic relay
 * events, and publishes what it observed on `window.__relaySmokeResult`.
 *
 * @param browser - Playwright browser used to load the generated page.
 * @returns A result that is `ok` when the adapter issued the relayAudio, relayMark,
 *   relayToolResult and relayStop requests, reported the "listening" and
 *   "thinking" statuses, and surfaced both synthetic transcripts. Failures are
 *   returned as `ok: false` rather than thrown.
 */
async function smokeGatewayRelayBrowser(browser: Browser): Promise<SmokeResult> {
  let server: ViteDevServer | undefined;
  let context: Awaited<ReturnType<Browser["newContext"]>> | undefined;
  const dir = await mkdtemp(path.join(tmpdir(), "openclaw-realtime-talk-"));
  try {
    // Forward slashes so the path is usable inside the "/@fs/" import URL.
    const repoRoot = process.cwd().replaceAll("\\", "/");
    await writeFile(
      path.join(dir, "index.html"),
      '<!doctype html><meta charset="utf-8"><script type="module" src="/main.ts"></script>',
    );
    await writeFile(
      path.join(dir, "main.ts"),
      `
import { GatewayRelayRealtimeTalkTransport } from "/@fs/${repoRoot}/ui/src/ui/chat/realtime-talk-gateway-relay.ts";

const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
const listeners = new Set();
const requests = [];
const statuses = [];
const transcripts = [];

function emit(event) {
  for (const listener of [...listeners]) {
    listener(event);
  }
}

function base64ZeroPcm(bytes) {
  let text = "";
  for (let index = 0; index < bytes; index += 1) {
    text += String.fromCharCode(0);
  }
  return btoa(text);
}

const client = {
  addEventListener(listener) {
    listeners.add(listener);
    return () => listeners.delete(listener);
  },
  async request(method, params) {
    requests.push({ method, params });
    if (method === "chat.send") {
      const runId = params.idempotencyKey || "run-smoke";
      window.setTimeout(() => {
        emit({ event: "chat", payload: { runId, state: "final", message: { text: "relay consult ok" } } });
      }, 50);
      return { runId };
    }
    return { ok: true };
  },
};

try {
  const transport = new GatewayRelayRealtimeTalkTransport(
    {
      provider: "smoke",
      transport: "gateway-relay",
      relaySessionId: "relay-live-smoke",
      audio: {
        inputEncoding: "pcm16",
        inputSampleRateHz: 24000,
        outputEncoding: "pcm16",
        outputSampleRateHz: 24000,
      },
    },
    {
      client,
      sessionKey: "main",
      callbacks: {
        onStatus: (status, detail) => statuses.push({ status, detail }),
        onTranscript: (entry) => transcripts.push(entry),
      },
    },
  );
  await transport.start();
  emit({ event: "talk.realtime.relay", payload: { relaySessionId: "relay-live-smoke", type: "ready" } });
  emit({
    event: "talk.realtime.relay",
    payload: { relaySessionId: "relay-live-smoke", type: "transcript", role: "user", text: "relay user", final: true },
  });
  emit({
    event: "talk.realtime.relay",
    payload: { relaySessionId: "relay-live-smoke", type: "transcript", role: "assistant", text: "relay assistant", final: false },
  });
  emit({
    event: "talk.realtime.relay",
    payload: { relaySessionId: "relay-live-smoke", type: "audio", audioBase64: base64ZeroPcm(480) },
  });
  const processor = transport.inputProcessor;
  processor?.onaudioprocess?.({
    inputBuffer: { getChannelData: () => new Float32Array(160).fill(0.01) },
  });
  emit({ event: "talk.realtime.relay", payload: { relaySessionId: "relay-live-smoke", type: "mark" } });
  emit({
    event: "talk.realtime.relay",
    payload: {
      relaySessionId: "relay-live-smoke",
      type: "toolCall",
      callId: "call-smoke",
      name: "openclaw_agent_consult",
      args: { question: "confirm relay consult path" },
    },
  });
  await delay(400);
  transport.stop();
  await delay(100);
  window.__relaySmokeResult = { requests, statuses, transcripts };
  window.__relaySmokeDone = true;
} catch (error) {
  window.__relaySmokeResult = { error: error instanceof Error ? error.message : String(error), requests, statuses, transcripts };
  window.__relaySmokeDone = true;
}
`,
    );
    server = await createServer({
      root: dir,
      logLevel: "silent",
      // Port 0 lets the OS pick a free port; the real one is read back below.
      server: { host: "127.0.0.1", port: 0 },
    });
    await server.listen();
    const address = server.httpServer?.address();
    if (!address || typeof address === "string") {
      throw new Error("Vite did not expose a local port");
    }
    const url = `http://127.0.0.1:${address.port}/`;
    context = await browser.newContext({ permissions: ["microphone"] });
    await context.grantPermissions(["microphone"], { origin: url });
    const page = await context.newPage();
    await page.goto(url);
    // The generated entry module sets this flag on both its success and error paths.
    await page.waitForFunction(() => globalThis.__relaySmokeDone === true, undefined, {
      timeout: 15_000,
    });
    const result = (await page.evaluate(() => globalThis.__relaySmokeResult)) as {
      error?: string;
      requests?: Array<{ method?: string }>;
      statuses?: Array<{ status?: string }>;
      transcripts?: Array<{ role?: string; text?: string }>;
    };
    if (result.error) {
      throw new Error(result.error);
    }
    const methods = new Set((result.requests ?? []).map((request) => request.method));
    const statusNames = new Set((result.statuses ?? []).map((entry) => entry.status));
    const transcriptTexts = new Set((result.transcripts ?? []).map((entry) => entry.text));
    const expectedMethods = [
      "talk.realtime.relayAudio",
      "talk.realtime.relayMark",
      "talk.realtime.relayToolResult",
      "talk.realtime.relayStop",
    ];
    const ok =
      expectedMethods.every((method) => methods.has(method)) &&
      statusNames.has("listening") &&
      statusNames.has("thinking") &&
      transcriptTexts.has("relay user") &&
      transcriptTexts.has("relay assistant");
    return {
      name: "gateway-relay-browser-adapter",
      ok,
      details: {
        methods: [...methods].toSorted(compareStrings),
        statuses: [...statusNames].toSorted(compareStrings),
        transcripts: [...transcriptTexts].toSorted(compareStrings),
      },
    };
  } catch (error) {
    return {
      name: "gateway-relay-browser-adapter",
      ok: false,
      details: { error: shortError(error) },
    };
  } finally {
    // Close the context on every path (for example after a waitForFunction
    // timeout). Best-effort: a close error must not replace the smoke result.
    await context?.close().catch(() => undefined);
    try {
      await server?.close();
    } finally {
      // Remove the temp project even if stopping the dev server throws.
      await rm(dir, { recursive: true, force: true });
    }
  }
}
|
||||
|
||||
/**
 * Runs the three smoke checks in order (OpenAI WebRTC, Google Live WebSocket,
 * gateway relay adapter) against one headless Chromium instance, prints each
 * result, and sets `process.exitCode` to 1 when any check failed.
 *
 * A provider whose API key is missing from the environment is recorded as a
 * failed check instead of being run.
 */
async function main(): Promise<void> {
  const openAIKey = getEnv("OPENAI_API_KEY");
  const googleKey = getEnv("GEMINI_API_KEY") ?? getEnv("GOOGLE_API_KEY");

  const launchArgs = [
    "--autoplay-policy=no-user-gesture-required",
    "--no-sandbox",
    "--use-fake-device-for-media-stream",
    "--use-fake-ui-for-media-stream",
  ];
  const browser = await chromium.launch({ headless: true, args: launchArgs });

  const results: SmokeResult[] = [];
  try {
    results.push(
      openAIKey
        ? await smokeOpenAIWebRtc(browser, openAIKey)
        : {
            name: "openai-webrtc-browser",
            ok: false,
            details: { error: "OPENAI_API_KEY missing" },
          },
    );
    results.push(
      googleKey
        ? await smokeGoogleLiveBrowserWs(browser, googleKey)
        : {
            name: "google-live-browser-ws",
            ok: false,
            details: { error: "GEMINI_API_KEY or GOOGLE_API_KEY missing" },
          },
    );
    results.push(await smokeGatewayRelayBrowser(browser));
  } finally {
    await browser.close();
  }

  results.forEach((entry) => printResult(entry));

  const allPassed = results.every((entry) => entry.ok);
  if (!allPassed) {
    process.exitCode = 1;
  }
}
|
||||
|
||||
// Script entry point: run all smoke checks; main() sets process.exitCode to 1 when any check fails.
await main();
|
||||
Reference in New Issue
Block a user