Files
openclaw/scripts/dev/realtime-talk-live-smoke.ts
2026-04-27 14:22:32 +01:00

516 lines
17 KiB
TypeScript

import { mkdtemp, rm, writeFile } from "node:fs/promises";
import { tmpdir } from "node:os";
import path from "node:path";
import { GoogleGenAI, Modality } from "@google/genai";
import { chromium, type Browser } from "playwright";
import { createServer, type ViteDevServer } from "vite";
/** OpenAI Realtime model id; override with OPENCLAW_REALTIME_OPENAI_MODEL. */
const OPENAI_REALTIME_MODEL =
  (process.env.OPENCLAW_REALTIME_OPENAI_MODEL ?? "").trim() || "gpt-realtime-1.5";

/** OpenAI Realtime output voice; override with OPENCLAW_REALTIME_OPENAI_VOICE. */
const OPENAI_REALTIME_VOICE = (process.env.OPENCLAW_REALTIME_OPENAI_VOICE ?? "").trim() || "alloy";

/** Google Live model id; override with OPENCLAW_REALTIME_GOOGLE_MODEL. */
const GOOGLE_REALTIME_MODEL =
  (process.env.OPENCLAW_REALTIME_GOOGLE_MODEL ?? "").trim() ||
  "gemini-2.5-flash-native-audio-preview-12-2025";

/** Google Live prebuilt voice name; override with OPENCLAW_REALTIME_GOOGLE_VOICE. */
const GOOGLE_REALTIME_VOICE = (process.env.OPENCLAW_REALTIME_GOOGLE_VOICE ?? "").trim() || "Kore";

/** WebSocket endpoint (v1alpha, BidiGenerateContentConstrained) opened by the Google Live browser check. */
const GOOGLE_LIVE_WS_URL =
  "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContentConstrained";
/** Outcome of a single smoke check, as collected by main() and printed by printResult(). */
type SmokeResult = {
/** Identifier of the check, e.g. "openai-webrtc-browser". */
name: string;
/** Whether the check passed; any `false` makes the script set a non-zero exit code. */
ok: boolean;
/** Optional diagnostics: observed values on success, or an `error` message on failure. */
details?: Record<string, unknown>;
};
/**
 * Reads an environment variable, treating blank values as unset.
 *
 * @param name - Environment variable name.
 * @returns The trimmed value, or `undefined` when the variable is missing or whitespace-only.
 */
function getEnv(name: string): string | undefined {
  const trimmed = (process.env[name] ?? "").trim();
  if (trimmed.length === 0) {
    return undefined;
  }
  return trimmed;
}
/**
 * Reduces a caught value to a one-line message for a smoke result.
 *
 * @param error - Anything caught from a `try` block.
 * @returns `error.message` for `Error` instances, otherwise `String(error)`.
 */
function shortError(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}
/**
 * Reads a response body as text, capped so it can be embedded in an error message.
 *
 * @param response - Response whose body has not been consumed yet.
 * @returns The full body when it is at most 600 characters, otherwise the first
 *   600 characters followed by "...".
 */
async function readBoundedText(response: Response): Promise<string> {
  const maxChars = 600;
  const body = await response.text();
  if (body.length <= maxChars) {
    return body;
  }
  return `${body.slice(0, maxChars)}...`;
}
/**
 * Logs one smoke result as "<name>: ok|failed" followed by its details object
 * (an empty object when no details were recorded).
 *
 * @param result - The smoke result to print.
 */
function printResult(result: SmokeResult): void {
  const status = result.ok ? "ok" : "failed";
  const details = result.details ?? {};
  console.log(`${result.name}: ${status}`, details);
}
/**
 * Locale-aware comparator for optional strings; `undefined` sorts as the empty string.
 *
 * @param left - First value.
 * @param right - Second value.
 * @returns The `localeCompare` result of the two normalized values.
 */
function compareStrings(left: string | undefined, right: string | undefined): number {
  const lhs = left ?? "";
  const rhs = right ?? "";
  return lhs.localeCompare(rhs);
}
/**
 * Mints an OpenAI Realtime client secret for the configured model and voice.
 *
 * Accepts two response shapes: a top-level string `value`, or a string `value`
 * nested under a `client_secret` object. A top-level string `value` takes
 * precedence, even when it is empty.
 *
 * @param apiKey - OpenAI API key sent as the bearer token.
 * @returns The client secret string.
 * @throws Error when the HTTP status is not OK (message includes a bounded
 *   excerpt of the response body) or when no non-empty secret is found.
 */
async function createOpenAIClientSecret(apiKey: string): Promise<string> {
  const sessionRequest = {
    session: {
      type: "realtime",
      model: OPENAI_REALTIME_MODEL,
      audio: {
        output: { voice: OPENAI_REALTIME_VOICE },
      },
    },
  };
  const response = await fetch("https://api.openai.com/v1/realtime/client_secrets", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(sessionRequest),
  });
  if (!response.ok) {
    const excerpt = await readBoundedText(response);
    throw new Error(`OpenAI Realtime client secret failed (${response.status}): ${excerpt}`);
  }

  const payload = (await response.json()) as Record<string, unknown>;
  let secret: string | undefined;
  if (typeof payload.value === "string") {
    secret = payload.value;
  } else {
    const container = payload.client_secret;
    if (container && typeof container === "object") {
      const candidate = (container as Record<string, unknown>).value;
      if (typeof candidate === "string") {
        secret = candidate;
      }
    }
  }
  if (secret) {
    return secret;
  }
  throw new Error("OpenAI Realtime client secret response did not include a value");
}
/**
 * Smoke-checks the OpenAI Realtime WebRTC handshake from a headless browser page.
 *
 * A client secret is minted on the Node side. Inside a fresh browser context the
 * page then builds an `RTCPeerConnection`, posts its SDP offer to
 * `/v1/realtime/calls` and applies the returned answer.
 *
 * The check passes when the answer contains an `m=audio` section and was applied
 * as the remote description. The observed connection state is reported in
 * `details` but does not influence `ok`.
 *
 * The browser context is closed in a `finally` block so a failing page
 * evaluation does not leave the context open for the remaining checks.
 *
 * @param browser - Playwright browser used to create an isolated context.
 * @param apiKey - OpenAI API key used to mint the client secret.
 * @returns The smoke result; failures are reported as `ok: false` with the error
 *   message in `details.error` rather than thrown.
 */
async function smokeOpenAIWebRtc(browser: Browser, apiKey: string): Promise<SmokeResult> {
  try {
    const clientSecret = await createOpenAIClientSecret(apiKey);
    const context = await browser.newContext({
      permissions: ["microphone"],
    });
    try {
      const page = await context.newPage();
      // Everything inside this callback executes in the browser page, not in Node.
      const result = await page.evaluate(
        async ({ clientSecret: secret }) => {
          let media: MediaStream;
          if (navigator.mediaDevices?.getUserMedia) {
            media = await navigator.mediaDevices.getUserMedia({ audio: true });
          } else {
            // No getUserMedia available: synthesize an audio track from an oscillator.
            const audioContext = new AudioContext();
            const destination = audioContext.createMediaStreamDestination();
            const oscillator = audioContext.createOscillator();
            oscillator.connect(destination);
            oscillator.start();
            media = destination.stream;
          }
          const peer = new RTCPeerConnection();
          for (const track of media.getAudioTracks()) {
            peer.addTrack(track, media);
          }
          const channel = peer.createDataChannel("oai-events");
          // Settles on "connected"/"failed", on data-channel open, or after 12 s
          // with whatever state the peer is in at that point.
          const connectionState = new Promise<string>((resolve) => {
            const timeout = window.setTimeout(() => resolve(peer.connectionState), 12_000);
            peer.addEventListener("connectionstatechange", () => {
              if (peer.connectionState === "connected" || peer.connectionState === "failed") {
                window.clearTimeout(timeout);
                resolve(peer.connectionState);
              }
            });
            channel.addEventListener("open", () => {
              window.clearTimeout(timeout);
              resolve(peer.connectionState || "data-channel-open");
            });
          });
          const offer = await peer.createOffer();
          await peer.setLocalDescription(offer);
          const response = await fetch("https://api.openai.com/v1/realtime/calls", {
            method: "POST",
            body: offer.sdp,
            headers: {
              Authorization: `Bearer ${secret}`,
              "Content-Type": "application/sdp",
            },
          });
          if (!response.ok) {
            throw new Error(`OpenAI Realtime SDP offer failed (${response.status})`);
          }
          const answer = await response.text();
          await peer.setRemoteDescription({ type: "answer", sdp: answer });
          const state = await connectionState;
          peer.close();
          media.getTracks().forEach((track) => track.stop());
          return {
            answerHasAudio: answer.includes("m=audio"),
            remoteDescriptionApplied: peer.remoteDescription?.type === "answer",
            connectionState: state,
          };
        },
        { clientSecret },
      );
      return {
        name: "openai-webrtc-browser",
        ok: result.answerHasAudio && result.remoteDescriptionApplied,
        details: {
          model: OPENAI_REALTIME_MODEL,
          answerHasAudio: result.answerHasAudio,
          remoteDescriptionApplied: result.remoteDescriptionApplied,
          connectionState: result.connectionState,
        },
      };
    } finally {
      // Runs on success and failure alike, so the context is never left open.
      await context.close();
    }
  } catch (error) {
    return { name: "openai-webrtc-browser", ok: false, details: { error: shortError(error) } };
  }
}
/**
 * Creates a single-use Google Live auth token constrained to the configured
 * model, voice and audio response modality.
 *
 * The request sets `expireTime` to 30 minutes and `newSessionExpireTime` to
 * 60 seconds after the moment of the call.
 *
 * @param apiKey - Google API key used to construct the v1alpha client.
 * @returns The trimmed token name from the response.
 * @throws Error when the response has no non-empty token name.
 */
async function createGoogleLiveToken(apiKey: string): Promise<string> {
  const client = new GoogleGenAI({
    apiKey,
    httpOptions: { apiVersion: "v1alpha" },
  });

  const issuedAtMs = Date.now();
  const tokenExpiry = new Date(issuedAtMs + 30 * 60 * 1000).toISOString();
  const newSessionDeadline = new Date(issuedAtMs + 60 * 1000).toISOString();

  const created = await client.authTokens.create({
    config: {
      uses: 1,
      expireTime: tokenExpiry,
      newSessionExpireTime: newSessionDeadline,
      liveConnectConstraints: {
        model: GOOGLE_REALTIME_MODEL,
        config: {
          responseModalities: [Modality.AUDIO],
          speechConfig: {
            voiceConfig: {
              prebuiltVoiceConfig: { voiceName: GOOGLE_REALTIME_VOICE },
            },
          },
          systemInstruction: "OpenClaw browser Talk live smoke.",
          inputAudioTranscription: {},
          outputAudioTranscription: {},
        },
      },
    },
  });

  const tokenName = created.name?.trim();
  if (tokenName) {
    return tokenName;
  }
  throw new Error("Google Live auth token response did not include a token name");
}
/**
 * Smoke-checks that a browser page can open the Google Live WebSocket with an
 * auth token and complete the setup handshake.
 *
 * The token is minted on the Node side. The page then connects with the token in
 * the `access_token` query parameter, sends a `setup` message and waits up to
 * 15 s for a message carrying `setupComplete`.
 *
 * The page is closed in a `finally` block so a timeout, socket error or abnormal
 * close does not leave the page open for the remaining checks.
 *
 * @param browser - Playwright browser used to open the page.
 * @param apiKey - Google API key used to mint the auth token.
 * @returns The smoke result; failures are reported as `ok: false` with the error
 *   message in `details.error` rather than thrown.
 */
async function smokeGoogleLiveBrowserWs(browser: Browser, apiKey: string): Promise<SmokeResult> {
  try {
    const token = await createGoogleLiveToken(apiKey);
    const page = await browser.newPage();
    try {
      // Installs a pass-through `__name` global in the page.
      // NOTE(review): presumably needed because the TS runner wraps named function
      // expressions (such as `dataToText` below) in `__name(...)` calls, which would
      // otherwise be undefined in the page — confirm against the runner in use.
      await page.evaluate("globalThis.__name = (fn) => fn");
      // Everything inside this callback executes in the browser page, not in Node.
      const result = await page.evaluate(
        async ({ model, tokenName, websocketUrl }) => {
          // Collected for inclusion in timeout/close error messages.
          const debug: {
            opened: boolean;
            messages: string[];
            close?: { code: number; reason: string };
            error: boolean;
          } = { opened: false, messages: [], error: false };
          const dataToText = async (data: unknown): Promise<string> => {
            if (typeof data === "string") {
              return data;
            }
            if (data instanceof Blob) {
              return await data.text();
            }
            if (data instanceof ArrayBuffer) {
              return new TextDecoder().decode(data);
            }
            return String(data);
          };
          const url = new URL(websocketUrl);
          url.searchParams.set("access_token", tokenName);
          const ws = new WebSocket(url.toString());
          const done = new Promise<Record<string, unknown>>((resolve, reject) => {
            const timeout = window.setTimeout(
              () => reject(new Error(`Google Live setup timed out: ${JSON.stringify(debug)}`)),
              15_000,
            );
            ws.addEventListener("open", () => {
              debug.opened = true;
              ws.send(
                JSON.stringify({
                  setup: {
                    model: model.startsWith("models/") ? model : `models/${model}`,
                    generationConfig: { responseModalities: ["AUDIO"] },
                    inputAudioTranscription: {},
                    outputAudioTranscription: {},
                  },
                }),
              );
            });
            ws.addEventListener("message", (event) => {
              void (async () => {
                const text = await dataToText(event.data);
                debug.messages.push(text.slice(0, 300));
                const message = JSON.parse(text) as { setupComplete?: unknown };
                // Messages without setupComplete are recorded but otherwise ignored.
                if (!message.setupComplete) {
                  return;
                }
                window.clearTimeout(timeout);
                resolve({ setupComplete: true, readyState: ws.readyState });
              })().catch((error) => {
                window.clearTimeout(timeout);
                reject(error);
              });
            });
            ws.addEventListener("error", () => {
              debug.error = true;
              window.clearTimeout(timeout);
              reject(new Error("Google Live browser WebSocket errored"));
            });
            ws.addEventListener("close", (event) => {
              debug.close = { code: event.code, reason: event.reason };
              // A normal (1000) close is not treated as a failure here; the
              // timeout still applies if setup never completed.
              if (event.code !== 1000) {
                window.clearTimeout(timeout);
                reject(new Error(`Google Live browser WebSocket closed: ${JSON.stringify(debug)}`));
              }
            });
          });
          const value = await done;
          ws.close(1000);
          return value;
        },
        {
          model: GOOGLE_REALTIME_MODEL,
          tokenName: token,
          websocketUrl: GOOGLE_LIVE_WS_URL,
        },
      );
      return {
        name: "google-live-browser-ws",
        ok: result.setupComplete === true,
        details: { model: GOOGLE_REALTIME_MODEL, setupComplete: result.setupComplete === true },
      };
    } finally {
      // Runs on success and failure alike, so the page is never left open.
      await page.close();
    }
  } catch (error) {
    return { name: "google-live-browser-ws", ok: false, details: { error: shortError(error) } };
  }
}
/**
 * Smoke-checks the gateway-relay realtime Talk transport inside a real browser page.
 *
 * Steps:
 * 1. Writes a throwaway `index.html` + `main.ts` into a temp directory. The page
 *    module imports `GatewayRelayRealtimeTalkTransport` from the repository via
 *    Vite's `/@fs/` prefix; the repository root is taken from `process.cwd()`,
 *    so the script is expected to be started from the repository root.
 * 2. Serves that directory with a Vite dev server on an ephemeral local port.
 * 3. The page module drives the transport with a fake gateway client that records
 *    every request and answers `chat.send` with a final chat event, emits relay
 *    events (ready, transcripts, audio, mark, tool call), feeds one input audio
 *    frame, stops the transport and publishes what it recorded on `window`.
 * 4. The Node side waits up to 15 s for the page to finish and checks that the
 *    four relay methods were requested, the "listening" and "thinking" statuses
 *    were reported and both transcripts arrived.
 *
 * Cleanup: the browser context is closed even when navigation or the wait fails,
 * and the temp directory is removed even when closing the Vite server throws.
 *
 * @param browser - Playwright browser used to create the page context.
 * @returns The smoke result; failures in the checked steps are reported as
 *   `ok: false` with the error message in `details.error`.
 */
async function smokeGatewayRelayBrowser(browser: Browser): Promise<SmokeResult> {
  /** Shape of the object the page module stores in `window.__relaySmokeResult`. */
  type RelaySmokeSnapshot = {
    error?: string;
    requests?: Array<{ method?: string }>;
    statuses?: Array<{ status?: string }>;
    transcripts?: Array<{ role?: string; text?: string }>;
  };

  let server: ViteDevServer | undefined;
  const dir = await mkdtemp(path.join(tmpdir(), "openclaw-realtime-talk-"));
  try {
    // Forward slashes so the path can be embedded in the `/@fs/` import URL.
    const repoRoot = process.cwd().replaceAll("\\", "/");
    await writeFile(
      path.join(dir, "index.html"),
      '<!doctype html><meta charset="utf-8"><script type="module" src="/main.ts"></script>',
    );
    await writeFile(
      path.join(dir, "main.ts"),
      `
import { GatewayRelayRealtimeTalkTransport } from "/@fs/${repoRoot}/ui/src/ui/chat/realtime-talk-gateway-relay.ts";
const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
const listeners = new Set();
const requests = [];
const statuses = [];
const transcripts = [];
function emit(event) {
for (const listener of [...listeners]) {
listener(event);
}
}
function base64ZeroPcm(bytes) {
let text = "";
for (let index = 0; index < bytes; index += 1) {
text += String.fromCharCode(0);
}
return btoa(text);
}
const client = {
addEventListener(listener) {
listeners.add(listener);
return () => listeners.delete(listener);
},
async request(method, params) {
requests.push({ method, params });
if (method === "chat.send") {
const runId = params.idempotencyKey || "run-smoke";
window.setTimeout(() => {
emit({ event: "chat", payload: { runId, state: "final", message: { text: "relay consult ok" } } });
}, 50);
return { runId };
}
return { ok: true };
},
};
try {
const transport = new GatewayRelayRealtimeTalkTransport(
{
provider: "smoke",
transport: "gateway-relay",
relaySessionId: "relay-live-smoke",
audio: {
inputEncoding: "pcm16",
inputSampleRateHz: 24000,
outputEncoding: "pcm16",
outputSampleRateHz: 24000,
},
},
{
client,
sessionKey: "main",
callbacks: {
onStatus: (status, detail) => statuses.push({ status, detail }),
onTranscript: (entry) => transcripts.push(entry),
},
},
);
await transport.start();
emit({ event: "talk.realtime.relay", payload: { relaySessionId: "relay-live-smoke", type: "ready" } });
emit({
event: "talk.realtime.relay",
payload: { relaySessionId: "relay-live-smoke", type: "transcript", role: "user", text: "relay user", final: true },
});
emit({
event: "talk.realtime.relay",
payload: { relaySessionId: "relay-live-smoke", type: "transcript", role: "assistant", text: "relay assistant", final: false },
});
emit({
event: "talk.realtime.relay",
payload: { relaySessionId: "relay-live-smoke", type: "audio", audioBase64: base64ZeroPcm(480) },
});
const processor = transport.inputProcessor;
processor?.onaudioprocess?.({
inputBuffer: { getChannelData: () => new Float32Array(160).fill(0.01) },
});
emit({ event: "talk.realtime.relay", payload: { relaySessionId: "relay-live-smoke", type: "mark" } });
emit({
event: "talk.realtime.relay",
payload: {
relaySessionId: "relay-live-smoke",
type: "toolCall",
callId: "call-smoke",
name: "openclaw_agent_consult",
args: { question: "confirm relay consult path" },
},
});
await delay(400);
transport.stop();
await delay(100);
window.__relaySmokeResult = { requests, statuses, transcripts };
window.__relaySmokeDone = true;
} catch (error) {
window.__relaySmokeResult = { error: error instanceof Error ? error.message : String(error), requests, statuses, transcripts };
window.__relaySmokeDone = true;
}
`,
    );
    server = await createServer({
      root: dir,
      logLevel: "silent",
      // Port 0 lets the OS choose a free port; the real port is read back below.
      server: { host: "127.0.0.1", port: 0 },
    });
    await server.listen();
    const address = server.httpServer?.address();
    if (!address || typeof address === "string") {
      throw new Error("Vite did not expose a local port");
    }
    const url = `http://127.0.0.1:${address.port}/`;

    const context = await browser.newContext({ permissions: ["microphone"] });
    let result: RelaySmokeSnapshot;
    try {
      await context.grantPermissions(["microphone"], { origin: url });
      const page = await context.newPage();
      await page.goto(url);
      // The page module sets this flag on both its success and its error path.
      await page.waitForFunction(() => globalThis.__relaySmokeDone === true, undefined, {
        timeout: 15_000,
      });
      result = (await page.evaluate(() => globalThis.__relaySmokeResult)) as RelaySmokeSnapshot;
    } finally {
      // Close the context even when navigation or the wait above fails.
      await context.close();
    }
    if (result.error) {
      throw new Error(result.error);
    }

    const methods = new Set((result.requests ?? []).map((request) => request.method));
    const statusNames = new Set((result.statuses ?? []).map((entry) => entry.status));
    const transcriptTexts = new Set((result.transcripts ?? []).map((entry) => entry.text));
    const expectedMethods = [
      "talk.realtime.relayAudio",
      "talk.realtime.relayMark",
      "talk.realtime.relayToolResult",
      "talk.realtime.relayStop",
    ];
    const ok =
      expectedMethods.every((method) => methods.has(method)) &&
      statusNames.has("listening") &&
      statusNames.has("thinking") &&
      transcriptTexts.has("relay user") &&
      transcriptTexts.has("relay assistant");
    return {
      name: "gateway-relay-browser-adapter",
      ok,
      details: {
        methods: [...methods].toSorted(compareStrings),
        statuses: [...statusNames].toSorted(compareStrings),
        transcripts: [...transcriptTexts].toSorted(compareStrings),
      },
    };
  } catch (error) {
    return {
      name: "gateway-relay-browser-adapter",
      ok: false,
      details: { error: shortError(error) },
    };
  } finally {
    try {
      await server?.close();
    } finally {
      // Always remove the temp directory, even if closing the server throws.
      await rm(dir, { recursive: true, force: true });
    }
  }
}
/**
 * Runs every realtime Talk smoke check against one shared headless Chromium
 * instance, prints each result and sets `process.exitCode` to 1 when any failed.
 *
 * The OpenAI check needs `OPENAI_API_KEY`; the Google check needs `GEMINI_API_KEY`
 * or, failing that, `GOOGLE_API_KEY`. A missing key is recorded as a failed result
 * for that check rather than skipping it. The gateway relay check always runs.
 */
async function main(): Promise<void> {
  const openAIKey = getEnv("OPENAI_API_KEY");
  const googleKey = getEnv("GEMINI_API_KEY") ?? getEnv("GOOGLE_API_KEY");

  // Going by the flag names: autoplay without a user gesture, no sandbox, and
  // fake media devices with an auto-accepted media permission prompt.
  const launchArgs = [
    "--autoplay-policy=no-user-gesture-required",
    "--no-sandbox",
    "--use-fake-device-for-media-stream",
    "--use-fake-ui-for-media-stream",
  ];
  const browser = await chromium.launch({ headless: true, args: launchArgs });

  const results: SmokeResult[] = [];
  try {
    results.push(
      openAIKey
        ? await smokeOpenAIWebRtc(browser, openAIKey)
        : {
            name: "openai-webrtc-browser",
            ok: false,
            details: { error: "OPENAI_API_KEY missing" },
          },
    );
    results.push(
      googleKey
        ? await smokeGoogleLiveBrowserWs(browser, googleKey)
        : {
            name: "google-live-browser-ws",
            ok: false,
            details: { error: "GEMINI_API_KEY or GOOGLE_API_KEY missing" },
          },
    );
    results.push(await smokeGatewayRelayBrowser(browser));
  } finally {
    await browser.close();
  }

  results.forEach((entry) => printResult(entry));
  const allPassed = results.every((entry) => entry.ok);
  if (!allPassed) {
    process.exitCode = 1;
  }
}
// Script entry point: run all smoke checks via top-level await when this module is executed.
await main();