fix: split google meet realtime providers

This commit is contained in:
Peter Steinberger
2026-05-04 04:07:35 +01:00
parent 51fea3826a
commit 11c600cf19
11 changed files with 338 additions and 21 deletions

View File

@@ -47,6 +47,7 @@ Docs: https://docs.openclaw.ai
- Diagnostics: keep webhook/message OTEL attributes and Prometheus delivery labels low-cardinality and omit raw chat/message IDs from spans, so progress-draft and message-tool modes do not leak high-cardinality messaging identifiers.
- Google Meet: stop advertising legacy `mode: "realtime"` to agents and config UIs, while keeping it as a hidden compatibility alias for `mode: "agent"`, so new joins use the STT -> OpenClaw agent -> TTS path instead of selecting the direct realtime voice fallback.
- Google Meet: add `chrome.audioBufferBytes` for generated command-pair SoX audio commands and lower the default buffer from SoX's 8192 bytes to 4096 bytes to reduce Chrome talk-back latency.
- Google Meet: split realtime provider config into agent-mode transcription and bidi-mode voice providers, and migrate legacy Gemini Live bidi configs with `doctor --fix`, so Gemini Live can back the direct bidi fallback without breaking the default OpenClaw agent talk-back path.
- Telegram: render shared interactive reply buttons in reply delivery so plugin approval messages show inline keyboards. (#76238) Thanks @keshavbotagent.
- Agents/cli-runner: drop a saved `claude-cli` resume sessionId at preparation time when its on-disk transcript no longer exists in `~/.claude/projects/`, so a stale binding from a half-installed `update.run` cannot trap follow-up runs (auto-reply / Telegram direct) in a `claude --resume` timeout loop; the run starts fresh and the new sessionId is written back through the existing post-run flow. (#77030; refs #77011) Thanks @openperf.
- Release validation: install the cross-OS TypeScript harness through Windows-safe Node/npm shims so native Windows package checks reach the OpenClaw smoke suites instead of exiting before artifact capture. Thanks @vincentkoc.

View File

@@ -31,13 +31,13 @@ Google Meet participant support for OpenClaw — the plugin is explicit by desig
Install the local audio dependencies and configure a realtime transcription
provider plus regular OpenClaw TTS. OpenAI is the default transcription
provider; Google Gemini Live also works with `realtime.provider: "google"` for
`bidi` mode:
provider; Google Gemini Live also works as a separate `bidi` voice fallback with
`realtime.voiceProvider: "google"`:
```bash
brew install blackhole-2ch sox
export OPENAI_API_KEY=sk-...
# or
# only needed when realtime.voiceProvider is "google" for bidi mode
export GEMINI_API_KEY=...
```
@@ -973,8 +973,9 @@ Workspace Developer Preview Program for Meet media APIs.
The common Chrome agent path only needs the plugin enabled, BlackHole, SoX, a
realtime transcription provider key, and a configured OpenClaw TTS provider.
OpenAI is the default transcription provider; set `realtime.provider: "google"`
to use Google Gemini Live for `bidi` mode:
OpenAI is the default transcription provider; set `realtime.voiceProvider` to
`"google"` and a Gemini model in `realtime.model` to use Google Gemini Live for
`bidi` mode without changing the default agent-mode transcription provider:
```bash
brew install blackhole-2ch sox
@@ -1042,8 +1043,13 @@ Defaults:
realtime voice provider answers participant speech directly and may call
`openclaw_agent_consult` for deeper/tool-backed answers.
- `mode: "transcribe"`: observe-only mode without the talk-back bridge.
- `realtime.provider: "openai"`: provider id used by `agent` mode for realtime
transcription and by `bidi` mode for realtime voice.
- `realtime.provider: "openai"`: compatibility fallback used when the scoped
provider fields below are unset.
- `realtime.transcriptionProvider: "openai"`: provider id used by `agent` mode
for realtime transcription.
- `realtime.voiceProvider`: provider id used by `bidi` mode for direct realtime
voice. Set this to `"google"` to use Gemini Live while keeping agent-mode
transcription on OpenAI.
- `realtime.toolPolicy: "safe-read-only"`
- `realtime.instructions`: brief spoken replies, with
`openclaw_agent_consult` for deeper answers
@@ -1089,13 +1095,15 @@ Optional overrides:
},
defaultMode: "agent",
realtime: {
provider: "google",
provider: "openai",
transcriptionProvider: "openai",
voiceProvider: "google",
model: "gemini-2.5-flash-native-audio-preview-12-2025",
agentId: "jay",
toolPolicy: "owner",
introMessage: "Say exactly: I'm here.",
providers: {
google: {
model: "gemini-2.5-flash-native-audio-preview-12-2025",
voice: "Kore",
},
},

View File

@@ -0,0 +1 @@
export { legacyConfigRules, normalizeCompatibilityConfig } from "./src/config-compat.js";

View File

@@ -29,6 +29,8 @@ import {
convertGoogleMeetTtsAudioForBridge,
extendGoogleMeetOutputEchoSuppression,
isGoogleMeetLikelyAssistantEchoTranscript,
resolveGoogleMeetRealtimeProvider,
resolveGoogleMeetRealtimeTranscriptionProvider,
startCommandAgentAudioBridge,
startCommandRealtimeAudioBridge,
} from "./src/realtime.js";
@@ -385,6 +387,7 @@ describe("google-meet plugin", () => {
realtime: {
strategy: "agent",
provider: "openai",
transcriptionProvider: "openai",
introMessage: "Say exactly: I'm here and listening.",
toolPolicy: "safe-read-only",
},
@@ -395,6 +398,87 @@ describe("google-meet plugin", () => {
expect(resolveGoogleMeetConfig({}).realtime.instructions).toContain("openclaw_agent_consult");
});
it("resolves separate realtime providers for agent transcription and bidi voice", () => {
expect(
resolveGoogleMeetConfig({
realtime: {
provider: "openai",
transcriptionProvider: "openai",
voiceProvider: "google",
model: "gemini-2.5-flash-native-audio-preview-12-2025",
},
}).realtime,
).toMatchObject({
provider: "openai",
transcriptionProvider: "openai",
voiceProvider: "google",
model: "gemini-2.5-flash-native-audio-preview-12-2025",
});
});
it("uses voiceProvider for bidi and transcriptionProvider for agent mode resolution", () => {
const voiceProviders: RealtimeVoiceProviderPlugin[] = [
{
id: "openai",
label: "OpenAI",
autoSelectOrder: 1,
isConfigured: () => true,
createBridge: () => {
throw new Error("unused");
},
},
{
id: "google",
label: "Google",
autoSelectOrder: 2,
resolveConfig: ({ rawConfig }) => rawConfig,
isConfigured: () => true,
createBridge: () => {
throw new Error("unused");
},
},
];
const transcriptionProviders: RealtimeTranscriptionProviderPlugin[] = [
{
id: "openai",
label: "OpenAI",
autoSelectOrder: 1,
isConfigured: () => true,
createSession: () => {
throw new Error("unused");
},
},
];
const config = resolveGoogleMeetConfig({
realtime: {
provider: "openai",
transcriptionProvider: "openai",
voiceProvider: "google",
model: "gemini-2.5-flash-native-audio-preview-12-2025",
},
});
expect(
resolveGoogleMeetRealtimeProvider({
config,
fullConfig: {} as never,
providers: voiceProviders,
}),
).toMatchObject({
provider: { id: "google" },
providerConfig: { model: "gemini-2.5-flash-native-audio-preview-12-2025" },
});
expect(
resolveGoogleMeetRealtimeTranscriptionProvider({
config,
fullConfig: {} as never,
providers: transcriptionProviders,
}),
).toMatchObject({
provider: { id: "openai" },
});
});
it("declares barge-in config metadata in the plugin entry and manifest", () => {
const manifest = JSON.parse(
readFileSync(new URL("./openclaw.plugin.json", import.meta.url), "utf8"),

View File

@@ -161,7 +161,15 @@ const googleMeetConfigSchema = {
},
"realtime.provider": {
label: "Speech Provider",
help: "Agent mode uses this for realtime transcription. Bidi mode uses it as the realtime voice provider.",
help: "Compatibility fallback for both realtime transcription and bidi voice. Prefer realtime.transcriptionProvider and realtime.voiceProvider for new configs.",
},
"realtime.transcriptionProvider": {
label: "Realtime Transcription Provider",
help: "Agent mode uses this provider to transcribe meeting audio before regular OpenClaw TTS answers.",
},
"realtime.voiceProvider": {
label: "Bidi Voice Provider",
help: "Bidi mode uses this realtime voice provider. Falls back to realtime.provider when unset.",
},
"realtime.model": {
label: "Bidi Realtime Model",

View File

@@ -154,7 +154,15 @@
},
"realtime.provider": {
"label": "Speech Provider",
"help": "Agent mode uses this for realtime transcription. Bidi mode uses it as the realtime voice provider."
"help": "Compatibility fallback for both realtime transcription and bidi voice. Prefer realtime.transcriptionProvider and realtime.voiceProvider for new configs."
},
"realtime.transcriptionProvider": {
"label": "Realtime Transcription Provider",
"help": "Agent mode uses this provider to transcribe meeting audio before regular OpenClaw TTS answers."
},
"realtime.voiceProvider": {
"label": "Bidi Voice Provider",
"help": "Bidi mode uses this realtime voice provider. Falls back to realtime.provider when unset."
},
"realtime.model": {
"label": "Bidi Realtime Model",
@@ -431,6 +439,13 @@
"type": "string",
"default": "openai"
},
"transcriptionProvider": {
"type": "string",
"default": "openai"
},
"voiceProvider": {
"type": "string"
},
"model": {
"type": "string"
},
@@ -501,5 +516,8 @@
}
}
}
},
"configContracts": {
"compatibilityMigrationPaths": ["plugins.entries.google-meet.config.realtime.provider"]
}
}

View File

@@ -0,0 +1,98 @@
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types";
import { describe, expect, it } from "vitest";
import {
legacyConfigRules,
migrateGoogleMeetLegacyRealtimeProvider,
normalizeCompatibilityConfig,
} from "./config-compat.js";
describe("google-meet config compatibility", () => {
it("detects legacy Google realtime provider config", () => {
expect(
legacyConfigRules[0]?.match({
provider: "google",
model: "gemini-2.5-flash-native-audio-preview-12-2025",
}),
).toBe(true);
});
it("migrates legacy Google bidi provider intent to scoped realtime providers", () => {
const config = {
plugins: {
entries: {
"google-meet": {
enabled: true,
config: {
defaultMode: "agent",
realtime: {
provider: "google",
model: "gemini-2.5-flash-native-audio-preview-12-2025",
providers: {
google: {
voice: "Kore",
},
},
},
},
},
},
},
} as OpenClawConfig;
const migration = migrateGoogleMeetLegacyRealtimeProvider(config);
expect(migration?.changes).toEqual([
'Moved Google Meet legacy realtime.provider="google" intent to realtime.voiceProvider="google" and realtime.transcriptionProvider="openai".',
]);
expect(
(
migration?.config.plugins?.entries?.["google-meet"] as {
config?: { realtime?: Record<string, unknown> };
}
).config?.realtime,
).toEqual({
provider: "openai",
transcriptionProvider: "openai",
voiceProvider: "google",
model: "gemini-2.5-flash-native-audio-preview-12-2025",
providers: {
google: {
voice: "Kore",
},
},
});
});
it("leaves fully scoped provider configs alone", () => {
const config = {
plugins: {
entries: {
"google-meet": {
config: {
realtime: {
provider: "google",
transcriptionProvider: "custom-stt",
voiceProvider: "custom-voice",
},
},
},
},
},
} as OpenClawConfig;
const migration = normalizeCompatibilityConfig({ cfg: config });
expect(migration.changes).toEqual([]);
expect(
(
migration.config.plugins?.entries?.["google-meet"] as {
config?: { realtime?: Record<string, unknown> };
}
).config?.realtime,
).toEqual({
provider: "google",
transcriptionProvider: "custom-stt",
voiceProvider: "custom-voice",
});
});
});

View File

@@ -0,0 +1,84 @@
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types";
type LegacyConfigRule = {
path: Array<string | number>;
message: string;
match: (value: unknown) => boolean;
};
function asRecord(value: unknown): Record<string, unknown> | null {
return value && typeof value === "object" && !Array.isArray(value)
? (value as Record<string, unknown>)
: null;
}
function normalizeProviderId(value: unknown): string | undefined {
return typeof value === "string" && value.trim() ? value.trim().toLowerCase() : undefined;
}
function hasOwn(record: Record<string, unknown>, key: string): boolean {
return Object.prototype.hasOwnProperty.call(record, key);
}
function hasLegacyGoogleRealtimeProvider(value: unknown): boolean {
const realtime = asRecord(value);
if (!realtime || normalizeProviderId(realtime.provider) !== "google") {
return false;
}
return !hasOwn(realtime, "voiceProvider") || !hasOwn(realtime, "transcriptionProvider");
}
export const legacyConfigRules: LegacyConfigRule[] = [
{
path: ["plugins", "entries", "google-meet", "config", "realtime"],
message:
'plugins.entries.google-meet.config.realtime.provider="google" is legacy for Gemini Live bidi mode; use realtime.voiceProvider="google" and realtime.transcriptionProvider="openai". Run "openclaw doctor --fix".',
match: hasLegacyGoogleRealtimeProvider,
},
];
export function migrateGoogleMeetLegacyRealtimeProvider(config: OpenClawConfig): {
config: OpenClawConfig;
changes: string[];
} | null {
const rawEntry = asRecord(config.plugins?.entries?.["google-meet"]);
const rawPluginConfig = asRecord(rawEntry?.config);
const rawRealtime = asRecord(rawPluginConfig?.realtime);
if (!rawRealtime || !hasLegacyGoogleRealtimeProvider(rawRealtime)) {
return null;
}
const nextConfig = structuredClone(config);
const nextPlugins = asRecord(nextConfig.plugins) ?? {};
nextConfig.plugins = nextPlugins;
const nextEntries = asRecord(nextPlugins.entries) ?? {};
nextPlugins.entries = nextEntries;
const nextEntry = asRecord(nextEntries["google-meet"]) ?? {};
nextEntries["google-meet"] = nextEntry;
const nextPluginConfig = asRecord(nextEntry.config) ?? {};
nextEntry.config = nextPluginConfig;
const nextRealtime = asRecord(nextPluginConfig.realtime) ?? {};
nextPluginConfig.realtime = nextRealtime;
nextRealtime.provider = "openai";
if (!hasOwn(nextRealtime, "transcriptionProvider")) {
nextRealtime.transcriptionProvider = "openai";
}
if (!hasOwn(nextRealtime, "voiceProvider")) {
nextRealtime.voiceProvider = "google";
}
return {
config: nextConfig,
changes: [
'Moved Google Meet legacy realtime.provider="google" intent to realtime.voiceProvider="google" and realtime.transcriptionProvider="openai".',
],
};
}
export function normalizeCompatibilityConfig({ cfg }: { cfg: OpenClawConfig }): {
config: OpenClawConfig;
changes: string[];
} {
return migrateGoogleMeetLegacyRealtimeProvider(cfg) ?? { config: cfg, changes: [] };
}

View File

@@ -65,6 +65,8 @@ export type GoogleMeetConfig = {
realtime: {
strategy: GoogleMeetRealtimeStrategy;
provider?: string;
transcriptionProvider?: string;
voiceProvider?: string;
model?: string;
instructions?: string;
introMessage?: string;
@@ -220,6 +222,7 @@ const DEFAULT_GOOGLE_MEET_CONFIG: GoogleMeetConfig = {
realtime: {
strategy: "agent",
provider: "openai",
transcriptionProvider: "openai",
instructions: DEFAULT_GOOGLE_MEET_REALTIME_INSTRUCTIONS,
introMessage: DEFAULT_GOOGLE_MEET_REALTIME_INTRO_MESSAGE,
toolPolicy: "safe-read-only",
@@ -536,6 +539,10 @@ export function resolveGoogleMeetConfigWithEnv(
),
provider:
normalizeOptionalString(realtime.provider) ?? DEFAULT_GOOGLE_MEET_CONFIG.realtime.provider,
transcriptionProvider:
normalizeOptionalString(realtime.transcriptionProvider) ??
DEFAULT_GOOGLE_MEET_CONFIG.realtime.transcriptionProvider,
voiceProvider: normalizeOptionalString(realtime.voiceProvider),
model: normalizeOptionalString(realtime.model) ?? DEFAULT_GOOGLE_MEET_CONFIG.realtime.model,
instructions:
normalizeOptionalString(realtime.instructions) ??

View File

@@ -357,8 +357,9 @@ export function resolveGoogleMeetRealtimeProvider(params: {
fullConfig: OpenClawConfig;
providers?: RealtimeVoiceProviderPlugin[];
}): ResolvedRealtimeProvider {
const providerId = params.config.realtime.voiceProvider ?? params.config.realtime.provider;
return resolveConfiguredRealtimeVoiceProvider({
configuredProviderId: params.config.realtime.provider,
configuredProviderId: providerId,
providerConfigs: params.config.realtime.providers,
cfg: params.fullConfig,
providers: params.providers,
@@ -376,19 +377,19 @@ export function resolveGoogleMeetRealtimeTranscriptionProvider(params: {
if (providers.length === 0) {
throw new Error("No configured realtime transcription provider registered");
}
const configuredProvider = params.config.realtime.provider
const providerId =
params.config.realtime.transcriptionProvider ?? params.config.realtime.provider;
const configuredProvider = providerId
? (params.providers?.find(
(entry) =>
entry.id === params.config.realtime.provider ||
entry.aliases?.includes(params.config.realtime.provider ?? ""),
) ?? getRealtimeTranscriptionProvider(params.config.realtime.provider, params.fullConfig))
(entry) => entry.id === providerId || entry.aliases?.includes(providerId),
) ?? getRealtimeTranscriptionProvider(providerId, params.fullConfig))
: undefined;
const provider = configuredProvider ?? providers[0];
if (!provider) {
throw new Error("No configured realtime transcription provider registered");
}
const rawConfig = params.config.realtime.provider
? (params.config.realtime.providers[params.config.realtime.provider] ??
const rawConfig = providerId
? (params.config.realtime.providers[providerId] ??
params.config.realtime.providers[provider.id] ??
{})
: (params.config.realtime.providers[provider.id] ?? {});

View File

@@ -402,9 +402,16 @@ export class GoogleMeetRuntime {
realtime: {
enabled: isGoogleMeetTalkBackMode(mode),
strategy: mode === "bidi" ? "bidi" : "agent",
provider: mode === "bidi" ? this.params.config.realtime.provider : undefined,
provider:
mode === "bidi"
? (this.params.config.realtime.voiceProvider ?? this.params.config.realtime.provider)
: undefined,
model: mode === "bidi" ? this.params.config.realtime.model : undefined,
transcriptionProvider: mode === "agent" ? this.params.config.realtime.provider : undefined,
transcriptionProvider:
mode === "agent"
? (this.params.config.realtime.transcriptionProvider ??
this.params.config.realtime.provider)
: undefined,
toolPolicy: this.params.config.realtime.toolPolicy,
},
notes: [],