From 155f4300babdff2f0e506f9ea878088fd80f5415 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sun, 5 Apr 2026 08:14:41 +0100 Subject: [PATCH] fix(voice-call): use full config for realtime transcription (#61224) * fix(voice-call): use full config for realtime transcription * fix(changelog): note voice-call transcription regression * Update CHANGELOG.md --- CHANGELOG.md | 1 + extensions/voice-call/src/runtime.test.ts | 27 +++++++++++++++++++++++ extensions/voice-call/src/runtime.ts | 1 + extensions/voice-call/src/webhook.ts | 6 ++++- 4 files changed, 34 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ab10a5f3148..00d4035d78f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -51,6 +51,7 @@ Docs: https://docs.openclaw.ai - Status/usage: let `/status` and `session_status` fall back to transcript token totals when the session meta store stayed at zero, so LM Studio, Ollama, DashScope, and similar OpenAI-compatible providers stop showing `Context: 0/...`. (#55041) Thanks @jjjojoj. - Providers/Z.AI: preserve explicitly registered `glm-5-*` variants like `glm-5-turbo` instead of intercepting them with the generic GLM-5 forward-compat shim. (#48185) Thanks @haoyu-haoyu. - Live model switching: only treat explicit user-driven model changes as pending live switches, so fallback rotation, heartbeat overrides, and compaction no longer trip `LiveSessionModelSwitchError` before making an API call. (#60266) Thanks @kiranvk-2011. +- Voice-call/OpenAI: pass full plugin config into realtime transcription provider resolution so streaming calls can discover the bundled OpenAI realtime transcription provider again. Fixes #60936. Thanks @sliekens and @vincentkoc. - Plugins/OpenAI: enable `gpt-image-1` reference-image edits through `/images/edits` multipart uploads, and stop inferring unsupported resolution overrides when no explicit `size` or `resolution` is provided. 
- Gateway/startup: default `gateway.mode` to `local` when unset, detect PID recycling in gateway lock files on Windows and macOS, and show startup progress so healthy restarts stop getting blocked by stale locks. (#54801, #60085, #59843) - Mobile pairing/Android: tighten secure endpoint handling so Tailscale and public remote setup reject cleartext endpoints, private LAN pairing still works, merged-role approvals mint both node and operator device tokens, and bootstrap tokens survive node auto-pair until operator approval finishes. (#60128, #60208, #60221) diff --git a/extensions/voice-call/src/runtime.test.ts b/extensions/voice-call/src/runtime.test.ts index ffe9093c4e2..ecb7ab2cb07 100644 --- a/extensions/voice-call/src/runtime.test.ts +++ b/extensions/voice-call/src/runtime.test.ts @@ -1,3 +1,4 @@ +import type { OpenClawConfig } from "openclaw/plugin-sdk/core"; import { beforeEach, describe, expect, it, vi } from "vitest"; import type { VoiceCallConfig } from "./config.js"; import type { CoreConfig } from "./core-bridge.js"; @@ -10,6 +11,7 @@ const mocks = vi.hoisted(() => ({ webhookStart: vi.fn(), webhookStop: vi.fn(), webhookGetMediaStreamHandler: vi.fn(), + webhookCtorArgs: [] as unknown[][], startTunnel: vi.fn(), setupTailscaleExposure: vi.fn(), cleanupTailscaleExposure: vi.fn(), @@ -28,6 +30,9 @@ vi.mock("./manager.js", () => ({ vi.mock("./webhook.js", () => ({ VoiceCallWebhookServer: class { + constructor(...args: unknown[]) { + mocks.webhookCtorArgs.push(args); + } start = mocks.webhookStart; stop = mocks.webhookStop; getMediaStreamHandler = mocks.webhookGetMediaStreamHandler; @@ -58,6 +63,7 @@ describe("createVoiceCallRuntime lifecycle", () => { mocks.webhookStart.mockResolvedValue("http://127.0.0.1:3334/voice/webhook"); mocks.webhookStop.mockResolvedValue(undefined); mocks.webhookGetMediaStreamHandler.mockReturnValue(undefined); + mocks.webhookCtorArgs.length = 0; mocks.startTunnel.mockResolvedValue(null); 
mocks.setupTailscaleExposure.mockResolvedValue(null); mocks.cleanupTailscaleExposure.mockResolvedValue(undefined); @@ -106,4 +112,25 @@ describe("createVoiceCallRuntime lifecycle", () => { expect(mocks.cleanupTailscaleExposure).toHaveBeenCalledTimes(1); expect(mocks.webhookStop).toHaveBeenCalledTimes(1); }); + + it("passes fullConfig to the webhook server for streaming provider resolution", async () => { + const coreConfig = { messages: { tts: { provider: "openai" } } } as CoreConfig; + const fullConfig = { + plugins: { + entries: { + openai: { enabled: true }, + }, + }, + } as OpenClawConfig; + + await createVoiceCallRuntime({ + config: createBaseConfig(), + coreConfig, + fullConfig, + agentRuntime: {} as never, + }); + + expect(mocks.webhookCtorArgs[0]?.[3]).toBe(coreConfig); + expect(mocks.webhookCtorArgs[0]?.[4]).toBe(fullConfig); + }); }); diff --git a/extensions/voice-call/src/runtime.ts b/extensions/voice-call/src/runtime.ts index 731073985ec..dc463b6ebbf 100644 --- a/extensions/voice-call/src/runtime.ts +++ b/extensions/voice-call/src/runtime.ts @@ -231,6 +231,7 @@ export async function createVoiceCallRuntime(params: { manager, provider, coreConfig, + (fullConfig ?? 
(coreConfig as OpenClawConfig)) as OpenClawConfig, agentRuntime, ); if (realtimeProvider) { diff --git a/extensions/voice-call/src/webhook.ts b/extensions/voice-call/src/webhook.ts index ef64cfcc4d3..5327066e9a8 100644 --- a/extensions/voice-call/src/webhook.ts +++ b/extensions/voice-call/src/webhook.ts @@ -84,6 +84,7 @@ export class VoiceCallWebhookServer { private manager: CallManager; private provider: VoiceCallProvider; private coreConfig: CoreConfig | null; + private fullConfig: OpenClawConfig | null; private agentRuntime: CoreAgentDeps | null; private stopStaleCallReaper: (() => void) | null = null; private readonly webhookInFlightLimiter = createWebhookInFlightLimiter(); @@ -100,12 +101,14 @@ export class VoiceCallWebhookServer { manager: CallManager, provider: VoiceCallProvider, coreConfig?: CoreConfig, + fullConfig?: OpenClawConfig, agentRuntime?: CoreAgentDeps, ) { this.config = normalizeVoiceCallConfig(config); this.manager = manager; this.provider = provider; this.coreConfig = coreConfig ?? null; + this.fullConfig = fullConfig ?? null; this.agentRuntime = agentRuntime ?? null; } @@ -159,7 +162,8 @@ export class VoiceCallWebhookServer { */ private async initializeMediaStreaming(): Promise<void> { const streaming = this.config.streaming; - const pluginConfig = this.coreConfig as unknown as OpenClawConfig | undefined; + const pluginConfig = + this.fullConfig ?? (this.coreConfig as unknown as OpenClawConfig | undefined); const { getRealtimeTranscriptionProvider, listRealtimeTranscriptionProviders } = await import("./realtime-transcription.runtime.js"); const resolution = resolveConfiguredCapabilityProvider({