fix(voice-call): use full config for realtime transcription (#61224)

* fix(voice-call): use full config for realtime transcription

* fix(changelog): note voice-call transcription regression

* Update CHANGELOG.md
This commit is contained in:
Vincent Koc
2026-04-05 08:14:41 +01:00
committed by GitHub
parent 42bc411c46
commit 155f4300ba
4 changed files with 34 additions and 1 deletion

View File

@@ -51,6 +51,7 @@ Docs: https://docs.openclaw.ai
- Status/usage: let `/status` and `session_status` fall back to transcript token totals when the session meta store stayed at zero, so LM Studio, Ollama, DashScope, and similar OpenAI-compatible providers stop showing `Context: 0/...`. (#55041) Thanks @jjjojoj.
- Providers/Z.AI: preserve explicitly registered `glm-5-*` variants like `glm-5-turbo` instead of intercepting them with the generic GLM-5 forward-compat shim. (#48185) Thanks @haoyu-haoyu.
- Live model switching: only treat explicit user-driven model changes as pending live switches, so fallback rotation, heartbeat overrides, and compaction no longer trip `LiveSessionModelSwitchError` before making an API call. (#60266) Thanks @kiranvk-2011.
- Voice-call/OpenAI: pass full plugin config into realtime transcription provider resolution so streaming calls can discover the bundled OpenAI realtime transcription provider again. Fixes #60936. Thanks @sliekens and @vincentkoc.
- Plugins/OpenAI: enable `gpt-image-1` reference-image edits through `/images/edits` multipart uploads, and stop inferring unsupported resolution overrides when no explicit `size` or `resolution` is provided.
- Gateway/startup: default `gateway.mode` to `local` when unset, detect PID recycling in gateway lock files on Windows and macOS, and show startup progress so healthy restarts stop getting blocked by stale locks. (#54801, #60085, #59843)
- Mobile pairing/Android: tighten secure endpoint handling so Tailscale and public remote setup reject cleartext endpoints, private LAN pairing still works, merged-role approvals mint both node and operator device tokens, and bootstrap tokens survive node auto-pair until operator approval finishes. (#60128, #60208, #60221)

View File

@@ -1,3 +1,4 @@
import type { OpenClawConfig } from "openclaw/plugin-sdk/core";
import { beforeEach, describe, expect, it, vi } from "vitest";
import type { VoiceCallConfig } from "./config.js";
import type { CoreConfig } from "./core-bridge.js";
@@ -10,6 +11,7 @@ const mocks = vi.hoisted(() => ({
webhookStart: vi.fn(),
webhookStop: vi.fn(),
webhookGetMediaStreamHandler: vi.fn(),
webhookCtorArgs: [] as unknown[][],
startTunnel: vi.fn(),
setupTailscaleExposure: vi.fn(),
cleanupTailscaleExposure: vi.fn(),
@@ -28,6 +30,9 @@ vi.mock("./manager.js", () => ({
vi.mock("./webhook.js", () => ({
VoiceCallWebhookServer: class {
constructor(...args: unknown[]) {
mocks.webhookCtorArgs.push(args);
}
start = mocks.webhookStart;
stop = mocks.webhookStop;
getMediaStreamHandler = mocks.webhookGetMediaStreamHandler;
@@ -58,6 +63,7 @@ describe("createVoiceCallRuntime lifecycle", () => {
mocks.webhookStart.mockResolvedValue("http://127.0.0.1:3334/voice/webhook");
mocks.webhookStop.mockResolvedValue(undefined);
mocks.webhookGetMediaStreamHandler.mockReturnValue(undefined);
mocks.webhookCtorArgs.length = 0;
mocks.startTunnel.mockResolvedValue(null);
mocks.setupTailscaleExposure.mockResolvedValue(null);
mocks.cleanupTailscaleExposure.mockResolvedValue(undefined);
@@ -106,4 +112,25 @@ describe("createVoiceCallRuntime lifecycle", () => {
expect(mocks.cleanupTailscaleExposure).toHaveBeenCalledTimes(1);
expect(mocks.webhookStop).toHaveBeenCalledTimes(1);
});
it("passes fullConfig to the webhook server for streaming provider resolution", async () => {
  // Two distinct sentinel objects so the identity assertions below are meaningful.
  const bridgeConfig = {
    messages: { tts: { provider: "openai" } },
  } as CoreConfig;
  const pluginTreeConfig = {
    plugins: { entries: { openai: { enabled: true } } },
  } as OpenClawConfig;

  await createVoiceCallRuntime({
    config: createBaseConfig(),
    coreConfig: bridgeConfig,
    fullConfig: pluginTreeConfig,
    agentRuntime: {} as never,
  });

  // Positions 3 and 4 of the mocked webhook-server constructor call must carry
  // the exact coreConfig and fullConfig objects we passed to the runtime.
  const firstCtorCall = mocks.webhookCtorArgs[0];
  expect(firstCtorCall?.[3]).toBe(bridgeConfig);
  expect(firstCtorCall?.[4]).toBe(pluginTreeConfig);
});
});

View File

@@ -231,6 +231,7 @@ export async function createVoiceCallRuntime(params: {
manager,
provider,
coreConfig,
(fullConfig ?? (coreConfig as OpenClawConfig)) as OpenClawConfig,
agentRuntime,
);
if (realtimeProvider) {

View File

@@ -84,6 +84,7 @@ export class VoiceCallWebhookServer {
private manager: CallManager;
private provider: VoiceCallProvider;
private coreConfig: CoreConfig | null;
private fullConfig: OpenClawConfig | null;
private agentRuntime: CoreAgentDeps | null;
private stopStaleCallReaper: (() => void) | null = null;
private readonly webhookInFlightLimiter = createWebhookInFlightLimiter();
@@ -100,12 +101,14 @@ export class VoiceCallWebhookServer {
manager: CallManager,
provider: VoiceCallProvider,
coreConfig?: CoreConfig,
fullConfig?: OpenClawConfig,
agentRuntime?: CoreAgentDeps,
) {
this.config = normalizeVoiceCallConfig(config);
this.manager = manager;
this.provider = provider;
this.coreConfig = coreConfig ?? null;
this.fullConfig = fullConfig ?? null;
this.agentRuntime = agentRuntime ?? null;
}
@@ -159,7 +162,8 @@ export class VoiceCallWebhookServer {
*/
private async initializeMediaStreaming(): Promise<void> {
const streaming = this.config.streaming;
const pluginConfig = this.coreConfig as unknown as OpenClawConfig | undefined;
const pluginConfig =
this.fullConfig ?? (this.coreConfig as unknown as OpenClawConfig | undefined);
const { getRealtimeTranscriptionProvider, listRealtimeTranscriptionProviders } =
await import("./realtime-transcription.runtime.js");
const resolution = resolveConfiguredCapabilityProvider({