From 012841816d1dedb07299d591fcf4b84318cde32d Mon Sep 17 00:00:00 2001
From: Peter Steinberger <steipete@gmail.com>
Date: Thu, 23 Apr 2026 00:46:19 +0100
Subject: [PATCH] feat: add xai speech-to-text support

---
 CHANGELOG.md                                  |  2 +-
 docs/nodes/media-understanding.md             |  3 +-
 docs/providers/index.md                       |  1 +
 docs/providers/xai.md                         | 93 ++++++++++++++-----
 docs/tools/media-overview.md                  | 10 +-
 extensions/xai/index.test.ts                  | 19 ++++
 extensions/xai/index.ts                       |  2 +
 extensions/xai/openclaw.plugin.json           | 12 +++
 .../xai/plugin-registration.contract.test.ts  |  1 +
 extensions/xai/stt.test.ts                    | 65 +++++++++++++
 extensions/xai/stt.ts                         | 89 ++++++++++++++++++
 extensions/xai/xai.live.test.ts               | 38 ++++++++
 src/media-understanding/defaults.test.ts      |  1 +
 .../plugin-registration-contract-cases.ts     |  1 +
 14 files changed, 307 insertions(+), 30 deletions(-)
 create mode 100644 extensions/xai/stt.test.ts
 create mode 100644 extensions/xai/stt.ts
diff --git a/CHANGELOG.md b/CHANGELOG.md
index dc3b4f97d1a..28e855153cf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,7 +10,7 @@ Docs: https://docs.openclaw.ai
 - OpenAI/Responses: use OpenAI's native `web_search` tool automatically for direct OpenAI Responses models when web search is enabled and no managed search provider is pinned; explicit providers such as Brave keep the managed `web_search` tool.
 - ACPX: add an explicit `openClawToolsMcpBridge` option that injects a core OpenClaw MCP server for selected built-in tools, starting with `cron`.
 - Providers/GPT-5: move the GPT-5 prompt overlay into the shared provider runtime so compatible GPT-5 models receive the same behavior and heartbeat guidance through OpenAI, OpenRouter, OpenCode, Codex, and other GPT providers; add `agents.defaults.promptOverlays.gpt5.personality` as the global friendly-style toggle while keeping the OpenAI plugin setting as a fallback.
-- Providers/xAI: add image generation and text-to-speech support, including `grok-imagine-image` / `grok-imagine-image-pro`, reference-image edits, six live xAI voices, and MP3/WAV/PCM/G.711 TTS formats. (#68694) Thanks @KateWilkins.
+- Providers/xAI: add image generation, text-to-speech, and speech-to-text support, including `grok-imagine-image` / `grok-imagine-image-pro`, reference-image edits, six live xAI voices, MP3/WAV/PCM/G.711 TTS formats, and `grok-stt` audio transcription. (#68694) Thanks @KateWilkins.
 - Models/commands: add `/models add <provider> <modelId>` so you can register a model from chat and use it without restarting the gateway; keep `/models` as a simple provider browser while adding clearer add guidance and copy-friendly command examples. (#70211) Thanks @Takhoffman.
 - Pi/models: update the bundled pi packages to `0.68.1` and let the OpenCode Go catalog come from pi instead of plugin-maintained model aliases, adding the refreshed `opencode-go/kimi-k2.6`, Qwen, GLM, MiMo, and MiniMax entries.
 - CLI/doctor plugins: lazy-load doctor plugin paths and prefer installed plugin `dist/*` runtime entries over source-adjacent JavaScript fallbacks, reducing the measured `doctor --non-interactive` runtime by about 74% while keeping cold doctor startup on built plugin artifacts. (#69840) Thanks @gumadeiras.
diff --git a/docs/nodes/media-understanding.md b/docs/nodes/media-understanding.md
index ff4e60f92be..185bfdcd6dc 100644
--- a/docs/nodes/media-understanding.md
+++ b/docs/nodes/media-understanding.md
@@ -164,7 +164,7 @@ working option**:
      example through `agents.defaults.imageModel` or
      `openclaw infer image describe --model ollama/<vision-model>`.
    - Bundled fallback order:
-     - Audio: OpenAI → Groq → Deepgram → Google → Mistral
+     - Audio: OpenAI → Groq → xAI → Deepgram → Google → Mistral
      - Image: OpenAI → Anthropic → Google → MiniMax → MiniMax Portal → Z.AI
      - Video: Google → Qwen → Moonshot
 
@@ -212,6 +212,7 @@ lists, OpenClaw can infer defaults:
 - `mistral`: **audio**
 - `zai`: **image**
 - `groq`: **audio**
+- `xai`: **audio**
 - `deepgram`: **audio**
 - Any `models.providers.<id>.models[]` catalog with an image-capable model:
   **image**
diff --git a/docs/providers/index.md b/docs/providers/index.md
index d5c60c14bd9..c6b82cf62fb 100644
--- a/docs/providers/index.md
+++ b/docs/providers/index.md
@@ -82,6 +82,7 @@ Looking for chat channel docs (WhatsApp/Telegram/Discord/Slack/Mattermost (plugi
 ## Transcription providers
 
 - [Deepgram (audio transcription)](/providers/deepgram)
+- [xAI (audio transcription)](/providers/xai#speech-to-text)
 
 ## Community tools
 
diff --git a/docs/providers/xai.md b/docs/providers/xai.md
index 361fc049b83..4d2da6750ec 100644
--- a/docs/providers/xai.md
+++ b/docs/providers/xai.md
@@ -68,25 +68,27 @@ current image-capable Grok refs in the bundled catalog.
 The bundled plugin maps xAI's current public API surface onto OpenClaw's shared
 provider and tool contracts where the behavior fits cleanly.
 
-| xAI capability             | OpenClaw surface                       | Status                                                              |
-| -------------------------- | -------------------------------------- | ------------------------------------------------------------------- |
-| Chat / Responses           | `xai/<model>` model provider           | Yes                                                                 |
-| Server-side web search     | `web_search` provider `grok`           | Yes                                                                 |
-| Server-side X search       | `x_search` tool                        | Yes                                                                 |
-| Server-side code execution | `code_execution` tool                  | Yes                                                                 |
-| Images                     | `image_generate`                       | Yes                                                                 |
-| Videos                     | `video_generate`                       | Yes                                                                 |
-| Batch text-to-speech       | `messages.tts.provider: "xai"` / `tts` | Yes                                                                 |
-| Streaming TTS              | —                                      | Not exposed; OpenClaw's TTS contract returns complete audio buffers |
-| Speech-to-text             | —                                      | Not exposed yet; needs a transcription provider surface             |
-| Realtime voice             | —                                      | Not exposed yet; different session/WebSocket contract               |
-| Files / batches            | Generic model API compatibility only   | Not a first-class OpenClaw tool                                     |
+| xAI capability             | OpenClaw surface                          | Status                                                              |
+| -------------------------- | ----------------------------------------- | ------------------------------------------------------------------- |
+| Chat / Responses           | `xai/<model>` model provider              | Yes                                                                 |
+| Server-side web search     | `web_search` provider `grok`              | Yes                                                                 |
+| Server-side X search       | `x_search` tool                           | Yes                                                                 |
+| Server-side code execution | `code_execution` tool                     | Yes                                                                 |
+| Images                     | `image_generate`                          | Yes                                                                 |
+| Videos                     | `video_generate`                          | Yes                                                                 |
+| Batch text-to-speech       | `messages.tts.provider: "xai"` / `tts`    | Yes                                                                 |
+| Streaming TTS              | —                                         | Not exposed; OpenClaw's TTS contract returns complete audio buffers |
+| Batch speech-to-text       | `tools.media.audio` / media understanding | Yes                                                                 |
+| Streaming speech-to-text   | —                                         | Not exposed; needs streaming transcription contract mapping         |
+| Realtime voice             | —                                         | Not exposed yet; different session/WebSocket contract               |
+| Files / batches            | Generic model API compatibility only      | Not a first-class OpenClaw tool                                     |
 
 <Note>
-OpenClaw uses xAI's REST image/video/TTS APIs for media generation and the
-Responses API for model, search, and code-execution tools. Features that need
-new OpenClaw contracts, such as streaming STT or Realtime voice sessions, are
-documented here as upstream capabilities rather than hidden plugin behavior.
+OpenClaw uses xAI's REST image/video/TTS/STT APIs for media generation,
+speech, and transcription, and the Responses API for model, search, and
+code-execution tools. Features that need new OpenClaw contracts, such as
+streaming STT or Realtime voice sessions, are documented here as upstream
+capabilities rather than hidden plugin behavior.
 </Note>
 
 ### Fast-mode mappings
@@ -239,6 +241,50 @@ Legacy aliases still normalize to the canonical bundled ids:
 
   </Accordion>
 
+  <Accordion title="Speech-to-text">
+    The bundled `xai` plugin registers batch speech-to-text through OpenClaw's
+    media-understanding transcription surface.
+
+    - Default model: `grok-stt`
+    - Endpoint: xAI REST `/v1/stt`
+    - Input path: multipart audio file upload
+    - Supported by OpenClaw wherever inbound audio transcription uses
+      `tools.media.audio`, including Discord voice-channel segments and
+      channel audio attachments
+
+    To force xAI for inbound audio transcription:
+
+    ```json5
+    {
+      tools: {
+        media: {
+          audio: {
+            models: [
+              {
+                type: "provider",
+                provider: "xai",
+                model: "grok-stt",
+              },
+            ],
+          },
+        },
+      },
+    }
+    ```
+
+    Language can be supplied through the shared audio media config or a per-call
+    transcription request. Prompt hints are accepted by the shared OpenClaw
+    surface, but the xAI REST STT integration only forwards file, model, and
+    language because those map cleanly to the current public xAI endpoint.
+
+    <Note>
+    xAI also offers streaming STT over `wss://api.x.ai/v1/stt`. OpenClaw's
+    bundled xAI plugin does not expose that yet; the current provider is batch
+    STT for file/segment transcription.
+    </Note>
+
+  </Accordion>
+
   <Accordion title="x_search configuration">
     The bundled xAI plugin exposes `x_search` as an OpenClaw tool for searching
     X (formerly Twitter) content via Grok.
@@ -316,9 +362,9 @@ Legacy aliases still normalize to the canonical bundled ids:
     - `grok-4.20-multi-agent-experimental-beta-0304` is not supported on the
       normal xAI provider path because it requires a different upstream API
       surface than the standard OpenClaw xAI transport.
-    - xAI STT and Realtime voice are not registered as OpenClaw providers yet.
-      They require transcription/session contracts rather than the existing
-      batch TTS provider shape.
+    - xAI streaming STT and Realtime voice are not registered as OpenClaw
+      providers yet. Batch xAI STT is registered through media understanding.
+      Streaming STT and Realtime voice need WebSocket/session contract mapping.
     - xAI image `quality`, image `mask`, and extra native-only aspect ratios are
       not exposed until the shared `image_generate` tool has corresponding
       cross-provider controls.
@@ -355,9 +401,10 @@ OPENCLAW_LIVE_TEST=1 OPENCLAW_LIVE_TEST_QUIET=1 OPENCLAW_LIVE_IMAGE_GENERATION_P
 ```
 
-The provider-specific live file synthesizes normal TTS, telephony-friendly PCM
-TTS, text-to-image generation, and reference-image editing. The shared image
-live file verifies the same xAI provider through OpenClaw's runtime selection,
-fallback, normalization, and media attachment path.
+The provider-specific live file synthesizes normal TTS and telephony-friendly
+PCM TTS, transcribes audio through xAI STT, generates text-to-image output, and
+edits a reference image. The shared image live file verifies the same xAI
+provider through OpenClaw's runtime selection, fallback, normalization, and
+media attachment path.
 
 ## Related
 
diff --git a/docs/tools/media-overview.md b/docs/tools/media-overview.md
index a930c1d522d..2e89ed1eff3 100644
--- a/docs/tools/media-overview.md
+++ b/docs/tools/media-overview.md
@@ -41,7 +41,7 @@ This table shows which providers support which media capabilities across the pla
 | Runway     |       | Yes   |       |     |                     |                     |
 | Together   |       | Yes   |       |     |                     |                     |
 | Vydra      | Yes   | Yes   |       |     |                     |                     |
-| xAI        | Yes   | Yes   |       | Yes |                     |                     |
+| xAI        | Yes   | Yes   |       | Yes | Yes                 | Yes                 |
 
 <Note>
 Media understanding uses any vision-capable or audio-capable model registered in your provider config. The table above highlights providers with dedicated media-understanding support; most LLM providers with multimodal models (Anthropic, Google, OpenAI, etc.) can also understand inbound media when configured as the active reply model.
@@ -51,10 +51,10 @@ Media understanding uses any vision-capable or audio-capable model registered in
 
 Video and music generation run as background tasks because provider processing typically takes 30 seconds to several minutes. When the agent calls `video_generate` or `music_generate`, OpenClaw submits the request to the provider, returns a task ID immediately, and tracks the job in the task ledger. The agent continues responding to other messages while the job runs. When the provider finishes, OpenClaw wakes the agent so it can post the finished media back into the original channel. Image generation and TTS are synchronous and complete inline with the reply.
 
-xAI currently maps to OpenClaw's image, video, search, code-execution, and
-batch TTS surfaces. xAI STT and Realtime voice are upstream capabilities, but
-they are not registered in OpenClaw until the shared transcription and realtime
-voice contracts can represent them.
+xAI currently maps to OpenClaw's image, video, search, code-execution, batch
+TTS, and batch STT surfaces. xAI streaming STT and Realtime voice are upstream
+capabilities, but they are not registered in OpenClaw until the shared
+streaming transcription and realtime voice contracts can represent them.
 
 ## Quick links
 
diff --git a/extensions/xai/index.test.ts b/extensions/xai/index.test.ts
index 5963e09ac9b..5ffbb500b21 100644
--- a/extensions/xai/index.test.ts
+++ b/extensions/xai/index.test.ts
@@ -2,6 +2,7 @@ import type { OpenClawPluginApi } from "openclaw/plugin-sdk/plugin-entry";
 import { describe, expect, it } from "vitest";
 import { createTestPluginApi } from "../../test/helpers/plugins/plugin-api.js";
 import { registerSingleProviderPlugin } from "../../test/helpers/plugins/plugin-registration.js";
+import { registerProviderPlugin } from "../../test/helpers/plugins/provider-registration.js";
 import plugin from "./index.js";
 import setupPlugin from "./setup-api.js";
 import {
@@ -49,6 +50,24 @@ function registerXaiAutoEnableProbe(): XaiAutoEnableProbe {
 }
 
 describe("xai provider plugin", () => {
+  it("registers xAI media understanding for batch STT", async () => {
+    const { mediaProviders } = await registerProviderPlugin({
+      plugin,
+      id: "xai",
+      name: "xAI Provider",
+    });
+
+    expect(mediaProviders).toEqual(
+      expect.arrayContaining([
+        expect.objectContaining({
+          id: "xai",
+          capabilities: ["audio"],
+          defaultModels: { audio: "grok-stt" },
+        }),
+      ]),
+    );
+  });
+
   it("declares setup auto-enable reasons for plugin-owned tool config", () => {
     const probe = registerXaiAutoEnableProbe();
 
diff --git a/extensions/xai/index.ts b/extensions/xai/index.ts
index 0153de9493b..7519660d9d3 100644
--- a/extensions/xai/index.ts
+++ b/extensions/xai/index.ts
@@ -18,6 +18,7 @@ import { buildXaiSpeechProvider } from "./speech-provider.js";
 import { resolveFallbackXaiAuth } from "./src/tool-auth-shared.js";
 import { resolveEffectiveXSearchConfig } from "./src/x-search-config.js";
 import { wrapXaiProviderStream } from "./stream.js";
+import { buildXaiMediaUnderstandingProvider } from "./stt.js";
 import { buildXaiVideoGenerationProvider } from "./video-generation-provider.js";
 import { createXaiWebSearchProvider } from "./web-search.js";
 import {
@@ -204,6 +205,7 @@ export default defineSingleProviderPluginEntry({
   },
   register(api) {
     api.registerWebSearchProvider(createXaiWebSearchProvider());
+    api.registerMediaUnderstandingProvider(buildXaiMediaUnderstandingProvider());
     api.registerVideoGenerationProvider(buildXaiVideoGenerationProvider());
     api.registerImageGenerationProvider(buildXaiImageGenerationProvider());
     api.registerSpeechProvider(buildXaiSpeechProvider());
diff --git a/extensions/xai/openclaw.plugin.json b/extensions/xai/openclaw.plugin.json
index a1ae0b0617a..1e29cc7b1dd 100644
--- a/extensions/xai/openclaw.plugin.json
+++ b/extensions/xai/openclaw.plugin.json
@@ -85,10 +85,22 @@
   "contracts": {
     "webSearchProviders": ["grok"],
     "videoGenerationProviders": ["xai"],
+    "mediaUnderstandingProviders": ["xai"],
     "speechProviders": ["xai"],
     "imageGenerationProviders": ["xai"],
     "tools": ["code_execution", "x_search"]
   },
+  "mediaUnderstandingProviderMetadata": {
+    "xai": {
+      "capabilities": ["audio"],
+      "defaultModels": {
+        "audio": "grok-stt"
+      },
+      "autoPriority": {
+        "audio": 25
+      }
+    }
+  },
   "configContracts": {
     "compatibilityRuntimePaths": ["tools.web.search.apiKey"]
   },
diff --git a/extensions/xai/plugin-registration.contract.test.ts b/extensions/xai/plugin-registration.contract.test.ts
index 1de5bd17bd2..bf97b7f035f 100644
--- a/extensions/xai/plugin-registration.contract.test.ts
+++ b/extensions/xai/plugin-registration.contract.test.ts
@@ -4,6 +4,7 @@ describePluginRegistrationContract({
   pluginId: "xai",
   providerIds: ["xai"],
   webSearchProviderIds: ["grok"],
+  mediaUnderstandingProviderIds: ["xai"],
   videoGenerationProviderIds: ["xai"],
   toolNames: ["code_execution", "x_search"],
   requireGenerateVideo: true,
diff --git a/extensions/xai/stt.test.ts b/extensions/xai/stt.test.ts
new file mode 100644
index 00000000000..ecbeeb917b5
--- /dev/null
+++ b/extensions/xai/stt.test.ts
@@ -0,0 +1,72 @@
+import { describe, expect, it, vi } from "vitest";
+import {
+  buildXaiMediaUnderstandingProvider,
+  transcribeXaiAudio,
+  XAI_DEFAULT_STT_MODEL,
+} from "./stt.js";
+
+// Stand-in for the SDK transport call: records the outgoing request and answers
+// with a canned transcript plus a spy-able `release` handle.
+const { postTranscriptionRequestMock } = vi.hoisted(() => ({
+  postTranscriptionRequestMock: vi.fn(
+    async (_params: { headers: Headers; body: BodyInit; url: string; timeoutMs?: number }) => ({
+      response: new Response(JSON.stringify({ text: "hello from audio" }), { status: 200 }),
+      release: vi.fn(),
+    }),
+  ),
+}));
+
+// Keep the real provider-http helpers and swap only `postTranscriptionRequest`.
+vi.mock("openclaw/plugin-sdk/provider-http", async (importOriginal) => {
+  const actual = await importOriginal<typeof import("openclaw/plugin-sdk/provider-http")>();
+  return {
+    ...actual,
+    postTranscriptionRequest: postTranscriptionRequestMock,
+  };
+});
+
+describe("xai stt", () => {
+  it("posts audio files to the xAI STT endpoint", async () => {
+    const result = await transcribeXaiAudio({
+      buffer: Buffer.from("audio-bytes"),
+      fileName: "sample.wav",
+      mime: "audio/wav",
+      apiKey: "xai-key",
+      baseUrl: "https://api.x.ai/v1/",
+      model: XAI_DEFAULT_STT_MODEL,
+      language: "en",
+      prompt: "ignored provider hint",
+      timeoutMs: 10_000,
+    });
+
+    expect(result).toEqual({ text: "hello from audio", model: XAI_DEFAULT_STT_MODEL });
+    expect(postTranscriptionRequestMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        url: "https://api.x.ai/v1/stt",
+        timeoutMs: 10_000,
+        auditContext: "xai stt",
+      }),
+    );
+    const call = postTranscriptionRequestMock.mock.calls[0]?.[0];
+    expect(call?.headers.get("authorization")).toBe("Bearer xai-key");
+    expect(call?.body).toBeInstanceOf(FormData);
+    const form = call?.body as FormData;
+    expect(form.get("model")).toBe(XAI_DEFAULT_STT_MODEL);
+    expect(form.get("language")).toBe("en");
+    expect(form.get("prompt")).toBeNull();
+    expect(form.get("file")).toBeInstanceOf(Blob);
+
+    // The `release` handle returned by the transport must be called exactly once.
+    const transport = await postTranscriptionRequestMock.mock.results[0]?.value;
+    expect(transport?.release).toHaveBeenCalledTimes(1);
+  });
+
+  it("registers as an audio media-understanding provider", () => {
+    expect(buildXaiMediaUnderstandingProvider()).toMatchObject({
+      id: "xai",
+      capabilities: ["audio"],
+      defaultModels: { audio: XAI_DEFAULT_STT_MODEL },
+      autoPriority: { audio: 25 },
+    });
+  });
+});
diff --git a/extensions/xai/stt.ts b/extensions/xai/stt.ts
new file mode 100644
index 00000000000..4394be1bcf3
--- /dev/null
+++ b/extensions/xai/stt.ts
@@ -0,0 +1,125 @@
+import type {
+  AudioTranscriptionRequest,
+  AudioTranscriptionResult,
+  MediaUnderstandingProvider,
+} from "openclaw/plugin-sdk/media-understanding";
+import {
+  assertOkOrThrowHttpError,
+  postTranscriptionRequest,
+  resolveProviderHttpRequestConfig,
+  requireTranscriptionText,
+} from "openclaw/plugin-sdk/provider-http";
+import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime";
+import { XAI_BASE_URL } from "./model-definitions.js";
+
+/** Default xAI speech-to-text model id, also the provider's default audio model. */
+export const XAI_DEFAULT_STT_MODEL = "grok-stt";
+
+/** Subset of the xAI STT JSON response body that this integration reads. */
+type XaiSttResponse = {
+  text?: string;
+};
+
+/**
+ * Picks the base URL used for xAI STT requests.
+ *
+ * Precedence: the explicit `value`, then the `XAI_BASE_URL` environment
+ * variable, then the bundled `XAI_BASE_URL` constant. Each candidate is
+ * normalized separately so an explicit value that normalizes to nothing falls
+ * through to the environment override instead of skipping it.
+ */
+function resolveXaiSttBaseUrl(value?: string): string {
+  return (
+    normalizeOptionalString(value) ??
+    normalizeOptionalString(process.env.XAI_BASE_URL) ??
+    XAI_BASE_URL
+  );
+}
+
+/**
+ * Transcribes one audio buffer through xAI's batch speech-to-text endpoint.
+ *
+ * The audio is uploaded as multipart form data to `<baseUrl>/stt`. Only the
+ * file, `model`, and `language` fields are sent; other request hints such as
+ * `prompt` are not forwarded.
+ *
+ * @param params - Shared transcription request: audio bytes plus credentials
+ *   and optional base URL, model, language, headers, and timeout.
+ * @returns The transcript text, plus `model` when the request named one.
+ * @throws Propagates errors from `assertOkOrThrowHttpError` (response status)
+ *   and `requireTranscriptionText` (missing transcript text).
+ */
+export async function transcribeXaiAudio(
+  params: AudioTranscriptionRequest,
+): Promise<AudioTranscriptionResult> {
+  const fetchFn = params.fetchFn ?? fetch;
+  const { baseUrl, allowPrivateNetwork, headers, dispatcherPolicy } =
+    resolveProviderHttpRequestConfig({
+      baseUrl: resolveXaiSttBaseUrl(params.baseUrl),
+      defaultBaseUrl: XAI_BASE_URL,
+      headers: params.headers,
+      request: params.request,
+      defaultHeaders: {
+        Authorization: `Bearer ${params.apiKey}`,
+      },
+      provider: "xai",
+      api: "xai-stt",
+      capability: "audio",
+      transport: "media-understanding",
+    });
+
+  const form = new FormData();
+  const blob = new Blob([new Uint8Array(params.buffer)], {
+    type: params.mime ?? "application/octet-stream",
+  });
+  // `||` rather than `??` so an empty file name also falls back to "audio".
+  form.append("file", blob, params.fileName || "audio");
+  // Optional fields are attached only when normalization yields a value.
+  const model = normalizeOptionalString(params.model);
+  if (model) {
+    form.append("model", model);
+  }
+  const language = normalizeOptionalString(params.language);
+  if (language) {
+    form.append("language", language);
+  }
+
+  const { response, release } = await postTranscriptionRequest({
+    url: `${baseUrl}/stt`,
+    headers,
+    body: form,
+    timeoutMs: params.timeoutMs,
+    fetchFn,
+    allowPrivateNetwork,
+    dispatcherPolicy,
+    auditContext: "xai stt",
+  });
+
+  try {
+    await assertOkOrThrowHttpError(response, "xAI audio transcription failed");
+    const payload = (await response.json()) as XaiSttResponse;
+    return {
+      text: requireTranscriptionText(payload.text, "xAI transcription response missing text"),
+      ...(model ? { model } : {}),
+    };
+  } finally {
+    // Always call `release`, whether the response was usable or an error was thrown.
+    await release();
+  }
+}
+
+/**
+ * Builds the media-understanding provider entry that exposes xAI batch STT.
+ *
+ * Declares audio as the only capability, `XAI_DEFAULT_STT_MODEL` as the default
+ * audio model, an `autoPriority.audio` of 25, and `transcribeXaiAudio` as the handler.
+ */
+export function buildXaiMediaUnderstandingProvider(): MediaUnderstandingProvider {
+  return {
+    id: "xai",
+    capabilities: ["audio"],
+    defaultModels: { audio: XAI_DEFAULT_STT_MODEL },
+    autoPriority: { audio: 25 },
+    transcribeAudio: transcribeXaiAudio,
+  };
+}
diff --git a/extensions/xai/xai.live.test.ts b/extensions/xai/xai.live.test.ts
index 18db53b3fa7..a601bc40cef 100644
--- a/extensions/xai/xai.live.test.ts
+++ b/extensions/xai/xai.live.test.ts
@@ -9,6 +9,7 @@ import {
   requireRegisteredProvider,
 } from "../../test/helpers/plugins/provider-registration.js";
 import plugin from "./index.js";
+import { XAI_DEFAULT_STT_MODEL } from "./stt.js";
 
 const XAI_API_KEY = process.env.XAI_API_KEY ?? "";
 const LIVE_IMAGE_MODEL = process.env.OPENCLAW_LIVE_XAI_IMAGE_MODEL?.trim() || "grok-imagine-image";
@@ -106,6 +107,43 @@ describeLive("xai plugin live", () => {
     expect(telephony?.audioBuffer.byteLength).toBeGreaterThan(512);
   }, 120_000);
 
+  it("transcribes audio through the registered media provider", async () => {
+    const { mediaProviders, speechProviders } = await registerXaiPlugin();
+    const mediaProvider = requireRegisteredProvider(mediaProviders, "xai");
+    const speechProvider = requireRegisteredProvider(speechProviders, "xai");
+    const cfg = createLiveConfig();
+    const phrase = "OpenClaw xAI speech to text integration test OK.";
+
+    const audioFile = await speechProvider.synthesize({
+      text: phrase,
+      cfg,
+      providerConfig: {
+        apiKey: XAI_API_KEY,
+        baseUrl: "https://api.x.ai/v1",
+        voiceId: "eve",
+      },
+      target: "audio-file",
+      timeoutMs: 90_000,
+    });
+
+    const transcript = await mediaProvider.transcribeAudio?.({
+      buffer: audioFile.audioBuffer,
+      fileName: "xai-stt-live.mp3",
+      mime: "audio/mpeg",
+      apiKey: XAI_API_KEY,
+      baseUrl: "https://api.x.ai/v1",
+      model: XAI_DEFAULT_STT_MODEL,
+      timeoutMs: 90_000,
+    });
+
+    const normalized = transcript?.text.toLowerCase() ?? "";
+    expect(transcript?.model).toBe(XAI_DEFAULT_STT_MODEL);
+    expect(normalized).toContain("openclaw");
+    expect(normalized).toContain("speech");
+    expect(normalized).toContain("text");
+    expect(normalized).toContain("integration");
+  }, 180_000);
+
   it("generates and edits images through the registered image provider", async () => {
     const { imageProviders } = await registerXaiPlugin();
     const imageProvider = requireRegisteredProvider(imageProviders, "xai");
diff --git a/src/media-understanding/defaults.test.ts b/src/media-understanding/defaults.test.ts
index e30ef38c232..2e73d32cf54 100644
--- a/src/media-understanding/defaults.test.ts
+++ b/src/media-understanding/defaults.test.ts
@@ -33,6 +33,7 @@ describe("resolveAutoMediaKeyProviders", () => {
     expect(resolveAutoMediaKeyProviders({ capability: "audio" })).toEqual([
       "openai",
       "groq",
+      "xai",
       "deepgram",
       "google",
       "mistral",
diff --git a/test/helpers/plugins/plugin-registration-contract-cases.ts b/test/helpers/plugins/plugin-registration-contract-cases.ts
index e6a1e554d9d..caa6435e7b3 100644
--- a/test/helpers/plugins/plugin-registration-contract-cases.ts
+++ b/test/helpers/plugins/plugin-registration-contract-cases.ts
@@ -128,6 +128,7 @@ export const pluginRegistrationContractCases = {
     pluginId: "xai",
     providerIds: ["xai"],
     webSearchProviderIds: ["grok"],
+    mediaUnderstandingProviderIds: ["xai"],
   },
   zai: {
     pluginId: "zai",