diff --git a/extensions/bluebubbles/src/monitor.test.ts b/extensions/bluebubbles/src/monitor.test.ts
index 483b6cd9ed3..68aa4351e10 100644
--- a/extensions/bluebubbles/src/monitor.test.ts
+++ b/extensions/bluebubbles/src/monitor.test.ts
@@ -120,6 +120,9 @@ function createMockRuntime(): PluginRuntime {
     tts: {
       textToSpeechTelephony: vi.fn() as unknown as PluginRuntime["tts"]["textToSpeechTelephony"],
     },
+    stt: {
+      transcribeAudioFile: vi.fn() as unknown as PluginRuntime["stt"]["transcribeAudioFile"],
+    },
     tools: {
       createMemoryGetTool: vi.fn() as unknown as PluginRuntime["tools"]["createMemoryGetTool"],
       createMemorySearchTool:
diff --git a/src/media-understanding/transcribe-audio.ts b/src/media-understanding/transcribe-audio.ts
new file mode 100644
index 00000000000..3573a0a4333
--- /dev/null
+++ b/src/media-understanding/transcribe-audio.ts
@@ -0,0 +1,51 @@
+import type { OpenClawConfig } from "../config/config.js";
+import {
+  buildProviderRegistry,
+  createMediaAttachmentCache,
+  normalizeMediaAttachments,
+  runCapability,
+} from "./runner.js";
+
+/**
+ * Transcribe a single audio file through the "audio" media-understanding capability.
+ *
+ * Wraps `filePath` in a one-attachment context (MIME defaults to "audio/wav"), runs
+ * `runCapability` with `cfg.tools.media.audio`, and resolves to the trimmed text of the
+ * first "audio.transcription" output; `text` is `undefined` when nothing usable comes back.
+ * NOTE(review): provider/model fallback presumably happens inside `runCapability` - confirm.
+ * Runtime-exposed entry point for plugins (e.g. marmot) so they need not import internals.
+ */
+export async function transcribeAudioFile(params: {
+  filePath: string; // path of the audio file to transcribe
+  cfg: OpenClawConfig; // full config; its `tools.media.audio` section is forwarded below
+  agentDir?: string; // passed through to runCapability unchanged
+  mime?: string; // MIME type of the file; "audio/wav" is assumed when omitted
+}): Promise<{ text: string | undefined }> {
+  const ctx = {
+    MediaPath: params.filePath,
+    MediaType: params.mime ?? "audio/wav",
+  };
+  const attachments = normalizeMediaAttachments(ctx);
+  if (attachments.length === 0) { // nothing to transcribe: bail out before creating a cache
+    return { text: undefined };
+  }
+  const cache = createMediaAttachmentCache(attachments);
+  const providerRegistry = buildProviderRegistry();
+  try {
+    const result = await runCapability({
+      capability: "audio",
+      cfg: params.cfg,
+      ctx,
+      attachments: cache,
+      media: attachments,
+      agentDir: params.agentDir,
+      providerRegistry,
+      config: params.cfg.tools?.media?.audio,
+    });
+    const output = result.outputs.find((entry) => entry.kind === "audio.transcription");
+    const text = output?.text?.trim();
+    return { text: text || undefined }; // missing or whitespace-only transcript -> undefined
+  } finally {
+    await cache.cleanup(); // runs whether runCapability resolved or threw
+  }
+}
diff --git a/src/plugins/runtime/index.ts b/src/plugins/runtime/index.ts
index cba4e9f6d00..f45e1e9b6b7 100644
--- a/src/plugins/runtime/index.ts
+++ b/src/plugins/runtime/index.ts
@@ -95,6 +95,7 @@ import { buildTemplateMessageFromPayload } from "../../line/template-messages.js
 import { getChildLogger } from "../../logging.js";
 import { normalizeLogLevel } from "../../logging/levels.js";
 import { convertMarkdownTables } from "../../markdown/tables.js";
+import { transcribeAudioFile } from "../../media-understanding/transcribe-audio.js";
 import { isVoiceCompatibleAudio } from "../../media/audio.js";
 import { mediaKindFromMime } from "../../media/constants.js";
 import { fetchRemoteMedia } from "../../media/fetch.js";
@@ -244,6 +245,7 @@ export function createPluginRuntime(): PluginRuntime {
     system: createRuntimeSystem(),
     media: createRuntimeMedia(),
     tts: { textToSpeechTelephony },
+    stt: { transcribeAudioFile },
     tools: createRuntimeTools(),
     channel: createRuntimeChannel(),
     logging: createRuntimeLogging(),
diff --git a/src/plugins/runtime/types.ts b/src/plugins/runtime/types.ts
index 39ada4cd431..cfb06627ddd 100644
--- a/src/plugins/runtime/types.ts
+++ b/src/plugins/runtime/types.ts
@@ -25,6 +25,8 @@ type UpsertChannelPairingRequestForAccount = (
 type FetchRemoteMedia = typeof import("../../media/fetch.js").fetchRemoteMedia;
 type SaveMediaBuffer = typeof import("../../media/store.js").saveMediaBuffer;
 type TextToSpeechTelephony = typeof import("../../tts/tts.js").textToSpeechTelephony;
+type TranscribeAudioFile =
+  typeof import("../../media-understanding/transcribe-audio.js").transcribeAudioFile; // signature of the STT helper
 type BuildMentionRegexes = typeof import("../../auto-reply/reply/mentions.js").buildMentionRegexes;
 type MatchesMentionPatterns =
   typeof import("../../auto-reply/reply/mentions.js").matchesMentionPatterns;
@@ -207,6 +209,9 @@ export type PluginRuntime = {
   tts: {
     textToSpeechTelephony: TextToSpeechTelephony;
   };
+  stt: {
+    transcribeAudioFile: TranscribeAudioFile;
+  };
   tools: {
     createMemoryGetTool: CreateMemoryGetTool;
     createMemorySearchTool: CreateMemorySearchTool;