Add runtime.stt.transcribeAudioFile for plugin STT access

Expose audio transcription through the PluginRuntime so external plugins (e.g. marmot) can use openclaw's media-understanding provider framework without importing unexported internal modules. The new transcribeAudioFile() wraps runCapability({capability: "audio"}) and reads provider/model/apiKey from tools.media.audio in the config, matching the pattern used by the Discord VC implementation.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-06 09:40:43 +00:00 · 2026-02-20 21:52:08 -06:00
parent f7b0378ccb
commit faa4ffec03
4 changed files with 61 additions and 0 deletions
--- a/extensions/bluebubbles/src/monitor.test.ts
+++ b/extensions/bluebubbles/src/monitor.test.ts
@@ -120,6 +120,9 @@ function createMockRuntime(): PluginRuntime {
    tts: {
      textToSpeechTelephony: vi.fn() as unknown as PluginRuntime["tts"]["textToSpeechTelephony"],
    },
+    stt: {
+      transcribeAudioFile: vi.fn() as unknown as PluginRuntime["stt"]["transcribeAudioFile"],
+    },
    tools: {
      createMemoryGetTool: vi.fn() as unknown as PluginRuntime["tools"]["createMemoryGetTool"],
      createMemorySearchTool:
--- a/src/media-understanding/transcribe-audio.ts
+++ b/src/media-understanding/transcribe-audio.ts
@@ -0,0 +1,51 @@
+import type { OpenClawConfig } from "../config/config.js";
+import {
+  buildProviderRegistry,
+  createMediaAttachmentCache,
+  normalizeMediaAttachments,
+  runCapability,
+} from "./runner.js";
+
+/**
+ * Transcribe one audio file by running the "audio" media-understanding capability.
+ *
+ * Passes `cfg.tools?.media?.audio` to `runCapability` as the capability config; per the original
+ * note, configured models are tried in turn until one succeeds (runner logic is not shown here).
+ * `mime` defaults to "audio/wav". Resolves to `{ text: undefined }` when normalization yields no
+ * attachment or no non-empty "audio.transcription" output; the attachment cache is always cleaned.
+ * Runtime-exposed entry point for external plugins (e.g. marmot) that must not import internals.
+ */
+export async function transcribeAudioFile(params: {
+  filePath: string;
+  cfg: OpenClawConfig;
+  agentDir?: string;
+  mime?: string;
+}): Promise<{ text: string | undefined }> {
+  const ctx = {
+    MediaPath: params.filePath,
+    MediaType: params.mime ?? "audio/wav", // fallback MIME when the caller supplies none
+  };
+  const attachments = normalizeMediaAttachments(ctx);
+  if (attachments.length === 0) { // nothing to transcribe
+    return { text: undefined };
+  }
+  const cache = createMediaAttachmentCache(attachments);
+  const providerRegistry = buildProviderRegistry();
+  try {
+    const result = await runCapability({
+      capability: "audio",
+      cfg: params.cfg,
+      ctx,
+      attachments: cache,
+      media: attachments,
+      agentDir: params.agentDir,
+      providerRegistry,
+      config: params.cfg.tools?.media?.audio, // may be undefined if the section is absent
+    });
+    const output = result.outputs.find((entry) => entry.kind === "audio.transcription");
+    const text = output?.text?.trim();
+    return { text: text || undefined }; // a blank transcript is reported as undefined
+  } finally {
+    await cache.cleanup(); // runs whether runCapability resolved or threw
+  }
+}
--- a/src/plugins/runtime/index.ts
+++ b/src/plugins/runtime/index.ts
@@ -95,6 +95,7 @@ import { buildTemplateMessageFromPayload } from "../../line/template-messages.js
 import { getChildLogger } from "../../logging.js";
 import { normalizeLogLevel } from "../../logging/levels.js";
 import { convertMarkdownTables } from "../../markdown/tables.js";
+import { transcribeAudioFile } from "../../media-understanding/transcribe-audio.js";
 import { isVoiceCompatibleAudio } from "../../media/audio.js";
 import { mediaKindFromMime } from "../../media/constants.js";
 import { fetchRemoteMedia } from "../../media/fetch.js";
@@ -244,6 +245,7 @@ export function createPluginRuntime(): PluginRuntime {
    system: createRuntimeSystem(),
    media: createRuntimeMedia(),
    tts: { textToSpeechTelephony },
+    stt: { transcribeAudioFile },
    tools: createRuntimeTools(),
    channel: createRuntimeChannel(),
    logging: createRuntimeLogging(),
--- a/src/plugins/runtime/types.ts
+++ b/src/plugins/runtime/types.ts
@@ -25,6 +25,8 @@ type UpsertChannelPairingRequestForAccount = (
 type FetchRemoteMedia = typeof import("../../media/fetch.js").fetchRemoteMedia;
 type SaveMediaBuffer = typeof import("../../media/store.js").saveMediaBuffer;
 type TextToSpeechTelephony = typeof import("../../tts/tts.js").textToSpeechTelephony;
+type TranscribeAudioFile =
+  typeof import("../../media-understanding/transcribe-audio.js").transcribeAudioFile;
 type BuildMentionRegexes = typeof import("../../auto-reply/reply/mentions.js").buildMentionRegexes;
 type MatchesMentionPatterns =
  typeof import("../../auto-reply/reply/mentions.js").matchesMentionPatterns;
@@ -207,6 +209,9 @@ export type PluginRuntime = {
  tts: {
    textToSpeechTelephony: TextToSpeechTelephony;
  };
+  stt: {
+    transcribeAudioFile: TranscribeAudioFile;
+  };
  tools: {
    createMemoryGetTool: CreateMemoryGetTool;
    createMemorySearchTool: CreateMemorySearchTool;