diff --git a/extensions/bluebubbles/src/monitor.test.ts b/extensions/bluebubbles/src/monitor.test.ts
index 483b6cd9ed3..68aa4351e10 100644
--- a/extensions/bluebubbles/src/monitor.test.ts
+++ b/extensions/bluebubbles/src/monitor.test.ts
@@ -120,6 +120,9 @@ function createMockRuntime(): PluginRuntime {
     tts: {
       textToSpeechTelephony: vi.fn() as unknown as PluginRuntime["tts"]["textToSpeechTelephony"],
     },
+    stt: {
+      transcribeAudioFile: vi.fn() as unknown as PluginRuntime["stt"]["transcribeAudioFile"],
+    },
     tools: {
       createMemoryGetTool: vi.fn() as unknown as PluginRuntime["tools"]["createMemoryGetTool"],
       createMemorySearchTool:
diff --git a/src/media-understanding/transcribe-audio.ts b/src/media-understanding/transcribe-audio.ts
new file mode 100644
index 00000000000..3573a0a4333
--- /dev/null
+++ b/src/media-understanding/transcribe-audio.ts
@@ -0,0 +1,61 @@
+import type { OpenClawConfig } from "../config/config.js";
+import {
+  buildProviderRegistry,
+  createMediaAttachmentCache,
+  normalizeMediaAttachments,
+  runCapability,
+} from "./runner.js";
+
+/**
+ * Transcribe an audio file using the configured media-understanding provider.
+ *
+ * Reads provider/model/apiKey from `tools.media.audio` in the openclaw config,
+ * falling back through configured models until one succeeds.
+ *
+ * This is the runtime-exposed entry point for external plugins (e.g. marmot)
+ * that need STT without importing internal media-understanding modules directly.
+ *
+ * @param params.filePath - Path of the audio file to transcribe.
+ * @param params.cfg - Config whose `tools.media.audio` section is passed to the runner.
+ * @param params.agentDir - Optional agent directory, forwarded to the runner unchanged.
+ * @param params.mime - MIME type of the file; defaults to `audio/wav` when omitted.
+ * @returns The trimmed transcript, or `text: undefined` when no attachment could be
+ *   derived from the input or no non-empty `audio.transcription` output was produced.
+ */
+export async function transcribeAudioFile(params: {
+  filePath: string;
+  cfg: OpenClawConfig;
+  agentDir?: string;
+  mime?: string;
+}): Promise<{ text: string | undefined }> {
+  const ctx = {
+    MediaPath: params.filePath,
+    MediaType: params.mime ?? "audio/wav",
+  };
+  const attachments = normalizeMediaAttachments(ctx);
+  if (attachments.length === 0) {
+    return { text: undefined };
+  }
+  const cache = createMediaAttachmentCache(attachments);
+  try {
+    // Everything after the cache exists runs inside `try`, so the cache is cleaned up
+    // even when building the provider registry throws.
+    const providerRegistry = buildProviderRegistry();
+    const result = await runCapability({
+      capability: "audio",
+      cfg: params.cfg,
+      ctx,
+      attachments: cache,
+      media: attachments,
+      agentDir: params.agentDir,
+      providerRegistry,
+      config: params.cfg.tools?.media?.audio,
+    });
+    const output = result.outputs.find((entry) => entry.kind === "audio.transcription");
+    const text = output?.text?.trim();
+    // An empty or whitespace-only transcript is reported as `undefined`.
+    return { text: text || undefined };
+  } finally {
+    await cache.cleanup();
+  }
+}
diff --git a/src/plugins/runtime/index.ts b/src/plugins/runtime/index.ts
index cba4e9f6d00..f45e1e9b6b7 100644
--- a/src/plugins/runtime/index.ts
+++ b/src/plugins/runtime/index.ts
@@ -95,6 +95,7 @@ import { buildTemplateMessageFromPayload } from "../../line/template-messages.js
 import { getChildLogger } from "../../logging.js";
 import { normalizeLogLevel } from "../../logging/levels.js";
 import { convertMarkdownTables } from "../../markdown/tables.js";
+import { transcribeAudioFile } from "../../media-understanding/transcribe-audio.js";
 import { isVoiceCompatibleAudio } from "../../media/audio.js";
 import { mediaKindFromMime } from "../../media/constants.js";
 import { fetchRemoteMedia } from "../../media/fetch.js";
@@ -244,6 +245,7 @@ export function createPluginRuntime(): PluginRuntime {
     system: createRuntimeSystem(),
     media: createRuntimeMedia(),
     tts: { textToSpeechTelephony },
+    stt: { transcribeAudioFile },
     tools: createRuntimeTools(),
     channel: createRuntimeChannel(),
     logging: createRuntimeLogging(),
diff --git a/src/plugins/runtime/types.ts b/src/plugins/runtime/types.ts
index 39ada4cd431..cfb06627ddd 100644
--- a/src/plugins/runtime/types.ts
+++ b/src/plugins/runtime/types.ts
@@ -25,6 +25,8 @@ type UpsertChannelPairingRequestForAccount = (
 type FetchRemoteMedia = typeof import("../../media/fetch.js").fetchRemoteMedia;
 type SaveMediaBuffer = typeof import("../../media/store.js").saveMediaBuffer;
 type TextToSpeechTelephony = typeof import("../../tts/tts.js").textToSpeechTelephony;
+type TranscribeAudioFile =
+  typeof import("../../media-understanding/transcribe-audio.js").transcribeAudioFile;
 type BuildMentionRegexes = typeof import("../../auto-reply/reply/mentions.js").buildMentionRegexes;
 type MatchesMentionPatterns =
   typeof import("../../auto-reply/reply/mentions.js").matchesMentionPatterns;
@@ -207,6 +209,9 @@ export type PluginRuntime = {
   tts: {
     textToSpeechTelephony: TextToSpeechTelephony;
   };
+  stt: {
+    transcribeAudioFile: TranscribeAudioFile;
+  };
   tools: {
     createMemoryGetTool: CreateMemoryGetTool;
     createMemorySearchTool: CreateMemorySearchTool;