Add runtime.stt.transcribeAudioFile for plugin STT access

Expose audio transcription (speech-to-text, STT) through the PluginRuntime
so external plugins (e.g. marmot) can use OpenClaw's media-understanding
provider framework without importing unexported internal modules.

The new transcribeAudioFile() wraps runCapability({capability: "audio"})
and reads provider/model/apiKey from tools.media.audio in the config,
matching the pattern used by the Discord VC implementation.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
benthecarman
2026-02-20 21:52:08 -06:00
committed by Peter Steinberger
parent f7b0378ccb
commit faa4ffec03
4 changed files with 61 additions and 0 deletions

View File

@@ -120,6 +120,9 @@ function createMockRuntime(): PluginRuntime {
tts: {
textToSpeechTelephony: vi.fn() as unknown as PluginRuntime["tts"]["textToSpeechTelephony"],
},
stt: {
transcribeAudioFile: vi.fn() as unknown as PluginRuntime["stt"]["transcribeAudioFile"],
},
tools: {
createMemoryGetTool: vi.fn() as unknown as PluginRuntime["tools"]["createMemoryGetTool"],
createMemorySearchTool:

View File

@@ -0,0 +1,51 @@
import type { OpenClawConfig } from "../config/config.js";
import {
buildProviderRegistry,
createMediaAttachmentCache,
normalizeMediaAttachments,
runCapability,
} from "./runner.js";
/**
 * Transcribe an audio file using the configured media-understanding provider.
 *
 * Runs the `"audio"` capability through `runCapability`, handing it
 * `tools.media.audio` from the openclaw config as the capability config.
 * Provider/model/apiKey selection (and any fallback between configured
 * models) is delegated to `runCapability`; this function does not pick a
 * provider itself.
 *
 * This is the runtime-exposed entry point for external plugins (e.g. marmot)
 * that need STT without importing internal media-understanding modules directly.
 *
 * @param params.filePath - Path of the audio file; passed on as `MediaPath`.
 * @param params.cfg - Openclaw config; `cfg.tools?.media?.audio` is used as
 *   the audio capability config (may be undefined).
 * @param params.agentDir - Optional agent directory, forwarded unchanged to
 *   `runCapability`.
 * @param params.mime - MIME type of the file; defaults to `"audio/wav"` when
 *   omitted.
 * @returns `{ text }` holding the trimmed transcription. `text` is
 *   `undefined` when no attachment could be derived from the input, when no
 *   `"audio.transcription"` output was produced, or when the transcription is
 *   empty after trimming.
 */
export async function transcribeAudioFile(params: {
  filePath: string;
  cfg: OpenClawConfig;
  agentDir?: string;
  mime?: string;
}): Promise<{ text: string | undefined }> {
  const ctx = {
    MediaPath: params.filePath,
    MediaType: params.mime ?? "audio/wav",
  };

  const attachments = normalizeMediaAttachments(ctx);
  if (attachments.length === 0) {
    // Nothing to transcribe; no cache has been created, so nothing to clean up.
    return { text: undefined };
  }

  const cache = createMediaAttachmentCache(attachments);
  try {
    // Built inside the try so that a failure here still reaches the
    // `finally` below and the attachment cache is cleaned up.
    const providerRegistry = buildProviderRegistry();
    const result = await runCapability({
      capability: "audio",
      cfg: params.cfg,
      ctx,
      attachments: cache,
      media: attachments,
      agentDir: params.agentDir,
      providerRegistry,
      config: params.cfg.tools?.media?.audio,
    });

    const output = result.outputs.find((entry) => entry.kind === "audio.transcription");
    const text = output?.text?.trim();
    // `||` (not `??`) is intentional: an empty string after trimming is
    // reported as "no transcription".
    return { text: text || undefined };
  } finally {
    await cache.cleanup();
  }
}

View File

@@ -95,6 +95,7 @@ import { buildTemplateMessageFromPayload } from "../../line/template-messages.js
import { getChildLogger } from "../../logging.js";
import { normalizeLogLevel } from "../../logging/levels.js";
import { convertMarkdownTables } from "../../markdown/tables.js";
import { transcribeAudioFile } from "../../media-understanding/transcribe-audio.js";
import { isVoiceCompatibleAudio } from "../../media/audio.js";
import { mediaKindFromMime } from "../../media/constants.js";
import { fetchRemoteMedia } from "../../media/fetch.js";
@@ -244,6 +245,7 @@ export function createPluginRuntime(): PluginRuntime {
system: createRuntimeSystem(),
media: createRuntimeMedia(),
tts: { textToSpeechTelephony },
stt: { transcribeAudioFile },
tools: createRuntimeTools(),
channel: createRuntimeChannel(),
logging: createRuntimeLogging(),

View File

@@ -25,6 +25,8 @@ type UpsertChannelPairingRequestForAccount = (
// Function-signature aliases used by the PluginRuntime type. Each one is
// derived with `typeof import(...)`, which is a type-level reference to the
// implementing module's export, so the alias follows the implementation's
// signature without a separate hand-written declaration.
type FetchRemoteMedia = typeof import("../../media/fetch.js").fetchRemoteMedia;
type SaveMediaBuffer = typeof import("../../media/store.js").saveMediaBuffer;
type TextToSpeechTelephony = typeof import("../../tts/tts.js").textToSpeechTelephony;
// Signature of the audio transcription entry point exposed as
// `runtime.stt.transcribeAudioFile`.
type TranscribeAudioFile =
typeof import("../../media-understanding/transcribe-audio.js").transcribeAudioFile;
type BuildMentionRegexes = typeof import("../../auto-reply/reply/mentions.js").buildMentionRegexes;
type MatchesMentionPatterns =
typeof import("../../auto-reply/reply/mentions.js").matchesMentionPatterns;
@@ -207,6 +209,9 @@ export type PluginRuntime = {
tts: {
textToSpeechTelephony: TextToSpeechTelephony;
};
stt: {
transcribeAudioFile: TranscribeAudioFile;
};
tools: {
createMemoryGetTool: CreateMemoryGetTool;
createMemorySearchTool: CreateMemorySearchTool;