mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-17 04:01:05 +00:00
95 lines
3.3 KiB
TypeScript
95 lines
3.3 KiB
TypeScript
/**
|
|
* OpenAI-compatible STT used at the plugin layer.
|
|
*
|
|
* This avoids pushing raw WAV PCM into the framework media-understanding pipeline.
|
|
*/
|
|
|
|
import * as fs from "node:fs";
|
|
import path from "node:path";
|
|
import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime";
|
|
import { asRecord, readString } from "./config-record-shared.js";
|
|
import { sanitizeFileName } from "./utils/platform.js";
|
|
|
|
export interface STTConfig {
|
|
baseUrl: string;
|
|
apiKey: string;
|
|
model: string;
|
|
}
|
|
|
|
export function resolveSTTConfig(cfg: Record<string, unknown>): STTConfig | null {
|
|
const channels = asRecord(cfg.channels);
|
|
const qqbot = asRecord(channels?.qqbot);
|
|
const channelStt = asRecord(qqbot?.stt);
|
|
const models = asRecord(cfg.models);
|
|
const providers = asRecord(models?.providers);
|
|
|
|
// Prefer plugin-specific STT config.
|
|
if (channelStt && channelStt.enabled !== false) {
|
|
const providerId = readString(channelStt, "provider") ?? "openai";
|
|
const providerCfg = asRecord(providers?.[providerId]);
|
|
const baseUrl = readString(channelStt, "baseUrl") ?? readString(providerCfg, "baseUrl");
|
|
const apiKey = readString(channelStt, "apiKey") ?? readString(providerCfg, "apiKey");
|
|
const model = readString(channelStt, "model") ?? "whisper-1";
|
|
if (baseUrl && apiKey) {
|
|
return { baseUrl: baseUrl.replace(/\/+$/, ""), apiKey, model };
|
|
}
|
|
}
|
|
|
|
// Fall back to framework-level audio model config.
|
|
const tools = asRecord(cfg.tools);
|
|
const media = asRecord(tools?.media);
|
|
const audio = asRecord(media?.audio);
|
|
const audioModels = audio?.models;
|
|
const audioModelEntry = Array.isArray(audioModels) ? asRecord(audioModels[0]) : undefined;
|
|
if (audioModelEntry) {
|
|
const providerId = readString(audioModelEntry, "provider") ?? "openai";
|
|
const providerCfg = asRecord(providers?.[providerId]);
|
|
const baseUrl = readString(audioModelEntry, "baseUrl") ?? readString(providerCfg, "baseUrl");
|
|
const apiKey = readString(audioModelEntry, "apiKey") ?? readString(providerCfg, "apiKey");
|
|
const model = readString(audioModelEntry, "model") ?? "whisper-1";
|
|
if (baseUrl && apiKey) {
|
|
return { baseUrl: baseUrl.replace(/\/+$/, ""), apiKey, model };
|
|
}
|
|
}
|
|
|
|
return null;
|
|
}
|
|
|
|
export async function transcribeAudio(
|
|
audioPath: string,
|
|
cfg: Record<string, unknown>,
|
|
): Promise<string | null> {
|
|
const sttCfg = resolveSTTConfig(cfg);
|
|
if (!sttCfg) {
|
|
return null;
|
|
}
|
|
|
|
const fileBuffer = fs.readFileSync(audioPath);
|
|
const fileName = sanitizeFileName(path.basename(audioPath));
|
|
const mime = fileName.endsWith(".wav")
|
|
? "audio/wav"
|
|
: fileName.endsWith(".mp3")
|
|
? "audio/mpeg"
|
|
: fileName.endsWith(".ogg")
|
|
? "audio/ogg"
|
|
: "application/octet-stream";
|
|
|
|
const form = new FormData();
|
|
form.append("file", new Blob([fileBuffer], { type: mime }), fileName);
|
|
form.append("model", sttCfg.model);
|
|
|
|
const resp = await fetch(`${sttCfg.baseUrl}/audio/transcriptions`, {
|
|
method: "POST",
|
|
headers: { Authorization: `Bearer ${sttCfg.apiKey}` },
|
|
body: form,
|
|
});
|
|
|
|
if (!resp.ok) {
|
|
const detail = await resp.text().catch(() => "");
|
|
throw new Error(`STT failed (HTTP ${resp.status}): ${detail.slice(0, 300)}`);
|
|
}
|
|
|
|
const result = (await resp.json()) as { text?: string };
|
|
return normalizeOptionalString(result.text) ?? null;
|
|
}
|