openclaw/extensions/qqbot/src/stt.ts

/**
* OpenAI-compatible STT used at the plugin layer.
*
* This avoids pushing raw WAV PCM into the framework media-understanding pipeline.
*/
import * as fs from "node:fs";
import path from "node:path";
import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime";
import { asRecord, readString } from "./config-record-shared.js";
import { sanitizeFileName } from "./utils/platform.js";

export interface STTConfig {
  baseUrl: string;
  apiKey: string;
  model: string;
}
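
/**
 * Resolve STT settings from the host config.
 *
 * The plugin-scoped block at `channels.qqbot.stt` wins; otherwise the first
 * entry of `tools.media.audio.models` is used. Either source may inherit
 * `baseUrl`/`apiKey` from the matching entry under `models.providers`.
 * An illustrative config shape (key names are exactly what this function
 * reads; the full schema lives with the host config, not here):
 *
 *   {
 *     channels: { qqbot: { stt: { provider: "openai", model: "whisper-1" } } },
 *     models: { providers: { openai: { baseUrl: "https://api.openai.com/v1", apiKey: "sk-..." } } },
 *   }
 */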
export function resolveSTTConfig(cfg: Record<string, unknown>): STTConfig | null {
  const channels = asRecord(cfg.channels);
  const qqbot = asRecord(channels?.qqbot);
  const channelStt = asRecord(qqbot?.stt);
  const models = asRecord(cfg.models);
  const providers = asRecord(models?.providers);

  // Prefer plugin-specific STT config.
  if (channelStt && channelStt.enabled !== false) {
    const providerId = readString(channelStt, "provider") ?? "openai";
    const providerCfg = asRecord(providers?.[providerId]);
    const baseUrl = readString(channelStt, "baseUrl") ?? readString(providerCfg, "baseUrl");
    const apiKey = readString(channelStt, "apiKey") ?? readString(providerCfg, "apiKey");
    const model = readString(channelStt, "model") ?? "whisper-1";
    if (baseUrl && apiKey) {
      return { baseUrl: baseUrl.replace(/\/+$/, ""), apiKey, model };
    }
  }

  // Fall back to framework-level audio model config.
  const tools = asRecord(cfg.tools);
  const media = asRecord(tools?.media);
  const audio = asRecord(media?.audio);
  const audioModels = audio?.models;
  const audioModelEntry = Array.isArray(audioModels) ? asRecord(audioModels[0]) : undefined;
  if (audioModelEntry) {
    const providerId = readString(audioModelEntry, "provider") ?? "openai";
    const providerCfg = asRecord(providers?.[providerId]);
    const baseUrl = readString(audioModelEntry, "baseUrl") ?? readString(providerCfg, "baseUrl");
    const apiKey = readString(audioModelEntry, "apiKey") ?? readString(providerCfg, "apiKey");
    const model = readString(audioModelEntry, "model") ?? "whisper-1";
    if (baseUrl && apiKey) {
      return { baseUrl: baseUrl.replace(/\/+$/, ""), apiKey, model };
    }
  }

  return null;
}
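
/**
 * Transcribe one audio file via an OpenAI-compatible
 * `POST {baseUrl}/audio/transcriptions` multipart upload.
 *
 * Returns the transcript text, `null` when no usable STT config resolves or
 * the response body has no `text`, and throws on a non-2xx HTTP status.
 */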
export async function transcribeAudio(
  audioPath: string,
  cfg: Record<string, unknown>,
): Promise<string | null> {
  const sttCfg = resolveSTTConfig(cfg);
  if (!sttCfg) {
    return null;
  }

  const fileBuffer = fs.readFileSync(audioPath);
  const fileName = sanitizeFileName(path.basename(audioPath));
  // Map the file extension (case-insensitively) to a MIME type; anything
  // unrecognized is sent as an opaque byte stream.
  const ext = path.extname(fileName).toLowerCase();
  const mime =
    ext === ".wav"
      ? "audio/wav"
      : ext === ".mp3"
        ? "audio/mpeg"
        : ext === ".ogg"
          ? "audio/ogg"
          : "application/octet-stream";

  const form = new FormData();
  form.append("file", new Blob([fileBuffer], { type: mime }), fileName);
  form.append("model", sttCfg.model);

  const resp = await fetch(`${sttCfg.baseUrl}/audio/transcriptions`, {
    method: "POST",
    headers: { Authorization: `Bearer ${sttCfg.apiKey}` },
    body: form,
  });
  if (!resp.ok) {
    const detail = await resp.text().catch(() => "");
    throw new Error(`STT failed (HTTP ${resp.status}): ${detail.slice(0, 300)}`);
  }

  const result = (await resp.json()) as { text?: string };
  return normalizeOptionalString(result.text) ?? null;
}
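
// Illustrative call site (not part of this module). `loadHostConfig` and
// `replyWithText` are placeholders for whatever the plugin host provides:
//
//   const cfg = loadHostConfig();
//   const text = await transcribeAudio("/tmp/voice-message.wav", cfg);
//   if (text) {
//     await replyWithText(text); // hypothetical reply helper
//   }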