// Mirror of https://github.com/openclaw/openclaw.git
// Synced 2026-04-20 21:51:28 +00:00
// 372 lines · 12 KiB · TypeScript
import { transcribeAudio, resolveSTTConfig } from "./stt.js";
|
|
import { convertSilkToWav, isVoiceAttachment, formatDuration } from "./utils/audio-convert.js";
|
|
import { downloadFile } from "./utils/file-utils.js";
|
|
import { getQQBotMediaDir } from "./utils/platform.js";
|
|
|
|
/**
 * Attachment payload as it arrives on an inbound message, before any
 * download, conversion, or transcription has happened.
 */
export interface RawAttachment {
  /** Content type string; values starting with `image/` are treated as images. */
  content_type: string;
  /** Source URL. A protocol-relative value (`//host/...`) is upgraded to `https:` before download. */
  url: string;
  /** Optional file name; passed to the downloader and echoed in log lines. */
  filename?: string;
  /** Optional URL of a WAV rendition of a voice attachment; tried before `url` for voice downloads. */
  voice_wav_url?: string;
  /** Optional transcript text delivered with the attachment; used when STT is unavailable or fails. */
  asr_refer_text?: string;
}
|
|
|
|
/**
 * Where a voice transcript came from:
 * - `"stt"`: text returned by the speech-to-text call.
 * - `"asr"`: the attachment's own `asr_refer_text`.
 * - `"fallback"`: a bracketed placeholder message used when neither is available.
 */
export type TranscriptSource =
  | "stt"
  | "asr"
  | "fallback";
|
|
|
|
/**
 * Normalized attachment output consumed by the gateway.
 *
 * The voice-related arrays are filled independently of each other, so they are
 * not guaranteed to have equal lengths or aligned indices.
 */
export interface ProcessedAttachments {
  /**
   * Newline-prefixed list of `[Attachment: ...]` lines for attachments that are
   * neither images nor voice; the empty string when there are none.
   */
  attachmentInfo: string;
  /** Local path of each downloaded image, or its remote URL when the download failed. */
  imageUrls: string[];
  /** Content type of each entry in `imageUrls`, in the same order. */
  imageMediaTypes: string[];
  /** Local paths of voice attachments that were downloaded successfully. */
  voiceAttachmentPaths: string[];
  /** Source URLs of voice attachments (the WAV URL when one was supplied, else the original URL). */
  voiceAttachmentUrls: string[];
  /** Trimmed, non-empty `asr_refer_text` values taken from voice attachments. */
  voiceAsrReferTexts: string[];
  /** Transcript (or placeholder text) for each voice attachment that produced one. */
  voiceTranscripts: string[];
  /** Origin of each entry in `voiceTranscripts`, in the same order. */
  voiceTranscriptSources: TranscriptSource[];
  /** Local path recorded per processed attachment; `null` where the download failed. */
  attachmentLocalPaths: Array<string | null>;
}
|
|
|
|
/** Per-call context handed to the attachment processor. */
interface ProcessContext {
  /** Account identifier; used to build the `[qqbot:<accountId>]` log prefix. */
  accountId: string;
  /** Opaque configuration object; forwarded to the STT helpers, which interpret it. */
  cfg: unknown;
  /** Optional logger. When omitted, nothing is logged. */
  log?: {
    info: (msg: string) => void;
    error: (msg: string) => void;
    debug?: (msg: string) => void;
  };
}
|
|
|
|
/**
 * Template for the "no attachments" result: an empty string and empty arrays
 * for every field.
 *
 * This is a single module-level object, so its arrays are shared by everything
 * that holds a reference to it. Treat it as read-only; do not push into its
 * arrays.
 */
const EMPTY_RESULT: ProcessedAttachments = {
  attachmentInfo: "",
  imageUrls: [],
  imageMediaTypes: [],
  voiceAttachmentPaths: [],
  voiceAttachmentUrls: [],
  voiceAsrReferTexts: [],
  voiceTranscripts: [],
  voiceTranscriptSources: [],
  attachmentLocalPaths: [],
};
|
|
|
|
/**
 * Upgrade a protocol-relative URL (`//host/path`) to `https:`; any other value
 * is returned unchanged.
 */
function resolveProtocolRelativeUrl(url: string): string {
  // Optional chaining is kept on purpose: the payload is external data, and a
  // missing URL must pass through untouched instead of throwing here.
  return url?.startsWith("//") ? `https:${url}` : url;
}

/**
 * Build a new empty result whose arrays are independent copies, so a caller
 * that mutates the returned object cannot affect the shared `EMPTY_RESULT`.
 */
function freshEmptyResult(): ProcessedAttachments {
  return {
    ...EMPTY_RESULT,
    imageUrls: [...EMPTY_RESULT.imageUrls],
    imageMediaTypes: [...EMPTY_RESULT.imageMediaTypes],
    voiceAttachmentPaths: [...EMPTY_RESULT.voiceAttachmentPaths],
    voiceAttachmentUrls: [...EMPTY_RESULT.voiceAttachmentUrls],
    voiceAsrReferTexts: [...EMPTY_RESULT.voiceAsrReferTexts],
    voiceTranscripts: [...EMPTY_RESULT.voiceTranscripts],
    voiceTranscriptSources: [...EMPTY_RESULT.voiceTranscriptSources],
    attachmentLocalPaths: [...EMPTY_RESULT.attachmentLocalPaths],
  };
}

/**
 * Download, convert, transcribe, and classify inbound attachments.
 *
 * Works in three phases:
 * 1. download every attachment in parallel (voice attachments try
 *    `voice_wav_url` first and fall back to `url`);
 * 2. classify each download as image / voice / other, handing voice files to
 *    `processVoiceAttachment`;
 * 3. collect the per-attachment results, in the original attachment order,
 *    into the flat arrays of {@link ProcessedAttachments}.
 *
 * @param attachments Raw attachments from the inbound message; may be
 *   `undefined` or empty.
 * @param ctx Account id, opaque config, and optional logger.
 * @returns The aggregated result. When there are no attachments, a fresh
 *   empty result is returned (never the shared `EMPTY_RESULT` object itself).
 */
export async function processAttachments(
  attachments: RawAttachment[] | undefined,
  ctx: ProcessContext,
): Promise<ProcessedAttachments> {
  if (!attachments?.length) {
    return freshEmptyResult();
  }

  const { accountId, cfg, log } = ctx;
  const downloadDir = getQQBotMediaDir("downloads");
  const prefix = `[qqbot:${accountId}]`;

  // Phase 1: download all attachments in parallel.
  const downloadResults = await Promise.all(
    attachments.map(async (att) => {
      const attUrl = resolveProtocolRelativeUrl(att.url);
      const isVoice = isVoiceAttachment(att);
      const wavUrl =
        isVoice && att.voice_wav_url ? resolveProtocolRelativeUrl(att.voice_wav_url) : "";

      let localPath: string | null = null;
      // Set only when a ready-made WAV was downloaded, which lets the voice
      // step skip the SILK→WAV conversion.
      let audioPath: string | null = null;

      if (isVoice && wavUrl) {
        const wavLocalPath = await downloadFile(wavUrl, downloadDir);
        if (wavLocalPath) {
          localPath = wavLocalPath;
          audioPath = wavLocalPath;
          log?.info(
            `${prefix} Voice attachment: ${att.filename}, downloaded WAV directly (skip SILK→WAV)`,
          );
        } else {
          log?.error(`${prefix} Failed to download voice_wav_url, falling back to original URL`);
        }
      }

      if (!localPath) {
        localPath = await downloadFile(attUrl, downloadDir, att.filename);
      }

      return { att, attUrl, wavUrl, isVoice, localPath, audioPath };
    }),
  );

  // Phase 2: convert/transcribe voice attachments and classify everything else.
  const processResults = await Promise.all(
    downloadResults.map(async ({ att, attUrl, wavUrl, isVoice, localPath, audioPath }) => {
      const asrReferText = typeof att.asr_refer_text === "string" ? att.asr_refer_text.trim() : "";
      const voiceSourceUrl = wavUrl || attUrl;
      const meta = {
        voiceUrl: isVoice && voiceSourceUrl ? voiceSourceUrl : undefined,
        asrReferText: isVoice && asrReferText ? asrReferText : undefined,
      };
      const isImage = att.content_type?.startsWith("image/");

      if (localPath) {
        log?.info(`${prefix} Downloaded attachment to: ${localPath}`);
        if (isImage) {
          return { localPath, type: "image" as const, contentType: att.content_type, meta };
        }
        if (isVoice) {
          return processVoiceAttachment(
            localPath,
            audioPath,
            att,
            asrReferText,
            cfg,
            downloadDir,
            log,
            prefix,
          );
        }
        return { localPath, type: "other" as const, filename: att.filename, meta };
      }

      log?.error(`${prefix} Failed to download: ${attUrl}`);
      if (isImage) {
        // Images stay usable without a local copy: the remote URL is passed on.
        return {
          localPath: null,
          type: "image-fallback" as const,
          attUrl,
          contentType: att.content_type,
          meta,
        };
      }
      if (isVoice && asrReferText) {
        log?.info(`${prefix} Voice attachment download failed, using asr_refer_text fallback`);
        return {
          localPath: null,
          type: "voice-fallback" as const,
          transcript: asrReferText,
          meta,
        };
      }
      return {
        localPath: null,
        type: "other-fallback" as const,
        filename: att.filename ?? att.content_type,
        meta,
      };
    }),
  );

  // Phase 3: collect results in the original attachment order.
  const imageUrls: string[] = [];
  const imageMediaTypes: string[] = [];
  const voiceAttachmentPaths: string[] = [];
  const voiceAttachmentUrls: string[] = [];
  const voiceAsrReferTexts: string[] = [];
  const voiceTranscripts: string[] = [];
  const voiceTranscriptSources: TranscriptSource[] = [];
  const attachmentLocalPaths: Array<string | null> = [];
  const otherAttachments: string[] = [];

  for (const result of processResults) {
    if (result.meta.voiceUrl) {
      voiceAttachmentUrls.push(result.meta.voiceUrl);
    }
    if (result.meta.asrReferText) {
      voiceAsrReferTexts.push(result.meta.asrReferText);
    }

    switch (result.type) {
      case "image":
        imageUrls.push(result.localPath);
        imageMediaTypes.push(result.contentType);
        attachmentLocalPaths.push(result.localPath);
        break;
      case "voice":
        voiceAttachmentPaths.push(result.localPath);
        voiceTranscripts.push(result.transcript);
        voiceTranscriptSources.push(result.transcriptSource);
        attachmentLocalPaths.push(result.localPath);
        break;
      case "other":
        otherAttachments.push(`[Attachment: ${result.localPath}]`);
        attachmentLocalPaths.push(result.localPath);
        break;
      case "image-fallback":
        imageUrls.push(result.attUrl);
        imageMediaTypes.push(result.contentType);
        attachmentLocalPaths.push(null);
        break;
      case "voice-fallback":
        voiceTranscripts.push(result.transcript);
        voiceTranscriptSources.push("asr");
        attachmentLocalPaths.push(null);
        break;
      case "other-fallback":
        otherAttachments.push(`[Attachment: ${result.filename}] (download failed)`);
        attachmentLocalPaths.push(null);
        break;
    }
  }

  const attachmentInfo = otherAttachments.length > 0 ? "\n" + otherAttachments.join("\n") : "";

  return {
    attachmentInfo,
    imageUrls,
    imageMediaTypes,
    voiceAttachmentPaths,
    voiceAttachmentUrls,
    voiceAsrReferTexts,
    voiceTranscripts,
    voiceTranscriptSources,
    attachmentLocalPaths,
  };
}
|
|
|
|
/**
 * Format voice transcripts into user-visible text.
 *
 * @param transcripts Transcripts in display order.
 * @returns `""` for no transcripts, `[Voice message] <text>` for exactly one,
 *   otherwise one `[Voice <n>] <text>` line per transcript (1-based), joined
 *   with newlines.
 */
export function formatVoiceText(transcripts: string[]): string {
  switch (transcripts.length) {
    case 0:
      return "";
    case 1:
      return `[Voice message] ${transcripts[0]}`;
    default:
      return transcripts.map((text, index) => `[Voice ${index + 1}] ${text}`).join("\n");
  }
}
|
|
|
|
// Internal helpers.
|
|
|
|
/**
 * Result of processing one downloaded voice attachment.
 *
 * `type` is always the literal `"voice"`, so the value can take part in a
 * union discriminated on `type`.
 */
type VoiceResult = {
  /** Local path of the downloaded voice file. */
  localPath: string;
  type: "voice";
  /** Transcript text, or a bracketed placeholder when no transcript could be produced. */
  transcript: string;
  /** Origin of `transcript`. */
  transcriptSource: TranscriptSource;
  /** Source URL and reference text carried along for the caller to collect. */
  meta: { voiceUrl?: string; asrReferText?: string };
};
|
|
|
|
/**
 * Render a caught value for a log line without risking a second throw.
 *
 * `JSON.stringify` throws on circular structures and BigInt values; inside a
 * `catch` handler that would replace the handled error with a new one.
 */
function describeCaughtError(err: unknown): string {
  if (err instanceof Error) {
    return err.message;
  }
  try {
    // JSON.stringify yields undefined for inputs such as `undefined` or functions.
    return JSON.stringify(err) ?? String(err);
  } catch {
    // Object.prototype.toString never throws, unlike String() on exotic objects.
    return Object.prototype.toString.call(err);
  }
}

/**
 * Turn one downloaded voice attachment into a transcript.
 *
 * Order of attempts:
 * 1. if STT is not configured, use `asrReferText` or a placeholder;
 * 2. if no ready-made audio file was supplied, convert `localPath` from SILK
 *    to WAV (when conversion yields nothing, the original file is used as-is);
 * 3. run speech-to-text on the prepared file.
 *
 * Every failure path resolves with `asrReferText` when it is non-empty and
 * with a bracketed placeholder otherwise; conversion and STT errors are
 * logged, not propagated.
 *
 * @param localPath Local path of the downloaded voice file.
 * @param audioPath Path of an already STT-ready audio file, or `null` when
 *   conversion is still required.
 * @param att The raw attachment (used for URLs and the file name in logs).
 * @param asrReferText Trimmed reference transcript; `""` when absent.
 * @param cfg Opaque configuration forwarded to the STT helpers.
 * @param downloadDir Directory passed to the SILK→WAV converter.
 * @param log Optional logger.
 * @param prefix Log-line prefix.
 * @returns A `VoiceResult` describing the transcript and where it came from.
 */
async function processVoiceAttachment(
  localPath: string,
  audioPath: string | null,
  att: RawAttachment,
  asrReferText: string,
  cfg: unknown,
  downloadDir: string,
  log: ProcessContext["log"],
  prefix: string,
): Promise<VoiceResult> {
  // Protocol-relative URLs (`//host/...`) are upgraded to https.
  const rawWavUrl = att.voice_wav_url;
  const wavUrl = rawWavUrl ? (rawWavUrl.startsWith("//") ? `https:${rawWavUrl}` : rawWavUrl) : "";
  const attUrl = att.url?.startsWith("//") ? `https:${att.url}` : att.url;
  const meta = {
    voiceUrl: wavUrl || attUrl || undefined,
    asrReferText: asrReferText || undefined,
  };

  const voiceResult = (transcript: string, transcriptSource: TranscriptSource): VoiceResult => ({
    localPath,
    type: "voice",
    transcript,
    transcriptSource,
    meta,
  });
  // Shared by every failure path: prefer the reference text over a placeholder.
  const asrOrPlaceholder = (placeholder: string): VoiceResult =>
    asrReferText ? voiceResult(asrReferText, "asr") : voiceResult(placeholder, "fallback");

  const sttCfg = resolveSTTConfig(cfg as Record<string, unknown>);
  if (!sttCfg) {
    log?.info(
      asrReferText
        ? `${prefix} Voice attachment: ${att.filename} (STT not configured, using asr_refer_text fallback)`
        : `${prefix} Voice attachment: ${att.filename} (STT not configured, skipping transcription)`,
    );
    return asrOrPlaceholder(
      "[Voice message - transcription unavailable because STT is not configured]",
    );
  }

  // Convert SILK input to WAV before STT when necessary.
  let sttInputPath = audioPath;
  if (!sttInputPath) {
    log?.info(`${prefix} Voice attachment: ${att.filename}, converting SILK→WAV...`);
    try {
      const converted = await convertSilkToWav(localPath, downloadDir);
      if (converted) {
        sttInputPath = converted.wavPath;
        log?.info(
          `${prefix} Voice converted: ${converted.wavPath} (${formatDuration(converted.duration)})`,
        );
      } else {
        // No conversion result: hand the original file to STT unchanged.
        sttInputPath = localPath;
      }
    } catch (convertErr) {
      log?.error(`${prefix} Voice conversion failed: ${describeCaughtError(convertErr)}`);
      return asrOrPlaceholder("[Voice message - format conversion failed]");
    }
  }

  // Run speech-to-text on the prepared audio file.
  try {
    const transcript = await transcribeAudio(sttInputPath, cfg as Record<string, unknown>);
    if (transcript) {
      // Only mark the preview as truncated when it actually was.
      const preview = transcript.length > 100 ? `${transcript.slice(0, 100)}...` : transcript;
      log?.info(`${prefix} STT transcript: ${preview}`);
      return voiceResult(transcript, "stt");
    }
    log?.info(
      asrReferText
        ? `${prefix} STT returned empty result, using asr_refer_text fallback`
        : `${prefix} STT returned empty result`,
    );
    return asrOrPlaceholder("[Voice message - transcription returned an empty result]");
  } catch (sttErr) {
    log?.error(`${prefix} STT failed: ${describeCaughtError(sttErr)}`);
    return asrOrPlaceholder("[Voice message - transcription failed]");
  }
}
|