Files
openclaw/extensions/qqbot/src/inbound-attachments.ts
2026-04-06 14:53:55 +01:00

372 lines
12 KiB
TypeScript

import { transcribeAudio, resolveSTTConfig } from "./stt.js";
import { convertSilkToWav, isVoiceAttachment, formatDuration } from "./utils/audio-convert.js";
import { downloadFile } from "./utils/file-utils.js";
import { getQQBotMediaDir } from "./utils/platform.js";
/** One attachment exactly as it arrives on an inbound message, before any download or conversion. */
export interface RawAttachment {
  /** Content type string; values starting with "image/" are handled as images. */
  content_type: string;

  /** Download URL. Protocol-relative values ("//host/…") are prefixed with "https:" before use. */
  url: string;

  /** Optional file name; forwarded to the downloader and shown in logs and failure placeholders. */
  filename?: string;

  /**
   * Optional alternative URL for voice attachments. It is tried first, and when it downloads
   * successfully the file is sent to STT directly, skipping SILK→WAV conversion.
   */
  voice_wav_url?: string;

  /**
   * Optional transcript text delivered with the attachment. It is trimmed and used as the
   * transcript whenever local speech-to-text is not configured, returns nothing, or fails.
   */
  asr_refer_text?: string;
}
/**
 * Where a voice transcript came from:
 * - "stt": text returned by the speech-to-text call on the downloaded audio.
 * - "asr": the attachment's own asr_refer_text.
 * - "fallback": a bracketed placeholder used when neither of the above is available.
 */
export type TranscriptSource =
  | "stt"
  | "asr"
  | "fallback";
/** Normalized attachment output consumed by the gateway. */
export interface ProcessedAttachments {
  /**
   * Text block describing attachments that are neither images nor transcribed voice:
   * "" when there are none, otherwise "\n" followed by one "[Attachment: …]" line per entry.
   */
  attachmentInfo: string;

  /** Per image: the local path when the download succeeded, else the remote URL. */
  imageUrls: string[];

  /** content_type of each entry in imageUrls, in the same order. */
  imageMediaTypes: string[];

  /** Local paths of the voice attachments that were downloaded. */
  voiceAttachmentPaths: string[];

  /** Source URL of each voice attachment (voice_wav_url when present, otherwise url). */
  voiceAttachmentUrls: string[];

  /** Non-empty, trimmed asr_refer_text values found on voice attachments. */
  voiceAsrReferTexts: string[];

  /** Transcript (or placeholder text) for each voice attachment that yielded one. */
  voiceTranscripts: string[];

  /** Origin of each entry in voiceTranscripts, in the same order. */
  voiceTranscriptSources: TranscriptSource[];

  /** One entry per input attachment, in input order; null where the download failed. */
  attachmentLocalPaths: (string | null)[];
}
/** Per-call context handed to processAttachments. */
interface ProcessContext {
  /** Account identifier; only used to build the "[qqbot:<accountId>]" log prefix. */
  accountId: string;

  /** Opaque configuration; forwarded as-is to the STT config resolver and transcriber. */
  cfg: unknown;

  /** Optional logger. Every call site uses optional chaining, so it may be omitted entirely. */
  log?: {
    info: (msg: string) => void;
    error: (msg: string) => void;
    debug?: (msg: string) => void;
  };
}
/**
 * Template for the "no attachments" result.
 *
 * This object must never be returned to callers directly: its arrays are mutable, so a caller
 * that pushed onto one of them would leak entries into every later empty result. Use
 * createEmptyResult() to obtain an independent copy.
 */
const EMPTY_RESULT: ProcessedAttachments = {
  attachmentInfo: "",
  imageUrls: [],
  imageMediaTypes: [],
  voiceAttachmentPaths: [],
  voiceAttachmentUrls: [],
  voiceAsrReferTexts: [],
  voiceTranscripts: [],
  voiceTranscriptSources: [],
  attachmentLocalPaths: [],
};

/** Build an empty result whose arrays are shared with neither EMPTY_RESULT nor other callers. */
function createEmptyResult(): ProcessedAttachments {
  return {
    attachmentInfo: EMPTY_RESULT.attachmentInfo,
    imageUrls: [...EMPTY_RESULT.imageUrls],
    imageMediaTypes: [...EMPTY_RESULT.imageMediaTypes],
    voiceAttachmentPaths: [...EMPTY_RESULT.voiceAttachmentPaths],
    voiceAttachmentUrls: [...EMPTY_RESULT.voiceAttachmentUrls],
    voiceAsrReferTexts: [...EMPTY_RESULT.voiceAsrReferTexts],
    voiceTranscripts: [...EMPTY_RESULT.voiceTranscripts],
    voiceTranscriptSources: [...EMPTY_RESULT.voiceTranscriptSources],
    attachmentLocalPaths: [...EMPTY_RESULT.attachmentLocalPaths],
  };
}

/** Prefix protocol-relative URLs ("//host/path") with "https:"; anything else passes through. */
function withHttpsScheme(url: string): string {
  // The `?.` mirrors the original defensive access on att.url: the declared type is string,
  // but a missing value is passed through unchanged instead of throwing here.
  return url?.startsWith("//") ? `https:${url}` : url;
}

/**
 * Download, convert, transcribe, and classify inbound attachments.
 *
 * Work happens in three phases: (1) all attachments are downloaded in parallel, preferring
 * voice_wav_url for voice attachments; (2) each download is classified as image, voice, or
 * other, with voice attachments handed to processVoiceAttachment; (3) the per-attachment
 * results are folded into the output arrays in the original attachment order.
 *
 * A failed download does not abort processing: images fall back to their remote URL, voice
 * attachments fall back to asr_refer_text when present, and everything else is listed in
 * attachmentInfo as "(download failed)".
 *
 * @param attachments - Raw attachments of the inbound message; may be undefined or empty.
 * @param ctx - Account id (for log prefixes), opaque config, and an optional logger.
 * @returns The normalized result. When there are no attachments a fresh empty result is
 *   returned on every call, so callers may freely mutate what they receive.
 */
export async function processAttachments(
  attachments: RawAttachment[] | undefined,
  ctx: ProcessContext,
): Promise<ProcessedAttachments> {
  if (!attachments?.length) {
    // Fresh object per call: returning the shared template would let one caller's mutation
    // show up in another caller's "empty" result.
    return createEmptyResult();
  }

  const { accountId, cfg, log } = ctx;
  const downloadDir = getQQBotMediaDir("downloads");
  const prefix = `[qqbot:${accountId}]`;

  const imageUrls: string[] = [];
  const imageMediaTypes: string[] = [];
  const voiceAttachmentPaths: string[] = [];
  const voiceAttachmentUrls: string[] = [];
  const voiceAsrReferTexts: string[] = [];
  const voiceTranscripts: string[] = [];
  const voiceTranscriptSources: TranscriptSource[] = [];
  const attachmentLocalPaths: Array<string | null> = [];
  const otherAttachments: string[] = [];

  // Phase 1: download all attachments in parallel.
  const downloadResults = await Promise.all(
    attachments.map(async (att) => {
      const attUrl = withHttpsScheme(att.url);
      const isVoice = isVoiceAttachment(att);
      // Only voice attachments use voice_wav_url; "" means "no WAV URL available".
      const wavUrl = isVoice && att.voice_wav_url ? withHttpsScheme(att.voice_wav_url) : "";

      let localPath: string | null = null;
      // Set only when a ready-to-transcribe WAV was downloaded; null means the file at
      // localPath still has to go through SILK→WAV conversion.
      let audioPath: string | null = null;

      if (isVoice && wavUrl) {
        const wavLocalPath = await downloadFile(wavUrl, downloadDir);
        if (wavLocalPath) {
          localPath = wavLocalPath;
          audioPath = wavLocalPath;
          log?.info(
            `${prefix} Voice attachment: ${att.filename}, downloaded WAV directly (skip SILK→WAV)`,
          );
        } else {
          log?.error(`${prefix} Failed to download voice_wav_url, falling back to original URL`);
        }
      }
      if (!localPath) {
        localPath = await downloadFile(attUrl, downloadDir, att.filename);
      }
      return { att, attUrl, wavUrl, isVoice, localPath, audioPath };
    }),
  );

  // Phase 2: convert/transcribe voice attachments and classify everything else.
  const processResults = await Promise.all(
    downloadResults.map(async ({ att, attUrl, wavUrl, isVoice, localPath, audioPath }) => {
      const asrReferText = typeof att.asr_refer_text === "string" ? att.asr_refer_text.trim() : "";
      const voiceSourceUrl = wavUrl || attUrl;
      const meta = {
        voiceUrl: isVoice && voiceSourceUrl ? voiceSourceUrl : undefined,
        asrReferText: isVoice && asrReferText ? asrReferText : undefined,
      };
      // Image classification takes precedence over voice, both on success and on failure.
      const isImage = att.content_type?.startsWith("image/");

      if (localPath) {
        log?.info(`${prefix} Downloaded attachment to: ${localPath}`);
        if (isImage) {
          return { localPath, type: "image" as const, contentType: att.content_type, meta };
        }
        if (isVoice) {
          return processVoiceAttachment(
            localPath,
            audioPath,
            att,
            asrReferText,
            cfg,
            downloadDir,
            log,
            prefix,
          );
        }
        return { localPath, type: "other" as const, filename: att.filename, meta };
      }

      log?.error(`${prefix} Failed to download: ${attUrl}`);
      if (isImage) {
        return {
          localPath: null,
          type: "image-fallback" as const,
          attUrl,
          contentType: att.content_type,
          meta,
        };
      }
      if (isVoice && asrReferText) {
        log?.info(`${prefix} Voice attachment download failed, using asr_refer_text fallback`);
        return {
          localPath: null,
          type: "voice-fallback" as const,
          transcript: asrReferText,
          meta,
        };
      }
      return {
        localPath: null,
        type: "other-fallback" as const,
        filename: att.filename ?? att.content_type,
        meta,
      };
    }),
  );

  // Phase 3: collect results in the original attachment order.
  for (const result of processResults) {
    if (result.meta.voiceUrl) {
      voiceAttachmentUrls.push(result.meta.voiceUrl);
    }
    if (result.meta.asrReferText) {
      voiceAsrReferTexts.push(result.meta.asrReferText);
    }

    switch (result.type) {
      case "image":
        imageUrls.push(result.localPath);
        imageMediaTypes.push(result.contentType);
        attachmentLocalPaths.push(result.localPath);
        break;
      case "voice":
        voiceAttachmentPaths.push(result.localPath);
        voiceTranscripts.push(result.transcript);
        voiceTranscriptSources.push(result.transcriptSource);
        attachmentLocalPaths.push(result.localPath);
        break;
      case "other":
        otherAttachments.push(`[Attachment: ${result.localPath}]`);
        attachmentLocalPaths.push(result.localPath);
        break;
      case "image-fallback":
        // No local copy: hand the remote URL through so the image is still referenced.
        imageUrls.push(result.attUrl);
        imageMediaTypes.push(result.contentType);
        attachmentLocalPaths.push(null);
        break;
      case "voice-fallback":
        voiceTranscripts.push(result.transcript);
        voiceTranscriptSources.push("asr");
        attachmentLocalPaths.push(null);
        break;
      case "other-fallback":
        otherAttachments.push(`[Attachment: ${result.filename}] (download failed)`);
        attachmentLocalPaths.push(null);
        break;
    }
  }

  const attachmentInfo = otherAttachments.length > 0 ? "\n" + otherAttachments.join("\n") : "";
  return {
    attachmentInfo,
    imageUrls,
    imageMediaTypes,
    voiceAttachmentPaths,
    voiceAttachmentUrls,
    voiceAsrReferTexts,
    voiceTranscripts,
    voiceTranscriptSources,
    attachmentLocalPaths,
  };
}
/**
 * Format voice transcripts into user-visible text.
 *
 * @param transcripts - Transcripts in the order they should be shown.
 * @returns "" for no transcripts, "[Voice message] …" for exactly one, and otherwise one
 *   "[Voice N] …" line per transcript (N starting at 1), joined with newlines.
 */
export function formatVoiceText(transcripts: string[]): string {
  const count = transcripts.length;
  if (count === 0) {
    return "";
  }
  if (count === 1) {
    return `[Voice message] ${transcripts[0]}`;
  }
  const numberedLines = transcripts.map((text, index) => `[Voice ${index + 1}] ${text}`);
  return numberedLines.join("\n");
}
// Internal helpers.
/**
 * Outcome of processing one voice attachment whose file was downloaded locally.
 *
 * (Previously declared as a union of two identical object types; a single declaration is
 * structurally the same type.)
 */
type VoiceResult = {
  /** Path of the downloaded attachment file. */
  localPath: string;
  /** Discriminant used when results are collected alongside image/other results. */
  type: "voice";
  /** Transcript text, or a bracketed placeholder when no transcript could be produced. */
  transcript: string;
  /** Where the transcript came from. */
  transcriptSource: TranscriptSource;
  /** Source URL and trimmed asr_refer_text of the attachment, when available. */
  meta: { voiceUrl?: string; asrReferText?: string };
};
/**
 * Produce a transcript for one voice attachment that has already been downloaded.
 *
 * Preference order: speech-to-text on the local audio, then the attachment's asr_refer_text,
 * then a bracketed placeholder explaining why no transcript is available. Errors thrown by
 * the SILK→WAV conversion or by the STT call are logged and turned into such a fallback.
 *
 * @param localPath - Path of the downloaded attachment file.
 * @param audioPath - Path of audio that can go straight to STT, or null when the file at
 *   localPath must first be passed through SILK→WAV conversion.
 * @param att - The raw attachment; supplies the URLs for `meta` and the filename for logs.
 * @param asrReferText - Trimmed asr_refer_text, or "" when the attachment has none.
 * @param cfg - Opaque config forwarded to resolveSTTConfig and transcribeAudio.
 * @param downloadDir - Directory passed to convertSilkToWav.
 * @param log - Optional logger.
 * @param prefix - Prefix prepended to every log line.
 * @returns A "voice" result carrying the transcript and where it came from.
 */
async function processVoiceAttachment(
  localPath: string,
  audioPath: string | null,
  att: RawAttachment,
  asrReferText: string,
  cfg: unknown,
  downloadDir: string,
  log: ProcessContext["log"],
  prefix: string,
): Promise<VoiceResult> {
  // Protocol-relative URLs ("//host/…") get an explicit https scheme.
  const ensureScheme = (url: string): string => (url?.startsWith("//") ? `https:${url}` : url);
  const wavSourceUrl = att.voice_wav_url ? ensureScheme(att.voice_wav_url) : "";
  const originalUrl = ensureScheme(att.url);
  const voiceSourceUrl = wavSourceUrl || originalUrl;
  const meta = {
    voiceUrl: voiceSourceUrl || undefined,
    asrReferText: asrReferText || undefined,
  };

  // Every exit path returns the same shape; only the transcript and its source vary.
  const buildResult = (transcript: string, transcriptSource: TranscriptSource): VoiceResult => ({
    localPath,
    type: "voice",
    transcript,
    transcriptSource,
    meta,
  });
  // Shared fallback rule: prefer asr_refer_text, otherwise use the given placeholder.
  const asrOrPlaceholder = (placeholder: string): VoiceResult =>
    asrReferText ? buildResult(asrReferText, "asr") : buildResult(placeholder, "fallback");
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : JSON.stringify(err);

  const sttInput = cfg as Record<string, unknown>;

  if (!resolveSTTConfig(sttInput)) {
    if (asrReferText) {
      log?.info(
        `${prefix} Voice attachment: ${att.filename} (STT not configured, using asr_refer_text fallback)`,
      );
    } else {
      log?.info(
        `${prefix} Voice attachment: ${att.filename} (STT not configured, skipping transcription)`,
      );
    }
    return asrOrPlaceholder(
      "[Voice message - transcription unavailable because STT is not configured]",
    );
  }

  // Convert SILK input to WAV before STT when necessary.
  let sttAudioPath = audioPath;
  if (!sttAudioPath) {
    log?.info(`${prefix} Voice attachment: ${att.filename}, converting SILK→WAV...`);
    try {
      const converted = await convertSilkToWav(localPath, downloadDir);
      if (converted) {
        sttAudioPath = converted.wavPath;
        log?.info(
          `${prefix} Voice converted: ${converted.wavPath} (${formatDuration(converted.duration)})`,
        );
      } else {
        // Converter produced nothing: send the downloaded file to STT unchanged.
        sttAudioPath = localPath;
      }
    } catch (convertErr) {
      log?.error(`${prefix} Voice conversion failed: ${describeError(convertErr)}`);
      return asrOrPlaceholder("[Voice message - format conversion failed]");
    }
  }

  // Run speech-to-text on the prepared audio file.
  try {
    const transcript = await transcribeAudio(sttAudioPath, sttInput);
    if (transcript) {
      log?.info(`${prefix} STT transcript: ${transcript.slice(0, 100)}...`);
      return buildResult(transcript, "stt");
    }
    if (asrReferText) {
      log?.info(`${prefix} STT returned empty result, using asr_refer_text fallback`);
    } else {
      log?.info(`${prefix} STT returned empty result`);
    }
    return asrOrPlaceholder("[Voice message - transcription returned an empty result]");
  } catch (sttErr) {
    log?.error(`${prefix} STT failed: ${describeError(sttErr)}`);
    return asrOrPlaceholder("[Voice message - transcription failed]");
  }
}