mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 20:40:43 +00:00
fix(tts): pre-transcode synthesized audio to opus-in-CAF for native iMessage voice-memo bubbles via BlueBubbles (#72586)
End-to-end testing on macOS + BlueBubbles + ElevenLabs walked through three CAF flavors before landing on the format Apple's Messages.app actually emits when a user records a native iMessage voice memo: - PCM int16 @ 44.1 kHz CAF: BlueBubbles' internal `afconvert -f m4af -d aac` conversion fails; the original CAF reaches iMessage but renders with 0 s duration. - AAC @ 22.05 kHz mono CAF: BlueBubbles' conversion succeeds and the server silently downgrades the delivery, sending the converted MP3 as a generic audio attachment. - **Opus @ 24 kHz mono CAF**: byte-identical to the descriptor block Apple's Messages.app produces; BlueBubbles passes it through unchanged and iMessage renders a native voice-memo bubble with proper duration and waveform UI. Adds an opt-in `tts.voice.preferAudioFileFormat` channel capability and a macOS `afconvert`-backed pre-transcode in the speech-core pipeline. BlueBubbles declares `preferAudioFileFormat: "caf"`. Other channels are unaffected. Falls back to the original buffer when the host platform, the source/target pair, or the transcoder process can't produce the preferred container — so non-Darwin hosts and unsupported provider combinations are unchanged. Also adds a `caff` magic-byte sniff in `src/media/mime.ts` so the auto-reply host-local-media validator (which uses `file-type` and didn't recognize CAF natively) accepts the buffer instead of dropping it as "⚠️ Media failed." Fixes #72506. Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -71,6 +71,10 @@ Docs: https://docs.openclaw.ai
|
|||||||
- WhatsApp/Web: keep quiet but healthy linked-device sessions connected by basing the watchdog on WhatsApp Web transport activity, while retaining a longer app-silence cap so frame activity cannot mask a stuck session forever. Fixes #70678; carries forward the focused #71466 approach and keeps #63939 as related configurable-timeout follow-up. Thanks @vincentkoc and @oromeis.
|
- WhatsApp/Web: keep quiet but healthy linked-device sessions connected by basing the watchdog on WhatsApp Web transport activity, while retaining a longer app-silence cap so frame activity cannot mask a stuck session forever. Fixes #70678; carries forward the focused #71466 approach and keeps #63939 as related configurable-timeout follow-up. Thanks @vincentkoc and @oromeis.
|
||||||
- Discord/gateway: count failed health-monitor restart attempts toward cooldown and hourly caps, and evict stale account lifecycle state during channel reloads so repeated Discord gateway recovery cannot loop on old status. Fixes #38596. (#40413) Thanks @jellyAI-dev and @vashquez.
|
- Discord/gateway: count failed health-monitor restart attempts toward cooldown and hourly caps, and evict stale account lifecycle state during channel reloads so repeated Discord gateway recovery cannot loop on old status. Fixes #38596. (#40413) Thanks @jellyAI-dev and @vashquez.
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
|
||||||
|
- TTS/BlueBubbles: pre-transcode synthesized MP3 audio to opus-in-CAF (mono, 24 kHz — validated against macOS 15.x Messages.app's native voice-memo CAF descriptor) on macOS hosts before handing the file to BlueBubbles, so iMessage renders the result as a native voice-memo bubble with proper duration and waveform UI instead of a plain file attachment. Adds an opt-in `tts.voice.preferAudioFileFormat` channel capability and a magic-byte sniff for the CAF container so the host-local-media validator (which uses `file-type` and didn't recognize CAF natively) can verify the pre-transcoded buffer. Channels that don't opt in are unaffected. (#72586) Fixes #72506. Thanks @omarshahine.
|
||||||
|
|
||||||
## 2026.4.26
|
## 2026.4.26
|
||||||
|
|
||||||
### Changes
|
### Changes
|
||||||
|
|||||||
@@ -35,6 +35,13 @@ export const bluebubblesCapabilities: ChannelPlugin<ResolvedBlueBubblesAccount>[
|
|||||||
voice: {
|
voice: {
|
||||||
synthesisTarget: "audio-file",
|
synthesisTarget: "audio-file",
|
||||||
audioFileFormats: ["mp3", "caf", "audio/mpeg", "audio/x-caf"],
|
audioFileFormats: ["mp3", "caf", "audio/mpeg", "audio/x-caf"],
|
||||||
|
// Prefer CAF when the host can pre-transcode (afconvert on macOS).
|
||||||
|
// The BlueBubbles server otherwise races a CAF→MP3 conversion against
|
||||||
|
// the upload write completing and silently falls back to a generic
|
||||||
|
// attachment send when its conversion fails. Pre-encoding to CAF
|
||||||
|
// bypasses that race so iMessage renders the result as a native voice
|
||||||
|
// memo bubble (waveform UI) instead of a plain audio attachment.
|
||||||
|
preferAudioFileFormat: "caf",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
reactions: true,
|
reactions: true,
|
||||||
|
|||||||
64
extensions/speech-core/src/audio-transcode.test.ts
Normal file
64
extensions/speech-core/src/audio-transcode.test.ts
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
import { describe, expect, it } from "vitest";
|
||||||
|
import { transcodeAudioBuffer } from "./audio-transcode.js";
|
||||||
|
|
||||||
|
// Unit tests for the argument-validation and platform-gating paths of
// `transcodeAudioBuffer`. None of these cases are expected to spawn
// `afconvert`; every one resolves to a `{ ok: false, reason }` outcome.
describe("transcodeAudioBuffer", () => {
  it("returns noop-same-container when source and target containers match", async () => {
    // "mp3" and ".mp3" normalize to the same container token.
    const result = await transcodeAudioBuffer({
      audioBuffer: Buffer.from("payload"),
      sourceExtension: "mp3",
      targetExtension: ".mp3",
    });
    expect(result).toEqual({ ok: false, reason: "noop-same-container" });
  });

  it("returns no-recipe when no afconvert recipe is defined for the requested pair", async () => {
    const result = await transcodeAudioBuffer({
      audioBuffer: Buffer.from("payload"),
      sourceExtension: "mp3",
      targetExtension: "flac",
    });
    expect(result).toEqual({ ok: false, reason: "no-recipe" });
  });

  it("returns invalid-extension for an empty source extension", async () => {
    const result = await transcodeAudioBuffer({
      audioBuffer: Buffer.from("payload"),
      sourceExtension: "",
      targetExtension: "caf",
    });
    expect(result).toEqual({ ok: false, reason: "invalid-extension" });
  });

  it("returns invalid-extension for an empty target extension", async () => {
    const result = await transcodeAudioBuffer({
      audioBuffer: Buffer.from("payload"),
      sourceExtension: "mp3",
      targetExtension: "",
    });
    expect(result).toEqual({ ok: false, reason: "invalid-extension" });
  });

  it("rejects path-traversal style extensions", async () => {
    const result = await transcodeAudioBuffer({
      audioBuffer: Buffer.from("payload"),
      sourceExtension: "../etc/passwd",
      targetExtension: "caf",
    });
    expect(result).toEqual({ ok: false, reason: "invalid-extension" });
  });

  // macOS: a valid mp3→caf request would proceed to spawn `afconvert`, which
  // we don't want to run from a unit test. The case is marked as skipped on
  // Darwin (instead of returning early) so the run reports it as skipped
  // rather than as a vacuous pass. The Darwin happy path is exercised
  // end-to-end via the BlueBubbles voice-memo flow.
  it.skipIf(process.platform === "darwin")(
    "returns platform-unsupported off-Darwin without invoking afconvert",
    async () => {
      const result = await transcodeAudioBuffer({
        audioBuffer: Buffer.from("payload"),
        sourceExtension: "mp3",
        targetExtension: "caf",
      });
      expect(result).toEqual({ ok: false, reason: "platform-unsupported" });
    },
  );
});
|
||||||
134
extensions/speech-core/src/audio-transcode.ts
Normal file
134
extensions/speech-core/src/audio-transcode.ts
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
import { spawn } from "node:child_process";
|
||||||
|
import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs";
|
||||||
|
import { join } from "node:path";
|
||||||
|
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/sandbox";
|
||||||
|
|
||||||
|
/**
 * Container token (file-extension shape, no leading dot) that the host is
 * able to pre-transcode into. Must be updated in lockstep with
 * `pickAfconvertRecipe`.
 */
export type HostTranscodableContainer = "caf";

/** Reasons a transcode request can finish without producing a buffer. */
type TranscodeFailureReason =
  | "platform-unsupported"
  | "invalid-extension"
  | "noop-same-container"
  | "no-recipe"
  | "transcoder-failed";

/**
 * Discriminated result of a transcode request: either the transcoded bytes,
 * or a failure reason with an optional free-form detail string.
 */
export type TranscodeOutcome =
  | { ok: true; buffer: Buffer }
  | { ok: false; reason: TranscodeFailureReason; detail?: string };
|
||||||
|
|
||||||
|
/**
 * Best-effort audio container transcode using macOS `afconvert`.
 *
 * Used by the TTS pipeline to pre-encode synthesized audio into a channel's
 * preferred container (see `ChannelTtsVoiceDeliveryCapabilities.preferAudioFileFormat`)
 * so the channel's downstream does not have to perform a container
 * conversion of its own. Returns a discriminated outcome so callers can
 * distinguish "we didn't try" (platform/recipe/noop) from "we tried and the
 * transcoder failed", which is the case worth logging.
 *
 * Currently only macOS is supported because `afconvert` is the only encoder
 * a recipe is defined for.
 *
 * @param params.audioBuffer - Encoded audio bytes to transcode.
 * @param params.sourceExtension - Container of `audioBuffer`; a leading dot,
 *   surrounding whitespace and letter case are ignored.
 * @param params.targetExtension - Desired container, normalized the same way.
 * @param params.timeoutMs - Upper bound for the `afconvert` run; defaults to
 *   5000 ms.
 * @returns `{ ok: true, buffer }` on success, otherwise `{ ok: false, reason }`.
 *   Any failure after the platform gate (temp-dir setup, file I/O, the
 *   `afconvert` process itself) is reported as `"transcoder-failed"` with a
 *   `detail` string; the function does not reject for those cases.
 */
export async function transcodeAudioBuffer(params: {
  audioBuffer: Buffer;
  sourceExtension: string;
  targetExtension: string;
  timeoutMs?: number;
}): Promise<TranscodeOutcome> {
  // Validate inputs first so callers get a specific reason regardless of
  // host platform. Platform-unsupported is the gate immediately before the
  // actual `afconvert` invocation.
  const source = normalizeExt(params.sourceExtension);
  const target = normalizeExt(params.targetExtension);
  if (!source || !target) {
    return { ok: false, reason: "invalid-extension" };
  }
  if (source === target) {
    return { ok: false, reason: "noop-same-container" };
  }
  const recipe = pickAfconvertRecipe(source, target);
  if (!recipe) {
    return { ok: false, reason: "no-recipe" };
  }
  if (process.platform !== "darwin") {
    return { ok: false, reason: "platform-unsupported" };
  }

  // Declared outside the `try` so `finally` can clean up whatever was created.
  let tmpDir: string | undefined;
  try {
    // Temp-dir setup lives inside the `try` so a filesystem failure (for
    // example an unwritable tmp root) becomes a "transcoder-failed" outcome
    // instead of rejecting a call that is documented as best-effort.
    const tmpRoot = resolvePreferredOpenClawTmpDir();
    mkdirSync(tmpRoot, { recursive: true, mode: 0o700 });
    tmpDir = mkdtempSync(join(tmpRoot, "tts-transcode-"));
    // `source` and `target` passed `normalizeExt`, so they are safe to
    // interpolate into file names.
    const inPath = join(tmpDir, `in.${source}`);
    const outPath = join(tmpDir, `out.${target}`);

    writeFileSync(inPath, params.audioBuffer, { mode: 0o600 });
    const result = await runAfconvert({
      args: [...recipe, inPath, outPath],
      timeoutMs: params.timeoutMs ?? 5000,
    });
    if (!result.ok) {
      return { ok: false, reason: "transcoder-failed", detail: result.detail };
    }
    return { ok: true, buffer: readFileSync(outPath) };
  } catch (err) {
    // Narrow instead of asserting: a non-Error throw would otherwise yield an
    // `undefined` detail.
    const detail = err instanceof Error ? err.message : String(err);
    return { ok: false, reason: "transcoder-failed", detail };
  } finally {
    if (tmpDir !== undefined) {
      try {
        rmSync(tmpDir, { recursive: true, force: true });
      } catch {
        // best-effort cleanup
      }
    }
  }
}
|
||||||
|
|
||||||
|
/**
 * Normalizes a file-extension token: trims whitespace, lowercases, and drops
 * a single leading dot.
 *
 * @returns The normalized token when it is 1–12 characters of `a-z`/`0-9`,
 *   otherwise `undefined`. The strict alphabet keeps the value safe to
 *   interpolate into temp-file names without opening a path-traversal surface.
 */
function normalizeExt(ext: string): string | undefined {
  let candidate = ext.trim().toLowerCase();
  if (candidate.startsWith(".")) {
    // Only one leading dot is stripped; "..mp3" is still rejected below.
    candidate = candidate.slice(1);
  }
  if (!/^[a-z0-9]{1,12}$/.test(candidate)) {
    return undefined;
  }
  return candidate;
}
|
||||||
|
|
||||||
|
/**
 * Returns the `afconvert` flag list (without input/output paths) for the
 * requested container pair, or `undefined` when no recipe is defined.
 *
 * Currently only the MP3→CAF path used by BlueBubbles voice memos exists.
 * Note that the lookup is keyed on `target` alone; `source` is accepted but
 * not consulted. Keep this in lockstep with `HostTranscodableContainer` so a
 * typo at the channel-capability declaration site is a compile-time error.
 */
function pickAfconvertRecipe(source: string, target: string): string[] | undefined {
  switch (target) {
    case "caf":
      // Opus-in-CAF, mono, 24 kHz. Validated against macOS 15.x Messages.app's
      // native voice-memo CAF descriptor (1 ch, 24000 Hz, opus); other CAF
      // flavors (PCM, AAC) get downgraded to plain audio attachments along the
      // BlueBubbles → Messages.app path. If iMessage stops rendering the result
      // as a voice memo after a system update, try forcing frames-per-packet
      // explicitly via `opus@24000#480` and re-validate. See #72506.
      return ["-f", "caff", "-d", "opus@24000", "-c", "1"];
    default:
      return undefined;
  }
}
|
||||||
|
|
||||||
|
/**
 * Spawns `/usr/bin/afconvert` with the given arguments and reports how it
 * finished. The returned promise never rejects.
 *
 * @param params.args - Full argument vector (recipe flags plus input and
 *   output paths).
 * @param params.timeoutMs - After this many milliseconds the child is sent
 *   `SIGKILL` and the call resolves with a `timeout-<ms>ms` detail.
 * @returns `{ ok: true }` on exit code 0; otherwise `{ ok: false, detail }`
 *   where `detail` is the spawn error message, `exit-<code>`,
 *   `signal-<name>` when the child was terminated by a signal, or the
 *   timeout marker.
 */
function runAfconvert(params: {
  args: string[];
  timeoutMs: number;
}): Promise<{ ok: true } | { ok: false; detail: string }> {
  return new Promise((resolve) => {
    // stdio is ignored: only the exit status is used to judge success.
    const child = spawn("/usr/bin/afconvert", params.args, { stdio: "ignore" });
    const timer = setTimeout(() => {
      child.kill("SIGKILL");
      // Resolve right away; the later "exit" event's resolve is a no-op
      // because a promise settles only once.
      resolve({ ok: false, detail: `timeout-${params.timeoutMs}ms` });
    }, params.timeoutMs);
    child.once("error", (err) => {
      // Emitted e.g. when the binary cannot be spawned.
      clearTimeout(timer);
      resolve({ ok: false, detail: err.message });
    });
    child.once("exit", (code, signal) => {
      clearTimeout(timer);
      if (code === 0) {
        resolve({ ok: true });
      } else if (code === null && signal) {
        // Signal-terminated children have a null exit code; keep the signal
        // name instead of collapsing it to "exit-unknown".
        resolve({ ok: false, detail: `signal-${signal}` });
      } else {
        resolve({ ok: false, detail: `exit-${code ?? "unknown"}` });
      }
    });
  });
}
|
||||||
@@ -31,6 +31,32 @@ const prepareSynthesisMock = vi.hoisted(() =>
|
|||||||
|
|
||||||
const listSpeechProvidersMock = vi.hoisted(() => vi.fn());
|
const listSpeechProvidersMock = vi.hoisted(() => vi.fn());
|
||||||
const getSpeechProviderMock = vi.hoisted(() => vi.fn());
|
const getSpeechProviderMock = vi.hoisted(() => vi.fn());
|
||||||
|
// Hoisted stand-in for `transcodeAudioBuffer`.
//
// Default off: most tests rely on the synthesized buffer reaching the channel
// unchanged, so the mock resolves to a "platform-unsupported" skip. Tests that
// exercise the pre-transcode branch override per-call via
// `transcodeAudioBufferMock.mockResolvedValueOnce(...)`. The mock is typed
// with the helper's full return shape so per-call overrides aren't narrowed
// to the default's literal.
const transcodeAudioBufferMock = vi.hoisted(() => {
  type MockTranscodeFailure = {
    ok: false;
    reason:
      | "platform-unsupported"
      | "invalid-extension"
      | "noop-same-container"
      | "no-recipe"
      | "transcoder-failed";
    detail?: string;
  };
  type MockTranscodeOutcome = { ok: true; buffer: Buffer } | MockTranscodeFailure;

  return vi.fn<() => Promise<MockTranscodeOutcome>>(async () => ({
    ok: false,
    reason: "platform-unsupported",
  }));
});

// Route the module under test to the mock above.
vi.mock("./audio-transcode.js", () => {
  return { transcodeAudioBuffer: transcodeAudioBufferMock };
});
|
||||||
|
|
||||||
vi.mock("openclaw/plugin-sdk/channel-targets", () => ({
|
vi.mock("openclaw/plugin-sdk/channel-targets", () => ({
|
||||||
normalizeChannelId: (channel: string | undefined) => channel?.trim().toLowerCase() ?? null,
|
normalizeChannelId: (channel: string | undefined) => channel?.trim().toLowerCase() ?? null,
|
||||||
@@ -40,6 +66,7 @@ vi.mock("openclaw/plugin-sdk/channel-targets", () => ({
|
|||||||
return {
|
return {
|
||||||
synthesisTarget: "audio-file",
|
synthesisTarget: "audio-file",
|
||||||
audioFileFormats: ["mp3", "caf", "audio/mpeg", "audio/x-caf"],
|
audioFileFormats: ["mp3", "caf", "audio/mpeg", "audio/x-caf"],
|
||||||
|
preferAudioFileFormat: "caf",
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
if (normalized === "feishu" || normalized === "whatsapp") {
|
if (normalized === "feishu" || normalized === "whatsapp") {
|
||||||
@@ -171,6 +198,7 @@ describe("speech-core native voice-note routing", () => {
|
|||||||
clearRuntimeConfigSnapshot();
|
clearRuntimeConfigSnapshot();
|
||||||
synthesizeMock.mockClear();
|
synthesizeMock.mockClear();
|
||||||
prepareSynthesisMock.mockClear();
|
prepareSynthesisMock.mockClear();
|
||||||
|
transcodeAudioBufferMock.mockClear();
|
||||||
installSpeechProviders([createMockSpeechProvider()]);
|
installSpeechProviders([createMockSpeechProvider()]);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -220,6 +248,57 @@ describe("speech-core native voice-note routing", () => {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Happy path: the host transcoder succeeds, so the delivered media carries
// the `caf` extension and the transcoder was asked for mp3 → caf.
it("pre-transcodes BlueBubbles synthesized mp3 to opus-in-CAF when the host can satisfy preferAudioFileFormat", async () => {
  const transcodedCaf = Buffer.from("transcoded-caf");
  transcodeAudioBufferMock.mockResolvedValueOnce({ ok: true, buffer: transcodedCaf });

  await expectTtsPayloadResult({
    channel: "bluebubbles",
    prefsName: "openclaw-speech-core-tts-bluebubbles-caf-transcode-test",
    text: "This BlueBubbles reply should be pre-transcoded to a native voice-memo CAF.",
    target: "audio-file",
    audioAsVoice: true,
    mediaExtension: "caf",
    providerResult: {
      audioBuffer: Buffer.from("mp3"),
      outputFormat: "mp3",
      fileExtension: ".mp3",
      voiceCompatible: false,
    },
  });

  const expectedRequest = expect.objectContaining({
    sourceExtension: "mp3",
    targetExtension: "caf",
  });
  expect(transcodeAudioBufferMock).toHaveBeenCalledWith(expectedRequest);
});
|
||||||
|
|
||||||
|
// Failure path: the host attempted the transcode and it broke, so the
// original mp3 must be delivered untouched.
it("falls back to the original mp3 buffer when the host transcoder fails", async () => {
  transcodeAudioBufferMock.mockResolvedValueOnce({
    ok: false,
    reason: "transcoder-failed",
    detail: "exit-1",
  });
  // Even though the transcode failed, the original mp3 still satisfies
  // BlueBubbles' audioFileFormats list, so the channel still flips
  // audioAsVoice. The user gets the v2026.4.26 PCM-CAF behavior (a voice
  // memo bubble, possibly with bad duration) instead of a regression — and
  // the failure is logged via the call site in tts.ts so it isn't silent.
  await expectTtsPayloadResult({
    channel: "bluebubbles",
    prefsName: "openclaw-speech-core-tts-bluebubbles-caf-fallback-test",
    text: "This BlueBubbles reply should fall back to the original mp3.",
    target: "audio-file",
    audioAsVoice: true,
    mediaExtension: "mp3",
    providerResult: {
      audioBuffer: Buffer.from("mp3"),
      outputFormat: "mp3",
      fileExtension: ".mp3",
      voiceCompatible: false,
    },
  });
  // Without this check the test cannot tell "transcoder failed, fell back"
  // apart from "pre-transcode was never attempted": the mock's default
  // outcome produces exactly the same payload.
  expect(transcodeAudioBufferMock).toHaveBeenCalledWith(
    expect.objectContaining({ sourceExtension: "mp3", targetExtension: "caf" }),
  );
});
|
||||||
|
|
||||||
it("uses the active runtime snapshot when source config still contains TTS SecretRefs", async () => {
|
it("uses the active runtime snapshot when source config still contains TTS SecretRefs", async () => {
|
||||||
const sourceConfig = {
|
const sourceConfig = {
|
||||||
messages: {
|
messages: {
|
||||||
|
|||||||
@@ -57,6 +57,7 @@ import {
|
|||||||
type TtsDirectiveParseResult,
|
type TtsDirectiveParseResult,
|
||||||
type TtsConfigResolutionContext,
|
type TtsConfigResolutionContext,
|
||||||
} from "../api.js";
|
} from "../api.js";
|
||||||
|
import { transcodeAudioBuffer } from "./audio-transcode.js";
|
||||||
|
|
||||||
export type {
|
export type {
|
||||||
ResolvedTtsConfig,
|
ResolvedTtsConfig,
|
||||||
@@ -1111,11 +1112,27 @@ export async function textToSpeech(params: {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let audioBuffer = synthesis.audioBuffer;
|
||||||
|
let fileExtension = synthesis.fileExtension;
|
||||||
|
let outputFormat = synthesis.outputFormat;
|
||||||
|
const transcoded = await maybePreTranscodeForVoiceDelivery({
|
||||||
|
channel: params.channel,
|
||||||
|
target: synthesis.target,
|
||||||
|
audioBuffer,
|
||||||
|
fileExtension,
|
||||||
|
outputFormat,
|
||||||
|
});
|
||||||
|
if (transcoded) {
|
||||||
|
audioBuffer = transcoded.audioBuffer;
|
||||||
|
fileExtension = transcoded.fileExtension;
|
||||||
|
outputFormat = transcoded.outputFormat;
|
||||||
|
}
|
||||||
|
|
||||||
const tempRoot = resolvePreferredOpenClawTmpDir();
|
const tempRoot = resolvePreferredOpenClawTmpDir();
|
||||||
mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
|
mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
|
||||||
const tempDir = mkdtempSync(path.join(tempRoot, "tts-"));
|
const tempDir = mkdtempSync(path.join(tempRoot, "tts-"));
|
||||||
const audioPath = path.join(tempDir, `voice-${Date.now()}${synthesis.fileExtension}`);
|
const audioPath = path.join(tempDir, `voice-${Date.now()}${fileExtension}`);
|
||||||
writeFileSync(audioPath, synthesis.audioBuffer);
|
writeFileSync(audioPath, audioBuffer);
|
||||||
scheduleCleanup(tempDir);
|
scheduleCleanup(tempDir);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -1127,19 +1144,64 @@ export async function textToSpeech(params: {
|
|||||||
fallbackFrom: synthesis.fallbackFrom,
|
fallbackFrom: synthesis.fallbackFrom,
|
||||||
attemptedProviders: synthesis.attemptedProviders,
|
attemptedProviders: synthesis.attemptedProviders,
|
||||||
attempts: synthesis.attempts,
|
attempts: synthesis.attempts,
|
||||||
outputFormat: synthesis.outputFormat,
|
outputFormat,
|
||||||
voiceCompatible: synthesis.voiceCompatible,
|
voiceCompatible: synthesis.voiceCompatible,
|
||||||
audioAsVoice: shouldDeliverTtsAsVoice({
|
audioAsVoice: shouldDeliverTtsAsVoice({
|
||||||
channel: params.channel,
|
channel: params.channel,
|
||||||
target: synthesis.target,
|
target: synthesis.target,
|
||||||
voiceCompatible: synthesis.voiceCompatible,
|
voiceCompatible: synthesis.voiceCompatible,
|
||||||
fileExtension: synthesis.fileExtension,
|
fileExtension,
|
||||||
outputFormat: synthesis.outputFormat,
|
outputFormat,
|
||||||
}),
|
}),
|
||||||
target: synthesis.target,
|
target: synthesis.target,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Pre-encodes synthesized audio into the channel's preferred container when
 * the channel declares one and the host is able to transcode.
 *
 * @returns The transcoded buffer together with the matching `fileExtension`
 *   (leading dot included) and `outputFormat`, or `undefined` when nothing was
 *   changed: the target is not "audio-file", the channel declares no
 *   preference, the audio is already in the preferred container, or the
 *   transcode was skipped or failed. Callers keep the original audio in the
 *   `undefined` case.
 */
async function maybePreTranscodeForVoiceDelivery(params: {
  channel: string | undefined;
  target: "audio-file" | "voice-note" | undefined;
  audioBuffer: Buffer;
  fileExtension: string;
  outputFormat?: string;
}): Promise<{ audioBuffer: Buffer; fileExtension: string; outputFormat?: string } | undefined> {
  const { channel, target, audioBuffer, fileExtension } = params;
  if (target !== "audio-file") {
    return undefined;
  }

  const wanted = resolveChannelTtsVoiceDelivery(channel)
    ?.preferAudioFileFormat?.trim()
    .toLowerCase();
  if (!wanted) {
    return undefined;
  }

  // Compare container tokens without the leading dot.
  const current = fileExtension.trim().toLowerCase().replace(/^\./, "");
  if (current === wanted) {
    return undefined;
  }

  const outcome = await transcodeAudioBuffer({
    audioBuffer,
    sourceExtension: current,
    targetExtension: wanted,
  });
  if (outcome.ok) {
    return {
      audioBuffer: outcome.buffer,
      fileExtension: `.${wanted}`,
      outputFormat: wanted,
    };
  }

  // Surface only the case where the host actually attempted the transcode and
  // it broke. The other reasons ("no-recipe", "noop-same-container",
  // "platform-unsupported", "invalid-extension") are by-design skips and would
  // just be log noise. This is the line that says "the channel asked for a
  // pre-encode, the host had a recipe for it, and it failed" — i.e. the case
  // where #72506 silently regresses.
  if (outcome.reason === "transcoder-failed") {
    logVerbose(
      `TTS: pre-transcode ${current}->${wanted} for channel=${channel ?? "?"} failed: ${outcome.detail ?? "unknown"}`,
    );
  }
  return undefined;
}
|
||||||
|
|
||||||
export async function synthesizeSpeech(params: {
|
export async function synthesizeSpeech(params: {
|
||||||
text: string;
|
text: string;
|
||||||
cfg: OpenClawConfig;
|
cfg: OpenClawConfig;
|
||||||
|
|||||||
@@ -273,10 +273,29 @@ export type ChannelGroupContext = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
/**
 * Container tokens (file-extension shape, no leading dot) that the host
 * speech-core pipeline knows how to pre-transcode synthesized audio into.
 * Channels that benefit from a specific container — currently only
 * BlueBubbles, which needs Apple's native voice-memo CAF descriptor — name
 * one here. Adding a new entry requires extending the host transcoder
 * recipe table in lockstep so a typed declaration cannot silently no-op.
 */
export type PreferredAudioFileFormat = "caf";

/** TTS voice delivery behavior advertised by a channel plugin. */
export type ChannelTtsVoiceDeliveryCapabilities = {
  /** Kind of synthesized output the channel wants. */
  synthesisTarget: "audio-file" | "voice-note";
  // NOTE(review): presumably signals that the channel converts audio itself —
  // confirm against the consumers of this flag.
  transcodesAudio?: boolean;
  // NOTE(review): appears to list the formats the channel accepts for
  // audio-file delivery (BlueBubbles declares both extensions and MIME
  // types) — confirm exact matching rules against the consumer.
  audioFileFormats?: readonly string[];
  /**
   * Optional preferred audio container the channel wants for voice-memo
   * delivery. When set and the host can transcode (e.g. `afconvert` on
   * macOS), the TTS pipeline pre-encodes synthesized audio to this format
   * before handing it to the channel. Useful for channels (such as
   * BlueBubbles) whose downstream attempts its own container conversion
   * that races against the upload write and fails.
   */
  preferAudioFileFormat?: PreferredAudioFileFormat;
};
|
||||||
|
|
||||||
/** Static capability flags advertised by a channel plugin. */
|
/** Static capability flags advertised by a channel plugin. */
|
||||||
|
|||||||
@@ -118,6 +118,23 @@ describe("mime detection", () => {
|
|||||||
expect(mime).toBe("audio/aac");
|
expect(mime).toBe("audio/aac");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("detects Apple CAF audio by magic bytes when file-type does not recognize the container", async () => {
  // A CAF file opens with the four-byte ASCII tag "caff". `file-type` v22 has
  // no native CAF detector, so without the manual magic-byte fallback the
  // host-local-media validator drops `afconvert`-produced voice-memo CAFs as
  // unknown binary blobs. Regression guard for the BlueBubbles voice-memo
  // pre-transcode path.
  const cafLike = Buffer.alloc(64); // "caff" tag followed by 60 zero bytes
  cafLike.write("caff", 0, "ascii");

  const detected = await detectMime({ buffer: cafLike });

  expect(detected).toBe("audio/x-caf");
});
|
||||||
|
|
||||||
|
it("returns audio/x-caf when extension and CAF magic bytes both agree", async () => {
  // Same 64-byte "caff"-prefixed payload, this time paired with a `.caf` path.
  const cafLike = Buffer.alloc(64);
  cafLike.write("caff", 0, "ascii");

  const detected = await detectMime({ buffer: cafLike, filePath: "/tmp/voice.caf" });

  expect(detected).toBe("audio/x-caf");
});
|
||||||
|
|
||||||
it("caps dependency sniffing to a bounded prefix", () => {
|
it("caps dependency sniffing to a bounded prefix", () => {
|
||||||
const small = Buffer.alloc(32);
|
const small = Buffer.alloc(32);
|
||||||
const large = Buffer.alloc(FILE_TYPE_SNIFF_MAX_BYTES + 16);
|
const large = Buffer.alloc(FILE_TYPE_SNIFF_MAX_BYTES + 16);
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ const EXT_BY_MIME: Record<string, string> = {
|
|||||||
"audio/opus": ".opus",
|
"audio/opus": ".opus",
|
||||||
"audio/x-m4a": ".m4a",
|
"audio/x-m4a": ".m4a",
|
||||||
"audio/mp4": ".m4a",
|
"audio/mp4": ".m4a",
|
||||||
|
"audio/x-caf": ".caf",
|
||||||
"video/mp4": ".mp4",
|
"video/mp4": ".mp4",
|
||||||
"video/quicktime": ".mov",
|
"video/quicktime": ".mov",
|
||||||
"application/pdf": ".pdf",
|
"application/pdf": ".pdf",
|
||||||
@@ -91,10 +92,25 @@ async function sniffMime(buffer?: Buffer): Promise<string | undefined> {
|
|||||||
fileTypeModulePromise ??= import("file-type");
|
fileTypeModulePromise ??= import("file-type");
|
||||||
const { fileTypeFromBuffer } = await fileTypeModulePromise;
|
const { fileTypeFromBuffer } = await fileTypeModulePromise;
|
||||||
const type = await fileTypeFromBuffer(sliceMimeSniffBuffer(buffer));
|
const type = await fileTypeFromBuffer(sliceMimeSniffBuffer(buffer));
|
||||||
return type?.mime ?? undefined;
|
if (type?.mime) {
|
||||||
|
return type.mime;
|
||||||
|
}
|
||||||
} catch {
|
} catch {
|
||||||
return undefined;
|
// fall through to manual magic-byte sniffs
|
||||||
}
|
}
|
||||||
|
return sniffKnownAudioMagic(buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallbacks for audio containers `file-type` doesn't recognize natively (e.g.
// Apple's CAF, used by iMessage voice memos when produced by `afconvert`).
// Without this the host-local-media validator drops these buffers as unknown
// binary blobs because the sniff returns undefined, even though the file is
// a valid audio container.
//
// Returns the MIME type for a recognized leading magic tag, or undefined.
function sniffKnownAudioMagic(buffer: Buffer): string | undefined {
  // CAF: the first four bytes are the ASCII tag "caff" (63 61 66 66).
  // Compare raw bytes rather than decoding with the "ascii" encoding: Node
  // clears the high bit of every byte when decoding "ascii", so unrelated
  // data such as E3 E1 E6 E6 would otherwise be misreported as CAF.
  if (
    buffer.byteLength >= 4 &&
    buffer[0] === 0x63 &&
    buffer[1] === 0x61 &&
    buffer[2] === 0x66 &&
    buffer[3] === 0x66
  ) {
    return "audio/x-caf";
  }
  return undefined;
}
|
||||||
|
|
||||||
export function getFileExtension(filePath?: string | null): string | undefined {
|
export function getFileExtension(filePath?: string | null): string | undefined {
|
||||||
|
|||||||
Reference in New Issue
Block a user