From d89c25d69e0569305c55b3fd7948b503b2196481 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 2 Mar 2026 22:21:57 +0000 Subject: [PATCH] fix: support parakeet-mlx output-dir transcript parsing (#9177) (thanks @mac-110) --- CHANGELOG.md | 1 + docs/nodes/audio.md | 1 + docs/nodes/media-understanding.md | 1 + src/media-understanding/apply.test.ts | 76 +++++++++++++++++++++++ src/media-understanding/runner.entries.ts | 17 ++++- 5 files changed, 95 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1dac2f4b09e..0b72f1fc052 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ Docs: https://docs.openclaw.ai - Plugin SDK/runtime hardening: add package export verification in CI/release checks to catch missing runtime exports before publish-time regressions. (#28575) Thanks @Glucksberg. - Media understanding/provider HTTP proxy routing: pass a proxy-aware fetch function from `HTTPS_PROXY`/`HTTP_PROXY` env vars into audio/video provider calls (with graceful malformed-proxy fallback) so transcription/video requests honor configured outbound proxies. (#27093) Thanks @mcaxtr. - Media understanding/malformed attachment guards: harden attachment selection and decision summary formatting against non-array or malformed attachment payloads to prevent runtime crashes on invalid inbound metadata shapes. (#28024) Thanks @claw9267. +- Media understanding/parakeet CLI output parsing: read `parakeet-mlx` transcripts from `--output-dir/<media-basename>.txt` when txt output is requested (or default), with stdout fallback for non-txt formats. (#9177) Thanks @mac-110. - Media understanding/audio transcription guard: skip tiny/empty audio files (<1024 bytes) before provider/CLI transcription to avoid noisy invalid-audio failures and preserve clean fallback behavior. (#8388) Thanks @Glucksberg. - OpenAI media capabilities: include `audio` in the OpenAI provider capability list so audio transcription models are eligible in media-understanding provider selection. 
(#12717) Thanks @openjay. - Security/Node exec approvals: preserve shell/dispatch-wrapper argv semantics during approval hardening so approved wrapper commands (for example `env sh -c ...`) cannot drift into a different runtime command shape, and add regression coverage for both approval-plan generation and approved runtime execution paths. Thanks @tdjackey for reporting. diff --git a/docs/nodes/audio.md b/docs/nodes/audio.md index a897d55ae19..fb8afe21831 100644 --- a/docs/nodes/audio.md +++ b/docs/nodes/audio.md @@ -176,5 +176,6 @@ When `requireMention: true` is set for a group chat, OpenClaw now transcribes au - Scope rules use first-match wins. `chatType` is normalized to `direct`, `group`, or `room`. - Ensure your CLI exits 0 and prints plain text; JSON needs to be massaged via `jq -r .text`. +- For `parakeet-mlx`, if you pass `--output-dir`, OpenClaw reads `<output-dir>/<media-basename>.txt` when `--output-format` is `txt` (or omitted); non-`txt` output formats fall back to stdout parsing. - Keep timeouts reasonable (`timeoutSeconds`, default 60s) to avoid blocking the reply queue. - Preflight transcription only processes the **first** audio attachment for mention detection. Additional audio is processed during the main media understanding phase. diff --git a/docs/nodes/media-understanding.md b/docs/nodes/media-understanding.md index c04037a7147..e03d7b1d4c7 100644 --- a/docs/nodes/media-understanding.md +++ b/docs/nodes/media-understanding.md @@ -210,6 +210,7 @@ If you omit `capabilities`, the entry is eligible for the list it appears in. - `openai/gpt-4o-mini-transcribe`, `groq/whisper-large-v3-turbo`, `deepgram/nova-3`, or `mistral/voxtral-mini-latest`. - CLI fallback: `whisper-cli` (whisper-cpp) or `whisper`. +- `parakeet-mlx` note: with `--output-dir`, OpenClaw reads `<output-dir>/<media-basename>.txt` when output format is `txt` (or unspecified); non-`txt` formats fall back to stdout. - Deepgram setup: [Deepgram (audio transcription)](/providers/deepgram). 
**Video** diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts index 2b7f7f19360..e35259c267e 100644 --- a/src/media-understanding/apply.test.ts +++ b/src/media-understanding/apply.test.ts @@ -477,6 +477,82 @@ describe("applyMediaUnderstanding", () => { expect(ctx.Body).toBe("[Audio]\nTranscript:\ncli transcript"); }); + it("reads parakeet-mlx transcript from output-dir txt file", async () => { + const ctx = await createAudioCtx({ fileName: "sample.wav", mediaType: "audio/wav" }); + const cfg: OpenClawConfig = { + tools: { + media: { + audio: { + enabled: true, + models: [ + { + type: "cli", + command: "parakeet-mlx", + args: ["{{MediaPath}}", "--output-format", "txt", "--output-dir", "{{OutputDir}}"], + }, + ], + }, + }, + }, + }; + + mockedRunExec.mockImplementationOnce(async (_cmd, args) => { + const mediaPath = args[0]; + const outputDirArgIndex = args.indexOf("--output-dir"); + const outputDir = outputDirArgIndex >= 0 ? args[outputDirArgIndex + 1] : undefined; + const transcriptPath = + mediaPath && outputDir ? 
path.join(outputDir, `${path.parse(mediaPath).name}.txt`) : ""; + if (transcriptPath) { + await fs.writeFile(transcriptPath, "parakeet transcript\n"); + } + return { stdout: "", stderr: "" }; + }); + + const result = await applyMediaUnderstanding({ ctx, cfg }); + + expect(result.appliedAudio).toBe(true); + expect(ctx.Transcript).toBe("parakeet transcript"); + expect(ctx.Body).toBe("[Audio]\nTranscript:\nparakeet transcript"); + }); + + it("falls back to stdout for parakeet-mlx when output format is not txt", async () => { + const ctx = await createAudioCtx({ fileName: "sample.wav", mediaType: "audio/wav" }); + const cfg: OpenClawConfig = { + tools: { + media: { + audio: { + enabled: true, + models: [ + { + type: "cli", + command: "parakeet-mlx", + args: ["{{MediaPath}}", "--output-format", "json", "--output-dir", "{{OutputDir}}"], + }, + ], + }, + }, + }, + }; + + mockedRunExec.mockImplementationOnce(async (_cmd, args) => { + const mediaPath = args[0]; + const outputDirArgIndex = args.indexOf("--output-dir"); + const outputDir = outputDirArgIndex >= 0 ? args[outputDirArgIndex + 1] : undefined; + const transcriptPath = + mediaPath && outputDir ? 
path.join(outputDir, `${path.parse(mediaPath).name}.txt`) : ""; + if (transcriptPath) { + await fs.writeFile(transcriptPath, "should-not-be-used\n"); + } + return { stdout: "stdout transcript\n", stderr: "" }; + }); + + const result = await applyMediaUnderstanding({ ctx, cfg }); + + expect(result.appliedAudio).toBe(true); + expect(ctx.Transcript).toBe("stdout transcript"); + expect(ctx.Body).toBe("[Audio]\nTranscript:\nstdout transcript"); + }); + it("auto-detects sherpa for audio when binary and model files are available", async () => { const binDir = await createTempMediaDir(); const modelDir = await createTempMediaDir(); diff --git a/src/media-understanding/runner.entries.ts b/src/media-understanding/runner.entries.ts index 6b9f0d7922b..8423ece464d 100644 --- a/src/media-understanding/runner.entries.ts +++ b/src/media-understanding/runner.entries.ts @@ -136,6 +136,19 @@ function resolveWhisperCppOutputPath(args: string[]): string | null { return `${outputBase}.txt`; } +function resolveParakeetOutputPath(args: string[], mediaPath: string): string | null { + const outputDir = findArgValue(args, ["--output-dir"]); + const outputFormat = findArgValue(args, ["--output-format"]); + if (!outputDir) { + return null; + } + if (outputFormat && outputFormat !== "txt") { + return null; + } + const base = path.parse(mediaPath).name; + return path.join(outputDir, `${base}.txt`); +} + async function resolveCliOutput(params: { command: string; args: string[]; @@ -148,7 +161,9 @@ async function resolveCliOutput(params: { ? resolveWhisperCppOutputPath(params.args) : commandId === "whisper" ? resolveWhisperOutputPath(params.args, params.mediaPath) - : null; + : commandId === "parakeet-mlx" + ? resolveParakeetOutputPath(params.args, params.mediaPath) + : null; if (fileOutput && (await fileExists(fileOutput))) { try { const content = await fs.readFile(fileOutput, "utf8");