diff --git a/CHANGELOG.md b/CHANGELOG.md index 87abc6e9c54..e01db72ea7a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Media-understanding/audio: expand deprecated `{input}` CLI placeholders to the local media path and migrate legacy `audio.transcription.command` configs to `{{MediaPath}}`, so custom audio transcribers no longer receive the literal placeholder. Fixes #72760. Thanks @krisfanue3-hash. - Control UI/Gateway: preserve WebChat client version labels across localhost, 127.0.0.1, and IPv6 loopback aliases on the same port, avoiding misleading `vcontrol-ui` connection logs while investigating duplicate-message reports. Refs #72753 and #72742. Thanks @LumenFromTheFuture and @allesgutefy. - Agents/reasoning: treat orphan closing reasoning tags with following answer text as a privacy boundary across delivery, history, streaming, and Control UI sanitizers so malformed local-model output cannot leak chain-of-thought text. Fixes #67092. Thanks @AnildoSilva. - Memory-core: run one-shot memory CLI commands through transient builtin and QMD managers so `memory index`, `memory status --index`, and `memory search` no longer start long-lived file watchers that can hit macOS `EMFILE` limits. Fixes #59101; carries forward #49851. Thanks @mbear469210-coder and @maoyuanxue. 
diff --git a/docs/.generated/config-baseline.sha256 b/docs/.generated/config-baseline.sha256 index 4235008709c..844429421ea 100644 --- a/docs/.generated/config-baseline.sha256 +++ b/docs/.generated/config-baseline.sha256 @@ -1,4 +1,4 @@ -1007d795926cef7568a93ed600c22304099c0559cf1b46392dbd8e863f248700 config-baseline.json -37ce9efbc441fc91fbc344b7b5e1460ecc9344f45187ba37019ab40d81765bb7 config-baseline.core.json +1784e028361704e55bbffd845234f0df5657e2772a8bf1e4816483ad453c8125 config-baseline.json +12a9fb470f9a40d587b3595c96d777956c33804706d2b5bded6c42af12f4cc57 config-baseline.core.json 07963db49502132f26db396c56b36e018b110e6c55a68b3cb012d3ec96f43901 config-baseline.channel.json ed65cefbef96f034ce2b73069d9d5bacc341a43489ff9b20a34d40956b877f79 config-baseline.plugin.json diff --git a/docs/gateway/config-tools.md b/docs/gateway/config-tools.md index e0eee038acb..9049215db7f 100644 --- a/docs/gateway/config-tools.md +++ b/docs/gateway/config-tools.md @@ -241,7 +241,7 @@ Configures inbound media understanding (image/audio/video): **CLI entry** (`type: "cli"`): - `command`: executable to run - - `args`: templated args (supports `{{MediaPath}}`, `{{Prompt}}`, `{{MaxChars}}`, etc.) + - `args`: templated args (supports `{{MediaPath}}`, `{{Prompt}}`, `{{MaxChars}}`, etc.; deprecated `{input}` is accepted as an alias for `{{MediaPath}}`) **Common fields:** diff --git a/docs/nodes/audio.md b/docs/nodes/audio.md index 1ca7df257d5..69b02c70ee4 100644 --- a/docs/nodes/audio.md +++ b/docs/nodes/audio.md @@ -163,6 +163,7 @@ Note: Binary detection is best-effort across macOS/Linux/Windows; ensure the CLI - `tools.media.audio.echoTranscript` is off by default; enable it to send transcript confirmation back to the originating chat before agent processing. - `tools.media.audio.echoFormat` customizes the echo text (placeholder: `{transcript}`). - CLI stdout is capped (5MB); keep CLI output concise. +- CLI `args` should use `{{MediaPath}}` for the local audio file path. 
Deprecated `{input}` placeholders from older `audio.transcription.command` configs are accepted as a compatibility alias and migrate to `{{MediaPath}}`. ### Proxy environment support diff --git a/src/commands/doctor/shared/legacy-config-migrate.test.ts b/src/commands/doctor/shared/legacy-config-migrate.test.ts index cb4151c3c44..350bc816c2e 100644 --- a/src/commands/doctor/shared/legacy-config-migrate.test.ts +++ b/src/commands/doctor/shared/legacy-config-migrate.test.ts @@ -67,6 +67,28 @@ describe("legacy migrate audio transcription", () => { expect(res.config?.audio).toBeUndefined(); expect(res.config?.tools?.media?.audio).toBeUndefined(); }); + + it("rewrites legacy audio {input} placeholders to media templates", () => { + const res = migrateLegacyConfigForTest({ + audio: { + transcription: { + command: ["whisper-cli", "--model", "small", "{input}", "--input={input}"], + timeoutSeconds: 30, + }, + }, + }); + + expect(res.changes).toContain("Moved audio.transcription → tools.media.audio.models."); + expect(res.config?.audio).toBeUndefined(); + expect(res.config?.tools?.media?.audio?.models).toEqual([ + { + type: "cli", + command: "whisper-cli", + args: ["--model", "small", "{{MediaPath}}", "--input={{MediaPath}}"], + timeoutSeconds: 30, + }, + ]); + }); }); describe("legacy migrate mention routing", () => { diff --git a/src/config/legacy.shared.ts b/src/config/legacy.shared.ts index 7a067271cd3..60ced600741 100644 --- a/src/config/legacy.shared.ts +++ b/src/config/legacy.shared.ts @@ -75,7 +75,7 @@ export const mapLegacyAudioTranscription = (value: unknown): Record part.replace(/\{input\}/g, "{{MediaPath}}")); const timeoutSeconds = typeof transcriber?.timeoutSeconds === "number" ? 
transcriber?.timeoutSeconds : undefined; diff --git a/src/config/schema.base.generated.ts b/src/config/schema.base.generated.ts index 04045f69784..3f5a51a3576 100644 --- a/src/config/schema.base.generated.ts +++ b/src/config/schema.base.generated.ts @@ -18736,7 +18736,7 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = { }, title: "Audio Transcription Command", description: - 'Executable + args used to transcribe audio (first token must be a safe binary/path), for example `["whisper-cli", "--model", "small", "{input}"]`. Prefer a pinned command so runtime environments behave consistently.', + 'Executable + args used to transcribe audio (first token must be a safe binary/path), for example `["whisper-cli", "--model", "small", "{{MediaPath}}"]`. Deprecated `{input}` placeholders are still accepted and migrated to `{{MediaPath}}`.', }, timeoutSeconds: { type: "integer", @@ -25727,7 +25727,7 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = { }, "audio.transcription.command": { label: "Audio Transcription Command", - help: 'Executable + args used to transcribe audio (first token must be a safe binary/path), for example `["whisper-cli", "--model", "small", "{input}"]`. Prefer a pinned command so runtime environments behave consistently.', + help: 'Executable + args used to transcribe audio (first token must be a safe binary/path), for example `["whisper-cli", "--model", "small", "{{MediaPath}}"]`. Deprecated `{input}` placeholders are still accepted and migrated to `{{MediaPath}}`.', tags: ["media"], }, "audio.transcription.timeoutSeconds": { diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts index c372ebcb176..4e9a2315847 100644 --- a/src/config/schema.help.ts +++ b/src/config/schema.help.ts @@ -511,7 +511,7 @@ export const FIELD_HELP: Record = { "audio.transcription": "Command-based transcription settings for converting audio files into text before agent handling. 
Keep a simple, deterministic command path here so failures are easy to diagnose in logs.", "audio.transcription.command": - 'Executable + args used to transcribe audio (first token must be a safe binary/path), for example `["whisper-cli", "--model", "small", "{input}"]`. Prefer a pinned command so runtime environments behave consistently.', + 'Executable + args used to transcribe audio (first token must be a safe binary/path), for example `["whisper-cli", "--model", "small", "{{MediaPath}}"]`. Deprecated `{input}` placeholders are still accepted and migrated to `{{MediaPath}}`.', "audio.transcription.timeoutSeconds": "Maximum time allowed for the transcription command to finish before it is aborted. Increase this for longer recordings, and keep it tight in latency-sensitive deployments.", bindings: diff --git a/src/media-understanding/runner.cli-audio.test.ts b/src/media-understanding/runner.cli-audio.test.ts index 6db28691336..fbc30e3dd32 100644 --- a/src/media-understanding/runner.cli-audio.test.ts +++ b/src/media-understanding/runner.cli-audio.test.ts @@ -64,4 +64,35 @@ describe("media-understanding CLI audio entry", () => { expect.any(Object), ); }); + + it("expands legacy {input} aliases to the media path", async () => { + let mediaPath = ""; + await withAudioFixture("openclaw-cli-audio-input-alias", async ({ ctx, cache }) => { + mediaPath = ctx.MediaPath; + await runCliEntry({ + capability: "audio", + entry: { + type: "cli", + command: "mock-transcriber", + args: ["{input}", "--input={input}"], + }, + cfg: { + tools: { + media: { + audio: {}, + }, + }, + } as OpenClawConfig, + ctx, + attachmentIndex: 0, + cache, + }); + }); + + expect(runExecMock).toHaveBeenCalledWith( + "mock-transcriber", + [mediaPath, `--input=${mediaPath}`], + expect.any(Object), + ); + }); }); diff --git a/src/media-understanding/runner.entries.ts b/src/media-understanding/runner.entries.ts index 56d300a4f2e..b14a306aeb2 100644 --- a/src/media-understanding/runner.entries.ts +++ 
/**
 * Expands template placeholders in a single CLI argument.
 *
 * The argument is first run through `applyTemplate` with the given context.
 * Every deprecated `{input}` placeholder still present in that result is then
 * replaced with `ctx.MediaPath`. When `ctx.MediaPath` is unset or empty the
 * templated value is returned as-is, leaving any `{input}` text untouched.
 *
 * Note: the alias pass runs on the already-templated text, so an `{input}`
 * sequence produced by another expanded value is substituted as well.
 *
 * @param part - One CLI argument, possibly containing placeholders.
 * @param ctx - Template context; `MediaPath` supplies the `{input}` value.
 * @returns The argument with placeholders expanded.
 */
function applyCliArgTemplate(part: string, ctx: MsgContext): string {
  const templated = applyTemplate(part, ctx);
  const mediaPath = ctx.MediaPath;
  if (!mediaPath) {
    return templated;
  }
  // A replacer function is used instead of a replacement string so that `$`
  // sequences in the path (`$$`, `$&`, `` $` ``, `$'`) are inserted literally
  // rather than being interpreted as special replacement patterns.
  return templated.replace(/\{input\}/g, () => mediaPath);
}