fix: apply missed media/runtime follow-ups from merged PRs

This commit is contained in:
Peter Steinberger
2026-03-02 21:45:29 +00:00
parent f2b37f0aa9
commit a183656f8f
9 changed files with 205 additions and 9 deletions

View File

@@ -7,6 +7,7 @@ Docs: https://docs.openclaw.ai
### Changes
- Outbound adapters/plugins: add shared `sendPayload` support across direct-text-media, Discord, Slack, WhatsApp, Zalo, and Zalouser with multi-media iteration and chunk-aware text fallback. (#30144) Thanks @nohat.
- Plugin runtime/STT: add `api.runtime.stt.transcribeAudioFile(...)` so extensions can transcribe local audio files through OpenClaw's configured media-understanding audio providers. (#22402) Thanks @benthecarman.
- Sessions/Attachments: add inline file attachment support for `sessions_spawn` (subagent runtime only) with base64/utf8 encoding, transcript content redaction, lifecycle cleanup, and configurable limits via `tools.sessions_spawn.attachments`. (#16761) Thanks @napetrov.
- Tools/PDF analysis: add a first-class `pdf` tool with native Anthropic and Google PDF provider support, extraction fallback for non-native models, configurable defaults (`agents.defaults.pdfModel`, `pdfMaxBytesMb`, `pdfMaxPages`), and docs/tests covering routing, validation, and registration. (#31319) Thanks @tyler6204.
- Zalo Personal plugin (`@openclaw/zalouser`): rebuilt channel runtime to use native `zca-js` integration in-process, removing external CLI transport usage and keeping QR/login + send/listen flows fully inside OpenClaw.
@@ -24,6 +25,11 @@ Docs: https://docs.openclaw.ai
### Fixes
- macOS/LaunchAgent security defaults: write `Umask=63` (octal `077`) into generated gateway launchd plists so post-update service reinstalls keep owner-only file permissions by default instead of falling back to system `022`. (#32022) Fixes #31905. Thanks @liuxiaopai-ai.
- Plugin SDK/runtime hardening: add package export verification in CI/release checks to catch missing runtime exports before publish-time regressions. (#28575) Thanks @Glucksberg.
- Media understanding/provider HTTP proxy routing: pass a proxy-aware fetch function from `HTTPS_PROXY`/`HTTP_PROXY` env vars into audio/video provider calls (with graceful malformed-proxy fallback) so transcription/video requests honor configured outbound proxies. (#27093) Thanks @mcaxtr.
- Media understanding/malformed attachment guards: harden attachment selection and decision summary formatting against non-array or malformed attachment payloads to prevent runtime crashes on invalid inbound metadata shapes. (#28024) Thanks @claw9267.
- Media understanding/audio transcription guard: skip tiny/empty audio files (<1024 bytes) before provider/CLI transcription to avoid noisy invalid-audio failures and preserve clean fallback behavior. (#8388) Thanks @Glucksberg.
- OpenAI media capabilities: include `audio` in the OpenAI provider capability list so audio transcription models are eligible in media-understanding provider selection. (#12717) Thanks @openjay.
- Security/Node exec approvals: preserve shell/dispatch-wrapper argv semantics during approval hardening so approved wrapper commands (for example `env sh -c ...`) cannot drift into a different runtime command shape, and add regression coverage for both approval-plan generation and approved runtime execution paths. Thanks @tdjackey for reporting.
- Browser/Security output boundary hardening: replace check-then-rename output commits with root-bound fd-verified writes, unify install/skills canonical path-boundary checks, and add regression coverage for symlink-rebind race paths across browser output and shared fs-safe write flows. Thanks @tdjackey for reporting.
- Security/Webhook request hardening: enforce auth-before-body parsing for BlueBubbles and Google Chat webhook handlers, add strict pre-auth body/time budgets for webhook auth paths (including LINE signature verification), and add shared in-flight/request guardrails plus regression tests/lint checks to prevent reintroducing unauthenticated slow-body DoS patterns. Thanks @GCXWLP for reporting.

View File

@@ -123,6 +123,7 @@ Recommended defaults:
Rules:
- If media exceeds `maxBytes`, that model is skipped and the **next model is tried**.
- Audio files smaller than **1024 bytes** are treated as empty/corrupt and skipped before provider/CLI transcription.
- If the model returns more than `maxChars`, output is trimmed.
- `prompt` defaults to a simple “Describe the {media}.” plus the `maxChars` guidance (image/video only).
- If `<capability>.enabled: true` but no models are configured, OpenClaw tries the
@@ -160,6 +161,20 @@ To disable auto-detection, set:
Note: Binary detection is best-effort across macOS/Linux/Windows; ensure the CLI is on `PATH` (we expand `~`), or set an explicit CLI model with a full command path.
### Proxy environment support (provider models)
When provider-based **audio** and **video** media understanding is enabled, OpenClaw
honors standard outbound proxy environment variables for provider HTTP calls:
- `HTTPS_PROXY`
- `HTTP_PROXY`
- `https_proxy`
- `http_proxy`
If no proxy env vars are set, media understanding uses direct egress.
If the proxy value is malformed, OpenClaw logs a warning and falls back to direct
fetch.
## Capabilities (optional)
If you set `capabilities`, the entry only runs for those media types. For shared

View File

@@ -90,6 +90,22 @@ Notes:
- Returns PCM audio buffer + sample rate. Plugins must resample/encode for providers.
- Edge TTS is not supported for telephony.
For STT/transcription, plugins can call:
```ts
const { text } = await api.runtime.stt.transcribeAudioFile({
filePath: "/tmp/inbound-audio.ogg",
cfg: api.config,
// Optional when MIME cannot be inferred reliably:
mime: "audio/ogg",
});
```
Notes:
- Uses core media-understanding audio configuration (`tools.media.audio`) and provider fallback order.
- Returns `{ text: undefined }` when no transcription output is produced (for example, skipped or unsupported input).
## Discovery & precedence
OpenClaw scans, in order:

View File

@@ -1,6 +1,6 @@
import { describe, expect, it } from "vitest";
import type { MediaAttachment } from "./types.js";
import { selectAttachments } from "./attachments.js";
import type { MediaAttachment } from "./types.js";
describe("media-understanding selectAttachments guards", () => {
it("does not throw when attachments is undefined", () => {
@@ -26,4 +26,21 @@ describe("media-understanding selectAttachments guards", () => {
expect(run).not.toThrow();
expect(run()).toEqual([]);
});
it("ignores malformed attachment entries inside an array", () => {
const run = () =>
selectAttachments({
capability: "audio",
attachments: [
null,
{ index: 1, path: 123 },
{ index: 2, url: true },
{ index: 3, mime: { nope: true } },
] as unknown as MediaAttachment[],
policy: { prefer: "path" },
});
expect(run).not.toThrow();
expect(run()).toEqual([]);
});
});

View File

@@ -169,7 +169,7 @@ function orderAttachments(
attachments: MediaAttachment[],
prefer?: MediaUnderstandingAttachmentsConfig["prefer"],
): MediaAttachment[] {
const list = Array.isArray(attachments) ? attachments : [];
const list = Array.isArray(attachments) ? attachments.filter(isAttachmentRecord) : [];
if (!prefer || prefer === "first") {
return list;
}
@@ -189,13 +189,36 @@ function orderAttachments(
return list;
}
/**
 * Type guard for a well-formed MediaAttachment record.
 * Requires a numeric `index`; `path`, `url`, and `mime` must be strings when
 * present, and `alreadyTranscribed` must be a boolean when present.
 */
function isAttachmentRecord(value: unknown): value is MediaAttachment {
if (typeof value !== "object" || value === null) {
return false;
}
const candidate = value as Record<string, unknown>;
if (typeof candidate.index !== "number") {
return false;
}
// Optional fields: absent is fine, but a present field must have the right type.
const optionalString = (field: unknown): boolean =>
field === undefined || typeof field === "string";
const optionalBoolean = (field: unknown): boolean =>
field === undefined || typeof field === "boolean";
return (
optionalString(candidate.path) &&
optionalString(candidate.url) &&
optionalString(candidate.mime) &&
optionalBoolean(candidate.alreadyTranscribed)
);
}
export function selectAttachments(params: {
capability: MediaUnderstandingCapability;
attachments: MediaAttachment[];
policy?: MediaUnderstandingAttachmentsConfig;
}): MediaAttachment[] {
const { capability, attachments, policy } = params;
const input = Array.isArray(attachments) ? attachments : [];
const input = Array.isArray(attachments) ? attachments.filter(isAttachmentRecord) : [];
const matches = input.filter((item) => {
// Skip already-transcribed audio attachments from preflight
if (capability === "audio" && item.alreadyTranscribed) {

View File

@@ -1,6 +1,6 @@
import { describe, expect, it } from "vitest";
import type { MediaUnderstandingDecision } from "./types.js";
import { formatDecisionSummary } from "./runner.entries.js";
import type { MediaUnderstandingDecision } from "./types.js";
describe("media-understanding formatDecisionSummary guards", () => {
it("does not throw when decision.attachments is undefined", () => {
@@ -26,4 +26,26 @@ describe("media-understanding formatDecisionSummary guards", () => {
expect(run).not.toThrow();
expect(run()).toBe("video: skipped (0/1)");
});
it("ignores non-string provider/model/reason fields", () => {
const run = () =>
formatDecisionSummary({
capability: "audio",
outcome: "failed",
attachments: [
{
attachmentIndex: 0,
chosen: {
outcome: "failed",
provider: { bad: true },
model: 42,
},
attempts: [{ reason: { malformed: true } }],
},
],
} as unknown as MediaUnderstandingDecision);
expect(run).not.toThrow();
expect(run()).toBe("audio: failed (0/1)");
});
});

View File

@@ -350,15 +350,17 @@ export function formatDecisionSummary(decision: MediaUnderstandingDecision): str
const total = attachments.length;
const success = attachments.filter((entry) => entry?.chosen?.outcome === "success").length;
const chosen = attachments.find((entry) => entry?.chosen)?.chosen;
const provider = chosen?.provider?.trim();
const model = chosen?.model?.trim();
const provider = typeof chosen?.provider === "string" ? chosen.provider.trim() : undefined;
const model = typeof chosen?.model === "string" ? chosen.model.trim() : undefined;
const modelLabel = provider ? (model ? `${provider}/${model}` : provider) : undefined;
const reason = attachments
.flatMap((entry) => {
const attempts = Array.isArray(entry?.attempts) ? entry.attempts : [];
return attempts.map((attempt) => attempt?.reason).filter(Boolean);
return attempts
.map((attempt) => (typeof attempt?.reason === "string" ? attempt.reason : undefined))
.filter((value): value is string => Boolean(value));
})
.find(Boolean);
.find((value) => value.trim().length > 0);
const shortReason = reason ? reason.split(":")[0]?.trim() : undefined;
const countLabel = total > 0 ? ` (${success}/${total})` : "";
const viaLabel = modelLabel ? ` via ${modelLabel}` : "";

View File

@@ -0,0 +1,95 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
import type { OpenClawConfig } from "../config/config.js";
// Mocks for ./runner.js must exist before transcribe-audio.js is imported;
// vi.hoisted lifts their creation above the hoisted vi.mock factory below.
const {
normalizeMediaAttachments,
createMediaAttachmentCache,
buildProviderRegistry,
runCapability,
cacheCleanup,
} = vi.hoisted(() => {
const normalizeMediaAttachments = vi.fn();
// Separate cleanup spy so tests can assert the attachment cache is disposed.
const cacheCleanup = vi.fn(async () => {});
const createMediaAttachmentCache = vi.fn(() => ({ cleanup: cacheCleanup }));
const buildProviderRegistry = vi.fn(() => new Map());
const runCapability = vi.fn();
return {
normalizeMediaAttachments,
createMediaAttachmentCache,
buildProviderRegistry,
runCapability,
cacheCleanup,
};
});
vi.mock("./runner.js", () => ({
normalizeMediaAttachments,
createMediaAttachmentCache,
buildProviderRegistry,
runCapability,
}));
// Imported after vi.mock so the module under test binds to the mocked runner.
import { transcribeAudioFile } from "./transcribe-audio.js";
describe("transcribeAudioFile", () => {
beforeEach(() => {
vi.clearAllMocks();
// clearAllMocks wipes mockResolvedValue; restore the cleanup resolution.
cacheCleanup.mockResolvedValue(undefined);
});
// Regression: omitting `mime` must pass MediaType: undefined through,
// not silently default to audio/wav.
it("does not force audio/wav when mime is omitted", async () => {
normalizeMediaAttachments.mockReturnValue([{ index: 0, path: "/tmp/note.mp3" }]);
runCapability.mockResolvedValue({
outputs: [{ kind: "audio.transcription", text: " hello " }],
});
const result = await transcribeAudioFile({
filePath: "/tmp/note.mp3",
cfg: {} as OpenClawConfig,
});
expect(normalizeMediaAttachments).toHaveBeenCalledWith({
MediaPath: "/tmp/note.mp3",
MediaType: undefined,
});
// Output text is trimmed before being returned.
expect(result).toEqual({ text: "hello" });
expect(cacheCleanup).toHaveBeenCalledTimes(1);
});
// No attachments: short-circuit with { text: undefined } and never
// allocate a cache or invoke the capability runner.
it("returns undefined and skips cache when there are no attachments", async () => {
normalizeMediaAttachments.mockReturnValue([]);
const result = await transcribeAudioFile({
filePath: "/tmp/missing.wav",
cfg: {} as OpenClawConfig,
});
expect(result).toEqual({ text: undefined });
expect(createMediaAttachmentCache).not.toHaveBeenCalled();
expect(runCapability).not.toHaveBeenCalled();
});
// Cache cleanup must run even when runCapability rejects (finally path).
it("always cleans up cache on errors", async () => {
const cfg = {
tools: { media: { audio: { timeoutSeconds: 10 } } },
} as unknown as OpenClawConfig;
normalizeMediaAttachments.mockReturnValue([{ index: 0, path: "/tmp/note.wav" }]);
runCapability.mockRejectedValue(new Error("boom"));
await expect(
transcribeAudioFile({
filePath: "/tmp/note.wav",
cfg,
}),
).rejects.toThrow("boom");
// The configured audio media config is forwarded to the runner.
expect(runCapability).toHaveBeenCalledWith(
expect.objectContaining({
capability: "audio",
cfg,
config: cfg.tools?.media?.audio,
}),
);
expect(cacheCleanup).toHaveBeenCalledTimes(1);
});
});

View File

@@ -23,7 +23,7 @@ export async function transcribeAudioFile(params: {
}): Promise<{ text: string | undefined }> {
const ctx = {
MediaPath: params.filePath,
MediaType: params.mime ?? "audio/wav",
MediaType: params.mime,
};
const attachments = normalizeMediaAttachments(ctx);
if (attachments.length === 0) {