mirror of
https://github.com/openclaw/openclaw.git
synced 2026-03-12 07:20:45 +00:00
fix: apply missed media/runtime follow-ups from merged PRs
This commit is contained in:
@@ -7,6 +7,7 @@ Docs: https://docs.openclaw.ai
|
||||
### Changes
|
||||
|
||||
- Outbound adapters/plugins: add shared `sendPayload` support across direct-text-media, Discord, Slack, WhatsApp, Zalo, and Zalouser with multi-media iteration and chunk-aware text fallback. (#30144) Thanks @nohat.
|
||||
- Plugin runtime/STT: add `api.runtime.stt.transcribeAudioFile(...)` so extensions can transcribe local audio files through OpenClaw's configured media-understanding audio providers. (#22402) Thanks @benthecarman.
|
||||
- Sessions/Attachments: add inline file attachment support for `sessions_spawn` (subagent runtime only) with base64/utf8 encoding, transcript content redaction, lifecycle cleanup, and configurable limits via `tools.sessions_spawn.attachments`. (#16761) Thanks @napetrov.
|
||||
- Tools/PDF analysis: add a first-class `pdf` tool with native Anthropic and Google PDF provider support, extraction fallback for non-native models, configurable defaults (`agents.defaults.pdfModel`, `pdfMaxBytesMb`, `pdfMaxPages`), and docs/tests covering routing, validation, and registration. (#31319) Thanks @tyler6204.
|
||||
- Zalo Personal plugin (`@openclaw/zalouser`): rebuilt channel runtime to use native `zca-js` integration in-process, removing external CLI transport usage and keeping QR/login + send/listen flows fully inside OpenClaw.
|
||||
@@ -24,6 +25,11 @@ Docs: https://docs.openclaw.ai
|
||||
### Fixes
|
||||
|
||||
- macOS/LaunchAgent security defaults: write `Umask=63` (octal `077`) into generated gateway launchd plists so post-update service reinstalls keep owner-only file permissions by default instead of falling back to system `022`. (#32022) Fixes #31905. Thanks @liuxiaopai-ai.
|
||||
- Plugin SDK/runtime hardening: add package export verification in CI/release checks to catch missing runtime exports before publish-time regressions. (#28575) Thanks @Glucksberg.
|
||||
- Media understanding/provider HTTP proxy routing: pass a proxy-aware fetch function from `HTTPS_PROXY`/`HTTP_PROXY` env vars into audio/video provider calls (with graceful malformed-proxy fallback) so transcription/video requests honor configured outbound proxies. (#27093) Thanks @mcaxtr.
|
||||
- Media understanding/malformed attachment guards: harden attachment selection and decision summary formatting against non-array or malformed attachment payloads to prevent runtime crashes on invalid inbound metadata shapes. (#28024) Thanks @claw9267.
|
||||
- Media understanding/audio transcription guard: skip tiny/empty audio files (<1024 bytes) before provider/CLI transcription to avoid noisy invalid-audio failures and preserve clean fallback behavior. (#8388) Thanks @Glucksberg.
|
||||
- OpenAI media capabilities: include `audio` in the OpenAI provider capability list so audio transcription models are eligible in media-understanding provider selection. (#12717) Thanks @openjay.
|
||||
- Security/Node exec approvals: preserve shell/dispatch-wrapper argv semantics during approval hardening so approved wrapper commands (for example `env sh -c ...`) cannot drift into a different runtime command shape, and add regression coverage for both approval-plan generation and approved runtime execution paths. Thanks @tdjackey for reporting.
|
||||
- Browser/Security output boundary hardening: replace check-then-rename output commits with root-bound fd-verified writes, unify install/skills canonical path-boundary checks, and add regression coverage for symlink-rebind race paths across browser output and shared fs-safe write flows. Thanks @tdjackey for reporting.
|
||||
- Security/Webhook request hardening: enforce auth-before-body parsing for BlueBubbles and Google Chat webhook handlers, add strict pre-auth body/time budgets for webhook auth paths (including LINE signature verification), and add shared in-flight/request guardrails plus regression tests/lint checks to prevent reintroducing unauthenticated slow-body DoS patterns. Thanks @GCXWLP for reporting.
|
||||
|
||||
@@ -123,6 +123,7 @@ Recommended defaults:
|
||||
Rules:
|
||||
|
||||
- If media exceeds `maxBytes`, that model is skipped and the **next model is tried**.
|
||||
- Audio files smaller than **1024 bytes** are treated as empty/corrupt and skipped before provider/CLI transcription.
|
||||
- If the model returns more than `maxChars`, output is trimmed.
|
||||
- `prompt` defaults to a simple “Describe the {media}.” plus the `maxChars` guidance (image/video only).
|
||||
- If `<capability>.enabled: true` but no models are configured, OpenClaw tries the
|
||||
@@ -160,6 +161,20 @@ To disable auto-detection, set:
|
||||
|
||||
Note: Binary detection is best-effort across macOS/Linux/Windows; ensure the CLI is on `PATH` (we expand `~`), or set an explicit CLI model with a full command path.
|
||||
|
||||
### Proxy environment support (provider models)
|
||||
|
||||
When provider-based **audio** and **video** media understanding is enabled, OpenClaw honors standard outbound proxy environment variables for provider HTTP calls:
|
||||
|
||||
- `HTTPS_PROXY`
|
||||
- `HTTP_PROXY`
|
||||
- `https_proxy`
|
||||
- `http_proxy`
|
||||
|
||||
If no proxy env vars are set, media understanding uses direct egress.
|
||||
If the proxy value is malformed, OpenClaw logs a warning and falls back to direct fetch.
|
||||
|
||||
## Capabilities (optional)
|
||||
|
||||
If you set `capabilities`, the entry only runs for those media types. For shared
|
||||
|
||||
@@ -90,6 +90,22 @@ Notes:
|
||||
- Returns PCM audio buffer + sample rate. Plugins must resample/encode for providers.
|
||||
- Edge TTS is not supported for telephony.
|
||||
|
||||
For STT/transcription, plugins can call:
|
||||
|
||||
```ts
const { text } = await api.runtime.stt.transcribeAudioFile({
  filePath: "/tmp/inbound-audio.ogg",
  cfg: api.config,
  // Optional when MIME cannot be inferred reliably:
  mime: "audio/ogg",
});
```
|
||||
|
||||
Notes:
|
||||
|
||||
- Uses core media-understanding audio configuration (`tools.media.audio`) and provider fallback order.
|
||||
- Returns `{ text: undefined }` when no transcription output is produced (for example skipped/unsupported input).
|
||||
|
||||
## Discovery & precedence
|
||||
|
||||
OpenClaw scans, in order:
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import type { MediaAttachment } from "./types.js";
|
||||
import { selectAttachments } from "./attachments.js";
|
||||
import type { MediaAttachment } from "./types.js";
|
||||
|
||||
describe("media-understanding selectAttachments guards", () => {
|
||||
it("does not throw when attachments is undefined", () => {
|
||||
@@ -26,4 +26,21 @@ describe("media-understanding selectAttachments guards", () => {
|
||||
expect(run).not.toThrow();
|
||||
expect(run()).toEqual([]);
|
||||
});
|
||||
|
||||
it("ignores malformed attachment entries inside an array", () => {
|
||||
const run = () =>
|
||||
selectAttachments({
|
||||
capability: "audio",
|
||||
attachments: [
|
||||
null,
|
||||
{ index: 1, path: 123 },
|
||||
{ index: 2, url: true },
|
||||
{ index: 3, mime: { nope: true } },
|
||||
] as unknown as MediaAttachment[],
|
||||
policy: { prefer: "path" },
|
||||
});
|
||||
|
||||
expect(run).not.toThrow();
|
||||
expect(run()).toEqual([]);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -169,7 +169,7 @@ function orderAttachments(
|
||||
attachments: MediaAttachment[],
|
||||
prefer?: MediaUnderstandingAttachmentsConfig["prefer"],
|
||||
): MediaAttachment[] {
|
||||
const list = Array.isArray(attachments) ? attachments : [];
|
||||
const list = Array.isArray(attachments) ? attachments.filter(isAttachmentRecord) : [];
|
||||
if (!prefer || prefer === "first") {
|
||||
return list;
|
||||
}
|
||||
@@ -189,13 +189,36 @@ function orderAttachments(
|
||||
return list;
|
||||
}
|
||||
|
||||
/**
 * Runtime guard for attachment entries that come from untyped inbound metadata.
 *
 * An entry is accepted only when it is a non-null, non-array object whose
 * `index` is a finite number and whose optional fields, when present, have the
 * expected primitive type: `path`, `url`, `mime` must be strings and
 * `alreadyTranscribed` must be a boolean. Absent (`undefined`) optional fields
 * are allowed.
 *
 * @param value - Candidate entry of unknown shape.
 * @returns `true` when `value` is structurally usable as a `MediaAttachment`.
 */
function isAttachmentRecord(value: unknown): value is MediaAttachment {
  // Arrays are objects too (and e.g. RegExp match arrays even carry a numeric
  // `index`), so reject them explicitly.
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
    return false;
  }
  const entry = value as Record<string, unknown>;

  // NaN/Infinity satisfy `typeof === "number"` but cannot identify an attachment.
  if (typeof entry.index !== "number" || !Number.isFinite(entry.index)) {
    return false;
  }

  const optionalFields: ReadonlyArray<readonly [string, "string" | "boolean"]> = [
    ["path", "string"],
    ["url", "string"],
    ["mime", "string"],
    ["alreadyTranscribed", "boolean"],
  ];
  return optionalFields.every(([key, expectedType]) => {
    const field = entry[key];
    return field === undefined || typeof field === expectedType;
  });
}
|
||||
|
||||
export function selectAttachments(params: {
|
||||
capability: MediaUnderstandingCapability;
|
||||
attachments: MediaAttachment[];
|
||||
policy?: MediaUnderstandingAttachmentsConfig;
|
||||
}): MediaAttachment[] {
|
||||
const { capability, attachments, policy } = params;
|
||||
const input = Array.isArray(attachments) ? attachments : [];
|
||||
const input = Array.isArray(attachments) ? attachments.filter(isAttachmentRecord) : [];
|
||||
const matches = input.filter((item) => {
|
||||
// Skip already-transcribed audio attachments from preflight
|
||||
if (capability === "audio" && item.alreadyTranscribed) {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import type { MediaUnderstandingDecision } from "./types.js";
|
||||
import { formatDecisionSummary } from "./runner.entries.js";
|
||||
import type { MediaUnderstandingDecision } from "./types.js";
|
||||
|
||||
describe("media-understanding formatDecisionSummary guards", () => {
|
||||
it("does not throw when decision.attachments is undefined", () => {
|
||||
@@ -26,4 +26,26 @@ describe("media-understanding formatDecisionSummary guards", () => {
|
||||
expect(run).not.toThrow();
|
||||
expect(run()).toBe("video: skipped (0/1)");
|
||||
});
|
||||
|
||||
it("ignores non-string provider/model/reason fields", () => {
|
||||
const run = () =>
|
||||
formatDecisionSummary({
|
||||
capability: "audio",
|
||||
outcome: "failed",
|
||||
attachments: [
|
||||
{
|
||||
attachmentIndex: 0,
|
||||
chosen: {
|
||||
outcome: "failed",
|
||||
provider: { bad: true },
|
||||
model: 42,
|
||||
},
|
||||
attempts: [{ reason: { malformed: true } }],
|
||||
},
|
||||
],
|
||||
} as unknown as MediaUnderstandingDecision);
|
||||
|
||||
expect(run).not.toThrow();
|
||||
expect(run()).toBe("audio: failed (0/1)");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -350,15 +350,17 @@ export function formatDecisionSummary(decision: MediaUnderstandingDecision): str
|
||||
const total = attachments.length;
|
||||
const success = attachments.filter((entry) => entry?.chosen?.outcome === "success").length;
|
||||
const chosen = attachments.find((entry) => entry?.chosen)?.chosen;
|
||||
const provider = chosen?.provider?.trim();
|
||||
const model = chosen?.model?.trim();
|
||||
const provider = typeof chosen?.provider === "string" ? chosen.provider.trim() : undefined;
|
||||
const model = typeof chosen?.model === "string" ? chosen.model.trim() : undefined;
|
||||
const modelLabel = provider ? (model ? `${provider}/${model}` : provider) : undefined;
|
||||
const reason = attachments
|
||||
.flatMap((entry) => {
|
||||
const attempts = Array.isArray(entry?.attempts) ? entry.attempts : [];
|
||||
return attempts.map((attempt) => attempt?.reason).filter(Boolean);
|
||||
return attempts
|
||||
.map((attempt) => (typeof attempt?.reason === "string" ? attempt.reason : undefined))
|
||||
.filter((value): value is string => Boolean(value));
|
||||
})
|
||||
.find(Boolean);
|
||||
.find((value) => value.trim().length > 0);
|
||||
const shortReason = reason ? reason.split(":")[0]?.trim() : undefined;
|
||||
const countLabel = total > 0 ? ` (${success}/${total})` : "";
|
||||
const viaLabel = modelLabel ? ` via ${modelLabel}` : "";
|
||||
|
||||
95
src/media-understanding/transcribe-audio.test.ts
Normal file
95
src/media-understanding/transcribe-audio.test.ts
Normal file
@@ -0,0 +1,95 @@
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
|
||||
// Mock handles are created inside vi.hoisted() so they already exist when the
// vi.mock("./runner.js") factory in this file is evaluated (vitest hoists
// vi.mock calls above the module's imports).
const {
  normalizeMediaAttachments,
  createMediaAttachmentCache,
  buildProviderRegistry,
  runCapability,
  cacheCleanup,
} = vi.hoisted(() => {
  const normalizeMediaAttachments = vi.fn();
  // Shared cleanup spy so tests can assert the cache is always released.
  const cacheCleanup = vi.fn(async () => {});
  const createMediaAttachmentCache = vi.fn(() => ({ cleanup: cacheCleanup }));
  const buildProviderRegistry = vi.fn(() => new Map());
  const runCapability = vi.fn();
  return {
    normalizeMediaAttachments,
    createMediaAttachmentCache,
    buildProviderRegistry,
    runCapability,
    cacheCleanup,
  };
});
|
||||
|
||||
// Replace the runner module's exports with the hoisted mocks so the module
// under test receives these spies instead of the real runner functions.
vi.mock("./runner.js", () => ({
  normalizeMediaAttachments,
  createMediaAttachmentCache,
  buildProviderRegistry,
  runCapability,
}));
|
||||
|
||||
import { transcribeAudioFile } from "./transcribe-audio.js";
|
||||
|
||||
describe("transcribeAudioFile", () => {
  beforeEach(() => {
    // Drop recorded calls from the previous test and re-arm the cleanup spy.
    vi.clearAllMocks();
    cacheCleanup.mockResolvedValue(undefined);
  });

  // When the caller omits `mime`, MediaType must be passed through as
  // undefined rather than defaulted, and the transcript text comes back trimmed.
  it("does not force audio/wav when mime is omitted", async () => {
    normalizeMediaAttachments.mockReturnValue([{ index: 0, path: "/tmp/note.mp3" }]);
    runCapability.mockResolvedValue({
      outputs: [{ kind: "audio.transcription", text: " hello " }],
    });

    const result = await transcribeAudioFile({
      filePath: "/tmp/note.mp3",
      cfg: {} as OpenClawConfig,
    });

    expect(normalizeMediaAttachments).toHaveBeenCalledWith({
      MediaPath: "/tmp/note.mp3",
      MediaType: undefined,
    });
    expect(result).toEqual({ text: "hello" });
    expect(cacheCleanup).toHaveBeenCalledTimes(1);
  });

  // With no normalized attachments the function returns early: no cache is
  // created and no capability run is attempted.
  it("returns undefined and skips cache when there are no attachments", async () => {
    normalizeMediaAttachments.mockReturnValue([]);

    const result = await transcribeAudioFile({
      filePath: "/tmp/missing.wav",
      cfg: {} as OpenClawConfig,
    });

    expect(result).toEqual({ text: undefined });
    expect(createMediaAttachmentCache).not.toHaveBeenCalled();
    expect(runCapability).not.toHaveBeenCalled();
  });

  // A rejected capability run must propagate to the caller while the cache
  // cleanup still runs exactly once.
  it("always cleans up cache on errors", async () => {
    const cfg = {
      tools: { media: { audio: { timeoutSeconds: 10 } } },
    } as unknown as OpenClawConfig;
    normalizeMediaAttachments.mockReturnValue([{ index: 0, path: "/tmp/note.wav" }]);
    runCapability.mockRejectedValue(new Error("boom"));

    await expect(
      transcribeAudioFile({
        filePath: "/tmp/note.wav",
        cfg,
      }),
    ).rejects.toThrow("boom");

    // The audio section of the config is what gets forwarded as `config`.
    expect(runCapability).toHaveBeenCalledWith(
      expect.objectContaining({
        capability: "audio",
        cfg,
        config: cfg.tools?.media?.audio,
      }),
    );
    expect(cacheCleanup).toHaveBeenCalledTimes(1);
  });
});
|
||||
@@ -23,7 +23,7 @@ export async function transcribeAudioFile(params: {
|
||||
}): Promise<{ text: string | undefined }> {
|
||||
const ctx = {
|
||||
MediaPath: params.filePath,
|
||||
MediaType: params.mime ?? "audio/wav",
|
||||
MediaType: params.mime,
|
||||
};
|
||||
const attachments = normalizeMediaAttachments(ctx);
|
||||
if (attachments.length === 0) {
|
||||
|
||||
Reference in New Issue
Block a user