refactor: extract media understanding common package (#88297)

* refactor: extract media understanding common package

* test: move media understanding format test
This commit is contained in:
Peter Steinberger
2026-05-30 12:40:49 +02:00
committed by GitHub
parent b13fb788b5
commit 8b92aca27f
50 changed files with 795 additions and 297 deletions

View File

@@ -0,0 +1,7 @@
//#region packages/media-understanding-common/src/active-model.d.ts
/** Names a media model: `provider` is required, `model` is optional. */
type ActiveMediaModel = {
provider: string;
model?: string;
};
//#endregion
export { ActiveMediaModel };

View File

@@ -0,0 +1 @@
export {};

View File

@@ -0,0 +1,14 @@
import { MediaUnderstandingCapability } from "./types.mjs";
//#region packages/media-understanding-common/src/defaults.d.ts
/** Default maximum character count; the literal type pins the value to 500. */
declare const DEFAULT_MAX_CHARS = 500;
/** Per-capability maximum character counts; an entry may be `undefined`. */
declare const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record<MediaUnderstandingCapability, number | undefined>;
/** Per-capability maximum byte counts. */
declare const DEFAULT_MAX_BYTES: Record<MediaUnderstandingCapability, number>;
/** Per-capability timeouts, expressed in seconds per the constant's name. */
declare const DEFAULT_TIMEOUT_SECONDS: Record<MediaUnderstandingCapability, number>;
/** Per-capability default prompt strings. */
declare const DEFAULT_PROMPT: Record<MediaUnderstandingCapability, string>;
/** Upper bound used by `resolveVideoMaxBase64Bytes` (see video.d.ts) — value not visible in this declaration. */
declare const DEFAULT_VIDEO_MAX_BASE64_BYTES: number;
/** Numeric buffer limit; NOTE(review): presumably passed as a child-process `maxBuffer` — confirm at call sites. */
declare const CLI_OUTPUT_MAX_BUFFER: number;
/** Default media concurrency; the literal type pins the value to 2. */
declare const DEFAULT_MEDIA_CONCURRENCY = 2;
/** Minimum audio file size in bytes; the literal type pins the value to 1024. */
declare const MIN_AUDIO_FILE_BYTES = 1024;
//#endregion
export { CLI_OUTPUT_MAX_BUFFER, DEFAULT_MAX_BYTES, DEFAULT_MAX_CHARS, DEFAULT_MAX_CHARS_BY_CAPABILITY, DEFAULT_MEDIA_CONCURRENCY, DEFAULT_PROMPT, DEFAULT_TIMEOUT_SECONDS, DEFAULT_VIDEO_MAX_BASE64_BYTES, MIN_AUDIO_FILE_BYTES };

View File

@@ -0,0 +1,29 @@
//#region packages/media-understanding-common/src/defaults.ts
// Bytes in one mebibyte; the size limits below are written as multiples of it.
const MB = 1024 * 1024;
// Default maximum character count.
const DEFAULT_MAX_CHARS = 500;
// Per-capability maximum character counts; audio has no value (undefined).
const DEFAULT_MAX_CHARS_BY_CAPABILITY = {
image: 500,
audio: void 0,
video: 500
};
// Per-capability maximum byte counts: 10 MB image, 20 MB audio, 50 MB video.
const DEFAULT_MAX_BYTES = {
image: 10 * MB,
audio: 20 * MB,
video: 50 * MB
};
// Per-capability timeouts, in seconds per the constant's name.
const DEFAULT_TIMEOUT_SECONDS = {
image: 60,
audio: 60,
video: 120
};
// Per-capability default prompt strings.
const DEFAULT_PROMPT = {
image: "Describe the image.",
audio: "Transcribe the audio.",
video: "Describe the video."
};
// 70 MB; used as the cap in resolveVideoMaxBase64Bytes (video.ts).
const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
// 5 MB; NOTE(review): presumably a child-process output buffer limit — confirm at call sites.
const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
// Default media concurrency.
const DEFAULT_MEDIA_CONCURRENCY = 2;
// Minimum audio file size, in bytes per the constant's name.
const MIN_AUDIO_FILE_BYTES = 1024;
//#endregion
export { CLI_OUTPUT_MAX_BUFFER, DEFAULT_MAX_BYTES, DEFAULT_MAX_CHARS, DEFAULT_MAX_CHARS_BY_CAPABILITY, DEFAULT_MEDIA_CONCURRENCY, DEFAULT_PROMPT, DEFAULT_TIMEOUT_SECONDS, DEFAULT_VIDEO_MAX_BASE64_BYTES, MIN_AUDIO_FILE_BYTES };

View File

@@ -0,0 +1,9 @@
//#region packages/media-understanding-common/src/errors.d.ts
/** Closed set of reason codes carried by `MediaUnderstandingSkipError`. */
type MediaUnderstandingSkipReason = "maxBytes" | "timeout" | "unsupported" | "empty" | "blocked" | "tooSmall";
/** Error subclass that carries a `reason` code alongside the message. */
declare class MediaUnderstandingSkipError extends Error {
readonly reason: MediaUnderstandingSkipReason;
constructor(reason: MediaUnderstandingSkipReason, message: string);
}
/** Type guard narrowing an unknown value to `MediaUnderstandingSkipError`. */
declare function isMediaUnderstandingSkipError(err: unknown): err is MediaUnderstandingSkipError;
//#endregion
export { MediaUnderstandingSkipError, isMediaUnderstandingSkipError };

View File

@@ -0,0 +1,13 @@
//#region packages/media-understanding-common/src/errors.ts
var MediaUnderstandingSkipError = class extends Error {
constructor(reason, message) {
super(message);
this.reason = reason;
this.name = "MediaUnderstandingSkipError";
}
};
function isMediaUnderstandingSkipError(err) {
return err instanceof MediaUnderstandingSkipError;
}
//#endregion
export { MediaUnderstandingSkipError, isMediaUnderstandingSkipError };

View File

@@ -0,0 +1,11 @@
import { MediaUnderstandingOutput } from "./types.mjs";
//#region packages/media-understanding-common/src/format.d.ts
/** Returns the user-written portion of `body`, or `undefined` when there is none. */
declare function extractMediaUserText(body?: string): string | undefined;
/** Builds a single text body from the original `body` and the given outputs. */
declare function formatMediaUnderstandingBody(params: {
body?: string;
outputs: MediaUnderstandingOutput[];
}): string;
/** Renders the given outputs' text as one string. */
declare function formatAudioTranscripts(outputs: MediaUnderstandingOutput[]): string;
//#endregion
export { extractMediaUserText, formatAudioTranscripts, formatMediaUnderstandingBody };

View File

@@ -0,0 +1,47 @@
//#region packages/media-understanding-common/src/format.ts
// Matches a body that is nothing but a `<media:...>` token with an optional "(...)" suffix.
const MEDIA_PLACEHOLDER_RE = /^<media:[^>]+>(\s*\([^)]*\))?$/i;
// Matches one leading `<media:...>` token (plus optional "(...)" and trailing whitespace).
const MEDIA_PLACEHOLDER_TOKEN_RE = /^<media:[^>]+>(\s*\([^)]*\))?\s*/i;

/**
 * Extracts the text that remains after removing a leading media placeholder.
 *
 * @param body Raw body; may be missing.
 * @returns The trimmed remainder, or `undefined` when the body is missing,
 *          blank, only a placeholder, or empty once the placeholder is removed.
 */
function extractMediaUserText(body) {
  const candidate = (body ?? "").trim();
  const nothingToKeep = candidate === "" || MEDIA_PLACEHOLDER_RE.test(candidate);
  if (nothingToKeep) return undefined;
  const remainder = candidate.replace(MEDIA_PLACEHOLDER_TOKEN_RE, "").trim();
  return remainder === "" ? undefined : remainder;
}
/**
 * Renders one bracketed section.
 *
 * Output lines: `[title]`, then an optional "User text:" pair when `userText`
 * is truthy, then `kind:` followed by `text`. Lines are joined with "\n".
 */
function formatSection(title, kind, text, userText) {
  const header = `[${title}]`;
  const payload = `${kind}:\n${text}`;
  const parts = userText ? [header, `User text:\n${userText}`, payload] : [header, payload];
  return parts.join("\n");
}
/**
 * Combines the original body with media-understanding outputs.
 *
 * - Outputs whose text is blank are ignored; with none left, `params.body`
 *   (or "") is returned unchanged.
 * - With exactly one output, the user text is placed inside that section.
 * - With several outputs, the user text becomes a leading "User text:" section.
 * - Sections of a kind that occurs more than once are numbered "i/n".
 * - Any kind other than audio transcription or image description is
 *   labelled "Video".
 *
 * @returns Sections joined by a blank line, trimmed.
 */
function formatMediaUnderstandingBody(params) {
  const usable = params.outputs.filter((entry) => entry.text.trim());
  if (usable.length === 0) return params.body ?? "";
  const userText = extractMediaUserText(params.body);
  const single = usable.length === 1;

  // First pass: how many outputs exist per kind.
  const totals = /* @__PURE__ */ new Map();
  for (const { kind } of usable) totals.set(kind, (totals.get(kind) ?? 0) + 1);

  // Second pass: render each output with its running position within its kind.
  const progress = /* @__PURE__ */ new Map();
  const rendered = usable.map((entry) => {
    const total = totals.get(entry.kind) ?? 1;
    const position = (progress.get(entry.kind) ?? 0) + 1;
    progress.set(entry.kind, position);
    const suffix = total > 1 ? ` ${position}/${total}` : "";
    const isAudio = entry.kind === "audio.transcription";
    const label = isAudio ? "Audio" : entry.kind === "image.description" ? "Image" : "Video";
    const heading = isAudio ? "Transcript" : "Description";
    return formatSection(`${label}${suffix}`, heading, entry.text, single ? userText : void 0);
  });

  const sections = userText && !single ? [`User text:\n${userText}`, ...rendered] : rendered;
  return sections.join("\n\n").trim();
}
function formatAudioTranscripts(outputs) {
if (outputs.length === 1) return outputs[0].text;
return outputs.map((output, index) => `Audio ${index + 1}:\n${output.text}`).join("\n\n");
}
//#endregion
export { extractMediaUserText, formatAudioTranscripts, formatMediaUnderstandingBody };

View File

@@ -0,0 +1,11 @@
import { ActiveMediaModel } from "./active-model.mjs";
import { MediaAttachment, MediaUnderstandingCapability, MediaUnderstandingCapabilityRegistry, MediaUnderstandingKind, MediaUnderstandingOutput, MediaUnderstandingProvider } from "./types.mjs";
import { CLI_OUTPUT_MAX_BUFFER, DEFAULT_MAX_BYTES, DEFAULT_MAX_CHARS, DEFAULT_MAX_CHARS_BY_CAPABILITY, DEFAULT_MEDIA_CONCURRENCY, DEFAULT_PROMPT, DEFAULT_TIMEOUT_SECONDS, DEFAULT_VIDEO_MAX_BASE64_BYTES, MIN_AUDIO_FILE_BYTES } from "./defaults.mjs";
import { MediaUnderstandingSkipError, isMediaUnderstandingSkipError } from "./errors.mjs";
import { extractMediaUserText, formatAudioTranscripts, formatMediaUnderstandingBody } from "./format.mjs";
import { OpenAiCompatibleVideoPayload, buildOpenAiCompatibleVideoRequestBody, coerceOpenAiCompatibleVideoText, resolveMediaUnderstandingString } from "./openai-compatible-video.mjs";
import { extractGeminiResponse } from "./output-extract.mjs";
import { normalizeMediaExecutionProviderId, normalizeMediaProviderId } from "./provider-id.mjs";
import { providerSupportsCapability } from "./provider-supports.mjs";
import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.mjs";
export { ActiveMediaModel, CLI_OUTPUT_MAX_BUFFER, DEFAULT_MAX_BYTES, DEFAULT_MAX_CHARS, DEFAULT_MAX_CHARS_BY_CAPABILITY, DEFAULT_MEDIA_CONCURRENCY, DEFAULT_PROMPT, DEFAULT_TIMEOUT_SECONDS, DEFAULT_VIDEO_MAX_BASE64_BYTES, MIN_AUDIO_FILE_BYTES, MediaAttachment, MediaUnderstandingCapability, MediaUnderstandingCapabilityRegistry, MediaUnderstandingKind, MediaUnderstandingOutput, MediaUnderstandingProvider, MediaUnderstandingSkipError, OpenAiCompatibleVideoPayload, buildOpenAiCompatibleVideoRequestBody, coerceOpenAiCompatibleVideoText, estimateBase64Size, extractGeminiResponse, extractMediaUserText, formatAudioTranscripts, formatMediaUnderstandingBody, isMediaUnderstandingSkipError, normalizeMediaExecutionProviderId, normalizeMediaProviderId, providerSupportsCapability, resolveMediaUnderstandingString, resolveVideoMaxBase64Bytes };

View File

@@ -0,0 +1,11 @@
import "./active-model.mjs";
import { CLI_OUTPUT_MAX_BUFFER, DEFAULT_MAX_BYTES, DEFAULT_MAX_CHARS, DEFAULT_MAX_CHARS_BY_CAPABILITY, DEFAULT_MEDIA_CONCURRENCY, DEFAULT_PROMPT, DEFAULT_TIMEOUT_SECONDS, DEFAULT_VIDEO_MAX_BASE64_BYTES, MIN_AUDIO_FILE_BYTES } from "./defaults.mjs";
import { MediaUnderstandingSkipError, isMediaUnderstandingSkipError } from "./errors.mjs";
import { extractMediaUserText, formatAudioTranscripts, formatMediaUnderstandingBody } from "./format.mjs";
import { buildOpenAiCompatibleVideoRequestBody, coerceOpenAiCompatibleVideoText, resolveMediaUnderstandingString } from "./openai-compatible-video.mjs";
import { extractGeminiResponse } from "./output-extract.mjs";
import { normalizeMediaExecutionProviderId, normalizeMediaProviderId } from "./provider-id.mjs";
import { providerSupportsCapability } from "./provider-supports.mjs";
import "./types.mjs";
import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.mjs";
export { CLI_OUTPUT_MAX_BUFFER, DEFAULT_MAX_BYTES, DEFAULT_MAX_CHARS, DEFAULT_MAX_CHARS_BY_CAPABILITY, DEFAULT_MEDIA_CONCURRENCY, DEFAULT_PROMPT, DEFAULT_TIMEOUT_SECONDS, DEFAULT_VIDEO_MAX_BASE64_BYTES, MIN_AUDIO_FILE_BYTES, MediaUnderstandingSkipError, buildOpenAiCompatibleVideoRequestBody, coerceOpenAiCompatibleVideoText, estimateBase64Size, extractGeminiResponse, extractMediaUserText, formatAudioTranscripts, formatMediaUnderstandingBody, isMediaUnderstandingSkipError, normalizeMediaExecutionProviderId, normalizeMediaProviderId, providerSupportsCapability, resolveMediaUnderstandingString, resolveVideoMaxBase64Bytes };

View File

@@ -0,0 +1,37 @@
//#region packages/media-understanding-common/src/openai-compatible-video.d.ts
/**
 * Subset of a chat-completion style response read by
 * `coerceOpenAiCompatibleVideoText`: every level is optional.
 */
type OpenAiCompatibleVideoPayload = {
choices?: Array<{
message?: {
content?: string | Array<{
text?: string;
}>;
reasoning_content?: string;
};
}>;
};
/** Returns a string derived from `value`, or `fallback`. */
declare function resolveMediaUnderstandingString(value: string | undefined, fallback: string): string;
/** Extracts text from the payload, or `null` when none is found. */
declare function coerceOpenAiCompatibleVideoText(payload: OpenAiCompatibleVideoPayload): string | null;
/**
 * Builds a request body with one message whose content holds a text part
 * and a `video_url` part (return type is inferred from the implementation).
 */
declare function buildOpenAiCompatibleVideoRequestBody(params: {
model: string;
prompt: string;
mime: string;
buffer: Buffer;
}): {
model: string;
messages: {
role: string;
content: ({
type: string;
text: string;
video_url?: undefined;
} | {
type: string;
video_url: {
url: string;
};
text?: undefined;
})[];
}[];
};
//#endregion
export { OpenAiCompatibleVideoPayload, buildOpenAiCompatibleVideoRequestBody, coerceOpenAiCompatibleVideoText, resolveMediaUnderstandingString };

View File

@@ -0,0 +1,32 @@
//#region packages/media-understanding-common/src/openai-compatible-video.ts
/**
 * Returns `value` trimmed, or `fallback` when `value` is missing or
 * trims down to an empty string.
 */
function resolveMediaUnderstandingString(value, fallback) {
  const cleaned = value == null ? "" : value.trim();
  return cleaned === "" ? fallback : cleaned;
}
function coerceOpenAiCompatibleVideoText(payload) {
const message = payload.choices?.[0]?.message;
if (!message) return null;
if (typeof message.content === "string" && message.content.trim()) return message.content.trim();
if (Array.isArray(message.content)) {
const text = message.content.map((part) => part.text?.trim() ?? "").filter(Boolean).join("\n");
if (text) return text;
}
if (typeof message.reasoning_content === "string" && message.reasoning_content.trim()) return message.reasoning_content.trim();
return null;
}
function buildOpenAiCompatibleVideoRequestBody(params) {
return {
model: params.model,
messages: [{
role: "user",
content: [{
type: "text",
text: params.prompt
}, {
type: "video_url",
video_url: { url: `data:${params.mime};base64,${params.buffer.toString("base64")}` }
}]
}]
};
}
//#endregion
export { buildOpenAiCompatibleVideoRequestBody, coerceOpenAiCompatibleVideoText, resolveMediaUnderstandingString };

View File

@@ -0,0 +1,4 @@
//#region packages/media-understanding-common/src/output-extract.d.ts
/** Extracts a response string from raw text, or returns `null` when none is found. */
declare function extractGeminiResponse(raw: string): string | null;
//#endregion
export { extractGeminiResponse };

View File

@@ -0,0 +1,21 @@
//#region packages/media-understanding-common/src/output-extract.ts
/**
 * Parses the JSON document that ends the (trimmed) input.
 *
 * Candidate start positions are the `{` characters, tried from the last one
 * backwards; the first slice that parses as JSON wins. Trying only the very
 * last `{` fails whenever the document contains a nested object or a `{`
 * inside a string, because that brace does not open the top-level document.
 * Inner braces never parse on their own here: their slice still carries the
 * outer document's closing characters.
 *
 * @returns The parsed value, or `null` when no slice parses.
 */
function extractLastJsonObject(raw) {
  const trimmed = raw.trim();
  let start = trimmed.lastIndexOf("{");
  while (start !== -1) {
    try {
      return JSON.parse(trimmed.slice(start));
    } catch {
      // Not a complete document from this brace; fall through to an earlier one.
    }
    // lastIndexOf clamps a negative fromIndex to 0, so stop explicitly at the first character.
    if (start === 0) break;
    start = trimmed.lastIndexOf("{", start - 1);
  }
  return null;
}

/**
 * Reads the `response` string from the trailing JSON object of `raw`.
 *
 * @returns The trimmed `response`, or `null` when there is no trailing JSON
 *          object, `response` is not a string, or it is blank.
 */
function extractGeminiResponse(raw) {
  const payload = extractLastJsonObject(raw);
  if (!payload || typeof payload !== "object") return null;
  const response = payload.response;
  if (typeof response !== "string") return null;
  return response.trim() || null;
}
//#endregion
export { extractGeminiResponse };

View File

@@ -0,0 +1,5 @@
//#region packages/media-understanding-common/src/provider-id.d.ts
/** Returns a normalized form of the given provider id. */
declare function normalizeMediaProviderId(id: string): string;
/** Returns a normalized form of the given provider id for execution; see provider-id.ts for how it differs. */
declare function normalizeMediaExecutionProviderId(id: string): string;
//#endregion
export { normalizeMediaExecutionProviderId, normalizeMediaProviderId };

View File

@@ -0,0 +1,18 @@
//#region packages/media-understanding-common/src/provider-id.ts
/** Trims and lower-cases a provider id. */
function normalizeProviderId(provider) {
  const lowered = provider.trim().toLowerCase();
  return lowered;
}

/**
 * Normalizes a provider id and folds aliases:
 * "gemini" -> "google", "minimax-cn" -> "minimax",
 * "minimax-portal-cn" -> "minimax-portal". Other ids pass through normalized.
 */
function normalizeMediaProviderId(id) {
  const key = normalizeProviderId(id);
  switch (key) {
    case "gemini":
      return "google";
    case "minimax-cn":
      return "minimax";
    case "minimax-portal-cn":
      return "minimax-portal";
    default:
      return key;
  }
}

/**
 * Like normalizeMediaProviderId, except that "minimax-cn" and
 * "minimax-portal-cn" are returned as-is (normalized) rather than folded.
 */
function normalizeMediaExecutionProviderId(id) {
  const key = normalizeProviderId(id);
  const keepsOwnId = ["minimax-cn", "minimax-portal-cn"].includes(key);
  return keepsOwnId ? key : normalizeMediaProviderId(key);
}
//#endregion
export { normalizeMediaExecutionProviderId, normalizeMediaProviderId };

View File

@@ -0,0 +1,6 @@
import { MediaUnderstandingCapability, MediaUnderstandingProvider } from "./types.mjs";
//#region packages/media-understanding-common/src/provider-supports.d.ts
/** Reports whether `provider` supports `capability`; `provider` may be `undefined`. */
declare function providerSupportsCapability(provider: MediaUnderstandingProvider | undefined, capability: MediaUnderstandingCapability): boolean;
//#endregion
export { providerSupportsCapability };

View File

@@ -0,0 +1,9 @@
//#region packages/media-understanding-common/src/provider-supports.ts
function providerSupportsCapability(provider, capability) {
if (!provider) return false;
if (capability === "audio") return Boolean(provider.transcribeAudio);
if (capability === "image") return Boolean(provider.describeImage);
return Boolean(provider.describeVideo);
}
//#endregion
export { providerSupportsCapability };

View File

@@ -0,0 +1,31 @@
//#region packages/media-understanding-common/src/types.d.ts
/** Kind tag carried by each output. */
type MediaUnderstandingKind = "audio.transcription" | "video.description" | "image.description";
/** The three media capabilities used as keys throughout the package. */
type MediaUnderstandingCapability = "image" | "audio" | "video";
/** Map from a string key to an entry with an optional capability list. */
type MediaUnderstandingCapabilityRegistry = Map<string, {
capabilities?: MediaUnderstandingCapability[];
}>;
/** One attachment; only `index` is required. */
type MediaAttachment = {
path?: string;
url?: string;
mime?: string;
index: number;
alreadyTranscribed?: boolean;
};
/** One result: its kind, the attachment index it refers to, the text, and the provider/model names. */
type MediaUnderstandingOutput = {
kind: MediaUnderstandingKind;
attachmentIndex: number;
text: string;
provider: string;
model?: string;
};
/** Provider shape; the handler members are typed `unknown` in this package. */
type MediaUnderstandingProvider = {
id: string;
capabilities?: MediaUnderstandingCapability[];
transcribeAudio?: unknown;
describeVideo?: unknown;
describeImage?: unknown;
describeImages?: unknown;
extractStructured?: unknown;
};
//#endregion
export { MediaAttachment, MediaUnderstandingCapability, MediaUnderstandingCapabilityRegistry, MediaUnderstandingKind, MediaUnderstandingOutput, MediaUnderstandingProvider };

View File

@@ -0,0 +1 @@
export {};

View File

@@ -0,0 +1,5 @@
//#region packages/media-understanding-common/src/video.d.ts
/** Returns a base64 size estimate for the given byte count. */
declare function estimateBase64Size(bytes: number): number;
/** Returns a base64 byte limit derived from `maxBytes`. */
declare function resolveVideoMaxBase64Bytes(maxBytes: number): number;
//#endregion
export { estimateBase64Size, resolveVideoMaxBase64Bytes };

View File

@@ -0,0 +1,11 @@
import { DEFAULT_VIDEO_MAX_BASE64_BYTES } from "./defaults.mjs";
//#region packages/media-understanding-common/src/video.ts
/**
 * Size of the base64 encoding of `bytes` bytes: every started group of
 * 3 input bytes produces 4 output characters.
 */
function estimateBase64Size(bytes) {
  const groups = Math.ceil(bytes / 3);
  return groups * 4;
}
/**
 * Scales `maxBytes` by 4/3 (rounded down) and caps the result at
 * DEFAULT_VIDEO_MAX_BASE64_BYTES.
 */
function resolveVideoMaxBase64Bytes(maxBytes) {
  const base64Ratio = 4 / 3;
  const expanded = Math.floor(maxBytes * base64Ratio);
  return Math.min(expanded, DEFAULT_VIDEO_MAX_BASE64_BYTES);
}
//#endregion
export { estimateBase64Size, resolveVideoMaxBase64Bytes };

View File

@@ -0,0 +1,71 @@
{
"name": "@openclaw/media-understanding-common",
"version": "0.0.0-private",
"private": true,
"files": [
"dist"
],
"type": "module",
"main": "./dist/index.mjs",
"types": "./dist/index.d.mts",
"exports": {
".": {
"types": "./dist/index.d.mts",
"import": "./dist/index.mjs",
"default": "./dist/index.mjs"
},
"./active-model": {
"types": "./dist/active-model.d.mts",
"import": "./dist/active-model.mjs",
"default": "./dist/active-model.mjs"
},
"./defaults": {
"types": "./dist/defaults.d.mts",
"import": "./dist/defaults.mjs",
"default": "./dist/defaults.mjs"
},
"./errors": {
"types": "./dist/errors.d.mts",
"import": "./dist/errors.mjs",
"default": "./dist/errors.mjs"
},
"./format": {
"types": "./dist/format.d.mts",
"import": "./dist/format.mjs",
"default": "./dist/format.mjs"
},
"./openai-compatible-video": {
"types": "./dist/openai-compatible-video.d.mts",
"import": "./dist/openai-compatible-video.mjs",
"default": "./dist/openai-compatible-video.mjs"
},
"./output-extract": {
"types": "./dist/output-extract.d.mts",
"import": "./dist/output-extract.mjs",
"default": "./dist/output-extract.mjs"
},
"./provider-id": {
"types": "./dist/provider-id.d.mts",
"import": "./dist/provider-id.mjs",
"default": "./dist/provider-id.mjs"
},
"./provider-supports": {
"types": "./dist/provider-supports.d.mts",
"import": "./dist/provider-supports.mjs",
"default": "./dist/provider-supports.mjs"
},
"./types": {
"types": "./dist/types.d.mts",
"import": "./dist/types.mjs",
"default": "./dist/types.mjs"
},
"./video": {
"types": "./dist/video.d.mts",
"import": "./dist/video.mjs",
"default": "./dist/video.mjs"
}
},
"scripts": {
"build": "tsdown src/index.ts src/active-model.ts src/defaults.ts src/errors.ts src/format.ts src/openai-compatible-video.ts src/output-extract.ts src/provider-id.ts src/provider-supports.ts src/types.ts src/video.ts --no-config --platform node --format esm --dts --out-dir dist --clean"
}
}

View File

@@ -0,0 +1,4 @@
/** Names a media model: `provider` is required, `model` is optional. */
export type ActiveMediaModel = {
provider: string;
model?: string;
};

View File

@@ -0,0 +1,32 @@
import type { MediaUnderstandingCapability } from "./types.js";
/** Bytes in one mebibyte; the size limits below are written as multiples of it. */
const MB = 1024 * 1024;
/** Default maximum character count. */
export const DEFAULT_MAX_CHARS = 500;
/** Per-capability maximum character counts; audio has no value (`undefined`). */
export const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record<
MediaUnderstandingCapability,
number | undefined
> = {
image: DEFAULT_MAX_CHARS,
audio: undefined,
video: DEFAULT_MAX_CHARS,
};
/** Per-capability maximum byte counts: 10 MB image, 20 MB audio, 50 MB video. */
export const DEFAULT_MAX_BYTES: Record<MediaUnderstandingCapability, number> = {
image: 10 * MB,
audio: 20 * MB,
video: 50 * MB,
};
/** Per-capability timeouts, in seconds per the constant's name. */
export const DEFAULT_TIMEOUT_SECONDS: Record<MediaUnderstandingCapability, number> = {
image: 60,
audio: 60,
video: 120,
};
/** Per-capability default prompt strings. */
export const DEFAULT_PROMPT: Record<MediaUnderstandingCapability, string> = {
image: "Describe the image.",
audio: "Transcribe the audio.",
video: "Describe the video.",
};
/** 70 MB; used as the cap in `resolveVideoMaxBase64Bytes` (video.ts). */
export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
/** 5 MB. NOTE(review): presumably a child-process output buffer limit — confirm at call sites. */
export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
/** Default media concurrency. */
export const DEFAULT_MEDIA_CONCURRENCY = 2;
/** Minimum audio file size, in bytes per the constant's name. */
export const MIN_AUDIO_FILE_BYTES = 1024;

View File

@@ -0,0 +1,21 @@
/** Closed set of reason codes carried by `MediaUnderstandingSkipError`. */
type MediaUnderstandingSkipReason =
| "maxBytes"
| "timeout"
| "unsupported"
| "empty"
| "blocked"
| "tooSmall";
/**
 * Error carrying a `reason` code in addition to its message.
 * The instance's `name` is set to "MediaUnderstandingSkipError".
 */
export class MediaUnderstandingSkipError extends Error {
/** Reason code supplied at construction. */
readonly reason: MediaUnderstandingSkipReason;
/**
 * @param reason  Reason code stored on the instance.
 * @param message Message forwarded to `Error`.
 */
constructor(reason: MediaUnderstandingSkipReason, message: string) {
super(message);
this.reason = reason;
this.name = "MediaUnderstandingSkipError";
}
}
/** Type guard: `true` when `err` is an instance of `MediaUnderstandingSkipError`. */
export function isMediaUnderstandingSkipError(err: unknown): err is MediaUnderstandingSkipError {
return err instanceof MediaUnderstandingSkipError;
}
}

View File

@@ -0,0 +1,98 @@
import type { MediaUnderstandingOutput } from "./types.js";
/** Matches a body that is nothing but a `<media:...>` token with an optional "(...)" suffix. */
const MEDIA_PLACEHOLDER_RE = /^<media:[^>]+>(\s*\([^)]*\))?$/i;
/** Matches one leading `<media:...>` token (plus optional "(...)" and trailing whitespace). */
const MEDIA_PLACEHOLDER_TOKEN_RE = /^<media:[^>]+>(\s*\([^)]*\))?\s*/i;

/**
 * Extracts the text that remains after removing a leading media placeholder.
 *
 * @param body Raw body; may be omitted.
 * @returns The trimmed remainder, or `undefined` when the body is missing,
 *          blank, only a placeholder, or empty once the placeholder is removed.
 */
export function extractMediaUserText(body?: string): string | undefined {
  const candidate = (body ?? "").trim();
  const nothingToKeep = candidate === "" || MEDIA_PLACEHOLDER_RE.test(candidate);
  if (nothingToKeep) {
    return undefined;
  }
  const remainder = candidate.replace(MEDIA_PLACEHOLDER_TOKEN_RE, "").trim();
  return remainder === "" ? undefined : remainder;
}
/**
 * Renders one bracketed section.
 *
 * Output lines: `[title]`, then an optional "User text:" pair when `userText`
 * is truthy, then `kind:` followed by `text`. Lines are joined with "\n".
 */
function formatSection(
  title: string,
  kind: "Transcript" | "Description",
  text: string,
  userText?: string,
): string {
  const header = `[${title}]`;
  const payload = `${kind}:\n${text}`;
  const parts = userText ? [header, `User text:\n${userText}`, payload] : [header, payload];
  return parts.join("\n");
}
/**
 * Combines the original body with media-understanding outputs.
 *
 * - Outputs whose text is blank are ignored; with none left, `params.body`
 *   (or "") is returned unchanged.
 * - With exactly one output, the user text is placed inside that section.
 * - With several outputs, the user text becomes a leading "User text:" section.
 * - Sections of a kind that occurs more than once are numbered "i/n".
 * - Any kind other than audio transcription or image description is
 *   labelled "Video".
 *
 * @returns Sections joined by a blank line, trimmed.
 */
export function formatMediaUnderstandingBody(params: {
  body?: string;
  outputs: MediaUnderstandingOutput[];
}): string {
  const usable = params.outputs.filter((entry) => entry.text.trim());
  if (usable.length === 0) {
    return params.body ?? "";
  }
  const userText = extractMediaUserText(params.body);
  const single = usable.length === 1;

  // First pass: how many outputs exist per kind.
  const totals = new Map<MediaUnderstandingOutput["kind"], number>();
  for (const { kind } of usable) {
    totals.set(kind, (totals.get(kind) ?? 0) + 1);
  }

  // Second pass: render each output with its running position within its kind.
  const progress = new Map<MediaUnderstandingOutput["kind"], number>();
  const rendered = usable.map((entry) => {
    const total = totals.get(entry.kind) ?? 1;
    const position = (progress.get(entry.kind) ?? 0) + 1;
    progress.set(entry.kind, position);
    const suffix = total > 1 ? ` ${position}/${total}` : "";
    const isAudio = entry.kind === "audio.transcription";
    const label = isAudio ? "Audio" : entry.kind === "image.description" ? "Image" : "Video";
    const heading = isAudio ? "Transcript" : "Description";
    return formatSection(`${label}${suffix}`, heading, entry.text, single ? userText : undefined);
  });

  const sections = userText && !single ? [`User text:\n${userText}`, ...rendered] : rendered;
  return sections.join("\n\n").trim();
}
/**
 * Renders transcripts as a single string.
 *
 * A single output yields its text as-is; any other count yields
 * "Audio N:" blocks (1-based) separated by a blank line.
 */
export function formatAudioTranscripts(outputs: MediaUnderstandingOutput[]): string {
  if (outputs.length !== 1) {
    const numbered = outputs.map((entry, position) => `Audio ${position + 1}:\n${entry.text}`);
    return numbered.join("\n\n");
  }
  return outputs[0].text;
}

View File

@@ -0,0 +1,10 @@
export * from "./active-model.js";
export * from "./defaults.js";
export * from "./errors.js";
export * from "./format.js";
export * from "./openai-compatible-video.js";
export * from "./output-extract.js";
export * from "./provider-id.js";
export * from "./provider-supports.js";
export * from "./types.js";
export * from "./video.js";

View File

@@ -0,0 +1,66 @@
/**
 * Subset of a chat-completion style response read by
 * `coerceOpenAiCompatibleVideoText`: every level is optional.
 */
export type OpenAiCompatibleVideoPayload = {
choices?: Array<{
message?: {
// Either a plain string or a list of parts that may carry `text`.
content?: string | Array<{ text?: string }>;
reasoning_content?: string;
};
}>;
};
/**
 * Returns `value` trimmed, or `fallback` when `value` is missing or
 * trims down to an empty string.
 */
export function resolveMediaUnderstandingString(
  value: string | undefined,
  fallback: string,
): string {
  const cleaned = value == null ? "" : value.trim();
  return cleaned === "" ? fallback : cleaned;
}
/**
 * Pulls the answer text out of the first choice of a chat-completion style payload.
 *
 * Tried in order, returning the first non-empty result:
 *  1. `message.content` when it is a string (trimmed);
 *  2. `message.content` when it is an array: each part's string `text`,
 *     trimmed, empties dropped, joined with "\n";
 *  3. `message.reasoning_content` when it is a string (trimmed).
 *
 * The payload comes from a remote API and its static type is not enforced at
 * runtime, so array entries are validated: a `null` entry or a non-string
 * `text` is skipped instead of throwing and losing the remaining parts or
 * the reasoning fallback.
 *
 * @returns The extracted text, or `null` when nothing usable is present.
 */
export function coerceOpenAiCompatibleVideoText(
  payload: OpenAiCompatibleVideoPayload,
): string | null {
  const message = payload.choices?.[0]?.message;
  if (!message) {
    return null;
  }
  const { content, reasoning_content: reasoning } = message;
  if (typeof content === "string" && content.trim()) {
    return content.trim();
  }
  if (Array.isArray(content)) {
    const text = content
      .map((part) => (typeof part?.text === "string" ? part.text.trim() : ""))
      .filter(Boolean)
      .join("\n");
    if (text) {
      return text;
    }
  }
  if (typeof reasoning === "string" && reasoning.trim()) {
    return reasoning.trim();
  }
  return null;
}
/**
 * Builds a request body containing a single "user" message with two parts:
 * the prompt as text, and the video as a base64 `data:` URL built from
 * `params.mime` and `params.buffer`.
 *
 * The content parts stay as inline object literals so the inferred return
 * type is the same as before.
 */
export function buildOpenAiCompatibleVideoRequestBody(params: {
  model: string;
  prompt: string;
  mime: string;
  buffer: Buffer;
}) {
  const { model, prompt, mime, buffer } = params;
  const url = `data:${mime};base64,${buffer.toString("base64")}`;
  return {
    model,
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: prompt },
          { type: "video_url", video_url: { url } },
        ],
      },
    ],
  };
}

View File

@@ -0,0 +1,26 @@
/**
 * Parses the JSON document that ends the (trimmed) input.
 *
 * Candidate start positions are the `{` characters, tried from the last one
 * backwards; the first slice that parses as JSON wins. Trying only the very
 * last `{` fails whenever the document contains a nested object or a `{`
 * inside a string, because that brace does not open the top-level document.
 * Inner braces never parse on their own here: their slice still carries the
 * outer document's closing characters.
 *
 * @returns The parsed value, or `null` when no slice parses.
 */
function extractLastJsonObject(raw: string): unknown {
  const trimmed = raw.trim();
  let start = trimmed.lastIndexOf("{");
  while (start !== -1) {
    try {
      return JSON.parse(trimmed.slice(start));
    } catch {
      // Not a complete document from this brace; fall through to an earlier one.
    }
    // lastIndexOf clamps a negative fromIndex to 0, so stop explicitly at the first character.
    if (start === 0) {
      break;
    }
    start = trimmed.lastIndexOf("{", start - 1);
  }
  return null;
}

/**
 * Reads the `response` string from the trailing JSON object of `raw`.
 *
 * @returns The trimmed `response`, or `null` when there is no trailing JSON
 *          object, `response` is not a string, or it is blank.
 */
export function extractGeminiResponse(raw: string): string | null {
  const payload = extractLastJsonObject(raw);
  if (!payload || typeof payload !== "object") {
    return null;
  }
  const response = (payload as { response?: unknown }).response;
  if (typeof response !== "string") {
    return null;
  }
  const trimmed = response.trim();
  return trimmed || null;
}

View File

@@ -0,0 +1,25 @@
/** Trims and lower-cases a provider id. */
function normalizeProviderId(provider: string): string {
  const lowered = provider.trim().toLowerCase();
  return lowered;
}

/**
 * Normalizes a provider id and folds aliases:
 * "gemini" -> "google", "minimax-cn" -> "minimax",
 * "minimax-portal-cn" -> "minimax-portal". Other ids pass through normalized.
 */
export function normalizeMediaProviderId(id: string): string {
  const key = normalizeProviderId(id);
  switch (key) {
    case "gemini":
      return "google";
    case "minimax-cn":
      return "minimax";
    case "minimax-portal-cn":
      return "minimax-portal";
    default:
      return key;
  }
}

/**
 * Like `normalizeMediaProviderId`, except that "minimax-cn" and
 * "minimax-portal-cn" are returned as-is (normalized) rather than folded.
 */
export function normalizeMediaExecutionProviderId(id: string): string {
  const key = normalizeProviderId(id);
  const keepsOwnId = ["minimax-cn", "minimax-portal-cn"].includes(key);
  return keepsOwnId ? key : normalizeMediaProviderId(key);
}

View File

@@ -0,0 +1,17 @@
import type { MediaUnderstandingCapability, MediaUnderstandingProvider } from "./types.js";
/**
 * Reports whether a provider exposes the handler for a capability.
 *
 * "audio" checks `transcribeAudio`, "image" checks `describeImage`, and any
 * other capability checks `describeVideo`. A missing provider yields `false`.
 */
export function providerSupportsCapability(
  provider: MediaUnderstandingProvider | undefined,
  capability: MediaUnderstandingCapability,
): boolean {
  if (!provider) {
    return false;
  }
  let handler: unknown;
  switch (capability) {
    case "audio":
      handler = provider.transcribeAudio;
      break;
    case "image":
      handler = provider.describeImage;
      break;
    default:
      handler = provider.describeVideo;
  }
  return Boolean(handler);
}

View File

@@ -0,0 +1,39 @@
/** Kind tag carried by each output. */
export type MediaUnderstandingKind =
| "audio.transcription"
| "video.description"
| "image.description";
/** The three media capabilities used as keys throughout the package. */
export type MediaUnderstandingCapability = "image" | "audio" | "video";
/** Map from a string key to an entry with an optional capability list. */
export type MediaUnderstandingCapabilityRegistry = Map<
string,
{
capabilities?: MediaUnderstandingCapability[];
}
>;
/** One attachment; only `index` is required. */
export type MediaAttachment = {
path?: string;
url?: string;
mime?: string;
index: number;
alreadyTranscribed?: boolean;
};
/** One result: its kind, the attachment index it refers to, the text, and the provider/model names. */
export type MediaUnderstandingOutput = {
kind: MediaUnderstandingKind;
attachmentIndex: number;
text: string;
provider: string;
model?: string;
};
/** Provider shape; the handler members are typed `unknown` in this package. */
export type MediaUnderstandingProvider = {
id: string;
capabilities?: MediaUnderstandingCapability[];
transcribeAudio?: unknown;
describeVideo?: unknown;
describeImage?: unknown;
describeImages?: unknown;
extractStructured?: unknown;
};

View File

@@ -0,0 +1,10 @@
import { DEFAULT_VIDEO_MAX_BASE64_BYTES } from "./defaults.js";
/**
 * Size of the base64 encoding of `bytes` bytes: every started group of
 * 3 input bytes produces 4 output characters.
 */
export function estimateBase64Size(bytes: number): number {
  const groups = Math.ceil(bytes / 3);
  return groups * 4;
}
/**
 * Scales `maxBytes` by 4/3 (rounded down) and caps the result at
 * `DEFAULT_VIDEO_MAX_BASE64_BYTES`.
 */
export function resolveVideoMaxBase64Bytes(maxBytes: number): number {
  const base64Ratio = 4 / 3;
  const expanded = Math.floor(maxBytes * base64Ratio);
  return Math.min(expanded, DEFAULT_VIDEO_MAX_BASE64_BYTES);
}

2
pnpm-lock.yaml generated
View File

@@ -1832,6 +1832,8 @@ importers:
packages/media-generation-core: {}
packages/media-understanding-common: {}
packages/memory-host-sdk: {}
packages/net-policy:

View File

@@ -48,6 +48,7 @@ export const BUILD_ALL_STEPS = [
"packages/plugin-sdk/package.json",
"packages/llm-core/package.json",
"packages/markdown-core/package.json",
"packages/media-understanding-common/package.json",
"packages/terminal-core/package.json",
"packages/memory-host-sdk/package.json",
"tsconfig.json",
@@ -57,6 +58,7 @@ export const BUILD_ALL_STEPS = [
"packages/markdown-core/src",
"packages/memory-host-sdk/src",
"packages/media-generation-core/src",
"packages/media-understanding-common/src",
"packages/terminal-core/src",
"src/types",
"src/video-generation/dashscope-compatible.ts",

View File

@@ -19,6 +19,7 @@ const PLUGIN_SDK_TYPE_INPUTS = [
"packages/markdown-core/src",
"packages/memory-host-sdk/src",
"packages/media-generation-core/src",
"packages/media-understanding-common/src",
"packages/terminal-core/src",
"src/video-generation/dashscope-compatible.ts",
"src/video-generation/types.ts",

View File

@@ -12,6 +12,7 @@ const RUN_NODE_PACKAGE_SOURCE_ROOTS = [
"packages/gateway-protocol/src",
"packages/markdown-core/src",
"packages/media-generation-core/src",
"packages/media-understanding-common/src",
"packages/terminal-core/src",
"packages/net-policy/src",
];

View File

@@ -1,4 +1 @@
/** Names a media model: `provider` is required, `model` is optional. */
export type ActiveMediaModel = {
provider: string;
model?: string;
};
export * from "../../packages/media-understanding-common/src/active-model.js";

View File

@@ -1,32 +1 @@
import type { MediaUnderstandingCapability } from "./types.js";
/** Bytes in one mebibyte; the size limits below are written as multiples of it. */
const MB = 1024 * 1024;
/** Default maximum character count. */
export const DEFAULT_MAX_CHARS = 500;
/** Per-capability maximum character counts; audio has no value (`undefined`). */
export const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record<
MediaUnderstandingCapability,
number | undefined
> = {
image: DEFAULT_MAX_CHARS,
audio: undefined,
video: DEFAULT_MAX_CHARS,
};
/** Per-capability maximum byte counts: 10 MB image, 20 MB audio, 50 MB video. */
export const DEFAULT_MAX_BYTES: Record<MediaUnderstandingCapability, number> = {
image: 10 * MB,
audio: 20 * MB,
video: 50 * MB,
};
/** Per-capability timeouts, in seconds per the constant's name. */
export const DEFAULT_TIMEOUT_SECONDS: Record<MediaUnderstandingCapability, number> = {
image: 60,
audio: 60,
video: 120,
};
/** Per-capability default prompt strings. */
export const DEFAULT_PROMPT: Record<MediaUnderstandingCapability, string> = {
image: "Describe the image.",
audio: "Transcribe the audio.",
video: "Describe the video.",
};
/** 70 MB. */
export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
/** 5 MB. NOTE(review): presumably a child-process output buffer limit — confirm at call sites. */
export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
/** Default media concurrency. */
export const DEFAULT_MEDIA_CONCURRENCY = 2;
/** Minimum audio file size, in bytes per the constant's name. */
export const MIN_AUDIO_FILE_BYTES = 1024;
export * from "../../packages/media-understanding-common/src/defaults.js";

View File

@@ -1,21 +1 @@
/** Closed set of reason codes carried by `MediaUnderstandingSkipError`. */
type MediaUnderstandingSkipReason =
| "maxBytes"
| "timeout"
| "unsupported"
| "empty"
| "blocked"
| "tooSmall";
/**
 * Error carrying a `reason` code in addition to its message.
 * The instance's `name` is set to "MediaUnderstandingSkipError".
 */
export class MediaUnderstandingSkipError extends Error {
/** Reason code supplied at construction. */
readonly reason: MediaUnderstandingSkipReason;
/**
 * @param reason  Reason code stored on the instance.
 * @param message Message forwarded to `Error`.
 */
constructor(reason: MediaUnderstandingSkipReason, message: string) {
super(message);
this.reason = reason;
this.name = "MediaUnderstandingSkipError";
}
}
/** Type guard: `true` when `err` is an instance of `MediaUnderstandingSkipError`. */
export function isMediaUnderstandingSkipError(err: unknown): err is MediaUnderstandingSkipError {
return err instanceof MediaUnderstandingSkipError;
}
export * from "../../packages/media-understanding-common/src/errors.js";

View File

@@ -1,98 +1 @@
import type { MediaUnderstandingOutput } from "./types.js";
/** Matches a body that is nothing but a `<media:...>` token with an optional "(...)" suffix. */
const MEDIA_PLACEHOLDER_RE = /^<media:[^>]+>(\s*\([^)]*\))?$/i;
/** Matches one leading `<media:...>` token (plus optional "(...)" and trailing whitespace). */
const MEDIA_PLACEHOLDER_TOKEN_RE = /^<media:[^>]+>(\s*\([^)]*\))?\s*/i;

/**
 * Extracts the text that remains after removing a leading media placeholder.
 *
 * @param body Raw body; may be omitted.
 * @returns The trimmed remainder, or `undefined` when the body is missing,
 *          blank, only a placeholder, or empty once the placeholder is removed.
 */
export function extractMediaUserText(body?: string): string | undefined {
  const candidate = (body ?? "").trim();
  const nothingToKeep = candidate === "" || MEDIA_PLACEHOLDER_RE.test(candidate);
  if (nothingToKeep) {
    return undefined;
  }
  const remainder = candidate.replace(MEDIA_PLACEHOLDER_TOKEN_RE, "").trim();
  return remainder === "" ? undefined : remainder;
}
/**
 * Renders one bracketed section.
 *
 * Output lines: `[title]`, then an optional "User text:" pair when `userText`
 * is truthy, then `kind:` followed by `text`. Lines are joined with "\n".
 */
function formatSection(
  title: string,
  kind: "Transcript" | "Description",
  text: string,
  userText?: string,
): string {
  const header = `[${title}]`;
  const payload = `${kind}:\n${text}`;
  const parts = userText ? [header, `User text:\n${userText}`, payload] : [header, payload];
  return parts.join("\n");
}
/**
 * Combines the original body with media-understanding outputs.
 *
 * - Outputs whose text is blank are ignored; with none left, `params.body`
 *   (or "") is returned unchanged.
 * - With exactly one output, the user text is placed inside that section.
 * - With several outputs, the user text becomes a leading "User text:" section.
 * - Sections of a kind that occurs more than once are numbered "i/n".
 * - Any kind other than audio transcription or image description is
 *   labelled "Video".
 *
 * @returns Sections joined by a blank line, trimmed.
 */
export function formatMediaUnderstandingBody(params: {
  body?: string;
  outputs: MediaUnderstandingOutput[];
}): string {
  const usable = params.outputs.filter((entry) => entry.text.trim());
  if (usable.length === 0) {
    return params.body ?? "";
  }
  const userText = extractMediaUserText(params.body);
  const single = usable.length === 1;

  // First pass: how many outputs exist per kind.
  const totals = new Map<MediaUnderstandingOutput["kind"], number>();
  for (const { kind } of usable) {
    totals.set(kind, (totals.get(kind) ?? 0) + 1);
  }

  // Second pass: render each output with its running position within its kind.
  const progress = new Map<MediaUnderstandingOutput["kind"], number>();
  const rendered = usable.map((entry) => {
    const total = totals.get(entry.kind) ?? 1;
    const position = (progress.get(entry.kind) ?? 0) + 1;
    progress.set(entry.kind, position);
    const suffix = total > 1 ? ` ${position}/${total}` : "";
    const isAudio = entry.kind === "audio.transcription";
    const label = isAudio ? "Audio" : entry.kind === "image.description" ? "Image" : "Video";
    const heading = isAudio ? "Transcript" : "Description";
    return formatSection(`${label}${suffix}`, heading, entry.text, single ? userText : undefined);
  });

  const sections = userText && !single ? [`User text:\n${userText}`, ...rendered] : rendered;
  return sections.join("\n\n").trim();
}
/**
 * Joins audio transcription outputs into one string.
 *
 * A single output is returned as its bare text. Any other count is rendered
 * as `Audio N:` blocks (1-based) separated by blank lines.
 *
 * @param outputs - Transcription outputs to combine.
 * @returns The combined transcript text.
 */
export function formatAudioTranscripts(outputs: MediaUnderstandingOutput[]): string {
  const isSingle = outputs.length === 1;
  if (!isSingle) {
    const labelled = outputs.map((entry, position) => `Audio ${position + 1}:\n${entry.text}`);
    return labelled.join("\n\n");
  }
  return outputs[0].text;
}
export * from "../../packages/media-understanding-common/src/format.js";

View File

@@ -1,66 +1 @@
import { normalizeOptionalString } from "../shared/string-coerce.js";
import { normalizeTrimmedStringList } from "../shared/string-normalization.js";
/**
 * Shape of the response payload read by `coerceOpenAiCompatibleVideoText`.
 * Every field is optional, so consumers must guard each level before use.
 */
export type OpenAiCompatibleVideoPayload = {
// List of choices; only the first entry is read by the coercion helper in this file.
choices?: Array<{
message?: {
// Either a plain string or a list of parts, each with an optional `text`.
content?: string | Array<{ text?: string }>;
// Used by the coercion helper only when `content` yields no text.
reasoning_content?: string;
};
}>;
};
/**
 * Picks a configured string, falling back to a default.
 *
 * @param value - Candidate value; passed through `normalizeOptionalString`.
 * @param fallback - Returned when the normalized value is falsy.
 * @returns The normalized value when truthy, otherwise `fallback`.
 */
export function resolveMediaUnderstandingString(
  value: string | undefined,
  fallback: string,
): string {
  const normalized = normalizeOptionalString(value);
  if (normalized) {
    return normalized;
  }
  return fallback;
}
/**
 * Extracts the text of the first choice from a response payload.
 *
 * Sources are tried in order: string `content` (trimmed), array `content`
 * (each part's `text` passed through `normalizeTrimmedStringList` and joined
 * with newlines), then `reasoning_content` (trimmed). The first non-empty
 * result wins.
 *
 * @param payload - Parsed response payload.
 * @returns The extracted text, or `null` when the first choice has no
 *   message or no source yields text.
 */
export function coerceOpenAiCompatibleVideoText(
  payload: OpenAiCompatibleVideoPayload,
): string | null {
  const message = payload.choices?.[0]?.message;
  if (!message) {
    return null;
  }

  const { content, reasoning_content: reasoning } = message;
  if (typeof content === "string") {
    const direct = content.trim();
    if (direct) {
      return direct;
    }
  } else if (Array.isArray(content)) {
    const joined = normalizeTrimmedStringList(content.map((part) => part.text)).join("\n");
    if (joined) {
      return joined;
    }
  }

  if (typeof reasoning === "string") {
    const fromReasoning = reasoning.trim();
    if (fromReasoning) {
      return fromReasoning;
    }
  }
  return null;
}
/**
 * Builds a chat-style request body carrying a prompt and an inline video.
 *
 * The video bytes are base64-encoded and embedded as a `data:` URL in a
 * `video_url` content part, after a `text` part holding the prompt.
 *
 * @param params.model - Model identifier copied into the body.
 * @param params.prompt - Text of the first content part.
 * @param params.mime - MIME type used in the data URL.
 * @param params.buffer - Raw video bytes.
 * @returns The request body with a single `user` message.
 */
export function buildOpenAiCompatibleVideoRequestBody(params: {
  model: string;
  prompt: string;
  mime: string;
  buffer: Buffer;
}) {
  const { model, prompt, mime, buffer } = params;
  const encoded = buffer.toString("base64");
  const dataUrl = `data:${mime};base64,${encoded}`;
  return {
    model,
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: prompt },
          {
            type: "video_url",
            video_url: {
              url: dataUrl,
            },
          },
        ],
      },
    ],
  };
}
export * from "../../packages/media-understanding-common/src/openai-compatible-video.js";

View File

@@ -1,26 +1 @@
/**
 * Parses the JSON object that ends the given text.
 *
 * Candidate start positions are every `{` in the trimmed input, tried from
 * the last one backwards. Each candidate slice runs to the end of the input,
 * and the first slice that `JSON.parse` accepts is returned. Starting only at
 * the very last `{` is not enough: for an object with a nested object, or
 * with a `{` inside a string value, the last `{` sits inside the object, so
 * earlier braces must be tried too.
 *
 * @param raw - Text that may contain leading noise followed by a JSON object.
 * @returns The parsed value, or `null` when the input has no `{` or no slice
 *   from a `{` to the end of the input is valid JSON.
 */
function extractLastJsonObject(raw: string): unknown {
  const trimmed = raw.trim();
  let start = trimmed.lastIndexOf("{");
  while (start !== -1) {
    try {
      return JSON.parse(trimmed.slice(start));
    } catch {
      // This brace does not open an object spanning to the end; try an earlier one.
    }
    if (start === 0) {
      // lastIndexOf clamps a negative fromIndex to 0, so stop explicitly here.
      break;
    }
    start = trimmed.lastIndexOf("{", start - 1);
  }
  return null;
}
/**
 * Reads the `response` string from the JSON object found in raw output.
 *
 * @param raw - Raw output text handed to `extractLastJsonObject`.
 * @returns The trimmed `response` value, or `null` when no object is found,
 *   `response` is not a string, or it is blank after trimming.
 */
export function extractGeminiResponse(raw: string): string | null {
  const parsed = extractLastJsonObject(raw);
  if (parsed === null || typeof parsed !== "object") {
    return null;
  }
  const { response } = parsed as { response?: unknown };
  return typeof response === "string" ? response.trim() || null : null;
}
export * from "../../packages/media-understanding-common/src/output-extract.js";

View File

@@ -1,23 +1 @@
import { normalizeProviderId } from "../agents/provider-id.js";
/**
 * Normalizes a provider id and folds known aliases onto one id.
 *
 * After `normalizeProviderId`, `gemini` becomes `google`, `minimax-cn`
 * becomes `minimax`, and `minimax-portal-cn` becomes `minimax-portal`.
 * Any other id is returned as normalized.
 *
 * @param id - Provider id as supplied by the caller.
 * @returns The normalized, alias-resolved provider id.
 */
export function normalizeMediaProviderId(id: string): string {
  const providerId = normalizeProviderId(id);
  switch (providerId) {
    case "gemini":
      return "google";
    case "minimax-cn":
      return "minimax";
    case "minimax-portal-cn":
      return "minimax-portal";
    default:
      return providerId;
  }
}
/**
 * Normalizes a provider id like `normalizeMediaProviderId`, except that
 * `minimax-cn` and `minimax-portal-cn` are kept as-is instead of being
 * folded onto their base ids.
 *
 * @param id - Provider id as supplied by the caller.
 * @returns The normalized provider id.
 */
export function normalizeMediaExecutionProviderId(id: string): string {
  const providerId = normalizeProviderId(id);
  const keptAsIs = providerId === "minimax-cn" || providerId === "minimax-portal-cn";
  return keptAsIs ? providerId : normalizeMediaProviderId(providerId);
}
export * from "../../packages/media-understanding-common/src/provider-id.js";

View File

@@ -1,17 +1 @@
import type { MediaUnderstandingCapability, MediaUnderstandingProvider } from "./types.js";
/**
 * Reports whether a provider implements the handler for a capability.
 *
 * `audio` checks `transcribeAudio`, `image` checks `describeImage`, and any
 * other capability checks `describeVideo`.
 *
 * @param provider - Provider to inspect; a missing provider supports nothing.
 * @param capability - Capability being asked about.
 * @returns `true` when the matching handler is present (truthy).
 */
export function providerSupportsCapability(
  provider: MediaUnderstandingProvider | undefined,
  capability: MediaUnderstandingCapability,
): boolean {
  if (!provider) {
    return false;
  }
  switch (capability) {
    case "audio":
      return Boolean(provider.transcribeAudio);
    case "image":
      return Boolean(provider.describeImage);
    default:
      return Boolean(provider.describeVideo);
  }
}
export * from "../../packages/media-understanding-common/src/provider-supports.js";

View File

@@ -1,10 +1 @@
import { DEFAULT_VIDEO_MAX_BASE64_BYTES } from "./defaults.constants.js";
/**
 * Estimates the length of the base64 encoding of `bytes` bytes: four output
 * characters for every group of three input bytes, rounding the number of
 * groups up.
 *
 * @param bytes - Number of raw bytes.
 * @returns The estimated base64 length.
 */
export function estimateBase64Size(bytes: number): number {
  const groupsOfThree = Math.ceil(bytes / 3);
  return groupsOfThree * 4;
}
/**
 * Converts a raw-byte limit into a base64-size limit.
 *
 * The limit is scaled by 4/3 and rounded down, then capped at
 * `DEFAULT_VIDEO_MAX_BASE64_BYTES`.
 *
 * @param maxBytes - Limit expressed in raw bytes.
 * @returns The smaller of the scaled limit and the default cap.
 */
export function resolveVideoMaxBase64Bytes(maxBytes: number): number {
  const base64Ratio = 4 / 3;
  return Math.min(Math.floor(maxBytes * base64Ratio), DEFAULT_VIDEO_MAX_BASE64_BYTES);
}
export * from "../../packages/media-understanding-common/src/video.js";

View File

@@ -65,6 +65,12 @@
"./packages/media-generation-core/src/normalization.ts"
],
"@openclaw/media-generation-core/*": ["./packages/media-generation-core/src/*"],
"@openclaw/media-understanding-common": [
"./packages/media-understanding-common/src/index.ts"
],
"@openclaw/media-understanding-common/*": [
"./packages/media-understanding-common/src/*"
],
"@openclaw/markdown-core": ["./packages/markdown-core/src/index.ts"],
"@openclaw/markdown-core/code-spans": ["./packages/markdown-core/src/code-spans.ts"],
"@openclaw/markdown-core/fences": ["./packages/markdown-core/src/fences.ts"],

View File

@@ -394,6 +394,22 @@ function buildMediaGenerationCoreDistEntries(): Record<string, string> {
};
}
/**
 * Builds the build-entry map for the media-understanding-common package.
 *
 * Each key is an entry name and each value is the matching source file under
 * `packages/media-understanding-common/src`, named `<entry>.ts`.
 *
 * @returns Entry name to source path, in declaration order.
 */
function buildMediaUnderstandingCoreDistEntries(): Record<string, string> {
  const sourceDir = "packages/media-understanding-common/src";
  const entryNames = [
    "index",
    "active-model",
    "defaults",
    "errors",
    "format",
    "openai-compatible-video",
    "output-extract",
    "provider-id",
    "provider-supports",
    "types",
    "video",
  ];
  return Object.fromEntries(entryNames.map((name) => [name, `${sourceDir}/${name}.ts`]));
}
function buildMarkdownCoreDistEntries(): Record<string, string> {
return {
index: "packages/markdown-core/src/index.ts",
@@ -592,6 +608,12 @@ export default defineConfig([
entry: buildMediaGenerationCoreDistEntries(),
outDir: "packages/media-generation-core/dist",
}),
nodeWorkspacePackageBuildConfig({
clean: true,
dts: RUN_NODE_SKIP_DTS_BUILD ? false : undefined,
entry: buildMediaUnderstandingCoreDistEntries(),
outDir: "packages/media-understanding-common/dist",
}),
nodeWorkspacePackageBuildConfig({
clean: true,
dts: RUN_NODE_SKIP_DTS_BUILD ? false : undefined,