mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-03 08:14:07 +00:00
refactor: extract media understanding common package (#88297)
* refactor: extract media understanding common package
* test: move media understanding format test
This commit is contained in:
committed by
GitHub
parent
b13fb788b5
commit
8b92aca27f
7
packages/media-understanding-common/dist/active-model.d.mts
vendored
Normal file
7
packages/media-understanding-common/dist/active-model.d.mts
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
//#region packages/media-understanding-common/src/active-model.d.ts
/** Provider/model pair: `provider` is required, `model` may be omitted. */
type ActiveMediaModel = {
  provider: string;
  model?: string;
};
//#endregion
export { ActiveMediaModel };
|
||||
1
packages/media-understanding-common/dist/active-model.mjs
vendored
Normal file
1
packages/media-understanding-common/dist/active-model.mjs
vendored
Normal file
@@ -0,0 +1 @@
|
||||
export {};
|
||||
14
packages/media-understanding-common/dist/defaults.d.mts
vendored
Normal file
14
packages/media-understanding-common/dist/defaults.d.mts
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
import { MediaUnderstandingCapability } from "./types.mjs";
|
||||
|
||||
//#region packages/media-understanding-common/src/defaults.d.ts
/** Default character limit (literal `500`). */
declare const DEFAULT_MAX_CHARS = 500;
/** Character limit per capability; an entry may be `undefined`. */
declare const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record<MediaUnderstandingCapability, number | undefined>;
/** Byte limit per capability. */
declare const DEFAULT_MAX_BYTES: Record<MediaUnderstandingCapability, number>;
/** Timeout per capability (seconds, per the constant's name). */
declare const DEFAULT_TIMEOUT_SECONDS: Record<MediaUnderstandingCapability, number>;
/** Prompt text per capability. */
declare const DEFAULT_PROMPT: Record<MediaUnderstandingCapability, string>;
/** Upper bound used for base64-encoded video size. */
declare const DEFAULT_VIDEO_MAX_BASE64_BYTES: number;
/** Buffer size constant for CLI output. */
declare const CLI_OUTPUT_MAX_BUFFER: number;
/** Default media concurrency (literal `2`). */
declare const DEFAULT_MEDIA_CONCURRENCY = 2;
/** Minimum audio file size in bytes (literal `1024`). */
declare const MIN_AUDIO_FILE_BYTES = 1024;
//#endregion
export { CLI_OUTPUT_MAX_BUFFER, DEFAULT_MAX_BYTES, DEFAULT_MAX_CHARS, DEFAULT_MAX_CHARS_BY_CAPABILITY, DEFAULT_MEDIA_CONCURRENCY, DEFAULT_PROMPT, DEFAULT_TIMEOUT_SECONDS, DEFAULT_VIDEO_MAX_BASE64_BYTES, MIN_AUDIO_FILE_BYTES };
|
||||
29
packages/media-understanding-common/dist/defaults.mjs
vendored
Normal file
29
packages/media-understanding-common/dist/defaults.mjs
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
//#region packages/media-understanding-common/src/defaults.ts
// Bytes in one mebibyte; the size limits below are expressed as multiples of it.
const MB = 1024 * 1024;
// Default character limit.
const DEFAULT_MAX_CHARS = 500;
// Character limit per capability; the audio entry is left undefined.
const DEFAULT_MAX_CHARS_BY_CAPABILITY = {
  image: 500,
  audio: void 0,
  video: 500
};
// Byte limit per capability: 10 MB image, 20 MB audio, 50 MB video.
const DEFAULT_MAX_BYTES = {
  image: 10 * MB,
  audio: 20 * MB,
  video: 50 * MB
};
// Timeout per capability (seconds, per the constant's name).
const DEFAULT_TIMEOUT_SECONDS = {
  image: 60,
  audio: 60,
  video: 120
};
// Prompt text per capability.
const DEFAULT_PROMPT = {
  image: "Describe the image.",
  audio: "Transcribe the audio.",
  video: "Describe the video."
};
// Cap applied to base64-encoded video size (70 MB).
const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
// Buffer size constant for CLI output (5 MB).
const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
// Default media concurrency.
const DEFAULT_MEDIA_CONCURRENCY = 2;
// Minimum audio file size in bytes.
const MIN_AUDIO_FILE_BYTES = 1024;
//#endregion
export { CLI_OUTPUT_MAX_BUFFER, DEFAULT_MAX_BYTES, DEFAULT_MAX_CHARS, DEFAULT_MAX_CHARS_BY_CAPABILITY, DEFAULT_MEDIA_CONCURRENCY, DEFAULT_PROMPT, DEFAULT_TIMEOUT_SECONDS, DEFAULT_VIDEO_MAX_BASE64_BYTES, MIN_AUDIO_FILE_BYTES };
|
||||
9
packages/media-understanding-common/dist/errors.d.mts
vendored
Normal file
9
packages/media-understanding-common/dist/errors.d.mts
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
//#region packages/media-understanding-common/src/errors.d.ts
/** Closed set of reasons accepted by {@link MediaUnderstandingSkipError}. */
type MediaUnderstandingSkipReason = "maxBytes" | "timeout" | "unsupported" | "empty" | "blocked" | "tooSmall";
/** Error carrying a machine-readable `reason` alongside its message. */
declare class MediaUnderstandingSkipError extends Error {
  readonly reason: MediaUnderstandingSkipReason;
  constructor(reason: MediaUnderstandingSkipReason, message: string);
}
/** Type guard narrowing an unknown value to {@link MediaUnderstandingSkipError}. */
declare function isMediaUnderstandingSkipError(err: unknown): err is MediaUnderstandingSkipError;
//#endregion
// The reason type is exported so consumers can name the type of `reason`.
export { MediaUnderstandingSkipError, MediaUnderstandingSkipReason, isMediaUnderstandingSkipError };
|
||||
13
packages/media-understanding-common/dist/errors.mjs
vendored
Normal file
13
packages/media-understanding-common/dist/errors.mjs
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
//#region packages/media-understanding-common/src/errors.ts
// Error subclass that records a `reason` value and sets `name` to the class name.
var MediaUnderstandingSkipError = class extends Error {
  constructor(reason, message) {
    super(message);
    this.reason = reason;
    this.name = "MediaUnderstandingSkipError";
  }
};
// Returns true only for instances of the class defined above.
function isMediaUnderstandingSkipError(err) {
  return err instanceof MediaUnderstandingSkipError;
}
//#endregion
export { MediaUnderstandingSkipError, isMediaUnderstandingSkipError };
|
||||
11
packages/media-understanding-common/dist/format.d.mts
vendored
Normal file
11
packages/media-understanding-common/dist/format.d.mts
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
import { MediaUnderstandingOutput } from "./types.mjs";
|
||||
|
||||
//#region packages/media-understanding-common/src/format.d.ts
/** Takes an optional body string and returns a string or `undefined`. */
declare function extractMediaUserText(body?: string): string | undefined;
/** Builds one string from an optional body and a list of outputs. */
declare function formatMediaUnderstandingBody(params: {
  body?: string;
  outputs: MediaUnderstandingOutput[];
}): string;
/** Builds one string from a list of outputs. */
declare function formatAudioTranscripts(outputs: MediaUnderstandingOutput[]): string;
//#endregion
export { extractMediaUserText, formatAudioTranscripts, formatMediaUnderstandingBody };
|
||||
47
packages/media-understanding-common/dist/format.mjs
vendored
Normal file
47
packages/media-understanding-common/dist/format.mjs
vendored
Normal file
@@ -0,0 +1,47 @@
|
||||
//#region packages/media-understanding-common/src/format.ts
// Matches a body that is nothing but a `<media:...>` placeholder with an optional "(...)" suffix.
const MEDIA_PLACEHOLDER_RE = /^<media:[^>]+>(\s*\([^)]*\))?$/i;
// Matches one leading placeholder (plus optional "(...)" and trailing whitespace) so it can be stripped.
const MEDIA_PLACEHOLDER_TOKEN_RE = /^<media:[^>]+>(\s*\([^)]*\))?\s*/i;

// Returns the body text with one leading media placeholder removed,
// or undefined when the body is empty or consists only of a placeholder.
function extractMediaUserText(body) {
  const trimmed = body?.trim() ?? "";
  if (!trimmed || MEDIA_PLACEHOLDER_RE.test(trimmed)) return;
  return trimmed.replace(MEDIA_PLACEHOLDER_TOKEN_RE, "").trim() || void 0;
}

// Renders "[title]", an optional "User text:" part, then "<kind>:" followed by the text.
function formatSection(title, kind, text, userText) {
  const lines = [`[${title}]`];
  if (userText) lines.push(`User text:\n${userText}`);
  lines.push(`${kind}:\n${text}`);
  return lines.join("\n");
}

// Maps an output kind to its section title and heading.
// Any kind other than audio/image is rendered as a video description.
function sectionLabelFor(kind) {
  if (kind === "audio.transcription") return { title: "Audio", heading: "Transcript" };
  if (kind === "image.description") return { title: "Image", heading: "Description" };
  return { title: "Video", heading: "Description" };
}

// Combines the user's text with the non-blank outputs into labelled sections.
// - No non-blank outputs: returns the original body (or "").
// - Exactly one output: the user text is placed inside that section.
// - Several outputs: the user text becomes its own leading section and
//   outputs sharing a kind get an " i/n" suffix.
function formatMediaUnderstandingBody(params) {
  const outputs = params.outputs.filter((output) => output.text.trim());
  if (outputs.length === 0) return params.body ?? "";
  const userText = extractMediaUserText(params.body);
  const isSingle = outputs.length === 1;
  const sections = [];
  if (userText && !isSingle) sections.push(`User text:\n${userText}`);
  // First pass: how many outputs exist per kind (decides whether a suffix is needed).
  const totals = new Map();
  for (const output of outputs) totals.set(output.kind, (totals.get(output.kind) ?? 0) + 1);
  // Second pass: emit sections, numbering each kind independently.
  const seen = new Map();
  for (const output of outputs) {
    const total = totals.get(output.kind) ?? 1;
    const position = (seen.get(output.kind) ?? 0) + 1;
    seen.set(output.kind, position);
    const suffix = total > 1 ? ` ${position}/${total}` : "";
    const { title, heading } = sectionLabelFor(output.kind);
    sections.push(formatSection(`${title}${suffix}`, heading, output.text, isSingle ? userText : void 0));
  }
  return sections.join("\n\n").trim();
}

// Returns the lone transcript as-is, or "Audio N:" blocks joined by blank lines.
function formatAudioTranscripts(outputs) {
  if (outputs.length === 1) return outputs[0].text;
  return outputs.map((output, index) => `Audio ${index + 1}:\n${output.text}`).join("\n\n");
}
//#endregion
export { extractMediaUserText, formatAudioTranscripts, formatMediaUnderstandingBody };
|
||||
11
packages/media-understanding-common/dist/index.d.mts
vendored
Normal file
11
packages/media-understanding-common/dist/index.d.mts
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
import { ActiveMediaModel } from "./active-model.mjs";
|
||||
import { MediaAttachment, MediaUnderstandingCapability, MediaUnderstandingCapabilityRegistry, MediaUnderstandingKind, MediaUnderstandingOutput, MediaUnderstandingProvider } from "./types.mjs";
|
||||
import { CLI_OUTPUT_MAX_BUFFER, DEFAULT_MAX_BYTES, DEFAULT_MAX_CHARS, DEFAULT_MAX_CHARS_BY_CAPABILITY, DEFAULT_MEDIA_CONCURRENCY, DEFAULT_PROMPT, DEFAULT_TIMEOUT_SECONDS, DEFAULT_VIDEO_MAX_BASE64_BYTES, MIN_AUDIO_FILE_BYTES } from "./defaults.mjs";
|
||||
import { MediaUnderstandingSkipError, isMediaUnderstandingSkipError } from "./errors.mjs";
|
||||
import { extractMediaUserText, formatAudioTranscripts, formatMediaUnderstandingBody } from "./format.mjs";
|
||||
import { OpenAiCompatibleVideoPayload, buildOpenAiCompatibleVideoRequestBody, coerceOpenAiCompatibleVideoText, resolveMediaUnderstandingString } from "./openai-compatible-video.mjs";
|
||||
import { extractGeminiResponse } from "./output-extract.mjs";
|
||||
import { normalizeMediaExecutionProviderId, normalizeMediaProviderId } from "./provider-id.mjs";
|
||||
import { providerSupportsCapability } from "./provider-supports.mjs";
|
||||
import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.mjs";
|
||||
export { ActiveMediaModel, CLI_OUTPUT_MAX_BUFFER, DEFAULT_MAX_BYTES, DEFAULT_MAX_CHARS, DEFAULT_MAX_CHARS_BY_CAPABILITY, DEFAULT_MEDIA_CONCURRENCY, DEFAULT_PROMPT, DEFAULT_TIMEOUT_SECONDS, DEFAULT_VIDEO_MAX_BASE64_BYTES, MIN_AUDIO_FILE_BYTES, MediaAttachment, MediaUnderstandingCapability, MediaUnderstandingCapabilityRegistry, MediaUnderstandingKind, MediaUnderstandingOutput, MediaUnderstandingProvider, MediaUnderstandingSkipError, OpenAiCompatibleVideoPayload, buildOpenAiCompatibleVideoRequestBody, coerceOpenAiCompatibleVideoText, estimateBase64Size, extractGeminiResponse, extractMediaUserText, formatAudioTranscripts, formatMediaUnderstandingBody, isMediaUnderstandingSkipError, normalizeMediaExecutionProviderId, normalizeMediaProviderId, providerSupportsCapability, resolveMediaUnderstandingString, resolveVideoMaxBase64Bytes };
|
||||
11
packages/media-understanding-common/dist/index.mjs
vendored
Normal file
11
packages/media-understanding-common/dist/index.mjs
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
import "./active-model.mjs";
|
||||
import { CLI_OUTPUT_MAX_BUFFER, DEFAULT_MAX_BYTES, DEFAULT_MAX_CHARS, DEFAULT_MAX_CHARS_BY_CAPABILITY, DEFAULT_MEDIA_CONCURRENCY, DEFAULT_PROMPT, DEFAULT_TIMEOUT_SECONDS, DEFAULT_VIDEO_MAX_BASE64_BYTES, MIN_AUDIO_FILE_BYTES } from "./defaults.mjs";
|
||||
import { MediaUnderstandingSkipError, isMediaUnderstandingSkipError } from "./errors.mjs";
|
||||
import { extractMediaUserText, formatAudioTranscripts, formatMediaUnderstandingBody } from "./format.mjs";
|
||||
import { buildOpenAiCompatibleVideoRequestBody, coerceOpenAiCompatibleVideoText, resolveMediaUnderstandingString } from "./openai-compatible-video.mjs";
|
||||
import { extractGeminiResponse } from "./output-extract.mjs";
|
||||
import { normalizeMediaExecutionProviderId, normalizeMediaProviderId } from "./provider-id.mjs";
|
||||
import { providerSupportsCapability } from "./provider-supports.mjs";
|
||||
import "./types.mjs";
|
||||
import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.mjs";
|
||||
export { CLI_OUTPUT_MAX_BUFFER, DEFAULT_MAX_BYTES, DEFAULT_MAX_CHARS, DEFAULT_MAX_CHARS_BY_CAPABILITY, DEFAULT_MEDIA_CONCURRENCY, DEFAULT_PROMPT, DEFAULT_TIMEOUT_SECONDS, DEFAULT_VIDEO_MAX_BASE64_BYTES, MIN_AUDIO_FILE_BYTES, MediaUnderstandingSkipError, buildOpenAiCompatibleVideoRequestBody, coerceOpenAiCompatibleVideoText, estimateBase64Size, extractGeminiResponse, extractMediaUserText, formatAudioTranscripts, formatMediaUnderstandingBody, isMediaUnderstandingSkipError, normalizeMediaExecutionProviderId, normalizeMediaProviderId, providerSupportsCapability, resolveMediaUnderstandingString, resolveVideoMaxBase64Bytes };
|
||||
37
packages/media-understanding-common/dist/openai-compatible-video.d.mts
vendored
Normal file
37
packages/media-understanding-common/dist/openai-compatible-video.d.mts
vendored
Normal file
@@ -0,0 +1,37 @@
|
||||
//#region packages/media-understanding-common/src/openai-compatible-video.d.ts
/** Response shape read by `coerceOpenAiCompatibleVideoText`; every field is optional. */
type OpenAiCompatibleVideoPayload = {
  choices?: Array<{
    message?: {
      content?: string | Array<{
        text?: string;
      }>;
      reasoning_content?: string;
    };
  }>;
};
/** Takes an optional string and a fallback; returns a string. */
declare function resolveMediaUnderstandingString(value: string | undefined, fallback: string): string;
/** Extracts text from a payload, or returns `null`. */
declare function coerceOpenAiCompatibleVideoText(payload: OpenAiCompatibleVideoPayload): string | null;
/** Builds a request body with a single message holding a text part and a `video_url` part. */
declare function buildOpenAiCompatibleVideoRequestBody(params: {
  model: string;
  prompt: string;
  mime: string;
  buffer: Buffer;
}): {
  model: string;
  messages: {
    role: string;
    content: ({
      type: string;
      text: string;
      video_url?: undefined;
    } | {
      type: string;
      video_url: {
        url: string;
      };
      text?: undefined;
    })[];
  }[];
};
//#endregion
export { OpenAiCompatibleVideoPayload, buildOpenAiCompatibleVideoRequestBody, coerceOpenAiCompatibleVideoText, resolveMediaUnderstandingString };
|
||||
32
packages/media-understanding-common/dist/openai-compatible-video.mjs
vendored
Normal file
32
packages/media-understanding-common/dist/openai-compatible-video.mjs
vendored
Normal file
@@ -0,0 +1,32 @@
|
||||
//#region packages/media-understanding-common/src/openai-compatible-video.ts
// Returns the trimmed value, or the fallback when the value is missing or blank.
function resolveMediaUnderstandingString(value, fallback) {
  return value?.trim() || fallback;
}

// Pulls text out of the first choice of a chat-completion style payload.
// Order of preference: string content, joined text parts, reasoning_content.
// Returns null when none of them yields non-blank text.
function coerceOpenAiCompatibleVideoText(payload) {
  const message = payload.choices?.[0]?.message;
  if (!message) return null;
  if (typeof message.content === "string" && message.content.trim()) return message.content.trim();
  if (Array.isArray(message.content)) {
    // The payload comes from a remote service, so tolerate null parts and
    // non-string `text` values instead of throwing on `.trim()`.
    const text = message.content
      .map((part) => (typeof part?.text === "string" ? part.text.trim() : ""))
      .filter(Boolean)
      .join("\n");
    if (text) return text;
  }
  if (typeof message.reasoning_content === "string" && message.reasoning_content.trim()) return message.reasoning_content.trim();
  return null;
}

// Builds a request body with one user message: the prompt as a text part and
// the video inlined as a base64 `data:` URL in a `video_url` part.
function buildOpenAiCompatibleVideoRequestBody(params) {
  return {
    model: params.model,
    messages: [{
      role: "user",
      content: [{
        type: "text",
        text: params.prompt
      }, {
        type: "video_url",
        video_url: { url: `data:${params.mime};base64,${params.buffer.toString("base64")}` }
      }]
    }]
  };
}
//#endregion
export { buildOpenAiCompatibleVideoRequestBody, coerceOpenAiCompatibleVideoText, resolveMediaUnderstandingString };
|
||||
4
packages/media-understanding-common/dist/output-extract.d.mts
vendored
Normal file
4
packages/media-understanding-common/dist/output-extract.d.mts
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
//#region packages/media-understanding-common/src/output-extract.d.ts
/** Takes raw text and returns a string, or `null` when nothing could be extracted. */
declare function extractGeminiResponse(raw: string): string | null;
//#endregion
export { extractGeminiResponse };
|
||||
21
packages/media-understanding-common/dist/output-extract.mjs
vendored
Normal file
21
packages/media-understanding-common/dist/output-extract.mjs
vendored
Normal file
@@ -0,0 +1,21 @@
|
||||
//#region packages/media-understanding-common/src/output-extract.ts
// Parses the last JSON object that ends at the end of `raw`.
// Candidates are tried from the right-most "{" backwards, because the
// right-most brace may belong to a nested object or sit inside a string
// value; in both cases only an earlier "{" starts a parseable object.
// Returns null when no candidate parses.
function extractLastJsonObject(raw) {
  const trimmed = raw.trim();
  let start = trimmed.lastIndexOf("{");
  while (start !== -1) {
    try {
      return JSON.parse(trimmed.slice(start));
    } catch {
      // Not a complete object from here; fall through to the previous "{".
    }
    // lastIndexOf clamps negative positions to 0, so stop explicitly at index 0.
    start = start > 0 ? trimmed.lastIndexOf("{", start - 1) : -1;
  }
  return null;
}

// Returns the trimmed string `response` field of the trailing JSON object in
// `raw`, or null when there is no such object, the field is not a string, or
// it is blank.
function extractGeminiResponse(raw) {
  const payload = extractLastJsonObject(raw);
  if (!payload || typeof payload !== "object") return null;
  const response = payload.response;
  if (typeof response !== "string") return null;
  return response.trim() || null;
}
//#endregion
export { extractGeminiResponse };
|
||||
5
packages/media-understanding-common/dist/provider-id.d.mts
vendored
Normal file
5
packages/media-understanding-common/dist/provider-id.d.mts
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
//#region packages/media-understanding-common/src/provider-id.d.ts
/** Maps a provider id string to a normalized id string. */
declare function normalizeMediaProviderId(id: string): string;
/** Maps a provider id string to a normalized execution id string. */
declare function normalizeMediaExecutionProviderId(id: string): string;
//#endregion
export { normalizeMediaExecutionProviderId, normalizeMediaProviderId };
|
||||
18
packages/media-understanding-common/dist/provider-id.mjs
vendored
Normal file
18
packages/media-understanding-common/dist/provider-id.mjs
vendored
Normal file
@@ -0,0 +1,18 @@
|
||||
//#region packages/media-understanding-common/src/provider-id.ts
// Trims and lower-cases a provider id.
function normalizeProviderId(provider) {
  return provider.trim().toLowerCase();
}

// Normalizes a provider id and folds aliases onto their base id:
// "gemini" -> "google", "minimax-cn" -> "minimax",
// "minimax-portal-cn" -> "minimax-portal". Other ids pass through.
function normalizeMediaProviderId(id) {
  const normalized = normalizeProviderId(id);
  if (normalized === "gemini") return "google";
  if (normalized === "minimax-cn") return "minimax";
  if (normalized === "minimax-portal-cn") return "minimax-portal";
  return normalized;
}

// Same as normalizeMediaProviderId, except that the two "-cn" MiniMax ids are
// kept distinct instead of being folded onto their base id.
function normalizeMediaExecutionProviderId(id) {
  const normalized = normalizeProviderId(id);
  if (normalized === "minimax-cn" || normalized === "minimax-portal-cn") return normalized;
  return normalizeMediaProviderId(normalized);
}
//#endregion
export { normalizeMediaExecutionProviderId, normalizeMediaProviderId };
|
||||
6
packages/media-understanding-common/dist/provider-supports.d.mts
vendored
Normal file
6
packages/media-understanding-common/dist/provider-supports.d.mts
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
import { MediaUnderstandingCapability, MediaUnderstandingProvider } from "./types.mjs";
|
||||
|
||||
//#region packages/media-understanding-common/src/provider-supports.d.ts
/** Reports whether the (possibly undefined) provider supports the given capability. */
declare function providerSupportsCapability(provider: MediaUnderstandingProvider | undefined, capability: MediaUnderstandingCapability): boolean;
//#endregion
export { providerSupportsCapability };
|
||||
9
packages/media-understanding-common/dist/provider-supports.mjs
vendored
Normal file
9
packages/media-understanding-common/dist/provider-supports.mjs
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
//#region packages/media-understanding-common/src/provider-supports.ts
// Reports whether `provider` has a truthy handler for `capability`:
// "audio" -> transcribeAudio, "image" -> describeImage, anything else ->
// describeVideo. A missing provider supports nothing.
function providerSupportsCapability(provider, capability) {
  if (!provider) return false;
  if (capability === "audio") return Boolean(provider.transcribeAudio);
  if (capability === "image") return Boolean(provider.describeImage);
  return Boolean(provider.describeVideo);
}
//#endregion
export { providerSupportsCapability };
|
||||
31
packages/media-understanding-common/dist/types.d.mts
vendored
Normal file
31
packages/media-understanding-common/dist/types.d.mts
vendored
Normal file
@@ -0,0 +1,31 @@
|
||||
//#region packages/media-understanding-common/src/types.d.ts
/** Kind tag carried by a {@link MediaUnderstandingOutput}. */
type MediaUnderstandingKind = "audio.transcription" | "video.description" | "image.description";
/** The three capability names. */
type MediaUnderstandingCapability = "image" | "audio" | "video";
/** Map from a string key to an entry with an optional capability list. */
type MediaUnderstandingCapabilityRegistry = Map<string, {
  capabilities?: MediaUnderstandingCapability[];
}>;
/** An attachment: optional `path`, `url`, `mime` and `alreadyTranscribed`, plus a required `index`. */
type MediaAttachment = {
  path?: string;
  url?: string;
  mime?: string;
  index: number;
  alreadyTranscribed?: boolean;
};
/** One result: its kind, the attachment index it refers to, the text, and the provider/model. */
type MediaUnderstandingOutput = {
  kind: MediaUnderstandingKind;
  attachmentIndex: number;
  text: string;
  provider: string;
  model?: string;
};
/** Provider descriptor; the handler fields are typed `unknown` in this package. */
type MediaUnderstandingProvider = {
  id: string;
  capabilities?: MediaUnderstandingCapability[];
  transcribeAudio?: unknown;
  describeVideo?: unknown;
  describeImage?: unknown;
  describeImages?: unknown;
  extractStructured?: unknown;
};
//#endregion
export { MediaAttachment, MediaUnderstandingCapability, MediaUnderstandingCapabilityRegistry, MediaUnderstandingKind, MediaUnderstandingOutput, MediaUnderstandingProvider };
|
||||
1
packages/media-understanding-common/dist/types.mjs
vendored
Normal file
1
packages/media-understanding-common/dist/types.mjs
vendored
Normal file
@@ -0,0 +1 @@
|
||||
export {};
|
||||
5
packages/media-understanding-common/dist/video.d.mts
vendored
Normal file
5
packages/media-understanding-common/dist/video.d.mts
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
//#region packages/media-understanding-common/src/video.d.ts
/** Takes a byte count and returns a number. */
declare function estimateBase64Size(bytes: number): number;
/** Takes a byte limit and returns a number. */
declare function resolveVideoMaxBase64Bytes(maxBytes: number): number;
//#endregion
export { estimateBase64Size, resolveVideoMaxBase64Bytes };
|
||||
11
packages/media-understanding-common/dist/video.mjs
vendored
Normal file
11
packages/media-understanding-common/dist/video.mjs
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
import { DEFAULT_VIDEO_MAX_BASE64_BYTES } from "./defaults.mjs";
|
||||
//#region packages/media-understanding-common/src/video.ts
// Length of the base64 encoding of `bytes` bytes: every started group of
// three bytes becomes four characters.
function estimateBase64Size(bytes) {
  return Math.ceil(bytes / 3) * 4;
}

// Scales a raw byte limit by 4/3 (rounded down) and caps the result at
// DEFAULT_VIDEO_MAX_BASE64_BYTES.
// NOTE(review): floor(maxBytes * 4/3) can be a few bytes smaller than
// estimateBase64Size(maxBytes), which rounds up to whole four-character
// groups (e.g. 13 vs 16 for maxBytes = 10) - confirm the gap is intended.
function resolveVideoMaxBase64Bytes(maxBytes) {
  const expanded = Math.floor(maxBytes * (4 / 3));
  return Math.min(expanded, DEFAULT_VIDEO_MAX_BASE64_BYTES);
}
//#endregion
export { estimateBase64Size, resolveVideoMaxBase64Bytes };
|
||||
71
packages/media-understanding-common/package.json
Normal file
71
packages/media-understanding-common/package.json
Normal file
@@ -0,0 +1,71 @@
|
||||
{
|
||||
"name": "@openclaw/media-understanding-common",
|
||||
"version": "0.0.0-private",
|
||||
"private": true,
|
||||
"files": [
|
||||
"dist"
|
||||
],
|
||||
"type": "module",
|
||||
"main": "./dist/index.mjs",
|
||||
"types": "./dist/index.d.mts",
|
||||
"exports": {
|
||||
".": {
|
||||
"types": "./dist/index.d.mts",
|
||||
"import": "./dist/index.mjs",
|
||||
"default": "./dist/index.mjs"
|
||||
},
|
||||
"./active-model": {
|
||||
"types": "./dist/active-model.d.mts",
|
||||
"import": "./dist/active-model.mjs",
|
||||
"default": "./dist/active-model.mjs"
|
||||
},
|
||||
"./defaults": {
|
||||
"types": "./dist/defaults.d.mts",
|
||||
"import": "./dist/defaults.mjs",
|
||||
"default": "./dist/defaults.mjs"
|
||||
},
|
||||
"./errors": {
|
||||
"types": "./dist/errors.d.mts",
|
||||
"import": "./dist/errors.mjs",
|
||||
"default": "./dist/errors.mjs"
|
||||
},
|
||||
"./format": {
|
||||
"types": "./dist/format.d.mts",
|
||||
"import": "./dist/format.mjs",
|
||||
"default": "./dist/format.mjs"
|
||||
},
|
||||
"./openai-compatible-video": {
|
||||
"types": "./dist/openai-compatible-video.d.mts",
|
||||
"import": "./dist/openai-compatible-video.mjs",
|
||||
"default": "./dist/openai-compatible-video.mjs"
|
||||
},
|
||||
"./output-extract": {
|
||||
"types": "./dist/output-extract.d.mts",
|
||||
"import": "./dist/output-extract.mjs",
|
||||
"default": "./dist/output-extract.mjs"
|
||||
},
|
||||
"./provider-id": {
|
||||
"types": "./dist/provider-id.d.mts",
|
||||
"import": "./dist/provider-id.mjs",
|
||||
"default": "./dist/provider-id.mjs"
|
||||
},
|
||||
"./provider-supports": {
|
||||
"types": "./dist/provider-supports.d.mts",
|
||||
"import": "./dist/provider-supports.mjs",
|
||||
"default": "./dist/provider-supports.mjs"
|
||||
},
|
||||
"./types": {
|
||||
"types": "./dist/types.d.mts",
|
||||
"import": "./dist/types.mjs",
|
||||
"default": "./dist/types.mjs"
|
||||
},
|
||||
"./video": {
|
||||
"types": "./dist/video.d.mts",
|
||||
"import": "./dist/video.mjs",
|
||||
"default": "./dist/video.mjs"
|
||||
}
|
||||
},
|
||||
"scripts": {
|
||||
"build": "tsdown src/index.ts src/active-model.ts src/defaults.ts src/errors.ts src/format.ts src/openai-compatible-video.ts src/output-extract.ts src/provider-id.ts src/provider-supports.ts src/types.ts src/video.ts --no-config --platform node --format esm --dts --out-dir dist --clean"
|
||||
}
|
||||
}
|
||||
4
packages/media-understanding-common/src/active-model.ts
Normal file
4
packages/media-understanding-common/src/active-model.ts
Normal file
@@ -0,0 +1,4 @@
|
||||
/** Provider/model pair: `provider` is required, `model` may be omitted. */
export type ActiveMediaModel = {
  provider: string;
  model?: string;
};
|
||||
32
packages/media-understanding-common/src/defaults.ts
Normal file
32
packages/media-understanding-common/src/defaults.ts
Normal file
@@ -0,0 +1,32 @@
|
||||
import type { MediaUnderstandingCapability } from "./types.js";
|
||||
|
||||
/** Bytes in one mebibyte; the size limits below are expressed as multiples of it. */
const MB = 1024 * 1024;

/** Default character limit; reused for the image and video entries below. */
export const DEFAULT_MAX_CHARS = 500;

/** Character limit per capability; the audio entry is left `undefined`. */
export const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record<
  MediaUnderstandingCapability,
  number | undefined
> = {
  image: DEFAULT_MAX_CHARS,
  audio: undefined,
  video: DEFAULT_MAX_CHARS,
};

/** Byte limit per capability: 10 MB image, 20 MB audio, 50 MB video. */
export const DEFAULT_MAX_BYTES: Record<MediaUnderstandingCapability, number> = {
  image: 10 * MB,
  audio: 20 * MB,
  video: 50 * MB,
};

/** Timeout per capability (seconds, per the constant's name). */
export const DEFAULT_TIMEOUT_SECONDS: Record<MediaUnderstandingCapability, number> = {
  image: 60,
  audio: 60,
  video: 120,
};

/** Prompt text per capability. */
export const DEFAULT_PROMPT: Record<MediaUnderstandingCapability, string> = {
  image: "Describe the image.",
  audio: "Transcribe the audio.",
  video: "Describe the video.",
};

/** Cap applied to base64-encoded video size (70 MB). */
export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;

/** Buffer size constant for CLI output (5 MB). */
export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;

/** Default media concurrency. */
export const DEFAULT_MEDIA_CONCURRENCY = 2;

/** Minimum audio file size in bytes. */
export const MIN_AUDIO_FILE_BYTES = 1024;
|
||||
21
packages/media-understanding-common/src/errors.ts
Normal file
21
packages/media-understanding-common/src/errors.ts
Normal file
@@ -0,0 +1,21 @@
|
||||
/**
 * Closed set of reasons accepted by {@link MediaUnderstandingSkipError}.
 * Exported so callers can name the type of `error.reason`.
 */
export type MediaUnderstandingSkipReason =
  | "maxBytes"
  | "timeout"
  | "unsupported"
  | "empty"
  | "blocked"
  | "tooSmall";

/** Error carrying a machine-readable `reason` alongside its message. */
export class MediaUnderstandingSkipError extends Error {
  readonly reason: MediaUnderstandingSkipReason;

  /**
   * @param reason - One of the {@link MediaUnderstandingSkipReason} values.
   * @param message - Human-readable message passed to `Error`.
   */
  constructor(reason: MediaUnderstandingSkipReason, message: string) {
    super(message);
    this.reason = reason;
    this.name = "MediaUnderstandingSkipError";
  }
}

/**
 * Type guard for {@link MediaUnderstandingSkipError}.
 *
 * @returns `true` only when `err` is an instance of the class above.
 */
export function isMediaUnderstandingSkipError(err: unknown): err is MediaUnderstandingSkipError {
  return err instanceof MediaUnderstandingSkipError;
}
|
||||
98
packages/media-understanding-common/src/format.ts
Normal file
98
packages/media-understanding-common/src/format.ts
Normal file
@@ -0,0 +1,98 @@
|
||||
import type { MediaUnderstandingOutput } from "./types.js";
|
||||
|
||||
/** Matches a body that is nothing but a `<media:...>` placeholder with an optional "(...)" suffix. */
const MEDIA_PLACEHOLDER_RE = /^<media:[^>]+>(\s*\([^)]*\))?$/i;
/** Matches one leading placeholder (plus optional "(...)" and trailing whitespace) so it can be stripped. */
const MEDIA_PLACEHOLDER_TOKEN_RE = /^<media:[^>]+>(\s*\([^)]*\))?\s*/i;

/**
 * Returns the body text with one leading media placeholder removed.
 *
 * @param body - Raw message body; may be missing.
 * @returns The remaining text, or `undefined` when the body is empty or
 *   consists only of a placeholder.
 */
export function extractMediaUserText(body?: string): string | undefined {
  const trimmed = body?.trim() ?? "";
  if (!trimmed || MEDIA_PLACEHOLDER_RE.test(trimmed)) {
    return undefined;
  }
  const cleaned = trimmed.replace(MEDIA_PLACEHOLDER_TOKEN_RE, "").trim();
  return cleaned || undefined;
}

/** Renders "[title]", an optional "User text:" part, then "<kind>:" followed by the text. */
function formatSection(
  title: string,
  kind: "Transcript" | "Description",
  text: string,
  userText?: string,
): string {
  const lines = [`[${title}]`];
  if (userText) {
    lines.push(`User text:\n${userText}`);
  }
  lines.push(`${kind}:\n${text}`);
  return lines.join("\n");
}

/**
 * Maps an output kind to its section title and heading.
 * Any kind other than audio/image is rendered as a video description.
 */
function sectionLabelFor(kind: MediaUnderstandingOutput["kind"]): {
  title: string;
  heading: "Transcript" | "Description";
} {
  switch (kind) {
    case "audio.transcription":
      return { title: "Audio", heading: "Transcript" };
    case "image.description":
      return { title: "Image", heading: "Description" };
    default:
      return { title: "Video", heading: "Description" };
  }
}

/**
 * Combines the user's text with the non-blank outputs into labelled sections.
 *
 * - No non-blank outputs: returns the original body (or `""`).
 * - Exactly one output: the user text is placed inside that section.
 * - Several outputs: the user text becomes its own leading section and
 *   outputs sharing a kind get an ` i/n` suffix.
 *
 * @param params.body - Raw message body; may be missing.
 * @param params.outputs - Outputs to render; blank-text entries are ignored.
 * @returns Sections joined by blank lines, trimmed.
 */
export function formatMediaUnderstandingBody(params: {
  body?: string;
  outputs: MediaUnderstandingOutput[];
}): string {
  const outputs = params.outputs.filter((output) => output.text.trim());
  if (outputs.length === 0) {
    return params.body ?? "";
  }

  const userText = extractMediaUserText(params.body);
  const isSingle = outputs.length === 1;
  const sections: string[] = [];
  if (userText && !isSingle) {
    sections.push(`User text:\n${userText}`);
  }

  // First pass: how many outputs exist per kind (decides whether a suffix is needed).
  const totals = new Map<MediaUnderstandingOutput["kind"], number>();
  for (const output of outputs) {
    totals.set(output.kind, (totals.get(output.kind) ?? 0) + 1);
  }

  // Second pass: emit sections, numbering each kind independently.
  const seen = new Map<MediaUnderstandingOutput["kind"], number>();
  for (const output of outputs) {
    const total = totals.get(output.kind) ?? 1;
    const position = (seen.get(output.kind) ?? 0) + 1;
    seen.set(output.kind, position);
    const suffix = total > 1 ? ` ${position}/${total}` : "";
    const { title, heading } = sectionLabelFor(output.kind);
    sections.push(
      formatSection(`${title}${suffix}`, heading, output.text, isSingle ? userText : undefined),
    );
  }

  return sections.join("\n\n").trim();
}

/**
 * Returns the lone transcript as-is, or "Audio N:" blocks joined by blank lines.
 *
 * @param outputs - Outputs whose `text` is rendered; an empty list yields `""`.
 */
export function formatAudioTranscripts(outputs: MediaUnderstandingOutput[]): string {
  const [only] = outputs;
  if (outputs.length === 1 && only !== undefined) {
    return only.text;
  }
  return outputs.map((output, index) => `Audio ${index + 1}:\n${output.text}`).join("\n\n");
}
|
||||
10
packages/media-understanding-common/src/index.ts
Normal file
10
packages/media-understanding-common/src/index.ts
Normal file
@@ -0,0 +1,10 @@
|
||||
export * from "./active-model.js";
|
||||
export * from "./defaults.js";
|
||||
export * from "./errors.js";
|
||||
export * from "./format.js";
|
||||
export * from "./openai-compatible-video.js";
|
||||
export * from "./output-extract.js";
|
||||
export * from "./provider-id.js";
|
||||
export * from "./provider-supports.js";
|
||||
export * from "./types.js";
|
||||
export * from "./video.js";
|
||||
@@ -0,0 +1,66 @@
|
||||
/** Response shape read by {@link coerceOpenAiCompatibleVideoText}; every field is optional. */
export type OpenAiCompatibleVideoPayload = {
  choices?: Array<{
    message?: {
      content?: string | Array<{ text?: string }>;
      reasoning_content?: string;
    };
  }>;
};

/**
 * Returns the trimmed value, or the fallback when the value is missing or blank.
 *
 * @param value - Candidate string; may be `undefined`.
 * @param fallback - Returned as-is when `value` trims to the empty string.
 */
export function resolveMediaUnderstandingString(
  value: string | undefined,
  fallback: string,
): string {
  const trimmed = value?.trim();
  return trimmed || fallback;
}

/**
 * Pulls text out of the first choice of a chat-completion style payload.
 *
 * Order of preference: string `content`, joined text parts of an array
 * `content`, then `reasoning_content`.
 *
 * @returns The trimmed text, or `null` when none of the sources is non-blank.
 */
export function coerceOpenAiCompatibleVideoText(
  payload: OpenAiCompatibleVideoPayload,
): string | null {
  const message = payload.choices?.[0]?.message;
  if (!message) {
    return null;
  }
  const { content } = message;
  if (typeof content === "string" && content.trim()) {
    return content.trim();
  }
  if (Array.isArray(content)) {
    // The payload comes from a remote service, so the declared part type is
    // not trusted: null parts and non-string `text` values are skipped
    // instead of throwing on `.trim()`.
    const text = content
      .map((part) => {
        const candidate: unknown = part?.text;
        return typeof candidate === "string" ? candidate.trim() : "";
      })
      .filter(Boolean)
      .join("\n");
    if (text) {
      return text;
    }
  }
  if (typeof message.reasoning_content === "string" && message.reasoning_content.trim()) {
    return message.reasoning_content.trim();
  }
  return null;
}
|
||||
|
||||
/**
 * Builds a chat-completions style request body containing a single user
 * message with two parts: the text prompt and the video inlined as a
 * base64 `data:` URL.
 *
 * @param params.model - Model identifier copied into the body.
 * @param params.prompt - Text placed in the `text` part.
 * @param params.mime - MIME type used in the data URL prefix.
 * @param params.buffer - Raw bytes that are base64-encoded into the URL.
 */
export function buildOpenAiCompatibleVideoRequestBody(params: {
  model: string;
  prompt: string;
  mime: string;
  buffer: Buffer;
}) {
  const { model, prompt, mime, buffer } = params;
  const encoded = buffer.toString("base64");
  const dataUrl = `data:${mime};base64,${encoded}`;
  return {
    model,
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: prompt },
          { type: "video_url", video_url: { url: dataUrl } },
        ],
      },
    ],
  };
}
|
||||
26
packages/media-understanding-common/src/output-extract.ts
Normal file
26
packages/media-understanding-common/src/output-extract.ts
Normal file
@@ -0,0 +1,26 @@
|
||||
/**
 * Parses the last JSON object that runs to the end of `raw`.
 *
 * Candidate start positions are tried from the last `{` backwards until a
 * slice parses. Starting only at the last `{` is not enough: when the object
 * contains a nested object, or a `{` inside a string value, that brace sits
 * inside the object we want and the slice from it is not valid JSON.
 *
 * @param raw - Text that may have non-JSON content before the object.
 * @returns The parsed object, or `null` when no suffix of `raw` is one.
 */
function extractLastJsonObject(raw: string): unknown {
  const trimmed = raw.trim();
  // Any slice starting at "{" can only parse as an object, which ends in "}".
  if (!trimmed.endsWith("}")) {
    return null;
  }
  let start = trimmed.lastIndexOf("{");
  while (start !== -1) {
    try {
      return JSON.parse(trimmed.slice(start));
    } catch {
      // This brace is nested or inside a string; try the previous one.
    }
    // lastIndexOf clamps a negative fromIndex to 0, so stop explicitly at 0.
    start = start === 0 ? -1 : trimmed.lastIndexOf("{", start - 1);
  }
  return null;
}

/**
 * Pulls the `response` string out of JSON text that ends with an object.
 *
 * @param raw - Raw output; content before the trailing JSON object is ignored.
 * @returns The trimmed `response` value, or `null` when there is no trailing
 *   JSON object, `response` is not a string, or it is blank.
 */
export function extractGeminiResponse(raw: string): string | null {
  const payload = extractLastJsonObject(raw);
  if (!payload || typeof payload !== "object") {
    return null;
  }
  const response = (payload as { response?: unknown }).response;
  if (typeof response !== "string") {
    return null;
  }
  const trimmed = response.trim();
  return trimmed || null;
}
|
||||
25
packages/media-understanding-common/src/provider-id.ts
Normal file
25
packages/media-understanding-common/src/provider-id.ts
Normal file
@@ -0,0 +1,25 @@
|
||||
/** Trims surrounding whitespace from a provider id and lower-cases it. */
function normalizeProviderId(provider: string): string {
  const withoutPadding = provider.trim();
  return withoutPadding.toLowerCase();
}

/**
 * Normalizes a provider id and folds known aliases:
 * "gemini" becomes "google", "minimax-cn" becomes "minimax", and
 * "minimax-portal-cn" becomes "minimax-portal". Other ids are returned in
 * their trimmed, lower-cased form.
 */
export function normalizeMediaProviderId(id: string): string {
  const key = normalizeProviderId(id);
  switch (key) {
    case "gemini":
      return "google";
    case "minimax-cn":
      return "minimax";
    case "minimax-portal-cn":
      return "minimax-portal";
    default:
      return key;
  }
}

/**
 * Like {@link normalizeMediaProviderId}, except that "minimax-cn" and
 * "minimax-portal-cn" are kept as-is instead of being folded.
 */
export function normalizeMediaExecutionProviderId(id: string): string {
  const key = normalizeProviderId(id);
  const keepsRegionalId = key === "minimax-cn" || key === "minimax-portal-cn";
  return keepsRegionalId ? key : normalizeMediaProviderId(key);
}
|
||||
17
packages/media-understanding-common/src/provider-supports.ts
Normal file
17
packages/media-understanding-common/src/provider-supports.ts
Normal file
@@ -0,0 +1,17 @@
|
||||
import type { MediaUnderstandingCapability, MediaUnderstandingProvider } from "./types.js";
|
||||
|
||||
/**
 * Reports whether a provider exposes the handler for a capability.
 *
 * "audio" checks `transcribeAudio`, "image" checks `describeImage`, and any
 * other capability value checks `describeVideo`. A missing provider never
 * supports anything.
 */
export function providerSupportsCapability(
  provider: MediaUnderstandingProvider | undefined,
  capability: MediaUnderstandingCapability,
): boolean {
  if (!provider) {
    return false;
  }
  switch (capability) {
    case "audio":
      return !!provider.transcribeAudio;
    case "image":
      return !!provider.describeImage;
    default:
      return !!provider.describeVideo;
  }
}
|
||||
39
packages/media-understanding-common/src/types.ts
Normal file
39
packages/media-understanding-common/src/types.ts
Normal file
@@ -0,0 +1,39 @@
|
||||
/** Tag identifying which kind of text an understanding output carries. */
export type MediaUnderstandingKind =
  | "audio.transcription"
  | "video.description"
  | "image.description";

/** Media categories that understanding can be requested for. */
export type MediaUnderstandingCapability = "image" | "audio" | "video";

/**
 * Map from a string key to an entry with an optional capability list.
 * (Presumably keyed by provider id; confirm against callers.)
 */
export type MediaUnderstandingCapabilityRegistry = Map<
  string,
  { capabilities?: Array<MediaUnderstandingCapability> }
>;

/** A media item; only `index` is required, every locator field is optional. */
export type MediaAttachment = {
  path?: string;
  url?: string;
  mime?: string;
  index: number;
  alreadyTranscribed?: boolean;
};

/** Text produced for one attachment, with the provider (and model, if known). */
export type MediaUnderstandingOutput = {
  kind: MediaUnderstandingKind;
  attachmentIndex: number;
  text: string;
  provider: string;
  model?: string;
};

/**
 * Provider descriptor. The handler slots are typed `unknown` here, so this
 * module only lets callers test for their presence, not invoke them.
 */
export type MediaUnderstandingProvider = {
  id: string;
  capabilities?: Array<MediaUnderstandingCapability>;
  transcribeAudio?: unknown;
  describeVideo?: unknown;
  describeImage?: unknown;
  describeImages?: unknown;
  extractStructured?: unknown;
};
|
||||
10
packages/media-understanding-common/src/video.ts
Normal file
10
packages/media-understanding-common/src/video.ts
Normal file
@@ -0,0 +1,10 @@
|
||||
import { DEFAULT_VIDEO_MAX_BASE64_BYTES } from "./defaults.js";
|
||||
|
||||
/**
 * Estimates the base64 length for `bytes` input bytes: four output
 * characters for every started group of three input bytes.
 */
export function estimateBase64Size(bytes: number): number {
  const groupsOfThree = Math.ceil(bytes / 3);
  return groupsOfThree * 4;
}
|
||||
|
||||
/**
 * Converts a raw byte limit into a base64 byte limit by scaling it by 4/3
 * (rounded down) and capping the result at DEFAULT_VIDEO_MAX_BASE64_BYTES.
 *
 * NOTE(review): estimateBase64Size in this file rounds up to whole
 * 4-character groups, while this rounds the 4/3 scaling down, so the two can
 * differ by a few bytes at the boundary. Confirm that is intended.
 */
export function resolveVideoMaxBase64Bytes(maxBytes: number): number {
  const scaled = maxBytes * (4 / 3);
  const wholeBytes = Math.floor(scaled);
  return Math.min(DEFAULT_VIDEO_MAX_BASE64_BYTES, wholeBytes);
}
|
||||
2
pnpm-lock.yaml
generated
2
pnpm-lock.yaml
generated
@@ -1832,6 +1832,8 @@ importers:
|
||||
|
||||
packages/media-generation-core: {}
|
||||
|
||||
packages/media-understanding-common: {}
|
||||
|
||||
packages/memory-host-sdk: {}
|
||||
|
||||
packages/net-policy:
|
||||
|
||||
@@ -48,6 +48,7 @@ export const BUILD_ALL_STEPS = [
|
||||
"packages/plugin-sdk/package.json",
|
||||
"packages/llm-core/package.json",
|
||||
"packages/markdown-core/package.json",
|
||||
"packages/media-understanding-common/package.json",
|
||||
"packages/terminal-core/package.json",
|
||||
"packages/memory-host-sdk/package.json",
|
||||
"tsconfig.json",
|
||||
@@ -57,6 +58,7 @@ export const BUILD_ALL_STEPS = [
|
||||
"packages/markdown-core/src",
|
||||
"packages/memory-host-sdk/src",
|
||||
"packages/media-generation-core/src",
|
||||
"packages/media-understanding-common/src",
|
||||
"packages/terminal-core/src",
|
||||
"src/types",
|
||||
"src/video-generation/dashscope-compatible.ts",
|
||||
|
||||
@@ -19,6 +19,7 @@ const PLUGIN_SDK_TYPE_INPUTS = [
|
||||
"packages/markdown-core/src",
|
||||
"packages/memory-host-sdk/src",
|
||||
"packages/media-generation-core/src",
|
||||
"packages/media-understanding-common/src",
|
||||
"packages/terminal-core/src",
|
||||
"src/video-generation/dashscope-compatible.ts",
|
||||
"src/video-generation/types.ts",
|
||||
|
||||
@@ -12,6 +12,7 @@ const RUN_NODE_PACKAGE_SOURCE_ROOTS = [
|
||||
"packages/gateway-protocol/src",
|
||||
"packages/markdown-core/src",
|
||||
"packages/media-generation-core/src",
|
||||
"packages/media-understanding-common/src",
|
||||
"packages/terminal-core/src",
|
||||
"packages/net-policy/src",
|
||||
];
|
||||
|
||||
@@ -1,4 +1 @@
|
||||
/** A provider id paired with an optional model name. */
export type ActiveMediaModel = {
  /** Provider identifier. */
  provider: string;
  /** Model name; may be omitted. */
  model?: string;
};
|
||||
export * from "../../packages/media-understanding-common/src/active-model.js";
|
||||
|
||||
@@ -1,32 +1 @@
|
||||
import type { MediaUnderstandingCapability } from "./types.js";
|
||||
|
||||
/** Bytes in one mebibyte. */
const MB = 1024 ** 2;

/** Default character cap for generated text. */
export const DEFAULT_MAX_CHARS = 500;

/** Per-capability character cap; `undefined` means no cap is configured. */
export const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record<
  MediaUnderstandingCapability,
  number | undefined
> = { image: DEFAULT_MAX_CHARS, audio: undefined, video: DEFAULT_MAX_CHARS };

/** Per-capability size limit, in bytes. */
export const DEFAULT_MAX_BYTES: Record<MediaUnderstandingCapability, number> = {
  image: MB * 10,
  audio: MB * 20,
  video: MB * 50,
};

/** Per-capability timeout, in seconds. */
export const DEFAULT_TIMEOUT_SECONDS: Record<MediaUnderstandingCapability, number> = {
  image: 60,
  audio: 60,
  video: 120,
};

/** Per-capability default prompt text. */
export const DEFAULT_PROMPT: Record<MediaUnderstandingCapability, string> = {
  image: "Describe the image.",
  audio: "Transcribe the audio.",
  video: "Describe the video.",
};

/** Upper bound for a base64-encoded video, in bytes (70 MiB). */
export const DEFAULT_VIDEO_MAX_BASE64_BYTES = MB * 70;

/** 5 MiB buffer size; by its name, the cap for captured CLI output. */
export const CLI_OUTPUT_MAX_BUFFER = MB * 5;

/** Default media concurrency value. */
export const DEFAULT_MEDIA_CONCURRENCY = 2;

/** Minimum audio file size value, in bytes. */
export const MIN_AUDIO_FILE_BYTES = 1024;
|
||||
export * from "../../packages/media-understanding-common/src/defaults.js";
|
||||
|
||||
@@ -1,21 +1 @@
|
||||
/** Reason codes that can be attached to a skip error. */
type MediaUnderstandingSkipReason =
  | "maxBytes"
  | "timeout"
  | "unsupported"
  | "empty"
  | "blocked"
  | "tooSmall";

/** Error carrying a machine-readable `reason` next to its message. */
export class MediaUnderstandingSkipError extends Error {
  readonly reason: MediaUnderstandingSkipReason;

  /**
   * @param reason - Reason code stored on the instance.
   * @param message - Message passed to the base `Error`.
   */
  constructor(reason: MediaUnderstandingSkipReason, message: string) {
    super(message);
    this.name = "MediaUnderstandingSkipError";
    this.reason = reason;
  }
}

/** Type guard: true when `err` is a MediaUnderstandingSkipError instance. */
export function isMediaUnderstandingSkipError(err: unknown): err is MediaUnderstandingSkipError {
  const matches = err instanceof MediaUnderstandingSkipError;
  return matches;
}
|
||||
export * from "../../packages/media-understanding-common/src/errors.js";
|
||||
|
||||
@@ -1,98 +1 @@
|
||||
import type { MediaUnderstandingOutput } from "./types.js";
|
||||
|
||||
/** Matches a body that is nothing but a `<media:...>` placeholder, optionally followed by a parenthesized note. */
const MEDIA_PLACEHOLDER_RE = /^<media:[^>]+>(\s*\([^)]*\))?$/i;
/** Matches the same placeholder (plus trailing whitespace) at the start of a longer body. */
const MEDIA_PLACEHOLDER_TOKEN_RE = /^<media:[^>]+>(\s*\([^)]*\))?\s*/i;

/**
 * Returns the user-written part of a message body.
 *
 * A leading `<media:...>` placeholder (with its optional parenthesized note)
 * is stripped. The result is `undefined` when the body is missing, blank, or
 * consists only of the placeholder.
 */
export function extractMediaUserText(body?: string): string | undefined {
  const text = (body ?? "").trim();
  if (text === "" || MEDIA_PLACEHOLDER_RE.test(text)) {
    return undefined;
  }
  const remainder = text.replace(MEDIA_PLACEHOLDER_TOKEN_RE, "").trim();
  return remainder === "" ? undefined : remainder;
}
|
||||
|
||||
/**
 * Builds one bracket-titled section: "[title]", then an optional
 * "User text:" part, then "<kind>:" followed by the text, joined by newlines.
 *
 * @param userText - Included only when non-empty.
 */
function formatSection(
  title: string,
  kind: "Transcript" | "Description",
  text: string,
  userText?: string,
): string {
  const header = `[${title}]`;
  const content = `${kind}:\n${text}`;
  const parts = userText ? [header, `User text:\n${userText}`, content] : [header, content];
  return parts.join("\n");
}
|
||||
|
||||
/**
 * Combines the original message body with media understanding outputs.
 *
 * Outputs with blank text are dropped; if none remain, the body is returned
 * unchanged (or "" when absent). Otherwise each output becomes a titled
 * section ("Audio", "Image", or "Video" for any other kind), with an
 * " i/n" suffix when several outputs share a kind. The user's own text is
 * embedded in the section when there is exactly one output, and emitted as a
 * leading "User text:" section when there are several.
 */
export function formatMediaUnderstandingBody(params: {
  body?: string;
  outputs: MediaUnderstandingOutput[];
}): string {
  const usable = params.outputs.filter((entry) => entry.text.trim() !== "");
  if (usable.length === 0) {
    return params.body ?? "";
  }

  const userText = extractMediaUserText(params.body);
  const isSingle = usable.length === 1;
  const sections: string[] = [];
  if (userText && usable.length > 1) {
    sections.push(`User text:\n${userText}`);
  }

  // First pass: how many outputs exist per kind, so suffixes can show "i/n".
  const totals = new Map<MediaUnderstandingOutput["kind"], number>();
  for (const entry of usable) {
    totals.set(entry.kind, (totals.get(entry.kind) ?? 0) + 1);
  }

  // Second pass: emit sections in input order with a running per-kind ordinal.
  const ordinals = new Map<MediaUnderstandingOutput["kind"], number>();
  for (const entry of usable) {
    const total = totals.get(entry.kind) ?? 1;
    const ordinal = (ordinals.get(entry.kind) ?? 0) + 1;
    ordinals.set(entry.kind, ordinal);
    const suffix = total > 1 ? ` ${ordinal}/${total}` : "";

    let label: string;
    let heading: "Transcript" | "Description";
    switch (entry.kind) {
      case "audio.transcription":
        label = "Audio";
        heading = "Transcript";
        break;
      case "image.description":
        label = "Image";
        heading = "Description";
        break;
      default:
        label = "Video";
        heading = "Description";
        break;
    }

    sections.push(
      formatSection(`${label}${suffix}`, heading, entry.text, isSingle ? userText : undefined),
    );
  }

  return sections.join("\n\n").trim();
}
|
||||
|
||||
/**
 * Renders audio transcription outputs as a single string.
 *
 * Exactly one output is returned as its bare text. Any other count produces
 * 1-based "Audio <n>:" sections separated by a blank line (an empty list
 * therefore yields an empty string).
 */
export function formatAudioTranscripts(outputs: MediaUnderstandingOutput[]): string {
  if (outputs.length !== 1) {
    const sections: string[] = [];
    outputs.forEach((entry, position) => {
      sections.push(`Audio ${position + 1}:\n${entry.text}`);
    });
    return sections.join("\n\n");
  }
  const [only] = outputs;
  return only.text;
}
|
||||
export * from "../../packages/media-understanding-common/src/format.js";
|
||||
|
||||
@@ -1,66 +1 @@
|
||||
import { normalizeOptionalString } from "../shared/string-coerce.js";
|
||||
import { normalizeTrimmedStringList } from "../shared/string-normalization.js";
|
||||
|
||||
/**
 * Minimal shape of an OpenAI-compatible chat-completions response, limited to
 * the fields that are read when pulling description text out of it.
 */
export type OpenAiCompatibleVideoPayload = {
  choices?: {
    message?: {
      /** Either a plain string or a list of parts that may carry `text`. */
      content?: string | { text?: string }[];
      /** Optional reasoning text returned alongside (or instead of) content. */
      reasoning_content?: string;
    };
  }[];
};
|
||||
|
||||
/**
 * Returns the result of `normalizeOptionalString(value)` when it is truthy,
 * otherwise `fallback`.
 *
 * @param value - Candidate string; may be undefined.
 * @param fallback - Returned when normalization yields nothing usable.
 */
export function resolveMediaUnderstandingString(
  value: string | undefined,
  fallback: string,
): string {
  const normalized = normalizeOptionalString(value);
  return normalized ? normalized : fallback;
}
|
||||
|
||||
/**
 * Extracts the description text from the first choice of an
 * OpenAI-compatible response.
 *
 * Sources are tried in order and skipped when empty: `message.content` as a
 * trimmed string; `message.content` as a parts array whose `text` values are
 * passed through `normalizeTrimmedStringList` and joined with newlines; and
 * finally the trimmed `message.reasoning_content`.
 *
 * @returns The text, or `null` when no source yields any.
 */
export function coerceOpenAiCompatibleVideoText(
  payload: OpenAiCompatibleVideoPayload,
): string | null {
  const message = payload.choices?.[0]?.message;
  if (!message) {
    return null;
  }

  const { content, reasoning_content: reasoning } = message;

  if (typeof content === "string") {
    const direct = content.trim();
    if (direct) {
      return direct;
    }
  }

  if (Array.isArray(content)) {
    const partTexts = content.map((part) => part.text);
    const joined = normalizeTrimmedStringList(partTexts).join("\n");
    if (joined) {
      return joined;
    }
  }

  if (typeof reasoning === "string") {
    const fallbackText = reasoning.trim();
    if (fallbackText) {
      return fallbackText;
    }
  }

  return null;
}
|
||||
|
||||
/**
 * Builds a chat-completions style request body containing a single user
 * message with two parts: the text prompt and the video inlined as a
 * base64 `data:` URL.
 *
 * @param params.model - Model identifier copied into the body.
 * @param params.prompt - Text placed in the `text` part.
 * @param params.mime - MIME type used in the data URL prefix.
 * @param params.buffer - Raw bytes that are base64-encoded into the URL.
 */
export function buildOpenAiCompatibleVideoRequestBody(params: {
  model: string;
  prompt: string;
  mime: string;
  buffer: Buffer;
}) {
  const { model, prompt, mime, buffer } = params;
  const encoded = buffer.toString("base64");
  const dataUrl = `data:${mime};base64,${encoded}`;
  return {
    model,
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: prompt },
          { type: "video_url", video_url: { url: dataUrl } },
        ],
      },
    ],
  };
}
|
||||
export * from "../../packages/media-understanding-common/src/openai-compatible-video.js";
|
||||
|
||||
@@ -1,26 +1 @@
|
||||
/**
 * Parses the last JSON object that runs to the end of `raw`.
 *
 * Candidate start positions are tried from the last `{` backwards until a
 * slice parses. Starting only at the last `{` is not enough: when the object
 * contains a nested object, or a `{` inside a string value, that brace sits
 * inside the object we want and the slice from it is not valid JSON.
 *
 * @param raw - Text that may have non-JSON content before the object.
 * @returns The parsed object, or `null` when no suffix of `raw` is one.
 */
function extractLastJsonObject(raw: string): unknown {
  const trimmed = raw.trim();
  // Any slice starting at "{" can only parse as an object, which ends in "}".
  if (!trimmed.endsWith("}")) {
    return null;
  }
  let start = trimmed.lastIndexOf("{");
  while (start !== -1) {
    try {
      return JSON.parse(trimmed.slice(start));
    } catch {
      // This brace is nested or inside a string; try the previous one.
    }
    // lastIndexOf clamps a negative fromIndex to 0, so stop explicitly at 0.
    start = start === 0 ? -1 : trimmed.lastIndexOf("{", start - 1);
  }
  return null;
}

/**
 * Pulls the `response` string out of JSON text that ends with an object.
 *
 * @param raw - Raw output; content before the trailing JSON object is ignored.
 * @returns The trimmed `response` value, or `null` when there is no trailing
 *   JSON object, `response` is not a string, or it is blank.
 */
export function extractGeminiResponse(raw: string): string | null {
  const payload = extractLastJsonObject(raw);
  if (!payload || typeof payload !== "object") {
    return null;
  }
  const response = (payload as { response?: unknown }).response;
  if (typeof response !== "string") {
    return null;
  }
  const trimmed = response.trim();
  return trimmed || null;
}
|
||||
export * from "../../packages/media-understanding-common/src/output-extract.js";
|
||||
|
||||
@@ -1,23 +1 @@
|
||||
import { normalizeProviderId } from "../agents/provider-id.js";
|
||||
|
||||
/**
 * Normalizes a provider id via `normalizeProviderId` and folds known aliases:
 * "gemini" becomes "google", "minimax-cn" becomes "minimax", and
 * "minimax-portal-cn" becomes "minimax-portal". Other ids are returned as
 * normalized.
 */
export function normalizeMediaProviderId(id: string): string {
  const key = normalizeProviderId(id);
  switch (key) {
    case "gemini":
      return "google";
    case "minimax-cn":
      return "minimax";
    case "minimax-portal-cn":
      return "minimax-portal";
    default:
      return key;
  }
}
|
||||
|
||||
/**
 * Like `normalizeMediaProviderId`, except that the normalized ids
 * "minimax-cn" and "minimax-portal-cn" are kept as-is instead of being
 * folded.
 */
export function normalizeMediaExecutionProviderId(id: string): string {
  const key = normalizeProviderId(id);
  const keepsRegionalId = key === "minimax-cn" || key === "minimax-portal-cn";
  return keepsRegionalId ? key : normalizeMediaProviderId(key);
}
|
||||
export * from "../../packages/media-understanding-common/src/provider-id.js";
|
||||
|
||||
@@ -1,17 +1 @@
|
||||
import type { MediaUnderstandingCapability, MediaUnderstandingProvider } from "./types.js";
|
||||
|
||||
/**
 * Reports whether a provider exposes the handler for a capability.
 *
 * "audio" checks `transcribeAudio`, "image" checks `describeImage`, and any
 * other capability value checks `describeVideo`. A missing provider never
 * supports anything.
 */
export function providerSupportsCapability(
  provider: MediaUnderstandingProvider | undefined,
  capability: MediaUnderstandingCapability,
): boolean {
  if (!provider) {
    return false;
  }
  switch (capability) {
    case "audio":
      return !!provider.transcribeAudio;
    case "image":
      return !!provider.describeImage;
    default:
      return !!provider.describeVideo;
  }
}
|
||||
export * from "../../packages/media-understanding-common/src/provider-supports.js";
|
||||
|
||||
@@ -1,10 +1 @@
|
||||
import { DEFAULT_VIDEO_MAX_BASE64_BYTES } from "./defaults.constants.js";
|
||||
|
||||
/**
 * Estimates the base64 length for `bytes` input bytes: four output
 * characters for every started group of three input bytes.
 */
export function estimateBase64Size(bytes: number): number {
  const groupsOfThree = Math.ceil(bytes / 3);
  return groupsOfThree * 4;
}
|
||||
|
||||
/**
 * Converts a raw byte limit into a base64 byte limit by scaling it by 4/3
 * (rounded down) and capping the result at DEFAULT_VIDEO_MAX_BASE64_BYTES.
 *
 * NOTE(review): estimateBase64Size in this file rounds up to whole
 * 4-character groups, while this rounds the 4/3 scaling down, so the two can
 * differ by a few bytes at the boundary. Confirm that is intended.
 */
export function resolveVideoMaxBase64Bytes(maxBytes: number): number {
  const scaled = maxBytes * (4 / 3);
  const wholeBytes = Math.floor(scaled);
  return Math.min(DEFAULT_VIDEO_MAX_BASE64_BYTES, wholeBytes);
}
|
||||
export * from "../../packages/media-understanding-common/src/video.js";
|
||||
|
||||
@@ -65,6 +65,12 @@
|
||||
"./packages/media-generation-core/src/normalization.ts"
|
||||
],
|
||||
"@openclaw/media-generation-core/*": ["./packages/media-generation-core/src/*"],
|
||||
"@openclaw/media-understanding-common": [
|
||||
"./packages/media-understanding-common/src/index.ts"
|
||||
],
|
||||
"@openclaw/media-understanding-common/*": [
|
||||
"./packages/media-understanding-common/src/*"
|
||||
],
|
||||
"@openclaw/markdown-core": ["./packages/markdown-core/src/index.ts"],
|
||||
"@openclaw/markdown-core/code-spans": ["./packages/markdown-core/src/code-spans.ts"],
|
||||
"@openclaw/markdown-core/fences": ["./packages/markdown-core/src/fences.ts"],
|
||||
|
||||
@@ -394,6 +394,22 @@ function buildMediaGenerationCoreDistEntries(): Record<string, string> {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Returns the entry map for the media-understanding-common package build:
 * one entry per source module, keyed by the module's base name and pointing
 * at its `.ts` file under the package's `src` directory.
 */
function buildMediaUnderstandingCoreDistEntries(): Record<string, string> {
  const sourceRoot = "packages/media-understanding-common/src";
  // Listed in the order the entries should appear in the resulting object.
  const moduleNames = [
    "index",
    "active-model",
    "defaults",
    "errors",
    "format",
    "openai-compatible-video",
    "output-extract",
    "provider-id",
    "provider-supports",
    "types",
    "video",
  ];
  const entries: Record<string, string> = {};
  for (const moduleName of moduleNames) {
    entries[moduleName] = `${sourceRoot}/${moduleName}.ts`;
  }
  return entries;
}
|
||||
|
||||
function buildMarkdownCoreDistEntries(): Record<string, string> {
|
||||
return {
|
||||
index: "packages/markdown-core/src/index.ts",
|
||||
@@ -592,6 +608,12 @@ export default defineConfig([
|
||||
entry: buildMediaGenerationCoreDistEntries(),
|
||||
outDir: "packages/media-generation-core/dist",
|
||||
}),
|
||||
nodeWorkspacePackageBuildConfig({
|
||||
clean: true,
|
||||
dts: RUN_NODE_SKIP_DTS_BUILD ? false : undefined,
|
||||
entry: buildMediaUnderstandingCoreDistEntries(),
|
||||
outDir: "packages/media-understanding-common/dist",
|
||||
}),
|
||||
nodeWorkspacePackageBuildConfig({
|
||||
clean: true,
|
||||
dts: RUN_NODE_SKIP_DTS_BUILD ? false : undefined,
|
||||
|
||||
Reference in New Issue
Block a user