diff --git a/packages/media-understanding-common/dist/active-model.d.mts b/packages/media-understanding-common/dist/active-model.d.mts new file mode 100644 index 00000000000..83dd5e0b21e --- /dev/null +++ b/packages/media-understanding-common/dist/active-model.d.mts @@ -0,0 +1,7 @@ +//#region packages/media-understanding-common/src/active-model.d.ts +type ActiveMediaModel = { + provider: string; + model?: string; +}; +//#endregion +export { ActiveMediaModel }; \ No newline at end of file diff --git a/packages/media-understanding-common/dist/active-model.mjs b/packages/media-understanding-common/dist/active-model.mjs new file mode 100644 index 00000000000..cb0ff5c3b54 --- /dev/null +++ b/packages/media-understanding-common/dist/active-model.mjs @@ -0,0 +1 @@ +export {}; diff --git a/packages/media-understanding-common/dist/defaults.d.mts b/packages/media-understanding-common/dist/defaults.d.mts new file mode 100644 index 00000000000..03602bf4550 --- /dev/null +++ b/packages/media-understanding-common/dist/defaults.d.mts @@ -0,0 +1,14 @@ +import { MediaUnderstandingCapability } from "./types.mjs"; + +//#region packages/media-understanding-common/src/defaults.d.ts +declare const DEFAULT_MAX_CHARS = 500; +declare const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record; +declare const DEFAULT_MAX_BYTES: Record; +declare const DEFAULT_TIMEOUT_SECONDS: Record; +declare const DEFAULT_PROMPT: Record; +declare const DEFAULT_VIDEO_MAX_BASE64_BYTES: number; +declare const CLI_OUTPUT_MAX_BUFFER: number; +declare const DEFAULT_MEDIA_CONCURRENCY = 2; +declare const MIN_AUDIO_FILE_BYTES = 1024; +//#endregion +export { CLI_OUTPUT_MAX_BUFFER, DEFAULT_MAX_BYTES, DEFAULT_MAX_CHARS, DEFAULT_MAX_CHARS_BY_CAPABILITY, DEFAULT_MEDIA_CONCURRENCY, DEFAULT_PROMPT, DEFAULT_TIMEOUT_SECONDS, DEFAULT_VIDEO_MAX_BASE64_BYTES, MIN_AUDIO_FILE_BYTES }; \ No newline at end of file diff --git a/packages/media-understanding-common/dist/defaults.mjs b/packages/media-understanding-common/dist/defaults.mjs new file mode 100644 index 00000000000..56fa98d454c --- /dev/null +++ b/packages/media-understanding-common/dist/defaults.mjs @@ -0,0 +1,29 @@ +//#region packages/media-understanding-common/src/defaults.ts +const MB = 1024 * 1024; +const DEFAULT_MAX_CHARS = 500; +const DEFAULT_MAX_CHARS_BY_CAPABILITY = { + image: 500, + audio: void 0, + video: 500 +}; +const DEFAULT_MAX_BYTES = { + image: 10 * MB, + audio: 20 * MB, + video: 50 * MB +}; +const DEFAULT_TIMEOUT_SECONDS = { + image: 60, + audio: 60, + video: 120 +}; +const DEFAULT_PROMPT = { + image: "Describe the image.", + audio: "Transcribe the audio.", + video: "Describe the video." +}; +const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB; +const CLI_OUTPUT_MAX_BUFFER = 5 * MB; +const DEFAULT_MEDIA_CONCURRENCY = 2; +const MIN_AUDIO_FILE_BYTES = 1024; +//#endregion +export { CLI_OUTPUT_MAX_BUFFER, DEFAULT_MAX_BYTES, DEFAULT_MAX_CHARS, DEFAULT_MAX_CHARS_BY_CAPABILITY, DEFAULT_MEDIA_CONCURRENCY, DEFAULT_PROMPT, DEFAULT_TIMEOUT_SECONDS, DEFAULT_VIDEO_MAX_BASE64_BYTES, MIN_AUDIO_FILE_BYTES }; diff --git a/packages/media-understanding-common/dist/errors.d.mts b/packages/media-understanding-common/dist/errors.d.mts new file mode 100644 index 00000000000..26c3a8b8c29 --- /dev/null +++ b/packages/media-understanding-common/dist/errors.d.mts @@ -0,0 +1,9 @@ +//#region packages/media-understanding-common/src/errors.d.ts +type MediaUnderstandingSkipReason = "maxBytes" | "timeout" | "unsupported" | "empty" | "blocked" | "tooSmall"; +declare class MediaUnderstandingSkipError extends Error { + readonly reason: MediaUnderstandingSkipReason; + constructor(reason: MediaUnderstandingSkipReason, message: string); +} +declare function isMediaUnderstandingSkipError(err: unknown): err is MediaUnderstandingSkipError; +//#endregion +export { MediaUnderstandingSkipError, isMediaUnderstandingSkipError }; \ No newline at end of file diff --git a/packages/media-understanding-common/dist/errors.mjs b/packages/media-understanding-common/dist/errors.mjs new file mode 100644 index 00000000000..f13112c4999 --- /dev/null +++ b/packages/media-understanding-common/dist/errors.mjs @@ -0,0 +1,13 @@ +//#region packages/media-understanding-common/src/errors.ts +var MediaUnderstandingSkipError = class extends Error { + constructor(reason, message) { + super(message); + this.reason = reason; + this.name = "MediaUnderstandingSkipError"; + } +}; +function isMediaUnderstandingSkipError(err) { + return err instanceof MediaUnderstandingSkipError; +} +//#endregion +export { MediaUnderstandingSkipError, isMediaUnderstandingSkipError }; diff --git a/packages/media-understanding-common/dist/format.d.mts b/packages/media-understanding-common/dist/format.d.mts new file mode 100644 index 00000000000..114c0dd1b92 --- /dev/null +++ b/packages/media-understanding-common/dist/format.d.mts @@ -0,0 +1,11 @@ +import { MediaUnderstandingOutput } from "./types.mjs"; + +//#region packages/media-understanding-common/src/format.d.ts +declare function extractMediaUserText(body?: string): string | undefined; +declare function formatMediaUnderstandingBody(params: { + body?: string; + outputs: MediaUnderstandingOutput[]; +}): string; +declare function formatAudioTranscripts(outputs: MediaUnderstandingOutput[]): string; +//#endregion +export { extractMediaUserText, formatAudioTranscripts, formatMediaUnderstandingBody }; \ No newline at end of file diff --git a/packages/media-understanding-common/dist/format.mjs b/packages/media-understanding-common/dist/format.mjs new file mode 100644 index 00000000000..ef67515aaf7 --- /dev/null +++ b/packages/media-understanding-common/dist/format.mjs @@ -0,0 +1,47 @@ +//#region packages/media-understanding-common/src/format.ts +const MEDIA_PLACEHOLDER_RE = /^]+>(\s*\([^)]*\))?$/i; +const MEDIA_PLACEHOLDER_TOKEN_RE = /^]+>(\s*\([^)]*\))?\s*/i; +function extractMediaUserText(body) { + const trimmed = body?.trim() ?? ""; + if (!trimmed) return; + if (MEDIA_PLACEHOLDER_RE.test(trimmed)) return; + return trimmed.replace(MEDIA_PLACEHOLDER_TOKEN_RE, "").trim() || void 0; +} +function formatSection(title, kind, text, userText) { + const lines = [`[${title}]`]; + if (userText) lines.push(`User text:\n${userText}`); + lines.push(`${kind}:\n${text}`); + return lines.join("\n"); +} +function formatMediaUnderstandingBody(params) { + const outputs = params.outputs.filter((output) => output.text.trim()); + if (outputs.length === 0) return params.body ?? ""; + const userText = extractMediaUserText(params.body); + const sections = []; + if (userText && outputs.length > 1) sections.push(`User text:\n${userText}`); + const counts = /* @__PURE__ */ new Map(); + for (const output of outputs) counts.set(output.kind, (counts.get(output.kind) ?? 0) + 1); + const seen = /* @__PURE__ */ new Map(); + for (const output of outputs) { + const count = counts.get(output.kind) ?? 1; + const next = (seen.get(output.kind) ?? 0) + 1; + seen.set(output.kind, next); + const suffix = count > 1 ? ` ${next}/${count}` : ""; + if (output.kind === "audio.transcription") { + sections.push(formatSection(`Audio${suffix}`, "Transcript", output.text, outputs.length === 1 ? userText : void 0)); + continue; + } + if (output.kind === "image.description") { + sections.push(formatSection(`Image${suffix}`, "Description", output.text, outputs.length === 1 ? userText : void 0)); + continue; + } + sections.push(formatSection(`Video${suffix}`, "Description", output.text, outputs.length === 1 ? userText : void 0)); + } + return sections.join("\n\n").trim(); +} +function formatAudioTranscripts(outputs) { + if (outputs.length === 1) return outputs[0].text; + return outputs.map((output, index) => `Audio ${index + 1}:\n${output.text}`).join("\n\n"); +} +//#endregion +export { extractMediaUserText, formatAudioTranscripts, formatMediaUnderstandingBody }; diff --git a/packages/media-understanding-common/dist/index.d.mts b/packages/media-understanding-common/dist/index.d.mts new file mode 100644 index 00000000000..5a68bc9cf4f --- /dev/null +++ b/packages/media-understanding-common/dist/index.d.mts @@ -0,0 +1,11 @@ +import { ActiveMediaModel } from "./active-model.mjs"; +import { MediaAttachment, MediaUnderstandingCapability, MediaUnderstandingCapabilityRegistry, MediaUnderstandingKind, MediaUnderstandingOutput, MediaUnderstandingProvider } from "./types.mjs"; +import { CLI_OUTPUT_MAX_BUFFER, DEFAULT_MAX_BYTES, DEFAULT_MAX_CHARS, DEFAULT_MAX_CHARS_BY_CAPABILITY, DEFAULT_MEDIA_CONCURRENCY, DEFAULT_PROMPT, DEFAULT_TIMEOUT_SECONDS, DEFAULT_VIDEO_MAX_BASE64_BYTES, MIN_AUDIO_FILE_BYTES } from "./defaults.mjs"; +import { MediaUnderstandingSkipError, isMediaUnderstandingSkipError } from "./errors.mjs"; +import { extractMediaUserText, formatAudioTranscripts, formatMediaUnderstandingBody } from "./format.mjs"; +import { OpenAiCompatibleVideoPayload, buildOpenAiCompatibleVideoRequestBody, coerceOpenAiCompatibleVideoText, resolveMediaUnderstandingString } from "./openai-compatible-video.mjs"; +import { extractGeminiResponse } from "./output-extract.mjs"; +import { normalizeMediaExecutionProviderId, normalizeMediaProviderId } from "./provider-id.mjs"; +import { providerSupportsCapability } from "./provider-supports.mjs"; +import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.mjs"; +export { ActiveMediaModel, CLI_OUTPUT_MAX_BUFFER, DEFAULT_MAX_BYTES, DEFAULT_MAX_CHARS, DEFAULT_MAX_CHARS_BY_CAPABILITY, DEFAULT_MEDIA_CONCURRENCY, DEFAULT_PROMPT, DEFAULT_TIMEOUT_SECONDS, DEFAULT_VIDEO_MAX_BASE64_BYTES, MIN_AUDIO_FILE_BYTES, MediaAttachment, MediaUnderstandingCapability, MediaUnderstandingCapabilityRegistry, MediaUnderstandingKind, MediaUnderstandingOutput, MediaUnderstandingProvider, MediaUnderstandingSkipError, OpenAiCompatibleVideoPayload, buildOpenAiCompatibleVideoRequestBody, coerceOpenAiCompatibleVideoText, estimateBase64Size, extractGeminiResponse, extractMediaUserText, formatAudioTranscripts, formatMediaUnderstandingBody, isMediaUnderstandingSkipError, normalizeMediaExecutionProviderId, normalizeMediaProviderId, providerSupportsCapability, resolveMediaUnderstandingString, resolveVideoMaxBase64Bytes }; \ No newline at end of file diff --git a/packages/media-understanding-common/dist/index.mjs b/packages/media-understanding-common/dist/index.mjs new file mode 100644 index 00000000000..f6961af4c75 --- /dev/null +++ b/packages/media-understanding-common/dist/index.mjs @@ -0,0 +1,11 @@ +import "./active-model.mjs"; +import { CLI_OUTPUT_MAX_BUFFER, DEFAULT_MAX_BYTES, DEFAULT_MAX_CHARS, DEFAULT_MAX_CHARS_BY_CAPABILITY, DEFAULT_MEDIA_CONCURRENCY, DEFAULT_PROMPT, DEFAULT_TIMEOUT_SECONDS, DEFAULT_VIDEO_MAX_BASE64_BYTES, MIN_AUDIO_FILE_BYTES } from "./defaults.mjs"; +import { MediaUnderstandingSkipError, isMediaUnderstandingSkipError } from "./errors.mjs"; +import { extractMediaUserText, formatAudioTranscripts, formatMediaUnderstandingBody } from "./format.mjs"; +import { buildOpenAiCompatibleVideoRequestBody, coerceOpenAiCompatibleVideoText, resolveMediaUnderstandingString } from "./openai-compatible-video.mjs"; +import { extractGeminiResponse } from "./output-extract.mjs"; +import { normalizeMediaExecutionProviderId, normalizeMediaProviderId } from "./provider-id.mjs"; +import { providerSupportsCapability } from "./provider-supports.mjs"; +import "./types.mjs"; +import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.mjs"; +export { CLI_OUTPUT_MAX_BUFFER, DEFAULT_MAX_BYTES, DEFAULT_MAX_CHARS, DEFAULT_MAX_CHARS_BY_CAPABILITY, DEFAULT_MEDIA_CONCURRENCY, DEFAULT_PROMPT, DEFAULT_TIMEOUT_SECONDS, DEFAULT_VIDEO_MAX_BASE64_BYTES, MIN_AUDIO_FILE_BYTES, MediaUnderstandingSkipError, buildOpenAiCompatibleVideoRequestBody, coerceOpenAiCompatibleVideoText, estimateBase64Size, extractGeminiResponse, extractMediaUserText, formatAudioTranscripts, formatMediaUnderstandingBody, isMediaUnderstandingSkipError, normalizeMediaExecutionProviderId, normalizeMediaProviderId, providerSupportsCapability, resolveMediaUnderstandingString, resolveVideoMaxBase64Bytes }; diff --git a/packages/media-understanding-common/dist/openai-compatible-video.d.mts b/packages/media-understanding-common/dist/openai-compatible-video.d.mts new file mode 100644 index 00000000000..a3278c4c9eb --- /dev/null +++ b/packages/media-understanding-common/dist/openai-compatible-video.d.mts @@ -0,0 +1,37 @@ +//#region packages/media-understanding-common/src/openai-compatible-video.d.ts +type OpenAiCompatibleVideoPayload = { + choices?: Array<{ + message?: { + content?: string | Array<{ + text?: string; + }>; + reasoning_content?: string; + }; + }>; +}; +declare function resolveMediaUnderstandingString(value: string | undefined, fallback: string): string; +declare function coerceOpenAiCompatibleVideoText(payload: OpenAiCompatibleVideoPayload): string | null; +declare function buildOpenAiCompatibleVideoRequestBody(params: { + model: string; + prompt: string; + mime: string; + buffer: Buffer; +}): { + model: string; + messages: { + role: string; + content: ({ + type: string; + text: string; + video_url?: undefined; + } | { + type: string; + video_url: { + url: string; + }; + text?: undefined; + })[]; + }[]; +}; +//#endregion +export { OpenAiCompatibleVideoPayload, buildOpenAiCompatibleVideoRequestBody, coerceOpenAiCompatibleVideoText, resolveMediaUnderstandingString }; \ No newline at end of file diff --git a/packages/media-understanding-common/dist/openai-compatible-video.mjs b/packages/media-understanding-common/dist/openai-compatible-video.mjs new file mode 100644 index 00000000000..da37e9eb639 --- /dev/null +++ b/packages/media-understanding-common/dist/openai-compatible-video.mjs @@ -0,0 +1,32 @@ +//#region packages/media-understanding-common/src/openai-compatible-video.ts +function resolveMediaUnderstandingString(value, fallback) { + return value?.trim() || fallback; +} +function coerceOpenAiCompatibleVideoText(payload) { + const message = payload.choices?.[0]?.message; + if (!message) return null; + if (typeof message.content === "string" && message.content.trim()) return message.content.trim(); + if (Array.isArray(message.content)) { + const text = message.content.map((part) => part.text?.trim() ?? "").filter(Boolean).join("\n"); + if (text) return text; + } + if (typeof message.reasoning_content === "string" && message.reasoning_content.trim()) return message.reasoning_content.trim(); + return null; +} +function buildOpenAiCompatibleVideoRequestBody(params) { + return { + model: params.model, + messages: [{ + role: "user", + content: [{ + type: "text", + text: params.prompt + }, { + type: "video_url", + video_url: { url: `data:${params.mime};base64,${params.buffer.toString("base64")}` } + }] + }] + }; +} +//#endregion +export { buildOpenAiCompatibleVideoRequestBody, coerceOpenAiCompatibleVideoText, resolveMediaUnderstandingString }; diff --git a/packages/media-understanding-common/dist/output-extract.d.mts b/packages/media-understanding-common/dist/output-extract.d.mts new file mode 100644 index 00000000000..9f8bd190608 --- /dev/null +++ b/packages/media-understanding-common/dist/output-extract.d.mts @@ -0,0 +1,4 @@ +//#region packages/media-understanding-common/src/output-extract.d.ts +declare function extractGeminiResponse(raw: string): string | null; +//#endregion +export { extractGeminiResponse }; \ No newline at end of file diff --git a/packages/media-understanding-common/dist/output-extract.mjs b/packages/media-understanding-common/dist/output-extract.mjs new file mode 100644 index 00000000000..9d33aea8a67 --- /dev/null +++ b/packages/media-understanding-common/dist/output-extract.mjs @@ -0,0 +1,21 @@ +//#region packages/media-understanding-common/src/output-extract.ts +function extractLastJsonObject(raw) { + const trimmed = raw.trim(); + const start = trimmed.lastIndexOf("{"); + if (start === -1) return null; + const slice = trimmed.slice(start); + try { + return JSON.parse(slice); + } catch { + return null; + } +} +function extractGeminiResponse(raw) { + const payload = extractLastJsonObject(raw); + if (!payload || typeof payload !== "object") return null; + const response = payload.response; + if (typeof response !== "string") return null; + return response.trim() || null; +} +//#endregion +export { extractGeminiResponse }; diff --git a/packages/media-understanding-common/dist/provider-id.d.mts b/packages/media-understanding-common/dist/provider-id.d.mts new file mode 100644 index 00000000000..e82bd138202 --- /dev/null +++ b/packages/media-understanding-common/dist/provider-id.d.mts @@ -0,0 +1,5 @@ +//#region packages/media-understanding-common/src/provider-id.d.ts +declare function normalizeMediaProviderId(id: string): string; +declare function normalizeMediaExecutionProviderId(id: string): string; +//#endregion +export { normalizeMediaExecutionProviderId, normalizeMediaProviderId }; \ No newline at end of file diff --git a/packages/media-understanding-common/dist/provider-id.mjs b/packages/media-understanding-common/dist/provider-id.mjs new file mode 100644 index 00000000000..3e0e2a5482d --- /dev/null +++ b/packages/media-understanding-common/dist/provider-id.mjs @@ -0,0 +1,18 @@ +//#region packages/media-understanding-common/src/provider-id.ts +function normalizeProviderId(provider) { + return provider.trim().toLowerCase(); +} +function normalizeMediaProviderId(id) { + const normalized = normalizeProviderId(id); + if (normalized === "gemini") return "google"; + if (normalized === "minimax-cn") return "minimax"; + if (normalized === "minimax-portal-cn") return "minimax-portal"; + return normalized; +} +function normalizeMediaExecutionProviderId(id) { + const normalized = normalizeProviderId(id); + if (normalized === "minimax-cn" || normalized === "minimax-portal-cn") return normalized; + return normalizeMediaProviderId(normalized); +} +//#endregion +export { normalizeMediaExecutionProviderId, normalizeMediaProviderId }; diff --git a/packages/media-understanding-common/dist/provider-supports.d.mts b/packages/media-understanding-common/dist/provider-supports.d.mts new file mode 100644 index 00000000000..e6ae55184c2 --- /dev/null +++ b/packages/media-understanding-common/dist/provider-supports.d.mts @@ -0,0 +1,6 @@ +import { MediaUnderstandingCapability, MediaUnderstandingProvider } from "./types.mjs"; + +//#region packages/media-understanding-common/src/provider-supports.d.ts +declare function providerSupportsCapability(provider: MediaUnderstandingProvider | undefined, capability: MediaUnderstandingCapability): boolean; +//#endregion +export { providerSupportsCapability }; \ No newline at end of file diff --git a/packages/media-understanding-common/dist/provider-supports.mjs b/packages/media-understanding-common/dist/provider-supports.mjs new file mode 100644 index 00000000000..7fcb3f44c6c --- /dev/null +++ b/packages/media-understanding-common/dist/provider-supports.mjs @@ -0,0 +1,9 @@ +//#region packages/media-understanding-common/src/provider-supports.ts +function providerSupportsCapability(provider, capability) { + if (!provider) return false; + if (capability === "audio") return Boolean(provider.transcribeAudio); + if (capability === "image") return Boolean(provider.describeImage); + return Boolean(provider.describeVideo); +} +//#endregion +export { providerSupportsCapability }; diff --git a/packages/media-understanding-common/dist/types.d.mts b/packages/media-understanding-common/dist/types.d.mts new file mode 100644 index 00000000000..d1c49628113 --- /dev/null +++ b/packages/media-understanding-common/dist/types.d.mts @@ -0,0 +1,31 @@ +//#region packages/media-understanding-common/src/types.d.ts +type MediaUnderstandingKind = "audio.transcription" | "video.description" | "image.description"; +type MediaUnderstandingCapability = "image" | "audio" | "video"; +type MediaUnderstandingCapabilityRegistry = Map; +type MediaAttachment = { + path?: string; + url?: string; + mime?: string; + index: number; + alreadyTranscribed?: boolean; +}; +type MediaUnderstandingOutput = { + kind: MediaUnderstandingKind; + attachmentIndex: number; + text: string; + provider: string; + model?: string; +}; +type MediaUnderstandingProvider = { + id: string; + capabilities?: MediaUnderstandingCapability[]; + transcribeAudio?: unknown; + describeVideo?: unknown; + describeImage?: unknown; + describeImages?: unknown; + extractStructured?: unknown; +}; +//#endregion +export { MediaAttachment, MediaUnderstandingCapability, MediaUnderstandingCapabilityRegistry, MediaUnderstandingKind, MediaUnderstandingOutput, MediaUnderstandingProvider }; \ No newline at end of file diff --git a/packages/media-understanding-common/dist/types.mjs b/packages/media-understanding-common/dist/types.mjs new file mode 100644 index 00000000000..cb0ff5c3b54 --- /dev/null +++ b/packages/media-understanding-common/dist/types.mjs @@ -0,0 +1 @@ +export {}; diff --git a/packages/media-understanding-common/dist/video.d.mts b/packages/media-understanding-common/dist/video.d.mts new file mode 100644 index 00000000000..ffce1b7034d --- /dev/null +++ b/packages/media-understanding-common/dist/video.d.mts @@ -0,0 +1,5 @@ +//#region packages/media-understanding-common/src/video.d.ts +declare function estimateBase64Size(bytes: number): number; +declare function resolveVideoMaxBase64Bytes(maxBytes: number): number; +//#endregion +export { estimateBase64Size, resolveVideoMaxBase64Bytes }; \ No newline at end of file diff --git a/packages/media-understanding-common/dist/video.mjs b/packages/media-understanding-common/dist/video.mjs new file mode 100644 index 00000000000..07867712901 --- /dev/null +++ b/packages/media-understanding-common/dist/video.mjs @@ -0,0 +1,11 @@ +import { DEFAULT_VIDEO_MAX_BASE64_BYTES } from "./defaults.mjs"; +//#region packages/media-understanding-common/src/video.ts +function estimateBase64Size(bytes) { + return Math.ceil(bytes / 3) * 4; +} +function resolveVideoMaxBase64Bytes(maxBytes) { + const expanded = Math.floor(maxBytes * (4 / 3)); + return Math.min(expanded, DEFAULT_VIDEO_MAX_BASE64_BYTES); +} +//#endregion +export { estimateBase64Size, resolveVideoMaxBase64Bytes }; diff --git a/packages/media-understanding-common/package.json b/packages/media-understanding-common/package.json new file mode 100644 index 00000000000..4b2b6e5039f --- /dev/null +++ b/packages/media-understanding-common/package.json @@ -0,0 +1,71 @@ +{ + "name": "@openclaw/media-understanding-common", + "version": "0.0.0-private", + "private": true, + "files": [ + "dist" + ], + "type": "module", + "main": "./dist/index.mjs", + "types": "./dist/index.d.mts", + "exports": { + ".": { + "types": "./dist/index.d.mts", + "import": "./dist/index.mjs", + "default": "./dist/index.mjs" + }, + "./active-model": { + "types": "./dist/active-model.d.mts", + "import": "./dist/active-model.mjs", + "default": "./dist/active-model.mjs" + }, + "./defaults": { + "types": "./dist/defaults.d.mts", + "import": "./dist/defaults.mjs", + "default": "./dist/defaults.mjs" + }, + "./errors": { + "types": "./dist/errors.d.mts", + "import": "./dist/errors.mjs", + "default": "./dist/errors.mjs" + }, + "./format": { + "types": "./dist/format.d.mts", + "import": "./dist/format.mjs", + "default": "./dist/format.mjs" + }, + "./openai-compatible-video": { + "types": "./dist/openai-compatible-video.d.mts", + "import": "./dist/openai-compatible-video.mjs", + "default": "./dist/openai-compatible-video.mjs" + }, + "./output-extract": { + "types": "./dist/output-extract.d.mts", + "import": "./dist/output-extract.mjs", + "default": "./dist/output-extract.mjs" + }, + "./provider-id": { + "types": "./dist/provider-id.d.mts", + "import": "./dist/provider-id.mjs", + "default": "./dist/provider-id.mjs" + }, + "./provider-supports": { + "types": "./dist/provider-supports.d.mts", + "import": "./dist/provider-supports.mjs", + "default": "./dist/provider-supports.mjs" + }, + "./types": { + "types": "./dist/types.d.mts", + "import": "./dist/types.mjs", + "default": "./dist/types.mjs" + }, + "./video": { + "types": "./dist/video.d.mts", + "import": "./dist/video.mjs", + "default": "./dist/video.mjs" + } + }, + "scripts": { + "build": "tsdown src/index.ts src/active-model.ts src/defaults.ts src/errors.ts src/format.ts src/openai-compatible-video.ts src/output-extract.ts src/provider-id.ts src/provider-supports.ts src/types.ts src/video.ts --no-config --platform node --format esm --dts --out-dir dist --clean" + } +} diff --git a/packages/media-understanding-common/src/active-model.ts b/packages/media-understanding-common/src/active-model.ts new file mode 100644 index 00000000000..2782c4702a1 --- /dev/null +++ b/packages/media-understanding-common/src/active-model.ts @@ -0,0 +1,4 @@ +export type ActiveMediaModel = { + provider: string; + model?: string; +}; diff --git a/packages/media-understanding-common/src/defaults.ts b/packages/media-understanding-common/src/defaults.ts new file mode 100644 index 00000000000..2dc0c119d98 --- /dev/null +++ b/packages/media-understanding-common/src/defaults.ts @@ -0,0 +1,32 @@ +import type { MediaUnderstandingCapability } from "./types.js"; + +const MB = 1024 * 1024; + +export const DEFAULT_MAX_CHARS = 500; +export const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record< + MediaUnderstandingCapability, + number | undefined +> = { + image: DEFAULT_MAX_CHARS, + audio: undefined, + video: DEFAULT_MAX_CHARS, +}; +export const DEFAULT_MAX_BYTES: Record = { + image: 10 * MB, + audio: 20 * MB, + video: 50 * MB, +}; +export const DEFAULT_TIMEOUT_SECONDS: Record = { + image: 60, + audio: 60, + video: 120, +}; +export const DEFAULT_PROMPT: Record = { + image: "Describe the image.", + audio: "Transcribe the audio.", + video: "Describe the video.", +}; +export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB; +export const CLI_OUTPUT_MAX_BUFFER = 5 * MB; +export const DEFAULT_MEDIA_CONCURRENCY = 2; +export const MIN_AUDIO_FILE_BYTES = 1024; diff --git a/packages/media-understanding-common/src/errors.ts b/packages/media-understanding-common/src/errors.ts new file mode 100644 index 00000000000..e6475f722c7 --- /dev/null +++ b/packages/media-understanding-common/src/errors.ts @@ -0,0 +1,21 @@ +type MediaUnderstandingSkipReason = + | "maxBytes" + | "timeout" + | "unsupported" + | "empty" + | "blocked" + | "tooSmall"; + +export class MediaUnderstandingSkipError extends Error { + readonly reason: MediaUnderstandingSkipReason; + + constructor(reason: MediaUnderstandingSkipReason, message: string) { + super(message); + this.reason = reason; + this.name = "MediaUnderstandingSkipError"; + } +} + +export function isMediaUnderstandingSkipError(err: unknown): err is MediaUnderstandingSkipError { + return err instanceof MediaUnderstandingSkipError; +} diff --git a/src/media-understanding/format.test.ts b/packages/media-understanding-common/src/format.test.ts similarity index 100% rename from src/media-understanding/format.test.ts rename to packages/media-understanding-common/src/format.test.ts diff --git a/packages/media-understanding-common/src/format.ts b/packages/media-understanding-common/src/format.ts new file mode 100644 index 00000000000..b0542d1651c --- /dev/null +++ b/packages/media-understanding-common/src/format.ts @@ -0,0 +1,98 @@ +import type { MediaUnderstandingOutput } from "./types.js"; + +const MEDIA_PLACEHOLDER_RE = /^]+>(\s*\([^)]*\))?$/i; +const MEDIA_PLACEHOLDER_TOKEN_RE = /^]+>(\s*\([^)]*\))?\s*/i; + +export function extractMediaUserText(body?: string): string | undefined { + const trimmed = body?.trim() ?? ""; + if (!trimmed) { + return undefined; + } + if (MEDIA_PLACEHOLDER_RE.test(trimmed)) { + return undefined; + } + const cleaned = trimmed.replace(MEDIA_PLACEHOLDER_TOKEN_RE, "").trim(); + return cleaned || undefined; +} + +function formatSection( + title: string, + kind: "Transcript" | "Description", + text: string, + userText?: string, +): string { + const lines = [`[${title}]`]; + if (userText) { + lines.push(`User text:\n${userText}`); + } + lines.push(`${kind}:\n${text}`); + return lines.join("\n"); +} + +export function formatMediaUnderstandingBody(params: { + body?: string; + outputs: MediaUnderstandingOutput[]; +}): string { + const outputs = params.outputs.filter((output) => output.text.trim()); + if (outputs.length === 0) { + return params.body ?? ""; + } + + const userText = extractMediaUserText(params.body); + const sections: string[] = []; + if (userText && outputs.length > 1) { + sections.push(`User text:\n${userText}`); + } + + const counts = new Map(); + for (const output of outputs) { + counts.set(output.kind, (counts.get(output.kind) ?? 0) + 1); + } + const seen = new Map(); + + for (const output of outputs) { + const count = counts.get(output.kind) ?? 1; + const next = (seen.get(output.kind) ?? 0) + 1; + seen.set(output.kind, next); + const suffix = count > 1 ? ` ${next}/${count}` : ""; + if (output.kind === "audio.transcription") { + sections.push( + formatSection( + `Audio${suffix}`, + "Transcript", + output.text, + outputs.length === 1 ? userText : undefined, + ), + ); + continue; + } + if (output.kind === "image.description") { + sections.push( + formatSection( + `Image${suffix}`, + "Description", + output.text, + outputs.length === 1 ? userText : undefined, + ), + ); + continue; + } + sections.push( + formatSection( + `Video${suffix}`, + "Description", + output.text, + outputs.length === 1 ? userText : undefined, + ), + ); + } + + return sections.join("\n\n").trim(); +} + +export function formatAudioTranscripts(outputs: MediaUnderstandingOutput[]): string { + if (outputs.length === 1) { + return outputs[0].text; + } + return outputs.map((output, index) => `Audio ${index + 1}:\n${output.text}`).join("\n\n"); +} diff --git a/packages/media-understanding-common/src/index.ts b/packages/media-understanding-common/src/index.ts new file mode 100644 index 00000000000..a709601cac9 --- /dev/null +++ b/packages/media-understanding-common/src/index.ts @@ -0,0 +1,10 @@ +export * from "./active-model.js"; +export * from "./defaults.js"; +export * from "./errors.js"; +export * from "./format.js"; +export * from "./openai-compatible-video.js"; +export * from "./output-extract.js"; +export * from "./provider-id.js"; +export * from "./provider-supports.js"; +export * from "./types.js"; +export * from "./video.js"; diff --git a/packages/media-understanding-common/src/openai-compatible-video.ts b/packages/media-understanding-common/src/openai-compatible-video.ts new file mode 100644 index 00000000000..d1a43ff36a6 --- /dev/null +++ b/packages/media-understanding-common/src/openai-compatible-video.ts @@ -0,0 +1,66 @@ +export type OpenAiCompatibleVideoPayload = { + choices?: Array<{ + message?: { + content?: string | Array<{ text?: string }>; + reasoning_content?: string; + }; + }>; +}; + +export function resolveMediaUnderstandingString( + value: string | undefined, + fallback: string, +): string { + const trimmed = value?.trim(); + return trimmed || fallback; +} + +export function coerceOpenAiCompatibleVideoText( + payload: OpenAiCompatibleVideoPayload, +): string | null { + const message = payload.choices?.[0]?.message; + if (!message) { + return null; + } + if (typeof message.content === "string" && message.content.trim()) { + return message.content.trim(); + } + if (Array.isArray(message.content)) { + const text = message.content + .map((part) => part.text?.trim() ?? "") + .filter(Boolean) + .join("\n"); + if (text) { + return text; + } + } + if (typeof message.reasoning_content === "string" && message.reasoning_content.trim()) { + return message.reasoning_content.trim(); + } + return null; +} + +export function buildOpenAiCompatibleVideoRequestBody(params: { + model: string; + prompt: string; + mime: string; + buffer: Buffer; +}) { + return { + model: params.model, + messages: [ + { + role: "user", + content: [ + { type: "text", text: params.prompt }, + { + type: "video_url", + video_url: { + url: `data:${params.mime};base64,${params.buffer.toString("base64")}`, + }, + }, + ], + }, + ], + }; +} diff --git a/packages/media-understanding-common/src/output-extract.ts b/packages/media-understanding-common/src/output-extract.ts new file mode 100644 index 00000000000..a0f0f391379 --- /dev/null +++ b/packages/media-understanding-common/src/output-extract.ts @@ -0,0 +1,26 @@ +function extractLastJsonObject(raw: string): unknown { + const trimmed = raw.trim(); + const start = trimmed.lastIndexOf("{"); + if (start === -1) { + return null; + } + const slice = trimmed.slice(start); + try { + return JSON.parse(slice); + } catch { + return null; + } +} + +export function extractGeminiResponse(raw: string): string | null { + const payload = extractLastJsonObject(raw); + if (!payload || typeof payload !== "object") { + return null; + } + const response = (payload as { response?: unknown }).response; + if (typeof response !== "string") { + return null; + } + const trimmed = response.trim(); + return trimmed || null; +} diff --git a/packages/media-understanding-common/src/provider-id.ts b/packages/media-understanding-common/src/provider-id.ts new file mode 100644 index 00000000000..a985603f0d7 --- /dev/null +++ b/packages/media-understanding-common/src/provider-id.ts @@ -0,0 +1,25 @@ +function normalizeProviderId(provider: string): string { + return provider.trim().toLowerCase(); +} + +export function normalizeMediaProviderId(id: string): string { + const normalized = normalizeProviderId(id); + if (normalized === "gemini") { + return "google"; + } + if (normalized === "minimax-cn") { + return "minimax"; + } + if (normalized === "minimax-portal-cn") { + return "minimax-portal"; + } + return normalized; +} + +export function normalizeMediaExecutionProviderId(id: string): string { + const normalized = normalizeProviderId(id); + if (normalized === "minimax-cn" || normalized === "minimax-portal-cn") { + return normalized; + } + return normalizeMediaProviderId(normalized); +} diff --git a/packages/media-understanding-common/src/provider-supports.ts b/packages/media-understanding-common/src/provider-supports.ts new file mode 100644 index 00000000000..60fcb1bd777 --- /dev/null +++ b/packages/media-understanding-common/src/provider-supports.ts @@ -0,0 +1,17 @@ +import type { MediaUnderstandingCapability, MediaUnderstandingProvider } from "./types.js"; + +export function providerSupportsCapability( + provider: MediaUnderstandingProvider | undefined, + capability: MediaUnderstandingCapability, +): boolean { + if (!provider) { + return false; + } + if (capability === "audio") { + return Boolean(provider.transcribeAudio); + } + if (capability === "image") { + return Boolean(provider.describeImage); + } + return Boolean(provider.describeVideo); +} diff --git a/packages/media-understanding-common/src/types.ts b/packages/media-understanding-common/src/types.ts new file mode 100644 index 00000000000..b5782d17f10 --- /dev/null +++ b/packages/media-understanding-common/src/types.ts @@ -0,0 +1,39 @@ +export type MediaUnderstandingKind = + | "audio.transcription" + | "video.description" + | "image.description"; + +export type MediaUnderstandingCapability = "image" | "audio" | "video"; + +export type MediaUnderstandingCapabilityRegistry = Map< + string, + { + capabilities?: MediaUnderstandingCapability[]; + } +>; + +export type MediaAttachment = { + path?: string; + url?: string; + mime?: string; + index: number; + alreadyTranscribed?: boolean; +}; + +export type MediaUnderstandingOutput = { + kind: MediaUnderstandingKind; + attachmentIndex: number; + text: string; + provider: string; + model?: string; +}; + +export type MediaUnderstandingProvider = { + id: string; + capabilities?: MediaUnderstandingCapability[]; + transcribeAudio?: unknown; + describeVideo?: unknown; + describeImage?: unknown; + describeImages?: unknown; + extractStructured?: unknown; +}; diff --git a/packages/media-understanding-common/src/video.ts b/packages/media-understanding-common/src/video.ts new file mode 100644 index 00000000000..00773f40ca7 --- /dev/null +++ b/packages/media-understanding-common/src/video.ts @@ -0,0 +1,10 @@ +import { DEFAULT_VIDEO_MAX_BASE64_BYTES } from "./defaults.js"; + +export function estimateBase64Size(bytes: number): number { + return Math.ceil(bytes / 3) * 4; +} + +export function resolveVideoMaxBase64Bytes(maxBytes: number): number { + const expanded = Math.floor(maxBytes * (4 / 3)); + return Math.min(expanded, DEFAULT_VIDEO_MAX_BASE64_BYTES); +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index cefa58a761b..4e5d92deb38 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1832,6 +1832,8 @@ importers: packages/media-generation-core: {} + packages/media-understanding-common: {} + packages/memory-host-sdk: {} packages/net-policy: diff --git a/scripts/build-all.mjs b/scripts/build-all.mjs index c542961f40a..8a2b657e8ef 100644 --- a/scripts/build-all.mjs +++ b/scripts/build-all.mjs @@ -48,6 +48,7 @@ export const BUILD_ALL_STEPS = [ "packages/plugin-sdk/package.json", "packages/llm-core/package.json", "packages/markdown-core/package.json", + "packages/media-understanding-common/package.json", "packages/terminal-core/package.json", "packages/memory-host-sdk/package.json", "tsconfig.json", @@ -57,6 +58,7 @@ export const BUILD_ALL_STEPS = [ "packages/markdown-core/src", "packages/memory-host-sdk/src", "packages/media-generation-core/src", + "packages/media-understanding-common/src", "packages/terminal-core/src", "src/types", "src/video-generation/dashscope-compatible.ts", diff --git a/scripts/prepare-extension-package-boundary-artifacts.mjs b/scripts/prepare-extension-package-boundary-artifacts.mjs index 44101a7d825..dcb84553ac5 100644 --- a/scripts/prepare-extension-package-boundary-artifacts.mjs +++ b/scripts/prepare-extension-package-boundary-artifacts.mjs @@ -19,6 +19,7 @@ const PLUGIN_SDK_TYPE_INPUTS = [ "packages/markdown-core/src", "packages/memory-host-sdk/src", "packages/media-generation-core/src", + "packages/media-understanding-common/src", "packages/terminal-core/src", "src/video-generation/dashscope-compatible.ts", "src/video-generation/types.ts", diff --git a/scripts/run-node-watch-paths.mjs b/scripts/run-node-watch-paths.mjs index 9528634f963..ee0402381a3 100644 --- a/scripts/run-node-watch-paths.mjs +++ b/scripts/run-node-watch-paths.mjs @@ -12,6 +12,7 @@ const RUN_NODE_PACKAGE_SOURCE_ROOTS = [ "packages/gateway-protocol/src", "packages/markdown-core/src", "packages/media-generation-core/src", + "packages/media-understanding-common/src", "packages/terminal-core/src", "packages/net-policy/src", ]; diff --git a/src/media-understanding/active-model.types.ts b/src/media-understanding/active-model.types.ts index 2782c4702a1..474c089ac87 100644 --- a/src/media-understanding/active-model.types.ts +++ b/src/media-understanding/active-model.types.ts @@ -1,4 +1 @@ -export type ActiveMediaModel = { - provider: string; - model?: string; -}; +export * from "../../packages/media-understanding-common/src/active-model.js"; diff --git a/src/media-understanding/defaults.constants.ts b/src/media-understanding/defaults.constants.ts index 2dc0c119d98..c9195eed9d7 100644 --- a/src/media-understanding/defaults.constants.ts +++ b/src/media-understanding/defaults.constants.ts @@ -1,32 +1 @@ -import type { MediaUnderstandingCapability } from "./types.js"; - -const MB = 1024 * 1024; - -export const DEFAULT_MAX_CHARS = 500; -export const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record< - MediaUnderstandingCapability, - number | undefined -> = { - image: DEFAULT_MAX_CHARS, - audio: undefined, - video: DEFAULT_MAX_CHARS, -}; -export const DEFAULT_MAX_BYTES: Record = { - image: 10 * MB, - audio: 20 * MB, - video: 50 * MB, -}; -export const DEFAULT_TIMEOUT_SECONDS: Record = { - image: 60, - audio: 60, - video: 120, -}; -export const DEFAULT_PROMPT: Record = { - image: "Describe the image.", - audio: "Transcribe the audio.", - video: "Describe the video.", -}; -export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB; -export const CLI_OUTPUT_MAX_BUFFER = 5 * MB; -export const DEFAULT_MEDIA_CONCURRENCY = 2; -export const MIN_AUDIO_FILE_BYTES = 1024; +export * from "../../packages/media-understanding-common/src/defaults.js"; diff --git a/src/media-understanding/errors.ts b/src/media-understanding/errors.ts index e6475f722c7..3dd7a39cd58 100644 --- a/src/media-understanding/errors.ts +++ b/src/media-understanding/errors.ts @@ -1,21 +1 @@ -type MediaUnderstandingSkipReason = - | "maxBytes" - | "timeout" - | "unsupported" - | "empty" - | "blocked" - | "tooSmall"; - -export class MediaUnderstandingSkipError extends Error { - readonly reason: MediaUnderstandingSkipReason; - - constructor(reason: MediaUnderstandingSkipReason, message: string) { - super(message); - this.reason = reason; - this.name = "MediaUnderstandingSkipError"; - } -} - -export function isMediaUnderstandingSkipError(err: unknown): err is MediaUnderstandingSkipError { - return err instanceof MediaUnderstandingSkipError; -} +export * from "../../packages/media-understanding-common/src/errors.js"; diff --git a/src/media-understanding/format.ts b/src/media-understanding/format.ts index b0542d1651c..07b9eef2b17 100644 --- a/src/media-understanding/format.ts +++ b/src/media-understanding/format.ts @@ -1,98 +1 @@ -import type { MediaUnderstandingOutput } from "./types.js"; - -const MEDIA_PLACEHOLDER_RE = /^]+>(\s*\([^)]*\))?$/i; -const MEDIA_PLACEHOLDER_TOKEN_RE = /^]+>(\s*\([^)]*\))?\s*/i; - -export function extractMediaUserText(body?: string): string | undefined { - const trimmed = body?.trim() ?? ""; - if (!trimmed) { - return undefined; - } - if (MEDIA_PLACEHOLDER_RE.test(trimmed)) { - return undefined; - } - const cleaned = trimmed.replace(MEDIA_PLACEHOLDER_TOKEN_RE, "").trim(); - return cleaned || undefined; -} - -function formatSection( - title: string, - kind: "Transcript" | "Description", - text: string, - userText?: string, -): string { - const lines = [`[${title}]`]; - if (userText) { - lines.push(`User text:\n${userText}`); - } - lines.push(`${kind}:\n${text}`); - return lines.join("\n"); -} - -export function formatMediaUnderstandingBody(params: { - body?: string; - outputs: MediaUnderstandingOutput[]; -}): string { - const outputs = params.outputs.filter((output) => output.text.trim()); - if (outputs.length === 0) { - return params.body ?? ""; - } - - const userText = extractMediaUserText(params.body); - const sections: string[] = []; - if (userText && outputs.length > 1) { - sections.push(`User text:\n${userText}`); - } - - const counts = new Map(); - for (const output of outputs) { - counts.set(output.kind, (counts.get(output.kind) ?? 0) + 1); - } - const seen = new Map(); - - for (const output of outputs) { - const count = counts.get(output.kind) ?? 1; - const next = (seen.get(output.kind) ?? 0) + 1; - seen.set(output.kind, next); - const suffix = count > 1 ? ` ${next}/${count}` : ""; - if (output.kind === "audio.transcription") { - sections.push( - formatSection( - `Audio${suffix}`, - "Transcript", - output.text, - outputs.length === 1 ? userText : undefined, - ), - ); - continue; - } - if (output.kind === "image.description") { - sections.push( - formatSection( - `Image${suffix}`, - "Description", - output.text, - outputs.length === 1 ? userText : undefined, - ), - ); - continue; - } - sections.push( - formatSection( - `Video${suffix}`, - "Description", - output.text, - outputs.length === 1 ? userText : undefined, - ), - ); - } - - return sections.join("\n\n").trim(); -} - -export function formatAudioTranscripts(outputs: MediaUnderstandingOutput[]): string { - if (outputs.length === 1) { - return outputs[0].text; - } - return outputs.map((output, index) => `Audio ${index + 1}:\n${output.text}`).join("\n\n"); -} +export * from "../../packages/media-understanding-common/src/format.js"; diff --git a/src/media-understanding/openai-compatible-video.ts b/src/media-understanding/openai-compatible-video.ts index 56ec887c7de..392b02b7391 100644 --- a/src/media-understanding/openai-compatible-video.ts +++ b/src/media-understanding/openai-compatible-video.ts @@ -1,66 +1 @@ -import { normalizeOptionalString } from "../shared/string-coerce.js"; -import { normalizeTrimmedStringList } from "../shared/string-normalization.js"; - -export type OpenAiCompatibleVideoPayload = { - choices?: Array<{ - message?: { - content?: string | Array<{ text?: string }>; - reasoning_content?: string; - }; - }>; -}; - -export function resolveMediaUnderstandingString( - value: string | undefined, - fallback: string, -): string { - const trimmed = normalizeOptionalString(value); - return trimmed || fallback; -} - -export function coerceOpenAiCompatibleVideoText( - payload: OpenAiCompatibleVideoPayload, -): string | null { - const message = payload.choices?.[0]?.message; - if (!message) { - return null; - } - if (typeof message.content === "string" && message.content.trim()) { - return message.content.trim(); - } - if (Array.isArray(message.content)) { - const text = normalizeTrimmedStringList(message.content.map((part) => part.text)).join("\n"); - if (text) { - return text; - } - } - if (typeof message.reasoning_content === "string" && message.reasoning_content.trim()) { - return message.reasoning_content.trim(); - } - return null; -} - -export function buildOpenAiCompatibleVideoRequestBody(params: { - model: string; - prompt: string; - mime: string; - buffer: Buffer; -}) { - return { - model: params.model, - messages: [ - { - role: "user", - content: [ - { type: "text", text: params.prompt }, - { - type: "video_url", - video_url: { - url: `data:${params.mime};base64,${params.buffer.toString("base64")}`, - }, - }, - ], - }, - ], - }; -} +export * from "../../packages/media-understanding-common/src/openai-compatible-video.js"; diff --git a/src/media-understanding/output-extract.ts b/src/media-understanding/output-extract.ts index a0f0f391379..6ae5da38a65 100644 --- a/src/media-understanding/output-extract.ts +++ b/src/media-understanding/output-extract.ts @@ -1,26 +1 @@ -function extractLastJsonObject(raw: string): unknown { - const trimmed = raw.trim(); - const start = trimmed.lastIndexOf("{"); - if (start === -1) { - return null; - } - const slice = trimmed.slice(start); - try { - return JSON.parse(slice); - } catch { - return null; - } -} - -export function extractGeminiResponse(raw: string): string | null { - const payload = extractLastJsonObject(raw); - if (!payload || typeof payload !== "object") { - return null; - } - const response = (payload as { response?: unknown }).response; - if (typeof response !== "string") { - return null; - } - const trimmed = response.trim(); - return trimmed || null; -} +export * from "../../packages/media-understanding-common/src/output-extract.js"; diff --git a/src/media-understanding/provider-id.ts b/src/media-understanding/provider-id.ts index c48152f9bf6..9f40c5fe1b4 100644 --- a/src/media-understanding/provider-id.ts +++ b/src/media-understanding/provider-id.ts @@ -1,23 +1 @@ -import { normalizeProviderId } from "../agents/provider-id.js"; - -export function normalizeMediaProviderId(id: string): string { - const normalized = normalizeProviderId(id); - if (normalized === "gemini") { - return "google"; - } - if (normalized === "minimax-cn") { - return "minimax"; - } - if (normalized === "minimax-portal-cn") { - return "minimax-portal"; - } - return normalized; -} - -export function normalizeMediaExecutionProviderId(id: string): string { - const normalized = normalizeProviderId(id); - if (normalized === "minimax-cn" || normalized === "minimax-portal-cn") { - return normalized; - } - return normalizeMediaProviderId(normalized); -} +export * from "../../packages/media-understanding-common/src/provider-id.js"; diff --git a/src/media-understanding/provider-supports.ts b/src/media-understanding/provider-supports.ts index 60fcb1bd777..76806301beb 100644 --- a/src/media-understanding/provider-supports.ts +++ b/src/media-understanding/provider-supports.ts @@ -1,17 +1 @@ -import type { MediaUnderstandingCapability, MediaUnderstandingProvider } from "./types.js"; - -export function providerSupportsCapability( - provider: MediaUnderstandingProvider | undefined, - capability: MediaUnderstandingCapability, -): boolean { - if (!provider) { - return false; - } - if (capability === "audio") { - return Boolean(provider.transcribeAudio); - } - if (capability === "image") { - return Boolean(provider.describeImage); - } - return Boolean(provider.describeVideo); -} +export * from "../../packages/media-understanding-common/src/provider-supports.js"; diff --git a/src/media-understanding/video.ts b/src/media-understanding/video.ts index 827b635ba03..c074ec5648d 100644 --- a/src/media-understanding/video.ts +++ b/src/media-understanding/video.ts @@ -1,10 +1 @@ -import { DEFAULT_VIDEO_MAX_BASE64_BYTES } from "./defaults.constants.js"; - -export function estimateBase64Size(bytes: number): number { - return Math.ceil(bytes / 3) * 4; -} - -export function resolveVideoMaxBase64Bytes(maxBytes: number): number { - const expanded = Math.floor(maxBytes * (4 / 3)); - return Math.min(expanded, DEFAULT_VIDEO_MAX_BASE64_BYTES); -} +export * from "../../packages/media-understanding-common/src/video.js"; diff --git a/tsconfig.json b/tsconfig.json index ba0d5c255e8..0329f84ab18 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -65,6 +65,12 @@ "./packages/media-generation-core/src/normalization.ts" ], "@openclaw/media-generation-core/*": ["./packages/media-generation-core/src/*"], + "@openclaw/media-understanding-common": [ + "./packages/media-understanding-common/src/index.ts" + ], + "@openclaw/media-understanding-common/*": [ + "./packages/media-understanding-common/src/*" + ], "@openclaw/markdown-core": ["./packages/markdown-core/src/index.ts"], "@openclaw/markdown-core/code-spans": ["./packages/markdown-core/src/code-spans.ts"], "@openclaw/markdown-core/fences": ["./packages/markdown-core/src/fences.ts"], diff --git a/tsdown.config.ts b/tsdown.config.ts index ccd27f00369..7285ecc72a1 100644 --- a/tsdown.config.ts +++ b/tsdown.config.ts @@ -394,6 +394,22 @@ function buildMediaGenerationCoreDistEntries(): Record { }; } +function buildMediaUnderstandingCoreDistEntries(): Record { + return { + index: "packages/media-understanding-common/src/index.ts", + "active-model": "packages/media-understanding-common/src/active-model.ts", + defaults: "packages/media-understanding-common/src/defaults.ts", + errors: "packages/media-understanding-common/src/errors.ts", + format: "packages/media-understanding-common/src/format.ts", + "openai-compatible-video": "packages/media-understanding-common/src/openai-compatible-video.ts", + "output-extract": "packages/media-understanding-common/src/output-extract.ts", + "provider-id": "packages/media-understanding-common/src/provider-id.ts", + "provider-supports": "packages/media-understanding-common/src/provider-supports.ts", + types: "packages/media-understanding-common/src/types.ts", + video: "packages/media-understanding-common/src/video.ts", + }; +} + function buildMarkdownCoreDistEntries(): Record { return { index: "packages/markdown-core/src/index.ts", @@ -592,6 +608,12 @@ export default defineConfig([ entry: buildMediaGenerationCoreDistEntries(), outDir: "packages/media-generation-core/dist", }), + nodeWorkspacePackageBuildConfig({ + clean: true, + dts: RUN_NODE_SKIP_DTS_BUILD ? false : undefined, + entry: buildMediaUnderstandingCoreDistEntries(), + outDir: "packages/media-understanding-common/dist", + }), nodeWorkspacePackageBuildConfig({ clean: true, dts: RUN_NODE_SKIP_DTS_BUILD ? false : undefined,