diff --git a/extensions/moonshot/media-understanding-provider.ts b/extensions/moonshot/media-understanding-provider.ts
index 5579c55e31e..ea6422099a5 100644
--- a/extensions/moonshot/media-understanding-provider.ts
+++ b/extensions/moonshot/media-understanding-provider.ts
@@ -1,7 +1,11 @@
 import {
+  buildOpenAiCompatibleVideoRequestBody,
+  coerceOpenAiCompatibleVideoText,
   describeImageWithModel,
   describeImagesWithModel,
+  resolveMediaUnderstandingString,
   type MediaUnderstandingProvider,
+  type OpenAiCompatibleVideoPayload,
   type VideoDescriptionRequest,
   type VideoDescriptionResult,
 } from "openclaw/plugin-sdk/media-understanding";
@@ -15,56 +19,13 @@ export const DEFAULT_MOONSHOT_VIDEO_BASE_URL = "https://api.moonshot.ai/v1";
 const DEFAULT_MOONSHOT_VIDEO_MODEL = "kimi-k2.5";
 const DEFAULT_MOONSHOT_VIDEO_PROMPT = "Describe the video.";
 
-type MoonshotVideoPayload = {
-  choices?: Array<{
-    message?: {
-      content?: string | Array<{ text?: string }>;
-      reasoning_content?: string;
-    };
-  }>;
-};
-
-function resolveModel(model?: string): string {
-  const trimmed = model?.trim();
-  return trimmed || DEFAULT_MOONSHOT_VIDEO_MODEL;
-}
-
-function resolvePrompt(prompt?: string): string {
-  const trimmed = prompt?.trim();
-  return trimmed || DEFAULT_MOONSHOT_VIDEO_PROMPT;
-}
-
-function coerceMoonshotText(payload: MoonshotVideoPayload): string | null {
-  const message = payload.choices?.[0]?.message;
-  if (!message) {
-    return null;
-  }
-  if (typeof message.content === "string" && message.content.trim()) {
-    return message.content.trim();
-  }
-  if (Array.isArray(message.content)) {
-    const text = message.content
-      .map((part) => (typeof part.text === "string" ? part.text.trim() : ""))
-      .filter(Boolean)
-      .join("\n")
-      .trim();
-    if (text) {
-      return text;
-    }
-  }
-  if (typeof message.reasoning_content === "string" && message.reasoning_content.trim()) {
-    return message.reasoning_content.trim();
-  }
-  return null;
-}
-
 export async function describeMoonshotVideo(
   params: VideoDescriptionRequest,
 ): Promise<VideoDescriptionResult> {
   const fetchFn = params.fetchFn ?? fetch;
-  const model = resolveModel(params.model);
-  const mime = params.mime ?? "video/mp4";
"video/mp4"; - const prompt = resolvePrompt(params.prompt); + const model = resolveMediaUnderstandingString(params.model, DEFAULT_MOONSHOT_VIDEO_MODEL); + const mime = resolveMediaUnderstandingString(params.mime, "video/mp4"); + const prompt = resolveMediaUnderstandingString(params.prompt, DEFAULT_MOONSHOT_VIDEO_PROMPT); const { baseUrl, allowPrivateNetwork, headers, dispatcherPolicy } = resolveProviderHttpRequestConfig({ baseUrl: params.baseUrl, @@ -82,23 +43,12 @@ export async function describeMoonshotVideo( }); const url = `${baseUrl}/chat/completions`; - const body = { + const body = buildOpenAiCompatibleVideoRequestBody({ model, - messages: [ - { - role: "user", - content: [ - { type: "text", text: prompt }, - { - type: "video_url", - video_url: { - url: `data:${mime};base64,${params.buffer.toString("base64")}`, - }, - }, - ], - }, - ], - }; + prompt, + mime, + buffer: params.buffer, + }); const { response: res, release } = await postJsonRequest({ url, @@ -112,8 +62,8 @@ export async function describeMoonshotVideo( try { await assertOkOrThrowHttpError(res, "Moonshot video description failed"); - const payload = (await res.json()) as MoonshotVideoPayload; - const text = coerceMoonshotText(payload); + const payload = (await res.json()) as OpenAiCompatibleVideoPayload; + const text = coerceOpenAiCompatibleVideoText(payload); if (!text) { throw new Error("Moonshot video description response missing content"); } diff --git a/extensions/qwen/media-understanding-provider.ts b/extensions/qwen/media-understanding-provider.ts index 308ae72959d..582975f9783 100644 --- a/extensions/qwen/media-understanding-provider.ts +++ b/extensions/qwen/media-understanding-provider.ts @@ -1,7 +1,11 @@ import { + buildOpenAiCompatibleVideoRequestBody, + coerceOpenAiCompatibleVideoText, describeImageWithModel, describeImagesWithModel, + resolveMediaUnderstandingString, type MediaUnderstandingProvider, + type OpenAiCompatibleVideoPayload, type VideoDescriptionRequest, type VideoDescriptionResult, } from "openclaw/plugin-sdk/media-understanding"; @@ -15,15 +19,6 @@ import { QWEN_STANDARD_CN_BASE_URL, QWEN_STANDARD_GLOBAL_BASE_URL } from "./mode const DEFAULT_QWEN_VIDEO_MODEL = "qwen-vl-max-latest"; const DEFAULT_QWEN_VIDEO_PROMPT = "Describe the video in detail."; -type QwenVideoPayload = { - choices?: Array<{ - message?: { - content?: string | Array<{ text?: string }>; - reasoning_content?: string; - }; - }>; -}; - function resolveQwenStandardBaseUrl( cfg: { models?: { providers?: Record } } | undefined, providerId: string, @@ -46,37 +41,13 @@ function resolveQwenStandardBaseUrl( } } -function coerceQwenText(payload: QwenVideoPayload): string | null { - const message = payload.choices?.[0]?.message; - if (!message) { - return null; - } - if (typeof message.content === "string" && message.content.trim()) { - return message.content.trim(); - } - if (Array.isArray(message.content)) { - const text = message.content - .map((part) => (typeof part.text === "string" ? part.text.trim() : "")) - .filter(Boolean) - .join("\n") - .trim(); - if (text) { - return text; - } - } - if (typeof message.reasoning_content === "string" && message.reasoning_content.trim()) { - return message.reasoning_content.trim(); - } - return null; -} - export async function describeQwenVideo( params: VideoDescriptionRequest, ): Promise { const fetchFn = params.fetchFn ?? 
-  const model = params.model?.trim() || DEFAULT_QWEN_VIDEO_MODEL;
-  const mime = params.mime?.trim() || "video/mp4";
-  const prompt = params.prompt?.trim() || DEFAULT_QWEN_VIDEO_PROMPT;
+  const model = resolveMediaUnderstandingString(params.model, DEFAULT_QWEN_VIDEO_MODEL);
+  const mime = resolveMediaUnderstandingString(params.mime, "video/mp4");
+  const prompt = resolveMediaUnderstandingString(params.prompt, DEFAULT_QWEN_VIDEO_PROMPT);
   const { baseUrl, allowPrivateNetwork, headers, dispatcherPolicy } =
     resolveProviderHttpRequestConfig({
       baseUrl: params.baseUrl,
@@ -96,23 +67,12 @@ export async function describeQwenVideo(
   const { response: res, release } = await postJsonRequest({
     url: `${baseUrl}/chat/completions`,
     headers,
-    body: {
+    body: buildOpenAiCompatibleVideoRequestBody({
       model,
-      messages: [
-        {
-          role: "user",
-          content: [
-            { type: "text", text: prompt },
-            {
-              type: "video_url",
-              video_url: {
-                url: `data:${mime};base64,${params.buffer.toString("base64")}`,
-              },
-            },
-          ],
-        },
-      ],
-    },
+      prompt,
+      mime,
+      buffer: params.buffer,
+    }),
     timeoutMs: params.timeoutMs,
     fetchFn,
     allowPrivateNetwork,
@@ -121,8 +81,8 @@ export async function describeQwenVideo(
 
   try {
     await assertOkOrThrowHttpError(res, "Qwen video description failed");
-    const payload = (await res.json()) as QwenVideoPayload;
-    const text = coerceQwenText(payload);
+    const payload = (await res.json()) as OpenAiCompatibleVideoPayload;
+    const text = coerceOpenAiCompatibleVideoText(payload);
     if (!text) {
       throw new Error("Qwen video description response missing content");
     }
diff --git a/src/media-understanding/openai-compatible-video.ts b/src/media-understanding/openai-compatible-video.ts
new file mode 100644
index 00000000000..80439dc3df1
--- /dev/null
+++ b/src/media-understanding/openai-compatible-video.ts
@@ -0,0 +1,67 @@
+export type OpenAiCompatibleVideoPayload = {
+  choices?: Array<{
+    message?: {
+      content?: string | Array<{ text?: string }>;
+      reasoning_content?: string;
+    };
+  }>;
+};
+
+export function resolveMediaUnderstandingString(
+  value: string | undefined,
+  fallback: string,
+): string {
+  const trimmed = value?.trim();
+  return trimmed || fallback;
+}
+
+export function coerceOpenAiCompatibleVideoText(
+  payload: OpenAiCompatibleVideoPayload,
+): string | null {
+  const message = payload.choices?.[0]?.message;
+  if (!message) {
+    return null;
+  }
+  if (typeof message.content === "string" && message.content.trim()) {
+    return message.content.trim();
+  }
+  if (Array.isArray(message.content)) {
+    const text = message.content
+      .map((part) => (typeof part.text === "string" ? part.text.trim() : ""))
part.text.trim() : "")) + .filter(Boolean) + .join("\n") + .trim(); + if (text) { + return text; + } + } + if (typeof message.reasoning_content === "string" && message.reasoning_content.trim()) { + return message.reasoning_content.trim(); + } + return null; +} + +export function buildOpenAiCompatibleVideoRequestBody(params: { + model: string; + prompt: string; + mime: string; + buffer: Buffer; +}) { + return { + model: params.model, + messages: [ + { + role: "user", + content: [ + { type: "text", text: params.prompt }, + { + type: "video_url", + video_url: { + url: `data:${params.mime};base64,${params.buffer.toString("base64")}`, + }, + }, + ], + }, + ], + }; +} diff --git a/src/plugin-sdk/media-understanding.ts b/src/plugin-sdk/media-understanding.ts index cd6401ad675..2fc4399b77d 100644 --- a/src/plugin-sdk/media-understanding.ts +++ b/src/plugin-sdk/media-understanding.ts @@ -17,4 +17,10 @@ export { describeImageWithModel, describeImagesWithModel, } from "../media-understanding/image-runtime.js"; +export { + buildOpenAiCompatibleVideoRequestBody, + coerceOpenAiCompatibleVideoText, + resolveMediaUnderstandingString, + type OpenAiCompatibleVideoPayload, +} from "../media-understanding/openai-compatible-video.ts"; export { transcribeOpenAiCompatibleAudio } from "../media-understanding/openai-compatible-audio.js";