import { describeImageWithModel, describeImagesWithModel, type AudioTranscriptionRequest, type AudioTranscriptionResult, type MediaUnderstandingProvider, type VideoDescriptionRequest, type VideoDescriptionResult, } from "openclaw/plugin-sdk/media-understanding"; import { assertOkOrThrowHttpError, postJsonRequest, type ProviderRequestTransportOverrides, } from "openclaw/plugin-sdk/provider-http"; import { DEFAULT_GOOGLE_API_BASE_URL, normalizeGoogleModelId, resolveGoogleGenerativeAiHttpRequestConfig, } from "./runtime-api.js"; export const DEFAULT_GOOGLE_AUDIO_BASE_URL = DEFAULT_GOOGLE_API_BASE_URL; export const DEFAULT_GOOGLE_VIDEO_BASE_URL = DEFAULT_GOOGLE_API_BASE_URL; const DEFAULT_GOOGLE_AUDIO_MODEL = "gemini-3-flash-preview"; const DEFAULT_GOOGLE_VIDEO_MODEL = "gemini-3-flash-preview"; const DEFAULT_GOOGLE_AUDIO_PROMPT = "Transcribe the audio."; const DEFAULT_GOOGLE_VIDEO_PROMPT = "Describe the video."; async function generateGeminiInlineDataText(params: { buffer: Buffer; mime?: string; apiKey: string; baseUrl?: string; headers?: Record; request?: ProviderRequestTransportOverrides; model?: string; prompt?: string; timeoutMs: number; fetchFn?: typeof fetch; defaultBaseUrl: string; defaultModel: string; defaultPrompt: string; defaultMime: string; httpErrorLabel: string; missingTextError: string; }): Promise<{ text: string; model: string }> { const fetchFn = params.fetchFn ?? fetch; const model = (() => { const trimmed = params.model?.trim(); if (!trimmed) { return params.defaultModel; } return normalizeGoogleModelId(trimmed); })(); const { baseUrl, allowPrivateNetwork, headers, dispatcherPolicy } = resolveGoogleGenerativeAiHttpRequestConfig({ apiKey: params.apiKey, baseUrl: params.baseUrl, headers: params.headers, request: params.request, capability: params.defaultMime.startsWith("audio/") ? "audio" : "video", transport: "media-understanding", }); const resolvedBaseUrl = baseUrl ?? params.defaultBaseUrl; const url = `${resolvedBaseUrl}/models/${model}:generateContent`; const prompt = (() => { const trimmed = params.prompt?.trim(); return trimmed || params.defaultPrompt; })(); const body = { contents: [ { role: "user", parts: [ { text: prompt }, { inline_data: { mime_type: params.mime ?? params.defaultMime, data: params.buffer.toString("base64"), }, }, ], }, ], }; const { response: res, release } = await postJsonRequest({ url, headers, body, timeoutMs: params.timeoutMs, fetchFn, allowPrivateNetwork, dispatcherPolicy, }); try { await assertOkOrThrowHttpError(res, params.httpErrorLabel); const payload = (await res.json()) as { candidates?: Array<{ content?: { parts?: Array<{ text?: string }> }; }>; }; const parts = payload.candidates?.[0]?.content?.parts ?? []; const text = parts .map((part) => part?.text?.trim()) .filter(Boolean) .join("\n"); if (!text) { throw new Error(params.missingTextError); } return { text, model }; } finally { await release(); } } export async function transcribeGeminiAudio( params: AudioTranscriptionRequest, ): Promise { const { text, model } = await generateGeminiInlineDataText({ ...params, defaultBaseUrl: DEFAULT_GOOGLE_AUDIO_BASE_URL, defaultModel: DEFAULT_GOOGLE_AUDIO_MODEL, defaultPrompt: DEFAULT_GOOGLE_AUDIO_PROMPT, defaultMime: "audio/wav", httpErrorLabel: "Audio transcription failed", missingTextError: "Audio transcription response missing text", }); return { text, model }; } export async function describeGeminiVideo( params: VideoDescriptionRequest, ): Promise { const { text, model } = await generateGeminiInlineDataText({ ...params, defaultBaseUrl: DEFAULT_GOOGLE_VIDEO_BASE_URL, defaultModel: DEFAULT_GOOGLE_VIDEO_MODEL, defaultPrompt: DEFAULT_GOOGLE_VIDEO_PROMPT, defaultMime: "video/mp4", httpErrorLabel: "Video description failed", missingTextError: "Video description response missing text", }); return { text, model }; } export const googleMediaUnderstandingProvider: MediaUnderstandingProvider = { id: "google", capabilities: ["image", "audio", "video"], defaultModels: { image: DEFAULT_GOOGLE_VIDEO_MODEL, audio: DEFAULT_GOOGLE_AUDIO_MODEL, video: DEFAULT_GOOGLE_VIDEO_MODEL, }, autoPriority: { image: 30, audio: 40, video: 10 }, nativeDocumentInputs: ["pdf"], describeImage: describeImageWithModel, describeImages: describeImagesWithModel, transcribeAudio: transcribeGeminiAudio, describeVideo: describeGeminiVideo, };