Files
openclaw/extensions/google/media-understanding-provider.ts
Vincent Koc b0f94a227b refactor(providers): normalize transport policy wiring (#59682)
* refactor(providers): normalize transport policy wiring

* fix(providers): address transport policy review

* fix(providers): harden transport overrides

* fix(providers): keep env proxy tls separate

* fix(changelog): note provider transport policy hardening
2026-04-02 22:54:34 +09:00

163 lines
4.8 KiB
TypeScript

import {
describeImageWithModel,
describeImagesWithModel,
type AudioTranscriptionRequest,
type AudioTranscriptionResult,
type MediaUnderstandingProvider,
type VideoDescriptionRequest,
type VideoDescriptionResult,
} from "openclaw/plugin-sdk/media-understanding";
import {
assertOkOrThrowHttpError,
postJsonRequest,
resolveProviderHttpRequestConfig,
type ProviderRequestTransportOverrides,
} from "openclaw/plugin-sdk/provider-http";
import {
DEFAULT_GOOGLE_API_BASE_URL,
normalizeGoogleApiBaseUrl,
normalizeGoogleModelId,
parseGeminiAuth,
} from "./runtime-api.js";
// Audio/video share the generative-language endpoint; separate aliases are
// exported so callers can override one capability's base URL independently.
export const DEFAULT_GOOGLE_AUDIO_BASE_URL = DEFAULT_GOOGLE_API_BASE_URL;
export const DEFAULT_GOOGLE_VIDEO_BASE_URL = DEFAULT_GOOGLE_API_BASE_URL;
// Fallback model/prompt values used when the request omits them.
const DEFAULT_GOOGLE_AUDIO_MODEL = "gemini-3-flash-preview";
const DEFAULT_GOOGLE_VIDEO_MODEL = "gemini-3-flash-preview";
const DEFAULT_GOOGLE_AUDIO_PROMPT = "Transcribe the audio.";
const DEFAULT_GOOGLE_VIDEO_PROMPT = "Describe the video.";
/**
 * Shared driver for Gemini `generateContent` calls that attach a single
 * inline media part (audio or video) alongside a text prompt.
 *
 * Resolves model/prompt/base-URL fallbacks, wires transport policy through
 * `resolveProviderHttpRequestConfig`, posts the request, and extracts the
 * concatenated text of the first candidate's parts.
 *
 * @returns The extracted text plus the model id actually used.
 * @throws An HTTP error (via `assertOkOrThrowHttpError`) labeled with
 *   `httpErrorLabel`, or an `Error` with `missingTextError` when the
 *   response carries no usable text.
 */
async function generateGeminiInlineDataText(params: {
  buffer: Buffer;
  mime?: string;
  apiKey: string;
  baseUrl?: string;
  headers?: Record<string, string>;
  request?: ProviderRequestTransportOverrides;
  model?: string;
  prompt?: string;
  timeoutMs: number;
  fetchFn?: typeof fetch;
  defaultBaseUrl: string;
  defaultModel: string;
  defaultPrompt: string;
  defaultMime: string;
  httpErrorLabel: string;
  missingTextError: string;
}): Promise<{ text: string; model: string }> {
  const doFetch = params.fetchFn ?? fetch;

  // Blank/whitespace-only model falls back to the capability default;
  // an explicit model is normalized (e.g. stripping a "models/" prefix).
  const requestedModel = params.model?.trim();
  const model = requestedModel
    ? normalizeGoogleModelId(requestedModel)
    : params.defaultModel;

  // Private-network access is only permitted when the caller supplied a
  // non-blank base URL of their own; env/TLS policy is resolved centrally.
  const { baseUrl, allowPrivateNetwork, headers, dispatcherPolicy } =
    resolveProviderHttpRequestConfig({
      baseUrl: normalizeGoogleApiBaseUrl(params.baseUrl ?? params.defaultBaseUrl),
      defaultBaseUrl: DEFAULT_GOOGLE_API_BASE_URL,
      allowPrivateNetwork: Boolean(params.baseUrl?.trim()),
      headers: params.headers,
      request: params.request,
      defaultHeaders: parseGeminiAuth(params.apiKey).headers,
      provider: "google",
      api: "google-generative-ai",
      // The default MIME distinguishes the audio call site from the video one.
      capability: params.defaultMime.startsWith("audio/") ? "audio" : "video",
      transport: "media-understanding",
    });

  const requestedPrompt = params.prompt?.trim();
  const prompt = requestedPrompt || params.defaultPrompt;

  // Gemini REST payload: one user turn with a text part plus the media
  // bytes as base64 inline data.
  const requestBody = {
    contents: [
      {
        role: "user",
        parts: [
          { text: prompt },
          {
            inline_data: {
              mime_type: params.mime ?? params.defaultMime,
              data: params.buffer.toString("base64"),
            },
          },
        ],
      },
    ],
  };

  const endpoint = `${baseUrl}/models/${model}:generateContent`;
  const { response, release } = await postJsonRequest({
    url: endpoint,
    headers,
    body: requestBody,
    timeoutMs: params.timeoutMs,
    fetchFn: doFetch,
    allowPrivateNetwork,
    dispatcherPolicy,
  });
  try {
    await assertOkOrThrowHttpError(response, params.httpErrorLabel);
    const payload = (await response.json()) as {
      candidates?: Array<{
        content?: { parts?: Array<{ text?: string }> };
      }>;
    };
    // Collect the trimmed, non-empty text of each part from the first
    // candidate, then join with newlines.
    const textParts: string[] = [];
    for (const part of payload.candidates?.[0]?.content?.parts ?? []) {
      const trimmed = part?.text?.trim();
      if (trimmed) {
        textParts.push(trimmed);
      }
    }
    if (textParts.length === 0) {
      throw new Error(params.missingTextError);
    }
    return { text: textParts.join("\n"), model };
  } finally {
    // Always release the transport slot, even on error paths.
    await release();
  }
}
/**
 * Transcribe an audio buffer with Gemini, applying the audio-specific
 * model/prompt/base-URL defaults on top of the caller's request.
 */
export async function transcribeGeminiAudio(
  params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
  const result = await generateGeminiInlineDataText({
    ...params,
    defaultBaseUrl: DEFAULT_GOOGLE_AUDIO_BASE_URL,
    defaultModel: DEFAULT_GOOGLE_AUDIO_MODEL,
    defaultPrompt: DEFAULT_GOOGLE_AUDIO_PROMPT,
    defaultMime: "audio/wav",
    httpErrorLabel: "Audio transcription failed",
    missingTextError: "Audio transcription response missing text",
  });
  return { text: result.text, model: result.model };
}
/**
 * Describe a video buffer with Gemini, applying the video-specific
 * model/prompt/base-URL defaults on top of the caller's request.
 */
export async function describeGeminiVideo(
  params: VideoDescriptionRequest,
): Promise<VideoDescriptionResult> {
  const result = await generateGeminiInlineDataText({
    ...params,
    defaultBaseUrl: DEFAULT_GOOGLE_VIDEO_BASE_URL,
    defaultModel: DEFAULT_GOOGLE_VIDEO_MODEL,
    defaultPrompt: DEFAULT_GOOGLE_VIDEO_PROMPT,
    defaultMime: "video/mp4",
    httpErrorLabel: "Video description failed",
    missingTextError: "Video description response missing text",
  });
  return { text: result.text, model: result.model };
}
// Provider registration: image handling delegates to the SDK's generic
// model-backed helpers, while audio/video use the Gemini inline-data
// implementations defined above.
export const googleMediaUnderstandingProvider: MediaUnderstandingProvider = {
id: "google",
capabilities: ["image", "audio", "video"],
describeImage: describeImageWithModel,
describeImages: describeImagesWithModel,
transcribeAudio: transcribeGeminiAudio,
describeVideo: describeGeminiVideo,
};