mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-11 01:01:13 +00:00
* refactor(google): share oauth token helpers * refactor(xai): share tool auth fallback helpers * refactor(xai): share tool auth resolution * refactor(xai): share tool config helpers * refactor(xai): share fallback auth helpers * refactor(xai): share responses tool helpers * refactor(google): share http request config helper * fix(xai): re-export shared web search extractor * fix(xai): import plugin config type * fix(providers): preserve default google network guard
165 lines
4.8 KiB
TypeScript
165 lines
4.8 KiB
TypeScript
import {
|
|
describeImageWithModel,
|
|
describeImagesWithModel,
|
|
type AudioTranscriptionRequest,
|
|
type AudioTranscriptionResult,
|
|
type MediaUnderstandingProvider,
|
|
type VideoDescriptionRequest,
|
|
type VideoDescriptionResult,
|
|
} from "openclaw/plugin-sdk/media-understanding";
|
|
import {
|
|
assertOkOrThrowHttpError,
|
|
postJsonRequest,
|
|
type ProviderRequestTransportOverrides,
|
|
} from "openclaw/plugin-sdk/provider-http";
|
|
import {
|
|
DEFAULT_GOOGLE_API_BASE_URL,
|
|
normalizeGoogleModelId,
|
|
resolveGoogleGenerativeAiHttpRequestConfig,
|
|
} from "./runtime-api.js";
|
|
|
|
export const DEFAULT_GOOGLE_AUDIO_BASE_URL = DEFAULT_GOOGLE_API_BASE_URL;
|
|
export const DEFAULT_GOOGLE_VIDEO_BASE_URL = DEFAULT_GOOGLE_API_BASE_URL;
|
|
const DEFAULT_GOOGLE_AUDIO_MODEL = "gemini-3-flash-preview";
|
|
const DEFAULT_GOOGLE_VIDEO_MODEL = "gemini-3-flash-preview";
|
|
const DEFAULT_GOOGLE_AUDIO_PROMPT = "Transcribe the audio.";
|
|
const DEFAULT_GOOGLE_VIDEO_PROMPT = "Describe the video.";
|
|
|
|
async function generateGeminiInlineDataText(params: {
|
|
buffer: Buffer;
|
|
mime?: string;
|
|
apiKey: string;
|
|
baseUrl?: string;
|
|
headers?: Record<string, string>;
|
|
request?: ProviderRequestTransportOverrides;
|
|
model?: string;
|
|
prompt?: string;
|
|
timeoutMs: number;
|
|
fetchFn?: typeof fetch;
|
|
defaultBaseUrl: string;
|
|
defaultModel: string;
|
|
defaultPrompt: string;
|
|
defaultMime: string;
|
|
httpErrorLabel: string;
|
|
missingTextError: string;
|
|
}): Promise<{ text: string; model: string }> {
|
|
const fetchFn = params.fetchFn ?? fetch;
|
|
const model = (() => {
|
|
const trimmed = params.model?.trim();
|
|
if (!trimmed) {
|
|
return params.defaultModel;
|
|
}
|
|
return normalizeGoogleModelId(trimmed);
|
|
})();
|
|
const { baseUrl, allowPrivateNetwork, headers, dispatcherPolicy } =
|
|
resolveGoogleGenerativeAiHttpRequestConfig({
|
|
apiKey: params.apiKey,
|
|
baseUrl: params.baseUrl,
|
|
headers: params.headers,
|
|
request: params.request,
|
|
capability: params.defaultMime.startsWith("audio/") ? "audio" : "video",
|
|
transport: "media-understanding",
|
|
});
|
|
const resolvedBaseUrl = baseUrl ?? params.defaultBaseUrl;
|
|
const url = `${resolvedBaseUrl}/models/${model}:generateContent`;
|
|
|
|
const prompt = (() => {
|
|
const trimmed = params.prompt?.trim();
|
|
return trimmed || params.defaultPrompt;
|
|
})();
|
|
|
|
const body = {
|
|
contents: [
|
|
{
|
|
role: "user",
|
|
parts: [
|
|
{ text: prompt },
|
|
{
|
|
inline_data: {
|
|
mime_type: params.mime ?? params.defaultMime,
|
|
data: params.buffer.toString("base64"),
|
|
},
|
|
},
|
|
],
|
|
},
|
|
],
|
|
};
|
|
|
|
const { response: res, release } = await postJsonRequest({
|
|
url,
|
|
headers,
|
|
body,
|
|
timeoutMs: params.timeoutMs,
|
|
fetchFn,
|
|
allowPrivateNetwork,
|
|
dispatcherPolicy,
|
|
});
|
|
|
|
try {
|
|
await assertOkOrThrowHttpError(res, params.httpErrorLabel);
|
|
|
|
const payload = (await res.json()) as {
|
|
candidates?: Array<{
|
|
content?: { parts?: Array<{ text?: string }> };
|
|
}>;
|
|
};
|
|
const parts = payload.candidates?.[0]?.content?.parts ?? [];
|
|
const text = parts
|
|
.map((part) => part?.text?.trim())
|
|
.filter(Boolean)
|
|
.join("\n");
|
|
if (!text) {
|
|
throw new Error(params.missingTextError);
|
|
}
|
|
return { text, model };
|
|
} finally {
|
|
await release();
|
|
}
|
|
}
|
|
|
|
export async function transcribeGeminiAudio(
|
|
params: AudioTranscriptionRequest,
|
|
): Promise<AudioTranscriptionResult> {
|
|
const { text, model } = await generateGeminiInlineDataText({
|
|
...params,
|
|
defaultBaseUrl: DEFAULT_GOOGLE_AUDIO_BASE_URL,
|
|
defaultModel: DEFAULT_GOOGLE_AUDIO_MODEL,
|
|
defaultPrompt: DEFAULT_GOOGLE_AUDIO_PROMPT,
|
|
defaultMime: "audio/wav",
|
|
httpErrorLabel: "Audio transcription failed",
|
|
missingTextError: "Audio transcription response missing text",
|
|
});
|
|
return { text, model };
|
|
}
|
|
|
|
export async function describeGeminiVideo(
|
|
params: VideoDescriptionRequest,
|
|
): Promise<VideoDescriptionResult> {
|
|
const { text, model } = await generateGeminiInlineDataText({
|
|
...params,
|
|
defaultBaseUrl: DEFAULT_GOOGLE_VIDEO_BASE_URL,
|
|
defaultModel: DEFAULT_GOOGLE_VIDEO_MODEL,
|
|
defaultPrompt: DEFAULT_GOOGLE_VIDEO_PROMPT,
|
|
defaultMime: "video/mp4",
|
|
httpErrorLabel: "Video description failed",
|
|
missingTextError: "Video description response missing text",
|
|
});
|
|
return { text, model };
|
|
}
|
|
|
|
export const googleMediaUnderstandingProvider: MediaUnderstandingProvider = {
|
|
id: "google",
|
|
capabilities: ["image", "audio", "video"],
|
|
defaultModels: {
|
|
image: DEFAULT_GOOGLE_VIDEO_MODEL,
|
|
audio: DEFAULT_GOOGLE_AUDIO_MODEL,
|
|
video: DEFAULT_GOOGLE_VIDEO_MODEL,
|
|
},
|
|
autoPriority: { image: 30, audio: 40, video: 10 },
|
|
nativeDocumentInputs: ["pdf"],
|
|
describeImage: describeImageWithModel,
|
|
describeImages: describeImagesWithModel,
|
|
transcribeAudio: transcribeGeminiAudio,
|
|
describeVideo: describeGeminiVideo,
|
|
};
|