mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-18 19:34:45 +00:00
feat(openrouter): add inbound audio STT support
This commit is contained in:
committed by
Peter Steinberger
parent
0543448df8
commit
5c87b692cb
@@ -192,7 +192,7 @@ If `tools.media.<capability>.enabled` is **not** set to `false` and you haven't
|
||||
|
||||
Bundled fallback order:
|
||||
|
||||
- Audio: OpenAI → Groq → xAI → Deepgram → Google → SenseAudio → ElevenLabs → Mistral
|
||||
- Audio: OpenAI → Groq → xAI → Deepgram → OpenRouter → Google → SenseAudio → ElevenLabs → Mistral
|
||||
- Image: OpenAI → Anthropic → Google → MiniMax → MiniMax Portal → Z.AI
|
||||
- Video: Google → Qwen → Moonshot
|
||||
|
||||
@@ -237,7 +237,7 @@ If you set `capabilities`, the entry only runs for those media types. For shared
|
||||
- `openai`, `anthropic`, `minimax`: **image**
|
||||
- `minimax-portal`: **image**
|
||||
- `moonshot`: **image + video**
|
||||
- `openrouter`: **image**
|
||||
- `openrouter`: **image + audio**
|
||||
- `google` (Gemini API): **image + audio + video**
|
||||
- `qwen`: **image + video**
|
||||
- `mistral`: **audio**
|
||||
@@ -254,7 +254,7 @@ For CLI entries, **set `capabilities` explicitly** to avoid surprising matches.
|
||||
| Capability | Provider integration | Notes |
|
||||
| ---------- | ---------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Image | OpenAI, OpenAI Codex OAuth, Codex app-server, OpenRouter, Anthropic, Google, MiniMax, Moonshot, Qwen, Z.AI, config providers | Vendor plugins register image support; `openai-codex/*` uses OAuth provider plumbing; `codex/*` uses a bounded Codex app-server turn; MiniMax and MiniMax OAuth both use `MiniMax-VL-01`; image-capable config providers auto-register. |
|
||||
| Audio | OpenAI, Groq, xAI, Deepgram, Google, SenseAudio, ElevenLabs, Mistral | Provider transcription (Whisper/Groq/xAI/Deepgram/Gemini/SenseAudio/Scribe/Voxtral). |
|
||||
| Audio | OpenAI, Groq, xAI, Deepgram, OpenRouter, Google, SenseAudio, ElevenLabs, Mistral | Provider transcription (Whisper/Groq/xAI/Deepgram/OpenRouter STT/Gemini/SenseAudio/Scribe/Voxtral). |
|
||||
| Video | Google, Qwen, Moonshot | Provider video understanding via vendor plugins; Qwen video understanding uses the Standard DashScope endpoints. |
|
||||
|
||||
<Note>
|
||||
|
||||
@@ -133,6 +133,29 @@ OpenRouter can also be used as a TTS provider through its OpenAI-compatible
|
||||
If `messages.tts.providers.openrouter.apiKey` is omitted, TTS reuses
|
||||
`models.providers.openrouter.apiKey`, then `OPENROUTER_API_KEY`.
|
||||
|
||||
## Speech-to-text (inbound audio)
|
||||
|
||||
OpenRouter can transcribe inbound voice/audio attachments through the shared
|
||||
`tools.media.audio` path using its STT endpoint (`/audio/transcriptions`).
|
||||
This applies to any channel plugin that forwards inbound voice/audio into
|
||||
the media-understanding preflight step.
|
||||
|
||||
```json5
|
||||
{
|
||||
tools: {
|
||||
media: {
|
||||
audio: {
|
||||
enabled: true,
|
||||
models: [{ provider: "openrouter", model: "openai/whisper-large-v3-turbo" }],
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
```
|
||||
|
||||
OpenClaw sends OpenRouter STT requests as JSON with base64 audio under
|
||||
`input_audio` (the OpenRouter STT contract), not as OpenAI-style multipart form uploads.
|
||||
|
||||
## Authentication and headers
|
||||
|
||||
OpenRouter uses a Bearer token with your API key under the hood.
|
||||
|
||||
@@ -67,7 +67,7 @@ telephony, meetings, browser realtime, and native push-to-talk clients.
|
||||
| MiniMax | ✓ | ✓ | ✓ | ✓ | | | |
|
||||
| Mistral | | | | | ✓ | | |
|
||||
| OpenAI | ✓ | ✓ | | ✓ | ✓ | ✓ | ✓ |
|
||||
| OpenRouter | ✓ | ✓ | | ✓ | | | ✓ |
|
||||
| OpenRouter | ✓ | ✓ | | ✓ | ✓ | | ✓ |
|
||||
| Qwen | | ✓ | | | | | |
|
||||
| Runway | | ✓ | | | | | |
|
||||
| SenseAudio | | | | | ✓ | | |
|
||||
@@ -105,7 +105,7 @@ the generated media fallback directly to the original channel.
|
||||
|
||||
## Speech-to-text and Voice Call
|
||||
|
||||
Deepgram, DeepInfra, ElevenLabs, Mistral, OpenAI, SenseAudio, and xAI can all transcribe
|
||||
Deepgram, DeepInfra, ElevenLabs, Mistral, OpenAI, OpenRouter, SenseAudio, and xAI can all transcribe
|
||||
inbound audio through the batch `tools.media.audio` path when configured.
|
||||
Channel plugins that preflight a voice note for mention gating or command
|
||||
parsing mark the transcribed attachment on the inbound context, so the shared
|
||||
|
||||
180
extensions/openrouter/media-understanding-provider.test.ts
Normal file
180
extensions/openrouter/media-understanding-provider.test.ts
Normal file
@@ -0,0 +1,180 @@
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
import {
|
||||
openrouterMediaUnderstandingProvider,
|
||||
transcribeOpenRouterAudio,
|
||||
} from "./media-understanding-provider.js";
|
||||
|
||||
// The mocks are built inside vi.hoisted so they already exist when the
// vi.mock factory for "openclaw/plugin-sdk/provider-http" references them.
const { assertOkOrThrowHttpErrorMock, postJsonRequestMock, resolveProviderHttpRequestConfigMock } =
  vi.hoisted(() => {
    // Stand-in for the real config resolver: echoes the caller's base URL
    // (explicit, then default, then a fixed fallback) and wraps the default
    // headers in a Headers instance so tests can read them back.
    const resolveConfig = vi.fn((params: Record<string, unknown>) => {
      const baseUrl = params.baseUrl ?? params.defaultBaseUrl ?? "https://openrouter.ai/api/v1";
      const headers = new Headers(params.defaultHeaders as HeadersInit | undefined);
      return {
        baseUrl,
        allowPrivateNetwork: false,
        headers,
        dispatcherPolicy: undefined,
      };
    });

    return {
      // Resolves without throwing, i.e. every response is treated as OK.
      assertOkOrThrowHttpErrorMock: vi.fn(async () => {}),
      // Configured per test with the response the provider should receive.
      postJsonRequestMock: vi.fn(),
      resolveProviderHttpRequestConfigMock: resolveConfig,
    };
  });
|
||||
|
||||
// Replace the provider HTTP helpers with the hoisted mocks so no real network
// request is made; requireTranscriptionText keeps a small local implementation.
vi.mock("openclaw/plugin-sdk/provider-http", () => {
  // Returns the trimmed transcript, or throws `message` when it is missing/blank.
  function requireTranscriptionText(value: string | undefined, message: string) {
    const trimmed = value?.trim();
    if (trimmed) {
      return trimmed;
    }
    throw new Error(message);
  }

  return {
    assertOkOrThrowHttpError: assertOkOrThrowHttpErrorMock,
    postJsonRequest: postJsonRequestMock,
    requireTranscriptionText,
    resolveProviderHttpRequestConfig: resolveProviderHttpRequestConfigMock,
  };
});
|
||||
|
||||
describe("openrouter media understanding provider", () => {
  /**
   * Makes the mocked `postJsonRequest` resolve with an HTTP 200 JSON response
   * carrying `payload`, and returns the `release` spy so a test can assert
   * that the request handle was cleaned up.
   */
  function mockSttResponse(payload: unknown) {
    const release = vi.fn(async () => {});
    postJsonRequestMock.mockResolvedValue({
      response: new Response(JSON.stringify(payload), { status: 200 }),
      release,
    });
    return release;
  }

  afterEach(() => {
    assertOkOrThrowHttpErrorMock.mockClear();
    // mockReset (not mockClear) so the resolved value configured by one test
    // does not leak into the next.
    postJsonRequestMock.mockReset();
    resolveProviderHttpRequestConfigMock.mockClear();
  });

  it("declares image and audio capabilities with defaults", () => {
    expect(openrouterMediaUnderstandingProvider).toMatchObject({
      id: "openrouter",
      capabilities: ["image", "audio"],
      defaultModels: {
        image: "auto",
        audio: "openai/whisper-large-v3-turbo",
      },
      autoPriority: { audio: 35 },
    });
    expect(openrouterMediaUnderstandingProvider.transcribeAudio).toBeTypeOf("function");
  });

  it("sends JSON STT payload to OpenRouter transcriptions endpoint", async () => {
    const release = mockSttResponse({ text: "hello world" });

    const result = await transcribeOpenRouterAudio({
      buffer: Buffer.from("audio-bytes"),
      fileName: "voice.oga",
      mime: "audio/ogg",
      apiKey: "sk-openrouter",
      timeoutMs: 12_000,
      // Padded on purpose: the provider is expected to trim the language tag.
      language: " en ",
      fetchFn: fetch,
    });

    expect(result).toEqual({
      text: "hello world",
      model: "openai/whisper-large-v3-turbo",
    });
    expect(resolveProviderHttpRequestConfigMock).toHaveBeenCalledWith(
      expect.objectContaining({
        provider: "openrouter",
        capability: "audio",
      }),
    );
    expect(postJsonRequestMock).toHaveBeenCalledWith(
      expect.objectContaining({
        url: "https://openrouter.ai/api/v1/audio/transcriptions",
        timeoutMs: 12_000,
        body: {
          model: "openai/whisper-large-v3-turbo",
          input_audio: {
            data: Buffer.from("audio-bytes").toString("base64"),
            format: "ogg",
          },
          language: "en",
        },
      }),
    );
    const headers = postJsonRequestMock.mock.calls[0]?.[0]?.headers as Headers;
    expect(headers.get("authorization")).toBe("Bearer sk-openrouter");
    expect(headers.get("http-referer")).toBe("https://openclaw.ai");
    expect(headers.get("x-openrouter-title")).toBe("OpenClaw");
    expect(release).toHaveBeenCalledOnce();
  });

  it("accepts temperature via provider query options", async () => {
    mockSttResponse({ text: "ok" });

    await transcribeOpenRouterAudio({
      buffer: Buffer.from("audio"),
      fileName: "voice.webm",
      apiKey: "sk-openrouter",
      timeoutMs: 5_000,
      query: { temperature: 0.2 },
      fetchFn: fetch,
    });

    expect(postJsonRequestMock).toHaveBeenCalledWith(
      expect.objectContaining({
        body: expect.objectContaining({
          temperature: 0.2,
        }),
      }),
    );
  });

  it("falls back to filename extension when mime is missing", async () => {
    mockSttResponse({ text: "ok" });

    await transcribeOpenRouterAudio({
      buffer: Buffer.from("audio"),
      fileName: "voice.opus",
      apiKey: "sk-openrouter",
      timeoutMs: 5_000,
      fetchFn: fetch,
    });

    expect(postJsonRequestMock).toHaveBeenCalledWith(
      expect.objectContaining({
        body: expect.objectContaining({
          input_audio: expect.objectContaining({ format: "ogg" }),
        }),
      }),
    );
  });

  it("throws when format cannot be resolved", async () => {
    await expect(
      transcribeOpenRouterAudio({
        buffer: Buffer.from("audio"),
        fileName: "voice.bin",
        mime: "application/octet-stream",
        apiKey: "sk-openrouter",
        timeoutMs: 5_000,
        fetchFn: fetch,
      }),
    ).rejects.toThrow("OpenRouter STT could not resolve audio format");
    // Format resolution happens before any request is issued.
    expect(postJsonRequestMock).not.toHaveBeenCalled();
  });

  it("throws when provider response omits text", async () => {
    const release = mockSttResponse({});

    await expect(
      transcribeOpenRouterAudio({
        buffer: Buffer.from("audio"),
        fileName: "voice.mp3",
        apiKey: "sk-openrouter",
        timeoutMs: 5_000,
        fetchFn: fetch,
      }),
    ).rejects.toThrow("OpenRouter transcription response missing text");
    // The request handle must be released even when parsing the response fails.
    expect(release).toHaveBeenCalledOnce();
  });
});
|
||||
@@ -1,13 +1,161 @@
|
||||
import path from "node:path";
|
||||
import {
|
||||
describeImageWithModel,
|
||||
describeImagesWithModel,
|
||||
type AudioTranscriptionRequest,
|
||||
type AudioTranscriptionResult,
|
||||
type MediaUnderstandingProvider,
|
||||
} from "openclaw/plugin-sdk/media-understanding";
|
||||
import {
|
||||
assertOkOrThrowHttpError,
|
||||
postJsonRequest,
|
||||
requireTranscriptionText,
|
||||
resolveProviderHttpRequestConfig,
|
||||
} from "openclaw/plugin-sdk/provider-http";
|
||||
import { OPENROUTER_BASE_URL } from "./provider-catalog.js";
|
||||
|
||||
/** Model used for OpenRouter speech-to-text when the request does not name one. */
const DEFAULT_OPENROUTER_AUDIO_TRANSCRIPTION_MODEL = "openai/whisper-large-v3-turbo";

/**
 * Audio format identifiers accepted as the `input_audio.format` value.
 * Typed as a read-only view so helpers cannot mutate the shared allow-list.
 */
const SUPPORTED_AUDIO_FORMATS: ReadonlySet<string> = new Set([
  "wav",
  "mp3",
  "flac",
  "m4a",
  "ogg",
  "webm",
  "aac",
]);
|
||||
|
||||
/**
 * Maps a bare MIME type (no parameters, lower-case) to the audio format
 * identifier sent to OpenRouter. A Map is used instead of an object literal so
 * that inherited keys such as "constructor" can never match.
 */
const OPENROUTER_AUDIO_FORMAT_BY_MIME: ReadonlyMap<string, string> = new Map([
  ["audio/wav", "wav"],
  ["audio/x-wav", "wav"],
  ["audio/wave", "wav"],
  ["audio/vnd.wave", "wav"],
  ["audio/mpeg", "mp3"],
  ["audio/mp3", "mp3"],
  ["audio/flac", "flac"],
  ["audio/x-flac", "flac"],
  ["audio/mp4", "m4a"],
  ["audio/x-m4a", "m4a"],
  ["audio/ogg", "ogg"],
  ["audio/oga", "ogg"],
  // Mirrors the file-name handling, where ".opus" is also reported as "ogg".
  ["audio/opus", "ogg"],
  ["audio/webm", "webm"],
  ["audio/aac", "aac"],
]);

/**
 * Resolves the OpenRouter audio format from a MIME type.
 *
 * Matching is case-insensitive, ignores surrounding whitespace, and ignores
 * any MIME parameters, so `audio/ogg; codecs=opus` resolves like `audio/ogg`.
 *
 * @param mime - MIME type reported for the attachment, if any.
 * @returns The format identifier, or `undefined` when the MIME type is
 *   missing, blank, or not a recognised audio type.
 */
function resolveFormatFromMime(mime?: string): string | undefined {
  // Keep only the "type/subtype" part; parameters follow the first ";".
  const essence = mime?.split(";", 1)[0]?.trim().toLowerCase();
  if (!essence) {
    return undefined;
  }
  return OPENROUTER_AUDIO_FORMAT_BY_MIME.get(essence);
}
|
||||
|
||||
/**
 * Resolves the OpenRouter audio format from a file name's extension.
 *
 * The extension is compared case-insensitively. "mpeg" is reported as "mp3",
 * "oga" and "opus" as "ogg"; any other extension is returned unchanged only
 * when it appears in SUPPORTED_AUDIO_FORMATS.
 *
 * @param fileName - Attachment file name, if any.
 * @returns The format identifier, or `undefined` when there is no extension
 *   or the extension is not supported.
 */
function resolveFormatFromFileName(fileName?: string): string | undefined {
  const rawExtension = path.extname(fileName ?? "");
  const extension = rawExtension.trim().toLowerCase().replace(/^\./, "");

  switch (extension) {
    case "":
      return undefined;
    case "mpeg":
      return "mp3";
    case "oga":
    case "opus":
      return "ogg";
    default:
      return SUPPORTED_AUDIO_FORMATS.has(extension) ? extension : undefined;
  }
}
|
||||
|
||||
/**
 * Picks the audio format to send to OpenRouter, preferring the MIME type and
 * consulting the file name only when the MIME type yields nothing.
 *
 * @param params - The attachment's MIME type and file name, both optional.
 * @returns The resolved format identifier.
 * @throws Error when neither the MIME type nor the file name resolves to a format.
 */
function resolveOpenRouterAudioFormat(params: { mime?: string; fileName?: string }): string {
  const { mime, fileName } = params;
  // `||` short-circuits, so the file name is only inspected on a MIME miss.
  const resolved = resolveFormatFromMime(mime) || resolveFormatFromFileName(fileName);
  if (!resolved) {
    throw new Error(
      `OpenRouter STT could not resolve audio format from mime "${mime ?? ""}" and file "${fileName ?? ""}"`,
    );
  }
  return resolved;
}
|
||||
|
||||
/** Subset of the OpenRouter `/audio/transcriptions` response body that this provider reads. */
type OpenRouterSttResponse = {
  text?: string;
};

/**
 * Narrows an untrusted JSON payload to the fields this provider reads.
 * Anything other than an object with a string `text` yields an empty result,
 * so the caller reports "missing text" instead of failing with a TypeError.
 */
function parseOpenRouterSttResponse(payload: unknown): OpenRouterSttResponse {
  if (typeof payload !== "object" || payload === null || !("text" in payload)) {
    return {};
  }
  return typeof payload.text === "string" ? { text: payload.text } : {};
}

/**
 * Transcribes an audio buffer through OpenRouter's speech-to-text endpoint.
 *
 * The audio is sent as a JSON POST to `<baseUrl>/audio/transcriptions`, with
 * the bytes base64-encoded under `input_audio` together with the resolved
 * format. `language` (trimmed) and a finite numeric `query.temperature` are
 * included only when provided.
 *
 * @param params - Transcription request: audio buffer, optional mime/file name,
 *   API key, optional model, language, query options and HTTP overrides.
 * @returns The transcript text and the model that was requested.
 * @throws Error when the audio format cannot be resolved (before any request
 *   is made), when `assertOkOrThrowHttpError` rejects the response, or when
 *   the response carries no usable `text`.
 */
export async function transcribeOpenRouterAudio(
  params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
  const model = params.model?.trim() || DEFAULT_OPENROUTER_AUDIO_TRANSCRIPTION_MODEL;
  // Resolved first so an unsupported attachment fails without a network call.
  const format = resolveOpenRouterAudioFormat({
    mime: params.mime,
    fileName: params.fileName,
  });
  const language = params.language?.trim();
  const temperature = params.query?.temperature;
  const fetchFn = params.fetchFn ?? fetch;

  const { baseUrl, allowPrivateNetwork, headers, dispatcherPolicy } =
    resolveProviderHttpRequestConfig({
      baseUrl: params.baseUrl,
      defaultBaseUrl: OPENROUTER_BASE_URL,
      headers: params.headers,
      request: params.request,
      defaultHeaders: {
        Authorization: `Bearer ${params.apiKey}`,
        "Content-Type": "application/json",
        "HTTP-Referer": "https://openclaw.ai",
        "X-OpenRouter-Title": "OpenClaw",
      },
      provider: "openrouter",
      api: "openrouter-stt",
      capability: "audio",
      transport: "media-understanding",
    });

  const { response, release } = await postJsonRequest({
    url: `${baseUrl}/audio/transcriptions`,
    headers,
    body: {
      model,
      input_audio: {
        data: params.buffer.toString("base64"),
        format,
      },
      ...(language ? { language } : {}),
      // NaN/Infinity would serialise as JSON `null`, so only finite values are sent.
      ...(typeof temperature === "number" && Number.isFinite(temperature) ? { temperature } : {}),
    },
    timeoutMs: params.timeoutMs,
    fetchFn,
    allowPrivateNetwork,
    dispatcherPolicy,
    auditContext: "openrouter stt",
  });

  try {
    await assertOkOrThrowHttpError(response, "OpenRouter audio transcription failed");
    const payload = parseOpenRouterSttResponse(await response.json());
    return {
      text: requireTranscriptionText(
        payload.text,
        "OpenRouter transcription response missing text",
      ),
      model,
    };
  } finally {
    // Always hand the request handle back, including on HTTP or parse errors.
    await release();
  }
}
|
||||
|
||||
/**
 * Media-understanding provider registration for OpenRouter.
 *
 * Declares image description (delegated to the shared model-based helpers)
 * and audio transcription (via `transcribeOpenRouterAudio`). Each capability
 * has a default model, used when a configuration entry does not name one.
 */
export const openrouterMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "openrouter",
  capabilities: ["image", "audio"],
  defaultModels: {
    image: "auto",
    audio: DEFAULT_OPENROUTER_AUDIO_TRANSCRIPTION_MODEL,
  },
  autoPriority: {
    // NOTE(review): presumably the position in the automatic audio fallback
    // order, with lower values tried earlier — confirm against the resolver.
    audio: 35,
  },
  describeImage: describeImageWithModel,
  describeImages: describeImagesWithModel,
  transcribeAudio: transcribeOpenRouterAudio,
};
|
||||
|
||||
@@ -61,9 +61,13 @@
|
||||
},
|
||||
"mediaUnderstandingProviderMetadata": {
|
||||
"openrouter": {
|
||||
"capabilities": ["image"],
|
||||
"capabilities": ["image", "audio"],
|
||||
"defaultModels": {
|
||||
"image": "auto"
|
||||
"image": "auto",
|
||||
"audio": "openai/whisper-large-v3-turbo"
|
||||
},
|
||||
"autoPriority": {
|
||||
"audio": 35
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
@@ -64,7 +64,11 @@ const mediaMetadataPlugins = vi.hoisted(() => [
|
||||
},
|
||||
opencode: { capabilities: ["image"], defaultModels: { image: "gpt-5-nano" } },
|
||||
"opencode-go": { capabilities: ["image"], defaultModels: { image: "kimi-k2.6" } },
|
||||
openrouter: { capabilities: ["image"], defaultModels: { image: "auto" } },
|
||||
openrouter: {
|
||||
capabilities: ["image", "audio"],
|
||||
defaultModels: { image: "auto", audio: "openai/whisper-large-v3-turbo" },
|
||||
autoPriority: { audio: 35 },
|
||||
},
|
||||
qwen: { capabilities: ["video"], autoPriority: { video: 20 } },
|
||||
xai: { capabilities: ["audio"], autoPriority: { audio: 25 } },
|
||||
zai: { capabilities: ["image"], autoPriority: { image: 60 } },
|
||||
@@ -111,6 +115,9 @@ describe("resolveDefaultMediaModel", () => {
|
||||
expect(resolveDefaultMediaModel({ providerId: "openai-codex", capability: "audio" })).toBe(
|
||||
"gpt-4o-transcribe",
|
||||
);
|
||||
expect(resolveDefaultMediaModel({ providerId: "openrouter", capability: "audio" })).toBe(
|
||||
"openai/whisper-large-v3-turbo",
|
||||
);
|
||||
});
|
||||
|
||||
it("resolves bundled image defaults beyond the historical core set", () => {
|
||||
@@ -141,6 +148,7 @@ describe("resolveAutoMediaKeyProviders", () => {
|
||||
"openai",
|
||||
"openai-codex",
|
||||
"xai",
|
||||
"openrouter",
|
||||
"google",
|
||||
"mistral",
|
||||
]);
|
||||
|
||||
Reference in New Issue
Block a user