feat(openrouter): add inbound audio STT support

This commit is contained in:
remdev
2026-05-04 21:22:30 +03:00
committed by Peter Steinberger
parent 0543448df8
commit 5c87b692cb
7 changed files with 373 additions and 10 deletions

View File

@@ -192,7 +192,7 @@ If `tools.media.<capability>.enabled` is **not** set to `false` and you haven't
Bundled fallback order:
- Audio: OpenAI → Groq → xAI → Deepgram → Google → SenseAudio → ElevenLabs → Mistral
- Audio: OpenAI → Groq → xAI → Deepgram → OpenRouter → Google → SenseAudio → ElevenLabs → Mistral
- Image: OpenAI → Anthropic → Google → MiniMax → MiniMax Portal → Z.AI
- Video: Google → Qwen → Moonshot
@@ -237,7 +237,7 @@ If you set `capabilities`, the entry only runs for those media types. For shared
- `openai`, `anthropic`, `minimax`: **image**
- `minimax-portal`: **image**
- `moonshot`: **image + video**
- `openrouter`: **image**
- `openrouter`: **image + audio**
- `google` (Gemini API): **image + audio + video**
- `qwen`: **image + video**
- `mistral`: **audio**
@@ -254,7 +254,7 @@ For CLI entries, **set `capabilities` explicitly** to avoid surprising matches.
| Capability | Provider integration | Notes |
| ---------- | ---------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Image | OpenAI, OpenAI Codex OAuth, Codex app-server, OpenRouter, Anthropic, Google, MiniMax, Moonshot, Qwen, Z.AI, config providers | Vendor plugins register image support; `openai-codex/*` uses OAuth provider plumbing; `codex/*` uses a bounded Codex app-server turn; MiniMax and MiniMax OAuth both use `MiniMax-VL-01`; image-capable config providers auto-register. |
| Audio | OpenAI, Groq, xAI, Deepgram, Google, SenseAudio, ElevenLabs, Mistral | Provider transcription (Whisper/Groq/xAI/Deepgram/Gemini/SenseAudio/Scribe/Voxtral). |
| Audio | OpenAI, Groq, xAI, Deepgram, OpenRouter, Google, SenseAudio, ElevenLabs, Mistral | Provider transcription (Whisper/Groq/xAI/Deepgram/OpenRouter STT/Gemini/SenseAudio/Scribe/Voxtral). |
| Video | Google, Qwen, Moonshot | Provider video understanding via vendor plugins; Qwen video understanding uses the Standard DashScope endpoints. |
<Note>

View File

@@ -133,6 +133,29 @@ OpenRouter can also be used as a TTS provider through its OpenAI-compatible
If `messages.tts.providers.openrouter.apiKey` is omitted, TTS reuses
`models.providers.openrouter.apiKey`, then `OPENROUTER_API_KEY`.
## Speech-to-text (inbound audio)
OpenRouter can transcribe inbound voice/audio attachments through the shared
`tools.media.audio` path using its STT endpoint (`/audio/transcriptions`).
This applies to any channel plugin that forwards inbound voice/audio into
media understanding preflight.
```json5
{
tools: {
media: {
audio: {
enabled: true,
models: [{ provider: "openrouter", model: "openai/whisper-large-v3-turbo" }],
},
},
},
}
```
OpenClaw sends OpenRouter STT requests as JSON, with the base64-encoded audio and
its format under `input_audio` (the OpenRouter STT contract), rather than as
OpenAI-style multipart form uploads.
## Authentication and headers
OpenRouter uses a Bearer token with your API key under the hood.

View File

@@ -67,7 +67,7 @@ telephony, meetings, browser realtime, and native push-to-talk clients.
| MiniMax | ✓ | ✓ | ✓ | ✓ | | | |
| Mistral | | | | | ✓ | | |
| OpenAI | ✓ | ✓ | | ✓ | ✓ | ✓ | ✓ |
| OpenRouter | ✓ | ✓ | | ✓ | | | ✓ |
| OpenRouter | ✓ | ✓ | | ✓ | ✓ | | ✓ |
| Qwen | | ✓ | | | | | |
| Runway | | ✓ | | | | | |
| SenseAudio | | | | | ✓ | | |
@@ -105,7 +105,7 @@ the generated media fallback directly to the original channel.
## Speech-to-text and Voice Call
Deepgram, DeepInfra, ElevenLabs, Mistral, OpenAI, SenseAudio, and xAI can all transcribe
Deepgram, DeepInfra, ElevenLabs, Mistral, OpenAI, OpenRouter, SenseAudio, and xAI can all transcribe
inbound audio through the batch `tools.media.audio` path when configured.
Channel plugins that preflight a voice note for mention gating or command
parsing mark the transcribed attachment on the inbound context, so the shared

View File

@@ -0,0 +1,180 @@
import { afterEach, describe, expect, it, vi } from "vitest";
import {
openrouterMediaUnderstandingProvider,
transcribeOpenRouterAudio,
} from "./media-understanding-provider.js";
// Mock handles are created through vi.hoisted so they are available to the
// vi.mock factory below, which replaces the provider-http module for this file.
const { assertOkOrThrowHttpErrorMock, postJsonRequestMock, resolveProviderHttpRequestConfigMock } =
vi.hoisted(() => ({
// Resolves without throwing, i.e. every response is treated as OK.
assertOkOrThrowHttpErrorMock: vi.fn(async () => {}),
// No default implementation: each test supplies its own resolved value.
postJsonRequestMock: vi.fn(),
// Simplified stand-in: picks baseUrl, then defaultBaseUrl, then the literal
// OpenRouter URL, and builds headers from `defaultHeaders` only.
resolveProviderHttpRequestConfigMock: vi.fn((params: Record<string, unknown>) => ({
baseUrl: params.baseUrl ?? params.defaultBaseUrl ?? "https://openrouter.ai/api/v1",
allowPrivateNetwork: false,
headers: new Headers(params.defaultHeaders as HeadersInit | undefined),
dispatcherPolicy: undefined,
})),
}));
// Replace the provider-http SDK module with the mocks above.
vi.mock("openclaw/plugin-sdk/provider-http", () => ({
assertOkOrThrowHttpError: assertOkOrThrowHttpErrorMock,
postJsonRequest: postJsonRequestMock,
// Local stub: returns the trimmed text, or throws `message` when the value is
// missing or blank.
requireTranscriptionText: (value: string | undefined, message: string) => {
const text = value?.trim();
if (!text) {
throw new Error(message);
}
return text;
},
resolveProviderHttpRequestConfig: resolveProviderHttpRequestConfigMock,
}));
describe("openrouter media understanding provider", () => {
  /**
   * Makes the `postJsonRequest` mock resolve with a 200 response whose body is
   * `payload` serialized as JSON. Returns the `release` spy handed to the code
   * under test so each test can assert that the request handle was released.
   */
  function mockSttResponse(payload: unknown) {
    const release = vi.fn(async () => {});
    postJsonRequestMock.mockResolvedValue({
      response: new Response(JSON.stringify(payload), { status: 200 }),
      release,
    });
    return release;
  }

  afterEach(() => {
    assertOkOrThrowHttpErrorMock.mockClear();
    // Reset (not just clear) so a configured response never leaks into the next test.
    postJsonRequestMock.mockReset();
    resolveProviderHttpRequestConfigMock.mockClear();
  });

  it("declares image and audio capabilities with defaults", () => {
    expect(openrouterMediaUnderstandingProvider).toMatchObject({
      id: "openrouter",
      capabilities: ["image", "audio"],
      defaultModels: {
        image: "auto",
        audio: "openai/whisper-large-v3-turbo",
      },
      autoPriority: { audio: 35 },
    });
    expect(openrouterMediaUnderstandingProvider.transcribeAudio).toBeTypeOf("function");
  });

  it("sends JSON STT payload to OpenRouter transcriptions endpoint", async () => {
    const release = mockSttResponse({ text: "hello world" });

    const result = await transcribeOpenRouterAudio({
      buffer: Buffer.from("audio-bytes"),
      fileName: "voice.oga",
      mime: "audio/ogg",
      apiKey: "sk-openrouter",
      timeoutMs: 12_000,
      language: " en ",
      fetchFn: fetch,
    });

    expect(result).toEqual({
      text: "hello world",
      model: "openai/whisper-large-v3-turbo",
    });
    expect(resolveProviderHttpRequestConfigMock).toHaveBeenCalledWith(
      expect.objectContaining({
        provider: "openrouter",
        capability: "audio",
      }),
    );
    expect(postJsonRequestMock).toHaveBeenCalledWith(
      expect.objectContaining({
        url: "https://openrouter.ai/api/v1/audio/transcriptions",
        timeoutMs: 12_000,
        body: {
          model: "openai/whisper-large-v3-turbo",
          input_audio: {
            data: Buffer.from("audio-bytes").toString("base64"),
            format: "ogg",
          },
          // Surrounding whitespace in `language` must be trimmed before sending.
          language: "en",
        },
      }),
    );
    const headers = postJsonRequestMock.mock.calls[0]?.[0]?.headers as Headers;
    expect(headers.get("authorization")).toBe("Bearer sk-openrouter");
    expect(headers.get("http-referer")).toBe("https://openclaw.ai");
    expect(headers.get("x-openrouter-title")).toBe("OpenClaw");
    expect(release).toHaveBeenCalledOnce();
  });

  it("accepts temperature via provider query options", async () => {
    const release = mockSttResponse({ text: "ok" });

    await transcribeOpenRouterAudio({
      buffer: Buffer.from("audio"),
      fileName: "voice.webm",
      apiKey: "sk-openrouter",
      timeoutMs: 5_000,
      query: { temperature: 0.2 },
      fetchFn: fetch,
    });

    expect(postJsonRequestMock).toHaveBeenCalledWith(
      expect.objectContaining({
        body: expect.objectContaining({
          temperature: 0.2,
        }),
      }),
    );
    expect(release).toHaveBeenCalledOnce();
  });

  it("falls back to filename extension when mime is missing", async () => {
    const release = mockSttResponse({ text: "ok" });

    await transcribeOpenRouterAudio({
      buffer: Buffer.from("audio"),
      fileName: "voice.opus",
      apiKey: "sk-openrouter",
      timeoutMs: 5_000,
      fetchFn: fetch,
    });

    expect(postJsonRequestMock).toHaveBeenCalledWith(
      expect.objectContaining({
        body: expect.objectContaining({
          input_audio: expect.objectContaining({ format: "ogg" }),
        }),
      }),
    );
    expect(release).toHaveBeenCalledOnce();
  });

  it("throws when format cannot be resolved", async () => {
    await expect(
      transcribeOpenRouterAudio({
        buffer: Buffer.from("audio"),
        fileName: "voice.bin",
        mime: "application/octet-stream",
        apiKey: "sk-openrouter",
        timeoutMs: 5_000,
        fetchFn: fetch,
      }),
    ).rejects.toThrow("OpenRouter STT could not resolve audio format");
    // Format resolution fails before any HTTP request is attempted.
    expect(postJsonRequestMock).not.toHaveBeenCalled();
  });

  it("throws when provider response omits text", async () => {
    const release = mockSttResponse({});

    await expect(
      transcribeOpenRouterAudio({
        buffer: Buffer.from("audio"),
        fileName: "voice.mp3",
        apiKey: "sk-openrouter",
        timeoutMs: 5_000,
        fetchFn: fetch,
      }),
    ).rejects.toThrow("OpenRouter transcription response missing text");
    // The request handle must be released even when response handling throws.
    expect(release).toHaveBeenCalledOnce();
  });
});

View File

@@ -1,13 +1,161 @@
import path from "node:path";
import {
describeImageWithModel,
describeImagesWithModel,
type AudioTranscriptionRequest,
type AudioTranscriptionResult,
type MediaUnderstandingProvider,
} from "openclaw/plugin-sdk/media-understanding";
import {
assertOkOrThrowHttpError,
postJsonRequest,
requireTranscriptionText,
resolveProviderHttpRequestConfig,
} from "openclaw/plugin-sdk/provider-http";
import { OPENROUTER_BASE_URL } from "./provider-catalog.js";
// Model id used when a transcription request does not name one; also published
// as the provider's default audio model.
const DEFAULT_OPENROUTER_AUDIO_TRANSCRIPTION_MODEL = "openai/whisper-large-v3-turbo";
// File extensions accepted verbatim as the audio format when resolving from a
// file name. NOTE(review): presumably mirrors the formats OpenRouter STT
// accepts — confirm against the OpenRouter API docs.
const SUPPORTED_AUDIO_FORMATS = new Set(["wav", "mp3", "flac", "m4a", "ogg", "webm", "aac"]);
/**
 * Maps an audio MIME type to the audio format name used in the STT request.
 *
 * Matching is case-insensitive and ignores MIME parameters, so
 * `audio/ogg; codecs=opus` resolves the same way as `audio/ogg`.
 *
 * @param mime - MIME type reported for the audio, if any.
 * @returns The format name, or `undefined` when the MIME type is missing,
 *   blank, or not one of the recognized audio types.
 */
function resolveFormatFromMime(mime?: string): string | undefined {
  // Keep only the media type; drop parameters such as "; codecs=opus".
  const mediaType = mime?.split(";", 1)[0]?.trim().toLowerCase();
  if (!mediaType) {
    return undefined;
  }
  switch (mediaType) {
    case "audio/wav":
    case "audio/x-wav":
      return "wav";
    case "audio/mpeg":
    case "audio/mp3":
      return "mp3";
    case "audio/flac":
      return "flac";
    case "audio/mp4":
    case "audio/x-m4a":
      return "m4a";
    case "audio/ogg":
    case "audio/oga":
      return "ogg";
    case "audio/webm":
      return "webm";
    case "audio/aac":
      return "aac";
    default:
      return undefined;
  }
}
/**
 * Derives the audio format name from a file name's extension.
 *
 * The extension is compared case-insensitively. `mpeg` is reported as `mp3`,
 * `oga` and `opus` as `ogg`; any other extension is returned as-is only when it
 * is listed in `SUPPORTED_AUDIO_FORMATS`.
 *
 * @param fileName - File name (or path) of the audio, if any.
 * @returns The format name, or `undefined` when there is no extension or the
 *   extension is not recognized.
 */
function resolveFormatFromFileName(fileName?: string): string | undefined {
  const rawExtension = path.extname(fileName ?? "");
  const extension = rawExtension.trim().toLowerCase().replace(/^\./, "");
  switch (extension) {
    case "":
      return undefined;
    case "mpeg":
      return "mp3";
    case "oga":
    case "opus":
      return "ogg";
    default:
      return SUPPORTED_AUDIO_FORMATS.has(extension) ? extension : undefined;
  }
}
/**
 * Resolves the audio format for an STT request, preferring the MIME type and
 * falling back to the file name extension.
 *
 * @param params - The MIME type and file name reported for the audio.
 * @returns The resolved format name.
 * @throws Error when neither the MIME type nor the file name yields a format.
 */
function resolveOpenRouterAudioFormat(params: { mime?: string; fileName?: string }): string {
  const { mime, fileName } = params;
  // The file name is only consulted when the MIME type gives no answer.
  const format = resolveFormatFromMime(mime) || resolveFormatFromFileName(fileName);
  if (!format) {
    throw new Error(
      `OpenRouter STT could not resolve audio format from mime "${mime ?? ""}" and file "${fileName ?? ""}"`,
    );
  }
  return format;
}
/** Subset of the OpenRouter STT JSON response body that this module reads. */
interface OpenRouterSttResponse {
  /** Transcript text; absent when the response carries no transcript. */
  text?: string;
}
/**
 * Transcribes an audio buffer through OpenRouter's `/audio/transcriptions`
 * endpoint.
 *
 * The request is a JSON body carrying the base64-encoded audio and its format
 * under `input_audio`. `language` is forwarded (trimmed) only when non-blank,
 * and `query.temperature` only when it is a finite number.
 *
 * @param params - Audio bytes plus request options. A blank or missing `model`
 *   falls back to `DEFAULT_OPENROUTER_AUDIO_TRANSCRIPTION_MODEL`.
 * @returns The transcript text and the model id that was requested.
 * @throws Error when the audio format cannot be resolved from `mime` /
 *   `fileName` (before any request is sent), when `assertOkOrThrowHttpError`
 *   rejects the response, or when the response carries no transcript text.
 */
export async function transcribeOpenRouterAudio(
  params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
  const model = params.model?.trim() || DEFAULT_OPENROUTER_AUDIO_TRANSCRIPTION_MODEL;
  const format = resolveOpenRouterAudioFormat({
    mime: params.mime,
    fileName: params.fileName,
  });
  const language = params.language?.trim();
  // JSON.stringify serializes NaN/Infinity as null, so only finite values are sent.
  const temperature = params.query?.temperature;
  const hasTemperature = typeof temperature === "number" && Number.isFinite(temperature);
  const fetchFn = params.fetchFn ?? fetch;
  const { baseUrl, allowPrivateNetwork, headers, dispatcherPolicy } =
    resolveProviderHttpRequestConfig({
      baseUrl: params.baseUrl,
      defaultBaseUrl: OPENROUTER_BASE_URL,
      headers: params.headers,
      request: params.request,
      defaultHeaders: {
        Authorization: `Bearer ${params.apiKey}`,
        "Content-Type": "application/json",
        "HTTP-Referer": "https://openclaw.ai",
        "X-OpenRouter-Title": "OpenClaw",
      },
      provider: "openrouter",
      api: "openrouter-stt",
      capability: "audio",
      transport: "media-understanding",
    });
  const { response, release } = await postJsonRequest({
    url: `${baseUrl}/audio/transcriptions`,
    headers,
    body: {
      model,
      input_audio: {
        data: params.buffer.toString("base64"),
        format,
      },
      ...(language ? { language } : {}),
      ...(hasTemperature ? { temperature } : {}),
    },
    timeoutMs: params.timeoutMs,
    fetchFn,
    allowPrivateNetwork,
    dispatcherPolicy,
    auditContext: "openrouter stt",
  });
  try {
    await assertOkOrThrowHttpError(response, "OpenRouter audio transcription failed");
    // The body is untrusted JSON: it may be `null` or carry a non-string `text`.
    // Normalize those to "no text" so the caller gets the intended error below
    // rather than a TypeError.
    const payload = (await response.json()) as OpenRouterSttResponse | null;
    const text = typeof payload?.text === "string" ? payload.text : undefined;
    return {
      text: requireTranscriptionText(text, "OpenRouter transcription response missing text"),
      model,
    };
  } finally {
    // Always release the request handle, including on error paths.
    await release();
  }
}
/**
 * OpenRouter media-understanding provider: image description plus audio
 * transcription.
 */
export const openrouterMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "openrouter",
  capabilities: ["image", "audio"],
  defaultModels: {
    image: "auto",
    audio: DEFAULT_OPENROUTER_AUDIO_TRANSCRIPTION_MODEL,
  },
  // NOTE(review): presumably positions OpenRouter in the automatic audio
  // provider ordering — confirm how `autoPriority` values are compared.
  autoPriority: {
    audio: 35,
  },
  describeImage: describeImageWithModel,
  describeImages: describeImagesWithModel,
  transcribeAudio: transcribeOpenRouterAudio,
};

View File

@@ -61,9 +61,13 @@
},
"mediaUnderstandingProviderMetadata": {
"openrouter": {
"capabilities": ["image"],
"capabilities": ["image", "audio"],
"defaultModels": {
"image": "auto"
"image": "auto",
"audio": "openai/whisper-large-v3-turbo"
},
"autoPriority": {
"audio": 35
}
}
},

View File

@@ -64,7 +64,11 @@ const mediaMetadataPlugins = vi.hoisted(() => [
},
opencode: { capabilities: ["image"], defaultModels: { image: "gpt-5-nano" } },
"opencode-go": { capabilities: ["image"], defaultModels: { image: "kimi-k2.6" } },
openrouter: { capabilities: ["image"], defaultModels: { image: "auto" } },
openrouter: {
capabilities: ["image", "audio"],
defaultModels: { image: "auto", audio: "openai/whisper-large-v3-turbo" },
autoPriority: { audio: 35 },
},
qwen: { capabilities: ["video"], autoPriority: { video: 20 } },
xai: { capabilities: ["audio"], autoPriority: { audio: 25 } },
zai: { capabilities: ["image"], autoPriority: { image: 60 } },
@@ -111,6 +115,9 @@ describe("resolveDefaultMediaModel", () => {
expect(resolveDefaultMediaModel({ providerId: "openai-codex", capability: "audio" })).toBe(
"gpt-4o-transcribe",
);
expect(resolveDefaultMediaModel({ providerId: "openrouter", capability: "audio" })).toBe(
"openai/whisper-large-v3-turbo",
);
});
it("resolves bundled image defaults beyond the historical core set", () => {
@@ -141,6 +148,7 @@ describe("resolveAutoMediaKeyProviders", () => {
"openai",
"openai-codex",
"xai",
"openrouter",
"google",
"mistral",
]);