refactor: route capability runtime through channel stores

2026-04-11 01:01:13 +00:00 · 2026-04-05 09:07:00 +01:00
parent 1903be5401
commit b57372d665
10 changed files with 44 additions and 68 deletions
--- a/docs/.generated/plugin-sdk-api-baseline.sha256
+++ b/docs/.generated/plugin-sdk-api-baseline.sha256
@@ -1,2 +1,2 @@
-884e6fd12b7a8086a11f547e15201f46dea0f2dc46735fad055d4f1b96d5fb82  plugin-sdk-api-baseline.json
-100f6b29793abf858f94cb8c292afc0dc56573f4e264d27496a96e17f8de4c1e  plugin-sdk-api-baseline.jsonl
+cbffdf76d6a7254d8b2d3a601e1206d7b6c835bc44f170d4038bc711a35ef756  plugin-sdk-api-baseline.json
+fe026bf3ba1e3b55f6c0b560d76940f3c301d8f593d6f0f6dcc4625745c76d31  plugin-sdk-api-baseline.jsonl
--- a/docs/plugins/sdk-migration.md
+++ b/docs/plugins/sdk-migration.md
@@ -49,7 +49,7 @@ is a small, self-contained module with a clear purpose and documented contract.
 Legacy provider convenience seams for bundled channels are also gone. Imports
 such as `openclaw/plugin-sdk/slack`, `openclaw/plugin-sdk/discord`,
 `openclaw/plugin-sdk/signal`, `openclaw/plugin-sdk/whatsapp`,
-`openclaw/plugin-sdk/whatsapp-surface`, and
+other channel-branded helper seams, and
 `openclaw/plugin-sdk/telegram-core` were private mono-repo shortcuts, not
 stable plugin contracts. Use narrow generic SDK subpaths instead. Inside the
 bundled plugin workspace, keep provider-owned helpers in that plugin's own
@@ -255,17 +255,16 @@ Current bundled provider examples:
  | `plugin-sdk/provider-stream` | Provider stream wrapper helpers | `ProviderStreamFamily`, `buildProviderStreamFamilyHooks`, `composeProviderStreamWrappers`, stream wrapper types, and shared Anthropic/Bedrock/Google/Kilocode/Moonshot/OpenAI/OpenRouter/Z.A.I/MiniMax/Copilot wrapper helpers |
  | `plugin-sdk/keyed-async-queue` | Ordered async queue | `KeyedAsyncQueue` |
  | `plugin-sdk/media-runtime` | Shared media helpers | Media fetch/transform/store helpers plus media payload builders |
-  | `plugin-sdk/media-understanding-runtime` | Media-understanding runtime facade | Media-understanding runner facade and typed result helpers |
+  | `plugin-sdk/media-understanding` | Media-understanding helpers | Media understanding provider types plus provider-facing image/audio helper exports |
  | `plugin-sdk/text-runtime` | Shared text helpers | Assistant-visible-text stripping, markdown render/chunking/table helpers, redaction helpers, directive-tag helpers, safe-text utilities, and related text/logging helpers |
  | `plugin-sdk/text-chunking` | Text chunking helpers | Outbound text chunking helper |
-  | `plugin-sdk/speech-runtime` | Speech runtime facade | TTS resolution and synthesis helpers |
+  | `plugin-sdk/speech` | Speech helpers | Speech provider types plus provider-facing directive, registry, and validation helpers |
  | `plugin-sdk/speech-core` | Shared speech core | Speech provider types, registry, directives, normalization |
  | `plugin-sdk/realtime-transcription` | Realtime transcription helpers | Provider types and registry helpers |
  | `plugin-sdk/realtime-voice` | Realtime voice helpers | Provider types and registry helpers |
  | `plugin-sdk/image-generation-core` | Shared image-generation core | Image-generation types, failover, auth, and registry helpers |
-  | `plugin-sdk/video-generation` | Video-generation provider types | Video-generation provider/request/result types for provider plugins |
+  | `plugin-sdk/video-generation` | Video-generation helpers | Video-generation provider/request/result types |
  | `plugin-sdk/video-generation-core` | Shared video-generation core | Video-generation types, failover helpers, provider lookup, and model-ref parsing |
-  | `plugin-sdk/video-generation-runtime` | Video-generation runtime facade | Shared runtime `generateVideo` / `listRuntimeVideoGenerationProviders` facade |
  | `plugin-sdk/interactive-runtime` | Interactive reply helpers | Interactive reply payload normalization/reduction |
  | `plugin-sdk/channel-config-primitives` | Channel config primitives | Narrow channel config-schema primitives |
  | `plugin-sdk/channel-config-writes` | Channel config-write helpers | Channel config-write authorization helpers |
@@ -314,21 +313,10 @@ The same rule applies to other generated bundled-helper families such as:
 - LINE: `plugin-sdk/line*`
 - IRC: `plugin-sdk/irc*`
 - bundled helper/plugin surfaces like `plugin-sdk/googlechat`,
-  `plugin-sdk/whatsapp-surface`, `plugin-sdk/zalouser`,
-  `plugin-sdk/bluebubbles*`,
+  `plugin-sdk/zalouser`, `plugin-sdk/bluebubbles*`,
  `plugin-sdk/mattermost*`, `plugin-sdk/msteams`,
  `plugin-sdk/nextcloud-talk`, `plugin-sdk/nostr`, `plugin-sdk/tlon`,
-  `plugin-sdk/twitch`, `plugin-sdk/openai`, `plugin-sdk/moonshot`,
-  `plugin-sdk/qwen*`, `plugin-sdk/modelstudio*`,
-  `plugin-sdk/provider-moonshot`,
-  `plugin-sdk/cloudflare-ai-gateway`, `plugin-sdk/byteplus`,
-  `plugin-sdk/chutes`, `plugin-sdk/deepseek`, `plugin-sdk/google`,
-  `plugin-sdk/huggingface`, `plugin-sdk/kimi-coding`,
-  `plugin-sdk/kilocode`, `plugin-sdk/minimax`, `plugin-sdk/mistral`,
-  `plugin-sdk/nvidia`, `plugin-sdk/opencode`,
-  `plugin-sdk/opencode-go`, `plugin-sdk/qianfan`, `plugin-sdk/sglang`,
-  `plugin-sdk/synthetic`, `plugin-sdk/venice`, `plugin-sdk/vllm`,
-  `plugin-sdk/xai`, `plugin-sdk/volcengine`,
+  `plugin-sdk/twitch`,
  `plugin-sdk/github-copilot-login`, `plugin-sdk/github-copilot-token`,
  `plugin-sdk/diagnostics-otel`, `plugin-sdk/diffs`, `plugin-sdk/llm-task`,
  `plugin-sdk/thread-ownership`, and `plugin-sdk/voice-call`
@@ -337,15 +325,6 @@ The same rule applies to other generated bundled-helper families such as:
 surface `DEFAULT_COPILOT_API_BASE_URL`,
 `deriveCopilotApiBaseUrlFromToken`, and `resolveCopilotApiToken`.

-`plugin-sdk/whatsapp-surface` currently exposes `DEFAULT_WEB_MEDIA_BYTES`,
-WhatsApp auth/account helpers, directory-config helpers, group-policy helpers,
-outbound-target resolution, and the narrow `WebChannelStatus` /
-`WebInboundMessage` / `WebListenerCloseReason` / `WebMonitorTuning` types.
-
-For Qwen specifically, prefer the canonical `plugin-sdk/qwen` and
-`plugin-sdk/qwen-definitions` seams. `plugin-sdk/modelstudio*` remains
-exported as a compatibility alias for older plugin code.
-
 Use the narrowest import that matches the job. If you cannot find an export,
 check the source at `src/plugin-sdk/` or ask in Discord.

--- a/docs/plugins/sdk-overview.md
+++ b/docs/plugins/sdk-overview.md
@@ -38,7 +38,7 @@ the broader umbrella surface and shared helpers such as
 Do not add or depend on provider-named convenience seams such as
 `openclaw/plugin-sdk/slack`, `openclaw/plugin-sdk/discord`,
 `openclaw/plugin-sdk/signal`, `openclaw/plugin-sdk/whatsapp`, or
-`openclaw/plugin-sdk/whatsapp-surface`. Bundled plugins should compose generic
+other channel-branded helper seams. Bundled plugins should compose generic
 SDK subpaths inside their own `api.ts` or `runtime-api.ts` barrels, and core
 should either use those plugin-local barrels or add a narrow generic SDK
 contract when the need is truly cross-channel.
@@ -224,20 +224,17 @@ explicitly promotes one as public.
    | Subpath | Key exports |
    | --- | --- |
    | `plugin-sdk/media-runtime` | Shared media fetch/transform/store helpers plus media payload builders |
-    | `plugin-sdk/media-understanding-runtime` | Media-understanding runner facade and typed result helpers |
+    | `plugin-sdk/media-understanding` | Media understanding provider types plus provider-facing image/audio helper exports |
    | `plugin-sdk/text-runtime` | Shared text/markdown/logging helpers such as assistant-visible-text stripping, markdown render/chunking/table helpers, redaction helpers, directive-tag helpers, and safe-text utilities |
    | `plugin-sdk/text-chunking` | Outbound text chunking helper |
-    | `plugin-sdk/speech-runtime` | Speech-core runtime facade for TTS resolution and synthesis |
+    | `plugin-sdk/speech` | Speech provider types plus provider-facing directive, registry, and validation helpers |
    | `plugin-sdk/speech-core` | Shared speech provider types, registry, directive, and normalization helpers |
    | `plugin-sdk/realtime-transcription` | Realtime transcription provider types and registry helpers |
    | `plugin-sdk/realtime-voice` | Realtime voice provider types and registry helpers |
    | `plugin-sdk/image-generation` | Image generation provider types |
    | `plugin-sdk/image-generation-core` | Shared image-generation types, failover, auth, and registry helpers |
-    | `plugin-sdk/video-generation` | Video generation provider types |
+    | `plugin-sdk/video-generation` | Video generation provider/request/result types |
    | `plugin-sdk/video-generation-core` | Shared video-generation types, failover helpers, provider lookup, and model-ref parsing |
-    | `plugin-sdk/video-generation-runtime` | Shared runtime `generateVideo` / `listRuntimeVideoGenerationProviders` facade |
-    | `plugin-sdk/media-understanding` | Media understanding provider types |
-    | `plugin-sdk/speech` | Speech provider types |
    | `plugin-sdk/webhook-targets` | Webhook target registry and route-install helpers |
    | `plugin-sdk/webhook-path` | Webhook path normalization helpers |
    | `plugin-sdk/web-media` | Shared remote/local media loading helpers |
@@ -267,12 +264,11 @@ explicitly promotes one as public.
  <Accordion title="Reserved bundled-helper subpaths">
    | Family | Current generated subpaths | Intended use |
    | --- | --- | --- |
-    | Browser | `plugin-sdk/browser`, `plugin-sdk/browser-runtime`, `plugin-sdk/browser-config-support`, `plugin-sdk/browser-support` | Bundled browser plugin maintenance and compatibility |
+    | Browser | `plugin-sdk/browser`, `plugin-sdk/browser-config-support`, `plugin-sdk/browser-support` | Bundled browser plugin maintenance and compatibility |
    | Matrix | `plugin-sdk/matrix`, `plugin-sdk/matrix-helper`, `plugin-sdk/matrix-runtime-heavy`, `plugin-sdk/matrix-runtime-shared`, `plugin-sdk/matrix-runtime-surface`, `plugin-sdk/matrix-surface`, `plugin-sdk/matrix-thread-bindings` | Bundled Matrix helper/runtime surface |
    | Line | `plugin-sdk/line`, `plugin-sdk/line-core`, `plugin-sdk/line-runtime`, `plugin-sdk/line-surface` | Bundled LINE helper/runtime surface |
    | IRC | `plugin-sdk/irc`, `plugin-sdk/irc-surface` | Bundled IRC helper surface |
-    | Channel-specific helpers | `plugin-sdk/googlechat`, `plugin-sdk/whatsapp-surface`, `plugin-sdk/zalouser`, `plugin-sdk/bluebubbles`, `plugin-sdk/bluebubbles-policy`, `plugin-sdk/mattermost`, `plugin-sdk/mattermost-policy`, `plugin-sdk/feishu-conversation`, `plugin-sdk/msteams`, `plugin-sdk/nextcloud-talk`, `plugin-sdk/nostr`, `plugin-sdk/tlon`, `plugin-sdk/twitch` | Bundled channel compatibility/helper seams. `plugin-sdk/whatsapp-surface` currently exports `DEFAULT_WEB_MEDIA_BYTES`, WhatsApp auth/account helpers, directory-config helpers, group-policy helpers, outbound-target resolution, and the narrow `WebChannelStatus` / `WebInboundMessage` / `WebListenerCloseReason` / `WebMonitorTuning` types. |
-    | Provider-specific helpers | `plugin-sdk/openai`, `plugin-sdk/moonshot`, `plugin-sdk/qwen`, `plugin-sdk/qwen-definitions`, `plugin-sdk/modelstudio`, `plugin-sdk/modelstudio-definitions`, `plugin-sdk/provider-moonshot`, `plugin-sdk/together`, `plugin-sdk/amazon-bedrock`, `plugin-sdk/anthropic-vertex`, `plugin-sdk/cloudflare-ai-gateway`, `plugin-sdk/byteplus`, `plugin-sdk/chutes`, `plugin-sdk/deepseek`, `plugin-sdk/google`, `plugin-sdk/huggingface`, `plugin-sdk/kimi-coding`, `plugin-sdk/kilocode`, `plugin-sdk/minimax`, `plugin-sdk/mistral`, `plugin-sdk/nvidia`, `plugin-sdk/opencode`, `plugin-sdk/opencode-go`, `plugin-sdk/qianfan`, `plugin-sdk/sglang`, `plugin-sdk/synthetic`, `plugin-sdk/venice`, `plugin-sdk/vllm`, `plugin-sdk/xai`, `plugin-sdk/volcengine` | Bundled provider-specific helper seams; prefer canonical `qwen*`, keep `modelstudio*` as compatibility aliases |
+    | Channel-specific helpers | `plugin-sdk/googlechat`, `plugin-sdk/zalouser`, `plugin-sdk/bluebubbles`, `plugin-sdk/bluebubbles-policy`, `plugin-sdk/mattermost`, `plugin-sdk/mattermost-policy`, `plugin-sdk/feishu-conversation`, `plugin-sdk/msteams`, `plugin-sdk/nextcloud-talk`, `plugin-sdk/nostr`, `plugin-sdk/tlon`, `plugin-sdk/twitch` | Bundled channel compatibility/helper seams |
    | Auth/plugin-specific helpers | `plugin-sdk/github-copilot-login`, `plugin-sdk/github-copilot-token`, `plugin-sdk/diagnostics-otel`, `plugin-sdk/diffs`, `plugin-sdk/llm-task`, `plugin-sdk/thread-ownership`, `plugin-sdk/voice-call` | Bundled feature/plugin helper seams; `plugin-sdk/github-copilot-token` currently exports `DEFAULT_COPILOT_API_BASE_URL`, `deriveCopilotApiBaseUrlFromToken`, and `resolveCopilotApiToken` |
  </Accordion>
 </AccordionGroup>
--- a/extensions/discord/src/voice/manager.e2e.test.ts
+++ b/extensions/discord/src/voice/manager.e2e.test.ts
@@ -9,6 +9,7 @@ const {
  resolveAgentRouteMock,
  agentCommandMock,
  transcribeAudioFileMock,
+  textToSpeechMock,
 } = vi.hoisted(() => {
  type EventHandler = (...args: unknown[]) => unknown;
  type MockConnection = {
@@ -66,6 +67,7 @@ const {
    resolveAgentRouteMock: vi.fn(() => ({ agentId: "agent-1", sessionKey: "discord:g1:c1" })),
    agentCommandMock: vi.fn(async (_opts?: unknown, _runtime?: unknown) => ({ payloads: [] })),
    transcribeAudioFileMock: vi.fn(async () => ({ text: "hello from voice" })),
+    textToSpeechMock: vi.fn(async () => ({ success: true, audioPath: "/tmp/voice.mp3" })),
  };
 });

@@ -107,8 +109,15 @@ vi.mock("openclaw/plugin-sdk/agent-runtime", async () => {
  };
 });

-vi.mock("openclaw/plugin-sdk/media-understanding-runtime", () => ({
-  transcribeAudioFile: transcribeAudioFileMock,
+vi.mock("../runtime.js", () => ({
+  getDiscordRuntime: () => ({
+    mediaUnderstanding: {
+      transcribeAudioFile: transcribeAudioFileMock,
+    },
+    tts: {
+      textToSpeech: textToSpeechMock,
+    },
+  }),
 }));

 let managerModule: typeof import("./manager.js");
@@ -157,6 +166,8 @@ describe("DiscordVoiceManager", () => {
    agentCommandMock.mockResolvedValue({ payloads: [] });
    transcribeAudioFileMock.mockReset();
    transcribeAudioFileMock.mockResolvedValue({ text: "hello from voice" });
+    textToSpeechMock.mockReset();
+    textToSpeechMock.mockResolvedValue({ success: true, audioPath: "/tmp/voice.mp3" });
  });

  const createManager = (
--- a/extensions/discord/src/voice/manager.ts
+++ b/extensions/discord/src/voice/manager.ts
@@ -10,18 +10,17 @@ import { agentCommandFromIngress } from "openclaw/plugin-sdk/agent-runtime";
 import { resolveTtsConfig, type ResolvedTtsConfig } from "openclaw/plugin-sdk/agent-runtime";
 import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime";
 import type { DiscordAccountConfig, TtsConfig } from "openclaw/plugin-sdk/config-runtime";
-import { transcribeAudioFile } from "openclaw/plugin-sdk/media-understanding-runtime";
 import { resolveAgentRoute } from "openclaw/plugin-sdk/routing";
 import { logVerbose, shouldLogVerbose } from "openclaw/plugin-sdk/runtime-env";
 import { createSubsystemLogger } from "openclaw/plugin-sdk/runtime-env";
 import type { RuntimeEnv } from "openclaw/plugin-sdk/runtime-env";
 import { parseTtsDirectives } from "openclaw/plugin-sdk/speech";
-import { textToSpeech } from "openclaw/plugin-sdk/speech-runtime";
 import { formatErrorMessage } from "openclaw/plugin-sdk/ssrf-runtime";
 import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/temp-path";
 import { formatMention } from "../mentions.js";
 import { normalizeDiscordSlug, resolveDiscordOwnerAccess } from "../monitor/allow-list.js";
 import { formatDiscordUserTag } from "../monitor/format.js";
+import { getDiscordRuntime } from "../runtime.js";
 import { authorizeDiscordVoiceIngress } from "./access.js";
 import { loadDiscordVoiceSdk } from "./sdk-runtime.js";

@@ -226,7 +225,7 @@ async function transcribeAudio(params: {
  agentId: string;
  filePath: string;
 }): Promise<string | undefined> {
-  const result = await transcribeAudioFile({
+  const result = await getDiscordRuntime().mediaUnderstanding.transcribeAudioFile({
    filePath: params.filePath,
    cfg: params.cfg,
    agentDir: resolveAgentDir(params.cfg, params.agentId),
@@ -703,7 +702,7 @@ export class DiscordVoiceManager {
      return;
    }

-    const ttsResult = await textToSpeech({
+    const ttsResult = await getDiscordRuntime().tts.textToSpeech({
      text: speakText,
      cfg: ttsCfg,
      channel: "discord",
--- a/extensions/openai/openai.live.test.ts
+++ b/extensions/openai/openai.live.test.ts
@@ -4,10 +4,10 @@ import path from "node:path";
 import { getModel } from "@mariozechner/pi-ai";
 import { AuthStorage, ModelRegistry } from "@mariozechner/pi-coding-agent";
 import OpenAI from "openai";
+import type { ResolvedTtsConfig } from "openclaw/plugin-sdk/agent-runtime";
 import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime";
 import { loadConfig } from "openclaw/plugin-sdk/config-runtime";
 import { encodePngRgba, fillPixel } from "openclaw/plugin-sdk/media-runtime";
-import type { ResolvedTtsConfig } from "openclaw/plugin-sdk/speech-runtime";
 import { describe, expect, it } from "vitest";
 import {
  registerProviderPlugin,
--- a/extensions/qqbot/src/reply-dispatcher.ts
+++ b/extensions/qqbot/src/reply-dispatcher.ts
@@ -2,7 +2,6 @@ import crypto from "node:crypto";
 import fs from "node:fs";
 import path from "node:path";
 import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime";
-import { textToSpeech as globalTextToSpeech } from "openclaw/plugin-sdk/speech-runtime";
 import {
  getAccessToken,
  sendC2CMessage,
@@ -19,6 +18,7 @@ import {
  sendC2CFileMessage,
  sendGroupFileMessage,
 } from "./api.js";
+import { getQQBotRuntime } from "./runtime.js";
 import type { ResolvedQQBotAccount } from "./types.js";
 import {
  isGlobalTTSAvailable,
@@ -387,7 +387,7 @@ async function handleAudioPayload(ctx: ReplyContext, payload: MediaPayload): Pro
        return;
      }
      log?.info(`[qqbot:${account.accountId}] TTS (global fallback): "${ttsText.slice(0, 50)}..."`);
-      const globalResult = await globalTextToSpeech({
+      const globalResult = await getQQBotRuntime().tts.textToSpeech({
        text: ttsText,
        cfg: cfg as OpenClawConfig,
        channel: "qqbot",
--- a/extensions/telegram/src/sticker-cache.test.ts
+++ b/extensions/telegram/src/sticker-cache.test.ts
@@ -16,8 +16,12 @@ vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
  resolveDefaultMediaModel: vi.fn(() => "gpt-4.1-mini"),
 }));

-vi.mock("openclaw/plugin-sdk/media-understanding-runtime", () => ({
-  describeImageFileWithModel: vi.fn(),
+vi.mock("./runtime.js", () => ({
+  getTelegramRuntime: () => ({
+    mediaUnderstanding: {
+      describeImageFileWithModel: vi.fn(),
+    },
+  }),
 }));

 const TEST_CACHE_DIR = "/tmp/openclaw-test-sticker-cache/telegram";
--- a/extensions/telegram/src/sticker-cache.ts
+++ b/extensions/telegram/src/sticker-cache.ts
@@ -14,9 +14,9 @@ import {
  resolveAutoMediaKeyProviders,
  resolveDefaultMediaModel,
 } from "openclaw/plugin-sdk/media-runtime";
-import { describeImageFileWithModel } from "openclaw/plugin-sdk/media-understanding-runtime";
 import { logVerbose } from "openclaw/plugin-sdk/runtime-env";
 import { STATE_DIR } from "openclaw/plugin-sdk/state-paths";
+import { getTelegramRuntime } from "./runtime.js";

 const CACHE_FILE = path.join(STATE_DIR, "telegram", "sticker-cache.json");
 const CACHE_VERSION = 1;
@@ -246,7 +246,7 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi
  logVerbose(`telegram: describing sticker with ${provider}/${model}`);

  try {
-    const result = await describeImageFileWithModel({
+    const result = await getTelegramRuntime().mediaUnderstanding.describeImageFileWithModel({
      filePath: imagePath,
      mime: "image/webp",
      cfg,
--- a/src/plugin-sdk/speech.ts
+++ b/src/plugin-sdk/speech.ts
@@ -1,12 +1,10 @@
 import { rmSync } from "node:fs";
-import type { OpenClawConfig } from "../config/config.js";
-import type { ResolvedTtsConfig } from "../tts/tts.js";

 // Public speech helpers for bundled or third-party plugins.
 //
-// Keep this surface neutral and import-light. Provider builders commonly import
-// this module just to get types and a few validation helpers, so avoid pulling
-// in the heavy TTS runtime graph at module load time.
+// Keep this surface provider-facing: types, validation, directive parsing, and
+// registry helpers. Runtime synthesis lives on `api.runtime.tts` or narrower
+// core/runtime seams, not here.

 export type { SpeechProviderPlugin } from "../plugins/types.js";
 export type {
@@ -98,14 +96,3 @@ export function scheduleCleanup(
  }, delayMs);
  timer.unref();
 }
-
-export async function summarizeText(params: {
-  text: string;
-  targetLength: number;
-  cfg: OpenClawConfig;
-  config: ResolvedTtsConfig;
-  timeoutMs: number;
-}) {
-  const { summarizeText: summarizeTextRuntime } = await import("../tts/tts-core.js");
-  return summarizeTextRuntime(params);
-}