feat(webchat): add server-side dictation

This commit is contained in:
clawsweeper
2026-05-02 21:28:52 +00:00
parent ff758197f3
commit aeb50ce277
3 changed files with 23 additions and 14 deletions

View File

@@ -427,16 +427,14 @@ function isNpmExecPath(value: string): boolean {
return /^npm(?:-cli)?(?:\.(?:c?js|cmd|exe))?$/.test(basename(value).toLowerCase());
}
export function resolveNpmCommandInvocation(
params: {
npmExecPath?: string;
nodeExecPath?: string;
platform?: NodeJS.Platform;
} = {},
): { command: string; args: string[] } {
const npmExecPath = params.npmExecPath ?? process.env.npm_execpath;
const nodeExecPath = params.nodeExecPath ?? process.execPath;
const npmCommand = (params.platform ?? process.platform) === "win32" ? "npm.cmd" : "npm";
export function resolveNpmCommandInvocation(params?: {
npmExecPath?: string;
nodeExecPath?: string;
platform?: NodeJS.Platform;
}): { command: string; args: string[] } {
const npmExecPath = params === undefined ? process.env.npm_execpath : params.npmExecPath;
const nodeExecPath = params?.nodeExecPath ?? process.execPath;
const npmCommand = (params?.platform ?? process.platform) === "win32" ? "npm.cmd" : "npm";
if (typeof npmExecPath === "string" && npmExecPath.length > 0 && isNpmExecPath(npmExecPath)) {
return { command: nodeExecPath, args: [npmExecPath] };

View File

@@ -0,0 +1 @@
// Re-exports `transcribeAudioFile` from the media-understanding runtime.
// NOTE(review): presumably this thin module exists so the gateway handler can
// pull the runtime in through a dynamic `import()` — confirm against the caller.
export { transcribeAudioFile } from "../../media-understanding/runtime.js";

View File

@@ -1,7 +1,6 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { transcribeAudioFile } from "../../media-understanding/runtime.js";
import { extensionForMime, normalizeMimeType } from "../../media/mime.js";
import { normalizeOptionalString } from "../../shared/string-coerce.js";
import { ErrorCodes, errorShape } from "../protocol/index.js";
@@ -9,6 +8,18 @@ import { MAX_PAYLOAD_BYTES } from "../server-constants.js";
import { formatForLog } from "../ws-log.js";
import type { GatewayRequestHandlers } from "./types.js";
/**
 * Shape of the `./chat-transcribe-audio.runtime.js` module namespace.
 * `typeof import(...)` is a type-only reference, so it emits no runtime import.
 */
type ChatTranscribeAudioRuntime = typeof import("./chat-transcribe-audio.runtime.js");
/** Resolved (awaited) return value of the runtime module's `transcribeAudioFile`. */
type TranscribeAudioFileResult = Awaited<
ReturnType<ChatTranscribeAudioRuntime["transcribeAudioFile"]>
>;
/** Cached in-flight or settled import of the runtime module; `null` until first requested. */
let chatTranscribeAudioRuntimePromise: Promise<ChatTranscribeAudioRuntime> | null = null;

/**
 * Returns the promise for the `./chat-transcribe-audio.runtime.js` module,
 * starting the dynamic import on the first call and handing back the same
 * promise on every later call.
 *
 * @returns The shared module-namespace promise.
 */
function loadChatTranscribeAudioRuntime(): Promise<ChatTranscribeAudioRuntime> {
  if (chatTranscribeAudioRuntimePromise == null) {
    // First request: kick off the import and remember the promise itself, so
    // concurrent callers share a single load.
    chatTranscribeAudioRuntimePromise = import("./chat-transcribe-audio.runtime.js");
  }
  return chatTranscribeAudioRuntimePromise;
}
// 64 KiB of headroom subtracted from MAX_PAYLOAD_BYTES when deriving the audio
// size cap.
// NOTE(review): presumably reserved for the JSON framing of the WebSocket
// request around the encoded audio — confirm.
const CHAT_TRANSCRIBE_AUDIO_WS_JSON_OVERHEAD_BYTES = 64 * 1024;
export const MAX_CHAT_TRANSCRIBE_AUDIO_BYTES = Math.floor(
((MAX_PAYLOAD_BYTES - CHAT_TRANSCRIBE_AUDIO_WS_JSON_OVERHEAD_BYTES) * 3) / 4,
@@ -45,9 +56,7 @@ function extensionForAudioMime(mime?: string): string {
return extensionForMime(mime) ?? ".audio";
}
function isMissingMediaUnderstandingProvider(
result: Awaited<ReturnType<typeof transcribeAudioFile>>,
) {
function isMissingMediaUnderstandingProvider(result: TranscribeAudioFileResult) {
const decision = result.decision;
return (
decision?.outcome === "skipped" &&
@@ -86,6 +95,7 @@ export const chatTranscribeAudioHandlers: GatewayRequestHandlers = {
const filePath = path.join(tmpDir, `dictation${extensionForAudioMime(decoded.mime)}`);
try {
await fs.writeFile(filePath, decoded.data);
const { transcribeAudioFile } = await loadChatTranscribeAudioRuntime();
const result = await transcribeAudioFile({
filePath,
cfg: context.getRuntimeConfig(),