fix(tts): keep final webchat audio supplemental
@@ -16,6 +16,7 @@ Docs: https://docs.openclaw.ai

### Fixes

- Gateway/startup: keep hot Gateway boot paths on leaf config imports and add max-RSS reporting to the gateway startup bench so low-memory startup regressions are visible before release. Thanks @vincentkoc. (See the max-RSS sketch after this list.)
- WebChat/TTS: persist automatic final-mode TTS audio as a supplemental audio-only transcript update instead of adding a second assistant message with the same visible text. Fixes #72830. Thanks @lhtpluto. (See the supplement-shape sketch after this list.)
- Agents/LSP: terminate bundled stdio LSP process trees during runtime disposal and Gateway shutdown, so nested children such as `tsserver` do not survive stop or restart. Fixes #72357. Thanks @ai-hpc and @bittoby. (See the process-group sketch after this list.)
- Diagnostics/OTEL: capture privacy-safe model-call request payload bytes, streamed response bytes, first-response latency, and total duration in diagnostic events, plugin hooks, stability snapshots, and OTEL model-call spans/metrics without logging raw model content. Fixes #33832. Thanks @wwh830. (See the span-attribute sketch after this list.)
- Logging: write validated diagnostic trace context as top-level `traceId`, `spanId`, `parentSpanId`, and `traceFlags` fields in file-log JSONL records so traced requests and model calls are easier to correlate in log processors. Refs #40353. Thanks @liangruochong44-ui. (See the JSONL sketch after this list.)
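
A minimal sketch of max-RSS reporting around a startup path, assuming Node's `process.resourceUsage()` (available since Node 12.6); `bootGateway` is a hypothetical stand-in for the real bench target:

```ts
// Minimal sketch, not the repo's bench harness.
async function benchStartup(bootGateway: () => Promise<void>): Promise<void> {
  const startedAt = performance.now();
  await bootGateway();
  const elapsedMs = performance.now() - startedAt;
  // resourceUsage().maxRSS is the peak resident set size; units are
  // platform-dependent (kilobytes on Linux, bytes on macOS).
  const { maxRSS } = process.resourceUsage();
  console.log(`startup: ${elapsedMs.toFixed(1)} ms, maxRSS: ${maxRSS}`);
}
```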
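
For the TTS fix, a sketch of the audio-only supplement the webchat transcript receives; the derived `:assistant-media` idempotency-key suffix and the "Audio reply" placeholder label come from the tests in the diff below, while the builder and types here are illustrative:

```ts
// Illustrative types; the real transcript-update schema lives in the repo.
type AssistantContentBlock =
  | { type: "text"; text: string }
  | { type: "audio"; source: { type: "base64"; media_type: string; data: string } };

function buildAudioOnlySupplement(idempotencyKey: string, audioBase64: string) {
  return {
    role: "assistant" as const,
    // Derived key, matching the `idem-agent-tts:assistant-media` assertion below.
    idempotencyKey: `${idempotencyKey}:assistant-media`,
    content: [
      // Placeholder label only; the spoken text already exists in the transcript.
      { type: "text", text: "Audio reply" },
      { type: "audio", source: { type: "base64", media_type: "audio/mpeg", data: audioBase64 } },
    ] satisfies AssistantContentBlock[],
  };
}
```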
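
For the LSP fix, a general sketch of process-group termination in Node (the repo's actual shutdown path may differ): spawn the server detached so it owns a POSIX process group, then signal the whole group, or use `taskkill /T` on Windows:

```ts
import { spawn, type ChildProcess } from "node:child_process";

function startStdioLsp(command: string, args: string[]): ChildProcess {
  // detached: true gives the child its own process group on POSIX.
  return spawn(command, args, { stdio: "pipe", detached: process.platform !== "win32" });
}

function killProcessTree(child: ChildProcess): void {
  if (child.pid === undefined) return;
  if (process.platform === "win32") {
    // /T terminates the child and its descendants, /F forces it.
    spawn("taskkill", ["/pid", String(child.pid), "/T", "/F"]);
  } else {
    // Signalling the negative pid targets the entire process group,
    // so nested children such as tsserver go down with the server.
    process.kill(-child.pid, "SIGTERM");
  }
}
```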
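
For the OTEL fix, a hedged sketch with `@opentelemetry/api`; the span and attribute names are illustrative rather than openclaw's actual schema, and streamed first-byte latency is elided for brevity. Only byte counts and timings are recorded, never raw model content:

```ts
import { trace } from "@opentelemetry/api";

const tracer = trace.getTracer("model-call");

async function instrumentedModelCall(
  requestBody: Uint8Array,
  call: () => Promise<Uint8Array>,
): Promise<Uint8Array> {
  return tracer.startActiveSpan("model.call", async (span) => {
    const startedAt = Date.now();
    try {
      const response = await call();
      // Privacy-safe measurements: sizes and durations only.
      span.setAttribute("model.request.bytes", requestBody.byteLength);
      span.setAttribute("model.response.bytes", response.byteLength);
      span.setAttribute("model.duration_ms", Date.now() - startedAt);
      return response;
    } finally {
      span.end();
    }
  });
}
```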
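
For the logging fix, a sketch of the JSONL record shape with validated top-level trace fields; the id formats are the W3C trace-context ones (32 and 16 lowercase hex characters), while the writer itself is illustrative:

```ts
import { appendFileSync } from "node:fs";

const TRACE_ID = /^[0-9a-f]{32}$/; // W3C trace-context trace id
const SPAN_ID = /^[0-9a-f]{16}$/; // W3C trace-context span id

type TraceContext = { traceId?: string; spanId?: string; parentSpanId?: string; traceFlags?: string };

function writeJsonlRecord(path: string, record: Record<string, unknown>, ctx?: TraceContext): void {
  const line: Record<string, unknown> = { ...record };
  // Promote trace context to top-level fields only when it validates,
  // so malformed ids never pollute the log schema.
  if (ctx?.traceId && TRACE_ID.test(ctx.traceId) && ctx.spanId && SPAN_ID.test(ctx.spanId)) {
    line.traceId = ctx.traceId;
    line.spanId = ctx.spanId;
    if (ctx.parentSpanId && SPAN_ID.test(ctx.parentSpanId)) line.parentSpanId = ctx.parentSpanId;
    if (ctx.traceFlags !== undefined) line.traceFlags = ctx.traceFlags;
  }
  appendFileSync(path, JSON.stringify(line) + "\n");
}
```
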
@@ -156,6 +156,7 @@ async function expectTtsPayloadResult(params: {
    expect(synthesizeMock).toHaveBeenCalledWith(expect.objectContaining({ target: params.target }));
    expect(result.audioAsVoice).toBe(params.audioAsVoice);
    expect(result.mediaUrl).toMatch(new RegExp(`voice-\\d+\\.${params.mediaExtension ?? "ogg"}$`));
    expect(result.spokenText).toBe(params.text);

    mediaDir = result.mediaUrl ? path.dirname(result.mediaUrl) : undefined;
  } finally {

@@ -1591,6 +1591,7 @@ export async function maybeApplyTtsToPayload(params: {
      ...nextPayload,
      mediaUrl: result.audioPath,
      audioAsVoice: result.audioAsVoice || params.payload.audioAsVoice,
      spokenText: textForAudio,
    };
  }

@@ -23,6 +23,10 @@ const mockState = vi.hoisted(() => ({
  finalPayload: null as {
    text?: string;
    mediaUrl?: string;
    mediaUrls?: string[];
    spokenText?: string;
    audioAsVoice?: boolean;
    trustedLocalMedia?: boolean;
    sensitiveMedia?: boolean;
    replyToId?: string;
    replyToCurrent?: boolean;
@@ -34,6 +38,8 @@ const mockState = vi.hoisted(() => ({
    text?: string;
    mediaUrl?: string;
    mediaUrls?: string[];
    spokenText?: string;
    audioAsVoice?: boolean;
    trustedLocalMedia?: boolean;
    replyToId?: string;
    replyToCurrent?: boolean;
@@ -113,6 +119,10 @@ vi.mock("../../auto-reply/dispatch.js", () => ({
  sendFinalReply: (payload: {
    text?: string;
    mediaUrl?: string;
    mediaUrls?: string[];
    spokenText?: string;
    audioAsVoice?: boolean;
    trustedLocalMedia?: boolean;
    sensitiveMedia?: boolean;
    replyToId?: string;
    replyToCurrent?: boolean;
@@ -122,6 +132,8 @@ vi.mock("../../auto-reply/dispatch.js", () => ({
    text?: string;
    mediaUrl?: string;
    mediaUrls?: string[];
    spokenText?: string;
    audioAsVoice?: boolean;
    trustedLocalMedia?: boolean;
    replyToId?: string;
    replyToCurrent?: boolean;
@@ -131,6 +143,8 @@ vi.mock("../../auto-reply/dispatch.js", () => ({
    text?: string;
    mediaUrl?: string;
    mediaUrls?: string[];
    spokenText?: string;
    audioAsVoice?: boolean;
    trustedLocalMedia?: boolean;
    replyToId?: string;
    replyToCurrent?: boolean;
@@ -257,6 +271,7 @@ function createTranscriptFixture(prefix: string) {
    "utf-8",
  );
  mockState.transcriptPath = transcriptPath;
  return dir;
}

function extractFirstTextBlock(payload: unknown): string | undefined {
@@ -579,6 +594,121 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
  });
});

it("persists auto-TTS final media as audio-only so webchat does not duplicate assistant text", async () => {
|
||||
const transcriptDir = createTranscriptFixture("openclaw-chat-send-agent-tts-final-");
|
||||
const audioPath = path.join(transcriptDir, "tts.mp3");
|
||||
fs.writeFileSync(audioPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
|
||||
mockState.config = {
|
||||
agents: {
|
||||
defaults: {
|
||||
workspace: transcriptDir,
|
||||
},
|
||||
},
|
||||
};
|
||||
mockState.triggerAgentRunStart = true;
|
||||
mockState.dispatchedReplies = [
|
||||
{
|
||||
kind: "final",
|
||||
payload: {
|
||||
text: "This text is already in the model transcript.",
|
||||
spokenText: "This text is already in the model transcript.",
|
||||
mediaUrl: audioPath,
|
||||
mediaUrls: [audioPath],
|
||||
trustedLocalMedia: true,
|
||||
audioAsVoice: true,
|
||||
},
|
||||
},
|
||||
];
|
||||
const respond = vi.fn();
|
||||
const context = createChatContext();
|
||||
|
||||
await runNonStreamingChatSend({
|
||||
context,
|
||||
respond,
|
||||
idempotencyKey: "idem-agent-tts",
|
||||
expectBroadcast: false,
|
||||
waitFor: "dedupe",
|
||||
});
|
||||
|
||||
const assistantUpdates = mockState.emittedTranscriptUpdates.filter(
|
||||
(update) =>
|
||||
typeof update.message === "object" &&
|
||||
update.message !== null &&
|
||||
(update.message as { role?: unknown }).role === "assistant",
|
||||
);
|
||||
expect(assistantUpdates).toHaveLength(1);
|
||||
expect(assistantUpdates[0]).toMatchObject({
|
||||
message: {
|
||||
role: "assistant",
|
||||
idempotencyKey: "idem-agent-tts:assistant-media",
|
||||
content: [
|
||||
{ type: "text", text: "Audio reply" },
|
||||
{
|
||||
type: "audio",
|
||||
source: {
|
||||
type: "base64",
|
||||
media_type: "audio/mpeg",
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
});
|
||||
expect(JSON.stringify(assistantUpdates[0]?.message)).not.toContain(
|
||||
"This text is already in the model transcript.",
|
||||
);
|
||||
});
|
||||
|
||||
it("keeps visible text on non-agent TTS final media because no model transcript exists", async () => {
|
||||
const transcriptDir = createTranscriptFixture("openclaw-chat-send-command-tts-final-");
|
||||
const audioPath = path.join(transcriptDir, "tts.mp3");
|
||||
fs.writeFileSync(audioPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
|
||||
mockState.config = {
|
||||
agents: {
|
||||
defaults: {
|
||||
workspace: transcriptDir,
|
||||
},
|
||||
},
|
||||
};
|
||||
mockState.finalPayload = {
|
||||
text: "Command result with TTS.",
|
||||
spokenText: "Command result with TTS.",
|
||||
mediaUrl: audioPath,
|
||||
mediaUrls: [audioPath],
|
||||
trustedLocalMedia: true,
|
||||
audioAsVoice: true,
|
||||
};
|
||||
const respond = vi.fn();
|
||||
const context = createChatContext();
|
||||
|
||||
const payload = await runNonStreamingChatSend({
|
||||
context,
|
||||
respond,
|
||||
idempotencyKey: "idem-command-tts",
|
||||
});
|
||||
|
||||
expect(payload?.message).toMatchObject({
|
||||
role: "assistant",
|
||||
content: [
|
||||
{ type: "text", text: "Command result with TTS." },
|
||||
{
|
||||
type: "audio",
|
||||
source: {
|
||||
type: "base64",
|
||||
media_type: "audio/mpeg",
|
||||
},
|
||||
},
|
||||
],
|
||||
});
|
||||
const assistantUpdates = mockState.emittedTranscriptUpdates.filter(
|
||||
(update) =>
|
||||
typeof update.message === "object" &&
|
||||
update.message !== null &&
|
||||
(update.message as { role?: unknown }).role === "assistant",
|
||||
);
|
||||
expect(assistantUpdates).toHaveLength(1);
|
||||
expect(JSON.stringify(assistantUpdates[0]?.message)).toContain("Command result with TTS.");
|
||||
});
|
||||
|
||||
it("renders image reply payloads as assistant image content instead of MEDIA text", async () => {
|
||||
createTranscriptFixture("openclaw-chat-send-agent-image-");
|
||||
mockState.finalPayload = {
|
||||
|
||||
@@ -143,6 +143,18 @@ function isMediaBearingPayload(payload: ReplyPayload): boolean {
  return false;
}

function isTtsSupplementPayload(payload: ReplyPayload): boolean {
  return (
    typeof payload.spokenText === "string" &&
    payload.spokenText.trim().length > 0 &&
    isMediaBearingPayload(payload)
  );
}

function stripVisibleTextFromTtsSupplement(payload: ReplyPayload): ReplyPayload {
  return isTtsSupplementPayload(payload) ? { ...payload, text: undefined } : payload;
}

async function buildWebchatAssistantMediaMessage(
  payloads: ReplyPayload[],
  options?: {

@@ -2008,6 +2020,7 @@ export const chatHandlers: GatewayRequestHandlers = {
  if (!agentRunStarted || appendedWebchatAgentMedia || !isMediaBearingPayload(payload)) {
    return;
  }
+ const transcriptPayload = stripVisibleTextFromTtsSupplement(payload);
  const { storePath: latestStorePath, entry: latestEntry } = loadSessionEntry(sessionKey);
  const sessionId = latestEntry?.sessionId ?? entry?.sessionId ?? clientRunId;
  const resolvedTranscriptPath = resolveTranscriptPath({
@@ -2022,9 +2035,9 @@ export const chatHandlers: GatewayRequestHandlers = {
  );
  const assistantContent = await buildAssistantDisplayContentFromReplyPayloads({
    sessionKey,
-   payloads: [payload],
+   payloads: [transcriptPayload],
    managedImageLocalRoots: mediaLocalRoots,
-   includeSensitiveMedia: payload.sensitiveMedia !== true,
+   includeSensitiveMedia: transcriptPayload.sensitiveMedia !== true,
    onLocalAudioAccessDenied: (message) => {
      context.logGateway.warn(`webchat audio embedding denied local path: ${message}`);
    },
@@ -2032,7 +2045,7 @@ export const chatHandlers: GatewayRequestHandlers = {
      context.logGateway.warn(`webchat image embedding skipped attachment: ${message}`);
    },
  });
- const mediaMessage = await buildWebchatAssistantMediaMessage([payload], {
+ const mediaMessage = await buildWebchatAssistantMediaMessage([transcriptPayload], {
    localRoots: mediaLocalRoots,
    onLocalAudioAccessDenied: (message) => {
      context.logGateway.warn(`webchat audio embedding denied local path: ${message}`);
@@ -2048,7 +2061,7 @@ export const chatHandlers: GatewayRequestHandlers = {
  const transcriptReply =
    mediaMessage?.transcriptText ??
    extractAssistantDisplayTextFromContent(assistantContent) ??
-   buildTranscriptReplyText([payload]);
+   buildTranscriptReplyText([transcriptPayload]);
  if (!transcriptReply && !persistedAssistantContent?.length && !assistantContent?.length) {
    return;
  }
@@ -2176,9 +2189,11 @@ export const chatHandlers: GatewayRequestHandlers = {
      sessionKey,
    });
  } else {
-   const finalPayloads = deliveredReplies
-     .filter((entry) => entry.kind === "final")
-     .map((entry) => entry.payload);
+   const finalPayloads = appendedWebchatAgentMedia
+     ? []
+     : deliveredReplies
+         .filter((entry) => entry.kind === "final")
+         .map((entry) => entry.payload);
    const { storePath: latestStorePath, entry: latestEntry } =
      loadSessionEntry(sessionKey);
    const sessionId = latestEntry?.sessionId ?? entry?.sessionId ?? clientRunId;