fix(diagnostics): export Talk metrics after SDK refactor

Adds bounded Talk lifecycle/audio diagnostics and session recovery metrics for OTEL, Prometheus, and stability snapshots after the Talk SDK/session refactor. Includes changelog/docs updates and Testbox/live proof.
This commit is contained in:
Vincent Koc
2026-05-06 02:01:52 -07:00
committed by GitHub
parent d9ffc1aa63
commit e2501b2d6d
23 changed files with 632 additions and 68 deletions

View File

@@ -2478,6 +2478,118 @@ describe("diagnostics-otel service", () => {
await service.stop?.(ctx);
});
// Verifies that session recovery and Talk diagnostic events are exported as OTEL
// metrics with a bounded attribute set, and that the raw session / key / turn
// identifiers carried by the event payloads never appear in any recorded metric call.
test("exports session recovery and talk metrics with bounded attributes", async () => {
  const service = createDiagnosticsOtelService();
  const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { metrics: true });
  await service.start(ctx);

  // Every identifier below uses a "*-should-not-export" sentinel so that leak
  // checks at the end of the test can search the recorded calls for it.
  emitTrustedDiagnosticEvent({
    type: "session.recovery.requested",
    sessionId: "session-should-not-export",
    sessionKey: "key-should-not-export",
    state: "processing",
    ageMs: 12_000,
    reason: "startup-sweep",
    activeWorkKind: "tool_call",
    allowActiveAbort: true,
  });
  emitTrustedDiagnosticEvent({
    type: "session.recovery.completed",
    sessionId: "session-should-not-export",
    sessionKey: "key-should-not-export",
    state: "processing",
    ageMs: 13_000,
    reason: "startup-sweep",
    activeWorkKind: "tool_call",
    status: "released",
    action: "abort-active-run",
  });
  // Talk event that reports an audio payload size but no duration.
  emitTrustedDiagnosticEvent({
    type: "talk.event",
    sessionId: "talk-session-should-not-export",
    turnId: "turn-should-not-export",
    talkEventType: "input.audio.delta",
    mode: "realtime",
    transport: "gateway-relay",
    brain: "agent-consult",
    provider: "openai",
    byteLength: 320,
  });
  // Talk event that reports a duration but no audio payload size.
  emitTrustedDiagnosticEvent({
    type: "talk.event",
    sessionId: "talk-session-should-not-export",
    talkEventType: "latency.metrics",
    mode: "realtime",
    transport: "gateway-relay",
    brain: "agent-consult",
    provider: "openai",
    durationMs: 45,
  });
  await flushDiagnosticEvents();

  expect(
    telemetryState.counters.get("openclaw.session.recovery.requested")?.add,
  ).toHaveBeenCalledWith(
    1,
    expect.objectContaining({
      "openclaw.state": "processing",
      "openclaw.action": "abort",
      "openclaw.active_work_kind": "tool_call",
    }),
  );
  expect(
    telemetryState.counters.get("openclaw.session.recovery.completed")?.add,
  ).toHaveBeenCalledWith(
    1,
    expect.objectContaining({
      "openclaw.state": "processing",
      "openclaw.status": "released",
      "openclaw.action": "abort-active-run",
    }),
  );
  expect(
    telemetryState.histograms.get("openclaw.session.recovery.age_ms")?.record,
  ).toHaveBeenCalledWith(
    13_000,
    expect.objectContaining({
      "openclaw.status": "released",
    }),
  );

  // Talk metrics are matched against the exact attribute object, so any extra
  // attribute would fail these expectations.
  expect(telemetryState.counters.get("openclaw.talk.event")?.add).toHaveBeenCalledWith(1, {
    "openclaw.talk.brain": "agent-consult",
    "openclaw.talk.event_type": "input.audio.delta",
    "openclaw.talk.mode": "realtime",
    "openclaw.talk.provider": "openai",
    "openclaw.talk.transport": "gateway-relay",
  });
  expect(telemetryState.histograms.get("openclaw.talk.audio.bytes")?.record).toHaveBeenCalledWith(
    320,
    {
      "openclaw.talk.brain": "agent-consult",
      "openclaw.talk.event_type": "input.audio.delta",
      "openclaw.talk.mode": "realtime",
      "openclaw.talk.provider": "openai",
      "openclaw.talk.transport": "gateway-relay",
    },
  );
  expect(
    telemetryState.histograms.get("openclaw.talk.event.duration_ms")?.record,
  ).toHaveBeenCalledWith(45, {
    "openclaw.talk.brain": "agent-consult",
    "openclaw.talk.event_type": "latency.metrics",
    "openclaw.talk.mode": "realtime",
    "openclaw.talk.provider": "openai",
    "openclaw.talk.transport": "gateway-relay",
  });

  // The recovery expectations above use `objectContaining`, which tolerates extra
  // attributes. Search the raw recorded calls so a leaked session id or session key
  // on any recovery instrument fails the test.
  const recoveryMetricCalls = JSON.stringify([
    telemetryState.counters.get("openclaw.session.recovery.requested")?.add.mock.calls,
    telemetryState.counters.get("openclaw.session.recovery.completed")?.add.mock.calls,
    telemetryState.histograms.get("openclaw.session.recovery.age_ms")?.record.mock.calls,
  ]);
  expect(recoveryMetricCalls).not.toContain("session-should-not-export");
  expect(recoveryMetricCalls).not.toContain("key-should-not-export");

  // Same leak check for every Talk instrument, not only the event counter.
  const talkMetricCalls = JSON.stringify([
    telemetryState.counters.get("openclaw.talk.event")?.add.mock.calls,
    telemetryState.histograms.get("openclaw.talk.audio.bytes")?.record.mock.calls,
    telemetryState.histograms.get("openclaw.talk.event.duration_ms")?.record.mock.calls,
  ]);
  expect(talkMetricCalls).not.toContain("talk-session-should-not-export");
  expect(talkMetricCalls).not.toContain("turn-should-not-export");

  await service.stop?.(ctx);
});
test("does not export model or tool content unless capture is explicitly enabled", async () => {
const service = createDiagnosticsOtelService();
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });

View File

@@ -95,6 +95,7 @@ type SessionRecoveryDiagnosticEvent = Extract<
DiagnosticEventPayload,
{ type: "session.recovery.requested" | "session.recovery.completed" }
>;
// The member of DiagnosticEventPayload whose `type` is "talk.event".
type TalkDiagnosticEvent = Extract<DiagnosticEventPayload, { type: "talk.event" }>;
const NO_CONTENT_CAPTURE: OtelContentCapturePolicy = {
inputMessages: false,
@@ -844,6 +845,18 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
description: "Age of sessions selected for recovery",
},
);
// Counter for Talk diagnostic events (dimensionless unit "1").
const talkEventCounter = meter.createCounter("openclaw.talk.event", {
unit: "1",
description: "Talk events emitted by type",
});
// Histogram, in milliseconds, of the duration reported by Talk events that carry one.
const talkEventDurationHistogram = meter.createHistogram("openclaw.talk.event.duration_ms", {
unit: "ms",
description: "Talk event duration when reported",
});
// Histogram of Talk audio frame byte lengths ("By" is the unit code used here for bytes).
const talkAudioBytesHistogram = meter.createHistogram("openclaw.talk.audio.bytes", {
unit: "By",
description: "Talk audio frame byte lengths",
});
const runAttemptCounter = meter.createCounter("openclaw.run.attempt", {
unit: "1",
description: "Run attempts",
@@ -1526,6 +1539,28 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
sessionRecoveryAgeHistogram.record(evt.ageMs, attrs);
};
/**
 * Builds the attribute set attached to Talk metrics.
 *
 * Only the brain, event type, mode, provider and transport fields are read, and
 * each one is passed through `lowCardinalityAttr`. Session and turn identifiers
 * on the event are deliberately not included.
 *
 * @param evt - The Talk diagnostic event to describe.
 * @returns The five `openclaw.talk.*` attributes for the event.
 */
const talkEventAttrs = (evt: TalkDiagnosticEvent): Record<string, string> => {
  const { brain, talkEventType, mode, provider, transport } = evt;
  return {
    "openclaw.talk.brain": lowCardinalityAttr(brain),
    "openclaw.talk.event_type": lowCardinalityAttr(talkEventType),
    "openclaw.talk.mode": lowCardinalityAttr(mode),
    "openclaw.talk.provider": lowCardinalityAttr(provider),
    "openclaw.talk.transport": lowCardinalityAttr(transport),
  };
};
/**
 * Records metrics for a single Talk diagnostic event.
 *
 * Events whose metadata is not marked `trusted` are ignored. Every trusted event
 * increments the Talk event counter. The duration and audio-size histograms are
 * only updated when the event carries a usable measurement.
 *
 * @param evt - The Talk diagnostic event.
 * @param metadata - Metadata accompanying the event; only `trusted` is read.
 */
const recordTalkEvent = (evt: TalkDiagnosticEvent, metadata: DiagnosticEventMetadata) => {
  if (!metadata.trusted) {
    return;
  }
  const attrs = talkEventAttrs(evt);
  talkEventCounter.add(1, attrs);

  // `typeof x === "number"` alone also admits NaN, +/-Infinity and negative values.
  // Those are skipped so a malformed event cannot distort the histogram sums; the
  // event itself is still counted above.
  const { durationMs, byteLength } = evt;
  if (typeof durationMs === "number" && Number.isFinite(durationMs) && durationMs >= 0) {
    talkEventDurationHistogram.record(durationMs, attrs);
  }
  if (typeof byteLength === "number" && Number.isFinite(byteLength) && byteLength >= 0) {
    talkAudioBytesHistogram.record(byteLength, attrs);
  }
};
/**
 * Increments the run-attempt counter by one, tagged with the event's `attempt` value.
 *
 * @param evt - The "run.attempt" diagnostic event.
 */
const recordRunAttempt = (evt: Extract<DiagnosticEventPayload, { type: "run.attempt" }>) => {
  const { attempt } = evt;
  runAttemptCounter.add(1, { "openclaw.attempt": attempt });
};
@@ -2283,6 +2318,9 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
case "message.delivery.error":
recordMessageDeliveryError(evt);
return;
case "talk.event":
recordTalkEvent(evt, metadata);
return;
case "queue.lane.enqueue":
recordLaneEnqueue(evt);
return;

View File

@@ -90,6 +90,17 @@ describe("diagnostics-prometheus service", () => {
it("bounds messaging labels without exporting raw chat identifiers", () => {
const store = __test__.createPrometheusMetricStore();
__test__.recordDiagnosticEvent(
store,
{
...baseEvent(),
type: "message.delivery.started",
channel: "matrix",
deliveryKind: "text",
sessionKey: "session-should-not-export",
},
trusted,
);
__test__.recordDiagnosticEvent(
store,
{
@@ -119,6 +130,9 @@ describe("diagnostics-prometheus service", () => {
const rendered = __test__.renderPrometheusMetrics(store);
expect(rendered).toContain(
'openclaw_message_delivery_started_total{channel="matrix",delivery_kind="text"} 1',
);
expect(rendered).toContain(
'openclaw_message_processed_total{channel="unknown",outcome="completed",reason="none"} 1',
);
@@ -127,9 +141,69 @@ describe("diagnostics-prometheus service", () => {
);
expect(rendered).not.toContain("chat-should-not-export");
expect(rendered).not.toContain("message-should-not-export");
expect(rendered).not.toContain("session-should-not-export");
expect(rendered).not.toContain("progress draft");
});
// Feeds one completed session-recovery event and one Talk audio event into a fresh
// store, then checks the rendered exposition text: the expected series must be
// present and none of the "*-should-not-export" sentinel identifiers may appear.
it("records session recovery and talk metrics without exporting raw ids or content", () => {
  const store = __test__.createPrometheusMetricStore();

  __test__.recordDiagnosticEvent(
    store,
    {
      ...baseEvent(),
      type: "session.recovery.completed",
      sessionId: "session-should-not-export",
      sessionKey: "key-should-not-export",
      state: "processing",
      stateGeneration: 2,
      ageMs: 12_000,
      queueDepth: 1,
      reason: "startup-sweep",
      activeWorkKind: "tool_call",
      allowActiveAbort: true,
      status: "released",
      action: "abort-active-run",
    },
    trusted,
  );
  __test__.recordDiagnosticEvent(
    store,
    {
      ...baseEvent(),
      type: "talk.event",
      sessionId: "talk-session-should-not-export",
      turnId: "turn-should-not-export",
      talkEventType: "input.audio.delta",
      mode: "realtime",
      transport: "gateway-relay",
      brain: "agent-consult",
      provider: "openai",
      byteLength: 320,
    },
    trusted,
  );

  const rendered = __test__.renderPrometheusMetrics(store);

  // Series that must be rendered, checked in this order.
  const expectedSeries = [
    'openclaw_session_recovery_total{action="abort-active-run",active_work_kind="tool_call",state="processing",status="released"} 1',
    'openclaw_session_recovery_age_seconds_sum{action="abort-active-run",active_work_kind="tool_call",state="processing",status="released"} 12',
    'openclaw_talk_event_total{brain="agent-consult",event_type="input.audio.delta",mode="realtime",provider="openai",transport="gateway-relay"} 1',
    'openclaw_talk_audio_bytes_sum{brain="agent-consult",event_type="input.audio.delta",mode="realtime",provider="openai",transport="gateway-relay"} 320',
  ];
  for (const series of expectedSeries) {
    expect(rendered).toContain(series);
  }

  // Raw identifiers from the event payloads that must never be rendered.
  const forbiddenIdentifiers = [
    "session-should-not-export",
    "key-should-not-export",
    "talk-session-should-not-export",
    "turn-should-not-export",
  ];
  for (const identifier of forbiddenIdentifiers) {
    expect(rendered).not.toContain(identifier);
  }
});
it("caps metric series growth and reports dropped series", () => {
const store = __test__.createPrometheusMetricStore();

View File

@@ -351,6 +351,35 @@ function harnessLabels(evt: {
};
}
/**
 * Builds the label set for session recovery metrics.
 *
 * Completed events report their own `action` (falling back to "unknown") and
 * `status`. Requested events have neither, so `action` is derived from
 * `allowActiveAbort` ("abort" when set, otherwise "recover") and `status` is the
 * fixed value "requested".
 *
 * @param evt - A "session.recovery.requested" or "session.recovery.completed" event.
 * @returns Labels `action`, `active_work_kind`, `state` and `status`.
 */
function sessionRecoveryLabels(
  evt: Extract<
    DiagnosticEventPayload,
    { type: "session.recovery.requested" | "session.recovery.completed" }
  >,
): LabelSet {
  if (evt.type === "session.recovery.completed") {
    return {
      action: lowCardinalityLabel(evt.action, "unknown"),
      active_work_kind: lowCardinalityLabel(evt.activeWorkKind, "none"),
      state: evt.state,
      status: evt.status,
    };
  }

  // Remaining case: the recovery was only requested.
  let requestedAction: string;
  if (evt.allowActiveAbort) {
    requestedAction = "abort";
  } else {
    requestedAction = "recover";
  }
  return {
    action: requestedAction,
    active_work_kind: lowCardinalityLabel(evt.activeWorkKind, "none"),
    state: evt.state,
    status: "requested",
  };
}
/**
 * Builds the label set for Talk metrics.
 *
 * Only the brain, event type, mode, provider and transport fields are read, and
 * each one is passed through `lowCardinalityLabel`. Session and turn identifiers
 * on the event are not turned into labels.
 *
 * @param evt - The "talk.event" diagnostic event.
 * @returns Labels `brain`, `event_type`, `mode`, `provider` and `transport`.
 */
function talkLabels(evt: Extract<DiagnosticEventPayload, { type: "talk.event" }>): LabelSet {
  const { brain, talkEventType, mode, provider, transport } = evt;
  const labels: LabelSet = {
    brain: lowCardinalityLabel(brain),
    event_type: lowCardinalityLabel(talkEventType),
    mode: lowCardinalityLabel(mode),
    provider: lowCardinalityLabel(provider),
    transport: lowCardinalityLabel(transport),
  };
  return labels;
}
function recordModelUsage(
store: PrometheusMetricStore,
evt: Extract<DiagnosticEventPayload, { type: "model.usage" }>,
@@ -497,6 +526,16 @@ function recordDiagnosticEvent(
seconds(evt.durationMs),
);
return;
case "message.delivery.started":
store.counter(
"openclaw_message_delivery_started_total",
"Outbound message delivery attempts started.",
{
channel: lowCardinalityLabel(evt.channel),
delivery_kind: lowCardinalityLabel(evt.deliveryKind, "other"),
},
);
return;
case "message.delivery.completed":
case "message.delivery.error":
store.counter(
@@ -527,6 +566,36 @@ function recordDiagnosticEvent(
seconds(evt.durationMs),
);
return;
case "talk.event":
store.counter("openclaw_talk_event_total", "Talk events emitted by type.", talkLabels(evt));
store.histogram(
"openclaw_talk_event_duration_seconds",
"Talk event duration in seconds when reported.",
talkLabels(evt),
seconds(evt.durationMs),
);
store.histogram(
"openclaw_talk_audio_bytes",
"Talk audio frame byte lengths.",
talkLabels(evt),
numericValue(evt.byteLength),
BYTE_BUCKETS,
);
return;
case "session.recovery.requested":
case "session.recovery.completed":
store.counter(
"openclaw_session_recovery_total",
"Session recovery observations by status and action.",
sessionRecoveryLabels(evt),
);
store.histogram(
"openclaw_session_recovery_age_seconds",
"Age of sessions selected for recovery in seconds.",
sessionRecoveryLabels(evt),
seconds(evt.ageMs),
);
return;
case "queue.lane.enqueue":
case "queue.lane.dequeue":
store.gauge(

View File

@@ -9,6 +9,7 @@ import {
createRealtimeVoiceAgentTalkbackQueue,
createTalkSessionController,
createRealtimeVoiceBridgeSession,
recordTalkDiagnosticEvent,
type RealtimeVoiceAgentTalkbackQueue,
type RealtimeVoiceBridgeSession,
type RealtimeVoiceProviderPlugin,
@@ -359,13 +360,16 @@ export async function startNodeRealtimeAudioBridge(params: {
const transcript: GoogleMeetRealtimeTranscriptEntry[] = [];
const realtimeEvents: GoogleMeetRealtimeEventEntry[] = [];
const strategy = params.config.realtime.strategy;
const talk: TalkSessionController = createTalkSessionController({
sessionId: `google-meet:${params.meetingSessionId}:${params.bridgeId}:node-realtime`,
mode: "realtime",
transport: "gateway-relay",
brain: strategy === "bidi" ? "direct-tools" : "agent-consult",
provider: resolved.provider.id,
});
const talk: TalkSessionController = createTalkSessionController(
{
sessionId: `google-meet:${params.meetingSessionId}:${params.bridgeId}:node-realtime`,
mode: "realtime",
transport: "gateway-relay",
brain: strategy === "bidi" ? "direct-tools" : "agent-consult",
provider: resolved.provider.id,
},
{ onEvent: recordTalkDiagnosticEvent },
);
const recentTalkEvents: TalkEvent[] = [];
const rememberTalkEvent = (event: TalkEvent | undefined): void => {
if (event) {

View File

@@ -23,6 +23,7 @@ import {
REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ,
REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
recordRealtimeVoiceBridgeEvent,
recordTalkDiagnosticEvent,
recordRealtimeVoiceTranscript,
resamplePcm,
resolveConfiguredRealtimeVoiceProvider,
@@ -485,14 +486,17 @@ export async function startCommandAgentAudioBridge(params: {
fullConfig: params.fullConfig,
providers: params.providers,
});
const talk = createTalkSessionController({
sessionId: `google-meet:${params.meetingSessionId}:agent`,
mode: "stt-tts",
transport: "gateway-relay",
brain: "agent-consult",
provider: resolved.provider.id,
turnIdPrefix: `google-meet:${params.meetingSessionId}:turn`,
});
const talk = createTalkSessionController(
{
sessionId: `google-meet:${params.meetingSessionId}:agent`,
mode: "stt-tts",
transport: "gateway-relay",
brain: "agent-consult",
provider: resolved.provider.id,
turnIdPrefix: `google-meet:${params.meetingSessionId}:turn`,
},
{ onEvent: recordTalkDiagnosticEvent },
);
const recentTalkEvents: TalkEvent[] = [];
const emitTalkEvent = (input: TalkEventInput) =>
pushGoogleMeetTalkEvent(recentTalkEvents, talk.emit(input));
@@ -1034,13 +1038,16 @@ export async function startCommandRealtimeAudioBridge(params: {
);
const transcript: GoogleMeetRealtimeTranscriptEntry[] = [];
const realtimeEvents: GoogleMeetRealtimeEventEntry[] = [];
const talk: TalkSessionController = createTalkSessionController({
sessionId: `google-meet:${params.meetingSessionId}:command-realtime`,
mode: "realtime",
transport: "gateway-relay",
brain: strategy === "bidi" ? "direct-tools" : "agent-consult",
provider: resolved.provider.id,
});
const talk: TalkSessionController = createTalkSessionController(
{
sessionId: `google-meet:${params.meetingSessionId}:command-realtime`,
mode: "realtime",
transport: "gateway-relay",
brain: strategy === "bidi" ? "direct-tools" : "agent-consult",
provider: resolved.provider.id,
},
{ onEvent: recordTalkDiagnosticEvent },
);
const recentTalkEvents: TalkEvent[] = [];
const rememberTalkEvent = (event: TalkEvent | undefined): void => {
if (event) {

View File

@@ -16,6 +16,7 @@ import type {
} from "openclaw/plugin-sdk/realtime-transcription";
import {
createTalkSessionController,
recordTalkDiagnosticEvent,
type TalkEvent,
type TalkEventInput,
type TalkSessionController,
@@ -784,14 +785,17 @@ export class MediaStreamHandler {
}
private createTalkEvents(callId: string, streamSid: string): TalkSessionController {
return createTalkSessionController({
sessionId: `voice-call:${callId}:${streamSid}`,
mode: "stt-tts",
transport: "gateway-relay",
brain: "agent-consult",
provider: this.config.transcriptionProvider.id,
turnIdPrefix: `${streamSid}:turn`,
});
return createTalkSessionController(
{
sessionId: `voice-call:${callId}:${streamSid}`,
mode: "stt-tts",
transport: "gateway-relay",
brain: "agent-consult",
provider: this.config.transcriptionProvider.id,
turnIdPrefix: `${streamSid}:turn`,
},
{ onEvent: recordTalkDiagnosticEvent },
);
}
private emitTalkEvent(session: StreamSession, input: TalkEventInput): void {

View File

@@ -7,6 +7,7 @@ import {
createTalkSessionController,
createRealtimeVoiceBridgeSession,
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
recordTalkDiagnosticEvent,
type RealtimeVoiceBridgeSession,
type RealtimeVoiceProviderConfig,
type RealtimeVoiceProviderPlugin,
@@ -507,13 +508,16 @@ export class RealtimeCallHandler {
const { callId, initialGreetingInstructions } = registration;
const callRecord = this.manager.getCallByProviderCallId(callSid);
const talk: TalkSessionController = createTalkSessionController({
sessionId: `voice-call:${callId}:realtime`,
mode: "realtime",
transport: "gateway-relay",
brain: "agent-consult",
provider: this.realtimeProvider.id,
});
const talk: TalkSessionController = createTalkSessionController(
{
sessionId: `voice-call:${callId}:realtime`,
mode: "realtime",
transport: "gateway-relay",
brain: "agent-consult",
provider: this.realtimeProvider.id,
},
{ onEvent: recordTalkDiagnosticEvent },
);
const rememberTalkEvent = (event: TalkEvent | undefined): TalkEvent | undefined => {
if (event) {
appendRecentTalkEventMetadata(callRecord, event);