From 30325f567cc5f1a2953ef01c622e0ad35a2eeb80 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 25 Apr 2026 20:25:36 +0100 Subject: [PATCH] fix: use prompt snapshots for live context diagnostics --- CHANGELOG.md | 3 + docs/logging.md | 3 + docs/reference/token-use.md | 7 + src/agents/pi-embedded-runner/types.ts | 5 + .../agent-runner.misc.runreplyagent.test.ts | 172 +++++++++++++++++- src/auto-reply/reply/agent-runner.ts | 27 +-- 6 files changed, 205 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 486511107aa..aaa15ee7854 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -64,6 +64,9 @@ Docs: https://docs.openclaw.ai - CLI/completion: dedupe provider auth flags before registering `openclaw onboard` options, so completion-cache refresh during update no longer fails when stale core fallback flags overlap plugin manifest flags. Fixes #71667. +- Diagnostics/trace: report live context usage from the current prompt snapshot + instead of provider turn totals, avoiding false near-full context spikes on + cached or tool-heavy runs. - Plugins/Bonjour: stop the gateway from crash-looping on `CIAO PROBING CANCELLED` when the mDNS watchdog cancels a stuck probe. Restores the rejection-handler wiring dropped during the bonjour plugin migration and shares unhandled-rejection state across module instances so plugin-staged copies of `openclaw/plugin-sdk/runtime` register into the same handler set the host consults. Especially affects Docker on macOS, where mDNS probing reliably hits the watchdog. Thanks @troyhitch. - Google Meet: report pinned Chrome nodes as offline or missing capabilities in setup/join diagnostics, keep inaccessible nodes out of auto-selection, and diff --git a/docs/logging.md b/docs/logging.md index 9b8db67b70c..67f4283661c 100644 --- a/docs/logging.md +++ b/docs/logging.md @@ -198,6 +198,9 @@ diagnostics + the exporter plugin are enabled. Model usage: - `model.usage`: tokens, cost, duration, context, provider/model/channel, session ids. + `usage` is provider/turn accounting for cost and telemetry; `context.used` + is the current prompt/context snapshot and can be lower than provider + `usage.total` when cached input or tool-loop calls are involved. Message flow: diff --git a/docs/reference/token-use.md b/docs/reference/token-use.md index 796c5ac9958..cee0d86ed3c 100644 --- a/docs/reference/token-use.md +++ b/docs/reference/token-use.md @@ -101,6 +101,13 @@ Assistant transcript entries persist the same normalized usage shape, including returns usage metadata. This gives `/usage cost` and transcript-backed session status a stable source even after the live runtime state is gone. +OpenClaw keeps provider usage accounting separate from the current context +snapshot. Provider `usage.total` can include cached input, output, and multiple +tool-loop model calls, so it is useful for cost and telemetry but can overstate +the live context window. Context displays and diagnostics use the latest prompt +snapshot (`promptTokens`, or the last model call when no prompt snapshot is +available) for `context.used`. + ## Cost estimation (when shown) Costs are estimated from your model pricing config: diff --git a/src/agents/pi-embedded-runner/types.ts b/src/agents/pi-embedded-runner/types.ts index ed1e41e4d4f..385e7d34216 100644 --- a/src/agents/pi-embedded-runner/types.ts +++ b/src/agents/pi-embedded-runner/types.ts @@ -10,6 +10,11 @@ export type EmbeddedPiAgentMeta = { agentHarnessId?: string; cliSessionBinding?: CliSessionBinding; compactionCount?: number; + /** + * Prompt/context snapshot from the latest model request. Prefer this for + * context-window utilization because provider usage totals can include cached + * and completion tokens that are useful for billing but noisy as live context. + */ promptTokens?: number; usage?: { input?: number; diff --git a/src/auto-reply/reply/agent-runner.misc.runreplyagent.test.ts b/src/auto-reply/reply/agent-runner.misc.runreplyagent.test.ts index 3ecbf77fe85..cded996fb32 100644 --- a/src/auto-reply/reply/agent-runner.misc.runreplyagent.test.ts +++ b/src/auto-reply/reply/agent-runner.misc.runreplyagent.test.ts @@ -10,6 +10,11 @@ import { import * as sessionTypesModule from "../../config/sessions.js"; import type { SessionEntry } from "../../config/sessions.js"; import { loadSessionStore, saveSessionStore } from "../../config/sessions.js"; +import { + onInternalDiagnosticEvent, + resetDiagnosticEventsForTest, + type DiagnosticEventPayload, +} from "../../infra/diagnostic-events.js"; import { clearMemoryPluginState, registerMemoryFlushPlanResolver, @@ -138,6 +143,7 @@ type RunWithModelFallbackParams = { }; beforeEach(() => { + resetDiagnosticEventsForTest(); embeddedRunTesting.resetActiveEmbeddedRuns(); replyRunRegistryTesting.resetReplyRunRegistry(); runEmbeddedPiAgentMock.mockClear(); @@ -169,6 +175,7 @@ beforeEach(() => { }); afterEach(() => { + resetDiagnosticEventsForTest(); vi.useRealTimers(); clearMemoryPluginState(); replyRunRegistryTesting.resetReplyRunRegistry(); @@ -289,6 +296,167 @@ describe("runReplyAgent auto-compaction token update", () => { // totalTokens should use lastCallUsage (55k), not accumulated (75k) expect(stored[sessionKey].totalTokens).toBe(55_000); }); + + it("reports live diagnostic context from promptTokens, not provider usage totals", async () => { + const tmp = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-usage-diagnostic-")); + const storePath = path.join(tmp, "sessions.json"); + const sessionKey = "main"; + const sessionEntry = { + sessionId: "session", + updatedAt: Date.now(), + totalTokens: 50_000, + }; + + await seedSessionStore({ storePath, sessionKey, entry: sessionEntry }); + + runEmbeddedPiAgentMock.mockResolvedValue({ + payloads: [{ text: "ok" }], + meta: { + agentMeta: { + usage: { input: 75_000, output: 5_000, cacheRead: 25_000, total: 105_000 }, + lastCallUsage: { input: 55_000, output: 2_000, cacheRead: 25_000, total: 82_000 }, + promptTokens: 44_000, + }, + }, + }); + + const diagnostics: DiagnosticEventPayload[] = []; + const unsubscribe = onInternalDiagnosticEvent((event) => { + diagnostics.push(event); + }); + const { typing, sessionCtx, resolvedQueue, followupRun } = createBaseRun({ + storePath, + sessionEntry, + }); + + try { + await runReplyAgent({ + commandBody: "hello", + followupRun, + queueKey: "main", + resolvedQueue, + shouldSteer: false, + shouldFollowup: false, + isActive: false, + isStreaming: false, + typing, + sessionCtx, + sessionEntry, + sessionStore: { [sessionKey]: sessionEntry }, + sessionKey, + storePath, + defaultModel: "anthropic/claude-opus-4-6", + agentCfgContextTokens: 200_000, + resolvedVerboseLevel: "off", + isNewSession: false, + blockStreamingEnabled: false, + resolvedBlockStreamingBreak: "message_end", + shouldInjectGroupIntro: false, + typingMode: "instant", + }); + } finally { + unsubscribe(); + } + + const usageEvent = diagnostics.find((event) => event.type === "model.usage"); + expect(usageEvent).toMatchObject({ + type: "model.usage", + usage: { + input: 75_000, + output: 5_000, + cacheRead: 25_000, + promptTokens: 100_000, + total: 105_000, + }, + context: { + limit: 200_000, + used: 44_000, + }, + }); + }); + + it("falls back to last-call prompt usage for live diagnostic context", async () => { + const tmp = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-usage-diagnostic-last-")); + const storePath = path.join(tmp, "sessions.json"); + const sessionKey = "main"; + const sessionEntry = { + sessionId: "session", + updatedAt: Date.now(), + totalTokens: 50_000, + }; + + await seedSessionStore({ storePath, sessionKey, entry: sessionEntry }); + + runEmbeddedPiAgentMock.mockResolvedValue({ + payloads: [{ text: "ok" }], + meta: { + agentMeta: { + usage: { input: 75_000, output: 5_000, cacheRead: 25_000, total: 105_000 }, + lastCallUsage: { + input: 55_000, + output: 2_000, + cacheRead: 25_000, + cacheWrite: 1_000, + total: 83_000, + }, + }, + }, + }); + + const diagnostics: DiagnosticEventPayload[] = []; + const unsubscribe = onInternalDiagnosticEvent((event) => { + diagnostics.push(event); + }); + const { typing, sessionCtx, resolvedQueue, followupRun } = createBaseRun({ + storePath, + sessionEntry, + }); + + try { + await runReplyAgent({ + commandBody: "hello", + followupRun, + queueKey: "main", + resolvedQueue, + shouldSteer: false, + shouldFollowup: false, + isActive: false, + isStreaming: false, + typing, + sessionCtx, + sessionEntry, + sessionStore: { [sessionKey]: sessionEntry }, + sessionKey, + storePath, + defaultModel: "anthropic/claude-opus-4-6", + agentCfgContextTokens: 200_000, + resolvedVerboseLevel: "off", + isNewSession: false, + blockStreamingEnabled: false, + resolvedBlockStreamingBreak: "message_end", + shouldInjectGroupIntro: false, + typingMode: "instant", + }); + } finally { + unsubscribe(); + } + + const usageEvent = diagnostics.find((event) => event.type === "model.usage"); + expect(usageEvent).toMatchObject({ + type: "model.usage", + usage: { + input: 75_000, + output: 5_000, + cacheRead: 25_000, + promptTokens: 100_000, + total: 105_000, + }, + context: { + limit: 200_000, + used: 81_000, + }, + }); + }); }); describe("runReplyAgent block streaming", () => { @@ -913,6 +1081,7 @@ describe("runReplyAgent Active Memory inline debug", () => { model: "claude", usage: { input: 1200, output: 45, cacheRead: 800, cacheWrite: 200, total: 2245 }, lastCallUsage: { input: 1000, output: 45, cacheRead: 750, cacheWrite: 150, total: 1945 }, + promptTokens: 1250, compactionCount: 1, }, }, @@ -987,6 +1156,7 @@ describe("runReplyAgent Active Memory inline debug", () => { expect(traceText).toContain("🔎 Usage (Session Total):"); expect(traceText).toContain("🔎 Usage (Last Turn Total):"); expect(traceText).toContain("🔎 Context Window (Last Model Request):"); + expect(traceText).toContain("used=1,250 tok (1.3k)"); expect(traceText).toContain("🔎 Execution Result:"); expect(traceText).toContain("winner=anthropic/claude"); expect(traceText).toContain("fallbackUsed=yes"); @@ -1025,7 +1195,7 @@ describe("runReplyAgent Active Memory inline debug", () => { expect(traceText).toContain("🔎 Model Input (User Role):"); expect(traceText).toContain("🔎 Model Output (Assistant Role):"); expect(traceText).toContain( - "Summary: winner=claude 🧠 low fallback=yes attempts=2 stop=end_turn prompt=1.9k/200k ⬇️ 1.2k ⬆️ 45 ♻️ 800 🆕 200 🔢 2.2k tools=2 compactions=1", + "Summary: winner=claude 🧠 low fallback=yes attempts=2 stop=end_turn prompt=1.3k/200k ⬇️ 1.2k ⬆️ 45 ♻️ 800 🆕 200 🔢 2.2k tools=2 compactions=1", ); expect(traceText.indexOf("🔎 Execution Result:")).toBeGreaterThan( traceText.indexOf("🔎 Context Window (Last Model Request):"), diff --git a/src/auto-reply/reply/agent-runner.ts b/src/auto-reply/reply/agent-runner.ts index 928684b829e..317e00cfd3e 100644 --- a/src/auto-reply/reply/agent-runner.ts +++ b/src/auto-reply/reply/agent-runner.ts @@ -585,6 +585,13 @@ function resolveRequestPromptTokens(params: { total?: number; }; }): number | undefined { + if ( + typeof params.promptTokens === "number" && + Number.isFinite(params.promptTokens) && + params.promptTokens > 0 + ) { + return params.promptTokens; + } const lastCall = params.lastCallUsage; if (lastCall) { const input = lastCall.input ?? 0; @@ -595,13 +602,6 @@ function resolveRequestPromptTokens(params: { return sum; } } - if ( - typeof params.promptTokens === "number" && - Number.isFinite(params.promptTokens) && - params.promptTokens > 0 - ) { - return params.promptTokens; - } const usage = params.usage; if (usage) { const input = usage.input ?? 0; @@ -1428,8 +1428,13 @@ export async function runReplyAgent(params: { const output = usage.output ?? 0; const cacheRead = usage.cacheRead ?? 0; const cacheWrite = usage.cacheWrite ?? 0; - const promptTokens = input + cacheRead + cacheWrite; - const totalTokens = usage.total ?? promptTokens + output; + const usagePromptTokens = input + cacheRead + cacheWrite; + const totalTokens = usage.total ?? usagePromptTokens + output; + const contextUsedTokens = resolveRequestPromptTokens({ + lastCallUsage: runResult.meta?.agentMeta?.lastCallUsage, + promptTokens, + usage, + }); const costConfig = resolveModelCostConfig({ provider: providerUsed, model: modelUsed, @@ -1455,13 +1460,13 @@ export async function runReplyAgent(params: { output, cacheRead, cacheWrite, - promptTokens, + promptTokens: usagePromptTokens, total: totalTokens, }, lastCallUsage: runResult.meta?.agentMeta?.lastCallUsage, context: { limit: contextTokensUsed, - used: totalTokens, + ...(contextUsedTokens !== undefined ? { used: contextUsedTokens } : {}), }, costUsd, durationMs: Date.now() - runStartedAt,