From 30325f567cc5f1a2953ef01c622e0ad35a2eeb80 Mon Sep 17 00:00:00 2001
From: Peter Steinberger <steipete@gmail.com>
Date: Sat, 25 Apr 2026 20:25:36 +0100
Subject: [PATCH] fix: use prompt snapshots for live context diagnostics

---
 CHANGELOG.md                                  |   3 +
 docs/logging.md                               |   3 +
 docs/reference/token-use.md                   |   7 +
 src/agents/pi-embedded-runner/types.ts        |   5 +
 .../agent-runner.misc.runreplyagent.test.ts   | 172 +++++++++++++++++-
 src/auto-reply/reply/agent-runner.ts          |  27 +--
 6 files changed, 205 insertions(+), 12 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 486511107aa..aaa15ee7854 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -64,6 +64,9 @@ Docs: https://docs.openclaw.ai
 - CLI/completion: dedupe provider auth flags before registering `openclaw onboard`
   options, so completion-cache refresh during update no longer fails when stale
   core fallback flags overlap plugin manifest flags. Fixes #71667.
+- Diagnostics/trace: report live context usage from the current prompt snapshot
+  instead of provider turn totals, avoiding false near-full context spikes on
+  cached or tool-heavy runs.
 - Plugins/Bonjour: stop the gateway from crash-looping on `CIAO PROBING CANCELLED` when the mDNS watchdog cancels a stuck probe. Restores the rejection-handler wiring dropped during the bonjour plugin migration and shares unhandled-rejection state across module instances so plugin-staged copies of `openclaw/plugin-sdk/runtime` register into the same handler set the host consults. Especially affects Docker on macOS, where mDNS probing reliably hits the watchdog. Thanks @troyhitch.
 - Google Meet: report pinned Chrome nodes as offline or missing capabilities in
   setup/join diagnostics, keep inaccessible nodes out of auto-selection, and
diff --git a/docs/logging.md b/docs/logging.md
index 9b8db67b70c..67f4283661c 100644
--- a/docs/logging.md
+++ b/docs/logging.md
@@ -198,6 +198,9 @@ diagnostics + the exporter plugin are enabled.
 Model usage:
 
 - `model.usage`: tokens, cost, duration, context, provider/model/channel, session ids.
+  `usage` is provider/turn accounting for cost and telemetry; `context.used`
+  is the current prompt/context snapshot and can be lower than provider
+  `usage.total` when cached input or tool-loop calls are involved.
 
 Message flow:
 
diff --git a/docs/reference/token-use.md b/docs/reference/token-use.md
index 796c5ac9958..cee0d86ed3c 100644
--- a/docs/reference/token-use.md
+++ b/docs/reference/token-use.md
@@ -101,6 +101,13 @@ Assistant transcript entries persist the same normalized usage shape, including
 returns usage metadata. This gives `/usage cost` and transcript-backed session
 status a stable source even after the live runtime state is gone.
 
+OpenClaw keeps provider usage accounting separate from the current context
+snapshot. Provider `usage.total` can include cached input, output, and multiple
+tool-loop model calls, so it is useful for cost and telemetry but can overstate
+the live context window. Context displays and diagnostics use the latest prompt
+snapshot for `context.used`: `promptTokens` when present, otherwise the last
+model call's prompt usage (input plus cache read/write tokens).
+
 ## Cost estimation (when shown)
 
 Costs are estimated from your model pricing config:
diff --git a/src/agents/pi-embedded-runner/types.ts b/src/agents/pi-embedded-runner/types.ts
index ed1e41e4d4f..385e7d34216 100644
--- a/src/agents/pi-embedded-runner/types.ts
+++ b/src/agents/pi-embedded-runner/types.ts
@@ -10,6 +10,11 @@ export type EmbeddedPiAgentMeta = {
   agentHarnessId?: string;
   cliSessionBinding?: CliSessionBinding;
   compactionCount?: number;
+  /**
+   * Prompt/context snapshot from the latest model request. Prefer this for
+   * context-window utilization because provider usage totals can include cached
+   * and completion tokens that are useful for billing but noisy as live context.
+   */
   promptTokens?: number;
   usage?: {
     input?: number;
diff --git a/src/auto-reply/reply/agent-runner.misc.runreplyagent.test.ts b/src/auto-reply/reply/agent-runner.misc.runreplyagent.test.ts
index 3ecbf77fe85..cded996fb32 100644
--- a/src/auto-reply/reply/agent-runner.misc.runreplyagent.test.ts
+++ b/src/auto-reply/reply/agent-runner.misc.runreplyagent.test.ts
@@ -10,6 +10,11 @@ import {
 import * as sessionTypesModule from "../../config/sessions.js";
 import type { SessionEntry } from "../../config/sessions.js";
 import { loadSessionStore, saveSessionStore } from "../../config/sessions.js";
+import {
+  onInternalDiagnosticEvent,
+  resetDiagnosticEventsForTest,
+  type DiagnosticEventPayload,
+} from "../../infra/diagnostic-events.js";
 import {
   clearMemoryPluginState,
   registerMemoryFlushPlanResolver,
@@ -138,6 +143,7 @@ type RunWithModelFallbackParams = {
 };
 
 beforeEach(() => {
+  resetDiagnosticEventsForTest();
   embeddedRunTesting.resetActiveEmbeddedRuns();
   replyRunRegistryTesting.resetReplyRunRegistry();
   runEmbeddedPiAgentMock.mockClear();
@@ -169,6 +175,7 @@ beforeEach(() => {
 });
 
 afterEach(() => {
+  resetDiagnosticEventsForTest();
   vi.useRealTimers();
   clearMemoryPluginState();
   replyRunRegistryTesting.resetReplyRunRegistry();
@@ -289,6 +296,167 @@ describe("runReplyAgent auto-compaction token update", () => {
     // totalTokens should use lastCallUsage (55k), not accumulated (75k)
     expect(stored[sessionKey].totalTokens).toBe(55_000);
   });
+
+  it("reports live diagnostic context from promptTokens, not provider usage totals", async () => {
+    const tmp = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-usage-diagnostic-"));
+    const storePath = path.join(tmp, "sessions.json");
+    const sessionKey = "main";
+    const sessionEntry = {
+      sessionId: "session",
+      updatedAt: Date.now(),
+      totalTokens: 50_000,
+    };
+
+    await seedSessionStore({ storePath, sessionKey, entry: sessionEntry });
+
+    runEmbeddedPiAgentMock.mockResolvedValue({
+      payloads: [{ text: "ok" }],
+      meta: {
+        agentMeta: {
+          usage: { input: 75_000, output: 5_000, cacheRead: 25_000, total: 105_000 },
+          lastCallUsage: { input: 55_000, output: 2_000, cacheRead: 25_000, total: 82_000 },
+          promptTokens: 44_000,
+        },
+      },
+    });
+
+    const diagnostics: DiagnosticEventPayload[] = [];
+    const unsubscribe = onInternalDiagnosticEvent((event) => {
+      diagnostics.push(event);
+    });
+    const { typing, sessionCtx, resolvedQueue, followupRun } = createBaseRun({
+      storePath,
+      sessionEntry,
+    });
+
+    try {
+      await runReplyAgent({
+        commandBody: "hello",
+        followupRun,
+        queueKey: "main",
+        resolvedQueue,
+        shouldSteer: false,
+        shouldFollowup: false,
+        isActive: false,
+        isStreaming: false,
+        typing,
+        sessionCtx,
+        sessionEntry,
+        sessionStore: { [sessionKey]: sessionEntry },
+        sessionKey,
+        storePath,
+        defaultModel: "anthropic/claude-opus-4-6",
+        agentCfgContextTokens: 200_000,
+        resolvedVerboseLevel: "off",
+        isNewSession: false,
+        blockStreamingEnabled: false,
+        resolvedBlockStreamingBreak: "message_end",
+        shouldInjectGroupIntro: false,
+        typingMode: "instant",
+      });
+    } finally {
+      unsubscribe();
+    }
+
+    const usageEvent = diagnostics.find((event) => event.type === "model.usage");
+    expect(usageEvent).toMatchObject({
+      type: "model.usage",
+      usage: {
+        input: 75_000,
+        output: 5_000,
+        cacheRead: 25_000,
+        promptTokens: 100_000,
+        total: 105_000,
+      },
+      context: {
+        limit: 200_000,
+        used: 44_000,
+      },
+    });
+  });
+
+  it("falls back to last-call prompt usage for live diagnostic context", async () => {
+    const tmp = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-usage-diagnostic-last-"));
+    const storePath = path.join(tmp, "sessions.json");
+    const sessionKey = "main";
+    const sessionEntry = {
+      sessionId: "session",
+      updatedAt: Date.now(),
+      totalTokens: 50_000,
+    };
+
+    await seedSessionStore({ storePath, sessionKey, entry: sessionEntry });
+
+    runEmbeddedPiAgentMock.mockResolvedValue({
+      payloads: [{ text: "ok" }],
+      meta: {
+        agentMeta: {
+          usage: { input: 75_000, output: 5_000, cacheRead: 25_000, total: 105_000 },
+          lastCallUsage: {
+            input: 55_000,
+            output: 2_000,
+            cacheRead: 25_000,
+            cacheWrite: 1_000,
+            total: 83_000,
+          },
+        },
+      },
+    });
+
+    const diagnostics: DiagnosticEventPayload[] = [];
+    const unsubscribe = onInternalDiagnosticEvent((event) => {
+      diagnostics.push(event);
+    });
+    const { typing, sessionCtx, resolvedQueue, followupRun } = createBaseRun({
+      storePath,
+      sessionEntry,
+    });
+
+    try {
+      await runReplyAgent({
+        commandBody: "hello",
+        followupRun,
+        queueKey: "main",
+        resolvedQueue,
+        shouldSteer: false,
+        shouldFollowup: false,
+        isActive: false,
+        isStreaming: false,
+        typing,
+        sessionCtx,
+        sessionEntry,
+        sessionStore: { [sessionKey]: sessionEntry },
+        sessionKey,
+        storePath,
+        defaultModel: "anthropic/claude-opus-4-6",
+        agentCfgContextTokens: 200_000,
+        resolvedVerboseLevel: "off",
+        isNewSession: false,
+        blockStreamingEnabled: false,
+        resolvedBlockStreamingBreak: "message_end",
+        shouldInjectGroupIntro: false,
+        typingMode: "instant",
+      });
+    } finally {
+      unsubscribe();
+    }
+
+    const usageEvent = diagnostics.find((event) => event.type === "model.usage");
+    expect(usageEvent).toMatchObject({
+      type: "model.usage",
+      usage: {
+        input: 75_000,
+        output: 5_000,
+        cacheRead: 25_000,
+        promptTokens: 100_000,
+        total: 105_000,
+      },
+      context: {
+        limit: 200_000,
+        used: 81_000,
+      },
+    });
+  });
 });
 
 describe("runReplyAgent block streaming", () => {
@@ -913,6 +1081,7 @@ describe("runReplyAgent Active Memory inline debug", () => {
           model: "claude",
           usage: { input: 1200, output: 45, cacheRead: 800, cacheWrite: 200, total: 2245 },
           lastCallUsage: { input: 1000, output: 45, cacheRead: 750, cacheWrite: 150, total: 1945 },
+          promptTokens: 1250,
           compactionCount: 1,
         },
       },
@@ -987,6 +1156,7 @@ describe("runReplyAgent Active Memory inline debug", () => {
     expect(traceText).toContain("🔎 Usage (Session Total):");
     expect(traceText).toContain("🔎 Usage (Last Turn Total):");
     expect(traceText).toContain("🔎 Context Window (Last Model Request):");
+    expect(traceText).toContain("used=1,250 tok (1.3k)");
     expect(traceText).toContain("🔎 Execution Result:");
     expect(traceText).toContain("winner=anthropic/claude");
     expect(traceText).toContain("fallbackUsed=yes");
@@ -1025,7 +1195,7 @@ describe("runReplyAgent Active Memory inline debug", () => {
     expect(traceText).toContain("🔎 Model Input (User Role):");
     expect(traceText).toContain("🔎 Model Output (Assistant Role):");
     expect(traceText).toContain(
-      "Summary: winner=claude 🧠 low fallback=yes attempts=2 stop=end_turn prompt=1.9k/200k ⬇️ 1.2k ⬆️ 45 ♻️ 800 🆕 200 🔢 2.2k tools=2 compactions=1",
+      "Summary: winner=claude 🧠 low fallback=yes attempts=2 stop=end_turn prompt=1.3k/200k ⬇️ 1.2k ⬆️ 45 ♻️ 800 🆕 200 🔢 2.2k tools=2 compactions=1",
     );
     expect(traceText.indexOf("🔎 Execution Result:")).toBeGreaterThan(
       traceText.indexOf("🔎 Context Window (Last Model Request):"),
diff --git a/src/auto-reply/reply/agent-runner.ts b/src/auto-reply/reply/agent-runner.ts
index 928684b829e..317e00cfd3e 100644
--- a/src/auto-reply/reply/agent-runner.ts
+++ b/src/auto-reply/reply/agent-runner.ts
@@ -585,6 +585,13 @@ function resolveRequestPromptTokens(params: {
     total?: number;
   };
 }): number | undefined {
+  if (
+    typeof params.promptTokens === "number" &&
+    Number.isFinite(params.promptTokens) &&
+    params.promptTokens > 0
+  ) {
+    return params.promptTokens;
+  }
   const lastCall = params.lastCallUsage;
   if (lastCall) {
     const input = lastCall.input ?? 0;
@@ -595,13 +602,6 @@ function resolveRequestPromptTokens(params: {
       return sum;
     }
   }
-  if (
-    typeof params.promptTokens === "number" &&
-    Number.isFinite(params.promptTokens) &&
-    params.promptTokens > 0
-  ) {
-    return params.promptTokens;
-  }
   const usage = params.usage;
   if (usage) {
     const input = usage.input ?? 0;
@@ -1428,8 +1428,13 @@ export async function runReplyAgent(params: {
       const output = usage.output ?? 0;
       const cacheRead = usage.cacheRead ?? 0;
       const cacheWrite = usage.cacheWrite ?? 0;
-      const promptTokens = input + cacheRead + cacheWrite;
-      const totalTokens = usage.total ?? promptTokens + output;
+      const usagePromptTokens = input + cacheRead + cacheWrite;
+      const totalTokens = usage.total ?? usagePromptTokens + output;
+      const contextUsedTokens = resolveRequestPromptTokens({
+        lastCallUsage: runResult.meta?.agentMeta?.lastCallUsage,
+        promptTokens,
+        usage,
+      });
       const costConfig = resolveModelCostConfig({
         provider: providerUsed,
         model: modelUsed,
@@ -1455,13 +1460,13 @@ export async function runReplyAgent(params: {
           output,
           cacheRead,
           cacheWrite,
-          promptTokens,
+          promptTokens: usagePromptTokens,
           total: totalTokens,
         },
         lastCallUsage: runResult.meta?.agentMeta?.lastCallUsage,
         context: {
           limit: contextTokensUsed,
-          used: totalTokens,
+          ...(contextUsedTokens !== undefined ? { used: contextUsedTokens } : {}),
         },
         costUsd,
         durationMs: Date.now() - runStartedAt,