fix: use prompt snapshots for live context diagnostics

Peter Steinberger
2026-04-25 20:25:36 +01:00
parent b732f21a86
commit 30325f567c
6 changed files with 205 additions and 12 deletions

@@ -64,6 +64,9 @@ Docs: https://docs.openclaw.ai
- CLI/completion: dedupe provider auth flags before registering `openclaw onboard`
options, so completion-cache refresh during update no longer fails when stale
core fallback flags overlap plugin manifest flags. Fixes #71667.
- Diagnostics/trace: report live context usage from the current prompt snapshot
instead of provider turn totals, avoiding false near-full context spikes on
cached or tool-heavy runs.
- Plugins/Bonjour: stop the gateway from crash-looping on
`CIAO PROBING CANCELLED` when the mDNS watchdog cancels a stuck probe.
Restores the rejection-handler wiring dropped during the bonjour plugin
migration and shares unhandled-rejection state across module instances so
plugin-staged copies of `openclaw/plugin-sdk/runtime` register into the same
handler set the host consults. Especially affects Docker on macOS, where
mDNS probing reliably hits the watchdog. Thanks @troyhitch.
- Google Meet: report pinned Chrome nodes as offline or missing capabilities in
setup/join diagnostics, keep inaccessible nodes out of auto-selection, and

@@ -198,6 +198,9 @@ diagnostics + the exporter plugin are enabled.
Model usage:
- `model.usage`: tokens, cost, duration, context, provider/model/channel, session ids.
`usage` is provider/turn accounting for cost and telemetry; `context.used`
is the current prompt/context snapshot and can be lower than provider
`usage.total` when cached input or tool-loop calls are involved.
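For illustration, a `model.usage` event under this split might look like the
sketch below (values mirror the test fixtures added later in this commit;
the payload is trimmed to the fields discussed here):

```ts
// Sketch of a model.usage diagnostic event on a cached run. usage.* is
// provider/turn accounting; context.used is the live prompt snapshot.
const usageEvent = {
  type: "model.usage",
  usage: {
    input: 75_000,
    output: 5_000,
    cacheRead: 25_000,
    promptTokens: 100_000, // input + cacheRead + cacheWrite (billing view)
    total: 105_000, // turn total; overstates the live window here
  },
  context: {
    limit: 200_000,
    used: 44_000, // latest prompt snapshot, not usage.total
  },
};
```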
Message flow:

@@ -101,6 +101,13 @@ Assistant transcript entries persist the same normalized usage shape, including
returns usage metadata. This gives `/usage cost` and transcript-backed session
status a stable source even after the live runtime state is gone.
OpenClaw keeps provider usage accounting separate from the current context
snapshot. Provider `usage.total` can include cached input, output, and multiple
tool-loop model calls, so it is useful for cost and telemetry but can overstate
the live context window. Context displays and diagnostics use the latest prompt
snapshot (`promptTokens`, or the last model call when no prompt snapshot is
available) for `context.used`.
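A condensed sketch of that selection order (the real implementation is
`resolveRequestPromptTokens` in the final hunk of this commit; the
finiteness guards are dropped here for brevity):

```ts
// Resolution order for context.used: explicit prompt snapshot first, then
// the last model call's prompt-side tokens, then overall usage as a fallback.
type PromptSide = { input?: number; cacheRead?: number; cacheWrite?: number };

function resolveContextUsed(meta: {
  promptTokens?: number;
  lastCallUsage?: PromptSide;
  usage?: PromptSide;
}): number | undefined {
  if (typeof meta.promptTokens === "number" && meta.promptTokens > 0) {
    return meta.promptTokens;
  }
  for (const source of [meta.lastCallUsage, meta.usage]) {
    if (!source) continue;
    const sum =
      (source.input ?? 0) + (source.cacheRead ?? 0) + (source.cacheWrite ?? 0);
    if (sum > 0) return sum;
  }
  return undefined;
}
```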
## Cost estimation (when shown)
Costs are estimated from your model pricing config:

@@ -10,6 +10,11 @@ export type EmbeddedPiAgentMeta = {
agentHarnessId?: string;
cliSessionBinding?: CliSessionBinding;
compactionCount?: number;
/**
* Prompt/context snapshot from the latest model request. Prefer this for
* context-window utilization because provider usage totals can include cached
* and completion tokens that are useful for billing but a noisy signal of
* live context.
*/
promptTokens?: number;
usage?: {
input?: number;
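A hypothetical consumer of this field might derive window utilization as in
the sketch below (the helper name and formatting are illustrative, not part
of this commit):

```ts
// Hypothetical helper: prefer the prompt snapshot when rendering
// context-window utilization.
function contextUtilization(
  meta: { promptTokens?: number },
  limitTokens: number,
): string {
  const used = meta.promptTokens ?? 0;
  const pct = limitTokens > 0 ? ((used / limitTokens) * 100).toFixed(1) : "0.0";
  return `${used.toLocaleString()} / ${limitTokens.toLocaleString()} tok (${pct}%)`;
}
```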

@@ -10,6 +10,11 @@ import {
import * as sessionTypesModule from "../../config/sessions.js";
import type { SessionEntry } from "../../config/sessions.js";
import { loadSessionStore, saveSessionStore } from "../../config/sessions.js";
import {
onInternalDiagnosticEvent,
resetDiagnosticEventsForTest,
type DiagnosticEventPayload,
} from "../../infra/diagnostic-events.js";
import {
clearMemoryPluginState,
registerMemoryFlushPlanResolver,
@@ -138,6 +143,7 @@ type RunWithModelFallbackParams = {
};
beforeEach(() => {
resetDiagnosticEventsForTest();
embeddedRunTesting.resetActiveEmbeddedRuns();
replyRunRegistryTesting.resetReplyRunRegistry();
runEmbeddedPiAgentMock.mockClear();
@@ -169,6 +175,7 @@ beforeEach(() => {
});
afterEach(() => {
resetDiagnosticEventsForTest();
vi.useRealTimers();
clearMemoryPluginState();
replyRunRegistryTesting.resetReplyRunRegistry();
@@ -289,6 +296,167 @@ describe("runReplyAgent auto-compaction token update", () => {
// totalTokens should use lastCallUsage (55k), not accumulated (75k)
expect(stored[sessionKey].totalTokens).toBe(55_000);
});
it("reports live diagnostic context from promptTokens, not provider usage totals", async () => {
const tmp = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-usage-diagnostic-"));
const storePath = path.join(tmp, "sessions.json");
const sessionKey = "main";
const sessionEntry = {
sessionId: "session",
updatedAt: Date.now(),
totalTokens: 50_000,
};
await seedSessionStore({ storePath, sessionKey, entry: sessionEntry });
runEmbeddedPiAgentMock.mockResolvedValue({
payloads: [{ text: "ok" }],
meta: {
agentMeta: {
usage: { input: 75_000, output: 5_000, cacheRead: 25_000, total: 105_000 },
lastCallUsage: { input: 55_000, output: 2_000, cacheRead: 25_000, total: 82_000 },
promptTokens: 44_000,
},
},
});
const diagnostics: DiagnosticEventPayload[] = [];
const unsubscribe = onInternalDiagnosticEvent((event) => {
diagnostics.push(event);
});
const { typing, sessionCtx, resolvedQueue, followupRun } = createBaseRun({
storePath,
sessionEntry,
});
try {
await runReplyAgent({
commandBody: "hello",
followupRun,
queueKey: "main",
resolvedQueue,
shouldSteer: false,
shouldFollowup: false,
isActive: false,
isStreaming: false,
typing,
sessionCtx,
sessionEntry,
sessionStore: { [sessionKey]: sessionEntry },
sessionKey,
storePath,
defaultModel: "anthropic/claude-opus-4-6",
agentCfgContextTokens: 200_000,
resolvedVerboseLevel: "off",
isNewSession: false,
blockStreamingEnabled: false,
resolvedBlockStreamingBreak: "message_end",
shouldInjectGroupIntro: false,
typingMode: "instant",
});
} finally {
unsubscribe();
}
const usageEvent = diagnostics.find((event) => event.type === "model.usage");
expect(usageEvent).toMatchObject({
type: "model.usage",
usage: {
input: 75_000,
output: 5_000,
cacheRead: 25_000,
promptTokens: 100_000,
total: 105_000,
},
context: {
limit: 200_000,
used: 44_000,
},
});
});
it("falls back to last-call prompt usage for live diagnostic context", async () => {
const tmp = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-usage-diagnostic-last-"));
const storePath = path.join(tmp, "sessions.json");
const sessionKey = "main";
const sessionEntry = {
sessionId: "session",
updatedAt: Date.now(),
totalTokens: 50_000,
};
await seedSessionStore({ storePath, sessionKey, entry: sessionEntry });
runEmbeddedPiAgentMock.mockResolvedValue({
payloads: [{ text: "ok" }],
meta: {
agentMeta: {
usage: { input: 75_000, output: 5_000, cacheRead: 25_000, total: 105_000 },
lastCallUsage: {
input: 55_000,
output: 2_000,
cacheRead: 25_000,
cacheWrite: 1_000,
total: 83_000,
},
},
},
});
const diagnostics: DiagnosticEventPayload[] = [];
const unsubscribe = onInternalDiagnosticEvent((event) => {
diagnostics.push(event);
});
const { typing, sessionCtx, resolvedQueue, followupRun } = createBaseRun({
storePath,
sessionEntry,
});
try {
await runReplyAgent({
commandBody: "hello",
followupRun,
queueKey: "main",
resolvedQueue,
shouldSteer: false,
shouldFollowup: false,
isActive: false,
isStreaming: false,
typing,
sessionCtx,
sessionEntry,
sessionStore: { [sessionKey]: sessionEntry },
sessionKey,
storePath,
defaultModel: "anthropic/claude-opus-4-6",
agentCfgContextTokens: 200_000,
resolvedVerboseLevel: "off",
isNewSession: false,
blockStreamingEnabled: false,
resolvedBlockStreamingBreak: "message_end",
shouldInjectGroupIntro: false,
typingMode: "instant",
});
} finally {
unsubscribe();
}
const usageEvent = diagnostics.find((event) => event.type === "model.usage");
expect(usageEvent).toMatchObject({
type: "model.usage",
usage: {
input: 75_000,
output: 5_000,
cacheRead: 25_000,
promptTokens: 100_000,
total: 105_000,
},
context: {
limit: 200_000,
used: 81_000,
},
});
});
});
describe("runReplyAgent block streaming", () => {
@@ -913,6 +1081,7 @@ describe("runReplyAgent Active Memory inline debug", () => {
model: "claude",
usage: { input: 1200, output: 45, cacheRead: 800, cacheWrite: 200, total: 2245 },
lastCallUsage: { input: 1000, output: 45, cacheRead: 750, cacheWrite: 150, total: 1945 },
promptTokens: 1250,
compactionCount: 1,
},
},
@@ -987,6 +1156,7 @@ describe("runReplyAgent Active Memory inline debug", () => {
expect(traceText).toContain("🔎 Usage (Session Total):");
expect(traceText).toContain("🔎 Usage (Last Turn Total):");
expect(traceText).toContain("🔎 Context Window (Last Model Request):");
expect(traceText).toContain("used=1,250 tok (1.3k)");
expect(traceText).toContain("🔎 Execution Result:");
expect(traceText).toContain("winner=anthropic/claude");
expect(traceText).toContain("fallbackUsed=yes");
@@ -1025,7 +1195,7 @@ describe("runReplyAgent Active Memory inline debug", () => {
expect(traceText).toContain("🔎 Model Input (User Role):");
expect(traceText).toContain("🔎 Model Output (Assistant Role):");
expect(traceText).toContain(
"Summary: winner=claude 🧠 low fallback=yes attempts=2 stop=end_turn prompt=1.9k/200k ⬇️ 1.2k ⬆️ 45 ♻️ 800 🆕 200 🔢 2.2k tools=2 compactions=1",
"Summary: winner=claude 🧠 low fallback=yes attempts=2 stop=end_turn prompt=1.3k/200k ⬇️ 1.2k ⬆️ 45 ♻️ 800 🆕 200 🔢 2.2k tools=2 compactions=1",
);
expect(traceText.indexOf("🔎 Execution Result:")).toBeGreaterThan(
traceText.indexOf("🔎 Context Window (Last Model Request):"),

@@ -585,6 +585,13 @@ function resolveRequestPromptTokens(params: {
total?: number;
};
}): number | undefined {
if (
typeof params.promptTokens === "number" &&
Number.isFinite(params.promptTokens) &&
params.promptTokens > 0
) {
return params.promptTokens;
}
const lastCall = params.lastCallUsage;
if (lastCall) {
const input = lastCall.input ?? 0;
@@ -595,13 +602,6 @@ function resolveRequestPromptTokens(params: {
return sum;
}
}
if (
typeof params.promptTokens === "number" &&
Number.isFinite(params.promptTokens) &&
params.promptTokens > 0
) {
return params.promptTokens;
}
const usage = params.usage;
if (usage) {
const input = usage.input ?? 0;
@@ -1428,8 +1428,13 @@ export async function runReplyAgent(params: {
const output = usage.output ?? 0;
const cacheRead = usage.cacheRead ?? 0;
const cacheWrite = usage.cacheWrite ?? 0;
const promptTokens = input + cacheRead + cacheWrite;
const totalTokens = usage.total ?? promptTokens + output;
const usagePromptTokens = input + cacheRead + cacheWrite;
const totalTokens = usage.total ?? usagePromptTokens + output;
const contextUsedTokens = resolveRequestPromptTokens({
lastCallUsage: runResult.meta?.agentMeta?.lastCallUsage,
promptTokens: runResult.meta?.agentMeta?.promptTokens,
usage,
});
const costConfig = resolveModelCostConfig({
provider: providerUsed,
model: modelUsed,
@@ -1455,13 +1460,13 @@ export async function runReplyAgent(params: {
output,
cacheRead,
cacheWrite,
promptTokens,
promptTokens: usagePromptTokens,
total: totalTokens,
},
lastCallUsage: runResult.meta?.agentMeta?.lastCallUsage,
context: {
limit: contextTokensUsed,
used: totalTokens,
...(contextUsedTokens !== undefined ? { used: contextUsedTokens } : {}),
},
costUsd,
durationMs: Date.now() - runStartedAt,
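With the first test's fixtures, the `context` block change works out as
follows (a worked example, not code from the commit):

```ts
// Before: context.used = totalTokens (provider turn total).
// After:  context.used = resolveRequestPromptTokens(...) (prompt snapshot).
const limit = 200_000;
const before = 105_000 / limit; // 0.525 -> inflated by cache reads and turn accumulation
const after = 44_000 / limit; // 0.22 -> the actual live prompt share
```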