fix(agents): trace slow embedded run startup stages

2026-05-06 12:30:44 +00:00 · 2026-04-29 14:08:29 +01:00
parent bbf985d50a
commit 20e2117371
5 changed files with 202 additions and 0 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,7 @@ Docs: https://docs.openclaw.ai
 - CLI/models: restore provider-filtered `models list --all --provider <id>` rows for providers without manifest/static catalog coverage, including Anthropic and Amazon Bedrock, while keeping the compatibility fallback off expensive availability and resolver paths. Thanks @shakkernerd.
 - CLI/tools: keep the Gateway `tools.*` RPC namespace out of plugin command discovery and managed proxy startup, so stray commands like `openclaw tools effective` fail quickly instead of cold-loading plugin metadata. Refs #73477. Thanks @oromeis.
 - CLI/status: keep default text `openclaw status --usage` on metadata-only channel scans unless `--deep` or `--all` is set, and send stray `openclaw tools --help` through the precomputed root-help fast path so latency-triage commands avoid plugin/runtime cold loads before printing. Refs #73477 and #74220. Thanks @oromeis and @NianJiuZst.
+- Agents/diagnostics: log slow embedded-run startup and preparation stage timings before model I/O, so Docker/VPS latency reports can identify whether plugin loading, auth/model resolution, tool inventory, bootstrap, MCP/LSP, resource loading, or stream setup is dominating pre-run latency. Refs #73428. Thanks @Dimaoggg, @quangtran88, and @Heyvhuang.
 - Plugins/runtime-deps: memoize packaged bundled runtime dist-mirror preparation after the first successful pass while keeping source-checkout mirrors refreshable, so constrained Docker/VPS installs avoid repeated root scans before chat turns. Refs #73428, #73421, #73532, and #73477. Thanks @Dimaoggg, @oromeis, @oadiazp, @jmfraga, @bstanbury, @antoniusfelix, and @jkobject.
 - Channels/Discord: treat bare numeric outbound targets that match the effective Discord DM allowlist as user DMs while preserving account-specific legacy `dm.allowFrom` precedence over inherited root `allowFrom`. (#74303) Thanks @Squirbie.
 - Control UI: make the chat sidebar split divider focusable, keyboard-resizable, ARIA-described, and pointer-event based so sidebar resizing works without a mouse. Thanks @BunsDev.
--- a/src/agents/pi-embedded-runner/run.ts
+++ b/src/agents/pi-embedded-runner/run.ts
@@ -90,6 +90,11 @@ import { log } from "./logger.js";
 import { resolveModelAsync } from "./model.js";
 import { createEmbeddedRunReplayState, observeReplayMetadata } from "./replay-state.js";
 import { handleAssistantFailover } from "./run/assistant-failover.js";
+import {
+  createEmbeddedRunStageTracker,
+  formatEmbeddedRunStageSummary,
+  shouldEmitEmbeddedRunStageSummary,
+} from "./run/attempt-stage-timing.js";
 import { forgetPromptBuildDrainCacheForRun } from "./run/attempt.prompt-helpers.js";
 import { createEmbeddedRunAuthController } from "./run/auth-controller.js";
 import { resolveAuthProfileFailureReason } from "./run/auth-profile-failure-policy.js";
@@ -324,6 +329,20 @@ export async function runEmbeddedPiAgent(
    return enqueueGlobal(async () => {
      throwIfAborted();
      const started = Date.now();
+      const startupStages = createEmbeddedRunStageTracker();
+      let startupStagesEmitted = false;
+      const emitStartupStageSummary = (phase: string) => {
+        const summary = startupStages.snapshot();
+        const message = formatEmbeddedRunStageSummary(
+          `embedded run startup stages: runId=${params.runId} sessionId=${params.sessionId} phase=${phase}`,
+          summary,
+        );
+        if (shouldEmitEmbeddedRunStageSummary(summary)) {
+          log.warn(message);
+        } else if (log.isEnabled("debug")) {
+          log.debug(message);
+        }
+      };
      params.onExecutionStarted?.();
      const workspaceResolution = resolveRunWorkspaceDir({
        workspaceDir: params.workspaceDir,
@@ -344,11 +363,13 @@ export async function runEmbeddedPiAgent(
          `[workspace-fallback] caller=runEmbeddedPiAgent reason=${workspaceResolution.fallbackReason} run=${params.runId} session=${redactedSessionId} sessionKey=${redactedSessionKey} agent=${workspaceResolution.agentId} workspace=${redactedWorkspace}`,
        );
      }
+      startupStages.mark("workspace");
      ensureRuntimePluginsLoaded({
        config: params.config,
        workspaceDir: resolvedWorkspace,
        allowGatewaySubagentBinding: params.allowGatewaySubagentBinding,
      });
+      startupStages.mark("runtime-plugins");

      let provider = (params.provider ?? DEFAULT_PROVIDER).trim() || DEFAULT_PROVIDER;
      let modelId = (params.model ?? DEFAULT_MODEL).trim() || DEFAULT_MODEL;
@@ -407,6 +428,7 @@ export async function runEmbeddedPiAgent(
      provider = hookSelection.provider;
      modelId = hookSelection.modelId;
      const legacyBeforeAgentStartResult = hookSelection.legacyBeforeAgentStartResult;
+      startupStages.mark("hooks");
      const agentHarness = selectAgentHarness({
        provider,
        modelId,
@@ -453,6 +475,7 @@ export async function runEmbeddedPiAgent(
      });
      const ctxInfo = resolvedRuntimeModel.ctxInfo;
      let effectiveModel = resolvedRuntimeModel.effectiveModel;
+      startupStages.mark("model-resolution");

      const authStore = pluginHarnessOwnsTransport
        ? createEmptyAuthProfileStore()
@@ -611,6 +634,7 @@ export async function runEmbeddedPiAgent(
      } else if (lockedProfileId) {
        lastProfileId = lockedProfileId;
      }
+      startupStages.mark("auth");
      const { sessionAgentId } = resolveSessionAgentIds({
        sessionKey: params.sessionKey,
        config: params.config,
@@ -745,6 +769,7 @@ export async function runEmbeddedPiAgent(
        agentDir,
        workspaceDir: resolvedWorkspace,
      });
+      startupStages.mark("context-engine");
      try {
        let activeSessionId = params.sessionId;
        let activeSessionFile = params.sessionFile;
@@ -892,6 +917,11 @@ export async function runEmbeddedPiAgent(
              fastMode: params.fastMode,
            },
          });
+          if (!startupStagesEmitted) {
+            startupStages.mark("attempt-dispatch");
+            emitStartupStageSummary("attempt-dispatch");
+            startupStagesEmitted = true;
+          }

          const rawAttempt = await runEmbeddedAttemptWithBackend({
            sessionId: activeSessionId,
--- a/src/agents/pi-embedded-runner/run/attempt-stage-timing.test.ts
+++ b/src/agents/pi-embedded-runner/run/attempt-stage-timing.test.ts
@@ -0,0 +1,71 @@
+import { describe, expect, it } from "vitest";
+import {
+  createEmbeddedRunStageTracker,
+  formatEmbeddedRunStageSummary,
+  shouldEmitEmbeddedRunStageSummary,
+} from "./attempt-stage-timing.js";
+
+describe("embedded run stage timing", () => {
+  it("captures stage duration and elapsed time", () => {
+    let clock = 10;
+    const tracker = createEmbeddedRunStageTracker({ now: () => clock });
+
+    clock = 25;
+    tracker.mark("workspace");
+    clock = 40;
+    tracker.mark("tools");
+    clock = 45;
+
+    expect(tracker.snapshot()).toEqual({
+      totalMs: 35,
+      stages: [
+        { name: "workspace", durationMs: 15, elapsedMs: 15 },
+        { name: "tools", durationMs: 15, elapsedMs: 30 },
+      ],
+    });
+  });
+
+  it("emits only slow stage summaries", () => {
+    expect(
+      shouldEmitEmbeddedRunStageSummary(
+        {
+          totalMs: 1_999,
+          stages: [{ name: "auth", durationMs: 999, elapsedMs: 999 }],
+        },
+        { totalThresholdMs: 2_000, stageThresholdMs: 1_000 },
+      ),
+    ).toBe(false);
+    expect(
+      shouldEmitEmbeddedRunStageSummary(
+        {
+          totalMs: 2_000,
+          stages: [{ name: "auth", durationMs: 10, elapsedMs: 10 }],
+        },
+        { totalThresholdMs: 2_000, stageThresholdMs: 1_000 },
+      ),
+    ).toBe(true);
+    expect(
+      shouldEmitEmbeddedRunStageSummary(
+        {
+          totalMs: 10,
+          stages: [{ name: "auth", durationMs: 1_000, elapsedMs: 1_000 }],
+        },
+        { totalThresholdMs: 2_000, stageThresholdMs: 1_000 },
+      ),
+    ).toBe(true);
+  });
+
+  it("formats summaries compactly for logs", () => {
+    expect(
+      formatEmbeddedRunStageSummary("embedded run startup stages: runId=r1", {
+        totalMs: 80,
+        stages: [
+          { name: "workspace", durationMs: 25, elapsedMs: 25 },
+          { name: "tools", durationMs: 55, elapsedMs: 80 },
+        ],
+      }),
+    ).toBe(
+      "embedded run startup stages: runId=r1 totalMs=80 stages=workspace:25ms@25ms,tools:55ms@80ms",
+    );
+  });
+});
--- a/src/agents/pi-embedded-runner/run/attempt-stage-timing.ts
+++ b/src/agents/pi-embedded-runner/run/attempt-stage-timing.ts
@@ -0,0 +1,72 @@
+export type EmbeddedRunStageTiming = {
+  name: string;
+  durationMs: number;
+  elapsedMs: number;
+};
+
+export type EmbeddedRunStageSummary = {
+  totalMs: number;
+  stages: EmbeddedRunStageTiming[];
+};
+
+export type EmbeddedRunStageTracker = {
+  mark: (name: string) => void;
+  snapshot: () => EmbeddedRunStageSummary;
+};
+
+export function createEmbeddedRunStageTracker(options?: {
+  now?: () => number;
+}): EmbeddedRunStageTracker {
+  const now = options?.now ?? Date.now;
+  const startedAt = now();
+  let previousAt = startedAt;
+  const stages: EmbeddedRunStageTiming[] = [];
+
+  const toMs = (value: number) => Math.max(0, Math.round(value));
+
+  return {
+    mark(name) {
+      const currentAt = now();
+      stages.push({
+        name,
+        durationMs: toMs(currentAt - previousAt),
+        elapsedMs: toMs(currentAt - startedAt),
+      });
+      previousAt = currentAt;
+    },
+    snapshot() {
+      return {
+        totalMs: toMs(now() - startedAt),
+        stages: stages.slice(),
+      };
+    },
+  };
+}
+
+export function shouldEmitEmbeddedRunStageSummary(
+  summary: EmbeddedRunStageSummary,
+  options?: {
+    totalThresholdMs?: number;
+    stageThresholdMs?: number;
+  },
+): boolean {
+  const totalThresholdMs = options?.totalThresholdMs ?? 2_000;
+  const stageThresholdMs = options?.stageThresholdMs ?? 1_000;
+  return (
+    summary.totalMs >= totalThresholdMs ||
+    summary.stages.some((stage) => stage.durationMs >= stageThresholdMs)
+  );
+}
+
+export function formatEmbeddedRunStageSummary(
+  prefix: string,
+  summary: EmbeddedRunStageSummary,
+): string {
+  const stages =
+    summary.stages.length > 0
+      ? summary.stages
+          .map((stage) => `${stage.name}:${stage.durationMs}ms@${stage.elapsedMs}ms`)
+          .join(",")
+      : "none";
+  return `${prefix} totalMs=${summary.totalMs} stages=${stages}`;
+}
--- a/src/agents/pi-embedded-runner/run/attempt.ts
+++ b/src/agents/pi-embedded-runner/run/attempt.ts
@@ -242,6 +242,11 @@ import {
  shouldRotateCompactionTranscript,
 } from "../compaction-successor-transcript.js";
 import { configureEmbeddedAttemptHttpRuntime } from "./attempt-http-runtime.js";
+import {
+  createEmbeddedRunStageTracker,
+  formatEmbeddedRunStageSummary,
+  shouldEmitEmbeddedRunStageSummary,
+} from "./attempt-stage-timing.js";
 import {
  assembleAttemptContextEngine,
  buildLoopPromptCacheInfo,
@@ -589,6 +594,19 @@ export async function runEmbeddedAttempt(
  log.debug(
    `embedded run start: runId=${params.runId} sessionId=${params.sessionId} provider=${params.provider} model=${params.modelId} thinking=${params.thinkLevel} messageChannel=${params.messageChannel ?? params.messageProvider ?? "unknown"}`,
  );
+  const prepStages = createEmbeddedRunStageTracker();
+  const emitPrepStageSummary = (phase: string) => {
+    const summary = prepStages.snapshot();
+    const message = formatEmbeddedRunStageSummary(
+      `embedded run prep stages: runId=${params.runId} sessionId=${params.sessionId} phase=${phase}`,
+      summary,
+    );
+    if (shouldEmitEmbeddedRunStageSummary(summary)) {
+      log.warn(message);
+    } else if (log.isEnabled("debug")) {
+      log.debug(message);
+    }
+  };

  await fs.mkdir(resolvedWorkspace, { recursive: true });

@@ -610,6 +628,7 @@ export async function runEmbeddedAttempt(
    config: params.config,
    agentId: params.agentId,
  });
+  prepStages.mark("workspace-sandbox");

  let restoreSkillEnv: (() => void) | undefined;
  let aborted = Boolean(params.abortSignal?.aborted);
@@ -645,6 +664,7 @@ export async function runEmbeddedAttempt(
      workspaceDir: effectiveWorkspace,
      agentId: sessionAgentId,
    });
+    prepStages.mark("skills");

    const sessionLabel = params.sessionKey ?? params.sessionId;
    const contextInjectionMode = resolveContextInjectionMode(params.config);
@@ -760,6 +780,7 @@ export async function runEmbeddedAttempt(
            });
            return applyEmbeddedAttemptToolsAllow(allTools, params.toolsAllow);
          })();
+    prepStages.mark("core-plugin-tools");
    const toolsEnabled = supportsModelTools(params.model);
    const bootstrapHasFileAccess = toolsEnabled && toolsRaw.some((tool) => tool.name === "read");
    const bootstrapRouting = await resolveAttemptWorkspaceBootstrapRouting({
@@ -803,6 +824,7 @@ export async function runEmbeddedAttempt(
          runKind: params.bootstrapContextRunKind,
        }),
    });
+    prepStages.mark("bootstrap-context");
    const remappedContextFiles = remapInjectedContextFilesToWorkspace({
      files: resolvedContextFiles,
      sourceWorkspaceDir: resolvedWorkspace,
@@ -935,6 +957,7 @@ export async function runEmbeddedAttempt(
      warn: (message) => log.warn(message),
    });
    const effectiveTools = [...tools, ...filteredBundledTools];
+    prepStages.mark("bundle-tools");
    const allowedToolNames = collectAllowedToolNames({
      tools: effectiveTools,
      clientTools,
@@ -1221,6 +1244,7 @@ export async function runEmbeddedAttempt(
      bootstrapMode,
      contextFiles: remappedContextFiles,
    });
+    prepStages.mark("system-prompt");

    // Keep the session lock scoped to transcript/session mutations. Cold plugin
    // and tool setup can be slow, and holding the lock there blocks CLI fallback
@@ -1345,6 +1369,7 @@ export async function runEmbeddedAttempt(
        cfg: params.config,
        contextTokenBudget: params.contextTokenBudget,
      });
+      prepStages.mark("session-resource-loader");

      // Get hook runner early so it's available when creating tools
      const hookRunner = getGlobalHookRunner();
@@ -1441,6 +1466,7 @@ export async function runEmbeddedAttempt(
      }
      session.setActiveToolsByName(sessionToolAllowlist);
      const activeSession = session;
+      prepStages.mark("agent-session");
      if (isRawModelRun) {
        // Raw model probes should measure exactly the requested prompt against
        // the selected provider/model. Reset clears restored transcript state
@@ -1692,6 +1718,8 @@ export async function runEmbeddedAttempt(
            `(${params.provider}/${params.modelId})`,
        );
      }
+      prepStages.mark("stream-setup");
+      emitPrepStageSummary("stream-ready");

      const cacheObservabilityEnabled = Boolean(cacheTrace) || log.isEnabled("debug");
      const promptCacheToolNames = collectPromptCacheToolNames(