fix: compaction after tool-use abort causes infinite agent call loop (#62600)

Merged via squash. Prepared head SHA: 304ba07207 Co-authored-by: i-dentifier <44976464+i-dentifier@users.noreply.github.com> Co-authored-by: jalehman <550978+jalehman@users.noreply.github.com> Reviewed-by: @jalehman
2026-04-18 04:31:10 +00:00 · 2026-04-08 01:28:00 +08:00
parent e617aa6d1e
commit adb7b0d5d6
8 changed files with 60 additions and 1 deletion
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -84,6 +84,7 @@ Docs: https://docs.openclaw.ai
 - OpenAI TTS/Groq: send `wav` to Groq-compatible speech endpoints, honor explicit `responseFormat` overrides on OpenAI-compatible paths, and only mark voice-note output as voice-compatible when the actual format is `opus`. (#62233) Thanks @neeravmakwana.
 - BlueBubbles/network: respect explicit private-network opt-out for loopback and private `serverUrl` values across account resolution, status probes, monitor startup, and attachment downloads, while keeping public-host attachment hostname pinning intact. (#59373) Thanks @jpreagan.
 - Agents/heartbeat: keep heartbeat runs pinned to the main session so active subagent transcripts are not overwritten by heartbeat status messages. (#61803) thanks @100yenadmin.
+- Agents/compaction: stop compaction-wait aborts from re-entering prompt failover and replaying completed tool turns. (#62600) Thanks @i-dentifier.

 ## 2026.4.5

--- a/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts
+++ b/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts
@@ -186,6 +186,7 @@ const makeAttempt = (overrides: Partial<EmbeddedRunAttemptResult>): EmbeddedRunA
    timedOut: false,
    timedOutDuringCompaction: false,
    promptError: null,
+    promptErrorSource: null,
    sessionIdUsed: "session:test",
    systemPromptReport: undefined,
    messagesSnapshot: [],
@@ -979,6 +980,44 @@ describe("runEmbeddedPiAgent auth profile rotation", () => {
    });
  });

+  it("does not rotate when failover-looking prompt errors came from compaction wait", async () => {
+    await withAgentWorkspace(async ({ agentDir, workspaceDir }) => {
+      await writeAuthStore(agentDir);
+
+      runEmbeddedAttemptMock.mockResolvedValueOnce(
+        makeAttempt({
+          promptError: new Error("rate limit exceeded"),
+          promptErrorSource: "compaction",
+          assistantTexts: ["partial"],
+          lastAssistant: buildAssistant({
+            stopReason: "stop",
+            content: [{ type: "text", text: "partial" }],
+          }),
+        }),
+      );
+
+      const result = await runEmbeddedPiAgentInline({
+        sessionId: "session:test",
+        sessionKey: "agent:test:compaction-wait-abort",
+        sessionFile: path.join(workspaceDir, "session.jsonl"),
+        workspaceDir,
+        agentDir,
+        config: makeConfig(),
+        prompt: "hello",
+        provider: "openai",
+        model: "mock-1",
+        authProfileId: "openai:p1",
+        authProfileIdSource: "auto",
+        timeoutMs: 5_000,
+        runId: "run:compaction-wait-abort",
+      });
+
+      expect(runEmbeddedAttemptMock).toHaveBeenCalledTimes(1);
+      expect(result.payloads?.[0]?.text).toContain("partial");
+      await expectProfileP2UsageUnchanged(agentDir);
+    });
+  });
+
  it("does not rotate for user-pinned profiles", async () => {
    await withAgentWorkspace(async ({ agentDir, workspaceDir }) => {
      await writeAuthStore(agentDir);
--- a/src/agents/pi-embedded-runner/run.overflow-compaction.fixture.ts
+++ b/src/agents/pi-embedded-runner/run.overflow-compaction.fixture.ts
@@ -37,6 +37,7 @@ export function makeAttemptResult(
    timedOut: false,
    timedOutDuringCompaction: false,
    promptError: null,
+    promptErrorSource: null,
    sessionIdUsed: "test-session",
    assistantTexts: ["Hello!"],
    toolMetas,
--- a/src/agents/pi-embedded-runner/run.ts
+++ b/src/agents/pi-embedded-runner/run.ts
@@ -676,6 +676,7 @@ export async function runEmbeddedPiAgent(
          const {
            aborted,
            promptError,
+            promptErrorSource,
            preflightRecovery,
            timedOut,
            timedOutDuringCompaction,
@@ -1086,9 +1087,13 @@ export async function runEmbeddedPiAgent(
            };
          }

-          if (promptError && !aborted) {
+          if (promptError && !aborted && promptErrorSource !== "compaction") {
            // Normalize wrapped errors (e.g. abort-wrapped RESOURCE_EXHAUSTED) into
            // FailoverError so rate-limit classification works even for nested shapes.
+            //
+            // promptErrorSource === "compaction" means the model call already completed and the
+            // abort happened only while waiting for compaction/retry cleanup. Retrying from here
+            // would replay that completed tool turn as a fresh prompt attempt.
            const normalizedPromptFailover = coerceToFailoverError(promptError, {
              provider: activeErrorContext.provider,
              model: activeErrorContext.model,
--- a/src/agents/pi-embedded-runner/run/attempt.ts
+++ b/src/agents/pi-embedded-runner/run/attempt.ts
@@ -2329,6 +2329,7 @@ export async function runEmbeddedAttempt(
        timedOut,
        timedOutDuringCompaction,
        promptError,
+        promptErrorSource,
        preflightRecovery,
        sessionIdUsed,
        bootstrapPromptWarningSignaturesSeen: bootstrapPromptWarning.warningSignaturesSeen,
--- a/src/agents/pi-embedded-runner/run/types.ts
+++ b/src/agents/pi-embedded-runner/run/types.ts
@@ -42,6 +42,16 @@ export type EmbeddedRunAttemptResult = {
  /** True if the timeout occurred while compaction was in progress or pending. */
  timedOutDuringCompaction: boolean;
  promptError: unknown;
+  /**
+   * Identifies which phase produced the promptError.
+   * - "prompt": the LLM call itself failed and may be eligible for retry/fallback.
+   * - "compaction": the prompt succeeded, but waiting for compaction/retry teardown was aborted;
+   *   this must not be retried as a fresh prompt or the same tool turn can replay.
+   * - "precheck": pre-prompt overflow recovery intentionally short-circuited the prompt so the
+   *   outer run loop can recover via compaction/truncation before any model call is made.
+   * - null: no promptError.
+   */
+  promptErrorSource: "prompt" | "compaction" | "precheck" | null;
  preflightRecovery?:
    | {
        route: Exclude<PreemptiveCompactionRoute, "fits">;
--- a/src/agents/pi-embedded-runner/usage-reporting.test.ts
+++ b/src/agents/pi-embedded-runner/usage-reporting.test.ts
@@ -21,6 +21,7 @@ function makeAttemptResult(
    timedOut: false,
    timedOutDuringCompaction: false,
    promptError: null,
+    promptErrorSource: null,
    sessionIdUsed: "test-session",
    messagesSnapshot: [],
    assistantTexts: [],
--- a/src/agents/test-helpers/pi-embedded-runner-e2e-fixtures.ts
+++ b/src/agents/test-helpers/pi-embedded-runner-e2e-fixtures.ts
@@ -105,6 +105,7 @@ export function makeEmbeddedRunnerAttempt(
    timedOut: false,
    timedOutDuringCompaction: false,
    promptError: null,
+    promptErrorSource: null,
    sessionIdUsed: "session:test",
    systemPromptReport: undefined,
    messagesSnapshot: [],