fix(pi-embedded-runner): retry silent stopReason=error turns (non-frontier models)

ollama/glm-5.1:cloud (and occasionally other models) can end a turn with
stopReason="error", usage.output=0, and empty content[] after a successful
tool-call sequence. The existing empty-response retry path in
src/agents/pi-embedded-runner/run/incomplete-turn.ts is gated on
isStrictAgenticSupportedProviderModel (gpt-5 family only), so non-frontier
models fall through to "incomplete turn detected" with payloads=0 and no
recovery. The user sees no reply and has to nudge.

Add a narrow, model-agnostic resubmission inside the attempt loop, placed
before the incompleteTurnText surface-to-user return:

  - stopReason === "error"
  - usage.output === 0
  - content.length === 0   (excludes reasoning-only error turns)
  - bounded by MAX_EMPTY_ERROR_RETRIES = 3

No instruction injection, no model gating; same prompt, same session
transcript (tool results already captured), just let the loop try again.

New test file run.empty-error-retry.test.ts covers:
  1. Retries for ollama/glm-5.1:cloud → succeeds on 2nd attempt.
  2. Caps at 3 retries → 4 total attempts → surfaces incomplete-turn error.
  3. Does NOT retry when output > 0 (preserve produced text).
  4. Does NOT retry when stopReason=stop + output=0 (NO_REPLY path).
  5. Retries for anthropic/claude-opus-4-7 too — model-agnostic.

Relates to #68281.
This commit is contained in:
Watchtower
2026-04-17 13:01:22 -07:00
committed by Peter Steinberger
parent 982b1c9464
commit 5fb302ebf1
2 changed files with 199 additions and 0 deletions

View File

@@ -0,0 +1,156 @@
import { beforeAll, beforeEach, describe, expect, it } from "vitest";
import { makeAttemptResult } from "./run.overflow-compaction.fixture.js";
import {
loadRunOverflowCompactionHarness,
mockedClassifyFailoverReason,
mockedGlobalHookRunner,
mockedRunEmbeddedAttempt,
overflowBaseRunParams,
resetRunOverflowCompactionHarnessMocks,
} from "./run.overflow-compaction.harness.js";
import type { EmbeddedRunAttemptResult } from "./run/types.js";
// Regression coverage for the silent-error retry in runEmbeddedPiAgent.
//
// Symptom: ollama/glm-5.1 occasionally ends a turn with stopReason="error" and
// zero output tokens after a successful tool-call sequence. The user sees no
// reply and has to nudge. The existing empty-response retry path is gated on
// the strict-agentic contract (gpt-5 only), so non-frontier models fell
// through to "incomplete turn detected". This suite locks in a narrower,
// model-agnostic resubmission.
// Bound in beforeAll from the loaded harness module (see the suite below),
// so the mocked dependencies are in place before run.js is imported.
let runEmbeddedPiAgent: typeof import("./run.js").runEmbeddedPiAgent;
/**
 * Builds an attempt result reproducing the silent-failure mode under test:
 * stopReason="error", empty content, and (by default) zero output tokens.
 *
 * @param provider - Provider id stamped onto the assistant message.
 * @param model - Model id stamped onto the assistant message.
 * @param outputTokens - Output token count; 0 is the retry trigger, a
 *   non-zero value exercises the "keep produced text" skip path.
 */
function emptyErrorAttempt(
  provider: string,
  model: string,
  outputTokens = 0,
): EmbeddedRunAttemptResult {
  const usage = {
    input: 100,
    output: outputTokens,
    totalTokens: 100 + outputTokens,
  };
  const lastAssistant = {
    stopReason: "error",
    provider,
    model,
    content: [],
    usage,
  } as unknown as EmbeddedRunAttemptResult["lastAssistant"];
  return makeAttemptResult({ assistantTexts: [], lastAssistant });
}
/**
 * Builds a clean, successful attempt result ("Done.") used as the
 * post-retry outcome in the happy-path cases.
 *
 * @param provider - Provider id stamped onto the assistant message.
 * @param model - Model id stamped onto the assistant message.
 */
function successAttempt(provider: string, model: string): EmbeddedRunAttemptResult {
  const lastAssistant = {
    stopReason: "stop",
    provider,
    model,
    content: [{ type: "text", text: "Done." }],
    usage: { input: 100, output: 5, totalTokens: 105 },
  } as unknown as EmbeddedRunAttemptResult["lastAssistant"];
  return makeAttemptResult({ assistantTexts: ["Done."], lastAssistant });
}
describe("runEmbeddedPiAgent silent-error retry", () => {
beforeAll(async () => {
({ runEmbeddedPiAgent } = await loadRunOverflowCompactionHarness());
});
beforeEach(() => {
resetRunOverflowCompactionHarnessMocks();
mockedGlobalHookRunner.hasHooks.mockImplementation(() => false);
mockedClassifyFailoverReason.mockReturnValue(null);
});
it("retries when a turn ends with stopReason=error and zero output tokens", async () => {
mockedRunEmbeddedAttempt.mockResolvedValueOnce(emptyErrorAttempt("ollama", "glm-5.1:cloud"));
mockedRunEmbeddedAttempt.mockResolvedValueOnce(successAttempt("ollama", "glm-5.1:cloud"));
const result = await runEmbeddedPiAgent({
...overflowBaseRunParams,
provider: "ollama",
model: "glm-5.1:cloud",
runId: "run-empty-error-retry-basic",
});
expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(2);
expect(result.payloads?.[0]?.isError).toBeFalsy();
});
it("caps retries at MAX_EMPTY_ERROR_RETRIES and surfaces incomplete-turn error", async () => {
// 1 initial + 3 retries = 4 attempts, all returning empty-error.
for (let i = 0; i < 4; i += 1) {
mockedRunEmbeddedAttempt.mockResolvedValueOnce(emptyErrorAttempt("ollama", "glm-5.1:cloud"));
}
const result = await runEmbeddedPiAgent({
...overflowBaseRunParams,
provider: "ollama",
model: "glm-5.1:cloud",
runId: "run-empty-error-retry-exhausted",
});
expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(4);
expect(result.payloads?.[0]?.isError).toBe(true);
});
it("does not retry when stopReason=error but output tokens > 0", async () => {
// Model produced something before erroring; surfacing that text is better
// than silent resubmission.
mockedRunEmbeddedAttempt.mockResolvedValueOnce(
emptyErrorAttempt("ollama", "glm-5.1:cloud", 12),
);
await runEmbeddedPiAgent({
...overflowBaseRunParams,
provider: "ollama",
model: "glm-5.1:cloud",
runId: "run-empty-error-retry-skip-with-output",
});
expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(1);
});
it("does not retry when stopReason=stop and output=0 (out of scope)", async () => {
// Clean stop with no output is a legitimate silent reply (e.g. NO_REPLY
// token path), not a crash. This retry must not trigger there.
mockedRunEmbeddedAttempt.mockResolvedValueOnce(
makeAttemptResult({
assistantTexts: [],
lastAssistant: {
stopReason: "stop",
provider: "ollama",
model: "glm-5.1:cloud",
content: [],
usage: { input: 100, output: 0, totalTokens: 100 },
} as unknown as EmbeddedRunAttemptResult["lastAssistant"],
}),
);
await runEmbeddedPiAgent({
...overflowBaseRunParams,
provider: "ollama",
model: "glm-5.1:cloud",
runId: "run-empty-error-retry-skip-clean-stop",
});
expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(1);
});
it("retries for frontier models too — the fix is model-agnostic", async () => {
mockedRunEmbeddedAttempt.mockResolvedValueOnce(
emptyErrorAttempt("anthropic", "claude-opus-4-7"),
);
mockedRunEmbeddedAttempt.mockResolvedValueOnce(successAttempt("anthropic", "claude-opus-4-7"));
const result = await runEmbeddedPiAgent({
...overflowBaseRunParams,
provider: "anthropic",
model: "claude-opus-4-7",
runId: "run-empty-error-retry-frontier",
});
expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(2);
expect(result.payloads?.[0]?.isError).toBeFalsy();
});
});

View File

@@ -489,6 +489,13 @@ export async function runEmbeddedPiAgent(
});
let rateLimitProfileRotations = 0;
let timeoutCompactionAttempts = 0;
// Silent-error retry: non-strict-agentic models (e.g. ollama/glm-5.1) can
// end a turn with stopReason="error" + zero output tokens, producing no
// user-visible text. The existing empty-response retry is gated on
// isStrictAgenticSupportedProviderModel (gpt-5 only). This is an
// orthogonal, model-agnostic resubmission.
const MAX_EMPTY_ERROR_RETRIES = 3;
let emptyErrorRetries = 0;
const overloadFailoverBackoffMs = resolveOverloadFailoverBackoffMs(params.config);
const overloadProfileRotationLimit = resolveOverloadProfileRotationLimit(params.config);
const rateLimitProfileRotationLimit = resolveRateLimitProfileRotationLimit(params.config);
@@ -1911,6 +1918,42 @@ export async function runEmbeddedPiAgent(
`provider=${activeErrorContext.provider}/${activeErrorContext.model} attempts=${emptyResponseRetryAttempts}/${maxEmptyResponseRetryAttempts} — surfacing incomplete-turn error`,
);
}
// ── silent-error retry ────────────────────────────────────────────
// Observed with ollama/glm-5.1: a turn can end with stopReason="error"
// and zero output tokens AND empty content after a successful
// tool-call sequence, producing no user-visible text at all. The
// existing empty-response retry path (resolveEmptyResponseRetryInstruction)
// is gated on the strict-agentic contract (gpt-5 only), so non-frontier
// models fall through to "incomplete turn detected" → silent gap
// until the user nudges. This is a narrower, model-agnostic
// resubmission: same prompt, same session transcript (tool results
// already captured), no instruction injection. Placed before the
// incompleteTurnText return so it actually gets a chance to fire.
//
// Content-empty guard: a reasoning-only error (content has thinking
// blocks) is a distinct failure mode handled elsewhere; only retry
// when the assistant truly produced nothing.
const silentErrorContent = sessionLastAssistant?.content as Array<unknown> | undefined;
if (
incompleteTurnText &&
!aborted &&
!promptError &&
!timedOut &&
sessionLastAssistant?.stopReason === "error" &&
((sessionLastAssistant?.usage as { output?: number } | undefined)?.output ?? 0) === 0 &&
(silentErrorContent?.length ?? 0) === 0 &&
emptyErrorRetries < MAX_EMPTY_ERROR_RETRIES
) {
emptyErrorRetries += 1;
log.warn(
`[empty-error-retry] stopReason=error output=0; resubmitting ` +
`attempt=${emptyErrorRetries}/${MAX_EMPTY_ERROR_RETRIES} ` +
`provider=${sessionLastAssistant?.provider ?? provider} ` +
`model=${sessionLastAssistant?.model ?? model.id} ` +
`sessionKey=${params.sessionKey ?? params.sessionId}`,
);
continue;
}
if (incompleteTurnText) {
const replayInvalid = resolveReplayInvalidForAttempt(incompleteTurnText);
const livenessState = resolveRunLivenessState({