fix(pi-embedded-runner): retry silent stopReason=error turns (non-frontier models)

ollama/glm-5.1:cloud (and occasionally other models) can end a turn with
stopReason="error", usage.output=0, and empty content[] after a successful
tool-call sequence. The existing empty-response retry path in
src/agents/pi-embedded-runner/run/incomplete-turn.ts is gated on
isStrictAgenticSupportedProviderModel (gpt-5 family only), so non-frontier
models fall through to "incomplete turn detected" with payloads=0 and no
recovery. The user sees no reply and has to nudge.

Add a narrow, model-agnostic resubmission inside the attempt loop, placed
before the incompleteTurnText surface-to-user return:

  - stopReason === "error"
  - usage.output === 0
  - content.length === 0   (excludes reasoning-only error turns)
  - bounded by MAX_EMPTY_ERROR_RETRIES = 3

No instruction injection, no model gating; same prompt, same session
transcript (tool results already captured), just let the loop try again.

New test file run.empty-error-retry.test.ts covers:
  1. Retries for ollama/glm-5.1:cloud → succeeds on 2nd attempt.
  2. Caps at 3 retries → 4 total attempts → surfaces incomplete-turn error.
  3. Does NOT retry when output > 0 (preserve produced text).
  4. Does NOT retry when stopReason=stop + output=0 (NO_REPLY path).
  5. Retries for anthropic/claude-opus-4-7 too — model-agnostic.

Relates to #68281.
This commit is contained in:
Watchtower
2026-04-17 13:01:22 -07:00
committed by Peter Steinberger
parent 982b1c9464
commit 5fb302ebf1
2 changed files with 199 additions and 0 deletions

View File

@@ -0,0 +1,156 @@
import { beforeAll, beforeEach, describe, expect, it } from "vitest";
import { makeAttemptResult } from "./run.overflow-compaction.fixture.js";
import {
loadRunOverflowCompactionHarness,
mockedClassifyFailoverReason,
mockedGlobalHookRunner,
mockedRunEmbeddedAttempt,
overflowBaseRunParams,
resetRunOverflowCompactionHarnessMocks,
} from "./run.overflow-compaction.harness.js";
import type { EmbeddedRunAttemptResult } from "./run/types.js";
// Regression coverage for the silent-error retry in runEmbeddedPiAgent.
//
// Symptom: ollama/glm-5.1 occasionally ends a turn with stopReason="error" and
// zero output tokens after a successful tool-call sequence. The user sees no
// reply and has to nudge. The existing empty-response retry path is gated on
// the strict-agentic contract (gpt-5 only), so non-frontier models fell
// through to "incomplete turn detected". This suite locks in a narrower,
// model-agnostic resubmission.
// Bound in beforeAll from the loaded harness module (see the suite below),
// so the mocked dependencies are in place before run.js is imported.
let runEmbeddedPiAgent: typeof import("./run.js").runEmbeddedPiAgent;
/**
 * Builds an attempt result reproducing the silent-failure mode under test:
 * stopReason="error", empty content, and (by default) zero output tokens.
 *
 * @param provider - Provider id stamped onto the assistant message.
 * @param model - Model id stamped onto the assistant message.
 * @param outputTokens - Output token count; 0 is the retry trigger, a
 *   non-zero value exercises the "keep produced text" skip path.
 */
function emptyErrorAttempt(
  provider: string,
  model: string,
  outputTokens = 0,
): EmbeddedRunAttemptResult {
  const usage = {
    input: 100,
    output: outputTokens,
    totalTokens: 100 + outputTokens,
  };
  const lastAssistant = {
    stopReason: "error",
    provider,
    model,
    content: [],
    usage,
  } as unknown as EmbeddedRunAttemptResult["lastAssistant"];
  return makeAttemptResult({ assistantTexts: [], lastAssistant });
}
/**
 * Builds a clean, successful attempt result ("Done.") used as the
 * post-retry outcome in the happy-path cases.
 *
 * @param provider - Provider id stamped onto the assistant message.
 * @param model - Model id stamped onto the assistant message.
 */
function successAttempt(provider: string, model: string): EmbeddedRunAttemptResult {
  const lastAssistant = {
    stopReason: "stop",
    provider,
    model,
    content: [{ type: "text", text: "Done." }],
    usage: { input: 100, output: 5, totalTokens: 105 },
  } as unknown as EmbeddedRunAttemptResult["lastAssistant"];
  return makeAttemptResult({ assistantTexts: ["Done."], lastAssistant });
}
describe("runEmbeddedPiAgent silent-error retry", () => {
beforeAll(async () => {
({ runEmbeddedPiAgent } = await loadRunOverflowCompactionHarness());
});
beforeEach(() => {
resetRunOverflowCompactionHarnessMocks();
mockedGlobalHookRunner.hasHooks.mockImplementation(() => false);
mockedClassifyFailoverReason.mockReturnValue(null);
});
it("retries when a turn ends with stopReason=error and zero output tokens", async () => {
mockedRunEmbeddedAttempt.mockResolvedValueOnce(emptyErrorAttempt("ollama", "glm-5.1:cloud"));
mockedRunEmbeddedAttempt.mockResolvedValueOnce(successAttempt("ollama", "glm-5.1:cloud"));
const result = await runEmbeddedPiAgent({
...overflowBaseRunParams,
provider: "ollama",
model: "glm-5.1:cloud",
runId: "run-empty-error-retry-basic",
});
expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(2);
expect(result.payloads?.[0]?.isError).toBeFalsy();
});
it("caps retries at MAX_EMPTY_ERROR_RETRIES and surfaces incomplete-turn error", async () => {
// 1 initial + 3 retries = 4 attempts, all returning empty-error.
for (let i = 0; i < 4; i += 1) {
mockedRunEmbeddedAttempt.mockResolvedValueOnce(emptyErrorAttempt("ollama", "glm-5.1:cloud"));
}
const result = await runEmbeddedPiAgent({
...overflowBaseRunParams,
provider: "ollama",
model: "glm-5.1:cloud",
runId: "run-empty-error-retry-exhausted",
});
expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(4);
expect(result.payloads?.[0]?.isError).toBe(true);
});
it("does not retry when stopReason=error but output tokens > 0", async () => {
// Model produced something before erroring; surfacing that text is better
// than silent resubmission.
mockedRunEmbeddedAttempt.mockResolvedValueOnce(
emptyErrorAttempt("ollama", "glm-5.1:cloud", 12),
);
await runEmbeddedPiAgent({
...overflowBaseRunParams,
provider: "ollama",
model: "glm-5.1:cloud",
runId: "run-empty-error-retry-skip-with-output",
});
expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(1);
});
it("does not retry when stopReason=stop and output=0 (out of scope)", async () => {
// Clean stop with no output is a legitimate silent reply (e.g. NO_REPLY
// token path), not a crash. This retry must not trigger there.
mockedRunEmbeddedAttempt.mockResolvedValueOnce(
makeAttemptResult({
assistantTexts: [],
lastAssistant: {
stopReason: "stop",
provider: "ollama",
model: "glm-5.1:cloud",
content: [],
usage: { input: 100, output: 0, totalTokens: 100 },
} as unknown as EmbeddedRunAttemptResult["lastAssistant"],
}),
);
await runEmbeddedPiAgent({
...overflowBaseRunParams,
provider: "ollama",
model: "glm-5.1:cloud",
runId: "run-empty-error-retry-skip-clean-stop",
});
expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(1);
});
it("retries for frontier models too — the fix is model-agnostic", async () => {
mockedRunEmbeddedAttempt.mockResolvedValueOnce(
emptyErrorAttempt("anthropic", "claude-opus-4-7"),
);
mockedRunEmbeddedAttempt.mockResolvedValueOnce(successAttempt("anthropic", "claude-opus-4-7"));
const result = await runEmbeddedPiAgent({
...overflowBaseRunParams,
provider: "anthropic",
model: "claude-opus-4-7",
runId: "run-empty-error-retry-frontier",
});
expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(2);
expect(result.payloads?.[0]?.isError).toBeFalsy();
});
});

View File

@@ -489,6 +489,13 @@ export async function runEmbeddedPiAgent(
});
let rateLimitProfileRotations = 0;
let timeoutCompactionAttempts = 0;
// Silent-error retry: non-strict-agentic models (e.g. ollama/glm-5.1) can
// end a turn with stopReason="error" + zero output tokens, producing no
// user-visible text. The existing empty-response retry is gated on
// isStrictAgenticSupportedProviderModel (gpt-5 only). This is an
// orthogonal, model-agnostic resubmission.
const MAX_EMPTY_ERROR_RETRIES = 3;
let emptyErrorRetries = 0;
const overloadFailoverBackoffMs = resolveOverloadFailoverBackoffMs(params.config);
const overloadProfileRotationLimit = resolveOverloadProfileRotationLimit(params.config);
const rateLimitProfileRotationLimit = resolveRateLimitProfileRotationLimit(params.config);
@@ -1911,6 +1918,42 @@ export async function runEmbeddedPiAgent(
`provider=${activeErrorContext.provider}/${activeErrorContext.model} attempts=${emptyResponseRetryAttempts}/${maxEmptyResponseRetryAttempts} — surfacing incomplete-turn error`,
);
}
// ── silent-error retry ────────────────────────────────────────────
// Observed with ollama/glm-5.1: a turn can end with stopReason="error"
// and zero output tokens AND empty content after a successful
// tool-call sequence, producing no user-visible text at all. The
// existing empty-response retry path (resolveEmptyResponseRetryInstruction)
// is gated on the strict-agentic contract (gpt-5 only), so non-frontier
// models fall through to "incomplete turn detected" → silent gap
// until the user nudges. This is a narrower, model-agnostic
// resubmission: same prompt, same session transcript (tool results
// already captured), no instruction injection. Placed before the
// incompleteTurnText return so it actually gets a chance to fire.
//
// Content-empty guard: a reasoning-only error (content has thinking
// blocks) is a distinct failure mode handled elsewhere; only retry
// when the assistant truly produced nothing.
const silentErrorContent = sessionLastAssistant?.content as Array<unknown> | undefined;
if (
incompleteTurnText &&
!aborted &&
!promptError &&
!timedOut &&
sessionLastAssistant?.stopReason === "error" &&
((sessionLastAssistant?.usage as { output?: number } | undefined)?.output ?? 0) === 0 &&
(silentErrorContent?.length ?? 0) === 0 &&
emptyErrorRetries < MAX_EMPTY_ERROR_RETRIES
) {
emptyErrorRetries += 1;
log.warn(
`[empty-error-retry] stopReason=error output=0; resubmitting ` +
`attempt=${emptyErrorRetries}/${MAX_EMPTY_ERROR_RETRIES} ` +
`provider=${sessionLastAssistant?.provider ?? provider} ` +
`model=${sessionLastAssistant?.model ?? model.id} ` +
`sessionKey=${params.sessionKey ?? params.sessionId}`,
);
continue;
}
if (incompleteTurnText) {
const replayInvalid = resolveReplayInvalidForAttempt(incompleteTurnText);
const livenessState = resolveRunLivenessState({