fix(acp): classify gateway chat error kinds

2026-05-28 03:00:34 +00:00 · 2026-04-10 08:16:50 +01:00
parent 8763614d1e
commit c2e2b87f28
8 changed files with 274 additions and 5 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -88,6 +88,7 @@ Docs: https://docs.openclaw.ai
 - Agents/failover: allow cooldown probes for `timeout` (including network outage classifications) so the primary model can recover after failover without a gateway restart. (#63996) Thanks @neeravmakwana.
 - iMessage (imsg): strip an accidental protobuf length-delimited UTF-8 field wrapper from inbound `text` and `reply_to_text` when it fully consumes the field, fixing leading garbage before the real message. (#63868) Thanks @neeravmakwana.
 - Gateway/pairing: fail closed for paired device records that have no device tokens, and reject pairing approvals whose requested scopes do not match the requested device roles.
+- ACP/gateway chat: classify lifecycle errors before forwarding them to ACP clients so refusals use ACP's refusal stop reason while transient backend errors continue to finish as normal turns.

 ## 2026.4.9

--- a/src/acp/translator.error-kind.test.ts
+++ b/src/acp/translator.error-kind.test.ts
@@ -0,0 +1,138 @@
+import type { PromptRequest } from "@agentclientprotocol/sdk";
+import { describe, expect, it, vi } from "vitest";
+import type { GatewayClient } from "../gateway/client.js";
+import type { EventFrame } from "../gateway/protocol/index.js";
+import { createInMemorySessionStore } from "./session.js";
+import { AcpGatewayAgent } from "./translator.js";
+import { createAcpConnection, createAcpGateway } from "./translator.test-helpers.js";
+
+type PendingPromptHarness = {
+  agent: AcpGatewayAgent;
+  promptPromise: ReturnType<AcpGatewayAgent["prompt"]>;
+  runId: string;
+};
+
+const DEFAULT_SESSION_ID = "session-1";
+const DEFAULT_SESSION_KEY = "agent:main:main";
+const DEFAULT_PROMPT_TEXT = "hello";
+
+function createSessionAgentHarness(
+  request: GatewayClient["request"],
+  options: { sessionId?: string; sessionKey?: string; cwd?: string } = {},
+) {
+  const sessionId = options.sessionId ?? DEFAULT_SESSION_ID;
+  const sessionKey = options.sessionKey ?? DEFAULT_SESSION_KEY;
+  const sessionStore = createInMemorySessionStore();
+  sessionStore.createSession({
+    sessionId,
+    sessionKey,
+    cwd: options.cwd ?? "/tmp",
+  });
+  const agent = new AcpGatewayAgent(createAcpConnection(), createAcpGateway(request), {
+    sessionStore,
+  });
+
+  return {
+    agent,
+    sessionId,
+    sessionKey,
+    sessionStore,
+  };
+}
+
+function promptAgent(
+  agent: AcpGatewayAgent,
+  sessionId = DEFAULT_SESSION_ID,
+  text = DEFAULT_PROMPT_TEXT,
+) {
+  return agent.prompt({
+    sessionId,
+    prompt: [{ type: "text", text }],
+    _meta: {},
+  } as unknown as PromptRequest);
+}
+
+async function createPendingPromptHarness(): Promise<PendingPromptHarness> {
+  let runId: string | undefined;
+  const request = vi.fn(async (method: string, params?: Record<string, unknown>) => {
+    if (method === "chat.send") {
+      runId = params?.idempotencyKey as string | undefined;
+      return new Promise<never>(() => {});
+    }
+    return {};
+  }) as GatewayClient["request"];
+
+  const { agent, sessionId } = createSessionAgentHarness(request);
+  const promptPromise = promptAgent(agent, sessionId);
+
+  await vi.waitFor(() => {
+    expect(runId).toBeDefined();
+  });
+
+  return {
+    agent,
+    promptPromise,
+    runId: runId!,
+  };
+}
+
+function createChatEvent(payload: Record<string, unknown>): EventFrame {
+  return {
+    type: "event",
+    event: "chat",
+    payload,
+  } as EventFrame;
+}
+
+describe("acp translator errorKind mapping", () => {
+  it("maps errorKind: refusal to stopReason: refusal", async () => {
+    const { agent, promptPromise, runId } = await createPendingPromptHarness();
+
+    await agent.handleGatewayEvent(
+      createChatEvent({
+        runId,
+        sessionKey: DEFAULT_SESSION_KEY,
+        seq: 1,
+        state: "error",
+        errorKind: "refusal",
+        errorMessage: "I cannot fulfill this request.",
+      }),
+    );
+
+    await expect(promptPromise).resolves.toEqual({ stopReason: "refusal" });
+  });
+
+  it("maps errorKind: timeout to stopReason: end_turn", async () => {
+    const { agent, promptPromise, runId } = await createPendingPromptHarness();
+
+    await agent.handleGatewayEvent(
+      createChatEvent({
+        runId,
+        sessionKey: DEFAULT_SESSION_KEY,
+        seq: 1,
+        state: "error",
+        errorKind: "timeout",
+        errorMessage: "gateway timeout",
+      }),
+    );
+
+    await expect(promptPromise).resolves.toEqual({ stopReason: "end_turn" });
+  });
+
+  it("maps unknown errorKind to stopReason: end_turn", async () => {
+    const { agent, promptPromise, runId } = await createPendingPromptHarness();
+
+    await agent.handleGatewayEvent(
+      createChatEvent({
+        runId,
+        sessionKey: DEFAULT_SESSION_KEY,
+        seq: 1,
+        state: "error",
+        errorKind: "unknown",
+        errorMessage: "something went wrong",
+      }),
+    );
+
+    await expect(promptPromise).resolves.toEqual({ stopReason: "end_turn" });
+  });
+});
--- a/src/acp/translator.ts
+++ b/src/acp/translator.ts
@@ -956,11 +956,9 @@ export class AcpGatewayAgent implements Agent {
      return;
    }
    if (state === "error") {
-      // ACP has no explicit "server_error" stop reason.  Use "end_turn" so clients
-      // do not treat transient backend errors (timeouts, rate-limits) as deliberate
-      // refusals.  TODO: when ChatEventSchema gains a structured errorKind field
-      // (e.g. "refusal" | "timeout" | "rate_limit"), use it to distinguish here.
-      void this.finishPrompt(pending.sessionId, pending, "end_turn");
+      const errorKind = payload.errorKind as string | undefined;
+      const stopReason: StopReason = errorKind === "refusal" ? "refusal" : "end_turn";
+      void this.finishPrompt(pending.sessionId, pending, stopReason);
    }
  }

--- a/src/gateway/protocol/schema/logs-chat.ts
+++ b/src/gateway/protocol/schema/logs-chat.ts
@@ -81,6 +81,15 @@ export const ChatEventSchema = Type.Object(
    ]),
    message: Type.Optional(Type.Unknown()),
    errorMessage: Type.Optional(Type.String()),
+    errorKind: Type.Optional(
+      Type.Union([
+        Type.Literal("refusal"),
+        Type.Literal("timeout"),
+        Type.Literal("rate_limit"),
+        Type.Literal("context_length"),
+        Type.Literal("unknown"),
+      ]),
+    ),
    usage: Type.Optional(Type.Unknown()),
    stopReason: Type.Optional(Type.String()),
  },
--- a/src/gateway/server-chat.agent-events.test.ts
+++ b/src/gateway/server-chat.agent-events.test.ts
@@ -1258,6 +1258,39 @@ describe("agent event handler", () => {
    expect(agentRunSeq.has("run-terminal-error")).toBe(false);
  });

+  it("adds detected errorKind to chat lifecycle error payloads", () => {
+    const { broadcast, nodeSendToSession, handler } = createHarness({
+      resolveSessionKeyForRun: () => "session-detected-error",
+      lifecycleErrorRetryGraceMs: 0,
+    });
+    registerAgentRunContext("run-detected-error", { sessionKey: "session-detected-error" });
+
+    handler({
+      runId: "run-detected-error",
+      seq: 1,
+      stream: "lifecycle",
+      ts: Date.now(),
+      data: {
+        phase: "error",
+        error: Object.assign(new Error("Too many requests"), { code: 429 }),
+      },
+    });
+
+    const payload = chatBroadcastCalls(broadcast).at(-1)?.[1] as {
+      state?: string;
+      errorKind?: string;
+      errorMessage?: string;
+    };
+    expect(payload.state).toBe("error");
+    expect(payload.errorKind).toBe("rate_limit");
+    expect(payload.errorMessage).toContain("Too many requests");
+
+    const nodePayload = sessionChatCalls(nodeSendToSession).at(-1)?.[2] as {
+      errorKind?: string;
+    };
+    expect(nodePayload.errorKind).toBe("rate_limit");
+  });
+
  it("suppresses delayed lifecycle chat errors for active chat.send runs while still cleaning up", () => {
    vi.useFakeTimers();
    const { broadcast, clearAgentRunContext, agentRunSeq, handler } = createHarness({
--- a/src/gateway/server-chat.ts
+++ b/src/gateway/server-chat.ts
@@ -7,6 +7,7 @@ import {
 } from "../auto-reply/tokens.js";
 import { loadConfig } from "../config/config.js";
 import { type AgentEventPayload, getAgentRunContext } from "../infra/agent-events.js";
+import { detectErrorKind, type ErrorKind } from "../infra/errors.js";
 import { resolveHeartbeatVisibility } from "../infra/heartbeat-visibility.js";
 import { stripInlineDirectiveTagsForDisplay } from "../utils/directive-tags.js";
 import {
@@ -437,6 +438,20 @@ export type ChatEventBroadcast = (

 export type NodeSendToSession = (sessionKey: string, event: string, payload: unknown) => void;

+const CHAT_ERROR_KINDS = new Set<ErrorKind>([
+  "refusal",
+  "timeout",
+  "rate_limit",
+  "context_length",
+  "unknown",
+]);
+
+function readChatErrorKind(value: unknown): ErrorKind | undefined {
+  return typeof value === "string" && CHAT_ERROR_KINDS.has(value as ErrorKind)
+    ? (value as ErrorKind)
+    : undefined;
+}
+
 export type AgentEventHandlerOptions = {
  broadcast: ChatEventBroadcast;
  broadcastToConnIds: (
@@ -583,6 +598,8 @@ export function createAgentEventHandler({
      if (!isAborted) {
        const evtStopReason =
          typeof evt.data?.stopReason === "string" ? evt.data.stopReason : undefined;
+        const evtErrorKind =
+          readChatErrorKind(evt.data?.errorKind) ?? detectErrorKind(evt.data?.error);
        if (chatLink) {
          const finished = chatRunState.registry.shift(evt.runId);
          if (!finished) {
@@ -598,6 +615,7 @@ export function createAgentEventHandler({
              lifecyclePhase === "error" ? "error" : "done",
              evt.data?.error,
              evtStopReason,
+              evtErrorKind,
            );
          }
        } else if (!(opts?.skipChatErrorFinal && lifecyclePhase === "error")) {
@@ -609,6 +627,7 @@ export function createAgentEventHandler({
            lifecyclePhase === "error" ? "error" : "done",
            evt.data?.error,
            evtStopReason,
+            evtErrorKind,
          );
        }
      } else {
@@ -791,6 +810,7 @@ export function createAgentEventHandler({
    jobState: "done" | "error",
    error?: unknown,
    stopReason?: string,
+    errorKind?: ErrorKind,
  ) => {
    const { text, shouldSuppressSilent } = resolveBufferedChatTextState(clientRunId, sourceRunId);
    // Flush any throttled delta so streaming clients receive the complete text
@@ -828,6 +848,7 @@ export function createAgentEventHandler({
      seq,
      state: "error" as const,
      errorMessage: error ? formatForLog(error) : undefined,
+      ...(errorKind && { errorKind }),
    };
    broadcast("chat", payload);
    nodeSendToSession(sessionKey, "chat", payload);
--- a/src/infra/errors.test.ts
+++ b/src/infra/errors.test.ts
@@ -1,6 +1,7 @@
 import { describe, expect, it } from "vitest";
 import {
  collectErrorGraphCandidates,
+  detectErrorKind,
  extractErrorCode,
  formatErrorMessage,
  formatUncaughtError,
@@ -94,6 +95,35 @@ describe("error helpers", () => {
    expect(formatted).not.toContain(token);
  });

+  it.each([
+    {
+      value: new Error("Unhandled stop reason: refusal_policy"),
+      expected: "refusal",
+    },
+    {
+      value: Object.assign(new Error("request timed out"), { code: "ETIMEDOUT" }),
+      expected: "timeout",
+    },
+    {
+      value: Object.assign(new Error("Too many requests"), { code: 429 }),
+      expected: "rate_limit",
+    },
+    {
+      value: new Error("context_window exceeded with too many tokens"),
+      expected: "context_length",
+    },
+    {
+      value: new Error("plain provider failure"),
+      expected: undefined,
+    },
+    {
+      value: undefined,
+      expected: undefined,
+    },
+  ] as const)("detects error kind for case %#", ({ value, expected }) => {
+    expect(detectErrorKind(value)).toBe(expected);
+  });
+
  it("uses message-only formatting for INVALID_CONFIG and stack formatting otherwise", () => {
    const invalidConfig = Object.assign(new Error("TOKEN=sk-abcdefghijklmnopqrstuv"), {
      code: "INVALID_CONFIG",
--- a/src/infra/errors.ts
+++ b/src/infra/errors.ts
@@ -111,3 +111,42 @@ export function formatUncaughtError(err: unknown): string {
  }
  return formatErrorMessage(err);
 }
+
+export type ErrorKind = "refusal" | "timeout" | "rate_limit" | "context_length" | "unknown";
+
+export function detectErrorKind(err: unknown): ErrorKind | undefined {
+  if (err === undefined) {
+    return undefined;
+  }
+  const message = formatErrorMessage(err).toLowerCase();
+  const code = extractErrorCode(err)?.toLowerCase();
+
+  if (
+    message.includes("refusal") ||
+    message.includes("content_filter") ||
+    message.includes("sensitive") ||
+    message.includes("unhandled stop reason: refusal_policy")
+  ) {
+    return "refusal";
+  }
+  if (message.includes("timeout") || code === "etimedout" || code === "timeout") {
+    return "timeout";
+  }
+  if (
+    message.includes("rate limit") ||
+    message.includes("too many requests") ||
+    message.includes("429") ||
+    code === "429"
+  ) {
+    return "rate_limit";
+  }
+  if (
+    message.includes("context length") ||
+    message.includes("too many tokens") ||
+    message.includes("token limit") ||
+    message.includes("context_window")
+  ) {
+    return "context_length";
+  }
+  return undefined;
+}