fix(gateway): make startup control-plane retries explicit

* fix(gateway): make startup control-plane retries explicit
* docs(changelog): note startup control-plane retry fix
2026-05-06 06:50:43 +00:00 · 2026-05-02 03:16:04 -07:00
parent ebc26a0bef
commit ccb847e46f
4 changed files with 47 additions and 27 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -231,6 +231,7 @@ Docs: https://docs.openclaw.ai
 - Diagnostics: reset stuck-session timers on reply, tool, status, block, and ACP progress events, and back off repeated `session.stuck` diagnostics while a session remains unchanged. Supersedes #72010. Thanks @rubencu.
 - Gateway/agents: avoid rebuilding core tools for plugin-only allowlists and keep the full plugin registry cache warm across scoped plugin loads, reducing per-turn latency spikes. Fixes #75882, #75907, #75906, #75887, and #75851. (#75922) Thanks @obviyus.
 - Agents/failover: classify bare `status: internal server error` provider messages as retryable server errors so model fallback can rotate instead of stopping. (#73844) Thanks @thesomewhatyou.
+- Gateway/startup: return the shared retryable `startup-sidecars` error for startup-gated control-plane RPCs such as `sessions.create`, `sessions.send`, `sessions.abort`, `agent.wait`, and `tools.effective`, so clients can retry early sidecar races. (#76012) Thanks @scoootscooob.

 ## 2026.4.30

--- a/src/gateway/server-methods.control-plane-rate-limit.test.ts
+++ b/src/gateway/server-methods.control-plane-rate-limit.test.ts
@@ -3,8 +3,10 @@ import {
  __testing as controlPlaneRateLimitTesting,
  resolveControlPlaneRateLimitKey,
 } from "./control-plane-rate-limit.js";
+import { isRetryableGatewayStartupUnavailableError } from "./protocol/startup-unavailable.js";
 import { handleGatewayRequest } from "./server-methods.js";
 import type { GatewayRequestHandler } from "./server-methods/types.js";
+import { STARTUP_UNAVAILABLE_GATEWAY_METHODS } from "./server-startup-unavailable-methods.js";

 const noWebchat = () => false;

@@ -131,32 +133,37 @@ describe("gateway control-plane write rate limit", () => {
    expect(handlerCalls).toHaveBeenCalledTimes(4);
  });

-  it("blocks startup-gated methods before dispatch", async () => {
-    const handlerCalls = vi.fn();
-    const handler: GatewayRequestHandler = (opts) => {
-      handlerCalls(opts);
-      opts.respond(true, undefined, undefined);
-    };
-    const context = {
-      ...buildContext(),
-      unavailableGatewayMethods: new Set(["chat.history", "models.list"]),
-    } as Parameters<typeof handleGatewayRequest>[0]["context"];
-    const client = buildClient();
+  it.each(STARTUP_UNAVAILABLE_GATEWAY_METHODS)(
+    "blocks startup-gated method %s before dispatch with a retryable startup error",
+    async (method) => {
+      const handlerCalls = vi.fn();
+      const handler: GatewayRequestHandler = (opts) => {
+        handlerCalls(opts);
+        opts.respond(true, undefined, undefined);
+      };
+      const context = {
+        ...buildContext(),
+        unavailableGatewayMethods: new Set(STARTUP_UNAVAILABLE_GATEWAY_METHODS),
+      } as Parameters<typeof handleGatewayRequest>[0]["context"];
+      const client = buildClient();

-    const blocked = await runRequest({ method: "models.list", context, client, handler });
+      const blocked = await runRequest({ method, context, client, handler });

-    expect(handlerCalls).not.toHaveBeenCalled();
-    expect(blocked).toHaveBeenCalledWith(
-      false,
-      undefined,
-      expect.objectContaining({
-        code: "UNAVAILABLE",
-        retryable: true,
-        retryAfterMs: 500,
-        details: { method: "models.list" },
-      }),
-    );
-  });
+      expect(handlerCalls).not.toHaveBeenCalled();
+      expect(blocked).toHaveBeenCalledWith(
+        false,
+        undefined,
+        expect.objectContaining({
+          code: "UNAVAILABLE",
+          retryable: true,
+          retryAfterMs: 500,
+          details: { reason: "startup-sidecars", method },
+        }),
+      );
+      const error = blocked.mock.calls[0]?.[2];
+      expect(isRetryableGatewayStartupUnavailableError(error)).toBe(true);
+    },
+  );

  it("uses connId fallback when both device and client IP are unknown", () => {
    const key = resolveControlPlaneRateLimitKey({
--- a/src/gateway/server-methods.ts
+++ b/src/gateway/server-methods.ts
@@ -3,6 +3,10 @@ import { formatControlPlaneActor, resolveControlPlaneActor } from "./control-pla
 import { consumeControlPlaneWriteBudget } from "./control-plane-rate-limit.js";
 import { ADMIN_SCOPE, authorizeOperatorScopesForMethod } from "./method-scopes.js";
 import { ErrorCodes, errorShape } from "./protocol/index.js";
+import {
+  gatewayStartupUnavailableDetails,
+  GATEWAY_STARTUP_RETRY_AFTER_MS,
+} from "./protocol/startup-unavailable.js";
 import { isRoleAuthorizedForMethod, parseGatewayRole } from "./role-policy.js";
 import { agentHandlers } from "./server-methods/agent.js";
 import { agentsHandlers } from "./server-methods/agents.js";
@@ -128,8 +132,8 @@ export async function handleGatewayRequest(
      undefined,
      errorShape(ErrorCodes.UNAVAILABLE, `${req.method} unavailable during gateway startup`, {
        retryable: true,
-        retryAfterMs: 500,
-        details: { method: req.method },
+        retryAfterMs: GATEWAY_STARTUP_RETRY_AFTER_MS,
+        details: { ...gatewayStartupUnavailableDetails(), method: req.method },
      }),
    );
    return;
--- a/src/gateway/server-startup-unavailable-methods.ts
+++ b/src/gateway/server-startup-unavailable-methods.ts
@@ -1 +1,9 @@
-export const STARTUP_UNAVAILABLE_GATEWAY_METHODS = ["chat.history", "models.list"] as const;
+export const STARTUP_UNAVAILABLE_GATEWAY_METHODS = [
+  "agent.wait",
+  "chat.history",
+  "models.list",
+  "sessions.abort",
+  "sessions.create",
+  "sessions.send",
+  "tools.effective",
+] as const;