Fix gateway timeout embedded fallback session lock (#74543)

* Agent: isolate gateway timeout fallback sessions

* fix(cli): isolate gateway timeout fallback sessions

---------

Co-authored-by: Peter Steinberger <steipete@gmail.com>
This commit is contained in:
Hemant Sudarshan
2026-04-30 00:08:11 +05:30
committed by GitHub
parent 89f871679e
commit fbae2a6441
6 changed files with 149 additions and 3 deletions

View File

@@ -25,6 +25,7 @@ Docs: https://docs.openclaw.ai
- Ollama: normalize provider-prefixed tool-call names at the native stream boundary so Kimi/Ollama calls such as `functions.exec` dispatch as `exec` instead of missing configured tools. Fixes #74487. Thanks @afurm and @carreipeia.
- Security/audit: resolve configured model aliases before model-tier and small-parameter checks, so alias-based GPT-5/Codex configs no longer report false weak-model warnings. Fixes #74455. Thanks @blaspat.
- CLI/agent: isolate Gateway-timeout embedded fallback runs under explicit `gateway-fallback-*` sessions so accepted Gateway runs cannot race transcript locks or replace the routed conversation session. Fixes #62981. Thanks @HemantSudarshan.
- Models/UI: hide unauthenticated providers from the default Web chat, `/models`, and model setup pickers while keeping explicit full-catalog browse paths through `view: "all"`, `/models <provider> all`, and `models list --all`. Fixes #74423. Thanks @guarismo and @SymbolStar.
- Slack/prompts: rely on Slack `interactiveReplies` guidance instead of generic `inlineButtons` config hints so enabled Slack button directives are not contradicted. Fixes #46647. Thanks @jeremykoerber.
- Slack/reactions: treat duplicate `already_reacted` responses as idempotent success so repeated agent reaction adds no longer surface as tool failures. Fixes #69005. Thanks @shipitsteven and @martingarramon.

View File

@@ -59,6 +59,7 @@ openclaw agent --agent ops --message "Run locally" --local
- `--channel`, `--reply-channel`, and `--reply-account` affect reply delivery, not session routing.
- `--json` keeps stdout reserved for the JSON response. Gateway, plugin, and embedded-fallback diagnostics are routed to stderr so scripts can parse stdout directly.
- Embedded fallback JSON includes `meta.transport: "embedded"` and `meta.fallbackFrom: "gateway"` so scripts can distinguish fallback runs from Gateway runs.
- If the Gateway accepts an agent run but the CLI times out waiting for the final reply, embedded fallback uses a fresh explicit `gateway-fallback-*` session/run id and reports `meta.fallbackReason: "gateway_timeout"` plus the fallback session fields. This avoids racing the Gateway-owned transcript lock or silently replacing the original routed conversation session.
- When this command triggers `models.json` regeneration, SecretRef-managed provider credentials are persisted as non-secret markers (for example env var names, `secretref-env:ENV_VAR_NAME`, or `secretref-managed`), not resolved secret plaintext.
- Marker writes are source-authoritative: OpenClaw persists markers from the active source config snapshot, not from resolved runtime secret values.

View File

@@ -54,7 +54,10 @@ type SessionIdMatchSet = {
storeByKey: Map<string, SessionKeyResolution>;
};
function buildExplicitSessionIdSessionKey(params: { sessionId: string; agentId?: string }): string {
/**
 * Builds the session key used for an explicitly supplied session id.
 *
 * @param params.sessionId - Session id; surrounding whitespace is trimmed.
 * @param params.agentId - Optional agent id, passed through `normalizeAgentId`.
 * @returns A key of the form `agent:<agentId>:explicit:<sessionId>`.
 */
export function buildExplicitSessionIdSessionKey(params: {
  sessionId: string;
  agentId?: string;
}): string {
  const agentSegment = normalizeAgentId(params.agentId);
  const sessionSegment = params.sessionId.trim();
  return `agent:${agentSegment}:explicit:${sessionSegment}`;
}

View File

@@ -17,6 +17,9 @@ export type { AgentStreamParams } from "./shared-types.js";
/**
 * Optional metadata overrides attached to an agent command result.
 * All fields are optional; the fallback* fields describe an embedded run
 * that replaced a Gateway run.
 */
export type AgentCommandResultMetaOverrides = {
/** Marks the run as executed by the embedded agent. */
transport?: "embedded";
/** Transport the run fell back from. */
fallbackFrom?: "gateway";
/** Why the fallback happened; only a Gateway timeout is modelled here. */
fallbackReason?: "gateway_timeout";
/** Session id used by the fallback run. */
fallbackSessionId?: string;
/** Session key used by the fallback run. */
fallbackSessionKey?: string;
};
export type AcpTurnSource = "manual_spawn";

View File

@@ -10,6 +10,15 @@ import type { agentCommand as AgentCommand } from "./agent.js";
// Mocks are created through vi.hoisted so they already exist when the
// vi.mock factories further down reference them.
const loadConfig = vi.hoisted(() => vi.fn());
const callGateway = vi.hoisted(() => vi.fn());
// Stand-in for the gateway transport-error guard: returns true only for Error
// instances named "GatewayTransportError" whose `kind` is "closed" or "timeout".
const isGatewayTransportError = vi.hoisted(() =>
vi.fn((value: unknown) => {
if (!(value instanceof Error) || value.name !== "GatewayTransportError") {
return false;
}
const kind = (value as { kind?: unknown }).kind;
return kind === "closed" || kind === "timeout";
}),
);
const agentCommand = vi.hoisted(() => vi.fn());
const runtime: RuntimeEnv = {
@@ -78,9 +87,24 @@ function mockLocalAgentReply(text = "local") {
});
}
/**
 * Test fixture: creates a fresh Error shaped like a gateway transport timeout.
 *
 * The error is named "GatewayTransportError" and carries `kind: "timeout"`,
 * a 90 second `timeoutMs`, and loopback `connectionDetails`.
 */
function createGatewayTimeoutError() {
  const timeoutError = new Error("gateway timeout after 90000ms");
  timeoutError.name = "GatewayTransportError";

  const connectionDetails = {
    url: "ws://127.0.0.1:18789",
    urlSource: "local loopback",
    message: "Gateway target: ws://127.0.0.1:18789",
  };
  const transportFields = {
    kind: "timeout",
    timeoutMs: 90_000,
    connectionDetails,
  };

  // Object.assign mutates and returns the same Error instance.
  return Object.assign(timeoutError, transportFields);
}
// Route the config, gateway-call, and agent modules to the hoisted mocks.
vi.mock("../config/config.js", () => ({ getRuntimeConfig: loadConfig, loadConfig }));
vi.mock("../gateway/call.js", () => ({
callGateway,
isGatewayTransportError,
// Fixed value keeps the idempotency key deterministic across tests.
randomIdempotencyKey: () => "idem-1",
}));
vi.mock("./agent.js", () => ({ agentCommand }));
@@ -182,6 +206,73 @@ describe("agentCliCommand", () => {
});
});
// A Gateway timeout must not reuse the caller-supplied session/run ids: the
// embedded fallback is expected to run under a new gateway-fallback-* session.
it("uses a fresh embedded session when gateway agent times out", async () => {
await withTempStore(async () => {
// The gateway call rejects with a timeout-kind transport error.
callGateway.mockRejectedValue(createGatewayTimeoutError());
mockLocalAgentReply();
await agentCliCommand(
{
message: "hi",
sessionId: "locked-session",
runId: "locked-run",
},
runtime,
);
// One gateway attempt, then exactly one embedded run.
expect(callGateway).toHaveBeenCalledTimes(1);
expect(agentCommand).toHaveBeenCalledTimes(1);
// First argument of the embedded agentCommand call carries the fallback options.
const fallbackOpts = agentCommand.mock.calls[0]?.[0] as {
sessionId?: string;
sessionKey?: string;
runId?: string;
resultMetaOverrides?: unknown;
};
expect(fallbackOpts.sessionId).toMatch(/^gateway-fallback-/);
expect(fallbackOpts.sessionId).not.toBe("locked-session");
expect(fallbackOpts.sessionKey).toBe(`agent:main:explicit:${fallbackOpts.sessionId}`);
// The fallback run id is the fallback session id, not the original "locked-run".
expect(fallbackOpts.runId).toBe(fallbackOpts.sessionId);
expect(fallbackOpts.resultMetaOverrides).toMatchObject({
transport: "embedded",
fallbackFrom: "gateway",
fallbackReason: "gateway_timeout",
fallbackSessionId: fallbackOpts.sessionId,
fallbackSessionKey: fallbackOpts.sessionKey,
});
// The fallback diagnostic is reported through runtime.error.
expect(runtime.error).toHaveBeenCalledWith(
expect.stringContaining(
"Gateway agent timed out; running embedded agent with fresh session",
),
);
// "local" is the default reply text of mockLocalAgentReply.
expect(runtime.log).toHaveBeenCalledWith("local");
});
});
// With only a `to` target (no explicit session id), the timeout fallback must
// still use an explicit gateway-fallback-* session key rather than a key
// derived from the recipient.
it("keeps timeout fallback from replacing the routed conversation session key", async () => {
await withTempStore(async () => {
callGateway.mockRejectedValue(createGatewayTimeoutError());
mockLocalAgentReply();
await agentCliCommand(
{
message: "hi",
to: "+1555",
},
runtime,
);
const fallbackOpts = agentCommand.mock.calls[0]?.[0] as {
sessionId?: string;
sessionKey?: string;
to?: string;
};
// The recipient is forwarded unchanged to the embedded run.
expect(fallbackOpts.to).toBe("+1555");
expect(fallbackOpts.sessionId).toMatch(/^gateway-fallback-/);
expect(fallbackOpts.sessionKey).toBe(`agent:main:explicit:${fallbackOpts.sessionId}`);
expect(fallbackOpts.sessionKey).not.toBe("agent:main:+1555");
});
});
it("passes fallback metadata into JSON embedded fallback output", async () => {
await withTempStore(async () => {
callGateway.mockRejectedValue(new Error("gateway not connected"));

View File

@@ -1,3 +1,4 @@
import { randomUUID } from "node:crypto";
import { resolveSendableOutboundReplyParts } from "openclaw/plugin-sdk/reply-payload";
import { listAgentIds } from "../agents/agent-scope.js";
import { formatCliCommand } from "../cli/command-format.js";
@@ -5,7 +6,7 @@ import type { CliDeps } from "../cli/deps.types.js";
import { withProgress } from "../cli/progress.js";
import { getRuntimeConfig } from "../config/config.js";
import type { OpenClawConfig } from "../config/types.openclaw.js";
import { callGateway, randomIdempotencyKey } from "../gateway/call.js";
import { callGateway, isGatewayTransportError, randomIdempotencyKey } from "../gateway/call.js";
import { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES } from "../gateway/protocol/client-info.js";
import { routeLogsToStderr } from "../logging/console.js";
import { normalizeAgentId } from "../routing/session-key.js";
@@ -13,7 +14,7 @@ import { type RuntimeEnv, writeRuntimeJson } from "../runtime.js";
import { normalizeOptionalString } from "../shared/string-coerce.js";
import { normalizeMessageChannel } from "../utils/message-channel.js";
import { agentCommand } from "./agent.js";
import { resolveSessionKeyForRequest } from "./agent/session.js";
import { buildExplicitSessionIdSessionKey, resolveSessionKeyForRequest } from "./agent/session.js";
type AgentGatewayResult = {
payloads?: Array<{
@@ -36,6 +37,7 @@ const EMBEDDED_FALLBACK_META = {
transport: "embedded",
fallbackFrom: "gateway",
} as const;
/** Prefix of the session ids generated for embedded runs that follow a Gateway timeout. */
const GATEWAY_TIMEOUT_FALLBACK_SESSION_PREFIX = "gateway-fallback-";
export type AgentCliOpts = {
message: string;
@@ -96,6 +98,28 @@ function formatPayloadForLog(payload: {
return lines.join("\n").trimEnd();
}
/**
 * Reports whether a failed Gateway agent call was a timeout.
 *
 * Transport errors are classified by their `kind`; any other Error is matched
 * on the "gateway request timeout for agent" message text.
 *
 * @param err - Value caught from the Gateway call.
 * @returns `true` when the failure is recognised as a timeout.
 */
function isGatewayAgentTimeoutError(err: unknown): boolean {
  if (!isGatewayTransportError(err)) {
    // Not a typed transport error: fall back to matching the message text.
    const isError = err instanceof Error;
    return isError && err.message.includes("gateway request timeout for agent");
  }
  return err.kind === "timeout";
}
/**
 * Generates a new session id for a Gateway-timeout fallback run.
 *
 * @returns The fallback session prefix followed by a random UUID.
 */
function createGatewayTimeoutFallbackSessionId(): string {
  const uniqueSuffix = randomUUID();
  return `${GATEWAY_TIMEOUT_FALLBACK_SESSION_PREFIX}${uniqueSuffix}`;
}
/**
 * Creates the session identity for a Gateway-timeout fallback run.
 *
 * @param agentId - Optional agent id used when building the session key.
 * @returns A newly generated fallback session id together with the explicit
 *   session key built from it.
 */
function createGatewayTimeoutFallbackSession(agentId?: string): {
  sessionId: string;
  sessionKey: string;
} {
  const sessionId = createGatewayTimeoutFallbackSessionId();
  const sessionKey = buildExplicitSessionIdSessionKey({ sessionId, agentId });
  return { sessionId, sessionKey };
}
export async function agentViaGatewayCommand(opts: AgentCliOpts, runtime: RuntimeEnv) {
protectJsonStdout(opts);
const body = (opts.message ?? "").trim();
@@ -207,6 +231,29 @@ export async function agentCliCommand(opts: AgentCliOpts, runtime: RuntimeEnv, d
try {
return await agentViaGatewayCommand(opts, runtime);
} catch (err) {
if (isGatewayAgentTimeoutError(err)) {
const fallbackSession = createGatewayTimeoutFallbackSession(opts.agent);
runtime.error?.(
`EMBEDDED FALLBACK: Gateway agent timed out; running embedded agent with fresh session ${fallbackSession.sessionId}: ${String(err)}`,
);
return await agentCommand(
{
...localOpts,
sessionId: fallbackSession.sessionId,
sessionKey: fallbackSession.sessionKey,
runId: fallbackSession.sessionId,
resultMetaOverrides: {
...EMBEDDED_FALLBACK_META,
fallbackReason: "gateway_timeout",
fallbackSessionId: fallbackSession.sessionId,
fallbackSessionKey: fallbackSession.sessionKey,
},
},
runtime,
deps,
);
}
runtime.error?.(
`EMBEDDED FALLBACK: Gateway agent failed; running embedded agent: ${String(err)}`,
);