fix(gateway): harden token fallback/reconnect behavior and docs (#42507)

* fix(gateway): harden token fallback and auth reconnect handling

* docs(gateway): clarify auth retry and token-drift recovery

* fix(gateway): tighten auth reconnect gating across clients

* fix: harden gateway token retry (#42507) (thanks @joshavant)
This commit is contained in:
Josh Avant
2026-03-10 17:05:57 -05:00
committed by GitHub
parent ff2e7a2945
commit a76e810193
21 changed files with 1188 additions and 80 deletions

View File

@@ -131,6 +131,41 @@ private let defaultOperatorConnectScopes: [String] = [
"operator.pairing",
]
private enum GatewayConnectErrorCodes {
static let authTokenMismatch = "AUTH_TOKEN_MISMATCH"
static let authDeviceTokenMismatch = "AUTH_DEVICE_TOKEN_MISMATCH"
static let authTokenMissing = "AUTH_TOKEN_MISSING"
static let authPasswordMissing = "AUTH_PASSWORD_MISSING"
static let authPasswordMismatch = "AUTH_PASSWORD_MISMATCH"
static let authRateLimited = "AUTH_RATE_LIMITED"
static let pairingRequired = "PAIRING_REQUIRED"
static let controlUiDeviceIdentityRequired = "CONTROL_UI_DEVICE_IDENTITY_REQUIRED"
static let deviceIdentityRequired = "DEVICE_IDENTITY_REQUIRED"
}
private struct GatewayConnectAuthError: LocalizedError {
let message: String
let detailCode: String?
let canRetryWithDeviceToken: Bool
var errorDescription: String? { self.message }
var isNonRecoverable: Bool {
switch self.detailCode {
case GatewayConnectErrorCodes.authTokenMissing,
GatewayConnectErrorCodes.authPasswordMissing,
GatewayConnectErrorCodes.authPasswordMismatch,
GatewayConnectErrorCodes.authRateLimited,
GatewayConnectErrorCodes.pairingRequired,
GatewayConnectErrorCodes.controlUiDeviceIdentityRequired,
GatewayConnectErrorCodes.deviceIdentityRequired:
return true
default:
return false
}
}
}
public actor GatewayChannelActor {
private let logger = Logger(subsystem: "ai.openclaw", category: "gateway")
private var task: WebSocketTaskBox?
@@ -160,6 +195,9 @@ public actor GatewayChannelActor {
private var watchdogTask: Task<Void, Never>?
private var tickTask: Task<Void, Never>?
private var keepaliveTask: Task<Void, Never>?
private var pendingDeviceTokenRetry = false
private var deviceTokenRetryBudgetUsed = false
private var reconnectPausedForAuthFailure = false
private let defaultRequestTimeoutMs: Double = 15000
private let pushHandler: (@Sendable (GatewayPush) async -> Void)?
private let connectOptions: GatewayConnectOptions?
@@ -232,10 +270,19 @@ public actor GatewayChannelActor {
while self.shouldReconnect {
guard await self.sleepUnlessCancelled(nanoseconds: 30 * 1_000_000_000) else { return } // 30s cadence
guard self.shouldReconnect else { return }
if self.reconnectPausedForAuthFailure { continue }
if self.connected { continue }
do {
try await self.connect()
} catch {
if self.shouldPauseReconnectAfterAuthFailure(error) {
self.reconnectPausedForAuthFailure = true
self.logger.error(
"gateway watchdog reconnect paused for non-recoverable auth failure " +
"\(error.localizedDescription, privacy: .public)"
)
continue
}
let wrapped = self.wrap(error, context: "gateway watchdog reconnect")
self.logger.error("gateway watchdog reconnect failed \(wrapped.localizedDescription, privacy: .public)")
}
@@ -267,7 +314,12 @@ public actor GatewayChannelActor {
},
operation: { try await self.sendConnect() })
} catch {
let wrapped = self.wrap(error, context: "connect to gateway @ \(self.url.absoluteString)")
let wrapped: Error
if let authError = error as? GatewayConnectAuthError {
wrapped = authError
} else {
wrapped = self.wrap(error, context: "connect to gateway @ \(self.url.absoluteString)")
}
self.connected = false
self.task?.cancel(with: .goingAway, reason: nil)
await self.disconnectHandler?("connect failed: \(wrapped.localizedDescription)")
@@ -281,6 +333,7 @@ public actor GatewayChannelActor {
}
self.listen()
self.connected = true
self.reconnectPausedForAuthFailure = false
self.backoffMs = 500
self.lastSeq = nil
self.startKeepalive()
@@ -371,11 +424,18 @@ public actor GatewayChannelActor {
(includeDeviceIdentity && identity != nil)
? DeviceAuthStore.loadToken(deviceId: identity!.deviceId, role: role)?.token
: nil
// If we're not sending a device identity, a device token can't be validated server-side.
// In that mode we always use the shared gateway token/password.
let authToken = includeDeviceIdentity ? (storedToken ?? self.token) : self.token
let shouldUseDeviceRetryToken =
includeDeviceIdentity && self.pendingDeviceTokenRetry &&
storedToken != nil && self.token != nil && self.isTrustedDeviceRetryEndpoint()
if shouldUseDeviceRetryToken {
self.pendingDeviceTokenRetry = false
}
// Keep shared credentials explicit when provided. Device token retry is attached
// only on a bounded second attempt after token mismatch.
let authToken = self.token ?? (includeDeviceIdentity ? storedToken : nil)
let authDeviceToken = shouldUseDeviceRetryToken ? storedToken : nil
let authSource: GatewayAuthSource
if storedToken != nil {
if authDeviceToken != nil || (self.token == nil && storedToken != nil) {
authSource = .deviceToken
} else if authToken != nil {
authSource = .sharedToken
@@ -386,9 +446,12 @@ public actor GatewayChannelActor {
}
self.lastAuthSource = authSource
self.logger.info("gateway connect auth=\(authSource.rawValue, privacy: .public)")
let canFallbackToShared = includeDeviceIdentity && storedToken != nil && self.token != nil
if let authToken {
params["auth"] = ProtoAnyCodable(["token": ProtoAnyCodable(authToken)])
var auth: [String: ProtoAnyCodable] = ["token": ProtoAnyCodable(authToken)]
if let authDeviceToken {
auth["deviceToken"] = ProtoAnyCodable(authDeviceToken)
}
params["auth"] = ProtoAnyCodable(auth)
} else if let password = self.password {
params["auth"] = ProtoAnyCodable(["password": ProtoAnyCodable(password)])
}
@@ -426,11 +489,24 @@ public actor GatewayChannelActor {
do {
let response = try await self.waitForConnectResponse(reqId: reqId)
try await self.handleConnectResponse(response, identity: identity, role: role)
self.pendingDeviceTokenRetry = false
self.deviceTokenRetryBudgetUsed = false
} catch {
if canFallbackToShared {
if let identity {
DeviceAuthStore.clearToken(deviceId: identity.deviceId, role: role)
}
let shouldRetryWithDeviceToken = self.shouldRetryWithStoredDeviceToken(
error: error,
explicitGatewayToken: self.token,
storedToken: storedToken,
attemptedDeviceTokenRetry: authDeviceToken != nil)
if shouldRetryWithDeviceToken {
self.pendingDeviceTokenRetry = true
self.deviceTokenRetryBudgetUsed = true
self.backoffMs = min(self.backoffMs, 250)
} else if authDeviceToken != nil,
let identity,
self.shouldClearStoredDeviceTokenAfterRetry(error)
{
// Retry failed with an explicit device-token mismatch; clear stale local token.
DeviceAuthStore.clearToken(deviceId: identity.deviceId, role: role)
}
throw error
}
@@ -443,7 +519,13 @@ public actor GatewayChannelActor {
) async throws {
if res.ok == false {
let msg = (res.error?["message"]?.value as? String) ?? "gateway connect failed"
throw NSError(domain: "Gateway", code: 1008, userInfo: [NSLocalizedDescriptionKey: msg])
let details = res.error?["details"]?.value as? [String: ProtoAnyCodable]
let detailCode = details?["code"]?.value as? String
let canRetryWithDeviceToken = details?["canRetryWithDeviceToken"]?.value as? Bool ?? false
throw GatewayConnectAuthError(
message: msg,
detailCode: detailCode,
canRetryWithDeviceToken: canRetryWithDeviceToken)
}
guard let payload = res.payload else {
throw NSError(
@@ -616,19 +698,91 @@ public actor GatewayChannelActor {
private func scheduleReconnect() async {
guard self.shouldReconnect else { return }
guard !self.reconnectPausedForAuthFailure else { return }
let delay = self.backoffMs / 1000
self.backoffMs = min(self.backoffMs * 2, 30000)
guard await self.sleepUnlessCancelled(nanoseconds: UInt64(delay * 1_000_000_000)) else { return }
guard self.shouldReconnect else { return }
guard !self.reconnectPausedForAuthFailure else { return }
do {
try await self.connect()
} catch {
if self.shouldPauseReconnectAfterAuthFailure(error) {
self.reconnectPausedForAuthFailure = true
self.logger.error(
"gateway reconnect paused for non-recoverable auth failure " +
"\(error.localizedDescription, privacy: .public)"
)
return
}
let wrapped = self.wrap(error, context: "gateway reconnect")
self.logger.error("gateway reconnect failed \(wrapped.localizedDescription, privacy: .public)")
await self.scheduleReconnect()
}
}
private func shouldRetryWithStoredDeviceToken(
error: Error,
explicitGatewayToken: String?,
storedToken: String?,
attemptedDeviceTokenRetry: Bool
) -> Bool {
if self.deviceTokenRetryBudgetUsed {
return false
}
if attemptedDeviceTokenRetry {
return false
}
guard explicitGatewayToken != nil, storedToken != nil else {
return false
}
guard self.isTrustedDeviceRetryEndpoint() else {
return false
}
guard let authError = error as? GatewayConnectAuthError else {
return false
}
return authError.canRetryWithDeviceToken ||
authError.detailCode == GatewayConnectErrorCodes.authTokenMismatch
}
private func shouldPauseReconnectAfterAuthFailure(_ error: Error) -> Bool {
guard let authError = error as? GatewayConnectAuthError else {
return false
}
if authError.isNonRecoverable {
return true
}
if authError.detailCode == GatewayConnectErrorCodes.authTokenMismatch &&
self.deviceTokenRetryBudgetUsed && !self.pendingDeviceTokenRetry
{
return true
}
return false
}
private func shouldClearStoredDeviceTokenAfterRetry(_ error: Error) -> Bool {
guard let authError = error as? GatewayConnectAuthError else {
return false
}
return authError.detailCode == GatewayConnectErrorCodes.authDeviceTokenMismatch
}
private func isTrustedDeviceRetryEndpoint() -> Bool {
// This client currently treats loopback as the only trusted retry target.
// Unlike the Node gateway client, it does not yet expose a pinned TLS-fingerprint
// trust path for remote retry, so remote fallback remains disabled by default.
guard let host = self.url.host?.trimmingCharacters(in: .whitespacesAndNewlines).lowercased(),
!host.isEmpty
else {
return false
}
if host == "localhost" || host == "::1" || host == "127.0.0.1" || host.hasPrefix("127.") {
return true
}
return false
}
private nonisolated func sleepUnlessCancelled(nanoseconds: UInt64) async -> Bool {
do {
try await Task.sleep(nanoseconds: nanoseconds)
@@ -756,7 +910,8 @@ public actor GatewayChannelActor {
return (id: id, data: data)
} catch {
self.logger.error(
"gateway \(kind) encode failed \(method, privacy: .public) error=\(error.localizedDescription, privacy: .public)")
"gateway \(kind) encode failed \(method, privacy: .public) " +
"error=\(error.localizedDescription, privacy: .public)")
throw error
}
}