mirror of
https://github.com/openclaw/openclaw.git
synced 2026-03-12 07:20:45 +00:00
fix(gateway): harden token fallback/reconnect behavior and docs (#42507)
* fix(gateway): harden token fallback and auth reconnect handling * docs(gateway): clarify auth retry and token-drift recovery * fix(gateway): tighten auth reconnect gating across clients * fix: harden gateway token retry (#42507) (thanks @joshavant)
This commit is contained in:
@@ -131,6 +131,41 @@ private let defaultOperatorConnectScopes: [String] = [
|
||||
"operator.pairing",
|
||||
]
|
||||
|
||||
private enum GatewayConnectErrorCodes {
|
||||
static let authTokenMismatch = "AUTH_TOKEN_MISMATCH"
|
||||
static let authDeviceTokenMismatch = "AUTH_DEVICE_TOKEN_MISMATCH"
|
||||
static let authTokenMissing = "AUTH_TOKEN_MISSING"
|
||||
static let authPasswordMissing = "AUTH_PASSWORD_MISSING"
|
||||
static let authPasswordMismatch = "AUTH_PASSWORD_MISMATCH"
|
||||
static let authRateLimited = "AUTH_RATE_LIMITED"
|
||||
static let pairingRequired = "PAIRING_REQUIRED"
|
||||
static let controlUiDeviceIdentityRequired = "CONTROL_UI_DEVICE_IDENTITY_REQUIRED"
|
||||
static let deviceIdentityRequired = "DEVICE_IDENTITY_REQUIRED"
|
||||
}
|
||||
|
||||
private struct GatewayConnectAuthError: LocalizedError {
|
||||
let message: String
|
||||
let detailCode: String?
|
||||
let canRetryWithDeviceToken: Bool
|
||||
|
||||
var errorDescription: String? { self.message }
|
||||
|
||||
var isNonRecoverable: Bool {
|
||||
switch self.detailCode {
|
||||
case GatewayConnectErrorCodes.authTokenMissing,
|
||||
GatewayConnectErrorCodes.authPasswordMissing,
|
||||
GatewayConnectErrorCodes.authPasswordMismatch,
|
||||
GatewayConnectErrorCodes.authRateLimited,
|
||||
GatewayConnectErrorCodes.pairingRequired,
|
||||
GatewayConnectErrorCodes.controlUiDeviceIdentityRequired,
|
||||
GatewayConnectErrorCodes.deviceIdentityRequired:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public actor GatewayChannelActor {
|
||||
private let logger = Logger(subsystem: "ai.openclaw", category: "gateway")
|
||||
private var task: WebSocketTaskBox?
|
||||
@@ -160,6 +195,9 @@ public actor GatewayChannelActor {
|
||||
private var watchdogTask: Task<Void, Never>?
|
||||
private var tickTask: Task<Void, Never>?
|
||||
private var keepaliveTask: Task<Void, Never>?
|
||||
private var pendingDeviceTokenRetry = false
|
||||
private var deviceTokenRetryBudgetUsed = false
|
||||
private var reconnectPausedForAuthFailure = false
|
||||
private let defaultRequestTimeoutMs: Double = 15000
|
||||
private let pushHandler: (@Sendable (GatewayPush) async -> Void)?
|
||||
private let connectOptions: GatewayConnectOptions?
|
||||
@@ -232,10 +270,19 @@ public actor GatewayChannelActor {
|
||||
while self.shouldReconnect {
|
||||
guard await self.sleepUnlessCancelled(nanoseconds: 30 * 1_000_000_000) else { return } // 30s cadence
|
||||
guard self.shouldReconnect else { return }
|
||||
if self.reconnectPausedForAuthFailure { continue }
|
||||
if self.connected { continue }
|
||||
do {
|
||||
try await self.connect()
|
||||
} catch {
|
||||
if self.shouldPauseReconnectAfterAuthFailure(error) {
|
||||
self.reconnectPausedForAuthFailure = true
|
||||
self.logger.error(
|
||||
"gateway watchdog reconnect paused for non-recoverable auth failure " +
|
||||
"\(error.localizedDescription, privacy: .public)"
|
||||
)
|
||||
continue
|
||||
}
|
||||
let wrapped = self.wrap(error, context: "gateway watchdog reconnect")
|
||||
self.logger.error("gateway watchdog reconnect failed \(wrapped.localizedDescription, privacy: .public)")
|
||||
}
|
||||
@@ -267,7 +314,12 @@ public actor GatewayChannelActor {
|
||||
},
|
||||
operation: { try await self.sendConnect() })
|
||||
} catch {
|
||||
let wrapped = self.wrap(error, context: "connect to gateway @ \(self.url.absoluteString)")
|
||||
let wrapped: Error
|
||||
if let authError = error as? GatewayConnectAuthError {
|
||||
wrapped = authError
|
||||
} else {
|
||||
wrapped = self.wrap(error, context: "connect to gateway @ \(self.url.absoluteString)")
|
||||
}
|
||||
self.connected = false
|
||||
self.task?.cancel(with: .goingAway, reason: nil)
|
||||
await self.disconnectHandler?("connect failed: \(wrapped.localizedDescription)")
|
||||
@@ -281,6 +333,7 @@ public actor GatewayChannelActor {
|
||||
}
|
||||
self.listen()
|
||||
self.connected = true
|
||||
self.reconnectPausedForAuthFailure = false
|
||||
self.backoffMs = 500
|
||||
self.lastSeq = nil
|
||||
self.startKeepalive()
|
||||
@@ -371,11 +424,18 @@ public actor GatewayChannelActor {
|
||||
(includeDeviceIdentity && identity != nil)
|
||||
? DeviceAuthStore.loadToken(deviceId: identity!.deviceId, role: role)?.token
|
||||
: nil
|
||||
// If we're not sending a device identity, a device token can't be validated server-side.
|
||||
// In that mode we always use the shared gateway token/password.
|
||||
let authToken = includeDeviceIdentity ? (storedToken ?? self.token) : self.token
|
||||
let shouldUseDeviceRetryToken =
|
||||
includeDeviceIdentity && self.pendingDeviceTokenRetry &&
|
||||
storedToken != nil && self.token != nil && self.isTrustedDeviceRetryEndpoint()
|
||||
if shouldUseDeviceRetryToken {
|
||||
self.pendingDeviceTokenRetry = false
|
||||
}
|
||||
// Keep shared credentials explicit when provided. Device token retry is attached
|
||||
// only on a bounded second attempt after token mismatch.
|
||||
let authToken = self.token ?? (includeDeviceIdentity ? storedToken : nil)
|
||||
let authDeviceToken = shouldUseDeviceRetryToken ? storedToken : nil
|
||||
let authSource: GatewayAuthSource
|
||||
if storedToken != nil {
|
||||
if authDeviceToken != nil || (self.token == nil && storedToken != nil) {
|
||||
authSource = .deviceToken
|
||||
} else if authToken != nil {
|
||||
authSource = .sharedToken
|
||||
@@ -386,9 +446,12 @@ public actor GatewayChannelActor {
|
||||
}
|
||||
self.lastAuthSource = authSource
|
||||
self.logger.info("gateway connect auth=\(authSource.rawValue, privacy: .public)")
|
||||
let canFallbackToShared = includeDeviceIdentity && storedToken != nil && self.token != nil
|
||||
if let authToken {
|
||||
params["auth"] = ProtoAnyCodable(["token": ProtoAnyCodable(authToken)])
|
||||
var auth: [String: ProtoAnyCodable] = ["token": ProtoAnyCodable(authToken)]
|
||||
if let authDeviceToken {
|
||||
auth["deviceToken"] = ProtoAnyCodable(authDeviceToken)
|
||||
}
|
||||
params["auth"] = ProtoAnyCodable(auth)
|
||||
} else if let password = self.password {
|
||||
params["auth"] = ProtoAnyCodable(["password": ProtoAnyCodable(password)])
|
||||
}
|
||||
@@ -426,11 +489,24 @@ public actor GatewayChannelActor {
|
||||
do {
|
||||
let response = try await self.waitForConnectResponse(reqId: reqId)
|
||||
try await self.handleConnectResponse(response, identity: identity, role: role)
|
||||
self.pendingDeviceTokenRetry = false
|
||||
self.deviceTokenRetryBudgetUsed = false
|
||||
} catch {
|
||||
if canFallbackToShared {
|
||||
if let identity {
|
||||
DeviceAuthStore.clearToken(deviceId: identity.deviceId, role: role)
|
||||
}
|
||||
let shouldRetryWithDeviceToken = self.shouldRetryWithStoredDeviceToken(
|
||||
error: error,
|
||||
explicitGatewayToken: self.token,
|
||||
storedToken: storedToken,
|
||||
attemptedDeviceTokenRetry: authDeviceToken != nil)
|
||||
if shouldRetryWithDeviceToken {
|
||||
self.pendingDeviceTokenRetry = true
|
||||
self.deviceTokenRetryBudgetUsed = true
|
||||
self.backoffMs = min(self.backoffMs, 250)
|
||||
} else if authDeviceToken != nil,
|
||||
let identity,
|
||||
self.shouldClearStoredDeviceTokenAfterRetry(error)
|
||||
{
|
||||
// Retry failed with an explicit device-token mismatch; clear stale local token.
|
||||
DeviceAuthStore.clearToken(deviceId: identity.deviceId, role: role)
|
||||
}
|
||||
throw error
|
||||
}
|
||||
@@ -443,7 +519,13 @@ public actor GatewayChannelActor {
|
||||
) async throws {
|
||||
if res.ok == false {
|
||||
let msg = (res.error?["message"]?.value as? String) ?? "gateway connect failed"
|
||||
throw NSError(domain: "Gateway", code: 1008, userInfo: [NSLocalizedDescriptionKey: msg])
|
||||
let details = res.error?["details"]?.value as? [String: ProtoAnyCodable]
|
||||
let detailCode = details?["code"]?.value as? String
|
||||
let canRetryWithDeviceToken = details?["canRetryWithDeviceToken"]?.value as? Bool ?? false
|
||||
throw GatewayConnectAuthError(
|
||||
message: msg,
|
||||
detailCode: detailCode,
|
||||
canRetryWithDeviceToken: canRetryWithDeviceToken)
|
||||
}
|
||||
guard let payload = res.payload else {
|
||||
throw NSError(
|
||||
@@ -616,19 +698,91 @@ public actor GatewayChannelActor {
|
||||
|
||||
private func scheduleReconnect() async {
|
||||
guard self.shouldReconnect else { return }
|
||||
guard !self.reconnectPausedForAuthFailure else { return }
|
||||
let delay = self.backoffMs / 1000
|
||||
self.backoffMs = min(self.backoffMs * 2, 30000)
|
||||
guard await self.sleepUnlessCancelled(nanoseconds: UInt64(delay * 1_000_000_000)) else { return }
|
||||
guard self.shouldReconnect else { return }
|
||||
guard !self.reconnectPausedForAuthFailure else { return }
|
||||
do {
|
||||
try await self.connect()
|
||||
} catch {
|
||||
if self.shouldPauseReconnectAfterAuthFailure(error) {
|
||||
self.reconnectPausedForAuthFailure = true
|
||||
self.logger.error(
|
||||
"gateway reconnect paused for non-recoverable auth failure " +
|
||||
"\(error.localizedDescription, privacy: .public)"
|
||||
)
|
||||
return
|
||||
}
|
||||
let wrapped = self.wrap(error, context: "gateway reconnect")
|
||||
self.logger.error("gateway reconnect failed \(wrapped.localizedDescription, privacy: .public)")
|
||||
await self.scheduleReconnect()
|
||||
}
|
||||
}
|
||||
|
||||
private func shouldRetryWithStoredDeviceToken(
|
||||
error: Error,
|
||||
explicitGatewayToken: String?,
|
||||
storedToken: String?,
|
||||
attemptedDeviceTokenRetry: Bool
|
||||
) -> Bool {
|
||||
if self.deviceTokenRetryBudgetUsed {
|
||||
return false
|
||||
}
|
||||
if attemptedDeviceTokenRetry {
|
||||
return false
|
||||
}
|
||||
guard explicitGatewayToken != nil, storedToken != nil else {
|
||||
return false
|
||||
}
|
||||
guard self.isTrustedDeviceRetryEndpoint() else {
|
||||
return false
|
||||
}
|
||||
guard let authError = error as? GatewayConnectAuthError else {
|
||||
return false
|
||||
}
|
||||
return authError.canRetryWithDeviceToken ||
|
||||
authError.detailCode == GatewayConnectErrorCodes.authTokenMismatch
|
||||
}
|
||||
|
||||
private func shouldPauseReconnectAfterAuthFailure(_ error: Error) -> Bool {
|
||||
guard let authError = error as? GatewayConnectAuthError else {
|
||||
return false
|
||||
}
|
||||
if authError.isNonRecoverable {
|
||||
return true
|
||||
}
|
||||
if authError.detailCode == GatewayConnectErrorCodes.authTokenMismatch &&
|
||||
self.deviceTokenRetryBudgetUsed && !self.pendingDeviceTokenRetry
|
||||
{
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
private func shouldClearStoredDeviceTokenAfterRetry(_ error: Error) -> Bool {
|
||||
guard let authError = error as? GatewayConnectAuthError else {
|
||||
return false
|
||||
}
|
||||
return authError.detailCode == GatewayConnectErrorCodes.authDeviceTokenMismatch
|
||||
}
|
||||
|
||||
private func isTrustedDeviceRetryEndpoint() -> Bool {
|
||||
// This client currently treats loopback as the only trusted retry target.
|
||||
// Unlike the Node gateway client, it does not yet expose a pinned TLS-fingerprint
|
||||
// trust path for remote retry, so remote fallback remains disabled by default.
|
||||
guard let host = self.url.host?.trimmingCharacters(in: .whitespacesAndNewlines).lowercased(),
|
||||
!host.isEmpty
|
||||
else {
|
||||
return false
|
||||
}
|
||||
if host == "localhost" || host == "::1" || host == "127.0.0.1" || host.hasPrefix("127.") {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
private nonisolated func sleepUnlessCancelled(nanoseconds: UInt64) async -> Bool {
|
||||
do {
|
||||
try await Task.sleep(nanoseconds: nanoseconds)
|
||||
@@ -756,7 +910,8 @@ public actor GatewayChannelActor {
|
||||
return (id: id, data: data)
|
||||
} catch {
|
||||
self.logger.error(
|
||||
"gateway \(kind) encode failed \(method, privacy: .public) error=\(error.localizedDescription, privacy: .public)")
|
||||
"gateway \(kind) encode failed \(method, privacy: .public) " +
|
||||
"error=\(error.localizedDescription, privacy: .public)")
|
||||
throw error
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user