iOS/Gateway: stabilize background wake and reconnect behavior (#21226)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: 7705a7741e
Co-authored-by: mbelinky <132747814+mbelinky@users.noreply.github.com>
Co-authored-by: mbelinky <132747814+mbelinky@users.noreply.github.com>
Reviewed-by: @mbelinky
This commit is contained in:
Mariano
2026-02-19 20:20:28 +00:00
committed by GitHub
parent f7a8c2df2c
commit e98ccc8e17
10 changed files with 604 additions and 54 deletions

View File

@@ -6,6 +6,8 @@ Docs: https://docs.openclaw.ai
### Changes
- iOS/Gateway: stabilize background wake and reconnect behavior with background reconnect suppression/lease windows, BGAppRefresh wake fallback, location wake hook throttling, and APNs wake retry+nudge instrumentation. (#21226) thanks @mbelinky.
### Fixes
- Discord/Gateway: handle close code 4014 (missing privileged gateway intents) without crashing the gateway. Thanks @thewilloftheshadow.

View File

@@ -67,6 +67,37 @@ pnpm ios:open
- iPhone node commands in foreground: camera snap/clip, canvas present/navigate/eval/snapshot, screen record, location, contacts, calendar, reminders, photos, motion, local notifications.
- Share extension deep-link forwarding into the connected gateway session.
## Location Automation Use Case (Testing)
Use this for automation signals ("I moved", "I arrived", "I left"), not as a keep-awake mechanism.
- Product intent:
- movement-aware automations driven by iOS location events
- example: arrival/exit geofence, significant movement, visit detection
- Non-goal:
- continuous GPS polling just to keep the app alive
Test path to include in QA runs:
1. Enable location permission in app:
- set `Always` permission
- verify background location capability is enabled in the build profile
2. Background the app and trigger movement:
- walk/drive enough for a significant location update, or cross a configured geofence
3. Validate gateway side effects:
- node reconnect/wake if needed
- expected location/movement event arrives at gateway
- automation trigger executes once (no duplicate storm)
4. Validate resource impact:
- no sustained high thermal state
- no excessive background battery drain over a short observation window
Pass criteria:
- movement events are delivered reliably enough for automation UX
- no location-driven reconnect spam loops
- app remains stable after repeated background/foreground transitions
## Known Issues / Limitations / Problems
- Foreground-first: iOS can suspend sockets in background; reconnect recovery is still being tuned.

View File

@@ -64,6 +64,10 @@
<string>audio</string>
<string>remote-notification</string>
</array>
<key>BGTaskSchedulerPermittedIdentifiers</key>
<array>
<string>ai.openclaw.ios.bgrefresh</string>
</array>
<key>UILaunchScreen</key>
<dict/>
<key>UISupportedInterfaceOrientations</key>

View File

@@ -10,7 +10,8 @@ enum SignificantLocationMonitor {
static func startIfNeeded(
locationService: any LocationServicing,
locationMode: OpenClawLocationMode,
gateway: GatewayNodeSession
gateway: GatewayNodeSession,
beforeSend: (@MainActor @Sendable () async -> Void)? = nil
) {
guard locationMode == .always else { return }
let status = locationService.authorizationStatus()
@@ -31,6 +32,9 @@ enum SignificantLocationMonitor {
let json = String(data: data, encoding: .utf8)
else { return }
Task { @MainActor in
if let beforeSend {
await beforeSend()
}
await gateway.sendEvent(event: "location.update", payloadJSON: json)
}
}

View File

@@ -42,6 +42,7 @@ private final class NotificationInvokeLatch<T: Sendable>: @unchecked Sendable {
final class NodeAppModel {
private let deepLinkLogger = Logger(subsystem: "ai.openclaw.ios", category: "DeepLink")
private let pushWakeLogger = Logger(subsystem: "ai.openclaw.ios", category: "PushWake")
private let locationWakeLogger = Logger(subsystem: "ai.openclaw.ios", category: "LocationWake")
enum CameraHUDKind {
case photo
case recording
@@ -103,6 +104,11 @@ final class NodeAppModel {
private var backgroundTalkKeptActive = false
private var backgroundedAt: Date?
private var reconnectAfterBackgroundArmed = false
private var backgroundGraceTaskID: UIBackgroundTaskIdentifier = .invalid
@ObservationIgnored private var backgroundGraceTaskTimer: Task<Void, Never>?
private var backgroundReconnectSuppressed = false
private var backgroundReconnectLeaseUntil: Date?
private var lastSignificantLocationWakeAt: Date?
private var gatewayConnected = false
private var operatorConnected = false
@@ -271,6 +277,7 @@ final class NodeAppModel {
self.stopGatewayHealthMonitor()
self.backgroundedAt = Date()
self.reconnectAfterBackgroundArmed = true
self.beginBackgroundConnectionGracePeriod()
// Release voice wake mic in background.
self.backgroundVoiceWakeSuspended = self.voiceWake.suspendForExternalAudioCapture()
let shouldKeepTalkActive = keepTalkActive && self.talkMode.isEnabled
@@ -278,6 +285,8 @@ final class NodeAppModel {
self.backgroundTalkSuspended = self.talkMode.suspendForBackground(keepActive: shouldKeepTalkActive)
case .active, .inactive:
self.isBackgrounded = false
self.endBackgroundConnectionGracePeriod(reason: "scene_foreground")
self.clearBackgroundReconnectSuppression(reason: "scene_foreground")
if self.operatorConnected {
self.startGatewayHealthMonitor()
}
@@ -329,9 +338,98 @@ final class NodeAppModel {
}
@unknown default:
self.isBackgrounded = false
self.endBackgroundConnectionGracePeriod(reason: "scene_unknown")
self.clearBackgroundReconnectSuppression(reason: "scene_unknown")
}
}
private func beginBackgroundConnectionGracePeriod(seconds: TimeInterval = 25) {
self.grantBackgroundReconnectLease(seconds: seconds, reason: "scene_background_grace")
self.endBackgroundConnectionGracePeriod(reason: "restart")
let taskID = UIApplication.shared.beginBackgroundTask(withName: "gateway-background-grace") { [weak self] in
Task { @MainActor in
self?.suppressBackgroundReconnect(
reason: "background_grace_expired",
disconnectIfNeeded: true)
self?.endBackgroundConnectionGracePeriod(reason: "expired")
}
}
guard taskID != .invalid else {
self.pushWakeLogger.info("Background grace unavailable: beginBackgroundTask returned invalid")
return
}
self.backgroundGraceTaskID = taskID
self.pushWakeLogger.info("Background grace started seconds=\(seconds, privacy: .public)")
self.backgroundGraceTaskTimer = Task { [weak self] in
guard let self else { return }
try? await Task.sleep(nanoseconds: UInt64(max(1, seconds) * 1_000_000_000))
await MainActor.run {
self.suppressBackgroundReconnect(reason: "background_grace_timer", disconnectIfNeeded: true)
self.endBackgroundConnectionGracePeriod(reason: "timer")
}
}
}
private func endBackgroundConnectionGracePeriod(reason: String) {
self.backgroundGraceTaskTimer?.cancel()
self.backgroundGraceTaskTimer = nil
guard self.backgroundGraceTaskID != .invalid else { return }
UIApplication.shared.endBackgroundTask(self.backgroundGraceTaskID)
self.backgroundGraceTaskID = .invalid
self.pushWakeLogger.info("Background grace ended reason=\(reason, privacy: .public)")
}
private func grantBackgroundReconnectLease(seconds: TimeInterval, reason: String) {
guard self.isBackgrounded else { return }
let leaseSeconds = max(5, seconds)
let leaseUntil = Date().addingTimeInterval(leaseSeconds)
if let existing = self.backgroundReconnectLeaseUntil, existing > leaseUntil {
// Keep the longer lease if one is already active.
} else {
self.backgroundReconnectLeaseUntil = leaseUntil
}
let wasSuppressed = self.backgroundReconnectSuppressed
self.backgroundReconnectSuppressed = false
self.pushWakeLogger.info(
"Background reconnect lease reason=\(reason, privacy: .public) seconds=\(leaseSeconds, privacy: .public) wasSuppressed=\(wasSuppressed, privacy: .public)")
}
private func suppressBackgroundReconnect(reason: String, disconnectIfNeeded: Bool) {
guard self.isBackgrounded else { return }
let hadLease = self.backgroundReconnectLeaseUntil != nil
let changed = hadLease || !self.backgroundReconnectSuppressed
self.backgroundReconnectLeaseUntil = nil
self.backgroundReconnectSuppressed = true
guard changed else { return }
self.pushWakeLogger.info(
"Background reconnect suppressed reason=\(reason, privacy: .public) disconnect=\(disconnectIfNeeded, privacy: .public)")
guard disconnectIfNeeded else { return }
Task { [weak self] in
guard let self else { return }
await self.operatorGateway.disconnect()
await self.nodeGateway.disconnect()
await MainActor.run {
self.operatorConnected = false
self.gatewayConnected = false
self.talkMode.updateGatewayConnected(false)
if self.isBackgrounded {
self.gatewayStatusText = "Background idle"
self.gatewayServerName = nil
self.gatewayRemoteAddress = nil
self.showLocalCanvasOnDisconnect()
}
}
}
}
private func clearBackgroundReconnectSuppression(reason: String) {
let changed = self.backgroundReconnectSuppressed || self.backgroundReconnectLeaseUntil != nil
self.backgroundReconnectSuppressed = false
self.backgroundReconnectLeaseUntil = nil
guard changed else { return }
self.pushWakeLogger.info("Background reconnect cleared reason=\(reason, privacy: .public)")
}
func setVoiceWakeEnabled(_ enabled: Bool) {
self.voiceWake.setEnabled(enabled)
if enabled {
@@ -568,7 +666,7 @@ final class NodeAppModel {
} catch {
if let gatewayError = error as? GatewayResponseError {
let lower = gatewayError.message.lowercased()
if lower.contains("unauthorized role") {
if lower.contains("unauthorized role") || lower.contains("missing scope") {
await self.setGatewayHealthMonitorDisabled(true)
return true
}
@@ -601,7 +699,7 @@ final class NodeAppModel {
} catch {
if let gatewayError = error as? GatewayResponseError {
let lower = gatewayError.message.lowercased()
if lower.contains("unauthorized role") {
if lower.contains("unauthorized role") || lower.contains("missing scope") {
await self.setGatewayHealthMonitorDisabled(true)
return
}
@@ -1725,6 +1823,23 @@ private extension NodeAppModel {
self.apnsLastRegisteredTokenHex = nil
}
func refreshBackgroundReconnectSuppressionIfNeeded(source: String) {
guard self.isBackgrounded else { return }
guard !self.backgroundReconnectSuppressed else { return }
guard let leaseUntil = self.backgroundReconnectLeaseUntil else {
self.suppressBackgroundReconnect(reason: "\(source):no_lease", disconnectIfNeeded: true)
return
}
if Date() >= leaseUntil {
self.suppressBackgroundReconnect(reason: "\(source):lease_expired", disconnectIfNeeded: true)
}
}
func shouldPauseReconnectLoopInBackground(source: String) -> Bool {
self.refreshBackgroundReconnectSuppressionIfNeeded(source: source)
return self.isBackgrounded && self.backgroundReconnectSuppressed
}
func startOperatorGatewayLoop(
url: URL,
stableID: String,
@@ -1747,6 +1862,7 @@ private extension NodeAppModel {
try? await Task.sleep(nanoseconds: 1_000_000_000)
continue
}
if self.shouldPauseReconnectLoopInBackground(source: "operator_loop") { try? await Task.sleep(nanoseconds: 2_000_000_000); continue }
if await self.isOperatorConnected() {
try? await Task.sleep(nanoseconds: 1_000_000_000)
continue
@@ -1834,6 +1950,7 @@ private extension NodeAppModel {
try? await Task.sleep(nanoseconds: 1_000_000_000)
continue
}
if self.shouldPauseReconnectLoopInBackground(source: "node_loop") { try? await Task.sleep(nanoseconds: 2_000_000_000); continue }
if await self.isGatewayConnected() {
try? await Task.sleep(nanoseconds: 1_000_000_000)
continue
@@ -1883,7 +2000,15 @@ private extension NodeAppModel {
}
await self.showA2UIOnConnectIfNeeded()
await self.onNodeGatewayConnected()
await MainActor.run { SignificantLocationMonitor.startIfNeeded(locationService: self.locationService, locationMode: self.locationMode(), gateway: self.nodeGateway) }
await MainActor.run {
SignificantLocationMonitor.startIfNeeded(
locationService: self.locationService,
locationMode: self.locationMode(),
gateway: self.nodeGateway,
beforeSend: { [weak self] in
await self?.handleSignificantLocationWakeIfNeeded()
})
}
},
onDisconnected: { [weak self] reason in
guard let self else { return }
@@ -2135,12 +2260,59 @@ extension NodeAppModel {
}
func handleSilentPushWake(_ userInfo: [AnyHashable: Any]) async -> Bool {
let wakeId = Self.makePushWakeAttemptID()
guard Self.isSilentPushPayload(userInfo) else {
self.pushWakeLogger.info("Ignored APNs payload: not silent push")
self.pushWakeLogger.info("Ignored APNs payload wakeId=\(wakeId, privacy: .public): not silent push")
return false
}
self.pushWakeLogger.info("Silent push received; attempting reconnect if needed")
return await self.reconnectGatewaySessionsForSilentPushIfNeeded()
let pushKind = Self.openclawPushKind(userInfo)
self.pushWakeLogger.info(
"Silent push received wakeId=\(wakeId, privacy: .public) kind=\(pushKind, privacy: .public) backgrounded=\(self.isBackgrounded, privacy: .public) autoReconnect=\(self.gatewayAutoReconnectEnabled, privacy: .public)")
let result = await self.reconnectGatewaySessionsForSilentPushIfNeeded(wakeId: wakeId)
self.pushWakeLogger.info(
"Silent push outcome wakeId=\(wakeId, privacy: .public) applied=\(result.applied, privacy: .public) reason=\(result.reason, privacy: .public) durationMs=\(result.durationMs, privacy: .public)")
return result.applied
}
func handleBackgroundRefreshWake(trigger: String = "bg_app_refresh") async -> Bool {
let wakeId = Self.makePushWakeAttemptID()
self.pushWakeLogger.info(
"Background refresh wake received wakeId=\(wakeId, privacy: .public) trigger=\(trigger, privacy: .public) backgrounded=\(self.isBackgrounded, privacy: .public) autoReconnect=\(self.gatewayAutoReconnectEnabled, privacy: .public)")
let result = await self.reconnectGatewaySessionsForSilentPushIfNeeded(wakeId: wakeId)
self.pushWakeLogger.info(
"Background refresh wake outcome wakeId=\(wakeId, privacy: .public) applied=\(result.applied, privacy: .public) reason=\(result.reason, privacy: .public) durationMs=\(result.durationMs, privacy: .public)")
return result.applied
}
func handleSignificantLocationWakeIfNeeded() async {
let wakeId = Self.makePushWakeAttemptID()
let now = Date()
let throttleWindowSeconds: TimeInterval = 180
if await self.isGatewayConnected() {
self.locationWakeLogger.info(
"Location wake no-op wakeId=\(wakeId, privacy: .public): already connected")
return
}
if let last = self.lastSignificantLocationWakeAt,
now.timeIntervalSince(last) < throttleWindowSeconds
{
self.locationWakeLogger.info(
"Location wake throttled wakeId=\(wakeId, privacy: .public) elapsedSec=\(now.timeIntervalSince(last), privacy: .public)")
return
}
self.lastSignificantLocationWakeAt = now
self.locationWakeLogger.info(
"Location wake begin wakeId=\(wakeId, privacy: .public) backgrounded=\(self.isBackgrounded, privacy: .public) autoReconnect=\(self.gatewayAutoReconnectEnabled, privacy: .public)")
let result = await self.reconnectGatewaySessionsForSilentPushIfNeeded(wakeId: wakeId)
self.locationWakeLogger.info(
"Location wake trigger wakeId=\(wakeId, privacy: .public) applied=\(result.applied, privacy: .public) reason=\(result.reason, privacy: .public) durationMs=\(result.durationMs, privacy: .public)")
guard result.applied else { return }
let connected = await self.waitForGatewayConnection(timeoutMs: 5000, pollMs: 250)
self.locationWakeLogger.info(
"Location wake post-check wakeId=\(wakeId, privacy: .public) connected=\(connected, privacy: .public)")
}
func updateAPNsDeviceToken(_ tokenData: Data) {
@@ -2210,28 +2382,83 @@ extension NodeAppModel {
return false
}
private func reconnectGatewaySessionsForSilentPushIfNeeded() async -> Bool {
guard self.isBackgrounded else {
self.pushWakeLogger.info("Wake no-op: app not backgrounded")
return false
private static func makePushWakeAttemptID() -> String {
let raw = UUID().uuidString.replacingOccurrences(of: "-", with: "")
return String(raw.prefix(8))
}
private static func openclawPushKind(_ userInfo: [AnyHashable: Any]) -> String {
if let payload = userInfo["openclaw"] as? [String: Any],
let kind = payload["kind"] as? String
{
let trimmed = kind.trimmingCharacters(in: .whitespacesAndNewlines)
if !trimmed.isEmpty { return trimmed }
}
guard self.gatewayAutoReconnectEnabled else {
self.pushWakeLogger.info("Wake no-op: auto reconnect disabled")
return false
if let payload = userInfo["openclaw"] as? [AnyHashable: Any],
let kind = payload["kind"] as? String
{
let trimmed = kind.trimmingCharacters(in: .whitespacesAndNewlines)
if !trimmed.isEmpty { return trimmed }
}
guard self.activeGatewayConnectConfig != nil else {
self.pushWakeLogger.info("Wake no-op: no active gateway config")
return false
return "unknown"
}
private struct SilentPushWakeAttemptResult {
var applied: Bool
var reason: String
var durationMs: Int
}
private func waitForGatewayConnection(timeoutMs: Int, pollMs: Int) async -> Bool {
let clampedTimeoutMs = max(0, timeoutMs)
let pollIntervalNs = UInt64(max(50, pollMs)) * 1_000_000
let deadline = Date().addingTimeInterval(Double(clampedTimeoutMs) / 1000.0)
while Date() < deadline {
if await self.isGatewayConnected() {
return true
}
try? await Task.sleep(nanoseconds: pollIntervalNs)
}
return await self.isGatewayConnected()
}
private func reconnectGatewaySessionsForSilentPushIfNeeded(
wakeId: String
) async -> SilentPushWakeAttemptResult {
let startedAt = Date()
let makeResult: (Bool, String) -> SilentPushWakeAttemptResult = { applied, reason in
let durationMs = Int(Date().timeIntervalSince(startedAt) * 1000)
return SilentPushWakeAttemptResult(
applied: applied,
reason: reason,
durationMs: max(0, durationMs))
}
guard self.isBackgrounded else {
self.pushWakeLogger.info("Wake no-op wakeId=\(wakeId, privacy: .public): app not backgrounded")
return makeResult(false, "not_backgrounded")
}
guard self.gatewayAutoReconnectEnabled else {
self.pushWakeLogger.info("Wake no-op wakeId=\(wakeId, privacy: .public): auto reconnect disabled")
return makeResult(false, "auto_reconnect_disabled")
}
guard let cfg = self.activeGatewayConnectConfig else {
self.pushWakeLogger.info("Wake no-op wakeId=\(wakeId, privacy: .public): no active gateway config")
return makeResult(false, "no_active_gateway_config")
}
self.pushWakeLogger.info(
"Wake reconnect begin wakeId=\(wakeId, privacy: .public) stableID=\(cfg.stableID, privacy: .public)")
self.grantBackgroundReconnectLease(seconds: 30, reason: "wake_\(wakeId)")
await self.operatorGateway.disconnect()
await self.nodeGateway.disconnect()
self.operatorConnected = false
self.gatewayConnected = false
self.gatewayStatusText = "Reconnecting…"
self.talkMode.updateGatewayConnected(false)
self.pushWakeLogger.info("Wake reconnect trigger applied")
return true
self.applyGatewayConnectConfig(cfg)
self.pushWakeLogger.info("Wake reconnect trigger applied wakeId=\(wakeId, privacy: .public)")
return makeResult(true, "reconnect_triggered")
}
}

View File

@@ -2,9 +2,13 @@ import SwiftUI
import Foundation
import os
import UIKit
import BackgroundTasks
final class OpenClawAppDelegate: NSObject, UIApplicationDelegate {
private let logger = Logger(subsystem: "ai.openclaw.ios", category: "Push")
private let backgroundWakeLogger = Logger(subsystem: "ai.openclaw.ios", category: "BackgroundWake")
private static let wakeRefreshTaskIdentifier = "ai.openclaw.ios.bgrefresh"
private var backgroundWakeTask: Task<Bool, Never>?
private var pendingAPNsDeviceToken: Data?
weak var appModel: NodeAppModel? {
didSet {
@@ -21,6 +25,7 @@ final class OpenClawAppDelegate: NSObject, UIApplicationDelegate {
didFinishLaunchingWithOptions launchOptions: [UIApplication.LaunchOptionsKey: Any]? = nil
) -> Bool
{
self.registerBackgroundWakeRefreshTask()
application.registerForRemoteNotifications()
return true
}
@@ -49,14 +54,70 @@ final class OpenClawAppDelegate: NSObject, UIApplicationDelegate {
Task { @MainActor in
guard let appModel = self.appModel else {
self.logger.info("APNs wake skipped: appModel unavailable")
self.scheduleBackgroundWakeRefresh(afterSeconds: 90, reason: "silent_push_no_model")
completionHandler(.noData)
return
}
let handled = await appModel.handleSilentPushWake(userInfo)
self.logger.info("APNs wake handled=\(handled, privacy: .public)")
if !handled {
self.scheduleBackgroundWakeRefresh(afterSeconds: 90, reason: "silent_push_not_applied")
}
completionHandler(handled ? .newData : .noData)
}
}
func scenePhaseChanged(_ phase: ScenePhase) {
if phase == .background {
self.scheduleBackgroundWakeRefresh(afterSeconds: 120, reason: "scene_background")
}
}
private func registerBackgroundWakeRefreshTask() {
BGTaskScheduler.shared.register(
forTaskWithIdentifier: Self.wakeRefreshTaskIdentifier,
using: nil
) { [weak self] task in
guard let refreshTask = task as? BGAppRefreshTask else {
task.setTaskCompleted(success: false)
return
}
self?.handleBackgroundWakeRefresh(task: refreshTask)
}
}
private func scheduleBackgroundWakeRefresh(afterSeconds delay: TimeInterval, reason: String) {
let request = BGAppRefreshTaskRequest(identifier: Self.wakeRefreshTaskIdentifier)
request.earliestBeginDate = Date().addingTimeInterval(max(60, delay))
do {
try BGTaskScheduler.shared.submit(request)
self.backgroundWakeLogger.info(
"Scheduled background wake refresh reason=\(reason, privacy: .public) delaySeconds=\(max(60, delay), privacy: .public)")
} catch {
self.backgroundWakeLogger.error(
"Failed scheduling background wake refresh reason=\(reason, privacy: .public) error=\(error.localizedDescription, privacy: .public)")
}
}
private func handleBackgroundWakeRefresh(task: BGAppRefreshTask) {
self.scheduleBackgroundWakeRefresh(afterSeconds: 15 * 60, reason: "reschedule")
self.backgroundWakeTask?.cancel()
let wakeTask = Task { @MainActor [weak self] in
guard let self, let appModel = self.appModel else { return false }
return await appModel.handleBackgroundRefreshWake(trigger: "bg_app_refresh")
}
self.backgroundWakeTask = wakeTask
task.expirationHandler = {
wakeTask.cancel()
}
Task {
let applied = await wakeTask.value
task.setTaskCompleted(success: applied)
self.backgroundWakeLogger.info(
"Background wake refresh finished applied=\(applied, privacy: .public)")
}
}
}
@main
@@ -89,6 +150,7 @@ struct OpenClawApp: App {
.onChange(of: self.scenePhase) { _, newValue in
self.appModel.setScenePhase(newValue)
self.gatewayController.setScenePhase(newValue)
self.appDelegate.scenePhaseChanged(newValue)
}
}
}

View File

@@ -7,6 +7,7 @@ public protocol WebSocketTasking: AnyObject {
func resume()
func cancel(with closeCode: URLSessionWebSocketTask.CloseCode, reason: Data?)
func send(_ message: URLSessionWebSocketTask.Message) async throws
func sendPing(pongReceiveHandler: @escaping @Sendable (Error?) -> Void)
func receive() async throws -> URLSessionWebSocketTask.Message
func receive(completionHandler: @escaping @Sendable (Result<URLSessionWebSocketTask.Message, Error>) -> Void)
}
@@ -40,6 +41,18 @@ public struct WebSocketTaskBox: @unchecked Sendable {
{
self.task.receive(completionHandler: completionHandler)
}
public func sendPing() async throws {
try await withCheckedThrowingContinuation { (continuation: CheckedContinuation<Void, Error>) in
self.task.sendPing { error in
if let error {
continuation.resume(throwing: error)
} else {
continuation.resume(returning: ())
}
}
}
}
}
public protocol WebSocketSessioning: AnyObject {
@@ -213,7 +226,7 @@ public actor GatewayChannelActor {
private func watchdogLoop() async {
// Keep nudging reconnect in case exponential backoff stalls.
while self.shouldReconnect {
try? await Task.sleep(nanoseconds: 30 * 1_000_000_000) // 30s cadence
guard await self.sleepUnlessCancelled(nanoseconds: 30 * 1_000_000_000) else { return } // 30s cadence
guard self.shouldReconnect else { return }
if self.connected { continue }
do {
@@ -285,13 +298,15 @@ public actor GatewayChannelActor {
private func keepaliveLoop() async {
while self.shouldReconnect {
try? await Task.sleep(nanoseconds: UInt64(self.keepaliveIntervalSeconds * 1_000_000_000))
guard await self.sleepUnlessCancelled(
nanoseconds: UInt64(self.keepaliveIntervalSeconds * 1_000_000_000))
else { return }
guard self.shouldReconnect else { return }
guard self.connected else { continue }
// Best-effort outbound message to keep intermediate NAT/proxy state alive.
// We intentionally ignore the response.
guard let task = self.task else { continue }
// Best-effort ping keeps NAT/proxy state alive without generating RPC load.
do {
try await self.send(method: "health", params: nil)
try await task.sendPing()
} catch {
// Avoid spamming logs; the reconnect paths will surface meaningful errors.
}
@@ -593,7 +608,7 @@ public actor GatewayChannelActor {
private func watchTicks() async {
let tolerance = self.tickIntervalMs * 2
while self.connected {
try? await Task.sleep(nanoseconds: UInt64(tolerance * 1_000_000))
guard await self.sleepUnlessCancelled(nanoseconds: UInt64(tolerance * 1_000_000)) else { return }
guard self.connected else { return }
if let last = self.lastTick {
let delta = Date().timeIntervalSince(last) * 1000
@@ -616,7 +631,7 @@ public actor GatewayChannelActor {
guard self.shouldReconnect else { return }
let delay = self.backoffMs / 1000
self.backoffMs = min(self.backoffMs * 2, 30000)
try? await Task.sleep(nanoseconds: UInt64(delay * 1_000_000_000))
guard await self.sleepUnlessCancelled(nanoseconds: UInt64(delay * 1_000_000_000)) else { return }
guard self.shouldReconnect else { return }
do {
try await self.connect()
@@ -627,6 +642,15 @@ public actor GatewayChannelActor {
}
}
private nonisolated func sleepUnlessCancelled(nanoseconds: UInt64) async -> Bool {
do {
try await Task.sleep(nanoseconds: nanoseconds)
} catch {
return false
}
return !Task.isCancelled
}
public func request(
method: String,
params: [String: AnyCodable]?,

View File

@@ -75,6 +75,10 @@ private final class FakeGatewayWebSocketTask: WebSocketTasking, @unchecked Senda
}
}
func sendPing(pongReceiveHandler: @escaping @Sendable (Error?) -> Void) {
pongReceiveHandler(nil)
}
func receive() async throws -> URLSessionWebSocketTask.Message {
let phase = self.lock.withLock { () -> Int in
let current = self.receivePhase

View File

@@ -13,6 +13,7 @@ const mocks = vi.hoisted(() => ({
loadApnsRegistration: vi.fn(),
resolveApnsAuthConfigFromEnv: vi.fn(),
sendApnsBackgroundWake: vi.fn(),
sendApnsAlert: vi.fn(),
}));
vi.mock("../../config/config.js", () => ({
@@ -32,6 +33,7 @@ vi.mock("../../infra/push-apns.js", () => ({
loadApnsRegistration: mocks.loadApnsRegistration,
resolveApnsAuthConfigFromEnv: mocks.resolveApnsAuthConfigFromEnv,
sendApnsBackgroundWake: mocks.sendApnsBackgroundWake,
sendApnsAlert: mocks.sendApnsAlert,
}));
type RespondCall = [
@@ -81,12 +83,17 @@ async function invokeNode(params: {
requestParams?: Partial<Record<string, unknown>>;
}) {
const respond = vi.fn();
const logGateway = {
info: vi.fn(),
warn: vi.fn(),
};
await nodeHandlers["node.invoke"]({
params: makeNodeInvokeParams(params.requestParams),
respond: respond as never,
context: {
nodeRegistry: params.nodeRegistry,
execApprovalManager: undefined,
logGateway,
} as never,
client: null,
req: { type: "req", id: "req-node-invoke", method: "node.invoke" },
@@ -135,6 +142,7 @@ describe("node.invoke APNs wake path", () => {
mocks.loadApnsRegistration.mockReset();
mocks.resolveApnsAuthConfigFromEnv.mockReset();
mocks.sendApnsBackgroundWake.mockReset();
mocks.sendApnsAlert.mockReset();
});
afterEach(() => {
@@ -202,7 +210,7 @@ describe("node.invoke APNs wake path", () => {
expect(call?.[1]).toMatchObject({ ok: true, nodeId: "ios-node-reconnect" });
});
it("throttles repeated wake attempts for the same disconnected node", async () => {
it("forces one retry wake when the first wake still fails to reconnect", async () => {
vi.useFakeTimers();
mockSuccessfulWakeConfig("ios-node-throttle");
@@ -211,21 +219,14 @@ describe("node.invoke APNs wake path", () => {
invoke: vi.fn().mockResolvedValue({ ok: true }),
};
const first = invokeNode({
const invokePromise = invokeNode({
nodeRegistry,
requestParams: { nodeId: "ios-node-throttle", idempotencyKey: "idem-throttle-1" },
});
await vi.advanceTimersByTimeAsync(WAKE_WAIT_TIMEOUT_MS);
await first;
await vi.advanceTimersByTimeAsync(20_000);
await invokePromise;
const second = invokeNode({
nodeRegistry,
requestParams: { nodeId: "ios-node-throttle", idempotencyKey: "idem-throttle-2" },
});
await vi.advanceTimersByTimeAsync(WAKE_WAIT_TIMEOUT_MS);
await second;
expect(mocks.sendApnsBackgroundWake).toHaveBeenCalledTimes(1);
expect(mocks.sendApnsBackgroundWake).toHaveBeenCalledTimes(2);
expect(nodeRegistry.invoke).not.toHaveBeenCalled();
});
});

View File

@@ -11,6 +11,7 @@ import {
import {
loadApnsRegistration,
resolveApnsAuthConfigFromEnv,
sendApnsAlert,
sendApnsBackgroundWake,
} from "../../infra/push-apns.js";
import { isNodeCommandAllowed, resolveNodeCommandAllowlist } from "../node-command-policy.js";
@@ -40,15 +41,36 @@ import {
import type { GatewayRequestHandlers } from "./types.js";
const NODE_WAKE_RECONNECT_WAIT_MS = 3_000;
const NODE_WAKE_RECONNECT_RETRY_WAIT_MS = 12_000;
const NODE_WAKE_RECONNECT_POLL_MS = 150;
const NODE_WAKE_THROTTLE_MS = 15_000;
const NODE_WAKE_NUDGE_THROTTLE_MS = 10 * 60_000;
type NodeWakeState = {
lastWakeAtMs: number;
inFlight?: Promise<boolean>;
inFlight?: Promise<NodeWakeAttempt>;
};
const nodeWakeById = new Map<string, NodeWakeState>();
const nodeWakeNudgeById = new Map<string, number>();
type NodeWakeAttempt = {
available: boolean;
throttled: boolean;
path: "throttled" | "no-registration" | "no-auth" | "sent" | "send-error";
durationMs: number;
apnsStatus?: number;
apnsReason?: string;
};
type NodeWakeNudgeAttempt = {
sent: boolean;
throttled: boolean;
reason: "throttled" | "no-registration" | "no-auth" | "send-error" | "apns-not-ok" | "sent";
durationMs: number;
apnsStatus?: number;
apnsReason?: string;
};
function isNodeEntry(entry: { role?: string; roles?: string[] }) {
if (entry.role === "node") {
@@ -64,7 +86,10 @@ async function delayMs(ms: number): Promise<void> {
await new Promise<void>((resolve) => setTimeout(resolve, ms));
}
async function maybeWakeNodeWithApns(nodeId: string): Promise<boolean> {
async function maybeWakeNodeWithApns(
nodeId: string,
opts?: { force?: boolean },
): Promise<NodeWakeAttempt> {
const state = nodeWakeById.get(nodeId) ?? { lastWakeAtMs: 0 };
nodeWakeById.set(nodeId, state);
@@ -73,36 +98,75 @@ async function maybeWakeNodeWithApns(nodeId: string): Promise<boolean> {
}
const now = Date.now();
if (state.lastWakeAtMs > 0 && now - state.lastWakeAtMs < NODE_WAKE_THROTTLE_MS) {
return true;
const force = opts?.force === true;
if (!force && state.lastWakeAtMs > 0 && now - state.lastWakeAtMs < NODE_WAKE_THROTTLE_MS) {
return { available: true, throttled: true, path: "throttled", durationMs: 0 };
}
state.inFlight = (async () => {
const startedAtMs = Date.now();
const withDuration = (attempt: Omit<NodeWakeAttempt, "durationMs">): NodeWakeAttempt => ({
...attempt,
durationMs: Math.max(0, Date.now() - startedAtMs),
});
try {
const registration = await loadApnsRegistration(nodeId);
if (!registration) {
return false;
return withDuration({ available: false, throttled: false, path: "no-registration" });
}
const auth = await resolveApnsAuthConfigFromEnv(process.env);
if (!auth.ok) {
return false;
return withDuration({
available: false,
throttled: false,
path: "no-auth",
apnsReason: auth.error,
});
}
state.lastWakeAtMs = Date.now();
await sendApnsBackgroundWake({
const wakeResult = await sendApnsBackgroundWake({
auth: auth.value,
registration,
nodeId,
wakeReason: "node.invoke",
});
} catch {
// Best-effort wake only.
if (state.lastWakeAtMs === 0) {
return false;
if (!wakeResult.ok) {
return withDuration({
available: true,
throttled: false,
path: "send-error",
apnsStatus: wakeResult.status,
apnsReason: wakeResult.reason,
});
}
return withDuration({
available: true,
throttled: false,
path: "sent",
apnsStatus: wakeResult.status,
apnsReason: wakeResult.reason,
});
} catch (err) {
// Best-effort wake only.
const message = err instanceof Error ? err.message : String(err);
if (state.lastWakeAtMs === 0) {
return withDuration({
available: false,
throttled: false,
path: "send-error",
apnsReason: message,
});
}
return withDuration({
available: true,
throttled: false,
path: "send-error",
apnsReason: message,
});
}
return true;
})();
try {
@@ -112,6 +176,70 @@ async function maybeWakeNodeWithApns(nodeId: string): Promise<boolean> {
}
}
async function maybeSendNodeWakeNudge(nodeId: string): Promise<NodeWakeNudgeAttempt> {
const startedAtMs = Date.now();
const withDuration = (
attempt: Omit<NodeWakeNudgeAttempt, "durationMs">,
): NodeWakeNudgeAttempt => ({
...attempt,
durationMs: Math.max(0, Date.now() - startedAtMs),
});
const lastNudgeAtMs = nodeWakeNudgeById.get(nodeId) ?? 0;
if (lastNudgeAtMs > 0 && Date.now() - lastNudgeAtMs < NODE_WAKE_NUDGE_THROTTLE_MS) {
return withDuration({ sent: false, throttled: true, reason: "throttled" });
}
const registration = await loadApnsRegistration(nodeId);
if (!registration) {
return withDuration({ sent: false, throttled: false, reason: "no-registration" });
}
const auth = await resolveApnsAuthConfigFromEnv(process.env);
if (!auth.ok) {
return withDuration({
sent: false,
throttled: false,
reason: "no-auth",
apnsReason: auth.error,
});
}
try {
const result = await sendApnsAlert({
auth: auth.value,
registration,
nodeId,
title: "OpenClaw needs a quick reopen",
body: "Tap to reopen OpenClaw and restore the node connection.",
});
if (!result.ok) {
return withDuration({
sent: false,
throttled: false,
reason: "apns-not-ok",
apnsStatus: result.status,
apnsReason: result.reason,
});
}
nodeWakeNudgeById.set(nodeId, Date.now());
return withDuration({
sent: true,
throttled: false,
reason: "sent",
apnsStatus: result.status,
apnsReason: result.reason,
});
} catch (err) {
const message = err instanceof Error ? err.message : String(err);
return withDuration({
sent: false,
throttled: false,
reason: "send-error",
apnsReason: message,
});
}
}
async function waitForNodeReconnect(params: {
nodeId: string;
context: { nodeRegistry: { get: (nodeId: string) => unknown } };
@@ -430,7 +558,7 @@ export const nodeHandlers: GatewayRequestHandlers = {
);
});
},
"node.invoke": async ({ params, respond, context, client }) => {
"node.invoke": async ({ params, respond, context, client, req }) => {
if (!validateNodeInvokeParams(params)) {
respondInvalidParams({
respond,
@@ -472,12 +600,70 @@ export const nodeHandlers: GatewayRequestHandlers = {
await respondUnavailableOnThrow(respond, async () => {
let nodeSession = context.nodeRegistry.get(nodeId);
if (!nodeSession) {
const wakeAvailable = await maybeWakeNodeWithApns(nodeId);
if (wakeAvailable) {
await waitForNodeReconnect({ nodeId, context });
const wakeReqId = req.id;
const wakeFlowStartedAtMs = Date.now();
context.logGateway.info(
`node wake start node=${nodeId} req=${wakeReqId} command=${command}`,
);
const wake = await maybeWakeNodeWithApns(nodeId);
context.logGateway.info(
`node wake stage=wake1 node=${nodeId} req=${wakeReqId} ` +
`available=${wake.available} throttled=${wake.throttled} ` +
`path=${wake.path} durationMs=${wake.durationMs} ` +
`apnsStatus=${wake.apnsStatus ?? -1} apnsReason=${wake.apnsReason ?? "-"}`,
);
if (wake.available) {
const waitStartedAtMs = Date.now();
const waitTimeoutMs = NODE_WAKE_RECONNECT_WAIT_MS;
const reconnected = await waitForNodeReconnect({
nodeId,
context,
timeoutMs: waitTimeoutMs,
});
const waitDurationMs = Math.max(0, Date.now() - waitStartedAtMs);
context.logGateway.info(
`node wake stage=wait1 node=${nodeId} req=${wakeReqId} ` +
`reconnected=${reconnected} timeoutMs=${waitTimeoutMs} durationMs=${waitDurationMs}`,
);
}
nodeSession = context.nodeRegistry.get(nodeId);
if (!nodeSession && wake.available) {
const retryWake = await maybeWakeNodeWithApns(nodeId, { force: true });
context.logGateway.info(
`node wake stage=wake2 node=${nodeId} req=${wakeReqId} force=true ` +
`available=${retryWake.available} throttled=${retryWake.throttled} ` +
`path=${retryWake.path} durationMs=${retryWake.durationMs} ` +
`apnsStatus=${retryWake.apnsStatus ?? -1} apnsReason=${retryWake.apnsReason ?? "-"}`,
);
if (retryWake.available) {
const waitStartedAtMs = Date.now();
const waitTimeoutMs = NODE_WAKE_RECONNECT_RETRY_WAIT_MS;
const reconnected = await waitForNodeReconnect({
nodeId,
context,
timeoutMs: waitTimeoutMs,
});
const waitDurationMs = Math.max(0, Date.now() - waitStartedAtMs);
context.logGateway.info(
`node wake stage=wait2 node=${nodeId} req=${wakeReqId} ` +
`reconnected=${reconnected} timeoutMs=${waitTimeoutMs} durationMs=${waitDurationMs}`,
);
}
nodeSession = context.nodeRegistry.get(nodeId);
}
if (!nodeSession) {
const totalDurationMs = Math.max(0, Date.now() - wakeFlowStartedAtMs);
const nudge = await maybeSendNodeWakeNudge(nodeId);
context.logGateway.info(
`node wake nudge node=${nodeId} req=${wakeReqId} sent=${nudge.sent} ` +
`throttled=${nudge.throttled} reason=${nudge.reason} durationMs=${nudge.durationMs} ` +
`apnsStatus=${nudge.apnsStatus ?? -1} apnsReason=${nudge.apnsReason ?? "-"}`,
);
context.logGateway.warn(
`node wake done node=${nodeId} req=${wakeReqId} connected=false ` +
`reason=not_connected totalMs=${totalDurationMs}`,
);
respond(
false,
undefined,
@@ -487,6 +673,11 @@ export const nodeHandlers: GatewayRequestHandlers = {
);
return;
}
const totalDurationMs = Math.max(0, Date.now() - wakeFlowStartedAtMs);
context.logGateway.info(
`node wake done node=${nodeId} req=${wakeReqId} connected=true totalMs=${totalDurationMs}`,
);
}
const cfg = loadConfig();
const allowlist = resolveNodeCommandAllowlist(cfg, nodeSession);