diff --git a/apps/ios/Sources/Gateway/GatewayConnectionController.swift b/apps/ios/Sources/Gateway/GatewayConnectionController.swift
index 1ae7e7fd70..2a918bcfdb 100644
--- a/apps/ios/Sources/Gateway/GatewayConnectionController.swift
+++ b/apps/ios/Sources/Gateway/GatewayConnectionController.swift
@@ -355,6 +355,8 @@ final class GatewayConnectionController {
             OpenClawSystemCommand.notify.rawValue,
             OpenClawTalkCommand.pttStart.rawValue,
             OpenClawTalkCommand.pttStop.rawValue,
+            OpenClawTalkCommand.pttCancel.rawValue,
+            OpenClawTalkCommand.pttOnce.rawValue,
         ]
 
         let caps = Set(self.currentCaps())
diff --git a/apps/ios/Sources/Model/NodeAppModel.swift b/apps/ios/Sources/Model/NodeAppModel.swift
index eea1fbe2ae..12fe1be241 100644
--- a/apps/ios/Sources/Model/NodeAppModel.swift
+++ b/apps/ios/Sources/Model/NodeAppModel.swift
@@ -636,7 +636,9 @@ final class NodeAppModel {
              OpenClawMotionCommand.pedometer.rawValue:
             return try await self.handleMotionInvoke(req)
         case OpenClawTalkCommand.pttStart.rawValue,
-             OpenClawTalkCommand.pttStop.rawValue:
+             OpenClawTalkCommand.pttStop.rawValue,
+             OpenClawTalkCommand.pttCancel.rawValue,
+             OpenClawTalkCommand.pttOnce.rawValue:
             return try await self.handleTalkInvoke(req)
         default:
             return BridgeInvokeResponse(
@@ -1175,6 +1177,30 @@ final class NodeAppModel {
             self.pttVoiceWakeSuspended = false
             let json = try Self.encodePayload(payload)
             return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: json)
+        case OpenClawTalkCommand.pttCancel.rawValue:
+            let payload = await self.talkMode.cancelPushToTalk()
+            self.voiceWake.resumeAfterExternalAudioCapture(wasSuspended: self.pttVoiceWakeSuspended)
+            self.pttVoiceWakeSuspended = false
+            let json = try Self.encodePayload(payload)
+            return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: json)
+        case OpenClawTalkCommand.pttOnce.rawValue:
+            // Keep a suspension recorded by an earlier pttStart instead of overwriting it.
+            let suspendedNow = self.voiceWake.suspendForExternalAudioCapture()
+            self.pttVoiceWakeSuspended = self.pttVoiceWakeSuspended || suspendedNow
+            // "busy" means a manual PTT session is still capturing; in that case voice wake
+            // stays suspended and the later pttStop/pttCancel resumes it.
+            var manualSessionStillActive = false
+            defer {
+                if !manualSessionStillActive {
+                    self.voiceWake.resumeAfterExternalAudioCapture(
+                        wasSuspended: self.pttVoiceWakeSuspended)
+                    self.pttVoiceWakeSuspended = false
+                }
+            }
+            let payload = try await self.talkMode.runPushToTalkOnce()
+            manualSessionStillActive = payload.status == "busy"
+            let json = try Self.encodePayload(payload)
+            return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: json)
         default:
             return BridgeInvokeResponse(
                 id: req.id,
diff --git a/apps/ios/Sources/Voice/TalkModeManager.swift b/apps/ios/Sources/Voice/TalkModeManager.swift
index c5d94f5605..6d7c478128 100644
--- a/apps/ios/Sources/Voice/TalkModeManager.swift
+++ b/apps/ios/Sources/Voice/TalkModeManager.swift
@@ -26,6 +26,9 @@ final class TalkModeManager: NSObject {
     private var captureMode: CaptureMode = .idle
     private var resumeContinuousAfterPTT: Bool = false
     private var activePTTCaptureId: String?
+    private var pttAutoStopEnabled: Bool = false
+    private var pttCompletion: CheckedContinuation<OpenClawTalkPTTStopPayload, Never>?
+    private var pttTimeoutTask: Task<Void, Never>?
 
     private let allowSimulatorCapture: Bool
 
@@ -146,6 +149,18 @@ final class TalkModeManager: NSObject {
         self.stopRecognition()
         self.stopSpeaking()
         self.lastInterruptedAtSeconds = nil
+        let pendingPTT = self.pttCompletion != nil
+        let pendingCaptureId = self.activePTTCaptureId ?? UUID().uuidString
+        self.pttTimeoutTask?.cancel()
+        self.pttTimeoutTask = nil
+        self.pttAutoStopEnabled = false
+        if pendingPTT {
+            let payload = OpenClawTalkPTTStopPayload(
+                captureId: pendingCaptureId,
+                transcript: nil,
+                status: "cancelled")
+            self.finishPTTOnce(payload)
+        }
         self.resumeContinuousAfterPTT = false
         self.activePTTCaptureId = nil
         TalkSystemSpeechSynthesizer.shared.stop()
@@ -167,6 +182,9 @@ final class TalkModeManager: NSObject {
         }
 
         self.stopSpeaking(storeInterruption: false)
+        self.pttTimeoutTask?.cancel()
+        self.pttTimeoutTask = nil
+        self.pttAutoStopEnabled = false
 
         self.resumeContinuousAfterPTT = self.isEnabled && self.captureMode == .continuous
         self.silenceTask?.cancel()
@@ -218,16 +236,21 @@ final class TalkModeManager: NSObject {
     func endPushToTalk() async -> OpenClawTalkPTTStopPayload {
         let captureId = self.activePTTCaptureId ?? UUID().uuidString
         guard self.isPushToTalkActive else {
-            return OpenClawTalkPTTStopPayload(
+            let payload = OpenClawTalkPTTStopPayload(
                 captureId: captureId,
                 transcript: nil,
                 status: "idle")
+            self.finishPTTOnce(payload)
+            return payload
         }
 
         self.isPushToTalkActive = false
         self.isListening = false
         self.captureMode = .idle
         self.stopRecognition()
+        self.pttTimeoutTask?.cancel()
+        self.pttTimeoutTask = nil
+        self.pttAutoStopEnabled = false
 
         let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
         self.lastTranscript = ""
@@ -240,10 +263,12 @@ final class TalkModeManager: NSObject {
             }
             self.resumeContinuousAfterPTT = false
             self.activePTTCaptureId = nil
-            return OpenClawTalkPTTStopPayload(
+            let payload = OpenClawTalkPTTStopPayload(
                 captureId: captureId,
                 transcript: nil,
                 status: "empty")
+            self.finishPTTOnce(payload)
+            return payload
         }
 
         guard self.gatewayConnected else {
@@ -253,10 +278,12 @@ final class TalkModeManager: NSObject {
             }
             self.resumeContinuousAfterPTT = false
             self.activePTTCaptureId = nil
-            return OpenClawTalkPTTStopPayload(
+            let payload = OpenClawTalkPTTStopPayload(
                 captureId: captureId,
                 transcript: transcript,
                 status: "offline")
+            self.finishPTTOnce(payload)
+            return payload
         }
 
         self.statusText = "Thinking…"
@@ -265,10 +292,87 @@ final class TalkModeManager: NSObject {
         }
         self.resumeContinuousAfterPTT = false
         self.activePTTCaptureId = nil
-        return OpenClawTalkPTTStopPayload(
+        let payload = OpenClawTalkPTTStopPayload(
             captureId: captureId,
             transcript: transcript,
             status: "queued")
+        self.finishPTTOnce(payload)
+        return payload
+    }
+
+    /// Runs a single push-to-talk capture that ends on its own and returns the stop payload.
+    ///
+    /// Cancels a pending one-shot capture first, reports `"busy"` while a manual PTT session
+    /// is active, and otherwise suspends until `finishPTTOnce` delivers a payload.
+    /// - Parameter maxDurationSeconds: Upper bound before the capture is force-stopped.
+    /// - Throws: Whatever `beginPushToTalk()` throws.
+    func runPushToTalkOnce(maxDurationSeconds: TimeInterval = 12) async throws -> OpenClawTalkPTTStopPayload {
+        if self.pttCompletion != nil {
+            _ = await self.cancelPushToTalk()
+        }
+
+        if self.isPushToTalkActive {
+            let captureId = self.activePTTCaptureId ?? UUID().uuidString
+            return OpenClawTalkPTTStopPayload(
+                captureId: captureId,
+                transcript: nil,
+                status: "busy")
+        }
+
+        _ = try await self.beginPushToTalk()
+
+        return await withCheckedContinuation { cont in
+            self.pttCompletion = cont
+            self.pttAutoStopEnabled = true
+            self.startSilenceMonitor()
+            self.schedulePTTTimeout(seconds: maxDurationSeconds)
+        }
+    }
+
+    /// Aborts the current push-to-talk capture and discards its transcript.
+    ///
+    /// Returns status `"idle"` when no capture is active and `"cancelled"` otherwise; calls
+    /// `start()` afterwards when `resumeContinuousAfterPTT` was set for the capture.
+    func cancelPushToTalk() async -> OpenClawTalkPTTStopPayload {
+        let captureId = self.activePTTCaptureId ?? UUID().uuidString
+        guard self.isPushToTalkActive else {
+            let payload = OpenClawTalkPTTStopPayload(
+                captureId: captureId,
+                transcript: nil,
+                status: "idle")
+            self.finishPTTOnce(payload)
+            self.pttAutoStopEnabled = false
+            self.pttTimeoutTask?.cancel()
+            self.pttTimeoutTask = nil
+            self.resumeContinuousAfterPTT = false
+            self.activePTTCaptureId = nil
+            return payload
+        }
+
+        let shouldResume = self.resumeContinuousAfterPTT
+        self.isPushToTalkActive = false
+        self.isListening = false
+        self.captureMode = .idle
+        self.stopRecognition()
+        self.lastTranscript = ""
+        self.lastHeard = nil
+        self.pttAutoStopEnabled = false
+        self.pttTimeoutTask?.cancel()
+        self.pttTimeoutTask = nil
+        self.resumeContinuousAfterPTT = false
+        self.activePTTCaptureId = nil
+        self.statusText = "Ready"
+
+        let payload = OpenClawTalkPTTStopPayload(
+            captureId: captureId,
+            transcript: nil,
+            status: "cancelled")
+        self.finishPTTOnce(payload)
+
+        if shouldResume {
+            await self.start()
+        }
+        return payload
     }
 
     private func startRecognition() throws {
@@ -369,7 +473,7 @@ final class TalkModeManager: NSObject {
         self.silenceTask?.cancel()
         self.silenceTask = Task { [weak self] in
             guard let self else { return }
-            while self.isEnabled {
+            while self.isEnabled || (self.isPushToTalkActive && self.pttAutoStopEnabled) {
                 try? await Task.sleep(nanoseconds: 200_000_000)
                 await self.checkSilence()
             }
@@ -377,13 +481,54 @@ final class TalkModeManager: NSObject {
     }
 
     private func checkSilence() async {
-        guard self.captureMode == .continuous else { return }
-        guard self.isListening, !self.isSpeaking else { return }
+        if self.captureMode == .continuous {
+            guard self.isListening, !self.isSpeaking else { return }
+            let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
+            guard !transcript.isEmpty else { return }
+            guard let lastHeard else { return }
+            if Date().timeIntervalSince(lastHeard) < self.silenceWindow { return }
+            await self.processTranscript(transcript, restartAfter: true)
+            return
+        }
+
+        guard self.captureMode == .pushToTalk, self.pttAutoStopEnabled else { return }
+        guard self.isListening, !self.isSpeaking, self.isPushToTalkActive else { return }
         let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
         guard !transcript.isEmpty else { return }
         guard let lastHeard else { return }
         if Date().timeIntervalSince(lastHeard) < self.silenceWindow { return }
-        await self.processTranscript(transcript, restartAfter: true)
+        _ = await self.endPushToTalk()
+    }
+
+    /// Guardrail for PTT once so we don't stay open indefinitely.
+    /// Non-positive (or NaN) durations schedule nothing.
+    private func schedulePTTTimeout(seconds: TimeInterval) {
+        guard seconds > 0 else { return }
+        // Cap at one day: UInt64(_:) traps if the product is infinite or exceeds UInt64.max.
+        let nanos = UInt64(min(seconds, 86_400) * 1_000_000_000)
+        self.pttTimeoutTask?.cancel()
+        self.pttTimeoutTask = Task { [weak self] in
+            do {
+                try await Task.sleep(nanoseconds: nanos)
+            } catch {
+                // Cancelled before the deadline: the capture already ended, so do not fire.
+                return
+            }
+            await self?.handlePTTTimeout()
+        }
+    }
+
+    /// Ends the capture when the timeout fires while auto-stop is still armed.
+    private func handlePTTTimeout() async {
+        guard self.pttAutoStopEnabled, self.isPushToTalkActive else { return }
+        _ = await self.endPushToTalk()
+    }
+
+    /// Resumes the pending `runPushToTalkOnce` continuation, if any, exactly once.
+    private func finishPTTOnce(_ payload: OpenClawTalkPTTStopPayload) {
+        guard let continuation = self.pttCompletion else { return }
+        self.pttCompletion = nil
+        continuation.resume(returning: payload)
     }
 
     private func processTranscript(_ transcript: String, restartAfter: Bool) async {
@@ -890,5 +1035,15 @@ extension TalkModeManager {
         self.lastTranscript = transcript
         self.lastHeard = Date()
     }
+
+    /// Test hook: moves `lastHeard` to `seconds` before now.
+    func _test_backdateLastHeard(seconds: TimeInterval) {
+        self.lastHeard = Date().addingTimeInterval(-seconds)
+    }
+
+    /// Test hook: runs one `checkSilence()` pass without waiting for the monitor loop.
+    func _test_runSilenceCheck() async {
+        await self.checkSilence()
+    }
 }
 #endif
diff --git a/apps/ios/Tests/GatewayConnectionControllerTests.swift b/apps/ios/Tests/GatewayConnectionControllerTests.swift
index f10edfdfa4..f21468b198 100644
--- a/apps/ios/Tests/GatewayConnectionControllerTests.swift
+++ b/apps/ios/Tests/GatewayConnectionControllerTests.swift
@@ -104,6 +104,8 @@ private func withUserDefaults<T>(_ updates: [String: Any?], _ body: () throws ->
             #expect(commands.contains(OpenClawRemindersCommand.add.rawValue))
             #expect(commands.contains(OpenClawTalkCommand.pttStart.rawValue))
             #expect(commands.contains(OpenClawTalkCommand.pttStop.rawValue))
+            #expect(commands.contains(OpenClawTalkCommand.pttCancel.rawValue))
+            #expect(commands.contains(OpenClawTalkCommand.pttOnce.rawValue))
         }
     }
 
diff --git a/apps/ios/Tests/NodeAppModelInvokeTests.swift b/apps/ios/Tests/NodeAppModelInvokeTests.swift
index 81fb3f4729..1aa665ea4a 100644
--- a/apps/ios/Tests/NodeAppModelInvokeTests.swift
+++ b/apps/ios/Tests/NodeAppModelInvokeTests.swift
@@ -167,6 +167,71 @@ private func makeTestAppModel(
         talkMode: talkMode)
 }
 
+/// Builds a `NodeAppModel` backed by stub services so talk tests only vary `talkMode`.
+/// - Parameter talkMode: The talk manager under test, forwarded to `makeTestAppModel`.
+@MainActor
+private func makeTalkTestAppModel(talkMode: TalkModeManager) -> NodeAppModel {
+    makeTestAppModel(
+        deviceStatusService: TestDeviceStatusService(
+            statusPayload: OpenClawDeviceStatusPayload(
+                battery: OpenClawBatteryStatusPayload(level: 0.5, state: .unplugged, lowPowerModeEnabled: false),
+                thermal: OpenClawThermalStatusPayload(state: .nominal),
+                storage: OpenClawStorageStatusPayload(totalBytes: 10, freeBytes: 5, usedBytes: 5),
+                network: OpenClawNetworkStatusPayload(
+                    status: .satisfied,
+                    isExpensive: false,
+                    isConstrained: false,
+                    interfaces: [.wifi]),
+                uptimeSeconds: 1),
+            infoPayload: OpenClawDeviceInfoPayload(
+                deviceName: "Test",
+                modelIdentifier: "Test1,1",
+                systemName: "iOS",
+                systemVersion: "1.0",
+                appVersion: "dev",
+                appBuild: "0",
+                locale: "en-US")),
+        photosService: TestPhotosService(payload: OpenClawPhotosLatestPayload(photos: [])),
+        contactsService: TestContactsService(
+            searchPayload: OpenClawContactsSearchPayload(contacts: []),
+            addPayload: OpenClawContactsAddPayload(contact: OpenClawContactPayload(
+                identifier: "c0",
+                displayName: "",
+                givenName: "",
+                familyName: "",
+                organizationName: "",
+                phoneNumbers: [],
+                emails: []))),
+        calendarService: TestCalendarService(
+            eventsPayload: OpenClawCalendarEventsPayload(events: []),
+            addPayload: OpenClawCalendarAddPayload(event: OpenClawCalendarEventPayload(
+                identifier: "e0",
+                title: "Test",
+                startISO: "2024-01-01T00:00:00Z",
+                endISO: "2024-01-01T00:10:00Z",
+                isAllDay: false,
+                location: nil,
+                calendarTitle: nil))),
+        remindersService: TestRemindersService(
+            listPayload: OpenClawRemindersListPayload(reminders: []),
+            addPayload: OpenClawRemindersAddPayload(reminder: OpenClawReminderPayload(
+                identifier: "r0",
+                title: "Test",
+                dueISO: nil,
+                completed: false,
+                listName: nil))),
+        motionService: TestMotionService(
+            activityPayload: OpenClawMotionActivityPayload(activities: []),
+            pedometerPayload: OpenClawPedometerPayload(
+                startISO: "2024-01-01T00:00:00Z",
+                endISO: "2024-01-01T01:00:00Z",
+                steps: nil,
+                distanceMeters: nil,
+                floorsAscended: nil,
+                floorsDescended: nil)),
+        talkMode: talkMode)
+}
+
 private func decodePayload<T: Decodable>(_ json: String?, as type: T.Type) throws -> T {
     let data = try #require(json?.data(using: .utf8))
     return try JSONDecoder().decode(type, from: data)
@@ -599,65 +664,7 @@ private func decodePayload<T: Decodable>(_ json: String?, as type: T.Type) throw
     @Test @MainActor func handleInvokePushToTalkReturnsTranscriptStatus() async throws {
         let talkMode = TalkModeManager(allowSimulatorCapture: true)
         talkMode.updateGatewayConnected(false)
-        let appModel = makeTestAppModel(
-            deviceStatusService: TestDeviceStatusService(
-                statusPayload: OpenClawDeviceStatusPayload(
-                    battery: OpenClawBatteryStatusPayload(level: 0.5, state: .unplugged, lowPowerModeEnabled: false),
-                    thermal: OpenClawThermalStatusPayload(state: .nominal),
-                    storage: OpenClawStorageStatusPayload(totalBytes: 10, freeBytes: 5, usedBytes: 5),
-                    network: OpenClawNetworkStatusPayload(
-                        status: .satisfied,
-                        isExpensive: false,
-                        isConstrained: false,
-                        interfaces: [.wifi]),
-                    uptimeSeconds: 1),
-                infoPayload: OpenClawDeviceInfoPayload(
-                    deviceName: "Test",
-                    modelIdentifier: "Test1,1",
-                    systemName: "iOS",
-                    systemVersion: "1.0",
-                    appVersion: "dev",
-                    appBuild: "0",
-                    locale: "en-US")),
-            photosService: TestPhotosService(payload: OpenClawPhotosLatestPayload(photos: [])),
-            contactsService: TestContactsService(
-                searchPayload: OpenClawContactsSearchPayload(contacts: []),
-                addPayload: OpenClawContactsAddPayload(contact: OpenClawContactPayload(
-                    identifier: "c0",
-                    displayName: "",
-                    givenName: "",
-                    familyName: "",
-                    organizationName: "",
-                    phoneNumbers: [],
-                    emails: []))),
-            calendarService: TestCalendarService(
-                eventsPayload: OpenClawCalendarEventsPayload(events: []),
-                addPayload: OpenClawCalendarAddPayload(event: OpenClawCalendarEventPayload(
-                    identifier: "e0",
-                    title: "Test",
-                    startISO: "2024-01-01T00:00:00Z",
-                    endISO: "2024-01-01T00:10:00Z",
-                    isAllDay: false,
-                    location: nil,
-                    calendarTitle: nil))),
-            remindersService: TestRemindersService(
-                listPayload: OpenClawRemindersListPayload(reminders: []),
-                addPayload: OpenClawRemindersAddPayload(reminder: OpenClawReminderPayload(
-                    identifier: "r0",
-                    title: "Test",
-                    dueISO: nil,
-                    completed: false,
-                    listName: nil))),
-            motionService: TestMotionService(
-                activityPayload: OpenClawMotionActivityPayload(activities: []),
-                pedometerPayload: OpenClawPedometerPayload(
-                    startISO: "2024-01-01T00:00:00Z",
-                    endISO: "2024-01-01T01:00:00Z",
-                    steps: nil,
-                    distanceMeters: nil,
-                    floorsAscended: nil,
-                    floorsDescended: nil)),
-            talkMode: talkMode)
+        let appModel = makeTalkTestAppModel(talkMode: talkMode)
 
         let startReq = BridgeInvokeRequest(id: "ptt-start", command: OpenClawTalkCommand.pttStart.rawValue)
         let startRes = await appModel._test_handleInvoke(startReq)
@@ -676,6 +683,48 @@ private func decodePayload<T: Decodable>(_ json: String?, as type: T.Type) throw
         #expect(stopPayload.status == "offline")
     }
 
+    @Test @MainActor func handleInvokePushToTalkCancelStopsSession() async throws {
+        let talkMode = TalkModeManager(allowSimulatorCapture: true)
+        talkMode.updateGatewayConnected(false)
+        let appModel = makeTalkTestAppModel(talkMode: talkMode)
+
+        let startReq = BridgeInvokeRequest(id: "ptt-start", command: OpenClawTalkCommand.pttStart.rawValue)
+        let startRes = await appModel._test_handleInvoke(startReq)
+        #expect(startRes.ok == true)
+        let startPayload = try decodePayload(startRes.payloadJSON, as: OpenClawTalkPTTStartPayload.self)
+
+        let cancelReq = BridgeInvokeRequest(id: "ptt-cancel", command: OpenClawTalkCommand.pttCancel.rawValue)
+        let cancelRes = await appModel._test_handleInvoke(cancelReq)
+        #expect(cancelRes.ok == true)
+        let cancelPayload = try decodePayload(cancelRes.payloadJSON, as: OpenClawTalkPTTStopPayload.self)
+        #expect(cancelPayload.captureId == startPayload.captureId)
+        #expect(cancelPayload.status == "cancelled")
+    }
+
+    @Test @MainActor func handleInvokePushToTalkOnceAutoStopsAfterSilence() async throws {
+        let talkMode = TalkModeManager(allowSimulatorCapture: true)
+        talkMode.updateGatewayConnected(false)
+        let appModel = makeTalkTestAppModel(talkMode: talkMode)
+
+        let onceReq = BridgeInvokeRequest(id: "ptt-once", command: OpenClawTalkCommand.pttOnce.rawValue)
+        let onceTask = Task { await appModel._test_handleInvoke(onceReq) }
+
+        for _ in 0..<5 where !talkMode.isPushToTalkActive {
+            await Task.yield()
+        }
+        #expect(talkMode.isPushToTalkActive == true)
+
+        talkMode._test_seedTranscript("Hello from PTT once")
+        talkMode._test_backdateLastHeard(seconds: 1.0)
+        await talkMode._test_runSilenceCheck()
+
+        let onceRes = await onceTask.value
+        #expect(onceRes.ok == true)
+        let oncePayload = try decodePayload(onceRes.payloadJSON, as: OpenClawTalkPTTStopPayload.self)
+        #expect(oncePayload.transcript == "Hello from PTT once")
+        #expect(oncePayload.status == "offline")
+    }
+
     @Test @MainActor func handleDeepLinkSetsErrorWhenNotConnected() async {
         let appModel = NodeAppModel()
         let url = URL(string: "openclaw://agent?message=hello")!
diff --git a/apps/shared/OpenClawKit/Sources/OpenClawKit/TalkCommands.swift b/apps/shared/OpenClawKit/Sources/OpenClawKit/TalkCommands.swift
index d8646ac76d..755fc97a98 100644
--- a/apps/shared/OpenClawKit/Sources/OpenClawKit/TalkCommands.swift
+++ b/apps/shared/OpenClawKit/Sources/OpenClawKit/TalkCommands.swift
@@ -3,6 +3,8 @@ import Foundation
 public enum OpenClawTalkCommand: String, Codable, Sendable {
     case pttStart = "talk.ptt.start"
     case pttStop = "talk.ptt.stop"
+    case pttCancel = "talk.ptt.cancel"
+    case pttOnce = "talk.ptt.once"
 }
 
 public struct OpenClawTalkPTTStartPayload: Codable, Sendable, Equatable {
diff --git a/src/gateway/node-command-policy.ts b/src/gateway/node-command-policy.ts
index 588a64899b..6361fc3947 100644
--- a/src/gateway/node-command-policy.ts
+++ b/src/gateway/node-command-policy.ts
@@ -34,7 +34,7 @@ const MOTION_COMMANDS = ["motion.activity", "motion.pedometer"];
 
 const SYSTEM_NOTIFY_COMMANDS = ["system.notify"];
 
-const TALK_COMMANDS = ["talk.ptt.start", "talk.ptt.stop"];
+const TALK_COMMANDS = ["talk.ptt.start", "talk.ptt.stop", "talk.ptt.cancel", "talk.ptt.once"];
 
 const SYSTEM_COMMANDS = [
   "system.run",