diff --git a/apps/ios/Sources/Gateway/GatewayConnectionController.swift b/apps/ios/Sources/Gateway/GatewayConnectionController.swift index dd8c6cf8a0..1ae7e7fd70 100644 --- a/apps/ios/Sources/Gateway/GatewayConnectionController.swift +++ b/apps/ios/Sources/Gateway/GatewayConnectionController.swift @@ -353,6 +353,8 @@ final class GatewayConnectionController { OpenClawCanvasA2UICommand.reset.rawValue, OpenClawScreenCommand.record.rawValue, OpenClawSystemCommand.notify.rawValue, + OpenClawTalkCommand.pttStart.rawValue, + OpenClawTalkCommand.pttStop.rawValue, ] let caps = Set(self.currentCaps()) diff --git a/apps/ios/Sources/Model/NodeAppModel.swift b/apps/ios/Sources/Model/NodeAppModel.swift index b76d25b74d..261369bd33 100644 --- a/apps/ios/Sources/Model/NodeAppModel.swift +++ b/apps/ios/Sources/Model/NodeAppModel.swift @@ -63,7 +63,7 @@ final class NodeAppModel { @ObservationIgnored private var cameraHUDDismissTask: Task? private let notificationCenter: NotificationCentering let voiceWake = VoiceWakeManager() - let talkMode = TalkModeManager() + let talkMode: TalkModeManager private let locationService: any LocationServicing private let deviceStatusService: any DeviceStatusServicing private let photosService: any PhotosServicing @@ -92,7 +92,8 @@ final class NodeAppModel { contactsService: any ContactsServicing = ContactsService(), calendarService: any CalendarServicing = CalendarService(), remindersService: any RemindersServicing = RemindersService(), - motionService: any MotionServicing = MotionService()) + motionService: any MotionServicing = MotionService(), + talkMode: TalkModeManager = TalkModeManager()) { self.screen = screen self.camera = camera @@ -105,6 +106,7 @@ final class NodeAppModel { self.calendarService = calendarService self.remindersService = remindersService self.motionService = motionService + self.talkMode = talkMode self.voiceWake.configure { [weak self] cmd in guard let self else { return } @@ -313,6 +315,7 @@ final class NodeAppModel { self.gatewayStatusText = "Connected" self.gatewayServerName = url.host ?? "gateway" self.gatewayConnected = true + self.talkMode.updateGatewayConnected(true) } if let addr = await self.gateway.currentRemoteAddress() { await MainActor.run { @@ -329,6 +332,7 @@ final class NodeAppModel { self.gatewayStatusText = "Disconnected" self.gatewayRemoteAddress = nil self.gatewayConnected = false + self.talkMode.updateGatewayConnected(false) self.showLocalCanvasOnDisconnect() self.gatewayStatusText = "Disconnected: \(reason)" } @@ -356,6 +360,7 @@ final class NodeAppModel { self.gatewayServerName = nil self.gatewayRemoteAddress = nil self.gatewayConnected = false + self.talkMode.updateGatewayConnected(false) self.showLocalCanvasOnDisconnect() } let sleepSeconds = min(8.0, 0.5 * pow(1.7, Double(attempt))) @@ -369,6 +374,7 @@ final class NodeAppModel { self.gatewayRemoteAddress = nil self.connectedGatewayID = nil self.gatewayConnected = false + self.talkMode.updateGatewayConnected(false) self.seamColorHex = nil if !SessionKey.isCanonicalMainSessionKey(self.mainSessionKey) { self.mainSessionKey = "main" @@ -390,6 +396,7 @@ final class NodeAppModel { self.gatewayRemoteAddress = nil self.connectedGatewayID = nil self.gatewayConnected = false + self.talkMode.updateGatewayConnected(false) self.seamColorHex = nil if !SessionKey.isCanonicalMainSessionKey(self.mainSessionKey) { self.mainSessionKey = "main" @@ -627,6 +634,9 @@ final class NodeAppModel { case OpenClawMotionCommand.activity.rawValue, OpenClawMotionCommand.pedometer.rawValue: return try await self.handleMotionInvoke(req) + case OpenClawTalkCommand.pttStart.rawValue, + OpenClawTalkCommand.pttStop.rawValue: + return try await self.handleTalkInvoke(req) default: return BridgeInvokeResponse( id: req.id, @@ -646,7 +656,8 @@ final class NodeAppModel { } private func isBackgroundRestricted(_ command: String) -> Bool { - command.hasPrefix("canvas.") || command.hasPrefix("camera.") || command.hasPrefix("screen.") + command.hasPrefix("canvas.") || command.hasPrefix("camera.") || command.hasPrefix("screen.") || + command.hasPrefix("talk.") } private func handleLocationInvoke(_ req: BridgeInvokeRequest) async throws -> BridgeInvokeResponse { @@ -1150,6 +1161,24 @@ final class NodeAppModel { } } + private func handleTalkInvoke(_ req: BridgeInvokeRequest) async throws -> BridgeInvokeResponse { + switch req.command { + case OpenClawTalkCommand.pttStart.rawValue: + let payload = try await self.talkMode.beginPushToTalk() + let json = try Self.encodePayload(payload) + return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: json) + case OpenClawTalkCommand.pttStop.rawValue: + let payload = await self.talkMode.endPushToTalk() + let json = try Self.encodePayload(payload) + return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: json) + default: + return BridgeInvokeResponse( + id: req.id, + ok: false, + error: OpenClawNodeError(code: .invalidRequest, message: "INVALID_REQUEST: unknown command")) + } + } + } private extension NodeAppModel { diff --git a/apps/ios/Sources/Voice/TalkModeManager.swift b/apps/ios/Sources/Voice/TalkModeManager.swift index 6886666a3e..c5d94f5605 100644 --- a/apps/ios/Sources/Voice/TalkModeManager.swift +++ b/apps/ios/Sources/Voice/TalkModeManager.swift @@ -14,8 +14,21 @@ final class TalkModeManager: NSObject { var isEnabled: Bool = false var isListening: Bool = false var isSpeaking: Bool = false + var isPushToTalkActive: Bool = false var statusText: String = "Off" + private enum CaptureMode { + case idle + case continuous + case pushToTalk + } + + private var captureMode: CaptureMode = .idle + private var resumeContinuousAfterPTT: Bool = false + private var activePTTCaptureId: String? + + private let allowSimulatorCapture: Bool + private let audioEngine = AVAudioEngine() private var inputTapInstalled = false private var speechRecognizer: SFSpeechRecognizer? @@ -45,16 +58,26 @@ final class TalkModeManager: NSObject { var mp3Player: StreamingAudioPlaying = StreamingAudioPlayer.shared private var gateway: GatewayNodeSession? + private var gatewayConnected = false private let silenceWindow: TimeInterval = 0.7 private var chatSubscribedSessionKeys = Set() private let logger = Logger(subsystem: "bot.molt", category: "TalkMode") + init(allowSimulatorCapture: Bool = false) { + self.allowSimulatorCapture = allowSimulatorCapture + super.init() + } + func attachGateway(_ gateway: GatewayNodeSession) { self.gateway = gateway } + func updateGatewayConnected(_ connected: Bool) { + self.gatewayConnected = connected + } + func updateMainSessionKey(_ sessionKey: String?) { let trimmed = (sessionKey ?? "").trimmingCharacters(in: .whitespacesAndNewlines) guard !trimmed.isEmpty else { return } @@ -75,6 +98,7 @@ final class TalkModeManager: NSObject { func start() async { guard self.isEnabled else { return } + guard self.captureMode != .pushToTalk else { return } if self.isListening { return } self.logger.info("start") @@ -97,6 +121,7 @@ final class TalkModeManager: NSObject { try Self.configureAudioSession() try self.startRecognition() self.isListening = true + self.captureMode = .continuous self.statusText = "Listening" self.startSilenceMonitor() await self.subscribeChatIfNeeded(sessionKey: self.mainSessionKey) @@ -111,6 +136,8 @@ final class TalkModeManager: NSObject { func stop() { self.isEnabled = false self.isListening = false + self.isPushToTalkActive = false + self.captureMode = .idle self.statusText = "Off" self.lastTranscript = "" self.lastHeard = nil @@ -119,6 +146,8 @@ final class TalkModeManager: NSObject { self.stopRecognition() self.stopSpeaking() self.lastInterruptedAtSeconds = nil + self.resumeContinuousAfterPTT = false + self.activePTTCaptureId = nil TalkSystemSpeechSynthesizer.shared.stop() do { try AVAudioSession.sharedInstance().setActive(false, options: [.notifyOthersOnDeactivation]) @@ -132,11 +161,127 @@ final class TalkModeManager: NSObject { self.stopSpeaking() } + func beginPushToTalk() async throws -> OpenClawTalkPTTStartPayload { + if self.isPushToTalkActive, let captureId = self.activePTTCaptureId { + return OpenClawTalkPTTStartPayload(captureId: captureId) + } + + self.stopSpeaking(storeInterruption: false) + + self.resumeContinuousAfterPTT = self.isEnabled && self.captureMode == .continuous + self.silenceTask?.cancel() + self.silenceTask = nil + self.stopRecognition() + self.isListening = false + + let captureId = UUID().uuidString + self.activePTTCaptureId = captureId + self.lastTranscript = "" + self.lastHeard = nil + + self.statusText = "Requesting permissions…" + if !self.allowSimulatorCapture { + let micOk = await Self.requestMicrophonePermission() + guard micOk else { + self.statusText = "Microphone permission denied" + throw NSError(domain: "TalkMode", code: 4, userInfo: [ + NSLocalizedDescriptionKey: "Microphone permission denied", + ]) + } + let speechOk = await Self.requestSpeechPermission() + guard speechOk else { + self.statusText = "Speech recognition permission denied" + throw NSError(domain: "TalkMode", code: 5, userInfo: [ + NSLocalizedDescriptionKey: "Speech recognition permission denied", + ]) + } + } + + do { + try Self.configureAudioSession() + self.captureMode = .pushToTalk + try self.startRecognition() + self.isListening = true + self.isPushToTalkActive = true + self.statusText = "Listening (PTT)" + } catch { + self.isListening = false + self.isPushToTalkActive = false + self.captureMode = .idle + self.statusText = "Start failed: \(error.localizedDescription)" + throw error + } + + return OpenClawTalkPTTStartPayload(captureId: captureId) + } + + func endPushToTalk() async -> OpenClawTalkPTTStopPayload { + let captureId = self.activePTTCaptureId ?? UUID().uuidString + guard self.isPushToTalkActive else { + return OpenClawTalkPTTStopPayload( + captureId: captureId, + transcript: nil, + status: "idle") + } + + self.isPushToTalkActive = false + self.isListening = false + self.captureMode = .idle + self.stopRecognition() + + let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines) + self.lastTranscript = "" + self.lastHeard = nil + + guard !transcript.isEmpty else { + self.statusText = "Ready" + if self.resumeContinuousAfterPTT { + await self.start() + } + self.resumeContinuousAfterPTT = false + self.activePTTCaptureId = nil + return OpenClawTalkPTTStopPayload( + captureId: captureId, + transcript: nil, + status: "empty") + } + + guard self.gatewayConnected else { + self.statusText = "Gateway not connected" + if self.resumeContinuousAfterPTT { + await self.start() + } + self.resumeContinuousAfterPTT = false + self.activePTTCaptureId = nil + return OpenClawTalkPTTStopPayload( + captureId: captureId, + transcript: transcript, + status: "offline") + } + + self.statusText = "Thinking…" + Task { @MainActor in + await self.processTranscript(transcript, restartAfter: self.resumeContinuousAfterPTT) + } + self.resumeContinuousAfterPTT = false + self.activePTTCaptureId = nil + return OpenClawTalkPTTStopPayload( + captureId: captureId, + transcript: transcript, + status: "queued") + } + private func startRecognition() throws { #if targetEnvironment(simulator) - throw NSError(domain: "TalkMode", code: 2, userInfo: [ - NSLocalizedDescriptionKey: "Talk mode is not supported on the iOS simulator", - ]) + if !self.allowSimulatorCapture { + throw NSError(domain: "TalkMode", code: 2, userInfo: [ + NSLocalizedDescriptionKey: "Talk mode is not supported on the iOS simulator", + ]) + } else { + self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest() + self.recognitionRequest?.shouldReportPartialResults = true + return + } #endif self.stopRecognition() @@ -232,16 +377,18 @@ final class TalkModeManager: NSObject { } private func checkSilence() async { + guard self.captureMode == .continuous else { return } guard self.isListening, !self.isSpeaking else { return } let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines) guard !transcript.isEmpty else { return } guard let lastHeard else { return } if Date().timeIntervalSince(lastHeard) < self.silenceWindow { return } - await self.finalizeTranscript(transcript) + await self.processTranscript(transcript, restartAfter: true) } - private func finalizeTranscript(_ transcript: String) async { + private func processTranscript(_ transcript: String, restartAfter: Bool) async { self.isListening = false + self.captureMode = .idle self.statusText = "Thinking…" self.lastTranscript = "" self.lastHeard = nil @@ -249,10 +396,12 @@ final class TalkModeManager: NSObject { await self.reloadConfig() let prompt = self.buildPrompt(transcript: transcript) - guard let gateway else { + guard self.gatewayConnected, let gateway else { self.statusText = "Gateway not connected" self.logger.warning("finalize: gateway not connected") - await self.start() + if restartAfter { + await self.start() + } return } @@ -297,7 +446,9 @@ final class TalkModeManager: NSObject { self.logger.error("finalize failed: \(error.localizedDescription, privacy: .public)") } - await self.start() + if restartAfter { + await self.start() + } } private func subscribeChatIfNeeded(sessionKey: String) async { @@ -732,3 +883,12 @@ final class TalkModeManager: NSObject { } } } + +#if DEBUG +extension TalkModeManager { + func _test_seedTranscript(_ transcript: String) { + self.lastTranscript = transcript + self.lastHeard = Date() + } +} +#endif diff --git a/apps/ios/SwiftSources.input.xcfilelist b/apps/ios/SwiftSources.input.xcfilelist index 759da0da3b..cdfb272469 100644 --- a/apps/ios/SwiftSources.input.xcfilelist +++ b/apps/ios/SwiftSources.input.xcfilelist @@ -70,6 +70,7 @@ Sources/Voice/VoiceWakePreferences.swift ../shared/OpenClawKit/Sources/OpenClawKit/ScreenCommands.swift ../shared/OpenClawKit/Sources/OpenClawKit/StoragePaths.swift ../shared/OpenClawKit/Sources/OpenClawKit/SystemCommands.swift +../shared/OpenClawKit/Sources/OpenClawKit/TalkCommands.swift ../shared/OpenClawKit/Sources/OpenClawKit/TalkDirective.swift ../../Swabble/Sources/SwabbleKit/WakeWordGate.swift Sources/Voice/TalkModeManager.swift diff --git a/apps/ios/Tests/GatewayConnectionControllerTests.swift b/apps/ios/Tests/GatewayConnectionControllerTests.swift index 43543b4aa8..f10edfdfa4 100644 --- a/apps/ios/Tests/GatewayConnectionControllerTests.swift +++ b/apps/ios/Tests/GatewayConnectionControllerTests.swift @@ -102,6 +102,8 @@ private func withUserDefaults(_ updates: [String: Any?], _ body: () throws -> #expect(commands.contains(OpenClawContactsCommand.add.rawValue)) #expect(commands.contains(OpenClawCalendarCommand.add.rawValue)) #expect(commands.contains(OpenClawRemindersCommand.add.rawValue)) + #expect(commands.contains(OpenClawTalkCommand.pttStart.rawValue)) + #expect(commands.contains(OpenClawTalkCommand.pttStop.rawValue)) } } diff --git a/apps/ios/Tests/NodeAppModelInvokeTests.swift b/apps/ios/Tests/NodeAppModelInvokeTests.swift index d6c0306c0b..81fb3f4729 100644 --- a/apps/ios/Tests/NodeAppModelInvokeTests.swift +++ b/apps/ios/Tests/NodeAppModelInvokeTests.swift @@ -149,7 +149,8 @@ private func makeTestAppModel( contactsService: ContactsServicing, calendarService: CalendarServicing, remindersService: RemindersServicing, - motionService: MotionServicing) -> NodeAppModel + motionService: MotionServicing, + talkMode: TalkModeManager = TalkModeManager(allowSimulatorCapture: true)) -> NodeAppModel { NodeAppModel( screen: ScreenController(), @@ -162,7 +163,8 @@ private func makeTestAppModel( contactsService: contactsService, calendarService: calendarService, remindersService: remindersService, - motionService: motionService) + motionService: motionService, + talkMode: talkMode) } private func decodePayload(_ json: String?, as type: T.Type) throws -> T { @@ -594,6 +596,86 @@ private func decodePayload(_ json: String?, as type: T.Type) throw #expect(decodedPedometer == pedometerPayload) } + @Test @MainActor func handleInvokePushToTalkReturnsTranscriptStatus() async throws { + let talkMode = TalkModeManager(allowSimulatorCapture: true) + talkMode.updateGatewayConnected(false) + let appModel = makeTestAppModel( + deviceStatusService: TestDeviceStatusService( + statusPayload: OpenClawDeviceStatusPayload( + battery: OpenClawBatteryStatusPayload(level: 0.5, state: .unplugged, lowPowerModeEnabled: false), + thermal: OpenClawThermalStatusPayload(state: .nominal), + storage: OpenClawStorageStatusPayload(totalBytes: 10, freeBytes: 5, usedBytes: 5), + network: OpenClawNetworkStatusPayload( + status: .satisfied, + isExpensive: false, + isConstrained: false, + interfaces: [.wifi]), + uptimeSeconds: 1), + infoPayload: OpenClawDeviceInfoPayload( + deviceName: "Test", + modelIdentifier: "Test1,1", + systemName: "iOS", + systemVersion: "1.0", + appVersion: "dev", + appBuild: "0", + locale: "en-US")), + photosService: TestPhotosService(payload: OpenClawPhotosLatestPayload(photos: [])), + contactsService: TestContactsService( + searchPayload: OpenClawContactsSearchPayload(contacts: []), + addPayload: OpenClawContactsAddPayload(contact: OpenClawContactPayload( + identifier: "c0", + displayName: "", + givenName: "", + familyName: "", + organizationName: "", + phoneNumbers: [], + emails: []))), + calendarService: TestCalendarService( + eventsPayload: OpenClawCalendarEventsPayload(events: []), + addPayload: OpenClawCalendarAddPayload(event: OpenClawCalendarEventPayload( + identifier: "e0", + title: "Test", + startISO: "2024-01-01T00:00:00Z", + endISO: "2024-01-01T00:10:00Z", + isAllDay: false, + location: nil, + calendarTitle: nil))), + remindersService: TestRemindersService( + listPayload: OpenClawRemindersListPayload(reminders: []), + addPayload: OpenClawRemindersAddPayload(reminder: OpenClawReminderPayload( + identifier: "r0", + title: "Test", + dueISO: nil, + completed: false, + listName: nil))), + motionService: TestMotionService( + activityPayload: OpenClawMotionActivityPayload(activities: []), + pedometerPayload: OpenClawPedometerPayload( + startISO: "2024-01-01T00:00:00Z", + endISO: "2024-01-01T01:00:00Z", + steps: nil, + distanceMeters: nil, + floorsAscended: nil, + floorsDescended: nil)), + talkMode: talkMode) + + let startReq = BridgeInvokeRequest(id: "ptt-start", command: OpenClawTalkCommand.pttStart.rawValue) + let startRes = await appModel._test_handleInvoke(startReq) + #expect(startRes.ok == true) + let startPayload = try decodePayload(startRes.payloadJSON, as: OpenClawTalkPTTStartPayload.self) + #expect(!startPayload.captureId.isEmpty) + + talkMode._test_seedTranscript("Hello from PTT") + + let stopReq = BridgeInvokeRequest(id: "ptt-stop", command: OpenClawTalkCommand.pttStop.rawValue) + let stopRes = await appModel._test_handleInvoke(stopReq) + #expect(stopRes.ok == true) + let stopPayload = try decodePayload(stopRes.payloadJSON, as: OpenClawTalkPTTStopPayload.self) + #expect(stopPayload.captureId == startPayload.captureId) + #expect(stopPayload.transcript == "Hello from PTT") + #expect(stopPayload.status == "offline") + } + @Test @MainActor func handleDeepLinkSetsErrorWhenNotConnected() async { let appModel = NodeAppModel() let url = URL(string: "openclaw://agent?message=hello")! diff --git a/apps/shared/OpenClawKit/Sources/OpenClawKit/TalkCommands.swift b/apps/shared/OpenClawKit/Sources/OpenClawKit/TalkCommands.swift new file mode 100644 index 0000000000..d8646ac76d --- /dev/null +++ b/apps/shared/OpenClawKit/Sources/OpenClawKit/TalkCommands.swift @@ -0,0 +1,26 @@ +import Foundation + +public enum OpenClawTalkCommand: String, Codable, Sendable { + case pttStart = "talk.ptt.start" + case pttStop = "talk.ptt.stop" +} + +public struct OpenClawTalkPTTStartPayload: Codable, Sendable, Equatable { + public var captureId: String + + public init(captureId: String) { + self.captureId = captureId + } +} + +public struct OpenClawTalkPTTStopPayload: Codable, Sendable, Equatable { + public var captureId: String + public var transcript: String? + public var status: String + + public init(captureId: String, transcript: String?, status: String) { + self.captureId = captureId + self.transcript = transcript + self.status = status + } +} diff --git a/src/gateway/node-command-policy.ts b/src/gateway/node-command-policy.ts index d3dfe9766e..588a64899b 100644 --- a/src/gateway/node-command-policy.ts +++ b/src/gateway/node-command-policy.ts @@ -34,6 +34,8 @@ const MOTION_COMMANDS = ["motion.activity", "motion.pedometer"]; const SYSTEM_NOTIFY_COMMANDS = ["system.notify"]; +const TALK_COMMANDS = ["talk.ptt.start", "talk.ptt.stop"]; + const SYSTEM_COMMANDS = [ "system.run", "system.which", @@ -56,6 +58,7 @@ const PLATFORM_DEFAULTS: Record = { ...CALENDAR_COMMANDS, ...REMINDERS_COMMANDS, ...MOTION_COMMANDS, + ...TALK_COMMANDS, ], android: [ ...CANVAS_COMMANDS,