iOS: add push-to-talk node commands

This commit is contained in:
Mariano Belinky
2026-02-01 00:25:44 +01:00
committed by Mariano Belinky
parent a884955cd6
commit 9f101d3a9a
8 changed files with 318 additions and 13 deletions

View File

@@ -353,6 +353,8 @@ final class GatewayConnectionController {
OpenClawCanvasA2UICommand.reset.rawValue,
OpenClawScreenCommand.record.rawValue,
OpenClawSystemCommand.notify.rawValue,
OpenClawTalkCommand.pttStart.rawValue,
OpenClawTalkCommand.pttStop.rawValue,
]
let caps = Set(self.currentCaps())

View File

@@ -63,7 +63,7 @@ final class NodeAppModel {
@ObservationIgnored private var cameraHUDDismissTask: Task<Void, Never>?
private let notificationCenter: NotificationCentering
let voiceWake = VoiceWakeManager()
let talkMode = TalkModeManager()
let talkMode: TalkModeManager
private let locationService: any LocationServicing
private let deviceStatusService: any DeviceStatusServicing
private let photosService: any PhotosServicing
@@ -92,7 +92,8 @@ final class NodeAppModel {
contactsService: any ContactsServicing = ContactsService(),
calendarService: any CalendarServicing = CalendarService(),
remindersService: any RemindersServicing = RemindersService(),
motionService: any MotionServicing = MotionService())
motionService: any MotionServicing = MotionService(),
talkMode: TalkModeManager = TalkModeManager())
{
self.screen = screen
self.camera = camera
@@ -105,6 +106,7 @@ final class NodeAppModel {
self.calendarService = calendarService
self.remindersService = remindersService
self.motionService = motionService
self.talkMode = talkMode
self.voiceWake.configure { [weak self] cmd in
guard let self else { return }
@@ -313,6 +315,7 @@ final class NodeAppModel {
self.gatewayStatusText = "Connected"
self.gatewayServerName = url.host ?? "gateway"
self.gatewayConnected = true
self.talkMode.updateGatewayConnected(true)
}
if let addr = await self.gateway.currentRemoteAddress() {
await MainActor.run {
@@ -329,6 +332,7 @@ final class NodeAppModel {
self.gatewayStatusText = "Disconnected"
self.gatewayRemoteAddress = nil
self.gatewayConnected = false
self.talkMode.updateGatewayConnected(false)
self.showLocalCanvasOnDisconnect()
self.gatewayStatusText = "Disconnected: \(reason)"
}
@@ -356,6 +360,7 @@ final class NodeAppModel {
self.gatewayServerName = nil
self.gatewayRemoteAddress = nil
self.gatewayConnected = false
self.talkMode.updateGatewayConnected(false)
self.showLocalCanvasOnDisconnect()
}
let sleepSeconds = min(8.0, 0.5 * pow(1.7, Double(attempt)))
@@ -369,6 +374,7 @@ final class NodeAppModel {
self.gatewayRemoteAddress = nil
self.connectedGatewayID = nil
self.gatewayConnected = false
self.talkMode.updateGatewayConnected(false)
self.seamColorHex = nil
if !SessionKey.isCanonicalMainSessionKey(self.mainSessionKey) {
self.mainSessionKey = "main"
@@ -390,6 +396,7 @@ final class NodeAppModel {
self.gatewayRemoteAddress = nil
self.connectedGatewayID = nil
self.gatewayConnected = false
self.talkMode.updateGatewayConnected(false)
self.seamColorHex = nil
if !SessionKey.isCanonicalMainSessionKey(self.mainSessionKey) {
self.mainSessionKey = "main"
@@ -627,6 +634,9 @@ final class NodeAppModel {
case OpenClawMotionCommand.activity.rawValue,
OpenClawMotionCommand.pedometer.rawValue:
return try await self.handleMotionInvoke(req)
case OpenClawTalkCommand.pttStart.rawValue,
OpenClawTalkCommand.pttStop.rawValue:
return try await self.handleTalkInvoke(req)
default:
return BridgeInvokeResponse(
id: req.id,
@@ -646,7 +656,8 @@ final class NodeAppModel {
}
private func isBackgroundRestricted(_ command: String) -> Bool {
command.hasPrefix("canvas.") || command.hasPrefix("camera.") || command.hasPrefix("screen.")
command.hasPrefix("canvas.") || command.hasPrefix("camera.") || command.hasPrefix("screen.") ||
command.hasPrefix("talk.")
}
private func handleLocationInvoke(_ req: BridgeInvokeRequest) async throws -> BridgeInvokeResponse {
@@ -1150,6 +1161,24 @@ final class NodeAppModel {
}
}
/// Routes talk-mode bridge invocations (push-to-talk start/stop) to the talk manager.
/// - Parameter req: Incoming bridge request; `req.command` selects the operation.
/// - Returns: A success response carrying the encoded PTT payload, or an
///   invalid-request error response for any other command.
private func handleTalkInvoke(_ req: BridgeInvokeRequest) async throws -> BridgeInvokeResponse {
    switch req.command {
    case OpenClawTalkCommand.pttStart.rawValue:
        let startPayload = try await self.talkMode.beginPushToTalk()
        let encoded = try Self.encodePayload(startPayload)
        return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: encoded)
    case OpenClawTalkCommand.pttStop.rawValue:
        let stopPayload = await self.talkMode.endPushToTalk()
        let encoded = try Self.encodePayload(stopPayload)
        return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: encoded)
    default:
        return BridgeInvokeResponse(
            id: req.id,
            ok: false,
            error: OpenClawNodeError(code: .invalidRequest, message: "INVALID_REQUEST: unknown command"))
    }
}
}
private extension NodeAppModel {

View File

@@ -14,8 +14,21 @@ final class TalkModeManager: NSObject {
var isEnabled: Bool = false
var isListening: Bool = false
var isSpeaking: Bool = false
var isPushToTalkActive: Bool = false
var statusText: String = "Off"
private enum CaptureMode {
case idle
case continuous
case pushToTalk
}
private var captureMode: CaptureMode = .idle
private var resumeContinuousAfterPTT: Bool = false
private var activePTTCaptureId: String?
private let allowSimulatorCapture: Bool
private let audioEngine = AVAudioEngine()
private var inputTapInstalled = false
private var speechRecognizer: SFSpeechRecognizer?
@@ -45,16 +58,26 @@ final class TalkModeManager: NSObject {
var mp3Player: StreamingAudioPlaying = StreamingAudioPlayer.shared
private var gateway: GatewayNodeSession?
private var gatewayConnected = false
private let silenceWindow: TimeInterval = 0.7
private var chatSubscribedSessionKeys = Set<String>()
private let logger = Logger(subsystem: "bot.molt", category: "TalkMode")
/// - Parameter allowSimulatorCapture: When `true`, permission prompts and the
///   simulator guard during recognition startup are bypassed (used by tests).
init(allowSimulatorCapture: Bool = false) {
    self.allowSimulatorCapture = allowSimulatorCapture
    super.init()
}
/// Stores the gateway session used to forward finalized transcripts.
func attachGateway(_ gateway: GatewayNodeSession) {
    self.gateway = gateway
}
/// Records gateway connectivity; push-to-talk stop reports status "offline"
/// when this is false, instead of forwarding the transcript.
func updateGatewayConnected(_ connected: Bool) {
    self.gatewayConnected = connected
}
func updateMainSessionKey(_ sessionKey: String?) {
let trimmed = (sessionKey ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
guard !trimmed.isEmpty else { return }
@@ -75,6 +98,7 @@ final class TalkModeManager: NSObject {
func start() async {
guard self.isEnabled else { return }
guard self.captureMode != .pushToTalk else { return }
if self.isListening { return }
self.logger.info("start")
@@ -97,6 +121,7 @@ final class TalkModeManager: NSObject {
try Self.configureAudioSession()
try self.startRecognition()
self.isListening = true
self.captureMode = .continuous
self.statusText = "Listening"
self.startSilenceMonitor()
await self.subscribeChatIfNeeded(sessionKey: self.mainSessionKey)
@@ -111,6 +136,8 @@ final class TalkModeManager: NSObject {
func stop() {
self.isEnabled = false
self.isListening = false
self.isPushToTalkActive = false
self.captureMode = .idle
self.statusText = "Off"
self.lastTranscript = ""
self.lastHeard = nil
@@ -119,6 +146,8 @@ final class TalkModeManager: NSObject {
self.stopRecognition()
self.stopSpeaking()
self.lastInterruptedAtSeconds = nil
self.resumeContinuousAfterPTT = false
self.activePTTCaptureId = nil
TalkSystemSpeechSynthesizer.shared.stop()
do {
try AVAudioSession.sharedInstance().setActive(false, options: [.notifyOthersOnDeactivation])
@@ -132,11 +161,127 @@ final class TalkModeManager: NSObject {
self.stopSpeaking()
}
/// Starts a push-to-talk capture session, suspending continuous listening if it
/// was active, and returns the capture id for the new (or already active) session.
/// - Throws: `NSError` code 4 (mic permission denied), code 5 (speech permission
///   denied), or any audio-session / recognition startup failure.
func beginPushToTalk() async throws -> OpenClawTalkPTTStartPayload {
    // Idempotent: a second start while PTT is live just echoes the active capture id.
    if self.isPushToTalkActive, let captureId = self.activePTTCaptureId {
        return OpenClawTalkPTTStartPayload(captureId: captureId)
    }
    // Cut off any in-flight speech output without recording an interruption point.
    self.stopSpeaking(storeInterruption: false)
    // Remember whether continuous listening should be restored once PTT ends.
    self.resumeContinuousAfterPTT = self.isEnabled && self.captureMode == .continuous
    self.silenceTask?.cancel()
    self.silenceTask = nil
    self.stopRecognition()
    self.isListening = false
    let captureId = UUID().uuidString
    self.activePTTCaptureId = captureId
    self.lastTranscript = ""
    self.lastHeard = nil
    self.statusText = "Requesting permissions…"
    // Permission prompts are skipped when simulator capture is explicitly allowed
    // (test/simulator configurations).
    if !self.allowSimulatorCapture {
        let micOk = await Self.requestMicrophonePermission()
        guard micOk else {
            self.statusText = "Microphone permission denied"
            throw NSError(domain: "TalkMode", code: 4, userInfo: [
                NSLocalizedDescriptionKey: "Microphone permission denied",
            ])
        }
        let speechOk = await Self.requestSpeechPermission()
        guard speechOk else {
            self.statusText = "Speech recognition permission denied"
            throw NSError(domain: "TalkMode", code: 5, userInfo: [
                NSLocalizedDescriptionKey: "Speech recognition permission denied",
            ])
        }
    }
    do {
        try Self.configureAudioSession()
        // Mark the capture mode before starting recognition.
        self.captureMode = .pushToTalk
        try self.startRecognition()
        self.isListening = true
        self.isPushToTalkActive = true
        self.statusText = "Listening (PTT)"
    } catch {
        // Roll back to idle so a failed start leaves no half-active PTT state.
        self.isListening = false
        self.isPushToTalkActive = false
        self.captureMode = .idle
        self.statusText = "Start failed: \(error.localizedDescription)"
        throw error
    }
    return OpenClawTalkPTTStartPayload(captureId: captureId)
}
/// Ends the active push-to-talk capture, finalizes the transcript, and either
/// queues it for processing, reports it as empty, or reports the gateway as
/// offline. Resumes continuous listening if PTT had interrupted it.
/// - Returns: A stop payload whose `status` is one of "idle" (no PTT active),
///   "empty", "offline", or "queued".
func endPushToTalk() async -> OpenClawTalkPTTStopPayload {
    let captureId = self.activePTTCaptureId ?? UUID().uuidString
    guard self.isPushToTalkActive else {
        return OpenClawTalkPTTStopPayload(
            captureId: captureId,
            transcript: nil,
            status: "idle")
    }
    self.isPushToTalkActive = false
    self.isListening = false
    self.captureMode = .idle
    self.stopRecognition()
    let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
    self.lastTranscript = ""
    self.lastHeard = nil
    // BUGFIX: snapshot the resume flag BEFORE clearing it. The previous code read
    // `self.resumeContinuousAfterPTT` inside the detached Task below, but the
    // synchronous code reset the flag to false before that Task ran, so continuous
    // listening was never resumed after a queued transcript.
    let shouldResumeContinuous = self.resumeContinuousAfterPTT
    self.resumeContinuousAfterPTT = false
    self.activePTTCaptureId = nil
    guard !transcript.isEmpty else {
        self.statusText = "Ready"
        if shouldResumeContinuous {
            await self.start()
        }
        return OpenClawTalkPTTStopPayload(
            captureId: captureId,
            transcript: nil,
            status: "empty")
    }
    guard self.gatewayConnected else {
        self.statusText = "Gateway not connected"
        if shouldResumeContinuous {
            await self.start()
        }
        return OpenClawTalkPTTStopPayload(
            captureId: captureId,
            transcript: transcript,
            status: "offline")
    }
    self.statusText = "Thinking…"
    // Process off the current call so the bridge response returns immediately;
    // the snapshot above guarantees the correct resume decision is carried in.
    Task { @MainActor in
        await self.processTranscript(transcript, restartAfter: shouldResumeContinuous)
    }
    return OpenClawTalkPTTStopPayload(
        captureId: captureId,
        transcript: transcript,
        status: "queued")
}
private func startRecognition() throws {
#if targetEnvironment(simulator)
throw NSError(domain: "TalkMode", code: 2, userInfo: [
NSLocalizedDescriptionKey: "Talk mode is not supported on the iOS simulator",
])
if !self.allowSimulatorCapture {
throw NSError(domain: "TalkMode", code: 2, userInfo: [
NSLocalizedDescriptionKey: "Talk mode is not supported on the iOS simulator",
])
} else {
self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
self.recognitionRequest?.shouldReportPartialResults = true
return
}
#endif
self.stopRecognition()
@@ -232,16 +377,18 @@ final class TalkModeManager: NSObject {
}
private func checkSilence() async {
guard self.captureMode == .continuous else { return }
guard self.isListening, !self.isSpeaking else { return }
let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
guard !transcript.isEmpty else { return }
guard let lastHeard else { return }
if Date().timeIntervalSince(lastHeard) < self.silenceWindow { return }
await self.finalizeTranscript(transcript)
await self.processTranscript(transcript, restartAfter: true)
}
private func finalizeTranscript(_ transcript: String) async {
private func processTranscript(_ transcript: String, restartAfter: Bool) async {
self.isListening = false
self.captureMode = .idle
self.statusText = "Thinking…"
self.lastTranscript = ""
self.lastHeard = nil
@@ -249,10 +396,12 @@ final class TalkModeManager: NSObject {
await self.reloadConfig()
let prompt = self.buildPrompt(transcript: transcript)
guard let gateway else {
guard self.gatewayConnected, let gateway else {
self.statusText = "Gateway not connected"
self.logger.warning("finalize: gateway not connected")
await self.start()
if restartAfter {
await self.start()
}
return
}
@@ -297,7 +446,9 @@ final class TalkModeManager: NSObject {
self.logger.error("finalize failed: \(error.localizedDescription, privacy: .public)")
}
await self.start()
if restartAfter {
await self.start()
}
}
private func subscribeChatIfNeeded(sessionKey: String) async {
@@ -732,3 +883,12 @@ final class TalkModeManager: NSObject {
}
}
}
#if DEBUG
extension TalkModeManager {
    /// Test hook: injects a fake transcript and stamps it as just heard, so
    /// silence/finalize paths can be exercised without live audio capture.
    func _test_seedTranscript(_ transcript: String) {
        self.lastHeard = Date()
        self.lastTranscript = transcript
    }
}
#endif