mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-25 03:04:29 -04:00
iOS: add push-to-talk node commands
This commit is contained in:
committed by
Mariano Belinky
parent
a884955cd6
commit
9f101d3a9a
@@ -353,6 +353,8 @@ final class GatewayConnectionController {
|
||||
OpenClawCanvasA2UICommand.reset.rawValue,
|
||||
OpenClawScreenCommand.record.rawValue,
|
||||
OpenClawSystemCommand.notify.rawValue,
|
||||
OpenClawTalkCommand.pttStart.rawValue,
|
||||
OpenClawTalkCommand.pttStop.rawValue,
|
||||
]
|
||||
|
||||
let caps = Set(self.currentCaps())
|
||||
|
||||
@@ -63,7 +63,7 @@ final class NodeAppModel {
|
||||
@ObservationIgnored private var cameraHUDDismissTask: Task<Void, Never>?
|
||||
private let notificationCenter: NotificationCentering
|
||||
let voiceWake = VoiceWakeManager()
|
||||
let talkMode = TalkModeManager()
|
||||
let talkMode: TalkModeManager
|
||||
private let locationService: any LocationServicing
|
||||
private let deviceStatusService: any DeviceStatusServicing
|
||||
private let photosService: any PhotosServicing
|
||||
@@ -92,7 +92,8 @@ final class NodeAppModel {
|
||||
contactsService: any ContactsServicing = ContactsService(),
|
||||
calendarService: any CalendarServicing = CalendarService(),
|
||||
remindersService: any RemindersServicing = RemindersService(),
|
||||
motionService: any MotionServicing = MotionService())
|
||||
motionService: any MotionServicing = MotionService(),
|
||||
talkMode: TalkModeManager = TalkModeManager())
|
||||
{
|
||||
self.screen = screen
|
||||
self.camera = camera
|
||||
@@ -105,6 +106,7 @@ final class NodeAppModel {
|
||||
self.calendarService = calendarService
|
||||
self.remindersService = remindersService
|
||||
self.motionService = motionService
|
||||
self.talkMode = talkMode
|
||||
|
||||
self.voiceWake.configure { [weak self] cmd in
|
||||
guard let self else { return }
|
||||
@@ -313,6 +315,7 @@ final class NodeAppModel {
|
||||
self.gatewayStatusText = "Connected"
|
||||
self.gatewayServerName = url.host ?? "gateway"
|
||||
self.gatewayConnected = true
|
||||
self.talkMode.updateGatewayConnected(true)
|
||||
}
|
||||
if let addr = await self.gateway.currentRemoteAddress() {
|
||||
await MainActor.run {
|
||||
@@ -329,6 +332,7 @@ final class NodeAppModel {
|
||||
self.gatewayStatusText = "Disconnected"
|
||||
self.gatewayRemoteAddress = nil
|
||||
self.gatewayConnected = false
|
||||
self.talkMode.updateGatewayConnected(false)
|
||||
self.showLocalCanvasOnDisconnect()
|
||||
self.gatewayStatusText = "Disconnected: \(reason)"
|
||||
}
|
||||
@@ -356,6 +360,7 @@ final class NodeAppModel {
|
||||
self.gatewayServerName = nil
|
||||
self.gatewayRemoteAddress = nil
|
||||
self.gatewayConnected = false
|
||||
self.talkMode.updateGatewayConnected(false)
|
||||
self.showLocalCanvasOnDisconnect()
|
||||
}
|
||||
let sleepSeconds = min(8.0, 0.5 * pow(1.7, Double(attempt)))
|
||||
@@ -369,6 +374,7 @@ final class NodeAppModel {
|
||||
self.gatewayRemoteAddress = nil
|
||||
self.connectedGatewayID = nil
|
||||
self.gatewayConnected = false
|
||||
self.talkMode.updateGatewayConnected(false)
|
||||
self.seamColorHex = nil
|
||||
if !SessionKey.isCanonicalMainSessionKey(self.mainSessionKey) {
|
||||
self.mainSessionKey = "main"
|
||||
@@ -390,6 +396,7 @@ final class NodeAppModel {
|
||||
self.gatewayRemoteAddress = nil
|
||||
self.connectedGatewayID = nil
|
||||
self.gatewayConnected = false
|
||||
self.talkMode.updateGatewayConnected(false)
|
||||
self.seamColorHex = nil
|
||||
if !SessionKey.isCanonicalMainSessionKey(self.mainSessionKey) {
|
||||
self.mainSessionKey = "main"
|
||||
@@ -627,6 +634,9 @@ final class NodeAppModel {
|
||||
case OpenClawMotionCommand.activity.rawValue,
|
||||
OpenClawMotionCommand.pedometer.rawValue:
|
||||
return try await self.handleMotionInvoke(req)
|
||||
case OpenClawTalkCommand.pttStart.rawValue,
|
||||
OpenClawTalkCommand.pttStop.rawValue:
|
||||
return try await self.handleTalkInvoke(req)
|
||||
default:
|
||||
return BridgeInvokeResponse(
|
||||
id: req.id,
|
||||
@@ -646,7 +656,8 @@ final class NodeAppModel {
|
||||
}
|
||||
|
||||
/// Reports whether `command` belongs to a command family that is restricted while the app is
/// in the background.
///
/// A command is restricted when its name starts with one of the `canvas.`, `camera.`,
/// `screen.` or `talk.` prefixes.
///
/// - Parameter command: The raw command name from an invoke request.
/// - Returns: `true` when the command starts with one of the restricted prefixes.
private func isBackgroundRestricted(_ command: String) -> Bool {
    // Single expression so the implicit return applies; the older three-prefix expression
    // that preceded it is dropped because it was fully subsumed by this one.
    command.hasPrefix("canvas.") || command.hasPrefix("camera.") || command.hasPrefix("screen.") ||
        command.hasPrefix("talk.")
}
|
||||
|
||||
private func handleLocationInvoke(_ req: BridgeInvokeRequest) async throws -> BridgeInvokeResponse {
|
||||
@@ -1150,6 +1161,24 @@ final class NodeAppModel {
|
||||
}
|
||||
}
|
||||
|
||||
/// Routes push-to-talk invoke requests to the talk-mode manager.
///
/// - Parameter req: The invoke request; `req.command` selects start or stop.
/// - Returns: An `ok` response carrying the encoded start/stop payload, or an
///   `invalidRequest` error response when the command is neither PTT start nor PTT stop.
/// - Throws: Errors from `beginPushToTalk()` and from `Self.encodePayload(_:)`.
private func handleTalkInvoke(_ req: BridgeInvokeRequest) async throws -> BridgeInvokeResponse {
    let command = req.command

    if command == OpenClawTalkCommand.pttStart.rawValue {
        let started = try await self.talkMode.beginPushToTalk()
        let encoded = try Self.encodePayload(started)
        return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: encoded)
    }

    if command == OpenClawTalkCommand.pttStop.rawValue {
        // Stopping never throws; only the encoding step can fail here.
        let stopped = await self.talkMode.endPushToTalk()
        let encoded = try Self.encodePayload(stopped)
        return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: encoded)
    }

    let unknownCommand = OpenClawNodeError(
        code: .invalidRequest,
        message: "INVALID_REQUEST: unknown command")
    return BridgeInvokeResponse(id: req.id, ok: false, error: unknownCommand)
}
|
||||
|
||||
}
|
||||
|
||||
private extension NodeAppModel {
|
||||
|
||||
@@ -14,8 +14,21 @@ final class TalkModeManager: NSObject {
|
||||
var isEnabled: Bool = false
|
||||
var isListening: Bool = false
|
||||
var isSpeaking: Bool = false
|
||||
var isPushToTalkActive: Bool = false
|
||||
var statusText: String = "Off"
|
||||
|
||||
/// Which capture flow currently owns speech recognition.
///
/// - `idle`: no capture is running (set by `stop()`, `endPushToTalk()` and when a transcript
///   is handed off for processing).
/// - `continuous`: hands-free listening started by `start()`.
/// - `pushToTalk`: a capture started by `beginPushToTalk()`; `start()` refuses to run while
///   this mode is active.
private enum CaptureMode {
    case idle, continuous, pushToTalk
}
|
||||
|
||||
private var captureMode: CaptureMode = .idle
|
||||
private var resumeContinuousAfterPTT: Bool = false
|
||||
private var activePTTCaptureId: String?
|
||||
|
||||
private let allowSimulatorCapture: Bool
|
||||
|
||||
private let audioEngine = AVAudioEngine()
|
||||
private var inputTapInstalled = false
|
||||
private var speechRecognizer: SFSpeechRecognizer?
|
||||
@@ -45,16 +58,26 @@ final class TalkModeManager: NSObject {
|
||||
var mp3Player: StreamingAudioPlaying = StreamingAudioPlayer.shared
|
||||
|
||||
private var gateway: GatewayNodeSession?
|
||||
private var gatewayConnected = false
|
||||
private let silenceWindow: TimeInterval = 0.7
|
||||
|
||||
private var chatSubscribedSessionKeys = Set<String>()
|
||||
|
||||
private let logger = Logger(subsystem: "bot.molt", category: "TalkMode")
|
||||
|
||||
/// Creates a talk-mode manager.
///
/// - Parameter allowSimulatorCapture: When `true`, `beginPushToTalk()` skips the microphone
///   and speech permission requests. Defaults to `false`.
init(allowSimulatorCapture: Bool = false) {
    // The stored property must be assigned before handing off to the superclass initializer.
    self.allowSimulatorCapture = allowSimulatorCapture
    super.init()
}
|
||||
|
||||
/// Stores the gateway session for later use by this manager.
///
/// - Parameter gateway: The node session to keep; replaces any previously attached session.
func attachGateway(_ gateway: GatewayNodeSession) {
    self.gateway = gateway
}
|
||||
|
||||
/// Records whether the gateway connection is currently up.
///
/// `endPushToTalk()` consults this flag and reports a capture as `"offline"` instead of
/// queuing it when the gateway is down.
///
/// - Parameter connected: `true` once the gateway is connected, `false` on disconnect.
func updateGatewayConnected(_ connected: Bool) {
    gatewayConnected = connected
}
|
||||
|
||||
func updateMainSessionKey(_ sessionKey: String?) {
|
||||
let trimmed = (sessionKey ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
guard !trimmed.isEmpty else { return }
|
||||
@@ -75,6 +98,7 @@ final class TalkModeManager: NSObject {
|
||||
|
||||
func start() async {
|
||||
guard self.isEnabled else { return }
|
||||
guard self.captureMode != .pushToTalk else { return }
|
||||
if self.isListening { return }
|
||||
|
||||
self.logger.info("start")
|
||||
@@ -97,6 +121,7 @@ final class TalkModeManager: NSObject {
|
||||
try Self.configureAudioSession()
|
||||
try self.startRecognition()
|
||||
self.isListening = true
|
||||
self.captureMode = .continuous
|
||||
self.statusText = "Listening"
|
||||
self.startSilenceMonitor()
|
||||
await self.subscribeChatIfNeeded(sessionKey: self.mainSessionKey)
|
||||
@@ -111,6 +136,8 @@ final class TalkModeManager: NSObject {
|
||||
func stop() {
|
||||
self.isEnabled = false
|
||||
self.isListening = false
|
||||
self.isPushToTalkActive = false
|
||||
self.captureMode = .idle
|
||||
self.statusText = "Off"
|
||||
self.lastTranscript = ""
|
||||
self.lastHeard = nil
|
||||
@@ -119,6 +146,8 @@ final class TalkModeManager: NSObject {
|
||||
self.stopRecognition()
|
||||
self.stopSpeaking()
|
||||
self.lastInterruptedAtSeconds = nil
|
||||
self.resumeContinuousAfterPTT = false
|
||||
self.activePTTCaptureId = nil
|
||||
TalkSystemSpeechSynthesizer.shared.stop()
|
||||
do {
|
||||
try AVAudioSession.sharedInstance().setActive(false, options: [.notifyOthersOnDeactivation])
|
||||
@@ -132,11 +161,127 @@ final class TalkModeManager: NSObject {
|
||||
self.stopSpeaking()
|
||||
}
|
||||
|
||||
/// Starts a push-to-talk capture and returns its capture identifier.
///
/// If a push-to-talk capture is already active, its existing identifier is returned and
/// nothing is restarted. Otherwise speech output, the silence monitor and any running
/// recognition are stopped, permissions are requested (skipped when `allowSimulatorCapture`
/// is set), and recognition is started in push-to-talk mode.
///
/// On any failure the push-to-talk bookkeeping is rolled back so no stale capture id or
/// resume flag outlives the failed attempt. Continuous listening that was interrupted is
/// not restarted on failure.
///
/// - Returns: Payload carrying the identifier of the active capture.
/// - Throws: An `NSError` in the `TalkMode` domain with code 4 (microphone permission denied)
///   or code 5 (speech recognition permission denied), or any error thrown by
///   `configureAudioSession()` / `startRecognition()`.
func beginPushToTalk() async throws -> OpenClawTalkPTTStartPayload {
    // Repeated start requests while a capture is active return the same capture id.
    if self.isPushToTalkActive, let captureId = self.activePTTCaptureId {
        return OpenClawTalkPTTStartPayload(captureId: captureId)
    }

    self.stopSpeaking(storeInterruption: false)

    // Remember whether continuous listening was running so it can be resumed after PTT ends.
    self.resumeContinuousAfterPTT = self.isEnabled && self.captureMode == .continuous
    self.silenceTask?.cancel()
    self.silenceTask = nil
    self.stopRecognition()
    self.isListening = false

    let captureId = UUID().uuidString
    self.activePTTCaptureId = captureId
    self.lastTranscript = ""
    self.lastHeard = nil

    self.statusText = "Requesting permissions…"
    if !self.allowSimulatorCapture {
        let micOk = await Self.requestMicrophonePermission()
        guard micOk else {
            self.resetAfterFailedPushToTalkStart(status: "Microphone permission denied")
            throw NSError(domain: "TalkMode", code: 4, userInfo: [
                NSLocalizedDescriptionKey: "Microphone permission denied",
            ])
        }
        let speechOk = await Self.requestSpeechPermission()
        guard speechOk else {
            self.resetAfterFailedPushToTalkStart(status: "Speech recognition permission denied")
            throw NSError(domain: "TalkMode", code: 5, userInfo: [
                NSLocalizedDescriptionKey: "Speech recognition permission denied",
            ])
        }
    }

    do {
        try Self.configureAudioSession()
        // The mode is switched before recognition starts, matching the order `start()` guards on.
        self.captureMode = .pushToTalk
        try self.startRecognition()
        self.isListening = true
        self.isPushToTalkActive = true
        self.statusText = "Listening (PTT)"
    } catch {
        self.resetAfterFailedPushToTalkStart(status: "Start failed: \(error.localizedDescription)")
        throw error
    }

    return OpenClawTalkPTTStartPayload(captureId: captureId)
}

/// Rolls back all push-to-talk bookkeeping after a failed start and publishes `status`.
private func resetAfterFailedPushToTalkStart(status: String) {
    self.isListening = false
    self.isPushToTalkActive = false
    // Recognition was already stopped above, so the manager is idle regardless of prior mode.
    self.captureMode = .idle
    self.resumeContinuousAfterPTT = false
    self.activePTTCaptureId = nil
    self.statusText = status
}
|
||||
|
||||
/// Ends the active push-to-talk capture and reports what happened to its transcript.
///
/// The returned payload's `status` is one of:
/// - `"idle"`: no push-to-talk capture was active; nothing was changed.
/// - `"empty"`: the capture produced no text after trimming whitespace.
/// - `"offline"`: text was captured but the gateway is not connected; the transcript is
///   returned but not processed.
/// - `"queued"`: the transcript was handed to `processTranscript` in a separate task.
///
/// If continuous listening was running when the capture began, it is restarted afterwards:
/// directly via `start()` for `"empty"` / `"offline"`, or through `restartAfter` for `"queued"`.
///
/// - Returns: Payload with the capture id, the trimmed transcript (or `nil`), and the status.
func endPushToTalk() async -> OpenClawTalkPTTStopPayload {
    let captureId = self.activePTTCaptureId ?? UUID().uuidString
    guard self.isPushToTalkActive else {
        return OpenClawTalkPTTStopPayload(
            captureId: captureId,
            transcript: nil,
            status: "idle")
    }

    self.isPushToTalkActive = false
    self.isListening = false
    self.captureMode = .idle
    self.stopRecognition()

    let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
    self.lastTranscript = ""
    self.lastHeard = nil

    // Snapshot the resume flag and clear the per-capture state before any suspension point.
    // The spawned task below must see the value as it was for THIS capture; reading the
    // property from inside the task would observe the already-reset `false`.
    let shouldResumeContinuous = self.resumeContinuousAfterPTT
    self.resumeContinuousAfterPTT = false
    self.activePTTCaptureId = nil

    guard !transcript.isEmpty else {
        self.statusText = "Ready"
        if shouldResumeContinuous {
            await self.start()
        }
        return OpenClawTalkPTTStopPayload(
            captureId: captureId,
            transcript: nil,
            status: "empty")
    }

    guard self.gatewayConnected else {
        self.statusText = "Gateway not connected"
        if shouldResumeContinuous {
            await self.start()
        }
        return OpenClawTalkPTTStopPayload(
            captureId: captureId,
            transcript: transcript,
            status: "offline")
    }

    self.statusText = "Thinking…"
    // Processing runs in its own task so the stop response is returned without waiting for it.
    Task { @MainActor in
        await self.processTranscript(transcript, restartAfter: shouldResumeContinuous)
    }
    return OpenClawTalkPTTStopPayload(
        captureId: captureId,
        transcript: transcript,
        status: "queued")
}
|
||||
|
||||
private func startRecognition() throws {
|
||||
#if targetEnvironment(simulator)
|
||||
throw NSError(domain: "TalkMode", code: 2, userInfo: [
|
||||
NSLocalizedDescriptionKey: "Talk mode is not supported on the iOS simulator",
|
||||
])
|
||||
if !self.allowSimulatorCapture {
|
||||
throw NSError(domain: "TalkMode", code: 2, userInfo: [
|
||||
NSLocalizedDescriptionKey: "Talk mode is not supported on the iOS simulator",
|
||||
])
|
||||
} else {
|
||||
self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
|
||||
self.recognitionRequest?.shouldReportPartialResults = true
|
||||
return
|
||||
}
|
||||
#endif
|
||||
|
||||
self.stopRecognition()
|
||||
@@ -232,16 +377,18 @@ final class TalkModeManager: NSObject {
|
||||
}
|
||||
|
||||
/// Hands the current transcript off for processing once the speaker has been silent long enough.
///
/// Does nothing unless all of the following hold: the manager is in continuous capture mode,
/// it is listening and not speaking, the trimmed transcript is non-empty, and at least
/// `silenceWindow` seconds have passed since `lastHeard`.
private func checkSilence() async {
    // Push-to-talk captures are ended explicitly, never by the silence timer.
    guard self.captureMode == .continuous else { return }
    guard self.isListening, !self.isSpeaking else { return }
    let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
    guard !transcript.isEmpty else { return }
    guard let lastHeard else { return }
    if Date().timeIntervalSince(lastHeard) < self.silenceWindow { return }
    // Process exactly once; continuous mode always resumes listening afterwards.
    await self.processTranscript(transcript, restartAfter: true)
}
|
||||
|
||||
private func finalizeTranscript(_ transcript: String) async {
|
||||
private func processTranscript(_ transcript: String, restartAfter: Bool) async {
|
||||
self.isListening = false
|
||||
self.captureMode = .idle
|
||||
self.statusText = "Thinking…"
|
||||
self.lastTranscript = ""
|
||||
self.lastHeard = nil
|
||||
@@ -249,10 +396,12 @@ final class TalkModeManager: NSObject {
|
||||
|
||||
await self.reloadConfig()
|
||||
let prompt = self.buildPrompt(transcript: transcript)
|
||||
guard let gateway else {
|
||||
guard self.gatewayConnected, let gateway else {
|
||||
self.statusText = "Gateway not connected"
|
||||
self.logger.warning("finalize: gateway not connected")
|
||||
await self.start()
|
||||
if restartAfter {
|
||||
await self.start()
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
@@ -297,7 +446,9 @@ final class TalkModeManager: NSObject {
|
||||
self.logger.error("finalize failed: \(error.localizedDescription, privacy: .public)")
|
||||
}
|
||||
|
||||
await self.start()
|
||||
if restartAfter {
|
||||
await self.start()
|
||||
}
|
||||
}
|
||||
|
||||
private func subscribeChatIfNeeded(sessionKey: String) async {
|
||||
@@ -732,3 +883,12 @@ final class TalkModeManager: NSObject {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if DEBUG
extension TalkModeManager {
    /// Debug-only hook that injects a transcript as if it had just been heard.
    ///
    /// - Parameter transcript: Text to store as the last transcript; the last-heard
    ///   timestamp is set to the current time.
    func _test_seedTranscript(_ transcript: String) {
        lastTranscript = transcript
        lastHeard = Date()
    }
}
#endif
|
||||
|
||||
Reference in New Issue
Block a user