feat: add camera list and device selection

This commit is contained in:
Peter Steinberger
2026-01-02 18:23:26 +01:00
parent 2b34bf08da
commit 74db53d939
12 changed files with 293 additions and 18 deletions

View File

@@ -79,6 +79,8 @@
- Agent tools: map `camera.snap` JPEG payloads to `image/jpeg` to avoid MIME mismatch errors.
- Tests: cover `camera.snap` MIME mapping to prevent image/png vs image/jpeg mismatches.
- macOS camera: wait for exposure/white balance to settle before capturing a snap to avoid dark images.
- Camera snap: add `delayMs` parameter (default 2000ms on macOS) to improve exposure reliability.
- Camera: add `camera.list` and optional `deviceId` selection for snaps/clips.
- macOS packaging: move rpath config into swift build for reliability (#69) — thanks @petter-b
- macOS: prioritize main bundle for device resources to prevent crash (#73) — thanks @petter-b
- macOS remote: route settings through gateway config and avoid local config reads in remote mode.

View File

@@ -247,6 +247,7 @@ final class BridgeConnectionController {
let caps = Set(self.currentCaps())
if caps.contains(ClawdisCapability.camera.rawValue) {
commands.append(ClawdisCameraCommand.list.rawValue)
commands.append(ClawdisCameraCommand.snap.rawValue)
commands.append(ClawdisCameraCommand.clip.rawValue)
}

View File

@@ -3,6 +3,13 @@ import ClawdisKit
import Foundation
actor CameraController {
struct CameraDeviceInfo: Codable, Sendable {
var id: String
var name: String
var position: String
var deviceType: String
}
enum CameraError: LocalizedError, Sendable {
case cameraUnavailable
case microphoneUnavailable
@@ -41,13 +48,14 @@ actor CameraController {
// If you need the full-res photo, explicitly request a larger maxWidth.
let maxWidth = params.maxWidth.flatMap { $0 > 0 ? $0 : nil } ?? 1600
let quality = Self.clampQuality(params.quality)
let delayMs = max(0, params.delayMs ?? 0)
try await self.ensureAccess(for: .video)
let session = AVCaptureSession()
session.sessionPreset = .photo
guard let device = Self.pickCamera(facing: facing) else {
guard let device = Self.pickCamera(facing: facing, deviceId: params.deviceId) else {
throw CameraError.cameraUnavailable
}
@@ -67,6 +75,7 @@ actor CameraController {
session.startRunning()
defer { session.stopRunning() }
await Self.warmUpCaptureSession()
await Self.sleepDelayMs(delayMs)
let settings: AVCapturePhotoSettings = {
if output.availablePhotoCodecTypes.contains(.jpeg) {
@@ -119,7 +128,7 @@ actor CameraController {
let session = AVCaptureSession()
session.sessionPreset = .high
guard let camera = Self.pickCamera(facing: facing) else {
guard let camera = Self.pickCamera(facing: facing, deviceId: params.deviceId) else {
throw CameraError.cameraUnavailable
}
let cameraInput = try AVCaptureDeviceInput(device: camera)
@@ -180,6 +189,24 @@ actor CameraController {
hasAudio: includeAudio)
}
func listDevices() -> [CameraDeviceInfo] {
let types: [AVCaptureDevice.DeviceType] = [
.builtInWideAngleCamera,
.externalUnknown,
]
let session = AVCaptureDevice.DiscoverySession(
deviceTypes: types,
mediaType: .video,
position: .unspecified)
return session.devices.map { device in
CameraDeviceInfo(
id: device.uniqueID,
name: device.localizedName,
position: Self.positionLabel(device.position),
deviceType: device.deviceType.rawValue)
}
}
private func ensureAccess(for mediaType: AVMediaType) async throws {
let status = AVCaptureDevice.authorizationStatus(for: mediaType)
switch status {
@@ -201,7 +228,15 @@ actor CameraController {
}
}
private nonisolated static func pickCamera(facing: ClawdisCameraFacing) -> AVCaptureDevice? {
private nonisolated static func pickCamera(
facing: ClawdisCameraFacing,
deviceId: String?) -> AVCaptureDevice?
{
if let deviceId, !deviceId.isEmpty {
if let match = AVCaptureDevice.devices(for: .video).first(where: { $0.uniqueID == deviceId }) {
return match
}
}
let position: AVCaptureDevice.Position = (facing == .front) ? .front : .back
if let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: position) {
return device
@@ -210,6 +245,14 @@ actor CameraController {
return AVCaptureDevice.default(for: .video)
}
private nonisolated static func positionLabel(_ position: AVCaptureDevice.Position) -> String {
switch position {
case .front: "front"
case .back: "back"
default: "unspecified"
}
}
nonisolated static func clampQuality(_ quality: Double?) -> Double {
let q = quality ?? 0.9
return min(1.0, max(0.05, q))
@@ -262,6 +305,12 @@ actor CameraController {
// A short delay after `startRunning()` significantly reduces "blank first frame" captures on some devices.
try? await Task.sleep(nanoseconds: 150_000_000) // 150ms
}
private nonisolated static func sleepDelayMs(_ delayMs: Int) async {
guard delayMs > 0 else { return }
let ns = UInt64(min(delayMs, 10_000)) * 1_000_000
try? await Task.sleep(nanoseconds: ns)
}
}
private final class PhotoCaptureDelegate: NSObject, AVCapturePhotoCaptureDelegate {

View File

@@ -589,6 +589,14 @@ final class NodeAppModel {
let resultJSON = try await self.screen.eval(javaScript: js)
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: resultJSON)
case ClawdisCameraCommand.list.rawValue:
let devices = await self.camera.listDevices()
struct Payload: Codable {
var devices: [CameraController.CameraDeviceInfo]
}
let payload = try Self.encodePayload(Payload(devices: devices))
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload)
case ClawdisCameraCommand.snap.rawValue:
self.showCameraHUD(text: "Taking photo…", kind: .photo)
self.triggerCameraFlash()

View File

@@ -333,6 +333,7 @@ struct SettingsTab: View {
let caps = Set(self.currentCaps())
if caps.contains(ClawdisCapability.camera.rawValue) {
commands.append(ClawdisCameraCommand.list.rawValue)
commands.append(ClawdisCameraCommand.snap.rawValue)
commands.append(ClawdisCameraCommand.clip.rawValue)
}

View File

@@ -6,6 +6,13 @@ import Foundation
import OSLog
actor CameraCaptureService {
struct CameraDeviceInfo: Encodable, Sendable {
let id: String
let name: String
let position: String
let deviceType: String
}
enum CameraError: LocalizedError, Sendable {
case cameraUnavailable
case microphoneUnavailable
@@ -31,18 +38,36 @@ actor CameraCaptureService {
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "camera")
func snap(facing: CameraFacing?, maxWidth: Int?, quality: Double?) async throws -> (data: Data, size: CGSize) {
func listDevices() -> [CameraDeviceInfo] {
Self.availableCameras().map { device in
CameraDeviceInfo(
id: device.uniqueID,
name: device.localizedName,
position: Self.positionLabel(device.position),
deviceType: device.deviceType.rawValue)
}
}
func snap(
facing: CameraFacing?,
maxWidth: Int?,
quality: Double?,
deviceId: String?,
delayMs: Int) async throws -> (data: Data, size: CGSize)
{
let facing = facing ?? .front
let normalized = Self.normalizeSnap(maxWidth: maxWidth, quality: quality)
let maxWidth = normalized.maxWidth
let quality = normalized.quality
let delayMs = max(0, delayMs)
let deviceId = deviceId?.trimmingCharacters(in: .whitespacesAndNewlines)
try await self.ensureAccess(for: .video)
let session = AVCaptureSession()
session.sessionPreset = .photo
guard let device = Self.pickCamera(facing: facing) else {
guard let device = Self.pickCamera(facing: facing, deviceId: deviceId) else {
throw CameraError.cameraUnavailable
}
@@ -63,6 +88,7 @@ actor CameraCaptureService {
defer { session.stopRunning() }
await Self.warmUpCaptureSession()
await self.waitForExposureAndWhiteBalance(device: device)
await self.sleepDelayMs(delayMs)
let settings: AVCapturePhotoSettings = {
if output.availablePhotoCodecTypes.contains(.jpeg) {
@@ -95,10 +121,12 @@ actor CameraCaptureService {
facing: CameraFacing?,
durationMs: Int?,
includeAudio: Bool,
deviceId: String?,
outPath: String?) async throws -> (path: String, durationMs: Int, hasAudio: Bool)
{
let facing = facing ?? .front
let durationMs = Self.clampDurationMs(durationMs)
let deviceId = deviceId?.trimmingCharacters(in: .whitespacesAndNewlines)
try await self.ensureAccess(for: .video)
if includeAudio {
@@ -108,7 +136,7 @@ actor CameraCaptureService {
let session = AVCaptureSession()
session.sessionPreset = .high
guard let camera = Self.pickCamera(facing: facing) else {
guard let camera = Self.pickCamera(facing: facing, deviceId: deviceId) else {
throw CameraError.cameraUnavailable
}
let cameraInput = try AVCaptureDeviceInput(device: camera)
@@ -188,7 +216,28 @@ actor CameraCaptureService {
}
}
private nonisolated static func pickCamera(facing: CameraFacing) -> AVCaptureDevice? {
private nonisolated static func availableCameras() -> [AVCaptureDevice] {
let types: [AVCaptureDevice.DeviceType] = [
.builtInWideAngleCamera,
.externalUnknown,
.continuityCamera,
]
let session = AVCaptureDevice.DiscoverySession(
deviceTypes: types,
mediaType: .video,
position: .unspecified)
return session.devices
}
private nonisolated static func pickCamera(
facing: CameraFacing,
deviceId: String?) -> AVCaptureDevice?
{
if let deviceId, !deviceId.isEmpty {
if let match = Self.availableCameras().first(where: { $0.uniqueID == deviceId }) {
return match
}
}
let position: AVCaptureDevice.Position = (facing == .front) ? .front : .back
if let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: position) {
@@ -269,6 +318,20 @@ actor CameraCaptureService {
try? await Task.sleep(nanoseconds: stepNs)
}
}
private func sleepDelayMs(_ delayMs: Int) async {
guard delayMs > 0 else { return }
let ns = UInt64(min(delayMs, 10_000)) * 1_000_000
try? await Task.sleep(nanoseconds: ns)
}
private nonisolated static func positionLabel(_ position: AVCaptureDevice.Position) -> String {
switch position {
case .front: "front"
case .back: "back"
default: "unspecified"
}
}
}
private final class PhotoCaptureDelegate: NSObject, AVCapturePhotoCaptureDelegate {

View File

@@ -135,6 +135,7 @@ final class MacNodeModeCoordinator {
let capsSet = Set(caps)
if capsSet.contains(ClawdisCapability.camera.rawValue) {
commands.append(ClawdisCameraCommand.list.rawValue)
commands.append(ClawdisCameraCommand.snap.rawValue)
commands.append(ClawdisCameraCommand.clip.rawValue)
}

View File

@@ -103,10 +103,13 @@ actor MacNodeRuntime {
}
let params = (try? Self.decodeParams(ClawdisCameraSnapParams.self, from: req.paramsJSON)) ??
ClawdisCameraSnapParams()
let delayMs = min(10_000, max(0, params.delayMs ?? 2000))
let res = try await self.cameraCapture.snap(
facing: CameraFacing(rawValue: params.facing?.rawValue ?? "") ?? .front,
maxWidth: params.maxWidth,
quality: params.quality)
quality: params.quality,
deviceId: params.deviceId,
delayMs: delayMs)
struct SnapPayload: Encodable {
var format: String
var base64: String
@@ -135,6 +138,7 @@ actor MacNodeRuntime {
facing: CameraFacing(rawValue: params.facing?.rawValue ?? "") ?? .front,
durationMs: params.durationMs,
includeAudio: params.includeAudio ?? true,
deviceId: params.deviceId,
outPath: nil)
defer { try? FileManager.default.removeItem(atPath: res.path) }
let data = try Data(contentsOf: URL(fileURLWithPath: res.path))
@@ -151,6 +155,19 @@ actor MacNodeRuntime {
hasAudio: res.hasAudio))
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload)
case ClawdisCameraCommand.list.rawValue:
guard Self.cameraEnabled() else {
return BridgeInvokeResponse(
id: req.id,
ok: false,
error: ClawdisNodeError(
code: .unavailable,
message: "CAMERA_DISABLED: enable Camera in Settings"))
}
let devices = await self.cameraCapture.listDevices()
let payload = try Self.encodePayload(["devices": devices])
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload)
case MacNodeScreenCommand.record.rawValue:
let params = (try? Self.decodeParams(MacNodeScreenRecordParams.self, from: req.paramsJSON)) ??
MacNodeScreenRecordParams()

View File

@@ -1,6 +1,7 @@
import Foundation
public enum ClawdisCameraCommand: String, Codable, Sendable {
case list = "camera.list"
case snap = "camera.snap"
case clip = "camera.clip"
}
@@ -24,17 +25,23 @@ public struct ClawdisCameraSnapParams: Codable, Sendable, Equatable {
public var maxWidth: Int?
public var quality: Double?
public var format: ClawdisCameraImageFormat?
public var deviceId: String?
public var delayMs: Int?
public init(
facing: ClawdisCameraFacing? = nil,
maxWidth: Int? = nil,
quality: Double? = nil,
format: ClawdisCameraImageFormat? = nil)
format: ClawdisCameraImageFormat? = nil,
deviceId: String? = nil,
delayMs: Int? = nil)
{
self.facing = facing
self.maxWidth = maxWidth
self.quality = quality
self.format = format
self.deviceId = deviceId
self.delayMs = delayMs
}
}
@@ -43,16 +50,19 @@ public struct ClawdisCameraClipParams: Codable, Sendable, Equatable {
public var durationMs: Int?
public var includeAudio: Bool?
public var format: ClawdisCameraVideoFormat?
public var deviceId: String?
public init(
facing: ClawdisCameraFacing? = nil,
durationMs: Int? = nil,
includeAudio: Bool? = nil,
format: ClawdisCameraVideoFormat? = nil)
format: ClawdisCameraVideoFormat? = nil,
deviceId: String? = nil)
{
self.facing = facing
self.durationMs = durationMs
self.includeAudio = includeAudio
self.format = format
self.deviceId = deviceId
}
}

View File

@@ -25,12 +25,18 @@ All camera access is gated behind **user-controlled settings**.
### Commands (via Gateway `node.invoke`)
- `camera.list`
- Response payload:
- `devices`: array of `{ id, name, position, deviceType }`
- `camera.snap`
- Params:
- `facing`: `front|back` (default: `front`)
- `maxWidth`: number (optional; default `1600` on the iOS node)
- `quality`: `0..1` (optional; default `0.9`)
- `format`: currently `jpg`
- `delayMs`: number (optional; default `0`)
- `deviceId`: string (optional; from `camera.list`)
- Response payload:
- `format: "jpg"`
- `base64: "<...>"`
@@ -43,6 +49,7 @@ All camera access is gated behind **user-controlled settings**.
- `durationMs`: number (default `3000`, clamped to a max of `60000`)
- `includeAudio`: boolean (default `true`)
- `format`: currently `mp4`
- `deviceId`: string (optional; from `camera.list`)
- Response payload:
- `format: "mp4"`
- `base64: "<...>"`
@@ -112,15 +119,20 @@ Use the main `clawdis` CLI to invoke camera commands on the macOS node.
Examples:
```bash
clawdis nodes camera list --node <id> # list camera ids
clawdis nodes camera snap --node <id> # prints MEDIA:<path>
clawdis nodes camera snap --node <id> --max-width 1280
clawdis nodes camera snap --node <id> --delay-ms 2000
clawdis nodes camera snap --node <id> --device-id <id>
clawdis nodes camera clip --node <id> --duration 10s # prints MEDIA:<path>
clawdis nodes camera clip --node <id> --duration-ms 3000 # prints MEDIA:<path> (legacy flag)
clawdis nodes camera clip --node <id> --device-id <id>
clawdis nodes camera clip --node <id> --no-audio
```
Notes:
- `clawdis nodes camera snap` defaults to `maxWidth=1600` unless overridden.
- On macOS, `camera.snap` waits `delayMs` (default 2000ms) after warm-up/exposure settle before capturing.
- Photo payloads are recompressed to keep base64 under 5 MB.
## Safety + practical limits

View File

@@ -746,6 +746,7 @@ const CanvasToolSchema = Type.Union([
),
maxWidth: Type.Optional(Type.Number()),
quality: Type.Optional(Type.Number()),
delayMs: Type.Optional(Type.Number()),
}),
Type.Object({
action: Type.Literal("a2ui_push"),
@@ -864,6 +865,20 @@ function createCanvasTool(): AnyAgentTool {
Number.isFinite(params.quality)
? params.quality
: undefined;
const delayMs =
typeof params.delayMs === "number" &&
Number.isFinite(params.delayMs)
? params.delayMs
: undefined;
const deviceId =
typeof params.deviceId === "string" && params.deviceId.trim()
? params.deviceId.trim()
: undefined;
const delayMs =
typeof params.delayMs === "number" &&
Number.isFinite(params.delayMs)
? params.delayMs
: undefined;
const raw = (await invoke("canvas.snapshot", {
format,
maxWidth,
@@ -978,6 +993,15 @@ const NodesToolSchema = Type.Union([
),
maxWidth: Type.Optional(Type.Number()),
quality: Type.Optional(Type.Number()),
delayMs: Type.Optional(Type.Number()),
deviceId: Type.Optional(Type.String()),
}),
Type.Object({
action: Type.Literal("camera_list"),
gatewayUrl: Type.Optional(Type.String()),
gatewayToken: Type.Optional(Type.String()),
timeoutMs: Type.Optional(Type.Number()),
node: Type.String(),
}),
Type.Object({
action: Type.Literal("camera_clip"),
@@ -991,6 +1015,7 @@ const NodesToolSchema = Type.Union([
duration: Type.Optional(Type.String()),
durationMs: Type.Optional(Type.Number()),
includeAudio: Type.Optional(Type.Boolean()),
deviceId: Type.Optional(Type.String()),
}),
Type.Object({
action: Type.Literal("screen_record"),
@@ -1127,6 +1152,8 @@ function createNodesTool(): AnyAgentTool {
maxWidth,
quality,
format: "jpg",
delayMs,
deviceId,
},
idempotencyKey: crypto.randomUUID(),
})) as { payload?: unknown };
@@ -1155,6 +1182,21 @@ function createNodesTool(): AnyAgentTool {
const result: AgentToolResult<unknown> = { content, details };
return await sanitizeToolResultImages(result, "nodes:camera_snap");
}
case "camera_list": {
const node = readStringParam(params, "node", { required: true });
const nodeId = await resolveNodeId(gatewayOpts, node);
const raw = (await callGatewayTool("node.invoke", gatewayOpts, {
nodeId,
command: "camera.list",
params: {},
idempotencyKey: crypto.randomUUID(),
})) as { payload?: unknown };
const payload =
raw && typeof raw.payload === "object" && raw.payload !== null
? raw.payload
: {};
return jsonResult(payload);
}
case "camera_clip": {
const node = readStringParam(params, "node", { required: true });
const nodeId = await resolveNodeId(gatewayOpts, node);
@@ -1176,6 +1218,10 @@ function createNodesTool(): AnyAgentTool {
typeof params.includeAudio === "boolean"
? params.includeAudio
: true;
const deviceId =
typeof params.deviceId === "string" && params.deviceId.trim()
? params.deviceId.trim()
: undefined;
const raw = (await callGatewayTool("node.invoke", gatewayOpts, {
nodeId,
command: "camera.clip",
@@ -1184,6 +1230,7 @@ function createNodesTool(): AnyAgentTool {
durationMs,
includeAudio,
format: "mp4",
deviceId,
},
idempotencyKey: crypto.randomUUID(),
})) as { payload?: unknown };

View File

@@ -846,14 +846,69 @@ export function registerNodesCli(program: Command) {
{ timeoutMs: 60_000 },
);
nodesCallOpts(
camera
.command("list")
.description("List available cameras on a node")
.requiredOption("--node <idOrNameOrIp>", "Node id, name, or IP")
.action(async (opts: NodesRpcOpts) => {
try {
const nodeId = await resolveNodeId(opts, String(opts.node ?? ""));
const raw = (await callGatewayCli("node.invoke", opts, {
nodeId,
command: "camera.list",
params: {},
idempotencyKey: randomIdempotencyKey(),
})) as unknown;
const res =
typeof raw === "object" && raw !== null
? (raw as { payload?: unknown })
: {};
const payload =
typeof res.payload === "object" && res.payload !== null
? (res.payload as { devices?: unknown })
: {};
const devices = Array.isArray(payload.devices)
? (payload.devices as Array<Record<string, unknown>>)
: [];
if (opts.json) {
defaultRuntime.log(JSON.stringify({ devices }, null, 2));
return;
}
if (devices.length === 0) {
defaultRuntime.log("No cameras reported.");
return;
}
for (const device of devices) {
const id = typeof device.id === "string" ? device.id : "";
const name =
typeof device.name === "string" ? device.name : "Unknown Camera";
const position =
typeof device.position === "string" ? device.position : "unspecified";
defaultRuntime.log(`${name} (${position})${id ? `${id}` : ""}`);
}
} catch (err) {
defaultRuntime.error(`nodes camera list failed: ${String(err)}`);
defaultRuntime.exit(1);
}
}),
{ timeoutMs: 60_000 },
);
nodesCallOpts(
camera
.command("snap")
.description("Capture a photo from a node camera (prints MEDIA:<path>)")
.requiredOption("--node <idOrNameOrIp>", "Node id, name, or IP")
.option("--facing <front|back|both>", "Camera facing", "both")
.option("--device-id <id>", "Camera device id (from nodes camera list)")
.option("--max-width <px>", "Max width in px (optional)")
.option("--quality <0-1>", "JPEG quality (default 0.9)")
.option("--delay-ms <ms>", "Delay before capture in ms (macOS default 2000)")
.option(
"--invoke-timeout <ms>",
"Node invoke timeout in ms (default 20000)",
@@ -882,6 +937,10 @@ export function registerNodesCli(program: Command) {
const quality = opts.quality
? Number.parseFloat(String(opts.quality))
: undefined;
const delayMs = opts.delayMs
? Number.parseInt(String(opts.delayMs), 10)
: undefined;
const deviceId = opts.deviceId ? String(opts.deviceId).trim() : undefined;
const timeoutMs = opts.invokeTimeout
? Number.parseInt(String(opts.invokeTimeout), 10)
: undefined;
@@ -902,6 +961,8 @@ export function registerNodesCli(program: Command) {
maxWidth: Number.isFinite(maxWidth) ? maxWidth : undefined,
quality: Number.isFinite(quality) ? quality : undefined,
format: "jpg",
delayMs: Number.isFinite(delayMs) ? delayMs : undefined,
deviceId: deviceId || undefined,
},
idempotencyKey: randomIdempotencyKey(),
};
@@ -955,6 +1016,7 @@ export function registerNodesCli(program: Command) {
)
.requiredOption("--node <idOrNameOrIp>", "Node id, name, or IP")
.option("--facing <front|back>", "Camera facing", "front")
.option("--device-id <id>", "Camera device id (from nodes camera list)")
.option(
"--duration <ms|10s|1m>",
"Duration (default 3000ms; supports ms/s/m, e.g. 10s)",
@@ -975,18 +1037,20 @@ export function registerNodesCli(program: Command) {
const timeoutMs = opts.invokeTimeout
? Number.parseInt(String(opts.invokeTimeout), 10)
: undefined;
const deviceId = opts.deviceId ? String(opts.deviceId).trim() : undefined;
const invokeParams: Record<string, unknown> = {
nodeId,
command: "camera.clip",
params: {
facing,
durationMs: Number.isFinite(durationMs) ? durationMs : undefined,
includeAudio,
format: "mp4",
},
idempotencyKey: randomIdempotencyKey(),
};
params: {
facing,
durationMs: Number.isFinite(durationMs) ? durationMs : undefined,
includeAudio,
format: "mp4",
deviceId: deviceId || undefined,
},
idempotencyKey: randomIdempotencyKey(),
};
if (typeof timeoutMs === "number" && Number.isFinite(timeoutMs)) {
invokeParams.timeoutMs = timeoutMs;
}