From ea6bcd03e2c03fbef68bbd42b70c57a3a75ab8fd Mon Sep 17 00:00:00 2001 From: Lachlan Sneff Date: Mon, 23 Mar 2020 10:12:53 -0400 Subject: [PATCH] Rewrite GpuFuture to avoid blocking and to use less space Remove odd patching matching Replace std::sync::Mutex with spin::Mutex in GpuFuture Reduce GpuFuture usage to one explicit allocation instead of two Fix examples to poll the device in the background when using map_read or map_write Remove device.poll from GpuFuture::poll and document future invariants Massively simplify examples Use Arc::clone(...) instead of arc.clone() Switch println to log::info --- Cargo.toml | 1 + examples/capture/main.rs | 17 ++++-- examples/hello-compute/main.rs | 13 +++- src/backend/native_gpu_future.rs | 94 ++++++++++++++++++----------- src/lib.rs | 100 +++++++++++++++++-------------- 5 files changed, 138 insertions(+), 87 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 89d10990bf..47b9bfe105 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,6 +45,7 @@ rev = "08e8d406c175579da5ef18c1abf4d6c00e2a9726" arrayvec = "0.5" smallvec = "1" raw-window-handle = "0.3" +parking_lot = "0.10" [dev-dependencies] cgmath = "0.17" diff --git a/examples/capture/main.rs b/examples/capture/main.rs index 650c50052a..ab4ecd5bcd 100644 --- a/examples/capture/main.rs +++ b/examples/capture/main.rs @@ -5,8 +5,6 @@ use std::fs::File; use std::mem::size_of; async fn run() { - env_logger::init(); - let adapter = wgpu::Adapter::request( &wgpu::RequestAdapterOptions { power_preference: wgpu::PowerPreference::Default, @@ -87,8 +85,16 @@ async fn run() { queue.submit(&[command_buffer]); + // Note that we're not calling `.await` here. + let buffer_future = output_buffer.map_read(0, (size * size) as u64 * size_of::() as u64); + + // Poll the device in a blocking manner so that our future resolves. + // In an actual application, `device.poll(...)` should + // be called in an event loop or on another thread. + device.poll(wgpu::Maintain::Wait); + // Write the buffer as a PNG - if let Ok(mapping) = output_buffer.map_read(0u64, (size * size) as u64 * size_of::() as u64).await { + if let Ok(mapping) = buffer_future.await { let mut png_encoder = png::Encoder::new(File::create("red.png").unwrap(), size, size); png_encoder.set_depth(png::BitDepth::Eight); png_encoder.set_color(png::ColorType::RGBA); @@ -98,11 +104,10 @@ async fn run() { .write_image_data(mapping.as_slice()) .unwrap(); } - - // The device will be polled when it is dropped but we can also poll it explicitly - device.poll(true); } fn main() { + env_logger::init(); + futures::executor::block_on(run()); } diff --git a/examples/hello-compute/main.rs b/examples/hello-compute/main.rs index 7165636302..1f3097cf55 100644 --- a/examples/hello-compute/main.rs +++ b/examples/hello-compute/main.rs @@ -13,6 +13,7 @@ async fn run() { .collect() }; + // To see the output, run `RUST_LOG=info cargo run --example hello-compute`. log::info!("Times: {:?}", execute_gpu(numbers).await); } @@ -98,7 +99,16 @@ async fn execute_gpu(numbers: Vec) -> Vec { encoder.copy_buffer_to_buffer(&storage_buffer, 0, &staging_buffer, 0, size); queue.submit(&[encoder.finish()]); - if let Ok(mapping) = staging_buffer.map_read(0u64, size).await { + + // Note that we're not calling `.await` here. + let buffer_future = staging_buffer.map_read(0, size); + + // Poll the device in a blocking manner so that our future resolves. + // In an actual application, `device.poll(...)` should + // be called in an event loop or on another thread. + device.poll(wgpu::Maintain::Wait); + + if let Ok(mapping) = buffer_future.await { mapping .as_slice() .chunks_exact(4) @@ -111,6 +121,7 @@ async fn execute_gpu(numbers: Vec) -> Vec { fn main() { env_logger::init(); + futures::executor::block_on(run()); } diff --git a/src/backend/native_gpu_future.rs b/src/backend/native_gpu_future.rs index ecf2b4390c..48a6e4d3fe 100644 --- a/src/backend/native_gpu_future.rs +++ b/src/backend/native_gpu_future.rs @@ -1,22 +1,31 @@ use std::future::Future; use std::pin::Pin; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; use std::task::{Context, Poll, Waker}; +use parking_lot::Mutex; +use crate::BufferAddress; -struct GpuFutureInner { - id: wgc::id::DeviceId, - result: Option, - waker: Option, +enum WakerOrResult { + Waker(Waker), + Result(T), } /// A Future that can poll the wgpu::Device pub struct GpuFuture { - inner: Arc>>, + data: Arc>, +} + +pub enum OpaqueData {} + +struct Data { + buffer_id: wgc::id::BufferId, + size: BufferAddress, + waker_or_result: Mutex>>, } /// A completion handle to set the result on a GpuFuture pub struct GpuFutureCompletion { - inner: Arc>>, + data: Arc>, } impl Future for GpuFuture @@ -24,49 +33,62 @@ impl Future for GpuFuture type Output = T; fn poll(self: Pin<&mut Self>, context: &mut Context) -> Poll { - // grab a clone of the Arc - let arc = Arc::clone(&self.get_mut().inner); + let mut waker_or_result = self.into_ref().get_ref().data.waker_or_result.lock(); - // grab the device id and set the waker, but release the lock, so that the native callback can write to it - let device_id = { - let mut inner = arc.lock().unwrap(); - inner.waker.replace(context.waker().clone()); - inner.id - }; - - // polling the device should trigger the callback - wgn::wgpu_device_poll(device_id, true); - - // now take the lock again, and check whether the future is complete - let mut inner = arc.lock().unwrap(); - match inner.result.take() { - Some(value) => Poll::Ready(value), - _ => Poll::Pending, + match waker_or_result.take() { + Some(WakerOrResult::Result(res)) => Poll::Ready(res), + _ => { + *waker_or_result = Some(WakerOrResult::Waker(context.waker().clone())); + Poll::Pending + } } } } impl GpuFutureCompletion { pub fn complete(self, value: T) { - let mut inner = self.inner.lock().unwrap(); - inner.result.replace(value); - if let Some(waker) = &inner.waker { - waker.wake_by_ref(); + let mut waker_or_result = self.data.waker_or_result.lock(); + + match waker_or_result.replace(WakerOrResult::Result(value)) { + Some(WakerOrResult::Waker(waker)) => waker.wake(), + None => {} + Some(WakerOrResult::Result(_)) => { + // Drop before panicking. Not sure if this is necessary, but it makes me feel better. + drop(waker_or_result); + unreachable!() + }, + }; + } + + pub(crate) fn to_raw(self) -> *mut OpaqueData { + Arc::into_raw(self.data) as _ + } + + pub(crate) unsafe fn from_raw(this: *mut OpaqueData) -> Self { + Self { + data: Arc::from_raw(this as _) } } + + pub(crate) fn get_buffer_info(&self) -> (wgc::id::BufferId, BufferAddress) { + (self.data.buffer_id, self.data.size) + } } -pub(crate) fn new_gpu_future(id: wgc::id::DeviceId) -> (GpuFuture, GpuFutureCompletion) { - let inner = Arc::new(Mutex::new(GpuFutureInner { - id, - result: None, - waker: None, - })); +pub(crate) fn new_gpu_future( + buffer_id: wgc::id::BufferId, + size: BufferAddress, +) -> (GpuFuture, GpuFutureCompletion) { + let data = Arc::new(Data { + buffer_id, + size, + waker_or_result: Mutex::new(None), + }); ( GpuFuture { - inner: inner.clone(), + data: Arc::clone(&data), }, - GpuFutureCompletion { inner }, + GpuFutureCompletion { data }, ) } diff --git a/src/lib.rs b/src/lib.rs index c35901bca1..88bcbe7ff3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,6 +12,7 @@ use smallvec::SmallVec; use std::{ ffi::CString, ops::Range, + future::Future, ptr, slice, thread, @@ -55,6 +56,14 @@ pub struct Device { temp: Temp, } +/// This is passed to `Device::poll` to control whether +/// it should block or not. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum Maintain { + Wait, + Poll, +} + /// A handle to a GPU-accessible buffer. #[derive(Debug, PartialEq)] pub struct Buffer { @@ -536,8 +545,11 @@ impl Adapter { impl Device { /// Check for resource cleanups and mapping callbacks. - pub fn poll(&self, force_wait: bool) { - wgn::wgpu_device_poll(self.id, force_wait); + pub fn poll(&self, maintain: Maintain) { + wgn::wgpu_device_poll(self.id, match maintain { + Maintain::Poll => false, + Maintain::Wait => true, + }); } /// Creates a shader module from SPIR-V source code. @@ -889,25 +901,23 @@ impl Drop for BufferWriteMapping { } } -struct BufferMapReadFutureUserData -{ - size: BufferAddress, - completion: native_gpu_future::GpuFutureCompletion>, - buffer_id: wgc::id::BufferId, -} -struct BufferMapWriteFutureUserData -{ - size: BufferAddress, - completion: native_gpu_future::GpuFutureCompletion>, - buffer_id: wgc::id::BufferId, -} impl Buffer { /// Map the buffer for reading. The result is returned in a future. - pub async fn map_read(&self, start: BufferAddress, size: BufferAddress) -> Result + /// + /// For the future to complete, `device.poll(...)` must be called elsewhere in the runtime, possibly integrated + /// into an event loop, run on a separate thread, or continually polled in the same task runtime that this + /// future will be run on. + /// + /// It's expected that wgpu will eventually supply its own event loop infrastructure that will be easy to integrate + /// into other event loops, like winit's. + pub fn map_read(&self, start: BufferAddress, size: BufferAddress) -> impl Future> { - let (future, completion) = native_gpu_future::new_gpu_future(self.device_id); + let (future, completion) = native_gpu_future::new_gpu_future( + self.id, + size, + ); extern "C" fn buffer_map_read_future_wrapper( status: wgc::resource::BufferMapAsyncStatus, @@ -915,39 +925,43 @@ impl Buffer { user_data: *mut u8, ) { - let user_data = - unsafe { Box::from_raw(user_data as *mut BufferMapReadFutureUserData) }; + let completion = unsafe { + native_gpu_future::GpuFutureCompletion::from_raw(user_data as _) + }; + let (buffer_id, size) = completion.get_buffer_info(); + if let wgc::resource::BufferMapAsyncStatus::Success = status { - user_data.completion.complete(Ok(BufferReadMapping { + completion.complete(Ok(BufferReadMapping { data, - size: user_data.size as usize, - buffer_id: user_data.buffer_id, + size: size as usize, + buffer_id, })); } else { - user_data.completion.complete(Err(BufferAsyncErr)); + completion.complete(Err(BufferAsyncErr)); } } - let user_data = Box::new(BufferMapReadFutureUserData { - size, - completion, - buffer_id: self.id, - }); wgn::wgpu_buffer_map_read_async( self.id, start, size, buffer_map_read_future_wrapper, - Box::into_raw(user_data) as *mut u8, + completion.to_raw() as _, ); - future.await + future } /// Map the buffer for writing. The result is returned in a future. - pub async fn map_write(&self, start: BufferAddress, size: BufferAddress) -> Result + /// + /// See the documentation of (map_read)[#method.map_read] for more information about + /// how to run this future. + pub fn map_write(&self, start: BufferAddress, size: BufferAddress) -> impl Future> { - let (future, completion) = native_gpu_future::new_gpu_future(self.device_id); + let (future, completion) = native_gpu_future::new_gpu_future( + self.id, + size, + ); extern "C" fn buffer_map_write_future_wrapper( status: wgc::resource::BufferMapAsyncStatus, @@ -955,33 +969,31 @@ impl Buffer { user_data: *mut u8, ) { - let user_data = - unsafe { Box::from_raw(user_data as *mut BufferMapWriteFutureUserData) }; + let completion = unsafe { + native_gpu_future::GpuFutureCompletion::from_raw(user_data as _) + }; + let (buffer_id, size) = completion.get_buffer_info(); + if let wgc::resource::BufferMapAsyncStatus::Success = status { - user_data.completion.complete(Ok(BufferWriteMapping { + completion.complete(Ok(BufferWriteMapping { data, - size: user_data.size as usize, - buffer_id: user_data.buffer_id, + size: size as usize, + buffer_id, })); } else { - user_data.completion.complete(Err(BufferAsyncErr)); + completion.complete(Err(BufferAsyncErr)); } } - let user_data = Box::new(BufferMapWriteFutureUserData { - size, - completion, - buffer_id: self.id, - }); wgn::wgpu_buffer_map_write_async( self.id, start, size, buffer_map_write_future_wrapper, - Box::into_raw(user_data) as *mut u8, + completion.to_raw() as _, ); - future.await + future } /// Flushes any pending write operations and unmaps the buffer from host memory.