From b55cd162bfeccc987b5861ce748585df21882db5 Mon Sep 17 00:00:00 2001
From: Layl <2385329-layl@users.noreply.gitlab.com>
Date: Tue, 31 Dec 2019 22:52:02 +0100
Subject: [PATCH 1/2] Defer command buffer recycling until allocate

---
 wgpu-core/src/command/allocator.rs | 77 ++++++++++++++++--------------
 wgpu-core/src/device.rs            | 14 ++++--
 2 files changed, 51 insertions(+), 40 deletions(-)
diff --git a/wgpu-core/src/command/allocator.rs b/wgpu-core/src/command/allocator.rs
index de8af9a0c5..2ed4de1221 100644
--- a/wgpu-core/src/command/allocator.rs
+++ b/wgpu-core/src/command/allocator.rs
@@ -22,9 +22,37 @@ use std::{collections::HashMap, sync::atomic::Ordering, thread};
 struct CommandPool<B: hal::Backend> {
     raw: B::CommandPool,
     available: Vec<B::CommandBuffer>,
+    pending: Vec<CommandBuffer<B>>,
 }
 
 impl<B: hal::Backend> CommandPool<B> {
+    fn maintain(&mut self, last_done: usize) {
+        for i in (0 .. self.pending.len()).rev() {
+            let index = self.pending[i]
+                .life_guard
+                .submission_index
+                .load(Ordering::Acquire);
+            if index <= last_done {
+                let cmd_buf = self.pending.swap_remove(i);
+                log::trace!(
+                    "recycling comb submitted in {} when {} is done",
+                    index,
+                    last_done
+                );
+                self.recycle(cmd_buf);
+            }
+        }
+    }
+
+    fn recycle(&mut self, cmd_buf: CommandBuffer<B>) {
+        for mut raw in cmd_buf.raw {
+            unsafe {
+                raw.reset(false);
+            }
+            self.available.push(raw);
+        }
+    }
+
     fn allocate(&mut self) -> B::CommandBuffer {
         if self.available.is_empty() {
             let extra = unsafe { self.raw.allocate_vec(20, hal::command::Level::Primary) };
@@ -37,20 +65,9 @@ impl<B: hal::Backend> CommandPool<B> {
 
 #[derive(Debug)]
 struct Inner<B: hal::Backend> {
+    // TODO: Currently pools from threads that are stopped or no longer call into wgpu will never be
+    // cleaned up.
     pools: HashMap<thread::ThreadId, CommandPool<B>>,
-    pending: Vec<CommandBuffer<B>>,
-}
-
-impl<B: hal::Backend> Inner<B> {
-    fn recycle(&mut self, cmd_buf: CommandBuffer<B>) {
-        let pool = self.pools.get_mut(&cmd_buf.recorded_thread_id).unwrap();
-        for mut raw in cmd_buf.raw {
-            unsafe {
-                raw.reset(false);
-            }
-            pool.available.push(raw);
-        }
-    }
 }
 
 #[derive(Debug)]
@@ -65,6 +82,7 @@ impl<B: GfxBackend> CommandAllocator<B> {
         device_id: Stored<DeviceId>,
         device: &B::Device,
         features: Features,
+        last_done: usize,
     ) -> CommandBuffer<B> {
         //debug_assert_eq!(device_id.backend(), B::VARIANT);
         let thread_id = thread::current().id();
@@ -79,7 +97,12 @@ impl<B: GfxBackend> CommandAllocator<B> {
             }
             .unwrap(),
             available: Vec::new(),
+            pending: Vec::new(),
         });
+
+        // Recycle completed command buffers
+        pool.maintain(last_done);
+
         let init = pool.allocate();
 
         CommandBuffer {
@@ -101,7 +124,6 @@ impl<B: hal::Backend> CommandAllocator<B> {
             queue_family,
             inner: Mutex::new(Inner {
                 pools: HashMap::new(),
-                pending: Vec::new(),
             }),
         }
     }
@@ -124,34 +146,19 @@ impl<B: hal::Backend> CommandAllocator<B> {
             .life_guard
             .submission_index
             .store(submit_index, Ordering::Release);
-        self.inner.lock().pending.push(cmd_buf);
-    }
 
-    pub fn maintain(&self, last_done: SubmissionIndex) {
+        // Record this command buffer as pending
         let mut inner = self.inner.lock();
-        for i in (0 .. inner.pending.len()).rev() {
-            let index = inner.pending[i]
-                .life_guard
-                .submission_index
-                .load(Ordering::Acquire);
-            if index <= last_done {
-                let cmd_buf = inner.pending.swap_remove(i);
-                log::trace!(
-                    "recycling comb submitted in {} when {} is done",
-                    index,
-                    last_done
-                );
-                inner.recycle(cmd_buf);
-            }
-        }
+        let pool = inner.pools.get_mut(&cmd_buf.recorded_thread_id).unwrap();
+        pool.pending.push(cmd_buf);
     }
 
     pub fn destroy(self, device: &B::Device) {
         let mut inner = self.inner.lock();
-        while let Some(cmd_buf) = inner.pending.pop() {
-            inner.recycle(cmd_buf);
-        }
         for (_, mut pool) in inner.pools.drain() {
+            while let Some(cmd_buf) = pool.pending.pop() {
+                pool.recycle(cmd_buf);
+            }
             unsafe {
                 pool.raw.free(pool.available);
                 device.destroy_command_pool(pool.raw);
diff --git a/wgpu-core/src/device.rs b/wgpu-core/src/device.rs
index b48fd63354..dc918d679c 100644
--- a/wgpu-core/src/device.rs
+++ b/wgpu-core/src/device.rs
@@ -516,6 +516,9 @@ pub struct Device<B: hal::Backend> {
     pub(crate) framebuffers: Mutex<FastHashMap<FramebufferKey, B::Framebuffer>>,
     pending: Mutex<PendingResources<B>>,
     pub(crate) features: Features,
+    /// The last submission index that is done, used to reclaim command buffers on encoder creation.
+    /// Because of AtomicUsize not having fetch_max stabilized, this has to be a mutex right now.
+    last_done: Mutex<usize>,
 }
 
 impl<B: GfxBackend> Device<B> {
@@ -575,6 +578,7 @@ impl<B: GfxBackend> Device<B> {
                 max_bind_groups,
                 supports_texture_d24_s8,
             },
+            last_done: Mutex::new(0),
         }
     }
 
@@ -596,16 +600,16 @@ impl<B: GfxBackend> Device<B> {
             &self.desc_allocator,
             force_wait,
         );
+        {
+            let mut last_done_guard = self.last_done.lock();
+            *last_done_guard = last_done_guard.max(last_done);
+        }
         let callbacks = pending.handle_mapping(global, &self.raw, token);
 
         unsafe {
             self.desc_allocator.lock().cleanup(&self.raw);
         }
 
-        if last_done != 0 {
-            self.com_allocator.maintain(last_done);
-        }
-
         callbacks
     }
 
@@ -1562,7 +1566,7 @@ impl<F: IdentityFilter<CommandEncoderId>> Global<F> {
         };
         let mut comb = device
             .com_allocator
-            .allocate(dev_stored, &device.raw, device.features);
+            .allocate(dev_stored, &device.raw, device.features, *device.last_done.lock());
         unsafe {
             comb.raw.last_mut().unwrap().begin(
                 hal::command::CommandBufferFlags::ONE_TIME_SUBMIT,

From ac8f75288f331db600a2a4557bf63545cf4c5f12 Mon Sep 17 00:00:00 2001
From: Layl <2385329-layl@users.noreply.gitlab.com>
Date: Thu, 2 Jan 2020 01:24:02 +0100
Subject: [PATCH 2/2] Use current active list instead of storing last done

---
 wgpu-core/src/command/allocator.rs | 10 +++++-----
 wgpu-core/src/device.rs            | 28 ++++++++++------------------
 2 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/wgpu-core/src/command/allocator.rs b/wgpu-core/src/command/allocator.rs
index 2ed4de1221..92ac46d27d 100644
--- a/wgpu-core/src/command/allocator.rs
+++ b/wgpu-core/src/command/allocator.rs
@@ -26,18 +26,18 @@ struct CommandPool<B: hal::Backend> {
 }
 
 impl<B: hal::Backend> CommandPool<B> {
-    fn maintain(&mut self, last_done: usize) {
+    fn maintain(&mut self, lowest_active_index: SubmissionIndex) {
         for i in (0 .. self.pending.len()).rev() {
             let index = self.pending[i]
                 .life_guard
                 .submission_index
                 .load(Ordering::Acquire);
-            if index <= last_done {
+            if index < lowest_active_index {
                 let cmd_buf = self.pending.swap_remove(i);
                 log::trace!(
                     "recycling comb submitted in {} when {} is done",
                     index,
-                    last_done
+                    lowest_active_index,
                 );
                 self.recycle(cmd_buf);
             }
@@ -82,7 +82,7 @@ impl<B: GfxBackend> CommandAllocator<B> {
         device_id: Stored<DeviceId>,
         device: &B::Device,
         features: Features,
-        last_done: usize,
+        lowest_active_index: SubmissionIndex,
     ) -> CommandBuffer<B> {
         //debug_assert_eq!(device_id.backend(), B::VARIANT);
         let thread_id = thread::current().id();
@@ -101,7 +101,7 @@ impl<B: GfxBackend> CommandAllocator<B> {
         });
 
         // Recycle completed command buffers
-        pool.maintain(last_done);
+        pool.maintain(lowest_active_index);
 
         let init = pool.allocate();
 
diff --git a/wgpu-core/src/device.rs b/wgpu-core/src/device.rs
index dc918d679c..062930e60c 100644
--- a/wgpu-core/src/device.rs
+++ b/wgpu-core/src/device.rs
@@ -200,7 +200,7 @@ impl<B: GfxBackend> PendingResources<B> {
         heaps_mutex: &Mutex<Heaps<B>>,
         descriptor_allocator_mutex: &Mutex<DescriptorAllocator<B>>,
         force_wait: bool,
-    ) -> SubmissionIndex {
+    ) {
         if force_wait && !self.active.is_empty() {
             let status = unsafe {
                 device.wait_for_fences(
@@ -219,11 +219,6 @@ impl<B: GfxBackend> PendingResources<B> {
             .iter()
             .position(|a| unsafe { !device.get_fence_status(&a.fence).unwrap() })
             .unwrap_or(self.active.len());
-        let last_done = if done_count != 0 {
-            self.active[done_count - 1].index
-        } else {
-            return 0;
-        };
 
         for a in self.active.drain(.. done_count) {
             log::trace!("Active submission {} is done", a.index);
@@ -260,8 +255,6 @@ impl<B: GfxBackend> PendingResources<B> {
                 },
             }
         }
-
-        last_done
     }
 
     fn triage_referenced<F: AllIdentityFilter>(
@@ -516,9 +509,6 @@ pub struct Device<B: hal::Backend> {
     pub(crate) framebuffers: Mutex<FastHashMap<FramebufferKey, B::Framebuffer>>,
     pending: Mutex<PendingResources<B>>,
     pub(crate) features: Features,
-    /// The last submission index that is done, used to reclaim command buffers on encoder creation.
-    /// Because of AtomicUsize not having fetch_max stabilized, this has to be a mutex right now.
-    last_done: Mutex<usize>,
 }
 
 impl<B: GfxBackend> Device<B> {
@@ -578,7 +568,6 @@ impl<B: GfxBackend> Device<B> {
                 max_bind_groups,
                 supports_texture_d24_s8,
             },
-            last_done: Mutex::new(0),
         }
     }
 
@@ -594,16 +583,12 @@ impl<B: GfxBackend> Device<B> {
         pending.triage_referenced(global, &mut *trackers, token);
         pending.triage_mapped(global, token);
         pending.triage_framebuffers(global, &mut *self.framebuffers.lock(), token);
-        let last_done = pending.cleanup(
+        pending.cleanup(
             &self.raw,
             &self.mem_allocator,
             &self.desc_allocator,
             force_wait,
         );
-        {
-            let mut last_done_guard = self.last_done.lock();
-            *last_done_guard = last_done_guard.max(last_done);
-        }
         let callbacks = pending.handle_mapping(global, &self.raw, token);
 
         unsafe {
@@ -1564,9 +1549,16 @@ impl<F: IdentityFilter<CommandEncoderId>> Global<F> {
             value: device_id,
             ref_count: device.life_guard.ref_count.clone(),
         };
+
+        // The first entry in the active list should have the lowest index
+        let lowest_active_index = device.pending.lock()
+            .active.get(0)
+            .map(|active| active.index)
+            .unwrap_or(0);
+
         let mut comb = device
             .com_allocator
-            .allocate(dev_stored, &device.raw, device.features, *device.last_done.lock());
+            .allocate(dev_stored, &device.raw, device.features, lowest_active_index);
         unsafe {
             comb.raw.last_mut().unwrap().begin(
                 hal::command::CommandBufferFlags::ONE_TIME_SUBMIT,