diff --git a/wgpu-core/src/command/allocator.rs b/wgpu-core/src/command/allocator.rs
index 5a060e1344..1a7983c5ce 100644
--- a/wgpu-core/src/command/allocator.rs
+++ b/wgpu-core/src/command/allocator.rs
@@ -4,14 +4,13 @@
 use super::CommandBuffer;
 use crate::{
-    hub::GfxBackend, id::DeviceId, track::TrackerSet, LifeGuard, PrivateFeatures, Stored,
-    SubmissionIndex,
+    hub::GfxBackend, id::DeviceId, track::TrackerSet, PrivateFeatures, Stored, SubmissionIndex,
 };
 
 use hal::{command::CommandBuffer as _, device::Device as _, pool::CommandPool as _};
 use parking_lot::Mutex;
 
-use std::{collections::HashMap, sync::atomic::Ordering, thread};
+use std::{collections::HashMap, thread};
 
 const GROW_AMOUNT: usize = 20;
 
@@ -20,21 +19,17 @@ struct CommandPool<B: hal::Backend> {
     raw: B::CommandPool,
     total: usize,
     available: Vec<B::CommandBuffer>,
-    pending: Vec<CommandBuffer<B>>,
+    pending: Vec<(B::CommandBuffer, SubmissionIndex)>,
 }
 
 impl<B: hal::Backend> CommandPool<B> {
     fn maintain(&mut self, lowest_active_index: SubmissionIndex) {
         for i in (0..self.pending.len()).rev() {
-            let index = self.pending[i]
-                .life_guard
-                .submission_index
-                .load(Ordering::Acquire);
-            if index < lowest_active_index {
-                let cmd_buf = self.pending.swap_remove(i);
+            if self.pending[i].1 < lowest_active_index {
+                let (cmd_buf, index) = self.pending.swap_remove(i);
                 log::trace!(
                     "recycling comb submitted in {} when {} is lowest active",
                     index,
                     lowest_active_index,
                 );
                 self.recycle(cmd_buf);
@@ -42,13 +37,11 @@ impl<B: hal::Backend> CommandPool<B> {
         }
     }
 
-    fn recycle(&mut self, cmd_buf: CommandBuffer<B>) {
-        for mut raw in cmd_buf.raw {
-            unsafe {
-                raw.reset(false);
-            }
-            self.available.push(raw);
+    fn recycle(&mut self, mut raw: B::CommandBuffer) {
+        unsafe {
+            raw.reset(false);
         }
+        self.available.push(raw);
     }
 
     fn allocate(&mut self) -> B::CommandBuffer {
@@ -113,7 +106,6 @@ impl<B: GfxBackend> CommandAllocator<B> {
             is_recording: true,
             recorded_thread_id: thread_id,
             device_id,
-            life_guard: LifeGuard::new(),
             trackers: TrackerSet::new(B::VARIANT),
             used_swap_chain: None,
             limits,
@@ -138,32 +130,39 @@ impl 
CommandAllocator { } } - pub fn extend(&self, cmd_buf: &CommandBuffer) -> B::CommandBuffer { + fn allocate_for_thread_id(&self, thread_id: thread::ThreadId) -> B::CommandBuffer { let mut inner = self.inner.lock(); - inner - .pools - .get_mut(&cmd_buf.recorded_thread_id) - .unwrap() - .allocate() + inner.pools.get_mut(&thread_id).unwrap().allocate() + } + + pub fn allocate_internal(&self) -> B::CommandBuffer { + self.allocate_for_thread_id(thread::current().id()) + } + + pub fn extend(&self, cmd_buf: &CommandBuffer) -> B::CommandBuffer { + self.allocate_for_thread_id(cmd_buf.recorded_thread_id) } pub fn discard(&self, mut cmd_buf: CommandBuffer) { cmd_buf.trackers.clear(); let mut inner = self.inner.lock(); - inner - .pools - .get_mut(&cmd_buf.recorded_thread_id) - .unwrap() - .recycle(cmd_buf); + let pool = inner.pools.get_mut(&cmd_buf.recorded_thread_id).unwrap(); + for raw in cmd_buf.raw { + pool.recycle(raw); + } } - pub fn after_submit(&self, mut cmd_buf: CommandBuffer, submit_index: SubmissionIndex) { - cmd_buf.trackers.clear(); - cmd_buf - .life_guard - .submission_index - .store(submit_index, Ordering::Release); + pub fn after_submit_internal(&self, raw: B::CommandBuffer, submit_index: SubmissionIndex) { + let mut inner = self.inner.lock(); + inner + .pools + .get_mut(&thread::current().id()) + .unwrap() + .pending + .push((raw, submit_index)); + } + pub fn after_submit(&self, cmd_buf: CommandBuffer, submit_index: SubmissionIndex) { // Record this command buffer as pending let mut inner = self.inner.lock(); inner @@ -171,7 +170,7 @@ impl CommandAllocator { .get_mut(&cmd_buf.recorded_thread_id) .unwrap() .pending - .push(cmd_buf); + .extend(cmd_buf.raw.into_iter().map(|raw| (raw, submit_index))); } pub fn maintain(&self, device: &B::Device, lowest_active_index: SubmissionIndex) { @@ -197,8 +196,8 @@ impl CommandAllocator { pub fn destroy(self, device: &B::Device) { let mut inner = self.inner.lock(); for (_, mut pool) in inner.pools.drain() { - while let 
Some(cmd_buf) = pool.pending.pop() { - pool.recycle(cmd_buf); + while let Some((raw, _)) = pool.pending.pop() { + pool.recycle(raw); } if pool.total != pool.available.len() { log::error!( diff --git a/wgpu-core/src/command/mod.rs b/wgpu-core/src/command/mod.rs index 63a3b61e9e..ea8ed0e5c5 100644 --- a/wgpu-core/src/command/mod.rs +++ b/wgpu-core/src/command/mod.rs @@ -19,7 +19,7 @@ use crate::{ id, resource::{Buffer, Texture}, track::TrackerSet, - LifeGuard, PrivateFeatures, Stored, + PrivateFeatures, Stored, }; use peek_poke::PeekPoke; @@ -157,7 +157,6 @@ pub struct CommandBuffer { is_recording: bool, recorded_thread_id: ThreadId, pub(crate) device_id: Stored, - pub(crate) life_guard: LifeGuard, pub(crate) trackers: TrackerSet, pub(crate) used_swap_chain: Option<(Stored, B::Framebuffer)>, limits: wgt::Limits, diff --git a/wgpu-core/src/device/mod.rs b/wgpu-core/src/device/mod.rs index 5906e57920..e6e41b00e4 100644 --- a/wgpu-core/src/device/mod.rs +++ b/wgpu-core/src/device/mod.rs @@ -15,7 +15,6 @@ use copyless::VecHelper as _; use gfx_descriptor::DescriptorAllocator; use gfx_memory::{Block, Heaps}; use hal::{ - self, command::CommandBuffer as _, device::Device as _, queue::CommandQueue as _, @@ -202,6 +201,7 @@ pub struct Device { pub(crate) private_features: PrivateFeatures, limits: wgt::Limits, extensions: wgt::Extensions, + pending_write_command_buffer: Option, #[cfg(feature = "trace")] pub(crate) trace: Option>, } @@ -273,6 +273,7 @@ impl Device { }, limits: desc.limits.clone(), extensions: desc.extensions.clone(), + pending_write_command_buffer: None, } } @@ -1683,6 +1684,116 @@ impl Global { self.command_encoder_destroy::(command_buffer_id) } + pub fn queue_write_buffer( + &self, + queue_id: id::QueueId, + data: &[u8], + buffer_id: id::BufferId, + buffer_offset: BufferAddress, + ) { + let hub = B::hub(self); + let mut token = Token::root(); + let (mut device_guard, mut token) = hub.devices.write(&mut token); + let device = &mut device_guard[queue_id]; + let 
(buffer_guard, _) = hub.buffers.read(&mut token);
+
+        #[cfg(feature = "trace")]
+        match device.trace {
+            Some(ref trace) => {
+                let mut trace = trace.lock();
+                let data_path = trace.make_binary("bin", data);
+                trace.add(trace::Action::WriteBuffer {
+                    id: buffer_id,
+                    data: data_path,
+                    range: buffer_offset..buffer_offset + data.len() as BufferAddress,
+                });
+            }
+            None => {}
+        }
+
+        let mut trackers = device.trackers.lock();
+        let (dst, transition) = trackers.buffers.use_replace(
+            &*buffer_guard,
+            buffer_id,
+            (),
+            resource::BufferUse::COPY_DST,
+        );
+        assert!(
+            dst.usage.contains(wgt::BufferUsage::COPY_DST),
+            "Write buffer usage {:?} must contain usage flag COPY_DST",
+            dst.usage
+        );
+
+        let submit_index = 1 + device.life_guard.submission_index.load(Ordering::Relaxed);
+        if !dst.life_guard.use_at(submit_index) {
+            device.temp_suspected.buffers.push(buffer_id);
+        }
+
+        let src_raw = unsafe {
+            let mut buf = device
+                .raw
+                .create_buffer(
+                    data.len() as BufferAddress,
+                    hal::buffer::Usage::TRANSFER_SRC,
+                )
+                .unwrap();
+            device.raw.set_buffer_name(&mut buf, "");
+            buf
+        };
+        //TODO: do we need to transition into HOST_WRITE access first? 
+ let requirements = unsafe { device.raw.get_buffer_requirements(&src_raw) }; + + let mut memory = device + .mem_allocator + .lock() + .allocate( + &device.raw, + requirements.type_mask as u32, + gfx_memory::MemoryUsage::Staging { read_back: false }, + gfx_memory::Kind::Linear, + requirements.size, + requirements.alignment, + ) + .unwrap(); + + let mut mapped = memory.map(&device.raw, hal::memory::Segment::ALL).unwrap(); + unsafe { mapped.write(&device.raw, hal::memory::Segment::ALL) } + .unwrap() + .slice + .copy_from_slice(data); + + let mut comb = match device.pending_write_command_buffer.take() { + Some(comb) => comb, + None => { + let mut comb = device.com_allocator.allocate_internal(); + unsafe { + comb.begin_primary(hal::command::CommandBufferFlags::ONE_TIME_SUBMIT); + } + comb + } + }; + let region = hal::command::BufferCopy { + src: 0, + dst: buffer_offset, + size: data.len() as _, + }; + unsafe { + comb.pipeline_barrier( + all_buffer_stages()..all_buffer_stages(), + hal::memory::Dependencies::empty(), + iter::once(hal::memory::Barrier::Buffer { + states: hal::buffer::Access::HOST_WRITE..hal::buffer::Access::TRANSFER_READ, + target: &src_raw, + range: hal::buffer::SubRange::WHOLE, + families: None, + }) + .chain(transition.map(|pending| pending.into_hal(dst))), + ); + comb.copy_buffer(&src_raw, &dst.raw, iter::once(region)); + } + device.pending_write_command_buffer = Some(comb); + } + pub fn queue_submit( &self, queue_id: id::QueueId, @@ -1694,6 +1805,7 @@ impl Global { let mut token = Token::root(); let (mut device_guard, mut token) = hub.devices.write(&mut token); let device = &mut device_guard[queue_id]; + let pending_write_command_buffer = device.pending_write_command_buffer.take(); device.temp_suspected.clear(); let submit_index = 1 + device @@ -1815,9 +1927,11 @@ impl Global { // now prepare the GPU submission let fence = device.raw.create_fence(false).unwrap(); let submission = hal::queue::Submission { - command_buffers: command_buffer_ids - 
.iter() - .flat_map(|&cmb_id| &command_buffer_guard[cmb_id].raw), + command_buffers: pending_write_command_buffer.as_ref().into_iter().chain( + command_buffer_ids + .iter() + .flat_map(|&cmb_id| &command_buffer_guard[cmb_id].raw), + ), wait_semaphores: Vec::new(), signal_semaphores: signal_swapchain_semaphores .into_iter() @@ -1828,6 +1942,11 @@ impl Global { device.queue_group.queues[0].submit(submission, Some(&fence)); } + if let Some(comb_raw) = pending_write_command_buffer { + device + .com_allocator + .after_submit_internal(comb_raw, submit_index); + } (submit_index, fence) };