653: Implement Queue::write_buffer r=nobody a=kvark

Implements https://github.com/gpuweb/gpuweb/pull/749
TODO:
- [x] handle a case where the buffer is dropped while there is a pending write. Edit: we bump the submission index on the buffer, so it will be kept alive.
- [x] properly free the temporary buffer and memory
- [x] properly destroy the pending command buffer on device drop
- [x] tweak the linear allocator settings - bumped to 16 megs
- [x] provide a patch to wgpu-rs and verify it works on the examples - https://github.com/gfx-rs/wgpu-rs/pull/307
- [x] provide a patch to wgpu-native - https://github.com/gfx-rs/wgpu-native/pull/25
- [x] check/fix the trace/replay support

Co-authored-by: Dzmitry Malyshau <kvarkus@gmail.com>
This commit is contained in:
bors[bot]
2020-05-13 13:29:09 +00:00
committed by GitHub
7 changed files with 460 additions and 227 deletions

View File

@@ -402,11 +402,20 @@ impl GlobalExt for wgc::hub::Global<IdentityPassThroughFactory> {
A::DestroyRenderPipeline(id) => {
self.render_pipeline_destroy::<B>(id);
}
A::WriteBuffer { id, data, range } => {
A::WriteBuffer {
id,
data,
range,
queued,
} => {
let bin = std::fs::read(dir.join(data)).unwrap();
let size = (range.end - range.start) as usize;
self.device_wait_for_buffer::<B>(device, id);
self.device_set_buffer_sub_data::<B>(device, id, range.start, &bin[..size]);
if queued {
self.queue_write_buffer::<B>(device, &bin, id, range.start);
} else {
self.device_wait_for_buffer::<B>(device, id);
self.device_set_buffer_sub_data::<B>(device, id, range.start, &bin[..size]);
}
}
A::Submit(_index, commands) => {
let encoder = self.device_create_command_encoder::<B>(
@@ -446,7 +455,7 @@ fn main() {
log::info!("Found {} actions", actions.len());
#[cfg(feature = "winit")]
let mut event_loop = {
let event_loop = {
log::info!("Creating a window");
EventLoop::new()
};
@@ -514,7 +523,6 @@ fn main() {
use winit::{
event::{ElementState, Event, KeyboardInput, VirtualKeyCode, WindowEvent},
event_loop::ControlFlow,
platform::desktop::EventLoopExtDesktop,
};
let mut frame_count = 0;

View File

@@ -4,14 +4,14 @@
use super::CommandBuffer;
use crate::{
hub::GfxBackend, id::DeviceId, track::TrackerSet, LifeGuard, PrivateFeatures, Stored,
hub::GfxBackend, id::DeviceId, track::TrackerSet, FastHashMap, PrivateFeatures, Stored,
SubmissionIndex,
};
use hal::{command::CommandBuffer as _, device::Device as _, pool::CommandPool as _};
use parking_lot::Mutex;
use std::{collections::HashMap, sync::atomic::Ordering, thread};
use std::thread;
const GROW_AMOUNT: usize = 20;
@@ -20,21 +20,17 @@ struct CommandPool<B: hal::Backend> {
raw: B::CommandPool,
total: usize,
available: Vec<B::CommandBuffer>,
pending: Vec<CommandBuffer<B>>,
pending: Vec<(B::CommandBuffer, SubmissionIndex)>,
}
impl<B: hal::Backend> CommandPool<B> {
fn maintain(&mut self, lowest_active_index: SubmissionIndex) {
for i in (0..self.pending.len()).rev() {
let index = self.pending[i]
.life_guard
.submission_index
.load(Ordering::Acquire);
if index < lowest_active_index {
let cmd_buf = self.pending.swap_remove(i);
if self.pending[i].1 < lowest_active_index {
let cmd_buf = self.pending.swap_remove(i).0;
log::trace!(
"recycling comb submitted in {} when {} is lowest active",
index,
self.pending[i].1,
lowest_active_index,
);
self.recycle(cmd_buf);
@@ -42,13 +38,11 @@ impl<B: hal::Backend> CommandPool<B> {
}
}
fn recycle(&mut self, cmd_buf: CommandBuffer<B>) {
for mut raw in cmd_buf.raw {
unsafe {
raw.reset(false);
}
self.available.push(raw);
fn recycle(&mut self, mut raw: B::CommandBuffer) {
unsafe {
raw.reset(false);
}
self.available.push(raw);
}
fn allocate(&mut self) -> B::CommandBuffer {
@@ -68,12 +62,13 @@ impl<B: hal::Backend> CommandPool<B> {
#[derive(Debug)]
struct Inner<B: hal::Backend> {
pools: HashMap<thread::ThreadId, CommandPool<B>>,
pools: FastHashMap<thread::ThreadId, CommandPool<B>>,
}
#[derive(Debug)]
pub struct CommandAllocator<B: hal::Backend> {
queue_family: hal::queue::QueueFamilyId,
internal_thread_id: thread::ThreadId,
inner: Mutex<Inner<B>>,
}
@@ -113,7 +108,6 @@ impl<B: GfxBackend> CommandAllocator<B> {
is_recording: true,
recorded_thread_id: thread_id,
device_id,
life_guard: LifeGuard::new(),
trackers: TrackerSet::new(B::VARIANT),
used_swap_chain: None,
limits,
@@ -129,41 +123,75 @@ impl<B: GfxBackend> CommandAllocator<B> {
}
impl<B: hal::Backend> CommandAllocator<B> {
pub fn new(queue_family: hal::queue::QueueFamilyId) -> Self {
pub fn new(queue_family: hal::queue::QueueFamilyId, device: &B::Device) -> Self {
let internal_thread_id = thread::current().id();
log::info!("Starting on (internal) thread {:?}", internal_thread_id);
let mut pools = FastHashMap::default();
pools.insert(
internal_thread_id,
CommandPool {
raw: unsafe {
device
.create_command_pool(
queue_family,
hal::pool::CommandPoolCreateFlags::RESET_INDIVIDUAL,
)
.unwrap()
},
total: 0,
available: Vec::new(),
pending: Vec::new(),
},
);
CommandAllocator {
queue_family,
inner: Mutex::new(Inner {
pools: HashMap::new(),
}),
internal_thread_id,
inner: Mutex::new(Inner { pools }),
}
}
fn allocate_for_thread_id(&self, thread_id: thread::ThreadId) -> B::CommandBuffer {
let mut inner = self.inner.lock();
inner.pools.get_mut(&thread_id).unwrap().allocate()
}
pub fn allocate_internal(&self) -> B::CommandBuffer {
self.allocate_for_thread_id(self.internal_thread_id)
}
pub fn extend(&self, cmd_buf: &CommandBuffer<B>) -> B::CommandBuffer {
self.allocate_for_thread_id(cmd_buf.recorded_thread_id)
}
pub fn discard_internal(&self, raw: B::CommandBuffer) {
let mut inner = self.inner.lock();
inner
.pools
.get_mut(&cmd_buf.recorded_thread_id)
.get_mut(&self.internal_thread_id)
.unwrap()
.allocate()
.recycle(raw);
}
pub fn discard(&self, mut cmd_buf: CommandBuffer<B>) {
cmd_buf.trackers.clear();
let mut inner = self.inner.lock();
inner
.pools
.get_mut(&cmd_buf.recorded_thread_id)
.unwrap()
.recycle(cmd_buf);
let pool = inner.pools.get_mut(&cmd_buf.recorded_thread_id).unwrap();
for raw in cmd_buf.raw {
pool.recycle(raw);
}
}
pub fn after_submit(&self, mut cmd_buf: CommandBuffer<B>, submit_index: SubmissionIndex) {
cmd_buf.trackers.clear();
cmd_buf
.life_guard
.submission_index
.store(submit_index, Ordering::Release);
pub fn after_submit_internal(&self, raw: B::CommandBuffer, submit_index: SubmissionIndex) {
let mut inner = self.inner.lock();
inner
.pools
.get_mut(&thread::current().id())
.unwrap()
.pending
.push((raw, submit_index));
}
pub fn after_submit(&self, cmd_buf: CommandBuffer<B>, submit_index: SubmissionIndex) {
// Record this command buffer as pending
let mut inner = self.inner.lock();
inner
@@ -171,7 +199,7 @@ impl<B: hal::Backend> CommandAllocator<B> {
.get_mut(&cmd_buf.recorded_thread_id)
.unwrap()
.pending
.push(cmd_buf);
.extend(cmd_buf.raw.into_iter().map(|raw| (raw, submit_index)));
}
pub fn maintain(&self, device: &B::Device, lowest_active_index: SubmissionIndex) {
@@ -197,8 +225,8 @@ impl<B: hal::Backend> CommandAllocator<B> {
pub fn destroy(self, device: &B::Device) {
let mut inner = self.inner.lock();
for (_, mut pool) in inner.pools.drain() {
while let Some(cmd_buf) = pool.pending.pop() {
pool.recycle(cmd_buf);
while let Some((raw, _)) = pool.pending.pop() {
pool.recycle(raw);
}
if pool.total != pool.available.len() {
log::error!(

View File

@@ -19,7 +19,7 @@ use crate::{
id,
resource::{Buffer, Texture},
track::TrackerSet,
LifeGuard, PrivateFeatures, Stored,
PrivateFeatures, Stored,
};
use peek_poke::PeekPoke;
@@ -157,7 +157,6 @@ pub struct CommandBuffer<B: hal::Backend> {
is_recording: bool,
recorded_thread_id: ThreadId,
pub(crate) device_id: Stored<id::DeviceId>,
pub(crate) life_guard: LifeGuard,
pub(crate) trackers: TrackerSet,
pub(crate) used_swap_chain: Option<(Stored<id::SwapChainId>, B::Framebuffer)>,
limits: wgt::Limits,

View File

@@ -212,12 +212,15 @@ impl<B: hal::Backend> LifetimeTracker<B> {
index: SubmissionIndex,
fence: B::Fence,
new_suspects: &SuspectedResources,
temp_buffers: impl Iterator<Item = (B::Buffer, MemoryBlock<B>)>,
) {
let mut last_resources = NonReferencedResources::new();
last_resources.buffers.extend(temp_buffers);
self.suspected_resources.extend(new_suspects);
self.active.alloc().init(ActiveSubmission {
index,
fence,
last_resources: NonReferencedResources::new(),
last_resources,
mapped: Vec::new(),
});
}

View File

@@ -15,14 +15,11 @@ use copyless::VecHelper as _;
use gfx_descriptor::DescriptorAllocator;
use gfx_memory::{Block, Heaps};
use hal::{
self,
command::CommandBuffer as _,
device::Device as _,
queue::CommandQueue as _,
window::{PresentationSurface as _, Surface as _},
};
use parking_lot::{Mutex, MutexGuard};
use smallvec::SmallVec;
use wgt::{BufferAddress, InputStepMode, TextureDimension, TextureFormat, BIND_BUFFER_ALIGNMENT};
use std::{
@@ -33,8 +30,10 @@ use std::{
use spirv_headers::ExecutionModel;
mod life;
mod queue;
#[cfg(any(feature = "trace", feature = "replay"))]
pub mod trace;
#[cfg(feature = "trace")]
use trace::{Action, Trace};
@@ -202,6 +201,7 @@ pub struct Device<B: hal::Backend> {
pub(crate) private_features: PrivateFeatures,
limits: wgt::Limits,
extensions: wgt::Extensions,
pending_writes: queue::PendingWrites<B>,
#[cfg(feature = "trace")]
pub(crate) trace: Option<Mutex<Trace>>,
}
@@ -221,6 +221,7 @@ impl<B: GfxBackend> Device<B> {
let life_guard = LifeGuard::new();
life_guard.submission_index.fetch_add(1, Ordering::Relaxed);
let com_allocator = command::CommandAllocator::new(queue_group.family, &raw);
let heaps = unsafe {
Heaps::new(
&mem_props,
@@ -230,7 +231,7 @@ impl<B: GfxBackend> Device<B> {
min_device_allocation: 0x1_0000,
},
gfx_memory::LinearConfig {
linear_size: 0x10_0000,
linear_size: 0x100_0000,
},
non_coherent_atom_size,
)
@@ -244,7 +245,7 @@ impl<B: GfxBackend> Device<B> {
Device {
raw,
adapter_id,
com_allocator: command::CommandAllocator::new(queue_group.family),
com_allocator,
mem_allocator: Mutex::new(heaps),
desc_allocator: Mutex::new(DescriptorAllocator::new()),
queue_group,
@@ -273,14 +274,22 @@ impl<B: GfxBackend> Device<B> {
},
limits: desc.limits.clone(),
extensions: desc.extensions.clone(),
pending_writes: queue::PendingWrites::new(),
}
}
fn lock_life_internal<'this, 'token: 'this>(
tracker: &'this Mutex<life::LifetimeTracker<B>>,
_token: &mut Token<'token, Self>,
) -> MutexGuard<'this, life::LifetimeTracker<B>> {
tracker.lock()
}
fn lock_life<'this, 'token: 'this>(
&'this self,
_token: &mut Token<'token, Self>,
token: &mut Token<'token, Self>,
) -> MutexGuard<'this, life::LifetimeTracker<B>> {
self.life_tracker.lock()
Self::lock_life_internal(&self.life_tracker, token)
}
fn maintain<'this, 'token: 'this, G: GlobalIdentityHandlerFactory>(
@@ -510,9 +519,11 @@ impl<B: hal::Backend> Device<B> {
}
pub(crate) fn dispose(self) {
self.com_allocator.destroy(&self.raw);
let mut desc_alloc = self.desc_allocator.into_inner();
let mut mem_alloc = self.mem_allocator.into_inner();
self.pending_writes
.dispose(&self.raw, &self.com_allocator, &mut mem_alloc);
self.com_allocator.destroy(&self.raw);
unsafe {
desc_alloc.clear(&self.raw);
mem_alloc.clear(&self.raw);
@@ -687,6 +698,7 @@ impl<G: GlobalIdentityHandlerFactory> Global<G> {
id: buffer_id,
data: data_path,
range: offset..offset + data.len() as BufferAddress,
queued: false,
});
}
None => (),
@@ -1683,179 +1695,6 @@ impl<G: GlobalIdentityHandlerFactory> Global<G> {
self.command_encoder_destroy::<B>(command_buffer_id)
}
pub fn queue_submit<B: GfxBackend>(
&self,
queue_id: id::QueueId,
command_buffer_ids: &[id::CommandBufferId],
) {
let hub = B::hub(self);
let (submit_index, fence) = {
let mut token = Token::root();
let (mut device_guard, mut token) = hub.devices.write(&mut token);
let device = &mut device_guard[queue_id];
device.temp_suspected.clear();
let submit_index = 1 + device
.life_guard
.submission_index
.fetch_add(1, Ordering::Relaxed);
let (mut swap_chain_guard, mut token) = hub.swap_chains.write(&mut token);
let (mut command_buffer_guard, mut token) = hub.command_buffers.write(&mut token);
let (bind_group_guard, mut token) = hub.bind_groups.read(&mut token);
let (compute_pipe_guard, mut token) = hub.compute_pipelines.read(&mut token);
let (render_pipe_guard, mut token) = hub.render_pipelines.read(&mut token);
let (mut buffer_guard, mut token) = hub.buffers.write(&mut token);
let (texture_guard, mut token) = hub.textures.read(&mut token);
let (texture_view_guard, mut token) = hub.texture_views.read(&mut token);
let (sampler_guard, _) = hub.samplers.read(&mut token);
//Note: locking the trackers has to be done after the storages
let mut signal_swapchain_semaphores = SmallVec::<[_; 1]>::new();
let mut trackers = device.trackers.lock();
//TODO: if multiple command buffers are submitted, we can re-use the last
// native command buffer of the previous chain instead of always creating
// a temporary one, since the chains are not finished.
// finish all the command buffers first
for &cmb_id in command_buffer_ids {
let comb = &mut command_buffer_guard[cmb_id];
#[cfg(feature = "trace")]
match device.trace {
Some(ref trace) => trace
.lock()
.add(Action::Submit(submit_index, comb.commands.take().unwrap())),
None => (),
};
if let Some((sc_id, fbo)) = comb.used_swap_chain.take() {
let sc = &mut swap_chain_guard[sc_id.value];
assert!(sc.acquired_view_id.is_some(),
"SwapChainOutput for {:?} was dropped before the respective command buffer {:?} got submitted!",
sc_id.value, cmb_id);
if sc.acquired_framebuffers.is_empty() {
signal_swapchain_semaphores.push(sc_id.value);
}
sc.acquired_framebuffers.push(fbo);
}
// optimize the tracked states
comb.trackers.optimize();
// update submission IDs
for id in comb.trackers.buffers.used() {
if let resource::BufferMapState::Waiting(_) = buffer_guard[id].map_state {
panic!("Buffer has a pending mapping.");
}
if !buffer_guard[id].life_guard.use_at(submit_index) {
if let resource::BufferMapState::Active { .. } = buffer_guard[id].map_state
{
log::warn!("Dropped buffer has a pending mapping.");
unmap_buffer(&device.raw, &mut buffer_guard[id]);
}
device.temp_suspected.buffers.push(id);
}
}
for id in comb.trackers.textures.used() {
if !texture_guard[id].life_guard.use_at(submit_index) {
device.temp_suspected.textures.push(id);
}
}
for id in comb.trackers.views.used() {
if !texture_view_guard[id].life_guard.use_at(submit_index) {
device.temp_suspected.texture_views.push(id);
}
}
for id in comb.trackers.bind_groups.used() {
if !bind_group_guard[id].life_guard.use_at(submit_index) {
device.temp_suspected.bind_groups.push(id);
}
}
for id in comb.trackers.samplers.used() {
if !sampler_guard[id].life_guard.use_at(submit_index) {
device.temp_suspected.samplers.push(id);
}
}
for id in comb.trackers.compute_pipes.used() {
if !compute_pipe_guard[id].life_guard.use_at(submit_index) {
device.temp_suspected.compute_pipelines.push(id);
}
}
for id in comb.trackers.render_pipes.used() {
if !render_pipe_guard[id].life_guard.use_at(submit_index) {
device.temp_suspected.render_pipelines.push(id);
}
}
// execute resource transitions
let mut transit = device.com_allocator.extend(comb);
unsafe {
// the last buffer was open, closing now
comb.raw.last_mut().unwrap().finish();
transit.begin_primary(hal::command::CommandBufferFlags::ONE_TIME_SUBMIT);
}
log::trace!("Stitching command buffer {:?} before submission", cmb_id);
command::CommandBuffer::insert_barriers(
&mut transit,
&mut *trackers,
&comb.trackers,
&*buffer_guard,
&*texture_guard,
);
unsafe {
transit.finish();
}
comb.raw.insert(0, transit);
}
log::debug!("Device after submission {}: {:#?}", submit_index, trackers);
// now prepare the GPU submission
let fence = device.raw.create_fence(false).unwrap();
let submission = hal::queue::Submission {
command_buffers: command_buffer_ids
.iter()
.flat_map(|&cmb_id| &command_buffer_guard[cmb_id].raw),
wait_semaphores: Vec::new(),
signal_semaphores: signal_swapchain_semaphores
.into_iter()
.map(|sc_id| &swap_chain_guard[sc_id].semaphore),
};
unsafe {
device.queue_group.queues[0].submit(submission, Some(&fence));
}
(submit_index, fence)
};
// No need for write access to the device from here on out
let callbacks = {
let mut token = Token::root();
let (device_guard, mut token) = hub.devices.read(&mut token);
let device = &device_guard[queue_id];
let callbacks = device.maintain(self, false, &mut token);
device.lock_life(&mut token).track_submission(
submit_index,
fence,
&device.temp_suspected,
);
// finally, return the command buffers to the allocator
for &cmb_id in command_buffer_ids {
let (cmd_buf, _) = hub.command_buffers.unregister(cmb_id, &mut token);
device.com_allocator.after_submit(cmd_buf, submit_index);
}
callbacks
};
fire_map_callbacks(callbacks);
}
pub fn device_create_render_pipeline<B: GfxBackend>(
&self,
device_id: id::DeviceId,
@@ -2623,6 +2462,7 @@ impl<G: GlobalIdentityHandlerFactory> Global<G> {
id: buffer_id,
data,
range: sub_range.offset..sub_range.offset + size,
queued: false,
});
}
None => (),

View File

@@ -0,0 +1,354 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#[cfg(feature = "trace")]
use crate::device::trace::Action;
use crate::{
command::{CommandAllocator, CommandBuffer},
hub::{GfxBackend, Global, GlobalIdentityHandlerFactory, Token},
id,
resource::{BufferMapState, BufferUse},
};
use gfx_memory::{Block, Heaps, MemoryBlock};
use hal::{command::CommandBuffer as _, device::Device as _, queue::CommandQueue as _};
use smallvec::SmallVec;
use std::{iter, sync::atomic::Ordering};
/// GPU work produced by `queue_write_buffer` that has not yet been
/// submitted: one open internal command buffer holding the recorded
/// copy commands, plus the staging buffers those copies read from.
#[derive(Debug, Default)]
pub(crate) struct PendingWrites<B: hal::Backend> {
    /// Internal command buffer with pending copy commands, if any writes
    /// were queued since the last submission.
    pub command_buffer: Option<B::CommandBuffer>,
    /// Temporary staging buffers (with their memory blocks) that must be
    /// kept alive until the pending commands have executed, then freed.
    pub temp_buffers: Vec<(B::Buffer, MemoryBlock<B>)>,
}
impl<B: hal::Backend> PendingWrites<B> {
    /// Creates an empty set of pending writes: no open command buffer
    /// and no staging buffers.
    pub fn new() -> Self {
        PendingWrites {
            command_buffer: None,
            temp_buffers: Vec::new(),
        }
    }

    /// Releases everything held by this value on device teardown.
    ///
    /// The internal command buffer (if one is open) is handed back to the
    /// command allocator, and every staging buffer has its memory freed
    /// before the buffer object itself is destroyed.
    pub fn dispose(
        self,
        device: &B::Device,
        com_allocator: &CommandAllocator<B>,
        mem_allocator: &mut Heaps<B>,
    ) {
        match self.command_buffer {
            Some(raw) => com_allocator.discard_internal(raw),
            None => {}
        }
        for (buffer, memory) in self.temp_buffers.into_iter() {
            mem_allocator.free(device, memory);
            unsafe {
                device.destroy_buffer(buffer);
            }
        }
    }
}
impl<G: GlobalIdentityHandlerFactory> Global<G> {
    /// Schedules a write of `data` into `buffer_id` starting at
    /// `buffer_offset`.
    ///
    /// The data is copied into a freshly allocated, host-mapped staging
    /// buffer, and a `copy_buffer` command is recorded into the device's
    /// pending-writes command buffer. That command buffer is submitted
    /// ahead of the user's command buffers on the next `queue_submit`,
    /// and the staging buffer is freed once that submission retires.
    ///
    /// # Panics
    /// Panics if the destination buffer was not created with `COPY_DST`
    /// usage, or if any of the underlying gfx-hal buffer creation,
    /// allocation, binding, or mapping calls fail.
    pub fn queue_write_buffer<B: GfxBackend>(
        &self,
        queue_id: id::QueueId,
        data: &[u8],
        buffer_id: id::BufferId,
        buffer_offset: wgt::BufferAddress,
    ) {
        let hub = B::hub(self);
        let mut token = Token::root();
        let (mut device_guard, mut token) = hub.devices.write(&mut token);
        let device = &mut device_guard[queue_id];
        let (buffer_guard, _) = hub.buffers.read(&mut token);

        #[cfg(feature = "trace")]
        match device.trace {
            Some(ref trace) => {
                let mut trace = trace.lock();
                let data_path = trace.make_binary("bin", data);
                // `queued: true` tells the replayer to go through
                // `queue_write_buffer` rather than direct buffer access.
                trace.add(Action::WriteBuffer {
                    id: buffer_id,
                    data: data_path,
                    range: buffer_offset..buffer_offset + data.len() as wgt::BufferAddress,
                    queued: true,
                });
            }
            None => {}
        }

        let mut trackers = device.trackers.lock();
        let (dst, transition) =
            trackers
                .buffers
                .use_replace(&*buffer_guard, buffer_id, (), BufferUse::COPY_DST);
        // Fixed: the message previously named a nonexistent "DST_SRC" flag;
        // the checked flag is COPY_DST.
        assert!(
            dst.usage.contains(wgt::BufferUsage::COPY_DST),
            "Write buffer usage {:?} must contain usage flag COPY_DST",
            dst.usage
        );
        // Bump the destination buffer's submission index so it is kept
        // alive until the submission carrying this write has retired.
        let last_submit_index = device.life_guard.submission_index.load(Ordering::Relaxed);
        dst.life_guard.use_at(last_submit_index + 1);

        let mut src_raw = unsafe {
            device
                .raw
                .create_buffer(
                    data.len() as wgt::BufferAddress,
                    hal::buffer::Usage::TRANSFER_SRC,
                )
                .unwrap()
        };

        //TODO: do we need to transition into HOST_WRITE access first?
        let requirements = unsafe { device.raw.get_buffer_requirements(&src_raw) };

        let mut memory = device
            .mem_allocator
            .lock()
            .allocate(
                &device.raw,
                requirements.type_mask as u32,
                gfx_memory::MemoryUsage::Staging { read_back: false },
                gfx_memory::Kind::Linear,
                requirements.size,
                requirements.alignment,
            )
            .unwrap();
        unsafe {
            device
                .raw
                .set_buffer_name(&mut src_raw, "<write_buffer_temp>");
            device
                .raw
                .bind_buffer_memory(memory.memory(), memory.segment().offset, &mut src_raw)
                .unwrap();
        }

        // Map the staging memory and copy the user's bytes in. The block
        // may be larger than `data` due to alignment, hence the sub-slice.
        let mut mapped = memory.map(&device.raw, hal::memory::Segment::ALL).unwrap();
        unsafe { mapped.write(&device.raw, hal::memory::Segment::ALL) }
            .unwrap()
            .slice[..data.len()]
            .copy_from_slice(data);

        // Reuse the open pending-writes command buffer, or start a new one
        // from the allocator's internal (device-owned) pool.
        let mut comb = match device.pending_writes.command_buffer.take() {
            Some(comb) => comb,
            None => {
                let mut comb = device.com_allocator.allocate_internal();
                unsafe {
                    comb.begin_primary(hal::command::CommandBufferFlags::ONE_TIME_SUBMIT);
                }
                comb
            }
        };

        let region = hal::command::BufferCopy {
            src: 0,
            dst: buffer_offset,
            size: data.len() as _,
        };
        unsafe {
            // Make the host writes to the staging buffer visible to the
            // transfer, and transition the destination into COPY_DST.
            comb.pipeline_barrier(
                super::all_buffer_stages()..hal::pso::PipelineStage::TRANSFER,
                hal::memory::Dependencies::empty(),
                iter::once(hal::memory::Barrier::Buffer {
                    states: hal::buffer::Access::HOST_WRITE..hal::buffer::Access::TRANSFER_READ,
                    target: &src_raw,
                    range: hal::buffer::SubRange::WHOLE,
                    families: None,
                })
                .chain(transition.map(|pending| pending.into_hal(dst))),
            );
            comb.copy_buffer(&src_raw, &dst.raw, iter::once(region));
        }

        device.pending_writes.temp_buffers.push((src_raw, memory));
        device.pending_writes.command_buffer = Some(comb);
    }

    /// Submits the given command buffers to the device's queue.
    ///
    /// Any pending-writes command buffer recorded by `queue_write_buffer`
    /// is finished and submitted *before* the user's command buffers, so
    /// queued writes are ordered ahead of this submission's work. Resource
    /// lifetimes are stamped with the new submission index, state
    /// transitions are stitched in via a temporary "transit" command
    /// buffer, and map callbacks that became ready are fired at the end.
    pub fn queue_submit<B: GfxBackend>(
        &self,
        queue_id: id::QueueId,
        command_buffer_ids: &[id::CommandBufferId],
    ) {
        let hub = B::hub(self);

        let callbacks = {
            let mut token = Token::root();
            let (mut device_guard, mut token) = hub.devices.write(&mut token);
            let device = &mut device_guard[queue_id];
            // Close the pending-writes command buffer (if any) so it can be
            // chained at the front of this submission.
            let pending_write_command_buffer =
                device
                    .pending_writes
                    .command_buffer
                    .take()
                    .map(|mut comb_raw| unsafe {
                        comb_raw.finish();
                        comb_raw
                    });
            device.temp_suspected.clear();

            let submit_index = 1 + device
                .life_guard
                .submission_index
                .fetch_add(1, Ordering::Relaxed);

            let fence = {
                let mut signal_swapchain_semaphores = SmallVec::<[_; 1]>::new();
                let (mut swap_chain_guard, mut token) = hub.swap_chains.write(&mut token);
                let (mut command_buffer_guard, mut token) = hub.command_buffers.write(&mut token);

                {
                    let (bind_group_guard, mut token) = hub.bind_groups.read(&mut token);
                    let (compute_pipe_guard, mut token) = hub.compute_pipelines.read(&mut token);
                    let (render_pipe_guard, mut token) = hub.render_pipelines.read(&mut token);
                    let (mut buffer_guard, mut token) = hub.buffers.write(&mut token);
                    let (texture_guard, mut token) = hub.textures.read(&mut token);
                    let (texture_view_guard, mut token) = hub.texture_views.read(&mut token);
                    let (sampler_guard, _) = hub.samplers.read(&mut token);

                    //Note: locking the trackers has to be done after the storages
                    let mut trackers = device.trackers.lock();

                    //TODO: if multiple command buffers are submitted, we can re-use the last
                    // native command buffer of the previous chain instead of always creating
                    // a temporary one, since the chains are not finished.

                    // finish all the command buffers first
                    for &cmb_id in command_buffer_ids {
                        let comb = &mut command_buffer_guard[cmb_id];
                        #[cfg(feature = "trace")]
                        match device.trace {
                            Some(ref trace) => trace
                                .lock()
                                .add(Action::Submit(submit_index, comb.commands.take().unwrap())),
                            None => (),
                        };

                        if let Some((sc_id, fbo)) = comb.used_swap_chain.take() {
                            let sc = &mut swap_chain_guard[sc_id.value];
                            assert!(sc.acquired_view_id.is_some(),
                                "SwapChainOutput for {:?} was dropped before the respective command buffer {:?} got submitted!",
                                sc_id.value, cmb_id);
                            // Only signal the swapchain semaphore once per
                            // swapchain, on its first framebuffer use.
                            if sc.acquired_framebuffers.is_empty() {
                                signal_swapchain_semaphores.push(sc_id.value);
                            }
                            sc.acquired_framebuffers.push(fbo);
                        }

                        // optimize the tracked states
                        comb.trackers.optimize();

                        // update submission IDs: `use_at` returning false
                        // means the resource was already dropped by the
                        // user, so it goes on the suspect list.
                        for id in comb.trackers.buffers.used() {
                            if let BufferMapState::Waiting(_) = buffer_guard[id].map_state {
                                panic!("Buffer has a pending mapping.");
                            }
                            if !buffer_guard[id].life_guard.use_at(submit_index) {
                                if let BufferMapState::Active { .. } = buffer_guard[id].map_state {
                                    log::warn!("Dropped buffer has a pending mapping.");
                                    super::unmap_buffer(&device.raw, &mut buffer_guard[id]);
                                }
                                device.temp_suspected.buffers.push(id);
                            }
                        }
                        for id in comb.trackers.textures.used() {
                            if !texture_guard[id].life_guard.use_at(submit_index) {
                                device.temp_suspected.textures.push(id);
                            }
                        }
                        for id in comb.trackers.views.used() {
                            if !texture_view_guard[id].life_guard.use_at(submit_index) {
                                device.temp_suspected.texture_views.push(id);
                            }
                        }
                        for id in comb.trackers.bind_groups.used() {
                            if !bind_group_guard[id].life_guard.use_at(submit_index) {
                                device.temp_suspected.bind_groups.push(id);
                            }
                        }
                        for id in comb.trackers.samplers.used() {
                            if !sampler_guard[id].life_guard.use_at(submit_index) {
                                device.temp_suspected.samplers.push(id);
                            }
                        }
                        for id in comb.trackers.compute_pipes.used() {
                            if !compute_pipe_guard[id].life_guard.use_at(submit_index) {
                                device.temp_suspected.compute_pipelines.push(id);
                            }
                        }
                        for id in comb.trackers.render_pipes.used() {
                            if !render_pipe_guard[id].life_guard.use_at(submit_index) {
                                device.temp_suspected.render_pipelines.push(id);
                            }
                        }

                        // execute resource transitions
                        let mut transit = device.com_allocator.extend(comb);
                        unsafe {
                            // the last buffer was open, closing now
                            comb.raw.last_mut().unwrap().finish();
                            transit
                                .begin_primary(hal::command::CommandBufferFlags::ONE_TIME_SUBMIT);
                        }
                        log::trace!("Stitching command buffer {:?} before submission", cmb_id);
                        CommandBuffer::insert_barriers(
                            &mut transit,
                            &mut *trackers,
                            &comb.trackers,
                            &*buffer_guard,
                            &*texture_guard,
                        );
                        unsafe {
                            transit.finish();
                        }
                        comb.raw.insert(0, transit);
                    }

                    log::debug!("Device after submission {}: {:#?}", submit_index, trackers);
                }

                // now prepare the GPU submission
                let fence = device.raw.create_fence(false).unwrap();
                let submission = hal::queue::Submission {
                    // Pending writes go first so queued buffer writes land
                    // before the user's commands run.
                    command_buffers: pending_write_command_buffer.as_ref().into_iter().chain(
                        command_buffer_ids
                            .iter()
                            .flat_map(|&cmb_id| &command_buffer_guard[cmb_id].raw),
                    ),
                    wait_semaphores: Vec::new(),
                    signal_semaphores: signal_swapchain_semaphores
                        .into_iter()
                        .map(|sc_id| &swap_chain_guard[sc_id].semaphore),
                };
                unsafe {
                    device.queue_group.queues[0].submit(submission, Some(&fence));
                }
                fence
            };

            // Track the internal command buffer so it gets recycled once
            // this submission index retires.
            if let Some(comb_raw) = pending_write_command_buffer {
                device
                    .com_allocator
                    .after_submit_internal(comb_raw, submit_index);
            }

            let callbacks = device.maintain(self, false, &mut token);
            // Hand the staging buffers over to the lifetime tracker so they
            // are destroyed when this submission completes.
            super::Device::lock_life_internal(&device.life_tracker, &mut token).track_submission(
                submit_index,
                fence,
                &device.temp_suspected,
                device.pending_writes.temp_buffers.drain(..),
            );

            // finally, return the command buffers to the allocator
            for &cmb_id in command_buffer_ids {
                let (cmd_buf, _) = hub.command_buffers.unregister(cmb_id, &mut token);
                device.com_allocator.after_submit(cmd_buf, submit_index);
            }

            callbacks
        };

        super::fire_map_callbacks(callbacks);
    }
}

View File

@@ -166,6 +166,7 @@ pub enum Action {
id: id::BufferId,
data: FileName,
range: Range<wgt::BufferAddress>,
queued: bool,
},
Submit(crate::SubmissionIndex, Vec<Command>),
}