From 1efdc2b22925ee373aa89db33e5c091ebc5bf0dd Mon Sep 17 00:00:00 2001 From: Andreas Reich Date: Sun, 24 Jan 2021 16:04:07 +0100 Subject: [PATCH] Queue_submit zeros out uninitialized buffer regions now again but with fine grained scheme now --- wgpu-core/src/command/bundle.rs | 14 ++-- wgpu-core/src/command/compute.rs | 7 +- wgpu-core/src/command/render.rs | 25 +++---- wgpu-core/src/command/transfer.rs | 24 +++--- wgpu-core/src/device/mod.rs | 45 ++++++----- wgpu-core/src/device/queue.rs | 108 ++++++++++++++++++++++++--- wgpu-core/src/memory_init_tracker.rs | 37 +++++---- 7 files changed, 168 insertions(+), 92 deletions(-) diff --git a/wgpu-core/src/command/bundle.rs b/wgpu-core/src/command/bundle.rs index 8d15b09ae7..357da0ab75 100644 --- a/wgpu-core/src/command/bundle.rs +++ b/wgpu-core/src/command/bundle.rs @@ -49,7 +49,7 @@ use crate::{ }, hub::{GfxBackend, GlobalIdentityHandlerFactory, Hub, Resource, Storage, Token}, id, - memory_init_tracker::{MemoryInitTrackerAction, ResourceMemoryInitTrackerAction}, + memory_init_tracker::{MemoryInitKind, ResourceMemoryInitTrackerAction}, resource::BufferUse, span, track::{TrackerSet, UsageConflict}, @@ -269,7 +269,8 @@ impl RenderBundleEncoder { }; used_buffer_ranges.push(ResourceMemoryInitTrackerAction { id: buffer_id, - action: MemoryInitTrackerAction::NeedsInitializedMemory(offset..end), + range: offset..end, + kind: MemoryInitKind::NeedsInitializedMemory, }); state.index.set_format(index_format); state.index.set_buffer(buffer_id, offset..end); @@ -295,7 +296,8 @@ impl RenderBundleEncoder { }; used_buffer_ranges.push(ResourceMemoryInitTrackerAction { id: buffer_id, - action: MemoryInitTrackerAction::NeedsInitializedMemory(offset..end), + range: offset..end, + kind: MemoryInitKind::NeedsInitializedMemory, }); state.vertex[slot as usize].set_buffer(buffer_id, offset..end); } @@ -411,7 +413,8 @@ impl RenderBundleEncoder { used_buffer_ranges.push(ResourceMemoryInitTrackerAction { id: buffer_id, - action: MemoryInitTrackerAction::NeedsInitializedMemory(0..buffer.size), + range: 0..buffer.size, + kind: MemoryInitKind::NeedsInitializedMemory, }); commands.extend(state.flush_vertices()); @@ -441,7 +444,8 @@ impl RenderBundleEncoder { let stride = 4 * 4; // 4 integers, vertexCount, instanceCount, firstVertex, firstInstance used_buffer_ranges.push(ResourceMemoryInitTrackerAction { id: buffer_id, - action: MemoryInitTrackerAction::NeedsInitializedMemory(0..stride), + range: 0..stride, + kind: MemoryInitKind::NeedsInitializedMemory, }); commands.extend(state.index.flush()); diff --git a/wgpu-core/src/command/compute.rs b/wgpu-core/src/command/compute.rs index d5f41e1640..f9119fedce 100644 --- a/wgpu-core/src/command/compute.rs +++ b/wgpu-core/src/command/compute.rs @@ -11,7 +11,7 @@ use crate::{ }, hub::{GfxBackend, Global, GlobalIdentityHandlerFactory, Storage, Token}, id, - memory_init_tracker::{MemoryInitTrackerAction, ResourceMemoryInitTrackerAction}, + memory_init_tracker::{MemoryInitKind, ResourceMemoryInitTrackerAction}, resource::{Buffer, BufferUse, Texture}, span, track::{TrackerSet, UsageConflict}, @@ -523,9 +523,8 @@ impl Global { .used_buffer_ranges .push(ResourceMemoryInitTrackerAction { id: buffer_id, - action: MemoryInitTrackerAction::NeedsInitializedMemory( - offset..(offset + stride), - ), + range: offset..(offset + stride), + kind: MemoryInitKind::NeedsInitializedMemory, }); state diff --git a/wgpu-core/src/command/render.rs b/wgpu-core/src/command/render.rs index 05ae085d7d..bf54a24a59 100644 --- a/wgpu-core/src/command/render.rs +++ b/wgpu-core/src/command/render.rs @@ -17,7 +17,7 @@ use crate::{ }, hub::{GfxBackend, Global, GlobalIdentityHandlerFactory, Storage, Token}, id, - memory_init_tracker::{MemoryInitTrackerAction, ResourceMemoryInitTrackerAction}, + memory_init_tracker::{MemoryInitKind, ResourceMemoryInitTrackerAction}, pipeline::PipelineFlags, resource::{BufferUse, Texture, TextureUse, TextureView, TextureViewInner}, span, @@ -1300,7 +1300,8 @@ impl Global { used_buffer_ranges.push(ResourceMemoryInitTrackerAction { id: buffer_id, - action: MemoryInitTrackerAction::NeedsInitializedMemory(offset..end), + range: offset..end, + kind: MemoryInitKind::NeedsInitializedMemory, }); let range = hal::buffer::SubRange { @@ -1348,9 +1349,8 @@ impl Global { used_buffer_ranges.push(ResourceMemoryInitTrackerAction { id: buffer_id, - action: MemoryInitTrackerAction::NeedsInitializedMemory( - offset..(offset + vertex_state.total_size), - ), + range: offset..(offset + vertex_state.total_size), + kind: MemoryInitKind::NeedsInitializedMemory, }); let range = hal::buffer::SubRange { @@ -1607,9 +1607,8 @@ impl Global { used_buffer_ranges.push(ResourceMemoryInitTrackerAction { id: buffer_id, - action: MemoryInitTrackerAction::NeedsInitializedMemory( - offset..end_offset, - ), + range: offset..end_offset, + kind: MemoryInitKind::NeedsInitializedMemory, }); match indexed { @@ -1709,15 +1708,13 @@ impl Global { used_buffer_ranges.push(ResourceMemoryInitTrackerAction { id: buffer_id, - action: MemoryInitTrackerAction::NeedsInitializedMemory( - offset..end_offset, - ), + range: offset..end_offset, + kind: MemoryInitKind::NeedsInitializedMemory, }); used_buffer_ranges.push(ResourceMemoryInitTrackerAction { id: count_buffer_id, - action: MemoryInitTrackerAction::NeedsInitializedMemory( - count_buffer_offset..end_count_offset, - ), + range: count_buffer_offset..end_count_offset, + kind: MemoryInitKind::NeedsInitializedMemory, }); match indexed { diff --git a/wgpu-core/src/command/transfer.rs b/wgpu-core/src/command/transfer.rs index c9db824266..2f59ca37d3 100644 --- a/wgpu-core/src/command/transfer.rs +++ b/wgpu-core/src/command/transfer.rs @@ -10,7 +10,7 @@ use crate::{ device::{all_buffer_stages, all_image_stages}, hub::{GfxBackend, Global, GlobalIdentityHandlerFactory, Storage, Token}, id::{BufferId, CommandEncoderId, TextureId}, - memory_init_tracker::{MemoryInitTrackerAction, ResourceMemoryInitTrackerAction}, + memory_init_tracker::{MemoryInitKind, ResourceMemoryInitTrackerAction}, resource::{BufferUse, Texture, TextureErrorDimension, TextureUse}, span, track::TextureSelector, @@ -408,17 +408,15 @@ impl Global { .used_buffer_ranges .push(ResourceMemoryInitTrackerAction { id: destination, - action: MemoryInitTrackerAction::ImplicitlyInitialized( - destination_offset..(destination_offset + size), - ), + range: destination_offset..(destination_offset + size), + kind: MemoryInitKind::ImplicitlyInitialized, }); cmd_buf .used_buffer_ranges .push(ResourceMemoryInitTrackerAction { id: source, - action: MemoryInitTrackerAction::NeedsInitializedMemory( - source_offset..(source_offset + size), - ), + range: source_offset..(source_offset + size), + kind: MemoryInitKind::NeedsInitializedMemory, }); let region = hal::command::BufferCopy { @@ -538,9 +536,8 @@ impl Global { .used_buffer_ranges .push(ResourceMemoryInitTrackerAction { id: source.buffer, - action: MemoryInitTrackerAction::NeedsInitializedMemory( - source.layout.offset..(source.layout.offset + required_buffer_bytes_in_copy), - ), + range: source.layout.offset..(source.layout.offset + required_buffer_bytes_in_copy), + kind: MemoryInitKind::NeedsInitializedMemory, }); // TODO: Mark dest texture memory as implicitly initialized here. @@ -695,10 +692,9 @@ impl Global { .used_buffer_ranges .push(ResourceMemoryInitTrackerAction { id: destination.buffer, - action: MemoryInitTrackerAction::ImplicitlyInitialized( - destination.layout.offset - ..(destination.layout.offset + required_buffer_bytes_in_copy), - ), + range: destination.layout.offset + ..(destination.layout.offset + required_buffer_bytes_in_copy), + kind: MemoryInitKind::ImplicitlyInitialized, }); // TODO: Mark dest texture memory as required to be initialized here. diff --git a/wgpu-core/src/device/mod.rs b/wgpu-core/src/device/mod.rs index 7047c6d4e1..4d6ec533fc 100644 --- a/wgpu-core/src/device/mod.rs +++ b/wgpu-core/src/device/mod.rs @@ -209,27 +209,22 @@ fn map_buffer( // If this is a write mapping zeroing out the memory here is the only reasonable way as all data is pushed to GPU anyways. let zero_init_needs_flush_now = !block.is_coherent() && buffer.sync_mapped_writes.is_none(); // No need to flush if it is flushed later anyways. - for uninitialized_segment in - buffer - .initialization_status - .drain_uninitialized_segments(hal::memory::Segment { - offset, - size: Some(size), - }) + if let Some(uninitialized_ranges) = buffer + .initialization_status + .drain_uninitialized_ranges(offset..(size + offset)) { - unsafe { - ptr::write_bytes( - ptr.as_ptr().offset(uninitialized_segment.offset as isize), - 0, - uninitialized_segment.size.unwrap_or(size) as usize, - ) - }; - if zero_init_needs_flush_now { - block.flush_range( - raw, - uninitialized_segment.offset, - uninitialized_segment.size, - )?; + for uninitialized_range in uninitialized_ranges { + let num_bytes = uninitialized_range.end - uninitialized_range.start; + unsafe { + ptr::write_bytes( + ptr.as_ptr().offset(uninitialized_range.start as isize), + 0, + num_bytes as usize, + ) + }; + if zero_init_needs_flush_now { + block.flush_range(raw, uninitialized_range.start, Some(num_bytes))?; + } } } @@ -507,6 +502,11 @@ impl Device { // we are going to be copying into it, internally usage |= hal::buffer::Usage::TRANSFER_DST; } + } else { + // We are required to zero out (initialize) all memory. + // This can be done via mapping, transfer or with a shader. However, right now we only implement it via transfer! + // TODO: Investigate if we could also do init via mapping or shader so that we need to alter the requested usage in less cases. + usage |= hal::buffer::Usage::TRANSFER_DST; } if desc.usage.is_empty() { @@ -1369,9 +1369,8 @@ impl Device { used_buffer_ranges.push(ResourceMemoryInitTrackerAction { id: bb.buffer_id, - action: MemoryInitTrackerAction::NeedsInitializedMemory( - bb.offset..(bb.offset + bind_size), - ), + range: bb.offset..(bb.offset + bind_size), + kind: MemoryInitKind::NeedsInitializedMemory, }); let sub_range = hal::buffer::SubRange { diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs index 44e713967a..b5303ab16f 100644 --- a/wgpu-core/src/device/queue.rs +++ b/wgpu-core/src/device/queue.rs @@ -13,13 +13,14 @@ use crate::{ device::{alloc, DeviceError, WaitIdleError}, hub::{GfxBackend, Global, GlobalIdentityHandlerFactory, Token}, id, + memory_init_tracker::MemoryInitKind, resource::{BufferAccessError, BufferMapState, BufferUse, TextureUse}, - span, FastHashSet, + span, FastHashMap, FastHashSet, }; use hal::{command::CommandBuffer as _, device::Device as _, queue::CommandQueue as _}; use smallvec::SmallVec; -use std::{iter, ptr}; +use std::{iter, ops::Range, ptr}; use thiserror::Error; struct StagingData { @@ -274,13 +275,12 @@ impl Global { // Ensure the overwritten bytes are marked as initialized so they don't need to be nulled prior to mapping or binding. { let dst = buffer_guard.get_mut(buffer_id).unwrap(); - for _ in dst + if let Some(uninitialized_ranges) = dst .initialization_status - .drain_uninitialized_segments(hal::memory::Segment { - offset: buffer_offset, - size: Some(buffer_offset + data_size), - }) - {} + .drain_uninitialized_ranges(buffer_offset..(buffer_offset + data_size)) + { + uninitialized_ranges.for_each(drop); + } } Ok(()) @@ -478,9 +478,96 @@ impl Global { span!(_guard, INFO, "Queue::submit"); let hub = B::hub(self); + let mut token = Token::root(); + + // Insert zero initializations if necessary. + // Since we push those to the pending_writes buffer, this needs to happen before any other processing + if !command_buffer_ids.is_empty() { + // Note that in the vast majority of cases this remains empty! + let required_buffer_inits = { + let (command_buffer_guard, mut token) = hub.command_buffers.read(&mut token); + let (mut buffer_guard, _) = hub.buffers.write(&mut token); + + let mut required_buffer_inits: FastHashMap< + id::BufferId, + Vec>, + > = FastHashMap::default(); + + for &cmb_id in command_buffer_ids { + let cmdbuf = command_buffer_guard + .get(cmb_id) + .map_err(|_| QueueSubmitError::InvalidCommandBuffer(cmb_id))?; + + for buffer_use in cmdbuf.used_buffer_ranges.iter() { + let buffer = buffer_guard + .get_mut(buffer_use.id) + .map_err(|_| QueueSubmitError::DestroyedBuffer(buffer_use.id))?; + + if let Some(uninitialized_ranges) = buffer + .initialization_status + .drain_uninitialized_ranges(buffer_use.range.clone()) + { + match buffer_use.kind { + MemoryInitKind::ImplicitlyInitialized => { + uninitialized_ranges.for_each(drop); + } + MemoryInitKind::NeedsInitializedMemory => { + required_buffer_inits + .entry(buffer_use.id) + .or_default() + .extend(uninitialized_ranges); + } + } + } + } + } + required_buffer_inits + }; + + if !required_buffer_inits.is_empty() { + let (mut device_guard, mut token) = hub.devices.write(&mut token); + let (buffer_guard, _) = hub.buffers.read(&mut token); + let device = device_guard + .get_mut(queue_id) + .map_err(|_| DeviceError::Invalid)?; + + device + .pending_writes + .dst_buffers + .extend(required_buffer_inits.keys()); + let pending_writes_cmd_buf = device.borrow_pending_writes(); + + for (buffer_id, ranges) in required_buffer_inits { + let buffer_raw = &buffer_guard + .get(buffer_id) + .map_err(|_| QueueSubmitError::DestroyedBuffer(buffer_id))? + .raw + .as_ref() + .ok_or(QueueSubmitError::DestroyedBuffer(buffer_id))? + .0; + + // TODO: Currently, we implement zero init only via fill_buffer. Should we implement this also via map & shader/storage? + // Note that this requires TRANSFER usage on the buffer - see also device.create_buffer + + // TODO: Collapse ranges + for range in ranges { + // TODO SHIPSTOPPER: Insert necessary barriers!! + unsafe { + pending_writes_cmd_buf.fill_buffer( + buffer_raw, + hal::buffer::SubRange { + offset: range.start, + size: Some(range.end - range.start), + }, + 0, + ); + } + } + } + } + } let callbacks = { - let mut token = Token::root(); let (mut device_guard, mut token) = hub.devices.write(&mut token); let device = device_guard .get_mut(queue_id) @@ -601,9 +688,6 @@ impl Global { } } - // TODO - // go through memory_init_tracker of every buffer and gather which areas need to be cleared. - // execute resource transitions let mut transit = device.cmd_allocator.extend(cmdbuf); unsafe { diff --git a/wgpu-core/src/memory_init_tracker.rs b/wgpu-core/src/memory_init_tracker.rs index 551621e917..edc510fef8 100644 --- a/wgpu-core/src/memory_init_tracker.rs +++ b/wgpu-core/src/memory_init_tracker.rs @@ -1,18 +1,18 @@ -use hal::memory::Segment; use std::ops::Range; #[derive(Debug, Clone)] -pub(crate) enum MemoryInitTrackerAction { +pub(crate) enum MemoryInitKind { // The memory range is going to be written by an already initialized source, thus doesn't need extra attention. - ImplicitlyInitialized(Range), + ImplicitlyInitialized, // The memory range is going to be read, therefore needs to ensure prior initialization. - NeedsInitializedMemory(Range), + NeedsInitializedMemory, } #[derive(Debug, Clone)] pub(crate) struct ResourceMemoryInitTrackerAction { pub(crate) id: ResourceId, - pub(crate) action: MemoryInitTrackerAction, + pub(crate) range: Range, + pub(crate) kind: MemoryInitKind, } /// Tracks initialization status of a linear range from 0..size @@ -34,15 +34,11 @@ impl MemoryInitTracker { } } - pub(crate) fn drain_uninitialized_segments<'a>( + #[must_use] + pub(crate) fn drain_uninitialized_ranges<'a>( &'a mut self, - segment: Segment, - ) -> impl Iterator + 'a { - let range = match segment.size { - Some(size) => segment.offset..(segment.offset + size), - None => segment.offset..self.uninitialized_ranges.initial_range().end, - }; - + range: Range, + ) -> Option> + 'a> { let mut uninitialized_ranges: Vec> = self .uninitialized_ranges .allocated_ranges() @@ -58,22 +54,23 @@ impl MemoryInitTracker { }) .collect(); - std::iter::from_fn(move || { + if uninitialized_ranges.is_empty() { + return None; + } + + Some(std::iter::from_fn(move || { let range: Option> = uninitialized_ranges.last().map(|r| r.clone()); match range { Some(range) => { uninitialized_ranges.pop(); - let result = Some(Segment { - offset: range.start, - size: Some(range.end - range.start), - }); + let result = range.clone(); self.uninitialized_ranges.free_range(range); - result + Some(result) } None => None, } - }) + })) } }