diff --git a/player/tests/data/all.ron b/player/tests/data/all.ron index ef76e7dac8..0b493a9b83 100644 --- a/player/tests/data/all.ron +++ b/player/tests/data/all.ron @@ -2,6 +2,7 @@ backends: (bits: 0xF), tests: [ "buffer-copy.ron", + "buffer-zero-init.ron", "bind-group.ron", "quad.ron", ], diff --git a/player/tests/data/buffer-zero-init.ron b/player/tests/data/buffer-zero-init.ron new file mode 100644 index 0000000000..90a9f0c7b4 --- /dev/null +++ b/player/tests/data/buffer-zero-init.ron @@ -0,0 +1,70 @@ +( + features: (bits: 0x0), + expectations: [ + ( + name: "mapped_at_creation: false, with MAP_WRITE", + buffer: (index: 0, epoch: 1), + offset: 0, + data: Raw([0x00, 0x00, 0x00, 0x00]), + ), + ( + name: "mapped_at_creation: false, without MAP_WRITE", + buffer: (index: 1, epoch: 1), + offset: 0, + data: Raw([0x00, 0x00, 0x00, 0x00]), + ), + ( + name: "partially written buffer", + buffer: (index: 2, epoch: 1), + offset: 0, + data: Raw([0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0xBF, 0x00, 0x00, 0x80, 0xBF, 0x00, 0x00, 0x80, 0x3F, 0x00, 0x00, 0x80, 0x3F, 0x00, 0x00, 0x00, 0x00]), + ), + ], + actions: [ + CreateBuffer( + Id(0, 1, Empty), + ( + label: Some("mapped_at_creation: false, with MAP_WRITE"), + size: 16, + usage: ( + bits: 131, // STORAGE + MAP_READ + MAP_WRITE + ), + mapped_at_creation: false, + ), + ), + CreateBuffer( + Id(1, 1, Empty), + ( + label: Some("mapped_at_creation: false, without MAP_WRITE"), + size: 16, + usage: ( + bits: 129, // STORAGE + MAP_READ + ), + mapped_at_creation: false, + ), + ), + CreateBuffer( + Id(2, 1, Empty), + ( + label: Some("partially written"), + size: 24, + usage: ( + bits: 9, // MAP_READ + COPY_DST + ), + mapped_at_creation: false, + ), + ), + WriteBuffer( + id: Id(2, 1, Empty), + data: "data1.bin", + range: ( + start: 4, + end: 20, + ), + queued: true, + ), + + // TODO: Add a buffer with `mapped_at_creation: true`. As of writing there's no unmap action we could insert here. 
+ Submit(1, []), + ], +) \ No newline at end of file diff --git a/wgpu-core/src/binding_model.rs b/wgpu-core/src/binding_model.rs index da9f33200c..dd40398ff0 100644 --- a/wgpu-core/src/binding_model.rs +++ b/wgpu-core/src/binding_model.rs @@ -9,6 +9,7 @@ use crate::{ }, hub::Resource, id::{BindGroupLayoutId, BufferId, DeviceId, SamplerId, TextureViewId, Valid}, + memory_init_tracker::MemoryInitTrackerAction, track::{TrackerSet, DUMMY_SELECTOR}, validation::{MissingBufferUsageError, MissingTextureUsageError}, FastHashMap, Label, LifeGuard, MultiRefCount, Stored, MAX_BIND_GROUPS, @@ -588,6 +589,7 @@ pub struct BindGroup { pub(crate) layout_id: Valid, pub(crate) life_guard: LifeGuard, pub(crate) used: TrackerSet, + pub(crate) used_buffer_ranges: Vec>, pub(crate) dynamic_binding_info: Vec, } diff --git a/wgpu-core/src/command/allocator.rs b/wgpu-core/src/command/allocator.rs index f00c5d9eda..4a021c9ab0 100644 --- a/wgpu-core/src/command/allocator.rs +++ b/wgpu-core/src/command/allocator.rs @@ -123,6 +123,7 @@ impl CommandAllocator { device_id, trackers: TrackerSet::new(B::VARIANT), used_swap_chains: Default::default(), + buffer_memory_init_actions: Default::default(), limits, private_features, has_labels: label.is_some(), diff --git a/wgpu-core/src/command/bundle.rs b/wgpu-core/src/command/bundle.rs index dcdb5faa30..f67fb61cb1 100644 --- a/wgpu-core/src/command/bundle.rs +++ b/wgpu-core/src/command/bundle.rs @@ -49,6 +49,7 @@ use crate::{ }, hub::{GfxBackend, GlobalIdentityHandlerFactory, Hub, Resource, Storage, Token}, id, + memory_init_tracker::{MemoryInitKind, MemoryInitTrackerAction}, resource::BufferUse, span, track::{TrackerSet, UsageConflict}, @@ -56,7 +57,7 @@ use crate::{ Label, LabelHelpers, LifeGuard, Stored, MAX_BIND_GROUPS, }; use arrayvec::ArrayVec; -use std::{borrow::Cow, iter, ops::Range}; +use std::{borrow::Cow, iter, mem, ops::Range}; use thiserror::Error; /// Describes a [`RenderBundleEncoder`]. 
@@ -159,6 +160,7 @@ impl RenderBundleEncoder { let mut commands = Vec::new(); let mut base = self.base.as_ref(); let mut pipeline_layout_id = None::>; + let mut buffer_memory_init_actions = Vec::new(); for &command in base.commands { match command { @@ -204,6 +206,8 @@ impl RenderBundleEncoder { .map_pass_err(scope); } + buffer_memory_init_actions.extend_from_slice(&bind_group.used_buffer_ranges); + state.set_bind_group(index, bind_group_id, bind_group.layout_id, offsets); state .trackers @@ -262,6 +266,11 @@ impl RenderBundleEncoder { Some(s) => offset + s.get(), None => buffer.size, }; + buffer_memory_init_actions.push(MemoryInitTrackerAction { + id: buffer_id, + range: offset..end, + kind: MemoryInitKind::NeedsInitializedMemory, + }); state.index.set_format(index_format); state.index.set_buffer(buffer_id, offset..end); } @@ -284,6 +293,11 @@ impl RenderBundleEncoder { Some(s) => offset + s.get(), None => buffer.size, }; + buffer_memory_init_actions.push(MemoryInitTrackerAction { + id: buffer_id, + range: offset..end, + kind: MemoryInitKind::NeedsInitializedMemory, + }); state.vertex[slot as usize].set_buffer(buffer_id, offset..end); } RenderCommand::SetPushConstant { @@ -379,7 +393,7 @@ impl RenderBundleEncoder { } RenderCommand::MultiDrawIndirect { buffer_id, - offset: _, + offset, count: None, indexed: false, } => { @@ -396,13 +410,26 @@ impl RenderBundleEncoder { check_buffer_usage(buffer.usage, wgt::BufferUsage::INDIRECT) .map_pass_err(scope)?; + buffer_memory_init_actions.extend( + buffer + .initialization_status + .check( + offset..(offset + mem::size_of::() as u64), + ) + .map(|range| MemoryInitTrackerAction { + id: buffer_id, + range, + kind: MemoryInitKind::NeedsInitializedMemory, + }), + ); + commands.extend(state.flush_vertices()); commands.extend(state.flush_binds()); commands.push(command); } RenderCommand::MultiDrawIndirect { buffer_id, - offset: _, + offset, count: None, indexed: true, } => { @@ -420,6 +447,19 @@ impl RenderBundleEncoder { 
check_buffer_usage(buffer.usage, wgt::BufferUsage::INDIRECT) .map_pass_err(scope)?; + buffer_memory_init_actions.extend( + buffer + .initialization_status + .check( + offset..(offset + mem::size_of::() as u64), + ) + .map(|range| MemoryInitTrackerAction { + id: buffer_id, + range, + kind: MemoryInitKind::NeedsInitializedMemory, + }), + ); + commands.extend(state.index.flush()); commands.extend(state.flush_vertices()); commands.extend(state.flush_binds()); @@ -454,6 +494,7 @@ impl RenderBundleEncoder { ref_count: device.life_guard.add_ref(), }, used: state.trackers, + buffer_memory_init_actions, context: self.context, life_guard: LifeGuard::new(desc.label.borrow_or_default()), }) @@ -502,6 +543,7 @@ pub struct RenderBundle { base: BasePass, pub(crate) device_id: Stored, pub(crate) used: TrackerSet, + pub(crate) buffer_memory_init_actions: Vec>, pub(crate) context: RenderPassContext, pub(crate) life_guard: LifeGuard, } diff --git a/wgpu-core/src/command/compute.rs b/wgpu-core/src/command/compute.rs index ba19a2f0fc..c810b327ef 100644 --- a/wgpu-core/src/command/compute.rs +++ b/wgpu-core/src/command/compute.rs @@ -11,6 +11,7 @@ use crate::{ }, hub::{GfxBackend, Global, GlobalIdentityHandlerFactory, Storage, Token}, id, + memory_init_tracker::{MemoryInitKind, MemoryInitTrackerAction}, resource::{Buffer, BufferUse, Texture}, span, track::{TrackerSet, UsageConflict}, @@ -146,6 +147,8 @@ pub enum ComputePassErrorInner { end_offset: u64, buffer_size: u64, }, + #[error("buffer {0:?} is invalid or destroyed")] + InvalidBuffer(id::BufferId), #[error(transparent)] ResourceUsageConflict(#[from] UsageConflict), #[error(transparent)] @@ -329,6 +332,22 @@ impl Global { .validate_dynamic_bindings(&temp_offsets) .map_pass_err(scope)?; + cmd_buf.buffer_memory_init_actions.extend( + bind_group.used_buffer_ranges.iter().filter_map( + |action| match buffer_guard.get(action.id) { + Ok(buffer) => buffer + .initialization_status + .check(action.range.clone()) + .map(|range| 
MemoryInitTrackerAction { + id: action.id, + range, + kind: action.kind, + }), + Err(_) => None, + }, + ), + ); + if let Some((pipeline_layout_id, follow_ups)) = state.binder.provide_entry( index as usize, id::Valid(bind_group_id), @@ -513,6 +532,19 @@ impl Global { .ok_or(ComputePassErrorInner::InvalidIndirectBuffer(buffer_id)) .map_pass_err(scope)?; + let stride = 3 * 4; // 3 integers, x/y/z group size + + cmd_buf.buffer_memory_init_actions.extend( + indirect_buffer + .initialization_status + .check(offset..(offset + stride)) + .map(|range| MemoryInitTrackerAction { + id: buffer_id, + range, + kind: MemoryInitKind::NeedsInitializedMemory, + }), + ); + state .flush_states( raw, diff --git a/wgpu-core/src/command/mod.rs b/wgpu-core/src/command/mod.rs index b98ce39598..edbda616e5 100644 --- a/wgpu-core/src/command/mod.rs +++ b/wgpu-core/src/command/mod.rs @@ -24,6 +24,7 @@ use crate::{ device::{all_buffer_stages, all_image_stages}, hub::{GfxBackend, Global, GlobalIdentityHandlerFactory, Storage, Token}, id, + memory_init_tracker::MemoryInitTrackerAction, resource::{Buffer, Texture}, span, track::TrackerSet, @@ -46,6 +47,7 @@ pub struct CommandBuffer { pub(crate) device_id: Stored, pub(crate) trackers: TrackerSet, pub(crate) used_swap_chains: SmallVec<[Stored; 1]>, + pub(crate) buffer_memory_init_actions: Vec>, limits: wgt::Limits, private_features: PrivateFeatures, has_labels: bool, @@ -56,17 +58,6 @@ pub struct CommandBuffer { } impl CommandBuffer { - fn get_encoder( - storage: &Storage, - id: id::CommandEncoderId, - ) -> Result<&Self, CommandEncoderError> { - match storage.get(id) { - Ok(cmd_buf) if cmd_buf.is_recording => Ok(cmd_buf), - Ok(_) => Err(CommandEncoderError::NotRecording), - Err(_) => Err(CommandEncoderError::Invalid), - } - } - fn get_encoder_mut( storage: &mut Storage, id: id::CommandEncoderId, diff --git a/wgpu-core/src/command/render.rs b/wgpu-core/src/command/render.rs index cf4292e7ec..cb5547750a 100644 --- a/wgpu-core/src/command/render.rs +++ 
b/wgpu-core/src/command/render.rs @@ -17,6 +17,7 @@ use crate::{ }, hub::{GfxBackend, Global, GlobalIdentityHandlerFactory, Storage, Token}, id, + memory_init_tracker::{MemoryInitKind, MemoryInitTrackerAction}, pipeline::PipelineFlags, resource::{BufferUse, Texture, TextureUse, TextureView, TextureViewInner}, span, @@ -1026,10 +1027,10 @@ impl Global { let (cmd_buf_raw, trackers, used_swapchain, query_reset_state) = { // read-only lock guard - let (cmb_guard, mut token) = hub.command_buffers.read(&mut token); + let (mut cmb_guard, mut token) = hub.command_buffers.write(&mut token); let cmd_buf = - CommandBuffer::get_encoder(&*cmb_guard, encoder_id).map_pass_err(scope)?; + CommandBuffer::get_encoder_mut(&mut *cmb_guard, encoder_id).map_pass_err(scope)?; let device = &device_guard[cmd_buf.device_id.value]; let mut raw = device.cmd_allocator.extend(cmd_buf); unsafe { @@ -1117,6 +1118,22 @@ impl Global { .merge_extend(&bind_group.used) .map_pass_err(scope)?; + cmd_buf.buffer_memory_init_actions.extend( + bind_group.used_buffer_ranges.iter().filter_map(|action| { + match buffer_guard.get(action.id) { + Ok(buffer) => buffer + .initialization_status + .check(action.range.clone()) + .map(|range| MemoryInitTrackerAction { + id: action.id, + range, + kind: action.kind, + }), + Err(_) => None, + } + }), + ); + if let Some((pipeline_layout_id, follow_ups)) = state.binder.provide_entry( index as usize, id::Valid(bind_group_id), @@ -1293,6 +1310,17 @@ impl Global { state.index.format = Some(index_format); state.index.update_limit(); + cmd_buf.buffer_memory_init_actions.extend( + buffer + .initialization_status + .check(offset..end) + .map(|range| MemoryInitTrackerAction { + id: buffer_id, + range, + kind: MemoryInitKind::NeedsInitializedMemory, + }), + ); + let range = hal::buffer::SubRange { offset, size: Some(end - offset), @@ -1336,6 +1364,17 @@ impl Global { }; vertex_state.bound = true; + cmd_buf.buffer_memory_init_actions.extend( + buffer + .initialization_status + 
.check(offset..(offset + vertex_state.total_size)) + .map(|range| MemoryInitTrackerAction { + id: buffer_id, + range, + kind: MemoryInitKind::NeedsInitializedMemory, + }), + ); + let range = hal::buffer::SubRange { offset, size: size.map(|s| s.get()), @@ -1588,6 +1627,17 @@ impl Global { .map_pass_err(scope); } + cmd_buf.buffer_memory_init_actions.extend( + indirect_buffer + .initialization_status + .check(offset..end_offset) + .map(|range| MemoryInitTrackerAction { + id: buffer_id, + range, + kind: MemoryInitKind::NeedsInitializedMemory, + }), + ); + match indexed { false => unsafe { raw.draw_indirect( @@ -1671,6 +1721,16 @@ impl Global { }) .map_pass_err(scope); } + cmd_buf.buffer_memory_init_actions.extend( + indirect_buffer + .initialization_status + .check(offset..end_offset) + .map(|range| MemoryInitTrackerAction { + id: buffer_id, + range, + kind: MemoryInitKind::NeedsInitializedMemory, + }), + ); let begin_count_offset = count_buffer_offset; let end_count_offset = count_buffer_offset + 4; @@ -1682,6 +1742,16 @@ impl Global { }) .map_pass_err(scope); } + cmd_buf.buffer_memory_init_actions.extend( + count_buffer + .initialization_status + .check(count_buffer_offset..end_count_offset) + .map(|range| MemoryInitTrackerAction { + id: count_buffer_id, + range, + kind: MemoryInitKind::NeedsInitializedMemory, + }), + ); match indexed { false => unsafe { @@ -1815,6 +1885,23 @@ impl Global { .map_err(RenderPassErrorInner::IncompatibleRenderBundle) .map_pass_err(scope)?; + cmd_buf.buffer_memory_init_actions.extend( + bundle + .buffer_memory_init_actions + .iter() + .filter_map(|action| match buffer_guard.get(action.id) { + Ok(buffer) => buffer + .initialization_status + .check(action.range.clone()) + .map(|range| MemoryInitTrackerAction { + id: action.id, + range, + kind: action.kind, + }), + Err(_) => None, + }), + ); + unsafe { bundle.execute( &mut raw, diff --git a/wgpu-core/src/command/transfer.rs b/wgpu-core/src/command/transfer.rs index e19fcd3a87..2502236589 
100644 --- a/wgpu-core/src/command/transfer.rs +++ b/wgpu-core/src/command/transfer.rs @@ -10,6 +10,7 @@ use crate::{ device::{all_buffer_stages, all_image_stages}, hub::{GfxBackend, Global, GlobalIdentityHandlerFactory, Storage, Token}, id::{BufferId, CommandEncoderId, TextureId}, + memory_init_tracker::{MemoryInitKind, MemoryInitTrackerAction}, resource::{BufferUse, Texture, TextureErrorDimension, TextureUse}, span, track::TextureSelector, @@ -149,6 +150,7 @@ pub(crate) fn texture_copy_view_to_hal( } /// Function copied with minor modifications from webgpu standard https://gpuweb.github.io/gpuweb/#valid-texture-copy-range +/// If successful, returns number of buffer bytes required for this copy. pub(crate) fn validate_linear_texture_data( layout: &wgt::TextureDataLayout, format: wgt::TextureFormat, @@ -156,7 +158,7 @@ pub(crate) fn validate_linear_texture_data( buffer_side: CopySide, bytes_per_block: BufferAddress, copy_size: &Extent3d, -) -> Result<(), TransferError> { +) -> Result { // Convert all inputs to BufferAddress (u64) to prevent overflow issues let copy_width = copy_size.width as BufferAddress; let copy_height = copy_size.height as BufferAddress; @@ -217,7 +219,7 @@ pub(crate) fn validate_linear_texture_data( if copy_depth > 1 && rows_per_image == 0 { return Err(TransferError::InvalidRowsPerImage); } - Ok(()) + Ok(required_bytes_in_copy) } /// Function copied with minor modifications from webgpu standard https://gpuweb.github.io/gpuweb/#valid-texture-copy-range @@ -401,6 +403,28 @@ impl Global { return Ok(()); } + // Make sure source is initialized memory and mark dest as initialized. 
+ cmd_buf.buffer_memory_init_actions.extend( + dst_buffer + .initialization_status + .check(destination_offset..(destination_offset + size)) + .map(|range| MemoryInitTrackerAction { + id: destination, + range, + kind: MemoryInitKind::ImplicitlyInitialized, + }), + ); + cmd_buf.buffer_memory_init_actions.extend( + src_buffer + .initialization_status + .check(source_offset..(source_offset + size)) + .map(|range| MemoryInitTrackerAction { + id: source, + range, + kind: MemoryInitKind::NeedsInitializedMemory, + }), + ); + let region = hal::command::BufferCopy { src: source_offset, dst: destination_offset, @@ -505,7 +529,7 @@ impl Global { CopySide::Destination, copy_size, )?; - validate_linear_texture_data( + let required_buffer_bytes_in_copy = validate_linear_texture_data( &source.layout, dst_texture.format, src_buffer.size, @@ -514,6 +538,17 @@ impl Global { copy_size, )?; + cmd_buf.buffer_memory_init_actions.extend( + src_buffer + .initialization_status + .check(source.layout.offset..(source.layout.offset + required_buffer_bytes_in_copy)) + .map(|range| MemoryInitTrackerAction { + id: source.buffer, + range, + kind: MemoryInitKind::NeedsInitializedMemory, + }), + ); + let (block_width, _) = dst_texture.format.describe().block_dimensions; if !conv::is_valid_copy_dst_texture_format(dst_texture.format) { Err(TransferError::CopyToForbiddenTextureFormat( @@ -645,7 +680,7 @@ impl Global { CopySide::Source, copy_size, )?; - validate_linear_texture_data( + let required_buffer_bytes_in_copy = validate_linear_texture_data( &destination.layout, src_texture.format, dst_buffer.size, @@ -661,6 +696,20 @@ impl Global { ))? 
} + cmd_buf.buffer_memory_init_actions.extend( + dst_buffer + .initialization_status + .check( + destination.layout.offset + ..(destination.layout.offset + required_buffer_bytes_in_copy), + ) + .map(|range| MemoryInitTrackerAction { + id: destination.buffer, + range, + kind: MemoryInitKind::ImplicitlyInitialized, + }), + ); + // WebGPU uses the physical size of the texture for copies whereas vulkan uses // the virtual size. We have passed validation, so it's safe to use the // image extent data directly. We want the provided copy size to be no larger than diff --git a/wgpu-core/src/device/mod.rs b/wgpu-core/src/device/mod.rs index 1a2fcc733c..0cc5f46de3 100644 --- a/wgpu-core/src/device/mod.rs +++ b/wgpu-core/src/device/mod.rs @@ -8,7 +8,9 @@ use crate::{ hub::{ GfxBackend, Global, GlobalIdentityHandlerFactory, Hub, Input, InvalidId, Storage, Token, }, - id, instance, pipeline, resource, span, swap_chain, + id, instance, + memory_init_tracker::{MemoryInitKind, MemoryInitTracker, MemoryInitTrackerAction}, + pipeline, resource, span, swap_chain, track::{BufferState, TextureSelector, TextureState, TrackerSet}, validation::{self, check_buffer_usage, check_texture_usage}, FastHashMap, FastHashSet, Label, LabelHelpers, LifeGuard, MultiRefCount, PrivateFeatures, @@ -196,6 +198,30 @@ fn map_buffer( }), _ => None, }; + + // Zero out uninitialized parts of the mapping. (Spec dictates all resources behave as if they were initialized with zero) + // + // If this is a read mapping, ideally we would use a `fill_buffer` command before reading the data from GPU (i.e. `invalidate_range`). + // However, this would require us to kick off and wait for a command buffer or piggy back on an existing one (the later is likely the only worthwhile option). + // As reading uninitialized memory isn't a particular important path to support, + // we instead just initialize the memory here and make sure it is GPU visible, so this happens at max only once for every buffer region. 
+ // + // If this is a write mapping zeroing out the memory here is the only reasonable way as all data is pushed to GPU anyways. + let zero_init_needs_flush_now = !block.is_coherent() && buffer.sync_mapped_writes.is_none(); // No need to flush if it is flushed later anyways. + for uninitialized_range in buffer.initialization_status.drain(offset..(size + offset)) { + let num_bytes = uninitialized_range.end - uninitialized_range.start; + unsafe { + ptr::write_bytes( + ptr.as_ptr().offset(uninitialized_range.start as isize), + 0, + num_bytes as usize, + ) + }; + if zero_init_needs_flush_now { + block.flush_range(raw, uninitialized_range.start, Some(num_bytes))?; + } + } + Ok(ptr) } @@ -470,6 +496,10 @@ impl Device { // we are going to be copying into it, internally usage |= hal::buffer::Usage::TRANSFER_DST; } + } else { + // We are required to zero out (initialize) all memory. + // This is done on demand using fill_buffer which requires write transfer usage! + usage |= hal::buffer::Usage::TRANSFER_DST; } if desc.usage.is_empty() { @@ -542,7 +572,7 @@ impl Device { }, usage: desc.usage, size: desc.size, - full_range: (), + initialization_status: MemoryInitTracker::new(desc.size), sync_mapped_writes: None, map_state: resource::BufferMapState::Idle, life_guard: LifeGuard::new(desc.label.borrow_or_default()), @@ -1237,6 +1267,7 @@ impl Device { // `BTreeMap` has ordered bindings as keys, which allows us to coalesce // the descriptor writes into a single transaction. 
let mut write_map = BTreeMap::new(); + let mut used_buffer_ranges = Vec::new(); for entry in desc.entries.iter() { let binding = entry.binding; // Find the corresponding declaration in the layout @@ -1329,6 +1360,12 @@ impl Device { return Err(Error::BindingZeroSize(bb.buffer_id)); } + used_buffer_ranges.push(MemoryInitTrackerAction { + id: bb.buffer_id, + range: bb.offset..(bb.offset + bind_size), + kind: MemoryInitKind::NeedsInitializedMemory, + }); + let sub_range = hal::buffer::SubRange { offset: bb.offset, size: Some(bind_size), @@ -1544,6 +1581,7 @@ impl Device { layout_id: id::Valid(desc.layout), life_guard: LifeGuard::new(desc.label.borrow_or_default()), used, + used_buffer_ranges, dynamic_binding_info, }) } @@ -2530,7 +2568,7 @@ impl Global { usage: wgt::BufferUsage::MAP_WRITE | wgt::BufferUsage::COPY_SRC, mapped_at_creation: false, }; - let stage = match device.create_buffer(device_id, &stage_desc, true) { + let mut stage = match device.create_buffer(device_id, &stage_desc, true) { Ok(stage) => stage, Err(e) => { let (raw, memory) = buffer.raw.unwrap(); @@ -2561,6 +2599,13 @@ impl Global { break e.into(); } }; + + // Zero initialize memory and then mark both staging and buffer as initialized + // (it's guaranteed that this is the case by the time the buffer is usable) + unsafe { ptr::write_bytes(ptr.as_ptr(), 0, buffer.size as usize) }; + buffer.initialization_status.clear(0..buffer.size); + stage.initialization_status.clear(0..buffer.size); + buffer.map_state = resource::BufferMapState::Init { ptr, needs_flush: !stage_memory.is_coherent(), diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs index dbc53f28a4..ebbec17a06 100644 --- a/wgpu-core/src/device/queue.rs +++ b/wgpu-core/src/device/queue.rs @@ -13,13 +13,14 @@ use crate::{ device::{alloc, DeviceError, WaitIdleError}, hub::{GfxBackend, Global, GlobalIdentityHandlerFactory, Token}, id, + memory_init_tracker::MemoryInitKind, resource::{BufferAccessError, BufferMapState, 
BufferUse, TextureUse}, - span, FastHashSet, + span, FastHashMap, FastHashSet, }; use hal::{command::CommandBuffer as _, device::Device as _, queue::CommandQueue as _}; use smallvec::SmallVec; -use std::{iter, ptr}; +use std::{iter, ops::Range, ptr}; use thiserror::Error; struct StagingData { @@ -271,6 +272,16 @@ impl Global { device.pending_writes.consume(stage); device.pending_writes.dst_buffers.insert(buffer_id); + // Ensure the overwritten bytes are marked as initialized so they don't need to be nulled prior to mapping or binding. + { + drop(buffer_guard); + let (mut buffer_guard, _) = hub.buffers.write(&mut token); + + let dst = buffer_guard.get_mut(buffer_id).unwrap(); + dst.initialization_status + .clear(buffer_offset..(buffer_offset + data_size)); + } + Ok(()) } @@ -458,6 +469,129 @@ impl Global { Ok(()) } + // Enacts all zero initializations required by the given command buffers + // Required commands are appended to device.pending_writes + fn initialize_used_uninitialized_memory( + &self, + queue_id: id::QueueId, + command_buffer_ids: &[id::CommandBufferId], + ) -> Result<(), QueueSubmitError> { + if command_buffer_ids.is_empty() { + return Ok(()); + } + + let hub = B::hub(self); + let mut token = Token::root(); + + let mut required_buffer_inits = { + let (command_buffer_guard, mut token) = hub.command_buffers.read(&mut token); + + let mut required_buffer_inits: FastHashMap< + id::BufferId, + Vec>, + > = FastHashMap::default(); + + for &cmb_id in command_buffer_ids { + let cmdbuf = command_buffer_guard + .get(cmb_id) + .map_err(|_| QueueSubmitError::InvalidCommandBuffer(cmb_id))?; + + if cmdbuf.buffer_memory_init_actions.len() == 0 { + continue; + } + + let (mut buffer_guard, _) = hub.buffers.write(&mut token); + + for buffer_use in cmdbuf.buffer_memory_init_actions.iter() { + let buffer = buffer_guard + .get_mut(buffer_use.id) + .map_err(|_| QueueSubmitError::DestroyedBuffer(buffer_use.id))?; + + let uninitialized_ranges = + 
buffer.initialization_status.drain(buffer_use.range.clone()); + match buffer_use.kind { + MemoryInitKind::ImplicitlyInitialized => { + uninitialized_ranges.for_each(drop); + } + MemoryInitKind::NeedsInitializedMemory => { + required_buffer_inits + .entry(buffer_use.id) + .or_default() + .extend(uninitialized_ranges); + } + } + } + } + required_buffer_inits + }; + + // Memory init is expected to be rare (means user relies on default zero!), so most of the time we early here! + if required_buffer_inits.is_empty() { + return Ok(()); + } + + let (mut device_guard, mut token) = hub.devices.write(&mut token); + let (buffer_guard, _) = hub.buffers.read(&mut token); + let device = device_guard + .get_mut(queue_id) + .map_err(|_| DeviceError::Invalid)?; + + device + .pending_writes + .dst_buffers + .extend(required_buffer_inits.keys()); + device.borrow_pending_writes(); // Call ensures there is a pending_writes cmdbuffer, but using the reference returned would make the borrow checker unhappy! + let pending_writes_cmd_buf = device.pending_writes.command_buffer.as_mut().unwrap(); + let mut trackers = device.trackers.lock(); + + for (buffer_id, mut ranges) in required_buffer_inits.drain() { + // Collapse touching ranges. We can't do this any earlier since we only now gathered ranges from several different command buffers! + ranges.sort_by(|a, b| a.start.cmp(&b.start)); + for i in (1..ranges.len()).rev() { + assert!(ranges[i - 1].end <= ranges[i].start); // The memory init tracker made sure of this! 
+ if ranges[i].start == ranges[i - 1].end { + ranges[i - 1].end = ranges[i].end; + ranges.swap_remove(i); // Ordering not important at this point + } + } + + let (buffer, transition) = trackers + .buffers + .use_replace(&*buffer_guard, buffer_id, (), BufferUse::COPY_DST) + .map_err(|_| QueueSubmitError::DestroyedBuffer(buffer_id))?; + let &(ref buffer_raw, _) = buffer + .raw + .as_ref() + .ok_or(QueueSubmitError::DestroyedBuffer(buffer_id))?; + unsafe { + pending_writes_cmd_buf.pipeline_barrier( + super::all_buffer_stages()..hal::pso::PipelineStage::TRANSFER, + hal::memory::Dependencies::empty(), + transition.map(|pending| pending.into_hal(buffer)), + ); + } + for range in ranges { + let size = range.end - range.start; + + assert!(range.start % 4 == 0, "Buffer {:?} has an uninitialized range with a start not aligned to 4 (start was {})", buffer, range.start); + assert!(size % 4 == 0, "Buffer {:?} has an uninitialized range with a size not aligned to 4 (size was {})", buffer, size); + + unsafe { + pending_writes_cmd_buf.fill_buffer( + buffer_raw, + hal::buffer::SubRange { + offset: range.start, + size: Some(size), + }, + 0, + ); + } + } + } + + Ok(()) + } + pub fn queue_submit( &self, queue_id: id::QueueId, @@ -465,10 +599,12 @@ impl Global { ) -> Result<(), QueueSubmitError> { span!(_guard, INFO, "Queue::submit"); + self.initialize_used_uninitialized_memory::(queue_id, command_buffer_ids)?; + let hub = B::hub(self); + let mut token = Token::root(); let callbacks = { - let mut token = Token::root(); let (mut device_guard, mut token) = hub.devices.write(&mut token); let device = device_guard .get_mut(queue_id) @@ -490,7 +626,7 @@ impl Global { let (compute_pipe_guard, mut token) = hub.compute_pipelines.read(&mut token); let (render_pipe_guard, mut token) = hub.render_pipelines.read(&mut token); let (mut buffer_guard, mut token) = hub.buffers.write(&mut token); - let (texture_guard, mut token) = hub.textures.read(&mut token); + let (texture_guard, mut token) = 
hub.textures.write(&mut token); let (texture_view_guard, mut token) = hub.texture_views.read(&mut token); let (sampler_guard, _) = hub.samplers.read(&mut token); diff --git a/wgpu-core/src/lib.rs b/wgpu-core/src/lib.rs index bd099a0fbe..6b102ce3e9 100644 --- a/wgpu-core/src/lib.rs +++ b/wgpu-core/src/lib.rs @@ -36,6 +36,7 @@ pub mod device; pub mod hub; pub mod id; pub mod instance; +mod memory_init_tracker; pub mod pipeline; pub mod resource; pub mod swap_chain; diff --git a/wgpu-core/src/memory_init_tracker.rs b/wgpu-core/src/memory_init_tracker.rs new file mode 100644 index 0000000000..c153be3906 --- /dev/null +++ b/wgpu-core/src/memory_init_tracker.rs @@ -0,0 +1,279 @@ +use std::ops::Range; + +#[derive(Debug, Clone, Copy)] +pub(crate) enum MemoryInitKind { + // The memory range is going to be written by an already initialized source, thus doesn't need extra attention other than marking as initialized. + ImplicitlyInitialized, + // The memory range is going to be read, therefore needs to ensure prior initialization. + NeedsInitializedMemory, +} + +#[derive(Debug, Clone)] +pub(crate) struct MemoryInitTrackerAction { + pub(crate) id: ResourceId, + pub(crate) range: Range, + pub(crate) kind: MemoryInitKind, +} + +/// Tracks initialization status of a linear range from 0..size +#[derive(Debug)] +pub(crate) struct MemoryInitTracker { + // Ordered, non overlapping list of all uninitialized ranges. 
+    uninitialized_ranges: Vec<Range<wgt::BufferAddress>>,
+}
+
+pub(crate) struct MemoryInitTrackerDrain<'a> {
+    uninitialized_ranges: &'a mut Vec<Range<wgt::BufferAddress>>,
+    drain_range: Range<wgt::BufferAddress>,
+    first_index: usize,
+    next_index: usize,
+}
+
+impl<'a> Iterator for MemoryInitTrackerDrain<'a> {
+    type Item = Range<wgt::BufferAddress>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if let Some(r) = self
+            .uninitialized_ranges
+            .get(self.next_index)
+            .and_then(|range| {
+                if range.start < self.drain_range.end {
+                    Some(range.clone())
+                } else {
+                    None
+                }
+            })
+        {
+            self.next_index += 1;
+            Some(r.start.max(self.drain_range.start)..r.end.min(self.drain_range.end))
+        } else {
+            let num_affected = self.next_index - self.first_index;
+            if num_affected == 0 {
+                return None;
+            }
+
+            let first_range = &mut self.uninitialized_ranges[self.first_index];
+
+            // Split one "big" uninitialized range?
+            if num_affected == 1
+                && first_range.start < self.drain_range.start
+                && first_range.end > self.drain_range.end
+            {
+                let old_start = first_range.start;
+                first_range.start = self.drain_range.end;
+                self.uninitialized_ranges
+                    .insert(self.first_index, old_start..self.drain_range.start);
+            }
+            // Adjust border ranges and delete everything in-between.
+            else {
+                let remove_start = if first_range.start >= self.drain_range.start {
+                    self.first_index
+                } else {
+                    first_range.end = self.drain_range.start;
+                    self.first_index + 1
+                };
+
+                let last_range = &mut self.uninitialized_ranges[self.next_index - 1];
+                let remove_end = if last_range.end <= self.drain_range.end {
+                    self.next_index
+                } else {
+                    last_range.start = self.drain_range.end;
+                    self.next_index - 1
+                };
+
+                self.uninitialized_ranges.drain(remove_start..remove_end);
+            }
+
+            None
+        }
+    }
+}
+
+impl MemoryInitTracker {
+    pub(crate) fn new(size: wgt::BufferAddress) -> Self {
+        Self {
+            uninitialized_ranges: vec![0..size],
+        }
+    }
+
+    // Search smallest range.end which is bigger than bound in O(log n) (with n being number of uninitialized ranges)
+    fn lower_bound(&self, bound: wgt::BufferAddress) -> usize {
+        // This is equivalent to the following, except that it may return an out-of-bounds index instead of None:
+        //self.uninitialized_ranges.iter().position(|r| r.end > bound)
+
+        // In future Rust versions this operation can be done with partition_point
+        // See https://github.com/rust-lang/rust/pull/73577/
+        let mut left = 0;
+        let mut right = self.uninitialized_ranges.len();
+
+        while left != right {
+            let mid = left + (right - left) / 2;
+            let value = unsafe { self.uninitialized_ranges.get_unchecked(mid) };
+
+            if value.end <= bound {
+                left = mid + 1;
+            } else {
+                right = mid;
+            }
+        }
+
+        left
+    }
+
+    // Checks if there's any uninitialized ranges within a query.
+    // If there are any, the range returned is the subrange of the query_range that contains all these uninitialized regions.
+    // Returned range may be larger than necessary (tradeoff for making this function O(log n))
+    pub(crate) fn check(
+        &self,
+        query_range: Range<wgt::BufferAddress>,
+    ) -> Option<Range<wgt::BufferAddress>> {
+        let index = self.lower_bound(query_range.start);
+        self.uninitialized_ranges
+            .get(index)
+            .map(|start_range| {
+                if start_range.start < query_range.end {
+                    let start = start_range.start.max(query_range.start);
+                    match self.uninitialized_ranges.get(index + 1) {
+                        Some(next_range) => {
+                            if next_range.start < query_range.end {
+                                // Would need to keep iterating for more accurate upper bound. Don't do that here.
+                                Some(start..query_range.end)
+                            } else {
+                                Some(start..start_range.end.min(query_range.end))
+                            }
+                        }
+                        None => Some(start..start_range.end.min(query_range.end)),
+                    }
+                } else {
+                    None
+                }
+            })
+            .flatten()
+    }
+
+    // Drains uninitialized ranges in a query range.
+    #[must_use]
+    pub(crate) fn drain<'a>(
+        &'a mut self,
+        drain_range: Range<wgt::BufferAddress>,
+    ) -> MemoryInitTrackerDrain<'a> {
+        let index = self.lower_bound(drain_range.start);
+        MemoryInitTrackerDrain {
+            drain_range,
+            uninitialized_ranges: &mut self.uninitialized_ranges,
+            first_index: index,
+            next_index: index,
+        }
+    }
+
+    // Clears uninitialized ranges in a query range.
+    pub(crate) fn clear(&mut self, range: Range<wgt::BufferAddress>) {
+        self.drain(range).for_each(drop);
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::MemoryInitTracker;
+    use std::ops::Range;
+
+    #[test]
+    fn check_for_newly_created_tracker() {
+        let tracker = MemoryInitTracker::new(10);
+        assert_eq!(tracker.check(0..10), Some(0..10));
+        assert_eq!(tracker.check(0..3), Some(0..3));
+        assert_eq!(tracker.check(3..4), Some(3..4));
+        assert_eq!(tracker.check(4..10), Some(4..10));
+    }
+
+    #[test]
+    fn check_for_cleared_tracker() {
+        let mut tracker = MemoryInitTracker::new(10);
+        tracker.clear(0..10);
+        assert_eq!(tracker.check(0..10), None);
+        assert_eq!(tracker.check(0..3), None);
+        assert_eq!(tracker.check(3..4), None);
+        assert_eq!(tracker.check(4..10), None);
+    }
+
+    #[test]
+    fn check_for_partially_filled_tracker() {
+        let mut tracker = MemoryInitTracker::new(25);
+        // Two regions of uninitialized memory
+        tracker.clear(0..5);
+        tracker.clear(10..15);
+        tracker.clear(20..25);
+
+        assert_eq!(tracker.check(0..25), Some(5..25)); // entire range
+
+        assert_eq!(tracker.check(0..5), None); // left non-overlapping
+        assert_eq!(tracker.check(3..8), Some(5..8)); // left overlapping region
+        assert_eq!(tracker.check(3..17), Some(5..17)); // left overlapping region + contained region
+
+        assert_eq!(tracker.check(8..22), Some(8..22)); // right overlapping region + contained region (yes, doesn't fix range end!)
+        assert_eq!(tracker.check(17..22), Some(17..20)); // right overlapping region
+        assert_eq!(tracker.check(20..25), None); // right non-overlapping
+    }
+
+    #[test]
+    fn clear_already_cleared() {
+        let mut tracker = MemoryInitTracker::new(30);
+        tracker.clear(10..20);
+
+        // Overlapping with non-cleared
+        tracker.clear(5..15); // Left overlap
+        tracker.clear(15..25); // Right overlap
+        tracker.clear(0..30); // Inner overlap
+
+        // Clear fully cleared
+        tracker.clear(0..30);
+
+        assert_eq!(tracker.check(0..30), None);
+    }
+
+    #[test]
+    fn drain_never_returns_ranges_twice_for_same_range() {
+        let mut tracker = MemoryInitTracker::new(19);
+        assert_eq!(tracker.drain(0..19).count(), 1);
+        assert_eq!(tracker.drain(0..19).count(), 0);
+
+        let mut tracker = MemoryInitTracker::new(17);
+        assert_eq!(tracker.drain(5..8).count(), 1);
+        assert_eq!(tracker.drain(5..8).count(), 0);
+        assert_eq!(tracker.drain(1..3).count(), 1);
+        assert_eq!(tracker.drain(1..3).count(), 0);
+        assert_eq!(tracker.drain(7..13).count(), 1);
+        assert_eq!(tracker.drain(7..13).count(), 0);
+    }
+
+    #[test]
+    fn drain_splits_ranges_correctly() {
+        let mut tracker = MemoryInitTracker::new(1337);
+        assert_eq!(
+            tracker
+                .drain(21..42)
+                .collect::<Vec<Range<wgt::BufferAddress>>>(),
+            vec![21..42]
+        );
+        assert_eq!(
+            tracker
+                .drain(900..1000)
+                .collect::<Vec<Range<wgt::BufferAddress>>>(),
+            vec![900..1000]
+        );
+
+        // Split ranges.
+        assert_eq!(
+            tracker
+                .drain(5..1003)
+                .collect::<Vec<Range<wgt::BufferAddress>>>(),
+            vec![5..21, 42..900, 1000..1003]
+        );
+        assert_eq!(
+            tracker
+                .drain(0..1337)
+                .collect::<Vec<Range<wgt::BufferAddress>>>(),
+            vec![0..5, 1003..1337]
+        );
+    }
+}
diff --git a/wgpu-core/src/resource.rs b/wgpu-core/src/resource.rs
index 23aab5c865..310bdd7a1a 100644
--- a/wgpu-core/src/resource.rs
+++ b/wgpu-core/src/resource.rs
@@ -6,6 +6,7 @@ use crate::{
     device::{alloc::MemoryBlock, DeviceError, HostMap},
     hub::Resource,
     id::{DeviceId, SwapChainId, TextureId},
+    memory_init_tracker::MemoryInitTracker,
     track::{TextureSelector, DUMMY_SELECTOR},
     validation::MissingBufferUsageError,
     Label, LifeGuard, RefCount, Stored,
@@ -160,7 +161,7 @@ pub struct Buffer<B: hal::Backend> {
     pub(crate) device_id: Stored<DeviceId>,
     pub(crate) usage: wgt::BufferUsage,
     pub(crate) size: wgt::BufferAddress,
-    pub(crate) full_range: (),
+    pub(crate) initialization_status: MemoryInitTracker,
     pub(crate) sync_mapped_writes: Option<HostMap>,
     pub(crate) life_guard: LifeGuard,
     pub(crate) map_state: BufferMapState<B>,