diff --git a/wgpu-core/src/command/render.rs b/wgpu-core/src/command/render.rs index c6842ab630..bab58d3e96 100644 --- a/wgpu-core/src/command/render.rs +++ b/wgpu-core/src/command/render.rs @@ -307,7 +307,6 @@ impl Global { let hub = B::hub(self); let mut token = Token::root(); - let (adapter_guard, mut token) = hub.adapters.read(&mut token); let (device_guard, mut token) = hub.devices.read(&mut token); let (mut cmb_guard, mut token) = hub.command_buffers.write(&mut token); @@ -371,13 +370,9 @@ impl Global { }; let (context, sample_count) = { - use hal::{adapter::PhysicalDevice as _, device::Device as _}; + use hal::device::Device as _; - let limits = adapter_guard[device.adapter_id.value] - .raw - .physical_device - .limits(); - let samples_count_limit = limits.framebuffer_color_sample_counts; + let samples_count_limit = device.hal_limits.framebuffer_color_sample_counts; let base_trackers = &cmb.trackers; let mut extent = None; diff --git a/wgpu-core/src/command/transfer.rs b/wgpu-core/src/command/transfer.rs index a674962c8f..2d4e5d3209 100644 --- a/wgpu-core/src/command/transfer.rs +++ b/wgpu-core/src/command/transfer.rs @@ -199,14 +199,15 @@ impl Global { .surface_desc() .bits as u32 / BITS_PER_BYTE; - let buffer_width = source.layout.bytes_per_row / bytes_per_texel; + assert_eq!(wgt::COPY_BYTES_PER_ROW_ALIGNMENT % bytes_per_texel, 0); assert_eq!( - source.layout.bytes_per_row % bytes_per_texel, + source.layout.bytes_per_row % wgt::COPY_BYTES_PER_ROW_ALIGNMENT, 0, - "Source bytes per row ({}) must be a multiple of bytes per texel ({})", + "Source bytes per row ({}) must be a multiple of {}", source.layout.bytes_per_row, - bytes_per_texel + wgt::COPY_BYTES_PER_ROW_ALIGNMENT ); + let buffer_width = source.layout.bytes_per_row / bytes_per_texel; let region = hal::command::BufferImageCopy { buffer_offset: source.layout.offset, buffer_width, @@ -286,14 +287,15 @@ impl Global { .surface_desc() .bits as u32 / BITS_PER_BYTE; - let buffer_width = 
destination.layout.bytes_per_row / bytes_per_texel; + assert_eq!(wgt::COPY_BYTES_PER_ROW_ALIGNMENT % bytes_per_texel, 0); assert_eq!( - destination.layout.bytes_per_row % bytes_per_texel, + destination.layout.bytes_per_row % wgt::COPY_BYTES_PER_ROW_ALIGNMENT, 0, - "Destination bytes per row ({}) must be a multiple of bytes per texel ({})", + "Destination bytes per row ({}) must be a multiple of {}", destination.layout.bytes_per_row, - bytes_per_texel + wgt::COPY_BYTES_PER_ROW_ALIGNMENT ); + let buffer_width = destination.layout.bytes_per_row / bytes_per_texel; let region = hal::command::BufferImageCopy { buffer_offset: destination.layout.offset, buffer_width, diff --git a/wgpu-core/src/device/mod.rs b/wgpu-core/src/device/mod.rs index 52a34757a3..829cee5697 100644 --- a/wgpu-core/src/device/mod.rs +++ b/wgpu-core/src/device/mod.rs @@ -202,6 +202,7 @@ pub struct Device { // Life tracker should be locked right after the device and before anything else. life_tracker: Mutex>, temp_suspected: life::SuspectedResources, + pub(crate) hal_limits: hal::Limits, pub(crate) private_features: PrivateFeatures, limits: wgt::Limits, extensions: wgt::Extensions, @@ -216,7 +217,7 @@ impl Device { adapter_id: Stored, queue_group: hal::queue::QueueGroup, mem_props: hal::adapter::MemoryProperties, - non_coherent_atom_size: u64, + hal_limits: hal::Limits, supports_texture_d24_s8: bool, desc: &wgt::DeviceDescriptor, trace_path: Option<&std::path::Path>, @@ -237,7 +238,7 @@ impl Device { gfx_memory::LinearConfig { linear_size: 0x100_0000, }, - non_coherent_atom_size, + hal_limits.non_coherent_atom_size as u64, ) }; #[cfg(not(feature = "trace"))] @@ -273,6 +274,7 @@ impl Device { None } }), + hal_limits, private_features: PrivateFeatures { supports_texture_d24_s8, }, diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs index 51c5b71c84..8c78081b9a 100644 --- a/wgpu-core/src/device/queue.rs +++ b/wgpu-core/src/device/queue.rs @@ -61,19 +61,16 @@ impl PendingWrites { } 
impl super::Device { - fn stage_data(&mut self, data: &[u8]) -> StagingData { + fn prepare_stage(&mut self, size: wgt::BufferAddress) -> StagingData { let mut buffer = unsafe { self.raw - .create_buffer( - data.len() as wgt::BufferAddress, - hal::buffer::Usage::TRANSFER_SRC, - ) + .create_buffer(size, hal::buffer::Usage::TRANSFER_SRC) .unwrap() }; //TODO: do we need to transition into HOST_WRITE access first? let requirements = unsafe { self.raw.get_buffer_requirements(&buffer) }; - let mut memory = self + let memory = self .mem_allocator .lock() .allocate( @@ -92,12 +89,6 @@ impl super::Device { .unwrap(); } - let mut mapped = memory.map(&self.raw, hal::memory::Segment::ALL).unwrap(); - unsafe { mapped.write(&self.raw, hal::memory::Segment::ALL) } - .unwrap() - .slice[..data.len()] - .copy_from_slice(data); - let comb = match self.pending_writes.command_buffer.take() { Some(comb) => comb, None => { @@ -147,7 +138,17 @@ impl Global { None => {} } - let mut stage = device.stage_data(data); + let mut stage = device.prepare_stage(data.len() as wgt::BufferAddress); + { + let mut mapped = stage + .memory + .map(&device.raw, hal::memory::Segment::ALL) + .unwrap(); + unsafe { mapped.write(&device.raw, hal::memory::Segment::ALL) } + .unwrap() + .slice[..data.len()] + .copy_from_slice(data); + } let mut trackers = device.trackers.lock(); let (dst, transition) = @@ -217,7 +218,45 @@ impl Global { None => {} } - let mut stage = device.stage_data(data); + let texture_format = texture_guard[destination.texture].format; + let bytes_per_texel = conv::map_texture_format(texture_format, device.private_features) + .surface_desc() + .bits as u32 + / BITS_PER_BYTE; + + let stage_bytes_per_row = get_lowest_common_denom( + device.hal_limits.optimal_buffer_copy_pitch_alignment as u32, + bytes_per_texel, + ); + let stage_size = stage_bytes_per_row as u64 + * ((size.depth - 1) * data_layout.rows_per_image + size.height) as u64; + let mut stage = device.prepare_stage(stage_size); + { + let 
mut mapped = stage + .memory + .map(&device.raw, hal::memory::Segment::ALL) + .unwrap(); + let mapping = unsafe { mapped.write(&device.raw, hal::memory::Segment::ALL) }.unwrap(); + if stage_bytes_per_row == data_layout.bytes_per_row { + // Unlikely case of the data already being aligned optimally. + mapping.slice[..stage_size as usize].copy_from_slice(data); + } else { + // Copy row by row into the optimal alignment. + let copy_bytes_per_row = + stage_bytes_per_row.min(data_layout.bytes_per_row) as usize; + for layer in 0..size.depth { + let rows_offset = layer * data_layout.rows_per_image; + for row in 0..size.height { + let data_offset = + (rows_offset + row) as usize * data_layout.bytes_per_row as usize; + let stage_offset = + (rows_offset + row) as usize * stage_bytes_per_row as usize; + mapping.slice[stage_offset..stage_offset + copy_bytes_per_row] + .copy_from_slice(&data[data_offset..data_offset + copy_bytes_per_row]); + } + } + } + } let mut trackers = device.trackers.lock(); let (dst, transition) = trackers.textures.use_replace( @@ -235,21 +274,9 @@ impl Global { let last_submit_index = device.life_guard.submission_index.load(Ordering::Relaxed); dst.life_guard.use_at(last_submit_index + 1); - let bytes_per_texel = conv::map_texture_format(dst.format, device.private_features) - .surface_desc() - .bits as u32 - / BITS_PER_BYTE; - let buffer_width = data_layout.bytes_per_row / bytes_per_texel; - assert_eq!( - data_layout.bytes_per_row % bytes_per_texel, - 0, - "Source bytes per row ({}) must be a multiple of bytes per texel ({})", - data_layout.bytes_per_row, - bytes_per_texel - ); let region = hal::command::BufferImageCopy { buffer_offset: 0, - buffer_width, + buffer_width: stage_bytes_per_row / bytes_per_texel, buffer_height: data_layout.rows_per_image, image_layers, image_offset, @@ -467,3 +494,40 @@ impl Global { super::fire_map_callbacks(callbacks); } } + +fn get_lowest_common_denom(a: u32, b: u32) -> u32 { + let gcd = if a >= b { + 
get_greatest_common_divisor(a, b) + } else { + get_greatest_common_divisor(b, a) + }; + a * b / gcd +} + +fn get_greatest_common_divisor(mut a: u32, mut b: u32) -> u32 { + assert!(a >= b); + loop { + let c = a % b; + if c == 0 { + return b; + } else { + a = b; + b = c; + } + } +} + +#[test] +fn test_lcd() { + assert_eq!(get_lowest_common_denom(2, 2), 2); + assert_eq!(get_lowest_common_denom(2, 3), 6); + assert_eq!(get_lowest_common_denom(6, 4), 12); +} + +#[test] +fn test_gcd() { + assert_eq!(get_greatest_common_divisor(5, 1), 1); + assert_eq!(get_greatest_common_divisor(4, 2), 2); + assert_eq!(get_greatest_common_divisor(6, 4), 2); + assert_eq!(get_greatest_common_divisor(7, 7), 7); +} diff --git a/wgpu-core/src/instance.rs b/wgpu-core/src/instance.rs index e99172de6e..5fbe6eff70 100644 --- a/wgpu-core/src/instance.rs +++ b/wgpu-core/src/instance.rs @@ -18,7 +18,6 @@ use serde::Deserialize; use serde::Serialize; use hal::{ - self, adapter::{AdapterInfo as HalAdapterInfo, DeviceType as HalDeviceType, PhysicalDevice as _}, queue::QueueFamily as _, window::Surface as _, @@ -652,7 +651,7 @@ impl Global { }, gpu.queue_groups.swap_remove(0), mem_props, - limits.non_coherent_atom_size as u64, + limits, supports_texture_d24_s8, desc, trace_path, diff --git a/wgpu-core/src/swap_chain.rs b/wgpu-core/src/swap_chain.rs index 5f465003f9..67e3ed17b4 100644 --- a/wgpu-core/src/swap_chain.rs +++ b/wgpu-core/src/swap_chain.rs @@ -101,6 +101,7 @@ impl Global { let (device_guard, mut token) = hub.devices.read(&mut token); let (mut swap_chain_guard, mut token) = hub.swap_chains.write(&mut token); let sc = &mut swap_chain_guard[swap_chain_id]; + #[cfg_attr(not(feature = "trace"), allow(unused_variables))] let device = &device_guard[sc.device_id.value]; let suf = B::get_surface_mut(surface); diff --git a/wgpu-types/src/lib.rs b/wgpu-types/src/lib.rs index 1059271f2b..d59e650617 100644 --- a/wgpu-types/src/lib.rs +++ b/wgpu-types/src/lib.rs @@ -10,6 +10,11 @@ use serde::Deserialize; use 
serde::Serialize; use std::{io, ptr, slice}; +/// Buffer-Texture copies on command encoders must have `bytes_per_row` +/// aligned to this number. +/// +/// This doesn't apply to `Queue::write_texture`. +pub const COPY_BYTES_PER_ROW_ALIGNMENT: u32 = 256; /// Bound uniform/storage buffer offsets must be aligned to this number. pub const BIND_BUFFER_ALIGNMENT: u64 = 256;