Merge #679

679: Enforce copy buffer-texture alignment r=cwfitzgerald a=kvark **Connections** This is a follow-up to #666 **Description** We are now enforcing the `bytes_per_row` on copy-texture copies to `COPY_BYTES_PER_ROW_ALIGNMENT`. We allow it being non-aligned for `write_texture`, which now has the code to properly align the staging space it uses, and to copy the rows one by one with proper alignment. We are also moving `BufferSize` to wgpu-types, because it's required for wgpu-rs to build. **Testing** Testing this needs https://github.com/gfx-rs/wgpu-rs/pull/328, which is blocked by https://github.com/gfx-rs/wgpu-rs/pull/323 Co-authored-by: Dzmitry Malyshau <kvarkus@gmail.com>
2026-04-22 03:02:01 -04:00 · 2020-06-01 20:52:38 +00:00
parent 3a6cdeec94 321a5cee0f
commit 201073b0f6
7 changed files with 114 additions and 46 deletions
--- a/wgpu-core/src/command/render.rs
+++ b/wgpu-core/src/command/render.rs
@@ -307,7 +307,6 @@ impl<G: GlobalIdentityHandlerFactory> Global<G> {
        let hub = B::hub(self);
        let mut token = Token::root();

-        let (adapter_guard, mut token) = hub.adapters.read(&mut token);
        let (device_guard, mut token) = hub.devices.read(&mut token);
        let (mut cmb_guard, mut token) = hub.command_buffers.write(&mut token);

@@ -371,13 +370,9 @@ impl<G: GlobalIdentityHandlerFactory> Global<G> {
        };

        let (context, sample_count) = {
-            use hal::{adapter::PhysicalDevice as _, device::Device as _};
+            use hal::device::Device as _;

-            let limits = adapter_guard[device.adapter_id.value]
-                .raw
-                .physical_device
-                .limits();
-            let samples_count_limit = limits.framebuffer_color_sample_counts;
+            let samples_count_limit = device.hal_limits.framebuffer_color_sample_counts;
            let base_trackers = &cmb.trackers;

            let mut extent = None;
--- a/wgpu-core/src/command/transfer.rs
+++ b/wgpu-core/src/command/transfer.rs
@@ -199,14 +199,15 @@ impl<G: GlobalIdentityHandlerFactory> Global<G> {
            .surface_desc()
            .bits as u32
            / BITS_PER_BYTE;
-        let buffer_width = source.layout.bytes_per_row / bytes_per_texel;
+        assert_eq!(wgt::COPY_BYTES_PER_ROW_ALIGNMENT % bytes_per_texel, 0);
        assert_eq!(
-            source.layout.bytes_per_row % bytes_per_texel,
+            source.layout.bytes_per_row % wgt::COPY_BYTES_PER_ROW_ALIGNMENT,
            0,
-            "Source bytes per row ({}) must be a multiple of bytes per texel ({})",
+            "Source bytes per row ({}) must be a multiple of {}",
            source.layout.bytes_per_row,
-            bytes_per_texel
+            wgt::COPY_BYTES_PER_ROW_ALIGNMENT
        );
+        let buffer_width = source.layout.bytes_per_row / bytes_per_texel;
        let region = hal::command::BufferImageCopy {
            buffer_offset: source.layout.offset,
            buffer_width,
@@ -286,14 +287,15 @@ impl<G: GlobalIdentityHandlerFactory> Global<G> {
            .surface_desc()
            .bits as u32
            / BITS_PER_BYTE;
-        let buffer_width = destination.layout.bytes_per_row / bytes_per_texel;
+        assert_eq!(wgt::COPY_BYTES_PER_ROW_ALIGNMENT % bytes_per_texel, 0);
        assert_eq!(
-            destination.layout.bytes_per_row % bytes_per_texel,
+            destination.layout.bytes_per_row % wgt::COPY_BYTES_PER_ROW_ALIGNMENT,
            0,
-            "Destination bytes per row ({}) must be a multiple of bytes per texel ({})",
+            "Destination bytes per row ({}) must be a multiple of {}",
            destination.layout.bytes_per_row,
-            bytes_per_texel
+            wgt::COPY_BYTES_PER_ROW_ALIGNMENT
        );
+        let buffer_width = destination.layout.bytes_per_row / bytes_per_texel;
        let region = hal::command::BufferImageCopy {
            buffer_offset: destination.layout.offset,
            buffer_width,
--- a/wgpu-core/src/device/mod.rs
+++ b/wgpu-core/src/device/mod.rs
@@ -202,6 +202,7 @@ pub struct Device<B: hal::Backend> {
    // Life tracker should be locked right after the device and before anything else.
    life_tracker: Mutex<life::LifetimeTracker<B>>,
    temp_suspected: life::SuspectedResources,
+    pub(crate) hal_limits: hal::Limits,
    pub(crate) private_features: PrivateFeatures,
    limits: wgt::Limits,
    extensions: wgt::Extensions,
@@ -216,7 +217,7 @@ impl<B: GfxBackend> Device<B> {
        adapter_id: Stored<id::AdapterId>,
        queue_group: hal::queue::QueueGroup<B>,
        mem_props: hal::adapter::MemoryProperties,
-        non_coherent_atom_size: u64,
+        hal_limits: hal::Limits,
        supports_texture_d24_s8: bool,
        desc: &wgt::DeviceDescriptor,
        trace_path: Option<&std::path::Path>,
@@ -237,7 +238,7 @@ impl<B: GfxBackend> Device<B> {
                gfx_memory::LinearConfig {
                    linear_size: 0x100_0000,
                },
-                non_coherent_atom_size,
+                hal_limits.non_coherent_atom_size as u64,
            )
        };
        #[cfg(not(feature = "trace"))]
@@ -273,6 +274,7 @@ impl<B: GfxBackend> Device<B> {
                    None
                }
            }),
+            hal_limits,
            private_features: PrivateFeatures {
                supports_texture_d24_s8,
            },
--- a/wgpu-core/src/device/queue.rs
+++ b/wgpu-core/src/device/queue.rs
@@ -61,19 +61,16 @@ impl<B: hal::Backend> PendingWrites<B> {
 }

 impl<B: hal::Backend> super::Device<B> {
-    fn stage_data(&mut self, data: &[u8]) -> StagingData<B> {
+    fn prepare_stage(&mut self, size: wgt::BufferAddress) -> StagingData<B> {
        let mut buffer = unsafe {
            self.raw
-                .create_buffer(
-                    data.len() as wgt::BufferAddress,
-                    hal::buffer::Usage::TRANSFER_SRC,
-                )
+                .create_buffer(size, hal::buffer::Usage::TRANSFER_SRC)
                .unwrap()
        };
        //TODO: do we need to transition into HOST_WRITE access first?
        let requirements = unsafe { self.raw.get_buffer_requirements(&buffer) };

-        let mut memory = self
+        let memory = self
            .mem_allocator
            .lock()
            .allocate(
@@ -92,12 +89,6 @@ impl<B: hal::Backend> super::Device<B> {
                .unwrap();
        }

-        let mut mapped = memory.map(&self.raw, hal::memory::Segment::ALL).unwrap();
-        unsafe { mapped.write(&self.raw, hal::memory::Segment::ALL) }
-            .unwrap()
-            .slice[..data.len()]
-            .copy_from_slice(data);
-
        let comb = match self.pending_writes.command_buffer.take() {
            Some(comb) => comb,
            None => {
@@ -147,7 +138,17 @@ impl<G: GlobalIdentityHandlerFactory> Global<G> {
            None => {}
        }

-        let mut stage = device.stage_data(data);
+        let mut stage = device.prepare_stage(data.len() as wgt::BufferAddress);
+        {
+            let mut mapped = stage
+                .memory
+                .map(&device.raw, hal::memory::Segment::ALL)
+                .unwrap();
+            unsafe { mapped.write(&device.raw, hal::memory::Segment::ALL) }
+                .unwrap()
+                .slice[..data.len()]
+                .copy_from_slice(data);
+        }

        let mut trackers = device.trackers.lock();
        let (dst, transition) =
@@ -217,7 +218,45 @@ impl<G: GlobalIdentityHandlerFactory> Global<G> {
            None => {}
        }

-        let mut stage = device.stage_data(data);
+        let texture_format = texture_guard[destination.texture].format;
+        let bytes_per_texel = conv::map_texture_format(texture_format, device.private_features)
+            .surface_desc()
+            .bits as u32
+            / BITS_PER_BYTE;
+
+        let stage_bytes_per_row = get_lowest_common_denom(
+            device.hal_limits.optimal_buffer_copy_pitch_alignment as u32,
+            bytes_per_texel,
+        );
+        let stage_size = stage_bytes_per_row as u64
+            * ((size.depth - 1) * data_layout.rows_per_image + size.height) as u64;
+        let mut stage = device.prepare_stage(stage_size);
+        {
+            let mut mapped = stage
+                .memory
+                .map(&device.raw, hal::memory::Segment::ALL)
+                .unwrap();
+            let mapping = unsafe { mapped.write(&device.raw, hal::memory::Segment::ALL) }.unwrap();
+            if stage_bytes_per_row == data_layout.bytes_per_row {
+                // Unlikely case of the data already being aligned optimally.
+                mapping.slice[..stage_size as usize].copy_from_slice(data);
+            } else {
+                // Copy row by row into the optimal alignment.
+                let copy_bytes_per_row =
+                    stage_bytes_per_row.min(data_layout.bytes_per_row) as usize;
+                for layer in 0..size.depth {
+                    let rows_offset = layer * data_layout.rows_per_image;
+                    for row in 0..size.height {
+                        let data_offset =
+                            (rows_offset + row) as usize * data_layout.bytes_per_row as usize;
+                        let stage_offset =
+                            (rows_offset + row) as usize * stage_bytes_per_row as usize;
+                        mapping.slice[stage_offset..stage_offset + copy_bytes_per_row]
+                            .copy_from_slice(&data[data_offset..data_offset + copy_bytes_per_row]);
+                    }
+                }
+            }
+        }

        let mut trackers = device.trackers.lock();
        let (dst, transition) = trackers.textures.use_replace(
@@ -235,21 +274,9 @@ impl<G: GlobalIdentityHandlerFactory> Global<G> {
        let last_submit_index = device.life_guard.submission_index.load(Ordering::Relaxed);
        dst.life_guard.use_at(last_submit_index + 1);

-        let bytes_per_texel = conv::map_texture_format(dst.format, device.private_features)
-            .surface_desc()
-            .bits as u32
-            / BITS_PER_BYTE;
-        let buffer_width = data_layout.bytes_per_row / bytes_per_texel;
-        assert_eq!(
-            data_layout.bytes_per_row % bytes_per_texel,
-            0,
-            "Source bytes per row ({}) must be a multiple of bytes per texel ({})",
-            data_layout.bytes_per_row,
-            bytes_per_texel
-        );
        let region = hal::command::BufferImageCopy {
            buffer_offset: 0,
-            buffer_width,
+            buffer_width: stage_bytes_per_row / bytes_per_texel,
            buffer_height: data_layout.rows_per_image,
            image_layers,
            image_offset,
@@ -467,3 +494,40 @@ impl<G: GlobalIdentityHandlerFactory> Global<G> {
        super::fire_map_callbacks(callbacks);
    }
 }
+
+fn get_lowest_common_denom(a: u32, b: u32) -> u32 {
+    let gcd = if a >= b {
+        get_greatest_common_divisor(a, b)
+    } else {
+        get_greatest_common_divisor(b, a)
+    };
+    a * b / gcd
+}
+
+fn get_greatest_common_divisor(mut a: u32, mut b: u32) -> u32 {
+    assert!(a >= b);
+    loop {
+        let c = a % b;
+        if c == 0 {
+            return b;
+        } else {
+            a = b;
+            b = c;
+        }
+    }
+}
+
+#[test]
+fn test_lcd() {
+    assert_eq!(get_lowest_common_denom(2, 2), 2);
+    assert_eq!(get_lowest_common_denom(2, 3), 6);
+    assert_eq!(get_lowest_common_denom(6, 4), 12);
+}
+
+#[test]
+fn test_gcd() {
+    assert_eq!(get_greatest_common_divisor(5, 1), 1);
+    assert_eq!(get_greatest_common_divisor(4, 2), 2);
+    assert_eq!(get_greatest_common_divisor(6, 4), 2);
+    assert_eq!(get_greatest_common_divisor(7, 7), 7);
+}
--- a/wgpu-core/src/instance.rs
+++ b/wgpu-core/src/instance.rs
@@ -18,7 +18,6 @@ use serde::Deserialize;
 use serde::Serialize;

 use hal::{
-    self,
    adapter::{AdapterInfo as HalAdapterInfo, DeviceType as HalDeviceType, PhysicalDevice as _},
    queue::QueueFamily as _,
    window::Surface as _,
@@ -652,7 +651,7 @@ impl<G: GlobalIdentityHandlerFactory> Global<G> {
                },
                gpu.queue_groups.swap_remove(0),
                mem_props,
-                limits.non_coherent_atom_size as u64,
+                limits,
                supports_texture_d24_s8,
                desc,
                trace_path,
--- a/wgpu-core/src/swap_chain.rs
+++ b/wgpu-core/src/swap_chain.rs
@@ -101,6 +101,7 @@ impl<G: GlobalIdentityHandlerFactory> Global<G> {
        let (device_guard, mut token) = hub.devices.read(&mut token);
        let (mut swap_chain_guard, mut token) = hub.swap_chains.write(&mut token);
        let sc = &mut swap_chain_guard[swap_chain_id];
+        #[cfg_attr(not(feature = "trace"), allow(unused_variables))]
        let device = &device_guard[sc.device_id.value];

        let suf = B::get_surface_mut(surface);
--- a/wgpu-types/src/lib.rs
+++ b/wgpu-types/src/lib.rs
@@ -10,6 +10,11 @@ use serde::Deserialize;
 use serde::Serialize;
 use std::{io, ptr, slice};

+/// Buffer-Texture copies on command encoders have to have the `bytes_per_row`
+/// aligned to this number.
+///
+/// This doesn't apply to `Queue::write_texture`.
+pub const COPY_BYTES_PER_ROW_ALIGNMENT: u32 = 256;
 /// Bound uniform/storage buffer offsets must be aligned to this number.
 pub const BIND_BUFFER_ALIGNMENT: u64 = 256;