diff --git a/wgpu-core/src/command/render.rs b/wgpu-core/src/command/render.rs index c6842ab630..bab58d3e96 100644 --- a/wgpu-core/src/command/render.rs +++ b/wgpu-core/src/command/render.rs @@ -307,7 +307,6 @@ impl Global { let hub = B::hub(self); let mut token = Token::root(); - let (adapter_guard, mut token) = hub.adapters.read(&mut token); let (device_guard, mut token) = hub.devices.read(&mut token); let (mut cmb_guard, mut token) = hub.command_buffers.write(&mut token); @@ -371,13 +370,9 @@ impl Global { }; let (context, sample_count) = { - use hal::{adapter::PhysicalDevice as _, device::Device as _}; + use hal::device::Device as _; - let limits = adapter_guard[device.adapter_id.value] - .raw - .physical_device - .limits(); - let samples_count_limit = limits.framebuffer_color_sample_counts; + let samples_count_limit = device.hal_limits.framebuffer_color_sample_counts; let base_trackers = &cmb.trackers; let mut extent = None; diff --git a/wgpu-core/src/command/transfer.rs b/wgpu-core/src/command/transfer.rs index a674962c8f..2d4e5d3209 100644 --- a/wgpu-core/src/command/transfer.rs +++ b/wgpu-core/src/command/transfer.rs @@ -199,14 +199,15 @@ impl Global { .surface_desc() .bits as u32 / BITS_PER_BYTE; - let buffer_width = source.layout.bytes_per_row / bytes_per_texel; + assert_eq!(wgt::COPY_BYTES_PER_ROW_ALIGNMENT % bytes_per_texel, 0); assert_eq!( - source.layout.bytes_per_row % bytes_per_texel, + source.layout.bytes_per_row % wgt::COPY_BYTES_PER_ROW_ALIGNMENT, 0, - "Source bytes per row ({}) must be a multiple of bytes per texel ({})", + "Source bytes per row ({}) must be a multiple of {}", source.layout.bytes_per_row, - bytes_per_texel + wgt::COPY_BYTES_PER_ROW_ALIGNMENT ); + let buffer_width = source.layout.bytes_per_row / bytes_per_texel; let region = hal::command::BufferImageCopy { buffer_offset: source.layout.offset, buffer_width, @@ -286,14 +287,15 @@ impl Global { .surface_desc() .bits as u32 / BITS_PER_BYTE; - let buffer_width = 
destination.layout.bytes_per_row / bytes_per_texel; + assert_eq!(wgt::COPY_BYTES_PER_ROW_ALIGNMENT % bytes_per_texel, 0); assert_eq!( - destination.layout.bytes_per_row % bytes_per_texel, + destination.layout.bytes_per_row % wgt::COPY_BYTES_PER_ROW_ALIGNMENT, 0, - "Destination bytes per row ({}) must be a multiple of bytes per texel ({})", + "Destination bytes per row ({}) must be a multiple of {}", destination.layout.bytes_per_row, - bytes_per_texel + wgt::COPY_BYTES_PER_ROW_ALIGNMENT ); + let buffer_width = destination.layout.bytes_per_row / bytes_per_texel; let region = hal::command::BufferImageCopy { buffer_offset: destination.layout.offset, buffer_width, diff --git a/wgpu-core/src/device/mod.rs b/wgpu-core/src/device/mod.rs index 52a34757a3..829cee5697 100644 --- a/wgpu-core/src/device/mod.rs +++ b/wgpu-core/src/device/mod.rs @@ -202,6 +202,7 @@ pub struct Device { // Life tracker should be locked right after the device and before anything else. life_tracker: Mutex>, temp_suspected: life::SuspectedResources, + pub(crate) hal_limits: hal::Limits, pub(crate) private_features: PrivateFeatures, limits: wgt::Limits, extensions: wgt::Extensions, @@ -216,7 +217,7 @@ impl Device { adapter_id: Stored, queue_group: hal::queue::QueueGroup, mem_props: hal::adapter::MemoryProperties, - non_coherent_atom_size: u64, + hal_limits: hal::Limits, supports_texture_d24_s8: bool, desc: &wgt::DeviceDescriptor, trace_path: Option<&std::path::Path>, @@ -237,7 +238,7 @@ impl Device { gfx_memory::LinearConfig { linear_size: 0x100_0000, }, - non_coherent_atom_size, + hal_limits.non_coherent_atom_size as u64, ) }; #[cfg(not(feature = "trace"))] @@ -273,6 +274,7 @@ impl Device { None } }), + hal_limits, private_features: PrivateFeatures { supports_texture_d24_s8, }, diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs index 51c5b71c84..8c78081b9a 100644 --- a/wgpu-core/src/device/queue.rs +++ b/wgpu-core/src/device/queue.rs @@ -61,19 +61,16 @@ impl PendingWrites { } 
impl super::Device { - fn stage_data(&mut self, data: &[u8]) -> StagingData { + fn prepare_stage(&mut self, size: wgt::BufferAddress) -> StagingData { let mut buffer = unsafe { self.raw - .create_buffer( - data.len() as wgt::BufferAddress, - hal::buffer::Usage::TRANSFER_SRC, - ) + .create_buffer(size, hal::buffer::Usage::TRANSFER_SRC) .unwrap() }; //TODO: do we need to transition into HOST_WRITE access first? let requirements = unsafe { self.raw.get_buffer_requirements(&buffer) }; - let mut memory = self + let memory = self .mem_allocator .lock() .allocate( @@ -92,12 +89,6 @@ impl super::Device { .unwrap(); } - let mut mapped = memory.map(&self.raw, hal::memory::Segment::ALL).unwrap(); - unsafe { mapped.write(&self.raw, hal::memory::Segment::ALL) } - .unwrap() - .slice[..data.len()] - .copy_from_slice(data); - let comb = match self.pending_writes.command_buffer.take() { Some(comb) => comb, None => { @@ -147,7 +138,17 @@ impl Global { None => {} } - let mut stage = device.stage_data(data); + let mut stage = device.prepare_stage(data.len() as wgt::BufferAddress); + { + let mut mapped = stage + .memory + .map(&device.raw, hal::memory::Segment::ALL) + .unwrap(); + unsafe { mapped.write(&device.raw, hal::memory::Segment::ALL) } + .unwrap() + .slice[..data.len()] + .copy_from_slice(data); + } let mut trackers = device.trackers.lock(); let (dst, transition) = @@ -217,7 +218,45 @@ impl Global { None => {} } - let mut stage = device.stage_data(data); + let texture_format = texture_guard[destination.texture].format; + let bytes_per_texel = conv::map_texture_format(texture_format, device.private_features) + .surface_desc() + .bits as u32 + / BITS_PER_BYTE; + + let stage_bytes_per_row = get_lowest_common_denom( + device.hal_limits.optimal_buffer_copy_pitch_alignment as u32, + bytes_per_texel, + ); + let stage_size = stage_bytes_per_row as u64 + * ((size.depth - 1) * data_layout.rows_per_image + size.height) as u64; + let mut stage = device.prepare_stage(stage_size); + { + let 
mut mapped = stage + .memory + .map(&device.raw, hal::memory::Segment::ALL) + .unwrap(); + let mapping = unsafe { mapped.write(&device.raw, hal::memory::Segment::ALL) }.unwrap(); + if stage_bytes_per_row == data_layout.bytes_per_row { + // Unlikely case of the data already being aligned optimally. + mapping.slice[..stage_size as usize].copy_from_slice(data); + } else { + // Copy row by row into the optimal alignment. + let copy_bytes_per_row = + stage_bytes_per_row.min(data_layout.bytes_per_row) as usize; + for layer in 0..size.depth { + let rows_offset = layer * data_layout.rows_per_image; + for row in 0..size.height { + let data_offset = + (rows_offset + row) as usize * data_layout.bytes_per_row as usize; + let stage_offset = + (rows_offset + row) as usize * stage_bytes_per_row as usize; + mapping.slice[stage_offset..stage_offset + copy_bytes_per_row] + .copy_from_slice(&data[data_offset..data_offset + copy_bytes_per_row]); + } + } + } + } let mut trackers = device.trackers.lock(); let (dst, transition) = trackers.textures.use_replace( @@ -235,21 +274,9 @@ impl Global { let last_submit_index = device.life_guard.submission_index.load(Ordering::Relaxed); dst.life_guard.use_at(last_submit_index + 1); - let bytes_per_texel = conv::map_texture_format(dst.format, device.private_features) - .surface_desc() - .bits as u32 - / BITS_PER_BYTE; - let buffer_width = data_layout.bytes_per_row / bytes_per_texel; - assert_eq!( - data_layout.bytes_per_row % bytes_per_texel, - 0, - "Source bytes per row ({}) must be a multiple of bytes per texel ({})", - data_layout.bytes_per_row, - bytes_per_texel - ); let region = hal::command::BufferImageCopy { buffer_offset: 0, - buffer_width, + buffer_width: stage_bytes_per_row / bytes_per_texel, buffer_height: data_layout.rows_per_image, image_layers, image_offset, @@ -467,3 +494,40 @@ impl Global { super::fire_map_callbacks(callbacks); } } + +fn get_lowest_common_denom(a: u32, b: u32) -> u32 { + let gcd = if a >= b { + 
get_greatest_common_divisor(a, b) + } else { + get_greatest_common_divisor(b, a) + }; + a * b / gcd +} + +fn get_greatest_common_divisor(mut a: u32, mut b: u32) -> u32 { + assert!(a >= b); + loop { + let c = a % b; + if c == 0 { + return b; + } else { + a = b; + b = c; + } + } +} + +#[test] +fn test_lcd() { + assert_eq!(get_lowest_common_denom(2, 2), 2); + assert_eq!(get_lowest_common_denom(2, 3), 6); + assert_eq!(get_lowest_common_denom(6, 4), 12); +} + +#[test] +fn test_gcd() { + assert_eq!(get_greatest_common_divisor(5, 1), 1); + assert_eq!(get_greatest_common_divisor(4, 2), 2); + assert_eq!(get_greatest_common_divisor(6, 4), 2); + assert_eq!(get_greatest_common_divisor(7, 7), 7); +} diff --git a/wgpu-core/src/instance.rs b/wgpu-core/src/instance.rs index e99172de6e..5fbe6eff70 100644 --- a/wgpu-core/src/instance.rs +++ b/wgpu-core/src/instance.rs @@ -18,7 +18,6 @@ use serde::Deserialize; use serde::Serialize; use hal::{ - self, adapter::{AdapterInfo as HalAdapterInfo, DeviceType as HalDeviceType, PhysicalDevice as _}, queue::QueueFamily as _, window::Surface as _, @@ -652,7 +651,7 @@ impl Global { }, gpu.queue_groups.swap_remove(0), mem_props, - limits.non_coherent_atom_size as u64, + limits, supports_texture_d24_s8, desc, trace_path, diff --git a/wgpu-core/src/swap_chain.rs b/wgpu-core/src/swap_chain.rs index 5f465003f9..67e3ed17b4 100644 --- a/wgpu-core/src/swap_chain.rs +++ b/wgpu-core/src/swap_chain.rs @@ -101,6 +101,7 @@ impl Global { let (device_guard, mut token) = hub.devices.read(&mut token); let (mut swap_chain_guard, mut token) = hub.swap_chains.write(&mut token); let sc = &mut swap_chain_guard[swap_chain_id]; + #[cfg_attr(not(feature = "trace"), allow(unused_variables))] let device = &device_guard[sc.device_id.value]; let suf = B::get_surface_mut(surface); diff --git a/wgpu-types/src/lib.rs b/wgpu-types/src/lib.rs index 1059271f2b..d59e650617 100644 --- a/wgpu-types/src/lib.rs +++ b/wgpu-types/src/lib.rs @@ -10,6 +10,11 @@ use serde::Deserialize; use 
serde::Serialize; use std::{io, ptr, slice}; +/// Buffer-Texture copies on command encoders must have `bytes_per_row` +/// aligned to this number. +/// +/// This doesn't apply to `Queue::write_texture`. +pub const COPY_BYTES_PER_ROW_ALIGNMENT: u32 = 256; /// Bound uniform/storage buffer offsets must be aligned to this number. pub const BIND_BUFFER_ALIGNMENT: u64 = 256;