use std::num::NonZeroU64; use wgpu::{ include_wgsl, BindGroupDescriptor, BindGroupEntry, BindGroupLayoutDescriptor, BindGroupLayoutEntry, BindingResource, BindingType, BufferBinding, BufferBindingType, BufferDescriptor, BufferUsages, CommandEncoderDescriptor, ComputePassDescriptor, ComputePipelineDescriptor, DownlevelFlags, Limits, MapMode, PipelineLayoutDescriptor, PollType, ShaderStages, }; use wgpu_test::{gpu_test, GpuTestConfiguration, TestParameters}; #[gpu_test] static ZERO_INIT_WORKGROUP_MEMORY: GpuTestConfiguration = GpuTestConfiguration::new() .parameters( TestParameters::default() .downlevel_flags(DownlevelFlags::COMPUTE_SHADERS) .limits(Limits::downlevel_defaults()), ) .run_async(|ctx| async move { let bgl = ctx .device .create_bind_group_layout(&BindGroupLayoutDescriptor { label: None, entries: &[BindGroupLayoutEntry { binding: 0, visibility: ShaderStages::COMPUTE, ty: BindingType::Buffer { ty: BufferBindingType::Storage { read_only: false }, has_dynamic_offset: true, min_binding_size: None, }, count: None, }], }); let output_buffer = ctx.device.create_buffer(&BufferDescriptor { label: Some("output buffer"), size: BUFFER_SIZE, usage: BufferUsages::COPY_DST | BufferUsages::COPY_SRC | BufferUsages::STORAGE, mapped_at_creation: false, }); let mapping_buffer = ctx.device.create_buffer(&BufferDescriptor { label: Some("mapping buffer"), size: BUFFER_SIZE, usage: BufferUsages::COPY_DST | BufferUsages::MAP_READ, mapped_at_creation: false, }); let bg = ctx.device.create_bind_group(&BindGroupDescriptor { label: None, layout: &bgl, entries: &[BindGroupEntry { binding: 0, resource: BindingResource::Buffer(BufferBinding { buffer: &output_buffer, offset: 0, size: Some(NonZeroU64::new(BUFFER_BINDING_SIZE as u64).unwrap()), }), }], }); let pll = ctx .device .create_pipeline_layout(&PipelineLayoutDescriptor { label: None, bind_group_layouts: &[&bgl], push_constant_ranges: &[], }); let sm = ctx .device .create_shader_module(include_wgsl!("zero_init_workgroup_mem.wgsl")); let pipeline_read = ctx .device .create_compute_pipeline(&ComputePipelineDescriptor { label: Some("pipeline read"), layout: Some(&pll), module: &sm, entry_point: Some("read"), compilation_options: Default::default(), cache: None, }); let pipeline_write = ctx .device .create_compute_pipeline(&ComputePipelineDescriptor { label: Some("pipeline write"), layout: None, module: &sm, entry_point: Some("write"), compilation_options: Default::default(), cache: None, }); // -- Initializing data -- let output_pre_init_data = vec![1; OUTPUT_ARRAY_SIZE as usize]; ctx.queue.write_buffer( &output_buffer, 0, bytemuck::cast_slice(&output_pre_init_data), ); // -- Run test -- let mut encoder = ctx .device .create_command_encoder(&CommandEncoderDescriptor::default()); let mut cpass = encoder.begin_compute_pass(&ComputePassDescriptor::default()); cpass.set_pipeline(&pipeline_write); for _ in 0..NR_OF_DISPATCHES { cpass.dispatch_workgroups(DISPATCH_SIZE.0, DISPATCH_SIZE.1, DISPATCH_SIZE.2); } cpass.set_pipeline(&pipeline_read); for i in 0..NR_OF_DISPATCHES { cpass.set_bind_group(0, &bg, &[i * BUFFER_BINDING_SIZE]); cpass.dispatch_workgroups(DISPATCH_SIZE.0, DISPATCH_SIZE.1, DISPATCH_SIZE.2); } drop(cpass); // -- Pulldown data -- encoder.copy_buffer_to_buffer(&output_buffer, 0, &mapping_buffer, 0, BUFFER_SIZE); ctx.queue.submit(Some(encoder.finish())); mapping_buffer.slice(..).map_async(MapMode::Read, |_| ()); ctx.async_poll(PollType::wait()).await.unwrap(); let mapped = mapping_buffer.slice(..).get_mapped_range(); let typed: &[u32] = bytemuck::cast_slice(&mapped); // -- Check results -- let num_disptaches_failed = typed.iter().filter(|&&res| res != 0).count(); let ratio = (num_disptaches_failed as f32 / OUTPUT_ARRAY_SIZE as f32) * 100.; assert!( num_disptaches_failed == 0, "Zero-initialization of workgroup memory failed ({ratio:.0}% of disptaches failed)." ); drop(mapped); mapping_buffer.unmap(); }); const DISPATCH_SIZE: (u32, u32, u32) = (64, 64, 64); const TOTAL_WORK_GROUPS: u32 = DISPATCH_SIZE.0 * DISPATCH_SIZE.1 * DISPATCH_SIZE.2; /// nr of bytes we use in the shader const SHADER_WORKGROUP_MEMORY: u32 = 512 * 4 + 4; // assume we have this much workgroup memory (2GB) const MAX_DEVICE_WORKGROUP_MEMORY: u32 = i32::MAX as u32; const NR_OF_DISPATCHES: u32 = MAX_DEVICE_WORKGROUP_MEMORY / (SHADER_WORKGROUP_MEMORY * TOTAL_WORK_GROUPS) + 1; // TODO: use div_ceil once stabilized const OUTPUT_ARRAY_SIZE: u32 = TOTAL_WORK_GROUPS * NR_OF_DISPATCHES; const BUFFER_SIZE: u64 = OUTPUT_ARRAY_SIZE as u64 * 4; const BUFFER_BINDING_SIZE: u32 = TOTAL_WORK_GROUPS * 4;