use std::{fmt::Write, num::NonZeroU64};

use wgpu_test::{gpu_test, GpuTestConfiguration, TestParameters, TestingContext};

/// We want to test that using a pipeline cache doesn't cause failure
///
/// It would be nice if we could also assert that reusing a pipeline cache makes compilation
/// faster; however, some drivers use a fallback pipeline cache, which makes this inconsistent
/// (both intra- and inter-run).
#[gpu_test]
static PIPELINE_CACHE: GpuTestConfiguration = GpuTestConfiguration::new()
    .parameters(
        TestParameters::default()
            .test_features_limits()
            .features(wgpu::Features::PIPELINE_CACHE),
    )
    .run_async(pipeline_cache_test);

/// Set to a higher value if adding a timing-based assertion. This is otherwise fast to compile.
const ARRAY_SIZE: u64 = 256;

/// Create a shader which should be slow-ish to compile
fn shader() -> String {
    let mut body = String::new();
    for idx in 0..ARRAY_SIZE {
        // "Safety": There will only be a single workgroup, and a single thread in that workgroup
        writeln!(body, "    output[{idx}] = {idx}u;")
            .expect("`u64::fmt` and `String::write_fmt` are infallible");
    }
    format!(
        r#"
        @group(0) @binding(0)
        var<storage, read_write> output: array<u32, {ARRAY_SIZE}>;

        @compute @workgroup_size(1)
        fn main() {{
            {body}
        }}
        "#,
    )
}

async fn pipeline_cache_test(ctx: TestingContext) {
    let shader = shader();
    let sm = ctx
        .device
        .create_shader_module(wgpu::ShaderModuleDescriptor {
            label: Some("shader"),
            source: wgpu::ShaderSource::Wgsl(shader.into()),
        });

    let bgl = ctx
        .device
        .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
            label: Some("bind_group_layout"),
            entries: &[wgpu::BindGroupLayoutEntry {
                binding: 0,
                visibility: wgpu::ShaderStages::COMPUTE,
                ty: wgpu::BindingType::Buffer {
                    ty: wgpu::BufferBindingType::Storage { read_only: false },
                    has_dynamic_offset: false,
                    min_binding_size: NonZeroU64::new(ARRAY_SIZE * 4),
                },
                count: None,
            }],
        });

    let gpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
        label: Some("gpu_buffer"),
        size: ARRAY_SIZE * 4,
        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
        mapped_at_creation: false,
    });

    let cpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
        label: Some("cpu_buffer"),
        size: ARRAY_SIZE * 4,
        usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
        mapped_at_creation: false,
    });

    let bind_group = ctx.device.create_bind_group(&wgpu::BindGroupDescriptor {
        label: Some("bind_group"),
        layout: &bgl,
        entries: &[wgpu::BindGroupEntry {
            binding: 0,
            resource: gpu_buffer.as_entire_binding(),
        }],
    });

    let pipeline_layout = ctx
        .device
        .create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
            label: Some("pipeline_layout"),
            bind_group_layouts: &[&bgl],
            push_constant_ranges: &[],
        });

    // First run: compile with an empty cache, then snapshot its contents.
    let first_cache_data;
    {
        let first_cache = unsafe {
            ctx.device
                .create_pipeline_cache(&wgpu::PipelineCacheDescriptor {
                    label: Some("pipeline_cache"),
                    data: None,
                    fallback: false,
                })
        };
        let first_pipeline = ctx
            .device
            .create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
                label: Some("pipeline"),
                layout: Some(&pipeline_layout),
                module: &sm,
                entry_point: Some("main"),
                compilation_options: Default::default(),
                cache: Some(&first_cache),
            });
        validate_pipeline(&ctx, first_pipeline, &bind_group, &gpu_buffer, &cpu_buffer).await;
        first_cache_data = first_cache.get_data();
    }
    assert!(first_cache_data.is_some());

    // Second run: seed a new cache with the data extracted from the first run.
    let second_cache = unsafe {
        ctx.device
            .create_pipeline_cache(&wgpu::PipelineCacheDescriptor {
                label: Some("pipeline_cache"),
                data: first_cache_data.as_deref(),
                fallback: false,
            })
    };
    let second_pipeline = ctx
        .device
        .create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
            label: Some("pipeline"),
            layout: Some(&pipeline_layout),
            module: &sm,
            entry_point: Some("main"),
            compilation_options: Default::default(),
            cache: Some(&second_cache),
        });
    validate_pipeline(&ctx, second_pipeline, &bind_group, &gpu_buffer, &cpu_buffer).await;
    // Ideally, we could assert here that the second compilation was faster than the first.
    // However, that doesn't actually work, because drivers have their own internal caches.
    // This does work on my machine if I set `MESA_DISABLE_PIPELINE_CACHE=1`
    // before running the test; but of course that is not a realistic scenario.
}

async fn validate_pipeline(
    ctx: &TestingContext,
    pipeline: wgpu::ComputePipeline,
    bind_group: &wgpu::BindGroup,
    gpu_buffer: &wgpu::Buffer,
    cpu_buffer: &wgpu::Buffer,
) {
    let mut encoder = ctx
        .device
        .create_command_encoder(&wgpu::CommandEncoderDescriptor {
            label: Some("encoder"),
        });
    {
        let mut cpass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
            label: Some("compute_pass"),
            timestamp_writes: None,
        });
        cpass.set_pipeline(&pipeline);
        cpass.set_bind_group(0, Some(bind_group), &[]);
        cpass.dispatch_workgroups(1, 1, 1);
    }

    // Read the output back to the CPU and check that every element was written.
    encoder.copy_buffer_to_buffer(gpu_buffer, 0, cpu_buffer, 0, ARRAY_SIZE * 4);

    ctx.queue.submit([encoder.finish()]);
    cpu_buffer.slice(..).map_async(wgpu::MapMode::Read, |_| ());
    ctx.async_poll(wgpu::PollType::wait()).await.unwrap();
    let data = cpu_buffer.slice(..).get_mapped_range();
    let arrays: &[u32] = bytemuck::cast_slice(&data);
    assert_eq!(arrays.len(), ARRAY_SIZE as usize);
    for (idx, value) in arrays.iter().copied().enumerate() {
        assert_eq!(value as usize, idx);
    }
    drop(data);
    cpu_buffer.unmap();
}
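
// The test above exercises the full round-trip: `PipelineCache::get_data` after a warm
// compile, then `create_pipeline_cache` seeded with that data. As a minimal sketch (not
// part of the test), an application could persist that round-trip to disk. The path
// handling, label, and `fallback: true` choice are illustrative assumptions, and the
// `unsafe` contract assumes the on-disk bytes were produced by an earlier `get_data`
// call on a compatible adapter.
#[allow(dead_code)]
fn load_or_create_cache(device: &wgpu::Device, path: &std::path::Path) -> wgpu::PipelineCache {
    let data = std::fs::read(path).ok();
    // Safety (assumed): `data` is either absent or was written by `save_cache` below,
    // i.e. it originated from `PipelineCache::get_data`.
    unsafe {
        device.create_pipeline_cache(&wgpu::PipelineCacheDescriptor {
            label: Some("disk_pipeline_cache"),
            data: data.as_deref(),
            // Fall back to an empty cache rather than failing if the data is rejected.
            fallback: true,
        })
    }
}

#[allow(dead_code)]
fn save_cache(cache: &wgpu::PipelineCache, path: &std::path::Path) {
    // `get_data` can return `None`, e.g. on backends without cache support.
    if let Some(data) = cache.get_data() {
        let _ = std::fs::write(path, &data);
    }
}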