From 9003580da44a73847664fc1953a26246a20c6cb9 Mon Sep 17 00:00:00 2001 From: rickwebiii Date: Fri, 24 Feb 2023 13:32:16 -0800 Subject: [PATCH] Better validation --- Cargo.lock | 1 + sunscreen_math/Cargo.toml | 3 +- sunscreen_math/build.rs | 27 ++-- sunscreen_math/src/webgpu_impl/mod.rs | 130 +++++++++++++----- .../src/webgpu_impl/ristrettovec.rs | 4 +- sunscreen_math/src/webgpu_impl/scalarvec.rs | 58 ++++++-- .../src/webgpu_impl/shaders/constants.wgsl | 6 + .../src/webgpu_impl/shaders/scalar.test.wgsl | 14 ++ .../src/webgpu_impl/shaders/scalar.wgsl | 40 +++++- 9 files changed, 219 insertions(+), 64 deletions(-) create mode 100644 sunscreen_math/src/webgpu_impl/shaders/constants.wgsl diff --git a/Cargo.lock b/Cargo.lock index 3129ea060..d69d2ad01 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2450,6 +2450,7 @@ dependencies = [ "sunscreen_curve25519", "tokio", "wgpu", + "wgpu-core", ] [[package]] diff --git a/sunscreen_math/Cargo.toml b/sunscreen_math/Cargo.toml index eb92921f4..e542aca49 100644 --- a/sunscreen_math/Cargo.toml +++ b/sunscreen_math/Cargo.toml @@ -18,11 +18,12 @@ wgpu = { version = "0.15.1", optional = true } [build-dependencies] naga = { version = "0.11.0", optional = true, features = ["wgsl-in"]} +wgpu-core = { version = "0.15.1", optional = true, features = ["metal", "wgsl"] } [features] default = ["webgpu"] nightly-features = [] metal = ["dep:metal", "gpu"] -webgpu = ["dep:wgpu", "dep:tokio", "dep:futures", "dep:naga", "dep:bytemuck", "gpu"] +webgpu = ["dep:wgpu", "dep:tokio", "dep:futures", "dep:naga", "dep:wgpu-core", "dep:bytemuck", "gpu"] gpu = [] pina = [] \ No newline at end of file diff --git a/sunscreen_math/build.rs b/sunscreen_math/build.rs index ab6f32cd0..e9380ac8c 100644 --- a/sunscreen_math/build.rs +++ b/sunscreen_math/build.rs @@ -1,11 +1,12 @@ #[cfg(feature = "webgpu")] // This simply concatenates all the wgsl shaders, which get compiled at runtime. fn compile_wgsl_shaders() { - use std::{path::PathBuf, process::Command}; - use std::fs::{DirEntry, File, read_to_string}; + use std::fs::{read_to_string, DirEntry, File}; use std::io::Write; + use std::{path::PathBuf}; - use naga::valid::{ValidationFlags, Capabilities}; + use naga::valid::{Capabilities, ValidationFlags}; + use wgpu_core::pipeline::{CreateShaderModuleError, ShaderError}; let outdir = PathBuf::from(std::env::var("OUT_DIR").unwrap()); let shader_dir = PathBuf::from(".") @@ -27,9 +28,7 @@ fn compile_wgsl_shaders() { let include_file: Box bool> = if config == "test" { Box::new(is_wgsl_file) } else { - Box::new(|file: &DirEntry| { - is_wgsl_file(file) && !is_test_wgsl_file(file) - }) + Box::new(|file: &DirEntry| is_wgsl_file(file) && !is_test_wgsl_file(file)) }; let shaders = std::fs::read_dir(&shader_dir) @@ -38,7 +37,7 @@ fn compile_wgsl_shaders() { .filter(include_file); let out_file_path = outdir.join(format!("shaders-{config}.wgsl")); - + { let mut out_file = File::create(&out_file_path).unwrap(); @@ -50,15 +49,23 @@ fn compile_wgsl_shaders() { }; // Validate the shader - let shader_contents = read_to_string(out_file_path).unwrap(); + let shader_contents = read_to_string(&out_file_path).unwrap(); let parse_result = naga::front::wgsl::parse_str(&shader_contents); if let Err(e) = parse_result { - panic!("{}", e.message()); + let e = ShaderError { + source: shader_contents, + label: None, + inner: Box::new(e) + }; + + let e = CreateShaderModuleError::Parsing(e); + panic!("{}", e); } - let mut validator = naga::valid::Validator::new(ValidationFlags::all(), Capabilities::empty()); + let mut validator = + naga::valid::Validator::new(ValidationFlags::all(), Capabilities::empty()); let validation_results = validator.validate(&parse_result.unwrap()); diff --git a/sunscreen_math/src/webgpu_impl/mod.rs b/sunscreen_math/src/webgpu_impl/mod.rs index 68795b9e9..993abad56 100644 --- a/sunscreen_math/src/webgpu_impl/mod.rs +++ b/sunscreen_math/src/webgpu_impl/mod.rs @@ -1,11 +1,21 @@ use core::slice; -use std::{borrow::Cow, ops::Deref, mem::{align_of, MaybeUninit, size_of}}; +use std::{ + borrow::Cow, + mem::{align_of, size_of, MaybeUninit}, + ops::Deref, +}; use bytemuck::{cast, cast_slice, Pod}; use futures::channel::oneshot; use lazy_static::lazy_static; use tokio::runtime::{Builder as TokioRuntimeBuilder, Runtime as TokioRuntime}; -use wgpu::{Instance, RequestAdapterOptions, Device, Queue, ShaderModuleDescriptor, ShaderModule, BufferDescriptor, COPY_BUFFER_ALIGNMENT, BufferUsages, Buffer, ComputePipelineDescriptor, BindGroupDescriptor, BindGroupEntry, CommandEncoderDescriptor, ComputePassDescriptor, util::{BufferInitDescriptor, DeviceExt}, Maintain}; +use wgpu::{ + util::{BufferInitDescriptor, DeviceExt}, + BindGroupDescriptor, BindGroupEntry, Buffer, BufferDescriptor, BufferUsages, + CommandEncoderDescriptor, ComputePassDescriptor, ComputePipelineDescriptor, Device, Instance, + Maintain, Queue, RequestAdapterOptions, ShaderModule, ShaderModuleDescriptor, + COPY_BUFFER_ALIGNMENT, +}; mod ristrettovec; mod scalarvec; @@ -23,14 +33,10 @@ pub struct Runtime { // test and release. In test, we #define the TEST macro, which exposes test kernels. // The release library does not feature these kernels. #[cfg(not(test))] -const SHADERS: &str = include_str!(concat!( - env!("OUT_DIR"), - "/shaders-release.wgsl" -)); +const SHADERS: &str = include_str!(concat!(env!("OUT_DIR"), "/shaders-release.wgsl")); #[cfg(test)] -const SHADERS: &str = - include_str!(concat!(env!("OUT_DIR"), "/shaders-test.wgsl")); +const SHADERS: &str = include_str!(concat!(env!("OUT_DIR"), "/shaders-test.wgsl")); fn assert_aligned(ptr: *const T) { assert!(ptr.cast::<()>().align_offset(align_of::()) == 0); @@ -52,6 +58,33 @@ trait BufferExt { fn get_data(&self) -> Vec; } +pub trait GpuVec { + type IterType: Sized; + + fn get_buffer(&self) -> &Buffer; + + fn len(&self) -> usize; + + fn byte_len(&self) -> usize { + self.len() * size_of::() + } + + fn u32_len(&self) -> usize { + self.byte_len() / size_of::() + } + + fn run_unary(&self, output: &Buffer, kernel_name: &'static str) { + let runtime = Runtime::get(); + let len = GpuU32::new(self.len() as u32); + + runtime.run( + kernel_name, + &[self.get_buffer(), &DUMMY_BUFFER, output, &len.data], + &Grid::new(self.len() as u32 / 128, 1, 1), + ); + } +} + impl BufferExt for Buffer { fn clone(&self) -> Buffer { let runtime = Runtime::get(); @@ -65,7 +98,9 @@ impl BufferExt for Buffer { fn copy_into(&self, dst: &Buffer) { let runtime = Runtime::get(); - let mut encoder = runtime.device.create_command_encoder(&CommandEncoderDescriptor { label: None }); + let mut encoder = runtime + .device + .create_command_encoder(&CommandEncoderDescriptor { label: None }); encoder.copy_buffer_to_buffer(self, 0, &dst, 0, self.size()); runtime.queue.submit(Some(encoder.finish())); @@ -76,7 +111,7 @@ impl BufferExt for Buffer { let (s, r) = oneshot::channel(); // In vanilla WebGPU, if you use the MAP_READ flag, you must also set COPY_DST - // and *only* COPY_DST. This means you can't use such buffers in compute + // and *only* COPY_DST. This means you can't use such buffers in compute // shaders. As such, we create a temporary buffer with these properties so we // can copy data out of the shader-capable buffer and return the results. let runtime = Runtime::get(); @@ -85,13 +120,15 @@ impl BufferExt for Buffer { label: None, size: self.size(), usage: BufferUsages::MAP_READ | BufferUsages::COPY_DST, - mapped_at_creation: false + mapped_at_creation: false, }); self.copy_into(©_buf); let buffer_slice = copy_buf.slice(..); - buffer_slice.map_async(wgpu::MapMode::Read, move |v| { s.send(v).unwrap(); }); + buffer_slice.map_async(wgpu::MapMode::Read, move |v| { + s.send(v).unwrap(); + }); runtime.device.poll(Maintain::Wait); @@ -113,15 +150,19 @@ impl Runtime { pub fn alloc(&self, len: usize) -> Buffer { let len = size_of::() * len; - // Round up len to a multiple of COPY_BUFFER_ALIGNMENT, as required to use + // Round up len to a multiple of COPY_BUFFER_ALIGNMENT, as required to use // mapped_at_creation=true - let len = if len % COPY_BUFFER_ALIGNMENT as usize == 0 { len } else { (len / COPY_BUFFER_ALIGNMENT as usize + 1) * COPY_BUFFER_ALIGNMENT as usize}; + let len = if len % COPY_BUFFER_ALIGNMENT as usize == 0 { + len + } else { + (len / COPY_BUFFER_ALIGNMENT as usize + 1) * COPY_BUFFER_ALIGNMENT as usize + }; let buffer = self.device.create_buffer(&BufferDescriptor { label: None, size: len as u64, usage: BufferUsages::STORAGE | BufferUsages::COPY_SRC | BufferUsages::COPY_DST, - mapped_at_creation: false + mapped_at_creation: false, }); buffer @@ -130,9 +171,13 @@ impl Runtime { pub fn alloc_from_slice(&self, data: &[T]) -> Buffer { let len = size_of::() * data.len(); - // Round up len to a multiple of COPY_BUFFER_ALIGNMENT, as required to use + // Round up len to a multiple of COPY_BUFFER_ALIGNMENT, as required to use // mapped_at_creation=true - let len = if len % COPY_BUFFER_ALIGNMENT as usize == 0 { len } else { (len / COPY_BUFFER_ALIGNMENT as usize + 1) * COPY_BUFFER_ALIGNMENT as usize}; + let len = if len % COPY_BUFFER_ALIGNMENT as usize == 0 { + len + } else { + (len / COPY_BUFFER_ALIGNMENT as usize + 1) * COPY_BUFFER_ALIGNMENT as usize + }; let buffer = self.device.create_buffer_init(&BufferInitDescriptor { label: None, @@ -144,19 +189,23 @@ impl Runtime { } pub fn run(&self, kernel_name: &'static str, args: &[&Buffer], threadgroups: &Grid) { - let pipeline = self.device.create_compute_pipeline(&ComputePipelineDescriptor { - label: None, - layout: None, - module: &self.shaders, - entry_point: kernel_name - }); + let pipeline = self + .device + .create_compute_pipeline(&ComputePipelineDescriptor { + label: None, + layout: None, + module: &self.shaders, + entry_point: kernel_name, + }); - let bindings = args.iter().enumerate().map(|(i, b)| { - BindGroupEntry { + let bindings = args + .iter() + .enumerate() + .map(|(i, b)| BindGroupEntry { binding: i as u32, - resource: b.as_entire_binding() - } - }).collect::>(); + resource: b.as_entire_binding(), + }) + .collect::>(); let layout = pipeline.get_bind_group_layout(0); @@ -166,7 +215,9 @@ impl Runtime { entries: &bindings, }); - let mut encoder = self.device.create_command_encoder(&CommandEncoderDescriptor { label: None }); + let mut encoder = self + .device + .create_command_encoder(&CommandEncoderDescriptor { label: None }); { let (x, y, z) = threadgroups.0; @@ -183,7 +234,6 @@ impl Runtime { } } - lazy_static! { static ref TOKIO_RUNTIME: TokioRuntime = { TokioRuntimeBuilder::new_current_thread() @@ -228,10 +278,16 @@ lazy_static! { shaders } }; + + /// A 4-byte buffer that exists so you can bind a buffer to `g_b` + /// in unary shaders. + static ref DUMMY_BUFFER: Buffer = { + Runtime::get().alloc::(1) + }; } pub struct GpuU32 { - data: Buffer + data: Buffer, } impl GpuU32 { @@ -239,9 +295,7 @@ impl GpuU32 { let runtime = Runtime::get(); let data = runtime.alloc_from_slice(&[val]); - Self { - data - } + Self { data } } } @@ -269,10 +323,14 @@ mod tests { let n = GpuU32::new(a.len() as u32); - runtime.run("add", &[&a_gpu, &b_gpu, &c_gpu, &n.data], &Grid::new(1, 1, 1)); + runtime.run( + "add", + &[&a_gpu, &b_gpu, &c_gpu, &n.data], + &Grid::new(1, 1, 1), + ); for (c, (a, b)) in c_gpu.get_data::().iter().zip(a.iter().zip(b.iter())) { assert_eq!(*c, a + b); } } -} \ No newline at end of file +} diff --git a/sunscreen_math/src/webgpu_impl/ristrettovec.rs b/sunscreen_math/src/webgpu_impl/ristrettovec.rs index f40794529..21c6dad66 100644 --- a/sunscreen_math/src/webgpu_impl/ristrettovec.rs +++ b/sunscreen_math/src/webgpu_impl/ristrettovec.rs @@ -1,3 +1 @@ -pub struct GpuRistrettoPointVec { - -} \ No newline at end of file +pub struct GpuRistrettoPointVec {} diff --git a/sunscreen_math/src/webgpu_impl/scalarvec.rs b/sunscreen_math/src/webgpu_impl/scalarvec.rs index 781716ebb..d8c39849f 100644 --- a/sunscreen_math/src/webgpu_impl/scalarvec.rs +++ b/sunscreen_math/src/webgpu_impl/scalarvec.rs @@ -3,7 +3,7 @@ use std::mem::size_of; use curve25519_dalek::scalar::Scalar; use wgpu::Buffer; -use super::{Runtime, BufferExt}; +use super::{BufferExt, Runtime}; pub struct GpuScalarVec { data: Buffer, @@ -12,14 +12,14 @@ pub struct GpuScalarVec { impl Clone for GpuScalarVec { fn clone(&self) -> Self { Self { - data: self.data.clone() + data: self.data.clone(), } } } pub struct Scalars { data: Vec, - i: usize + i: usize, } impl Scalars { @@ -109,7 +109,7 @@ impl GpuScalarVec { } Self { - data: runtime.alloc_from_slice(&packed_data) + data: runtime.alloc_from_slice(&packed_data), } } @@ -124,13 +124,16 @@ impl GpuScalarVec { mod tests { use rand::thread_rng; - use crate::webgpu_impl::{Grid, GpuU32}; + use crate::webgpu_impl::{GpuU32, Grid}; use super::*; - + #[test] fn can_unpack_scalarvec() { - let a = (0..238).into_iter().map(|_| Scalar::random(&mut thread_rng())).collect::>(); + let a = (0..238) + .into_iter() + .map(|_| Scalar::random(&mut thread_rng())) + .collect::>(); let a_v = GpuScalarVec::new(&a); @@ -146,7 +149,10 @@ mod tests { #[test] fn can_clone_scalarvec() { - let a = (0..238).into_iter().map(|_| Scalar::random(&mut thread_rng())).collect::>(); + let a = (0..238) + .into_iter() + .map(|_| Scalar::random(&mut thread_rng())) + .collect::>(); let a_v = GpuScalarVec::new(&a); let a_v_clone = a_v.clone(); @@ -161,11 +167,13 @@ mod tests { assert_eq!(count, a.len()); } - #[test] fn can_pack_unpack_shader_operand_a() { // Use 238 because it's a weird number not a multiple of the threadgroup size. - let a = (0..238).into_iter().map(|_| Scalar::random(&mut thread_rng())).collect::>(); + let a = (0..238) + .into_iter() + .map(|_| Scalar::random(&mut thread_rng())) + .collect::>(); let a_v = GpuScalarVec::new(&a); let c_v = a_v.clone(); @@ -174,6 +182,32 @@ mod tests { let len = GpuU32::new(a.len() as u32); - runtime.run("test_scalar_can_pack_unpack_a", &[&a_v.data, &a_v.data, &c_v.data, &len.data], &Grid::new(2, 1, 1)); + runtime.run( + "test_scalar_can_pack_unpack_a", + &[&a_v.data, &a_v.data, &c_v.data, &len.data], + &Grid::new(2, 1, 1), + ); } -} \ No newline at end of file + + #[test] + fn can_pack_unpack_shader_operand_b() { + // Use 238 because it's a weird number not a multiple of the threadgroup size. + let a = (0..238) + .into_iter() + .map(|_| Scalar::random(&mut thread_rng())) + .collect::>(); + + let a_v = GpuScalarVec::new(&a); + let c_v = a_v.clone(); + + let runtime = Runtime::get(); + + let len = GpuU32::new(a.len() as u32); + + runtime.run( + "test_scalar_can_pack_unpack_b", + &[&a_v.data, &a_v.data, &c_v.data, &len.data], + &Grid::new(2, 1, 1), + ); + } +} diff --git a/sunscreen_math/src/webgpu_impl/shaders/constants.wgsl b/sunscreen_math/src/webgpu_impl/shaders/constants.wgsl new file mode 100644 index 000000000..d9b427ec6 --- /dev/null +++ b/sunscreen_math/src/webgpu_impl/shaders/constants.wgsl @@ -0,0 +1,6 @@ +const Scalar29_Zero: Scalar29 = Scalar29(array(0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u)); + +const Scalar29_L: Scalar29 = Scalar29(array( + 0x1cf5d3edu, 0x009318d2u, 0x1de73596u, 0x1df3bd45u, 0x0000014du, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00100000u, +)); \ No newline at end of file diff --git a/sunscreen_math/src/webgpu_impl/shaders/scalar.test.wgsl b/sunscreen_math/src/webgpu_impl/shaders/scalar.test.wgsl index 2c2bf135d..bf5f54f7d 100644 --- a/sunscreen_math/src/webgpu_impl/shaders/scalar.test.wgsl +++ b/sunscreen_math/src/webgpu_impl/shaders/scalar.test.wgsl @@ -10,4 +10,18 @@ fn test_scalar_can_pack_unpack_a( let a = scalar29_unpack_a(gid.x, g_len); scalar29_pack_c(a, gid.x, g_len); +} + +@compute +@workgroup_size(128, 1, 1) +fn test_scalar_can_pack_unpack_b( + @builtin(global_invocation_id) gid: vec3, +) { + if gid.x >= g_len { + unused_b(); + return; + } + + let b = scalar29_unpack_b(gid.x, g_len); + scalar29_pack_c(b, gid.x, g_len); } \ No newline at end of file diff --git a/sunscreen_math/src/webgpu_impl/shaders/scalar.wgsl b/sunscreen_math/src/webgpu_impl/shaders/scalar.wgsl index da815fe4b..f3dbfb863 100644 --- a/sunscreen_math/src/webgpu_impl/shaders/scalar.wgsl +++ b/sunscreen_math/src/webgpu_impl/shaders/scalar.wgsl @@ -10,8 +10,6 @@ struct Foo { v: array } -const Scalar29_Zero: Scalar29 = Scalar29(array(0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u)); - // WGSL is terrible in that you can't pass arrays of unknown length to functions. So, we create // functions for unpacking bindings a, b respectively. fn scalar29_unpack_a(grid_tid: u32, stride: u32) -> Scalar29 { @@ -69,3 +67,41 @@ fn scalar29_pack_c(val: Scalar29, grid_tid: u32, stride: u32) { word = val.v[7] >> 21u | val.v[8u] << 8u; g_c[7u * stride + grid_tid] = word; } + +fn scalar29_add(a: ptr, b: ptr) -> Scalar29 { + var sum = Scalar29_Zero; + let mask = (0x1u << 29u) - 1u; + + // a + b + var carry = 0u; + for (var i = 0u; i < 9u; i++) { + carry = a.v[i] + b.v[i] + (carry >> 29u); + sum.v[i] = carry & mask; + } + + // subtract l if the sum is >= l + return scalar29_sub(&sum, &Scalar29_L); +} + +fn scalar29_sub(a: ptr, b: ptr) -> Scalar29 { + var difference = Scalar29_Zero; + let mask = (1u << 29u) - 1u; + + // a - b + var borrow = 0u; + for (var i = 0u; i < 9u; i++) { + borrow = a.v[i] - (b.v[i] + (borrow >> 31u)); + difference.v[i] = borrow & mask; + } + + // conditionally add l if the difference is negative + let underflow_mask = ((borrow >> 31u) ^ 1u) - 1u; + + var carry = 0u; + for (var i = 0u; i < 9u; i++) { + carry = (carry >> 29) + difference[i] + (Scalar29_L.v[i] & underflow_mask); + difference[i] = carry & mask; + } + + return difference; +} \ No newline at end of file