Better validation

2026-05-11 03:00:37 -04:00 · 2023-02-24 13:32:16 -08:00
parent 4a4f959f26
commit 9003580da4
9 changed files with 219 additions and 64 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2450,6 +2450,7 @@ dependencies = [
 "sunscreen_curve25519",
 "tokio",
 "wgpu",
+ "wgpu-core",
 ]

 [[package]]
--- a/sunscreen_math/Cargo.toml
+++ b/sunscreen_math/Cargo.toml
@@ -18,11 +18,12 @@ wgpu = { version = "0.15.1", optional = true }

 [build-dependencies]
 naga = { version = "0.11.0", optional = true, features = ["wgsl-in"]}
+wgpu-core = { version = "0.15.1", optional = true, features = ["metal", "wgsl"] }

 [features]
 default = ["webgpu"]
 nightly-features = []
 metal = ["dep:metal", "gpu"]
-webgpu = ["dep:wgpu", "dep:tokio", "dep:futures", "dep:naga", "dep:bytemuck", "gpu"]
+webgpu = ["dep:wgpu", "dep:tokio", "dep:futures", "dep:naga", "dep:wgpu-core", "dep:bytemuck", "gpu"]
 gpu = []
 pina = []
--- a/sunscreen_math/build.rs
+++ b/sunscreen_math/build.rs
@@ -1,11 +1,12 @@
 #[cfg(feature = "webgpu")]
 // This simply concatenates all the wgsl shaders, which get compiled at runtime.
 fn compile_wgsl_shaders() {
-    use std::{path::PathBuf, process::Command};
-    use std::fs::{DirEntry, File, read_to_string};
+    use std::fs::{read_to_string, DirEntry, File};
    use std::io::Write;
+    use std::{path::PathBuf};

-    use naga::valid::{ValidationFlags, Capabilities};
+    use naga::valid::{Capabilities, ValidationFlags};
+    use wgpu_core::pipeline::{CreateShaderModuleError, ShaderError};

    let outdir = PathBuf::from(std::env::var("OUT_DIR").unwrap());
    let shader_dir = PathBuf::from(".")
@@ -27,9 +28,7 @@ fn compile_wgsl_shaders() {
        let include_file: Box<dyn Fn(&DirEntry) -> bool> = if config == "test" {
            Box::new(is_wgsl_file)
        } else {
-            Box::new(|file: &DirEntry| {
-                is_wgsl_file(file) && !is_test_wgsl_file(file)
-            })
+            Box::new(|file: &DirEntry| is_wgsl_file(file) && !is_test_wgsl_file(file))
        };

        let shaders = std::fs::read_dir(&shader_dir)
@@ -38,7 +37,7 @@ fn compile_wgsl_shaders() {
            .filter(include_file);

        let out_file_path = outdir.join(format!("shaders-{config}.wgsl"));
-        
+
        {
            let mut out_file = File::create(&out_file_path).unwrap();

@@ -50,15 +49,23 @@ fn compile_wgsl_shaders() {
        };

        // Validate the shader
-        let shader_contents = read_to_string(out_file_path).unwrap();
+        let shader_contents = read_to_string(&out_file_path).unwrap();

        let parse_result = naga::front::wgsl::parse_str(&shader_contents);

        if let Err(e) = parse_result {
-            panic!("{}", e.message());
+            let e = ShaderError {
+                source: shader_contents,
+                label: None,
+                inner: Box::new(e)
+            };
+
+            let e = CreateShaderModuleError::Parsing(e);
+            panic!("{}", e);
        }

-        let mut validator = naga::valid::Validator::new(ValidationFlags::all(), Capabilities::empty());
+        let mut validator =
+            naga::valid::Validator::new(ValidationFlags::all(), Capabilities::empty());

        let validation_results = validator.validate(&parse_result.unwrap());

--- a/sunscreen_math/src/webgpu_impl/mod.rs
+++ b/sunscreen_math/src/webgpu_impl/mod.rs
@@ -1,11 +1,21 @@
 use core::slice;
-use std::{borrow::Cow, ops::Deref, mem::{align_of, MaybeUninit, size_of}};
+use std::{
+    borrow::Cow,
+    mem::{align_of, size_of, MaybeUninit},
+    ops::Deref,
+};

 use bytemuck::{cast, cast_slice, Pod};
 use futures::channel::oneshot;
 use lazy_static::lazy_static;
 use tokio::runtime::{Builder as TokioRuntimeBuilder, Runtime as TokioRuntime};
-use wgpu::{Instance, RequestAdapterOptions, Device, Queue, ShaderModuleDescriptor, ShaderModule, BufferDescriptor, COPY_BUFFER_ALIGNMENT, BufferUsages, Buffer, ComputePipelineDescriptor, BindGroupDescriptor, BindGroupEntry, CommandEncoderDescriptor, ComputePassDescriptor, util::{BufferInitDescriptor, DeviceExt}, Maintain};
+use wgpu::{
+    util::{BufferInitDescriptor, DeviceExt},
+    BindGroupDescriptor, BindGroupEntry, Buffer, BufferDescriptor, BufferUsages,
+    CommandEncoderDescriptor, ComputePassDescriptor, ComputePipelineDescriptor, Device, Instance,
+    Maintain, Queue, RequestAdapterOptions, ShaderModule, ShaderModuleDescriptor,
+    COPY_BUFFER_ALIGNMENT,
+};

 mod ristrettovec;
 mod scalarvec;
@@ -23,14 +33,10 @@ pub struct Runtime {
 // test and release. In test, we #define the TEST macro, which exposes test kernels.
 // The release library does not feature these kernels.
 #[cfg(not(test))]
-const SHADERS: &str = include_str!(concat!(
-    env!("OUT_DIR"),
-    "/shaders-release.wgsl"
-));
+const SHADERS: &str = include_str!(concat!(env!("OUT_DIR"), "/shaders-release.wgsl"));

 #[cfg(test)]
-const SHADERS: &str =
-    include_str!(concat!(env!("OUT_DIR"), "/shaders-test.wgsl"));
+const SHADERS: &str = include_str!(concat!(env!("OUT_DIR"), "/shaders-test.wgsl"));

 fn assert_aligned<T>(ptr: *const T) {
    assert!(ptr.cast::<()>().align_offset(align_of::<T>()) == 0);
@@ -52,6 +58,33 @@ trait BufferExt {
    fn get_data<T: Pod + Copy>(&self) -> Vec<T>;
 }

+pub trait GpuVec {
+    type IterType: Sized;
+
+    fn get_buffer(&self) -> &Buffer;
+
+    fn len(&self) -> usize;
+
+    fn byte_len(&self) -> usize {
+        self.len() * size_of::<Self::IterType>()
+    }
+
+    fn u32_len(&self) -> usize {
+        self.byte_len() / size_of::<u32>()
+    }
+
+    fn run_unary<Rhs: GpuVec>(&self, output: &Buffer, kernel_name: &'static str) {
+        let runtime = Runtime::get();
+        let len = GpuU32::new(self.len() as u32);
+
+        runtime.run(
+            kernel_name,
+            &[self.get_buffer(), &DUMMY_BUFFER, output, &len.data],
+            &Grid::new(self.len() as u32 / 128, 1, 1),
+        );
+    }
+}
+
 impl BufferExt for Buffer {
    fn clone(&self) -> Buffer {
        let runtime = Runtime::get();
@@ -65,7 +98,9 @@ impl BufferExt for Buffer {
    fn copy_into(&self, dst: &Buffer) {
        let runtime = Runtime::get();

-        let mut encoder = runtime.device.create_command_encoder(&CommandEncoderDescriptor { label: None });
+        let mut encoder = runtime
+            .device
+            .create_command_encoder(&CommandEncoderDescriptor { label: None });
        encoder.copy_buffer_to_buffer(self, 0, &dst, 0, self.size());

        runtime.queue.submit(Some(encoder.finish()));
@@ -76,7 +111,7 @@ impl BufferExt for Buffer {
        let (s, r) = oneshot::channel();

        // In vanilla WebGPU, if you use the MAP_READ flag, you must also set COPY_DST
-        // and *only* COPY_DST. This means you can't use such buffers in compute 
+        // and *only* COPY_DST. This means you can't use such buffers in compute
        // shaders. As such, we create a temporary buffer with these properties so we
        // can copy data out of the shader-capable buffer and return the results.
        let runtime = Runtime::get();
@@ -85,13 +120,15 @@ impl BufferExt for Buffer {
            label: None,
            size: self.size(),
            usage: BufferUsages::MAP_READ | BufferUsages::COPY_DST,
-            mapped_at_creation: false
+            mapped_at_creation: false,
        });

        self.copy_into(&copy_buf);

        let buffer_slice = copy_buf.slice(..);
-        buffer_slice.map_async(wgpu::MapMode::Read, move |v| { s.send(v).unwrap(); });
+        buffer_slice.map_async(wgpu::MapMode::Read, move |v| {
+            s.send(v).unwrap();
+        });

        runtime.device.poll(Maintain::Wait);

@@ -113,15 +150,19 @@ impl Runtime {
    pub fn alloc<T>(&self, len: usize) -> Buffer {
        let len = size_of::<T>() * len;

-        // Round up len to a multiple of COPY_BUFFER_ALIGNMENT, as required to use 
+        // Round up len to a multiple of COPY_BUFFER_ALIGNMENT, as required to use
        // mapped_at_creation=true
-        let len = if len % COPY_BUFFER_ALIGNMENT as usize == 0 { len } else { (len / COPY_BUFFER_ALIGNMENT as usize + 1) * COPY_BUFFER_ALIGNMENT as usize};
+        let len = if len % COPY_BUFFER_ALIGNMENT as usize == 0 {
+            len
+        } else {
+            (len / COPY_BUFFER_ALIGNMENT as usize + 1) * COPY_BUFFER_ALIGNMENT as usize
+        };

        let buffer = self.device.create_buffer(&BufferDescriptor {
            label: None,
            size: len as u64,
            usage: BufferUsages::STORAGE | BufferUsages::COPY_SRC | BufferUsages::COPY_DST,
-            mapped_at_creation: false
+            mapped_at_creation: false,
        });

        buffer
@@ -130,9 +171,13 @@ impl Runtime {
    pub fn alloc_from_slice<T: Pod>(&self, data: &[T]) -> Buffer {
        let len = size_of::<T>() * data.len();

-        // Round up len to a multiple of COPY_BUFFER_ALIGNMENT, as required to use 
+        // Round up len to a multiple of COPY_BUFFER_ALIGNMENT, as required to use
        // mapped_at_creation=true
-        let len = if len % COPY_BUFFER_ALIGNMENT as usize == 0 { len } else { (len / COPY_BUFFER_ALIGNMENT as usize + 1) * COPY_BUFFER_ALIGNMENT as usize};
+        let len = if len % COPY_BUFFER_ALIGNMENT as usize == 0 {
+            len
+        } else {
+            (len / COPY_BUFFER_ALIGNMENT as usize + 1) * COPY_BUFFER_ALIGNMENT as usize
+        };

        let buffer = self.device.create_buffer_init(&BufferInitDescriptor {
            label: None,
@@ -144,19 +189,23 @@ impl Runtime {
    }

    pub fn run(&self, kernel_name: &'static str, args: &[&Buffer], threadgroups: &Grid) {
-        let pipeline = self.device.create_compute_pipeline(&ComputePipelineDescriptor {
-            label: None,
-            layout: None,
-            module: &self.shaders,
-            entry_point: kernel_name
-        });
+        let pipeline = self
+            .device
+            .create_compute_pipeline(&ComputePipelineDescriptor {
+                label: None,
+                layout: None,
+                module: &self.shaders,
+                entry_point: kernel_name,
+            });

-        let bindings = args.iter().enumerate().map(|(i, b)| {
-            BindGroupEntry {
+        let bindings = args
+            .iter()
+            .enumerate()
+            .map(|(i, b)| BindGroupEntry {
                binding: i as u32,
-                resource: b.as_entire_binding()
-            }
-        }).collect::<Vec<_>>();
+                resource: b.as_entire_binding(),
+            })
+            .collect::<Vec<_>>();

        let layout = pipeline.get_bind_group_layout(0);

@@ -166,7 +215,9 @@ impl Runtime {
            entries: &bindings,
        });

-        let mut encoder = self.device.create_command_encoder(&CommandEncoderDescriptor { label: None });
+        let mut encoder = self
+            .device
+            .create_command_encoder(&CommandEncoderDescriptor { label: None });

        {
            let (x, y, z) = threadgroups.0;
@@ -183,7 +234,6 @@ impl Runtime {
    }
 }

-
 lazy_static! {
    static ref TOKIO_RUNTIME: TokioRuntime = {
        TokioRuntimeBuilder::new_current_thread()
@@ -228,10 +278,16 @@ lazy_static! {
            shaders
        }
    };
+
+    /// A 4-byte buffer that exists so you can bind a buffer to `g_b`
+    /// in unary shaders.
+    static ref DUMMY_BUFFER: Buffer = {
+        Runtime::get().alloc::<u32>(1)
+    };
 }

 pub struct GpuU32 {
-    data: Buffer
+    data: Buffer,
 }

 impl GpuU32 {
@@ -239,9 +295,7 @@ impl GpuU32 {
        let runtime = Runtime::get();
        let data = runtime.alloc_from_slice(&[val]);

-        Self {
-            data
-        }
+        Self { data }
    }
 }

@@ -269,10 +323,14 @@ mod tests {

        let n = GpuU32::new(a.len() as u32);

-        runtime.run("add", &[&a_gpu, &b_gpu, &c_gpu, &n.data], &Grid::new(1, 1, 1));
+        runtime.run(
+            "add",
+            &[&a_gpu, &b_gpu, &c_gpu, &n.data],
+            &Grid::new(1, 1, 1),
+        );

        for (c, (a, b)) in c_gpu.get_data::<u32>().iter().zip(a.iter().zip(b.iter())) {
            assert_eq!(*c, a + b);
        }
    }
-}
+}
--- a/sunscreen_math/src/webgpu_impl/ristrettovec.rs
+++ b/sunscreen_math/src/webgpu_impl/ristrettovec.rs
@@ -1,3 +1 @@
-pub struct GpuRistrettoPointVec {
-    
-}
+pub struct GpuRistrettoPointVec {}
--- a/sunscreen_math/src/webgpu_impl/scalarvec.rs
+++ b/sunscreen_math/src/webgpu_impl/scalarvec.rs
@@ -3,7 +3,7 @@ use std::mem::size_of;
 use curve25519_dalek::scalar::Scalar;
 use wgpu::Buffer;

-use super::{Runtime, BufferExt};
+use super::{BufferExt, Runtime};

 pub struct GpuScalarVec {
    data: Buffer,
@@ -12,14 +12,14 @@ pub struct GpuScalarVec {
 impl Clone for GpuScalarVec {
    fn clone(&self) -> Self {
        Self {
-            data: self.data.clone()
+            data: self.data.clone(),
        }
    }
 }

 pub struct Scalars {
    data: Vec<u32>,
-    i: usize
+    i: usize,
 }

 impl Scalars {
@@ -109,7 +109,7 @@ impl GpuScalarVec {
        }

        Self {
-            data: runtime.alloc_from_slice(&packed_data)
+            data: runtime.alloc_from_slice(&packed_data),
        }
    }

@@ -124,13 +124,16 @@ impl GpuScalarVec {
 mod tests {
    use rand::thread_rng;

-    use crate::webgpu_impl::{Grid, GpuU32};
+    use crate::webgpu_impl::{GpuU32, Grid};

    use super::*;
-    
+
    #[test]
    fn can_unpack_scalarvec() {
-        let a = (0..238).into_iter().map(|_| Scalar::random(&mut thread_rng())).collect::<Vec<_>>();
+        let a = (0..238)
+            .into_iter()
+            .map(|_| Scalar::random(&mut thread_rng()))
+            .collect::<Vec<_>>();

        let a_v = GpuScalarVec::new(&a);

@@ -146,7 +149,10 @@ mod tests {

    #[test]
    fn can_clone_scalarvec() {
-        let a = (0..238).into_iter().map(|_| Scalar::random(&mut thread_rng())).collect::<Vec<_>>();
+        let a = (0..238)
+            .into_iter()
+            .map(|_| Scalar::random(&mut thread_rng()))
+            .collect::<Vec<_>>();

        let a_v = GpuScalarVec::new(&a);
        let a_v_clone = a_v.clone();
@@ -161,11 +167,13 @@ mod tests {
        assert_eq!(count, a.len());
    }

-
    #[test]
    fn can_pack_unpack_shader_operand_a() {
        // Use 238 because it's a weird number not a multiple of the threadgroup size.
-        let a = (0..238).into_iter().map(|_| Scalar::random(&mut thread_rng())).collect::<Vec<_>>();
+        let a = (0..238)
+            .into_iter()
+            .map(|_| Scalar::random(&mut thread_rng()))
+            .collect::<Vec<_>>();

        let a_v = GpuScalarVec::new(&a);
        let c_v = a_v.clone();
@@ -174,6 +182,32 @@ mod tests {

        let len = GpuU32::new(a.len() as u32);

-        runtime.run("test_scalar_can_pack_unpack_a", &[&a_v.data, &a_v.data, &c_v.data, &len.data], &Grid::new(2, 1, 1));
+        runtime.run(
+            "test_scalar_can_pack_unpack_a",
+            &[&a_v.data, &a_v.data, &c_v.data, &len.data],
+            &Grid::new(2, 1, 1),
+        );
    }
-}
+
+    #[test]
+    fn can_pack_unpack_shader_operand_b() {
+        // Use 238 because it's a weird number not a multiple of the threadgroup size.
+        let a = (0..238)
+            .into_iter()
+            .map(|_| Scalar::random(&mut thread_rng()))
+            .collect::<Vec<_>>();
+
+        let a_v = GpuScalarVec::new(&a);
+        let c_v = a_v.clone();
+
+        let runtime = Runtime::get();
+
+        let len = GpuU32::new(a.len() as u32);
+
+        runtime.run(
+            "test_scalar_can_pack_unpack_b",
+            &[&a_v.data, &a_v.data, &c_v.data, &len.data],
+            &Grid::new(2, 1, 1),
+        );
+    }
+}
--- a/sunscreen_math/src/webgpu_impl/shaders/constants.wgsl
+++ b/sunscreen_math/src/webgpu_impl/shaders/constants.wgsl
@@ -0,0 +1,6 @@
+const Scalar29_Zero: Scalar29 = Scalar29(array<u32, 9>(0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u));
+
+const Scalar29_L: Scalar29 = Scalar29(array<u32, 9>(
+        0x1cf5d3edu, 0x009318d2u, 0x1de73596u, 0x1df3bd45u, 0x0000014du, 0x00000000u, 0x00000000u, 0x00000000u,
+        0x00100000u,
+));
--- a/sunscreen_math/src/webgpu_impl/shaders/scalar.test.wgsl
+++ b/sunscreen_math/src/webgpu_impl/shaders/scalar.test.wgsl
@@ -10,4 +10,18 @@ fn test_scalar_can_pack_unpack_a(

    let a = scalar29_unpack_a(gid.x, g_len);
    scalar29_pack_c(a, gid.x, g_len);
+}
+
+@compute
+@workgroup_size(128, 1, 1)
+fn test_scalar_can_pack_unpack_b(
+    @builtin(global_invocation_id) gid: vec3<u32>,
+) {
+    if gid.x >= g_len {
+        unused_b();
+        return;
+    }
+
+    let b = scalar29_unpack_b(gid.x, g_len);
+    scalar29_pack_c(b, gid.x, g_len);
 }
--- a/sunscreen_math/src/webgpu_impl/shaders/scalar.wgsl
+++ b/sunscreen_math/src/webgpu_impl/shaders/scalar.wgsl
@@ -10,8 +10,6 @@ struct Foo {
    v: array<u32>
 }

-const Scalar29_Zero: Scalar29 = Scalar29(array<u32, 9>(0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u));
-
 // WGSL is terrible in that you can't pass arrays of unknown length to functions. So, we create
 // functions for unpacking bindings a, b respectively.
 fn scalar29_unpack_a(grid_tid: u32, stride: u32) -> Scalar29 {
@@ -69,3 +67,41 @@ fn scalar29_pack_c(val: Scalar29, grid_tid: u32, stride: u32) {
    word = val.v[7] >> 21u | val.v[8u] << 8u;
    g_c[7u * stride + grid_tid] = word;
 }
+
+fn scalar29_add(a: ptr<function, Scalar29>, b: ptr<function, Scalar29>) -> Scalar29 {
+    var sum = Scalar29_Zero;
+    let mask = (0x1u << 29u) - 1u;
+
+    // a + b
+    var carry = 0u;
+    for (var i = 0u; i < 9u; i++) {
+        carry = a.v[i] + b.v[i] + (carry >> 29u);
+        sum.v[i] = carry & mask;
+    }
+
+    // subtract l if the sum is >= l
+    return scalar29_sub(&sum, &Scalar29_L);
+}
+
+fn scalar29_sub(a: ptr<function, Scalar29>, b: ptr<function, Scalar29>) -> Scalar29 {
+    var difference = Scalar29_Zero;
+    let mask = (1u << 29u) - 1u;
+
+    // a - b
+    var borrow = 0u;
+    for (var i = 0u; i < 9u; i++) {
+        borrow = a.v[i] - (b.v[i] + (borrow >> 31u));
+        difference.v[i] = borrow & mask;
+    }
+
+    // conditionally add l if the difference is negative
+    let underflow_mask = ((borrow >> 31u) ^ 1u) - 1u;
+
+    var carry = 0u;
+    for (var i = 0u; i < 9u; i++) {
+        carry = (carry >> 29) + difference[i] + (Scalar29_L.v[i] & underflow_mask);
+        difference[i] = carry & mask;
+    }
+
+    return difference;
+}