From 9003580da44a73847664fc1953a26246a20c6cb9 Mon Sep 17 00:00:00 2001
From: rickwebiii <rick.weber.iii@gmail.com>
Date: Fri, 24 Feb 2023 13:32:16 -0800
Subject: [PATCH] Better validation

---
 Cargo.lock                                    |   1 +
 sunscreen_math/Cargo.toml                     |   3 +-
 sunscreen_math/build.rs                       |  27 ++--
 sunscreen_math/src/webgpu_impl/mod.rs         | 130 +++++++++++++-----
 .../src/webgpu_impl/ristrettovec.rs           |   4 +-
 sunscreen_math/src/webgpu_impl/scalarvec.rs   |  58 ++++++--
 .../src/webgpu_impl/shaders/constants.wgsl    |   6 +
 .../src/webgpu_impl/shaders/scalar.test.wgsl  |  14 ++
 .../src/webgpu_impl/shaders/scalar.wgsl       |  40 +++++-
 9 files changed, 219 insertions(+), 64 deletions(-)
 create mode 100644 sunscreen_math/src/webgpu_impl/shaders/constants.wgsl
diff --git a/Cargo.lock b/Cargo.lock
index 3129ea060..d69d2ad01 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2450,6 +2450,7 @@ dependencies = [
  "sunscreen_curve25519",
  "tokio",
  "wgpu",
+ "wgpu-core",
 ]
 
 [[package]]
diff --git a/sunscreen_math/Cargo.toml b/sunscreen_math/Cargo.toml
index eb92921f4..e542aca49 100644
--- a/sunscreen_math/Cargo.toml
+++ b/sunscreen_math/Cargo.toml
@@ -18,11 +18,12 @@ wgpu = { version = "0.15.1", optional = true }
 
 [build-dependencies]
 naga = { version = "0.11.0", optional = true, features = ["wgsl-in"]}
+wgpu-core = { version = "0.15.1", optional = true, features = ["metal", "wgsl"] }
 
 [features]
 default = ["webgpu"]
 nightly-features = []
 metal = ["dep:metal", "gpu"]
-webgpu = ["dep:wgpu", "dep:tokio", "dep:futures", "dep:naga", "dep:bytemuck", "gpu"]
+webgpu = ["dep:wgpu", "dep:tokio", "dep:futures", "dep:naga", "dep:wgpu-core", "dep:bytemuck", "gpu"]
 gpu = []
 pina = []
\ No newline at end of file
diff --git a/sunscreen_math/build.rs b/sunscreen_math/build.rs
index ab6f32cd0..e9380ac8c 100644
--- a/sunscreen_math/build.rs
+++ b/sunscreen_math/build.rs
@@ -1,11 +1,12 @@
 #[cfg(feature = "webgpu")]
 // This simply concatenates all the wgsl shaders, which get compiled at runtime.
 fn compile_wgsl_shaders() {
-    use std::{path::PathBuf, process::Command};
-    use std::fs::{DirEntry, File, read_to_string};
+    use std::fs::{read_to_string, DirEntry, File};
     use std::io::Write;
+    use std::{path::PathBuf};
 
-    use naga::valid::{ValidationFlags, Capabilities};
+    use naga::valid::{Capabilities, ValidationFlags};
+    use wgpu_core::pipeline::{CreateShaderModuleError, ShaderError};
 
     let outdir = PathBuf::from(std::env::var("OUT_DIR").unwrap());
     let shader_dir = PathBuf::from(".")
@@ -27,9 +28,7 @@ fn compile_wgsl_shaders() {
         let include_file: Box<dyn Fn(&DirEntry) -> bool> = if config == "test" {
             Box::new(is_wgsl_file)
         } else {
-            Box::new(|file: &DirEntry| {
-                is_wgsl_file(file) && !is_test_wgsl_file(file)
-            })
+            Box::new(|file: &DirEntry| is_wgsl_file(file) && !is_test_wgsl_file(file))
         };
 
         let shaders = std::fs::read_dir(&shader_dir)
@@ -38,7 +37,7 @@ fn compile_wgsl_shaders() {
             .filter(include_file);
 
         let out_file_path = outdir.join(format!("shaders-{config}.wgsl"));
-        
+
         {
             let mut out_file = File::create(&out_file_path).unwrap();
 
@@ -50,15 +49,23 @@ fn compile_wgsl_shaders() {
         };
 
         // Validate the shader
-        let shader_contents = read_to_string(out_file_path).unwrap();
+        let shader_contents = read_to_string(&out_file_path).unwrap();
 
         let parse_result = naga::front::wgsl::parse_str(&shader_contents);
 
         if let Err(e) = parse_result {
-            panic!("{}", e.message());
+            let e = ShaderError {
+                source: shader_contents,
+                label: None,
+                inner: Box::new(e)
+            };
+
+            let e = CreateShaderModuleError::Parsing(e);
+            panic!("{}", e);
         }
 
-        let mut validator = naga::valid::Validator::new(ValidationFlags::all(), Capabilities::empty());
+        let mut validator =
+            naga::valid::Validator::new(ValidationFlags::all(), Capabilities::empty());
 
         let validation_results = validator.validate(&parse_result.unwrap());
 
diff --git a/sunscreen_math/src/webgpu_impl/mod.rs b/sunscreen_math/src/webgpu_impl/mod.rs
index 68795b9e9..993abad56 100644
--- a/sunscreen_math/src/webgpu_impl/mod.rs
+++ b/sunscreen_math/src/webgpu_impl/mod.rs
@@ -1,11 +1,21 @@
 use core::slice;
-use std::{borrow::Cow, ops::Deref, mem::{align_of, MaybeUninit, size_of}};
+use std::{
+    borrow::Cow,
+    mem::{align_of, size_of, MaybeUninit},
+    ops::Deref,
+};
 
 use bytemuck::{cast, cast_slice, Pod};
 use futures::channel::oneshot;
 use lazy_static::lazy_static;
 use tokio::runtime::{Builder as TokioRuntimeBuilder, Runtime as TokioRuntime};
-use wgpu::{Instance, RequestAdapterOptions, Device, Queue, ShaderModuleDescriptor, ShaderModule, BufferDescriptor, COPY_BUFFER_ALIGNMENT, BufferUsages, Buffer, ComputePipelineDescriptor, BindGroupDescriptor, BindGroupEntry, CommandEncoderDescriptor, ComputePassDescriptor, util::{BufferInitDescriptor, DeviceExt}, Maintain};
+use wgpu::{
+    util::{BufferInitDescriptor, DeviceExt},
+    BindGroupDescriptor, BindGroupEntry, Buffer, BufferDescriptor, BufferUsages,
+    CommandEncoderDescriptor, ComputePassDescriptor, ComputePipelineDescriptor, Device, Instance,
+    Maintain, Queue, RequestAdapterOptions, ShaderModule, ShaderModuleDescriptor,
+    COPY_BUFFER_ALIGNMENT,
+};
 
 mod ristrettovec;
 mod scalarvec;
@@ -23,14 +33,10 @@ pub struct Runtime {
 // test and release. In test, we #define the TEST macro, which exposes test kernels.
 // The release library does not feature these kernels.
 #[cfg(not(test))]
-const SHADERS: &str = include_str!(concat!(
-    env!("OUT_DIR"),
-    "/shaders-release.wgsl"
-));
+const SHADERS: &str = include_str!(concat!(env!("OUT_DIR"), "/shaders-release.wgsl"));
 
 #[cfg(test)]
-const SHADERS: &str =
-    include_str!(concat!(env!("OUT_DIR"), "/shaders-test.wgsl"));
+const SHADERS: &str = include_str!(concat!(env!("OUT_DIR"), "/shaders-test.wgsl"));
 
 fn assert_aligned<T>(ptr: *const T) {
     assert!(ptr.cast::<()>().align_offset(align_of::<T>()) == 0);
@@ -52,6 +58,33 @@ trait BufferExt {
     fn get_data<T: Pod + Copy>(&self) -> Vec<T>;
 }
 
+pub trait GpuVec {
+    type IterType: Sized;
+
+    fn get_buffer(&self) -> &Buffer;
+
+    fn len(&self) -> usize;
+
+    fn byte_len(&self) -> usize {
+        self.len() * size_of::<Self::IterType>()
+    }
+
+    fn u32_len(&self) -> usize {
+        self.byte_len() / size_of::<u32>()
+    }
+
+    fn run_unary<Rhs: GpuVec>(&self, output: &Buffer, kernel_name: &'static str) {
+        let runtime = Runtime::get();
+        let len = GpuU32::new(self.len() as u32);
+
+        runtime.run(
+            kernel_name,
+            &[self.get_buffer(), &DUMMY_BUFFER, output, &len.data],
+            &Grid::new(self.len() as u32 / 128, 1, 1),
+        );
+    }
+}
+
 impl BufferExt for Buffer {
     fn clone(&self) -> Buffer {
         let runtime = Runtime::get();
@@ -65,7 +98,9 @@ impl BufferExt for Buffer {
     fn copy_into(&self, dst: &Buffer) {
         let runtime = Runtime::get();
 
-        let mut encoder = runtime.device.create_command_encoder(&CommandEncoderDescriptor { label: None });
+        let mut encoder = runtime
+            .device
+            .create_command_encoder(&CommandEncoderDescriptor { label: None });
         encoder.copy_buffer_to_buffer(self, 0, &dst, 0, self.size());
 
         runtime.queue.submit(Some(encoder.finish()));
@@ -76,7 +111,7 @@ impl BufferExt for Buffer {
         let (s, r) = oneshot::channel();
 
         // In vanilla WebGPU, if you use the MAP_READ flag, you must also set COPY_DST
-        // and *only* COPY_DST. This means you can't use such buffers in compute 
+        // and *only* COPY_DST. This means you can't use such buffers in compute
         // shaders. As such, we create a temporary buffer with these properties so we
         // can copy data out of the shader-capable buffer and return the results.
         let runtime = Runtime::get();
@@ -85,13 +120,15 @@ impl BufferExt for Buffer {
             label: None,
             size: self.size(),
             usage: BufferUsages::MAP_READ | BufferUsages::COPY_DST,
-            mapped_at_creation: false
+            mapped_at_creation: false,
         });
 
         self.copy_into(&copy_buf);
 
         let buffer_slice = copy_buf.slice(..);
-        buffer_slice.map_async(wgpu::MapMode::Read, move |v| { s.send(v).unwrap(); });
+        buffer_slice.map_async(wgpu::MapMode::Read, move |v| {
+            s.send(v).unwrap();
+        });
 
         runtime.device.poll(Maintain::Wait);
 
@@ -113,15 +150,19 @@ impl Runtime {
     pub fn alloc<T>(&self, len: usize) -> Buffer {
         let len = size_of::<T>() * len;
 
-        // Round up len to a multiple of COPY_BUFFER_ALIGNMENT, as required to use 
+        // Round up len to a multiple of COPY_BUFFER_ALIGNMENT, as required to use
         // mapped_at_creation=true
-        let len = if len % COPY_BUFFER_ALIGNMENT as usize == 0 { len } else { (len / COPY_BUFFER_ALIGNMENT as usize + 1) * COPY_BUFFER_ALIGNMENT as usize};
+        let len = if len % COPY_BUFFER_ALIGNMENT as usize == 0 {
+            len
+        } else {
+            (len / COPY_BUFFER_ALIGNMENT as usize + 1) * COPY_BUFFER_ALIGNMENT as usize
+        };
 
         let buffer = self.device.create_buffer(&BufferDescriptor {
             label: None,
             size: len as u64,
             usage: BufferUsages::STORAGE | BufferUsages::COPY_SRC | BufferUsages::COPY_DST,
-            mapped_at_creation: false
+            mapped_at_creation: false,
         });
 
         buffer
@@ -130,9 +171,13 @@ impl Runtime {
     pub fn alloc_from_slice<T: Pod>(&self, data: &[T]) -> Buffer {
         let len = size_of::<T>() * data.len();
 
-        // Round up len to a multiple of COPY_BUFFER_ALIGNMENT, as required to use 
+        // Round up len to a multiple of COPY_BUFFER_ALIGNMENT, as required to use
         // mapped_at_creation=true
-        let len = if len % COPY_BUFFER_ALIGNMENT as usize == 0 { len } else { (len / COPY_BUFFER_ALIGNMENT as usize + 1) * COPY_BUFFER_ALIGNMENT as usize};
+        let len = if len % COPY_BUFFER_ALIGNMENT as usize == 0 {
+            len
+        } else {
+            (len / COPY_BUFFER_ALIGNMENT as usize + 1) * COPY_BUFFER_ALIGNMENT as usize
+        };
 
         let buffer = self.device.create_buffer_init(&BufferInitDescriptor {
             label: None,
@@ -144,19 +189,23 @@ impl Runtime {
     }
 
     pub fn run(&self, kernel_name: &'static str, args: &[&Buffer], threadgroups: &Grid) {
-        let pipeline = self.device.create_compute_pipeline(&ComputePipelineDescriptor {
-            label: None,
-            layout: None,
-            module: &self.shaders,
-            entry_point: kernel_name
-        });
+        let pipeline = self
+            .device
+            .create_compute_pipeline(&ComputePipelineDescriptor {
+                label: None,
+                layout: None,
+                module: &self.shaders,
+                entry_point: kernel_name,
+            });
 
-        let bindings = args.iter().enumerate().map(|(i, b)| {
-            BindGroupEntry {
+        let bindings = args
+            .iter()
+            .enumerate()
+            .map(|(i, b)| BindGroupEntry {
                 binding: i as u32,
-                resource: b.as_entire_binding()
-            }
-        }).collect::<Vec<_>>();
+                resource: b.as_entire_binding(),
+            })
+            .collect::<Vec<_>>();
 
         let layout = pipeline.get_bind_group_layout(0);
 
@@ -166,7 +215,9 @@ impl Runtime {
             entries: &bindings,
         });
 
-        let mut encoder = self.device.create_command_encoder(&CommandEncoderDescriptor { label: None });
+        let mut encoder = self
+            .device
+            .create_command_encoder(&CommandEncoderDescriptor { label: None });
 
         {
             let (x, y, z) = threadgroups.0;
@@ -183,7 +234,6 @@ impl Runtime {
     }
 }
 
-
 lazy_static! {
     static ref TOKIO_RUNTIME: TokioRuntime = {
         TokioRuntimeBuilder::new_current_thread()
@@ -228,10 +278,16 @@ lazy_static! {
             shaders
         }
     };
+
+    /// A 4-byte buffer that exists so you can bind a buffer to `g_b`
+    /// in unary shaders.
+    static ref DUMMY_BUFFER: Buffer = {
+        Runtime::get().alloc::<u32>(1)
+    };
 }
 
 pub struct GpuU32 {
-    data: Buffer
+    data: Buffer,
 }
 
 impl GpuU32 {
@@ -239,9 +295,7 @@ impl GpuU32 {
         let runtime = Runtime::get();
         let data = runtime.alloc_from_slice(&[val]);
 
-        Self {
-            data
-        }
+        Self { data }
     }
 }
 
@@ -269,10 +323,14 @@ mod tests {
 
         let n = GpuU32::new(a.len() as u32);
 
-        runtime.run("add", &[&a_gpu, &b_gpu, &c_gpu, &n.data], &Grid::new(1, 1, 1));
+        runtime.run(
+            "add",
+            &[&a_gpu, &b_gpu, &c_gpu, &n.data],
+            &Grid::new(1, 1, 1),
+        );
 
         for (c, (a, b)) in c_gpu.get_data::<u32>().iter().zip(a.iter().zip(b.iter())) {
             assert_eq!(*c, a + b);
         }
     }
-}
\ No newline at end of file
+}
diff --git a/sunscreen_math/src/webgpu_impl/ristrettovec.rs b/sunscreen_math/src/webgpu_impl/ristrettovec.rs
index f40794529..21c6dad66 100644
--- a/sunscreen_math/src/webgpu_impl/ristrettovec.rs
+++ b/sunscreen_math/src/webgpu_impl/ristrettovec.rs
@@ -1,3 +1 @@
-pub struct GpuRistrettoPointVec {
-    
-}
\ No newline at end of file
+pub struct GpuRistrettoPointVec {}
diff --git a/sunscreen_math/src/webgpu_impl/scalarvec.rs b/sunscreen_math/src/webgpu_impl/scalarvec.rs
index 781716ebb..d8c39849f 100644
--- a/sunscreen_math/src/webgpu_impl/scalarvec.rs
+++ b/sunscreen_math/src/webgpu_impl/scalarvec.rs
@@ -3,7 +3,7 @@ use std::mem::size_of;
 use curve25519_dalek::scalar::Scalar;
 use wgpu::Buffer;
 
-use super::{Runtime, BufferExt};
+use super::{BufferExt, Runtime};
 
 pub struct GpuScalarVec {
     data: Buffer,
@@ -12,14 +12,14 @@ pub struct GpuScalarVec {
 impl Clone for GpuScalarVec {
     fn clone(&self) -> Self {
         Self {
-            data: self.data.clone()
+            data: self.data.clone(),
         }
     }
 }
 
 pub struct Scalars {
     data: Vec<u32>,
-    i: usize
+    i: usize,
 }
 
 impl Scalars {
@@ -109,7 +109,7 @@ impl GpuScalarVec {
         }
 
         Self {
-            data: runtime.alloc_from_slice(&packed_data)
+            data: runtime.alloc_from_slice(&packed_data),
         }
     }
 
@@ -124,13 +124,16 @@ impl GpuScalarVec {
 mod tests {
     use rand::thread_rng;
 
-    use crate::webgpu_impl::{Grid, GpuU32};
+    use crate::webgpu_impl::{GpuU32, Grid};
 
     use super::*;
-    
+
     #[test]
     fn can_unpack_scalarvec() {
-        let a = (0..238).into_iter().map(|_| Scalar::random(&mut thread_rng())).collect::<Vec<_>>();
+        let a = (0..238)
+            .into_iter()
+            .map(|_| Scalar::random(&mut thread_rng()))
+            .collect::<Vec<_>>();
 
         let a_v = GpuScalarVec::new(&a);
 
@@ -146,7 +149,10 @@ mod tests {
 
     #[test]
     fn can_clone_scalarvec() {
-        let a = (0..238).into_iter().map(|_| Scalar::random(&mut thread_rng())).collect::<Vec<_>>();
+        let a = (0..238)
+            .into_iter()
+            .map(|_| Scalar::random(&mut thread_rng()))
+            .collect::<Vec<_>>();
 
         let a_v = GpuScalarVec::new(&a);
         let a_v_clone = a_v.clone();
@@ -161,11 +167,13 @@ mod tests {
         assert_eq!(count, a.len());
     }
 
-
     #[test]
     fn can_pack_unpack_shader_operand_a() {
         // Use 238 because it's a weird number not a multiple of the threadgroup size.
-        let a = (0..238).into_iter().map(|_| Scalar::random(&mut thread_rng())).collect::<Vec<_>>();
+        let a = (0..238)
+            .into_iter()
+            .map(|_| Scalar::random(&mut thread_rng()))
+            .collect::<Vec<_>>();
 
         let a_v = GpuScalarVec::new(&a);
         let c_v = a_v.clone();
@@ -174,6 +182,32 @@ mod tests {
 
         let len = GpuU32::new(a.len() as u32);
 
-        runtime.run("test_scalar_can_pack_unpack_a", &[&a_v.data, &a_v.data, &c_v.data, &len.data], &Grid::new(2, 1, 1));
+        runtime.run(
+            "test_scalar_can_pack_unpack_a",
+            &[&a_v.data, &a_v.data, &c_v.data, &len.data],
+            &Grid::new(2, 1, 1),
+        );
     }
-}
\ No newline at end of file
+
+    #[test]
+    fn can_pack_unpack_shader_operand_b() {
+        // Use 238 because it's a weird number not a multiple of the threadgroup size.
+        let a = (0..238)
+            .into_iter()
+            .map(|_| Scalar::random(&mut thread_rng()))
+            .collect::<Vec<_>>();
+
+        let a_v = GpuScalarVec::new(&a);
+        let c_v = a_v.clone();
+
+        let runtime = Runtime::get();
+
+        let len = GpuU32::new(a.len() as u32);
+
+        runtime.run(
+            "test_scalar_can_pack_unpack_b",
+            &[&a_v.data, &a_v.data, &c_v.data, &len.data],
+            &Grid::new(2, 1, 1),
+        );
+    }
+}
diff --git a/sunscreen_math/src/webgpu_impl/shaders/constants.wgsl b/sunscreen_math/src/webgpu_impl/shaders/constants.wgsl
new file mode 100644
index 000000000..d9b427ec6
--- /dev/null
+++ b/sunscreen_math/src/webgpu_impl/shaders/constants.wgsl
@@ -0,0 +1,6 @@
+const Scalar29_Zero: Scalar29 = Scalar29(array<u32, 9>(0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u));
+
+const Scalar29_L: Scalar29 = Scalar29(array<u32, 9>(
+        0x1cf5d3edu, 0x009318d2u, 0x1de73596u, 0x1df3bd45u, 0x0000014du, 0x00000000u, 0x00000000u, 0x00000000u,
+        0x00100000u,
+));
\ No newline at end of file
diff --git a/sunscreen_math/src/webgpu_impl/shaders/scalar.test.wgsl b/sunscreen_math/src/webgpu_impl/shaders/scalar.test.wgsl
index 2c2bf135d..bf5f54f7d 100644
--- a/sunscreen_math/src/webgpu_impl/shaders/scalar.test.wgsl
+++ b/sunscreen_math/src/webgpu_impl/shaders/scalar.test.wgsl
@@ -10,4 +10,18 @@ fn test_scalar_can_pack_unpack_a(
 
     let a = scalar29_unpack_a(gid.x, g_len);
     scalar29_pack_c(a, gid.x, g_len);
+}
+
+@compute
+@workgroup_size(128, 1, 1)
+fn test_scalar_can_pack_unpack_b(
+    @builtin(global_invocation_id) gid: vec3<u32>,
+) {
+    if gid.x >= g_len {
+        unused_b();
+        return;
+    }
+
+    let b = scalar29_unpack_b(gid.x, g_len);
+    scalar29_pack_c(b, gid.x, g_len);
 }
\ No newline at end of file
diff --git a/sunscreen_math/src/webgpu_impl/shaders/scalar.wgsl b/sunscreen_math/src/webgpu_impl/shaders/scalar.wgsl
index da815fe4b..f3dbfb863 100644
--- a/sunscreen_math/src/webgpu_impl/shaders/scalar.wgsl
+++ b/sunscreen_math/src/webgpu_impl/shaders/scalar.wgsl
@@ -10,8 +10,6 @@ struct Foo {
     v: array<u32>
 }
 
-const Scalar29_Zero: Scalar29 = Scalar29(array<u32, 9>(0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u));
-
 // WGSL is terrible in that you can't pass arrays of unknown length to functions. So, we create
 // functions for unpacking bindings a, b respectively.
 fn scalar29_unpack_a(grid_tid: u32, stride: u32) -> Scalar29 {
@@ -69,3 +67,41 @@ fn scalar29_pack_c(val: Scalar29, grid_tid: u32, stride: u32) {
     word = val.v[7] >> 21u | val.v[8u] << 8u;
     g_c[7u * stride + grid_tid] = word;
 }
+
+fn scalar29_add(a: ptr<function, Scalar29>, b: ptr<function, Scalar29>) -> Scalar29 {
+    var sum = Scalar29_Zero;
+    let mask = (0x1u << 29u) - 1u;
+
+    // a + b
+    var carry = 0u;
+    for (var i = 0u; i < 9u; i++) {
+        carry = a.v[i] + b.v[i] + (carry >> 29u);
+        sum.v[i] = carry & mask;
+    }
+
+    // subtract l if the sum is >= l
+    return scalar29_sub(&sum, &Scalar29_L);
+}
+
+fn scalar29_sub(a: ptr<function, Scalar29>, b: ptr<function, Scalar29>) -> Scalar29 {
+    var difference = Scalar29_Zero;
+    let mask = (1u << 29u) - 1u;
+
+    // a - b
+    var borrow = 0u;
+    for (var i = 0u; i < 9u; i++) {
+        borrow = a.v[i] - (b.v[i] + (borrow >> 31u));
+        difference.v[i] = borrow & mask;
+    }
+
+    // conditionally add l if the difference is negative
+    let underflow_mask = ((borrow >> 31u) ^ 1u) - 1u;
+
+    var carry = 0u;
+    for (var i = 0u; i < 9u; i++) {
+        carry = (carry >> 29) + difference[i] + (Scalar29_L.v[i] & underflow_mask);
+        difference[i] = carry & mask;
+    }
+
+    return difference;
+}
\ No newline at end of file